| field | value | date |
|---|---|---|
| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 12:18:05 +0000 |
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 12:18:05 +0000 |
| commit | b46aad6df449445a9fc4aa7b32bd40005438e3f7 (patch) | |
| tree | 751aa858ca01f35de800164516b298887382919d /src | |
| parent | Initial commit. (diff) | |
| download | haproxy-b46aad6df449445a9fc4aa7b32bd40005438e3f7.tar.xz, haproxy-b46aad6df449445a9fc4aa7b32bd40005438e3f7.zip | |
Adding upstream version 2.9.5. (upstream/2.9.5)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | src/acl.c | 1377 |
| -rw-r--r-- | src/action.c | 363 |
| -rw-r--r-- | src/activity.c | 1248 |
| -rw-r--r-- | src/applet.c | 501 |
| -rw-r--r-- | src/arg.c | 479 |
| -rw-r--r-- | src/auth.c | 316 |
| -rw-r--r-- | src/backend.c | 3401 |
| -rw-r--r-- | src/base64.c | 303 |
| -rw-r--r-- | src/cache.c | 3014 |
| -rw-r--r-- | src/calltrace.c | 286 |
| -rw-r--r-- | src/cbuf.c | 59 |
| -rw-r--r-- | src/cfgcond.c | 559 |
| -rw-r--r-- | src/cfgdiag.c | 97 |
| -rw-r--r-- | src/cfgparse-global.c | 1396 |
| -rw-r--r-- | src/cfgparse-listen.c | 3073 |
| -rw-r--r-- | src/cfgparse-quic.c | 292 |
| -rw-r--r-- | src/cfgparse-ssl.c | 2382 |
| -rw-r--r-- | src/cfgparse-tcp.c | 275 |
| -rw-r--r-- | src/cfgparse-unix.c | 135 |
| -rw-r--r-- | src/cfgparse.c | 4798 |
| -rw-r--r-- | src/channel.c | 591 |
| -rw-r--r-- | src/check.c | 2642 |
| -rw-r--r-- | src/chunk.c | 311 |
| -rw-r--r-- | src/cli.c | 3423 |
| -rw-r--r-- | src/clock.c | 460 |
| -rw-r--r-- | src/compression.c | 742 |
| -rw-r--r-- | src/connection.c | 2748 |
| -rw-r--r-- | src/cpuset.c | 296 |
| -rw-r--r-- | src/debug.c | 2301 |
| -rw-r--r-- | src/dgram.c | 79 |
| -rw-r--r-- | src/dict.c | 127 |
| -rw-r--r-- | src/dns.c | 1330 |
| -rw-r--r-- | src/dynbuf.c | 129 |
| -rw-r--r-- | src/eb32sctree.c | 472 |
| -rw-r--r-- | src/eb32tree.c | 218 |
| -rw-r--r-- | src/eb64tree.c | 218 |
| -rw-r--r-- | src/ebimtree.c | 44 |
| -rw-r--r-- | src/ebistree.c | 42 |
| -rw-r--r-- | src/ebmbtree.c | 77 |
| -rw-r--r-- | src/ebpttree.c | 208 |
| -rw-r--r-- | src/ebsttree.c | 42 |
| -rw-r--r-- | src/ebtree.c | 50 |
| -rw-r--r-- | src/errors.c | 567 |
| -rw-r--r-- | src/ev_epoll.c | 413 |
| -rw-r--r-- | src/ev_evports.c | 441 |
| -rw-r--r-- | src/ev_kqueue.c | 380 |
| -rw-r--r-- | src/ev_poll.c | 348 |
| -rw-r--r-- | src/ev_select.c | 335 |
| -rw-r--r-- | src/event_hdl.c | 999 |
| -rw-r--r-- | src/extcheck.c | 694 |
| -rw-r--r-- | src/fcgi-app.c | 1133 |
| -rw-r--r-- | src/fcgi.c | 294 |
| -rw-r--r-- | src/fd.c | 1348 |
| -rw-r--r-- | src/filters.c | 1125 |
| -rw-r--r-- | src/fix.c | 264 |
| -rw-r--r-- | src/flt_bwlim.c | 976 |
| -rw-r--r-- | src/flt_http_comp.c | 1076 |
| -rw-r--r-- | src/flt_spoe.c | 4739 |
| -rw-r--r-- | src/flt_trace.c | 675 |
| -rw-r--r-- | src/freq_ctr.c | 218 |
| -rw-r--r-- | src/frontend.c | 339 |
| -rw-r--r-- | src/h1.c | 1319 |
| -rw-r--r-- | src/h1_htx.c | 1074 |
| -rw-r--r-- | src/h2.c | 814 |
| -rw-r--r-- | src/h3.c | 2403 |
| -rw-r--r-- | src/h3_stats.c | 276 |
| -rw-r--r-- | src/haproxy.c | 3962 |
| -rw-r--r-- | src/hash.c | 190 |
| -rw-r--r-- | src/hlua.c | 13961 |
| -rw-r--r-- | src/hlua_fcn.c | 2721 |
| -rw-r--r-- | src/hpack-dec.c | 475 |
| -rw-r--r-- | src/hpack-enc.c | 210 |
| -rw-r--r-- | src/hpack-huff.c | 861 |
| -rw-r--r-- | src/hpack-tbl.c | 372 |
| -rw-r--r-- | src/hq_interop.c | 174 |
| -rw-r--r-- | src/http.c | 1433 |
| -rw-r--r-- | src/http_acl.c | 185 |
| -rw-r--r-- | src/http_act.c | 2501 |
| -rw-r--r-- | src/http_ana.c | 5153 |
| -rw-r--r-- | src/http_client.c | 1598 |
| -rw-r--r-- | src/http_conv.c | 453 |
| -rw-r--r-- | src/http_ext.c | 1881 |
| -rw-r--r-- | src/http_fetch.c | 2368 |
| -rw-r--r-- | src/http_htx.c | 3028 |
| -rw-r--r-- | src/http_rules.c | 530 |
| -rw-r--r-- | src/htx.c | 1099 |
| -rw-r--r-- | src/init.c | 249 |
| -rw-r--r-- | src/jwt.c | 478 |
| -rw-r--r-- | src/lb_chash.c | 517 |
| -rw-r--r-- | src/lb_fas.c | 348 |
| -rw-r--r-- | src/lb_fwlc.c | 375 |
| -rw-r--r-- | src/lb_fwrr.c | 623 |
| -rw-r--r-- | src/lb_map.c | 281 |
| -rw-r--r-- | src/linuxcap.c | 191 |
| -rw-r--r-- | src/listener.c | 2487 |
| -rw-r--r-- | src/log.c | 4659 |
| -rw-r--r-- | src/lru.c | 305 |
| -rw-r--r-- | src/mailers.c | 329 |
| -rw-r--r-- | src/map.c | 1232 |
| -rw-r--r-- | src/mjson.c | 1048 |
| -rw-r--r-- | src/mqtt.c | 1281 |
| -rw-r--r-- | src/mux_fcgi.c | 4268 |
| -rw-r--r-- | src/mux_h1.c | 5374 |
| -rw-r--r-- | src/mux_h2.c | 7598 |
| -rw-r--r-- | src/mux_pt.c | 904 |
| -rw-r--r-- | src/mux_quic.c | 3067 |
| -rw-r--r-- | src/mworker-prog.c | 359 |
| -rw-r--r-- | src/mworker.c | 821 |
| -rw-r--r-- | src/namespace.c | 132 |
| -rw-r--r-- | src/ncbuf.c | 986 |
| -rw-r--r-- | src/pattern.c | 2683 |
| -rw-r--r-- | src/payload.c | 1448 |
| -rw-r--r-- | src/peers.c | 4231 |
| -rw-r--r-- | src/pipe.c | 136 |
| -rw-r--r-- | src/pool.c | 1539 |
| -rw-r--r-- | src/proto_quic.c | 799 |
| -rw-r--r-- | src/proto_rhttp.c | 464 |
| -rw-r--r-- | src/proto_sockpair.c | 589 |
| -rw-r--r-- | src/proto_tcp.c | 834 |
| -rw-r--r-- | src/proto_udp.c | 247 |
| -rw-r--r-- | src/proto_uxdg.c | 159 |
| -rw-r--r-- | src/proto_uxst.c | 372 |
| -rw-r--r-- | src/protocol.c | 309 |
| -rw-r--r-- | src/proxy.c | 3451 |
| -rw-r--r-- | src/qmux_http.c | 108 |
| -rw-r--r-- | src/qmux_trace.c | 114 |
| -rw-r--r-- | src/qpack-dec.c | 563 |
| -rw-r--r-- | src/qpack-enc.c | 185 |
| -rw-r--r-- | src/qpack-tbl.c | 415 |
| -rw-r--r-- | src/queue.c | 761 |
| -rw-r--r-- | src/quic_ack.c | 258 |
| -rw-r--r-- | src/quic_cc.c | 49 |
| -rw-r--r-- | src/quic_cc_cubic.c | 542 |
| -rw-r--r-- | src/quic_cc_newreno.c | 220 |
| -rw-r--r-- | src/quic_cc_nocc.c | 76 |
| -rw-r--r-- | src/quic_cid.c | 286 |
| -rw-r--r-- | src/quic_cli.c | 413 |
| -rw-r--r-- | src/quic_conn.c | 1893 |
| -rw-r--r-- | src/quic_frame.c | 1273 |
| -rw-r--r-- | src/quic_loss.c | 312 |
| -rw-r--r-- | src/quic_openssl_compat.c | 531 |
| -rw-r--r-- | src/quic_retransmit.c | 252 |
| -rw-r--r-- | src/quic_retry.c | 320 |
| -rw-r--r-- | src/quic_rx.c | 2290 |
| -rw-r--r-- | src/quic_sock.c | 1080 |
| -rw-r--r-- | src/quic_ssl.c | 790 |
| -rw-r--r-- | src/quic_stats.c | 215 |
| -rw-r--r-- | src/quic_stream.c | 294 |
| -rw-r--r-- | src/quic_tls.c | 1095 |
| -rw-r--r-- | src/quic_tp.c | 714 |
| -rw-r--r-- | src/quic_trace.c | 633 |
| -rw-r--r-- | src/quic_tx.c | 2348 |
| -rw-r--r-- | src/raw_sock.c | 489 |
| -rw-r--r-- | src/regex.c | 459 |
| -rw-r--r-- | src/resolvers.c | 3813 |
| -rw-r--r-- | src/ring.c | 482 |
| -rw-r--r-- | src/sample.c | 5173 |
| -rw-r--r-- | src/server.c | 6765 |
| -rw-r--r-- | src/server_state.c | 947 |
| -rw-r--r-- | src/session.c | 528 |
| -rw-r--r-- | src/sha1.c | 308 |
| -rw-r--r-- | src/shctx.c | 320 |
| -rw-r--r-- | src/signal.c | 284 |
| -rw-r--r-- | src/sink.c | 1406 |
| -rw-r--r-- | src/slz.c | 1421 |
| -rw-r--r-- | src/sock.c | 1072 |
| -rw-r--r-- | src/sock_inet.c | 521 |
| -rw-r--r-- | src/sock_unix.c | 387 |
| -rw-r--r-- | src/ssl_ckch.c | 3968 |
| -rw-r--r-- | src/ssl_crtlist.c | 1577 |
| -rw-r--r-- | src/ssl_ocsp.c | 1986 |
| -rw-r--r-- | src/ssl_sample.c | 2389 |
| -rw-r--r-- | src/ssl_sock.c | 8100 |
| -rw-r--r-- | src/ssl_utils.c | 702 |
| -rw-r--r-- | src/stats.c | 5521 |
| -rw-r--r-- | src/stconn.c | 2050 |
| -rw-r--r-- | src/stick_table.c | 5658 |
| -rw-r--r-- | src/stream.c | 4045 |
| -rw-r--r-- | src/task.c | 979 |
| -rw-r--r-- | src/tcp_act.c | 749 |
| -rw-r--r-- | src/tcp_rules.c | 1428 |
| -rw-r--r-- | src/tcp_sample.c | 641 |
| -rw-r--r-- | src/tcpcheck.c | 5150 |
| -rw-r--r-- | src/thread.c | 1864 |
| -rw-r--r-- | src/time.c | 147 |
| -rw-r--r-- | src/tools.c | 6348 |
| -rw-r--r-- | src/trace.c | 997 |
| -rw-r--r-- | src/uri_auth.c | 318 |
| -rw-r--r-- | src/uri_normalizer.c | 467 |
| -rw-r--r-- | src/vars.c | 1454 |
| -rw-r--r-- | src/version.c | 28 |
| -rw-r--r-- | src/wdt.c | 193 |
| -rw-r--r-- | src/xprt_handshake.c | 299 |
| -rw-r--r-- | src/xprt_quic.c | 175 |
194 files changed, 261120 insertions, 0 deletions
diff --git a/src/acl.c b/src/acl.c new file mode 100644 index 0000000..8ef2b7d --- /dev/null +++ b/src/acl.c @@ -0,0 +1,1377 @@ +/* + * ACL management functions. + * + * Copyright 2000-2013 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <ctype.h> +#include <stdio.h> +#include <string.h> + +#include <import/ebsttree.h> + +#include <haproxy/acl.h> +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/auth.h> +#include <haproxy/errors.h> +#include <haproxy/global.h> +#include <haproxy/list.h> +#include <haproxy/pattern.h> +#include <haproxy/proxy-t.h> +#include <haproxy/sample.h> +#include <haproxy/stick_table.h> +#include <haproxy/tools.h> +#include <haproxy/cfgparse.h> + +/* List head of all known ACL keywords */ +static struct acl_kw_list acl_keywords = { + .list = LIST_HEAD_INIT(acl_keywords.list) +}; + +/* input values are 0 or 3, output is the same */ +static inline enum acl_test_res pat2acl(struct pattern *pat) +{ + if (pat) + return ACL_TEST_PASS; + else + return ACL_TEST_FAIL; +} + +/* + * Registers the ACL keyword list <kwl> as a list of valid keywords for next + * parsing sessions. + */ +void acl_register_keywords(struct acl_kw_list *kwl) +{ + LIST_APPEND(&acl_keywords.list, &kwl->list); +} + +/* + * Unregisters the ACL keyword list <kwl> from the list of valid keywords. + */ +void acl_unregister_keywords(struct acl_kw_list *kwl) +{ + LIST_DELETE(&kwl->list); + LIST_INIT(&kwl->list); +} + +/* Return a pointer to the ACL <name> within the list starting at <head>, or + * NULL if not found. + */ +struct acl *find_acl_by_name(const char *name, struct list *head) +{ + struct acl *acl; + list_for_each_entry(acl, head, list) { + if (strcmp(acl->name, name) == 0) + return acl; + } + return NULL; +} + +/* Return a pointer to the ACL keyword <kw>, or NULL if not found. Note that if + * <kw> contains an opening parenthesis or a comma, only the left part of it is + * checked. + */ +struct acl_keyword *find_acl_kw(const char *kw) +{ + int index; + const char *kwend; + struct acl_kw_list *kwl; + + kwend = kw; + while (is_idchar(*kwend)) + kwend++; + + list_for_each_entry(kwl, &acl_keywords.list, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) { + if ((strncmp(kwl->kw[index].kw, kw, kwend - kw) == 0) && + kwl->kw[index].kw[kwend-kw] == 0) + return &kwl->kw[index]; + } + } + return NULL; +} + +static struct acl_expr *prune_acl_expr(struct acl_expr *expr) +{ + struct arg *arg; + + pattern_prune(&expr->pat); + + for (arg = expr->smp->arg_p; arg; arg++) { + if (arg->type == ARGT_STOP) + break; + if (arg->type == ARGT_STR || arg->unresolved) { + chunk_destroy(&arg->data.str); + arg->unresolved = 0; + } + } + + release_sample_expr(expr->smp); + + return expr; +} + +/* Parse an ACL expression starting at <args>[0], and return it. If <err> is + * not NULL, it will be filled with a pointer to an error message in case of + * error. This pointer must be freeable or NULL. <al> is an arg_list serving + * as a list head to report missing dependencies. It may be NULL if such + * dependencies are not allowed. + * + * Right now, the only accepted syntax is : + * <subject> [<value>...] 
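As an aside on the lookup above: find_acl_kw() compares only the identifier part of the keyword, so that `hdr(host)` resolves to the `hdr` keyword. A minimal standalone sketch of that prefix-matching idea follows; the `idchar()` set and the keyword table are simplified assumptions, not HAProxy's real definitions.

```c
#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* Simplified stand-in for is_idchar(): this exact character set is an
 * assumption for the sketch, not HAProxy's real one. */
static int idchar(int c)
{
	return isalnum(c) || c == '.' || c == '_' || c == '+' || c == '-';
}

/* Return the matching keyword from a NULL-terminated table, comparing
 * only the part of <kw> before any '(' or ',', like find_acl_kw(). */
static const char *lookup_kw(const char *kw, const char *const table[])
{
	size_t len = 0;

	while (idchar((unsigned char)kw[len]))
		len++;
	for (int i = 0; table[i]; i++)
		if (strncmp(table[i], kw, len) == 0 && table[i][len] == '\0')
			return table[i];
	return NULL;
}

int main(void)
{
	const char *const kws[] = { "hdr_beg", "hdr", "path", NULL };

	printf("%s\n", lookup_kw("hdr(host)", kws));    /* -> hdr */
	printf("%s\n", lookup_kw("hdr_beg(x),y", kws)); /* -> hdr_beg */
	return 0;
}
```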
+ */ +struct acl_expr *parse_acl_expr(const char **args, char **err, struct arg_list *al, + const char *file, int line) +{ + __label__ out_return, out_free_expr; + struct acl_expr *expr; + struct acl_keyword *aclkw; + int refflags, patflags; + const char *arg; + struct sample_expr *smp = NULL; + int idx = 0; + char *ckw = NULL; + const char *endt; + int cur_type; + int nbargs; + int operator = STD_OP_EQ; + int op; + int contain_colon, have_dot; + const char *dot; + signed long long value, minor; + /* The following buffer contain two numbers, a ':' separator and the final \0. */ + char buffer[NB_LLMAX_STR + 1 + NB_LLMAX_STR + 1]; + int is_loaded; + int unique_id; + char *error; + struct pat_ref *ref; + struct pattern_expr *pattern_expr; + int load_as_map = 0; + int acl_conv_found = 0; + + /* First, we look for an ACL keyword. And if we don't find one, then + * we look for a sample fetch expression starting with a sample fetch + * keyword. + */ + + if (al) { + al->ctx = ARGC_ACL; // to report errors while resolving args late + al->kw = *args; + al->conv = NULL; + } + + aclkw = find_acl_kw(args[0]); + if (aclkw) { + /* OK we have a real ACL keyword */ + + /* build new sample expression for this ACL */ + smp = calloc(1, sizeof(*smp)); + if (!smp) { + memprintf(err, "out of memory when parsing ACL expression"); + goto out_return; + } + LIST_INIT(&(smp->conv_exprs)); + smp->fetch = aclkw->smp; + smp->arg_p = empty_arg_list; + + /* look for the beginning of the subject arguments */ + for (arg = args[0]; is_idchar(*arg); arg++) + ; + + /* At this point, we have : + * - args[0] : beginning of the keyword + * - arg : end of the keyword, first character not part of keyword + */ + nbargs = make_arg_list(arg, -1, smp->fetch->arg_mask, &smp->arg_p, + err, &endt, NULL, al); + if (nbargs < 0) { + /* note that make_arg_list will have set <err> here */ + memprintf(err, "ACL keyword '%s' : %s", aclkw->kw, *err); + goto out_free_smp; + } + + if (!smp->arg_p) { + smp->arg_p = empty_arg_list; + } + else if (smp->fetch->val_args && !smp->fetch->val_args(smp->arg_p, err)) { + /* invalid keyword argument, error must have been + * set by val_args(). + */ + memprintf(err, "in argument to '%s', %s", aclkw->kw, *err); + goto out_free_smp; + } + + /* look for the beginning of the converters list. Those directly attached + * to the ACL keyword are found just after the comma. + * If we find any converter, then we don't use the ACL keyword's match + * anymore but the one related to the converter's output type. + */ + if (!sample_parse_expr_cnv((char **)args, NULL, NULL, err, al, file, line, smp, endt)) { + if (err) + memprintf(err, "ACL keyword '%s' : %s", aclkw->kw, *err); + goto out_free_smp; + } + acl_conv_found = !LIST_ISEMPTY(&smp->conv_exprs); + } + else { + /* This is not an ACL keyword, so we hope this is a sample fetch + * keyword that we're going to transparently use as an ACL. If + * so, we retrieve a completely parsed expression with args and + * convs already done. 
+ */ + smp = sample_parse_expr((char **)args, &idx, file, line, err, al, NULL); + if (!smp) { + memprintf(err, "%s in ACL expression '%s'", *err, *args); + goto out_return; + } + } + + /* get last effective output type for smp */ + cur_type = smp_expr_output_type(smp); + + expr = calloc(1, sizeof(*expr)); + if (!expr) { + memprintf(err, "out of memory when parsing ACL expression"); + goto out_free_smp; + } + + pattern_init_head(&expr->pat); + + expr->pat.expect_type = cur_type; + expr->smp = smp; + expr->kw = smp->fetch->kw; + smp = NULL; /* don't free it anymore */ + + if (aclkw && !acl_conv_found) { + expr->kw = aclkw->kw; + expr->pat.parse = aclkw->parse ? aclkw->parse : pat_parse_fcts[aclkw->match_type]; + expr->pat.index = aclkw->index ? aclkw->index : pat_index_fcts[aclkw->match_type]; + expr->pat.match = aclkw->match ? aclkw->match : pat_match_fcts[aclkw->match_type]; + expr->pat.prune = aclkw->prune ? aclkw->prune : pat_prune_fcts[aclkw->match_type]; + } + + if (!expr->pat.parse) { + /* Parse/index/match functions depend on the expression type, + * so we have to map them now. Some types can be automatically + * converted. + */ + switch (cur_type) { + case SMP_T_BOOL: + expr->pat.parse = pat_parse_fcts[PAT_MATCH_BOOL]; + expr->pat.index = pat_index_fcts[PAT_MATCH_BOOL]; + expr->pat.match = pat_match_fcts[PAT_MATCH_BOOL]; + expr->pat.prune = pat_prune_fcts[PAT_MATCH_BOOL]; + expr->pat.expect_type = pat_match_types[PAT_MATCH_BOOL]; + break; + case SMP_T_SINT: + expr->pat.parse = pat_parse_fcts[PAT_MATCH_INT]; + expr->pat.index = pat_index_fcts[PAT_MATCH_INT]; + expr->pat.match = pat_match_fcts[PAT_MATCH_INT]; + expr->pat.prune = pat_prune_fcts[PAT_MATCH_INT]; + expr->pat.expect_type = pat_match_types[PAT_MATCH_INT]; + break; + case SMP_T_ADDR: + case SMP_T_IPV4: + case SMP_T_IPV6: + expr->pat.parse = pat_parse_fcts[PAT_MATCH_IP]; + expr->pat.index = pat_index_fcts[PAT_MATCH_IP]; + expr->pat.match = pat_match_fcts[PAT_MATCH_IP]; + expr->pat.prune = pat_prune_fcts[PAT_MATCH_IP]; + expr->pat.expect_type = pat_match_types[PAT_MATCH_IP]; + break; + case SMP_T_STR: + expr->pat.parse = pat_parse_fcts[PAT_MATCH_STR]; + expr->pat.index = pat_index_fcts[PAT_MATCH_STR]; + expr->pat.match = pat_match_fcts[PAT_MATCH_STR]; + expr->pat.prune = pat_prune_fcts[PAT_MATCH_STR]; + expr->pat.expect_type = pat_match_types[PAT_MATCH_STR]; + break; + } + } + + /* Additional check to protect against common mistakes */ + if (expr->pat.parse && cur_type != SMP_T_BOOL && !*args[1]) { + ha_warning("parsing acl keyword '%s' :\n" + " no pattern to match against were provided, so this ACL will never match.\n" + " If this is what you intended, please add '--' to get rid of this warning.\n" + " If you intended to match only for existence, please use '-m found'.\n" + " If you wanted to force an int to match as a bool, please use '-m bool'.\n" + "\n", + args[0]); + } + + args++; + + /* check for options before patterns. 
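The switch just shown maps the sample's output type to a default parse/index/match/prune set when no explicit matcher was given. A toy version of that dispatch, using illustrative enum values and names rather than HAProxy's real PAT_MATCH_* tables:

```c
#include <stdio.h>

/* Each sample output type implies a default match method unless '-m'
 * overrides it. Enum and names here are illustrative only. */
enum smp_type { T_BOOL, T_SINT, T_IP, T_STR, T_NB };

static const char *const def_match[T_NB] = {
	[T_BOOL] = "bool", [T_SINT] = "int", [T_IP] = "ip", [T_STR] = "str",
};

static const char *match_for(enum smp_type t, const char *forced_m)
{
	return forced_m ? forced_m : def_match[t]; /* '-m <x>' wins if given */
}

int main(void)
{
	printf("str sample   -> -m %s\n", match_for(T_STR, NULL));
	printf("forced 'reg' -> -m %s\n", match_for(T_STR, "reg"));
	return 0;
}
```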
Supported options are : + * -i : ignore case for all patterns by default + * -f : read patterns from those files + * -m : force matching method (must be used before -f) + * -M : load the file as map file + * -u : force the unique id of the acl + * -- : everything after this is not an option + */ + refflags = PAT_REF_ACL; + patflags = 0; + is_loaded = 0; + unique_id = -1; + while (**args == '-') { + if (strcmp(*args, "-i") == 0) + patflags |= PAT_MF_IGNORE_CASE; + else if (strcmp(*args, "-n") == 0) + patflags |= PAT_MF_NO_DNS; + else if (strcmp(*args, "-u") == 0) { + unique_id = strtol(args[1], &error, 10); + if (*error != '\0') { + memprintf(err, "the argument of -u must be an integer"); + goto out_free_expr; + } + + /* Check if this id is really unique. */ + if (pat_ref_lookupid(unique_id)) { + memprintf(err, "the id is already used"); + goto out_free_expr; + } + + args++; + } + else if (strcmp(*args, "-f") == 0) { + if (!expr->pat.parse) { + memprintf(err, "matching method must be specified first (using '-m') when using a sample fetch of this type ('%s')", expr->kw); + goto out_free_expr; + } + + if (!pattern_read_from_file(&expr->pat, refflags, args[1], patflags, load_as_map, err, file, line)) + goto out_free_expr; + is_loaded = 1; + args++; + } + else if (strcmp(*args, "-m") == 0) { + int idx; + + if (is_loaded) { + memprintf(err, "'-m' must only be specified before patterns and files in parsing ACL expression"); + goto out_free_expr; + } + + idx = pat_find_match_name(args[1]); + if (idx < 0) { + memprintf(err, "unknown matching method '%s' when parsing ACL expression", args[1]); + goto out_free_expr; + } + + /* Note: -m found is always valid, bool/int are compatible, str/bin/reg/len are compatible */ + if (idx != PAT_MATCH_FOUND && !sample_casts[cur_type][pat_match_types[idx]]) { + memprintf(err, "matching method '%s' cannot be used with fetch keyword '%s'", args[1], expr->kw); + goto out_free_expr; + } + expr->pat.parse = pat_parse_fcts[idx]; + expr->pat.index = pat_index_fcts[idx]; + expr->pat.match = pat_match_fcts[idx]; + expr->pat.prune = pat_prune_fcts[idx]; + expr->pat.expect_type = pat_match_types[idx]; + args++; + } + else if (strcmp(*args, "-M") == 0) { + refflags |= PAT_REF_MAP; + load_as_map = 1; + } + else if (strcmp(*args, "--") == 0) { + args++; + break; + } + else { + memprintf(err, "'%s' is not a valid ACL option. Please use '--' before any pattern beginning with a '-'", args[0]); + goto out_free_expr; + break; + } + args++; + } + + if (!expr->pat.parse) { + memprintf(err, "matching method must be specified first (using '-m') when using a sample fetch of this type ('%s')", expr->kw); + goto out_free_expr; + } + + /* Create displayed reference */ + snprintf(trash.area, trash.size, "acl '%s' file '%s' line %d", + expr->kw, file, line); + trash.area[trash.size - 1] = '\0'; + + /* Create new pattern reference. */ + ref = pat_ref_newid(unique_id, trash.area, PAT_REF_ACL); + if (!ref) { + memprintf(err, "memory error"); + goto out_free_expr; + } + + /* Create new pattern expression associated to this reference. */ + pattern_expr = pattern_new_expr(&expr->pat, ref, patflags, err, NULL); + if (!pattern_expr) + goto out_free_expr; + + /* now parse all patterns */ + while (**args) { + arg = *args; + + /* Compatibility layer. Each pattern can parse only one string per pattern, + * but the pat_parser_int() and pat_parse_dotted_ver() parsers were need + * optionally two operators. The first operator is the match method: eq, + * le, lt, ge and gt. 
pat_parse_int() and pat_parse_dotted_ver() functions + * can have a compatibility syntax based on ranges: + * + * pat_parse_int(): + * + * "eq x" -> "x" or "x:x" + * "le x" -> ":x" + * "lt x" -> ":y" (with y = x - 1) + * "ge x" -> "x:" + * "gt x" -> "y:" (with y = x + 1) + * + * pat_parse_dotted_ver(): + * + * "eq x.y" -> "x.y" or "x.y:x.y" + * "le x.y" -> ":x.y" + * "lt x.y" -> ":w.z" (with w.z = x.y - 1) + * "ge x.y" -> "x.y:" + * "gt x.y" -> "w.z:" (with w.z = x.y + 1) + * + * If y is not present, assume that is "0". + * + * The syntax eq, le, lt, ge and gt are proper to the acl syntax. The + * following block of code detect the operator, and rewrite each value + * in parsable string. + */ + if (expr->pat.parse == pat_parse_int || + expr->pat.parse == pat_parse_dotted_ver) { + /* Check for operator. If the argument is operator, memorise it and + * continue to the next argument. + */ + op = get_std_op(arg); + if (op != -1) { + operator = op; + args++; + continue; + } + + /* Check if the pattern contain ':' or '-' character. */ + contain_colon = (strchr(arg, ':') || strchr(arg, '-')); + + /* If the pattern contain ':' or '-' character, give it to the parser as is. + * If no contain ':' and operator is STD_OP_EQ, give it to the parser as is. + * In other case, try to convert the value according with the operator. + */ + if (!contain_colon && operator != STD_OP_EQ) { + /* Search '.' separator. */ + dot = strchr(arg, '.'); + if (!dot) { + have_dot = 0; + minor = 0; + dot = arg + strlen(arg); + } + else + have_dot = 1; + + /* convert the integer minor part for the pat_parse_dotted_ver() function. */ + if (expr->pat.parse == pat_parse_dotted_ver && have_dot) { + if (strl2llrc(dot+1, strlen(dot+1), &minor) != 0) { + memprintf(err, "'%s' is neither a number nor a supported operator", arg); + goto out_free_expr; + } + if (minor >= 65536) { + memprintf(err, "'%s' contains too large a minor value", arg); + goto out_free_expr; + } + } + + /* convert the integer value for the pat_parse_int() function, and the + * integer major part for the pat_parse_dotted_ver() function. + */ + if (strl2llrc(arg, dot - arg, &value) != 0) { + memprintf(err, "'%s' is neither a number nor a supported operator", arg); + goto out_free_expr; + } + if (expr->pat.parse == pat_parse_dotted_ver) { + if (value >= 65536) { + memprintf(err, "'%s' contains too large a major value", arg); + goto out_free_expr; + } + value = (value << 16) | (minor & 0xffff); + } + + switch (operator) { + + case STD_OP_EQ: /* this case is not possible. */ + memprintf(err, "internal error"); + goto out_free_expr; + + case STD_OP_GT: + value++; /* gt = ge + 1 */ + __fallthrough; + + case STD_OP_GE: + if (expr->pat.parse == pat_parse_int) + snprintf(buffer, NB_LLMAX_STR+NB_LLMAX_STR+2, "%lld:", value); + else + snprintf(buffer, NB_LLMAX_STR+NB_LLMAX_STR+2, "%lld.%lld:", + value >> 16, value & 0xffff); + arg = buffer; + break; + + case STD_OP_LT: + value--; /* lt = le - 1 */ + __fallthrough; + + case STD_OP_LE: + if (expr->pat.parse == pat_parse_int) + snprintf(buffer, NB_LLMAX_STR+NB_LLMAX_STR+2, ":%lld", value); + else + snprintf(buffer, NB_LLMAX_STR+NB_LLMAX_STR+2, ":%lld.%lld", + value >> 16, value & 0xffff); + arg = buffer; + break; + } + } + } + + /* Add sample to the reference, and try to compile it fior each pattern + * using this value. 
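The operator-to-range rewriting described in that comment is easiest to see with concrete values. Below is a compilable sketch of the same transformation for the plain-integer case handled by pat_parse_int(); the dotted-version variant is analogous but packs major/minor into one value.

```c
#include <stdio.h>
#include <string.h>

/* Rewrite "<op> <value>" into HAProxy's range syntax, following the
 * value++ / value-- adjustments in the code above. Simplified sketch,
 * not the real parser driver. */
static void int_op_to_range(const char *op, long long v, char *out, size_t sz)
{
	if (strcmp(op, "eq") == 0)      snprintf(out, sz, "%lld", v);
	else if (strcmp(op, "le") == 0) snprintf(out, sz, ":%lld", v);
	else if (strcmp(op, "lt") == 0) snprintf(out, sz, ":%lld", v - 1); /* lt = le - 1 */
	else if (strcmp(op, "ge") == 0) snprintf(out, sz, "%lld:", v);
	else if (strcmp(op, "gt") == 0) snprintf(out, sz, "%lld:", v + 1); /* gt = ge + 1 */
}

int main(void)
{
	const char *ops[] = { "eq", "le", "lt", "ge", "gt" };
	char buf[64];

	for (int i = 0; i < 5; i++) {
		int_op_to_range(ops[i], 10, buf, sizeof(buf));
		printf("%s 10 -> \"%s\"\n", ops[i], buf);
	}
	return 0;
}
```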
+ */ + if (!pat_ref_add(ref, arg, NULL, err)) + goto out_free_expr; + args++; + } + + return expr; + + out_free_expr: + prune_acl_expr(expr); + free(expr); + out_free_smp: + free(ckw); + free(smp); + out_return: + return NULL; +} + +/* Purge everything in the acl <acl>, then return <acl>. */ +struct acl *prune_acl(struct acl *acl) { + + struct acl_expr *expr, *exprb; + + free(acl->name); + + list_for_each_entry_safe(expr, exprb, &acl->expr, list) { + LIST_DELETE(&expr->list); + prune_acl_expr(expr); + free(expr); + } + + return acl; +} + +/* Walk the ACL tree, following nested acl() sample fetches, for no more than + * max_recurse evaluations. Returns -1 if a recursive loop is detected, 0 if + * the max_recurse was reached, otherwise the number of max_recurse left. + */ +static int parse_acl_recurse(struct acl *acl, struct acl_expr *expr, int max_recurse) +{ + struct acl_term *term; + struct acl_sample *sample; + + if (strcmp(expr->smp->fetch->kw, "acl") != 0) + return max_recurse; + + if (--max_recurse <= 0) + return 0; + + sample = (struct acl_sample *)expr->smp->arg_p->data.ptr; + list_for_each_entry(term, &sample->suite.terms, list) { + if (term->acl == acl) + return -1; + list_for_each_entry(expr, &term->acl->expr, list) { + max_recurse = parse_acl_recurse(acl, expr, max_recurse); + if (max_recurse <= 0) + return max_recurse; + } + } + + return max_recurse; +} + +/* Parse an ACL with the name starting at <args>[0], and with a list of already + * known ACLs in <acl>. If the ACL was not in the list, it will be added. + * A pointer to that ACL is returned. If the ACL has an empty name, then it's + * an anonymous one and it won't be merged with any other one. If <err> is not + * NULL, it will be filled with an appropriate error. This pointer must be + * freeable or NULL. <al> is the arg_list serving as a head for unresolved + * dependencies. It may be NULL if such dependencies are not allowed. + * + * args syntax: <aclname> <acl_expr> + */ +struct acl *parse_acl(const char **args, struct list *known_acl, char **err, struct arg_list *al, + const char *file, int line) +{ + __label__ out_return, out_free_acl_expr, out_free_name; + struct acl *cur_acl; + struct acl_expr *acl_expr; + char *name; + const char *pos; + + if (**args && (pos = invalid_char(*args))) { + memprintf(err, "invalid character in ACL name : '%c'", *pos); + goto out_return; + } + + acl_expr = parse_acl_expr(args + 1, err, al, file, line); + if (!acl_expr) { + /* parse_acl_expr will have filled <err> here */ + goto out_return; + } + + /* Check for args beginning with an opening parenthesis just after the + * subject, as this is almost certainly a typo. Right now we can only + * emit a warning, so let's do so. + */ + if (!strchr(args[1], '(') && *args[2] == '(') + ha_warning("parsing acl '%s' :\n" + " matching '%s' for pattern '%s' is likely a mistake and probably\n" + " not what you want. 
Maybe you need to remove the extraneous space before '('.\n" + " If you are really sure this is not an error, please insert '--' between the\n" + " match and the pattern to make this warning message disappear.\n", + args[0], args[1], args[2]); + + if (*args[0]) + cur_acl = find_acl_by_name(args[0], known_acl); + else + cur_acl = NULL; + + if (cur_acl) { + int ret = parse_acl_recurse(cur_acl, acl_expr, ACL_MAX_RECURSE); + if (ret <= 0) { + if (ret < 0) + memprintf(err, "have a recursive loop"); + else + memprintf(err, "too deep acl() tree"); + goto out_free_acl_expr; + } + } else { + name = strdup(args[0]); + if (!name) { + memprintf(err, "out of memory when parsing ACL"); + goto out_free_acl_expr; + } + cur_acl = calloc(1, sizeof(*cur_acl)); + if (cur_acl == NULL) { + memprintf(err, "out of memory when parsing ACL"); + goto out_free_name; + } + + LIST_INIT(&cur_acl->expr); + LIST_APPEND(known_acl, &cur_acl->list); + cur_acl->name = name; + } + + /* We want to know what features the ACL needs (typically HTTP parsing), + * and where it may be used. If an ACL relies on multiple matches, it is + * OK if at least one of them may match in the context where it is used. + */ + cur_acl->use |= acl_expr->smp->fetch->use; + cur_acl->val |= acl_expr->smp->fetch->val; + LIST_APPEND(&cur_acl->expr, &acl_expr->list); + return cur_acl; + + out_free_name: + free(name); + out_free_acl_expr: + prune_acl_expr(acl_expr); + free(acl_expr); + out_return: + return NULL; +} + +/* Some useful ACLs provided by default. Only those used are allocated. */ + +const struct { + const char *name; + const char *expr[4]; /* put enough for longest expression */ +} default_acl_list[] = { + { .name = "TRUE", .expr = {"always_true",""}}, + { .name = "FALSE", .expr = {"always_false",""}}, + { .name = "LOCALHOST", .expr = {"src","127.0.0.1/8","::1",""}}, + { .name = "HTTP", .expr = {"req.proto_http",""}}, + { .name = "HTTP_1.0", .expr = {"req.ver","1.0",""}}, + { .name = "HTTP_1.1", .expr = {"req.ver","1.1",""}}, + { .name = "HTTP_2.0", .expr = {"req.ver","2.0",""}}, + { .name = "HTTP_3.0", .expr = {"req.ver","3.0",""}}, + { .name = "METH_CONNECT", .expr = {"method","CONNECT",""}}, + { .name = "METH_DELETE", .expr = {"method","DELETE",""}}, + { .name = "METH_GET", .expr = {"method","GET","HEAD",""}}, + { .name = "METH_HEAD", .expr = {"method","HEAD",""}}, + { .name = "METH_OPTIONS", .expr = {"method","OPTIONS",""}}, + { .name = "METH_POST", .expr = {"method","POST",""}}, + { .name = "METH_PUT", .expr = {"method","PUT",""}}, + { .name = "METH_TRACE", .expr = {"method","TRACE",""}}, + { .name = "HTTP_URL_ABS", .expr = {"url_reg","^[^/:]*://",""}}, + { .name = "HTTP_URL_SLASH", .expr = {"url_beg","/",""}}, + { .name = "HTTP_URL_STAR", .expr = {"url","*",""}}, + { .name = "HTTP_CONTENT", .expr = {"req.hdr_val(content-length)","gt","0",""}}, + { .name = "RDP_COOKIE", .expr = {"req.rdp_cookie_cnt","gt","0",""}}, + { .name = "REQ_CONTENT", .expr = {"req.len","gt","0",""}}, + { .name = "WAIT_END", .expr = {"wait_end",""}}, + { .name = NULL, .expr = {""}} +}; + +/* Find a default ACL from the default_acl list, compile it and return it. + * If the ACL is not found, NULL is returned. In theory, it cannot fail, + * except when default ACLs are broken, in which case it will return NULL. + * If <known_acl> is not NULL, the ACL will be queued at its tail. If <err> is + * not NULL, it will be filled with an error message if an error occurs. This + * pointer must be freeable or NULL. 
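parse_acl_recurse() above bounds its walk with a decrementing budget and treats a return to the starting ACL as a loop. A minimal standalone model of that convention (-1 for a loop, 0 for exhaustion, a positive leftover budget for success), using a hypothetical node type:

```c
#include <stdio.h>

/* Hypothetical minimal node type standing in for nested acl() terms. */
struct node { const char *name; struct node *next; };

static int walk(struct node *start, struct node *cur, int budget)
{
	if (!cur)
		return budget;  /* leaf reached: budget left over */
	if (--budget <= 0)
		return 0;       /* too deep */
	if (cur == start)
		return -1;      /* recursive loop detected */
	return walk(start, cur->next, budget);
}

int main(void)
{
	struct node c = { "c", 0 }, b = { "b", &c }, a = { "a", &b };

	printf("linear: %d\n", walk(&a, a.next, 10)); /* positive: OK */
	c.next = &a;                                  /* create a loop */
	printf("loop:   %d\n", walk(&a, a.next, 10)); /* -> -1 */
	return 0;
}
```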
<al> is an arg_list serving as a list head + * to report missing dependencies. It may be NULL if such dependencies are not + * allowed. + */ +static struct acl *find_acl_default(const char *acl_name, struct list *known_acl, + char **err, struct arg_list *al, + const char *file, int line) +{ + __label__ out_return, out_free_acl_expr, out_free_name; + struct acl *cur_acl; + struct acl_expr *acl_expr; + char *name; + int index; + + for (index = 0; default_acl_list[index].name != NULL; index++) { + if (strcmp(acl_name, default_acl_list[index].name) == 0) + break; + } + + if (default_acl_list[index].name == NULL) { + memprintf(err, "no such ACL : '%s'", acl_name); + return NULL; + } + + acl_expr = parse_acl_expr((const char **)default_acl_list[index].expr, err, al, file, line); + if (!acl_expr) { + /* parse_acl_expr must have filled err here */ + goto out_return; + } + + name = strdup(acl_name); + if (!name) { + memprintf(err, "out of memory when building default ACL '%s'", acl_name); + goto out_free_acl_expr; + } + + cur_acl = calloc(1, sizeof(*cur_acl)); + if (cur_acl == NULL) { + memprintf(err, "out of memory when building default ACL '%s'", acl_name); + goto out_free_name; + } + + cur_acl->name = name; + cur_acl->use |= acl_expr->smp->fetch->use; + cur_acl->val |= acl_expr->smp->fetch->val; + LIST_INIT(&cur_acl->expr); + LIST_APPEND(&cur_acl->expr, &acl_expr->list); + if (known_acl) + LIST_APPEND(known_acl, &cur_acl->list); + + return cur_acl; + + out_free_name: + free(name); + out_free_acl_expr: + prune_acl_expr(acl_expr); + free(acl_expr); + out_return: + return NULL; +} + +/* Parse an ACL condition starting at <args>[0], relying on a list of already + * known ACLs passed in <known_acl>. The new condition is returned (or NULL in + * case of low memory). Supports multiple conditions separated by "or". If + * <err> is not NULL, it will be filled with a pointer to an error message in + * case of error, that the caller is responsible for freeing. The initial + * location must either be freeable or NULL. The list <al> serves as a list head + * for unresolved dependencies. It may be NULL if such dependencies are not + * allowed. + */ +struct acl_cond *parse_acl_cond(const char **args, struct list *known_acl, + enum acl_cond_pol pol, char **err, struct arg_list *al, + const char *file, int line) +{ + __label__ out_return, out_free_suite, out_free_term; + int arg, neg; + const char *word; + struct acl *cur_acl; + struct acl_term *cur_term; + struct acl_term_suite *cur_suite; + struct acl_cond *cond; + unsigned int suite_val; + + cond = calloc(1, sizeof(*cond)); + if (cond == NULL) { + memprintf(err, "out of memory when parsing condition"); + goto out_return; + } + + LIST_INIT(&cond->list); + LIST_INIT(&cond->suites); + cond->pol = pol; + cond->val = 0; + + cur_suite = NULL; + suite_val = ~0U; + neg = 0; + for (arg = 0; *args[arg]; arg++) { + word = args[arg]; + + /* remove as many exclamation marks as we can */ + while (*word == '!') { + neg = !neg; + word++; + } + + /* an empty word is allowed because we cannot force the user to + * always think about not leaving exclamation marks alone. + */ + if (!*word) + continue; + + if (strcasecmp(word, "or") == 0 || strcmp(word, "||") == 0) { + /* new term suite */ + cond->val |= suite_val; + suite_val = ~0U; + cur_suite = NULL; + neg = 0; + continue; + } + + if (strcmp(word, "{") == 0) { + /* we may have a complete ACL expression between two braces, + * find the last one. 
+ */ + int arg_end = arg + 1; + const char **args_new; + + while (*args[arg_end] && strcmp(args[arg_end], "}") != 0) + arg_end++; + + if (!*args[arg_end]) { + memprintf(err, "missing closing '}' in condition"); + goto out_free_suite; + } + + args_new = calloc(1, (arg_end - arg + 1) * sizeof(*args_new)); + if (!args_new) { + memprintf(err, "out of memory when parsing condition"); + goto out_free_suite; + } + + args_new[0] = ""; + memcpy(args_new + 1, args + arg + 1, (arg_end - arg) * sizeof(*args_new)); + args_new[arg_end - arg] = ""; + cur_acl = parse_acl(args_new, known_acl, err, al, file, line); + free(args_new); + + if (!cur_acl) { + /* note that parse_acl() must have filled <err> here */ + goto out_free_suite; + } + arg = arg_end; + } + else { + /* search for <word> in the known ACL names. If we do not find + * it, let's look for it in the default ACLs, and if found, add + * it to the list of ACLs of this proxy. This makes it possible + * to override them. + */ + cur_acl = find_acl_by_name(word, known_acl); + if (cur_acl == NULL) { + cur_acl = find_acl_default(word, known_acl, err, al, file, line); + if (cur_acl == NULL) { + /* note that find_acl_default() must have filled <err> here */ + goto out_free_suite; + } + } + } + + cur_term = calloc(1, sizeof(*cur_term)); + if (cur_term == NULL) { + memprintf(err, "out of memory when parsing condition"); + goto out_free_suite; + } + + cur_term->acl = cur_acl; + cur_term->neg = neg; + + /* Here it is a bit complex. The acl_term_suite is a conjunction + * of many terms. It may only be used if all of its terms are + * usable at the same time. So the suite's validity domain is an + * AND between all ACL keywords' ones. But, the global condition + * is valid if at least one term suite is OK. So it's an OR between + * all of their validity domains. We could emit a warning as soon + * as suite_val is null because it means that the last ACL is not + * compatible with the previous ones. Let's remain simple for now. + */ + cond->use |= cur_acl->use; + suite_val &= cur_acl->val; + + if (!cur_suite) { + cur_suite = calloc(1, sizeof(*cur_suite)); + if (cur_suite == NULL) { + memprintf(err, "out of memory when parsing condition"); + goto out_free_term; + } + LIST_INIT(&cur_suite->terms); + LIST_APPEND(&cond->suites, &cur_suite->list); + } + LIST_APPEND(&cur_suite->terms, &cur_term->list); + neg = 0; + } + + cond->val |= suite_val; + return cond; + + out_free_term: + free(cur_term); + out_free_suite: + free_acl_cond(cond); + out_return: + return NULL; +} + +/* Builds an ACL condition starting at the if/unless keyword. The complete + * condition is returned. NULL is returned in case of error or if the first + * word is neither "if" nor "unless". It automatically sets the file name and + * the line number in the condition for better error reporting, and sets the + * HTTP initialization requirements in the proxy. If <err> is not NULL, it will + * be filled with a pointer to an error message in case of error, that the + * caller is responsible for freeing. The initial location must either be + * freeable or NULL. 
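parse_acl_cond() above folds '!' marks into a per-term negation flag and starts a new AND-suite whenever it meets "or" or "||". The following toy walk reproduces just that grouping logic on a hard-coded word list (the ACL names are made up):

```c
#include <stdio.h>
#include <string.h>
#include <strings.h>

int main(void)
{
	/* words as parse_acl_cond() would see them after tokenization */
	const char *args[] = { "is_static", "!is_host_down", "or", "src_internal", "" };
	int suite = 0, neg;

	printf("suite %d:", suite);
	for (int i = 0; *args[i]; i++) {
		const char *w = args[i];

		if (strcasecmp(w, "or") == 0 || strcmp(w, "||") == 0) {
			printf("\nsuite %d:", ++suite); /* new AND-suite */
			continue;
		}
		neg = 0;
		while (*w == '!') { neg = !neg; w++; } /* fold '!' marks */
		printf(" %s%s", neg ? "NOT " : "", w);
	}
	printf("\n");
	return 0;
}
```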
+ */ +struct acl_cond *build_acl_cond(const char *file, int line, struct list *known_acl, + struct proxy *px, const char **args, char **err) +{ + enum acl_cond_pol pol = ACL_COND_NONE; + struct acl_cond *cond = NULL; + + if (err) + *err = NULL; + + if (strcmp(*args, "if") == 0) { + pol = ACL_COND_IF; + args++; + } + else if (strcmp(*args, "unless") == 0) { + pol = ACL_COND_UNLESS; + args++; + } + else { + memprintf(err, "conditions must start with either 'if' or 'unless'"); + return NULL; + } + + cond = parse_acl_cond(args, known_acl, pol, err, &px->conf.args, file, line); + if (!cond) { + /* note that parse_acl_cond must have filled <err> here */ + return NULL; + } + + cond->file = file; + cond->line = line; + px->http_needed |= !!(cond->use & SMP_USE_HTTP_ANY); + return cond; +} + +/* Execute condition <cond> and return either ACL_TEST_FAIL, ACL_TEST_MISS or + * ACL_TEST_PASS depending on the test results. ACL_TEST_MISS may only be + * returned if <opt> does not contain SMP_OPT_FINAL, indicating that incomplete + * data is being examined. The function automatically sets SMP_OPT_ITERATE. This + * function only computes the condition, it does not apply the polarity required + * by IF/UNLESS, it's up to the caller to do this using something like this : + * + * res = acl_pass(res); + * if (res == ACL_TEST_MISS) + * return 0; + * if (cond->pol == ACL_COND_UNLESS) + * res = !res; + */ +enum acl_test_res acl_exec_cond(struct acl_cond *cond, struct proxy *px, struct session *sess, struct stream *strm, unsigned int opt) +{ + __label__ fetch_next; + struct acl_term_suite *suite; + struct acl_term *term; + struct acl_expr *expr; + struct acl *acl; + struct sample smp; + enum acl_test_res acl_res, suite_res, cond_res; + + /* ACLs are iterated over all values, so let's always set the flag to + * indicate this to the fetch functions. + */ + opt |= SMP_OPT_ITERATE; + + /* We're doing a logical OR between conditions so we initialize to FAIL. + * The MISS status is propagated down from the suites. + */ + cond_res = ACL_TEST_FAIL; + list_for_each_entry(suite, &cond->suites, list) { + /* Evaluate condition suite <suite>. We stop at the first term + * which returns ACL_TEST_FAIL. The MISS status is still propagated + * in case of uncertainty in the result. + */ + + /* we're doing a logical AND between terms, so we must set the + * initial value to PASS. + */ + suite_res = ACL_TEST_PASS; + list_for_each_entry(term, &suite->terms, list) { + acl = term->acl; + + /* FIXME: use cache ! + * check acl->cache_idx for this. + */ + + /* ACL result not cached. Let's scan all the expressions + * and use the first one to match. + */ + acl_res = ACL_TEST_FAIL; + list_for_each_entry(expr, &acl->expr, list) { + /* we need to reset context and flags */ + memset(&smp, 0, sizeof(smp)); + fetch_next: + if (!sample_process(px, sess, strm, opt, expr->smp, &smp)) { + /* maybe we could not fetch because of missing data */ + if (smp.flags & SMP_F_MAY_CHANGE && !(opt & SMP_OPT_FINAL)) + acl_res |= ACL_TEST_MISS; + continue; + } + + acl_res |= pat2acl(pattern_exec_match(&expr->pat, &smp, 0)); + /* + * OK now acl_res holds the result of this expression + * as one of ACL_TEST_FAIL, ACL_TEST_MISS or ACL_TEST_PASS. + * + * Then if (!MISS) we can cache the result, and put + * (smp.flags & SMP_F_VOLATILE) in the cache flags. + * + * FIXME: implement cache. 
+ * + */ + + /* we're ORing these terms, so a single PASS is enough */ + if (acl_res == ACL_TEST_PASS) + break; + + if (smp.flags & SMP_F_NOT_LAST) + goto fetch_next; + + /* sometimes we know the fetched data is subject to change + * later and give another chance for a new match (eg: request + * size, time, ...) + */ + if (smp.flags & SMP_F_MAY_CHANGE && !(opt & SMP_OPT_FINAL)) + acl_res |= ACL_TEST_MISS; + } + /* + * Here we have the result of an ACL (cached or not). + * ACLs are combined, negated or not, to form conditions. + */ + + if (term->neg) + acl_res = acl_neg(acl_res); + + suite_res &= acl_res; + + /* we're ANDing these terms, so a single FAIL or MISS is enough */ + if (suite_res != ACL_TEST_PASS) + break; + } + cond_res |= suite_res; + + /* we're ORing these terms, so a single PASS is enough */ + if (cond_res == ACL_TEST_PASS) + break; + } + return cond_res; +} + +/* Returns a pointer to the first ACL conflicting with usage at place <where> + * which is one of the SMP_VAL_* bits indicating a check place, or NULL if + * no conflict is found. Only full conflicts are detected (ACL is not usable). + * Use the next function to check for useless keywords. + */ +const struct acl *acl_cond_conflicts(const struct acl_cond *cond, unsigned int where) +{ + struct acl_term_suite *suite; + struct acl_term *term; + struct acl *acl; + + list_for_each_entry(suite, &cond->suites, list) { + list_for_each_entry(term, &suite->terms, list) { + acl = term->acl; + if (!(acl->val & where)) + return acl; + } + } + return NULL; +} + +/* Returns a pointer to the first ACL and its first keyword to conflict with + * usage at place <where> which is one of the SMP_VAL_* bits indicating a check + * place. Returns true if a conflict is found, with <acl> and <kw> set (if non + * null), or false if not conflict is found. The first useless keyword is + * returned. + */ +int acl_cond_kw_conflicts(const struct acl_cond *cond, unsigned int where, struct acl const **acl, char const **kw) +{ + struct acl_term_suite *suite; + struct acl_term *term; + struct acl_expr *expr; + + list_for_each_entry(suite, &cond->suites, list) { + list_for_each_entry(term, &suite->terms, list) { + list_for_each_entry(expr, &term->acl->expr, list) { + if (!(expr->smp->fetch->val & where)) { + if (acl) + *acl = term->acl; + if (kw) + *kw = expr->kw; + return 1; + } + } + } + } + return 0; +} + +/* + * Find targets for userlist and groups in acl. Function returns the number + * of errors or OK if everything is fine. It must be called only once sample + * fetch arguments have been resolved (after smp_resolve_args()). + */ +int acl_find_targets(struct proxy *p) +{ + + struct acl *acl; + struct acl_expr *expr; + struct pattern_list *pattern; + int cfgerr = 0; + struct pattern_expr_list *pexp; + + list_for_each_entry(acl, &p->acl, list) { + list_for_each_entry(expr, &acl->expr, list) { + if (strcmp(expr->kw, "http_auth_group") == 0) { + /* Note: the ARGT_USR argument may only have been resolved earlier + * by smp_resolve_args(). + */ + if (expr->smp->arg_p->unresolved) { + ha_alert("Internal bug in proxy %s: %sacl %s %s() makes use of unresolved userlist '%s'. Please report this.\n", + p->id, *acl->name ? "" : "anonymous ", acl->name, expr->kw, + expr->smp->arg_p->data.str.area); + cfgerr++; + continue; + } + + if (LIST_ISEMPTY(&expr->pat.head)) { + ha_alert("proxy %s: acl %s %s(): no groups specified.\n", + p->id, acl->name, expr->kw); + cfgerr++; + continue; + } + + /* For each pattern, check if the group exists. 
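In acl_exec_cond() above, the `suite_res &= acl_res` / `cond_res |= suite_res` combination works because of the bit encoding of the three results (FAIL=0, MISS=1, PASS=3): a MISS degrades an AND and survives an OR until a PASS appears. A tiny demonstration, with a negation helper implementing the described semantics (PASS and FAIL swap, MISS stays MISS):

```c
#include <stdio.h>

/* Three-valued ACL results; MISS means "not enough data yet". */
enum res { FAIL = 0, MISS = 1, PASS = 3 };

static enum res neg(enum res r)
{
	return r == MISS ? MISS : (r == PASS ? FAIL : PASS);
}

int main(void)
{
	/* a suite is an AND of terms: one FAIL kills it, one MISS degrades it */
	enum res suite = PASS;
	suite &= PASS;  /* still PASS (3 & 3 = 3) */
	suite &= MISS;  /* MISS (3 & 1 = 1) */

	/* a condition is an OR of suites: one PASS is enough */
	enum res cond = FAIL;
	cond |= suite;  /* MISS (0 | 1 = 1) */
	cond |= PASS;   /* PASS */

	printf("suite=%d cond=%d neg(MISS)=%d\n", suite, cond, neg(MISS));
	return 0;
}
```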
*/ + list_for_each_entry(pexp, &expr->pat.head, list) { + if (LIST_ISEMPTY(&pexp->expr->patterns)) { + ha_alert("proxy %s: acl %s %s(): no groups specified.\n", + p->id, acl->name, expr->kw); + cfgerr++; + continue; + } + + list_for_each_entry(pattern, &pexp->expr->patterns, list) { + /* this keyword only has one argument */ + if (!check_group(expr->smp->arg_p->data.usr, pattern->pat.ptr.str)) { + ha_alert("proxy %s: acl %s %s(): invalid group '%s'.\n", + p->id, acl->name, expr->kw, pattern->pat.ptr.str); + cfgerr++; + } + } + } + } + } + } + + return cfgerr; +} + +/* initializes ACLs by resolving the sample fetch names they rely upon. + * Returns 0 on success, otherwise an error. + */ +int init_acl() +{ + int err = 0; + int index; + const char *name; + struct acl_kw_list *kwl; + struct sample_fetch *smp; + + list_for_each_entry(kwl, &acl_keywords.list, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) { + name = kwl->kw[index].fetch_kw; + if (!name) + name = kwl->kw[index].kw; + + smp = find_sample_fetch(name, strlen(name)); + if (!smp) { + ha_alert("Critical internal error: ACL keyword '%s' relies on sample fetch '%s' which was not registered!\n", + kwl->kw[index].kw, name); + err++; + continue; + } + kwl->kw[index].smp = smp; + } + } + return err; +} + +/* dump known ACL keywords on stdout */ +void acl_dump_kwd(void) +{ + struct acl_kw_list *kwl; + const struct acl_keyword *kwp, *kw; + const char *name; + int index; + + for (kw = kwp = NULL;; kwp = kw) { + list_for_each_entry(kwl, &acl_keywords.list, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) { + if (strordered(kwp ? kwp->kw : NULL, + kwl->kw[index].kw, + kw != kwp ? kw->kw : NULL)) + kw = &kwl->kw[index]; + } + } + + if (kw == kwp) + break; + + name = kw->fetch_kw; + if (!name) + name = kw->kw; + + printf("%s = %s -m %s\n", kw->kw, name, pat_match_names[kw->match_type]); + } +} + +/* Purge everything in the acl_cond <cond>, then free <cond> */ +void free_acl_cond(struct acl_cond *cond) +{ + struct acl_term_suite *suite, *suiteb; + struct acl_term *term, *termb; + + if (!cond) + return; + + list_for_each_entry_safe(suite, suiteb, &cond->suites, list) { + list_for_each_entry_safe(term, termb, &suite->terms, list) { + LIST_DELETE(&term->list); + free(term); + } + LIST_DELETE(&suite->list); + free(suite); + } + + free(cond); +} + + +static int smp_fetch_acl(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct acl_sample *acl_sample = (struct acl_sample *)args->data.ptr; + enum acl_test_res ret; + + ret = acl_exec_cond(&acl_sample->cond, smp->px, smp->sess, smp->strm, smp->opt); + if (ret == ACL_TEST_MISS) + return 0; + smp->data.u.sint = ret == ACL_TEST_PASS; + smp->data.type = SMP_T_BOOL; + return 1; +} + +int smp_fetch_acl_parse(struct arg *args, char **err_msg) +{ + struct acl_sample *acl_sample; + char *name; + int i; + + for (i = 0; args[i].type != ARGT_STOP; i++) + ; + acl_sample = calloc(1, sizeof(struct acl_sample) + sizeof(struct acl_term) * i); + LIST_INIT(&acl_sample->suite.terms); + LIST_INIT(&acl_sample->cond.suites); + LIST_APPEND(&acl_sample->cond.suites, &acl_sample->suite.list); + acl_sample->cond.val = ~0U; // the keyword is valid everywhere for now. 
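acl_dump_kwd() above (like dump_act_rules() further down) prints keywords alphabetically without sorting: each pass scans for the smallest name strictly greater than the last one printed, via strordered(). The same selection idea in standalone form:

```c
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *const kws[] = { "path", "hdr", "src", "method" };
	const char *prev = NULL;

	for (;;) {
		const char *next = NULL;

		/* pick the smallest name greater than the previous one */
		for (size_t i = 0; i < sizeof(kws) / sizeof(*kws); i++) {
			if ((!prev || strcmp(kws[i], prev) > 0) &&
			    (!next || strcmp(kws[i], next) < 0))
				next = kws[i];
		}
		if (!next)
			break;
		printf("%s\n", next); /* hdr, method, path, src */
		prev = next;
	}
	return 0;
}
```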
+ + args->data.ptr = acl_sample; + + for (i = 0; args[i].type != ARGT_STOP; i++) { + name = args[i].data.str.area; + if (name[0] == '!') { + acl_sample->terms[i].neg = 1; + name++; + } + + if (!(acl_sample->terms[i].acl = find_acl_by_name(name, &curproxy->acl))) { + memprintf(err_msg, "ACL '%s' not found", name); + goto err; + } + + acl_sample->cond.use |= acl_sample->terms[i].acl->use; + acl_sample->cond.val &= acl_sample->terms[i].acl->val; + + LIST_APPEND(&acl_sample->suite.terms, &acl_sample->terms[i].list); + } + + return 1; + +err: + free(acl_sample); + return 0; +} + +/************************************************************************/ +/* All supported sample and ACL keywords must be declared here. */ +/************************************************************************/ + +/* Note: must not be declared <const> as its list will be overwritten. + * Please take care of keeping this list alphabetically sorted. + */ +static struct acl_kw_list acl_kws = {ILH, { + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, acl_register_keywords, &acl_kws); + +static struct sample_fetch_kw_list smp_kws = {ILH, { + { "acl", smp_fetch_acl, ARG12(1,STR,STR,STR,STR,STR,STR,STR,STR,STR,STR,STR,STR), smp_fetch_acl_parse, SMP_T_BOOL, SMP_USE_CONST }, + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &smp_kws); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/action.c b/src/action.c new file mode 100644 index 0000000..47f5f86 --- /dev/null +++ b/src/action.c @@ -0,0 +1,363 @@ +/* + * Action management functions. + * + * Copyright 2017 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <haproxy/acl.h> +#include <haproxy/action.h> +#include <haproxy/api.h> +#include <haproxy/cfgparse.h> +#include <haproxy/errors.h> +#include <haproxy/list.h> +#include <haproxy/obj_type.h> +#include <haproxy/pool.h> +#include <haproxy/proxy.h> +#include <haproxy/stick_table.h> +#include <haproxy/task.h> +#include <haproxy/tools.h> + + +/* Check an action ruleset validity. It returns the number of error encountered + * and err_code is updated if a warning is emitted. + */ +int check_action_rules(struct list *rules, struct proxy *px, int *err_code) +{ + struct act_rule *rule; + char *errmsg = NULL; + int err = 0; + + list_for_each_entry(rule, rules, list) { + if (rule->check_ptr && !rule->check_ptr(rule, px, &errmsg)) { + ha_alert("Proxy '%s': %s.\n", px->id, errmsg); + err++; + } + *err_code |= warnif_tcp_http_cond(px, rule->cond); + ha_free(&errmsg); + } + + return err; +} + +/* Find and check the target table used by an action track-sc*. This + * function should be called during the configuration validity check. + * + * The function returns 1 in success case, otherwise, it returns 0 and err is + * filled. + */ +int check_trk_action(struct act_rule *rule, struct proxy *px, char **err) +{ + struct stktable *target; + + if (rule->arg.trk_ctr.table.n) + target = stktable_find_by_name(rule->arg.trk_ctr.table.n); + else + target = px->table; + + if (!target) { + memprintf(err, "unable to find table '%s' referenced by track-sc%d", + rule->arg.trk_ctr.table.n ? 
rule->arg.trk_ctr.table.n : px->id, + rule->action); + return 0; + } + + if (!stktable_compatible_sample(rule->arg.trk_ctr.expr, target->type)) { + memprintf(err, "stick-table '%s' uses a type incompatible with the 'track-sc%d' rule", + rule->arg.trk_ctr.table.n ? rule->arg.trk_ctr.table.n : px->id, + rule->action); + return 0; + } + else { + if (!in_proxies_list(target->proxies_list, px)) { + px->next_stkt_ref = target->proxies_list; + target->proxies_list = px; + } + free(rule->arg.trk_ctr.table.n); + rule->arg.trk_ctr.table.t = target; + /* Note: if we decide to enhance the track-sc syntax, we may be + * able to pass a list of counters to track and allocate them + * right here using stktable_alloc_data_type(). + */ + } + + if (rule->from == ACT_F_TCP_REQ_CNT && (px->cap & PR_CAP_FE)) { + if (!px->tcp_req.inspect_delay && !(rule->arg.trk_ctr.expr->fetch->val & SMP_VAL_FE_SES_ACC)) { + ha_warning("%s '%s' : a 'tcp-request content track-sc*' rule explicitly depending on request" + " contents without any 'tcp-request inspect-delay' setting." + " This means that this rule will randomly find its contents. This can be fixed by" + " setting the tcp-request inspect-delay.\n", + proxy_type_str(px), px->id); + } + + /* The following warning is emitted because HTTP multiplexers are able to catch errors + * or timeouts at the session level, before instantiating any stream. + * Thus the tcp-request content ruleset will not be evaluated in such case. It means, + * http_req and http_err counters will not be incremented as expected, even if the tracked + * counter does not use the request content. To track invalid requests it should be + * performed at the session level using a tcp-request session rule. + */ + if (px->mode == PR_MODE_HTTP && + !(rule->arg.trk_ctr.expr->fetch->use & (SMP_USE_L6REQ|SMP_USE_HRQHV|SMP_USE_HRQHP|SMP_USE_HRQBO)) && + (!rule->cond || !(rule->cond->use & (SMP_USE_L6REQ|SMP_USE_HRQHV|SMP_USE_HRQHP|SMP_USE_HRQBO)))) { + ha_warning("%s '%s' : a 'tcp-request content track-sc*' rule not depending on request" + " contents for an HTTP frontend should be executed at the session level, using a" + " 'tcp-request session' rule (mandatory to track invalid HTTP requests).\n", + proxy_type_str(px), px->id); + } + } + + return 1; +} + +/* check a capture rule. This function should be called during the configuration + * validity check. + * + * The function returns 1 in success case, otherwise, it returns 0 and err is + * filled. + */ +int check_capture(struct act_rule *rule, struct proxy *px, char **err) +{ + if (rule->from == ACT_F_TCP_REQ_CNT && (px->cap & PR_CAP_FE) && !px->tcp_req.inspect_delay && + !(rule->arg.cap.expr->fetch->val & SMP_VAL_FE_SES_ACC)) { + ha_warning("%s '%s' : a 'tcp-request capture' rule explicitly depending on request" + " contents without any 'tcp-request inspect-delay' setting." + " This means that this rule will randomly find its contents. This can be fixed by" + " setting the tcp-request inspect-delay.\n", + proxy_type_str(px), px->id); + } + + return 1; +} + +int act_resolution_cb(struct resolv_requester *requester, struct dns_counters *counters) +{ + struct stream *stream; + + if (requester->resolution == NULL) + return 0; + + stream = objt_stream(requester->owner); + if (stream == NULL) + return 0; + + task_wakeup(stream->task, TASK_WOKEN_MSG); + + return 0; +} + +/* + * Do resolve error management callback + * returns: + * 0 if we can trash answser items. 
+ * 1 when safely ignored and we must kept answer items + */ +int act_resolution_error_cb(struct resolv_requester *requester, int error_code) +{ + struct stream *stream; + + if (requester->resolution == NULL) + return 0; + + stream = objt_stream(requester->owner); + if (stream == NULL) + return 0; + + task_wakeup(stream->task, TASK_WOKEN_MSG); + + return 0; +} + +/* Parse a set-timeout rule statement. It first checks if the timeout name is + * valid and proxy is capable of handling it, and returns it in <rule->arg.timeout.type>. + * Then the timeout is parsed as a plain value and * returned in <rule->arg.timeout.value>. + * If there is a parsing error, the value is reparsed as an expression and + * returned in <rule->arg.timeout.expr>. + * + * Returns -1 if the name is invalid or neither a time or an expression can be + * parsed, or if the timeout value is 0. + */ +int cfg_parse_rule_set_timeout(const char **args, int idx, struct act_rule *rule, + struct proxy *px, char **err) +{ + const char *res; + const char *timeout_name = args[idx++]; + + if (strcmp(timeout_name, "server") == 0) { + if (!(px->cap & PR_CAP_BE)) { + memprintf(err, "'%s' has no backend capability", px->id); + return -1; + } + rule->arg.timeout.type = ACT_TIMEOUT_SERVER; + } + else if (strcmp(timeout_name, "tunnel") == 0) { + if (!(px->cap & PR_CAP_BE)) { + memprintf(err, "'%s' has no backend capability", px->id); + return -1; + } + rule->arg.timeout.type = ACT_TIMEOUT_TUNNEL; + } + else if (strcmp(timeout_name, "client") == 0) { + if (!(px->cap & PR_CAP_FE)) { + memprintf(err, "'%s' has no frontend capability", px->id); + return -1; + } + rule->arg.timeout.type = ACT_TIMEOUT_CLIENT; + } + else { + memprintf(err, + "'set-timeout' rule supports 'server'/'tunnel'/'client' (got '%s')", + timeout_name); + return -1; + } + + res = parse_time_err(args[idx], (unsigned int *)&rule->arg.timeout.value, TIME_UNIT_MS); + if (res == PARSE_TIME_OVER) { + memprintf(err, "timer overflow in argument '%s' to rule 'set-timeout %s' (maximum value is 2147483647 ms or ~24.8 days)", + args[idx], timeout_name); + return -1; + } + else if (res == PARSE_TIME_UNDER) { + memprintf(err, "timer underflow in argument '%s' to rule 'set-timeout %s' (minimum value is 1 ms)", + args[idx], timeout_name); + return -1; + } + /* res not NULL, parsing error */ + else if (res) { + rule->arg.timeout.expr = sample_parse_expr((char **)args, &idx, px->conf.args.file, + px->conf.args.line, err, &px->conf.args, NULL); + if (!rule->arg.timeout.expr) { + memprintf(err, "unexpected character '%c' in rule 'set-timeout %s'", *res, timeout_name); + return -1; + } + } + /* res NULL, parsing ok but value is 0 */ + else if (!(rule->arg.timeout.value)) { + memprintf(err, "null value is not valid for a 'set-timeout %s' rule", + timeout_name); + return -1; + } + + return 0; +} + +/* tries to find in list <keywords> a similar looking action as the one in + * <word>, and returns it otherwise NULL. <word> may be NULL or empty. An + * optional array of extra words to compare may be passed in <extra>, but it + * must then be terminated by a NULL entry. If unused it may be NULL. 
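cfg_parse_rule_set_timeout() above distinguishes four outcomes from parse_time_err(): overflow, underflow, a trailing-garbage position, and success. A much-simplified stand-in showing the same outcome classification (the supported units and error signalling here are assumptions; the real parser handles more suffixes and returns dedicated sentinel pointers):

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static const char OVER[] = "overflow";

/* NULL means success; any other pointer flags where/why parsing stopped. */
static const char *parse_ms(const char *s, unsigned int *out)
{
	char *end;
	unsigned long long v = strtoull(s, &end, 10);

	if (end == s)
		return s;                           /* no number at all */
	if (strcmp(end, "s") == 0)
		v *= 1000;                          /* seconds -> ms */
	else if (strcmp(end, "m") == 0)
		v *= 60000;                         /* minutes -> ms */
	else if (*end && strcmp(end, "ms") != 0)
		return end;                         /* unknown unit */

	if (v > 2147483647ULL)
		return OVER;                        /* timer overflow */
	*out = (unsigned int)v;
	return NULL;
}

int main(void)
{
	unsigned int ms = 0;

	printf("30s -> %s (%u ms)\n", parse_ms("30s", &ms) ? "error" : "ok", ms);
	printf("10x -> %s\n", parse_ms("10x", &ms) ? "error" : "ok");
	printf("9000000000ms -> %s\n",
	       parse_ms("9000000000ms", &ms) == OVER ? "overflow" : "?");
	return 0;
}
```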
+ */ +const char *action_suggest(const char *word, const struct list *keywords, const char **extra) +{ + uint8_t word_sig[1024]; + uint8_t list_sig[1024]; + const struct action_kw_list *kwl; + const struct action_kw *best_kw = NULL; + const char *best_ptr = NULL; + int dist, best_dist = INT_MAX; + int index; + + if (!word || !*word) + return NULL; + + make_word_fingerprint(word_sig, word); + list_for_each_entry(kwl, keywords, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) { + make_word_fingerprint(list_sig, kwl->kw[index].kw); + dist = word_fingerprint_distance(word_sig, list_sig); + if (dist < best_dist) { + best_dist = dist; + best_kw = &kwl->kw[index]; + best_ptr = best_kw->kw; + } + } + } + + while (extra && *extra) { + make_word_fingerprint(list_sig, *extra); + dist = word_fingerprint_distance(word_sig, list_sig); + if (dist < best_dist) { + best_dist = dist; + best_kw = NULL; + best_ptr = *extra; + } + extra++; + } + + /* eliminate too different ones, with more tolerance for prefixes + * when they're known to exist (not from extra list). + */ + if (best_ptr && + (best_dist > (2 + (best_kw && (best_kw->flags & KWF_MATCH_PREFIX))) * strlen(word) || + best_dist > (2 + (best_kw && (best_kw->flags & KWF_MATCH_PREFIX))) * strlen(best_ptr))) + best_ptr = NULL; + + return best_ptr; +} + +/* allocates a rule for ruleset <from> (ACT_F_*), from file name <file> and + * line <linenum>. <file> and <linenum> may be zero if unknown. Returns the + * rule, or NULL in case of memory allocation error. + */ +struct act_rule *new_act_rule(enum act_from from, const char *file, int linenum) +{ + struct act_rule *rule; + + rule = calloc(1, sizeof(*rule)); + if (!rule) + return NULL; + rule->from = from; + rule->conf.file = file ? strdup(file) : NULL; + rule->conf.line = linenum; + LIST_INIT(&rule->list); + return rule; +} + +/* frees rule <rule> and its elements as well as the condition */ +void free_act_rule(struct act_rule *rule) +{ + LIST_DELETE(&rule->list); + free_acl_cond(rule->cond); + if (rule->release_ptr) + rule->release_ptr(rule); + free(rule->conf.file); + free(rule); +} + +void free_act_rules(struct list *rules) +{ + struct act_rule *rule, *ruleb; + + list_for_each_entry_safe(rule, ruleb, rules, list) { + free_act_rule(rule); + } +} + +/* dumps all known actions registered in action rules <rules> after prefix + * <pfx> to stdout. The actions are alphabetically sorted. Those with the + * KWF_MATCH_PREFIX flag have their name suffixed with '*'. + */ +void dump_act_rules(const struct list *rules, const char *pfx) +{ + const struct action_kw *akwp, *akwn; + struct action_kw_list *akwl; + int index; + + for (akwn = akwp = NULL;; akwp = akwn) { + list_for_each_entry(akwl, rules, list) { + for (index = 0; akwl->kw[index].kw != NULL; index++) + if (strordered(akwp ? akwp->kw : NULL, + akwl->kw[index].kw, + akwn != akwp ? akwn->kw : NULL)) + akwn = &akwl->kw[index]; + } + if (akwn == akwp) + break; + printf("%s%s%s\n", pfx ? pfx : "", akwn->kw, + (akwn->flags & KWF_MATCH_PREFIX) ? "*" : ""); + } +} diff --git a/src/activity.c b/src/activity.c new file mode 100644 index 0000000..07a30e6 --- /dev/null +++ b/src/activity.c @@ -0,0 +1,1248 @@ +/* + * activity measurement functions.
+ * + * Copyright 2000-2018 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <haproxy/activity-t.h> +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/cfgparse.h> +#include <haproxy/clock.h> +#include <haproxy/channel.h> +#include <haproxy/cli.h> +#include <haproxy/freq_ctr.h> +#include <haproxy/listener.h> +#include <haproxy/sc_strm.h> +#include <haproxy/stconn.h> +#include <haproxy/tools.h> + +/* CLI context for the "show profiling" command */ +struct show_prof_ctx { + int dump_step; /* 0,1,2,4,5,6; see cli_io_handler_show_profiling() */ + int linenum; /* next line to be dumped (starts at 0) */ + int maxcnt; /* max line count per step (0=not set) */ + int by_what; /* 0=sort by usage, 1=sort by address, 2=sort by time */ + int aggr; /* 0=dump raw, 1=aggregate on callee */ +}; + +/* CLI context for the "show activity" command */ +struct show_activity_ctx { + int thr; /* thread ID to show or -1 for all */ + int line; /* line number being dumped */ + int col; /* column being dumped, 0 to nbt+1 */ +}; + +#if defined(DEBUG_MEM_STATS) +/* these ones are macros in bug.h when DEBUG_MEM_STATS is set, and will + * prevent the new ones from being redefined. + */ +#undef calloc +#undef malloc +#undef realloc +#endif + +/* bit field of profiling options. Beware, may be modified at runtime! */ +unsigned int profiling __read_mostly = HA_PROF_TASKS_AOFF; + +/* start/stop dates of profiling */ +uint64_t prof_task_start_ns = 0; +uint64_t prof_task_stop_ns = 0; +uint64_t prof_mem_start_ns = 0; +uint64_t prof_mem_stop_ns = 0; + +/* One struct per thread containing all collected measurements */ +struct activity activity[MAX_THREADS] __attribute__((aligned(64))) = { }; + +/* One struct per function pointer hash entry (SCHED_ACT_HASH_BUCKETS values, 0=collision) */ +struct sched_activity sched_activity[SCHED_ACT_HASH_BUCKETS] __attribute__((aligned(64))) = { }; + + +#ifdef USE_MEMORY_PROFILING + +static const char *const memprof_methods[MEMPROF_METH_METHODS] = { + "unknown", "malloc", "calloc", "realloc", "free", "p_alloc", "p_free", +}; + +/* last one is for hash collisions ("others") and has no caller address */ +struct memprof_stats memprof_stats[MEMPROF_HASH_BUCKETS + 1] = { }; + +/* used to detect recursive calls */ +static THREAD_LOCAL int in_memprof = 0; + +/* These ones are used by glibc and will be called early. They are in charge of + * initializing the handlers with the original functions. + */ +static void *memprof_malloc_initial_handler(size_t size); +static void *memprof_calloc_initial_handler(size_t nmemb, size_t size); +static void *memprof_realloc_initial_handler(void *ptr, size_t size); +static void memprof_free_initial_handler(void *ptr); + +/* Fallback handlers for the main alloc/free functions. They are preset to + * the initializer in order to save a test in the functions' critical path.
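+ * (Editor's note: this is a lazy-initialization trampoline. The first + * allocation routed through any of these pointers calls memprof_init(), + * which swaps in the real libc functions resolved by get_sym_next_addr(), + * so subsequent calls no longer pay the symbol lookup cost.)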
+ */ +static void *(*memprof_malloc_handler)(size_t size) = memprof_malloc_initial_handler; +static void *(*memprof_calloc_handler)(size_t nmemb, size_t size) = memprof_calloc_initial_handler; +static void *(*memprof_realloc_handler)(void *ptr, size_t size) = memprof_realloc_initial_handler; +static void (*memprof_free_handler)(void *ptr) = memprof_free_initial_handler; + +/* Used to force to die if it's not possible to retrieve the allocation + * functions. We cannot even use stdio in this case. + */ +static __attribute__((noreturn)) void memprof_die(const char *msg) +{ + DISGUISE(write(2, msg, strlen(msg))); + exit(1); +} + +/* Resolve original allocation functions and initialize all handlers. + * This must be called very early at boot, before the very first malloc() + * call, and is not thread-safe! It's not even possible to use stdio there. + * Worse, we have to account for the risk of reentrance from dlsym() when + * it tries to prepare its error messages. Here it's handled by in_memprof, + * which makes allocators return NULL. dlsym() handles it gracefully. An + * alternate approach consists of calling aligned_alloc() from these places + * but that would mean not being able to intercept it later if considered + * useful to do so. + */ +static void memprof_init() +{ + in_memprof++; + memprof_malloc_handler = get_sym_next_addr("malloc"); + if (!memprof_malloc_handler) + memprof_die("FATAL: malloc() function not found.\n"); + + memprof_calloc_handler = get_sym_next_addr("calloc"); + if (!memprof_calloc_handler) + memprof_die("FATAL: calloc() function not found.\n"); + + memprof_realloc_handler = get_sym_next_addr("realloc"); + if (!memprof_realloc_handler) + memprof_die("FATAL: realloc() function not found.\n"); + + memprof_free_handler = get_sym_next_addr("free"); + if (!memprof_free_handler) + memprof_die("FATAL: free() function not found.\n"); + in_memprof--; +} + +/* the initial handlers will initialize all regular handlers and will call the + * one they correspond to. A single one of these functions will typically be + * called, though it's unknown which one (as any might be called before main). + */ +static void *memprof_malloc_initial_handler(size_t size) +{ + if (in_memprof) { + /* it's likely that dlsym() needs malloc(), let's fail */ + return NULL; + } + + memprof_init(); + return memprof_malloc_handler(size); +} + +static void *memprof_calloc_initial_handler(size_t nmemb, size_t size) +{ + if (in_memprof) { + /* it's likely that dlsym() needs calloc(), let's fail */ + return NULL; + } + memprof_init(); + return memprof_calloc_handler(nmemb, size); +} + +static void *memprof_realloc_initial_handler(void *ptr, size_t size) +{ + if (in_memprof) { + /* it's likely that dlsym() needs realloc(), let's fail */ + return NULL; + } + + memprof_init(); + return memprof_realloc_handler(ptr, size); +} + +static void memprof_free_initial_handler(void *ptr) +{ + memprof_init(); + memprof_free_handler(ptr); +} + +/* Assign a bin for the memprof_stats to the return address. May perform a few + * attempts before finding the right one, but always succeeds (in the worst + * case, returns a default bin). The caller address is atomically set except + * for the default one which is never set. + */ +struct memprof_stats *memprof_get_bin(const void *ra, enum memprof_method meth) +{ + int retries = 16; // up to 16 consecutive entries may be tested.
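+ /* (Editor's note: this is open addressing with linear probing: starting + * at the hash of the return address, up to 16 slots are examined before + * giving up and falling back to the shared "others" bucket at index + * MEMPROF_HASH_BUCKETS.) */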
+ const void *old; + unsigned int bin; + + bin = ptr_hash(ra, MEMPROF_HASH_BITS); + for (; memprof_stats[bin].caller != ra; bin = (bin + 1) & (MEMPROF_HASH_BUCKETS - 1)) { + if (!--retries) { + bin = MEMPROF_HASH_BUCKETS; + break; + } + + old = NULL; + if (!memprof_stats[bin].caller && + HA_ATOMIC_CAS(&memprof_stats[bin].caller, &old, ra)) { + memprof_stats[bin].method = meth; + break; + } + } + return &memprof_stats[bin]; +} + +/* This is the new global malloc() function. It must optimize for the normal + * case (i.e. profiling disabled) hence the first test to permit a direct jump. + * It must remain simple to guarantee the lack of reentrance. stdio is not + * possible there even for debugging. The reported size is the really allocated + * one as returned by malloc_usable_size(), because this will allow it to be + * compared to the one before realloc() or free(). This is a GNU and jemalloc + * extension but other systems may also store this size in ptr[-1]. + */ +void *malloc(size_t size) +{ + struct memprof_stats *bin; + void *ret; + + if (likely(!(profiling & HA_PROF_MEMORY))) + return memprof_malloc_handler(size); + + ret = memprof_malloc_handler(size); + size = malloc_usable_size(ret) + sizeof(void *); + + bin = memprof_get_bin(__builtin_return_address(0), MEMPROF_METH_MALLOC); + _HA_ATOMIC_ADD(&bin->alloc_calls, 1); + _HA_ATOMIC_ADD(&bin->alloc_tot, size); + return ret; +} + +/* This is the new global calloc() function. It must optimize for the normal + * case (i.e. profiling disabled) hence the first test to permit a direct jump. + * It must remain simple to guarantee the lack of reentrance. stdio is not + * possible there even for debugging. The reported size is the really allocated + * one as returned by malloc_usable_size(), because this will allow it to be + * compared to the one before realloc() or free(). This is a GNU and jemalloc + * extension but other systems may also store this size in ptr[-1]. + */ +void *calloc(size_t nmemb, size_t size) +{ + struct memprof_stats *bin; + void *ret; + + if (likely(!(profiling & HA_PROF_MEMORY))) + return memprof_calloc_handler(nmemb, size); + + ret = memprof_calloc_handler(nmemb, size); + size = malloc_usable_size(ret) + sizeof(void *); + + bin = memprof_get_bin(__builtin_return_address(0), MEMPROF_METH_CALLOC); + _HA_ATOMIC_ADD(&bin->alloc_calls, 1); + _HA_ATOMIC_ADD(&bin->alloc_tot, size); + return ret; +} + +/* This is the new global realloc() function. It must optimize for the normal + * case (i.e. profiling disabled) hence the first test to permit a direct jump. + * It must remain simple to guarantee the lack of reentrance. stdio is not + * possible there even for debugging. The reported size is the really allocated + * one as returned by malloc_usable_size(), because this will allow it to be + * compared to the one before realloc() or free(). This is a GNU and jemalloc + * extension but other systems may also store this size in ptr[-1]. + * Depending on the old vs new size, it's considered as an allocation or a free + * (or neither if the size remains the same). 
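+ * As an illustration (editor's example): shrinking a 100-byte area to 40 + * bytes is accounted as one free of ~60 bytes, growing it to 160 bytes as + * one allocation of ~60 bytes, and an unchanged usable size updates nothing.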
+ */ +void *realloc(void *ptr, size_t size) +{ + struct memprof_stats *bin; + size_t size_before; + void *ret; + + if (likely(!(profiling & HA_PROF_MEMORY))) + return memprof_realloc_handler(ptr, size); + + size_before = malloc_usable_size(ptr); + ret = memprof_realloc_handler(ptr, size); + size = malloc_usable_size(ret); + + /* only count the extra link for new allocations */ + if (!ptr) + size += sizeof(void *); + + bin = memprof_get_bin(__builtin_return_address(0), MEMPROF_METH_REALLOC); + if (size > size_before) { + _HA_ATOMIC_ADD(&bin->alloc_calls, 1); + _HA_ATOMIC_ADD(&bin->alloc_tot, size - size_before); + } else if (size < size_before) { + _HA_ATOMIC_ADD(&bin->free_calls, 1); + _HA_ATOMIC_ADD(&bin->free_tot, size_before - size); + } + return ret; +} + +/* This is the new global free() function. It must optimize for the normal + * case (i.e. profiling disabled) hence the first test to permit a direct jump. + * It must remain simple to guarantee the lack of reentrance. stdio is not + * possible there even for debugging. The reported size is the really allocated + * one as returned by malloc_usable_size(), because this will allow it to be + * compared to the one before realloc() or free(). This is a GNU and jemalloc + * extension but other systems may also store this size in ptr[-1]. Since + * free() is often called on NULL pointers to collect garbage at the end of + * many functions or during config parsing, as a special case free(NULL) + * doesn't update any stats. + */ +void free(void *ptr) +{ + struct memprof_stats *bin; + size_t size_before; + + if (likely(!(profiling & HA_PROF_MEMORY) || !ptr)) { + memprof_free_handler(ptr); + return; + } + + size_before = malloc_usable_size(ptr) + sizeof(void *); + memprof_free_handler(ptr); + + bin = memprof_get_bin(__builtin_return_address(0), MEMPROF_METH_FREE); + _HA_ATOMIC_ADD(&bin->free_calls, 1); + _HA_ATOMIC_ADD(&bin->free_tot, size_before); +} + +#endif // USE_MEMORY_PROFILING + +/* Updates the current thread's statistics about stolen CPU time. The unit for + * <stolen> is half-milliseconds. + */ +void report_stolen_time(uint64_t stolen) +{ + activity[tid].cpust_total += stolen; + update_freq_ctr(&activity[tid].cpust_1s, stolen); + update_freq_ctr_period(&activity[tid].cpust_15s, 15000, stolen); +} + +/* Update avg_loop value for the current thread and possibly decide to enable + * task-level profiling on the current thread based on its average run time. + * The <run_time> argument is the number of microseconds elapsed since the + * last time poll() returned. + */ +void activity_count_runtime(uint32_t run_time) +{ + uint32_t up, down; + + /* 1 millisecond per loop on average over last 1024 iterations is + * enough to turn on profiling. + */ + up = 1000; + down = up * 99 / 100; + + run_time = swrate_add(&activity[tid].avg_loop_us, TIME_STATS_SAMPLES, run_time); + + /* In automatic mode, reaching the "up" threshold on average switches + * profiling to "on" when automatic, and going back below the "down" + * threshold switches to off. The forced modes don't check the load. 
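+ * Concretely (editor's note), with up = 1000us and down = 990us (99% of up), + * a thread whose sliding average over TIME_STATS_SAMPLES loops reaches 1ms + * turns profiling on in "auto" mode, and it turns back off once the average + * drops below 990us.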
+ */ + if (!(_HA_ATOMIC_LOAD(&th_ctx->flags) & TH_FL_TASK_PROFILING)) { + if (unlikely((profiling & HA_PROF_TASKS_MASK) == HA_PROF_TASKS_ON || + ((profiling & HA_PROF_TASKS_MASK) == HA_PROF_TASKS_AON && + swrate_avg(run_time, TIME_STATS_SAMPLES) >= up))) + _HA_ATOMIC_OR(&th_ctx->flags, TH_FL_TASK_PROFILING); + } else { + if (unlikely((profiling & HA_PROF_TASKS_MASK) == HA_PROF_TASKS_OFF || + ((profiling & HA_PROF_TASKS_MASK) == HA_PROF_TASKS_AOFF && + swrate_avg(run_time, TIME_STATS_SAMPLES) <= down))) + _HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_TASK_PROFILING); + } +} + +#ifdef USE_MEMORY_PROFILING +/* config parser for global "profiling.memory", accepts "on" or "off" */ +static int cfg_parse_prof_memory(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (strcmp(args[1], "on") == 0) { + profiling |= HA_PROF_MEMORY; + HA_ATOMIC_STORE(&prof_mem_start_ns, now_ns); + } + else if (strcmp(args[1], "off") == 0) + profiling &= ~HA_PROF_MEMORY; + else { + memprintf(err, "'%s' expects either 'on' or 'off' but got '%s'.", args[0], args[1]); + return -1; + } + return 0; +} +#endif // USE_MEMORY_PROFILING + +/* config parser for global "profiling.tasks", accepts "on" or "off" */ +static int cfg_parse_prof_tasks(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (strcmp(args[1], "on") == 0) { + profiling = (profiling & ~HA_PROF_TASKS_MASK) | HA_PROF_TASKS_ON; + HA_ATOMIC_STORE(&prof_task_start_ns, now_ns); + } + else if (strcmp(args[1], "auto") == 0) { + profiling = (profiling & ~HA_PROF_TASKS_MASK) | HA_PROF_TASKS_AOFF; + HA_ATOMIC_STORE(&prof_task_start_ns, now_ns); + } + else if (strcmp(args[1], "off") == 0) + profiling = (profiling & ~HA_PROF_TASKS_MASK) | HA_PROF_TASKS_OFF; + else { + memprintf(err, "'%s' expects either 'on', 'auto', or 'off' but got '%s'.", args[0], args[1]); + return -1; + } + return 0; +} + +/* parse a "set profiling" command. It always returns 1. 
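+ * Typical CLI usage (editor's illustration, the socket path is an example): + * echo "set profiling tasks on" | socat stdio unix-connect:/var/run/haproxy.sock + * echo "set profiling memory off" | socat stdio unix-connect:/var/run/haproxy.sock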
*/ +static int cli_parse_set_profiling(char **args, char *payload, struct appctx *appctx, void *private) +{ + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (strcmp(args[2], "memory") == 0) { +#ifdef USE_MEMORY_PROFILING + if (strcmp(args[3], "on") == 0) { + unsigned int old = profiling; + int i; + + while (!_HA_ATOMIC_CAS(&profiling, &old, old | HA_PROF_MEMORY)) + ; + + HA_ATOMIC_STORE(&prof_mem_start_ns, now_ns); + HA_ATOMIC_STORE(&prof_mem_stop_ns, 0); + + /* also flush current profiling stats */ + for (i = 0; i < sizeof(memprof_stats) / sizeof(memprof_stats[0]); i++) { + HA_ATOMIC_STORE(&memprof_stats[i].alloc_calls, 0); + HA_ATOMIC_STORE(&memprof_stats[i].free_calls, 0); + HA_ATOMIC_STORE(&memprof_stats[i].alloc_tot, 0); + HA_ATOMIC_STORE(&memprof_stats[i].free_tot, 0); + HA_ATOMIC_STORE(&memprof_stats[i].caller, NULL); + } + } + else if (strcmp(args[3], "off") == 0) { + unsigned int old = profiling; + + while (!_HA_ATOMIC_CAS(&profiling, &old, old & ~HA_PROF_MEMORY)) + ; + + if (HA_ATOMIC_LOAD(&prof_mem_start_ns)) + HA_ATOMIC_STORE(&prof_mem_stop_ns, now_ns); + } + else + return cli_err(appctx, "Expects either 'on' or 'off'.\n"); + return 1; +#else + return cli_err(appctx, "Memory profiling not compiled in.\n"); +#endif + } + + if (strcmp(args[2], "tasks") != 0) + return cli_err(appctx, "Expects either 'tasks' or 'memory'.\n"); + + if (strcmp(args[3], "on") == 0) { + unsigned int old = profiling; + int i; + + while (!_HA_ATOMIC_CAS(&profiling, &old, (old & ~HA_PROF_TASKS_MASK) | HA_PROF_TASKS_ON)) + ; + + HA_ATOMIC_STORE(&prof_task_start_ns, now_ns); + HA_ATOMIC_STORE(&prof_task_stop_ns, 0); + + /* also flush current profiling stats */ + for (i = 0; i < SCHED_ACT_HASH_BUCKETS; i++) { + HA_ATOMIC_STORE(&sched_activity[i].calls, 0); + HA_ATOMIC_STORE(&sched_activity[i].cpu_time, 0); + HA_ATOMIC_STORE(&sched_activity[i].lat_time, 0); + HA_ATOMIC_STORE(&sched_activity[i].func, NULL); + HA_ATOMIC_STORE(&sched_activity[i].caller, NULL); + } + } + else if (strcmp(args[3], "auto") == 0) { + unsigned int old = profiling; + unsigned int new; + + do { + if ((old & HA_PROF_TASKS_MASK) >= HA_PROF_TASKS_AON) + new = (old & ~HA_PROF_TASKS_MASK) | HA_PROF_TASKS_AON; + else + new = (old & ~HA_PROF_TASKS_MASK) | HA_PROF_TASKS_AOFF; + } while (!_HA_ATOMIC_CAS(&profiling, &old, new)); + + HA_ATOMIC_STORE(&prof_task_start_ns, now_ns); + HA_ATOMIC_STORE(&prof_task_stop_ns, 0); + } + else if (strcmp(args[3], "off") == 0) { + unsigned int old = profiling; + while (!_HA_ATOMIC_CAS(&profiling, &old, (old & ~HA_PROF_TASKS_MASK) | HA_PROF_TASKS_OFF)) + ; + + if (HA_ATOMIC_LOAD(&prof_task_start_ns)) + HA_ATOMIC_STORE(&prof_task_stop_ns, now_ns); + } + else + return cli_err(appctx, "Expects 'on', 'auto', or 'off'.\n"); + + return 1; +} + +static int cmp_sched_activity_calls(const void *a, const void *b) +{ + const struct sched_activity *l = (const struct sched_activity *)a; + const struct sched_activity *r = (const struct sched_activity *)b; + + if (l->calls > r->calls) + return -1; + else if (l->calls < r->calls) + return 1; + else + return 0; +} + +/* sort by address first, then by call count */ +static int cmp_sched_activity_addr(const void *a, const void *b) +{ + const struct sched_activity *l = (const struct sched_activity *)a; + const struct sched_activity *r = (const struct sched_activity *)b; + + if (l->func > r->func) + return -1; + else if (l->func < r->func) + return 1; + else if (l->calls > r->calls) + return -1; + else if (l->calls < r->calls) + return 1; + else + return 0; +} + +/* 
sort by cpu time first, then by inverse call count (to spot highest offenders) */ +static int cmp_sched_activity_cpu(const void *a, const void *b) +{ + const struct sched_activity *l = (const struct sched_activity *)a; + const struct sched_activity *r = (const struct sched_activity *)b; + + if (l->cpu_time > r->cpu_time) + return -1; + else if (l->cpu_time < r->cpu_time) + return 1; + else if (l->calls < r->calls) + return -1; + else if (l->calls > r->calls) + return 1; + else + return 0; +} + +#ifdef USE_MEMORY_PROFILING +/* used by qsort below */ +static int cmp_memprof_stats(const void *a, const void *b) +{ + const struct memprof_stats *l = (const struct memprof_stats *)a; + const struct memprof_stats *r = (const struct memprof_stats *)b; + + if (l->alloc_tot + l->free_tot > r->alloc_tot + r->free_tot) + return -1; + else if (l->alloc_tot + l->free_tot < r->alloc_tot + r->free_tot) + return 1; + else + return 0; +} + +static int cmp_memprof_addr(const void *a, const void *b) +{ + const struct memprof_stats *l = (const struct memprof_stats *)a; + const struct memprof_stats *r = (const struct memprof_stats *)b; + + if (l->caller > r->caller) + return -1; + else if (l->caller < r->caller) + return 1; + else + return 0; +} +#endif // USE_MEMORY_PROFILING + +/* Computes the index of function pointer <func> and caller <caller> for use + * with sched_activity[] or any other similar array passed in <array>, and + * returns a pointer to the entry after having atomically assigned it to this + * function pointer and caller combination. Note that in case of collision, + * the first entry is returned instead ("other"). + */ +struct sched_activity *sched_activity_entry(struct sched_activity *array, const void *func, const void *caller) +{ + uint32_t hash = ptr2_hash(func, caller, SCHED_ACT_HASH_BITS); + struct sched_activity *ret; + const void *old; + int tries = 16; + + for (tries = 16; tries > 0; tries--, hash++) { + ret = &array[hash]; + + while (1) { + if (likely(ret->func)) { + if (likely(ret->func == func && ret->caller == caller)) + return ret; + break; + } + + /* try to create the new entry. Func is sufficient to + * reserve the node. + */ + old = NULL; + if (HA_ATOMIC_CAS(&ret->func, &old, func)) { + ret->caller = caller; + return ret; + } + /* changed in parallel, check again */ + } + } + + return array; +} + +/* This function dumps all profiling settings. It returns 0 if the output + * buffer is full and it needs to be called again, otherwise non-zero. + * It dumps some parts depending on the following states from show_prof_ctx: + * dump_step: + * 0, 4: dump status, then jump to 1 if 0 + * 1, 5: dump tasks, then jump to 2 if 1 + * 2, 6: dump memory, then stop + * linenum: + * restart line for each step (starts at zero) + * maxcnt: + * may contain a configured max line count for each step (0=not set) + * byaddr: + * 0: sort by usage + * 1: sort by address + */ +static int cli_io_handler_show_profiling(struct appctx *appctx) +{ + struct show_prof_ctx *ctx = appctx->svcctx; + struct sched_activity tmp_activity[SCHED_ACT_HASH_BUCKETS] __attribute__((aligned(64))); +#ifdef USE_MEMORY_PROFILING + struct memprof_stats tmp_memstats[MEMPROF_HASH_BUCKETS + 1]; + unsigned long long tot_alloc_calls, tot_free_calls; + unsigned long long tot_alloc_bytes, tot_free_bytes; +#endif + struct stconn *sc = appctx_sc(appctx); + struct buffer *name_buffer = get_trash_chunk(); + const struct ha_caller *caller; + const char *str; + int max_lines; + int i, j, max; + + /* FIXME: Don't watch the other side ! 
*/ + if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE)) + return 1; + + chunk_reset(&trash); + + switch (profiling & HA_PROF_TASKS_MASK) { + case HA_PROF_TASKS_AOFF: str="auto-off"; break; + case HA_PROF_TASKS_AON: str="auto-on"; break; + case HA_PROF_TASKS_ON: str="on"; break; + default: str="off"; break; + } + + if ((ctx->dump_step & 3) != 0) + goto skip_status; + + chunk_printf(&trash, + "Per-task CPU profiling : %-8s # set profiling tasks {on|auto|off}\n" + "Memory usage profiling : %-8s # set profiling memory {on|off}\n", + str, (profiling & HA_PROF_MEMORY) ? "on" : "off"); + + if (applet_putchk(appctx, &trash) == -1) { + /* failed, try again */ + return 0; + } + + ctx->linenum = 0; // reset first line to dump + if ((ctx->dump_step & 4) == 0) + ctx->dump_step++; // next step + + skip_status: + if ((ctx->dump_step & 3) != 1) + goto skip_tasks; + + memcpy(tmp_activity, sched_activity, sizeof(tmp_activity)); + /* for addr sort and for callee aggregation we have to first sort by address */ + if (ctx->aggr || ctx->by_what == 1) // sort by addr + qsort(tmp_activity, SCHED_ACT_HASH_BUCKETS, sizeof(tmp_activity[0]), cmp_sched_activity_addr); + + if (ctx->aggr) { + /* merge entries for the same callee and reset their count */ + for (i = j = 0; i < SCHED_ACT_HASH_BUCKETS; i = j) { + for (j = i + 1; j < SCHED_ACT_HASH_BUCKETS && tmp_activity[j].func == tmp_activity[i].func; j++) { + tmp_activity[i].calls += tmp_activity[j].calls; + tmp_activity[i].cpu_time += tmp_activity[j].cpu_time; + tmp_activity[i].lat_time += tmp_activity[j].lat_time; + tmp_activity[j].calls = 0; + } + } + } + + if (!ctx->by_what) // sort by usage + qsort(tmp_activity, SCHED_ACT_HASH_BUCKETS, sizeof(tmp_activity[0]), cmp_sched_activity_calls); + else if (ctx->by_what == 2) // by cpu_tot + qsort(tmp_activity, SCHED_ACT_HASH_BUCKETS, sizeof(tmp_activity[0]), cmp_sched_activity_cpu); + + if (!ctx->linenum) + chunk_appendf(&trash, "Tasks activity over %.3f sec till %.3f sec ago:\n" + " function calls cpu_tot cpu_avg lat_tot lat_avg\n", + (prof_task_start_ns ? (prof_task_stop_ns ? prof_task_stop_ns : now_ns) - prof_task_start_ns : 0) / 1000000000.0, + (prof_task_stop_ns ? now_ns - prof_task_stop_ns : 0) / 1000000000.0); + + max_lines = ctx->maxcnt; + if (!max_lines) + max_lines = SCHED_ACT_HASH_BUCKETS; + + for (i = ctx->linenum; i < max_lines; i++) { + if (!tmp_activity[i].calls) + continue; // skip aggregated or empty entries + + ctx->linenum = i; + chunk_reset(name_buffer); + caller = HA_ATOMIC_LOAD(&tmp_activity[i].caller); + + if (!tmp_activity[i].func) + chunk_printf(name_buffer, "other"); + else + resolve_sym_name(name_buffer, "", tmp_activity[i].func); + + /* reserve 35 chars for name+' '+#calls, knowing that longer names + * are often used for less often called functions. 
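+ * (Editor's example: a 10-char name leaves max = 25 columns to right-align + * the call count; for names of 35 chars or more, max is clamped to 1 and the + * count follows almost unpadded.)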
+ */ + max = 35 - name_buffer->data; + if (max < 1) + max = 1; + chunk_appendf(&trash, " %s%*llu", name_buffer->area, max, (unsigned long long)tmp_activity[i].calls); + + print_time_short(&trash, " ", tmp_activity[i].cpu_time, ""); + print_time_short(&trash, " ", tmp_activity[i].cpu_time / tmp_activity[i].calls, ""); + print_time_short(&trash, " ", tmp_activity[i].lat_time, ""); + print_time_short(&trash, " ", tmp_activity[i].lat_time / tmp_activity[i].calls, ""); + + if (caller && !ctx->aggr && caller->what <= WAKEUP_TYPE_APPCTX_WAKEUP) + chunk_appendf(&trash, " <- %s@%s:%d %s", + caller->func, caller->file, caller->line, + task_wakeup_type_str(caller->what)); + + b_putchr(&trash, '\n'); + + if (applet_putchk(appctx, &trash) == -1) { + /* failed, try again */ + return 0; + } + } + + if (applet_putchk(appctx, &trash) == -1) { + /* failed, try again */ + return 0; + } + + ctx->linenum = 0; // reset first line to dump + if ((ctx->dump_step & 4) == 0) + ctx->dump_step++; // next step + + skip_tasks: + +#ifdef USE_MEMORY_PROFILING + if ((ctx->dump_step & 3) != 2) + goto skip_mem; + + memcpy(tmp_memstats, memprof_stats, sizeof(tmp_memstats)); + if (ctx->by_what) + qsort(tmp_memstats, MEMPROF_HASH_BUCKETS+1, sizeof(tmp_memstats[0]), cmp_memprof_addr); + else + qsort(tmp_memstats, MEMPROF_HASH_BUCKETS+1, sizeof(tmp_memstats[0]), cmp_memprof_stats); + + if (!ctx->linenum) + chunk_appendf(&trash, + "Alloc/Free statistics by call place over %.3f sec till %.3f sec ago:\n" + " Calls | Tot Bytes | Caller and method\n" + "<- alloc -> <- free ->|<-- alloc ---> <-- free ---->|\n", + (prof_mem_start_ns ? (prof_mem_stop_ns ? prof_mem_stop_ns : now_ns) - prof_mem_start_ns : 0) / 1000000000.0, + (prof_mem_stop_ns ? now_ns - prof_mem_stop_ns : 0) / 1000000000.0); + + max_lines = ctx->maxcnt; + if (!max_lines) + max_lines = MEMPROF_HASH_BUCKETS + 1; + + for (i = ctx->linenum; i < max_lines; i++) { + struct memprof_stats *entry = &tmp_memstats[i]; + + ctx->linenum = i; + if (!entry->alloc_calls && !entry->free_calls) + continue; + chunk_appendf(&trash, "%11llu %11llu %14llu %14llu| %16p ", + entry->alloc_calls, entry->free_calls, + entry->alloc_tot, entry->free_tot, + entry->caller); + + if (entry->caller) + resolve_sym_name(&trash, NULL, entry->caller); + else + chunk_appendf(&trash, "[other]"); + + chunk_appendf(&trash," %s(%lld)", memprof_methods[entry->method], + (long long)(entry->alloc_tot - entry->free_tot) / (long long)(entry->alloc_calls + entry->free_calls)); + + if (entry->alloc_tot && entry->free_tot) { + /* that's a realloc, show the total diff to help spot leaks */ + chunk_appendf(&trash," [delta=%lld]", (long long)(entry->alloc_tot - entry->free_tot)); + } + + if (entry->info) { + /* that's a pool name */ + const struct pool_head *pool = entry->info; + chunk_appendf(&trash," [pool=%s]", pool->name); + } + + chunk_appendf(&trash, "\n"); + + if (applet_putchk(appctx, &trash) == -1) + return 0; + } + + if (applet_putchk(appctx, &trash) == -1) + return 0; + + tot_alloc_calls = tot_free_calls = tot_alloc_bytes = tot_free_bytes = 0; + for (i = 0; i < max_lines; i++) { + tot_alloc_calls += tmp_memstats[i].alloc_calls; + tot_free_calls += tmp_memstats[i].free_calls; + tot_alloc_bytes += tmp_memstats[i].alloc_tot; + tot_free_bytes += tmp_memstats[i].free_tot; + } + + chunk_appendf(&trash, + "-----------------------|-----------------------------|\n" + "%11llu %11llu %14llu %14llu| <- Total; Delta_calls=%lld; Delta_bytes=%lld\n", + tot_alloc_calls, tot_free_calls, + tot_alloc_bytes, tot_free_bytes, + 
tot_alloc_calls - tot_free_calls, + tot_alloc_bytes - tot_free_bytes); + + if (applet_putchk(appctx, &trash) == -1) + return 0; + + ctx->linenum = 0; // reset first line to dump + if ((ctx->dump_step & 4) == 0) + ctx->dump_step++; // next step + + skip_mem: +#endif // USE_MEMORY_PROFILING + + return 1; +} + +/* parse a "show profiling" command. It returns 1 on failure, 0 if it starts to dump. + * - cli.i0 is set to the first state (0=all, 4=status, 5=tasks, 6=memory) + * - cli.o1 is set to 1 if the output must be sorted by addr instead of usage + * - cli.o0 is set to the number of lines of output + */ +static int cli_parse_show_profiling(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_prof_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + int arg; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + for (arg = 2; *args[arg]; arg++) { + if (strcmp(args[arg], "all") == 0) { + ctx->dump_step = 0; // will cycle through 0,1,2; default + } + else if (strcmp(args[arg], "status") == 0) { + ctx->dump_step = 4; // will visit status only + } + else if (strcmp(args[arg], "tasks") == 0) { + ctx->dump_step = 5; // will visit tasks only + } + else if (strcmp(args[arg], "memory") == 0) { + ctx->dump_step = 6; // will visit memory only + } + else if (strcmp(args[arg], "byaddr") == 0) { + ctx->by_what = 1; // sort output by address instead of usage + } + else if (strcmp(args[arg], "bytime") == 0) { + ctx->by_what = 2; // sort output by total time instead of usage + } + else if (strcmp(args[arg], "aggr") == 0) { + ctx->aggr = 1; // aggregate output by callee + } + else if (isdigit((unsigned char)*args[arg])) { + ctx->maxcnt = atoi(args[arg]); // number of entries to dump + } + else + return cli_err(appctx, "Expects either 'all', 'status', 'tasks', 'memory', 'byaddr', 'bytime', 'aggr' or a max number of output lines.\n"); + } + return 0; +} + +/* This function scans all threads' run queues and collects statistics about + * running tasks. It returns 0 if the output buffer is full and it needs to be + * called again, otherwise non-zero. + */ +static int cli_io_handler_show_tasks(struct appctx *appctx) +{ + struct sched_activity tmp_activity[SCHED_ACT_HASH_BUCKETS] __attribute__((aligned(64))); + struct stconn *sc = appctx_sc(appctx); + struct buffer *name_buffer = get_trash_chunk(); + struct sched_activity *entry; + const struct tasklet *tl; + const struct task *t; + uint64_t now_ns, lat; + struct eb32_node *rqnode; + uint64_t tot_calls; + int thr, queue; + int i, max; + + /* FIXME: Don't watch the other side ! */ + if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE)) + return 1; + + /* It's not possible to scan queues in small chunks and yield in the + * middle of the dump and come back again. So what we're doing instead + * is to freeze all threads and inspect their queues at once as fast as + * possible, using a sched_activity array to collect metrics with + * limited collision, then we'll report statistics only. The tasks' + * #calls will reflect the number of occurrences, and the lat_time will + * reflect the latency when set. We prefer to take the time before + * calling thread_isolate() so that the wait time doesn't impact the + * measurement accuracy. However this requires to take care of negative + * times since tasks might be queued after we retrieve it. + */ + + now_ns = now_mono_time(); + memset(tmp_activity, 0, sizeof(tmp_activity)); + + thread_isolate(); + + /* 1. 
global run queue */ + +#ifdef USE_THREAD + for (thr = 0; thr < global.nbthread; thr++) { + /* task run queue */ + rqnode = eb32_first(&ha_thread_ctx[thr].rqueue_shared); + while (rqnode) { + t = eb32_entry(rqnode, struct task, rq); + entry = sched_activity_entry(tmp_activity, t->process, NULL); + if (t->wake_date) { + lat = now_ns - t->wake_date; + if ((int64_t)lat > 0) + entry->lat_time += lat; + } + entry->calls++; + rqnode = eb32_next(rqnode); + } + } +#endif + /* 2. all threads' local run queues */ + for (thr = 0; thr < global.nbthread; thr++) { + /* task run queue */ + rqnode = eb32_first(&ha_thread_ctx[thr].rqueue); + while (rqnode) { + t = eb32_entry(rqnode, struct task, rq); + entry = sched_activity_entry(tmp_activity, t->process, NULL); + if (t->wake_date) { + lat = now_ns - t->wake_date; + if ((int64_t)lat > 0) + entry->lat_time += lat; + } + entry->calls++; + rqnode = eb32_next(rqnode); + } + + /* shared tasklet list */ + list_for_each_entry(tl, mt_list_to_list(&ha_thread_ctx[thr].shared_tasklet_list), list) { + t = (const struct task *)tl; + entry = sched_activity_entry(tmp_activity, t->process, NULL); + if (!TASK_IS_TASKLET(t) && t->wake_date) { + lat = now_ns - t->wake_date; + if ((int64_t)lat > 0) + entry->lat_time += lat; + } + entry->calls++; + } + + /* classful tasklets */ + for (queue = 0; queue < TL_CLASSES; queue++) { + list_for_each_entry(tl, &ha_thread_ctx[thr].tasklets[queue], list) { + t = (const struct task *)tl; + entry = sched_activity_entry(tmp_activity, t->process, NULL); + if (!TASK_IS_TASKLET(t) && t->wake_date) { + lat = now_ns - t->wake_date; + if ((int64_t)lat > 0) + entry->lat_time += lat; + } + entry->calls++; + } + } + } + + /* hopefully we're done */ + thread_release(); + + chunk_reset(&trash); + + tot_calls = 0; + for (i = 0; i < SCHED_ACT_HASH_BUCKETS; i++) + tot_calls += tmp_activity[i].calls; + + qsort(tmp_activity, SCHED_ACT_HASH_BUCKETS, sizeof(tmp_activity[0]), cmp_sched_activity_calls); + + chunk_appendf(&trash, "Running tasks: %d (%d threads)\n" + " function places %% lat_tot lat_avg\n", + (int)tot_calls, global.nbthread); + + for (i = 0; i < SCHED_ACT_HASH_BUCKETS && tmp_activity[i].calls; i++) { + chunk_reset(name_buffer); + + if (!tmp_activity[i].func) + chunk_printf(name_buffer, "other"); + else + resolve_sym_name(name_buffer, "", tmp_activity[i].func); + + /* reserve 35 chars for name+' '+#calls, knowing that longer names + * are often used for less often called functions. + */ + max = 35 - name_buffer->data; + if (max < 1) + max = 1; + chunk_appendf(&trash, " %s%*llu %3d.%1d", + name_buffer->area, max, (unsigned long long)tmp_activity[i].calls, + (int)(100ULL * tmp_activity[i].calls / tot_calls), + (int)((1000ULL * tmp_activity[i].calls / tot_calls)%10)); + print_time_short(&trash, " ", tmp_activity[i].lat_time, ""); + print_time_short(&trash, " ", tmp_activity[i].lat_time / tmp_activity[i].calls, "\n"); + } + + if (applet_putchk(appctx, &trash) == -1) { + /* failed, try again */ + return 0; + } + return 1; +} + +/* This function dumps some activity counters used by developers and support to + * rule out some hypotheses during bug reports. It returns 0 if the output + * buffer is full and it needs to be called again, otherwise non-zero. It dumps + * everything at once in the buffer and is not designed to do it in multiple + * passes.
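+ * Each line follows the form (editor's illustration with 2 threads): + * loops: 4512 [ 2250 2262 ] + * i.e. a counter name, the all-threads aggregate, then per-thread values.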
+ */ +static int cli_io_handler_show_activity(struct appctx *appctx) +{ + struct stconn *sc = appctx_sc(appctx); + struct show_activity_ctx *actctx = appctx->svcctx; + int tgt = actctx->thr; // target thread, -1 for all, 0 for total only + uint up_sec, up_usec; + int base_line; + ullong up; + + /* FIXME: Don't watch the other side ! */ + if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE)) + return 1; + + /* this macro is used below to dump values. The thread number is "thr", + * and runs from 0 to nbt-1 when values are printed using the formula. + * We normally try to dump integral lines in order to keep counters + * consistent. If we fail once on a line, we'll detect it next time + * because we'll have committed actctx->col=1 thanks to the header + * always being dumped individually. We'll be called again thanks to + * the header being present, leaving some data in the buffer. In this + * case once we restart we'll proceed one column at a time to make sure + * we don't overflow the buffer again. + */ +#undef SHOW_VAL +#define SHOW_VAL(header, x, formula) \ + do { \ + unsigned int _v[MAX_THREADS]; \ + unsigned int _tot; \ + const int _nbt = global.nbthread; \ + int restarted = actctx->col > 0; \ + int thr; \ + _tot = thr = 0; \ + do { \ + _tot += _v[thr] = (x); \ + } while (++thr < _nbt); \ + for (thr = actctx->col - 2; thr <= _nbt; thr++) { \ + if (thr == -2) { \ + /* line header */ \ + chunk_appendf(&trash, "%s", header); \ + } \ + else if (thr == -1) { \ + /* aggregate value only for multi-thread: all & 0 */ \ + if (_nbt > 1 && tgt <= 0) \ + chunk_appendf(&trash, " %u%s", \ + (formula), \ + (tgt < 0) ? \ + " [" : ""); \ + } \ + else if (thr < _nbt) { \ + /* individual value only for all or exact value */ \ + if (tgt == -1 || tgt == thr+1) \ + chunk_appendf(&trash, " %u", \ + _v[thr]); \ + } \ + else /* thr == _nbt */ { \ + chunk_appendf(&trash, "%s\n", \ + (_nbt > 1 && tgt < 0) ? \ + " ]" : ""); \ + } \ + if (thr == -2 || restarted) { \ + /* failed once, emit one column at a time */\ + if (applet_putchk(appctx, &trash) == -1) \ + break; /* main loop handles it */ \ + chunk_reset(&trash); \ + actctx->col = thr + 3; \ + } \ + } \ + if (applet_putchk(appctx, &trash) == -1) \ + break; /* main loop will handle it */ \ + /* OK dump done for this line */ \ + chunk_reset(&trash); \ + if (thr > _nbt) \ + actctx->col = 0; \ + } while (0) + + /* retrieve uptime */ + up = now_ns - start_time_ns; + up_sec = ns_to_sec(up); + up_usec = (up / 1000U) % 1000000U; + + /* iterate over all dump lines. It happily skips over holes so it's + * not a problem not to have an exact match; we just need to have + * stable and consistent lines during a dump.
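+ * (Editor's note: the switch below keys every dump line on its own source + * line number via __LINE__ relative to base_line, so after a buffer-full + * interruption the handler resumes exactly at actctx->line without any + * separate enumeration of the output lines.)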
+ */ + base_line = __LINE__; + do { + chunk_reset(&trash); + + switch (actctx->line + base_line) { + case __LINE__: chunk_appendf(&trash, "thread_id: %u (%u..%u)\n", tid + 1, 1, global.nbthread); break; + case __LINE__: chunk_appendf(&trash, "date_now: %lu.%06lu\n", (ulong)date.tv_sec, (ulong)date.tv_usec); break; + case __LINE__: chunk_appendf(&trash, "uptime_now: %u.%06u\n", up_sec, up_usec); break; + case __LINE__: SHOW_VAL("ctxsw:", activity[thr].ctxsw, _tot); break; + case __LINE__: SHOW_VAL("tasksw:", activity[thr].tasksw, _tot); break; + case __LINE__: SHOW_VAL("empty_rq:", activity[thr].empty_rq, _tot); break; + case __LINE__: SHOW_VAL("long_rq:", activity[thr].long_rq, _tot); break; + case __LINE__: SHOW_VAL("curr_rq:", _HA_ATOMIC_LOAD(&ha_thread_ctx[thr].rq_total), _tot); break; + case __LINE__: SHOW_VAL("loops:", activity[thr].loops, _tot); break; + case __LINE__: SHOW_VAL("wake_tasks:", activity[thr].wake_tasks, _tot); break; + case __LINE__: SHOW_VAL("wake_signal:", activity[thr].wake_signal, _tot); break; + case __LINE__: SHOW_VAL("poll_io:", activity[thr].poll_io, _tot); break; + case __LINE__: SHOW_VAL("poll_exp:", activity[thr].poll_exp, _tot); break; + case __LINE__: SHOW_VAL("poll_drop_fd:", activity[thr].poll_drop_fd, _tot); break; + case __LINE__: SHOW_VAL("poll_skip_fd:", activity[thr].poll_skip_fd, _tot); break; + case __LINE__: SHOW_VAL("conn_dead:", activity[thr].conn_dead, _tot); break; + case __LINE__: SHOW_VAL("stream_calls:", activity[thr].stream_calls, _tot); break; + case __LINE__: SHOW_VAL("pool_fail:", activity[thr].pool_fail, _tot); break; + case __LINE__: SHOW_VAL("buf_wait:", activity[thr].buf_wait, _tot); break; + case __LINE__: SHOW_VAL("cpust_ms_tot:", activity[thr].cpust_total / 2, _tot); break; + case __LINE__: SHOW_VAL("cpust_ms_1s:", read_freq_ctr(&activity[thr].cpust_1s) / 2, _tot); break; + case __LINE__: SHOW_VAL("cpust_ms_15s:", read_freq_ctr_period(&activity[thr].cpust_15s, 15000) / 2, _tot); break; + case __LINE__: SHOW_VAL("avg_cpu_pct:", (100 - ha_thread_ctx[thr].idle_pct), (_tot + _nbt/2) / _nbt); break; + case __LINE__: SHOW_VAL("avg_loop_us:", swrate_avg(activity[thr].avg_loop_us, TIME_STATS_SAMPLES), (_tot + _nbt/2) / _nbt); break; + case __LINE__: SHOW_VAL("accepted:", activity[thr].accepted, _tot); break; + case __LINE__: SHOW_VAL("accq_pushed:", activity[thr].accq_pushed, _tot); break; + case __LINE__: SHOW_VAL("accq_full:", activity[thr].accq_full, _tot); break; +#ifdef USE_THREAD + case __LINE__: SHOW_VAL("accq_ring:", accept_queue_ring_len(&accept_queue_rings[thr]), _tot); break; + case __LINE__: SHOW_VAL("fd_takeover:", activity[thr].fd_takeover, _tot); break; + case __LINE__: SHOW_VAL("check_adopted:",activity[thr].check_adopted, _tot); break; +#endif + case __LINE__: SHOW_VAL("check_started:",activity[thr].check_started, _tot); break; + case __LINE__: SHOW_VAL("check_active:", _HA_ATOMIC_LOAD(&ha_thread_ctx[thr].active_checks), _tot); break; + case __LINE__: SHOW_VAL("check_running:",_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].running_checks), _tot); break; + +#if defined(DEBUG_DEV) + /* keep these ones at the end */ + case __LINE__: SHOW_VAL("ctr0:", activity[thr].ctr0, _tot); break; + case __LINE__: SHOW_VAL("ctr1:", activity[thr].ctr1, _tot); break; + case __LINE__: SHOW_VAL("ctr2:", activity[thr].ctr2, _tot); break; +#endif + } +#undef SHOW_VAL + + /* try to dump what was possibly not dumped yet */ + + if (applet_putchk(appctx, &trash) == -1) { + /* buffer full, retry later */ + return 0; + } + /* line was dumped, let's commit it 
*/ + actctx->line++; + } while (actctx->line + base_line < __LINE__); + + /* dump complete */ + return 1; +} + +/* parse a "show activity" CLI request. Returns 0 if it needs to continue, 1 if it + * wants to stop here. It sets a show_activity_ctx context where, if a specific + * thread is requested, it puts the thread number into ->thr otherwise sets it to + * -1. + */ +static int cli_parse_show_activity(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_activity_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return 1; + + ctx->thr = -1; // show all by default + if (*args[2]) + ctx->thr = atoi(args[2]); + + if (ctx->thr < -1 || ctx->thr > global.nbthread) + return cli_err(appctx, "Thread ID number must be between -1 and nbthread\n"); + + return 0; +} + +/* config keyword parsers */ +static struct cfg_kw_list cfg_kws = {ILH, { +#ifdef USE_MEMORY_PROFILING + { CFG_GLOBAL, "profiling.memory", cfg_parse_prof_memory }, +#endif + { CFG_GLOBAL, "profiling.tasks", cfg_parse_prof_tasks }, + { 0, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); + +/* register cli keywords */ +static struct cli_kw_list cli_kws = {{ },{ + { { "set", "profiling", NULL }, "set profiling <what> {auto|on|off} : enable/disable resource profiling (tasks,memory)", cli_parse_set_profiling, NULL }, + { { "show", "activity", NULL }, "show activity [-1|0|thread_num] : show per-thread activity stats (for support/developers)", cli_parse_show_activity, cli_io_handler_show_activity, NULL }, + { { "show", "profiling", NULL }, "show profiling [<what>|<#lines>|<opts>]*: show profiling state (all,status,tasks,memory)", cli_parse_show_profiling, cli_io_handler_show_profiling, NULL }, + { { "show", "tasks", NULL }, "show tasks : show running tasks", NULL, cli_io_handler_show_tasks, NULL }, + {{},} +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); diff --git a/src/applet.c b/src/applet.c new file mode 100644 index 0000000..a5b0946 --- /dev/null +++ b/src/applet.c @@ -0,0 +1,501 @@ +/* + * Functions managing applets + * + * Copyright 2000-2015 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <stdio.h> +#include <stdlib.h> + +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/channel.h> +#include <haproxy/list.h> +#include <haproxy/sc_strm.h> +#include <haproxy/stconn.h> +#include <haproxy/stream.h> +#include <haproxy/task.h> +#include <haproxy/trace.h> + +unsigned int nb_applets = 0; + +DECLARE_POOL(pool_head_appctx, "appctx", sizeof(struct appctx)); + + +/* trace source and events */ +static void applet_trace(enum trace_level level, uint64_t mask, + const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4); + +/* The event representation is split like this : + * app - applet + */ +static const struct trace_event applet_trace_events[] = { +#define APPLET_EV_NEW (1ULL << 0) + { .mask = APPLET_EV_NEW, .name = "app_new", .desc = "new appctx" }, +#define APPLET_EV_FREE (1ULL << 1) + { .mask = APPLET_EV_FREE, .name = "app_free", .desc = "free appctx" }, +#define APPLET_EV_RELEASE (1ULL << 2) + { .mask = APPLET_EV_RELEASE, .name = "app_release", .desc = "release appctx" }, +#define APPLET_EV_PROCESS (1ULL << 3) + { .mask = APPLET_EV_PROCESS, .name = "app_proc", .desc = "process appctx" }, +#define APPLET_EV_ERR (1ULL << 4) + { .mask = APPLET_EV_ERR, .name = "app_err", .desc = "error on appctx" }, +#define APPLET_EV_START (1ULL << 5) + { .mask = APPLET_EV_START, .name = "app_start", .desc = "start appctx" }, + {} +}; + +static const struct name_desc applet_trace_lockon_args[4] = { + /* arg1 */ { /* already used by the applet */ }, + /* arg2 */ { }, + /* arg3 */ { }, + /* arg4 */ { } +}; + +static const struct name_desc applet_trace_decoding[] = { +#define STRM_VERB_CLEAN 1 + { .name="clean", .desc="only user-friendly stuff, generally suitable for level \"user\"" }, +#define STRM_VERB_MINIMAL 2 + { .name="minimal", .desc="report info on streams and connectors" }, +#define STRM_VERB_SIMPLE 3 + { .name="simple", .desc="add info on request and response channels" }, +#define STRM_VERB_ADVANCED 4 + { .name="advanced", .desc="add info on channel's buffer for data and developer levels only" }, +#define STRM_VERB_COMPLETE 5 + { .name="complete", .desc="add info on channel's buffer" }, + { /* end */ } +}; + +static struct trace_source trace_applet = { + .name = IST("applet"), + .desc = "Applet endpoint", + .arg_def = TRC_ARG1_APPCTX, // TRACE()'s first argument is always an appctx + .default_cb = applet_trace, + .known_events = applet_trace_events, + .lockon_args = applet_trace_lockon_args, + .decoding = applet_trace_decoding, + .report_events = ~0, // report everything by default +}; + +#define TRACE_SOURCE &trace_applet +INITCALL1(STG_REGISTER, trace_register_source, TRACE_SOURCE); + +/* the applet traces always expect that arg1, if non-null, is an appctx (from + * which we can derive everything). + */ +static void applet_trace(enum trace_level level, uint64_t mask, const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4) +{ + const struct appctx *appctx = a1; + const struct stconn *sc = NULL, *sco = NULL; + const struct stream *s = NULL; + const struct channel *ic = NULL, *oc = NULL; + + if (!appctx || src->verbosity < STRM_VERB_CLEAN) + return; + + sc = appctx_sc(appctx); + if (sc) { + s = __sc_strm(sc); + sco = sc_opposite(sc); + ic = sc_ic(sc); + oc = sc_oc(sc); + } + + /* General info about the stream (htx/tcp, id...)
*/ + if (s) + chunk_appendf(&trace_buf, " : [%s,%s]", + appctx->applet->name, ((s->flags & SF_HTX) ? "HTX" : "TCP")); + else + chunk_appendf(&trace_buf, " : [%s]", appctx->applet->name); + + if (sc) + /* local and opposite stream connector state */ + chunk_appendf(&trace_buf, " SC=(%s,%s)", + sc_state_str(sc->state), sc_state_str(sco->state)); + else + /* local and opposite stream connector state */ + chunk_appendf(&trace_buf, " SC=(none,none)"); + + if (src->verbosity == STRM_VERB_CLEAN) + return; + + chunk_appendf(&trace_buf, " appctx=%p .t=%p .t.exp=%d .state=%d .st0=%d .st1=%d", + appctx, appctx->t, tick_isset(appctx->t->expire) ? TICKS_TO_MS(appctx->t->expire - now_ms) : TICK_ETERNITY, + appctx->state, appctx->st0, appctx->st1); + + if (!sc || src->verbosity == STRM_VERB_MINIMAL) + return; + + chunk_appendf(&trace_buf, " - s=(%p,0x%08x,0x%x)", s, s->flags, s->conn_err_type); + + chunk_appendf(&trace_buf, " sc=(%p,%d,0x%08x,0x%x) sco=(%p,%d,0x%08x,0x%x) sc.exp(r,w)=(%d,%d) sco.exp(r,w)=(%d,%d)", + sc, sc->state, sc->flags, sc->sedesc->flags, + sco, sco->state, sco->flags, sco->sedesc->flags, + tick_isset(sc_ep_rcv_ex(sc)) ? TICKS_TO_MS(sc_ep_rcv_ex(sc) - now_ms) : TICK_ETERNITY, + tick_isset(sc_ep_snd_ex(sc)) ? TICKS_TO_MS(sc_ep_snd_ex(sc) - now_ms) : TICK_ETERNITY, + tick_isset(sc_ep_rcv_ex(sco)) ? TICKS_TO_MS(sc_ep_rcv_ex(sco) - now_ms) : TICK_ETERNITY, + tick_isset(sc_ep_snd_ex(sco)) ? TICKS_TO_MS(sc_ep_snd_ex(sco) - now_ms) : TICK_ETERNITY); + + + /* If txn defined, don't display all channel info */ + if (src->verbosity == STRM_VERB_SIMPLE) { + chunk_appendf(&trace_buf, " ic=(%p .fl=0x%08x .exp=%d)", + ic, ic->flags, tick_isset(ic->analyse_exp) ? TICKS_TO_MS(ic->analyse_exp - now_ms) : TICK_ETERNITY); + chunk_appendf(&trace_buf, " oc=(%p .fl=0x%08x .exp=%d)", + oc, oc->flags, tick_isset(oc->analyse_exp) ? TICKS_TO_MS(oc->analyse_exp - now_ms) : TICK_ETERNITY); + } + else { + chunk_appendf(&trace_buf, " ic=(%p .fl=0x%08x .ana=0x%08x .exp=%u .o=%lu .tot=%llu .to_fwd=%u)", + ic, ic->flags, ic->analysers, ic->analyse_exp, + (long)ic->output, ic->total, ic->to_forward); + chunk_appendf(&trace_buf, " oc=(%p .fl=0x%08x .ana=0x%08x .exp=%u .o=%lu .tot=%llu .to_fwd=%u)", + oc, oc->flags, oc->analysers, oc->analyse_exp, + (long)oc->output, oc->total, oc->to_forward); + } + + if (src->verbosity == STRM_VERB_SIMPLE || + (src->verbosity == STRM_VERB_ADVANCED && src->level < TRACE_LEVEL_DATA)) + return; + + /* channels' buffer info */ + if (s->flags & SF_HTX) { + struct htx *ichtx = htxbuf(&ic->buf); + struct htx *ochtx = htxbuf(&oc->buf); + + chunk_appendf(&trace_buf, " htx=(%u/%u#%u, %u/%u#%u)", + ichtx->data, ichtx->size, htx_nbblks(ichtx), + ochtx->data, ochtx->size, htx_nbblks(ochtx)); + } + else { + chunk_appendf(&trace_buf, " buf=(%u@%p+%u/%u, %u@%p+%u/%u)", + (unsigned int)b_data(&ic->buf), b_orig(&ic->buf), + (unsigned int)b_head_ofs(&ic->buf), (unsigned int)b_size(&ic->buf), + (unsigned int)b_data(&oc->buf), b_orig(&oc->buf), + (unsigned int)b_head_ofs(&oc->buf), (unsigned int)b_size(&oc->buf)); + } +} + +/* Tries to allocate a new appctx and initialize all of its fields. The appctx + * is returned on success, NULL on failure. The appctx must be released using + * appctx_free(). <applet> is assigned as the applet, but it can be NULL. <thr> + * is the thread ID to start the applet on, and a negative value allows the + * applet to start anywhere. Backend applets may only be created on the current + * thread. 
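+ * (Editor's note: callers usually go through convenience wrappers such as + * appctx_new_here()/appctx_new_anywhere() from applet.h rather than picking + * a thread by hand; these helper names are given for illustration.)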
+ */ +struct appctx *appctx_new_on(struct applet *applet, struct sedesc *sedesc, int thr) +{ + struct appctx *appctx; + + /* Backend appctx cannot be started on another thread than the local one */ + BUG_ON(thr != tid && sedesc); + + TRACE_ENTER(APPLET_EV_NEW); + + appctx = pool_zalloc(pool_head_appctx); + if (unlikely(!appctx)) { + TRACE_ERROR("APPCTX allocation failure", APPLET_EV_NEW|APPLET_EV_ERR); + goto fail_appctx; + } + + LIST_INIT(&appctx->wait_entry); + appctx->obj_type = OBJ_TYPE_APPCTX; + appctx->applet = applet; + appctx->sess = NULL; + + appctx->t = task_new_on(thr); + if (unlikely(!appctx->t)) { + TRACE_ERROR("APPCTX task allocation failure", APPLET_EV_NEW|APPLET_EV_ERR); + goto fail_task; + } + + if (!sedesc) { + sedesc = sedesc_new(); + if (unlikely(!sedesc)) { + TRACE_ERROR("APPCTX sedesc allocation failure", APPLET_EV_NEW|APPLET_EV_ERR); + goto fail_endp; + } + sedesc->se = appctx; + se_fl_set(sedesc, SE_FL_T_APPLET | SE_FL_ORPHAN); + } + + appctx->sedesc = sedesc; + appctx->t->process = task_run_applet; + appctx->t->context = appctx; + + LIST_INIT(&appctx->buffer_wait.list); + appctx->buffer_wait.target = appctx; + appctx->buffer_wait.wakeup_cb = appctx_buf_available; + + _HA_ATOMIC_INC(&nb_applets); + + TRACE_LEAVE(APPLET_EV_NEW, appctx); + return appctx; + + fail_endp: + task_destroy(appctx->t); + fail_task: + pool_free(pool_head_appctx, appctx); + fail_appctx: + return NULL; +} + +/* Finalize the frontend appctx startup. It must not be called for a backend + * appctx. This function is responsible to create the appctx's session and the + * frontend stream connector. By transitivity, the stream is also created. + * + * It returns 0 on success and -1 on error. In this case, it is the caller + * responsibility to release the appctx. However, the session is released if it + * was created. On success, if an error is encountered in the caller function, + * the stream must be released instead of the appctx. To be sure, + * appctx_free_on_early_error() must be called in this case. + */ +int appctx_finalize_startup(struct appctx *appctx, struct proxy *px, struct buffer *input) +{ + struct session *sess; + + /* async startup is only possible for frontend appctx. Thus for orphan + * appctx. Because no backend appctx can be orphan. + */ + BUG_ON(!se_fl_test(appctx->sedesc, SE_FL_ORPHAN)); + + TRACE_ENTER(APPLET_EV_START, appctx); + + sess = session_new(px, NULL, &appctx->obj_type); + if (!sess) { + TRACE_ERROR("APPCTX session allocation failure", APPLET_EV_START|APPLET_EV_ERR, appctx); + return -1; + } + if (!sc_new_from_endp(appctx->sedesc, sess, input)) { + session_free(sess); + TRACE_ERROR("APPCTX sc allocation failure", APPLET_EV_START|APPLET_EV_ERR, appctx); + return -1; + } + + appctx->sess = sess; + TRACE_LEAVE(APPLET_EV_START, appctx); + return 0; +} + +/* Release function to call when an error occurred during init stage of a + * frontend appctx. For a backend appctx, it just calls appctx_free() + */ +void appctx_free_on_early_error(struct appctx *appctx) +{ + /* If a frontend appctx is attached to a stream connector, release the stream + * instead of the appctx. 
+ */ + if (!se_fl_test(appctx->sedesc, SE_FL_ORPHAN) && !(appctx_sc(appctx)->flags & SC_FL_ISBACK)) { + stream_free(appctx_strm(appctx)); + return; + } + appctx_free(appctx); +} + +void appctx_free(struct appctx *appctx) +{ + /* The task is supposed to be run on this thread, so we can just + * check if it's running already (or about to run) or not + */ + if (!(appctx->t->state & (TASK_QUEUED | TASK_RUNNING))) { + TRACE_POINT(APPLET_EV_FREE, appctx); + __appctx_free(appctx); + } + else { + /* if it's running, or about to run, defer the freeing + * until the callback is called. + */ + appctx->state |= APPLET_WANT_DIE; + task_wakeup(appctx->t, TASK_WOKEN_OTHER); + TRACE_DEVEL("Cannot release APPCTX now, wake it up", APPLET_EV_FREE, appctx); + } +} + +/* reserves a command context of at least <size> bytes in the <appctx>, for + * use by a CLI command or any regular applet. The pointer to this context is + * stored in ctx.svcctx and is returned. The caller doesn't need to release + * it as it's allocated from reserved space. If the size is larger than + * APPLET_MAX_SVCCTX a crash will occur (hence that will never happen outside + * of development). + * + * Note that the command does *not* initialize the area, so that it can easily + * be used upon each entry in a function. It's left to the initialization code + * to do it if needed. The CLI will always zero the whole area before calling + * a keyword's ->parse() function. + */ +void *applet_reserve_svcctx(struct appctx *appctx, size_t size) +{ + BUG_ON(size > APPLET_MAX_SVCCTX); + appctx->svcctx = &appctx->svc.storage; + return appctx->svcctx; +} + +/* This is used to reset an svcctx and the svc.storage without releasing the + * appctx. In fact this is only used by the CLI applet between commands. + */ +void applet_reset_svcctx(struct appctx *appctx) +{ + memset(&appctx->svc.storage, 0, APPLET_MAX_SVCCTX); + appctx->svcctx = NULL; +} + +/* call the applet's release() function if any, and marks the sedesc as shut. + * Needs to be called upon close(). + */ +void appctx_shut(struct appctx *appctx) +{ + if (se_fl_test(appctx->sedesc, SE_FL_SHR | SE_FL_SHW)) + return; + + TRACE_ENTER(APPLET_EV_RELEASE, appctx); + if (appctx->applet->release) + appctx->applet->release(appctx); + + if (LIST_INLIST(&appctx->buffer_wait.list)) + LIST_DEL_INIT(&appctx->buffer_wait.list); + + se_fl_set(appctx->sedesc, SE_FL_SHRR | SE_FL_SHWN); + TRACE_LEAVE(APPLET_EV_RELEASE, appctx); +} + +/* Callback used to wake up an applet when a buffer is available. The applet + * <appctx> is woken up if an input buffer was requested for the associated + * stream connector. In this case the buffer is immediately allocated and the + * function returns 1. Otherwise it returns 0. Note that this automatically + * covers multiple wake-up attempts by ensuring that the same buffer will not + * be accounted for multiple times. + */ +int appctx_buf_available(void *arg) +{ + struct appctx *appctx = arg; + struct stconn *sc = appctx_sc(appctx); + + /* allocation requested ? */ + if (!(sc->flags & SC_FL_NEED_BUFF)) + return 0; + + sc_have_buff(sc); + + /* was already allocated another way ? if so, don't take this one */ + if (c_size(sc_ic(sc)) || sc_ep_have_ff_data(sc_opposite(sc))) + return 0; + + /* allocation possible now ? 
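+	 * (editor's note: we were woken up by the buffer allocator, but another
+	 * allocation may have raced us, so b_alloc() below may still fail, in
+	 * which case we simply re-register our need for a buffer)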
*/ + if (!b_alloc(&sc_ic(sc)->buf)) { + sc_need_buff(sc); + return 0; + } + + task_wakeup(appctx->t, TASK_WOKEN_RES); + return 1; +} + +/* Default applet handler */ +struct task *task_run_applet(struct task *t, void *context, unsigned int state) +{ + struct appctx *app = context; + struct stconn *sc, *sco; + unsigned int rate; + size_t count; + int did_send = 0; + + TRACE_ENTER(APPLET_EV_PROCESS, app); + + if (app->state & APPLET_WANT_DIE) { + TRACE_DEVEL("APPCTX want die, release it", APPLET_EV_FREE, app); + __appctx_free(app); + return NULL; + } + + if (se_fl_test(app->sedesc, SE_FL_ORPHAN)) { + /* Finalize init of orphan appctx. .init callback function must + * be defined and it must finalize appctx startup. + */ + BUG_ON(!app->applet->init); + + if (appctx_init(app) == -1) { + TRACE_DEVEL("APPCTX init failed", APPLET_EV_FREE|APPLET_EV_ERR, app); + appctx_free_on_early_error(app); + return NULL; + } + BUG_ON(!app->sess || !appctx_sc(app) || !appctx_strm(app)); + TRACE_DEVEL("APPCTX initialized", APPLET_EV_PROCESS, app); + } + + sc = appctx_sc(app); + sco = sc_opposite(sc); + + /* We always pretend the applet can't get and doesn't want to + * put, it's up to it to change this if needed. This ensures + * that one applet which ignores any event will not spin. + */ + applet_need_more_data(app); + applet_have_no_more_data(app); + + /* Now we'll try to allocate the input buffer. We wake up the applet in + * all cases. So this is the applet's responsibility to check if this + * buffer was allocated or not. This leaves a chance for applets to do + * some other processing if needed. The applet doesn't have anything to + * do if it needs the buffer, it will be called again upon readiness. + */ + if (!sc_alloc_ibuf(sc, &app->buffer_wait)) + applet_have_more_data(app); + + count = co_data(sc_oc(sc)); + app->applet->fct(app); + + TRACE_POINT(APPLET_EV_PROCESS, app); + + /* now check if the applet has released some room and forgot to + * notify the other side about it. + */ + if (count != co_data(sc_oc(sc))) { + sc_oc(sc)->flags |= CF_WRITE_EVENT | CF_WROTE_DATA; + if (sco->room_needed < 0 || channel_recv_max(sc_oc(sc)) >= sco->room_needed) + sc_have_room(sco); + did_send = 1; + } + else { + if (!sco->room_needed) + sc_have_room(sco); + } + + if (sc_ic(sc)->flags & CF_READ_EVENT) + sc_ep_report_read_activity(sc); + + if (sc_waiting_room(sc) && (sc->flags & SC_FL_ABRT_DONE)) { + sc_ep_set(sc, SE_FL_EOS|SE_FL_ERROR); + } + + if (!co_data(sc_oc(sc))) { + if (did_send) + sc_ep_report_send_activity(sc); + } + else + sc_ep_report_blocked_send(sc, did_send); + + /* measure the call rate and check for anomalies when too high */ + if (((b_size(sc_ib(sc)) && sc->flags & SC_FL_NEED_BUFF) || // asks for a buffer which is present + (b_size(sc_ib(sc)) && !b_data(sc_ib(sc)) && sc->flags & SC_FL_NEED_ROOM) || // asks for room in an empty buffer + (b_data(sc_ob(sc)) && sc_is_send_allowed(sc)) || // asks for data already present + (!b_data(sc_ib(sc)) && b_data(sc_ob(sc)) && // didn't return anything ... + (!(sc_oc(sc)->flags & CF_WRITE_EVENT) && (sc->flags & SC_FL_SHUT_WANTED))))) { // ... 
and left data pending after a shut
+		rate = update_freq_ctr(&app->call_rate, 1);
+		if (rate >= 100000 && app->call_rate.prev_ctr) // looped like this more than 100k times over last second
+			stream_dump_and_crash(&app->obj_type, read_freq_ctr(&app->call_rate));
+	}
+
+	sc->app_ops->wake(sc);
+	channel_release_buffer(sc_ic(sc), &app->buffer_wait);
+	TRACE_LEAVE(APPLET_EV_PROCESS, app);
+	return t;
+}
diff --git a/src/arg.c b/src/arg.c
new file mode 100644
index 0000000..2810050
--- /dev/null
+++ b/src/arg.c
@@ -0,0 +1,479 @@
+/*
+ * Functions used to parse typed argument lists
+ *
+ * Copyright 2012 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+
+#include <haproxy/arg.h>
+#include <haproxy/chunk.h>
+#include <haproxy/global.h>
+#include <haproxy/regex.h>
+#include <haproxy/tools.h>
+
+const char *arg_type_names[ARGT_NBTYPES] = {
+	[ARGT_STOP] = "end of arguments",
+	[ARGT_SINT] = "integer",
+	[ARGT_STR] = "string",
+	[ARGT_IPV4] = "IPv4 address",
+	[ARGT_MSK4] = "IPv4 mask",
+	[ARGT_IPV6] = "IPv6 address",
+	[ARGT_MSK6] = "IPv6 mask",
+	[ARGT_TIME] = "delay",
+	[ARGT_SIZE] = "size",
+	[ARGT_FE] = "frontend",
+	[ARGT_BE] = "backend",
+	[ARGT_TAB] = "table",
+	[ARGT_SRV] = "server",
+	[ARGT_USR] = "user list",
+	[ARGT_MAP] = "map",
+	[ARGT_REG] = "regex",
+	[ARGT_VAR] = "variable",
+	[ARGT_PBUF_FNUM] = "Protocol buffers field number",
+	/* Unassigned types must never happen. Better crash during parsing if they do. */
+};
+
+/* This dummy arg list may be used by default when no arg is found; it helps
+ * parsers by removing pointer checks.
+ */
+struct arg empty_arg_list[ARGM_NBARGS] = { };
+
+/* This function clones a struct arg_list template into a new one which is
+ * returned.
+ */
+struct arg_list *arg_list_clone(const struct arg_list *orig)
+{
+	struct arg_list *new;
+
+	if ((new = calloc(1, sizeof(*new))) != NULL) {
+		/* ->list will be set by the caller when inserting the element.
+		 * ->arg and ->arg_pos will be set by the caller.
+		 */
+		new->ctx = orig->ctx;
+		new->kw = orig->kw;
+		new->conv = orig->conv;
+		new->file = orig->file;
+		new->line = orig->line;
+	}
+	return new;
+}
+
+/* This function clones a struct <arg_list> template into a new one which is
+ * set to point to arg <arg> at pos <pos>, and which is returned if the caller
+ * wants to apply further changes.
+ */
+struct arg_list *arg_list_add(struct arg_list *orig, struct arg *arg, int pos)
+{
+	struct arg_list *new;
+
+	new = arg_list_clone(orig);
+	if (new) {
+		new->arg = arg;
+		new->arg_pos = pos;
+		LIST_APPEND(&orig->list, &new->list);
+	}
+	return new;
+}
+
+/* This function builds an argument list from a config line, and stops at the
+ * first non-matching character, which is pointed to in <end_ptr>. A valid arg
+ * list starts with an opening parenthesis '(', contains a number of comma-
+ * delimited words, and ends with the closing parenthesis ')'. An empty list
+ * (with or without the parentheses) will lead to a valid empty argument if the
+ * keyword has a mandatory one. The function returns the number of arguments
+ * emitted, or <0 in case of any error. Everything needed is automatically
+ * allocated.
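+ *
+ * For example (editor's sketch, not upstream documentation), parsing the
+ * argument list of a keyword declared with the mask ARG1(1,SINT), i.e. one
+ * mandatory integer argument, could look like this:
+ *
+ *	struct arg *args = NULL;
+ *	char *err = NULL;
+ *	const char *end;
+ *	int err_arg;
+ *
+ *	if (make_arg_list("(12)", -1, ARG1(1,SINT), &args,
+ *	                  &err, &end, &err_arg, NULL) > 0) {
+ *		// args[0].type == ARGT_SINT and args[0].data.sint == 12,
+ *		// args[1].type == ARGT_STOP; release with free(free_args(args));
+ *	}
+ *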
A pointer to an error message might be returned in err_msg if not
+ * NULL, in which case it would be allocated and the caller will have to check
+ * it and free it. The output arg list is returned in argp which must be valid.
+ * The returned array is always terminated by an arg of type ARGT_STOP (0),
+ * unless the mask indicates that no argument is supported. Unresolved arguments
+ * are appended to arg list <al>, which also serves as a template to create new
+ * entries. <al> may be NULL if unresolved arguments are not allowed. The mask
+ * is composed of a number of mandatory arguments in its lower ARGM_BITS bits,
+ * and a concatenation of each argument type in each subsequent ARGT_BITS-bit
+ * sub-block. If <err_msg> is not NULL, it must point to a freeable or NULL
+ * pointer. The caller is expected to restart the parsing from the new pointer
+ * set in <end_ptr>, which is the first character considered as not being part
+ * of the arg list. The input string ends at the first NUL character, or after
+ * <len> characters when <len> is positive, whichever comes first. Placing -1
+ * in <len> will make it virtually unbounded (~2GB long strings).
+ */
+int make_arg_list(const char *in, int len, uint64_t mask, struct arg **argp,
+                  char **err_msg, const char **end_ptr, int *err_arg,
+                  struct arg_list *al)
+{
+	int nbarg;
+	int pos;
+	struct arg *arg;
+	const char *beg;
+	const char *ptr_err = NULL;
+	int min_arg;
+	int empty;
+	struct arg_list *new_al = al;
+
+	*argp = NULL;
+
+	empty = 0;
+	if (!len || *in != '(') {
+		/* it's already not for us, stop here */
+		empty = 1;
+		len = 0;
+	} else {
+		/* skip opening parenthesis */
+		len--;
+		in++;
+	}
+
+	min_arg = mask & ARGM_MASK;
+	mask >>= ARGM_BITS;
+
+	pos = 0;
+	/* find between 0 and NBARGS the max number of args supported by the mask */
+	for (nbarg = 0; nbarg < ARGM_NBARGS && ((mask >> (nbarg * ARGT_BITS)) & ARGT_MASK); nbarg++);
+
+	if (!nbarg)
+		goto end_parse;
+
+	/* Note: an empty input string contains an empty argument if this argument
+	 * is marked mandatory. Otherwise we can ignore it.
+	 */
+	if (empty && !min_arg)
+		goto end_parse;
+
+	arg = *argp = calloc(nbarg + 1, sizeof(**argp));
+
+	if (!arg)
+		goto alloc_err;
+
+	/* Note: empty arguments after a comma always exist.
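+	 * (editor's note: e.g. in "fct(a,)" the trailing comma yields a
+	 * second, empty argument rather than a parse error)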
*/ + while (pos < nbarg) { + unsigned int uint; + int squote = 0, dquote = 0; + char *out; + + chunk_reset(&trash); + out = trash.area; + + while (len && *in && trash.data < trash.size - 1) { + if (*in == '"' && !squote) { /* double quote outside single quotes */ + if (dquote) + dquote = 0; + else + dquote = 1; + in++; len--; + continue; + } + else if (*in == '\'' && !dquote) { /* single quote outside double quotes */ + if (squote) + squote = 0; + else + squote = 1; + in++; len--; + continue; + } + else if (*in == '\\' && !squote && len != 1) { + /* '\', ', ' ', '"' support being escaped by '\' */ + if (in[1] == 0) + goto unquote_err; + + if (in[1] == '\\' || in[1] == ' ' || in[1] == '"' || in[1] == '\'') { + in++; len--; + *out++ = *in; + } + else if (in[1] == 'r') { + in++; len--; + *out++ = '\r'; + } + else if (in[1] == 'n') { + in++; len--; + *out++ = '\n'; + } + else if (in[1] == 't') { + in++; len--; + *out++ = '\t'; + } + else { + /* just a lone '\' */ + *out++ = *in; + } + in++; len--; + } + else { + if (!squote && !dquote && (*in == ',' || *in == ')')) { + /* end of argument */ + break; + } + /* verbatim copy */ + *out++ = *in++; + len--; + } + trash.data = out - trash.area; + } + + if (len && *in && *in != ',' && *in != ')') + goto buffer_err; + + trash.area[trash.data] = 0; + + arg->type = (mask >> (pos * ARGT_BITS)) & ARGT_MASK; + + switch (arg->type) { + case ARGT_SINT: + if (!trash.data) // empty number + goto empty_err; + beg = trash.area; + arg->data.sint = read_int64(&beg, trash.area + trash.data); + if (beg < trash.area + trash.data) + goto parse_err; + arg->type = ARGT_SINT; + break; + + case ARGT_FE: + case ARGT_BE: + case ARGT_TAB: + case ARGT_SRV: + case ARGT_USR: + case ARGT_REG: + /* These argument types need to be stored as strings during + * parsing then resolved later. + */ + if (!al) + goto resolve_err; + arg->unresolved = 1; + new_al = arg_list_add(al, arg, pos); + __fallthrough; + + case ARGT_STR: + /* all types that must be resolved are stored as strings + * during the parsing. The caller must at one point resolve + * them and free the string. 
+ */ + arg->data.str.area = my_strndup(trash.area, trash.data); + arg->data.str.data = trash.data; + arg->data.str.size = trash.data + 1; + break; + + case ARGT_IPV4: + if (!trash.data) // empty address + goto empty_err; + + if (inet_pton(AF_INET, trash.area, &arg->data.ipv4) <= 0) + goto parse_err; + break; + + case ARGT_MSK4: + if (!trash.data) // empty mask + goto empty_err; + + if (!str2mask(trash.area, &arg->data.ipv4)) + goto parse_err; + + arg->type = ARGT_IPV4; + break; + + case ARGT_IPV6: + if (!trash.data) // empty address + goto empty_err; + + if (inet_pton(AF_INET6, trash.area, &arg->data.ipv6) <= 0) + goto parse_err; + break; + + case ARGT_MSK6: + if (!trash.data) // empty mask + goto empty_err; + + if (!str2mask6(trash.area, &arg->data.ipv6)) + goto parse_err; + + arg->type = ARGT_IPV6; + break; + + case ARGT_TIME: + if (!trash.data) // empty time + goto empty_err; + + ptr_err = parse_time_err(trash.area, &uint, TIME_UNIT_MS); + if (ptr_err) { + if (ptr_err == PARSE_TIME_OVER || ptr_err == PARSE_TIME_UNDER) + ptr_err = trash.area; + goto parse_err; + } + arg->data.sint = uint; + arg->type = ARGT_SINT; + break; + + case ARGT_SIZE: + if (!trash.data) // empty size + goto empty_err; + + ptr_err = parse_size_err(trash.area, &uint); + if (ptr_err) + goto parse_err; + + arg->data.sint = uint; + arg->type = ARGT_SINT; + break; + + case ARGT_PBUF_FNUM: + if (!trash.data) + goto empty_err; + + if (!parse_dotted_uints(trash.area, &arg->data.fid.ids, &arg->data.fid.sz)) + goto parse_err; + + break; + + /* FIXME: other types need to be implemented here */ + default: + goto not_impl; + } + + pos++; + arg++; + + /* don't go back to parsing if we reached end */ + if (!len || !*in || *in == ')' || pos >= nbarg) + break; + + /* skip comma */ + in++; len--; + } + + end_parse: + if (pos < min_arg) { + /* not enough arguments */ + memprintf(err_msg, + "missing arguments (got %d/%d), type '%s' expected", + pos, min_arg, arg_type_names[(mask >> (pos * ARGT_BITS)) & ARGT_MASK]); + goto err; + } + + if (empty) { + /* nothing to do */ + } else if (*in == ')') { + /* skip the expected closing parenthesis */ + in++; + } else { + /* the caller is responsible for freeing this message */ + char *word = (len > 0) ? my_strndup(in, len) : (char *)in; + + if (*word) + memprintf(err_msg, "expected ')' before '%s'", word); + else + memprintf(err_msg, "expected ')'"); + + if (len > 0) + free(word); + /* when we're missing a right paren, the empty part preceding + * already created an empty arg, adding one to the position, so + * let's fix the reporting to avoid being confusing. + */ + if (pos > 1) + pos--; + goto err; + } + + /* note that pos might be < nbarg and this is not an error, it's up to the + * caller to decide what to do with optional args. + */ + if (err_arg) + *err_arg = pos; + if (end_ptr) + *end_ptr = in; + return pos; + + err: + if (new_al == al) { + /* only free the arg area if we have not queued unresolved args + * still pointing to it. + */ + free_args(*argp); + free(*argp); + } + *argp = NULL; + if (err_arg) + *err_arg = pos; + if (end_ptr) + *end_ptr = in; + return -1; + + empty_err: + /* If we've only got an empty set of parenthesis with nothing + * in between, there is no arg at all. 
+ */ + if (!pos) { + ha_free(argp); + } + + if (pos >= min_arg) + goto end_parse; + + memprintf(err_msg, "expected type '%s' at position %d, but got nothing", + arg_type_names[(mask >> (pos * ARGT_BITS)) & ARGT_MASK], pos + 1); + goto err; + + parse_err: + /* come here with the word attempted to parse in trash */ + memprintf(err_msg, "failed to parse '%s' as type '%s' at position %d", + trash.area, arg_type_names[(mask >> (pos * ARGT_BITS)) & ARGT_MASK], pos + 1); + goto err; + + not_impl: + memprintf(err_msg, "parsing for type '%s' was not implemented, please report this bug", + arg_type_names[(mask >> (pos * ARGT_BITS)) & ARGT_MASK]); + goto err; + + buffer_err: + memprintf(err_msg, "too small buffer size to store decoded argument %d, increase bufsize ?", + pos + 1); + goto err; + + unquote_err: + /* come here with the parsed part in <trash.area>:<trash.data> and the + * unparsable part in <in>. + */ + trash.area[trash.data] = 0; + memprintf(err_msg, "failed to parse '%s' after '%s' as type '%s' at position %d", + in, trash.area, arg_type_names[(mask >> (pos * ARGT_BITS)) & ARGT_MASK], pos + 1); + goto err; + +alloc_err: + memprintf(err_msg, "out of memory"); + goto err; + + resolve_err: + memprintf(err_msg, "unresolved argument of type '%s' at position %d not allowed", + arg_type_names[(mask >> (pos * ARGT_BITS)) & ARGT_MASK], pos + 1); + goto err; +} + +/* Free all args of an args array, taking care of unresolved arguments as well. + * It stops at the ARGT_STOP, which must be present. The array itself is not + * freed, it's up to the caller to do it. However it is returned, allowing to + * call free(free_args(argptr)). It is valid to call it with a NULL args, and + * nothing will be done). + */ +struct arg *free_args(struct arg *args) +{ + struct arg *arg; + + for (arg = args; arg && arg->type != ARGT_STOP; arg++) { + if (arg->type == ARGT_STR || arg->unresolved) + chunk_destroy(&arg->data.str); + else if (arg->type == ARGT_REG) + regex_free(arg->data.reg); + else if (arg->type == ARGT_PBUF_FNUM) + ha_free(&arg->data.fid.ids); + } + return args; +} diff --git a/src/auth.c b/src/auth.c new file mode 100644 index 0000000..0031300 --- /dev/null +++ b/src/auth.c @@ -0,0 +1,316 @@ +/* + * User authentication & authorization + * + * Copyright 2010 Krzysztof Piotr Oledzki <ole@ans.pl> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ *
+ */
+
+#ifdef USE_LIBCRYPT
+/* This is to have crypt() defined on Linux */
+#define _GNU_SOURCE
+
+#ifdef USE_CRYPT_H
+/* some platforms such as Solaris need this */
+#include <crypt.h>
+#endif
+#endif /* USE_LIBCRYPT */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <haproxy/api.h>
+#include <haproxy/auth-t.h>
+#include <haproxy/errors.h>
+#include <haproxy/global.h>
+#include <haproxy/list.h>
+#include <haproxy/pattern-t.h>
+#include <haproxy/sample-t.h>
+#include <haproxy/thread.h>
+
+struct userlist *userlist = NULL; /* list of all existing userlists */
+
+#ifdef USE_LIBCRYPT
+#define CRYPT_STATE_MSG "yes"
+#ifdef HA_HAVE_CRYPT_R
+/* context for crypt_r() */
+static THREAD_LOCAL struct crypt_data crypt_data = { .initialized = 0 };
+#else
+/* lock for crypt() */
+__decl_thread(static HA_SPINLOCK_T auth_lock);
+#endif
+#else /* USE_LIBCRYPT */
+#define CRYPT_STATE_MSG "no"
+#endif
+
+/* Looks up a userlist by name. The function returns a pointer to the
+ * userlist struct, or NULL if <name> is NULL/empty or cannot be found.
+ */
+
+struct userlist *
+auth_find_userlist(char *name)
+{
+	struct userlist *l;
+
+	if (!name || !*name)
+		return NULL;
+
+	for (l = userlist; l; l = l->next)
+		if (strcmp(l->name, name) == 0)
+			return l;
+
+	return NULL;
+}
+
+int check_group(struct userlist *ul, char *name)
+{
+	struct auth_groups *ag;
+
+	for (ag = ul->groups; ag; ag = ag->next)
+		if (strcmp(name, ag->name) == 0)
+			return 1;
+	return 0;
+}
+
+void
+userlist_free(struct userlist *ul)
+{
+	struct userlist *tul;
+	struct auth_users *au, *tau;
+	struct auth_groups_list *agl, *tagl;
+	struct auth_groups *ag, *tag;
+
+	while (ul) {
+		/* Free users. */
+		au = ul->users;
+		while (au) {
+			/* Free groups that own current user. */
+			agl = au->u.groups;
+			while (agl) {
+				tagl = agl;
+				agl = agl->next;
+				free(tagl);
+			}
+
+			tau = au;
+			au = au->next;
+			free(tau->user);
+			free(tau->pass);
+			free(tau);
+		}
+
+		/* Free grouplist. */
+		ag = ul->groups;
+		while (ag) {
+			tag = ag;
+			ag = ag->next;
+			free(tag->name);
+			free(tag);
+		}
+
+		tul = ul;
+		ul = ul->next;
+		free(tul->name);
+		free(tul);
+	}
+}
+
+int userlist_postinit()
+{
+	struct userlist *curuserlist = NULL;
+
+	/* Resolve usernames and groupnames. */
+	for (curuserlist = userlist; curuserlist; curuserlist = curuserlist->next) {
+		struct auth_groups *ag;
+		struct auth_users *curuser;
+		struct auth_groups_list *grl;
+
+		for (curuser = curuserlist->users; curuser; curuser = curuser->next) {
+			char *group = NULL;
+			struct auth_groups_list *groups = NULL;
+
+			if (!curuser->u.groups_names)
+				continue;
+
+			while ((group = strtok(group?NULL:curuser->u.groups_names, ","))) {
+				for (ag = curuserlist->groups; ag; ag = ag->next) {
+					if (strcmp(ag->name, group) == 0)
+						break;
+				}
+
+				if (!ag) {
+					ha_alert("userlist '%s': no such group '%s' specified in user '%s'\n",
+						 curuserlist->name, group, curuser->user);
+					free(groups);
+					return ERR_ALERT | ERR_FATAL;
+				}
+
+				/* Add this group to the user's group list.
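+				 * (editor's illustration: the names resolved here come
+				 * from a configuration such as:
+				 *
+				 *	userlist L1
+				 *	    group G1
+				 *	    user tiger insecure-password secret123 groups G1
+				 *
+				 * where each user's comma-separated "groups" field is
+				 * turned into a list of pointers to the group structures)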
*/
+				grl = calloc(1, sizeof(*grl));
+				if (!grl) {
+					ha_alert("userlist '%s': no more memory when trying to allocate the user groups.\n",
+						 curuserlist->name);
+					free(groups);
+					return ERR_ALERT | ERR_FATAL;
+				}
+
+				grl->group = ag;
+				grl->next = groups;
+				groups = grl;
+			}
+
+			free(curuser->u.groups);
+			curuser->u.groups = groups;
+		}
+
+		for (ag = curuserlist->groups; ag; ag = ag->next) {
+			char *user = NULL;
+
+			if (!ag->groupusers)
+				continue;
+
+			while ((user = strtok(user?NULL:ag->groupusers, ","))) {
+				for (curuser = curuserlist->users; curuser; curuser = curuser->next) {
+					if (strcmp(curuser->user, user) == 0)
+						break;
+				}
+
+				if (!curuser) {
+					ha_alert("userlist '%s': no such user '%s' specified in group '%s'\n",
+						 curuserlist->name, user, ag->name);
+					return ERR_ALERT | ERR_FATAL;
+				}
+
+				/* Add this group to the user's group list. */
+				grl = calloc(1, sizeof(*grl));
+				if (!grl) {
+					ha_alert("userlist '%s': no more memory when trying to allocate the user groups.\n",
+						 curuserlist->name);
+					return ERR_ALERT | ERR_FATAL;
+				}
+
+				grl->group = ag;
+				grl->next = curuser->u.groups;
+				curuser->u.groups = grl;
+			}
+
+			ha_free(&ag->groupusers);
+		}
+
+#ifdef DEBUG_AUTH
+		for (ag = curuserlist->groups; ag; ag = ag->next) {
+			struct auth_groups_list *agl;
+
+			fprintf(stderr, "group %s, id %p, users:", ag->name, ag);
+			for (curuser = curuserlist->users; curuser; curuser = curuser->next) {
+				for (agl = curuser->u.groups; agl; agl = agl->next) {
+					if (agl->group == ag)
+						fprintf(stderr, " %s", curuser->user);
+				}
+			}
+			fprintf(stderr, "\n");
+		}
+#endif
+	}
+
+	return ERR_NONE;
+}
+
+/*
+ * Authenticate and authorize user; return 1 if OK, 0 in case of error.
+ */
+int
+check_user(struct userlist *ul, const char *user, const char *pass)
+{
+
+	struct auth_users *u;
+#ifdef DEBUG_AUTH
+	struct auth_groups_list *agl;
+#endif
+	const char *ep;
+
+#ifdef DEBUG_AUTH
+	fprintf(stderr, "req: userlist=%s, user=%s, pass=%s\n",
+		ul->name, user, pass);
+#endif
+
+	for (u = ul->users; u; u = u->next)
+		if (strcmp(user, u->user) == 0)
+			break;
+
+	if (!u)
+		return 0;
+
+#ifdef DEBUG_AUTH
+	fprintf(stderr, "cfg: user=%s, pass=%s, flags=%X, groups=",
+		u->user, u->pass, u->flags);
+	for (agl = u->u.groups; agl; agl = agl->next)
+		fprintf(stderr, " %s", agl->group->name);
+#endif
+
+	if (!(u->flags & AU_O_INSECURE)) {
+#ifdef USE_LIBCRYPT
+#ifdef HA_HAVE_CRYPT_R
+		ep = crypt_r(pass, u->pass, &crypt_data);
+#else
+		HA_SPIN_LOCK(AUTH_LOCK, &auth_lock);
+		ep = crypt(pass, u->pass);
+		HA_SPIN_UNLOCK(AUTH_LOCK, &auth_lock);
+#endif
+#else
+		return 0;
+#endif
+	} else
+		ep = pass;
+
+#ifdef DEBUG_AUTH
+	fprintf(stderr, ", crypt=%s\n", ((ep) ? ep : ""));
+#endif
+
+	if (ep && strcmp(ep, u->pass) == 0)
+		return 1;
+	else
+		return 0;
+}
+
+struct pattern *
+pat_match_auth(struct sample *smp, struct pattern_expr *expr, int fill)
+{
+	struct userlist *ul = smp->ctx.a[0];
+	struct pattern_list *lst;
+	struct auth_users *u;
+	struct auth_groups_list *agl;
+	struct pattern *pattern;
+
+	/* Check if the userlist is present in the context data. */
+	if (!ul)
+		return NULL;
+
+	/* Browse the userlist, looking for the user. */
+	for (u = ul->users; u; u = u->next) {
+		if (strcmp(smp->data.u.str.area, u->user) == 0)
+			break;
+	}
+	if (!u)
+		return NULL;
+
+	/* Browse each pattern. */
+	list_for_each_entry(lst, &expr->patterns, list) {
+		pattern = &lst->pat;
+
+		/* Browse each group, looking for a group name that matches the pattern.
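+		 * (editor's note: this is what the http_auth_group() ACL relies on:
+		 * e.g. "acl is_admin http_auth_group(L1) G1" lists group names as
+		 * its patterns)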
*/
+		for (agl = u->u.groups; agl; agl = agl->next) {
+			if (strcmp(agl->group->name, pattern->ptr.str) == 0)
+				return pattern;
+		}
+	}
+	return NULL;
+}
+
+REGISTER_BUILD_OPTS("Encrypted password support via crypt(3): "CRYPT_STATE_MSG);
diff --git a/src/backend.c b/src/backend.c
new file mode 100644
index 0000000..39d2c75
--- /dev/null
+++ b/src/backend.c
@@ -0,0 +1,3401 @@
+/*
+ * Backend variables and functions.
+ *
+ * Copyright 2000-2013 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <syslog.h>
+#include <string.h>
+#include <ctype.h>
+#include <sys/types.h>
+
+#include <import/ebmbtree.h>
+
+#include <haproxy/api.h>
+#include <haproxy/acl.h>
+#include <haproxy/activity.h>
+#include <haproxy/arg.h>
+#include <haproxy/backend.h>
+#include <haproxy/channel.h>
+#include <haproxy/check.h>
+#include <haproxy/frontend.h>
+#include <haproxy/global.h>
+#include <haproxy/hash.h>
+#include <haproxy/http.h>
+#include <haproxy/http_ana.h>
+#include <haproxy/http_htx.h>
+#include <haproxy/htx.h>
+#include <haproxy/lb_chash.h>
+#include <haproxy/lb_fas.h>
+#include <haproxy/lb_fwlc.h>
+#include <haproxy/lb_fwrr.h>
+#include <haproxy/lb_map.h>
+#include <haproxy/log.h>
+#include <haproxy/namespace.h>
+#include <haproxy/obj_type.h>
+#include <haproxy/payload.h>
+#include <haproxy/proto_tcp.h>
+#include <haproxy/protocol.h>
+#include <haproxy/proxy.h>
+#include <haproxy/queue.h>
+#include <haproxy/sample.h>
+#include <haproxy/sc_strm.h>
+#include <haproxy/server.h>
+#include <haproxy/session.h>
+#include <haproxy/ssl_sock.h>
+#include <haproxy/stconn.h>
+#include <haproxy/stream.h>
+#include <haproxy/task.h>
+#include <haproxy/ticks.h>
+#include <haproxy/time.h>
+#include <haproxy/trace.h>
+
+#define TRACE_SOURCE &trace_strm
+
+int be_lastsession(const struct proxy *be)
+{
+	if (be->be_counters.last_sess)
+		return ns_to_sec(now_ns) - be->be_counters.last_sess;
+
+	return -1;
+}
+
+/* helper function to invoke the correct hash method */
+unsigned int gen_hash(const struct proxy* px, const char* key, unsigned long len)
+{
+	unsigned int hash;
+
+	switch (px->lbprm.algo & BE_LB_HASH_FUNC) {
+	case BE_LB_HFCN_DJB2:
+		hash = hash_djb2(key, len);
+		break;
+	case BE_LB_HFCN_WT6:
+		hash = hash_wt6(key, len);
+		break;
+	case BE_LB_HFCN_CRC32:
+		hash = hash_crc32(key, len);
+		break;
+	case BE_LB_HFCN_NONE:
+		/* use key as a hash */
+		{
+			const char *_key = key;
+
+			hash = read_int64(&_key, _key + len);
+		}
+		break;
+	case BE_LB_HFCN_SDBM:
+		/* this is the default hash function */
+	default:
+		hash = hash_sdbm(key, len);
+		break;
+	}
+
+	if ((px->lbprm.algo & BE_LB_HASH_MOD) == BE_LB_HMOD_AVAL)
+		hash = full_hash(hash);
+
+	return hash;
+}
+
+/*
+ * This function recounts the number of usable active and backup servers for
+ * proxy <p>. These numbers are stored into p->srv_act and p->srv_bck.
+ * This function also recomputes the total active and backup weights. However,
+ * it does not update tot_weight nor tot_used. Use update_backend_weight() for
+ * this.
+ * This function is designed to be called before a server's weight and state
+ * are committed, so it uses the 'next' weight and state values.
+ *
+ * threads: it is the caller's responsibility to lock data.
For now, this
+ * function is called from lb modules, so it should be ok. But if you need to
+ * call it from another place, be careful (and update this comment).
+ */
+void recount_servers(struct proxy *px)
+{
+	struct server *srv;
+
+	px->srv_act = px->srv_bck = 0;
+	px->lbprm.tot_wact = px->lbprm.tot_wbck = 0;
+	px->lbprm.fbck = NULL;
+	for (srv = px->srv; srv != NULL; srv = srv->next) {
+		if (!srv_willbe_usable(srv))
+			continue;
+
+		if (srv->flags & SRV_F_BACKUP) {
+			if (!px->srv_bck &&
+			    !(px->options & PR_O_USE_ALL_BK))
+				px->lbprm.fbck = srv;
+			px->srv_bck++;
+			srv->cumulative_weight = px->lbprm.tot_wbck;
+			px->lbprm.tot_wbck += srv->next_eweight;
+		} else {
+			px->srv_act++;
+			srv->cumulative_weight = px->lbprm.tot_wact;
+			px->lbprm.tot_wact += srv->next_eweight;
+		}
+	}
+}
+
+/* This function simply updates the backend's tot_weight and tot_used values
+ * after server weights have been updated. It is designed to be used after
+ * recount_servers() or equivalent.
+ *
+ * threads: it is the caller's responsibility to lock data. For now, this
+ * function is called from lb modules, so it should be ok. But if you need to
+ * call it from another place, be careful (and update this comment).
+ */
+void update_backend_weight(struct proxy *px)
+{
+	if (px->srv_act) {
+		px->lbprm.tot_weight = px->lbprm.tot_wact;
+		px->lbprm.tot_used = px->srv_act;
+	}
+	else if (px->lbprm.fbck) {
+		/* use only the first backup server */
+		px->lbprm.tot_weight = px->lbprm.fbck->next_eweight;
+		px->lbprm.tot_used = 1;
+	}
+	else {
+		px->lbprm.tot_weight = px->lbprm.tot_wbck;
+		px->lbprm.tot_used = px->srv_bck;
+	}
+}
+
+/*
+ * This function tries to find a running server for the proxy <px> following
+ * the source hash method. Depending on the number of active/backup servers,
+ * it will either look for active servers, or for backup servers.
+ * If any server is found, it will be returned. If no valid server is found,
+ * NULL is returned.
+ */
+static struct server *get_server_sh(struct proxy *px, const char *addr, int len, const struct server *avoid)
+{
+	unsigned int h, l;
+
+	if (px->lbprm.tot_weight == 0)
+		return NULL;
+
+	l = h = 0;
+
+	/* note: we won't hash if there's only one server left */
+	if (px->lbprm.tot_used == 1)
+		goto hash_done;
+
+	while ((l + sizeof (int)) <= len) {
+		h ^= ntohl(*(unsigned int *)(&addr[l]));
+		l += sizeof (int);
+	}
+	/* FIXME: why don't we use gen_hash() here as well?
+	 * -> we don't take into account hash function from "hash_type"
+	 * options here..
+	 */
+	if ((px->lbprm.algo & BE_LB_HASH_MOD) == BE_LB_HMOD_AVAL)
+		h = full_hash(h);
+ hash_done:
+	if ((px->lbprm.algo & BE_LB_LKUP) == BE_LB_LKUP_CHTREE)
+		return chash_get_server_hash(px, h, avoid);
+	else
+		return map_get_server_hash(px, h);
+}
+
+/*
+ * This function tries to find a running server for the proxy <px> following
+ * the URI hash method. In order to optimize cache hits, the hash computation
+ * ends at the question mark. Depending on the number of active/backup servers,
+ * it will either look for active servers, or for backup servers.
+ * If any server is found, it will be returned. If no valid server is found,
+ * NULL is returned. The lbprm.arg_opt{1,2,3} values correspond respectively to
+ * the "whole" optional argument (boolean, bit0), the "len" argument (numeric)
+ * and the "depth" argument (numeric).
+ *
+ * This code was contributed by Guillaume Dallaire, who also selected this hash
+ * algorithm out of tens of candidates because it gave him the best results.
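+ *
+ * Worked example (editor's illustration): with "balance uri depth 2" and a
+ * request for "/img/logo/x.png?v=1", the scan stops at the third '/' (the
+ * code stores depth+1 in arg_opt3), so only "/img/logo" is hashed. The '?'
+ * also ends the scan unless the "whole" option is set.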
+ * + */ +static struct server *get_server_uh(struct proxy *px, char *uri, int uri_len, const struct server *avoid) +{ + unsigned int hash = 0; + int c; + int slashes = 0; + const char *start, *end; + + if (px->lbprm.tot_weight == 0) + return NULL; + + /* note: we won't hash if there's only one server left */ + if (px->lbprm.tot_used == 1) + goto hash_done; + + if (px->lbprm.arg_opt2) // "len" + uri_len = MIN(uri_len, px->lbprm.arg_opt2); + + start = end = uri; + while (uri_len--) { + c = *end; + if (c == '/') { + slashes++; + if (slashes == px->lbprm.arg_opt3) /* depth+1 */ + break; + } + else if (c == '?' && !(px->lbprm.arg_opt1 & 1)) // "whole" + break; + end++; + } + + hash = gen_hash(px, start, (end - start)); + + hash_done: + if ((px->lbprm.algo & BE_LB_LKUP) == BE_LB_LKUP_CHTREE) + return chash_get_server_hash(px, hash, avoid); + else + return map_get_server_hash(px, hash); +} + +/* + * This function tries to find a running server for the proxy <px> following + * the URL parameter hash method. It looks for a specific parameter in the + * URL and hashes it to compute the server ID. This is useful to optimize + * performance by avoiding bounces between servers in contexts where sessions + * are shared but cookies are not usable. If the parameter is not found, NULL + * is returned. If any server is found, it will be returned. If no valid server + * is found, NULL is returned. + */ +static struct server *get_server_ph(struct proxy *px, const char *uri, int uri_len, const struct server *avoid) +{ + unsigned int hash = 0; + const char *start, *end; + const char *p; + const char *params; + int plen; + + /* when tot_weight is 0 then so is srv_count */ + if (px->lbprm.tot_weight == 0) + return NULL; + + if ((p = memchr(uri, '?', uri_len)) == NULL) + return NULL; + + p++; + + uri_len -= (p - uri); + plen = px->lbprm.arg_len; + params = p; + + while (uri_len > plen) { + /* Look for the parameter name followed by an equal symbol */ + if (params[plen] == '=') { + if (memcmp(params, px->lbprm.arg_str, plen) == 0) { + /* OK, we have the parameter here at <params>, and + * the value after the equal sign, at <p> + * skip the equal symbol + */ + p += plen + 1; + start = end = p; + uri_len -= plen + 1; + + while (uri_len && *end != '&') { + uri_len--; + end++; + } + hash = gen_hash(px, start, (end - start)); + + if ((px->lbprm.algo & BE_LB_LKUP) == BE_LB_LKUP_CHTREE) + return chash_get_server_hash(px, hash, avoid); + else + return map_get_server_hash(px, hash); + } + } + /* skip to next parameter */ + p = memchr(params, '&', uri_len); + if (!p) + return NULL; + p++; + uri_len -= (p - params); + params = p; + } + return NULL; +} + +/* + * this does the same as the previous server_ph, but check the body contents + */ +static struct server *get_server_ph_post(struct stream *s, const struct server *avoid) +{ + unsigned int hash = 0; + struct channel *req = &s->req; + struct proxy *px = s->be; + struct htx *htx = htxbuf(&req->buf); + struct htx_blk *blk; + unsigned int plen = px->lbprm.arg_len; + unsigned long len; + const char *params, *p, *start, *end; + + if (px->lbprm.tot_weight == 0) + return NULL; + + p = params = NULL; + len = 0; + for (blk = htx_get_first_blk(htx); blk; blk = htx_get_next_blk(htx, blk)) { + enum htx_blk_type type = htx_get_blk_type(blk); + struct ist v; + + if (type != HTX_BLK_DATA) + continue; + v = htx_get_blk_value(htx, blk); + p = params = v.ptr; + len = v.len; + break; + } + + while (len > plen) { + /* Look for the parameter name followed by an equal symbol */ + if 
(params[plen] == '=') {
+			if (memcmp(params, px->lbprm.arg_str, plen) == 0) {
+				/* OK, we have the parameter here at <params>, and
+				 * the value after the equal sign, at <p>
+				 * skip the equal symbol
+				 */
+				p += plen + 1;
+				start = end = p;
+				len -= plen + 1;
+
+				while (len && *end != '&') {
+					if (unlikely(!HTTP_IS_TOKEN(*p))) {
+						/* if in a POST, body must be URI encoded or it's not a URI.
+						 * Do not interpret any possible binary data as a parameter.
+						 */
+						if (likely(HTTP_IS_LWS(*p))) /* eol, uncertain uri len */
+							break;
+						return NULL; /* oh, no; this is not uri-encoded.
+							      * This body does not contain parameters.
+							      */
+					}
+					len--;
+					end++;
+					/* should we break if vlen exceeds limit? */
+				}
+				hash = gen_hash(px, start, (end - start));
+
+				if ((px->lbprm.algo & BE_LB_LKUP) == BE_LB_LKUP_CHTREE)
+					return chash_get_server_hash(px, hash, avoid);
+				else
+					return map_get_server_hash(px, hash);
+			}
+		}
+		/* skip to next parameter */
+		p = memchr(params, '&', len);
+		if (!p)
+			return NULL;
+		p++;
+		len -= (p - params);
+		params = p;
+	}
+	return NULL;
+}
+
+
+/*
+ * This function tries to find a running server for the proxy <px> following
+ * the Header parameter hash method. It looks for a specific header in the
+ * request and hashes its value to compute the server ID. This is useful to
+ * optimize performance by avoiding bounces between servers in contexts where
+ * sessions are shared but cookies are not usable. If the header is not found,
+ * NULL is returned. If any server is found, it will be returned. If no valid
+ * server is found, NULL is returned. When lbprm.arg_opt1 is set, the hash will
+ * only apply to the middle part of a domain name ("use_domain_only" option).
+ */
+static struct server *get_server_hh(struct stream *s, const struct server *avoid)
+{
+	unsigned int hash = 0;
+	struct proxy *px = s->be;
+	unsigned int plen = px->lbprm.arg_len;
+	unsigned long len;
+	const char *p;
+	const char *start, *end;
+	struct htx *htx = htxbuf(&s->req.buf);
+	struct http_hdr_ctx ctx = { .blk = NULL };
+
+	/* tot_weight appears to mean srv_count */
+	if (px->lbprm.tot_weight == 0)
+		return NULL;
+
+	/* note: we won't hash if there's only one server left */
+	if (px->lbprm.tot_used == 1)
+		goto hash_done;
+
+	http_find_header(htx, ist2(px->lbprm.arg_str, plen), &ctx, 0);
+
+	/* if the header is not found or empty, let's fall back to round robin */
+	if (!ctx.blk || !ctx.value.len)
+		return NULL;
+
+	/* Found the param_name in the headers. We will compute the hash
+	 * based on this value, ctx.value.
+	 */
+	len = ctx.value.len;
+	p = ctx.value.ptr;
+
+	if (!px->lbprm.arg_opt1) {
+		hash = gen_hash(px, p, len);
+	} else {
+		int dohash = 0;
+		p += len;
+		/* special computation: use only the main domain name, not the
+		 * tld/host. Going back from the end of the string, start
+		 * hashing at the first dot and stop at the next one.
+		 * This is designed to work with the 'Host' header, and requires
+		 * a special option to activate this.
+		 */
+		end = p;
+		while (len) {
+			if (dohash) {
+				/* Rewind the pointer until the previous char
+				 * is a dot, this will allow us to set the
+				 * start position of the domain. */
+				if (*(p - 1) == '.')
+					break;
+			}
+			else if (*p == '.') {
+				/* The pointer is rewound to the dot before the
+				 * tld, we memorize the end of the domain and
+				 * can enter the domain processing.
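+				 * For example (editor's illustration), hashing the
+				 * Host header "www.example.com" with "use_domain_only"
+				 * ends up hashing only the "example" part.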
*/ + end = p; + dohash = 1; + } + p--; + len--; + } + start = p; + hash = gen_hash(px, start, (end - start)); + } + + hash_done: + if ((px->lbprm.algo & BE_LB_LKUP) == BE_LB_LKUP_CHTREE) + return chash_get_server_hash(px, hash, avoid); + else + return map_get_server_hash(px, hash); +} + +/* RDP Cookie HASH. */ +static struct server *get_server_rch(struct stream *s, const struct server *avoid) +{ + unsigned int hash = 0; + struct proxy *px = s->be; + unsigned long len; + int ret; + struct sample smp; + int rewind; + + /* tot_weight appears to mean srv_count */ + if (px->lbprm.tot_weight == 0) + return NULL; + + memset(&smp, 0, sizeof(smp)); + + rewind = co_data(&s->req); + c_rew(&s->req, rewind); + + ret = fetch_rdp_cookie_name(s, &smp, px->lbprm.arg_str, px->lbprm.arg_len); + len = smp.data.u.str.data; + + c_adv(&s->req, rewind); + + if (ret == 0 || (smp.flags & SMP_F_MAY_CHANGE) || len == 0) + return NULL; + + /* note: we won't hash if there's only one server left */ + if (px->lbprm.tot_used == 1) + goto hash_done; + + /* Found the param_name in the headers. + * we will compute the hash based on this value ctx.val. + */ + hash = gen_hash(px, smp.data.u.str.area, len); + + hash_done: + if ((px->lbprm.algo & BE_LB_LKUP) == BE_LB_LKUP_CHTREE) + return chash_get_server_hash(px, hash, avoid); + else + return map_get_server_hash(px, hash); +} + +/* sample expression HASH. Returns NULL if the sample is not found or if there + * are no server, relying on the caller to fall back to round robin instead. + */ +static struct server *get_server_expr(struct stream *s, const struct server *avoid) +{ + struct proxy *px = s->be; + struct sample *smp; + unsigned int hash = 0; + + if (px->lbprm.tot_weight == 0) + return NULL; + + /* note: no need to hash if there's only one server left */ + if (px->lbprm.tot_used == 1) + goto hash_done; + + smp = sample_fetch_as_type(px, s->sess, s, SMP_OPT_DIR_REQ | SMP_OPT_FINAL, px->lbprm.expr, SMP_T_BIN); + if (!smp) + return NULL; + + /* We have the desired data. Let's hash it according to the configured + * options and algorithm. + */ + hash = gen_hash(px, smp->data.u.str.area, smp->data.u.str.data); + + hash_done: + if ((px->lbprm.algo & BE_LB_LKUP) == BE_LB_LKUP_CHTREE) + return chash_get_server_hash(px, hash, avoid); + else + return map_get_server_hash(px, hash); +} + +/* random value */ +static struct server *get_server_rnd(struct stream *s, const struct server *avoid) +{ + unsigned int hash = 0; + struct proxy *px = s->be; + struct server *prev, *curr; + int draws = px->lbprm.arg_opt1; // number of draws + + /* tot_weight appears to mean srv_count */ + if (px->lbprm.tot_weight == 0) + return NULL; + + curr = NULL; + do { + prev = curr; + hash = statistical_prng(); + curr = chash_get_server_hash(px, hash, avoid); + if (!curr) + break; + + /* compare the new server to the previous best choice and pick + * the one with the least currently served requests. + */ + if (prev && prev != curr && + curr->served * prev->cur_eweight > prev->served * curr->cur_eweight) + curr = prev; + } while (--draws > 0); + + /* if the selected server is full, pretend we have none so that we reach + * the backend's queue instead. + */ + if (curr && + (curr->queue.length || (curr->maxconn && curr->served >= srv_dynamic_maxconn(curr)))) + curr = NULL; + + return curr; +} + +/* + * This function applies the load-balancing algorithm to the stream, as + * defined by the backend it is assigned to. The stream is then marked as + * 'assigned'. 
+ * + * This function MAY NOT be called with SF_ASSIGNED already set. If the stream + * had a server previously assigned, it is rebalanced, trying to avoid the same + * server, which should still be present in target_srv(&s->target) before the call. + * The function tries to keep the original connection slot if it reconnects to + * the same server, otherwise it releases it and tries to offer it. + * + * It is illegal to call this function with a stream in a queue. + * + * It may return : + * SRV_STATUS_OK if everything is OK. ->srv and ->target are assigned. + * SRV_STATUS_NOSRV if no server is available. Stream is not ASSIGNED + * SRV_STATUS_FULL if all servers are saturated. Stream is not ASSIGNED + * SRV_STATUS_INTERNAL for other unrecoverable errors. + * + * Upon successful return, the stream flag SF_ASSIGNED is set to indicate that + * it does not need to be called anymore. This means that target_srv(&s->target) + * can be trusted in balance and direct modes. + * + */ + +int assign_server(struct stream *s) +{ + struct connection *conn = NULL; + struct server *conn_slot; + struct server *srv = NULL, *prev_srv; + int err; + + err = SRV_STATUS_INTERNAL; + if (unlikely(s->pend_pos || s->flags & SF_ASSIGNED)) + goto out_err; + + prev_srv = objt_server(s->target); + conn_slot = s->srv_conn; + + /* We have to release any connection slot before applying any LB algo, + * otherwise we may erroneously end up with no available slot. + */ + if (conn_slot) + sess_change_server(s, NULL); + + /* We will now try to find the good server and store it into <objt_server(s->target)>. + * Note that <objt_server(s->target)> may be NULL in case of dispatch or proxy mode, + * as well as if no server is available (check error code). + */ + + srv = NULL; + s->target = NULL; + + if ((s->be->lbprm.algo & BE_LB_KIND) != BE_LB_KIND_HI && + ((s->sess->flags & SESS_FL_PREFER_LAST) || + (s->be->options & PR_O_PREF_LAST))) { + struct sess_srv_list *srv_list; + list_for_each_entry(srv_list, &s->sess->srv_list, srv_list) { + struct server *tmpsrv = objt_server(srv_list->target); + + if (tmpsrv && tmpsrv->proxy == s->be && + ((s->sess->flags & SESS_FL_PREFER_LAST) || + (!s->be->max_ka_queue || + server_has_room(tmpsrv) || ( + tmpsrv->queue.length + 1 < s->be->max_ka_queue))) && + srv_currently_usable(tmpsrv)) { + list_for_each_entry(conn, &srv_list->conn_list, session_list) { + if (!(conn->flags & CO_FL_WAIT_XPRT)) { + srv = tmpsrv; + s->target = &srv->obj_type; + if (conn->flags & CO_FL_SESS_IDLE) { + conn->flags &= ~CO_FL_SESS_IDLE; + s->sess->idle_conns--; + } + goto out_ok; + } + } + } + } + } + + if (s->be->lbprm.algo & BE_LB_KIND) { + /* we must check if we have at least one server available */ + if (!s->be->lbprm.tot_weight) { + err = SRV_STATUS_NOSRV; + goto out; + } + + /* if there's some queue on the backend, with certain algos we + * know it's because all servers are full. + */ + if (s->be->queue.length && s->be->queue.length != s->be->beconn && + (((s->be->lbprm.algo & (BE_LB_KIND|BE_LB_NEED|BE_LB_PARM)) == BE_LB_ALGO_FAS)|| // first + ((s->be->lbprm.algo & (BE_LB_KIND|BE_LB_NEED|BE_LB_PARM)) == BE_LB_ALGO_RR) || // roundrobin + ((s->be->lbprm.algo & (BE_LB_KIND|BE_LB_NEED|BE_LB_PARM)) == BE_LB_ALGO_SRR))) { // static-rr + err = SRV_STATUS_FULL; + goto out; + } + + /* First check whether we need to fetch some data or simply call + * the LB lookup function. Only the hashing functions will need + * some input data in fact, and will support multiple algorithms. 
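+		 * For instance (editor's note), "balance source" needs the
+		 * client's address, "balance uri" needs the request's start
+		 * line, while the round robin variants need no input at all.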
+ */ + switch (s->be->lbprm.algo & BE_LB_LKUP) { + case BE_LB_LKUP_RRTREE: + srv = fwrr_get_next_server(s->be, prev_srv); + break; + + case BE_LB_LKUP_FSTREE: + srv = fas_get_next_server(s->be, prev_srv); + break; + + case BE_LB_LKUP_LCTREE: + srv = fwlc_get_next_server(s->be, prev_srv); + break; + + case BE_LB_LKUP_CHTREE: + case BE_LB_LKUP_MAP: + if ((s->be->lbprm.algo & BE_LB_KIND) == BE_LB_KIND_RR) { + /* static-rr (map) or random (chash) */ + if ((s->be->lbprm.algo & BE_LB_PARM) == BE_LB_RR_RANDOM) + srv = get_server_rnd(s, prev_srv); + else + srv = map_get_server_rr(s->be, prev_srv); + break; + } + else if ((s->be->lbprm.algo & BE_LB_KIND) != BE_LB_KIND_HI) { + /* unknown balancing algorithm */ + err = SRV_STATUS_INTERNAL; + goto out; + } + + switch (s->be->lbprm.algo & BE_LB_PARM) { + const struct sockaddr_storage *src; + + case BE_LB_HASH_SRC: + src = sc_src(s->scf); + if (src && src->ss_family == AF_INET) { + srv = get_server_sh(s->be, + (void *)&((struct sockaddr_in *)src)->sin_addr, + 4, prev_srv); + } + else if (src && src->ss_family == AF_INET6) { + srv = get_server_sh(s->be, + (void *)&((struct sockaddr_in6 *)src)->sin6_addr, + 16, prev_srv); + } + break; + + case BE_LB_HASH_URI: + /* URI hashing */ + if (IS_HTX_STRM(s) && s->txn->req.msg_state >= HTTP_MSG_BODY) { + struct ist uri; + + uri = htx_sl_req_uri(http_get_stline(htxbuf(&s->req.buf))); + if (s->be->lbprm.arg_opt1 & 2) { + struct http_uri_parser parser = + http_uri_parser_init(uri); + + uri = http_parse_path(&parser); + if (!isttest(uri)) + uri = ist(""); + } + srv = get_server_uh(s->be, uri.ptr, uri.len, prev_srv); + } + break; + + case BE_LB_HASH_PRM: + /* URL Parameter hashing */ + if (IS_HTX_STRM(s) && s->txn->req.msg_state >= HTTP_MSG_BODY) { + struct ist uri; + + uri = htx_sl_req_uri(http_get_stline(htxbuf(&s->req.buf))); + srv = get_server_ph(s->be, uri.ptr, uri.len, prev_srv); + + if (!srv && s->txn->meth == HTTP_METH_POST) + srv = get_server_ph_post(s, prev_srv); + } + break; + + case BE_LB_HASH_HDR: + /* Header Parameter hashing */ + if (IS_HTX_STRM(s) && s->txn->req.msg_state >= HTTP_MSG_BODY) + srv = get_server_hh(s, prev_srv); + break; + + case BE_LB_HASH_RDP: + /* RDP Cookie hashing */ + srv = get_server_rch(s, prev_srv); + break; + + case BE_LB_HASH_SMP: + /* sample expression hashing */ + srv = get_server_expr(s, prev_srv); + break; + + default: + /* unknown balancing algorithm */ + err = SRV_STATUS_INTERNAL; + goto out; + } + + /* If the hashing parameter was not found, let's fall + * back to round robin on the map. + */ + if (!srv) { + if ((s->be->lbprm.algo & BE_LB_LKUP) == BE_LB_LKUP_CHTREE) + srv = chash_get_next_server(s->be, prev_srv); + else + srv = map_get_server_rr(s->be, prev_srv); + } + + /* end of map-based LB */ + break; + + default: + /* unknown balancing algorithm */ + err = SRV_STATUS_INTERNAL; + goto out; + } + + if (!srv) { + err = SRV_STATUS_FULL; + goto out; + } + else if (srv != prev_srv) { + _HA_ATOMIC_INC(&s->be->be_counters.cum_lbconn); + _HA_ATOMIC_INC(&srv->counters.cum_lbconn); + } + s->target = &srv->obj_type; + } + else if (s->be->options & (PR_O_DISPATCH | PR_O_TRANSP)) { + s->target = &s->be->obj_type; + } + else { + err = SRV_STATUS_NOSRV; + goto out; + } + +out_ok: + s->flags |= SF_ASSIGNED; + err = SRV_STATUS_OK; + out: + + /* Either we take back our connection slot, or we offer it to someone + * else if we don't need it anymore. 
+ */ + if (conn_slot) { + if (conn_slot == srv) { + sess_change_server(s, srv); + } else { + if (may_dequeue_tasks(conn_slot, s->be)) + process_srv_queue(conn_slot); + } + } + + out_err: + return err; +} + +/* Allocate an address for the destination endpoint + * The address is taken from the currently assigned server, or from the + * dispatch or transparent address. + * + * Returns SRV_STATUS_OK on success. Does nothing if the address was + * already set. + * On error, no address is allocated and SRV_STATUS_INTERNAL is returned. + */ +static int alloc_dst_address(struct sockaddr_storage **ss, + struct server *srv, struct stream *s) +{ + const struct sockaddr_storage *dst; + + if (*ss) + return SRV_STATUS_OK; + + if ((s->flags & SF_DIRECT) || (s->be->lbprm.algo & BE_LB_KIND)) { + /* A server is necessarily known for this stream */ + if (!(s->flags & SF_ASSIGNED)) + return SRV_STATUS_INTERNAL; + + if (!sockaddr_alloc(ss, NULL, 0)) + return SRV_STATUS_INTERNAL; + + **ss = srv->addr; + set_host_port(*ss, srv->svc_port); + if (!is_addr(*ss)) { + /* if the server has no address, we use the same address + * the client asked, which is handy for remapping ports + * locally on multiple addresses at once. Nothing is done + * for AF_UNIX addresses. + */ + dst = sc_dst(s->scf); + if (dst && dst->ss_family == AF_INET) { + ((struct sockaddr_in *)*ss)->sin_family = AF_INET; + ((struct sockaddr_in *)*ss)->sin_addr = + ((struct sockaddr_in *)dst)->sin_addr; + } else if (dst && dst->ss_family == AF_INET6) { + ((struct sockaddr_in6 *)*ss)->sin6_family = AF_INET6; + ((struct sockaddr_in6 *)*ss)->sin6_addr = + ((struct sockaddr_in6 *)dst)->sin6_addr; + } + } + + /* if this server remaps proxied ports, we'll use + * the port the client connected to with an offset. */ + if ((srv->flags & SRV_F_MAPPORTS)) { + int base_port; + + dst = sc_dst(s->scf); + if (dst) { + /* First, retrieve the port from the incoming connection */ + base_port = get_host_port(dst); + + /* Second, assign the outgoing connection's port */ + base_port += get_host_port(*ss); + set_host_port(*ss, base_port); + } + } + } + else if (s->be->options & PR_O_DISPATCH) { + if (!sockaddr_alloc(ss, NULL, 0)) + return SRV_STATUS_INTERNAL; + + /* connect to the defined dispatch addr */ + **ss = s->be->dispatch_addr; + } + else if ((s->be->options & PR_O_TRANSP)) { + if (!sockaddr_alloc(ss, NULL, 0)) + return SRV_STATUS_INTERNAL; + + /* in transparent mode, use the original dest addr if no dispatch specified */ + dst = sc_dst(s->scf); + if (dst && (dst->ss_family == AF_INET || dst->ss_family == AF_INET6)) + **ss = *dst; + } + else { + /* no server and no LB algorithm ! */ + return SRV_STATUS_INTERNAL; + } + + return SRV_STATUS_OK; +} + +/* This function assigns a server to stream <s> if required, and can add the + * connection to either the assigned server's queue or to the proxy's queue. + * If ->srv_conn is set, the stream is first released from the server. + * It may also be called with SF_DIRECT and/or SF_ASSIGNED though. It will + * be called before any connection and after any retry or redispatch occurs. + * + * It is not allowed to call this function with a stream in a queue. + * + * Returns : + * + * SRV_STATUS_OK if everything is OK. + * SRV_STATUS_NOSRV if no server is available. objt_server(s->target) = NULL. + * SRV_STATUS_QUEUED if the connection has been queued. + * SRV_STATUS_FULL if the server(s) is/are saturated and the + * connection could not be queued at the server's, + * which may be NULL if we queue on the backend. 
+ * SRV_STATUS_INTERNAL for other unrecoverable errors. + * + */ +int assign_server_and_queue(struct stream *s) +{ + struct pendconn *p; + struct server *srv; + int err; + + if (s->pend_pos) + return SRV_STATUS_INTERNAL; + + err = SRV_STATUS_OK; + if (!(s->flags & SF_ASSIGNED)) { + struct server *prev_srv = objt_server(s->target); + + err = assign_server(s); + if (prev_srv) { + /* This stream was previously assigned to a server. We have to + * update the stream's and the server's stats : + * - if the server changed : + * - set TX_CK_DOWN if txn.flags was TX_CK_VALID + * - set SF_REDISP if it was successfully redispatched + * - increment srv->redispatches and be->redispatches + * - if the server remained the same : update retries. + */ + + if (prev_srv != objt_server(s->target)) { + if (s->txn && (s->txn->flags & TX_CK_MASK) == TX_CK_VALID) { + s->txn->flags &= ~TX_CK_MASK; + s->txn->flags |= TX_CK_DOWN; + } + s->flags |= SF_REDISP; + _HA_ATOMIC_INC(&prev_srv->counters.redispatches); + _HA_ATOMIC_INC(&s->be->be_counters.redispatches); + } else { + _HA_ATOMIC_INC(&prev_srv->counters.retries); + _HA_ATOMIC_INC(&s->be->be_counters.retries); + } + } + } + + switch (err) { + case SRV_STATUS_OK: + /* we have SF_ASSIGNED set */ + srv = objt_server(s->target); + if (!srv) + return SRV_STATUS_OK; /* dispatch or proxy mode */ + + /* If we already have a connection slot, no need to check any queue */ + if (s->srv_conn == srv) + return SRV_STATUS_OK; + + /* OK, this stream already has an assigned server, but no + * connection slot yet. Either it is a redispatch, or it was + * assigned from persistence information (direct mode). + */ + if ((s->flags & SF_REDIRECTABLE) && srv->rdr_len) { + /* server scheduled for redirection, and already assigned. We + * don't want to go further nor check the queue. + */ + sess_change_server(s, srv); /* not really needed in fact */ + return SRV_STATUS_OK; + } + + /* We might have to queue this stream if the assigned server is full. + * We know we have to queue it into the server's queue, so if a maxqueue + * is set on the server, we must also check that the server's queue is + * not full, in which case we have to return FULL. + */ + if (srv->maxconn && + (srv->queue.length || srv->served >= srv_dynamic_maxconn(srv))) { + + if (srv->maxqueue > 0 && srv->queue.length >= srv->maxqueue) + return SRV_STATUS_FULL; + + p = pendconn_add(s); + if (p) + return SRV_STATUS_QUEUED; + else + return SRV_STATUS_INTERNAL; + } + + /* OK, we can use this server. Let's reserve our place */ + sess_change_server(s, srv); + return SRV_STATUS_OK; + + case SRV_STATUS_FULL: + /* queue this stream into the proxy's queue */ + p = pendconn_add(s); + if (p) + return SRV_STATUS_QUEUED; + else + return SRV_STATUS_INTERNAL; + + case SRV_STATUS_NOSRV: + return err; + + case SRV_STATUS_INTERNAL: + return err; + + default: + return SRV_STATUS_INTERNAL; + } +} + +/* Allocate an address if an explicit source address must be used for a backend + * connection. + * + * Two parameters are taken into account to check if specific source address is + * configured. The first one is <srv> which is the server instance to connect + * to. It may be NULL when dispatching is used. The second one <be> is the + * backend instance which contains the target server or dispatch. + * + * A stream instance <s> can be used to set the stream owner of the backend + * connection. It is a required parameter if the source address is a dynamic + * parameter. 
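+ *
+ * The transparent-proxy modes handled below correspond to the "usesrc"
+ * configuration directive (editor's examples): "source 0.0.0.0 usesrc
+ * 192.0.2.10" maps to CO_SRC_TPROXY_ADDR, "usesrc client" / "usesrc clientip"
+ * to CO_SRC_TPROXY_CLI / CO_SRC_TPROXY_CIP, and "usesrc
+ * hdr_ip(X-Forwarded-For)" to CO_SRC_TPROXY_DYN.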
+ * + * Returns SRV_STATUS_OK if either no specific source address specified or its + * allocation is done correctly. On error returns SRV_STATUS_INTERNAL. + */ +int alloc_bind_address(struct sockaddr_storage **ss, + struct server *srv, struct proxy *be, + struct stream *s) +{ +#if defined(CONFIG_HAP_TRANSPARENT) + const struct sockaddr_storage *addr; + struct conn_src *src = NULL; + struct sockaddr_in *sin; + char *vptr; + size_t vlen; +#endif + + /* Ensure the function will not overwrite an allocated address. */ + BUG_ON(*ss); + +#if defined(CONFIG_HAP_TRANSPARENT) + if (srv && srv->conn_src.opts & CO_SRC_BIND) + src = &srv->conn_src; + else if (be->conn_src.opts & CO_SRC_BIND) + src = &be->conn_src; + + /* no transparent mode, no need to allocate an address, returns OK */ + if (!src) + return SRV_STATUS_OK; + + switch (src->opts & CO_SRC_TPROXY_MASK) { + case CO_SRC_TPROXY_ADDR: + if (!sockaddr_alloc(ss, NULL, 0)) + return SRV_STATUS_INTERNAL; + + **ss = src->tproxy_addr; + break; + + case CO_SRC_TPROXY_CLI: + case CO_SRC_TPROXY_CIP: + BUG_ON(!s); /* Dynamic source setting requires a stream instance. */ + + /* FIXME: what can we do if the client connects in IPv6 or unix socket ? */ + addr = sc_src(s->scf); + if (!addr) + return SRV_STATUS_INTERNAL; + + if (!sockaddr_alloc(ss, NULL, 0)) + return SRV_STATUS_INTERNAL; + + **ss = *addr; + break; + + case CO_SRC_TPROXY_DYN: + BUG_ON(!s); /* Dynamic source setting requires a stream instance. */ + + if (!src->bind_hdr_occ || !IS_HTX_STRM(s)) + return SRV_STATUS_INTERNAL; + + if (!sockaddr_alloc(ss, NULL, 0)) + return SRV_STATUS_INTERNAL; + + /* bind to the IP in a header */ + sin = (struct sockaddr_in *)*ss; + sin->sin_family = AF_INET; + sin->sin_port = 0; + sin->sin_addr.s_addr = 0; + if (!http_get_htx_hdr(htxbuf(&s->req.buf), + ist2(src->bind_hdr_name, src->bind_hdr_len), + src->bind_hdr_occ, NULL, &vptr, &vlen)) { + sockaddr_free(ss); + return SRV_STATUS_INTERNAL; + } + + sin->sin_addr.s_addr = htonl(inetaddr_host_lim(vptr, vptr + vlen)); + break; + + default: + ; + } +#endif + + return SRV_STATUS_OK; +} + +/* Attempt to get a backend connection from the specified mt_list array + * (safe or idle connections). The <is_safe> argument means what type of + * connection the caller wants. + */ +struct connection *conn_backend_get(struct stream *s, struct server *srv, int is_safe, int64_t hash) +{ + struct connection *conn = NULL; + int i; // thread number + int found = 0; + int stop; + + /* We need to lock even if this is our own list, because another + * thread may be trying to migrate that connection, and we don't want + * to end up with two threads using the same connection. + */ + i = tid; + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + conn = srv_lookup_conn(is_safe ? &srv->per_thr[tid].safe_conns : &srv->per_thr[tid].idle_conns, hash); + if (conn) + conn_delete_from_tree(conn); + + /* If we failed to pick a connection from the idle list, let's try again with + * the safe list. + */ + if (!conn && !is_safe && srv->curr_safe_nb > 0) { + conn = srv_lookup_conn(&srv->per_thr[tid].safe_conns, hash); + if (conn) { + conn_delete_from_tree(conn); + is_safe = 1; + } + } + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + + /* If we found a connection in our own list, and we don't have to + * steal one from another thread, then we're done. + */ + if (conn) + goto done; + + /* pool sharing globally disabled ? 
 */
+	if (!(global.tune.options & GTUNE_IDLE_POOL_SHARED))
+		goto done;
+
+	/* Are we allowed to pick from another thread ? We'll still try
+	 * it if we're running low on FDs as we don't want to create
+	 * extra conns in this case, otherwise we can give up if we have
+	 * too few idle conns and the server protocol supports establishing
+	 * connections (i.e. not a reverse-http server for example).
+	 */
+	if (srv->curr_idle_conns < srv->low_idle_conns &&
+	    ha_used_fds < global.tune.pool_low_count) {
+		const struct protocol *srv_proto = protocol_lookup(srv->addr.ss_family, PROTO_TYPE_STREAM, 0);
+
+		if (srv_proto && srv_proto->connect)
+			goto done;
+	}
+
+	/* Look up all other threads for an idle connection, starting from the
+	 * last unvisited thread, but always staying in the same group.
+	 */
+	stop = srv->per_tgrp[tgid - 1].next_takeover;
+	if (stop >= tg->count)
+		stop %= tg->count;
+
+	stop += tg->base;
+	i = stop;
+	do {
+		if (!srv->curr_idle_thr[i] || i == tid)
+			continue;
+
+		if (HA_SPIN_TRYLOCK(IDLE_CONNS_LOCK, &idle_conns[i].idle_conns_lock) != 0)
+			continue;
+		conn = srv_lookup_conn(is_safe ? &srv->per_thr[i].safe_conns : &srv->per_thr[i].idle_conns, hash);
+		while (conn) {
+			if (conn->mux->takeover && conn->mux->takeover(conn, i) == 0) {
+				conn_delete_from_tree(conn);
+				_HA_ATOMIC_INC(&activity[tid].fd_takeover);
+				found = 1;
+				break;
+			}
+
+			conn = srv_lookup_conn_next(conn);
+		}
+
+		if (!found && !is_safe && srv->curr_safe_nb > 0) {
+			conn = srv_lookup_conn(&srv->per_thr[i].safe_conns, hash);
+			while (conn) {
+				if (conn->mux->takeover && conn->mux->takeover(conn, i) == 0) {
+					conn_delete_from_tree(conn);
+					_HA_ATOMIC_INC(&activity[tid].fd_takeover);
+					found = 1;
+					is_safe = 1;
+					break;
+				}
+
+				conn = srv_lookup_conn_next(conn);
+			}
+		}
+		HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[i].idle_conns_lock);
+	} while (!found && (i = (i + 1 == tg->base + tg->count) ? tg->base : i + 1) != stop);
+
+	if (!found)
+		conn = NULL;
+ done:
+	if (conn) {
+		_HA_ATOMIC_STORE(&srv->per_tgrp[tgid - 1].next_takeover, (i + 1 == tg->base + tg->count) ? tg->base : i + 1);
+
+		srv_use_conn(srv, conn);
+
+		_HA_ATOMIC_DEC(&srv->curr_idle_conns);
+		_HA_ATOMIC_DEC(conn->flags & CO_FL_SAFE_LIST ? &srv->curr_safe_nb : &srv->curr_idle_nb);
+		_HA_ATOMIC_DEC(&srv->curr_idle_thr[i]);
+		conn->flags &= ~CO_FL_LIST_MASK;
+		__ha_barrier_atomic_store();
+
+		if ((s->be->options & PR_O_REUSE_MASK) == PR_O_REUSE_SAFE &&
+		    conn->mux->flags & MX_FL_HOL_RISK) {
+			/* attach the connection to the session private list */
+			conn->owner = s->sess;
+			session_add_conn(s->sess, conn, conn->target);
+		}
+		else {
+			srv_add_to_avail_list(srv, conn);
+		}
+	}
+	return conn;
+}
+
+static int do_connect_server(struct stream *s, struct connection *conn)
+{
+	int ret = SF_ERR_NONE;
+	int conn_flags = 0;
+
+	if (unlikely(!conn || !conn->ctrl || !conn->ctrl->connect))
+		return SF_ERR_INTERNAL;
+
+	if (co_data(&s->res))
+		conn_flags |= CONNECT_HAS_DATA;
+	if (s->conn_retries == s->be->conn_retries)
+		conn_flags |= CONNECT_CAN_USE_TFO;
+	if (!conn_ctrl_ready(conn) || !conn_xprt_ready(conn)) {
+		ret = conn->ctrl->connect(conn, conn_flags);
+		if (ret != SF_ERR_NONE)
+			return ret;
+
+		/* we're in the process of establishing a connection */
+		s->scb->state = SC_ST_CON;
+	}
+	else {
+		/* try to reuse the existing connection, it will be
+		 * confirmed once we can send on it.
+		 */
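The takeover scan in conn_backend_get() above walks every thread of the current group exactly once, starting from the last unvisited one and wrapping inside the group's index range. The standalone sketch below models just that iteration; the group layout values are invented for the example.

#include <stdio.h>

/* Standalone model of the takeover scan above: start at the group-relative
 * next_takeover index, wrap inside [base, base + count), and visit each
 * thread of the group exactly once. */
int main(void)
{
	int base = 4, count = 4;  /* hypothetical thread group layout */
	int next_takeover = 6;    /* group-relative, may exceed count */
	int stop, i;

	stop = next_takeover;
	if (stop >= count)
		stop %= count;
	stop += base;

	i = stop;
	do {
		printf("probe thread %d\n", i); /* try-lock + lookup would go here */
	} while ((i = (i + 1 == base + count) ? base : i + 1) != stop);
	return 0;
}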
+		/* Is the connection really ready ? */
+		if (conn->mux->ctl(conn, MUX_CTL_STATUS, NULL) & MUX_STATUS_READY)
+			s->scb->state = SC_ST_RDY;
+		else
+			s->scb->state = SC_ST_CON;
+	}
+
+	/* needs src ip/port for logging */
+	if (s->flags & SF_SRC_ADDR)
+		conn_get_src(conn);
+
+	return ret;
+}
+
+/*
+ * This function initiates a connection to the server assigned to this stream
+ * (s->target, (s->scb)->addr.to). It will assign a server if none
+ * is assigned yet.
+ * It can return one of :
+ *  - SF_ERR_NONE if everything's OK
+ *  - SF_ERR_SRVTO if there are no more servers
+ *  - SF_ERR_SRVCL if the connection was refused by the server
+ *  - SF_ERR_PRXCOND if the connection has been limited by the proxy (maxconn)
+ *  - SF_ERR_RESOURCE if a system resource is lacking (eg: fd limits, ports, ...)
+ *  - SF_ERR_INTERNAL for any other purely internal errors
+ * Additionally, in the case of SF_ERR_RESOURCE, an emergency log will be emitted.
+ * The server-facing stream connector is expected to hold a pre-allocated connection.
+ */
+int connect_server(struct stream *s)
+{
+	struct connection *cli_conn = objt_conn(strm_orig(s));
+	struct connection *srv_conn = NULL;
+	struct server *srv;
+	int reuse_mode = s->be->options & PR_O_REUSE_MASK;
+	int reuse = 0;
+	int init_mux = 0;
+	int err;
+#ifdef USE_OPENSSL
+	struct sample *sni_smp = NULL;
+#endif
+	struct sockaddr_storage *bind_addr = NULL;
+	int proxy_line_ret;
+	int64_t hash = 0;
+	struct conn_hash_params hash_params;
+
+	/* in standard configuration, srv will be valid; it can be NULL for
+	 * dispatch mode or a transparent backend */
+	srv = objt_server(s->target);
+
+	/* Override reuse-mode if reverse-connect is used. */
+	if (srv && srv->flags & SRV_F_RHTTP)
+		reuse_mode = PR_O_REUSE_ALWS;
+
+	err = alloc_dst_address(&s->scb->dst, srv, s);
+	if (err != SRV_STATUS_OK)
+		return SF_ERR_INTERNAL;
+
+	err = alloc_bind_address(&bind_addr, srv, s->be, s);
+	if (err != SRV_STATUS_OK)
+		return SF_ERR_INTERNAL;
+
+#ifdef USE_OPENSSL
+	if (srv && srv->ssl_ctx.sni) {
+		sni_smp = sample_fetch_as_type(s->be, s->sess, s,
+		                               SMP_OPT_DIR_REQ | SMP_OPT_FINAL,
+		                               srv->ssl_ctx.sni, SMP_T_STR);
+	}
+#endif
+
+	/* do not reuse if mode is not http */
+	if (!IS_HTX_STRM(s)) {
+		DBG_TRACE_STATE("skip idle connections reuse: no htx", STRM_EV_STRM_PROC|STRM_EV_CS_ST, s);
+		goto skip_reuse;
+	}
+
+	/* disable reuse if websocket stream and the protocol to use is not the
+	 * same as the main protocol of the server.
+	 */
+	if (unlikely(s->flags & SF_WEBSOCKET) && srv) {
+		if (!srv_check_reuse_ws(srv)) {
+			DBG_TRACE_STATE("skip idle connections reuse: websocket stream", STRM_EV_STRM_PROC|STRM_EV_CS_ST, s);
+			goto skip_reuse;
+		}
+	}
+
+	/* first, set unique connection parameters and then calculate hash */
+	memset(&hash_params, 0, sizeof(hash_params));
+
+	/* 1. target */
+	hash_params.target = s->target;
+
+#ifdef USE_OPENSSL
+	/* 2. sni
+	 * only test if the sample is not null as smp_make_safe (called before
+	 * ssl_sock_set_servername) can only fail if this is not the case
+	 */
+	if (sni_smp) {
+		hash_params.sni_prehash =
+			conn_hash_prehash(sni_smp->data.u.str.area,
+			                  sni_smp->data.u.str.data);
+	}
+#endif /* USE_OPENSSL */
+
+	/* 3. destination address */
+	if (srv && srv_is_transparent(srv))
+		hash_params.dst_addr = s->scb->dst;
+
+	/* 4. source address */
+	hash_params.src_addr = bind_addr;
+
+	/* 5. proxy protocol */
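All reuse lookups below key on one hash derived from the parameters set above (target, SNI, destination, source, PROXY line), so a connection is only reused when every distinguishing parameter matches. Here is a standalone model with a local FNV-1a in place of HAProxy's hash function; the field values are invented.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy stand-in for conn_calculate_hash(): FNV-1a folded over whichever
 * parameters are present. Illustrative only, not the real algorithm. */
static uint64_t mix(uint64_t h, const void *p, size_t len)
{
	const unsigned char *b = p;

	while (len--)
		h = (h ^ *b++) * 0x100000001b3ULL;
	return h;
}

int main(void)
{
	uint64_t h = 0xcbf29ce484222325ULL;
	const char *target = "srv1", *sni = "example.org";

	h = mix(h, target, strlen(target));
	if (sni)                 /* only mixed in when an SNI sample exists */
		h = mix(h, sni, strlen(sni));
	printf("reuse hash: %016llx\n", (unsigned long long)h);
	return 0;
}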
+	if (srv && srv->pp_opts) {
+		proxy_line_ret = make_proxy_line(trash.area, trash.size, srv, cli_conn, s);
+		if (proxy_line_ret) {
+			hash_params.proxy_prehash =
+				conn_hash_prehash(trash.area, proxy_line_ret);
+		}
+	}
+
+	hash = conn_calculate_hash(&hash_params);
+
+	/* first, search for a matching connection in the session's idle conns */
+	srv_conn = session_get_conn(s->sess, s->target, hash);
+	if (srv_conn) {
+		DBG_TRACE_STATE("reuse connection from session", STRM_EV_STRM_PROC|STRM_EV_CS_ST, s);
+		reuse = 1;
+	}
+
+	if (srv && !reuse && reuse_mode != PR_O_REUSE_NEVR) {
+		/* Below we pick connections from the safe, idle or
+		 * available (which are safe too) lists based
+		 * on the strategy, the fact that this is a first or second
+		 * (retryable) request, with the indicated priority (1 or 2) :
+		 *
+		 *          SAFE                AGGR                ALWS
+		 *
+		 *      +-----+-----+      +-----+-----+      +-----+-----+
+		 *   req| 1st | 2nd |   req| 1st | 2nd |   req| 1st | 2nd |
+		 *  ----+-----+-----+  ----+-----+-----+  ----+-----+-----+
+		 *  safe|  -  |  2  |  safe|  1  |  2  |  safe|  1  |  2  |
+		 *  ----+-----+-----+  ----+-----+-----+  ----+-----+-----+
+		 *  idle|  -  |  1  |  idle|  -  |  1  |  idle|  2  |  1  |
+		 *  ----+-----+-----+  ----+-----+-----+  ----+-----+-----+
+		 *
+		 * Idle conns are necessarily looked up on the same thread so
+		 * that there are no concurrency issues.
+		 */
+		if (!eb_is_empty(&srv->per_thr[tid].avail_conns)) {
+			srv_conn = srv_lookup_conn(&srv->per_thr[tid].avail_conns, hash);
+			if (srv_conn) {
+				/* connection cannot be in idle list if used as an avail idle conn. */
+				BUG_ON(LIST_INLIST(&srv_conn->idle_list));
+
+				DBG_TRACE_STATE("reuse connection from avail", STRM_EV_STRM_PROC|STRM_EV_CS_ST, s);
+				reuse = 1;
+			}
+		}
+
+		/* if no available connections found, search for an idle/safe */
+		if (!srv_conn && srv->max_idle_conns && srv->curr_idle_conns > 0) {
+			const int not_first_req = s->txn && s->txn->flags & TX_NOT_FIRST;
+			const int idle = srv->curr_idle_nb > 0;
+			const int safe = srv->curr_safe_nb > 0;
+			const int retry_safe = (s->be->retry_type & (PR_RE_CONN_FAILED | PR_RE_DISCONNECTED | PR_RE_TIMEOUT)) ==
+			                       (PR_RE_CONN_FAILED | PR_RE_DISCONNECTED | PR_RE_TIMEOUT);
+
+			/* second column of the tables above,
+			 * search for an idle then safe conn */
+			if (not_first_req || retry_safe) {
+				if (idle || safe)
+					srv_conn = conn_backend_get(s, srv, 0, hash);
+			}
+			/* first column of the tables above */
+			else if (reuse_mode >= PR_O_REUSE_AGGR) {
+				/* search for a safe conn */
+				if (safe)
+					srv_conn = conn_backend_get(s, srv, 1, hash);
+
+				/* search for an idle conn if no safe conn found
+				 * on always reuse mode */
+				if (!srv_conn &&
+				    reuse_mode == PR_O_REUSE_ALWS && idle) {
+					/* TODO conn_backend_get should not check the
+					 * safe list in this case */
+					srv_conn = conn_backend_get(s, srv, 0, hash);
+				}
+			}
+
+			if (srv_conn) {
+				DBG_TRACE_STATE("reuse connection from idle/safe", STRM_EV_STRM_PROC|STRM_EV_CS_ST, s);
+				reuse = 1;
+			}
+		}
+	}
+
+
+	/* here reuse might have been set above, indicating srv_conn finally
+	 * is OK.
+	 */
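The decision tables above condense into a small function. The standalone model below restates the pick order for each reuse mode; the enum and strings are stand-ins for illustration only.

#include <stdio.h>

/* Model of the tables above: given the http-reuse mode and whether this is
 * a first or a retryable (second) request, report the lookup order among
 * the safe and idle lists. */
enum mode { SAFE, AGGR, ALWS };

static const char *pick_order(enum mode m, int second_request)
{
	if (second_request)
		return "idle first, then safe";
	switch (m) {
	case SAFE: return "none (fresh connection)";
	case AGGR: return "safe only";
	case ALWS: return "safe first, then idle";
	}
	return "?";
}

int main(void)
{
	printf("aggr/1st request: %s\n", pick_order(AGGR, 0));
	printf("safe/2nd request: %s\n", pick_order(SAFE, 1));
	return 0;
}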
+
+	if (ha_used_fds > global.tune.pool_high_count && srv) {
+		struct connection *tokill_conn = NULL;
+		/* We can't reuse a connection, and we have more FDs than deemed
+		 * acceptable, attempt to kill an idling connection
+		 */
+		/* First, try from our own idle list */
+		HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock);
+		if (!LIST_ISEMPTY(&srv->per_thr[tid].idle_conn_list)) {
+			tokill_conn = LIST_ELEM(srv->per_thr[tid].idle_conn_list.n, struct connection *, idle_list);
+			conn_delete_from_tree(tokill_conn);
+			HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock);
+
+			/* Release the idle lock before calling mux->destroy.
+			 * It will in turn call srv_release_conn through
+			 * conn_free which also uses it.
+			 */
+			tokill_conn->mux->destroy(tokill_conn->ctx);
+		}
+		else {
+			HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock);
+		}
+
+		/* If not, iterate over other threads' idle pools, and try to grab one */
+		if (!tokill_conn) {
+			int i;
+
+			for (i = tid; (i = ((i + 1 == global.nbthread) ? 0 : i + 1)) != tid;) {
+				// just silence stupid gcc which reports an absurd
+				// out-of-bounds warning for <i> which is always
+				// exactly zero without threads, but it seems to
+				// see it possibly larger.
+				ALREADY_CHECKED(i);
+
+				if (HA_SPIN_TRYLOCK(IDLE_CONNS_LOCK, &idle_conns[i].idle_conns_lock) != 0)
+					continue;
+
+				if (!LIST_ISEMPTY(&srv->per_thr[i].idle_conn_list)) {
+					tokill_conn = LIST_ELEM(srv->per_thr[i].idle_conn_list.n, struct connection *, idle_list);
+					conn_delete_from_tree(tokill_conn);
+				}
+				HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[i].idle_conns_lock);
+
+				if (tokill_conn) {
+					/* We got one, put it into the concerned thread's to-kill list, and wake its kill task */
+
+					MT_LIST_APPEND(&idle_conns[i].toremove_conns,
+					               &tokill_conn->toremove_list);
+					task_wakeup(idle_conns[i].cleanup_task, TASK_WOKEN_OTHER);
+					break;
+				}
+			}
+		}
+
+	}
+
+	if (reuse) {
+		if (srv_conn->mux) {
+			int avail = srv_conn->mux->avail_streams(srv_conn);
+
+			if (avail <= 1) {
+				/* No more streams available, remove it from the list */
+				HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock);
+				conn_delete_from_tree(srv_conn);
+				HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock);
+			}
+
+			if (avail >= 1) {
+				if (srv_conn->mux->attach(srv_conn, s->scb->sedesc, s->sess) == -1) {
+					srv_conn = NULL;
+					if (sc_reset_endp(s->scb) < 0)
+						return SF_ERR_INTERNAL;
+					sc_ep_clr(s->scb, ~SE_FL_DETACHED);
+				}
+			}
+			else
+				srv_conn = NULL;
+		}
+		/* otherwise srv_conn is left intact */
+	}
+	else
+		srv_conn = NULL;
+
+skip_reuse:
+	/* no reuse or failed to reuse the connection above, pick a new one */
+	if (!srv_conn) {
+		if (srv && (srv->flags & SRV_F_RHTTP)) {
+			DBG_TRACE_USER("cannot open a new connection for reverse server", STRM_EV_STRM_PROC|STRM_EV_CS_ST, s);
+			s->conn_err_type = STRM_ET_CONN_ERR;
+			return SF_ERR_INTERNAL;
+		}
+
+		srv_conn = conn_new(s->target);
+		if (srv_conn) {
+			DBG_TRACE_STATE("alloc new be connection", STRM_EV_STRM_PROC|STRM_EV_CS_ST, s);
+			srv_conn->owner = s->sess;
+
+			/* connection will be attached to the session if
+			 * http-reuse mode is never or it is not targeted to a
+			 * server */
+			if (reuse_mode == PR_O_REUSE_NEVR || !srv)
+				conn_set_private(srv_conn);
+
+			/* assign bind_addr to srv_conn */
+			srv_conn->src = bind_addr;
+			bind_addr = NULL;
+
+			srv_conn->hash_node->node.key = hash;
+		}
+	}
+
+	/* if bind_addr is non NULL free it */
+	sockaddr_free(&bind_addr);
+
+	/* srv_conn is still NULL only on allocation failure */
+	if (!srv_conn)
return SF_ERR_RESOURCE; + + /* copy the target address into the connection */ + *srv_conn->dst = *s->scb->dst; + + /* Copy network namespace from client connection */ + srv_conn->proxy_netns = cli_conn ? cli_conn->proxy_netns : NULL; + + if (!srv_conn->xprt) { + /* set the correct protocol on the output stream connector */ + if (srv) { + if (conn_prepare(srv_conn, protocol_lookup(srv_conn->dst->ss_family, PROTO_TYPE_STREAM, 0), srv->xprt)) { + conn_free(srv_conn); + return SF_ERR_INTERNAL; + } + } else if (obj_type(s->target) == OBJ_TYPE_PROXY) { + int ret; + + /* proxies exclusively run on raw_sock right now */ + ret = conn_prepare(srv_conn, protocol_lookup(srv_conn->dst->ss_family, PROTO_TYPE_STREAM, 0), xprt_get(XPRT_RAW)); + if (ret < 0 || !(srv_conn->ctrl)) { + conn_free(srv_conn); + return SF_ERR_INTERNAL; + } + } + else { + conn_free(srv_conn); + return SF_ERR_INTERNAL; /* how did we get there ? */ + } + + if (sc_attach_mux(s->scb, NULL, srv_conn) < 0) { + conn_free(srv_conn); + return SF_ERR_INTERNAL; /* how did we get there ? */ + } + srv_conn->ctx = s->scb; + +#if defined(USE_OPENSSL) && defined(TLSEXT_TYPE_application_layer_protocol_negotiation) + if (!srv || + (srv->use_ssl != 1 || (!(srv->ssl_ctx.alpn_str) && !(srv->ssl_ctx.npn_str)) || + srv->mux_proto || !IS_HTX_STRM(s))) +#endif + init_mux = 1; + + /* process the case where the server requires the PROXY protocol to be sent */ + srv_conn->send_proxy_ofs = 0; + + if (srv && srv->pp_opts) { + srv_conn->flags |= CO_FL_SEND_PROXY; + srv_conn->send_proxy_ofs = 1; /* must compute size */ + } + + if (srv && (srv->flags & SRV_F_SOCKS4_PROXY)) { + srv_conn->send_proxy_ofs = 1; + srv_conn->flags |= CO_FL_SOCKS4; + } + +#if defined(USE_OPENSSL) && defined(TLSEXT_TYPE_application_layer_protocol_negotiation) + /* if websocket stream, try to update connection ALPN. */ + if (unlikely(s->flags & SF_WEBSOCKET) && + srv && srv->use_ssl && srv->ssl_ctx.alpn_str) { + char *alpn = ""; + int force = 0; + + switch (srv->ws) { + case SRV_WS_AUTO: + alpn = "\x08http/1.1"; + force = 0; + break; + case SRV_WS_H1: + alpn = "\x08http/1.1"; + force = 1; + break; + case SRV_WS_H2: + alpn = "\x02h2"; + force = 1; + break; + } + + if (!conn_update_alpn(srv_conn, ist(alpn), force)) + DBG_TRACE_STATE("update alpn for websocket", STRM_EV_STRM_PROC|STRM_EV_CS_ST, s); + } +#endif + } + else { + s->flags |= SF_SRV_REUSED; + + /* Currently there seems to be no known cases of xprt ready + * without the mux installed here. + */ + BUG_ON(!srv_conn->mux); + + if (!(srv_conn->mux->ctl(srv_conn, MUX_CTL_STATUS, NULL) & MUX_STATUS_READY)) + s->flags |= SF_SRV_REUSED_ANTICIPATED; + } + + /* flag for logging source ip/port */ + if (strm_fe(s)->options2 & PR_O2_SRC_ADDR) + s->flags |= SF_SRC_ADDR; + + /* disable lingering */ + if (s->be->options & PR_O_TCP_NOLING) + s->scb->flags |= SC_FL_NOLINGER; + + if (s->flags & SF_SRV_REUSED) { + _HA_ATOMIC_INC(&s->be->be_counters.reuse); + if (srv) + _HA_ATOMIC_INC(&srv->counters.reuse); + } else { + _HA_ATOMIC_INC(&s->be->be_counters.connect); + if (srv) + _HA_ATOMIC_INC(&srv->counters.connect); + } + + err = do_connect_server(s, srv_conn); + if (err != SF_ERR_NONE) + return err; + +#ifdef USE_OPENSSL + if (!(s->flags & SF_SRV_REUSED)) { + if (smp_make_safe(sni_smp)) + ssl_sock_set_servername(srv_conn, sni_smp->data.u.str.area); + } +#endif /* USE_OPENSSL */ + + /* The CO_FL_SEND_PROXY flag may have been set by the connect method, + * if so, add our handshake pseudo-XPRT now. 
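The ALPN strings chosen above ("\x08http/1.1", "\x02h2") are already in TLS wire format: each protocol name is prefixed by a single length byte. A small standalone decoder makes the encoding visible; it is illustrative only and shares nothing with HAProxy's internals.

#include <stdio.h>

/* Decode a wire-format ALPN protocol list: <len><name><len><name>... */
static void print_alpn(const unsigned char *list, size_t len)
{
	size_t i = 0;

	while (i < len) {
		unsigned int l = list[i++];

		if (i + l > len)
			break; /* truncated entry */
		printf("proto: %.*s\n", (int)l, (const char *)list + i);
		i += l;
	}
}

int main(void)
{
	const unsigned char ws_h1[] = "\x08http/1.1";

	print_alpn(ws_h1, sizeof(ws_h1) - 1); /* prints "proto: http/1.1" */
	return 0;
}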
+	 */
+	if ((srv_conn->flags & CO_FL_HANDSHAKE)) {
+		if (xprt_add_hs(srv_conn) < 0) {
+			conn_full_close(srv_conn);
+			return SF_ERR_INTERNAL;
+		}
+	}
+	conn_xprt_start(srv_conn);
+
+	/* We have to defer the mux initialization until after si_connect()
+	 * has been called, as we need the xprt to have been properly
+	 * initialized, or any attempt to recv during the mux init may
+	 * fail, and flag the connection as CO_FL_ERROR.
+	 */
+	if (init_mux) {
+		const struct mux_ops *alt_mux =
+			likely(!(s->flags & SF_WEBSOCKET)) ? NULL : srv_get_ws_proto(srv);
+		if (conn_install_mux_be(srv_conn, s->scb, s->sess, alt_mux) < 0) {
+			conn_full_close(srv_conn);
+			return SF_ERR_INTERNAL;
+		}
+		if (IS_HTX_STRM(s)) {
+			/* If we're doing http-reuse always, and the connection
+			 * is not private with available streams (an http2
+			 * connection), add it to the available list, so that
+			 * others can use it right away. If the connection is
+			 * private or we're doing http-reuse safe and the mux
+			 * protocol supports multiplexing, add it in the
+			 * session server list.
+			 */
+			if (srv && reuse_mode == PR_O_REUSE_ALWS &&
+			    !(srv_conn->flags & CO_FL_PRIVATE) &&
+			    srv_conn->mux->avail_streams(srv_conn) > 0) {
+				srv_add_to_avail_list(srv, srv_conn);
+			}
+			else if (srv_conn->flags & CO_FL_PRIVATE ||
+			         (reuse_mode == PR_O_REUSE_SAFE &&
+			          srv_conn->mux->flags & MX_FL_HOL_RISK)) {
+				/* If it fails now, the same will be done in mux->detach() callback */
+				session_add_conn(s->sess, srv_conn, srv_conn->target);
+			}
+		}
+	}
+
+#if defined(USE_OPENSSL) && (defined(OPENSSL_IS_BORINGSSL) || (HA_OPENSSL_VERSION_NUMBER >= 0x10101000L))
+
+	if (!reuse && cli_conn && srv && srv_conn->mux &&
+	    (srv->ssl_ctx.options & SRV_SSL_O_EARLY_DATA) &&
+	    /* Only attempt to use early data if either the client sent
+	     * early data, so that we know it can handle a 425, or if
+	     * we are allowed to retry requests on early data failure, and
+	     * it's our first try
+	     */
+	    ((cli_conn->flags & CO_FL_EARLY_DATA) ||
+	     ((s->be->retry_type & PR_RE_EARLY_ERROR) && !s->conn_retries)) &&
+	    co_data(sc_oc(s->scb)) &&
+	    srv_conn->flags & CO_FL_SSL_WAIT_HS)
+		srv_conn->flags &= ~(CO_FL_SSL_WAIT_HS | CO_FL_WAIT_L6_CONN);
+#endif
+
+	/* set connect timeout */
+	s->conn_exp = tick_add_ifset(now_ms, s->be->timeout.connect);
+
+	if (srv) {
+		int count;
+
+		s->flags |= SF_CURR_SESS;
+		count = _HA_ATOMIC_ADD_FETCH(&srv->cur_sess, 1);
+		HA_ATOMIC_UPDATE_MAX(&srv->counters.cur_sess_max, count);
+		if (s->be->lbprm.server_take_conn)
+			s->be->lbprm.server_take_conn(srv);
+	}
+
+	/* Now handle synchronously connected sockets. We know the stream connector
+	 * is at least in state SC_ST_CON. These ones typically are UNIX
+	 * sockets, socket pairs, and occasionally TCP connections on the
+	 * loopback on a heavily loaded system.
+	 */
+	if (srv_conn->flags & CO_FL_ERROR)
+		s->scb->flags |= SC_FL_ERROR;
+
+	/* If we had early data, and the handshake ended, then
+	 * we can remove the flag, and attempt to wake the task up,
+	 * in the event there's an analyser waiting for the end of
+	 * the handshake.
+	 */
+	if (!(srv_conn->flags & (CO_FL_WAIT_XPRT | CO_FL_EARLY_SSL_HS)))
+		sc_ep_clr(s->scb, SE_FL_WAIT_FOR_HS);
+
+	if (!sc_state_in(s->scb->state, SC_SB_EST|SC_SB_DIS|SC_SB_CLO) &&
+	    (srv_conn->flags & CO_FL_WAIT_XPRT) == 0) {
+		s->conn_exp = TICK_ETERNITY;
+		sc_oc(s->scb)->flags |= CF_WRITE_EVENT;
+		if (s->scb->state == SC_ST_CON)
+			s->scb->state = SC_ST_RDY;
+	}
+
+	/* Report EOI on the channel if it was reached from the mux point of
+	 * view.
+ * + * Note: This test is only required because si_cs_process is also the SI + * wake callback. Otherwise si_cs_recv()/si_cs_send() already take + * care of it. + */ + if (sc_ep_test(s->scb, SE_FL_EOI) && !(s->scb->flags & SC_FL_EOI)) { + s->scb->flags |= SC_FL_EOI; + sc_ic(s->scb)->flags |= CF_READ_EVENT; + } + + /* catch all sync connect while the mux is not already installed */ + if (!srv_conn->mux && !(srv_conn->flags & CO_FL_WAIT_XPRT)) { + if (conn_create_mux(srv_conn) < 0) { + conn_full_close(srv_conn); + return SF_ERR_INTERNAL; + } + } + + return SF_ERR_NONE; /* connection is OK */ +} + + +/* This function performs the "redispatch" part of a connection attempt. It + * will assign a server if required, queue the connection if required, and + * handle errors that might arise at this level. It can change the server + * state. It will return 1 if it encounters an error, switches the server + * state, or has to queue a connection. Otherwise, it will return 0 indicating + * that the connection is ready to use. + */ + +int srv_redispatch_connect(struct stream *s) +{ + struct server *srv; + int conn_err; + + /* We know that we don't have any connection pending, so we will + * try to get a new one, and wait in this state if it's queued + */ + redispatch: + conn_err = assign_server_and_queue(s); + srv = objt_server(s->target); + + switch (conn_err) { + case SRV_STATUS_OK: + break; + + case SRV_STATUS_FULL: + /* The server has reached its maxqueue limit. Either PR_O_REDISP is set + * and we can redispatch to another server, or it is not and we return + * 503. This only makes sense in DIRECT mode however, because normal LB + * algorithms would never select such a server, and hash algorithms + * would bring us on the same server again. Note that s->target is set + * in this case. + */ + if (((s->flags & (SF_DIRECT|SF_FORCE_PRST)) == SF_DIRECT) && + (s->be->options & PR_O_REDISP)) { + s->flags &= ~(SF_DIRECT | SF_ASSIGNED); + sockaddr_free(&s->scb->dst); + goto redispatch; + } + + if (!s->conn_err_type) { + s->conn_err_type = STRM_ET_QUEUE_ERR; + } + + _HA_ATOMIC_INC(&srv->counters.failed_conns); + _HA_ATOMIC_INC(&s->be->be_counters.failed_conns); + return 1; + + case SRV_STATUS_NOSRV: + /* note: it is guaranteed that srv == NULL here */ + if (!s->conn_err_type) { + s->conn_err_type = STRM_ET_CONN_ERR; + } + + _HA_ATOMIC_INC(&s->be->be_counters.failed_conns); + return 1; + + case SRV_STATUS_QUEUED: + s->conn_exp = tick_add_ifset(now_ms, s->be->timeout.queue); + s->scb->state = SC_ST_QUE; + /* do nothing else and do not wake any other stream up */ + return 1; + + case SRV_STATUS_INTERNAL: + default: + if (!s->conn_err_type) { + s->conn_err_type = STRM_ET_CONN_OTHER; + } + + if (srv) + srv_inc_sess_ctr(srv); + if (srv) + srv_set_sess_last(srv); + if (srv) + _HA_ATOMIC_INC(&srv->counters.failed_conns); + _HA_ATOMIC_INC(&s->be->be_counters.failed_conns); + + /* release other streams waiting for this server */ + if (may_dequeue_tasks(srv, s->be)) + process_srv_queue(srv); + return 1; + } + /* if we get here, it's because we got SRV_STATUS_OK, which also + * means that the connection has not been queued. + */ + return 0; +} + +/* Check if the connection request is in such a state that it can be aborted. 
*/ +static int back_may_abort_req(struct channel *req, struct stream *s) +{ + return ((s->scf->flags & SC_FL_ERROR) || + ((s->scb->flags & (SC_FL_SHUT_WANTED|SC_FL_SHUT_DONE)) && /* empty and client aborted */ + (!co_data(req) || (s->be->options & PR_O_ABRT_CLOSE)))); +} + +/* Update back stream connector status for input states SC_ST_ASS, SC_ST_QUE, + * SC_ST_TAR. Other input states are simply ignored. + * Possible output states are SC_ST_CLO, SC_ST_TAR, SC_ST_ASS, SC_ST_REQ, SC_ST_CON + * and SC_ST_EST. Flags must have previously been updated for timeouts and other + * conditions. + */ +void back_try_conn_req(struct stream *s) +{ + struct server *srv = objt_server(s->target); + struct stconn *sc = s->scb; + struct channel *req = &s->req; + + DBG_TRACE_ENTER(STRM_EV_STRM_PROC|STRM_EV_CS_ST, s); + + if (sc->state == SC_ST_ASS) { + /* Server assigned to connection request, we have to try to connect now */ + int conn_err; + + /* Before we try to initiate the connection, see if the + * request may be aborted instead. + */ + if (back_may_abort_req(req, s)) { + s->conn_err_type |= STRM_ET_CONN_ABRT; + DBG_TRACE_STATE("connection aborted", STRM_EV_STRM_PROC|STRM_EV_CS_ST|STRM_EV_STRM_ERR, s); + goto abort_connection; + } + + conn_err = connect_server(s); + srv = objt_server(s->target); + + if (conn_err == SF_ERR_NONE) { + /* state = SC_ST_CON or SC_ST_EST now */ + if (srv) + srv_inc_sess_ctr(srv); + if (srv) + srv_set_sess_last(srv); + DBG_TRACE_STATE("connection attempt", STRM_EV_STRM_PROC|STRM_EV_CS_ST, s); + goto end; + } + + /* We have received a synchronous error. We might have to + * abort, retry immediately or redispatch. + */ + if (conn_err == SF_ERR_INTERNAL) { + if (!s->conn_err_type) { + s->conn_err_type = STRM_ET_CONN_OTHER; + } + + if (srv) + srv_inc_sess_ctr(srv); + if (srv) + srv_set_sess_last(srv); + if (srv) + _HA_ATOMIC_INC(&srv->counters.failed_conns); + _HA_ATOMIC_INC(&s->be->be_counters.failed_conns); + + /* release other streams waiting for this server */ + sess_change_server(s, NULL); + if (may_dequeue_tasks(srv, s->be)) + process_srv_queue(srv); + + /* Failed and not retryable. */ + sc_abort(sc); + sc_shutdown(sc); + sc->flags |= SC_FL_ERROR; + + s->logs.t_queue = ns_to_ms(now_ns - s->logs.accept_ts); + + /* we may need to know the position in the queue for logging */ + pendconn_cond_unlink(s->pend_pos); + + /* no stream was ever accounted for this server */ + sc->state = SC_ST_CLO; + if (s->srv_error) + s->srv_error(s, sc); + DBG_TRACE_STATE("internal error during connection", STRM_EV_STRM_PROC|STRM_EV_CS_ST|STRM_EV_STRM_ERR, s); + goto end; + } + + /* We are facing a retryable error, but we don't want to run a + * turn-around now, as the problem is likely a source port + * allocation problem, so we want to retry now. + */ + sc->state = SC_ST_CER; + sc->flags &= ~SC_FL_ERROR; + back_handle_st_cer(s); + + DBG_TRACE_STATE("connection error, retry", STRM_EV_STRM_PROC|STRM_EV_CS_ST|STRM_EV_STRM_ERR, s); + /* now sc->state is one of SC_ST_CLO, SC_ST_TAR, SC_ST_ASS, SC_ST_REQ */ + } + else if (sc->state == SC_ST_QUE) { + /* connection request was queued, check for any update */ + if (!pendconn_dequeue(s)) { + /* The connection is not in the queue anymore. Either + * we have a server connection slot available and we + * go directly to the assigned state, or we need to + * load-balance first and go to the INI state. 
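back_may_abort_req() above reduces to one boolean formula. The standalone model below restates it with plain flags (all names invented) so the abort conditions are easy to test in isolation.

#include <stdio.h>

/* Model of back_may_abort_req() above: abort if the client side already
 * errored, or if a shutdown was requested/performed while either no data
 * remains to forward or "option abortonclose" is set. */
static int may_abort(int scf_error, int shut, int pending_data, int abortonclose)
{
	return scf_error || (shut && (!pending_data || abortonclose));
}

int main(void)
{
	printf("%d\n", may_abort(0, 1, 1, 1)); /* shut + abortonclose -> 1 */
	printf("%d\n", may_abort(0, 1, 1, 0)); /* data still queued   -> 0 */
	printf("%d\n", may_abort(1, 0, 0, 0)); /* client error        -> 1 */
	return 0;
}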
+ */ + s->conn_exp = TICK_ETERNITY; + if (unlikely(!(s->flags & SF_ASSIGNED))) + sc->state = SC_ST_REQ; + else { + s->logs.t_queue = ns_to_ms(now_ns - s->logs.accept_ts); + sc->state = SC_ST_ASS; + } + DBG_TRACE_STATE("dequeue connection request", STRM_EV_STRM_PROC|STRM_EV_CS_ST, s); + goto end; + } + + /* Connection request still in queue... */ + if (s->flags & SF_CONN_EXP) { + /* ... and timeout expired */ + s->conn_exp = TICK_ETERNITY; + s->flags &= ~SF_CONN_EXP; + s->logs.t_queue = ns_to_ms(now_ns - s->logs.accept_ts); + + /* we may need to know the position in the queue for logging */ + pendconn_cond_unlink(s->pend_pos); + + if (srv) + _HA_ATOMIC_INC(&srv->counters.failed_conns); + _HA_ATOMIC_INC(&s->be->be_counters.failed_conns); + sc_abort(sc); + sc_shutdown(sc); + req->flags |= CF_WRITE_TIMEOUT; + if (!s->conn_err_type) + s->conn_err_type = STRM_ET_QUEUE_TO; + sc->state = SC_ST_CLO; + if (s->srv_error) + s->srv_error(s, sc); + DBG_TRACE_STATE("connection request still queued", STRM_EV_STRM_PROC|STRM_EV_CS_ST, s); + goto end; + } + + /* Connection remains in queue, check if we have to abort it */ + if (back_may_abort_req(req, s)) { + s->logs.t_queue = ns_to_ms(now_ns - s->logs.accept_ts); + + /* we may need to know the position in the queue for logging */ + pendconn_cond_unlink(s->pend_pos); + + s->conn_err_type |= STRM_ET_QUEUE_ABRT; + DBG_TRACE_STATE("abort queued connection request", STRM_EV_STRM_PROC|STRM_EV_CS_ST|STRM_EV_STRM_ERR, s); + goto abort_connection; + } + + /* Nothing changed */ + } + else if (sc->state == SC_ST_TAR) { + /* Connection request might be aborted */ + if (back_may_abort_req(req, s)) { + s->conn_err_type |= STRM_ET_CONN_ABRT; + DBG_TRACE_STATE("connection aborted", STRM_EV_STRM_PROC|STRM_EV_CS_ST|STRM_EV_STRM_ERR, s); + goto abort_connection; + } + + if (!(s->flags & SF_CONN_EXP)) + return; /* still in turn-around */ + + s->flags &= ~SF_CONN_EXP; + s->conn_exp = TICK_ETERNITY; + + /* we keep trying on the same server as long as the stream is + * marked "assigned". + * FIXME: Should we force a redispatch attempt when the server is down ? + */ + if (s->flags & SF_ASSIGNED) + sc->state = SC_ST_ASS; + else + sc->state = SC_ST_REQ; + + DBG_TRACE_STATE("retry connection now", STRM_EV_STRM_PROC|STRM_EV_CS_ST, s); + } + + end: + DBG_TRACE_LEAVE(STRM_EV_STRM_PROC|STRM_EV_CS_ST, s); + return; + +abort_connection: + /* give up */ + s->conn_exp = TICK_ETERNITY; + s->flags &= ~SF_CONN_EXP; + sc_abort(sc); + sc_shutdown(sc); + sc->state = SC_ST_CLO; + if (s->srv_error) + s->srv_error(s, sc); + DBG_TRACE_DEVEL("leaving on error", STRM_EV_STRM_PROC|STRM_EV_CS_ST|STRM_EV_STRM_ERR, s); + return; +} + +/* This function initiates a server connection request on a stream connector + * already in SC_ST_REQ state. Upon success, the state goes to SC_ST_ASS for + * a real connection to a server, indicating that a server has been assigned, + * or SC_ST_RDY for a successful connection to an applet. It may also return + * SC_ST_QUE, or SC_ST_CLO upon error. + */ +void back_handle_st_req(struct stream *s) +{ + struct stconn *sc = s->scb; + + if (sc->state != SC_ST_REQ) + return; + + DBG_TRACE_ENTER(STRM_EV_STRM_PROC|STRM_EV_CS_ST, s); + + if (unlikely(obj_type(s->target) == OBJ_TYPE_APPLET)) { + struct appctx *appctx; + + /* The target is an applet but the SC is in SC_ST_REQ. Thus it + * means no appctx are attached to the SC. Otherwise, it will be + * in SC_ST_RDY state. So, try to create the appctx now. 
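For reference, a compact summary of the transitions handled by back_try_conn_req() above, derived from its comments and expressed as data (not code from the patch):

#include <stdio.h>

struct trans { const char *in, *out; };

int main(void)
{
	static const struct trans t[] = {
		{ "SC_ST_ASS", "SC_ST_CON/EST on connect, or the CER retry path on error" },
		{ "SC_ST_QUE", "SC_ST_ASS/REQ on dequeue, SC_ST_CLO on timeout or abort" },
		{ "SC_ST_TAR", "SC_ST_ASS/REQ once the turn-around delay expires" },
	};

	for (unsigned i = 0; i < sizeof(t) / sizeof(*t); i++)
		printf("%-10s -> %s\n", t[i].in, t[i].out);
	return 0;
}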
+ */ + BUG_ON(sc_appctx(sc)); + appctx = sc_applet_create(sc, objt_applet(s->target)); + if (!appctx) { + /* No more memory, let's immediately abort. Force the + * error code to ignore the ERR_LOCAL which is not a + * real error. + */ + s->flags &= ~(SF_ERR_MASK | SF_FINST_MASK); + + sc_abort(sc); + sc_shutdown(sc); + sc->flags |= SC_FL_ERROR; + s->conn_err_type = STRM_ET_CONN_RES; + sc->state = SC_ST_CLO; + if (s->srv_error) + s->srv_error(s, sc); + DBG_TRACE_STATE("failed to register applet", STRM_EV_STRM_PROC|STRM_EV_CS_ST|STRM_EV_STRM_ERR, s); + goto end; + } + + DBG_TRACE_STATE("applet registered", STRM_EV_STRM_PROC|STRM_EV_CS_ST, s); + goto end; + } + + /* Try to assign a server */ + if (srv_redispatch_connect(s) != 0) { + /* We did not get a server. Either we queued the + * connection request, or we encountered an error. + */ + if (sc->state == SC_ST_QUE) { + DBG_TRACE_STATE("connection request queued", STRM_EV_STRM_PROC|STRM_EV_CS_ST, s); + goto end; + } + + /* we did not get any server, let's check the cause */ + sc_abort(sc); + sc_shutdown(sc); + sc->flags |= SC_FL_ERROR; + if (!s->conn_err_type) + s->conn_err_type = STRM_ET_CONN_OTHER; + sc->state = SC_ST_CLO; + if (s->srv_error) + s->srv_error(s, sc); + DBG_TRACE_STATE("connection request failed", STRM_EV_STRM_PROC|STRM_EV_CS_ST|STRM_EV_STRM_ERR, s); + goto end; + } + + /* The server is assigned */ + s->logs.t_queue = ns_to_ms(now_ns - s->logs.accept_ts); + sc->state = SC_ST_ASS; + be_set_sess_last(s->be); + DBG_TRACE_STATE("connection request assigned to a server", STRM_EV_STRM_PROC|STRM_EV_CS_ST, s); + + end: + DBG_TRACE_LEAVE(STRM_EV_STRM_PROC|STRM_EV_CS_ST, s); +} + +/* This function is called with (sc->state == SC_ST_CON) meaning that a + * connection was attempted and that the file descriptor is already allocated. + * We must check for timeout, error and abort. Possible output states are + * SC_ST_CER (error), SC_ST_DIS (abort), and SC_ST_CON (no change). This only + * works with connection-based streams. We know that there were no I/O event + * when reaching this function. Timeouts and errors are *not* cleared. + */ +void back_handle_st_con(struct stream *s) +{ + struct stconn *sc = s->scb; + struct channel *req = &s->req; + + DBG_TRACE_ENTER(STRM_EV_STRM_PROC|STRM_EV_CS_ST, s); + + /* the client might want to abort */ + if ((s->scf->flags & SC_FL_SHUT_DONE) || + ((s->scb->flags & SC_FL_SHUT_WANTED) && + (!co_data(req) || (s->be->options & PR_O_ABRT_CLOSE)))) { + sc->flags |= SC_FL_NOLINGER; + sc_shutdown(sc); + s->conn_err_type |= STRM_ET_CONN_ABRT; + if (s->srv_error) + s->srv_error(s, sc); + /* Note: state = SC_ST_DIS now */ + DBG_TRACE_STATE("client abort during connection attempt", STRM_EV_STRM_PROC|STRM_EV_CS_ST|STRM_EV_STRM_ERR, s); + goto end; + } + + done: + /* retryable error ? */ + if ((s->flags & SF_CONN_EXP) || (sc->flags & SC_FL_ERROR)) { + if (!s->conn_err_type) { + if ((sc->flags & SC_FL_ERROR)) + s->conn_err_type = STRM_ET_CONN_ERR; + else + s->conn_err_type = STRM_ET_CONN_TO; + } + + sc->state = SC_ST_CER; + DBG_TRACE_STATE("connection failed, retry", STRM_EV_STRM_PROC|STRM_EV_CS_ST|STRM_EV_STRM_ERR, s); + } + + end: + DBG_TRACE_LEAVE(STRM_EV_STRM_PROC|STRM_EV_CS_ST, s); +} + +/* This function is called with (sc->state == SC_ST_CER) meaning that a + * previous connection attempt has failed and that the file descriptor + * has already been released. Possible causes include asynchronous error + * notification and time out. 
Possible output states are SC_ST_CLO when
+ * retries are exhausted, SC_ST_TAR when a delay is wanted before a new
+ * connection attempt, SC_ST_ASS when it's wise to retry on the same server,
+ * and SC_ST_REQ when an immediate redispatch is wanted. The buffers are
+ * marked as in error state. Timeouts and errors are cleared before retrying.
+ */
+void back_handle_st_cer(struct stream *s)
+{
+	struct stconn *sc = s->scb;
+	int must_tar = !!(sc->flags & SC_FL_ERROR);
+
+	DBG_TRACE_ENTER(STRM_EV_STRM_PROC|STRM_EV_CS_ST, s);
+
+	s->conn_exp = TICK_ETERNITY;
+	s->flags &= ~SF_CONN_EXP;
+
+	/* we probably have to release last stream from the server */
+	if (objt_server(s->target)) {
+		struct connection *conn = sc_conn(sc);
+
+		health_adjust(__objt_server(s->target), HANA_STATUS_L4_ERR);
+
+		if (s->flags & SF_CURR_SESS) {
+			s->flags &= ~SF_CURR_SESS;
+			_HA_ATOMIC_DEC(&__objt_server(s->target)->cur_sess);
+		}
+
+		if ((sc->flags & SC_FL_ERROR) &&
+		    conn && conn->err_code == CO_ER_SSL_MISMATCH_SNI) {
+			/* We tried to connect to a server which is configured
+			 * with "verify required" and which doesn't have the
+			 * "verifyhost" directive. The server presented a wrong
+			 * certificate (a certificate for an unexpected name),
+			 * which implies that we have used SNI in the handshake,
+			 * and that the server doesn't have the associated cert
+			 * and presented a default one.
+			 *
+			 * This is a serious enough issue not to retry. It's
+			 * especially important because this wrong name might
+			 * either be the result of a configuration error, and
+			 * retrying will only hammer the server, or is caused
+			 * by the use of a wrong SNI value, most likely
+			 * provided by the client and we don't want to let the
+			 * client provoke retries.
+			 */
+			s->conn_retries = s->be->conn_retries;
+			DBG_TRACE_DEVEL("Bad SSL cert, disable connection retries", STRM_EV_STRM_PROC|STRM_EV_CS_ST|STRM_EV_STRM_ERR, s);
+		}
+	}
+
+	/* ensure that we have enough retries left */
+	if (s->conn_retries >= s->be->conn_retries || !(s->be->retry_type & PR_RE_CONN_FAILED)) {
+		if (!s->conn_err_type) {
+			s->conn_err_type = STRM_ET_CONN_ERR;
+		}
+
+		if (objt_server(s->target))
+			_HA_ATOMIC_INC(&objt_server(s->target)->counters.failed_conns);
+		_HA_ATOMIC_INC(&s->be->be_counters.failed_conns);
+		sess_change_server(s, NULL);
+		if (may_dequeue_tasks(objt_server(s->target), s->be))
+			process_srv_queue(objt_server(s->target));
+
+		/* shutw is enough to stop a connecting socket */
+		sc_shutdown(sc);
+		sc->flags |= SC_FL_ERROR;
+
+		sc->state = SC_ST_CLO;
+		if (s->srv_error)
+			s->srv_error(s, sc);
+
+		DBG_TRACE_STATE("connection failed", STRM_EV_STRM_PROC|STRM_EV_CS_ST|STRM_EV_STRM_ERR, s);
+		goto end;
+	}
+
+	/* At this stage, we will trigger a connection retry (with or without
+	 * redispatch). Thus we must reset the SI endpoint on the server side
+	 * and close the attached connection. It is especially important to do it
+	 * now if the retry is not immediately performed, to be sure to release
+	 * resources as soon as possible and to not catch errors from the lower
+	 * layers in an unexpected state (i.e. < ST_CONN).
+	 *
+	 * Note: the stream connector will be switched to ST_REQ, ST_ASS or
+	 * ST_TAR and SC_FL_ERROR and SF_CONN_EXP flags will be unset.
+	 */
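The retry budget test above is worth restating: retries stop either when the per-stream counter reaches the backend's limit or when the retry policy does not include failed connections. A standalone model, with the flag value invented for the example:

#include <stdio.h>

/* Model of the guard above; PR_RE_CONN_FAILED stands in for the real
 * retry-on bit. */
#define PR_RE_CONN_FAILED 0x1

static int give_up(int conn_retries, int be_retries, unsigned retry_type)
{
	return conn_retries >= be_retries || !(retry_type & PR_RE_CONN_FAILED);
}

int main(void)
{
	printf("%d\n", give_up(3, 3, PR_RE_CONN_FAILED)); /* exhausted     -> 1 */
	printf("%d\n", give_up(0, 3, 0));                 /* not retryable -> 1 */
	printf("%d\n", give_up(1, 3, PR_RE_CONN_FAILED)); /* retry         -> 0 */
	return 0;
}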
+	if (sc_reset_endp(sc) < 0) {
+		if (!s->conn_err_type)
+			s->conn_err_type = STRM_ET_CONN_OTHER;
+
+		if (objt_server(s->target))
+			_HA_ATOMIC_INC(&objt_server(s->target)->counters.internal_errors);
+		_HA_ATOMIC_INC(&s->be->be_counters.internal_errors);
+		sess_change_server(s, NULL);
+		if (may_dequeue_tasks(objt_server(s->target), s->be))
+			process_srv_queue(objt_server(s->target));
+
+		/* shutw is enough to stop a connecting socket */
+		sc_shutdown(sc);
+		sc->flags |= SC_FL_ERROR;
+
+		sc->state = SC_ST_CLO;
+		if (s->srv_error)
+			s->srv_error(s, sc);
+
+		DBG_TRACE_STATE("error resetting endpoint", STRM_EV_STRM_PROC|STRM_EV_CS_ST|STRM_EV_STRM_ERR, s);
+		goto end;
+	}
+
+	s->conn_retries++;
+	stream_choose_redispatch(s);
+
+	if (must_tar) {
+		/* The error was an asynchronous connection error, and we will
+		 * likely have to retry connecting to the same server, most
+		 * likely leading to the same result. To avoid this, we wait
+		 * MIN(one second, connect timeout) before retrying. We don't
+		 * do it when the failure happened on a reused connection
+		 * though.
+		 */
+
+		int delay = 1000;
+		const int reused = (s->flags & SF_SRV_REUSED) &&
+		                   !(s->flags & SF_SRV_REUSED_ANTICIPATED);
+
+		if (s->be->timeout.connect && s->be->timeout.connect < delay)
+			delay = s->be->timeout.connect;
+
+		if (!s->conn_err_type)
+			s->conn_err_type = STRM_ET_CONN_ERR;
+
+		/* only wait when we're retrying on the same server */
+		if ((sc->state == SC_ST_ASS ||
+		     (s->be->srv_act <= 1)) && !reused) {
+			sc->state = SC_ST_TAR;
+			s->conn_exp = tick_add(now_ms, MS_TO_TICKS(delay));
+		}
+		DBG_TRACE_STATE("retry a new connection", STRM_EV_STRM_PROC|STRM_EV_CS_ST, s);
+	}
+
+  end:
+	DBG_TRACE_LEAVE(STRM_EV_STRM_PROC|STRM_EV_CS_ST, s);
+}
+
+/* This function is called with (sc->state == SC_ST_RDY) meaning that a
+ * connection was attempted, that the file descriptor is already allocated,
+ * and that it has succeeded. We must still check for errors and aborts.
+ * Possible output states are SC_ST_EST (established), SC_ST_CER (error),
+ * and SC_ST_DIS (abort). This only works with connection-based streams.
+ * Timeouts and errors are *not* cleared.
+ */
+void back_handle_st_rdy(struct stream *s)
+{
+	struct stconn *sc = s->scb;
+	struct channel *req = &s->req;
+
+	DBG_TRACE_ENTER(STRM_EV_STRM_PROC|STRM_EV_CS_ST, s);
+
+	if (unlikely(obj_type(s->target) == OBJ_TYPE_APPLET)) {
+		/* Here the appctx must exist because the SC was set to the
+		 * SC_ST_RDY state when the appctx was created.
+		 */
+		BUG_ON(!sc_appctx(s->scb));
+
+		if (!s->logs.request_ts)
+			s->logs.request_ts = now_ns;
+		s->logs.t_queue = ns_to_ms(now_ns - s->logs.accept_ts);
+		be_set_sess_last(s->be);
+	}
+
+	/* We know the connection at least succeeded, though it could have
+	 * since met an error for any other reason. At least it didn't time out
+	 * even though the timeout might have been reported right after success.
+	 * We need to take care of various situations here :
+	 *   - everything might be OK. We have to switch to established.
+	 *   - an I/O error might have been reported after a successful transfer,
+	 *     which is not retryable and needs to be logged correctly, and needs
+	 *     established as well
+	 *   - SC_ST_CON implies !CF_WROTE_DATA but not conversely as we could
+	 *     have validated a connection with incoming data (e.g. TCP with a
+	 *     banner protocol), or just a successful connect() probe.
+	 *   - the client might have requested a connection abort, this needs to
+	 *     be checked before we decide to retry anything.
+	 */
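The turn-around delay chosen by back_handle_st_cer() above is simply MIN(1s, connect timeout), skipped entirely when the failure happened on a reused connection. A standalone model, with values in milliseconds:

#include <stdio.h>

/* Model of the delay computation above; inputs are plain integers. */
static int tar_delay(int timeout_connect, int reused)
{
	int delay = 1000;

	if (reused)
		return 0;
	if (timeout_connect && timeout_connect < delay)
		delay = timeout_connect;
	return delay;
}

int main(void)
{
	printf("%d\n", tar_delay(300, 0)); /* short connect timeout -> 300  */
	printf("%d\n", tar_delay(0, 0));   /* no timeout set        -> 1000 */
	printf("%d\n", tar_delay(300, 1)); /* reused connection     -> 0    */
	return 0;
}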
+
+	/* it's still possible to handle client aborts or connection retries
+	 * before any data were sent.
+	 */
+	if (!(req->flags & CF_WROTE_DATA)) {
+		/* client abort ? */
+		if ((s->scf->flags & SC_FL_SHUT_DONE) ||
+		    ((s->scb->flags & SC_FL_SHUT_WANTED) &&
+		     (!co_data(req) || (s->be->options & PR_O_ABRT_CLOSE)))) {
+			/* give up */
+			sc->flags |= SC_FL_NOLINGER;
+			sc_shutdown(sc);
+			s->conn_err_type |= STRM_ET_CONN_ABRT;
+			if (s->srv_error)
+				s->srv_error(s, sc);
+			DBG_TRACE_STATE("client abort during connection attempt", STRM_EV_STRM_PROC|STRM_EV_CS_ST|STRM_EV_STRM_ERR, s);
+			goto end;
+		}
+
+		/* retryable error ? */
+		if (sc->flags & SC_FL_ERROR) {
+			if (!s->conn_err_type)
+				s->conn_err_type = STRM_ET_CONN_ERR;
+			sc->state = SC_ST_CER;
+			DBG_TRACE_STATE("connection failed, retry", STRM_EV_STRM_PROC|STRM_EV_CS_ST|STRM_EV_STRM_ERR, s);
+			goto end;
+		}
+	}
+
+	/* data were sent and/or we had no error, back_establish() will
+	 * now take over.
+	 */
+	DBG_TRACE_STATE("connection established", STRM_EV_STRM_PROC|STRM_EV_CS_ST, s);
+	s->conn_err_type = STRM_ET_NONE;
+	sc->state = SC_ST_EST;
+
+  end:
+	DBG_TRACE_LEAVE(STRM_EV_STRM_PROC|STRM_EV_CS_ST, s);
+}
+
+/* sends a log message when a backend goes down, and also sets last
+ * change date.
+ */
+void set_backend_down(struct proxy *be)
+{
+	be->last_change = ns_to_sec(now_ns);
+	_HA_ATOMIC_INC(&be->down_trans);
+
+	if (!(global.mode & MODE_STARTING)) {
+		ha_alert("%s '%s' has no server available!\n", proxy_type_str(be), be->id);
+		send_log(be, LOG_EMERG, "%s %s has no server available!\n", proxy_type_str(be), be->id);
+	}
+}
+
+/* Apply RDP cookie persistence to the current stream. For this, the function
+ * tries to extract an RDP cookie from the request buffer, and looks for the
+ * matching server in the list. If the server is found, it is assigned to the
+ * stream. This always returns 1, and the analyser removes itself from the
+ * list. Nothing is performed if a server was already assigned.
+ */
+int tcp_persist_rdp_cookie(struct stream *s, struct channel *req, int an_bit)
+{
+	struct proxy *px = s->be;
+	int ret;
+	struct sample smp;
+	struct server *srv = px->srv;
+	uint16_t port;
+	uint32_t addr;
+	char *p;
+
+	DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_TCP_ANA, s);
+
+	if (s->flags & SF_ASSIGNED)
+		goto no_cookie;
+
+	memset(&smp, 0, sizeof(smp));
+
+	ret = fetch_rdp_cookie_name(s, &smp, s->be->rdp_cookie_name, s->be->rdp_cookie_len);
+	if (ret == 0 || (smp.flags & SMP_F_MAY_CHANGE) || smp.data.u.str.data == 0)
+		goto no_cookie;
+
+	/* The cookie is fetched the same way the matching ACL does it, so the
+	 * string may end with <cr><lf>. The expected format is "<ip>.<port>"
+	 * where "ip" is the integer value of the server's IPv4 address in
+	 * network order, and "port" is the integer value of the server's port
+	 * in network order.
+ */ + addr = strtoul(smp.data.u.str.area, &p, 10); + if (*p != '.') + goto no_cookie; + p++; + + port = ntohs(strtoul(p, &p, 10)); + if (*p != '.') + goto no_cookie; + + s->target = NULL; + while (srv) { + if (srv->addr.ss_family == AF_INET && + port == srv->svc_port && + addr == ((struct sockaddr_in *)&srv->addr)->sin_addr.s_addr) { + if ((srv->cur_state != SRV_ST_STOPPED) || (px->options & PR_O_PERSIST)) { + /* we found the server and it is usable */ + s->flags |= SF_DIRECT | SF_ASSIGNED; + s->target = &srv->obj_type; + break; + } + } + srv = srv->next; + } + +no_cookie: + req->analysers &= ~an_bit; + req->analyse_exp = TICK_ETERNITY; + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_TCP_ANA, s); + return 1; +} + +int be_downtime(struct proxy *px) { + if (px->lbprm.tot_weight && px->last_change < ns_to_sec(now_ns)) // ignore negative time + return px->down_time; + + return ns_to_sec(now_ns) - px->last_change + px->down_time; +} + +/* + * This function returns a string containing the balancing + * mode of the proxy in a format suitable for stats. + */ + +const char *backend_lb_algo_str(int algo) { + + if (algo == BE_LB_ALGO_RR) + return "roundrobin"; + else if (algo == BE_LB_ALGO_SRR) + return "static-rr"; + else if (algo == BE_LB_ALGO_FAS) + return "first"; + else if (algo == BE_LB_ALGO_LC) + return "leastconn"; + else if (algo == BE_LB_ALGO_SH) + return "source"; + else if (algo == BE_LB_ALGO_UH) + return "uri"; + else if (algo == BE_LB_ALGO_PH) + return "url_param"; + else if (algo == BE_LB_ALGO_HH) + return "hdr"; + else if (algo == BE_LB_ALGO_RCH) + return "rdp-cookie"; + else if (algo == BE_LB_ALGO_SMP) + return "hash"; + else if (algo == BE_LB_ALGO_NONE) + return "none"; + else + return "unknown"; +} + +/* This function parses a "balance" statement in a backend section describing + * <curproxy>. It returns -1 if there is any error, otherwise zero. If it + * returns -1, it will write an error message into the <err> buffer which will + * automatically be allocated and must be passed as NULL. The trailing '\n' + * will not be written. The function must be called with <args> pointing to the + * first word after "balance". 
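The cookie payload parsed above is easier to see with a concrete value. The sketch below builds one for a hypothetical server at 192.0.2.1:3389; both numbers are the network-order values printed in decimal (so the ip part depends on host endianness, which is exactly what the strtoul() comparison above relies on), and the parser also expects a trailing dot.

#include <arpa/inet.h>
#include <stdio.h>

/* Build a "<ip>.<port>." payload as matched above, for a made-up server. */
int main(void)
{
	struct in_addr ip;
	unsigned short nport = htons(3389);

	if (inet_pton(AF_INET, "192.0.2.1", &ip) != 1)
		return 1;
	printf("mstshash payload: %u.%u.\n",
	       (unsigned)ip.s_addr, (unsigned)nport);
	return 0;
}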
+ */ +int backend_parse_balance(const char **args, char **err, struct proxy *curproxy) +{ + if (!*(args[0])) { + /* if no option is set, use round-robin by default */ + curproxy->lbprm.algo &= ~BE_LB_ALGO; + curproxy->lbprm.algo |= BE_LB_ALGO_RR; + return 0; + } + + if (strcmp(args[0], "roundrobin") == 0) { + curproxy->lbprm.algo &= ~BE_LB_ALGO; + curproxy->lbprm.algo |= BE_LB_ALGO_RR; + } + else if (strcmp(args[0], "static-rr") == 0) { + curproxy->lbprm.algo &= ~BE_LB_ALGO; + curproxy->lbprm.algo |= BE_LB_ALGO_SRR; + } + else if (strcmp(args[0], "first") == 0) { + curproxy->lbprm.algo &= ~BE_LB_ALGO; + curproxy->lbprm.algo |= BE_LB_ALGO_FAS; + } + else if (strcmp(args[0], "leastconn") == 0) { + curproxy->lbprm.algo &= ~BE_LB_ALGO; + curproxy->lbprm.algo |= BE_LB_ALGO_LC; + } + else if (!strncmp(args[0], "random", 6)) { + curproxy->lbprm.algo &= ~BE_LB_ALGO; + curproxy->lbprm.algo |= BE_LB_ALGO_RND; + curproxy->lbprm.arg_opt1 = 2; + + if (*(args[0] + 6) == '(' && *(args[0] + 7) != ')') { /* number of draws */ + const char *beg; + char *end; + + beg = args[0] + 7; + curproxy->lbprm.arg_opt1 = strtol(beg, &end, 0); + + if (*end != ')') { + if (!*end) + memprintf(err, "random : missing closing parenthesis."); + else + memprintf(err, "random : unexpected character '%c' after argument.", *end); + return -1; + } + + if (curproxy->lbprm.arg_opt1 < 1) { + memprintf(err, "random : number of draws must be at least 1."); + return -1; + } + } + } + else if (strcmp(args[0], "source") == 0) { + curproxy->lbprm.algo &= ~BE_LB_ALGO; + curproxy->lbprm.algo |= BE_LB_ALGO_SH; + } + else if (strcmp(args[0], "uri") == 0) { + int arg = 1; + + curproxy->lbprm.algo &= ~BE_LB_ALGO; + curproxy->lbprm.algo |= BE_LB_ALGO_UH; + curproxy->lbprm.arg_opt1 = 0; // "whole", "path-only" + curproxy->lbprm.arg_opt2 = 0; // "len" + curproxy->lbprm.arg_opt3 = 0; // "depth" + + while (*args[arg]) { + if (strcmp(args[arg], "len") == 0) { + if (!*args[arg+1] || (atoi(args[arg+1]) <= 0)) { + memprintf(err, "%s : '%s' expects a positive integer (got '%s').", args[0], args[arg], args[arg+1]); + return -1; + } + curproxy->lbprm.arg_opt2 = atoi(args[arg+1]); + arg += 2; + } + else if (strcmp(args[arg], "depth") == 0) { + if (!*args[arg+1] || (atoi(args[arg+1]) <= 0)) { + memprintf(err, "%s : '%s' expects a positive integer (got '%s').", args[0], args[arg], args[arg+1]); + return -1; + } + /* hint: we store the position of the ending '/' (depth+1) so + * that we avoid a comparison while computing the hash. 
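The "depth" option above limits the hashed part of the URI to its first <depth> slash-delimited components, i.e. everything up to the (depth+1)-th '/'. A standalone model of that truncation (HAProxy itself stores depth+1 to spare a comparison, as the comment says):

#include <stdio.h>

/* Return the length of the URI prefix covering the first <depth> path
 * components; illustrative only. */
static size_t uri_hash_len(const char *uri, int depth)
{
	int slashes = 0;
	size_t i;

	for (i = 0; uri[i]; i++) {
		if (uri[i] == '/' && ++slashes > depth)
			break;
	}
	return i;
}

int main(void)
{
	const char *u = "/app/v1/users/42";

	printf("%.*s\n", (int)uri_hash_len(u, 2), u); /* prints "/app/v1" */
	return 0;
}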
+ */ + curproxy->lbprm.arg_opt3 = atoi(args[arg+1]) + 1; + arg += 2; + } + else if (strcmp(args[arg], "whole") == 0) { + curproxy->lbprm.arg_opt1 |= 1; + arg += 1; + } + else if (strcmp(args[arg], "path-only") == 0) { + curproxy->lbprm.arg_opt1 |= 2; + arg += 1; + } + else { + memprintf(err, "%s only accepts parameters 'len', 'depth', 'path-only', and 'whole' (got '%s').", args[0], args[arg]); + return -1; + } + } + } + else if (strcmp(args[0], "url_param") == 0) { + if (!*args[1]) { + memprintf(err, "%s requires an URL parameter name.", args[0]); + return -1; + } + curproxy->lbprm.algo &= ~BE_LB_ALGO; + curproxy->lbprm.algo |= BE_LB_ALGO_PH; + + free(curproxy->lbprm.arg_str); + curproxy->lbprm.arg_str = strdup(args[1]); + curproxy->lbprm.arg_len = strlen(args[1]); + if (*args[2]) { + if (strcmp(args[2], "check_post") != 0) { + memprintf(err, "%s only accepts 'check_post' modifier (got '%s').", args[0], args[2]); + return -1; + } + } + } + else if (strcmp(args[0], "hash") == 0) { + if (!*args[1]) { + memprintf(err, "%s requires a sample expression.", args[0]); + return -1; + } + curproxy->lbprm.algo &= ~BE_LB_ALGO; + curproxy->lbprm.algo |= BE_LB_ALGO_SMP; + + ha_free(&curproxy->lbprm.arg_str); + curproxy->lbprm.arg_str = strdup(args[1]); + curproxy->lbprm.arg_len = strlen(args[1]); + + if (*args[2]) { + memprintf(err, "%s takes no other argument (got '%s').", args[0], args[2]); + return -1; + } + } + else if (!strncmp(args[0], "hdr(", 4)) { + const char *beg, *end; + + beg = args[0] + 4; + end = strchr(beg, ')'); + + if (!end || end == beg) { + memprintf(err, "hdr requires an http header field name."); + return -1; + } + + curproxy->lbprm.algo &= ~BE_LB_ALGO; + curproxy->lbprm.algo |= BE_LB_ALGO_HH; + + free(curproxy->lbprm.arg_str); + curproxy->lbprm.arg_len = end - beg; + curproxy->lbprm.arg_str = my_strndup(beg, end - beg); + curproxy->lbprm.arg_opt1 = 0; + + if (*args[1]) { + if (strcmp(args[1], "use_domain_only") != 0) { + memprintf(err, "%s only accepts 'use_domain_only' modifier (got '%s').", args[0], args[1]); + return -1; + } + curproxy->lbprm.arg_opt1 = 1; + } + } + else if (!strncmp(args[0], "rdp-cookie", 10)) { + curproxy->lbprm.algo &= ~BE_LB_ALGO; + curproxy->lbprm.algo |= BE_LB_ALGO_RCH; + + if ( *(args[0] + 10 ) == '(' ) { /* cookie name */ + const char *beg, *end; + + beg = args[0] + 11; + end = strchr(beg, ')'); + + if (!end || end == beg) { + memprintf(err, "rdp-cookie : missing cookie name."); + return -1; + } + + free(curproxy->lbprm.arg_str); + curproxy->lbprm.arg_str = my_strndup(beg, end - beg); + curproxy->lbprm.arg_len = end - beg; + } + else if ( *(args[0] + 10 ) == '\0' ) { /* default cookie name 'mstshash' */ + free(curproxy->lbprm.arg_str); + curproxy->lbprm.arg_str = strdup("mstshash"); + curproxy->lbprm.arg_len = strlen(curproxy->lbprm.arg_str); + } + else { /* syntax */ + memprintf(err, "rdp-cookie : missing cookie name."); + return -1; + } + } + else if (strcmp(args[0], "log-hash") == 0) { + if (!*args[1]) { + memprintf(err, "%s requires a converter list.", args[0]); + return -1; + } + curproxy->lbprm.algo &= ~BE_LB_ALGO; + curproxy->lbprm.algo |= BE_LB_ALGO_LH; + + ha_free(&curproxy->lbprm.arg_str); + curproxy->lbprm.arg_str = strdup(args[1]); + } + else if (strcmp(args[0], "sticky") == 0) { + curproxy->lbprm.algo &= ~BE_LB_ALGO; + curproxy->lbprm.algo |= BE_LB_ALGO_LS; + } + else { + memprintf(err, "only supports 'roundrobin', 'static-rr', 'leastconn', 'source', 'uri', 'url_param', 'hash', 'hdr(name)', 'rdp-cookie(name)', 'log-hash' and 'sticky' 
options."); + return -1; + } + return 0; +} + + +/************************************************************************/ +/* All supported sample and ACL keywords must be declared here. */ +/************************************************************************/ + +/* set temp integer to the number of enabled servers on the proxy. + * Accepts exactly 1 argument. Argument is a backend, other types will lead to + * undefined behaviour. + */ +static int +smp_fetch_nbsrv(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct proxy *px = args->data.prx; + + if (px == NULL) + return 0; + if (px->cap & PR_CAP_DEF) + px = smp->px; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + + smp->data.u.sint = be_usable_srv(px); + + return 1; +} + +/* report in smp->flags a success or failure depending on the designated + * server's state. There is no match function involved since there's no pattern. + * Accepts exactly 1 argument. Argument is a server, other types will lead to + * undefined behaviour. + */ +static int +smp_fetch_srv_is_up(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct server *srv = args->data.srv; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_BOOL; + if (!(srv->cur_admin & SRV_ADMF_MAINT) && + (!(srv->check.state & CHK_ST_CONFIGURED) || (srv->cur_state != SRV_ST_STOPPED))) + smp->data.u.sint = 1; + else + smp->data.u.sint = 0; + return 1; +} + +/* set temp integer to the number of enabled servers on the proxy. + * Accepts exactly 1 argument. Argument is a backend, other types will lead to + * undefined behaviour. + */ +static int +smp_fetch_connslots(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct server *iterator; + struct proxy *px = args->data.prx; + + if (px == NULL) + return 0; + if (px->cap & PR_CAP_DEF) + px = smp->px; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + for (iterator = px->srv; iterator; iterator = iterator->next) { + if (iterator->cur_state == SRV_ST_STOPPED) + continue; + + if (iterator->maxconn == 0 || iterator->maxqueue == 0) { + /* configuration is stupid */ + smp->data.u.sint = -1; /* FIXME: stupid value! 
*/ + return 1; + } + + smp->data.u.sint += (iterator->maxconn - iterator->cur_sess) + + (iterator->maxqueue - iterator->queue.length); + } + + return 1; +} + +/* set temp integer to the id of the backend */ +static int +smp_fetch_be_id(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct proxy *px = NULL; + + if (smp->strm) + px = smp->strm->be; + else if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + px = __objt_check(smp->sess->origin)->proxy; + if (!px) + return 0; + + smp->flags = SMP_F_VOL_TXN; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = px->uuid; + return 1; +} + +/* set string to the name of the backend */ +static int +smp_fetch_be_name(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct proxy *px = NULL; + + if (smp->strm) + px = smp->strm->be; + else if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + px = __objt_check(smp->sess->origin)->proxy; + if (!px) + return 0; + + smp->data.u.str.area = (char *)px->id; + if (!smp->data.u.str.area) + return 0; + + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_CONST; + smp->data.u.str.data = strlen(smp->data.u.str.area); + + return 1; +} + +/* set temp integer to the id of the server */ +static int +smp_fetch_srv_id(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct server *srv = NULL; + + if (smp->strm) + srv = objt_server(smp->strm->target); + else if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + srv = __objt_check(smp->sess->origin)->server; + if (!srv) + return 0; + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = srv->puid; + + return 1; +} + +/* set string to the name of the server */ +static int +smp_fetch_srv_name(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct server *srv = NULL; + + if (smp->strm) + srv = objt_server(smp->strm->target); + else if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + srv = __objt_check(smp->sess->origin)->server; + if (!srv) + return 0; + + smp->data.u.str.area = srv->id; + if (!smp->data.u.str.area) + return 0; + + smp->data.type = SMP_T_STR; + smp->data.u.str.data = strlen(smp->data.u.str.area); + + return 1; +} + +/* set temp integer to the number of connections per second reaching the backend. + * Accepts exactly 1 argument. Argument is a backend, other types will lead to + * undefined behaviour. + */ +static int +smp_fetch_be_sess_rate(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct proxy *px = args->data.prx; + + if (px == NULL) + return 0; + if (px->cap & PR_CAP_DEF) + px = smp->px; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = read_freq_ctr(&px->be_sess_per_sec); + return 1; +} + +/* set temp integer to the number of concurrent connections on the backend. + * Accepts exactly 1 argument. Argument is a backend, other types will lead to + * undefined behaviour. + */ +static int +smp_fetch_be_conn(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct proxy *px = args->data.prx; + + if (px == NULL) + return 0; + if (px->cap & PR_CAP_DEF) + px = smp->px; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = px->beconn; + return 1; +} + +/* set temp integer to the number of available connections across available + * servers on the backend. + * Accepts exactly 1 argument. Argument is a backend, other types will lead to + * undefined behaviour. 
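/* [editor's usage note: illustrative only, not part of the upstream patch]
 * The fetches above are typically consumed from the configuration for
 * content switching on backend capacity; the backend names below are
 * hypothetical:
 *
 *     acl app_full connslots(be_app) lt 10
 *     use_backend  be_spillover if app_full
 *
 * connslots sums (maxconn - cur_sess) + (maxqueue - queue) over the
 * non-stopped servers and yields -1 as soon as one of them has maxconn
 * or maxqueue unset (0), so tests should account for that sentinel.
 */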
+ */ +static int +smp_fetch_be_conn_free(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct server *iterator; + struct proxy *px = args->data.prx; + unsigned int maxconn; + + if (px == NULL) + return 0; + if (px->cap & PR_CAP_DEF) + px = smp->px; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + for (iterator = px->srv; iterator; iterator = iterator->next) { + if (iterator->cur_state == SRV_ST_STOPPED) + continue; + + px = iterator->proxy; + if (!srv_currently_usable(iterator) || + ((iterator->flags & SRV_F_BACKUP) && + (px->srv_act || (iterator != px->lbprm.fbck && !(px->options & PR_O_USE_ALL_BK))))) + continue; + + if (iterator->maxconn == 0) { + /* one active server is unlimited, return -1 */ + smp->data.u.sint = -1; + return 1; + } + + maxconn = srv_dynamic_maxconn(iterator); + if (maxconn > iterator->cur_sess) + smp->data.u.sint += maxconn - iterator->cur_sess; + } + + return 1; +} + +/* set temp integer to the total number of queued connections on the backend. + * Accepts exactly 1 argument. Argument is a backend, other types will lead to + * undefined behaviour. + */ +static int +smp_fetch_queue_size(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct proxy *px = args->data.prx; + + if (px == NULL) + return 0; + if (px->cap & PR_CAP_DEF) + px = smp->px; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = px->totpend; + return 1; +} + +/* set temp integer to the total number of queued connections on the backend divided + * by the number of running servers and rounded up. If there is no running + * server, we return twice the total, just as if we had half a running server. + * This is more or less correct anyway, since we expect the last server to come + * back soon. + * Accepts exactly 1 argument. Argument is a backend, other types will lead to + * undefined behaviour. + */ +static int +smp_fetch_avg_queue_size(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct proxy *px = args->data.prx; + int nbsrv; + + if (px == NULL) + return 0; + if (px->cap & PR_CAP_DEF) + px = smp->px; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + + nbsrv = be_usable_srv(px); + + if (nbsrv > 0) + smp->data.u.sint = (px->totpend + nbsrv - 1) / nbsrv; + else + smp->data.u.sint = px->totpend * 2; + + return 1; +} + +/* set temp integer to the number of concurrent connections on the server in the backend. + * Accepts exactly 1 argument. Argument is a server, other types will lead to + * undefined behaviour. + */ +static int +smp_fetch_srv_conn(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = args->data.srv->cur_sess; + return 1; +} + +/* set temp integer to the number of available connections on the server in the backend. + * Accepts exactly 1 argument. Argument is a server, other types will lead to + * undefined behaviour. 
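/* [editor's note: illustrative only, not part of the upstream patch]
 * avg_queue above is an integer ceiling division: (totpend + nbsrv - 1)
 * / nbsrv. Worked example: 10 queued requests over 4 usable servers
 * gives (10 + 4 - 1) / 4 == 3, i.e. the average is rounded up. With no
 * usable server it returns totpend * 2, behaving as if half a server
 * were still running. Generic shape of the idiom:
 */
static inline long div_ceil(long a, long b)
{
	return (a + b - 1) / b;   /* assumes a >= 0 and b > 0 */
}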
+ */
+static int
+smp_fetch_srv_conn_free(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	unsigned int maxconn;
+
+	smp->flags = SMP_F_VOL_TEST;
+	smp->data.type = SMP_T_SINT;
+
+	if (args->data.srv->maxconn == 0) {
+		/* one active server is unlimited, return -1 */
+		smp->data.u.sint = -1;
+		return 1;
+	}
+
+	maxconn = srv_dynamic_maxconn(args->data.srv);
+	if (maxconn > args->data.srv->cur_sess)
+		smp->data.u.sint = maxconn - args->data.srv->cur_sess;
+	else
+		smp->data.u.sint = 0;
+
+	return 1;
+}
+
+/* set temp integer to the number of connections pending in the server's queue.
+ * Accepts exactly 1 argument. Argument is a server, other types will lead to
+ * undefined behaviour.
+ */
+static int
+smp_fetch_srv_queue(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	smp->flags = SMP_F_VOL_TEST;
+	smp->data.type = SMP_T_SINT;
+	smp->data.u.sint = args->data.srv->queue.length;
+	return 1;
+}
+
+/* set temp integer to the number of sessions per second reaching the server.
+ * Accepts exactly 1 argument. Argument is a server, other types will lead to
+ * undefined behaviour.
+ */
+static int
+smp_fetch_srv_sess_rate(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	smp->flags = SMP_F_VOL_TEST;
+	smp->data.type = SMP_T_SINT;
+	smp->data.u.sint = read_freq_ctr(&args->data.srv->sess_per_sec);
+	return 1;
+}
+
+/* set temp integer to the server weight, i.e. the scheduler's internal
+ * effective weight scaled back to the user range:
+ * (cur_eweight * wmult + wdiv - 1) / wdiv, rounding up.
+ * Accepts exactly 1 argument. Argument is a server, other types will lead to
+ * undefined behaviour.
+ */
+static int
+smp_fetch_srv_weight(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	struct server *srv = args->data.srv;
+	struct proxy *px = srv->proxy;
+
+	smp->flags = SMP_F_VOL_TEST;
+	smp->data.type = SMP_T_SINT;
+	smp->data.u.sint = (srv->cur_eweight * px->lbprm.wmult + px->lbprm.wdiv - 1) / px->lbprm.wdiv;
+	return 1;
+}
+
+/* set temp integer to the server initial weight.
+ * Accepts exactly 1 argument. Argument is a server, other types will lead to
+ * undefined behaviour.
+ */
+static int
+smp_fetch_srv_iweight(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	smp->flags = SMP_F_VOL_TEST;
+	smp->data.type = SMP_T_SINT;
+	smp->data.u.sint = args->data.srv->iweight;
+	return 1;
+}
+
+/* set temp integer to the server user-specified weight.
+ * Accepts exactly 1 argument. Argument is a server, other types will lead to
+ * undefined behaviour.
+ */ +static int +smp_fetch_srv_uweight(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = args->data.srv->uweight; + return 1; +} + +static int +smp_fetch_be_server_timeout(const struct arg *args, struct sample *smp, const char *km, void *private) +{ + struct proxy *px = NULL; + + if (smp->strm) + px = smp->strm->be; + else if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + px = __objt_check(smp->sess->origin)->proxy; + if (!px) + return 0; + + smp->flags = SMP_F_VOL_TXN; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = TICKS_TO_MS(px->timeout.server); + return 1; +} + +static int +smp_fetch_be_tunnel_timeout(const struct arg *args, struct sample *smp, const char *km, void *private) +{ + struct proxy *px = NULL; + + if (smp->strm) + px = smp->strm->be; + else if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + px = __objt_check(smp->sess->origin)->proxy; + if (!px) + return 0; + + smp->flags = SMP_F_VOL_TXN; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = TICKS_TO_MS(px->timeout.tunnel); + return 1; +} + +static int sample_conv_nbsrv(const struct arg *args, struct sample *smp, void *private) +{ + + struct proxy *px; + + if (!smp_make_safe(smp)) + return 0; + + px = proxy_find_by_name(smp->data.u.str.area, PR_CAP_BE, 0); + if (!px) + return 0; + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = be_usable_srv(px); + + return 1; +} + +static int +sample_conv_srv_queue(const struct arg *args, struct sample *smp, void *private) +{ + struct proxy *px; + struct server *srv; + char *bksep; + + if (!smp_make_safe(smp)) + return 0; + + bksep = strchr(smp->data.u.str.area, '/'); + + if (bksep) { + *bksep = '\0'; + px = proxy_find_by_name(smp->data.u.str.area, PR_CAP_BE, 0); + if (!px) + return 0; + smp->data.u.str.area = bksep + 1; + } else { + if (!(smp->px->cap & PR_CAP_BE)) + return 0; + px = smp->px; + } + + srv = server_find_by_name(px, smp->data.u.str.area); + if (!srv) + return 0; + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = srv->queue.length; + return 1; +} + +/* Note: must not be declared <const> as its list will be overwritten. + * Please take care of keeping this list alphabetically sorted. 
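/* [editor's usage note: illustrative only, not part of the upstream patch]
 * The srv_queue converter above accepts either "server" (looked up in
 * the current backend) or "backend/server". A hypothetical configuration
 * example, assuming the str() sample fetch and invented names be_app
 * and srv1:
 *
 *     http-request set-header X-Queue %[str(be_app/srv1),srv_queue]
 */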
+ */ +static struct sample_fetch_kw_list smp_kws = {ILH, { + { "avg_queue", smp_fetch_avg_queue_size, ARG1(1,BE), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "be_conn", smp_fetch_be_conn, ARG1(1,BE), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "be_conn_free", smp_fetch_be_conn_free, ARG1(1,BE), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "be_id", smp_fetch_be_id, 0, NULL, SMP_T_SINT, SMP_USE_BKEND, }, + { "be_name", smp_fetch_be_name, 0, NULL, SMP_T_STR, SMP_USE_BKEND, }, + { "be_server_timeout", smp_fetch_be_server_timeout, 0, NULL, SMP_T_SINT, SMP_USE_BKEND, }, + { "be_sess_rate", smp_fetch_be_sess_rate, ARG1(1,BE), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "be_tunnel_timeout", smp_fetch_be_tunnel_timeout, 0, NULL, SMP_T_SINT, SMP_USE_BKEND, }, + { "connslots", smp_fetch_connslots, ARG1(1,BE), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "nbsrv", smp_fetch_nbsrv, ARG1(1,BE), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "queue", smp_fetch_queue_size, ARG1(1,BE), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "srv_conn", smp_fetch_srv_conn, ARG1(1,SRV), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "srv_conn_free", smp_fetch_srv_conn_free, ARG1(1,SRV), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "srv_id", smp_fetch_srv_id, 0, NULL, SMP_T_SINT, SMP_USE_SERVR, }, + { "srv_is_up", smp_fetch_srv_is_up, ARG1(1,SRV), NULL, SMP_T_BOOL, SMP_USE_INTRN, }, + { "srv_name", smp_fetch_srv_name, 0, NULL, SMP_T_STR, SMP_USE_SERVR, }, + { "srv_queue", smp_fetch_srv_queue, ARG1(1,SRV), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "srv_sess_rate", smp_fetch_srv_sess_rate, ARG1(1,SRV), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "srv_weight", smp_fetch_srv_weight, ARG1(1,SRV), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "srv_iweight", smp_fetch_srv_iweight, ARG1(1,SRV), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "srv_uweight", smp_fetch_srv_uweight, ARG1(1,SRV), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &smp_kws); + +/* Note: must not be declared <const> as its list will be overwritten */ +static struct sample_conv_kw_list sample_conv_kws = {ILH, { + { "nbsrv", sample_conv_nbsrv, 0, NULL, SMP_T_STR, SMP_T_SINT }, + { "srv_queue", sample_conv_srv_queue, 0, NULL, SMP_T_STR, SMP_T_SINT }, + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, sample_register_convs, &sample_conv_kws); + +/* Note: must not be declared <const> as its list will be overwritten. + * Please take care of keeping this list alphabetically sorted. + */ +static struct acl_kw_list acl_kws = {ILH, { + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, acl_register_keywords, &acl_kws); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/base64.c b/src/base64.c new file mode 100644 index 0000000..0601bf6 --- /dev/null +++ b/src/base64.c @@ -0,0 +1,303 @@ +/* + * ASCII <-> Base64 conversion as described in RFC1421. + * + * Copyright 2006-2010 Willy Tarreau <w@1wt.eu> + * Copyright 2009-2010 Krzysztof Piotr Oledzki <ole@ans.pl> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
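/* [editor's note: illustrative only, not part of the upstream patch]
 * The tables above are the whole registration story for this file: a
 * static keyword array terminated by an empty entry is handed to
 * sample_register_fetches()/sample_register_convs() through INITCALL1
 * at the STG_REGISTER init stage. Adding a fetch is one more row, e.g.
 * (hypothetical name and implementation):
 *
 *     { "srv_maxconn", smp_fetch_srv_maxconn, ARG1(1,SRV), NULL,
 *       SMP_T_SINT, SMP_USE_INTRN, },
 */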
+ * + */ + +#include <stdlib.h> +#include <string.h> + +#include <haproxy/api.h> +#include <haproxy/base64.h> + +#define B64BASE '#' /* arbitrary chosen base value */ +#define B64CMIN '+' +#define UB64CMIN '-' +#define B64CMAX 'z' +#define B64PADV 64 /* Base64 chosen special pad value */ + +const char base64tab[65]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +const char base64rev[]="b###cXYZ[\\]^_`a###d###$%&'()*+,-./0123456789:;<=######>?@ABCDEFGHIJKLMNOPQRSTUVW"; +const char ubase64tab[65]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; +const char ubase64rev[]="b##XYZ[\\]^_`a###c###$%&'()*+,-./0123456789:;<=####c#>?@ABCDEFGHIJKLMNOPQRSTUVW"; + +/* Encodes <ilen> bytes from <in> to <out> for at most <olen> chars (including + * the trailing zero). Returns the number of bytes written. No check is made + * for <in> or <out> to be NULL. Returns negative value if <olen> is too short + * to accept <ilen>. 4 output bytes are produced for 1 to 3 input bytes. + */ +int a2base64(char *in, int ilen, char *out, int olen) +{ + int convlen; + + convlen = ((ilen + 2) / 3) * 4; + + if (convlen >= olen) + return -1; + + /* we don't need to check olen anymore */ + while (ilen >= 3) { + out[0] = base64tab[(((unsigned char)in[0]) >> 2)]; + out[1] = base64tab[(((unsigned char)in[0] & 0x03) << 4) | (((unsigned char)in[1]) >> 4)]; + out[2] = base64tab[(((unsigned char)in[1] & 0x0F) << 2) | (((unsigned char)in[2]) >> 6)]; + out[3] = base64tab[(((unsigned char)in[2] & 0x3F))]; + out += 4; + in += 3; ilen -= 3; + } + + if (!ilen) { + out[0] = '\0'; + } else { + out[0] = base64tab[((unsigned char)in[0]) >> 2]; + if (ilen == 1) { + out[1] = base64tab[((unsigned char)in[0] & 0x03) << 4]; + out[2] = '='; + } else { + out[1] = base64tab[(((unsigned char)in[0] & 0x03) << 4) | + (((unsigned char)in[1]) >> 4)]; + out[2] = base64tab[((unsigned char)in[1] & 0x0F) << 2]; + } + out[3] = '='; + out[4] = '\0'; + } + + return convlen; +} + +/* url variant of a2base64 */ +int a2base64url(const char *in, size_t ilen, char *out, size_t olen) +{ + int convlen; + + convlen = ((ilen + 2) / 3) * 4; + + if (convlen >= olen) + return -1; + + /* we don't need to check olen anymore */ + while (ilen >= 3) { + out[0] = ubase64tab[(((unsigned char)in[0]) >> 2)]; + out[1] = ubase64tab[(((unsigned char)in[0] & 0x03) << 4) | (((unsigned char)in[1]) >> 4)]; + out[2] = ubase64tab[(((unsigned char)in[1] & 0x0F) << 2) | (((unsigned char)in[2]) >> 6)]; + out[3] = ubase64tab[(((unsigned char)in[2] & 0x3F))]; + out += 4; + in += 3; + ilen -= 3; + } + + if (!ilen) { + out[0] = '\0'; + return convlen; + } + + out[0] = ubase64tab[((unsigned char)in[0]) >> 2]; + if (ilen == 1) { + out[1] = ubase64tab[((unsigned char)in[0] & 0x03) << 4]; + out[2] = '\0'; + convlen -= 2; + } else { + out[1] = ubase64tab[(((unsigned char)in[0] & 0x03) << 4) | + (((unsigned char)in[1]) >> 4)]; + out[2] = ubase64tab[((unsigned char)in[1] & 0x0F) << 2]; + out[3] = '\0'; + convlen -= 1; + } + + return convlen; +} + +/* Decodes <ilen> bytes from <in> to <out> for at most <olen> chars. + * Returns the number of bytes converted. No check is made for + * <in> or <out> to be NULL. Returns -1 if <in> is invalid or ilen + * has wrong size, -2 if <olen> is too short. + * 1 to 3 output bytes are produced for 4 input bytes. 
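/* [editor's note: illustrative only, not part of the upstream patch]
 * Minimal usage sketch of a2base64() above. The output buffer must hold
 * ((ilen + 2) / 3) * 4 characters plus the trailing NUL, otherwise the
 * function returns -1. Classic worked example: the 3 bytes "Man" encode
 * to the 4 characters "TWFu".
 */
#include <stdio.h>

int a2base64(char *in, int ilen, char *out, int olen); /* from haproxy/base64.h */

static void demo_a2base64(void)
{
	char in[] = "Man";
	char out[8];

	if (a2base64(in, 3, out, sizeof(out)) == 4)
		printf("%s\n", out);   /* prints "TWFu" */
}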
+ */ +int base64dec(const char *in, size_t ilen, char *out, size_t olen) { + + unsigned char t[4]; + signed char b; + int convlen = 0, i = 0, pad = 0; + + if (ilen % 4) + return -1; + + if (olen < ((ilen / 4 * 3) + - (in[ilen-1] == '=' ? 1 : 0) + - (in[ilen-2] == '=' ? 1 : 0))) + return -2; + + while (ilen) { + + /* if (*p < B64CMIN || *p > B64CMAX) */ + b = (signed char)*in - B64CMIN; + if ((unsigned char)b > (B64CMAX-B64CMIN)) + return -1; + + b = base64rev[b] - B64BASE - 1; + + /* b == -1: invalid character */ + if (b < 0) + return -1; + + /* padding has to be continuous */ + if (pad && b != B64PADV) + return -1; + + /* valid padding: "XX==" or "XXX=", but never "X===" or "====" */ + if (pad && i < 2) + return -1; + + if (b == B64PADV) + pad++; + + t[i++] = b; + + if (i == 4) { + /* + * WARNING: we allow to write little more data than we + * should, but the checks from the beginning of the + * functions guarantee that we can safely do that. + */ + + /* xx000000 xx001111 xx111122 xx222222 */ + if (convlen < olen) + out[convlen] = ((t[0] << 2) + (t[1] >> 4)); + if (convlen+1 < olen) + out[convlen+1] = ((t[1] << 4) + (t[2] >> 2)); + if (convlen+2 < olen) + out[convlen+2] = ((t[2] << 6) + (t[3] >> 0)); + + convlen += 3-pad; + + pad = i = 0; + } + + in++; + ilen--; + } + + return convlen; +} + +/* url variant of base64dec */ +/* The reverse tab used to decode base64 is generated via /dev/base64/base64rev-gen.c */ +int base64urldec(const char *in, size_t ilen, char *out, size_t olen) +{ + unsigned char t[4]; + signed char b; + int convlen = 0, i = 0, pad = 0, padlen = 0; + + switch (ilen % 4) { + case 0: + break; + case 2: + padlen = pad = 2; + break; + case 3: + padlen = pad = 1; + break; + default: + return -1; + } + + if (olen < (((ilen + pad) / 4 * 3) - pad)) + return -2; + + while (ilen + pad) { + if (ilen) { + /* if (*p < UB64CMIN || *p > B64CMAX) */ + b = (signed char) * in - UB64CMIN; + if ((unsigned char)b > (B64CMAX - UB64CMIN)) + return -1; + + b = ubase64rev[b] - B64BASE - 1; + /* b == -1: invalid character */ + if (b < 0) + return -1; + + in++; + ilen--; + + } else { + b = B64PADV; + pad--; + } + + t[i++] = b; + + if (i == 4) { + /* + * WARNING: we allow to write little more data than we + * should, but the checks from the beginning of the + * functions guarantee that we can safely do that. + */ + + /* xx000000 xx001111 xx111122 xx222222 */ + if (convlen < olen) + out[convlen] = ((t[0] << 2) + (t[1] >> 4)); + if (convlen+1 < olen) + out[convlen+1] = ((t[1] << 4) + (t[2] >> 2)); + if (convlen+2 < olen) + out[convlen+2] = ((t[2] << 6) + (t[3] >> 0)); + + convlen += 3; + i = 0; + } + } + convlen -= padlen; + + return convlen; +} + +/* Converts the lower 30 bits of an integer to a 5-char base64 string. The + * caller is responsible for ensuring that the output buffer can accept 6 bytes + * (5 + the trailing zero). The pointer to the string is returned. The + * conversion is performed with MSB first and in a format that can be + * decoded with b64tos30(). This format is not padded and thus is not + * compatible with usual base64 routines. + */ +const char *s30tob64(int in, char *out) +{ + int i; + for (i = 0; i < 5; i++) { + out[i] = base64tab[(in >> 24) & 0x3F]; + in <<= 6; + } + out[5] = '\0'; + return out; +} + +/* Converts a 5-char base64 string encoded by s30tob64() into a 30-bit integer. + * The caller is responsible for ensuring that the input contains at least 5 + * chars. If any unexpected character is encountered, a negative value is + * returned. 
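/* [editor's note: illustrative only, not part of the upstream patch]
 * Usage sketch of base64dec() above: the input length must be a
 * multiple of 4 (else -1), the output buffer must be large enough
 * (else -2), and padding may only terminate the input, so "TWFu"
 * decodes to "Man" and "TQ==" to "M" while "T===" or "====" are
 * rejected. The output is raw bytes and is not NUL-terminated.
 */
int base64dec(const char *in, size_t ilen, char *out, size_t olen); /* from haproxy/base64.h */

static int demo_base64dec(void)
{
	char buf[4];

	return base64dec("TWFu", 4, buf, sizeof(buf)); /* 3; buf holds 'M','a','n' */
}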
Otherwise the decoded value is returned. + */ +int b64tos30(const char *in) +{ + int i, out; + signed char b; + + out = 0; + for (i = 0; i < 5; i++) { + b = (signed char)in[i] - B64CMIN; + if ((unsigned char)b > (B64CMAX - B64CMIN)) + return -1; /* input character out of range */ + + b = base64rev[b] - B64BASE - 1; + if (b < 0) /* invalid character */ + return -1; + + if (b == B64PADV) /* padding not allowed */ + return -1; + + out = (out << 6) + b; + } + return out; +} diff --git a/src/cache.c b/src/cache.c new file mode 100644 index 0000000..9f12f10 --- /dev/null +++ b/src/cache.c @@ -0,0 +1,3014 @@ +/* + * Cache management + * + * Copyright 2017 HAProxy Technologies + * William Lallemand <wlallemand@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <import/eb32tree.h> +#include <import/sha1.h> + +#include <haproxy/action-t.h> +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/cfgparse.h> +#include <haproxy/channel.h> +#include <haproxy/cli.h> +#include <haproxy/errors.h> +#include <haproxy/filters.h> +#include <haproxy/hash.h> +#include <haproxy/http.h> +#include <haproxy/http_ana.h> +#include <haproxy/http_htx.h> +#include <haproxy/http_rules.h> +#include <haproxy/htx.h> +#include <haproxy/net_helper.h> +#include <haproxy/proxy.h> +#include <haproxy/sample.h> +#include <haproxy/sc_strm.h> +#include <haproxy/shctx.h> +#include <haproxy/stconn.h> +#include <haproxy/stream.h> +#include <haproxy/tools.h> +#include <haproxy/xxhash.h> + +#define CACHE_FLT_F_IMPLICIT_DECL 0x00000001 /* The cache filtre was implicitly declared (ie without + * the filter keyword) */ +#define CACHE_FLT_INIT 0x00000002 /* Whether the cache name was freed. */ + +static uint64_t cache_hash_seed = 0; + +const char *cache_store_flt_id = "cache store filter"; + +extern struct applet http_cache_applet; + +struct flt_ops cache_ops; + +struct cache_tree { + struct eb_root entries; /* head of cache entries based on keys */ + __decl_thread(HA_RWLOCK_T lock); + + struct list cleanup_list; + __decl_thread(HA_SPINLOCK_T cleanup_lock); +} ALIGNED(64); + +struct cache { + struct cache_tree trees[CACHE_TREE_NUM]; + struct list list; /* cache linked list */ + unsigned int maxage; /* max-age */ + unsigned int maxblocks; + unsigned int maxobjsz; /* max-object-size (in bytes) */ + unsigned int max_secondary_entries; /* maximum number of secondary entries with the same primary hash */ + uint8_t vary_processing_enabled; /* boolean : manage Vary header (disabled by default) */ + char id[33]; /* cache name */ +}; + +/* the appctx context of a cache applet, stored in appctx->svcctx */ +struct cache_appctx { + struct cache_tree *cache_tree; + struct cache_entry *entry; /* Entry to be sent from cache. */ + unsigned int sent; /* The number of bytes already sent for this cache entry. */ + unsigned int offset; /* start offset of remaining data relative to beginning of the next block */ + unsigned int rem_data; /* Remaining bytes for the last data block (HTX only, 0 means process next block) */ + unsigned int send_notmodified:1; /* In case of conditional request, we might want to send a "304 Not Modified" response instead of the stored data. */ + unsigned int unused:31; + struct shared_block *next; /* The next block of data to be sent for this cache entry. 
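/* [editor's note: illustrative only, not part of the upstream patch]
 * s30tob64()/b64tos30() above round-trip the lower 30 bits of an int
 * through exactly five unpadded base64 characters (5 x 6 bits, MSB
 * first), so for any v in [0, 1 << 30):
 */
static int demo_s30_roundtrip(int v)
{
	char buf[6];                        /* 5 chars + trailing NUL */

	return b64tos30(s30tob64(v, buf));  /* returns v again */
}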
*/ +}; + +/* cache config for filters */ +struct cache_flt_conf { + union { + struct cache *cache; /* cache used by the filter */ + char *name; /* cache name used during conf parsing */ + } c; + unsigned int flags; /* CACHE_FLT_F_* */ +}; + +/* CLI context used during "show cache" */ +struct show_cache_ctx { + struct cache *cache; + struct cache_tree *cache_tree; + uint next_key; +}; + + +/* + * Vary-related structures and functions + */ +enum vary_header_bit { + VARY_ACCEPT_ENCODING = (1 << 0), + VARY_REFERER = (1 << 1), + VARY_ORIGIN = (1 << 2), + VARY_LAST /* should always be last */ +}; + +/* + * Encoding list extracted from + * https://www.iana.org/assignments/http-parameters/http-parameters.xhtml + * and RFC7231#5.3.4. + */ +enum vary_encoding { + VARY_ENCODING_GZIP = (1 << 0), + VARY_ENCODING_DEFLATE = (1 << 1), + VARY_ENCODING_BR = (1 << 2), + VARY_ENCODING_COMPRESS = (1 << 3), + VARY_ENCODING_AES128GCM = (1 << 4), + VARY_ENCODING_EXI = (1 << 5), + VARY_ENCODING_PACK200_GZIP = (1 << 6), + VARY_ENCODING_ZSTD = (1 << 7), + VARY_ENCODING_IDENTITY = (1 << 8), + VARY_ENCODING_STAR = (1 << 9), + VARY_ENCODING_OTHER = (1 << 10) +}; + +struct vary_hashing_information { + struct ist hdr_name; /* Header name */ + enum vary_header_bit value; /* Bit representing the header in a vary signature */ + unsigned int hash_length; /* Size of the sub hash for this header's value */ + int(*norm_fn)(struct htx*,struct ist hdr_name,char* buf,unsigned int* buf_len); /* Normalization function */ + int(*cmp_fn)(const void *ref, const void *new, unsigned int len); /* Comparison function, should return 0 if the hashes are alike */ +}; + +static int http_request_prebuild_full_secondary_key(struct stream *s); +static int http_request_build_secondary_key(struct stream *s, int vary_signature); +static int http_request_reduce_secondary_key(unsigned int vary_signature, + char prebuilt_key[HTTP_CACHE_SEC_KEY_LEN]); + +static int parse_encoding_value(struct ist value, unsigned int *encoding_value, + unsigned int *has_null_weight); + +static int accept_encoding_normalizer(struct htx *htx, struct ist hdr_name, + char *buf, unsigned int *buf_len); +static int default_normalizer(struct htx *htx, struct ist hdr_name, + char *buf, unsigned int *buf_len); + +static int accept_encoding_bitmap_cmp(const void *ref, const void *new, unsigned int len); + +/* Warning : do not forget to update HTTP_CACHE_SEC_KEY_LEN when new items are + * added to this array. */ +const struct vary_hashing_information vary_information[] = { + { IST("accept-encoding"), VARY_ACCEPT_ENCODING, sizeof(uint32_t), &accept_encoding_normalizer, &accept_encoding_bitmap_cmp }, + { IST("referer"), VARY_REFERER, sizeof(uint64_t), &default_normalizer, NULL }, + { IST("origin"), VARY_ORIGIN, sizeof(uint64_t), &default_normalizer, NULL }, +}; + + +static inline void cache_rdlock(struct cache_tree *cache) +{ + HA_RWLOCK_RDLOCK(CACHE_LOCK, &cache->lock); +} + +static inline void cache_rdunlock(struct cache_tree *cache) +{ + HA_RWLOCK_RDUNLOCK(CACHE_LOCK, &cache->lock); +} + +static inline void cache_wrlock(struct cache_tree *cache) +{ + HA_RWLOCK_WRLOCK(CACHE_LOCK, &cache->lock); +} + +static inline void cache_wrunlock(struct cache_tree *cache) +{ + HA_RWLOCK_WRUNLOCK(CACHE_LOCK, &cache->lock); +} + +/* + * cache ctx for filters + */ +struct cache_st { + struct shared_block *first_block; + struct list detached_head; +}; + +#define DEFAULT_MAX_SECONDARY_ENTRY 10 + +struct cache_entry { + unsigned int complete; /* An entry won't be valid until complete is not null. 
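/* [editor's note: illustrative only, not part of the upstream patch]
 * The secondary ("vary") key is the concatenation of one fixed-size
 * sub-hash per row of vary_information[] above: sizeof(uint32_t) == 4
 * bytes for the accept-encoding bitmap, then sizeof(uint64_t) == 8
 * bytes each for the referer and origin hashes. That is why the warning
 * above insists HTTP_CACHE_SEC_KEY_LEN be kept in sync with the array.
 */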
*/ + unsigned int latest_validation; /* latest validation date */ + unsigned int expire; /* expiration date (wall clock time) */ + unsigned int age; /* Origin server "Age" header value */ + + int refcount; + + struct eb32_node eb; /* ebtree node used to hold the cache object */ + char hash[20]; + + struct list cleanup_list;/* List used between the cache_free_blocks and cache_reserve_finish calls */ + + char secondary_key[HTTP_CACHE_SEC_KEY_LEN]; /* Optional secondary key. */ + unsigned int secondary_key_signature; /* Bitfield of the HTTP headers that should be used + * to build secondary keys for this cache entry. */ + unsigned int secondary_entries_count; /* Should only be filled in the last entry of a list of dup entries */ + unsigned int last_clear_ts; /* Timestamp of the last call to clear_expired_duplicates. */ + + unsigned int etag_length; /* Length of the ETag value (if one was found in the response). */ + unsigned int etag_offset; /* Offset of the ETag value in the data buffer. */ + + time_t last_modified; /* Origin server "Last-Modified" header value converted in + * seconds since epoch. If no "Last-Modified" + * header is found, use "Date" header value, + * otherwise use reception time. This field will + * be used in case of an "If-Modified-Since"-based + * conditional request. */ + + unsigned char data[0]; +}; + +#define CACHE_BLOCKSIZE 1024 +#define CACHE_ENTRY_MAX_AGE 2147483648U + +static struct list caches = LIST_HEAD_INIT(caches); +static struct list caches_config = LIST_HEAD_INIT(caches_config); /* cache config to init */ +static struct cache *tmp_cache_config = NULL; + +DECLARE_STATIC_POOL(pool_head_cache_st, "cache_st", sizeof(struct cache_st)); + +static struct eb32_node *insert_entry(struct cache *cache, struct cache_tree *tree, struct cache_entry *new_entry); +static void delete_entry(struct cache_entry *del_entry); +static void release_entry_locked(struct cache_tree *cache, struct cache_entry *entry); +static void release_entry_unlocked(struct cache_tree *cache, struct cache_entry *entry); + +/* + * Find a cache_entry in the <cache>'s tree that has the hash <hash>. + * If <delete_expired> is 0 then the entry is left untouched if it is found but + * is already expired, and NULL is returned. Otherwise, the expired entry is + * removed from the tree and NULL is returned. + * Returns a valid (not expired) cache_tree pointer. + * The returned entry is not retained, it should be explicitly retained only + * when necessary. + * + * This function must be called under a cache lock, either read if + * delete_expired==0, write otherwise. + */ +struct cache_entry *get_entry(struct cache_tree *cache_tree, char *hash, int delete_expired) +{ + struct eb32_node *node; + struct cache_entry *entry; + + node = eb32_lookup(&cache_tree->entries, read_u32(hash)); + if (!node) + return NULL; + + entry = eb32_entry(node, struct cache_entry, eb); + + /* if that's not the right node */ + if (memcmp(entry->hash, hash, sizeof(entry->hash))) + return NULL; + + if (entry->expire > date.tv_sec) { + return entry; + } else if (delete_expired) { + release_entry_locked(cache_tree, entry); + } + return NULL; +} + +/* + * Increment a cache_entry's reference counter. + */ +static void retain_entry(struct cache_entry *entry) +{ + if (entry) + HA_ATOMIC_INC(&entry->refcount); +} + +/* + * Decrement a cache_entry's reference counter and remove it from the <cache>'s + * tree if the reference counter becomes 0. 
+ * If <needs_locking> is 0 then the cache lock was already taken by the caller, + * otherwise it must be taken in write mode before actually deleting the entry. + */ +static void release_entry(struct cache_tree *cache, struct cache_entry *entry, int needs_locking) +{ + if (!entry) + return; + + if (HA_ATOMIC_SUB_FETCH(&entry->refcount, 1) <= 0) { + if (needs_locking) { + cache_wrlock(cache); + /* The value might have changed between the last time we + * checked it and now, we need to recheck it just in + * case. + */ + if (HA_ATOMIC_LOAD(&entry->refcount) > 0) { + cache_wrunlock(cache); + return; + } + } + delete_entry(entry); + if (needs_locking) { + cache_wrunlock(cache); + } + } +} + +/* + * Decrement a cache_entry's reference counter and remove it from the <cache>'s + * tree if the reference counter becomes 0. + * This function must be called under the cache lock in write mode. + */ +static inline void release_entry_locked(struct cache_tree *cache, struct cache_entry *entry) +{ + release_entry(cache, entry, 0); +} + +/* + * Decrement a cache_entry's reference counter and remove it from the <cache>'s + * tree if the reference counter becomes 0. + * This function must not be called under the cache lock or the shctx lock. The + * cache lock might be taken in write mode (if the entry gets deleted). + */ +static inline void release_entry_unlocked(struct cache_tree *cache, struct cache_entry *entry) +{ + release_entry(cache, entry, 1); +} + + +/* + * Compare a newly built secondary key to the one found in a cache_entry. + * Every sub-part of the key is compared to the reference through the dedicated + * comparison function of the sub-part (that might do more than a simple + * memcmp). + * Returns 0 if the keys are alike. + */ +static int secondary_key_cmp(const char *ref_key, const char *new_key) +{ + int retval = 0; + size_t idx = 0; + unsigned int offset = 0; + const struct vary_hashing_information *info; + + for (idx = 0; idx < sizeof(vary_information)/sizeof(*vary_information) && !retval; ++idx) { + info = &vary_information[idx]; + + if (info->cmp_fn) + retval = info->cmp_fn(&ref_key[offset], &new_key[offset], info->hash_length); + else + retval = memcmp(&ref_key[offset], &new_key[offset], info->hash_length); + + offset += info->hash_length; + } + + return retval; +} + +/* + * There can be multiple entries with the same primary key in the ebtree so in + * order to get the proper one out of the list, we use a secondary_key. + * This function simply iterates over all the entries with the same primary_key + * until it finds the right one. + * If <delete_expired> is 0 then the entry is left untouched if it is found but + * is already expired, and NULL is returned. Otherwise, the expired entry is + * removed from the tree and NULL is returned. + * Returns the cache_entry in case of success, NULL otherwise. + * + * This function must be called under a cache lock, either read if + * delete_expired==0, write otherwise. + */ +struct cache_entry *get_secondary_entry(struct cache_tree *cache, struct cache_entry *entry, + const char *secondary_key, int delete_expired) +{ + struct eb32_node *node = &entry->eb; + + if (!entry->secondary_key_signature) + return NULL; + + while (entry && secondary_key_cmp(entry->secondary_key, secondary_key) != 0) { + node = eb32_next_dup(node); + + /* Make the best use of this iteration and clear expired entries + * when we find them. Calling delete_entry would be too costly + * so we simply call eb32_delete. 
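/* [editor's note: illustrative only, not part of the upstream patch]
 * release_entry() above is a check/lock/re-check pattern: another
 * thread may retain the entry between the atomic decrement reaching
 * zero and the write lock being taken, so the refcount is re-read
 * under the lock before delete_entry() is allowed to run.
 */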
The secondary_entry count will + * be updated when we try to insert a new entry to this list. */ + if (entry->expire <= date.tv_sec && delete_expired) { + release_entry_locked(cache, entry); + } + + entry = node ? eb32_entry(node, struct cache_entry, eb) : NULL; + } + + /* Expired entry */ + if (entry && entry->expire <= date.tv_sec) { + if (delete_expired) { + release_entry_locked(cache, entry); + } + entry = NULL; + } + + return entry; +} + +static inline struct cache_tree *get_cache_tree_from_hash(struct cache *cache, unsigned int hash) +{ + if (!cache) + return NULL; + + return &cache->trees[hash % CACHE_TREE_NUM]; +} + + +/* + * Remove all expired entries from a list of duplicates. + * Return the number of alive entries in the list and sets dup_tail to the + * current last item of the list. + * + * This function must be called under a cache write lock. + */ +static unsigned int clear_expired_duplicates(struct cache_tree *cache, struct eb32_node **dup_tail) +{ + unsigned int entry_count = 0; + struct cache_entry *entry = NULL; + struct eb32_node *prev = *dup_tail; + struct eb32_node *tail = NULL; + + while (prev) { + entry = container_of(prev, struct cache_entry, eb); + prev = eb32_prev_dup(prev); + if (entry->expire <= date.tv_sec) { + release_entry_locked(cache, entry); + } + else { + if (!tail) + tail = &entry->eb; + ++entry_count; + } + } + + *dup_tail = tail; + + return entry_count; +} + + +/* + * This function inserts a cache_entry in the cache's ebtree. In case of + * duplicate entries (vary), it then checks that the number of entries did not + * reach the max number of secondary entries. If this entry should not have been + * created, remove it. + * In the regular case (unique entries), this function does not do more than a + * simple insert. In case of secondary entries, it will at most cost an + * insertion+max_sec_entries time checks and entry deletion. + * Returns the newly inserted node in case of success, NULL otherwise. + * + * This function must be called under a cache write lock. + */ +static struct eb32_node *insert_entry(struct cache *cache, struct cache_tree *tree, struct cache_entry *new_entry) +{ + struct eb32_node *prev = NULL; + struct cache_entry *entry = NULL; + unsigned int entry_count = 0; + unsigned int last_clear_ts = date.tv_sec; + + struct eb32_node *node = eb32_insert(&tree->entries, &new_entry->eb); + + new_entry->refcount = 1; + + /* We should not have multiple entries with the same primary key unless + * the entry has a non null vary signature. */ + if (!new_entry->secondary_key_signature) + return node; + + prev = eb32_prev_dup(node); + if (prev != NULL) { + /* The last entry of a duplicate list should contain the current + * number of entries in the list. */ + entry = container_of(prev, struct cache_entry, eb); + entry_count = entry->secondary_entries_count; + last_clear_ts = entry->last_clear_ts; + + if (entry_count >= cache->max_secondary_entries) { + /* Some entries of the duplicate list might be expired so + * we will iterate over all the items in order to free some + * space. In order to avoid going over the same list too + * often, we first check the timestamp of the last check + * performed. */ + if (last_clear_ts == date.tv_sec) { + /* Too many entries for this primary key, clear the + * one that was inserted. 
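/* [editor's note: illustrative only, not part of the upstream patch]
 * get_cache_tree_from_hash() above shards entries over CACHE_TREE_NUM
 * independent ebtrees keyed by (primary hash % CACHE_TREE_NUM), each
 * protected by its own rwlock, so lookups that hit different shards
 * never contend on the same lock.
 */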
*/ + release_entry_locked(tree, entry); + return NULL; + } + + entry_count = clear_expired_duplicates(tree, &prev); + if (entry_count >= cache->max_secondary_entries) { + /* Still too many entries for this primary key, delete + * the newly inserted one. */ + entry = container_of(prev, struct cache_entry, eb); + entry->last_clear_ts = date.tv_sec; + release_entry_locked(tree, entry); + return NULL; + } + } + } + + new_entry->secondary_entries_count = entry_count + 1; + new_entry->last_clear_ts = last_clear_ts; + + return node; +} + + +/* + * This function removes an entry from the ebtree. If the entry was a duplicate + * (in case of Vary), it updates the secondary entry counter in another + * duplicate entry (the last entry of the dup list). + * + * This function must be called under a cache write lock. + */ +static void delete_entry(struct cache_entry *del_entry) +{ + struct eb32_node *prev = NULL, *next = NULL; + struct cache_entry *entry = NULL; + struct eb32_node *last = NULL; + + /* The entry might have been removed from the cache before. In such a + * case calling eb32_next_dup would crash. */ + if (del_entry->secondary_key_signature && del_entry->eb.key != 0) { + next = &del_entry->eb; + + /* Look for last entry of the duplicates list. */ + while ((next = eb32_next_dup(next))) { + last = next; + } + + if (last) { + entry = container_of(last, struct cache_entry, eb); + --entry->secondary_entries_count; + } + else { + /* The current entry is the last one, look for the + * previous one to update its counter. */ + prev = eb32_prev_dup(&del_entry->eb); + if (prev) { + entry = container_of(prev, struct cache_entry, eb); + entry->secondary_entries_count = del_entry->secondary_entries_count - 1; + } + } + } + eb32_delete(&del_entry->eb); + del_entry->eb.key = 0; +} + + +static inline struct shared_context *shctx_ptr(struct cache *cache) +{ + return (struct shared_context *)((unsigned char *)cache - offsetof(struct shared_context, data)); +} + +static inline struct shared_block *block_ptr(struct cache_entry *entry) +{ + return (struct shared_block *)((unsigned char *)entry - offsetof(struct shared_block, data)); +} + + + +static int +cache_store_init(struct proxy *px, struct flt_conf *fconf) +{ + fconf->flags |= FLT_CFG_FL_HTX; + return 0; +} + +static void +cache_store_deinit(struct proxy *px, struct flt_conf *fconf) +{ + struct cache_flt_conf *cconf = fconf->conf; + + if (!(cconf->flags & CACHE_FLT_INIT)) + free(cconf->c.name); + free(cconf); +} + +static int +cache_store_check(struct proxy *px, struct flt_conf *fconf) +{ + struct cache_flt_conf *cconf = fconf->conf; + struct flt_conf *f; + struct cache *cache; + int comp = 0; + + /* Find the cache corresponding to the name in the filter config. The + * cache will not be referenced now in the filter config because it is + * not fully allocated. This step will be performed during the cache + * post_check. + */ + list_for_each_entry(cache, &caches_config, list) { + if (strcmp(cache->id, cconf->c.name) == 0) + goto found; + } + + ha_alert("config: %s '%s': unable to find the cache '%s' referenced by the filter 'cache'.\n", + proxy_type_str(px), px->id, (char *)cconf->c.name); + return 1; + + found: + /* Here <cache> points on the cache the filter must use and <cconf> + * points on the cache filter configuration. */ + + /* Check all filters for proxy <px> to know if the compression is + * enabled and if it is after the cache. When the compression is before + * the cache, an error is returned. 
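/* [editor's note: illustrative only, not part of the upstream patch]
 * shctx_ptr()/block_ptr() above are hand-rolled forms of the classic
 * container_of(): they recover the enclosing structure from a pointer
 * to its 'data' member by subtracting the member's offset. Generic
 * shape of the idiom:
 */
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))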
Also check if the cache filter must
+	 * be explicitly declared or not. */
+	list_for_each_entry(f, &px->filter_configs, list) {
+		if (f == fconf) {
+			/* The compression filter must be evaluated after the cache. */
+			if (comp) {
+				ha_alert("config: %s '%s': unable to enable the compression filter before "
+					 "the cache '%s'.\n", proxy_type_str(px), px->id, cache->id);
+				return 1;
+			}
+		}
+		else if (f->id == http_comp_flt_id)
+			comp = 1;
+		else if (f->id == fcgi_flt_id)
+			continue;
+		else if ((f->id != fconf->id) && (cconf->flags & CACHE_FLT_F_IMPLICIT_DECL)) {
+			/* Implicit declaration is only allowed with the
+			 * compression and fcgi. For other filters, an explicit
+			 * declaration is required. */
+			ha_alert("config: %s '%s': require an explicit filter declaration "
+				 "to use the cache '%s'.\n", proxy_type_str(px), px->id, cache->id);
+			return 1;
+		}
+
+	}
+	return 0;
+}
+
+static int
+cache_store_strm_init(struct stream *s, struct filter *filter)
+{
+	struct cache_st *st;
+
+	st = pool_alloc(pool_head_cache_st);
+	if (st == NULL)
+		return -1;
+
+	st->first_block = NULL;
+	filter->ctx = st;
+
+	/* Register post-analyzer on AN_RES_WAIT_HTTP */
+	filter->post_analyzers |= AN_RES_WAIT_HTTP;
+	return 1;
+}
+
+static void
+cache_store_strm_deinit(struct stream *s, struct filter *filter)
+{
+	struct cache_st *st = filter->ctx;
+	struct cache_flt_conf *cconf = FLT_CONF(filter);
+	struct cache *cache = cconf->c.cache;
+	struct shared_context *shctx = shctx_ptr(cache);
+
+	/* Everything should be released in the http_end filter, but we need to do it
+	 * there too, in case of errors */
+	if (st && st->first_block) {
+		struct cache_entry *object = (struct cache_entry *)st->first_block->data;
+		if (!object->complete) {
+			/* The stream was closed but the 'complete' flag was not
+			 * set which means that cache_store_http_end was not
+			 * called. The stream must have been closed before we
+			 * could store the full answer in the cache.
+			 */
+			release_entry_unlocked(&cache->trees[object->eb.key % CACHE_TREE_NUM], object);
+		}
+		shctx_wrlock(shctx);
+		shctx_row_reattach(shctx, st->first_block);
+		shctx_wrunlock(shctx);
+	}
+	if (st) {
+		pool_free(pool_head_cache_st, st);
+		filter->ctx = NULL;
+	}
+}
+
+static int
+cache_store_post_analyze(struct stream *s, struct filter *filter, struct channel *chn,
+			 unsigned an_bit)
+{
+	struct http_txn *txn = s->txn;
+	struct http_msg *msg = &txn->rsp;
+	struct cache_st *st = filter->ctx;
+
+	if (an_bit != AN_RES_WAIT_HTTP)
+		goto end;
+
+	/* Here we need to check if any compression filter precedes the cache
+	 * filter. This is only possible when the compression is configured in
+	 * the frontend while the cache filter is configured on the
+	 * backend. This case cannot be detected during HAProxy startup. So in
+	 * such cases, the cache is disabled.
+ */ + if (st && (msg->flags & HTTP_MSGF_COMPRESSING)) { + pool_free(pool_head_cache_st, st); + filter->ctx = NULL; + } + + end: + return 1; +} + +static int +cache_store_http_headers(struct stream *s, struct filter *filter, struct http_msg *msg) +{ + struct cache_st *st = filter->ctx; + + if (!(msg->chn->flags & CF_ISRESP) || !st) + return 1; + + if (st->first_block) + register_data_filter(s, msg->chn, filter); + return 1; +} + +static inline void disable_cache_entry(struct cache_st *st, + struct filter *filter, struct shared_context *shctx) +{ + struct cache_entry *object; + struct cache *cache = (struct cache*)shctx->data; + + object = (struct cache_entry *)st->first_block->data; + filter->ctx = NULL; /* disable cache */ + release_entry_unlocked(&cache->trees[object->eb.key % CACHE_TREE_NUM], object); + shctx_wrlock(shctx); + shctx_row_reattach(shctx, st->first_block); + shctx_wrunlock(shctx); + pool_free(pool_head_cache_st, st); +} + +static int +cache_store_http_payload(struct stream *s, struct filter *filter, struct http_msg *msg, + unsigned int offset, unsigned int len) +{ + struct cache_flt_conf *cconf = FLT_CONF(filter); + struct shared_context *shctx = shctx_ptr(cconf->c.cache); + struct cache_st *st = filter->ctx; + struct htx *htx = htxbuf(&msg->chn->buf); + struct htx_blk *blk; + struct shared_block *fb; + struct htx_ret htxret; + unsigned int orig_len, to_forward; + int ret; + + if (!len) + return len; + + if (!st->first_block) { + unregister_data_filter(s, msg->chn, filter); + return len; + } + + chunk_reset(&trash); + orig_len = len; + to_forward = 0; + + htxret = htx_find_offset(htx, offset); + blk = htxret.blk; + offset = htxret.ret; + for (; blk && len; blk = htx_get_next_blk(htx, blk)) { + enum htx_blk_type type = htx_get_blk_type(blk); + uint32_t info, sz = htx_get_blksz(blk); + struct ist v; + + switch (type) { + case HTX_BLK_UNUSED: + break; + + case HTX_BLK_DATA: + v = htx_get_blk_value(htx, blk); + v = istadv(v, offset); + v = isttrim(v, len); + + info = (type << 28) + v.len; + chunk_memcat(&trash, (char *)&info, sizeof(info)); + chunk_istcat(&trash, v); + to_forward += v.len; + len -= v.len; + break; + + default: + /* Here offset must always be 0 because only + * DATA blocks can be partially transferred. */ + if (offset) + goto no_cache; + if (sz > len) + goto end; + + chunk_memcat(&trash, (char *)&blk->info, sizeof(blk->info)); + chunk_memcat(&trash, htx_get_blk_ptr(htx, blk), sz); + to_forward += sz; + len -= sz; + break; + } + + offset = 0; + } + + end: + + fb = shctx_row_reserve_hot(shctx, st->first_block, trash.data); + if (!fb) { + goto no_cache; + } + + ret = shctx_row_data_append(shctx, st->first_block, + (unsigned char *)b_head(&trash), b_data(&trash)); + if (ret < 0) + goto no_cache; + + return to_forward; + + no_cache: + disable_cache_entry(st, filter, shctx); + unregister_data_filter(s, msg->chn, filter); + return orig_len; +} + +static int +cache_store_http_end(struct stream *s, struct filter *filter, + struct http_msg *msg) +{ + struct cache_st *st = filter->ctx; + struct cache_flt_conf *cconf = FLT_CONF(filter); + struct cache *cache = cconf->c.cache; + struct shared_context *shctx = shctx_ptr(cache); + struct cache_entry *object; + + if (!(msg->chn->flags & CF_ISRESP)) + return 1; + + if (st && st->first_block) { + + object = (struct cache_entry *)st->first_block->data; + + shctx_wrlock(shctx); + /* The whole payload was cached, the entry can now be used. 
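/* [editor's usage note: illustrative only, not part of the upstream patch]
 * Two complementary guards appear above: cache_store_check() rejects at
 * startup a configuration where "filter compression" precedes "filter
 * cache <name>" inside the same proxy, while cache_store_post_analyze()
 * handles the case the parser cannot see (compression in the frontend,
 * cache in the backend) by silently disabling caching for the stream.
 */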
*/ + object->complete = 1; + /* remove from the hotlist */ + shctx_row_reattach(shctx, st->first_block); + shctx_wrunlock(shctx); + + } + if (st) { + pool_free(pool_head_cache_st, st); + filter->ctx = NULL; + } + + return 1; +} + + /* + * This intends to be used when checking HTTP headers for some + * word=value directive. Return a pointer to the first character of value, if + * the word was not found or if there wasn't any value assigned to it return NULL + */ +char *directive_value(const char *sample, int slen, const char *word, int wlen) +{ + int st = 0; + + if (slen < wlen) + return 0; + + while (wlen) { + char c = *sample ^ *word; + if (c && c != ('A' ^ 'a')) + return NULL; + sample++; + word++; + slen--; + wlen--; + } + + while (slen) { + if (st == 0) { + if (*sample != '=') + return NULL; + sample++; + slen--; + st = 1; + continue; + } else { + return (char *)sample; + } + } + + return NULL; +} + +/* + * Return the maxage in seconds of an HTTP response. + * The returned value will always take the cache's configuration into account + * (cache->maxage) but the actual max age of the response will be set in the + * true_maxage parameter. It will be used to determine if a response is already + * stale or not. + * Compute the maxage using either: + * - the assigned max-age of the cache + * - the s-maxage directive + * - the max-age directive + * - (Expires - Data) headers + * - the default-max-age of the cache + * + */ +int http_calc_maxage(struct stream *s, struct cache *cache, int *true_maxage) +{ + struct htx *htx = htxbuf(&s->res.buf); + struct http_hdr_ctx ctx = { .blk = NULL }; + long smaxage = -1; + long maxage = -1; + int expires = -1; + struct tm tm = {}; + time_t expires_val = 0; + char *endptr = NULL; + int offset = 0; + + /* The Cache-Control max-age and s-maxage directives should be followed by + * a positive numerical value (see RFC 7234#5.2.1.1). According to the + * specs, a sender "should not" generate a quoted-string value but we will + * still accept this format since it isn't strictly forbidden. */ + while (http_find_header(htx, ist("cache-control"), &ctx, 0)) { + char *value; + + value = directive_value(ctx.value.ptr, ctx.value.len, "s-maxage", 8); + if (value) { + struct buffer *chk = get_trash_chunk(); + + chunk_memcat(chk, value, ctx.value.len - 8 + 1); + chunk_memcat(chk, "", 1); + offset = (*chk->area == '"') ? 1 : 0; + smaxage = strtol(chk->area + offset, &endptr, 10); + if (unlikely(smaxage < 0 || endptr == chk->area + offset)) + return -1; + } + + value = directive_value(ctx.value.ptr, ctx.value.len, "max-age", 7); + if (value) { + struct buffer *chk = get_trash_chunk(); + + chunk_memcat(chk, value, ctx.value.len - 7 + 1); + chunk_memcat(chk, "", 1); + offset = (*chk->area == '"') ? 1 : 0; + maxage = strtol(chk->area + offset, &endptr, 10); + if (unlikely(maxage < 0 || endptr == chk->area + offset)) + return -1; + } + } + + /* Look for Expires header if no s-maxage or max-age Cache-Control data + * was found. */ + if (maxage == -1 && smaxage == -1) { + ctx.blk = NULL; + if (http_find_header(htx, ist("expires"), &ctx, 1)) { + if (parse_http_date(istptr(ctx.value), istlen(ctx.value), &tm)) { + expires_val = my_timegm(&tm); + /* A request having an expiring date earlier + * than the current date should be considered as + * stale. */ + expires = (expires_val >= date.tv_sec) ? 
+ (expires_val - date.tv_sec) : 0; + } + else { + /* Following RFC 7234#5.3, an invalid date + * format must be treated as a date in the past + * so the cache entry must be seen as already + * expired. */ + expires = 0; + } + } + } + + + if (smaxage > 0) { + if (true_maxage) + *true_maxage = smaxage; + return MIN(smaxage, cache->maxage); + } + + if (maxage > 0) { + if (true_maxage) + *true_maxage = maxage; + return MIN(maxage, cache->maxage); + } + + if (expires >= 0) { + if (true_maxage) + *true_maxage = expires; + return MIN(expires, cache->maxage); + } + + return cache->maxage; + +} + + +static void cache_free_blocks(struct shared_block *first, void *data) +{ + struct cache_entry *object = (struct cache_entry *)first->data; + struct cache *cache = (struct cache *)data; + struct cache_tree *cache_tree; + + if (object->eb.key) { + object->complete = 0; + cache_tree = &cache->trees[object->eb.key % CACHE_TREE_NUM]; + retain_entry(object); + HA_SPIN_LOCK(CACHE_LOCK, &cache_tree->cleanup_lock); + LIST_INSERT(&cache_tree->cleanup_list, &object->cleanup_list); + HA_SPIN_UNLOCK(CACHE_LOCK, &cache_tree->cleanup_lock); + } +} + +static void cache_reserve_finish(struct shared_context *shctx) +{ + struct cache_entry *object, *back; + struct cache *cache = (struct cache *)shctx->data; + struct cache_tree *cache_tree; + int cache_tree_idx = 0; + + for (; cache_tree_idx < CACHE_TREE_NUM; ++cache_tree_idx) { + cache_tree = &cache->trees[cache_tree_idx]; + + cache_wrlock(cache_tree); + HA_SPIN_LOCK(CACHE_LOCK, &cache_tree->cleanup_lock); + + list_for_each_entry_safe(object, back, &cache_tree->cleanup_list, cleanup_list) { + LIST_DELETE(&object->cleanup_list); + /* + * At this point we locked the cache tree in write mode + * so no new thread could retain the current entry + * because the only two places where it can happen is in + * the cache_use case which is under cache_rdlock and + * the reserve_hot case which would require the + * corresponding block to still be in the avail list, + * which is impossible (we reserved it for a thread and + * took it out of the avail list already). The only two + * references are then the default one (upon cache_entry + * creation) and the one in this cleanup list. + */ + BUG_ON(object->refcount > 2); + delete_entry(object); + } + + HA_SPIN_UNLOCK(CACHE_LOCK, &cache_tree->cleanup_lock); + cache_wrunlock(cache_tree); + } +} + + +/* As per RFC 7234#4.3.2, in case of "If-Modified-Since" conditional request, the + * date value should be compared to a date determined by in a previous response (for + * the same entity). This date could either be the "Last-Modified" value, or the "Date" + * value of the response's reception time (by decreasing order of priority). */ +static time_t get_last_modified_time(struct htx *htx) +{ + time_t last_modified = 0; + struct http_hdr_ctx ctx = { .blk = NULL }; + struct tm tm = {}; + + if (http_find_header(htx, ist("last-modified"), &ctx, 1)) { + if (parse_http_date(istptr(ctx.value), istlen(ctx.value), &tm)) { + last_modified = my_timegm(&tm); + } + } + + if (!last_modified) { + ctx.blk = NULL; + if (http_find_header(htx, ist("date"), &ctx, 1)) { + if (parse_http_date(istptr(ctx.value), istlen(ctx.value), &tm)) { + last_modified = my_timegm(&tm); + } + } + } + + /* Fallback on the current time if no "Last-Modified" or "Date" header + * was found. */ + if (!last_modified) + last_modified = date.tv_sec; + + return last_modified; +} + +/* + * Checks the vary header's value. 
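/* [editor's note: illustrative only, not part of the upstream patch]
 * Precedence implemented above: s-maxage, then max-age, then
 * (Expires - now), then the cache's default, with every result clamped
 * by the cache's configured maxage. Example: a response carrying
 * "Cache-Control: max-age=600, s-maxage=60" in a cache whose maxage is
 * 300 is stored for MIN(60, 300) == 60 seconds, and *true_maxage is set
 * to 60 for the later staleness check.
 */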
The headers on which vary should be applied + * must be explicitly supported in the vary_information array (see cache.c). If + * any other header is mentioned, we won't store the response. + * Returns 1 if Vary-based storage can work, 0 otherwise. + */ +static int http_check_vary_header(struct htx *htx, unsigned int *vary_signature) +{ + unsigned int vary_idx; + unsigned int vary_info_count; + const struct vary_hashing_information *vary_info; + struct http_hdr_ctx ctx = { .blk = NULL }; + + int retval = 1; + + *vary_signature = 0; + + vary_info_count = sizeof(vary_information)/sizeof(*vary_information); + while (retval && http_find_header(htx, ist("Vary"), &ctx, 0)) { + for (vary_idx = 0; vary_idx < vary_info_count; ++vary_idx) { + vary_info = &vary_information[vary_idx]; + if (isteqi(ctx.value, vary_info->hdr_name)) { + *vary_signature |= vary_info->value; + break; + } + } + retval = (vary_idx < vary_info_count); + } + + return retval; +} + + +/* + * Look for the accept-encoding part of the secondary_key and replace the + * encoding bitmap part of the hash with the actual encoding of the response, + * extracted from the content-encoding header value. + * Responses that have an unknown encoding will not be cached if they also + * "vary" on the accept-encoding value. + * Returns 0 if we found a known encoding in the response, -1 otherwise. + */ +static int set_secondary_key_encoding(struct htx *htx, char *secondary_key) +{ + unsigned int resp_encoding_bitmap = 0; + const struct vary_hashing_information *info = vary_information; + unsigned int offset = 0; + unsigned int count = 0; + unsigned int hash_info_count = sizeof(vary_information)/sizeof(*vary_information); + unsigned int encoding_value; + struct http_hdr_ctx ctx = { .blk = NULL }; + + /* Look for the accept-encoding part of the secondary_key. */ + while (count < hash_info_count && info->value != VARY_ACCEPT_ENCODING) { + offset += info->hash_length; + ++info; + ++count; + } + + if (count == hash_info_count) + return -1; + + while (http_find_header(htx, ist("content-encoding"), &ctx, 0)) { + if (parse_encoding_value(ctx.value, &encoding_value, NULL)) + return -1; /* Do not store responses with an unknown encoding */ + resp_encoding_bitmap |= encoding_value; + } + + if (!resp_encoding_bitmap) + resp_encoding_bitmap |= VARY_ENCODING_IDENTITY; + + /* Rewrite the bitmap part of the hash with the new bitmap that only + * corresponds the the response's encoding. 
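/* [editor's usage note: illustrative only, not part of the upstream patch]
 * Only the headers present in vary_information[] (accept-encoding,
 * referer, origin) are allowed in a response's Vary header: a response
 * with "Vary: accept-encoding" stays cacheable, while one with
 * "Vary: user-agent" makes http_check_vary_header() return 0 and is
 * not stored.
 */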
*/ + write_u32(secondary_key + offset, resp_encoding_bitmap); + + return 0; +} + + +/* + * This function will store the headers of the response in a buffer and then + * register a filter to store the data + */ +enum act_return http_action_store_cache(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + int effective_maxage = 0; + int true_maxage = 0; + struct http_txn *txn = s->txn; + struct http_msg *msg = &txn->rsp; + struct filter *filter; + struct shared_block *first = NULL; + struct cache_flt_conf *cconf = rule->arg.act.p[0]; + struct cache *cache = cconf->c.cache; + struct shared_context *shctx = shctx_ptr(cache); + struct cache_st *cache_ctx = NULL; + struct cache_entry *object, *old; + unsigned int key = read_u32(txn->cache_hash); + struct htx *htx; + struct http_hdr_ctx ctx; + size_t hdrs_len = 0; + int32_t pos; + unsigned int vary_signature = 0; + struct cache_tree *cache_tree = NULL; + + /* Don't cache if the response came from a cache */ + if ((obj_type(s->target) == OBJ_TYPE_APPLET) && + s->target == &http_cache_applet.obj_type) { + goto out; + } + + /* cache only HTTP/1.1 */ + if (!(txn->req.flags & HTTP_MSGF_VER_11)) + goto out; + + cache_tree = get_cache_tree_from_hash(cache, read_u32(txn->cache_hash)); + + /* cache only GET method */ + if (txn->meth != HTTP_METH_GET) { + /* In case of successful unsafe method on a stored resource, the + * cached entry must be invalidated (see RFC7234#4.4). + * A "non-error response" is one with a 2xx (Successful) or 3xx + * (Redirection) status code. */ + if (txn->status >= 200 && txn->status < 400) { + switch (txn->meth) { + case HTTP_METH_OPTIONS: + case HTTP_METH_GET: + case HTTP_METH_HEAD: + case HTTP_METH_TRACE: + break; + + default: /* Any unsafe method */ + /* Discard any corresponding entry in case of successful + * unsafe request (such as PUT, POST or DELETE). */ + cache_wrlock(cache_tree); + + old = get_entry(cache_tree, txn->cache_hash, 1); + if (old) + release_entry_locked(cache_tree, old); + cache_wrunlock(cache_tree); + } + } + goto out; + } + + /* cache key was not computed */ + if (!key) + goto out; + + /* cache only 200 status code */ + if (txn->status != 200) + goto out; + + /* Find the corresponding filter instance for the current stream */ + list_for_each_entry(filter, &s->strm_flt.filters, list) { + if (FLT_ID(filter) == cache_store_flt_id && FLT_CONF(filter) == cconf) { + /* No filter ctx, don't cache anything */ + if (!filter->ctx) + goto out; + cache_ctx = filter->ctx; + break; + } + } + + /* from there, cache_ctx is always defined */ + htx = htxbuf(&s->res.buf); + + /* Do not cache too big objects. */ + if ((msg->flags & HTTP_MSGF_CNT_LEN) && shctx->max_obj_size > 0 && + htx->data + htx->extra > shctx->max_obj_size) + goto out; + + /* Only a subset of headers are supported in our Vary implementation. If + * any other header is present in the Vary header value, we won't be + * able to use the cache. Likewise, if Vary header support is disabled, + * avoid caching responses that contain such a header. */ + ctx.blk = NULL; + if (cache->vary_processing_enabled) { + if (!http_check_vary_header(htx, &vary_signature)) + goto out; + if (vary_signature) { + /* If something went wrong during the secondary key + * building, do not store the response. 
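+ * (TX_CACHE_HAS_SEC_KEY is only set once every normalizing function
+ * succeeded; a request carrying too many accept-encoding values, for
+ * instance, ends up without a usable secondary key.)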
*/ + if (!(txn->flags & TX_CACHE_HAS_SEC_KEY)) + goto out; + http_request_reduce_secondary_key(vary_signature, txn->cache_secondary_hash); + } + } + else if (http_find_header(htx, ist("Vary"), &ctx, 0)) { + goto out; + } + + http_check_response_for_cacheability(s, &s->res); + + if (!(txn->flags & TX_CACHEABLE) || !(txn->flags & TX_CACHE_COOK)) + goto out; + + cache_wrlock(cache_tree); + old = get_entry(cache_tree, txn->cache_hash, 1); + if (old) { + if (vary_signature) + old = get_secondary_entry(cache_tree, old, + txn->cache_secondary_hash, 1); + if (old) { + if (!old->complete) { + /* An entry with the same primary key is already being + * created, we should not try to store the current + * response because it will waste space in the cache. */ + cache_wrunlock(cache_tree); + goto out; + } + release_entry_locked(cache_tree, old); + } + } + cache_wrunlock(cache_tree); + + first = shctx_row_reserve_hot(shctx, NULL, sizeof(struct cache_entry)); + if (!first) { + goto out; + } + + /* the received memory is not initialized, we need at least to mark + * the object as not indexed yet. + */ + object = (struct cache_entry *)first->data; + memset(object, 0, sizeof(*object)); + object->eb.key = key; + object->secondary_key_signature = vary_signature; + /* We need to temporarily set a valid expiring time until the actual one + * is set by the end of this function (in case of concurrent accesses to + * the same resource). This way the second access will find an existing + * but not yet usable entry in the tree and will avoid storing its data. */ + object->expire = date.tv_sec + 2; + + memcpy(object->hash, txn->cache_hash, sizeof(object->hash)); + if (vary_signature) + memcpy(object->secondary_key, txn->cache_secondary_hash, HTTP_CACHE_SEC_KEY_LEN); + + cache_wrlock(cache_tree); + /* Insert the entry in the tree even if the payload is not cached yet. */ + if (insert_entry(cache, cache_tree, object) != &object->eb) { + object->eb.key = 0; + cache_wrunlock(cache_tree); + goto out; + } + cache_wrunlock(cache_tree); + + /* reserve space for the cache_entry structure */ + first->len = sizeof(struct cache_entry); + first->last_append = NULL; + + /* Determine the entry's maximum age (taking into account the cache's + * configuration) as well as the response's explicit max age (extracted + * from cache-control directives or the expires header). */ + effective_maxage = http_calc_maxage(s, cache, &true_maxage); + + ctx.blk = NULL; + if (http_find_header(htx, ist("Age"), &ctx, 0)) { + long long hdr_age; + if (!strl2llrc(ctx.value.ptr, ctx.value.len, &hdr_age) && hdr_age > 0) { + if (unlikely(hdr_age > CACHE_ENTRY_MAX_AGE)) + hdr_age = CACHE_ENTRY_MAX_AGE; + /* A response with an Age value greater than its + * announced max age is stale and should not be stored. */ + object->age = hdr_age; + if (unlikely(object->age > true_maxage)) + goto out; + } + else + goto out; + http_remove_header(htx, &ctx); + } + + /* Build a last-modified time that will be stored in the cache_entry and + * compared to a future If-Modified-Since client header. */ + object->last_modified = get_last_modified_time(htx); + + chunk_reset(&trash); + for (pos = htx_get_first(htx); pos != -1; pos = htx_get_next(htx, pos)) { + struct htx_blk *blk = htx_get_blk(htx, pos); + enum htx_blk_type type = htx_get_blk_type(blk); + uint32_t sz = htx_get_blksz(blk); + + hdrs_len += sizeof(*blk) + sz; + chunk_memcat(&trash, (char *)&blk->info, sizeof(blk->info)); + chunk_memcat(&trash, htx_get_blk_ptr(htx, blk), sz); + + /* Look for optional ETag header. 
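+ * Both strong and weak validators may be stored here; the comparison with
+ * a future If-None-Match value is performed later by http_compare_etags().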
+ * We need to store the offset of the ETag value in order for + * future conditional requests to be able to perform ETag + * comparisons. */ + if (type == HTX_BLK_HDR) { + struct ist header_name = htx_get_blk_name(htx, blk); + if (isteq(header_name, ist("etag"))) { + object->etag_length = sz - istlen(header_name); + object->etag_offset = sizeof(struct cache_entry) + b_data(&trash) - sz + istlen(header_name); + } + } + if (type == HTX_BLK_EOH) + break; + } + + /* Do not cache objects if the headers are too big. */ + if (hdrs_len > htx->size - global.tune.maxrewrite) + goto out; + + /* If the response has a secondary_key, fill its key part related to + * encodings with the actual encoding of the response. This way any + * subsequent request having the same primary key will have its accepted + * encodings tested upon the cached response's one. + * We will not cache a response that has an unknown encoding (not + * explicitly supported in parse_encoding_value function). */ + if (cache->vary_processing_enabled && vary_signature) + if (set_secondary_key_encoding(htx, object->secondary_key)) + goto out; + + if (!shctx_row_reserve_hot(shctx, first, trash.data)) { + goto out; + } + + /* cache the headers in a http action because it allows to chose what + * to cache, for example you might want to cache a response before + * modifying some HTTP headers, or on the contrary after modifying + * those headers. + */ + /* does not need to be locked because it's in the "hot" list, + * copy the headers */ + if (shctx_row_data_append(shctx, first, (unsigned char *)trash.area, trash.data) < 0) + goto out; + + /* register the buffer in the filter ctx for filling it with data*/ + if (cache_ctx) { + cache_ctx->first_block = first; + LIST_INIT(&cache_ctx->detached_head); + /* store latest value and expiration time */ + object->latest_validation = date.tv_sec; + object->expire = date.tv_sec + effective_maxage; + return ACT_RET_CONT; + } + +out: + /* if does not cache */ + if (first) { + first->len = 0; + if (object->eb.key) { + release_entry_unlocked(cache_tree, object); + } + shctx_wrlock(shctx); + shctx_row_reattach(shctx, first); + shctx_wrunlock(shctx); + } + + return ACT_RET_CONT; +} + +#define HTX_CACHE_INIT 0 /* Initial state. */ +#define HTX_CACHE_HEADER 1 /* Cache entry headers forwarding */ +#define HTX_CACHE_DATA 2 /* Cache entry data forwarding */ +#define HTX_CACHE_EOM 3 /* Cache entry completely forwarded. 
Finish the HTX message */ +#define HTX_CACHE_END 4 /* Cache entry treatment terminated */ + +static void http_cache_applet_release(struct appctx *appctx) +{ + struct cache_appctx *ctx = appctx->svcctx; + struct cache_flt_conf *cconf = appctx->rule->arg.act.p[0]; + struct cache_entry *cache_ptr = ctx->entry; + struct cache *cache = cconf->c.cache; + struct shared_context *shctx = shctx_ptr(cache); + struct shared_block *first = block_ptr(cache_ptr); + + release_entry(ctx->cache_tree, cache_ptr, 1); + + shctx_wrlock(shctx); + shctx_row_reattach(shctx, first); + shctx_wrunlock(shctx); +} + + +static unsigned int htx_cache_dump_blk(struct appctx *appctx, struct htx *htx, enum htx_blk_type type, + uint32_t info, struct shared_block *shblk, unsigned int offset) +{ + struct cache_appctx *ctx = appctx->svcctx; + struct cache_flt_conf *cconf = appctx->rule->arg.act.p[0]; + struct shared_context *shctx = shctx_ptr(cconf->c.cache); + struct htx_blk *blk; + char *ptr; + unsigned int max, total; + uint32_t blksz; + + max = htx_get_max_blksz(htx, + channel_htx_recv_max(sc_ic(appctx_sc(appctx)), htx)); + if (!max) + return 0; + blksz = ((type == HTX_BLK_HDR || type == HTX_BLK_TLR) + ? (info & 0xff) + ((info >> 8) & 0xfffff) + : info & 0xfffffff); + if (blksz > max) + return 0; + + blk = htx_add_blk(htx, type, blksz); + if (!blk) + return 0; + + blk->info = info; + total = 4; + ptr = htx_get_blk_ptr(htx, blk); + while (blksz) { + max = MIN(blksz, shctx->block_size - offset); + memcpy(ptr, (const char *)shblk->data + offset, max); + offset += max; + blksz -= max; + total += max; + ptr += max; + if (blksz || offset == shctx->block_size) { + shblk = LIST_NEXT(&shblk->list, typeof(shblk), list); + offset = 0; + } + } + ctx->offset = offset; + ctx->next = shblk; + ctx->sent += total; + return total; +} + +static unsigned int htx_cache_dump_data_blk(struct appctx *appctx, struct htx *htx, + uint32_t info, struct shared_block *shblk, unsigned int offset) +{ + struct cache_appctx *ctx = appctx->svcctx; + struct cache_flt_conf *cconf = appctx->rule->arg.act.p[0]; + struct shared_context *shctx = shctx_ptr(cconf->c.cache); + unsigned int max, total, rem_data; + uint32_t blksz; + + max = htx_get_max_blksz(htx, + channel_htx_recv_max(sc_ic(appctx_sc(appctx)), htx)); + if (!max) + return 0; + + rem_data = 0; + if (ctx->rem_data) { + blksz = ctx->rem_data; + total = 0; + } + else { + blksz = (info & 0xfffffff); + total = 4; + } + if (blksz > max) { + rem_data = blksz - max; + blksz = max; + } + + while (blksz) { + size_t sz; + + max = MIN(blksz, shctx->block_size - offset); + sz = htx_add_data(htx, ist2(shblk->data + offset, max)); + offset += sz; + blksz -= sz; + total += sz; + if (sz < max) + break; + if (blksz || offset == shctx->block_size) { + shblk = LIST_NEXT(&shblk->list, typeof(shblk), list); + offset = 0; + } + } + + ctx->offset = offset; + ctx->next = shblk; + ctx->sent += total; + ctx->rem_data = rem_data + blksz; + return total; +} + +static size_t htx_cache_dump_msg(struct appctx *appctx, struct htx *htx, unsigned int len, + enum htx_blk_type mark) +{ + struct cache_appctx *ctx = appctx->svcctx; + struct cache_flt_conf *cconf = appctx->rule->arg.act.p[0]; + struct shared_context *shctx = shctx_ptr(cconf->c.cache); + struct shared_block *shblk; + unsigned int offset, sz; + unsigned int ret, total = 0; + + while (len) { + enum htx_blk_type type; + uint32_t info; + + shblk = ctx->next; + offset = ctx->offset; + if (ctx->rem_data) { + type = HTX_BLK_DATA; + info = 0; + goto add_data_blk; + } + + /* Get info 
of the next HTX block. May be split on 2 shblk */ + sz = MIN(4, shctx->block_size - offset); + memcpy((char *)&info, (const char *)shblk->data + offset, sz); + offset += sz; + if (sz < 4) { + shblk = LIST_NEXT(&shblk->list, typeof(shblk), list); + memcpy(((char *)&info)+sz, (const char *)shblk->data, 4 - sz); + offset = (4 - sz); + } + + /* Get payload of the next HTX block and insert it. */ + type = (info >> 28); + if (type != HTX_BLK_DATA) + ret = htx_cache_dump_blk(appctx, htx, type, info, shblk, offset); + else { + add_data_blk: + ret = htx_cache_dump_data_blk(appctx, htx, info, shblk, offset); + } + + if (!ret) + break; + total += ret; + len -= ret; + + if (ctx->rem_data || type == mark) + break; + } + + return total; +} + +static int htx_cache_add_age_hdr(struct appctx *appctx, struct htx *htx) +{ + struct cache_appctx *ctx = appctx->svcctx; + struct cache_entry *cache_ptr = ctx->entry; + unsigned int age; + char *end; + + chunk_reset(&trash); + age = MAX(0, (int)(date.tv_sec - cache_ptr->latest_validation)) + cache_ptr->age; + if (unlikely(age > CACHE_ENTRY_MAX_AGE)) + age = CACHE_ENTRY_MAX_AGE; + end = ultoa_o(age, b_head(&trash), b_size(&trash)); + b_set_data(&trash, end - b_head(&trash)); + if (!http_add_header(htx, ist("Age"), ist2(b_head(&trash), b_data(&trash)))) + return 0; + return 1; +} + +static void http_cache_io_handler(struct appctx *appctx) +{ + struct cache_appctx *ctx = appctx->svcctx; + struct cache_entry *cache_ptr = ctx->entry; + struct shared_block *first = block_ptr(cache_ptr); + struct stconn *sc = appctx_sc(appctx); + struct channel *req = sc_oc(sc); + struct channel *res = sc_ic(sc); + struct htx *req_htx, *res_htx; + struct buffer *errmsg; + unsigned int len; + size_t ret, total = 0; + + res_htx = htx_from_buf(&res->buf); + total = res_htx->data; + + if (unlikely(se_fl_test(appctx->sedesc, (SE_FL_EOS|SE_FL_ERROR|SE_FL_SHR|SE_FL_SHW)))) + goto out; + + /* Check if the input buffer is available. */ + if (!b_size(&res->buf)) { + sc_need_room(sc, 0); + goto out; + } + + if (appctx->st0 == HTX_CACHE_INIT) { + ctx->next = block_ptr(cache_ptr); + ctx->offset = sizeof(*cache_ptr); + ctx->sent = 0; + ctx->rem_data = 0; + appctx->st0 = HTX_CACHE_HEADER; + } + + if (appctx->st0 == HTX_CACHE_HEADER) { + /* Headers must be dump at once. Otherwise it is an error */ + len = first->len - sizeof(*cache_ptr) - ctx->sent; + ret = htx_cache_dump_msg(appctx, res_htx, len, HTX_BLK_EOH); + if (!ret || (htx_get_tail_type(res_htx) != HTX_BLK_EOH) || + !htx_cache_add_age_hdr(appctx, res_htx)) + goto error; + + /* In case of a conditional request, we might want to send a + * "304 Not Modified" response instead of the stored data. */ + if (ctx->send_notmodified) { + if (!http_replace_res_status(res_htx, ist("304"), ist("Not Modified"))) { + /* If replacing the status code fails we need to send the full response. */ + ctx->send_notmodified = 0; + } + } + + /* Skip response body for HEAD requests or in case of "304 Not + * Modified" response. */ + if (__sc_strm(sc)->txn->meth == HTTP_METH_HEAD || ctx->send_notmodified) + appctx->st0 = HTX_CACHE_EOM; + else + appctx->st0 = HTX_CACHE_DATA; + } + + if (appctx->st0 == HTX_CACHE_DATA) { + len = first->len - sizeof(*cache_ptr) - ctx->sent; + if (len) { + ret = htx_cache_dump_msg(appctx, res_htx, len, HTX_BLK_UNUSED); + if (ret < len) { + sc_need_room(sc, channel_htx_recv_max(res, res_htx) + 1); + goto out; + } + } + appctx->st0 = HTX_CACHE_EOM; + } + + if (appctx->st0 == HTX_CACHE_EOM) { + /* no more data are expected. 
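+ * The applet has now walked the whole HTX_CACHE_INIT -> HTX_CACHE_HEADER
+ * -> (HTX_CACHE_DATA) -> HTX_CACHE_EOM chain; mark the message as
+ * complete before switching to HTX_CACHE_END.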
*/ + res_htx->flags |= HTX_FL_EOM; + se_fl_set(appctx->sedesc, SE_FL_EOI); + + appctx->st0 = HTX_CACHE_END; + } + + end: + if (appctx->st0 == HTX_CACHE_END) + se_fl_set(appctx->sedesc, SE_FL_EOS); + + out: + total = res_htx->data - total; + if (total) + channel_add_input(res, total); + htx_to_buf(res_htx, &res->buf); + + /* eat the whole request */ + if (co_data(req)) { + req_htx = htx_from_buf(&req->buf); + co_htx_skip(req, req_htx, co_data(req)); + htx_to_buf(req_htx, &req->buf); + } + return; + + error: + /* Sent and HTTP error 500 */ + b_reset(&res->buf); + errmsg = &http_err_chunks[HTTP_ERR_500]; + res->buf.data = b_data(errmsg); + memcpy(res->buf.area, b_head(errmsg), b_data(errmsg)); + res_htx = htx_from_buf(&res->buf); + + total = 0; + se_fl_set(appctx->sedesc, SE_FL_ERROR); + appctx->st0 = HTX_CACHE_END; + goto end; +} + + +static int parse_cache_rule(struct proxy *proxy, const char *name, struct act_rule *rule, char **err) +{ + struct flt_conf *fconf; + struct cache_flt_conf *cconf = NULL; + + if (!*name || strcmp(name, "if") == 0 || strcmp(name, "unless") == 0) { + memprintf(err, "expects a cache name"); + goto err; + } + + /* check if a cache filter was already registered with this cache + * name, if that's the case, must use it. */ + list_for_each_entry(fconf, &proxy->filter_configs, list) { + if (fconf->id == cache_store_flt_id) { + cconf = fconf->conf; + if (cconf && strcmp((char *)cconf->c.name, name) == 0) { + rule->arg.act.p[0] = cconf; + return 1; + } + } + } + + /* Create the filter cache config */ + cconf = calloc(1, sizeof(*cconf)); + if (!cconf) { + memprintf(err, "out of memory\n"); + goto err; + } + cconf->flags = CACHE_FLT_F_IMPLICIT_DECL; + cconf->c.name = strdup(name); + if (!cconf->c.name) { + memprintf(err, "out of memory\n"); + goto err; + } + + /* register a filter to fill the cache buffer */ + fconf = calloc(1, sizeof(*fconf)); + if (!fconf) { + memprintf(err, "out of memory\n"); + goto err; + } + fconf->id = cache_store_flt_id; + fconf->conf = cconf; + fconf->ops = &cache_ops; + LIST_APPEND(&proxy->filter_configs, &fconf->list); + + rule->arg.act.p[0] = cconf; + return 1; + + err: + free(cconf); + return 0; +} + +enum act_parse_ret parse_cache_store(const char **args, int *orig_arg, struct proxy *proxy, + struct act_rule *rule, char **err) +{ + rule->action = ACT_CUSTOM; + rule->action_ptr = http_action_store_cache; + + if (!parse_cache_rule(proxy, args[*orig_arg], rule, err)) + return ACT_RET_PRS_ERR; + + (*orig_arg)++; + return ACT_RET_PRS_OK; +} + +/* This produces a sha1 hash of the concatenation of the HTTP method, + * the first occurrence of the Host header followed by the path component + * if it begins with a slash ('/'). */ +int sha1_hosturi(struct stream *s) +{ + struct http_txn *txn = s->txn; + struct htx *htx = htxbuf(&s->req.buf); + struct htx_sl *sl; + struct http_hdr_ctx ctx; + struct ist uri; + blk_SHA_CTX sha1_ctx; + struct buffer *trash; + + trash = get_trash_chunk(); + ctx.blk = NULL; + + sl = http_get_stline(htx); + uri = htx_sl_req_uri(sl); // whole uri + if (!uri.len) + return 0; + + /* In HTTP/1, most URIs are seen in origin form ('/path/to/resource'), + * unless haproxy is deployed in front of an outbound cache. In HTTP/2, + * URIs are almost always sent in absolute form with their scheme. In + * this case, the scheme is almost always "https". In order to support + * sharing of cache objects between H1 and H2, we'll hash the absolute + * URI whenever known, or prepend "https://" + the Host header for + * relative URIs. 
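+ * For example (hypothetical host), an H2 request for
+ * "https://www.example.com/img/logo.png" and an H1 request for
+ * "/img/logo.png" with "Host: www.example.com" both end up hashing the
+ * text "https://www.example.com/img/logo.png" and thus share one entry.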
The difference will only appear on absolute HTTP/1 + * requests sent to an origin server, which practically is never met in + * the real world so we don't care about the ability to share the same + * key here.URIs are normalized from the absolute URI to an origin form as + * well. + */ + if (!(sl->flags & HTX_SL_F_HAS_AUTHORITY)) { + chunk_istcat(trash, ist("https://")); + if (!http_find_header(htx, ist("Host"), &ctx, 0)) + return 0; + chunk_istcat(trash, ctx.value); + } + + chunk_istcat(trash, uri); + + /* hash everything */ + blk_SHA1_Init(&sha1_ctx); + blk_SHA1_Update(&sha1_ctx, trash->area, trash->data); + blk_SHA1_Final((unsigned char *)txn->cache_hash, &sha1_ctx); + + return 1; +} + +/* Looks for "If-None-Match" headers in the request and compares their value + * with the one that might have been stored in the cache_entry. If any of them + * matches, a "304 Not Modified" response should be sent instead of the cached + * data. + * Although unlikely in a GET/HEAD request, the "If-None-Match: *" syntax is + * valid and should receive a "304 Not Modified" response (RFC 7234#4.3.2). + * + * If no "If-None-Match" header was found, look for an "If-Modified-Since" + * header and compare its value (date) to the one stored in the cache_entry. + * If the request's date is later than the cached one, we also send a + * "304 Not Modified" response (see RFCs 7232#3.3 and 7234#4.3.2). + * + * Returns 1 if "304 Not Modified" should be sent, 0 otherwise. + */ +static int should_send_notmodified_response(struct cache *cache, struct htx *htx, + struct cache_entry *entry) +{ + int retval = 0; + + struct http_hdr_ctx ctx = { .blk = NULL }; + struct ist cache_entry_etag = IST_NULL; + struct buffer *etag_buffer = NULL; + int if_none_match_found = 0; + + struct tm tm = {}; + time_t if_modified_since = 0; + + /* If we find a "If-None-Match" header in the request, rebuild the + * cache_entry's ETag in order to perform comparisons. + * There could be multiple "if-none-match" header lines. */ + while (http_find_header(htx, ist("if-none-match"), &ctx, 0)) { + if_none_match_found = 1; + + /* A '*' matches everything. */ + if (isteq(ctx.value, ist("*")) != 0) { + retval = 1; + break; + } + + /* No need to rebuild an etag if none was stored in the cache. */ + if (entry->etag_length == 0) + break; + + /* Rebuild the stored ETag. */ + if (etag_buffer == NULL) { + etag_buffer = get_trash_chunk(); + + if (shctx_row_data_get(shctx_ptr(cache), block_ptr(entry), + (unsigned char*)b_orig(etag_buffer), + entry->etag_offset, entry->etag_length) == 0) { + cache_entry_etag = ist2(b_orig(etag_buffer), entry->etag_length); + } else { + /* We could not rebuild the ETag in one go, we + * won't send a "304 Not Modified" response. */ + break; + } + } + + if (http_compare_etags(cache_entry_etag, ctx.value) == 1) { + retval = 1; + break; + } + } + + /* If the request did not contain an "If-None-Match" header, we look for + * an "If-Modified-Since" header (see RFC 7232#3.3). */ + if (retval == 0 && if_none_match_found == 0) { + ctx.blk = NULL; + if (http_find_header(htx, ist("if-modified-since"), &ctx, 1)) { + if (parse_http_date(istptr(ctx.value), istlen(ctx.value), &tm)) { + if_modified_since = my_timegm(&tm); + + /* We send a "304 Not Modified" response if the + * entry's last modified date is earlier than + * the one found in the "If-Modified-Since" + * header. 
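+ * For instance (made-up dates), an entry stored with
+ * "Last-Modified: Mon, 01 Jan 2024 00:00:00 GMT" triggers a 304
+ * against "If-Modified-Since: Tue, 02 Jan 2024 00:00:00 GMT".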
*/ + retval = (entry->last_modified <= if_modified_since); + } + } + } + + return retval; +} + +enum act_return http_action_req_cache_use(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + + struct http_txn *txn = s->txn; + struct cache_entry *res, *sec_entry = NULL; + struct cache_flt_conf *cconf = rule->arg.act.p[0]; + struct cache *cache = cconf->c.cache; + struct shared_context *shctx = shctx_ptr(cache); + struct shared_block *entry_block; + + struct cache_tree *cache_tree = NULL; + + /* Ignore cache for HTTP/1.0 requests and for requests other than GET + * and HEAD */ + if (!(txn->req.flags & HTTP_MSGF_VER_11) || + (txn->meth != HTTP_METH_GET && txn->meth != HTTP_METH_HEAD)) + txn->flags |= TX_CACHE_IGNORE; + + http_check_request_for_cacheability(s, &s->req); + + /* The request's hash has to be calculated for all requests, even POSTs + * or PUTs for instance because RFC7234 specifies that a successful + * "unsafe" method on a stored resource must invalidate it + * (see RFC7234#4.4). */ + if (!sha1_hosturi(s)) + return ACT_RET_CONT; + + if (s->txn->flags & TX_CACHE_IGNORE) + return ACT_RET_CONT; + + if (px == strm_fe(s)) + _HA_ATOMIC_INC(&px->fe_counters.p.http.cache_lookups); + else + _HA_ATOMIC_INC(&px->be_counters.p.http.cache_lookups); + + cache_tree = get_cache_tree_from_hash(cache, read_u32(s->txn->cache_hash)); + + if (!cache_tree) + return ACT_RET_CONT; + + cache_rdlock(cache_tree); + res = get_entry(cache_tree, s->txn->cache_hash, 0); + /* We must not use an entry that is not complete but the check will be + * performed after we look for a potential secondary entry (in case of + * Vary). */ + if (res) { + struct appctx *appctx; + int detached = 0; + + retain_entry(res); + + entry_block = block_ptr(res); + shctx_wrlock(shctx); + if (res->complete) { + shctx_row_detach(shctx, entry_block); + detached = 1; + } else { + release_entry(cache_tree, res, 0); + res = NULL; + } + shctx_wrunlock(shctx); + cache_rdunlock(cache_tree); + + /* In case of Vary, we could have multiple entries with the same + * primary hash. We need to calculate the secondary hash in order + * to find the actual entry we want (if it exists). */ + if (res && res->secondary_key_signature) { + if (!http_request_build_secondary_key(s, res->secondary_key_signature)) { + cache_rdlock(cache_tree); + sec_entry = get_secondary_entry(cache_tree, res, + s->txn->cache_secondary_hash, 0); + if (sec_entry && sec_entry != res) { + /* The wrong row was added to the hot list. */ + release_entry(cache_tree, res, 0); + retain_entry(sec_entry); + shctx_wrlock(shctx); + if (detached) + shctx_row_reattach(shctx, entry_block); + entry_block = block_ptr(sec_entry); + shctx_row_detach(shctx, entry_block); + shctx_wrunlock(shctx); + } + res = sec_entry; + cache_rdunlock(cache_tree); + } + else { + release_entry(cache_tree, res, 1); + + res = NULL; + shctx_wrlock(shctx); + shctx_row_reattach(shctx, entry_block); + shctx_wrunlock(shctx); + } + } + + /* We either looked for a valid secondary entry and could not + * find one, or the entry we want to use is not complete. We + * can't use the cache's entry and must forward the request to + * the server. 
*/ + if (!res) { + return ACT_RET_CONT; + } else if (!res->complete) { + release_entry(cache_tree, res, 1); + return ACT_RET_CONT; + } + + s->target = &http_cache_applet.obj_type; + if ((appctx = sc_applet_create(s->scb, objt_applet(s->target)))) { + struct cache_appctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + appctx->st0 = HTX_CACHE_INIT; + appctx->rule = rule; + ctx->cache_tree = cache_tree; + ctx->entry = res; + ctx->next = NULL; + ctx->sent = 0; + ctx->send_notmodified = + should_send_notmodified_response(cache, htxbuf(&s->req.buf), res); + + if (px == strm_fe(s)) + _HA_ATOMIC_INC(&px->fe_counters.p.http.cache_hits); + else + _HA_ATOMIC_INC(&px->be_counters.p.http.cache_hits); + return ACT_RET_CONT; + } else { + s->target = NULL; + release_entry(cache_tree, res, 1); + shctx_wrlock(shctx); + shctx_row_reattach(shctx, entry_block); + shctx_wrunlock(shctx); + return ACT_RET_CONT; + } + } + cache_rdunlock(cache_tree); + + /* Shared context does not need to be locked while we calculate the + * secondary hash. */ + if (!res && cache->vary_processing_enabled) { + /* Build a complete secondary hash until the server response + * tells us which fields should be kept (if any). */ + http_request_prebuild_full_secondary_key(s); + } + return ACT_RET_CONT; +} + + +enum act_parse_ret parse_cache_use(const char **args, int *orig_arg, struct proxy *proxy, + struct act_rule *rule, char **err) +{ + rule->action = ACT_CUSTOM; + rule->action_ptr = http_action_req_cache_use; + + if (!parse_cache_rule(proxy, args[*orig_arg], rule, err)) + return ACT_RET_PRS_ERR; + + (*orig_arg)++; + return ACT_RET_PRS_OK; +} + +int cfg_parse_cache(const char *file, int linenum, char **args, int kwm) +{ + int err_code = 0; + + if (strcmp(args[0], "cache") == 0) { /* new cache section */ + + if (!*args[1]) { + ha_alert("parsing [%s:%d] : '%s' expects a <name> argument\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) { + err_code |= ERR_ABORT; + goto out; + } + + if (tmp_cache_config == NULL) { + struct cache *cache_config; + + tmp_cache_config = calloc(1, sizeof(*tmp_cache_config)); + if (!tmp_cache_config) { + ha_alert("parsing [%s:%d]: out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + strlcpy2(tmp_cache_config->id, args[1], 33); + if (strlen(args[1]) > 32) { + ha_warning("parsing [%s:%d]: cache name is limited to 32 characters, truncate to '%s'.\n", + file, linenum, tmp_cache_config->id); + err_code |= ERR_WARN; + } + + list_for_each_entry(cache_config, &caches_config, list) { + if (strcmp(tmp_cache_config->id, cache_config->id) == 0) { + ha_alert("parsing [%s:%d]: Duplicate cache name '%s'.\n", + file, linenum, tmp_cache_config->id); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + } + + tmp_cache_config->maxage = 60; + tmp_cache_config->maxblocks = 0; + tmp_cache_config->maxobjsz = 0; + tmp_cache_config->max_secondary_entries = DEFAULT_MAX_SECONDARY_ENTRY; + } + } else if (strcmp(args[0], "total-max-size") == 0) { + unsigned long int maxsize; + char *err; + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) { + err_code |= ERR_ABORT; + goto out; + } + + maxsize = strtoul(args[1], &err, 10); + if (err == args[1] || *err != '\0') { + ha_warning("parsing [%s:%d]: total-max-size wrong value '%s'\n", + file, linenum, args[1]); + err_code |= ERR_ABORT; + goto out; + } + + if (maxsize > (UINT_MAX >> 20)) { + ha_warning("parsing [%s:%d]: \"total-max-size\" 
(%s) must not be greater than %u\n", + file, linenum, args[1], UINT_MAX >> 20); + err_code |= ERR_ABORT; + goto out; + } + + /* size in megabytes */ + maxsize *= 1024 * 1024 / CACHE_BLOCKSIZE; + tmp_cache_config->maxblocks = maxsize; + } else if (strcmp(args[0], "max-age") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) { + err_code |= ERR_ABORT; + goto out; + } + + if (!*args[1]) { + ha_warning("parsing [%s:%d]: '%s' expects an age parameter in seconds.\n", + file, linenum, args[0]); + err_code |= ERR_WARN; + } + + tmp_cache_config->maxage = atoi(args[1]); + } else if (strcmp(args[0], "max-object-size") == 0) { + unsigned int maxobjsz; + char *err; + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) { + err_code |= ERR_ABORT; + goto out; + } + + if (!*args[1]) { + ha_warning("parsing [%s:%d]: '%s' expects a maximum file size parameter in bytes.\n", + file, linenum, args[0]); + err_code |= ERR_WARN; + } + + maxobjsz = strtoul(args[1], &err, 10); + if (err == args[1] || *err != '\0') { + ha_warning("parsing [%s:%d]: max-object-size wrong value '%s'\n", + file, linenum, args[1]); + err_code |= ERR_ABORT; + goto out; + } + tmp_cache_config->maxobjsz = maxobjsz; + } else if (strcmp(args[0], "process-vary") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) { + err_code |= ERR_ABORT; + goto out; + } + + if (!*args[1]) { + ha_warning("parsing [%s:%d]: '%s' expects \"on\" or \"off\" (enable or disable vary processing).\n", + file, linenum, args[0]); + err_code |= ERR_WARN; + } + if (strcmp(args[1], "on") == 0) + tmp_cache_config->vary_processing_enabled = 1; + else if (strcmp(args[1], "off") == 0) + tmp_cache_config->vary_processing_enabled = 0; + else { + ha_warning("parsing [%s:%d]: '%s' expects \"on\" or \"off\" (enable or disable vary processing).\n", + file, linenum, args[0]); + err_code |= ERR_WARN; + } + } else if (strcmp(args[0], "max-secondary-entries") == 0) { + unsigned int max_sec_entries; + char *err; + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) { + err_code |= ERR_ABORT; + goto out; + } + + if (!*args[1]) { + ha_warning("parsing [%s:%d]: '%s' expects a strictly positive number.\n", + file, linenum, args[0]); + err_code |= ERR_WARN; + } + + max_sec_entries = strtoul(args[1], &err, 10); + if (err == args[1] || *err != '\0' || max_sec_entries == 0) { + ha_warning("parsing [%s:%d]: max-secondary-entries wrong value '%s'\n", + file, linenum, args[1]); + err_code |= ERR_ABORT; + goto out; + } + tmp_cache_config->max_secondary_entries = max_sec_entries; + } + else if (*args[0] != 0) { + ha_alert("parsing [%s:%d] : unknown keyword '%s' in 'cache' section\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } +out: + return err_code; +} + +/* once the cache section is parsed */ + +int cfg_post_parse_section_cache() +{ + int err_code = 0; + + if (tmp_cache_config) { + + if (tmp_cache_config->maxblocks <= 0) { + ha_alert("Size not specified for cache '%s'\n", tmp_cache_config->id); + err_code |= ERR_FATAL | ERR_ALERT; + goto out; + } + + if (!tmp_cache_config->maxobjsz) { + /* Default max. file size is a 256th of the cache size. 
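+ * For example (illustrative numbers), the section below would get an
+ * implicit max-object-size of 64MB/256 = 256kB:
+ *
+ *     cache mycache
+ *         total-max-size 64   # in megabytes
+ *         max-age 240         # in seconds
+ *         process-vary on
+ *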
*/ + tmp_cache_config->maxobjsz = + (tmp_cache_config->maxblocks * CACHE_BLOCKSIZE) >> 8; + } + else if (tmp_cache_config->maxobjsz > tmp_cache_config->maxblocks * CACHE_BLOCKSIZE / 2) { + ha_alert("\"max-object-size\" is limited to an half of \"total-max-size\" => %u\n", tmp_cache_config->maxblocks * CACHE_BLOCKSIZE / 2); + err_code |= ERR_FATAL | ERR_ALERT; + goto out; + } + + /* add to the list of cache to init and reinit tmp_cache_config + * for next cache section, if any. + */ + LIST_APPEND(&caches_config, &tmp_cache_config->list); + tmp_cache_config = NULL; + return err_code; + } +out: + ha_free(&tmp_cache_config); + return err_code; + +} + +int post_check_cache() +{ + struct proxy *px; + struct cache *back, *cache_config, *cache; + struct shared_context *shctx; + int ret_shctx; + int err_code = ERR_NONE; + int i; + + list_for_each_entry_safe(cache_config, back, &caches_config, list) { + + ret_shctx = shctx_init(&shctx, cache_config->maxblocks, CACHE_BLOCKSIZE, + cache_config->maxobjsz, sizeof(struct cache)); + + if (ret_shctx <= 0) { + if (ret_shctx == SHCTX_E_INIT_LOCK) + ha_alert("Unable to initialize the lock for the cache.\n"); + else + ha_alert("Unable to allocate cache.\n"); + + err_code |= ERR_FATAL | ERR_ALERT; + goto out; + } + shctx->free_block = cache_free_blocks; + shctx->reserve_finish = cache_reserve_finish; + shctx->cb_data = (void*)shctx->data; + /* the cache structure is stored in the shctx and added to the + * caches list, we can remove the entry from the caches_config + * list */ + memcpy(shctx->data, cache_config, sizeof(struct cache)); + cache = (struct cache *)shctx->data; + LIST_APPEND(&caches, &cache->list); + LIST_DELETE(&cache_config->list); + free(cache_config); + for (i = 0; i < CACHE_TREE_NUM; ++i) { + cache->trees[i].entries = EB_ROOT; + HA_RWLOCK_INIT(&cache->trees[i].lock); + + LIST_INIT(&cache->trees[i].cleanup_list); + HA_SPIN_INIT(&cache->trees[i].cleanup_lock); + } + + /* Find all references for this cache in the existing filters + * (over all proxies) and reference it in matching filters. + */ + for (px = proxies_list; px; px = px->next) { + struct flt_conf *fconf; + struct cache_flt_conf *cconf; + + list_for_each_entry(fconf, &px->filter_configs, list) { + if (fconf->id != cache_store_flt_id) + continue; + + cconf = fconf->conf; + if (strcmp(cache->id, cconf->c.name) == 0) { + free(cconf->c.name); + cconf->flags |= CACHE_FLT_INIT; + cconf->c.cache = cache; + break; + } + } + } + } + +out: + return err_code; + +} + +struct flt_ops cache_ops = { + .init = cache_store_init, + .check = cache_store_check, + .deinit = cache_store_deinit, + + /* Handle stream init/deinit */ + .attach = cache_store_strm_init, + .detach = cache_store_strm_deinit, + + /* Handle channels activity */ + .channel_post_analyze = cache_store_post_analyze, + + /* Filter HTTP requests and responses */ + .http_headers = cache_store_http_headers, + .http_payload = cache_store_http_payload, + .http_end = cache_store_http_end, +}; + + +#define CHECK_ENCODING(str, encoding_name, encoding_value) \ + ({ \ + int retval = 0; \ + if (istmatch(str, (struct ist){ .ptr = encoding_name+1, .len = sizeof(encoding_name) - 2 })) { \ + retval = encoding_value; \ + encoding = istadv(encoding, sizeof(encoding_name) - 2); \ + } \ + (retval); \ + }) + +/* + * Parse the encoding <encoding> and try to match the encoding part upon an + * encoding list of explicitly supported encodings (which all have a specific + * bit in an encoding bitmap). 
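+ * For instance "gzip" and "x-gzip" both map to VARY_ENCODING_GZIP, and
+ * "zstd" maps to VARY_ENCODING_ZSTD.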
If a weight is included in the value, find out if
+ * it is null or not. The bit value will be set in the <encoding_value>
+ * parameter and <has_null_weight> will be set to 1 if the weight is
+ * strictly 0, and to 0 otherwise.
+ * The encodings list is extracted from
+ * https://www.iana.org/assignments/http-parameters/http-parameters.xhtml.
+ * Returns 0 in case of success and -1 in case of error.
+ */
+static int parse_encoding_value(struct ist encoding, unsigned int *encoding_value,
+				unsigned int *has_null_weight)
+{
+	int retval = 0;
+
+	if (!encoding_value)
+		return -1;
+
+	if (!istlen(encoding))
+		return -1; /* Invalid encoding */
+
+	*encoding_value = 0;
+	if (has_null_weight)
+		*has_null_weight = 0;
+
+	switch (*encoding.ptr) {
+	case 'a':
+		encoding = istnext(encoding);
+		*encoding_value = CHECK_ENCODING(encoding, "aes128gcm", VARY_ENCODING_AES128GCM);
+		break;
+	case 'b':
+		encoding = istnext(encoding);
+		*encoding_value = CHECK_ENCODING(encoding, "br", VARY_ENCODING_BR);
+		break;
+	case 'c':
+		encoding = istnext(encoding);
+		*encoding_value = CHECK_ENCODING(encoding, "compress", VARY_ENCODING_COMPRESS);
+		break;
+	case 'd':
+		encoding = istnext(encoding);
+		*encoding_value = CHECK_ENCODING(encoding, "deflate", VARY_ENCODING_DEFLATE);
+		break;
+	case 'e':
+		encoding = istnext(encoding);
+		*encoding_value = CHECK_ENCODING(encoding, "exi", VARY_ENCODING_EXI);
+		break;
+	case 'g':
+		encoding = istnext(encoding);
+		*encoding_value = CHECK_ENCODING(encoding, "gzip", VARY_ENCODING_GZIP);
+		break;
+	case 'i':
+		encoding = istnext(encoding);
+		*encoding_value = CHECK_ENCODING(encoding, "identity", VARY_ENCODING_IDENTITY);
+		break;
+	case 'p':
+		encoding = istnext(encoding);
+		*encoding_value = CHECK_ENCODING(encoding, "pack200-gzip", VARY_ENCODING_PACK200_GZIP);
+		break;
+	case 'x':
+		encoding = istnext(encoding);
+		*encoding_value = CHECK_ENCODING(encoding, "x-gzip", VARY_ENCODING_GZIP);
+		if (!*encoding_value)
+			*encoding_value = CHECK_ENCODING(encoding, "x-compress", VARY_ENCODING_COMPRESS);
+		break;
+	case 'z':
+		encoding = istnext(encoding);
+		*encoding_value = CHECK_ENCODING(encoding, "zstd", VARY_ENCODING_ZSTD);
+		break;
+	case '*':
+		encoding = istnext(encoding);
+		*encoding_value = VARY_ENCODING_STAR;
+		break;
+	default:
+		retval = -1; /* Unmanaged encoding */
+		break;
+	}
+
+	/* Process the optional weight part of the encoding. */
+	if (*encoding_value) {
+		encoding = http_trim_leading_spht(encoding);
+		if (istlen(encoding)) {
+			if (*encoding.ptr != ';')
+				return -1;
+
+			if (has_null_weight) {
+				encoding = istnext(encoding);
+
+				encoding = http_trim_leading_spht(encoding);
+
+				*has_null_weight = isteq(encoding, ist("q=0"));
+			}
+		}
+	}
+
+	return retval;
+}
+
+#define ACCEPT_ENCODING_MAX_ENTRIES 16
+/*
+ * Build a bitmap of the accept-encoding header.
+ *
+ * The bitmap is built by matching every sub-part of the accept-encoding value
+ * with a subset of explicitly supported encodings, which all have their own bit
+ * in the bitmap. This bitmap will be used to determine if a response can be
+ * served to a client (that is if it has an encoding that is accepted by the
+ * client). Any unknown encodings will be indicated by the VARY_ENCODING_OTHER
+ * bit.
+ *
+ * Returns 0 in case of success and -1 in case of error.
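+ *
+ * For example, "Accept-Encoding: gzip, br;q=0" yields a bitmap where
+ * identity and gzip are accepted while brotli is explicitly rejected, so
+ * a stored brotli-encoded response cannot match such a request.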
+ */ +static int accept_encoding_normalizer(struct htx *htx, struct ist hdr_name, + char *buf, unsigned int *buf_len) +{ + size_t count = 0; + uint32_t encoding_bitmap = 0; + unsigned int encoding_bmp_bl = -1; + struct http_hdr_ctx ctx = { .blk = NULL }; + unsigned int encoding_value; + unsigned int rejected_encoding; + + /* A user agent always accepts an unencoded value unless it explicitly + * refuses it through an "identity;q=0" accept-encoding value. */ + encoding_bitmap |= VARY_ENCODING_IDENTITY; + + /* Iterate over all the ACCEPT_ENCODING_MAX_ENTRIES first accept-encoding + * values that might span acrosse multiple accept-encoding headers. */ + while (http_find_header(htx, hdr_name, &ctx, 0) && count < ACCEPT_ENCODING_MAX_ENTRIES) { + count++; + + /* As per RFC7231#5.3.4, "An Accept-Encoding header field with a + * combined field-value that is empty implies that the user agent + * does not want any content-coding in response." + * + * We must (and did) count the existence of this empty header to not + * hit the `count == 0` case below, but must ignore the value to not + * include VARY_ENCODING_OTHER into the final bitmap. + */ + if (istlen(ctx.value) == 0) + continue; + + /* Turn accept-encoding value to lower case */ + ist2bin_lc(istptr(ctx.value), ctx.value); + + /* Try to identify a known encoding and to manage null weights. */ + if (!parse_encoding_value(ctx.value, &encoding_value, &rejected_encoding)) { + if (rejected_encoding) + encoding_bmp_bl &= ~encoding_value; + else + encoding_bitmap |= encoding_value; + } + else { + /* Unknown encoding */ + encoding_bitmap |= VARY_ENCODING_OTHER; + } + } + + /* If a "*" was found in the accepted encodings (without a null weight), + * all the encoding are accepted except the ones explicitly rejected. */ + if (encoding_bitmap & VARY_ENCODING_STAR) { + encoding_bitmap = ~0; + } + + /* Clear explicitly rejected encodings from the bitmap */ + encoding_bitmap &= encoding_bmp_bl; + + /* As per RFC7231#5.3.4, "If no Accept-Encoding field is in the request, + * any content-coding is considered acceptable by the user agent". */ + if (count == 0) + encoding_bitmap = ~0; + + /* A request with more than ACCEPT_ENCODING_MAX_ENTRIES accepted + * encodings might be illegitimate so we will not use it. */ + if (count == ACCEPT_ENCODING_MAX_ENTRIES) + return -1; + + write_u32(buf, encoding_bitmap); + *buf_len = sizeof(encoding_bitmap); + + /* This function fills the hash buffer correctly even if no header was + * found, hence the 0 return value (success). */ + return 0; +} +#undef ACCEPT_ENCODING_MAX_ENTRIES + +/* + * Normalizer used by default for the Referer and Origin header. It only + * calculates a hash of the whole value using xxhash algorithm. + * Only the first occurrence of the header will be taken into account in the + * hash. + * Returns 0 in case of success, 1 if the hash buffer should be filled with 0s + * and -1 in case of error. + */ +static int default_normalizer(struct htx *htx, struct ist hdr_name, + char *buf, unsigned int *buf_len) +{ + int retval = 1; + struct http_hdr_ctx ctx = { .blk = NULL }; + + if (http_find_header(htx, hdr_name, &ctx, 1)) { + retval = 0; + write_u64(buf, XXH3(istptr(ctx.value), istlen(ctx.value), cache_hash_seed)); + *buf_len = sizeof(uint64_t); + } + + return retval; +} + +/* + * Accept-Encoding bitmap comparison function. + * Returns 0 if the bitmaps are compatible. 
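+ * For example, a response stored with only the gzip bit set is compatible
+ * with any request bitmap that includes gzip, since (ref & new) == ref
+ * then holds.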
+ */ +static int accept_encoding_bitmap_cmp(const void *ref, const void *new, unsigned int len) +{ + uint32_t ref_bitmap = read_u32(ref); + uint32_t new_bitmap = read_u32(new); + + if (!(ref_bitmap & VARY_ENCODING_OTHER)) { + /* All the bits set in the reference bitmap correspond to the + * stored response' encoding and should all be set in the new + * encoding bitmap in order for the client to be able to manage + * the response. + * + * If this is the case the cached response has encodings that + * are accepted by the client. It can be served directly by + * the cache (as far as the accept-encoding part is concerned). + */ + + return (ref_bitmap & new_bitmap) != ref_bitmap; + } + else { + return 1; + } +} + + +/* + * Pre-calculate the hashes of all the supported headers (in our Vary + * implementation) of a given request. We have to calculate all the hashes + * in advance because the actual Vary signature won't be known until the first + * response. + * Only the first occurrence of every header will be taken into account in the + * hash. + * If the header is not present, the hash portion of the given header will be + * filled with zeros. + * Returns 0 in case of success. + */ +static int http_request_prebuild_full_secondary_key(struct stream *s) +{ + /* The fake signature (second parameter) will ensure that every part of the + * secondary key is calculated. */ + return http_request_build_secondary_key(s, ~0); +} + + +/* + * Calculate the secondary key for a request for which we already have a known + * vary signature. The key is made by aggregating hashes calculated for every + * header mentioned in the vary signature. + * Only the first occurrence of every header will be taken into account in the + * hash. + * If the header is not present, the hash portion of the given header will be + * filled with zeros. + * Returns 0 in case of success. + */ +static int http_request_build_secondary_key(struct stream *s, int vary_signature) +{ + struct http_txn *txn = s->txn; + struct htx *htx = htxbuf(&s->req.buf); + + unsigned int idx; + const struct vary_hashing_information *info = NULL; + unsigned int hash_length = 0; + int retval = 0; + int offset = 0; + + for (idx = 0; idx < sizeof(vary_information)/sizeof(*vary_information) && retval >= 0; ++idx) { + info = &vary_information[idx]; + + /* The normalizing functions will be in charge of getting the + * header values from the htx. This way they can manage multiple + * occurrences of their processed header. */ + if ((vary_signature & info->value) && info->norm_fn != NULL && + !(retval = info->norm_fn(htx, info->hdr_name, &txn->cache_secondary_hash[offset], &hash_length))) { + offset += hash_length; + } + else { + /* Fill hash with 0s. */ + hash_length = info->hash_length; + memset(&txn->cache_secondary_hash[offset], 0, hash_length); + offset += hash_length; + } + } + + if (retval >= 0) + txn->flags |= TX_CACHE_HAS_SEC_KEY; + + return (retval < 0); +} + +/* + * Build the actual secondary key of a given request out of the prebuilt key and + * the actual vary signature (extracted from the response). + * Returns 0 in case of success. 
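+ * For example, if the response only varies on accept-encoding, the bytes
+ * covering the referer and origin hashes in the prebuilt key are masked
+ * to zero and only the accept-encoding part remains significant.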
+ */ +static int http_request_reduce_secondary_key(unsigned int vary_signature, + char prebuilt_key[HTTP_CACHE_SEC_KEY_LEN]) +{ + int offset = 0; + int global_offset = 0; + int vary_info_count = 0; + int keep = 0; + unsigned int vary_idx; + const struct vary_hashing_information *vary_info; + + vary_info_count = sizeof(vary_information)/sizeof(*vary_information); + for (vary_idx = 0; vary_idx < vary_info_count; ++vary_idx) { + vary_info = &vary_information[vary_idx]; + keep = (vary_signature & vary_info->value) ? 0xff : 0; + + for (offset = 0; offset < vary_info->hash_length; ++offset,++global_offset) { + prebuilt_key[global_offset] &= keep; + } + } + + return 0; +} + + + +static int +parse_cache_flt(char **args, int *cur_arg, struct proxy *px, + struct flt_conf *fconf, char **err, void *private) +{ + struct flt_conf *f, *back; + struct cache_flt_conf *cconf = NULL; + char *name = NULL; + int pos = *cur_arg; + + /* Get the cache filter name. <pos> point on "cache" keyword */ + if (!*args[pos + 1]) { + memprintf(err, "%s : expects a <name> argument", args[pos]); + goto error; + } + name = strdup(args[pos + 1]); + if (!name) { + memprintf(err, "%s '%s' : out of memory", args[pos], args[pos + 1]); + goto error; + } + pos += 2; + + /* Check if an implicit filter with the same name already exists. If so, + * we remove the implicit filter to use the explicit one. */ + list_for_each_entry_safe(f, back, &px->filter_configs, list) { + if (f->id != cache_store_flt_id) + continue; + + cconf = f->conf; + if (strcmp(name, cconf->c.name) != 0) { + cconf = NULL; + continue; + } + + if (!(cconf->flags & CACHE_FLT_F_IMPLICIT_DECL)) { + cconf = NULL; + memprintf(err, "%s: multiple explicit declarations of the cache filter '%s'", + px->id, name); + goto error; + } + + /* Remove the implicit filter. 
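+ * (the one parse_cache_rule() registered with CACHE_FLT_F_IMPLICIT_DECL
+ * when a cache action referenced this cache before any explicit "filter
+ * cache" line was seen).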
<cconf> is kept for the explicit one */ + LIST_DELETE(&f->list); + free(f); + free(name); + break; + } + + /* No implicit cache filter found, create configuration for the explicit one */ + if (!cconf) { + cconf = calloc(1, sizeof(*cconf)); + if (!cconf) { + memprintf(err, "%s: out of memory", args[*cur_arg]); + goto error; + } + cconf->c.name = name; + } + + cconf->flags = 0; + fconf->id = cache_store_flt_id; + fconf->conf = cconf; + fconf->ops = &cache_ops; + + *cur_arg = pos; + return 0; + + error: + free(name); + free(cconf); + return -1; +} + +/* It reserves a struct show_cache_ctx for the local variables */ +static int cli_parse_show_cache(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_cache_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + ctx->cache = LIST_ELEM((caches).n, typeof(struct cache *), list); + return 0; +} + +/* It uses a struct show_cache_ctx for the local variables */ +static int cli_io_handler_show_cache(struct appctx *appctx) +{ + struct show_cache_ctx *ctx = appctx->svcctx; + struct cache* cache = ctx->cache; + struct buffer *buf = alloc_trash_chunk(); + + if (buf == NULL) + return 1; + + list_for_each_entry_from(cache, &caches, list) { + struct eb32_node *node = NULL; + unsigned int next_key; + struct cache_entry *entry; + unsigned int i; + struct shared_context *shctx = shctx_ptr(cache); + int cache_tree_index = 0; + struct cache_tree *cache_tree = NULL; + + next_key = ctx->next_key; + if (!next_key) { + shctx_rdlock(shctx); + chunk_printf(buf, "%p: %s (shctx:%p, available blocks:%d)\n", cache, cache->id, shctx_ptr(cache), shctx_ptr(cache)->nbav); + shctx_rdunlock(shctx); + if (applet_putchk(appctx, buf) == -1) { + goto yield; + } + } + + ctx->cache = cache; + + if (ctx->cache_tree) + cache_tree_index = (ctx->cache_tree - ctx->cache->trees); + + for (;cache_tree_index < CACHE_TREE_NUM; ++cache_tree_index) { + + ctx->cache_tree = cache_tree = &ctx->cache->trees[cache_tree_index]; + + cache_rdlock(cache_tree); + + while (1) { + node = eb32_lookup_ge(&cache_tree->entries, next_key); + if (!node) { + ctx->next_key = 0; + break; + } + + entry = container_of(node, struct cache_entry, eb); + next_key = node->key + 1; + + if (entry->expire > date.tv_sec) { + chunk_printf(buf, "%p hash:%u vary:0x", entry, read_u32(entry->hash)); + for (i = 0; i < HTTP_CACHE_SEC_KEY_LEN; ++i) + chunk_appendf(buf, "%02x", (unsigned char)entry->secondary_key[i]); + chunk_appendf(buf, " size:%u (%u blocks), refcount:%u, expire:%d\n", + block_ptr(entry)->len, block_ptr(entry)->block_count, + block_ptr(entry)->refcount, entry->expire - (int)date.tv_sec); + } + + ctx->next_key = next_key; + + if (applet_putchk(appctx, buf) == -1) { + cache_rdunlock(cache_tree); + goto yield; + } + } + cache_rdunlock(cache_tree); + } + } + + free_trash_chunk(buf); + return 1; + +yield: + free_trash_chunk(buf); + return 0; +} + + +/* + * boolean, returns true if response was built out of a cache entry. + */ +static int +smp_fetch_res_cache_hit(const struct arg *args, struct sample *smp, + const char *kw, void *private) +{ + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = (smp->strm ? (smp->strm->target == &http_cache_applet.obj_type) : 0); + + return 1; +} + +/* + * string, returns cache name (if response came from a cache). 
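+ * A typical (illustrative) use of these two fetches in a configuration:
+ *
+ *     http-response set-header X-Cache-Status HIT if { res.cache_hit }
+ *     http-response set-header X-Cache-Status MISS unless { res.cache_hit }
+ *     http-response set-header X-Cache-Name %[res.cache_name] if { res.cache_hit }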
+ */ +static int +smp_fetch_res_cache_name(const struct arg *args, struct sample *smp, + const char *kw, void *private) +{ + struct appctx *appctx = NULL; + + struct cache_flt_conf *cconf = NULL; + struct cache *cache = NULL; + + if (!smp->strm || smp->strm->target != &http_cache_applet.obj_type) + return 0; + + /* Get appctx from the stream connector. */ + appctx = sc_appctx(smp->strm->scb); + if (appctx && appctx->rule) { + cconf = appctx->rule->arg.act.p[0]; + if (cconf) { + cache = cconf->c.cache; + + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_CONST; + smp->data.u.str.area = cache->id; + smp->data.u.str.data = strlen(cache->id); + return 1; + } + } + + return 0; +} + + +/* early boot initialization */ +static void cache_init() +{ + cache_hash_seed = ha_random64(); +} + +INITCALL0(STG_PREPARE, cache_init); + +/* Declare the filter parser for "cache" keyword */ +static struct flt_kw_list filter_kws = { "CACHE", { }, { + { "cache", parse_cache_flt, NULL }, + { NULL, NULL, NULL }, + } +}; + +INITCALL1(STG_REGISTER, flt_register_keywords, &filter_kws); + +static struct cli_kw_list cli_kws = {{},{ + { { "show", "cache", NULL }, "show cache : show cache status", cli_parse_show_cache, cli_io_handler_show_cache, NULL, NULL }, + {{},} +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + +static struct action_kw_list http_res_actions = { + .kw = { + { "cache-store", parse_cache_store }, + { NULL, NULL } + } +}; + +INITCALL1(STG_REGISTER, http_res_keywords_register, &http_res_actions); + +static struct action_kw_list http_req_actions = { + .kw = { + { "cache-use", parse_cache_use }, + { NULL, NULL } + } +}; + +INITCALL1(STG_REGISTER, http_req_keywords_register, &http_req_actions); + +struct applet http_cache_applet = { + .obj_type = OBJ_TYPE_APPLET, + .name = "<CACHE>", /* used for logging */ + .fct = http_cache_io_handler, + .release = http_cache_applet_release, +}; + +/* config parsers for this section */ +REGISTER_CONFIG_SECTION("cache", cfg_parse_cache, cfg_post_parse_section_cache); +REGISTER_POST_CHECK(post_check_cache); + + +/* Note: must not be declared <const> as its list will be overwritten */ +static struct sample_fetch_kw_list sample_fetch_keywords = {ILH, { + { "res.cache_hit", smp_fetch_res_cache_hit, 0, NULL, SMP_T_BOOL, SMP_USE_HRSHP, SMP_VAL_RESPONSE }, + { "res.cache_name", smp_fetch_res_cache_name, 0, NULL, SMP_T_STR, SMP_USE_HRSHP, SMP_VAL_RESPONSE }, + { /* END */ }, + } +}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &sample_fetch_keywords); diff --git a/src/calltrace.c b/src/calltrace.c new file mode 100644 index 0000000..3946b28 --- /dev/null +++ b/src/calltrace.c @@ -0,0 +1,286 @@ +/* + * Function call tracing for gcc >= 2.95 + * WARNING! THIS CODE IS NOT THREAD-SAFE! + * + * Copyright 2012 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * gcc is able to call a specific function when entering and leaving any + * function when compiled with -finstrument-functions. This code must not + * be built with this argument. The performance impact is huge, so this + * feature should only be used when debugging. + * + * The entry and exits of all functions will be dumped into a file designated + * by the HAPROXY_TRACE environment variable, or by default "trace.out". 
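+ * (A typical session, with made-up paths: build everything except this
+ * file with -finstrument-functions, then run
+ * "HAPROXY_TRACE=/tmp/trace.out ./haproxy -f haproxy.cfg".)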
If the + * trace file name is empty or "/dev/null", then traces are disabled. If + * opening the trace file fails, then stderr is used. If HAPROXY_TRACE_FAST is + * used, then the time is taken from the global <now> variable. Last, if + * HAPROXY_TRACE_TSC is used, then the machine's TSC is used instead of the + * real time (almost twice as fast). + * + * The output format is : + * + * <sec.usec> <level> <caller_ptr> <dir> <callee_ptr> + * or : + * <tsc> <level> <caller_ptr> <dir> <callee_ptr> + * + * where <dir> is '>' when entering a function and '<' when leaving. + * + * It is also possible to emit comments using the calltrace() function which uses + * the printf() format. Such comments are then inserted by replacing the caller + * pointer with a sharp ('#') like this : + * + * <sec.usec> <level> # <comment> + * or : + * <tsc> <level> # <comment> + * + * The article below is a nice explanation of how this works : + * http://balau82.wordpress.com/2010/10/06/trace-and-profile-function-calls-with-gcc/ + */ + +#include <sys/time.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <haproxy/api.h> +#include <haproxy/clock.h> +#include <haproxy/tools.h> + +static FILE *log; +static int level; +static int disabled; +static int fast_time; +static int use_tsc; +static struct timeval trace_now; +static struct timeval *now_ptr; +static char line[128]; /* more than enough for a message (9+1+6+1+3+1+18+1+1+18+1+1) */ + +static int open_trace() +{ + const char *output = getenv("HAPROXY_TRACE"); + + if (!output) + output = "trace.out"; + + if (!*output || strcmp(output, "/dev/null") == 0) { + disabled = 1; + return 0; + } + + log = fopen(output, "w"); + if (!log) + log = stderr; + + now_ptr = &date; + if (getenv("HAPROXY_TRACE_FAST") != NULL) { + fast_time = 1; + now_ptr = &trace_now; + } + if (getenv("HAPROXY_TRACE_TSC") != NULL) { + fast_time = 1; + use_tsc = 1; + } + return 1; +} + +/* This function first divides the number by 100M then iteratively multiplies it + * by 100 (using adds and shifts). The trick is that dividing by 100M is equivalent + * to multiplying by 1/100M, which approximates to 1441151881/2^57. All local + * variables fit in registers on x86. This version outputs two digits per round. + * <min_pairs> indicates the minimum number of pairs of digits that have to be + * emitted, which might be left-padded with zeroes. + * It returns the pointer to the ending '\0'. 
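+ * For example, ultoad2(1234, buf, 0) emits "1234" while ultoad2(7, buf, 3)
+ * emits "000007" (three zero-padded pairs).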
+ */ +static char *ultoad2(unsigned int x, char *out, int min_pairs) +{ + unsigned int q; + char *p = out; + int pos = 4; + unsigned long long y; + + static const unsigned short bcd[100] = { + 0x3030, 0x3130, 0x3230, 0x3330, 0x3430, 0x3530, 0x3630, 0x3730, 0x3830, 0x3930, + 0x3031, 0x3131, 0x3231, 0x3331, 0x3431, 0x3531, 0x3631, 0x3731, 0x3831, 0x3931, + 0x3032, 0x3132, 0x3232, 0x3332, 0x3432, 0x3532, 0x3632, 0x3732, 0x3832, 0x3932, + 0x3033, 0x3133, 0x3233, 0x3333, 0x3433, 0x3533, 0x3633, 0x3733, 0x3833, 0x3933, + 0x3034, 0x3134, 0x3234, 0x3334, 0x3434, 0x3534, 0x3634, 0x3734, 0x3834, 0x3934, + 0x3035, 0x3135, 0x3235, 0x3335, 0x3435, 0x3535, 0x3635, 0x3735, 0x3835, 0x3935, + 0x3036, 0x3136, 0x3236, 0x3336, 0x3436, 0x3536, 0x3636, 0x3736, 0x3836, 0x3936, + 0x3037, 0x3137, 0x3237, 0x3337, 0x3437, 0x3537, 0x3637, 0x3737, 0x3837, 0x3937, + 0x3038, 0x3138, 0x3238, 0x3338, 0x3438, 0x3538, 0x3638, 0x3738, 0x3838, 0x3938, + 0x3039, 0x3139, 0x3239, 0x3339, 0x3439, 0x3539, 0x3639, 0x3739, 0x3839, 0x3939 }; + + y = x * 1441151881ULL; /* y>>57 will be the integer part of x/100M */ + while (1) { + q = y >> 57; + /* Q is composed of the first digit in the lower byte and the second + * digit in the higher byte. + */ + if (p != out || q > 9 || pos < min_pairs) { +#if defined(__i386__) || defined(__x86_64__) + /* unaligned accesses are fast on x86 */ + *(unsigned short *)p = bcd[q]; + p += 2; +#else + *(p++) = bcd[q]; + *(p++) = bcd[q] >> 8; +#endif + } + else if (q || !pos) { + /* only at most one digit */ + *(p++) = bcd[q] >> 8; + } + if (--pos < 0) + break; + + y &= 0x1FFFFFFFFFFFFFFULL; // remainder + + if (sizeof(long) >= sizeof(long long)) { + /* shifting is preferred on 64-bit archs, while mult is faster on 32-bit. + * We multiply by 100 by doing *5, *5 and *4, all of which are trivial. + */ + y += (y << 2); + y += (y << 2); + y <<= 2; + } + else + y *= 100; + } + + *p = '\0'; + return p; +} + +/* Send <h> as hex into <out>. Returns the pointer to the ending '\0'. 
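+ * For example (illustrative): emit_hex(0x1a2b, buf) writes "1a2b" and
+ * emit_hex(0, buf) writes a single "0"; leading zero nibbles are skipped.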
*/ +static char *emit_hex(unsigned long h, char *out) +{ + static unsigned char hextab[16] = "0123456789abcdef"; + int shift = sizeof(h) * 8 - 4; + unsigned int idx; + + do { + idx = (h >> shift); + if (idx || !shift) + *out++ = hextab[idx & 15]; + shift -= 4; + } while (shift >= 0); + *out = '\0'; + return out; +} + +static void make_line(void *from, void *to, int level, char dir, long ret) +{ + char *p = line; + + if (unlikely(!log) && !open_trace()) + return; + + if (unlikely(!fast_time)) + gettimeofday(now_ptr, NULL); + +#ifdef USE_SLOW_FPRINTF + if (!use_tsc) + fprintf(log, "%u.%06u %d %p %c %p\n", + (unsigned int)now_ptr->tv_sec, + (unsigned int)now_ptr->tv_usec, + level, from, dir, to); + else + fprintf(log, "%llx %d %p %c %p\n", + rdtsc(), level, from, dir, to); + return; +#endif + + if (unlikely(!use_tsc)) { + /* "%u.%06u", tv_sec, tv_usec */ + p = ultoad2(now_ptr->tv_sec, p, 0); + *p++ = '.'; + p = ultoad2(now_ptr->tv_usec, p, 3); + } else { + /* "%08x%08x", high, low */ + unsigned long long t = rdtsc(); + if (sizeof(long) < sizeof(long long)) + p = emit_hex((unsigned long)(t >> 32U), p); + p = emit_hex((unsigned long)(t), p); + } + + /* " %u", level */ + *p++ = ' '; + p = ultoad2(level, p, 0); + + /* " %p", from */ + *p++ = ' '; *p++ = '0'; *p++ = 'x'; + p = emit_hex((unsigned long)from, p); + + /* " %c", dir */ + *p++ = ' '; *p++ = dir; + + /* " %p", to */ + *p++ = ' '; *p++ = '0'; *p++ = 'x'; + p = emit_hex((unsigned long)to, p); + + if (dir == '<') { + /* " %x", ret */ + *p++ = ' '; *p++ = '0'; *p++ = 'x'; + p = emit_hex(ret, p); + } + + *p++ = '\n'; + + fwrite(line, p - line, 1, log); +} + +/* These are the functions GCC calls */ +void __cyg_profile_func_enter(void *to, void *from) +{ + if (!disabled) + return make_line(from, to, ++level, '>', 0); +} + +void __cyg_profile_func_exit(void *to, void *from) +{ + long ret = 0; + +#if defined(__x86_64__) + /* on x86_64, the return value (rax) is temporarily stored in rbx + * during the call to __cyg_profile_func_exit() so we can snoop it. + */ + asm volatile("mov %%rbx, %0" : "=r"(ret)); +#endif + if (!disabled) + return make_line(from, to, level--, '<', ret); +} + +/* this one adds comments in the trace above. The output format is : + * <timestamp> <level> # <string> + */ +__attribute__((format(printf, 1, 2))) +void calltrace(char *fmt, ...) +{ + va_list ap; + + if (unlikely(!log) && !open_trace()) + return; + + if (unlikely(!fast_time)) + gettimeofday(now_ptr, NULL); + + if (!use_tsc) + fprintf(log, "%u.%06u %d # ", + (unsigned int)now_ptr->tv_sec, + (unsigned int)now_ptr->tv_usec, + level + 1); + else + fprintf(log, "%llx %d # ", + rdtsc(), level + 1); + + va_start(ap, fmt); + vfprintf(log, fmt, ap); + va_end(ap); + fputc('\n', log); + fflush(log); +} diff --git a/src/cbuf.c b/src/cbuf.c new file mode 100644 index 0000000..b36bbeb --- /dev/null +++ b/src/cbuf.c @@ -0,0 +1,59 @@ +/* + * Circular buffer management + * + * Copyright 2021 HAProxy Technologies, Frederic Lecaille <flecaill@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <haproxy/list.h> +#include <haproxy/pool.h> +#include <haproxy/cbuf-t.h> + +DECLARE_POOL(pool_head_cbuf, "cbuf", sizeof(struct cbuf)); + +/* Allocate and return a new circular buffer with <buf> as <sz> byte internal buffer + * on success, NULL otherwise. + */ +struct cbuf *cbuf_new(unsigned char *buf, size_t sz) +{ + struct cbuf *cbuf; + + cbuf = pool_alloc(pool_head_cbuf); + if (cbuf) { + cbuf->sz = sz; + cbuf->buf = buf; + cbuf->wr = 0; + cbuf->rd = 0; + } + + return cbuf; +} + +/* Free QUIC ring <cbuf> */ +void cbuf_free(struct cbuf *cbuf) +{ + if (!cbuf) + return; + + pool_free(pool_head_cbuf, cbuf); +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/cfgcond.c b/src/cfgcond.c new file mode 100644 index 0000000..117cf6c --- /dev/null +++ b/src/cfgcond.c @@ -0,0 +1,559 @@ +/* + * Configuration condition preprocessor + * + * Copyright 2000-2021 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/cfgcond.h> +#include <haproxy/global.h> +#include <haproxy/proto_tcp.h> +#include <haproxy/tools.h> + +/* supported condition predicates */ +const struct cond_pred_kw cond_predicates[] = { + { "defined", CFG_PRED_DEFINED, ARG1(1, STR) }, + { "feature", CFG_PRED_FEATURE, ARG1(1, STR) }, + { "streq", CFG_PRED_STREQ, ARG2(2, STR, STR) }, + { "strneq", CFG_PRED_STRNEQ, ARG2(2, STR, STR) }, + { "strstr", CFG_PRED_STRSTR, ARG2(2, STR, STR) }, + { "version_atleast", CFG_PRED_VERSION_ATLEAST, ARG1(1, STR) }, + { "version_before", CFG_PRED_VERSION_BEFORE, ARG1(1, STR) }, + { "openssl_version_atleast", CFG_PRED_OSSL_VERSION_ATLEAST, ARG1(1, STR) }, + { "openssl_version_before", CFG_PRED_OSSL_VERSION_BEFORE, ARG1(1, STR) }, + { "ssllib_name_startswith", CFG_PRED_SSLLIB_NAME_STARTSWITH, ARG1(1, STR) }, + { "enabled", CFG_PRED_ENABLED, ARG1(1, STR) }, + { NULL, CFG_PRED_NONE, 0 } +}; + +/* looks up a cond predicate matching the keyword in <str>, possibly followed + * by a parenthesis. Returns a pointer to it or NULL if not found. + */ +const struct cond_pred_kw *cfg_lookup_cond_pred(const char *str) +{ + const struct cond_pred_kw *ret; + int len = strcspn(str, " ("); + + for (ret = &cond_predicates[0]; ret->word; ret++) { + if (len != strlen(ret->word)) + continue; + if (strncmp(str, ret->word, len) != 0) + continue; + return ret; + } + return NULL; +} + +/* Frees <term> and its args. NULL is supported and does nothing. */ +void cfg_free_cond_term(struct cfg_cond_term *term) +{ + if (!term) + return; + + if (term->type == CCTT_PAREN) { + cfg_free_cond_expr(term->expr); + term->expr = NULL; + } + + free_args(term->args); + free(term->args); + free(term); +} + +/* Parse an indirect input text as a possible config condition term. + * Returns <0 on parsing error, 0 if the parser is desynchronized, or >0 on + * success. <term> is allocated and filled with the parsed info, and <text> + * is updated on success to point to the first unparsed character, or is left + * untouched on failure.
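+ * As illustrative examples, a term may be a bare integer ("0", "1"), a
+ * predicate call such as "defined(FOO)", or a parenthesized expression,
+ * optionally preceded by one or more '!' to negate it.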
On success, the caller must free <term> using + * cfg_free_cond_term(). An error will be set in <err> on error, and only + * in this case; the first bad character will then be reported in + * <errptr>. <maxdepth> corresponds to the maximum recursion depth permitted; + * it is decremented on each recursive call and the parsing will fail once + * it reaches <= 0. + */ +int cfg_parse_cond_term(const char **text, struct cfg_cond_term **term, char **err, const char **errptr, int maxdepth) +{ + struct cfg_cond_term *t; + const char *in = *text; + const char *end_ptr; + int err_arg; + int nbargs; + char *end; + long val; + + while (*in == ' ' || *in == '\t') + in++; + + if (!*in) /* empty term does not parse */ + return 0; + + *term = NULL; + if (maxdepth <= 0) + goto fail0; + + t = *term = calloc(1, sizeof(**term)); + if (!t) { + memprintf(err, "memory allocation error while parsing conditional expression '%s'", *text); + goto fail1; + } + + t->type = CCTT_NONE; + t->args = NULL; + t->neg = 0; + + /* !<term> negates the term. White spaces permitted */ + while (*in == '!') { + t->neg = !t->neg; + do { in++; } while (*in == ' ' || *in == '\t'); + } + + val = strtol(in, &end, 0); + if (end != in) { + t->type = val ? CCTT_TRUE : CCTT_FALSE; + *text = end; + return 1; + } + + /* Try to parse '(' EXPR ')' */ + if (*in == '(') { + int ret; + + t->type = CCTT_PAREN; + t->args = NULL; + + do { in++; } while (*in == ' ' || *in == '\t'); + ret = cfg_parse_cond_expr(&in, &t->expr, err, errptr, maxdepth - 1); + if (ret == -1) + goto fail2; + if (ret == 0) + goto fail0; + + /* find the closing ')' */ + while (*in == ' ' || *in == '\t') + in++; + if (*in != ')') { + memprintf(err, "expected ')' after conditional expression '%s'", *text); + goto fail1; + } + do { in++; } while (*in == ' ' || *in == '\t'); + *text = in; + return 1; + } + + /* below we'll likely call make_arg_list() so we must return only via + * the <fail> labels which free the arg list. + */ + t->pred = cfg_lookup_cond_pred(in); + if (t->pred) { + t->type = CCTT_PRED; + nbargs = make_arg_list(in + strlen(t->pred->word), -1, + t->pred->arg_mask, &t->args, err, + &end_ptr, &err_arg, NULL); + if (nbargs < 0) { + memprintf(err, "%s in argument %d of predicate '%s' used in conditional expression", *err, err_arg, t->pred->word); + if (errptr) + *errptr = end_ptr; + goto fail2; + } + *text = end_ptr; + return 1; + } + + fail0: + memprintf(err, "unparsable conditional expression '%s'", *text); + fail1: + if (errptr) + *errptr = *text; + fail2: + cfg_free_cond_term(*term); + *term = NULL; + return -1; +} + +/* evaluate an "enabled" expression. Only a subset of options are matched. It + * returns 1 if the option is enabled. 0 is returned if the option is not + * enabled or if it is not recognized.
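+ * A configuration sketch (illustrative, using an option name matched below):
+ *   .if enabled(SPLICE)
+ *       ... directives only kept when splicing is in use ...
+ *   .endif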
+ */ +static int cfg_eval_cond_enabled(const char *str) +{ + if (strcmp(str, "POLL") == 0) + return !!(global.tune.options & GTUNE_USE_POLL); + else if (strcmp(str, "EPOLL") == 0) + return !!(global.tune.options & GTUNE_USE_EPOLL); + else if (strcmp(str, "KQUEUE") == 0) + return !!(global.tune.options & GTUNE_USE_KQUEUE); + else if (strcmp(str, "EVPORTS") == 0) + return !!(global.tune.options & GTUNE_USE_EVPORTS); + else if (strcmp(str, "SPLICE") == 0) + return !!(global.tune.options & GTUNE_USE_SPLICE); + else if (strcmp(str, "GETADDRINFO") == 0) + return !!(global.tune.options & GTUNE_USE_GAI); + else if (strcmp(str, "REUSEPORT") == 0) + return !!(proto_tcpv4.flags & PROTO_F_REUSEPORT_SUPPORTED); + else if (strcmp(str, "FAST-FORWARD") == 0) + return !!(global.tune.options & GTUNE_USE_FAST_FWD); + else if (strcmp(str, "SERVER-SSL-VERIFY-NONE") == 0) + return !!(global.ssl_server_verify == SSL_SERVER_VERIFY_NONE); + return 0; +} + +/* evaluate a condition term on a .if/.elif line. The condition was already + * parsed in <term>. Returns -1 on error (in which case err is filled with a + * message, and only in this case), 0 if the condition is false, 1 if it's + * true. + */ +int cfg_eval_cond_term(const struct cfg_cond_term *term, char **err) +{ + int ret = -1; + + if (term->type == CCTT_FALSE) + ret = 0; + else if (term->type == CCTT_TRUE) + ret = 1; + else if (term->type == CCTT_PRED) { + /* here we know we have a valid predicate with valid arguments + * placed in term->args (which the caller will free). + */ + switch (term->pred->prd) { + case CFG_PRED_DEFINED: // checks if arg exists as an environment variable + ret = getenv(term->args[0].data.str.area) != NULL; + break; + + case CFG_PRED_FEATURE: { // checks if the arg matches an enabled feature + const char *p; + + ret = 0; // assume feature not found + for (p = build_features; (p = strstr(p, term->args[0].data.str.area)); p++) { + if (p > build_features && + (p[term->args[0].data.str.data] == ' ' || + p[term->args[0].data.str.data] == 0)) { + if (*(p-1) == '+') { // e.g. "+OPENSSL" + ret = 1; + break; + } + else if (*(p-1) == '-') { // e.g.
"-OPENSSL" + ret = 0; + break; + } + /* it was a sub-word, let's restart from next place */ + } + } + break; + } + case CFG_PRED_STREQ: // checks if the two arg are equal + ret = strcmp(term->args[0].data.str.area, term->args[1].data.str.area) == 0; + break; + + case CFG_PRED_STRNEQ: // checks if the two arg are different + ret = strcmp(term->args[0].data.str.area, term->args[1].data.str.area) != 0; + break; + + case CFG_PRED_STRSTR: // checks if the 2nd arg is found in the first one + ret = strstr(term->args[0].data.str.area, term->args[1].data.str.area) != NULL; + break; + + case CFG_PRED_VERSION_ATLEAST: // checks if the current version is at least this one + ret = compare_current_version(term->args[0].data.str.area) <= 0; + break; + + case CFG_PRED_VERSION_BEFORE: // checks if the current version is older than this one + ret = compare_current_version(term->args[0].data.str.area) > 0; + break; + + case CFG_PRED_OSSL_VERSION_ATLEAST: { // checks if the current openssl version is at least this one + int opensslret = openssl_compare_current_version(term->args[0].data.str.area); + + if (opensslret < -1) /* can't parse the string or no openssl available */ + ret = -1; + else + ret = opensslret <= 0; + break; + } + case CFG_PRED_OSSL_VERSION_BEFORE: { // checks if the current openssl version is older than this one + int opensslret = openssl_compare_current_version(term->args[0].data.str.area); + + if (opensslret < -1) /* can't parse the string or no openssl available */ + ret = -1; + else + ret = opensslret > 0; + break; + } + case CFG_PRED_SSLLIB_NAME_STARTSWITH: { // checks if the current SSL library's name starts with a specified string (can be used to distinguish OpenSSL from LibreSSL or BoringSSL) + ret = openssl_compare_current_name(term->args[0].data.str.area) == 0; + break; + } + case CFG_PRED_ENABLED: { // checks if the arg matches on a subset of enabled options + ret = cfg_eval_cond_enabled(term->args[0].data.str.area) != 0; + break; + } + default: + memprintf(err, "internal error: unhandled conditional expression predicate '%s'", term->pred->word); + break; + } + } + else if (term->type == CCTT_PAREN) { + ret = cfg_eval_cond_expr(term->expr, err); + } + else { + memprintf(err, "internal error: unhandled condition term type %d", (int)term->type); + } + + if (ret >= 0 && term->neg) + ret = !ret; + return ret; +} + + +/* Frees <expr> and its terms and args. NULL is supported and does nothing. */ +void cfg_free_cond_and(struct cfg_cond_and *expr) +{ + struct cfg_cond_and *prev; + + while (expr) { + cfg_free_cond_term(expr->left); + prev = expr; + expr = expr->right; + free(prev); + } +} + +/* Frees <expr> and its terms and args. NULL is supported and does nothing. */ +void cfg_free_cond_expr(struct cfg_cond_expr *expr) +{ + struct cfg_cond_expr *prev; + + while (expr) { + cfg_free_cond_and(expr->left); + prev = expr; + expr = expr->right; + free(prev); + } +} + +/* Parse an indirect input text as a possible config condition sub-expr. + * Returns <0 on parsing error, 0 if the parser is desynchronized, or >0 on + * success. <expr> is filled with the parsed info, and <text> is updated on + * success to point to the first unparsed character, or is left untouched + * on failure. On success, the caller will have to free all lower-level + * allocated structs using cfg_free_cond_expr(). An error will be set in + * <err> on error, and only in this case. In this case the first bad + * character will be reported in <errptr>. 
<maxdepth> corresponds to the + * maximum recursion depth permitted; it is decremented on each recursive + * call and the parsing will fail once it reaches <= 0. + */ +int cfg_parse_cond_and(const char **text, struct cfg_cond_and **expr, char **err, const char **errptr, int maxdepth) +{ + struct cfg_cond_and *e; + const char *in = *text; + int ret = -1; + + if (!*in) /* empty expr does not parse */ + return 0; + + *expr = NULL; + if (maxdepth <= 0) { + memprintf(err, "unparsable conditional sub-expression '%s'", in); + if (errptr) + *errptr = in; + goto done; + } + + e = *expr = calloc(1, sizeof(**expr)); + if (!e) { + memprintf(err, "memory allocation error while parsing conditional expression '%s'", *text); + goto done; + } + + ret = cfg_parse_cond_term(&in, &e->left, err, errptr, maxdepth - 1); + if (ret == -1) // parse error, error already reported + goto done; + + if (ret == 0) { + /* ret == 0, no other way to parse this */ + memprintf(err, "unparsable conditional sub-expression '%s'", in); + if (errptr) + *errptr = in; + ret = -1; + goto done; + } + + /* ret=1, we have a term in the left hand set */ + + /* find an optional '&&' */ + while (*in == ' ' || *in == '\t') + in++; + + *text = in; + if (in[0] != '&' || in[1] != '&') + goto done; + + /* we have a '&&', let's parse the right-hand sub-expression */ + in += 2; + while (*in == ' ' || *in == '\t') + in++; + + ret = cfg_parse_cond_and(&in, &e->right, err, errptr, maxdepth - 1); + if (ret > 0) + *text = in; + done: + if (ret < 0) { + cfg_free_cond_and(*expr); + *expr = NULL; + } + return ret; +} + +/* Parse an indirect input text as a possible config condition expression. + * Returns <0 on parsing error, 0 if the parser is desynchronized, or >0 on + * success. <expr> is filled with the parsed info, and <text> is updated on + * success to point to the first unparsed character, or is left untouched + * on failure. On success, the caller will have to free all lower-level + * allocated structs using cfg_free_cond_expr(). An error will be set in + * <err> on error, and only in this case; the first bad character will then + * be reported in <errptr>. <maxdepth> corresponds to the + * maximum recursion depth permitted; it is decremented on each recursive call + * and the parsing will fail once it reaches <= 0.
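+ * For example (illustrative), "feature(OPENSSL) || feature(PCRE2)" parses
+ * as two AND-level sub-expressions joined by '||'.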
+ */ +int cfg_parse_cond_expr(const char **text, struct cfg_cond_expr **expr, char **err, const char **errptr, int maxdepth) +{ + struct cfg_cond_expr *e; + const char *in = *text; + int ret = -1; + + if (!*in) /* empty expr does not parse */ + return 0; + + *expr = NULL; + if (maxdepth <= 0) { + memprintf(err, "unparsable conditional expression '%s'", in); + if (errptr) + *errptr = in; + goto done; + } + + e = *expr = calloc(1, sizeof(**expr)); + if (!e) { + memprintf(err, "memory allocation error while parsing conditional expression '%s'", *text); + goto done; + } + + ret = cfg_parse_cond_and(&in, &e->left, err, errptr, maxdepth - 1); + if (ret == -1) // parse error, error already reported + goto done; + + if (ret == 0) { + /* ret == 0, no other way to parse this */ + memprintf(err, "unparsable conditional expression '%s'", in); + if (errptr) + *errptr = in; + ret = -1; + goto done; + } + + /* ret=1, we have a sub-expr in the left hand set */ + + /* find an optional '||' */ + while (*in == ' ' || *in == '\t') + in++; + + *text = in; + if (in[0] != '|' || in[1] != '|') + goto done; + + /* we have a '||', let's parse the right-hand sub-expression */ + in += 2; + while (*in == ' ' || *in == '\t') + in++; + + ret = cfg_parse_cond_expr(&in, &e->right, err, errptr, maxdepth - 1); + if (ret > 0) + *text = in; + done: + if (ret < 0) { + cfg_free_cond_expr(*expr); + *expr = NULL; + } + return ret; +} + +/* evaluate a sub-expression on a .if/.elif line. The expression is valid and + * was already parsed in <expr>. Returns -1 on error (in which case err is + * filled with a message, and only in this case), 0 if the condition is false, + * 1 if it's true. + */ +int cfg_eval_cond_and(struct cfg_cond_and *expr, char **err) +{ + int ret; + + /* AND: loop on terms and sub-exp's terms as long as they're TRUE + * (stop on FALSE and ERROR). + */ + while ((ret = cfg_eval_cond_term(expr->left, err)) > 0 && expr->right) + expr = expr->right; + return ret; +} + +/* evaluate an expression on a .if/.elif line. The expression is valid and was + * already parsed in <expr>. Returns -1 on error (in which case err is filled + * with a message, and only in this case), 0 if the condition is false, 1 if + * it's true. + */ +int cfg_eval_cond_expr(struct cfg_cond_expr *expr, char **err) +{ + int ret; + + /* OR: loop on sub-exps as long as they're FALSE (stop on TRUE and ERROR) */ + while ((ret = cfg_eval_cond_and(expr->left, err)) == 0 && expr->right) + expr = expr->right; + return ret; +} + +/* evaluate a condition on a .if/.elif line. The condition is already tokenized + * in <args>. Returns -1 on error (in which case err is filled with a message, + * and only in this case), 0 if the condition is false, 1 if it's true. If + * <errptr> is not NULL, it's set to the first invalid character on error.
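+ * E.g. a configuration line such as ".if version_atleast(2.9) && defined(X)"
+ * (illustrative) lands here with the whole condition in args[0].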
+ */ +int cfg_eval_condition(char **args, char **err, const char **errptr) +{ + struct cfg_cond_expr *expr = NULL; + const char *text = args[0]; + int ret = -1; + + if (!*text) /* note: empty = false */ + return 0; + + ret = cfg_parse_cond_expr(&text, &expr, err, errptr, MAX_CFG_RECURSION); + if (ret != 0) { + if (ret == -1) // parse error, error already reported + goto done; + while (*text == ' ' || *text == '\t') + text++; + + if (*text) { + ret = -1; + memprintf(err, "unexpected character '%c' at the end of conditional expression '%s'", + *text, args[0]); + goto fail; + } + + ret = cfg_eval_cond_expr(expr, err); + goto done; + } + + /* ret == 0, no other way to parse this */ + ret = -1; + memprintf(err, "unparsable conditional expression '%s'", args[0]); + fail: + if (errptr) + *errptr = text; + done: + cfg_free_cond_expr(expr); + return ret; +} diff --git a/src/cfgdiag.c b/src/cfgdiag.c new file mode 100644 index 0000000..f8e4a9e --- /dev/null +++ b/src/cfgdiag.c @@ -0,0 +1,97 @@ +#include <stdarg.h> +#include <stdlib.h> + +#include <import/ebistree.h> + +#include <haproxy/cfgdiag.h> +#include <haproxy/log.h> +#include <haproxy/proxy.h> +#include <haproxy/server.h> + +/* Use this function to emit a diagnostic. + * This can be used as a shortcut to set the value pointed to by <ret> to 1 at + * the same time. + */ +static inline void diag_warning(int *ret, char *fmt, ...) +{ + va_list argp; + + va_start(argp, fmt); + *ret = 1; + _ha_vdiag_warning(fmt, argp); + va_end(argp); +} + +/* Use this for dynamic allocation in diagnostics. + * In case of allocation failure, this will immediately terminate haproxy. + */ +static inline void *diag_alloc(size_t size) +{ + void *out = NULL; + + if (!(out = malloc(size))) { + fprintf(stderr, "out of memory\n"); + exit(1); + } + + return out; +} + +/* Checks that two servers from the same backend do not share the same cookie + * value. Backup servers are not taken into account as it can be quite common to + * share cookie values in this case. + */ +static void check_server_cookies(int *ret) +{ + struct cookie_entry { + struct ebpt_node node; + }; + + struct proxy *px; + struct server *srv; + + struct eb_root cookies_tree = EB_ROOT_UNIQUE; + struct ebpt_node *cookie_node; + struct cookie_entry *cookie_entry; + struct ebpt_node *node; + + for (px = proxies_list; px; px = px->next) { + for (srv = px->srv; srv; srv = srv->next) { + /* do not take into account backup servers */ + if (!srv->cookie || (srv->flags & SRV_F_BACKUP)) + continue; + + cookie_node = ebis_lookup(&cookies_tree, srv->cookie); + if (cookie_node) { + diag_warning(ret, "parsing [%s:%d] : 'server %s' : same cookie value is set for a previous non-backup server in the same backend, it may break connection persistence\n", + srv->conf.file, srv->conf.line, srv->id); + continue; + } + + cookie_entry = diag_alloc(sizeof(*cookie_entry)); + cookie_entry->node.key = srv->cookie; + ebis_insert(&cookies_tree, &cookie_entry->node); + } + + /* clear the tree and free its entries */ + while ((node = ebpt_first(&cookies_tree))) { + cookie_entry = ebpt_entry(node, struct cookie_entry, node); + eb_delete(&node->node); + free(cookie_entry); + } + } +} + +/* Placeholder to execute various diagnostic checks after the configuration file + * has been fully parsed. It will output a warning for each diagnostic found. + * + * Returns 0 if no diagnostic message has been found, otherwise 1.
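+ * For example (illustrative), two non-backup servers of the same backend both
+ * declared with "cookie app1" would trigger the warning emitted by
+ * check_server_cookies() above.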
+ */ +int cfg_run_diagnostics() +{ + int ret = 0; + + check_server_cookies(&ret); + + return ret; +} diff --git a/src/cfgparse-global.c b/src/cfgparse-global.c new file mode 100644 index 0000000..f31e7a0 --- /dev/null +++ b/src/cfgparse-global.c @@ -0,0 +1,1396 @@ +#define _GNU_SOURCE /* for cpu_set_t from haproxy/cpuset.h */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <netdb.h> +#include <ctype.h> +#include <pwd.h> +#include <grp.h> +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +#include <import/sha1.h> + +#include <haproxy/buf.h> +#include <haproxy/cfgparse.h> +#ifdef USE_CPU_AFFINITY +#include <haproxy/cpuset.h> +#endif +#include <haproxy/compression.h> +#include <haproxy/global.h> +#include <haproxy/log.h> +#include <haproxy/peers.h> +#include <haproxy/protocol.h> +#include <haproxy/tools.h> + +int cluster_secret_isset; + +/* some keywords that are still being parsed using strcmp() and are not + * registered anywhere. They are used as suggestions for mistyped words. + */ +static const char *common_kw_list[] = { + "global", "daemon", "master-worker", "noepoll", "nokqueue", + "noevports", "nopoll", "busy-polling", "set-dumpable", + "insecure-fork-wanted", "insecure-setuid-wanted", "nosplice", + "nogetaddrinfo", "noreuseport", "quiet", "zero-warning", + "tune.runqueue-depth", "tune.maxpollevents", "tune.maxaccept", + "tune.recv_enough", "tune.buffers.limit", + "tune.buffers.reserve", "tune.bufsize", "tune.maxrewrite", + "tune.idletimer", "tune.rcvbuf.client", "tune.rcvbuf.server", + "tune.sndbuf.client", "tune.sndbuf.server", "tune.pipesize", + "tune.http.cookielen", "tune.http.logurilen", "tune.http.maxhdr", + "tune.comp.maxlevel", "tune.pattern.cache-size", + "tune.fast-forward", "uid", "gid", + "external-check", "user", "group", "nbproc", "maxconn", + "ssl-server-verify", "maxconnrate", "maxsessrate", "maxsslrate", + "maxcomprate", "maxpipes", "maxzlibmem", "maxcompcpuusage", "ulimit-n", + "chroot", "description", "node", "pidfile", "unix-bind", "log", + "log-send-hostname", "server-state-base", "server-state-file", + "log-tag", "spread-checks", "max-spread-checks", "cpu-map", "setenv", + "presetenv", "unsetenv", "resetenv", "strict-limits", "localpeer", + "numa-cpu-mapping", "defaults", "listen", "frontend", "backend", + "peers", "resolvers", "cluster-secret", "no-quic", "limited-quic", + NULL /* must be last */ +}; + +/* + * parse a line in a <global> section. Returns the error code, 0 if OK, or + * any combination of : + * - ERR_ABORT: must abort ASAP + * - ERR_FATAL: we can continue parsing but not start the service + * - ERR_WARN: a warning has been emitted + * - ERR_ALERT: an alert has been emitted + * Only the first two can stop processing; the other two are just + * indicators.
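+ * For example (illustrative, see the handlers below): an invalid or unknown
+ * value typically sets ERR_ALERT | ERR_FATAL, while a merely duplicated
+ * setting only sets ERR_ALERT.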
+ */ +int cfg_parse_global(const char *file, int linenum, char **args, int kwm) +{ + int err_code = 0; + char *errmsg = NULL; + + if (strcmp(args[0], "global") == 0) { /* new section */ + /* no option, nothing special to do */ + alertif_too_many_args(0, file, linenum, args, &err_code); + goto out; + } + else if (strcmp(args[0], "expose-experimental-directives") == 0) { + experimental_directives_allowed = 1; + } + else if (strcmp(args[0], "daemon") == 0) { + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + global.mode |= MODE_DAEMON; + } + else if (strcmp(args[0], "master-worker") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*args[1]) { + if (strcmp(args[1], "no-exit-on-failure") == 0) { + global.tune.options |= GTUNE_NOEXIT_ONFAILURE; + } else { + ha_alert("parsing [%s:%d] : '%s' only supports 'no-exit-on-failure' option.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + global.mode |= MODE_MWORKER; + } + else if (strcmp(args[0], "noepoll") == 0) { + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + global.tune.options &= ~GTUNE_USE_EPOLL; + } + else if (strcmp(args[0], "nokqueue") == 0) { + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + global.tune.options &= ~GTUNE_USE_KQUEUE; + } + else if (strcmp(args[0], "noevports") == 0) { + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + global.tune.options &= ~GTUNE_USE_EVPORTS; + } + else if (strcmp(args[0], "nopoll") == 0) { + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + global.tune.options &= ~GTUNE_USE_POLL; + } + else if (strcmp(args[0], "limited-quic") == 0) { + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + + global.tune.options |= GTUNE_LIMITED_QUIC; + } + else if (strcmp(args[0], "no-quic") == 0) { + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + + global.tune.options |= GTUNE_NO_QUIC; + } + else if (strcmp(args[0], "busy-polling") == 0) { /* "no busy-polling" or "busy-polling" */ + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + if (kwm == KWM_NO) + global.tune.options &= ~GTUNE_BUSY_POLLING; + else + global.tune.options |= GTUNE_BUSY_POLLING; + } + else if (strcmp(args[0], "set-dumpable") == 0) { /* "no set-dumpable" or "set-dumpable" */ + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + if (kwm == KWM_NO) + global.tune.options &= ~GTUNE_SET_DUMPABLE; + else + global.tune.options |= GTUNE_SET_DUMPABLE; + } + else if (strcmp(args[0], "h2-workaround-bogus-websocket-clients") == 0) { /* "no h2-workaround-bogus-websocket-clients" or "h2-workaround-bogus-websocket-clients" */ + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + if (kwm == KWM_NO) + global.tune.options &= ~GTUNE_DISABLE_H2_WEBSOCKET; + else + global.tune.options |= GTUNE_DISABLE_H2_WEBSOCKET; + } + else if (strcmp(args[0], "insecure-fork-wanted") == 0) { /* "no insecure-fork-wanted" or "insecure-fork-wanted" */ + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + if (kwm == KWM_NO) + global.tune.options &= ~GTUNE_INSECURE_FORK; + else + global.tune.options |= GTUNE_INSECURE_FORK; + } + else if (strcmp(args[0], "insecure-setuid-wanted") == 0) { /* "no insecure-setuid-wanted" or "insecure-setuid-wanted" */ + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + if (kwm == 
KWM_NO) + global.tune.options &= ~GTUNE_INSECURE_SETUID; + else + global.tune.options |= GTUNE_INSECURE_SETUID; + } + else if (strcmp(args[0], "nosplice") == 0) { + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + global.tune.options &= ~GTUNE_USE_SPLICE; + } + else if (strcmp(args[0], "nogetaddrinfo") == 0) { + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + global.tune.options &= ~GTUNE_USE_GAI; + } + else if (strcmp(args[0], "noreuseport") == 0) { + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + protocol_clrf_all(PROTO_F_REUSEPORT_SUPPORTED); + } + else if (strcmp(args[0], "quiet") == 0) { + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + global.mode |= MODE_QUIET; + } + else if (strcmp(args[0], "zero-warning") == 0) { + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + global.mode |= MODE_ZERO_WARNING; + } + else if (strcmp(args[0], "tune.runqueue-depth") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.tune.runqueue_depth != 0) { + ha_alert("parsing [%s:%d] : '%s' already specified. Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.tune.runqueue_depth = atol(args[1]); + + } + else if (strcmp(args[0], "tune.maxpollevents") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.tune.maxpollevents != 0) { + ha_alert("parsing [%s:%d] : '%s' already specified. Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.tune.maxpollevents = atol(args[1]); + } + else if (strcmp(args[0], "tune.maxaccept") == 0) { + long max; + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.tune.maxaccept != 0) { + ha_alert("parsing [%s:%d] : '%s' already specified. 
Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + max = atol(args[1]); + if (/*max < -1 || */max > INT_MAX) { + ha_alert("parsing [%s:%d] : '%s' expects -1 or an integer from 0 to INT_MAX.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.tune.maxaccept = max; + } + else if (strcmp(args[0], "tune.chksize") == 0) { + ha_alert("parsing [%s:%d]: option '%s' is not supported any more (tune.bufsize is used instead).\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "tune.recv_enough") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.tune.recv_enough = atol(args[1]); + } + else if (strcmp(args[0], "tune.buffers.limit") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.tune.buf_limit = atol(args[1]); + if (global.tune.buf_limit) { + if (global.tune.buf_limit < 3) + global.tune.buf_limit = 3; + if (global.tune.buf_limit <= global.tune.reserved_bufs) + global.tune.buf_limit = global.tune.reserved_bufs + 1; + } + } + else if (strcmp(args[0], "tune.buffers.reserve") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.tune.reserved_bufs = atol(args[1]); + if (global.tune.reserved_bufs < 2) + global.tune.reserved_bufs = 2; + if (global.tune.buf_limit && global.tune.buf_limit <= global.tune.reserved_bufs) + global.tune.buf_limit = global.tune.reserved_bufs + 1; + } + else if (strcmp(args[0], "tune.bufsize") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.tune.bufsize = atol(args[1]); + /* round it up to support a two-pointer alignment at the end */ + global.tune.bufsize = (global.tune.bufsize + 2 * sizeof(void *) - 1) & -(2 * sizeof(void *)); + if (global.tune.bufsize <= 0) { + ha_alert("parsing [%s:%d] : '%s' expects a positive integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "tune.maxrewrite") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.tune.maxrewrite = atol(args[1]); + if (global.tune.maxrewrite < 0) { + ha_alert("parsing [%s:%d] : '%s' expects a positive integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "tune.idletimer") == 0) { + unsigned int idle; + const char *res; + + if 
(alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects a timer value between 0 and 65535 ms.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + res = parse_time_err(args[1], &idle, TIME_UNIT_MS); + if (res == PARSE_TIME_OVER) { + ha_alert("parsing [%s:%d]: timer overflow in argument <%s> to <%s>, maximum value is 65535 ms.\n", + file, linenum, args[1], args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (res == PARSE_TIME_UNDER) { + ha_alert("parsing [%s:%d]: timer underflow in argument <%s> to <%s>, minimum non-null value is 1 ms.\n", + file, linenum, args[1], args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (res) { + ha_alert("parsing [%s:%d]: unexpected character '%c' in argument to <%s>.\n", + file, linenum, *res, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (idle > 65535) { + ha_alert("parsing [%s:%d] : '%s' expects a timer value between 0 and 65535 ms.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.tune.idle_timer = idle; + } + else if (strcmp(args[0], "tune.rcvbuf.client") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.tune.client_rcvbuf != 0) { + ha_alert("parsing [%s:%d] : '%s' already specified. Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.tune.client_rcvbuf = atol(args[1]); + } + else if (strcmp(args[0], "tune.rcvbuf.server") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.tune.server_rcvbuf != 0) { + ha_alert("parsing [%s:%d] : '%s' already specified. Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.tune.server_rcvbuf = atol(args[1]); + } + else if (strcmp(args[0], "tune.sndbuf.client") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.tune.client_sndbuf != 0) { + ha_alert("parsing [%s:%d] : '%s' already specified. Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.tune.client_sndbuf = atol(args[1]); + } + else if (strcmp(args[0], "tune.sndbuf.server") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.tune.server_sndbuf != 0) { + ha_alert("parsing [%s:%d] : '%s' already specified. 
Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.tune.server_sndbuf = atol(args[1]); + } + else if (strcmp(args[0], "tune.pipesize") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.tune.pipesize = atol(args[1]); + } + else if (strcmp(args[0], "tune.http.cookielen") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.tune.cookie_len = atol(args[1]) + 1; + } + else if (strcmp(args[0], "tune.http.logurilen") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.tune.requri_len = atol(args[1]) + 1; + } + else if (strcmp(args[0], "tune.http.maxhdr") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.tune.max_http_hdr = atoi(args[1]); + if (global.tune.max_http_hdr < 1 || global.tune.max_http_hdr > 32767) { + ha_alert("parsing [%s:%d] : '%s' expects a numeric value between 1 and 32767\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "tune.comp.maxlevel") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*args[1]) { + global.tune.comp_maxlevel = atoi(args[1]); + if (global.tune.comp_maxlevel < 1 || global.tune.comp_maxlevel > 9) { + ha_alert("parsing [%s:%d] : '%s' expects a numeric value between 1 and 9\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } else { + ha_alert("parsing [%s:%d] : '%s' expects a numeric value between 1 and 9\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "tune.pattern.cache-size") == 0) { + if (*args[1]) { + global.tune.pattern_cache = atoi(args[1]); + if (global.tune.pattern_cache < 0) { + ha_alert("parsing [%s:%d] : '%s' expects a positive numeric value\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } else { + ha_alert("parsing [%s:%d] : '%s' expects a positive numeric value\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "tune.disable-fast-forward") == 0) { + if (!experimental_directives_allowed) { + ha_alert("parsing [%s:%d] : '%s' directive is experimental, must be allowed via a global 'expose-experimental-directives'", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + mark_tainted(TAINTED_CONFIG_EXP_KW_DECLARED); + + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + global.tune.options &= ~GTUNE_USE_FAST_FWD; + } + else if (strcmp(args[0], 
"tune.disable-zero-copy-forwarding") == 0) { + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + global.tune.no_zero_copy_fwd |= NO_ZERO_COPY_FWD; + } + else if (strcmp(args[0], "cluster-secret") == 0) { + blk_SHA_CTX sha1_ctx; + unsigned char sha1_out[20]; + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*args[1] == 0) { + ha_alert("parsing [%s:%d] : expects an ASCII string argument.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (cluster_secret_isset) { + ha_alert("parsing [%s:%d] : '%s' already specified. Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + + blk_SHA1_Init(&sha1_ctx); + blk_SHA1_Update(&sha1_ctx, args[1], strlen(args[1])); + blk_SHA1_Final(sha1_out, &sha1_ctx); + BUG_ON(sizeof sha1_out < sizeof global.cluster_secret); + memcpy(global.cluster_secret, sha1_out, sizeof global.cluster_secret); + cluster_secret_isset = 1; + } + else if (strcmp(args[0], "uid") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.uid != 0) { + ha_alert("parsing [%s:%d] : user/uid already specified. Continuing.\n", file, linenum); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (strl2irc(args[1], strlen(args[1]), &global.uid) != 0) { + ha_warning("parsing [%s:%d] : uid: string '%s' is not a number.\n | You might want to use the 'user' parameter to use a system user name.\n", file, linenum, args[1]); + err_code |= ERR_WARN; + goto out; + } + + } + else if (strcmp(args[0], "gid") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.gid != 0) { + ha_alert("parsing [%s:%d] : group/gid already specified. Continuing.\n", file, linenum); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (strl2irc(args[1], strlen(args[1]), &global.gid) != 0) { + ha_warning("parsing [%s:%d] : gid: string '%s' is not a number.\n | You might want to use the 'group' parameter to use a system group name.\n", file, linenum, args[1]); + err_code |= ERR_WARN; + goto out; + } + } + else if (strcmp(args[0], "external-check") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + global.external_check = 1; + if (strcmp(args[1], "preserve-env") == 0) { + global.external_check = 2; + } else if (*args[1]) { + ha_alert("parsing [%s:%d] : '%s' only supports 'preserve-env' as an argument, found '%s'.\n", file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + /* user/group name handling */ + else if (strcmp(args[0], "user") == 0) { + struct passwd *ha_user; + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.uid != 0) { + ha_alert("parsing [%s:%d] : user/uid already specified. 
Continuing.\n", file, linenum); + err_code |= ERR_ALERT; + goto out; + } + errno = 0; + ha_user = getpwnam(args[1]); + if (ha_user != NULL) { + global.uid = (int)ha_user->pw_uid; + } + else { + ha_alert("parsing [%s:%d] : cannot find user id for '%s' (%d:%s)\n", file, linenum, args[1], errno, strerror(errno)); + err_code |= ERR_ALERT | ERR_FATAL; + } + } + else if (strcmp(args[0], "group") == 0) { + struct group *ha_group; + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.gid != 0) { + ha_alert("parsing [%s:%d] : gid/group was already specified. Continuing.\n", file, linenum); + err_code |= ERR_ALERT; + goto out; + } + errno = 0; + ha_group = getgrnam(args[1]); + if (ha_group != NULL) { + global.gid = (int)ha_group->gr_gid; + } + else { + ha_alert("parsing [%s:%d] : cannot find group id for '%s' (%d:%s)\n", file, linenum, args[1], errno, strerror(errno)); + err_code |= ERR_ALERT | ERR_FATAL; + } + } + /* end of user/group name handling*/ + else if (strcmp(args[0], "nbproc") == 0) { + ha_alert("parsing [%s:%d] : nbproc is not supported any more since HAProxy 2.5. Threads will automatically be used on multi-processor machines if available.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "maxconn") == 0) { + char *stop; + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.maxconn != 0) { + ha_alert("parsing [%s:%d] : '%s' already specified. Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.maxconn = strtol(args[1], &stop, 10); + if (*stop != '\0') { + ha_alert("parsing [%s:%d] : cannot parse '%s' value '%s', an integer is expected.\n", file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } +#ifdef SYSTEM_MAXCONN + if (global.maxconn > SYSTEM_MAXCONN && cfg_maxconn <= SYSTEM_MAXCONN) { + ha_alert("parsing [%s:%d] : maxconn value %d too high for this system.\nLimiting to %d. Please use '-n' to force the value.\n", file, linenum, global.maxconn, SYSTEM_MAXCONN); + global.maxconn = SYSTEM_MAXCONN; + err_code |= ERR_ALERT; + } +#endif /* SYSTEM_MAXCONN */ + } + else if (strcmp(args[0], "ssl-server-verify") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (strcmp(args[1],"none") == 0) + global.ssl_server_verify = SSL_SERVER_VERIFY_NONE; + else if (strcmp(args[1],"required") == 0) + global.ssl_server_verify = SSL_SERVER_VERIFY_REQUIRED; + else { + ha_alert("parsing [%s:%d] : '%s' expects 'none' or 'required' as argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "maxconnrate") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.cps_lim != 0) { + ha_alert("parsing [%s:%d] : '%s' already specified. 
Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.cps_lim = atol(args[1]); + } + else if (strcmp(args[0], "maxsessrate") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.sps_lim != 0) { + ha_alert("parsing [%s:%d] : '%s' already specified. Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.sps_lim = atol(args[1]); + } + else if (strcmp(args[0], "maxsslrate") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.ssl_lim != 0) { + ha_alert("parsing [%s:%d] : '%s' already specified. Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.ssl_lim = atol(args[1]); + } + else if (strcmp(args[0], "maxcomprate") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument in kb/s.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.comp_rate_lim = atoi(args[1]) * 1024; + } + else if (strcmp(args[0], "maxpipes") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.maxpipes != 0) { + ha_alert("parsing [%s:%d] : '%s' already specified. Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.maxpipes = atol(args[1]); + } + else if (strcmp(args[0], "maxzlibmem") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.maxzlibmem = atol(args[1]) * 1024L * 1024L; + } + else if (strcmp(args[0], "maxcompcpuusage") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument between 0 and 100.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + compress_min_idle = 100 - atoi(args[1]); + if (compress_min_idle > 100) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument between 0 and 100.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "fd-hard-limit") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.fd_hard_limit != 0) { + ha_alert("parsing [%s:%d] : '%s' already specified. 
Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.fd_hard_limit = atol(args[1]); + } + else if (strcmp(args[0], "ulimit-n") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.rlimit_nofile != 0) { + ha_alert("parsing [%s:%d] : '%s' already specified. Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.rlimit_nofile = atol(args[1]); + } + else if (strcmp(args[0], "chroot") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.chroot != NULL) { + ha_alert("parsing [%s:%d] : '%s' already specified. Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects a directory as an argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.chroot = strdup(args[1]); + } + else if (strcmp(args[0], "description") == 0) { + int i, len=0; + char *d; + + if (!*args[1]) { + ha_alert("parsing [%s:%d]: '%s' expects a string argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + for (i = 1; *args[i]; i++) + len += strlen(args[i]) + 1; + + if (global.desc) + free(global.desc); + + global.desc = d = calloc(1, len); + + d += snprintf(d, global.desc + len - d, "%s", args[1]); + for (i = 2; *args[i]; i++) + d += snprintf(d, global.desc + len - d, " %s", args[i]); + } + else if (strcmp(args[0], "node") == 0) { + int i; + char c; + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + + for (i=0; args[1][i]; i++) { + c = args[1][i]; + if (!isupper((unsigned char)c) && !islower((unsigned char)c) && + !isdigit((unsigned char)c) && c != '_' && c != '-' && c != '.') + break; + } + + if (!i || args[1][i]) { + ha_alert("parsing [%s:%d]: '%s' requires valid node name - non-empty string" + " with digits(0-9), letters(A-Z, a-z), dot(.), hyphen(-) or underscode(_).\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (global.node) + free(global.node); + + global.node = strdup(args[1]); + } + else if (strcmp(args[0], "pidfile") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.pidfile != NULL) { + ha_alert("parsing [%s:%d] : '%s' already specified. Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects a file name as an argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.pidfile = strdup(args[1]); + } + else if (strcmp(args[0], "unix-bind") == 0) { + int cur_arg = 1; + while (*(args[cur_arg])) { + if (strcmp(args[cur_arg], "prefix") == 0) { + if (global.unix_bind.prefix != NULL) { + ha_alert("parsing [%s:%d] : unix-bind '%s' already specified. 
Continuing.\n", file, linenum, args[cur_arg]); + err_code |= ERR_ALERT; + cur_arg += 2; + continue; + } + + if (*(args[cur_arg+1]) == 0) { + ha_alert("parsing [%s:%d] : unix_bind '%s' expects a path as an argument.\n", file, linenum, args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.unix_bind.prefix = strdup(args[cur_arg+1]); + cur_arg += 2; + continue; + } + + if (strcmp(args[cur_arg], "mode") == 0) { + + global.unix_bind.ux.mode = strtol(args[cur_arg + 1], NULL, 8); + cur_arg += 2; + continue; + } + + if (strcmp(args[cur_arg], "uid") == 0) { + + global.unix_bind.ux.uid = atol(args[cur_arg + 1 ]); + cur_arg += 2; + continue; + } + + if (strcmp(args[cur_arg], "gid") == 0) { + + global.unix_bind.ux.gid = atol(args[cur_arg + 1 ]); + cur_arg += 2; + continue; + } + + if (strcmp(args[cur_arg], "user") == 0) { + struct passwd *user; + + user = getpwnam(args[cur_arg + 1]); + if (!user) { + ha_alert("parsing [%s:%d] : '%s' : '%s' unknown user.\n", + file, linenum, args[0], args[cur_arg + 1 ]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + global.unix_bind.ux.uid = user->pw_uid; + cur_arg += 2; + continue; + } + + if (strcmp(args[cur_arg], "group") == 0) { + struct group *group; + + group = getgrnam(args[cur_arg + 1]); + if (!group) { + ha_alert("parsing [%s:%d] : '%s' : '%s' unknown group.\n", + file, linenum, args[0], args[cur_arg + 1 ]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + global.unix_bind.ux.gid = group->gr_gid; + cur_arg += 2; + continue; + } + + ha_alert("parsing [%s:%d] : '%s' only supports the 'prefix', 'mode', 'uid', 'gid', 'user' and 'group' options.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "log") == 0) { /* "no log" or "log ..." */ + if (!parse_logger(args, &global.loggers, (kwm == KWM_NO), file, linenum, &errmsg)) { + ha_alert("parsing [%s:%d] : %s : %s\n", file, linenum, args[0], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "log-send-hostname") == 0) { /* set the hostname in syslog header */ + char *name; + + if (global.log_send_hostname != NULL) { + ha_alert("parsing [%s:%d] : '%s' already specified. Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + + if (*(args[1])) + name = args[1]; + else + name = hostname; + + free(global.log_send_hostname); + global.log_send_hostname = strdup(name); + } + else if (strcmp(args[0], "server-state-base") == 0) { /* path base where HAProxy can find server state files */ + if (global.server_state_base != NULL) { + ha_alert("parsing [%s:%d] : '%s' already specified. Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + + if (!*(args[1])) { + ha_alert("parsing [%s:%d] : '%s' expects one argument: a directory path.\n", file, linenum, args[0]); + err_code |= ERR_FATAL; + goto out; + } + + global.server_state_base = strdup(args[1]); + } + else if (strcmp(args[0], "server-state-file") == 0) { /* path to the file where HAProxy can load the server states */ + if (global.server_state_file != NULL) { + ha_alert("parsing [%s:%d] : '%s' already specified. 
Continuing.\n", file, linenum, args[0]); + err_code |= ERR_ALERT; + goto out; + } + + if (!*(args[1])) { + ha_alert("parsing [%s:%d] : '%s' expect one argument: a file path.\n", file, linenum, args[0]); + err_code |= ERR_FATAL; + goto out; + } + + global.server_state_file = strdup(args[1]); + } + else if (strcmp(args[0], "log-tag") == 0) { /* tag to report to syslog */ + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects a tag for use in syslog.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + chunk_destroy(&global.log_tag); + chunk_initlen(&global.log_tag, strdup(args[1]), strlen(args[1]), strlen(args[1])); + if (b_orig(&global.log_tag) == NULL) { + chunk_destroy(&global.log_tag); + ha_alert("parsing [%s:%d]: cannot allocate memory for '%s'.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "spread-checks") == 0) { /* random time between checks (0-50) */ + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (global.spread_checks != 0) { + ha_alert("parsing [%s:%d]: spread-checks already specified. Continuing.\n", file, linenum); + err_code |= ERR_ALERT; + goto out; + } + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d]: '%s' expects an integer argument (0..50).\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + global.spread_checks = atol(args[1]); + if (global.spread_checks < 0 || global.spread_checks > 50) { + ha_alert("parsing [%s:%d]: 'spread-checks' needs a positive value in range 0..50.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + } + } + else if (strcmp(args[0], "max-spread-checks") == 0) { /* maximum time between first and last check */ + const char *err; + unsigned int val; + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d]: '%s' expects an integer argument (0..50).\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + err = parse_time_err(args[1], &val, TIME_UNIT_MS); + if (err == PARSE_TIME_OVER) { + ha_alert("parsing [%s:%d]: timer overflow in argument <%s> to <%s>, maximum value is 2147483647 ms (~24.8 days).\n", + file, linenum, args[1], args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + } + else if (err == PARSE_TIME_UNDER) { + ha_alert("parsing [%s:%d]: timer underflow in argument <%s> to <%s>, minimum non-null value is 1 ms.\n", + file, linenum, args[1], args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + } + else if (err) { + ha_alert("parsing [%s:%d]: unsupported character '%c' in '%s' (wants an integer delay).\n", file, linenum, *err, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + } + global.max_spread_checks = val; + } + else if (strcmp(args[0], "cpu-map") == 0) { + /* map a process list to a CPU set */ +#ifdef USE_CPU_AFFINITY + char *slash; + unsigned long tgroup = 0, thread = 0; + int g, j, n, autoinc; + struct hap_cpuset cpus, cpus_copy; + + if (!*args[1] || !*args[2]) { + ha_alert("parsing [%s:%d] : %s expects a thread group number " + " ('all', 'odd', 'even', a number from 1 to %d or a range), " + " followed by a list of CPU ranges with numbers from 0 to %d.\n", + file, linenum, args[0], LONGBITS, LONGBITS - 1); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if ((slash = strchr(args[1], '/')) != NULL) + *slash = 0; + + /* note: we silently ignore thread group numbers over MAX_TGROUPS + 
* and threads over MAX_THREADS so as not to make configurations a + * pain to maintain. + */ + if (parse_process_number(args[1], &tgroup, LONGBITS, &autoinc, &errmsg)) { + ha_alert("parsing [%s:%d] : %s : %s\n", file, linenum, args[0], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (slash) { + if (parse_process_number(slash+1, &thread, LONGBITS, NULL, &errmsg)) { + ha_alert("parsing [%s:%d] : %s : %s\n", file, linenum, args[0], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + *slash = '/'; + } else + thread = ~0UL; /* missing '/' = 'all' */ + + /* from now on, thread cannot be NULL anymore */ + + if (parse_cpu_set((const char **)args+2, &cpus, &errmsg)) { + ha_alert("parsing [%s:%d] : %s : %s\n", file, linenum, args[0], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (autoinc && + my_popcountl(tgroup) != ha_cpuset_count(&cpus) && + my_popcountl(thread) != ha_cpuset_count(&cpus)) { + ha_alert("parsing [%s:%d] : %s : TGROUP/THREAD range and CPU sets " + "must have the same size to be automatically bound\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + /* we now have to deal with 3 real cases : + * cpu-map P-Q => mapping for whole tgroups, numbers P to Q + * cpu-map P-Q/1 => mapping of first thread of groups P to Q + * cpu-map P/T-U => mapping of threads T to U of tgroup P + */ + /* first tgroup, iterate on threads. E.g. cpu-map 1/1-4 0-3 */ + for (g = 0; g < MAX_TGROUPS; g++) { + /* No mapping for this tgroup */ + if (!(tgroup & (1UL << g))) + continue; + + ha_cpuset_assign(&cpus_copy, &cpus); + + /* a thread set is specified, apply the + * CPU set to these threads. + */ + for (j = n = 0; j < MAX_THREADS_PER_GROUP; j++) { + /* No mapping for this thread */ + if (!(thread & (1UL << j))) + continue; + + if (!autoinc) + ha_cpuset_assign(&cpu_map[g].thread[j], &cpus); + else { + ha_cpuset_zero(&cpu_map[g].thread[j]); + n = ha_cpuset_ffs(&cpus_copy) - 1; + ha_cpuset_clr(&cpus_copy, n); + ha_cpuset_set(&cpu_map[g].thread[j], n); + } + } + } +#else + ha_alert("parsing [%s:%d] : '%s' is not enabled, please check build options for USE_CPU_AFFINITY.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +#endif /* ! 
USE_CPU_AFFINITY */ + } + else if (strcmp(args[0], "setenv") == 0 || strcmp(args[0], "presetenv") == 0) { + if (alertif_too_many_args(3, file, linenum, args, &err_code)) + goto out; + + if (*(args[2]) == 0) { + ha_alert("parsing [%s:%d]: '%s' expects a name and a value.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + /* "setenv" overwrites, "presetenv" only sets if not yet set */ + if (setenv(args[1], args[2], (args[0][0] == 's')) != 0) { + ha_alert("parsing [%s:%d]: '%s' failed on variable '%s' : %s.\n", file, linenum, args[0], args[1], strerror(errno)); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "unsetenv") == 0) { + int arg; + + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d]: '%s' expects at least one variable name.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + for (arg = 1; *args[arg]; arg++) { + if (unsetenv(args[arg]) != 0) { + ha_alert("parsing [%s:%d]: '%s' failed on variable '%s' : %s.\n", file, linenum, args[0], args[arg], strerror(errno)); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + } + else if (strcmp(args[0], "resetenv") == 0) { + extern char **environ; + char **env = environ; + + /* args contain variable names to keep, one per argument */ + while (*env) { + int arg; + + /* look for the current variable among all those we want to keep */ + for (arg = 1; *args[arg]; arg++) { + if (strncmp(*env, args[arg], strlen(args[arg])) == 0 && + (*env)[strlen(args[arg])] == '=') + break; + } + + /* delete this variable */ + if (!*args[arg]) { + char *delim = strchr(*env, '='); + + if (!delim || delim - *env >= trash.size) { + ha_alert("parsing [%s:%d]: '%s' failed to unset invalid variable '%s'.\n", file, linenum, args[0], *env); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + memcpy(trash.area, *env, delim - *env); + trash.area[delim - *env] = 0; + + if (unsetenv(trash.area) != 0) { + ha_alert("parsing [%s:%d]: '%s' failed to unset variable '%s' : %s.\n", file, linenum, args[0], *env, strerror(errno)); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else + env++; + } + } + else if (strcmp(args[0], "quick-exit") == 0) { + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + global.tune.options |= GTUNE_QUICK_EXIT; + } + else if (strcmp(args[0], "strict-limits") == 0) { /* "no strict-limits" or "strict-limits" */ + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + if (kwm == KWM_NO) + global.tune.options &= ~GTUNE_STRICT_LIMITS; + } + else if (strcmp(args[0], "localpeer") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects a name as an argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (global.localpeer_cmdline != 0) { + ha_warning("parsing [%s:%d] : '%s' ignored since it is already set by using the '-L' " + "command line argument.\n", file, linenum, args[0]); + err_code |= ERR_WARN; + goto out; + } + + if (cfg_peers) { + ha_warning("parsing [%s:%d] : '%s' ignored since it is used after 'peers' section.\n", + file, linenum, args[0]); + err_code |= ERR_WARN; + goto out; + } + + free(localpeer); + if ((localpeer = strdup(args[1])) == NULL) { + ha_alert("parsing [%s:%d]: cannot allocate memory for '%s'.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + setenv("HAPROXY_LOCALPEER", localpeer, 1); + } + else 
if (strcmp(args[0], "numa-cpu-mapping") == 0) { + global.numa_cpu_mapping = (kwm == KWM_NO) ? 0 : 1; + } + else if (strcmp(args[0], "anonkey") == 0) { + long long tmp = 0; + + if (*args[1] == 0) { + ha_alert("parsing [%s:%d]: a key is expected after '%s'.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (HA_ATOMIC_LOAD(&global.anon_key) == 0) { + tmp = atoll(args[1]); + if (tmp < 0 || tmp > UINT_MAX) { + ha_alert("parsing [%s:%d]: '%s' value must be within range %u-%u (was '%s').\n", + file, linenum, args[0], 0, UINT_MAX, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + HA_ATOMIC_STORE(&global.anon_key, tmp); + } + } + else { + struct cfg_kw_list *kwl; + const char *best; + int index; + int rc; + + list_for_each_entry(kwl, &cfg_keywords.list, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) { + if (kwl->kw[index].section != CFG_GLOBAL) + continue; + if (strcmp(kwl->kw[index].kw, args[0]) == 0) { + if (check_kw_experimental(&kwl->kw[index], file, linenum, &errmsg)) { + ha_alert("%s\n", errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + rc = kwl->kw[index].parse(args, CFG_GLOBAL, NULL, NULL, file, linenum, &errmsg); + if (rc < 0) { + ha_alert("parsing [%s:%d] : %s\n", file, linenum, errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + } + else if (rc > 0) { + ha_warning("parsing [%s:%d] : %s\n", file, linenum, errmsg); + err_code |= ERR_WARN; + goto out; + } + goto out; + } + } + } + + best = cfg_find_best_match(args[0], &cfg_keywords.list, CFG_GLOBAL, common_kw_list); + if (best) + ha_alert("parsing [%s:%d] : unknown keyword '%s' in '%s' section; did you mean '%s' maybe ?\n", file, linenum, args[0], cursection, best); + else + ha_alert("parsing [%s:%d] : unknown keyword '%s' in '%s' section\n", file, linenum, args[0], "global"); + err_code |= ERR_ALERT | ERR_FATAL; + } + + out: + free(errmsg); + return err_code; +} + +static int cfg_parse_prealloc_fd(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(0, args, err, NULL)) + return -1; + + global.prealloc_fd = 1; + + return 0; +} + +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_GLOBAL, "prealloc-fd", cfg_parse_prealloc_fd }, + { 0, NULL, NULL }, +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); diff --git a/src/cfgparse-listen.c b/src/cfgparse-listen.c new file mode 100644 index 0000000..4f88b77 --- /dev/null +++ b/src/cfgparse-listen.c @@ -0,0 +1,3073 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <netdb.h> +#include <ctype.h> +#include <pwd.h> +#include <grp.h> +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +#include <haproxy/acl.h> +#include <haproxy/buf.h> +#include <haproxy/capture-t.h> +#include <haproxy/cfgparse.h> +#include <haproxy/check.h> +#include <haproxy/compression-t.h> +#include <haproxy/connection.h> +#include <haproxy/extcheck.h> +#include <haproxy/http_ana.h> +#include <haproxy/http_htx.h> +#include <haproxy/http_ext.h> +#include <haproxy/http_rules.h> +#include <haproxy/listener.h> +#include <haproxy/log.h> +#include <haproxy/peers.h> +#include <haproxy/protocol.h> +#include <haproxy/proxy.h> +#include <haproxy/sample.h> +#include <haproxy/server.h> +#include <haproxy/stats-t.h> +#include <haproxy/stick_table.h> +#include <haproxy/tcpcheck.h> +#include <haproxy/tools.h> +#include <haproxy/uri_auth.h> + +/* some keywords that are still being parsed using 
strcmp() and are not + * registered anywhere. They are used as suggestions for mistyped words. + */ +static const char *common_kw_list[] = { + "listen", "frontend", "backend", "defaults", "server", + "default-server", "server-template", "bind", "monitor-net", + "monitor-uri", "mode", "id", "description", "disabled", "enabled", + "acl", "dynamic-cookie-key", "cookie", "email-alert", + "persist", "appsession", "load-server-state-from-file", + "server-state-file-name", "max-session-srv-conns", "capture", + "retries", "http-request", "http-response", "http-after-response", + "http-send-name-header", "block", "redirect", "use_backend", + "use-server", "force-persist", "ignore-persist", + "stick-table", "stick", "stats", "option", "default_backend", + "http-reuse", "monitor", "transparent", "maxconn", "backlog", + "fullconn", "dispatch", "balance", "log-balance", "hash-type", + "hash-balance-factor", "unique-id-format", "unique-id-header", + "log-format", "log-format-sd", "log-tag", "log", "source", "usesrc", + "error-log-format", + NULL /* must be last */ +}; + +static const char *common_options[] = { + "httpclose", "http-server-close", "http-keep-alive", + "redispatch", "httplog", "tcplog", "tcpka", "httpchk", + "ssl-hello-chk", "smtpchk", "pgsql-check", "redis-check", + "mysql-check", "ldap-check", "spop-check", "tcp-check", + "external-check", "forwardfor", "original-to", "forwarded", + NULL /* must be last */ +}; + +/* Report a warning if a rule is placed after a 'tcp-request session' rule. + * Return 1 if the warning has been emitted, otherwise 0. + */ +int warnif_rule_after_tcp_sess(struct proxy *proxy, const char *file, int line, const char *arg) +{ + if (!LIST_ISEMPTY(&proxy->tcp_req.l5_rules)) { + ha_warning("parsing [%s:%d] : a '%s' rule placed after a 'tcp-request session' rule will still be processed before.\n", + file, line, arg); + return 1; + } + return 0; +} + +/* Report a warning if a rule is placed after a 'tcp-request content' rule. + * Return 1 if the warning has been emitted, otherwise 0. + */ +int warnif_rule_after_tcp_cont(struct proxy *proxy, const char *file, int line, const char *arg) +{ + if (!LIST_ISEMPTY(&proxy->tcp_req.inspect_rules)) { + ha_warning("parsing [%s:%d] : a '%s' rule placed after a 'tcp-request content' rule will still be processed before.\n", + file, line, arg); + return 1; + } + return 0; +} + +/* Report a warning if a rule is placed after a 'monitor fail' rule. + * Return 1 if the warning has been emitted, otherwise 0. + */ +int warnif_rule_after_monitor(struct proxy *proxy, const char *file, int line, const char *arg) +{ + if (!LIST_ISEMPTY(&proxy->mon_fail_cond)) { + ha_warning("parsing [%s:%d] : a '%s' rule placed after a 'monitor fail' rule will still be processed before.\n", + file, line, arg); + return 1; + } + return 0; +} + +/* Report a warning if a rule is placed after an 'http-request' rule. + * Return 1 if the warning has been emitted, otherwise 0. + */ +int warnif_rule_after_http_req(struct proxy *proxy, const char *file, int line, const char *arg) +{ + if (!LIST_ISEMPTY(&proxy->http_req_rules)) { + ha_warning("parsing [%s:%d] : a '%s' rule placed after an 'http-request' rule will still be processed before.\n", + file, line, arg); + return 1; + } + return 0; +} + +/* Report a warning if a rule is placed after a redirect rule. + * Return 1 if the warning has been emitted, otherwise 0. 
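 + * For example, an 'http-request' rule written below a 'redirect' rule in the + * configuration file is still evaluated before it, which is rarely what the + * author intended. 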
+ */ +int warnif_rule_after_redirect(struct proxy *proxy, const char *file, int line, const char *arg) +{ + if (!LIST_ISEMPTY(&proxy->redirect_rules)) { + ha_warning("parsing [%s:%d] : a '%s' rule placed after a 'redirect' rule will still be processed before.\n", + file, line, arg); + return 1; + } + return 0; +} + +/* Report a warning if a rule is placed after a 'use_backend' rule. + * Return 1 if the warning has been emitted, otherwise 0. + */ +int warnif_rule_after_use_backend(struct proxy *proxy, const char *file, int line, const char *arg) +{ + if (!LIST_ISEMPTY(&proxy->switching_rules)) { + ha_warning("parsing [%s:%d] : a '%s' rule placed after a 'use_backend' rule will still be processed before.\n", + file, line, arg); + return 1; + } + return 0; +} + +/* Report a warning if a rule is placed after a 'use-server' rule. + * Return 1 if the warning has been emitted, otherwise 0. + */ +int warnif_rule_after_use_server(struct proxy *proxy, const char *file, int line, const char *arg) +{ + if (!LIST_ISEMPTY(&proxy->server_rules)) { + ha_warning("parsing [%s:%d] : a '%s' rule placed after a 'use-server' rule will still be processed before.\n", + file, line, arg); + return 1; + } + return 0; +} + +/* report a warning if a redirect rule is dangerously placed */ +int warnif_misplaced_redirect(struct proxy *proxy, const char *file, int line, const char *arg) +{ + return warnif_rule_after_use_backend(proxy, file, line, arg) || + warnif_rule_after_use_server(proxy, file, line, arg); +} + +/* report a warning if an http-request rule is dangerously placed */ +int warnif_misplaced_http_req(struct proxy *proxy, const char *file, int line, const char *arg) +{ + return warnif_rule_after_redirect(proxy, file, line, arg) || + warnif_misplaced_redirect(proxy, file, line, arg); +} + +/* report a warning if a 'monitor fail' rule is dangerously placed */ +int warnif_misplaced_monitor(struct proxy *proxy, const char *file, int line, const char *arg) +{ + return warnif_rule_after_http_req(proxy, file, line, arg) || + warnif_misplaced_http_req(proxy, file, line, arg); +} + +/* report a warning if a "tcp request content" rule is dangerously placed */ +int warnif_misplaced_tcp_cont(struct proxy *proxy, const char *file, int line, const char *arg) +{ + return warnif_rule_after_monitor(proxy, file, line, arg) || + warnif_misplaced_monitor(proxy, file, line, arg); +} + +/* report a warning if a "tcp request session" rule is dangerously placed */ +int warnif_misplaced_tcp_sess(struct proxy *proxy, const char *file, int line, const char *arg) +{ + return warnif_rule_after_tcp_cont(proxy, file, line, arg) || + warnif_misplaced_tcp_cont(proxy, file, line, arg); +} + +/* report a warning if a "tcp request connection" rule is dangerously placed */ +int warnif_misplaced_tcp_conn(struct proxy *proxy, const char *file, int line, const char *arg) +{ + return warnif_rule_after_tcp_sess(proxy, file, line, arg) || + warnif_misplaced_tcp_sess(proxy, file, line, arg); +} + +int cfg_parse_listen(const char *file, int linenum, char **args, int kwm) +{ + static struct proxy *curr_defproxy = NULL; + static struct proxy *last_defproxy = NULL; + const char *err; + int rc; + int err_code = 0; + struct acl_cond *cond = NULL; + char *errmsg = NULL; + struct bind_conf *bind_conf; + + if (!last_defproxy) { + /* we need a default proxy and none was created yet */ + last_defproxy = alloc_new_proxy("", PR_CAP_DEF|PR_CAP_LISTEN, &errmsg); + + curr_defproxy = last_defproxy; + if (!last_defproxy) { + ha_alert("parsing [%s:%d] : %s\n", file, linenum, 
errmsg); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + } + + if (strcmp(args[0], "listen") == 0) + rc = PR_CAP_LISTEN | PR_CAP_LB; + else if (strcmp(args[0], "frontend") == 0) + rc = PR_CAP_FE | PR_CAP_LB; + else if (strcmp(args[0], "backend") == 0) + rc = PR_CAP_BE | PR_CAP_LB; + else if (strcmp(args[0], "defaults") == 0) { + /* "defaults" must first delete the last no-name defaults if any */ + curr_defproxy = NULL; + rc = PR_CAP_DEF | PR_CAP_LISTEN; + } + else + rc = PR_CAP_NONE; + + if ((rc & PR_CAP_LISTEN) && !(rc & PR_CAP_DEF)) { /* new proxy */ + if (!*args[1]) { + ha_alert("parsing [%s:%d] : '%s' expects an <id> argument\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + err = invalid_char(args[1]); + if (err) { + ha_alert("parsing [%s:%d] : character '%c' is not permitted in '%s' name '%s'.\n", + file, linenum, *err, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + } + + curproxy = (rc & PR_CAP_FE) ? proxy_fe_by_name(args[1]) : proxy_be_by_name(args[1]); + if (curproxy) { + ha_alert("Parsing [%s:%d]: %s '%s' has the same name as %s '%s' declared at %s:%d.\n", + file, linenum, proxy_cap_str(rc), args[1], proxy_type_str(curproxy), + curproxy->id, curproxy->conf.file, curproxy->conf.line); + err_code |= ERR_ALERT | ERR_FATAL; + } + + curproxy = log_forward_by_name(args[1]); + if (curproxy) { + ha_alert("Parsing [%s:%d]: %s '%s' has the same name as log forward section '%s' declared at %s:%d.\n", + file, linenum, proxy_cap_str(rc), args[1], + curproxy->id, curproxy->conf.file, curproxy->conf.line); + err_code |= ERR_ALERT | ERR_FATAL; + } + + if ((*args[2] && (!*args[3] || strcmp(args[2], "from") != 0)) || + alertif_too_many_args(3, file, linenum, args, &err_code)) { + if (rc & PR_CAP_FE) + ha_alert("parsing [%s:%d] : please use the 'bind' keyword for listening addresses.\n", file, linenum); + goto out; + } + } + + if (rc & PR_CAP_LISTEN) { /* new proxy or defaults section */ + const char *name = args[1]; + int arg = 2; + + if (rc & PR_CAP_DEF && strcmp(args[1], "from") == 0 && *args[2] && !*args[3]) { + // also support "defaults from blah" (no name then) + arg = 1; + name = ""; + } + + /* only regular proxies inherit from the previous defaults section */ + if (!(rc & PR_CAP_DEF)) + curr_defproxy = last_defproxy; + + if (strcmp(args[arg], "from") == 0) { + struct ebpt_node *next_by_name; + + curr_defproxy = proxy_find_by_name(args[arg+1], PR_CAP_DEF, 0); + + if (!curr_defproxy) { + ha_alert("parsing [%s:%d] : defaults section '%s' not found for %s '%s'.\n", file, linenum, args[arg+1], proxy_cap_str(rc), name); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + if ((next_by_name = ebpt_next_dup(&curr_defproxy->conf.by_name))) { + struct proxy *px2 = container_of(next_by_name, struct proxy, conf.by_name); + + ha_alert("parsing [%s:%d] : ambiguous defaults section name '%s' referenced by %s '%s' exists at least at %s:%d and %s:%d.\n", + file, linenum, args[arg+1], proxy_cap_str(rc), name, + curr_defproxy->conf.file, curr_defproxy->conf.line, px2->conf.file, px2->conf.line); + err_code |= ERR_ALERT | ERR_FATAL; + } + + err = invalid_char(args[arg+1]); + if (err) { + ha_alert("parsing [%s:%d] : character '%c' is not permitted in defaults section name '%s' when designated by its name (section found at %s:%d).\n", + file, linenum, *err, args[arg+1], curr_defproxy->conf.file, curr_defproxy->conf.line); + err_code |= ERR_ALERT | ERR_FATAL; + } + curr_defproxy->flags |= PR_FL_EXPLICIT_REF; + } + else if (curr_defproxy) + 
curr_defproxy->flags |= PR_FL_IMPLICIT_REF; + + if (curr_defproxy && (curr_defproxy->flags & (PR_FL_EXPLICIT_REF|PR_FL_IMPLICIT_REF)) == (PR_FL_EXPLICIT_REF|PR_FL_IMPLICIT_REF)) { + ha_warning("parsing [%s:%d] : defaults section '%s' (declared at %s:%d) is explicitly referenced by another proxy and implicitly used here." + " To avoid any ambiguity don't mix both usages. Add a last defaults section that is not explicitly referenced, or always use explicit references.\n", + file, linenum, curr_defproxy->id, curr_defproxy->conf.file, curr_defproxy->conf.line); + err_code |= ERR_WARN; + } + + curproxy = parse_new_proxy(name, rc, file, linenum, curr_defproxy); + if (!curproxy) { + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + if (curr_defproxy && (!LIST_ISEMPTY(&curr_defproxy->http_req_rules) || + !LIST_ISEMPTY(&curr_defproxy->http_res_rules) || + !LIST_ISEMPTY(&curr_defproxy->http_after_res_rules) || + !LIST_ISEMPTY(&curr_defproxy->tcp_req.l4_rules) || + !LIST_ISEMPTY(&curr_defproxy->tcp_req.l5_rules) || + !LIST_ISEMPTY(&curr_defproxy->tcp_req.inspect_rules) || + !LIST_ISEMPTY(&curr_defproxy->tcp_rep.inspect_rules))) { + /* If the current default proxy defines TCP/HTTP rules, the + * current proxy will keep a reference on it. But some sanity + * checks are performed first: + * + * - It cannot be used to init a defaults section + * - It cannot be used to init a listen section + * - It cannot be used to init backend and frontend sections at the + * same time. It can be used to init several sections of the + * same type only. + * - It cannot define L4/L5 TCP rules if it is used to init + * backend sections. + * - It cannot define 'tcp-response content' rules if it + * is used to init frontend sections. + * + * If no error is found, refcount of the default proxy is incremented. 
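 + * + * E.g. (illustrative): two backends may both be declared "from rules-defaults" + * when that hypothetical defaults section only carries 'http-request' rules, + * but a frontend and a backend cannot both inherit from it, nor can a listen + * section or another defaults section. 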
+ */ + + /* Note: Add tcpcheck_rules too if unresolved args become allowed in defaults section */ + if (rc & PR_CAP_DEF) { + ha_alert("parsing [%s:%d]: a defaults section cannot inherit from a defaults section defining TCP/HTTP rules (defaults section at %s:%d).\n", + file, linenum, curr_defproxy->conf.file, curr_defproxy->conf.line); + err_code |= ERR_ALERT | ERR_ABORT; + } + else if ((rc & PR_CAP_LISTEN) == PR_CAP_LISTEN) { + ha_alert("parsing [%s:%d]: a listen section cannot inherit from a defaults section defining TCP/HTTP rules.\n", + file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + } + else { + char defcap = (curr_defproxy->cap & PR_CAP_LISTEN); + + if ((defcap == PR_CAP_BE || defcap == PR_CAP_FE) && (rc & PR_CAP_LISTEN) != defcap) { + ha_alert("parsing [%s:%d]: frontends and backends cannot inherit from the same defaults section" + " if it defines TCP/HTTP rules (defaults section at %s:%d).\n", + file, linenum, curr_defproxy->conf.file, curr_defproxy->conf.line); + err_code |= ERR_ALERT | ERR_ABORT; + } + else if (!(rc & PR_CAP_FE) && (!LIST_ISEMPTY(&curr_defproxy->tcp_req.l4_rules) || + !LIST_ISEMPTY(&curr_defproxy->tcp_req.l5_rules))) { + ha_alert("parsing [%s:%d]: a backend section cannot inherit from a defaults section defining" + " 'tcp-request connection' or 'tcp-request session' rules (defaults section at %s:%d).\n", + file, linenum, curr_defproxy->conf.file, curr_defproxy->conf.line); + err_code |= ERR_ALERT | ERR_ABORT; + } + else if (!(rc & PR_CAP_BE) && !LIST_ISEMPTY(&curr_defproxy->tcp_rep.inspect_rules)) { + ha_alert("parsing [%s:%d]: a frontend section cannot inherit from a defaults section defining" + " 'tcp-response content' rules (defaults section at %s:%d).\n", + file, linenum, curr_defproxy->conf.file, curr_defproxy->conf.line); + err_code |= ERR_ALERT | ERR_ABORT; + } + else { + curr_defproxy->cap = (curr_defproxy->cap & ~PR_CAP_LISTEN) | (rc & PR_CAP_LISTEN); + proxy_ref_defaults(curproxy, curr_defproxy); + } + } + } + + if (curr_defproxy && (curr_defproxy->tcpcheck_rules.flags & TCPCHK_RULES_PROTO_CHK) && + (curproxy->cap & PR_CAP_LISTEN) == PR_CAP_BE) { + /* If the current default proxy defines tcpcheck rules, the + * current proxy will keep a reference on it, but only if the + * current proxy has the backend capability. 
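 + * E.g. a defaults section whose check rules come from a protocol check such + * as "option httpchk" is referenced by the backends built from it, while + * pure frontends ignore it. 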
+ */ + proxy_ref_defaults(curproxy, curr_defproxy); + } + + if ((rc & PR_CAP_BE) && curr_defproxy && (curr_defproxy->nb_req_cap || curr_defproxy->nb_rsp_cap)) { + ha_alert("parsing [%s:%d]: backend or defaults sections cannot inherit from a defaults section defining" + " captures (defaults section at %s:%d).\n", + file, linenum, curr_defproxy->conf.file, curr_defproxy->conf.line); + err_code |= ERR_ALERT | ERR_ABORT; + } + + if (rc & PR_CAP_DEF) { + /* last and current proxies must be updated to this one */ + curr_defproxy = last_defproxy = curproxy; + } else { + /* regular proxies are in a list */ + curproxy->next = proxies_list; + proxies_list = curproxy; + } + goto out; + } + else if (curproxy == NULL) { + ha_alert("parsing [%s:%d] : 'listen' or 'defaults' expected.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + /* update the current file and line being parsed */ + curproxy->conf.args.file = curproxy->conf.file; + curproxy->conf.args.line = linenum; + + /* Now let's parse the proxy-specific keywords */ + if ((strcmp(args[0], "server") == 0)) { + err_code |= parse_server(file, linenum, args, + curproxy, curr_defproxy, + SRV_PARSE_PARSE_ADDR); + + if (err_code & ERR_FATAL) + goto out; + } + else if (strcmp(args[0], "default-server") == 0) { + err_code |= parse_server(file, linenum, args, + curproxy, curr_defproxy, + SRV_PARSE_DEFAULT_SERVER); + + if (err_code & ERR_FATAL) + goto out; + } + else if (strcmp(args[0], "server-template") == 0) { + err_code |= parse_server(file, linenum, args, + curproxy, curr_defproxy, + SRV_PARSE_TEMPLATE|SRV_PARSE_PARSE_ADDR); + + if (err_code & ERR_FATAL) + goto out; + } + else if (strcmp(args[0], "bind") == 0) { /* new listen addresses */ + struct listener *l; + int cur_arg; + + if (curproxy->cap & PR_CAP_DEF) { + ha_alert("parsing [%s:%d] : '%s' not allowed in 'defaults' section.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (warnifnotcap(curproxy, PR_CAP_FE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (!*(args[1])) { + ha_alert("parsing [%s:%d] : '%s' expects {<path>|[addr1]:port1[-end1]}{,[addr]:port[-end]}... as arguments.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + bind_conf = bind_conf_alloc(curproxy, file, linenum, args[1], xprt_get(XPRT_RAW)); + if (!bind_conf) + goto alloc_error; + + /* use default settings for unix sockets */ + bind_conf->settings.ux.uid = global.unix_bind.ux.uid; + bind_conf->settings.ux.gid = global.unix_bind.ux.gid; + bind_conf->settings.ux.mode = global.unix_bind.ux.mode; + + /* NOTE: the following line might create several listeners if there + * are comma-separated IPs or port ranges. So all further processing + * will have to be applied to all listeners created after last_listen. 
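 + * E.g. "bind 192.168.0.1:80,192.168.0.1:443" or "bind :8000-8009" both + * expand to several listeners here. 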
+ */ + if (!str2listener(args[1], curproxy, bind_conf, file, linenum, &errmsg)) { + if (errmsg && *errmsg) { + indent_msg(&errmsg, 2); + ha_alert("parsing [%s:%d] : '%s' : %s\n", file, linenum, args[0], errmsg); + } + else + ha_alert("parsing [%s:%d] : '%s' : error encountered while parsing listening address '%s'.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + list_for_each_entry(l, &bind_conf->listeners, by_bind) { + /* account for each new listener in the global maxsock count */ + global.maxsock++; + } + + cur_arg = 2; + err_code |= bind_parse_args_list(bind_conf, args, cur_arg, cursection, file, linenum); + goto out; + } + else if (strcmp(args[0], "monitor-net") == 0) { /* set the range of IPs to ignore */ + ha_alert("parsing [%s:%d] : 'monitor-net' doesn't exist anymore. Please use 'http-request return status 200 if { src %s }' instead.\n", file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "monitor-uri") == 0) { /* set the URI to intercept */ + if (warnifnotcap(curproxy, PR_CAP_FE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + + if (!*args[1]) { + ha_alert("parsing [%s:%d] : '%s' expects a URI.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + istfree(&curproxy->monitor_uri); + curproxy->monitor_uri = istdup(ist(args[1])); + if (!isttest(curproxy->monitor_uri)) + goto alloc_error; + + goto out; + } + else if (strcmp(args[0], "mode") == 0) { /* sets the proxy mode */ + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + + if (strcmp(args[1], "http") == 0) curproxy->mode = PR_MODE_HTTP; + else if (strcmp(args[1], "tcp") == 0) curproxy->mode = PR_MODE_TCP; + else if (strcmp(args[1], "log") == 0 && (curproxy->cap & PR_CAP_BE)) curproxy->mode = PR_MODE_SYSLOG; + else if (strcmp(args[1], "health") == 0) { + ha_alert("parsing [%s:%d] : 'mode health' doesn't exist anymore. 
Please use 'http-request return status 200' instead.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else { + ha_alert("parsing [%s:%d] : unknown proxy mode '%s'.\n", file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "id") == 0) { + struct eb32_node *node; + + if (curproxy->cap & PR_CAP_DEF) { + ha_alert("parsing [%s:%d]: '%s' not allowed in 'defaults' section.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + + if (!*args[1]) { + ha_alert("parsing [%s:%d]: '%s' expects an integer argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + curproxy->uuid = atol(args[1]); + curproxy->conf.id.key = curproxy->uuid; + curproxy->options |= PR_O_FORCED_ID; + + if (curproxy->uuid <= 0) { + ha_alert("parsing [%s:%d]: custom id has to be > 0.\n", + file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + node = eb32_lookup(&used_proxy_id, curproxy->uuid); + if (node) { + struct proxy *target = container_of(node, struct proxy, conf.id); + ha_alert("parsing [%s:%d]: %s %s reuses same custom id as %s %s (declared at %s:%d).\n", + file, linenum, proxy_type_str(curproxy), curproxy->id, + proxy_type_str(target), target->id, target->conf.file, target->conf.line); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + eb32_insert(&used_proxy_id, &curproxy->conf.id); + } + else if (strcmp(args[0], "description") == 0) { + int i, len=0; + char *d; + + if (curproxy->cap & PR_CAP_DEF) { + ha_alert("parsing [%s:%d]: '%s' not allowed in 'defaults' section.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (!*args[1]) { + ha_alert("parsing [%s:%d]: '%s' expects a string argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + for (i = 1; *args[i]; i++) + len += strlen(args[i]) + 1; + + d = calloc(1, len); + if (!d) + goto alloc_error; + curproxy->desc = d; + + d += snprintf(d, curproxy->desc + len - d, "%s", args[1]); + for (i = 2; *args[i]; i++) + d += snprintf(d, curproxy->desc + len - d, " %s", args[i]); + + } + else if (strcmp(args[0], "disabled") == 0) { /* disables this proxy */ + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + curproxy->flags |= PR_FL_DISABLED; + } + else if (strcmp(args[0], "enabled") == 0) { /* enables this proxy (used to revert a disabled default) */ + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + curproxy->flags &= ~PR_FL_DISABLED; + } + else if (strcmp(args[0], "bind-process") == 0) { /* enable this proxy only on some processes */ + ha_alert("parsing [%s:%d]: '%s' is not supported anymore.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + } + else if (strcmp(args[0], "acl") == 0) { /* add an ACL */ + if ((curproxy->cap & PR_CAP_DEF) && strlen(curproxy->id) == 0) { + ha_alert("parsing [%s:%d] : '%s' not allowed in anonymous 'defaults' section.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + err = invalid_char(args[1]); + if (err) { + ha_alert("parsing [%s:%d] : character '%c' is not permitted in acl name '%s'.\n", + file, linenum, *err, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (strcasecmp(args[1], "or") == 0) { + ha_alert("parsing [%s:%d] : acl name '%s' will never match. 
'or' is used to express a " + "logical disjunction within a condition.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (parse_acl((const char **)args + 1, &curproxy->acl, &errmsg, &curproxy->conf.args, file, linenum) == NULL) { + ha_alert("parsing [%s:%d] : error detected while parsing ACL '%s' : %s.\n", + file, linenum, args[1], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "dynamic-cookie-key") == 0) { /* Dynamic cookies secret key */ + + if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects <secret_key> as argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + free(curproxy->dyncookie_key); + curproxy->dyncookie_key = strdup(args[1]); + } + else if (strcmp(args[0], "cookie") == 0) { /* cookie name */ + int cur_arg; + + if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects <cookie_name> as argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + curproxy->ck_opts = 0; + curproxy->cookie_maxidle = curproxy->cookie_maxlife = 0; + ha_free(&curproxy->cookie_domain); + free(curproxy->cookie_name); + curproxy->cookie_name = strdup(args[1]); + if (!curproxy->cookie_name) + goto alloc_error; + curproxy->cookie_len = strlen(curproxy->cookie_name); + + cur_arg = 2; + while (*(args[cur_arg])) { + if (strcmp(args[cur_arg], "rewrite") == 0) { + curproxy->ck_opts |= PR_CK_RW; + } + else if (strcmp(args[cur_arg], "indirect") == 0) { + curproxy->ck_opts |= PR_CK_IND; + } + else if (strcmp(args[cur_arg], "insert") == 0) { + curproxy->ck_opts |= PR_CK_INS; + } + else if (strcmp(args[cur_arg], "nocache") == 0) { + curproxy->ck_opts |= PR_CK_NOC; + } + else if (strcmp(args[cur_arg], "postonly") == 0) { + curproxy->ck_opts |= PR_CK_POST; + } + else if (strcmp(args[cur_arg], "preserve") == 0) { + curproxy->ck_opts |= PR_CK_PSV; + } + else if (strcmp(args[cur_arg], "prefix") == 0) { + curproxy->ck_opts |= PR_CK_PFX; + } + else if (strcmp(args[cur_arg], "httponly") == 0) { + curproxy->ck_opts |= PR_CK_HTTPONLY; + } + else if (strcmp(args[cur_arg], "secure") == 0) { + curproxy->ck_opts |= PR_CK_SECURE; + } + else if (strcmp(args[cur_arg], "domain") == 0) { + if (!*args[cur_arg + 1]) { + ha_alert("parsing [%s:%d]: '%s' expects <domain> as argument.\n", + file, linenum, args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (!strchr(args[cur_arg + 1], '.')) { + /* rfc6265, 5.2.3 The Domain Attribute */ + ha_warning("parsing [%s:%d]: domain '%s' contains no embedded dot," + " this configuration may not work properly (see RFC6265#5.2.3).\n", + file, linenum, args[cur_arg + 1]); + err_code |= ERR_WARN; + } + + err = invalid_domainchar(args[cur_arg + 1]); + if (err) { + ha_alert("parsing [%s:%d]: character '%c' is not permitted in domain name '%s'.\n", + file, linenum, *err, args[cur_arg + 1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (!curproxy->cookie_domain) { + curproxy->cookie_domain = strdup(args[cur_arg + 1]); + } else { + /* one domain was already specified, add another one by + * building the string which will be returned along with + * the cookie. 
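 + * E.g. two successive "domain" options d1 then d2 leave "d1; domain=d2" in + * cookie_domain, so both attributes end up in the emitted cookie. 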
+ */ + memprintf(&curproxy->cookie_domain, "%s; domain=%s", curproxy->cookie_domain, args[cur_arg+1]); + } + + if (!curproxy->cookie_domain) + goto alloc_error; + cur_arg++; + } + else if (strcmp(args[cur_arg], "maxidle") == 0) { + unsigned int maxidle; + const char *res; + + if (!*args[cur_arg + 1]) { + ha_alert("parsing [%s:%d]: '%s' expects <idletime> in seconds as argument.\n", + file, linenum, args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + res = parse_time_err(args[cur_arg + 1], &maxidle, TIME_UNIT_S); + if (res == PARSE_TIME_OVER) { + ha_alert("parsing [%s:%d]: timer overflow in argument <%s> to <%s>, maximum value is 2147483647 s (~68 years).\n", + file, linenum, args[cur_arg+1], args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (res == PARSE_TIME_UNDER) { + ha_alert("parsing [%s:%d]: timer underflow in argument <%s> to <%s>, minimum non-null value is 1 s.\n", + file, linenum, args[cur_arg+1], args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (res) { + ha_alert("parsing [%s:%d]: unexpected character '%c' in argument to <%s>.\n", + file, linenum, *res, args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + curproxy->cookie_maxidle = maxidle; + cur_arg++; + } + else if (strcmp(args[cur_arg], "maxlife") == 0) { + unsigned int maxlife; + const char *res; + + if (!*args[cur_arg + 1]) { + ha_alert("parsing [%s:%d]: '%s' expects <lifetime> in seconds as argument.\n", + file, linenum, args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + + res = parse_time_err(args[cur_arg + 1], &maxlife, TIME_UNIT_S); + if (res == PARSE_TIME_OVER) { + ha_alert("parsing [%s:%d]: timer overflow in argument <%s> to <%s>, maximum value is 2147483647 s (~68 years).\n", + file, linenum, args[cur_arg+1], args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (res == PARSE_TIME_UNDER) { + ha_alert("parsing [%s:%d]: timer underflow in argument <%s> to <%s>, minimum non-null value is 1 s.\n", + file, linenum, args[cur_arg+1], args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (res) { + ha_alert("parsing [%s:%d]: unexpected character '%c' in argument to <%s>.\n", + file, linenum, *res, args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + curproxy->cookie_maxlife = maxlife; + cur_arg++; + } + else if (strcmp(args[cur_arg], "dynamic") == 0) { /* Dynamic persistent cookies secret key */ + + if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, args[cur_arg], NULL)) + err_code |= ERR_WARN; + curproxy->ck_opts |= PR_CK_DYNAMIC; + } + else if (strcmp(args[cur_arg], "attr") == 0) { + char *val; + if (!*args[cur_arg + 1]) { + ha_alert("parsing [%s:%d]: '%s' expects <value> as argument.\n", + file, linenum, args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + val = args[cur_arg + 1]; + while (*val) { + if (iscntrl((unsigned char)*val) || *val == ';') { + ha_alert("parsing [%s:%d]: character '%%x%02X' is not permitted in attribute value.\n", + file, linenum, *val); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + val++; + } + /* don't add ';' for the first attribute */ + if (!curproxy->cookie_attrs) + curproxy->cookie_attrs = strdup(args[cur_arg + 1]); + else + memprintf(&curproxy->cookie_attrs, "%s; %s", curproxy->cookie_attrs, args[cur_arg + 1]); + + if (!curproxy->cookie_attrs) + goto alloc_error; + cur_arg++; + } + + else { + ha_alert("parsing [%s:%d] : '%s' supports 'rewrite', 'insert', 'prefix', 'indirect', 
'nocache', 'postonly', 'domain', 'maxidle', 'dynamic', 'maxlife' and 'attr' options.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + cur_arg++; + } + if (!POWEROF2(curproxy->ck_opts & (PR_CK_RW|PR_CK_IND))) { + ha_alert("parsing [%s:%d] : cookie 'rewrite' and 'indirect' modes are incompatible.\n", + file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + } + + if (!POWEROF2(curproxy->ck_opts & (PR_CK_RW|PR_CK_INS|PR_CK_PFX))) { + ha_alert("parsing [%s:%d] : cookie 'rewrite', 'insert' and 'prefix' modes are incompatible.\n", + file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + } + + if ((curproxy->ck_opts & (PR_CK_PSV | PR_CK_INS | PR_CK_IND)) == PR_CK_PSV) { + ha_alert("parsing [%s:%d] : cookie 'preserve' requires at least 'insert' or 'indirect'.\n", + file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + } + }/* end else if (!strcmp(args[0], "cookie")) */ + else if (strcmp(args[0], "email-alert") == 0) { + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : missing argument after '%s'.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (strcmp(args[1], "from") == 0) { + if (*(args[2]) == 0) { + ha_alert("parsing [%s:%d] : missing argument after '%s'.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + free(curproxy->email_alert.from); + curproxy->email_alert.from = strdup(args[2]); + if (!curproxy->email_alert.from) + goto alloc_error; + } + else if (strcmp(args[1], "mailers") == 0) { + if (*(args[2]) == 0) { + ha_alert("parsing [%s:%d] : missing argument after '%s'.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + free(curproxy->email_alert.mailers.name); + curproxy->email_alert.mailers.name = strdup(args[2]); + if (!curproxy->email_alert.mailers.name) + goto alloc_error; + } + else if (strcmp(args[1], "myhostname") == 0) { + if (*(args[2]) == 0) { + ha_alert("parsing [%s:%d] : missing argument after '%s'.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + free(curproxy->email_alert.myhostname); + curproxy->email_alert.myhostname = strdup(args[2]); + if (!curproxy->email_alert.myhostname) + goto alloc_error; + } + else if (strcmp(args[1], "level") == 0) { + curproxy->email_alert.level = get_log_level(args[2]); + if (curproxy->email_alert.level < 0) { + ha_alert("parsing [%s:%d] : unknown log level '%s' after '%s'\n", + file, linenum, args[2], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[1], "to") == 0) { + if (*(args[2]) == 0) { + ha_alert("parsing [%s:%d] : missing argument after '%s'.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + free(curproxy->email_alert.to); + curproxy->email_alert.to = strdup(args[2]); + if (!curproxy->email_alert.to) + goto alloc_error; + } + else { + ha_alert("parsing [%s:%d] : email-alert: unknown argument '%s'.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + /* Indicate that the email_alert is at least partially configured */ + curproxy->email_alert.set = 1; + }/* end else if (!strcmp(args[0], "email-alert")) */ + else if (strcmp(args[0], "persist") == 0) { /* persist */ + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : missing persist method.\n", + file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (!strncmp(args[1], "rdp-cookie", 10)) { + curproxy->options2 |= PR_O2_RDPC_PRST; + + if (*(args[1] + 10) == '(') { /* cookie 
name */ + const char *beg, *end; + + beg = args[1] + 11; + end = strchr(beg, ')'); + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + + if (!end || end == beg) { + ha_alert("parsing [%s:%d] : 'persist rdp-cookie(name)' requires an rdp cookie name.\n", + file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + free(curproxy->rdp_cookie_name); + curproxy->rdp_cookie_name = my_strndup(beg, end - beg); + if (!curproxy->rdp_cookie_name) + goto alloc_error; + curproxy->rdp_cookie_len = end-beg; + } + else if (*(args[1] + 10) == '\0') { /* default cookie name 'msts' */ + free(curproxy->rdp_cookie_name); + curproxy->rdp_cookie_name = strdup("msts"); + if (!curproxy->rdp_cookie_name) + goto alloc_error; + curproxy->rdp_cookie_len = strlen(curproxy->rdp_cookie_name); + } + else { /* syntax */ + ha_alert("parsing [%s:%d] : 'persist rdp-cookie(name)' requires an rdp cookie name.\n", + file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else { + ha_alert("parsing [%s:%d] : unknown persist method.\n", + file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "appsession") == 0) { /* cookie name */ + ha_alert("parsing [%s:%d] : '%s' is not supported anymore since HAProxy 1.6.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "load-server-state-from-file") == 0) { + if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + if (strcmp(args[1], "global") == 0) { /* use the file pointed to by global server-state-file directive */ + curproxy->load_server_state_from_file = PR_SRV_STATE_FILE_GLOBAL; + } + else if (strcmp(args[1], "local") == 0) { /* use the server-state-file-name variable to locate the server-state file */ + curproxy->load_server_state_from_file = PR_SRV_STATE_FILE_LOCAL; + } + else if (strcmp(args[1], "none") == 0) { /* don't use server-state-file directive for this backend */ + curproxy->load_server_state_from_file = PR_SRV_STATE_FILE_NONE; + } + else { + ha_alert("parsing [%s:%d] : '%s' expects 'global', 'local' or 'none'. Got '%s'\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "server-state-file-name") == 0) { + if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + + ha_free(&curproxy->server_state_file_name); + + if (*(args[1]) == 0 || strcmp(args[1], "use-backend-name") == 0) + curproxy->server_state_file_name = strdup(curproxy->id); + else + curproxy->server_state_file_name = strdup(args[1]); + + if (!curproxy->server_state_file_name) + goto alloc_error; + } + else if (strcmp(args[0], "max-session-srv-conns") == 0) { + if (warnifnotcap(curproxy, PR_CAP_FE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects a number. 
Got no argument\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + curproxy->max_out_conns = atoi(args[1]); + } + else if (strcmp(args[0], "capture") == 0) { + if (warnifnotcap(curproxy, PR_CAP_FE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (strcmp(args[1], "cookie") == 0) { /* name of a cookie to capture */ + if (curproxy->cap & PR_CAP_DEF) { + ha_alert("parsing [%s:%d] : '%s %s' not allowed in 'defaults' section.\n", file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (alertif_too_many_args_idx(4, 1, file, linenum, args, &err_code)) + goto out; + + if (*(args[4]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects 'cookie' <cookie_name> 'len' <len>.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + free(curproxy->capture_name); + curproxy->capture_name = strdup(args[2]); + if (!curproxy->capture_name) + goto alloc_error; + curproxy->capture_namelen = strlen(curproxy->capture_name); + curproxy->capture_len = atol(args[4]); + curproxy->to_log |= LW_COOKIE; + } + else if (strcmp(args[1], "request") == 0 && strcmp(args[2], "header") == 0) { + struct cap_hdr *hdr; + + if (curproxy->cap & PR_CAP_DEF) { + ha_alert("parsing [%s:%d] : '%s %s' not allowed in 'defaults' section.\n", file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (alertif_too_many_args_idx(4, 1, file, linenum, args, &err_code)) + goto out; + + if (*(args[3]) == 0 || strcmp(args[4], "len") != 0 || *(args[5]) == 0) { + ha_alert("parsing [%s:%d] : '%s %s' expects 'header' <header_name> 'len' <len>.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + hdr = calloc(1, sizeof(*hdr)); + if (!hdr) + goto req_caphdr_alloc_error; + hdr->next = curproxy->req_cap; + hdr->name = strdup(args[3]); + if (!hdr->name) + goto req_caphdr_alloc_error; + hdr->namelen = strlen(args[3]); + hdr->len = atol(args[5]); + hdr->pool = create_pool("caphdr", hdr->len + 1, MEM_F_SHARED); + if (!hdr->pool) { + req_caphdr_alloc_error: + if (hdr) + ha_free(&hdr->name); + ha_free(&hdr); + goto alloc_error; + } + hdr->index = curproxy->nb_req_cap++; + curproxy->req_cap = hdr; + curproxy->to_log |= LW_REQHDR; + } + else if (strcmp(args[1], "response") == 0 && strcmp(args[2], "header") == 0) { + struct cap_hdr *hdr; + + if (curproxy->cap & PR_CAP_DEF) { + ha_alert("parsing [%s:%d] : '%s %s' not allowed in 'defaults' section.\n", file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (alertif_too_many_args_idx(4, 1, file, linenum, args, &err_code)) + goto out; + + if (*(args[3]) == 0 || strcmp(args[4], "len") != 0 || *(args[5]) == 0) { + ha_alert("parsing [%s:%d] : '%s %s' expects 'header' <header_name> 'len' <len>.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + hdr = calloc(1, sizeof(*hdr)); + if (!hdr) + goto res_caphdr_alloc_error; + hdr->next = curproxy->rsp_cap; + hdr->name = strdup(args[3]); + if (!hdr->name) + goto res_caphdr_alloc_error; + hdr->namelen = strlen(args[3]); + hdr->len = atol(args[5]); + hdr->pool = create_pool("caphdr", hdr->len + 1, MEM_F_SHARED); + if (!hdr->pool) { + res_caphdr_alloc_error: + if (hdr) + ha_free(&hdr->name); + ha_free(&hdr); + goto alloc_error; + } + hdr->index = curproxy->nb_rsp_cap++; + curproxy->rsp_cap = hdr; + curproxy->to_log |= LW_RSPHDR; + } + else { + ha_alert("parsing [%s:%d] : '%s' expects 'cookie' or 'request header' 
or 'response header'.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "retries") == 0) { /* connection retries */ + if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument (dispatch counts for one).\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + curproxy->conn_retries = atol(args[1]); + } + else if (strcmp(args[0], "http-request") == 0) { /* request access control: allow/deny/auth */ + struct act_rule *rule; + int where = 0; + + if ((curproxy->cap & PR_CAP_DEF) && strlen(curproxy->id) == 0) { + ha_alert("parsing [%s:%d] : '%s' not allowed in anonymous 'defaults' section.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (!LIST_ISEMPTY(&curproxy->http_req_rules) && + !LIST_PREV(&curproxy->http_req_rules, struct act_rule *, list)->cond && + (LIST_PREV(&curproxy->http_req_rules, struct act_rule *, list)->flags & ACT_FLAG_FINAL)) { + ha_warning("parsing [%s:%d]: previous '%s' action is final and has no condition attached, further entries are NOOP.\n", + file, linenum, args[0]); + err_code |= ERR_WARN; + } + + rule = parse_http_req_cond((const char **)args + 1, file, linenum, curproxy); + + if (!rule) { + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + err_code |= warnif_misplaced_http_req(curproxy, file, linenum, args[0]); + + if (curproxy->cap & PR_CAP_FE) + where |= SMP_VAL_FE_HRQ_HDR; + if (curproxy->cap & PR_CAP_BE) + where |= SMP_VAL_BE_HRQ_HDR; + err_code |= warnif_cond_conflicts(rule->cond, where, file, linenum); + + LIST_APPEND(&curproxy->http_req_rules, &rule->list); + } + else if (strcmp(args[0], "http-response") == 0) { /* response access control */ + struct act_rule *rule; + int where = 0; + + if ((curproxy->cap & PR_CAP_DEF) && strlen(curproxy->id) == 0) { + ha_alert("parsing [%s:%d] : '%s' not allowed in anonymous 'defaults' section.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (!LIST_ISEMPTY(&curproxy->http_res_rules) && + !LIST_PREV(&curproxy->http_res_rules, struct act_rule *, list)->cond && + (LIST_PREV(&curproxy->http_res_rules, struct act_rule *, list)->flags & ACT_FLAG_FINAL)) { + ha_warning("parsing [%s:%d]: previous '%s' action is final and has no condition attached, further entries are NOOP.\n", + file, linenum, args[0]); + err_code |= ERR_WARN; + } + + rule = parse_http_res_cond((const char **)args + 1, file, linenum, curproxy); + + if (!rule) { + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + if (curproxy->cap & PR_CAP_FE) + where |= SMP_VAL_FE_HRS_HDR; + if (curproxy->cap & PR_CAP_BE) + where |= SMP_VAL_BE_HRS_HDR; + err_code |= warnif_cond_conflicts(rule->cond, where, file, linenum); + + LIST_APPEND(&curproxy->http_res_rules, &rule->list); + } + else if (strcmp(args[0], "http-after-response") == 0) { + struct act_rule *rule; + int where = 0; + if ((curproxy->cap & PR_CAP_DEF) && strlen(curproxy->id) == 0) { + ha_alert("parsing [%s:%d] : '%s' not allowed in anonymous 'defaults' section.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (!LIST_ISEMPTY(&curproxy->http_after_res_rules) && + !LIST_PREV(&curproxy->http_after_res_rules, struct act_rule *, list)->cond && + (LIST_PREV(&curproxy->http_after_res_rules, struct act_rule *, 
list)->flags & ACT_FLAG_FINAL)) { + ha_warning("parsing [%s:%d]: previous '%s' action is final and has no condition attached, further entries are NOOP.\n", + file, linenum, args[0]); + err_code |= ERR_WARN; + } + + rule = parse_http_after_res_cond((const char **)args + 1, file, linenum, curproxy); + + if (!rule) { + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + if (curproxy->cap & PR_CAP_FE) + where |= SMP_VAL_FE_HRS_HDR; + if (curproxy->cap & PR_CAP_BE) + where |= SMP_VAL_BE_HRS_HDR; + err_code |= warnif_cond_conflicts(rule->cond, where, file, linenum); + + LIST_APPEND(&curproxy->http_after_res_rules, &rule->list); + } + else if (strcmp(args[0], "http-send-name-header") == 0) { /* send server name in request header */ + /* set the header name and length into the proxy structure */ + if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (!*args[1]) { + ha_alert("parsing [%s:%d] : '%s' requires a header string.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + /* set the desired header name, in lower case */ + istfree(&curproxy->server_id_hdr_name); + curproxy->server_id_hdr_name = istdup(ist(args[1])); + if (!isttest(curproxy->server_id_hdr_name)) + goto alloc_error; + ist2bin_lc(istptr(curproxy->server_id_hdr_name), curproxy->server_id_hdr_name); + } + else if (strcmp(args[0], "block") == 0) { + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. Use 'http-request deny' which uses the exact same syntax.\n", file, linenum, args[0]); + + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "redirect") == 0) { + struct redirect_rule *rule; + int where = 0; + + if (curproxy->cap & PR_CAP_DEF) { + ha_alert("parsing [%s:%d] : '%s' not allowed in 'defaults' section.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if ((rule = http_parse_redirect_rule(file, linenum, curproxy, (const char **)args + 1, &errmsg, 0, 0)) == NULL) { + ha_alert("parsing [%s:%d] : error detected in %s '%s' while parsing redirect rule : %s.\n", + file, linenum, proxy_type_str(curproxy), curproxy->id, errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + LIST_APPEND(&curproxy->redirect_rules, &rule->list); + err_code |= warnif_misplaced_redirect(curproxy, file, linenum, args[0]); + + if (curproxy->cap & PR_CAP_FE) + where |= SMP_VAL_FE_HRQ_HDR; + if (curproxy->cap & PR_CAP_BE) + where |= SMP_VAL_BE_HRQ_HDR; + err_code |= warnif_cond_conflicts(rule->cond, where, file, linenum); + } + else if (strcmp(args[0], "use_backend") == 0) { + struct switching_rule *rule; + + if (curproxy->cap & PR_CAP_DEF) { + ha_alert("parsing [%s:%d] : '%s' not allowed in 'defaults' section.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (warnifnotcap(curproxy, PR_CAP_FE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects a backend name.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (strcmp(args[2], "if") == 0 || strcmp(args[2], "unless") == 0) { + if ((cond = build_acl_cond(file, linenum, &curproxy->acl, curproxy, (const char **)args + 2, &errmsg)) == NULL) { + ha_alert("parsing [%s:%d] : error detected while parsing switching rule : %s.\n", + file, linenum, errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + err_code |= warnif_cond_conflicts(cond, SMP_VAL_FE_SET_BCK, file, 
linenum); + } + else if (*args[2]) { + ha_alert("parsing [%s:%d] : unexpected keyword '%s' after switching rule, only 'if' and 'unless' are allowed.\n", + file, linenum, args[2]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + rule = calloc(1, sizeof(*rule)); + if (!rule) + goto use_backend_alloc_error; + rule->cond = cond; + rule->be.name = strdup(args[1]); + if (!rule->be.name) + goto use_backend_alloc_error; + rule->line = linenum; + rule->file = strdup(file); + if (!rule->file) { + use_backend_alloc_error: + free_acl_cond(cond); + if (rule) + ha_free(&(rule->be.name)); + ha_free(&rule); + goto alloc_error; + } + LIST_INIT(&rule->list); + LIST_APPEND(&curproxy->switching_rules, &rule->list); + } + else if (strcmp(args[0], "use-server") == 0) { + struct server_rule *rule; + + if (curproxy->cap & PR_CAP_DEF) { + ha_alert("parsing [%s:%d] : '%s' not allowed in 'defaults' section.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects a server name.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (strcmp(args[2], "if") != 0 && strcmp(args[2], "unless") != 0) { + ha_alert("parsing [%s:%d] : '%s' requires either 'if' or 'unless' followed by a condition.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if ((cond = build_acl_cond(file, linenum, &curproxy->acl, curproxy, (const char **)args + 2, &errmsg)) == NULL) { + ha_alert("parsing [%s:%d] : error detected while parsing switching rule : %s.\n", + file, linenum, errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + err_code |= warnif_cond_conflicts(cond, SMP_VAL_BE_SET_SRV, file, linenum); + + rule = calloc(1, sizeof(*rule)); + if (!rule) + goto use_server_alloc_error; + rule->cond = cond; + rule->srv.name = strdup(args[1]); + if (!rule->srv.name) + goto use_server_alloc_error; + rule->line = linenum; + rule->file = strdup(file); + if (!rule->file) { + use_server_alloc_error: + free_acl_cond(cond); + if (rule) + ha_free(&(rule->srv.name)); + ha_free(&rule); + goto alloc_error; + } + LIST_INIT(&rule->list); + LIST_APPEND(&curproxy->server_rules, &rule->list); + curproxy->be_req_ana |= AN_REQ_SRV_RULES; + } + else if ((strcmp(args[0], "force-persist") == 0) || + (strcmp(args[0], "ignore-persist") == 0)) { + struct persist_rule *rule; + + if (curproxy->cap & PR_CAP_DEF) { + ha_alert("parsing [%s:%d] : '%s' not allowed in 'defaults' section.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (strcmp(args[1], "if") != 0 && strcmp(args[1], "unless") != 0) { + ha_alert("parsing [%s:%d] : '%s' requires either 'if' or 'unless' followed by a condition.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if ((cond = build_acl_cond(file, linenum, &curproxy->acl, curproxy, (const char **)args + 1, &errmsg)) == NULL) { + ha_alert("parsing [%s:%d] : error detected while parsing a '%s' rule : %s.\n", + file, linenum, args[0], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + /* note: BE_REQ_CNT is the first one after FE_SET_BCK, which is + * where force-persist is applied. 
+ */ + err_code |= warnif_cond_conflicts(cond, SMP_VAL_BE_REQ_CNT, file, linenum); + + rule = calloc(1, sizeof(*rule)); + if (!rule) { + free_acl_cond(cond); + goto alloc_error; + } + rule->cond = cond; + if (strcmp(args[0], "force-persist") == 0) { + rule->type = PERSIST_TYPE_FORCE; + } else { + rule->type = PERSIST_TYPE_IGNORE; + } + LIST_INIT(&rule->list); + LIST_APPEND(&curproxy->persist_rules, &rule->list); + } + else if (strcmp(args[0], "stick-table") == 0) { + struct stktable *other; + + if (curproxy->cap & PR_CAP_DEF) { + ha_alert("parsing [%s:%d] : 'stick-table' is not supported in 'defaults' section.\n", + file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + other = stktable_find_by_name(curproxy->id); + if (other) { + ha_alert("parsing [%s:%d] : stick-table name '%s' conflicts with table declared in %s '%s' at %s:%d.\n", + file, linenum, curproxy->id, + other->proxy ? proxy_cap_str(other->proxy->cap) : "peers", + other->proxy ? other->id : other->peers.p->id, + other->conf.file, other->conf.line); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + curproxy->table = calloc(1, sizeof *curproxy->table); + if (!curproxy->table) { + ha_alert("parsing [%s:%d]: '%s %s' : memory allocation failed\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + err_code |= parse_stick_table(file, linenum, args, curproxy->table, + curproxy->id, curproxy->id, NULL); + if (err_code & ERR_FATAL) { + ha_free(&curproxy->table); + goto out; + } + + /* Store the proxy in the stick-table. */ + curproxy->table->proxy = curproxy; + + stktable_store_name(curproxy->table); + curproxy->table->next = stktables_list; + stktables_list = curproxy->table; + + /* Add this proxy to the list of proxies which refer to its stick-table. 
*/ + if (curproxy->table->proxies_list != curproxy) { + curproxy->next_stkt_ref = curproxy->table->proxies_list; + curproxy->table->proxies_list = curproxy; + } + } + else if (strcmp(args[0], "stick") == 0) { + struct sticking_rule *rule; + struct sample_expr *expr; + int myidx = 0; + const char *name = NULL; + int flags; + + if (curproxy->cap & PR_CAP_DEF) { + ha_alert("parsing [%s:%d] : '%s' not allowed in 'defaults' section.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, args[0], NULL)) { + err_code |= ERR_WARN; + goto out; + } + + myidx++; + if ((strcmp(args[myidx], "store") == 0) || + (strcmp(args[myidx], "store-request") == 0)) { + myidx++; + flags = STK_IS_STORE; + } + else if (strcmp(args[myidx], "store-response") == 0) { + myidx++; + flags = STK_IS_STORE | STK_ON_RSP; + } + else if (strcmp(args[myidx], "match") == 0) { + myidx++; + flags = STK_IS_MATCH; + } + else if (strcmp(args[myidx], "on") == 0) { + myidx++; + flags = STK_IS_MATCH | STK_IS_STORE; + } + else { + ha_alert("parsing [%s:%d] : '%s' expects 'on', 'match', or 'store'.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (*(args[myidx]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects a fetch method.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + curproxy->conf.args.ctx = ARGC_STK; + expr = sample_parse_expr(args, &myidx, file, linenum, &errmsg, &curproxy->conf.args, NULL); + if (!expr) { + ha_alert("parsing [%s:%d] : '%s': %s\n", file, linenum, args[0], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (flags & STK_ON_RSP) { + if (!(expr->fetch->val & SMP_VAL_BE_STO_RUL)) { + ha_alert("parsing [%s:%d] : '%s': fetch method '%s' extracts information from '%s', none of which is available for 'store-response'.\n", + file, linenum, args[0], expr->fetch->kw, sample_src_names(expr->fetch->use)); + err_code |= ERR_ALERT | ERR_FATAL; + free(expr); + goto out; + } + } else { + if (!(expr->fetch->val & SMP_VAL_BE_SET_SRV)) { + ha_alert("parsing [%s:%d] : '%s': fetch method '%s' extracts information from '%s', none of which is available during request.\n", + file, linenum, args[0], expr->fetch->kw, sample_src_names(expr->fetch->use)); + err_code |= ERR_ALERT | ERR_FATAL; + free(expr); + goto out; + } + } + + /* check if we need to allocate an http_txn struct for HTTP parsing */ + curproxy->http_needed |= !!(expr->fetch->use & SMP_USE_HTTP_ANY); + + if (strcmp(args[myidx], "table") == 0) { + myidx++; + name = args[myidx++]; + } + + if (strcmp(args[myidx], "if") == 0 || strcmp(args[myidx], "unless") == 0) { + if ((cond = build_acl_cond(file, linenum, &curproxy->acl, curproxy, (const char **)args + myidx, &errmsg)) == NULL) { + ha_alert("parsing [%s:%d] : '%s': error detected while parsing sticking condition : %s.\n", + file, linenum, args[0], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + free(expr); + goto out; + } + } + else if (*(args[myidx])) { + ha_alert("parsing [%s:%d] : '%s': unknown keyword '%s'.\n", + file, linenum, args[0], args[myidx]); + err_code |= ERR_ALERT | ERR_FATAL; + free(expr); + goto out; + } + if (flags & STK_ON_RSP) + err_code |= warnif_cond_conflicts(cond, SMP_VAL_BE_STO_RUL, file, linenum); + else + err_code |= warnif_cond_conflicts(cond, SMP_VAL_BE_SET_SRV, file, linenum); + + rule = calloc(1, sizeof(*rule)); + if (!rule) { + free_acl_cond(cond); + goto alloc_error; + } + rule->cond = cond; + rule->expr = expr; + 
rule->flags = flags; + rule->table.name = name ? strdup(name) : NULL; + LIST_INIT(&rule->list); + if (flags & STK_ON_RSP) + LIST_APPEND(&curproxy->storersp_rules, &rule->list); + else + LIST_APPEND(&curproxy->sticking_rules, &rule->list); + } + else if (strcmp(args[0], "stats") == 0) { + if (!(curproxy->cap & PR_CAP_DEF) && curproxy->uri_auth == curr_defproxy->uri_auth) + curproxy->uri_auth = NULL; /* we must detach from the default config */ + + if (!*args[1]) { + goto stats_error_parsing; + } else if (strcmp(args[1], "admin") == 0) { + struct stats_admin_rule *rule; + int where = 0; + + if (curproxy->cap & PR_CAP_DEF) { + ha_alert("parsing [%s:%d]: '%s %s' not allowed in 'defaults' section.\n", file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (!stats_check_init_uri_auth(&curproxy->uri_auth)) + goto alloc_error; + + if (strcmp(args[2], "if") != 0 && strcmp(args[2], "unless") != 0) { + ha_alert("parsing [%s:%d] : '%s %s' requires either 'if' or 'unless' followed by a condition.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if ((cond = build_acl_cond(file, linenum, &curproxy->acl, curproxy, (const char **)args + 2, &errmsg)) == NULL) { + ha_alert("parsing [%s:%d] : error detected while parsing a '%s %s' rule : %s.\n", + file, linenum, args[0], args[1], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (curproxy->cap & PR_CAP_FE) + where |= SMP_VAL_FE_HRQ_HDR; + if (curproxy->cap & PR_CAP_BE) + where |= SMP_VAL_BE_HRQ_HDR; + err_code |= warnif_cond_conflicts(cond, where, file, linenum); + + rule = calloc(1, sizeof(*rule)); + if (!rule) { + free_acl_cond(cond); + goto alloc_error; + } + rule->cond = cond; + LIST_INIT(&rule->list); + LIST_APPEND(&curproxy->uri_auth->admin_rules, &rule->list); + } else if (strcmp(args[1], "uri") == 0) { + if (*(args[2]) == 0) { + ha_alert("parsing [%s:%d] : 'uri' needs a URI prefix.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } else if (!stats_set_uri(&curproxy->uri_auth, args[2])) + goto alloc_error; + } else if (strcmp(args[1], "realm") == 0) { + if (*(args[2]) == 0) { + ha_alert("parsing [%s:%d] : 'realm' needs a realm name.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } else if (!stats_set_realm(&curproxy->uri_auth, args[2])) + goto alloc_error; + } else if (strcmp(args[1], "refresh") == 0) { + unsigned interval; + + err = parse_time_err(args[2], &interval, TIME_UNIT_S); + if (err == PARSE_TIME_OVER) { + ha_alert("parsing [%s:%d]: timer overflow in argument <%s> to stats refresh interval, maximum value is 2147483647 s (~68 years).\n", + file, linenum, args[2]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (err == PARSE_TIME_UNDER) { + ha_alert("parsing [%s:%d]: timer underflow in argument <%s> to stats refresh interval, minimum non-null value is 1 s.\n", + file, linenum, args[2]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (err) { + ha_alert("parsing [%s:%d]: unexpected character '%c' in argument to stats refresh interval.\n", + file, linenum, *err); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } else if (!stats_set_refresh(&curproxy->uri_auth, interval)) + goto alloc_error; + } else if (strcmp(args[1], "http-request") == 0) { /* request access control: allow/deny/auth */ + struct act_rule *rule; + int where = 0; + + if (curproxy->cap & PR_CAP_DEF) { + ha_alert("parsing [%s:%d]: '%s' not allowed in 'defaults' section.\n", file, linenum, args[0]); + 
err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (!stats_check_init_uri_auth(&curproxy->uri_auth)) + goto alloc_error; + + if (!LIST_ISEMPTY(&curproxy->uri_auth->http_req_rules) && + !LIST_PREV(&curproxy->uri_auth->http_req_rules, struct act_rule *, list)->cond) { + ha_warning("parsing [%s:%d]: previous '%s' action has no condition attached, further entries are NOOP.\n", + file, linenum, args[0]); + err_code |= ERR_WARN; + } + + rule = parse_http_req_cond((const char **)args + 2, file, linenum, curproxy); + + if (!rule) { + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + if (curproxy->cap & PR_CAP_FE) + where |= SMP_VAL_FE_HRQ_HDR; + if (curproxy->cap & PR_CAP_BE) + where |= SMP_VAL_BE_HRQ_HDR; + err_code |= warnif_cond_conflicts(rule->cond, where, file, linenum); + LIST_APPEND(&curproxy->uri_auth->http_req_rules, &rule->list); + + } else if (strcmp(args[1], "auth") == 0) { + if (*(args[2]) == 0) { + ha_alert("parsing [%s:%d] : 'auth' needs a user:password account.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } else if (!stats_add_auth(&curproxy->uri_auth, args[2])) + goto alloc_error; + } else if (strcmp(args[1], "scope") == 0) { + if (*(args[2]) == 0) { + ha_alert("parsing [%s:%d] : 'scope' needs a proxy name.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } else if (!stats_add_scope(&curproxy->uri_auth, args[2])) + goto alloc_error; + } else if (strcmp(args[1], "enable") == 0) { + if (!stats_check_init_uri_auth(&curproxy->uri_auth)) + goto alloc_error; + } else if (strcmp(args[1], "hide-version") == 0) { + if (!stats_set_flag(&curproxy->uri_auth, STAT_HIDEVER)) + goto alloc_error; + } else if (strcmp(args[1], "show-legends") == 0) { + if (!stats_set_flag(&curproxy->uri_auth, STAT_SHLGNDS)) + goto alloc_error; + } else if (strcmp(args[1], "show-modules") == 0) { + if (!stats_set_flag(&curproxy->uri_auth, STAT_SHMODULES)) + goto alloc_error; + } else if (strcmp(args[1], "show-node") == 0) { + + if (*args[2]) { + int i; + char c; + + for (i=0; args[2][i]; i++) { + c = args[2][i]; + if (!isupper((unsigned char)c) && !islower((unsigned char)c) && + !isdigit((unsigned char)c) && c != '_' && c != '-' && c != '.') + break; + } + + if (!i || args[2][i]) { + ha_alert("parsing [%s:%d]: '%s %s' invalid node name - should be a string " + "with digits(0-9), letters(A-Z, a-z), hyphen(-) or underscore(_).\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + + if (!stats_set_node(&curproxy->uri_auth, args[2])) + goto alloc_error; + } else if (strcmp(args[1], "show-desc") == 0) { + char *desc = NULL; + + if (*args[2]) { + int i, len=0; + char *d; + + for (i = 2; *args[i]; i++) + len += strlen(args[i]) + 1; + + desc = d = calloc(1, len); + + d += snprintf(d, desc + len - d, "%s", args[2]); + for (i = 3; *args[i]; i++) + d += snprintf(d, desc + len - d, " %s", args[i]); + } + + if (!*args[2] && !global.desc) + ha_warning("parsing [%s:%d]: '%s' requires a parameter or 'desc' to be set in the global section.\n", + file, linenum, args[1]); + else { + if (!stats_set_desc(&curproxy->uri_auth, desc)) { + free(desc); + goto alloc_error; + } + free(desc); + } + } else { +stats_error_parsing: + ha_alert("parsing [%s:%d]: %s '%s', expects 'admin', 'uri', 'realm', 'auth', 'scope', 'enable', 'hide-version', 'show-node', 'show-desc' or 'show-legends'.\n", + file, linenum, *args[1]?"unknown stats parameter":"missing keyword in", args[*args[1]?1:0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + 
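(For illustration only: the 'stats' sub-keywords handled above map onto a listener section along these lines; this is a minimal sketch with made-up names, address and credentials, not part of this patch:

    listen stats-page
        mode http
        bind :8404
        stats enable
        stats uri /stats
        stats realm HAProxy\ Statistics
        stats auth admin:changeme
        stats refresh 10s
        acl local_admin src 127.0.0.1
        stats admin if local_admin
)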
else if (strcmp(args[0], "option") == 0) { + int optnum; + + if (*(args[1]) == '\0') { + ha_alert("parsing [%s:%d]: '%s' expects an option name.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + for (optnum = 0; cfg_opts[optnum].name; optnum++) { + if (strcmp(args[1], cfg_opts[optnum].name) == 0) { + if (cfg_opts[optnum].cap == PR_CAP_NONE) { + ha_alert("parsing [%s:%d]: option '%s' is not supported due to build options.\n", + file, linenum, cfg_opts[optnum].name); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto out; + + if (warnifnotcap(curproxy, cfg_opts[optnum].cap, file, linenum, args[1], NULL)) { + err_code |= ERR_WARN; + goto out; + } + + curproxy->no_options &= ~cfg_opts[optnum].val; + curproxy->options &= ~cfg_opts[optnum].val; + + switch (kwm) { + case KWM_STD: + curproxy->options |= cfg_opts[optnum].val; + break; + case KWM_NO: + curproxy->no_options |= cfg_opts[optnum].val; + break; + case KWM_DEF: /* already cleared */ + break; + } + + goto out; + } + } + + for (optnum = 0; cfg_opts2[optnum].name; optnum++) { + if (strcmp(args[1], cfg_opts2[optnum].name) == 0) { + if (cfg_opts2[optnum].cap == PR_CAP_NONE) { + ha_alert("parsing [%s:%d]: option '%s' is not supported due to build options.\n", + file, linenum, cfg_opts2[optnum].name); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto out; + if (warnifnotcap(curproxy, cfg_opts2[optnum].cap, file, linenum, args[1], NULL)) { + err_code |= ERR_WARN; + goto out; + } + + curproxy->no_options2 &= ~cfg_opts2[optnum].val; + curproxy->options2 &= ~cfg_opts2[optnum].val; + + switch (kwm) { + case KWM_STD: + curproxy->options2 |= cfg_opts2[optnum].val; + break; + case KWM_NO: + curproxy->no_options2 |= cfg_opts2[optnum].val; + break; + case KWM_DEF: /* already cleared */ + break; + } + goto out; + } + } + + /* HTTP options override each other. They can be cancelled using + * "no option xxx" which only switches to default mode if the mode + * was this one (useful for cancelling options set in defaults + * sections). 
+ */ + if (strcmp(args[1], "forceclose") == 0) { + ha_alert("parsing [%s:%d]: option '%s' is not supported any more since HAProxy 2.0, please just remove it, or use 'option httpclose' if absolutely needed.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[1], "httpclose") == 0) { + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto out; + if (kwm == KWM_STD) { + curproxy->options &= ~PR_O_HTTP_MODE; + curproxy->options |= PR_O_HTTP_CLO; + goto out; + } + else if (kwm == KWM_NO) { + if ((curproxy->options & PR_O_HTTP_MODE) == PR_O_HTTP_CLO) + curproxy->options &= ~PR_O_HTTP_MODE; + goto out; + } + } + else if (strcmp(args[1], "http-server-close") == 0) { + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto out; + if (kwm == KWM_STD) { + curproxy->options &= ~PR_O_HTTP_MODE; + curproxy->options |= PR_O_HTTP_SCL; + goto out; + } + else if (kwm == KWM_NO) { + if ((curproxy->options & PR_O_HTTP_MODE) == PR_O_HTTP_SCL) + curproxy->options &= ~PR_O_HTTP_MODE; + goto out; + } + } + else if (strcmp(args[1], "http-keep-alive") == 0) { + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto out; + if (kwm == KWM_STD) { + curproxy->options &= ~PR_O_HTTP_MODE; + curproxy->options |= PR_O_HTTP_KAL; + goto out; + } + else if (kwm == KWM_NO) { + if ((curproxy->options & PR_O_HTTP_MODE) == PR_O_HTTP_KAL) + curproxy->options &= ~PR_O_HTTP_MODE; + goto out; + } + } + else if (strcmp(args[1], "http-tunnel") == 0) { + ha_alert("parsing [%s:%d]: option '%s' is not supported any more since HAProxy 2.1, please just remove it, it shouldn't be needed.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[1], "forwarded") == 0) { + if (kwm == KWM_STD) { + err_code |= proxy_http_parse_7239(args, 0, curproxy, curr_defproxy, file, linenum); + goto out; + } + else if (kwm == KWM_NO) { + if (curproxy->http_ext) + http_ext_7239_clean(curproxy); + goto out; + } + } + + /* Redispatch can take an integer argument that controls when the + * redispatch occurs. All values are relative to the retries option. + * This can be cancelled using "no option xxx". + */ + if (strcmp(args[1], "redispatch") == 0) { + if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, args[1], NULL)) { + err_code |= ERR_WARN; + goto out; + } + + curproxy->no_options &= ~PR_O_REDISP; + curproxy->options &= ~PR_O_REDISP; + + switch (kwm) { + case KWM_STD: + curproxy->options |= PR_O_REDISP; + curproxy->redispatch_after = -1; + if (*args[2]) { + curproxy->redispatch_after = atol(args[2]); + } + break; + case KWM_NO: + curproxy->no_options |= PR_O_REDISP; + curproxy->redispatch_after = 0; + break; + case KWM_DEF: /* already cleared */ + break; + } + goto out; + } + + if (strcmp(args[1], "http_proxy") == 0) { + ha_alert("parsing [%s:%d]: option '%s' is not supported any more since HAProxy 2.5. This option stopped working in HAProxy 1.9 and usually had nasty side effects. 
It can be more reliably implemented with combinations of 'http-request set-dst' and 'http-request set-uri', and even 'http-request do-resolve' if DNS resolution is desired.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (kwm != KWM_STD) { + ha_alert("parsing [%s:%d]: negation/default is not supported for option '%s'.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (strcmp(args[1], "httplog") == 0) { + char *logformat; + /* generate a complete HTTP log */ + logformat = default_http_log_format; + if (*(args[2]) != '\0') { + if (strcmp(args[2], "clf") == 0) { + curproxy->options2 |= PR_O2_CLFLOG; + logformat = clf_http_log_format; + } else { + ha_alert("parsing [%s:%d] : keyword '%s' only supports option 'clf'.\n", file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (alertif_too_many_args_idx(1, 1, file, linenum, args, &err_code)) + goto out; + } + if (curproxy->conf.logformat_string && curproxy->cap & PR_CAP_DEF) { + char *oldlogformat = "log-format"; + char *clflogformat = ""; + + if (curproxy->conf.logformat_string == default_http_log_format) + oldlogformat = "option httplog"; + else if (curproxy->conf.logformat_string == default_tcp_log_format) + oldlogformat = "option tcplog"; + else if (curproxy->conf.logformat_string == clf_http_log_format) + oldlogformat = "option httplog clf"; + else if (curproxy->conf.logformat_string == default_https_log_format) + oldlogformat = "option httpslog"; + if (logformat == clf_http_log_format) + clflogformat = " clf"; + ha_warning("parsing [%s:%d]: 'option httplog%s' overrides previous '%s' in 'defaults' section.\n", + file, linenum, clflogformat, oldlogformat); + } + if (curproxy->conf.logformat_string != default_http_log_format && + curproxy->conf.logformat_string != default_tcp_log_format && + curproxy->conf.logformat_string != clf_http_log_format && + curproxy->conf.logformat_string != default_https_log_format) + free(curproxy->conf.logformat_string); + curproxy->conf.logformat_string = logformat; + + free(curproxy->conf.lfs_file); + curproxy->conf.lfs_file = strdup(curproxy->conf.args.file); + curproxy->conf.lfs_line = curproxy->conf.args.line; + + if (!(curproxy->cap & PR_CAP_DEF) && !(curproxy->cap & PR_CAP_FE)) { + ha_warning("parsing [%s:%d] : backend '%s' : 'option httplog' directive is ignored in backends.\n", + file, linenum, curproxy->id); + err_code |= ERR_WARN; + } + } + else if (strcmp(args[1], "tcplog") == 0) { + if (curproxy->conf.logformat_string && curproxy->cap & PR_CAP_DEF) { + char *oldlogformat = "log-format"; + + if (curproxy->conf.logformat_string == default_http_log_format) + oldlogformat = "option httplog"; + else if (curproxy->conf.logformat_string == default_tcp_log_format) + oldlogformat = "option tcplog"; + else if (curproxy->conf.logformat_string == clf_http_log_format) + oldlogformat = "option httplog clf"; + else if (curproxy->conf.logformat_string == default_https_log_format) + oldlogformat = "option httpslog"; + ha_warning("parsing [%s:%d]: 'option tcplog' overrides previous '%s' in 'defaults' section.\n", + file, linenum, oldlogformat); + } + /* generate a detailed TCP log */ + if (curproxy->conf.logformat_string != default_http_log_format && + curproxy->conf.logformat_string != default_tcp_log_format && + curproxy->conf.logformat_string != clf_http_log_format && + curproxy->conf.logformat_string != default_https_log_format) + free(curproxy->conf.logformat_string); + curproxy->conf.logformat_string = 
default_tcp_log_format; + + free(curproxy->conf.lfs_file); + curproxy->conf.lfs_file = strdup(curproxy->conf.args.file); + curproxy->conf.lfs_line = curproxy->conf.args.line; + + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto out; + + if (!(curproxy->cap & PR_CAP_DEF) && !(curproxy->cap & PR_CAP_FE)) { + ha_warning("parsing [%s:%d] : backend '%s' : 'option tcplog' directive is ignored in backends.\n", + file, linenum, curproxy->id); + err_code |= ERR_WARN; + } + } + else if (strcmp(args[1], "httpslog") == 0) { + char *logformat; + /* generate a complete HTTPS log */ + logformat = default_https_log_format; + if (curproxy->conf.logformat_string && curproxy->cap & PR_CAP_DEF) { + char *oldlogformat = "log-format"; + + if (curproxy->conf.logformat_string == default_http_log_format) + oldlogformat = "option httplog"; + else if (curproxy->conf.logformat_string == default_tcp_log_format) + oldlogformat = "option tcplog"; + else if (curproxy->conf.logformat_string == clf_http_log_format) + oldlogformat = "option httplog clf"; + else if (curproxy->conf.logformat_string == default_https_log_format) + oldlogformat = "option httpslog"; + ha_warning("parsing [%s:%d]: 'option httpslog' overrides previous '%s' in 'defaults' section.\n", + file, linenum, oldlogformat); + } + if (curproxy->conf.logformat_string != default_http_log_format && + curproxy->conf.logformat_string != default_tcp_log_format && + curproxy->conf.logformat_string != clf_http_log_format && + curproxy->conf.logformat_string != default_https_log_format) + free(curproxy->conf.logformat_string); + curproxy->conf.logformat_string = logformat; + + free(curproxy->conf.lfs_file); + curproxy->conf.lfs_file = strdup(curproxy->conf.args.file); + curproxy->conf.lfs_line = curproxy->conf.args.line; + + if (!(curproxy->cap & PR_CAP_DEF) && !(curproxy->cap & PR_CAP_FE)) { + ha_warning("parsing [%s:%d] : backend '%s' : 'option httpslog' directive is ignored in backends.\n", + file, linenum, curproxy->id); + err_code |= ERR_WARN; + } + } + else if (strcmp(args[1], "tcpka") == 0) { + /* enable TCP keep-alives on client and server streams */ + if (warnifnotcap(curproxy, PR_CAP_BE | PR_CAP_FE, file, linenum, args[1], NULL)) + err_code |= ERR_WARN; + + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto out; + + if (curproxy->cap & PR_CAP_FE) + curproxy->options |= PR_O_TCP_CLI_KA; + if (curproxy->cap & PR_CAP_BE) + curproxy->options |= PR_O_TCP_SRV_KA; + } + else if (strcmp(args[1], "httpchk") == 0) { + err_code |= proxy_parse_httpchk_opt(args, 0, curproxy, curr_defproxy, file, linenum); + if (err_code & ERR_FATAL) + goto out; + } + else if (strcmp(args[1], "ssl-hello-chk") == 0) { + err_code |= proxy_parse_ssl_hello_chk_opt(args, 0, curproxy, curr_defproxy, file, linenum); + if (err_code & ERR_FATAL) + goto out; + } + else if (strcmp(args[1], "smtpchk") == 0) { + err_code |= proxy_parse_smtpchk_opt(args, 0, curproxy, curr_defproxy, file, linenum); + if (err_code & ERR_FATAL) + goto out; + } + else if (strcmp(args[1], "pgsql-check") == 0) { + err_code |= proxy_parse_pgsql_check_opt(args, 0, curproxy, curr_defproxy, file, linenum); + if (err_code & ERR_FATAL) + goto out; + } + else if (strcmp(args[1], "redis-check") == 0) { + err_code |= proxy_parse_redis_check_opt(args, 0, curproxy, curr_defproxy, file, linenum); + if (err_code & ERR_FATAL) + goto out; + } + else if (strcmp(args[1], "mysql-check") == 0) { + err_code |= proxy_parse_mysql_check_opt(args, 0, curproxy, curr_defproxy, file, linenum); + if 
(err_code & ERR_FATAL) + goto out; + } + else if (strcmp(args[1], "ldap-check") == 0) { + err_code |= proxy_parse_ldap_check_opt(args, 0, curproxy, curr_defproxy, file, linenum); + if (err_code & ERR_FATAL) + goto out; + } + else if (strcmp(args[1], "spop-check") == 0) { + err_code |= proxy_parse_spop_check_opt(args, 0, curproxy, curr_defproxy, file, linenum); + if (err_code & ERR_FATAL) + goto out; + } + else if (strcmp(args[1], "tcp-check") == 0) { + err_code |= proxy_parse_tcp_check_opt(args, 0, curproxy, curr_defproxy, file, linenum); + if (err_code & ERR_FATAL) + goto out; + } + else if (strcmp(args[1], "external-check") == 0) { + err_code |= proxy_parse_external_check_opt(args, 0, curproxy, curr_defproxy, file, linenum); + if (err_code & ERR_FATAL) + goto out; + } + else if (strcmp(args[1], "forwardfor") == 0) { + err_code |= proxy_http_parse_xff(args, 0, curproxy, curr_defproxy, file, linenum); + if (err_code & ERR_FATAL) + goto out; + } + else if (strcmp(args[1], "originalto") == 0) { + err_code |= proxy_http_parse_xot(args, 0, curproxy, curr_defproxy, file, linenum); + if (err_code & ERR_FATAL) + goto out; + } + else if (strcmp(args[1], "http-restrict-req-hdr-names") == 0) { + if (alertif_too_many_args(2, file, linenum, args, &err_code)) + goto out; + + if (*(args[2]) == 0) { + ha_alert("parsing [%s:%d] : missing parameter. option '%s' expects 'preserve', 'reject' or 'delete' option.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + curproxy->options2 &= ~PR_O2_RSTRICT_REQ_HDR_NAMES_MASK; + if (strcmp(args[2], "preserve") == 0) + curproxy->options2 |= PR_O2_RSTRICT_REQ_HDR_NAMES_NOOP; + else if (strcmp(args[2], "reject") == 0) + curproxy->options2 |= PR_O2_RSTRICT_REQ_HDR_NAMES_BLK; + else if (strcmp(args[2], "delete") == 0) + curproxy->options2 |= PR_O2_RSTRICT_REQ_HDR_NAMES_DEL; + else { + ha_alert("parsing [%s:%d] : invalid parameter '%s'. option '%s' expects 'preserve', 'reject' or 'delete' option.\n", + file, linenum, args[2], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else { + const char *best = proxy_find_best_option(args[1], common_options); + + if (best) + ha_alert("parsing [%s:%d] : unknown option '%s'; did you mean '%s' maybe ?\n", file, linenum, args[1], best); + else + ha_alert("parsing [%s:%d] : unknown option '%s'.\n", file, linenum, args[1]); + + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + goto out; + } + else if (strcmp(args[0], "default_backend") == 0) { + if (warnifnotcap(curproxy, PR_CAP_FE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects a backend name.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + free(curproxy->defbe.name); + curproxy->defbe.name = strdup(args[1]); + if (!curproxy->defbe.name) + goto alloc_error; + + if (alertif_too_many_args_idx(1, 0, file, linenum, args, &err_code)) + goto out; + } + else if (strcmp(args[0], "redispatch") == 0 || strcmp(args[0], "redisp") == 0) { + ha_alert("parsing [%s:%d] : keyword '%s' directive is not supported anymore since HAProxy 2.1. 
Use 'option redispatch'.\n", file, linenum, args[0]); + + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "http-reuse") == 0) { + if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (strcmp(args[1], "never") == 0) { + /* never reuse a server connection, always open a new one */ + curproxy->options &= ~PR_O_REUSE_MASK; + curproxy->options |= PR_O_REUSE_NEVR; + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto out; + } + else if (strcmp(args[1], "safe") == 0) { + /* only reuse when safe: never for the first request of a session */ + curproxy->options &= ~PR_O_REUSE_MASK; + curproxy->options |= PR_O_REUSE_SAFE; + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto out; + } + else if (strcmp(args[1], "aggressive") == 0) { + /* also reuse for first requests, over connections proven reusable */ + curproxy->options &= ~PR_O_REUSE_MASK; + curproxy->options |= PR_O_REUSE_AGGR; + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto out; + } + else if (strcmp(args[1], "always") == 0) { + /* always reuse any available idle server connection */ + curproxy->options &= ~PR_O_REUSE_MASK; + curproxy->options |= PR_O_REUSE_ALWS; + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto out; + } + else { + ha_alert("parsing [%s:%d] : '%s' only supports 'never', 'safe', 'aggressive', 'always'.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "monitor") == 0) { + if (curproxy->cap & PR_CAP_DEF) { + ha_alert("parsing [%s:%d] : '%s' not allowed in 'defaults' section.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (warnifnotcap(curproxy, PR_CAP_FE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (strcmp(args[1], "fail") == 0) { + /* add a condition to fail monitor requests */ + if (strcmp(args[2], "if") != 0 && strcmp(args[2], "unless") != 0) { + ha_alert("parsing [%s:%d] : '%s %s' requires either 'if' or 'unless' followed by a condition.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + err_code |= warnif_misplaced_monitor(curproxy, file, linenum, "monitor fail"); + if ((cond = build_acl_cond(file, linenum, &curproxy->acl, curproxy, (const char **)args + 2, &errmsg)) == NULL) { + ha_alert("parsing [%s:%d] : error detected while parsing a '%s %s' condition : %s.\n", + file, linenum, args[0], args[1], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + LIST_APPEND(&curproxy->mon_fail_cond, &cond->list); + } + else { + ha_alert("parsing [%s:%d] : '%s' only supports 'fail'.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } +#ifdef USE_TPROXY + else if (strcmp(args[0], "transparent") == 0) { + /* enable transparent proxy connections */ + curproxy->options |= PR_O_TRANSP; + if (alertif_too_many_args(0, file, linenum, args, &err_code)) + goto out; + } +#endif + else if (strcmp(args[0], "maxconn") == 0) { /* maxconn */ + if (warnifnotcap(curproxy, PR_CAP_FE, file, linenum, args[0], " Maybe you want 'fullconn' instead ?")) + err_code |= ERR_WARN; + + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + curproxy->maxconn = atol(args[1]); + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + } + else if (strcmp(args[0], "backlog") == 0) { /* backlog */ + if 
(warnifnotcap(curproxy, PR_CAP_FE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + curproxy->backlog = atol(args[1]); + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + } + else if (strcmp(args[0], "fullconn") == 0) { /* fullconn */ + if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, args[0], " Maybe you want 'maxconn' instead ?")) + err_code |= ERR_WARN; + + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + curproxy->fullconn = atol(args[1]); + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + } + else if (strcmp(args[0], "grace") == 0) { /* grace time (ms) */ + ha_alert("parsing [%s:%d]: the '%s' keyword is not supported any more since HAProxy version 2.5.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "dispatch") == 0) { /* dispatch address */ + struct sockaddr_storage *sk; + int port1, port2; + + if (curproxy->cap & PR_CAP_DEF) { + ha_alert("parsing [%s:%d] : '%s' not allowed in 'defaults' section.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + sk = str2sa_range(args[1], NULL, &port1, &port2, NULL, NULL, NULL, + &errmsg, NULL, NULL, + PA_O_RESOLVE | PA_O_PORT_OK | PA_O_PORT_MAND | PA_O_STREAM | PA_O_XPRT | PA_O_CONNECT); + if (!sk) { + ha_alert("parsing [%s:%d] : '%s' : %s\n", file, linenum, args[0], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + + curproxy->dispatch_addr = *sk; + curproxy->options |= PR_O_DISPATCH; + } + else if (strcmp(args[0], "balance") == 0) { /* set balancing with optional algorithm */ + if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (backend_parse_balance((const char **)args + 1, &errmsg, curproxy) < 0) { + ha_alert("parsing [%s:%d] : %s %s\n", file, linenum, args[0], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "hash-type") == 0) { /* set hashing method */ + /** + * The syntax for hash-type config element is + * hash-type {map-based|consistent} [[<algo>] avalanche] + * + * The default hash function is sdbm for map-based and sdbm+avalanche for consistent. 
+ */ + curproxy->lbprm.algo &= ~(BE_LB_HASH_TYPE | BE_LB_HASH_FUNC | BE_LB_HASH_MOD); + + if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (strcmp(args[1], "consistent") == 0) { /* use consistent hashing */ + curproxy->lbprm.algo |= BE_LB_HASH_CONS; + } + else if (strcmp(args[1], "map-based") == 0) { /* use map-based hashing */ + curproxy->lbprm.algo |= BE_LB_HASH_MAP; + } + else if (strcmp(args[1], "avalanche") == 0) { + ha_alert("parsing [%s:%d] : experimental feature '%s %s' is not supported anymore, please use '%s map-based sdbm avalanche' instead.\n", file, linenum, args[0], args[1], args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else { + ha_alert("parsing [%s:%d] : '%s' only supports 'consistent' and 'map-based'.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + /* set the hash function to use */ + if (!*args[2]) { + /* the default algo is sdbm */ + curproxy->lbprm.algo |= BE_LB_HFCN_SDBM; + + /* if consistent with no argument, then avalanche modifier is also applied */ + if ((curproxy->lbprm.algo & BE_LB_HASH_TYPE) == BE_LB_HASH_CONS) + curproxy->lbprm.algo |= BE_LB_HMOD_AVAL; + } else { + /* set the hash function */ + if (strcmp(args[2], "sdbm") == 0) { + curproxy->lbprm.algo |= BE_LB_HFCN_SDBM; + } + else if (strcmp(args[2], "djb2") == 0) { + curproxy->lbprm.algo |= BE_LB_HFCN_DJB2; + } + else if (strcmp(args[2], "wt6") == 0) { + curproxy->lbprm.algo |= BE_LB_HFCN_WT6; + } + else if (strcmp(args[2], "crc32") == 0) { + curproxy->lbprm.algo |= BE_LB_HFCN_CRC32; + } + else if (strcmp(args[2], "none") == 0) { + curproxy->lbprm.algo |= BE_LB_HFCN_NONE; + } + else { + ha_alert("parsing [%s:%d] : '%s' only supports 'sdbm', 'djb2', 'crc32', or 'wt6' hash functions.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + /* set the hash modifier */ + if (strcmp(args[3], "avalanche") == 0) { + curproxy->lbprm.algo |= BE_LB_HMOD_AVAL; + } + else if (*args[3]) { + ha_alert("parsing [%s:%d] : '%s' only supports 'avalanche' as a modifier for hash functions.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + } + else if (strcmp(args[0], "hash-balance-factor") == 0) { + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + curproxy->lbprm.hash_balance_factor = atol(args[1]); + if (curproxy->lbprm.hash_balance_factor != 0 && curproxy->lbprm.hash_balance_factor <= 100) { + ha_alert("parsing [%s:%d] : '%s' must be 0 or greater than 100.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "unique-id-format") == 0) { + if (!*(args[1])) { + ha_alert("parsing [%s:%d] : %s expects an argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (*(args[2])) { + ha_alert("parsing [%s:%d] : %s expects only one argument, don't forget to escape spaces!\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + free(curproxy->conf.uniqueid_format_string); + curproxy->conf.uniqueid_format_string = strdup(args[1]); + if (!curproxy->conf.uniqueid_format_string) + goto alloc_error; + + free(curproxy->conf.uif_file); + curproxy->conf.uif_file = strdup(curproxy->conf.args.file); + curproxy->conf.uif_line = curproxy->conf.args.line; + } + + else if (strcmp(args[0], "unique-id-header") == 0) { + char *copy; + if 
(!*(args[1])) { + ha_alert("parsing [%s:%d] : %s expects an argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + copy = strdup(args[1]); + if (copy == NULL) { + ha_alert("parsing [%s:%d] : failed to allocate memory for unique-id-header\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + istfree(&curproxy->header_unique_id); + curproxy->header_unique_id = ist(copy); + } + + else if (strcmp(args[0], "log-format") == 0) { + if (!*(args[1])) { + ha_alert("parsing [%s:%d] : %s expects an argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (*(args[2])) { + ha_alert("parsing [%s:%d] : %s expects only one argument, don't forget to escape spaces!\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (curproxy->conf.logformat_string && curproxy->cap & PR_CAP_DEF) { + char *oldlogformat = "log-format"; + + if (curproxy->conf.logformat_string == default_http_log_format) + oldlogformat = "option httplog"; + else if (curproxy->conf.logformat_string == default_tcp_log_format) + oldlogformat = "option tcplog"; + else if (curproxy->conf.logformat_string == clf_http_log_format) + oldlogformat = "option httplog clf"; + else if (curproxy->conf.logformat_string == default_https_log_format) + oldlogformat = "option httpslog"; + ha_warning("parsing [%s:%d]: 'log-format' overrides previous '%s' in 'defaults' section.\n", + file, linenum, oldlogformat); + } + if (curproxy->conf.logformat_string != default_http_log_format && + curproxy->conf.logformat_string != default_tcp_log_format && + curproxy->conf.logformat_string != clf_http_log_format && + curproxy->conf.logformat_string != default_https_log_format) + free(curproxy->conf.logformat_string); + curproxy->conf.logformat_string = strdup(args[1]); + if (!curproxy->conf.logformat_string) + goto alloc_error; + + free(curproxy->conf.lfs_file); + curproxy->conf.lfs_file = strdup(curproxy->conf.args.file); + curproxy->conf.lfs_line = curproxy->conf.args.line; + + /* get a chance to improve log-format error reporting by + * reporting the correct line-number when possible. + */ + if (!(curproxy->cap & PR_CAP_DEF) && !(curproxy->cap & PR_CAP_FE)) { + ha_warning("parsing [%s:%d] : backend '%s' : 'log-format' directive is ignored in backends.\n", + file, linenum, curproxy->id); + err_code |= ERR_WARN; + } + } + else if (strcmp(args[0], "log-format-sd") == 0) { + if (!*(args[1])) { + ha_alert("parsing [%s:%d] : %s expects an argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (*(args[2])) { + ha_alert("parsing [%s:%d] : %s expects only one argument, don't forget to escape spaces!\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (curproxy->conf.logformat_sd_string != default_rfc5424_sd_log_format) + free(curproxy->conf.logformat_sd_string); + curproxy->conf.logformat_sd_string = strdup(args[1]); + if (!curproxy->conf.logformat_sd_string) + goto alloc_error; + + free(curproxy->conf.lfsd_file); + curproxy->conf.lfsd_file = strdup(curproxy->conf.args.file); + curproxy->conf.lfsd_line = curproxy->conf.args.line; + + /* get a chance to improve log-format-sd error reporting by + * reporting the correct line-number when possible. 
+ */ + if (!(curproxy->cap & PR_CAP_DEF) && !(curproxy->cap & PR_CAP_FE)) { + ha_warning("parsing [%s:%d] : backend '%s' : 'log-format-sd' directive is ignored in backends.\n", + file, linenum, curproxy->id); + err_code |= ERR_WARN; + } + } + else if (strcmp(args[0], "error-log-format") == 0) { + if (!*(args[1])) { + ha_alert("parsing [%s:%d] : %s expects an argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (*(args[2])) { + ha_alert("parsing [%s:%d] : %s expects only one argument, don't forget to escape spaces!\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (curproxy->conf.error_logformat_string && curproxy->cap & PR_CAP_DEF) { + ha_warning("parsing [%s:%d]: 'error-log-format' overrides previous 'error-log-format' in 'defaults' section.\n", + file, linenum); + } + free(curproxy->conf.error_logformat_string); + curproxy->conf.error_logformat_string = strdup(args[1]); + if (!curproxy->conf.error_logformat_string) + goto alloc_error; + + free(curproxy->conf.elfs_file); + curproxy->conf.elfs_file = strdup(curproxy->conf.args.file); + curproxy->conf.elfs_line = curproxy->conf.args.line; + + /* get a chance to improve log-format error reporting by + * reporting the correct line-number when possible. + */ + if (!(curproxy->cap & PR_CAP_DEF) && !(curproxy->cap & PR_CAP_FE)) { + ha_warning("parsing [%s:%d] : backend '%s' : 'error-log-format' directive is ignored in backends.\n", + file, linenum, curproxy->id); + err_code |= ERR_WARN; + } + } + else if (strcmp(args[0], "log-tag") == 0) { /* tag to report to syslog */ + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects a tag for use in syslog.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + chunk_destroy(&curproxy->log_tag); + chunk_initlen(&curproxy->log_tag, strdup(args[1]), strlen(args[1]), strlen(args[1])); + if (b_orig(&curproxy->log_tag) == NULL) { + chunk_destroy(&curproxy->log_tag); + ha_alert("parsing [%s:%d]: cannot allocate memory for '%s'.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "log") == 0) { /* "no log" or "log ..." 
*/ + if (!parse_logger(args, &curproxy->loggers, (kwm == KWM_NO), file, linenum, &errmsg)) { + ha_alert("parsing [%s:%d] : %s : %s\n", file, linenum, args[0], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "source") == 0) { /* address to which we bind when connecting */ + int cur_arg; + int port1, port2; + struct sockaddr_storage *sk; + + if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (!*args[1]) { + ha_alert("parsing [%s:%d] : '%s' expects <addr>[:<port>], and optionally '%s' <addr>, and '%s' <name>.\n", + file, linenum, "source", "usesrc", "interface"); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + /* we must first clear any optional default setting */ + curproxy->conn_src.opts &= ~CO_SRC_TPROXY_MASK; + ha_free(&curproxy->conn_src.iface_name); + curproxy->conn_src.iface_len = 0; + + sk = str2sa_range(args[1], NULL, &port1, &port2, NULL, NULL, NULL, + &errmsg, NULL, NULL, PA_O_RESOLVE | PA_O_PORT_OK | PA_O_STREAM | PA_O_CONNECT); + if (!sk) { + ha_alert("parsing [%s:%d] : '%s %s' : %s\n", + file, linenum, args[0], args[1], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + curproxy->conn_src.source_addr = *sk; + curproxy->conn_src.opts |= CO_SRC_BIND; + + cur_arg = 2; + while (*(args[cur_arg])) { + if (strcmp(args[cur_arg], "usesrc") == 0) { /* address to use outside */ +#if defined(CONFIG_HAP_TRANSPARENT) + if (!*args[cur_arg + 1]) { + ha_alert("parsing [%s:%d] : '%s' expects <addr>[:<port>], 'client', or 'clientip' as argument.\n", + file, linenum, "usesrc"); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (strcmp(args[cur_arg + 1], "client") == 0) { + curproxy->conn_src.opts &= ~CO_SRC_TPROXY_MASK; + curproxy->conn_src.opts |= CO_SRC_TPROXY_CLI; + } else if (strcmp(args[cur_arg + 1], "clientip") == 0) { + curproxy->conn_src.opts &= ~CO_SRC_TPROXY_MASK; + curproxy->conn_src.opts |= CO_SRC_TPROXY_CIP; + } else if (!strncmp(args[cur_arg + 1], "hdr_ip(", 7)) { + char *name, *end; + + name = args[cur_arg+1] + 7; + while (isspace((unsigned char)*name)) + name++; + + end = name; + while (*end && !isspace((unsigned char)*end) && *end != ',' && *end != ')') + end++; + + curproxy->conn_src.opts &= ~CO_SRC_TPROXY_MASK; + curproxy->conn_src.opts |= CO_SRC_TPROXY_DYN; + free(curproxy->conn_src.bind_hdr_name); + curproxy->conn_src.bind_hdr_name = calloc(1, end - name + 1); + if (!curproxy->conn_src.bind_hdr_name) + goto alloc_error; + curproxy->conn_src.bind_hdr_len = end - name; + memcpy(curproxy->conn_src.bind_hdr_name, name, end - name); + curproxy->conn_src.bind_hdr_name[end-name] = '\0'; + curproxy->conn_src.bind_hdr_occ = -1; + + /* now look for an occurrence number */ + while (isspace((unsigned char)*end)) + end++; + if (*end == ',') { + end++; + name = end; + if (*end == '-') + end++; + while (isdigit((unsigned char)*end)) + end++; + curproxy->conn_src.bind_hdr_occ = strl2ic(name, end-name); + } + + if (curproxy->conn_src.bind_hdr_occ < -MAX_HDR_HISTORY) { + ha_alert("parsing [%s:%d] : usesrc hdr_ip(name,num) does not support negative" + " occurrences values smaller than %d.\n", + file, linenum, MAX_HDR_HISTORY); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } else { + struct sockaddr_storage *sk; + + sk = str2sa_range(args[cur_arg + 1], NULL, &port1, &port2, NULL, NULL, NULL, + &errmsg, NULL, NULL, PA_O_RESOLVE | PA_O_PORT_OK | PA_O_STREAM | PA_O_CONNECT); + if (!sk) { + ha_alert("parsing [%s:%d] : '%s %s' : %s\n", + file, linenum, args[cur_arg], 
args[cur_arg+1], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + curproxy->conn_src.tproxy_addr = *sk; + curproxy->conn_src.opts |= CO_SRC_TPROXY_ADDR; + } + global.last_checks |= LSTCHK_NETADM; +#else /* no TPROXY support */ + ha_alert("parsing [%s:%d] : '%s' not allowed here because support for TPROXY was not compiled in.\n", + file, linenum, "usesrc"); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +#endif + cur_arg += 2; + continue; + } + + if (strcmp(args[cur_arg], "interface") == 0) { /* specifically bind to this interface */ +#ifdef SO_BINDTODEVICE + if (!*args[cur_arg + 1]) { + ha_alert("parsing [%s:%d] : '%s' : missing interface name.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + free(curproxy->conn_src.iface_name); + curproxy->conn_src.iface_name = strdup(args[cur_arg + 1]); + if (!curproxy->conn_src.iface_name) + goto alloc_error; + curproxy->conn_src.iface_len = strlen(curproxy->conn_src.iface_name); + global.last_checks |= LSTCHK_NETADM; +#else + ha_alert("parsing [%s:%d] : '%s' : '%s' option not implemented.\n", + file, linenum, args[0], args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +#endif + cur_arg += 2; + continue; + } + ha_alert("parsing [%s:%d] : '%s' only supports optional keywords '%s' and '%s'.\n", + file, linenum, args[0], "interface", "usesrc"); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "usesrc") == 0) { /* address to use outside: needs "source" first */ + ha_alert("parsing [%s:%d] : '%s' only allowed after a '%s' statement.\n", + file, linenum, "usesrc", "source"); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "cliexp") == 0 || strcmp(args[0], "reqrep") == 0) { /* replace request header from a regex */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. " + "Use 'http-request replace-path', 'http-request replace-uri' or 'http-request replace-header' instead.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "reqdel") == 0) { /* delete request header from a regex */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. " + "Use 'http-request del-header' instead.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "reqdeny") == 0) { /* deny a request if a header matches this regex */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. " + "Use 'http-request deny' instead.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "reqpass") == 0) { /* pass this header without allowing or denying the request */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "reqallow") == 0) { /* allow a request if a header matches this regex */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. " + "Use 'http-request allow' instead.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "reqtarpit") == 0) { /* tarpit a request if a header matches this regex */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. 
" + "Use 'http-request tarpit' instead.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "reqirep") == 0) { /* replace request header from a regex, ignoring case */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. " + "Use 'http-request replace-header' instead.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "reqidel") == 0) { /* delete request header from a regex ignoring case */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. " + "Use 'http-request del-header' instead.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "reqideny") == 0) { /* deny a request if a header matches this regex ignoring case */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. " + "Use 'http-request deny' instead.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "reqipass") == 0) { /* pass this header without allowing or denying the request */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "reqiallow") == 0) { /* allow a request if a header matches this regex ignoring case */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. " + "Use 'http-request allow' instead.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "reqitarpit") == 0) { /* tarpit a request if a header matches this regex ignoring case */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. " + "Use 'http-request tarpit' instead.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "reqadd") == 0) { /* add request header */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. " + "Use 'http-request add-header' instead.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "srvexp") == 0 || strcmp(args[0], "rsprep") == 0) { /* replace response header from a regex */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. " + "Use 'http-response replace-header' instead.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "rspdel") == 0) { /* delete response header from a regex */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. " + "Use 'http-response del-header' .\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "rspdeny") == 0) { /* block response header from a regex */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. " + "Use 'http-response deny' instead.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "rspirep") == 0) { /* replace response header from a regex ignoring case */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. 
" + "Use 'http-response replace-header' instead.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "rspidel") == 0) { /* delete response header from a regex ignoring case */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. " + "Use 'http-response del-header' instead.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "rspideny") == 0) { /* block response header from a regex ignoring case */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. " + "Use 'http-response deny' instead.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "rspadd") == 0) { /* add response header */ + ha_alert("parsing [%s:%d] : The '%s' directive is not supported anymore since HAProxy 2.1. " + "Use 'http-response add-header' instead.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else { + struct cfg_kw_list *kwl; + const char *best; + int index; + + list_for_each_entry(kwl, &cfg_keywords.list, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) { + if (kwl->kw[index].section != CFG_LISTEN) + continue; + if (strcmp(kwl->kw[index].kw, args[0]) == 0) { + if (check_kw_experimental(&kwl->kw[index], file, linenum, &errmsg)) { + ha_alert("%s\n", errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + /* prepare error message just in case */ + rc = kwl->kw[index].parse(args, CFG_LISTEN, curproxy, curr_defproxy, file, linenum, &errmsg); + if (rc < 0) { + ha_alert("parsing [%s:%d] : %s\n", file, linenum, errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (rc > 0) { + ha_warning("parsing [%s:%d] : %s\n", file, linenum, errmsg); + err_code |= ERR_WARN; + goto out; + } + goto out; + } + } + } + + best = cfg_find_best_match(args[0], &cfg_keywords.list, CFG_LISTEN, common_kw_list); + if (best) + ha_alert("parsing [%s:%d] : unknown keyword '%s' in '%s' section; did you mean '%s' maybe ?\n", file, linenum, args[0], cursection, best); + else + ha_alert("parsing [%s:%d] : unknown keyword '%s' in '%s' section\n", file, linenum, args[0], cursection); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + out: + free(errmsg); + return err_code; + + alloc_error: + ha_alert("parsing [%s:%d]: out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; +} diff --git a/src/cfgparse-quic.c b/src/cfgparse-quic.c new file mode 100644 index 0000000..3b38efa --- /dev/null +++ b/src/cfgparse-quic.c @@ -0,0 +1,292 @@ +#include <errno.h> +#include <string.h> + +#include <haproxy/api.h> +#include <haproxy/cfgparse.h> +#include <haproxy/errors.h> +#include <haproxy/global.h> +#include <haproxy/listener.h> +#include <haproxy/proxy-t.h> +#include <haproxy/quic_cc-t.h> +#include <haproxy/tools.h> + +#define QUIC_CC_NEWRENO_STR "newreno" +#define QUIC_CC_CUBIC_STR "cubic" +#define QUIC_CC_NO_CC_STR "nocc" + +static int bind_parse_quic_force_retry(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + conf->options |= BC_O_QUIC_FORCE_RETRY; + return 0; +} + +/* parse "quic-cc-algo" bind keyword */ +static int bind_parse_quic_cc_algo(char **args, int cur_arg, struct proxy *px, + struct bind_conf *conf, char **err) +{ + struct quic_cc_algo *cc_algo; + const char *algo = NULL; + char *arg; + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing control congestion 
algorithm", args[cur_arg]); + goto fail; + } + + arg = args[cur_arg + 1]; + if (strncmp(arg, QUIC_CC_NEWRENO_STR, strlen(QUIC_CC_NEWRENO_STR)) == 0) { + /* newreno */ + algo = QUIC_CC_NEWRENO_STR; + cc_algo = &quic_cc_algo_nr; + arg += strlen(QUIC_CC_NEWRENO_STR); + } + else if (strncmp(arg, QUIC_CC_CUBIC_STR, strlen(QUIC_CC_CUBIC_STR)) == 0) { + /* cubic */ + algo = QUIC_CC_CUBIC_STR; + cc_algo = &quic_cc_algo_cubic; + arg += strlen(QUIC_CC_CUBIC_STR); + } + else if (strncmp(arg, QUIC_CC_NO_CC_STR, strlen(QUIC_CC_NO_CC_STR)) == 0) { + /* nocc */ + if (!experimental_directives_allowed) { + ha_alert("'%s' algo is experimental, must be allowed via a global " + "'expose-experimental-directives'\n", arg); + goto fail; + } + + algo = QUIC_CC_NO_CC_STR; + cc_algo = &quic_cc_algo_nocc; + arg += strlen(QUIC_CC_NO_CC_STR); + } + else { + memprintf(err, "'%s' : unknown control congestion algorithm", args[cur_arg + 1]); + goto fail; + } + + if (*arg++ == '(') { + unsigned long cwnd; + char *end_opt; + + errno = 0; + cwnd = strtoul(arg, &end_opt, 0); + if (end_opt == arg || errno != 0) { + memprintf(err, "'%s' : could not parse congestion window value", args[cur_arg + 1]); + goto fail; + } + + if (*end_opt == 'k') { + cwnd <<= 10; + end_opt++; + } + else if (*end_opt == 'm') { + cwnd <<= 20; + end_opt++; + } + else if (*end_opt == 'g') { + cwnd <<= 30; + end_opt++; + } + + if (*end_opt != ')') { + memprintf(err, "'%s' : expects %s(<max window>)", args[cur_arg + 1], algo); + goto fail; + } + + if (cwnd < 10240 || cwnd > (4UL << 30)) { + memprintf(err, "'%s' : should be greater than 10k and smaller than 4g", args[cur_arg + 1]); + goto fail; + } + + conf->max_cwnd = cwnd; + } + + conf->quic_cc_algo = cc_algo; + return 0; + + fail: + return ERR_ALERT | ERR_FATAL; +} + +static int bind_parse_quic_socket(char **args, int cur_arg, struct proxy *px, + struct bind_conf *conf, char **err) +{ + char *arg; + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing argument, use either connection or listener.", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + arg = args[cur_arg + 1]; + if (strcmp(arg, "connection") == 0) { + conf->quic_mode = QUIC_SOCK_MODE_CONN; + } + else if (strcmp(arg, "listener") == 0) { + conf->quic_mode = QUIC_SOCK_MODE_LSTNR; + } + else { + memprintf(err, "'%s' : unknown argument, use either connection or listener.", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +static struct bind_kw_list bind_kws = { "QUIC", { }, { + { "quic-force-retry", bind_parse_quic_force_retry, 0 }, + { "quic-cc-algo", bind_parse_quic_cc_algo, 1 }, + { "quic-socket", bind_parse_quic_socket, 1 }, + { NULL, NULL, 0 }, +}}; + +INITCALL1(STG_REGISTER, bind_register_keywords, &bind_kws); + +/* parse "tune.quic.socket-owner", accepts "listener" or "connection" */ +static int cfg_parse_quic_tune_socket_owner(char **args, int section_type, + struct proxy *curpx, + const struct proxy *defpx, + const char *file, int line, char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (strcmp(args[1], "connection") == 0) { + global.tune.options |= GTUNE_QUIC_SOCK_PER_CONN; + } + else if (strcmp(args[1], "listener") == 0) { + global.tune.options &= ~GTUNE_QUIC_SOCK_PER_CONN; + } + else { + memprintf(err, "'%s' expects either 'listener' or 'connection' but got '%s'.", args[0], args[1]); + return -1; + } + + return 0; +} + +/* Must be used to parse tune.quic.* setting which requires a time + * as value. + * Return -1 on alert, or 0 if succeeded. 
+ */
+static int cfg_parse_quic_time(char **args, int section_type,
+                               struct proxy *curpx,
+                               const struct proxy *defpx,
+                               const char *file, int line, char **err)
+{
+	unsigned int time;
+	const char *res, *name, *value;
+	int prefix_len = strlen("tune.quic.");
+
+	if (too_many_args(1, args, err, NULL))
+		return -1;
+
+	name = args[0];
+	value = args[1];
+	res = parse_time_err(value, &time, TIME_UNIT_MS);
+	if (res == PARSE_TIME_OVER) {
+		memprintf(err, "timer overflow in argument '%s' to '%s' "
+			  "(maximum value is 2147483647 ms or ~24.8 days)", value, name);
+		return -1;
+	}
+	else if (res == PARSE_TIME_UNDER) {
+		memprintf(err, "timer underflow in argument '%s' to '%s' "
+			  "(minimum non-null value is 1 ms)", value, name);
+		return -1;
+	}
+	else if (res) {
+		memprintf(err, "unexpected character '%c' in '%s'", *res, name);
+		return -1;
+	}
+
+	if (strcmp(name + prefix_len, "frontend.max-idle-timeout") == 0)
+		global.tune.quic_frontend_max_idle_timeout = time;
+	else if (strcmp(name + prefix_len, "backend.max-idle-timeout") == 0)
+		global.tune.quic_backend_max_idle_timeout = time;
+	else {
+		memprintf(err, "'%s' keyword not handled (please report this bug).", args[0]);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Parse any tune.quic.* setting with strictly positive integer values.
+ * Return -1 on alert, or 0 if succeeded.
+ */
+static int cfg_parse_quic_tune_setting(char **args, int section_type,
+                                       struct proxy *curpx,
+                                       const struct proxy *defpx,
+                                       const char *file, int line, char **err)
+{
+	unsigned int arg = 0;
+	int prefix_len = strlen("tune.quic.");
+	const char *suffix;
+
+	if (too_many_args(1, args, err, NULL))
+		return -1;
+
+	if (*(args[1]) != 0)
+		arg = atoi(args[1]);
+
+	if (arg < 1) {
+		memprintf(err, "'%s' expects a positive integer.", args[0]);
+		return -1;
+	}
+
+	suffix = args[0] + prefix_len;
+	if (strcmp(suffix, "frontend.conn-tx-buffers.limit") == 0)
+		global.tune.quic_streams_buf = arg;
+	else if (strcmp(suffix, "frontend.max-streams-bidi") == 0)
+		global.tune.quic_frontend_max_streams_bidi = arg;
+	else if (strcmp(suffix, "max-frame-loss") == 0)
+		global.tune.quic_max_frame_loss = arg;
+	else if (strcmp(suffix, "reorder-ratio") == 0) {
+		if (arg > 100) {
+			memprintf(err, "'%s' expects an integer argument between 0 and 100.", args[0]);
+			return -1;
+		}
+
+		global.tune.quic_reorder_ratio = arg;
+	}
+	else if (strcmp(suffix, "retry-threshold") == 0)
+		global.tune.quic_retry_threshold = arg;
+	else {
+		memprintf(err, "'%s' keyword not handled (please report this bug).", args[0]);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* config parser for global "tune.quic.zero-copy-fwd-send" */
+static int cfg_parse_quic_zero_copy_fwd_snd(char **args, int section_type, struct proxy *curpx,
+                                            const struct proxy *defpx, const char *file, int line,
+                                            char **err)
+{
+	if (too_many_args(1, args, err, NULL))
+		return -1;
+
+	if (strcmp(args[1], "on") == 0)
+		global.tune.no_zero_copy_fwd &= ~NO_ZERO_COPY_FWD_QUIC_SND;
+	else if (strcmp(args[1], "off") == 0)
+		global.tune.no_zero_copy_fwd |= NO_ZERO_COPY_FWD_QUIC_SND;
+	else {
+		memprintf(err, "'%s' expects 'on' or 'off'.", args[0]);
+		return -1;
+	}
+	return 0;
+}
+
+static struct cfg_kw_list cfg_kws = {ILH, {
+	{ CFG_GLOBAL, "tune.quic.socket-owner", cfg_parse_quic_tune_socket_owner },
+	{ CFG_GLOBAL, "tune.quic.backend.max-idle-timeout", cfg_parse_quic_time },
+	{ CFG_GLOBAL, "tune.quic.frontend.conn-tx-buffers.limit", cfg_parse_quic_tune_setting },
+	{ CFG_GLOBAL, "tune.quic.frontend.max-streams-bidi", cfg_parse_quic_tune_setting },
+	{
CFG_GLOBAL, "tune.quic.frontend.max-idle-timeout", cfg_parse_quic_time }, + { CFG_GLOBAL, "tune.quic.max-frame-loss", cfg_parse_quic_tune_setting }, + { CFG_GLOBAL, "tune.quic.reorder-ratio", cfg_parse_quic_tune_setting }, + { CFG_GLOBAL, "tune.quic.retry-threshold", cfg_parse_quic_tune_setting }, + { CFG_GLOBAL, "tune.quic.zero-copy-fwd-send", cfg_parse_quic_zero_copy_fwd_snd }, + { 0, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); diff --git a/src/cfgparse-ssl.c b/src/cfgparse-ssl.c new file mode 100644 index 0000000..5666336 --- /dev/null +++ b/src/cfgparse-ssl.c @@ -0,0 +1,2382 @@ +/* + * + * Copyright (C) 2012 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr> + * Copyright (C) 2020 HAProxy Technologies, William Lallemand <wlallemand@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * + * Configuration parsing for SSL. + * This file is split in 3 parts: + * - global section parsing + * - bind keyword parsing + * - server keyword parsing + * + * Please insert the new keywords at the right place + */ + +#define _GNU_SOURCE +#include <ctype.h> +#include <dirent.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <sys/stat.h> +#include <sys/types.h> + +#include <haproxy/api.h> +#include <haproxy/base64.h> +#include <haproxy/cfgparse.h> +#include <haproxy/errors.h> +#include <haproxy/listener.h> +#include <haproxy/openssl-compat.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/ssl_utils.h> +#include <haproxy/tools.h> +#include <haproxy/ssl_ckch.h> +#include <haproxy/ssl_ocsp.h> + + +/****************** Global Section Parsing ********************************************/ + +static int ssl_load_global_issuers_from_path(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + char *path; + struct dirent **de_list; + int i, n; + struct stat buf; + char *end; + char fp[MAXPATHLEN+1]; + + if (too_many_args(1, args, err, NULL)) + return -1; + + path = args[1]; + if (*path == 0 || stat(path, &buf)) { + memprintf(err, "%sglobal statement '%s' expects a directory path as an argument.\n", + err && *err ? *err : "", args[0]); + return -1; + } + if (S_ISDIR(buf.st_mode) == 0) { + memprintf(err, "%sglobal statement '%s': %s is not a directory.\n", + err && *err ? *err : "", args[0], path); + return -1; + } + + /* strip trailing slashes, including first one */ + for (end = path + strlen(path) - 1; end >= path && *end == '/'; end--) + *end = 0; + /* path already parsed? */ + if (global_ssl.issuers_chain_path && strcmp(global_ssl.issuers_chain_path, path) == 0) + return 0; + /* overwrite old issuers_chain_path */ + free(global_ssl.issuers_chain_path); + global_ssl.issuers_chain_path = strdup(path); + ssl_free_global_issuers(); + + n = scandir(path, &de_list, 0, alphasort); + if (n < 0) { + memprintf(err, "%sglobal statement '%s': unable to scan directory '%s' : %s.\n", + err && *err ? 
*err : "", args[0], path, strerror(errno)); + return -1; + } + for (i = 0; i < n; i++) { + struct dirent *de = de_list[i]; + BIO *in = NULL; + char *warn = NULL; + + snprintf(fp, sizeof(fp), "%s/%s", path, de->d_name); + free(de); + if (stat(fp, &buf) != 0) { + ha_warning("unable to stat certificate from file '%s' : %s.\n", fp, strerror(errno)); + goto next; + } + if (!S_ISREG(buf.st_mode)) + goto next; + + in = BIO_new(BIO_s_file()); + if (in == NULL) + goto next; + if (BIO_read_filename(in, fp) <= 0) + goto next; + ssl_load_global_issuer_from_BIO(in, fp, &warn); + if (warn) { + ha_warning("%s", warn); + ha_free(&warn); + } + next: + if (in) + BIO_free(in); + } + free(de_list); + + return 0; +} + +/* parse the "ssl-mode-async" keyword in global section. + * Returns <0 on alert, >0 on warning, 0 on success. + */ +static int ssl_parse_global_ssl_async(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ +#ifdef SSL_MODE_ASYNC + global_ssl.async = 1; + global.ssl_used_async_engines = nb_engines; + return 0; +#else + memprintf(err, "'%s': openssl library does not support async mode", args[0]); + return -1; +#endif +} + +#if defined(USE_ENGINE) && !defined(OPENSSL_NO_ENGINE) +/* parse the "ssl-engine" keyword in global section. + * Returns <0 on alert, >0 on warning, 0 on success. + */ +static int ssl_parse_global_ssl_engine(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + char *algo; + int ret = -1; + + if (*(args[1]) == 0) { + memprintf(err, "global statement '%s' expects a valid engine name as an argument.", args[0]); + return ret; + } + + if (*(args[2]) == 0) { + /* if no list of algorithms is given, it defaults to ALL */ + algo = strdup("ALL"); + goto add_engine; + } + + /* otherwise the expected format is ssl-engine <engine_name> algo <list of algo> */ + if (strcmp(args[2], "algo") != 0) { + memprintf(err, "global statement '%s' expects to have algo keyword.", args[0]); + return ret; + } + + if (*(args[3]) == 0) { + memprintf(err, "global statement '%s' expects algorithm names as an argument.", args[0]); + return ret; + } + algo = strdup(args[3]); + +add_engine: + if (ssl_init_single_engine(args[1], algo)==0) { + openssl_engines_initialized++; + ret = 0; + } + free(algo); + return ret; +} +#endif + +#ifdef HAVE_SSL_PROVIDERS +/* parse the "ssl-propquery" keyword in global section. + * Returns <0 on alert, >0 on warning, 0 on success. + */ +static int ssl_parse_global_ssl_propquery(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + int ret = -1; + + if (*(args[1]) == 0) { + memprintf(err, "global statement '%s' expects a property string as an argument.", args[0]); + return ret; + } + + if (EVP_set_default_properties(NULL, args[1])) + ret = 0; + + return ret; +} + +/* parse the "ssl-provider" keyword in global section. + * Returns <0 on alert, >0 on warning, 0 on success. + */ +static int ssl_parse_global_ssl_provider(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + int ret = -1; + + if (*(args[1]) == 0) { + memprintf(err, "global statement '%s' expects a valid engine provider name as an argument.", args[0]); + return ret; + } + + if (ssl_init_provider(args[1]) == 0) + ret = 0; + + return ret; +} + +/* parse the "ssl-provider-path" keyword in global section. 
+ * Returns <0 on alert, >0 on warning, 0 on success.
+ */
+static int ssl_parse_global_ssl_provider_path(char **args, int section_type, struct proxy *curpx,
+                                              const struct proxy *defpx, const char *file, int line,
+                                              char **err)
+{
+	if (*(args[1]) == 0) {
+		memprintf(err, "global statement '%s' expects a directory path as an argument.", args[0]);
+		return -1;
+	}
+
+	OSSL_PROVIDER_set_default_search_path(NULL, args[1]);
+
+	return 0;
+}
+#endif
+
+/* parse the "ssl-default-bind-ciphers" / "ssl-default-server-ciphers" keywords
+ * in global section. Returns <0 on alert, >0 on warning, 0 on success.
+ */
+static int ssl_parse_global_ciphers(char **args, int section_type, struct proxy *curpx,
+                                    const struct proxy *defpx, const char *file, int line,
+                                    char **err)
+{
+	char **target;
+
+	target = (args[0][12] == 'b') ? &global_ssl.listen_default_ciphers : &global_ssl.connect_default_ciphers;
+
+	if (too_many_args(1, args, err, NULL))
+		return -1;
+
+	if (*(args[1]) == 0) {
+		memprintf(err, "global statement '%s' expects a cipher suite as an argument.", args[0]);
+		return -1;
+	}
+
+	free(*target);
+	*target = strdup(args[1]);
+	return 0;
+}
+
+/* parse the "ssl-default-bind-ciphersuites" / "ssl-default-server-ciphersuites" keywords
+ * in global section. Returns <0 on alert, >0 on warning, 0 on success.
+ */
+static int ssl_parse_global_ciphersuites(char **args, int section_type, struct proxy *curpx,
+                                         const struct proxy *defpx, const char *file, int line,
+                                         char **err)
+{
+#ifdef HAVE_SSL_CTX_SET_CIPHERSUITES
+	char **target;
+
+	target = (args[0][12] == 'b') ? &global_ssl.listen_default_ciphersuites : &global_ssl.connect_default_ciphersuites;
+
+	if (too_many_args(1, args, err, NULL))
+		return -1;
+
+	if (*(args[1]) == 0) {
+		memprintf(err, "global statement '%s' expects a cipher suite as an argument.", args[0]);
+		return -1;
+	}
+
+	free(*target);
+	*target = strdup(args[1]);
+	return 0;
+#else /* ! HAVE_SSL_CTX_SET_CIPHERSUITES */
+	memprintf(err, "'%s' not supported for your SSL library (%s).", args[0], OPENSSL_VERSION_TEXT);
+	return -1;
+
+#endif
+}
+
+#if defined(SSL_CTX_set1_curves_list)
+/*
+ * parse the "ssl-default-bind-curves" keyword in a global section.
+ * Returns <0 on alert, >0 on warning, 0 on success.
+ */
+static int ssl_parse_global_curves(char **args, int section_type, struct proxy *curpx,
+                                   const struct proxy *defpx, const char *file, int line,
+                                   char **err)
+{
+	char **target;
+	target = (args[0][12] == 'b') ? &global_ssl.listen_default_curves : &global_ssl.connect_default_curves;
+
+	if (too_many_args(1, args, err, NULL))
+		return -1;
+
+	if (*(args[1]) == 0) {
+		memprintf(err, "global statement '%s' expects a curves list as an argument.", args[0]);
+		return -1;
+	}
+
+	free(*target);
+	*target = strdup(args[1]);
+	return 0;
+}
+#endif
+
+#if defined(SSL_CTX_set1_sigalgs_list)
+/*
+ * parse the "ssl-default-bind-sigalgs" and "ssl-default-server-sigalgs" keywords in a global section.
+ * Returns <0 on alert, >0 on warning, 0 on success.
+ */
+static int ssl_parse_global_sigalgs(char **args, int section_type, struct proxy *curpx,
+                                    const struct proxy *defpx, const char *file, int line,
+                                    char **err)
+{
+	char **target;
+
+	target = (args[0][12] == 'b') ? &global_ssl.listen_default_sigalgs : &global_ssl.connect_default_sigalgs;
+
+	if (too_many_args(1, args, err, NULL))
+		return -1;
+
+	if (*(args[1]) == 0) {
+		memprintf(err, "global statement '%s' expects a signature algorithms list as an argument.", args[0]);
+		return -1;
+	}
+
+	free(*target);
+	*target = strdup(args[1]);
+	return 0;
+}
+#endif
+
+#if defined(SSL_CTX_set1_client_sigalgs_list)
+/*
+ * parse the "ssl-default-bind-client-sigalgs" keyword in a global section.
+ * Returns <0 on alert, >0 on warning, 0 on success.
+ */
+static int ssl_parse_global_client_sigalgs(char **args, int section_type, struct proxy *curpx,
+                                           const struct proxy *defpx, const char *file, int line,
+                                           char **err)
+{
+	char **target;
+
+	target = (args[0][12] == 'b') ? &global_ssl.listen_default_client_sigalgs : &global_ssl.connect_default_client_sigalgs;
+
+	if (too_many_args(1, args, err, NULL))
+		return -1;
+
+	if (*(args[1]) == 0) {
+		memprintf(err, "global statement '%s' expects signature algorithms as an argument.", args[0]);
+		return -1;
+	}
+
+	free(*target);
+	*target = strdup(args[1]);
+	return 0;
+}
+#endif
+
+/* parse various global tune.ssl settings consisting in positive integers.
+ * Returns <0 on alert, >0 on warning, 0 on success.
+ */
+static int ssl_parse_global_int(char **args, int section_type, struct proxy *curpx,
+                                const struct proxy *defpx, const char *file, int line,
+                                char **err)
+{
+	int *target;
+
+	if (strcmp(args[0], "tune.ssl.cachesize") == 0)
+		target = &global.tune.sslcachesize;
+	else if (strcmp(args[0], "tune.ssl.maxrecord") == 0)
+		target = (int *)&global_ssl.max_record;
+	else if (strcmp(args[0], "tune.ssl.hard-maxrecord") == 0)
+		target = (int *)&global_ssl.hard_max_record;
+	else if (strcmp(args[0], "tune.ssl.ssl-ctx-cache-size") == 0)
+		target = &global_ssl.ctx_cache;
+	else if (strcmp(args[0], "maxsslconn") == 0)
+		target = &global.maxsslconn;
+	else if (strcmp(args[0], "tune.ssl.capture-buffer-size") == 0)
+		target = &global_ssl.capture_buffer_size;
+	else if (strcmp(args[0], "tune.ssl.capture-cipherlist-size") == 0) {
+		target = &global_ssl.capture_buffer_size;
+		ha_warning("parsing [%s:%d]: '%s' is deprecated and will be removed in version 2.7. "
+			   "Please use 'tune.ssl.capture-buffer-size' instead.\n",
+			   file, line, args[0]);
+	}
+	else {
+		memprintf(err, "'%s' keyword not handled (please report this bug).", args[0]);
+		return -1;
+	}
+
+	if (too_many_args(1, args, err, NULL))
+		return -1;
+
+	if (*(args[1]) == 0) {
+		memprintf(err, "'%s' expects an integer argument.", args[0]);
+		return -1;
+	}
+
+	*target = atoi(args[1]);
+	if (*target < 0) {
+		memprintf(err, "'%s' expects a positive numeric value.", args[0]);
+		return -1;
+	}
+	return 0;
+}
+
+static int ssl_parse_global_capture_buffer(char **args, int section_type, struct proxy *curpx,
+                                           const struct proxy *defpx, const char *file, int line,
+                                           char **err)
+{
+	int ret;
+
+	ret = ssl_parse_global_int(args, section_type, curpx, defpx, file, line, err);
+	if (ret != 0)
+		return ret;
+
+	if (pool_head_ssl_capture) {
+		memprintf(err, "'%s' is already configured.", args[0]);
+		return -1;
+	}
+
+	pool_head_ssl_capture = create_pool("ssl-capture", sizeof(struct ssl_capture) + global_ssl.capture_buffer_size, MEM_F_SHARED);
+	if (!pool_head_ssl_capture) {
+		memprintf(err, "Out of memory error.");
+		return -1;
+	}
+	return 0;
+}
+
+/* init the SSLKEYLOGFILE pool */
+#ifdef HAVE_SSL_KEYLOG
+static int ssl_parse_global_keylog(char **args, int section_type, struct proxy *curpx,
+                                   const struct proxy *defpx, const char *file, int line,
+                                   char **err)
+{
+
+	if (too_many_args(1, args, err, NULL))
+		return -1;
+
+	if (strcmp(args[1], "on") == 0)
+		global_ssl.keylog = 1;
+	else if (strcmp(args[1], "off") == 0)
+		global_ssl.keylog = 0;
+	else {
+		memprintf(err, "'%s' expects either 'on' or 'off' but got '%s'.", args[0], args[1]);
+		return -1;
+	}
+
+	if (pool_head_ssl_keylog) /* already configured */
+		return 0;
+
+	pool_head_ssl_keylog = create_pool("ssl-keylogfile", sizeof(struct ssl_keylog), MEM_F_SHARED);
+	if (!pool_head_ssl_keylog) {
+		memprintf(err, "Out of memory error.");
+		return -1;
+	}
+
+	pool_head_ssl_keylog_str = create_pool("ssl-keylogfile-str", sizeof(char) * SSL_KEYLOG_MAX_SECRET_SIZE, MEM_F_SHARED);
+	if (!pool_head_ssl_keylog_str) {
+		memprintf(err, "Out of memory error.");
+		return -1;
+	}
+
+	return 0;
+}
+#else
+static int ssl_parse_global_keylog(char **args, int section_type, struct proxy *curpx,
+                                   const struct proxy *defpx, const char *file, int line,
+                                   char **err)
+{
+	memprintf(err, "'%s' requires at least OpenSSL 1.1.1.", args[0]);
+	return -1;
+}
+#endif
+
+/* parse "ssl.force-private-cache".
+ * Returns <0 on alert, >0 on warning, 0 on success.
+ */
+static int ssl_parse_global_private_cache(char **args, int section_type, struct proxy *curpx,
+                                          const struct proxy *defpx, const char *file, int line,
+                                          char **err)
+{
+	if (too_many_args(0, args, err, NULL))
+		return -1;
+
+	global_ssl.private_cache = 1;
+	return 0;
+}
+
+/* parse "ssl.lifetime".
+ * Returns <0 on alert, >0 on warning, 0 on success.
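+ * For illustration (example value only), "tune.ssl.lifetime 5m" is stored as
+ * 300 below since the default unit here is the second.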
+ */ +static int ssl_parse_global_lifetime(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + const char *res; + + if (too_many_args(1, args, err, NULL)) + return -1; + + if (*(args[1]) == 0) { + memprintf(err, "'%s' expects ssl sessions <lifetime> in seconds as argument.", args[0]); + return -1; + } + + res = parse_time_err(args[1], &global_ssl.life_time, TIME_UNIT_S); + if (res == PARSE_TIME_OVER) { + memprintf(err, "timer overflow in argument '%s' to <%s> (maximum value is 2147483647 s or ~68 years).", + args[1], args[0]); + return -1; + } + else if (res == PARSE_TIME_UNDER) { + memprintf(err, "timer underflow in argument '%s' to <%s> (minimum non-null value is 1 s).", + args[1], args[0]); + return -1; + } + else if (res) { + memprintf(err, "unexpected character '%c' in argument to <%s>.", *res, args[0]); + return -1; + } + return 0; +} + +#ifndef OPENSSL_NO_DH +/* parse "ssl-dh-param-file". + * Returns <0 on alert, >0 on warning, 0 on success. + */ +static int ssl_parse_global_dh_param_file(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (*(args[1]) == 0) { + memprintf(err, "'%s' expects a file path as an argument.", args[0]); + return -1; + } + + if (ssl_sock_load_global_dh_param_from_file(args[1])) { + memprintf(err, "'%s': unable to load DH parameters from file <%s>.", args[0], args[1]); + return -1; + } + return 0; +} + +/* parse "ssl.default-dh-param". + * Returns <0 on alert, >0 on warning, 0 on success. + */ +static int ssl_parse_global_default_dh(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (*(args[1]) == 0) { + memprintf(err, "'%s' expects an integer argument.", args[0]); + return -1; + } + + global_ssl.default_dh_param = atoi(args[1]); + if (global_ssl.default_dh_param < 1024) { + memprintf(err, "'%s' expects a value >= 1024.", args[0]); + return -1; + } + return 0; +} +#endif + + +/* + * parse "ssl-load-extra-files". 
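+ * (e.g. "ssl-load-extra-files bundle ocsp issuer", an illustrative combination:
+ * each argument ORs its SSL_GF_* flag into the mask built below)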
+ * multiple arguments are allowed: "bundle", "sctl", "ocsp", "issuer", "key", "all", "none"
+ */
+static int ssl_parse_global_extra_files(char **args, int section_type, struct proxy *curpx,
+                                        const struct proxy *defpx, const char *file, int line,
+                                        char **err)
+{
+	int i;
+	int gf = SSL_GF_NONE;
+
+	if (*(args[1]) == 0)
+		goto err_arg;
+
+	for (i = 1; *args[i]; i++) {
+
+		if (strcmp("bundle", args[i]) == 0) {
+			gf |= SSL_GF_BUNDLE;
+
+		} else if (strcmp("sctl", args[i]) == 0) {
+			gf |= SSL_GF_SCTL;
+
+		} else if (strcmp("ocsp", args[i]) == 0) {
+			gf |= SSL_GF_OCSP;
+
+		} else if (strcmp("issuer", args[i]) == 0) {
+			gf |= SSL_GF_OCSP_ISSUER;
+
+		} else if (strcmp("key", args[i]) == 0) {
+			gf |= SSL_GF_KEY;
+
+		} else if (strcmp("none", args[i]) == 0) {
+			if (gf != SSL_GF_NONE)
+				goto err_alone;
+			gf = SSL_GF_NONE;
+			i++;
+			break;
+
+		} else if (strcmp("all", args[i]) == 0) {
+			if (gf != SSL_GF_NONE)
+				goto err_alone;
+			gf = SSL_GF_ALL;
+			i++;
+			break;
+		} else {
+			goto err_arg;
+		}
+	}
+	/* break from loop but there are still arguments */
+	if (*args[i])
+		goto err_alone;
+
+	global_ssl.extra_files = gf;
+
+	return 0;
+
+err_alone:
+	memprintf(err, "'%s' : 'none' and 'all' can only be used alone", args[0]);
+	return -1;
+
+err_arg:
+	memprintf(err, "'%s' expects one or multiple arguments (none, all, bundle, sctl, ocsp, issuer, key).", args[0]);
+	return -1;
+}
+
+
+/* parse 'ssl-load-extra-del-ext' */
+static int ssl_parse_global_extra_noext(char **args, int section_type, struct proxy *curpx,
+                                        const struct proxy *defpx, const char *file, int line,
+                                        char **err)
+{
+	global_ssl.extra_files_noext = 1;
+	return 0;
+}
+
+
+/***************************** Bind keyword Parsing ********************************************/
+
+/* for ca-file and ca-verify-file */
+static int ssl_bind_parse_ca_file_common(char **args, int cur_arg, char **ca_file_p, int from_cli, char **err)
+{
+	if (!*args[cur_arg + 1]) {
+		memprintf(err, "'%s' : missing CAfile path", args[cur_arg]);
+		return ERR_ALERT | ERR_FATAL;
+	}
+
+	if ((*args[cur_arg + 1] != '/') && (*args[cur_arg + 1] != '@') && global_ssl.ca_base)
+		memprintf(ca_file_p, "%s/%s", global_ssl.ca_base, args[cur_arg + 1]);
+	else
+		memprintf(ca_file_p, "%s", args[cur_arg + 1]);
+
+	if (!ssl_store_load_locations_file(*ca_file_p, !from_cli, CAFILE_CERT)) {
+		memprintf(err, "'%s' : unable to load %s", args[cur_arg], *ca_file_p);
+		return ERR_ALERT | ERR_FATAL;
+	}
+	return 0;
+}
+
+/* parse the "ca-file" bind keyword */
+static int ssl_bind_parse_ca_file(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err)
+{
+	return ssl_bind_parse_ca_file_common(args, cur_arg, &conf->ca_file, from_cli, err);
+}
+static int bind_parse_ca_file(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err)
+{
+	return ssl_bind_parse_ca_file(args, cur_arg, px, &conf->ssl_conf, 0, err);
+}
+
+/* parse the "ca-verify-file" bind keyword */
+static int ssl_bind_parse_ca_verify_file(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err)
+{
+	return ssl_bind_parse_ca_file_common(args, cur_arg, &conf->ca_verify_file, from_cli, err);
+}
+static int bind_parse_ca_verify_file(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err)
+{
+	return ssl_bind_parse_ca_verify_file(args, cur_arg, px, &conf->ssl_conf, 0, err);
+}
+
+/* parse the "ca-sign-file" bind keyword */
+static int bind_parse_ca_sign_file(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err)
+{
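+	/* Relative paths are resolved against the global "ca-base" directory when
+	 * one is set: with "ca-base /etc/haproxy/ca" (an example path), a
+	 * "ca-sign-file ca.pem" argument resolves to "/etc/haproxy/ca/ca.pem" below.
+	 */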
+ if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing CAfile path", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if ((*args[cur_arg + 1] != '/') && (*args[cur_arg + 1] != '@') && global_ssl.ca_base) + memprintf(&conf->ca_sign_file, "%s/%s", global_ssl.ca_base, args[cur_arg + 1]); + else + memprintf(&conf->ca_sign_file, "%s", args[cur_arg + 1]); + + return 0; +} + +/* parse the "ca-sign-pass" bind keyword */ +static int bind_parse_ca_sign_pass(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing CAkey password", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + memprintf(&conf->ca_sign_pass, "%s", args[cur_arg + 1]); + return 0; +} + +/* parse the "ciphers" bind keyword */ +static int ssl_bind_parse_ciphers(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err) +{ + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing cipher suite", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + free(conf->ciphers); + conf->ciphers = strdup(args[cur_arg + 1]); + return 0; +} +static int bind_parse_ciphers(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + return ssl_bind_parse_ciphers(args, cur_arg, px, &conf->ssl_conf, 0, err); +} + +/* parse the "ciphersuites" bind keyword */ +static int ssl_bind_parse_ciphersuites(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err) +{ +#ifdef HAVE_SSL_CTX_SET_CIPHERSUITES + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing cipher suite", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + free(conf->ciphersuites); + conf->ciphersuites = strdup(args[cur_arg + 1]); + return 0; +#else + memprintf(err, "'%s' keyword not supported for this SSL library version (%s).", args[cur_arg], OPENSSL_VERSION_TEXT); + return ERR_ALERT | ERR_FATAL; +#endif +} + +static int bind_parse_ciphersuites(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + return ssl_bind_parse_ciphersuites(args, cur_arg, px, &conf->ssl_conf, 0, err); +} + +/* parse the "crt" bind keyword. Returns a set of ERR_* flags possibly with an error in <err>. */ +static int bind_parse_crt(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + char path[MAXPATHLEN]; + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing certificate location", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if ((*args[cur_arg + 1] != '/' ) && global_ssl.crt_base) { + if ((strlen(global_ssl.crt_base) + 1 + strlen(args[cur_arg + 1]) + 1) > sizeof(path) || + snprintf(path, sizeof(path), "%s/%s", global_ssl.crt_base, args[cur_arg + 1]) > sizeof(path)) { + memprintf(err, "'%s' : path too long", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + return ssl_sock_load_cert(path, conf, err); + } + + return ssl_sock_load_cert(args[cur_arg + 1], conf, err); +} + +/* parse the "crt-list" bind keyword. Returns a set of ERR_* flags possibly with an error in <err>. 
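+ * A crt-list file carries one certificate per line, optionally followed by
+ * ssl options in brackets and SNI filters, e.g. "site.pem [alpn h2] *.example.com"
+ * (an illustrative line, not a default).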
+ */
+static int bind_parse_crt_list(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err)
+{
+	int err_code;
+
+	if (!*args[cur_arg + 1]) {
+		memprintf(err, "'%s' : missing certificate location", args[cur_arg]);
+		return ERR_ALERT | ERR_FATAL;
+	}
+
+	err_code = ssl_sock_load_cert_list_file(args[cur_arg + 1], 0, conf, px, err);
+	if (err_code)
+		memprintf(err, "'%s' : %s", args[cur_arg], *err);
+
+	return err_code;
+}
+
+/* parse the "crl-file" bind keyword */
+static int ssl_bind_parse_crl_file(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err)
+{
+#ifndef X509_V_FLAG_CRL_CHECK
+	memprintf(err, "'%s' : library does not support CRL verify", args[cur_arg]);
+	return ERR_ALERT | ERR_FATAL;
+#else
+	if (!*args[cur_arg + 1]) {
+		memprintf(err, "'%s' : missing CRLfile path", args[cur_arg]);
+		return ERR_ALERT | ERR_FATAL;
+	}
+
+	if ((*args[cur_arg + 1] != '/') && (*args[cur_arg + 1] != '@') && global_ssl.ca_base)
+		memprintf(&conf->crl_file, "%s/%s", global_ssl.ca_base, args[cur_arg + 1]);
+	else
+		memprintf(&conf->crl_file, "%s", args[cur_arg + 1]);
+
+	if (!ssl_store_load_locations_file(conf->crl_file, !from_cli, CAFILE_CRL)) {
+		memprintf(err, "'%s' : unable to load %s", args[cur_arg], conf->crl_file);
+		return ERR_ALERT | ERR_FATAL;
+	}
+	return 0;
+#endif
+}
+static int bind_parse_crl_file(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err)
+{
+	return ssl_bind_parse_crl_file(args, cur_arg, px, &conf->ssl_conf, 0, err);
+}
+
+/* parse the "curves" bind keyword */
+static int ssl_bind_parse_curves(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err)
+{
+#if defined(SSL_CTX_set1_curves_list)
+	if (!*args[cur_arg + 1]) {
+		memprintf(err, "'%s' : missing curve suite", args[cur_arg]);
+		return ERR_ALERT | ERR_FATAL;
+	}
+	conf->curves = strdup(args[cur_arg + 1]);
+	return 0;
+#else
+	memprintf(err, "'%s' : library does not support curve suite", args[cur_arg]);
+	return ERR_ALERT | ERR_FATAL;
+#endif
+}
+static int bind_parse_curves(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err)
+{
+	return ssl_bind_parse_curves(args, cur_arg, px, &conf->ssl_conf, 0, err);
+}
+
+/* parse the "sigalgs" bind keyword */
+static int ssl_bind_parse_sigalgs(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err)
+{
+#if defined(SSL_CTX_set1_sigalgs_list)
+	if (!*args[cur_arg + 1]) {
+		memprintf(err, "'%s' : missing signature algorithm list", args[cur_arg]);
+		return ERR_ALERT | ERR_FATAL;
+	}
+	conf->sigalgs = strdup(args[cur_arg + 1]);
+	return 0;
+#else
+	memprintf(err, "'%s' : library does not support setting signature algorithms", args[cur_arg]);
+	return ERR_ALERT | ERR_FATAL;
+#endif
+}
+static int bind_parse_sigalgs(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err)
+{
+	return ssl_bind_parse_sigalgs(args, cur_arg, px, &conf->ssl_conf, 0, err);
+}
+
+/* parse the "client-sigalgs" bind keyword */
+static int ssl_bind_parse_client_sigalgs(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err)
+{
+#if defined(SSL_CTX_set1_client_sigalgs_list)
+	if (!*args[cur_arg + 1]) {
+		memprintf(err, "'%s' : missing signature algorithm list", args[cur_arg]);
+		return ERR_ALERT | ERR_FATAL;
+	}
+	conf->client_sigalgs = strdup(args[cur_arg + 1]);
+	return 0;
+#else
+	memprintf(err, "'%s' : library does not support setting signature algorithms", args[cur_arg]);
+	return ERR_ALERT | ERR_FATAL;
+#endif
+}
+static int bind_parse_client_sigalgs(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err)
+{
+	return ssl_bind_parse_client_sigalgs(args, cur_arg, px, &conf->ssl_conf, 0, err);
+}
+
+
+/* parse the "ecdhe" bind keyword */
+static int ssl_bind_parse_ecdhe(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err)
+{
+#if !defined(SSL_CTX_set_tmp_ecdh)
+	memprintf(err, "'%s' : library does not support elliptic curve Diffie-Hellman (too old)", args[cur_arg]);
+	return ERR_ALERT | ERR_FATAL;
+#elif defined(OPENSSL_NO_ECDH)
+	memprintf(err, "'%s' : library does not support elliptic curve Diffie-Hellman (disabled via OPENSSL_NO_ECDH)", args[cur_arg]);
+	return ERR_ALERT | ERR_FATAL;
+#else
+	if (!*args[cur_arg + 1]) {
+		memprintf(err, "'%s' : missing named curve", args[cur_arg]);
+		return ERR_ALERT | ERR_FATAL;
+	}
+
+	conf->ecdhe = strdup(args[cur_arg + 1]);
+
+	return 0;
+#endif
+}
+static int bind_parse_ecdhe(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err)
+{
+	return ssl_bind_parse_ecdhe(args, cur_arg, px, &conf->ssl_conf, 0, err);
+}
+
+/* parse the "crt-ignore-err" and "ca-ignore-err" bind keywords */
+static int bind_parse_ignore_err(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err)
+{
+	int code;
+	char *s1 = NULL, *s2 = NULL;
+	char *token = NULL;
+	char *p = args[cur_arg + 1];
+	char *str;
+	unsigned long long *ignerr = conf->crt_ignerr_bitfield;
+
+	if (!*p) {
+		memprintf(err, "'%s' : missing error IDs list", args[cur_arg]);
+		return ERR_ALERT | ERR_FATAL;
+	}
+
+	if (strcmp(args[cur_arg], "ca-ignore-err") == 0)
+		ignerr = conf->ca_ignerr_bitfield;
+
+	if (strcmp(p, "all") == 0) {
+		cert_ignerr_bitfield_set_all(ignerr);
+		return 0;
+	}
+
+	/* copy the string to be able to dump the complete one in case of
+	 * error, because strtok_r is writing \0 inside.
*/ + str = strdup(p); + if (!str) { + memprintf(err, "'%s' : Could not allocate memory", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + s1 = str; + while ((token = strtok_r(s1, ",", &s2))) { + s1 = NULL; + if (isdigit((int)*token)) { + code = atoi(token); + if ((code <= 0) || (code > SSL_MAX_VFY_ERROR_CODE)) { + memprintf(err, "'%s' : ID '%d' out of range (1..%d) in error IDs list '%s'", + args[cur_arg], code, SSL_MAX_VFY_ERROR_CODE, args[cur_arg + 1]); + free(str); + return ERR_ALERT | ERR_FATAL; + } + } else { + code = x509_v_err_str_to_int(token); + if (code < 0) { + memprintf(err, "'%s' : error constant '%s' unknown in error IDs list '%s'", + args[cur_arg], token, args[cur_arg + 1]); + free(str); + return ERR_ALERT | ERR_FATAL; + } + } + cert_ignerr_bitfield_set(ignerr, code); + } + + free(str); + return 0; +} + +/* parse tls_method_options "no-xxx" and "force-xxx" */ +static int parse_tls_method_options(char *arg, struct tls_version_filter *methods, char **err) +{ + uint16_t v; + char *p; + p = strchr(arg, '-'); + if (!p) + goto fail; + p++; + if (strcmp(p, "sslv3") == 0) + v = CONF_SSLV3; + else if (strcmp(p, "tlsv10") == 0) + v = CONF_TLSV10; + else if (strcmp(p, "tlsv11") == 0) + v = CONF_TLSV11; + else if (strcmp(p, "tlsv12") == 0) + v = CONF_TLSV12; + else if (strcmp(p, "tlsv13") == 0) + v = CONF_TLSV13; + else + goto fail; + if (!strncmp(arg, "no-", 3)) + methods->flags |= methodVersions[v].flag; + else if (!strncmp(arg, "force-", 6)) + methods->min = methods->max = v; + else + goto fail; + return 0; + fail: + memprintf(err, "'%s' : option not implemented", arg); + return ERR_ALERT | ERR_FATAL; +} + +static int bind_parse_tls_method_options(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + return parse_tls_method_options(args[cur_arg], &conf->ssl_conf.ssl_methods, err); +} + +static int srv_parse_tls_method_options(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + return parse_tls_method_options(args[*cur_arg], &newsrv->ssl_ctx.methods, err); +} + +/* parse tls_method min/max: "ssl-min-ver" and "ssl-max-ver" */ +static int parse_tls_method_minmax(char **args, int cur_arg, struct tls_version_filter *methods, char **err) +{ + uint16_t i, v = 0; + char *argv = args[cur_arg + 1]; + if (!*argv) { + memprintf(err, "'%s' : missing the ssl/tls version", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + for (i = CONF_TLSV_MIN; i <= CONF_TLSV_MAX; i++) + if (strcmp(argv, methodVersions[i].name) == 0) + v = i; + if (!v) { + memprintf(err, "'%s' : unknown ssl/tls version", args[cur_arg + 1]); + return ERR_ALERT | ERR_FATAL; + } + if (strcmp("ssl-min-ver", args[cur_arg]) == 0) + methods->min = v; + else if (strcmp("ssl-max-ver", args[cur_arg]) == 0) + methods->max = v; + else { + memprintf(err, "'%s' : option not implemented", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + return 0; +} + +static int ssl_bind_parse_tls_method_minmax(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err) +{ + int ret; + +#if (HA_OPENSSL_VERSION_NUMBER < 0x10101000L) && !defined(OPENSSL_IS_BORINGSSL) + ha_warning("crt-list: ssl-min-ver and ssl-max-ver are not supported with this Openssl version (skipped).\n"); +#endif + ret = parse_tls_method_minmax(args, cur_arg, &conf->ssl_methods_cfg, err); + if (ret != ERR_NONE) + return ret; + + conf->ssl_methods.min = conf->ssl_methods_cfg.min; + conf->ssl_methods.max = conf->ssl_methods_cfg.max; + + return ret; +} +static int 
bind_parse_tls_method_minmax(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + return parse_tls_method_minmax(args, cur_arg, &conf->ssl_conf.ssl_methods, err); +} + +static int srv_parse_tls_method_minmax(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + return parse_tls_method_minmax(args, *cur_arg, &newsrv->ssl_ctx.methods, err); +} + +/* parse the "no-tls-tickets" bind keyword */ +static int bind_parse_no_tls_tickets(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + conf->ssl_options |= BC_SSL_O_NO_TLS_TICKETS; + return 0; +} + +/* parse the "allow-0rtt" bind keyword */ +static int ssl_bind_parse_allow_0rtt(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err) +{ + conf->early_data = 1; + return 0; +} + +static int bind_parse_allow_0rtt(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + conf->ssl_conf.early_data = 1; + return 0; +} + +/* parse the "npn" bind keyword */ +static int ssl_bind_parse_npn(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err) +{ +#if defined(OPENSSL_NPN_NEGOTIATED) && !defined(OPENSSL_NO_NEXTPROTONEG) + char *p1, *p2; + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing the comma-delimited NPN protocol suite", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + free(conf->npn_str); + + /* the NPN string is built as a suite of (<len> <name>)*, + * so we reuse each comma to store the next <len> and need + * one more for the end of the string. + */ + conf->npn_len = strlen(args[cur_arg + 1]) + 1; + conf->npn_str = calloc(1, conf->npn_len + 1); + if (!conf->npn_str) { + memprintf(err, "out of memory"); + return ERR_ALERT | ERR_FATAL; + } + + memcpy(conf->npn_str + 1, args[cur_arg + 1], conf->npn_len); + + /* replace commas with the name length */ + p1 = conf->npn_str; + p2 = p1 + 1; + while (1) { + p2 = memchr(p1 + 1, ',', conf->npn_str + conf->npn_len - (p1 + 1)); + if (!p2) + p2 = p1 + 1 + strlen(p1 + 1); + + if (p2 - (p1 + 1) > 255) { + *p2 = '\0'; + memprintf(err, "'%s' : NPN protocol name too long : '%s'", args[cur_arg], p1 + 1); + return ERR_ALERT | ERR_FATAL; + } + + *p1 = p2 - (p1 + 1); + p1 = p2; + + if (!*p2) + break; + + *(p2++) = '\0'; + } + return 0; +#else + memprintf(err, "'%s' : library does not support TLS NPN extension", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; +#endif +} + +static int bind_parse_npn(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + return ssl_bind_parse_npn(args, cur_arg, px, &conf->ssl_conf, 0, err); +} + + +/* Parses a alpn string and converts it to the right format for the SSL api */ +int ssl_sock_parse_alpn(char *arg, char **alpn_str, int *alpn_len, char **err) +{ + char *p1, *p2, *alpn = NULL; + int len, ret = 0; + + *alpn_str = NULL; + *alpn_len = 0; + + if (!*arg) { + memprintf(err, "missing the comma-delimited ALPN protocol suite"); + goto error; + } + + /* the ALPN string is built as a suite of (<len> <name>)*, + * so we reuse each comma to store the next <len> and need + * one more for the end of the string. 
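+ * For example, "h2,http/1.1" is rewritten in place into "\x02h2\x08http/1.1":
+ * each length byte takes the position left by the leading byte or a comma.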
+ */ + len = strlen(arg) + 1; + alpn = calloc(1, len+1); + if (!alpn) { + memprintf(err, "'%s' : out of memory", arg); + goto error; + } + memcpy(alpn+1, arg, len); + + /* replace commas with the name length */ + p1 = alpn; + p2 = p1 + 1; + while (1) { + p2 = memchr(p1 + 1, ',', alpn + len - (p1 + 1)); + if (!p2) + p2 = p1 + 1 + strlen(p1 + 1); + + if (p2 - (p1 + 1) > 255) { + *p2 = '\0'; + memprintf(err, "ALPN protocol name too long : '%s'", p1 + 1); + goto error; + } + + *p1 = p2 - (p1 + 1); + p1 = p2; + + if (!*p2) + break; + + *(p2++) = '\0'; + } + + *alpn_str = alpn; + *alpn_len = len; + + out: + return ret; + + error: + free(alpn); + ret = ERR_ALERT | ERR_FATAL; + goto out; +} + +/* parse the "alpn" bind keyword */ +static int ssl_bind_parse_alpn(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err) +{ +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation + int ret; + + free(conf->alpn_str); + + ret = ssl_sock_parse_alpn(args[cur_arg + 1], &conf->alpn_str, &conf->alpn_len, err); + if (ret) + memprintf(err, "'%s' : %s", args[cur_arg], *err); + return ret; +#else + memprintf(err, "'%s' : library does not support TLS ALPN extension", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; +#endif +} + +static int bind_parse_alpn(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + return ssl_bind_parse_alpn(args, cur_arg, px, &conf->ssl_conf, 0, err); +} + +/* parse the "ssl" bind keyword */ +static int bind_parse_ssl(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + conf->options |= BC_O_USE_SSL; + + if (global_ssl.listen_default_ciphers && !conf->ssl_conf.ciphers) + conf->ssl_conf.ciphers = strdup(global_ssl.listen_default_ciphers); +#if defined(SSL_CTX_set1_curves_list) + if (global_ssl.listen_default_curves && !conf->ssl_conf.curves) + conf->ssl_conf.curves = strdup(global_ssl.listen_default_curves); +#endif +#if defined(SSL_CTX_set1_sigalgs_list) + if (global_ssl.listen_default_sigalgs && !conf->ssl_conf.sigalgs) + conf->ssl_conf.sigalgs = strdup(global_ssl.listen_default_sigalgs); +#endif +#ifdef HAVE_SSL_CTX_SET_CIPHERSUITES + if (global_ssl.listen_default_ciphersuites && !conf->ssl_conf.ciphersuites) + conf->ssl_conf.ciphersuites = strdup(global_ssl.listen_default_ciphersuites); +#endif + conf->ssl_options |= global_ssl.listen_default_ssloptions; + conf->ssl_conf.ssl_methods.flags |= global_ssl.listen_default_sslmethods.flags; + if (!conf->ssl_conf.ssl_methods.min) + conf->ssl_conf.ssl_methods.min = global_ssl.listen_default_sslmethods.min; + if (!conf->ssl_conf.ssl_methods.max) + conf->ssl_conf.ssl_methods.max = global_ssl.listen_default_sslmethods.max; + + return 0; +} + +/* parse the "prefer-client-ciphers" bind keyword */ +static int bind_parse_pcc(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + conf->ssl_options |= BC_SSL_O_PREF_CLIE_CIPH; + return 0; +} + +/* parse the "generate-certificates" bind keyword */ +static int bind_parse_generate_certs(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ +#if (defined SSL_CTRL_SET_TLSEXT_HOSTNAME && !defined SSL_NO_GENERATE_CERTIFICATES) + conf->options |= BC_O_GENERATE_CERTS; +#else + memprintf(err, "%sthis version of openssl cannot generate SSL certificates.\n", + err && *err ? 
*err : ""); +#endif + return 0; +} + +/* parse the "strict-sni" bind keyword */ +static int bind_parse_strict_sni(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + conf->strict_sni = 1; + return 0; +} + +/* parse the "tls-ticket-keys" bind keyword */ +static int bind_parse_tls_ticket_keys(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ +#if (defined SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB && TLS_TICKETS_NO > 0) + FILE *f = NULL; + int i = 0; + char thisline[LINESIZE]; + struct tls_keys_ref *keys_ref = NULL; + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing TLS ticket keys file path", args[cur_arg]); + goto fail; + } + + keys_ref = tlskeys_ref_lookup(args[cur_arg + 1]); + if (keys_ref) { + keys_ref->refcount++; + conf->keys_ref = keys_ref; + return 0; + } + + keys_ref = calloc(1, sizeof(*keys_ref)); + if (!keys_ref) { + memprintf(err, "'%s' : allocation error", args[cur_arg+1]); + goto fail; + } + + keys_ref->tlskeys = malloc(TLS_TICKETS_NO * sizeof(union tls_sess_key)); + if (!keys_ref->tlskeys) { + memprintf(err, "'%s' : allocation error", args[cur_arg+1]); + goto fail; + } + + if ((f = fopen(args[cur_arg + 1], "r")) == NULL) { + memprintf(err, "'%s' : unable to load ssl tickets keys file", args[cur_arg+1]); + goto fail; + } + + keys_ref->filename = strdup(args[cur_arg + 1]); + if (!keys_ref->filename) { + memprintf(err, "'%s' : allocation error", args[cur_arg+1]); + goto fail; + } + + keys_ref->key_size_bits = 0; + while (fgets(thisline, sizeof(thisline), f) != NULL) { + int len = strlen(thisline); + int dec_size; + + /* Strip newline characters from the end */ + if(thisline[len - 1] == '\n') + thisline[--len] = 0; + + if(thisline[len - 1] == '\r') + thisline[--len] = 0; + + dec_size = base64dec(thisline, len, (char *) (keys_ref->tlskeys + i % TLS_TICKETS_NO), sizeof(union tls_sess_key)); + if (dec_size < 0) { + memprintf(err, "'%s' : unable to decode base64 key on line %d", args[cur_arg+1], i + 1); + goto fail; + } + else if (!keys_ref->key_size_bits && (dec_size == sizeof(struct tls_sess_key_128))) { + keys_ref->key_size_bits = 128; + } + else if (!keys_ref->key_size_bits && (dec_size == sizeof(struct tls_sess_key_256))) { + keys_ref->key_size_bits = 256; + } + else if (((dec_size != sizeof(struct tls_sess_key_128)) && (dec_size != sizeof(struct tls_sess_key_256))) + || ((dec_size == sizeof(struct tls_sess_key_128) && (keys_ref->key_size_bits != 128))) + || ((dec_size == sizeof(struct tls_sess_key_256) && (keys_ref->key_size_bits != 256)))) { + memprintf(err, "'%s' : wrong sized key on line %d", args[cur_arg+1], i + 1); + goto fail; + } + i++; + } + + if (i < TLS_TICKETS_NO) { + memprintf(err, "'%s' : please supply at least %d keys in the tls-tickets-file", args[cur_arg+1], TLS_TICKETS_NO); + goto fail; + } + + fclose(f); + + /* Use penultimate key for encryption, handle when TLS_TICKETS_NO = 1 */ + i -= 2; + keys_ref->tls_ticket_enc_index = i < 0 ? 
0 : i % TLS_TICKETS_NO; + keys_ref->unique_id = -1; + keys_ref->refcount = 1; + HA_RWLOCK_INIT(&keys_ref->lock); + conf->keys_ref = keys_ref; + + LIST_INSERT(&tlskeys_reference, &keys_ref->list); + + return 0; + + fail: + if (f) + fclose(f); + if (keys_ref) { + free(keys_ref->filename); + free(keys_ref->tlskeys); + free(keys_ref); + } + return ERR_ALERT | ERR_FATAL; + +#else + memprintf(err, "'%s' : TLS ticket callback extension not supported", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; +#endif /* SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB */ +} + +/* parse the "verify" bind keyword */ +static int ssl_bind_parse_verify(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err) +{ + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing verify method", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if (strcmp(args[cur_arg + 1], "none") == 0) + conf->verify = SSL_SOCK_VERIFY_NONE; + else if (strcmp(args[cur_arg + 1], "optional") == 0) + conf->verify = SSL_SOCK_VERIFY_OPTIONAL; + else if (strcmp(args[cur_arg + 1], "required") == 0) + conf->verify = SSL_SOCK_VERIFY_REQUIRED; + else { + memprintf(err, "'%s' : unknown verify method '%s', only 'none', 'optional', and 'required' are supported\n", + args[cur_arg], args[cur_arg + 1]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} +static int bind_parse_verify(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + return ssl_bind_parse_verify(args, cur_arg, px, &conf->ssl_conf, 0, err); +} + +/* parse the "no-alpn" ssl-bind keyword, storing an empty ALPN string */ +static int ssl_bind_parse_no_alpn(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err) +{ + free(conf->alpn_str); + conf->alpn_len = 0; + conf->alpn_str = strdup(""); + + if (!conf->alpn_str) { + memprintf(err, "'%s' : out of memory", *args); + return ERR_ALERT | ERR_FATAL; + } + return 0; +} + +/* parse the "no-alpn" bind keyword, storing an empty ALPN string */ +static int bind_parse_no_alpn(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + return ssl_bind_parse_no_alpn(args, cur_arg, px, &conf->ssl_conf, 0, err); +} + + +/* parse the "no-ca-names" bind keyword */ +static int ssl_bind_parse_no_ca_names(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err) +{ + conf->no_ca_names = 1; + return 0; +} + +static int bind_parse_no_ca_names(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + return ssl_bind_parse_no_ca_names(args, cur_arg, px, &conf->ssl_conf, 0, err); +} + + +static int ssl_bind_parse_ocsp_update(char **args, int cur_arg, struct proxy *px, + struct ssl_bind_conf *ssl_conf, int from_cli, char **err) +{ + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : expecting <on|off>", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if (strcmp(args[cur_arg + 1], "on") == 0) + ssl_conf->ocsp_update = SSL_SOCK_OCSP_UPDATE_ON; + else if (strcmp(args[cur_arg + 1], "off") == 0) + ssl_conf->ocsp_update = SSL_SOCK_OCSP_UPDATE_OFF; + else { + memprintf(err, "'%s' : expecting <on|off>", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if (ssl_conf->ocsp_update == SSL_SOCK_OCSP_UPDATE_ON) { + /* We might need to create the main ocsp update task */ + int ret = ssl_create_ocsp_update_task(err); + if (ret) + return ret; + } + + return 0; +} + + +/***************************** "server" keywords Parsing ********************************************/ 
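+
+/* A "server" line combining several of the keywords parsed below might look
+ * like this (illustrative values only):
+ *   server app1 10.0.0.1:443 ssl verify required ca-file ca.pem
+ *          alpn h2,http/1.1 check check-ssl check-sni app1.example.com
+ */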
+ +/* parse the "npn" server keyword */ +static int srv_parse_npn(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ +#if defined(OPENSSL_NPN_NEGOTIATED) && !defined(OPENSSL_NO_NEXTPROTONEG) + char *p1, *p2; + + if (!*args[*cur_arg + 1]) { + memprintf(err, "'%s' : missing the comma-delimited NPN protocol suite", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + free(newsrv->ssl_ctx.npn_str); + + /* the NPN string is built as a suite of (<len> <name>)*, + * so we reuse each comma to store the next <len> and need + * one more for the end of the string. + */ + newsrv->ssl_ctx.npn_len = strlen(args[*cur_arg + 1]) + 1; + newsrv->ssl_ctx.npn_str = calloc(1, newsrv->ssl_ctx.npn_len + 1); + if (!newsrv->ssl_ctx.npn_str) { + memprintf(err, "out of memory"); + return ERR_ALERT | ERR_FATAL; + } + + memcpy(newsrv->ssl_ctx.npn_str + 1, args[*cur_arg + 1], + newsrv->ssl_ctx.npn_len); + + /* replace commas with the name length */ + p1 = newsrv->ssl_ctx.npn_str; + p2 = p1 + 1; + while (1) { + p2 = memchr(p1 + 1, ',', newsrv->ssl_ctx.npn_str + + newsrv->ssl_ctx.npn_len - (p1 + 1)); + if (!p2) + p2 = p1 + 1 + strlen(p1 + 1); + + if (p2 - (p1 + 1) > 255) { + *p2 = '\0'; + memprintf(err, "'%s' : NPN protocol name too long : '%s'", args[*cur_arg], p1 + 1); + return ERR_ALERT | ERR_FATAL; + } + + *p1 = p2 - (p1 + 1); + p1 = p2; + + if (!*p2) + break; + + *(p2++) = '\0'; + } + return 0; +#else + memprintf(err, "'%s' : library does not support TLS NPN extension", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; +#endif +} + +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation +static int parse_alpn(char *alpn, char **out_alpn_str, int *out_alpn_len, char **err) +{ + free(*out_alpn_str); + return ssl_sock_parse_alpn(alpn, out_alpn_str, out_alpn_len, err); +} +#endif + +/* parse the "alpn" server keyword */ +static int srv_parse_alpn(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation + int ret = parse_alpn(args[*cur_arg + 1], + &newsrv->ssl_ctx.alpn_str, + &newsrv->ssl_ctx.alpn_len, err); + if (ret) + memprintf(err, "'%s' : %s", args[*cur_arg], *err); + return ret; +#else + memprintf(err, "'%s' : library does not support TLS ALPN extension", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; +#endif +} + +/* parse the "check-alpn" server keyword */ +static int srv_parse_check_alpn(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation + int ret = parse_alpn(args[*cur_arg + 1], + &newsrv->check.alpn_str, + &newsrv->check.alpn_len, err); + if (ret) + memprintf(err, "'%s' : %s", args[*cur_arg], *err); + return ret; +#else + memprintf(err, "'%s' : library does not support TLS ALPN extension", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; +#endif +} + +/* parse the "ca-file" server keyword */ +static int srv_parse_ca_file(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + const int create_if_none = newsrv->flags & SRV_F_DYNAMIC ? 
0 : 1; + + if (!*args[*cur_arg + 1]) { + memprintf(err, "'%s' : missing CAfile path", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if ((*args[*cur_arg + 1] != '/') && (*args[*cur_arg + 1] != '@') && global_ssl.ca_base) + memprintf(&newsrv->ssl_ctx.ca_file, "%s/%s", global_ssl.ca_base, args[*cur_arg + 1]); + else + memprintf(&newsrv->ssl_ctx.ca_file, "%s", args[*cur_arg + 1]); + + if (!ssl_store_load_locations_file(newsrv->ssl_ctx.ca_file, create_if_none, CAFILE_CERT)) { + memprintf(err, "'%s' : unable to load %s", args[*cur_arg], newsrv->ssl_ctx.ca_file); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +/* parse the "check-sni" server keyword */ +static int srv_parse_check_sni(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + if (!*args[*cur_arg + 1]) { + memprintf(err, "'%s' : missing SNI", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + newsrv->check.sni = strdup(args[*cur_arg + 1]); + if (!newsrv->check.sni) { + memprintf(err, "'%s' : failed to allocate memory", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + return 0; +} + +/* common function to init ssl_ctx */ +static int ssl_sock_init_srv(struct server *s) +{ + if (global_ssl.connect_default_ciphers && !s->ssl_ctx.ciphers) { + s->ssl_ctx.ciphers = strdup(global_ssl.connect_default_ciphers); + if (!s->ssl_ctx.ciphers) + return 1; + } +#ifdef HAVE_SSL_CTX_SET_CIPHERSUITES + if (global_ssl.connect_default_ciphersuites && !s->ssl_ctx.ciphersuites) { + s->ssl_ctx.ciphersuites = strdup(global_ssl.connect_default_ciphersuites); + if (!s->ssl_ctx.ciphersuites) + return 1; + } +#endif + s->ssl_ctx.options |= global_ssl.connect_default_ssloptions; + s->ssl_ctx.methods.flags |= global_ssl.connect_default_sslmethods.flags; + + if (!s->ssl_ctx.methods.min) + s->ssl_ctx.methods.min = global_ssl.connect_default_sslmethods.min; + + if (!s->ssl_ctx.methods.max) + s->ssl_ctx.methods.max = global_ssl.connect_default_sslmethods.max; + +#if defined(SSL_CTX_set1_sigalgs_list) + if (global_ssl.connect_default_sigalgs && !s->ssl_ctx.sigalgs) { + s->ssl_ctx.sigalgs = strdup(global_ssl.connect_default_sigalgs); + if (!s->ssl_ctx.sigalgs) + return 1; + } +#endif + +#if defined(SSL_CTX_set1_client_sigalgs_list) + if (global_ssl.connect_default_client_sigalgs && !s->ssl_ctx.client_sigalgs) { + s->ssl_ctx.client_sigalgs = strdup(global_ssl.connect_default_client_sigalgs); + if (!s->ssl_ctx.client_sigalgs) + return 1; + } +#endif + +#if defined(SSL_CTX_set1_curves_list) + if (global_ssl.connect_default_curves && !s->ssl_ctx.curves) { + s->ssl_ctx.curves = strdup(global_ssl.connect_default_curves); + if (!s->ssl_ctx.curves) + return 1; + } +#endif + + return 0; +} + +/* parse the "check-ssl" server keyword */ +static int srv_parse_check_ssl(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + newsrv->check.use_ssl = 1; + if (ssl_sock_init_srv(newsrv)) { + memprintf(err, "'%s' : not enough memory", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +/* parse the "ciphers" server keyword */ +static int srv_parse_ciphers(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + if (!*args[*cur_arg + 1]) { + memprintf(err, "'%s' : missing cipher suite", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + free(newsrv->ssl_ctx.ciphers); + newsrv->ssl_ctx.ciphers = strdup(args[*cur_arg + 1]); + + if (!newsrv->ssl_ctx.ciphers) { + memprintf(err, "'%s' : not enough memory", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +/* parse the 
"ciphersuites" server keyword */ +static int srv_parse_ciphersuites(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ +#ifdef HAVE_SSL_CTX_SET_CIPHERSUITES + if (!*args[*cur_arg + 1]) { + memprintf(err, "'%s' : missing cipher suite", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + free(newsrv->ssl_ctx.ciphersuites); + newsrv->ssl_ctx.ciphersuites = strdup(args[*cur_arg + 1]); + + if (!newsrv->ssl_ctx.ciphersuites) { + memprintf(err, "'%s' : not enough memory", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +#else /* ! HAVE_SSL_CTX_SET_CIPHERSUITES */ + memprintf(err, "'%s' not supported for your SSL library (%s).", args[*cur_arg], OPENSSL_VERSION_TEXT); + return ERR_ALERT | ERR_FATAL; + +#endif +} + +/* parse the "client-sigalgs" server keyword */ +static int srv_parse_client_sigalgs(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ +#ifndef SSL_CTX_set1_client_sigalgs_list + memprintf(err, "'%s' : library does not support setting signature algorithms", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; +#else + char *arg; + + arg = args[*cur_arg + 1]; + if (!*arg) { + memprintf(err, "'%s' : missing signature algorithm list", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + newsrv->ssl_ctx.client_sigalgs = strdup(arg); + if (!newsrv->ssl_ctx.client_sigalgs) { + memprintf(err, "out of memory"); + return ERR_ALERT | ERR_FATAL; + } + return 0; +#endif +} + + +/* parse the "crl-file" server keyword */ +static int srv_parse_crl_file(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ +#ifndef X509_V_FLAG_CRL_CHECK + memprintf(err, "'%s' : library does not support CRL verify", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; +#else + const int create_if_none = newsrv->flags & SRV_F_DYNAMIC ? 
0 : 1; + + if (!*args[*cur_arg + 1]) { + memprintf(err, "'%s' : missing CRLfile path", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if ((*args[*cur_arg + 1] != '/') && (*args[*cur_arg + 1] != '@') && global_ssl.ca_base) + memprintf(&newsrv->ssl_ctx.crl_file, "%s/%s", global_ssl.ca_base, args[*cur_arg + 1]); + else + memprintf(&newsrv->ssl_ctx.crl_file, "%s", args[*cur_arg + 1]); + + if (!ssl_store_load_locations_file(newsrv->ssl_ctx.crl_file, create_if_none, CAFILE_CRL)) { + memprintf(err, "'%s' : unable to load %s", args[*cur_arg], newsrv->ssl_ctx.crl_file); + return ERR_ALERT | ERR_FATAL; + } + return 0; +#endif +} + +/* parse the "curves" server keyword */ +static int srv_parse_curves(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ +#ifndef SSL_CTX_set1_curves_list + memprintf(err, "'%s' : library does not support setting curves list", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; +#else + char *arg; + + arg = args[*cur_arg + 1]; + if (!*arg) { + memprintf(err, "'%s' : missing curves list", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + newsrv->ssl_ctx.curves = strdup(arg); + if (!newsrv->ssl_ctx.curves) { + memprintf(err, "out of memory"); + return ERR_ALERT | ERR_FATAL; + } + return 0; +#endif +} + +/* parse the "crt" server keyword */ +static int srv_parse_crt(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + if (!*args[*cur_arg + 1]) { + memprintf(err, "'%s' : missing certificate file path", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if ((*args[*cur_arg + 1] != '/') && global_ssl.crt_base) + memprintf(&newsrv->ssl_ctx.client_crt, "%s/%s", global_ssl.crt_base, args[*cur_arg + 1]); + else + memprintf(&newsrv->ssl_ctx.client_crt, "%s", args[*cur_arg + 1]); + + return 0; +} + +/* parse the "no-check-ssl" server keyword */ +static int srv_parse_no_check_ssl(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + newsrv->check.use_ssl = -1; + ha_free(&newsrv->ssl_ctx.ciphers); + newsrv->ssl_ctx.options &= ~global_ssl.connect_default_ssloptions; + return 0; +} + +/* parse the "no-send-proxy-v2-ssl" server keyword */ +static int srv_parse_no_send_proxy_ssl(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + newsrv->pp_opts &= ~SRV_PP_V2; + newsrv->pp_opts &= ~SRV_PP_V2_SSL; + return 0; +} + +/* parse the "no-send-proxy-v2-ssl-cn" server keyword */ +static int srv_parse_no_send_proxy_cn(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + newsrv->pp_opts &= ~SRV_PP_V2; + newsrv->pp_opts &= ~SRV_PP_V2_SSL; + newsrv->pp_opts &= ~SRV_PP_V2_SSL_CN; + return 0; +} + +/* parse the "no-ssl" server keyword */ +static int srv_parse_no_ssl(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + /* if default-server has use_ssl, prepare ssl settings */ + if (newsrv->use_ssl == 1) { + if (ssl_sock_init_srv(newsrv)) { + memprintf(err, "'%s' : not enough memory", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + } + else { + ha_free(&newsrv->ssl_ctx.ciphers); + } + newsrv->use_ssl = -1; + return 0; +} + +/* parse the "allow-0rtt" server keyword */ +static int srv_parse_allow_0rtt(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + newsrv->ssl_ctx.options |= SRV_SSL_O_EARLY_DATA; + return 0; +} + +/* parse the "no-ssl-reuse" server keyword */ +static int srv_parse_no_ssl_reuse(char **args, int *cur_arg, struct proxy *px, struct server 
*newsrv, char **err) +{ + newsrv->ssl_ctx.options |= SRV_SSL_O_NO_REUSE; + return 0; +} + +/* parse the "no-tls-tickets" server keyword */ +static int srv_parse_no_tls_tickets(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + newsrv->ssl_ctx.options |= SRV_SSL_O_NO_TLS_TICKETS; + return 0; +} +/* parse the "send-proxy-v2-ssl" server keyword */ +static int srv_parse_send_proxy_ssl(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + newsrv->pp_opts |= SRV_PP_V2; + newsrv->pp_opts |= SRV_PP_V2_SSL; + return 0; +} + +/* parse the "send-proxy-v2-ssl-cn" server keyword */ +static int srv_parse_send_proxy_cn(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + newsrv->pp_opts |= SRV_PP_V2; + newsrv->pp_opts |= SRV_PP_V2_SSL; + newsrv->pp_opts |= SRV_PP_V2_SSL_CN; + return 0; +} + +/* parse the "sigalgs" server keyword */ +static int srv_parse_sigalgs(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ +#ifndef SSL_CTX_set1_sigalgs_list + memprintf(err, "'%s' : library does not support setting signature algorithms", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; +#else + char *arg; + + arg = args[*cur_arg + 1]; + if (!*arg) { + memprintf(err, "'%s' : missing signature algorithm list", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + newsrv->ssl_ctx.sigalgs = strdup(arg); + if (!newsrv->ssl_ctx.sigalgs) { + memprintf(err, "out of memory"); + return ERR_ALERT | ERR_FATAL; + } + return 0; +#endif +} + +/* parse the "sni" server keyword */ +static int srv_parse_sni(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ +#ifndef SSL_CTRL_SET_TLSEXT_HOSTNAME + memprintf(err, "'%s' : the current SSL library doesn't support the SNI TLS extension", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; +#else + char *arg; + + arg = args[*cur_arg + 1]; + if (!*arg) { + memprintf(err, "'%s' : missing sni expression", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + free(newsrv->sni_expr); + newsrv->sni_expr = strdup(arg); + if (!newsrv->sni_expr) { + memprintf(err, "out of memory"); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +#endif +} + +/* parse the "ssl" server keyword */ +static int srv_parse_ssl(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + newsrv->use_ssl = 1; + if (ssl_sock_init_srv(newsrv)) { + memprintf(err, "'%s' : not enough memory", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +/* parse the "ssl-reuse" server keyword */ +static int srv_parse_ssl_reuse(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + newsrv->ssl_ctx.options &= ~SRV_SSL_O_NO_REUSE; + return 0; +} + +/* parse the "tls-tickets" server keyword */ +static int srv_parse_tls_tickets(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + newsrv->ssl_ctx.options &= ~SRV_SSL_O_NO_TLS_TICKETS; + return 0; +} + +/* parse the "verify" server keyword */ +static int srv_parse_verify(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + if (!*args[*cur_arg + 1]) { + memprintf(err, "'%s' : missing verify method", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if (strcmp(args[*cur_arg + 1], "none") == 0) + newsrv->ssl_ctx.verify = SSL_SOCK_VERIFY_NONE; + else if (strcmp(args[*cur_arg + 1], "required") == 0) + newsrv->ssl_ctx.verify = SSL_SOCK_VERIFY_REQUIRED; + else { + memprintf(err, "'%s' : unknown 
verify method '%s', only 'none' and 'required' are supported\n", + args[*cur_arg], args[*cur_arg + 1]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +/* parse the "verifyhost" server keyword */ +static int srv_parse_verifyhost(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + if (!*args[*cur_arg + 1]) { + memprintf(err, "'%s' : missing hostname to verify against", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + free(newsrv->ssl_ctx.verify_host); + newsrv->ssl_ctx.verify_host = strdup(args[*cur_arg + 1]); + + if (!newsrv->ssl_ctx.verify_host) { + memprintf(err, "'%s' : not enough memory", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +/* parse the "ssl-default-bind-options" keyword in global section */ +static int ssl_parse_default_bind_options(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) { + int i = 1; + + if (*(args[i]) == 0) { + memprintf(err, "global statement '%s' expects an option as an argument.", args[0]); + return -1; + } + while (*(args[i])) { + if (strcmp(args[i], "no-tls-tickets") == 0) + global_ssl.listen_default_ssloptions |= BC_SSL_O_NO_TLS_TICKETS; + else if (strcmp(args[i], "prefer-client-ciphers") == 0) + global_ssl.listen_default_ssloptions |= BC_SSL_O_PREF_CLIE_CIPH; + else if (strcmp(args[i], "ssl-min-ver") == 0 || strcmp(args[i], "ssl-max-ver") == 0) { + if (!parse_tls_method_minmax(args, i, &global_ssl.listen_default_sslmethods, err)) + i++; + else { + memprintf(err, "%s on global statement '%s'.", *err, args[0]); + return -1; + } + } + else if (parse_tls_method_options(args[i], &global_ssl.listen_default_sslmethods, err)) { + memprintf(err, "unknown option '%s' on global statement '%s'.", args[i], args[0]); + return -1; + } + i++; + } + return 0; +} + +/* parse the "ssl-default-server-options" keyword in global section */ +static int ssl_parse_default_server_options(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) { + int i = 1; + + if (*(args[i]) == 0) { + memprintf(err, "global statement '%s' expects an option as an argument.", args[0]); + return -1; + } + while (*(args[i])) { + if (strcmp(args[i], "no-tls-tickets") == 0) + global_ssl.connect_default_ssloptions |= SRV_SSL_O_NO_TLS_TICKETS; + else if (strcmp(args[i], "ssl-min-ver") == 0 || strcmp(args[i], "ssl-max-ver") == 0) { + if (!parse_tls_method_minmax(args, i, &global_ssl.connect_default_sslmethods, err)) + i++; + else { + memprintf(err, "%s on global statement '%s'.", *err, args[0]); + return -1; + } + } + else if (parse_tls_method_options(args[i], &global_ssl.connect_default_sslmethods, err)) { + memprintf(err, "unknown option '%s' on global statement '%s'.", args[i], args[0]); + return -1; + } + i++; + } + return 0; +} + +/* parse the "ca-base" / "crt-base" keywords in global section. + * Returns <0 on alert, >0 on warning, 0 on success. + */ +static int ssl_parse_global_ca_crt_base(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + char **target; + + target = (args[0][1] == 'a') ? 
&global_ssl.ca_base : &global_ssl.crt_base; + + if (too_many_args(1, args, err, NULL)) + return -1; + + if (*target) { + memprintf(err, "'%s' already specified.", args[0]); + return -1; + } + + if (*(args[1]) == 0) { + memprintf(err, "global statement '%s' expects a directory path as an argument.", args[0]); + return -1; + } + *target = strdup(args[1]); + return 0; +} + +/* parse the "ssl-skip-self-issued-ca" keyword in global section. */ +static int ssl_parse_skip_self_issued_ca(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ +#ifdef SSL_CTX_build_cert_chain + global_ssl.skip_self_issued_ca = 1; + return 0; +#else + memprintf(err, "global statement '%s' requires at least OpenSSL 1.0.2.", args[0]); + return -1; +#endif +} + + +static int ssl_parse_global_ocsp_maxdelay(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + int value = 0; + + if (*(args[1]) == 0) { + memprintf(err, "'%s' expects an integer argument.", args[0]); + return -1; + } + + value = atoi(args[1]); + if (value < 0) { + memprintf(err, "'%s' expects a non-negative numeric value.", args[0]); + return -1; + } + + if (global_ssl.ocsp_update.delay_min > value) { + memprintf(err, "'%s' cannot be lower than tune.ssl.ocsp-update.mindelay.", args[0]); + return -1; + } + + global_ssl.ocsp_update.delay_max = value; + + return 0; +} + +static int ssl_parse_global_ocsp_mindelay(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + int value = 0; + + if (*(args[1]) == 0) { + memprintf(err, "'%s' expects an integer argument.", args[0]); + return -1; + } + + value = atoi(args[1]); + if (value < 0) { + memprintf(err, "'%s' expects a non-negative numeric value.", args[0]); + return -1; + } + + if (value > global_ssl.ocsp_update.delay_max) { + memprintf(err, "'%s' cannot be higher than tune.ssl.ocsp-update.maxdelay.", args[0]); + return -1; + } + + global_ssl.ocsp_update.delay_min = value; + + return 0; +} + + + +/* Note: must not be declared <const> as its list will be overwritten. + * Please take care of keeping this list alphabetically sorted, doing so helps + * all code contributors. + * Optional keywords are also declared with a NULL ->parse() function so that + * the config parser can report an appropriate error when a known keyword was + * not enabled. 
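+ *
+ * As an illustration only (hypothetical entry, not shipped with this file),
+ * the crt-list keywords registered below allow lines such as:
+ *   cert.pem [alpn h2,http/1.1 verify optional] *.example.com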
+ */ + +/* the <ssl_crtlist_kws> keywords are used for crt-list parsing, they *MUST* be safe + * with their proxy argument NULL and must only fill the ssl_bind_conf */ +struct ssl_crtlist_kw ssl_crtlist_kws[] = { + { "allow-0rtt", ssl_bind_parse_allow_0rtt, 0 }, /* allow 0-RTT */ + { "alpn", ssl_bind_parse_alpn, 1 }, /* set ALPN supported protocols */ + { "ca-file", ssl_bind_parse_ca_file, 1 }, /* set CAfile to process ca-names and verify on client cert */ + { "ca-verify-file", ssl_bind_parse_ca_verify_file, 1 }, /* set CAverify file to process verify on client cert */ + { "ciphers", ssl_bind_parse_ciphers, 1 }, /* set SSL cipher suite */ + { "ciphersuites", ssl_bind_parse_ciphersuites, 1 }, /* set TLS 1.3 cipher suite */ + { "client-sigalgs", ssl_bind_parse_client_sigalgs, 1 }, /* set SSL client signature algorithms */ + { "crl-file", ssl_bind_parse_crl_file, 1 }, /* set certificate revocation list file use on client cert verify */ + { "curves", ssl_bind_parse_curves, 1 }, /* set SSL curve suite */ + { "ecdhe", ssl_bind_parse_ecdhe, 1 }, /* defines named curve for elliptic curve Diffie-Hellman */ + { "no-alpn", ssl_bind_parse_no_alpn, 0 }, /* disable sending ALPN */ + { "no-ca-names", ssl_bind_parse_no_ca_names, 0 }, /* do not send ca names to clients (ca_file related) */ + { "npn", ssl_bind_parse_npn, 1 }, /* set NPN supported protocols */ + { "sigalgs", ssl_bind_parse_sigalgs, 1 }, /* set SSL signature algorithms */ + { "ssl-min-ver", ssl_bind_parse_tls_method_minmax,1 }, /* minimum version */ + { "ssl-max-ver", ssl_bind_parse_tls_method_minmax,1 }, /* maximum version */ + { "verify", ssl_bind_parse_verify, 1 }, /* set SSL verify method */ + { "ocsp-update", ssl_bind_parse_ocsp_update, 1 }, /* ocsp update mode (on or off) */ + { NULL, NULL, 0 }, +}; + +/* no initcall for ssl_bind_kws, these ones are parsed in the parser loop */ + +static struct bind_kw_list bind_kws = { "SSL", { }, { + { "allow-0rtt", bind_parse_allow_0rtt, 0 }, /* Allow 0RTT */ + { "alpn", bind_parse_alpn, 1 }, /* set ALPN supported protocols */ + { "ca-file", bind_parse_ca_file, 1 }, /* set CAfile to process ca-names and verify on client cert */ + { "ca-verify-file", bind_parse_ca_verify_file, 1 }, /* set CAverify file to process verify on client cert */ + { "ca-ignore-err", bind_parse_ignore_err, 1 }, /* set error IDs to ignore on verify depth > 0 */ + { "ca-sign-file", bind_parse_ca_sign_file, 1 }, /* set CAFile used to generate and sign server certs */ + { "ca-sign-pass", bind_parse_ca_sign_pass, 1 }, /* set CAKey passphrase */ + { "ciphers", bind_parse_ciphers, 1 }, /* set SSL cipher suite */ + { "ciphersuites", bind_parse_ciphersuites, 1 }, /* set TLS 1.3 cipher suite */ + { "client-sigalgs", bind_parse_client_sigalgs, 1 }, /* set SSL client signature algorithms */ + { "crl-file", bind_parse_crl_file, 1 }, /* set certificate revocation list file use on client cert verify */ + { "crt", bind_parse_crt, 1 }, /* load SSL certificates from this location */ + { "crt-ignore-err", bind_parse_ignore_err, 1 }, /* set error IDs to ignore on verify depth == 0 */ + { "crt-list", bind_parse_crt_list, 1 }, /* load a list of crt from this location */ + { "curves", bind_parse_curves, 1 }, /* set SSL curve suite */ + { "ecdhe", bind_parse_ecdhe, 1 }, /* defines named curve for elliptic curve Diffie-Hellman */ + { "force-sslv3", bind_parse_tls_method_options, 0 }, /* force SSLv3 */ + { "force-tlsv10", bind_parse_tls_method_options, 0 }, /* force TLSv10 */ + { "force-tlsv11", bind_parse_tls_method_options, 0 }, /* force TLSv11 */ + 
{ "force-tlsv12", bind_parse_tls_method_options, 0 }, /* force TLSv12 */ + { "force-tlsv13", bind_parse_tls_method_options, 0 }, /* force TLSv13 */ + { "generate-certificates", bind_parse_generate_certs, 0 }, /* enable the server certificates generation */ + { "no-alpn", bind_parse_no_alpn, 0 }, /* disable sending ALPN */ + { "no-ca-names", bind_parse_no_ca_names, 0 }, /* do not send ca names to clients (ca_file related) */ + { "no-sslv3", bind_parse_tls_method_options, 0 }, /* disable SSLv3 */ + { "no-tlsv10", bind_parse_tls_method_options, 0 }, /* disable TLSv10 */ + { "no-tlsv11", bind_parse_tls_method_options, 0 }, /* disable TLSv11 */ + { "no-tlsv12", bind_parse_tls_method_options, 0 }, /* disable TLSv12 */ + { "no-tlsv13", bind_parse_tls_method_options, 0 }, /* disable TLSv13 */ + { "no-tls-tickets", bind_parse_no_tls_tickets, 0 }, /* disable session resumption tickets */ + { "sigalgs", bind_parse_sigalgs, 1 }, /* set SSL signature algorithms */ + { "ssl", bind_parse_ssl, 0 }, /* enable SSL processing */ + { "ssl-min-ver", bind_parse_tls_method_minmax, 1 }, /* minimum version */ + { "ssl-max-ver", bind_parse_tls_method_minmax, 1 }, /* maximum version */ + { "strict-sni", bind_parse_strict_sni, 0 }, /* refuse negotiation if sni doesn't match a certificate */ + { "tls-ticket-keys", bind_parse_tls_ticket_keys, 1 }, /* set file to load TLS ticket keys from */ + { "verify", bind_parse_verify, 1 }, /* set SSL verify method */ + { "npn", bind_parse_npn, 1 }, /* set NPN supported protocols */ + { "prefer-client-ciphers", bind_parse_pcc, 0 }, /* prefer client ciphers */ + { NULL, NULL, 0 }, +}}; + +INITCALL1(STG_REGISTER, bind_register_keywords, &bind_kws); + +/* Note: must not be declared <const> as its list will be overwritten. + * Please take care of keeping this list alphabetically sorted, doing so helps + * all code contributors. + * Optional keywords are also declared with a NULL ->parse() function so that + * the config parser can report an appropriate error when a known keyword was + * not enabled. 
+ */ +static struct srv_kw_list srv_kws = { "SSL", { }, { + { "allow-0rtt", srv_parse_allow_0rtt, 0, 1, 1 }, /* Allow using early data on this server */ + { "alpn", srv_parse_alpn, 1, 1, 1 }, /* Set ALPN supported protocols */ + { "ca-file", srv_parse_ca_file, 1, 1, 1 }, /* set CAfile to process verify server cert */ + { "check-alpn", srv_parse_check_alpn, 1, 1, 1 }, /* Set ALPN used for checks */ + { "check-sni", srv_parse_check_sni, 1, 1, 1 }, /* set SNI */ + { "check-ssl", srv_parse_check_ssl, 0, 1, 1 }, /* enable SSL for health checks */ + { "ciphers", srv_parse_ciphers, 1, 1, 1 }, /* select the cipher suite */ + { "ciphersuites", srv_parse_ciphersuites, 1, 1, 1 }, /* select the cipher suite */ + { "client-sigalgs", srv_parse_client_sigalgs, 1, 1, 1 }, /* signature algorithms */ + { "crl-file", srv_parse_crl_file, 1, 1, 1 }, /* set certificate revocation list file use on server cert verify */ + { "curves", srv_parse_curves, 1, 1, 1 }, /* set TLS curves list */ + { "crt", srv_parse_crt, 1, 1, 1 }, /* set client certificate */ + { "force-sslv3", srv_parse_tls_method_options, 0, 1, 1 }, /* force SSLv3 */ + { "force-tlsv10", srv_parse_tls_method_options, 0, 1, 1 }, /* force TLSv10 */ + { "force-tlsv11", srv_parse_tls_method_options, 0, 1, 1 }, /* force TLSv11 */ + { "force-tlsv12", srv_parse_tls_method_options, 0, 1, 1 }, /* force TLSv12 */ + { "force-tlsv13", srv_parse_tls_method_options, 0, 1, 1 }, /* force TLSv13 */ + { "no-check-ssl", srv_parse_no_check_ssl, 0, 1, 0 }, /* disable SSL for health checks */ + { "no-send-proxy-v2-ssl", srv_parse_no_send_proxy_ssl, 0, 1, 0 }, /* do not send PROXY protocol header v2 with SSL info */ + { "no-send-proxy-v2-ssl-cn", srv_parse_no_send_proxy_cn, 0, 1, 0 }, /* do not send PROXY protocol header v2 with CN */ + { "no-ssl", srv_parse_no_ssl, 0, 1, 0 }, /* disable SSL processing */ + { "no-ssl-reuse", srv_parse_no_ssl_reuse, 0, 1, 1 }, /* disable session reuse */ + { "no-sslv3", srv_parse_tls_method_options, 0, 0, 1 }, /* disable SSLv3 */ + { "no-tlsv10", srv_parse_tls_method_options, 0, 0, 1 }, /* disable TLSv10 */ + { "no-tlsv11", srv_parse_tls_method_options, 0, 0, 1 }, /* disable TLSv11 */ + { "no-tlsv12", srv_parse_tls_method_options, 0, 0, 1 }, /* disable TLSv12 */ + { "no-tlsv13", srv_parse_tls_method_options, 0, 0, 1 }, /* disable TLSv13 */ + { "no-tls-tickets", srv_parse_no_tls_tickets, 0, 1, 1 }, /* disable session resumption tickets */ + { "npn", srv_parse_npn, 1, 1, 1 }, /* Set NPN supported protocols */ + { "send-proxy-v2-ssl", srv_parse_send_proxy_ssl, 0, 1, 1 }, /* send PROXY protocol header v2 with SSL info */ + { "send-proxy-v2-ssl-cn", srv_parse_send_proxy_cn, 0, 1, 1 }, /* send PROXY protocol header v2 with CN */ + { "sigalgs", srv_parse_sigalgs, 1, 1, 1 }, /* signature algorithms */ + { "sni", srv_parse_sni, 1, 1, 1 }, /* send SNI extension */ + { "ssl", srv_parse_ssl, 0, 1, 1 }, /* enable SSL processing */ + { "ssl-min-ver", srv_parse_tls_method_minmax, 1, 1, 1 }, /* minimum version */ + { "ssl-max-ver", srv_parse_tls_method_minmax, 1, 1, 1 }, /* maximum version */ + { "ssl-reuse", srv_parse_ssl_reuse, 0, 1, 0 }, /* enable session reuse */ + { "tls-tickets", srv_parse_tls_tickets, 0, 1, 1 }, /* enable session resumption tickets */ + { "verify", srv_parse_verify, 1, 1, 1 }, /* set SSL verify method */ + { "verifyhost", srv_parse_verifyhost, 1, 1, 1 }, /* require that SSL cert verifies for hostname */ + { NULL, NULL, 0, 0 }, +}}; + +INITCALL1(STG_REGISTER, srv_register_keywords, &srv_kws); + +static struct cfg_kw_list cfg_kws = 
{ILH, { + { CFG_GLOBAL, "ca-base", ssl_parse_global_ca_crt_base }, + { CFG_GLOBAL, "crt-base", ssl_parse_global_ca_crt_base }, + { CFG_GLOBAL, "issuers-chain-path", ssl_load_global_issuers_from_path }, + { CFG_GLOBAL, "maxsslconn", ssl_parse_global_int }, + { CFG_GLOBAL, "ssl-default-bind-options", ssl_parse_default_bind_options }, + { CFG_GLOBAL, "ssl-default-server-options", ssl_parse_default_server_options }, +#ifndef OPENSSL_NO_DH + { CFG_GLOBAL, "ssl-dh-param-file", ssl_parse_global_dh_param_file }, +#endif + { CFG_GLOBAL, "ssl-mode-async", ssl_parse_global_ssl_async }, +#if defined(USE_ENGINE) && !defined(OPENSSL_NO_ENGINE) + { CFG_GLOBAL, "ssl-engine", ssl_parse_global_ssl_engine }, +#endif +#ifdef HAVE_SSL_PROVIDERS + { CFG_GLOBAL, "ssl-propquery", ssl_parse_global_ssl_propquery }, + { CFG_GLOBAL, "ssl-provider", ssl_parse_global_ssl_provider }, + { CFG_GLOBAL, "ssl-provider-path", ssl_parse_global_ssl_provider_path }, +#endif + { CFG_GLOBAL, "ssl-skip-self-issued-ca", ssl_parse_skip_self_issued_ca }, + { CFG_GLOBAL, "tune.ssl.cachesize", ssl_parse_global_int }, +#ifndef OPENSSL_NO_DH + { CFG_GLOBAL, "tune.ssl.default-dh-param", ssl_parse_global_default_dh }, +#endif + { CFG_GLOBAL, "tune.ssl.force-private-cache", ssl_parse_global_private_cache }, + { CFG_GLOBAL, "tune.ssl.lifetime", ssl_parse_global_lifetime }, + { CFG_GLOBAL, "tune.ssl.maxrecord", ssl_parse_global_int }, + { CFG_GLOBAL, "tune.ssl.hard-maxrecord", ssl_parse_global_int }, + { CFG_GLOBAL, "tune.ssl.ssl-ctx-cache-size", ssl_parse_global_int }, + { CFG_GLOBAL, "tune.ssl.capture-cipherlist-size", ssl_parse_global_capture_buffer }, + { CFG_GLOBAL, "tune.ssl.capture-buffer-size", ssl_parse_global_capture_buffer }, + { CFG_GLOBAL, "tune.ssl.keylog", ssl_parse_global_keylog }, + { CFG_GLOBAL, "ssl-default-bind-ciphers", ssl_parse_global_ciphers }, + { CFG_GLOBAL, "ssl-default-server-ciphers", ssl_parse_global_ciphers }, +#if defined(SSL_CTX_set1_curves_list) + { CFG_GLOBAL, "ssl-default-bind-curves", ssl_parse_global_curves }, + { CFG_GLOBAL, "ssl-default-server-curves", ssl_parse_global_curves }, +#endif +#if defined(SSL_CTX_set1_sigalgs_list) + { CFG_GLOBAL, "ssl-default-bind-sigalgs", ssl_parse_global_sigalgs }, + { CFG_GLOBAL, "ssl-default-server-sigalgs", ssl_parse_global_sigalgs }, +#endif +#if defined(SSL_CTX_set1_client_sigalgs_list) + { CFG_GLOBAL, "ssl-default-bind-client-sigalgs", ssl_parse_global_client_sigalgs }, + { CFG_GLOBAL, "ssl-default-server-client-sigalgs", ssl_parse_global_client_sigalgs }, +#endif + { CFG_GLOBAL, "ssl-default-bind-ciphersuites", ssl_parse_global_ciphersuites }, + { CFG_GLOBAL, "ssl-default-server-ciphersuites", ssl_parse_global_ciphersuites }, + { CFG_GLOBAL, "ssl-load-extra-files", ssl_parse_global_extra_files }, + { CFG_GLOBAL, "ssl-load-extra-del-ext", ssl_parse_global_extra_noext }, +#ifndef OPENSSL_NO_OCSP + { CFG_GLOBAL, "tune.ssl.ocsp-update.maxdelay", ssl_parse_global_ocsp_maxdelay }, + { CFG_GLOBAL, "tune.ssl.ocsp-update.mindelay", ssl_parse_global_ocsp_mindelay }, +#endif + { 0, NULL, NULL }, +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); diff --git a/src/cfgparse-tcp.c b/src/cfgparse-tcp.c new file mode 100644 index 0000000..a4f6f29 --- /dev/null +++ b/src/cfgparse-tcp.c @@ -0,0 +1,275 @@ +/* + * Configuration parsing for TCP (bind and server keywords) + * + * Copyright 2000-2020 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published 
by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <ctype.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include <netinet/tcp.h> +#include <netinet/in.h> + +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/errors.h> +#include <haproxy/list.h> +#include <haproxy/listener.h> +#include <haproxy/namespace.h> +#include <haproxy/proxy-t.h> +#include <haproxy/server.h> +#include <haproxy/tools.h> + + +#ifdef IPV6_V6ONLY +/* parse the "v4v6" bind keyword */ +static int bind_parse_v4v6(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + conf->settings.options |= RX_O_V4V6; + return 0; +} + +/* parse the "v6only" bind keyword */ +static int bind_parse_v6only(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + conf->settings.options |= RX_O_V6ONLY; + return 0; +} +#endif + +#ifdef CONFIG_HAP_TRANSPARENT +/* parse the "transparent" bind keyword */ +static int bind_parse_transparent(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + conf->settings.options |= RX_O_FOREIGN; + return 0; +} +#endif + +#if defined(TCP_DEFER_ACCEPT) || defined(SO_ACCEPTFILTER) +/* parse the "defer-accept" bind keyword */ +static int bind_parse_defer_accept(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + conf->options |= BC_O_DEF_ACCEPT; + return 0; +} +#endif + +#ifdef TCP_FASTOPEN +/* parse the "tfo" bind keyword */ +static int bind_parse_tfo(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + conf->options |= BC_O_TCP_FO; + return 0; +} +#endif + +#ifdef TCP_MAXSEG +/* parse the "mss" bind keyword */ +static int bind_parse_mss(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + int mss; + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing MSS value", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + mss = atoi(args[cur_arg + 1]); + if (!mss || abs(mss) > 65535) { + memprintf(err, "'%s' : expects an MSS with an absolute value between 1 and 65535", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + conf->maxseg = mss; + return 0; +} +#endif + +#ifdef TCP_USER_TIMEOUT +/* parse the "tcp-ut" bind keyword */ +static int bind_parse_tcp_ut(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + const char *ptr = NULL; + unsigned int timeout; + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing TCP User Timeout value", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + ptr = parse_time_err(args[cur_arg + 1], &timeout, TIME_UNIT_MS); + if (ptr == PARSE_TIME_OVER) { + memprintf(err, "timer overflow in argument '%s' to '%s' (maximum value is 2147483647 ms or ~24.8 days)", + args[cur_arg+1], args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + else if (ptr == PARSE_TIME_UNDER) { + memprintf(err, "timer underflow in argument '%s' to '%s' (minimum non-null value is 1 ms)", + args[cur_arg+1], args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + else if (ptr) { + memprintf(err, "'%s' : expects a positive delay in milliseconds", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + conf->tcp_ut = timeout; + return 0; +} +#endif + +#ifdef SO_BINDTODEVICE +/* parse the "interface" bind keyword */ +static int bind_parse_interface(char 
**args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing interface name", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + ha_free(&conf->settings.interface); + conf->settings.interface = strdup(args[cur_arg + 1]); + return 0; +} +#endif + +#ifdef USE_NS +/* parse the "namespace" bind keyword */ +static int bind_parse_namespace(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + char *namespace = NULL; + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing namespace id", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + namespace = args[cur_arg + 1]; + + conf->settings.netns = netns_store_lookup(namespace, strlen(namespace)); + + if (conf->settings.netns == NULL) + conf->settings.netns = netns_store_insert(namespace); + + if (conf->settings.netns == NULL) { + ha_alert("Cannot open namespace '%s'.\n", args[cur_arg + 1]); + return ERR_ALERT | ERR_FATAL; + } + return 0; +} +#endif + +#ifdef TCP_USER_TIMEOUT +/* parse the "tcp-ut" server keyword */ +static int srv_parse_tcp_ut(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + const char *ptr = NULL; + unsigned int timeout; + + if (!*args[*cur_arg + 1]) { + memprintf(err, "'%s' : missing TCP User Timeout value", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + ptr = parse_time_err(args[*cur_arg + 1], &timeout, TIME_UNIT_MS); + if (ptr == PARSE_TIME_OVER) { + memprintf(err, "timer overflow in argument '%s' to '%s' (maximum value is 2147483647 ms or ~24.8 days)", + args[*cur_arg+1], args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + else if (ptr == PARSE_TIME_UNDER) { + memprintf(err, "timer underflow in argument '%s' to '%s' (minimum non-null value is 1 ms)", + args[*cur_arg+1], args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + else if (ptr) { + memprintf(err, "'%s' : expects a positive delay in milliseconds", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if (newsrv->addr.ss_family == AF_INET || newsrv->addr.ss_family == AF_INET6) + newsrv->tcp_ut = timeout; + + return 0; +} +#endif + + +/************************************************************************/ +/* All supported bind keywords must be declared here. */ +/************************************************************************/ + +/* Note: must not be declared <const> as its list will be overwritten. + * Please take care of keeping this list alphabetically sorted, doing so helps + * all code contributors. + * Optional keywords are also declared with a NULL ->parse() function so that + * the config parser can report an appropriate error when a known keyword was + * not enabled. 
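+ *
+ * Illustrative (assumed) bind line combining several of the keywords
+ * registered below:
+ *   bind :443 tfo mss 1440 tcp-ut 30s interface eth0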
+ */ +static struct bind_kw_list bind_kws = { "TCP", { }, { +#if defined(TCP_DEFER_ACCEPT) || defined(SO_ACCEPTFILTER) + { "defer-accept", bind_parse_defer_accept, 0 }, /* wait for some data for 1 second max before doing accept */ +#endif +#ifdef SO_BINDTODEVICE + { "interface", bind_parse_interface, 1 }, /* specifically bind to this interface */ +#endif +#ifdef TCP_MAXSEG + { "mss", bind_parse_mss, 1 }, /* set MSS of listening socket */ +#endif +#ifdef TCP_USER_TIMEOUT + { "tcp-ut", bind_parse_tcp_ut, 1 }, /* set User Timeout on listening socket */ +#endif +#ifdef TCP_FASTOPEN + { "tfo", bind_parse_tfo, 0 }, /* enable TCP_FASTOPEN of listening socket */ +#endif +#ifdef CONFIG_HAP_TRANSPARENT + { "transparent", bind_parse_transparent, 0 }, /* transparently bind to the specified addresses */ +#endif +#ifdef IPV6_V6ONLY + { "v4v6", bind_parse_v4v6, 0 }, /* force socket to bind to IPv4+IPv6 */ + { "v6only", bind_parse_v6only, 0 }, /* force socket to bind to IPv6 only */ +#endif +#ifdef USE_NS + { "namespace", bind_parse_namespace, 1 }, +#endif + /* the versions with the NULL parse function*/ + { "defer-accept", NULL, 0 }, + { "interface", NULL, 1 }, + { "mss", NULL, 1 }, + { "transparent", NULL, 0 }, + { "v4v6", NULL, 0 }, + { "v6only", NULL, 0 }, + { NULL, NULL, 0 }, +}}; + +INITCALL1(STG_REGISTER, bind_register_keywords, &bind_kws); + +static struct srv_kw_list srv_kws = { "TCP", { }, { +#ifdef TCP_USER_TIMEOUT + { "tcp-ut", srv_parse_tcp_ut, 1, 1, 0 }, /* set TCP user timeout on server */ +#endif + { NULL, NULL, 0 }, +}}; + +INITCALL1(STG_REGISTER, srv_register_keywords, &srv_kws); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/cfgparse-unix.c b/src/cfgparse-unix.c new file mode 100644 index 0000000..b1fb1e2 --- /dev/null +++ b/src/cfgparse-unix.c @@ -0,0 +1,135 @@ +/* + * Configuration parsing for UNIX sockets (bind and server keywords) + * + * Copyright 2000-2020 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <ctype.h> +#include <errno.h> +#include <grp.h> +#include <pwd.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/un.h> + +#include <netinet/tcp.h> +#include <netinet/in.h> + +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/errors.h> +#include <haproxy/list.h> +#include <haproxy/listener.h> +#include <haproxy/namespace.h> +#include <haproxy/proxy-t.h> +#include <haproxy/server.h> +#include <haproxy/tools.h> + +/* parse the "mode" bind keyword */ +static int bind_parse_mode(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + char *endptr; + + conf->settings.ux.mode = strtol(args[cur_arg + 1], &endptr, 8); + + if (!*args[cur_arg + 1] || *endptr) { + memprintf(err, "'%s' : missing or invalid mode '%s' (octal integer expected)", args[cur_arg], args[cur_arg + 1]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +/* parse the "gid" bind keyword */ +static int bind_parse_gid(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing value", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + conf->settings.ux.gid = atol(args[cur_arg + 1]); + return 0; +} + +/* parse the "group" bind keyword */ +static int bind_parse_group(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + struct group *group; + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing group name", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + group = getgrnam(args[cur_arg + 1]); + if (!group) { + memprintf(err, "'%s' : unknown group name '%s'", args[cur_arg], args[cur_arg + 1]); + return ERR_ALERT | ERR_FATAL; + } + + conf->settings.ux.gid = group->gr_gid; + return 0; +} + +/* parse the "uid" bind keyword */ +static int bind_parse_uid(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing value", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + conf->settings.ux.uid = atol(args[cur_arg + 1]); + return 0; +} + +/* parse the "user" bind keyword */ +static int bind_parse_user(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + struct passwd *user; + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing user name", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + user = getpwnam(args[cur_arg + 1]); + if (!user) { + memprintf(err, "'%s' : unknown user name '%s'", args[cur_arg], args[cur_arg + 1]); + return ERR_ALERT | ERR_FATAL; + } + + conf->settings.ux.uid = user->pw_uid; + return 0; +} + +/* Note: must not be declared <const> as its list will be overwritten. + * Please take care of keeping this list alphabetically sorted, doing so helps + * all code contributors. + * Optional keywords are also declared with a NULL ->parse() function so that + * the config parser can report an appropriate error when a known keyword was + * not enabled. 
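+ *
+ * Illustrative (assumed) UNIX bind line using the keywords registered below:
+ *   bind /var/run/haproxy/admin.sock user haproxy group haproxy mode 660
+ * (the mode argument is parsed as an octal integer by bind_parse_mode above)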
+ */ +static struct bind_kw_list bind_kws = { "UNIX", { }, { + { "gid", bind_parse_gid, 1 }, /* set the socket's gid */ + { "group", bind_parse_group, 1 }, /* set the socket's gid from the group name */ + { "mode", bind_parse_mode, 1 }, /* set the socket's mode (eg: 0644)*/ + { "uid", bind_parse_uid, 1 }, /* set the socket's uid */ + { "user", bind_parse_user, 1 }, /* set the socket's uid from the user name */ + { NULL, NULL, 0 }, +}}; + +INITCALL1(STG_REGISTER, bind_register_keywords, &bind_kws); diff --git a/src/cfgparse.c b/src/cfgparse.c new file mode 100644 index 0000000..2744f97 --- /dev/null +++ b/src/cfgparse.c @@ -0,0 +1,4798 @@ +/* + * Configuration parser + * + * Copyright 2000-2011 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +/* This is to have crypt() and sched_setaffinity() defined on Linux */ +#define _GNU_SOURCE + +#ifdef USE_LIBCRYPT +#ifdef USE_CRYPT_H +/* some platforms such as Solaris need this */ +#include <crypt.h> +#endif +#endif /* USE_LIBCRYPT */ + +#include <dirent.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <netdb.h> +#include <ctype.h> +#include <pwd.h> +#include <grp.h> +#include <errno.h> +#ifdef USE_CPU_AFFINITY +#include <sched.h> +#endif +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +#include <haproxy/acl.h> +#include <haproxy/action.h> +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/auth.h> +#include <haproxy/backend.h> +#include <haproxy/capture.h> +#include <haproxy/cfgcond.h> +#include <haproxy/cfgparse.h> +#include <haproxy/channel.h> +#include <haproxy/check.h> +#include <haproxy/chunk.h> +#include <haproxy/clock.h> +#ifdef USE_CPU_AFFINITY +#include <haproxy/cpuset.h> +#endif +#include <haproxy/connection.h> +#include <haproxy/errors.h> +#include <haproxy/filters.h> +#include <haproxy/frontend.h> +#include <haproxy/global.h> +#include <haproxy/http_ana.h> +#include <haproxy/http_rules.h> +#include <haproxy/lb_chash.h> +#include <haproxy/lb_fas.h> +#include <haproxy/lb_fwlc.h> +#include <haproxy/lb_fwrr.h> +#include <haproxy/lb_map.h> +#include <haproxy/listener.h> +#include <haproxy/log.h> +#include <haproxy/sink.h> +#include <haproxy/mailers.h> +#include <haproxy/namespace.h> +#include <haproxy/quic_sock.h> +#include <haproxy/obj_type-t.h> +#include <haproxy/openssl-compat.h> +#include <haproxy/peers-t.h> +#include <haproxy/peers.h> +#include <haproxy/pool.h> +#include <haproxy/protocol.h> +#include <haproxy/proxy.h> +#include <haproxy/resolvers.h> +#include <haproxy/sample.h> +#include <haproxy/server.h> +#include <haproxy/session.h> +#include <haproxy/stats-t.h> +#include <haproxy/stick_table.h> +#include <haproxy/stream.h> +#include <haproxy/task.h> +#include <haproxy/tcp_rules.h> +#include <haproxy/tcpcheck.h> +#include <haproxy/thread.h> +#include <haproxy/tools.h> +#include <haproxy/uri_auth-t.h> + + +/* Used to chain configuration sections definitions. This list + * stores struct cfg_section + */ +struct list sections = LIST_HEAD_INIT(sections); + +struct list postparsers = LIST_HEAD_INIT(postparsers); + +extern struct proxy *mworker_proxy; + +/* curproxy is only valid during parsing and will be NULL afterwards. 
*/ +struct proxy *curproxy = NULL; + +char *cursection = NULL; +int cfg_maxpconn = 0; /* # of simultaneous connections per proxy (-N) */ +int cfg_maxconn = 0; /* # of simultaneous connections, (-n) */ +char *cfg_scope = NULL; /* the current scope during the configuration parsing */ +int non_global_section_parsed = 0; + +/* how to handle default paths */ +static enum default_path_mode { + DEFAULT_PATH_CURRENT = 0, /* "current": paths are relative to CWD (this is the default) */ + DEFAULT_PATH_CONFIG, /* "config": paths are relative to config file */ + DEFAULT_PATH_PARENT, /* "parent": paths are relative to config file's ".." */ + DEFAULT_PATH_ORIGIN, /* "origin": paths are relative to default_path_origin */ +} default_path_mode; + +static char initial_cwd[PATH_MAX]; +static char current_cwd[PATH_MAX]; + +/* List head of all known configuration keywords */ +struct cfg_kw_list cfg_keywords = { + .list = LIST_HEAD_INIT(cfg_keywords.list) +}; + +/* + * converts <str> to a list of listeners which are dynamically allocated. + * The format is "{addr|'*'}:port[-end][,{addr|'*'}:port[-end]]*", where : + * - <addr> can be empty or "*" to indicate INADDR_ANY ; + * - <port> is a numerical port from 1 to 65535 ; + * - <end> indicates to use the range from <port> to <end> instead (inclusive). + * This can be repeated as many times as necessary, separated by a comma. + * Function returns 1 on success or 0 on error. In case of errors, if <err> is + * not NULL, it must be a valid pointer to either NULL or a freeable area that + * will be replaced with an error message. + */ +int str2listener(char *str, struct proxy *curproxy, struct bind_conf *bind_conf, const char *file, int line, char **err) +{ + struct protocol *proto; + char *next, *dupstr; + int port, end; + + next = dupstr = strdup(str); + + while (next && *next) { + struct sockaddr_storage *ss2; + int fd = -1; + + str = next; + /* 1) look for the end of the first address */ + if ((next = strchr(str, ',')) != NULL) { + *next++ = 0; + } + + ss2 = str2sa_range(str, NULL, &port, &end, &fd, &proto, NULL, err, + (curproxy == global.cli_fe || curproxy == mworker_proxy) ? NULL : global.unix_bind.prefix, + NULL, PA_O_RESOLVE | PA_O_PORT_OK | PA_O_PORT_MAND | PA_O_PORT_RANGE | + PA_O_SOCKET_FD | PA_O_STREAM | PA_O_XPRT); + if (!ss2) + goto fail; + + if (ss2->ss_family == AF_CUST_RHTTP_SRV) { + /* Check if a previous non-reverse-HTTP address is + * already defined. If DGRAM or STREAM is set, this + * indicates that we are currently parsing the second + * address or a later one. + */ + if (bind_conf->options & (BC_O_USE_SOCK_DGRAM|BC_O_USE_SOCK_STREAM) && + !(bind_conf->options & BC_O_REVERSE_HTTP)) { + memprintf(err, "Cannot mix reverse HTTP bind with others.\n"); + goto fail; + } + + bind_conf->rhttp_srvname = strdup(str + strlen("rhttp@")); + if (!bind_conf->rhttp_srvname) { + memprintf(err, "Cannot allocate reverse HTTP bind.\n"); + goto fail; + } + + bind_conf->options |= BC_O_REVERSE_HTTP; + } + else if (bind_conf->options & BC_O_REVERSE_HTTP) { + /* Standard address mixed with a previous reverse HTTP one. 
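+ * For instance the (hypothetical) line "bind rhttp@srv1,:443" is
+ * rejected here because the reverse HTTP address was parsed first.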
*/ + memprintf(err, "Cannot mix reverse HTTP bind with others.\n"); + goto fail; + } + + /* OK the address looks correct */ + if (proto->proto_type == PROTO_TYPE_DGRAM) + bind_conf->options |= BC_O_USE_SOCK_DGRAM; + else + bind_conf->options |= BC_O_USE_SOCK_STREAM; + + if (proto->xprt_type == PROTO_TYPE_DGRAM) + bind_conf->options |= BC_O_USE_XPRT_DGRAM; + else + bind_conf->options |= BC_O_USE_XPRT_STREAM; + + if (!create_listeners(bind_conf, ss2, port, end, fd, proto, err)) { + memprintf(err, "%s for address '%s'.\n", *err, str); + goto fail; + } + } /* end while(next) */ + free(dupstr); + return 1; + fail: + free(dupstr); + return 0; +} + +/* + * converts <str> to a list of datagram-oriented listeners which are dynamically + * allocated. + * The format is "{addr|'*'}:port[-end][,{addr|'*'}:port[-end]]*", where : + * - <addr> can be empty or "*" to indicate INADDR_ANY ; + * - <port> is a numerical port from 1 to 65535 ; + * - <end> indicates to use the range from <port> to <end> instead (inclusive). + * This can be repeated as many times as necessary, separated by a comma. + * Function returns 1 on success or 0 on error. In case of errors, if <err> is + * not NULL, it must be a valid pointer to either NULL or a freeable area that + * will be replaced with an error message. + */ +int str2receiver(char *str, struct proxy *curproxy, struct bind_conf *bind_conf, const char *file, int line, char **err) +{ + struct protocol *proto; + char *next, *dupstr; + int port, end; + + next = dupstr = strdup(str); + + while (next && *next) { + struct sockaddr_storage *ss2; + int fd = -1; + + str = next; + /* 1) look for the end of the first address */ + if ((next = strchr(str, ',')) != NULL) { + *next++ = 0; + } + + ss2 = str2sa_range(str, NULL, &port, &end, &fd, &proto, NULL, err, + curproxy == global.cli_fe ? NULL : global.unix_bind.prefix, + NULL, PA_O_RESOLVE | PA_O_PORT_OK | PA_O_PORT_MAND | PA_O_PORT_RANGE | + PA_O_SOCKET_FD | PA_O_DGRAM | PA_O_XPRT); + if (!ss2) + goto fail; + + /* OK the address looks correct */ + if (!create_listeners(bind_conf, ss2, port, end, fd, proto, err)) { + memprintf(err, "%s for address '%s'.\n", *err, str); + goto fail; + } + } /* end while(next) */ + free(dupstr); + return 1; + fail: + free(dupstr); + return 0; +} + +/* + * Sends a warning if proxy <proxy> does not have at least one of the + * capabilities in <cap>. An optional <hint> may be added at the end + * of the warning to help the user. Returns 1 if a warning was emitted + * or 0 if the condition is valid. + */ +int warnifnotcap(struct proxy *proxy, int cap, const char *file, int line, const char *arg, const char *hint) +{ + char *msg; + + switch (cap) { + case PR_CAP_BE: msg = "no backend"; break; + case PR_CAP_FE: msg = "no frontend"; break; + case PR_CAP_BE|PR_CAP_FE: msg = "neither frontend nor backend"; break; + default: msg = "not enough"; break; + } + + if (!(proxy->cap & cap)) { + ha_warning("parsing [%s:%d] : '%s' ignored because %s '%s' has %s capability.%s\n", + file, line, arg, proxy_type_str(proxy), proxy->id, msg, hint ? hint : ""); + return 1; + } + return 0; +} + +/* + * Sends an alert if proxy <proxy> does not have at least one of the + * capabilities in <cap>. An optional <hint> may be added at the end + * of the alert to help the user. Returns 1 if an alert was emitted + * or 0 if the condition is valid. 
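+ * A typical (illustrative) case is a backend-only keyword used in a
+ * section that only has the frontend capability.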
+ */ +int failifnotcap(struct proxy *proxy, int cap, const char *file, int line, const char *arg, const char *hint) +{ + char *msg; + + switch (cap) { + case PR_CAP_BE: msg = "no backend"; break; + case PR_CAP_FE: msg = "no frontend"; break; + case PR_CAP_BE|PR_CAP_FE: msg = "neither frontend nor backend"; break; + default: msg = "not enough"; break; + } + + if (!(proxy->cap & cap)) { + ha_alert("parsing [%s:%d] : '%s' not allowed because %s '%s' has %s capability.%s\n", + file, line, arg, proxy_type_str(proxy), proxy->id, msg, hint ? hint : ""); + return 1; + } + return 0; +} + +/* + * Report an error in <msg> when there are too many arguments. This version is + * intended to be used by keyword parsers so that the message will be included + * into the general error message. The index is the current keyword in args. + * Return 0 if the number of arguments is correct, otherwise build a message and + * return 1. Fill err_code with an ERR_ALERT and an ERR_FATAL if not null. The + * message may also be null; it will simply not be produced (useful to check only). + * <msg> and <err_code> are only affected on error. + */ +int too_many_args_idx(int maxarg, int index, char **args, char **msg, int *err_code) +{ + int i; + + if (!*args[index + maxarg + 1]) + return 0; + + if (msg) { + *msg = NULL; + memprintf(msg, "%s", args[0]); + for (i = 1; i <= index; i++) + memprintf(msg, "%s %s", *msg, args[i]); + + memprintf(msg, "'%s' cannot handle unexpected argument '%s'.", *msg, args[index + maxarg + 1]); + } + if (err_code) + *err_code |= ERR_ALERT | ERR_FATAL; + + return 1; +} + +/* + * same as too_many_args_idx with a 0 index + */ +int too_many_args(int maxarg, char **args, char **msg, int *err_code) +{ + return too_many_args_idx(maxarg, 0, args, msg, err_code); +} + +/* + * Report a fatal alert when there are too many arguments. + * The index is the current keyword in args. + * Return 0 if the number of arguments is correct, otherwise emit an alert and return 1. + * Fill err_code with an ERR_ALERT and an ERR_FATAL. + */ +int alertif_too_many_args_idx(int maxarg, int index, const char *file, int linenum, char **args, int *err_code) +{ + char *kw = NULL; + int i; + + if (!*args[index + maxarg + 1]) + return 0; + + memprintf(&kw, "%s", args[0]); + for (i = 1; i <= index; i++) { + memprintf(&kw, "%s %s", kw, args[i]); + } + + ha_alert("parsing [%s:%d] : '%s' cannot handle unexpected argument '%s'.\n", file, linenum, kw, args[index + maxarg + 1]); + free(kw); + *err_code |= ERR_ALERT | ERR_FATAL; + return 1; +} + +/* + * same as alertif_too_many_args_idx with a 0 index + */ +int alertif_too_many_args(int maxarg, const char *file, int linenum, char **args, int *err_code) +{ + return alertif_too_many_args_idx(maxarg, 0, file, linenum, args, err_code); +} + + +/* Report it if a request ACL condition uses some keywords that are incompatible + * with the place where the ACL is used. It returns either 0 or ERR_WARN so that + * its result can be or'ed with err_code. Note that <cond> may be NULL and then + * will be ignored. 
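+ * A typical (illustrative) trigger is an ACL relying on a response sample
+ * fetch being referenced from a request rule.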
+ */ +int warnif_cond_conflicts(const struct acl_cond *cond, unsigned int where, const char *file, int line) +{ + const struct acl *acl; + const char *kw; + + if (!cond) + return 0; + + acl = acl_cond_conflicts(cond, where); + if (acl) { + if (acl->name && *acl->name) + ha_warning("parsing [%s:%d] : acl '%s' will never match because it only involves keywords that are incompatible with '%s'\n", + file, line, acl->name, sample_ckp_names(where)); + else + ha_warning("parsing [%s:%d] : anonymous acl will never match because it uses keyword '%s' which is incompatible with '%s'\n", + file, line, LIST_ELEM(acl->expr.n, struct acl_expr *, list)->kw, sample_ckp_names(where)); + return ERR_WARN; + } + if (!acl_cond_kw_conflicts(cond, where, &acl, &kw)) + return 0; + + if (acl->name && *acl->name) + ha_warning("parsing [%s:%d] : acl '%s' involves keywords '%s' which is incompatible with '%s'\n", + file, line, acl->name, kw, sample_ckp_names(where)); + else + ha_warning("parsing [%s:%d] : anonymous acl involves keyword '%s' which is incompatible with '%s'\n", + file, line, kw, sample_ckp_names(where)); + return ERR_WARN; +} + +/* Report it if an ACL uses a L6 sample fetch from an HTTP proxy. It returns + * either 0 or ERR_WARN so that its result can be or'ed with err_code. Note that + * <cond> may be NULL and then will be ignored. +*/ +int warnif_tcp_http_cond(const struct proxy *px, const struct acl_cond *cond) +{ + if (!cond || px->mode != PR_MODE_HTTP) + return 0; + + if (cond->use & (SMP_USE_L6REQ|SMP_USE_L6RES)) { + ha_warning("Proxy '%s': L6 sample fetches ignored on HTTP proxies (declared at %s:%d).\n", + px->id, cond->file, cond->line); + return ERR_WARN; + } + return 0; +} + +/* try to find in <list> the word that looks closest to <word> by counting + * transitions between letters, digits and other characters. Will return the + * best matching word if found, otherwise NULL. An optional array of extra + * words to compare may be passed in <extra>, but it must then be terminated + * by a NULL entry. If unused it may be NULL. + */ +const char *cfg_find_best_match(const char *word, const struct list *list, int section, const char **extra) +{ + uint8_t word_sig[1024]; // 0..25=letter, 26=digit, 27=other, 28=begin, 29=end + uint8_t list_sig[1024]; + const struct cfg_kw_list *kwl; + int index; + const char *best_ptr = NULL; + int dist, best_dist = INT_MAX; + + make_word_fingerprint(word_sig, word); + list_for_each_entry(kwl, list, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) { + if (kwl->kw[index].section != section) + continue; + + make_word_fingerprint(list_sig, kwl->kw[index].kw); + dist = word_fingerprint_distance(word_sig, list_sig); + if (dist < best_dist) { + best_dist = dist; + best_ptr = kwl->kw[index].kw; + } + } + } + + while (extra && *extra) { + make_word_fingerprint(list_sig, *extra); + dist = word_fingerprint_distance(word_sig, list_sig); + if (dist < best_dist) { + best_dist = dist; + best_ptr = *extra; + } + extra++; + } + + if (best_dist > 2 * strlen(word) || (best_ptr && best_dist > 2 * strlen(best_ptr))) + best_ptr = NULL; + return best_ptr; +} + +/* Parse a string representing a process number or a set of processes. It must + * be "all", "odd", "even", a number between 1 and <max> or a range with + * two such numbers delimited by a dash ('-'). On success, it returns + * 0. otherwise it returns 1 with an error message in <err>. + * + * Note: this function can also be used to parse a thread number or a set of + * threads. 
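+ *
+ * Accepted forms thus include: "all", "odd", "even", "1", "2-4", "3-"
+ * (meaning 3 to <max>), and, when <autoinc> is non-NULL, the "auto:"
+ * prefix as in "auto:1-4".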
+ */ +int parse_process_number(const char *arg, unsigned long *proc, int max, int *autoinc, char **err) +{ + if (autoinc) { + *autoinc = 0; + if (strncmp(arg, "auto:", 5) == 0) { + arg += 5; + *autoinc = 1; + } + } + + if (strcmp(arg, "all") == 0) + *proc |= ~0UL; + else if (strcmp(arg, "odd") == 0) + *proc |= ~0UL/3UL; /* 0x555....555 */ + else if (strcmp(arg, "even") == 0) + *proc |= (~0UL/3UL) << 1; /* 0xAAA...AAA */ + else { + const char *p, *dash = NULL; + unsigned int low, high; + + for (p = arg; *p; p++) { + if (*p == '-' && !dash) + dash = p; + else if (!isdigit((unsigned char)*p)) { + memprintf(err, "'%s' is not a valid number/range.", arg); + return -1; + } + } + + low = high = str2uic(arg); + if (dash) + high = ((!*(dash+1)) ? max : str2uic(dash + 1)); + + if (high < low) { + unsigned int swap = low; + low = high; + high = swap; + } + + if (low < 1 || low > max || high > max) { + memprintf(err, "'%s' is not a valid number/range." + " It supports numbers from 1 to %d.\n", + arg, max); + return 1; + } + + for (;low <= high; low++) + *proc |= 1UL << (low-1); + } + *proc &= ~0UL >> (LONGBITS - max); + + return 0; +} + +/* Allocate and initialize the frontend of a "peers" section found in + * file <file> at line <linenum> with <id> as ID. + * Return 0 if succeeded, -1 if not. + * Note that this function may be called from "default-server" + * or "peer" lines. + */ +static int init_peers_frontend(const char *file, int linenum, + const char *id, struct peers *peers) +{ + struct proxy *p; + + if (peers->peers_fe) { + p = peers->peers_fe; + goto out; + } + + p = calloc(1, sizeof *p); + if (!p) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + return -1; + } + + init_new_proxy(p); + peers_setup_frontend(p); + p->parent = peers; + /* Finally store this frontend. */ + peers->peers_fe = p; + + out: + if (id && !p->id) + p->id = strdup(id); + free(p->conf.file); + p->conf.args.file = p->conf.file = strdup(file); + if (linenum != -1) + p->conf.args.line = p->conf.line = linenum; + + return 0; +} + +/* Only change ->file, ->line and ->arg struct bind_conf member values + * if already present. + */ +static struct bind_conf *bind_conf_uniq_alloc(struct proxy *p, + const char *file, int line, + const char *arg, struct xprt_ops *xprt) +{ + struct bind_conf *bind_conf; + + if (!LIST_ISEMPTY(&p->conf.bind)) { + bind_conf = LIST_ELEM((&p->conf.bind)->n, typeof(bind_conf), by_fe); + /* + * We keep bind_conf->file and bind_conf->line unchanged + * to make them available for error messages + */ + if (arg) { + free(bind_conf->arg); + bind_conf->arg = strdup(arg); + } + } + else { + bind_conf = bind_conf_alloc(p, file, line, arg, xprt); + } + + return bind_conf; +} + +/* + * Allocate a new struct peer parsed at line <linenum> in file <file> + * to be added to <peers>. + * Returns the new allocated structure if succeeded, NULL if not. 
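+ *
+ * For example (illustrative), a "peers" section line such as:
+ *
+ *	peer haproxy2 192.168.0.2:1024
+ *
+ * ends up allocating one such structure, linked at the head of the
+ * <peers>->remote list.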
+ */
+static struct peer *cfg_peers_add_peer(struct peers *peers,
+                                       const char *file, int linenum,
+                                       const char *id, int local)
+{
+	struct peer *p;
+
+	p = calloc(1, sizeof *p);
+	if (!p) {
+		ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum);
+		return NULL;
+	}
+
+	/* the peers are linked backwards first */
+	peers->count++;
+	p->peers = peers;
+	p->next = peers->remote;
+	peers->remote = p;
+	p->conf.file = strdup(file);
+	p->conf.line = linenum;
+	p->last_change = ns_to_sec(now_ns);
+	p->xprt = xprt_get(XPRT_RAW);
+	p->sock_init_arg = NULL;
+	HA_SPIN_INIT(&p->lock);
+	if (id)
+		p->id = strdup(id);
+	if (local) {
+		p->local = 1;
+		peers->local = p;
+	}
+
+	return p;
+}
+
+/*
+ * Parse a line in a "peers" section.
+ * Returns the error code, 0 if OK, or any combination of :
+ *  - ERR_ABORT: must abort ASAP
+ *  - ERR_FATAL: we can continue parsing but not start the service
+ *  - ERR_WARN: a warning has been emitted
+ *  - ERR_ALERT: an alert has been emitted
+ * Only the two first ones can stop processing, the two others are just
+ * indicators.
+ */
+int cfg_parse_peers(const char *file, int linenum, char **args, int kwm)
+{
+	static struct peers *curpeers = NULL;
+	static int nb_shards = 0;
+	struct peer *newpeer = NULL;
+	const char *err;
+	struct bind_conf *bind_conf;
+	int err_code = 0;
+	char *errmsg = NULL;
+	static int bind_line, peer_line;
+
+	if (strcmp(args[0], "bind") == 0 || strcmp(args[0], "default-bind") == 0) {
+		int cur_arg;
+		struct bind_conf *bind_conf;
+		int ret;
+
+		cur_arg = 1;
+
+		if (init_peers_frontend(file, linenum, NULL, curpeers) != 0) {
+			err_code |= ERR_ALERT | ERR_ABORT;
+			goto out;
+		}
+
+		bind_conf = bind_conf_uniq_alloc(curpeers->peers_fe, file, linenum,
+		                                 args[1], xprt_get(XPRT_RAW));
+		if (!bind_conf) {
+			ha_alert("parsing [%s:%d] : '%s %s' : cannot allocate memory.\n", file, linenum, args[0], args[1]);
+			err_code |= ERR_FATAL;
+			goto out;
+		}
+
+		bind_conf->maxaccept = 1;
+		bind_conf->accept = session_accept_fd;
+		bind_conf->options |= BC_O_UNLIMITED; /* don't make the peers subject to global limits */
+
+		if (*args[0] == 'b') {
+			struct listener *l;
+
+			if (peer_line) {
+				ha_alert("parsing [%s:%d] : mixing \"peer\" and \"bind\" line is forbidden\n", file, linenum);
+				err_code |= ERR_ALERT | ERR_FATAL;
+				goto out;
+			}
+
+			if (!LIST_ISEMPTY(&bind_conf->listeners)) {
+				ha_alert("parsing [%s:%d] : One listener per \"peers\" section is authorized but another is already configured at [%s:%d].\n", file, linenum, bind_conf->file, bind_conf->line);
+				err_code |= ERR_FATAL;
+			}
+
+			if (!str2listener(args[1], curpeers->peers_fe, bind_conf, file, linenum, &errmsg)) {
+				if (errmsg && *errmsg) {
+					indent_msg(&errmsg, 2);
+					ha_alert("parsing [%s:%d] : '%s %s' : %s\n", file, linenum, args[0], args[1], errmsg);
+				}
+				else
+					ha_alert("parsing [%s:%d] : '%s %s' : error encountered while parsing listening address %s.\n",
+					         file, linenum, args[0], args[1], args[1]);
+				err_code |= ERR_FATAL;
+				goto out;
+			}
+
+			/* Only one listener supported. Compare first listener
+			 * against the last one. It must be the same one.
+			 */
+			if (bind_conf->listeners.n != bind_conf->listeners.p) {
+				ha_alert("parsing [%s:%d] : Only one listener per \"peers\" section is authorized. 
Multiple listening addresses or port range are not supported.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + /* + * Newly allocated listener is at the end of the list + */ + l = LIST_ELEM(bind_conf->listeners.p, typeof(l), by_bind); + + global.maxsock++; /* for the listening socket */ + + bind_line = 1; + if (cfg_peers->local) { + newpeer = cfg_peers->local; + } + else { + /* This peer is local. + * Note that we do not set the peer ID. This latter is initialized + * when parsing "peer" or "server" line. + */ + newpeer = cfg_peers_add_peer(curpeers, file, linenum, NULL, 1); + if (!newpeer) { + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + } + newpeer->addr = l->rx.addr; + newpeer->proto = l->rx.proto; + cur_arg++; + } + + ret = bind_parse_args_list(bind_conf, args, cur_arg, cursection, file, linenum); + err_code |= ret; + if (ret != 0) + goto out; + } + else if (strcmp(args[0], "default-server") == 0) { + if (init_peers_frontend(file, -1, NULL, curpeers) != 0) { + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + err_code |= parse_server(file, linenum, args, curpeers->peers_fe, NULL, + SRV_PARSE_DEFAULT_SERVER|SRV_PARSE_IN_PEER_SECTION|SRV_PARSE_INITIAL_RESOLVE); + } + else if (strcmp(args[0], "log") == 0) { + if (init_peers_frontend(file, linenum, NULL, curpeers) != 0) { + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + if (!parse_logger(args, &curpeers->peers_fe->loggers, (kwm == KWM_NO), file, linenum, &errmsg)) { + ha_alert("parsing [%s:%d] : %s : %s\n", file, linenum, args[0], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "peers") == 0) { /* new peers section */ + /* Initialize these static variables when entering a new "peers" section*/ + bind_line = peer_line = 0; + if (!*args[1]) { + ha_alert("parsing [%s:%d] : missing name for peers section.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + + err = invalid_char(args[1]); + if (err) { + ha_alert("parsing [%s:%d] : character '%c' is not permitted in '%s' name '%s'.\n", + file, linenum, *err, args[0], args[1]); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + for (curpeers = cfg_peers; curpeers != NULL; curpeers = curpeers->next) { + /* + * If there are two proxies with the same name only following + * combinations are allowed: + */ + if (strcmp(curpeers->id, args[1]) == 0) { + ha_alert("Parsing [%s:%d]: peers section '%s' has the same name as another peers section declared at %s:%d.\n", + file, linenum, args[1], curpeers->conf.file, curpeers->conf.line); + err_code |= ERR_ALERT | ERR_FATAL; + } + } + + if ((curpeers = calloc(1, sizeof(*curpeers))) == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + curpeers->next = cfg_peers; + cfg_peers = curpeers; + curpeers->conf.file = strdup(file); + curpeers->conf.line = linenum; + curpeers->last_change = ns_to_sec(now_ns); + curpeers->id = strdup(args[1]); + curpeers->disabled = 0; + } + else if (strcmp(args[0], "peer") == 0 || + strcmp(args[0], "server") == 0) { /* peer or server definition */ + int local_peer, peer; + int parse_addr = 0; + + peer = *args[0] == 'p'; + local_peer = strcmp(args[1], localpeer) == 0; + /* The local peer may have already partially been parsed on a "bind" line. 
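+		 * That style declares the listening address on a "bind" line
+		 * and omits it from the local "server" line, e.g.
+		 * (illustrative):
+		 *
+		 *	peers mypeers
+		 *	    bind 10.0.0.1:1024
+		 *	    server haproxy1
+		 *	    server haproxy2 10.0.0.2:1024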
*/ + if (*args[0] == 'p') { + if (bind_line) { + ha_alert("parsing [%s:%d] : mixing \"peer\" and \"bind\" line is forbidden\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + peer_line = 1; + } + if (cfg_peers->local && !cfg_peers->local->id && local_peer) { + /* The local peer has already been initialized on a "bind" line. + * Let's use it and store its ID. + */ + newpeer = cfg_peers->local; + newpeer->id = strdup(localpeer); + } + else { + if (local_peer && cfg_peers->local) { + ha_alert("parsing [%s:%d] : '%s %s' : local peer name already referenced at %s:%d. %s\n", + file, linenum, args[0], args[1], + curpeers->peers_fe->conf.file, curpeers->peers_fe->conf.line, cfg_peers->local->id); + err_code |= ERR_FATAL; + goto out; + } + newpeer = cfg_peers_add_peer(curpeers, file, linenum, args[1], local_peer); + if (!newpeer) { + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + } + + /* Line number and peer ID are updated only if this peer is the local one. */ + if (init_peers_frontend(file, + newpeer->local ? linenum: -1, + newpeer->local ? newpeer->id : NULL, + curpeers) != 0) { + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + /* This initializes curpeer->peers->peers_fe->srv. + * The server address is parsed only if we are parsing a "peer" line, + * or if we are parsing a "server" line and the current peer is not the local one. + */ + parse_addr = (peer || !local_peer) ? SRV_PARSE_PARSE_ADDR : 0; + err_code |= parse_server(file, linenum, args, curpeers->peers_fe, NULL, + SRV_PARSE_IN_PEER_SECTION|parse_addr|SRV_PARSE_INITIAL_RESOLVE); + if (!curpeers->peers_fe->srv) { + /* Remove the newly allocated peer. */ + if (newpeer != curpeers->local) { + struct peer *p; + + p = curpeers->remote; + curpeers->remote = curpeers->remote->next; + free(p->id); + free(p); + } + goto out; + } + + if (nb_shards && curpeers->peers_fe->srv->shard > nb_shards) { + ha_warning("parsing [%s:%d] : '%s %s' : %d peer shard greater value than %d shards value is ignored.\n", + file, linenum, args[0], args[1], curpeers->peers_fe->srv->shard, nb_shards); + curpeers->peers_fe->srv->shard = 0; + err_code |= ERR_WARN; + } + + if (curpeers->peers_fe->srv->init_addr_methods || curpeers->peers_fe->srv->resolvers_id || + curpeers->peers_fe->srv->do_check || curpeers->peers_fe->srv->do_agent) { + ha_warning("parsing [%s:%d] : '%s %s' : init_addr, resolvers, check and agent are ignored for peers.\n", file, linenum, args[0], args[1]); + err_code |= ERR_WARN; + } + + /* If the peer address has just been parsed, let's copy it to <newpeer> + * and initializes ->proto. + */ + if (peer || !local_peer) { + newpeer->addr = curpeers->peers_fe->srv->addr; + newpeer->proto = protocol_lookup(newpeer->addr.ss_family, PROTO_TYPE_STREAM, 0); + } + + newpeer->xprt = xprt_get(XPRT_RAW); + newpeer->sock_init_arg = NULL; + HA_SPIN_INIT(&newpeer->lock); + + newpeer->srv = curpeers->peers_fe->srv; + if (!newpeer->local) + goto out; + + /* The lines above are reserved to "peer" lines. 
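+	 * More precisely, the code below only runs for a local "peer" line:
+	 * a "server" line stops here, while a local "peer" line also carries
+	 * in args[2] the address to listen on.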
*/ + if (*args[0] == 's') + goto out; + + bind_conf = bind_conf_uniq_alloc(curpeers->peers_fe, file, linenum, args[2], xprt_get(XPRT_RAW)); + if (!bind_conf) { + ha_alert("parsing [%s:%d] : '%s %s' : Cannot allocate memory.\n", file, linenum, args[0], args[1]); + err_code |= ERR_FATAL; + goto out; + } + + bind_conf->maxaccept = 1; + bind_conf->accept = session_accept_fd; + bind_conf->options |= BC_O_UNLIMITED; /* don't make the peers subject to global limits */ + + if (!LIST_ISEMPTY(&bind_conf->listeners)) { + ha_alert("parsing [%s:%d] : One listener per \"peers\" section is authorized but another is already configured at [%s:%d].\n", file, linenum, bind_conf->file, bind_conf->line); + err_code |= ERR_FATAL; + } + + if (!str2listener(args[2], curpeers->peers_fe, bind_conf, file, linenum, &errmsg)) { + if (errmsg && *errmsg) { + indent_msg(&errmsg, 2); + ha_alert("parsing [%s:%d] : '%s %s' : %s\n", file, linenum, args[0], args[1], errmsg); + } + else + ha_alert("parsing [%s:%d] : '%s %s' : error encountered while parsing listening address %s.\n", + file, linenum, args[0], args[1], args[2]); + err_code |= ERR_FATAL; + goto out; + } + + global.maxsock++; /* for the listening socket */ + } + else if (strcmp(args[0], "shards") == 0) { + char *endptr; + + if (!*args[1]) { + ha_alert("parsing [%s:%d] : '%s' : missing value\n", file, linenum, args[0]); + err_code |= ERR_FATAL; + goto out; + } + + curpeers->nb_shards = strtol(args[1], &endptr, 10); + if (*endptr != '\0') { + ha_alert("parsing [%s:%d] : '%s' : expects an integer argument, found '%s'\n", + file, linenum, args[0], args[1]); + err_code |= ERR_FATAL; + goto out; + } + + if (!curpeers->nb_shards) { + ha_alert("parsing [%s:%d] : '%s' : expects a strictly positive integer argument\n", + file, linenum, args[0]); + err_code |= ERR_FATAL; + goto out; + } + + nb_shards = curpeers->nb_shards; + } + else if (strcmp(args[0], "table") == 0) { + struct stktable *t, *other; + char *id; + size_t prefix_len; + + /* Line number and peer ID are updated only if this peer is the local one. */ + if (init_peers_frontend(file, -1, NULL, curpeers) != 0) { + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + /* Build the stick-table name, concatenating the "peers" section name + * followed by a '/' character and the table name argument. + */ + chunk_reset(&trash); + if (!chunk_strcpy(&trash, curpeers->id)) { + ha_alert("parsing [%s:%d]: '%s %s' : stick-table name too long.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + prefix_len = trash.data; + if (!chunk_memcat(&trash, "/", 1) || !chunk_strcat(&trash, args[1])) { + ha_alert("parsing [%s:%d]: '%s %s' : stick-table name too long.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + t = calloc(1, sizeof *t); + id = strdup(trash.area); + if (!t || !id) { + ha_alert("parsing [%s:%d]: '%s %s' : memory allocation failed\n", + file, linenum, args[0], args[1]); + free(t); + free(id); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + other = stktable_find_by_name(trash.area); + if (other) { + ha_alert("parsing [%s:%d] : stick-table name '%s' conflicts with table declared in %s '%s' at %s:%d.\n", + file, linenum, args[1], + other->proxy ? proxy_cap_str(other->proxy->cap) : "peers", + other->proxy ? 
+			         other->id : other->peers.p->id,
+			         other->conf.file, other->conf.line);
+			err_code |= ERR_ALERT | ERR_FATAL;
+			goto out;
+		}
+
+
+		err_code |= parse_stick_table(file, linenum, args, t, id, id + prefix_len, curpeers);
+		if (err_code & ERR_FATAL) {
+			free(t);
+			free(id);
+			goto out;
+		}
+
+		stktable_store_name(t);
+		t->next = stktables_list;
+		stktables_list = t;
+	}
+	else if (strcmp(args[0], "disabled") == 0) { /* disables this peers section */
+		curpeers->disabled |= PR_FL_DISABLED;
+	}
+	else if (strcmp(args[0], "enabled") == 0) { /* enables this peers section (used to revert a disabled default) */
+		curpeers->disabled = 0;
+	}
+	else if (*args[0] != 0) {
+		struct peers_kw_list *pkwl;
+		int index;
+		int rc = -1;
+
+		list_for_each_entry(pkwl, &peers_keywords.list, list) {
+			for (index = 0; pkwl->kw[index].kw != NULL; index++) {
+				if (strcmp(pkwl->kw[index].kw, args[0]) == 0) {
+					rc = pkwl->kw[index].parse(args, curpeers, file, linenum, &errmsg);
+					if (rc < 0) {
+						ha_alert("parsing [%s:%d] : %s\n", file, linenum, errmsg);
+						err_code |= ERR_ALERT | ERR_FATAL;
+						goto out;
+					}
+					else if (rc > 0) {
+						ha_warning("parsing [%s:%d] : %s\n", file, linenum, errmsg);
+						err_code |= ERR_WARN;
+						goto out;
+					}
+					goto out;
+				}
+			}
+		}
+
+		ha_alert("parsing [%s:%d] : unknown keyword '%s' in '%s' section\n", file, linenum, args[0], cursection);
+		err_code |= ERR_ALERT | ERR_FATAL;
+		goto out;
+	}
+
+out:
+	free(errmsg);
+	return err_code;
+}
+
+/*
+ * Parse a line in a "mailers" section.
+ * Returns the error code, 0 if OK, or any combination of :
+ *  - ERR_ABORT: must abort ASAP
+ *  - ERR_FATAL: we can continue parsing but not start the service
+ *  - ERR_WARN: a warning has been emitted
+ *  - ERR_ALERT: an alert has been emitted
+ * Only the two first ones can stop processing, the two others are just
+ * indicators.
+ */
+int cfg_parse_mailers(const char *file, int linenum, char **args, int kwm)
+{
+	static struct mailers *curmailers = NULL;
+	struct mailer *newmailer = NULL;
+	const char *err;
+	int err_code = 0;
+	char *errmsg = NULL;
+
+	if (strcmp(args[0], "mailers") == 0) { /* new mailers section */
+		if (!*args[1]) {
+			ha_alert("parsing [%s:%d] : missing name for mailers section.\n", file, linenum);
+			err_code |= ERR_ALERT | ERR_ABORT;
+			goto out;
+		}
+
+		err = invalid_char(args[1]);
+		if (err) {
+			ha_alert("parsing [%s:%d] : character '%c' is not permitted in '%s' name '%s'.\n",
+				 file, linenum, *err, args[0], args[1]);
+			err_code |= ERR_ALERT | ERR_ABORT;
+			goto out;
+		}
+
+		for (curmailers = mailers; curmailers != NULL; curmailers = curmailers->next) {
+			/*
+			 * If there are two proxies with the same name only following
+			 * combinations are allowed:
+			 */
+			if (strcmp(curmailers->id, args[1]) == 0) {
+				ha_alert("Parsing [%s:%d]: mailers section '%s' has the same name as another mailers section declared at %s:%d.\n",
+					 file, linenum, args[1], curmailers->conf.file, curmailers->conf.line);
+				err_code |= ERR_ALERT | ERR_FATAL;
+			}
+		}
+
+		if ((curmailers = calloc(1, sizeof(*curmailers))) == NULL) {
+			ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum);
+			err_code |= ERR_ALERT | ERR_ABORT;
+			goto out;
+		}
+
+		curmailers->next = mailers;
+		mailers = curmailers;
+		curmailers->conf.file = strdup(file);
+		curmailers->conf.line = linenum;
+		curmailers->id = strdup(args[1]);
+		curmailers->timeout.mail = DEF_MAILALERTTIME;/* XXX: Would like to skip to the next alert, if any, ASAP.
+			* But need enough time so that timeouts don't occur
+			* during tcp processing.
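+			* (An explicit "timeout mail" line in the section
+			* overrides this default, e.g. "timeout mail 10s";
+			* value shown is illustrative.)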
For now just us an arbitrary default. */ + } + else if (strcmp(args[0], "mailer") == 0) { /* mailer definition */ + struct sockaddr_storage *sk; + int port1, port2; + struct protocol *proto; + + if (!*args[2]) { + ha_alert("parsing [%s:%d] : '%s' expects <name> and <addr>[:<port>] as arguments.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + err = invalid_char(args[1]); + if (err) { + ha_alert("parsing [%s:%d] : character '%c' is not permitted in server name '%s'.\n", + file, linenum, *err, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if ((newmailer = calloc(1, sizeof(*newmailer))) == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + /* the mailers are linked backwards first */ + curmailers->count++; + newmailer->next = curmailers->mailer_list; + curmailers->mailer_list = newmailer; + newmailer->mailers = curmailers; + newmailer->conf.file = strdup(file); + newmailer->conf.line = linenum; + + newmailer->id = strdup(args[1]); + + sk = str2sa_range(args[2], NULL, &port1, &port2, NULL, &proto, NULL, + &errmsg, NULL, NULL, + PA_O_RESOLVE | PA_O_PORT_OK | PA_O_PORT_MAND | PA_O_STREAM | PA_O_XPRT | PA_O_CONNECT); + if (!sk) { + ha_alert("parsing [%s:%d] : '%s %s' : %s\n", file, linenum, args[0], args[1], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (proto->sock_prot != IPPROTO_TCP) { + ha_alert("parsing [%s:%d] : '%s %s' : TCP not supported for this address family.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + newmailer->addr = *sk; + newmailer->proto = proto; + newmailer->xprt = xprt_get(XPRT_RAW); + newmailer->sock_init_arg = NULL; + } + else if (strcmp(args[0], "timeout") == 0) { + if (!*args[1]) { + ha_alert("parsing [%s:%d] : '%s' expects 'mail' and <time> as arguments.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[1], "mail") == 0) { + const char *res; + unsigned int timeout_mail; + if (!*args[2]) { + ha_alert("parsing [%s:%d] : '%s %s' expects <time> as argument.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + res = parse_time_err(args[2], &timeout_mail, TIME_UNIT_MS); + if (res == PARSE_TIME_OVER) { + ha_alert("parsing [%s:%d]: timer overflow in argument <%s> to <%s %s>, maximum value is 2147483647 ms (~24.8 days).\n", + file, linenum, args[2], args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (res == PARSE_TIME_UNDER) { + ha_alert("parsing [%s:%d]: timer underflow in argument <%s> to <%s %s>, minimum non-null value is 1 ms.\n", + file, linenum, args[2], args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (res) { + ha_alert("parsing [%s:%d]: unexpected character '%c' in argument to <%s %s>.\n", + file, linenum, *res, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + curmailers->timeout.mail = timeout_mail; + } else { + ha_alert("parsing [%s:%d] : '%s' expects 'mail' and <time> as arguments got '%s'.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (*args[0] != 0) { + ha_alert("parsing [%s:%d] : unknown keyword '%s' in '%s' section\n", file, linenum, args[0], cursection); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + +out: + free(errmsg); + return err_code; +} + +void free_email_alert(struct proxy *p) +{ + 
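+	/* release all per-proxy "email-alert" settings: the mailers section
+	 * name, the sender and recipient addresses and the hostname */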
ha_free(&p->email_alert.mailers.name); + ha_free(&p->email_alert.from); + ha_free(&p->email_alert.to); + ha_free(&p->email_alert.myhostname); +} + + +int +cfg_parse_netns(const char *file, int linenum, char **args, int kwm) +{ +#ifdef USE_NS + const char *err; + const char *item = args[0]; + + if (strcmp(item, "namespace_list") == 0) { + return 0; + } + else if (strcmp(item, "namespace") == 0) { + size_t idx = 1; + const char *current; + while (*(current = args[idx++])) { + err = invalid_char(current); + if (err) { + ha_alert("parsing [%s:%d]: character '%c' is not permitted in '%s' name '%s'.\n", + file, linenum, *err, item, current); + return ERR_ALERT | ERR_FATAL; + } + + if (netns_store_lookup(current, strlen(current))) { + ha_alert("parsing [%s:%d]: Namespace '%s' is already added.\n", + file, linenum, current); + return ERR_ALERT | ERR_FATAL; + } + if (!netns_store_insert(current)) { + ha_alert("parsing [%s:%d]: Cannot open namespace '%s'.\n", + file, linenum, current); + return ERR_ALERT | ERR_FATAL; + } + } + } + + return 0; +#else + ha_alert("parsing [%s:%d]: namespace support is not compiled in.", + file, linenum); + return ERR_ALERT | ERR_FATAL; +#endif +} + +int +cfg_parse_users(const char *file, int linenum, char **args, int kwm) +{ + + int err_code = 0; + const char *err; + + if (strcmp(args[0], "userlist") == 0) { /* new userlist */ + struct userlist *newul; + + if (!*args[1]) { + ha_alert("parsing [%s:%d]: '%s' expects <name> as arguments.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + + err = invalid_char(args[1]); + if (err) { + ha_alert("parsing [%s:%d]: character '%c' is not permitted in '%s' name '%s'.\n", + file, linenum, *err, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + for (newul = userlist; newul; newul = newul->next) + if (strcmp(newul->name, args[1]) == 0) { + ha_warning("parsing [%s:%d]: ignoring duplicated userlist '%s'.\n", + file, linenum, args[1]); + err_code |= ERR_WARN; + goto out; + } + + newul = calloc(1, sizeof(*newul)); + if (!newul) { + ha_alert("parsing [%s:%d]: out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + newul->name = strdup(args[1]); + if (!newul->name) { + ha_alert("parsing [%s:%d]: out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + free(newul); + goto out; + } + + newul->next = userlist; + userlist = newul; + + } else if (strcmp(args[0], "group") == 0) { /* new group */ + int cur_arg; + const char *err; + struct auth_groups *ag; + + if (!*args[1]) { + ha_alert("parsing [%s:%d]: '%s' expects <name> as arguments.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + err = invalid_char(args[1]); + if (err) { + ha_alert("parsing [%s:%d]: character '%c' is not permitted in '%s' name '%s'.\n", + file, linenum, *err, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (!userlist) + goto out; + + for (ag = userlist->groups; ag; ag = ag->next) + if (strcmp(ag->name, args[1]) == 0) { + ha_warning("parsing [%s:%d]: ignoring duplicated group '%s' in userlist '%s'.\n", + file, linenum, args[1], userlist->name); + err_code |= ERR_ALERT; + goto out; + } + + ag = calloc(1, sizeof(*ag)); + if (!ag) { + ha_alert("parsing [%s:%d]: out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + ag->name = strdup(args[1]); + if (!ag->name) { + ha_alert("parsing 
[%s:%d]: out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + free(ag); + goto out; + } + + cur_arg = 2; + + while (*args[cur_arg]) { + if (strcmp(args[cur_arg], "users") == 0) { + ag->groupusers = strdup(args[cur_arg + 1]); + cur_arg += 2; + continue; + } else { + ha_alert("parsing [%s:%d]: '%s' only supports 'users' option.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + free(ag->groupusers); + free(ag->name); + free(ag); + goto out; + } + } + + ag->next = userlist->groups; + userlist->groups = ag; + + } else if (strcmp(args[0], "user") == 0) { /* new user */ + struct auth_users *newuser; + int cur_arg; + + if (!*args[1]) { + ha_alert("parsing [%s:%d]: '%s' expects <name> as arguments.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (!userlist) + goto out; + + for (newuser = userlist->users; newuser; newuser = newuser->next) + if (strcmp(newuser->user, args[1]) == 0) { + ha_warning("parsing [%s:%d]: ignoring duplicated user '%s' in userlist '%s'.\n", + file, linenum, args[1], userlist->name); + err_code |= ERR_ALERT; + goto out; + } + + newuser = calloc(1, sizeof(*newuser)); + if (!newuser) { + ha_alert("parsing [%s:%d]: out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + newuser->user = strdup(args[1]); + + newuser->next = userlist->users; + userlist->users = newuser; + + cur_arg = 2; + + while (*args[cur_arg]) { + if (strcmp(args[cur_arg], "password") == 0) { +#ifdef USE_LIBCRYPT + if (!crypt("", args[cur_arg + 1])) { + ha_alert("parsing [%s:%d]: the encrypted password used for user '%s' is not supported by crypt(3).\n", + file, linenum, newuser->user); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } +#else + ha_warning("parsing [%s:%d]: no crypt(3) support compiled, encrypted passwords will not work.\n", + file, linenum); + err_code |= ERR_ALERT; +#endif + newuser->pass = strdup(args[cur_arg + 1]); + cur_arg += 2; + continue; + } else if (strcmp(args[cur_arg], "insecure-password") == 0) { + newuser->pass = strdup(args[cur_arg + 1]); + newuser->flags |= AU_O_INSECURE; + cur_arg += 2; + continue; + } else if (strcmp(args[cur_arg], "groups") == 0) { + newuser->u.groups_names = strdup(args[cur_arg + 1]); + cur_arg += 2; + continue; + } else { + ha_alert("parsing [%s:%d]: '%s' only supports 'password', 'insecure-password' and 'groups' options.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + } else { + ha_alert("parsing [%s:%d]: unknown keyword '%s' in '%s' section\n", file, linenum, args[0], "users"); + err_code |= ERR_ALERT | ERR_FATAL; + } + +out: + return err_code; +} + +int +cfg_parse_scope(const char *file, int linenum, char *line) +{ + char *beg, *end, *scope = NULL; + int err_code = 0; + const char *err; + + beg = line + 1; + end = strchr(beg, ']'); + + /* Detect end of scope declaration */ + if (!end || end == beg) { + ha_alert("parsing [%s:%d] : empty scope name is forbidden.\n", + file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + /* Get scope name and check its validity */ + scope = my_strndup(beg, end-beg); + err = invalid_char(scope); + if (err) { + ha_alert("parsing [%s:%d] : character '%c' is not permitted in a scope name.\n", + file, linenum, *err); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + /* Be sure to have a scope declaration alone on its line */ + line = end+1; + while (isspace((unsigned char)*line)) + line++; + if (*line && *line != '#' && *line != '\n' && *line != 
'\r') { + ha_alert("parsing [%s:%d] : character '%c' is not permitted after scope declaration.\n", + file, linenum, *line); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + /* We have a valid scope declaration, save it */ + free(cfg_scope); + cfg_scope = scope; + scope = NULL; + + out: + free(scope); + return err_code; +} + +int +cfg_parse_track_sc_num(unsigned int *track_sc_num, + const char *arg, const char *end, char **errmsg) +{ + const char *p; + unsigned int num; + + p = arg; + num = read_uint64(&arg, end); + + if (arg != end) { + memprintf(errmsg, "Wrong track-sc number '%s'", p); + return -1; + } + + if (num >= global.tune.nb_stk_ctr) { + if (!global.tune.nb_stk_ctr) + memprintf(errmsg, "%u track-sc number not usable, stick-counters " + "are disabled by tune.stick-counters", num); + else + memprintf(errmsg, "%u track-sc number exceeding " + "%d (tune.stick-counters-1) value", num, global.tune.nb_stk_ctr - 1); + return -1; + } + + *track_sc_num = num; + return 0; +} + +/* + * Detect a global section after a non-global one and output a diagnostic + * warning. + */ +static void check_section_position(char *section_name, const char *file, int linenum) +{ + if (strcmp(section_name, "global") == 0) { + if ((global.mode & MODE_DIAG) && non_global_section_parsed == 1) + _ha_diag_warning("parsing [%s:%d] : global section detected after a non-global one, the prevalence of their statements is unspecified\n", file, linenum); + } + else if (non_global_section_parsed == 0) { + non_global_section_parsed = 1; + } +} + +/* apply the current default_path setting for config file <file>, and + * optionally replace the current path to <origin> if not NULL while the + * default-path mode is set to "origin". Errors are returned into an + * allocated string passed to <err> if it's not NULL. Returns 0 on failure + * or non-zero on success. + */ +static int cfg_apply_default_path(const char *file, const char *origin, char **err) +{ + const char *beg, *end; + + /* make path start at <beg> and end before <end>, and switch it to "" + * if no slash was passed. 
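+	 * For example (illustrative): in "config" mode with the file
+	 * "/etc/haproxy/haproxy.cfg", <beg>..<end> covers "/etc/haproxy",
+	 * which then becomes the base directory for relative paths.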
+ */ + beg = file; + end = strrchr(beg, '/'); + if (!end) + end = beg; + + if (!*initial_cwd) { + if (getcwd(initial_cwd, sizeof(initial_cwd)) == NULL) { + if (err) + memprintf(err, "Impossible to retrieve startup directory name: %s", strerror(errno)); + return 0; + } + } + else if (chdir(initial_cwd) == -1) { + if (err) + memprintf(err, "Impossible to get back to initial directory '%s': %s", initial_cwd, strerror(errno)); + return 0; + } + + /* OK now we're (back) to initial_cwd */ + + switch (default_path_mode) { + case DEFAULT_PATH_CURRENT: + /* current_cwd never set, nothing to do */ + return 1; + + case DEFAULT_PATH_ORIGIN: + /* current_cwd set in the config */ + if (origin && + snprintf(current_cwd, sizeof(current_cwd), "%s", origin) > sizeof(current_cwd)) { + if (err) + memprintf(err, "Absolute path too long: '%s'", origin); + return 0; + } + break; + + case DEFAULT_PATH_CONFIG: + if (end - beg >= sizeof(current_cwd)) { + if (err) + memprintf(err, "Config file path too long, cannot use for relative paths: '%s'", file); + return 0; + } + memcpy(current_cwd, beg, end - beg); + current_cwd[end - beg] = 0; + break; + + case DEFAULT_PATH_PARENT: + if (end - beg + 3 >= sizeof(current_cwd)) { + if (err) + memprintf(err, "Config file path too long, cannot use for relative paths: '%s'", file); + return 0; + } + memcpy(current_cwd, beg, end - beg); + if (end > beg) + memcpy(current_cwd + (end - beg), "/..\0", 4); + else + memcpy(current_cwd + (end - beg), "..\0", 3); + break; + } + + if (*current_cwd && chdir(current_cwd) == -1) { + if (err) + memprintf(err, "Impossible to get back to directory '%s': %s", initial_cwd, strerror(errno)); + return 0; + } + + return 1; +} + +/* parses a global "default-path" directive. */ +static int cfg_parse_global_def_path(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + int ret = -1; + + /* "current", "config", "parent", "origin <path>" */ + + if (strcmp(args[1], "current") == 0) + default_path_mode = DEFAULT_PATH_CURRENT; + else if (strcmp(args[1], "config") == 0) + default_path_mode = DEFAULT_PATH_CONFIG; + else if (strcmp(args[1], "parent") == 0) + default_path_mode = DEFAULT_PATH_PARENT; + else if (strcmp(args[1], "origin") == 0) + default_path_mode = DEFAULT_PATH_ORIGIN; + else { + memprintf(err, "%s default-path mode '%s' for '%s', supported modes include 'current', 'config', 'parent', and 'origin'.", *args[1] ? "unsupported" : "missing", args[1], args[0]); + goto end; + } + + if (default_path_mode == DEFAULT_PATH_ORIGIN) { + if (!*args[2]) { + memprintf(err, "'%s %s' expects a directory as an argument.", args[0], args[1]); + goto end; + } + if (!cfg_apply_default_path(file, args[2], err)) { + memprintf(err, "couldn't set '%s' to origin '%s': %s.", args[0], args[2], *err); + goto end; + } + } + else if (!cfg_apply_default_path(file, NULL, err)) { + memprintf(err, "couldn't set '%s' to '%s': %s.", args[0], args[1], *err); + goto end; + } + + /* note that once applied, the path is immediately updated */ + + ret = 0; + end: + return ret; +} + +/* + * This function reads and parses the configuration file given in the argument. 
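+ * Each line is tokenized by parse_line(), ".if"/".elif"/".else"/".endif"
+ * blocks are evaluated, and the resulting words are handed to the parser
+ * of the section being read.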
+ * Returns the error code, 0 if OK, -1 if the config file couldn't be opened, + * or any combination of : + * - ERR_ABORT: must abort ASAP + * - ERR_FATAL: we can continue parsing but not start the service + * - ERR_WARN: a warning has been emitted + * - ERR_ALERT: an alert has been emitted + * Only the two first ones can stop processing, the two others are just + * indicators. + */ +int readcfgfile(const char *file) +{ + char *thisline = NULL; + int linesize = LINESIZE; + FILE *f = NULL; + int linenum = 0; + int err_code = 0; + struct cfg_section *cs = NULL, *pcs = NULL; + struct cfg_section *ics; + int readbytes = 0; + char *outline = NULL; + size_t outlen = 0; + size_t outlinesize = 0; + int fatal = 0; + int missing_lf = -1; + int nested_cond_lvl = 0; + enum nested_cond_state nested_conds[MAXNESTEDCONDS]; + char *errmsg = NULL; + + global.cfg_curr_line = 0; + global.cfg_curr_file = file; + + if ((thisline = malloc(sizeof(*thisline) * linesize)) == NULL) { + ha_alert("Out of memory trying to allocate a buffer for a configuration line.\n"); + err_code = -1; + goto err; + } + + if ((f = fopen(file,"r")) == NULL) { + err_code = -1; + goto err; + } + + /* change to the new dir if required */ + if (!cfg_apply_default_path(file, NULL, &errmsg)) { + ha_alert("parsing [%s:%d]: failed to apply default-path: %s.\n", file, linenum, errmsg); + free(errmsg); + err_code = -1; + goto err; + } + +next_line: + while (fgets(thisline + readbytes, linesize - readbytes, f) != NULL) { + int arg, kwm = KWM_STD; + char *end; + char *args[MAX_LINE_ARGS + 1]; + char *line = thisline; + + if (missing_lf != -1) { + ha_alert("parsing [%s:%d]: Stray NUL character at position %d.\n", + file, linenum, (missing_lf + 1)); + err_code |= ERR_ALERT | ERR_FATAL; + missing_lf = -1; + break; + } + + linenum++; + global.cfg_curr_line = linenum; + + if (fatal >= 50) { + ha_alert("parsing [%s:%d]: too many fatal errors (%d), stopping now.\n", file, linenum, fatal); + break; + } + + end = line + strlen(line); + + if (end-line == linesize-1 && *(end-1) != '\n') { + /* Check if we reached the limit and the last char is not \n. + * Watch out for the last line without the terminating '\n'! 
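+			 * In that case the buffer is doubled and fgets()
+			 * resumes filling the same line where it stopped
+			 * (linenum is decremented to offset the next bump).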
+ */ + char *newline; + int newlinesize = linesize * 2; + + newline = realloc(thisline, sizeof(*thisline) * newlinesize); + if (newline == NULL) { + ha_alert("parsing [%s:%d]: line too long, cannot allocate memory.\n", + file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + fatal++; + linenum--; + continue; + } + + readbytes = linesize - 1; + linesize = newlinesize; + thisline = newline; + linenum--; + continue; + } + + readbytes = 0; + + if (end > line && *(end-1) == '\n') { + /* kill trailing LF */ + *(end - 1) = 0; + } + else { + /* mark this line as truncated */ + missing_lf = end - line; + } + + /* skip leading spaces */ + while (isspace((unsigned char)*line)) + line++; + + if (*line == '[') {/* This is the beginning if a scope */ + err_code |= cfg_parse_scope(file, linenum, line); + goto next_line; + } + + while (1) { + uint32_t err; + const char *errptr; + + arg = sizeof(args) / sizeof(*args); + outlen = outlinesize; + err = parse_line(line, outline, &outlen, args, &arg, + PARSE_OPT_ENV | PARSE_OPT_DQUOTE | PARSE_OPT_SQUOTE | + PARSE_OPT_BKSLASH | PARSE_OPT_SHARP | PARSE_OPT_WORD_EXPAND, + &errptr); + + if (err & PARSE_ERR_QUOTE) { + size_t newpos = sanitize_for_printing(line, errptr - line, 80); + + ha_alert("parsing [%s:%d]: unmatched quote at position %d:\n" + " %s\n %*s\n", file, linenum, (int)(errptr-thisline+1), line, (int)(newpos+1), "^"); + err_code |= ERR_ALERT | ERR_FATAL; + fatal++; + goto next_line; + } + + if (err & PARSE_ERR_BRACE) { + size_t newpos = sanitize_for_printing(line, errptr - line, 80); + + ha_alert("parsing [%s:%d]: unmatched brace in environment variable name at position %d:\n" + " %s\n %*s\n", file, linenum, (int)(errptr-thisline+1), line, (int)(newpos+1), "^"); + err_code |= ERR_ALERT | ERR_FATAL; + fatal++; + goto next_line; + } + + if (err & PARSE_ERR_VARNAME) { + size_t newpos = sanitize_for_printing(line, errptr - line, 80); + + ha_alert("parsing [%s:%d]: forbidden first char in environment variable name at position %d:\n" + " %s\n %*s\n", file, linenum, (int)(errptr-thisline+1), line, (int)(newpos+1), "^"); + err_code |= ERR_ALERT | ERR_FATAL; + fatal++; + goto next_line; + } + + if (err & PARSE_ERR_HEX) { + size_t newpos = sanitize_for_printing(line, errptr - line, 80); + + ha_alert("parsing [%s:%d]: truncated or invalid hexadecimal sequence at position %d:\n" + " %s\n %*s\n", file, linenum, (int)(errptr-thisline+1), line, (int)(newpos+1), "^"); + err_code |= ERR_ALERT | ERR_FATAL; + fatal++; + goto next_line; + } + + if (err & PARSE_ERR_WRONG_EXPAND) { + size_t newpos = sanitize_for_printing(line, errptr - line, 80); + + ha_alert("parsing [%s:%d]: truncated or invalid word expansion sequence at position %d:\n" + " %s\n %*s\n", file, linenum, (int)(errptr-thisline+1), line, (int)(newpos+1), "^"); + err_code |= ERR_ALERT | ERR_FATAL; + fatal++; + goto next_line; + } + + if (err & (PARSE_ERR_TOOLARGE|PARSE_ERR_OVERLAP)) { + outlinesize = (outlen + 1023) & -1024; + outline = my_realloc2(outline, outlinesize); + if (outline == NULL) { + ha_alert("parsing [%s:%d]: line too long, cannot allocate memory.\n", + file, linenum); + err_code |= ERR_ALERT | ERR_FATAL | ERR_ABORT; + fatal++; + outlinesize = 0; + goto err; + } + /* try again */ + continue; + } + + if (err & PARSE_ERR_TOOMANY) { + /* only check this *after* being sure the output is allocated */ + ha_alert("parsing [%s:%d]: too many words, truncating after word %d, position %ld: <%s>.\n", + file, linenum, MAX_LINE_ARGS, (long)(args[MAX_LINE_ARGS-1] - outline + 1), args[MAX_LINE_ARGS-1]); + 
err_code |= ERR_ALERT | ERR_FATAL; + fatal++; + goto next_line; + } + + /* everything's OK */ + break; + } + + /* dump cfg */ + if (global.mode & MODE_DUMP_CFG) { + if (args[0] != NULL) { + struct cfg_section *sect; + int is_sect = 0; + int i = 0; + uint32_t g_key = HA_ATOMIC_LOAD(&global.anon_key); + + if (global.mode & MODE_DUMP_NB_L) + qfprintf(stdout, "%d\t", linenum); + + /* if a word is in sections list, is_sect = 1 */ + list_for_each_entry(sect, §ions, list) { + if (strcmp(args[0], sect->section_name) == 0) { + is_sect = 1; + break; + } + } + + if (g_key == 0) { + /* no anonymizing needed, dump the config as-is (but without comments). + * Note: tabs were lost during tokenizing, so we reinsert for non-section + * keywords. + */ + if (!is_sect) + qfprintf(stdout, "\t"); + + for (i = 0; i < arg; i++) { + qfprintf(stdout, "%s ", args[i]); + } + qfprintf(stdout, "\n"); + continue; + } + + /* We're anonymizing */ + + if (is_sect) { + /* new sections are optionally followed by an identifier */ + if (arg >= 2) { + qfprintf(stdout, "%s %s\n", args[0], HA_ANON_ID(g_key, args[1])); + } + else { + qfprintf(stdout, "%s\n", args[0]); + } + continue; + } + + /* non-section keywords start indented */ + qfprintf(stdout, "\t"); + + /* some keywords deserve special treatment */ + if (!*args[0]) { + qfprintf(stdout, "\n"); + } + + else if (strcmp(args[0], "anonkey") == 0) { + qfprintf(stdout, "%s [...]\n", args[0]); + } + + else if (strcmp(args[0], "maxconn") == 0) { + qfprintf(stdout, "%s %s\n", args[0], args[1]); + } + + else if (strcmp(args[0], "stats") == 0 && + (strcmp(args[1], "timeout") == 0 || strcmp(args[1], "maxconn") == 0)) { + qfprintf(stdout, "%s %s %s\n", args[0], args[1], args[2]); + } + + else if (strcmp(args[0], "stats") == 0 && strcmp(args[1], "socket") == 0) { + qfprintf(stdout, "%s %s ", args[0], args[1]); + + if (arg > 2) { + qfprintf(stdout, "%s ", hash_ipanon(g_key, args[2], 1)); + + if (arg > 3) { + qfprintf(stdout, "[...]\n"); + } + else { + qfprintf(stdout, "\n"); + } + } + else { + qfprintf(stdout, "\n"); + } + } + + else if (strcmp(args[0], "timeout") == 0) { + qfprintf(stdout, "%s %s %s\n", args[0], args[1], args[2]); + } + + else if (strcmp(args[0], "mode") == 0) { + qfprintf(stdout, "%s %s\n", args[0], args[1]); + } + + /* It concerns user in global section and in userlist */ + else if (strcmp(args[0], "user") == 0) { + qfprintf(stdout, "%s %s ", args[0], HA_ANON_ID(g_key, args[1])); + + if (arg > 2) { + qfprintf(stdout, "[...]\n"); + } + else { + qfprintf(stdout, "\n"); + } + } + + else if (strcmp(args[0], "bind") == 0) { + qfprintf(stdout, "%s ", args[0]); + qfprintf(stdout, "%s ", hash_ipanon(g_key, args[1], 1)); + if (arg > 2) { + qfprintf(stdout, "[...]\n"); + } + else { + qfprintf(stdout, "\n"); + } + } + + else if (strcmp(args[0], "server") == 0) { + qfprintf(stdout, "%s %s ", args[0], HA_ANON_ID(g_key, args[1])); + + if (arg > 2) { + qfprintf(stdout, "%s ", hash_ipanon(g_key, args[2], 1)); + } + if (arg > 3) { + qfprintf(stdout, "[...]\n"); + } + else { + qfprintf(stdout, "\n"); + } + } + + else if (strcmp(args[0], "redirect") == 0) { + qfprintf(stdout, "%s %s ", args[0], args[1]); + + if (strcmp(args[1], "prefix") == 0 || strcmp(args[1], "location") == 0) { + qfprintf(stdout, "%s ", HA_ANON_PATH(g_key, args[2])); + } + else { + qfprintf(stdout, "%s ", args[2]); + } + if (arg > 3) { + qfprintf(stdout, "[...]"); + } + qfprintf(stdout, "\n"); + } + + else if (strcmp(args[0], "acl") == 0) { + qfprintf(stdout, "%s %s %s ", args[0], HA_ANON_ID(g_key, args[1]), 
args[2]); + + if (arg > 3) { + qfprintf(stdout, "[...]"); + } + qfprintf(stdout, "\n"); + } + + else if (strcmp(args[0], "log") == 0) { + qfprintf(stdout, "log "); + + if (strcmp(args[1], "global") == 0) { + qfprintf(stdout, "%s ", args[1]); + } + else { + qfprintf(stdout, "%s ", hash_ipanon(g_key, args[1], 1)); + } + if (arg > 2) { + qfprintf(stdout, "[...]"); + } + qfprintf(stdout, "\n"); + } + + else if (strcmp(args[0], "peer") == 0) { + qfprintf(stdout, "%s %s ", args[0], HA_ANON_ID(g_key, args[1])); + qfprintf(stdout, "%s ", hash_ipanon(g_key, args[2], 1)); + + if (arg > 3) { + qfprintf(stdout, "[...]"); + } + qfprintf(stdout, "\n"); + } + + else if (strcmp(args[0], "use_backend") == 0) { + qfprintf(stdout, "%s %s ", args[0], HA_ANON_ID(g_key, args[1])); + + if (arg > 2) { + qfprintf(stdout, "[...]"); + } + qfprintf(stdout, "\n"); + } + + else if (strcmp(args[0], "default_backend") == 0) { + qfprintf(stdout, "%s %s\n", args[0], HA_ANON_ID(g_key, args[1])); + } + + else if (strcmp(args[0], "source") == 0) { + qfprintf(stdout, "%s %s ", args[0], hash_ipanon(g_key, args[1], 1)); + + if (arg > 2) { + qfprintf(stdout, "[...]"); + } + qfprintf(stdout, "\n"); + } + + else if (strcmp(args[0], "nameserver") == 0) { + qfprintf(stdout, "%s %s %s ", args[0], + HA_ANON_ID(g_key, args[1]), hash_ipanon(g_key, args[2], 1)); + if (arg > 3) { + qfprintf(stdout, "[...]"); + } + qfprintf(stdout, "\n"); + } + + else if (strcmp(args[0], "http-request") == 0) { + qfprintf(stdout, "%s %s ", args[0], args[1]); + if (arg > 2) + qfprintf(stdout, "[...]"); + qfprintf(stdout, "\n"); + } + + else if (strcmp(args[0], "http-response") == 0) { + qfprintf(stdout, "%s %s ", args[0], args[1]); + if (arg > 2) + qfprintf(stdout, "[...]"); + qfprintf(stdout, "\n"); + } + + else if (strcmp(args[0], "http-after-response") == 0) { + qfprintf(stdout, "%s %s ", args[0], args[1]); + if (arg > 2) + qfprintf(stdout, "[...]"); + qfprintf(stdout, "\n"); + } + + else if (strcmp(args[0], "filter") == 0) { + qfprintf(stdout, "%s %s ", args[0], args[1]); + if (arg > 2) + qfprintf(stdout, "[...]"); + qfprintf(stdout, "\n"); + } + + else if (strcmp(args[0], "errorfile") == 0) { + qfprintf(stdout, "%s %s %s\n", args[0], args[1], HA_ANON_PATH(g_key, args[2])); + } + + else if (strcmp(args[0], "cookie") == 0) { + qfprintf(stdout, "%s %s ", args[0], HA_ANON_ID(g_key, args[1])); + if (arg > 2) + qfprintf(stdout, "%s ", args[2]); + if (arg > 3) + qfprintf(stdout, "[...]"); + qfprintf(stdout, "\n"); + } + + else if (strcmp(args[0], "stats") == 0 && strcmp(args[1], "auth") == 0) { + qfprintf(stdout, "%s %s %s\n", args[0], args[1], HA_ANON_STR(g_key, args[2])); + } + + else { + /* display up to 3 words and mask the rest which might be confidential */ + for (i = 0; i < MIN(arg, 3); i++) { + qfprintf(stdout, "%s ", args[i]); + } + if (arg > 3) { + qfprintf(stdout, "[...]"); + } + qfprintf(stdout, "\n"); + } + } + continue; + } + /* end of config dump */ + + /* empty line */ + if (!**args) + continue; + + /* check for config macros */ + if (*args[0] == '.') { + if (strcmp(args[0], ".if") == 0) { + const char *errptr = NULL; + char *errmsg = NULL; + int cond; + char *w; + + /* remerge all words into a single expression */ + for (w = *args; (w += strlen(w)) < outline + outlen - 1; *w = ' ') + ; + + nested_cond_lvl++; + if (nested_cond_lvl >= MAXNESTEDCONDS) { + ha_alert("parsing [%s:%d]: too many nested '.if', max is %d.\n", file, linenum, MAXNESTEDCONDS); + err_code |= ERR_ALERT | ERR_FATAL | ERR_ABORT; + goto err; + } + + if (nested_cond_lvl > 1 && + 
(nested_conds[nested_cond_lvl - 1] == NESTED_COND_IF_DROP || + nested_conds[nested_cond_lvl - 1] == NESTED_COND_IF_SKIP || + nested_conds[nested_cond_lvl - 1] == NESTED_COND_ELIF_DROP || + nested_conds[nested_cond_lvl - 1] == NESTED_COND_ELIF_SKIP || + nested_conds[nested_cond_lvl - 1] == NESTED_COND_ELSE_DROP)) { + nested_conds[nested_cond_lvl] = NESTED_COND_IF_SKIP; + goto next_line; + } + + cond = cfg_eval_condition(args + 1, &errmsg, &errptr); + if (cond < 0) { + size_t newpos = sanitize_for_printing(args[1], errptr - args[1], 76); + + ha_alert("parsing [%s:%d]: %s in '.if' at position %d:\n .if %s\n %*s\n", + file, linenum, errmsg, + (int)(errptr-args[1]+1), args[1], (int)(newpos+5), "^"); + + free(errmsg); + err_code |= ERR_ALERT | ERR_FATAL | ERR_ABORT; + goto err; + } + + if (cond) + nested_conds[nested_cond_lvl] = NESTED_COND_IF_TAKE; + else + nested_conds[nested_cond_lvl] = NESTED_COND_IF_DROP; + + goto next_line; + } + else if (strcmp(args[0], ".elif") == 0) { + const char *errptr = NULL; + char *errmsg = NULL; + int cond; + char *w; + + /* remerge all words into a single expression */ + for (w = *args; (w += strlen(w)) < outline + outlen - 1; *w = ' ') + ; + + if (!nested_cond_lvl) { + ha_alert("parsing [%s:%d]: lone '.elif' with no matching '.if'.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL | ERR_ABORT; + goto err; + } + + if (nested_conds[nested_cond_lvl] == NESTED_COND_ELSE_TAKE || + nested_conds[nested_cond_lvl] == NESTED_COND_ELSE_DROP) { + ha_alert("parsing [%s:%d]: '.elif' after '.else' is not permitted.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL | ERR_ABORT; + goto err; + } + + if (nested_conds[nested_cond_lvl] == NESTED_COND_IF_TAKE || + nested_conds[nested_cond_lvl] == NESTED_COND_IF_SKIP || + nested_conds[nested_cond_lvl] == NESTED_COND_ELIF_TAKE || + nested_conds[nested_cond_lvl] == NESTED_COND_ELIF_SKIP) { + nested_conds[nested_cond_lvl] = NESTED_COND_ELIF_SKIP; + goto next_line; + } + + cond = cfg_eval_condition(args + 1, &errmsg, &errptr); + if (cond < 0) { + size_t newpos = sanitize_for_printing(args[1], errptr - args[1], 74); + + ha_alert("parsing [%s:%d]: %s in '.elif' at position %d:\n .elif %s\n %*s\n", + file, linenum, errmsg, + (int)(errptr-args[1]+1), args[1], (int)(newpos+7), "^"); + + free(errmsg); + err_code |= ERR_ALERT | ERR_FATAL | ERR_ABORT; + goto err; + } + + if (cond) + nested_conds[nested_cond_lvl] = NESTED_COND_ELIF_TAKE; + else + nested_conds[nested_cond_lvl] = NESTED_COND_ELIF_DROP; + + goto next_line; + } + else if (strcmp(args[0], ".else") == 0) { + if (*args[1]) { + ha_alert("parsing [%s:%d]: Unexpected argument '%s' for '%s'.\n", + file, linenum, args[1], args[0]); + err_code |= ERR_ALERT | ERR_FATAL | ERR_ABORT; + break; + } + + if (!nested_cond_lvl) { + ha_alert("parsing [%s:%d]: lone '.else' with no matching '.if'.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL | ERR_ABORT; + goto err; + } + + if (nested_conds[nested_cond_lvl] == NESTED_COND_ELSE_TAKE || + nested_conds[nested_cond_lvl] == NESTED_COND_ELSE_DROP) { + ha_alert("parsing [%s:%d]: '.else' after '.else' is not permitted.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL | ERR_ABORT; + goto err; + } + + if (nested_conds[nested_cond_lvl] == NESTED_COND_IF_TAKE || + nested_conds[nested_cond_lvl] == NESTED_COND_IF_SKIP || + nested_conds[nested_cond_lvl] == NESTED_COND_ELIF_TAKE || + nested_conds[nested_cond_lvl] == NESTED_COND_ELIF_SKIP) { + nested_conds[nested_cond_lvl] = NESTED_COND_ELSE_DROP; + } else { + /* otherwise we take the "else" 
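+				 * branch, as in this layout (illustrative):
+				 *
+				 *	.if defined(A)
+				 *	    ...
+				 *	.elif defined(B)
+				 *	    ...
+				 *	.else
+				 *	    ...    <- taken
+				 *	.endif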
*/ + nested_conds[nested_cond_lvl] = NESTED_COND_ELSE_TAKE; + } + goto next_line; + } + else if (strcmp(args[0], ".endif") == 0) { + if (*args[1]) { + ha_alert("parsing [%s:%d]: Unexpected argument '%s' for '%s'.\n", + file, linenum, args[1], args[0]); + err_code |= ERR_ALERT | ERR_FATAL | ERR_ABORT; + break; + } + + if (!nested_cond_lvl) { + ha_alert("parsing [%s:%d]: lone '.endif' with no matching '.if'.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL | ERR_ABORT; + break; + } + nested_cond_lvl--; + goto next_line; + } + } + + if (nested_cond_lvl && + (nested_conds[nested_cond_lvl] == NESTED_COND_IF_DROP || + nested_conds[nested_cond_lvl] == NESTED_COND_IF_SKIP || + nested_conds[nested_cond_lvl] == NESTED_COND_ELIF_DROP || + nested_conds[nested_cond_lvl] == NESTED_COND_ELIF_SKIP || + nested_conds[nested_cond_lvl] == NESTED_COND_ELSE_DROP)) { + /* The current block is masked out by the conditions */ + goto next_line; + } + + /* .warning/.error/.notice/.diag */ + if (*args[0] == '.') { + if (strcmp(args[0], ".alert") == 0) { + if (*args[2]) { + ha_alert("parsing [%s:%d]: Unexpected argument '%s' for '%s'. Use quotes if the message should contain spaces.\n", + file, linenum, args[2], args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto next_line; + } + + ha_alert("parsing [%s:%d]: '%s'.\n", file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL | ERR_ABORT; + goto err; + } + else if (strcmp(args[0], ".warning") == 0) { + if (*args[2]) { + ha_alert("parsing [%s:%d]: Unexpected argument '%s' for '%s'. Use quotes if the message should contain spaces.\n", + file, linenum, args[2], args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto next_line; + } + + ha_warning("parsing [%s:%d]: '%s'.\n", file, linenum, args[1]); + err_code |= ERR_WARN; + goto next_line; + } + else if (strcmp(args[0], ".notice") == 0) { + if (*args[2]) { + ha_alert("parsing [%s:%d]: Unexpected argument '%s' for '%s'. Use quotes if the message should contain spaces.\n", + file, linenum, args[2], args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto next_line; + } + + ha_notice("parsing [%s:%d]: '%s'.\n", file, linenum, args[1]); + goto next_line; + } + else if (strcmp(args[0], ".diag") == 0) { + if (*args[2]) { + ha_alert("parsing [%s:%d]: Unexpected argument '%s' for '%s'. 
Use quotes if the message should contain spaces.\n", + file, linenum, args[2], args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto next_line; + } + + ha_diag_warning("parsing [%s:%d]: '%s'.\n", file, linenum, args[1]); + goto next_line; + } + else { + ha_alert("parsing [%s:%d]: unknown directive '%s'.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + fatal++; + break; + } + } + + /* check for keyword modifiers "no" and "default" */ + if (strcmp(args[0], "no") == 0) { + char *tmp; + + kwm = KWM_NO; + tmp = args[0]; + for (arg=0; *args[arg+1]; arg++) + args[arg] = args[arg+1]; // shift args after inversion + *tmp = '\0'; // empty the old first word, it will terminate the list + args[arg] = tmp; + } + else if (strcmp(args[0], "default") == 0) { + kwm = KWM_DEF; + for (arg=0; *args[arg+1]; arg++) + args[arg] = args[arg+1]; // shift args after inversion + } + + if (kwm != KWM_STD && strcmp(args[0], "option") != 0 && + strcmp(args[0], "log") != 0 && strcmp(args[0], "busy-polling") != 0 && + strcmp(args[0], "set-dumpable") != 0 && strcmp(args[0], "strict-limits") != 0 && + strcmp(args[0], "insecure-fork-wanted") != 0 && + strcmp(args[0], "numa-cpu-mapping") != 0) { + ha_alert("parsing [%s:%d]: negation/default currently " + "supported only for options, log, busy-polling, " + "set-dumpable, strict-limits, insecure-fork-wanted " + "and numa-cpu-mapping.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + fatal++; + } + + /* detect section start */ + list_for_each_entry(ics, &sections, list) { + if (strcmp(args[0], ics->section_name) == 0) { + cursection = ics->section_name; + pcs = cs; + cs = ics; + free(global.cfg_curr_section); + global.cfg_curr_section = strdup(*args[1] ? args[1] : args[0]); + check_section_position(args[0], file, linenum); + break; + } + } + + if (pcs && pcs->post_section_parser) { + int status; + + status = pcs->post_section_parser(); + err_code |= status; + if (status & ERR_FATAL) + fatal++; + + if (err_code & ERR_ABORT) + goto err; + } + pcs = NULL; + + if (!cs) { + ha_alert("parsing [%s:%d]: unknown keyword '%s' out of section.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + fatal++; + } else { + int status; + + status = cs->section_parser(file, linenum, args, kwm); + err_code |= status; + if (status & ERR_FATAL) + fatal++; + + if (err_code & ERR_ABORT) + goto err; + } + } + + if (missing_lf != -1) { + ha_alert("parsing [%s:%d]: Missing LF on last line, file might have been truncated at position %d.\n", + file, linenum, (missing_lf + 1)); + err_code |= ERR_ALERT | ERR_FATAL; + } + + ha_free(&global.cfg_curr_section); + if (cs && cs->post_section_parser) + err_code |= cs->post_section_parser(); + + if (nested_cond_lvl) { + ha_alert("parsing [%s:%d]: non-terminated '.if' block.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL | ERR_ABORT; + } + + if (*initial_cwd && chdir(initial_cwd) == -1) { + ha_alert("Impossible to get back to initial directory '%s': %s\n", initial_cwd, strerror(errno)); + err_code |= ERR_ALERT | ERR_FATAL; + } + +err: + ha_free(&cfg_scope); + cursection = NULL; + free(thisline); + free(outline); + global.cfg_curr_line = 0; + global.cfg_curr_file = NULL; + + if (f) + fclose(f); + + return err_code; +} + +#if defined(USE_THREAD) && defined USE_CPU_AFFINITY +#if defined(__linux__) + +/* filter directory names matching the pattern node<X> */ +static int numa_filter(const struct dirent *dir) +{ + char *endptr; + + /* dir name must start with "node" prefix */ + if (strncmp(dir->d_name, "node", 4)) + return 0; + + /* dir name must be at least 5 
characters long */ + if (!dir->d_name[4]) + return 0; + + /* dir name must end with a numeric id */ + if (strtol(&dir->d_name[4], &endptr, 10) < 0 || *endptr) + return 0; + + /* all tests succeeded */ + return 1; +} + +/* Inspect the cpu topology of the machine on startup. If a multi-socket + * machine is detected, try to bind on the first node with active cpu. This is + * done to prevent an impact on the overall performance when the topology of + * the machine is unknown. This function is not called if one of the conditions + * is met : + * - a non-null nbthread directive is active + * - a restrictive cpu-map directive is active + * - a restrictive affinity is already applied, for example via taskset + * + * Returns the count of cpus selected. If no automatic binding was required or + * an error occurred and the topology is unknown, 0 is returned. + */ +static int numa_detect_topology() +{ + struct dirent **node_dirlist; + int node_dirlist_size; + + struct hap_cpuset active_cpus, node_cpu_set; + const char *parse_cpu_set_args[2]; + char *err = NULL; + int grp, thr; + + /* node_cpu_set count is used as return value */ + ha_cpuset_zero(&node_cpu_set); + + /* 1. count the sysfs node<X> directories */ + node_dirlist = NULL; + node_dirlist_size = scandir(NUMA_DETECT_SYSTEM_SYSFS_PATH"/node", &node_dirlist, numa_filter, alphasort); + if (node_dirlist_size <= 1) + goto free_scandir_entries; + + /* 2. read and parse the list of currently online cpu */ + if (read_line_to_trash("%s/cpu/online", NUMA_DETECT_SYSTEM_SYSFS_PATH) < 0) { + ha_notice("Cannot read online CPUs list, will not try to refine binding\n"); + goto free_scandir_entries; + } + + parse_cpu_set_args[0] = trash.area; + parse_cpu_set_args[1] = "\0"; + if (parse_cpu_set(parse_cpu_set_args, &active_cpus, &err) != 0) { + ha_notice("Cannot read online CPUs list: '%s'. Will not try to refine binding\n", err); + free(err); + goto free_scandir_entries; + } + + /* 3. loop through nodes dirs and find the first one with active cpus */ + while (node_dirlist_size--) { + const char *node = node_dirlist[node_dirlist_size]->d_name; + ha_cpuset_zero(&node_cpu_set); + + if (read_line_to_trash("%s/node/%s/cpumap", NUMA_DETECT_SYSTEM_SYSFS_PATH, node) < 0) { + ha_notice("Cannot read CPUs list of '%s', will not select them to refine binding\n", node); + free(node_dirlist[node_dirlist_size]); + continue; + } + + parse_cpumap(trash.area, &node_cpu_set); + ha_cpuset_and(&node_cpu_set, &active_cpus); + + /* 5. 
set affinity on the first found node with active cpus */ + if (!ha_cpuset_count(&node_cpu_set)) { + free(node_dirlist[node_dirlist_size]); + continue; + } + + ha_diag_warning("Multi-socket cpu detected, automatically binding on active CPUs of '%s' (%u active cpu(s))\n", node, ha_cpuset_count(&node_cpu_set)); + for (grp = 0; grp < MAX_TGROUPS; grp++) + for (thr = 0; thr < MAX_THREADS_PER_GROUP; thr++) + ha_cpuset_assign(&cpu_map[grp].thread[thr], &node_cpu_set); + + free(node_dirlist[node_dirlist_size]); + break; + } + + free_scandir_entries: + while (node_dirlist_size-- > 0) + free(node_dirlist[node_dirlist_size]); + free(node_dirlist); + + return ha_cpuset_count(&node_cpu_set); +} + +#elif defined(__FreeBSD__) +static int numa_detect_topology() +{ + struct hap_cpuset node_cpu_set; + int ndomains = 0, i; + size_t len = sizeof(ndomains); + int grp, thr; + + if (sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0) == -1) { + ha_notice("Cannot assess the number of CPU domains\n"); + return 0; + } + + BUG_ON(ndomains > MAXMEMDOM); + ha_cpuset_zero(&node_cpu_set); + + if (ndomains < 2) + goto leave; + + /* + * We retrieve the first valid CPU domain with active cpus and + * bind to it, then return the number of cpus in that domain + */ + for (i = 0; i < ndomains; i++) { + struct hap_cpuset dom; + ha_cpuset_zero(&dom); + if (cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_DOMAIN, i, sizeof(dom.cpuset), &dom.cpuset) == -1) + continue; + + if (!ha_cpuset_count(&dom)) + continue; + + ha_cpuset_assign(&node_cpu_set, &dom); + + ha_diag_warning("Multi-socket cpu detected, automatically binding on active CPUs of domain %d (%u active cpu(s))\n", i, ha_cpuset_count(&node_cpu_set)); + for (grp = 0; grp < MAX_TGROUPS; grp++) + for (thr = 0; thr < MAX_THREADS_PER_GROUP; thr++) + ha_cpuset_assign(&cpu_map[grp].thread[thr], &node_cpu_set); + break; + } + leave: + return ha_cpuset_count(&node_cpu_set); +} + +#else +static int numa_detect_topology() +{ + return 0; +} + +#endif +#endif /* USE_THREAD && USE_CPU_AFFINITY */ + +/* + * Returns the error code, 0 if OK, or any combination of: + * - ERR_ABORT: must abort ASAP + * - ERR_FATAL: we can continue parsing but not start the service + * - ERR_WARN: a warning has been emitted + * - ERR_ALERT: an alert has been emitted + * Only the first two can stop processing, the other two are just + * indicators. + */ +int check_config_validity() +{ + int cfgerr = 0; + struct proxy *curproxy = NULL; + struct proxy *init_proxies_list = NULL; + struct stktable *t; + struct server *newsrv = NULL; + int err_code = 0; + unsigned int next_pxid = 1; + struct bind_conf *bind_conf; + char *err; + struct cfg_postparser *postparser; + struct resolvers *curr_resolvers = NULL; + int i; + + bind_conf = NULL; + /* + * Now, check for the integrity of all that we have collected. + */ + + if (!global.tune.max_http_hdr) + global.tune.max_http_hdr = MAX_HTTP_HDR; + + if (!global.tune.cookie_len) + global.tune.cookie_len = CAPTURE_LEN; + + if (!global.tune.requri_len) + global.tune.requri_len = REQURI_LEN; + + if (!global.nbthread) { + /* nbthread not set, thus automatic. In this case, and only if + * running on a single process, we enable the same number of + * threads as the number of CPUs the process is bound to. This + * makes it easy to control the number of threads using taskset. 
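For example, on an assumed 8-CPU machine, starting the process with + * "taskset -c 0-3 haproxy -f haproxy.cfg" restricts the affinity mask + * to 4 CPUs and therefore starts 4 threads, while an unrestricted + * start would create 8.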
*/ + global.nbthread = 1; + +#if defined(USE_THREAD) + { + int numa_cores = 0; +#if defined(USE_CPU_AFFINITY) + if (global.numa_cpu_mapping && !thread_cpu_mask_forced() && !cpu_map_configured()) + numa_cores = numa_detect_topology(); +#endif + global.nbthread = numa_cores ? numa_cores : + thread_cpus_enabled_at_boot; + + /* Note that we cannot have more than 32 or 64 threads per group */ + if (!global.nbtgroups) + global.nbtgroups = 1; + + if (global.nbthread > MAX_THREADS_PER_GROUP * global.nbtgroups) { + ha_diag_warning("nbthread not set, found %d CPUs, limiting to %d threads (maximum is %d per thread group). Please set nbthread and/or increase thread-groups in the global section to silence this warning.\n", + global.nbthread, MAX_THREADS_PER_GROUP * global.nbtgroups, MAX_THREADS_PER_GROUP); + global.nbthread = MAX_THREADS_PER_GROUP * global.nbtgroups; + } + } +#endif + } + + if (!global.nbtgroups) + global.nbtgroups = 1; + + if (thread_map_to_groups() < 0) { + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + pool_head_requri = create_pool("requri", global.tune.requri_len, MEM_F_SHARED); + + pool_head_capture = create_pool("capture", global.tune.cookie_len, MEM_F_SHARED); + + /* Post initialisation of the users and groups lists. */ + err_code = userlist_postinit(); + if (err_code != ERR_NONE) + goto out; + + /* first, we will invert the proxy list order */ + curproxy = NULL; + while (proxies_list) { + struct proxy *next; + + next = proxies_list->next; + proxies_list->next = curproxy; + curproxy = proxies_list; + if (!next) + break; + proxies_list = next; + } + + /* starting to initialize the main proxies list */ + init_proxies_list = proxies_list; + +init_proxies_list_stage1: + for (curproxy = init_proxies_list; curproxy; curproxy = curproxy->next) { + struct switching_rule *rule; + struct server_rule *srule; + struct sticking_rule *mrule; + struct logger *tmplogger; + unsigned int next_id; + + if (!(curproxy->cap & PR_CAP_INT) && curproxy->uuid < 0) { + /* proxy ID not set, use automatic numbering with first + * spare entry starting with next_pxid. We don't assign + * numbers for internal proxies as they may depend on + * build or config options and we don't want them to + * possibly reuse existing IDs. + */ + next_pxid = get_next_id(&used_proxy_id, next_pxid); + curproxy->conf.id.key = curproxy->uuid = next_pxid; + eb32_insert(&used_proxy_id, &curproxy->conf.id); + } + + if (curproxy->mode == PR_MODE_HTTP && global.tune.bufsize >= (256 << 20) && ONLY_ONCE()) { + ha_alert("global.tune.bufsize must be below 256 MB when HTTP is in use (current value = %d).\n", + global.tune.bufsize); + cfgerr++; + } + + /* next IDs are shifted even if the proxy is disabled, this + * guarantees that a proxy that is temporarily disabled in the + * configuration doesn't cause a renumbering. Internal proxies + * that are not assigned a static ID must never shift the IDs + * either since they may appear in any order (Lua, logs, etc). + * The GLOBAL proxy that carries the stats socket has its ID + * forced to zero. + */ + if (curproxy->uuid >= 0) + next_pxid++; + + if (curproxy->flags & PR_FL_DISABLED) { + /* ensure we don't keep listeners uselessly bound. We + * can't disable their listeners yet (fdtab not + * allocated yet) but let's skip them. + */ + if (curproxy->table) { + ha_free(&curproxy->table->peers.name); + curproxy->table->peers.p = NULL; + } + continue; + } + + /* The current proxy is referencing a default proxy. We must + * finalize its config, but only once. 
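A single defaults section is commonly shared by several proxies + * (one defaults block followed by many frontend/backend sections), + * so this finalization must not be repeated for each proxy + * referencing it.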
If the default proxy is + * ready (PR_FL_READY) it means it was already fully configured. + */ + if (curproxy->defpx) { + if (!(curproxy->defpx->flags & PR_FL_READY)) { + /* check validity for 'tcp-request' layer 4/5/6/7 rules */ + cfgerr += check_action_rules(&curproxy->defpx->tcp_req.l4_rules, curproxy->defpx, &err_code); + cfgerr += check_action_rules(&curproxy->defpx->tcp_req.l5_rules, curproxy->defpx, &err_code); + cfgerr += check_action_rules(&curproxy->defpx->tcp_req.inspect_rules, curproxy->defpx, &err_code); + cfgerr += check_action_rules(&curproxy->defpx->tcp_rep.inspect_rules, curproxy->defpx, &err_code); + cfgerr += check_action_rules(&curproxy->defpx->http_req_rules, curproxy->defpx, &err_code); + cfgerr += check_action_rules(&curproxy->defpx->http_res_rules, curproxy->defpx, &err_code); + cfgerr += check_action_rules(&curproxy->defpx->http_after_res_rules, curproxy->defpx, &err_code); + + err = NULL; + i = smp_resolve_args(curproxy->defpx, &err); + cfgerr += i; + if (i) { + indent_msg(&err, 8); + ha_alert("%s%s\n", i > 1 ? "multiple argument resolution errors:" : "", err); + ha_free(&err); + } + else + cfgerr += acl_find_targets(curproxy->defpx); + + /* default proxy is now ready. Set the right FE/BE capabilities */ + curproxy->defpx->flags |= PR_FL_READY; + } + } + + /* check and reduce the bind-proc of each listener */ + list_for_each_entry(bind_conf, &curproxy->conf.bind, by_fe) { + int ret; + + /* HTTP frontends with "h2" as ALPN/NPN will work in + * HTTP/2 and absolutely require buffers 16kB or larger. + */ +#ifdef USE_OPENSSL + /* no-alpn ? If so, it's the right moment to remove it */ + if (bind_conf->ssl_conf.alpn_str && !bind_conf->ssl_conf.alpn_len) { + free(bind_conf->ssl_conf.alpn_str); + bind_conf->ssl_conf.alpn_str = NULL; + } +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation + else if (!bind_conf->ssl_conf.alpn_str && !bind_conf->ssl_conf.npn_str && + ((bind_conf->options & BC_O_USE_SSL) || bind_conf->xprt == xprt_get(XPRT_QUIC)) && + curproxy->mode == PR_MODE_HTTP && global.tune.bufsize >= 16384) { + + /* Neither ALPN nor NPN were explicitly set nor disabled, we're + * in HTTP mode with an SSL or QUIC listener, we can enable ALPN. + * Note that it's in binary form. 
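Each protocol name is preceded by a one-byte length: "\002h3" is the + * 2-byte name "h3", and "\002h2\010http/1.1" is "h2" followed by the + * 8-byte name "http/1.1". A minimal sketch of walking such a list, + * assuming only a well-formed buffer <alpn> of <alpn_len> bytes + * (both names are hypothetical here): + * + * const unsigned char *p = alpn, *end = alpn + alpn_len; + * while (p < end) { + * int l = *p++; // one-byte length prefix + * printf("%.*s\n", l, (const char *)p); // name, not NUL-terminated + * p += l; + * }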
*/ + if (bind_conf->xprt == xprt_get(XPRT_QUIC)) + bind_conf->ssl_conf.alpn_str = strdup("\002h3"); + else + bind_conf->ssl_conf.alpn_str = strdup("\002h2\010http/1.1"); + + if (!bind_conf->ssl_conf.alpn_str) { + ha_alert("Proxy '%s': out of memory while trying to allocate a default alpn string in 'bind %s' at [%s:%d].\n", + curproxy->id, bind_conf->arg, bind_conf->file, bind_conf->line); + cfgerr++; + err_code |= ERR_FATAL | ERR_ALERT; + goto out; + } + bind_conf->ssl_conf.alpn_len = strlen(bind_conf->ssl_conf.alpn_str); + } +#endif + + if (curproxy->mode == PR_MODE_HTTP && global.tune.bufsize < 16384) { +#ifdef OPENSSL_NPN_NEGOTIATED + /* check NPN */ + if (bind_conf->ssl_conf.npn_str && strstr(bind_conf->ssl_conf.npn_str, "\002h2")) { + ha_alert("HTTP frontend '%s' enables HTTP/2 via NPN at [%s:%d], so global.tune.bufsize must be at least 16384 bytes (%d now).\n", + curproxy->id, bind_conf->file, bind_conf->line, global.tune.bufsize); + cfgerr++; + } +#endif +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation + /* check ALPN */ + if (bind_conf->ssl_conf.alpn_str && strstr(bind_conf->ssl_conf.alpn_str, "\002h2")) { + ha_alert("HTTP frontend '%s' enables HTTP/2 via ALPN at [%s:%d], so global.tune.bufsize must be at least 16384 bytes (%d now).\n", + curproxy->id, bind_conf->file, bind_conf->line, global.tune.bufsize); + cfgerr++; + } +#endif + } /* HTTP && bufsize < 16384 */ +#endif + + /* finish the bind setup */ + ret = bind_complete_thread_setup(bind_conf, &err_code); + if (ret != 0) { + cfgerr += ret; + if (err_code & ERR_FATAL) + goto out; + } + } + + switch (curproxy->mode) { + case PR_MODE_TCP: + cfgerr += proxy_cfg_ensure_no_http(curproxy); + cfgerr += proxy_cfg_ensure_no_log(curproxy); + break; + + case PR_MODE_HTTP: + cfgerr += proxy_cfg_ensure_no_log(curproxy); + curproxy->http_needed = 1; + break; + + case PR_MODE_CLI: + cfgerr += proxy_cfg_ensure_no_http(curproxy); + cfgerr += proxy_cfg_ensure_no_log(curproxy); + break; + + case PR_MODE_SYSLOG: + /* this mode is initialized as the classic tcp proxy */ + cfgerr += proxy_cfg_ensure_no_http(curproxy); + break; + + case PR_MODE_PEERS: + case PR_MODES: + /* should not happen, but gcc warns about unhandled switch values */ + ha_alert("%s '%s' cannot initialize this proxy mode (peers) in this way. NOTE: PLEASE REPORT THIS TO DEVELOPERS AS YOU'RE NOT SUPPOSED TO BE ABLE TO CREATE A CONFIGURATION TRIGGERING THIS!\n", + proxy_type_str(curproxy), curproxy->id); + cfgerr++; + break; + } + + if (!(curproxy->cap & PR_CAP_INT) && (curproxy->cap & PR_CAP_FE) && LIST_ISEMPTY(&curproxy->conf.listeners)) { + ha_warning("%s '%s' has no 'bind' directive. 
Please declare it as a backend if this was intended.\n", + proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + } + + if (curproxy->cap & PR_CAP_BE) { + if (curproxy->lbprm.algo & BE_LB_KIND) { + if (curproxy->options & PR_O_TRANSP) { + ha_alert("%s '%s' cannot use both transparent and balance mode.\n", + proxy_type_str(curproxy), curproxy->id); + cfgerr++; + } +#ifdef WE_DONT_SUPPORT_SERVERLESS_LISTENERS + else if (curproxy->srv == NULL) { + ha_alert("%s '%s' needs at least 1 server in balance mode.\n", + proxy_type_str(curproxy), curproxy->id); + cfgerr++; + } +#endif + else if (curproxy->options & PR_O_DISPATCH) { + ha_warning("dispatch address of %s '%s' will be ignored in balance mode.\n", + proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + } + } + else if (!(curproxy->options & (PR_O_TRANSP | PR_O_DISPATCH))) { + /* If no LB algo is set in a backend, and we're not in + * transparent, dispatch or proxy mode, we want to use + * balance roundrobin by default. + */ + curproxy->lbprm.algo &= ~BE_LB_ALGO; + curproxy->lbprm.algo |= BE_LB_ALGO_RR; + } + } + + if (curproxy->options & PR_O_DISPATCH) + curproxy->options &= ~PR_O_TRANSP; + else if (curproxy->options & PR_O_TRANSP) + curproxy->options &= ~PR_O_DISPATCH; + + if ((curproxy->tcpcheck_rules.flags & TCPCHK_RULES_UNUSED_HTTP_RS)) { + ha_warning("%s '%s' uses http-check rules without 'option httpchk', so the rules are ignored.\n", + proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + } + + if ((curproxy->options2 & PR_O2_CHK_ANY) == PR_O2_TCPCHK_CHK && + (curproxy->tcpcheck_rules.flags & TCPCHK_RULES_PROTO_CHK) != TCPCHK_RULES_HTTP_CHK) { + if (curproxy->options & PR_O_DISABLE404) { + ha_warning("'%s' will be ignored for %s '%s' (requires 'option httpchk').\n", + "disable-on-404", proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + curproxy->options &= ~PR_O_DISABLE404; + } + if (curproxy->options2 & PR_O2_CHK_SNDST) { + ha_warning("'%s' will be ignored for %s '%s' (requires 'option httpchk').\n", + "send-state", proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + curproxy->options2 &= ~PR_O2_CHK_SNDST; + } + } + + if ((curproxy->options2 & PR_O2_CHK_ANY) == PR_O2_EXT_CHK) { + if (!global.external_check) { + ha_alert("Proxy '%s' : '%s' unable to find required 'global.external-check'.\n", + curproxy->id, "option external-check"); + cfgerr++; + } + if (!curproxy->check_command) { + ha_alert("Proxy '%s' : '%s' unable to find required 'external-check command'.\n", + curproxy->id, "option external-check"); + cfgerr++; + } + if (!(global.tune.options & GTUNE_INSECURE_FORK)) { + ha_warning("Proxy '%s' : 'insecure-fork-wanted' not enabled in the global section, '%s' will likely fail.\n", + curproxy->id, "option external-check"); + err_code |= ERR_WARN; + } + } + + if (curproxy->email_alert.set) { + if (!(curproxy->email_alert.mailers.name && curproxy->email_alert.from && curproxy->email_alert.to)) { + ha_warning("'email-alert' will be ignored for %s '%s' (the presence of any of " + "'email-alert from', 'email-alert level', 'email-alert mailers', " + "'email-alert myhostname', or 'email-alert to' " + "requires each of 'email-alert from', 'email-alert mailers' and 'email-alert to' " + "to be present).\n", + proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + free_email_alert(curproxy); + } + if (!curproxy->email_alert.myhostname) + curproxy->email_alert.myhostname = strdup(hostname); + } + + if (curproxy->check_command) { + int clear = 0; + if 
((curproxy->options2 & PR_O2_CHK_ANY) != PR_O2_EXT_CHK) { + ha_warning("'%s' will be ignored for %s '%s' (requires 'option external-check').\n", + "external-check command", proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + clear = 1; + } + if (curproxy->check_command[0] != '/' && !curproxy->check_path) { + ha_alert("Proxy '%s': '%s' does not have a leading '/' and 'external-check path' is not set.\n", + curproxy->id, "external-check command"); + cfgerr++; + } + if (clear) { + ha_free(&curproxy->check_command); + } + } + + if (curproxy->check_path) { + if ((curproxy->options2 & PR_O2_CHK_ANY) != PR_O2_EXT_CHK) { + ha_warning("'%s' will be ignored for %s '%s' (requires 'option external-check').\n", + "external-check path", proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + ha_free(&curproxy->check_path); + } + } + + /* if a default backend was specified, let's find it */ + if (curproxy->defbe.name) { + struct proxy *target; + + target = proxy_be_by_name(curproxy->defbe.name); + if (!target) { + ha_alert("Proxy '%s': unable to find required default_backend: '%s'.\n", + curproxy->id, curproxy->defbe.name); + cfgerr++; + } else if (target == curproxy) { + ha_alert("Proxy '%s': loop detected for default_backend: '%s'.\n", + curproxy->id, curproxy->defbe.name); + cfgerr++; + } else if (target->mode != curproxy->mode && + !(curproxy->mode == PR_MODE_TCP && target->mode == PR_MODE_HTTP)) { + + ha_alert("%s %s '%s' (%s:%d) tries to use incompatible %s %s '%s' (%s:%d) as its default backend (see 'mode').\n", + proxy_mode_str(curproxy->mode), proxy_type_str(curproxy), curproxy->id, + curproxy->conf.file, curproxy->conf.line, + proxy_mode_str(target->mode), proxy_type_str(target), target->id, + target->conf.file, target->conf.line); + cfgerr++; + } else { + free(curproxy->defbe.name); + curproxy->defbe.be = target; + /* Emit a warning if this proxy also has some servers */ + if (curproxy->srv) { + ha_warning("In proxy '%s', the 'default_backend' rule always has precedence over the servers, which will never be used.\n", + curproxy->id); + err_code |= ERR_WARN; + } + } + } + + /* find the target proxy for 'use_backend' rules */ + list_for_each_entry(rule, &curproxy->switching_rules, list) { + struct proxy *target; + struct logformat_node *node; + char *pxname; + + /* Try to parse the string as a log format expression. If the result + * of the parsing is only one entry containing a simple string, then + * it's a standard string corresponding to a static rule, thus the + * parsing is cancelled and be.name is restored to be resolved. 
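For instance, a plain "use_backend static_be" rule remains static, + * while something like "use_backend %[req.hdr(host),lower]" yields a + * sample expression and is flagged dynamic below (both rules are only + * representative examples, not taken from a real configuration).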
+ */ + pxname = rule->be.name; + LIST_INIT(&rule->be.expr); + curproxy->conf.args.ctx = ARGC_UBK; + curproxy->conf.args.file = rule->file; + curproxy->conf.args.line = rule->line; + err = NULL; + if (!parse_logformat_string(pxname, curproxy, &rule->be.expr, 0, SMP_VAL_FE_HRQ_HDR, &err)) { + ha_alert("Parsing [%s:%d]: failed to parse use_backend rule '%s' : %s.\n", + rule->file, rule->line, pxname, err); + free(err); + cfgerr++; + continue; + } + node = LIST_NEXT(&rule->be.expr, struct logformat_node *, list); + + if (!LIST_ISEMPTY(&rule->be.expr)) { + if (node->type != LOG_FMT_TEXT || node->list.n != &rule->be.expr) { + rule->dynamic = 1; + free(pxname); + continue; + } + /* Only one element in the list, a simple string: free the expression and + * fall back to static rule + */ + LIST_DELETE(&node->list); + free(node->arg); + free(node); + } + + rule->dynamic = 0; + rule->be.name = pxname; + + target = proxy_be_by_name(rule->be.name); + if (!target) { + ha_alert("Proxy '%s': unable to find required use_backend: '%s'.\n", + curproxy->id, rule->be.name); + cfgerr++; + } else if (target == curproxy) { + ha_alert("Proxy '%s': loop detected for use_backend: '%s'.\n", + curproxy->id, rule->be.name); + cfgerr++; + } else if (target->mode != curproxy->mode && + !(curproxy->mode == PR_MODE_TCP && target->mode == PR_MODE_HTTP)) { + + ha_alert("%s %s '%s' (%s:%d) tries to use incompatible %s %s '%s' (%s:%d) in a 'use_backend' rule (see 'mode').\n", + proxy_mode_str(curproxy->mode), proxy_type_str(curproxy), curproxy->id, + curproxy->conf.file, curproxy->conf.line, + proxy_mode_str(target->mode), proxy_type_str(target), target->id, + target->conf.file, target->conf.line); + cfgerr++; + } else { + ha_free(&rule->be.name); + rule->be.backend = target; + } + err_code |= warnif_tcp_http_cond(curproxy, rule->cond); + } + + /* find the target server for 'use_server' rules */ + list_for_each_entry(srule, &curproxy->server_rules, list) { + struct server *target; + struct logformat_node *node; + char *server_name; + + /* We try to parse the string as a log format expression. If the result of the parsing + * is only one entry containing a single string, then it's a standard string corresponding + * to a static rule, thus the parsing is cancelled and we fall back to setting srv.ptr. 
*/ + server_name = srule->srv.name; + LIST_INIT(&srule->expr); + curproxy->conf.args.ctx = ARGC_USRV; + err = NULL; + if (!parse_logformat_string(server_name, curproxy, &srule->expr, 0, SMP_VAL_FE_HRQ_HDR, &err)) { + ha_alert("Parsing [%s:%d]: use-server rule failed to parse log-format '%s': %s.\n", + srule->file, srule->line, server_name, err); + free(err); + cfgerr++; + continue; + } + node = LIST_NEXT(&srule->expr, struct logformat_node *, list); + + if (!LIST_ISEMPTY(&srule->expr)) { + if (node->type != LOG_FMT_TEXT || node->list.n != &srule->expr) { + srule->dynamic = 1; + free(server_name); + continue; + } + /* Only one element in the list, a simple string: free the expression and + * fall back to static rule + */ + LIST_DELETE(&node->list); + free(node->arg); + free(node); + } + + srule->dynamic = 0; + srule->srv.name = server_name; + target = findserver(curproxy, srule->srv.name); + err_code |= warnif_tcp_http_cond(curproxy, srule->cond); + + if (!target) { + ha_alert("%s '%s' : unable to find server '%s' referenced in a 'use-server' rule.\n", + proxy_type_str(curproxy), curproxy->id, srule->srv.name); + cfgerr++; + continue; + } + ha_free(&srule->srv.name); + srule->srv.ptr = target; + target->flags |= SRV_F_NON_PURGEABLE; + } + + /* find the target table for 'stick' rules */ + list_for_each_entry(mrule, &curproxy->sticking_rules, list) { + curproxy->be_req_ana |= AN_REQ_STICKING_RULES; + if (mrule->flags & STK_IS_STORE) + curproxy->be_rsp_ana |= AN_RES_STORE_RULES; + + if (!resolve_stick_rule(curproxy, mrule)) + cfgerr++; + + err_code |= warnif_tcp_http_cond(curproxy, mrule->cond); + } + + /* find the target table for 'store response' rules */ + list_for_each_entry(mrule, &curproxy->storersp_rules, list) { + curproxy->be_rsp_ana |= AN_RES_STORE_RULES; + + if (!resolve_stick_rule(curproxy, mrule)) + cfgerr++; + } + + /* check validity for 'tcp-request' layer 4/5/6/7 rules */ + cfgerr += check_action_rules(&curproxy->tcp_req.l4_rules, curproxy, &err_code); + cfgerr += check_action_rules(&curproxy->tcp_req.l5_rules, curproxy, &err_code); + cfgerr += check_action_rules(&curproxy->tcp_req.inspect_rules, curproxy, &err_code); + cfgerr += check_action_rules(&curproxy->tcp_rep.inspect_rules, curproxy, &err_code); + cfgerr += check_action_rules(&curproxy->http_req_rules, curproxy, &err_code); + cfgerr += check_action_rules(&curproxy->http_res_rules, curproxy, &err_code); + cfgerr += check_action_rules(&curproxy->http_after_res_rules, curproxy, &err_code); + + /* Warn if 'switch-mode http' is used on a TCP listener with servers but no backend */ + if (!curproxy->defbe.name && LIST_ISEMPTY(&curproxy->switching_rules) && curproxy->srv) { + if ((curproxy->options & PR_O_HTTP_UPG) && curproxy->mode == PR_MODE_TCP) + ha_warning("Proxy '%s' : 'switch-mode http' configured for a %s %s with no backend. 
" + "Incoming connections upgraded to HTTP cannot be routed to TCP servers\n", + curproxy->id, proxy_mode_str(curproxy->mode), proxy_type_str(curproxy)); + } + + if (curproxy->table && curproxy->table->peers.name) { + struct peers *curpeers; + + for (curpeers = cfg_peers; curpeers; curpeers = curpeers->next) { + if (strcmp(curpeers->id, curproxy->table->peers.name) == 0) { + ha_free(&curproxy->table->peers.name); + curproxy->table->peers.p = curpeers; + break; + } + } + + if (!curpeers) { + ha_alert("Proxy '%s': unable to find sync peers '%s'.\n", + curproxy->id, curproxy->table->peers.name); + ha_free(&curproxy->table->peers.name); + curproxy->table->peers.p = NULL; + cfgerr++; + } + else if (curpeers->disabled) { + /* silently disable this peers section */ + curproxy->table->peers.p = NULL; + } + else if (!curpeers->peers_fe) { + ha_alert("Proxy '%s': unable to find local peer '%s' in peers section '%s'.\n", + curproxy->id, localpeer, curpeers->id); + curproxy->table->peers.p = NULL; + cfgerr++; + } + } + + + if (curproxy->email_alert.mailers.name) { + struct mailers *curmailers = mailers; + + for (curmailers = mailers; curmailers; curmailers = curmailers->next) { + if (strcmp(curmailers->id, curproxy->email_alert.mailers.name) == 0) + break; + } + if (!curmailers) { + ha_alert("Proxy '%s': unable to find mailers '%s'.\n", + curproxy->id, curproxy->email_alert.mailers.name); + free_email_alert(curproxy); + cfgerr++; + } + else { + err = NULL; + if (init_email_alert(curmailers, curproxy, &err)) { + ha_alert("Proxy '%s': %s.\n", curproxy->id, err); + free(err); + cfgerr++; + } + } + } + + if (curproxy->uri_auth && !(curproxy->uri_auth->flags & STAT_CONVDONE) && + !LIST_ISEMPTY(&curproxy->uri_auth->http_req_rules) && + (curproxy->uri_auth->userlist || curproxy->uri_auth->auth_realm )) { + ha_alert("%s '%s': stats 'auth'/'realm' and 'http-request' can't be used at the same time.\n", + "proxy", curproxy->id); + cfgerr++; + goto out_uri_auth_compat; + } + + if (curproxy->uri_auth && curproxy->uri_auth->userlist && + (!(curproxy->uri_auth->flags & STAT_CONVDONE) || + LIST_ISEMPTY(&curproxy->uri_auth->http_req_rules))) { + const char *uri_auth_compat_req[10]; + struct act_rule *rule; + i = 0; + + /* build the ACL condition from scratch. 
We're relying on anonymous ACLs for that */ + uri_auth_compat_req[i++] = "auth"; + + if (curproxy->uri_auth->auth_realm) { + uri_auth_compat_req[i++] = "realm"; + uri_auth_compat_req[i++] = curproxy->uri_auth->auth_realm; + } + + uri_auth_compat_req[i++] = "unless"; + uri_auth_compat_req[i++] = "{"; + uri_auth_compat_req[i++] = "http_auth(.internal-stats-userlist)"; + uri_auth_compat_req[i++] = "}"; + uri_auth_compat_req[i++] = ""; + + rule = parse_http_req_cond(uri_auth_compat_req, "internal-stats-auth-compat", 0, curproxy); + if (!rule) { + cfgerr++; + break; + } + + LIST_APPEND(&curproxy->uri_auth->http_req_rules, &rule->list); + + if (curproxy->uri_auth->auth_realm) { + ha_free(&curproxy->uri_auth->auth_realm); + } + curproxy->uri_auth->flags |= STAT_CONVDONE; + } +out_uri_auth_compat: + + /* check whether we have a logger that uses RFC5424 log format */ + list_for_each_entry(tmplogger, &curproxy->loggers, list) { + if (tmplogger->format == LOG_FORMAT_RFC5424) { + if (!curproxy->conf.logformat_sd_string) { + /* set the default logformat_sd_string */ + curproxy->conf.logformat_sd_string = default_rfc5424_sd_log_format; + } + break; + } + } + + /* compile the log format */ + if (!(curproxy->cap & PR_CAP_FE)) { + if (curproxy->conf.logformat_string != default_http_log_format && + curproxy->conf.logformat_string != default_tcp_log_format && + curproxy->conf.logformat_string != clf_http_log_format) + free(curproxy->conf.logformat_string); + curproxy->conf.logformat_string = NULL; + ha_free(&curproxy->conf.lfs_file); + curproxy->conf.lfs_line = 0; + + if (curproxy->conf.logformat_sd_string != default_rfc5424_sd_log_format) + free(curproxy->conf.logformat_sd_string); + curproxy->conf.logformat_sd_string = NULL; + ha_free(&curproxy->conf.lfsd_file); + curproxy->conf.lfsd_line = 0; + } + + if (curproxy->conf.logformat_string) { + curproxy->conf.args.ctx = ARGC_LOG; + curproxy->conf.args.file = curproxy->conf.lfs_file; + curproxy->conf.args.line = curproxy->conf.lfs_line; + err = NULL; + if (!parse_logformat_string(curproxy->conf.logformat_string, curproxy, &curproxy->logformat, + LOG_OPT_MANDATORY|LOG_OPT_MERGE_SPACES, + SMP_VAL_FE_LOG_END, &err)) { + ha_alert("Parsing [%s:%d]: failed to parse log-format : %s.\n", + curproxy->conf.lfs_file, curproxy->conf.lfs_line, err); + free(err); + cfgerr++; + } + curproxy->conf.args.file = NULL; + curproxy->conf.args.line = 0; + } + + if (curproxy->conf.logformat_sd_string) { + curproxy->conf.args.ctx = ARGC_LOGSD; + curproxy->conf.args.file = curproxy->conf.lfsd_file; + curproxy->conf.args.line = curproxy->conf.lfsd_line; + err = NULL; + if (!parse_logformat_string(curproxy->conf.logformat_sd_string, curproxy, &curproxy->logformat_sd, + LOG_OPT_MANDATORY|LOG_OPT_MERGE_SPACES, + SMP_VAL_FE_LOG_END, &err)) { + ha_alert("Parsing [%s:%d]: failed to parse log-format-sd : %s.\n", + curproxy->conf.lfs_file, curproxy->conf.lfs_line, err); + free(err); + cfgerr++; + } else if (!add_to_logformat_list(NULL, NULL, LF_SEPARATOR, &curproxy->logformat_sd, &err)) { + ha_alert("Parsing [%s:%d]: failed to parse log-format-sd : %s.\n", + curproxy->conf.lfs_file, curproxy->conf.lfs_line, err); + free(err); + cfgerr++; + } + curproxy->conf.args.file = NULL; + curproxy->conf.args.line = 0; + } + + if (curproxy->conf.uniqueid_format_string) { + int where = 0; + + curproxy->conf.args.ctx = ARGC_UIF; + curproxy->conf.args.file = curproxy->conf.uif_file; + curproxy->conf.args.line = curproxy->conf.uif_line; + err = NULL; + if (curproxy->cap & PR_CAP_FE) + where |= 
SMP_VAL_FE_HRQ_HDR; + if (curproxy->cap & PR_CAP_BE) + where |= SMP_VAL_BE_HRQ_HDR; + if (!parse_logformat_string(curproxy->conf.uniqueid_format_string, curproxy, &curproxy->format_unique_id, + LOG_OPT_HTTP|LOG_OPT_MERGE_SPACES, where, &err)) { + ha_alert("Parsing [%s:%d]: failed to parse unique-id : %s.\n", + curproxy->conf.uif_file, curproxy->conf.uif_line, err); + free(err); + cfgerr++; + } + curproxy->conf.args.file = NULL; + curproxy->conf.args.line = 0; + } + + if (curproxy->conf.error_logformat_string) { + curproxy->conf.args.ctx = ARGC_LOG; + curproxy->conf.args.file = curproxy->conf.elfs_file; + curproxy->conf.args.line = curproxy->conf.elfs_line; + err = NULL; + if (!parse_logformat_string(curproxy->conf.error_logformat_string, curproxy, &curproxy->logformat_error, + LOG_OPT_MANDATORY|LOG_OPT_MERGE_SPACES, + SMP_VAL_FE_LOG_END, &err)) { + ha_alert("Parsing [%s:%d]: failed to parse error-log-format : %s.\n", + curproxy->conf.elfs_file, curproxy->conf.elfs_line, err); + free(err); + cfgerr++; + } + curproxy->conf.args.file = NULL; + curproxy->conf.args.line = 0; + } + + /* "balance hash" needs to compile its expression + * (log backends will handle this in proxy log postcheck) + */ + if (curproxy->mode != PR_MODE_SYSLOG && + (curproxy->lbprm.algo & BE_LB_ALGO) == BE_LB_ALGO_SMP) { + int idx = 0; + const char *args[] = { + curproxy->lbprm.arg_str, + NULL, + }; + + err = NULL; + curproxy->conf.args.ctx = ARGC_USRV; // same context as use_server. + curproxy->lbprm.expr = + sample_parse_expr((char **)args, &idx, + curproxy->conf.file, curproxy->conf.line, + &err, &curproxy->conf.args, NULL); + + if (!curproxy->lbprm.expr) { + ha_alert("%s '%s' [%s:%d]: failed to parse 'balance hash' expression '%s' in : %s.\n", + proxy_type_str(curproxy), curproxy->id, + curproxy->conf.file, curproxy->conf.line, + curproxy->lbprm.arg_str, err); + ha_free(&err); + cfgerr++; + } + else if (!(curproxy->lbprm.expr->fetch->val & SMP_VAL_BE_SET_SRV)) { + ha_alert("%s '%s' [%s:%d]: error detected while parsing 'balance hash' expression '%s' " + "which requires information from %s, which is not available here.\n", + proxy_type_str(curproxy), curproxy->id, + curproxy->conf.file, curproxy->conf.line, + curproxy->lbprm.arg_str, sample_src_names(curproxy->lbprm.expr->fetch->use)); + cfgerr++; + } + else if (curproxy->mode == PR_MODE_HTTP && (curproxy->lbprm.expr->fetch->use & SMP_USE_L6REQ)) { + ha_warning("%s '%s' [%s:%d]: L6 sample fetch <%s> will be ignored in 'balance hash' expression in HTTP mode.\n", + proxy_type_str(curproxy), curproxy->id, + curproxy->conf.file, curproxy->conf.line, + curproxy->lbprm.arg_str); + } + else + curproxy->http_needed |= !!(curproxy->lbprm.expr->fetch->use & SMP_USE_HTTP_ANY); + } + + /* only now we can check if some args remain unresolved. + * This must be done after the users and groups resolution. + */ + err = NULL; + i = smp_resolve_args(curproxy, &err); + cfgerr += i; + if (i) { + indent_msg(&err, 8); + ha_alert("%s%s\n", i > 1 ? 
"multiple argument resolution errors:" : "", err); + ha_free(&err); + } else + cfgerr += acl_find_targets(curproxy); + + if (!(curproxy->cap & PR_CAP_INT) && (curproxy->mode == PR_MODE_TCP || curproxy->mode == PR_MODE_HTTP) && + (((curproxy->cap & PR_CAP_FE) && !curproxy->timeout.client) || + ((curproxy->cap & PR_CAP_BE) && (curproxy->srv) && + (!curproxy->timeout.connect || + (!curproxy->timeout.server && (curproxy->mode == PR_MODE_HTTP || !curproxy->timeout.tunnel)))))) { + ha_warning("missing timeouts for %s '%s'.\n" + " | While not properly invalid, you will certainly encounter various problems\n" + " | with such a configuration. To fix this, please ensure that all following\n" + " | timeouts are set to a non-zero value: 'client', 'connect', 'server'.\n", + proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + } + + /* Historically, the tarpit and queue timeouts were inherited from contimeout. + * We must still support older configurations, so let's find out whether those + * parameters have been set or must be copied from contimeouts. + */ + if (!curproxy->timeout.tarpit) + curproxy->timeout.tarpit = curproxy->timeout.connect; + if ((curproxy->cap & PR_CAP_BE) && !curproxy->timeout.queue) + curproxy->timeout.queue = curproxy->timeout.connect; + + if ((curproxy->tcpcheck_rules.flags & TCPCHK_RULES_UNUSED_TCP_RS)) { + ha_warning("%s '%s' uses tcp-check rules without 'option tcp-check', so the rules are ignored.\n", + proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + } + + /* ensure that cookie capture length is not too large */ + if (curproxy->capture_len >= global.tune.cookie_len) { + ha_warning("truncating capture length to %d bytes for %s '%s'.\n", + global.tune.cookie_len - 1, proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + curproxy->capture_len = global.tune.cookie_len - 1; + } + + /* The small pools required for the capture lists */ + if (curproxy->nb_req_cap) { + curproxy->req_cap_pool = create_pool("ptrcap", + curproxy->nb_req_cap * sizeof(char *), + MEM_F_SHARED); + } + + if (curproxy->nb_rsp_cap) { + curproxy->rsp_cap_pool = create_pool("ptrcap", + curproxy->nb_rsp_cap * sizeof(char *), + MEM_F_SHARED); + } + + switch (curproxy->load_server_state_from_file) { + case PR_SRV_STATE_FILE_UNSPEC: + curproxy->load_server_state_from_file = PR_SRV_STATE_FILE_NONE; + break; + case PR_SRV_STATE_FILE_GLOBAL: + if (!global.server_state_file) { + ha_warning("backend '%s' configured to load server state file from global section 'server-state-file' directive. Unfortunately, 'server-state-file' is not set!\n", + curproxy->id); + err_code |= ERR_WARN; + } + break; + } + + /* first, we will invert the servers list order */ + newsrv = NULL; + while (curproxy->srv) { + struct server *next; + + next = curproxy->srv->next; + curproxy->srv->next = newsrv; + newsrv = curproxy->srv; + if (!next) + break; + curproxy->srv = next; + } + + /* Check that no server name conflicts. This causes trouble in the stats. + * We only emit a warning for the first conflict affecting each server, + * in order to avoid combinatory explosion if all servers have the same + * name. We do that only for servers which do not have an explicit ID, + * because these IDs were made also for distinguishing them and we don't + * want to annoy people who correctly manage them. 
*/ + for (newsrv = curproxy->srv; newsrv; newsrv = newsrv->next) { + struct server *other_srv; + + if (newsrv->puid) + continue; + + for (other_srv = curproxy->srv; other_srv && other_srv != newsrv; other_srv = other_srv->next) { + if (!other_srv->puid && strcmp(other_srv->id, newsrv->id) == 0) { + ha_alert("parsing [%s:%d] : %s '%s', another server named '%s' was already defined at line %d, please use distinct names.\n", + newsrv->conf.file, newsrv->conf.line, + proxy_type_str(curproxy), curproxy->id, + newsrv->id, other_srv->conf.line); + cfgerr++; + break; + } + } + } + + /* assign automatic UIDs to servers which don't have one yet */ + next_id = 1; + newsrv = curproxy->srv; + while (newsrv != NULL) { + if (!newsrv->puid) { + /* server ID not set, use automatic numbering with first + * spare entry starting with next_id. + */ + next_id = get_next_id(&curproxy->conf.used_server_id, next_id); + newsrv->conf.id.key = newsrv->puid = next_id; + eb32_insert(&curproxy->conf.used_server_id, &newsrv->conf.id); + } + newsrv->conf.name.key = newsrv->id; + ebis_insert(&curproxy->conf.used_server_name, &newsrv->conf.name); + + next_id++; + newsrv = newsrv->next; + } + + curproxy->lbprm.wmult = 1; /* default weight multiplier */ + curproxy->lbprm.wdiv = 1; /* default weight divider */ + + /* + * If this server supports a maxconn parameter, it needs a dedicated + * task to fill the emptied slots when a connection leaves. + * Also, resolve deferred tracking dependency if needed. + */ + newsrv = curproxy->srv; + while (newsrv != NULL) { + set_usermsgs_ctx(newsrv->conf.file, newsrv->conf.line, &newsrv->obj_type); + + srv_minmax_conn_apply(newsrv); + + /* this will also properly set the transport layer for + * production traffic and checks; + * if the default-server has use_ssl, prepare the ssl init + * without activating it */ + if (newsrv->use_ssl == 1 || newsrv->check.use_ssl == 1 || + (newsrv->proxy->options & PR_O_TCPCHK_SSL) || + ((newsrv->flags & SRV_F_DEFSRV_USE_SSL) && newsrv->use_ssl != 1)) { + if (xprt_get(XPRT_SSL) && xprt_get(XPRT_SSL)->prepare_srv) + cfgerr += xprt_get(XPRT_SSL)->prepare_srv(newsrv); + } + + if ((newsrv->flags & SRV_F_FASTOPEN) && + ((curproxy->retry_type & (PR_RE_DISCONNECTED | PR_RE_TIMEOUT)) != + (PR_RE_DISCONNECTED | PR_RE_TIMEOUT))) + ha_warning("server has tfo activated, the backend should be configured with at least 'conn-failure', 'empty-response' and 'response-timeout', otherwise we would not be able to retry the connection on failure.\n"); + + if (newsrv->trackit) { + if (srv_apply_track(newsrv, curproxy)) { + ++cfgerr; + goto next_srv; + } + } + + next_srv: + reset_usermsgs_ctx(); + newsrv = newsrv->next; + } + + /* + * Try to generate dynamic cookies for servers now. + * It couldn't be done earlier, since at the time we parsed + * the server line, we may not have known yet that we + * should use dynamic cookies, or the secret key may not + * have been provided yet. + */ + if (curproxy->ck_opts & PR_CK_DYNAMIC) { + newsrv = curproxy->srv; + while (newsrv != NULL) { + srv_set_dyncookie(newsrv); + newsrv = newsrv->next; + } + + } + /* We have to initialize the server lookup mechanism depending + * on what LB algorithm was chosen. 
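As a rough map of the switch below: static round-robin and map-based + * hashing use an index map (BE_LB_LKUP_MAP), dynamic round-robin uses + * the weighted trees (BE_LB_LKUP_RRTREE), leastconn and "first" use + * their own trees, and consistent hashing as well as "random" rely on + * the consistent-hash tree (BE_LB_LKUP_CHTREE).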
+ */ + + if (curproxy->mode == PR_MODE_SYSLOG) { + /* log load-balancing requires special init that is performed + * during log-postparsing step + */ + goto skip_server_lb_init; + } + curproxy->lbprm.algo &= ~(BE_LB_LKUP | BE_LB_PROP_DYN); + switch (curproxy->lbprm.algo & BE_LB_KIND) { + case BE_LB_KIND_RR: + if ((curproxy->lbprm.algo & BE_LB_PARM) == BE_LB_RR_STATIC) { + curproxy->lbprm.algo |= BE_LB_LKUP_MAP; + init_server_map(curproxy); + } else if ((curproxy->lbprm.algo & BE_LB_PARM) == BE_LB_RR_RANDOM) { + curproxy->lbprm.algo |= BE_LB_LKUP_CHTREE | BE_LB_PROP_DYN; + if (chash_init_server_tree(curproxy) < 0) { + cfgerr++; + } + } else { + curproxy->lbprm.algo |= BE_LB_LKUP_RRTREE | BE_LB_PROP_DYN; + fwrr_init_server_groups(curproxy); + } + break; + + case BE_LB_KIND_CB: + if ((curproxy->lbprm.algo & BE_LB_PARM) == BE_LB_CB_LC) { + curproxy->lbprm.algo |= BE_LB_LKUP_LCTREE | BE_LB_PROP_DYN; + fwlc_init_server_tree(curproxy); + } else { + curproxy->lbprm.algo |= BE_LB_LKUP_FSTREE | BE_LB_PROP_DYN; + fas_init_server_tree(curproxy); + } + break; + + case BE_LB_KIND_HI: + if ((curproxy->lbprm.algo & BE_LB_HASH_TYPE) == BE_LB_HASH_CONS) { + curproxy->lbprm.algo |= BE_LB_LKUP_CHTREE | BE_LB_PROP_DYN; + if (chash_init_server_tree(curproxy) < 0) { + cfgerr++; + } + } else { + curproxy->lbprm.algo |= BE_LB_LKUP_MAP; + init_server_map(curproxy); + } + break; + } + skip_server_lb_init: + HA_RWLOCK_INIT(&curproxy->lbprm.lock); + + if (curproxy->options & PR_O_LOGASAP) + curproxy->to_log &= ~LW_BYTES; + + if (!(curproxy->cap & PR_CAP_INT) && (curproxy->mode == PR_MODE_TCP || curproxy->mode == PR_MODE_HTTP) && + (curproxy->cap & PR_CAP_FE) && LIST_ISEMPTY(&curproxy->loggers) && + (!LIST_ISEMPTY(&curproxy->logformat) || !LIST_ISEMPTY(&curproxy->logformat_sd))) { + ha_warning("log format ignored for %s '%s' since it has no log address.\n", + proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + } + + if (curproxy->mode != PR_MODE_HTTP && !(curproxy->options & PR_O_HTTP_UPG)) { + int optnum; + + if (curproxy->uri_auth) { + ha_warning("'stats' statement ignored for %s '%s' as it requires HTTP mode.\n", + proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + curproxy->uri_auth = NULL; + } + + if (curproxy->capture_name) { + ha_warning("'capture' statement ignored for %s '%s' as it requires HTTP mode.\n", + proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + } + + if (!LIST_ISEMPTY(&curproxy->http_req_rules)) { + ha_warning("'http-request' rules ignored for %s '%s' as they require HTTP mode.\n", + proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + } + + if (!LIST_ISEMPTY(&curproxy->http_res_rules)) { + ha_warning("'http-response' rules ignored for %s '%s' as they require HTTP mode.\n", + proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + } + + if (!LIST_ISEMPTY(&curproxy->http_after_res_rules)) { + ha_warning("'http-after-response' rules ignored for %s '%s' as they require HTTP mode.\n", + proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + } + + if (!LIST_ISEMPTY(&curproxy->redirect_rules)) { + ha_warning("'redirect' rules ignored for %s '%s' as they require HTTP mode.\n", + proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + } + + for (optnum = 0; cfg_opts[optnum].name; optnum++) { + if (cfg_opts[optnum].mode == PR_MODE_HTTP && + (curproxy->cap & cfg_opts[optnum].cap) && + (curproxy->options & cfg_opts[optnum].val)) { + ha_warning("'option %s' ignored for %s '%s' as it requires HTTP mode.\n", + 
cfg_opts[optnum].name, proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + curproxy->options &= ~cfg_opts[optnum].val; + } + } + + for (optnum = 0; cfg_opts2[optnum].name; optnum++) { + if (cfg_opts2[optnum].mode == PR_MODE_HTTP && + (curproxy->cap & cfg_opts2[optnum].cap) && + (curproxy->options2 & cfg_opts2[optnum].val)) { + ha_warning("'option %s' ignored for %s '%s' as it requires HTTP mode.\n", + cfg_opts2[optnum].name, proxy_type_str(curproxy), curproxy->id); + err_code |= ERR_WARN; + curproxy->options2 &= ~cfg_opts2[optnum].val; + } + } + +#if defined(CONFIG_HAP_TRANSPARENT) + if (curproxy->conn_src.bind_hdr_occ) { + curproxy->conn_src.bind_hdr_occ = 0; + ha_warning("%s '%s' : ignoring use of header %s as source IP in non-HTTP mode.\n", + proxy_type_str(curproxy), curproxy->id, curproxy->conn_src.bind_hdr_name); + err_code |= ERR_WARN; + } +#endif + } + + /* + * ensure that we're not cross-dressing a TCP server into HTTP. + */ + newsrv = curproxy->srv; + while (newsrv != NULL) { + if ((curproxy->mode != PR_MODE_HTTP) && newsrv->rdr_len) { + ha_alert("%s '%s' : server cannot have cookie or redirect prefix in non-HTTP mode.\n", + proxy_type_str(curproxy), curproxy->id); + cfgerr++; + } + + if ((curproxy->mode != PR_MODE_HTTP) && newsrv->cklen) { + ha_warning("%s '%s' : ignoring cookie for server '%s' as HTTP mode is disabled.\n", + proxy_type_str(curproxy), curproxy->id, newsrv->id); + err_code |= ERR_WARN; + } + + if ((newsrv->flags & SRV_F_MAPPORTS) && (curproxy->options2 & PR_O2_RDPC_PRST)) { + ha_warning("%s '%s' : RDP cookie persistence will not work for server '%s' because it lacks an explicit port number.\n", + proxy_type_str(curproxy), curproxy->id, newsrv->id); + err_code |= ERR_WARN; + } + +#if defined(CONFIG_HAP_TRANSPARENT) + if (curproxy->mode != PR_MODE_HTTP && newsrv->conn_src.bind_hdr_occ) { + newsrv->conn_src.bind_hdr_occ = 0; + ha_warning("%s '%s' : server %s cannot use header %s as source IP in non-HTTP mode.\n", + proxy_type_str(curproxy), curproxy->id, newsrv->id, newsrv->conn_src.bind_hdr_name); + err_code |= ERR_WARN; + } +#endif + + if ((curproxy->mode != PR_MODE_HTTP) && (curproxy->options & PR_O_REUSE_MASK) != PR_O_REUSE_NEVR) + curproxy->options &= ~PR_O_REUSE_MASK; + + if ((curproxy->mode != PR_MODE_HTTP) && newsrv->flags & SRV_F_RHTTP) { + ha_alert("%s '%s' : server %s uses reverse HTTP addressing which can only be used with HTTP mode.\n", + proxy_type_str(curproxy), curproxy->id, newsrv->id); + cfgerr++; + err_code |= ERR_FATAL | ERR_ALERT; + goto out; + } + + newsrv = newsrv->next; + } + + /* Check filter configuration, if any */ + cfgerr += flt_check(curproxy); + + if (curproxy->cap & PR_CAP_FE) { + if (!curproxy->accept) + curproxy->accept = frontend_accept; + + if (!LIST_ISEMPTY(&curproxy->tcp_req.inspect_rules) || + (curproxy->defpx && !LIST_ISEMPTY(&curproxy->defpx->tcp_req.inspect_rules))) + curproxy->fe_req_ana |= AN_REQ_INSPECT_FE; + + if (curproxy->mode == PR_MODE_HTTP) { + curproxy->fe_req_ana |= AN_REQ_WAIT_HTTP | AN_REQ_HTTP_PROCESS_FE; + curproxy->fe_rsp_ana |= AN_RES_WAIT_HTTP | AN_RES_HTTP_PROCESS_FE; + } + + if (curproxy->mode == PR_MODE_CLI) { + curproxy->fe_req_ana |= AN_REQ_WAIT_CLI; + curproxy->fe_rsp_ana |= AN_RES_WAIT_CLI; + } + + /* both TCP and HTTP must check switching rules */ + curproxy->fe_req_ana |= AN_REQ_SWITCHING_RULES; + + /* Add filters analyzers if needed */ + if (!LIST_ISEMPTY(&curproxy->filter_configs)) { + curproxy->fe_req_ana |= AN_REQ_FLT_START_FE | AN_REQ_FLT_XFER_DATA | AN_REQ_FLT_END; + 
curproxy->fe_rsp_ana |= AN_RES_FLT_START_FE | AN_RES_FLT_XFER_DATA | AN_RES_FLT_END; + } + } + + if (curproxy->cap & PR_CAP_BE) { + if (!LIST_ISEMPTY(&curproxy->tcp_req.inspect_rules) || + (curproxy->defpx && !LIST_ISEMPTY(&curproxy->defpx->tcp_req.inspect_rules))) + curproxy->be_req_ana |= AN_REQ_INSPECT_BE; + + if (!LIST_ISEMPTY(&curproxy->tcp_rep.inspect_rules) || + (curproxy->defpx && !LIST_ISEMPTY(&curproxy->defpx->tcp_rep.inspect_rules))) + curproxy->be_rsp_ana |= AN_RES_INSPECT; + + if (curproxy->mode == PR_MODE_HTTP) { + curproxy->be_req_ana |= AN_REQ_WAIT_HTTP | AN_REQ_HTTP_INNER | AN_REQ_HTTP_PROCESS_BE; + curproxy->be_rsp_ana |= AN_RES_WAIT_HTTP | AN_RES_HTTP_PROCESS_BE; + } + + /* If the backend requires RDP cookie persistence, we have to + * enable the corresponding analyser. + */ + if (curproxy->options2 & PR_O2_RDPC_PRST) + curproxy->be_req_ana |= AN_REQ_PRST_RDP_COOKIE; + + /* Add filters analyzers if needed */ + if (!LIST_ISEMPTY(&curproxy->filter_configs)) { + curproxy->be_req_ana |= AN_REQ_FLT_START_BE | AN_REQ_FLT_XFER_DATA | AN_REQ_FLT_END; + curproxy->be_rsp_ana |= AN_RES_FLT_START_BE | AN_RES_FLT_XFER_DATA | AN_RES_FLT_END; + } + } + + /* Check the mux protocols, if any, for each listener and server + * attached to the current proxy */ + list_for_each_entry(bind_conf, &curproxy->conf.bind, by_fe) { + int mode = conn_pr_mode_to_proto_mode(curproxy->mode); + const struct mux_proto_list *mux_ent; + + if (!bind_conf->mux_proto) { + /* No protocol was specified. If we're using QUIC at the transport + * layer, we'll instantiate it as a mux as well. If QUIC is not + * compiled in, this will remain NULL. + */ + if (bind_conf->xprt && bind_conf->xprt == xprt_get(XPRT_QUIC)) + bind_conf->mux_proto = get_mux_proto(ist("quic")); + } + + if (!bind_conf->mux_proto) + continue; + + /* it is possible that an incorrect mux was referenced + * due to the proxy's mode not being taken into account + * on first pass. Let's adjust it now. 
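For example, a "bind :443 ssl crt cert.pem proto h2" line parsed + * before the proxy's final mode was known may have matched an entry + * for the wrong mode; re-querying with the definitive mode returns + * the proper entry, or NULL when the protocol cannot be used on this + * side (the bind line above is only an illustration).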
+ */ + mux_ent = conn_get_best_mux_entry(bind_conf->mux_proto->token, PROTO_SIDE_FE, mode); + + if (!mux_ent || !isteq(mux_ent->token, bind_conf->mux_proto->token)) { + ha_alert("%s '%s' : MUX protocol '%.*s' is not usable for 'bind %s' at [%s:%d].\n", + proxy_type_str(curproxy), curproxy->id, + (int)bind_conf->mux_proto->token.len, + bind_conf->mux_proto->token.ptr, + bind_conf->arg, bind_conf->file, bind_conf->line); + cfgerr++; + } else { + if ((mux_ent->mux->flags & MX_FL_FRAMED) && !(bind_conf->options & BC_O_USE_SOCK_DGRAM)) { + ha_alert("%s '%s' : frame-based MUX protocol '%.*s' is incompatible with stream transport of 'bind %s' at [%s:%d].\n", + proxy_type_str(curproxy), curproxy->id, + (int)bind_conf->mux_proto->token.len, + bind_conf->mux_proto->token.ptr, + bind_conf->arg, bind_conf->file, bind_conf->line); + cfgerr++; + } + else if (!(mux_ent->mux->flags & MX_FL_FRAMED) && !(bind_conf->options & BC_O_USE_SOCK_STREAM)) { + ha_alert("%s '%s' : stream-based MUX protocol '%.*s' is incompatible with framed transport of 'bind %s' at [%s:%d].\n", + proxy_type_str(curproxy), curproxy->id, + (int)bind_conf->mux_proto->token.len, + bind_conf->mux_proto->token.ptr, + bind_conf->arg, bind_conf->file, bind_conf->line); + cfgerr++; + } + } + + /* update the mux */ + bind_conf->mux_proto = mux_ent; + } + for (newsrv = curproxy->srv; newsrv; newsrv = newsrv->next) { + int mode = conn_pr_mode_to_proto_mode(curproxy->mode); + const struct mux_proto_list *mux_ent; + + if (!newsrv->mux_proto) + continue; + + /* it is possible that an incorrect mux was referenced + * due to the proxy's mode not being taken into account + * on first pass. Let's adjust it now. + */ + mux_ent = conn_get_best_mux_entry(newsrv->mux_proto->token, PROTO_SIDE_BE, mode); + + if (!mux_ent || !isteq(mux_ent->token, newsrv->mux_proto->token)) { + ha_alert("%s '%s' : MUX protocol '%.*s' is not usable for server '%s' at [%s:%d].\n", + proxy_type_str(curproxy), curproxy->id, + (int)newsrv->mux_proto->token.len, + newsrv->mux_proto->token.ptr, + newsrv->id, newsrv->conf.file, newsrv->conf.line); + cfgerr++; + } + + /* update the mux */ + newsrv->mux_proto = mux_ent; + } + + /* Allocate default tcp-check rules for proxies without + * explicit rules. + */ + if (curproxy->cap & PR_CAP_BE) { + if (!(curproxy->options2 & PR_O2_CHK_ANY)) { + struct tcpcheck_ruleset *rs = NULL; + struct tcpcheck_rules *rules = &curproxy->tcpcheck_rules; + + curproxy->options2 |= PR_O2_TCPCHK_CHK; + + rs = find_tcpcheck_ruleset("*tcp-check"); + if (!rs) { + rs = create_tcpcheck_ruleset("*tcp-check"); + if (rs == NULL) { + ha_alert("config: %s '%s': out of memory.\n", + proxy_type_str(curproxy), curproxy->id); + cfgerr++; + } + } + + free_tcpcheck_vars(&rules->preset_vars); + rules->list = &rs->rules; + rules->flags = 0; + } + } + } + + /* + * We have just initialized the main proxies list + * we must also configure the log-forward proxies list + */ + if (init_proxies_list == proxies_list) { + init_proxies_list = cfg_log_forward; + /* check if list is not null to avoid infinite loop */ + if (init_proxies_list) + goto init_proxies_list_stage1; + } + + if (init_proxies_list == cfg_log_forward) { + init_proxies_list = sink_proxies_list; + /* check if list is not null to avoid infinite loop */ + if (init_proxies_list) + goto init_proxies_list_stage1; + } + + /***********************************************************/ + /* At this point, target names have already been resolved. 
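Every use_backend, use-server, stick, peers and mailers reference above either got its pointer filled in or raised a configuration error, so the code below can follow these pointers without re-checking names. 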
*/ + /***********************************************************/ + + /* we must finish initializing certain things on the servers */ + + list_for_each_entry(newsrv, &servers_list, global_list) { + /* initialize idle conns lists */ + if (srv_init_per_thr(newsrv) == -1) { + ha_alert("parsing [%s:%d] : failed to allocate per-thread lists for server '%s'.\n", + newsrv->conf.file, newsrv->conf.line, newsrv->id); + cfgerr++; + continue; + } + + if (newsrv->max_idle_conns != 0) { + newsrv->curr_idle_thr = calloc(global.nbthread, sizeof(*newsrv->curr_idle_thr)); + if (!newsrv->curr_idle_thr) { + ha_alert("parsing [%s:%d] : failed to allocate idle connection tasks for server '%s'.\n", + newsrv->conf.file, newsrv->conf.line, newsrv->id); + cfgerr++; + continue; + } + + } + } + + idle_conn_task = task_new_anywhere(); + if (!idle_conn_task) { + ha_alert("parsing : failed to allocate global idle connection task.\n"); + cfgerr++; + } + else { + idle_conn_task->process = srv_cleanup_idle_conns; + idle_conn_task->context = NULL; + + for (i = 0; i < global.nbthread; i++) { + idle_conns[i].cleanup_task = task_new_on(i); + if (!idle_conns[i].cleanup_task) { + ha_alert("parsing : failed to allocate idle connection tasks for thread '%d'.\n", i); + cfgerr++; + break; + } + + idle_conns[i].cleanup_task->process = srv_cleanup_toremove_conns; + idle_conns[i].cleanup_task->context = NULL; + HA_SPIN_INIT(&idle_conns[i].idle_conns_lock); + MT_LIST_INIT(&idle_conns[i].toremove_conns); + } + } + + /* perform the final checks before creating tasks */ + + /* starting to initialize the main proxies list */ + init_proxies_list = proxies_list; + +init_proxies_list_stage2: + for (curproxy = init_proxies_list; curproxy; curproxy = curproxy->next) { + struct listener *listener; + unsigned int next_id; + + /* Configure SSL for each bind line. + * Note: if configuration fails at some point, the ->ctx member + * remains NULL so that listeners can later detach. + */ + list_for_each_entry(bind_conf, &curproxy->conf.bind, by_fe) { + if (bind_conf->xprt->prepare_bind_conf && + bind_conf->xprt->prepare_bind_conf(bind_conf) < 0) + cfgerr++; + bind_conf->analysers |= curproxy->fe_req_ana; + if (!bind_conf->maxaccept) + bind_conf->maxaccept = global.tune.maxaccept ? global.tune.maxaccept : MAX_ACCEPT; + bind_conf->accept = session_accept_fd; + if (curproxy->options & PR_O_TCP_NOLING) + bind_conf->options |= BC_O_NOLINGER; + + /* smart accept mode is automatic in HTTP mode */ + if ((curproxy->options2 & PR_O2_SMARTACC) || + ((curproxy->mode == PR_MODE_HTTP || (bind_conf->options & BC_O_USE_SSL)) && + !(curproxy->no_options2 & PR_O2_SMARTACC))) + bind_conf->options |= BC_O_NOQUICKACK; + } + + /* adjust this proxy's listeners */ + bind_conf = NULL; + next_id = 1; + list_for_each_entry(listener, &curproxy->conf.listeners, by_fe) { + if (!listener->luid) { + /* listener ID not set, use automatic numbering with first + * spare entry starting with next_luid. + */ + next_id = get_next_id(&curproxy->conf.used_listener_id, next_id); + listener->conf.id.key = listener->luid = next_id; + eb32_insert(&curproxy->conf.used_listener_id, &listener->conf.id); + } + next_id++; + + /* enable separate counters */ + if (curproxy->options2 & PR_O2_SOCKSTAT) { + listener->counters = calloc(1, sizeof(*listener->counters)); + if (!listener->name) + memprintf(&listener->name, "sock-%d", listener->luid); + } + +#ifdef USE_QUIC + if (listener->bind_conf->xprt == xprt_get(XPRT_QUIC)) { + /* quic_conn are counted against maxconn.
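 *
 * (Editor's note, illustrative: this applies to QUIC listeners such as a
 *  hypothetical "bind quic4@:443 ssl crt site.pem alpn h3" line. The
 *  BC_O_XPRT_MAXCONN flag set below makes the transport-level quic_conn
 *  objects, including those still in handshake and tracked through the
 *  two counters reset here, count toward maxconn like accepted
 *  connections.)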
*/ + listener->bind_conf->options |= BC_O_XPRT_MAXCONN; + listener->rx.quic_curr_handshake = 0; + listener->rx.quic_curr_accept = 0; + +# ifdef USE_QUIC_OPENSSL_COMPAT + /* store the last checked bind_conf in bind_conf */ + if (!(global.tune.options & GTUNE_NO_QUIC) && + !(global.tune.options & GTUNE_LIMITED_QUIC) && + listener->bind_conf != bind_conf) { + bind_conf = listener->bind_conf; + ha_alert("Binding [%s:%d] for %s %s: this SSL library does not support the " + "QUIC protocol. A limited compatibility layer may be enabled using " + "the \"limited-quic\" global option if desired.\n", + listener->bind_conf->file, listener->bind_conf->line, + proxy_type_str(curproxy), curproxy->id); + cfgerr++; + } +# endif + + li_init_per_thr(listener); + } +#endif + } + + /* Release unused SSL configs */ + list_for_each_entry(bind_conf, &curproxy->conf.bind, by_fe) { + if (!(bind_conf->options & BC_O_USE_SSL) && bind_conf->xprt->destroy_bind_conf) + bind_conf->xprt->destroy_bind_conf(bind_conf); + } + + /* create the task associated with the proxy */ + curproxy->task = task_new_anywhere(); + if (curproxy->task) { + curproxy->task->context = curproxy; + curproxy->task->process = manage_proxy; + curproxy->flags |= PR_FL_READY; + } else { + ha_alert("Proxy '%s': no more memory when trying to allocate the management task\n", + curproxy->id); + cfgerr++; + } + } + + /* + * We have just initialized the main proxies list + * we must also configure the log-forward proxies list + */ + if (init_proxies_list == proxies_list) { + init_proxies_list = cfg_log_forward; + /* check if list is not null to avoid infinite loop */ + if (init_proxies_list) + goto init_proxies_list_stage2; + } + + /* + * Recount currently required checks. + */ + + for (curproxy=proxies_list; curproxy; curproxy=curproxy->next) { + int optnum; + + for (optnum = 0; cfg_opts[optnum].name; optnum++) + if (curproxy->options & cfg_opts[optnum].val) + global.last_checks |= cfg_opts[optnum].checks; + + for (optnum = 0; cfg_opts2[optnum].name; optnum++) + if (curproxy->options2 & cfg_opts2[optnum].val) + global.last_checks |= cfg_opts2[optnum].checks; + } + + if (cfg_peers) { + struct peers *curpeers = cfg_peers, **last; + struct peer *p, *pb; + + /* Remove all peers sections which don't have a valid listener, + * which are not used by any table, or which are bound to more + * than one process. + */ + last = &cfg_peers; + while (*last) { + struct peer *peer; + struct stktable *t; + curpeers = *last; + + if (curpeers->disabled) { + /* the "disabled" keyword was present */ + if (curpeers->peers_fe) + stop_proxy(curpeers->peers_fe); + curpeers->peers_fe = NULL; + } + else if (!curpeers->peers_fe || !curpeers->peers_fe->id) { + ha_warning("Removing incomplete section 'peers %s' (no peer named '%s').\n", + curpeers->id, localpeer); + if (curpeers->peers_fe) + stop_proxy(curpeers->peers_fe); + curpeers->peers_fe = NULL; + } + else { + /* Initializes the transport layer of the server part of all the peers belonging to + * <curpeers> section if required. + * Note that ->srv is used by the local peer of a new process to connect to the local peer + * of an old process. 
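 *
 * (Editor's sketch of the kind of section this block validates; all names
 *  and addresses are made up:
 *
 *      peers mypeers
 *          peer hap1 10.0.0.1:10000
 *          peer hap2 10.0.0.2:10000
 *
 *  each "peer" line yields a struct peer; the entry matching the local
 *  host becomes curpeers->local, while the others are walked below to
 *  detect duplicate names and to prepare SSL for servers that use it.)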
+ */ + curpeers->peers_fe->flags |= PR_FL_READY; + p = curpeers->remote; + while (p) { + struct peer *other_peer; + + for (other_peer = curpeers->remote; other_peer && other_peer != p; other_peer = other_peer->next) { + if (strcmp(other_peer->id, p->id) == 0) { + ha_alert("Peer section '%s' [%s:%d]: another peer named '%s' was already defined at line %s:%d, please use distinct names.\n", + curpeers->peers_fe->id, + p->conf.file, p->conf.line, + other_peer->id, other_peer->conf.file, other_peer->conf.line); + cfgerr++; + break; + } + } + + if (p->srv) { + if (p->srv->use_ssl == 1 && xprt_get(XPRT_SSL) && xprt_get(XPRT_SSL)->prepare_srv) + cfgerr += xprt_get(XPRT_SSL)->prepare_srv(p->srv); + } + p = p->next; + } + /* Configure the SSL bindings of the local peer if required. */ + if (!LIST_ISEMPTY(&curpeers->peers_fe->conf.bind)) { + struct list *l; + struct bind_conf *bind_conf; + int ret; + + l = &curpeers->peers_fe->conf.bind; + bind_conf = LIST_ELEM(l->n, typeof(bind_conf), by_fe); + + if (curpeers->local->srv) { + if (curpeers->local->srv->use_ssl == 1 && !(bind_conf->options & BC_O_USE_SSL)) { + ha_warning("Peers section '%s': local peer has a non-SSL listener and an SSL server configured at line %s:%d.\n", + curpeers->peers_fe->id, curpeers->local->conf.file, curpeers->local->conf.line); + } + else if (curpeers->local->srv->use_ssl != 1 && (bind_conf->options & BC_O_USE_SSL)) { + ha_warning("Peers section '%s': local peer has an SSL listener and a non-SSL server configured at line %s:%d.\n", + curpeers->peers_fe->id, curpeers->local->conf.file, curpeers->local->conf.line); + } + } + + /* finish the bind setup */ + ret = bind_complete_thread_setup(bind_conf, &err_code); + if (ret != 0) { + cfgerr += ret; + if (err_code & ERR_FATAL) + goto out; + } + + if (bind_conf->xprt->prepare_bind_conf && + bind_conf->xprt->prepare_bind_conf(bind_conf) < 0) + cfgerr++; + } + if (!peers_init_sync(curpeers) || !peers_alloc_dcache(curpeers)) { + ha_alert("Peers section '%s': out of memory, giving up on peers.\n", + curpeers->id); + cfgerr++; + break; + } + last = &curpeers->next; + + /* Ignore any peer shard value greater than the number of shards for this + * section. Also ignore the shard of the local peer. + */ + for (peer = curpeers->remote; peer; peer = peer->next) { + if (peer == curpeers->local) { + if (peer->srv->shard) { + ha_warning("Peers section '%s': shard ignored for '%s' local peer\n", + curpeers->id, peer->id); + peer->srv->shard = 0; + } + } + else if (peer->srv->shard > curpeers->nb_shards) { + ha_warning("Peers section '%s': shard ignored for '%s' peer because " + "its shard value (%d) is greater than the section's number of shards (%d)\n", + curpeers->id, peer->id, peer->srv->shard, curpeers->nb_shards); + peer->srv->shard = 0; + } + } + + continue; + } + + /* clean what has been detected above */ + p = curpeers->remote; + while (p) { + pb = p->next; + free(p->id); + free(p); + p = pb; + } + + /* Destroy and unlink this curpeers section. + * Note: curpeers is backed up into *last.
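 *
 * (The loop around this block relies on the classic pointer-to-pointer
 *  unlink idiom; an editor's sketch of the same idiom in isolation, using
 *  a hypothetical struct node with a ->next field:
 *
 *      struct node **last = &head;
 *      while (*last) {
 *          if (must_remove(*last)) {
 *              struct node *dead = *last;
 *              *last = dead->next;      <- unlink, no "prev" pointer needed
 *              free(dead);
 *          }
 *          else
 *              last = &(*last)->next;
 *      }
 *
 *  here the removal branch is what follows: the section's resources are
 *  freed and *last is made to point to its successor.)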
+ */ + free(curpeers->id); + curpeers = curpeers->next; + /* Reset any reference to this peers section in the list of stick-tables */ + for (t = stktables_list; t; t = t->next) { + if (t->peers.p && t->peers.p == *last) + t->peers.p = NULL; + } + free(*last); + *last = curpeers; + } + } + + for (t = stktables_list; t; t = t->next) { + if (t->proxy) + continue; + err = NULL; + if (!stktable_init(t, &err)) { + ha_alert("Parsing [%s:%d]: failed to initialize '%s' stick-table: %s.\n", t->conf.file, t->conf.line, t->id, err); + ha_free(&err); + cfgerr++; + } + } + + /* initialize stick-tables on backend capable proxies. This must not + * be done earlier because the data size may be discovered while parsing + * other proxies. + */ + for (curproxy = proxies_list; curproxy; curproxy = curproxy->next) { + if ((curproxy->flags & PR_FL_DISABLED) || !curproxy->table) + continue; + + err = NULL; + if (!stktable_init(curproxy->table, &err)) { + ha_alert("Proxy '%s': failed to initialize stick-table: %s.\n", curproxy->id, err); + ha_free(&err); + cfgerr++; + } + } + + if (mailers) { + struct mailers *curmailers = mailers, **last; + struct mailer *m, *mb; + + /* Remove all mailers sections which are not referenced by any proxy + * (i.e. sections that were declared but never used). + */ + last = &mailers; + while (*last) { + curmailers = *last; + if (curmailers->users) { + last = &curmailers->next; + continue; + } + + ha_warning("Removing incomplete section 'mailers %s'.\n", + curmailers->id); + + m = curmailers->mailer_list; + while (m) { + mb = m->next; + free(m->id); + free(m); + m = mb; + } + + /* Destroy and unlink this curmailers section. + * Note: curmailers is backed up into *last. + */ + free(curmailers->id); + curmailers = curmailers->next; + free(*last); + *last = curmailers; + } + } + + /* Set server_state_file_name to the backend's name if the backend is supposed + * to use a locally defined server-state file and none has been provided */ + for (curproxy = proxies_list; curproxy; curproxy = curproxy->next) { + if (curproxy->load_server_state_from_file == PR_SRV_STATE_FILE_LOCAL && + curproxy->server_state_file_name == NULL) + curproxy->server_state_file_name = strdup(curproxy->id); + } + + list_for_each_entry(curr_resolvers, &sec_resolvers, list) { + if (LIST_ISEMPTY(&curr_resolvers->nameservers)) { + ha_warning("resolvers '%s' [%s:%d] has no nameservers configured!\n", + curr_resolvers->id, curr_resolvers->conf.file, + curr_resolvers->conf.line); + err_code |= ERR_WARN; + } + } + + list_for_each_entry(postparser, &postparsers, list) { + if (postparser->func) + cfgerr += postparser->func(); + } + + if (cfgerr > 0) + err_code |= ERR_ALERT | ERR_FATAL; + out: + return err_code; +} + +/* + * Registers the CFG keyword list <kwl> as a list of valid keywords for next + * parsing sessions. + */ +void cfg_register_keywords(struct cfg_kw_list *kwl) +{ + LIST_APPEND(&cfg_keywords.list, &kwl->list); +} + +/* + * Unregisters the CFG keyword list <kwl> from the list of valid keywords. + */ +void cfg_unregister_keywords(struct cfg_kw_list *kwl) +{ + LIST_DELETE(&kwl->list); + LIST_INIT(&kwl->list); +} + +/* this function registers a new section in the haproxy configuration file. + * <section_name> is the name of this new section and <section_parser> + * is the parser called for it. If two section declarations have the same + * name, only the first one declared is used.
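 *
 * Typical usage, as done at the bottom of this file through the
 * REGISTER_CONFIG_SECTION() initcall wrapper (editor's note, taken from
 * the registrations below):
 *
 *     REGISTER_CONFIG_SECTION("listen", cfg_parse_listen, NULL);
 *
 * which makes cfg_parse_listen() the parser invoked for every "listen"
 * section found in the configuration.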
+ */ +int cfg_register_section(char *section_name, + int (*section_parser)(const char *, int, char **, int), + int (*post_section_parser)()) +{ + struct cfg_section *cs; + + list_for_each_entry(cs, &sections, list) { + if (strcmp(cs->section_name, section_name) == 0) { + ha_alert("register section '%s': already registered.\n", section_name); + return 0; + } + } + + cs = calloc(1, sizeof(*cs)); + if (!cs) { + ha_alert("register section '%s': out of memory.\n", section_name); + return 0; + } + + cs->section_name = section_name; + cs->section_parser = section_parser; + cs->post_section_parser = post_section_parser; + + LIST_APPEND(&sections, &cs->list); + + return 1; +} + +/* this function registers a new function which will be called once the haproxy + * configuration file has been parsed. It's useful to check dependencies + * between sections or to resolve items once everything is parsed. + */ +int cfg_register_postparser(char *name, int (*func)()) +{ + struct cfg_postparser *cp; + + cp = calloc(1, sizeof(*cp)); + if (!cp) { + ha_alert("register postparser '%s': out of memory.\n", name); + return 0; + } + cp->name = name; + cp->func = func; + + LIST_APPEND(&postparsers, &cp->list); + + return 1; +} + +/* + * free all config section entries + */ +void cfg_unregister_sections(void) +{ + struct cfg_section *cs, *ics; + + list_for_each_entry_safe(cs, ics, &sections, list) { + LIST_DELETE(&cs->list); + free(cs); + } +} + +void cfg_backup_sections(struct list *backup_sections) +{ + struct cfg_section *cs, *ics; + + list_for_each_entry_safe(cs, ics, &sections, list) { + LIST_DELETE(&cs->list); + LIST_APPEND(backup_sections, &cs->list); + } +} + +void cfg_restore_sections(struct list *backup_sections) +{ + struct cfg_section *cs, *ics; + + list_for_each_entry_safe(cs, ics, backup_sections, list) { + LIST_DELETE(&cs->list); + LIST_APPEND(&sections, &cs->list); + } +} + +/* dumps all registered keywords by section on stdout */ +void cfg_dump_registered_keywords() +{ + /* CFG_GLOBAL, CFG_LISTEN, CFG_USERLIST, CFG_PEERS, CFG_CRTLIST */ + const char* sect_names[] = { "", "global", "listen", "userlist", "peers", "crt-list", 0 }; + int section; + int index; + + for (section = 1; sect_names[section]; section++) { + struct cfg_kw_list *kwl; + const struct cfg_keyword *kwp, *kwn; + + printf("%s\n", sect_names[section]); + + for (kwn = kwp = NULL;; kwp = kwn) { + list_for_each_entry(kwl, &cfg_keywords.list, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) + if (kwl->kw[index].section == section && + strordered(kwp ? kwp->kw : NULL, kwl->kw[index].kw, kwn != kwp ? kwn->kw : NULL)) + kwn = &kwl->kw[index]; + } + if (kwn == kwp) + break; + printf("\t%s\n", kwn->kw); + } + + if (section == CFG_LISTEN) { + /* there are plenty of other keywords there */ + extern struct list tcp_req_conn_keywords, tcp_req_sess_keywords, + tcp_req_cont_keywords, tcp_res_cont_keywords; + extern struct bind_kw_list bind_keywords; + extern struct srv_kw_list srv_keywords; + struct bind_kw_list *bkwl; + struct srv_kw_list *skwl; + const struct bind_kw *bkwp, *bkwn; + const struct srv_kw *skwp, *skwn; + const struct cfg_opt *coptp, *coptn; + + /* display the non-SSL keywords */ + for (bkwn = bkwp = NULL;; bkwp = bkwn) { + list_for_each_entry(bkwl, &bind_keywords.list, list) { + if (strcmp(bkwl->scope, "SSL") == 0) /* skip SSL keywords */ + continue; + for (index = 0; bkwl->kw[index].kw != NULL; index++) { + if (strordered(bkwp ? bkwp->kw : NULL, + bkwl->kw[index].kw, + bkwn != bkwp ?
bkwn->kw : NULL)) + bkwn = &bkwl->kw[index]; + } + } + if (bkwn == bkwp) + break; + + if (!bkwn->skip) + printf("\tbind <addr> %s\n", bkwn->kw); + else + printf("\tbind <addr> %s +%d\n", bkwn->kw, bkwn->skip); + } +#if defined(USE_OPENSSL) + /* displays the "ssl" keywords */ + for (bkwn = bkwp = NULL;; bkwp = bkwn) { + list_for_each_entry(bkwl, &bind_keywords.list, list) { + if (strcmp(bkwl->scope, "SSL") != 0) /* skip non-SSL keywords */ + continue; + for (index = 0; bkwl->kw[index].kw != NULL; index++) { + if (strordered(bkwp ? bkwp->kw : NULL, + bkwl->kw[index].kw, + bkwn != bkwp ? bkwn->kw : NULL)) + bkwn = &bkwl->kw[index]; + } + } + if (bkwn == bkwp) + break; + + if (strcmp(bkwn->kw, "ssl") == 0) /* skip "bind <addr> ssl ssl" */ + continue; + + if (!bkwn->skip) + printf("\tbind <addr> ssl %s\n", bkwn->kw); + else + printf("\tbind <addr> ssl %s +%d\n", bkwn->kw, bkwn->skip); + } +#endif + for (skwn = skwp = NULL;; skwp = skwn) { + list_for_each_entry(skwl, &srv_keywords.list, list) { + for (index = 0; skwl->kw[index].kw != NULL; index++) + if (strordered(skwp ? skwp->kw : NULL, + skwl->kw[index].kw, + skwn != skwp ? skwn->kw : NULL)) + skwn = &skwl->kw[index]; + } + if (skwn == skwp) + break; + + if (!skwn->skip) + printf("\tserver <name> <addr> %s\n", skwn->kw); + else + printf("\tserver <name> <addr> %s +%d\n", skwn->kw, skwn->skip); + } + + for (coptn = coptp = NULL;; coptp = coptn) { + for (index = 0; cfg_opts[index].name; index++) + if (strordered(coptp ? coptp->name : NULL, + cfg_opts[index].name, + coptn != coptp ? coptn->name : NULL)) + coptn = &cfg_opts[index]; + + for (index = 0; cfg_opts2[index].name; index++) + if (strordered(coptp ? coptp->name : NULL, + cfg_opts2[index].name, + coptn != coptp ? coptn->name : NULL)) + coptn = &cfg_opts2[index]; + if (coptn == coptp) + break; + + printf("\toption %s [ ", coptn->name); + if (coptn->cap & PR_CAP_FE) + printf("FE "); + if (coptn->cap & PR_CAP_BE) + printf("BE "); + if (coptn->mode == PR_MODE_HTTP) + printf("HTTP "); + printf("]\n"); + } + + dump_act_rules(&tcp_req_conn_keywords, "\ttcp-request connection "); + dump_act_rules(&tcp_req_sess_keywords, "\ttcp-request session "); + dump_act_rules(&tcp_req_cont_keywords, "\ttcp-request content "); + dump_act_rules(&tcp_res_cont_keywords, "\ttcp-response content "); + dump_act_rules(&http_req_keywords.list, "\thttp-request "); + dump_act_rules(&http_res_keywords.list, "\thttp-response "); + dump_act_rules(&http_after_res_keywords.list, "\thttp-after-response "); + } + if (section == CFG_PEERS) { + struct peers_kw_list *pkwl; + const struct peers_keyword *pkwp, *pkwn; + for (pkwn = pkwp = NULL;; pkwp = pkwn) { + list_for_each_entry(pkwl, &peers_keywords.list, list) { + for (index = 0; pkwl->kw[index].kw != NULL; index++) { + if (strordered(pkwp ? pkwp->kw : NULL, + pkwl->kw[index].kw, + pkwn != pkwp ? pkwn->kw : NULL)) + pkwn = &pkwl->kw[index]; + } + } + if (pkwn == pkwp) + break; + printf("\t%s\n", pkwn->kw); + } + } + if (section == CFG_CRTLIST) { + /* displays the keyword available for the crt-lists */ + extern struct ssl_crtlist_kw ssl_crtlist_kws[] __maybe_unused; + const struct ssl_crtlist_kw *sbkwp __maybe_unused, *sbkwn __maybe_unused; + +#if defined(USE_OPENSSL) + for (sbkwn = sbkwp = NULL;; sbkwp = sbkwn) { + for (index = 0; ssl_crtlist_kws[index].kw != NULL; index++) { + if (strordered(sbkwp ? sbkwp->kw : NULL, + ssl_crtlist_kws[index].kw, + sbkwn != sbkwp ? 
sbkwn->kw : NULL)) + sbkwn = &ssl_crtlist_kws[index]; + } + if (sbkwn == sbkwp) + break; + if (!sbkwn->skip) + printf("\t%s\n", sbkwn->kw); + else + printf("\t%s +%d\n", sbkwn->kw, sbkwn->skip); + } +#endif + + } + } +} + +/* these are the config sections handled by default */ +REGISTER_CONFIG_SECTION("listen", cfg_parse_listen, NULL); +REGISTER_CONFIG_SECTION("frontend", cfg_parse_listen, NULL); +REGISTER_CONFIG_SECTION("backend", cfg_parse_listen, NULL); +REGISTER_CONFIG_SECTION("defaults", cfg_parse_listen, NULL); +REGISTER_CONFIG_SECTION("global", cfg_parse_global, NULL); +REGISTER_CONFIG_SECTION("userlist", cfg_parse_users, NULL); +REGISTER_CONFIG_SECTION("peers", cfg_parse_peers, NULL); +REGISTER_CONFIG_SECTION("mailers", cfg_parse_mailers, NULL); +REGISTER_CONFIG_SECTION("namespace_list", cfg_parse_netns, NULL); + +static struct cfg_kw_list cfg_kws = {{ },{ + { CFG_GLOBAL, "default-path", cfg_parse_global_def_path }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/channel.c b/src/channel.c new file mode 100644 index 0000000..0b6389d --- /dev/null +++ b/src/channel.c @@ -0,0 +1,591 @@ +/* + * Channel management functions. + * + * Copyright 2000-2014 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <ctype.h> +#include <stdarg.h> +#include <stdio.h> +#include <string.h> + +#include <haproxy/api.h> +#include <haproxy/buf.h> +#include <haproxy/channel.h> + + +/* Schedule up to <bytes> more bytes to be forwarded via the channel without + * notifying the owner task. Any data pending in the buffer are scheduled to be + * sent as well, within the limit of the number of bytes to forward. This must + * be the only method to use to schedule bytes to be forwarded. If the requested + * number is too large, it is automatically adjusted. The number of bytes taken + * into account is returned. Directly touching ->to_forward will cause lockups + * when buf->o goes down to zero if nobody is ready to push the remaining data. + */ +unsigned long long __channel_forward(struct channel *chn, unsigned long long bytes) +{ + unsigned int budget; + unsigned int forwarded; + + /* This is more of a safety measure as it's not supposed to happen in + * regular code paths. + */ + if (unlikely(chn->to_forward == CHN_INFINITE_FORWARD)) { + c_adv(chn, ci_data(chn)); + return bytes; + } + + /* Bound the transferred size to a 32-bit count since all our values + * are 32-bit, and we don't want to reach CHN_INFINITE_FORWARD. + */ + budget = MIN(bytes, CHN_INFINITE_FORWARD - 1); + + /* transfer as much as we can of buf->i */ + forwarded = MIN(ci_data(chn), budget); + c_adv(chn, forwarded); + budget -= forwarded; + + if (!budget) + return forwarded; + + /* Now we must ensure chn->to_forward stays below CHN_INFINITE_FORWARD, + * which also implies it won't overflow. It takes fewer operations in 64-bit. + */ + bytes = (unsigned long long)chn->to_forward + budget; + if (bytes >= CHN_INFINITE_FORWARD) + bytes = CHN_INFINITE_FORWARD - 1; + budget = bytes - chn->to_forward; + + chn->to_forward += budget; + forwarded += budget; + return forwarded; +} + +/* writes <len> bytes from message <msg> to the channel's buffer.
Returns -1 in + * case of success, -2 if the message is larger than the buffer size, or the + * number of bytes available otherwise. The send limit is automatically + * adjusted to the amount of data written. FIXME-20060521: handle unaligned + * data. Note: this function appends data to the buffer's output and possibly + * overwrites any pending input data which are assumed not to exist. + */ +int co_inject(struct channel *chn, const char *msg, int len) +{ + int max; + + if (len == 0) + return -1; + + if (len < 0 || len > c_size(chn)) { + /* we can't write this chunk and will never be able to, because + * it is larger than the buffer. This must be reported as an + * error. Then we return -2 so that writers that don't care can + * ignore it and go on, and others can check for this value. + */ + return -2; + } + + c_realign_if_empty(chn); + max = b_contig_space(&chn->buf); + if (len > max) + return max; + + memcpy(co_tail(chn), msg, len); + b_add(&chn->buf, len); + c_adv(chn, len); + chn->total += len; + return -1; +} + +/* Tries to copy character <c> into the channel's buffer after some length + * controls. The chn->o and to_forward pointers are updated. If the channel + * input is closed, -2 is returned. If there is not enough room left in the + * buffer, -1 is returned. Otherwise the number of bytes copied is returned + * (1). Channel flag READ_PARTIAL is updated if some data can be transferred. + */ +int ci_putchr(struct channel *chn, char c) +{ + if (unlikely(channel_input_closed(chn))) + return -2; + + if (!channel_may_recv(chn)) + return -1; + + *ci_tail(chn) = c; + + b_add(&chn->buf, 1); + chn->flags |= CF_READ_EVENT; + + if (chn->to_forward >= 1) { + if (chn->to_forward != CHN_INFINITE_FORWARD) + chn->to_forward--; + c_adv(chn, 1); + } + + chn->total++; + return 1; +} + +/* Tries to copy block <blk> at once into the channel's buffer after length + * controls. The chn->o and to_forward pointers are updated. If the channel + * input is closed, -2 is returned. If the block is too large for this buffer, + * -3 is returned. If there is not enough room left in the buffer, -1 is + * returned. Otherwise the number of bytes copied is returned (0 being a valid + * number). Channel flag READ_PARTIAL is updated if some data can be + * transferred. + */ +int ci_putblk(struct channel *chn, const char *blk, int len) +{ + int max; + + if (unlikely(channel_input_closed(chn))) + return -2; + + if (len < 0) + return -3; + + max = channel_recv_limit(chn); + if (unlikely(len > max - c_data(chn))) { + /* we can't write this chunk right now because the buffer is + * almost full or because the block is too large. Returns + * -3 if block is too large for this buffer. Or -1 if the + * room left is not large enough. + */ + if (len > max) + return -3; + + return -1; + } + + if (unlikely(len == 0)) + return 0; + + /* OK so the data fits in the buffer in one or two blocks */ + max = b_contig_space(&chn->buf); + memcpy(ci_tail(chn), blk, MIN(len, max)); + if (len > max) + memcpy(c_orig(chn), blk + max, len - max); + + b_add(&chn->buf, len); + channel_add_input(chn, len); + return len; +} + +/* Locates the longest part of the channel's output buffer that is composed + * exclusively of characters not in the <delim> set, and delimited by one of + * these characters, and returns the initial part and the first of such + * delimiters. A single escape character in <escape> may be specified so that + * when not 0 and found, the character that follows it is never taken as a + * delimiter. 
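 *
 * (Editor's illustration with a hypothetical caller: to read one semicolon-
 *  or comma-delimited word while honouring backslash escapes, one would do:
 *
 *      char word[64];
 *      int ret = co_getdelim(chn, word, sizeof(word), ";,", '\\');
 *
 *  with ret > 0, word[] then holds the bytes read, including the first
 *  unescaped ';' or ',' when one was found.)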
Note that <delim> cannot contain the zero byte, hence this + * function is not usable with byte zero as a delimiter. + * + * Return values : + * >0 : number of bytes read. Includes the sep if present before len or end. + * =0 : no sep before end found. <str> is left undefined. + * <0 : no more bytes readable because output is shut. + * The channel status is not changed. The caller must call co_skip() to + * update it. One of the delimiters is waited for as long as neither the buffer + * nor the output are full. If either of them is full, the string may be + * returned as is, without the delimiter. + */ +int co_getdelim(const struct channel *chn, char *str, int len, const char *delim, char escape) +{ + uchar delim_map[256 / 8]; + int found, escaped; + uint pos, bit; + int ret, max; + uchar b; + char *p; + + ret = 0; + max = len; + + /* closed or empty + imminent close = -1; empty = 0 */ + if (unlikely((chn_cons(chn)->flags & SC_FL_SHUT_DONE) || !co_data(chn))) { + if (chn_cons(chn)->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) + ret = -1; + goto out; + } + + p = co_head(chn); + + if (max > co_data(chn)) { + max = co_data(chn); + str[max-1] = 0; + } + + /* create the byte map */ + memset(delim_map, 0, sizeof(delim_map)); + while ((b = *delim)) { + pos = b >> 3; + bit = b & 7; + delim_map[pos] |= 1 << bit; + delim++; + } + + found = escaped = 0; + while (max) { + *str++ = b = *p; + ret++; + max--; + + if (escape && (escaped || *p == escape)) { + escaped = !escaped; + goto skip; + } + + pos = b >> 3; + bit = b & 7; + if (delim_map[pos] & (1 << bit)) { + found = 1; + break; + } + skip: + p = b_next(&chn->buf, p); + } + + if (ret > 0 && ret < len && + (ret < co_data(chn) || channel_may_recv(chn)) && + !found && + !(chn_cons(chn)->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED))) + ret = 0; + out: + if (max) + *str = 0; + return ret; +} + +/* Gets one text word out of a channel's buffer from a stream connector. + * Return values : + * >0 : number of bytes read. Includes the sep if present before len or end. + * =0 : no sep before end found. <str> is left undefined. + * <0 : no more bytes readable because output is shut. + * The channel status is not changed. The caller must call co_skip() to + * update it. The line separator is waited for as long as neither the buffer + * nor the output are full. If either of them is full, the string may be + * returned as is, without the line separator. + */ +int co_getword(const struct channel *chn, char *str, int len, char sep) +{ + int ret, max; + char *p; + + ret = 0; + max = len; + + /* closed or empty + imminent close = -1; empty = 0 */ + if (unlikely((chn_cons(chn)->flags & SC_FL_SHUT_DONE) || !co_data(chn))) { + if (chn_cons(chn)->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) + ret = -1; + goto out; + } + + p = co_head(chn); + + if (max > co_data(chn)) { + max = co_data(chn); + str[max-1] = 0; + } + while (max) { + *str++ = *p; + ret++; + max--; + + if (*p == sep) + break; + p = b_next(&chn->buf, p); + } + if (ret > 0 && ret < len && + (ret < co_data(chn) || channel_may_recv(chn)) && + *(str-1) != sep && + !(chn_cons(chn)->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED))) + ret = 0; + out: + if (max) + *str = 0; + return ret; +} + +/* Gets one text line out of a channel's buffer from a stream connector. + * Return values : + * >0 : number of bytes read. Includes the \n if present before len or end. + * =0 : no '\n' before end found. <str> is left undefined. + * <0 : no more bytes readable because output is shut. + * The channel status is not changed. 
The caller must call co_skip() to + * update it. The '\n' is waited for as long as neither the buffer nor the + * output are full. If either of them is full, the string may be returned + * as is, without the '\n'. + */ +int co_getline(const struct channel *chn, char *str, int len) +{ + int ret, max; + char *p; + + ret = 0; + max = len; + + /* closed or empty + imminent close = -1; empty = 0 */ + if (unlikely((chn_cons(chn)->flags & SC_FL_SHUT_DONE) || !co_data(chn))) { + if (chn_cons(chn)->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) + ret = -1; + goto out; + } + + p = co_head(chn); + + if (max > co_data(chn)) { + max = co_data(chn); + str[max-1] = 0; + } + while (max) { + *str++ = *p; + ret++; + max--; + + if (*p == '\n') + break; + p = b_next(&chn->buf, p); + } + if (ret > 0 && ret < len && + (ret < co_data(chn) || channel_may_recv(chn)) && + *(str-1) != '\n' && + !(chn_cons(chn)->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED))) + ret = 0; + out: + if (max) + *str = 0; + return ret; +} + +/* Gets one char of data from a channel's buffer, + * Return values : + * 1 : number of bytes read, equal to requested size. + * =0 : not enough data available. <c> is left undefined. + * <0 : no more bytes readable because output is shut. + * The channel status is not changed. The caller must call co_skip() to + * update it. + */ +int co_getchar(const struct channel *chn, char *c) +{ + if (chn_cons(chn)->flags & SC_FL_SHUT_DONE) + return -1; + + if (unlikely(co_data(chn) == 0)) { + if (chn_cons(chn)->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) + return -1; + return 0; + } + + *c = *(co_head(chn)); + return 1; +} + +/* Gets one full block of data at once from a channel's buffer, optionally from + * a specific offset. Return values : + * >0 : number of bytes read, equal to requested size. + * =0 : not enough data available. <blk> is left undefined. + * <0 : no more bytes readable because output is shut. + * The channel status is not changed. The caller must call co_skip() to + * update it. + */ +int co_getblk(const struct channel *chn, char *blk, int len, int offset) +{ + if (chn_cons(chn)->flags & SC_FL_SHUT_DONE) + return -1; + + if (len + offset > co_data(chn) || co_data(chn) == 0) { + if (chn_cons(chn)->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) + return -1; + return 0; + } + + return b_getblk(&chn->buf, blk, len, offset); +} + +/* Gets one or two blocks of data at once from a channel's output buffer. + * Return values : + * >0 : number of blocks filled (1 or 2). blk1 is always filled before blk2. + * =0 : not enough data available. <blk*> are left undefined. + * <0 : no more bytes readable because output is shut. + * The channel status is not changed. The caller must call co_skip() to + * update it. Unused buffers are left in an undefined state. + */ +int co_getblk_nc(const struct channel *chn, const char **blk1, size_t *len1, const char **blk2, size_t *len2) +{ + if (unlikely(co_data(chn) == 0)) { + if (chn_cons(chn)->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) + return -1; + return 0; + } + + return b_getblk_nc(&chn->buf, blk1, len1, blk2, len2, 0, co_data(chn)); +} + +/* Gets one text line out of a channel's output buffer from a stream connector. + * Return values : + * >0 : number of blocks returned (1 or 2). blk1 is always filled before blk2. + * =0 : not enough data available. + * <0 : no more bytes readable because output is shut. + * The '\n' is waited for as long as neither the buffer nor the output are + * full. 
If either of them is full, the string may be returned as is, without + * the '\n'. Unused buffers are left in an undefined state. + */ +int co_getline_nc(const struct channel *chn, + const char **blk1, size_t *len1, + const char **blk2, size_t *len2) +{ + int retcode; + int l; + + retcode = co_getblk_nc(chn, blk1, len1, blk2, len2); + if (unlikely(retcode <= 0)) + return retcode; + + for (l = 0; l < *len1 && (*blk1)[l] != '\n'; l++); + if (l < *len1 && (*blk1)[l] == '\n') { + *len1 = l + 1; + return 1; + } + + if (retcode >= 2) { + for (l = 0; l < *len2 && (*blk2)[l] != '\n'; l++); + if (l < *len2 && (*blk2)[l] == '\n') { + *len2 = l + 1; + return 2; + } + } + + if (chn_cons(chn)->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) { + /* If we have found no LF and the buffer is shut, then + * the resulting string is made of the concatenation of + * the pending blocks (1 or 2). + */ + return retcode; + } + + /* No LF yet and not shut yet */ + return 0; +} + +/* Gets one full block of data at once from a channel's input buffer. + * This function can return the data split into one or two blocks. + * Return values : + * >0 : number of blocks returned (1 or 2). blk1 is always filled before blk2. + * =0 : not enough data available. + * <0 : no more bytes readable because input is shut. + */ +int ci_getblk_nc(const struct channel *chn, + char **blk1, size_t *len1, + char **blk2, size_t *len2) +{ + if (unlikely(ci_data(chn) == 0)) { + if (chn_prod(chn)->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) + return -1; + return 0; + } + + if (unlikely(ci_head(chn) + ci_data(chn) > c_wrap(chn))) { + *blk1 = ci_head(chn); + *len1 = c_wrap(chn) - ci_head(chn); + *blk2 = c_orig(chn); + *len2 = ci_data(chn) - *len1; + return 2; + } + + *blk1 = ci_head(chn); + *len1 = ci_data(chn); + return 1; +} + +/* Gets one text line out of a channel's input buffer from a stream connector. + * Return values : + * >0 : number of blocks returned (1 or 2). blk1 is always filled before blk2. + * =0 : not enough data available. + * <0 : no more bytes readable because output is shut. + * The '\n' is waited for as long as neither the buffer nor the input are + * full. If either of them is full, the string may be returned as is, without + * the '\n'. Unused buffers are left in an undefined state. + */ +int ci_getline_nc(const struct channel *chn, + char **blk1, size_t *len1, + char **blk2, size_t *len2) +{ + int retcode; + int l; + + retcode = ci_getblk_nc(chn, blk1, len1, blk2, len2); + if (unlikely(retcode <= 0)) + return retcode; + + for (l = 0; l < *len1 && (*blk1)[l] != '\n'; l++); + if (l < *len1 && (*blk1)[l] == '\n') { + *len1 = l + 1; + return 1; + } + + if (retcode >= 2) { + for (l = 0; l < *len2 && (*blk2)[l] != '\n'; l++); + if (l < *len2 && (*blk2)[l] == '\n') { + *len2 = l + 1; + return 2; + } + } + + if (chn_cons(chn)->flags & SC_FL_SHUT_DONE) { + /* If we have found no LF and the buffer is shut, then + * the resulting string is made of the concatenation of + * the pending blocks (1 or 2). + */ + return retcode; + } + + /* No LF yet and not shut yet */ + return 0; +} + +/* Inserts <str> followed by "\r\n" at position <pos> relative to channel <c>'s + * input head. The <len> argument informs about the length of string <str> so + * that we don't have to measure it. <str> must be a valid pointer and must not + * include the trailing "\r\n". + * + * The number of bytes added is returned on success. 0 is returned on failure.
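 *
 * (Editor's illustration: inserting a hypothetical header line right at
 *  the input head:
 *
 *      int ret = ci_insert_line2(chn, 0, "X-Hdr: 1", 8);
 *
 *  on success ret == 10, i.e. the 8 bytes of the string plus 2 for the
 *  appended "\r\n".)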
+ */ +int ci_insert_line2(struct channel *c, int pos, const char *str, int len) +{ + struct buffer *b = &c->buf; + char *dst = c_ptr(c, pos); + int delta; + + delta = len + 2; + + if (__b_tail(b) + delta >= b_wrap(b)) + return 0; /* no space left */ + + if (b_data(b) && + b_tail(b) + delta > b_head(b) && + b_head(b) >= b_tail(b)) + return 0; /* no space left before wrapping data */ + + /* first, protect the end of the buffer */ + memmove(dst + delta, dst, b_tail(b) - dst); + + /* now, copy str over dst */ + memcpy(dst, str, len); + dst[len] = '\r'; + dst[len + 1] = '\n'; + + b_add(b, delta); + return delta; +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/check.c b/src/check.c new file mode 100644 index 0000000..2753c93 --- /dev/null +++ b/src/check.c @@ -0,0 +1,2642 @@ +/* + * Health-checks functions. + * + * Copyright 2000-2009 Willy Tarreau <w@1wt.eu> + * Copyright 2007-2009 Krzysztof Piotr Oledzki <ole@ans.pl> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <assert.h> +#include <ctype.h> +#include <errno.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> +#include <sys/resource.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <arpa/inet.h> + +#include <haproxy/action.h> +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/cfgparse.h> +#include <haproxy/check.h> +#include <haproxy/chunk.h> +#include <haproxy/dgram.h> +#include <haproxy/dynbuf.h> +#include <haproxy/extcheck.h> +#include <haproxy/fd.h> +#include <haproxy/global.h> +#include <haproxy/h1.h> +#include <haproxy/http.h> +#include <haproxy/http_htx.h> +#include <haproxy/htx.h> +#include <haproxy/istbuf.h> +#include <haproxy/list.h> +#include <haproxy/log.h> +#include <haproxy/mailers.h> +#include <haproxy/port_range.h> +#include <haproxy/proto_tcp.h> +#include <haproxy/protocol.h> +#include <haproxy/proxy.h> +#include <haproxy/queue.h> +#include <haproxy/regex.h> +#include <haproxy/resolvers.h> +#include <haproxy/sample.h> +#include <haproxy/server.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/stats-t.h> +#include <haproxy/task.h> +#include <haproxy/tcpcheck.h> +#include <haproxy/thread.h> +#include <haproxy/time.h> +#include <haproxy/tools.h> +#include <haproxy/trace.h> +#include <haproxy/vars.h> + +/* trace source and events */ +static void check_trace(enum trace_level level, uint64_t mask, + const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4); + +/* The event representation is split like this : + * check - check + * + * CHECK_EV_* macros are defined in <haproxy/check.h> + */ +static const struct trace_event check_trace_events[] = { + { .mask = CHK_EV_TASK_WAKE, .name = "task_wake", .desc = "Check task woken up" }, + { .mask = CHK_EV_HCHK_START, .name = "hchck_start", .desc = "Health-check started" }, + { .mask = CHK_EV_HCHK_WAKE, .name = "hchck_wake", .desc = "Health-check woken up" }, + { .mask = CHK_EV_HCHK_RUN, .name = "hchck_run", .desc = "Health-check running" }, + { .mask = CHK_EV_HCHK_END, .name = "hchck_end", .desc = "Health-check terminated" }, + { .mask = 
CHK_EV_HCHK_SUCC, .name = "hchck_succ", .desc = "Health-check success" }, + { .mask = CHK_EV_HCHK_ERR, .name = "hchck_err", .desc = "Health-check failure" }, + + { .mask = CHK_EV_TCPCHK_EVAL, .name = "tcp_check_eval", .desc = "tcp-check rules evaluation" }, + { .mask = CHK_EV_TCPCHK_ERR, .name = "tcp_check_err", .desc = "tcp-check evaluation error" }, + { .mask = CHK_EV_TCPCHK_CONN, .name = "tcp_check_conn", .desc = "tcp-check connection rule" }, + { .mask = CHK_EV_TCPCHK_SND, .name = "tcp_check_send", .desc = "tcp-check send rule" }, + { .mask = CHK_EV_TCPCHK_EXP, .name = "tcp_check_expect", .desc = "tcp-check expect rule" }, + { .mask = CHK_EV_TCPCHK_ACT, .name = "tcp_check_action", .desc = "tcp-check action rule" }, + + { .mask = CHK_EV_RX_DATA, .name = "rx_data", .desc = "receipt of data" }, + { .mask = CHK_EV_RX_BLK, .name = "rx_blk", .desc = "receipt blocked" }, + { .mask = CHK_EV_RX_ERR, .name = "rx_err", .desc = "receipt error" }, + + { .mask = CHK_EV_TX_DATA, .name = "tx_data", .desc = "transmission of data" }, + { .mask = CHK_EV_TX_BLK, .name = "tx_blk", .desc = "transmission blocked" }, + { .mask = CHK_EV_TX_ERR, .name = "tx_err", .desc = "transmission error" }, + + {} +}; + +static const struct name_desc check_trace_lockon_args[4] = { + /* arg1 */ { /* already used by the check */ }, + /* arg2 */ { }, + /* arg3 */ { }, + /* arg4 */ { } +}; + +static const struct name_desc check_trace_decoding[] = { +#define CHK_VERB_CLEAN 1 + { .name="clean", .desc="only user-friendly stuff, generally suitable for level \"user\"" }, +#define CHK_VERB_MINIMAL 2 + { .name="minimal", .desc="report info on streams and connectors" }, +#define CHK_VERB_SIMPLE 3 + { .name="simple", .desc="add info on request and response channels" }, +#define CHK_VERB_ADVANCED 4 + { .name="advanced", .desc="add info on channel's buffer for data and developer levels only" }, +#define CHK_VERB_COMPLETE 5 + { .name="complete", .desc="add info on channel's buffer" }, + { /* end */ } +}; + +struct trace_source trace_check = { + .name = IST("check"), + .desc = "Health-check", + .arg_def = TRC_ARG1_CHK, // TRACE()'s first argument is always a stream + .default_cb = check_trace, + .known_events = check_trace_events, + .lockon_args = check_trace_lockon_args, + .decoding = check_trace_decoding, + .report_events = ~0, // report everything by default +}; + +#define TRACE_SOURCE &trace_check +INITCALL1(STG_REGISTER, trace_register_source, TRACE_SOURCE); + + +/* Dummy frontend used to create all checks sessions. */ +struct proxy checks_fe; + + +static inline void check_trace_buf(const struct buffer *buf, size_t ofs, size_t len) +{ + size_t block1, block2; + int line, ptr, newptr; + + block1 = b_contig_data(buf, ofs); + block2 = 0; + if (block1 > len) + block1 = len; + block2 = len - block1; + + ofs = b_peek_ofs(buf, ofs); + + line = 0; + ptr = ofs; + while (ptr < ofs + block1) { + newptr = dump_text_line(&trace_buf, b_orig(buf), b_size(buf), ofs + block1, &line, ptr); + if (newptr == ptr) + break; + ptr = newptr; + } + + line = ptr = 0; + while (ptr < block2) { + newptr = dump_text_line(&trace_buf, b_orig(buf), b_size(buf), block2, &line, ptr); + if (newptr == ptr) + break; + ptr = newptr; + } +} + +/* trace source and events */ +static void check_trace(enum trace_level level, uint64_t mask, + const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4) +{ + const struct check *check = a1; + const struct server *srv = (check ? 
check->server : NULL); + const size_t *val = a4; + const char *res; + + if (!check || src->verbosity < CHK_VERB_CLEAN) + return; + + if (srv) { + chunk_appendf(&trace_buf, " : [%c] SRV=%s", + ((check->type == PR_O2_EXT_CHK) ? 'E' : (check->state & CHK_ST_AGENT ? 'A' : 'H')), + srv->id); + + chunk_appendf(&trace_buf, " status=%d/%d %s", + (check->health >= check->rise) ? check->health - check->rise + 1 : check->health, + (check->health >= check->rise) ? check->fall : check->rise, + (check->health >= check->rise) ? (srv->uweight ? "UP" : "DRAIN") : "DOWN"); + } + else + chunk_appendf(&trace_buf, " : [EMAIL]"); + + switch (check->result) { + case CHK_RES_NEUTRAL: res = "-"; break; + case CHK_RES_FAILED: res = "FAIL"; break; + case CHK_RES_PASSED: res = "PASS"; break; + case CHK_RES_CONDPASS: res = "COND"; break; + default: res = "UNK"; break; + } + + if (src->verbosity == CHK_VERB_CLEAN) + return; + + chunk_appendf(&trace_buf, " - last=%s(%d)/%s(%d)", + get_check_status_info(check->status), check->status, + res, check->result); + + /* Display the value to the 4th argument (level > STATE) */ + if (src->level > TRACE_LEVEL_STATE && val) + chunk_appendf(&trace_buf, " - VAL=%lu", (long)*val); + + chunk_appendf(&trace_buf, " check=%p(0x%08x)", check, check->state); + + if (src->verbosity == CHK_VERB_MINIMAL) + return; + + + if (check->sc) { + struct connection *conn = sc_conn(check->sc); + + chunk_appendf(&trace_buf, " - conn=%p(0x%08x)", conn, conn ? conn->flags : 0); + chunk_appendf(&trace_buf, " sc=%p(0x%08x)", check->sc, check->sc->flags); + } + + if (mask & CHK_EV_TCPCHK) { + const char *type; + + switch (check->tcpcheck_rules->flags & TCPCHK_RULES_PROTO_CHK) { + case TCPCHK_RULES_PGSQL_CHK: type = "PGSQL"; break; + case TCPCHK_RULES_REDIS_CHK: type = "REDIS"; break; + case TCPCHK_RULES_SMTP_CHK: type = "SMTP"; break; + case TCPCHK_RULES_HTTP_CHK: type = "HTTP"; break; + case TCPCHK_RULES_MYSQL_CHK: type = "MYSQL"; break; + case TCPCHK_RULES_LDAP_CHK: type = "LDAP"; break; + case TCPCHK_RULES_SSL3_CHK: type = "SSL3"; break; + case TCPCHK_RULES_AGENT_CHK: type = "AGENT"; break; + case TCPCHK_RULES_SPOP_CHK: type = "SPOP"; break; + case TCPCHK_RULES_TCP_CHK: type = "TCP"; break; + default: type = "???"; break; + } + if (check->current_step) + chunk_appendf(&trace_buf, " - tcp-check=(%s,%d)", type, tcpcheck_get_step_id(check, NULL)); + else + chunk_appendf(&trace_buf, " - tcp-check=(%s,-)", type); + } + + /* Display bi and bo buffer info (level > USER & verbosity > SIMPLE) */ + if (src->level > TRACE_LEVEL_USER) { + const struct buffer *buf = NULL; + + chunk_appendf(&trace_buf, " bi=%u@%p+%u/%u", + (unsigned int)b_data(&check->bi), b_orig(&check->bi), + (unsigned int)b_head_ofs(&check->bi), (unsigned int)b_size(&check->bi)); + chunk_appendf(&trace_buf, " bo=%u@%p+%u/%u", + (unsigned int)b_data(&check->bo), b_orig(&check->bo), + (unsigned int)b_head_ofs(&check->bo), (unsigned int)b_size(&check->bo)); + + if (src->verbosity >= CHK_VERB_ADVANCED && (mask & (CHK_EV_RX))) + buf = (b_is_null(&check->bi) ? NULL : &check->bi); + else if (src->verbosity >= CHK_VERB_ADVANCED && (mask & (CHK_EV_TX))) + buf = (b_is_null(&check->bo) ? NULL : &check->bo); + + if (buf) { + if ((check->tcpcheck_rules->flags & TCPCHK_RULES_PROTO_CHK) == TCPCHK_RULES_HTTP_CHK) { + int full = (src->verbosity == CHK_VERB_COMPLETE); + + chunk_memcat(&trace_buf, "\n\t", 2); + htx_dump(&trace_buf, htxbuf(buf), full); + } + else { + int max = ((src->verbosity == CHK_VERB_COMPLETE) ? 
1024 : 256); + + chunk_memcat(&trace_buf, "\n", 1); + if (b_data(buf) > max) { + check_trace_buf(buf, 0, max); + chunk_memcat(&trace_buf, " ...\n", 6); + } + else + check_trace_buf(buf, 0, b_data(buf)); + } + + } + } + +} + + +/**************************************************************************/ +/************************ Handle check results ****************************/ +/**************************************************************************/ +struct check_status { + short result; /* one of SRV_CHK_* */ + char *info; /* human readable short info */ + char *desc; /* long description */ +}; + +struct analyze_status { + char *desc; /* description */ + unsigned char lr[HANA_OBS_SIZE]; /* result for l4/l7: 0 = ignore, 1 - error, 2 - OK */ +}; + +static const struct check_status check_statuses[HCHK_STATUS_SIZE] = { + [HCHK_STATUS_UNKNOWN] = { CHK_RES_UNKNOWN, "UNK", "Unknown" }, + [HCHK_STATUS_INI] = { CHK_RES_UNKNOWN, "INI", "Initializing" }, + [HCHK_STATUS_START] = { /* SPECIAL STATUS*/ }, + + /* Below we have finished checks */ + [HCHK_STATUS_CHECKED] = { CHK_RES_NEUTRAL, "CHECKED", "No status change" }, + [HCHK_STATUS_HANA] = { CHK_RES_FAILED, "HANA", "Health analyze" }, + + [HCHK_STATUS_SOCKERR] = { CHK_RES_FAILED, "SOCKERR", "Socket error" }, + + [HCHK_STATUS_L4OK] = { CHK_RES_PASSED, "L4OK", "Layer4 check passed" }, + [HCHK_STATUS_L4TOUT] = { CHK_RES_FAILED, "L4TOUT", "Layer4 timeout" }, + [HCHK_STATUS_L4CON] = { CHK_RES_FAILED, "L4CON", "Layer4 connection problem" }, + + [HCHK_STATUS_L6OK] = { CHK_RES_PASSED, "L6OK", "Layer6 check passed" }, + [HCHK_STATUS_L6TOUT] = { CHK_RES_FAILED, "L6TOUT", "Layer6 timeout" }, + [HCHK_STATUS_L6RSP] = { CHK_RES_FAILED, "L6RSP", "Layer6 invalid response" }, + + [HCHK_STATUS_L7TOUT] = { CHK_RES_FAILED, "L7TOUT", "Layer7 timeout" }, + [HCHK_STATUS_L7RSP] = { CHK_RES_FAILED, "L7RSP", "Layer7 invalid response" }, + + [HCHK_STATUS_L57DATA] = { /* DUMMY STATUS */ }, + + [HCHK_STATUS_L7OKD] = { CHK_RES_PASSED, "L7OK", "Layer7 check passed" }, + [HCHK_STATUS_L7OKCD] = { CHK_RES_CONDPASS, "L7OKC", "Layer7 check conditionally passed" }, + [HCHK_STATUS_L7STS] = { CHK_RES_FAILED, "L7STS", "Layer7 wrong status" }, + + [HCHK_STATUS_PROCERR] = { CHK_RES_FAILED, "PROCERR", "External check error" }, + [HCHK_STATUS_PROCTOUT] = { CHK_RES_FAILED, "PROCTOUT", "External check timeout" }, + [HCHK_STATUS_PROCOK] = { CHK_RES_PASSED, "PROCOK", "External check passed" }, +}; + +static const struct analyze_status analyze_statuses[HANA_STATUS_SIZE] = { /* 0: ignore, 1: error, 2: OK */ + [HANA_STATUS_UNKNOWN] = { "Unknown", { 0, 0 }}, + + [HANA_STATUS_L4_OK] = { "L4 successful connection", { 2, 0 }}, + [HANA_STATUS_L4_ERR] = { "L4 unsuccessful connection", { 1, 1 }}, + + [HANA_STATUS_HTTP_OK] = { "Correct http response", { 0, 2 }}, + [HANA_STATUS_HTTP_STS] = { "Wrong http response", { 0, 1 }}, + [HANA_STATUS_HTTP_HDRRSP] = { "Invalid http response (headers)", { 0, 1 }}, + [HANA_STATUS_HTTP_RSP] = { "Invalid http response", { 0, 1 }}, + + [HANA_STATUS_HTTP_READ_ERROR] = { "Read error (http)", { 0, 1 }}, + [HANA_STATUS_HTTP_READ_TIMEOUT] = { "Read timeout (http)", { 0, 1 }}, + [HANA_STATUS_HTTP_BROKEN_PIPE] = { "Close from server (http)", { 0, 1 }}, +}; + +/* checks if <err> is a real error for errno or one that can be ignored, and + * return 0 for these ones or <err> for real ones. 
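 *
 * e.g. unclean_errno(EAGAIN) and unclean_errno(EINPROGRESS) both yield 0
 * (transient situations which are ignored), while unclean_errno(ECONNREFUSED)
 * returns ECONNREFUSED unchanged since it reports a real failure.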
+ */ +static inline int unclean_errno(int err) +{ + if (err == EAGAIN || err == EWOULDBLOCK || err == EINPROGRESS || + err == EISCONN || err == EALREADY) + return 0; + return err; +} + +/* Converts check_status code to result code */ +short get_check_status_result(short check_status) +{ + if (check_status < HCHK_STATUS_SIZE) + return check_statuses[check_status].result; + else + return check_statuses[HCHK_STATUS_UNKNOWN].result; +} + +/* Converts check_status code to description */ +const char *get_check_status_description(short check_status) { + + const char *desc; + + if (check_status < HCHK_STATUS_SIZE) + desc = check_statuses[check_status].desc; + else + desc = NULL; + + if (desc && *desc) + return desc; + else + return check_statuses[HCHK_STATUS_UNKNOWN].desc; +} + +/* Converts check_status code to short info */ +const char *get_check_status_info(short check_status) +{ + const char *info; + + if (check_status < HCHK_STATUS_SIZE) + info = check_statuses[check_status].info; + else + info = NULL; + + if (info && *info) + return info; + else + return check_statuses[HCHK_STATUS_UNKNOWN].info; +} + +/* Convert analyze_status to description */ +const char *get_analyze_status(short analyze_status) { + + const char *desc; + + if (analyze_status < HANA_STATUS_SIZE) + desc = analyze_statuses[analyze_status].desc; + else + desc = NULL; + + if (desc && *desc) + return desc; + else + return analyze_statuses[HANA_STATUS_UNKNOWN].desc; +} + +/* append check info to buffer msg */ +void check_append_info(struct buffer *msg, struct check *check) +{ + if (!check) + return; + chunk_appendf(msg, ", reason: %s", get_check_status_description(check->status)); + + if (check->status >= HCHK_STATUS_L57DATA) + chunk_appendf(msg, ", code: %d", check->code); + + if (check->desc[0]) { + struct buffer src; + + chunk_appendf(msg, ", info: \""); + + chunk_initlen(&src, check->desc, 0, strlen(check->desc)); + chunk_asciiencode(msg, &src, '"'); + + chunk_appendf(msg, "\""); + } + + if (check->duration >= 0) + chunk_appendf(msg, ", check duration: %ldms", check->duration); +} + +/* Sets check->status, updates check->duration and fills check->result with an + * adequate CHK_RES_* value. The new check->health is computed based on the + * result. + * + * Logs information about failed health checks if the server is UP, and about + * succeeded health checks if the server is DOWN. + */ +void set_server_check_status(struct check *check, short status, const char *desc) +{ + struct server *s = check->server; + short prev_status = check->status; + int report = (status != prev_status) ? 1 : 0; + + TRACE_POINT(CHK_EV_HCHK_RUN, check); + + if (status == HCHK_STATUS_START) { + check->result = CHK_RES_UNKNOWN; /* no result yet */ + check->desc[0] = '\0'; + check->start = now_ns; + return; + } + + if (!check->status) + return; + + if (desc && *desc) { + strncpy(check->desc, desc, HCHK_DESC_LEN-1); + check->desc[HCHK_DESC_LEN-1] = '\0'; + } else + check->desc[0] = '\0'; + + check->status = status; + if (check_statuses[status].result) + check->result = check_statuses[status].result; + + if (status == HCHK_STATUS_HANA) + check->duration = -1; + else if (check->start) { + /* set_server_check_status() may be called more than once */ + check->duration = ns_to_ms(now_ns - check->start); + check->start = 0; + } + + /* no change is expected if no state change occurred */ + if (check->result == CHK_RES_NEUTRAL) + return; + + /* If the check was really just sending an email, it won't have an + * associated server, so we're done now.
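 *
 * (Editor's worked example for the health accounting handled just below,
 *  assuming rise=2 and fall=3: check->health moves within [0..4]; each
 *  failure decrements it and snaps it to 0 once it drops below rise, and
 *  each success increments it up to rise+fall-1 = 4. A DOWN server thus
 *  needs 2 consecutive successes to be considered UP again, and a fully
 *  UP server needs 3 consecutive failures to be marked DOWN.)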
+ */ + if (!s) + return; + + switch (check->result) { + case CHK_RES_FAILED: + /* Failure to connect to the agent as a secondary check should not + * cause the server to be marked down. + */ + if ((!(check->state & CHK_ST_AGENT) || + (check->status >= HCHK_STATUS_L57DATA)) && + (check->health > 0)) { + _HA_ATOMIC_INC(&s->counters.failed_checks); + report = 1; + check->health--; + if (check->health < check->rise) + check->health = 0; + } + break; + + case CHK_RES_PASSED: + case CHK_RES_CONDPASS: + if (check->health < check->rise + check->fall - 1) { + report = 1; + check->health++; + + if (check->health >= check->rise) + check->health = check->rise + check->fall - 1; /* OK now */ + } + + /* clear consecutive_errors if observing is enabled */ + if (s->onerror) + HA_ATOMIC_STORE(&s->consecutive_errors, 0); + break; + + default: + break; + } + + if (report) + srv_event_hdl_publish_check(s, check); + + if (s->proxy->options2 & PR_O2_LOGHCHKS && report) { + chunk_printf(&trash, + "%s check for %sserver %s/%s %s%s", + (check->state & CHK_ST_AGENT) ? "Agent" : "Health", + s->flags & SRV_F_BACKUP ? "backup " : "", + s->proxy->id, s->id, + (check->result == CHK_RES_CONDPASS) ? "conditionally ":"", + (check->result >= CHK_RES_PASSED) ? "succeeded" : "failed"); + + check_append_info(&trash, check); + + chunk_appendf(&trash, ", status: %d/%d %s", + (check->health >= check->rise) ? check->health - check->rise + 1 : check->health, + (check->health >= check->rise) ? check->fall : check->rise, + (check->health >= check->rise) ? (s->uweight ? "UP" : "DRAIN") : "DOWN"); + + ha_warning("%s.\n", trash.area); + send_log(s->proxy, LOG_NOTICE, "%s.\n", trash.area); + send_email_alert(s, LOG_INFO, "%s", trash.area); + } +} + +static inline enum srv_op_st_chg_cause check_notify_cause(struct check *check) +{ + struct server *s = check->server; + + /* We only report a cause for the check if we did not do so previously */ + if (!s->track && !(s->proxy->options2 & PR_O2_LOGHCHKS)) + return (check->state & CHK_ST_AGENT) ? SRV_OP_STCHGC_AGENT : SRV_OP_STCHGC_HEALTH; + return SRV_OP_STCHGC_NONE; +} + +/* Marks the check <check>'s server down if the current check is already failed + * and the server is not down yet nor in maintenance. + */ +void check_notify_failure(struct check *check) +{ + struct server *s = check->server; + + /* The agent secondary check should only cause a server to be marked + * as down if check->status is HCHK_STATUS_L7STS, which indicates + * that the agent returned "fail", "stopped" or "down". + * The implication here is that failure to connect to the agent + * as a secondary check should not cause the server to be marked + * down. */ + if ((check->state & CHK_ST_AGENT) && check->status != HCHK_STATUS_L7STS) + return; + + if (check->health > 0) + return; + + TRACE_STATE("health-check failed, set server DOWN", CHK_EV_HCHK_END|CHK_EV_HCHK_ERR, check); + srv_set_stopped(s, check_notify_cause(check)); +} + +/* Marks the check <check> as valid and tries to set its server up, provided + * it isn't in maintenance, it is not tracking a down server and other checks + * comply. The rule is simple : by default, a server is up, unless any of the + * following conditions is true : + * - health check failed (check->health < rise) + * - agent check failed (agent->health < rise) + * - the server tracks a down server (track && track->state == STOPPED) + * Note that if the server has a slowstart, it will switch to STARTING instead + * of RUNNING. 
Also, only the health checks support the nolb mode, so the + * agent's success may not take the server out of this mode. + */ +void check_notify_success(struct check *check) +{ + struct server *s = check->server; + + if (s->next_admin & SRV_ADMF_MAINT) + return; + + if (s->track && s->track->next_state == SRV_ST_STOPPED) + return; + + if ((s->check.state & CHK_ST_ENABLED) && (s->check.health < s->check.rise)) + return; + + if ((s->agent.state & CHK_ST_ENABLED) && (s->agent.health < s->agent.rise)) + return; + + if ((check->state & CHK_ST_AGENT) && s->next_state == SRV_ST_STOPPING) + return; + + TRACE_STATE("health-check succeeded, set server RUNNING", CHK_EV_HCHK_END|CHK_EV_HCHK_SUCC, check); + srv_set_running(s, check_notify_cause(check)); +} + +/* Marks the check <check> as valid and tries to set its server into stopping mode + * if it was running or starting, and provided it isn't in maintenance and other + * checks comply. The conditions for the server to be marked in stopping mode are + * the same as for it to be turned up. Also, only the health checks support the + * nolb mode. + */ +void check_notify_stopping(struct check *check) +{ + struct server *s = check->server; + + if (s->next_admin & SRV_ADMF_MAINT) + return; + + if (check->state & CHK_ST_AGENT) + return; + + if (s->track && s->track->next_state == SRV_ST_STOPPED) + return; + + if ((s->check.state & CHK_ST_ENABLED) && (s->check.health < s->check.rise)) + return; + + if ((s->agent.state & CHK_ST_ENABLED) && (s->agent.health < s->agent.rise)) + return; + + TRACE_STATE("health-check conditionally succeeded, set server STOPPING", CHK_EV_HCHK_END|CHK_EV_HCHK_SUCC, check); + srv_set_stopping(s, check_notify_cause(check)); +} + +/* note: use health_adjust() only, which first checks that the observe mode is + * enabled. This will take the server lock if needed. + */ +void __health_adjust(struct server *s, short status) +{ + int failed; + + if (s->observe >= HANA_OBS_SIZE) + return; + + if (status >= HANA_STATUS_SIZE || !analyze_statuses[status].desc) + return; + + switch (analyze_statuses[status].lr[s->observe - 1]) { + case 1: + failed = 1; + break; + + case 2: + failed = 0; + break; + + default: + return; + } + + if (!failed) { + /* good: clear consecutive_errors */ + HA_ATOMIC_STORE(&s->consecutive_errors, 0); + return; + } + + if (HA_ATOMIC_ADD_FETCH(&s->consecutive_errors, 1) < s->consecutive_errors_limit) + return; + + chunk_printf(&trash, "Detected %d consecutive errors, last one was: %s", + HA_ATOMIC_LOAD(&s->consecutive_errors), get_analyze_status(status)); + + HA_SPIN_LOCK(SERVER_LOCK, &s->lock); + + /* force fastinter for upcoming check + * (does nothing if fastinter is not enabled) + */ + s->check.state |= CHK_ST_FASTINTER; + + switch (s->onerror) { + case HANA_ONERR_FASTINTER: + /* force fastinter - nothing to do here as all modes force it */ + break; + + case HANA_ONERR_SUDDTH: + /* simulate a pre-fatal failed health check */ + if (s->check.health > s->check.rise) + s->check.health = s->check.rise + 1; + + __fallthrough; + + case HANA_ONERR_FAILCHK: + /* simulate a failed health check */ + set_server_check_status(&s->check, HCHK_STATUS_HANA, + trash.area); + check_notify_failure(&s->check); + break; + + case HANA_ONERR_MARKDWN: + /* mark server down */ + s->check.health = s->check.rise; + set_server_check_status(&s->check, HCHK_STATUS_HANA, + trash.area); + check_notify_failure(&s->check); + break; + + default: + /* write a warning?
*/ + break; + } + + HA_SPIN_UNLOCK(SERVER_LOCK, &s->lock); + + HA_ATOMIC_STORE(&s->consecutive_errors, 0); + _HA_ATOMIC_INC(&s->counters.failed_hana); + + if (s->check.fastinter) { + /* timer might need to be advanced, it might also already be + * running in another thread. Let's just wake the task up, it + * will automatically adjust its timer. + */ + task_wakeup(s->check.task, TASK_WOKEN_MSG); + } +} + +/* Checks the connection. If an error has already been reported or the socket is + * closed, keep errno intact as it is supposed to contain the valid error code. + * If no error is reported, check the socket's error queue using getsockopt(). + * Warning, this must be done only once when returning from poll, and never + * after an I/O error was attempted, otherwise the error queue might contain + * inconsistent errors. If an error is detected, the CO_FL_ERROR is set on the + * socket. Returns non-zero if an error was reported, zero if everything is + * clean (including a properly closed socket). + */ +static int retrieve_errno_from_socket(struct connection *conn) +{ + int skerr; + socklen_t lskerr = sizeof(skerr); + + if (conn->flags & CO_FL_ERROR && (unclean_errno(errno) || !conn->ctrl)) + return 1; + + if (!conn_ctrl_ready(conn)) + return 0; + + BUG_ON(conn->flags & CO_FL_FDLESS); + + if (getsockopt(conn->handle.fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr) == 0) + errno = skerr; + + errno = unclean_errno(errno); + + if (!errno) { + /* we could not retrieve an error, that does not mean there is + * none. Just don't change anything and only report the prior + * error if any. + */ + if (conn->flags & CO_FL_ERROR) + return 1; + else + return 0; + } + + conn->flags |= CO_FL_ERROR | CO_FL_SOCK_WR_SH | CO_FL_SOCK_RD_SH; + return 1; +} + +/* Tries to collect as much information as possible on the connection status, + * and adjust the server status accordingly. It may make use of <errno_bck> + * if non-null when the caller is absolutely certain of its validity (eg: + * checked just after a syscall). If the caller doesn't have a valid errno, + * it can pass zero, and retrieve_errno_from_socket() will be called to try + * to extract errno from the socket. If no error is reported, it will consider + * the <expired> flag. This is intended to be used when a connection error was + * reported in conn->flags or when a timeout was reported in <expired>. The + * function takes care of not updating a server status which was already set. + * All situations where at least one of <expired> or CO_FL_ERROR are set + * produce a status. + */ +void chk_report_conn_err(struct check *check, int errno_bck, int expired) +{ + struct stconn *sc = check->sc; + struct connection *conn = sc_conn(sc); + const char *err_msg; + struct buffer *chk; + int step; + + if (check->result != CHK_RES_UNKNOWN) { + return; + } + + errno = unclean_errno(errno_bck); + if (conn && errno) + retrieve_errno_from_socket(conn); + + if (conn && !(conn->flags & CO_FL_ERROR) && !sc_ep_test(sc, SE_FL_ERROR) && !expired) + return; + + TRACE_ENTER(CHK_EV_HCHK_END|CHK_EV_HCHK_ERR, check, 0, 0, (size_t[]){expired}); + + /* we'll try to build a meaningful error message depending on the + * context of the error possibly present in conn->err_code, and the + * socket error possibly collected above. This is useful to know the + * exact step of the L6 layer (eg: SSL handshake). 
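+ * For instance a TLS failure during an SSL tcp-check connect would + * typically yield a message such as + * "SSL handshake failure at step 1 of tcp-check (connect port 443)".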
+ */ + chk = get_trash_chunk(); + + if (check->type == PR_O2_TCPCHK_CHK && + (check->tcpcheck_rules->flags & TCPCHK_RULES_PROTO_CHK) == TCPCHK_RULES_TCP_CHK) { + step = tcpcheck_get_step_id(check, NULL); + if (!step) { + TRACE_DEVEL("initial connection failure", CHK_EV_HCHK_END|CHK_EV_HCHK_ERR, check); + chunk_printf(chk, " at initial connection step of tcp-check"); + } + else { + chunk_printf(chk, " at step %d of tcp-check", step); + /* we were looking for a string */ + if (check->current_step && check->current_step->action == TCPCHK_ACT_CONNECT) { + if (check->current_step->connect.port) + chunk_appendf(chk, " (connect port %d)" ,check->current_step->connect.port); + else + chunk_appendf(chk, " (connect)"); + TRACE_DEVEL("connection failure", CHK_EV_HCHK_END|CHK_EV_HCHK_ERR, check); + } + else if (check->current_step && check->current_step->action == TCPCHK_ACT_EXPECT) { + struct tcpcheck_expect *expect = &check->current_step->expect; + + switch (expect->type) { + case TCPCHK_EXPECT_STRING: + chunk_appendf(chk, " (expect string '%.*s')", (unsigned int)istlen(expect->data), istptr(expect->data)); + break; + case TCPCHK_EXPECT_BINARY: + chunk_appendf(chk, " (expect binary '"); + dump_binary(chk, istptr(expect->data), (int)istlen(expect->data)); + chunk_appendf(chk, "')"); + break; + case TCPCHK_EXPECT_STRING_REGEX: + chunk_appendf(chk, " (expect regex)"); + break; + case TCPCHK_EXPECT_BINARY_REGEX: + chunk_appendf(chk, " (expect binary regex)"); + break; + case TCPCHK_EXPECT_STRING_LF: + chunk_appendf(chk, " (expect log-format string)"); + break; + case TCPCHK_EXPECT_BINARY_LF: + chunk_appendf(chk, " (expect log-format binary)"); + break; + case TCPCHK_EXPECT_HTTP_STATUS: + chunk_appendf(chk, " (expect HTTP status codes)"); + break; + case TCPCHK_EXPECT_HTTP_STATUS_REGEX: + chunk_appendf(chk, " (expect HTTP status regex)"); + break; + case TCPCHK_EXPECT_HTTP_HEADER: + chunk_appendf(chk, " (expect HTTP header pattern)"); + break; + case TCPCHK_EXPECT_HTTP_BODY: + chunk_appendf(chk, " (expect HTTP body content '%.*s')", (unsigned int)istlen(expect->data), istptr(expect->data)); + break; + case TCPCHK_EXPECT_HTTP_BODY_REGEX: + chunk_appendf(chk, " (expect HTTP body regex)"); + break; + case TCPCHK_EXPECT_HTTP_BODY_LF: + chunk_appendf(chk, " (expect log-format HTTP body)"); + break; + case TCPCHK_EXPECT_CUSTOM: + chunk_appendf(chk, " (expect custom function)"); + break; + case TCPCHK_EXPECT_UNDEF: + chunk_appendf(chk, " (undefined expect!)"); + break; + } + TRACE_DEVEL("expect rule failed", CHK_EV_HCHK_END|CHK_EV_HCHK_ERR, check); + } + else if (check->current_step && check->current_step->action == TCPCHK_ACT_SEND) { + chunk_appendf(chk, " (send)"); + TRACE_DEVEL("send rule failed", CHK_EV_HCHK_END|CHK_EV_HCHK_ERR, check); + } + + if (check->current_step && check->current_step->comment) + chunk_appendf(chk, " comment: '%s'", check->current_step->comment); + } + } + + if (conn && conn->err_code) { + if (unclean_errno(errno)) + chunk_printf(&trash, "%s (%s)%s", conn_err_code_str(conn), strerror(errno), + chk->area); + else + chunk_printf(&trash, "%s%s", conn_err_code_str(conn), + chk->area); + err_msg = trash.area; + } + else { + if (unclean_errno(errno)) { + chunk_printf(&trash, "%s%s", strerror(errno), + chk->area); + err_msg = trash.area; + } + else { + err_msg = chk->area; + } + } + + if (check->state & CHK_ST_PORT_MISS) { + /* NOTE: this is reported after <fall> tries */ + set_server_check_status(check, HCHK_STATUS_SOCKERR, err_msg); + } + + if (!conn || !conn->ctrl) { + /* error before any 
connection attempt (connection allocation error or no control layer) */ + set_server_check_status(check, HCHK_STATUS_SOCKERR, err_msg); + } + else if (conn->flags & CO_FL_WAIT_L4_CONN) { + /* L4 not established (yet) */ + if (conn->flags & CO_FL_ERROR || sc_ep_test(sc, SE_FL_ERROR)) + set_server_check_status(check, HCHK_STATUS_L4CON, err_msg); + else if (expired) + set_server_check_status(check, HCHK_STATUS_L4TOUT, err_msg); + + /* + * might be due to a server IP change. + * Let's trigger a DNS resolution if none are currently running. + */ + if (check->server) + resolv_trigger_resolution(check->server->resolv_requester); + + } + else if (conn->flags & CO_FL_WAIT_L6_CONN) { + /* L6 not established (yet) */ + if (conn->flags & CO_FL_ERROR || sc_ep_test(sc, SE_FL_ERROR)) + set_server_check_status(check, HCHK_STATUS_L6RSP, err_msg); + else if (expired) + set_server_check_status(check, HCHK_STATUS_L6TOUT, err_msg); + } + else if (conn->flags & CO_FL_ERROR || sc_ep_test(sc, SE_FL_ERROR)) { + /* I/O error after connection was established and before we could diagnose */ + set_server_check_status(check, HCHK_STATUS_SOCKERR, err_msg); + } + else if (expired) { + enum healthcheck_status tout = HCHK_STATUS_L7TOUT; + + /* connection established but expired check */ + if (check->current_step && check->current_step->action == TCPCHK_ACT_EXPECT && + check->current_step->expect.tout_status != HCHK_STATUS_UNKNOWN) + tout = check->current_step->expect.tout_status; + set_server_check_status(check, tout, err_msg); + } + + TRACE_LEAVE(CHK_EV_HCHK_END|CHK_EV_HCHK_ERR, check); + return; +} + + +/* Builds the server state header used by HTTP health-checks */ +int httpchk_build_status_header(struct server *s, struct buffer *buf) +{ + int sv_state; + int ratio; + char addr[46]; + char port[6]; + const char *srv_hlt_st[7] = { "DOWN", "DOWN %d/%d", + "UP %d/%d", "UP", + "NOLB %d/%d", "NOLB", + "no check" }; + + if (!(s->check.state & CHK_ST_ENABLED)) + sv_state = 6; + else if (s->cur_state != SRV_ST_STOPPED) { + if (s->check.health == s->check.rise + s->check.fall - 1) + sv_state = 3; /* UP */ + else + sv_state = 2; /* going down */ + + if (s->cur_state == SRV_ST_STOPPING) + sv_state += 2; + } else { + if (s->check.health) + sv_state = 1; /* going up */ + else + sv_state = 0; /* DOWN */ + } + + chunk_appendf(buf, srv_hlt_st[sv_state], + (s->cur_state != SRV_ST_STOPPED) ? (s->check.health - s->check.rise + 1) : (s->check.health), + (s->cur_state != SRV_ST_STOPPED) ? 
(s->check.fall) : (s->check.rise)); + + addr_to_str(&s->addr, addr, sizeof(addr)); + if (s->addr.ss_family == AF_INET || s->addr.ss_family == AF_INET6) + snprintf(port, sizeof(port), "%u", s->svc_port); + else + *port = 0; + + chunk_appendf(buf, "; address=%s; port=%s; name=%s/%s; node=%s; weight=%d/%d; scur=%d/%d; qcur=%d", + addr, port, s->proxy->id, s->id, + global.node, + (s->cur_eweight * s->proxy->lbprm.wmult + s->proxy->lbprm.wdiv - 1) / s->proxy->lbprm.wdiv, + (s->proxy->lbprm.tot_weight * s->proxy->lbprm.wmult + s->proxy->lbprm.wdiv - 1) / s->proxy->lbprm.wdiv, + s->cur_sess, s->proxy->beconn - s->proxy->queue.length, + s->queue.length); + + if ((s->cur_state == SRV_ST_STARTING) && + ns_to_sec(now_ns) < s->last_change + s->slowstart && + ns_to_sec(now_ns) >= s->last_change) { + ratio = MAX(1, 100 * (ns_to_sec(now_ns) - s->last_change) / s->slowstart); + chunk_appendf(buf, "; throttle=%d%%", ratio); + } + + return b_data(buf); +} + +/**************************************************************************/ +/***************** Health-checks based on connections *********************/ +/**************************************************************************/ +/* This function is used only for server health-checks. It handles connection + * status updates including errors. If necessary, it wakes the check task up. + * It returns 0 on normal cases, <0 if at least one close() has happened on the + * connection (eg: reconnect). It relies on tcpcheck_main(). + */ +int wake_srv_chk(struct stconn *sc) +{ + struct connection *conn; + struct check *check = __sc_check(sc); + struct email_alertq *q = container_of(check, typeof(*q), check); + int ret = 0; + + TRACE_ENTER(CHK_EV_HCHK_WAKE, check); + if (check->result != CHK_RES_UNKNOWN) + goto end; + + if (check->server) + HA_SPIN_LOCK(SERVER_LOCK, &check->server->lock); + else + HA_SPIN_LOCK(EMAIL_ALERTS_LOCK, &q->lock); + + /* we may have to make progress on the TCP checks */ + ret = tcpcheck_main(check); + + sc = check->sc; + conn = sc_conn(sc); + + if (unlikely(!conn || conn->flags & CO_FL_ERROR || sc_ep_test(sc, SE_FL_ERROR))) { + /* We may get error reports bypassing the I/O handlers, typically + * the case when sending a pure TCP check which fails, then the I/O + * handlers above are not called. This is completely handled by the + * main processing task so let's simply wake it up. If we get here, + * we expect errno to still be valid. + */ + TRACE_ERROR("report connection error", CHK_EV_HCHK_WAKE|CHK_EV_HCHK_END|CHK_EV_HCHK_ERR, check); + chk_report_conn_err(check, errno, 0); + task_wakeup(check->task, TASK_WOKEN_IO); + } + + if (check->result != CHK_RES_UNKNOWN || ret == -1) { + /* Check complete or aborted. Wake the check task up to be sure + * the result is handled ASAP. */ + ret = -1; + task_wakeup(check->task, TASK_WOKEN_IO); + } + + if (check->server) + HA_SPIN_UNLOCK(SERVER_LOCK, &check->server->lock); + else + HA_SPIN_UNLOCK(EMAIL_ALERTS_LOCK, &q->lock); + + end: + TRACE_LEAVE(CHK_EV_HCHK_WAKE, check); + return ret; +} + +/* This function checks if any I/O is wanted, and if so, attempts to do so */ +struct task *srv_chk_io_cb(struct task *t, void *ctx, unsigned int state) +{ + struct stconn *sc = ctx; + + wake_srv_chk(sc); + return NULL; +} + +/* returns <0, 0, >0 if check thread 1 is respectively less loaded than, + * equally as, or more loaded than thread 2. This is made to decide on + * migrations so a margin is applied in either direction. For ease of + * remembering the direction, consider this returns load1 - load2. 
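+ * e.g. with 3 active checks on thr1 against 8 on thr2, the first test + * (3 * 2 < 8) reports thr1 as clearly less loaded and returns -1.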
+ */ +static inline int check_thread_cmp_load(int thr1, int thr2) +{ + uint t1_load = _HA_ATOMIC_LOAD(&ha_thread_ctx[thr1].rq_total); + uint t1_act = _HA_ATOMIC_LOAD(&ha_thread_ctx[thr1].active_checks); + uint t2_load = _HA_ATOMIC_LOAD(&ha_thread_ctx[thr2].rq_total); + uint t2_act = _HA_ATOMIC_LOAD(&ha_thread_ctx[thr2].active_checks); + + /* twice as many active checks is a significant difference */ + if (t1_act * 2 < t2_act) + return -1; + + if (t2_act * 2 < t1_act) + return 1; + + /* twice the rqload with at least as many checks is also a significant + * difference. + */ + if (t1_act <= t2_act && t1_load * 2 < t2_load) + return -1; + + if (t2_act <= t1_act && t2_load * 2 < t1_load) + return 1; + + /* otherwise they're roughly equal */ + return 0; +} + +/* returns <0, 0, >0 if check thread 1's active checks count is respectively + * higher than, equal to, or lower than thread 2's. This is made to decide on + * forced migrations upon overload, so only a very small margin is applied + * here (~1%). For ease of remembering the direction, consider this returns + * active1 - active2. + */ +static inline int check_thread_cmp_active(int thr1, int thr2) +{ + uint t1_act = _HA_ATOMIC_LOAD(&ha_thread_ctx[thr1].active_checks); + uint t2_act = _HA_ATOMIC_LOAD(&ha_thread_ctx[thr2].active_checks); + + if (t1_act * 128 >= t2_act * 129) + return 1; + if (t2_act * 128 >= t1_act * 129) + return -1; + return 0; +} + + +/* manages a server health-check that uses a connection. Returns + * the time the task accepts to wait, or TIME_ETERNITY for infinity. + * + * Please do NOT place any return statement in this function and only leave + * via the out_unlock label. + */ +struct task *process_chk_conn(struct task *t, void *context, unsigned int state) +{ + struct check *check = context; + struct proxy *proxy = check->proxy; + struct stconn *sc; + struct connection *conn; + int rv; + int expired = tick_is_expired(t->expire, now_ms); + + TRACE_ENTER(CHK_EV_TASK_WAKE, check); + + if (check->state & CHK_ST_SLEEPING) { + /* This check just restarted. It's still time to verify if + * we're on an overloaded thread or if a more suitable one is + * available. This helps spread the load over the available + * threads, without migrating too often. For this we'll check + * our load, and pick a random thread, check if it has less + * than half of the current thread's load, and if so we'll + * bounce the task there. It's possible because it's not yet + * tied to the current thread. The other thread will not bounce + * the task again because we're setting CHK_ST_READY indicating + * a migration. + */ + uint run_checks = _HA_ATOMIC_LOAD(&th_ctx->running_checks); + uint my_load = HA_ATOMIC_LOAD(&th_ctx->rq_total); + uint attempts = MIN(global.nbthread, 3); + + if (check->state & CHK_ST_READY) { + /* check was migrated, active already counted */ + activity[tid].check_adopted++; + } + else { + /* first wakeup, let's check if another thread is less loaded + * than this one in order to smooth the load. If the current + * thread is not yet overloaded, we attempt an opportunistic + * migration to another thread that is not full and that is + * significantly less loaded. And if the current thread is + * already overloaded, we attempt a forced migration to a + * thread with less active checks. We try at most 3 random + * other threads.
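+ * e.g. a thread running 12 active checks with rq_total 40 will hand a + * waking check over to a randomly probed peer running only 5 active + * checks with rq_total 15, that peer being both less active and + * significantly less loaded.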
+ */ + while (attempts-- > 0 && + (!LIST_ISEMPTY(&th_ctx->queued_checks) || my_load >= 3) && + _HA_ATOMIC_LOAD(&th_ctx->active_checks) >= 3) { + uint new_tid = statistical_prng_range(global.nbthread); + + if (new_tid == tid) + continue; + + ALREADY_CHECKED(new_tid); + + if (check_thread_cmp_active(tid, new_tid) > 0 && + (run_checks >= global.tune.max_checks_per_thread || + check_thread_cmp_load(tid, new_tid) > 0)) { + /* Found one. Let's migrate the task over there. We have to + * remove it from the WQ first and kill its expire time + * otherwise the scheduler will reinsert it and trigger a + * BUG_ON() as we're not allowed to call task_queue() for a + * foreign thread. The recipient will restore the expiration. + */ + check->state |= CHK_ST_READY; + HA_ATOMIC_INC(&ha_thread_ctx[new_tid].active_checks); + task_unlink_wq(t); + t->expire = TICK_ETERNITY; + task_set_thread(t, new_tid); + task_wakeup(t, TASK_WOKEN_MSG); + TRACE_LEAVE(CHK_EV_TASK_WAKE, check); + return t; + } + } + /* check just woke up, count it as active */ + _HA_ATOMIC_INC(&th_ctx->active_checks); + } + + /* OK we're keeping it so this check is ours now */ + task_set_thread(t, tid); + check->state &= ~CHK_ST_SLEEPING; + + /* if we just woke up and the thread is full of running, or + * already has others waiting, we might have to wait in queue + * (for health checks only). This means !SLEEPING && !READY. + */ + if (check->server && + (!LIST_ISEMPTY(&th_ctx->queued_checks) || + (global.tune.max_checks_per_thread && + _HA_ATOMIC_LOAD(&th_ctx->running_checks) >= global.tune.max_checks_per_thread))) { + TRACE_DEVEL("health-check queued", CHK_EV_TASK_WAKE, check); + t->expire = TICK_ETERNITY; + LIST_APPEND(&th_ctx->queued_checks, &check->check_queue); + + /* reset fastinter flag (if set) so that srv_getinter() + * only returns fastinter if server health is degraded + */ + check->state &= ~CHK_ST_FASTINTER; + goto out_leave; + } + + /* OK let's run, now we cannot roll back anymore */ + check->state |= CHK_ST_READY; + activity[tid].check_started++; + _HA_ATOMIC_INC(&th_ctx->running_checks); + } + + /* at this point, CHK_ST_SLEEPING = 0 and CHK_ST_READY = 1*/ + + if (check->server) + HA_SPIN_LOCK(SERVER_LOCK, &check->server->lock); + + if (!(check->state & (CHK_ST_INPROGRESS|CHK_ST_IN_ALLOC|CHK_ST_OUT_ALLOC))) { + /* This task might have bounced from another overloaded thread, it + * needs an expiration timer that was supposed to be now, but that + * was erased during the bounce. + */ + if (!tick_isset(t->expire)) { + t->expire = now_ms; + expired = 0; + } + } + + if (unlikely(check->state & CHK_ST_PURGE)) { + TRACE_STATE("health-check state to purge", CHK_EV_TASK_WAKE, check); + } + else if (!(check->state & (CHK_ST_INPROGRESS))) { + /* no check currently running, but we might have been woken up + * before the timer's expiration to update it according to a + * new state (e.g. fastinter), in which case we'll reprogram + * the new timer. + */ + if (!tick_is_expired(t->expire, now_ms)) { /* woke up too early */ + if (check->server) { + int new_exp = tick_add(now_ms, MS_TO_TICKS(srv_getinter(check))); + + if (tick_is_expired(new_exp, t->expire)) { + TRACE_STATE("health-check was advanced", CHK_EV_TASK_WAKE, check); + goto update_timer; + } + } + + TRACE_STATE("health-check wake up too early", CHK_EV_TASK_WAKE, check); + goto out_unlock; + } + + /* we don't send any health-checks when the proxy is + * stopped, the server should not be checked or the check + * is disabled. 
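+ * (typically after a soft-stop of the proxy, or when the checks were + * paused because the server was put into maintenance)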
+ */ + if (((check->state & (CHK_ST_ENABLED | CHK_ST_PAUSED)) != CHK_ST_ENABLED) || + (proxy->flags & (PR_FL_DISABLED|PR_FL_STOPPED))) { + TRACE_STATE("health-check paused or disabled", CHK_EV_TASK_WAKE, check); + goto reschedule; + } + + /* we'll initiate a new check */ + set_server_check_status(check, HCHK_STATUS_START, NULL); + + check->state |= CHK_ST_INPROGRESS; + TRACE_STATE("init new health-check", CHK_EV_TASK_WAKE|CHK_EV_HCHK_START, check); + + check->current_step = NULL; + + check->sc = sc_new_from_check(check, SC_FL_NONE); + if (!check->sc) { + set_server_check_status(check, HCHK_STATUS_SOCKERR, NULL); + goto end; + } + tcpcheck_main(check); + expired = 0; + } + + /* there was a test running. + * First, let's check whether there was an uncaught error, + * which can happen on connect timeout or error. + */ + if (check->result == CHK_RES_UNKNOWN && likely(!(check->state & CHK_ST_PURGE))) { + sc = check->sc; + conn = sc_conn(sc); + + /* Here the connection must be defined. Otherwise the + * error would have already been detected + */ + if ((conn && ((conn->flags & CO_FL_ERROR) || sc_ep_test(sc, SE_FL_ERROR))) || expired) { + TRACE_ERROR("report connection error", CHK_EV_TASK_WAKE|CHK_EV_HCHK_END|CHK_EV_HCHK_ERR, check); + chk_report_conn_err(check, 0, expired); + } + else { + if (check->state & CHK_ST_CLOSE_CONN) { + TRACE_DEVEL("closing current connection", CHK_EV_TASK_WAKE|CHK_EV_HCHK_RUN, check); + check->state &= ~CHK_ST_CLOSE_CONN; + if (!sc_reset_endp(check->sc)) { + /* error will be handled by tcpcheck_main(). + * On success, remove all flags except SE_FL_DETACHED + */ + sc_ep_clr(check->sc, ~SE_FL_DETACHED); + } + tcpcheck_main(check); + } + if (check->result == CHK_RES_UNKNOWN) { + TRACE_DEVEL("health-check not expired", CHK_EV_TASK_WAKE|CHK_EV_HCHK_RUN, check); + goto out_unlock; /* timeout not reached, wait again */ + } + } + } + + /* check complete or aborted */ + TRACE_STATE("health-check complete or aborted", CHK_EV_TASK_WAKE|CHK_EV_HCHK_END, check); + + /* check->sc may be NULL when the healthcheck is purged */ + check->current_step = NULL; + sc = check->sc; + conn = (sc ? sc_conn(sc) : NULL); + + if (conn && conn->xprt) { + /* The check was aborted and the connection was not yet closed. + * This can happen upon timeout, or when an external event such + * as a failed response coupled with "observe layer7" caused the + * server state to be suddenly changed. 
+ */ + sc_conn_drain_and_shut(sc); + } + + if (sc) { + sc_destroy(sc); + check->sc = NULL; + } + + if (check->sess != NULL) { + vars_prune(&check->vars, check->sess, NULL); + session_free(check->sess); + check->sess = NULL; + } + + end: + if (check->server && likely(!(check->state & CHK_ST_PURGE))) { + if (check->result == CHK_RES_FAILED) { + /* a failure or timeout detected */ + TRACE_DEVEL("report failure", CHK_EV_TASK_WAKE|CHK_EV_HCHK_END|CHK_EV_HCHK_ERR, check); + check_notify_failure(check); + } + else if (check->result == CHK_RES_CONDPASS) { + /* check is OK but asks for stopping mode */ + TRACE_DEVEL("report conditional success", CHK_EV_TASK_WAKE|CHK_EV_HCHK_END|CHK_EV_HCHK_SUCC, check); + check_notify_stopping(check); + } + else if (check->result == CHK_RES_PASSED) { + /* a success was detected */ + TRACE_DEVEL("report success", CHK_EV_TASK_WAKE|CHK_EV_HCHK_END|CHK_EV_HCHK_SUCC, check); + check_notify_success(check); + } + } + + if (LIST_INLIST(&check->buf_wait.list)) + LIST_DEL_INIT(&check->buf_wait.list); + + check_release_buf(check, &check->bi); + check_release_buf(check, &check->bo); + _HA_ATOMIC_DEC(&th_ctx->running_checks); + _HA_ATOMIC_DEC(&th_ctx->active_checks); + check->state &= ~(CHK_ST_INPROGRESS|CHK_ST_IN_ALLOC|CHK_ST_OUT_ALLOC); + check->state &= ~CHK_ST_READY; + check->state |= CHK_ST_SLEEPING; + + update_timer: + /* when going to sleep, we need to check if other checks are waiting + * for a slot. If so we pick them out of the queue and wake them up. + */ + if (check->server && (check->state & CHK_ST_SLEEPING)) { + if (!LIST_ISEMPTY(&th_ctx->queued_checks) && + _HA_ATOMIC_LOAD(&th_ctx->running_checks) < global.tune.max_checks_per_thread) { + struct check *next_chk = LIST_ELEM(th_ctx->queued_checks.n, struct check *, check_queue); + + /* wake up pending task */ + LIST_DEL_INIT(&next_chk->check_queue); + + activity[tid].check_started++; + _HA_ATOMIC_INC(&th_ctx->running_checks); + next_chk->state |= CHK_ST_READY; + /* now running */ + task_wakeup(next_chk->task, TASK_WOKEN_RES); + } + } + + if (check->server) { + rv = 0; + if (global.spread_checks > 0) { + rv = srv_getinter(check) * global.spread_checks / 100; + rv -= (int) (2 * rv * (statistical_prng() / 4294967295.0)); + } + t->expire = tick_add(now_ms, MS_TO_TICKS(srv_getinter(check) + rv)); + /* reset fastinter flag (if set) so that srv_getinter() + * only returns fastinter if server health is degraded + */ + check->state &= ~CHK_ST_FASTINTER; + } + + reschedule: + if (proxy->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) + t->expire = TICK_ETERNITY; + else { + while (tick_is_expired(t->expire, now_ms)) + t->expire = tick_add(t->expire, MS_TO_TICKS(check->inter)); + } + + out_unlock: + if (check->server) + HA_SPIN_UNLOCK(SERVER_LOCK, &check->server->lock); + + out_leave: + TRACE_LEAVE(CHK_EV_TASK_WAKE, check); + + /* Free the check if set to PURGE. After this, the check instance may be + * freed via the srv_drop invocation, so it must not be accessed after + * this point. + */ + if (unlikely(check->state & CHK_ST_PURGE)) { + free_check(check); + if (check->server) + srv_drop(check->server); + + t = NULL; + } + + return t; +} + + +/**************************************************************************/ +/************************** Init/deinit checks ****************************/ +/**************************************************************************/ +/* + * Tries to grab a buffer and to re-enable processing on check <target>. The + * check flags are used to figure out which buffer was requested.
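+ * (CHK_ST_IN_ALLOC requests the input buffer <bi>, CHK_ST_OUT_ALLOC the + * output buffer <bo>.)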
It returns 1 if the + * allocation succeeds, in which case the I/O tasklet is woken up, or 0 if it's + * impossible to wake up and we prefer to be woken up later. + */ +int check_buf_available(void *target) +{ + struct check *check = target; + + BUG_ON(!check->sc); + + if ((check->state & CHK_ST_IN_ALLOC) && b_alloc(&check->bi)) { + TRACE_STATE("unblocking check, input buffer allocated", CHK_EV_TCPCHK_EXP|CHK_EV_RX_BLK, check); + check->state &= ~CHK_ST_IN_ALLOC; + tasklet_wakeup(check->sc->wait_event.tasklet); + return 1; + } + if ((check->state & CHK_ST_OUT_ALLOC) && b_alloc(&check->bo)) { + TRACE_STATE("unblocking check, output buffer allocated", CHK_EV_TCPCHK_SND|CHK_EV_TX_BLK, check); + check->state &= ~CHK_ST_OUT_ALLOC; + tasklet_wakeup(check->sc->wait_event.tasklet); + return 1; + } + + return 0; +} + +/* + * Allocate a buffer. If it fails, it adds the check in buffer wait queue. + */ +struct buffer *check_get_buf(struct check *check, struct buffer *bptr) +{ + struct buffer *buf = NULL; + + if (likely(!LIST_INLIST(&check->buf_wait.list)) && + unlikely((buf = b_alloc(bptr)) == NULL)) { + check->buf_wait.target = check; + check->buf_wait.wakeup_cb = check_buf_available; + LIST_APPEND(&th_ctx->buffer_wq, &check->buf_wait.list); + } + return buf; +} + +/* + * Release a buffer, if any, and try to wake up entities waiting in the buffer + * wait queue. + */ +void check_release_buf(struct check *check, struct buffer *bptr) +{ + if (bptr->size) { + b_free(bptr); + offer_buffers(check->buf_wait.target, 1); + } +} + +const char *init_check(struct check *check, int type) +{ + check->type = type; + + check->bi = BUF_NULL; + check->bo = BUF_NULL; + LIST_INIT(&check->buf_wait.list); + LIST_INIT(&check->check_queue); + return NULL; +} + +/* Liberates the resources allocated for a check. + * + * This function must only be run by the thread owning the check. + */ +void free_check(struct check *check) +{ + /* For agent-check, free the rules / vars from the server. This is not + * done for health-check : the proxy is the owner of the rules / vars + * in this case. + */ + if (check->state & CHK_ST_AGENT) { + free_tcpcheck_vars(&check->tcpcheck_rules->preset_vars); + ha_free(&check->tcpcheck_rules); + } + + task_destroy(check->task); + + check_release_buf(check, &check->bi); + check_release_buf(check, &check->bo); + if (check->sc) { + sc_destroy(check->sc); + check->sc = NULL; + } +} + +/* This function must be used in order to free a started check. The check will + * be scheduled for a next execution in order to properly close and free all + * check elements. + * + * Non thread-safe. + */ +void check_purge(struct check *check) +{ + check->state |= CHK_ST_PURGE; + task_wakeup(check->task, TASK_WOKEN_OTHER); +} + +/* manages a server health-check. Returns the time the task accepts to wait, or + * TIME_ETERNITY for infinity. + */ +struct task *process_chk(struct task *t, void *context, unsigned int state) +{ + struct check *check = context; + + if (check->type == PR_O2_EXT_CHK) + return process_chk_proc(t, context, state); + return process_chk_conn(t, context, state); + +} + + +int start_check_task(struct check *check, int mininter, + int nbcheck, int srvpos) +{ + struct task *t; + + /* task for the check. Process-based checks exclusively run on thread 1. 
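(thread numbering is zero-based internally, hence task_new_on(0) below)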
*/ + if (check->type == PR_O2_EXT_CHK) + t = task_new_on(0); + else + t = task_new_anywhere(); + + if (!t) + goto fail_alloc_task; + + check->task = t; + t->process = process_chk; + t->context = check; + + if (mininter < srv_getinter(check)) + mininter = srv_getinter(check); + + if (global.spread_checks > 0) { + int rnd; + + rnd = srv_getinter(check) * global.spread_checks / 100; + rnd -= (int) (2 * rnd * (ha_random32() / 4294967295.0)); + mininter += rnd; + } + + if (global.max_spread_checks && mininter > global.max_spread_checks) + mininter = global.max_spread_checks; + + /* check this every ms */ + t->expire = tick_add(now_ms, MS_TO_TICKS(mininter * srvpos / nbcheck)); + check->start = now_ns; + task_queue(t); + + return 1; + + fail_alloc_task: + ha_alert("Starting [%s:%s] check: out of memory.\n", + check->server->proxy->id, check->server->id); + return 0; +} + +/* + * Start health-check. + * Returns 0 if OK, ERR_FATAL on error, and prints the error in this case. + */ +static int start_checks() +{ + + struct proxy *px; + struct server *s; + int nbcheck=0, mininter=0, srvpos=0; + + /* 0- init the dummy frontend used to create all checks sessions */ + init_new_proxy(&checks_fe); + checks_fe.id = strdup("CHECKS-FE"); + checks_fe.cap = PR_CAP_FE | PR_CAP_BE; + checks_fe.mode = PR_MODE_TCP; + checks_fe.maxconn = 0; + checks_fe.conn_retries = CONN_RETRIES; + checks_fe.options2 |= PR_O2_INDEPSTR | PR_O2_SMARTCON | PR_O2_SMARTACC; + checks_fe.timeout.client = TICK_ETERNITY; + + /* 1- count the checkers to run simultaneously. + * We also determine the minimum interval among all of those which + * have an interval larger than SRV_CHK_INTER_THRES. This interval + * will be used to spread their start-up date. Those which have + * a shorter interval will start independently and will not dictate + * too short an interval for all others. + */ + for (px = proxies_list; px; px = px->next) { + for (s = px->srv; s; s = s->next) { + if (s->check.state & CHK_ST_CONFIGURED) { + nbcheck++; + if ((srv_getinter(&s->check) >= SRV_CHK_INTER_THRES) && + (!mininter || mininter > srv_getinter(&s->check))) + mininter = srv_getinter(&s->check); + } + + if (s->agent.state & CHK_ST_CONFIGURED) { + nbcheck++; + if ((srv_getinter(&s->agent) >= SRV_CHK_INTER_THRES) && + (!mininter || mininter > srv_getinter(&s->agent))) + mininter = srv_getinter(&s->agent); + } + } + } + + if (!nbcheck) + return ERR_NONE; + + srand((unsigned)time(NULL)); + + /* 2- start them as far as possible from each other. For this, we will + * start them after their interval is set to the min interval divided + * by the number of servers, weighted by the server's position in the + * list. 
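+ * e.g. with nbcheck=4 checks and mininter=2000ms, and ignoring the + * spread-checks jitter and each check's own interval floor, the first + * runs are scheduled at 0, 500, 1000 and 1500 ms + * (mininter * srvpos / nbcheck).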
+ */ + for (px = proxies_list; px; px = px->next) { + if ((px->options2 & PR_O2_CHK_ANY) == PR_O2_EXT_CHK) { + if (init_pid_list()) { + ha_alert("Starting [%s] check: out of memory.\n", px->id); + return ERR_ALERT | ERR_FATAL; + } + } + + for (s = px->srv; s; s = s->next) { + /* A task for the main check */ + if (s->check.state & CHK_ST_CONFIGURED) { + if (s->check.type == PR_O2_EXT_CHK) { + if (!prepare_external_check(&s->check)) + return ERR_ALERT | ERR_FATAL; + } + if (!start_check_task(&s->check, mininter, nbcheck, srvpos)) + return ERR_ALERT | ERR_FATAL; + srvpos++; + } + + /* A task for an auxiliary agent check */ + if (s->agent.state & CHK_ST_CONFIGURED) { + if (!start_check_task(&s->agent, mininter, nbcheck, srvpos)) { + return ERR_ALERT | ERR_FATAL; + } + srvpos++; + } + } + } + return ERR_NONE; +} + + +/* + * Return value: + * the port to be used for the health check + * 0 in case no port could be found for the check + */ +static int srv_check_healthcheck_port(struct check *chk) +{ + int i = 0; + struct server *srv = NULL; + + srv = chk->server; + + /* by default, we use the health check port configured */ + if (chk->port > 0) + return chk->port; + + /* try to get the port from check_core.addr if check.port not set */ + i = get_host_port(&chk->addr); + if (i > 0) + return i; + + /* try to get the port from server address */ + /* prevent MAPPORTS from working at this point, since checks could + * not be performed in such case (MAPPORTS imposes relative ports + * based on live traffic) + */ + if (srv->flags & SRV_F_MAPPORTS) + return 0; + + i = srv->svc_port; /* by default */ + if (i > 0) + return i; + + return 0; +} + +/* Initializes a health-check attached to the server <srv>. Non-zero is returned + * if an error occurred. + */ +int init_srv_check(struct server *srv) +{ + const char *err; + struct tcpcheck_rule *r; + int ret = ERR_NONE; + int check_type; + + if (!srv->do_check || !(srv->proxy->cap & PR_CAP_BE)) + goto out; + + check_type = srv->check.tcpcheck_rules->flags & TCPCHK_RULES_PROTO_CHK; + + if (!(srv->flags & SRV_F_DYNAMIC)) { + /* If neither a port nor an addr was specified and no check + * transport layer is forced, then the transport layer used by + * the checks is the same as for the production traffic. + * Otherwise we use raw_sock by default, unless one is + * specified. + */ + if (!srv->check.port && !is_addr(&srv->check.addr)) { + if (!srv->check.use_ssl && srv->use_ssl != -1) { + srv->check.use_ssl = srv->use_ssl; + srv->check.xprt = srv->xprt; + } + else if (srv->check.use_ssl == 1) + srv->check.xprt = xprt_get(XPRT_SSL); + srv->check.send_proxy |= (srv->pp_opts); + } + else if (srv->check.use_ssl == 1) + srv->check.xprt = xprt_get(XPRT_SSL); + } + else { + /* For dynamic servers, check-ssl and check-send-proxy must be + * explicitly defined even if the check port was not + * overridden.
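+ * For example an "add server" CLI line creating a server whose checks + * must use TLS would presumably need to carry an explicit 'check-ssl', + * since nothing is inherited from the regular traffic settings here.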
+ */ + if (srv->check.use_ssl == 1) + srv->check.xprt = xprt_get(XPRT_SSL); + } + + /* Inherit the mux protocol from the server if not already defined for + * the check + */ + if (srv->mux_proto && !srv->check.mux_proto && + ((srv->mux_proto->mode == PROTO_MODE_HTTP && check_type == TCPCHK_RULES_HTTP_CHK) || + (srv->mux_proto->mode == PROTO_MODE_TCP && check_type != TCPCHK_RULES_HTTP_CHK))) { + srv->check.mux_proto = srv->mux_proto; + } + /* test that check proto is valid if explicitly defined */ + else if (srv->check.mux_proto && + ((srv->check.mux_proto->mode == PROTO_MODE_HTTP && check_type != TCPCHK_RULES_HTTP_CHK) || + (srv->check.mux_proto->mode == PROTO_MODE_TCP && check_type == TCPCHK_RULES_HTTP_CHK))) { + ha_alert("config: %s '%s': server '%s' uses an incompatible MUX protocol for the selected check type\n", + proxy_type_str(srv->proxy), srv->proxy->id, srv->id); + ret |= ERR_ALERT | ERR_FATAL; + goto out; + } + + /* validate <srv> server health-check settings */ + + /* We need at least a service port, a check port or the first tcp-check + * rule must be a 'connect' one when checking an IPv4/IPv6 server. + */ + if ((srv_check_healthcheck_port(&srv->check) != 0) || + (!is_inet_addr(&srv->check.addr) && (is_addr(&srv->check.addr) || !is_inet_addr(&srv->addr)))) + goto init; + + if (!srv->proxy->tcpcheck_rules.list || LIST_ISEMPTY(srv->proxy->tcpcheck_rules.list)) { + ha_alert("config: %s '%s': server '%s' has neither service port nor check port.\n", + proxy_type_str(srv->proxy), srv->proxy->id, srv->id); + ret |= ERR_ALERT | ERR_ABORT; + goto out; + } + + /* search the first action (connect / send / expect) in the list */ + r = get_first_tcpcheck_rule(&srv->proxy->tcpcheck_rules); + if (!r || (r->action != TCPCHK_ACT_CONNECT) || (!r->connect.port && !get_host_port(&r->connect.addr))) { + ha_alert("config: %s '%s': server '%s' has neither service port nor check port " + "nor tcp_check rule 'connect' with port information.\n", + proxy_type_str(srv->proxy), srv->proxy->id, srv->id); + ret |= ERR_ALERT | ERR_ABORT; + goto out; + } + + /* scan the tcp-check ruleset to ensure a port has been configured */ + list_for_each_entry(r, srv->proxy->tcpcheck_rules.list, list) { + if ((r->action == TCPCHK_ACT_CONNECT) && (!r->connect.port && !get_host_port(&r->connect.addr))) { + ha_alert("config: %s '%s': server '%s' has neither service port nor check port, " + "and a tcp_check rule 'connect' with no port information.\n", + proxy_type_str(srv->proxy), srv->proxy->id, srv->id); + ret |= ERR_ALERT | ERR_ABORT; + goto out; + } + } + + init: + err = init_check(&srv->check, srv->proxy->options2 & PR_O2_CHK_ANY); + if (err) { + ha_alert("config: %s '%s': unable to init check for server '%s' (%s).\n", + proxy_type_str(srv->proxy), srv->proxy->id, srv->id, err); + ret |= ERR_ALERT | ERR_ABORT; + goto out; + } + srv->check.state |= CHK_ST_CONFIGURED | CHK_ST_ENABLED | CHK_ST_SLEEPING; + srv_take(srv); + + /* Only increment maxsock for servers from the configuration. Dynamic + * servers at the moment are not taken into account for the estimation + * of the resources limits. + */ + if (global.mode & MODE_STARTING) + global.maxsock++; + + out: + return ret; +} + +/* Initializes an agent-check attached to the server <srv>. Non-zero is returned + * if an error occurred. 
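+ * (A bare 'agent-check' comes with no explicit connect rule; the implicit + * one built below is then inserted in front of the ruleset.)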
+ */ +int init_srv_agent_check(struct server *srv) +{ + struct tcpcheck_rule *chk; + const char *err; + int ret = ERR_NONE; + + if (!srv->do_agent || !(srv->proxy->cap & PR_CAP_BE)) + goto out; + + /* If there is no connect rule preceding all send / expect rules, an + * implicit one is inserted before all others. + */ + chk = get_first_tcpcheck_rule(srv->agent.tcpcheck_rules); + if (!chk || chk->action != TCPCHK_ACT_CONNECT) { + chk = calloc(1, sizeof(*chk)); + if (!chk) { + ha_alert("%s '%s': unable to add implicit tcp-check connect rule" + " to agent-check for server '%s' (out of memory).\n", + proxy_type_str(srv->proxy), srv->proxy->id, srv->id); + ret |= ERR_ALERT | ERR_FATAL; + goto out; + } + chk->action = TCPCHK_ACT_CONNECT; + chk->connect.options = (TCPCHK_OPT_DEFAULT_CONNECT|TCPCHK_OPT_IMPLICIT); + LIST_INSERT(srv->agent.tcpcheck_rules->list, &chk->list); + } + + /* <chk> is always defined here and it is a CONNECT action. If there is + * a preset variable, it means there is an agent string defined and data + * will be sent after the connect. + */ + if (!LIST_ISEMPTY(&srv->agent.tcpcheck_rules->preset_vars)) + chk->connect.options |= TCPCHK_OPT_HAS_DATA; + + + err = init_check(&srv->agent, PR_O2_TCPCHK_CHK); + if (err) { + ha_alert("config: %s '%s': unable to init agent-check for server '%s' (%s).\n", + proxy_type_str(srv->proxy), srv->proxy->id, srv->id, err); + ret |= ERR_ALERT | ERR_ABORT; + goto out; + } + + if (!srv->agent.inter) + srv->agent.inter = srv->check.inter; + + srv->agent.state |= CHK_ST_CONFIGURED | CHK_ST_ENABLED | CHK_ST_SLEEPING | CHK_ST_AGENT; + srv_take(srv); + + /* Only increment maxsock for servers from the configuration. Dynamic + * servers at the moment are not taken into account for the estimation + * of the resources limits. 
+ */ + if (global.mode & MODE_STARTING) + global.maxsock++; + + out: + return ret; +} + +static void deinit_srv_check(struct server *srv) +{ + if (srv->check.state & CHK_ST_CONFIGURED) { + free_check(&srv->check); + /* it is safe to drop now since the main server reference is still held by the proxy */ + srv_drop(srv); + } + srv->check.state &= ~CHK_ST_CONFIGURED & ~CHK_ST_ENABLED; + srv->do_check = 0; +} + + +static void deinit_srv_agent_check(struct server *srv) +{ + if (srv->agent.state & CHK_ST_CONFIGURED) { + free_check(&srv->agent); + /* it is safe to drop now since the main server reference is still held by the proxy */ + srv_drop(srv); + } + + srv->agent.state &= ~CHK_ST_CONFIGURED & ~CHK_ST_ENABLED & ~CHK_ST_AGENT; + srv->do_agent = 0; +} + +REGISTER_POST_SERVER_CHECK(init_srv_check); +REGISTER_POST_SERVER_CHECK(init_srv_agent_check); +REGISTER_POST_CHECK(start_checks); + +REGISTER_SERVER_DEINIT(deinit_srv_check); +REGISTER_SERVER_DEINIT(deinit_srv_agent_check); + +/* perform minimal initializations */ +static void init_checks() +{ + int i; + + for (i = 0; i < MAX_THREADS; i++) + LIST_INIT(&ha_thread_ctx[i].queued_checks); +} + +INITCALL0(STG_PREPARE, init_checks); + +/**************************************************************************/ +/************************** Check sample fetches **************************/ +/**************************************************************************/ + +static struct sample_fetch_kw_list smp_kws = {ILH, { + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &smp_kws); + + +/**************************************************************************/ +/************************ Check's parsing functions ***********************/ +/**************************************************************************/ +/* Parse the "addr" server keyword */ +static int srv_parse_addr(char **args, int *cur_arg, struct proxy *curpx, struct server *srv, + char **errmsg) +{ + struct sockaddr_storage *sk; + int port1, port2, err_code = 0; + + + if (!*args[*cur_arg+1]) { + memprintf(errmsg, "'%s' expects <ipv4|ipv6> as argument.", args[*cur_arg]); + goto error; + } + + sk = str2sa_range(args[*cur_arg+1], NULL, &port1, &port2, NULL, NULL, NULL, errmsg, NULL, NULL, + PA_O_RESOLVE | PA_O_PORT_OK | PA_O_STREAM | PA_O_CONNECT); + if (!sk) { + memprintf(errmsg, "'%s' : %s", args[*cur_arg], *errmsg); + goto error; + } + + srv->check.addr = *sk; + /* if agentaddr was never set, we can use addr */ + if (!(srv->flags & SRV_F_AGENTADDR)) + srv->agent.addr = *sk; + + out: + return err_code; + + error: + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +} + +/* Parse the "agent-addr" server keyword */ +static int srv_parse_agent_addr(char **args, int *cur_arg, struct proxy *curpx, struct server *srv, + char **errmsg) +{ + struct sockaddr_storage sk; + int err_code = 0; + + if (!*(args[*cur_arg+1])) { + memprintf(errmsg, "'%s' expects an address as argument.", args[*cur_arg]); + goto error; + } + memset(&sk, 0, sizeof(sk)); + if (str2ip(args[*cur_arg + 1], &sk) == NULL) { + memprintf(errmsg, "parsing agent-addr failed. 
Check if '%s' is a correct address.", args[*cur_arg+1]); + goto error; + } + set_srv_agent_addr(srv, &sk); + + out: + return err_code; + + error: + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +} + +/* Parse the "agent-check" server keyword */ +static int srv_parse_agent_check(char **args, int *cur_arg, struct proxy *curpx, struct server *srv, + char **errmsg) +{ + struct tcpcheck_ruleset *rs = NULL; + struct tcpcheck_rules *rules = srv->agent.tcpcheck_rules; + struct tcpcheck_rule *chk; + int err_code = 0; + + if (srv->do_agent) + goto out; + + if (!(curpx->cap & PR_CAP_BE)) { + memprintf(errmsg, "'%s' ignored because %s '%s' has no backend capability", + args[*cur_arg], proxy_type_str(curpx), curpx->id); + return ERR_WARN; + } + + if (!rules) { + rules = calloc(1, sizeof(*rules)); + if (!rules) { + memprintf(errmsg, "out of memory."); + goto error; + } + LIST_INIT(&rules->preset_vars); + srv->agent.tcpcheck_rules = rules; + } + rules->list = NULL; + rules->flags = 0; + + rs = find_tcpcheck_ruleset("*agent-check"); + if (rs) + goto ruleset_found; + + rs = create_tcpcheck_ruleset("*agent-check"); + if (rs == NULL) { + memprintf(errmsg, "out of memory."); + goto error; + } + + chk = parse_tcpcheck_send((char *[]){"tcp-check", "send-lf", "%[var(check.agent_string)]", ""}, + 1, curpx, &rs->rules, srv->conf.file, srv->conf.line, errmsg); + if (!chk) { + memprintf(errmsg, "'%s': %s", args[*cur_arg], *errmsg); + goto error; + } + chk->index = 0; + LIST_APPEND(&rs->rules, &chk->list); + + chk = parse_tcpcheck_expect((char *[]){"tcp-check", "expect", "custom", ""}, + 1, curpx, &rs->rules, TCPCHK_RULES_AGENT_CHK, + srv->conf.file, srv->conf.line, errmsg); + if (!chk) { + memprintf(errmsg, "'%s': %s", args[*cur_arg], *errmsg); + goto error; + } + chk->expect.custom = tcpcheck_agent_expect_reply; + chk->index = 1; + LIST_APPEND(&rs->rules, &chk->list); + + ruleset_found: + rules->list = &rs->rules; + rules->flags &= ~(TCPCHK_RULES_PROTO_CHK|TCPCHK_RULES_UNUSED_RS); + rules->flags |= TCPCHK_RULES_AGENT_CHK; + srv->do_agent = 1; + + out: + return err_code; + + error: + deinit_srv_agent_check(srv); + free_tcpcheck_ruleset(rs); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +} + +/* Parse the "agent-inter" server keyword */ +static int srv_parse_agent_inter(char **args, int *cur_arg, struct proxy *curpx, struct server *srv, + char **errmsg) +{ + const char *err = NULL; + unsigned int delay; + int err_code = 0; + + if (!*(args[*cur_arg+1])) { + memprintf(errmsg, "'%s' expects a delay as argument.", args[*cur_arg]); + goto error; + } + + err = parse_time_err(args[*cur_arg+1], &delay, TIME_UNIT_MS); + if (err == PARSE_TIME_OVER) { + memprintf(errmsg, "timer overflow in argument <%s> to <%s> of server %s, maximum value is 2147483647 ms (~24.8 days).", + args[*cur_arg+1], args[*cur_arg], srv->id); + goto error; + } + else if (err == PARSE_TIME_UNDER) { + memprintf(errmsg, "timer underflow in argument <%s> to <%s> of server %s, minimum non-null value is 1 ms.", + args[*cur_arg+1], args[*cur_arg], srv->id); + goto error; + } + else if (err) { + memprintf(errmsg, "unexpected character '%c' in 'agent-inter' argument of server %s.", + *err, srv->id); + goto error; + } + if (delay <= 0) { + memprintf(errmsg, "invalid value %d for argument '%s' of server %s.", + delay, args[*cur_arg], srv->id); + goto error; + } + srv->agent.inter = delay; + + out: + return err_code; + + error: + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +} + +/* Parse the "agent-port" server keyword */ +static int srv_parse_agent_port(char
**args, int *cur_arg, struct proxy *curpx, struct server *srv, + char **errmsg) +{ + int err_code = 0; + + if (!*(args[*cur_arg+1])) { + memprintf(errmsg, "'%s' expects a port number as argument.", args[*cur_arg]); + goto error; + } + + /* Only increment maxsock for servers from the configuration. Dynamic + * servers at the moment are not taken into account for the estimation + * of the resources limits. + */ + if (global.mode & MODE_STARTING) + global.maxsock++; + + set_srv_agent_port(srv, atol(args[*cur_arg + 1])); + + out: + return err_code; + + error: + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +} + +int set_srv_agent_send(struct server *srv, const char *send) +{ + struct tcpcheck_rules *rules = srv->agent.tcpcheck_rules; + struct tcpcheck_var *var = NULL; + char *str; + + str = strdup(send); + var = create_tcpcheck_var(ist("check.agent_string")); + if (str == NULL || var == NULL) + goto error; + + free_tcpcheck_vars(&rules->preset_vars); + + var->data.type = SMP_T_STR; + var->data.u.str.area = str; + var->data.u.str.data = strlen(str); + LIST_INIT(&var->list); + LIST_APPEND(&rules->preset_vars, &var->list); + + return 1; + + error: + free(str); + free(var); + return 0; +} + +/* Parse the "agent-send" server keyword */ +static int srv_parse_agent_send(char **args, int *cur_arg, struct proxy *curpx, struct server *srv, + char **errmsg) +{ + struct tcpcheck_rules *rules = srv->agent.tcpcheck_rules; + int err_code = 0; + + if (!*(args[*cur_arg+1])) { + memprintf(errmsg, "'%s' expects a string as argument.", args[*cur_arg]); + goto error; + } + + if (!rules) { + rules = calloc(1, sizeof(*rules)); + if (!rules) { + memprintf(errmsg, "out of memory."); + goto error; + } + LIST_INIT(&rules->preset_vars); + srv->agent.tcpcheck_rules = rules; + } + + if (!set_srv_agent_send(srv, args[*cur_arg+1])) { + memprintf(errmsg, "out of memory."); + goto error; + } + + out: + return err_code; + + error: + deinit_srv_agent_check(srv); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +} + +/* Parse the "no-agent-send" server keyword */ +static int srv_parse_no_agent_check(char **args, int *cur_arg, struct proxy *curpx, struct server *srv, + char **errmsg) +{ + deinit_srv_agent_check(srv); + return 0; +} + +/* Parse the "check" server keyword */ +static int srv_parse_check(char **args, int *cur_arg, struct proxy *curpx, struct server *srv, + char **errmsg) +{ + if (!(curpx->cap & PR_CAP_BE)) { + memprintf(errmsg, "'%s' ignored because %s '%s' has no backend capability", + args[*cur_arg], proxy_type_str(curpx), curpx->id); + return ERR_WARN; + } + + srv->do_check = 1; + return 0; +} + +/* Parse the "check-send-proxy" server keyword */ +static int srv_parse_check_send_proxy(char **args, int *cur_arg, struct proxy *curpx, struct server *srv, + char **errmsg) +{ + srv->check.send_proxy = 1; + return 0; +} + +/* Parse the "check-via-socks4" server keyword */ +static int srv_parse_check_via_socks4(char **args, int *cur_arg, struct proxy *curpx, struct server *srv, + char **errmsg) +{ + srv->check.via_socks4 = 1; + return 0; +} + +/* Parse the "no-check" server keyword */ +static int srv_parse_no_check(char **args, int *cur_arg, struct proxy *curpx, struct server *srv, + char **errmsg) +{ + deinit_srv_check(srv); + return 0; +} + +/* Parse the "no-check-send-proxy" server keyword */ +static int srv_parse_no_check_send_proxy(char **args, int *cur_arg, struct proxy *curpx, struct server *srv, + char **errmsg) +{ + srv->check.send_proxy = 0; + return 0; +} + +/* parse the "check-proto" server keyword */ +static 
int srv_parse_check_proto(char **args, int *cur_arg,
+                          struct proxy *px, struct server *newsrv, char **err)
+{
+	int err_code = 0;
+
+	if (!*args[*cur_arg + 1]) {
+		memprintf(err, "'%s' : missing value", args[*cur_arg]);
+		goto error;
+	}
+	newsrv->check.mux_proto = get_mux_proto(ist(args[*cur_arg + 1]));
+	if (!newsrv->check.mux_proto) {
+		memprintf(err, "'%s' : unknown MUX protocol '%s'", args[*cur_arg], args[*cur_arg+1]);
+		goto error;
+	}
+
+  out:
+	return err_code;
+
+  error:
+	err_code |= ERR_ALERT | ERR_FATAL;
+	goto out;
+}
+
+
+/* Parse the "rise" server keyword */
+static int srv_parse_check_rise(char **args, int *cur_arg, struct proxy *curpx, struct server *srv,
+                                char **errmsg)
+{
+	int err_code = 0;
+
+	if (!*args[*cur_arg + 1]) {
+		memprintf(errmsg, "'%s' expects an integer argument.", args[*cur_arg]);
+		goto error;
+	}
+
+	srv->check.rise = atol(args[*cur_arg+1]);
+	if (srv->check.rise <= 0) {
+		memprintf(errmsg, "'%s' has to be > 0.", args[*cur_arg]);
+		goto error;
+	}
+
+	if (srv->check.health)
+		srv->check.health = srv->check.rise;
+
+  out:
+	return err_code;
+
+  error:
+	deinit_srv_agent_check(srv);
+	err_code |= ERR_ALERT | ERR_FATAL;
+	goto out;
+}
+
+/* Parse the "fall" server keyword */
+static int srv_parse_check_fall(char **args, int *cur_arg, struct proxy *curpx, struct server *srv,
+                                char **errmsg)
+{
+	int err_code = 0;
+
+	if (!*args[*cur_arg + 1]) {
+		memprintf(errmsg, "'%s' expects an integer argument.", args[*cur_arg]);
+		goto error;
+	}
+
+	srv->check.fall = atol(args[*cur_arg+1]);
+	if (srv->check.fall <= 0) {
+		memprintf(errmsg, "'%s' has to be > 0.", args[*cur_arg]);
+		goto error;
+	}
+
+  out:
+	return err_code;
+
+  error:
+	deinit_srv_agent_check(srv);
+	err_code |= ERR_ALERT | ERR_FATAL;
+	goto out;
+}
+
+/* Parse the "inter" server keyword */
+static int srv_parse_check_inter(char **args, int *cur_arg, struct proxy *curpx, struct server *srv,
+                                 char **errmsg)
+{
+	const char *err = NULL;
+	unsigned int delay;
+	int err_code = 0;
+
+	if (!*(args[*cur_arg+1])) {
+		memprintf(errmsg, "'%s' expects a delay as argument.", args[*cur_arg]);
+		goto error;
+	}
+
+	err = parse_time_err(args[*cur_arg+1], &delay, TIME_UNIT_MS);
+	if (err == PARSE_TIME_OVER) {
+		memprintf(errmsg, "timer overflow in argument <%s> to <%s> of server %s, maximum value is 2147483647 ms (~24.8 days).",
+		          args[*cur_arg+1], args[*cur_arg], srv->id);
+		goto error;
+	}
+	else if (err == PARSE_TIME_UNDER) {
+		memprintf(errmsg, "timer underflow in argument <%s> to <%s> of server %s, minimum non-null value is 1 ms.",
+		          args[*cur_arg+1], args[*cur_arg], srv->id);
+		goto error;
+	}
+	else if (err) {
+		memprintf(errmsg, "unexpected character '%c' in '%s' argument of server %s.",
+		          *err, args[*cur_arg], srv->id);
+		goto error;
+	}
+	if (!delay) {
+		memprintf(errmsg, "invalid value %u for argument '%s' of server %s.",
+		          delay, args[*cur_arg], srv->id);
+		goto error;
+	}
+	srv->check.inter = delay;
+
+  out:
+	return err_code;
+
+  error:
+	err_code |= ERR_ALERT | ERR_FATAL;
+	goto out;
+}
+
+
+/* Parse the "fastinter" server keyword */
+static int srv_parse_check_fastinter(char **args, int *cur_arg, struct proxy *curpx, struct server *srv,
+                                     char **errmsg)
+{
+	const char *err = NULL;
+	unsigned int delay;
+	int err_code = 0;
+
+	if (!*(args[*cur_arg+1])) {
+		memprintf(errmsg, "'%s' expects a delay as argument.", args[*cur_arg]);
+		goto error;
+	}
+
+	err = parse_time_err(args[*cur_arg+1], &delay, TIME_UNIT_MS);
+	if (err == PARSE_TIME_OVER) {
+		memprintf(errmsg, "timer overflow in argument <%s> to <%s> of server %s, maximum value is 2147483647 ms (~24.8 days).",
+		          args[*cur_arg+1], args[*cur_arg], srv->id);
+		goto error;
+	}
+	else if (err == PARSE_TIME_UNDER) {
+		memprintf(errmsg, "timer underflow in argument <%s> to <%s> of server %s, minimum non-null value is 1 ms.",
+		          args[*cur_arg+1], args[*cur_arg], srv->id);
+		goto error;
+	}
+	else if (err) {
+		memprintf(errmsg, "unexpected character '%c' in '%s' argument of server %s.",
+		          *err, args[*cur_arg], srv->id);
+		goto error;
+	}
+	if (!delay) {
+		memprintf(errmsg, "invalid value %u for argument '%s' of server %s.",
+		          delay, args[*cur_arg], srv->id);
+		goto error;
+	}
+	srv->check.fastinter = delay;
+
+  out:
+	return err_code;
+
+  error:
+	err_code |= ERR_ALERT | ERR_FATAL;
+	goto out;
+}
+
+
+/* Parse the "downinter" server keyword */
+static int srv_parse_check_downinter(char **args, int *cur_arg, struct proxy *curpx, struct server *srv,
+                                     char **errmsg)
+{
+	const char *err = NULL;
+	unsigned int delay;
+	int err_code = 0;
+
+	if (!*(args[*cur_arg+1])) {
+		memprintf(errmsg, "'%s' expects a delay as argument.", args[*cur_arg]);
+		goto error;
+	}
+
+	err = parse_time_err(args[*cur_arg+1], &delay, TIME_UNIT_MS);
+	if (err == PARSE_TIME_OVER) {
+		memprintf(errmsg, "timer overflow in argument <%s> to <%s> of server %s, maximum value is 2147483647 ms (~24.8 days).",
+		          args[*cur_arg+1], args[*cur_arg], srv->id);
+		goto error;
+	}
+	else if (err == PARSE_TIME_UNDER) {
+		memprintf(errmsg, "timer underflow in argument <%s> to <%s> of server %s, minimum non-null value is 1 ms.",
+		          args[*cur_arg+1], args[*cur_arg], srv->id);
+		goto error;
+	}
+	else if (err) {
+		memprintf(errmsg, "unexpected character '%c' in '%s' argument of server %s.",
+		          *err, args[*cur_arg], srv->id);
+		goto error;
+	}
+	if (!delay) {
+		memprintf(errmsg, "invalid value %u for argument '%s' of server %s.",
+		          delay, args[*cur_arg], srv->id);
+		goto error;
+	}
+	srv->check.downinter = delay;
+
+  out:
+	return err_code;
+
+  error:
+	err_code |= ERR_ALERT | ERR_FATAL;
+	goto out;
+}
+
+/* Parse the "port" server keyword */
+static int srv_parse_check_port(char **args, int *cur_arg, struct proxy *curpx, struct server *srv,
+                                char **errmsg)
+{
+	int err_code = 0;
+
+	if (!*(args[*cur_arg+1])) {
+		memprintf(errmsg, "'%s' expects a port number as argument.", args[*cur_arg]);
+		goto error;
+	}
+
+	/* Only increment maxsock for servers from the configuration. Dynamic
+	 * servers at the moment are not taken into account for the estimation
+	 * of the resources limits.
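+	 * For example (illustrative configuration, not part of this patch):
+	 *   server web1 192.0.2.10:80 check port 8081
+	 * makes the health checks connect to port 8081 instead of the service
+	 * port, and raises the global maxsock estimate below by one at startup.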
+ */ + if (global.mode & MODE_STARTING) + global.maxsock++; + + srv->check.port = atol(args[*cur_arg+1]); + /* if agentport was never set, we can use port */ + if (!(srv->flags & SRV_F_AGENTPORT)) + srv->agent.port = srv->check.port; + + out: + return err_code; + + error: + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +} + +/* config parser for global "tune.max-checks-per-thread" */ +static int check_parse_global_max_checks(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + global.tune.max_checks_per_thread = atoi(args[1]); + return 0; +} + +/* register "global" section keywords */ +static struct cfg_kw_list chk_cfg_kws = {ILH, { + { CFG_GLOBAL, "tune.max-checks-per-thread", check_parse_global_max_checks }, + { 0, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &chk_cfg_kws); + +/* register "server" line keywords */ +static struct srv_kw_list srv_kws = { "CHK", { }, { + { "addr", srv_parse_addr, 1, 1, 1 }, /* IP address to send health to or to probe from agent-check */ + { "agent-addr", srv_parse_agent_addr, 1, 1, 1 }, /* Enable an auxiliary agent check */ + { "agent-check", srv_parse_agent_check, 0, 1, 1 }, /* Enable agent checks */ + { "agent-inter", srv_parse_agent_inter, 1, 1, 1 }, /* Set the interval between two agent checks */ + { "agent-port", srv_parse_agent_port, 1, 1, 1 }, /* Set the TCP port used for agent checks. */ + { "agent-send", srv_parse_agent_send, 1, 1, 1 }, /* Set string to send to agent. */ + { "check", srv_parse_check, 0, 1, 1 }, /* Enable health checks */ + { "check-proto", srv_parse_check_proto, 1, 1, 1 }, /* Set the mux protocol for health checks */ + { "check-send-proxy", srv_parse_check_send_proxy, 0, 1, 1 }, /* Enable PROXY protocol for health checks */ + { "check-via-socks4", srv_parse_check_via_socks4, 0, 1, 1 }, /* Enable socks4 proxy for health checks */ + { "no-agent-check", srv_parse_no_agent_check, 0, 1, 0 }, /* Do not enable any auxiliary agent check */ + { "no-check", srv_parse_no_check, 0, 1, 0 }, /* Disable health checks */ + { "no-check-send-proxy", srv_parse_no_check_send_proxy, 0, 1, 0 }, /* Disable PROXY protocol for health checks */ + { "rise", srv_parse_check_rise, 1, 1, 1 }, /* Set rise value for health checks */ + { "fall", srv_parse_check_fall, 1, 1, 1 }, /* Set fall value for health checks */ + { "inter", srv_parse_check_inter, 1, 1, 1 }, /* Set inter value for health checks */ + { "fastinter", srv_parse_check_fastinter, 1, 1, 1 }, /* Set fastinter value for health checks */ + { "downinter", srv_parse_check_downinter, 1, 1, 1 }, /* Set downinter value for health checks */ + { "port", srv_parse_check_port, 1, 1, 1 }, /* Set the TCP port used for health checks. */ + { NULL, NULL, 0 }, +}}; + +INITCALL1(STG_REGISTER, srv_register_keywords, &srv_kws); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/chunk.c b/src/chunk.c new file mode 100644 index 0000000..c5b74fc --- /dev/null +++ b/src/chunk.c @@ -0,0 +1,311 @@ +/* + * Chunk management functions. + * + * Copyright 2000-2012 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <ctype.h> +#include <stdarg.h> +#include <stdio.h> +#include <string.h> + +#include <haproxy/api.h> +#include <haproxy/chunk.h> +#include <haproxy/global.h> +#include <haproxy/tools.h> + +/* trash chunks used for various conversions */ +static THREAD_LOCAL struct buffer *trash_chunk; +static THREAD_LOCAL struct buffer trash_chunk1; +static THREAD_LOCAL struct buffer trash_chunk2; + +/* trash buffers used for various conversions */ +static int trash_size __read_mostly; +static THREAD_LOCAL char *trash_buf1; +static THREAD_LOCAL char *trash_buf2; + +/* the trash pool for reentrant allocations */ +struct pool_head *pool_head_trash __read_mostly = NULL; + +/* this is used to drain data, and as a temporary buffer for sprintf()... */ +THREAD_LOCAL struct buffer trash = { }; + +/* +* Returns a pre-allocated and initialized trash chunk that can be used for any +* type of conversion. Two chunks and their respective buffers are alternatively +* returned so that it is always possible to iterate data transformations without +* losing the data being transformed. The blocks are initialized to the size of +* a standard buffer, so they should be enough for everything. For convenience, +* a zero is always emitted at the beginning of the string so that it may be +* used as an empty string as well. +*/ +struct buffer *get_trash_chunk(void) +{ + char *trash_buf; + + if (trash_chunk == &trash_chunk1) { + trash_chunk = &trash_chunk2; + trash_buf = trash_buf2; + } + else { + trash_chunk = &trash_chunk1; + trash_buf = trash_buf1; + } + *trash_buf = 0; + chunk_init(trash_chunk, trash_buf, trash_size); + return trash_chunk; +} + +/* (re)allocates the trash buffers. Returns 0 in case of failure. It is + * possible to call this function multiple times if the trash size changes. + */ +static int alloc_trash_buffers(int bufsize) +{ + chunk_init(&trash, my_realloc2(trash.area, bufsize), bufsize); + trash_size = bufsize; + trash_buf1 = (char *)my_realloc2(trash_buf1, bufsize); + trash_buf2 = (char *)my_realloc2(trash_buf2, bufsize); + return trash.area && trash_buf1 && trash_buf2; +} + +static int alloc_trash_buffers_per_thread() +{ + return alloc_trash_buffers(global.tune.bufsize); +} + +static void free_trash_buffers_per_thread() +{ + chunk_destroy(&trash); + ha_free(&trash_buf2); + ha_free(&trash_buf1); +} + +/* Initialize the trash buffers. It returns 0 if an error occurred. */ +int init_trash_buffers(int first) +{ + pool_destroy(pool_head_trash); + pool_head_trash = create_pool("trash", + sizeof(struct buffer) + global.tune.bufsize, + MEM_F_EXACT); + if (!pool_head_trash || !alloc_trash_buffers(global.tune.bufsize)) + return 0; + return 1; +} + +/* This is called during STG_POOL to allocate trash buffers early. They will + * be reallocated later once their final size is known. It returns 0 if an + * error occurred. + */ +static int alloc_early_trash(void) +{ + return init_trash_buffers(1); +} + +/* + * Does an snprintf() at the beginning of chunk <chk>, respecting the limit of + * at most chk->size chars. If the chk->len is over, nothing is added. Returns + * the new chunk size, or < 0 in case of failure. + */ +int chunk_printf(struct buffer *chk, const char *fmt, ...) 
+{ + va_list argp; + int ret; + + if (!chk->area || !chk->size) + return 0; + + va_start(argp, fmt); + ret = vsnprintf(chk->area, chk->size, fmt, argp); + va_end(argp); + + if (ret >= chk->size) + return -1; + + chk->data = ret; + return chk->data; +} + +/* + * Does an snprintf() at the end of chunk <chk>, respecting the limit of + * at most chk->size chars. If the chk->len is over, nothing is added. Returns + * the new chunk size. + */ +int chunk_appendf(struct buffer *chk, const char *fmt, ...) +{ + va_list argp; + size_t room; + int ret; + + if (!chk->area || !chk->size) + return 0; + + room = chk->size - chk->data; + if (!room) + return chk->data; + + va_start(argp, fmt); + ret = vsnprintf(chk->area + chk->data, room, fmt, argp); + if (ret >= room) + /* do not copy anything in case of truncation */ + chk->area[chk->data] = 0; + else + chk->data += ret; + va_end(argp); + return chk->data; +} + +/* + * Encode chunk <src> into chunk <dst>, respecting the limit of at most + * chk->size chars. Replace non-printable or special characters with "&#%d;". + * If the chk->len is over, nothing is added. Returns the new chunk size. + */ +int chunk_htmlencode(struct buffer *dst, struct buffer *src) +{ + int i, l; + int olen, free; + char c; + + olen = dst->data; + + for (i = 0; i < src->data; i++) { + free = dst->size - dst->data; + + if (!free) { + dst->data = olen; + return dst->data; + } + + c = src->area[i]; + + if (!isascii((unsigned char)c) || !isprint((unsigned char)c) || c == '&' || c == '"' || c == '\'' || c == '<' || c == '>') { + l = snprintf(dst->area + dst->data, free, "&#%u;", + (unsigned char)c); + + if (free < l) { + dst->data = olen; + return dst->data; + } + + dst->data += l; + } else { + dst->area[dst->data] = c; + dst->data++; + } + } + + return dst->data; +} + +/* + * Encode chunk <src> into chunk <dst>, respecting the limit of at most + * chk->size chars. Replace non-printable or char passed in qc with "<%02X>". + * If the chk->len is over, nothing is added. Returns the new chunk size. + */ +int chunk_asciiencode(struct buffer *dst, struct buffer *src, char qc) +{ + int i, l; + int olen, free; + char c; + + olen = dst->data; + + for (i = 0; i < src->data; i++) { + free = dst->size - dst->data; + + if (!free) { + dst->data = olen; + return dst->data; + } + + c = src->area[i]; + + if (!isascii((unsigned char)c) || !isprint((unsigned char)c) || c == '<' || c == '>' || c == qc) { + l = snprintf(dst->area + dst->data, free, "<%02X>", + (unsigned char)c); + + if (free < l) { + dst->data = olen; + return dst->data; + } + + dst->data += l; + } else { + dst->area[dst->data] = c; + dst->data++; + } + } + + return dst->data; +} + +/* Compares the string in chunk <chk> with the string in <str> which must be + * zero-terminated. Return is the same as with strcmp(). Neither is allowed + * to be null. + */ +int chunk_strcmp(const struct buffer *chk, const char *str) +{ + const char *s1 = chk->area; + int len = chk->data; + int diff = 0; + + do { + if (--len < 0) { + diff = (unsigned char)0 - (unsigned char)*str; + break; + } + diff = (unsigned char)*(s1++) - (unsigned char)*(str++); + } while (!diff); + return diff; +} + +/* Case-insensitively compares the string in chunk <chk> with the string in + * <str> which must be zero-terminated. Return is the same as with strcmp(). + * Neither is allowed to be null. 
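+ * Example (illustrative): a chunk holding "Content-Length" compared with the
+ * string "content-length" yields 0, while comparing it with "content-type"
+ * yields a non-zero value.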
+ */ +int chunk_strcasecmp(const struct buffer *chk, const char *str) +{ + const char *s1 = chk->area; + int len = chk->data; + int diff = 0; + + do { + if (--len < 0) { + diff = (unsigned char)0 - (unsigned char)*str; + break; + } + diff = (unsigned char)*s1 - (unsigned char)*str; + if (unlikely(diff)) { + unsigned int l = (unsigned char)*s1; + unsigned int r = (unsigned char)*str; + + l -= 'a'; + r -= 'a'; + + if (likely(l <= (unsigned char)'z' - 'a')) + l -= 'a' - 'A'; + if (likely(r <= (unsigned char)'z' - 'a')) + r -= 'a' - 'A'; + diff = l - r; + } + s1++; str++; + } while (!diff); + return diff; +} + +INITCALL0(STG_POOL, alloc_early_trash); +REGISTER_PER_THREAD_ALLOC(alloc_trash_buffers_per_thread); +REGISTER_PER_THREAD_FREE(free_trash_buffers_per_thread); +REGISTER_POST_DEINIT(free_trash_buffers_per_thread); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/cli.c b/src/cli.c new file mode 100644 index 0000000..d0435f7 --- /dev/null +++ b/src/cli.c @@ -0,0 +1,3423 @@ +/* + * Functions dedicated to statistics output and the stats socket + * + * Copyright 2000-2012 Willy Tarreau <w@1wt.eu> + * Copyright 2007-2009 Krzysztof Piotr Oledzki <ole@ans.pl> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <pwd.h> +#include <grp.h> + +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> + +#include <net/if.h> + +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/base64.h> +#include <haproxy/cfgparse.h> +#include <haproxy/channel.h> +#include <haproxy/check.h> +#include <haproxy/cli.h> +#include <haproxy/compression.h> +#include <haproxy/dns-t.h> +#include <haproxy/errors.h> +#include <haproxy/fd.h> +#include <haproxy/freq_ctr.h> +#include <haproxy/frontend.h> +#include <haproxy/global.h> +#include <haproxy/list.h> +#include <haproxy/listener.h> +#include <haproxy/log.h> +#include <haproxy/mworker.h> +#include <haproxy/mworker-t.h> +#include <haproxy/pattern-t.h> +#include <haproxy/peers.h> +#include <haproxy/pipe.h> +#include <haproxy/protocol.h> +#include <haproxy/proxy.h> +#include <haproxy/quic_sock.h> +#include <haproxy/sample-t.h> +#include <haproxy/sc_strm.h> +#include <haproxy/server.h> +#include <haproxy/session.h> +#include <haproxy/sock.h> +#include <haproxy/stats-t.h> +#include <haproxy/stconn.h> +#include <haproxy/stream.h> +#include <haproxy/task.h> +#include <haproxy/ticks.h> +#include <haproxy/time.h> +#include <haproxy/tools.h> +#include <haproxy/version.h> + +#define PAYLOAD_PATTERN "<<" + +static struct applet cli_applet; +static struct applet mcli_applet; + +static const char cli_permission_denied_msg[] = + "Permission denied\n" + ""; + + +static THREAD_LOCAL char *dynamic_usage_msg = NULL; + +/* List head of cli keywords */ +static struct cli_kw_list cli_keywords = { + .list = LIST_HEAD_INIT(cli_keywords.list) +}; + +extern const char *stat_status_codes[]; + +struct proxy *mworker_proxy; /* CLI proxy of the master */ +struct bind_conf *mcli_reload_bind_conf; + +/* CLI context for the "show env" command */ +struct show_env_ctx { + char **var; /* first variable to show */ + int show_one; /* stop after showing the first one */ +}; + +/* CLI 
context for the "show fd" command */ +/* flags for show_fd_ctx->show_mask */ +#define CLI_SHOWFD_F_PI 0x00000001 /* pipes */ +#define CLI_SHOWFD_F_LI 0x00000002 /* listeners */ +#define CLI_SHOWFD_F_FE 0x00000004 /* frontend conns */ +#define CLI_SHOWFD_F_SV 0x00000010 /* server-only conns */ +#define CLI_SHOWFD_F_PX 0x00000020 /* proxy-only conns */ +#define CLI_SHOWFD_F_BE 0x00000030 /* backend: srv+px */ +#define CLI_SHOWFD_F_CO 0x00000034 /* conn: be+fe */ +#define CLI_SHOWFD_F_ANY 0x0000003f /* any type */ + +struct show_fd_ctx { + int fd; /* first FD to show */ + int show_one; /* stop after showing one FD */ + uint show_mask; /* CLI_SHOWFD_F_xxx */ +}; + +/* CLI context for the "show cli sockets" command */ +struct show_sock_ctx { + struct bind_conf *bind_conf; + struct listener *listener; +}; + +static int cmp_kw_entries(const void *a, const void *b) +{ + const struct cli_kw *l = *(const struct cli_kw **)a; + const struct cli_kw *r = *(const struct cli_kw **)b; + + return strcmp(l->usage ? l->usage : "", r->usage ? r->usage : ""); +} + +/* This will show the help message and list the commands supported at the + * current level that match all of the first words of <args> if args is not + * NULL, or all args if none matches or if args is null. + */ +static char *cli_gen_usage_msg(struct appctx *appctx, char * const *args) +{ + struct cli_kw *entries[CLI_MAX_HELP_ENTRIES]; + struct cli_kw_list *kw_list; + struct cli_kw *kw; + struct buffer *tmp = get_trash_chunk(); + struct buffer out; + struct { struct cli_kw *kw; int dist; } matches[CLI_MAX_MATCHES], swp; + int idx; + int ishelp = 0; + int length = 0; + int help_entries = 0; + + ha_free(&dynamic_usage_msg); + + if (args && *args && strcmp(*args, "help") == 0) { + args++; + ishelp = 1; + } + + /* first, let's measure the longest match */ + list_for_each_entry(kw_list, &cli_keywords.list, list) { + for (kw = &kw_list->kw[0]; kw->str_kw[0]; kw++) { + if (kw->level & ~appctx->cli_level & (ACCESS_MASTER_ONLY|ACCESS_EXPERT|ACCESS_EXPERIMENTAL)) + continue; + if (!(appctx->cli_level & ACCESS_MCLI_DEBUG) && + (appctx->cli_level & ~kw->level & (ACCESS_MASTER_ONLY|ACCESS_MASTER)) == + (ACCESS_MASTER_ONLY|ACCESS_MASTER)) + continue; + + /* OK this command is visible */ + for (idx = 0; idx < CLI_PREFIX_KW_NB; idx++) { + if (!kw->str_kw[idx]) + break; // end of keyword + if (!args || !args[idx] || !*args[idx]) + break; // end of command line + if (strcmp(kw->str_kw[idx], args[idx]) != 0) + break; + if (idx + 1 > length) + length = idx + 1; + } + } + } + + /* now <length> equals the number of exactly matching words */ + chunk_reset(tmp); + if (ishelp) // this is the help message. + chunk_strcat(tmp, "The following commands are valid at this level:\n"); + else { + chunk_strcat(tmp, "Unknown command: '"); + if (args && *args) + chunk_strcat(tmp, *args); + chunk_strcat(tmp, "'"); + + if (!length && (!args || !*args || !**args)) // no match + chunk_strcat(tmp, ". 
Please enter one of the following commands only:\n"); + else // partial match + chunk_strcat(tmp, ", but maybe one of the following ones is a better match:\n"); + } + + for (idx = 0; idx < CLI_MAX_MATCHES; idx++) { + matches[idx].kw = NULL; + matches[idx].dist = INT_MAX; + } + + /* In case of partial match we'll look for the best matching entries + * starting from position <length> + */ + if (args && args[length] && *args[length]) { + list_for_each_entry(kw_list, &cli_keywords.list, list) { + for (kw = &kw_list->kw[0]; kw->str_kw[0]; kw++) { + if (kw->level & ~appctx->cli_level & (ACCESS_MASTER_ONLY|ACCESS_EXPERT|ACCESS_EXPERIMENTAL)) + continue; + if (!(appctx->cli_level & ACCESS_MCLI_DEBUG) && + ((appctx->cli_level & ~kw->level & (ACCESS_MASTER_ONLY|ACCESS_MASTER)) == + (ACCESS_MASTER_ONLY|ACCESS_MASTER))) + continue; + + for (idx = 0; idx < length; idx++) { + if (!kw->str_kw[idx]) + break; // end of keyword + if (!args || !args[idx] || !*args[idx]) + break; // end of command line + if (strcmp(kw->str_kw[idx], args[idx]) != 0) + break; + } + + /* extra non-matching words are fuzzy-matched */ + if (kw->usage && idx == length && args[idx] && *args[idx]) { + uint8_t word_sig[1024]; + uint8_t list_sig[1024]; + int dist = 0; + int totlen = 0; + int i; + + /* this one matches, let's compute the distance between the two + * on the remaining words. For this we're computing the signature + * of everything that remains and the cumulated length of the + * strings. + */ + memset(word_sig, 0, sizeof(word_sig)); + for (i = idx; i < CLI_PREFIX_KW_NB && args[i] && *args[i]; i++) { + update_word_fingerprint(word_sig, args[i]); + totlen += strlen(args[i]); + } + + memset(list_sig, 0, sizeof(list_sig)); + for (i = idx; i < CLI_PREFIX_KW_NB && kw->str_kw[i]; i++) { + update_word_fingerprint(list_sig, kw->str_kw[i]); + totlen += strlen(kw->str_kw[i]); + } + + dist = word_fingerprint_distance(word_sig, list_sig); + + /* insert this one at its place if relevant, in order to keep only + * the best matches. + */ + swp.kw = kw; swp.dist = dist; + if (dist < 5*totlen/2 && dist < matches[CLI_MAX_MATCHES-1].dist) { + matches[CLI_MAX_MATCHES-1] = swp; + for (idx = CLI_MAX_MATCHES - 1; --idx >= 0;) { + if (matches[idx+1].dist >= matches[idx].dist) + break; + matches[idx+1] = matches[idx]; + matches[idx] = swp; + } + } + } + } + } + } + + if (matches[0].kw) { + /* we have fuzzy matches, let's propose them */ + for (idx = 0; idx < CLI_MAX_MATCHES; idx++) { + kw = matches[idx].kw; + if (!kw) + break; + + /* stop the dump if some words look very unlikely candidates */ + if (matches[idx].dist > 5*matches[0].dist/2) + break; + + if (help_entries < CLI_MAX_HELP_ENTRIES) + entries[help_entries++] = kw; + } + } + + list_for_each_entry(kw_list, &cli_keywords.list, list) { + /* no full dump if we've already found nice candidates */ + if (matches[0].kw) + break; + + for (kw = &kw_list->kw[0]; kw->str_kw[0]; kw++) { + + /* in a worker or normal process, don't display master-only commands + * nor expert/experimental mode commands if not in this mode. + */ + if (kw->level & ~appctx->cli_level & (ACCESS_MASTER_ONLY|ACCESS_EXPERT|ACCESS_EXPERIMENTAL)) + continue; + + /* in master, if the CLI don't have the + * ACCESS_MCLI_DEBUG don't display commands that have + * neither the master bit nor the master-only bit. 
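+			 * E.g. (illustrative): master-side commands such as
+			 * "show proc" or "reload" are hidden from a plain
+			 * worker CLI session.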
+ */ + if (!(appctx->cli_level & ACCESS_MCLI_DEBUG) && + ((appctx->cli_level & ~kw->level & (ACCESS_MASTER_ONLY|ACCESS_MASTER)) == + (ACCESS_MASTER_ONLY|ACCESS_MASTER))) + continue; + + for (idx = 0; idx < length; idx++) { + if (!kw->str_kw[idx]) + break; // end of keyword + if (!args || !args[idx] || !*args[idx]) + break; // end of command line + if (strcmp(kw->str_kw[idx], args[idx]) != 0) + break; + } + + if (kw->usage && idx == length && help_entries < CLI_MAX_HELP_ENTRIES) + entries[help_entries++] = kw; + } + } + + qsort(entries, help_entries, sizeof(*entries), cmp_kw_entries); + + for (idx = 0; idx < help_entries; idx++) + chunk_appendf(tmp, " %s\n", entries[idx]->usage); + + /* always show the prompt/help/quit commands */ + chunk_strcat(tmp, + " help [<command>] : list matching or all commands\n" + " prompt [timed] : toggle interactive mode with prompt\n" + " quit : disconnect\n"); + + chunk_init(&out, NULL, 0); + chunk_dup(&out, tmp); + dynamic_usage_msg = out.area; + + cli_msg(appctx, LOG_INFO, dynamic_usage_msg); + return dynamic_usage_msg; +} + +struct cli_kw* cli_find_kw(char **args) +{ + struct cli_kw_list *kw_list; + struct cli_kw *kw;/* current cli_kw */ + char **tmp_args; + const char **tmp_str_kw; + int found = 0; + + if (LIST_ISEMPTY(&cli_keywords.list)) + return NULL; + + list_for_each_entry(kw_list, &cli_keywords.list, list) { + kw = &kw_list->kw[0]; + while (*kw->str_kw) { + tmp_args = args; + tmp_str_kw = kw->str_kw; + while (*tmp_str_kw) { + if (strcmp(*tmp_str_kw, *tmp_args) == 0) { + found = 1; + } else { + found = 0; + break; + } + tmp_args++; + tmp_str_kw++; + } + if (found) + return (kw); + kw++; + } + } + return NULL; +} + +struct cli_kw* cli_find_kw_exact(char **args) +{ + struct cli_kw_list *kw_list; + int found = 0; + int i; + int j; + + if (LIST_ISEMPTY(&cli_keywords.list)) + return NULL; + + list_for_each_entry(kw_list, &cli_keywords.list, list) { + for (i = 0; kw_list->kw[i].str_kw[0]; i++) { + found = 1; + for (j = 0; j < CLI_PREFIX_KW_NB; j++) { + if (args[j] == NULL && kw_list->kw[i].str_kw[j] == NULL) { + break; + } + if (args[j] == NULL || kw_list->kw[i].str_kw[j] == NULL) { + found = 0; + break; + } + if (strcmp(args[j], kw_list->kw[i].str_kw[j]) != 0) { + found = 0; + break; + } + } + if (found) + return &kw_list->kw[i]; + } + } + return NULL; +} + +void cli_register_kw(struct cli_kw_list *kw_list) +{ + LIST_APPEND(&cli_keywords.list, &kw_list->list); +} + +/* list all known keywords on stdout, one per line */ +void cli_list_keywords(void) +{ + struct cli_kw_list *kw_list; + struct cli_kw *kwp, *kwn, *kw; + int idx; + + for (kwn = kwp = NULL;; kwp = kwn) { + list_for_each_entry(kw_list, &cli_keywords.list, list) { + /* note: we sort based on the usage message when available, + * otherwise we fall back to the first keyword. + */ + for (kw = &kw_list->kw[0]; kw->str_kw[0]; kw++) { + if (strordered(kwp ? kwp->usage ? kwp->usage : kwp->str_kw[0] : NULL, + kw->usage ? kw->usage : kw->str_kw[0], + kwn != kwp ? kwn->usage ? 
kwn->usage : kwn->str_kw[0] : NULL)) + kwn = kw; + } + } + + if (kwn == kwp) + break; + + for (idx = 0; kwn->str_kw[idx]; idx++) { + printf("%s ", kwn->str_kw[idx]); + } + if (kwn->level & (ACCESS_MASTER_ONLY|ACCESS_MASTER)) + printf("[MASTER] "); + if (!(kwn->level & ACCESS_MASTER_ONLY)) + printf("[WORKER] "); + if (kwn->level & ACCESS_EXPERT) + printf("[EXPERT] "); + if (kwn->level & ACCESS_EXPERIMENTAL) + printf("[EXPERIM] "); + printf("\n"); + } +} + +/* allocate a new stats frontend named <name>, and return it + * (or NULL in case of lack of memory). + */ +static struct proxy *cli_alloc_fe(const char *name, const char *file, int line) +{ + struct proxy *fe; + + fe = calloc(1, sizeof(*fe)); + if (!fe) + return NULL; + + init_new_proxy(fe); + fe->next = proxies_list; + proxies_list = fe; + fe->last_change = ns_to_sec(now_ns); + fe->id = strdup("GLOBAL"); + fe->cap = PR_CAP_FE|PR_CAP_INT; + fe->maxconn = 10; /* default to 10 concurrent connections */ + fe->timeout.client = MS_TO_TICKS(10000); /* default timeout of 10 seconds */ + fe->conf.file = strdup(file); + fe->conf.line = line; + fe->accept = frontend_accept; + fe->default_target = &cli_applet.obj_type; + + /* the stats frontend is the only one able to assign ID #0 */ + fe->conf.id.key = fe->uuid = 0; + eb32_insert(&used_proxy_id, &fe->conf.id); + return fe; +} + +/* This function parses a "stats" statement in the "global" section. It returns + * -1 if there is any error, otherwise zero. If it returns -1, it will write an + * error message into the <err> buffer which will be preallocated. The trailing + * '\n' must not be written. The function must be called with <args> pointing to + * the first word after "stats". + */ +static int cli_parse_global(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + struct bind_conf *bind_conf; + struct listener *l; + + if (strcmp(args[1], "socket") == 0) { + int cur_arg; + + if (*args[2] == 0) { + memprintf(err, "'%s %s' in global section expects an address or a path to a UNIX socket", args[0], args[1]); + return -1; + } + + if (!global.cli_fe) { + if ((global.cli_fe = cli_alloc_fe("GLOBAL", file, line)) == NULL) { + memprintf(err, "'%s %s' : out of memory trying to allocate a frontend", args[0], args[1]); + return -1; + } + } + + bind_conf = bind_conf_alloc(global.cli_fe, file, line, args[2], xprt_get(XPRT_RAW)); + if (!bind_conf) { + memprintf(err, "'%s %s' : out of memory trying to allocate a bind_conf", args[0], args[1]); + return -1; + } + bind_conf->level &= ~ACCESS_LVL_MASK; + bind_conf->level |= ACCESS_LVL_OPER; /* default access level */ + + if (!str2listener(args[2], global.cli_fe, bind_conf, file, line, err)) { + memprintf(err, "parsing [%s:%d] : '%s %s' : %s\n", + file, line, args[0], args[1], err && *err ? *err : "error"); + return -1; + } + + cur_arg = 3; + while (*args[cur_arg]) { + struct bind_kw *kw; + const char *best; + int code; + + kw = bind_find_kw(args[cur_arg]); + if (kw) { + if (!kw->parse) { + memprintf(err, "'%s %s' : '%s' option is not implemented in this version (check build options).", + args[0], args[1], args[cur_arg]); + return -1; + } + + code = kw->parse(args, cur_arg, global.cli_fe, bind_conf, err); + + /* FIXME: this is ugly, we don't have a way to collect warnings, + * yet some important bind keywords may report warnings that we + * must display. 
+ */ + if (((code & (ERR_WARN|ERR_FATAL|ERR_ALERT)) == ERR_WARN) && err && *err) { + indent_msg(err, 2); + ha_warning("parsing [%s:%d] : '%s %s' : %s\n", file, line, args[0], args[1], *err); + ha_free(err); + } + + if (code & ~ERR_WARN) { + if (err && *err) + memprintf(err, "'%s %s' : '%s'", args[0], args[1], *err); + else + memprintf(err, "'%s %s' : error encountered while processing '%s'", + args[0], args[1], args[cur_arg]); + return -1; + } + + cur_arg += 1 + kw->skip; + continue; + } + + best = bind_find_best_kw(args[cur_arg]); + if (best) + memprintf(err, "'%s %s' : unknown keyword '%s'. Did you mean '%s' maybe ?", + args[0], args[1], args[cur_arg], best); + else + memprintf(err, "'%s %s' : unknown keyword '%s'.", + args[0], args[1], args[cur_arg]); + return -1; + } + + bind_conf->accept = session_accept_fd; + bind_conf->nice = -64; /* we want to boost priority for local stats */ + bind_conf->options |= BC_O_UNLIMITED; /* don't make the peers subject to global limits */ + + list_for_each_entry(l, &bind_conf->listeners, by_bind) { + global.maxsock++; /* for the listening socket */ + } + } + else if (strcmp(args[1], "timeout") == 0) { + unsigned timeout; + const char *res = parse_time_err(args[2], &timeout, TIME_UNIT_MS); + + if (res == PARSE_TIME_OVER) { + memprintf(err, "timer overflow in argument '%s' to '%s %s' (maximum value is 2147483647 ms or ~24.8 days)", + args[2], args[0], args[1]); + return -1; + } + else if (res == PARSE_TIME_UNDER) { + memprintf(err, "timer underflow in argument '%s' to '%s %s' (minimum non-null value is 1 ms)", + args[2], args[0], args[1]); + return -1; + } + else if (res) { + memprintf(err, "'%s %s' : unexpected character '%c'", args[0], args[1], *res); + return -1; + } + + if (!timeout) { + memprintf(err, "'%s %s' expects a positive value", args[0], args[1]); + return -1; + } + if (!global.cli_fe) { + if ((global.cli_fe = cli_alloc_fe("GLOBAL", file, line)) == NULL) { + memprintf(err, "'%s %s' : out of memory trying to allocate a frontend", args[0], args[1]); + return -1; + } + } + global.cli_fe->timeout.client = MS_TO_TICKS(timeout); + } + else if (strcmp(args[1], "maxconn") == 0) { + int maxconn = atol(args[2]); + + if (maxconn <= 0) { + memprintf(err, "'%s %s' expects a positive value", args[0], args[1]); + return -1; + } + + if (!global.cli_fe) { + if ((global.cli_fe = cli_alloc_fe("GLOBAL", file, line)) == NULL) { + memprintf(err, "'%s %s' : out of memory trying to allocate a frontend", args[0], args[1]); + return -1; + } + } + global.cli_fe->maxconn = maxconn; + } + else if (strcmp(args[1], "bind-process") == 0) { + memprintf(err, "'%s %s' is not supported anymore.", args[0], args[1]); + return -1; + } + else { + memprintf(err, "'%s' only supports 'socket', 'maxconn', 'bind-process' and 'timeout' (got '%s')", args[0], args[1]); + return -1; + } + return 0; +} + +/* + * This function exports the bound addresses of a <frontend> in the environment + * variable <varname>. 
Those addresses are separated by semicolons and prefixed
+ * with their type (abns@, unix@, sockpair@, etc.).
+ * Return -1 upon error, 0 otherwise.
+ */
+int listeners_setenv(struct proxy *frontend, const char *varname)
+{
+	struct buffer *trash = get_trash_chunk();
+	struct bind_conf *bind_conf;
+
+	if (frontend) {
+		list_for_each_entry(bind_conf, &frontend->conf.bind, by_fe) {
+			struct listener *l;
+
+			list_for_each_entry(l, &bind_conf->listeners, by_bind) {
+				char addr[46];
+				char port[6];
+
+				/* separate listeners with semicolons */
+				if (trash->data)
+					chunk_appendf(trash, ";");
+
+				if (l->rx.addr.ss_family == AF_UNIX) {
+					const struct sockaddr_un *un;
+
+					un = (struct sockaddr_un *)&l->rx.addr;
+					if (un->sun_path[0] == '\0') {
+						chunk_appendf(trash, "abns@%s", un->sun_path+1);
+					} else {
+						chunk_appendf(trash, "unix@%s", un->sun_path);
+					}
+				} else if (l->rx.addr.ss_family == AF_INET) {
+					addr_to_str(&l->rx.addr, addr, sizeof(addr));
+					port_to_str(&l->rx.addr, port, sizeof(port));
+					chunk_appendf(trash, "ipv4@%s:%s", addr, port);
+				} else if (l->rx.addr.ss_family == AF_INET6) {
+					addr_to_str(&l->rx.addr, addr, sizeof(addr));
+					port_to_str(&l->rx.addr, port, sizeof(port));
+					chunk_appendf(trash, "ipv6@[%s]:%s", addr, port);
+				} else if (l->rx.addr.ss_family == AF_CUST_SOCKPAIR) {
+					chunk_appendf(trash, "sockpair@%d", ((struct sockaddr_in *)&l->rx.addr)->sin_addr.s_addr);
+				}
+			}
+		}
+		trash->area[trash->data++] = '\0';
+		if (setenv(varname, trash->area, 1) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+int cli_socket_setenv()
+{
+	if (listeners_setenv(global.cli_fe, "HAPROXY_CLI") < 0)
+		return -1;
+	if (listeners_setenv(mworker_proxy, "HAPROXY_MASTER_CLI") < 0)
+		return -1;
+
+	return 0;
+}
+
+REGISTER_CONFIG_POSTPARSER("cli", cli_socket_setenv);
+
+/* Verifies that the CLI has a level at least as high as <level>
+ * (typically ACCESS_LVL_ADMIN). Returns 1 if OK, otherwise 0. In case of
+ * failure, an error message is prepared and the appctx's state is adjusted
+ * to print it so that a return 1 is enough to abort any processing.
+ */
+int cli_has_level(struct appctx *appctx, int level)
+{
+	if ((appctx->cli_level & ACCESS_LVL_MASK) < level) {
+		cli_err(appctx, cli_permission_denied_msg);
+		return 0;
+	}
+	return 1;
+}
+
+/* same as cli_has_level but for the CLI proxy and without error message */
+int pcli_has_level(struct stream *s, int level)
+{
+	if ((s->pcli_flags & ACCESS_LVL_MASK) < level) {
+		return 0;
+	}
+	return 1;
+}
+
+/* Returns severity_output for the current session if set, or default for the socket */
+static int cli_get_severity_output(struct appctx *appctx)
+{
+	if (appctx->cli_severity_output)
+		return appctx->cli_severity_output;
+	return strm_li(appctx_strm(appctx))->bind_conf->severity_output;
+}
+
+/* Processes the CLI interpreter on the stats socket. This function is called
+ * from the CLI's IO handler running in an appctx context. The function returns
+ * 1 if the request was understood, otherwise zero (in which case an error
+ * message will be displayed). It is called with appctx->st0
+ * set to CLI_ST_GETREQ and presets ->st2 to 0 so that parsers don't have to do
+ * it. It will possibly leave st0 set to CLI_ST_CALLBACK if the keyword needs to
+ * have its own I/O handler called again. Most of the time, parsers will only
+ * set st0 to CLI_ST_PRINT and put their message to be displayed into cli.msg.
+ * If a keyword parser is NULL and an I/O handler is declared, the I/O handler
+ * will automatically be used.
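+ * For example (illustrative): "show env HOME" is handled by
+ * cli_parse_show_env() below, which only fills a show_env_ctx and returns 0,
+ * leaving the actual dump to cli_io_handler_show_env() on later wakeups.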
+ */
+static int cli_parse_request(struct appctx *appctx)
+{
+	char *args[MAX_CLI_ARGS + 1], *p, *end, *payload = NULL;
+	int i = 0;
+	struct cli_kw *kw;
+
+	p = appctx->chunk->area;
+	end = p + appctx->chunk->data;
+
+	/*
+	 * Get pointers on words.
+	 * One extra slot is reserved to store a pointer on a null byte.
+	 */
+	while (i < MAX_CLI_ARGS && p < end) {
+		int j, k;
+
+		/* skip leading spaces/tabs */
+		p += strspn(p, " \t");
+		if (!*p)
+			break;
+
+		/* first check if the '<<' is present, but this is not enough
+		 * because we don't know if this is the end of the string */
+		if (strncmp(p, PAYLOAD_PATTERN, strlen(PAYLOAD_PATTERN)) == 0) {
+			int pat_len = strlen(appctx->cli_payload_pat);
+
+			/* then if the customized pattern is empty, check if the next character is '\0' */
+			if (pat_len == 0 && p[strlen(PAYLOAD_PATTERN)] == '\0') {
+				payload = p + strlen(PAYLOAD_PATTERN) + 1;
+				break;
+			}
+
+			/* else if we found the customized pattern at the end of the string */
+			if (strcmp(p + strlen(PAYLOAD_PATTERN), appctx->cli_payload_pat) == 0) {
+				payload = p + strlen(PAYLOAD_PATTERN) + pat_len + 1;
+				break;
+			}
+		}
+
+		args[i] = p;
+		while (1) {
+			p += strcspn(p, " \t\\");
+			/* escaped chars using backslashes (\) */
+			if (*p == '\\') {
+				if (!*++p)
+					break;
+				if (!*++p)
+					break;
+			} else {
+				break;
+			}
+		}
+		*p++ = 0;
+
+		/* unescape backslashes (\) */
+		for (j = 0, k = 0; args[i][k]; k++) {
+			if (args[i][k] == '\\') {
+				if (args[i][k + 1] == '\\')
+					k++;
+				else
+					continue;
+			}
+			args[i][j] = args[i][k];
+			j++;
+		}
+		args[i][j] = 0;
+
+		i++;
+	}
+	/* fill unused slots */
+	p = appctx->chunk->area + appctx->chunk->data;
+	for (; i < MAX_CLI_ARGS + 1; i++)
+		args[i] = p;
+
+	if (!**args)
+		return 0;
+
+	kw = cli_find_kw(args);
+	if (!kw ||
+	    (kw->level & ~appctx->cli_level & ACCESS_MASTER_ONLY) ||
+	    (!(appctx->cli_level & ACCESS_MCLI_DEBUG) &&
+	     (appctx->cli_level & ~kw->level & (ACCESS_MASTER_ONLY|ACCESS_MASTER)) == (ACCESS_MASTER_ONLY|ACCESS_MASTER))) {
+		/* keyword not found in this mode */
+		cli_gen_usage_msg(appctx, args);
+		return 0;
+	}
+
+	/* don't handle expert mode commands if not in this mode.
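+	 * E.g. (illustrative): such commands only become available once
+	 * "expert-mode on" has been issued on this session.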
+	 */
+	if (kw->level & ~appctx->cli_level & ACCESS_EXPERT) {
+		cli_err(appctx, "This command is restricted to expert mode only.\n");
+		return 0;
+	}
+
+	if (kw->level & ~appctx->cli_level & ACCESS_EXPERIMENTAL) {
+		cli_err(appctx, "This command is restricted to experimental mode only.\n");
+		return 0;
+	}
+
+	if (kw->level == ACCESS_EXPERT)
+		mark_tainted(TAINTED_CLI_EXPERT_MODE);
+	else if (kw->level == ACCESS_EXPERIMENTAL)
+		mark_tainted(TAINTED_CLI_EXPERIMENTAL_MODE);
+
+	appctx->io_handler = kw->io_handler;
+	appctx->io_release = kw->io_release;
+
+	if (kw->parse && kw->parse(args, payload, appctx, kw->private) != 0)
+		goto fail;
+
+	/* kw->parse could set its own io_handler or io_release handler */
+	if (!appctx->io_handler)
+		goto fail;
+
+	appctx->st0 = CLI_ST_CALLBACK;
+	return 1;
+fail:
+	appctx->io_handler = NULL;
+	appctx->io_release = NULL;
+	return 1;
+}
+
+/* prepends then outputs the argument msg with a syslog-type severity depending on severity_output value */
+static int cli_output_msg(struct appctx *appctx, const char *msg, int severity, int severity_output)
+{
+	struct buffer *tmp;
+	struct ist imsg;
+
+	tmp = get_trash_chunk();
+	chunk_reset(tmp);
+
+	if (likely(severity_output == CLI_SEVERITY_NONE))
+		goto send_it;
+
+	if (severity < 0 || severity > 7) {
+		ha_warning("socket command feedback with invalid severity %d", severity);
+		chunk_printf(tmp, "[%d]: ", severity);
+	}
+	else {
+		switch (severity_output) {
+		case CLI_SEVERITY_NUMBER:
+			chunk_printf(tmp, "[%d]: ", severity);
+			break;
+		case CLI_SEVERITY_STRING:
+			chunk_printf(tmp, "[%s]: ", log_levels[severity]);
+			break;
+		default:
+			ha_warning("Unrecognized severity output %d", severity_output);
+		}
+	}
+ send_it:
+	/* the vast majority of messages have their trailing LF but a few are
+	 * still missing it, and very rare ones might even have two. For this
+	 * reason, we'll first delete the trailing LFs if present, then
+	 * systematically append one.
+	 */
+	for (imsg = ist(msg); imsg.len > 0 && imsg.ptr[imsg.len - 1] == '\n'; imsg.len--)
+		;
+
+	chunk_istcat(tmp, imsg);
+	chunk_istcat(tmp, ist("\n"));
+
+	return applet_putchk(appctx, tmp);
+}
+
+/* This I/O handler runs as an applet embedded in a stream connector. It is
+ * used to process I/O from/to the stats unix socket. The system relies on a
+ * state machine handling requests and various responses. We read a request,
+ * then we process it and send the response, and we possibly display a prompt.
+ * Then we can read again. The state is stored in appctx->st0 and is one of the
+ * CLI_ST_* constants. appctx->st1 is used to indicate whether prompt is enabled
+ * or not.
+ */
+static void cli_io_handler(struct appctx *appctx)
+{
+	struct stconn *sc = appctx_sc(appctx);
+	struct channel *req = sc_oc(sc);
+	struct channel *res = sc_ic(sc);
+	struct bind_conf *bind_conf = strm_li(__sc_strm(sc))->bind_conf;
+	int reql;
+	int len;
+
+	if (unlikely(se_fl_test(appctx->sedesc, (SE_FL_EOS|SE_FL_ERROR|SE_FL_SHR|SE_FL_SHW)))) {
+		co_skip(sc_oc(sc), co_data(sc_oc(sc)));
+		goto out;
+	}
+
+	/* Check if the input buffer is available.
*/ + if (!b_size(&res->buf)) { + sc_need_room(sc, 0); + goto out; + } + + while (1) { + if (appctx->st0 == CLI_ST_INIT) { + /* reset severity to default at init */ + appctx->cli_severity_output = bind_conf->severity_output; + applet_reset_svcctx(appctx); + appctx->st0 = CLI_ST_GETREQ; + appctx->cli_level = bind_conf->level; + } + else if (appctx->st0 == CLI_ST_END) { + se_fl_set(appctx->sedesc, SE_FL_EOS); + free_trash_chunk(appctx->chunk); + appctx->chunk = NULL; + break; + } + else if (appctx->st0 == CLI_ST_GETREQ) { + char *str; + + /* use a trash chunk to store received data */ + if (!appctx->chunk) { + appctx->chunk = alloc_trash_chunk(); + if (!appctx->chunk) { + se_fl_set(appctx->sedesc, SE_FL_ERROR); + appctx->st0 = CLI_ST_END; + continue; + } + } + + str = appctx->chunk->area + appctx->chunk->data; + + /* ensure we have some output room left in the event we + * would want to return some info right after parsing. + */ + if (buffer_almost_full(sc_ib(sc))) { + sc_need_room(sc, b_size(&res->buf) / 2); + break; + } + + /* payload doesn't take escapes nor does it end on semi-colons, so + * we use the regular getline. Normal mode however must stop on + * LFs and semi-colons that are not prefixed by a backslash. Note + * that we reserve one byte at the end to insert a trailing nul byte. + */ + + if (appctx->st1 & APPCTX_CLI_ST1_PAYLOAD) + reql = co_getline(sc_oc(sc), str, + appctx->chunk->size - appctx->chunk->data - 1); + else + reql = co_getdelim(sc_oc(sc), str, + appctx->chunk->size - appctx->chunk->data - 1, + "\n;", '\\'); + + if (reql <= 0) { /* closed or EOL not found */ + if (reql == 0) + break; + se_fl_set(appctx->sedesc, SE_FL_ERROR); + appctx->st0 = CLI_ST_END; + continue; + } + + if (!(appctx->st1 & APPCTX_CLI_ST1_PAYLOAD)) { + /* seek for a possible unescaped semi-colon. If we find + * one, we replace it with an LF and skip only this part. + */ + for (len = 0; len < reql; len++) { + if (str[len] == '\\') { + len++; + continue; + } + if (str[len] == ';') { + str[len] = '\n'; + reql = len + 1; + break; + } + } + } + + /* now it is time to check that we have a full line, + * remove the trailing \n and possibly \r, then cut the + * line. + */ + len = reql - 1; + if (str[len] != '\n') { + se_fl_set(appctx->sedesc, SE_FL_ERROR); + appctx->st0 = CLI_ST_END; + continue; + } + + if (len && str[len-1] == '\r') + len--; + + str[len] = '\0'; + appctx->chunk->data += len; + + if (appctx->st1 & APPCTX_CLI_ST1_PAYLOAD) { + appctx->chunk->area[appctx->chunk->data] = '\n'; + appctx->chunk->area[appctx->chunk->data + 1] = 0; + appctx->chunk->data++; + } + + appctx->st0 = CLI_ST_PROMPT; + + if (appctx->st1 & APPCTX_CLI_ST1_PAYLOAD) { + /* look for a pattern */ + if (len == strlen(appctx->cli_payload_pat)) { + /* here use 'len' because str still contains the \n */ + if (strncmp(str, appctx->cli_payload_pat, len) == 0) { + /* remove the last two \n */ + appctx->chunk->data -= strlen(appctx->cli_payload_pat) + 2; + appctx->chunk->area[appctx->chunk->data] = 0; + cli_parse_request(appctx); + chunk_reset(appctx->chunk); + /* NB: cli_sock_parse_request() may have put + * another CLI_ST_O_* into appctx->st0. + */ + + appctx->st1 &= ~APPCTX_CLI_ST1_PAYLOAD; + } + } + } + else { + char *last_arg; + /* + * Look for the "payload start" pattern at the end of a line + * Its location is not remembered here, this is just to switch + * to a gathering mode. + * The pattern must start by << followed by 0 + * to 7 characters, and finished by the end of + * the command (\n or ;). 
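+				 * Example (illustrative): "add map #-1 <<EOF"
+				 * switches to payload mode until a line containing
+				 * only "EOF" is received.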
+ */ + /* look for the first space starting by the end of the line */ + for (last_arg = appctx->chunk->area + appctx->chunk->data; last_arg != appctx->chunk->area; last_arg--) { + if (*last_arg == ' ' || *last_arg == '\t') { + last_arg++; + break; + } + } + if (strncmp(last_arg, PAYLOAD_PATTERN, strlen(PAYLOAD_PATTERN)) == 0) { + ssize_t pat_len = strlen(last_arg + strlen(PAYLOAD_PATTERN)); + + /* A customized pattern can't be more than 7 characters + * if it's more, don't make it a payload + */ + if (pat_len < sizeof(appctx->cli_payload_pat)) { + appctx->st1 |= APPCTX_CLI_ST1_PAYLOAD; + /* copy the customized pattern, don't store the << */ + strncpy(appctx->cli_payload_pat, last_arg + strlen(PAYLOAD_PATTERN), sizeof(appctx->cli_payload_pat)-1); + appctx->cli_payload_pat[sizeof(appctx->cli_payload_pat)-1] = '\0'; + appctx->chunk->data++; // keep the trailing \0 after the pattern + } + } + else { + /* no payload, the command is complete: parse the request */ + cli_parse_request(appctx); + chunk_reset(appctx->chunk); + } + } + + /* re-adjust req buffer */ + co_skip(sc_oc(sc), reql); + sc_opposite(sc)->flags |= SC_FL_RCV_ONCE; /* we plan to read small requests */ + } + else { /* output functions */ + struct cli_print_ctx *ctx; + const char *msg; + int sev; + + switch (appctx->st0) { + case CLI_ST_PROMPT: + break; + case CLI_ST_PRINT: /* print const message in msg */ + case CLI_ST_PRINT_ERR: /* print const error in msg */ + case CLI_ST_PRINT_DYN: /* print dyn message in msg, free */ + case CLI_ST_PRINT_DYNERR: /* print dyn error in err, free */ + case CLI_ST_PRINT_UMSG: /* print usermsgs_ctx and reset it */ + case CLI_ST_PRINT_UMSGERR: /* print usermsgs_ctx as error and reset it */ + /* the message is in the svcctx */ + ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + if (appctx->st0 == CLI_ST_PRINT || appctx->st0 == CLI_ST_PRINT_ERR) { + sev = appctx->st0 == CLI_ST_PRINT_ERR ? + LOG_ERR : ctx->severity; + msg = ctx->msg; + } + else if (appctx->st0 == CLI_ST_PRINT_DYN || appctx->st0 == CLI_ST_PRINT_DYNERR) { + sev = appctx->st0 == CLI_ST_PRINT_DYNERR ? + LOG_ERR : ctx->severity; + msg = ctx->err; + if (!msg) { + sev = LOG_ERR; + msg = "Out of memory.\n"; + } + } + else if (appctx->st0 == CLI_ST_PRINT_UMSG || + appctx->st0 == CLI_ST_PRINT_UMSGERR) { + sev = appctx->st0 == CLI_ST_PRINT_UMSGERR ? 
+ LOG_ERR : ctx->severity; + msg = usermsgs_str(); + } + else { + sev = LOG_ERR; + msg = "Internal error.\n"; + } + + if (cli_output_msg(appctx, msg, sev, cli_get_severity_output(appctx)) != -1) { + if (appctx->st0 == CLI_ST_PRINT_DYN || + appctx->st0 == CLI_ST_PRINT_DYNERR) { + ha_free(&ctx->err); + } + else if (appctx->st0 == CLI_ST_PRINT_UMSG || + appctx->st0 == CLI_ST_PRINT_UMSGERR) { + usermsgs_clr(NULL); + } + appctx->st0 = CLI_ST_PROMPT; + } + break; + + case CLI_ST_CALLBACK: /* use custom pointer */ + if (appctx->io_handler) + if (appctx->io_handler(appctx)) { + appctx->st0 = CLI_ST_PROMPT; + if (appctx->io_release) { + appctx->io_release(appctx); + appctx->io_release = NULL; + } + } + break; + default: /* abnormal state */ + se_fl_set(appctx->sedesc, SE_FL_ERROR); + break; + } + + /* The post-command prompt is either LF alone or LF + '> ' in interactive mode */ + if (appctx->st0 == CLI_ST_PROMPT) { + char prompt_buf[20]; + const char *prompt = ""; + + if (appctx->st1 & APPCTX_CLI_ST1_PROMPT) { + /* + * when entering a payload with interactive mode, change the prompt + * to emphasize that more data can still be sent + */ + if (appctx->chunk->data && appctx->st1 & APPCTX_CLI_ST1_PAYLOAD) + prompt = "+ "; + else if (appctx->st1 & APPCTX_CLI_ST1_TIMED) { + uint up = ns_to_sec(now_ns - start_time_ns); + snprintf(prompt_buf, sizeof(prompt_buf), + "\n[%u:%02u:%02u:%02u]> ", + (up / 86400), (up / 3600) % 24, (up / 60) % 60, up % 60); + prompt = prompt_buf; + } + else + prompt = "\n> "; + } + else { + if (!(appctx->st1 & (APPCTX_CLI_ST1_PAYLOAD|APPCTX_CLI_ST1_NOLF))) + prompt = "\n"; + } + + if (applet_putstr(appctx, prompt) != -1) { + applet_reset_svcctx(appctx); + appctx->st0 = CLI_ST_GETREQ; + } + } + + /* If the output functions are still there, it means they require more room. */ + if (appctx->st0 >= CLI_ST_OUTPUT) { + applet_wont_consume(appctx); + break; + } + + /* Now we close the output if we're not in interactive + * mode and the request buffer is empty. This still + * allows pipelined requests to be sent in + * non-interactive mode. + */ + if (!(appctx->st1 & APPCTX_CLI_ST1_PROMPT) && !co_data(req) && (!(appctx->st1 & APPCTX_CLI_ST1_PAYLOAD))) { + se_fl_set(appctx->sedesc, SE_FL_EOI); + appctx->st0 = CLI_ST_END; + continue; + } + + /* switch state back to GETREQ to read next requests */ + applet_reset_svcctx(appctx); + appctx->st0 = CLI_ST_GETREQ; + applet_will_consume(appctx); + applet_expect_data(appctx); + + /* reactivate the \n at the end of the response for the next command */ + appctx->st1 &= ~APPCTX_CLI_ST1_NOLF; + + /* this forces us to yield between pipelined commands and + * avoid extremely long latencies (e.g. "del map" etc). In + * addition this increases the likelihood that the stream + * refills the buffer with new bytes in non-interactive + * mode, avoiding to close on apparently empty commands. + */ + if (co_data(sc_oc(sc))) { + appctx_wakeup(appctx); + goto out; + } + } + } + + out: + return; +} + +/* This is called when the stream connector is closed. For instance, upon an + * external abort, we won't call the i/o handler anymore so we may need to + * remove back references to the stream currently being dumped. 
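+ * (Illustrative example: a "show sess" dump in progress holds a back-reference
+ * into the streams list, which the io_release callback must drop here.)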
+ */
+static void cli_release_handler(struct appctx *appctx)
+{
+	free_trash_chunk(appctx->chunk);
+	appctx->chunk = NULL;
+
+	if (appctx->io_release) {
+		appctx->io_release(appctx);
+		appctx->io_release = NULL;
+	}
+	else if (appctx->st0 == CLI_ST_PRINT_DYN || appctx->st0 == CLI_ST_PRINT_DYNERR) {
+		struct cli_print_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx));
+
+		ha_free(&ctx->err);
+	}
+	else if (appctx->st0 == CLI_ST_PRINT_UMSG || appctx->st0 == CLI_ST_PRINT_UMSGERR) {
+		usermsgs_clr(NULL);
+	}
+}
+
+/* This function dumps all environment variables to the buffer. It returns 0
+ * if the output buffer is full and it needs to be called again, otherwise
+ * non-zero. It takes its context from the show_env_ctx in svcctx, and will
+ * start from ->var and dump only one variable if ->show_one is set.
+ */
+static int cli_io_handler_show_env(struct appctx *appctx)
+{
+	struct show_env_ctx *ctx = appctx->svcctx;
+	struct stconn *sc = appctx_sc(appctx);
+	char **var = ctx->var;
+
+	/* FIXME: Don't watch the other side ! */
+	if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE))
+		return 1;
+
+	chunk_reset(&trash);
+
+	/* iterate over the variables; the loop may stop on a full output
+	 * buffer and be resumed from ->var on the next call.
+	 */
+	while (*var) {
+		chunk_printf(&trash, "%s\n", *var);
+
+		if (applet_putchk(appctx, &trash) == -1)
+			return 0;
+
+		if (ctx->show_one)
+			break;
+		var++;
+		ctx->var = var;
+	}
+
+	/* dump complete */
+	return 1;
+}
+
+/* This function dumps all file descriptor states (or the requested one) to
+ * the buffer. It returns 0 if the output buffer is full and it needs to be
+ * called again, otherwise non-zero. It takes its context from the show_fd_ctx
+ * in svcctx, only dumps one entry if ->show_one is non-zero, and (re)starts
+ * from ->fd.
+ */
+static int cli_io_handler_show_fd(struct appctx *appctx)
+{
+	struct stconn *sc = appctx_sc(appctx);
+	struct show_fd_ctx *fdctx = appctx->svcctx;
+	uint match = fdctx->show_mask;
+	int fd = fdctx->fd;
+	int ret = 1;
+
+	/* FIXME: Don't watch the other side ! */
+	if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE))
+		goto end;
+
+	chunk_reset(&trash);
+
+	/* isolate the threads once per round. We're limited to a buffer worth
+	 * of output anyway, it cannot last very long.
+	 */
+	thread_isolate();
+
+	/* loop over the FDs; the loop may stop on a full output buffer and
+	 * be resumed from ->fd on the next call.
+	 */
+	while (fd >= 0 && fd < global.maxsock) {
+		struct fdtab fdt;
+		const struct listener *li = NULL;
+		const struct server *sv = NULL;
+		const struct proxy *px = NULL;
+		const struct connection *conn = NULL;
+		const struct mux_ops *mux = NULL;
+		const struct xprt_ops *xprt = NULL;
+		const void *ctx = NULL;
+		const void *xprt_ctx = NULL;
+		const struct quic_conn *qc = NULL;
+		uint32_t conn_flags = 0;
+		uint8_t conn_err = 0;
+		int is_back = 0;
+		int suspicious = 0;
+
+		fdt = fdtab[fd];
+
+		/* When DEBUG_FD is set, we also report closed FDs that have a
+		 * non-null event count to detect stuck ones.
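+		 * (DEBUG_FD is a compile-time debugging option; as an
+		 * assumption, it is typically enabled by building with
+		 * DEBUG="-DDEBUG_FD".)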
+ */ + if (!fdt.owner) { +#ifdef DEBUG_FD + if (!fdt.event_count) +#endif + goto skip; // closed + } + else if (fdt.iocb == sock_conn_iocb) { + conn = (const struct connection *)fdt.owner; + conn_flags = conn->flags; + conn_err = conn->err_code; + mux = conn->mux; + ctx = conn->ctx; + xprt = conn->xprt; + xprt_ctx = conn->xprt_ctx; + li = objt_listener(conn->target); + sv = objt_server(conn->target); + px = objt_proxy(conn->target); + is_back = conn_is_back(conn); + if (atleast2(fdt.thread_mask)) + suspicious = 1; + if (conn->handle.fd != fd) + suspicious = 1; + } +#if defined(USE_QUIC) + else if (fdt.iocb == quic_conn_sock_fd_iocb) { + qc = fdtab[fd].owner; + li = qc ? qc->li : NULL; + xprt_ctx = qc ? qc->xprt_ctx : NULL; + conn = qc ? qc->conn : NULL; + xprt = conn ? conn->xprt : NULL; // in fact it's &ssl_quic + mux = conn ? conn->mux : NULL; + /* quic_conns don't always have a connection but they + * always have an xprt_ctx. + */ + } + else if (fdt.iocb == quic_lstnr_sock_fd_iocb) { + li = objt_listener(fdtab[fd].owner); + } +#endif + else if (fdt.iocb == sock_accept_iocb) + li = fdt.owner; + + if (!(((conn || xprt_ctx) && + ((match & CLI_SHOWFD_F_SV && sv) || + (match & CLI_SHOWFD_F_PX && px) || + (match & CLI_SHOWFD_F_FE && li))) || + (!conn && + ((match & CLI_SHOWFD_F_LI && li) || + (match & CLI_SHOWFD_F_PI && !li /* only pipes match this */))))) { + /* not a desired type */ + goto skip; + } + + if (!fdt.thread_mask) + suspicious = 1; + + chunk_printf(&trash, + " %5d : st=0x%06x(%c%c %c%c%c%c%c W:%c%c%c R:%c%c%c) ref=%#x gid=%d tmask=0x%lx umask=0x%lx prmsk=0x%lx pwmsk=0x%lx owner=%p iocb=%p(", + fd, + fdt.state, + (fdt.state & FD_CLONED) ? 'C' : 'c', + (fdt.state & FD_LINGER_RISK) ? 'L' : 'l', + (fdt.state & FD_POLL_HUP) ? 'H' : 'h', + (fdt.state & FD_POLL_ERR) ? 'E' : 'e', + (fdt.state & FD_POLL_OUT) ? 'O' : 'o', + (fdt.state & FD_POLL_PRI) ? 'P' : 'p', + (fdt.state & FD_POLL_IN) ? 'I' : 'i', + (fdt.state & FD_EV_SHUT_W) ? 'S' : 's', + (fdt.state & FD_EV_READY_W) ? 'R' : 'r', + (fdt.state & FD_EV_ACTIVE_W) ? 'A' : 'a', + (fdt.state & FD_EV_SHUT_R) ? 'S' : 's', + (fdt.state & FD_EV_READY_R) ? 'R' : 'r', + (fdt.state & FD_EV_ACTIVE_R) ? 
'A' : 'a', + (fdt.refc_tgid >> 4) & 0xffff, + (fdt.refc_tgid) & 0xffff, + fdt.thread_mask, fdt.update_mask, + polled_mask[fd].poll_recv, + polled_mask[fd].poll_send, + fdt.owner, + fdt.iocb); + resolve_sym_name(&trash, NULL, fdt.iocb); + + if (!fdt.owner) { + chunk_appendf(&trash, ")"); + } + else if (conn) { + chunk_appendf(&trash, ") back=%d cflg=0x%08x cerr=%d", is_back, conn_flags, conn_err); + + if (!(conn->flags & CO_FL_FDLESS) && conn->handle.fd != fd) { + chunk_appendf(&trash, " fd=%d(BOGUS)", conn->handle.fd); + suspicious = 1; + } else if ((conn->flags & CO_FL_FDLESS) && (qc != conn->handle.qc)) { + chunk_appendf(&trash, " qc=%p(BOGUS)", conn->handle.qc); + suspicious = 1; + } else { + struct sockaddr_storage sa; + socklen_t salen; + + salen = sizeof(sa); + if (getsockname(fd, (struct sockaddr *)&sa, &salen) != -1) { + if (sa.ss_family == AF_INET) + chunk_appendf(&trash, " fam=ipv4 lport=%d", ntohs(((const struct sockaddr_in *)&sa)->sin_port)); + else if (sa.ss_family == AF_INET6) + chunk_appendf(&trash, " fam=ipv6 lport=%d", ntohs(((const struct sockaddr_in6 *)&sa)->sin6_port)); + else if (sa.ss_family == AF_UNIX) + chunk_appendf(&trash, " fam=unix"); + } + + salen = sizeof(sa); + if (getpeername(fd, (struct sockaddr *)&sa, &salen) != -1) { + if (sa.ss_family == AF_INET) + chunk_appendf(&trash, " rport=%d", ntohs(((const struct sockaddr_in *)&sa)->sin_port)); + else if (sa.ss_family == AF_INET6) + chunk_appendf(&trash, " rport=%d", ntohs(((const struct sockaddr_in6 *)&sa)->sin6_port)); + } + } + + if (px) + chunk_appendf(&trash, " px=%s", px->id); + else if (sv) + chunk_appendf(&trash, " sv=%s/%s", sv->proxy->id, sv->id); + else if (li) + chunk_appendf(&trash, " fe=%s", li->bind_conf->frontend->id); + + if (mux) { + chunk_appendf(&trash, " mux=%s ctx=%p", mux->name, ctx); + if (!ctx && !qc) + suspicious = 1; + if (mux->show_fd) + suspicious |= mux->show_fd(&trash, fdt.owner); + } + else + chunk_appendf(&trash, " nomux"); + + chunk_appendf(&trash, " xprt=%s", xprt ? xprt->name : ""); + if (xprt) { + if (xprt_ctx || xprt->show_fd) + chunk_appendf(&trash, " xprt_ctx=%p", xprt_ctx); + if (xprt->show_fd) + suspicious |= xprt->show_fd(&trash, conn, xprt_ctx); + } + } + else if (li && !xprt_ctx) { + struct sockaddr_storage sa; + socklen_t salen; + + chunk_appendf(&trash, ") l.st=%s fe=%s", + listener_state_str(li), + li->bind_conf->frontend->id); + + salen = sizeof(sa); + if (getsockname(fd, (struct sockaddr *)&sa, &salen) != -1) { + if (sa.ss_family == AF_INET) + chunk_appendf(&trash, " fam=ipv4 lport=%d", ntohs(((const struct sockaddr_in *)&sa)->sin_port)); + else if (sa.ss_family == AF_INET6) + chunk_appendf(&trash, " fam=ipv6 lport=%d", ntohs(((const struct sockaddr_in6 *)&sa)->sin6_port)); + else if (sa.ss_family == AF_UNIX) + chunk_appendf(&trash, " fam=unix"); + } + } + else + chunk_appendf(&trash, ")"); + +#ifdef DEBUG_FD + chunk_appendf(&trash, " evcnt=%u", fdtab[fd].event_count); + if (fdtab[fd].event_count >= 1000000) + suspicious = 1; +#endif + chunk_appendf(&trash, "%s\n", suspicious ? " !" : ""); + + if (applet_putchk(appctx, &trash) == -1) { + fdctx->fd = fd; + ret = 0; + break; + } + skip: + if (fdctx->show_one) + break; + + fd++; + } + + end: + /* dump complete */ + + thread_release(); + return ret; +} + +/* + * CLI IO handler for `show cli sockets`. + * Uses the svcctx as a show_sock_ctx to store/retrieve the bind_conf and the + * listener pointers. 
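+ * Example output (illustrative socket path):
+ *   # socket lvl processes
+ *   unix@/var/run/haproxy.sock operator all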
+ */
+static int cli_io_handler_show_cli_sock(struct appctx *appctx)
+{
+        struct show_sock_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx));
+        struct bind_conf *bind_conf = ctx->bind_conf;
+
+        if (!global.cli_fe)
+                goto done;
+
+        chunk_reset(&trash);
+
+        if (!bind_conf) {
+                /* first call */
+                if (applet_putstr(appctx, "# socket lvl processes\n") == -1)
+                        goto full;
+                bind_conf = LIST_ELEM(global.cli_fe->conf.bind.n, typeof(bind_conf), by_fe);
+        }
+
+        list_for_each_entry_from(bind_conf, &global.cli_fe->conf.bind, by_fe) {
+                struct listener *l = ctx->listener;
+
+                if (!l)
+                        l = LIST_ELEM(bind_conf->listeners.n, typeof(l), by_bind);
+
+                list_for_each_entry_from(l, &bind_conf->listeners, by_bind) {
+                        char addr[46];
+                        char port[6];
+
+                        if (l->rx.addr.ss_family == AF_UNIX) {
+                                const struct sockaddr_un *un;
+
+                                un = (struct sockaddr_un *)&l->rx.addr;
+                                if (un->sun_path[0] == '\0') {
+                                        chunk_appendf(&trash, "abns@%s ", un->sun_path+1);
+                                } else {
+                                        chunk_appendf(&trash, "unix@%s ", un->sun_path);
+                                }
+                        } else if (l->rx.addr.ss_family == AF_INET) {
+                                addr_to_str(&l->rx.addr, addr, sizeof(addr));
+                                port_to_str(&l->rx.addr, port, sizeof(port));
+                                chunk_appendf(&trash, "ipv4@%s:%s ", addr, port);
+                        } else if (l->rx.addr.ss_family == AF_INET6) {
+                                addr_to_str(&l->rx.addr, addr, sizeof(addr));
+                                port_to_str(&l->rx.addr, port, sizeof(port));
+                                chunk_appendf(&trash, "ipv6@[%s]:%s ", addr, port);
+                        } else if (l->rx.addr.ss_family == AF_CUST_SOCKPAIR) {
+                                chunk_appendf(&trash, "sockpair@%d ", ((struct sockaddr_in *)&l->rx.addr)->sin_addr.s_addr);
+                        } else
+                                chunk_appendf(&trash, "unknown ");
+
+                        if ((bind_conf->level & ACCESS_LVL_MASK) == ACCESS_LVL_ADMIN)
+                                chunk_appendf(&trash, "admin ");
+                        else if ((bind_conf->level & ACCESS_LVL_MASK) == ACCESS_LVL_OPER)
+                                chunk_appendf(&trash, "operator ");
+                        else if ((bind_conf->level & ACCESS_LVL_MASK) == ACCESS_LVL_USER)
+                                chunk_appendf(&trash, "user ");
+                        else
+                                chunk_appendf(&trash, " ");
+
+                        chunk_appendf(&trash, "all\n");
+
+                        if (applet_putchk(appctx, &trash) == -1) {
+                                ctx->bind_conf = bind_conf;
+                                ctx->listener = l;
+                                goto full;
+                        }
+                }
+        }
+ done:
+        return 1;
+ full:
+        return 0;
+}
+
+
+/* parse a "show env" CLI request. Returns 0 if it needs to continue, 1 if it
+ * wants to stop here. It reserves a show_env_ctx where it puts the variable to
+ * be dumped, as well as a flag indicating whether a single variable was
+ * requested; otherwise it points to environ.
+ */
+static int cli_parse_show_env(char **args, char *payload, struct appctx *appctx, void *private)
+{
+        struct show_env_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx));
+        extern char **environ;
+        char **var;
+
+        if (!cli_has_level(appctx, ACCESS_LVL_OPER))
+                return 1;
+
+        var = environ;
+
+        if (*args[2]) {
+                int len = strlen(args[2]);
+
+                for (; *var; var++) {
+                        if (strncmp(*var, args[2], len) == 0 &&
+                            (*var)[len] == '=')
+                                break;
+                }
+                if (!*var)
+                        return cli_err(appctx, "Variable not found\n");
+
+                ctx->show_one = 1;
+        }
+        ctx->var = var;
+        return 0;
+}
+
+/* parse a "show fd" CLI request. Returns 0 if it needs to continue, 1 if it
+ * wants to stop here. It sets a show_fd_ctx context where, if a specific fd is
+ * requested, it puts the FD number into ->fd and sets ->show_one to 1.
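+ *
+ * For illustration, a few hypothetical invocations as understood by the
+ * parser below ('!' or '-' inverts the selection):
+ *
+ *   show fd           dump all FDs
+ *   show fd 12        dump only FD #12
+ *   show fd l         dump only listening sockets
+ *   show fd !l        dump everything but listening sockets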
+ */
+static int cli_parse_show_fd(char **args, char *payload, struct appctx *appctx, void *private)
+{
+        struct show_fd_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx));
+        const char *c;
+        int arg;
+
+        if (!cli_has_level(appctx, ACCESS_LVL_OPER))
+                return 1;
+
+        arg = 2;
+
+        /* when starting with an inversion we preset every flag */
+        if (*args[arg] == '!' || *args[arg] == '-')
+                ctx->show_mask = CLI_SHOWFD_F_ANY;
+
+        while (*args[arg] && !isdigit((uchar)*args[arg])) {
+                uint flag = 0, inv = 0;
+                c = args[arg];
+                while (*c) {
+                        switch (*c) {
+                        case '!': inv = !inv; break;
+                        case '-': inv = !inv; break;
+                        case 'p': flag = CLI_SHOWFD_F_PI; break;
+                        case 'l': flag = CLI_SHOWFD_F_LI; break;
+                        case 'c': flag = CLI_SHOWFD_F_CO; break;
+                        case 'f': flag = CLI_SHOWFD_F_FE; break;
+                        case 'b': flag = CLI_SHOWFD_F_BE; break;
+                        case 's': flag = CLI_SHOWFD_F_SV; break;
+                        case 'd': flag = CLI_SHOWFD_F_PX; break;
+                        default: return cli_err(appctx, "Invalid FD type\n");
+                        }
+                        c++;
+                        if (!inv)
+                                ctx->show_mask |= flag;
+                        else
+                                ctx->show_mask &= ~flag;
+                }
+                arg++;
+        }
+
+        /* default mask is to show everything */
+        if (!ctx->show_mask)
+                ctx->show_mask = CLI_SHOWFD_F_ANY;
+
+        if (*args[arg]) {
+                ctx->fd = atoi(args[arg]);
+                ctx->show_one = 1;
+        }
+
+        return 0;
+}
+
+/* parse a "set timeout" CLI request. It always returns 1. */
+static int cli_parse_set_timeout(char **args, char *payload, struct appctx *appctx, void *private)
+{
+        struct stream *s = appctx_strm(appctx);
+
+        if (strcmp(args[2], "cli") == 0) {
+                unsigned timeout;
+                const char *res;
+
+                if (!*args[3])
+                        return cli_err(appctx, "Expects an integer value.\n");
+
+                res = parse_time_err(args[3], &timeout, TIME_UNIT_S);
+                if (res || timeout < 1)
+                        return cli_err(appctx, "Invalid timeout value.\n");
+
+                s->scf->ioto = 1 + MS_TO_TICKS(timeout*1000);
+                task_wakeup(s->task, TASK_WOKEN_MSG); // recompute timeouts
+                return 1;
+        }
+
+        return cli_err(appctx, "'set timeout' only supports 'cli'.\n");
+}
+
+/* parse a "set maxconn global" command. It always returns 1. */
+static int cli_parse_set_maxconn_global(char **args, char *payload, struct appctx *appctx, void *private)
+{
+        int v;
+
+        if (!cli_has_level(appctx, ACCESS_LVL_ADMIN))
+                return 1;
+
+        if (!*args[3])
+                return cli_err(appctx, "Expects an integer value.\n");
+
+        v = atoi(args[3]);
+        if (v > global.hardmaxconn)
+                return cli_err(appctx, "Value out of range.\n");
+
+        /* check for unlimited values */
+        if (v <= 0)
+                v = global.hardmaxconn;
+
+        global.maxconn = v;
+
+        /* Dequeues all of the listeners waiting for a resource */
+        dequeue_all_listeners();
+
+        return 1;
+}
+
+static int set_severity_output(int *target, char *argument)
+{
+        if (strcmp(argument, "none") == 0) {
+                *target = CLI_SEVERITY_NONE;
+                return 1;
+        }
+        else if (strcmp(argument, "number") == 0) {
+                *target = CLI_SEVERITY_NUMBER;
+                return 1;
+        }
+        else if (strcmp(argument, "string") == 0) {
+                *target = CLI_SEVERITY_STRING;
+                return 1;
+        }
+        return 0;
+}
+
+/* parse a "set severity-output" command.
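+ * Only the values accepted by set_severity_output() above are valid, e.g.
+ * "set severity-output number"; anything else triggers the error below.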
*/ +static int cli_parse_set_severity_output(char **args, char *payload, struct appctx *appctx, void *private) +{ + /* this will ask the applet to not output a \n after the command */ + if (strcmp(args[3], "-") == 0) + appctx->st1 |= APPCTX_CLI_ST1_NOLF; + + if (*args[2] && set_severity_output(&appctx->cli_severity_output, args[2])) + return 0; + + return cli_err(appctx, "one of 'none', 'number', 'string' is a required argument\n"); +} + + +/* show the level of the current CLI session */ +static int cli_parse_show_lvl(char **args, char *payload, struct appctx *appctx, void *private) +{ + if ((appctx->cli_level & ACCESS_LVL_MASK) == ACCESS_LVL_ADMIN) + return cli_msg(appctx, LOG_INFO, "admin\n"); + else if ((appctx->cli_level & ACCESS_LVL_MASK) == ACCESS_LVL_OPER) + return cli_msg(appctx, LOG_INFO, "operator\n"); + else if ((appctx->cli_level & ACCESS_LVL_MASK) == ACCESS_LVL_USER) + return cli_msg(appctx, LOG_INFO, "user\n"); + else + return cli_msg(appctx, LOG_INFO, "unknown\n"); +} + +/* parse and set the CLI level dynamically */ +static int cli_parse_set_lvl(char **args, char *payload, struct appctx *appctx, void *private) +{ + /* this will ask the applet to not output a \n after the command */ + if (strcmp(args[1], "-") == 0) + appctx->st1 |= APPCTX_CLI_ST1_NOLF; + + if (strcmp(args[0], "operator") == 0) { + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) { + return 1; + } + appctx->cli_level &= ~ACCESS_LVL_MASK; + appctx->cli_level |= ACCESS_LVL_OPER; + + } else if (strcmp(args[0], "user") == 0) { + if (!cli_has_level(appctx, ACCESS_LVL_USER)) { + return 1; + } + appctx->cli_level &= ~ACCESS_LVL_MASK; + appctx->cli_level |= ACCESS_LVL_USER; + } + appctx->cli_level &= ~(ACCESS_EXPERT|ACCESS_EXPERIMENTAL); + return 1; +} + + +/* parse and set the CLI expert/experimental-mode dynamically */ +static int cli_parse_expert_experimental_mode(char **args, char *payload, struct appctx *appctx, void *private) +{ + int level; + char *level_str; + char *output = NULL; + + /* this will ask the applet to not output a \n after the command */ + if (*args[1] && *args[2] && strcmp(args[2], "-") == 0) + appctx->st1 |= APPCTX_CLI_ST1_NOLF; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (strcmp(args[0], "expert-mode") == 0) { + level = ACCESS_EXPERT; + level_str = "expert-mode"; + } + else if (strcmp(args[0], "experimental-mode") == 0) { + level = ACCESS_EXPERIMENTAL; + level_str = "experimental-mode"; + } + else if (strcmp(args[0], "mcli-debug-mode") == 0) { + level = ACCESS_MCLI_DEBUG; + level_str = "mcli-debug-mode"; + } + else { + return 1; + } + + if (!*args[1]) { + memprintf(&output, "%s is %s\n", level_str, + (appctx->cli_level & level) ? "ON" : "OFF"); + return cli_dynmsg(appctx, LOG_INFO, output); + } + + appctx->cli_level &= ~level; + if (strcmp(args[1], "on") == 0) + appctx->cli_level |= level; + return 1; +} + +/* shows HAProxy version */ +static int cli_parse_show_version(char **args, char *payload, struct appctx *appctx, void *private) +{ + char *msg = NULL; + + return cli_dynmsg(appctx, LOG_INFO, memprintf(&msg, "%s\n", haproxy_version)); +} + +int cli_parse_default(char **args, char *payload, struct appctx *appctx, void *private) +{ + return 0; +} + +/* enable or disable the anonymized mode, it returns 1 when it works or displays an error message if it doesn't. 
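+ *
+ * For illustration, hypothetical invocations matching the parser below:
+ *   set anon on 1234   enable anonymized mode with key 1234 (1..4294967295)
+ *   set anon on        enable it with the global key, or a random one
+ *   set anon off       disable anonymized mode (no key allowed)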
+ */
+static int cli_parse_set_anon(char **args, char *payload, struct appctx *appctx, void *private)
+{
+        uint32_t tmp;
+        long long key;
+
+        if (strcmp(args[2], "on") == 0) {
+
+                if (*args[3]) {
+                        key = atoll(args[3]);
+                        if (key < 1 || key > UINT_MAX)
+                                return cli_err(appctx, "Value out of range (1 to 4294967295 expected).\n");
+                        appctx->cli_anon_key = key;
+                }
+                else {
+                        tmp = HA_ATOMIC_LOAD(&global.anon_key);
+                        if (tmp != 0)
+                                appctx->cli_anon_key = tmp;
+                        else
+                                appctx->cli_anon_key = ha_random32();
+                }
+        }
+        else if (strcmp(args[2], "off") == 0) {
+
+                if (*args[3]) {
+                        return cli_err(appctx, "Key can't be added while disabling anonymized mode\n");
+                }
+                else {
+                        appctx->cli_anon_key = 0;
+                }
+        }
+        else {
+                return cli_err(appctx,
+                               "'set anon' only supports :\n"
+                               "   - 'on' [key] to enable the anonymized mode\n"
+                               "   - 'off' to disable the anonymized mode");
+        }
+        return 1;
+}
+
+/* This function sets the global anonymizing key, restricted to level 'admin' */
+static int cli_parse_set_global_key(char **args, char *payload, struct appctx *appctx, void *private)
+{
+        long long key;
+
+        if (!cli_has_level(appctx, ACCESS_LVL_ADMIN))
+                return cli_err(appctx, "Permission denied\n");
+        if (!*args[2])
+                return cli_err(appctx, "Expects an integer value.\n");
+
+        key = atoll(args[2]);
+        if (key < 0 || key > UINT_MAX)
+                return cli_err(appctx, "Value out of range (0 to 4294967295 expected).\n");
+
+        HA_ATOMIC_STORE(&global.anon_key, key);
+        return 1;
+}
+
+/* shows the anonymized mode state to everyone, and the key except for users; it always returns 1. */
+static int cli_parse_show_anon(char **args, char *payload, struct appctx *appctx, void *private)
+{
+        char *msg = NULL;
+        char *anon_mode = NULL;
+        uint32_t c_key = appctx->cli_anon_key;
+
+        if (!c_key)
+                anon_mode = "Anonymized mode disabled";
+        else
+                anon_mode = "Anonymized mode enabled";
+
+        if ( !((appctx->cli_level & ACCESS_LVL_MASK) < ACCESS_LVL_OPER) && c_key != 0) {
+                cli_dynmsg(appctx, LOG_INFO, memprintf(&msg, "%s\nKey : %u\n", anon_mode, c_key));
+        }
+        else {
+                cli_dynmsg(appctx, LOG_INFO, memprintf(&msg, "%s\n", anon_mode));
+        }
+
+        return 1;
+}
+
+/* parse a "set rate-limit" command. It always returns 1.
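+ *
+ * For illustration, hypothetical invocations matching the parser below:
+ *   set rate-limit connections global 100
+ *   set rate-limit http-compression global 1024   (value in kB/s, multiplied
+ *                                                   by 1024 internally)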
*/ +static int cli_parse_set_ratelimit(char **args, char *payload, struct appctx *appctx, void *private) +{ + int v; + int *res; + int mul = 1; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (strcmp(args[2], "connections") == 0 && strcmp(args[3], "global") == 0) + res = &global.cps_lim; + else if (strcmp(args[2], "sessions") == 0 && strcmp(args[3], "global") == 0) + res = &global.sps_lim; +#ifdef USE_OPENSSL + else if (strcmp(args[2], "ssl-sessions") == 0 && strcmp(args[3], "global") == 0) + res = &global.ssl_lim; +#endif + else if (strcmp(args[2], "http-compression") == 0 && strcmp(args[3], "global") == 0) { + res = &global.comp_rate_lim; + mul = 1024; + } + else { + return cli_err(appctx, + "'set rate-limit' only supports :\n" + " - 'connections global' to set the per-process maximum connection rate\n" + " - 'sessions global' to set the per-process maximum session rate\n" +#ifdef USE_OPENSSL + " - 'ssl-sessions global' to set the per-process maximum SSL session rate\n" +#endif + " - 'http-compression global' to set the per-process maximum compression speed in kB/s\n"); + } + + if (!*args[4]) + return cli_err(appctx, "Expects an integer value.\n"); + + v = atoi(args[4]); + if (v < 0) + return cli_err(appctx, "Value out of range.\n"); + + *res = v * mul; + + /* Dequeues all of the listeners waiting for a resource */ + dequeue_all_listeners(); + + return 1; +} + +/* parse the "expose-fd" argument on the bind lines */ +static int bind_parse_expose_fd(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing fd type", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + if (strcmp(args[cur_arg + 1], "listeners") == 0) { + conf->level |= ACCESS_FD_LISTENERS; + } else { + memprintf(err, "'%s' only supports 'listeners' (got '%s')", + args[cur_arg], args[cur_arg+1]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +/* parse the "level" argument on the bind lines */ +static int bind_parse_level(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing level", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if (strcmp(args[cur_arg + 1], "user") == 0) { + conf->level &= ~ACCESS_LVL_MASK; + conf->level |= ACCESS_LVL_USER; + } else if (strcmp(args[cur_arg + 1], "operator") == 0) { + conf->level &= ~ACCESS_LVL_MASK; + conf->level |= ACCESS_LVL_OPER; + } else if (strcmp(args[cur_arg + 1], "admin") == 0) { + conf->level &= ~ACCESS_LVL_MASK; + conf->level |= ACCESS_LVL_ADMIN; + } else { + memprintf(err, "'%s' only supports 'user', 'operator', and 'admin' (got '%s')", + args[cur_arg], args[cur_arg+1]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +static int bind_parse_severity_output(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing severity format", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if (set_severity_output(&conf->severity_output, args[cur_arg+1])) + return 0; + else { + memprintf(err, "'%s' only supports 'none', 'number', and 'string' (got '%s')", + args[cur_arg], args[cur_arg+1]); + return ERR_ALERT | ERR_FATAL; + } +} + +/* Send all the bound sockets, always returns 1 */ +static int _getsocks(char **args, char *payload, struct appctx *appctx, void *private) +{ + static int already_sent = 0; + char *cmsgbuf = NULL; + unsigned char *tmpbuf = NULL; + struct cmsghdr *cmsg; 
+        struct stconn *sc = appctx_sc(appctx);
+        struct stream *s = __sc_strm(sc);
+        struct connection *remote = sc_conn(sc_opposite(sc));
+        struct msghdr msghdr;
+        struct iovec iov;
+        struct timeval tv = { .tv_sec = 1, .tv_usec = 0 };
+        const char *ns_name, *if_name;
+        unsigned char ns_nlen, if_nlen;
+        int nb_queued;
+        int cur_fd = 0;
+        int *tmpfd;
+        int tot_fd_nb = 0;
+        int fd = -1;
+        int curoff = 0;
+        int old_fcntl = -1;
+        int ret;
+
+        if (!remote) {
+                ha_warning("Only works on real connections\n");
+                goto out;
+        }
+
+        fd = remote->handle.fd;
+
+        /* Temporarily set the FD in blocking mode; that will make our life easier */
+        old_fcntl = fcntl(fd, F_GETFL);
+        if (old_fcntl < 0) {
+                ha_warning("Couldn't get the flags for the unix socket\n");
+                goto out;
+        }
+        cmsgbuf = malloc(CMSG_SPACE(sizeof(int) * MAX_SEND_FD));
+        if (!cmsgbuf) {
+                ha_warning("Failed to allocate memory to send sockets\n");
+                goto out;
+        }
+        if (fcntl(fd, F_SETFL, old_fcntl &~ O_NONBLOCK) == -1) {
+                ha_warning("Cannot make the unix socket blocking\n");
+                goto out;
+        }
+        setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, (void *)&tv, sizeof(tv));
+        iov.iov_base = &tot_fd_nb;
+        iov.iov_len = sizeof(tot_fd_nb);
+        if (!(strm_li(s)->bind_conf->level & ACCESS_FD_LISTENERS))
+                goto out;
+        memset(&msghdr, 0, sizeof(msghdr));
+        /*
+         * First, calculate the total number of FDs, so that we can let
+         * the caller know how many to expect.
+         */
+        for (cur_fd = 0; cur_fd < global.maxsock; cur_fd++)
+                tot_fd_nb += !!(fdtab[cur_fd].state & FD_EXPORTED);
+
+        if (tot_fd_nb == 0) {
+                if (already_sent)
+                        ha_warning("_getsocks: attempt to get sockets but they were already sent and closed in this process!\n");
+                goto out;
+        }
+
+        /* First send the total number of file descriptors, so that the
+         * receiving end knows what to expect.
+         */
+        msghdr.msg_iov = &iov;
+        msghdr.msg_iovlen = 1;
+        ret = sendmsg(fd, &msghdr, 0);
+        if (ret != sizeof(tot_fd_nb)) {
+                ha_warning("Failed to send the number of sockets to send\n");
+                goto out;
+        }
+
+        /* Now send the fds */
+        msghdr.msg_control = cmsgbuf;
+        msghdr.msg_controllen = CMSG_SPACE(sizeof(int) * MAX_SEND_FD);
+        cmsg = CMSG_FIRSTHDR(&msghdr);
+        cmsg->cmsg_len = CMSG_LEN(MAX_SEND_FD * sizeof(int));
+        cmsg->cmsg_level = SOL_SOCKET;
+        cmsg->cmsg_type = SCM_RIGHTS;
+        tmpfd = (int *)CMSG_DATA(cmsg);
+
+        /* For each socket, a message is sent, containing the following :
+         *  Size of the namespace name (or 0 if none), as an unsigned char.
+         *  The namespace name, if any
+         *  Size of the interface name (or 0 if none), as an unsigned char
+         *  The interface name, if any
+         *  32 bits of zeroes (used to be listener options).
+         */
+        /* We will send sockets in batches of MAX_SEND_FD, so allocate a
+         * buffer big enough to store the socket information.
+         */
+        tmpbuf = malloc(MAX_SEND_FD * (1 + MAXPATHLEN + 1 + IFNAMSIZ + sizeof(int)));
+        if (tmpbuf == NULL) {
+                ha_warning("Failed to allocate memory to transfer socket information\n");
+                goto out;
+        }
+
+        nb_queued = 0;
+        iov.iov_base = tmpbuf;
+        for (cur_fd = 0; cur_fd < global.maxsock; cur_fd++) {
+                if (!(fdtab[cur_fd].state & FD_EXPORTED))
+                        continue;
+
+                ns_name = if_name = "";
+                ns_nlen = if_nlen = 0;
+
+                /* for now we can only retrieve namespaces and interfaces from
+                 * pure listeners.
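+                 *
+                 * For illustration, a minimal sketch of the receiving side of
+                 * the transfer protocol described above could look like this
+                 * (hypothetical: <sock> is the connected socket, error handling
+                 * is omitted, and each full batch of MAX_SEND_FD must be
+                 * acknowledged as described above):
+                 *
+                 *   char cbuf[CMSG_SPACE(MAX_SEND_FD * sizeof(int))];
+                 *   char buf[MAX_SEND_FD * (1 + MAXPATHLEN + 1 + IFNAMSIZ + sizeof(int))];
+                 *   struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
+                 *   struct msghdr mh = { .msg_iov = &iov, .msg_iovlen = 1,
+                 *                        .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
+                 *   struct cmsghdr *c;
+                 *   int nb_fd;
+                 *
+                 *   recv(sock, &nb_fd, sizeof(nb_fd), 0);  // total count comes first
+                 *   if (recvmsg(sock, &mh, 0) > 0) {
+                 *           for (c = CMSG_FIRSTHDR(&mh); c; c = CMSG_NXTHDR(&mh, c))
+                 *                   if (c->cmsg_level == SOL_SOCKET && c->cmsg_type == SCM_RIGHTS)
+                 *                           ; // (int *)CMSG_DATA(c) holds a batch of FDs
+                 *   }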
+                 */
+                if (fdtab[cur_fd].iocb == sock_accept_iocb) {
+                        const struct listener *l = fdtab[cur_fd].owner;
+
+                        if (l->rx.settings->interface) {
+                                if_name = l->rx.settings->interface;
+                                if_nlen = strlen(if_name);
+                        }
+
+#ifdef USE_NS
+                        if (l->rx.settings->netns) {
+                                ns_name = l->rx.settings->netns->node.key;
+                                ns_nlen = l->rx.settings->netns->name_len;
+                        }
+#endif
+                }
+
+                /* put the FD into the CMSG_DATA */
+                tmpfd[nb_queued++] = cur_fd;
+
+                /* first block is <ns_name_len> <ns_name> */
+                tmpbuf[curoff++] = ns_nlen;
+                if (ns_nlen)
+                        memcpy(tmpbuf + curoff, ns_name, ns_nlen);
+                curoff += ns_nlen;
+
+                /* second block is <if_name_len> <if_name> */
+                tmpbuf[curoff++] = if_nlen;
+                if (if_nlen)
+                        memcpy(tmpbuf + curoff, if_name, if_nlen);
+                curoff += if_nlen;
+
+                /* we used to send the listener options here before 2.3 */
+                memset(tmpbuf + curoff, 0, sizeof(int));
+                curoff += sizeof(int);
+
+                /* there's a limit to how many FDs may be sent at once */
+                if (nb_queued == MAX_SEND_FD) {
+                        iov.iov_len = curoff;
+                        if (sendmsg(fd, &msghdr, 0) != curoff) {
+                                ha_warning("Failed to transfer sockets\n");
+                                goto out;
+                        }
+
+                        /* Wait for an ack */
+                        do {
+                                ret = recv(fd, &tot_fd_nb, sizeof(tot_fd_nb), 0);
+                        } while (ret == -1 && errno == EINTR);
+
+                        if (ret <= 0) {
+                                ha_warning("Unexpected error while transferring sockets\n");
+                                goto out;
+                        }
+                        curoff = 0;
+                        nb_queued = 0;
+                }
+        }
+
+        already_sent = 1;
+
+        /* flush pending stuff */
+        if (nb_queued) {
+                iov.iov_len = curoff;
+                cmsg->cmsg_len = CMSG_LEN(nb_queued * sizeof(int));
+                msghdr.msg_controllen = CMSG_SPACE(nb_queued * sizeof(int));
+                if (sendmsg(fd, &msghdr, 0) != curoff) {
+                        ha_warning("Failed to transfer sockets\n");
+                        goto out;
+                }
+        }
+
+out:
+        if (fd >= 0 && old_fcntl >= 0 && fcntl(fd, F_SETFL, old_fcntl) == -1)
+                ha_warning("Cannot make the unix socket non-blocking\n");
+        se_fl_set(appctx->sedesc, SE_FL_EOI);
+        appctx->st0 = CLI_ST_END;
+        free(cmsgbuf);
+        free(tmpbuf);
+        return 1;
+}
+
+static int cli_parse_simple(char **args, char *payload, struct appctx *appctx, void *private)
+{
+        if (*args[0] == 'h')
+                /* help */
+                cli_gen_usage_msg(appctx, args);
+        else if (*args[0] == 'p')
+                /* prompt */
+                if (strcmp(args[1], "timed") == 0) {
+                        appctx->st1 |= APPCTX_CLI_ST1_PROMPT;
+                        appctx->st1 ^= APPCTX_CLI_ST1_TIMED;
+                }
+                else
+                        appctx->st1 ^= APPCTX_CLI_ST1_PROMPT;
+        else if (*args[0] == 'q') {
+                /* quit */
+                se_fl_set(appctx->sedesc, SE_FL_EOI);
+                appctx->st0 = CLI_ST_END;
+        }
+
+        return 1;
+}
+
+void pcli_write_prompt(struct stream *s)
+{
+        struct buffer *msg = get_trash_chunk();
+        struct channel *oc = sc_oc(s->scf);
+
+        if (!(s->pcli_flags & PCLI_F_PROMPT))
+                return;
+
+        if (s->pcli_flags & PCLI_F_PAYLOAD) {
+                chunk_appendf(msg, "+ ");
+        } else {
+                if (s->pcli_next_pid == 0) {
+                        /* master's prompt */
+                        if (s->pcli_flags & PCLI_F_TIMED) {
+                                uint up = ns_to_sec(now_ns - start_time_ns);
+                                chunk_appendf(msg, "[%u:%02u:%02u:%02u] ",
+                                              (up / 86400), (up / 3600) % 24, (up / 60) % 60, up % 60);
+                        }
+
+                        chunk_appendf(msg, "master%s",
+                                      (proc_self->failedreloads > 0) ?
"[ReloadFailed]" : ""); + } + else { + /* worker's prompt */ + if (s->pcli_flags & PCLI_F_TIMED) { + const struct mworker_proc *tmp, *proc; + uint up; + + /* set proc to the worker corresponding to pcli_next_pid or NULL */ + proc = NULL; + list_for_each_entry(tmp, &proc_list, list) { + if (!(tmp->options & PROC_O_TYPE_WORKER)) + continue; + if (tmp->pid == s->pcli_next_pid) { + proc = tmp; + break; + } + } + + if (!proc) + chunk_appendf(msg, "[gone] "); + else { + up = date.tv_sec - proc->timestamp; + if ((int)up < 0) /* must never be negative because of clock drift */ + up = 0; + chunk_appendf(msg, "[%u:%02u:%02u:%02u] ", + (up / 86400), (up / 3600) % 24, (up / 60) % 60, up % 60); + } + } + chunk_appendf(msg, "%d", s->pcli_next_pid); + } + + if (s->pcli_flags & (ACCESS_EXPERIMENTAL|ACCESS_EXPERT|ACCESS_MCLI_DEBUG)) { + chunk_appendf(msg, "("); + + if (s->pcli_flags & ACCESS_EXPERIMENTAL) + chunk_appendf(msg, "x"); + + if (s->pcli_flags & ACCESS_EXPERT) + chunk_appendf(msg, "e"); + + if (s->pcli_flags & ACCESS_MCLI_DEBUG) + chunk_appendf(msg, "d"); + + chunk_appendf(msg, ")"); + } + + chunk_appendf(msg, "> "); + + + } + co_inject(oc, msg->area, msg->data); +} + +/* The pcli_* functions are used for the CLI proxy in the master */ + + +/* flush the input buffer and output an error */ +void pcli_error(struct stream *s, const char *msg) +{ + struct buffer *buf = get_trash_chunk(); + struct channel *oc = &s->res; + struct channel *ic = &s->req; + + chunk_initstr(buf, msg); + + if (likely(buf && buf->data)) + co_inject(oc, buf->area, buf->data); + + channel_erase(ic); + +} + +/* flush the input buffer, output the error and close */ +void pcli_reply_and_close(struct stream *s, const char *msg) +{ + struct buffer *buf = get_trash_chunk(); + + chunk_initstr(buf, msg); + stream_retnclose(s, buf); +} + +static enum obj_type *pcli_pid_to_server(int proc_pid) +{ + struct mworker_proc *child; + + /* return the mCLI applet of the master */ + if (proc_pid == 0) + return &mcli_applet.obj_type; + + list_for_each_entry(child, &proc_list, list) { + if (child->pid == proc_pid){ + return &child->srv->obj_type; + } + } + return NULL; +} + +/* Take a CLI prefix in argument (eg: @!1234 @master @1) + * Return: + * 0: master + * > 0: pid of a worker + * < 0: didn't find a worker + */ +static int pcli_prefix_to_pid(const char *prefix) +{ + int proc_pid; + struct mworker_proc *child; + char *errtol = NULL; + + if (*prefix != '@') /* not a prefix, should not happen */ + return -1; + + prefix++; + if (!*prefix) /* sent @ alone, return the master */ + return 0; + + if (strcmp("master", prefix) == 0) { + return 0; + } else if (*prefix == '!') { + prefix++; + if (!*prefix) + return -1; + + proc_pid = strtol(prefix, &errtol, 10); + if (*errtol != '\0') + return -1; + list_for_each_entry(child, &proc_list, list) { + if (!(child->options & PROC_O_TYPE_WORKER)) + continue; + if (child->pid == proc_pid){ + return child->pid; + } + } + } else { + struct mworker_proc *chosen = NULL; + /* this is a relative pid */ + + proc_pid = strtol(prefix, &errtol, 10); + if (*errtol != '\0') + return -1; + + if (proc_pid == 0) /* return the master */ + return 0; + + if (proc_pid != 1) /* only the "@1" relative PID is supported */ + return -1; + + /* chose the right process, the current one is the one with the + least number of reloads */ + list_for_each_entry(child, &proc_list, list) { + if (!(child->options & PROC_O_TYPE_WORKER)) + continue; + if (child->reloads == 0) + return child->pid; + else if (chosen == NULL || child->reloads < 
chosen->reloads)
+                                chosen = child;
+                }
+                if (chosen)
+                        return chosen->pid;
+        }
+        return -1;
+}
+
+/* Return:
+ *  >= 0 : number of words to trim
+ *  = -1 : error
+ */
+int pcli_find_and_exec_kw(struct stream *s, char **args, int argl, char **errmsg, int *next_pid)
+{
+        if (argl < 1)
+                return 0;
+
+        /* there is a prefix */
+        if (args[0][0] == '@') {
+                int target_pid = pcli_prefix_to_pid(args[0]);
+
+                if (target_pid == -1) {
+                        memprintf(errmsg, "Can't find the target PID matching the prefix '%s'\n", args[0]);
+                        return -1;
+                }
+
+                /* if the prefix is alone, define a default target */
+                if (argl == 1)
+                        s->pcli_next_pid = target_pid;
+                else
+                        *next_pid = target_pid;
+                return 1;
+        } else if (strcmp("prompt", args[0]) == 0) {
+                if (argl >= 2 && strcmp(args[1], "timed") == 0) {
+                        s->pcli_flags |= PCLI_F_PROMPT;
+                        s->pcli_flags ^= PCLI_F_TIMED;
+                }
+                else
+                        s->pcli_flags ^= PCLI_F_PROMPT;
+                return argl; /* return the number of elements in the array */
+        } else if (strcmp("quit", args[0]) == 0) {
+                sc_schedule_abort(s->scf);
+                sc_schedule_shutdown(s->scf);
+                return argl; /* return the number of elements in the array */
+        } else if (strcmp(args[0], "operator") == 0) {
+                if (!pcli_has_level(s, ACCESS_LVL_OPER)) {
+                        memprintf(errmsg, "Permission denied!\n");
+                        return -1;
+                }
+                s->pcli_flags &= ~ACCESS_LVL_MASK;
+                s->pcli_flags |= ACCESS_LVL_OPER;
+                return argl;
+
+        } else if (strcmp(args[0], "user") == 0) {
+                if (!pcli_has_level(s, ACCESS_LVL_USER)) {
+                        memprintf(errmsg, "Permission denied!\n");
+                        return -1;
+                }
+                s->pcli_flags &= ~ACCESS_LVL_MASK;
+                s->pcli_flags |= ACCESS_LVL_USER;
+                return argl;
+
+        } else if (strcmp(args[0], "expert-mode") == 0) {
+                if (!pcli_has_level(s, ACCESS_LVL_ADMIN)) {
+                        memprintf(errmsg, "Permission denied!\n");
+                        return -1;
+                }
+
+                s->pcli_flags &= ~ACCESS_EXPERT;
+                if ((argl > 1) && (strcmp(args[1], "on") == 0))
+                        s->pcli_flags |= ACCESS_EXPERT;
+                return argl;
+
+        } else if (strcmp(args[0], "experimental-mode") == 0) {
+                if (!pcli_has_level(s, ACCESS_LVL_ADMIN)) {
+                        memprintf(errmsg, "Permission denied!\n");
+                        return -1;
+                }
+                s->pcli_flags &= ~ACCESS_EXPERIMENTAL;
+                if ((argl > 1) && (strcmp(args[1], "on") == 0))
+                        s->pcli_flags |= ACCESS_EXPERIMENTAL;
+                return argl;
+        } else if (strcmp(args[0], "mcli-debug-mode") == 0) {
+                if (!pcli_has_level(s, ACCESS_LVL_ADMIN)) {
+                        memprintf(errmsg, "Permission denied!\n");
+                        return -1;
+                }
+                s->pcli_flags &= ~ACCESS_MCLI_DEBUG;
+                if ((argl > 1) && (strcmp(args[1], "on") == 0))
+                        s->pcli_flags |= ACCESS_MCLI_DEBUG;
+                return argl;
+        } else if (strcmp(args[0], "set") == 0) {
+                if ((argl > 1) && (strcmp(args[1], "severity-output") == 0)) {
+                        if ((argl > 2) && strcmp(args[2], "none") == 0) {
+                                s->pcli_flags &= ~(ACCESS_MCLI_SEVERITY_NB|ACCESS_MCLI_SEVERITY_STR);
+                        } else if ((argl > 2) && strcmp(args[2], "string") == 0) {
+                                s->pcli_flags |= ACCESS_MCLI_SEVERITY_STR;
+                        } else if ((argl > 2) && strcmp(args[2], "number") == 0) {
+                                s->pcli_flags |= ACCESS_MCLI_SEVERITY_NB;
+                        } else {
+                                memprintf(errmsg, "one of 'none', 'number', 'string' is a required argument\n");
+                                return -1;
+                        }
+                        /* only skip argl if we have "set severity-output" not only "set" */
+                        return argl;
+                }
+        }
+
+        return 0;
+}
+
+/*
+ * Parse the CLI request:
+ * - It does basically the same as the cli_io_handler, but as a proxy
+ * - It can exec a command and strip non-forwardable commands
+ *
+ * Return:
+ * - the number of characters to forward, or
+ * - -1 if there is an error or not enough data
+ */
+int pcli_parse_request(struct stream *s, struct channel *req, char
**errmsg, int *next_pid) +{ + char *str; + char *end; + char *args[MAX_CLI_ARGS + 1]; /* +1 for storing a NULL */ + int argl; /* number of args */ + char *p; + char *trim = NULL; + int wtrim = 0; /* number of words to trim */ + int reql = 0; + int ret; + int i = 0; + + /* we cannot deal with a wrapping buffer, so let's take care of this + * first. + */ + if (b_head(&req->buf) + b_data(&req->buf) > b_wrap(&req->buf)) + b_slow_realign(&req->buf, trash.area, co_data(req)); + + str = (char *)ci_head(req); + end = (char *)ci_stop(req); + + p = str; + + if (!(s->pcli_flags & PCLI_F_PAYLOAD)) { + + /* Looks for the end of one command */ + while (p+reql < end) { + /* handle escaping */ + if (p[reql] == '\\') { + reql+=2; + continue; + } + if (p[reql] == ';' || p[reql] == '\n') { + /* found the end of the command */ + p[reql] = '\n'; + reql++; + break; + } + reql++; + } + } else { + while (p+reql < end) { + if (p[reql] == '\n') { + /* found the end of the line */ + reql++; + break; + } + reql++; + } + } + + /* set end to first byte after the end of the command */ + end = p + reql; + + /* there is no end to this command, need more to parse ! */ + if (!reql || *(end-1) != '\n') { + ret = -1; + goto end; + } + + /* in payload mode, skip the whole parsing/exec and just look for a pattern */ + if (s->pcli_flags & PCLI_F_PAYLOAD) { + if (reql-1 == strlen(s->pcli_payload_pat)) { + /* the custom pattern len can be 0 (empty line) */ + if (strncmp(str, s->pcli_payload_pat, strlen(s->pcli_payload_pat)) == 0) { + s->pcli_flags &= ~PCLI_F_PAYLOAD; + } + } + ret = reql; + goto end; + } + + *(end-1) = '\0'; + + /* splits the command in words */ + while (i < MAX_CLI_ARGS && p < end) { + /* skip leading spaces/tabs */ + p += strspn(p, " \t"); + if (!*p) + break; + + args[i] = p; + while (1) { + p += strcspn(p, " \t\\"); + /* escaped chars using backlashes (\) */ + if (*p == '\\') { + if (!*++p) + break; + if (!*++p) + break; + } else { + break; + } + } + *p++ = 0; + i++; + } + argl = i; + + /* first look for '<<' at the beginning of the last argument */ + if (argl && strncmp(args[argl-1], PAYLOAD_PATTERN, strlen(PAYLOAD_PATTERN)) == 0) { + size_t pat_len = strlen(args[argl-1] + strlen(PAYLOAD_PATTERN)); + + /* + * A customized pattern can't be more than 7 characters + * if it's more, don't make it a payload + */ + if (pat_len < sizeof(s->pcli_payload_pat)) { + s->pcli_flags |= PCLI_F_PAYLOAD; + /* copy the customized pattern, don't store the << */ + strncpy(s->pcli_payload_pat, args[argl-1] + strlen(PAYLOAD_PATTERN), sizeof(s->pcli_payload_pat)-1); + s->pcli_payload_pat[sizeof(s->pcli_payload_pat)-1] = '\0'; + } + } + + for (; i < MAX_CLI_ARGS + 1; i++) + args[i] = NULL; + + wtrim = pcli_find_and_exec_kw(s, args, argl, errmsg, next_pid); + + /* End of words are ending by \0, we need to replace the \0s by spaces + before forwarding them */ + p = str; + while (p < end-1) { + if (*p == '\0') + *p = ' '; + p++; + } + + *(end-1) = '\n'; + + if (wtrim > 0) { + trim = &args[wtrim][0]; + if (trim == NULL) /* if this was the last word in the table */ + trim = end; + + b_del(&req->buf, trim - str); + + ret = end - trim; + } else if (wtrim < 0) { + /* parsing error */ + ret = -1; + goto end; + } else { + /* the whole string */ + ret = end - str; + } + + if (ret > 1) { + + /* the mcli-debug-mode is only sent to the applet of the master */ + if ((s->pcli_flags & ACCESS_MCLI_DEBUG) && *next_pid <= 0) { + ci_insert_line2(req, 0, "mcli-debug-mode on -", strlen("mcli-debug-mode on -")); + ret += strlen("mcli-debug-mode on -") + 2; 
+ } + if (s->pcli_flags & ACCESS_EXPERIMENTAL) { + ci_insert_line2(req, 0, "experimental-mode on -", strlen("experimental-mode on -")); + ret += strlen("experimental-mode on -") + 2; + } + if (s->pcli_flags & ACCESS_EXPERT) { + ci_insert_line2(req, 0, "expert-mode on -", strlen("expert-mode on -")); + ret += strlen("expert-mode on -") + 2; + } + if (s->pcli_flags & ACCESS_MCLI_SEVERITY_STR) { + const char *cmd = "set severity-output string -"; + ci_insert_line2(req, 0, cmd, strlen(cmd)); + ret += strlen(cmd) + 2; + } + if (s->pcli_flags & ACCESS_MCLI_SEVERITY_NB) { + const char *cmd = "set severity-output number -"; + ci_insert_line2(req, 0, cmd, strlen(cmd)); + ret += strlen(cmd) + 2; + } + + if (pcli_has_level(s, ACCESS_LVL_ADMIN)) { + goto end; + } else if (pcli_has_level(s, ACCESS_LVL_OPER)) { + ci_insert_line2(req, 0, "operator -", strlen("operator -")); + ret += strlen("operator -") + 2; + } else if (pcli_has_level(s, ACCESS_LVL_USER)) { + ci_insert_line2(req, 0, "user -", strlen("user -")); + ret += strlen("user -") + 2; + } + } +end: + + return ret; +} + +int pcli_wait_for_request(struct stream *s, struct channel *req, int an_bit) +{ + int next_pid = -1; + int to_forward; + char *errmsg = NULL; + + /* Don't read the next command if still processing the response of the + * current one. Just wait. At this stage, errors should be handled by + * the response analyzer. + */ + if (s->res.analysers & AN_RES_WAIT_CLI) + return 0; + + if ((s->pcli_flags & ACCESS_LVL_MASK) == ACCESS_LVL_NONE) + s->pcli_flags |= strm_li(s)->bind_conf->level & ACCESS_LVL_MASK; + + /* stream that comes from the reload listener only responses the reload + * status and quits */ + if (!(s->pcli_flags & PCLI_F_RELOAD) + && strm_li(s)->bind_conf == mcli_reload_bind_conf) + goto send_status; + + +read_again: + /* if the channel is closed for read, we won't receive any more data + from the client, but we don't want to forward this close to the + server */ + channel_dont_close(req); + + /* We don't know yet to which server we will connect */ + channel_dont_connect(req); + + s->scf->flags |= SC_FL_RCV_ONCE; + + /* need more data */ + if (!ci_data(req)) + goto missing_data; + + /* If there is data available for analysis, log the end of the idle time. 
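+         * Concretely, right below: t_idle = time since accept (in ms) minus
+         * the handshake time, and it is computed only once (while still -1).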
*/ + if (c_data(req) && s->logs.t_idle == -1) + s->logs.t_idle = ns_to_ms(now_ns - s->logs.accept_ts) - s->logs.t_handshake; + + to_forward = pcli_parse_request(s, req, &errmsg, &next_pid); + if (to_forward > 0) { + int target_pid; + /* enough data */ + + /* forward only 1 command */ + channel_forward(req, to_forward); + + if (!(s->pcli_flags & PCLI_F_PAYLOAD)) { + /* we send only 1 command per request, and we write close after it */ + sc_schedule_shutdown(s->scb); + } else { + pcli_write_prompt(s); + } + + s->res.flags |= CF_WAKE_ONCE; /* need to be called again */ + s->res.analysers |= AN_RES_WAIT_CLI; + + if (!(s->flags & SF_ASSIGNED)) { + if (next_pid > -1) + target_pid = next_pid; + else + target_pid = s->pcli_next_pid; + /* we can connect now */ + s->target = pcli_pid_to_server(target_pid); + + if (!s->target) + goto server_disconnect; + + s->flags |= (SF_DIRECT | SF_ASSIGNED); + channel_auto_connect(req); + } + + } else if (to_forward == 0) { + /* we trimmed things but we might have other commands to consume */ + pcli_write_prompt(s); + goto read_again; + } else if (to_forward == -1) { + if (!errmsg) /* no error means missing data */ + goto missing_data; + + /* there was an error during the parsing */ + pcli_error(s, errmsg); + pcli_write_prompt(s); + } + + return 0; + +send_help: + b_reset(&req->buf); + b_putblk(&req->buf, "help\n", 5); + goto read_again; + +send_status: + s->pcli_flags |= PCLI_F_RELOAD; + /* don't use ci_putblk here because SHUT_DONE could have been sent */ + b_reset(&req->buf); + b_putblk(&req->buf, "_loadstatus;quit\n", 17); + goto read_again; + +missing_data: + if (s->scf->flags & (SC_FL_ABRT_DONE|SC_FL_EOS)) { + /* There is no more request or a only a partial one and we + * receive a close from the client, we can leave */ + sc_schedule_shutdown(s->scf); + s->req.analysers &= ~AN_REQ_WAIT_CLI; + return 1; + } + else if (channel_full(req, global.tune.maxrewrite)) { + /* buffer is full and we didn't catch the end of a command */ + goto send_help; + } + return 0; + +server_disconnect: + pcli_reply_and_close(s, "Can't connect to the target CLI!\n"); + return 0; +} + +int pcli_wait_for_response(struct stream *s, struct channel *rep, int an_bit) +{ + struct proxy *fe = strm_fe(s); + struct proxy *be = s->be; + + if ((s->scb->flags & SC_FL_ERROR) || (rep->flags & (CF_READ_TIMEOUT|CF_WRITE_TIMEOUT)) || + ((s->scf->flags & SC_FL_SHUT_DONE) && (rep->to_forward || co_data(rep)))) { + pcli_reply_and_close(s, "Can't connect to the target CLI!\n"); + s->req.analysers &= ~AN_REQ_WAIT_CLI; + s->res.analysers &= ~AN_RES_WAIT_CLI; + return 0; + } + s->scb->flags |= SC_FL_RCV_ONCE; /* try to get back here ASAP */ + s->scf->flags |= SC_FL_SND_NEVERWAIT; + + /* don't forward the close */ + channel_dont_close(&s->res); + channel_dont_close(&s->req); + + if (s->pcli_flags & PCLI_F_PAYLOAD) { + s->res.analysers &= ~AN_RES_WAIT_CLI; + s->req.flags |= CF_WAKE_ONCE; /* need to be called again if there is some command left in the request */ + return 0; + } + + /* forward the data */ + if (ci_data(rep)) { + c_adv(rep, ci_data(rep)); + return 0; + } + + if (s->scb->flags & (SC_FL_ABRT_DONE|SC_FL_EOS)) { + /* stream cleanup */ + + pcli_write_prompt(s); + + s->scb->flags |= SC_FL_NOLINGER | SC_FL_NOHALF; + sc_abort(s->scb); + sc_shutdown(s->scb); + + /* + * starting from there this the same code as + * http_end_txn_clean_session(). + * + * It allows to do frontend keepalive while reconnecting to a + * new server for each request. 
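+                 *
+                 * In short, the block below detaches the stream from the
+                 * backend and server, emits the final log, resets the log
+                 * fields, timers and flags, releases or recycles the backend
+                 * endpoint, and finally rearms the analysers so that the next
+                 * command starts from a fresh stream state.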
+ */ + + if (s->flags & SF_BE_ASSIGNED) { + HA_ATOMIC_DEC(&be->beconn); + if (unlikely(s->srv_conn)) + sess_change_server(s, NULL); + } + + s->logs.t_close = ns_to_ms(now_ns - s->logs.accept_ts); + stream_process_counters(s); + + /* don't count other requests' data */ + s->logs.bytes_in -= ci_data(&s->req); + s->logs.bytes_out -= ci_data(&s->res); + + /* we may need to know the position in the queue */ + pendconn_free(s); + + /* let's do a final log if we need it */ + if (!LIST_ISEMPTY(&fe->logformat) && s->logs.logwait && + !(s->flags & SF_MONITOR) && + (!(fe->options & PR_O_NULLNOLOG) || s->req.total)) { + s->do_log(s); + } + + /* stop tracking content-based counters */ + stream_stop_content_counters(s); + stream_update_time_stats(s); + + s->logs.accept_date = date; /* user-visible date for logging */ + s->logs.accept_ts = now_ns; /* corrected date for internal use */ + s->logs.t_handshake = 0; /* There are no handshake in keep alive connection. */ + s->logs.t_idle = -1; + s->logs.request_ts = 0; + s->logs.t_queue = -1; + s->logs.t_connect = -1; + s->logs.t_data = -1; + s->logs.t_close = 0; + s->logs.prx_queue_pos = 0; /* we get the number of pending conns before us */ + s->logs.srv_queue_pos = 0; /* we will get this number soon */ + + s->logs.bytes_in = s->req.total = ci_data(&s->req); + s->logs.bytes_out = s->res.total = ci_data(&s->res); + + stream_del_srv_conn(s); + if (objt_server(s->target)) { + if (s->flags & SF_CURR_SESS) { + s->flags &= ~SF_CURR_SESS; + HA_ATOMIC_DEC(&__objt_server(s->target)->cur_sess); + } + if (may_dequeue_tasks(__objt_server(s->target), be)) + process_srv_queue(__objt_server(s->target)); + } + + s->target = NULL; + + /* only release our endpoint if we don't intend to reuse the + * connection. + */ + if (!sc_conn_ready(s->scb)) { + s->srv_conn = NULL; + if (sc_reset_endp(s->scb) < 0) { + if (!s->conn_err_type) + s->conn_err_type = STRM_ET_CONN_OTHER; + if (s->srv_error) + s->srv_error(s, s->scb); + return 1; + } + se_fl_clr(s->scb->sedesc, ~SE_FL_DETACHED); + } + + sockaddr_free(&s->scb->dst); + + sc_set_state(s->scb, SC_ST_INI); + s->scb->flags &= ~(SC_FL_ERROR|SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED); + s->scb->flags &= SC_FL_ISBACK | SC_FL_DONT_WAKE; /* we're in the context of process_stream */ + + s->req.flags &= ~(CF_AUTO_CONNECT|CF_STREAMER|CF_STREAMER_FAST|CF_WROTE_DATA); + s->res.flags &= ~(CF_STREAMER|CF_STREAMER_FAST|CF_WRITE_EVENT|CF_WROTE_DATA|CF_READ_EVENT); + s->flags &= ~(SF_DIRECT|SF_ASSIGNED|SF_BE_ASSIGNED|SF_FORCE_PRST|SF_IGNORE_PRST); + s->flags &= ~(SF_CURR_SESS|SF_REDIRECTABLE|SF_SRV_REUSED); + s->flags &= ~(SF_ERR_MASK|SF_FINST_MASK|SF_REDISP); + s->conn_retries = 0; /* used for logging too */ + s->conn_exp = TICK_ETERNITY; + s->conn_err_type = STRM_ET_NONE; + /* reinitialise the current rule list pointer to NULL. We are sure that + * any rulelist match the NULL pointer. 
+         */
+        s->current_rule_list = NULL;
+
+        s->be = strm_fe(s);
+        s->logs.logwait = strm_fe(s)->to_log;
+        s->logs.level = 0;
+        stream_del_srv_conn(s);
+        s->target = NULL;
+        /* re-init store persistence */
+        s->store_count = 0;
+        s->uniq_id = global.req_count++;
+
+        s->scf->flags &= ~(SC_FL_EOS|SC_FL_ERROR|SC_FL_ABRT_DONE|SC_FL_ABRT_WANTED);
+        s->scf->flags &= ~SC_FL_SND_NEVERWAIT;
+        s->scf->flags |= SC_FL_RCV_ONCE; /* one read is usually enough */
+
+        s->req.flags |= CF_WAKE_ONCE; /* need to be called again if there is some command left in the request */
+
+        s->res.analysers &= ~AN_RES_WAIT_CLI;
+
+        /* We must trim any excess data from the response buffer, because we
+         * may have blocked an invalid response from a server that we don't
+         * want to accidentally forward once we disable the analysers, nor do
+         * we want those data to come along with next response. A typical
+         * example of such data would be from a buggy server responding to
+         * a HEAD with some data, or sending more than the advertised
+         * content-length.
+         */
+        if (unlikely(ci_data(&s->res)))
+                b_set_data(&s->res.buf, co_data(&s->res));
+
+        /* Now we can realign the response buffer */
+        c_realign_if_empty(&s->res);
+
+        s->scf->ioto = strm_fe(s)->timeout.client;
+        s->scb->ioto = TICK_ETERNITY;
+
+        s->req.analyse_exp = TICK_ETERNITY;
+        s->res.analyse_exp = TICK_ETERNITY;
+
+        /* we're removing the analysers, we MUST re-enable events detection.
+         * We don't enable close on the response channel since it's either
+         * already closed, or in keep-alive with an idle connection handler.
+         */
+        channel_auto_read(&s->req);
+        channel_auto_close(&s->req);
+        channel_auto_read(&s->res);
+
+
+        return 1;
+        }
+        return 0;
+}
+
+/*
+ * The mworker functions are used to initialize the CLI in the master process
+ */
+
+/*
+ * Stop the mworker proxy
+ */
+void mworker_cli_proxy_stop()
+{
+        if (mworker_proxy)
+                stop_proxy(mworker_proxy);
+}
+
+/*
+ * Create the mworker CLI proxy
+ */
+int mworker_cli_proxy_create()
+{
+        struct mworker_proc *child;
+        char *msg = NULL;
+        char *errmsg = NULL;
+
+        mworker_proxy = alloc_new_proxy("MASTER", PR_CAP_LISTEN|PR_CAP_INT, &errmsg);
+        if (!mworker_proxy)
+                goto error_proxy;
+
+        mworker_proxy->mode = PR_MODE_CLI;
+        mworker_proxy->maxconn = 10;                 /* default to 10 concurrent connections */
+        mworker_proxy->timeout.client = 0;           /* no timeout */
+        mworker_proxy->conf.file = strdup("MASTER");
+        mworker_proxy->conf.line = 0;
+        mworker_proxy->accept = frontend_accept;
+        mworker_proxy->lbprm.algo = BE_LB_ALGO_NONE;
+
+        /* the default target (the CLI applet) is not set here; this must be
+         * done in the request parsing code */
+        mworker_proxy->default_target = NULL;
+
+        /* create all servers using the mworker_proc list */
+        list_for_each_entry(child, &proc_list, list) {
+                struct server *newsrv = NULL;
+                struct sockaddr_storage *sk;
+                int port1, port2, port;
+                struct protocol *proto;
+
+                /* only the workers support the master CLI */
+                if (!(child->options & PROC_O_TYPE_WORKER))
+                        continue;
+
+                newsrv = new_server(mworker_proxy);
+                if (!newsrv)
+                        goto error;
+
+                /* we don't know the new pid yet */
+                if (child->pid == -1)
+                        memprintf(&msg, "cur-%d", 1);
+                else
+                        memprintf(&msg, "old-%d", child->pid);
+
+                newsrv->next = mworker_proxy->srv;
+                mworker_proxy->srv = newsrv;
+                newsrv->conf.file = strdup(msg);
+                newsrv->id = strdup(msg);
+                newsrv->conf.line = 0;
+
+                memprintf(&msg, "sockpair@%d", child->ipc_fd[0]);
+                if ((sk = str2sa_range(msg, &port, &port1, &port2, NULL, &proto, NULL,
+                                       &errmsg, NULL, NULL, PA_O_STREAM)) == 0) {
+                        goto
error; + } + ha_free(&msg); + + if (!proto->connect) { + goto error; + } + + /* no port specified */ + newsrv->flags |= SRV_F_MAPPORTS; + newsrv->addr = *sk; + /* don't let the server participate to load balancing */ + newsrv->iweight = 0; + newsrv->uweight = 0; + srv_lb_commit_status(newsrv); + + child->srv = newsrv; + } + + mworker_proxy->next = proxies_list; + proxies_list = mworker_proxy; + + return 0; + +error: + + list_for_each_entry(child, &proc_list, list) { + free((char *)child->srv->conf.file); /* cast because of const char * */ + free(child->srv->id); + ha_free(&child->srv); + } + free_proxy(mworker_proxy); + free(msg); + +error_proxy: + ha_alert("%s\n", errmsg); + free(errmsg); + + return -1; +} + +/* + * Create a new listener for the master CLI proxy + */ +struct bind_conf *mworker_cli_proxy_new_listener(char *line) +{ + struct bind_conf *bind_conf; + struct listener *l; + char *err = NULL; + char *args[MAX_LINE_ARGS + 1]; + int arg; + int cur_arg; + + arg = 1; + args[0] = line; + + /* args is a bind configuration with spaces replaced by commas */ + while (*line && arg < MAX_LINE_ARGS) { + + if (*line == ',') { + *line++ = '\0'; + while (*line == ',') + line++; + args[arg++] = line; + } + line++; + } + + args[arg] = "\0"; + + bind_conf = bind_conf_alloc(mworker_proxy, "master-socket", 0, "", xprt_get(XPRT_RAW)); + if (!bind_conf) + goto err; + + bind_conf->level &= ~ACCESS_LVL_MASK; + bind_conf->level |= ACCESS_LVL_ADMIN; + bind_conf->level |= ACCESS_MASTER | ACCESS_MASTER_ONLY; + + if (!str2listener(args[0], mworker_proxy, bind_conf, "master-socket", 0, &err)) { + ha_alert("Cannot create the listener of the master CLI\n"); + goto err; + } + + cur_arg = 1; + + while (*args[cur_arg]) { + struct bind_kw *kw; + const char *best; + + kw = bind_find_kw(args[cur_arg]); + if (kw) { + if (!kw->parse) { + memprintf(&err, "'%s %s' : '%s' option is not implemented in this version (check build options).", + args[0], args[1], args[cur_arg]); + goto err; + } + + if (kw->parse(args, cur_arg, global.cli_fe, bind_conf, &err) != 0) { + if (err) + memprintf(&err, "'%s %s' : '%s'", args[0], args[1], err); + else + memprintf(&err, "'%s %s' : error encountered while processing '%s'", + args[0], args[1], args[cur_arg]); + goto err; + } + + cur_arg += 1 + kw->skip; + continue; + } + + best = bind_find_best_kw(args[cur_arg]); + if (best) + memprintf(&err, "'%s %s' : unknown keyword '%s'. 
Did you mean '%s' maybe ?", + args[0], args[1], args[cur_arg], best); + else + memprintf(&err, "'%s %s' : unknown keyword '%s'.", + args[0], args[1], args[cur_arg]); + goto err; + } + + + bind_conf->accept = session_accept_fd; + bind_conf->nice = -64; /* we want to boost priority for local stats */ + bind_conf->options |= BC_O_UNLIMITED; /* don't make the peers subject to global limits */ + + /* Pin master CLI on the first thread of the first group only */ + thread_set_pin_grp1(&bind_conf->thread_set, 1); + + list_for_each_entry(l, &bind_conf->listeners, by_bind) { + l->rx.flags |= RX_F_MWORKER; /* we are keeping this FD in the master */ + global.maxsock++; /* for the listening socket */ + } + global.maxsock += mworker_proxy->maxconn; + + return bind_conf; + +err: + ha_alert("%s\n", err); + free(err); + free(bind_conf); + return NULL; + +} + +/* + * Create a new CLI socket using a socketpair for a worker process + * <mworker_proc> is the process structure, and <proc> is the process number + */ +int mworker_cli_sockpair_new(struct mworker_proc *mworker_proc, int proc) +{ + struct bind_conf *bind_conf; + struct listener *l; + char *path = NULL; + char *err = NULL; + + /* master pipe to ensure the master is still alive */ + if (socketpair(AF_UNIX, SOCK_STREAM, 0, mworker_proc->ipc_fd) < 0) { + ha_alert("Cannot create worker socketpair.\n"); + return -1; + } + + /* XXX: we might want to use a separate frontend at some point */ + if (!global.cli_fe) { + if ((global.cli_fe = cli_alloc_fe("GLOBAL", "master-socket", 0)) == NULL) { + ha_alert("out of memory trying to allocate the stats frontend"); + goto error; + } + } + + bind_conf = bind_conf_alloc(global.cli_fe, "master-socket", 0, "", xprt_get(XPRT_RAW)); + if (!bind_conf) + goto error; + + bind_conf->level &= ~ACCESS_LVL_MASK; + bind_conf->level |= ACCESS_LVL_ADMIN; /* TODO: need to lower the rights with a CLI keyword*/ + bind_conf->level |= ACCESS_FD_LISTENERS; + + if (!memprintf(&path, "sockpair@%d", mworker_proc->ipc_fd[1])) { + ha_alert("Cannot allocate listener.\n"); + goto error; + } + + if (!str2listener(path, global.cli_fe, bind_conf, "master-socket", 0, &err)) { + free(path); + ha_alert("Cannot create a CLI sockpair listener for process #%d\n", proc); + goto error; + } + ha_free(&path); + + bind_conf->accept = session_accept_fd; + bind_conf->nice = -64; /* we want to boost priority for local stats */ + bind_conf->options |= BC_O_UNLIMITED | BC_O_NOSTOP; + + /* Pin master CLI on the first thread of the first group only */ + thread_set_pin_grp1(&bind_conf->thread_set, 1); + + list_for_each_entry(l, &bind_conf->listeners, by_bind) { + HA_ATOMIC_INC(&unstoppable_jobs); + /* it's a sockpair but we don't want to keep the fd in the master */ + l->rx.flags &= ~RX_F_INHERITED; + global.maxsock++; /* for the listening socket */ + } + + return 0; + +error: + close(mworker_proc->ipc_fd[0]); + close(mworker_proc->ipc_fd[1]); + free(err); + + return -1; +} + +static struct applet cli_applet = { + .obj_type = OBJ_TYPE_APPLET, + .name = "<CLI>", /* used for logging */ + .fct = cli_io_handler, + .release = cli_release_handler, +}; + +/* master CLI */ +static struct applet mcli_applet = { + .obj_type = OBJ_TYPE_APPLET, + .name = "<MCLI>", /* used for logging */ + .fct = cli_io_handler, + .release = cli_release_handler, +}; + +/* register cli keywords */ +static struct cli_kw_list cli_kws = {{ },{ + { { "help", NULL }, NULL, cli_parse_simple, NULL, NULL, NULL, ACCESS_MASTER }, + { { "prompt", NULL }, NULL, cli_parse_simple, NULL, NULL, NULL, 
ACCESS_MASTER }, + { { "quit", NULL }, NULL, cli_parse_simple, NULL, NULL, NULL, ACCESS_MASTER }, + { { "_getsocks", NULL }, NULL, _getsocks, NULL }, + { { "expert-mode", NULL }, NULL, cli_parse_expert_experimental_mode, NULL, NULL, NULL, ACCESS_MASTER }, // not listed + { { "experimental-mode", NULL }, NULL, cli_parse_expert_experimental_mode, NULL, NULL, NULL, ACCESS_MASTER }, // not listed + { { "mcli-debug-mode", NULL }, NULL, cli_parse_expert_experimental_mode, NULL, NULL, NULL, ACCESS_MASTER_ONLY }, // not listed + { { "set", "anon", "on" }, "set anon on [value] : activate the anonymized mode", cli_parse_set_anon, NULL, NULL }, + { { "set", "anon", "off" }, "set anon off : deactivate the anonymized mode", cli_parse_set_anon, NULL, NULL }, + { { "set", "anon", "global-key", NULL }, "set anon global-key <value> : change the global anonymizing key", cli_parse_set_global_key, NULL, NULL }, + { { "set", "maxconn", "global", NULL }, "set maxconn global <value> : change the per-process maxconn setting", cli_parse_set_maxconn_global, NULL }, + { { "set", "rate-limit", NULL }, "set rate-limit <setting> <value> : change a rate limiting value", cli_parse_set_ratelimit, NULL }, + { { "set", "severity-output", NULL }, "set severity-output [none|number|string]: set presence of severity level in feedback information", cli_parse_set_severity_output, NULL, NULL }, + { { "set", "timeout", NULL }, "set timeout [cli] <delay> : change a timeout setting", cli_parse_set_timeout, NULL, NULL }, + { { "show", "anon", NULL }, "show anon : display the current state of anonymized mode", cli_parse_show_anon, NULL }, + { { "show", "env", NULL }, "show env [var] : dump environment variables known to the process", cli_parse_show_env, cli_io_handler_show_env, NULL }, + { { "show", "cli", "sockets", NULL }, "show cli sockets : dump list of cli sockets", cli_parse_default, cli_io_handler_show_cli_sock, NULL, NULL, ACCESS_MASTER }, + { { "show", "cli", "level", NULL }, "show cli level : display the level of the current CLI session", cli_parse_show_lvl, NULL, NULL, NULL, ACCESS_MASTER}, + { { "show", "fd", NULL }, "show fd [-!plcfbsd]* [num] : dump list of file descriptors in use or a specific one", cli_parse_show_fd, cli_io_handler_show_fd, NULL }, + { { "show", "version", NULL }, "show version : show version of the current process", cli_parse_show_version, NULL, NULL, NULL, ACCESS_MASTER }, + { { "operator", NULL }, "operator : lower the level of the current CLI session to operator", cli_parse_set_lvl, NULL, NULL, NULL, ACCESS_MASTER}, + { { "user", NULL }, "user : lower the level of the current CLI session to user", cli_parse_set_lvl, NULL, NULL, NULL, ACCESS_MASTER}, + {{},} +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_GLOBAL, "stats", cli_parse_global }, + { 0, NULL, NULL }, +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); + +static struct bind_kw_list bind_kws = { "STAT", { }, { + { "level", bind_parse_level, 1 }, /* set the unix socket admin level */ + { "expose-fd", bind_parse_expose_fd, 1 }, /* set the unix socket expose fd rights */ + { "severity-output", bind_parse_severity_output, 1 }, /* set the severity output format */ + { NULL, NULL, 0 }, +}}; + +INITCALL1(STG_REGISTER, bind_register_keywords, &bind_kws); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/clock.c b/src/clock.c new file mode 100644 index 0000000..ec2133c --- /dev/null +++ b/src/clock.c @@ -0,0 +1,460 @@ 
+/* + * General time-keeping code and variables + * + * Copyright 2000-2021 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <sys/time.h> +#include <signal.h> +#include <time.h> + +#ifdef USE_THREAD +#include <pthread.h> +#endif + +#include <haproxy/api.h> +#include <haproxy/activity.h> +#include <haproxy/clock.h> +#include <haproxy/signal-t.h> +#include <haproxy/time.h> +#include <haproxy/tinfo-t.h> +#include <haproxy/tools.h> + +struct timeval start_date; /* the process's start date in wall-clock time */ +struct timeval ready_date; /* date when the process was considered ready */ +ullong start_time_ns; /* the process's start date in internal monotonic time (ns) */ +volatile ullong global_now_ns; /* common monotonic date between all threads, in ns (wraps every 585 yr) */ +volatile uint global_now_ms; /* common monotonic date in milliseconds (may wrap) */ + +THREAD_ALIGNED(64) static llong now_offset; /* global offset between system time and global time in ns */ + +THREAD_LOCAL ullong now_ns; /* internal monotonic date derived from real clock, in ns (wraps every 585 yr) */ +THREAD_LOCAL uint now_ms; /* internal monotonic date in milliseconds (may wrap) */ +THREAD_LOCAL struct timeval date; /* the real current date (wall-clock time) */ + +static THREAD_LOCAL struct timeval before_poll; /* system date before calling poll() */ +static THREAD_LOCAL struct timeval after_poll; /* system date after leaving poll() */ +static THREAD_LOCAL unsigned int samp_time; /* total elapsed time over current sample */ +static THREAD_LOCAL unsigned int idle_time; /* total idle time over current sample */ +static THREAD_LOCAL unsigned int iso_time_sec; /* last iso time value for this thread */ +static THREAD_LOCAL char iso_time_str[34]; /* ISO time representation of gettimeofday() */ + +#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) && defined(_POSIX_THREAD_CPUTIME) +static clockid_t per_thread_clock_id[MAX_THREADS]; +#endif + +/* returns the system's monotonic time in nanoseconds if supported, otherwise zero */ +uint64_t now_mono_time(void) +{ + uint64_t ret = 0; +#if defined(_POSIX_TIMERS) && defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) && defined(_POSIX_MONOTONIC_CLOCK) + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + ret = ts.tv_sec * 1000000000ULL + ts.tv_nsec; +#endif + return ret; +} + +/* Returns the system's monotonic time in nanoseconds. + * Uses the coarse clock source if supported (for fast but + * less precise queries with limited resource usage). + * Fallback to now_mono_time() if coarse source is not supported, + * which may itself return 0 if not supported either. 
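+ *
+ * On Linux, CLOCK_MONOTONIC_COARSE typically ticks at the scheduler tick
+ * resolution (a few milliseconds), which is what makes it cheap. A
+ * hypothetical use, with millisecond-grade precision only:
+ *
+ *   uint64_t t0 = now_mono_time_fast();
+ *   do_something();
+ *   log_duration_ns(now_mono_time_fast() - t0);  // do_something() and
+ *                                                // log_duration_ns() are
+ *                                                // placeholders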
+ */
+uint64_t now_mono_time_fast(void)
+{
+#if defined(CLOCK_MONOTONIC_COARSE)
+        struct timespec ts;
+
+        clock_gettime(CLOCK_MONOTONIC_COARSE, &ts);
+        return (ts.tv_sec * 1000000000ULL + ts.tv_nsec);
+#else
+        /* fallback to regular mono time,
+         * returns 0 if not supported
+         */
+        return now_mono_time();
+#endif
+}
+
+/* returns the current thread's cumulated CPU time in nanoseconds if supported, otherwise zero */
+uint64_t now_cpu_time(void)
+{
+        uint64_t ret = 0;
+#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) && defined(_POSIX_THREAD_CPUTIME)
+        struct timespec ts;
+        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
+        ret = ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+#endif
+        return ret;
+}
+
+/* Returns the current thread's cumulated CPU time in nanoseconds.
+ *
+ * The thread-local timer is cached, so the call is less precise but also
+ * less expensive if heavily used.
+ * We use the mono time as a cache expiration hint since now_cpu_time() is
+ * known to be much more expensive than now_mono_time_fast() on systems
+ * supporting the COARSE clock source.
+ *
+ * Returns 0 if either now_mono_time_fast() or now_cpu_time() are not
+ * supported.
+ */
+uint64_t now_cpu_time_fast(void)
+{
+        static THREAD_LOCAL uint64_t mono_cache = 0;
+        static THREAD_LOCAL uint64_t cpu_cache = 0;
+        uint64_t mono_cur;
+
+        mono_cur = now_mono_time_fast();
+        if (unlikely(mono_cur != mono_cache)) {
+                /* global mono clock was updated: local cache is outdated */
+                cpu_cache = now_cpu_time();
+                mono_cache = mono_cur;
+        }
+        return cpu_cache;
+}
+
+/* returns another thread's cumulated CPU time in nanoseconds if supported, otherwise zero */
+uint64_t now_cpu_time_thread(int thr)
+{
+        uint64_t ret = 0;
+#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) && defined(_POSIX_THREAD_CPUTIME)
+        struct timespec ts;
+        clock_gettime(per_thread_clock_id[thr], &ts);
+        ret = ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+#endif
+        return ret;
+}
+
+/* set the clock source for the local thread */
+void clock_set_local_source(void)
+{
+#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) && defined(_POSIX_THREAD_CPUTIME)
+#ifdef USE_THREAD
+        pthread_getcpuclockid(pthread_self(), &per_thread_clock_id[tid]);
+#else
+        per_thread_clock_id[tid] = CLOCK_THREAD_CPUTIME_ID;
+#endif
+#endif
+}
+
+/* registers a timer <tmr> of type timer_t delivering signal <sig> with value
+ * <val>. It tries on the current thread's clock ID first and falls back to
+ * CLOCK_REALTIME. Returns 1 on success, 0 on failure.
+ */
+int clock_setup_signal_timer(void *tmr, int sig, int val)
+{
+        int ret = 0;
+
+#if defined(USE_RT) && (_POSIX_TIMERS > 0) && defined(_POSIX_THREAD_CPUTIME)
+        struct sigevent sev = { };
+        timer_t *timer = tmr;
+        sigset_t set;
+
+        /* unblock the WDTSIG signal we intend to use */
+        sigemptyset(&set);
+        sigaddset(&set, WDTSIG);
+        ha_sigmask(SIG_UNBLOCK, &set, NULL);
+
+        /* this timer will signal WDTSIG when it fires, with tid in the si_int
+         * field (important since any thread will receive the signal).
+         */
+        sev.sigev_notify = SIGEV_SIGNAL;
+        sev.sigev_signo = sig;
+        sev.sigev_value.sival_int = val;
+        if (timer_create(per_thread_clock_id[tid], &sev, timer) != -1 ||
+            timer_create(CLOCK_REALTIME, &sev, timer) != -1)
+                ret = 1;
+#endif
+        return ret;
+}
+
+/* clock_update_date: sets <date> to system time, and sets <now_ns> to something
+ * as close as possible to real time, following a monotonic function. The main
+ * principle consists in detecting backwards and forwards time jumps and
+ * adjusting an offset to correct them.
This function should be called once after each + * poll, and never farther apart than MAX_DELAY_MS*2. The poll's timeout should + * be passed in <max_wait>, and the return value in <interrupted> (a non-zero + * value means that we have not expired the timeout). + * + * clock_init_process_date() must have been called once first, and + * clock_init_thread_date() must also have been called once for each thread. + * + * An offset is used to adjust the current time (date), to figure a monotonic + * local time (now_ns). The offset is not critical, as it is only updated after + * a clock jump is detected. From this point all threads will apply it to their + * locally measured time, and will then agree around a common monotonic + * global_now_ns value that serves to further refine their local time. Both + * now_ns and global_now_ns are 64-bit integers counting nanoseconds since a + * vague reference (it starts roughly 20s before the next wrap-around of the + * millisecond counter after boot). The offset is also an integral number of + * nanoseconds, but it's signed so that the clock can be adjusted in the two + * directions. + */ +void clock_update_local_date(int max_wait, int interrupted) +{ + struct timeval min_deadline, max_deadline; + + gettimeofday(&date, NULL); + + /* compute the minimum and maximum local date we may have reached based + * on our past date and the associated timeout. There are three possible + * extremities: + * - the new date cannot be older than before_poll + * - if not interrupted, the new date cannot be older than + * before_poll+max_wait + * - in any case the new date cannot be newer than + * before_poll+max_wait+some margin (100ms used here). + * In case of violation, we'll ignore the current date and instead + * restart from the last date we knew. + */ + _tv_ms_add(&min_deadline, &before_poll, max_wait); + _tv_ms_add(&max_deadline, &before_poll, max_wait + 100); + + if (unlikely(__tv_islt(&date, &before_poll) || // big jump backwards + (!interrupted && __tv_islt(&date, &min_deadline)) || // small jump backwards + __tv_islt(&max_deadline, &date))) { // big jump forwards + if (!interrupted) + now_ns += ms_to_ns(max_wait); + } else { + /* The date is still within expectations. Let's apply the + * now_offset to the system date. Note: ofs is made of two + * independent signed ints. + */ + now_ns = tv_to_ns(&date) + HA_ATOMIC_LOAD(&now_offset); + } + now_ms = ns_to_ms(now_ns); +} + +void clock_update_global_date() +{ + ullong old_now_ns; + uint old_now_ms; + + /* now that we have bounded the local time, let's check if it's + * realistic regarding the global date, which only moves forward, + * otherwise catch up. + */ + old_now_ns = _HA_ATOMIC_LOAD(&global_now_ns); + old_now_ms = global_now_ms; + + do { + if (now_ns < old_now_ns) + now_ns = old_now_ns; + + /* now <now_ns> is expected to be the most accurate date, + * equal to <global_now_ns> or newer. Updating the global + * date too often causes extreme contention and is not + * needed: it's only used to help threads run at the + * same date in case of local drift, and the global date, + * which changes, is only used by freq counters (a choice + * which is debatable by the way since it changes under us). + * Tests have seen that the contention can be reduced from + * 37% in this function to almost 0% when keeping clocks + * synchronized no better than 32 microseconds, so that's + * what we're doing here.
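+ *
+ * Concretely, the mask test below treats two dates as equal as long as
+ * they fall in the same 32768 ns bucket (bits 0..14 are ignored): e.g.
+ * 990000 ns and 1000000 ns compare equal while 1000000 ns and 1020000 ns
+ * do not, so the global date is written at most once per ~32.8us step.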
+ */ + now_ms = ns_to_ms(now_ns); + + if (!((now_ns ^ old_now_ns) & ~0x7FFFULL)) + return; + + /* let's try to update the global_now_ns (both in nanoseconds + * and ms forms) or loop again. + */ + } while ((!_HA_ATOMIC_CAS(&global_now_ns, &old_now_ns, now_ns) || + (now_ms != old_now_ms && !_HA_ATOMIC_CAS(&global_now_ms, &old_now_ms, now_ms))) && + __ha_cpu_relax()); + + /* <now_ns> and <now_ms> are now updated to the last value of + * global_now_ns and global_now_ms, which were also monotonically + * updated. We can compute the latest offset, we don't care who writes + * it last, the variations will not break the monotonic property. + */ + HA_ATOMIC_STORE(&now_offset, now_ns - tv_to_ns(&date)); +} + +/* must be called once at boot to initialize some global variables */ +void clock_init_process_date(void) +{ + now_offset = 0; + gettimeofday(&date, NULL); + after_poll = before_poll = date; + now_ns = global_now_ns = tv_to_ns(&date); + global_now_ms = ns_to_ms(now_ns); + + /* force time to wrap 20s after boot: we first compute the time offset + * that once applied to the wall-clock date will make the local time + * wrap BOOT_TIME_WRAP_SEC seconds (20s) after boot. This offset is applied + * to the process-wide time, and will be used to recompute the local time, + * both of which will match and continue from this shifted date. + */ + now_offset = sec_to_ns((uint)((uint)(-global_now_ms) / 1000U - BOOT_TIME_WRAP_SEC)); + global_now_ns += now_offset; + now_ns = global_now_ns; + now_ms = global_now_ms = ns_to_ms(now_ns); + + th_ctx->idle_pct = 100; + clock_update_date(0, 1); +} + +void clock_adjust_now_offset(void) +{ + HA_ATOMIC_STORE(&now_offset, now_ns - tv_to_ns(&date)); +} + +/* must be called once per thread to initialize their thread-local variables. + * Note that other threads might also be initializing and running in parallel. + */ +void clock_init_thread_date(void) +{ + gettimeofday(&date, NULL); + after_poll = before_poll = date; + + now_ns = _HA_ATOMIC_LOAD(&global_now_ns); + th_ctx->idle_pct = 100; + th_ctx->prev_cpu_time = now_cpu_time(); + clock_update_date(0, 1); +} + +/* report the average CPU idle percentage over all running threads, between 0 and 100 */ +uint clock_report_idle(void) +{ + uint total = 0; + uint rthr = 0; + uint thr; + + for (thr = 0; thr < MAX_THREADS; thr++) { + if (!ha_thread_info[thr].tg || + !(ha_thread_info[thr].tg->threads_enabled & ha_thread_info[thr].ltid_bit)) + continue; + total += HA_ATOMIC_LOAD(&ha_thread_ctx[thr].idle_pct); + rthr++; + } + return rthr ? total / rthr : 0; +} + +/* Update the idle time value twice a second, to be called after + * clock_update_date() when called after poll(), and currently called only by + * clock_leaving_poll() below. It relies on <before_poll> to be updated to + * the system time before calling poll(). + */ +static inline void clock_measure_idle(void) +{ + /* Let's compute the idle to work ratio. We worked between after_poll + * and before_poll, and slept between before_poll and date. The idle_pct + * is updated at most twice every second. Note that the current second + * rarely changes so we avoid a multiply when not needed.
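+ *
+ * In other words, over each ~500ms sample below we accumulate:
+ *     idle_time += date - before_poll   (time spent sleeping in poll)
+ *     samp_time += date - after_poll    (total elapsed time)
+ * and then publish idle_pct = round(100 * idle_time / samp_time).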
+ */ + int delta; + + if ((delta = date.tv_sec - before_poll.tv_sec)) + delta *= 1000000; + idle_time += delta + (date.tv_usec - before_poll.tv_usec); + + if ((delta = date.tv_sec - after_poll.tv_sec)) + delta *= 1000000; + samp_time += delta + (date.tv_usec - after_poll.tv_usec); + + after_poll.tv_sec = date.tv_sec; after_poll.tv_usec = date.tv_usec; + if (samp_time < 500000) + return; + + HA_ATOMIC_STORE(&th_ctx->idle_pct, (100ULL * idle_time + samp_time / 2) / samp_time); + idle_time = samp_time = 0; +} + +/* Collect date and time information after leaving poll(). <timeout> must be + * set to the maximum sleep time passed to poll (in milliseconds), and + * <interrupted> must be zero if the poller reached the timeout or non-zero + * otherwise, which generally is provided by the poller's return value. + */ +void clock_leaving_poll(int timeout, int interrupted) +{ + clock_measure_idle(); + th_ctx->prev_cpu_time = now_cpu_time(); + th_ctx->prev_mono_time = now_mono_time(); +} + +/* Collect date and time information before calling poll(). This will be used + * to count the run time of the past loop and the sleep time of the next poll. + * It also compares the elapsed and cpu times during the activity period to + * estimate the amount of stolen time, which is reported if higher than half + * a millisecond. + */ +void clock_entering_poll(void) +{ + uint64_t new_mono_time; + uint64_t new_cpu_time; + uint32_t run_time; + int64_t stolen; + + gettimeofday(&before_poll, NULL); + + run_time = (before_poll.tv_sec - after_poll.tv_sec) * 1000000U + (before_poll.tv_usec - after_poll.tv_usec); + + new_cpu_time = now_cpu_time(); + new_mono_time = now_mono_time(); + + if (th_ctx->prev_cpu_time && th_ctx->prev_mono_time) { + new_cpu_time -= th_ctx->prev_cpu_time; + new_mono_time -= th_ctx->prev_mono_time; + stolen = new_mono_time - new_cpu_time; + if (unlikely(stolen >= 500000)) { + stolen /= 500000; + /* more than half a millisecond difference might + * indicate an undesired preemption. + */ + report_stolen_time(stolen); + } + } + + /* update the average runtime */ + activity_count_runtime(run_time); +} + +/* returns the current date as returned by gettimeofday() in ISO+microsecond + * format. It uses a thread-local static variable that the reader can consume + * for as long as it wants until next call. Thus, do not call it from a signal + * handler. If <pad> is non-0, a trailing space will be added. It will always + * return exactly 32 or 33 characters (depending on padding) and will always be + * zero-terminated, thus it will always fit into a 34 bytes buffer. + * The output also always includes the local timezone (in +/-HH:mm format). + */ +char *timeofday_as_iso_us(int pad) +{ + struct timeval new_date; + struct tm tm; + const char *offset; + char c; + + gettimeofday(&new_date, NULL); + if (new_date.tv_sec != iso_time_sec || !new_date.tv_sec) { + get_localtime(new_date.tv_sec, &tm); + offset = get_gmt_offset(new_date.tv_sec, &tm); + if (unlikely(strftime(iso_time_str, sizeof(iso_time_str), "%Y-%m-%dT%H:%M:%S.000000+00:00", &tm) != 32)) + strlcpy2(iso_time_str, "YYYY-mm-ddTHH:MM:SS.000000-00:00", sizeof(iso_time_str)); // make the failure visible but respect format.
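+ /* Patch the timezone into the pre-formatted string: offset[] holds
+ * "+HHmm"; bytes 26-28 receive the sign and the hours, bytes 30-31 the
+ * minutes, and the ':' at position 29 is left untouched.
+ */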
+ iso_time_str[26] = offset[0]; + iso_time_str[27] = offset[1]; + iso_time_str[28] = offset[2]; + iso_time_str[30] = offset[3]; + iso_time_str[31] = offset[4]; + iso_time_sec = new_date.tv_sec; + } + + /* utoa_pad adds a trailing 0 so we save the char for restore */ + c = iso_time_str[26]; + utoa_pad(new_date.tv_usec, iso_time_str + 20, 7); + iso_time_str[26] = c; + if (pad) { + iso_time_str[32] = ' '; + iso_time_str[33] = 0; + } + return iso_time_str; +} diff --git a/src/compression.c b/src/compression.c new file mode 100644 index 0000000..7b75461 --- /dev/null +++ b/src/compression.c @@ -0,0 +1,742 @@ +/* + * HTTP compression. + * + * Copyright 2012 Exceliance, David Du Colombier <dducolombier@exceliance.fr> + * William Lallemand <wlallemand@exceliance.fr> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <stdio.h> + +#if defined(USE_ZLIB) +/* Note: the crappy zlib and openssl libs both define the "free_func" type. + * That's a very clever idea to use such a generic name in general purpose + * libraries, really... The zlib one is easier to redefine than openssl's, + * so let's only fix this one. + */ +#define free_func zlib_free_func +#include <zlib.h> +#undef free_func +#endif /* USE_ZLIB */ + +#include <haproxy/api.h> +#include <haproxy/cfgparse.h> +#include <haproxy/compression-t.h> +#include <haproxy/compression.h> +#include <haproxy/dynbuf.h> +#include <haproxy/freq_ctr.h> +#include <haproxy/global.h> +#include <haproxy/pool.h> +#include <haproxy/stream.h> +#include <haproxy/thread.h> +#include <haproxy/tools.h> + + +#if defined(USE_ZLIB) +__decl_spinlock(comp_pool_lock); +#endif + +#ifdef USE_ZLIB + +static void *alloc_zlib(void *opaque, unsigned int items, unsigned int size); +static void free_zlib(void *opaque, void *ptr); + +/* zlib allocation */ +static struct pool_head *zlib_pool_deflate_state __read_mostly = NULL; +static struct pool_head *zlib_pool_window __read_mostly = NULL; +static struct pool_head *zlib_pool_prev __read_mostly = NULL; +static struct pool_head *zlib_pool_head __read_mostly = NULL; +static struct pool_head *zlib_pool_pending_buf __read_mostly = NULL; + +long zlib_used_memory = 0; + +static int global_tune_zlibmemlevel = 8; /* zlib memlevel */ +static int global_tune_zlibwindowsize = MAX_WBITS; /* zlib window size */ + +#endif + +unsigned int compress_min_idle = 0; + +static int identity_init(struct comp_ctx **comp_ctx, int level); +static int identity_add_data(struct comp_ctx *comp_ctx, const char *in_data, int in_len, struct buffer *out); +static int identity_flush(struct comp_ctx *comp_ctx, struct buffer *out); +static int identity_finish(struct comp_ctx *comp_ctx, struct buffer *out); +static int identity_end(struct comp_ctx **comp_ctx); + +#if defined(USE_SLZ) + +static int rfc1950_init(struct comp_ctx **comp_ctx, int level); +static int rfc1951_init(struct comp_ctx **comp_ctx, int level); +static int rfc1952_init(struct comp_ctx **comp_ctx, int level); +static int rfc195x_add_data(struct comp_ctx *comp_ctx, const char *in_data, int in_len, struct buffer *out); +static int rfc195x_flush(struct comp_ctx *comp_ctx, struct buffer *out); +static int rfc195x_finish(struct comp_ctx *comp_ctx, struct buffer *out); +static int rfc195x_end(struct comp_ctx **comp_ctx); + +#elif defined(USE_ZLIB) + +static int gzip_init(struct comp_ctx 
**comp_ctx, int level); +static int raw_def_init(struct comp_ctx **comp_ctx, int level); +static int deflate_init(struct comp_ctx **comp_ctx, int level); +static int deflate_add_data(struct comp_ctx *comp_ctx, const char *in_data, int in_len, struct buffer *out); +static int deflate_flush(struct comp_ctx *comp_ctx, struct buffer *out); +static int deflate_finish(struct comp_ctx *comp_ctx, struct buffer *out); +static int deflate_end(struct comp_ctx **comp_ctx); + +#endif /* USE_ZLIB */ + + +const struct comp_algo comp_algos[] = +{ + { "identity", 8, "identity", 8, identity_init, identity_add_data, identity_flush, identity_finish, identity_end }, +#if defined(USE_SLZ) + { "deflate", 7, "deflate", 7, rfc1950_init, rfc195x_add_data, rfc195x_flush, rfc195x_finish, rfc195x_end }, + { "raw-deflate", 11, "deflate", 7, rfc1951_init, rfc195x_add_data, rfc195x_flush, rfc195x_finish, rfc195x_end }, + { "gzip", 4, "gzip", 4, rfc1952_init, rfc195x_add_data, rfc195x_flush, rfc195x_finish, rfc195x_end }, +#elif defined(USE_ZLIB) + { "deflate", 7, "deflate", 7, deflate_init, deflate_add_data, deflate_flush, deflate_finish, deflate_end }, + { "raw-deflate", 11, "deflate", 7, raw_def_init, deflate_add_data, deflate_flush, deflate_finish, deflate_end }, + { "gzip", 4, "gzip", 4, gzip_init, deflate_add_data, deflate_flush, deflate_finish, deflate_end }, +#endif /* USE_ZLIB */ + { NULL, 0, NULL, 0, NULL , NULL, NULL, NULL, NULL } +}; + +/* + * Add a content-type in the configuration + * Returns 0 in case of success, 1 in case of allocation failure. + */ +int comp_append_type(struct comp_type **types, const char *type) +{ + struct comp_type *comp_type; + + comp_type = calloc(1, sizeof(*comp_type)); + if (!comp_type) + return 1; + comp_type->name_len = strlen(type); + comp_type->name = strdup(type); + comp_type->next = *types; + *types = comp_type; + return 0; +} + +/* + * Add an algorithm in the configuration + * Returns 0 in case of success, -1 if the <algo> is unmanaged, 1 in case of + * allocation failure. 
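+ *
+ * Illustrative sketch (not part of the upstream file): a configuration
+ * parser would typically chain it per keyword and report unknown names:
+ *
+ *     if (comp_append_algo(&comp->algos, args[cur_arg]) < 0)
+ *         ha_alert("unknown compression algorithm '%s'\n", args[cur_arg]);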
+ */ +int comp_append_algo(struct comp_algo **algos, const char *algo) +{ + struct comp_algo *comp_algo; + int i; + + for (i = 0; comp_algos[i].cfg_name; i++) { + if (strcmp(algo, comp_algos[i].cfg_name) == 0) { + comp_algo = calloc(1, sizeof(*comp_algo)); + if (!comp_algo) + return 1; + memmove(comp_algo, &comp_algos[i], sizeof(struct comp_algo)); + comp_algo->next = *algos; + *algos = comp_algo; + return 0; + } + } + return -1; +} + +#if defined(USE_ZLIB) || defined(USE_SLZ) +DECLARE_STATIC_POOL(pool_comp_ctx, "comp_ctx", sizeof(struct comp_ctx)); + +/* + * Alloc the comp_ctx + */ +static inline int init_comp_ctx(struct comp_ctx **comp_ctx) +{ +#ifdef USE_ZLIB + z_stream *strm; + + if (global.maxzlibmem > 0 && (global.maxzlibmem - zlib_used_memory) < sizeof(struct comp_ctx)) + return -1; +#endif + + *comp_ctx = pool_alloc(pool_comp_ctx); + if (*comp_ctx == NULL) + return -1; +#if defined(USE_SLZ) + (*comp_ctx)->direct_ptr = NULL; + (*comp_ctx)->direct_len = 0; + (*comp_ctx)->queued = BUF_NULL; +#elif defined(USE_ZLIB) + _HA_ATOMIC_ADD(&zlib_used_memory, sizeof(struct comp_ctx)); + __ha_barrier_atomic_store(); + + strm = &(*comp_ctx)->strm; + strm->zalloc = alloc_zlib; + strm->zfree = free_zlib; + strm->opaque = *comp_ctx; +#endif + return 0; +} + +/* + * Dealloc the comp_ctx + */ +static inline int deinit_comp_ctx(struct comp_ctx **comp_ctx) +{ + if (!*comp_ctx) + return 0; + + pool_free(pool_comp_ctx, *comp_ctx); + *comp_ctx = NULL; + +#ifdef USE_ZLIB + _HA_ATOMIC_SUB(&zlib_used_memory, sizeof(struct comp_ctx)); + __ha_barrier_atomic_store(); +#endif + return 0; +} +#endif + + +/**************************** + **** Identity algorithm **** + ****************************/ + +/* + * Init the identity algorithm + */ +static int identity_init(struct comp_ctx **comp_ctx, int level) +{ + return 0; +} + +/* + * Process data + * Return size of consumed data or -1 on error + */ +static int identity_add_data(struct comp_ctx *comp_ctx, const char *in_data, int in_len, struct buffer *out) +{ + char *out_data = b_tail(out); + int out_len = b_room(out); + + if (out_len < in_len) + return -1; + + memcpy(out_data, in_data, in_len); + + b_add(out, in_len); + + return in_len; +} + +static int identity_flush(struct comp_ctx *comp_ctx, struct buffer *out) +{ + return 0; +} + +static int identity_finish(struct comp_ctx *comp_ctx, struct buffer *out) +{ + return 0; +} + +/* + * Deinit the algorithm + */ +static int identity_end(struct comp_ctx **comp_ctx) +{ + return 0; +} + + +#ifdef USE_SLZ + +/* SLZ's gzip format (RFC1952). Returns < 0 on error. */ +static int rfc1952_init(struct comp_ctx **comp_ctx, int level) +{ + if (init_comp_ctx(comp_ctx) < 0) + return -1; + + (*comp_ctx)->cur_lvl = !!level; + return slz_rfc1952_init(&(*comp_ctx)->strm, !!level); +} + +/* SLZ's raw deflate format (RFC1951). Returns < 0 on error. */ +static int rfc1951_init(struct comp_ctx **comp_ctx, int level) +{ + if (init_comp_ctx(comp_ctx) < 0) + return -1; + + (*comp_ctx)->cur_lvl = !!level; + return slz_rfc1951_init(&(*comp_ctx)->strm, !!level); +} + +/* SLZ's zlib format (RFC1950). Returns < 0 on error. */ +static int rfc1950_init(struct comp_ctx **comp_ctx, int level) +{ + if (init_comp_ctx(comp_ctx) < 0) + return -1; + + (*comp_ctx)->cur_lvl = !!level; + return slz_rfc1950_init(&(*comp_ctx)->strm, !!level); +} + +/* Return the size of consumed data or -1. The output buffer is unused at this + * point, we only keep a reference to the input data or a copy of them if the + * reference is already used. 
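+ *
+ * In other words the first call in a series only records <in_data> and
+ * <in_len> (zero copy); a subsequent call before the next flush switches
+ * to a thread-local buffer and copies both chunks there, since the first
+ * chunk's storage may be recycled by then.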
+ */ +static int rfc195x_add_data(struct comp_ctx *comp_ctx, const char *in_data, int in_len, struct buffer *out) +{ + static THREAD_LOCAL struct buffer tmpbuf = BUF_NULL; + + if (in_len <= 0) + return 0; + + if (comp_ctx->direct_ptr && b_is_null(&comp_ctx->queued)) { + /* data already being pointed to, we're in front of fragmented + * data and need a buffer now. We reuse the same buffer, as it's + * not used out of the scope of a series of add_data()*, end(). + */ + if (b_alloc(&tmpbuf) == NULL) + return -1; /* no memory */ + b_reset(&tmpbuf); + memcpy(b_tail(&tmpbuf), comp_ctx->direct_ptr, comp_ctx->direct_len); + b_add(&tmpbuf, comp_ctx->direct_len); + comp_ctx->direct_ptr = NULL; + comp_ctx->direct_len = 0; + comp_ctx->queued = tmpbuf; + /* fall through buffer copy */ + } + + if (!b_is_null(&comp_ctx->queued)) { + /* data already pending */ + memcpy(b_tail(&comp_ctx->queued), in_data, in_len); + b_add(&comp_ctx->queued, in_len); + return in_len; + } + + comp_ctx->direct_ptr = in_data; + comp_ctx->direct_len = in_len; + return in_len; +} + +/* Compresses the data accumulated using add_data(), and optionally sends the + * format-specific trailer if <finish> is non-null. <out> is expected to have a + * large enough free non-wrapping space as verified by http_comp_buffer_init(). + * The number of bytes emitted is reported. + */ +static int rfc195x_flush_or_finish(struct comp_ctx *comp_ctx, struct buffer *out, int finish) +{ + struct slz_stream *strm = &comp_ctx->strm; + const char *in_ptr; + int in_len; + int out_len; + + in_ptr = comp_ctx->direct_ptr; + in_len = comp_ctx->direct_len; + + if (!b_is_null(&comp_ctx->queued)) { + in_ptr = b_head(&comp_ctx->queued); + in_len = b_data(&comp_ctx->queued); + } + + out_len = b_data(out); + + if (in_ptr) + b_add(out, slz_encode(strm, b_tail(out), in_ptr, in_len, !finish)); + + if (finish) + b_add(out, slz_finish(strm, b_tail(out))); + else + b_add(out, slz_flush(strm, b_tail(out))); + + out_len = b_data(out) - out_len; + + /* very important, we must wipe the data we've just flushed */ + comp_ctx->direct_len = 0; + comp_ctx->direct_ptr = NULL; + comp_ctx->queued = BUF_NULL; + + /* Verify compression rate limiting and CPU usage */ + if ((global.comp_rate_lim > 0 && (read_freq_ctr(&global.comp_bps_out) > global.comp_rate_lim)) || /* rate */ + (th_ctx->idle_pct < compress_min_idle)) { /* idle */ + if (comp_ctx->cur_lvl > 0) + strm->level = --comp_ctx->cur_lvl; + } + else if (comp_ctx->cur_lvl < global.tune.comp_maxlevel && comp_ctx->cur_lvl < 1) { + strm->level = ++comp_ctx->cur_lvl; + } + + /* and that's all */ + return out_len; +} + +static int rfc195x_flush(struct comp_ctx *comp_ctx, struct buffer *out) +{ + return rfc195x_flush_or_finish(comp_ctx, out, 0); +} + +static int rfc195x_finish(struct comp_ctx *comp_ctx, struct buffer *out) +{ + return rfc195x_flush_or_finish(comp_ctx, out, 1); +} + +/* we just need to free the comp_ctx here, nothing was allocated */ +static int rfc195x_end(struct comp_ctx **comp_ctx) +{ + deinit_comp_ctx(comp_ctx); + return 0; +} + +#elif defined(USE_ZLIB) /* ! USE_SLZ */ + +/* + * This is a tricky allocation function using the zlib. + * This is based on the allocation order in deflateInit2. 
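+ *
+ * The five rounds below map to deflateInit2()'s successive allocations:
+ *     0: deflate_state, 1: window, 2: prev, 3: head, 4: pending_buf
+ * each backed by a dedicated shared pool created on first use.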
+ */ +static void *alloc_zlib(void *opaque, unsigned int items, unsigned int size) +{ + struct comp_ctx *ctx = opaque; + static THREAD_LOCAL char round = 0; /* order in deflateInit2 */ + void *buf = NULL; + struct pool_head *pool = NULL; + + if (global.maxzlibmem > 0 && (global.maxzlibmem - zlib_used_memory) < (long)(items * size)) + goto end; + + switch (round) { + case 0: + if (zlib_pool_deflate_state == NULL) { + HA_SPIN_LOCK(COMP_POOL_LOCK, &comp_pool_lock); + if (zlib_pool_deflate_state == NULL) + zlib_pool_deflate_state = create_pool("zlib_state", size * items, MEM_F_SHARED); + HA_SPIN_UNLOCK(COMP_POOL_LOCK, &comp_pool_lock); + } + pool = zlib_pool_deflate_state; + ctx->zlib_deflate_state = buf = pool_alloc(pool); + break; + + case 1: + if (zlib_pool_window == NULL) { + HA_SPIN_LOCK(COMP_POOL_LOCK, &comp_pool_lock); + if (zlib_pool_window == NULL) + zlib_pool_window = create_pool("zlib_window", size * items, MEM_F_SHARED); + HA_SPIN_UNLOCK(COMP_POOL_LOCK, &comp_pool_lock); + } + pool = zlib_pool_window; + ctx->zlib_window = buf = pool_alloc(pool); + break; + + case 2: + if (zlib_pool_prev == NULL) { + HA_SPIN_LOCK(COMP_POOL_LOCK, &comp_pool_lock); + if (zlib_pool_prev == NULL) + zlib_pool_prev = create_pool("zlib_prev", size * items, MEM_F_SHARED); + HA_SPIN_UNLOCK(COMP_POOL_LOCK, &comp_pool_lock); + } + pool = zlib_pool_prev; + ctx->zlib_prev = buf = pool_alloc(pool); + break; + + case 3: + if (zlib_pool_head == NULL) { + HA_SPIN_LOCK(COMP_POOL_LOCK, &comp_pool_lock); + if (zlib_pool_head == NULL) + zlib_pool_head = create_pool("zlib_head", size * items, MEM_F_SHARED); + HA_SPIN_UNLOCK(COMP_POOL_LOCK, &comp_pool_lock); + } + pool = zlib_pool_head; + ctx->zlib_head = buf = pool_alloc(pool); + break; + + case 4: + if (zlib_pool_pending_buf == NULL) { + HA_SPIN_LOCK(COMP_POOL_LOCK, &comp_pool_lock); + if (zlib_pool_pending_buf == NULL) + zlib_pool_pending_buf = create_pool("zlib_pending_buf", size * items, MEM_F_SHARED); + HA_SPIN_UNLOCK(COMP_POOL_LOCK, &comp_pool_lock); + } + pool = zlib_pool_pending_buf; + ctx->zlib_pending_buf = buf = pool_alloc(pool); + break; + } + if (buf != NULL) { + _HA_ATOMIC_ADD(&zlib_used_memory, pool->size); + __ha_barrier_atomic_store(); + } + +end: + + /* deflateInit2() first allocates and checks the deflate_state, then if + * it succeeds, it allocates the 4 other areas at once and checks them + * at the end. So we want to correctly count the rounds depending on when + * zlib is supposed to abort.
+ */ + if (buf || round) + round = (round + 1) % 5; + return buf; +} + +static void free_zlib(void *opaque, void *ptr) +{ + struct comp_ctx *ctx = opaque; + struct pool_head *pool = NULL; + + if (ptr == ctx->zlib_window) + pool = zlib_pool_window; + else if (ptr == ctx->zlib_deflate_state) + pool = zlib_pool_deflate_state; + else if (ptr == ctx->zlib_prev) + pool = zlib_pool_prev; + else if (ptr == ctx->zlib_head) + pool = zlib_pool_head; + else if (ptr == ctx->zlib_pending_buf) + pool = zlib_pool_pending_buf; + else { + // never matched, just to silence gcc + ABORT_NOW(); + return; + } + + pool_free(pool, ptr); + _HA_ATOMIC_SUB(&zlib_used_memory, pool->size); + __ha_barrier_atomic_store(); +} + +/************************** +**** gzip algorithm **** +***************************/ +static int gzip_init(struct comp_ctx **comp_ctx, int level) +{ + z_stream *strm; + + if (init_comp_ctx(comp_ctx) < 0) + return -1; + + strm = &(*comp_ctx)->strm; + + if (deflateInit2(strm, level, Z_DEFLATED, global_tune_zlibwindowsize + 16, global_tune_zlibmemlevel, Z_DEFAULT_STRATEGY) != Z_OK) { + deinit_comp_ctx(comp_ctx); + return -1; + } + + (*comp_ctx)->cur_lvl = level; + + return 0; +} + +/* Raw deflate algorithm */ +static int raw_def_init(struct comp_ctx **comp_ctx, int level) +{ + z_stream *strm; + + if (init_comp_ctx(comp_ctx) < 0) + return -1; + + strm = &(*comp_ctx)->strm; + + if (deflateInit2(strm, level, Z_DEFLATED, -global_tune_zlibwindowsize, global_tune_zlibmemlevel, Z_DEFAULT_STRATEGY) != Z_OK) { + deinit_comp_ctx(comp_ctx); + return -1; + } + + (*comp_ctx)->cur_lvl = level; + return 0; +} + +/************************** +**** Deflate algorithm **** +***************************/ + +static int deflate_init(struct comp_ctx **comp_ctx, int level) +{ + z_stream *strm; + + if (init_comp_ctx(comp_ctx) < 0) + return -1; + + strm = &(*comp_ctx)->strm; + + if (deflateInit2(strm, level, Z_DEFLATED, global_tune_zlibwindowsize, global_tune_zlibmemlevel, Z_DEFAULT_STRATEGY) != Z_OK) { + deinit_comp_ctx(comp_ctx); + return -1; + } + + (*comp_ctx)->cur_lvl = level; + + return 0; +} + +/* Return the size of consumed data or -1 */ +static int deflate_add_data(struct comp_ctx *comp_ctx, const char *in_data, int in_len, struct buffer *out) +{ + int ret; + z_stream *strm = &comp_ctx->strm; + char *out_data = b_tail(out); + int out_len = b_room(out); + + if (in_len <= 0) + return 0; + + if (out_len <= 0) + return -1; + + strm->next_in = (unsigned char *)in_data; + strm->avail_in = in_len; + strm->next_out = (unsigned char *)out_data; + strm->avail_out = out_len; + + ret = deflate(strm, Z_NO_FLUSH); + if (ret != Z_OK) + return -1; + + /* deflate updates the amount of output data available */ + b_add(out, out_len - strm->avail_out); + + return in_len - strm->avail_in; +} + +static int deflate_flush_or_finish(struct comp_ctx *comp_ctx, struct buffer *out, int flag) +{ + int ret; + int out_len = 0; + z_stream *strm = &comp_ctx->strm; + + strm->next_in = NULL; + strm->avail_in = 0; + strm->next_out = (unsigned char *)b_tail(out); + strm->avail_out = b_room(out); + + ret = deflate(strm, flag); + if (ret != Z_OK && ret != Z_STREAM_END) + return -1; + + out_len = b_room(out) - strm->avail_out; + b_add(out, out_len); + + /* compression limit */ + if ((global.comp_rate_lim > 0 && (read_freq_ctr(&global.comp_bps_out) > global.comp_rate_lim)) || /* rate */ + (th_ctx->idle_pct < compress_min_idle)) { /* idle */ + /* decrease level */ + if (comp_ctx->cur_lvl > 0) { + comp_ctx->cur_lvl--; + deflateParams(&comp_ctx->strm, comp_ctx->cur_lvl,
Z_DEFAULT_STRATEGY); + } + + } else if (comp_ctx->cur_lvl < global.tune.comp_maxlevel) { + /* increase level */ + comp_ctx->cur_lvl++; + deflateParams(&comp_ctx->strm, comp_ctx->cur_lvl, Z_DEFAULT_STRATEGY); + } + + return out_len; +} + +static int deflate_flush(struct comp_ctx *comp_ctx, struct buffer *out) +{ + return deflate_flush_or_finish(comp_ctx, out, Z_SYNC_FLUSH); +} + +static int deflate_finish(struct comp_ctx *comp_ctx, struct buffer *out) +{ + return deflate_flush_or_finish(comp_ctx, out, Z_FINISH); +} + +static int deflate_end(struct comp_ctx **comp_ctx) +{ + z_stream *strm = &(*comp_ctx)->strm; + int ret; + + ret = deflateEnd(strm); + + deinit_comp_ctx(comp_ctx); + + return ret; +} + +/* config parser for global "tune.zlib.memlevel" */ +static int zlib_parse_global_memlevel(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (*(args[1]) == 0) { + memprintf(err, "'%s' expects a numeric value between 1 and 9.", args[0]); + return -1; + } + + global_tune_zlibmemlevel = atoi(args[1]); + if (global_tune_zlibmemlevel < 1 || global_tune_zlibmemlevel > 9) { + memprintf(err, "'%s' expects a numeric value between 1 and 9.", args[0]); + return -1; + } + return 0; +} + + +/* config parser for global "tune.zlib.windowsize" */ +static int zlib_parse_global_windowsize(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (*(args[1]) == 0) { + memprintf(err, "'%s' expects a numeric value between 8 and 15.", args[0]); + return -1; + } + + global_tune_zlibwindowsize = atoi(args[1]); + if (global_tune_zlibwindowsize < 8 || global_tune_zlibwindowsize > 15) { + memprintf(err, "'%s' expects a numeric value between 8 and 15.", args[0]); + return -1; + } + return 0; +} + +#endif /* USE_ZLIB */ + + +/* config keyword parsers */ +static struct cfg_kw_list cfg_kws = {ILH, { +#ifdef USE_ZLIB + { CFG_GLOBAL, "tune.zlib.memlevel", zlib_parse_global_memlevel }, + { CFG_GLOBAL, "tune.zlib.windowsize", zlib_parse_global_windowsize }, +#endif + { 0, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); + +static void comp_register_build_opts(void) +{ + char *ptr = NULL; + int i; + +#ifdef USE_ZLIB + memprintf(&ptr, "Built with zlib version : " ZLIB_VERSION); + memprintf(&ptr, "%s\nRunning on zlib version : %s", ptr, zlibVersion()); +#elif defined(USE_SLZ) + memprintf(&ptr, "Built with libslz for stateless compression."); +#else + memprintf(&ptr, "Built without compression support (neither USE_ZLIB nor USE_SLZ are set)."); +#endif + memprintf(&ptr, "%s\nCompression algorithms supported :", ptr); + + for (i = 0; comp_algos[i].cfg_name; i++) + memprintf(&ptr, "%s%s %s(\"%s\")", ptr, (i == 0 ?
"" : ","), comp_algos[i].cfg_name, comp_algos[i].ua_name); + + if (i == 0) + memprintf(&ptr, "%s none", ptr); + + hap_register_build_opts(ptr, 1); +} + +INITCALL0(STG_REGISTER, comp_register_build_opts); diff --git a/src/connection.c b/src/connection.c new file mode 100644 index 0000000..7930cc4 --- /dev/null +++ b/src/connection.c @@ -0,0 +1,2748 @@ +/* + * Connection management functions + * + * Copyright 2000-2012 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <errno.h> + +#include <import/ebmbtree.h> + +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/cfgparse.h> +#include <haproxy/connection.h> +#include <haproxy/fd.h> +#include <haproxy/frontend.h> +#include <haproxy/hash.h> +#include <haproxy/list.h> +#include <haproxy/log.h> +#include <haproxy/namespace.h> +#include <haproxy/net_helper.h> +#include <haproxy/proto_rhttp.h> +#include <haproxy/proto_tcp.h> +#include <haproxy/sample.h> +#include <haproxy/sc_strm.h> +#include <haproxy/server.h> +#include <haproxy/session.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/stconn.h> +#include <haproxy/tools.h> +#include <haproxy/xxhash.h> + + +DECLARE_POOL(pool_head_connection, "connection", sizeof(struct connection)); +DECLARE_POOL(pool_head_conn_hash_node, "conn_hash_node", sizeof(struct conn_hash_node)); +DECLARE_POOL(pool_head_sockaddr, "sockaddr", sizeof(struct sockaddr_storage)); +DECLARE_POOL(pool_head_pp_tlv_128, "pp_tlv_128", sizeof(struct conn_tlv_list) + HA_PP2_TLV_VALUE_128); +DECLARE_POOL(pool_head_pp_tlv_256, "pp_tlv_256", sizeof(struct conn_tlv_list) + HA_PP2_TLV_VALUE_256); + +struct idle_conns idle_conns[MAX_THREADS] = { }; +struct xprt_ops *registered_xprt[XPRT_ENTRIES] = { NULL, }; + +/* List head of all known muxes for PROTO */ +struct mux_proto_list mux_proto_list = { + .list = LIST_HEAD_INIT(mux_proto_list.list) +}; + +struct mux_stopping_data mux_stopping_data[MAX_THREADS]; + +/* disables sending of proxy-protocol-v2's LOCAL command */ +static int pp2_never_send_local; + +/* find the value of a received TLV for a given type */ +struct conn_tlv_list *conn_get_tlv(struct connection *conn, int type) +{ + struct conn_tlv_list *tlv = NULL; + + if (!conn) + return NULL; + + list_for_each_entry(tlv, &conn->tlv_list, list) { + if (tlv->type == type) + return tlv; + } + + return NULL; +} + +/* Remove <conn> idle connection from its attached tree (idle, safe or avail). + * If also present in the secondary server idle list, conn is removed from it. + * + * Must be called with idle_conns_lock held. 
+ */ +void conn_delete_from_tree(struct connection *conn) +{ + LIST_DEL_INIT(&conn->idle_list); + eb64_delete(&conn->hash_node->node); +} + +int conn_create_mux(struct connection *conn) +{ + if (conn_is_back(conn)) { + struct server *srv; + struct stconn *sc = conn->ctx; + struct session *sess = conn->owner; + + if (conn->flags & CO_FL_ERROR) + goto fail; + + if (sess && obj_type(sess->origin) == OBJ_TYPE_CHECK) { + if (conn_install_mux_chk(conn, conn->ctx, sess) < 0) + goto fail; + } + else if (conn_install_mux_be(conn, conn->ctx, sess, NULL) < 0) + goto fail; + srv = objt_server(conn->target); + + /* If we're doing http-reuse always, the connection is not + * private, and it still has streams available (e.g. an HTTP/2 + * connection), add it to the available list, so that others + * can use it right away. If the connection is private, add it + * in the session server list. + */ + if (srv && ((srv->proxy->options & PR_O_REUSE_MASK) == PR_O_REUSE_ALWS) && + !(conn->flags & CO_FL_PRIVATE) && conn->mux->avail_streams(conn) > 0) { + srv_add_to_avail_list(srv, conn); + } + else if (conn->flags & CO_FL_PRIVATE) { + /* If it fails now, the same will be done in the mux->detach() callback */ + session_add_conn(sess, conn, conn->target); + } + return 0; +fail: + /* let the upper layer know the connection failed */ + if (sc) { + sc->app_ops->wake(sc); + } + else if (conn_reverse_in_preconnect(conn)) { + struct listener *l = conn_active_reverse_listener(conn); + + /* If mux init failed, consider connection on error. + * This is necessary to ensure connection is freed by + * proto-rhttp receiver task. + */ + if (!conn->mux) + conn->flags |= CO_FL_ERROR; + + /* If connection is interrupted without CO_FL_ERROR, receiver task won't free it. */ + BUG_ON(!(conn->flags & CO_FL_ERROR)); + + task_wakeup(l->rx.rhttp.task, TASK_WOKEN_ANY); + } + return -1; + } else + return conn_complete_session(conn); + +} + +/* This is used at the end of the socket IOCB to possibly create the mux if it + * was not done yet, or wake it up if flags changed compared to old_flags or if + * forced_wake insists on this. It returns <0 if the connection was destroyed and + * must not be used, >=0 otherwise. + */ +int conn_notify_mux(struct connection *conn, int old_flags, int forced_wake) +{ + int ret = 0; + + /* If we don't yet have a mux, that means we were waiting for + * information to create one, typically from the ALPN. If we're + * done with the handshake, attempt to create one. + */ + if (unlikely(!conn->mux) && !(conn->flags & CO_FL_WAIT_XPRT)) { + ret = conn_create_mux(conn); + if (ret < 0) + goto done; + } + + /* The wake callback is normally used to notify the data layer about + * data layer activity (successful send/recv), connection establishment, + * shutdown and fatal errors. We need to consider the following + * situations to wake up the data layer : + * - change among the CO_FL_NOTIFY_DONE flags : + * SOCK_{RD,WR}_SH, ERROR, + * - absence of any of {L4,L6}_CONN and CONNECTED, indicating the + * end of handshake and transition to CONNECTED + * - raise of CONNECTED with HANDSHAKE down + * - end of HANDSHAKE with CONNECTED set + * - regular data layer activity + * + * One tricky case is the wake up on read0 or error on an idle + * backend connection, that can happen on a connection that is still + * polled while at the same moment another thread is about to perform a + * takeover.
The solution against this is to remove the connection from + * the idle list if it was in it, and possibly reinsert it at the end + * if the connection remains valid. The cost is non-null (locked tree + * removal) but remains low given that this is extremely rarely called. + * In any case it's guaranteed by the FD's thread_mask that we're + * called from the same thread the connection is queued in. + * + * Note that the wake callback is allowed to release the connection and + * the fd (and return < 0 in this case). + */ + if ((forced_wake || + ((conn->flags ^ old_flags) & CO_FL_NOTIFY_DONE) || + ((old_flags & CO_FL_WAIT_XPRT) && !(conn->flags & CO_FL_WAIT_XPRT))) && + conn->mux && conn->mux->wake) { + uint conn_in_list = conn->flags & CO_FL_LIST_MASK; + struct server *srv = objt_server(conn->target); + + if (conn_in_list) { + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + conn_delete_from_tree(conn); + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + } + + ret = conn->mux->wake(conn); + if (ret < 0) + goto done; + + if (conn_in_list) { + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + _srv_add_idle(srv, conn, conn_in_list == CO_FL_SAFE_LIST); + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + } + } + done: + return ret; +} + +/* Change the mux for the connection. + * The caller should make sure he's not subscribed to the underlying XPRT. + */ +int conn_upgrade_mux_fe(struct connection *conn, void *ctx, struct buffer *buf, + struct ist mux_proto, int mode) +{ + struct bind_conf *bind_conf = __objt_listener(conn->target)->bind_conf; + const struct mux_ops *old_mux, *new_mux; + void *old_mux_ctx; + const char *alpn_str = NULL; + int alpn_len = 0; + + if (!mux_proto.len) { + conn_get_alpn(conn, &alpn_str, &alpn_len); + mux_proto = ist2(alpn_str, alpn_len); + } + new_mux = conn_get_best_mux(conn, mux_proto, PROTO_SIDE_FE, mode); + old_mux = conn->mux; + + /* No mux found */ + if (!new_mux) + return -1; + + /* Same mux, nothing to do */ + if (old_mux == new_mux) + return 0; + + old_mux_ctx = conn->ctx; + conn->mux = new_mux; + conn->ctx = ctx; + if (new_mux->init(conn, bind_conf->frontend, conn->owner, buf) == -1) { + /* The mux upgrade failed, so restore the old mux */ + conn->ctx = old_mux_ctx; + conn->mux = old_mux; + return -1; + } + + /* The mux was upgraded, destroy the old one */ + *buf = BUF_NULL; + old_mux->destroy(old_mux_ctx); + return 0; +} + +/* installs the best mux for incoming connection <conn> using the upper context + * <ctx>. If the mux protocol is forced, we use it to find the best + * mux. Otherwise we use the ALPN name, if any. Returns < 0 on error. + */ +int conn_install_mux_fe(struct connection *conn, void *ctx) +{ + struct bind_conf *bind_conf = __objt_listener(conn->target)->bind_conf; + const struct mux_ops *mux_ops; + + if (bind_conf->mux_proto) + mux_ops = bind_conf->mux_proto->mux; + else { + struct ist mux_proto; + const char *alpn_str = NULL; + int alpn_len = 0; + int mode; + + if (bind_conf->frontend->mode == PR_MODE_HTTP) + mode = PROTO_MODE_HTTP; + else + mode = PROTO_MODE_TCP; + + conn_get_alpn(conn, &alpn_str, &alpn_len); + mux_proto = ist2(alpn_str, alpn_len); + mux_ops = conn_get_best_mux(conn, mux_proto, PROTO_SIDE_FE, mode); + if (!mux_ops) + return -1; + } + + /* Ensure a valid protocol is selected if connection is targeted by a + * tcp-request session attach-srv rule. 
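+ * Only muxes advertising MX_FL_REVERSABLE may be installed on such a
+ * connection, since it will later be reversed to the server side;
+ * anything else is rejected with CO_ER_REVERSE.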
+ */ + if (conn->reverse.target && !(mux_ops->flags & MX_FL_REVERSABLE)) { + conn->err_code = CO_ER_REVERSE; + return -1; + } + + return conn_install_mux(conn, mux_ops, ctx, bind_conf->frontend, conn->owner); +} + +/* installs the best mux for outgoing connection <conn> using the upper context + * <ctx>. If the server mux protocol is forced, we use it to find the best mux. + * It's also possible to specify an alternative mux protocol <force_mux_ops>, + * in which case it will be used instead of the default server mux protocol. + * + * Returns < 0 on error. + */ +int conn_install_mux_be(struct connection *conn, void *ctx, struct session *sess, + const struct mux_ops *force_mux_ops) +{ + struct server *srv = objt_server(conn->target); + struct proxy *prx = objt_proxy(conn->target); + const struct mux_ops *mux_ops; + + if (srv) + prx = srv->proxy; + + if (!prx) // target must be either proxy or server + return -1; + + if (srv && srv->mux_proto && likely(!force_mux_ops)) { + mux_ops = srv->mux_proto->mux; + } + else if (srv && unlikely(force_mux_ops)) { + mux_ops = force_mux_ops; + } + else { + struct ist mux_proto; + const char *alpn_str = NULL; + int alpn_len = 0; + int mode; + + if (prx->mode == PR_MODE_HTTP) + mode = PROTO_MODE_HTTP; + else + mode = PROTO_MODE_TCP; + + conn_get_alpn(conn, &alpn_str, &alpn_len); + mux_proto = ist2(alpn_str, alpn_len); + + mux_ops = conn_get_best_mux(conn, mux_proto, PROTO_SIDE_BE, mode); + if (!mux_ops) + return -1; + } + return conn_install_mux(conn, mux_ops, ctx, prx, sess); +} + +/* installs the best mux for outgoing connection <conn> for a check using the + * upper context <ctx>. If the mux protocol is forced by the check, we use it to + * find the best mux. Returns < 0 on error. + */ +int conn_install_mux_chk(struct connection *conn, void *ctx, struct session *sess) +{ + struct check *check = objt_check(sess->origin); + struct server *srv = objt_server(conn->target); + struct proxy *prx = objt_proxy(conn->target); + const struct mux_ops *mux_ops; + + if (!check) // Check must be defined + return -1; + + if (srv) + prx = srv->proxy; + + if (!prx) // target must be either proxy or server + return -1; + + if (check->mux_proto) + mux_ops = check->mux_proto->mux; + else { + struct ist mux_proto; + const char *alpn_str = NULL; + int alpn_len = 0; + int mode; + + if ((check->tcpcheck_rules->flags & TCPCHK_RULES_PROTO_CHK) == TCPCHK_RULES_HTTP_CHK) + mode = PROTO_MODE_HTTP; + else + mode = PROTO_MODE_TCP; + + conn_get_alpn(conn, &alpn_str, &alpn_len); + mux_proto = ist2(alpn_str, alpn_len); + + mux_ops = conn_get_best_mux(conn, mux_proto, PROTO_SIDE_BE, mode); + if (!mux_ops) + return -1; + } + return conn_install_mux(conn, mux_ops, ctx, prx, sess); +} + +/* Set the ALPN of connection <conn> to <alpn>. If force is false, <alpn> must + * be a subset or identical to the registered protos for the parent SSL_CTX. + * In this case <alpn> must be a single protocol value, not a list. + * + * Returns 0 if ALPN is updated else -1. + */ +int conn_update_alpn(struct connection *conn, const struct ist alpn, int force) +{ +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation + size_t alpn_len = istlen(alpn); + char *ctx_alpn_str = NULL; + int ctx_alpn_len = 0, found = 0; + + /* if not force, first search if alpn is a subset or identical to the + * parent SSL_CTX. + */ + if (!force) { + /* retrieve the SSL_CTX according to the connection side. 
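+ * (backend: the server's configured ALPN; frontend: the ALPN set on
+ * the bind line, when SSL is enabled there).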
*/ + if (conn_is_back(conn)) { + if (obj_type(conn->target) == OBJ_TYPE_SERVER) { + struct server *srv = __objt_server(conn->target); + ctx_alpn_str = srv->ssl_ctx.alpn_str; + ctx_alpn_len = srv->ssl_ctx.alpn_len; + } + } + else { + struct session *sess = conn->owner; + struct listener *li = sess->listener; + + if (li->bind_conf && li->bind_conf->options & BC_O_USE_SSL) { + ctx_alpn_str = li->bind_conf->ssl_conf.alpn_str; + ctx_alpn_len = li->bind_conf->ssl_conf.alpn_len; + } + } + + if (ctx_alpn_str) { + /* search if ALPN is present in SSL_CTX ALPN before + * using it. + */ + while (ctx_alpn_len) { + /* skip ALPN entries whose size does not match ours */ + if (*ctx_alpn_str != alpn_len - 1) { + ctx_alpn_len -= *ctx_alpn_str + 1; + } + else { + if (isteqi(ist2(ctx_alpn_str, alpn_len), alpn)) { + found = 1; + break; + } + } + ctx_alpn_str += *ctx_alpn_str + 1; + + /* This indicates an invalid ALPN formatted + * string and should never happen. */ + BUG_ON(ctx_alpn_len < 0); + } + } + } + + if (found || force) { + ssl_sock_set_alpn(conn, (const uchar *)istptr(alpn), istlen(alpn)); + return 0; + } + +#endif + return -1; +} + +/* Initializes all required fields for a new connection. Note that it does the + * minimum acceptable initialization for a connection that already exists and + * is about to be reused. It also leaves the addresses untouched, which makes + * it usable across connection retries to reset a connection to a known state. + */ +void conn_init(struct connection *conn, void *target) +{ + conn->obj_type = OBJ_TYPE_CONN; + conn->flags = CO_FL_NONE; + conn->mux = NULL; + conn->ctx = NULL; + conn->owner = NULL; + conn->send_proxy_ofs = 0; + conn->handle.fd = DEAD_FD_MAGIC; + conn->err_code = CO_ER_NONE; + conn->target = target; + conn->destroy_cb = NULL; + conn->proxy_netns = NULL; + MT_LIST_INIT(&conn->toremove_list); + if (conn_is_back(conn)) + LIST_INIT(&conn->session_list); + else + LIST_INIT(&conn->stopping_list); + LIST_INIT(&conn->tlv_list); + conn->subs = NULL; + conn->src = NULL; + conn->dst = NULL; + conn->hash_node = NULL; + conn->xprt = NULL; + conn->reverse.target = NULL; + conn->reverse.name = BUF_NULL; +} + +/* Initialize members used for backend connections. + * + * Returns 0 on success else non-zero. + */ +static int conn_backend_init(struct connection *conn) +{ + if (!sockaddr_alloc(&conn->dst, 0, 0)) + return 1; + + conn->hash_node = conn_alloc_hash_node(conn); + if (unlikely(!conn->hash_node)) + return 1; + + return 0; +} + +/* Release connection elements reserved for backend side usage. It also takes + * care to detach it if linked to a session or a server instance. + * + * This function is useful when freeing a connection or reversing it to the + * frontend side. + */ +static void conn_backend_deinit(struct connection *conn) +{ + /* If the connection is owned by the session, remove it from its list + */ + if (conn_is_back(conn) && LIST_INLIST(&conn->session_list)) { + session_unown_conn(conn->owner, conn); + } + else if (!(conn->flags & CO_FL_PRIVATE)) { + if (obj_type(conn->target) == OBJ_TYPE_SERVER) + srv_release_conn(__objt_server(conn->target), conn); + } + + /* Make sure the connection is not left in the idle connection tree */ + if (conn->hash_node != NULL) + BUG_ON(conn->hash_node->node.node.leaf_p != NULL); + + pool_free(pool_head_conn_hash_node, conn->hash_node); + conn->hash_node = NULL; + +} + +/* Tries to allocate a new connection and initialize its main fields. The + * connection is returned on success, NULL on failure.
The connection must + * be released using pool_free() or conn_free(). + */ +struct connection *conn_new(void *target) +{ + struct connection *conn; + + conn = pool_alloc(pool_head_connection); + if (unlikely(!conn)) + return NULL; + + conn_init(conn, target); + + if (conn_is_back(conn)) { + if (obj_type(target) == OBJ_TYPE_SERVER) + srv_use_conn(__objt_server(target), conn); + + if (conn_backend_init(conn)) { + conn_free(conn); + return NULL; + } + } + + return conn; +} + +/* Releases a connection previously allocated by conn_new() */ +void conn_free(struct connection *conn) +{ + struct conn_tlv_list *tlv, *tlv_back = NULL; + + if (conn_is_back(conn)) + conn_backend_deinit(conn); + + /* Remove the conn from toremove_list. + * + * This is needed to prevent a double-free in case the connection was + * already scheduled from cleaning but is freed before via another + * call. + */ + MT_LIST_DELETE(&conn->toremove_list); + + sockaddr_free(&conn->src); + sockaddr_free(&conn->dst); + + /* Free all previously allocated TLVs */ + list_for_each_entry_safe(tlv, tlv_back, &conn->tlv_list, list) { + LIST_DELETE(&tlv->list); + if (tlv->len > HA_PP2_TLV_VALUE_256) + free(tlv); + else if (tlv->len <= HA_PP2_TLV_VALUE_128) + pool_free(pool_head_pp_tlv_128, tlv); + else + pool_free(pool_head_pp_tlv_256, tlv); + } + + ha_free(&conn->reverse.name.area); + + if (conn_reverse_in_preconnect(conn)) { + struct listener *l = conn_active_reverse_listener(conn); + rhttp_notify_preconn_err(l); + HA_ATOMIC_DEC(&th_ctx->nb_rhttp_conns); + } + else if (conn->flags & CO_FL_REVERSED) { + HA_ATOMIC_DEC(&th_ctx->nb_rhttp_conns); + } + + + conn_force_unsubscribe(conn); + pool_free(pool_head_connection, conn); +} + +struct conn_hash_node *conn_alloc_hash_node(struct connection *conn) +{ + struct conn_hash_node *hash_node = NULL; + + hash_node = pool_zalloc(pool_head_conn_hash_node); + if (unlikely(!hash_node)) + return NULL; + + hash_node->conn = conn; + + return hash_node; +} + +/* Allocates a struct sockaddr from the pool if needed, assigns it to *sap and + * returns it. If <sap> is NULL, the address is always allocated and returned. + * If <sap> is non-null, an address will only be allocated if it points to a + * NULL pointer. In this case the allocated address will be assigned there. + * If <orig> is non-null and <len> positive, the address in <orig> will be copied + * into the allocated address. In both situations the new pointer is returned. + */ +struct sockaddr_storage *sockaddr_alloc(struct sockaddr_storage **sap, const struct sockaddr_storage *orig, socklen_t len) +{ + struct sockaddr_storage *sa; + + if (sap && *sap) + return *sap; + + sa = pool_alloc(pool_head_sockaddr); + if (sa && orig && len > 0) + memcpy(sa, orig, len); + if (sap) + *sap = sa; + return sa; +} + +/* Releases the struct sockaddr potentially pointed to by <sap> to the pool. It + * may be NULL or may point to NULL. If <sap> is not NULL, a NULL is placed + * there. + */ +void sockaddr_free(struct sockaddr_storage **sap) +{ + if (!sap) + return; + pool_free(pool_head_sockaddr, *sap); + *sap = NULL; +} + +/* Try to add a handshake pseudo-XPRT. If the connection's first XPRT is + * raw_sock, then just use the new XPRT as the connection XPRT, otherwise + * call the xprt's add_xprt() method. + * Returns 0 on success, or non-zero on failure.
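+ *
+ * I.e. on a plain raw_sock stack the handshake pseudo-XPRT simply becomes
+ * the connection's xprt with raw_sock below it; otherwise it is spliced
+ * into the existing stack through the top xprt's add_xprt() method.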
+ */ +int xprt_add_hs(struct connection *conn) +{ + void *xprt_ctx = NULL; + const struct xprt_ops *ops = xprt_get(XPRT_HANDSHAKE); + void *nextxprt_ctx = NULL; + const struct xprt_ops *nextxprt_ops = NULL; + + if (conn->flags & CO_FL_ERROR) + return -1; + if (ops->init(conn, &xprt_ctx) < 0) + return -1; + if (conn->xprt == xprt_get(XPRT_RAW)) { + nextxprt_ctx = conn->xprt_ctx; + nextxprt_ops = conn->xprt; + conn->xprt_ctx = xprt_ctx; + conn->xprt = ops; + } else { + if (conn->xprt->add_xprt(conn, conn->xprt_ctx, xprt_ctx, ops, + &nextxprt_ctx, &nextxprt_ops) != 0) { + ops->close(conn, xprt_ctx); + return -1; + } + } + if (ops->add_xprt(conn, xprt_ctx, nextxprt_ctx, nextxprt_ops, NULL, NULL) != 0) { + ops->close(conn, xprt_ctx); + return -1; + } + return 0; +} + +/* returns a human-readable error code for conn->err_code, or NULL if the code + * is unknown. + */ +const char *conn_err_code_str(struct connection *c) +{ + switch (c->err_code) { + case CO_ER_NONE: return "Success"; + + case CO_ER_CONF_FDLIM: return "Reached configured maxconn value"; + case CO_ER_PROC_FDLIM: return "Too many sockets on the process"; + case CO_ER_SYS_FDLIM: return "Too many sockets on the system"; + case CO_ER_SYS_MEMLIM: return "Out of system buffers"; + case CO_ER_NOPROTO: return "Protocol or address family not supported"; + case CO_ER_SOCK_ERR: return "General socket error"; + case CO_ER_PORT_RANGE: return "Source port range exhausted"; + case CO_ER_CANT_BIND: return "Can't bind to source address"; + case CO_ER_FREE_PORTS: return "Out of local source ports on the system"; + case CO_ER_ADDR_INUSE: return "Local source address already in use"; + + case CO_ER_PRX_EMPTY: return "Connection closed while waiting for PROXY protocol header"; + case CO_ER_PRX_ABORT: return "Connection error while waiting for PROXY protocol header"; + case CO_ER_PRX_TIMEOUT: return "Timeout while waiting for PROXY protocol header"; + case CO_ER_PRX_TRUNCATED: return "Truncated PROXY protocol header received"; + case CO_ER_PRX_NOT_HDR: return "Received something which does not look like a PROXY protocol header"; + case CO_ER_PRX_BAD_HDR: return "Received an invalid PROXY protocol header"; + case CO_ER_PRX_BAD_PROTO: return "Received an unhandled protocol in the PROXY protocol header"; + + case CO_ER_CIP_EMPTY: return "Connection closed while waiting for NetScaler Client IP header"; + case CO_ER_CIP_ABORT: return "Connection error while waiting for NetScaler Client IP header"; + case CO_ER_CIP_TIMEOUT: return "Timeout while waiting for a NetScaler Client IP header"; + case CO_ER_CIP_TRUNCATED: return "Truncated NetScaler Client IP header received"; + case CO_ER_CIP_BAD_MAGIC: return "Received an invalid NetScaler Client IP magic number"; + case CO_ER_CIP_BAD_PROTO: return "Received an unhandled protocol in the NetScaler Client IP header"; + + case CO_ER_SSL_EMPTY: return "Connection closed during SSL handshake"; + case CO_ER_SSL_ABORT: return "Connection error during SSL handshake"; + case CO_ER_SSL_TIMEOUT: return "Timeout during SSL handshake"; + case CO_ER_SSL_TOO_MANY: return "Too many SSL connections"; + case CO_ER_SSL_NO_MEM: return "Out of memory when initializing an SSL connection"; + case CO_ER_SSL_RENEG: return "Rejected a client-initiated SSL renegotiation attempt"; + case CO_ER_SSL_CA_FAIL: return "SSL client CA chain cannot be verified"; + case CO_ER_SSL_CRT_FAIL: return "SSL client certificate not trusted"; + case CO_ER_SSL_MISMATCH: return "Server presented an SSL certificate different from the configured one"; + case 
CO_ER_SSL_MISMATCH_SNI: return "Server presented an SSL certificate different from the expected one"; + case CO_ER_SSL_HANDSHAKE: return "SSL handshake failure"; + case CO_ER_SSL_HANDSHAKE_HB: return "SSL handshake failure after heartbeat"; + case CO_ER_SSL_KILLED_HB: return "Stopped a TLSv1 heartbeat attack (CVE-2014-0160)"; + case CO_ER_SSL_NO_TARGET: return "Attempt to use SSL on an unknown target (internal error)"; + case CO_ER_SSL_EARLY_FAILED: return "Server refused early data"; + + case CO_ER_SOCKS4_SEND: return "SOCKS4 Proxy write error during handshake"; + case CO_ER_SOCKS4_RECV: return "SOCKS4 Proxy read error during handshake"; + case CO_ER_SOCKS4_DENY: return "SOCKS4 Proxy deny the request"; + case CO_ER_SOCKS4_ABORT: return "SOCKS4 Proxy handshake aborted by server"; + + case CO_ERR_SSL_FATAL: return "SSL fatal error"; + + case CO_ER_REVERSE: return "Reverse connect failure"; + } + return NULL; +} + +/* Send a message over an established connection. It makes use of send() and + * returns the same return code and errno. If the socket layer is not ready yet + * then -1 is returned and ENOTSOCK is set into errno. If the fd is not marked + * as ready, or if EAGAIN or ENOTCONN is returned, then we return 0. It returns + * -1 with errno set to EMSGSIZE if called with a zero-length message. The + * purpose is to simplify some rare attempts to directly write on the socket + * from above the connection (typically send_proxy). In case of EAGAIN, the fd + * is marked as "cant_send". + * It automatically retries on EINTR. Other errors cause the connection to be + * marked as in error state. It takes similar arguments as send() except the + * first one which is the connection instead of the file descriptor. <flags> + * only supports CO_SFL_MSG_MORE. + */ +int conn_ctrl_send(struct connection *conn, const void *buf, int len, int flags) +{ + const struct buffer buffer = b_make((char*)buf, len, 0, len); + const struct xprt_ops *xprt = xprt_get(XPRT_RAW); + int ret; + + ret = -1; + errno = ENOTSOCK; + + if (conn->flags & CO_FL_SOCK_WR_SH) + goto fail; + + if (!conn_ctrl_ready(conn)) + goto fail; + + errno = EMSGSIZE; + if (!len) + goto fail; + + /* snd_buf() already takes care of updating conn->flags and handling + * the FD polling status. + */ + ret = xprt->snd_buf(conn, NULL, &buffer, buffer.data, flags); + if (conn->flags & CO_FL_ERROR) + ret = -1; + return ret; + fail: + conn->flags |= CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH | CO_FL_ERROR; + return ret; +} + +/* Called from the upper layer, to unsubscribe <es> from events <event_type>. + * The event subscriber <es> is not allowed to change from a previous call as + * long as at least one event is still subscribed. The <event_type> must only + * be a combination of SUB_RETRY_RECV and SUB_RETRY_SEND. It always returns 0. + */ +int conn_unsubscribe(struct connection *conn, void *xprt_ctx, int event_type, struct wait_event *es) +{ + BUG_ON(event_type & ~(SUB_RETRY_SEND|SUB_RETRY_RECV)); + BUG_ON(conn->subs && conn->subs != es); + + es->events &= ~event_type; + if (!es->events) + conn->subs = NULL; + + if (conn_ctrl_ready(conn) && conn->ctrl->ignore_events) + conn->ctrl->ignore_events(conn, event_type); + + return 0; +} + +/* Called from the upper layer, to subscribe <es> to events <event_type>. + * The <es> struct is not allowed to differ from the one passed during a + * previous call to subscribe(). If the connection's ctrl layer is ready, + * the wait_event is immediately woken up and the subscription is cancelled. + * It always returns zero.
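+ *
+ * Illustrative sketch (not part of the upstream file): a caller that
+ * could not send everything would typically register for a retry with
+ * its own wait_event (here a hypothetical <h>):
+ *
+ *     conn_subscribe(conn, NULL, SUB_RETRY_SEND, &h->wait_event);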
+ */ +int conn_subscribe(struct connection *conn, void *xprt_ctx, int event_type, struct wait_event *es) +{ + int ret = 0; + + BUG_ON(event_type & ~(SUB_RETRY_SEND|SUB_RETRY_RECV)); + BUG_ON(conn->subs && conn->subs != es); + + if (conn->subs && (conn->subs->events & event_type) == event_type) + return 0; + + if (conn_ctrl_ready(conn) && conn->ctrl->check_events) { + ret = conn->ctrl->check_events(conn, event_type); + if (ret) + tasklet_wakeup(es->tasklet); + } + + es->events = (es->events | event_type) & ~ret; + conn->subs = es->events ? es : NULL; + return 0; +} + +/* Drains possibly pending incoming data on the connection and updates the flags + * accordingly. This is used to know whether we need to disable lingering on + * close. Returns non-zero if it is safe to close without disabling lingering, + * otherwise zero. The CO_FL_SOCK_RD_SH flag may also be updated if the incoming + * shutdown was reported by the ->drain() function. + */ +int conn_ctrl_drain(struct connection *conn) +{ + int ret = 0; + + if (!conn_ctrl_ready(conn) || conn->flags & (CO_FL_ERROR | CO_FL_SOCK_RD_SH)) + ret = 1; + else if (conn->ctrl->drain) { + ret = conn->ctrl->drain(conn); + if (ret) + conn->flags |= CO_FL_SOCK_RD_SH; + } + return ret; +} + +/* + * Get data length from tlv + */ +static inline size_t get_tlv_length(const struct tlv *src) +{ + return (src->length_hi << 8) | src->length_lo; +} + +/* This handshake handler waits for a PROXY protocol header at the beginning of + * the raw data stream. The header looks like this : + * + * "PROXY" <SP> PROTO <SP> SRC3 <SP> DST3 <SP> SRC4 <SP> DST4 "\r\n" + * + * There must be exactly one space between each field. Fields are : + * - PROTO : layer 4 protocol, which must be "TCP4" or "TCP6". + * - SRC3 : layer 3 (eg: IP) source address in standard text form + * - DST3 : layer 3 (eg: IP) destination address in standard text form + * - SRC4 : layer 4 (eg: TCP port) source address in standard text form + * - DST4 : layer 4 (eg: TCP port) destination address in standard text form + * + * This line MUST be at the beginning of the buffer and MUST NOT wrap. + * + * The header line is small and in all cases smaller than the smallest normal + * TCP MSS. So it MUST always be delivered as one segment, which ensures we + * can safely use MSG_PEEK and avoid buffering. + * + * Once the data is fetched, the values are set in the connection's address + * fields, and data are removed from the socket's buffer. The function returns + * zero if it needs to wait for more data or if it fails, or 1 if it completed + * and removed itself.
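+ * + * Example of a complete line, with illustrative addresses and ports: + * + *   "PROXY TCP4 192.168.0.1 192.168.0.11 56324 443\r\n"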
+ */ +int conn_recv_proxy(struct connection *conn, int flag) +{ + struct session *sess = conn->owner; + char *line, *end; + struct proxy_hdr_v2 *hdr_v2; + const char v2sig[] = PP2_SIGNATURE; + size_t total_v2_len; + size_t tlv_offset = 0; + int ret; + + if (!conn_ctrl_ready(conn)) + goto fail; + + BUG_ON(conn->flags & CO_FL_FDLESS); + + if (!fd_recv_ready(conn->handle.fd)) + goto not_ready; + + while (1) { + ret = recv(conn->handle.fd, trash.area, trash.size, MSG_PEEK); + if (ret < 0) { + if (errno == EINTR) + continue; + if (errno == EAGAIN || errno == EWOULDBLOCK) { + fd_cant_recv(conn->handle.fd); + goto not_ready; + } + goto recv_abort; + } + trash.data = ret; + break; + } + + if (!trash.data) { + /* client shutdown */ + conn->err_code = CO_ER_PRX_EMPTY; + goto fail; + } + + conn->flags &= ~CO_FL_WAIT_L4_CONN; + + if (trash.data < 6) + goto missing; + + line = trash.area; + end = trash.area + trash.data; + + /* Decode a possible proxy request, fail early if it does not match */ + if (strncmp(line, "PROXY ", 6) != 0) + goto not_v1; + + line += 6; + if (trash.data < 9) /* shortest possible line */ + goto missing; + + if (memcmp(line, "TCP4 ", 5) == 0) { + u32 src3, dst3, sport, dport; + + line += 5; + + src3 = inetaddr_host_lim_ret(line, end, &line); + if (line == end) + goto missing; + if (*line++ != ' ') + goto bad_header; + + dst3 = inetaddr_host_lim_ret(line, end, &line); + if (line == end) + goto missing; + if (*line++ != ' ') + goto bad_header; + + sport = read_uint((const char **)&line, end); + if (line == end) + goto missing; + if (*line++ != ' ') + goto bad_header; + + dport = read_uint((const char **)&line, end); + if (line > end - 2) + goto missing; + if (*line++ != '\r') + goto bad_header; + if (*line++ != '\n') + goto bad_header; + + if (!sess || !sockaddr_alloc(&sess->src, NULL, 0) || !sockaddr_alloc(&sess->dst, NULL, 0)) + goto fail; + + /* update the session's addresses and mark them set */ + ((struct sockaddr_in *)sess->src)->sin_family = AF_INET; + ((struct sockaddr_in *)sess->src)->sin_addr.s_addr = htonl(src3); + ((struct sockaddr_in *)sess->src)->sin_port = htons(sport); + + ((struct sockaddr_in *)sess->dst)->sin_family = AF_INET; + ((struct sockaddr_in *)sess->dst)->sin_addr.s_addr = htonl(dst3); + ((struct sockaddr_in *)sess->dst)->sin_port = htons(dport); + } + else if (memcmp(line, "TCP6 ", 5) == 0) { + u32 sport, dport; + char *src_s; + char *dst_s, *sport_s, *dport_s; + struct in6_addr src3, dst3; + + line += 5; + + src_s = line; + dst_s = sport_s = dport_s = NULL; + while (1) { + if (line > end - 2) { + goto missing; + } + else if (*line == '\r') { + *line = 0; + line++; + if (*line++ != '\n') + goto bad_header; + break; + } + + if (*line == ' ') { + *line = 0; + if (!dst_s) + dst_s = line + 1; + else if (!sport_s) + sport_s = line + 1; + else if (!dport_s) + dport_s = line + 1; + } + line++; + } + + if (!dst_s || !sport_s || !dport_s) + goto bad_header; + + sport = read_uint((const char **)&sport_s,dport_s - 1); + if (*sport_s != 0) + goto bad_header; + + dport = read_uint((const char **)&dport_s,line - 2); + if (*dport_s != 0) + goto bad_header; + + if (inet_pton(AF_INET6, src_s, (void *)&src3) != 1) + goto bad_header; + + if (inet_pton(AF_INET6, dst_s, (void *)&dst3) != 1) + goto bad_header; + + if (!sess || !sockaddr_alloc(&sess->src, NULL, 0) || !sockaddr_alloc(&sess->dst, NULL, 0)) + goto fail; + + /* update the session's addresses and mark them set */ + ((struct sockaddr_in6 *)sess->src)->sin6_family = AF_INET6; + memcpy(&((struct sockaddr_in6 
*)sess->src)->sin6_addr, &src3, sizeof(struct in6_addr)); + ((struct sockaddr_in6 *)sess->src)->sin6_port = htons(sport); + + ((struct sockaddr_in6 *)sess->dst)->sin6_family = AF_INET6; + memcpy(&((struct sockaddr_in6 *)sess->dst)->sin6_addr, &dst3, sizeof(struct in6_addr)); + ((struct sockaddr_in6 *)sess->dst)->sin6_port = htons(dport); + } + else if (memcmp(line, "UNKNOWN\r\n", 9) == 0) { + /* This can be a UNIX socket forwarded by an haproxy upstream */ + line += 9; + } + else { + /* The protocol does not match something known (TCP4/TCP6/UNKNOWN) */ + conn->err_code = CO_ER_PRX_BAD_PROTO; + goto fail; + } + + trash.data = line - trash.area; + goto eat_header; + + not_v1: + /* try PPv2 */ + if (trash.data < PP2_HEADER_LEN) + goto missing; + + hdr_v2 = (struct proxy_hdr_v2 *) trash.area; + + if (memcmp(hdr_v2->sig, v2sig, PP2_SIGNATURE_LEN) != 0 || + (hdr_v2->ver_cmd & PP2_VERSION_MASK) != PP2_VERSION) { + conn->err_code = CO_ER_PRX_NOT_HDR; + goto fail; + } + + total_v2_len = PP2_HEADER_LEN + ntohs(hdr_v2->len); + if (trash.data < total_v2_len) + goto missing; + + switch (hdr_v2->ver_cmd & PP2_CMD_MASK) { + case 0x01: /* PROXY command */ + switch (hdr_v2->fam) { + case 0x11: /* TCPv4 */ + if (ntohs(hdr_v2->len) < PP2_ADDR_LEN_INET) + goto bad_header; + + if (!sess || !sockaddr_alloc(&sess->src, NULL, 0) || !sockaddr_alloc(&sess->dst, NULL, 0)) + goto fail; + + ((struct sockaddr_in *)sess->src)->sin_family = AF_INET; + ((struct sockaddr_in *)sess->src)->sin_addr.s_addr = hdr_v2->addr.ip4.src_addr; + ((struct sockaddr_in *)sess->src)->sin_port = hdr_v2->addr.ip4.src_port; + ((struct sockaddr_in *)sess->dst)->sin_family = AF_INET; + ((struct sockaddr_in *)sess->dst)->sin_addr.s_addr = hdr_v2->addr.ip4.dst_addr; + ((struct sockaddr_in *)sess->dst)->sin_port = hdr_v2->addr.ip4.dst_port; + tlv_offset = PP2_HEADER_LEN + PP2_ADDR_LEN_INET; + break; + case 0x21: /* TCPv6 */ + if (ntohs(hdr_v2->len) < PP2_ADDR_LEN_INET6) + goto bad_header; + + if (!sess || !sockaddr_alloc(&sess->src, NULL, 0) || !sockaddr_alloc(&sess->dst, NULL, 0)) + goto fail; + + ((struct sockaddr_in6 *)sess->src)->sin6_family = AF_INET6; + memcpy(&((struct sockaddr_in6 *)sess->src)->sin6_addr, hdr_v2->addr.ip6.src_addr, 16); + ((struct sockaddr_in6 *)sess->src)->sin6_port = hdr_v2->addr.ip6.src_port; + ((struct sockaddr_in6 *)sess->dst)->sin6_family = AF_INET6; + memcpy(&((struct sockaddr_in6 *)sess->dst)->sin6_addr, hdr_v2->addr.ip6.dst_addr, 16); + ((struct sockaddr_in6 *)sess->dst)->sin6_port = hdr_v2->addr.ip6.dst_port; + tlv_offset = PP2_HEADER_LEN + PP2_ADDR_LEN_INET6; + break; + } + + /* TLV parsing */ + while (tlv_offset < total_v2_len) { + struct ist tlv; + struct tlv *tlv_packet = NULL; + struct conn_tlv_list *new_tlv = NULL; + size_t data_len = 0; + + /* Verify that we have at least TLV_HEADER_SIZE bytes left */ + if (tlv_offset + TLV_HEADER_SIZE > total_v2_len) + goto bad_header; + + tlv_packet = (struct tlv *) &trash.area[tlv_offset]; + tlv = ist2((const char *)tlv_packet->value, get_tlv_length(tlv_packet)); + tlv_offset += istlen(tlv) + TLV_HEADER_SIZE; + + /* Verify that the TLV length does not exceed the total PROXYv2 length */ + if (tlv_offset > total_v2_len) + goto bad_header; + + /* Prepare known TLV types */ + switch (tlv_packet->type) { + case PP2_TYPE_CRC32C: { + uint32_t n_crc32c; + + /* Verify that this TLV is exactly 4 bytes long */ + if (istlen(tlv) != PP2_CRC32C_LEN) + goto bad_header; + + n_crc32c = read_n32(istptr(tlv)); + write_n32(istptr(tlv), 0); // compute with CRC==0 + + if 
(hash_crc32c(trash.area, total_v2_len) != n_crc32c) + goto bad_header; + break; + } +#ifdef USE_NS + case PP2_TYPE_NETNS: { + const struct netns_entry *ns; + + ns = netns_store_lookup(istptr(tlv), istlen(tlv)); + if (ns) + conn->proxy_netns = ns; + break; + } +#endif + case PP2_TYPE_AUTHORITY: { + /* For now, keep the length restriction enforced by HAProxy */ + if (istlen(tlv) > HA_PP2_AUTHORITY_MAX) + goto bad_header; + + break; + } + case PP2_TYPE_UNIQUE_ID: { + if (istlen(tlv) > UNIQUEID_LEN) + goto bad_header; + break; + } + default: + break; + } + + /* If we did not find a known TLV type that we can optimize for, we generically allocate it */ + data_len = get_tlv_length(tlv_packet); + + /* Prevent attackers from allocating too much memory */ + if (unlikely(data_len > HA_PP2_MAX_ALLOC)) + goto fail; + + /* Alloc memory based on data_len */ + if (data_len > HA_PP2_TLV_VALUE_256) + new_tlv = malloc(get_tlv_length(tlv_packet) + sizeof(struct conn_tlv_list)); + else if (data_len <= HA_PP2_TLV_VALUE_128) + new_tlv = pool_alloc(pool_head_pp_tlv_128); + else + new_tlv = pool_alloc(pool_head_pp_tlv_256); + + if (unlikely(!new_tlv)) + goto fail; + + new_tlv->type = tlv_packet->type; + + /* Save TLV to make it accessible via sample fetch */ + memcpy(new_tlv->value, tlv.ptr, data_len); + new_tlv->len = data_len; + + LIST_APPEND(&conn->tlv_list, &new_tlv->list); + } + + + /* Verify that the PROXYv2 header ends at a TLV boundary. + * This check cannot fail, because the TLV parsing above + * already verifies that no TLV exceeds the total length + * and that there is space for each TLV header. + */ + BUG_ON(tlv_offset != total_v2_len); + + /* for unsupported protocol families above, keep the local connection address */ + break; + case 0x00: /* LOCAL command */ + /* keep local connection address for LOCAL */ + break; + default: + goto bad_header; /* not a supported command */ + } + + trash.data = total_v2_len; + goto eat_header; + + eat_header: + /* remove the PROXY line from the request. For this we re-read the + * exact line at once. If we don't get the exact same result, we + * fail. + */ + while (1) { + ssize_t len2 = recv(conn->handle.fd, trash.area, trash.data, 0); + + if (len2 < 0 && errno == EINTR) + continue; + if (len2 != trash.data) + goto recv_abort; + break; + } + + conn->flags &= ~flag; + conn->flags |= CO_FL_RCVD_PROXY; + return 1; + + not_ready: + return 0; + + missing: + /* Missing data. Since we're using MSG_PEEK, we can only poll again if + * we have not read anything. Otherwise we need to fail because we won't + * be able to poll anymore. + */ + conn->err_code = CO_ER_PRX_TRUNCATED; + goto fail; + + bad_header: + /* This is not a valid proxy protocol header */ + conn->err_code = CO_ER_PRX_BAD_HDR; + goto fail; + + recv_abort: + conn->err_code = CO_ER_PRX_ABORT; + conn->flags |= CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH; + goto fail; + + fail: + conn->flags |= CO_FL_ERROR; + return 0; +} + +/* This callback is used to send a valid PROXY protocol line to a socket being + * established. It returns 0 if it fails in a fatal way or needs to poll to go + * further, otherwise it returns non-zero and removes itself from the connection's + * flags (the bit is provided in <flag> by the caller). It is designed to be + * called by the connection handler and relies on it to commit polling changes. + * Note that it can emit a PROXY line by relying on the other end's address + * when the connection is attached to a stream connector, or by resolving the + * local address otherwise (also called a LOCAL line).
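+ * + * As a worked example (illustrative size): for a 40-byte line, send_proxy_ofs + * is first set to -40; each partial send moves it towards zero, and the + * handshake completes once it reaches zero.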
+ */ +int conn_send_proxy(struct connection *conn, unsigned int flag) +{ + if (!conn_ctrl_ready(conn)) + goto out_error; + + /* If we have a PROXY line to send, we'll use this to validate the + * connection, in which case the connection is validated only once + * we've sent the whole proxy line. Otherwise we use connect(). + */ + if (conn->send_proxy_ofs) { + struct stconn *sc; + int ret; + + /* If there is no mux attached to the connection, it means the + * connection context is a stream connector. + */ + sc = conn->mux ? conn_get_first_sc(conn) : conn->ctx; + + /* The target server expects a PROXY line to be sent first. + * If the send_proxy_ofs is negative, it corresponds to the + * offset to start sending from the end of the proxy string + * (which is recomputed every time since it's constant). If + * it is positive, it means we have to send from the start. + * We can only send a "normal" PROXY line when the connection + * is attached to a stream connector. Otherwise we can only + * send a LOCAL line (eg: for use with health checks). + */ + + if (sc && sc_strm(sc)) { + ret = make_proxy_line(trash.area, trash.size, + objt_server(conn->target), + sc_conn(sc_opposite(sc)), + __sc_strm(sc)); + } + else { + /* The target server expects a LOCAL line to be sent first. Retrieving + * local or remote addresses may fail until the connection is established. + */ + if (!conn_get_src(conn) || !conn_get_dst(conn)) + goto out_wait; + + ret = make_proxy_line(trash.area, trash.size, + objt_server(conn->target), conn, + NULL); + } + + if (!ret) + goto out_error; + + if (conn->send_proxy_ofs > 0) + conn->send_proxy_ofs = -ret; /* first call */ + + /* we have to send trash from (ret+sp for -sp bytes). If the + * data layer has a pending write, we'll also set MSG_MORE. + */ + ret = conn_ctrl_send(conn, + trash.area + ret + conn->send_proxy_ofs, + -conn->send_proxy_ofs, + (conn->subs && conn->subs->events & SUB_RETRY_SEND) ? CO_SFL_MSG_MORE : 0); + + if (ret < 0) + goto out_error; + + conn->send_proxy_ofs += ret; /* becomes zero once complete */ + if (conn->send_proxy_ofs != 0) + goto out_wait; + + /* OK we've sent the whole line, we're connected */ + } + + /* The connection is ready now, simply return and let the connection + * handler notify upper layers if needed. + */ + conn->flags &= ~CO_FL_WAIT_L4_CONN; + conn->flags &= ~flag; + return 1; + + out_error: + /* Write error on the file descriptor */ + conn->flags |= CO_FL_ERROR; + return 0; + + out_wait: + return 0; +} + +/* This handshake handler waits for a NetScaler Client IP insertion header + * at the beginning of the raw data stream. The header format is + * described in doc/netscaler-client-ip-insertion-protocol.txt + * + * This line MUST be at the beginning of the buffer and MUST NOT be + * fragmented. + * + * The header line is small and in all cases smaller than the smallest normal + * TCP MSS. So it MUST always be delivered as one segment, which ensures we + * can safely use MSG_PEEK and avoid buffering. + * + * Once the data is fetched, the values are set in the connection's address + * fields, and data are removed from the socket's buffer. The function returns + * zero if it needs to wait for more data or if it fails, or 1 if it completed + * and removed itself.
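+ * + * As parsed below, the first 4 bytes must carry the listener's configured CIP + * magic. The byte at offset 8 then selects the framing: (byte & 0xD0) == 0x40 + * denotes the legacy format with a 4-byte header length at offset 4, while + * 0x00 denotes the standard format with a 2-byte header length at offset 10.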
+ */ +int conn_recv_netscaler_cip(struct connection *conn, int flag) +{ + struct session *sess = conn->owner; + char *line; + uint32_t hdr_len; + uint8_t ip_ver; + int ret; + + if (!conn_ctrl_ready(conn)) + goto fail; + + BUG_ON(conn->flags & CO_FL_FDLESS); + + if (!fd_recv_ready(conn->handle.fd)) + goto not_ready; + + while (1) { + ret = recv(conn->handle.fd, trash.area, trash.size, MSG_PEEK); + if (ret < 0) { + if (errno == EINTR) + continue; + if (errno == EAGAIN || errno == EWOULDBLOCK) { + fd_cant_recv(conn->handle.fd); + goto not_ready; + } + goto recv_abort; + } + trash.data = ret; + break; + } + + conn->flags &= ~CO_FL_WAIT_L4_CONN; + + if (!trash.data) { + /* client shutdown */ + conn->err_code = CO_ER_CIP_EMPTY; + goto fail; + } + + /* Fail if buffer length is not large enough to contain + * CIP magic, header length or + * CIP magic, CIP length, CIP type, header length */ + if (trash.data < 12) + goto missing; + + line = trash.area; + + /* Decode a possible NetScaler Client IP request, fail early if + * it does not match */ + if (ntohl(read_u32(line)) != __objt_listener(conn->target)->bind_conf->ns_cip_magic) + goto bad_magic; + + /* Legacy CIP protocol */ + if ((trash.area[8] & 0xD0) == 0x40) { + hdr_len = ntohl(read_u32((line+4))); + line += 8; + } + /* Standard CIP protocol */ + else if (trash.area[8] == 0x00) { + hdr_len = ntohs(read_u32((line+10))); + line += 12; + } + /* Unknown CIP protocol */ + else { + conn->err_code = CO_ER_CIP_BAD_PROTO; + goto fail; + } + + /* Fail if buffer length is not large enough to contain + * a minimal IP header */ + if (trash.data < 20) + goto missing; + + /* Get IP version from the first four bits */ + ip_ver = (*line & 0xf0) >> 4; + + if (ip_ver == 4) { + struct ip *hdr_ip4; + struct my_tcphdr *hdr_tcp; + + hdr_ip4 = (struct ip *)line; + + if (trash.data < 40 || trash.data < hdr_len) { + /* Fail if buffer length is not large enough to contain + * IPv4 header, TCP header */ + goto missing; + } + else if (hdr_ip4->ip_p != IPPROTO_TCP) { + /* The protocol does not include a TCP header */ + conn->err_code = CO_ER_CIP_BAD_PROTO; + goto fail; + } + + hdr_tcp = (struct my_tcphdr *)(line + (hdr_ip4->ip_hl * 4)); + + if (!sess || !sockaddr_alloc(&sess->src, NULL, 0) || !sockaddr_alloc(&sess->dst, NULL, 0)) + goto fail; + + /* update the session's addresses and mark them set */ + ((struct sockaddr_in *)sess->src)->sin_family = AF_INET; + ((struct sockaddr_in *)sess->src)->sin_addr.s_addr = hdr_ip4->ip_src.s_addr; + ((struct sockaddr_in *)sess->src)->sin_port = hdr_tcp->source; + + ((struct sockaddr_in *)sess->dst)->sin_family = AF_INET; + ((struct sockaddr_in *)sess->dst)->sin_addr.s_addr = hdr_ip4->ip_dst.s_addr; + ((struct sockaddr_in *)sess->dst)->sin_port = hdr_tcp->dest; + } + else if (ip_ver == 6) { + struct ip6_hdr *hdr_ip6; + struct my_tcphdr *hdr_tcp; + + hdr_ip6 = (struct ip6_hdr *)line; + + if (trash.data < 60 || trash.data < hdr_len) { + /* Fail if buffer length is not large enough to contain + * IPv6 header, TCP header */ + goto missing; + } + else if (hdr_ip6->ip6_nxt != IPPROTO_TCP) { + /* The protocol does not include a TCP header */ + conn->err_code = CO_ER_CIP_BAD_PROTO; + goto fail; + } + + hdr_tcp = (struct my_tcphdr *)(line + sizeof(struct ip6_hdr)); + + if (!sess || !sockaddr_alloc(&sess->src, NULL, 0) || !sockaddr_alloc(&sess->dst, NULL, 0)) + goto fail; + + /* update the session's addresses and mark them set */ + ((struct sockaddr_in6 *)sess->src)->sin6_family = AF_INET6; + ((struct sockaddr_in6 *)sess->src)->sin6_addr = 
hdr_ip6->ip6_src; + ((struct sockaddr_in6 *)sess->src)->sin6_port = hdr_tcp->source; + + ((struct sockaddr_in6 *)sess->dst)->sin6_family = AF_INET6; + ((struct sockaddr_in6 *)sess->dst)->sin6_addr = hdr_ip6->ip6_dst; + ((struct sockaddr_in6 *)sess->dst)->sin6_port = hdr_tcp->dest; + } + else { + /* The protocol does not match something known (IPv4/IPv6) */ + conn->err_code = CO_ER_CIP_BAD_PROTO; + goto fail; + } + + line += hdr_len; + trash.data = line - trash.area; + + /* remove the NetScaler Client IP header from the request. For this + * we re-read the exact line at once. If we don't get the exact same + * result, we fail. + */ + while (1) { + int len2 = recv(conn->handle.fd, trash.area, trash.data, 0); + if (len2 < 0 && errno == EINTR) + continue; + if (len2 != trash.data) + goto recv_abort; + break; + } + + conn->flags &= ~flag; + return 1; + + not_ready: + return 0; + + missing: + /* Missing data. Since we're using MSG_PEEK, we can only poll again if + * we have not read anything. Otherwise we need to fail because we won't + * be able to poll anymore. + */ + conn->err_code = CO_ER_CIP_TRUNCATED; + goto fail; + + bad_magic: + conn->err_code = CO_ER_CIP_BAD_MAGIC; + goto fail; + + recv_abort: + conn->err_code = CO_ER_CIP_ABORT; + conn->flags |= CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH; + goto fail; + + fail: + conn->flags |= CO_FL_ERROR; + return 0; +} + + +int conn_send_socks4_proxy_request(struct connection *conn) +{ + struct socks4_request req_line; + + if (!conn_ctrl_ready(conn)) + goto out_error; + + if (!conn_get_dst(conn)) + goto out_error; + + req_line.version = 0x04; + req_line.command = 0x01; + req_line.port = get_net_port(conn->dst); + req_line.ip = is_inet_addr(conn->dst); + memcpy(req_line.user_id, "HAProxy\0", 8); + + if (conn->send_proxy_ofs > 0) { + /* + * This is the first call to send the request + */ + conn->send_proxy_ofs = -(int)sizeof(req_line); + } + + if (conn->send_proxy_ofs < 0) { + int ret = 0; + + /* we are sending the socks4_req_line here. If the data layer + * has a pending write, we'll also set MSG_MORE. + */ + ret = conn_ctrl_send( + conn, + ((char *)(&req_line)) + (sizeof(req_line)+conn->send_proxy_ofs), + -conn->send_proxy_ofs, + (conn->subs && conn->subs->events & SUB_RETRY_SEND) ? CO_SFL_MSG_MORE : 0); + + DPRINTF(stderr, "SOCKS PROXY HS FD[%04X]: Before send remain is [%d], sent [%d]\n", + conn_fd(conn), -conn->send_proxy_ofs, ret); + + if (ret < 0) { + goto out_error; + } + + conn->send_proxy_ofs += ret; /* becomes zero once complete */ + if (conn->send_proxy_ofs != 0) { + goto out_wait; + } + } + + /* OK we've sent the whole request */ + conn->flags &= ~CO_FL_SOCKS4_SEND; + + /* The connection is ready now, simply return and let the connection + * handler notify upper layers if needed. + */ + conn->flags &= ~CO_FL_WAIT_L4_CONN; + + if (conn->flags & CO_FL_SEND_PROXY) { + /* + * Get send_proxy_ofs ready for the send_proxy step, since we are + * reusing "send_proxy_ofs" and the SOCKS4 handshake must be completed + * before the PROXY protocol line is sent.
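+ * In other words, send_proxy_ofs is shared by both handshakes: re-arming + * it to a positive value here makes the next conn_send_proxy() call behave + * as a first call and compute its own negative offset.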
+ */ + conn->send_proxy_ofs = 1; + } + return 1; + + out_error: + /* Write error on the file descriptor */ + conn->flags |= CO_FL_ERROR; + if (conn->err_code == CO_ER_NONE) { + conn->err_code = CO_ER_SOCKS4_SEND; + } + return 0; + + out_wait: + return 0; +} + +int conn_recv_socks4_proxy_response(struct connection *conn) +{ + char line[SOCKS4_HS_RSP_LEN]; + int ret; + + if (!conn_ctrl_ready(conn)) + goto fail; + + BUG_ON(conn->flags & CO_FL_FDLESS); + + if (!fd_recv_ready(conn->handle.fd)) + goto not_ready; + + while (1) { + /* SOCKS4 Proxy will respond with 8 bytes, 0x00 | 0x5A | 0x00 0x00 | 0x00 0x00 0x00 0x00 + * Try to peek at it before all 8 bytes are ready. + */ + ret = recv(conn->handle.fd, line, SOCKS4_HS_RSP_LEN, MSG_PEEK); + + if (ret == 0) { + /* the socket has been closed or shutdown for send */ + DPRINTF(stderr, "SOCKS PROXY HS FD[%04X]: Received ret[%d], errno[%d], looks like the socket has been closed or shutdown for send\n", + conn->handle.fd, ret, errno); + if (conn->err_code == CO_ER_NONE) { + conn->err_code = CO_ER_SOCKS4_RECV; + } + goto fail; + } + + if (ret > 0) { + if (ret == SOCKS4_HS_RSP_LEN) { + DPRINTF(stderr, "SOCKS PROXY HS FD[%04X]: Received 8 bytes, the response is [%02X|%02X|%02X %02X|%02X %02X %02X %02X]\n", + conn->handle.fd, line[0], line[1], line[2], line[3], line[4], line[5], line[6], line[7]); + } else { + DPRINTF(stderr, "SOCKS PROXY HS FD[%04X]: Received ret[%d], first byte is [%02X], last byte is [%02X]\n", conn->handle.fd, ret, line[0], line[ret-1]); + } + } else { + DPRINTF(stderr, "SOCKS PROXY HS FD[%04X]: Received ret[%d], errno[%d]\n", conn->handle.fd, ret, errno); + } + + if (ret < 0) { + if (errno == EINTR) { + continue; + } + if (errno == EAGAIN || errno == EWOULDBLOCK) { + fd_cant_recv(conn->handle.fd); + goto not_ready; + } + goto recv_abort; + } + break; + } + + conn->flags &= ~CO_FL_WAIT_L4_CONN; + + if (ret < SOCKS4_HS_RSP_LEN) { + /* Missing data. Since we're using MSG_PEEK, we can simply poll again if + * we were not able to read enough data. + */ + goto not_ready; + } + + /* + * Based on the SOCKS4 protocol: + * + * +----+----+----+----+----+----+----+----+ + * | VN | CD | DSTPORT | DSTIP | + * +----+----+----+----+----+----+----+----+ + * # of bytes: 1 1 2 4 + * VN is the version of the reply code and should be 0. CD is the result + * code with one of the following values: + * 90: request granted + * 91: request rejected or failed + * 92: request rejected because SOCKS server cannot connect to identd on the client + * 93: request rejected because the client program and identd report different user-ids + * The remaining fields are ignored.
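+ * + * A granted reply thus reads, in hex: 00 5A 00 00 00 00 00 00, which is + * exactly what the check on the second byte below looks for (0x5A == 90).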
+ */ + if (line[1] != 90) { + conn->flags &= ~CO_FL_SOCKS4_RECV; + + DPRINTF(stderr, "SOCKS PROXY HS FD[%04X]: FAIL, the response is [%02X|%02X|%02X %02X|%02X %02X %02X %02X]\n", + conn->handle.fd, line[0], line[1], line[2], line[3], line[4], line[5], line[6], line[7]); + if (conn->err_code == CO_ER_NONE) { + conn->err_code = CO_ER_SOCKS4_DENY; + } + goto fail; + } + + /* remove the 8 bytes response from the stream */ + while (1) { + ret = recv(conn->handle.fd, line, SOCKS4_HS_RSP_LEN, 0); + if (ret < 0 && errno == EINTR) { + continue; + } + if (ret != SOCKS4_HS_RSP_LEN) { + if (conn->err_code == CO_ER_NONE) { + conn->err_code = CO_ER_SOCKS4_RECV; + } + goto fail; + } + break; + } + + conn->flags &= ~CO_FL_SOCKS4_RECV; + return 1; + + not_ready: + return 0; + + recv_abort: + if (conn->err_code == CO_ER_NONE) { + conn->err_code = CO_ER_SOCKS4_ABORT; + } + conn->flags |= (CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH); + goto fail; + + fail: + conn->flags |= CO_FL_ERROR; + return 0; +} + +/* registers proto mux list <list>. Modifies the list element! */ +void register_mux_proto(struct mux_proto_list *list) +{ + LIST_APPEND(&mux_proto_list.list, &list->list); +} + +/* Lists the known proto mux on <out>. This function is used by "haproxy -vv" + * and is suitable for early boot just after the "REGISTER" stage because it + * doesn't depend on anything to be already allocated. + */ +void list_mux_proto(FILE *out) +{ + struct mux_proto_list *item; + struct ist proto; + char *mode, *side; + int done; + + fprintf(out, "Available multiplexer protocols :\n" + "(protocols marked as <default> cannot be specified using 'proto' keyword)\n"); + list_for_each_entry(item, &mux_proto_list.list, list) { + proto = item->token; + + if (item->mode == PROTO_MODE_ANY) + mode = "TCP|HTTP"; + else if (item->mode == PROTO_MODE_TCP) + mode = "TCP"; + else if (item->mode == PROTO_MODE_HTTP) + mode = "HTTP"; + else + mode = "NONE"; + + if (item->side == PROTO_SIDE_BOTH) + side = "FE|BE"; + else if (item->side == PROTO_SIDE_FE) + side = "FE"; + else if (item->side == PROTO_SIDE_BE) + side = "BE"; + else + side = "NONE"; + + fprintf(out, " %10s : mode=%-5s side=%-6s mux=%-5s flags=", + (proto.len ? proto.ptr : "<default>"), mode, side, item->mux->name); + + done = 0; + + /* note: the block below could be simplified using macros but for only + * 4 flags it's not worth it. + */ + if (item->mux->flags & MX_FL_HTX) + done |= fprintf(out, "%sHTX", done ? "|" : ""); + + if (item->mux->flags & MX_FL_HOL_RISK) + done |= fprintf(out, "%sHOL_RISK", done ? "|" : ""); + + if (item->mux->flags & MX_FL_NO_UPG) + done |= fprintf(out, "%sNO_UPG", done ? "|" : ""); + + if (item->mux->flags & MX_FL_FRAMED) + done |= fprintf(out, "%sFRAMED", done ? "|" : ""); + + fprintf(out, "\n"); + } +} + +/* Makes a PROXY protocol line from the two addresses. The output is sent to + * buffer <buf> for a maximum size of <buf_len> (including the trailing zero). + * It returns the number of bytes composing this line (including the trailing + * LF), or zero in case of failure (eg: not enough space). It supports TCP4, + * TCP6 and "UNKNOWN" formats. If any of <src> or <dst> is null, UNKNOWN is + * emitted as well. 
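+ * + * Example outputs (illustrative values): + * + *   "PROXY TCP6 ::1 ::1 56324 443\r\n" + *   "PROXY UNKNOWN\r\n"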
+ */ +static int make_proxy_line_v1(char *buf, int buf_len, const struct sockaddr_storage *src, const struct sockaddr_storage *dst) +{ + int ret = 0; + char * protocol; + char src_str[MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN)]; + char dst_str[MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN)]; + in_port_t src_port; + in_port_t dst_port; + + if ( !src + || !dst + || (src->ss_family != AF_INET && src->ss_family != AF_INET6) + || (dst->ss_family != AF_INET && dst->ss_family != AF_INET6)) { + /* unknown family combination */ + ret = snprintf(buf, buf_len, "PROXY UNKNOWN\r\n"); + if (ret >= buf_len) + return 0; + + return ret; + } + + /* IPv4 for both src and dst */ + if (src->ss_family == AF_INET && dst->ss_family == AF_INET) { + protocol = "TCP4"; + if (!inet_ntop(AF_INET, &((struct sockaddr_in *)src)->sin_addr, src_str, sizeof(src_str))) + return 0; + src_port = ((struct sockaddr_in *)src)->sin_port; + if (!inet_ntop(AF_INET, &((struct sockaddr_in *)dst)->sin_addr, dst_str, sizeof(dst_str))) + return 0; + dst_port = ((struct sockaddr_in *)dst)->sin_port; + } + /* IPv6 for at least one of src and dst */ + else { + struct in6_addr tmp; + + protocol = "TCP6"; + + if (src->ss_family == AF_INET) { + /* Convert src to IPv6 */ + v4tov6(&tmp, &((struct sockaddr_in *)src)->sin_addr); + src_port = ((struct sockaddr_in *)src)->sin_port; + } + else { + tmp = ((struct sockaddr_in6 *)src)->sin6_addr; + src_port = ((struct sockaddr_in6 *)src)->sin6_port; + } + + if (!inet_ntop(AF_INET6, &tmp, src_str, sizeof(src_str))) + return 0; + + if (dst->ss_family == AF_INET) { + /* Convert dst to IPv6 */ + v4tov6(&tmp, &((struct sockaddr_in *)dst)->sin_addr); + dst_port = ((struct sockaddr_in *)dst)->sin_port; + } + else { + tmp = ((struct sockaddr_in6 *)dst)->sin6_addr; + dst_port = ((struct sockaddr_in6 *)dst)->sin6_port; + } + + if (!inet_ntop(AF_INET6, &tmp, dst_str, sizeof(dst_str))) + return 0; + } + + ret = snprintf(buf, buf_len, "PROXY %s %s %s %u %u\r\n", protocol, src_str, dst_str, ntohs(src_port), ntohs(dst_port)); + if (ret >= buf_len) + return 0; + + return ret; +} + +static int make_tlv(char *dest, int dest_len, char type, uint16_t length, const char *value) +{ + struct tlv *tlv; + + if (!dest || (length + sizeof(*tlv) > dest_len)) + return 0; + + tlv = (struct tlv *)dest; + + tlv->type = type; + tlv->length_hi = length >> 8; + tlv->length_lo = length & 0x00ff; + memcpy(tlv->value, value, length); + return length + sizeof(*tlv); +} + +/* Note: <remote> is explicitly allowed to be NULL */ +static int make_proxy_line_v2(char *buf, int buf_len, struct server *srv, struct connection *remote, struct stream *strm) +{ + const char pp2_signature[] = PP2_SIGNATURE; + void *tlv_crc32c_p = NULL; + int ret = 0; + struct proxy_hdr_v2 *hdr = (struct proxy_hdr_v2 *)buf; + struct sockaddr_storage null_addr = { .ss_family = 0 }; + struct srv_pp_tlv_list *srv_tlv = NULL; + const struct sockaddr_storage *src = &null_addr; + const struct sockaddr_storage *dst = &null_addr; + const char *value = ""; + int value_len = 0; + + if (buf_len < PP2_HEADER_LEN) + return 0; + memcpy(hdr->sig, pp2_signature, PP2_SIGNATURE_LEN); + + if (strm) { + src = sc_src(strm->scf); + dst = sc_dst(strm->scf); + } + else if (remote && conn_get_src(remote) && conn_get_dst(remote)) { + src = conn_src(remote); + dst = conn_dst(remote); + } + + /* At least one of src or dst is not of AF_INET or AF_INET6 */ + if ( !src + || !dst + || (!pp2_never_send_local && conn_is_back(remote)) // locally initiated connection + || (src->ss_family != AF_INET && src->ss_family != 
AF_INET6) + || (dst->ss_family != AF_INET && dst->ss_family != AF_INET6)) { + if (buf_len < PP2_HDR_LEN_UNSPEC) + return 0; + hdr->ver_cmd = PP2_VERSION | PP2_CMD_LOCAL; + hdr->fam = PP2_FAM_UNSPEC | PP2_TRANS_UNSPEC; + ret = PP2_HDR_LEN_UNSPEC; + } + else { + hdr->ver_cmd = PP2_VERSION | PP2_CMD_PROXY; + /* IPv4 for both src and dst */ + if (src->ss_family == AF_INET && dst->ss_family == AF_INET) { + if (buf_len < PP2_HDR_LEN_INET) + return 0; + hdr->fam = PP2_FAM_INET | PP2_TRANS_STREAM; + hdr->addr.ip4.src_addr = ((struct sockaddr_in *)src)->sin_addr.s_addr; + hdr->addr.ip4.src_port = ((struct sockaddr_in *)src)->sin_port; + hdr->addr.ip4.dst_addr = ((struct sockaddr_in *)dst)->sin_addr.s_addr; + hdr->addr.ip4.dst_port = ((struct sockaddr_in *)dst)->sin_port; + ret = PP2_HDR_LEN_INET; + } + /* IPv6 for at least one of src and dst */ + else { + struct in6_addr tmp; + + if (buf_len < PP2_HDR_LEN_INET6) + return 0; + hdr->fam = PP2_FAM_INET6 | PP2_TRANS_STREAM; + if (src->ss_family == AF_INET) { + v4tov6(&tmp, &((struct sockaddr_in *)src)->sin_addr); + memcpy(hdr->addr.ip6.src_addr, &tmp, 16); + hdr->addr.ip6.src_port = ((struct sockaddr_in *)src)->sin_port; + } + else { + memcpy(hdr->addr.ip6.src_addr, &((struct sockaddr_in6 *)src)->sin6_addr, 16); + hdr->addr.ip6.src_port = ((struct sockaddr_in6 *)src)->sin6_port; + } + if (dst->ss_family == AF_INET) { + v4tov6(&tmp, &((struct sockaddr_in *)dst)->sin_addr); + memcpy(hdr->addr.ip6.dst_addr, &tmp, 16); + hdr->addr.ip6.dst_port = ((struct sockaddr_in *)dst)->sin_port; + } + else { + memcpy(hdr->addr.ip6.dst_addr, &((struct sockaddr_in6 *)dst)->sin6_addr, 16); + hdr->addr.ip6.dst_port = ((struct sockaddr_in6 *)dst)->sin6_port; + } + + ret = PP2_HDR_LEN_INET6; + } + } + + if (strm) { + struct buffer *replace = NULL; + + list_for_each_entry(srv_tlv, &srv->pp_tlvs, list) { + replace = NULL; + + /* Users will always need to provide a value, in case of forwarding, they should use fc_pp_tlv. + * for generic types. Otherwise, we will send an empty TLV. 
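+ * For example (illustrative configuration snippet; assumes this version's + * set-proxy-v2-tlv-fmt server keyword, which populates srv->pp_tlvs): + * + *   server srv1 192.168.0.11:443 send-proxy-v2 set-proxy-v2-tlv-fmt(0xE1) %[fc_pp_tlv(0xE1)] + * + * would forward the custom TLV 0xE1 received on the frontend connection.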
+ */ + if (!LIST_ISEMPTY(&srv_tlv->fmt)) { + replace = alloc_trash_chunk(); + if (unlikely(!replace)) + return 0; + + replace->data = build_logline(strm, replace->area, replace->size, &srv_tlv->fmt); + + if (unlikely((buf_len - ret) < sizeof(struct tlv))) { + free_trash_chunk(replace); + return 0; + } + ret += make_tlv(&buf[ret], (buf_len - ret), srv_tlv->type, replace->data, replace->area); + free_trash_chunk(replace); + } + else { + /* Create empty TLV as no value was specified */ + ret += make_tlv(&buf[ret], (buf_len - ret), srv_tlv->type, 0, NULL); + } + } + } + + /* Handle predefined TLVs as usual */ + if (srv->pp_opts & SRV_PP_V2_CRC32C) { + uint32_t zero_crc32c = 0; + + if ((buf_len - ret) < sizeof(struct tlv)) + return 0; + tlv_crc32c_p = (void *)((struct tlv *)&buf[ret])->value; + ret += make_tlv(&buf[ret], (buf_len - ret), PP2_TYPE_CRC32C, sizeof(zero_crc32c), (const char *)&zero_crc32c); + } + + if (remote && conn_get_alpn(remote, &value, &value_len)) { + if ((buf_len - ret) < sizeof(struct tlv)) + return 0; + ret += make_tlv(&buf[ret], (buf_len - ret), PP2_TYPE_ALPN, value_len, value); + } + + if (srv->pp_opts & SRV_PP_V2_AUTHORITY) { + struct conn_tlv_list *tlv = conn_get_tlv(remote, PP2_TYPE_AUTHORITY); + + value = NULL; + if (tlv) { + value_len = tlv->len; + value = tlv->value; + } +#ifdef USE_OPENSSL + else { + if ((value = ssl_sock_get_sni(remote))) + value_len = strlen(value); + } +#endif + if (value) { + if ((buf_len - ret) < sizeof(struct tlv)) + return 0; + ret += make_tlv(&buf[ret], (buf_len - ret), PP2_TYPE_AUTHORITY, value_len, value); + } + } + + if (strm && (srv->pp_opts & SRV_PP_V2_UNIQUE_ID)) { + struct session* sess = strm_sess(strm); + struct ist unique_id = stream_generate_unique_id(strm, &sess->fe->format_unique_id); + + value = unique_id.ptr; + value_len = unique_id.len; + + if (value_len >= 0) { + if ((buf_len - ret) < sizeof(struct tlv)) + return 0; + ret += make_tlv(&buf[ret], (buf_len - ret), PP2_TYPE_UNIQUE_ID, value_len, value); + } + } + +#ifdef USE_OPENSSL + if (srv->pp_opts & SRV_PP_V2_SSL) { + struct tlv_ssl *tlv; + int ssl_tlv_len = 0; + + if ((buf_len - ret) < sizeof(struct tlv_ssl)) + return 0; + tlv = (struct tlv_ssl *)&buf[ret]; + memset(tlv, 0, sizeof(struct tlv_ssl)); + ssl_tlv_len += sizeof(struct tlv_ssl); + tlv->tlv.type = PP2_TYPE_SSL; + if (conn_is_ssl(remote)) { + tlv->client |= PP2_CLIENT_SSL; + value = ssl_sock_get_proto_version(remote); + if (value) { + ssl_tlv_len += make_tlv(&buf[ret+ssl_tlv_len], (buf_len-ret-ssl_tlv_len), PP2_SUBTYPE_SSL_VERSION, strlen(value), value); + } + if (ssl_sock_get_cert_used_sess(remote)) { + tlv->client |= PP2_CLIENT_CERT_SESS; + tlv->verify = htonl(ssl_sock_get_verify_result(remote)); + if (ssl_sock_get_cert_used_conn(remote)) + tlv->client |= PP2_CLIENT_CERT_CONN; + } + if (srv->pp_opts & SRV_PP_V2_SSL_CN) { + struct buffer *cn_trash = get_trash_chunk(); + if (ssl_sock_get_remote_common_name(remote, cn_trash) > 0) { + ssl_tlv_len += make_tlv(&buf[ret+ssl_tlv_len], (buf_len - ret - ssl_tlv_len), PP2_SUBTYPE_SSL_CN, + cn_trash->data, + cn_trash->area); + } + } + if (srv->pp_opts & SRV_PP_V2_SSL_KEY_ALG) { + struct buffer *pkey_trash = get_trash_chunk(); + if (ssl_sock_get_pkey_algo(remote, pkey_trash) > 0) { + ssl_tlv_len += make_tlv(&buf[ret+ssl_tlv_len], (buf_len - ret - ssl_tlv_len), PP2_SUBTYPE_SSL_KEY_ALG, + pkey_trash->data, + pkey_trash->area); + } + } + if (srv->pp_opts & SRV_PP_V2_SSL_SIG_ALG) { + value = ssl_sock_get_cert_sig(remote); + if (value) { + ssl_tlv_len += 
make_tlv(&buf[ret+ssl_tlv_len], (buf_len - ret - ssl_tlv_len), PP2_SUBTYPE_SSL_SIG_ALG, strlen(value), value); + } + } + if (srv->pp_opts & SRV_PP_V2_SSL_CIPHER) { + value = ssl_sock_get_cipher_name(remote); + if (value) { + ssl_tlv_len += make_tlv(&buf[ret+ssl_tlv_len], (buf_len - ret - ssl_tlv_len), PP2_SUBTYPE_SSL_CIPHER, strlen(value), value); + } + } + } + tlv->tlv.length_hi = (uint16_t)(ssl_tlv_len - sizeof(struct tlv)) >> 8; + tlv->tlv.length_lo = (uint16_t)(ssl_tlv_len - sizeof(struct tlv)) & 0x00ff; + ret += ssl_tlv_len; + } +#endif + +#ifdef USE_NS + if (remote && (remote->proxy_netns)) { + if ((buf_len - ret) < sizeof(struct tlv)) + return 0; + ret += make_tlv(&buf[ret], (buf_len - ret), PP2_TYPE_NETNS, remote->proxy_netns->name_len, remote->proxy_netns->node.key); + } +#endif + + hdr->len = htons((uint16_t)(ret - PP2_HEADER_LEN)); + + if (tlv_crc32c_p) { + write_u32(tlv_crc32c_p, htonl(hash_crc32c(buf, ret))); + } + + return ret; +} + +/* Note: <remote> is explicitly allowed to be NULL */ +int make_proxy_line(char *buf, int buf_len, struct server *srv, struct connection *remote, struct stream *strm) +{ + int ret = 0; + + if (srv && (srv->pp_opts & SRV_PP_V2)) { + ret = make_proxy_line_v2(buf, buf_len, srv, remote, strm); + } + else { + const struct sockaddr_storage *src = NULL; + const struct sockaddr_storage *dst = NULL; + + if (strm) { + src = sc_src(strm->scf); + dst = sc_dst(strm->scf); + } + else if (remote && conn_get_src(remote) && conn_get_dst(remote)) { + src = conn_src(remote); + dst = conn_dst(remote); + } + + if (src && dst) + ret = make_proxy_line_v1(buf, buf_len, src, dst); + else + ret = make_proxy_line_v1(buf, buf_len, NULL, NULL); + } + + return ret; +} + +/* returns 0 on success */ +static int cfg_parse_pp2_never_send_local(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(0, args, err, NULL)) + return -1; + pp2_never_send_local = 1; + return 0; +} + +/* extracts some info from the connection and appends them to buffer <buf>. The + * connection's pointer, its direction, target (fe/be/srv), xprt/ctrl, source + * when set, destination when set, are printed in a compact human-readable format + * fitting on a single line. This is handy to complete traces or debug output. + * It is permitted to pass a NULL conn pointer. The number of characters emitted + * is returned. A prefix <pfx> might be prepended before the first field if not + * NULL. + */ +int conn_append_debug_info(struct buffer *buf, const struct connection *conn, const char *pfx) +{ + const struct listener *li; + const struct server *sv; + const struct proxy *px; + char addr[40]; + int old_len = buf->data; + + if (!conn) + return 0; + + chunk_appendf(buf, "%sconn=%p(%s)", pfx ? pfx : "", conn, conn_is_back(conn) ? 
"OUT" : "IN"); + + if ((li = objt_listener(conn->target))) + chunk_appendf(buf, " fe=%s", li->bind_conf->frontend->id); + else if ((sv = objt_server(conn->target))) + chunk_appendf(buf, " sv=%s/%s", sv->proxy->id, sv->id); + else if ((px = objt_proxy(conn->target))) + chunk_appendf(buf, " be=%s", px->id); + + chunk_appendf(buf, " %s/%s", conn_get_xprt_name(conn), conn_get_ctrl_name(conn)); + + if (conn->src && addr_to_str(conn->src, addr, sizeof(addr))) + chunk_appendf(buf, " src=%s:%d", addr, get_host_port(conn->src)); + + if (conn->dst && addr_to_str(conn->dst, addr, sizeof(addr))) + chunk_appendf(buf, " dst=%s:%d", addr, get_host_port(conn->dst)); + + return buf->data - old_len; +} + +/* return the major HTTP version as 1 or 2 depending on how the request arrived + * before being processed. + * + * WARNING: Should be updated if a new major HTTP version is added. + */ +static int +smp_fetch_fc_http_major(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn = NULL; + const char *mux_name = NULL; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[0] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[0] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ? sc_conn(smp->strm->scb) : NULL; + + /* No connection or a connection with a RAW muxx */ + if (!conn || (conn->mux && !(conn->mux->flags & MX_FL_HTX))) + return 0; + + /* No mux install, this may change */ + if (!conn->mux) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + mux_name = conn_get_mux_name(conn); + + smp->data.type = SMP_T_SINT; + if (strcmp(mux_name, "QUIC") == 0) + smp->data.u.sint = 3; + else if (strcmp(mux_name, "H2") == 0) + smp->data.u.sint = 2; + else + smp->data.u.sint = 1; + + return 1; +} + +/* fetch if the received connection used a PROXY protocol header */ +int smp_fetch_fc_rcvd_proxy(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + + conn = objt_conn(smp->sess->origin); + if (!conn) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + smp->flags = 0; + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = (conn->flags & CO_FL_RCVD_PROXY) ? 1 : 0; + + return 1; +} + +/* + * This function checks the TLV type converter configuration. + * It expects the corresponding TLV type as a string representing the number + * or a constant. args[0] will be turned into the numerical value of the + * TLV type string. 
+ */ +static int smp_check_tlv_type(struct arg *args, char **err) +{ + int type; + char *endp; + struct ist input = ist2(args[0].data.str.area, args[0].data.str.data); + + if (isteqi(input, ist("ALPN")) != 0) + type = PP2_TYPE_ALPN; + else if (isteqi(input, ist("AUTHORITY")) != 0) + type = PP2_TYPE_AUTHORITY; + else if (isteqi(input, ist("CRC32C")) != 0) + type = PP2_TYPE_CRC32C; + else if (isteqi(input, ist("NOOP")) != 0) + type = PP2_TYPE_NOOP; + else if (isteqi(input, ist("UNIQUE_ID")) != 0) + type = PP2_TYPE_UNIQUE_ID; + else if (isteqi(input, ist("SSL")) != 0) + type = PP2_TYPE_SSL; + else if (isteqi(input, ist("SSL_VERSION")) != 0) + type = PP2_SUBTYPE_SSL_VERSION; + else if (isteqi(input, ist("SSL_CN")) != 0) + type = PP2_SUBTYPE_SSL_CN; + else if (isteqi(input, ist("SSL_CIPHER")) != 0) + type = PP2_SUBTYPE_SSL_CIPHER; + else if (isteqi(input, ist("SSL_SIG_ALG")) != 0) + type = PP2_SUBTYPE_SSL_SIG_ALG; + else if (isteqi(input, ist("SSL_KEY_ALG")) != 0) + type = PP2_SUBTYPE_SSL_KEY_ALG; + else if (isteqi(input, ist("NETNS")) != 0) + type = PP2_TYPE_NETNS; + else { + type = strtoul(input.ptr, &endp, 0); + if (endp && *endp != '\0') { + memprintf(err, "Could not convert type '%s'", input.ptr); + return 0; + } + } + + if (type < 0 || type > 255) { + memprintf(err, "Invalid TLV Type '%s'", input.ptr); + return 0; + } + + chunk_destroy(&args[0].data.str); + args[0].type = ARGT_SINT; + args[0].data.sint = type; + + return 1; +} + +/* fetch an arbitrary TLV from a PROXY protocol v2 header */ +int smp_fetch_fc_pp_tlv(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int idx; + struct connection *conn = NULL; + struct conn_tlv_list *conn_tlv = NULL; + + conn = objt_conn(smp->sess->origin); + if (!conn) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + if (args[0].type != ARGT_SINT) + return 0; + + idx = args[0].data.sint; + conn_tlv = smp->ctx.p ? smp->ctx.p : LIST_ELEM(conn->tlv_list.n, struct conn_tlv_list *, list); + list_for_each_entry_from(conn_tlv, &conn->tlv_list, list) { + if (conn_tlv->type == idx) { + smp->flags |= SMP_F_NOT_LAST; + smp->data.type = SMP_T_STR; + smp->data.u.str.area = conn_tlv->value; + smp->data.u.str.data = conn_tlv->len; + smp->ctx.p = conn_tlv; + + return 1; + } + } + + smp->flags &= ~SMP_F_NOT_LAST; + + return 0; +} + +/* fetch the authority TLV from a PROXY protocol header */ +int smp_fetch_fc_pp_authority(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct arg tlv_arg; + int ret; + + set_tlv_arg(PP2_TYPE_AUTHORITY, &tlv_arg); + ret = smp_fetch_fc_pp_tlv(&tlv_arg, smp, kw, private); + smp->flags &= ~SMP_F_NOT_LAST; // return only the first authority + return ret; +} + +/* fetch the unique ID TLV from a PROXY protocol header */ +int smp_fetch_fc_pp_unique_id(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct arg tlv_arg; + int ret; + + set_tlv_arg(PP2_TYPE_UNIQUE_ID, &tlv_arg); + ret = smp_fetch_fc_pp_tlv(&tlv_arg, smp, kw, private); + smp->flags &= ~SMP_F_NOT_LAST; // return only the first unique ID + return ret; +} + +/* fetch the error code of a connection */ +int smp_fetch_fc_err(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[0] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[0] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ? 
sc_conn(smp->strm->scb) : NULL; + + if (!conn) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + smp->flags = 0; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = (unsigned long long int)conn->err_code; + + return 1; +} + +/* fetch a string representation of the error code of a connection */ +int smp_fetch_fc_err_str(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + const char *err_code_str; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[0] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[0] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ? sc_conn(smp->strm->scb) : NULL; + + if (!conn) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + err_code_str = conn_err_code_str(conn); + + if (!err_code_str) + return 0; + + smp->flags = 0; + smp->data.type = SMP_T_STR; + smp->data.u.str.area = (char*)err_code_str; + smp->data.u.str.data = strlen(err_code_str); + + return 1; +} + +/* Note: must not be declared <const> as its list will be overwritten. + * Note: fetches that may return multiple types should be declared using the + * appropriate pseudo-type. If not available it must be declared as the lowest + * common denominator, the type that can be casted into all other ones. + */ +static struct sample_fetch_kw_list sample_fetch_keywords = {ILH, { + { "bc_err", smp_fetch_fc_err, 0, NULL, SMP_T_SINT, SMP_USE_L4SRV }, + { "bc_err_str", smp_fetch_fc_err_str, 0, NULL, SMP_T_STR, SMP_USE_L4SRV }, + { "bc_http_major", smp_fetch_fc_http_major, 0, NULL, SMP_T_SINT, SMP_USE_L4SRV }, + { "fc_err", smp_fetch_fc_err, 0, NULL, SMP_T_SINT, SMP_USE_L4CLI }, + { "fc_err_str", smp_fetch_fc_err_str, 0, NULL, SMP_T_STR, SMP_USE_L4CLI }, + { "fc_http_major", smp_fetch_fc_http_major, 0, NULL, SMP_T_SINT, SMP_USE_L4CLI }, + { "fc_rcvd_proxy", smp_fetch_fc_rcvd_proxy, 0, NULL, SMP_T_BOOL, SMP_USE_L4CLI }, + { "fc_pp_authority", smp_fetch_fc_pp_authority, 0, NULL, SMP_T_STR, SMP_USE_L4CLI }, + { "fc_pp_unique_id", smp_fetch_fc_pp_unique_id, 0, NULL, SMP_T_STR, SMP_USE_L4CLI }, + { "fc_pp_tlv", smp_fetch_fc_pp_tlv, ARG1(1, STR), smp_check_tlv_type, SMP_T_STR, SMP_USE_L4CLI }, + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &sample_fetch_keywords); + +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_GLOBAL, "pp2-never-send-local", cfg_parse_pp2_never_send_local }, + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); + +/* private function to handle sockaddr as input for connection hash */ +static void conn_calculate_hash_sockaddr(const struct sockaddr_storage *ss, + char *buf, size_t *idx, + enum conn_hash_params_t *hash_flags, + enum conn_hash_params_t param_type_addr, + enum conn_hash_params_t param_type_port) +{ + struct sockaddr_in *addr; + struct sockaddr_in6 *addr6; + + switch (ss->ss_family) { + case AF_INET: + addr = (struct sockaddr_in *)ss; + + conn_hash_update(buf, idx, + &addr->sin_addr, sizeof(addr->sin_addr), + hash_flags, param_type_addr); + + if (addr->sin_port) { + conn_hash_update(buf, idx, + &addr->sin_port, sizeof(addr->sin_port), + hash_flags, param_type_port); + } + + break; + + case AF_INET6: + addr6 = (struct sockaddr_in6 *)ss; + + conn_hash_update(buf, idx, + &addr6->sin6_addr, sizeof(addr6->sin6_addr), + hash_flags, param_type_addr); + + if (addr6->sin6_port) { + 
conn_hash_update(buf, idx, + &addr6->sin6_port, sizeof(addr6->sin6_port), + hash_flags, param_type_port); + } + + break; + } +} + +/* Generate the hash of a connection with params as input + * Each non-null field of params is taken into account for the hash calculation. + */ +uint64_t conn_hash_prehash(char *buf, size_t size) +{ + return XXH64(buf, size, 0); +} + +/* Append <data> into <buf> at <idx> offset in preparation for connection hash + * calculation. <idx> is incremented beyond data <size>. At the same time, <flags> + * are updated with <type> for the hash header. + */ +void conn_hash_update(char *buf, size_t *idx, + const void *data, size_t size, + enum conn_hash_params_t *flags, + enum conn_hash_params_t type) +{ + memcpy(&buf[*idx], data, size); + *idx += size; + *flags |= type; +} + +uint64_t conn_hash_digest(char *buf, size_t bufsize, + enum conn_hash_params_t flags) +{ + const uint64_t flags_u64 = (uint64_t)flags; + const uint64_t hash = XXH64(buf, bufsize, 0); + + return (flags_u64 << CONN_HASH_PAYLOAD_LEN) | CONN_HASH_GET_PAYLOAD(hash); +} + +uint64_t conn_calculate_hash(const struct conn_hash_params *params) +{ + char *buf; + size_t idx = 0; + uint64_t hash = 0; + enum conn_hash_params_t hash_flags = 0; + + buf = trash.area; + + conn_hash_update(buf, &idx, &params->target, sizeof(params->target), &hash_flags, 0); + + if (params->sni_prehash) { + conn_hash_update(buf, &idx, + &params->sni_prehash, sizeof(params->sni_prehash), + &hash_flags, CONN_HASH_PARAMS_TYPE_SNI); + } + + if (params->dst_addr) { + conn_calculate_hash_sockaddr(params->dst_addr, + buf, &idx, &hash_flags, + CONN_HASH_PARAMS_TYPE_DST_ADDR, + CONN_HASH_PARAMS_TYPE_DST_PORT); + } + + if (params->src_addr) { + conn_calculate_hash_sockaddr(params->src_addr, + buf, &idx, &hash_flags, + CONN_HASH_PARAMS_TYPE_SRC_ADDR, + CONN_HASH_PARAMS_TYPE_SRC_PORT); + } + + if (params->proxy_prehash) { + conn_hash_update(buf, &idx, + &params->proxy_prehash, sizeof(params->proxy_prehash), + &hash_flags, CONN_HASH_PARAMS_TYPE_PROXY); + } + + hash = conn_hash_digest(buf, idx, hash_flags); + return hash; +} + +/* Reverse a <conn> connection instance. This effectively moves the connection + * from frontend to backend side or vice-versa depending on its initial status. + * + * For active reversal, the 'reverse' member points to the listener used as the + * new connection target. Once the transition is completed, the connection needs + * to be accepted on the listener to instantiate its parent session before using + * streams. + * + * For passive reversal, the 'reverse' member points to the server used as the + * new connection target. Once the transition is completed, the connection + * appears as a normal backend connection. + * + * Returns 0 on success else non-zero. + */ +int conn_reverse(struct connection *conn) +{ + struct conn_hash_params hash_params; + int64_t hash = 0; + struct session *sess = conn->owner; + + if (!conn_is_back(conn)) { + /* srv must have been set by a previous 'attach-srv' rule. */ + struct server *srv = objt_server(conn->reverse.target); + BUG_ON(!srv); + + if (conn_backend_init(conn)) + return 1; + + /* Initialize hash value for usage as idle conns.
*/ + memset(&hash_params, 0, sizeof(hash_params)); + hash_params.target = srv; + + if (b_data(&conn->reverse.name)) { + /* data cannot wrap else prehash usage is incorrect */ + BUG_ON(b_data(&conn->reverse.name) != b_contig_data(&conn->reverse.name, 0)); + + hash_params.sni_prehash = + conn_hash_prehash(b_head(&conn->reverse.name), + b_data(&conn->reverse.name)); + } + + hash = conn_calculate_hash(&hash_params); + conn->hash_node->node.key = hash; + + conn->target = &srv->obj_type; + srv_use_conn(srv, conn); + + /* Free the session after detaching the connection from it. */ + session_unown_conn(sess, conn); + sess->origin = NULL; + session_free(sess); + conn_set_owner(conn, NULL, NULL); + + conn->flags |= CO_FL_REVERSED; + } + else { + /* Wake up receiver to proceed to connection accept. */ + struct listener *l = __objt_listener(conn->reverse.target); + + conn_backend_deinit(conn); + + conn->target = &l->obj_type; + conn->flags |= CO_FL_ACT_REVERSING; + task_wakeup(l->rx.rhttp.task, TASK_WOKEN_ANY); + } + + /* Invert source and destination addresses if already set. */ + SWAP(conn->src, conn->dst); + + conn->reverse.target = NULL; + ha_free(&conn->reverse.name.area); + conn->reverse.name = BUF_NULL; + + return 0; +} + +/* Handler of the task of mux_stopping_data. + * Called on soft-stop. + */ +static struct task *mux_stopping_process(struct task *t, void *ctx, unsigned int state) +{ + struct connection *conn, *back; + + list_for_each_entry_safe(conn, back, &mux_stopping_data[tid].list, stopping_list) { + if (conn->mux && conn->mux->wake) + conn->mux->wake(conn); + } + + return t; +} + +static int allocate_mux_cleanup(void) +{ + /* allocates the thread bound mux_stopping_data task */ + mux_stopping_data[tid].task = task_new_here(); + if (!mux_stopping_data[tid].task) { + ha_alert("Failed to allocate the task for connection cleanup on thread %d.\n", tid); + return 0; + } + + mux_stopping_data[tid].task->process = mux_stopping_process; + LIST_INIT(&mux_stopping_data[tid].list); + + return 1; +} +REGISTER_PER_THREAD_ALLOC(allocate_mux_cleanup); + +static int deallocate_mux_cleanup(void) +{ + task_destroy(mux_stopping_data[tid].task); + return 1; +} +REGISTER_PER_THREAD_FREE(deallocate_mux_cleanup); + +static void deinit_idle_conns(void) +{ + int i; + + for (i = 0; i < global.nbthread; i++) { + task_destroy(idle_conns[i].cleanup_task); + } +} +REGISTER_POST_DEINIT(deinit_idle_conns); diff --git a/src/cpuset.c b/src/cpuset.c new file mode 100644 index 0000000..82e350f --- /dev/null +++ b/src/cpuset.c @@ -0,0 +1,296 @@ +#define _GNU_SOURCE +#include <sched.h> +#include <ctype.h> + +#include <haproxy/compat.h> +#include <haproxy/cpuset.h> +#include <haproxy/intops.h> +#include <haproxy/tools.h> + +struct cpu_map *cpu_map; + +void ha_cpuset_zero(struct hap_cpuset *set) +{ +#if defined(CPUSET_USE_CPUSET) || defined(CPUSET_USE_FREEBSD_CPUSET) + CPU_ZERO(&set->cpuset); + +#elif defined(CPUSET_USE_ULONG) + set->cpuset = 0; +#endif +} + +int ha_cpuset_set(struct hap_cpuset *set, int cpu) +{ + if (cpu >= ha_cpuset_size()) + return 1; + +#if defined(CPUSET_USE_CPUSET) || defined(CPUSET_USE_FREEBSD_CPUSET) + CPU_SET(cpu, &set->cpuset); + return 0; + +#elif defined(CPUSET_USE_ULONG) + set->cpuset |= (0x1 << cpu); + return 0; +#endif +} + +int ha_cpuset_clr(struct hap_cpuset *set, int cpu) +{ + if (cpu >= ha_cpuset_size()) + return 1; + +#if defined(CPUSET_USE_CPUSET) || defined(CPUSET_USE_FREEBSD_CPUSET) + CPU_CLR(cpu, &set->cpuset); + return 0; + +#elif defined(CPUSET_USE_ULONG) + set->cpuset &= ~(0x1 << 
diff --git a/src/cpuset.c b/src/cpuset.c
new file mode 100644
index 0000000..82e350f
--- /dev/null
+++ b/src/cpuset.c
@@ -0,0 +1,296 @@
+#define _GNU_SOURCE
+#include <sched.h>
+#include <ctype.h>
+
+#include <haproxy/compat.h>
+#include <haproxy/cpuset.h>
+#include <haproxy/intops.h>
+#include <haproxy/tools.h>
+
+struct cpu_map *cpu_map;
+
+void ha_cpuset_zero(struct hap_cpuset *set)
+{
+#if defined(CPUSET_USE_CPUSET) || defined(CPUSET_USE_FREEBSD_CPUSET)
+	CPU_ZERO(&set->cpuset);
+
+#elif defined(CPUSET_USE_ULONG)
+	set->cpuset = 0;
+#endif
+}
+
+int ha_cpuset_set(struct hap_cpuset *set, int cpu)
+{
+	if (cpu >= ha_cpuset_size())
+		return 1;
+
+#if defined(CPUSET_USE_CPUSET) || defined(CPUSET_USE_FREEBSD_CPUSET)
+	CPU_SET(cpu, &set->cpuset);
+	return 0;
+
+#elif defined(CPUSET_USE_ULONG)
+	/* 1UL, not 0x1: the mask is an unsigned long and <cpu> may exceed
+	 * the width of an int shift
+	 */
+	set->cpuset |= (1UL << cpu);
+	return 0;
+#endif
+}
+
+int ha_cpuset_clr(struct hap_cpuset *set, int cpu)
+{
+	if (cpu >= ha_cpuset_size())
+		return 1;
+
+#if defined(CPUSET_USE_CPUSET) || defined(CPUSET_USE_FREEBSD_CPUSET)
+	CPU_CLR(cpu, &set->cpuset);
+	return 0;
+
+#elif defined(CPUSET_USE_ULONG)
+	set->cpuset &= ~(1UL << cpu);
+	return 0;
+#endif
+}
+
+void ha_cpuset_and(struct hap_cpuset *dst, struct hap_cpuset *src)
+{
+#if defined(CPUSET_USE_CPUSET)
+	CPU_AND(&dst->cpuset, &dst->cpuset, &src->cpuset);
+
+#elif defined(CPUSET_USE_FREEBSD_CPUSET)
+	CPU_AND(&dst->cpuset, &src->cpuset);
+
+#elif defined(CPUSET_USE_ULONG)
+	dst->cpuset &= src->cpuset;
+#endif
+}
+
+void ha_cpuset_or(struct hap_cpuset *dst, struct hap_cpuset *src)
+{
+#if defined(CPUSET_USE_CPUSET)
+	CPU_OR(&dst->cpuset, &dst->cpuset, &src->cpuset);
+
+#elif defined(CPUSET_USE_FREEBSD_CPUSET)
+	CPU_OR(&dst->cpuset, &src->cpuset);
+
+#elif defined(CPUSET_USE_ULONG)
+	dst->cpuset |= src->cpuset;
+#endif
+}
+
+int ha_cpuset_isset(const struct hap_cpuset *set, int cpu)
+{
+	if (cpu >= ha_cpuset_size())
+		return 0;
+
+#if defined(CPUSET_USE_CPUSET) || defined(CPUSET_USE_FREEBSD_CPUSET)
+	return CPU_ISSET(cpu, &set->cpuset);
+
+#elif defined(CPUSET_USE_ULONG)
+	return !!(set->cpuset & (1UL << cpu));
+#else
+	return 0;
+#endif
+}
+
+int ha_cpuset_count(const struct hap_cpuset *set)
+{
+#if defined(CPUSET_USE_CPUSET) || defined(CPUSET_USE_FREEBSD_CPUSET)
+	return CPU_COUNT(&set->cpuset);
+
+#elif defined(CPUSET_USE_ULONG)
+	return my_popcountl(set->cpuset);
+#endif
+}
+
+int ha_cpuset_ffs(const struct hap_cpuset *set)
+{
+#if defined(CPUSET_USE_CPUSET)
+	int n;
+
+	if (!CPU_COUNT(&set->cpuset))
+		return 0;
+
+	for (n = 0; !CPU_ISSET(n, &set->cpuset); ++n)
+		;
+
+	return n + 1;
+
+#elif defined(CPUSET_USE_FREEBSD_CPUSET)
+	return CPU_FFS(&set->cpuset);
+
+#elif defined(CPUSET_USE_ULONG)
+	if (!set->cpuset)
+		return 0;
+
+	return my_ffsl(set->cpuset);
+#endif
+}
+
+void ha_cpuset_assign(struct hap_cpuset *dst, struct hap_cpuset *src)
+{
+#if defined(CPUSET_USE_CPUSET)
+	CPU_ZERO(&dst->cpuset);
+	CPU_OR(&dst->cpuset, &dst->cpuset, &src->cpuset);
+
+#elif defined(CPUSET_USE_FREEBSD_CPUSET)
+	CPU_COPY(&src->cpuset, &dst->cpuset);
+
+#elif defined(CPUSET_USE_ULONG)
+	dst->cpuset = src->cpuset;
+#endif
+}
+
+int ha_cpuset_size()
+{
+#if defined(CPUSET_USE_CPUSET) || defined(CPUSET_USE_FREEBSD_CPUSET)
+	return CPU_SETSIZE;
+
+#elif defined(CPUSET_USE_ULONG)
+	return LONGBITS;
+
+#endif
+}
+
+/* Detects CPUs that are bound to the current process. Returns the number of
+ * CPUs detected or 0 if the detection failed.
+ */
+int ha_cpuset_detect_bound(struct hap_cpuset *set)
+{
+	ha_cpuset_zero(set);
+
+	/* detect bound CPUs depending on the OS's API */
+	if (0
+#if defined(__linux__)
+	    || sched_getaffinity(0, sizeof(set->cpuset), &set->cpuset) != 0
+#elif defined(__FreeBSD__)
+	    || cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, sizeof(set->cpuset), &set->cpuset) != 0
+#else
+	    || 1 // unhandled platform
+#endif
+	    ) {
+		/* detection failed */
+		return 0;
+	}
+
+	return ha_cpuset_count(set);
+}
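/* [Editor's note] A minimal sketch, not part of the patch, combining the
 * hap_cpuset helpers above: build a wanted CPU set, restrict it to the CPUs
 * the process is actually bound to, and pick the first usable one. The ex_*
 * name is hypothetical. */
static int ex_first_usable_cpu(void)
{
	struct hap_cpuset bound, wanted;

	ha_cpuset_zero(&wanted);
	ha_cpuset_set(&wanted, 0);
	ha_cpuset_set(&wanted, 2);

	if (ha_cpuset_detect_bound(&bound))
		ha_cpuset_and(&wanted, &bound); /* keep only bound CPUs */

	/* ha_cpuset_ffs() is 1-based and returns 0 when the set is empty */
	return ha_cpuset_ffs(&wanted) - 1;
}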
+/* Parse cpu sets. Each CPU set is either a unique number between 0 and
+ * ha_cpuset_size() - 1 or a range with two such numbers delimited by a dash
+ * ('-'). Each CPU set can be a list of unique numbers or ranges separated by
+ * a comma. It is also possible to specify multiple cpu numbers or ranges as
+ * distinct arguments in <args>. On success, it returns 0, otherwise it
+ * returns 1, optionally with an error message in <err> if <err> is not NULL.
+ */
+int parse_cpu_set(const char **args, struct hap_cpuset *cpu_set, char **err)
+{
+	int cur_arg = 0;
+	const char *arg;
+
+	ha_cpuset_zero(cpu_set);
+
+	arg = args[cur_arg];
+	while (*arg) {
+		const char *dash, *comma;
+		unsigned int low, high;
+
+		/* check the current parsing position, which may be past a
+		 * comma inside the current argument
+		 */
+		if (!isdigit((unsigned char)*arg)) {
+			memprintf(err, "'%s' is not a CPU range.", arg);
+			return 1;
+		}
+
+		low = high = str2uic(arg);
+
+		comma = strchr(arg, ',');
+		dash = strchr(arg, '-');
+
+		if (dash && (!comma || dash < comma))
+			high = *(dash+1) ? str2uic(dash + 1) : ha_cpuset_size() - 1;
+
+		if (high < low) {
+			unsigned int swap = low;
+			low = high;
+			high = swap;
+		}
+
+		if (high >= ha_cpuset_size()) {
+			memprintf(err, "supports CPU numbers from 0 to %d.",
+			          ha_cpuset_size() - 1);
+			return 1;
+		}
+
+		while (low <= high)
+			ha_cpuset_set(cpu_set, low++);
+
+		/* if a comma is present, parse the rest of the arg, else
+		 * skip to the next arg */
+		arg = comma ? comma + 1 : args[++cur_arg];
+	}
+	return 0;
+}
+
+/* Parse a Linux cpu map string into a numeric cpu mask. The cpu map string
+ * is a list of 8-character (32-bit) hex words separated by commas, with the
+ * most significant word first, one bit per cpu number.
+ */
+void parse_cpumap(char *cpumap_str, struct hap_cpuset *cpu_set)
+{
+	unsigned long cpumap;
+	char *start, *endptr, *comma;
+	int i, j;
+
+	ha_cpuset_zero(cpu_set);
+
+	i = 0;
+	do {
+		/* reverse-search for a comma, parse the string after the comma
+		 * or at the beginning if no comma found
+		 */
+		comma = strrchr(cpumap_str, ',');
+		start = comma ? comma + 1 : cpumap_str;
+
+		cpumap = strtoul(start, &endptr, 16);
+		for (j = 0; cpumap; cpumap >>= 1, ++j) {
+			if (cpumap & 0x1)
+				ha_cpuset_set(cpu_set, j + i * 32);
+		}
+
+		if (comma)
+			*comma = '\0';
+		++i;
+	} while (comma);
+}
+
+/* Returns true if at least one cpu-map directive was configured, otherwise
+ * false.
+ */
+int cpu_map_configured(void)
+{
+	int grp, thr;
+
+	for (grp = 0; grp < MAX_TGROUPS; grp++) {
+		for (thr = 0; thr < MAX_THREADS_PER_GROUP; thr++)
+			if (ha_cpuset_count(&cpu_map[grp].thread[thr]))
+				return 1;
+	}
+	return 0;
+}
+
+/* Allocates everything needed to store CPU information at boot.
+ * Returns non-zero on success, zero on failure.
+ */
+static int cpuset_alloc(void)
+{
+	/* allocate the structures used to store CPU topology info */
+	cpu_map = (struct cpu_map*)calloc(MAX_TGROUPS, sizeof(*cpu_map));
+	if (!cpu_map)
+		return 0;
+
+	return 1;
+}
+
+static void cpuset_deinit(void)
+{
+	ha_free(&cpu_map);
+}
+
+INITCALL0(STG_ALLOC, cpuset_alloc);
+REGISTER_POST_DEINIT(cpuset_deinit);
diff --git a/src/debug.c b/src/debug.c
new file mode 100644
index 0000000..fbaad80
--- /dev/null
+++ b/src/debug.c
@@ -0,0 +1,2301 @@
+/*
+ * Process debugging functions.
+ *
+ * Copyright 2000-2019 Willy Tarreau <willy@haproxy.org>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ * + */ + + +#include <errno.h> +#include <fcntl.h> +#include <signal.h> +#include <time.h> +#include <stdio.h> +#include <stdlib.h> +#include <syslog.h> +#include <sys/resource.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/utsname.h> +#include <sys/wait.h> +#include <unistd.h> +#ifdef USE_EPOLL +#include <sys/epoll.h> +#endif + +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/buf.h> +#include <haproxy/cli.h> +#include <haproxy/clock.h> +#include <haproxy/debug.h> +#include <haproxy/fd.h> +#include <haproxy/global.h> +#include <haproxy/hlua.h> +#include <haproxy/http_ana.h> +#include <haproxy/log.h> +#include <haproxy/net_helper.h> +#include <haproxy/sc_strm.h> +#include <haproxy/stconn.h> +#include <haproxy/task.h> +#include <haproxy/thread.h> +#include <haproxy/time.h> +#include <haproxy/tools.h> +#include <import/ist.h> + + +/* The dump state is made of: + * - num_thread on the lowest 15 bits + * - a SYNC flag on bit 15 (waiting for sync start) + * - number of participating threads on bits 16-30 + * Initiating a dump consists in setting it to SYNC and incrementing the + * num_thread part when entering the function. The first thread periodically + * recounts active threads and compares it to the ready ones, and clears SYNC + * and sets the number of participants to the value found, which serves as a + * start signal. A thread finished dumping looks up the TID of the next active + * thread after it and writes it in the lowest part. If there's none, it sets + * the thread counter to the number of participants and resets that part, + * which serves as an end-of-dump signal. All threads decrement the num_thread + * part. Then all threads wait for the value to reach zero. Only used when + * USE_THREAD_DUMP is set. + */ +#define THREAD_DUMP_TMASK 0x00007FFFU +#define THREAD_DUMP_FSYNC 0x00008000U +#define THREAD_DUMP_PMASK 0x7FFF0000U + +/* Description of a component with name, version, path, build options etc. E.g. + * one of them is haproxy. Others might be some clearly identified shared libs. + * They're intentionally self-contained and to be placed into an array to make + * it easier to find them in a core. The important fields (name and version) + * are locally allocated, other ones are dynamic. + */ +struct post_mortem_component { + char name[32]; // symbolic short name + char version[32]; // exact version + char *toolchain; // compiler and version (e.g. gcc-11.4.0) + char *toolchain_opts; // optims, arch-specific options (e.g. CFLAGS) + char *build_settings; // build options (e.g. USE_*, TARGET, etc) + char *path; // path if known. +}; + +/* This is a collection of information that are centralized to help with core + * dump analysis. It must be used with a public variable and gather elements + * as much as possible without dereferences so that even when identified in a + * core dump it's possible to get the most out of it even if the core file is + * not much exploitable. It's aligned to 256 so that it's easy to spot, given + * that being that large it will not change its size much. 
+ */ +struct post_mortem { + /* platform-specific information */ + struct { + struct utsname utsname; // OS name+ver+arch+hostname + char hw_vendor[64]; // hardware/hypervisor vendor when known + char hw_family[64]; // hardware/hypervisor product family when known + char hw_model[64]; // hardware/hypervisor product/model when known + char brd_vendor[64]; // mainboard vendor when known + char brd_model[64]; // mainboard model when known + char soc_vendor[64]; // SoC/CPU vendor from cpuinfo + char soc_model[64]; // SoC model when known and relevant + char cpu_model[64]; // CPU model when different from SoC + char virt_techno[16]; // when provided by cpuid + char cont_techno[16]; // empty, "no", "yes", "docker" or others + } platform; + + /* process-specific information */ + struct { + pid_t pid; + uid_t boot_uid; + gid_t boot_gid; + struct rlimit limit_fd; // RLIMIT_NOFILE + struct rlimit limit_ram; // RLIMIT_AS or RLIMIT_DATA + +#if defined(USE_THREAD) + struct { + ullong pth_id; // pthread_t cast to a ullong + void *stack_top; // top of the stack + } thread_info[MAX_THREADS]; +#endif + } process; + +#if defined(HA_HAVE_DUMP_LIBS) + /* information about dynamic shared libraries involved */ + char *libs; // dump of one addr / path per line, or NULL +#endif + + /* info about identified distinct components (executable, shared libs, etc). + * These can be all listed at once in gdb using: + * p *post_mortem.components@post_mortem.nb_components + */ + uint nb_components; // # of components below + struct post_mortem_component *components; // NULL or array +} post_mortem ALIGNED(256) = { }; + +/* Points to a copy of the buffer where the dump functions should write, when + * non-null. It's only used by debuggers for core dump analysis. + */ +struct buffer *thread_dump_buffer = NULL; +unsigned int debug_commands_issued = 0; + +/* dumps a backtrace of the current thread that is appended to buffer <buf>. + * Lines are prefixed with the string <prefix> which may be empty (used for + * indenting). It is recommended to use this at a function's tail so that + * the function does not appear in the call stack. The <dump> argument + * indicates what dump state to start from, and should usually be zero. It + * may be among the following values: + * - 0: search usual callers before step 1, or directly jump to 2 + * - 1: skip usual callers before step 2 + * - 2: dump until polling loop, scheduler, or main() (excluded) + * - 3: end + * - 4-7: like 0 but stops *after* main. + */ +void ha_dump_backtrace(struct buffer *buf, const char *prefix, int dump) +{ + struct buffer bak; + char pfx2[100]; + void *callers[100]; + int j, nptrs; + const void *addr; + + nptrs = my_backtrace(callers, sizeof(callers)/sizeof(*callers)); + if (!nptrs) + return; + + if (snprintf(pfx2, sizeof(pfx2), "%s| ", prefix) > sizeof(pfx2)) + pfx2[0] = 0; + + /* The call backtrace_symbols_fd(callers, nptrs, STDOUT_FILENO would + * produce similar output to the following: + */ + chunk_appendf(buf, "%scall trace(%d):\n", prefix, nptrs); + for (j = 0; (j < nptrs || (dump & 3) < 2); j++) { + if (j == nptrs && !(dump & 3)) { + /* we failed to spot the starting point of the + * dump, let's start over dumping everything we + * have. 
+			 */
+			dump += 2;
+			j = 0;
+		}
+		bak = *buf;
+		dump_addr_and_bytes(buf, pfx2, callers[j], 8);
+		addr = resolve_sym_name(buf, ": ", callers[j]);
+		if ((dump & 3) == 0) {
+			/* dump not started, will start *after* ha_thread_dump_one(),
+			 * ha_panic and ha_backtrace_to_stderr
+			 */
+			if (addr == ha_panic ||
+			    addr == ha_backtrace_to_stderr || addr == ha_thread_dump_one)
+				dump++;
+			*buf = bak;
+			continue;
+		}
+
+		if ((dump & 3) == 1) {
+			/* starting */
+			if (addr == ha_panic ||
+			    addr == ha_backtrace_to_stderr || addr == ha_thread_dump_one) {
+				*buf = bak;
+				continue;
+			}
+			dump++;
+		}
+
+		if ((dump & 3) == 2) {
+			/* still dumping */
+			if (dump == 6) {
+				/* we only stop *after* main and we must send the LF */
+				if (addr == main) {
+					j = nptrs;
+					dump++;
+				}
+			}
+			else if (addr == run_poll_loop || addr == main || addr == run_tasks_from_lists) {
+				dump++;
+				*buf = bak;
+				break;
+			}
+		}
+		/* OK, line dumped */
+		chunk_appendf(buf, "\n");
+	}
+}
+
+/* dump a backtrace of the current thread's stack to stderr. */
+void ha_backtrace_to_stderr(void)
+{
+	char area[2048];
+	struct buffer b = b_make(area, sizeof(area), 0, 0);
+
+	ha_dump_backtrace(&b, " ", 4);
+	if (b.data)
+		DISGUISE(write(2, b.area, b.data));
+}
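/* [Editor's note] A minimal sketch, not part of the patch, of driving
 * ha_dump_backtrace() directly, much like ha_backtrace_to_stderr() above but
 * sending the result to the log instead of fd #2. Passing 4 as <dump> keeps
 * every frame and only stops after main(); ex_log_backtrace is a
 * hypothetical name. */
static void ex_log_backtrace(void)
{
	char area[2048];
	struct buffer b = b_make(area, sizeof(area), 0, 0);

	ha_dump_backtrace(&b, "bt: ", 4);
	if (b.data)
		send_log(NULL, LOG_DEBUG, "%.*s", (int)b.data, b.area);
}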
+/* Dumps to the thread's buffer some known information for the desired thread,
+ * and optionally extra info when it's safe to do so (current thread or
+ * isolated). The dump will be appended to the buffer, so the caller is
+ * responsible for preliminary initializing it. The <from_signal> argument
+ * indicates that the function is called from the debug signal handler,
+ * meaning the thread was dumped upon request from another one; otherwise, if
+ * the thread is the current one, a star ('*') will be displayed in front of
+ * the thread to indicate the requesting one. Any stuck thread is also
+ * prefixed with a '>'. The caller is responsible for atomically setting up
+ * the thread's dump buffer to point to a valid buffer with enough room.
+ * Output will be truncated if it does not fit. When the dump is complete,
+ * the dump buffer will be switched to (void*)0x1 that the caller must turn
+ * to 0x0 once the contents are collected.
+ */
+void ha_thread_dump_one(int thr, int from_signal)
+{
+	struct buffer *buf = HA_ATOMIC_LOAD(&ha_thread_ctx[thr].thread_dump_buffer);
+	unsigned long __maybe_unused thr_bit = ha_thread_info[thr].ltid_bit;
+	int __maybe_unused tgrp = ha_thread_info[thr].tgid;
+	unsigned long long p = ha_thread_ctx[thr].prev_cpu_time;
+	unsigned long long n = now_cpu_time_thread(thr);
+	int stuck = !!(ha_thread_ctx[thr].flags & TH_FL_STUCK);
+
+	chunk_appendf(buf,
+	              "%c%cThread %-2u: id=0x%llx act=%d glob=%d wq=%d rq=%d tl=%d tlsz=%d rqsz=%d\n"
+	              " %2u/%-2u stuck=%d prof=%d",
+	              (thr == tid && !from_signal) ? '*' : ' ', stuck ? '>' : ' ', thr + 1,
+	              ha_get_pthread_id(thr),
+	              thread_has_tasks(),
+	              !eb_is_empty(&ha_thread_ctx[thr].rqueue_shared),
+	              !eb_is_empty(&ha_thread_ctx[thr].timers),
+	              !eb_is_empty(&ha_thread_ctx[thr].rqueue),
+	              !(LIST_ISEMPTY(&ha_thread_ctx[thr].tasklets[TL_URGENT]) &&
+	                LIST_ISEMPTY(&ha_thread_ctx[thr].tasklets[TL_NORMAL]) &&
+	                LIST_ISEMPTY(&ha_thread_ctx[thr].tasklets[TL_BULK]) &&
+	                MT_LIST_ISEMPTY(&ha_thread_ctx[thr].shared_tasklet_list)),
+	              ha_thread_ctx[thr].tasks_in_list,
+	              ha_thread_ctx[thr].rq_total,
+	              ha_thread_info[thr].tgid, ha_thread_info[thr].ltid + 1,
+	              stuck,
+	              !!(ha_thread_ctx[thr].flags & TH_FL_TASK_PROFILING));
+
+#if defined(USE_THREAD)
+	chunk_appendf(buf,
+	              " harmless=%d isolated=%d",
+	              !!(_HA_ATOMIC_LOAD(&ha_tgroup_ctx[tgrp-1].threads_harmless) & thr_bit),
+	              isolated_thread == thr);
+#endif
+
+	chunk_appendf(buf, "\n");
+	chunk_appendf(buf, " cpu_ns: poll=%llu now=%llu diff=%llu\n", p, n, n-p);
+
+	/* this is the end of what we can dump from outside the current thread */
+
+	if (thr != tid && !thread_isolated())
+		goto leave;
+
+	chunk_appendf(buf, " curr_task=");
+	ha_task_dump(buf, th_ctx->current, " ");
+
+	if (stuck && thr == tid) {
+#ifdef USE_LUA
+		if (th_ctx->current &&
+		    th_ctx->current->process == process_stream && th_ctx->current->context) {
+			const struct stream *s = (const struct stream *)th_ctx->current->context;
+			struct hlua *hlua = s ? s->hlua : NULL;
+
+			if (hlua && hlua->T) {
+				mark_tainted(TAINTED_LUA_STUCK);
+				if (hlua->state_id == 0)
+					mark_tainted(TAINTED_LUA_STUCK_SHARED);
+			}
+		}
+#endif
+
+		if (HA_ATOMIC_LOAD(&pool_trim_in_progress))
+			mark_tainted(TAINTED_MEM_TRIMMING_STUCK);
+
+		/* We only emit the backtrace for stuck threads in order not to
+		 * waste precious output buffer space with non-interesting data.
+		 * Please leave this as the last instruction in this function
+		 * so that the compiler uses tail merging and the current
+		 * function does not appear in the stack.
+		 */
+		ha_dump_backtrace(buf, " ", 0);
+	}
+ leave:
+	/* end of dump, setting the buffer to 0x1 will tell the caller we're done */
+	HA_ATOMIC_STORE(&ha_thread_ctx[thr].thread_dump_buffer, (void*)0x1UL);
+}
+
+/* Triggers a thread dump from thread <thr>, either directly if it's the
+ * current thread or if thread dump signals are not implemented, or by sending
+ * a signal if it's a remote one and the feature is supported. The buffer <buf>
+ * will get the dump appended, and the caller is responsible for making sure
+ * there is enough room, otherwise some contents will be truncated.
+ */
+void ha_thread_dump(struct buffer *buf, int thr)
+{
+	struct buffer *old = NULL;
+
+	/* try to impose our dump buffer and to reserve the target thread's
+	 * next dump for us.
+	 */
+	do {
+		if (old)
+			ha_thread_relax();
+		old = NULL;
+	} while (!HA_ATOMIC_CAS(&ha_thread_ctx[thr].thread_dump_buffer, &old, buf));
+
+#ifdef USE_THREAD_DUMP
+	/* asking the remote thread to dump itself allows us to get more
+	 * details, including a backtrace.
+	 */
+	if (thr != tid)
+		ha_tkill(thr, DEBUGSIG);
+	else
+#endif
+		ha_thread_dump_one(thr, thr != tid);
+
+	/* now wait for the dump to be done, and release it */
+	do {
+		if (old)
+			ha_thread_relax();
+		old = (void*)0x01;
+	} while (!HA_ATOMIC_CAS(&ha_thread_ctx[thr].thread_dump_buffer, &old, 0));
+}
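/* [Editor's note] A minimal sketch, not part of the patch, of the
 * thread_dump_buffer handshake implemented by ha_thread_dump() and
 * ha_thread_dump_one() above, reduced to plain C11 atomics: NULL -> <buf>
 * (requester installs its buffer), <buf> -> 0x1 (dumper signals completion),
 * 0x1 -> NULL (requester collects the dump and releases the slot). The ex_*
 * names are hypothetical. */
#include <stdatomic.h>

static _Atomic(void *) ex_dump_slot; /* stands in for thread_dump_buffer */

static void ex_request_dump(void *buf)
{
	void *expected = NULL;

	/* install our buffer; spin while another dump is in progress */
	while (!atomic_compare_exchange_weak(&ex_dump_slot, &expected, buf))
		expected = NULL;

	/* ... signal the target thread here; it dumps, then stores 0x1 ... */

	/* wait for completion, then release the slot for the next requester */
	expected = (void *)0x1;
	while (!atomic_compare_exchange_weak(&ex_dump_slot, &expected, NULL))
		expected = (void *)0x1;
}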
+/* Dumps into the buffer some information related to task <task> (which may
+ * either be a task or a tasklet), and prepends each line except the first one
+ * with <pfx>. The buffer is only appended to, and the output starts with the
+ * task pointer itself. The caller is responsible for making sure the task is
+ * not going to vanish during the dump.
+ */
+void ha_task_dump(struct buffer *buf, const struct task *task, const char *pfx)
+{
+	const struct stream *s = NULL;
+	const struct appctx __maybe_unused *appctx = NULL;
+	struct hlua __maybe_unused *hlua = NULL;
+	const struct stconn *sc;
+
+	if (!task) {
+		chunk_appendf(buf, "0\n");
+		return;
+	}
+
+	if (TASK_IS_TASKLET(task))
+		chunk_appendf(buf,
+		              "%p (tasklet) calls=%u\n",
+		              task,
+		              task->calls);
+	else
+		chunk_appendf(buf,
+		              "%p (task) calls=%u last=%llu%s\n",
+		              task,
+		              task->calls,
+		              task->wake_date ? (unsigned long long)(now_mono_time() - task->wake_date) : 0,
+		              task->wake_date ? " ns ago" : "");
+
+	chunk_appendf(buf, "%s fct=%p(", pfx, task->process);
+	resolve_sym_name(buf, NULL, task->process);
+	chunk_appendf(buf,") ctx=%p", task->context);
+
+	if (task->process == task_run_applet && (appctx = task->context))
+		chunk_appendf(buf, "(%s)\n", appctx->applet->name);
+	else
+		chunk_appendf(buf, "\n");
+
+	if (task->process == process_stream && task->context)
+		s = (struct stream *)task->context;
+	else if (task->process == task_run_applet && task->context && (sc = appctx_sc((struct appctx *)task->context)))
+		s = sc_strm(sc);
+	else if (task->process == sc_conn_io_cb && task->context)
+		s = sc_strm(((struct stconn *)task->context));
+
+	if (s) {
+		chunk_appendf(buf, "%sstream=", pfx);
+		strm_dump_to_buffer(buf, s, pfx, HA_ATOMIC_LOAD(&global.anon_key));
+	}
+
+#ifdef USE_LUA
+	hlua = NULL;
+	if (s && (hlua = s->hlua)) {
+		chunk_appendf(buf, "%sCurrent executing Lua from a stream analyser -- ", pfx);
+	}
+	else if (task->process == hlua_process_task && (hlua = task->context)) {
+		chunk_appendf(buf, "%sCurrent executing a Lua task -- ", pfx);
+	}
+	else if (task->process == task_run_applet && (appctx = task->context) &&
+	         (appctx->applet->fct == hlua_applet_tcp_fct)) {
+		chunk_appendf(buf, "%sCurrent executing a Lua TCP service -- ", pfx);
+	}
+	else if (task->process == task_run_applet && (appctx = task->context) &&
+	         (appctx->applet->fct == hlua_applet_http_fct)) {
+		chunk_appendf(buf, "%sCurrent executing a Lua HTTP service -- ", pfx);
+	}
+
+	if (hlua && hlua->T) {
+		chunk_appendf(buf, "stack traceback:\n ");
+		append_prefixed_str(buf, hlua_traceback(hlua->T, "\n "), pfx, '\n', 0);
+	}
+
+	/* we may need to terminate the current line */
+	if (*b_peek(buf, b_data(buf)-1) != '\n')
+		b_putchr(buf, '\n');
+#endif
+}
+
+/* This function dumps the state of all threads. It returns 0 if the output
+ * buffer is full and it needs to be called again, otherwise non-zero.
+ */
+static int cli_io_handler_show_threads(struct appctx *appctx)
+{
+	struct stconn *sc = appctx_sc(appctx);
+	int thr;
+
+	/* FIXME: Don't watch the other side !*/
+	if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE))
+		return 1;
+
+	if (appctx->st0)
+		thr = appctx->st1;
+	else
+		thr = 0;
+
+	do {
+		chunk_reset(&trash);
+		ha_thread_dump(&trash, thr);
+
+		if (applet_putchk(appctx, &trash) == -1) {
+			/* failed, try again */
+			appctx->st1 = thr;
+			return 0;
+		}
+		thr++;
+	} while (thr < global.nbthread);
+
+	return 1;
+}
+
+#if defined(HA_HAVE_DUMP_LIBS)
+/* parse a "show libs" command. It returns 1 if it emits anything, otherwise zero.
*/ +static int debug_parse_cli_show_libs(char **args, char *payload, struct appctx *appctx, void *private) +{ + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return 1; + + chunk_reset(&trash); + if (dump_libs(&trash, 1)) + return cli_msg(appctx, LOG_INFO, trash.area); + else + return 0; +} +#endif + +/* parse a "show dev" command. It returns 1 if it emits anything otherwise zero. */ +static int debug_parse_cli_show_dev(char **args, char *payload, struct appctx *appctx, void *private) +{ + const char **build_opt; + + if (*args[2]) + return cli_err(appctx, "This command takes no argument.\n"); + + chunk_reset(&trash); + + chunk_appendf(&trash, "Features\n %s\n", build_features); + + chunk_appendf(&trash, "Build options\n"); + for (build_opt = NULL; (build_opt = hap_get_next_build_opt(build_opt)); ) + if (append_prefixed_str(&trash, *build_opt, " ", '\n', 0) == 0) + chunk_strcat(&trash, "\n"); + + chunk_appendf(&trash, "Platform info\n"); + if (*post_mortem.platform.hw_vendor) + chunk_appendf(&trash, " machine vendor: %s\n", post_mortem.platform.hw_vendor); + if (*post_mortem.platform.hw_family) + chunk_appendf(&trash, " machine family: %s\n", post_mortem.platform.hw_family); + if (*post_mortem.platform.hw_model) + chunk_appendf(&trash, " machine model: %s\n", post_mortem.platform.hw_model); + if (*post_mortem.platform.brd_vendor) + chunk_appendf(&trash, " board vendor: %s\n", post_mortem.platform.brd_vendor); + if (*post_mortem.platform.brd_model) + chunk_appendf(&trash, " board model: %s\n", post_mortem.platform.brd_model); + if (*post_mortem.platform.soc_vendor) + chunk_appendf(&trash, " soc vendor: %s\n", post_mortem.platform.soc_vendor); + if (*post_mortem.platform.soc_model) + chunk_appendf(&trash, " soc model: %s\n", post_mortem.platform.soc_model); + if (*post_mortem.platform.cpu_model) + chunk_appendf(&trash, " cpu model: %s\n", post_mortem.platform.cpu_model); + if (*post_mortem.platform.virt_techno) + chunk_appendf(&trash, " virtual machine: %s\n", post_mortem.platform.virt_techno); + if (*post_mortem.platform.cont_techno) + chunk_appendf(&trash, " container: %s\n", post_mortem.platform.cont_techno); + if (*post_mortem.platform.utsname.sysname) + chunk_appendf(&trash, " OS name: %s\n", post_mortem.platform.utsname.sysname); + if (*post_mortem.platform.utsname.release) + chunk_appendf(&trash, " OS release: %s\n", post_mortem.platform.utsname.release); + if (*post_mortem.platform.utsname.version) + chunk_appendf(&trash, " OS version: %s\n", post_mortem.platform.utsname.version); + if (*post_mortem.platform.utsname.machine) + chunk_appendf(&trash, " OS architecture: %s\n", post_mortem.platform.utsname.machine); + if (*post_mortem.platform.utsname.nodename) + chunk_appendf(&trash, " node name: %s\n", HA_ANON_CLI(post_mortem.platform.utsname.nodename)); + + chunk_appendf(&trash, "Process info\n"); + chunk_appendf(&trash, " pid: %d\n", post_mortem.process.pid); + chunk_appendf(&trash, " boot uid: %d\n", post_mortem.process.boot_uid); + chunk_appendf(&trash, " boot gid: %d\n", post_mortem.process.boot_gid); + + if ((ulong)post_mortem.process.limit_fd.rlim_cur != RLIM_INFINITY) + chunk_appendf(&trash, " fd limit (soft): %lu\n", (ulong)post_mortem.process.limit_fd.rlim_cur); + if ((ulong)post_mortem.process.limit_fd.rlim_max != RLIM_INFINITY) + chunk_appendf(&trash, " fd limit (hard): %lu\n", (ulong)post_mortem.process.limit_fd.rlim_max); + if ((ulong)post_mortem.process.limit_ram.rlim_cur != RLIM_INFINITY) + chunk_appendf(&trash, " ram limit (soft): %lu\n", 
(ulong)post_mortem.process.limit_ram.rlim_cur);
+	if ((ulong)post_mortem.process.limit_ram.rlim_max != RLIM_INFINITY)
+		chunk_appendf(&trash, "  ram limit (hard): %lu\n", (ulong)post_mortem.process.limit_ram.rlim_max);
+
+	return cli_msg(appctx, LOG_INFO, trash.area);
+}
+
+/* Dumps the state of all threads into the trash and on fd #2, then aborts.
+ * A copy will be put into a trash chunk that's assigned to thread_dump_buffer
+ * so that the debugger can easily find it. This buffer might be truncated if
+ * too many threads are being dumped, but at least we'll dump them all on stderr.
+ * If thread_dump_buffer is set, it means that a panic has already begun.
+ */
+void ha_panic()
+{
+	struct buffer *old;
+	unsigned int thr;
+
+	mark_tainted(TAINTED_PANIC);
+
+	old = NULL;
+	if (!HA_ATOMIC_CAS(&thread_dump_buffer, &old, get_trash_chunk())) {
+		/* a panic dump is already in progress, let's not disturb it,
+		 * we'll be called via signal DEBUGSIG. By returning we may be
+		 * able to leave a current signal handler (e.g. WDT) so that
+		 * this will ensure more reliable signal delivery.
+		 */
+		return;
+	}
+
+	chunk_reset(&trash);
+	chunk_appendf(&trash, "Thread %u is about to kill the process.\n", tid + 1);
+
+	for (thr = 0; thr < global.nbthread; thr++) {
+		ha_thread_dump(&trash, thr);
+		DISGUISE(write(2, trash.area, trash.data));
+		b_force_xfer(thread_dump_buffer, &trash, b_room(thread_dump_buffer));
+		chunk_reset(&trash);
+	}
+
+#ifdef USE_LUA
+	if (get_tainted() & TAINTED_LUA_STUCK_SHARED && global.nbthread > 1) {
+		chunk_printf(&trash,
+		             "### Note: at least one thread was stuck in a Lua context loaded using the\n"
+		             "          'lua-load' directive, which is known for causing heavy contention\n"
+		             "          when used with threads. Please consider using 'lua-load-per-thread'\n"
+		             "          instead if your code is safe to run in parallel on multiple threads.\n");
+		DISGUISE(write(2, trash.area, trash.data));
+	}
+	else if (get_tainted() & TAINTED_LUA_STUCK) {
+		chunk_printf(&trash,
+		             "### Note: at least one thread was stuck in a Lua context in a way that suggests\n"
+		             "          heavy processing inside a dependency or a long loop that can't yield.\n"
+		             "          Please make sure any external code you may rely on is safe for use in\n"
+		             "          an event-driven engine.\n");
+		DISGUISE(write(2, trash.area, trash.data));
+	}
+#endif
+	if (get_tainted() & TAINTED_MEM_TRIMMING_STUCK) {
+		chunk_printf(&trash,
+		             "### Note: one thread was found stuck under malloc_trim(), which can run for a\n"
+		             "          very long time on large memory systems. You may want to disable this\n"
+		             "          memory reclaiming feature by setting 'no-memory-trimming' in the\n"
+		             "          'global' section of your configuration to avoid this in the future.\n");
+		DISGUISE(write(2, trash.area, trash.data));
+	}
+
+	for (;;)
+		abort();
+}
+
+/* Complain with message <msg> on stderr. If <counter> is not NULL, it is
+ * atomically incremented, and the message is only printed when the counter
+ * was zero, so that the message is only printed once. <taint> is only checked
+ * on bit 1, and will taint the process either for a bug (2) or warn (0).
+ */
+void complain(int *counter, const char *msg, int taint)
+{
+	if (counter && _HA_ATOMIC_FETCH_ADD(counter, 1))
+		return;
+	DISGUISE(write(2, msg, strlen(msg)));
+	if (taint & 2)
+		mark_tainted(TAINTED_BUG);
+	else
+		mark_tainted(TAINTED_WARN);
+}
+
+/* parse a "debug dev exit" command. It always returns 1, though it should never return.
*/ +static int debug_parse_cli_exit(char **args, char *payload, struct appctx *appctx, void *private) +{ + int code = atoi(args[3]); + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + _HA_ATOMIC_INC(&debug_commands_issued); + exit(code); + return 1; +} + +/* parse a "debug dev bug" command. It always returns 1, though it should never return. + * Note: we make sure not to make the function static so that it appears in the trace. + */ +int debug_parse_cli_bug(char **args, char *payload, struct appctx *appctx, void *private) +{ + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + _HA_ATOMIC_INC(&debug_commands_issued); + BUG_ON(one > zero); + return 1; +} + +/* parse a "debug dev warn" command. It always returns 1. + * Note: we make sure not to make the function static so that it appears in the trace. + */ +int debug_parse_cli_warn(char **args, char *payload, struct appctx *appctx, void *private) +{ + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + _HA_ATOMIC_INC(&debug_commands_issued); + WARN_ON(one > zero); + return 1; +} + +/* parse a "debug dev check" command. It always returns 1. + * Note: we make sure not to make the function static so that it appears in the trace. + */ +int debug_parse_cli_check(char **args, char *payload, struct appctx *appctx, void *private) +{ + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + _HA_ATOMIC_INC(&debug_commands_issued); + CHECK_IF(one > zero); + return 1; +} + +/* parse a "debug dev close" command. It always returns 1. */ +static int debug_parse_cli_close(char **args, char *payload, struct appctx *appctx, void *private) +{ + int fd; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3]) + return cli_err(appctx, "Missing file descriptor number.\n"); + + fd = atoi(args[3]); + if (fd < 0 || fd >= global.maxsock) + return cli_err(appctx, "File descriptor out of range.\n"); + + if (!fdtab[fd].owner) + return cli_msg(appctx, LOG_INFO, "File descriptor was already closed.\n"); + + _HA_ATOMIC_INC(&debug_commands_issued); + fd_delete(fd); + return 1; +} + +/* this is meant to cause a deadlock when more than one task is running it or when run twice */ +static struct task *debug_run_cli_deadlock(struct task *task, void *ctx, unsigned int state) +{ + static HA_SPINLOCK_T lock __maybe_unused; + + HA_SPIN_LOCK(OTHER_LOCK, &lock); + return NULL; +} + +/* parse a "debug dev deadlock" command. It always returns 1. */ +static int debug_parse_cli_deadlock(char **args, char *payload, struct appctx *appctx, void *private) +{ + int tasks; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + _HA_ATOMIC_INC(&debug_commands_issued); + for (tasks = atoi(args[3]); tasks > 0; tasks--) { + struct task *t = task_new_on(tasks % global.nbthread); + if (!t) + continue; + t->process = debug_run_cli_deadlock; + t->context = NULL; + task_wakeup(t, TASK_WOKEN_INIT); + } + + return 1; +} + +/* parse a "debug dev delay" command. It always returns 1. */ +static int debug_parse_cli_delay(char **args, char *payload, struct appctx *appctx, void *private) +{ + int delay = atoi(args[3]); + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + _HA_ATOMIC_INC(&debug_commands_issued); + usleep((long)delay * 1000); + return 1; +} + +/* parse a "debug dev log" command. It always returns 1. 
*/
+static int debug_parse_cli_log(char **args, char *payload, struct appctx *appctx, void *private)
+{
+	int arg;
+
+	if (!cli_has_level(appctx, ACCESS_LVL_ADMIN))
+		return 1;
+
+	_HA_ATOMIC_INC(&debug_commands_issued);
+	chunk_reset(&trash);
+	for (arg = 3; *args[arg]; arg++) {
+		if (arg > 3)
+			chunk_strcat(&trash, " ");
+		chunk_strcat(&trash, args[arg]);
+	}
+
+	send_log(NULL, LOG_INFO, "%s\n", trash.area);
+	return 1;
+}
+
+/* parse a "debug dev loop" command. It always returns 1. */
+static int debug_parse_cli_loop(char **args, char *payload, struct appctx *appctx, void *private)
+{
+	struct timeval deadline, curr;
+	int loop = atoi(args[3]);
+	int isolate;
+
+	if (!cli_has_level(appctx, ACCESS_LVL_ADMIN))
+		return 1;
+
+	isolate = strcmp(args[4], "isolated") == 0;
+
+	_HA_ATOMIC_INC(&debug_commands_issued);
+	gettimeofday(&curr, NULL);
+	tv_ms_add(&deadline, &curr, loop);
+
+	if (isolate)
+		thread_isolate();
+
+	while (tv_ms_cmp(&curr, &deadline) < 0)
+		gettimeofday(&curr, NULL);
+
+	if (isolate)
+		thread_release();
+
+	return 1;
+}
+
+/* parse a "debug dev panic" command. It always returns 1, though it should never return. */
+static int debug_parse_cli_panic(char **args, char *payload, struct appctx *appctx, void *private)
+{
+	if (!cli_has_level(appctx, ACCESS_LVL_ADMIN))
+		return 1;
+
+	_HA_ATOMIC_INC(&debug_commands_issued);
+	ha_panic();
+	return 1;
+}
+
+/* parse a "debug dev exec" command. It always returns 1. */
+#if defined(DEBUG_DEV)
+static int debug_parse_cli_exec(char **args, char *payload, struct appctx *appctx, void *private)
+{
+	int pipefd[2];
+	int arg;
+	int pid;
+
+	if (!cli_has_level(appctx, ACCESS_LVL_ADMIN))
+		return 1;
+
+	_HA_ATOMIC_INC(&debug_commands_issued);
+	chunk_reset(&trash);
+	for (arg = 3; *args[arg]; arg++) {
+		if (arg > 3)
+			chunk_strcat(&trash, " ");
+		chunk_strcat(&trash, args[arg]);
+	}
+
+	thread_isolate();
+	if (pipe(pipefd) < 0)
+		goto fail_pipe;
+
+	if (fd_set_cloexec(pipefd[0]) == -1)
+		goto fail_fcntl;
+
+	if (fd_set_cloexec(pipefd[1]) == -1)
+		goto fail_fcntl;
+
+	pid = fork();
+
+	if (pid < 0)
+		goto fail_fork;
+	else if (pid == 0) {
+		/* child */
+		char *cmd[4] = { "/bin/sh", "-c", 0, 0 };
+
+		close(0);
+		dup2(pipefd[1], 1);
+		dup2(pipefd[1], 2);
+
+		cmd[2] = trash.area;
+		execvp(cmd[0], cmd);
+		printf("execvp() failed\n");
+		exit(1);
+	}
+
+	/* parent */
+	thread_release();
+	close(pipefd[1]);
+	chunk_reset(&trash);
+	while (1) {
+		/* read() returns a ssize_t; it must be kept signed so that
+		 * -1 on error is not confused with a huge unsigned length
+		 */
+		ssize_t ret = read(pipefd[0], trash.area + trash.data, trash.size - 20 - trash.data);
+		if (ret <= 0)
+			break;
+		trash.data += ret;
+		if (trash.data + 20 == trash.size) {
+			chunk_strcat(&trash, "\n[[[TRUNCATED]]]\n");
+			break;
+		}
+	}
+	close(pipefd[0]);
+	waitpid(pid, NULL, WNOHANG);
+	trash.area[trash.data] = 0;
+	return cli_msg(appctx, LOG_INFO, trash.area);
+
+ fail_fork:
+ fail_fcntl:
+	close(pipefd[0]);
+	close(pipefd[1]);
+ fail_pipe:
+	thread_release();
+	return cli_err(appctx, "Failed to execute command.\n");
+}
+
+/* handles SIGRTMAX to inject random delays on the receiving thread in order
+ * to try to increase the likelihood of reproducing inter-thread races. The
+ * signal is periodically sent by a task initiated by "debug dev delay-inj".
+ */
+void debug_delay_inj_sighandler(int sig, siginfo_t *si, void *arg)
+{
+	volatile int i = statistical_prng_range(10000);
+
+	while (i--)
+		__ha_cpu_relax();
+}
+#endif
+
+/* parse a "debug dev hex" command. It always returns 1.
*/ +static int debug_parse_cli_hex(char **args, char *payload, struct appctx *appctx, void *private) +{ + unsigned long start, len; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3]) + return cli_err(appctx, "Missing memory address to dump from.\n"); + + start = strtoul(args[3], NULL, 0); + if (!start) + return cli_err(appctx, "Will not dump from NULL address.\n"); + + _HA_ATOMIC_INC(&debug_commands_issued); + + /* by default, dump ~128 till next block of 16 */ + len = strtoul(args[4], NULL, 0); + if (!len) + len = ((start + 128) & -16) - start; + + chunk_reset(&trash); + dump_hex(&trash, " ", (const void *)start, len, 1); + trash.area[trash.data] = 0; + return cli_msg(appctx, LOG_INFO, trash.area); +} + +/* parse a "debug dev sym <addr>" command. It always returns 1. */ +static int debug_parse_cli_sym(char **args, char *payload, struct appctx *appctx, void *private) +{ + unsigned long addr; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3]) + return cli_err(appctx, "Missing memory address to be resolved.\n"); + + _HA_ATOMIC_INC(&debug_commands_issued); + + addr = strtoul(args[3], NULL, 0); + chunk_printf(&trash, "%#lx resolves to ", addr); + resolve_sym_name(&trash, NULL, (const void *)addr); + chunk_appendf(&trash, "\n"); + + return cli_msg(appctx, LOG_INFO, trash.area); +} + +/* parse a "debug dev tkill" command. It always returns 1. */ +static int debug_parse_cli_tkill(char **args, char *payload, struct appctx *appctx, void *private) +{ + int thr = 0; + int sig = SIGABRT; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (*args[3]) + thr = atoi(args[3]); + + if (thr < 0 || thr > global.nbthread) + return cli_err(appctx, "Thread number out of range (use 0 for current).\n"); + + if (*args[4]) + sig = atoi(args[4]); + + _HA_ATOMIC_INC(&debug_commands_issued); + if (thr) + ha_tkill(thr - 1, sig); + else + raise(sig); + return 1; +} + +/* hashes 'word' in "debug dev hash 'word' ". */ +static int debug_parse_cli_hash(char **args, char *payload, struct appctx *appctx, void *private) +{ + char *msg = NULL; + + cli_dynmsg(appctx, LOG_INFO, memprintf(&msg, "%s\n", HA_ANON_CLI(args[3]))); + return 1; +} + +/* parse a "debug dev write" command. It always returns 1. 
*/
+static int debug_parse_cli_write(char **args, char *payload, struct appctx *appctx, void *private)
+{
+	unsigned long len;
+
+	if (!*args[3])
+		return cli_err(appctx, "Missing output size.\n");
+
+	len = strtoul(args[3], NULL, 0);
+	if (len >= trash.size)
+		return cli_err(appctx, "Output too large, must be <tune.bufsize.\n");
+
+	_HA_ATOMIC_INC(&debug_commands_issued);
+
+	chunk_reset(&trash);
+	trash.data = len;
+	memset(trash.area, '.', trash.data);
+	trash.area[trash.data] = 0;
+	for (len = 64; len < trash.data; len += 64)
+		trash.area[len] = '\n';
+	return cli_msg(appctx, LOG_INFO, trash.area);
+}
+
+/* parse a "debug dev stream" command */
+/*
+ *  debug dev stream [strm=<ptr>] [strm.f[{+-=}<flags>]] [txn.f[{+-=}<flags>]] \
+ *                   [req.f[{+-=}<flags>]] [res.f[{+-=}<flags>]]               \
+ *                   [sif.f[{+-=}<flags>]] [sib.f[{+-=}<flags>]]               \
+ *                   [sif.s[=<state>]] [sib.s[=<state>]]
+ */
+static int debug_parse_cli_stream(char **args, char *payload, struct appctx *appctx, void *private)
+{
+	struct stream *s = appctx_strm(appctx);
+	int arg;
+	void *ptr;
+	int size;
+	const char *word, *end;
+	struct ist name;
+	char *msg = NULL;
+	char *endarg;
+	unsigned long long old, new;
+
+	if (!cli_has_level(appctx, ACCESS_LVL_ADMIN))
+		return 1;
+
+	ptr = NULL; size = 0;
+
+	if (!*args[3]) {
+		return cli_err(appctx,
+		               "Usage: debug dev stream [ strm=<ptr> ] { <obj> <op> <value> | wake }*\n"
+		               "     <obj>   = { strm.f | strm.x | scf.s | scb.s | txn.f | req.f | res.f }\n"
+		               "     <op>    = {'' (show) | '=' (assign) | '^' (xor) | '+' (or) | '-' (andnot)}\n"
+		               "     <value> = 'now' | 64-bit dec/hex integer (0x prefix supported)\n"
+		               "     'wake' wakes the stream assigned to 'strm' (default: current)\n"
+		               );
+	}
+
+	_HA_ATOMIC_INC(&debug_commands_issued);
+	for (arg = 3; *args[arg]; arg++) {
+		old = 0;
+		end = word = args[arg];
+		while (*end && *end != '=' && *end != '^' && *end != '+' && *end != '-')
+			end++;
+		name = ist2(word, end - word);
+		if (isteq(name, ist("strm"))) {
+			ptr = (!s || !may_access(s)) ? NULL : &s; size = sizeof(s);
+		} else if (isteq(name, ist("strm.f"))) {
+			ptr = (!s || !may_access(s)) ? NULL : &s->flags; size = sizeof(s->flags);
+		} else if (isteq(name, ist("strm.x"))) {
+			ptr = (!s || !may_access(s)) ? NULL : &s->conn_exp; size = sizeof(s->conn_exp);
+		} else if (isteq(name, ist("txn.f"))) {
+			ptr = (!s || !may_access(s)) ? NULL : &s->txn->flags; size = sizeof(s->txn->flags);
+		} else if (isteq(name, ist("req.f"))) {
+			ptr = (!s || !may_access(s)) ? NULL : &s->req.flags; size = sizeof(s->req.flags);
+		} else if (isteq(name, ist("res.f"))) {
+			ptr = (!s || !may_access(s)) ? NULL : &s->res.flags; size = sizeof(s->res.flags);
+		} else if (isteq(name, ist("scf.s"))) {
+			ptr = (!s || !may_access(s)) ? NULL : &s->scf->state; size = sizeof(s->scf->state);
+		} else if (isteq(name, ist("scb.s"))) {
+			ptr = (!s || !may_access(s)) ?
NULL : &s->scb->state; size = sizeof(s->scb->state);
+		} else if (isteq(name, ist("wake"))) {
+			if (s && may_access(s) && may_access((void *)s + sizeof(*s) - 1))
+				task_wakeup(s->task, TASK_WOKEN_TIMER|TASK_WOKEN_IO|TASK_WOKEN_MSG);
+			continue;
+		} else
+			return cli_dynerr(appctx, memprintf(&msg, "Unsupported field name: '%s'.\n", word));
+
+		/* read previous value */
+		if ((s || ptr == &s) && ptr && may_access(ptr) && may_access(ptr + size - 1)) {
+			if (size == 8)
+				old = read_u64(ptr);
+			else if (size == 4)
+				old = read_u32(ptr);
+			else if (size == 2)
+				old = read_u16(ptr);
+			else
+				old = *(const uint8_t *)ptr;
+		} else {
+			memprintf(&msg,
+			          "%sSkipping inaccessible pointer %p for field '%.*s'.\n",
+			          msg ? msg : "", ptr, (int)(end - word), word);
+			continue;
+		}
+
+		/* parse the new value. */
+		new = strtoll(end + 1, &endarg, 0);
+		if (end[1] && *endarg) {
+			if (strcmp(end + 1, "now") == 0)
+				new = now_ms;
+			else {
+				memprintf(&msg,
+				          "%sIgnoring unparsable value '%s' for field '%.*s'.\n",
+				          msg ? msg : "", end + 1, (int)(end - word), word);
+				continue;
+			}
+		}
+
+		switch (*end) {
+		case '\0': /* show */
+			memprintf(&msg, "%s%.*s=%#llx ", msg ? msg : "", (int)(end - word), word, old);
+			new = old; // do not change the value
+			break;
+
+		case '=': /* set */
+			break;
+
+		case '^': /* XOR */
+			new = old ^ new;
+			break;
+
+		case '+': /* OR */
+			new = old | new;
+			break;
+
+		case '-': /* AND NOT */
+			new = old & ~new;
+			break;
+
+		default:
+			break;
+		}
+
+		/* write the new value */
+		if (new != old) {
+			if (size == 8)
+				write_u64(ptr, new);
+			else if (size == 4)
+				write_u32(ptr, new);
+			else if (size == 2)
+				write_u16(ptr, new);
+			else
+				*(uint8_t *)ptr = new;
+		}
+	}
+
+	if (msg && *msg)
+		return cli_dynmsg(appctx, LOG_INFO, msg);
+	return 1;
+}
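/* [Editor's note] Hypothetical CLI invocations (not part of the patch)
 * illustrating the grammar accepted by debug_parse_cli_stream() above; the
 * flag values are made up:
 *
 *   debug dev stream strm.f              show the stream flags
 *   debug dev stream strm.f=0x2000       assign the stream flags
 *   debug dev stream req.f+0x1 wake      OR a bit into the request channel
 *                                        flags, then wake the stream's task
 *   debug dev stream strm.x=now wake     make the connection expire now
 */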
+/* parse a "debug dev task" command */
+/*
+ *  debug dev task <ptr> [ "wake" | "expire" | "kill" ]
+ *  Show/change status of a task/tasklet
+ */
+static int debug_parse_cli_task(char **args, char *payload, struct appctx *appctx, void *private)
+{
+	const struct ha_caller *caller;
+	struct task *t;
+	char *endarg;
+	char *msg;
+	void *ptr;
+	int ret = 1;
+	int task_ok;
+	int arg;
+
+	if (!cli_has_level(appctx, ACCESS_LVL_ADMIN))
+		return 1;
+
+	/* parse the pointer value */
+	ptr = (void *)strtoul(args[3], &endarg, 0);
+	if (!*args[3] || *endarg)
+		goto usage;
+
+	_HA_ATOMIC_INC(&debug_commands_issued);
+
+	/* everything below must run under thread isolation till reaching label "leave" */
+	thread_isolate();
+
+	/* struct tasklet is smaller than struct task and is sufficient to check
+	 * the TASK_COMMON part.
+	 */
+	if (!may_access(ptr) || !may_access(ptr + sizeof(struct tasklet) - 1) ||
+	    ((const struct tasklet *)ptr)->tid < -1 ||
+	    ((const struct tasklet *)ptr)->tid >= (int)MAX_THREADS) {
+		ret = cli_err(appctx, "The designated memory area doesn't look like a valid task/tasklet\n");
+		goto leave;
+	}
+
+	t = ptr;
+	caller = t->caller;
+	msg = NULL;
+	task_ok = may_access(ptr + sizeof(*t) - 1);
+
+	chunk_reset(&trash);
+	resolve_sym_name(&trash, NULL, (const void *)t->process);
+
+	/* we need to be careful here because we may dump a freed task that's
+	 * still in the pool cache, containing garbage in pointers.
+	 */
+	if (!*args[4]) {
+		memprintf(&msg, "%s%p: %s state=%#x tid=%d process=%s ctx=%p calls=%d last=%s:%d intl=%d",
+		          msg ? msg : "", t, (t->state & TASK_F_TASKLET) ? "tasklet" : "task",
+		          t->state, t->tid, trash.area, t->context, t->calls,
+		          caller && may_access(caller) && may_access(caller->func) && isalnum((uchar)*caller->func) ? caller->func : "0",
+		          caller ? t->caller->line : 0,
+		          (t->state & TASK_F_TASKLET) ? LIST_INLIST(&((const struct tasklet *)t)->list) : 0);
+
+		if (task_ok && !(t->state & TASK_F_TASKLET))
+			memprintf(&msg, "%s inrq=%d inwq=%d exp=%d nice=%d",
+			          msg ? msg : "", task_in_rq(t), task_in_wq(t), t->expire, t->nice);
+
+		memprintf(&msg, "%s\n", msg ? msg : "");
+	}
+
+	for (arg = 4; *args[arg]; arg++) {
+		if (strcmp(args[arg], "expire") == 0) {
+			if (t->state & TASK_F_TASKLET) {
+				/* do nothing for tasklets */
+			}
+			else if (task_ok) {
+				/* unlink task and wake with timer flag */
+				__task_unlink_wq(t);
+				t->expire = now_ms;
+				task_wakeup(t, TASK_WOKEN_TIMER);
+			}
+		} else if (strcmp(args[arg], "wake") == 0) {
+			/* wake with all flags but init / timer */
+			if (t->state & TASK_F_TASKLET)
+				tasklet_wakeup((struct tasklet *)t);
+			else if (task_ok)
+				task_wakeup(t, TASK_WOKEN_ANY & ~(TASK_WOKEN_INIT|TASK_WOKEN_TIMER));
+		} else if (strcmp(args[arg], "kill") == 0) {
+			/* Kill the task. This is not idempotent! */
+			if (!(t->state & TASK_KILLED)) {
+				if (t->state & TASK_F_TASKLET)
+					tasklet_kill((struct tasklet *)t);
+				else if (task_ok)
+					task_kill(t);
+			}
+		} else {
+			thread_release();
+			goto usage;
+		}
+	}
+
+	if (msg && *msg)
+		ret = cli_dynmsg(appctx, LOG_INFO, msg);
+ leave:
+	thread_release();
+	return ret;
+ usage:
+	return cli_err(appctx,
+	               "Usage: debug dev task <ptr> [ wake | expire | kill ]\n"
+	               "  By default, dumps some info on task/tasklet <ptr>. 'wake' will wake it up\n"
+	               "  with all conditions flags but init/exp. 'expire' will expire the entry, and\n"
+	               "  'kill' will kill it (warning: the latter is not idempotent and may crash!). All\n"
+	               "  changes may crash the process if performed on a wrong object!\n"
+	               );
+}
+
+#if defined(DEBUG_DEV)
+static struct task *debug_delay_inj_task(struct task *t, void *ctx, unsigned int state)
+{
+	unsigned long *tctx = ctx; // [0] = interval, [1] = nbwakeups
+	unsigned long inter = tctx[0];
+	unsigned long count = tctx[1];
+	unsigned long rnd;
+
+	if (inter)
+		t->expire = tick_add(now_ms, inter);
+	else
+		task_wakeup(t, TASK_WOKEN_MSG);
+
+	/* wake a random thread */
+	while (count--) {
+		rnd = statistical_prng_range(global.nbthread);
+		ha_tkill(rnd, SIGRTMAX);
+	}
+	return t;
+}
+
+/* parse a "debug dev delay-inj" command
+ * debug dev delay-inj <inter> <count>
+ */
+static int debug_parse_delay_inj(char **args, char *payload, struct appctx *appctx, void *private)
+{
+	unsigned long *tctx; // [0] = inter, [1] = count
+	struct task *task;
+
+	if (!cli_has_level(appctx, ACCESS_LVL_ADMIN))
+		return 1;
+
+	if (!*args[4])
+		return cli_err(appctx, "Usage: debug dev delay-inj <inter_ms> <count>*\n");
+
+	_HA_ATOMIC_INC(&debug_commands_issued);
+
+	tctx = calloc(2, sizeof(*tctx));
+	if (!tctx)
+		goto fail;
+
+	tctx[0] = atoi(args[3]);
+	tctx[1] = atoi(args[4]);
+
+	task = task_new_here/*anywhere*/();
+	if (!task)
+		goto fail;
+
+	task->process = debug_delay_inj_task;
+	task->context = tctx;
+	task_wakeup(task, TASK_WOKEN_INIT);
+	return 1;
+
+ fail:
+	free(tctx);
+	return cli_err(appctx, "Not enough memory");
+}
+#endif // DEBUG_DEV
+
+static struct task *debug_task_handler(struct task *t, void *ctx, unsigned int state)
+{
+	unsigned long *tctx = ctx; // [0] = #tasks, [1] = inter, [2+] = { tl | (tsk+1) }
+	unsigned long inter = tctx[1];
+	unsigned long rnd;
+
+	t->expire = tick_add(now_ms, inter);
+
+	/* half of the calls will wake up another entry */
+	rnd = statistical_prng();
+	if (rnd & 1) {
+		rnd >>= 1;
+		rnd %= tctx[0];
+		rnd = tctx[rnd + 2];
+
+		if (rnd & 1)
task_wakeup((struct task *)(rnd - 1), TASK_WOKEN_MSG); + else + tasklet_wakeup((struct tasklet *)rnd); + } + return t; +} + +static struct task *debug_tasklet_handler(struct task *t, void *ctx, unsigned int state) +{ + unsigned long *tctx = ctx; // [0] = #tasks, [1] = inter, [2+] = { tl | (tsk+1) } + unsigned long rnd; + int i; + + /* wake up two random entries */ + for (i = 0; i < 2; i++) { + rnd = statistical_prng() % tctx[0]; + rnd = tctx[rnd + 2]; + + if (rnd & 1) + task_wakeup((struct task *)(rnd - 1), TASK_WOKEN_MSG); + else + tasklet_wakeup((struct tasklet *)rnd); + } + return t; +} + +/* parse a "debug dev sched" command + * debug dev sched {task|tasklet} [count=<count>] [mask=<mask>] [single=<single>] [inter=<inter>] + */ +static int debug_parse_cli_sched(char **args, char *payload, struct appctx *appctx, void *private) +{ + int arg; + void *ptr; + int size; + const char *word, *end; + struct ist name; + char *msg = NULL; + char *endarg; + unsigned long long new; + unsigned long count = 0; + unsigned long thrid = tid; + unsigned int inter = 0; + unsigned long i; + int mode = 0; // 0 = tasklet; 1 = task + unsigned long *tctx; // [0] = #tasks, [1] = inter, [2+] = { tl | (tsk+1) } + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + ptr = NULL; size = 0; + + if (strcmp(args[3], "task") != 0 && strcmp(args[3], "tasklet") != 0) { + return cli_err(appctx, + "Usage: debug dev sched {task|tasklet} { <obj> = <value> }*\n" + " <obj> = {count | tid | inter }\n" + " <value> = 64-bit dec/hex integer (0x prefix supported)\n" + ); + } + + mode = strcmp(args[3], "task") == 0; + + _HA_ATOMIC_INC(&debug_commands_issued); + for (arg = 4; *args[arg]; arg++) { + end = word = args[arg]; + while (*end && *end != '=' && *end != '^' && *end != '+' && *end != '-') + end++; + name = ist2(word, end - word); + if (isteq(name, ist("count"))) { + ptr = &count; size = sizeof(count); + } else if (isteq(name, ist("tid"))) { + ptr = &thrid; size = sizeof(thrid); + } else if (isteq(name, ist("inter"))) { + ptr = &inter; size = sizeof(inter); + } else + return cli_dynerr(appctx, memprintf(&msg, "Unsupported setting: '%s'.\n", word)); + + /* parse the new value . */ + new = strtoll(end + 1, &endarg, 0); + if (end[1] && *endarg) { + memprintf(&msg, + "%sIgnoring unparsable value '%s' for field '%.*s'.\n", + msg ? msg : "", end + 1, (int)(end - word), word); + continue; + } + + /* write the new value */ + if (size == 8) + write_u64(ptr, new); + else if (size == 4) + write_u32(ptr, new); + else if (size == 2) + write_u16(ptr, new); + else + *(uint8_t *)ptr = new; + } + + tctx = calloc(count + 2, sizeof(*tctx)); + if (!tctx) + goto fail; + + tctx[0] = (unsigned long)count; + tctx[1] = (unsigned long)inter; + + if (thrid >= global.nbthread) + thrid = tid; + + for (i = 0; i < count; i++) { + /* now, if poly or mask was set, tmask corresponds to the + * valid thread mask to use, otherwise it remains zero. 
+ */ + //printf("%lu: mode=%d mask=%#lx\n", i, mode, tmask); + if (mode == 0) { + struct tasklet *tl = tasklet_new(); + + if (!tl) + goto fail; + + tl->tid = thrid; + tl->process = debug_tasklet_handler; + tl->context = tctx; + tctx[i + 2] = (unsigned long)tl; + } else { + struct task *task = task_new_on(thrid); + + if (!task) + goto fail; + + task->process = debug_task_handler; + task->context = tctx; + tctx[i + 2] = (unsigned long)task + 1; + } + } + + /* start the tasks and tasklets */ + for (i = 0; i < count; i++) { + unsigned long ctx = tctx[i + 2]; + + if (ctx & 1) + task_wakeup((struct task *)(ctx - 1), TASK_WOKEN_INIT); + else + tasklet_wakeup((struct tasklet *)ctx); + } + + if (msg && *msg) + return cli_dynmsg(appctx, LOG_INFO, msg); + return 1; + + fail: + /* free partially allocated entries */ + for (i = 0; tctx && i < count; i++) { + unsigned long ctx = tctx[i + 2]; + + if (!ctx) + break; + + if (ctx & 1) + task_destroy((struct task *)(ctx - 1)); + else + tasklet_free((struct tasklet *)ctx); + } + + free(tctx); + return cli_err(appctx, "Not enough memory"); +} + +/* CLI state for "debug dev fd" */ +struct dev_fd_ctx { + int start_fd; +}; + +/* CLI parser for the "debug dev fd" command. The current FD to restart from is + * stored in a struct dev_fd_ctx pointed to by svcctx. + */ +static int debug_parse_cli_fd(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct dev_fd_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return 1; + + /* start at fd #0 */ + ctx->start_fd = 0; + return 0; +} + +/* CLI I/O handler for the "debug dev fd" command. Dumps all FDs that are + * accessible from the process but not known from fdtab. The FD number to + * restart from is stored in a struct dev_fd_ctx pointed to by svcctx. + */ +static int debug_iohandler_fd(struct appctx *appctx) +{ + struct dev_fd_ctx *ctx = appctx->svcctx; + struct stconn *sc = appctx_sc(appctx); + struct sockaddr_storage sa; + struct stat statbuf; + socklen_t salen, vlen; + int ret1, ret2, port; + char *addrstr; + int ret = 1; + int i, fd; + + /* FIXME: Don't watch the other side !*/ + if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE)) + goto end; + + chunk_reset(&trash); + + thread_isolate(); + + /* we have two inner loops here, one for the proxy, the other one for + * the buffer. + */ + for (fd = ctx->start_fd; fd < global.maxsock; fd++) { + /* check for FD's existence */ + ret1 = fcntl(fd, F_GETFD, 0); + if (ret1 == -1) + continue; // not known to the process + if (fdtab[fd].owner) + continue; // well-known + + /* OK we're seeing an orphan let's try to retrieve as much + * information as possible about it. + */ + chunk_printf(&trash, "%5d", fd); + + if (fstat(fd, &statbuf) != -1) { + chunk_appendf(&trash, " type=%s mod=%04o dev=%#llx siz=%#llx uid=%lld gid=%lld fs=%#llx ino=%#llx", + isatty(fd) ? "tty.": + S_ISREG(statbuf.st_mode) ? "file": + S_ISDIR(statbuf.st_mode) ? "dir.": + S_ISCHR(statbuf.st_mode) ? "chr.": + S_ISBLK(statbuf.st_mode) ? "blk.": + S_ISFIFO(statbuf.st_mode) ? "pipe": + S_ISLNK(statbuf.st_mode) ? "link": + S_ISSOCK(statbuf.st_mode) ? "sock": +#ifdef USE_EPOLL + /* trick: epoll_ctl() will return -ENOENT when trying + * to remove from a valid epoll FD an FD that was not + * registered against it. But we don't want to risk + * disabling a random FD. Instead we'll create a new + * one by duplicating 0 (it should be valid since + * pointing to a terminal or /dev/null), and try to + * remove it. 
+ */ + ({ + int fd2 = dup(0); + int ret = fd2; + if (ret >= 0) { + ret = epoll_ctl(fd, EPOLL_CTL_DEL, fd2, NULL); + if (ret == -1 && errno == ENOENT) + ret = 0; // that's a real epoll + else + ret = -1; // it's something else + close(fd2); + } + ret; + }) == 0 ? "epol" : +#endif + "????", + (uint)statbuf.st_mode & 07777, + + (ullong)statbuf.st_rdev, + (ullong)statbuf.st_size, + (ullong)statbuf.st_uid, + (ullong)statbuf.st_gid, + + (ullong)statbuf.st_dev, + (ullong)statbuf.st_ino); + } + + chunk_appendf(&trash, " getfd=%s+%#x", + (ret1 & FD_CLOEXEC) ? "cloex" : "", + ret1 &~ FD_CLOEXEC); + + /* FD options */ + ret2 = fcntl(fd, F_GETFL, 0); + if (ret2) { + chunk_appendf(&trash, " getfl=%s", + (ret1 & 3) >= 2 ? "O_RDWR" : + (ret1 & 1) ? "O_WRONLY" : "O_RDONLY"); + + for (i = 2; i < 32; i++) { + if (!(ret2 & (1UL << i))) + continue; + switch (1UL << i) { + case O_CREAT: chunk_appendf(&trash, ",O_CREAT"); break; + case O_EXCL: chunk_appendf(&trash, ",O_EXCL"); break; + case O_NOCTTY: chunk_appendf(&trash, ",O_NOCTTY"); break; + case O_TRUNC: chunk_appendf(&trash, ",O_TRUNC"); break; + case O_APPEND: chunk_appendf(&trash, ",O_APPEND"); break; +#ifdef O_ASYNC + case O_ASYNC: chunk_appendf(&trash, ",O_ASYNC"); break; +#endif +#ifdef O_DIRECT + case O_DIRECT: chunk_appendf(&trash, ",O_DIRECT"); break; +#endif +#ifdef O_NOATIME + case O_NOATIME: chunk_appendf(&trash, ",O_NOATIME"); break; +#endif + } + } + } + + vlen = sizeof(ret2); + ret1 = getsockopt(fd, SOL_SOCKET, SO_TYPE, &ret2, &vlen); + if (ret1 != -1) + chunk_appendf(&trash, " so_type=%d", ret2); + + vlen = sizeof(ret2); + ret1 = getsockopt(fd, SOL_SOCKET, SO_ACCEPTCONN, &ret2, &vlen); + if (ret1 != -1) + chunk_appendf(&trash, " so_accept=%d", ret2); + + vlen = sizeof(ret2); + ret1 = getsockopt(fd, SOL_SOCKET, SO_ERROR, &ret2, &vlen); + if (ret1 != -1) + chunk_appendf(&trash, " so_error=%d", ret2); + + salen = sizeof(sa); + if (getsockname(fd, (struct sockaddr *)&sa, &salen) != -1) { + if (sa.ss_family == AF_INET) + port = ntohs(((const struct sockaddr_in *)&sa)->sin_port); + else if (sa.ss_family == AF_INET6) + port = ntohs(((const struct sockaddr_in6 *)&sa)->sin6_port); + else + port = 0; + addrstr = sa2str(&sa, port, 0); + chunk_appendf(&trash, " laddr=%s", addrstr); + free(addrstr); + } + + salen = sizeof(sa); + if (getpeername(fd, (struct sockaddr *)&sa, &salen) != -1) { + if (sa.ss_family == AF_INET) + port = ntohs(((const struct sockaddr_in *)&sa)->sin_port); + else if (sa.ss_family == AF_INET6) + port = ntohs(((const struct sockaddr_in6 *)&sa)->sin6_port); + else + port = 0; + addrstr = sa2str(&sa, port, 0); + chunk_appendf(&trash, " raddr=%s", addrstr); + free(addrstr); + } + + chunk_appendf(&trash, "\n"); + + if (applet_putchk(appctx, &trash) == -1) { + ctx->start_fd = fd; + ret = 0; + break; + } + } + + thread_release(); + end: + return ret; +} + +#if defined(DEBUG_MEM_STATS) + +/* CLI state for "debug dev memstats" */ +struct dev_mem_ctx { + struct mem_stats *start, *stop; /* begin/end of dump */ + char *match; /* non-null if a name prefix is specified */ + int show_all; /* show all entries if non-null */ + int width; /* 1st column width */ + long tot_size; /* sum of alloc-free */ + ulong tot_calls; /* sum of calls */ +}; + +/* CLI parser for the "debug dev memstats" command. Sets a dev_mem_ctx shown above. 
+ */
+static int debug_parse_cli_memstats(char **args, char *payload, struct appctx *appctx, void *private)
+{
+	struct dev_mem_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx));
+	int arg;
+
+	extern __attribute__((__weak__)) struct mem_stats __start_mem_stats;
+	extern __attribute__((__weak__)) struct mem_stats __stop_mem_stats;
+
+	if (!cli_has_level(appctx, ACCESS_LVL_OPER))
+		return 1;
+
+	for (arg = 3; *args[arg]; arg++) {
+		if (strcmp(args[arg], "reset") == 0) {
+			struct mem_stats *ptr;
+
+			if (!cli_has_level(appctx, ACCESS_LVL_ADMIN))
+				return 1;
+
+			for (ptr = &__start_mem_stats; ptr < &__stop_mem_stats; ptr++) {
+				_HA_ATOMIC_STORE(&ptr->calls, 0);
+				_HA_ATOMIC_STORE(&ptr->size, 0);
+			}
+			return 1;
+		}
+		else if (strcmp(args[arg], "all") == 0) {
+			ctx->show_all = 1;
+			continue;
+		}
+		else if (strcmp(args[arg], "match") == 0 && *args[arg + 1]) {
+			ha_free(&ctx->match);
+			ctx->match = strdup(args[arg + 1]);
+			arg++;
+			continue;
+		}
+		else
+			return cli_err(appctx, "Expects either 'reset', 'all', or 'match <pfx>'.\n");
+	}
+
+	/* otherwise proceed with the dump from __start_mem_stats to __stop_mem_stats */
+	ctx->start = &__start_mem_stats;
+	ctx->stop = &__stop_mem_stats;
+	ctx->width = 0;
+	return 0;
+}
+
+/* CLI I/O handler for the "debug dev memstats" command using a dev_mem_ctx
+ * found in appctx->svcctx. Dumps all mem_stats structs referenced by pointers
+ * located between ->start and ->stop. Dumps all entries if ->show_all != 0,
+ * otherwise only non-zero calls.
+ */
+static int debug_iohandler_memstats(struct appctx *appctx)
+{
+	struct dev_mem_ctx *ctx = appctx->svcctx;
+	struct stconn *sc = appctx_sc(appctx);
+	struct mem_stats *ptr;
+	const char *pfx = ctx->match;
+	int ret = 1;
+
+	/* FIXME: don't watch the other side! */
+	if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE))
+		goto end;
+
+	if (!ctx->width) {
+		/* we don't know the first column's width, let's compute it
+		 * now based on a first pass on printable entries and their
+		 * expected width (approximated).
+		 */
+		for (ptr = ctx->start; ptr != ctx->stop; ptr++) {
+			const char *p, *name;
+			int w = 0;
+			char tmp;
+
+			if (!ptr->size && !ptr->calls && !ctx->show_all)
+				continue;
+
+			for (p = name = ptr->caller.file; *p; p++) {
+				if (*p == '/')
+					name = p + 1;
+			}
+
+			if (ctx->show_all)
+				w = snprintf(&tmp, 0, "%s(%s:%d) ", ptr->caller.func, name, ptr->caller.line);
+			else
+				w = snprintf(&tmp, 0, "%s:%d ", name, ptr->caller.line);
+
+			if (w > ctx->width)
+				ctx->width = w;
+		}
+	}
+
+	/* now iterate over all the mem_stats entries between ->start and
+	 * ->stop and dump the printable ones.
+	 */
+	for (ptr = ctx->start; ptr != ctx->stop; ptr++) {
+		const char *type;
+		const char *name;
+		const char *p;
+		const char *info = NULL;
+		const char *func = NULL;
+		int direction = 0; // neither alloc nor free (e.g.
realloc) + + if (!ptr->size && !ptr->calls && !ctx->show_all) + continue; + + /* basename only */ + for (p = name = ptr->caller.file; *p; p++) { + if (*p == '/') + name = p + 1; + } + + func = ptr->caller.func; + + switch (ptr->caller.what) { + case MEM_STATS_TYPE_CALLOC: type = "CALLOC"; direction = 1; break; + case MEM_STATS_TYPE_FREE: type = "FREE"; direction = -1; break; + case MEM_STATS_TYPE_MALLOC: type = "MALLOC"; direction = 1; break; + case MEM_STATS_TYPE_REALLOC: type = "REALLOC"; break; + case MEM_STATS_TYPE_STRDUP: type = "STRDUP"; direction = 1; break; + case MEM_STATS_TYPE_P_ALLOC: type = "P_ALLOC"; direction = 1; if (ptr->extra) info = ((const struct pool_head *)ptr->extra)->name; break; + case MEM_STATS_TYPE_P_FREE: type = "P_FREE"; direction = -1; if (ptr->extra) info = ((const struct pool_head *)ptr->extra)->name; break; + default: type = "UNSET"; break; + } + + //chunk_printf(&trash, + // "%20s:%-5d %7s size: %12lu calls: %9lu size/call: %6lu\n", + // name, ptr->line, type, + // (unsigned long)ptr->size, (unsigned long)ptr->calls, + // (unsigned long)(ptr->calls ? (ptr->size / ptr->calls) : 0)); + + /* only match requested prefixes */ + if (pfx && (!info || strncmp(info, pfx, strlen(pfx)) != 0)) + continue; + + chunk_reset(&trash); + if (ctx->show_all) + chunk_appendf(&trash, "%s(", func); + + chunk_appendf(&trash, "%s:%d", name, ptr->caller.line); + + if (ctx->show_all) + chunk_appendf(&trash, ")"); + + while (trash.data < ctx->width) + trash.area[trash.data++] = ' '; + + chunk_appendf(&trash, "%7s size: %12lu calls: %9lu size/call: %6lu %s\n", + type, + (unsigned long)ptr->size, (unsigned long)ptr->calls, + (unsigned long)(ptr->calls ? (ptr->size / ptr->calls) : 0), + info ? info : ""); + + if (applet_putchk(appctx, &trash) == -1) { + ctx->start = ptr; + ret = 0; + goto end; + } + if (direction > 0) { + ctx->tot_size += (ulong)ptr->size; + ctx->tot_calls += (ulong)ptr->calls; + } + else if (direction < 0) { + ctx->tot_size -= (ulong)ptr->size; + ctx->tot_calls += (ulong)ptr->calls; + } + } + + /* now dump a summary */ + chunk_reset(&trash); + chunk_appendf(&trash, "Total"); + while (trash.data < ctx->width) + trash.area[trash.data++] = ' '; + + chunk_appendf(&trash, "%7s size: %12ld calls: %9lu size/call: %6ld %s\n", + "BALANCE", + ctx->tot_size, ctx->tot_calls, + (long)(ctx->tot_calls ? (ctx->tot_size / ctx->tot_calls) : 0), + "(excl. realloc)"); + + if (applet_putchk(appctx, &trash) == -1) { + ctx->start = ptr; + ret = 0; + goto end; + } + end: + return ret; +} + +/* release the "show pools" context */ +static void debug_release_memstats(struct appctx *appctx) +{ + struct dev_mem_ctx *ctx = appctx->svcctx; + + ha_free(&ctx->match); +} +#endif + +#ifdef USE_THREAD_DUMP + +/* handles DEBUGSIG to dump the state of the thread it's working on. This is + * appended at the end of thread_dump_buffer which must be protected against + * reentrance from different threads (a thread-local buffer works fine). + */ +void debug_handler(int sig, siginfo_t *si, void *arg) +{ + struct buffer *buf = HA_ATOMIC_LOAD(&th_ctx->thread_dump_buffer); + int harmless = is_thread_harmless(); + + /* first, let's check it's really for us and that we didn't just get + * a spurious DEBUGSIG. + */ + if (!buf || buf == (void*)(0x1UL)) + return; + + /* now dump the current state into the designated buffer, and indicate + * we come from a sig handler. + */ + ha_thread_dump_one(tid, 1); + + /* mark the current thread as stuck to detect it upon next invocation + * if it didn't move. 
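+	 * (A watchdog-style consumer could then act on it, sketched here as
+	 * a hypothetical illustration rather than the actual watchdog code:
+	 *
+	 *     if (_HA_ATOMIC_LOAD(&th_ctx->flags) & TH_FL_STUCK)
+	 *         ha_panic();  // still set since last period: thread is stuck
+	 * )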
+ */ + if (!harmless && + !(_HA_ATOMIC_LOAD(&th_ctx->flags) & TH_FL_SLEEPING)) + _HA_ATOMIC_OR(&th_ctx->flags, TH_FL_STUCK); +} + +static int init_debug_per_thread() +{ + sigset_t set; + + /* unblock the DEBUGSIG signal we intend to use */ + sigemptyset(&set); + sigaddset(&set, DEBUGSIG); +#if defined(DEBUG_DEV) + sigaddset(&set, SIGRTMAX); +#endif + ha_sigmask(SIG_UNBLOCK, &set, NULL); + return 1; +} + +static int init_debug() +{ + struct sigaction sa; + void *callers[1]; + + /* calling backtrace() will access libgcc at runtime. We don't want to + * do it after the chroot, so let's perform a first call to have it + * ready in memory for later use. + */ + my_backtrace(callers, sizeof(callers)/sizeof(*callers)); + sa.sa_handler = NULL; + sa.sa_sigaction = debug_handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + sigaction(DEBUGSIG, &sa, NULL); + +#if defined(DEBUG_DEV) + sa.sa_handler = NULL; + sa.sa_sigaction = debug_delay_inj_sighandler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + sigaction(SIGRTMAX, &sa, NULL); +#endif + return ERR_NONE; +} + +REGISTER_POST_CHECK(init_debug); +REGISTER_PER_THREAD_INIT(init_debug_per_thread); + +#endif /* USE_THREAD_DUMP */ + + +static void feed_post_mortem_linux() +{ +#if defined(__linux__) + struct stat statbuf; + FILE *file; + + /* DMI reports either HW or hypervisor, this allows to detect most VMs. + * On ARM the device-tree is often more precise for the model. Since many + * boards present "to be filled by OEM" or so in many fields, we dedup + * them as much as possible. + */ + if (read_line_to_trash("/sys/class/dmi/id/sys_vendor") > 0) + strlcpy2(post_mortem.platform.hw_vendor, trash.area, sizeof(post_mortem.platform.hw_vendor)); + + if (read_line_to_trash("/sys/class/dmi/id/product_family") > 0 && + strcmp(trash.area, post_mortem.platform.hw_vendor) != 0) + strlcpy2(post_mortem.platform.hw_family, trash.area, sizeof(post_mortem.platform.hw_family)); + + if ((read_line_to_trash("/sys/class/dmi/id/product_name") > 0 && + strcmp(trash.area, post_mortem.platform.hw_vendor) != 0 && + strcmp(trash.area, post_mortem.platform.hw_family) != 0)) + strlcpy2(post_mortem.platform.hw_model, trash.area, sizeof(post_mortem.platform.hw_model)); + + if ((read_line_to_trash("/sys/class/dmi/id/board_vendor") > 0 && + strcmp(trash.area, post_mortem.platform.hw_vendor) != 0)) + strlcpy2(post_mortem.platform.brd_vendor, trash.area, sizeof(post_mortem.platform.brd_vendor)); + + if ((read_line_to_trash("/sys/firmware/devicetree/base/model") > 0 && + strcmp(trash.area, post_mortem.platform.brd_vendor) != 0 && + strcmp(trash.area, post_mortem.platform.hw_vendor) != 0 && + strcmp(trash.area, post_mortem.platform.hw_family) != 0 && + strcmp(trash.area, post_mortem.platform.hw_model) != 0) || + (read_line_to_trash("/sys/class/dmi/id/board_name") > 0 && + strcmp(trash.area, post_mortem.platform.brd_vendor) != 0 && + strcmp(trash.area, post_mortem.platform.hw_vendor) != 0 && + strcmp(trash.area, post_mortem.platform.hw_family) != 0 && + strcmp(trash.area, post_mortem.platform.hw_model) != 0)) + strlcpy2(post_mortem.platform.brd_model, trash.area, sizeof(post_mortem.platform.brd_model)); + + /* Check for containers. In a container on linux we don't see keventd (2.4) kthreadd (2.6+) on pid 2 */ + if (read_line_to_trash("/proc/2/status") <= 0 || + (strcmp(trash.area, "Name:\tkthreadd") != 0 && + strcmp(trash.area, "Name:\tkeventd") != 0)) { + /* OK we're in a container. 
Docker often has /.dockerenv */ + const char *tech = "yes"; + + if (stat("/.dockerenv", &statbuf) == 0) + tech = "docker"; + strlcpy2(post_mortem.platform.cont_techno, tech, sizeof(post_mortem.platform.cont_techno)); + } + else { + strlcpy2(post_mortem.platform.cont_techno, "no", sizeof(post_mortem.platform.cont_techno)); + } + + file = fopen("/proc/cpuinfo", "r"); + if (file) { + uint cpu_implem = 0, cpu_arch = 0, cpu_variant = 0, cpu_part = 0, cpu_rev = 0; // arm + uint cpu_family = 0, model = 0, stepping = 0; // x86 + char vendor_id[64] = "", model_name[64] = ""; // x86 + char machine[64] = "", system_type[64] = "", cpu_model[64] = ""; // mips + const char *virt = "no"; + char *p, *e, *v, *lf; + + /* let's figure what CPU we're working with */ + while ((p = fgets(trash.area, trash.size, file)) != NULL) { + lf = strchr(p, '\n'); + if (lf) + *lf = 0; + + /* stop at first line break */ + if (!*p) + break; + + /* skip colon and spaces and trim spaces after name */ + v = e = strchr(p, ':'); + if (!e) + continue; + + do { *e-- = 0; } while (e >= p && (*e == ' ' || *e == '\t')); + + /* locate value after colon */ + do { v++; } while (*v == ' ' || *v == '\t'); + + /* ARM */ + if (strcmp(p, "CPU implementer") == 0) + cpu_implem = strtoul(v, NULL, 0); + else if (strcmp(p, "CPU architecture") == 0) + cpu_arch = strtoul(v, NULL, 0); + else if (strcmp(p, "CPU variant") == 0) + cpu_variant = strtoul(v, NULL, 0); + else if (strcmp(p, "CPU part") == 0) + cpu_part = strtoul(v, NULL, 0); + else if (strcmp(p, "CPU revision") == 0) + cpu_rev = strtoul(v, NULL, 0); + + /* x86 */ + else if (strcmp(p, "cpu family") == 0) + cpu_family = strtoul(v, NULL, 0); + else if (strcmp(p, "model") == 0) + model = strtoul(v, NULL, 0); + else if (strcmp(p, "stepping") == 0) + stepping = strtoul(v, NULL, 0); + else if (strcmp(p, "vendor_id") == 0) + strlcpy2(vendor_id, v, sizeof(vendor_id)); + else if (strcmp(p, "model name") == 0) + strlcpy2(model_name, v, sizeof(model_name)); + else if (strcmp(p, "flags") == 0) { + if (strstr(v, "hypervisor")) { + if (strncmp(post_mortem.platform.hw_vendor, "QEMU", 4) == 0) + virt = "qemu"; + else if (strncmp(post_mortem.platform.hw_vendor, "VMware", 6) == 0) + virt = "vmware"; + else + virt = "yes"; + } + } + + /* MIPS */ + else if (strcmp(p, "system type") == 0) + strlcpy2(system_type, v, sizeof(system_type)); + else if (strcmp(p, "machine") == 0) + strlcpy2(machine, v, sizeof(machine)); + else if (strcmp(p, "cpu model") == 0) + strlcpy2(cpu_model, v, sizeof(cpu_model)); + } + fclose(file); + + /* Machine may replace hw_product on MIPS */ + if (!*post_mortem.platform.hw_model) + strlcpy2(post_mortem.platform.hw_model, machine, sizeof(post_mortem.platform.hw_model)); + + /* SoC vendor */ + strlcpy2(post_mortem.platform.soc_vendor, vendor_id, sizeof(post_mortem.platform.soc_vendor)); + + /* SoC model */ + if (*system_type) { + /* MIPS */ + strlcpy2(post_mortem.platform.soc_model, system_type, sizeof(post_mortem.platform.soc_model)); + *system_type = 0; + } else if (*model_name) { + /* x86 */ + strlcpy2(post_mortem.platform.soc_model, model_name, sizeof(post_mortem.platform.soc_model)); + *model_name = 0; + } + + /* Create a CPU model name based on available IDs */ + if (cpu_implem) // arm + snprintf(cpu_model + strlen(cpu_model), + sizeof(cpu_model) - strlen(cpu_model), + "%sImpl %#02x", *cpu_model ? " " : "", cpu_implem); + + if (cpu_family) // x86 + snprintf(cpu_model + strlen(cpu_model), + sizeof(cpu_model) - strlen(cpu_model), + "%sFam %u", *cpu_model ? 
" " : "", cpu_family); + + if (model) // x86 + snprintf(cpu_model + strlen(cpu_model), + sizeof(cpu_model) - strlen(cpu_model), + "%sModel %u", *cpu_model ? " " : "", model); + + if (stepping) // x86 + snprintf(cpu_model + strlen(cpu_model), + sizeof(cpu_model) - strlen(cpu_model), + "%sStep %u", *cpu_model ? " " : "", stepping); + + if (cpu_arch) // arm + snprintf(cpu_model + strlen(cpu_model), + sizeof(cpu_model) - strlen(cpu_model), + "%sArch %u", *cpu_model ? " " : "", cpu_arch); + + if (cpu_part) // arm + snprintf(cpu_model + strlen(cpu_model), + sizeof(cpu_model) - strlen(cpu_model), + "%sPart %#03x", *cpu_model ? " " : "", cpu_part); + + if (cpu_variant || cpu_rev) // arm + snprintf(cpu_model + strlen(cpu_model), + sizeof(cpu_model) - strlen(cpu_model), + "%sr%up%u", *cpu_model ? " " : "", cpu_variant, cpu_rev); + + strlcpy2(post_mortem.platform.cpu_model, cpu_model, sizeof(post_mortem.platform.cpu_model)); + + if (*virt) + strlcpy2(post_mortem.platform.virt_techno, virt, sizeof(post_mortem.platform.virt_techno)); + } +#endif // __linux__ +} + +static int feed_post_mortem() +{ + /* kernel type, version and arch */ + uname(&post_mortem.platform.utsname); + + /* some boot-time info related to the process */ + post_mortem.process.pid = getpid(); + post_mortem.process.boot_uid = geteuid(); + post_mortem.process.boot_gid = getegid(); + + getrlimit(RLIMIT_NOFILE, &post_mortem.process.limit_fd); +#if defined(RLIMIT_AS) + getrlimit(RLIMIT_AS, &post_mortem.process.limit_ram); +#elif defined(RLIMIT_DATA) + getrlimit(RLIMIT_DATA, &post_mortem.process.limit_ram); +#endif + + if (strcmp(post_mortem.platform.utsname.sysname, "Linux") == 0) + feed_post_mortem_linux(); + +#if defined(HA_HAVE_DUMP_LIBS) + chunk_reset(&trash); + if (dump_libs(&trash, 1)) + post_mortem.libs = strdup(trash.area); +#endif + + return ERR_NONE; +} + +REGISTER_POST_CHECK(feed_post_mortem); + +static void deinit_post_mortem(void) +{ + int comp; + +#if defined(HA_HAVE_DUMP_LIBS) + ha_free(&post_mortem.libs); +#endif + for (comp = 0; comp < post_mortem.nb_components; comp++) { + free(post_mortem.components[comp].toolchain); + free(post_mortem.components[comp].toolchain_opts); + free(post_mortem.components[comp].build_settings); + free(post_mortem.components[comp].path); + } + ha_free(&post_mortem.components); +} + +REGISTER_POST_DEINIT(deinit_post_mortem); + +/* Appends a component to the list of post_portem info. May silently fail + * on allocation errors but we don't care since the goal is to provide info + * we have in case it helps. + */ +void post_mortem_add_component(const char *name, const char *version, + const char *toolchain, const char *toolchain_opts, + const char *build_settings, const char *path) +{ + struct post_mortem_component *comp; + int nbcomp = post_mortem.nb_components; + + comp = realloc(post_mortem.components, (nbcomp + 1) * sizeof(*comp)); + if (!comp) + return; + + memset(&comp[nbcomp], 0, sizeof(*comp)); + strlcpy2(comp[nbcomp].name, name, sizeof(comp[nbcomp].name)); + strlcpy2(comp[nbcomp].version, version, sizeof(comp[nbcomp].version)); + comp[nbcomp].toolchain = strdup(toolchain); + comp[nbcomp].toolchain_opts = strdup(toolchain_opts); + comp[nbcomp].build_settings = strdup(build_settings); + comp[nbcomp].path = strdup(path); + + post_mortem.nb_components++; + post_mortem.components = comp; +} + +#ifdef USE_THREAD +/* init code is called one at a time so let's collect all per-thread info on + * the last starting thread. 
These info are not critical anyway and there's no + * problem if we get them slightly late. + */ +static int feed_post_mortem_late() +{ + static int per_thread_info_collected; + + if (HA_ATOMIC_ADD_FETCH(&per_thread_info_collected, 1) == global.nbthread) { + int i; + for (i = 0; i < global.nbthread; i++) { + post_mortem.process.thread_info[i].pth_id = ha_thread_info[i].pth_id; + post_mortem.process.thread_info[i].stack_top = ha_thread_info[i].stack_top; + } + } + return 1; +} + +REGISTER_PER_THREAD_INIT(feed_post_mortem_late); +#endif + +/* register cli keywords */ +static struct cli_kw_list cli_kws = {{ },{ + {{ "debug", "dev", "bug", NULL }, "debug dev bug : call BUG_ON() and crash", debug_parse_cli_bug, NULL, NULL, NULL, ACCESS_EXPERT }, + {{ "debug", "dev", "check", NULL }, "debug dev check : call CHECK_IF() and possibly crash", debug_parse_cli_check, NULL, NULL, NULL, ACCESS_EXPERT }, + {{ "debug", "dev", "close", NULL }, "debug dev close <fd> : close this file descriptor", debug_parse_cli_close, NULL, NULL, NULL, ACCESS_EXPERT }, + {{ "debug", "dev", "deadlock", NULL }, "debug dev deadlock [nbtask] : deadlock between this number of tasks", debug_parse_cli_deadlock, NULL, NULL, NULL, ACCESS_EXPERT }, + {{ "debug", "dev", "delay", NULL }, "debug dev delay [ms] : sleep this long", debug_parse_cli_delay, NULL, NULL, NULL, ACCESS_EXPERT }, +#if defined(DEBUG_DEV) + {{ "debug", "dev", "delay-inj", NULL },"debug dev delay-inj <inter> <count> : inject random delays into threads", debug_parse_delay_inj, NULL, NULL, NULL, ACCESS_EXPERT }, + {{ "debug", "dev", "exec", NULL }, "debug dev exec [cmd] ... : show this command's output", debug_parse_cli_exec, NULL, NULL, NULL, ACCESS_EXPERT }, +#endif + {{ "debug", "dev", "fd", NULL }, "debug dev fd : scan for rogue/unhandled FDs", debug_parse_cli_fd, debug_iohandler_fd, NULL, NULL, ACCESS_EXPERT }, + {{ "debug", "dev", "exit", NULL }, "debug dev exit [code] : immediately exit the process", debug_parse_cli_exit, NULL, NULL, NULL, ACCESS_EXPERT }, + {{ "debug", "dev", "hash", NULL }, "debug dev hash [msg] : return msg hashed if anon is set", debug_parse_cli_hash, NULL, NULL, NULL, 0 }, + {{ "debug", "dev", "hex", NULL }, "debug dev hex <addr> [len] : dump a memory area", debug_parse_cli_hex, NULL, NULL, NULL, ACCESS_EXPERT }, + {{ "debug", "dev", "log", NULL }, "debug dev log [msg] ... 
: send this msg to global logs", debug_parse_cli_log, NULL, NULL, NULL, ACCESS_EXPERT }, + {{ "debug", "dev", "loop", NULL }, "debug dev loop <ms> [isolated] : loop this long, possibly isolated", debug_parse_cli_loop, NULL, NULL, NULL, ACCESS_EXPERT }, +#if defined(DEBUG_MEM_STATS) + {{ "debug", "dev", "memstats", NULL }, "debug dev memstats [reset|all|match ...]: dump/reset memory statistics", debug_parse_cli_memstats, debug_iohandler_memstats, debug_release_memstats, NULL, 0 }, +#endif + {{ "debug", "dev", "panic", NULL }, "debug dev panic : immediately trigger a panic", debug_parse_cli_panic, NULL, NULL, NULL, ACCESS_EXPERT }, + {{ "debug", "dev", "sched", NULL }, "debug dev sched {task|tasklet} [k=v]* : stress the scheduler", debug_parse_cli_sched, NULL, NULL, NULL, ACCESS_EXPERT }, + {{ "debug", "dev", "stream",NULL }, "debug dev stream [k=v]* : show/manipulate stream flags", debug_parse_cli_stream,NULL, NULL, NULL, ACCESS_EXPERT }, + {{ "debug", "dev", "sym", NULL }, "debug dev sym <addr> : resolve symbol address", debug_parse_cli_sym, NULL, NULL, NULL, ACCESS_EXPERT }, + {{ "debug", "dev", "task", NULL }, "debug dev task <ptr> [wake|expire|kill] : show/wake/expire/kill task/tasklet", debug_parse_cli_task, NULL, NULL, NULL, ACCESS_EXPERT }, + {{ "debug", "dev", "tkill", NULL }, "debug dev tkill [thr] [sig] : send signal to thread", debug_parse_cli_tkill, NULL, NULL, NULL, ACCESS_EXPERT }, + {{ "debug", "dev", "warn", NULL }, "debug dev warn : call WARN_ON() and possibly crash", debug_parse_cli_warn, NULL, NULL, NULL, ACCESS_EXPERT }, + {{ "debug", "dev", "write", NULL }, "debug dev write [size] : write that many bytes in return", debug_parse_cli_write, NULL, NULL, NULL, ACCESS_EXPERT }, + + {{ "show", "dev", NULL, NULL }, "show dev : show debug info for developers", debug_parse_cli_show_dev, NULL, NULL }, +#if defined(HA_HAVE_DUMP_LIBS) + {{ "show", "libs", NULL, NULL }, "show libs : show loaded object files and libraries", debug_parse_cli_show_libs, NULL, NULL }, +#endif + {{ "show", "threads", NULL, NULL }, "show threads : show some threads debugging information", NULL, cli_io_handler_show_threads, NULL }, + {{},} +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); diff --git a/src/dgram.c b/src/dgram.c new file mode 100644 index 0000000..c983c03 --- /dev/null +++ b/src/dgram.c @@ -0,0 +1,79 @@ +/* + * Datagram processing functions + * + * Copyright 2014 Baptiste Assmann <bedis9@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <haproxy/fd.h> +#include <haproxy/cfgparse.h> +#include <haproxy/dgram.h> +#include <haproxy/errors.h> +#include <haproxy/tools.h> + +/* datagram handler callback */ +void dgram_fd_handler(int fd) +{ + struct dgram_conn *dgram = fdtab[fd].owner; + + if (unlikely(!dgram)) + return; + + if (fd_recv_ready(fd)) + dgram->data->recv(dgram); + if (fd_send_ready(fd)) + dgram->data->send(dgram); + + return; +} + +/* config parser for global "tune.{rcv,snd}buf.{frontend,backend}" */ +static int dgram_parse_tune_bufs(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + int *valptr; + int val; + + if (too_many_args(1, args, err, NULL)) + return -1; + + /* "tune.rcvbuf.frontend", "tune.rcvbuf.backend", + * "tune.sndbuf.frontend", "tune.sndbuf.backend" + */ + valptr = (args[0][5] == 'r' && args[0][12] == 'f') ? &global.tune.frontend_rcvbuf : + (args[0][5] == 'r' && args[0][12] == 'b') ? &global.tune.backend_rcvbuf : + (args[0][5] == 's' && args[0][12] == 'f') ? &global.tune.frontend_sndbuf : + &global.tune.backend_sndbuf; + + if (*valptr != 0) { + memprintf(err, "parsing [%s:%d] : ignoring '%s' which was already specified.\n", file, line, args[0]); + return 1; + } + + val = atoi(args[1]); + + if (*(args[1]) == 0 || val <= 0) { + memprintf(err, "parsing [%s:%d] : '%s' expects a strictly positive integer argument.\n", file, line, args[0]); + return -1; + } + + *valptr = val; + return 0; +} + +/* register "global" section keywords */ +static struct cfg_kw_list dgram_cfg_kws = {ILH, { + { CFG_GLOBAL, "tune.rcvbuf.backend", dgram_parse_tune_bufs }, + { CFG_GLOBAL, "tune.rcvbuf.frontend", dgram_parse_tune_bufs }, + { CFG_GLOBAL, "tune.sndbuf.backend", dgram_parse_tune_bufs }, + { CFG_GLOBAL, "tune.sndbuf.frontend", dgram_parse_tune_bufs }, + { 0, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &dgram_cfg_kws); diff --git a/src/dict.c b/src/dict.c new file mode 100644 index 0000000..a225081 --- /dev/null +++ b/src/dict.c @@ -0,0 +1,127 @@ +#include <string.h> + +#include <import/eb32tree.h> +#include <import/ebistree.h> +#include <haproxy/dict.h> +#include <haproxy/thread.h> + +struct dict *new_dict(const char *name) +{ + struct dict *dict; + + dict = malloc(sizeof *dict); + if (!dict) + return NULL; + + dict->name = name; + dict->values = EB_ROOT_UNIQUE; + HA_RWLOCK_INIT(&dict->rwlock); + + return dict; +} + +/* + * Allocate a new dictionary entry with <s> as string value which is strdup()'ed. + * Returns the new allocated entry if succeeded, NULL if not. + */ +static struct dict_entry *new_dict_entry(char *s) +{ + struct dict_entry *de; + + de = calloc(1, sizeof *de); + if (!de) + return NULL; + + de->value.key = strdup(s); + if (!de->value.key) + goto err; + + de->len = strlen(s); + de->refcount = 1; + + return de; + + err: + ha_free(&de->value.key); + de->len = 0; + free(de); + return NULL; +} + +/* + * Release the memory allocated for <de> dictionary entry. + */ +static void free_dict_entry(struct dict_entry *de) +{ + de->refcount = 0; + ha_free(&de->value.key); + free(de); +} + +/* + * Simple function to lookup dictionary entries with <s> as value. + */ +static struct dict_entry *__dict_lookup(struct dict *d, const char *s) +{ + struct dict_entry *de; + struct ebpt_node *node; + + de = NULL; + node = ebis_lookup(&d->values, s); + if (node) + de = container_of(node, struct dict_entry, value); + + return de; +} + +/* + * Insert an entry in <d> dictionary with <s> as value. 
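+ * A minimal usage sketch (assuming only the dict API defined in this file):
+ *
+ *   struct dict *d = new_dict("hostnames");
+ *   struct dict_entry *de = dict_insert(d, "mail.example.org");
+ *   ...
+ *   dict_entry_unref(d, de);  // drop the reference once done with it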
 *
+ */
+struct dict_entry *dict_insert(struct dict *d, char *s)
+{
+	struct dict_entry *de;
+	struct ebpt_node *n;
+
+	HA_RWLOCK_RDLOCK(DICT_LOCK, &d->rwlock);
+	de = __dict_lookup(d, s);
+	HA_RWLOCK_RDUNLOCK(DICT_LOCK, &d->rwlock);
+	if (de) {
+		HA_ATOMIC_INC(&de->refcount);
+		return de;
+	}
+
+	de = new_dict_entry(s);
+	if (!de)
+		return NULL;
+
+	HA_RWLOCK_WRLOCK(DICT_LOCK, &d->rwlock);
+	n = ebis_insert(&d->values, &de->value);
+	HA_RWLOCK_WRUNLOCK(DICT_LOCK, &d->rwlock);
+	if (n != &de->value) {
+		free_dict_entry(de);
+		de = container_of(n, struct dict_entry, value);
+	}
+
+	return de;
+}
+
+
+/*
+ * Unreference a dict entry previously acquired with <dict_insert>.
+ * If this is the last live reference to the entry, it is
+ * removed from the dictionary.
+ */
+void dict_entry_unref(struct dict *d, struct dict_entry *de)
+{
+	if (!de)
+		return;
+
+	if (HA_ATOMIC_SUB_FETCH(&de->refcount, 1) != 0)
+		return;
+
+	HA_RWLOCK_WRLOCK(DICT_LOCK, &d->rwlock);
+	ebpt_delete(&de->value);
+	HA_RWLOCK_WRUNLOCK(DICT_LOCK, &d->rwlock);
+
+	free_dict_entry(de);
+}
diff --git a/src/dns.c b/src/dns.c
new file mode 100644
index 0000000..23e9d9d
--- /dev/null
+++ b/src/dns.c
@@ -0,0 +1,1330 @@
+/*
+ * Name server resolution
+ *
+ * Copyright 2020 HAProxy Technologies
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+
+#include <haproxy/action.h>
+#include <haproxy/api.h>
+#include <haproxy/applet.h>
+#include <haproxy/cfgparse.h>
+#include <haproxy/channel.h>
+#include <haproxy/check.h>
+#include <haproxy/cli.h>
+#include <haproxy/dgram.h>
+#include <haproxy/dns.h>
+#include <haproxy/errors.h>
+#include <haproxy/fd.h>
+#include <haproxy/log.h>
+#include <haproxy/ring.h>
+#include <haproxy/sc_strm.h>
+#include <haproxy/stconn.h>
+#include <haproxy/stream.h>
+#include <haproxy/tools.h>
+
+static THREAD_LOCAL char *dns_msg_trash;
+
+DECLARE_STATIC_POOL(dns_session_pool, "dns_session", sizeof(struct dns_session));
+DECLARE_STATIC_POOL(dns_query_pool, "dns_query", sizeof(struct dns_query));
+DECLARE_STATIC_POOL(dns_msg_buf, "dns_msg_buf", DNS_TCP_MSG_RING_MAX_SIZE);
+
+/* Opens a UDP socket on the nameserver's IP/Port, if required. Returns 0 on
+ * success, -1 otherwise. ns->dgram must be defined.
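+ *
+ * (Background, general socket API behaviour rather than anything specific
+ * to this patch: connect()ing a SOCK_DGRAM socket pins the peer address,
+ * so plain send()/recv() can be used afterwards and the kernel discards
+ * datagrams arriving from any other source:
+ *
+ *   int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
+ *   connect(fd, (struct sockaddr *)&ns_addr, sizeof(ns_addr)); // ns_addr: hypothetical
+ *   send(fd, query, qlen, 0);  // no destination argument needed anymore
+ * )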
+ */
+static int dns_connect_nameserver(struct dns_nameserver *ns)
+{
+	struct dgram_conn *dgram = &ns->dgram->conn;
+	int fd;
+
+	/* Already connected */
+	if (dgram->t.sock.fd != -1)
+		return 0;
+
+	/* Create a UDP socket and connect it to the nameserver's IP/Port */
+	if ((fd = socket(dgram->addr.to.ss_family, SOCK_DGRAM, IPPROTO_UDP)) == -1) {
+		send_log(NULL, LOG_WARNING,
+			 "DNS : section '%s': can't create socket for nameserver '%s'.\n",
+			 ns->counters->pid, ns->id);
+		return -1;
+	}
+	if (connect(fd, (struct sockaddr*)&dgram->addr.to, get_addr_len(&dgram->addr.to)) == -1) {
+		send_log(NULL, LOG_WARNING,
+			 "DNS : section '%s': can't connect socket for nameserver '%s'.\n",
+			 ns->counters->id, ns->id);
+		close(fd);
+		return -1;
+	}
+
+	/* Make the socket non-blocking */
+	fd_set_nonblock(fd);
+
+	/* Add the fd to the fd list and update its parameters */
+	dgram->t.sock.fd = fd;
+	fd_insert(fd, dgram, dgram_fd_handler, tgid, tg->threads_enabled);
+	fd_want_recv(fd);
+	return 0;
+}
+
+/* Sends a message to a name server. Returns the message length on success,
+ * -1 on error, or 0 when the output ring buffer is full.
+ */
+int dns_send_nameserver(struct dns_nameserver *ns, void *buf, size_t len)
+{
+	int ret = -1;
+
+	if (ns->dgram) {
+		struct dgram_conn *dgram = &ns->dgram->conn;
+		int fd;
+
+		HA_SPIN_LOCK(DNS_LOCK, &dgram->lock);
+		fd = dgram->t.sock.fd;
+		if (fd == -1) {
+			if (dns_connect_nameserver(ns) == -1) {
+				HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
+				return -1;
+			}
+			fd = dgram->t.sock.fd;
+		}
+
+		ret = send(fd, buf, len, 0);
+		if (ret < 0) {
+			if (errno == EAGAIN || errno == EWOULDBLOCK) {
+				struct ist myist;
+
+				myist = ist2(buf, len);
+				ret = ring_write(ns->dgram->ring_req, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1);
+				if (!ret) {
+					ns->counters->snd_error++;
+					HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
+					return -1;
+				}
+				fd_cant_send(fd);
+				HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
+				return ret;
+			}
+			ns->counters->snd_error++;
+			fd_delete(fd);
+			dgram->t.sock.fd = -1;
+			HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
+			return -1;
+		}
+		ns->counters->sent++;
+		HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
+	}
+	else if (ns->stream) {
+		struct ist myist;
+
+		myist = ist2(buf, len);
+		ret = ring_write(ns->stream->ring_req, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1);
+		if (!ret) {
+			ns->counters->snd_error++;
+			return -1;
+		}
+		task_wakeup(ns->stream->task_req, TASK_WOKEN_MSG);
+		return ret;
+	}
+
+	return ret;
+}
+
+void dns_session_free(struct dns_session *);
+
+/* Receives a DNS message. Returns the message length on success, 0 if no
+ * more messages are available, or -1 on error.
+ */
+ssize_t dns_recv_nameserver(struct dns_nameserver *ns, void *data, size_t size)
+{
+	ssize_t ret = -1;
+
+	if (ns->dgram) {
+		struct dgram_conn *dgram = &ns->dgram->conn;
+		int fd;
+
+		HA_SPIN_LOCK(DNS_LOCK, &dgram->lock);
+		fd = dgram->t.sock.fd;
+		if (fd == -1) {
+			HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
+			return -1;
+		}
+
+		if ((ret = recv(fd, data, size, 0)) < 0) {
+			if (errno == EAGAIN || errno == EWOULDBLOCK) {
+				fd_cant_recv(fd);
+				HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
+				return 0;
+			}
+			fd_delete(fd);
+			dgram->t.sock.fd = -1;
+			HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
+			return -1;
+		}
+		HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
+	}
+	else if (ns->stream) {
+		struct dns_stream_server *dss = ns->stream;
+		struct dns_session *ds;
+
+		HA_SPIN_LOCK(DNS_LOCK, &dss->lock);
+
+		if (!LIST_ISEMPTY(&dss->wait_sess)) {
+			ds = LIST_NEXT(&dss->wait_sess, struct dns_session *, waiter);
+			ret = ds->rx_msg.len < size ? ds->rx_msg.len : size;
+			memcpy(data, ds->rx_msg.area, ret);
+
+			ds->rx_msg.len = 0;
+
+			/* This barrier ensures that all data is stored before
+			 * the appctx detects that the element is out of the
+			 * list.
+			 */
+			__ha_barrier_store();
+
+			LIST_DEL_INIT(&ds->waiter);
+
+			if (ds->appctx) {
+				/* This second barrier ensures that the woken
+				 * up appctx won't miss that the element was
+				 * removed from the list.
+				 */
+				__ha_barrier_store();
+
+				/* wake the appctx up because it may have other
+				 * messages to receive
+				 */
+				appctx_wakeup(ds->appctx);
+
+				/* the dns_session could already be in the free_sess
+				 * list, so we first remove it */
+				LIST_DEL_INIT(&ds->list);
+
+				/* decrease nb_queries to free a slot for a new query on that sess */
+				ds->nb_queries--;
+				if (ds->nb_queries) {
+					/* pipelined unanswered requests remain in
+					 * this session, but we just decreased the
+					 * counter, so the session cannot be full
+					 * of pipelined requests and we can add it
+					 * to the free_sess list to receive a new
+					 * request
+					 */
+					LIST_INSERT(&ds->dss->free_sess, &ds->list);
+				}
+				else {
+					/* there are no more pipelined requests in
+					 * this session, so we move it to the
+					 * idle_sess list */
+					LIST_INSERT(&ds->dss->idle_sess, &ds->list);
+
+					/* update the counter of idle sessions */
+					ds->dss->idle_conns++;
+
+					/* Note: no need to update max_active_conns
+					 * here since we increase the idle count */
+				}
+			}
+			else {
+				/* there is no more appctx for this session,
+				 * which means it is ready to die
+				 */
+				dns_session_free(ds);
+			}
+		}
+
+		HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock);
+	}
+
+	return ret;
+}
+
+static void dns_resolve_recv(struct dgram_conn *dgram)
+{
+	struct dns_nameserver *ns;
+	int fd;
+
+	HA_SPIN_LOCK(DNS_LOCK, &dgram->lock);
+
+	fd = dgram->t.sock.fd;
+
+	/* check if ready for reading */
+	if ((fd == -1) || !fd_recv_ready(fd)) {
+		HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
+		return;
+	}
+
+	/* no need to go further if we can't retrieve the nameserver */
+	if ((ns = dgram->owner) == NULL) {
+		_HA_ATOMIC_AND(&fdtab[fd].state, ~(FD_POLL_HUP|FD_POLL_ERR));
+		fd_stop_recv(fd);
+		HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
+		return;
+	}
+
+	HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
+
+	ns->process_responses(ns);
+}
+
+/* Called when a dns network socket is ready to send data */
+static void dns_resolve_send(struct dgram_conn *dgram)
+{
+	int fd;
+	struct dns_nameserver *ns;
+	struct ring *ring;
+	struct buffer *buf;
+	uint64_t msg_len;
+	size_t len, cnt, ofs;
+
+	HA_SPIN_LOCK(DNS_LOCK, &dgram->lock);
+
+	fd = dgram->t.sock.fd;
+
+	/* check if ready for sending */
+	if ((fd == -1) || !fd_send_ready(fd)) {
+		HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
+		return;
+	}
+
+	/* no need to go further if we can't retrieve the nameserver */
+	if ((ns = dgram->owner) == NULL) {
+		_HA_ATOMIC_AND(&fdtab[fd].state, ~(FD_POLL_HUP|FD_POLL_ERR));
+		fd_stop_send(fd);
+		HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock);
+		return;
+	}
+
+	ring = ns->dgram->ring_req;
+	buf = &ring->buf;
+
+	HA_RWLOCK_RDLOCK(DNS_LOCK, &ring->lock);
+
+	/* explanation for the initialization below: it would be better to do
+	 * this in the parsing function but this would occasionally result in
+	 * dropped events because we'd take a reference on the oldest message
+	 * and keep it while being scheduled. Thus instead let's take it the
+	 * first time we enter here so that we have a chance to pass many
+	 * existing messages before grabbing a reference to a location. This
+	 * value cannot be produced after initialization.
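+ *
+ * The resulting reader protocol, sketched from the calls used below
+ * (b_peek()/b_head_ofs()/b_peek_ofs() are the buffer API):
+ *
+ *   ofs = stored_ofs - b_head_ofs(buf);   // make the offset head-relative
+ *   HA_ATOMIC_DEC(b_peek(buf, ofs));      // release the old position
+ *   ... consume messages, advancing ofs ...
+ *   HA_ATOMIC_INC(b_peek(buf, ofs));      // pin the new position
+ *   stored_ofs = b_peek_ofs(buf, ofs);    // store it back as absolute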
+ */ + if (unlikely(ns->dgram->ofs_req == ~0)) { + ns->dgram->ofs_req = b_peek_ofs(buf, 0); + HA_ATOMIC_INC(b_orig(buf) + ns->dgram->ofs_req); + } + + /* we were already there, adjust the offset to be relative to + * the buffer's head and remove us from the counter. + */ + ofs = ns->dgram->ofs_req - b_head_ofs(buf); + if (ns->dgram->ofs_req < b_head_ofs(buf)) + ofs += b_size(buf); + BUG_ON(ofs >= buf->size); + HA_ATOMIC_DEC(b_peek(buf, ofs)); + + while (ofs + 1 < b_data(buf)) { + int ret; + + cnt = 1; + len = b_peek_varint(buf, ofs + cnt, &msg_len); + if (!len) + break; + cnt += len; + BUG_ON(msg_len + ofs + cnt + 1 > b_data(buf)); + if (unlikely(msg_len > DNS_TCP_MSG_MAX_SIZE)) { + /* too large a message to ever fit, let's skip it */ + ofs += cnt + msg_len; + continue; + } + + len = b_getblk(buf, dns_msg_trash, msg_len, ofs + cnt); + + ret = send(fd, dns_msg_trash, len, 0); + if (ret < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + fd_cant_send(fd); + goto out; + } + ns->counters->snd_error++; + fd_delete(fd); + fd = dgram->t.sock.fd = -1; + goto out; + } + ns->counters->sent++; + + ofs += cnt + len; + } + + /* we don't want/need to be waked up any more for sending + * because all ring content is sent */ + fd_stop_send(fd); + +out: + HA_ATOMIC_INC(b_peek(buf, ofs)); + ns->dgram->ofs_req = b_peek_ofs(buf, ofs); + HA_RWLOCK_RDUNLOCK(DNS_LOCK, &ring->lock); + HA_SPIN_UNLOCK(DNS_LOCK, &dgram->lock); +} + +/* proto_udp callback functions for a DNS resolution */ +struct dgram_data_cb dns_dgram_cb = { + .recv = dns_resolve_recv, + .send = dns_resolve_send, +}; + +int dns_dgram_init(struct dns_nameserver *ns, struct sockaddr_storage *sk) +{ + struct dns_dgram_server *dgram; + + if ((dgram = calloc(1, sizeof(*dgram))) == NULL) + return -1; + + /* Leave dgram partially initialized, no FD attached for + * now. */ + dgram->conn.owner = ns; + dgram->conn.data = &dns_dgram_cb; + dgram->conn.t.sock.fd = -1; + dgram->conn.addr.to = *sk; + HA_SPIN_INIT(&dgram->conn.lock); + ns->dgram = dgram; + + dgram->ofs_req = ~0; /* init ring offset */ + dgram->ring_req = ring_new(2*DNS_TCP_MSG_RING_MAX_SIZE); + if (!dgram->ring_req) { + ha_alert("memory allocation error initializing the ring for nameserver.\n"); + goto out; + } + + /* attach the task as reader */ + if (!ring_attach(dgram->ring_req)) { + /* mark server attached to the ring */ + ha_alert("nameserver sets too many watchers > 255 on ring. This is a bug and should not happen.\n"); + goto out; + } + return 0; +out: + ring_free(dgram->ring_req); + + free(dgram); + + return -1; +} + +/* + * IO Handler to handle message push to dns tcp server + * It takes its context from appctx->svcctx. 
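+ *
+ * Wire-format reminder (TCP DNS framing per RFC 7766, stated here as
+ * background): each message is preceded by a 2-byte network-order length,
+ * and the first two payload bytes carry the query ID that this handler
+ * substitutes on the way out, i.e. roughly:
+ *
+ *   uint16_t slen = htons((uint16_t)msg_len);    // length prefix
+ *   uint16_t qid  = htons(++ds->query_counter);  // substituted query ID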
+ */ +static void dns_session_io_handler(struct appctx *appctx) +{ + struct stconn *sc = appctx_sc(appctx); + struct dns_session *ds = appctx->svcctx; + struct ring *ring = &ds->ring; + struct buffer *buf = &ring->buf; + uint64_t msg_len; + int available_room; + size_t len, cnt, ofs; + int ret = 0; + + if (unlikely(se_fl_test(appctx->sedesc, (SE_FL_EOS|SE_FL_ERROR|SE_FL_SHR|SE_FL_SHW)))) { + co_skip(sc_oc(sc), co_data(sc_oc(sc))); + goto out; + } + + /* if stopping was requested, close immediately */ + if (unlikely(stopping)) + goto close; + + /* we want to be sure to not miss that we have been awaked for a shutdown */ + __ha_barrier_load(); + + /* that means the connection was requested to shutdown + * for instance idle expire */ + if (ds->shutdown) + goto close; + + /* if the connection is not established, inform the stream that we want + * to be notified whenever the connection completes. + */ + if (sc_opposite(sc)->state < SC_ST_EST) { + applet_need_more_data(appctx); + se_need_remote_conn(appctx->sedesc); + applet_have_more_data(appctx); + goto out; + } + + HA_RWLOCK_WRLOCK(DNS_LOCK, &ring->lock); + LIST_DEL_INIT(&appctx->wait_entry); + HA_RWLOCK_WRUNLOCK(DNS_LOCK, &ring->lock); + + HA_RWLOCK_RDLOCK(DNS_LOCK, &ring->lock); + + /* explanation for the initialization below: it would be better to do + * this in the parsing function but this would occasionally result in + * dropped events because we'd take a reference on the oldest message + * and keep it while being scheduled. Thus instead let's take it the + * first time we enter here so that we have a chance to pass many + * existing messages before grabbing a reference to a location. This + * value cannot be produced after initialization. + */ + if (unlikely(ds->ofs == ~0)) { + ds->ofs = b_peek_ofs(buf, 0); + HA_ATOMIC_INC(b_orig(buf) + ds->ofs); + } + + /* we were already there, adjust the offset to be relative to + * the buffer's head and remove us from the counter. + */ + ofs = ds->ofs - b_head_ofs(buf); + if (ds->ofs < b_head_ofs(buf)) + ofs += b_size(buf); + + BUG_ON(ofs >= buf->size); + HA_ATOMIC_DEC(b_peek(buf, ofs)); + + /* in following loop, ofs always points to the counter byte that + * precedes the message so that we can take our reference there if we + * have to stop before the end (ret=0). 
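+	 *
+	 * Ring layout around <ofs>, for reference (one refcount byte, then a
+	 * varint-encoded length, then the message itself):
+	 *
+	 *   ... [cnt][varint len][message bytes] [cnt][varint len][msg] ...
+	 *        ^ofs              ^ofs+cnt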
+ */ + ret = 1; + while (ofs + 1 < b_data(buf)) { + struct dns_query *query; + uint16_t original_qid; + uint16_t new_qid; + + cnt = 1; + len = b_peek_varint(buf, ofs + cnt, &msg_len); + if (!len) + break; + cnt += len; + BUG_ON(msg_len + ofs + cnt + 1 > b_data(buf)); + + /* retrieve available room on output channel */ + available_room = channel_recv_max(sc_ic(sc)); + + /* tx_msg_offset null means we are at the start of a new message */ + if (!ds->tx_msg_offset) { + uint16_t slen; + + /* check if there is enough room to put message len and query id */ + if (available_room < sizeof(slen) + sizeof(new_qid)) { + sc_need_room(sc, sizeof(slen) + sizeof(new_qid)); + ret = 0; + break; + } + + /* put msg len into then channel */ + slen = (uint16_t)msg_len; + slen = htons(slen); + applet_putblk(appctx, (char *)&slen, sizeof(slen)); + available_room -= sizeof(slen); + + /* backup original query id */ + len = b_getblk(buf, (char *)&original_qid, sizeof(original_qid), ofs + cnt); + if (!len) { + /* should never happen since messages are atomically + * written into ring + */ + ret = 0; + break; + } + + /* generates new query id */ + new_qid = ++ds->query_counter; + new_qid = htons(new_qid); + + /* put new query id into the channel */ + applet_putblk(appctx, (char *)&new_qid, sizeof(new_qid)); + available_room -= sizeof(new_qid); + + /* keep query id mapping */ + + query = pool_alloc(dns_query_pool); + if (query) { + query->qid.key = new_qid; + query->original_qid = original_qid; + query->expire = tick_add(now_ms, 5000); + LIST_INIT(&query->list); + if (LIST_ISEMPTY(&ds->queries)) { + /* enable task to handle expire */ + ds->task_exp->expire = query->expire; + /* ensure this will be executed by the same + * thread than ds_session_release + * to ensure session_release is free + * to destroy the task */ + task_queue(ds->task_exp); + } + LIST_APPEND(&ds->queries, &query->list); + eb32_insert(&ds->query_ids, &query->qid); + ds->onfly_queries++; + } + + /* update the tx_offset to handle output in 16k streams */ + ds->tx_msg_offset = sizeof(original_qid); + + } + + /* check if it remains available room on output chan */ + if (unlikely(!available_room)) { + sc_need_room(sc, 1); + ret = 0; + break; + } + + chunk_reset(&trash); + if ((msg_len - ds->tx_msg_offset) > available_room) { + /* remaining msg data is too large to be written in output channel at one time */ + + len = b_getblk(buf, trash.area, available_room, ofs + cnt + ds->tx_msg_offset); + + /* update offset to complete mesg forwarding later */ + ds->tx_msg_offset += len; + } + else { + /* remaining msg data can be written in output channel at one time */ + len = b_getblk(buf, trash.area, msg_len - ds->tx_msg_offset, ofs + cnt + ds->tx_msg_offset); + + /* reset tx_msg_offset to mark forward fully processed */ + ds->tx_msg_offset = 0; + } + trash.data += len; + + if (applet_putchk(appctx, &trash) == -1) { + /* should never happen since we + * check available_room is large + * enough here. 
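+				 * (applet_putchk() can only reject the chunk
+				 * for lack of room, and <trash> was filled
+				 * based on <available_room> just above, so
+				 * this branch is not expected to trigger.)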
+				 */
+				ret = 0;
+				break;
+			}
+
+			if (ds->tx_msg_offset) {
+				/* msg was not fully processed, we must be woken
+				 * up again to drain pending data */
+				sc_need_room(sc, 0);
+				ret = 0;
+				break;
+			}
+			/* switch to next message */
+			ofs += cnt + msg_len;
+		}
+
+		HA_ATOMIC_INC(b_peek(buf, ofs));
+		ds->ofs = b_peek_ofs(buf, ofs);
+
+		HA_RWLOCK_RDUNLOCK(DNS_LOCK, &ring->lock);
+
+		if (ret) {
+			/* let's be woken up once a new request to write arrives */
+			HA_RWLOCK_WRLOCK(DNS_LOCK, &ring->lock);
+			BUG_ON(LIST_INLIST(&appctx->wait_entry));
+			LIST_APPEND(&ring->waiters, &appctx->wait_entry);
+			HA_RWLOCK_WRUNLOCK(DNS_LOCK, &ring->lock);
+			applet_have_no_more_data(appctx);
+		}
+
+		/* if the session is not a waiter it means there is no committed
+		 * message in rx_buf and we are free to use it.
+		 * Note: we need a load barrier here to not miss the
+		 * delete from the list.
+		 */
+		__ha_barrier_load();
+		if (!LIST_INLIST_ATOMIC(&ds->waiter)) {
+			while (1) {
+				uint16_t query_id;
+				struct eb32_node *eb;
+				struct dns_query *query;
+
+				if (!ds->rx_msg.len) {
+					/* retrieve message len */
+					ret = co_getblk(sc_oc(sc), (char *)&msg_len, 2, 0);
+					if (ret <= 0) {
+						if (ret == -1)
+							goto error;
+						applet_need_more_data(appctx);
+						break;
+					}
+
+					/* mark as consumed */
+					co_skip(sc_oc(sc), 2);
+
+					/* store message len */
+					ds->rx_msg.len = ntohs(msg_len);
+					if (!ds->rx_msg.len)
+						continue;
+				}
+
+				if (co_data(sc_oc(sc)) + ds->rx_msg.offset < ds->rx_msg.len) {
+					/* message only partially available */
+
+					/* read available data */
+					ret = co_getblk(sc_oc(sc), ds->rx_msg.area + ds->rx_msg.offset, co_data(sc_oc(sc)), 0);
+					if (ret <= 0) {
+						if (ret == -1)
+							goto error;
+						applet_need_more_data(appctx);
+						break;
+					}
+
+					/* update message offset */
+					ds->rx_msg.offset += co_data(sc_oc(sc));
+
+					/* consume all pending data from the channel */
+					co_skip(sc_oc(sc), co_data(sc_oc(sc)));
+
+					/* we need to wait for more data */
+					applet_need_more_data(appctx);
+					break;
+				}
+
+				/* enough data is available in the channel to read the message until the end */
+
+				/* read from the channel until the end of the message */
+				ret = co_getblk(sc_oc(sc), ds->rx_msg.area + ds->rx_msg.offset, ds->rx_msg.len - ds->rx_msg.offset, 0);
+				if (ret <= 0) {
+					if (ret == -1)
+						goto error;
+					applet_need_more_data(appctx);
+					break;
+				}
+
+				/* consume all data until the end of the message from the channel */
+				co_skip(sc_oc(sc), ds->rx_msg.len - ds->rx_msg.offset);
+
+				/* reset the reader offset to 0 for the next message read */
+				ds->rx_msg.offset = 0;
+
+				/* try to remap the query id to the original one */
+				memcpy(&query_id, ds->rx_msg.area, sizeof(query_id));
+				eb = eb32_lookup(&ds->query_ids, query_id);
+				if (!eb) {
+					/* query id not found means we have an unknown corresponding
+					 * request, perhaps a server bug or the query reached
+					 * its timeout
+					 */
+					ds->rx_msg.len = 0;
+					continue;
+				}
+
+				/* re-map the original query id set by the requester */
+				query = eb32_entry(eb, struct dns_query, qid);
+				memcpy(ds->rx_msg.area, &query->original_qid, sizeof(query->original_qid));
+
+				/* remove the query id mapping from the pending queries list/tree */
+				eb32_delete(&query->qid);
+				LIST_DELETE(&query->list);
+				pool_free(dns_query_pool, query);
+				ds->onfly_queries--;
+
+				/* the dns_session is also queued into the
+				 * wait_sess list where the task processing
+				 * responses will pop available responses
+				 */
+				HA_SPIN_LOCK(DNS_LOCK, &ds->dss->lock);
+
+				BUG_ON(LIST_INLIST(&ds->waiter));
+				LIST_APPEND(&ds->dss->wait_sess, &ds->waiter);
+
+				HA_SPIN_UNLOCK(DNS_LOCK, &ds->dss->lock);
+
+				/* wake up the task
processing the responses */ + task_wakeup(ds->dss->task_rsp, TASK_WOKEN_INIT); + + break; + } + } + +out: + return; + +close: + se_fl_set(appctx->sedesc, SE_FL_EOS|SE_FL_EOI); + goto out; + +error: + se_fl_set(appctx->sedesc, SE_FL_ERROR); + goto out; +} + +void dns_queries_flush(struct dns_session *ds) +{ + struct dns_query *query, *queryb; + + list_for_each_entry_safe(query, queryb, &ds->queries, list) { + eb32_delete(&query->qid); + LIST_DELETE(&query->list); + pool_free(dns_query_pool, query); + } +} + +void dns_session_free(struct dns_session *ds) +{ + pool_free(dns_msg_buf, ds->rx_msg.area); + pool_free(dns_msg_buf, ds->tx_ring_area); + task_destroy(ds->task_exp); + + dns_queries_flush(ds); + + /* Ensure to remove this session from external lists + * Note: we are under the lock of dns_stream_server + * which own the heads of those lists. + */ + LIST_DEL_INIT(&ds->waiter); + LIST_DEL_INIT(&ds->list); + + ds->dss->cur_conns--; + /* Note: this is useless to update + * max_active_conns here because + * we decrease the value + */ + + BUG_ON(!LIST_ISEMPTY(&ds->list)); + BUG_ON(!LIST_ISEMPTY(&ds->waiter)); + BUG_ON(!LIST_ISEMPTY(&ds->queries)); + BUG_ON(!LIST_ISEMPTY(&ds->ring.waiters)); + BUG_ON(!eb_is_empty(&ds->query_ids)); + pool_free(dns_session_pool, ds); +} + +static struct appctx *dns_session_create(struct dns_session *ds); + +static int dns_session_init(struct appctx *appctx) +{ + struct dns_session *ds = appctx->svcctx; + struct stream *s; + struct sockaddr_storage *addr = NULL; + + if (!sockaddr_alloc(&addr, &ds->dss->srv->addr, sizeof(ds->dss->srv->addr))) + goto error; + + if (appctx_finalize_startup(appctx, ds->dss->srv->proxy, &BUF_NULL) == -1) + goto error; + + s = appctx_strm(appctx); + s->scb->dst = addr; + s->scb->flags |= (SC_FL_RCV_ONCE|SC_FL_NOLINGER); + s->target = &ds->dss->srv->obj_type; + s->flags = SF_ASSIGNED; + + s->do_log = NULL; + s->uniq_id = 0; + + applet_expect_no_data(appctx); + ds->appctx = appctx; + return 0; + + error: + return -1; +} + +/* + * Function to release a DNS tcp session + */ +static void dns_session_release(struct appctx *appctx) +{ + struct dns_session *ds = appctx->svcctx; + struct dns_stream_server *dss __maybe_unused; + + if (!ds) + return; + + /* We do not call ring_appctx_detach here + * because we want to keep readers counters + * to retry a conn with a different appctx. 
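+ * (i.e. the ring offset and its refcount byte survive this appctx, so a
+ * replacement appctx created later can resume from the same position.)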
+ */ + HA_RWLOCK_WRLOCK(DNS_LOCK, &ds->ring.lock); + LIST_DEL_INIT(&appctx->wait_entry); + HA_RWLOCK_WRUNLOCK(DNS_LOCK, &ds->ring.lock); + + dss = ds->dss; + + HA_SPIN_LOCK(DNS_LOCK, &dss->lock); + LIST_DEL_INIT(&ds->list); + + if (stopping) { + dns_session_free(ds); + HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock); + return; + } + + if (!ds->nb_queries) { + /* this is an idle session */ + /* Note: this is useless to update max_active_sess + * here because we decrease idle_conns but + * dns_session_free decrease curconns + */ + + ds->dss->idle_conns--; + dns_session_free(ds); + HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock); + return; + } + + if (ds->onfly_queries == ds->nb_queries) { + /* the session can be released because + * it means that all queries AND + * responses are in fly */ + dns_session_free(ds); + HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock); + return; + } + + /* if there is no pending complete response + * message, ensure to reset + * message offsets if the session + * was closed with an incomplete pending response + */ + if (!LIST_INLIST(&ds->waiter)) + ds->rx_msg.len = ds->rx_msg.offset = 0; + + /* we flush pending sent queries because we never + * have responses + */ + ds->nb_queries -= ds->onfly_queries; + dns_queries_flush(ds); + + /* reset offset to be sure to start from message start */ + ds->tx_msg_offset = 0; + + /* here the ofs and the attached counter + * are kept unchanged + */ + + /* Create a new appctx, We hope we can + * create from the release callback! */ + ds->appctx = dns_session_create(ds); + if (!ds->appctx) { + dns_session_free(ds); + HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock); + return; + } + + if (ds->nb_queries < DNS_STREAM_MAX_PIPELINED_REQ) + LIST_INSERT(&ds->dss->free_sess, &ds->list); + + HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock); +} + +/* DNS tcp session applet */ +static struct applet dns_session_applet = { + .obj_type = OBJ_TYPE_APPLET, + .name = "<STRMDNS>", /* used for logging */ + .fct = dns_session_io_handler, + .init = dns_session_init, + .release = dns_session_release, +}; + +/* + * Function used to create an appctx for a DNS session + * It sets its context into appctx->svcctx. 
+ */ +static struct appctx *dns_session_create(struct dns_session *ds) +{ + struct appctx *appctx; + + appctx = appctx_new_here(&dns_session_applet, NULL); + if (!appctx) + goto out_close; + appctx->svcctx = (void *)ds; + + if (appctx_init(appctx) == -1) { + ha_alert("out of memory in dns_session_create().\n"); + goto out_free_appctx; + } + + return appctx; + + /* Error unrolling */ + out_free_appctx: + appctx_free_on_early_error(appctx); + out_close: + return NULL; +} + +/* Task processing expiration of unresponded queries, this one is supposed + * to be stuck on the same thread than the appctx handler + */ +static struct task *dns_process_query_exp(struct task *t, void *context, unsigned int state) +{ + struct dns_session *ds = (struct dns_session *)context; + struct dns_query *query, *queryb; + + t->expire = TICK_ETERNITY; + + list_for_each_entry_safe(query, queryb, &ds->queries, list) { + if (tick_is_expired(query->expire, now_ms)) { + eb32_delete(&query->qid); + LIST_DELETE(&query->list); + pool_free(dns_query_pool, query); + ds->onfly_queries--; + } + else { + t->expire = query->expire; + break; + } + } + + return t; +} + +/* Task processing expiration of idle sessions */ +static struct task *dns_process_idle_exp(struct task *t, void *context, unsigned int state) +{ + struct dns_stream_server *dss = (struct dns_stream_server *)context; + struct dns_session *ds, *dsb; + int target = 0; + int cur_active_conns; + + HA_SPIN_LOCK(DNS_LOCK, &dss->lock); + + + cur_active_conns = dss->cur_conns - dss->idle_conns; + if (cur_active_conns > dss->max_active_conns) + dss->max_active_conns = cur_active_conns; + + target = (dss->max_active_conns - cur_active_conns) / 2; + list_for_each_entry_safe(ds, dsb, &dss->idle_sess, list) { + if (!stopping && !target) + break; + + /* remove conn to pending list to ensure it won't be reused */ + LIST_DEL_INIT(&ds->list); + + /* force session shutdown */ + ds->shutdown = 1; + + /* to be sure that the appctx won't miss shutdown */ + __ha_barrier_store(); + + /* wake appctx to perform the shutdown */ + appctx_wakeup(ds->appctx); + } + + /* reset max to current active conns */ + dss->max_active_conns = cur_active_conns; + + HA_SPIN_UNLOCK(DNS_LOCK, &dss->lock); + + t->expire = tick_add(now_ms, 5000); + + return t; +} + +struct dns_session *dns_session_new(struct dns_stream_server *dss) +{ + struct dns_session *ds; + + if (dss->maxconn && (dss->maxconn <= dss->cur_conns)) + return NULL; + + ds = pool_zalloc(dns_session_pool); + if (!ds) + return NULL; + + ds->ofs = ~0; + ds->dss = dss; + LIST_INIT(&ds->list); + LIST_INIT(&ds->queries); + LIST_INIT(&ds->waiter); + ds->rx_msg.offset = ds->rx_msg.len = 0; + ds->rx_msg.area = NULL; + ds->tx_ring_area = NULL; + ds->task_exp = NULL; + ds->appctx = NULL; + ds->shutdown = 0; + ds->nb_queries = 0; + ds->query_ids = EB_ROOT_UNIQUE; + ds->rx_msg.area = pool_alloc(dns_msg_buf); + if (!ds->rx_msg.area) + goto error; + + ds->tx_ring_area = pool_alloc(dns_msg_buf); + if (!ds->tx_ring_area) + goto error; + + ring_init(&ds->ring, ds->tx_ring_area, DNS_TCP_MSG_RING_MAX_SIZE); + /* never fail because it is the first watcher attached to the ring */ + DISGUISE(ring_attach(&ds->ring)); + + if ((ds->task_exp = task_new_here()) == NULL) + goto error; + + ds->task_exp->process = dns_process_query_exp; + ds->task_exp->context = ds; + + ds->appctx = dns_session_create(ds); + if (!ds->appctx) + goto error; + + dss->cur_conns++; + + return ds; + +error: + task_destroy(ds->task_exp); + pool_free(dns_msg_buf, ds->rx_msg.area); + 
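/* note: pool_free() and task_destroy() quietly ignore NULL pointers (an
+	 * assumption about the pool/task API, recorded here for readers), so
+	 * this path is safe whichever allocation above failed first */
+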
pool_free(dns_msg_buf, ds->tx_ring_area); + + pool_free(dns_session_pool, ds); + + return NULL; +} + +/* + * Task used to consume pending messages from nameserver ring + * and forward them to dns_session ring. + * Note: If no slot found a new dns_session is allocated + */ +static struct task *dns_process_req(struct task *t, void *context, unsigned int state) +{ + struct dns_nameserver *ns = (struct dns_nameserver *)context; + struct dns_stream_server *dss = ns->stream; + struct ring *ring = dss->ring_req; + struct buffer *buf = &ring->buf; + uint64_t msg_len; + size_t len, cnt, ofs; + struct dns_session *ds, *ads; + HA_SPIN_LOCK(DNS_LOCK, &dss->lock); + + HA_RWLOCK_RDLOCK(DNS_LOCK, &ring->lock); + + /* explanation for the initialization below: it would be better to do + * this in the parsing function but this would occasionally result in + * dropped events because we'd take a reference on the oldest message + * and keep it while being scheduled. Thus instead let's take it the + * first time we enter here so that we have a chance to pass many + * existing messages before grabbing a reference to a location. This + * value cannot be produced after initialization. + */ + if (unlikely(dss->ofs_req == ~0)) { + dss->ofs_req = b_peek_ofs(buf, 0); + HA_ATOMIC_INC(b_orig(buf) + dss->ofs_req); + } + + /* we were already there, adjust the offset to be relative to + * the buffer's head and remove us from the counter. + */ + ofs = dss->ofs_req - b_head_ofs(buf); + if (dss->ofs_req < b_head_ofs(buf)) + ofs += b_size(buf); + + BUG_ON(ofs >= buf->size); + HA_ATOMIC_DEC(b_peek(buf, ofs)); + + while (ofs + 1 < b_data(buf)) { + struct ist myist; + + cnt = 1; + len = b_peek_varint(buf, ofs + cnt, &msg_len); + if (!len) + break; + cnt += len; + BUG_ON(msg_len + ofs + cnt + 1 > b_data(buf)); + if (unlikely(msg_len > DNS_TCP_MSG_MAX_SIZE)) { + /* too large a message to ever fit, let's skip it */ + ofs += cnt + msg_len; + continue; + } + + len = b_getblk(buf, dns_msg_trash, msg_len, ofs + cnt); + + myist = ist2(dns_msg_trash, len); + + ads = NULL; + /* try to push request into active sess with free slot */ + if (!LIST_ISEMPTY(&dss->free_sess)) { + ds = LIST_NEXT(&dss->free_sess, struct dns_session *, list); + + if (ring_write(&ds->ring, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1) > 0) { + ds->nb_queries++; + if (ds->nb_queries >= DNS_STREAM_MAX_PIPELINED_REQ) + LIST_DEL_INIT(&ds->list); + ads = ds; + } + else { + /* it means we were unable to put a request in this slot, + * it may be close to be full so we put it at the end + * of free conn list */ + LIST_DEL_INIT(&ds->list); + LIST_APPEND(&dss->free_sess, &ds->list); + } + } + + if (!ads) { + /* try to push request into idle, this one should have enough free space */ + if (!LIST_ISEMPTY(&dss->idle_sess)) { + ds = LIST_NEXT(&dss->idle_sess, struct dns_session *, list); + + /* ring is empty so this ring_write should never fail */ + ring_write(&ds->ring, DNS_TCP_MSG_MAX_SIZE, NULL, 0, &myist, 1); + ds->nb_queries++; + LIST_DEL_INIT(&ds->list); + + ds->dss->idle_conns--; + + /* we may have to update the max_active_conns */ + if (ds->dss->max_active_conns < ds->dss->cur_conns - ds->dss->idle_conns) + ds->dss->max_active_conns = ds->dss->cur_conns - ds->dss->idle_conns; + + /* since we may unable to find a free list to handle + * this request, this request may be large and fill + * the ring buffer so we prefer to put at the end of free + * list. 
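+ * (Putting it at the tail gives every other free session a chance to be
+ * picked first, which spreads large requests across connections.)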
+
+/*
+ * Task used to consume responses.
+ * Note: the upper layer callback is called.
+ */
+static struct task *dns_process_rsp(struct task *t, void *context, unsigned int state)
+{
+	struct dns_nameserver *ns = (struct dns_nameserver *)context;
+
+	ns->process_responses(ns);
+
+	return t;
+}
+
+/* Function used to initialize a TCP nameserver */
+int dns_stream_init(struct dns_nameserver *ns, struct server *srv)
+{
+	struct dns_stream_server *dss = NULL;
+
+	dss = calloc(1, sizeof(*dss));
+	if (!dss) {
+		ha_alert("memory allocation error initializing dns tcp server '%s'.\n", srv->id);
+		goto out;
+	}
+
+	dss->srv = srv;
+	dss->maxconn = srv->maxconn;
+
+	dss->ofs_req = ~0; /* init ring offset */
+	dss->ring_req = ring_new(2*DNS_TCP_MSG_RING_MAX_SIZE);
+	if (!dss->ring_req) {
+		ha_alert("memory allocation error initializing the ring for dns tcp server '%s'.\n", srv->id);
+		goto out;
+	}
+	/* Create the task responsible for forwarding requests to the target server */
+	if ((dss->task_req = task_new_anywhere()) == NULL) {
+		ha_alert("memory allocation error initializing the request task for dns tcp server '%s'.\n", srv->id);
+		goto out;
+	}
+
+	/* Update the task's parameters */
+	dss->task_req->process = dns_process_req;
+	dss->task_req->context = ns;
+
+	/* attach the task as a reader on the ring */
+	if (!ring_attach(dss->ring_req)) {
+		ha_alert("server '%s': too many watchers for ring. This should never happen.\n", srv->id);
+		goto out;
+	}
+
+	/* Create the task responsible for processing responses */
+	if ((dss->task_rsp = task_new_anywhere()) == NULL) {
+		ha_alert("memory allocation error initializing the response task for dns tcp server '%s'.\n", srv->id);
+		goto out;
+	}
+
+	/* Update the task's parameters */
+	dss->task_rsp->process = dns_process_rsp;
+	dss->task_rsp->context = ns;
+
+	/* Create the task responsible for expiring idle sessions */
+	if ((dss->task_idle = task_new_anywhere()) == NULL) {
+		ha_alert("memory allocation error initializing the idle task for dns tcp server '%s'.\n", srv->id);
+		goto out;
+	}
+
+	/* Update the task's parameters */
+	dss->task_idle->process = dns_process_idle_exp;
+	dss->task_idle->context = dss;
+	dss->task_idle->expire = tick_add(now_ms, 5000);
+
+	/* let's start the task so that idle conns are freed immediately */
+	task_queue(dss->task_idle);
+
+	LIST_INIT(&dss->free_sess);
+	LIST_INIT(&dss->idle_sess);
+	LIST_INIT(&dss->wait_sess);
+	HA_SPIN_INIT(&dss->lock);
+	ns->stream = dss;
+	return 0;
+out:
+	if (dss && dss->task_rsp)
+		task_destroy(dss->task_rsp);
+	if (dss && dss->task_req)
+		task_destroy(dss->task_req);
+	if (dss && dss->ring_req)
+		ring_free(dss->ring_req);
+
+	free(dss);
+	return -1;
+}
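
The idle reaper installed above re-arms itself: dns_process_idle_exp() returns its task after pushing the expiry date another five seconds forward with tick_add(). A toy standalone model of that self-re-arming periodic task, with a crude loop standing in for HAProxy's scheduler (all names hypothetical):

#include <stdio.h>

struct toy_task {
	unsigned int expire; /* next wake-up tick, in ms */
	struct toy_task *(*process)(struct toy_task *t, unsigned int now);
};

static struct toy_task *reap_idle(struct toy_task *t, unsigned int now)
{
	printf("reaping idle sessions at tick %u\n", now);
	t->expire = now + 5000; /* re-arm 5s later, like tick_add() above */
	return t;               /* returning the task keeps it scheduled */
}

int main(void)
{
	struct toy_task task = { .expire = 5000, .process = reap_idle };
	unsigned int now;

	/* crude stand-in for the scheduler: run the task whenever its
	 * expiry date has passed */
	for (now = 0; now <= 20000; now += 1000)
		if (now >= task.expire)
			task.process(&task, now);
	return 0;
}
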
+
+int init_dns_buffers()
+{
+	dns_msg_trash = malloc(DNS_TCP_MSG_MAX_SIZE);
+	if (!dns_msg_trash)
+		return 0;
+
+	return 1;
+}
+
+void deinit_dns_buffers()
+{
+	ha_free(&dns_msg_trash);
+}
+
+REGISTER_PER_THREAD_ALLOC(init_dns_buffers);
+REGISTER_PER_THREAD_FREE(deinit_dns_buffers);
diff --git a/src/dynbuf.c b/src/dynbuf.c
new file mode 100644
index 0000000..712e334
--- /dev/null
+++ b/src/dynbuf.c
@@ -0,0 +1,129 @@
+/*
+ * Buffer management functions.
+ *
+ * Copyright 2000-2012 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <haproxy/api.h>
+#include <haproxy/dynbuf.h>
+#include <haproxy/global.h>
+#include <haproxy/list.h>
+#include <haproxy/pool.h>
+
+struct pool_head *pool_head_buffer __read_mostly;
+
+/* perform minimal initializations, report 0 in case of error, 1 if OK. */
+int init_buffer()
+{
+	void *buffer;
+	int thr;
+	int done;
+
+	pool_head_buffer = create_pool("buffer", global.tune.bufsize, MEM_F_SHARED|MEM_F_EXACT);
+	if (!pool_head_buffer)
+		return 0;
+
+	for (thr = 0; thr < MAX_THREADS; thr++)
+		LIST_INIT(&ha_thread_ctx[thr].buffer_wq);
+
+	/* The reserved buffer is what we leave behind us. Thus we always need
+	 * at least one extra buffer in minavail, otherwise we'll end up waking
+	 * up tasks with no memory available, causing a lot of useless wakeups.
+	 * That means that we always want to have at least 3 buffers available
+	 * (2 for the current session, one for the next session that might be
+	 * needed to release a server connection).
+	 */
+	pool_head_buffer->minavail = MAX(global.tune.reserved_bufs, 3);
+	if (global.tune.buf_limit)
+		pool_head_buffer->limit = global.tune.buf_limit;
+
+	for (done = 0; done < pool_head_buffer->minavail - 1; done++) {
+		buffer = pool_alloc_nocache(pool_head_buffer, init_buffer);
+		if (!buffer)
+			return 0;
+		pool_free(pool_head_buffer, buffer);
+	}
+	return 1;
+}
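
The warm-up loop at the end of init_buffer() pre-faults minavail - 1 buffers: pool_alloc_nocache() deliberately bypasses the pool's cache so each iteration really allocates a fresh object, and pool_free() then parks it in the pool for cheap reuse. A standalone sketch of the same idea on a hypothetical free-list pool:

#include <stdlib.h>

struct toy_pool {
	void *free_list;  /* singly-linked list threaded through free objects */
	size_t obj_size;  /* must be >= sizeof(void *) to hold the list link */
};

static void toy_pool_free(struct toy_pool *p, void *obj)
{
	*(void **)obj = p->free_list; /* push onto the free list */
	p->free_list = obj;
}

static void *toy_pool_alloc(struct toy_pool *p)
{
	void *obj = p->free_list;

	if (obj) {
		p->free_list = *(void **)obj; /* fast path: pop from the list */
		return obj;
	}
	return malloc(p->obj_size);   /* cold path: go to the system */
}

/* pre-fault <count> objects so early users never hit the cold path */
static int toy_pool_warm(struct toy_pool *p, int count)
{
	while (count--) {
		/* allocate from the system on purpose, bypassing the free
		 * list, the way pool_alloc_nocache() does above */
		void *obj = malloc(p->obj_size);

		if (!obj)
			return 0;
		toy_pool_free(p, obj); /* park it for later reuse */
	}
	return 1;
}
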
+
+/*
+ * Dumps part or all of a buffer.
+ */
+void buffer_dump(FILE *o, struct buffer *b, int from, int to)
+{
+	fprintf(o, "Dumping buffer %p\n", b);
+	fprintf(o, "        orig=%p size=%u head=%u tail=%u data=%u\n",
+		b_orig(b), (unsigned int)b_size(b), (unsigned int)b_head_ofs(b), (unsigned int)b_tail_ofs(b), (unsigned int)b_data(b));
+
+	fprintf(o, "Dumping contents from byte %d to byte %d\n", from, to);
+	fprintf(o, "         0  1  2  3  4  5  6  7    8  9  a  b  c  d  e  f\n");
+	/* dump hex bytes */
+	while (from < to) {
+		int i;
+
+		fprintf(o, "  %04x: ", from);
+		for (i = 0; ((from + i) < to) && (i < 16) ; i++) {
+			fprintf(o, "%02x ", (unsigned char)b_orig(b)[from + i]);
+			if (i == 7)
+				fprintf(o, "- ");
+		}
+		if (to - from < 16) {
+			int j = 0;
+
+			for (j = 0; j < from + 16 - to; j++)
+				fprintf(o, "   ");
+			if (j > 8)
+				fprintf(o, "  ");
+		}
+		fprintf(o, "  ");
+		for (i = 0; (from + i < to) && (i < 16) ; i++) {
+			fprintf(o, "%c", isprint((unsigned char)b_orig(b)[from + i]) ? b_orig(b)[from + i] : '.') ;
+			if ((i == 15) && ((from + i) != to-1))
+				fprintf(o, "\n");
+		}
+		from += i;
+	}
+	fprintf(o, "\n--\n");
+	fflush(o);
+}
+
+/* see offer_buffers() for details */
+void __offer_buffers(void *from, unsigned int count)
+{
+	struct buffer_wait *wait, *wait_back;
+
+	/* For now, we consider that all objects need 1 buffer, so we can stop
+	 * waking them up once we have enough of them to eat all the available
+	 * buffers. Note that we don't really know if they are streams or just
+	 * other tasks, but that's a rough estimate. Similarly, for each cached
+	 * event we'll need 1 buffer.
+	 */
+	list_for_each_entry_safe(wait, wait_back, &th_ctx->buffer_wq, list) {
+		if (!count)
+			break;
+
+		if (wait->target == from || !wait->wakeup_cb(wait->target))
+			continue;
+
+		LIST_DEL_INIT(&wait->list);
+		count--;
+	}
+}
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/src/eb32sctree.c b/src/eb32sctree.c
new file mode 100644
index 0000000..af6a539
--- /dev/null
+++ b/src/eb32sctree.c
@@ -0,0 +1,472 @@
+/*
+ * Elastic Binary Trees - exported functions for operations on 32bit nodes.
+ * Version 6.0.6 with backports from v7-dev
+ * (C) 2002-2011 - Willy Tarreau <w@1wt.eu>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Consult eb32sctree.h for more details about those functions */
+
+#include <import/eb32sctree.h>
+
+
+/* This function is used to build a tree of duplicates by adding a new node to
+ * a subtree of at least 2 entries.
+ */ +struct eb32sc_node *eb32sc_insert_dup(struct eb_node *sub, struct eb_node *new, unsigned long scope) +{ + struct eb32sc_node *eb32; + struct eb_node *head = sub; + eb_troot_t *new_left = eb_dotag(&new->branches, EB_LEFT); + eb_troot_t *new_rght = eb_dotag(&new->branches, EB_RGHT); + eb_troot_t *new_leaf = eb_dotag(&new->branches, EB_LEAF); + + /* first, identify the deepest hole on the right branch */ + while (eb_gettag(head->branches.b[EB_RGHT]) != EB_LEAF) { + struct eb_node *last = head; + + head = container_of(eb_untag(head->branches.b[EB_RGHT], EB_NODE), + struct eb_node, branches); + + if (unlikely(head->bit > last->bit + 1)) { + /* there's a hole here, we must assign the top of the + * following sub-tree to <sub> and mark all intermediate + * nodes with the scope mask. + */ + do { + eb32 = container_of(sub, struct eb32sc_node, node); + if (!(eb32->node_s & scope)) + eb32->node_s |= scope; + + sub = container_of(eb_untag(sub->branches.b[EB_RGHT], EB_NODE), + struct eb_node, branches); + } while (sub != head); + } + + eb32 = container_of(head, struct eb32sc_node, node); + if (!(eb32->node_s & scope)) + eb32->node_s |= scope; + } + + /* Here we have a leaf attached to (head)->b[EB_RGHT] */ + if (head->bit < -1) { + /* A hole exists just before the leaf, we insert there */ + new->bit = -1; + sub = container_of(eb_untag(head->branches.b[EB_RGHT], EB_LEAF), + struct eb_node, branches); + head->branches.b[EB_RGHT] = eb_dotag(&new->branches, EB_NODE); + + new->node_p = sub->leaf_p; + new->leaf_p = new_rght; + sub->leaf_p = new_left; + new->branches.b[EB_LEFT] = eb_dotag(&sub->branches, EB_LEAF); + new->branches.b[EB_RGHT] = new_leaf; + eb32 = container_of(new, struct eb32sc_node, node); + eb32->node_s = container_of(sub, struct eb32sc_node, node)->leaf_s | scope; + return eb32; + } else { + int side; + /* No hole was found before a leaf. We have to insert above + * <sub>. Note that we cannot be certain that <sub> is attached + * to the right of its parent, as this is only true if <sub> + * is inside the dup tree, not at the head. + */ + new->bit = sub->bit - 1; /* install at the lowest level */ + side = eb_gettag(sub->node_p); + head = container_of(eb_untag(sub->node_p, side), struct eb_node, branches); + head->branches.b[side] = eb_dotag(&new->branches, EB_NODE); + + new->node_p = sub->node_p; + new->leaf_p = new_rght; + sub->node_p = new_left; + new->branches.b[EB_LEFT] = eb_dotag(&sub->branches, EB_NODE); + new->branches.b[EB_RGHT] = new_leaf; + eb32 = container_of(new, struct eb32sc_node, node); + eb32->node_s = container_of(sub, struct eb32sc_node, node)->node_s | scope; + return eb32; + } +} + +/* Insert eb32sc_node <new> into subtree starting at node root <root>. Only + * new->key needs be set with the key. The eb32sc_node is returned. This + * implementation does NOT support unique trees. 
+ */ +struct eb32sc_node *eb32sc_insert(struct eb_root *root, struct eb32sc_node *new, unsigned long scope) +{ + struct eb32sc_node *old; + unsigned int side; + eb_troot_t *troot, **up_ptr; + u32 newkey; /* caching the key saves approximately one cycle */ + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf; + int old_node_bit; + unsigned long old_scope; + + side = EB_LEFT; + troot = root->b[EB_LEFT]; + if (unlikely(troot == NULL)) { + /* Tree is empty, insert the leaf part below the left branch */ + root->b[EB_LEFT] = eb_dotag(&new->node.branches, EB_LEAF); + new->node.leaf_p = eb_dotag(root, EB_LEFT); + new->node.node_p = NULL; /* node part unused */ + new->node_s = scope; + new->leaf_s = scope; + return new; + } + + /* The tree descent is fairly easy : + * - first, check if we have reached a leaf node + * - second, check if we have gone too far + * - third, reiterate + * Everywhere, we use <new> for the node node we are inserting, <root> + * for the node we attach it to, and <old> for the node we are + * displacing below <new>. <troot> will always point to the future node + * (tagged with its type). <side> carries the side the node <new> is + * attached to below its parent, which is also where previous node + * was attached. <newkey> carries the key being inserted. + */ + newkey = new->key; + + while (1) { + if (eb_gettag(troot) == EB_LEAF) { + /* insert above a leaf */ + old = container_of(eb_untag(troot, EB_LEAF), + struct eb32sc_node, node.branches); + new->node.node_p = old->node.leaf_p; + up_ptr = &old->node.leaf_p; + old_scope = old->leaf_s; + break; + } + + /* OK we're walking down this link */ + old = container_of(eb_untag(troot, EB_NODE), + struct eb32sc_node, node.branches); + old_node_bit = old->node.bit; + + /* our new node will be found through this one, we must mark it */ + if ((old->node_s | scope) != old->node_s) + old->node_s |= scope; + + /* Stop going down when we don't have common bits anymore. We + * also stop in front of a duplicates tree because it means we + * have to insert above. + */ + + if ((old_node_bit < 0) || /* we're above a duplicate tree, stop here */ + (((new->key ^ old->key) >> old_node_bit) >= EB_NODE_BRANCHES)) { + /* The tree did not contain the key, so we insert <new> before the node + * <old>, and set ->bit to designate the lowest bit position in <new> + * which applies to ->branches.b[]. + */ + new->node.node_p = old->node.node_p; + up_ptr = &old->node.node_p; + old_scope = old->node_s; + break; + } + + /* walk down */ + root = &old->node.branches; + side = (newkey >> old_node_bit) & EB_NODE_BRANCH_MASK; + troot = root->b[side]; + } + + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + + /* We need the common higher bits between new->key and old->key. + * What differences are there between new->key and the node here ? + * NOTE that bit(new) is always < bit(root) because highest + * bit of new->key and old->key are identical here (otherwise they + * would sit on different branches). 
+ */ + + // note that if EB_NODE_BITS > 1, we should check that it's still >= 0 + new->node.bit = flsnz(new->key ^ old->key) - EB_NODE_BITS; + new->leaf_s = scope; + new->node_s = old_scope | scope; + + if (new->key == old->key) { + new->node.bit = -1; /* mark as new dup tree, just in case */ + + if (eb_gettag(troot) != EB_LEAF) { + /* there was already a dup tree below */ + return eb32sc_insert_dup(&old->node, &new->node, scope); + } + /* otherwise fall through */ + } + + if (new->key >= old->key) { + new->node.branches.b[EB_LEFT] = troot; + new->node.branches.b[EB_RGHT] = new_leaf; + new->node.leaf_p = new_rght; + *up_ptr = new_left; + } + else { + new->node.branches.b[EB_LEFT] = new_leaf; + new->node.branches.b[EB_RGHT] = troot; + new->node.leaf_p = new_left; + *up_ptr = new_rght; + } + + /* Ok, now we are inserting <new> between <root> and <old>. <old>'s + * parent is already set to <new>, and the <root>'s branch is still in + * <side>. Update the root's leaf till we have it. Note that we can also + * find the side by checking the side of new->node.node_p. + */ + + root->b[side] = eb_dotag(&new->node.branches, EB_NODE); + return new; +} + +/* + * Find the first occurrence of the lowest key in the tree <root>, which is + * equal to or greater than <x>. NULL is returned is no key matches. + */ +struct eb32sc_node *eb32sc_lookup_ge(struct eb_root *root, u32 x, unsigned long scope) +{ + struct eb32sc_node *node; + eb_troot_t *troot; + + troot = root->b[EB_LEFT]; + if (unlikely(troot == NULL)) + return NULL; + + while (1) { + if ((eb_gettag(troot) == EB_LEAF)) { + /* We reached a leaf, which means that the whole upper + * parts were common. We will return either the current + * node or its next one if the former is too small. + */ + node = container_of(eb_untag(troot, EB_LEAF), + struct eb32sc_node, node.branches); + if ((node->leaf_s & scope) && node->key >= x) + return node; + /* return next */ + troot = node->node.leaf_p; + break; + } + node = container_of(eb_untag(troot, EB_NODE), + struct eb32sc_node, node.branches); + + if (node->node.bit < 0) { + /* We're at the top of a dup tree. Either we got a + * matching value and we return the leftmost node, or + * we don't and we skip the whole subtree to return the + * next node after the subtree. Note that since we're + * at the top of the dup tree, we can simply return the + * next node without first trying to escape from the + * tree. + */ + if ((node->node_s & scope) && node->key >= x) + troot = eb_dotag(&node->node.branches, EB_LEFT); + else + troot = node->node.node_p; + break; + } + + if (((x ^ node->key) >> node->node.bit) >= EB_NODE_BRANCHES) { + /* No more common bits at all. Either this node is too + * large and we need to get its lowest value, or it is too + * small, and we need to get the next value. + */ + if ((node->node_s & scope) && (node->key >> node->node.bit) > (x >> node->node.bit)) + troot = eb_dotag(&node->node.branches, EB_LEFT); + else + troot = node->node.node_p; + break; + } + troot = node->node.branches.b[(x >> node->node.bit) & EB_NODE_BRANCH_MASK]; + } + + /* If we get here, it means we want to report next node after the + * current one which is not below. <troot> is already initialised + * to the parent's branches. + */ + return eb32sc_next_with_parent(troot, scope); +} + +/* + * Find the first occurrence of the lowest key in the tree <root> which is + * equal to or greater than <x>, matching scope <scope>. If not found, it loops + * back to the beginning of the tree. NULL is returned is no key matches. 
+ */ +struct eb32sc_node *eb32sc_lookup_ge_or_first(struct eb_root *root, u32 x, unsigned long scope) +{ + struct eb32sc_node *eb32; + eb_troot_t *troot; + + troot = root->b[EB_LEFT]; + if (unlikely(troot == NULL)) + return NULL; + + while (1) { + if ((eb_gettag(troot) == EB_LEAF)) { + /* We reached a leaf, which means that the whole upper + * parts were common. We will return either the current + * node or its next one if the former is too small. + */ + eb32 = container_of(eb_untag(troot, EB_LEAF), + struct eb32sc_node, node.branches); + if ((eb32->leaf_s & scope) && eb32->key >= x) + return eb32; + /* return next */ + troot = eb32->node.leaf_p; + break; + } + eb32 = container_of(eb_untag(troot, EB_NODE), + struct eb32sc_node, node.branches); + + if (eb32->node.bit < 0) { + /* We're at the top of a dup tree. Either we got a + * matching value and we return the leftmost node, or + * we don't and we skip the whole subtree to return the + * next node after the subtree. Note that since we're + * at the top of the dup tree, we can simply return the + * next node without first trying to escape from the + * tree. + */ + if ((eb32->node_s & scope) && eb32->key >= x) + troot = eb_dotag(&eb32->node.branches, EB_LEFT); + else + troot = eb32->node.node_p; + break; + } + + if (((x ^ eb32->key) >> eb32->node.bit) >= EB_NODE_BRANCHES) { + /* No more common bits at all. Either this node is too + * large and we need to get its lowest value, or it is too + * small, and we need to get the next value. + */ + if ((eb32->node_s & scope) && (eb32->key >> eb32->node.bit) > (x >> eb32->node.bit)) + troot = eb_dotag(&eb32->node.branches, EB_LEFT); + else + troot = eb32->node.node_p; + break; + } + troot = eb32->node.branches.b[(x >> eb32->node.bit) & EB_NODE_BRANCH_MASK]; + } + + /* If we get here, it means we want to report next node after the + * current one which is not below. <troot> is already initialised + * to the parent's branches. + */ + eb32 = eb32sc_next_with_parent(troot, scope); + if (!eb32) + eb32 = eb32sc_walk_down_left(root->b[EB_LEFT], scope); + + return eb32; +} + +/* Removes a leaf node from the tree if it was still in it. Marks the node + * as unlinked. + */ +void eb32sc_delete(struct eb32sc_node *eb32) +{ + struct eb_node *node = &eb32->node; + unsigned int pside, gpside, sibtype; + struct eb_node *parent; + struct eb_root *gparent; + unsigned long scope; + + if (!node->leaf_p) + return; + + /* we need the parent, our side, and the grand parent */ + pside = eb_gettag(node->leaf_p); + parent = eb_root_to_node(eb_untag(node->leaf_p, pside)); + + /* We likely have to release the parent link, unless it's the root, + * in which case we only set our branch to NULL. Note that we can + * only be attached to the root by its left branch. + */ + + if (eb_clrtag(parent->branches.b[EB_RGHT]) == NULL) { + /* we're just below the root, it's trivial. */ + parent->branches.b[EB_LEFT] = NULL; + goto delete_unlink; + } + + /* To release our parent, we have to identify our sibling, and reparent + * it directly to/from the grand parent. Note that the sibling can + * either be a link or a leaf. 
+ */ + + gpside = eb_gettag(parent->node_p); + gparent = eb_untag(parent->node_p, gpside); + + gparent->b[gpside] = parent->branches.b[!pside]; + sibtype = eb_gettag(gparent->b[gpside]); + + if (sibtype == EB_LEAF) { + eb_root_to_node(eb_untag(gparent->b[gpside], EB_LEAF))->leaf_p = + eb_dotag(gparent, gpside); + } else { + eb_root_to_node(eb_untag(gparent->b[gpside], EB_NODE))->node_p = + eb_dotag(gparent, gpside); + } + /* Mark the parent unused. Note that we do not check if the parent is + * our own node, but that's not a problem because if it is, it will be + * marked unused at the same time, which we'll use below to know we can + * safely remove it. + */ + parent->node_p = NULL; + + /* The parent node has been detached, and is currently unused. It may + * belong to another node, so we cannot remove it that way. Also, our + * own node part might still be used. so we can use this spare node + * to replace ours if needed. + */ + + /* If our link part is unused, we can safely exit now */ + if (!node->node_p) + goto delete_unlink; + + /* From now on, <node> and <parent> are necessarily different, and the + * <node>'s node part is in use. By definition, <parent> is at least + * below <node>, so keeping its key for the bit string is OK. However + * its scope must be enlarged to cover the new branch it absorbs. + */ + + parent->node_p = node->node_p; + parent->branches = node->branches; + parent->bit = node->bit; + + /* We must now update the new node's parent... */ + gpside = eb_gettag(parent->node_p); + gparent = eb_untag(parent->node_p, gpside); + gparent->b[gpside] = eb_dotag(&parent->branches, EB_NODE); + + /* ... and its branches */ + scope = 0; + for (pside = 0; pside <= 1; pside++) { + if (eb_gettag(parent->branches.b[pside]) == EB_NODE) { + eb_root_to_node(eb_untag(parent->branches.b[pside], EB_NODE))->node_p = + eb_dotag(&parent->branches, pside); + scope |= container_of(eb_untag(parent->branches.b[pside], EB_NODE), struct eb32sc_node, node.branches)->node_s; + } else { + eb_root_to_node(eb_untag(parent->branches.b[pside], EB_LEAF))->leaf_p = + eb_dotag(&parent->branches, pside); + scope |= container_of(eb_untag(parent->branches.b[pside], EB_LEAF), struct eb32sc_node, node.branches)->leaf_s; + } + } + container_of(parent, struct eb32sc_node, node)->node_s = scope; + + delete_unlink: + /* Now the node has been completely unlinked */ + node->leaf_p = NULL; + return; /* tree is not empty yet */ +} diff --git a/src/eb32tree.c b/src/eb32tree.c new file mode 100644 index 0000000..38ddab0 --- /dev/null +++ b/src/eb32tree.c @@ -0,0 +1,218 @@ +/* + * Elastic Binary Trees - exported functions for operations on 32bit nodes. + * Version 6.0.6 + * (C) 2002-2011 - Willy Tarreau <w@1wt.eu> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* Consult eb32tree.h for more details about those functions */ + +#include <import/eb32tree.h> + +struct eb32_node *eb32_insert(struct eb_root *root, struct eb32_node *new) +{ + return __eb32_insert(root, new); +} + +struct eb32_node *eb32i_insert(struct eb_root *root, struct eb32_node *new) +{ + return __eb32i_insert(root, new); +} + +struct eb32_node *eb32_lookup(struct eb_root *root, u32 x) +{ + return __eb32_lookup(root, x); +} + +struct eb32_node *eb32i_lookup(struct eb_root *root, s32 x) +{ + return __eb32i_lookup(root, x); +} + +/* + * Find the last occurrence of the highest key in the tree <root>, which is + * equal to or less than <x>. NULL is returned is no key matches. + */ +struct eb32_node *eb32_lookup_le(struct eb_root *root, u32 x) +{ + struct eb32_node *node; + eb_troot_t *troot; + + troot = root->b[EB_LEFT]; + if (unlikely(troot == NULL)) + return NULL; + + while (1) { + if ((eb_gettag(troot) == EB_LEAF)) { + /* We reached a leaf, which means that the whole upper + * parts were common. We will return either the current + * node or its next one if the former is too small. + */ + node = container_of(eb_untag(troot, EB_LEAF), + struct eb32_node, node.branches); + if (node->key <= x) + return node; + /* return prev */ + troot = node->node.leaf_p; + break; + } + node = container_of(eb_untag(troot, EB_NODE), + struct eb32_node, node.branches); + + if (node->node.bit < 0) { + /* We're at the top of a dup tree. Either we got a + * matching value and we return the rightmost node, or + * we don't and we skip the whole subtree to return the + * prev node before the subtree. Note that since we're + * at the top of the dup tree, we can simply return the + * prev node without first trying to escape from the + * tree. + */ + if (node->key <= x) { + troot = node->node.branches.b[EB_RGHT]; + while (eb_gettag(troot) != EB_LEAF) + troot = (eb_untag(troot, EB_NODE))->b[EB_RGHT]; + return container_of(eb_untag(troot, EB_LEAF), + struct eb32_node, node.branches); + } + /* return prev */ + troot = node->node.node_p; + break; + } + + if (((x ^ node->key) >> node->node.bit) >= EB_NODE_BRANCHES) { + /* No more common bits at all. Either this node is too + * small and we need to get its highest value, or it is + * too large, and we need to get the prev value. + */ + if ((node->key >> node->node.bit) < (x >> node->node.bit)) { + troot = node->node.branches.b[EB_RGHT]; + return eb32_entry(eb_walk_down(troot, EB_RGHT), struct eb32_node, node); + } + + /* Further values will be too high here, so return the prev + * unique node (if it exists). + */ + troot = node->node.node_p; + break; + } + troot = node->node.branches.b[(x >> node->node.bit) & EB_NODE_BRANCH_MASK]; + } + + /* If we get here, it means we want to report previous node before the + * current one which is not above. <troot> is already initialised to + * the parent's branches. + */ + while (eb_gettag(troot) == EB_LEFT) { + /* Walking up from left branch. We must ensure that we never + * walk beyond root. 
+ */ + if (unlikely(eb_clrtag((eb_untag(troot, EB_LEFT))->b[EB_RGHT]) == NULL)) + return NULL; + troot = (eb_root_to_node(eb_untag(troot, EB_LEFT)))->node_p; + } + /* Note that <troot> cannot be NULL at this stage */ + troot = (eb_untag(troot, EB_RGHT))->b[EB_LEFT]; + node = eb32_entry(eb_walk_down(troot, EB_RGHT), struct eb32_node, node); + return node; +} + +/* + * Find the first occurrence of the lowest key in the tree <root>, which is + * equal to or greater than <x>. NULL is returned is no key matches. + */ +struct eb32_node *eb32_lookup_ge(struct eb_root *root, u32 x) +{ + struct eb32_node *node; + eb_troot_t *troot; + + troot = root->b[EB_LEFT]; + if (unlikely(troot == NULL)) + return NULL; + + while (1) { + if ((eb_gettag(troot) == EB_LEAF)) { + /* We reached a leaf, which means that the whole upper + * parts were common. We will return either the current + * node or its next one if the former is too small. + */ + node = container_of(eb_untag(troot, EB_LEAF), + struct eb32_node, node.branches); + if (node->key >= x) + return node; + /* return next */ + troot = node->node.leaf_p; + break; + } + node = container_of(eb_untag(troot, EB_NODE), + struct eb32_node, node.branches); + + if (node->node.bit < 0) { + /* We're at the top of a dup tree. Either we got a + * matching value and we return the leftmost node, or + * we don't and we skip the whole subtree to return the + * next node after the subtree. Note that since we're + * at the top of the dup tree, we can simply return the + * next node without first trying to escape from the + * tree. + */ + if (node->key >= x) { + troot = node->node.branches.b[EB_LEFT]; + while (eb_gettag(troot) != EB_LEAF) + troot = (eb_untag(troot, EB_NODE))->b[EB_LEFT]; + return container_of(eb_untag(troot, EB_LEAF), + struct eb32_node, node.branches); + } + /* return next */ + troot = node->node.node_p; + break; + } + + if (((x ^ node->key) >> node->node.bit) >= EB_NODE_BRANCHES) { + /* No more common bits at all. Either this node is too + * large and we need to get its lowest value, or it is too + * small, and we need to get the next value. + */ + if ((node->key >> node->node.bit) > (x >> node->node.bit)) { + troot = node->node.branches.b[EB_LEFT]; + return eb32_entry(eb_walk_down(troot, EB_LEFT), struct eb32_node, node); + } + + /* Further values will be too low here, so return the next + * unique node (if it exists). + */ + troot = node->node.node_p; + break; + } + troot = node->node.branches.b[(x >> node->node.bit) & EB_NODE_BRANCH_MASK]; + } + + /* If we get here, it means we want to report next node after the + * current one which is not below. <troot> is already initialised + * to the parent's branches. + */ + while (eb_gettag(troot) != EB_LEFT) + /* Walking up from right branch, so we cannot be below root */ + troot = (eb_root_to_node(eb_untag(troot, EB_RGHT)))->node_p; + + /* Note that <troot> cannot be NULL at this stage */ + troot = (eb_untag(troot, EB_LEFT))->b[EB_RGHT]; + if (eb_clrtag(troot) == NULL) + return NULL; + + node = eb32_entry(eb_walk_down(troot, EB_LEFT), struct eb32_node, node); + return node; +} diff --git a/src/eb64tree.c b/src/eb64tree.c new file mode 100644 index 0000000..b908d4d --- /dev/null +++ b/src/eb64tree.c @@ -0,0 +1,218 @@ +/* + * Elastic Binary Trees - exported functions for operations on 64bit nodes. 
+ * Version 6.0.6 + * (C) 2002-2011 - Willy Tarreau <w@1wt.eu> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* Consult eb64tree.h for more details about those functions */ + +#include <import/eb64tree.h> + +struct eb64_node *eb64_insert(struct eb_root *root, struct eb64_node *new) +{ + return __eb64_insert(root, new); +} + +struct eb64_node *eb64i_insert(struct eb_root *root, struct eb64_node *new) +{ + return __eb64i_insert(root, new); +} + +struct eb64_node *eb64_lookup(struct eb_root *root, u64 x) +{ + return __eb64_lookup(root, x); +} + +struct eb64_node *eb64i_lookup(struct eb_root *root, s64 x) +{ + return __eb64i_lookup(root, x); +} + +/* + * Find the last occurrence of the highest key in the tree <root>, which is + * equal to or less than <x>. NULL is returned is no key matches. + */ +struct eb64_node *eb64_lookup_le(struct eb_root *root, u64 x) +{ + struct eb64_node *node; + eb_troot_t *troot; + + troot = root->b[EB_LEFT]; + if (unlikely(troot == NULL)) + return NULL; + + while (1) { + if ((eb_gettag(troot) == EB_LEAF)) { + /* We reached a leaf, which means that the whole upper + * parts were common. We will return either the current + * node or its next one if the former is too small. + */ + node = container_of(eb_untag(troot, EB_LEAF), + struct eb64_node, node.branches); + if (node->key <= x) + return node; + /* return prev */ + troot = node->node.leaf_p; + break; + } + node = container_of(eb_untag(troot, EB_NODE), + struct eb64_node, node.branches); + + if (node->node.bit < 0) { + /* We're at the top of a dup tree. Either we got a + * matching value and we return the rightmost node, or + * we don't and we skip the whole subtree to return the + * prev node before the subtree. Note that since we're + * at the top of the dup tree, we can simply return the + * prev node without first trying to escape from the + * tree. + */ + if (node->key <= x) { + troot = node->node.branches.b[EB_RGHT]; + while (eb_gettag(troot) != EB_LEAF) + troot = (eb_untag(troot, EB_NODE))->b[EB_RGHT]; + return container_of(eb_untag(troot, EB_LEAF), + struct eb64_node, node.branches); + } + /* return prev */ + troot = node->node.node_p; + break; + } + + if (((x ^ node->key) >> node->node.bit) >= EB_NODE_BRANCHES) { + /* No more common bits at all. Either this node is too + * small and we need to get its highest value, or it is + * too large, and we need to get the prev value. + */ + if ((node->key >> node->node.bit) < (x >> node->node.bit)) { + troot = node->node.branches.b[EB_RGHT]; + return eb64_entry(eb_walk_down(troot, EB_RGHT), struct eb64_node, node); + } + + /* Further values will be too high here, so return the prev + * unique node (if it exists). 
+ */ + troot = node->node.node_p; + break; + } + troot = node->node.branches.b[(x >> node->node.bit) & EB_NODE_BRANCH_MASK]; + } + + /* If we get here, it means we want to report previous node before the + * current one which is not above. <troot> is already initialised to + * the parent's branches. + */ + while (eb_gettag(troot) == EB_LEFT) { + /* Walking up from left branch. We must ensure that we never + * walk beyond root. + */ + if (unlikely(eb_clrtag((eb_untag(troot, EB_LEFT))->b[EB_RGHT]) == NULL)) + return NULL; + troot = (eb_root_to_node(eb_untag(troot, EB_LEFT)))->node_p; + } + /* Note that <troot> cannot be NULL at this stage */ + troot = (eb_untag(troot, EB_RGHT))->b[EB_LEFT]; + node = eb64_entry(eb_walk_down(troot, EB_RGHT), struct eb64_node, node); + return node; +} + +/* + * Find the first occurrence of the lowest key in the tree <root>, which is + * equal to or greater than <x>. NULL is returned is no key matches. + */ +struct eb64_node *eb64_lookup_ge(struct eb_root *root, u64 x) +{ + struct eb64_node *node; + eb_troot_t *troot; + + troot = root->b[EB_LEFT]; + if (unlikely(troot == NULL)) + return NULL; + + while (1) { + if ((eb_gettag(troot) == EB_LEAF)) { + /* We reached a leaf, which means that the whole upper + * parts were common. We will return either the current + * node or its next one if the former is too small. + */ + node = container_of(eb_untag(troot, EB_LEAF), + struct eb64_node, node.branches); + if (node->key >= x) + return node; + /* return next */ + troot = node->node.leaf_p; + break; + } + node = container_of(eb_untag(troot, EB_NODE), + struct eb64_node, node.branches); + + if (node->node.bit < 0) { + /* We're at the top of a dup tree. Either we got a + * matching value and we return the leftmost node, or + * we don't and we skip the whole subtree to return the + * next node after the subtree. Note that since we're + * at the top of the dup tree, we can simply return the + * next node without first trying to escape from the + * tree. + */ + if (node->key >= x) { + troot = node->node.branches.b[EB_LEFT]; + while (eb_gettag(troot) != EB_LEAF) + troot = (eb_untag(troot, EB_NODE))->b[EB_LEFT]; + return container_of(eb_untag(troot, EB_LEAF), + struct eb64_node, node.branches); + } + /* return next */ + troot = node->node.node_p; + break; + } + + if (((x ^ node->key) >> node->node.bit) >= EB_NODE_BRANCHES) { + /* No more common bits at all. Either this node is too + * large and we need to get its lowest value, or it is too + * small, and we need to get the next value. + */ + if ((node->key >> node->node.bit) > (x >> node->node.bit)) { + troot = node->node.branches.b[EB_LEFT]; + return eb64_entry(eb_walk_down(troot, EB_LEFT), struct eb64_node, node); + } + + /* Further values will be too low here, so return the next + * unique node (if it exists). + */ + troot = node->node.node_p; + break; + } + troot = node->node.branches.b[(x >> node->node.bit) & EB_NODE_BRANCH_MASK]; + } + + /* If we get here, it means we want to report next node after the + * current one which is not below. <troot> is already initialised + * to the parent's branches. 
+ */
+	while (eb_gettag(troot) != EB_LEFT)
+		/* Walking up from right branch, so we cannot be below root */
+		troot = (eb_root_to_node(eb_untag(troot, EB_RGHT)))->node_p;
+
+	/* Note that <troot> cannot be NULL at this stage */
+	troot = (eb_untag(troot, EB_LEFT))->b[EB_RGHT];
+	if (eb_clrtag(troot) == NULL)
+		return NULL;
+
+	node = eb64_entry(eb_walk_down(troot, EB_LEFT), struct eb64_node, node);
+	return node;
+}
diff --git a/src/ebimtree.c b/src/ebimtree.c
new file mode 100644
index 0000000..1ac444a
--- /dev/null
+++ b/src/ebimtree.c
@@ -0,0 +1,44 @@
+/*
+ * Elastic Binary Trees - exported functions for Indirect Multi-Byte data nodes.
+ * Version 6.0.6
+ * (C) 2002-2011 - Willy Tarreau <w@1wt.eu>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Consult ebimtree.h for more details about those functions */
+
+#include <import/ebimtree.h>
+#include <import/ebpttree.h>
+
+/* Find the first occurrence of a key of <len> bytes in the tree <root>.
+ * If none can be found, return NULL.
+ */
+struct ebpt_node *
+ebim_lookup(struct eb_root *root, const void *x, unsigned int len)
+{
+	return __ebim_lookup(root, x, len);
+}
+
+/* Insert ebpt_node <new> into the subtree starting at node root <root>.
+ * Only new->key needs to be set with the key. The ebpt_node is returned.
+ * If root->b[EB_RGHT]==1, the tree may only contain unique keys.
+ * <len> is specified in bytes.
+ */
+struct ebpt_node *
+ebim_insert(struct eb_root *root, struct ebpt_node *new, unsigned int len)
+{
+	return __ebim_insert(root, new, len);
+}
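
The "indirect" trees store a pointer to the key rather than the key bytes themselves, so the key must outlive the node. A usage sketch under that assumption, with a hypothetical record type built on the API declared above (ebpt_node, ebim_insert(), ebim_lookup()):

#include <string.h>

#include <import/ebimtree.h>
#include <import/ebpttree.h>

#define REC_KEYLEN 16

struct record {
	struct ebpt_node node;    /* node.key will point into <id> below */
	char id[REC_KEYLEN];      /* fixed-size binary key owned by the record */
};

static struct eb_root records = EB_ROOT; /* duplicates allowed */

static void record_store(struct record *r, const void *id)
{
	memcpy(r->id, id, REC_KEYLEN);
	r->node.key = r->id;      /* indirect node: the key is a pointer */
	ebim_insert(&records, &r->node, REC_KEYLEN);
}

static struct record *record_find(const void *id)
{
	struct ebpt_node *n = ebim_lookup(&records, id, REC_KEYLEN);

	return n ? container_of(n, struct record, node) : NULL;
}
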
diff --git a/src/ebistree.c b/src/ebistree.c
new file mode 100644
index 0000000..193950d
--- /dev/null
+++ b/src/ebistree.c
@@ -0,0 +1,42 @@
+/*
+ * Elastic Binary Trees - exported functions for Indirect String data nodes.
+ * Version 6.0.6
+ * (C) 2002-2011 - Willy Tarreau <w@1wt.eu>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Consult ebistree.h for more details about those functions */
+
+#include <import/ebistree.h>
+
+/* Find the first occurrence of a zero-terminated string <x> in the tree <root>.
+ * It's the caller's responsibility to use this function only on trees which
+ * only contain zero-terminated strings. If none can be found, return NULL.
+ */
+struct ebpt_node *ebis_lookup(struct eb_root *root, const char *x)
+{
+	return __ebis_lookup(root, x);
+}
+
+/* Insert ebpt_node <new> into the subtree starting at node root <root>. Only
+ * new->key needs to be set with the zero-terminated string key. The ebpt_node
+ * is returned. If root->b[EB_RGHT]==1, the tree may only contain unique keys.
+ * The caller is responsible for properly terminating the key with a zero.
+ */
+struct ebpt_node *ebis_insert(struct eb_root *root, struct ebpt_node *new)
+{
+	return __ebis_insert(root, new);
+}
diff --git a/src/ebmbtree.c b/src/ebmbtree.c
new file mode 100644
index 0000000..a3de9a1
--- /dev/null
+++ b/src/ebmbtree.c
@@ -0,0 +1,77 @@
+/*
+ * Elastic Binary Trees - exported functions for Multi-Byte data nodes.
+ * Version 6.0.6
+ * (C) 2002-2011 - Willy Tarreau <w@1wt.eu>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Consult ebmbtree.h for more details about those functions */
+
+#include <import/ebmbtree.h>
+
+/* Find the first occurrence of a key of <len> bytes in the tree <root>.
+ * If none can be found, return NULL.
+ */
+struct ebmb_node *
+ebmb_lookup(struct eb_root *root, const void *x, unsigned int len)
+{
+	return __ebmb_lookup(root, x, len);
+}
+
+/* Insert ebmb_node <new> into the subtree starting at node root <root>.
+ * Only new->key needs to be set with the key. The ebmb_node is returned.
+ * If root->b[EB_RGHT]==1, the tree may only contain unique keys.
+ * <len> is specified in bytes.
+ */
+struct ebmb_node *
+ebmb_insert(struct eb_root *root, struct ebmb_node *new, unsigned int len)
+{
+	return __ebmb_insert(root, new, len);
+}
+
+/* Find the first occurrence of the longest prefix matching a key <x> in the
+ * tree <root>. It's the caller's responsibility to ensure that key <x> is at
+ * least as long as the keys in the tree. If none can be found, return NULL.
+ */
+struct ebmb_node *
+ebmb_lookup_longest(struct eb_root *root, const void *x)
+{
+	return __ebmb_lookup_longest(root, x);
+}
+
+/* Find the first occurrence of a prefix matching a key <x> of <pfx> BITS in the
+ * tree <root>. If none can be found, return NULL.
+ */
+struct ebmb_node *
+ebmb_lookup_prefix(struct eb_root *root, const void *x, unsigned int pfx)
+{
+	return __ebmb_lookup_prefix(root, x, pfx);
+}
+
+/* Insert ebmb_node <new> into the prefix subtree starting at node root <root>.
+ * Only new->key and new->pfx need to be set with the key and its prefix length.
+ * Note that bits between <pfx> and <len> are theoretically ignored and should be
+ * zero, as it is not certain yet that they will always be ignored everywhere
+ * (e.g. in bit compare functions).
+ * The ebmb_node is returned.
+ * If root->b[EB_RGHT]==1, the tree may only contain unique keys.
+ * <len> is specified in bytes.
+ */ +struct ebmb_node * +ebmb_insert_prefix(struct eb_root *root, struct ebmb_node *new, unsigned int len) +{ + return __ebmb_insert_prefix(root, new, len); +} diff --git a/src/ebpttree.c b/src/ebpttree.c new file mode 100644 index 0000000..558d334 --- /dev/null +++ b/src/ebpttree.c @@ -0,0 +1,208 @@ +/* + * Elastic Binary Trees - exported functions for operations on pointer nodes. + * Version 6.0.6 + * (C) 2002-2011 - Willy Tarreau <w@1wt.eu> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* Consult ebpttree.h for more details about those functions */ + +#include <import/ebpttree.h> + +struct ebpt_node *ebpt_insert(struct eb_root *root, struct ebpt_node *new) +{ + return __ebpt_insert(root, new); +} + +struct ebpt_node *ebpt_lookup(struct eb_root *root, void *x) +{ + return __ebpt_lookup(root, x); +} + +/* + * Find the last occurrence of the highest key in the tree <root>, which is + * equal to or less than <x>. NULL is returned is no key matches. + */ +struct ebpt_node *ebpt_lookup_le(struct eb_root *root, void *x) +{ + struct ebpt_node *node; + eb_troot_t *troot; + + troot = root->b[EB_LEFT]; + if (unlikely(troot == NULL)) + return NULL; + + while (1) { + if ((eb_gettag(troot) == EB_LEAF)) { + /* We reached a leaf, which means that the whole upper + * parts were common. We will return either the current + * node or its next one if the former is too small. + */ + node = container_of(eb_untag(troot, EB_LEAF), + struct ebpt_node, node.branches); + if (node->key <= x) + return node; + /* return prev */ + troot = node->node.leaf_p; + break; + } + node = container_of(eb_untag(troot, EB_NODE), + struct ebpt_node, node.branches); + + if (node->node.bit < 0) { + /* We're at the top of a dup tree. Either we got a + * matching value and we return the rightmost node, or + * we don't and we skip the whole subtree to return the + * prev node before the subtree. Note that since we're + * at the top of the dup tree, we can simply return the + * prev node without first trying to escape from the + * tree. + */ + if (node->key <= x) { + troot = node->node.branches.b[EB_RGHT]; + while (eb_gettag(troot) != EB_LEAF) + troot = (eb_untag(troot, EB_NODE))->b[EB_RGHT]; + return container_of(eb_untag(troot, EB_LEAF), + struct ebpt_node, node.branches); + } + /* return prev */ + troot = node->node.node_p; + break; + } + + if ((((ptr_t)x ^ (ptr_t)node->key) >> node->node.bit) >= EB_NODE_BRANCHES) { + /* No more common bits at all. Either this node is too + * small and we need to get its highest value, or it is + * too large, and we need to get the prev value. + */ + if (((ptr_t)node->key >> node->node.bit) < ((ptr_t)x >> node->node.bit)) { + troot = node->node.branches.b[EB_RGHT]; + return ebpt_entry(eb_walk_down(troot, EB_RGHT), struct ebpt_node, node); + } + + /* Further values will be too high here, so return the prev + * unique node (if it exists). 
+ */ + troot = node->node.node_p; + break; + } + troot = node->node.branches.b[((ptr_t)x >> node->node.bit) & EB_NODE_BRANCH_MASK]; + } + + /* If we get here, it means we want to report previous node before the + * current one which is not above. <troot> is already initialised to + * the parent's branches. + */ + while (eb_gettag(troot) == EB_LEFT) { + /* Walking up from left branch. We must ensure that we never + * walk beyond root. + */ + if (unlikely(eb_clrtag((eb_untag(troot, EB_LEFT))->b[EB_RGHT]) == NULL)) + return NULL; + troot = (eb_root_to_node(eb_untag(troot, EB_LEFT)))->node_p; + } + /* Note that <troot> cannot be NULL at this stage */ + troot = (eb_untag(troot, EB_RGHT))->b[EB_LEFT]; + node = ebpt_entry(eb_walk_down(troot, EB_RGHT), struct ebpt_node, node); + return node; +} + +/* + * Find the first occurrence of the lowest key in the tree <root>, which is + * equal to or greater than <x>. NULL is returned is no key matches. + */ +struct ebpt_node *ebpt_lookup_ge(struct eb_root *root, void *x) +{ + struct ebpt_node *node; + eb_troot_t *troot; + + troot = root->b[EB_LEFT]; + if (unlikely(troot == NULL)) + return NULL; + + while (1) { + if ((eb_gettag(troot) == EB_LEAF)) { + /* We reached a leaf, which means that the whole upper + * parts were common. We will return either the current + * node or its next one if the former is too small. + */ + node = container_of(eb_untag(troot, EB_LEAF), + struct ebpt_node, node.branches); + if (node->key >= x) + return node; + /* return next */ + troot = node->node.leaf_p; + break; + } + node = container_of(eb_untag(troot, EB_NODE), + struct ebpt_node, node.branches); + + if (node->node.bit < 0) { + /* We're at the top of a dup tree. Either we got a + * matching value and we return the leftmost node, or + * we don't and we skip the whole subtree to return the + * next node after the subtree. Note that since we're + * at the top of the dup tree, we can simply return the + * next node without first trying to escape from the + * tree. + */ + if (node->key >= x) { + troot = node->node.branches.b[EB_LEFT]; + while (eb_gettag(troot) != EB_LEAF) + troot = (eb_untag(troot, EB_NODE))->b[EB_LEFT]; + return container_of(eb_untag(troot, EB_LEAF), + struct ebpt_node, node.branches); + } + /* return next */ + troot = node->node.node_p; + break; + } + + if ((((ptr_t)x ^ (ptr_t)node->key) >> node->node.bit) >= EB_NODE_BRANCHES) { + /* No more common bits at all. Either this node is too + * large and we need to get its lowest value, or it is too + * small, and we need to get the next value. + */ + if (((ptr_t)node->key >> node->node.bit) > ((ptr_t)x >> node->node.bit)) { + troot = node->node.branches.b[EB_LEFT]; + return ebpt_entry(eb_walk_down(troot, EB_LEFT), struct ebpt_node, node); + } + + /* Further values will be too low here, so return the next + * unique node (if it exists). + */ + troot = node->node.node_p; + break; + } + troot = node->node.branches.b[((ptr_t)x >> node->node.bit) & EB_NODE_BRANCH_MASK]; + } + + /* If we get here, it means we want to report next node after the + * current one which is not below. <troot> is already initialised + * to the parent's branches. 
+ */ + while (eb_gettag(troot) != EB_LEFT) + /* Walking up from right branch, so we cannot be below root */ + troot = (eb_root_to_node(eb_untag(troot, EB_RGHT)))->node_p; + + /* Note that <troot> cannot be NULL at this stage */ + troot = (eb_untag(troot, EB_LEFT))->b[EB_RGHT]; + if (eb_clrtag(troot) == NULL) + return NULL; + + node = ebpt_entry(eb_walk_down(troot, EB_LEFT), struct ebpt_node, node); + return node; +} diff --git a/src/ebsttree.c b/src/ebsttree.c new file mode 100644 index 0000000..a4fbe33 --- /dev/null +++ b/src/ebsttree.c @@ -0,0 +1,42 @@ +/* + * Elastic Binary Trees - exported functions for String data nodes. + * Version 6.0.6 + * (C) 2002-2011 - Willy Tarreau <w@1wt.eu> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* Consult ebsttree.h for more details about those functions */ + +#include <import/ebsttree.h> + +/* Find the first occurrence of a zero-terminated string <x> in the tree <root>. + * It's the caller's responsibility to use this function only on trees which + * only contain zero-terminated strings. If none can be found, return NULL. + */ +struct ebmb_node *ebst_lookup(struct eb_root *root, const char *x) +{ + return __ebst_lookup(root, x); +} + +/* Insert ebmb_node <new> into subtree starting at node root <root>. Only + * new->key needs be set with the zero-terminated string key. The ebmb_node is + * returned. If root->b[EB_RGHT]==1, the tree may only contain unique keys. The + * caller is responsible for properly terminating the key with a zero. + */ +struct ebmb_node *ebst_insert(struct eb_root *root, struct ebmb_node *new) +{ + return __ebst_insert(root, new); +} diff --git a/src/ebtree.c b/src/ebtree.c new file mode 100644 index 0000000..db27875 --- /dev/null +++ b/src/ebtree.c @@ -0,0 +1,50 @@ +/* + * Elastic Binary Trees - exported generic functions + * Version 6.0.6 + * (C) 2002-2011 - Willy Tarreau <w@1wt.eu> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <import/ebtree.h>
+
+void eb_delete(struct eb_node *node)
+{
+	__eb_delete(node);
+}
+
+/* used by insertion primitives */
+struct eb_node *eb_insert_dup(struct eb_node *sub, struct eb_node *new)
+{
+	return __eb_insert_dup(sub, new);
+}
+
+/* Compares memory blocks m1 and m2 for up to <len> bytes. It immediately
+ * stops at the first non-matching byte. It returns 0 on full match, non-zero
+ * otherwise. One byte will always be checked, so this must not be called with
+ * len==0. It takes 2+5cy/B on x86_64 and is ~29 bytes long.
+ */
+int eb_memcmp(const void *m1, const void *m2, size_t len)
+{
+	const char *p1 = (const char *)m1 + len;
+	const char *p2 = (const char *)m2 + len;
+	ssize_t ofs = -len;
+	char diff;
+
+	do {
+		diff = p1[ofs] - p2[ofs];
+	} while (!diff && ++ofs);
+	return diff;
+}
diff --git a/src/errors.c b/src/errors.c
new file mode 100644
index 0000000..7a2d14a
--- /dev/null
+++ b/src/errors.c
@@ -0,0 +1,567 @@
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <syslog.h>
+
+#include <haproxy/api.h>
+#include <haproxy/applet-t.h>
+#include <haproxy/buf.h>
+#include <haproxy/cli.h>
+#include <haproxy/errors.h>
+#include <haproxy/global.h>
+#include <haproxy/obj_type.h>
+#include <haproxy/ring.h>
+#include <haproxy/tools.h>
+#include <haproxy/version.h>
+
+/* A global buffer used to store all startup alerts/warnings. It can then be
+ * retrieved via the CLI. */
+struct ring *startup_logs = NULL;
+uint tot_warnings = 0;
+#ifdef USE_SHM_OPEN
+static struct ring *shm_startup_logs = NULL;
+#endif
+
+/* A thread-local buffer used to store all alerts/warnings. It can be used to
+ * retrieve them for CLI commands after startup.
+ */
+#define USER_MESSAGES_BUFSIZE 1024
+static THREAD_LOCAL struct buffer usermsgs_buf = BUF_NULL;
+
+/* A thread-local context used for stderr output via ha_alert/warning/notice/diag.
+ */
+#define USERMSGS_CTX_BUFSIZE PATH_MAX
+static THREAD_LOCAL struct usermsgs_ctx usermsgs_ctx = { .str = BUF_NULL, };
+
+#ifdef USE_SHM_OPEN
+
+/* initialise an SHM for the startup logs and return its fd */
+static int startup_logs_new_shm()
+{
+	char *path = NULL;
+	int fd = -1;
+	int flags;
+
+	/* create a unique path per PID so we don't collide with another
+	   process */
+	memprintf(&path, "/haproxy_startup_logs_%d", getpid());
+	fd = shm_open(path, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
+	if (fd == -1)
+		goto error;
+	shm_unlink(path);
+	ha_free(&path);
+
+	if (ftruncate(fd, STARTUP_LOG_SIZE) == -1)
+		goto error;
+
+	flags = fcntl(fd, F_GETFD);
+	if (flags == -1)
+		goto error;
+	flags &= ~FD_CLOEXEC;
+	flags = fcntl(fd, F_SETFD, flags);
+	if (flags == -1)
+		goto error;
+
+	return fd;
+error:
+	if (fd != -1) {
+		close(fd);
+		fd = -1;
+	}
+	return fd;
+}
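
The function above makes the SHM anonymous by unlinking its name right after creation, so the file descriptor becomes the only reference, and it clears FD_CLOEXEC so that descriptor survives an execve() during a master re-exec (the fd number itself travels through an environment variable, as shown further down). The same sequence in isolation, as a sketch with a hypothetical function name and parameters:

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

/* create an unnamed SHM of <size> bytes whose fd survives execve() */
static int make_private_shm(const char *name, off_t size)
{
	int fd, flags;

	fd = shm_open(name, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
	if (fd == -1)
		return -1;
	shm_unlink(name);          /* the fd is now the only reference */

	if (ftruncate(fd, size) == -1)
		goto fail;

	flags = fcntl(fd, F_GETFD);
	if (flags == -1 || fcntl(fd, F_SETFD, flags & ~FD_CLOEXEC) == -1)
		goto fail;         /* clear close-on-exec: keep fd across execve() */

	return fd;
 fail:
	close(fd);
	return -1;
}
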
+ */
+static struct ring *startup_logs_from_fd(int fd, int new)
+{
+ char *area;
+ struct ring *r = NULL;
+
+ if (fd == -1)
+ goto error;
+
+ area = mmap(NULL, STARTUP_LOG_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ if (area == MAP_FAILED || area == NULL)
+ goto error;
+
+ if (new)
+ r = ring_make_from_area(area, STARTUP_LOG_SIZE);
+ else
+ r = ring_cast_from_area(area);
+
+ if (r == NULL)
+ goto error;
+
+ shm_startup_logs = r; /* save the ptr so we can unmap later */
+
+ return r;
+error:
+ return NULL;
+}
+
+/*
+ * Use a shm across reexec of the master.
+ *
+ * During the startup of the master, a shm_open must be done and the FD saved
+ * into the HAPROXY_STARTUPLOGS_FD environment variable.
+ *
+ * When forking workers, the child must use a copy of the shm, not the shm itself.
+ *
+ * Once in wait mode, the shm must be copied and closed.
+ *
+ */
+void startup_logs_init()
+{
+ struct ring *r = NULL;
+ char *str_fd, *endptr;
+ int fd = -1;
+
+ str_fd = getenv("HAPROXY_STARTUPLOGS_FD");
+ if (str_fd) {
+ fd = strtol(str_fd, &endptr, 10);
+ if (*endptr != '\0')
+ goto error;
+ unsetenv("HAPROXY_STARTUPLOGS_FD");
+ }
+
+ /* during startup, or just after a reload.
+ * Note: the WAIT_ONLY env variable must be
+ * checked in case of an early call */
+ if (!(global.mode & MODE_MWORKER_WAIT) &&
+ getenv("HAPROXY_MWORKER_WAIT_ONLY") == NULL) {
+ if (fd != -1)
+ close(fd);
+
+ fd = startup_logs_new_shm();
+ if (fd == -1)
+ goto error;
+
+ r = startup_logs_from_fd(fd, 1);
+ if (!r)
+ goto error;
+
+ str_fd = NULL;
+ memprintf(&str_fd, "%d", fd);
+ setenv("HAPROXY_STARTUPLOGS_FD", str_fd, 1);
+ ha_free(&str_fd);
+
+ } else {
+ /* in wait mode, copy the shm to an allocated buffer */
+ struct ring *prev = NULL;
+
+ if (fd == -1)
+ goto error;
+
+ prev = startup_logs_from_fd(fd, 0);
+ if (!prev)
+ goto error;
+
+ r = startup_logs_dup(prev);
+ if (!r)
+ goto error;
+ startup_logs_free(prev);
+ close(fd);
+ }
+
+ startup_logs = r;
+
+ return;
+error:
+ if (fd != -1)
+ close(fd);
+ /* couldn't get a mmap to work */
+ startup_logs = ring_new(STARTUP_LOG_SIZE);
+
+}
+
+#else /* ! USE_SHM_OPEN */
+
+void startup_logs_init()
+{
+ startup_logs = ring_new(STARTUP_LOG_SIZE);
+}
+
+#endif
+
+/* free the startup logs, unmap if it was an shm */
+void startup_logs_free(struct ring *r)
+{
+#ifdef USE_SHM_OPEN
+ if (r == shm_startup_logs)
+ munmap(r, STARTUP_LOG_SIZE);
+ else
+#endif /* ! USE_SHM_OPEN */
+ ring_free(r);
+}
+
+/* duplicate startup logs which were previously allocated in a shm */
+struct ring *startup_logs_dup(struct ring *src)
+{
+ struct ring *dst = NULL;
+
+ /* must use the size of the previous buffer */
+ dst = ring_new(b_size(&src->buf));
+ if (!dst)
+ goto error;
+
+ b_reset(&dst->buf);
+ b_ncat(&dst->buf, &src->buf, b_data(&src->buf));
+error:
+ return dst;
+}
+
+/* Put msg in usermsgs_buf.
+ *
+ * The message should not be terminated by a newline because this function
+ * manually inserts it.
+ *
+ * If there is not enough room in the buffer, the message is silently discarded.
+ * Do not forget to frequently clear the buffer.
+ */
+static void usermsgs_put(const struct ist *msg)
+{
+ /* Allocate the buffer if not already done. */
+ if (unlikely(b_is_null(&usermsgs_buf))) {
+ usermsgs_buf.area = malloc(USER_MESSAGES_BUFSIZE * sizeof(char));
+ if (usermsgs_buf.area)
+ usermsgs_buf.size = USER_MESSAGES_BUFSIZE;
+ }
+
+ if (likely(!b_is_null(&usermsgs_buf))) {
+ if (b_room(&usermsgs_buf) >= msg->len + 2) {
+ /* Insert the message + newline.
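+ * The b_room() check above guarantees msg->len + 2 bytes: one
+ * extra for this '\n' and one for the NUL stored just past the
+ * data so that usermsgs_str() can return a C string.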
*/ + b_putblk(&usermsgs_buf, msg->ptr, msg->len); + b_putchr(&usermsgs_buf, '\n'); + /* Insert NUL outside of the buffer. */ + *b_tail(&usermsgs_buf) = '\0'; + } + } +} + +/* Clear the user messages log buffer. + * + * <prefix> will set the local-thread context appended to every output + * following this call. It can be NULL if not necessary. + */ +void usermsgs_clr(const char *prefix) +{ + if (likely(!b_is_null(&usermsgs_buf))) { + b_reset(&usermsgs_buf); + usermsgs_buf.area[0] = '\0'; + } + + usermsgs_ctx.prefix = prefix; +} + +/* Check if the user messages buffer is empty. */ +int usermsgs_empty(void) +{ + return !!(b_is_null(&usermsgs_buf) || !b_data(&usermsgs_buf)); +} + +/* Return the messages log buffer content. */ +const char *usermsgs_str(void) +{ + if (unlikely(b_is_null(&usermsgs_buf))) + return ""; + + return b_head(&usermsgs_buf); +} + +/* Set thread-local context infos to prefix forthcoming stderr output during + * configuration parsing. + * + * <file> and <line> specify the location of the parsed configuration. + * + * <obj> can be of various types. If not NULL, the string prefix generated will + * depend on its type. + */ +void set_usermsgs_ctx(const char *file, int line, enum obj_type *obj) +{ + usermsgs_ctx.file = file; + usermsgs_ctx.line = line; + usermsgs_ctx.obj = obj; +} + +/* Set thread-local context infos to prefix forthcoming stderr output. It will + * be set as a complement to possibly already defined file/line. + * + * <obj> can be of various types. If not NULL, the string prefix generated will + * depend on its type. + */ +void register_parsing_obj(enum obj_type *obj) +{ + usermsgs_ctx.obj = obj; +} + +/* Reset thread-local context infos for stderr output. */ +void reset_usermsgs_ctx(void) +{ + usermsgs_ctx.file = NULL; + usermsgs_ctx.line = 0; + usermsgs_ctx.obj = NULL; +} + +static void generate_usermsgs_ctx_str(void) +{ + struct usermsgs_ctx *ctx = &usermsgs_ctx; + void *area; + int ret; + + if (unlikely(b_is_null(&ctx->str))) { + area = calloc(USERMSGS_CTX_BUFSIZE, sizeof(*area)); + if (area) + ctx->str = b_make(area, USERMSGS_CTX_BUFSIZE, 0, 0); + } + + if (likely(!b_is_null(&ctx->str))) { + b_reset(&ctx->str); + + if (ctx->prefix) { + ret = snprintf(b_tail(&ctx->str), b_room(&ctx->str), + "%s : ", ctx->prefix); + b_add(&ctx->str, MIN(ret, b_room(&ctx->str))); + } + + if (ctx->file) { + ret = snprintf(b_tail(&ctx->str), b_room(&ctx->str), + "[%s:%d] : ", ctx->file, ctx->line); + b_add(&ctx->str, MIN(ret, b_room(&ctx->str))); + } + + switch (obj_type(ctx->obj)) { + case OBJ_TYPE_SERVER: + ret = snprintf(b_tail(&ctx->str), b_room(&ctx->str), + "'server %s/%s' : ", + __objt_server(ctx->obj)->proxy->id, + __objt_server(ctx->obj)->id); + b_add(&ctx->str, MIN(ret, b_room(&ctx->str))); + break; + + case OBJ_TYPE_NONE: + default: + break; + } + + if (!b_data(&ctx->str)) + snprintf(b_tail(&ctx->str), b_room(&ctx->str), "%s", ""); + } +} + +/* Generic function to display messages prefixed by a label */ +static void print_message(int use_usermsgs_ctx, const char *label, const char *fmt, va_list argp) +{ + struct ist msg_ist = IST_NULL; + char *head, *parsing_str, *msg; + char prefix[11]; // '[' + 8 chars + ']' + 0. 
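+ /* The code below builds a fixed-width header, e.g. "[ALERT]   " or
+ * "[WARNING] ": '[' + at most 8 label chars + ']', space-padded to
+ * 10 columns, so that messages line up on stderr.
+ */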
+
+ *prefix = '[';
+ strncpy(prefix + 1, label, sizeof(prefix) - 2);
+ msg = prefix + strlen(prefix);
+ *msg++ = ']';
+ while (msg < prefix + sizeof(prefix) - 1)
+ *msg++ = ' ';
+ *msg = 0;
+
+ head = parsing_str = msg = NULL;
+ memprintf(&head, "%s (%u) : ", prefix, (uint)getpid());
+ memvprintf(&msg, fmt, argp);
+
+ /* trim the trailing '\n' */
+ msg_ist = ist(msg);
+ if (msg_ist.len > 0 && msg_ist.ptr[msg_ist.len - 1] == '\n')
+ msg_ist.len--;
+
+ if (use_usermsgs_ctx) {
+ generate_usermsgs_ctx_str();
+ parsing_str = b_head(&usermsgs_ctx.str);
+ reset_usermsgs_ctx();
+ }
+ else {
+ parsing_str = "";
+ }
+
+ if (global.mode & MODE_STARTING) {
+ if (unlikely(!startup_logs))
+ startup_logs_init();
+
+ if (likely(startup_logs)) {
+ struct ist m[3];
+
+ m[0] = ist(head);
+ m[1] = ist(parsing_str);
+ m[2] = msg_ist;
+
+ ring_write(startup_logs, ~0, 0, 0, m, 3);
+ }
+ }
+ else {
+ usermsgs_put(&msg_ist);
+ }
+ if (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)) {
+ fprintf(stderr, "%s%s%s", head, parsing_str, msg);
+ fflush(stderr);
+ }
+
+ free(head);
+ free(msg);
+}
+
+static void print_message_args(int use_usermsgs_ctx, const char *label, const char *fmt, ...)
+{
+ va_list argp;
+ va_start(argp, fmt);
+ print_message(use_usermsgs_ctx, label, fmt, argp);
+ va_end(argp);
+}
+
+/*
+ * Display a notice with the haproxy version and executable path when the
+ * first message is emitted in starting mode.
+ */
+static void warn_exec_path()
+{
+ if (!(warned & WARN_EXEC_PATH) && (global.mode & MODE_STARTING)) {
+ const char *path = get_exec_path();
+
+ warned |= WARN_EXEC_PATH;
+ print_message_args(0, "NOTICE", "haproxy version is %s\n", haproxy_version);
+ if (path)
+ print_message_args(0, "NOTICE", "path to executable is %s\n", path);
+ }
+}
+
+/*
+ * Displays the message on stderr with the pid.
+ */
+void ha_alert(const char *fmt, ...)
+{
+ va_list argp;
+
+ warn_exec_path();
+ va_start(argp, fmt);
+ print_message(1, "ALERT", fmt, argp);
+ va_end(argp);
+}
+
+/*
+ * Displays the message on stderr with the pid.
+ */
+void ha_warning(const char *fmt, ...)
+{
+ va_list argp;
+
+ warned |= WARN_ANY;
+ HA_ATOMIC_INC(&tot_warnings);
+
+ warn_exec_path();
+ va_start(argp, fmt);
+ print_message(1, "WARNING", fmt, argp);
+ va_end(argp);
+}
+
+/*
+ * Variant of _ha_diag_warning with va_list.
+ * Use it only if MODE_DIAG has been previously checked.
+ */
+void _ha_vdiag_warning(const char *fmt, va_list argp)
+{
+ warned |= WARN_ANY;
+ HA_ATOMIC_INC(&tot_warnings);
+
+ warn_exec_path();
+ print_message(1, "DIAG", fmt, argp);
+}
+
+/*
+ * Output a diagnostic warning.
+ * Use it only if MODE_DIAG has been previously checked.
+ */
+void _ha_diag_warning(const char *fmt, ...)
+{
+ va_list argp;
+
+ va_start(argp, fmt);
+ _ha_vdiag_warning(fmt, argp);
+ va_end(argp);
+}
+
+/*
+ * Output a diagnostic warning. Do nothing if MODE_DIAG is not on.
+ */
+void ha_diag_warning(const char *fmt, ...)
+{
+ va_list argp;
+
+ if (global.mode & MODE_DIAG) {
+ va_start(argp, fmt);
+ _ha_vdiag_warning(fmt, argp);
+ va_end(argp);
+ }
+}
+
+/*
+ * Displays the message on stderr with the pid.
+ */
+void ha_notice(const char *fmt, ...)
+{
+ va_list argp;
+
+ va_start(argp, fmt);
+ print_message(1, "NOTICE", fmt, argp);
+ va_end(argp);
+}
+
+/*
+ * Displays the message on <out> only if quiet mode is not set.
+ */
+void qfprintf(FILE *out, const char *fmt, ...)
+{ + va_list argp; + + if (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)) { + va_start(argp, fmt); + vfprintf(out, fmt, argp); + fflush(out); + va_end(argp); + } +} + + +/* parse the "show startup-logs" command, returns 1 if a message is returned, otherwise zero */ +static int cli_parse_show_startup_logs(char **args, char *payload, struct appctx *appctx, void *private) +{ + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return 1; + + if (!startup_logs) + return cli_msg(appctx, LOG_INFO, "\n"); // nothing to print + + return ring_attach_cli(startup_logs, appctx, 0); +} + +/* register cli keywords */ +static struct cli_kw_list cli_kws = {{ },{ + { { "show", "startup-logs", NULL }, "show startup-logs : report logs emitted during HAProxy startup", cli_parse_show_startup_logs, NULL, NULL, NULL, ACCESS_MASTER }, + {{},} +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + + +static void deinit_errors_buffers() +{ + ring_free(_HA_ATOMIC_XCHG(&startup_logs, NULL)); + ha_free(&usermsgs_buf.area); + ha_free(&usermsgs_ctx.str.area); +} + +/* errors might be used in threads and even before forking, thus 2 deinit */ +REGISTER_PER_THREAD_FREE(deinit_errors_buffers); +REGISTER_POST_DEINIT(deinit_errors_buffers); diff --git a/src/ev_epoll.c b/src/ev_epoll.c new file mode 100644 index 0000000..c42cf2e --- /dev/null +++ b/src/ev_epoll.c @@ -0,0 +1,413 @@ +/* + * FD polling functions for Linux epoll + * + * Copyright 2000-2014 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <unistd.h> +#include <sys/epoll.h> +#include <sys/time.h> +#include <sys/types.h> + +#include <haproxy/activity.h> +#include <haproxy/api.h> +#include <haproxy/clock.h> +#include <haproxy/fd.h> +#include <haproxy/global.h> +#include <haproxy/signal.h> +#include <haproxy/ticks.h> +#include <haproxy/task.h> +#include <haproxy/tools.h> + + +/* private data */ +static THREAD_LOCAL struct epoll_event *epoll_events = NULL; +static int epoll_fd[MAX_THREADS] __read_mostly; // per-thread epoll_fd + +#ifndef EPOLLRDHUP +/* EPOLLRDHUP was defined late in libc, and it appeared in kernel 2.6.17 */ +#define EPOLLRDHUP 0x2000 +#endif + +/* + * Immediately remove file descriptor from epoll set upon close. + * Since we forked, some fds share inodes with the other process, and epoll may + * send us events even though this process closed the fd (see man 7 epoll, + * "Questions and answers", Q 6). + */ +static void __fd_clo(int fd) +{ + if (unlikely(fdtab[fd].state & FD_CLONED)) { + unsigned long m = _HA_ATOMIC_LOAD(&polled_mask[fd].poll_recv) | _HA_ATOMIC_LOAD(&polled_mask[fd].poll_send); + int tgrp = fd_tgid(fd); + struct epoll_event ev; + int i; + + if (!m) + return; + + /* since FDs may only be shared per group and are only closed + * once entirely reset, it should never happen that we have to + * close an FD for another group, unless we're stopping from the + * wrong thread or during startup, which is what we're checking + * for. Regardless, it is not a problem to do so. 
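+ *
+ * The loop below thus walks every thread of this FD's group and
+ * removes the FD from each per-thread epoll set it is still
+ * registered in.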
+ */ + if (unlikely(!(global.mode & MODE_STARTING))) { + CHECK_IF(tgid != tgrp && !thread_isolated()); + } + + for (i = ha_tgroup_info[tgrp-1].base; i < ha_tgroup_info[tgrp-1].base + ha_tgroup_info[tgrp-1].count; i++) + if (m & ha_thread_info[i].ltid_bit) + epoll_ctl(epoll_fd[i], EPOLL_CTL_DEL, fd, &ev); + } +} + +static void _update_fd(int fd) +{ + int en, opcode; + struct epoll_event ev = { }; + ulong pr, ps; + + en = fdtab[fd].state; + pr = _HA_ATOMIC_LOAD(&polled_mask[fd].poll_recv); + ps = _HA_ATOMIC_LOAD(&polled_mask[fd].poll_send); + + /* Try to force EPOLLET on FDs that support it */ + if (fdtab[fd].state & FD_ET_POSSIBLE) { + /* already done ? */ + if (pr & ps & ti->ltid_bit) + return; + + /* enable ET polling in both directions */ + _HA_ATOMIC_OR(&polled_mask[fd].poll_recv, ti->ltid_bit); + _HA_ATOMIC_OR(&polled_mask[fd].poll_send, ti->ltid_bit); + opcode = EPOLL_CTL_ADD; + ev.events = EPOLLIN | EPOLLRDHUP | EPOLLOUT | EPOLLET; + goto done; + } + + /* if we're already polling or are going to poll for this FD and it's + * neither active nor ready, force it to be active so that we don't + * needlessly unsubscribe then re-subscribe it. + */ + if (!(en & (FD_EV_READY_R | FD_EV_SHUT_R | FD_EV_ERR_RW | FD_POLL_ERR)) && + ((en & FD_EV_ACTIVE_W) || ((ps | pr) & ti->ltid_bit))) + en |= FD_EV_ACTIVE_R; + + if ((ps | pr) & ti->ltid_bit) { + if (!(fdtab[fd].thread_mask & ti->ltid_bit) || !(en & FD_EV_ACTIVE_RW)) { + /* fd removed from poll list */ + opcode = EPOLL_CTL_DEL; + if (pr & ti->ltid_bit) + _HA_ATOMIC_AND(&polled_mask[fd].poll_recv, ~ti->ltid_bit); + if (ps & ti->ltid_bit) + _HA_ATOMIC_AND(&polled_mask[fd].poll_send, ~ti->ltid_bit); + } + else { + if (((en & FD_EV_ACTIVE_R) != 0) == ((pr & ti->ltid_bit) != 0) && + ((en & FD_EV_ACTIVE_W) != 0) == ((ps & ti->ltid_bit) != 0)) + return; + if (en & FD_EV_ACTIVE_R) { + if (!(pr & ti->ltid_bit)) + _HA_ATOMIC_OR(&polled_mask[fd].poll_recv, ti->ltid_bit); + } else { + if (pr & ti->ltid_bit) + _HA_ATOMIC_AND(&polled_mask[fd].poll_recv, ~ti->ltid_bit); + } + if (en & FD_EV_ACTIVE_W) { + if (!(ps & ti->ltid_bit)) + _HA_ATOMIC_OR(&polled_mask[fd].poll_send, ti->ltid_bit); + } else { + if (ps & ti->ltid_bit) + _HA_ATOMIC_AND(&polled_mask[fd].poll_send, ~ti->ltid_bit); + } + /* fd status changed */ + opcode = EPOLL_CTL_MOD; + } + } + else if ((fdtab[fd].thread_mask & ti->ltid_bit) && (en & FD_EV_ACTIVE_RW)) { + /* new fd in the poll list */ + opcode = EPOLL_CTL_ADD; + if (en & FD_EV_ACTIVE_R) + _HA_ATOMIC_OR(&polled_mask[fd].poll_recv, ti->ltid_bit); + if (en & FD_EV_ACTIVE_W) + _HA_ATOMIC_OR(&polled_mask[fd].poll_send, ti->ltid_bit); + } + else { + return; + } + + /* construct the epoll events based on new state */ + if (en & FD_EV_ACTIVE_R) + ev.events |= EPOLLIN | EPOLLRDHUP; + + if (en & FD_EV_ACTIVE_W) + ev.events |= EPOLLOUT; + + done: + ev.data.fd = fd; + epoll_ctl(epoll_fd[tid], opcode, fd, &ev); +} + +/* + * Linux epoll() poller + */ +static void _do_poll(struct poller *p, int exp, int wake) +{ + int status; + int fd; + int count; + int updt_idx; + int wait_time; + int old_fd; + + /* first, scan the update list to find polling changes */ + for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) { + fd = fd_updt[updt_idx]; + + if (!fd_grab_tgid(fd, tgid)) { + /* was reassigned */ + activity[tid].poll_drop_fd++; + continue; + } + + _HA_ATOMIC_AND(&fdtab[fd].update_mask, ~ti->ltid_bit); + + if (fdtab[fd].owner) + _update_fd(fd); + else + activity[tid].poll_drop_fd++; + + fd_drop_tgid(fd); + } + fd_nbupdt = 0; + + /* Scan the shared update list 
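+ *
+ * (how the loop below reads the list: -1 ends it; -2 appears when
+ * the current node was just detached, so we resume from the
+ * previous FD; values <= -3 encode a live FD as -fd-4)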
*/ + for (old_fd = fd = update_list[tgid - 1].first; fd != -1; fd = fdtab[fd].update.next) { + if (fd == -2) { + fd = old_fd; + continue; + } + else if (fd <= -3) + fd = -fd -4; + if (fd == -1) + break; + + if (!fd_grab_tgid(fd, tgid)) { + /* was reassigned */ + activity[tid].poll_drop_fd++; + continue; + } + + if (!(fdtab[fd].update_mask & ti->ltid_bit)) { + fd_drop_tgid(fd); + continue; + } + + done_update_polling(fd); + + if (fdtab[fd].owner) + _update_fd(fd); + else + activity[tid].poll_drop_fd++; + + fd_drop_tgid(fd); + } + + thread_idle_now(); + thread_harmless_now(); + + /* Now let's wait for polled events. */ + wait_time = wake ? 0 : compute_poll_timeout(exp); + clock_entering_poll(); + + do { + int timeout = (global.tune.options & GTUNE_BUSY_POLLING) ? 0 : wait_time; + + status = epoll_wait(epoll_fd[tid], epoll_events, global.tune.maxpollevents, timeout); + clock_update_local_date(timeout, status); + + if (status) { + activity[tid].poll_io++; + break; + } + if (timeout || !wait_time) + break; + if (tick_isset(exp) && tick_is_expired(exp, now_ms)) + break; + } while (1); + + clock_update_global_date(); + fd_leaving_poll(wait_time, status); + + /* process polled events */ + + for (count = 0; count < status; count++) { + unsigned int n, e; + + e = epoll_events[count].events; + fd = epoll_events[count].data.fd; + + if ((e & EPOLLRDHUP) && !(cur_poller.flags & HAP_POLL_F_RDHUP)) + _HA_ATOMIC_OR(&cur_poller.flags, HAP_POLL_F_RDHUP); + +#ifdef DEBUG_FD + _HA_ATOMIC_INC(&fdtab[fd].event_count); +#endif + n = ((e & EPOLLIN) ? FD_EV_READY_R : 0) | + ((e & EPOLLOUT) ? FD_EV_READY_W : 0) | + ((e & EPOLLRDHUP) ? FD_EV_SHUT_R : 0) | + ((e & EPOLLHUP) ? FD_EV_SHUT_RW : 0) | + ((e & EPOLLERR) ? FD_EV_ERR_RW : 0); + + fd_update_events(fd, n); + } + /* the caller will take care of cached events */ +} + +static int init_epoll_per_thread() +{ + epoll_events = calloc(1, sizeof(struct epoll_event) * global.tune.maxpollevents); + if (epoll_events == NULL) + goto fail_alloc; + + if (MAX_THREADS > 1 && tid) { + epoll_fd[tid] = epoll_create(global.maxsock + 1); + if (epoll_fd[tid] < 0) + goto fail_fd; + } + + /* we may have to unregister some events initially registered on the + * original fd when it was alone, and/or to register events on the new + * fd for this thread. Let's just mark them as updated, the poller will + * do the rest. + */ + fd_reregister_all(tgid, ti->ltid_bit); + + return 1; + fail_fd: + free(epoll_events); + fail_alloc: + return 0; +} + +static void deinit_epoll_per_thread() +{ + if (MAX_THREADS > 1 && tid) + close(epoll_fd[tid]); + + ha_free(&epoll_events); +} + +/* + * Initialization of the epoll() poller. + * Returns 0 in case of failure, non-zero in case of success. If it fails, it + * disables the poller by setting its pref to 0. + */ +static int _do_init(struct poller *p) +{ + p->private = NULL; + + epoll_fd[tid] = epoll_create(global.maxsock + 1); + if (epoll_fd[tid] < 0) + goto fail_fd; + + hap_register_per_thread_init(init_epoll_per_thread); + hap_register_per_thread_deinit(deinit_epoll_per_thread); + + return 1; + + fail_fd: + p->pref = 0; + return 0; +} + +/* + * Termination of the epoll() poller. + * Memory is released and the poller is marked as unselectable. + */ +static void _do_term(struct poller *p) +{ + if (epoll_fd[tid] >= 0) { + close(epoll_fd[tid]); + epoll_fd[tid] = -1; + } + + p->private = NULL; + p->pref = 0; +} + +/* + * Check that the poller works. + * Returns 1 if OK, otherwise 0. 
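+ * The check simply creates and immediately closes an epoll FD,
+ * which fails on kernels without epoll support.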
+ */ +static int _do_test(struct poller *p) +{ + int fd; + + fd = epoll_create(global.maxsock + 1); + if (fd < 0) + return 0; + close(fd); + return 1; +} + +/* + * Recreate the epoll file descriptor after a fork(). Returns 1 if OK, + * otherwise 0. It will ensure that all processes will not share their + * epoll_fd. Some side effects were encountered because of this, such + * as epoll_wait() returning an FD which was previously deleted. + */ +static int _do_fork(struct poller *p) +{ + if (epoll_fd[tid] >= 0) + close(epoll_fd[tid]); + epoll_fd[tid] = epoll_create(global.maxsock + 1); + if (epoll_fd[tid] < 0) + return 0; + return 1; +} + +/* + * Registers the poller. + */ +static void _do_register(void) +{ + struct poller *p; + int i; + + if (nbpollers >= MAX_POLLERS) + return; + + for (i = 0; i < MAX_THREADS; i++) + epoll_fd[i] = -1; + + p = &pollers[nbpollers++]; + + p->name = "epoll"; + p->pref = 300; + p->flags = HAP_POLL_F_ERRHUP; // note: RDHUP might be dynamically added + p->private = NULL; + + p->clo = __fd_clo; + p->test = _do_test; + p->init = _do_init; + p->term = _do_term; + p->poll = _do_poll; + p->fork = _do_fork; +} + +INITCALL0(STG_REGISTER, _do_register); + + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/ev_evports.c b/src/ev_evports.c new file mode 100644 index 0000000..07676e6 --- /dev/null +++ b/src/ev_evports.c @@ -0,0 +1,441 @@ +/* + * FD polling functions for SunOS event ports. + * + * Copyright 2018 Joyent, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <unistd.h> +#include <sys/time.h> +#include <sys/types.h> + +#include <poll.h> +#include <port.h> +#include <errno.h> +#include <syslog.h> + +#include <haproxy/activity.h> +#include <haproxy/api.h> +#include <haproxy/clock.h> +#include <haproxy/fd.h> +#include <haproxy/global.h> +#include <haproxy/signal.h> +#include <haproxy/task.h> +#include <haproxy/ticks.h> + +/* + * Private data: + */ +static int evports_fd[MAX_THREADS]; // per-thread evports_fd +static THREAD_LOCAL port_event_t *evports_evlist = NULL; +static THREAD_LOCAL int evports_evlist_max = 0; + +/* + * Convert the "state" member of "fdtab" into an event ports event mask. + */ +static inline int evports_state_to_events(int state) +{ + int events = 0; + + if (state & FD_EV_ACTIVE_W) + events |= POLLOUT; + if (state & FD_EV_ACTIVE_R) + events |= POLLIN; + + return (events); +} + +/* + * Associate or dissociate this file descriptor with the event port, using the + * specified event mask. 
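+ * An empty <events> mask dissociates the FD. Event ports are
+ * one-shot: retrieving an event dissociates the FD from the port,
+ * so _do_poll() has to re-associate it once processing is done.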
+ */ +static inline void evports_resync_fd(int fd, int events) +{ + if (events == 0) + port_dissociate(evports_fd[tid], PORT_SOURCE_FD, fd); + else + port_associate(evports_fd[tid], PORT_SOURCE_FD, fd, events, NULL); +} + +static void _update_fd(int fd) +{ + int en; + int events; + ulong pr, ps; + + en = fdtab[fd].state; + pr = _HA_ATOMIC_LOAD(&polled_mask[fd].poll_recv); + ps = _HA_ATOMIC_LOAD(&polled_mask[fd].poll_send); + + if (!(fdtab[fd].thread_mask & ti->ltid_bit) || !(en & FD_EV_ACTIVE_RW)) { + if (!((pr | ps) & ti->ltid_bit)) { + /* fd was not watched, it's still not */ + return; + } + /* fd totally removed from poll list */ + events = 0; + if (pr & ti->ltid_bit) + _HA_ATOMIC_AND(&polled_mask[fd].poll_recv, ~ti->ltid_bit); + if (ps & ti->ltid_bit) + _HA_ATOMIC_AND(&polled_mask[fd].poll_send, ~ti->ltid_bit); + } + else { + /* OK fd has to be monitored, it was either added or changed */ + events = evports_state_to_events(en); + if (en & FD_EV_ACTIVE_R) { + if (!(pr & ti->ltid_bit)) + _HA_ATOMIC_OR(&polled_mask[fd].poll_recv, ti->ltid_bit); + } else { + if (pr & ti->ltid_bit) + _HA_ATOMIC_AND(&polled_mask[fd].poll_recv, ~ti->ltid_bit); + } + if (en & FD_EV_ACTIVE_W) { + if (!(ps & ti->ltid_bit)) + _HA_ATOMIC_OR(&polled_mask[fd].poll_send, ti->ltid_bit); + } else { + if (ps & ti->ltid_bit) + _HA_ATOMIC_AND(&polled_mask[fd].poll_send, ~ti->ltid_bit); + } + + } + evports_resync_fd(fd, events); +} + +/* + * Event Ports poller. This routine interacts with the file descriptor + * management data structures and routines; see the large block comment in + * "src/fd.c" for more information. + */ + +static void _do_poll(struct poller *p, int exp, int wake) +{ + int i; + int wait_time; + struct timespec timeout_ts; + unsigned int nevlist; + int fd, old_fd; + int status; + + /* + * Scan the list of file descriptors with an updated status: + */ + for (i = 0; i < fd_nbupdt; i++) { + fd = fd_updt[i]; + + if (!fd_grab_tgid(fd, tgid)) { + /* was reassigned */ + activity[tid].poll_drop_fd++; + continue; + } + + _HA_ATOMIC_AND(&fdtab[fd].update_mask, ~ti->ltid_bit); + + if (fdtab[fd].owner) + _update_fd(fd); + else + activity[tid].poll_drop_fd++; + + fd_drop_tgid(fd); + } + fd_nbupdt = 0; + + /* Scan the shared update list */ + for (old_fd = fd = update_list[tgid - 1].first; fd != -1; fd = fdtab[fd].update.next) { + if (fd == -2) { + fd = old_fd; + continue; + } + else if (fd <= -3) + fd = -fd -4; + if (fd == -1) + break; + + if (!fd_grab_tgid(fd, tgid)) { + /* was reassigned */ + activity[tid].poll_drop_fd++; + continue; + } + + if (!(fdtab[fd].update_mask & ti->ltid_bit)) { + fd_drop_tgid(fd); + continue; + } + + done_update_polling(fd); + + if (fdtab[fd].owner) + _update_fd(fd); + else + activity[tid].poll_drop_fd++; + + fd_drop_tgid(fd); + } + + thread_idle_now(); + thread_harmless_now(); + + /* Now let's wait for polled events. */ + wait_time = wake ? 0 : compute_poll_timeout(exp); + clock_entering_poll(); + + do { + int timeout = (global.tune.options & GTUNE_BUSY_POLLING) ? 
0 : wait_time; + int interrupted = 0; + nevlist = 1; /* desired number of events to be retrieved */ + timeout_ts.tv_sec = (timeout / 1000); + timeout_ts.tv_nsec = (timeout % 1000) * 1000000; + + status = port_getn(evports_fd[tid], + evports_evlist, + evports_evlist_max, + &nevlist, /* updated to the number of events retrieved */ + &timeout_ts); + if (status != 0) { + int e = errno; + switch (e) { + case ETIME: + /* + * Though the manual page has not historically made it + * clear, port_getn() can return -1 with an errno of + * ETIME and still have returned some number of events. + */ + /* nevlist >= 0 */ + break; + default: + nevlist = 0; + interrupted = 1; + break; + } + } + clock_update_local_date(timeout, nevlist); + + if (nevlist || interrupted) + break; + if (timeout || !wait_time) + break; + if (tick_isset(exp) && tick_is_expired(exp, now_ms)) + break; + } while(1); + + clock_update_global_date(); + fd_leaving_poll(wait_time, nevlist); + + if (nevlist > 0) + activity[tid].poll_io++; + + for (i = 0; i < nevlist; i++) { + unsigned int n = 0; + int events, rebind_events; + int ret; + + fd = evports_evlist[i].portev_object; + events = evports_evlist[i].portev_events; + +#ifdef DEBUG_FD + _HA_ATOMIC_INC(&fdtab[fd].event_count); +#endif + /* + * By virtue of receiving an event for this file descriptor, it + * is no longer associated with the port in question. Store + * the previous event mask so that we may reassociate after + * processing is complete. + */ + rebind_events = evports_state_to_events(fdtab[fd].state); + /* rebind_events != 0 */ + + /* + * Set bits based on the events we received from the port: + */ + n = ((events & POLLIN) ? FD_EV_READY_R : 0) | + ((events & POLLOUT) ? FD_EV_READY_W : 0) | + ((events & POLLHUP) ? FD_EV_SHUT_RW : 0) | + ((events & POLLERR) ? FD_EV_ERR_RW : 0); + + /* + * Call connection processing callbacks. Note that it's + * possible for this processing to alter the required event + * port association; i.e., the "state" member of the "fdtab" + * entry. If it changes, the fd will be placed on the updated + * list for processing the next time we are called. + */ + ret = fd_update_events(fd, n); + + /* polling will be on this instance if the FD was migrated */ + if (ret == FD_UPDT_MIGRATED) + continue; + + /* + * This file descriptor was closed during the processing of + * polled events. No need to reassociate. + */ + if (ret == FD_UPDT_CLOSED) + continue; + + /* + * Reassociate with the port, using the same event mask as + * before. This call will not result in a dissociation as we + * asserted that _some_ events needed to be rebound above. + * + * Reassociating with the same mask allows us to mimic the + * level-triggered behaviour of poll(2). In the event that we + * are interested in the same events on the next turn of the + * loop, this represents no extra work. + * + * If this additional port_associate(3C) call becomes a + * performance problem, we would need to verify that we can + * correctly interact with the file descriptor cache and update + * list (see "src/fd.c") to avoid reassociating here, or to use + * a different events mask. 
+ */ + evports_resync_fd(fd, rebind_events); + } +} + +static int init_evports_per_thread() +{ + evports_evlist_max = global.tune.maxpollevents; + evports_evlist = calloc(evports_evlist_max, sizeof(*evports_evlist)); + if (evports_evlist == NULL) { + goto fail_alloc; + } + + if (MAX_THREADS > 1 && tid) { + if ((evports_fd[tid] = port_create()) == -1) { + goto fail_fd; + } + } + + /* we may have to unregister some events initially registered on the + * original fd when it was alone, and/or to register events on the new + * fd for this thread. Let's just mark them as updated, the poller will + * do the rest. + */ + fd_reregister_all(tgid, ti->ltid_bit); + + return 1; + + fail_fd: + ha_free(&evports_evlist); + evports_evlist_max = 0; + fail_alloc: + return 0; +} + +static void deinit_evports_per_thread() +{ + if (MAX_THREADS > 1 && tid) + close(evports_fd[tid]); + + ha_free(&evports_evlist); + evports_evlist_max = 0; +} + +/* + * Initialisation of the event ports poller. + * Returns 0 in case of failure, non-zero in case of success. + */ +static int _do_init(struct poller *p) +{ + p->private = NULL; + + if ((evports_fd[tid] = port_create()) == -1) { + goto fail; + } + + hap_register_per_thread_init(init_evports_per_thread); + hap_register_per_thread_deinit(deinit_evports_per_thread); + + return 1; + +fail: + p->pref = 0; + return 0; +} + +/* + * Termination of the event ports poller. + * All resources are released and the poller is marked as inoperative. + */ +static void _do_term(struct poller *p) +{ + if (evports_fd[tid] != -1) { + close(evports_fd[tid]); + evports_fd[tid] = -1; + } + + p->private = NULL; + p->pref = 0; + + ha_free(&evports_evlist); + evports_evlist_max = 0; +} + +/* + * Run-time check to make sure we can allocate the resources needed for + * the poller to function correctly. + * Returns 1 on success, otherwise 0. + */ +static int _do_test(struct poller *p) +{ + int fd; + + if ((fd = port_create()) == -1) { + return 0; + } + + close(fd); + return 1; +} + +/* + * Close and recreate the event port after fork(). Returns 1 on success, + * otherwise 0. If this function fails, "_do_term()" must be called to + * clean up the poller. + */ +static int _do_fork(struct poller *p) +{ + if (evports_fd[tid] != -1) { + close(evports_fd[tid]); + } + + if ((evports_fd[tid] = port_create()) == -1) { + return 0; + } + + return 1; +} + +/* + * Registers the poller. + */ +static void _do_register(void) +{ + struct poller *p; + int i; + + if (nbpollers >= MAX_POLLERS) + return; + + for (i = 0; i < MAX_THREADS; i++) + evports_fd[i] = -1; + + p = &pollers[nbpollers++]; + + p->name = "evports"; + p->pref = 300; + p->flags = HAP_POLL_F_ERRHUP; + p->private = NULL; + + p->clo = NULL; + p->test = _do_test; + p->init = _do_init; + p->term = _do_term; + p->poll = _do_poll; + p->fork = _do_fork; +} + +INITCALL0(STG_REGISTER, _do_register); diff --git a/src/ev_kqueue.c b/src/ev_kqueue.c new file mode 100644 index 0000000..f123e7b --- /dev/null +++ b/src/ev_kqueue.c @@ -0,0 +1,380 @@ +/* + * FD polling functions for FreeBSD kqueue() + * + * Copyright 2000-2014 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <unistd.h> +#include <sys/time.h> +#include <sys/types.h> + +#include <sys/event.h> +#include <sys/time.h> + +#include <haproxy/activity.h> +#include <haproxy/api.h> +#include <haproxy/clock.h> +#include <haproxy/fd.h> +#include <haproxy/global.h> +#include <haproxy/signal.h> +#include <haproxy/task.h> +#include <haproxy/ticks.h> + + +/* private data */ +static int kqueue_fd[MAX_THREADS] __read_mostly; // per-thread kqueue_fd +static THREAD_LOCAL struct kevent *kev = NULL; +static struct kevent *kev_out = NULL; // Trash buffer for kevent() to write the eventlist in + +static int _update_fd(int fd, int start) +{ + int en; + int changes = start; + ulong pr, ps; + + en = fdtab[fd].state; + pr = _HA_ATOMIC_LOAD(&polled_mask[fd].poll_recv); + ps = _HA_ATOMIC_LOAD(&polled_mask[fd].poll_send); + + if (!(fdtab[fd].thread_mask & ti->ltid_bit) || !(en & FD_EV_ACTIVE_RW)) { + if (!((pr | ps) & ti->ltid_bit)) { + /* fd was not watched, it's still not */ + return changes; + } + /* fd totally removed from poll list */ + EV_SET(&kev[changes++], fd, EVFILT_READ, EV_DELETE, 0, 0, NULL); + EV_SET(&kev[changes++], fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL); + if (pr & ti->ltid_bit) + _HA_ATOMIC_AND(&polled_mask[fd].poll_recv, ~ti->ltid_bit); + if (ps & ti->ltid_bit) + _HA_ATOMIC_AND(&polled_mask[fd].poll_send, ~ti->ltid_bit); + } + else { + /* OK fd has to be monitored, it was either added or changed */ + + if (en & FD_EV_ACTIVE_R) { + if (!(pr & ti->ltid_bit)) { + EV_SET(&kev[changes++], fd, EVFILT_READ, EV_ADD, 0, 0, NULL); + _HA_ATOMIC_OR(&polled_mask[fd].poll_recv, ti->ltid_bit); + } + } + else if (pr & ti->ltid_bit) { + EV_SET(&kev[changes++], fd, EVFILT_READ, EV_DELETE, 0, 0, NULL); + HA_ATOMIC_AND(&polled_mask[fd].poll_recv, ~ti->ltid_bit); + } + + if (en & FD_EV_ACTIVE_W) { + if (!(ps & ti->ltid_bit)) { + EV_SET(&kev[changes++], fd, EVFILT_WRITE, EV_ADD, 0, 0, NULL); + _HA_ATOMIC_OR(&polled_mask[fd].poll_send, ti->ltid_bit); + } + } + else if (ps & ti->ltid_bit) { + EV_SET(&kev[changes++], fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL); + _HA_ATOMIC_AND(&polled_mask[fd].poll_send, ~ti->ltid_bit); + } + + } + return changes; +} + +/* + * kqueue() poller + */ +static void _do_poll(struct poller *p, int exp, int wake) +{ + int status; + int count, fd, wait_time; + struct timespec timeout_ts; + int updt_idx; + int changes = 0; + int old_fd; + + timeout_ts.tv_sec = 0; + timeout_ts.tv_nsec = 0; + /* first, scan the update list to find changes */ + for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) { + fd = fd_updt[updt_idx]; + + if (!fd_grab_tgid(fd, tgid)) { + /* was reassigned */ + activity[tid].poll_drop_fd++; + continue; + } + + _HA_ATOMIC_AND(&fdtab[fd].update_mask, ~ti->ltid_bit); + + if (fdtab[fd].owner) + changes = _update_fd(fd, changes); + else + activity[tid].poll_drop_fd++; + + fd_drop_tgid(fd); + } + /* Scan the global update list */ + for (old_fd = fd = update_list[tgid - 1].first; fd != -1; fd = fdtab[fd].update.next) { + if (fd == -2) { + fd = old_fd; + continue; + } + else if (fd <= -3) + fd = -fd -4; + if (fd == -1) + break; + + if (!fd_grab_tgid(fd, tgid)) { + /* was reassigned */ + activity[tid].poll_drop_fd++; + continue; + } + + if (!(fdtab[fd].update_mask & ti->ltid_bit)) { + fd_drop_tgid(fd); + continue; + } + + done_update_polling(fd); + + if (fdtab[fd].owner) + changes = _update_fd(fd, changes); + else + activity[tid].poll_drop_fd++; + + fd_drop_tgid(fd); + } + + thread_idle_now(); + thread_harmless_now(); + + if (changes) { +#ifdef EV_RECEIPT + kev[0].flags |= 
EV_RECEIPT; +#else + /* If EV_RECEIPT isn't defined, just add an invalid entry, + * so that we get an error and kevent() stops before scanning + * the kqueue. + */ + EV_SET(&kev[changes++], -1, EVFILT_WRITE, EV_DELETE, 0, 0, NULL); +#endif + kevent(kqueue_fd[tid], kev, changes, kev_out, changes, &timeout_ts); + } + fd_nbupdt = 0; + + /* Now let's wait for polled events. */ + wait_time = wake ? 0 : compute_poll_timeout(exp); + fd = global.tune.maxpollevents; + clock_entering_poll(); + + do { + int timeout = (global.tune.options & GTUNE_BUSY_POLLING) ? 0 : wait_time; + + timeout_ts.tv_sec = (timeout / 1000); + timeout_ts.tv_nsec = (timeout % 1000) * 1000000; + + status = kevent(kqueue_fd[tid], // int kq + NULL, // const struct kevent *changelist + 0, // int nchanges + kev, // struct kevent *eventlist + fd, // int nevents + &timeout_ts); // const struct timespec *timeout + clock_update_local_date(timeout, status); + + if (status) { + activity[tid].poll_io++; + break; + } + if (timeout || !wait_time) + break; + if (tick_isset(exp) && tick_is_expired(exp, now_ms)) + break; + } while (1); + + clock_update_global_date(); + fd_leaving_poll(wait_time, status); + + for (count = 0; count < status; count++) { + unsigned int n = 0; + + fd = kev[count].ident; + +#ifdef DEBUG_FD + _HA_ATOMIC_INC(&fdtab[fd].event_count); +#endif + if (kev[count].filter == EVFILT_READ) { + if (kev[count].data || !(kev[count].flags & EV_EOF)) + n |= FD_EV_READY_R; + if (kev[count].flags & EV_EOF) + n |= FD_EV_SHUT_R; + } + else if (kev[count].filter == EVFILT_WRITE) { + n |= FD_EV_READY_W; + if (kev[count].flags & EV_EOF) + n |= FD_EV_ERR_RW; + } + + fd_update_events(fd, n); + } +} + + +static int init_kqueue_per_thread() +{ + /* we can have up to two events per fd, so allocate enough to store + * 2*fd event, and an extra one, in case EV_RECEIPT isn't defined, + * so that we can add an invalid entry and get an error, to avoid + * scanning the kqueue uselessly. + */ + kev = calloc(1, sizeof(struct kevent) * (2 * global.maxsock + 1)); + if (kev == NULL) + goto fail_alloc; + + if (MAX_THREADS > 1 && tid) { + kqueue_fd[tid] = kqueue(); + if (kqueue_fd[tid] < 0) + goto fail_fd; + } + + /* we may have to unregister some events initially registered on the + * original fd when it was alone, and/or to register events on the new + * fd for this thread. Let's just mark them as updated, the poller will + * do the rest. + */ + fd_reregister_all(tgid, ti->ltid_bit); + + return 1; + fail_fd: + free(kev); + fail_alloc: + return 0; +} + +static void deinit_kqueue_per_thread() +{ + if (MAX_THREADS > 1 && tid) + close(kqueue_fd[tid]); + + ha_free(&kev); +} + +/* + * Initialization of the kqueue() poller. + * Returns 0 in case of failure, non-zero in case of success. If it fails, it + * disables the poller by setting its pref to 0. + */ +static int _do_init(struct poller *p) +{ + p->private = NULL; + + /* we can have up to two events per fd, so allocate enough to store + * 2*fd event, and an extra one, in case EV_RECEIPT isn't defined, + * so that we can add an invalid entry and get an error, to avoid + * scanning the kqueue uselessly. 
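+ * For example, with global.maxsock = 1024 this allocates
+ * 2 * 1024 + 1 = 2049 kevent slots.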
+ */ + kev_out = calloc(1, sizeof(struct kevent) * (2 * global.maxsock + 1)); + if (!kev_out) + goto fail_alloc; + + kqueue_fd[tid] = kqueue(); + if (kqueue_fd[tid] < 0) + goto fail_fd; + + hap_register_per_thread_init(init_kqueue_per_thread); + hap_register_per_thread_deinit(deinit_kqueue_per_thread); + return 1; + + fail_fd: + ha_free(&kev_out); +fail_alloc: + p->pref = 0; + return 0; +} + +/* + * Termination of the kqueue() poller. + * Memory is released and the poller is marked as unselectable. + */ +static void _do_term(struct poller *p) +{ + if (kqueue_fd[tid] >= 0) { + close(kqueue_fd[tid]); + kqueue_fd[tid] = -1; + } + + p->private = NULL; + p->pref = 0; + if (kev_out) { + ha_free(&kev_out); + } +} + +/* + * Check that the poller works. + * Returns 1 if OK, otherwise 0. + */ +static int _do_test(struct poller *p) +{ + int fd; + + fd = kqueue(); + if (fd < 0) + return 0; + close(fd); + return 1; +} + +/* + * Recreate the kqueue file descriptor after a fork(). Returns 1 if OK, + * otherwise 0. Note that some pollers need to be reopened after a fork() + * (such as kqueue), and some others may fail to do so in a chroot. + */ +static int _do_fork(struct poller *p) +{ + kqueue_fd[tid] = kqueue(); + if (kqueue_fd[tid] < 0) + return 0; + return 1; +} + +/* + * Registers the poller. + */ +static void _do_register(void) +{ + struct poller *p; + int i; + + if (nbpollers >= MAX_POLLERS) + return; + + for (i = 0; i < MAX_THREADS; i++) + kqueue_fd[i] = -1; + + p = &pollers[nbpollers++]; + + p->name = "kqueue"; + p->pref = 300; + p->flags = HAP_POLL_F_RDHUP | HAP_POLL_F_ERRHUP; + p->private = NULL; + + p->clo = NULL; + p->test = _do_test; + p->init = _do_init; + p->term = _do_term; + p->poll = _do_poll; + p->fork = _do_fork; +} + +INITCALL0(STG_REGISTER, _do_register); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/ev_poll.c b/src/ev_poll.c new file mode 100644 index 0000000..e98630c --- /dev/null +++ b/src/ev_poll.c @@ -0,0 +1,348 @@ +/* + * FD polling functions for generic poll() + * + * Copyright 2000-2014 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#define _GNU_SOURCE // for POLLRDHUP on Linux + +#include <unistd.h> +#include <poll.h> +#include <sys/time.h> +#include <sys/types.h> + +#include <haproxy/activity.h> +#include <haproxy/api.h> +#include <haproxy/clock.h> +#include <haproxy/fd.h> +#include <haproxy/global.h> +#include <haproxy/signal.h> +#include <haproxy/task.h> +#include <haproxy/ticks.h> + + +#ifndef POLLRDHUP +/* POLLRDHUP was defined late in libc, and it appeared in kernel 2.6.17 */ +#define POLLRDHUP 0 +#endif + +static int maxfd; /* # of the highest fd + 1 */ +static unsigned int *fd_evts[2]; + +/* private data */ +static THREAD_LOCAL int nbfd = 0; +static THREAD_LOCAL struct pollfd *poll_events = NULL; + +static void __fd_clo(int fd) +{ + hap_fd_clr(fd, fd_evts[DIR_RD]); + hap_fd_clr(fd, fd_evts[DIR_WR]); +} + +static void _update_fd(int fd, int *max_add_fd) +{ + int en; + ulong pr, ps; + + en = fdtab[fd].state; + pr = _HA_ATOMIC_LOAD(&polled_mask[fd].poll_recv); + ps = _HA_ATOMIC_LOAD(&polled_mask[fd].poll_send); + + /* we have a single state for all threads, which is why we + * don't check the tid_bit. First thread to see the update + * takes it for every other one. 
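+ * This is also why the removal path below clears polled_mask by
+ * ANDing it with zero, wiping the bits of all threads at once.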
+ */ + if (!(en & FD_EV_ACTIVE_RW)) { + if (!(pr | ps)) { + /* fd was not watched, it's still not */ + return; + } + /* fd totally removed from poll list */ + hap_fd_clr(fd, fd_evts[DIR_RD]); + hap_fd_clr(fd, fd_evts[DIR_WR]); + _HA_ATOMIC_AND(&polled_mask[fd].poll_recv, 0); + _HA_ATOMIC_AND(&polled_mask[fd].poll_send, 0); + } + else { + /* OK fd has to be monitored, it was either added or changed */ + if (!(en & FD_EV_ACTIVE_R)) { + hap_fd_clr(fd, fd_evts[DIR_RD]); + if (pr & ti->ltid_bit) + _HA_ATOMIC_AND(&polled_mask[fd].poll_recv, ~ti->ltid_bit); + } else { + hap_fd_set(fd, fd_evts[DIR_RD]); + if (!(pr & ti->ltid_bit)) + _HA_ATOMIC_OR(&polled_mask[fd].poll_recv, ti->ltid_bit); + } + + if (!(en & FD_EV_ACTIVE_W)) { + hap_fd_clr(fd, fd_evts[DIR_WR]); + if (ps & ti->ltid_bit) + _HA_ATOMIC_AND(&polled_mask[fd].poll_send, ~ti->ltid_bit); + } else { + hap_fd_set(fd, fd_evts[DIR_WR]); + if (!(ps & ti->ltid_bit)) + _HA_ATOMIC_OR(&polled_mask[fd].poll_send, ti->ltid_bit); + } + + if (fd > *max_add_fd) + *max_add_fd = fd; + } +} + +/* + * Poll() poller + */ +static void _do_poll(struct poller *p, int exp, int wake) +{ + int status; + int fd; + int wait_time; + int updt_idx; + int fds, count; + int sr, sw; + int old_maxfd, new_maxfd, max_add_fd; + unsigned rn, wn; /* read new, write new */ + int old_fd; + + max_add_fd = -1; + + /* first, scan the update list to find changes */ + for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) { + fd = fd_updt[updt_idx]; + + _HA_ATOMIC_AND(&fdtab[fd].update_mask, ~ti->ltid_bit); + if (!fdtab[fd].owner) { + activity[tid].poll_drop_fd++; + continue; + } + _update_fd(fd, &max_add_fd); + } + + /* Now scan the global update list */ + for (old_fd = fd = update_list[tgid - 1].first; fd != -1; fd = fdtab[fd].update.next) { + if (fd == -2) { + fd = old_fd; + continue; + } + else if (fd <= -3) + fd = -fd -4; + if (fd == -1) + break; + if (fdtab[fd].update_mask & ti->ltid_bit) { + /* Cheat a bit, as the state is global to all pollers + * we don't need every thread to take care of the + * update. + */ + _HA_ATOMIC_AND(&fdtab[fd].update_mask, ~tg->threads_enabled); + done_update_polling(fd); + } else + continue; + if (!fdtab[fd].owner) + continue; + _update_fd(fd, &max_add_fd); + } + + /* maybe we added at least one fd larger than maxfd */ + for (old_maxfd = maxfd; old_maxfd <= max_add_fd; ) { + if (_HA_ATOMIC_CAS(&maxfd, &old_maxfd, max_add_fd + 1)) + break; + } + + /* maxfd doesn't need to be precise but it needs to cover *all* active + * FDs. Thus we only shrink it if we have such an opportunity. The algo + * is simple : look for the previous used place, try to update maxfd to + * point to it, abort if maxfd changed in the mean time. 
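+ * For example, if maxfd is 1000 and fds 900..999 were closed,
+ * new_maxfd walks down to 900, and the CAS only commits if no
+ * other thread moved maxfd in between.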
+ */ + old_maxfd = maxfd; + do { + new_maxfd = old_maxfd; + while (new_maxfd - 1 >= 0 && !fdtab[new_maxfd - 1].owner) + new_maxfd--; + if (new_maxfd >= old_maxfd) + break; + } while (!_HA_ATOMIC_CAS(&maxfd, &old_maxfd, new_maxfd)); + + thread_idle_now(); + thread_harmless_now(); + + fd_nbupdt = 0; + + nbfd = 0; + for (fds = 0; (fds * 8*sizeof(**fd_evts)) < maxfd; fds++) { + rn = fd_evts[DIR_RD][fds]; + wn = fd_evts[DIR_WR][fds]; + + if (!(rn|wn)) + continue; + + for (count = 0, fd = fds * 8*sizeof(**fd_evts); count < 8*sizeof(**fd_evts) && fd < maxfd; count++, fd++) { + sr = (rn >> count) & 1; + sw = (wn >> count) & 1; + if ((sr|sw)) { + if (!fdtab[fd].owner) { + /* should normally not happen here except + * due to rare thread concurrency + */ + continue; + } + + if (!(fdtab[fd].thread_mask & ti->ltid_bit)) { + continue; + } + + poll_events[nbfd].fd = fd; + poll_events[nbfd].events = (sr ? (POLLIN | POLLRDHUP) : 0) | (sw ? POLLOUT : 0); + nbfd++; + } + } + } + + /* Now let's wait for polled events. */ + wait_time = wake ? 0 : compute_poll_timeout(exp); + clock_entering_poll(); + status = poll(poll_events, nbfd, wait_time); + clock_update_date(wait_time, status); + + fd_leaving_poll(wait_time, status); + + if (status > 0) + activity[tid].poll_io++; + + for (count = 0; status > 0 && count < nbfd; count++) { + unsigned int n; + int e = poll_events[count].revents; + + fd = poll_events[count].fd; + + if ((e & POLLRDHUP) && !(cur_poller.flags & HAP_POLL_F_RDHUP)) + _HA_ATOMIC_OR(&cur_poller.flags, HAP_POLL_F_RDHUP); + +#ifdef DEBUG_FD + _HA_ATOMIC_INC(&fdtab[fd].event_count); +#endif + if (!(e & ( POLLOUT | POLLIN | POLLERR | POLLHUP | POLLRDHUP ))) + continue; + + /* ok, we found one active fd */ + status--; + + n = ((e & POLLIN) ? FD_EV_READY_R : 0) | + ((e & POLLOUT) ? FD_EV_READY_W : 0) | + ((e & POLLRDHUP) ? FD_EV_SHUT_R : 0) | + ((e & POLLHUP) ? FD_EV_SHUT_RW : 0) | + ((e & POLLERR) ? FD_EV_ERR_RW : 0); + + fd_update_events(fd, n); + } +} + + +static int init_poll_per_thread() +{ + poll_events = calloc(1, sizeof(struct pollfd) * global.maxsock); + if (poll_events == NULL) + return 0; + return 1; +} + +static void deinit_poll_per_thread() +{ + ha_free(&poll_events); +} + +/* + * Initialization of the poll() poller. + * Returns 0 in case of failure, non-zero in case of success. If it fails, it + * disables the poller by setting its pref to 0. + */ +static int _do_init(struct poller *p) +{ + __label__ fail_swevt, fail_srevt; + int fd_evts_bytes; + + p->private = NULL; + + /* this old poller uses a process-wide FD list that cannot work with + * groups. + */ + if (global.nbtgroups > 1) + goto fail_srevt; + + fd_evts_bytes = (global.maxsock + sizeof(**fd_evts) * 8 - 1) / (sizeof(**fd_evts) * 8) * sizeof(**fd_evts); + + if ((fd_evts[DIR_RD] = calloc(1, fd_evts_bytes)) == NULL) + goto fail_srevt; + if ((fd_evts[DIR_WR] = calloc(1, fd_evts_bytes)) == NULL) + goto fail_swevt; + + hap_register_per_thread_init(init_poll_per_thread); + hap_register_per_thread_deinit(deinit_poll_per_thread); + + return 1; + + fail_swevt: + free(fd_evts[DIR_RD]); + fail_srevt: + p->pref = 0; + return 0; +} + +/* + * Termination of the poll() poller. + * Memory is released and the poller is marked as unselectable. + */ +static void _do_term(struct poller *p) +{ + free(fd_evts[DIR_WR]); + free(fd_evts[DIR_RD]); + p->private = NULL; + p->pref = 0; +} + +/* + * Check that the poller works. + * Returns 1 if OK, otherwise 0. + */ +static int _do_test(struct poller *p) +{ + return 1; +} + +/* + * Registers the poller. 
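+ * poll registers with pref 200, below the OS-specific pollers
+ * (epoll, kqueue and evports all use 300), so it is only retained
+ * when no better poller is available.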
+ */ +static void _do_register(void) +{ + struct poller *p; + + if (nbpollers >= MAX_POLLERS) + return; + p = &pollers[nbpollers++]; + + p->name = "poll"; + p->pref = 200; + p->flags = HAP_POLL_F_ERRHUP; + p->private = NULL; + + p->clo = __fd_clo; + p->test = _do_test; + p->init = _do_init; + p->term = _do_term; + p->poll = _do_poll; +} + +INITCALL0(STG_REGISTER, _do_register); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/ev_select.c b/src/ev_select.c new file mode 100644 index 0000000..eadd588 --- /dev/null +++ b/src/ev_select.c @@ -0,0 +1,335 @@ +/* + * FD polling functions for generic select() + * + * Copyright 2000-2014 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <unistd.h> +#include <sys/time.h> +#include <sys/types.h> + +#include <haproxy/activity.h> +#include <haproxy/api.h> +#include <haproxy/clock.h> +#include <haproxy/fd.h> +#include <haproxy/global.h> +#include <haproxy/task.h> +#include <haproxy/ticks.h> + + +/* private data */ +static int maxfd; /* # of the highest fd + 1 */ +static unsigned int *fd_evts[2]; +static THREAD_LOCAL fd_set *tmp_evts[2]; + +/* Immediately remove the entry upon close() */ +static void __fd_clo(int fd) +{ + hap_fd_clr(fd, fd_evts[DIR_RD]); + hap_fd_clr(fd, fd_evts[DIR_WR]); +} + +static void _update_fd(int fd, int *max_add_fd) +{ + int en; + ulong pr, ps; + + en = fdtab[fd].state; + pr = _HA_ATOMIC_LOAD(&polled_mask[fd].poll_recv); + ps = _HA_ATOMIC_LOAD(&polled_mask[fd].poll_send); + + /* we have a single state for all threads, which is why we + * don't check the tid_bit. First thread to see the update + * takes it for every other one. 
+ */ + if (!(en & FD_EV_ACTIVE_RW)) { + if (!(pr | ps)) { + /* fd was not watched, it's still not */ + return; + } + /* fd totally removed from poll list */ + hap_fd_clr(fd, fd_evts[DIR_RD]); + hap_fd_clr(fd, fd_evts[DIR_WR]); + _HA_ATOMIC_AND(&polled_mask[fd].poll_recv, 0); + _HA_ATOMIC_AND(&polled_mask[fd].poll_send, 0); + } + else { + /* OK fd has to be monitored, it was either added or changed */ + if (!(en & FD_EV_ACTIVE_R)) { + hap_fd_clr(fd, fd_evts[DIR_RD]); + if (pr & ti->ltid_bit) + _HA_ATOMIC_AND(&polled_mask[fd].poll_recv, ~ti->ltid_bit); + } else { + hap_fd_set(fd, fd_evts[DIR_RD]); + if (!(pr & ti->ltid_bit)) + _HA_ATOMIC_OR(&polled_mask[fd].poll_recv, ti->ltid_bit); + } + + if (!(en & FD_EV_ACTIVE_W)) { + hap_fd_clr(fd, fd_evts[DIR_WR]); + if (ps & ti->ltid_bit) + _HA_ATOMIC_AND(&polled_mask[fd].poll_send, ~ti->ltid_bit); + } else { + hap_fd_set(fd, fd_evts[DIR_WR]); + if (!(ps & ti->ltid_bit)) + _HA_ATOMIC_OR(&polled_mask[fd].poll_send, ti->ltid_bit); + } + + if (fd > *max_add_fd) + *max_add_fd = fd; + } +} + +/* + * Select() poller + */ +static void _do_poll(struct poller *p, int exp, int wake) +{ + int status; + int fd, i; + struct timeval delta; + int delta_ms; + int fds; + int updt_idx; + char count; + int readnotnull, writenotnull; + int old_maxfd, new_maxfd, max_add_fd; + int old_fd; + + max_add_fd = -1; + + /* first, scan the update list to find changes */ + for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) { + fd = fd_updt[updt_idx]; + + _HA_ATOMIC_AND(&fdtab[fd].update_mask, ~ti->ltid_bit); + if (!fdtab[fd].owner) { + activity[tid].poll_drop_fd++; + continue; + } + _update_fd(fd, &max_add_fd); + } + /* Now scan the global update list */ + for (old_fd = fd = update_list[tgid - 1].first; fd != -1; fd = fdtab[fd].update.next) { + if (fd == -2) { + fd = old_fd; + continue; + } + else if (fd <= -3) + fd = -fd -4; + if (fd == -1) + break; + if (fdtab[fd].update_mask & ti->ltid_bit) { + /* Cheat a bit, as the state is global to all pollers + * we don't need every thread to take care of the + * update. + */ + _HA_ATOMIC_AND(&fdtab[fd].update_mask, ~tg->threads_enabled); + done_update_polling(fd); + } else + continue; + if (!fdtab[fd].owner) + continue; + _update_fd(fd, &max_add_fd); + } + + + /* maybe we added at least one fd larger than maxfd */ + for (old_maxfd = maxfd; old_maxfd <= max_add_fd; ) { + if (_HA_ATOMIC_CAS(&maxfd, &old_maxfd, max_add_fd + 1)) + break; + } + + /* maxfd doesn't need to be precise but it needs to cover *all* active + * FDs. Thus we only shrink it if we have such an opportunity. The algo + * is simple : look for the previous used place, try to update maxfd to + * point to it, abort if maxfd changed in the mean time. + */ + old_maxfd = maxfd; + do { + new_maxfd = old_maxfd; + while (new_maxfd - 1 >= 0 && !fdtab[new_maxfd - 1].owner) + new_maxfd--; + if (new_maxfd >= old_maxfd) + break; + } while (!_HA_ATOMIC_CAS(&maxfd, &old_maxfd, new_maxfd)); + + thread_idle_now(); + thread_harmless_now(); + + fd_nbupdt = 0; + + /* let's restore fdset state */ + readnotnull = 0; writenotnull = 0; + for (i = 0; i < (maxfd + FD_SETSIZE - 1)/(8*sizeof(int)); i++) { + readnotnull |= (*(((int*)tmp_evts[DIR_RD])+i) = *(((int*)fd_evts[DIR_RD])+i)) != 0; + writenotnull |= (*(((int*)tmp_evts[DIR_WR])+i) = *(((int*)fd_evts[DIR_WR])+i)) != 0; + } + + /* now let's wait for events */ + delta_ms = wake ? 
0 : compute_poll_timeout(exp); + delta.tv_sec = (delta_ms / 1000); + delta.tv_usec = (delta_ms % 1000) * 1000; + clock_entering_poll(); + status = select(maxfd, + readnotnull ? tmp_evts[DIR_RD] : NULL, + writenotnull ? tmp_evts[DIR_WR] : NULL, + NULL, + &delta); + clock_update_date(delta_ms, status); + fd_leaving_poll(delta_ms, status); + + if (status <= 0) + return; + + activity[tid].poll_io++; + + for (fds = 0; (fds * BITS_PER_INT) < maxfd; fds++) { + if ((((int *)(tmp_evts[DIR_RD]))[fds] | ((int *)(tmp_evts[DIR_WR]))[fds]) == 0) + continue; + + for (count = BITS_PER_INT, fd = fds * BITS_PER_INT; count && fd < maxfd; count--, fd++) { + unsigned int n = 0; + + if (FD_ISSET(fd, tmp_evts[DIR_RD])) + n |= FD_EV_READY_R; + + if (FD_ISSET(fd, tmp_evts[DIR_WR])) + n |= FD_EV_READY_W; + + if (!n) + continue; + +#ifdef DEBUG_FD + _HA_ATOMIC_INC(&fdtab[fd].event_count); +#endif + + fd_update_events(fd, n); + } + } +} + +static int init_select_per_thread() +{ + int fd_set_bytes; + + fd_set_bytes = sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE; + tmp_evts[DIR_RD] = calloc(1, fd_set_bytes); + if (tmp_evts[DIR_RD] == NULL) + goto fail; + tmp_evts[DIR_WR] = calloc(1, fd_set_bytes); + if (tmp_evts[DIR_WR] == NULL) + goto fail; + return 1; + fail: + free(tmp_evts[DIR_RD]); + free(tmp_evts[DIR_WR]); + return 0; +} + +static void deinit_select_per_thread() +{ + ha_free(&tmp_evts[DIR_WR]); + ha_free(&tmp_evts[DIR_RD]); +} + +/* + * Initialization of the select() poller. + * Returns 0 in case of failure, non-zero in case of success. If it fails, it + * disables the poller by setting its pref to 0. + */ +static int _do_init(struct poller *p) +{ + int fd_set_bytes; + + p->private = NULL; + + /* this old poller uses a process-wide FD list that cannot work with + * groups. + */ + if (global.nbtgroups > 1) + goto fail_srevt; + + if (global.maxsock > FD_SETSIZE) + goto fail_srevt; + + fd_set_bytes = sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE; + + if ((fd_evts[DIR_RD] = calloc(1, fd_set_bytes)) == NULL) + goto fail_srevt; + if ((fd_evts[DIR_WR] = calloc(1, fd_set_bytes)) == NULL) + goto fail_swevt; + + hap_register_per_thread_init(init_select_per_thread); + hap_register_per_thread_deinit(deinit_select_per_thread); + + return 1; + + fail_swevt: + free(fd_evts[DIR_RD]); + fail_srevt: + p->pref = 0; + return 0; +} + +/* + * Termination of the select() poller. + * Memory is released and the poller is marked as unselectable. + */ +static void _do_term(struct poller *p) +{ + free(fd_evts[DIR_WR]); + free(fd_evts[DIR_RD]); + p->private = NULL; + p->pref = 0; +} + +/* + * Check that the poller works. + * Returns 1 if OK, otherwise 0. + */ +static int _do_test(struct poller *p) +{ + if (global.maxsock > FD_SETSIZE) + return 0; + + return 1; +} + +/* + * Registers the poller. 
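+ * select registers with the lowest pref (150): it is a last-resort
+ * fallback, and _do_test() above rejects it outright when
+ * global.maxsock exceeds FD_SETSIZE.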
+ */ +static void _do_register(void) +{ + struct poller *p; + + if (nbpollers >= MAX_POLLERS) + return; + p = &pollers[nbpollers++]; + + p->name = "select"; + p->pref = 150; + p->flags = 0; + p->private = NULL; + + p->clo = __fd_clo; + p->test = _do_test; + p->init = _do_init; + p->term = _do_term; + p->poll = _do_poll; +} + +INITCALL0(STG_REGISTER, _do_register); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/event_hdl.c b/src/event_hdl.c new file mode 100644 index 0000000..aeb4d24 --- /dev/null +++ b/src/event_hdl.c @@ -0,0 +1,999 @@ +/* + * general purpose event handlers management + * + * Copyright 2022 HAProxy Technologies + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2.1 of the License, or (at your option) any later version. + * + */ + +#include <string.h> +#include <haproxy/event_hdl.h> +#include <haproxy/compiler.h> +#include <haproxy/task.h> +#include <haproxy/tools.h> +#include <haproxy/errors.h> +#include <haproxy/signal.h> +#include <haproxy/xxhash.h> +#include <haproxy/cfgparse.h> + +/* event types changes in event_hdl-t.h file should be reflected in the + * map below to allow string to type and type to string conversions + */ +static struct event_hdl_sub_type_map event_hdl_sub_type_map[] = { + {"NONE", EVENT_HDL_SUB_NONE}, + {"SERVER", EVENT_HDL_SUB_SERVER}, + {"SERVER_ADD", EVENT_HDL_SUB_SERVER_ADD}, + {"SERVER_DEL", EVENT_HDL_SUB_SERVER_DEL}, + {"SERVER_UP", EVENT_HDL_SUB_SERVER_UP}, + {"SERVER_DOWN", EVENT_HDL_SUB_SERVER_DOWN}, + {"SERVER_STATE", EVENT_HDL_SUB_SERVER_STATE}, + {"SERVER_ADMIN", EVENT_HDL_SUB_SERVER_ADMIN}, + {"SERVER_CHECK", EVENT_HDL_SUB_SERVER_CHECK}, + {"SERVER_INETADDR", EVENT_HDL_SUB_SERVER_INETADDR}, +}; + +/* internal types (only used in this file) */ +struct event_hdl_async_task_default_ctx +{ + event_hdl_async_equeue e_queue; /* event queue list */ + event_hdl_cb_async func; /* event handling func */ +}; + +/* memory pools declarations */ +DECLARE_STATIC_POOL(pool_head_sub, "ehdl_sub", sizeof(struct event_hdl_sub)); +DECLARE_STATIC_POOL(pool_head_sub_event, "ehdl_sub_e", sizeof(struct event_hdl_async_event)); +DECLARE_STATIC_POOL(pool_head_sub_event_data, "ehdl_sub_ed", sizeof(struct event_hdl_async_event_data)); +DECLARE_STATIC_POOL(pool_head_sub_taskctx, "ehdl_sub_tctx", sizeof(struct event_hdl_async_task_default_ctx)); + +/* global event_hdl tunables (public variable) */ +struct event_hdl_tune event_hdl_tune; + +/* global subscription list (implicit where NULL is used as sublist argument) */ +static event_hdl_sub_list global_event_hdl_sub_list; + +/* every known subscription lists are tracked in this list (including the global one) */ +static struct mt_list known_event_hdl_sub_list = MT_LIST_HEAD_INIT(known_event_hdl_sub_list); + +static void _event_hdl_sub_list_destroy(event_hdl_sub_list *sub_list); + +static void event_hdl_deinit(struct sig_handler *sh) +{ + event_hdl_sub_list *cur_list; + struct mt_list *elt1, elt2; + + /* destroy all known subscription lists */ + mt_list_for_each_entry_safe(cur_list, &known_event_hdl_sub_list, known, elt1, elt2) { + /* remove cur elem from list */ + MT_LIST_DELETE_SAFE(elt1); + /* then destroy it */ + _event_hdl_sub_list_destroy(cur_list); + } +} + +static void event_hdl_init(void) +{ + /* initialize global subscription list */ + event_hdl_sub_list_init(&global_event_hdl_sub_list); + /* register the 
deinit function, will be called on soft-stop */ + signal_register_fct(0, event_hdl_deinit, 0); + + /* set some default values */ + event_hdl_tune.max_events_at_once = EVENT_HDL_MAX_AT_ONCE; +} + +/* general purpose hashing function when you want to compute + * an ID based on <scope> x <name> + * It is your responsibility to make sure <scope> is not used + * elsewhere in the code (or that you are fine with sharing + * the scope). + */ +inline uint64_t event_hdl_id(const char *scope, const char *name) +{ + XXH64_state_t state; + + XXH64_reset(&state, 0); + XXH64_update(&state, scope, strlen(scope)); + XXH64_update(&state, name, strlen(name)); + return XXH64_digest(&state); +} + +/* takes a sub_type as input, returns corresponding sub_type + * printable string or "N/A" if not found. + * If not found, an error will be reported to stderr so the developers + * know that a sub_type is missing its associated string in event_hdl-t.h + */ +const char *event_hdl_sub_type_to_string(struct event_hdl_sub_type sub_type) +{ + int it; + + for (it = 0; it < (int)(sizeof(event_hdl_sub_type_map) / sizeof(event_hdl_sub_type_map[0])); it++) { + if (sub_type.family == event_hdl_sub_type_map[it].type.family && + sub_type.subtype == event_hdl_sub_type_map[it].type.subtype) + return event_hdl_sub_type_map[it].name; + } + ha_alert("event_hdl-t.h: missing sub_type string representation.\n" + "Please reflect any changes in event_hdl_sub_type_map.\n"); + return "N/A"; +} + +/* returns the internal sub_type corresponding + * to the printable representation <name> + * or EVENT_HDL_SUB_NONE if no such event exists + * (see event_hdl-t.h for the complete list of supported types) + */ +struct event_hdl_sub_type event_hdl_string_to_sub_type(const char *name) +{ + int it; + + for (it = 0; it < (int)(sizeof(event_hdl_sub_type_map) / sizeof(event_hdl_sub_type_map[0])); it++) { + if (!strcmp(name, event_hdl_sub_type_map[it].name)) + return event_hdl_sub_type_map[it].type; + } + return EVENT_HDL_SUB_NONE; +} + +/* Takes <subscriptions> sub list as input, returns a printable string + * containing every sub_types contained in <subscriptions> + * separated by '|' char. + * Returns NULL if no sub_types are found in <subscriptions> + * This functions leverages memprintf, thus it is up to the + * caller to free the returned value (if != NULL) when he no longer + * uses it. + */ +char *event_hdl_sub_type_print(struct event_hdl_sub_type subscriptions) +{ + char *out = NULL; + int it; + uint8_t first = 1; + + for (it = 0; it < (int)(sizeof(event_hdl_sub_type_map) / sizeof(event_hdl_sub_type_map[0])); it++) { + if (subscriptions.family == event_hdl_sub_type_map[it].type.family && + ((subscriptions.subtype & event_hdl_sub_type_map[it].type.subtype) == + event_hdl_sub_type_map[it].type.subtype)) { + if (first) { + memprintf(&out, "%s", event_hdl_sub_type_map[it].name); + first--; + } + else + memprintf(&out, "%s%s%s", out, "|", event_hdl_sub_type_map[it].name); + } + } + + return out; +} + +/* event_hdl debug/reporting function */ +typedef void (*event_hdl_report_hdl_state_func)(const char *fmt, ...); +static void event_hdl_report_hdl_state(event_hdl_report_hdl_state_func report_func, + const struct event_hdl *hdl, const char *what, const char *state) +{ + report_func("[event_hdl]:%s (%s)'#%llu@%s': %s\n", + what, + (hdl->async) ? 
"ASYNC" : "SYNC", + (long long unsigned int)hdl->id, + hdl->dorigin, + state); +} + +static inline void _event_hdl_async_data_drop(struct event_hdl_async_event_data *data) +{ + if (HA_ATOMIC_SUB_FETCH(&data->refcount, 1) == 0) { + /* we were the last one holding a reference to event data - free required */ + if (data->mfree) { + /* Some event data members are dynamically allocated and thus + * require specific cleanup using user-provided function. + * We directly pass a pointer to internal data storage but + * we only expect the cleanup function to typecast it in the + * relevant data type to give enough context to the function to + * perform the cleanup on data members, and not actually freeing + * data pointer since it is our internal buffer :) + */ + data->mfree(&data->data); + } + pool_free(pool_head_sub_event_data, data); + } +} + +void event_hdl_async_free_event(struct event_hdl_async_event *e) +{ + if (unlikely(event_hdl_sub_type_equal(e->type, EVENT_HDL_SUB_END))) { + /* last event for hdl, special case */ + /* free subscription entry as we're the last one still using it + * (it is already removed from mt_list, no race can occur) + */ + event_hdl_drop(e->sub_mgmt.this); + HA_ATOMIC_DEC(&jobs); + } + else if (e->_data) + _event_hdl_async_data_drop(e->_data); /* data wrapper */ + pool_free(pool_head_sub_event, e); +} + +/* wakeup the task depending on its type: + * normal async mode internally uses tasklets but advanced async mode + * allows both tasks and tasklets. + * While tasks and tasklets may be easily casted, we need to use the proper + * API to wake them up (the waiting queues are exclusive). + */ +static void event_hdl_task_wakeup(struct tasklet *task) +{ + if (TASK_IS_TASKLET(task)) + tasklet_wakeup(task); + else + task_wakeup((struct task *)task, TASK_WOKEN_OTHER); /* TODO: switch to TASK_WOKEN_EVENT? 
*/ +} + +/* task handler used for normal async subscription mode + * if you use advanced async subscription mode, you can use this + * as an example to implement your own task wrapper + */ +static struct task *event_hdl_async_task_default(struct task *task, void *ctx, unsigned int state) +{ + struct tasklet *tl = (struct tasklet *)task; + struct event_hdl_async_task_default_ctx *task_ctx = ctx; + struct event_hdl_async_event *event; + int max_notif_at_once_it = 0; + uint8_t done = 0; + + /* run through e_queue, and call func() for each event + * if we read END event, it indicates we must stop: + * no more events to come (handler is unregistered) + * so we must free task_ctx and stop task + */ + while (max_notif_at_once_it < event_hdl_tune.max_events_at_once && + (event = event_hdl_async_equeue_pop(&task_ctx->e_queue))) + { + if (event_hdl_sub_type_equal(event->type, EVENT_HDL_SUB_END)) { + done = 1; + event_hdl_async_free_event(event); + /* break is normally not even required, EVENT_HDL_SUB_END + * is guaranteed to be last event of e_queue + * (because in normal mode one sub == one e_queue) + */ + break; + } + else { + struct event_hdl_cb cb; + + cb.e_type = event->type; + cb.e_data = event->data; + cb.sub_mgmt = &event->sub_mgmt; + cb._sync = 0; + + /* call user function */ + task_ctx->func(&cb, event->private); + max_notif_at_once_it++; + } + event_hdl_async_free_event(event); + } + + if (done) { + /* our job is done, subscription is over: no more events to come */ + pool_free(pool_head_sub_taskctx, task_ctx); + tasklet_free(tl); + return NULL; + } + return task; +} + +/* internal subscription mgmt functions */ +static inline struct event_hdl_sub_type _event_hdl_getsub(struct event_hdl_sub *cur_sub) +{ + return cur_sub->sub; +} + +static inline struct event_hdl_sub_type _event_hdl_getsub_async(struct event_hdl_sub *cur_sub) +{ + struct mt_list lock; + struct event_hdl_sub_type type = EVENT_HDL_SUB_NONE; + + lock = MT_LIST_LOCK_ELT(&cur_sub->mt_list); + if (lock.next != &cur_sub->mt_list) + type = _event_hdl_getsub(cur_sub); + // else already removed + MT_LIST_UNLOCK_ELT(&cur_sub->mt_list, lock); + return type; +} + +static inline int _event_hdl_resub(struct event_hdl_sub *cur_sub, struct event_hdl_sub_type type) +{ + if (!event_hdl_sub_family_equal(cur_sub->sub, type)) + return 0; /* family types differ, do nothing */ + cur_sub->sub.subtype = type.subtype; /* new subtype assignment */ + return 1; +} + +static inline int _event_hdl_resub_async(struct event_hdl_sub *cur_sub, struct event_hdl_sub_type type) +{ + int status = 0; + struct mt_list lock; + + lock = MT_LIST_LOCK_ELT(&cur_sub->mt_list); + if (lock.next != &cur_sub->mt_list) + status = _event_hdl_resub(cur_sub, type); + // else already removed + MT_LIST_UNLOCK_ELT(&cur_sub->mt_list, lock); + return status; +} + +static inline void _event_hdl_unsubscribe(struct event_hdl_sub *del_sub) +{ + struct mt_list lock; + + if (del_sub->hdl.async) { + /* ASYNC SUB MODE */ + /* push EVENT_HDL_SUB_END (to notify the task that the subscription is dead) */ + + /* push END EVENT in busy state so we can safely wakeup + * the task before releasing it. 
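+ * ("busy" meaning the element is appended in locked state via + * MT_LIST_APPEND_LOCKED() and only unlocked once the wakeup was sent.)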
+ * Not doing that would expose us to a race where the task could've already + * consumed the END event before the wakeup, and some tasks + * kill themselves (ie: normal async mode) when they receive such event + */ + HA_ATOMIC_INC(&del_sub->hdl.async_equeue->size); + lock = MT_LIST_APPEND_LOCKED(&del_sub->hdl.async_equeue->head, &del_sub->async_end->mt_list); + + /* wake up the task */ + event_hdl_task_wakeup(del_sub->hdl.async_task); + + /* unlock END EVENT (we're done, the task is now free to consume it) */ + MT_LIST_UNLOCK_ELT(&del_sub->async_end->mt_list, lock); + + /* we don't free sub here + * freeing will be performed by async task so it can safely rely + * on the pointer until it notices it + */ + } else { + /* SYNC SUB MODE */ + + /* we can directly free the subscription: + * no other thread can access it since we successfully + * removed it from the list + */ + event_hdl_drop(del_sub); + } +} + +static inline void _event_hdl_unsubscribe_async(struct event_hdl_sub *del_sub) +{ + if (!MT_LIST_DELETE(&del_sub->mt_list)) + return; /* already removed (but may be pending in e_queues) */ + _event_hdl_unsubscribe(del_sub); +} + +/* sub_mgmt function pointers (for handlers) */ +static struct event_hdl_sub_type event_hdl_getsub_sync(const struct event_hdl_sub_mgmt *mgmt) +{ + if (!mgmt) + return EVENT_HDL_SUB_NONE; + + if (!mgmt->this) + return EVENT_HDL_SUB_NONE; /* already removed from sync ctx */ + return _event_hdl_getsub(mgmt->this); +} + +static struct event_hdl_sub_type event_hdl_getsub_async(const struct event_hdl_sub_mgmt *mgmt) +{ + if (!mgmt) + return EVENT_HDL_SUB_NONE; + + return _event_hdl_getsub_async(mgmt->this); +} + +static int event_hdl_resub_sync(const struct event_hdl_sub_mgmt *mgmt, struct event_hdl_sub_type type) +{ + if (!mgmt) + return 0; + + if (!mgmt->this) + return 0; /* already removed from sync ctx */ + return _event_hdl_resub(mgmt->this, type); +} + +static int event_hdl_resub_async(const struct event_hdl_sub_mgmt *mgmt, struct event_hdl_sub_type type) +{ + if (!mgmt) + return 0; + + return _event_hdl_resub_async(mgmt->this, type); +} + +static void event_hdl_unsubscribe_sync(const struct event_hdl_sub_mgmt *mgmt) +{ + if (!mgmt) + return; + + if (!mgmt->this) + return; /* already removed from sync ctx */ + + /* assuming that publish sync code will notice that mgmt->this is NULL + * and will perform the list removal using MT_LIST_DELETE_SAFE and + * _event_hdl_unsubscribe() + * while still owning the lock + */ + ((struct event_hdl_sub_mgmt *)mgmt)->this = NULL; +} + +static void event_hdl_unsubscribe_async(const struct event_hdl_sub_mgmt *mgmt) +{ + if (!mgmt) + return; + + _event_hdl_unsubscribe_async(mgmt->this); +} + +#define EVENT_HDL_SUB_MGMT_ASYNC(_sub) (struct event_hdl_sub_mgmt){ .this = _sub, \ + .getsub = event_hdl_getsub_async, \ + .resub = event_hdl_resub_async, \ + .unsub = event_hdl_unsubscribe_async} +#define EVENT_HDL_SUB_MGMT_SYNC(_sub) (struct event_hdl_sub_mgmt){ .this = _sub, \ + .getsub = event_hdl_getsub_sync, \ + .resub = event_hdl_resub_sync, \ + .unsub = event_hdl_unsubscribe_sync} + +struct event_hdl_sub *event_hdl_subscribe_ptr(event_hdl_sub_list *sub_list, + struct event_hdl_sub_type e_type, struct event_hdl hdl) +{ + struct event_hdl_sub *new_sub = NULL; + struct mt_list *elt1, elt2; + struct event_hdl_async_task_default_ctx *task_ctx = NULL; + struct mt_list lock; + + if (!sub_list) + sub_list = &global_event_hdl_sub_list; /* fall back to global list */ + + /* hdl API consistency check */ + /*FIXME: do we need to ensure that if 
private is set, private_free should be set as well? */ + BUG_ON((!hdl.async && !hdl.sync_ptr) || + (hdl.async == EVENT_HDL_ASYNC_MODE_NORMAL && !hdl.async_ptr) || + (hdl.async == EVENT_HDL_ASYNC_MODE_ADVANCED && + (!hdl.async_equeue || !hdl.async_task))); + + new_sub = pool_alloc(pool_head_sub); + if (new_sub == NULL) { + goto memory_error; + } + + /* assignments */ + new_sub->sub.family = e_type.family; + new_sub->sub.subtype = e_type.subtype; + new_sub->flags = 0; + new_sub->hdl = hdl; + + if (hdl.async) { + /* async END event pre-allocation */ + new_sub->async_end = pool_alloc(pool_head_sub_event); + if (!new_sub->async_end) { + /* memory error */ + goto memory_error; + } + if (hdl.async == EVENT_HDL_ASYNC_MODE_NORMAL) { + /* normal mode: no task provided, we must initialize it */ + + /* initialize task context */ + task_ctx = pool_alloc(pool_head_sub_taskctx); + + if (!task_ctx) { + /* memory error */ + goto memory_error; + } + event_hdl_async_equeue_init(&task_ctx->e_queue); + task_ctx->func = new_sub->hdl.async_ptr; + + new_sub->hdl.async_equeue = &task_ctx->e_queue; + new_sub->hdl.async_task = tasklet_new(); + + if (!new_sub->hdl.async_task) { + /* memory error */ + goto memory_error; + } + new_sub->hdl.async_task->context = task_ctx; + new_sub->hdl.async_task->process = event_hdl_async_task_default; + } + /* initialize END event (used to notify about subscription ending) + * used by both normal and advanced mode: + * - to safely terminate the task in normal mode + * - to safely free subscription and + * keep track of active subscriptions in advanced mode + */ + new_sub->async_end->type = EVENT_HDL_SUB_END; + new_sub->async_end->sub_mgmt = EVENT_HDL_SUB_MGMT_ASYNC(new_sub); + new_sub->async_end->private = new_sub->hdl.private; + new_sub->async_end->_data = NULL; + MT_LIST_INIT(&new_sub->async_end->mt_list); + } + /* set refcount to 2: + * 1 for handler (because handler can manage the subscription itself) + * 1 for caller (will be dropped automatically if caller use the non-ptr version) + */ + new_sub->refcount = 2; + + /* ready for registration */ + MT_LIST_INIT(&new_sub->mt_list); + + lock = MT_LIST_LOCK_ELT(&sub_list->known); + + /* check if such identified hdl is not already registered */ + if (hdl.id) { + struct event_hdl_sub *cur_sub; + uint8_t found = 0; + + mt_list_for_each_entry_safe(cur_sub, &sub_list->head, mt_list, elt1, elt2) { + if (hdl.id == cur_sub->hdl.id) { + /* we found matching registered hdl */ + found = 1; + break; + } + } + if (found) { + /* error already registered */ + MT_LIST_UNLOCK_ELT(&sub_list->known, lock); + event_hdl_report_hdl_state(ha_alert, &hdl, "SUB", "could not subscribe: subscription with this id already exists"); + goto cleanup; + } + } + + if (lock.next == &sub_list->known) { + /* this is an expected corner case on de-init path, a subscribe attempt + * was made but the subscription list is already destroyed, we pretend + * it is a memory/IO error since it should not be long before haproxy + * enters the deinit() function anyway + */ + MT_LIST_UNLOCK_ELT(&sub_list->known, lock); + goto cleanup; + } + + /* Append in list (global or user specified list). + * For now, append when sync mode, and insert when async mode + * so that async handlers are executed first + */ + if (hdl.async) { + /* Prevent the task from being aborted on soft-stop: let's wait + * until the END event is acknowledged by the task. 
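+ * The <jobs> counter increased below is what delays the soft-stop + * completion until then.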
+ * (decrease is performed in event_hdl_async_free_event()) + * + * If we don't do this, event_hdl API will leak and we won't give + * a chance to the event-handling task to perform cleanup + */ + HA_ATOMIC_INC(&jobs); + /* async mode, insert at the beginning of the list */ + MT_LIST_INSERT(&sub_list->head, &new_sub->mt_list); + } else { + /* sync mode, append at the end of the list */ + MT_LIST_APPEND(&sub_list->head, &new_sub->mt_list); + } + + MT_LIST_UNLOCK_ELT(&sub_list->known, lock); + + return new_sub; + + cleanup: + if (new_sub) { + if (hdl.async == EVENT_HDL_ASYNC_MODE_NORMAL) { + tasklet_free(new_sub->hdl.async_task); + pool_free(pool_head_sub_taskctx, task_ctx); + } + if (hdl.async) + pool_free(pool_head_sub_event, new_sub->async_end); + pool_free(pool_head_sub, new_sub); + } + + return NULL; + + memory_error: + event_hdl_report_hdl_state(ha_warning, &hdl, "SUB", "could not register subscription due to memory error"); + goto cleanup; +} + +void event_hdl_take(struct event_hdl_sub *sub) +{ + HA_ATOMIC_INC(&sub->refcount); +} + +void event_hdl_drop(struct event_hdl_sub *sub) +{ + if (HA_ATOMIC_SUB_FETCH(&sub->refcount, 1) != 0) + return; + + /* we were the last one holding a reference to event sub - free required */ + if (sub->hdl.private_free) { + /* free private data if specified upon registration */ + sub->hdl.private_free(sub->hdl.private); + } + pool_free(pool_head_sub, sub); +} + +int event_hdl_resubscribe(struct event_hdl_sub *cur_sub, struct event_hdl_sub_type type) +{ + return _event_hdl_resub_async(cur_sub, type); +} + +void _event_hdl_pause(struct event_hdl_sub *cur_sub) +{ + cur_sub->flags |= EHDL_SUB_F_PAUSED; +} + +void event_hdl_pause(struct event_hdl_sub *cur_sub) +{ + struct mt_list lock; + + lock = MT_LIST_LOCK_ELT(&cur_sub->mt_list); + if (lock.next != &cur_sub->mt_list) + _event_hdl_pause(cur_sub); + // else already removed + MT_LIST_UNLOCK_ELT(&cur_sub->mt_list, lock); +} + +void _event_hdl_resume(struct event_hdl_sub *cur_sub) +{ + cur_sub->flags &= ~EHDL_SUB_F_PAUSED; +} + +void event_hdl_resume(struct event_hdl_sub *cur_sub) +{ + struct mt_list lock; + + lock = MT_LIST_LOCK_ELT(&cur_sub->mt_list); + if (lock.next != &cur_sub->mt_list) + _event_hdl_resume(cur_sub); + // else already removed + MT_LIST_UNLOCK_ELT(&cur_sub->mt_list, lock); +} + +void event_hdl_unsubscribe(struct event_hdl_sub *del_sub) +{ + _event_hdl_unsubscribe_async(del_sub); + /* drop refcount, assuming caller no longer use ptr */ + event_hdl_drop(del_sub); +} + +int event_hdl_subscribe(event_hdl_sub_list *sub_list, struct event_hdl_sub_type e_type, struct event_hdl hdl) +{ + struct event_hdl_sub *sub; + + sub = event_hdl_subscribe_ptr(sub_list, e_type, hdl); + if (sub) { + /* drop refcount because the user is not willing to hold a reference */ + event_hdl_drop(sub); + return 1; + } + return 0; +} + +/* Subscription external lookup functions + */ +int event_hdl_lookup_unsubscribe(event_hdl_sub_list *sub_list, + uint64_t lookup_id) +{ + struct event_hdl_sub *del_sub = NULL; + struct mt_list *elt1, elt2; + int found = 0; + + if (!sub_list) + sub_list = &global_event_hdl_sub_list; /* fall back to global list */ + + mt_list_for_each_entry_safe(del_sub, &sub_list->head, mt_list, elt1, elt2) { + if (lookup_id == del_sub->hdl.id) { + /* we found matching registered hdl */ + MT_LIST_DELETE_SAFE(elt1); + _event_hdl_unsubscribe(del_sub); + found = 1; + break; /* id is unique, stop searching */ + } + } + return found; +} + +int event_hdl_lookup_resubscribe(event_hdl_sub_list *sub_list, + uint64_t 
lookup_id, struct event_hdl_sub_type type) +{ + struct event_hdl_sub *cur_sub = NULL; + struct mt_list *elt1, elt2; + int status = 0; + + if (!sub_list) + sub_list = &global_event_hdl_sub_list; /* fall back to global list */ + + mt_list_for_each_entry_safe(cur_sub, &sub_list->head, mt_list, elt1, elt2) { + if (lookup_id == cur_sub->hdl.id) { + /* we found matching registered hdl */ + status = _event_hdl_resub(cur_sub, type); + break; /* id is unique, stop searching */ + } + } + return status; +} + +int event_hdl_lookup_pause(event_hdl_sub_list *sub_list, + uint64_t lookup_id) +{ + struct event_hdl_sub *cur_sub = NULL; + struct mt_list *elt1, elt2; + int found = 0; + + if (!sub_list) + sub_list = &global_event_hdl_sub_list; /* fall back to global list */ + + mt_list_for_each_entry_safe(cur_sub, &sub_list->head, mt_list, elt1, elt2) { + if (lookup_id == cur_sub->hdl.id) { + /* we found matching registered hdl */ + _event_hdl_pause(cur_sub); + found = 1; + break; /* id is unique, stop searching */ + } + } + return found; +} + +int event_hdl_lookup_resume(event_hdl_sub_list *sub_list, + uint64_t lookup_id) +{ + struct event_hdl_sub *cur_sub = NULL; + struct mt_list *elt1, elt2; + int found = 0; + + if (!sub_list) + sub_list = &global_event_hdl_sub_list; /* fall back to global list */ + + mt_list_for_each_entry_safe(cur_sub, &sub_list->head, mt_list, elt1, elt2) { + if (lookup_id == cur_sub->hdl.id) { + /* we found matching registered hdl */ + _event_hdl_resume(cur_sub); + found = 1; + break; /* id is unique, stop searching */ + } + } + return found; +} + +struct event_hdl_sub *event_hdl_lookup_take(event_hdl_sub_list *sub_list, + uint64_t lookup_id) +{ + struct event_hdl_sub *cur_sub = NULL; + struct mt_list *elt1, elt2; + uint8_t found = 0; + + if (!sub_list) + sub_list = &global_event_hdl_sub_list; /* fall back to global list */ + + mt_list_for_each_entry_safe(cur_sub, &sub_list->head, mt_list, elt1, elt2) { + if (lookup_id == cur_sub->hdl.id) { + /* we found matching registered hdl */ + event_hdl_take(cur_sub); + found = 1; + break; /* id is unique, stop searching */ + } + } + if (found) + return cur_sub; + return NULL; +} + +/* event publishing functions + */ +static int _event_hdl_publish(event_hdl_sub_list *sub_list, struct event_hdl_sub_type e_type, + const struct event_hdl_cb_data *data) +{ + struct event_hdl_sub *cur_sub; + struct mt_list *elt1, elt2; + struct event_hdl_async_event_data *async_data = NULL; /* reuse async data for multiple async hdls */ + int error = 0; + + mt_list_for_each_entry_safe(cur_sub, &sub_list->head, mt_list, elt1, elt2) { + /* notify each function that has subscribed to sub_family.type, unless paused */ + if ((cur_sub->sub.family == e_type.family) && + ((cur_sub->sub.subtype & e_type.subtype) == e_type.subtype) && + !(cur_sub->flags & EHDL_SUB_F_PAUSED)) { + /* hdl should be notified */ + if (!cur_sub->hdl.async) { + /* sync mode: simply call cb pointer + * it is up to the callee to schedule a task if needed or + * take specific precautions in order to return as fast as possible + * and not use locks that are already held by the caller + */ + struct event_hdl_cb cb; + struct event_hdl_sub_mgmt sub_mgmt; + + sub_mgmt = EVENT_HDL_SUB_MGMT_SYNC(cur_sub); + cb.e_type = e_type; + if (data) + cb.e_data = data->_ptr; + else + cb.e_data = NULL; + cb.sub_mgmt = &sub_mgmt; + cb._sync = 1; + + /* call user function */ + cur_sub->hdl.sync_ptr(&cb, cur_sub->hdl.private); + + if (!sub_mgmt.this) { + /* user has performed hdl unsub + * we must remove it from the list + */ 
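+ /* (<elt1> is the cursor saved by mt_list_for_each_entry_safe(), + * which is what makes the removal safe while iterating) + */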
+ MT_LIST_DELETE_SAFE(elt1); + /* then free it */ + _event_hdl_unsubscribe(cur_sub); + } + } else { + /* async mode: here we need to prepare event data + * and push it to the event_queue of the task(s) + * responsible for consuming the events of current + * subscription. + * Once the event is pushed, we wake up the associated task. + * This feature depends on <haproxy/task> that also + * depends on <haproxy/pool>: + * If STG_PREPARE+STG_POOL is not performed prior to publishing to + * async handler, program may crash. + * Hopefully, STG_PREPARE+STG_POOL should be done early in + * HAProxy startup sequence. + */ + struct event_hdl_async_event *new_event; + + new_event = pool_alloc(pool_head_sub_event); + if (!new_event) { + error = 1; + break; /* stop on error */ + } + new_event->type = e_type; + new_event->private = cur_sub->hdl.private; + new_event->when = date; + new_event->sub_mgmt = EVENT_HDL_SUB_MGMT_ASYNC(cur_sub); + if (data) { + /* if this fails, please adjust EVENT_HDL_ASYNC_EVENT_DATA in + * event_hdl-t.h file or consider providing dynamic struct members + * to reduce overall struct size + */ + BUG_ON(data->_size > sizeof(async_data->data)); + if (!async_data) { + /* first async hdl reached - preparing async_data cache */ + async_data = pool_alloc(pool_head_sub_event_data); + if (!async_data) { + error = 1; + pool_free(pool_head_sub_event, new_event); + break; /* stop on error */ + } + + /* async data assignment */ + memcpy(async_data->data, data->_ptr, data->_size); + async_data->mfree = data->_mfree; + /* Initialize refcount, we start at 1 to prevent async + * data from being freed by an async handler while we + * still use it. We will drop the reference when the + * publish is over. + * + * (first use, atomic operation not required) + */ + async_data->refcount = 1; + } + new_event->_data = async_data; + new_event->data = async_data->data; + /* increment refcount because multiple hdls could + * use the same async_data + */ + HA_ATOMIC_INC(&async_data->refcount); + } else + new_event->data = NULL; + + /* appending new event to event hdl queue */ + MT_LIST_INIT(&new_event->mt_list); + HA_ATOMIC_INC(&cur_sub->hdl.async_equeue->size); + MT_LIST_APPEND(&cur_sub->hdl.async_equeue->head, &new_event->mt_list); + + /* wake up the task */ + event_hdl_task_wakeup(cur_sub->hdl.async_task); + } /* end async mode */ + } /* end hdl should be notified */ + } /* end mt_list */ + if (async_data) { + /* we finished publishing, drop the reference on async data */ + _event_hdl_async_data_drop(async_data); + } else { + /* no async subscribers, we are responsible for calling the data + * member freeing function if it was provided + */ + if (data && data->_mfree) + data->_mfree(data->_ptr); + } + if (error) { + event_hdl_report_hdl_state(ha_warning, &cur_sub->hdl, "PUBLISH", "memory error"); + return 0; + } + return 1; +} + +/* Publish function should not be used from high calling rate or time sensitive + * places for now, because list lookup based on e_type is not optimized at + * all! + * Returns 1 in case of SUCCESS: + * Subscribed handlers were notified successfully + * Returns 0 in case of FAILURE: + * FAILURE means memory error while handling the very first async handler from + * the subscription list. 
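+ * (sync handlers cannot fail this way: their callback is invoked + * directly, without any allocation)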
+ * As async handlers are executed first within the list, when such failure occurs + * you can safely assume that no events were published for the current call + */ +int event_hdl_publish(event_hdl_sub_list *sub_list, + struct event_hdl_sub_type e_type, const struct event_hdl_cb_data *data) +{ + if (!e_type.family) { + /* do nothing, these types are reserved for internal use only + * (ie: unregistering) */ + return 0; + } + if (sub_list) { + /* if sublist is provided, first publish event to list subscribers */ + return _event_hdl_publish(sub_list, e_type, data); + } else { + /* publish to global list */ + return _event_hdl_publish(&global_event_hdl_sub_list, e_type, data); + } +} + +void event_hdl_sub_list_init(event_hdl_sub_list *sub_list) +{ + BUG_ON(!sub_list); /* unexpected, global sublist is managed internally */ + MT_LIST_INIT(&sub_list->head); + MT_LIST_APPEND(&known_event_hdl_sub_list, &sub_list->known); +} + +/* internal function, assumes that sub_list ptr is always valid */ +static void _event_hdl_sub_list_destroy(event_hdl_sub_list *sub_list) +{ + struct event_hdl_sub *cur_sub; + struct mt_list *elt1, elt2; + + mt_list_for_each_entry_safe(cur_sub, &sub_list->head, mt_list, elt1, elt2) { + /* remove cur elem from list */ + MT_LIST_DELETE_SAFE(elt1); + /* then free it */ + _event_hdl_unsubscribe(cur_sub); + } +} + +/* when a subscription list is no longer used, call this + * to do the cleanup and make sure all related subscriptions are + * safely ended according to their types + */ +void event_hdl_sub_list_destroy(event_hdl_sub_list *sub_list) +{ + BUG_ON(!sub_list); /* unexpected, global sublist is managed internally */ + if (!MT_LIST_DELETE(&sub_list->known)) + return; /* already destroyed */ + _event_hdl_sub_list_destroy(sub_list); +} + +/* config parser for global "tune.events.max-events-at-once" */ +static int event_hdl_parse_max_events_at_once(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + int arg = -1; + + if (too_many_args(1, args, err, NULL)) + return -1; + + if (*(args[1]) != 0) + arg = atoi(args[1]); + + if (arg < 1 || arg > 10000) { + memprintf(err, "'%s' expects an integer argument between 1 and 10000.", args[0]); + return -1; + } + + event_hdl_tune.max_events_at_once = arg; + return 0; +} + +/* config keyword parsers */ +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_GLOBAL, "tune.events.max-events-at-once", event_hdl_parse_max_events_at_once }, + { 0, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); + +INITCALL0(STG_INIT, event_hdl_init); diff --git a/src/extcheck.c b/src/extcheck.c new file mode 100644 index 0000000..c667b16 --- /dev/null +++ b/src/extcheck.c @@ -0,0 +1,694 @@ +/* + * External health-checks functions. + * + * Copyright 2000-2009,2020 Willy Tarreau <w@1wt.eu> + * Copyright 2014 Horms Solutions Ltd, Simon Horman <horms@verge.net.au> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <sys/resource.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <assert.h> +#include <ctype.h> +#include <errno.h> +#include <signal.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +#include <haproxy/api.h> +#include <haproxy/cfgparse.h> +#include <haproxy/check.h> +#include <haproxy/errors.h> +#include <haproxy/global.h> +#include <haproxy/list.h> +#include <haproxy/proxy.h> +#include <haproxy/server.h> +#include <haproxy/signal.h> +#include <haproxy/stream-t.h> +#include <haproxy/task.h> +#include <haproxy/thread.h> +#include <haproxy/tools.h> + + +static struct list pid_list = LIST_HEAD_INIT(pid_list); +static struct pool_head *pool_head_pid_list __read_mostly; +__decl_spinlock(pid_list_lock); + +struct extcheck_env { + char *name; /* environment variable name */ + int vmaxlen; /* value maximum length, used to determine the required memory allocation */ +}; + +/* environment variables memory requirement for different types of data */ +#define EXTCHK_SIZE_EVAL_INIT 0 /* size determined during the init phase, + * such environment variables are not updatable. */ +#define EXTCHK_SIZE_ULONG 20 /* max string length for an unsigned long value */ +#define EXTCHK_SIZE_UINT 11 /* max string length for an unsigned int value */ +#define EXTCHK_SIZE_ADDR 256 /* max string length for an IPv4/IPv6/UNIX address */ + +/* external checks environment variables */ +enum { + EXTCHK_PATH = 0, + + /* Proxy specific environment variables */ + EXTCHK_HAPROXY_PROXY_NAME, /* the backend name */ + EXTCHK_HAPROXY_PROXY_ID, /* the backend id */ + EXTCHK_HAPROXY_PROXY_ADDR, /* the first bind address if available (or empty) */ + EXTCHK_HAPROXY_PROXY_PORT, /* the first bind port if available (or empty) */ + + /* Server specific environment variables */ + EXTCHK_HAPROXY_SERVER_NAME, /* the server name */ + EXTCHK_HAPROXY_SERVER_ID, /* the server id */ + EXTCHK_HAPROXY_SERVER_ADDR, /* the server address */ + EXTCHK_HAPROXY_SERVER_PORT, /* the server port if available (or empty) */ + EXTCHK_HAPROXY_SERVER_MAXCONN, /* the server max connections */ + EXTCHK_HAPROXY_SERVER_CURCONN, /* the current number of connections on the server */ + EXTCHK_HAPROXY_SERVER_SSL, /* "1" if the server supports SSL, otherwise zero */ + EXTCHK_HAPROXY_SERVER_PROTO, /* the server's configured proto, if any */ + + EXTCHK_SIZE +}; + +const struct extcheck_env extcheck_envs[EXTCHK_SIZE] = { + [EXTCHK_PATH] = { "PATH", EXTCHK_SIZE_EVAL_INIT }, + [EXTCHK_HAPROXY_PROXY_NAME] = { "HAPROXY_PROXY_NAME", EXTCHK_SIZE_EVAL_INIT }, + [EXTCHK_HAPROXY_PROXY_ID] = { "HAPROXY_PROXY_ID", EXTCHK_SIZE_EVAL_INIT }, + [EXTCHK_HAPROXY_PROXY_ADDR] = { "HAPROXY_PROXY_ADDR", EXTCHK_SIZE_EVAL_INIT }, + [EXTCHK_HAPROXY_PROXY_PORT] = { "HAPROXY_PROXY_PORT", EXTCHK_SIZE_EVAL_INIT }, + [EXTCHK_HAPROXY_SERVER_NAME] = { "HAPROXY_SERVER_NAME", EXTCHK_SIZE_EVAL_INIT }, + [EXTCHK_HAPROXY_SERVER_ID] = { "HAPROXY_SERVER_ID", EXTCHK_SIZE_EVAL_INIT }, + [EXTCHK_HAPROXY_SERVER_ADDR] = { "HAPROXY_SERVER_ADDR", EXTCHK_SIZE_ADDR }, + [EXTCHK_HAPROXY_SERVER_PORT] = { "HAPROXY_SERVER_PORT", EXTCHK_SIZE_UINT }, + [EXTCHK_HAPROXY_SERVER_MAXCONN] = { "HAPROXY_SERVER_MAXCONN", EXTCHK_SIZE_EVAL_INIT }, + [EXTCHK_HAPROXY_SERVER_CURCONN] = { "HAPROXY_SERVER_CURCONN", EXTCHK_SIZE_ULONG }, + [EXTCHK_HAPROXY_SERVER_SSL] = { "HAPROXY_SERVER_SSL", EXTCHK_SIZE_UINT }, + [EXTCHK_HAPROXY_SERVER_PROTO] = { "HAPROXY_SERVER_PROTO", EXTCHK_SIZE_EVAL_INIT }, +}; + +void 
block_sigchld(void) +{ + sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGCHLD); + assert(ha_sigmask(SIG_BLOCK, &set, NULL) == 0); +} + +void unblock_sigchld(void) +{ + sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGCHLD); + assert(ha_sigmask(SIG_UNBLOCK, &set, NULL) == 0); +} + +static struct pid_list *pid_list_add(pid_t pid, struct task *t) +{ + struct pid_list *elem; + struct check *check = t->context; + + elem = pool_alloc(pool_head_pid_list); + if (!elem) + return NULL; + elem->pid = pid; + elem->t = t; + elem->exited = 0; + check->curpid = elem; + LIST_INIT(&elem->list); + + HA_SPIN_LOCK(PID_LIST_LOCK, &pid_list_lock); + LIST_INSERT(&pid_list, &elem->list); + HA_SPIN_UNLOCK(PID_LIST_LOCK, &pid_list_lock); + + return elem; +} + +static void pid_list_del(struct pid_list *elem) +{ + struct check *check; + + if (!elem) + return; + + HA_SPIN_LOCK(PID_LIST_LOCK, &pid_list_lock); + LIST_DELETE(&elem->list); + HA_SPIN_UNLOCK(PID_LIST_LOCK, &pid_list_lock); + + if (!elem->exited) + kill(elem->pid, SIGTERM); + + check = elem->t->context; + check->curpid = NULL; + pool_free(pool_head_pid_list, elem); +} + +/* Called from inside SIGCHLD handler, SIGCHLD is blocked */ +static void pid_list_expire(pid_t pid, int status) +{ + struct pid_list *elem; + + HA_SPIN_LOCK(PID_LIST_LOCK, &pid_list_lock); + list_for_each_entry(elem, &pid_list, list) { + if (elem->pid == pid) { + elem->t->expire = now_ms; + elem->status = status; + elem->exited = 1; + task_wakeup(elem->t, TASK_WOKEN_IO); + break; + } + } + HA_SPIN_UNLOCK(PID_LIST_LOCK, &pid_list_lock); +} + +static void sigchld_handler(struct sig_handler *sh) +{ + pid_t pid; + int status; + + while ((pid = waitpid(0, &status, WNOHANG)) > 0) + pid_list_expire(pid, status); +} + +int init_pid_list(void) +{ + if (pool_head_pid_list != NULL) + /* Nothing to do */ + return 0; + + if (!signal_register_fct(SIGCHLD, sigchld_handler, SIGCHLD)) { + ha_alert("Failed to set signal handler for external health checks: %s. Aborting.\n", + strerror(errno)); + return 1; + } + + pool_head_pid_list = create_pool("pid_list", sizeof(struct pid_list), MEM_F_SHARED); + if (pool_head_pid_list == NULL) { + ha_alert("Failed to allocate memory pool for external health checks: %s. Aborting.\n", + strerror(errno)); + return 1; + } + + return 0; +} + +/* helper macro to set an environment variable and jump to a specific label on failure. */ +#define EXTCHK_SETENV(check, envidx, value, fail) { if (extchk_setenv(check, envidx, value)) goto fail; } + +/* + * helper function to allocate enough memory to store an environment variable. + * It will also check that the environment variable is updatable, and silently + * fail if not. + */ +static int extchk_setenv(struct check *check, int idx, const char *value) +{ + int len, ret; + char *envname; + int vmaxlen; + + if (idx < 0 || idx >= EXTCHK_SIZE) { + ha_alert("Illegal environment variable index %d. Aborting.\n", idx); + return 1; + } + + envname = extcheck_envs[idx].name; + vmaxlen = extcheck_envs[idx].vmaxlen; + + /* Check if the environment variable is already set, and silently reject + * the update if this one is not updatable. 
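+ * Variables sized EXTCHK_SIZE_EVAL_INIT take their length from the + * value computed at init time and are thus never updated afterwards.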
*/ + if ((vmaxlen == EXTCHK_SIZE_EVAL_INIT) && (check->envp[idx])) + return 0; + + /* Instead of sending NOT_USED, sending an empty value is preferable */ + if (strcmp(value, "NOT_USED") == 0) { + value = ""; + } + + len = strlen(envname) + 1; + if (vmaxlen == EXTCHK_SIZE_EVAL_INIT) + len += strlen(value); + else + len += vmaxlen; + + if (!check->envp[idx]) + check->envp[idx] = malloc(len + 1); + + if (!check->envp[idx]) { + ha_alert("Failed to allocate memory for the environment variable '%s'. Aborting.\n", envname); + return 1; + } + ret = snprintf(check->envp[idx], len + 1, "%s=%s", envname, value); + if (ret < 0) { + ha_alert("Failed to store the environment variable '%s'. Reason : %s. Aborting.\n", envname, strerror(errno)); + return 1; + } + else if (ret > len) { + ha_alert("Environment variable '%s' was truncated. Aborting.\n", envname); + return 1; + } + return 0; +} + +int prepare_external_check(struct check *check) +{ + struct server *s = check->server; + struct proxy *px = s->proxy; + struct listener *listener = NULL, *l; + int i; + const char *path = px->check_path ? px->check_path : DEF_CHECK_PATH; + char buf[256]; + const char *svmode = NULL; + + list_for_each_entry(l, &px->conf.listeners, by_fe) + /* Use the first INET, INET6 or UNIX listener */ + if (l->rx.addr.ss_family == AF_INET || + l->rx.addr.ss_family == AF_INET6 || + l->rx.addr.ss_family == AF_UNIX) { + listener = l; + break; + } + + check->curpid = NULL; + check->envp = calloc((EXTCHK_SIZE + 1), sizeof(*check->envp)); + if (!check->envp) { + ha_alert("Failed to allocate memory for environment variables. Aborting\n"); + goto err; + } + + check->argv = calloc(6, sizeof(*check->argv)); + if (!check->argv) { + ha_alert("Starting [%s:%s] check: out of memory.\n", px->id, s->id); + goto err; + } + + check->argv[0] = px->check_command; + + if (!listener) { + check->argv[1] = strdup("NOT_USED"); + check->argv[2] = strdup("NOT_USED"); + } + else if (listener->rx.addr.ss_family == AF_INET || + listener->rx.addr.ss_family == AF_INET6) { + addr_to_str(&listener->rx.addr, buf, sizeof(buf)); + check->argv[1] = strdup(buf); + port_to_str(&listener->rx.addr, buf, sizeof(buf)); + check->argv[2] = strdup(buf); + } + else if (listener->rx.addr.ss_family == AF_UNIX) { + const struct sockaddr_un *un; + + un = (struct sockaddr_un *)&listener->rx.addr; + check->argv[1] = strdup(un->sun_path); + check->argv[2] = strdup("NOT_USED"); + } + else { + ha_alert("Starting [%s:%s] check: unsupported address family.\n", px->id, s->id); + goto err; + } + + /* args 3 and 4 are the address, they're replaced on each check */ + check->argv[3] = calloc(EXTCHK_SIZE_ADDR, sizeof(*check->argv[3])); + check->argv[4] = calloc(EXTCHK_SIZE_UINT, sizeof(*check->argv[4])); + + for (i = 0; i < 5; i++) { + if (!check->argv[i]) { + ha_alert("Starting [%s:%s] check: out of memory.\n", px->id, s->id); + goto err; + } + } + + EXTCHK_SETENV(check, EXTCHK_PATH, path, err); + /* Add proxy environment variables */ + EXTCHK_SETENV(check, EXTCHK_HAPROXY_PROXY_NAME, px->id, err); + EXTCHK_SETENV(check, EXTCHK_HAPROXY_PROXY_ID, ultoa_r(px->uuid, buf, sizeof(buf)), err); + EXTCHK_SETENV(check, EXTCHK_HAPROXY_PROXY_ADDR, check->argv[1], err); + EXTCHK_SETENV(check, EXTCHK_HAPROXY_PROXY_PORT, check->argv[2], err); + /* Add server environment variables */ + EXTCHK_SETENV(check, EXTCHK_HAPROXY_SERVER_NAME, s->id, err); + EXTCHK_SETENV(check, EXTCHK_HAPROXY_SERVER_ID, ultoa_r(s->puid, buf, sizeof(buf)), err); + EXTCHK_SETENV(check, EXTCHK_HAPROXY_SERVER_ADDR, check->argv[3], err); + 
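/* argv[3]/argv[4] are refilled in connect_proc_chk() before each fork, so both variables track the server's current address and port */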
EXTCHK_SETENV(check, EXTCHK_HAPROXY_SERVER_PORT, check->argv[4], err); + EXTCHK_SETENV(check, EXTCHK_HAPROXY_SERVER_MAXCONN, ultoa_r(s->maxconn, buf, sizeof(buf)), err); + EXTCHK_SETENV(check, EXTCHK_HAPROXY_SERVER_CURCONN, ultoa_r(s->cur_sess, buf, sizeof(buf)), err); + EXTCHK_SETENV(check, EXTCHK_HAPROXY_SERVER_SSL, s->use_ssl ? "1" : "0", err); + + switch (px->mode) { + case PR_MODE_CLI: svmode = "cli"; break; + case PR_MODE_SYSLOG: svmode = "syslog"; break; + case PR_MODE_PEERS: svmode = "peers"; break; + case PR_MODE_HTTP: svmode = (s->mux_proto) ? s->mux_proto->token.ptr : "h1"; break; + case PR_MODE_TCP: svmode = "tcp"; break; + /* all valid cases must be enumerated above, below is to avoid a warning */ + case PR_MODES: svmode = "?"; break; + } + EXTCHK_SETENV(check, EXTCHK_HAPROXY_SERVER_PROTO, svmode, err); + + /* Ensure that we don't leave any hole in check->envp */ + for (i = 0; i < EXTCHK_SIZE; i++) + if (!check->envp[i]) + EXTCHK_SETENV(check, i, "", err); + + return 1; +err: + if (check->envp) { + for (i = 0; i < EXTCHK_SIZE; i++) + free(check->envp[i]); + ha_free(&check->envp); + } + + if (check->argv) { + for (i = 1; i < 5; i++) + free(check->argv[i]); + ha_free(&check->argv); + } + return 0; +} + +/* + * establish a server health-check that makes use of a process. + * + * It can return one of : + * - SF_ERR_NONE if everything's OK + * - SF_ERR_RESOURCE if a system resource is lacking (eg: fd limits, ports, ...) + * Additionally, in the case of SF_ERR_RESOURCE, an emergency log will be emitted. + * + * Blocks and then unblocks SIGCHLD + */ +static int connect_proc_chk(struct task *t) +{ + char buf[256]; + struct check *check = t->context; + struct server *s = check->server; + struct proxy *px = s->proxy; + int status; + pid_t pid; + + status = SF_ERR_RESOURCE; + + block_sigchld(); + + pid = fork(); + if (pid < 0) { + ha_alert("Failed to fork process for external health check%s: %s. Aborting.\n", + (global.tune.options & GTUNE_INSECURE_FORK) ? + "" : " (likely caused by missing 'insecure-fork-wanted')", + strerror(errno)); + set_server_check_status(check, HCHK_STATUS_SOCKERR, strerror(errno)); + goto out; + } + if (pid == 0) { + /* Child */ + extern char **environ; + struct rlimit limit; + int fd; + + /* close all FDs. Keep stdin/stdout/stderr in verbose mode */ + fd = (global.mode & (MODE_QUIET|MODE_VERBOSE)) == MODE_QUIET ? 
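/* from 0: close all fds; from 3: keep stdin/stdout/stderr */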
0 : 3; + + my_closefrom(fd); + + /* restore the initial FD limits */ + limit.rlim_cur = rlim_fd_cur_at_boot; + limit.rlim_max = rlim_fd_max_at_boot; + if (raise_rlim_nofile(NULL, &limit) != 0) { + getrlimit(RLIMIT_NOFILE, &limit); + ha_warning("External check: failed to restore initial FD limits (cur=%u max=%u), using cur=%u max=%u\n", + rlim_fd_cur_at_boot, rlim_fd_max_at_boot, + (unsigned int)limit.rlim_cur, (unsigned int)limit.rlim_max); + } + + if (global.external_check < 2) { + /* fresh new env for each check */ + environ = check->envp; + } + + /* Update some environment variables and command args: curconn, server addr and server port */ + EXTCHK_SETENV(check, EXTCHK_HAPROXY_SERVER_CURCONN, ultoa_r(s->cur_sess, buf, sizeof(buf)), fail); + + if (s->addr.ss_family == AF_UNIX) { + const struct sockaddr_un *un = (struct sockaddr_un *)&s->addr; + strlcpy2(check->argv[3], un->sun_path, EXTCHK_SIZE_ADDR); + memcpy(check->argv[4], "NOT_USED", 9); + } else { + addr_to_str(&s->addr, check->argv[3], EXTCHK_SIZE_ADDR); + *check->argv[4] = 0; // just in case the address family changed + if (s->addr.ss_family == AF_INET || s->addr.ss_family == AF_INET6) + snprintf(check->argv[4], EXTCHK_SIZE_UINT, "%u", s->svc_port); + } + + EXTCHK_SETENV(check, EXTCHK_HAPROXY_SERVER_ADDR, check->argv[3], fail); + EXTCHK_SETENV(check, EXTCHK_HAPROXY_SERVER_PORT, check->argv[4], fail); + + if (global.external_check >= 2) { + /* environment is preserved, let's merge new vars */ + int i; + + for (i = 0; check->envp[i] && *check->envp[i]; i++) { + char *delim = strchr(check->envp[i], '='); + if (!delim) + continue; + *(delim++) = 0; + if (setenv(check->envp[i], delim, 1) != 0) + goto fail; + } + } + haproxy_unblock_signals(); + execvp(px->check_command, check->argv); + ha_alert("Failed to exec process for external health check: %s. Aborting.\n", + strerror(errno)); + fail: + exit(-1); + } + + /* Parent */ + if (check->result == CHK_RES_UNKNOWN) { + if (pid_list_add(pid, t) != NULL) { + t->expire = tick_add(now_ms, MS_TO_TICKS(check->inter)); + + if (px->timeout.check && px->timeout.connect) { + int t_con = tick_add(now_ms, px->timeout.connect); + t->expire = tick_first(t->expire, t_con); + } + status = SF_ERR_NONE; + goto out; + } + else { + set_server_check_status(check, HCHK_STATUS_SOCKERR, strerror(errno)); + } + kill(pid, SIGTERM); /* process creation error */ + } + else + set_server_check_status(check, HCHK_STATUS_SOCKERR, strerror(errno)); + +out: + unblock_sigchld(); + return status; +} + +/* + * manages a server health-check that uses an external process. Returns + * the time the task accepts to wait, or TIME_ETERNITY for infinity. + * + * Please do NOT place any return statement in this function and only leave + * via the out_unlock label. + */ +struct task *process_chk_proc(struct task *t, void *context, unsigned int state) +{ + struct check *check = context; + struct server *s = check->server; + int rv; + int ret; + int expired = tick_is_expired(t->expire, now_ms); + + HA_SPIN_LOCK(SERVER_LOCK, &check->server->lock); + if (!(check->state & CHK_ST_INPROGRESS)) { + /* no check currently running */ + if (!expired) /* woke up too early */ + goto out_unlock; + + /* we don't send any health-checks when the proxy is + * stopped, the server should not be checked or the check + * is disabled. 
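+ * In all those cases the task is simply rescheduled for later.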
+ */ + if (((check->state & (CHK_ST_ENABLED | CHK_ST_PAUSED)) != CHK_ST_ENABLED) || + (s->proxy->flags & (PR_FL_DISABLED|PR_FL_STOPPED))) + goto reschedule; + + /* we'll initiate a new check */ + set_server_check_status(check, HCHK_STATUS_START, NULL); + + check->state |= CHK_ST_INPROGRESS; + + ret = connect_proc_chk(t); + if (ret == SF_ERR_NONE) { + /* the process was forked, we allow up to min(inter, + * timeout.connect) for it to report its status, but + * only when timeout.check is set as it may be to short + * for a full check otherwise. + */ + t->expire = tick_add(now_ms, MS_TO_TICKS(check->inter)); + + if (s->proxy->timeout.check && s->proxy->timeout.connect) { + int t_con = tick_add(now_ms, s->proxy->timeout.connect); + t->expire = tick_first(t->expire, t_con); + } + task_set_thread(t, tid); + goto reschedule; + } + + /* here, we failed to start the check */ + + check->state &= ~CHK_ST_INPROGRESS; + check_notify_failure(check); + + /* we allow up to min(inter, timeout.connect) for a connection + * to establish but only when timeout.check is set + * as it may be to short for a full check otherwise + */ + while (tick_is_expired(t->expire, now_ms)) { + int t_con; + + t_con = tick_add(t->expire, s->proxy->timeout.connect); + t->expire = tick_add(t->expire, MS_TO_TICKS(check->inter)); + + if (s->proxy->timeout.check) + t->expire = tick_first(t->expire, t_con); + } + } + else { + /* there was a test running. + * First, let's check whether there was an uncaught error, + * which can happen on connect timeout or error. + */ + if (check->result == CHK_RES_UNKNOWN) { + /* good connection is enough for pure TCP check */ + struct pid_list *elem = check->curpid; + int status = HCHK_STATUS_UNKNOWN; + + if (elem->exited) { + status = elem->status; /* Save in case the process exits between use below */ + if (!WIFEXITED(status)) + check->code = -1; + else + check->code = WEXITSTATUS(status); + if (!WIFEXITED(status) || WEXITSTATUS(status)) + status = HCHK_STATUS_PROCERR; + else + status = HCHK_STATUS_PROCOK; + } else if (expired) { + status = HCHK_STATUS_PROCTOUT; + ha_warning("kill %d\n", (int)elem->pid); + kill(elem->pid, SIGTERM); + } + set_server_check_status(check, status, NULL); + } + + if (check->result == CHK_RES_FAILED) { + /* a failure or timeout detected */ + check_notify_failure(check); + } + else if (check->result == CHK_RES_CONDPASS) { + /* check is OK but asks for stopping mode */ + check_notify_stopping(check); + } + else if (check->result == CHK_RES_PASSED) { + /* a success was detected */ + check_notify_success(check); + } + task_set_thread(t, 0); + check->state &= ~CHK_ST_INPROGRESS; + + pid_list_del(check->curpid); + + rv = 0; + if (global.spread_checks > 0) { + rv = srv_getinter(check) * global.spread_checks / 100; + rv -= (int) (2 * rv * (statistical_prng() / 4294967295.0)); + } + t->expire = tick_add(now_ms, MS_TO_TICKS(srv_getinter(check) + rv)); + } + + reschedule: + while (tick_is_expired(t->expire, now_ms)) + t->expire = tick_add(t->expire, MS_TO_TICKS(check->inter)); + + out_unlock: + HA_SPIN_UNLOCK(SERVER_LOCK, &check->server->lock); + return t; +} + +/* Parses the "external-check" proxy keyword */ +int proxy_parse_extcheck(char **args, int section, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **errmsg) +{ + int cur_arg, ret = 0; + + cur_arg = 1; + if (!*(args[cur_arg])) { + memprintf(errmsg, "missing argument after '%s'.\n", args[0]); + goto error; + } + + if (strcmp(args[cur_arg], "command") == 0) { + if (too_many_args(2, args, 
errmsg, NULL)) + goto error; + if (!*(args[cur_arg+1])) { + memprintf(errmsg, "missing argument after '%s'.", args[cur_arg]); + goto error; + } + free(curpx->check_command); + curpx->check_command = strdup(args[cur_arg+1]); + } + else if (strcmp(args[cur_arg], "path") == 0) { + if (too_many_args(2, args, errmsg, NULL)) + goto error; + if (!*(args[cur_arg+1])) { + memprintf(errmsg, "missing argument after '%s'.", args[cur_arg]); + goto error; + } + free(curpx->check_path); + curpx->check_path = strdup(args[cur_arg+1]); + } + else { + memprintf(errmsg, "'%s' only supports 'command' and 'path'. but got '%s'.", + args[0], args[1]); + goto error; + } + + ret = (*errmsg != NULL); /* Handle warning */ + return ret; + +error: + return -1; +} + +int proxy_parse_external_check_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line) +{ + int err_code = 0; + + curpx->options2 &= ~PR_O2_CHK_ANY; + curpx->options2 |= PR_O2_EXT_CHK; + if (alertif_too_many_args_idx(0, 1, file, line, args, &err_code)) + goto out; + + out: + return err_code; +} + +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_LISTEN, "external-check", proxy_parse_extcheck }, + { 0, NULL, NULL }, +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); diff --git a/src/fcgi-app.c b/src/fcgi-app.c new file mode 100644 index 0000000..00562f8 --- /dev/null +++ b/src/fcgi-app.c @@ -0,0 +1,1133 @@ +/* + * Functions about FCGI applications and filters. + * + * Copyright (C) 2019 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <haproxy/acl.h> +#include <haproxy/api.h> +#include <haproxy/cfgparse.h> +#include <haproxy/chunk.h> +#include <haproxy/errors.h> +#include <haproxy/fcgi-app.h> +#include <haproxy/filters.h> +#include <haproxy/http_fetch.h> +#include <haproxy/http_htx.h> +#include <haproxy/log.h> +#include <haproxy/proxy.h> +#include <haproxy/regex.h> +#include <haproxy/sample.h> +#include <haproxy/server-t.h> +#include <haproxy/session.h> +#include <haproxy/sink.h> +#include <haproxy/tools.h> + + +/* Global list of all FCGI applications */ +static struct fcgi_app *fcgi_apps = NULL; + +struct flt_ops fcgi_flt_ops; +const char *fcgi_flt_id = "FCGI filter"; + +DECLARE_STATIC_POOL(pool_head_fcgi_flt_ctx, "fcgi_flt_ctx", sizeof(struct fcgi_flt_ctx)); +DECLARE_STATIC_POOL(pool_head_fcgi_param_rule, "fcgi_param_rule", sizeof(struct fcgi_param_rule)); +DECLARE_STATIC_POOL(pool_head_fcgi_hdr_rule, "fcgi_hdr_rule", sizeof(struct fcgi_hdr_rule)); + +/**************************************************************************/ +/***************************** Uitls **************************************/ +/**************************************************************************/ +/* Makes a fcgi parameter name (prefixed by ':fcgi-') with <name> (in + * lowercase). All non alphanumeric character are replaced by an underscore + * ('_'). The result is copied into <dst>. the corresponding ist is returned. 
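+ * For example, <name> "X-Forwarded-For" yields ":fcgi-x_forwarded_for".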
+ */ +static struct ist fcgi_param_name(char *dst, const struct ist name) +{ + size_t ofs1, ofs2; + + memcpy(dst, ":fcgi-", 6); + ofs1 = 6; + for (ofs2 = 0; ofs2 < name.len; ofs2++) { + if (isalnum((unsigned char)name.ptr[ofs2])) + dst[ofs1++] = ist_lc[(unsigned char)name.ptr[ofs2]]; + else + dst[ofs1++] = '_'; + } + return ist2(dst, ofs1); +} + +/* Returns a pointer to the FCGi application matching the name <name>. NULL is + * returned if no match found. + */ +struct fcgi_app *fcgi_app_find_by_name(const char *name) +{ + struct fcgi_app *app; + + for (app = fcgi_apps; app != NULL; app = app->next) { + if (strcmp(app->name, name) == 0) + return app; + } + + return NULL; +} + +struct fcgi_flt_conf *find_px_fcgi_conf(struct proxy *px) +{ + struct flt_conf *fconf; + + list_for_each_entry(fconf, &px->filter_configs, list) { + if (fconf->id == fcgi_flt_id) + return fconf->conf; + } + return NULL; +} + +struct fcgi_flt_ctx *find_strm_fcgi_ctx(struct stream *s) +{ + struct filter *filter; + + if (!s) + return NULL; + + list_for_each_entry(filter, &strm_flt(s)->filters, list) { + if (FLT_ID(filter) == fcgi_flt_id) + return FLT_CONF(filter); + } + return NULL; +} + +struct fcgi_app *get_px_fcgi_app(struct proxy *px) +{ + struct fcgi_flt_conf *fcgi_conf = find_px_fcgi_conf(px); + + if (fcgi_conf) + return fcgi_conf->app; + return NULL; +} + +struct fcgi_app *get_strm_fcgi_app(struct stream *s) +{ + struct fcgi_flt_ctx *fcgi_ctx = find_strm_fcgi_ctx(s); + + if (fcgi_ctx) + return fcgi_ctx->app; + return NULL; +} + +static void fcgi_release_rule_conf(struct fcgi_rule_conf *rule) +{ + if (!rule) + return; + free(rule->name); + free(rule->value); + free_acl_cond(rule->cond); + free(rule); +} + +static void fcgi_release_rule(struct fcgi_rule *rule) +{ + if (!rule) + return; + + if (!LIST_ISEMPTY(&rule->value)) { + struct logformat_node *lf, *lfb; + + list_for_each_entry_safe(lf, lfb, &rule->value, list) { + LIST_DELETE(&lf->list); + release_sample_expr(lf->expr); + free(lf->arg); + free(lf); + } + } + /* ->cond and ->name are not owned by the rule */ + free(rule); +} + +/**************************************************************************/ +/*********************** FCGI Sample fetches ******************************/ +/**************************************************************************/ + +static int smp_fetch_fcgi_docroot(const struct arg *args, struct sample *smp, + const char *kw, void *private) +{ + struct fcgi_app *app = get_strm_fcgi_app(smp->strm); + + if (!app) + return 0; + + smp->data.type = SMP_T_STR; + smp->data.u.str.area = app->docroot.ptr; + smp->data.u.str.data = app->docroot.len; + smp->flags = SMP_F_CONST; + return 1; +} + +static int smp_fetch_fcgi_index(const struct arg *args, struct sample *smp, + const char *kw, void *private) +{ + struct fcgi_app *app = get_strm_fcgi_app(smp->strm); + + if (!app || !istlen(app->index)) + return 0; + + smp->data.type = SMP_T_STR; + smp->data.u.str.area = app->index.ptr; + smp->data.u.str.data = app->index.len; + smp->flags = SMP_F_CONST; + return 1; +} + +/**************************************************************************/ +/************************** FCGI filter ***********************************/ +/**************************************************************************/ +static int fcgi_flt_init(struct proxy *px, struct flt_conf *fconf) +{ + fconf->flags |= FLT_CFG_FL_HTX; + return 0; +} + +static void fcgi_flt_deinit(struct proxy *px, struct flt_conf *fconf) +{ + struct fcgi_flt_conf *fcgi_conf = fconf->conf; + struct 
fcgi_rule *rule, *back; + + if (!fcgi_conf) + return; + + free(fcgi_conf->name); + + list_for_each_entry_safe(rule, back, &fcgi_conf->param_rules, list) { + LIST_DELETE(&rule->list); + fcgi_release_rule(rule); + } + + list_for_each_entry_safe(rule, back, &fcgi_conf->hdr_rules, list) { + LIST_DELETE(&rule->list); + fcgi_release_rule(rule); + } + + free(fcgi_conf); +} + +static int fcgi_flt_check(struct proxy *px, struct flt_conf *fconf) +{ + struct fcgi_flt_conf *fcgi_conf = fconf->conf; + struct fcgi_rule_conf *crule, *back; + struct fcgi_rule *rule = NULL; + struct flt_conf *f; + char *errmsg = NULL; + + fcgi_conf->app = fcgi_app_find_by_name(fcgi_conf->name); + if (!fcgi_conf->app) { + ha_alert("proxy '%s' : fcgi-app '%s' not found.\n", + px->id, fcgi_conf->name); + goto err; + } + + list_for_each_entry(f, &px->filter_configs, list) { + if (f->id == http_comp_flt_id || f->id == cache_store_flt_id) + continue; + else if ((f->id == fconf->id) && f->conf != fcgi_conf) { + ha_alert("proxy '%s' : only one fcgi-app supported per backend.\n", + px->id); + goto err; + } + else if (f->id != fconf->id) { + /* Implicit declaration is only allowed with the + * compression and cache. For other filters, an explicit + * declaration is required. */ + ha_alert("config: proxy '%s': require an explicit filter declaration " + "to use the fcgi-app '%s'.\n", px->id, fcgi_conf->name); + goto err; + } + } + + list_for_each_entry_safe(crule, back, &fcgi_conf->app->conf.rules, list) { + rule = calloc(1, sizeof(*rule)); + if (!rule) { + ha_alert("proxy '%s' : out of memory.\n", px->id); + goto err; + } + rule->type = crule->type; + rule->name = ist(crule->name); + rule->cond = crule->cond; + LIST_INIT(&rule->value); + + if (crule->value) { + if (!parse_logformat_string(crule->value, px, &rule->value, LOG_OPT_HTTP, + SMP_VAL_BE_HRQ_HDR, &errmsg)) { + ha_alert("proxy '%s' : %s.\n", px->id, errmsg); + goto err; + } + } + + if (rule->type == FCGI_RULE_SET_PARAM || rule->type == FCGI_RULE_UNSET_PARAM) + LIST_APPEND(&fcgi_conf->param_rules, &rule->list); + else /* FCGI_RULE_PASS_HDR/FCGI_RULE_HIDE_HDR */ + LIST_APPEND(&fcgi_conf->hdr_rules, &rule->list); + } + return 0; + + err: + free(errmsg); + free(rule); + return 1; +} + +static int fcgi_flt_start(struct stream *s, struct filter *filter) +{ + struct fcgi_flt_conf *fcgi_conf = FLT_CONF(filter); + struct fcgi_flt_ctx *fcgi_ctx; + + fcgi_ctx = pool_alloc(pool_head_fcgi_flt_ctx); + if (fcgi_ctx == NULL) { + // FIXME: send a warning + return 0; + } + fcgi_ctx->filter = filter; + fcgi_ctx->app = fcgi_conf->app; + filter->ctx = fcgi_ctx; + + s->req.analysers |= AN_REQ_HTTP_BODY; + return 1; +} + +static void fcgi_flt_stop(struct stream *s, struct filter *filter) +{ + struct fcgi_flt_ctx *fcgi_ctx = filter->ctx; + + if (!fcgi_ctx) + return; + pool_free(pool_head_fcgi_flt_ctx, fcgi_ctx); + filter->ctx = NULL; +} + +static int fcgi_flt_http_headers(struct stream *s, struct filter *filter, struct http_msg *msg) +{ + struct session *sess = strm_sess(s); + struct buffer *value; + struct fcgi_flt_conf *fcgi_conf = FLT_CONF(filter); + struct fcgi_rule *rule; + struct fcgi_param_rule *param_rule; + struct fcgi_hdr_rule *hdr_rule; + struct ebpt_node *node, *next; + struct eb_root param_rules = EB_ROOT; + struct eb_root hdr_rules = EB_ROOT; + struct htx *htx; + struct http_hdr_ctx ctx; + int ret; + + htx = htxbuf(&msg->chn->buf); + + if (msg->chn->flags & CF_ISRESP) { + struct htx_sl *sl; + + /* Remove the header "Status:" from the response */ + ctx.blk = NULL; + while
(http_find_header(htx, ist("status"), &ctx, 1)) + http_remove_header(htx, &ctx); + + /* Add the header "Date:" if not found */ + ctx.blk = NULL; + if (!http_find_header(htx, ist("date"), &ctx, 1)) { + struct tm tm; + + get_gmtime(date.tv_sec, &tm); + trash.data = strftime(trash.area, trash.size, "%a, %d %b %Y %T %Z", &tm); + if (trash.data) + http_add_header(htx, ist("date"), ist2(trash.area, trash.data)); + } + + /* Add the header "Content-Length:" if possible */ + sl = http_get_stline(htx); + if (s->txn->meth != HTTP_METH_HEAD && sl && + (msg->flags & (HTTP_MSGF_XFER_LEN|HTTP_MSGF_CNT_LEN|HTTP_MSGF_TE_CHNK)) == HTTP_MSGF_XFER_LEN && + (htx->flags & HTX_FL_EOM)) { + struct htx_blk * blk; + char *end; + size_t len = 0; + + for (blk = htx_get_first_blk(htx); blk; blk = htx_get_next_blk(htx, blk)) { + enum htx_blk_type type = htx_get_blk_type(blk); + + if (type == HTX_BLK_TLR || type == HTX_BLK_EOT) + break; + if (type == HTX_BLK_DATA) + len += htx_get_blksz(blk); + } + end = ultoa_o(len, trash.area, trash.size); + if (http_add_header(htx, ist("content-length"), ist2(trash.area, end-trash.area))) { + sl->flags |= HTX_SL_F_CLEN; + msg->flags |= HTTP_MSGF_CNT_LEN; + } + } + + return 1; + } + + /* Analyze the request's headers */ + + value = alloc_trash_chunk(); + if (!value) + goto end; + + list_for_each_entry(rule, &fcgi_conf->param_rules, list) { + if (rule->cond) { + ret = acl_exec_cond(rule->cond, s->be, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL); + ret = acl_pass(ret); + if (rule->cond->pol == ACL_COND_UNLESS) + ret = !ret; + + /* the rule does not match */ + if (!ret) + continue; + } + + param_rule = NULL; + node = ebis_lookup_len(¶m_rules, rule->name.ptr, rule->name.len); + if (node) { + param_rule = container_of(node, struct fcgi_param_rule, node); + ebpt_delete(node); + } + else { + param_rule = pool_alloc(pool_head_fcgi_param_rule); + if (param_rule == NULL) + goto param_rule_err; + } + + param_rule->node.key = rule->name.ptr; + param_rule->name = rule->name; + param_rule->value = &rule->value; + ebis_insert(¶m_rules, ¶m_rule->node); + } + + list_for_each_entry(rule, &fcgi_conf->hdr_rules, list) { + if (rule->cond) { + ret = acl_exec_cond(rule->cond, s->be, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL); + ret = acl_pass(ret); + if (rule->cond->pol == ACL_COND_UNLESS) + ret = !ret; + + /* the rule does not match */ + if (!ret) + continue; + } + + hdr_rule = NULL; + node = ebis_lookup_len(&hdr_rules, rule->name.ptr, rule->name.len); + if (node) { + hdr_rule = container_of(node, struct fcgi_hdr_rule, node); + ebpt_delete(node); + } + else { + hdr_rule = pool_alloc(pool_head_fcgi_hdr_rule); + if (hdr_rule == NULL) + goto hdr_rule_err; + } + + hdr_rule->node.key = rule->name.ptr; + hdr_rule->name = rule->name; + hdr_rule->pass = (rule->type == FCGI_RULE_PASS_HDR); + ebis_insert(&hdr_rules, &hdr_rule->node); + } + + node = ebpt_first(¶m_rules); + while (node) { + next = ebpt_next(node); + ebpt_delete(node); + param_rule = container_of(node, struct fcgi_param_rule, node); + node = next; + + b_reset(value); + value->data = build_logline(s, value->area, value->size, param_rule->value); + if (!value->data) { + pool_free(pool_head_fcgi_param_rule, param_rule); + continue; + } + if (!http_add_header(htx, param_rule->name, ist2(value->area, value->data))) + goto rewrite_err; + pool_free(pool_head_fcgi_param_rule, param_rule); + } + + node = ebpt_first(&hdr_rules); + while (node) { + next = ebpt_next(node); + ebpt_delete(node); + hdr_rule = container_of(node, struct fcgi_hdr_rule, node); + node = next; + + 
if (!hdr_rule->pass) { + ctx.blk = NULL; + while (http_find_header(htx, hdr_rule->name, &ctx, 1)) + http_remove_header(htx, &ctx); + } + pool_free(pool_head_fcgi_hdr_rule, hdr_rule); + } + + goto end; + + rewrite_err: + _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_rewrites); + _HA_ATOMIC_INC(&s->be->be_counters.failed_rewrites); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->failed_rewrites); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.failed_rewrites); + hdr_rule_err: + node = ebpt_first(&hdr_rules); + while (node) { + next = ebpt_next(node); + ebpt_delete(node); + hdr_rule = container_of(node, struct fcgi_hdr_rule, node); + node = next; + pool_free(pool_head_fcgi_hdr_rule, hdr_rule); + } + param_rule_err: + node = ebpt_first(¶m_rules); + while (node) { + next = ebpt_next(node); + ebpt_delete(node); + param_rule = container_of(node, struct fcgi_param_rule, node); + node = next; + pool_free(pool_head_fcgi_param_rule, param_rule); + } + end: + free_trash_chunk(value); + return 1; +} + +struct flt_ops fcgi_flt_ops = { + .init = fcgi_flt_init, + .check = fcgi_flt_check, + .deinit = fcgi_flt_deinit, + + .attach = fcgi_flt_start, + .detach = fcgi_flt_stop, + + .http_headers = fcgi_flt_http_headers, +}; + +/**************************************************************************/ +/*********************** FCGI Config parsing ******************************/ +/**************************************************************************/ +static int +parse_fcgi_flt(char **args, int *cur_arg, struct proxy *px, + struct flt_conf *fconf, char **err, void *private) +{ + struct flt_conf *f, *back; + struct fcgi_flt_conf *fcgi_conf = NULL; + char *name = NULL; + int pos = *cur_arg; + + /* Get the fcgi-app name*/ + if (!*args[pos + 1]) { + memprintf(err, "%s : expects a <name> argument", args[pos]); + goto err; + } + name = strdup(args[pos + 1]); + if (!name) { + memprintf(err, "%s '%s' : out of memory", args[pos], args[pos + 1]); + goto err; + } + pos += 2; + + /* Check if an fcgi-app filter with the same name already exists */ + list_for_each_entry_safe(f, back, &px->filter_configs, list) { + if (f->id != fcgi_flt_id) + continue; + fcgi_conf = f->conf; + if (strcmp(name, fcgi_conf->name) != 0) { + fcgi_conf = NULL; + continue; + } + + /* Place the filter at its right position */ + LIST_DELETE(&f->list); + free(f); + ha_free(&name); + break; + } + + /* No other fcgi-app filter found, create configuration for the explicit one */ + if (!fcgi_conf) { + fcgi_conf = calloc(1, sizeof(*fcgi_conf)); + if (!fcgi_conf) { + memprintf(err, "%s: out of memory", args[*cur_arg]); + goto err; + } + fcgi_conf->name = name; + LIST_INIT(&fcgi_conf->param_rules); + LIST_INIT(&fcgi_conf->hdr_rules); + } + + fconf->id = fcgi_flt_id; + fconf->conf = fcgi_conf; + fconf->ops = &fcgi_flt_ops; + + *cur_arg = pos; + return 0; + err: + free(name); + return -1; +} + +/* Parses the "use-fcgi-app" proxy keyword */ +static int proxy_parse_use_fcgi_app(char **args, int section, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + struct flt_conf *fconf = NULL; + struct fcgi_flt_conf *fcgi_conf = NULL; + int retval = 0; + + if ((curpx->cap & PR_CAP_DEF) || !(curpx->cap & PR_CAP_BE)) { + memprintf(err, "'%s' only available in backend or listen section", args[0]); + retval = -1; + goto end; + } + + if (!*(args[1])) { + memprintf(err, "'%s' expects <name> as argument", args[0]); + retval = -1; + goto end; + } + + 
/* check if an FCGI filter was already registered with this name; + * if that's the case, we must use it. */ + list_for_each_entry(fconf, &curpx->filter_configs, list) { + if (fconf->id == fcgi_flt_id) { + fcgi_conf = fconf->conf; + if (fcgi_conf && strcmp((char *)fcgi_conf->name, args[1]) == 0) + goto end; + memprintf(err, "'%s' : only one fcgi-app supported per backend", args[0]); + retval = -1; + goto end; + } + } + + /* Create the FCGI filter config */ + fcgi_conf = calloc(1, sizeof(*fcgi_conf)); + if (!fcgi_conf) + goto err; + fcgi_conf->name = strdup(args[1]); + LIST_INIT(&fcgi_conf->param_rules); + LIST_INIT(&fcgi_conf->hdr_rules); + + /* Register the filter */ + fconf = calloc(1, sizeof(*fconf)); + if (!fconf) + goto err; + fconf->id = fcgi_flt_id; + fconf->conf = fcgi_conf; + fconf->ops = &fcgi_flt_ops; + LIST_APPEND(&curpx->filter_configs, &fconf->list); + + end: + return retval; + err: + if (fcgi_conf) { + free(fcgi_conf->name); + free(fcgi_conf); + } + memprintf(err, "out of memory"); + retval = -1; + goto end; +} + +/* Finishes the parsing of FCGI applications for proxies and servers */ +static int cfg_fcgi_apps_postparser() +{ + struct fcgi_app *curapp; + struct proxy *px; + struct server *srv; + int err_code = 0; + + for (px = proxies_list; px; px = px->next) { + struct fcgi_flt_conf *fcgi_conf = find_px_fcgi_conf(px); + int nb_fcgi_srv = 0; + + if (px->mode == PR_MODE_TCP && fcgi_conf) { + ha_alert("proxy '%s': FCGI application cannot be used in non-HTTP mode.\n", + px->id); + err_code |= ERR_ALERT | ERR_FATAL; + goto end; + } + + /* By default, for FCGI-ready backend, HTTP request header names + * are restricted and the "delete" policy is set + */ + if (fcgi_conf && !(px->options2 & PR_O2_RSTRICT_REQ_HDR_NAMES_MASK)) + px->options2 |= PR_O2_RSTRICT_REQ_HDR_NAMES_DEL; + + for (srv = px->srv; srv; srv = srv->next) { + if (srv->mux_proto && isteq(srv->mux_proto->token, ist("fcgi"))) { + nb_fcgi_srv++; + if (fcgi_conf) + continue; + ha_alert("proxy '%s': FCGI server '%s' has no FCGI app configured.\n", + px->id, srv->id); + err_code |= ERR_ALERT | ERR_FATAL; + goto end; + } + } + if (fcgi_conf && !nb_fcgi_srv) { + ha_alert("proxy '%s': FCGI app configured but no FCGI server found.\n", + px->id); + err_code |= ERR_ALERT | ERR_FATAL; + goto end; + } + } + + for (curapp = fcgi_apps; curapp != NULL; curapp = curapp->next) { + if (!istlen(curapp->docroot)) { + ha_alert("fcgi-app '%s': no docroot configured.\n", + curapp->name); + err_code |= ERR_ALERT | ERR_FATAL; + goto end; + } + if (!(curapp->flags & (FCGI_APP_FL_MPXS_CONNS|FCGI_APP_FL_GET_VALUES))) { + if (curapp->maxreqs > 1) { + ha_warning("fcgi-app '%s': multiplexing not supported, " + "ignoring the option 'max-reqs'.\n", + curapp->name); + err_code |= ERR_WARN; + } + curapp->maxreqs = 1; + } + + err_code |= postresolve_logger_list(&curapp->loggers, "fcgi-app", curapp->name); + } + + end: + return err_code; +} + +static int fcgi_app_add_rule(struct fcgi_app *curapp, enum fcgi_rule_type type, char *name, char *value, + struct acl_cond *cond, char **err) +{ + struct fcgi_rule_conf *rule; + + /* Allocate and fill a new rule */ + rule = calloc(1, sizeof(*rule)); + if (!rule) + goto err; + LIST_INIT(&rule->list); + rule->type = type; + if (type == FCGI_RULE_SET_PARAM || type == FCGI_RULE_UNSET_PARAM) { + struct ist fname = fcgi_param_name(trash.area, ist(name)); + rule->name = my_strndup(fname.ptr, fname.len); + } + else { /* FCGI_RULE_PASS_HDR/FCGI_RULE_HIDE_HDR */ + struct ist fname = ist2bin_lc(trash.area, ist(name)); + rule->name =
my_strndup(fname.ptr, fname.len); + } + if (!rule->name) + goto err; + + if (value) { + rule->value = strdup(value); + if (!rule->value) + goto err; + } + rule->cond = cond; + LIST_APPEND(&curapp->conf.rules, &rule->list); + return 1; + + err: + if (rule) { + free(rule->name); + free(rule->value); + free(rule); + } + free_acl_cond(cond); + memprintf(err, "out of memory"); + return 0; +} + +/* Parses "fcgi-app" section */ +static int cfg_parse_fcgi_app(const char *file, int linenum, char **args, int kwm) +{ + static struct fcgi_app *curapp = NULL; + struct acl_cond *cond = NULL; + char *name, *value = NULL; + enum fcgi_rule_type type; + int err_code = 0; + const char *err; + char *errmsg = NULL; + + if (strcmp(args[0], "fcgi-app") == 0) { /* new fcgi-app */ + if (!*(args[1])) { + ha_alert("parsing [%s:%d]: '%s' expects <name> as argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + + err = invalid_char(args[1]); + if (err) { + ha_alert("parsing [%s:%d]: character '%c' is not permitted in '%s' name '%s'.\n", + file, linenum, *err, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + for (curapp = fcgi_apps; curapp != NULL; curapp = curapp->next) { + if (strcmp(curapp->name, args[1]) == 0) { + ha_alert("Parsing [%s:%d]: fcgi-app section '%s' has the same name as another one declared at %s:%d.\n", + file, linenum, args[1], curapp->conf.file, curapp->conf.line); + err_code |= ERR_ALERT | ERR_FATAL; + } + } + + curapp = calloc(1, sizeof(*curapp)); + if (!curapp) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + curapp->next = fcgi_apps; + fcgi_apps = curapp; + curapp->flags = FCGI_APP_FL_KEEP_CONN; + curapp->docroot = ist(NULL); + curapp->index = ist(NULL); + curapp->pathinfo_re = NULL; + curapp->name = strdup(args[1]); + curapp->maxreqs = 1; + curapp->conf.file = strdup(file); + curapp->conf.line = linenum; + LIST_INIT(&curapp->acls); + LIST_INIT(&curapp->loggers); + LIST_INIT(&curapp->conf.args.list); + LIST_INIT(&curapp->conf.rules); + + /* Set info about authentication */ + if (!fcgi_app_add_rule(curapp, FCGI_RULE_SET_PARAM, "REMOTE_USER", "%[http_auth_user]", NULL, &errmsg) || + !fcgi_app_add_rule(curapp, FCGI_RULE_SET_PARAM, "AUTH_TYPE", "%[http_auth_type]", NULL, &errmsg)) { + ha_alert("parsing [%s:%d] : '%s' : %s.\n", file, linenum, + args[1], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + } + + /* Hide hop-by-hop headers by default */ + if (!fcgi_app_add_rule(curapp, FCGI_RULE_HIDE_HDR, "connection", NULL, NULL, &errmsg) || + !fcgi_app_add_rule(curapp, FCGI_RULE_HIDE_HDR, "keep-alive", NULL, NULL, &errmsg) || + !fcgi_app_add_rule(curapp, FCGI_RULE_HIDE_HDR, "authorization", NULL, NULL, &errmsg) || + !fcgi_app_add_rule(curapp, FCGI_RULE_HIDE_HDR, "proxy", NULL, NULL, &errmsg) || + !fcgi_app_add_rule(curapp, FCGI_RULE_HIDE_HDR, "proxy-authorization", NULL, NULL, &errmsg) || + !fcgi_app_add_rule(curapp, FCGI_RULE_HIDE_HDR, "proxy-authenticate", NULL, NULL, &errmsg) || + !fcgi_app_add_rule(curapp, FCGI_RULE_HIDE_HDR, "te", NULL, NULL, &errmsg) || + !fcgi_app_add_rule(curapp, FCGI_RULE_HIDE_HDR, "trailers", NULL, NULL, &errmsg) || + !fcgi_app_add_rule(curapp, FCGI_RULE_HIDE_HDR, "transfer-encoding", NULL, NULL, &errmsg) || + !fcgi_app_add_rule(curapp, FCGI_RULE_HIDE_HDR, "upgrade", NULL, NULL, &errmsg)) { + ha_alert("parsing [%s:%d] : '%s' : %s.\n", file, linenum, + args[1], 
errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + } + } + else if (strcmp(args[0], "docroot") == 0) { + if (!*(args[1])) { + ha_alert("parsing [%s:%d] : '%s' expects <path> as argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto out; + istfree(&curapp->docroot); + curapp->docroot = ist(strdup(args[1])); + if (!isttest(curapp->docroot)) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + } + } + else if (strcmp(args[0], "path-info") == 0) { + if (!*(args[1])) { + ha_alert("parsing [%s:%d] : '%s' expects <regex> as argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto out; + regex_free(curapp->pathinfo_re); + curapp->pathinfo_re = regex_comp(args[1], 1, 1, &errmsg); + if (!curapp->pathinfo_re) { + ha_alert("parsing [%s:%d] : '%s' : %s.\n", file, linenum, + args[1], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + } + } + else if (strcmp(args[0], "index") == 0) { + if (!*(args[1])) { + ha_alert("parsing [%s:%d] : '%s' expects <filename> as argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto out; + istfree(&curapp->index); + curapp->index = ist(strdup(args[1])); + if (!isttest(curapp->index)) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + } + } + else if (strcmp(args[0], "acl") == 0) { + const char *err; + err = invalid_char(args[1]); + if (err) { + ha_alert("parsing [%s:%d] : character '%c' is not permitted in acl name '%s'.\n", + file, linenum, *err, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (strcasecmp(args[1], "or") == 0) { + ha_alert("parsing [%s:%d] : acl name '%s' will never match. 
'or' is used to express a " + "logical disjunction within a condition.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (parse_acl((const char **)args+1, &curapp->acls, &errmsg, &curapp->conf.args, file, linenum) == NULL) { + ha_alert("parsing [%s:%d] : error detected while parsing ACL '%s' : %s.\n", + file, linenum, args[1], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "set-param") == 0) { + if (!*(args[1]) || !*(args[2])) { + ha_alert("parsing [%s:%d] : '%s' expects <name> and <value> as arguments.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + type = FCGI_RULE_SET_PARAM; + name = args[1]; + value = args[2]; + cond = NULL; + args += 3; + + parse_cond_rule: + if (!*(args[0])) /* No condition */ + goto add_rule; + + if (strcmp(args[0], "if") == 0) + cond = parse_acl_cond((const char **)args+1, &curapp->acls, ACL_COND_IF, &errmsg, &curapp->conf.args, + file, linenum); + else if (strcmp(args[0], "unless") == 0) + cond = parse_acl_cond((const char **)args+1, &curapp->acls, ACL_COND_UNLESS, &errmsg, &curapp->conf.args, + file, linenum); + if (!cond) { + ha_alert("parsing [%s:%d] : '%s' : %s.\n", file, linenum, + name, errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + } + add_rule: + if (!fcgi_app_add_rule(curapp, type, name, value, cond, &errmsg)) { + ha_alert("parsing [%s:%d] : '%s' : %s.\n", file, linenum, + name, errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + } + } +#if 0 /* Disabled for now */ + else if (!strcmp(args[0], "unset-param")) { + if (!*(args[1])) { + ha_alert("parsing [%s:%d] : '%s' expects <name> as arguments.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + type = FCGI_RULE_UNSET_PARAM; + name = args[1]; + value = NULL; + cond = NULL; + args += 2; + goto parse_cond_rule; + } +#endif + else if (strcmp(args[0], "pass-header") == 0) { + if (!*(args[1])) { + ha_alert("parsing [%s:%d] : '%s' expects <name> as arguments.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + type = FCGI_RULE_PASS_HDR; + name = args[1]; + value = NULL; + cond = NULL; + args += 2; + goto parse_cond_rule; + } +#if 0 /* Disabled for now */ + else if (!strcmp(args[0], "hide-header")) { + if (!*(args[1])) { + ha_alert("parsing [%s:%d] : '%s' expects <name> as arguments.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + type = FCGI_RULE_HIDE_HDR; + name = args[1]; + value = NULL; + cond = NULL; + args += 2; + goto parse_cond_rule; + } +#endif + else if (strcmp(args[0], "option") == 0) { + if (!*(args[1])) { + ha_alert("parsing [%s:%d]: '%s' expects an option name.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + } + else if (strcmp(args[1], "keep-conn") == 0) { + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto out; + if (kwm == KWM_STD) + curapp->flags |= FCGI_APP_FL_KEEP_CONN; + else if (kwm == KWM_NO) + curapp->flags &= ~FCGI_APP_FL_KEEP_CONN; + } + else if (strcmp(args[1], "get-values") == 0) { + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto out; + if (kwm == KWM_STD) + curapp->flags |= FCGI_APP_FL_GET_VALUES; + else if (kwm == KWM_NO) + curapp->flags &= ~FCGI_APP_FL_GET_VALUES; + } + else if (strcmp(args[1], "mpxs-conns") == 0) { + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto out; + if (kwm == KWM_STD) + curapp->flags |= FCGI_APP_FL_MPXS_CONNS; + else if (kwm == 
KWM_NO) + curapp->flags &= ~FCGI_APP_FL_MPXS_CONNS; + } + else if (strcmp(args[1], "max-reqs") == 0) { + if (kwm != KWM_STD) { + ha_alert("parsing [%s:%d]: negation/default is not supported for option '%s'.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (!*(args[2])) { + ha_alert("parsing [%s:%d]: option '%s' expects an integer argument.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (alertif_too_many_args_idx(1, 1, file, linenum, args, &err_code)) + goto out; + + curapp->maxreqs = atol(args[2]); + if (!curapp->maxreqs) { + ha_alert("parsing [%s:%d]: option '%s' expects a strictly positive integer argument.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else { + ha_alert("parsing [%s:%d] : unknown option '%s'.\n", file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + } + } + else if (strcmp(args[0], "log-stderr") == 0) { + if (!parse_logger(args, &curapp->loggers, (kwm == KWM_NO), file, linenum, &errmsg)) { + ha_alert("parsing [%s:%d] : %s : %s\n", file, linenum, args[0], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + } + } + else { + ha_alert("parsing [%s:%d]: unknown keyword '%s' in '%s' section\n", file, linenum, args[0], "fcgi-app"); + err_code |= ERR_ALERT | ERR_FATAL; + } + +out: + free(errmsg); + return err_code; +} + + +/**************************************************************************/ +/*********************** FCGI Deinit functions ****************************/ +/**************************************************************************/ +void fcgi_apps_deinit() +{ + struct fcgi_app *curapp, *nextapp; + struct logger *log, *logb; + + for (curapp = fcgi_apps; curapp != NULL; curapp = nextapp) { + struct fcgi_rule_conf *rule, *back; + + free(curapp->name); + istfree(&curapp->docroot); + istfree(&curapp->index); + regex_free(curapp->pathinfo_re); + free(curapp->conf.file); + + list_for_each_entry_safe(log, logb, &curapp->loggers, list) { + LIST_DELETE(&log->list); + free(log); + } + + list_for_each_entry_safe(rule, back, &curapp->conf.rules, list) { + LIST_DELETE(&rule->list); + fcgi_release_rule_conf(rule); + } + + nextapp = curapp->next; + free(curapp); + } +} + + +/**************************************************************************/ +/*************** Keywords definition and registration *********************/ +/**************************************************************************/ +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_LISTEN, "use-fcgi-app", proxy_parse_use_fcgi_app }, + { 0, NULL, NULL }, +}}; + +// FIXME: Add rep.fcgi smp_fetch +static struct sample_fetch_kw_list sample_fetch_keywords = {ILH, { + { "fcgi.docroot", smp_fetch_fcgi_docroot, 0, NULL, SMP_T_STR, SMP_USE_HRQHV }, + { "fcgi.index", smp_fetch_fcgi_index, 0, NULL, SMP_T_STR, SMP_USE_HRQHV }, + { /* END */ } +}}; + +/* Declare the filter parser for "fcgi-app" keyword */ +static struct flt_kw_list filter_kws = { "FCGI", { }, { + { "fcgi-app", parse_fcgi_flt, NULL }, + { NULL, NULL, NULL }, + } +}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &sample_fetch_keywords); +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); +INITCALL1(STG_REGISTER, flt_register_keywords, &filter_kws); + +INITCALL1(STG_REGISTER, hap_register_post_deinit, fcgi_apps_deinit); + +REGISTER_CONFIG_SECTION("fcgi-app", cfg_parse_fcgi_app, NULL); +REGISTER_CONFIG_POSTPARSER("fcgi-apps", cfg_fcgi_apps_postparser); + +/* + * Local variables: + * c-indent-level: 8 + * 
c-basic-offset: 8 + * End: + */ diff --git a/src/fcgi.c b/src/fcgi.c new file mode 100644 index 0000000..1d1a82b --- /dev/null +++ b/src/fcgi.c @@ -0,0 +1,294 @@ +/* + * FastCGI protocol processing + * + * Copyright (C) 2019 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <haproxy/buf.h> +#include <haproxy/fcgi.h> +#include <haproxy/istbuf.h> + +/* Encodes the header of an FCGI record into the chunk <out>. It returns non-zero on + * success and 0 on failure (buffer full). <out> is a chunk, so the wrapping is + * not handled by this function. It is the caller's responsibility to ensure + * enough contiguous space is available + */ +int fcgi_encode_record_hdr(struct buffer *out, const struct fcgi_header *h) +{ + size_t len = out->data; + + if (len + 8 >= b_size(out)) + return 0; + + out->area[len++] = h->vsn; + out->area[len++] = h->type; + out->area[len++] = ((h->id >> 8) & 0xff); + out->area[len++] = (h->id & 0xff); + out->area[len++] = ((h->len >> 8) & 0xff); + out->area[len++] = (h->len & 0xff); + out->area[len++] = h->padding; + out->area[len++] = 0; /* rsv */ + + out->data = len; + return 1; +} + +/* Decodes an FCGI record header from offset <o> of buffer <in> into descriptor + * <h>. The buffer may wrap so each byte read must be checked. The header is + * formed like this: + * + * b0 b1 b2 b3 b4 b5 b6 b7 + * +-----+------+-----+-----+------+------+--------+-----+ + * | vsn | type | id1 | id0 | len1 | len0 | padlen | rsv | + * +-----+------+-----+-----+------+------+--------+-----+ + * + * Returns zero if some bytes are missing, otherwise the number of read bytes. + */ +size_t fcgi_decode_record_hdr(const struct buffer *in, size_t o, struct fcgi_header *h) +{ + if (b_data(in) < o + 8) + return 0; + + h->vsn = (uint8_t)(*b_peek(in, o)); + h->type = (uint8_t)(*b_peek(in, o+1)); + h->id = ((uint8_t)(*b_peek(in, o+2)) << 8) + (uint8_t)(*b_peek(in, o+3)); + h->len = ((uint8_t)(*b_peek(in, o+4)) << 8) + (uint8_t)(*b_peek(in, o+5)); + h->padding = (uint8_t)(*b_peek(in, o+6)); + /* ignore rsv */ + + return 8; +} + +/* Encodes the payload part of a BEGIN_REQUEST record into the chunk <out>. It + * returns non-zero on success and 0 on failure (buffer full). <out> is a chunk, + * so the wrapping is not handled by this function.
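+ * The payload is laid out as the role on 2 bytes (network order), the flags + * on 1 byte, then 5 reserved bytes set to zero.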
It is the caller's + * responsibility to ensure enough contiguous space is available + */ +int fcgi_encode_begin_request(struct buffer *out, const struct fcgi_begin_request *r) +{ + size_t len = out->data; + + if (len + 8 >= b_size(out)) + return 0; + + out->area[len++] = ((r->role >> 8) & 0xff); + out->area[len++] = (r->role & 0xff); + out->area[len++] = r->flags; + out->area[len++] = 0; /* rsv */ + out->area[len++] = 0; + out->area[len++] = 0; + out->area[len++] = 0; + out->area[len++] = 0; + + out->data = len; + return 1; +} + +/* Encodes a parameter, part of the payload of a PARAM record, into the chunk + * <out>. It returns non-zero on success and 0 on failure (buffer full). <out> + * is a chunk, so the wrapping is not handled by this function. It is the caller's + * responsibility to ensure enough contiguous space is available. The + * parameter's name is converted to upper case and non-alphanumeric characters + * are replaced by an underscore. + */ +int fcgi_encode_param(struct buffer *out, const struct fcgi_param *p) +{ + size_t off, len = out->data; + int nbytes, vbytes; + + nbytes = (!(p->n.len >> 7) ? 1 : 4); + vbytes = (!(p->v.len >> 7) ? 1 : 4); + if ((len + nbytes + p->n.len + vbytes + p->v.len) >= b_size(out)) + return 0; + + if (nbytes == 1) + out->area[len++] = (p->n.len & 0xff); + else { + out->area[len++] = (((p->n.len >> 24) & 0xff) | 0x80); + out->area[len++] = ((p->n.len >> 16) & 0xff); + out->area[len++] = ((p->n.len >> 8) & 0xff); + out->area[len++] = (p->n.len & 0xff); + } + + if (vbytes == 1) + out->area[len++] = (p->v.len & 0xff); + else { + out->area[len++] = (((p->v.len >> 24) & 0xff) | 0x80); + out->area[len++] = ((p->v.len >> 16) & 0xff); + out->area[len++] = ((p->v.len >> 8) & 0xff); + out->area[len++] = (p->v.len & 0xff); + } + + for (off = 0; off < p->n.len; off++) { + if (isalnum((unsigned char)p->n.ptr[off])) + out->area[len++] = ist_uc[(unsigned char)p->n.ptr[off]]; + else + out->area[len++] = '_'; + } + if (p->v.len) { + ist2bin(out->area + len, p->v); + len += p->v.len; + } + + out->data = len; + return 1; +} + +/* Decodes a parameter of a PARAM record from offset <o> of buffer <in> into the + * FCGI param <p>. The buffer may wrap so each byte read must be checked. + * Returns zero if some bytes are missing, otherwise the number of read bytes. + */ +size_t fcgi_decode_param(const struct buffer *in, size_t o, struct fcgi_param *p) +{ + size_t data = b_data(in); + size_t nlen, vlen, len = 0; + uint8_t b0, b1, b2, b3; + + if (data < o + 1) + return 0; + b0 = *b_peek(in, o++); + if (!(b0 >> 7)) { + nlen = b0; + len++; + } + else { + if (data < o + 3) + return 0; + b1 = *b_peek(in, o++); + b2 = *b_peek(in, o++); + b3 = *b_peek(in, o++); + nlen = ((b0 & 0x7f) << 24) + (b1 << 16) + (b2 << 8) + b3; + len += 4; + } + + if (data < o + 1) + return 0; + b0 = *b_peek(in, o++); + if (!(b0 >> 7)) { + vlen = b0; + len++; + } + else { + if (data < o + 3) + return 0; + b1 = *b_peek(in, o++); + b2 = *b_peek(in, o++); + b3 = *b_peek(in, o++); + vlen = ((b0 & 0x7f) << 24) + (b1 << 16) + (b2 << 8) + b3; + len += 4; + } + + if (data < nlen + vlen) + return 0; + + p->n = ist2(b_peek(in, o), nlen); + p->v = ist2(b_peek(in, o + nlen), vlen); + len += nlen + vlen; + + return len; +} + + +/* Decodes a parameter of a PARAM record from offset <o> of buffer <in> into the + * FCGI param <p>. To call this function, the buffer must not wrap. Returns zero + * if some bytes are missing, otherwise the number of read bytes.
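+ * As in fcgi_decode_param(), each length is encoded on a single byte when + * lower than 128, otherwise on 4 bytes with the high bit of the first byte + * set; e.g. a length of 130 is encoded as the sequence 0x80 0x00 0x00 0x82.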
+ */ +size_t fcgi_aligned_decode_param(const struct buffer *in, size_t o, struct fcgi_param *p) +{ + size_t data = b_data(in); + size_t nlen, vlen, len = 0; + uint8_t b0, b1, b2, b3; + + if (data < o + 1) + return 0; + b0 = in->area[o++]; + if (!(b0 >> 7)) { + nlen = b0; + len++; + } + else { + if (data < o + 3) + return 0; + b1 = in->area[o++]; + b2 = in->area[o++]; + b3 = in->area[o++]; + nlen = ((b0 & 0x7f) << 24) + (b1 << 16) + (b2 << 8) + b3; + len += 4; + } + + if (data < o + 1) + return 0; + b0 = in->area[o++]; + if (!(b0 >> 7)) { + vlen = b0; + len++; + } + else { + if (data < o + 3) + return 0; + b1 = in->area[o++]; + b2 = in->area[o++]; + b3 = in->area[o++]; + vlen = ((b0 & 0x7f) << 24) + (b1 << 16) + (b2 << 8) + b3; + len += 4; + } + + if (data < nlen + vlen) + return 0; + + p->n = ist2(in->area + o, nlen); + p->v = ist2(in->area + o + nlen, vlen); + len += nlen + vlen; + + return len; +} + +/* Decodes the payload of an END_REQUEST record from offset <o> of buffer <in> into + * the FCGI end-request record <rec>. The buffer may wrap so each byte read must be + * checked. Returns zero if some bytes are missing, otherwise the number of read + * bytes. + */ +size_t fcgi_decode_end_request(const struct buffer *in, size_t o, struct fcgi_end_request *rec) +{ + uint8_t b0, b1, b2, b3; + + if (b_data(in) < o + 8) + return 0; + + b0 = *b_peek(in, o++); + b1 = *b_peek(in, o++); + b2 = *b_peek(in, o++); + b3 = *b_peek(in, o++); + rec->status = ((b0 & 0x7f) << 24) + (b1 << 16) + (b2 << 8) + b3; + rec->errcode = *b_peek(in, o++); + o += 3; /* ignore rsv */ + + return 8; +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/fd.c b/src/fd.c new file mode 100644 index 0000000..9d34315 --- /dev/null +++ b/src/fd.c @@ -0,0 +1,1348 @@ +/* + * File descriptors management functions. + * + * Copyright 2000-2014 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * There is no direct link between the FD and the updates list. There is only a + * bit in the fdtab[] to indicate that a file descriptor is already present in + * the updates list. Once an fd is present in the updates list, it will have to + * be considered even if its changes are reverted in the middle or if the fd is + * replaced. + * + * The event state for an FD, as found in fdtab[].state, is maintained for each + * direction. The state field is built this way, with R bits in the low nibble + * and W bits in the high nibble for ease of access and debugging : + * + * 7 6 5 4 3 2 1 0 + * [ 0 | 0 | RW | AW | 0 | 0 | RR | AR ] + * + * A* = active *R = read + * R* = ready *W = write + * + * An FD is marked "active" when there is a desire to use it. + * An FD is marked "ready" when it has not faced a new EAGAIN since last wake-up + * (it is a cache of the last EAGAIN regardless of polling changes). Each poller + * has its own "polled" state for the same fd, as stored in the polled_mask. + * + * We have 4 possible states for each direction based on these 2 flags : + * + * +---+---+----------+---------------------------------------------+ + * | R | A | State | Description | + * +---+---+----------+---------------------------------------------+ + * | 0 | 0 | DISABLED | No activity desired, not ready. | + * | 0 | 1 | ACTIVE | Activity desired.
| + * | 1 | 0 | STOPPED | End of activity. | + * | 1 | 1 | READY | Activity desired and reported. | + * +---+---+----------+---------------------------------------------+ + * + * The transitions are pretty simple : + * - fd_want_*() : set flag A + * - fd_stop_*() : clear flag A + * - fd_cant_*() : clear flag R (when facing EAGAIN) + * - fd_may_*() : set flag R (upon return from poll()) + * + * Each poller then computes its own polled state : + * if (A) { if (!R) P := 1 } else { P := 0 } + * + * The state transitions look like the diagram below. + * + * may +----------+ + * ,----| DISABLED | (READY=0, ACTIVE=0) + * | +----------+ + * | want | ^ + * | | | + * | v | stop + * | +----------+ + * | | ACTIVE | (READY=0, ACTIVE=1) + * | +----------+ + * | | ^ + * | may | | + * | v | EAGAIN (can't) + * | +--------+ + * | | READY | (READY=1, ACTIVE=1) + * | +--------+ + * | stop | ^ + * | | | + * | v | want + * | +---------+ + * `--->| STOPPED | (READY=1, ACTIVE=0) + * +---------+ + */ + +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/resource.h> +#include <sys/uio.h> + +#if defined(USE_POLL) +#include <poll.h> +#include <errno.h> +#endif + +#include <haproxy/api.h> +#include <haproxy/activity.h> +#include <haproxy/cfgparse.h> +#include <haproxy/fd.h> +#include <haproxy/global.h> +#include <haproxy/log.h> +#include <haproxy/port_range.h> +#include <haproxy/ticks.h> +#include <haproxy/tools.h> + + +struct fdtab *fdtab __read_mostly = NULL; /* array of all the file descriptors */ +struct polled_mask *polled_mask __read_mostly = NULL; /* Array for the polled_mask of each fd */ +struct fdinfo *fdinfo __read_mostly = NULL; /* less-often used infos for file descriptors */ +int totalconn; /* total # of terminated sessions */ +int actconn; /* # of active sessions */ + +struct poller pollers[MAX_POLLERS] __read_mostly; +struct poller cur_poller __read_mostly; +int nbpollers = 0; + +volatile struct fdlist update_list[MAX_TGROUPS]; // Global update list + +THREAD_LOCAL int *fd_updt = NULL; // FD updates list +THREAD_LOCAL int fd_nbupdt = 0; // number of updates in the list +THREAD_LOCAL int poller_rd_pipe = -1; // Pipe to wake the thread +int poller_wr_pipe[MAX_THREADS] __read_mostly; // Pipe to wake the threads + +volatile int ha_used_fds = 0; // Number of FD we're currently using +static struct fdtab *fdtab_addr; /* address of the allocated area containing fdtab */ + +/* adds fd <fd> to fd list <list> if it was not yet in it */ +void fd_add_to_fd_list(volatile struct fdlist *list, int fd) +{ + int next; + int new; + int old; + int last; + +redo_next: + next = HA_ATOMIC_LOAD(&fdtab[fd].update.next); + /* Check that we're not already in the cache, and if not, lock us. 
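+ * As can be inferred from the code below and from fd_rm_from_fd_list(), the + * update.next field follows this convention: -1 marks the end of the list, + * -2 means "locked", and values <= -3 encode "not in the list" as -(next + 4).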
*/ + if (next > -2) + goto done; + if (next == -2) + goto redo_next; + if (!_HA_ATOMIC_CAS(&fdtab[fd].update.next, &next, -2)) + goto redo_next; + __ha_barrier_atomic_store(); + + new = fd; +redo_last: + /* First, insert in the linked list */ + last = list->last; + old = -1; + + fdtab[fd].update.prev = -2; + /* Make sure the "prev" store is visible before we update the last entry */ + __ha_barrier_store(); + + if (unlikely(last == -1)) { + /* list is empty, try to add ourselves alone so that list->last=fd */ + if (unlikely(!_HA_ATOMIC_CAS(&list->last, &old, new))) + goto redo_last; + + /* list->first was necessarily -1, we're guaranteed to be alone here */ + list->first = fd; + } else { + /* adding ourselves past the last element + * The CAS will only succeed if its next is -1, + * which means it's in the cache, and the last element. + */ + if (unlikely(!_HA_ATOMIC_CAS(&fdtab[last].update.next, &old, new))) + goto redo_last; + + /* Then, update the last entry */ + list->last = fd; + } + __ha_barrier_store(); + /* since we're alone at the end of the list and still locked (-2), + * we know no one tried to add past us. Mark the end of list. + */ + fdtab[fd].update.prev = last; + fdtab[fd].update.next = -1; + __ha_barrier_store(); +done: + return; +} + +/* removes fd <fd> from fd list <list> */ +void fd_rm_from_fd_list(volatile struct fdlist *list, int fd) +{ +#if defined(HA_HAVE_CAS_DW) || defined(HA_CAS_IS_8B) + volatile union { + struct fdlist_entry ent; + uint64_t u64; + uint32_t u32[2]; + } cur_list, next_list; +#endif + int old; + int new = -2; + int prev; + int next; + int last; +lock_self: +#if (defined(HA_CAS_IS_8B) || defined(HA_HAVE_CAS_DW)) + next_list.ent.next = next_list.ent.prev = -2; + cur_list.ent = *(volatile typeof(fdtab->update)*)&fdtab[fd].update; + /* First, attempt to lock our own entries */ + do { + /* The FD is not in the FD cache, give up */ + if (unlikely(cur_list.ent.next <= -3)) + return; + if (unlikely(cur_list.ent.prev == -2 || cur_list.ent.next == -2)) + goto lock_self; + } while ( +#ifdef HA_CAS_IS_8B + unlikely(!_HA_ATOMIC_CAS(((uint64_t *)&fdtab[fd].update), (uint64_t *)&cur_list.u64, next_list.u64)) +#else + unlikely(!_HA_ATOMIC_DWCAS(((long *)&fdtab[fd].update), (uint32_t *)&cur_list.u32, (const uint32_t *)&next_list.u32)) +#endif + ); + next = cur_list.ent.next; + prev = cur_list.ent.prev; + +#else +lock_self_next: + next = HA_ATOMIC_LOAD(&fdtab[fd].update.next); + if (next == -2) + goto lock_self_next; + if (next <= -3) + goto done; + if (unlikely(!_HA_ATOMIC_CAS(&fdtab[fd].update.next, &next, -2))) + goto lock_self_next; +lock_self_prev: + prev = HA_ATOMIC_LOAD(&fdtab[fd].update.prev); + if (prev == -2) + goto lock_self_prev; + if (unlikely(!_HA_ATOMIC_CAS(&fdtab[fd].update.prev, &prev, -2))) + goto lock_self_prev; +#endif + __ha_barrier_atomic_store(); + + /* Now, lock the entries of our neighbours */ + if (likely(prev != -1)) { +redo_prev: + old = fd; + + if (unlikely(!_HA_ATOMIC_CAS(&fdtab[prev].update.next, &old, new))) { + if (unlikely(old == -2)) { + /* Neighbour already locked, give up and + * retry again once it's done + */ + fdtab[fd].update.prev = prev; + __ha_barrier_store(); + fdtab[fd].update.next = next; + __ha_barrier_store(); + goto lock_self; + } + goto redo_prev; + } + } + if (likely(next != -1)) { +redo_next: + old = fd; + if (unlikely(!_HA_ATOMIC_CAS(&fdtab[next].update.prev, &old, new))) { + if (unlikely(old == -2)) { + /* Neighbour already locked, give up and + * retry again once it's done + */ + if (prev != -1) {
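+ /* restore prev's next, which we had locked to -2 above */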
fdtab[prev].update.next = fd; + __ha_barrier_store(); + } + fdtab[fd].update.prev = prev; + __ha_barrier_store(); + fdtab[fd].update.next = next; + __ha_barrier_store(); + goto lock_self; + } + goto redo_next; + } + } + if (list->first == fd) + list->first = next; + __ha_barrier_store(); + last = list->last; + while (unlikely(last == fd && (!_HA_ATOMIC_CAS(&list->last, &last, prev)))) + __ha_compiler_barrier(); + /* Make sure we let other threads know we're no longer in cache, + * before releasing our neighbours. + */ + __ha_barrier_store(); + if (likely(prev != -1)) + fdtab[prev].update.next = next; + __ha_barrier_store(); + if (likely(next != -1)) + fdtab[next].update.prev = prev; + __ha_barrier_store(); + /* Ok, now we're out of the fd cache */ + fdtab[fd].update.next = -(next + 4); + __ha_barrier_store(); +done: + return; +} + +/* deletes the FD once nobody uses it anymore, as detected by the caller via its + * thread_mask being zero and its running mask turning to zero. There is no + * protection against concurrent accesses, it's up to the caller to make sure + * only the last thread will call it. If called under isolation, it is safe to + * call this from another group than the FD's. This is only for internal use, + * please use fd_delete() instead. + */ +void _fd_delete_orphan(int fd) +{ + int tgrp = fd_tgid(fd); + uint fd_disown; + + fd_disown = fdtab[fd].state & FD_DISOWN; + if (fdtab[fd].state & FD_LINGER_RISK) { + /* this is generally set when connecting to servers */ + DISGUISE(setsockopt(fd, SOL_SOCKET, SO_LINGER, + (struct linger *) &nolinger, sizeof(struct linger))); + } + + /* It's expected that a close() will result in the FD disappearing from + * pollers, but some pollers may have some internal bookkeeping to be + * done prior to the call (e.g. remove references from internal tables). + */ + if (cur_poller.clo) + cur_poller.clo(fd); + + /* now we're about to reset some of this FD's fields. We don't want + * anyone to grab it anymore and we need to make sure those which could + * possibly have stumbled upon it right now are leaving before we + * proceed. This is done in two steps. First we reset the tgid so that + * fd_take_tgid() and fd_grab_tgid() fail, then we wait for existing + * ref counts to drop. Past this point we're alone dealing with the + * FD's thread/running/update/polled masks. + */ + fd_reset_tgid(fd); + + while (_HA_ATOMIC_LOAD(&fdtab[fd].refc_tgid) != 0) // refc==0 ? + __ha_cpu_relax(); + + /* we don't want this FD anymore in the global list */ + fd_rm_from_fd_list(&update_list[tgrp - 1], fd); + + /* no more updates on this FD are relevant anymore */ + HA_ATOMIC_STORE(&fdtab[fd].update_mask, 0); + if (fd_nbupdt > 0 && fd_updt[fd_nbupdt - 1] == fd) + fd_nbupdt--; + + port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port); + polled_mask[fd].poll_recv = polled_mask[fd].poll_send = 0; + + fdtab[fd].state = 0; + +#ifdef DEBUG_FD + fdtab[fd].event_count = 0; +#endif + fdinfo[fd].port_range = NULL; + fdtab[fd].owner = NULL; + + /* perform the close() call last as it's what unlocks the instant reuse + * of this FD by any other thread. + */ + if (!fd_disown) + close(fd); + _HA_ATOMIC_DEC(&ha_used_fds); +} + +/* Deletes an FD from the fdsets. The file descriptor is also closed, possibly + * asynchronously. It is safe to call it from another thread from the same + * group as the FD's or from a thread from a different group.
However if called + * from a thread from another group, there is an extra cost involved because + * the operation is performed under thread isolation, so doing so must be + * reserved for ultra-rare cases (e.g. stopping a listener). + */ +void fd_delete(int fd) +{ + /* This must never happen and would definitely indicate a bug, in + * addition to overwriting some unexpected memory areas. + */ + BUG_ON(fd < 0 || fd >= global.maxsock); + + /* NOTE: The master when going into reexec mode re-closes all FDs after + * they were already dispatched. But we know we didn't start the polling + * threads so we can still close them. The masks will probably not match + * however so we force the value and erase the refcount if any. + */ + if (unlikely(global.mode & MODE_STARTING)) + fdtab[fd].refc_tgid = ti->tgid; + + /* the tgid cannot change before a complete close so we should never + * face the situation where we try to close an fd that was reassigned. + * However there is one corner case where this happens, it's when an + * attempt to pause a listener fails (e.g. abns), leaving the listener + * in fault state and it is forcefully stopped. This needs to be done + * under isolation, and it's quite rare (i.e. once per such FD per + * process). Since we'll be isolated we can clear the thread mask and + * close the FD ourselves. + */ + if (unlikely(fd_tgid(fd) != ti->tgid)) { + int must_isolate = !thread_isolated() && !(global.mode & MODE_STOPPING); + + if (must_isolate) + thread_isolate(); + + HA_ATOMIC_STORE(&fdtab[fd].thread_mask, 0); + HA_ATOMIC_STORE(&fdtab[fd].running_mask, 0); + _fd_delete_orphan(fd); + + if (must_isolate) + thread_release(); + return; + } + + /* we must postpone removal of an FD that may currently be in use + * by another thread. This can happen in the following two situations: + * - after a takeover, the owning thread closes the connection but + * the previous one just woke up from the poller and entered + * the FD handler iocb. That thread holds an entry in running_mask + * and requires removal protection. + * - multiple threads are accepting connections on a listener, and + * one of them (or even a separate one) decides to unbind the + * listener under the listener's lock while other ones still hold + * the running bit. + * In both situations the FD is marked as unused (thread_mask = 0) and + * will not take new bits in its running_mask so we have the guarantee + * that the last thread eliminating running_mask is the one allowed to + * safely delete the FD. Most of the time it will be the current thread. + * We still need to set and check the one-shot flag FD_MUST_CLOSE + * to take care of the rare cases where a thread wakes up on late I/O + * before the thread_mask is zero, and sets its bit in the running_mask + * just after the current thread finishes clearing its own bit, hence + * the two threads see themselves as last ones (which they really are). + */ + + HA_ATOMIC_OR(&fdtab[fd].running_mask, ti->ltid_bit); + HA_ATOMIC_OR(&fdtab[fd].state, FD_MUST_CLOSE); + HA_ATOMIC_STORE(&fdtab[fd].thread_mask, 0); + if (fd_clr_running(fd) == ti->ltid_bit) { + if (HA_ATOMIC_BTR(&fdtab[fd].state, FD_MUST_CLOSE_BIT)) { + _fd_delete_orphan(fd); + } + } +} + +/* makes the new fd non-blocking and clears all other O_* flags; this is meant + * to be used on new FDs. Returns -1 on failure. The result is disguised at the + * end because some callers need to be able to ignore it regardless of the libc + * attributes.
+ */ +int fd_set_nonblock(int fd) +{ + int ret = fcntl(fd, F_SETFL, O_NONBLOCK); + + return DISGUISE(ret); +} + +/* sets the close-on-exec flag on fd; returns -1 on failure. The result is + * disguised at the end because some callers need to be able to ignore it + * regardless of the libc attributes. + */ +int fd_set_cloexec(int fd) +{ + int flags, ret; + + flags = fcntl(fd, F_GETFD); + flags |= FD_CLOEXEC; + ret = fcntl(fd, F_SETFD, flags); + return DISGUISE(ret); +} + +/* Migrate an FD to a new thread <new_tid>. It is explicitly permitted to + * migrate to another thread group, the function takes the necessary locking + * for this. It is even permitted to migrate from a foreign group to another, + * but the calling thread must be certain that the FD is not about to close + * when doing so, reason why it is highly recommended that only one of the + * FD's owners performs this operation. The polling is completely disabled. + * The operation never fails. + */ +void fd_migrate_on(int fd, uint new_tid) +{ + struct thread_info *new_ti = &ha_thread_info[new_tid]; + + /* we must be alone to work on this idle FD. If not, it means that its + * poller is currently waking up and is about to use it, likely to + * close it on shut/error, but maybe also to process any unexpectedly + * pending data. It's also possible that the FD was closed and + * reassigned to another thread group, so let's be careful. + */ + fd_lock_tgid(fd, new_ti->tgid); + + /* now we have exclusive access to it. From now FD belongs to tid_bit + * for this tgid. + */ + HA_ATOMIC_STORE(&fdtab[fd].thread_mask, new_ti->ltid_bit); + + /* Make sure the FD doesn't have the active bit. It is possible that + * the fd is polled by the thread that used to own it, the new thread + * is supposed to call subscribe() later, to activate polling. + */ + fd_stop_both(fd); + + /* we're done with it. As soon as we unlock it, other threads from the + * target group can manipulate it. However it may only disappear once + * we drop the reference. + */ + fd_unlock_tgid(fd); + fd_drop_tgid(fd); +} + +/* + * Take over an FD belonging to another thread. + * <expected_owner> is the expected owner of the fd. + * Returns 0 on success, and -1 on failure. + */ +int fd_takeover(int fd, void *expected_owner) +{ + unsigned long old; + + /* protect ourselves against a delete then an insert for the same fd, + * if it happens, then the owner will no longer be the expected + * connection. + */ + if (fdtab[fd].owner != expected_owner) + return -1; + + /* we must be alone to work on this idle FD. If not, it means that its + * poller is currently waking up and is about to use it, likely to + * close it on shut/error, but maybe also to process any unexpectedly + * pending data. It's also possible that the FD was closed and + * reassigned to another thread group, so let's be careful. + */ + if (unlikely(!fd_grab_tgid(fd, ti->tgid))) + return -1; + + old = 0; + if (!HA_ATOMIC_CAS(&fdtab[fd].running_mask, &old, ti->ltid_bit)) { + fd_drop_tgid(fd); + return -1; + } + + /* success, from now on it's ours */ + HA_ATOMIC_STORE(&fdtab[fd].thread_mask, ti->ltid_bit); + + /* Make sure the FD doesn't have the active bit. It is possible that + * the fd is polled by the thread that used to own it, the new thread + * is supposed to call subscribe() later, to activate polling.
+ */ + fd_stop_recv(fd); + + /* we're done with it */ + HA_ATOMIC_AND(&fdtab[fd].running_mask, ~ti->ltid_bit); + + /* no more changes planned */ + fd_drop_tgid(fd); + return 0; +} + +void updt_fd_polling(const int fd) +{ + uint tgrp = fd_take_tgid(fd); + + /* closed ? may happen */ + if (!tgrp) + return; + + if (unlikely(tgrp != tgid && tgrp <= MAX_TGROUPS)) { + /* Hmmm delivered an update for another group... That may + * happen on suspend/resume of a listener for example when + * the FD was not even marked for running. Let's broadcast + * the update. + */ + unsigned long update_mask = fdtab[fd].update_mask; + int thr; + + while (!_HA_ATOMIC_CAS(&fdtab[fd].update_mask, &update_mask, + _HA_ATOMIC_LOAD(&ha_tgroup_info[tgrp - 1].threads_enabled))) + __ha_cpu_relax(); + + fd_add_to_fd_list(&update_list[tgrp - 1], fd); + + thr = one_among_mask(fdtab[fd].thread_mask & ha_tgroup_info[tgrp - 1].threads_enabled, + statistical_prng_range(ha_tgroup_info[tgrp - 1].count)); + thr += ha_tgroup_info[tgrp - 1].base; + wake_thread(thr); + + fd_drop_tgid(fd); + return; + } + + fd_drop_tgid(fd); + + if (tg->threads_enabled == 1UL || (fdtab[fd].thread_mask & tg->threads_enabled) == ti->ltid_bit) { + if (HA_ATOMIC_BTS(&fdtab[fd].update_mask, ti->ltid)) + return; + + fd_updt[fd_nbupdt++] = fd; + } else { + unsigned long update_mask = fdtab[fd].update_mask; + do { + if (update_mask == fdtab[fd].thread_mask) // FIXME: this works only on thread-groups 1 + return; + } while (!_HA_ATOMIC_CAS(&fdtab[fd].update_mask, &update_mask, fdtab[fd].thread_mask)); + + fd_add_to_fd_list(&update_list[tgid - 1], fd); + + if (fd_active(fd) && !(fdtab[fd].thread_mask & ti->ltid_bit)) { + /* we need to wake up another thread to handle it immediately, any will fit, + * so let's pick a random one so that it doesn't always end up on the same. + */ + int thr = one_among_mask(fdtab[fd].thread_mask & tg->threads_enabled, + statistical_prng_range(tg->count)); + thr += tg->base; + wake_thread(thr); + } + } +} + +/* Update events seen for FD <fd> and its state if needed. This should be + * called by the poller, passing FD_EV_*_{R,W,RW} in <evts>. FD_EV_ERR_* + * doesn't need to also pass FD_EV_SHUT_*, it's implied. ERR and SHUT are + * allowed to be reported regardless of R/W readiness. Returns one of + * FD_UPDT_*. + */ +int fd_update_events(int fd, uint evts) +{ + unsigned long locked; + uint old, new; + uint new_flags, must_stop; + ulong rmask, tmask; + + _HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_STUCK); // this thread is still running + + if (unlikely(!fd_grab_tgid(fd, ti->tgid))) { + /* the FD changed to another tgid, we can't safely + * check it anymore. The bits in the masks are not + * ours anymore and we're not allowed to touch them. + * Ours have already been cleared and the FD was + * closed in between so we can safely leave now. + */ + activity[tid].poll_drop_fd++; + return FD_UPDT_CLOSED; + } + + /* Do not take running_mask if not strictly needed (will trigger a + * cosmetic BUG_ON() in fd_insert() anyway if done). + */ + tmask = _HA_ATOMIC_LOAD(&fdtab[fd].thread_mask); + if (!(tmask & ti->ltid_bit)) + goto do_update; + + HA_ATOMIC_OR(&fdtab[fd].running_mask, ti->ltid_bit); + + /* From this point, our bit may possibly be in thread_mask, but it may + * still vanish, either because a takeover completed just before taking + * the bit above with the new owner deleting the FD, or because a + * takeover started just before taking the bit. 
In order to make sure a + * started takeover is complete, we need to verify that all bits of + * running_mask are present in thread_mask, since takeover first takes + * running then atomically replaces thread_mask. Once it's stable, if + * our bit remains there, no further takeover may happen because we + * hold running, but if our bit is not there it means we've lost the + * takeover race and have to decline touching the FD. Regarding the + * risk of deletion, our bit in running_mask prevents fd_delete() from + * finalizing the close, and the caller will leave the FD with a zero + * thread_mask and the FD_MUST_CLOSE flag set. It will then be our + * responsibility to close it. + */ + do { + rmask = _HA_ATOMIC_LOAD(&fdtab[fd].running_mask); + tmask = _HA_ATOMIC_LOAD(&fdtab[fd].thread_mask); + rmask &= ~ti->ltid_bit; + } while ((rmask & ~tmask) && (tmask & ti->ltid_bit)); + + /* Now tmask is stable. Do nothing if the FD was taken over under us */ + + if (!(tmask & ti->ltid_bit)) { + /* a takeover has started */ + activity[tid].poll_skip_fd++; + + if (fd_clr_running(fd) == ti->ltid_bit) + goto closed_or_migrated; + + goto do_update; + } + + /* with running we're safe now, we can drop the reference */ + fd_drop_tgid(fd); + + locked = (tmask != ti->ltid_bit); + + /* OK now we are guaranteed that our thread_mask was present and + * that we're allowed to update the FD. + */ + + new_flags = + ((evts & FD_EV_READY_R) ? FD_POLL_IN : 0) | + ((evts & FD_EV_READY_W) ? FD_POLL_OUT : 0) | + ((evts & FD_EV_SHUT_R) ? FD_POLL_HUP : 0) | + ((evts & FD_EV_ERR_RW) ? FD_POLL_ERR : 0); + + /* SHUTW reported while FD was active for writes is an error */ + if ((fdtab[fd].state & FD_EV_ACTIVE_W) && (evts & FD_EV_SHUT_W)) + new_flags |= FD_POLL_ERR; + + /* compute the inactive events reported late that must be stopped */ + must_stop = 0; + if (unlikely(!fd_active(fd))) { + /* both sides stopped */ + must_stop = FD_POLL_IN | FD_POLL_OUT; + } + else if (unlikely(!fd_recv_active(fd) && (evts & (FD_EV_READY_R | FD_EV_SHUT_R | FD_EV_ERR_RW)))) { + /* only send remains */ + must_stop = FD_POLL_IN; + } + else if (unlikely(!fd_send_active(fd) && (evts & (FD_EV_READY_W | FD_EV_SHUT_W | FD_EV_ERR_RW)))) { + /* only recv remains */ + must_stop = FD_POLL_OUT; + } + + if (new_flags & (FD_POLL_IN | FD_POLL_HUP | FD_POLL_ERR)) + new_flags |= FD_EV_READY_R; + + if (new_flags & (FD_POLL_OUT | FD_POLL_ERR)) + new_flags |= FD_EV_READY_W; + + old = fdtab[fd].state; + new = (old & ~FD_POLL_UPDT_MASK) | new_flags; + + if (unlikely(locked)) { + /* Locked FDs (those with more than 2 threads) are atomically updated */ + while (unlikely(new != old && !_HA_ATOMIC_CAS(&fdtab[fd].state, &old, new))) + new = (old & ~FD_POLL_UPDT_MASK) | new_flags; + } else { + if (new != old) + fdtab[fd].state = new; + } + + if (fdtab[fd].iocb && fd_active(fd)) { + fdtab[fd].iocb(fd); + } + + /* + * We entered iocb with running set and with the valid tgid. + * Since then, this is what could have happened: + * - another thread tried to close the FD (e.g. timeout task from + * another one that owns it). We still have running set, but not + * tmask. We must call fd_clr_running() then _fd_delete_orphan() + * if we were the last one. + * + * - the iocb tried to close the FD => bit no more present in running, + * nothing to do. If it managed to close it, the poller's ->clo() + * has already been called. 
+ *
+ * - after we closed, the FD was reassigned to another thread in
+ *   another group => running not present, tgid differs, nothing to
+ *   do because if it got reassigned it indicates it was already
+ *   closed.
+ *
+ * There's no risk of takeover of the valid FD here during this period.
+ * Also if we still have running, immediately after we release it, the
+ * events above might instantly happen due to another thread taking
+ * over.
+ *
+ * As such, the only cases where the FD is still relevant are:
+ * - tgid still set and running still set (most common)
+ * - tgid still valid but running cleared due to fd_delete(): we may
+ *   still need to stop polling otherwise we may keep it enabled
+ *   while waiting for other threads to close it.
+ * And given that we may need to program a tentative update in case we
+ * don't immediately close, it's easier to grab the tgid during the
+ * whole check.
+ */
+
+	if (!fd_grab_tgid(fd, tgid))
+		return FD_UPDT_CLOSED;
+
+	tmask = _HA_ATOMIC_LOAD(&fdtab[fd].thread_mask);
+
+	/* another thread might have attempted to close this FD in the
+	 * meantime (e.g. timeout task) striking on a previous thread and
+	 * closing. This is detected by us being the last owners of a
+	 * running_mask bit, and the thread_mask being zero. At the moment we
+	 * release the running bit, a takeover may also happen, so in practice
+	 * we check for our loss of the thread_mask bit, i.e. both thread_mask
+	 * and running_mask being 0 after we remove ourselves last. There is
+	 * no risk the FD gets reassigned to a different group since it's not
+	 * released until the real close() in _fd_delete_orphan().
+	 */
+	if (fd_clr_running(fd) == ti->ltid_bit && !(tmask & ti->ltid_bit))
+		goto closed_or_migrated;
+
+	/* we had to stop this FD and it still must be stopped after the I/O
+	 * cb's changes, so let's program an update for this.
+	 */
+	if (must_stop && !(fdtab[fd].update_mask & ti->ltid_bit)) {
+		if (((must_stop & FD_POLL_IN)  && !fd_recv_active(fd)) ||
+		    ((must_stop & FD_POLL_OUT) && !fd_send_active(fd)))
+			if (!HA_ATOMIC_BTS(&fdtab[fd].update_mask, ti->ltid))
+				fd_updt[fd_nbupdt++] = fd;
+	}
+
+	fd_drop_tgid(fd);
+	return FD_UPDT_DONE;
+
+ closed_or_migrated:
+	/* We only come here once we've last dropped running and the FD is
+	 * not for us as per !(tmask & ltid_bit). It may imply we're
+	 * responsible for closing it. Otherwise it's just a migration.
+	 */
+	if (HA_ATOMIC_BTR(&fdtab[fd].state, FD_MUST_CLOSE_BIT)) {
+		fd_drop_tgid(fd);
+		_fd_delete_orphan(fd);
+		return FD_UPDT_CLOSED;
+	}
+
+	/* So we were alone, no close bit, at best the FD was migrated, at
+	 * worst it's in the process of being closed by another thread. We must
+	 * be ultra-careful as it can be re-inserted by yet another thread as
+	 * the result of socket() or accept(). Let's just tell the poller the
+	 * FD was lost. If it was closed it was already removed and this will
+	 * only cost an update for nothing.
+	 */
+
+ do_update:
+	/* The FD is not closed but we don't want the poller to wake up for
+	 * it anymore.
+	 */
+	if (!HA_ATOMIC_BTS(&fdtab[fd].update_mask, ti->ltid))
+		fd_updt[fd_nbupdt++] = fd;
+
+	fd_drop_tgid(fd);
+	return FD_UPDT_MIGRATED;
+}
+
+/* This is used by pollers at boot time to re-register desired events for
+ * all FDs after new pollers have been created. It doesn't do much, it checks
+ * that their thread group matches the one in argument, and that the thread
+ * mask matches at least one of the bits in the mask, and if so, marks the FD
+ * as updated.
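+ *
+ * As an illustrative sketch (not a call present in this file), a poller
+ * rebinding the FDs of the current thread group could invoke:
+ *
+ *    fd_reregister_all(tgid, tg->threads_enabled);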
+ */ +void fd_reregister_all(int tgrp, ulong mask) +{ + int fd; + + for (fd = 0; fd < global.maxsock; fd++) { + if (!fdtab[fd].owner) + continue; + + /* make sure we don't register other tgroups' FDs. We just + * avoid needlessly taking the lock if not needed. + */ + if (!(_HA_ATOMIC_LOAD(&fdtab[fd].thread_mask) & mask) || + !fd_grab_tgid(fd, tgrp)) + continue; // was not for us anyway + + if (_HA_ATOMIC_LOAD(&fdtab[fd].thread_mask) & mask) + updt_fd_polling(fd); + fd_drop_tgid(fd); + } +} + +/* Tries to send <npfx> parts from <prefix> followed by <nmsg> parts from <msg> + * optionally followed by a newline if <nl> is non-null, to file descriptor + * <fd>. The message is sent atomically using writev(). It may be truncated to + * <maxlen> bytes if <maxlen> is non-null. There is no distinction between the + * two lists, it's just a convenience to help the caller prepend some prefixes + * when necessary. It takes the fd's lock to make sure no other thread will + * write to the same fd in parallel. Returns the number of bytes sent, or <=0 + * on failure. A limit to 31 total non-empty segments is enforced. The caller + * is responsible for taking care of making the fd non-blocking. + */ +ssize_t fd_write_frag_line(int fd, size_t maxlen, const struct ist pfx[], size_t npfx, const struct ist msg[], size_t nmsg, int nl) +{ + struct iovec iovec[32]; + size_t sent = 0; + int vec = 0; + int attempts = 0; + + if (!maxlen) + maxlen = ~0; + + /* keep one char for a possible trailing '\n' in any case */ + maxlen--; + + /* make an iovec from the concatenation of all parts of the original + * message. Skip empty fields and truncate the whole message to maxlen, + * leaving one spare iovec for the '\n'. + */ + while (vec < (sizeof(iovec) / sizeof(iovec[0]) - 1)) { + if (!npfx) { + pfx = msg; + npfx = nmsg; + nmsg = 0; + if (!npfx) + break; + } + + iovec[vec].iov_base = pfx->ptr; + iovec[vec].iov_len = MIN(maxlen, pfx->len); + maxlen -= iovec[vec].iov_len; + if (iovec[vec].iov_len) + vec++; + pfx++; npfx--; + }; + + if (nl) { + iovec[vec].iov_base = "\n"; + iovec[vec].iov_len = 1; + vec++; + } + + /* make sure we never interleave writes and we never block. This means + * we prefer to fail on collision than to block. But we don't want to + * lose too many logs so we just perform a few lock attempts then give + * up. + */ + + while (HA_ATOMIC_BTS(&fdtab[fd].state, FD_EXCL_SYSCALL_BIT)) { + if (++attempts >= 200) { + /* so that the caller knows the message couldn't be delivered */ + sent = -1; + errno = EAGAIN; + goto leave; + } + ha_thread_relax(); + } + + if (unlikely(!(fdtab[fd].state & FD_INITIALIZED))) { + HA_ATOMIC_OR(&fdtab[fd].state, FD_INITIALIZED); + if (!isatty(fd)) + fd_set_nonblock(fd); + } + sent = writev(fd, iovec, vec); + HA_ATOMIC_BTR(&fdtab[fd].state, FD_EXCL_SYSCALL_BIT); + + leave: + /* sent > 0 if the message was delivered */ + return sent; +} + +#if defined(USE_CLOSEFROM) +void my_closefrom(int start) +{ + closefrom(start); +} + +#elif defined(USE_POLL) +/* This is a portable implementation of closefrom(). It closes all open file + * descriptors starting at <start> and above. It relies on the fact that poll() + * will return POLLNVAL for each invalid (hence close) file descriptor passed + * in argument in order to skip them. It acts with batches of FDs and will + * typically perform one poll() call per 1024 FDs so the overhead is low in + * case all FDs have to be closed. 
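+ *
+ * A typical (illustrative) use is to drop every descriptor inherited
+ * from a parent process while keeping the standard ones:
+ *
+ *    my_closefrom(3);   // close everything above stdin/stdout/stderr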
+ */ +void my_closefrom(int start) +{ + struct pollfd poll_events[1024]; + struct rlimit limit; + int nbfds, fd, ret, idx; + int step, next; + + if (getrlimit(RLIMIT_NOFILE, &limit) == 0) + step = nbfds = limit.rlim_cur; + else + step = nbfds = 0; + + if (nbfds <= 0) { + /* set safe limit */ + nbfds = 1024; + step = 256; + } + + if (step > sizeof(poll_events) / sizeof(poll_events[0])) + step = sizeof(poll_events) / sizeof(poll_events[0]); + + while (start < nbfds) { + next = (start / step + 1) * step; + + for (fd = start; fd < next && fd < nbfds; fd++) { + poll_events[fd - start].fd = fd; + poll_events[fd - start].events = 0; + } + + do { + ret = poll(poll_events, fd - start, 0); + if (ret >= 0) + break; + } while (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR || errno == ENOMEM); + + if (ret) + ret = fd - start; + + for (idx = 0; idx < ret; idx++) { + if (poll_events[idx].revents & POLLNVAL) + continue; /* already closed */ + + fd = poll_events[idx].fd; + close(fd); + } + start = next; + } +} + +#else // defined(USE_POLL) + +/* This is a portable implementation of closefrom(). It closes all open file + * descriptors starting at <start> and above. This is a naive version for use + * when the operating system provides no alternative. + */ +void my_closefrom(int start) +{ + struct rlimit limit; + int nbfds; + + if (getrlimit(RLIMIT_NOFILE, &limit) == 0) + nbfds = limit.rlim_cur; + else + nbfds = 0; + + if (nbfds <= 0) + nbfds = 1024; /* safe limit */ + + while (start < nbfds) + close(start++); +} +#endif // defined(USE_POLL) + +/* Sets the RLIMIT_NOFILE setting to <new_limit> and returns the previous one + * in <old_limit> if the pointer is not NULL, even if set_rlimit() fails. The + * two pointers may point to the same variable as the copy happens after + * setting the new value. The value is only changed if at least one of the new + * limits is strictly higher than the current one, otherwise returns 0 without + * changing anything. The getrlimit() or setrlimit() syscall return value is + * returned and errno is preserved. + */ +int raise_rlim_nofile(struct rlimit *old_limit, struct rlimit *new_limit) +{ + struct rlimit limit = { }; + int ret = 0; + + ret = getrlimit(RLIMIT_NOFILE, &limit); + + if (ret == 0 && + (limit.rlim_max < new_limit->rlim_max || + limit.rlim_cur < new_limit->rlim_cur)) { + ret = setrlimit(RLIMIT_NOFILE, new_limit); + } + + if (old_limit) + *old_limit = limit; + + return ret; +} + +/* Computes the bounded poll() timeout based on the next expiration timer <next> + * by bounding it to MAX_DELAY_MS. <next> may equal TICK_ETERNITY. The pollers + * just needs to call this function right before polling to get their timeout + * value. Timeouts that are already expired (possibly due to a pending event) + * are accounted for in activity.poll_exp. + */ +int compute_poll_timeout(int next) +{ + int wait_time; + + if (!tick_isset(next)) + wait_time = MAX_DELAY_MS; + else if (tick_is_expired(next, now_ms)) { + activity[tid].poll_exp++; + wait_time = 0; + } + else { + wait_time = TICKS_TO_MS(tick_remain(now_ms, next)) + 1; + if (wait_time > MAX_DELAY_MS) + wait_time = MAX_DELAY_MS; + } + return wait_time; +} + +/* Handle the return of the poller, which consists in calculating the idle + * time, saving a few clocks, marking the thread harmful again etc. All that + * is some boring stuff that all pollers have to do anyway. 
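+ *
+ * As an illustrative sketch, a poller is expected to bracket its waiting
+ * syscall roughly like this (clock_entering_poll() being the counterpart
+ * provided by clock.c):
+ *
+ *    wait_time = compute_poll_timeout(next);
+ *    clock_entering_poll();
+ *    status = poll(fds, nbfd, wait_time);
+ *    fd_leaving_poll(wait_time, status);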
+ */ +void fd_leaving_poll(int wait_time, int status) +{ + clock_leaving_poll(wait_time, status); + + thread_harmless_end(); + thread_idle_end(); + + _HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_SLEEPING); +} + +/* disable the specified poller */ +void disable_poller(const char *poller_name) +{ + int p; + + for (p = 0; p < nbpollers; p++) + if (strcmp(pollers[p].name, poller_name) == 0) + pollers[p].pref = 0; +} + +void poller_pipe_io_handler(int fd) +{ + char buf[1024]; + /* Flush the pipe */ + while (read(fd, buf, sizeof(buf)) > 0); + fd_cant_recv(fd); +} + +/* allocate the per-thread fd_updt thus needs to be called early after + * thread creation. + */ +static int alloc_pollers_per_thread() +{ + fd_updt = calloc(global.maxsock, sizeof(*fd_updt)); + return fd_updt != NULL; +} + +/* Initialize the pollers per thread.*/ +static int init_pollers_per_thread() +{ + int mypipe[2]; + + if (pipe(mypipe) < 0) + return 0; + + poller_rd_pipe = mypipe[0]; + poller_wr_pipe[tid] = mypipe[1]; + fd_set_nonblock(poller_rd_pipe); + fd_insert(poller_rd_pipe, poller_pipe_io_handler, poller_pipe_io_handler, tgid, ti->ltid_bit); + fd_insert(poller_wr_pipe[tid], poller_pipe_io_handler, poller_pipe_io_handler, tgid, ti->ltid_bit); + fd_want_recv(poller_rd_pipe); + fd_stop_both(poller_wr_pipe[tid]); + return 1; +} + +/* Deinitialize the pollers per thread */ +static void deinit_pollers_per_thread() +{ + /* rd and wr are init at the same place, but only rd is init to -1, so + we rely to rd to close. */ + if (poller_rd_pipe > -1) { + fd_delete(poller_rd_pipe); + poller_rd_pipe = -1; + fd_delete(poller_wr_pipe[tid]); + poller_wr_pipe[tid] = -1; + } +} + +/* Release the pollers per thread, to be called late */ +static void free_pollers_per_thread() +{ + fd_nbupdt = 0; + ha_free(&fd_updt); +} + +/* + * Initialize the pollers till the best one is found. + * If none works, returns 0, otherwise 1. + */ +int init_pollers() +{ + int p; + struct poller *bp; + + if ((fdtab_addr = calloc(global.maxsock, sizeof(*fdtab) + 64)) == NULL) { + ha_alert("Not enough memory to allocate %d entries for fdtab!\n", global.maxsock); + goto fail_tab; + } + + /* always provide an aligned fdtab */ + fdtab = (struct fdtab*)((((size_t)fdtab_addr) + 63) & -(size_t)64); + + if ((polled_mask = calloc(global.maxsock, sizeof(*polled_mask))) == NULL) { + ha_alert("Not enough memory to allocate %d entries for polled_mask!\n", global.maxsock); + goto fail_polledmask; + } + + if ((fdinfo = calloc(global.maxsock, sizeof(*fdinfo))) == NULL) { + ha_alert("Not enough memory to allocate %d entries for fdinfo!\n", global.maxsock); + goto fail_info; + } + + for (p = 0; p < MAX_TGROUPS; p++) + update_list[p].first = update_list[p].last = -1; + + for (p = 0; p < global.maxsock; p++) { + /* Mark the fd as out of the fd cache */ + fdtab[p].update.next = -3; + } + + do { + bp = NULL; + for (p = 0; p < nbpollers; p++) + if (!bp || (pollers[p].pref > bp->pref)) + bp = &pollers[p]; + + if (!bp || bp->pref == 0) + break; + + if (bp->init(bp)) { + memcpy(&cur_poller, bp, sizeof(*bp)); + return 1; + } + } while (!bp || bp->pref == 0); + + free(fdinfo); + fail_info: + free(polled_mask); + fail_polledmask: + free(fdtab_addr); + fail_tab: + return 0; +} + +/* + * Deinitialize the pollers. + */ +void deinit_pollers() { + + struct poller *bp; + int p; + + for (p = 0; p < nbpollers; p++) { + bp = &pollers[p]; + + if (bp && bp->pref) + bp->term(bp); + } + + ha_free(&fdinfo); + ha_free(&fdtab_addr); + ha_free(&polled_mask); +} + +/* + * Lists the known pollers on <out>. 
+ * Should be performed only before initialization.
+ */
+int list_pollers(FILE *out)
+{
+	int p;
+	int last, next;
+	int usable;
+	struct poller *bp;
+
+	fprintf(out, "Available polling systems :\n");
+
+	usable = 0;
+	bp = NULL;
+	last = next = -1;
+	while (1) {
+		for (p = 0; p < nbpollers; p++) {
+			if ((next < 0 || pollers[p].pref > next)
+			    && (last < 0 || pollers[p].pref < last)) {
+				next = pollers[p].pref;
+				if (!bp || (pollers[p].pref > bp->pref))
+					bp = &pollers[p];
+			}
+		}
+
+		if (next == -1)
+			break;
+
+		for (p = 0; p < nbpollers; p++) {
+			if (pollers[p].pref == next) {
+				fprintf(out, " %10s : ", pollers[p].name);
+				if (pollers[p].pref == 0)
+					fprintf(out, "disabled, ");
+				else
+					fprintf(out, "pref=%3d, ", pollers[p].pref);
+				if (pollers[p].test(&pollers[p])) {
+					fprintf(out, " test result OK");
+					if (next > 0)
+						usable++;
+				} else {
+					fprintf(out, " test result FAILED");
+					if (bp == &pollers[p])
+						bp = NULL;
+				}
+				fprintf(out, "\n");
+			}
+		}
+		last = next;
+		next = -1;
+	};
+	fprintf(out, "Total: %d (%d usable), will use %s.\n", nbpollers, usable, bp ? bp->name : "none");
+	return 0;
+}
+
+/*
+ * Some pollers may lose their connection after a fork(). It may be necessary
+ * to re-initialize part of them again. Returns 0 in case of failure,
+ * otherwise 1. The fork() function may be NULL if unused. In case of error,
+ * the current poller is destroyed and the caller is responsible for trying
+ * another one by calling init_pollers() again.
+ */
+int fork_poller()
+{
+	int fd;
+	for (fd = 0; fd < global.maxsock; fd++) {
+		if (fdtab[fd].owner) {
+			HA_ATOMIC_OR(&fdtab[fd].state, FD_CLONED);
+		}
+	}
+
+	if (cur_poller.fork) {
+		if (cur_poller.fork(&cur_poller))
+			return 1;
+		cur_poller.term(&cur_poller);
+		return 0;
+	}
+	return 1;
+}
+
+/* config parser for global "tune.fd.edge-triggered", accepts "on" or "off" */
+static int cfg_parse_tune_fd_edge_triggered(char **args, int section_type, struct proxy *curpx,
+                                            const struct proxy *defpx, const char *file, int line,
+                                            char **err)
+{
+	if (too_many_args(1, args, err, NULL))
+		return -1;
+
+	if (strcmp(args[1], "on") == 0)
+		global.tune.options |= GTUNE_FD_ET;
+	else if (strcmp(args[1], "off") == 0)
+		global.tune.options &= ~GTUNE_FD_ET;
+	else {
+		memprintf(err, "'%s' expects either 'on' or 'off' but got '%s'.", args[0], args[1]);
+		return -1;
+	}
+	return 0;
+}
+
+/* config keyword parsers */
+static struct cfg_kw_list cfg_kws = {ILH, {
+	{ CFG_GLOBAL, "tune.fd.edge-triggered", cfg_parse_tune_fd_edge_triggered, KWF_EXPERIMENTAL },
+	{ 0, NULL, NULL }
+}};
+
+INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws);
+
+REGISTER_PER_THREAD_ALLOC(alloc_pollers_per_thread);
+REGISTER_PER_THREAD_INIT(init_pollers_per_thread);
+REGISTER_PER_THREAD_DEINIT(deinit_pollers_per_thread);
+REGISTER_PER_THREAD_FREE(free_pollers_per_thread);
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/src/filters.c b/src/filters.c
new file mode 100644
index 0000000..e55adee
--- /dev/null
+++ b/src/filters.c
@@ -0,0 +1,1125 @@
+/*
+ * Stream filters related variables and functions.
+ *
+ * Copyright (C) 2015 Qualys Inc., Christopher Faulet <cfaulet@qualys.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ * + */ + +#include <haproxy/api.h> +#include <haproxy/buf-t.h> +#include <haproxy/cfgparse.h> +#include <haproxy/compression.h> +#include <haproxy/errors.h> +#include <haproxy/filters.h> +#include <haproxy/flt_http_comp.h> +#include <haproxy/http_ana.h> +#include <haproxy/http_htx.h> +#include <haproxy/htx.h> +#include <haproxy/namespace.h> +#include <haproxy/proxy.h> +#include <haproxy/stream.h> +#include <haproxy/tools.h> +#include <haproxy/trace.h> + + +#define TRACE_SOURCE &trace_strm + +/* Pool used to allocate filters */ +DECLARE_STATIC_POOL(pool_head_filter, "filter", sizeof(struct filter)); + +static int handle_analyzer_result(struct stream *s, struct channel *chn, unsigned int an_bit, int ret); + +/* - RESUME_FILTER_LOOP and RESUME_FILTER_END must always be used together. + * The first one begins a loop and the seconds one ends it. + * + * - BREAK_EXECUTION must be used to break the loop and set the filter from + * which to resume the next time. + * + * Here is an example: + * + * RESUME_FILTER_LOOP(stream, channel) { + * ... + * if (cond) + * BREAK_EXECUTION(stream, channel, label); + * ... + * } RESUME_FILTER_END; + * ... + * label: + * ... + * + */ +#define RESUME_FILTER_LOOP(strm, chn) \ + do { \ + struct filter *filter; \ + \ + if (strm_flt(strm)->current[CHN_IDX(chn)]) { \ + filter = strm_flt(strm)->current[CHN_IDX(chn)]; \ + strm_flt(strm)->current[CHN_IDX(chn)] = NULL; \ + goto resume_execution; \ + } \ + \ + list_for_each_entry(filter, &strm_flt(s)->filters, list) { \ + resume_execution: + +#define RESUME_FILTER_END \ + } \ + } while(0) + +#define BREAK_EXECUTION(strm, chn, label) \ + do { \ + strm_flt(strm)->current[CHN_IDX(chn)] = filter; \ + goto label; \ + } while (0) + + +/* List head of all known filter keywords */ +static struct flt_kw_list flt_keywords = { + .list = LIST_HEAD_INIT(flt_keywords.list) +}; + +/* + * Registers the filter keyword list <kwl> as a list of valid keywords for next + * parsing sessions. + */ +void +flt_register_keywords(struct flt_kw_list *kwl) +{ + LIST_APPEND(&flt_keywords.list, &kwl->list); +} + +/* + * Returns a pointer to the filter keyword <kw>, or NULL if not found. If the + * keyword is found with a NULL ->parse() function, then an attempt is made to + * find one with a valid ->parse() function. This way it is possible to declare + * platform-dependant, known keywords as NULL, then only declare them as valid + * if some options are met. Note that if the requested keyword contains an + * opening parenthesis, everything from this point is ignored. + */ +struct flt_kw * +flt_find_kw(const char *kw) +{ + int index; + const char *kwend; + struct flt_kw_list *kwl; + struct flt_kw *ret = NULL; + + kwend = strchr(kw, '('); + if (!kwend) + kwend = kw + strlen(kw); + + list_for_each_entry(kwl, &flt_keywords.list, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) { + if ((strncmp(kwl->kw[index].kw, kw, kwend - kw) == 0) && + kwl->kw[index].kw[kwend-kw] == 0) { + if (kwl->kw[index].parse) + return &kwl->kw[index]; /* found it !*/ + else + ret = &kwl->kw[index]; /* may be OK */ + } + } + } + return ret; +} + +/* + * Dumps all registered "filter" keywords to the <out> string pointer. The + * unsupported keywords are only dumped if their supported form was not found. + * If <out> is NULL, the output is emitted using a more compact format on stdout. 
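+ *
+ * With a non-NULL <out>, each keyword is emitted on its own line as
+ * "[scope] keyword", for instance (illustrative output only):
+ *
+ *    [COMP] compression
+ *    [TRACE] trace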
+ */ +void +flt_dump_kws(char **out) +{ + struct flt_kw_list *kwl; + const struct flt_kw *kwp, *kw; + const char *scope = NULL; + int index; + + if (out) + *out = NULL; + + for (kw = kwp = NULL;; kwp = kw) { + list_for_each_entry(kwl, &flt_keywords.list, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) { + if ((kwl->kw[index].parse || + flt_find_kw(kwl->kw[index].kw) == &kwl->kw[index]) + && strordered(kwp ? kwp->kw : NULL, + kwl->kw[index].kw, + kw != kwp ? kw->kw : NULL)) { + kw = &kwl->kw[index]; + scope = kwl->scope; + } + } + } + + if (kw == kwp) + break; + + if (out) + memprintf(out, "%s[%4s] %s%s\n", *out ? *out : "", + scope, + kw->kw, + kw->parse ? "" : " (not supported)"); + else + printf("%s [%s]\n", + kw->kw, scope); + } +} + +/* + * Lists the known filters on <out> + */ +void +list_filters(FILE *out) +{ + char *filters, *p, *f; + + fprintf(out, "Available filters :\n"); + flt_dump_kws(&filters); + for (p = filters; (f = strtok_r(p,"\n",&p));) + fprintf(out, "\t%s\n", f); + free(filters); +} + +/* + * Parses the "filter" keyword. All keywords must be handled by filters + * themselves + */ +static int +parse_filter(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, char **err) +{ + struct flt_conf *fconf = NULL; + + /* Filter cannot be defined on a default proxy */ + if (curpx == defpx) { + memprintf(err, "parsing [%s:%d] : %s is not allowed in a 'default' section.", + file, line, args[0]); + return -1; + } + if (strcmp(args[0], "filter") == 0) { + struct flt_kw *kw; + int cur_arg; + + if (!*args[1]) { + memprintf(err, + "parsing [%s:%d] : missing argument for '%s' in %s '%s'.", + file, line, args[0], proxy_type_str(curpx), curpx->id); + goto error; + } + fconf = calloc(1, sizeof(*fconf)); + if (!fconf) { + memprintf(err, "'%s' : out of memory", args[0]); + goto error; + } + + cur_arg = 1; + kw = flt_find_kw(args[cur_arg]); + if (kw) { + if (!kw->parse) { + memprintf(err, "parsing [%s:%d] : '%s' : " + "'%s' option is not implemented in this version (check build options).", + file, line, args[0], args[cur_arg]); + goto error; + } + if (kw->parse(args, &cur_arg, curpx, fconf, err, kw->private) != 0) { + if (err && *err) + memprintf(err, "'%s' : '%s'", + args[0], *err); + else + memprintf(err, "'%s' : error encountered while processing '%s'", + args[0], args[cur_arg]); + goto error; + } + } + else { + flt_dump_kws(err); + indent_msg(err, 4); + memprintf(err, "'%s' : unknown keyword '%s'.%s%s", + args[0], args[cur_arg], + err && *err ? " Registered keywords :" : "", err && *err ? *err : ""); + goto error; + } + if (*args[cur_arg]) { + memprintf(err, "'%s %s' : unknown keyword '%s'.", + args[0], args[1], args[cur_arg]); + goto error; + } + if (fconf->ops == NULL) { + memprintf(err, "'%s %s' : no callbacks defined.", + args[0], args[1]); + goto error; + } + + LIST_APPEND(&curpx->filter_configs, &fconf->list); + } + return 0; + + error: + free(fconf); + return -1; + + +} + +/* + * Calls 'init' callback for all filters attached to a proxy. This happens after + * the configuration parsing. Filters can finish to fill their config. Returns + * (ERR_ALERT|ERR_FATAL) if an error occurs, 0 otherwise. 
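+ *
+ * A filter's ->init callback typically completes its configuration at
+ * this point, e.g. (hypothetical sketch; <struct my_conf> is made up):
+ *
+ *    static int my_flt_init(struct proxy *px, struct flt_conf *fconf)
+ *    {
+ *        struct my_conf *conf = fconf->conf;
+ *
+ *        conf->px = px;     // remember the proxy for later use
+ *        return 0;          // a negative value aborts startup
+ *    }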
+ */
+static int
+flt_init(struct proxy *proxy)
+{
+	struct flt_conf *fconf;
+
+	list_for_each_entry(fconf, &proxy->filter_configs, list) {
+		if (fconf->ops->init && fconf->ops->init(proxy, fconf) < 0)
+			return ERR_ALERT|ERR_FATAL;
+	}
+	return 0;
+}
+
+/*
+ * Calls 'init_per_thread' callback for all filters attached to a proxy, for
+ * each thread. This happens after the thread creation. Filters can finish
+ * filling their config. Returns (ERR_ALERT|ERR_FATAL) if an error occurs, 0
+ * otherwise.
+ */
+static int
+flt_init_per_thread(struct proxy *proxy)
+{
+	struct flt_conf *fconf;
+
+	list_for_each_entry(fconf, &proxy->filter_configs, list) {
+		if (fconf->ops->init_per_thread && fconf->ops->init_per_thread(proxy, fconf) < 0)
+			return ERR_ALERT|ERR_FATAL;
+	}
+	return 0;
+}
+
+/* Calls flt_init() for all proxies, see above */
+static int
+flt_init_all()
+{
+	struct proxy *px;
+	int err_code = ERR_NONE;
+
+	for (px = proxies_list; px; px = px->next) {
+		if (px->flags & (PR_FL_DISABLED|PR_FL_STOPPED))
+			continue;
+
+		err_code |= flt_init(px);
+		if (err_code & (ERR_ABORT|ERR_FATAL)) {
+			ha_alert("Failed to initialize filters for proxy '%s'.\n",
+				 px->id);
+			return err_code;
+		}
+	}
+	return 0;
+}
+
+/* Calls flt_init_per_thread() for all proxies, see above. Be careful here, it
+ * returns 0 if an error occurred. This is the opposite of flt_init_all. */
+static int
+flt_init_all_per_thread()
+{
+	struct proxy *px;
+	int err_code = 0;
+
+	for (px = proxies_list; px; px = px->next) {
+		if (px->flags & (PR_FL_DISABLED|PR_FL_STOPPED))
+			continue;
+
+		err_code = flt_init_per_thread(px);
+		if (err_code & (ERR_ABORT|ERR_FATAL)) {
+			ha_alert("Failed to initialize filters for proxy '%s' for thread %u.\n",
+				 px->id, tid);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/*
+ * Calls 'check' callback for all filters attached to a proxy. This happens
+ * after the configuration parsing but before filters initialization. Returns
+ * the number of encountered errors.
+ */
+int
+flt_check(struct proxy *proxy)
+{
+	struct flt_conf *fconf;
+	int err = 0;
+
+	err += check_implicit_http_comp_flt(proxy);
+	list_for_each_entry(fconf, &proxy->filter_configs, list) {
+		if (fconf->ops->check)
+			err += fconf->ops->check(proxy, fconf);
+	}
+	return err;
+}
+
+/*
+ * Calls 'deinit' callback for all filters attached to a proxy. This happens
+ * when HAProxy is stopped.
+ */
+void
+flt_deinit(struct proxy *proxy)
+{
+	struct flt_conf *fconf, *back;
+
+	list_for_each_entry_safe(fconf, back, &proxy->filter_configs, list) {
+		if (fconf->ops->deinit)
+			fconf->ops->deinit(proxy, fconf);
+		LIST_DELETE(&fconf->list);
+		free(fconf);
+	}
+}
+
+/*
+ * Calls 'deinit_per_thread' callback for all filters attached to a proxy,
+ * for each thread. This happens before exiting a thread.
+ */
+void
+flt_deinit_per_thread(struct proxy *proxy)
+{
+	struct flt_conf *fconf, *back;
+
+	list_for_each_entry_safe(fconf, back, &proxy->filter_configs, list) {
+		if (fconf->ops->deinit_per_thread)
+			fconf->ops->deinit_per_thread(proxy, fconf);
+	}
+}
+
+
+/* Calls flt_deinit_per_thread() for all proxies, see above */
+static void
+flt_deinit_all_per_thread()
+{
+	struct proxy *px;
+
+	for (px = proxies_list; px; px = px->next)
+		flt_deinit_per_thread(px);
+}
+
+/* Attaches a filter to a stream. Returns -1 if an error occurs, 0 otherwise.
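+ *
+ * Note that the ->attach callback may veto the attachment for a given
+ * stream: returning 0 skips the filter without reporting an error, as in
+ * this hypothetical sketch (want_to_filter() is made up):
+ *
+ *    static int my_flt_attach(struct stream *s, struct filter *f)
+ *    {
+ *        return want_to_filter(s) ? 1 : 0;
+ *    }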
*/ +static int +flt_stream_add_filter(struct stream *s, struct flt_conf *fconf, unsigned int flags) +{ + struct filter *f; + + if (IS_HTX_STRM(s) && !(fconf->flags & FLT_CFG_FL_HTX)) + return 0; + + f = pool_zalloc(pool_head_filter); + if (!f) /* not enough memory */ + return -1; + f->config = fconf; + f->flags |= flags; + + if (FLT_OPS(f)->attach) { + int ret = FLT_OPS(f)->attach(s, f); + if (ret <= 0) { + pool_free(pool_head_filter, f); + return ret; + } + } + + LIST_APPEND(&strm_flt(s)->filters, &f->list); + strm_flt(s)->flags |= STRM_FLT_FL_HAS_FILTERS; + return 0; +} + +/* + * Called when a stream is created. It attaches all frontend filters to the + * stream. Returns -1 if an error occurs, 0 otherwise. + */ +int +flt_stream_init(struct stream *s) +{ + struct flt_conf *fconf; + + memset(strm_flt(s), 0, sizeof(*strm_flt(s))); + LIST_INIT(&strm_flt(s)->filters); + list_for_each_entry(fconf, &strm_fe(s)->filter_configs, list) { + if (flt_stream_add_filter(s, fconf, 0) < 0) + return -1; + } + return 0; +} + +/* + * Called when a stream is closed or when analyze ends (For an HTTP stream, this + * happens after each request/response exchange). When analyze ends, backend + * filters are removed. When the stream is closed, all filters attached to the + * stream are removed. + */ +void +flt_stream_release(struct stream *s, int only_backend) +{ + struct filter *filter, *back; + + list_for_each_entry_safe(filter, back, &strm_flt(s)->filters, list) { + if (!only_backend || (filter->flags & FLT_FL_IS_BACKEND_FILTER)) { + if (FLT_OPS(filter)->detach) + FLT_OPS(filter)->detach(s, filter); + LIST_DELETE(&filter->list); + pool_free(pool_head_filter, filter); + } + } + if (LIST_ISEMPTY(&strm_flt(s)->filters)) + strm_flt(s)->flags &= ~STRM_FLT_FL_HAS_FILTERS; +} + +/* + * Calls 'stream_start' for all filters attached to a stream. This happens when + * the stream is created, just after calling flt_stream_init + * function. Returns -1 if an error occurs, 0 otherwise. + */ +int +flt_stream_start(struct stream *s) +{ + struct filter *filter; + + list_for_each_entry(filter, &strm_flt(s)->filters, list) { + if (FLT_OPS(filter)->stream_start && FLT_OPS(filter)->stream_start(s, filter) < 0) + return -1; + } + if (strm_li(s) && (strm_li(s)->bind_conf->analysers & AN_REQ_FLT_START_FE)) { + s->req.flags |= CF_FLT_ANALYZE; + s->req.analysers |= AN_REQ_FLT_END; + } + return 0; +} + +/* + * Calls 'stream_stop' for all filters attached to a stream. This happens when + * the stream is stopped, just before calling flt_stream_release function. + */ +void +flt_stream_stop(struct stream *s) +{ + struct filter *filter; + + list_for_each_entry(filter, &strm_flt(s)->filters, list) { + if (FLT_OPS(filter)->stream_stop) + FLT_OPS(filter)->stream_stop(s, filter); + } +} + +/* + * Calls 'check_timeouts' for all filters attached to a stream. This happens when + * the stream is woken up because of expired timer. + */ +void +flt_stream_check_timeouts(struct stream *s) +{ + struct filter *filter; + + list_for_each_entry(filter, &strm_flt(s)->filters, list) { + if (FLT_OPS(filter)->check_timeouts) + FLT_OPS(filter)->check_timeouts(s, filter); + } +} + +/* + * Called when a backend is set for a stream. If the frontend and the backend + * are not the same, this function attaches all backend filters to the + * stream. Returns -1 if an error occurs, 0 otherwise. 
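+ *
+ * Filters may hook this step through ->stream_set_backend, for example to
+ * record the elected backend (hypothetical sketch; <struct my_state> is a
+ * made-up filter context type):
+ *
+ *    static int my_flt_set_be(struct stream *s, struct filter *f,
+ *                             struct proxy *be)
+ *    {
+ *        ((struct my_state *)f->ctx)->be = be;
+ *        return 0;                 // a negative value reports an error
+ *    }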
+ */
+int
+flt_set_stream_backend(struct stream *s, struct proxy *be)
+{
+	struct flt_conf *fconf;
+	struct filter *filter;
+
+	if (strm_fe(s) == be)
+		goto end;
+
+	list_for_each_entry(fconf, &be->filter_configs, list) {
+		if (flt_stream_add_filter(s, fconf, FLT_FL_IS_BACKEND_FILTER) < 0)
+			return -1;
+	}
+
+  end:
+	list_for_each_entry(filter, &strm_flt(s)->filters, list) {
+		if (FLT_OPS(filter)->stream_set_backend &&
+		    FLT_OPS(filter)->stream_set_backend(s, filter, be) < 0)
+			return -1;
+	}
+	if (be->be_req_ana & AN_REQ_FLT_START_BE) {
+		s->req.flags |= CF_FLT_ANALYZE;
+		s->req.analysers |= AN_REQ_FLT_END;
+	}
+	if ((strm_fe(s)->fe_rsp_ana | be->be_rsp_ana) & (AN_RES_FLT_START_FE|AN_RES_FLT_START_BE)) {
+		s->res.flags |= CF_FLT_ANALYZE;
+		s->res.analysers |= AN_RES_FLT_END;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Calls 'http_end' callback for all filters attached to a stream. All filters
+ * are called here, but only if there is at least one "data" filter. This
+ * function is called when all data were parsed and forwarded. 'http_end'
+ * callback is resumable, so this function returns a negative value if an error
+ * occurs, 0 if it needs to wait for some reason, any other value otherwise.
+ */
+int
+flt_http_end(struct stream *s, struct http_msg *msg)
+{
+	unsigned long long *strm_off = &FLT_STRM_OFF(s, msg->chn);
+	unsigned int offset = 0;
+	int ret = 1;
+
+	DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_FLT_ANA, s, s->txn, msg);
+	RESUME_FILTER_LOOP(s, msg->chn) {
+		unsigned long long flt_off = FLT_OFF(filter, msg->chn);
+		offset = flt_off - *strm_off;
+
+		/* Call http_end for data filters only. But the filter offset is
+		 * still valid for all filters.
+		 */
+		if (!IS_DATA_FILTER(filter, msg->chn))
+			continue;
+
+		if (FLT_OPS(filter)->http_end) {
+			DBG_TRACE_DEVEL(FLT_ID(filter), STRM_EV_HTTP_ANA|STRM_EV_FLT_ANA, s);
+			ret = FLT_OPS(filter)->http_end(s, filter, msg);
+			if (ret <= 0)
+				BREAK_EXECUTION(s, msg->chn, end);
+		}
+	} RESUME_FILTER_END;
+
+	c_adv(msg->chn, offset);
+	*strm_off += offset;
+
+end:
+	DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_FLT_ANA, s);
+	return ret;
+}
+
+/*
+ * Calls 'http_reset' callback for all filters attached to a stream. This
+ * happens when a 100-continue response is received.
+ */
+void
+flt_http_reset(struct stream *s, struct http_msg *msg)
+{
+	struct filter *filter;
+
+	DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_FLT_ANA, s, s->txn, msg);
+	list_for_each_entry(filter, &strm_flt(s)->filters, list) {
+		if (FLT_OPS(filter)->http_reset) {
+			DBG_TRACE_DEVEL(FLT_ID(filter), STRM_EV_HTTP_ANA|STRM_EV_FLT_ANA, s);
+			FLT_OPS(filter)->http_reset(s, filter, msg);
+		}
+	}
+	DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_FLT_ANA, s);
+}
+
+/*
+ * Calls 'http_reply' callback for all filters attached to a stream when
+ * HAProxy decides to stop the HTTP message processing.
+ */
+void
+flt_http_reply(struct stream *s, short status, const struct buffer *msg)
+{
+	struct filter *filter;
+
+	DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_FLT_ANA, s, s->txn, msg);
+	list_for_each_entry(filter, &strm_flt(s)->filters, list) {
+		if (FLT_OPS(filter)->http_reply) {
+			DBG_TRACE_DEVEL(FLT_ID(filter), STRM_EV_HTTP_ANA|STRM_EV_FLT_ANA, s);
+			FLT_OPS(filter)->http_reply(s, filter, status, msg);
+		}
+	}
+	DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_FLT_ANA, s);
+}
+
+/*
+ * Calls 'http_payload' callback for all "data" filters attached to a
+ * stream.
This function is called when some data can be forwarded in the + * AN_REQ_HTTP_XFER_BODY and AN_RES_HTTP_XFER_BODY analyzers. It takes care to + * update the filters and the stream offset to be sure that a filter cannot + * forward more data than its predecessors. A filter can choose to not forward + * all data. Returns a negative value if an error occurs, else the number of + * forwarded bytes. + */ +int +flt_http_payload(struct stream *s, struct http_msg *msg, unsigned int len) +{ + struct filter *filter; + struct htx *htx; + unsigned long long *strm_off = &FLT_STRM_OFF(s, msg->chn); + unsigned int out = co_data(msg->chn); + int ret, data; + + strm_flt(s)->flags &= ~STRM_FLT_FL_HOLD_HTTP_HDRS; + + ret = data = len - out; + DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_FLT_ANA, s, s->txn, msg); + list_for_each_entry(filter, &strm_flt(s)->filters, list) { + unsigned long long *flt_off = &FLT_OFF(filter, msg->chn); + unsigned int offset = *flt_off - *strm_off; + + /* Call http_payload for filters only. Forward all data for + * others and update the filter offset + */ + if (!IS_DATA_FILTER(filter, msg->chn)) { + *flt_off += data - offset; + continue; + } + + if (FLT_OPS(filter)->http_payload) { + DBG_TRACE_DEVEL(FLT_ID(filter), STRM_EV_HTTP_ANA|STRM_EV_FLT_ANA, s); + ret = FLT_OPS(filter)->http_payload(s, filter, msg, out + offset, data - offset); + if (ret < 0) + goto end; + data = ret + *flt_off - *strm_off; + *flt_off += ret; + } + } + + /* If nothing was forwarded yet, we take care to hold the headers if + * following conditions are met : + * + * - *strm_off == 0 (nothing forwarded yet) + * - ret == 0 (no data forwarded at all on this turn) + * - STRM_FLT_FL_HOLD_HTTP_HDRS flag set (at least one filter want to hold the headers) + * + * Be careful, STRM_FLT_FL_HOLD_HTTP_HDRS is removed before each http_payload loop. + * Thus, it must explicitly be set when necessary. We must do that to hold the headers + * when there is no payload. + */ + if (!ret && !*strm_off && (strm_flt(s)->flags & STRM_FLT_FL_HOLD_HTTP_HDRS)) + goto end; + + ret = data; + *strm_off += ret; + end: + htx = htxbuf(&msg->chn->buf); + if (msg->flags & HTTP_MSGF_XFER_LEN) + htx->extra = 0; + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_FLT_ANA, s); + return ret; +} + +/* + * Calls 'channel_start_analyze' callback for all filters attached to a + * stream. This function is called when we start to analyze a request or a + * response. For frontend filters, it is called before all other analyzers. For + * backend ones, it is called before all backend + * analyzers. 'channel_start_analyze' callback is resumable, so this function + * returns 0 if an error occurs or if it needs to wait, any other value + * otherwise. + */ +int +flt_start_analyze(struct stream *s, struct channel *chn, unsigned int an_bit) +{ + int ret = 1; + + DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_FLT_ANA, s); + + /* If this function is called, this means there is at least one filter, + * so we do not need to check the filter list's emptiness. */ + + /* Set flag on channel to tell that the channel is filtered */ + chn->flags |= CF_FLT_ANALYZE; + chn->analysers |= ((chn->flags & CF_ISRESP) ? 
					   AN_RES_FLT_END : AN_REQ_FLT_END);
+
+	RESUME_FILTER_LOOP(s, chn) {
+		if (!(chn->flags & CF_ISRESP)) {
+			if (an_bit == AN_REQ_FLT_START_BE &&
+			    !(filter->flags & FLT_FL_IS_BACKEND_FILTER))
+				continue;
+		}
+		else {
+			if (an_bit == AN_RES_FLT_START_BE &&
+			    !(filter->flags & FLT_FL_IS_BACKEND_FILTER))
+				continue;
+		}
+
+		FLT_OFF(filter, chn) = 0;
+		if (FLT_OPS(filter)->channel_start_analyze) {
+			DBG_TRACE_DEVEL(FLT_ID(filter), STRM_EV_FLT_ANA, s);
+			ret = FLT_OPS(filter)->channel_start_analyze(s, filter, chn);
+			if (ret <= 0)
+				BREAK_EXECUTION(s, chn, end);
+		}
+	} RESUME_FILTER_END;
+
+ end:
+	ret = handle_analyzer_result(s, chn, an_bit, ret);
+	DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_FLT_ANA, s);
+	return ret;
+}
+
+/*
+ * Calls 'channel_pre_analyze' callback for all filters attached to a
+ * stream. This function is called BEFORE each analyzer attached to a channel,
+ * except analyzers responsible for data sending. 'channel_pre_analyze'
+ * callback is resumable, so this function returns 0 if an error occurs or if it
+ * needs to wait, any other value otherwise.
+ *
+ * Note this function can be called many times for the same analyzer. In fact,
+ * it is called until the analyzer finishes its processing.
+ */
+int
+flt_pre_analyze(struct stream *s, struct channel *chn, unsigned int an_bit)
+{
+	int ret = 1;
+
+	DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_FLT_ANA, s);
+
+	RESUME_FILTER_LOOP(s, chn) {
+		if (FLT_OPS(filter)->channel_pre_analyze && (filter->pre_analyzers & an_bit)) {
+			DBG_TRACE_DEVEL(FLT_ID(filter), STRM_EV_FLT_ANA, s);
+			ret = FLT_OPS(filter)->channel_pre_analyze(s, filter, chn, an_bit);
+			if (ret <= 0)
+				BREAK_EXECUTION(s, chn, check_result);
+			filter->pre_analyzers &= ~an_bit;
+		}
+	} RESUME_FILTER_END;
+
+ check_result:
+	ret = handle_analyzer_result(s, chn, 0, ret);
+	DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_FLT_ANA, s);
+	return ret;
+}
+
+/*
+ * Calls 'channel_post_analyze' callback for all filters attached to a
+ * stream. This function is called AFTER each analyzer attached to a channel,
+ * except analyzers responsible for data sending. 'channel_post_analyze'
+ * callback is NOT resumable, so this function returns 0 if an error occurs,
+ * any other value otherwise.
+ *
+ * Here, AFTER means when the analyzer finishes its processing.
+ */
+int
+flt_post_analyze(struct stream *s, struct channel *chn, unsigned int an_bit)
+{
+	struct filter *filter;
+	int ret = 1;
+
+	DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_FLT_ANA, s);
+
+	list_for_each_entry(filter, &strm_flt(s)->filters, list) {
+		if (FLT_OPS(filter)->channel_post_analyze && (filter->post_analyzers & an_bit)) {
+			DBG_TRACE_DEVEL(FLT_ID(filter), STRM_EV_FLT_ANA, s);
+			ret = FLT_OPS(filter)->channel_post_analyze(s, filter, chn, an_bit);
+			if (ret < 0)
+				break;
+			filter->post_analyzers &= ~an_bit;
+		}
+	}
+	ret = handle_analyzer_result(s, chn, 0, ret);
+	DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_FLT_ANA, s);
+	return ret;
+}
+
+/*
+ * This function is the AN_REQ/RES_FLT_HTTP_HDRS analyzer, used to filter the
+ * HTTP headers of a request or a response. Returns 0 if an error occurs or if
+ * it needs to wait, any other value otherwise.
+ */
+int
+flt_analyze_http_headers(struct stream *s, struct channel *chn, unsigned int an_bit)
+{
+	struct http_msg *msg;
+	int ret = 1;
+
+	msg = ((chn->flags & CF_ISRESP) ?
&s->txn->rsp : &s->txn->req); + DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_FLT_ANA, s, s->txn, msg); + + RESUME_FILTER_LOOP(s, chn) { + if (FLT_OPS(filter)->http_headers) { + DBG_TRACE_DEVEL(FLT_ID(filter), STRM_EV_HTTP_ANA|STRM_EV_FLT_ANA, s); + ret = FLT_OPS(filter)->http_headers(s, filter, msg); + if (ret <= 0) + BREAK_EXECUTION(s, chn, check_result); + } + } RESUME_FILTER_END; + + if (HAS_DATA_FILTERS(s, chn)) { + size_t data = http_get_hdrs_size(htxbuf(&chn->buf)); + struct filter *f; + + list_for_each_entry(f, &strm_flt(s)->filters, list) + FLT_OFF(f, chn) = data; + } + + check_result: + ret = handle_analyzer_result(s, chn, an_bit, ret); + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_FLT_ANA, s); + return ret; +} + +/* + * Calls 'channel_end_analyze' callback for all filters attached to a + * stream. This function is called when we stop to analyze a request or a + * response. It is called after all other analyzers. 'channel_end_analyze' + * callback is resumable, so this function returns 0 if an error occurs or if it + * needs to wait, any other value otherwise. + */ +int +flt_end_analyze(struct stream *s, struct channel *chn, unsigned int an_bit) +{ + int ret = 1; + + DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_FLT_ANA, s); + + /* Check if all filters attached on the stream have finished their + * processing on this channel. */ + if (!(chn->flags & CF_FLT_ANALYZE)) + goto sync; + + RESUME_FILTER_LOOP(s, chn) { + FLT_OFF(filter, chn) = 0; + unregister_data_filter(s, chn, filter); + + if (FLT_OPS(filter)->channel_end_analyze) { + DBG_TRACE_DEVEL(FLT_ID(filter), STRM_EV_FLT_ANA, s); + ret = FLT_OPS(filter)->channel_end_analyze(s, filter, chn); + if (ret <= 0) + BREAK_EXECUTION(s, chn, end); + } + } RESUME_FILTER_END; + + end: + /* We don't remove yet this analyzer because we need to synchronize the + * both channels. So here, we just remove the flag CF_FLT_ANALYZE. */ + ret = handle_analyzer_result(s, chn, 0, ret); + if (ret) { + chn->flags &= ~CF_FLT_ANALYZE; + + /* Pretend there is an activity on both channels. Flag on the + * current one will be automatically removed, so only the other + * one will remain. This is a way to be sure that + * 'channel_end_analyze' callback will have a chance to be + * called at least once for the other side to finish the current + * processing. Of course, this is the filter responsibility to + * wakeup the stream if it choose to loop on this callback. */ + s->req.flags |= CF_WAKE_ONCE; + s->res.flags |= CF_WAKE_ONCE; + } + + + sync: + /* Now we can check if filters have finished their work on the both + * channels */ + if (!(s->req.flags & CF_FLT_ANALYZE) && !(s->res.flags & CF_FLT_ANALYZE)) { + /* Sync channels by removing this analyzer for the both channels */ + s->req.analysers &= ~AN_REQ_FLT_END; + s->res.analysers &= ~AN_RES_FLT_END; + + /* Remove backend filters from the list */ + flt_stream_release(s, 1); + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_FLT_ANA, s); + } + else { + DBG_TRACE_DEVEL("waiting for sync", STRM_EV_STRM_ANA|STRM_EV_FLT_ANA, s); + } + return ret; +} + + +/* + * Calls 'tcp_payload' callback for all "data" filters attached to a + * stream. This function is called when some data can be forwarded in the + * AN_REQ_FLT_XFER_BODY and AN_RES_FLT_XFER_BODY analyzers. It takes care to + * update the filters and the stream offset to be sure that a filter cannot + * forward more data than its predecessors. A filter can choose to not forward + * all data. 
Returns a negative value if an error occurs, else the number of
+ * forwarded bytes.
+ */
+int
+flt_tcp_payload(struct stream *s, struct channel *chn, unsigned int len)
+{
+	struct filter *filter;
+	unsigned long long *strm_off = &FLT_STRM_OFF(s, chn);
+	unsigned int out = co_data(chn);
+	int ret, data;
+
+	ret = data = len - out;
+	DBG_TRACE_ENTER(STRM_EV_TCP_ANA|STRM_EV_FLT_ANA, s);
+	list_for_each_entry(filter, &strm_flt(s)->filters, list) {
+		unsigned long long *flt_off = &FLT_OFF(filter, chn);
+		unsigned int offset = *flt_off - *strm_off;
+
+		/* Call tcp_payload for data filters only. Forward all data for
+		 * the others and update the filter offset
+		 */
+		if (!IS_DATA_FILTER(filter, chn)) {
+			*flt_off += data - offset;
+			continue;
+		}
+
+		if (FLT_OPS(filter)->tcp_payload) {
+
+			DBG_TRACE_DEVEL(FLT_ID(filter), STRM_EV_TCP_ANA|STRM_EV_FLT_ANA, s);
+			ret = FLT_OPS(filter)->tcp_payload(s, filter, chn, out + offset, data - offset);
+			if (ret < 0)
+				goto end;
+			data = ret + *flt_off - *strm_off;
+			*flt_off += ret;
+		}
+	}
+
+	/* Only forward data if the last filter decides to forward something */
+	if (ret > 0) {
+		ret = data;
+		*strm_off += ret;
+	}
+ end:
+	DBG_TRACE_LEAVE(STRM_EV_TCP_ANA|STRM_EV_FLT_ANA, s);
+	return ret;
+}
+
+/*
+ * Called when TCP data must be filtered on a channel. This function is the
+ * AN_REQ/RES_FLT_XFER_DATA analyzer. When called, it is responsible for
+ * forwarding data when the proxy is not in HTTP mode. Behind the scenes, it
+ * calls consecutively 'tcp_data' and 'tcp_forward_data' callbacks for all
+ * "data" filters attached to a stream. Returns 0 if an error occurs or if it
+ * needs to wait, any other value otherwise.
+ */
+int
+flt_xfer_data(struct stream *s, struct channel *chn, unsigned int an_bit)
+{
+	unsigned int len;
+	int ret = 1;
+
+	DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_TCP_ANA|STRM_EV_FLT_ANA, s);
+
+	/* If there are no "data" filters, we do nothing */
+	if (!HAS_DATA_FILTERS(s, chn))
+		goto end;
+
+	if (s->flags & SF_HTX) {
+		struct htx *htx = htxbuf(&chn->buf);
+		len = htx->data;
+	}
+	else
+		len = c_data(chn);
+
+	ret = flt_tcp_payload(s, chn, len);
+	if (ret < 0)
+		goto end;
+	c_adv(chn, ret);
+
+	/* Stop waiting for data if:
+	 *  - the output is closed
+	 *  - the input is closed and no data is pending
+	 *  - there is a READ/WRITE timeout
+	 */
+	if (chn_cons(chn)->flags & SC_FL_SHUT_DONE) {
+		ret = 1;
+		goto end;
+	}
+	if (chn_prod(chn)->flags & (SC_FL_ABRT_DONE|SC_FL_EOS)) {
+		if (((s->flags & SF_HTX) && htx_is_empty(htxbuf(&chn->buf))) || c_empty(chn)) {
+			ret = 1;
+			goto end;
+		}
+	}
+	if (chn->flags & (CF_READ_TIMEOUT|CF_WRITE_TIMEOUT)) {
+		ret = 1;
+		goto end;
+	}
+
+	/* Wait for data */
+	DBG_TRACE_DEVEL("waiting for more data", STRM_EV_STRM_ANA|STRM_EV_TCP_ANA|STRM_EV_FLT_ANA, s);
+	return 0;
+ end:
+	/* Terminate the data filtering. If <ret> is negative, an error was
+	 * encountered during the filtering. */
+	ret = handle_analyzer_result(s, chn, an_bit, ret);
+	DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_TCP_ANA|STRM_EV_FLT_ANA, s);
+	return ret;
+}
+
+/*
+ * Handles the result of the filters' analyzers. It returns 0 if an error
+ * occurs or if it needs to wait, any other value otherwise.
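+ *
+ * As a summary of the convention shared by the resumable hooks above, the
+ * <ret> argument is interpreted as follows:
+ *
+ *    ret < 0  => error: the stream is aborted
+ *    ret == 0 => wait:  the analyzer will be called again later
+ *    ret > 0  => done:  the analyzer bit is cleared from the channel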
+ */
+static int
+handle_analyzer_result(struct stream *s, struct channel *chn,
+		       unsigned int an_bit, int ret)
+{
+	if (ret < 0)
+		goto return_bad_req;
+	else if (!ret)
+		goto wait;
+
+	/* End of job, return OK */
+	if (an_bit) {
+		chn->analysers &= ~an_bit;
+		chn->analyse_exp = TICK_ETERNITY;
+	}
+	return 1;
+
+ return_bad_req:
+	/* An error occurred */
+	if (IS_HTX_STRM(s)) {
+		http_set_term_flags(s);
+
+		if (s->txn->status > 0)
+			http_reply_and_close(s, s->txn->status, NULL);
+		else {
+			s->txn->status = (!(chn->flags & CF_ISRESP)) ? 400 : 502;
+			http_reply_and_close(s, s->txn->status, http_error_message(s));
+		}
+	}
+	else {
+		sess_set_term_flags(s);
+		stream_retnclose(s, NULL);
+	}
+
+	if (!(chn->flags & CF_ISRESP))
+		s->req.analysers &= AN_REQ_FLT_END;
+	else
+		s->res.analysers &= AN_RES_FLT_END;
+
+
+	DBG_TRACE_DEVEL("leaving on error", STRM_EV_FLT_ANA|STRM_EV_FLT_ERR, s);
+	return 0;
+
+ wait:
+	if (!(chn->flags & CF_ISRESP))
+		channel_dont_connect(chn);
+	DBG_TRACE_DEVEL("waiting for more data", STRM_EV_FLT_ANA, s);
+	return 0;
+}
+
+
+/* Note: must not be declared <const> as its list will be overwritten.
+ * Please take care of keeping this list alphabetically sorted, doing so helps
+ * all code contributors.
+ * Optional keywords are also declared with a NULL ->parse() function so that
+ * the config parser can report an appropriate error when a known keyword was
+ * not enabled. */
+static struct cfg_kw_list cfg_kws = {ILH, {
+		{ CFG_LISTEN, "filter", parse_filter },
+		{ 0, NULL, NULL },
+	}
+};
+
+INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws);
+
+REGISTER_POST_CHECK(flt_init_all);
+REGISTER_PER_THREAD_INIT(flt_init_all_per_thread);
+REGISTER_PER_THREAD_DEINIT(flt_deinit_all_per_thread);
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/src/fix.c b/src/fix.c
new file mode 100644
index 0000000..abf3119
--- /dev/null
+++ b/src/fix.c
@@ -0,0 +1,264 @@
+/*
+ * Financial Information eXchange Protocol
+ *
+ * Copyright 2020 Baptiste Assmann <bedis9@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <haproxy/intops.h>
+#include <haproxy/fix.h>
+/*
+ * Return the corresponding numerical tag id if <str> looks like a valid FIX
+ * protocol tag ID. Otherwise, 0 is returned (0 is an invalid id).
+ *
+ * If <version> is given, it must be one of the defined FIX version strings
+ * (see FIX_X_Y macros). In this case, the function will also check tag ID
+ * ranges. If no <version> is provided, any strictly positive integer is valid.
+ * + * tag ID range depends on FIX protocol version: + * - FIX.4.0: 1-140 + * - FIX.4.1: 1-211 + * - FIX.4.2: 1-446 + * - FIX.4.3: 1-659 + * - FIX.4.4: 1-956 + * - FIX.5.0: 1-1139 + * - FIX.5.0SP1: 1-1426 + * - FIX.5.0SP2: 1-1621 + * range 10000 to 19999 is for "user defined tags" + */ +unsigned int fix_check_id(const struct ist str, const struct ist version) { + const char *s, *end; + unsigned int ret; + + s = istptr(str); + end = istend(str); + ret = read_uint(&s, end); + + /* we did not consume all characters from <str>, this is an error */ + if (s != end) + return 0; + + /* field ID can't be 0 */ + if (ret == 0) + return 0; + + /* we can leave now if version was not provided */ + if (!isttest(version)) + return ret; + + /* we can leave now if this is a "user defined tag id" */ + if (ret >= 10000 && ret <= 19999) + return ret; + + /* now perform checking per FIX version */ + if (istissame(FIX_4_0, version) && (ret <= 140)) + return ret; + else if (istissame(FIX_4_1, version) && (ret <= 211)) + return ret; + else if (istissame(FIX_4_2, version) && (ret <= 446)) + return ret; + else if (istissame(FIX_4_3, version) && (ret <= 659)) + return ret; + else if (istissame(FIX_4_4, version) && (ret <= 956)) + return ret; + /* version string is the same for all 5.0 versions, so we can only take + * into consideration the biggest range + */ + else if (istissame(FIX_5_0, version) && (ret <= 1621)) + return ret; + + return 0; +} + +/* + * Parse a FIX message <msg> and performs following sanity checks: + * + * - checks tag ids and values are not empty + * - checks tag ids are numerical value + * - checks the first tag is BeginString with a valid version + * - checks the second tag is BodyLength with the right body length + * - checks the third tag is MsgType + * - checks the last tag is CheckSum with a valid checksum + * + * Returns: + * FIX_INVALID_MESSAGE if the message is invalid + * FIX_NEED_MORE_DATA if we need more data to fully validate the message + * FIX_VALID_MESSAGE if the message looks valid + */ +int fix_validate_message(const struct ist msg) +{ + struct ist parser, version; + unsigned int tagnum, bodylen; + unsigned char checksum; + char *body; + int ret = FIX_INVALID_MESSAGE; + + if (istlen(msg) < FIX_MSG_MINSIZE) { + ret = FIX_NEED_MORE_DATA; + goto end; + } + + /* parsing the whole message to compute the checksum and check all tag + * ids are properly set. Here we are sure to have the 2 first tags. Thus + * the version and the body length can be checked. + */ + parser = msg; + version = IST_NULL; + checksum = tagnum = bodylen = 0; + body = NULL; + while (istlen(parser) > 0) { + struct ist tag, value; + unsigned int tagid; + const char *p, *end; + + /* parse the tag ID and its value and perform first sanity checks */ + value = iststop(istfind(parser, '='), FIX_DELIMITER); + + /* end of value not found */ + if (istend(value) == istend(parser)) { + ret = FIX_NEED_MORE_DATA; + goto end; + } + /* empty tag or empty value are forbidden */ + if (istptr(parser) == istptr(value) ||!istlen(value)) + goto end; + + /* value points on '='. 
get the tag and skip '=' */
+		tag   = ist2(istptr(parser), istptr(value) - istptr(parser));
+		value = istnext(value);
+
+		/* Check the tag id */
+		tagid = fix_check_id(tag, version);
+		if (!tagid)
+			goto end;
+		tagnum++;
+
+		if (tagnum == 1) {
+			/* the first tag must be BeginString */
+			if (tagid != FIX_TAG_BeginString)
+				goto end;
+
+			version = fix_version(value);
+			if (!isttest(version))
+				goto end;
+		}
+		else if (tagnum == 2) {
+			/* the second tag must be BodyLength */
+			if (tagid != FIX_TAG_BodyLength)
+				goto end;
+
+			p = istptr(value);
+			end = istend(value);
+			bodylen = read_uint(&p, end);
+
+			/* we did not consume all characters from <str> or no body, this is an error.
+			 * There is at least the message type in the body.
+			 */
+			if (p != end || !bodylen)
+				goto end;
+
+			body = istend(value) + 1;
+		}
+		else if (tagnum == 3) {
+			/* the third tag must be MsgType */
+			if (tagid != FIX_TAG_MsgType)
+				goto end;
+		}
+		else if (tagnum > 3 && tagid == FIX_TAG_CheckSum) {
+			/* CheckSum tag should be the last one and is not taken into account
+			 * to compute the checksum itself and the body length. The value is
+			 * a three-octet representation of the checksum decimal value.
+			 */
+			if (bodylen != istptr(parser) - body)
+				goto end;
+
+			if (istlen(value) != 3)
+				goto end;
+			if (checksum != strl2ui(istptr(value), istlen(value)))
+				goto end;
+
+			/* End of the message, exit from the loop */
+			ret = FIX_VALID_MESSAGE;
+			goto end;
+		}
+
+		/* compute checksum of tag=value<delim> */
+		for (p = istptr(tag) ; p < istend(tag) ; ++p)
+			checksum += *p;
+		checksum += '=';
+		for (p = istptr(value) ; p < istend(value) ; ++p)
+			checksum += *p;
+		checksum += FIX_DELIMITER;
+
+		/* move the parser after the value and its delimiter */
+		parser = istadv(parser, istlen(tag) + istlen(value) + 2);
+	}
+
+	if (body) {
+		/* We started to read the body but we have not reached the checksum tag yet */
+		ret = FIX_NEED_MORE_DATA;
+	}
+
+  end:
+	return ret;
+}
+
+
+/*
+ * Iterate over a FIX message <msg> and return the value of <tagid>.
+ *
+ * Returns the corresponding value if <tagid> is found. If <tagid> is not found
+ * because more data are required, the message with a length set to 0 is
+ * returned. If <tagid> is not found in the message or if the message is
+ * invalid, IST_NULL is returned.
+ *
+ * Note: Only simple sanity checks are performed on tags and values (not empty).
+ *
+ * A tag looks like
+ * <tagid>=<value>FIX_DELIMITER, with <tagid> and <value> not empty.
+ */
+struct ist fix_tag_value(const struct ist msg, unsigned int tagid)
+{
+    struct ist parser, t, v;
+    unsigned int id;
+
+    parser = msg;
+    while (istlen(parser) > 0) {
+        v = iststop(istfind(parser, '='), FIX_DELIMITER);
+
+        /* delimiter not found, need more data */
+        if (istend(v) == istend(parser))
+            break;
+
+        /* empty tag or empty value, invalid */
+        if (istptr(parser) == istptr(v) || !istlen(v))
+            goto not_found_or_invalid;
+
+        t = ist2(istptr(parser), istptr(v) - istptr(parser));
+        v = istnext(v);
+
+        id = fix_check_id(t, IST_NULL);
+        if (!id)
+            goto not_found_or_invalid;
+        if (id == tagid) {
+            /* <tagid> found, return the corresponding value */
+            return v;
+        }
+
+        /* the CheckSum tag is the last one, no <tagid> found */
+        if (id == FIX_TAG_CheckSum)
+            goto not_found_or_invalid;
+
+        parser = istadv(parser, istlen(t) + istlen(v) + 2);
+    }
+    /* not enough data to find <tagid> */
+    return ist2(istptr(msg), 0);
+
+  not_found_or_invalid:
+    return IST_NULL;
+}
diff --git a/src/flt_bwlim.c b/src/flt_bwlim.c
new file mode 100644
index 0000000..66c2883
--- /dev/null
+++ b/src/flt_bwlim.c
@@ -0,0 +1,976 @@
+/*
+ * Bandwidth limitation filter.
+ *
+ * Copyright 2022 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <ctype.h>
+
+#include <haproxy/api.h>
+#include <haproxy/channel-t.h>
+#include <haproxy/filters.h>
+#include <haproxy/global.h>
+#include <haproxy/http_ana-t.h>
+#include <haproxy/http_rules.h>
+#include <haproxy/proxy.h>
+#include <haproxy/sample.h>
+#include <haproxy/stream.h>
+#include <haproxy/tcp_rules.h>
+#include <haproxy/time.h>
+#include <haproxy/tools.h>
+
+const char *bwlim_flt_id = "bandwidth limitation filter";
+
+struct flt_ops bwlim_ops;
+
+#define BWLIM_FL_NONE    0x00000000 /* For init purpose */
+#define BWLIM_FL_IN      0x00000001 /* Limit clients' uploads */
+#define BWLIM_FL_OUT     0x00000002 /* Limit clients' downloads */
+#define BWLIM_FL_SHARED  0x00000004 /* Limit shared between clients (using stick-tables) */
+
+#define BWLIM_ACT_LIMIT_EXPR   0x00000001
+#define BWLIM_ACT_LIMIT_CONST  0x00000002
+#define BWLIM_ACT_PERIOD_EXPR  0x00000004
+#define BWLIM_ACT_PERIOD_CONST 0x00000008
+
+struct bwlim_config {
+    struct proxy *proxy;
+    char *name;
+    unsigned int flags;
+    struct sample_expr *expr;
+    union {
+        char *n;
+        struct stktable *t;
+    } table;
+    unsigned int period;
+    unsigned int limit;
+    unsigned int min_size;
+};
+
+struct bwlim_state {
+    struct freq_ctr bytes_rate;
+    struct stksess *ts;
+    struct act_rule *rule;
+    unsigned int limit;
+    unsigned int period;
+    unsigned int exp;
+};
+
+
+/* Pool used to allocate bwlim_state structs */
+DECLARE_STATIC_POOL(pool_head_bwlim_state, "bwlim_state", sizeof(struct bwlim_state));
+
+
+/* Apply the bandwidth limitation of the filter <filter>. <len> is the maximum
+ * amount of data that the filter can forward. This function applies the
+ * limitation and returns what the stream is authorized to forward. Several
+ * limitations can be stacked.
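+ *
+ * Worked example (all numbers are hypothetical): with limit=100000 bytes,
+ * period=1000ms, min_size=1000 and 4 users sharing a stick-table entry, each
+ * user is granted about remain/4 bytes per call; on an overshoot of, say,
+ * 3000 bytes, the stream is paused for roughly
+ * (1000 + 3000) * 1000 * 4 / 100000 = 160ms before retrying.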
+ */ +static int bwlim_apply_limit(struct filter *filter, struct channel *chn, unsigned int len) +{ + struct bwlim_config *conf = FLT_CONF(filter); + struct bwlim_state *st = filter->ctx; + struct freq_ctr *bytes_rate; + unsigned int period, limit, remain, tokens, users; + unsigned int wait = 0; + int overshoot, ret = 0; + + /* Don't forward anything if there is nothing to forward or the waiting + * time is not expired + */ + if (!len || (tick_isset(st->exp) && !tick_is_expired(st->exp, now_ms))) + goto end; + + st->exp = TICK_ETERNITY; + ret = len; + if (conf->flags & BWLIM_FL_SHARED) { + void *ptr; + unsigned int type = ((conf->flags & BWLIM_FL_IN) ? STKTABLE_DT_BYTES_IN_RATE : STKTABLE_DT_BYTES_OUT_RATE); + + /* In shared mode, get a pointer on the stick table entry. it + * will be used to get the freq-counter. It is also used to get + * The number of users. + */ + ptr = stktable_data_ptr(conf->table.t, st->ts, type); + if (!ptr) + goto end; + + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &st->ts->lock); + bytes_rate = &stktable_data_cast(ptr, std_t_frqp); + period = conf->table.t->data_arg[type].u; + limit = conf->limit; + users = st->ts->ref_cnt; + } + else { + /* On per-stream mode, the freq-counter is private to the + * stream. Get it from the filter state. Rely on the custom + * limit/period if defined or use the default ones. In this mode, + * there is only one user. + */ + bytes_rate = &st->bytes_rate; + period = (st->period ? st->period : conf->period); + limit = (st->limit ? st->limit : conf->limit); + users = 1; + } + + /* Be sure the current rate does not exceed the limit over the current + * period. In this case, nothing is forwarded and the waiting time is + * computed to be sure to not retry too early. + * + * The test is used to avoid the initial burst. Otherwise, streams will + * consume the limit as fast as possible and will then be paused for + * long time. + */ + overshoot = freq_ctr_overshoot_period(bytes_rate, period, limit); + if (overshoot > 0) { + if (conf->flags & BWLIM_FL_SHARED) + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &st->ts->lock); + wait = div64_32((uint64_t)(conf->min_size + overshoot) * period * users, + limit); + st->exp = tick_add(now_ms, (wait ? wait : 1)); + ret = 0; + goto end; + } + + /* Get the allowed quota per user. */ + remain = freq_ctr_remain_period(bytes_rate, period, limit, 0); + tokens = div64_32((uint64_t)(remain + users - 1), users); + + if (tokens < len) { + /* The stream cannot forward all its data. But we will check if + * it can perform a small burst if the global quota is large + * enough. But, in this case, its waiting time will be + * increased accordingly. + */ + ret = tokens; + if (tokens < conf->min_size) { + ret = (chn_prod(chn)->flags & (SC_FL_EOI|SC_FL_EOS|SC_FL_ABRT_DONE)) + ? MIN(len, conf->min_size) + : conf->min_size; + + if (ret <= remain) + wait = div64_32((uint64_t)(ret - tokens) * period * users + limit - 1, limit); + else + ret = (limit < ret) ? remain : 0; + } + } + + /* At the end, update the freq-counter and compute the waiting time if + * the stream is limited + */ + update_freq_ctr_period(bytes_rate, period, ret); + if (ret < len) { + wait += next_event_delay_period(bytes_rate, period, limit, MIN(len - ret, conf->min_size * users)); + st->exp = tick_add(now_ms, (wait ? wait : 1)); + } + + if (conf->flags & BWLIM_FL_SHARED) + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &st->ts->lock); + + end: + chn->analyse_exp = tick_first((tick_is_expired(chn->analyse_exp, now_ms) ? 
TICK_ETERNITY : chn->analyse_exp), + st->exp); + return ret; +} + +/*************************************************************************** + * Hooks that manage the filter lifecycle (init/check/deinit) + **************************************************************************/ +/* Initialize the filter. Returns -1 on error, else 0. */ +static int bwlim_init(struct proxy *px, struct flt_conf *fconf) +{ + fconf->flags |= FLT_CFG_FL_HTX; + return 0; +} + +/* Free resources allocated by the bwlim filter. */ +static void bwlim_deinit(struct proxy *px, struct flt_conf *fconf) +{ + struct bwlim_config *conf = fconf->conf; + + if (conf) { + ha_free(&conf->name); + release_sample_expr(conf->expr); + conf->expr = NULL; + ha_free(&fconf->conf); + } +} + +/* Check configuration of a bwlim filter for a specified proxy. + * Return 1 on error, else 0. */ +static int bwlim_check(struct proxy *px, struct flt_conf *fconf) +{ + struct bwlim_config *conf = fconf->conf; + struct stktable *target; + + if (!(conf->flags & BWLIM_FL_SHARED)) + return 0; + + if (conf->table.n) + target = stktable_find_by_name(conf->table.n); + else + target = px->table; + + if (!target) { + ha_alert("Proxy %s : unable to find table '%s' referenced by bwlim filter '%s'", + px->id, conf->table.n ? conf->table.n : px->id, conf->name); + return 1; + } + + if ((conf->flags & BWLIM_FL_IN) && !target->data_ofs[STKTABLE_DT_BYTES_IN_RATE]) { + ha_alert("Proxy %s : stick-table '%s' uses a data type incompatible with bwlim filter '%s'." + " It must be 'bytes_in_rate'", + px->id, conf->table.n ? conf->table.n : px->id, conf->name); + return 1; + } + else if ((conf->flags & BWLIM_FL_OUT) && !target->data_ofs[STKTABLE_DT_BYTES_OUT_RATE]) { + ha_alert("Proxy %s : stick-table '%s' uses a data type incompatible with bwlim filter '%s'." + " It must be 'bytes_out_rate'", + px->id, conf->table.n ? conf->table.n : px->id, conf->name); + return 1; + } + + if (!stktable_compatible_sample(conf->expr, target->type)) { + ha_alert("Proxy %s : stick-table '%s' uses a key type incompatible with bwlim filter '%s'", + px->id, conf->table.n ? 
conf->table.n : px->id, conf->name);
+        return 1;
+    }
+    else {
+        if (!in_proxies_list(target->proxies_list, px)) {
+            px->next_stkt_ref = target->proxies_list;
+            target->proxies_list = px;
+        }
+        ha_free(&conf->table.n);
+        conf->table.t = target;
+    }
+
+    return 0;
+}
+
+/**************************************************************************
+ * Hooks to handle start/stop of streams
+ *************************************************************************/
+/* Called when a filter instance is created and attached to a stream */
+static int bwlim_attach(struct stream *s, struct filter *filter)
+{
+    struct bwlim_state *st;
+
+    st = pool_zalloc(pool_head_bwlim_state);
+    if (!st)
+        return -1;
+    filter->ctx = st;
+    return 1;
+}
+
+/* Called when a filter instance is detached from a stream, just before its
+ * destruction */
+static void bwlim_detach(struct stream *s, struct filter *filter)
+{
+    struct bwlim_config *conf = FLT_CONF(filter);
+    struct bwlim_state *st = filter->ctx;
+    struct stktable *t = conf->table.t;
+
+    if (!st)
+        return;
+
+    if (st->ts)
+        stktable_touch_local(t, st->ts, 1);
+
+    /* release the bwlim state */
+    pool_free(pool_head_bwlim_state, st);
+    filter->ctx = NULL;
+}
+
+/**************************************************************************
+ * Hooks to handle channels activity
+ *************************************************************************/
+
+/* Called when analyze ends for a given channel */
+static int bwlim_chn_end_analyze(struct stream *s, struct filter *filter, struct channel *chn)
+{
+    chn->analyse_exp = TICK_ETERNITY;
+    return 1;
+}
+
+
+/**************************************************************************
+ * Hooks to filter HTTP messages
+ *************************************************************************/
+static int bwlim_http_headers(struct stream *s, struct filter *filter, struct http_msg *msg)
+{
+    msg->chn->analyse_exp = TICK_ETERNITY;
+    return 1;
+}
+
+static int bwlim_http_payload(struct stream *s, struct filter *filter, struct http_msg *msg,
+                              unsigned int offset, unsigned int len)
+{
+    return bwlim_apply_limit(filter, msg->chn, len);
+}
+
+/**************************************************************************
+ * Hooks to filter TCP data
+ *************************************************************************/
+static int bwlim_tcp_payload(struct stream *s, struct filter *filter, struct channel *chn,
+                             unsigned int offset, unsigned int len)
+{
+    return bwlim_apply_limit(filter, chn, len);
+}
+
+/********************************************************************
+ * Functions that manage the filter initialization
+ ********************************************************************/
+struct flt_ops bwlim_ops = {
+    /* Manage bwlim filter, called for each filter declaration */
+    .init   = bwlim_init,
+    .deinit = bwlim_deinit,
+    .check  = bwlim_check,
+
+    /* Handle start/stop of streams */
+    .attach = bwlim_attach,
+    .detach = bwlim_detach,
+
+    /* Handle channels activity */
+    .channel_end_analyze = bwlim_chn_end_analyze,
+
+    /* Filter HTTP requests and responses */
+    .http_headers = bwlim_http_headers,
+    .http_payload = bwlim_http_payload,
+
+    /* Filter TCP data */
+    .tcp_payload = bwlim_tcp_payload,
+};
+
+/* Set a bandwidth limitation. It always returns ACT_RET_CONT. On error, the
+ * rule is ignored. First of all, it looks for the corresponding filter. Then,
+ * for a shared limitation, the stick-table entry is retrieved.
For a per-stream + * limitation, the custom limit and period are computed, if necessary. At the + * end, the filter is registered on the data filtering for the right channel + * (bwlim-in = request, bwlim-out = response). + */ +static enum act_return bwlim_set_limit(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct bwlim_config *conf = rule->arg.act.p[3]; + struct filter *filter; + struct bwlim_state *st = NULL; + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + int opt; + + list_for_each_entry(filter, &s->strm_flt.filters, list) { + if (FLT_ID(filter) == bwlim_flt_id && FLT_CONF(filter) == conf) { + st = filter->ctx; + break; + } + } + + if (!st) + goto end; + + switch (rule->from) { + case ACT_F_TCP_REQ_CNT: opt = SMP_OPT_DIR_REQ | SMP_OPT_FINAL; break; + case ACT_F_TCP_RES_CNT: opt = SMP_OPT_DIR_RES | SMP_OPT_FINAL; break; + case ACT_F_HTTP_REQ: opt = SMP_OPT_DIR_REQ | SMP_OPT_FINAL; break; + case ACT_F_HTTP_RES: opt = SMP_OPT_DIR_RES | SMP_OPT_FINAL; break; + default: + goto end; + } + + if (conf->flags & BWLIM_FL_SHARED) { + t = conf->table.t; + key = stktable_fetch_key(t, px, sess, s, opt, conf->expr, NULL); + if (!key) + goto end; + + ts = stktable_get_entry(t, key); + if (!ts) + goto end; + + st->ts = ts; + st->rule = rule; + } + else { + struct sample *smp; + + st->limit = 0; + st->period = 0; + if (rule->action & BWLIM_ACT_LIMIT_EXPR) { + smp = sample_fetch_as_type(px, sess, s, opt, rule->arg.act.p[1], SMP_T_SINT); + if (smp && smp->data.u.sint > 0) + st->limit = smp->data.u.sint; + } + else if (rule->action & BWLIM_ACT_LIMIT_CONST) + st->limit = (uintptr_t)rule->arg.act.p[1]; + + if (rule->action & BWLIM_ACT_PERIOD_EXPR) { + smp = sample_fetch_as_type(px, sess, s, opt, rule->arg.act.p[2], SMP_T_SINT); + if (smp && smp->data.u.sint > 0) + st->period = smp->data.u.sint; + } + else if (rule->action & BWLIM_ACT_PERIOD_CONST) + st->period = (uintptr_t)rule->arg.act.p[2]; + } + + st->exp = TICK_ETERNITY; + if (conf->flags & BWLIM_FL_IN) + register_data_filter(s, &s->req, filter); + else + register_data_filter(s, &s->res, filter); + + end: + return ACT_RET_CONT; +} + +/* Check function for "set-bandwidth-limit" action. It returns 1 on + * success. Otherwise, it returns 0 and <err> is filled. 
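+ *
+ * For instance (illustrative), a rule referencing a filter name that no
+ * "filter bwlim-in"/"filter bwlim-out" line declared, or one passing "limit"
+ * or "period" to a shared bwlim filter, makes this check fail at startup.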
+ */ +int check_bwlim_action(struct act_rule *rule, struct proxy *px, char **err) +{ + struct flt_conf *fconf; + struct bwlim_config *conf = NULL; + unsigned int where; + + list_for_each_entry(fconf, &px->filter_configs, list) { + conf = NULL; + if (fconf->id == bwlim_flt_id) { + conf = fconf->conf; + if (strcmp(rule->arg.act.p[0], conf->name) == 0) + break; + } + } + if (!conf) { + memprintf(err, "unable to find bwlim filter '%s' referenced by set-bandwidth-limit rule", + (char *)rule->arg.act.p[0]); + return 0; + } + + if ((conf->flags & BWLIM_FL_SHARED) && rule->arg.act.p[1]) { + memprintf(err, "set-bandwidth-limit rule cannot define a limit for a shared bwlim filter"); + return 0; + } + + if ((conf->flags & BWLIM_FL_SHARED) && rule->arg.act.p[2]) { + memprintf(err, "set-bandwidth-limit rule cannot define a period for a shared bwlim filter"); + return 0; + } + + where = 0; + if (px->cap & PR_CAP_FE) { + if (rule->from == ACT_F_TCP_REQ_CNT) + where |= SMP_VAL_FE_REQ_CNT; + else if (rule->from == ACT_F_HTTP_REQ) + where |= SMP_VAL_FE_HRQ_HDR; + else if (rule->from == ACT_F_TCP_RES_CNT) + where |= SMP_VAL_FE_RES_CNT; + else if (rule->from == ACT_F_HTTP_RES) + where |= SMP_VAL_FE_HRS_HDR; + } + if (px->cap & PR_CAP_BE) { + if (rule->from == ACT_F_TCP_REQ_CNT) + where |= SMP_VAL_BE_REQ_CNT; + else if (rule->from == ACT_F_HTTP_REQ) + where |= SMP_VAL_BE_HRQ_HDR; + else if (rule->from == ACT_F_TCP_RES_CNT) + where |= SMP_VAL_BE_RES_CNT; + else if (rule->from == ACT_F_HTTP_RES) + where |= SMP_VAL_BE_HRS_HDR; + } + + if ((rule->action & BWLIM_ACT_LIMIT_EXPR) && rule->arg.act.p[1]) { + struct sample_expr *expr = rule->arg.act.p[1]; + + if (!(expr->fetch->val & where)) { + memprintf(err, "set-bandwidth-limit rule uses a limit extracting information from '%s', none of which is available here", + sample_src_names(expr->fetch->use)); + return 0; + } + + if (rule->from == ACT_F_TCP_REQ_CNT && (px->cap & PR_CAP_FE)) { + if (!px->tcp_req.inspect_delay && !(expr->fetch->val & SMP_VAL_FE_SES_ACC)) { + ha_warning("%s '%s' : a 'tcp-request content set-bandwidth-limit*' rule explicitly depending on request" + " contents without any 'tcp-request inspect-delay' setting." + " This means that this rule will randomly find its contents. This can be fixed by" + " setting the tcp-request inspect-delay.\n", + proxy_type_str(px), px->id); + } + } + if (rule->from == ACT_F_TCP_RES_CNT && (px->cap & PR_CAP_BE)) { + if (!px->tcp_rep.inspect_delay && !(expr->fetch->val & SMP_VAL_BE_SRV_CON)) { + ha_warning("%s '%s' : a 'tcp-response content set-bandwidth-limit*' rule explicitly depending on response" + " contents without any 'tcp-response inspect-delay' setting." + " This means that this rule will randomly find its contents. This can be fixed by" + " setting the tcp-response inspect-delay.\n", + proxy_type_str(px), px->id); + } + } + } + + if ((rule->action & BWLIM_ACT_PERIOD_EXPR) && rule->arg.act.p[2]) { + struct sample_expr *expr = rule->arg.act.p[2]; + + if (!(expr->fetch->val & where)) { + memprintf(err, "set-bandwidth-limit rule uses a period extracting information from '%s', none of which is available here", + sample_src_names(expr->fetch->use)); + return 0; + } + + if (rule->from == ACT_F_TCP_REQ_CNT && (px->cap & PR_CAP_FE)) { + if (!px->tcp_req.inspect_delay && !(expr->fetch->val & SMP_VAL_FE_SES_ACC)) { + ha_warning("%s '%s' : a 'tcp-request content set-bandwidth-limit*' rule explicitly depending on request" + " contents without any 'tcp-request inspect-delay' setting." 
+ " This means that this rule will randomly find its contents. This can be fixed by" + " setting the tcp-request inspect-delay.\n", + proxy_type_str(px), px->id); + } + } + if (rule->from == ACT_F_TCP_RES_CNT && (px->cap & PR_CAP_BE)) { + if (!px->tcp_rep.inspect_delay && !(expr->fetch->val & SMP_VAL_BE_SRV_CON)) { + ha_warning("%s '%s' : a 'tcp-response content set-bandwidth-limit*' rule explicitly depending on response" + " contents without any 'tcp-response inspect-delay' setting." + " This means that this rule will randomly find its contents. This can be fixed by" + " setting the tcp-response inspect-delay.\n", + proxy_type_str(px), px->id); + } + } + } + + if (conf->expr) { + if (!(conf->expr->fetch->val & where)) { + memprintf(err, "bwlim filter '%s uses a key extracting information from '%s', none of which is available here", + conf->name, sample_src_names(conf->expr->fetch->use)); + return 0; + } + + if (rule->from == ACT_F_TCP_REQ_CNT && (px->cap & PR_CAP_FE)) { + if (!px->tcp_req.inspect_delay && !(conf->expr->fetch->val & SMP_VAL_FE_SES_ACC)) { + ha_warning("%s '%s' : a 'tcp-request content set-bandwidth-limit*' rule explicitly depending on request" + " contents without any 'tcp-request inspect-delay' setting." + " This means that this rule will randomly find its contents. This can be fixed by" + " setting the tcp-request inspect-delay.\n", + proxy_type_str(px), px->id); + } + } + if (rule->from == ACT_F_TCP_RES_CNT && (px->cap & PR_CAP_BE)) { + if (!px->tcp_rep.inspect_delay && !(conf->expr->fetch->val & SMP_VAL_BE_SRV_CON)) { + ha_warning("%s '%s' : a 'tcp-response content set-bandwidth-limit*' rule explicitly depending on response" + " contents without any 'tcp-response inspect-delay' setting." + " This means that this rule will randomly find its contents. This can be fixed by" + " setting the tcp-response inspect-delay.\n", + proxy_type_str(px), px->id); + } + } + } + + end: + rule->arg.act.p[3] = conf; + return 1; +} + +/* Release memory allocated by "set-bandwidth-limit" action. */ +static void release_bwlim_action(struct act_rule *rule) +{ + ha_free(&rule->arg.act.p[0]); + if ((rule->action & BWLIM_ACT_LIMIT_EXPR) && rule->arg.act.p[1]) { + release_sample_expr(rule->arg.act.p[1]); + rule->arg.act.p[1] = NULL; + } + if ((rule->action & BWLIM_ACT_PERIOD_EXPR) && rule->arg.act.p[2]) { + release_sample_expr(rule->arg.act.p[2]); + rule->arg.act.p[2] = NULL; + } + rule->arg.act.p[3] = NULL; /* points on the filter's config */ +} + +/* Parse "set-bandwidth-limit" action. The filter name must be specified. For + * shared limitations, there is no other supported parameter. For per-stream + * limitations, a custom limit and period may be specified. In both case, it + * must be an expression. On success: + * + * arg.act.p[0] will be the filter name (mandatory) + * arg.act.p[1] will be an expression for the custom limit (optional, may be NULL) + * arg.act.p[2] will be an expression for the custom period (optional, may be NULL) + * + * It returns ACT_RET_PRS_OK on success, ACT_RET_PRS_ERR on error. 
+ */
+static enum act_parse_ret parse_bandwidth_limit(const char **args, int *orig_arg, struct proxy *px,
+                                                struct act_rule *rule, char **err)
+{
+    struct sample_expr *expr;
+    int cur_arg;
+
+    cur_arg = *orig_arg;
+
+    if (!*args[cur_arg]) {
+        memprintf(err, "missing bwlim filter name");
+        return ACT_RET_PRS_ERR;
+    }
+
+    rule->arg.act.p[0] = strdup(args[cur_arg]);
+    if (!rule->arg.act.p[0]) {
+        memprintf(err, "out of memory");
+        return ACT_RET_PRS_ERR;
+    }
+    cur_arg++;
+
+    while (1) {
+        if (strcmp(args[cur_arg], "limit") == 0) {
+            const char *res;
+            unsigned int limit;
+
+            cur_arg++;
+            if (!args[cur_arg]) {
+                memprintf(err, "missing limit value or expression");
+                goto error;
+            }
+
+            res = parse_size_err(args[cur_arg], &limit);
+            if (!res) {
+                rule->action |= BWLIM_ACT_LIMIT_CONST;
+                rule->arg.act.p[1] = (void *)(uintptr_t)limit;
+                cur_arg++;
+                continue;
+            }
+
+            expr = sample_parse_expr((char **)args, &cur_arg, px->conf.args.file, px->conf.args.line, NULL, &px->conf.args, NULL);
+            if (!expr) {
+                memprintf(err, "'%s': invalid size value or unknown fetch method '%s'", args[cur_arg-1], args[cur_arg]);
+                goto error;
+            }
+            rule->action |= BWLIM_ACT_LIMIT_EXPR;
+            rule->arg.act.p[1] = expr;
+        }
+        else if (strcmp(args[cur_arg], "period") == 0) {
+            const char *res;
+            unsigned int period;
+
+            cur_arg++;
+            if (!args[cur_arg]) {
+                memprintf(err, "missing period value or expression");
+                goto error;
+            }
+
+            res = parse_time_err(args[cur_arg], &period, TIME_UNIT_MS);
+            if (!res) {
+                rule->action |= BWLIM_ACT_PERIOD_CONST;
+                rule->arg.act.p[2] = (void *)(uintptr_t)period;
+                cur_arg++;
+                continue;
+            }
+
+            expr = sample_parse_expr((char **)args, &cur_arg, px->conf.args.file, px->conf.args.line, NULL, &px->conf.args, NULL);
+            if (!expr) {
+                memprintf(err, "'%s': invalid time value or unknown fetch method '%s'", args[cur_arg-1], args[cur_arg]);
+                goto error;
+            }
+            rule->action |= BWLIM_ACT_PERIOD_EXPR;
+            rule->arg.act.p[2] = expr;
+        }
+        else
+            break;
+    }
+
+    rule->action_ptr = bwlim_set_limit;
+    rule->check_ptr = check_bwlim_action;
+    rule->release_ptr = release_bwlim_action;
+
+    *orig_arg = cur_arg;
+    return ACT_RET_PRS_OK;
+
+error:
+    release_bwlim_action(rule);
+    return ACT_RET_PRS_ERR;
+}
+
+
+static struct action_kw_list tcp_req_cont_actions = {
+    .kw = {
+        { "set-bandwidth-limit", parse_bandwidth_limit, 0 },
+        { NULL, NULL }
+    }
+};
+
+static struct action_kw_list tcp_res_cont_actions = {
+    .kw = {
+        { "set-bandwidth-limit", parse_bandwidth_limit, 0 },
+        { NULL, NULL }
+    }
+};
+
+static struct action_kw_list http_req_actions = {
+    .kw = {
+        { "set-bandwidth-limit", parse_bandwidth_limit, 0 },
+        { NULL, NULL }
+    }
+};
+
+static struct action_kw_list http_res_actions = {
+    .kw = {
+        { "set-bandwidth-limit", parse_bandwidth_limit, 0 },
+        { NULL, NULL }
+    }
+};
+
+INITCALL1(STG_REGISTER, tcp_req_cont_keywords_register, &tcp_req_cont_actions);
+INITCALL1(STG_REGISTER, tcp_res_cont_keywords_register, &tcp_res_cont_actions);
+INITCALL1(STG_REGISTER, http_req_keywords_register, &http_req_actions);
+INITCALL1(STG_REGISTER, http_res_keywords_register, &http_res_actions);
+
+
+/* Generic function to parse the bandwidth limitation filter configuration. It
+ * returns -1 on error and 0 on success. It handles configuration for
+ * per-stream and shared limitations.
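+ *
+ * Illustrative declarations (names and values are hypothetical):
+ *
+ *   filter bwlim-out per-strm-dl default-limit 1m default-period 10s
+ *   filter bwlim-in shared-ul key src table limits limit 10m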
+ */ +static int parse_bwlim_flt(char **args, int *cur_arg, struct proxy *px, struct flt_conf *fconf, + char **err, void *private) +{ + struct flt_conf *fc; + struct bwlim_config *conf; + int shared, per_stream; + int pos = *cur_arg + 1; + + conf = calloc(1, sizeof(*conf)); + if (!conf) { + memprintf(err, "%s: out of memory", args[*cur_arg]); + return -1; + } + conf->proxy = px; + + if (!*args[pos]) { + memprintf(err, "'%s' : a name is expected as first argument ", args[*cur_arg]); + goto error; + } + conf->flags = BWLIM_FL_NONE; + conf->name = strdup(args[pos]); + if (!conf->name) { + memprintf(err, "%s: out of memory", args[*cur_arg]); + goto error; + } + + list_for_each_entry(fc, &px->filter_configs, list) { + if (fc->id == bwlim_flt_id) { + struct bwlim_config *c = fc->conf; + + if (strcmp(conf->name, c->name) == 0) { + memprintf(err, "bwlim filter '%s' already declared for proxy '%s'\n", + conf->name, px->id); + goto error; + } + } + } + shared = per_stream = 0; + pos++; + while (*args[pos]) { + if (strcmp(args[pos], "key") == 0) { + if (per_stream) { + memprintf(err, "'%s' : cannot mix per-stream and shared parameter", + args[*cur_arg]); + goto error; + } + if (!*args[pos + 1]) { + memprintf(err, "'%s' : the sample expression is missing for '%s' option", + args[*cur_arg], args[pos]); + goto error; + } + shared = 1; + pos++; + conf->expr = sample_parse_expr((char **)args, &pos, px->conf.args.file, px->conf.args.line, + err, &px->conf.args, NULL); + if (!conf->expr) + goto error; + } + else if (strcmp(args[pos], "table") == 0) { + if (per_stream) { + memprintf(err, "'%s' : cannot mix per-stream and shared parameter", + args[*cur_arg]); + goto error; + } + if (!*args[pos + 1]) { + memprintf(err, "'%s' : the table name is missing for '%s' option", + args[*cur_arg], args[pos]); + goto error; + } + shared = 1; + conf->table.n = strdup(args[pos + 1]); + if (!conf->table.n) { + memprintf(err, "%s: out of memory", args[*cur_arg]); + goto error; + } + pos += 2; + } + else if (strcmp(args[pos], "default-period") == 0) { + const char *res; + + if (shared) { + memprintf(err, "'%s' : cannot mix per-stream and shared parameter", + args[*cur_arg]); + goto error; + } + if (!*args[pos + 1]) { + memprintf(err, "'%s' : the value is missing for '%s' option", + args[*cur_arg], args[pos]); + goto error; + } + per_stream = 1; + res = parse_time_err(args[pos + 1], &conf->period, TIME_UNIT_MS); + if (res) { + memprintf(err, "'%s' : invalid value for option '%s' (unexpected character '%c')", + args[*cur_arg], args[pos], *res); + goto error; + } + pos += 2; + } + else if (strcmp(args[pos], "limit") == 0) { + const char *res; + + if (per_stream) { + memprintf(err, "'%s' : cannot mix per-stream and shared parameter", + args[*cur_arg]); + goto error; + } + if (!*args[pos + 1]) { + memprintf(err, "'%s' : the value is missing for '%s' option", + args[*cur_arg], args[pos]); + goto error; + } + shared = 1; + res = parse_size_err(args[pos + 1], &conf->limit); + if (res) { + memprintf(err, "'%s' : invalid value for option '%s' (unexpected character '%c')", + args[*cur_arg], args[pos], *res); + goto error; + } + pos += 2; + } + else if (strcmp(args[pos], "default-limit") == 0) { + const char *res; + + if (shared) { + memprintf(err, "'%s' : cannot mix per-stream and shared parameter", + args[*cur_arg]); + goto error; + } + if (!*args[pos + 1]) { + memprintf(err, "'%s' : the value is missing for '%s' option", + args[*cur_arg], args[pos]); + goto error; + } + per_stream = 1; + res = parse_size_err(args[pos + 1], 
&conf->limit); + if (res) { + memprintf(err, "'%s' : invalid value for option '%s' (unexpected character '%c')", + args[*cur_arg], args[pos], *res); + goto error; + } + pos += 2; + } + else if (strcmp(args[pos], "min-size") == 0) { + const char *res; + + if (!*args[pos + 1]) { + memprintf(err, "'%s' : the value is missing for '%s' option", + args[*cur_arg], args[pos]); + goto error; + } + res = parse_size_err(args[pos + 1], &conf->min_size); + if (res) { + memprintf(err, "'%s' : invalid value for option '%s' (unexpected character '%c')", + args[*cur_arg], args[pos], *res); + goto error; + } + pos += 2; + } + else + break; + } + + if (shared) { + conf->flags |= BWLIM_FL_SHARED; + if (!conf->expr) { + memprintf(err, "'%s' : <key> option is missing", args[*cur_arg]); + goto error; + } + if (!conf->limit) { + memprintf(err, "'%s' : <limit> option is missing", args[*cur_arg]); + goto error; + } + } + else { + /* Per-stream: limit downloads only for now */ + conf->flags |= BWLIM_FL_OUT; + if (!conf->period) { + memprintf(err, "'%s' : <default-period> option is missing", args[*cur_arg]); + goto error; + } + if (!conf->limit) { + memprintf(err, "'%s' : <default-limit> option is missing", args[*cur_arg]); + goto error; + } + } + + *cur_arg = pos; + fconf->id = bwlim_flt_id; + fconf->ops = &bwlim_ops; + fconf->conf = conf; + return 0; + + error: + if (conf->name) + ha_free(&conf->name); + if (conf->expr) { + release_sample_expr(conf->expr); + conf->expr = NULL; + } + if (conf->table.n) + ha_free(&conf->table.n); + free(conf); + return -1; +} + + +static int parse_bwlim_in_flt(char **args, int *cur_arg, struct proxy *px, struct flt_conf *fconf, + char **err, void *private) +{ + int ret; + + ret = parse_bwlim_flt(args, cur_arg, px, fconf, err, private); + if (!ret) { + struct bwlim_config *conf = fconf->conf; + + conf->flags |= BWLIM_FL_IN; + } + + return ret; +} + +static int parse_bwlim_out_flt(char **args, int *cur_arg, struct proxy *px, struct flt_conf *fconf, + char **err, void *private) +{ + int ret; + + ret = parse_bwlim_flt(args, cur_arg, px, fconf, err, private); + if (!ret) { + struct bwlim_config *conf = fconf->conf; + + conf->flags |= BWLIM_FL_OUT; + } + return ret; +} + +/* Declare the filter parser for "trace" keyword */ +static struct flt_kw_list flt_kws = { "BWLIM", { }, { + { "bwlim-in", parse_bwlim_in_flt, NULL }, + { "bwlim-out", parse_bwlim_out_flt, NULL }, + { NULL, NULL, NULL }, + } +}; + +INITCALL1(STG_REGISTER, flt_register_keywords, &flt_kws); diff --git a/src/flt_http_comp.c b/src/flt_http_comp.c new file mode 100644 index 0000000..30f9d2a --- /dev/null +++ b/src/flt_http_comp.c @@ -0,0 +1,1076 @@ +/* + * Stream filters related variables and functions. + * + * Copyright (C) 2015 Qualys Inc., Christopher Faulet <cfaulet@qualys.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <haproxy/api.h> +#include <haproxy/cfgparse.h> +#include <haproxy/compression.h> +#include <haproxy/dynbuf.h> +#include <haproxy/filters.h> +#include <haproxy/http.h> +#include <haproxy/http_ana-t.h> +#include <haproxy/http_htx.h> +#include <haproxy/htx.h> +#include <haproxy/list.h> +#include <haproxy/proxy.h> +#include <haproxy/sample.h> +#include <haproxy/stream.h> +#include <haproxy/tools.h> + +#define COMP_STATE_PROCESSING 0x01 + +const char *http_comp_flt_id = "compression filter"; + +struct flt_ops comp_ops; + +struct comp_state { + /* + * For both comp_ctx and comp_algo, COMP_DIR_REQ is the index + * for requests, and COMP_DIR_RES for responses + */ + struct comp_ctx *comp_ctx[2]; /* compression context */ + struct comp_algo *comp_algo[2]; /* compression algorithm if not NULL */ + unsigned int flags; /* COMP_STATE_* */ +}; + +/* Pools used to allocate comp_state structs */ +DECLARE_STATIC_POOL(pool_head_comp_state, "comp_state", sizeof(struct comp_state)); + +static THREAD_LOCAL struct buffer tmpbuf; +static THREAD_LOCAL struct buffer zbuf; + +static int select_compression_request_header(struct comp_state *st, + struct stream *s, + struct http_msg *msg); +static int select_compression_response_header(struct comp_state *st, + struct stream *s, + struct http_msg *msg); +static int set_compression_header(struct comp_state *st, + struct stream *s, + struct http_msg *msg); + +static int htx_compression_buffer_init(struct htx *htx, struct buffer *out); +static int htx_compression_buffer_add_data(struct comp_state *st, const char *data, size_t len, + struct buffer *out, int dir); +static int htx_compression_buffer_end(struct comp_state *st, struct buffer *out, int end, int dir); + +/***********************************************************************/ +static int +comp_flt_init(struct proxy *px, struct flt_conf *fconf) +{ + fconf->flags |= FLT_CFG_FL_HTX; + return 0; +} + +static int +comp_flt_init_per_thread(struct proxy *px, struct flt_conf *fconf) +{ + if (b_alloc(&tmpbuf) == NULL) + return -1; + if (b_alloc(&zbuf) == NULL) + return -1; + return 0; +} + +static void +comp_flt_deinit_per_thread(struct proxy *px, struct flt_conf *fconf) +{ + if (tmpbuf.size) + b_free(&tmpbuf); + if (zbuf.size) + b_free(&zbuf); +} + +static int +comp_strm_init(struct stream *s, struct filter *filter) +{ + struct comp_state *st; + + st = pool_alloc(pool_head_comp_state); + if (st == NULL) + return -1; + + st->comp_algo[COMP_DIR_REQ] = NULL; + st->comp_algo[COMP_DIR_RES] = NULL; + st->comp_ctx[COMP_DIR_REQ] = NULL; + st->comp_ctx[COMP_DIR_RES] = NULL; + st->flags = 0; + filter->ctx = st; + + /* Register post-analyzer on AN_RES_WAIT_HTTP because we need to + * analyze response headers before http-response rules execution + * to be sure we can use res.comp and res.comp_algo sample + * fetches */ + filter->post_analyzers |= AN_RES_WAIT_HTTP; + return 1; +} + +static void +comp_strm_deinit(struct stream *s, struct filter *filter) +{ + struct comp_state *st = filter->ctx; + + if (!st) + return; + + /* release any possible compression context */ + if (st->comp_algo[COMP_DIR_REQ]) + st->comp_algo[COMP_DIR_REQ]->end(&st->comp_ctx[COMP_DIR_REQ]); + if (st->comp_algo[COMP_DIR_RES]) + st->comp_algo[COMP_DIR_RES]->end(&st->comp_ctx[COMP_DIR_RES]); + pool_free(pool_head_comp_state, st); + filter->ctx = NULL; +} + +static void +comp_prepare_compress_request(struct comp_state *st, struct stream *s, struct http_msg *msg) +{ + struct htx *htx = htxbuf(&msg->chn->buf); + struct http_txn *txn = s->txn; 
+ struct http_hdr_ctx ctx; + struct comp_type *comp_type; + + ctx.blk = NULL; + /* Already compressed, don't bother */ + if (http_find_header(htx, ist("Content-Encoding"), &ctx, 1)) + return; + /* HTTP < 1.1 should not be compressed */ + if (!(msg->flags & HTTP_MSGF_VER_11) || !(txn->req.flags & HTTP_MSGF_VER_11)) + return; + comp_type = NULL; + + /* + * We don't want to compress content-types not listed in the "compression type" directive if any. If no content-type was found but configuration + * requires one, we don't compress either. Backend has the priority. + */ + ctx.blk = NULL; + if (http_find_header(htx, ist("Content-Type"), &ctx, 1)) { + if ((s->be->comp && (comp_type = s->be->comp->types_req)) || + (strm_fe(s)->comp && (comp_type = strm_fe(s)->comp->types_req))) { + for (; comp_type; comp_type = comp_type->next) { + if (ctx.value.len >= comp_type->name_len && + strncasecmp(ctx.value.ptr, comp_type->name, comp_type->name_len) == 0) + /* this Content-Type should be compressed */ + break; + } + /* this Content-Type should not be compressed */ + if (comp_type == NULL) + goto fail; + } + } + else { /* no content-type header */ + if ((s->be->comp && s->be->comp->types_req) || + (strm_fe(s)->comp && strm_fe(s)->comp->types_req)) + goto fail; /* a content-type was required */ + } + + /* limit compression rate */ + if (global.comp_rate_lim > 0) + if (read_freq_ctr(&global.comp_bps_in) > global.comp_rate_lim) + goto fail; + + /* limit cpu usage */ + if (th_ctx->idle_pct < compress_min_idle) + goto fail; + + if (txn->meth == HTTP_METH_HEAD) + return; + if (s->be->comp && s->be->comp->algo_req != NULL) + st->comp_algo[COMP_DIR_REQ] = s->be->comp->algo_req; + else if (strm_fe(s)->comp && strm_fe(s)->comp->algo_req != NULL) + st->comp_algo[COMP_DIR_REQ] = strm_fe(s)->comp->algo_req; + else + goto fail; /* no algo selected: nothing to do */ + + + /* limit compression rate */ + if (global.comp_rate_lim > 0) + if (read_freq_ctr(&global.comp_bps_in) > global.comp_rate_lim) + goto fail; + + /* limit cpu usage */ + if (th_ctx->idle_pct < compress_min_idle) + goto fail; + + /* initialize compression */ + if (st->comp_algo[COMP_DIR_REQ]->init(&st->comp_ctx[COMP_DIR_REQ], global.tune.comp_maxlevel) < 0) + goto fail; + + return; +fail: + st->comp_algo[COMP_DIR_REQ] = NULL; +} + +static int +comp_http_headers(struct stream *s, struct filter *filter, struct http_msg *msg) +{ + struct comp_state *st = filter->ctx; + int comp_flags = 0; + + if (!strm_fe(s)->comp && !s->be->comp) + goto end; + if (strm_fe(s)->comp) + comp_flags |= strm_fe(s)->comp->flags; + if (s->be->comp) + comp_flags |= s->be->comp->flags; + + if (!(msg->chn->flags & CF_ISRESP)) { + if (comp_flags & COMP_FL_DIR_REQ) { + comp_prepare_compress_request(st, s, msg); + if (st->comp_algo[COMP_DIR_REQ]) { + if (!set_compression_header(st, s, msg)) + goto end; + register_data_filter(s, msg->chn, filter); + st->flags |= COMP_STATE_PROCESSING; + } + } + if (comp_flags & COMP_FL_DIR_RES) + select_compression_request_header(st, s, msg); + } else if (comp_flags & COMP_FL_DIR_RES) { + /* Response headers have already been checked in + * comp_http_post_analyze callback. 
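+ * If an algorithm was selected there, it is already stored in
+ * st->comp_algo[COMP_DIR_RES] at this point.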
*/ + if (st->comp_algo[COMP_DIR_RES]) { + if (!set_compression_header(st, s, msg)) + goto end; + register_data_filter(s, msg->chn, filter); + st->flags |= COMP_STATE_PROCESSING; + } + } + + end: + return 1; +} + +static int +comp_http_post_analyze(struct stream *s, struct filter *filter, + struct channel *chn, unsigned an_bit) +{ + struct http_txn *txn = s->txn; + struct http_msg *msg = &txn->rsp; + struct comp_state *st = filter->ctx; + + if (an_bit != AN_RES_WAIT_HTTP) + goto end; + + if (!strm_fe(s)->comp && !s->be->comp) + goto end; + + select_compression_response_header(st, s, msg); + + end: + return 1; +} + +static int +comp_http_payload(struct stream *s, struct filter *filter, struct http_msg *msg, + unsigned int offset, unsigned int len) +{ + struct comp_state *st = filter->ctx; + struct htx *htx = htxbuf(&msg->chn->buf); + struct htx_ret htxret = htx_find_offset(htx, offset); + struct htx_blk *blk, *next; + int ret, consumed = 0, to_forward = 0, last = 0; + int dir; + + if (msg->chn->flags & CF_ISRESP) + dir = COMP_DIR_RES; + else + dir = COMP_DIR_REQ; + + blk = htxret.blk; + offset = htxret.ret; + for (next = NULL; blk && len; blk = next) { + enum htx_blk_type type = htx_get_blk_type(blk); + uint32_t sz = htx_get_blksz(blk); + struct ist v; + + next = htx_get_next_blk(htx, blk); + while (next && htx_get_blk_type(next) == HTX_BLK_UNUSED) + next = htx_get_next_blk(htx, next); + + if (!(st->flags & COMP_STATE_PROCESSING)) + goto consume; + + if (htx_compression_buffer_init(htx, &trash) < 0) { + msg->chn->flags |= CF_WAKE_WRITE; + goto end; + } + + switch (type) { + case HTX_BLK_DATA: + /* it is the last data block */ + last = ((!next && (htx->flags & HTX_FL_EOM)) || (next && htx_get_blk_type(next) != HTX_BLK_DATA)); + v = htx_get_blk_value(htx, blk); + v = istadv(v, offset); + if (v.len > len) { + last = 0; + v.len = len; + } + + ret = htx_compression_buffer_add_data(st, v.ptr, v.len, &trash, dir); + if (ret < 0 || htx_compression_buffer_end(st, &trash, last, dir) < 0) + goto error; + BUG_ON(v.len != ret); + + if (ret == sz && !b_data(&trash)) + next = htx_remove_blk(htx, blk); + else { + blk = htx_replace_blk_value(htx, blk, v, ist2(b_head(&trash), b_data(&trash))); + next = htx_get_next_blk(htx, blk); + } + + len -= ret; + consumed += ret; + to_forward += b_data(&trash); + if (last) + st->flags &= ~COMP_STATE_PROCESSING; + break; + + case HTX_BLK_TLR: + case HTX_BLK_EOT: + if (htx_compression_buffer_end(st, &trash, 1, dir) < 0) + goto error; + if (b_data(&trash)) { + struct htx_blk *last = htx_add_last_data(htx, ist2(b_head(&trash), b_data(&trash))); + if (!last) + goto error; + blk = htx_get_next_blk(htx, last); + if (!blk) + goto error; + next = htx_get_next_blk(htx, blk); + to_forward += b_data(&trash); + } + st->flags &= ~COMP_STATE_PROCESSING; + __fallthrough; + + default: + consume: + sz -= offset; + if (sz > len) + sz = len; + consumed += sz; + to_forward += sz; + len -= sz; + break; + } + + offset = 0; + } + + end: + if (to_forward != consumed) + flt_update_offsets(filter, msg->chn, to_forward - consumed); + + if (st->comp_ctx[dir] && st->comp_ctx[dir]->cur_lvl > 0) { + update_freq_ctr(&global.comp_bps_in, consumed); + _HA_ATOMIC_ADD(&strm_fe(s)->fe_counters.comp_in[dir], consumed); + _HA_ATOMIC_ADD(&s->be->be_counters.comp_in[dir], consumed); + update_freq_ctr(&global.comp_bps_out, to_forward); + _HA_ATOMIC_ADD(&strm_fe(s)->fe_counters.comp_out[dir], to_forward); + _HA_ATOMIC_ADD(&s->be->be_counters.comp_out[dir], to_forward); + } else { + 
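+        /* no effective compression (no context or compression level 0):
+         * account these bytes as bypassed */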
_HA_ATOMIC_ADD(&strm_fe(s)->fe_counters.comp_byp[dir], consumed); + _HA_ATOMIC_ADD(&s->be->be_counters.comp_byp[dir], consumed); + } + return to_forward; + + error: + return -1; +} + + +static int +comp_http_end(struct stream *s, struct filter *filter, + struct http_msg *msg) +{ + struct comp_state *st = filter->ctx; + + if (!(msg->chn->flags & CF_ISRESP) || !st || !st->comp_algo[COMP_DIR_RES]) + goto end; + + if (strm_fe(s)->mode == PR_MODE_HTTP) + _HA_ATOMIC_INC(&strm_fe(s)->fe_counters.p.http.comp_rsp); + if ((s->flags & SF_BE_ASSIGNED) && (s->be->mode == PR_MODE_HTTP)) + _HA_ATOMIC_INC(&s->be->be_counters.p.http.comp_rsp); + end: + return 1; +} + +/***********************************************************************/ +static int +set_compression_header(struct comp_state *st, struct stream *s, struct http_msg *msg) +{ + struct htx *htx = htxbuf(&msg->chn->buf); + struct htx_sl *sl; + struct http_hdr_ctx ctx, last_vary; + struct comp_algo *comp_algo; + int comp_index; + + if (msg->chn->flags & CF_ISRESP) + comp_index = COMP_DIR_RES; + else + comp_index = COMP_DIR_REQ; + + sl = http_get_stline(htx); + if (!sl) + goto error; + + comp_algo = st->comp_algo[comp_index]; + + /* add "Transfer-Encoding: chunked" header */ + if (!(msg->flags & HTTP_MSGF_TE_CHNK)) { + if (!http_add_header(htx, ist("Transfer-Encoding"), ist("chunked"))) + goto error; + msg->flags |= HTTP_MSGF_TE_CHNK; + sl->flags |= (HTX_SL_F_XFER_ENC|HTX_SL_F_CHNK); + } + + /* remove Content-Length header */ + if (msg->flags & HTTP_MSGF_CNT_LEN) { + ctx.blk = NULL; + while (http_find_header(htx, ist("Content-Length"), &ctx, 1)) + http_remove_header(htx, &ctx); + msg->flags &= ~HTTP_MSGF_CNT_LEN; + sl->flags &= ~HTX_SL_F_CLEN; + } + + /* convert "ETag" header to a weak ETag */ + ctx.blk = NULL; + if (http_find_header(htx, ist("ETag"), &ctx, 1)) { + if (ctx.value.ptr[0] == '"') { + /* This a strong ETag. Convert it to a weak one. */ + struct ist v = ist2(trash.area, 0); + if (istcat(&v, ist("W/"), trash.size) == -1 || istcat(&v, ctx.value, trash.size) == -1) + goto error; + + if (!http_replace_header_value(htx, &ctx, v)) + goto error; + } + } + + /* Add "Vary: Accept-Encoding" header but only if it is not found. */ + ctx.blk = NULL; + last_vary.blk = NULL; + while (http_find_header(htx, ist("Vary"), &ctx, 0)) { + if (isteqi(ctx.value, ist("Accept-Encoding"))) + break; + last_vary = ctx; + } + /* No "Accept-Encoding" value found. */ + if (ctx.blk == NULL) { + if (last_vary.blk == NULL) { + /* No Vary header found at all. Add our header */ + if (!http_add_header(htx, ist("Vary"), ist("Accept-Encoding"))) + goto error; + } + else { + /* At least one Vary header found. Append the value to + * the last one. + */ + if (!http_append_header_value(htx, &last_vary, ist("Accept-Encoding"))) + goto error; + } + } + + /* + * Add Content-Encoding header when it's not identity encoding. + * RFC 2616 : Identity encoding: This content-coding is used only in the + * Accept-Encoding header, and SHOULD NOT be used in the Content-Encoding + * header. + */ + if (comp_algo->cfg_name_len != 8 || memcmp(comp_algo->cfg_name, "identity", 8) != 0) { + struct ist v = ist2(comp_algo->ua_name, comp_algo->ua_name_len); + + if (!http_add_header(htx, ist("Content-Encoding"), v)) + goto error; + } + + return 1; + + error: + st->comp_algo[comp_index]->end(&st->comp_ctx[comp_index]); + st->comp_algo[comp_index] = NULL; + return 0; +} + +/* + * Selects a compression algorithm depending on the client request. 
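+ *
+ * Illustrative example: with "Accept-Encoding: gzip;q=0.5, deflate" and both
+ * algorithms configured, deflate is selected because its implicit q-value
+ * defaults to 1.000, which beats gzip's 0.5.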
+ */ +static int +select_compression_request_header(struct comp_state *st, struct stream *s, struct http_msg *msg) +{ + struct htx *htx = htxbuf(&msg->chn->buf); + struct http_hdr_ctx ctx; + struct comp_algo *comp_algo = NULL; + struct comp_algo *comp_algo_back = NULL; + + /* Disable compression for older user agents announcing themselves as "Mozilla/4" + * unless they are known good (MSIE 6 with XP SP2, or MSIE 7 and later). + * See http://zoompf.com/2012/02/lose-the-wait-http-compression for more details. + */ + ctx.blk = NULL; + if (http_find_header(htx, ist("User-Agent"), &ctx, 1) && + ctx.value.len >= 9 && + memcmp(ctx.value.ptr, "Mozilla/4", 9) == 0 && + (ctx.value.len < 31 || + memcmp(ctx.value.ptr + 25, "MSIE ", 5) != 0 || + *(ctx.value.ptr + 30) < '6' || + (*(ctx.value.ptr + 30) == '6' && + (ctx.value.len < 54 || memcmp(ctx.value.ptr + 51, "SV1", 3) != 0)))) { + st->comp_algo[COMP_DIR_RES] = NULL; + return 0; + } + + /* search for the algo in the backend in priority or the frontend */ + if ((s->be->comp && (comp_algo_back = s->be->comp->algos_res)) || + (strm_fe(s)->comp && (comp_algo_back = strm_fe(s)->comp->algos_res))) { + int best_q = 0; + + ctx.blk = NULL; + while (http_find_header(htx, ist("Accept-Encoding"), &ctx, 0)) { + const char *qval; + int q; + int toklen; + + /* try to isolate the token from the optional q-value */ + toklen = 0; + while (toklen < ctx.value.len && HTTP_IS_TOKEN(*(ctx.value.ptr + toklen))) + toklen++; + + qval = ctx.value.ptr + toklen; + while (1) { + while (qval < istend(ctx.value) && HTTP_IS_LWS(*qval)) + qval++; + + if (qval >= istend(ctx.value) || *qval != ';') { + qval = NULL; + break; + } + qval++; + + while (qval < istend(ctx.value) && HTTP_IS_LWS(*qval)) + qval++; + + if (qval >= istend(ctx.value)) { + qval = NULL; + break; + } + if (strncmp(qval, "q=", MIN(istend(ctx.value) - qval, 2)) == 0) + break; + + while (qval < istend(ctx.value) && *qval != ';') + qval++; + } + + /* here we have qval pointing to the first "q=" attribute or NULL if not found */ + q = qval ? http_parse_qvalue(qval + 2, NULL) : 1000; + + if (q <= best_q) + continue; + + for (comp_algo = comp_algo_back; comp_algo; comp_algo = comp_algo->next) { + if (*(ctx.value.ptr) == '*' || + word_match(ctx.value.ptr, toklen, comp_algo->ua_name, comp_algo->ua_name_len)) { + st->comp_algo[COMP_DIR_RES] = comp_algo; + best_q = q; + break; + } + } + } + } + + /* remove all occurrences of the header when "compression offload" is set */ + if (st->comp_algo[COMP_DIR_RES]) { + if ((s->be->comp && (s->be->comp->flags & COMP_FL_OFFLOAD)) || + (strm_fe(s)->comp && (strm_fe(s)->comp->flags & COMP_FL_OFFLOAD))) { + http_remove_header(htx, &ctx); + ctx.blk = NULL; + while (http_find_header(htx, ist("Accept-Encoding"), &ctx, 1)) + http_remove_header(htx, &ctx); + } + return 1; + } + + /* identity is implicit does not require headers */ + if ((s->be->comp && (comp_algo_back = s->be->comp->algos_res)) || + (strm_fe(s)->comp && (comp_algo_back = strm_fe(s)->comp->algos_res))) { + for (comp_algo = comp_algo_back; comp_algo; comp_algo = comp_algo->next) { + if (comp_algo->cfg_name_len == 8 && memcmp(comp_algo->cfg_name, "identity", 8) == 0) { + st->comp_algo[COMP_DIR_RES] = comp_algo; + return 1; + } + } + } + + st->comp_algo[COMP_DIR_RES] = NULL; + return 0; +} + +/* + * Selects a compression algorithm depending of the server response. 
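+ *
+ * Illustrative example: an HTTP/1.1 "200 OK" response with a known transfer
+ * length, no "Content-Encoding" and no "Cache-Control: no-transform"
+ * typically passes these checks, while a 206 (Partial Content) response is
+ * left uncompressed.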
+ */ +static int +select_compression_response_header(struct comp_state *st, struct stream *s, struct http_msg *msg) +{ + struct htx *htx = htxbuf(&msg->chn->buf); + struct http_txn *txn = s->txn; + struct http_hdr_ctx ctx; + struct comp_type *comp_type; + + /* no common compression algorithm was found in request header */ + if (st->comp_algo[COMP_DIR_RES] == NULL) + goto fail; + + /* compression already in progress */ + if (msg->flags & HTTP_MSGF_COMPRESSING) + goto fail; + + /* HTTP < 1.1 should not be compressed */ + if (!(msg->flags & HTTP_MSGF_VER_11) || !(txn->req.flags & HTTP_MSGF_VER_11)) + goto fail; + + if (txn->meth == HTTP_METH_HEAD) + goto fail; + + /* compress 200,201,202,203 responses only */ + if ((txn->status != 200) && + (txn->status != 201) && + (txn->status != 202) && + (txn->status != 203)) + goto fail; + + if (!(msg->flags & HTTP_MSGF_XFER_LEN) || msg->flags & HTTP_MSGF_BODYLESS) + goto fail; + + /* content is already compressed */ + ctx.blk = NULL; + if (http_find_header(htx, ist("Content-Encoding"), &ctx, 1)) + goto fail; + + /* no compression when Cache-Control: no-transform is present in the message */ + ctx.blk = NULL; + while (http_find_header(htx, ist("Cache-Control"), &ctx, 0)) { + if (word_match(ctx.value.ptr, ctx.value.len, "no-transform", 12)) + goto fail; + } + + /* no compression when ETag is malformed */ + ctx.blk = NULL; + if (http_find_header(htx, ist("ETag"), &ctx, 1)) { + if (http_get_etag_type(ctx.value) == ETAG_INVALID) + goto fail; + } + /* no compression when multiple ETags are present + * Note: Do not reset ctx.blk! + */ + if (http_find_header(htx, ist("ETag"), &ctx, 1)) + goto fail; + + comp_type = NULL; + + /* we don't want to compress multipart content-types, nor content-types that are + * not listed in the "compression type" directive if any. If no content-type was + * found but configuration requires one, we don't compress either. Backend has + * the priority. 
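+ *
+ * For example (illustrative): with "compression type text/html text/plain"
+ * configured in the backend, a "Content-Type: application/pdf" response is
+ * skipped, and so is a response carrying no Content-Type header at all.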
+ */ + ctx.blk = NULL; + if (http_find_header(htx, ist("Content-Type"), &ctx, 1)) { + if (ctx.value.len >= 9 && strncasecmp("multipart", ctx.value.ptr, 9) == 0) + goto fail; + + if ((s->be->comp && (comp_type = s->be->comp->types_res)) || + (strm_fe(s)->comp && (comp_type = strm_fe(s)->comp->types_res))) { + for (; comp_type; comp_type = comp_type->next) { + if (ctx.value.len >= comp_type->name_len && + strncasecmp(ctx.value.ptr, comp_type->name, comp_type->name_len) == 0) + /* this Content-Type should be compressed */ + break; + } + /* this Content-Type should not be compressed */ + if (comp_type == NULL) + goto fail; + } + } + else { /* no content-type header */ + if ((s->be->comp && s->be->comp->types_res) || + (strm_fe(s)->comp && strm_fe(s)->comp->types_res)) + goto fail; /* a content-type was required */ + } + + /* limit compression rate */ + if (global.comp_rate_lim > 0) + if (read_freq_ctr(&global.comp_bps_in) > global.comp_rate_lim) + goto fail; + + /* limit cpu usage */ + if (th_ctx->idle_pct < compress_min_idle) + goto fail; + + /* initialize compression */ + if (st->comp_algo[COMP_DIR_RES]->init(&st->comp_ctx[COMP_DIR_RES], global.tune.comp_maxlevel) < 0) + goto fail; + msg->flags |= HTTP_MSGF_COMPRESSING; + return 1; + + fail: + st->comp_algo[COMP_DIR_RES] = NULL; + return 0; +} + +/***********************************************************************/ +static int +htx_compression_buffer_init(struct htx *htx, struct buffer *out) +{ + /* output stream requires at least 10 bytes for the gzip header, plus + * at least 8 bytes for the gzip trailer (crc+len), plus a possible + * plus at most 5 bytes per 32kB block and 2 bytes to close the stream. + */ + if (htx_free_space(htx) < 20 + 5 * ((htx->data + 32767) >> 15)) + return -1; + b_reset(out); + return 0; +} + +static int +htx_compression_buffer_add_data(struct comp_state *st, const char *data, size_t len, + struct buffer *out, int dir) +{ + + return st->comp_algo[dir]->add_data(st->comp_ctx[dir], data, len, out); +} + +static int +htx_compression_buffer_end(struct comp_state *st, struct buffer *out, int end, int dir) +{ + + if (end) + return st->comp_algo[dir]->finish(st->comp_ctx[dir], out); + else + return st->comp_algo[dir]->flush(st->comp_ctx[dir], out); +} + + +/***********************************************************************/ +struct flt_ops comp_ops = { + .init = comp_flt_init, + .init_per_thread = comp_flt_init_per_thread, + .deinit_per_thread = comp_flt_deinit_per_thread, + + .attach = comp_strm_init, + .detach = comp_strm_deinit, + + .channel_post_analyze = comp_http_post_analyze, + + .http_headers = comp_http_headers, + .http_payload = comp_http_payload, + .http_end = comp_http_end, +}; + +static int +parse_compression_options(char **args, int section, struct proxy *proxy, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + struct comp *comp; + int ret = 0; + + if (proxy->comp == NULL) { + comp = calloc(1, sizeof(*comp)); + /* Always default to compress responses */ + comp->flags = COMP_FL_DIR_RES; + proxy->comp = comp; + } + else + comp = proxy->comp; + + if (strcmp(args[1], "algo") == 0 || strcmp(args[1], "algo-res") == 0) { + struct comp_ctx *ctx; + int cur_arg = 2; + + if (!*args[cur_arg]) { + memprintf(err, "parsing [%s:%d] : '%s' expects <algorithm>.", + file, line, args[0]); + ret = -1; + goto end; + } + while (*(args[cur_arg])) { + int retval = comp_append_algo(&comp->algos_res, args[cur_arg]); + if (retval) { + if (retval < 0) + memprintf(err, "'%s' : '%s' is not a supported 
algorithm.", + args[0], args[cur_arg]); + else + memprintf(err, "'%s' : out of memory while parsing algo '%s'.", + args[0], args[cur_arg]); + ret = -1; + goto end; + } + + if (proxy->comp->algos_res->init(&ctx, 9) == 0) + proxy->comp->algos_res->end(&ctx); + else { + memprintf(err, "'%s' : Can't init '%s' algorithm.", + args[0], args[cur_arg]); + ret = -1; + goto end; + } + cur_arg++; + continue; + } + } + else if (strcmp(args[1], "algo-req") == 0) { + struct comp_ctx *ctx; + int retval = comp_append_algo(&comp->algo_req, args[2]); + + if (retval) { + if (retval < 0) + memprintf(err, "'%s' : '%s' is not a supported algorithm.", + args[0], args[2]); + else + memprintf(err, "'%s' : out of memory while parsing algo '%s'.", + args[0], args[2]); + ret = -1; + goto end; + } + + if (proxy->comp->algo_req->init(&ctx, 9) == 0) + proxy->comp->algo_req->end(&ctx); + else { + memprintf(err, "'%s' : Can't init '%s' algorithm.", + args[0], args[2]); + ret = -1; + goto end; + } + } + else if (strcmp(args[1], "offload") == 0) { + if (proxy->cap & PR_CAP_DEF) { + memprintf(err, "'%s' : '%s' ignored in 'defaults' section.", + args[0], args[1]); + ret = 1; + } + comp->flags |= COMP_FL_OFFLOAD; + } + else if (strcmp(args[1], "type") == 0 || strcmp(args[1], "type-res") == 0) { + int cur_arg = 2; + + if (!*args[cur_arg]) { + memprintf(err, "'%s' expects <type>.", args[0]); + ret = -1; + goto end; + } + while (*(args[cur_arg])) { + if (comp_append_type(&comp->types_res, args[cur_arg])) { + memprintf(err, "'%s': out of memory.", args[0]); + ret = -1; + goto end; + } + cur_arg++; + continue; + } + } + else if (strcmp(args[1], "type-req") == 0) { + int cur_arg = 2; + + if (!*args[cur_arg]) { + memprintf(err, "'%s' expects <type>.", args[0]); + ret = -1; + goto end; + } + while (*(args[cur_arg])) { + if (comp_append_type(&comp->types_req, args[cur_arg])) { + memprintf(err, "'%s': out of memory.", args[0]); + ret = -1; + goto end; + } + cur_arg++; + continue; + } + } + else if (strcmp(args[1], "direction") == 0) { + if (!args[2]) { + memprintf(err, "'%s' expects 'request', 'response', or 'both'.", args[0]); + ret = -1; + goto end; + } + if (strcmp(args[2], "request") == 0) { + comp->flags &= ~COMP_FL_DIR_RES; + comp->flags |= COMP_FL_DIR_REQ; + } else if (strcmp(args[2], "response") == 0) { + comp->flags &= COMP_FL_DIR_REQ; + comp->flags |= COMP_FL_DIR_RES; + } else if (strcmp(args[2], "both") == 0) + comp->flags |= COMP_FL_DIR_REQ | COMP_FL_DIR_RES; + else { + memprintf(err, "'%s' expects 'request', 'response', or 'both'.", args[0]); + ret = -1; + goto end; + } + } + else { + memprintf(err, "'%s' expects 'algo', 'type' 'direction' or 'offload'", + args[0]); + ret = -1; + goto end; + } + + end: + return ret; +} + +static int +parse_http_comp_flt(char **args, int *cur_arg, struct proxy *px, + struct flt_conf *fconf, char **err, void *private) +{ + struct flt_conf *fc, *back; + + list_for_each_entry_safe(fc, back, &px->filter_configs, list) { + if (fc->id == http_comp_flt_id) { + memprintf(err, "%s: Proxy supports only one compression filter\n", px->id); + return -1; + } + } + + fconf->id = http_comp_flt_id; + fconf->conf = NULL; + fconf->ops = &comp_ops; + (*cur_arg)++; + + return 0; +} + + +int +check_implicit_http_comp_flt(struct proxy *proxy) +{ + struct flt_conf *fconf; + int explicit = 0; + int comp = 0; + int err = 0; + + if (proxy->comp == NULL) + goto end; + if (!LIST_ISEMPTY(&proxy->filter_configs)) { + list_for_each_entry(fconf, &proxy->filter_configs, list) { + if (fconf->id == http_comp_flt_id) + comp = 1; 
+
+int
+check_implicit_http_comp_flt(struct proxy *proxy)
+{
+	struct flt_conf *fconf;
+	int explicit = 0;
+	int comp = 0;
+	int err = 0;
+
+	if (proxy->comp == NULL)
+		goto end;
+	if (!LIST_ISEMPTY(&proxy->filter_configs)) {
+		list_for_each_entry(fconf, &proxy->filter_configs, list) {
+			if (fconf->id == http_comp_flt_id)
+				comp = 1;
+			else if (fconf->id == cache_store_flt_id) {
+				if (comp) {
+					ha_alert("config: %s '%s': unable to enable the compression filter "
+						 "before any cache filter.\n",
+						 proxy_type_str(proxy), proxy->id);
+					err++;
+					goto end;
+				}
+			}
+			else if (fconf->id == fcgi_flt_id)
+				continue;
+			else
+				explicit = 1;
+		}
+	}
+	if (comp)
+		goto end;
+	else if (explicit) {
+		ha_alert("config: %s '%s': require an explicit filter declaration to use "
+			 "HTTP compression\n", proxy_type_str(proxy), proxy->id);
+		err++;
+		goto end;
+	}
+
+	/* Implicit declaration of the compression filter is always the last
+	 * one */
+	fconf = calloc(1, sizeof(*fconf));
+	if (!fconf) {
+		ha_alert("config: %s '%s': out of memory\n",
+			 proxy_type_str(proxy), proxy->id);
+		err++;
+		goto end;
+	}
+	fconf->id   = http_comp_flt_id;
+	fconf->conf = NULL;
+	fconf->ops  = &comp_ops;
+	LIST_APPEND(&proxy->filter_configs, &fconf->list);
+  end:
+	return err;
+}
+
+/*
+ * boolean, returns true if compression is used (either gzip or deflate) in the
+ * response.
+ */
+static int
+smp_fetch_res_comp(const struct arg *args, struct sample *smp, const char *kw,
+		   void *private)
+{
+	struct http_txn *txn = smp->strm ? smp->strm->txn : NULL;
+
+	smp->data.type = SMP_T_BOOL;
+	smp->data.u.sint = (txn && (txn->rsp.flags & HTTP_MSGF_COMPRESSING));
+	return 1;
+}
+
+/*
+ * string, returns algo
+ */
+static int
+smp_fetch_res_comp_algo(const struct arg *args, struct sample *smp,
+			const char *kw, void *private)
+{
+	struct http_txn *txn = smp->strm ? smp->strm->txn : NULL;
+	struct filter   *filter;
+	struct comp_state *st;
+
+	if (!txn || !(txn->rsp.flags & HTTP_MSGF_COMPRESSING))
+		return 0;
+
+	list_for_each_entry(filter, &strm_flt(smp->strm)->filters, list) {
+		if (FLT_ID(filter) != http_comp_flt_id)
+			continue;
+
+		if (!(st = filter->ctx))
+			break;
+
+		smp->data.type = SMP_T_STR;
+		smp->flags = SMP_F_CONST;
+		smp->data.u.str.area = st->comp_algo[COMP_DIR_RES]->cfg_name;
+		smp->data.u.str.data = st->comp_algo[COMP_DIR_RES]->cfg_name_len;
+		return 1;
+	}
+	return 0;
+}
+
+/* Declare the config parser for "compression" keyword */
+static struct cfg_kw_list cfg_kws = {ILH, {
+		{ CFG_LISTEN, "compression", parse_compression_options },
+		{ 0, NULL, NULL },
+	}
+};
+
+INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws);
+
+/* Declare the filter parser for "compression" keyword */
+static struct flt_kw_list filter_kws = { "COMP", { }, {
+		{ "compression", parse_http_comp_flt, NULL },
+		{ NULL, NULL, NULL },
+	}
+};
+
+INITCALL1(STG_REGISTER, flt_register_keywords, &filter_kws);
+
+/* Note: must not be declared <const> as its list will be overwritten */
+static struct sample_fetch_kw_list sample_fetch_keywords = {ILH, {
+		{ "res.comp",      smp_fetch_res_comp,      0, NULL, SMP_T_BOOL, SMP_USE_HRSHP },
+		{ "res.comp_algo", smp_fetch_res_comp_algo, 0, NULL, SMP_T_STR,  SMP_USE_HRSHP },
+		{ /* END */ },
+	}
+};
+
+INITCALL1(STG_REGISTER, sample_register_fetches, &sample_fetch_keywords);
diff --git a/src/flt_spoe.c b/src/flt_spoe.c
new file mode 100644
index 0000000..70ea2ba
--- /dev/null
+++ b/src/flt_spoe.c
@@ -0,0 +1,4739 @@
+/*
+ * Stream processing offload engine management.
+ *
+ * Copyright 2016 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+#include <ctype.h>
+#include <errno.h>
+
+#include <haproxy/acl.h>
+#include <haproxy/applet.h>
+#include <haproxy/action-t.h>
+#include <haproxy/api.h>
+#include <haproxy/arg.h>
+#include <haproxy/cfgparse.h>
+#include <haproxy/check.h>
+#include <haproxy/filters.h>
+#include <haproxy/freq_ctr.h>
+#include <haproxy/frontend.h>
+#include <haproxy/global.h>
+#include <haproxy/http_rules.h>
+#include <haproxy/log.h>
+#include <haproxy/pool.h>
+#include <haproxy/proxy.h>
+#include <haproxy/sample.h>
+#include <haproxy/sc_strm.h>
+#include <haproxy/session.h>
+#include <haproxy/signal.h>
+#include <haproxy/sink.h>
+#include <haproxy/spoe.h>
+#include <haproxy/stconn.h>
+#include <haproxy/stream.h>
+#include <haproxy/task.h>
+#include <haproxy/tcp_rules.h>
+#include <haproxy/thread.h>
+#include <haproxy/time.h>
+#include <haproxy/tools.h>
+#include <haproxy/vars.h>
+
+
+#if defined(DEBUG_SPOE) || defined(DEBUG_FULL)
+#define SPOE_PRINTF(x...) fprintf(x)
+#define SPOE_DEBUG_STMT(statement) statement
+#else
+#define SPOE_PRINTF(x...)
+#define SPOE_DEBUG_STMT(statement)
+#endif
+
+/* 4 bytes are reserved for the frame size, so a frame and its size can be
+ * written together in a buffer */
+#define MAX_FRAME_SIZE     global.tune.bufsize - 4
+
+/* The minimum size for a frame */
+#define MIN_FRAME_SIZE     256
+
+/* Reserved for the metadata and the frame type.
+ * So <MAX_FRAME_SIZE> - <FRAME_HDR_SIZE> is the maximum payload size */
+#define FRAME_HDR_SIZE     32
+
+/* Helper to get SPOE ctx inside an appctx */
+#define SPOE_APPCTX(appctx) ((struct spoe_appctx *)((appctx)->svcctx))
+
+/* SPOE filter id. Used to identify SPOE filters */
+const char *spoe_filter_id = "SPOE filter";
+
+/* Set if the handler on SIGUSR1 is registered */
+static int sighandler_registered = 0;
+
+/* The name of the SPOE engine, used during the parsing */
+char *curengine = NULL;
+
+/* SPOE agent/group/message used during the parsing */
+struct spoe_agent   *curagent = NULL;
+struct spoe_group   *curgrp = NULL;
+struct spoe_message *curmsg = NULL;
+
+/* list of SPOE messages and placeholders used during the parsing */
+struct list curmsgs;
+struct list curgrps;
+struct list curmphs;
+struct list curgphs;
+struct list curvars;
+
+/* list of log servers used during the parsing */
+struct list curloggers;
+
+/* agent's proxy flags (PR_O_* and PR_O2_*) used during parsing */
+int curpxopts;
+int curpxopts2;
+
+/* Pools used to allocate SPOE structs */
+DECLARE_STATIC_POOL(pool_head_spoe_ctx, "spoe_ctx", sizeof(struct spoe_context));
+DECLARE_STATIC_POOL(pool_head_spoe_appctx, "spoe_appctx", sizeof(struct spoe_appctx));
+
+struct flt_ops spoe_ops;
+
+static int  spoe_queue_context(struct spoe_context *ctx);
+static int  spoe_acquire_buffer(struct buffer *buf, struct buffer_wait *buffer_wait);
+static void spoe_release_buffer(struct buffer *buf, struct buffer_wait *buffer_wait);
+static struct appctx *spoe_create_appctx(struct spoe_config *conf);
+
+/********************************************************************
+ * helper functions/globals
+ ********************************************************************/
+static void
+spoe_release_placeholder(struct spoe_placeholder *ph)
+{
+	if (!ph)
+		return;
+	free(ph->id);
+	free(ph);
+}
+
+static void
+spoe_release_message(struct spoe_message *msg)
+{
+	struct spoe_arg *arg, *argback;
+	struct acl      *acl, *aclback;
+
+	if (!msg)
+		return;
+	free(msg->id);
+	free(msg->conf.file);
+	list_for_each_entry_safe(arg, argback, &msg->args, list) {
+		release_sample_expr(arg->expr);
+		free(arg->name);
+		LIST_DELETE(&arg->list);
+		free(arg);
+	}
+	list_for_each_entry_safe(acl, aclback, &msg->acls, list) {
+		LIST_DELETE(&acl->list);
+		prune_acl(acl);
+		free(acl);
+	}
+	free_acl_cond(msg->cond);
+	free(msg);
+}
+
+static void
+spoe_release_group(struct spoe_group *grp)
+{
+	if (!grp)
+		return;
+	free(grp->id);
+	free(grp->conf.file);
+	free(grp);
+}
+
+static void
+spoe_release_agent(struct spoe_agent *agent)
+{
+	struct spoe_message *msg, *msgback;
+	struct spoe_group   *grp, *grpback;
+	int                  i;
+
+	if (!agent)
+		return;
+	free(agent->id);
+	free(agent->conf.file);
+	free(agent->var_pfx);
+	free(agent->var_on_error);
+	free(agent->var_t_process);
+	free(agent->var_t_total);
+	list_for_each_entry_safe(msg, msgback, &agent->messages, list) {
+		LIST_DELETE(&msg->list);
+		spoe_release_message(msg);
+	}
+	list_for_each_entry_safe(grp, grpback, &agent->groups, list) {
+		LIST_DELETE(&grp->list);
+		spoe_release_group(grp);
+	}
+	if (agent->rt) {
+		for (i = 0; i < global.nbthread; ++i) {
+			free(agent->rt[i].engine_id);
+			HA_SPIN_DESTROY(&agent->rt[i].lock);
+		}
+	}
+	free(agent->rt);
+	free(agent);
+}
+
+static const char *spoe_frm_err_reasons[SPOE_FRM_ERRS] = {
+	[SPOE_FRM_ERR_NONE]               = "normal",
+	[SPOE_FRM_ERR_IO]                 = "I/O error",
+	[SPOE_FRM_ERR_TOUT]               = "a timeout occurred",
+	[SPOE_FRM_ERR_TOO_BIG]            = "frame is too big",
+	[SPOE_FRM_ERR_INVALID]            = "invalid frame received",
+	[SPOE_FRM_ERR_NO_VSN]             = "version value not found",
+	[SPOE_FRM_ERR_NO_FRAME_SIZE]      = "max-frame-size value not found",
+	[SPOE_FRM_ERR_NO_CAP]             = "capabilities value not found",
+	[SPOE_FRM_ERR_BAD_VSN]            = "unsupported version",
+	[SPOE_FRM_ERR_BAD_FRAME_SIZE]     = "max-frame-size too big or too small",
+	[SPOE_FRM_ERR_FRAG_NOT_SUPPORTED] = "fragmentation not supported",
+	[SPOE_FRM_ERR_INTERLACED_FRAMES]  = "invalid interlaced frames",
+	[SPOE_FRM_ERR_FRAMEID_NOTFOUND]   = "frame-id not found",
+	[SPOE_FRM_ERR_RES]                = "resource allocation error",
+	[SPOE_FRM_ERR_UNKNOWN]            = "an unknown error occurred",
+};
+
+static const char *spoe_event_str[SPOE_EV_EVENTS] = {
+	[SPOE_EV_ON_CLIENT_SESS] = "on-client-session",
+	[SPOE_EV_ON_TCP_REQ_FE]  = "on-frontend-tcp-request",
+	[SPOE_EV_ON_TCP_REQ_BE]  = "on-backend-tcp-request",
+	[SPOE_EV_ON_HTTP_REQ_FE] = "on-frontend-http-request",
+	[SPOE_EV_ON_HTTP_REQ_BE] = "on-backend-http-request",
+
+	[SPOE_EV_ON_SERVER_SESS] = "on-server-session",
+	[SPOE_EV_ON_TCP_RSP]     = "on-tcp-response",
+	[SPOE_EV_ON_HTTP_RSP]    = "on-http-response",
+};
+
+
+#if defined(DEBUG_SPOE) || defined(DEBUG_FULL)
+
+static const char *spoe_ctx_state_str[SPOE_CTX_ST_ERROR+1] = {
+	[SPOE_CTX_ST_NONE]          = "NONE",
+	[SPOE_CTX_ST_READY]         = "READY",
+	[SPOE_CTX_ST_ENCODING_MSGS] = "ENCODING_MSGS",
+	[SPOE_CTX_ST_SENDING_MSGS]  = "SENDING_MSGS",
+	[SPOE_CTX_ST_WAITING_ACK]   = "WAITING_ACK",
+	[SPOE_CTX_ST_DONE]          = "DONE",
+	[SPOE_CTX_ST_ERROR]         = "ERROR",
+};
+
+static const char *spoe_appctx_state_str[SPOE_APPCTX_ST_END+1] = {
+	[SPOE_APPCTX_ST_CONNECT]             = "CONNECT",
+	[SPOE_APPCTX_ST_CONNECTING]          = "CONNECTING",
+	[SPOE_APPCTX_ST_IDLE]                = "IDLE",
+	[SPOE_APPCTX_ST_PROCESSING]          = "PROCESSING",
+	[SPOE_APPCTX_ST_SENDING_FRAG_NOTIFY] = "SENDING_FRAG_NOTIFY",
+	[SPOE_APPCTX_ST_WAITING_SYNC_ACK]    = "WAITING_SYNC_ACK",
+	[SPOE_APPCTX_ST_DISCONNECT]          = "DISCONNECT",
+	[SPOE_APPCTX_ST_DISCONNECTING]       = "DISCONNECTING",
+	[SPOE_APPCTX_ST_EXIT]                = "EXIT",
+	[SPOE_APPCTX_ST_END]                 = "END",
+};
+
+#endif
+
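The tables just above also serve as a map of the filter's two state machines: each stream's spoe_context walks the SPOE_CTX_ST_* states while the per-connection applet walks the SPOE_APPCTX_ST_* states. A small reading aid (not part of the patch) showing the usual forward path through each; error paths short-circuit to ERROR or DISCONNECT:

#include <stdio.h>

static const char *demo_ctx_path[] = {
	"READY",          /* stream has messages to encode */
	"ENCODING_MSGS",  /* spoe_encode_message() fills the buffer */
	"SENDING_MSGS",   /* queued on the agent's sending queue */
	"WAITING_ACK",    /* NOTIFY sent, parked on a waiting queue */
	"DONE",           /* ACK received, actions ready to be applied */
};

static const char *demo_appctx_path[] = {
	"CONNECT",        /* send HAPROXY-HELLO */
	"CONNECTING",     /* wait for AGENT-HELLO, negotiate capabilities */
	"IDLE",           /* parked in the idle_applets tree */
	"PROCESSING",     /* NOTIFY/ACK exchange loop */
	"DISCONNECT",     /* send HAPROXY-DISCON */
	"DISCONNECTING",  /* wait for AGENT-DISCON */
	"EXIT",           /* report EOS/EOI, then END */
};

int main(void)
{
	size_t i;

	for (i = 0; i < sizeof(demo_appctx_path) / sizeof(*demo_appctx_path); i++)
		printf("applet step %zu: %s\n", i, demo_appctx_path[i]);
	return 0;
}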
+/* Used to generate a unique id for an engine. On success, it returns an
+ * allocated string. So it is the caller's responsibility to release it. If the
+ * allocation failed, it returns NULL. */
+static char *
+generate_pseudo_uuid()
+{
+	ha_generate_uuid(&trash);
+	return my_strndup(trash.area, trash.data);
+}
+
+/* set/add to <t> the elapsed time since <since> and now */
+static inline void
+spoe_update_stat_time(ullong *since, long *t)
+{
+	if (*t == -1)
+		*t = ns_to_ms(now_ns - *since);
+	else
+		*t += ns_to_ms(now_ns - *since);
+	*since = 0;
+}
+
+/********************************************************************
+ * Functions that encode/decode SPOE frames
+ ********************************************************************/
+/* Helper to get static string length, excluding the terminating null byte */
+#define SLEN(str) (sizeof(str)-1)
+
+/* Predefined keys used in HELLO/DISCONNECT frames */
+#define SUPPORTED_VERSIONS_KEY     "supported-versions"
+#define VERSION_KEY                "version"
+#define MAX_FRAME_SIZE_KEY         "max-frame-size"
+#define CAPABILITIES_KEY           "capabilities"
+#define ENGINE_ID_KEY              "engine-id"
+#define HEALTHCHECK_KEY            "healthcheck"
+#define STATUS_CODE_KEY            "status-code"
+#define MSG_KEY                    "message"
+
+struct spoe_version {
+	char *str;
+	int   min;
+	int   max;
+};
+
+/* All supported versions */
+static struct spoe_version supported_versions[] = {
+	/* 1.0 is now unsupported because of a bug about frame's flags */
+	{"2.0", 2000, 2000},
+	{NULL,  0, 0}
+};
+
+/* Comma-separated list of supported versions */
+#define SUPPORTED_VERSIONS_VAL  "2.0"
+
+/* Convert a string to a SPOE version value. The string must follow the format
+ * "MAJOR.MINOR". It will be converted into the integer (1000 * MAJOR + MINOR).
+ * If an error occurred, -1 is returned. */
+static int
+spoe_str_to_vsn(const char *str, size_t len)
+{
+	const char *p, *end;
+	int   maj, min, vsn;
+
+	p   = str;
+	end = str+len;
+	maj = min = 0;
+	vsn = -1;
+
+	/* skip leading spaces */
+	while (p < end && isspace((unsigned char)*p))
+		p++;
+
+	/* parse Major number, until the '.' */
+	while (*p != '.') {
+		if (p >= end || *p < '0' || *p > '9')
+			goto out;
+		maj *= 10;
+		maj += (*p - '0');
+		p++;
+	}
+
+	/* check Major version */
+	if (!maj)
+		goto out;
+
+	p++; /* skip the '.' */
+	if (p >= end || *p < '0' || *p > '9') /* Minor number is missing */
+		goto out;
+
+	/* Parse Minor number */
+	while (p < end) {
+		if (*p < '0' || *p > '9')
+			break;
+		min *= 10;
+		min += (*p - '0');
+		p++;
+	}
+
+	/* check Minor number */
+	if (min > 999)
+		goto out;
+
+	/* skip trailing spaces */
+	while (p < end && isspace((unsigned char)*p))
+		p++;
+	if (p != end)
+		goto out;
+
+	vsn = maj * 1000 + min;
+  out:
+	return vsn;
+}
+
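Every spoe_prepare_*() encoder below emits the same header: one frame-type byte, four flag bytes in network order, then the stream-id and frame-id as varints (the 4-byte frame-length prefix is added later by spoe_send_frame()). A minimal sketch of that layout, restricted to the single-byte varint case; in HAProxy's varint scheme values below 240 encode as the raw byte, larger ones need the multi-byte form:

#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

static int demo_frame_header(unsigned char *p, size_t room, uint8_t type,
                             uint32_t flags, uint8_t sid, uint8_t fid)
{
	uint32_t netflags = htonl(flags);

	if (room < 7 || sid >= 240 || fid >= 240)
		return -1;
	*p++ = type;             /* e.g. SPOE_FRM_T_HAPROXY_NOTIFY */
	memcpy(p, &netflags, 4); /* SPOE_FRM_FL_FIN and friends, network order */
	p += 4;
	*p++ = sid;              /* stream-id: varints < 240 are the raw byte */
	*p++ = fid;              /* frame-id: same single-byte encoding */
	return 7;                /* header length; the K/V payload follows */
}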
+/* Encode the HELLO frame sent by HAProxy to an agent. It returns the number of
+ * encoded bytes in the frame on success, 0 if an encoding error occurred and -1
+ * if a fatal error occurred. */
+static int
+spoe_prepare_hahello_frame(struct appctx *appctx, char *frame, size_t size)
+{
+	struct buffer     *chk;
+	struct spoe_agent *agent = SPOE_APPCTX(appctx)->agent;
+	char              *p, *end;
+	unsigned int       flags = SPOE_FRM_FL_FIN;
+	size_t             sz;
+
+	p   = frame;
+	end = frame+size;
+
+	/* Set Frame type */
+	*p++ = SPOE_FRM_T_HAPROXY_HELLO;
+
+	/* Set flags */
+	flags = htonl(flags);
+	memcpy(p, (char *)&flags, 4);
+	p += 4;
+
+	/* No stream-id and frame-id for HELLO frames */
+	*p++ = 0; *p++ = 0;
+
+	/* There are 3 mandatory items: "supported-versions", "max-frame-size"
+	 * and "capabilities" */
+
+	/* "supported-versions" K/V item */
+	sz = SLEN(SUPPORTED_VERSIONS_KEY);
+	if (spoe_encode_buffer(SUPPORTED_VERSIONS_KEY, sz, &p, end) == -1)
+		goto too_big;
+
+	*p++ = SPOE_DATA_T_STR;
+	sz = SLEN(SUPPORTED_VERSIONS_VAL);
+	if (spoe_encode_buffer(SUPPORTED_VERSIONS_VAL, sz, &p, end) == -1)
+		goto too_big;
+
+	/* "max-frame-size" K/V item */
+	sz = SLEN(MAX_FRAME_SIZE_KEY);
+	if (spoe_encode_buffer(MAX_FRAME_SIZE_KEY, sz, &p, end) == -1)
+		goto too_big;
+
+	*p++ = SPOE_DATA_T_UINT32;
+	if (encode_varint(SPOE_APPCTX(appctx)->max_frame_size, &p, end) == -1)
+		goto too_big;
+
+	/* "capabilities" K/V item */
+	sz = SLEN(CAPABILITIES_KEY);
+	if (spoe_encode_buffer(CAPABILITIES_KEY, sz, &p, end) == -1)
+		goto too_big;
+
+	*p++ = SPOE_DATA_T_STR;
+	chk  = get_trash_chunk();
+	if (agent != NULL && (agent->flags & SPOE_FL_PIPELINING)) {
+		memcpy(chk->area, "pipelining", 10);
+		chk->data += 10;
+	}
+	if (agent != NULL && (agent->flags & SPOE_FL_ASYNC)) {
+		if (chk->data) chk->area[chk->data++] = ',';
+		memcpy(chk->area+chk->data, "async", 5);
+		chk->data += 5;
+	}
+	if (agent != NULL && (agent->flags & SPOE_FL_RCV_FRAGMENTATION)) {
+		if (chk->data) chk->area[chk->data++] = ',';
+		memcpy(chk->area+chk->data, "fragmentation", 13);
+		chk->data += 13;
+	}
+	if (spoe_encode_buffer(chk->area, chk->data, &p, end) == -1)
+		goto too_big;
+
+	/* (optional) "engine-id" K/V item, if present */
+	if (agent != NULL && agent->rt[tid].engine_id != NULL) {
+		sz = SLEN(ENGINE_ID_KEY);
+		if (spoe_encode_buffer(ENGINE_ID_KEY, sz, &p, end) == -1)
+			goto too_big;
+
+		*p++ = SPOE_DATA_T_STR;
+		sz = strlen(agent->rt[tid].engine_id);
+		if (spoe_encode_buffer(agent->rt[tid].engine_id, sz, &p, end) == -1)
+			goto too_big;
+	}
+
+	return (p - frame);
+
+  too_big:
+	SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_TOO_BIG;
+	return 0;
+}
+
+/* Encode DISCONNECT frame sent by HAProxy to an agent. It returns the number of
+ * encoded bytes in the frame on success, 0 if an encoding error occurred and -1
+ * if a fatal error occurred.
*/ +static int +spoe_prepare_hadiscon_frame(struct appctx *appctx, char *frame, size_t size) +{ + const char *reason; + char *p, *end; + unsigned int flags = SPOE_FRM_FL_FIN; + size_t sz; + + p = frame; + end = frame+size; + + /* Set Frame type */ + *p++ = SPOE_FRM_T_HAPROXY_DISCON; + + /* Set flags */ + flags = htonl(flags); + memcpy(p, (char *)&flags, 4); + p += 4; + + /* No stream-id and frame-id for DISCONNECT frames */ + *p++ = 0; *p++ = 0; + + if (SPOE_APPCTX(appctx)->status_code >= SPOE_FRM_ERRS) + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_UNKNOWN; + + /* There are 2 mandatory items: "status-code" and "message" */ + + /* "status-code" K/V item */ + sz = SLEN(STATUS_CODE_KEY); + if (spoe_encode_buffer(STATUS_CODE_KEY, sz, &p, end) == -1) + goto too_big; + + *p++ = SPOE_DATA_T_UINT32; + if (encode_varint(SPOE_APPCTX(appctx)->status_code, &p, end) == -1) + goto too_big; + + /* "message" K/V item */ + sz = SLEN(MSG_KEY); + if (spoe_encode_buffer(MSG_KEY, sz, &p, end) == -1) + goto too_big; + + /*Get the message corresponding to the status code */ + reason = spoe_frm_err_reasons[SPOE_APPCTX(appctx)->status_code]; + + *p++ = SPOE_DATA_T_STR; + sz = strlen(reason); + if (spoe_encode_buffer(reason, sz, &p, end) == -1) + goto too_big; + + return (p - frame); + + too_big: + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_TOO_BIG; + return 0; +} + +/* Encode the NOTIFY frame sent by HAProxy to an agent. It returns the number of + * encoded bytes in the frame on success, 0 if an encoding error occurred and -1 + * if a fatal error occurred. */ +static int +spoe_prepare_hanotify_frame(struct appctx *appctx, struct spoe_context *ctx, + char *frame, size_t size) +{ + char *p, *end; + unsigned int stream_id, frame_id; + unsigned int flags = SPOE_FRM_FL_FIN; + size_t sz; + + p = frame; + end = frame+size; + + stream_id = ctx->stream_id; + frame_id = ctx->frame_id; + + if (ctx->flags & SPOE_CTX_FL_FRAGMENTED) { + /* The fragmentation is not supported by the applet */ + if (!(SPOE_APPCTX(appctx)->flags & SPOE_APPCTX_FL_FRAGMENTATION)) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_FRAG_NOT_SUPPORTED; + return -1; + } + flags = ctx->frag_ctx.flags; + } + + /* Set Frame type */ + *p++ = SPOE_FRM_T_HAPROXY_NOTIFY; + + /* Set flags */ + flags = htonl(flags); + memcpy(p, (char *)&flags, 4); + p += 4; + + /* Set stream-id and frame-id */ + if (encode_varint(stream_id, &p, end) == -1) + goto too_big; + if (encode_varint(frame_id, &p, end) == -1) + goto too_big; + + /* Copy encoded messages, if possible */ + sz = b_data(&ctx->buffer); + if (p + sz >= end) + goto too_big; + memcpy(p, b_head(&ctx->buffer), sz); + p += sz; + + return (p - frame); + + too_big: + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_TOO_BIG; + return 0; +} + +/* Encode next part of a fragmented frame sent by HAProxy to an agent. It + * returns the number of encoded bytes in the frame on success, 0 if an encoding + * error occurred and -1 if a fatal error occurred. */ +static int +spoe_prepare_hafrag_frame(struct appctx *appctx, struct spoe_context *ctx, + char *frame, size_t size) +{ + char *p, *end; + unsigned int stream_id, frame_id; + unsigned int flags; + size_t sz; + + p = frame; + end = frame+size; + + /* <ctx> is null when the stream has aborted the processing of a + * fragmented frame. In this case, we must notify the corresponding + * agent using ids stored in <frag_ctx>. 
*/ + if (ctx == NULL) { + flags = (SPOE_FRM_FL_FIN|SPOE_FRM_FL_ABRT); + stream_id = SPOE_APPCTX(appctx)->frag_ctx.cursid; + frame_id = SPOE_APPCTX(appctx)->frag_ctx.curfid; + } + else { + flags = ctx->frag_ctx.flags; + stream_id = ctx->stream_id; + frame_id = ctx->frame_id; + } + + /* Set Frame type */ + *p++ = SPOE_FRM_T_UNSET; + + /* Set flags */ + flags = htonl(flags); + memcpy(p, (char *)&flags, 4); + p += 4; + + /* Set stream-id and frame-id */ + if (encode_varint(stream_id, &p, end) == -1) + goto too_big; + if (encode_varint(frame_id, &p, end) == -1) + goto too_big; + + if (ctx == NULL) + goto end; + + /* Copy encoded messages, if possible */ + sz = b_data(&ctx->buffer); + if (p + sz >= end) + goto too_big; + memcpy(p, b_head(&ctx->buffer), sz); + p += sz; + + end: + return (p - frame); + + too_big: + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_TOO_BIG; + return 0; +} + +/* Decode and process the HELLO frame sent by an agent. It returns the number of + * read bytes on success, 0 if a decoding error occurred, and -1 if a fatal + * error occurred. */ +static int +spoe_handle_agenthello_frame(struct appctx *appctx, char *frame, size_t size) +{ + struct spoe_agent *agent = SPOE_APPCTX(appctx)->agent; + char *p, *end; + int vsn, max_frame_size; + unsigned int flags; + + p = frame; + end = frame + size; + + /* Check frame type */ + if (*p++ != SPOE_FRM_T_AGENT_HELLO) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + + if (size < 7 /* TYPE + METADATA */) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + + /* Retrieve flags */ + memcpy((char *)&flags, p, 4); + flags = ntohl(flags); + p += 4; + + /* Fragmentation is not supported for HELLO frame */ + if (!(flags & SPOE_FRM_FL_FIN)) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_FRAG_NOT_SUPPORTED; + return -1; + } + + /* stream-id and frame-id must be cleared */ + if (*p != 0 || *(p+1) != 0) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + p += 2; + + /* There are 3 mandatory items: "version", "max-frame-size" and + * "capabilities" */ + + /* Loop on K/V items */ + vsn = max_frame_size = flags = 0; + while (p < end) { + char *str; + uint64_t sz; + int ret; + + /* Decode the item key */ + ret = spoe_decode_buffer(&p, end, &str, &sz); + if (ret == -1 || !sz) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + + /* Check "version" K/V item */ + if (sz >= strlen(VERSION_KEY) && !memcmp(str, VERSION_KEY, strlen(VERSION_KEY))) { + int i, type = *p++; + + /* The value must be a string */ + if ((type & SPOE_DATA_T_MASK) != SPOE_DATA_T_STR) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + if (spoe_decode_buffer(&p, end, &str, &sz) == -1) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + + vsn = spoe_str_to_vsn(str, sz); + if (vsn == -1) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_BAD_VSN; + return -1; + } + for (i = 0; supported_versions[i].str != NULL; ++i) { + if (vsn >= supported_versions[i].min && + vsn <= supported_versions[i].max) + break; + } + if (supported_versions[i].str == NULL) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_BAD_VSN; + return -1; + } + } + /* Check "max-frame-size" K/V item */ + else if (sz >= strlen(MAX_FRAME_SIZE_KEY) && !memcmp(str, MAX_FRAME_SIZE_KEY, strlen(MAX_FRAME_SIZE_KEY))) { + int type = *p++; + + /* The value must be integer */ + if ((type & SPOE_DATA_T_MASK) != SPOE_DATA_T_INT32 && + (type & SPOE_DATA_T_MASK) != 
SPOE_DATA_T_INT64 && + (type & SPOE_DATA_T_MASK) != SPOE_DATA_T_UINT32 && + (type & SPOE_DATA_T_MASK) != SPOE_DATA_T_UINT64) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + if (decode_varint(&p, end, &sz) == -1) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + if (sz < MIN_FRAME_SIZE || + sz > SPOE_APPCTX(appctx)->max_frame_size) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_BAD_FRAME_SIZE; + return -1; + } + max_frame_size = sz; + } + /* Check "capabilities" K/V item */ + else if (sz >= strlen(CAPABILITIES_KEY) && !memcmp(str, CAPABILITIES_KEY, strlen(CAPABILITIES_KEY))) { + int type = *p++; + + /* The value must be a string */ + if ((type & SPOE_DATA_T_MASK) != SPOE_DATA_T_STR) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + if (spoe_decode_buffer(&p, end, &str, &sz) == -1) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + + while (sz) { + char *delim; + + /* Skip leading spaces */ + for (; isspace((unsigned char)*str) && sz; str++, sz--); + + if (sz >= 10 && !strncmp(str, "pipelining", 10)) { + str += 10; sz -= 10; + if (!sz || isspace((unsigned char)*str) || *str == ',') + flags |= SPOE_APPCTX_FL_PIPELINING; + } + else if (sz >= 5 && !strncmp(str, "async", 5)) { + str += 5; sz -= 5; + if (!sz || isspace((unsigned char)*str) || *str == ',') + flags |= SPOE_APPCTX_FL_ASYNC; + } + else if (sz >= 13 && !strncmp(str, "fragmentation", 13)) { + str += 13; sz -= 13; + if (!sz || isspace((unsigned char)*str) || *str == ',') + flags |= SPOE_APPCTX_FL_FRAGMENTATION; + } + + /* Get the next comma or break */ + if (!sz || (delim = memchr(str, ',', sz)) == NULL) + break; + delim++; + sz -= (delim - str); + str = delim; + } + } + else { + /* Silently ignore unknown item */ + if (spoe_skip_data(&p, end) == -1) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + } + } + + /* Final checks */ + if (!vsn) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_NO_VSN; + return -1; + } + if (!max_frame_size) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_NO_FRAME_SIZE; + return -1; + } + if (!agent) + flags &= ~(SPOE_APPCTX_FL_PIPELINING|SPOE_APPCTX_FL_ASYNC); + else { + if ((flags & SPOE_APPCTX_FL_PIPELINING) && !(agent->flags & SPOE_FL_PIPELINING)) + flags &= ~SPOE_APPCTX_FL_PIPELINING; + if ((flags & SPOE_APPCTX_FL_ASYNC) && !(agent->flags & SPOE_FL_ASYNC)) + flags &= ~SPOE_APPCTX_FL_ASYNC; + } + + SPOE_APPCTX(appctx)->version = (unsigned int)vsn; + SPOE_APPCTX(appctx)->max_frame_size = (unsigned int)max_frame_size; + SPOE_APPCTX(appctx)->flags |= flags; + + return (p - frame); +} + +/* Decode DISCONNECT frame sent by an agent. It returns the number of by read + * bytes on success, 0 if the frame can be ignored and -1 if an error + * occurred. 
*/ +static int +spoe_handle_agentdiscon_frame(struct appctx *appctx, char *frame, size_t size) +{ + char *p, *end; + unsigned int flags; + + p = frame; + end = frame + size; + + /* Check frame type */ + if (*p++ != SPOE_FRM_T_AGENT_DISCON) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + + if (size < 7 /* TYPE + METADATA */) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + + /* Retrieve flags */ + memcpy((char *)&flags, p, 4); + flags = ntohl(flags); + p += 4; + + /* Fragmentation is not supported for DISCONNECT frame */ + if (!(flags & SPOE_FRM_FL_FIN)) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_FRAG_NOT_SUPPORTED; + return -1; + } + + /* stream-id and frame-id must be cleared */ + if (*p != 0 || *(p+1) != 0) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + p += 2; + + /* There are 2 mandatory items: "status-code" and "message" */ + + /* Loop on K/V items */ + while (p < end) { + char *str; + uint64_t sz; + int ret; + + /* Decode the item key */ + ret = spoe_decode_buffer(&p, end, &str, &sz); + if (ret == -1 || !sz) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + + /* Check "status-code" K/V item */ + if (sz >= strlen(STATUS_CODE_KEY) && !memcmp(str, STATUS_CODE_KEY, strlen(STATUS_CODE_KEY))) { + int type = *p++; + + /* The value must be an integer */ + if ((type & SPOE_DATA_T_MASK) != SPOE_DATA_T_INT32 && + (type & SPOE_DATA_T_MASK) != SPOE_DATA_T_INT64 && + (type & SPOE_DATA_T_MASK) != SPOE_DATA_T_UINT32 && + (type & SPOE_DATA_T_MASK) != SPOE_DATA_T_UINT64) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + if (decode_varint(&p, end, &sz) == -1) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + SPOE_APPCTX(appctx)->status_code = sz; + } + + /* Check "message" K/V item */ + else if (sz >= strlen(MSG_KEY) && !memcmp(str, MSG_KEY, strlen(MSG_KEY))) { + int type = *p++; + + /* The value must be a string */ + if ((type & SPOE_DATA_T_MASK) != SPOE_DATA_T_STR) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + ret = spoe_decode_buffer(&p, end, &str, &sz); + if (ret == -1 || sz > 255) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } +#if defined(DEBUG_SPOE) || defined(DEBUG_FULL) + SPOE_APPCTX(appctx)->reason = str; + SPOE_APPCTX(appctx)->rlen = sz; +#endif + } + else { + /* Silently ignore unknown item */ + if (spoe_skip_data(&p, end) == -1) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + } + } + + return (p - frame); +} + + +/* Decode ACK frame sent by an agent. It returns the number of read bytes on + * success, 0 if the frame can be ignored and -1 if an error occurred. 
*/ +static int +spoe_handle_agentack_frame(struct appctx *appctx, struct spoe_context **ctx, + char *frame, size_t size) +{ + struct spoe_agent *agent = SPOE_APPCTX(appctx)->agent; + char *p, *end; + uint64_t stream_id, frame_id; + int len; + unsigned int flags; + + p = frame; + end = frame + size; + *ctx = NULL; + + /* Check frame type */ + if (*p++ != SPOE_FRM_T_AGENT_ACK) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + + if (size < 7 /* TYPE + METADATA */) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + + /* Retrieve flags */ + memcpy((char *)&flags, p, 4); + flags = ntohl(flags); + p += 4; + + /* Fragmentation is not supported for now */ + if (!(flags & SPOE_FRM_FL_FIN)) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_FRAG_NOT_SUPPORTED; + return -1; + } + + /* Get the stream-id and the frame-id */ + if (decode_varint(&p, end, &stream_id) == -1) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + if (decode_varint(&p, end, &frame_id) == -1) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID; + return 0; + } + + /* Try to find the corresponding SPOE context */ + if (SPOE_APPCTX(appctx)->flags & SPOE_APPCTX_FL_ASYNC) { + list_for_each_entry((*ctx), &agent->rt[tid].waiting_queue, list) { + if ((*ctx)->stream_id == (unsigned int)stream_id && + (*ctx)->frame_id == (unsigned int)frame_id) + goto found; + } + } + else { + list_for_each_entry((*ctx), &SPOE_APPCTX(appctx)->waiting_queue, list) { + if ((*ctx)->stream_id == (unsigned int)stream_id && + (*ctx)->frame_id == (unsigned int)frame_id) + goto found; + } + } + + if (SPOE_APPCTX(appctx)->frag_ctx.ctx && + SPOE_APPCTX(appctx)->frag_ctx.cursid == (unsigned int)stream_id && + SPOE_APPCTX(appctx)->frag_ctx.curfid == (unsigned int)frame_id) { + + /* ABRT bit is set for an unfinished fragmented frame */ + if (flags & SPOE_FRM_FL_ABRT) { + *ctx = SPOE_APPCTX(appctx)->frag_ctx.ctx; + (*ctx)->state = SPOE_CTX_ST_ERROR; + (*ctx)->status_code = SPOE_CTX_ERR_FRAG_FRAME_ABRT; + /* Ignore the payload */ + goto end; + } + /* TODO: Handle more flags for fragmented frames: RESUME, FINISH... 
*/
+		/* For now, we ignore the ack */
+		SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_INVALID;
+		return 0;
+	}
+
+	/* No Stream found, ignore the frame */
+	SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: appctx=%p"
+		    " - Ignore ACK frame"
+		    " - stream-id=%u - frame-id=%u\n",
+		    (int)date.tv_sec, (int)date.tv_usec, agent->id,
+		    __FUNCTION__, appctx,
+		    (unsigned int)stream_id, (unsigned int)frame_id);
+
+	SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_FRAMEID_NOTFOUND;
+	if (appctx->st0 == SPOE_APPCTX_ST_WAITING_SYNC_ACK) {
+		/* Report an error if we are waiting for the ack of another
+		 * frame, but not if there is no longer any frame waiting for
+		 * an ack (timeout)
+		 */
+		if (!LIST_ISEMPTY(&SPOE_APPCTX(appctx)->waiting_queue) ||
+		    SPOE_APPCTX(appctx)->frag_ctx.ctx)
+			return -1;
+		appctx->st0 = SPOE_APPCTX_ST_PROCESSING;
+		SPOE_APPCTX(appctx)->cur_fpa = 0;
+	}
+	return 0;
+
+  found:
+	if (!spoe_acquire_buffer(&SPOE_APPCTX(appctx)->buffer,
+				 &SPOE_APPCTX(appctx)->buffer_wait)) {
+		*ctx = NULL;
+		return 1; /* Retry later */
+	}
+
+	/* Copy encoded actions */
+	len = (end - p);
+	memcpy(b_head(&SPOE_APPCTX(appctx)->buffer), p, len);
+	b_set_data(&SPOE_APPCTX(appctx)->buffer, len);
+	p += len;
+
+	/* Transfer the buffer ownership to the SPOE context */
+	(*ctx)->buffer = SPOE_APPCTX(appctx)->buffer;
+	SPOE_APPCTX(appctx)->buffer = BUF_NULL;
+
+	(*ctx)->state = SPOE_CTX_ST_DONE;
+
+  end:
+	SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: appctx=%p"
+		    " - ACK frame received"
+		    " - ctx=%p - stream-id=%u - frame-id=%u - flags=0x%08x\n",
+		    (int)date.tv_sec, (int)date.tv_usec, agent->id,
+		    __FUNCTION__, appctx, *ctx, (*ctx)->stream_id,
+		    (*ctx)->frame_id, flags);
+	return (p - frame);
+}
+
+/* This function is used in cfgparse.c and declared in proto/checks.h. It
+ * prepares the request to send to agents during a healthcheck. It returns 0 on
+ * success and -1 if an error occurred. */
+int
+spoe_prepare_healthcheck_request(char **req, int *len)
+{
+	struct appctx      appctx;
+	struct spoe_appctx spoe_appctx;
+	char  *frame, *end, buf[MAX_FRAME_SIZE+4];
+	size_t sz;
+	int    ret;
+
+	memset(&appctx,      0, sizeof(appctx));
+	memset(&spoe_appctx, 0, sizeof(spoe_appctx));
+	memset(buf,          0, sizeof(buf));
+
+	appctx.svcctx = &spoe_appctx;
+	SPOE_APPCTX(&appctx)->max_frame_size = MAX_FRAME_SIZE;
+
+	frame = buf+4; /* Reserve the first 4 bytes for the frame size */
+	end   = frame + MAX_FRAME_SIZE;
+
+	ret = spoe_prepare_hahello_frame(&appctx, frame, MAX_FRAME_SIZE);
+	if (ret <= 0)
+		return -1;
+	frame += ret;
+
+	/* Add "healthcheck" K/V item */
+	sz = SLEN(HEALTHCHECK_KEY);
+	if (spoe_encode_buffer(HEALTHCHECK_KEY, sz, &frame, end) == -1)
+		return -1;
+	*frame++ = (SPOE_DATA_T_BOOL | SPOE_DATA_FL_TRUE);
+
+	*len = frame - buf;
+	sz   = htonl(*len - 4);
+	memcpy(buf, (char *)&sz, 4);
+
+	if ((*req = malloc(*len)) == NULL)
+		return -1;
+	memcpy(*req, buf, *len);
+	return 0;
+}
+
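A hypothetical caller-side sketch (names are illustrative, not from the patch) of how the healthcheck helpers are meant to be driven: build the request with the function above, exchange it with the agent, then hand the reply frame, stripped of its 4-byte length prefix, to spoe_handle_healthcheck_response() defined just below:

#include <stdlib.h>

/* assumes the prototypes of the two healthcheck helpers from this file */
static int demo_spoe_agent_check(void)
{
	char *req = NULL;
	char  errmsg[256];
	int   len = 0;

	if (spoe_prepare_healthcheck_request(&req, &len) == -1)
		return -1;
	/* ... write the <len> bytes of <req> to the agent, then read one
	 * reply frame; <reply>/<size> below stand for that frame without
	 * its 4-byte length prefix ... */
	/* if (spoe_handle_healthcheck_response(reply, size, errmsg, sizeof(errmsg)) == -1) */
	/*	the check fails and <errmsg> holds the reason; */
	free(req);
	return 0;
}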
+/* This function is used in checks.c and declared in proto/checks.h. It decodes
+ * the response received from an agent during a healthcheck. It returns 0 on
+ * success and -1 if an error occurred. */
+int
+spoe_handle_healthcheck_response(char *frame, size_t size, char *err, int errlen)
+{
+	struct appctx      appctx;
+	struct spoe_appctx spoe_appctx;
+
+	memset(&appctx,      0, sizeof(appctx));
+	memset(&spoe_appctx, 0, sizeof(spoe_appctx));
+
+	appctx.svcctx = &spoe_appctx;
+	SPOE_APPCTX(&appctx)->max_frame_size = MAX_FRAME_SIZE;
+
+	if (*frame == SPOE_FRM_T_AGENT_DISCON) {
+		spoe_handle_agentdiscon_frame(&appctx, frame, size);
+		goto error;
+	}
+	if (spoe_handle_agenthello_frame(&appctx, frame, size) <= 0)
+		goto error;
+
+	return 0;
+
+  error:
+	if (SPOE_APPCTX(&appctx)->status_code >= SPOE_FRM_ERRS)
+		SPOE_APPCTX(&appctx)->status_code = SPOE_FRM_ERR_UNKNOWN;
+	strncpy(err, spoe_frm_err_reasons[SPOE_APPCTX(&appctx)->status_code], errlen);
+	return -1;
+}
+
+/* Send a SPOE frame to an agent. It returns -1 when an error occurred, 0 when
+ * the frame can be ignored, 1 to retry later, and the frame length on
+ * success. */
+static int
+spoe_send_frame(struct appctx *appctx, char *buf, size_t framesz)
+{
+	struct stconn *sc = appctx_sc(appctx);
+	int      ret;
+	uint32_t netint;
+
+	/* 4 bytes are reserved at the beginning of <buf> to store the frame
+	 * length. */
+	netint = htonl(framesz);
+	memcpy(buf, (char *)&netint, 4);
+	ret = applet_putblk(appctx, buf, framesz+4);
+	if (ret <= 0) {
+		if (ret == -3 && b_is_null(&sc_ic(sc)->buf)) {
+			/* WT: is this still needed for the case ret==-3 ? */
+			sc_need_room(sc, 0);
+			return 1; /* retry */
+		}
+		SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_IO;
+		return -1; /* error */
+	}
+	return framesz;
+}
+
+/* Receive a SPOE frame from an agent. It returns -1 when an error occurred, 0
+ * when the frame can be ignored, 1 to retry later and the frame length on
+ * success. */
+static int
+spoe_recv_frame(struct appctx *appctx, char *buf, size_t framesz)
+{
+	struct stconn *sc = appctx_sc(appctx);
+	int      ret;
+	uint32_t netint;
+
+	ret = co_getblk(sc_oc(sc), (char *)&netint, 4, 0);
+	if (ret > 0) {
+		framesz = ntohl(netint);
+		if (framesz > SPOE_APPCTX(appctx)->max_frame_size) {
+			SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_TOO_BIG;
+			return -1;
+		}
+		ret = co_getblk(sc_oc(sc), buf, framesz, 4);
+	}
+	if (ret <= 0) {
+		if (ret == 0) {
+			return 1; /* retry */
+		}
+		SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_IO;
+		return -1; /* error */
+	}
+	return framesz;
+}
+
+/********************************************************************
+ * Functions that manage the SPOE applet
+ ********************************************************************/
+static int
+spoe_wakeup_appctx(struct appctx *appctx)
+{
+	applet_will_consume(appctx);
+	applet_have_more_data(appctx);
+	appctx_wakeup(appctx);
+	return 1;
+}
+
+/* Callback function that catches applet timeouts. If a timeout occurred, the
+ * <appctx->st1> flag is set and the SPOE applet is woken up.
*/ +static struct task * +spoe_process_appctx(struct task * task, void *context, unsigned int state) +{ + struct appctx *appctx = context; + + appctx->st1 = SPOE_APPCTX_ERR_NONE; + if (tick_is_expired(task->expire, now_ms)) { + task->expire = TICK_ETERNITY; + appctx->st1 = SPOE_APPCTX_ERR_TOUT; + } + spoe_wakeup_appctx(appctx); + return task; +} + +static int +spoe_init_appctx(struct appctx *appctx) +{ + struct spoe_appctx *spoe_appctx = SPOE_APPCTX(appctx); + struct spoe_agent *agent = spoe_appctx->agent; + struct task *task; + struct stream *s; + + if ((task = task_new_here()) == NULL) + goto out_error; + task->process = spoe_process_appctx; + task->context = appctx; + + if (appctx_finalize_startup(appctx, &agent->spoe_conf->agent_fe, &BUF_NULL) == -1) + goto out_free_task; + + spoe_appctx->owner = appctx; + spoe_appctx->task = task; + + LIST_INIT(&spoe_appctx->buffer_wait.list); + spoe_appctx->buffer_wait.target = appctx; + spoe_appctx->buffer_wait.wakeup_cb = (int (*)(void *))spoe_wakeup_appctx; + + s = appctx_strm(appctx); + stream_set_backend(s, agent->b.be); + + /* applet is waiting for data */ + applet_need_more_data(appctx); + + s->do_log = NULL; + s->scb->flags |= SC_FL_RCV_ONCE; + + HA_SPIN_LOCK(SPOE_APPLET_LOCK, &agent->rt[tid].lock); + LIST_APPEND(&agent->rt[tid].applets, &spoe_appctx->list); + HA_SPIN_UNLOCK(SPOE_APPLET_LOCK, &agent->rt[tid].lock); + _HA_ATOMIC_INC(&agent->counters.applets); + + appctx->st0 = SPOE_APPCTX_ST_CONNECT; + task_wakeup(spoe_appctx->task, TASK_WOKEN_INIT); + return 0; + out_free_task: + task_destroy(task); + out_error: + return -1; +} + +/* Callback function that releases a SPOE applet. This happens when the + * connection with the agent is closed. */ +static void +spoe_release_appctx(struct appctx *appctx) +{ + struct spoe_appctx *spoe_appctx = SPOE_APPCTX(appctx); + struct spoe_agent *agent; + struct spoe_context *ctx, *back; + + if (spoe_appctx == NULL) + return; + + appctx->svcctx = NULL; + agent = spoe_appctx->agent; + + SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: appctx=%p\n", + (int)date.tv_sec, (int)date.tv_usec, agent->id, + __FUNCTION__, appctx); + + /* Remove applet from the list of running applets */ + _HA_ATOMIC_DEC(&agent->counters.applets); + HA_SPIN_LOCK(SPOE_APPLET_LOCK, &agent->rt[tid].lock); + if (!LIST_ISEMPTY(&spoe_appctx->list)) { + LIST_DELETE(&spoe_appctx->list); + LIST_INIT(&spoe_appctx->list); + } + HA_SPIN_UNLOCK(SPOE_APPLET_LOCK, &agent->rt[tid].lock); + + /* Shutdown the server connection, if needed */ + if (appctx->st0 != SPOE_APPCTX_ST_END) { + if (appctx->st0 == SPOE_APPCTX_ST_IDLE) { + eb32_delete(&spoe_appctx->node); + _HA_ATOMIC_DEC(&agent->counters.idles); + agent->rt[tid].idles--; + } + + appctx->st0 = SPOE_APPCTX_ST_END; + if (spoe_appctx->status_code == SPOE_FRM_ERR_NONE) + spoe_appctx->status_code = SPOE_FRM_ERR_IO; + } + + /* Destroy the task attached to this applet */ + task_destroy(spoe_appctx->task); + + /* Report an error to all streams in the appctx waiting queue */ + list_for_each_entry_safe(ctx, back, &spoe_appctx->waiting_queue, list) { + LIST_DELETE(&ctx->list); + LIST_INIT(&ctx->list); + _HA_ATOMIC_DEC(&agent->counters.nb_waiting); + spoe_update_stat_time(&ctx->stats.wait_ts, &ctx->stats.t_waiting); + ctx->spoe_appctx = NULL; + ctx->state = SPOE_CTX_ST_ERROR; + ctx->status_code = (spoe_appctx->status_code + 0x100); + task_wakeup(ctx->strm->task, TASK_WOKEN_MSG); + } + + /* If the applet was processing a fragmented frame, report an error to + * the corresponding stream. 
*/
+	if (spoe_appctx->frag_ctx.ctx) {
+		ctx = spoe_appctx->frag_ctx.ctx;
+		ctx->spoe_appctx = NULL;
+		ctx->state = SPOE_CTX_ST_ERROR;
+		ctx->status_code = (spoe_appctx->status_code + 0x100);
+		task_wakeup(ctx->strm->task, TASK_WOKEN_MSG);
+	}
+
+	if (!LIST_ISEMPTY(&agent->rt[tid].applets)) {
+		/* If there are still some running applets, remove reference on
+		 * the current one from streams in the async waiting queue. In
+		 * async mode, the ACK may be received from another appctx.
+		 */
+		list_for_each_entry_safe(ctx, back, &agent->rt[tid].waiting_queue, list) {
+			if (ctx->spoe_appctx == spoe_appctx)
+				ctx->spoe_appctx = NULL;
+		}
+		goto end;
+	}
+	else {
+		/* It is the last running applet and the sending and async
+		 * waiting queues are not empty. So try to start a new applet if
+		 * HAProxy is not stopping. On success, we remove reference on
+		 * the current appctx from streams in the async waiting queue.
+		 * In async mode, the ACK may be received from another appctx.
+		 */
+		if (!stopping &&
+		    (!LIST_ISEMPTY(&agent->rt[tid].sending_queue) || !LIST_ISEMPTY(&agent->rt[tid].waiting_queue)) &&
+		    spoe_create_appctx(agent->spoe_conf)) {
+			list_for_each_entry_safe(ctx, back, &agent->rt[tid].waiting_queue, list) {
+				if (ctx->spoe_appctx == spoe_appctx)
+					ctx->spoe_appctx = NULL;
+			}
+			goto end;
+		}
+
+		/* Otherwise, report an error to all streams in the sending and
+		 * async waiting queues.
+		 */
+		list_for_each_entry_safe(ctx, back, &agent->rt[tid].sending_queue, list) {
+			LIST_DELETE(&ctx->list);
+			LIST_INIT(&ctx->list);
+			_HA_ATOMIC_DEC(&agent->counters.nb_sending);
+			spoe_update_stat_time(&ctx->stats.queue_ts, &ctx->stats.t_queue);
+			ctx->spoe_appctx = NULL;
+			ctx->state = SPOE_CTX_ST_ERROR;
+			ctx->status_code = (spoe_appctx->status_code + 0x100);
+			task_wakeup(ctx->strm->task, TASK_WOKEN_MSG);
+		}
+		list_for_each_entry_safe(ctx, back, &agent->rt[tid].waiting_queue, list) {
+			LIST_DELETE(&ctx->list);
+			LIST_INIT(&ctx->list);
+			_HA_ATOMIC_DEC(&agent->counters.nb_waiting);
+			spoe_update_stat_time(&ctx->stats.wait_ts, &ctx->stats.t_waiting);
+			ctx->spoe_appctx = NULL;
+			ctx->state = SPOE_CTX_ST_ERROR;
+			ctx->status_code = (spoe_appctx->status_code + 0x100);
+			task_wakeup(ctx->strm->task, TASK_WOKEN_MSG);
+		}
+	}
+
+  end:
+	/* Release allocated memory */
+	spoe_release_buffer(&spoe_appctx->buffer,
+			    &spoe_appctx->buffer_wait);
+	pool_free(pool_head_spoe_appctx, spoe_appctx);
+
+	/* Update runtime agent info */
+	agent->rt[tid].frame_size = agent->max_frame_size;
+	list_for_each_entry(spoe_appctx, &agent->rt[tid].applets, list)
+		HA_ATOMIC_UPDATE_MIN(&agent->rt[tid].frame_size, spoe_appctx->max_frame_size);
+}
+
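spoe_release_appctx() above propagates its frame-level status to the affected streams as ctx->status_code = spoe_appctx->status_code + 0x100. The 0x100 offset appears to keep relayed SPOE_FRM_ERR_* codes out of the range used by the stream-level SPOE_CTX_ERR_* values; a tiny demo of the convention (the numeric values are assumptions for illustration):

#include <stdio.h>

int main(void)
{
	unsigned int frm_err = 2;                  /* say, a frame-level timeout code */
	unsigned int ctx_status = frm_err + 0x100; /* as stored in ctx->status_code */

	if (ctx_status >= 0x100)
		printf("relayed frame error: %u\n", ctx_status - 0x100);
	else
		printf("stream-level error: %u\n", ctx_status);
	return 0;
}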
+static int
+spoe_handle_connect_appctx(struct appctx *appctx)
+{
+	struct stconn     *sc = appctx_sc(appctx);
+	struct spoe_agent *agent = SPOE_APPCTX(appctx)->agent;
+	char *frame, *buf;
+	int   ret;
+
+	/* if the connection is not established, inform the stream that we want
+	 * to be notified whenever the connection completes.
+	 */
+	if (sc_opposite(sc)->state < SC_ST_EST) {
+		applet_need_more_data(appctx);
+		se_need_remote_conn(appctx->sedesc);
+		applet_have_more_data(appctx);
+		goto stop;
+	}
+
+	if (appctx->st1 == SPOE_APPCTX_ERR_TOUT) {
+		SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: appctx=%p"
+			    " - Connection timed out\n",
+			    (int)date.tv_sec, (int)date.tv_usec, agent->id,
+			    __FUNCTION__, appctx);
+		SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_TOUT;
+		goto exit;
+	}
+
+	if (SPOE_APPCTX(appctx)->task->expire == TICK_ETERNITY)
+		SPOE_APPCTX(appctx)->task->expire =
+			tick_add_ifset(now_ms, agent->timeout.hello);
+
+	/* 4 bytes are reserved at the beginning of <buf> to store the frame
+	 * length. */
+	buf = trash.area; frame = buf+4;
+	ret = spoe_prepare_hahello_frame(appctx, frame,
+					 SPOE_APPCTX(appctx)->max_frame_size);
+	if (ret > 1)
+		ret = spoe_send_frame(appctx, buf, ret);
+
+	switch (ret) {
+		case -1: /* error */
+		case  0: /* ignore => an error, cannot be ignored */
+			goto exit;
+
+		case  1: /* retry later */
+			goto stop;
+
+		default:
+			/* HELLO frame successfully sent, now wait for the
+			 * reply. */
+			appctx->st0 = SPOE_APPCTX_ST_CONNECTING;
+			goto next;
+	}
+
+  next:
+	return 0;
+  stop:
+	return 1;
+  exit:
+	appctx->st0 = SPOE_APPCTX_ST_EXIT;
+	return 0;
+}
+
+static int
+spoe_handle_connecting_appctx(struct appctx *appctx)
+{
+	struct stconn     *sc = appctx_sc(appctx);
+	struct spoe_agent *agent = SPOE_APPCTX(appctx)->agent;
+	char  *frame;
+	int    ret;
+
+	if (appctx->st1 == SPOE_APPCTX_ERR_TOUT) {
+		SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: appctx=%p"
+			    " - Connection timed out\n",
+			    (int)date.tv_sec, (int)date.tv_usec, agent->id,
+			    __FUNCTION__, appctx);
+		SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_TOUT;
+		goto exit;
+	}
+
+	frame = trash.area; trash.data = 0;
+	ret = spoe_recv_frame(appctx, frame,
+			      SPOE_APPCTX(appctx)->max_frame_size);
+	if (ret > 1) {
+		if (*frame == SPOE_FRM_T_AGENT_DISCON) {
+			appctx->st0 = SPOE_APPCTX_ST_DISCONNECTING;
+			goto next;
+		}
+		trash.data = ret + 4;
+		ret = spoe_handle_agenthello_frame(appctx, frame, ret);
+	}
+
+	switch (ret) {
+		case -1: /* error */
+		case  0: /* ignore => an error, cannot be ignored */
+			appctx->st0 = SPOE_APPCTX_ST_DISCONNECT;
+			goto next;
+
+		case  1: /* retry later */
+			goto stop;
+
+		default:
+			_HA_ATOMIC_INC(&agent->counters.idles);
+			agent->rt[tid].idles++;
+			appctx->st0 = SPOE_APPCTX_ST_IDLE;
+			SPOE_APPCTX(appctx)->node.key = 0;
+			eb32_insert(&agent->rt[tid].idle_applets, &SPOE_APPCTX(appctx)->node);
+
+			/* Update runtime agent info */
+			HA_ATOMIC_UPDATE_MIN(&agent->rt[tid].frame_size, SPOE_APPCTX(appctx)->max_frame_size);
+			goto next;
+	}
+
+  next:
+	/* Do not forget to remove processed frame from the output buffer */
+	if (trash.data)
+		co_skip(sc_oc(sc), trash.data);
+
+	SPOE_APPCTX(appctx)->task->expire =
+		tick_add_ifset(now_ms, agent->timeout.idle);
+	return 0;
+  stop:
+	return 1;
+  exit:
+	appctx->st0 = SPOE_APPCTX_ST_EXIT;
+	return 0;
+}
+
+
+static int
+spoe_handle_sending_frame_appctx(struct appctx *appctx, int *skip)
+{
+	struct spoe_agent   *agent = SPOE_APPCTX(appctx)->agent;
+	struct spoe_context *ctx = NULL;
+	char *frame, *buf;
+	int   ret;
+
+	/* 4 bytes are reserved at the beginning of <buf> to store the frame
+	 * length.
*/ + buf = trash.area; frame = buf+4; + + if (appctx->st0 == SPOE_APPCTX_ST_SENDING_FRAG_NOTIFY) { + ctx = SPOE_APPCTX(appctx)->frag_ctx.ctx; + ret = spoe_prepare_hafrag_frame(appctx, ctx, frame, + SPOE_APPCTX(appctx)->max_frame_size); + } + else if (LIST_ISEMPTY(&agent->rt[tid].sending_queue)) { + *skip = 1; + ret = 1; + goto end; + } + else { + ctx = LIST_NEXT(&agent->rt[tid].sending_queue, typeof(ctx), list); + ret = spoe_prepare_hanotify_frame(appctx, ctx, frame, + SPOE_APPCTX(appctx)->max_frame_size); + + } + + if (ret > 1) + ret = spoe_send_frame(appctx, buf, ret); + + switch (ret) { + case -1: /* error */ + appctx->st0 = SPOE_APPCTX_ST_DISCONNECT; + goto end; + + case 0: /* ignore */ + if (ctx == NULL) + goto abort_frag_frame; + + spoe_release_buffer(&ctx->buffer, &ctx->buffer_wait); + LIST_DELETE(&ctx->list); + LIST_INIT(&ctx->list); + _HA_ATOMIC_DEC(&agent->counters.nb_sending); + spoe_update_stat_time(&ctx->stats.queue_ts, &ctx->stats.t_queue); + ctx->spoe_appctx = NULL; + ctx->state = SPOE_CTX_ST_ERROR; + ctx->status_code = (SPOE_APPCTX(appctx)->status_code + 0x100); + task_wakeup(ctx->strm->task, TASK_WOKEN_MSG); + *skip = 1; + break; + + case 1: /* retry */ + *skip = 1; + break; + + default: + if (ctx == NULL) + goto abort_frag_frame; + + spoe_release_buffer(&ctx->buffer, &ctx->buffer_wait); + LIST_DELETE(&ctx->list); + LIST_INIT(&ctx->list); + _HA_ATOMIC_DEC(&agent->counters.nb_sending); + spoe_update_stat_time(&ctx->stats.queue_ts, &ctx->stats.t_queue); + ctx->spoe_appctx = SPOE_APPCTX(appctx); + if (!(ctx->flags & SPOE_CTX_FL_FRAGMENTED) || + (ctx->frag_ctx.flags & SPOE_FRM_FL_FIN)) + goto no_frag_frame_sent; + else + goto frag_frame_sent; + } + goto end; + + frag_frame_sent: + appctx->st0 = SPOE_APPCTX_ST_SENDING_FRAG_NOTIFY; + *skip = 1; + SPOE_APPCTX(appctx)->frag_ctx.ctx = ctx; + SPOE_APPCTX(appctx)->frag_ctx.cursid = ctx->stream_id; + SPOE_APPCTX(appctx)->frag_ctx.curfid = ctx->frame_id; + ctx->state = SPOE_CTX_ST_ENCODING_MSGS; + task_wakeup(ctx->strm->task, TASK_WOKEN_MSG); + goto end; + + no_frag_frame_sent: + if (SPOE_APPCTX(appctx)->flags & SPOE_APPCTX_FL_ASYNC) { + appctx->st0 = SPOE_APPCTX_ST_PROCESSING; + LIST_APPEND(&agent->rt[tid].waiting_queue, &ctx->list); + } + else if (SPOE_APPCTX(appctx)->flags & SPOE_APPCTX_FL_PIPELINING) { + appctx->st0 = SPOE_APPCTX_ST_PROCESSING; + LIST_APPEND(&SPOE_APPCTX(appctx)->waiting_queue, &ctx->list); + } + else { + appctx->st0 = SPOE_APPCTX_ST_WAITING_SYNC_ACK; + *skip = 1; + LIST_APPEND(&SPOE_APPCTX(appctx)->waiting_queue, &ctx->list); + } + _HA_ATOMIC_INC(&agent->counters.nb_waiting); + ctx->stats.wait_ts = now_ns; + SPOE_APPCTX(appctx)->frag_ctx.ctx = NULL; + SPOE_APPCTX(appctx)->frag_ctx.cursid = 0; + SPOE_APPCTX(appctx)->frag_ctx.curfid = 0; + SPOE_APPCTX(appctx)->cur_fpa++; + + ctx->state = SPOE_CTX_ST_WAITING_ACK; + goto end; + + abort_frag_frame: + appctx->st0 = SPOE_APPCTX_ST_PROCESSING; + SPOE_APPCTX(appctx)->frag_ctx.ctx = NULL; + SPOE_APPCTX(appctx)->frag_ctx.cursid = 0; + SPOE_APPCTX(appctx)->frag_ctx.curfid = 0; + goto end; + + end: + return ret; +} + +static int +spoe_handle_receiving_frame_appctx(struct appctx *appctx, int *skip) +{ + struct spoe_agent *agent = SPOE_APPCTX(appctx)->agent; + struct spoe_context *ctx = NULL; + char *frame; + int ret; + + frame = trash.area; trash.data = 0; + ret = spoe_recv_frame(appctx, frame, + SPOE_APPCTX(appctx)->max_frame_size); + if (ret > 1) { + if (*frame == SPOE_FRM_T_AGENT_DISCON) { + appctx->st0 = SPOE_APPCTX_ST_DISCONNECTING; + ret = -1; + goto end; + } + trash.data 
= ret + 4; + ret = spoe_handle_agentack_frame(appctx, &ctx, frame, ret); + } + switch (ret) { + case -1: /* error */ + appctx->st0 = SPOE_APPCTX_ST_DISCONNECT; + break; + + case 0: /* ignore */ + break; + + case 1: /* retry */ + *skip = 1; + break; + + default: + LIST_DELETE(&ctx->list); + LIST_INIT(&ctx->list); + _HA_ATOMIC_DEC(&agent->counters.nb_waiting); + spoe_update_stat_time(&ctx->stats.wait_ts, &ctx->stats.t_waiting); + ctx->stats.response_ts = now_ns; + if (ctx->spoe_appctx) { + ctx->spoe_appctx->cur_fpa--; + ctx->spoe_appctx = NULL; + } + if (appctx->st0 == SPOE_APPCTX_ST_SENDING_FRAG_NOTIFY && + ctx == SPOE_APPCTX(appctx)->frag_ctx.ctx) { + appctx->st0 = SPOE_APPCTX_ST_PROCESSING; + SPOE_APPCTX(appctx)->frag_ctx.ctx = NULL; + SPOE_APPCTX(appctx)->frag_ctx.cursid = 0; + SPOE_APPCTX(appctx)->frag_ctx.curfid = 0; + } + else if (appctx->st0 == SPOE_APPCTX_ST_WAITING_SYNC_ACK) + appctx->st0 = SPOE_APPCTX_ST_PROCESSING; + task_wakeup(ctx->strm->task, TASK_WOKEN_MSG); + break; + } + + /* Do not forget to remove processed frame from the output buffer */ + if (trash.data) + co_skip(sc_oc(appctx_sc(appctx)), trash.data); + end: + return ret; +} + +static int +spoe_handle_processing_appctx(struct appctx *appctx) +{ + struct stconn *sc = appctx_sc(appctx); + struct server *srv = objt_server(__sc_strm(sc)->target); + struct spoe_agent *agent = SPOE_APPCTX(appctx)->agent; + int ret, skip_sending = 0, skip_receiving = 0, active_s = 0, active_r = 0, close_asap = 0; + + if (appctx->st1 == SPOE_APPCTX_ERR_TOUT) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_TOUT; + appctx->st0 = SPOE_APPCTX_ST_DISCONNECT; + appctx->st1 = SPOE_APPCTX_ERR_NONE; + goto next; + } + + SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: appctx=%p" + " - process: fpa=%u/%u - appctx-state=%s - weight=%u - flags=0x%08x\n", + (int)date.tv_sec, (int)date.tv_usec, agent->id, + __FUNCTION__, appctx, SPOE_APPCTX(appctx)->cur_fpa, + agent->max_fpa, spoe_appctx_state_str[appctx->st0], + SPOE_APPCTX(appctx)->node.key, SPOE_APPCTX(appctx)->flags); + + + /* Close the applet ASAP because some sessions are waiting for a free + * connection slot. It is only an issue in multithreaded mode. 
+	 */
+	close_asap = (global.nbthread > 1 &&
+		      (agent->b.be->queue.length ||
+		       (srv && (srv->queue.length || (srv->maxconn && srv->served >= srv_dynamic_maxconn(srv))))));
+
+	/* receiving_frame loop */
+	while (!skip_receiving) {
+		ret = spoe_handle_receiving_frame_appctx(appctx, &skip_receiving);
+		switch (ret) {
+			case -1: /* error */
+				goto next;
+
+			case 0: /* ignore */
+				active_r = 1;
+				break;
+
+			case 1: /* retry */
+				break;
+
+			default:
+				active_r = 1;
+				break;
+		}
+	}
+
+	/* Don't try to send a new frame if we are waiting for at least one ack,
+	 * in sync mode, or if the applet must be closed ASAP
+	 */
+	if (appctx->st0 == SPOE_APPCTX_ST_WAITING_SYNC_ACK || (close_asap && SPOE_APPCTX(appctx)->cur_fpa))
+		skip_sending = 1;
+
+	/* send_frame loop */
+	while (!skip_sending && SPOE_APPCTX(appctx)->cur_fpa < agent->max_fpa) {
+		ret = spoe_handle_sending_frame_appctx(appctx, &skip_sending);
+		switch (ret) {
+			case -1: /* error */
+				goto next;
+
+			case 0: /* ignore */
+				if (SPOE_APPCTX(appctx)->node.key)
+					SPOE_APPCTX(appctx)->node.key--;
+				active_s++;
+				break;
+
+			case 1: /* retry */
+				break;
+
+			default:
+				if (SPOE_APPCTX(appctx)->node.key)
+					SPOE_APPCTX(appctx)->node.key--;
+				active_s++;
+				break;
+		}
+
+		/* if the applet must be closed ASAP, don't send more than one frame */
+		if (close_asap)
+			break;
+	}
+
+	if (active_s || active_r) {
+		update_freq_ctr(&agent->rt[tid].processing_per_sec, active_s);
+		SPOE_APPCTX(appctx)->task->expire = tick_add_ifset(now_ms, agent->timeout.idle);
+	}
+
+	if (appctx->st0 == SPOE_APPCTX_ST_PROCESSING && SPOE_APPCTX(appctx)->cur_fpa < agent->max_fpa) {
+		/* If the applet must be closed, don't switch it to the IDLE
+		 * state and close it when the last waiting frame is
+		 * acknowledged.
+		 */
+		if (close_asap) {
+			if (SPOE_APPCTX(appctx)->cur_fpa)
+				goto out;
+			SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_NONE;
+			appctx->st0 = SPOE_APPCTX_ST_DISCONNECT;
+			appctx->st1 = SPOE_APPCTX_ERR_NONE;
+			goto next;
+		}
+		_HA_ATOMIC_INC(&agent->counters.idles);
+		agent->rt[tid].idles++;
+		appctx->st0 = SPOE_APPCTX_ST_IDLE;
+		eb32_insert(&agent->rt[tid].idle_applets, &SPOE_APPCTX(appctx)->node);
+	}
+
+  out:
+	return 1;
+
+  next:
+	SPOE_APPCTX(appctx)->task->expire = tick_add_ifset(now_ms, agent->timeout.idle);
+	return 0;
+}
+
+static int
+spoe_handle_disconnect_appctx(struct appctx *appctx)
+{
+	struct spoe_agent *agent = SPOE_APPCTX(appctx)->agent;
+	char *frame, *buf;
+	int   ret;
+
+	if (appctx->st1 == SPOE_APPCTX_ERR_TOUT)
+		goto exit;
+
+	/* 4 bytes are reserved at the beginning of <buf> to store the frame
+	 * length.
*/ + buf = trash.area; frame = buf+4; + ret = spoe_prepare_hadiscon_frame(appctx, frame, + SPOE_APPCTX(appctx)->max_frame_size); + if (ret > 1) + ret = spoe_send_frame(appctx, buf, ret); + + switch (ret) { + case -1: /* error */ + case 0: /* ignore => an error, cannot be ignored */ + goto exit; + + case 1: /* retry */ + goto stop; + + default: + SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: appctx=%p" + " - disconnected by HAProxy (%d): %s\n", + (int)date.tv_sec, (int)date.tv_usec, agent->id, + __FUNCTION__, appctx, + SPOE_APPCTX(appctx)->status_code, + spoe_frm_err_reasons[SPOE_APPCTX(appctx)->status_code]); + + appctx->st0 = SPOE_APPCTX_ST_DISCONNECTING; + goto next; + } + + next: + SPOE_APPCTX(appctx)->task->expire = + tick_add_ifset(now_ms, agent->timeout.idle); + return 0; + stop: + return 1; + exit: + appctx->st0 = SPOE_APPCTX_ST_EXIT; + return 0; +} + +static int +spoe_handle_disconnecting_appctx(struct appctx *appctx) +{ + struct stconn *sc = appctx_sc(appctx); + char *frame; + int ret; + + if (appctx->st1 == SPOE_APPCTX_ERR_TOUT) { + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_TOUT; + goto exit; + } + + frame = trash.area; trash.data = 0; + ret = spoe_recv_frame(appctx, frame, + SPOE_APPCTX(appctx)->max_frame_size); + if (ret > 1) { + trash.data = ret + 4; + ret = spoe_handle_agentdiscon_frame(appctx, frame, ret); + } + + switch (ret) { + case -1: /* error */ + SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: appctx=%p" + " - error on frame (%s)\n", + (int)date.tv_sec, (int)date.tv_usec, + ((struct spoe_agent *)SPOE_APPCTX(appctx)->agent)->id, + __FUNCTION__, appctx, + spoe_frm_err_reasons[SPOE_APPCTX(appctx)->status_code]); + goto exit; + + case 0: /* ignore */ + goto next; + + case 1: /* retry */ + goto stop; + + default: + SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: appctx=%p" + " - disconnected by peer (%d): %.*s\n", + (int)date.tv_sec, (int)date.tv_usec, + ((struct spoe_agent *)SPOE_APPCTX(appctx)->agent)->id, + __FUNCTION__, appctx, SPOE_APPCTX(appctx)->status_code, + SPOE_APPCTX(appctx)->rlen, SPOE_APPCTX(appctx)->reason); + goto exit; + } + + next: + /* Do not forget to remove processed frame from the output buffer */ + if (trash.data) + co_skip(sc_oc(sc), trash.data); + + return 0; + stop: + return 1; + exit: + appctx->st0 = SPOE_APPCTX_ST_EXIT; + return 0; +} + +/* I/O Handler processing messages exchanged with the agent */ +static void +spoe_handle_appctx(struct appctx *appctx) +{ + struct stconn *sc = appctx_sc(appctx); + struct spoe_agent *agent; + + if (SPOE_APPCTX(appctx) == NULL) + return; + + if (unlikely(se_fl_test(appctx->sedesc, (SE_FL_EOS|SE_FL_ERROR|SE_FL_SHR|SE_FL_SHW)))) { + co_skip(sc_oc(sc), co_data(sc_oc(sc))); + goto out; + } + + SPOE_APPCTX(appctx)->status_code = SPOE_FRM_ERR_NONE; + agent = SPOE_APPCTX(appctx)->agent; + + switchstate: + SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: appctx=%p" + " - appctx-state=%s\n", + (int)date.tv_sec, (int)date.tv_usec, agent->id, + __FUNCTION__, appctx, spoe_appctx_state_str[appctx->st0]); + + switch (appctx->st0) { + case SPOE_APPCTX_ST_CONNECT: + if (spoe_handle_connect_appctx(appctx)) + goto out; + goto switchstate; + + case SPOE_APPCTX_ST_CONNECTING: + if (spoe_handle_connecting_appctx(appctx)) + goto out; + goto switchstate; + + case SPOE_APPCTX_ST_IDLE: + _HA_ATOMIC_DEC(&agent->counters.idles); + agent->rt[tid].idles--; + eb32_delete(&SPOE_APPCTX(appctx)->node); + if (stopping && + LIST_ISEMPTY(&agent->rt[tid].sending_queue) && + LIST_ISEMPTY(&SPOE_APPCTX(appctx)->waiting_queue)) { + 
+ SPOE_APPCTX(appctx)->task->expire =
+ tick_add_ifset(now_ms, agent->timeout.idle);
+ appctx->st0 = SPOE_APPCTX_ST_DISCONNECT;
+ goto switchstate;
+ }
+ appctx->st0 = SPOE_APPCTX_ST_PROCESSING;
+ __fallthrough;
+
+ case SPOE_APPCTX_ST_PROCESSING:
+ case SPOE_APPCTX_ST_SENDING_FRAG_NOTIFY:
+ case SPOE_APPCTX_ST_WAITING_SYNC_ACK:
+ if (spoe_handle_processing_appctx(appctx))
+ goto out;
+ goto switchstate;
+
+ case SPOE_APPCTX_ST_DISCONNECT:
+ if (spoe_handle_disconnect_appctx(appctx))
+ goto out;
+ goto switchstate;
+
+ case SPOE_APPCTX_ST_DISCONNECTING:
+ if (spoe_handle_disconnecting_appctx(appctx))
+ goto out;
+ goto switchstate;
+
+ case SPOE_APPCTX_ST_EXIT:
+ appctx->st0 = SPOE_APPCTX_ST_END;
+ SPOE_APPCTX(appctx)->task->expire = TICK_ETERNITY;
+ se_fl_set(appctx->sedesc, SE_FL_EOS);
+ if (SPOE_APPCTX(appctx)->status_code != SPOE_FRM_ERR_NONE)
+ se_fl_set(appctx->sedesc, SE_FL_ERROR);
+ else
+ se_fl_set(appctx->sedesc, SE_FL_EOI);
+ __fallthrough;
+
+ case SPOE_APPCTX_ST_END:
+ return;
+ }
+ out:
+ if (SPOE_APPCTX(appctx)->task->expire != TICK_ETERNITY)
+ task_queue(SPOE_APPCTX(appctx)->task);
+}
+
+struct applet spoe_applet = {
+ .obj_type = OBJ_TYPE_APPLET,
+ .name = "<SPOE>", /* used for logging */
+ .fct = spoe_handle_appctx,
+ .init = spoe_init_appctx,
+ .release = spoe_release_appctx,
+};
+
+/* Create a SPOE applet. On success, the created applet is returned, else
+ * NULL. */
+static struct appctx *
+spoe_create_appctx(struct spoe_config *conf)
+{
+ struct spoe_agent *agent = conf->agent;
+ struct spoe_appctx *spoe_appctx;
+ struct appctx *appctx;
+
+ /* Do not try to create a new applet if there is no server up for the
+ * agent's backend. */
+ if (!agent->b.be->srv_act && !agent->b.be->srv_bck) {
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: don't create SPOE appctx: no server up\n",
+ (int)date.tv_sec, (int)date.tv_usec, agent->id, __FUNCTION__);
+ goto out;
+ }
+
+ /* Do not try to create a new applet if we have reached the maximum
+ * number of connections per second */
+ if (agent->cps_max > 0) {
+ if (!freq_ctr_remain(&agent->rt[tid].conn_per_sec, agent->cps_max, 0)) {
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: don't create SPOE appctx: max CPS reached\n",
+ (int)date.tv_sec, (int)date.tv_usec, agent->id, __FUNCTION__);
+ goto out;
+ }
+ }
+
+ spoe_appctx = pool_zalloc(pool_head_spoe_appctx);
+ if (spoe_appctx == NULL)
+ goto out_error;
+
+ spoe_appctx->agent = agent;
+ spoe_appctx->version = 0;
+ spoe_appctx->max_frame_size = agent->max_frame_size;
+ spoe_appctx->flags = 0;
+ spoe_appctx->status_code = SPOE_FRM_ERR_NONE;
+ spoe_appctx->buffer = BUF_NULL;
+ spoe_appctx->cur_fpa = 0;
+ LIST_INIT(&spoe_appctx->list);
+ LIST_INIT(&spoe_appctx->waiting_queue);
+
+
+ if ((appctx = appctx_new_here(&spoe_applet, NULL)) == NULL)
+ goto out_free_spoe_appctx;
+
+ appctx->svcctx = spoe_appctx;
+ if (appctx_init(appctx) == -1)
+ goto out_free_appctx;
+
+ /* Increase the per-process number of cumulated connections */
+ if (agent->cps_max > 0)
+ update_freq_ctr(&agent->rt[tid].conn_per_sec, 1);
+
+ appctx_wakeup(appctx);
+ return appctx;
+
+ /* Error unrolling */
+ out_free_appctx:
+ appctx_free_on_early_error(appctx);
+ out_free_spoe_appctx:
+ pool_free(pool_head_spoe_appctx, spoe_appctx);
+ out_error:
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: failed to create SPOE appctx\n",
+ (int)date.tv_sec, (int)date.tv_usec, agent->id, __FUNCTION__);
+ send_log(&conf->agent_fe, LOG_EMERG, "SPOE: [%s] failed to create SPOE applet\n", agent->id);
+ out:
+
+ return NULL;
+}
+
+static int
+spoe_queue_context(struct spoe_context *ctx)
+{
+ struct spoe_config *conf = FLT_CONF(ctx->filter);
+ struct spoe_agent *agent = conf->agent;
+ struct spoe_appctx *spoe_appctx;
+
+ /* Check if we need to create a new SPOE applet or not. */
+ if (agent->rt[tid].processing < agent->rt[tid].idles ||
+ agent->rt[tid].processing < read_freq_ctr(&agent->rt[tid].processing_per_sec))
+ goto end;
+
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p"
+ " - try to create new SPOE appctx\n",
+ (int)date.tv_sec, (int)date.tv_usec, agent->id, __FUNCTION__,
+ ctx->strm);
+
+ spoe_create_appctx(conf);
+
+ end:
+ /* The only reason to return an error is when there is no applet */
+ if (LIST_ISEMPTY(&agent->rt[tid].applets)) {
+ ctx->status_code = SPOE_CTX_ERR_RES;
+ return -1;
+ }
+
+ /* Add the SPOE context to the sending queue if the stream has no applet
+ * already assigned, then try to wake up an idle applet. Otherwise, don't
+ * queue it. */
+ _HA_ATOMIC_INC(&agent->counters.nb_sending);
+ spoe_update_stat_time(&ctx->stats.request_ts, &ctx->stats.t_request);
+ ctx->stats.queue_ts = now_ns;
+ if (ctx->spoe_appctx)
+ return 1;
+ LIST_APPEND(&agent->rt[tid].sending_queue, &ctx->list);
+
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p"
+ " - Add stream in sending queue"
+ " - applets=%u - idles=%u - processing=%u\n",
+ (int)date.tv_sec, (int)date.tv_usec, agent->id, __FUNCTION__,
+ ctx->strm, agent->counters.applets, agent->counters.idles,
+ agent->rt[tid].processing);
+
+ /* Finally try to wakeup an IDLE applet. */
+ if (!eb_is_empty(&agent->rt[tid].idle_applets)) {
+ struct eb32_node *node;
+
+ node = eb32_first(&agent->rt[tid].idle_applets);
+ spoe_appctx = eb32_entry(node, struct spoe_appctx, node);
+ if (node && spoe_appctx) {
+ eb32_delete(&spoe_appctx->node);
+ spoe_appctx->node.key++;
+ eb32_insert(&agent->rt[tid].idle_applets, &spoe_appctx->node);
+ spoe_wakeup_appctx(spoe_appctx->owner);
+ }
+ }
+ return 1;
+}
+
+/***************************************************************************
+ * Functions that encode SPOE messages
+ **************************************************************************/
+/* Encode a SPOE message. The info in <ctx->frag_ctx>, if any, is used to
+ * handle fragmented content. If the next message can be processed, it
+ * returns 0. If the message is too big, it returns -1. */
+static int
+spoe_encode_message(struct stream *s, struct spoe_context *ctx,
+ struct spoe_message *msg, int dir,
+ char **buf, char *end)
+{
+ struct sample *smp;
+ struct spoe_arg *arg;
+ int ret;
+
+ if (msg->cond) {
+ ret = acl_exec_cond(msg->cond, s->be, s->sess, s, dir|SMP_OPT_FINAL);
+ ret = acl_pass(ret);
+ if (msg->cond->pol == ACL_COND_UNLESS)
+ ret = !ret;
+
+ /* the rule does not match */
+ if (!ret)
+ goto next;
+ }
+
+ /* Resume encoding of a SPOE argument */
+ if (ctx->frag_ctx.curarg != NULL) {
+ arg = ctx->frag_ctx.curarg;
+ goto encode_argument;
+ }
+
+ if (ctx->frag_ctx.curoff != UINT_MAX)
+ goto encode_msg_payload;
+
+ /* Check if there is enough space for the message name and the
+ * number of arguments. It implies <msg->id_len> is encoded on 2
+ * bytes, at most (< 2288).
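+ * (This matches the SPOP variable-length integer encoding: values
+ * 0..239 fit on a single byte and values 240..2287 on two bytes,
+ * which is where the < 2288 bound comes from.)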
+ */
+ if (*buf + 2 + msg->id_len + 1 > end)
+ goto too_big;
+
+ /* Encode the message name */
+ if (spoe_encode_buffer(msg->id, msg->id_len, buf, end) == -1)
+ goto too_big;
+
+ /* Set the number of arguments for this message */
+ **buf = msg->nargs;
+ (*buf)++;
+
+ ctx->frag_ctx.curoff = 0;
+ encode_msg_payload:
+
+ /* Loop on arguments */
+ list_for_each_entry(arg, &msg->args, list) {
+ ctx->frag_ctx.curarg = arg;
+ ctx->frag_ctx.curoff = UINT_MAX;
+ ctx->frag_ctx.curlen = 0;
+
+ encode_argument:
+ if (ctx->frag_ctx.curoff != UINT_MAX)
+ goto encode_arg_value;
+
+ /* Encode the argument name as a string. It can be NULL */
+ if (spoe_encode_buffer(arg->name, arg->name_len, buf, end) == -1)
+ goto too_big;
+
+ ctx->frag_ctx.curoff = 0;
+ encode_arg_value:
+
+ /* Fetch the argument value */
+ smp = sample_process(s->be, s->sess, s, dir|SMP_OPT_FINAL, arg->expr, NULL);
+ if (smp) {
+ smp->ctx.a[0] = &ctx->frag_ctx.curlen;
+ smp->ctx.a[1] = &ctx->frag_ctx.curoff;
+ }
+ ret = spoe_encode_data(smp, buf, end);
+ if (ret == -1 || ctx->frag_ctx.curoff)
+ goto too_big;
+ }
+
+ next:
+ return 0;
+
+ too_big:
+ return -1;
+}
+
+/* Encode a list of SPOE messages. The info in <ctx->frag_ctx>, if any, is
+ * used to handle fragmented content. On success it returns 1. If an error
+ * occurred, -1 is returned. If nothing has been encoded, it returns 0 (this
+ * is only possible for an unfragmented payload). */
+static int
+spoe_encode_messages(struct stream *s, struct spoe_context *ctx,
+ struct list *messages, int dir, int type)
+{
+ struct spoe_config *conf = FLT_CONF(ctx->filter);
+ struct spoe_agent *agent = conf->agent;
+ struct spoe_message *msg;
+ char *p, *end;
+
+ p = b_head(&ctx->buffer);
+ end = p + agent->rt[tid].frame_size - FRAME_HDR_SIZE;
+
+ if (type == SPOE_MSGS_BY_EVENT) { /* Loop on messages by event */
+ /* Resume encoding of a SPOE message */
+ if (ctx->frag_ctx.curmsg != NULL) {
+ msg = ctx->frag_ctx.curmsg;
+ goto encode_evt_message;
+ }
+
+ list_for_each_entry(msg, messages, by_evt) {
+ ctx->frag_ctx.curmsg = msg;
+ ctx->frag_ctx.curarg = NULL;
+ ctx->frag_ctx.curoff = UINT_MAX;
+
+ encode_evt_message:
+ if (spoe_encode_message(s, ctx, msg, dir, &p, end) == -1)
+ goto too_big;
+ }
+ }
+ else if (type == SPOE_MSGS_BY_GROUP) { /* Loop on messages by group */
+ /* Resume encoding of a SPOE message */
+ if (ctx->frag_ctx.curmsg != NULL) {
+ msg = ctx->frag_ctx.curmsg;
+ goto encode_grp_message;
+ }
+
+ list_for_each_entry(msg, messages, by_grp) {
+ ctx->frag_ctx.curmsg = msg;
+ ctx->frag_ctx.curarg = NULL;
+ ctx->frag_ctx.curoff = UINT_MAX;
+
+ encode_grp_message:
+ if (spoe_encode_message(s, ctx, msg, dir, &p, end) == -1)
+ goto too_big;
+ }
+ }
+ else
+ goto skip;
+
+
+ /* nothing has been encoded for an unfragmented payload */
+ if (!(ctx->flags & SPOE_CTX_FL_FRAGMENTED) && p == b_head(&ctx->buffer))
+ goto skip;
+
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p"
+ " - encode %s messages - spoe_appctx=%p"
+ "- max_size=%u - encoded=%ld\n",
+ (int)date.tv_sec, (int)date.tv_usec,
+ agent->id, __FUNCTION__, s,
+ ((ctx->flags & SPOE_CTX_FL_FRAGMENTED) ? "last fragment of" : "unfragmented"),
"last fragment of" : "unfragmented"), + ctx->spoe_appctx, (agent->rt[tid].frame_size - FRAME_HDR_SIZE), + p - b_head(&ctx->buffer)); + + b_set_data(&ctx->buffer, p - b_head(&ctx->buffer)); + ctx->frag_ctx.curmsg = NULL; + ctx->frag_ctx.curarg = NULL; + ctx->frag_ctx.curoff = 0; + ctx->frag_ctx.flags = SPOE_FRM_FL_FIN; + + return 1; + + too_big: + /* Return an error if fragmentation is unsupported or if nothing has + * been encoded because its too big and not splittable. */ + if (!(agent->flags & SPOE_FL_SND_FRAGMENTATION) || p == b_head(&ctx->buffer)) { + ctx->status_code = SPOE_CTX_ERR_TOO_BIG; + return -1; + } + + SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p" + " - encode fragmented messages - spoe_appctx=%p" + " - curmsg=%p - curarg=%p - curoff=%u" + " - max_size=%u - encoded=%ld\n", + (int)date.tv_sec, (int)date.tv_usec, + agent->id, __FUNCTION__, s, ctx->spoe_appctx, + ctx->frag_ctx.curmsg, ctx->frag_ctx.curarg, ctx->frag_ctx.curoff, + (agent->rt[tid].frame_size - FRAME_HDR_SIZE), p - b_head(&ctx->buffer)); + + b_set_data(&ctx->buffer, p - b_head(&ctx->buffer)); + ctx->flags |= SPOE_CTX_FL_FRAGMENTED; + ctx->frag_ctx.flags &= ~SPOE_FRM_FL_FIN; + return 1; + + skip: + SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p" + " - skip the frame because nothing has been encoded\n", + (int)date.tv_sec, (int)date.tv_usec, + agent->id, __FUNCTION__, s); + return 0; +} + + +/*************************************************************************** + * Functions that handle SPOE actions + **************************************************************************/ +/* Helper function to set a variable */ +static void +spoe_set_var(struct spoe_context *ctx, char *scope, char *name, int len, + struct sample *smp) +{ + struct spoe_config *conf = FLT_CONF(ctx->filter); + struct spoe_agent *agent = conf->agent; + char varname[64]; + + memset(varname, 0, sizeof(varname)); + len = snprintf(varname, sizeof(varname), "%s.%s.%.*s", + scope, agent->var_pfx, len, name); + if (agent->flags & SPOE_FL_FORCE_SET_VAR) + vars_set_by_name(varname, len, smp); + else + vars_set_by_name_ifexist(varname, len, smp); +} + +/* Helper function to unset a variable */ +static void +spoe_unset_var(struct spoe_context *ctx, char *scope, char *name, int len, + struct sample *smp) +{ + struct spoe_config *conf = FLT_CONF(ctx->filter); + struct spoe_agent *agent = conf->agent; + char varname[64]; + + memset(varname, 0, sizeof(varname)); + len = snprintf(varname, sizeof(varname), "%s.%s.%.*s", + scope, agent->var_pfx, len, name); + vars_unset_by_name_ifexist(varname, len, smp); +} + + +static inline int +spoe_decode_action_set_var(struct stream *s, struct spoe_context *ctx, + char **buf, char *end, int dir) +{ + char *str, *scope, *p = *buf; + struct sample smp; + uint64_t sz; + int ret; + + if (p + 2 >= end) + goto skip; + + /* SET-VAR requires 3 arguments */ + if (*p++ != 3) + goto skip; + + switch (*p++) { + case SPOE_SCOPE_PROC: scope = "proc"; break; + case SPOE_SCOPE_SESS: scope = "sess"; break; + case SPOE_SCOPE_TXN : scope = "txn"; break; + case SPOE_SCOPE_REQ : scope = "req"; break; + case SPOE_SCOPE_RES : scope = "res"; break; + default: goto skip; + } + + if (spoe_decode_buffer(&p, end, &str, &sz) == -1) + goto skip; + memset(&smp, 0, sizeof(smp)); + smp_set_owner(&smp, s->be, s->sess, s, dir|SMP_OPT_FINAL); + + if (spoe_decode_data(&p, end, &smp) == -1) + goto skip; + + SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p" + " - set-var '%s.%s.%.*s'\n", + (int)date.tv_sec, (int)date.tv_usec, + 
+ ((struct spoe_config *)FLT_CONF(ctx->filter))->agent->id,
+ __FUNCTION__, s, scope,
+ ((struct spoe_config *)FLT_CONF(ctx->filter))->agent->var_pfx,
+ (int)sz, str);
+
+ if (smp.data.type == SMP_T_ANY)
+ spoe_unset_var(ctx, scope, str, sz, &smp);
+ else
+ spoe_set_var(ctx, scope, str, sz, &smp);
+
+ ret = (p - *buf);
+ *buf = p;
+ return ret;
+ skip:
+ return 0;
+}
+
+static inline int
+spoe_decode_action_unset_var(struct stream *s, struct spoe_context *ctx,
+ char **buf, char *end, int dir)
+{
+ char *str, *scope, *p = *buf;
+ struct sample smp;
+ uint64_t sz;
+ int ret;
+
+ if (p + 2 >= end)
+ goto skip;
+
+ /* UNSET-VAR requires 2 arguments */
+ if (*p++ != 2)
+ goto skip;
+
+ switch (*p++) {
+ case SPOE_SCOPE_PROC: scope = "proc"; break;
+ case SPOE_SCOPE_SESS: scope = "sess"; break;
+ case SPOE_SCOPE_TXN : scope = "txn"; break;
+ case SPOE_SCOPE_REQ : scope = "req"; break;
+ case SPOE_SCOPE_RES : scope = "res"; break;
+ default: goto skip;
+ }
+
+ if (spoe_decode_buffer(&p, end, &str, &sz) == -1)
+ goto skip;
+ memset(&smp, 0, sizeof(smp));
+ smp_set_owner(&smp, s->be, s->sess, s, dir|SMP_OPT_FINAL);
+
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p"
+ " - unset-var '%s.%s.%.*s'\n",
+ (int)date.tv_sec, (int)date.tv_usec,
+ ((struct spoe_config *)FLT_CONF(ctx->filter))->agent->id,
+ __FUNCTION__, s, scope,
+ ((struct spoe_config *)FLT_CONF(ctx->filter))->agent->var_pfx,
+ (int)sz, str);
+
+ spoe_unset_var(ctx, scope, str, sz, &smp);
+
+ ret = (p - *buf);
+ *buf = p;
+ return ret;
+ skip:
+ return 0;
+}
+
+/* Process SPOE actions for a specific event. It returns 1 on success. If an
+ * error occurred, 0 is returned. */
+static int
+spoe_process_actions(struct stream *s, struct spoe_context *ctx, int dir)
+{
+ char *p, *end;
+ int ret;
+
+ p = b_head(&ctx->buffer);
+ end = p + b_data(&ctx->buffer);
+
+ while (p < end) {
+ enum spoe_action_type type;
+
+ type = *p++;
+ switch (type) {
+ case SPOE_ACT_T_SET_VAR:
+ ret = spoe_decode_action_set_var(s, ctx, &p, end, dir);
+ if (!ret)
+ goto skip;
+ break;
+
+ case SPOE_ACT_T_UNSET_VAR:
+ ret = spoe_decode_action_unset_var(s, ctx, &p, end, dir);
+ if (!ret)
+ goto skip;
+ break;
+
+ default:
+ goto skip;
+ }
+ }
+
+ return 1;
+ skip:
+ return 0;
+}
+
+/***************************************************************************
+ * Functions that process SPOE events
+ **************************************************************************/
+static void
+spoe_update_stats(struct stream *s, struct spoe_agent *agent,
+ struct spoe_context *ctx, int dir)
+{
+ if (ctx->stats.start_ts != 0) {
+ spoe_update_stat_time(&ctx->stats.start_ts, &ctx->stats.t_process);
+ ctx->stats.t_total += ctx->stats.t_process;
+ ctx->stats.request_ts = 0;
+ ctx->stats.queue_ts = 0;
+ ctx->stats.wait_ts = 0;
+ ctx->stats.response_ts = 0;
+ }
+
+ if (agent->var_t_process) {
+ struct sample smp;
+
+ memset(&smp, 0, sizeof(smp));
+ smp_set_owner(&smp, s->be, s->sess, s, dir|SMP_OPT_FINAL);
+ smp.data.u.sint = ctx->stats.t_process;
+ smp.data.type = SMP_T_SINT;
+
+ spoe_set_var(ctx, "txn", agent->var_t_process,
+ strlen(agent->var_t_process), &smp);
+ }
+
+ if (agent->var_t_total) {
+ struct sample smp;
+
+ memset(&smp, 0, sizeof(smp));
+ smp_set_owner(&smp, s->be, s->sess, s, dir|SMP_OPT_FINAL);
+ smp.data.u.sint = ctx->stats.t_total;
+ smp.data.type = SMP_T_SINT;
+
+ spoe_set_var(ctx, "txn", agent->var_t_total,
+ strlen(agent->var_t_total), &smp);
+ }
+}
+
+static void
+spoe_handle_processing_error(struct stream *s, struct spoe_agent *agent,
+ struct spoe_context *ctx, int dir)
+{
+ if (agent->eps_max > 0)
+ update_freq_ctr(&agent->rt[tid].err_per_sec, 1);
+
+ if (agent->var_on_error) {
+ struct sample smp;
+
+ memset(&smp, 0, sizeof(smp));
+ smp_set_owner(&smp, s->be, s->sess, s, dir|SMP_OPT_FINAL);
+ smp.data.u.sint = ctx->status_code;
+ smp.data.type = SMP_T_BOOL;
+
+ spoe_set_var(ctx, "txn", agent->var_on_error,
+ strlen(agent->var_on_error), &smp);
+ }
+
+ ctx->state = ((agent->flags & SPOE_FL_CONT_ON_ERR)
+ ? SPOE_CTX_ST_READY
+ : SPOE_CTX_ST_NONE);
+}
+
+static inline int
+spoe_start_processing(struct spoe_agent *agent, struct spoe_context *ctx, int dir)
+{
+ /* If processing is already started for this SPOE context, retry
+ * later. */
+ if (ctx->flags & SPOE_CTX_FL_PROCESS)
+ return 0;
+
+ agent->rt[tid].processing++;
+ ctx->stats.start_ts = now_ns;
+ ctx->stats.request_ts = now_ns;
+ ctx->stats.t_request = -1;
+ ctx->stats.t_queue = -1;
+ ctx->stats.t_waiting = -1;
+ ctx->stats.t_response = -1;
+ ctx->stats.t_process = -1;
+
+ ctx->status_code = 0;
+
+ /* Set the right flag to prevent request and response processing
+ * at the same time. */
+ ctx->flags |= ((dir == SMP_OPT_DIR_REQ)
+ ? SPOE_CTX_FL_REQ_PROCESS
+ : SPOE_CTX_FL_RSP_PROCESS);
+ return 1;
+}
+
+static inline void
+spoe_stop_processing(struct spoe_agent *agent, struct spoe_context *ctx)
+{
+ struct spoe_appctx *sa = ctx->spoe_appctx;
+
+ if (!(ctx->flags & SPOE_CTX_FL_PROCESS))
+ return;
+ _HA_ATOMIC_INC(&agent->counters.nb_processed);
+ if (sa) {
+ if (sa->frag_ctx.ctx == ctx) {
+ sa->frag_ctx.ctx = NULL;
+ spoe_wakeup_appctx(sa->owner);
+ }
+ else
+ sa->cur_fpa--;
+ }
+
+ /* Reset the flag to allow next processing */
+ agent->rt[tid].processing--;
+ ctx->flags &= ~(SPOE_CTX_FL_PROCESS|SPOE_CTX_FL_FRAGMENTED);
+
+ /* Reset processing timer */
+ ctx->process_exp = TICK_ETERNITY;
+
+ spoe_release_buffer(&ctx->buffer, &ctx->buffer_wait);
+
+ ctx->spoe_appctx = NULL;
+ ctx->frag_ctx.curmsg = NULL;
+ ctx->frag_ctx.curarg = NULL;
+ ctx->frag_ctx.curoff = 0;
+ ctx->frag_ctx.flags = 0;
+
+ if (!LIST_ISEMPTY(&ctx->list)) {
+ if (ctx->state == SPOE_CTX_ST_SENDING_MSGS)
+ _HA_ATOMIC_DEC(&agent->counters.nb_sending);
+ else
+ _HA_ATOMIC_DEC(&agent->counters.nb_waiting);
+
+ LIST_DELETE(&ctx->list);
+ LIST_INIT(&ctx->list);
+ }
+}
+
+/* Process a list of SPOE messages. First, this function encodes the messages
+ * and sends them to an agent in a NOTIFY frame. Then, it waits for an ACK
+ * frame to process the corresponding actions. While the processing is in
+ * progress, it returns 0; it returns 1 when the processing is finished. If
+ * an error occurred, -1 is returned.
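+ * As a sketch of the state machine driven below: the context normally
+ * moves READY -> ENCODING_MSGS -> SENDING_MSGS -> WAITING_ACK -> DONE,
+ * then back to READY once the ACK has been handled and <frame_id> has
+ * been incremented.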
+ */
+static int
+spoe_process_messages(struct stream *s, struct spoe_context *ctx,
+ struct list *messages, int dir, int type)
+{
+ struct spoe_config *conf = FLT_CONF(ctx->filter);
+ struct spoe_agent *agent = conf->agent;
+ int ret = 1;
+
+ if (ctx->state == SPOE_CTX_ST_ERROR)
+ goto end;
+
+ if (tick_is_expired(ctx->process_exp, now_ms) && ctx->state != SPOE_CTX_ST_DONE) {
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p"
+ " - failed to process messages: timeout\n",
+ (int)date.tv_sec, (int)date.tv_usec,
+ agent->id, __FUNCTION__, s);
+ ctx->status_code = SPOE_CTX_ERR_TOUT;
+ goto end;
+ }
+
+ if (ctx->state == SPOE_CTX_ST_READY) {
+ if (agent->eps_max > 0) {
+ if (!freq_ctr_remain(&agent->rt[tid].err_per_sec, agent->eps_max, 0)) {
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p"
+ " - skip processing of messages: max EPS reached\n",
+ (int)date.tv_sec, (int)date.tv_usec,
+ agent->id, __FUNCTION__, s);
+ goto skip;
+ }
+ }
+
+ if (!tick_isset(ctx->process_exp)) {
+ ctx->process_exp = tick_add_ifset(now_ms, agent->timeout.processing);
+ s->task->expire = tick_first((tick_is_expired(s->task->expire, now_ms) ? 0 : s->task->expire),
+ ctx->process_exp);
+ }
+ ret = spoe_start_processing(agent, ctx, dir);
+ if (!ret)
+ goto out;
+
+ ctx->state = SPOE_CTX_ST_ENCODING_MSGS;
+ /* fall through */
+ }
+
+ if (ctx->state == SPOE_CTX_ST_ENCODING_MSGS) {
+ if (ctx->stats.request_ts == 0)
+ ctx->stats.request_ts = now_ns;
+ if (!spoe_acquire_buffer(&ctx->buffer, &ctx->buffer_wait))
+ goto out;
+ ret = spoe_encode_messages(s, ctx, messages, dir, type);
+ if (ret < 0)
+ goto end;
+ if (!ret)
+ goto skip;
+ if (spoe_queue_context(ctx) < 0)
+ goto end;
+ ctx->state = SPOE_CTX_ST_SENDING_MSGS;
+ }
+
+ if (ctx->state == SPOE_CTX_ST_SENDING_MSGS) {
+ if (ctx->spoe_appctx)
+ spoe_wakeup_appctx(ctx->spoe_appctx->owner);
+ ret = 0;
+ goto out;
+ }
+
+ if (ctx->state == SPOE_CTX_ST_WAITING_ACK) {
+ ret = 0;
+ goto out;
+ }
+
+ if (ctx->state == SPOE_CTX_ST_DONE) {
+ spoe_process_actions(s, ctx, dir);
+ ret = 1;
+ ctx->frame_id++;
+ ctx->state = SPOE_CTX_ST_READY;
+ spoe_update_stat_time(&ctx->stats.response_ts, &ctx->stats.t_response);
+ goto end;
+ }
+
+ out:
+ return ret;
+
+ skip:
+ ctx->stats.start_ts = 0;
+ ctx->state = SPOE_CTX_ST_READY;
+ spoe_stop_processing(agent, ctx);
+ return 1;
+
+ end:
+ spoe_update_stats(s, agent, ctx, dir);
+ spoe_stop_processing(agent, ctx);
+ if (ctx->status_code) {
+ _HA_ATOMIC_INC(&agent->counters.nb_errors);
+ spoe_handle_processing_error(s, agent, ctx, dir);
+ ret = 1;
+ }
+ return ret;
+}
+
+/* Process a SPOE group, i.e. the list of messages attached to the group
+ * <grp>. See spoe_process_messages for details.
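+ * Note that, unlike events, which are bound to stream analyzers, groups
+ * are typically triggered on demand, e.g. by a tcp/http rule using the
+ * 'send-spoe-group' action.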
+ */
+static int
+spoe_process_group(struct stream *s, struct spoe_context *ctx,
+ struct spoe_group *group, int dir)
+{
+ struct spoe_config *conf = FLT_CONF(ctx->filter);
+ struct spoe_agent *agent = conf->agent;
+ int ret;
+
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p"
+ " - ctx-state=%s - Process messages for group=%s\n",
+ (int)date.tv_sec, (int)date.tv_usec, agent->id,
+ __FUNCTION__, s, spoe_ctx_state_str[ctx->state],
+ group->id);
+
+ if (LIST_ISEMPTY(&group->messages))
+ return 1;
+
+ ret = spoe_process_messages(s, ctx, &group->messages, dir, SPOE_MSGS_BY_GROUP);
+ if (ret && ctx->stats.t_process != -1) {
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p"
+ " - <GROUP:%s> sid=%u st=%u %ld/%ld/%ld/%ld/%ld %u/%u %u/%u %llu/%llu %u/%u\n",
+ (int)date.tv_sec, (int)date.tv_usec, agent->id,
+ __FUNCTION__, s, group->id, s->uniq_id, ctx->status_code,
+ ctx->stats.t_request, ctx->stats.t_queue, ctx->stats.t_waiting,
+ ctx->stats.t_response, ctx->stats.t_process,
+ agent->counters.idles, agent->counters.applets,
+ agent->counters.nb_sending, agent->counters.nb_waiting,
+ agent->counters.nb_errors, agent->counters.nb_processed,
+ agent->rt[tid].processing, read_freq_ctr(&agent->rt[tid].processing_per_sec));
+ if (ctx->status_code || !(conf->agent_fe.options2 & PR_O2_NOLOGNORM))
+ send_log(&conf->agent_fe, (!ctx->status_code ? LOG_NOTICE : LOG_WARNING),
+ "SPOE: [%s] <GROUP:%s> sid=%u st=%u %ld/%ld/%ld/%ld/%ld %u/%u %u/%u %llu/%llu\n",
+ agent->id, group->id, s->uniq_id, ctx->status_code,
+ ctx->stats.t_request, ctx->stats.t_queue, ctx->stats.t_waiting,
+ ctx->stats.t_response, ctx->stats.t_process,
+ agent->counters.idles, agent->counters.applets,
+ agent->counters.nb_sending, agent->counters.nb_waiting,
+ agent->counters.nb_errors, agent->counters.nb_processed);
+ }
+ return ret;
+}
+
+/* Process a SPOE event, i.e. the list of messages attached to the event
+ * <ev>. See spoe_process_messages for details. */
+static int
+spoe_process_event(struct stream *s, struct spoe_context *ctx,
+ enum spoe_event ev)
+{
+ struct spoe_config *conf = FLT_CONF(ctx->filter);
+ struct spoe_agent *agent = conf->agent;
+ int dir, ret;
+
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p"
+ " - ctx-state=%s - Process messages for event=%s\n",
+ (int)date.tv_sec, (int)date.tv_usec, agent->id,
+ __FUNCTION__, s, spoe_ctx_state_str[ctx->state],
+ spoe_event_str[ev]);
+
+ dir = ((ev < SPOE_EV_ON_SERVER_SESS) ? SMP_OPT_DIR_REQ : SMP_OPT_DIR_RES);
+
+ if (LIST_ISEMPTY(&(ctx->events[ev])))
+ return 1;
+
+ ret = spoe_process_messages(s, ctx, &(ctx->events[ev]), dir, SPOE_MSGS_BY_EVENT);
+ if (ret && ctx->stats.t_process != -1) {
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p"
+ " - <EVENT:%s> sid=%u st=%u %ld/%ld/%ld/%ld/%ld %u/%u %u/%u %llu/%llu %u/%u\n",
+ (int)date.tv_sec, (int)date.tv_usec, agent->id,
+ __FUNCTION__, s, spoe_event_str[ev], s->uniq_id, ctx->status_code,
+ ctx->stats.t_request, ctx->stats.t_queue, ctx->stats.t_waiting,
+ ctx->stats.t_response, ctx->stats.t_process,
+ agent->counters.idles, agent->counters.applets,
+ agent->counters.nb_sending, agent->counters.nb_waiting,
+ agent->counters.nb_errors, agent->counters.nb_processed,
+ agent->rt[tid].processing, read_freq_ctr(&agent->rt[tid].processing_per_sec));
+ if (ctx->status_code || !(conf->agent_fe.options2 & PR_O2_NOLOGNORM))
+ send_log(&conf->agent_fe, (!ctx->status_code ? LOG_NOTICE : LOG_WARNING),
+ "SPOE: [%s] <EVENT:%s> sid=%u st=%u %ld/%ld/%ld/%ld/%ld %u/%u %u/%u %llu/%llu\n",
+ agent->id, spoe_event_str[ev], s->uniq_id, ctx->status_code,
+ ctx->stats.t_request, ctx->stats.t_queue, ctx->stats.t_waiting,
+ ctx->stats.t_response, ctx->stats.t_process,
+ agent->counters.idles, agent->counters.applets,
+ agent->counters.nb_sending, agent->counters.nb_waiting,
+ agent->counters.nb_errors, agent->counters.nb_processed);
+ }
+ return ret;
+}
+
+/***************************************************************************
+ * Functions that create/destroy SPOE contexts
+ **************************************************************************/
+static int
+spoe_acquire_buffer(struct buffer *buf, struct buffer_wait *buffer_wait)
+{
+ if (buf->size)
+ return 1;
+
+ if (LIST_INLIST(&buffer_wait->list))
+ LIST_DEL_INIT(&buffer_wait->list);
+
+ if (b_alloc(buf))
+ return 1;
+
+ LIST_APPEND(&th_ctx->buffer_wq, &buffer_wait->list);
+ return 0;
+}
+
+static void
+spoe_release_buffer(struct buffer *buf, struct buffer_wait *buffer_wait)
+{
+ if (LIST_INLIST(&buffer_wait->list))
+ LIST_DEL_INIT(&buffer_wait->list);
+
+ /* Release the buffer if needed */
+ if (buf->size) {
+ b_free(buf);
+ offer_buffers(buffer_wait->target, 1);
+ }
+}
+
+static int
+spoe_wakeup_context(struct spoe_context *ctx)
+{
+ task_wakeup(ctx->strm->task, TASK_WOKEN_MSG);
+ return 1;
+}
+
+static struct spoe_context *
+spoe_create_context(struct stream *s, struct filter *filter)
+{
+ struct spoe_config *conf = FLT_CONF(filter);
+ struct spoe_context *ctx;
+
+ ctx = pool_zalloc(pool_head_spoe_ctx);
+ if (ctx == NULL) {
+ return NULL;
+ }
+ ctx->filter = filter;
+ ctx->state = SPOE_CTX_ST_NONE;
+ ctx->status_code = SPOE_CTX_ERR_NONE;
+ ctx->flags = 0;
+ ctx->events = conf->agent->events;
+ ctx->groups = &conf->agent->groups;
+ ctx->buffer = BUF_NULL;
+ LIST_INIT(&ctx->buffer_wait.list);
+ ctx->buffer_wait.target = ctx;
+ ctx->buffer_wait.wakeup_cb = (int (*)(void *))spoe_wakeup_context;
+ LIST_INIT(&ctx->list);
+
+ ctx->stream_id = 0;
+ ctx->frame_id = 1;
+ ctx->process_exp = TICK_ETERNITY;
+
+ ctx->stats.start_ts = 0;
+ ctx->stats.request_ts = 0;
+ ctx->stats.queue_ts = 0;
+ ctx->stats.wait_ts = 0;
+ ctx->stats.response_ts= 0;
+ ctx->stats.t_request = -1;
+ ctx->stats.t_queue = -1;
+ ctx->stats.t_waiting = -1;
+ ctx->stats.t_response = -1;
+ ctx->stats.t_process = -1;
+ ctx->stats.t_total = 0;
+
+ ctx->strm = s;
+ ctx->state = SPOE_CTX_ST_READY;
+ filter->ctx = ctx;
+
+ return ctx;
+}
+
+static void
+spoe_destroy_context(struct filter *filter)
+{
+ struct spoe_config *conf = FLT_CONF(filter);
+ struct spoe_context *ctx = filter->ctx;
+
+ if (!ctx)
+ return;
+
+ spoe_stop_processing(conf->agent, ctx);
+ pool_free(pool_head_spoe_ctx, ctx);
+ filter->ctx = NULL;
+}
+
+static void
+spoe_reset_context(struct spoe_context *ctx)
+{
+ ctx->state = SPOE_CTX_ST_READY;
+ ctx->flags &= ~(SPOE_CTX_FL_PROCESS|SPOE_CTX_FL_FRAGMENTED);
+
+ ctx->stats.start_ts = 0;
+ ctx->stats.request_ts = 0;
+ ctx->stats.queue_ts = 0;
+ ctx->stats.wait_ts = 0;
+ ctx->stats.response_ts= 0;
+ ctx->stats.t_request = -1;
+ ctx->stats.t_queue = -1;
+ ctx->stats.t_waiting = -1;
+ ctx->stats.t_response = -1;
+ ctx->stats.t_process = -1;
+ ctx->stats.t_total = 0;
+}
+
+
+/***************************************************************************
+ * Hooks that manage the filter lifecycle (init/check/deinit)
+ **************************************************************************/
+/* Signal handler: Do a soft stop, wakeup SPOE applet
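+ * (all applets of all agents, on every thread, so that each one can
+ * notice the stopping flag and disconnect cleanly)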
+ */
+static void
+spoe_sig_stop(struct sig_handler *sh)
+{
+ struct proxy *p;
+
+ p = proxies_list;
+ while (p) {
+ struct flt_conf *fconf;
+
+ /* SPOE filters are not initialized for disabled proxies. Move to
+ * the next one
+ */
+ if (p->flags & PR_FL_DISABLED) {
+ p = p->next;
+ continue;
+ }
+
+ list_for_each_entry(fconf, &p->filter_configs, list) {
+ struct spoe_config *conf;
+ struct spoe_agent *agent;
+ struct spoe_appctx *spoe_appctx;
+ int i;
+
+ if (fconf->id != spoe_filter_id)
+ continue;
+
+ conf = fconf->conf;
+ agent = conf->agent;
+
+ for (i = 0; i < global.nbthread; ++i) {
+ HA_SPIN_LOCK(SPOE_APPLET_LOCK, &agent->rt[i].lock);
+ list_for_each_entry(spoe_appctx, &agent->rt[i].applets, list)
+ spoe_wakeup_appctx(spoe_appctx->owner);
+ HA_SPIN_UNLOCK(SPOE_APPLET_LOCK, &agent->rt[i].lock);
+ }
+ }
+ p = p->next;
+ }
+}
+
+
+/* Initialize the SPOE filter. Returns -1 on error, else 0. */
+static int
+spoe_init(struct proxy *px, struct flt_conf *fconf)
+{
+ struct spoe_config *conf = fconf->conf;
+
+ /* conf->agent_fe was already initialized during the config
+ * parsing. Finish initialization. */
+ conf->agent_fe.last_change = ns_to_sec(now_ns);
+ conf->agent_fe.cap = PR_CAP_FE;
+ conf->agent_fe.mode = PR_MODE_TCP;
+ conf->agent_fe.maxconn = 0;
+ conf->agent_fe.options2 |= PR_O2_INDEPSTR;
+ conf->agent_fe.conn_retries = CONN_RETRIES;
+ conf->agent_fe.accept = frontend_accept;
+ conf->agent_fe.srv = NULL;
+ conf->agent_fe.timeout.client = TICK_ETERNITY;
+ conf->agent_fe.fe_req_ana = AN_REQ_SWITCHING_RULES;
+
+ if (!sighandler_registered) {
+ signal_register_fct(0, spoe_sig_stop, 0);
+ sighandler_registered = 1;
+ }
+
+ fconf->flags |= FLT_CFG_FL_HTX;
+ return 0;
+}
+
+/* Free resources allocated by the SPOE filter. */
+static void
+spoe_deinit(struct proxy *px, struct flt_conf *fconf)
+{
+ struct spoe_config *conf = fconf->conf;
+
+ if (conf) {
+ struct spoe_agent *agent = conf->agent;
+
+ spoe_release_agent(agent);
+ free(conf->id);
+ free(conf);
+ }
+ fconf->conf = NULL;
+}
+
+/* Check configuration of a SPOE filter for a specified proxy.
+ * Return 1 on error, else 0. */
+static int
+spoe_check(struct proxy *px, struct flt_conf *fconf)
+{
+ struct flt_conf *f;
+ struct spoe_config *conf = fconf->conf;
+ struct proxy *target;
+ int i;
+
+ /* Check all SPOE filters for proxy <px> to be sure all SPOE agent names
+ * are unique */
+ list_for_each_entry(f, &px->filter_configs, list) {
+ struct spoe_config *c = f->conf;
+
+ /* This is not an SPOE filter */
+ if (f->id != spoe_filter_id)
+ continue;
+ /* This is the current SPOE filter */
+ if (f == fconf)
+ continue;
+
+ /* Check engine Id. It should be unique */
+ if (strcmp(conf->id, c->id) == 0) {
+ ha_alert("Proxy %s : duplicated name for SPOE engine '%s'.\n",
+ px->id, conf->id);
+ return 1;
+ }
+ }
+
+ target = proxy_be_by_name(conf->agent->b.name);
+ if (target == NULL) {
+ ha_alert("Proxy %s : unknown backend '%s' used by SPOE agent '%s'"
+ " declared at %s:%d.\n",
+ px->id, conf->agent->b.name, conf->agent->id,
+ conf->agent->conf.file, conf->agent->conf.line);
+ return 1;
+ }
+ if (target->mode != PR_MODE_TCP) {
+ ha_alert("Proxy %s : backend '%s' used by SPOE agent '%s' declared"
+ " at %s:%d does not support HTTP mode.\n",
+ px->id, target->id, conf->agent->id,
+ conf->agent->conf.file, conf->agent->conf.line);
+ return 1;
+ }
+
+ if ((conf->agent->rt = calloc(global.nbthread, sizeof(*conf->agent->rt))) == NULL) {
+ ha_alert("Proxy %s : out of memory initializing SPOE agent '%s' declared at %s:%d.\n",
+ px->id, conf->agent->id, conf->agent->conf.file, conf->agent->conf.line);
+ return 1;
+ }
+ for (i = 0; i < global.nbthread; ++i) {
+ conf->agent->rt[i].engine_id = NULL;
+ conf->agent->rt[i].frame_size = conf->agent->max_frame_size;
+ conf->agent->rt[i].processing = 0;
+ conf->agent->rt[i].idles = 0;
+ LIST_INIT(&conf->agent->rt[i].applets);
+ LIST_INIT(&conf->agent->rt[i].sending_queue);
+ LIST_INIT(&conf->agent->rt[i].waiting_queue);
+ HA_SPIN_INIT(&conf->agent->rt[i].lock);
+ }
+
+ if (postresolve_logger_list(&conf->agent_fe.loggers, "SPOE agent", conf->agent->id) & ERR_CODE)
+ return 1;
+
+ ha_free(&conf->agent->b.name);
+ conf->agent->b.be = target;
+ return 0;
+}
+
+/* Initializes the SPOE filter for a proxy for a specific thread.
+ * Returns a negative value if an error occurs. */
+static int
+spoe_init_per_thread(struct proxy *p, struct flt_conf *fconf)
+{
+ struct spoe_config *conf = fconf->conf;
+ struct spoe_agent *agent = conf->agent;
+
+ agent->rt[tid].engine_id = generate_pseudo_uuid();
+ if (agent->rt[tid].engine_id == NULL)
+ return -1;
+ return 0;
+}
+
+/**************************************************************************
+ * Hooks attached to a stream
+ *************************************************************************/
+/* Called when a filter instance is created and attached to a stream. It
+ * creates the context that will be used to process this stream.
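+ * It also maps the events configured on the agent to the corresponding
+ * stream analyzers below (e.g. on-http-request events imply the
+ * AN_REQ_HTTP_PROCESS_* pre-analyzers).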
+ */
+static int
+spoe_start(struct stream *s, struct filter *filter)
+{
+ struct spoe_config *conf = FLT_CONF(filter);
+ struct spoe_agent *agent = conf->agent;
+ struct spoe_context *ctx;
+
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p\n",
+ (int)date.tv_sec, (int)date.tv_usec, agent->id,
+ __FUNCTION__, s);
+
+ if ((ctx = spoe_create_context(s, filter)) == NULL) {
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p"
+ " - failed to create SPOE context\n",
+ (int)date.tv_sec, (int)date.tv_usec, agent->id,
+ __FUNCTION__, s);
+ send_log(&conf->agent_fe, LOG_EMERG,
+ "SPOE: [%s] failed to create SPOE context\n",
+ agent->id);
+ return 0;
+ }
+
+ if (!LIST_ISEMPTY(&ctx->events[SPOE_EV_ON_TCP_REQ_FE]))
+ filter->pre_analyzers |= AN_REQ_INSPECT_FE;
+
+ if (!LIST_ISEMPTY(&ctx->events[SPOE_EV_ON_TCP_REQ_BE]))
+ filter->pre_analyzers |= AN_REQ_INSPECT_BE;
+
+ if (!LIST_ISEMPTY(&ctx->events[SPOE_EV_ON_TCP_RSP]))
+ filter->pre_analyzers |= AN_RES_INSPECT;
+
+ if (!LIST_ISEMPTY(&ctx->events[SPOE_EV_ON_HTTP_REQ_FE]))
+ filter->pre_analyzers |= AN_REQ_HTTP_PROCESS_FE;
+
+ if (!LIST_ISEMPTY(&ctx->events[SPOE_EV_ON_HTTP_REQ_BE]))
+ filter->pre_analyzers |= AN_REQ_HTTP_PROCESS_BE;
+
+ if (!LIST_ISEMPTY(&ctx->events[SPOE_EV_ON_HTTP_RSP]))
+ filter->pre_analyzers |= AN_RES_HTTP_PROCESS_FE;
+
+ return 1;
+}
+
+/* Called when a filter instance is detached from a stream. It releases the
+ * attached SPOE context. */
+static void
+spoe_stop(struct stream *s, struct filter *filter)
+{
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p\n",
+ (int)date.tv_sec, (int)date.tv_usec,
+ ((struct spoe_config *)FLT_CONF(filter))->agent->id,
+ __FUNCTION__, s);
+ spoe_destroy_context(filter);
+}
+
+
+/*
+ * Called when the stream is woken up because of an expired timer.
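+ * When the processing deadline was reached, TASK_WOKEN_MSG is flagged so
+ * that spoe_process_messages() notices the timeout on its next pass.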
+ */
+static void
+spoe_check_timeouts(struct stream *s, struct filter *filter)
+{
+ struct spoe_context *ctx = filter->ctx;
+
+ if (tick_is_expired(ctx->process_exp, now_ms))
+ s->pending_events |= TASK_WOKEN_MSG;
+}
+
+/* Called when we are ready to filter data on a channel */
+static int
+spoe_start_analyze(struct stream *s, struct filter *filter, struct channel *chn)
+{
+ struct spoe_context *ctx = filter->ctx;
+ int ret = 1;
+
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p - ctx-state=%s"
+ " - ctx-flags=0x%08x\n",
+ (int)date.tv_sec, (int)date.tv_usec,
+ ((struct spoe_config *)FLT_CONF(filter))->agent->id,
+ __FUNCTION__, s, spoe_ctx_state_str[ctx->state], ctx->flags);
+
+ if (ctx->state == SPOE_CTX_ST_NONE)
+ goto out;
+
+ if (!(chn->flags & CF_ISRESP)) {
+ if (filter->pre_analyzers & AN_REQ_INSPECT_FE)
+ chn->analysers |= AN_REQ_INSPECT_FE;
+ if (filter->pre_analyzers & AN_REQ_INSPECT_BE)
+ chn->analysers |= AN_REQ_INSPECT_BE;
+
+ if (ctx->flags & SPOE_CTX_FL_CLI_CONNECTED)
+ goto out;
+
+ ctx->stream_id = s->uniq_id;
+ ret = spoe_process_event(s, ctx, SPOE_EV_ON_CLIENT_SESS);
+ if (!ret)
+ goto out;
+ ctx->flags |= SPOE_CTX_FL_CLI_CONNECTED;
+ }
+ else {
+ if (filter->pre_analyzers & AN_RES_INSPECT)
+ chn->analysers |= AN_RES_INSPECT;
+
+ if (ctx->flags & SPOE_CTX_FL_SRV_CONNECTED)
+ goto out;
+
+ ret = spoe_process_event(s, ctx, SPOE_EV_ON_SERVER_SESS);
+ if (!ret) {
+ channel_dont_read(chn);
+ channel_dont_close(chn);
+ goto out;
+ }
+ ctx->flags |= SPOE_CTX_FL_SRV_CONNECTED;
+ }
+
+ out:
+ return ret;
+}
+
+/* Called before processing happens on a given channel */
+static int
+spoe_chn_pre_analyze(struct stream *s, struct filter *filter,
+ struct channel *chn, unsigned an_bit)
+{
+ struct spoe_context *ctx = filter->ctx;
+ int ret = 1;
+
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p - ctx-state=%s"
+ " - ctx-flags=0x%08x - ana=0x%08x\n",
+ (int)date.tv_sec, (int)date.tv_usec,
+ ((struct spoe_config *)FLT_CONF(filter))->agent->id,
+ __FUNCTION__, s, spoe_ctx_state_str[ctx->state],
+ ctx->flags, an_bit);
+
+ if (ctx->state == SPOE_CTX_ST_NONE)
+ goto out;
+
+ switch (an_bit) {
+ case AN_REQ_INSPECT_FE:
+ ret = spoe_process_event(s, ctx, SPOE_EV_ON_TCP_REQ_FE);
+ break;
+ case AN_REQ_INSPECT_BE:
+ ret = spoe_process_event(s, ctx, SPOE_EV_ON_TCP_REQ_BE);
+ break;
+ case AN_RES_INSPECT:
+ ret = spoe_process_event(s, ctx, SPOE_EV_ON_TCP_RSP);
+ break;
+ case AN_REQ_HTTP_PROCESS_FE:
+ ret = spoe_process_event(s, ctx, SPOE_EV_ON_HTTP_REQ_FE);
+ break;
+ case AN_REQ_HTTP_PROCESS_BE:
+ ret = spoe_process_event(s, ctx, SPOE_EV_ON_HTTP_REQ_BE);
+ break;
+ case AN_RES_HTTP_PROCESS_FE:
+ ret = spoe_process_event(s, ctx, SPOE_EV_ON_HTTP_RSP);
+ break;
+ }
+
+ out:
+ if (!ret && (chn->flags & CF_ISRESP)) {
+ channel_dont_read(chn);
+ channel_dont_close(chn);
+ }
+ return ret;
+}
+
+/* Called when the filtering on the channel ends.
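+ * If no processing is in progress, the SPOE context is simply reset so
+ * the filter is ready for a subsequent analysis round on this stream.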
+ */
+static int
+spoe_end_analyze(struct stream *s, struct filter *filter, struct channel *chn)
+{
+ struct spoe_context *ctx = filter->ctx;
+
+ SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p - ctx-state=%s"
+ " - ctx-flags=0x%08x\n",
+ (int)date.tv_sec, (int)date.tv_usec,
+ ((struct spoe_config *)FLT_CONF(filter))->agent->id,
+ __FUNCTION__, s, spoe_ctx_state_str[ctx->state], ctx->flags);
+
+ if (!(ctx->flags & SPOE_CTX_FL_PROCESS)) {
+ spoe_reset_context(ctx);
+ }
+
+ return 1;
+}
+
+/********************************************************************
+ * Functions that manage the filter initialization
+ ********************************************************************/
+struct flt_ops spoe_ops = {
+ /* Manage SPOE filter, called for each filter declaration */
+ .init = spoe_init,
+ .deinit = spoe_deinit,
+ .check = spoe_check,
+ .init_per_thread = spoe_init_per_thread,
+
+ /* Handle start/stop of SPOE */
+ .attach = spoe_start,
+ .detach = spoe_stop,
+ .check_timeouts = spoe_check_timeouts,
+
+ /* Handle channels activity */
+ .channel_start_analyze = spoe_start_analyze,
+ .channel_pre_analyze = spoe_chn_pre_analyze,
+ .channel_end_analyze = spoe_end_analyze,
+};
+
+
+static int
+cfg_parse_spoe_agent(const char *file, int linenum, char **args, int kwm)
+{
+ const char *err;
+ int i, err_code = 0;
+
+ if ((cfg_scope == NULL && curengine != NULL) ||
+ (cfg_scope != NULL && curengine == NULL) ||
+ (curengine != NULL && cfg_scope != NULL && strcmp(curengine, cfg_scope) != 0))
+ goto out;
+
+ if (strcmp(args[0], "spoe-agent") == 0) { /* new spoe-agent section */
+ if (!*args[1]) {
+ ha_alert("parsing [%s:%d] : missing name for spoe-agent section.\n",
+ file, linenum);
+ err_code |= ERR_ALERT | ERR_ABORT;
+ goto out;
+ }
+ if (alertif_too_many_args(1, file, linenum, args, &err_code)) {
+ err_code |= ERR_ABORT;
+ goto out;
+ }
+
+ err = invalid_char(args[1]);
+ if (err) {
+ ha_alert("parsing [%s:%d] : character '%c' is not permitted in '%s' name '%s'.\n",
+ file, linenum, *err, args[0], args[1]);
+ err_code |= ERR_ALERT | ERR_ABORT;
+ goto out;
+ }
+
+ if (curagent != NULL) {
+ ha_alert("parsing [%s:%d] : another spoe-agent section previously defined.\n",
+ file, linenum);
+ err_code |= ERR_ALERT | ERR_ABORT;
+ goto out;
+ }
+ if ((curagent = calloc(1, sizeof(*curagent))) == NULL) {
+ ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum);
+ err_code |= ERR_ALERT | ERR_ABORT;
+ goto out;
+ }
+
+ curagent->id = strdup(args[1]);
+
+ curagent->conf.file = strdup(file);
+ curagent->conf.line = linenum;
+
+ curagent->timeout.hello = TICK_ETERNITY;
+ curagent->timeout.idle = TICK_ETERNITY;
+ curagent->timeout.processing = TICK_ETERNITY;
+
+ curagent->var_pfx = NULL;
+ curagent->var_on_error = NULL;
+ curagent->var_t_process = NULL;
+ curagent->var_t_total = NULL;
+ curagent->flags = (SPOE_FL_ASYNC | SPOE_FL_PIPELINING | SPOE_FL_SND_FRAGMENTATION);
+ curagent->cps_max = 0;
+ curagent->eps_max = 0;
+ curagent->max_frame_size = MAX_FRAME_SIZE;
+ curagent->max_fpa = 20;
+
+ for (i = 0; i < SPOE_EV_EVENTS; ++i)
+ LIST_INIT(&curagent->events[i]);
+ LIST_INIT(&curagent->groups);
+ LIST_INIT(&curagent->messages);
+ }
+ else if (strcmp(args[0], "use-backend") == 0) {
+ if (!*args[1]) {
+ ha_alert("parsing [%s:%d] : '%s' expects a backend name.\n",
+ file, linenum, args[0]);
+ err_code |= ERR_ALERT | ERR_FATAL;
+ goto out;
+ }
+ if (alertif_too_many_args(1, file, linenum, args, &err_code))
+ goto out;
+ free(curagent->b.name);
+ curagent->b.name = strdup(args[1]);
+ }
+ else if (strcmp(args[0], "messages") == 0) {
(strcmp(args[0], "messages") == 0) { + int cur_arg = 1; + while (*args[cur_arg]) { + struct spoe_placeholder *ph = NULL; + + list_for_each_entry(ph, &curmphs, list) { + if (strcmp(ph->id, args[cur_arg]) == 0) { + ha_alert("parsing [%s:%d]: spoe-message '%s' already used.\n", + file, linenum, args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + + if ((ph = calloc(1, sizeof(*ph))) == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + ph->id = strdup(args[cur_arg]); + LIST_APPEND(&curmphs, &ph->list); + cur_arg++; + } + } + else if (strcmp(args[0], "groups") == 0) { + int cur_arg = 1; + while (*args[cur_arg]) { + struct spoe_placeholder *ph = NULL; + + list_for_each_entry(ph, &curgphs, list) { + if (strcmp(ph->id, args[cur_arg]) == 0) { + ha_alert("parsing [%s:%d]: spoe-group '%s' already used.\n", + file, linenum, args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + + if ((ph = calloc(1, sizeof(*ph))) == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + ph->id = strdup(args[cur_arg]); + LIST_APPEND(&curgphs, &ph->list); + cur_arg++; + } + } + else if (strcmp(args[0], "timeout") == 0) { + unsigned int *tv = NULL; + const char *res; + unsigned timeout; + + if (!*args[1]) { + ha_alert("parsing [%s:%d] : 'timeout' expects 'hello', 'idle' and 'processing'.\n", + file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (alertif_too_many_args(2, file, linenum, args, &err_code)) + goto out; + if (strcmp(args[1], "hello") == 0) + tv = &curagent->timeout.hello; + else if (strcmp(args[1], "idle") == 0) + tv = &curagent->timeout.idle; + else if (strcmp(args[1], "processing") == 0) + tv = &curagent->timeout.processing; + else { + ha_alert("parsing [%s:%d] : 'timeout' supports 'hello', 'idle' or 'processing' (got %s).\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (!*args[2]) { + ha_alert("parsing [%s:%d] : 'timeout %s' expects an integer value (in milliseconds).\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + res = parse_time_err(args[2], &timeout, TIME_UNIT_MS); + if (res == PARSE_TIME_OVER) { + ha_alert("parsing [%s:%d]: timer overflow in argument <%s> to <%s %s>, maximum value is 2147483647 ms (~24.8 days).\n", + file, linenum, args[2], args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (res == PARSE_TIME_UNDER) { + ha_alert("parsing [%s:%d]: timer underflow in argument <%s> to <%s %s>, minimum non-null value is 1 ms.\n", + file, linenum, args[2], args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (res) { + ha_alert("parsing [%s:%d] : unexpected character '%c' in 'timeout %s'.\n", + file, linenum, *res, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + *tv = MS_TO_TICKS(timeout); + } + else if (strcmp(args[0], "option") == 0) { + if (!*args[1]) { + ha_alert("parsing [%s:%d]: '%s' expects an option name.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (strcmp(args[1], "pipelining") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + if (kwm == 1) + curagent->flags &= ~SPOE_FL_PIPELINING; + else + curagent->flags |= SPOE_FL_PIPELINING; + goto out; + } + else if (strcmp(args[1], "async") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + 
+ goto out;
+ if (kwm == 1)
+ curagent->flags &= ~SPOE_FL_ASYNC;
+ else
+ curagent->flags |= SPOE_FL_ASYNC;
+ goto out;
+ }
+ else if (strcmp(args[1], "send-frag-payload") == 0) {
+ if (alertif_too_many_args(1, file, linenum, args, &err_code))
+ goto out;
+ if (kwm == 1)
+ curagent->flags &= ~SPOE_FL_SND_FRAGMENTATION;
+ else
+ curagent->flags |= SPOE_FL_SND_FRAGMENTATION;
+ goto out;
+ }
+ else if (strcmp(args[1], "dontlog-normal") == 0) {
+ if (alertif_too_many_args(1, file, linenum, args, &err_code))
+ goto out;
+ if (kwm == 1)
+ curpxopts2 &= ~PR_O2_NOLOGNORM;
+ else
+ curpxopts2 |= PR_O2_NOLOGNORM;
+ goto out;
+ }
+
+ /* The following options do not support negation */
+ if (kwm == 1) {
+ ha_alert("parsing [%s:%d]: negation is not supported for option '%s'.\n",
+ file, linenum, args[1]);
+ err_code |= ERR_ALERT | ERR_FATAL;
+ goto out;
+ }
+
+ if (strcmp(args[1], "var-prefix") == 0) {
+ char *tmp;
+
+ if (!*args[2]) {
+ ha_alert("parsing [%s:%d]: '%s %s' expects a value.\n",
+ file, linenum, args[0],
+ args[1]);
+ err_code |= ERR_ALERT | ERR_FATAL;
+ goto out;
+ }
+ if (alertif_too_many_args(2, file, linenum, args, &err_code))
+ goto out;
+ tmp = args[2];
+ while (*tmp) {
+ if (!isalnum((unsigned char)*tmp) && *tmp != '_' && *tmp != '.') {
+ ha_alert("parsing [%s:%d]: '%s %s' only supports [a-zA-Z0-9_.] chars.\n",
+ file, linenum, args[0], args[1]);
+ err_code |= ERR_ALERT | ERR_FATAL;
+ goto out;
+ }
+ tmp++;
+ }
+ curagent->var_pfx = strdup(args[2]);
+ }
+ else if (strcmp(args[1], "force-set-var") == 0) {
+ if (alertif_too_many_args(1, file, linenum, args, &err_code))
+ goto out;
+ curagent->flags |= SPOE_FL_FORCE_SET_VAR;
+ }
+ else if (strcmp(args[1], "continue-on-error") == 0) {
+ if (alertif_too_many_args(1, file, linenum, args, &err_code))
+ goto out;
+ curagent->flags |= SPOE_FL_CONT_ON_ERR;
+ }
+ else if (strcmp(args[1], "set-on-error") == 0) {
+ char *tmp;
+
+ if (!*args[2]) {
+ ha_alert("parsing [%s:%d]: '%s %s' expects a value.\n",
+ file, linenum, args[0],
+ args[1]);
+ err_code |= ERR_ALERT | ERR_FATAL;
+ goto out;
+ }
+ if (alertif_too_many_args(2, file, linenum, args, &err_code))
+ goto out;
+ tmp = args[2];
+ while (*tmp) {
+ if (!isalnum((unsigned char)*tmp) && *tmp != '_' && *tmp != '.') {
+ ha_alert("parsing [%s:%d]: '%s %s' only supports [a-zA-Z0-9_.] chars.\n",
+ file, linenum, args[0], args[1]);
+ err_code |= ERR_ALERT | ERR_FATAL;
+ goto out;
+ }
+ tmp++;
+ }
+ curagent->var_on_error = strdup(args[2]);
+ }
+ else if (strcmp(args[1], "set-process-time") == 0) {
+ char *tmp;
+
+ if (!*args[2]) {
+ ha_alert("parsing [%s:%d]: '%s %s' expects a value.\n",
+ file, linenum, args[0],
+ args[1]);
+ err_code |= ERR_ALERT | ERR_FATAL;
+ goto out;
+ }
+ if (alertif_too_many_args(2, file, linenum, args, &err_code))
+ goto out;
+ tmp = args[2];
+ while (*tmp) {
+ if (!isalnum((unsigned char)*tmp) && *tmp != '_' && *tmp != '.') {
+ ha_alert("parsing [%s:%d]: '%s %s' only supports [a-zA-Z0-9_.] chars.\n",
chars.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + tmp++; + } + curagent->var_t_process = strdup(args[2]); + } + else if (strcmp(args[1], "set-total-time") == 0) { + char *tmp; + + if (!*args[2]) { + ha_alert("parsing [%s:%d]: '%s %s' expects a value.\n", + file, linenum, args[0], + args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (alertif_too_many_args(2, file, linenum, args, &err_code)) + goto out; + tmp = args[2]; + while (*tmp) { + if (!isalnum((unsigned char)*tmp) && *tmp != '_' && *tmp != '.') { + ha_alert("parsing [%s:%d]: '%s %s' only supports [a-zA-Z0-9_.] chars.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + tmp++; + } + curagent->var_t_total = strdup(args[2]); + } + else { + ha_alert("parsing [%s:%d]: option '%s' is not supported.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "maxconnrate") == 0) { + if (!*args[1]) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + curagent->cps_max = atol(args[1]); + } + else if (strcmp(args[0], "maxerrrate") == 0) { + if (!*args[1]) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + curagent->eps_max = atol(args[1]); + } + else if (strcmp(args[0], "max-frame-size") == 0) { + if (!*args[1]) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + curagent->max_frame_size = atol(args[1]); + if (curagent->max_frame_size < MIN_FRAME_SIZE || + curagent->max_frame_size > MAX_FRAME_SIZE) { + ha_alert("parsing [%s:%d] : '%s' expects a positive integer argument in the range [%d, %d].\n", + file, linenum, args[0], MIN_FRAME_SIZE, MAX_FRAME_SIZE); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "max-waiting-frames") == 0) { + if (!*args[1]) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + curagent->max_fpa = atol(args[1]); + if (curagent->max_fpa < 1) { + ha_alert("parsing [%s:%d] : '%s' expects a positive integer argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "register-var-names") == 0) { + int cur_arg; + + if (!*args[1]) { + ha_alert("parsing [%s:%d] : '%s' expects one or more variable names.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + cur_arg = 1; + while (*args[cur_arg]) { + struct spoe_var_placeholder *vph; + + if ((vph = calloc(1, sizeof(*vph))) == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + if ((vph->name = strdup(args[cur_arg])) == NULL) { + free(vph); + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + LIST_APPEND(&curvars, &vph->list); + cur_arg++; + } + } + else if 
(strcmp(args[0], "log") == 0) { + char *errmsg = NULL; + + if (!parse_logger(args, &curloggers, (kwm == 1), file, linenum, &errmsg)) { + ha_alert("parsing [%s:%d] : %s : %s\n", file, linenum, args[0], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (*args[0]) { + ha_alert("parsing [%s:%d] : unknown keyword '%s' in spoe-agent section.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + out: + return err_code; +} +static int +cfg_parse_spoe_group(const char *file, int linenum, char **args, int kwm) +{ + struct spoe_group *grp; + const char *err; + int err_code = 0; + + if ((cfg_scope == NULL && curengine != NULL) || + (cfg_scope != NULL && curengine == NULL) || + (curengine != NULL && cfg_scope != NULL && strcmp(curengine, cfg_scope) != 0)) + goto out; + + if (strcmp(args[0], "spoe-group") == 0) { /* new spoe-group section */ + if (!*args[1]) { + ha_alert("parsing [%s:%d] : missing name for spoe-group section.\n", + file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + if (alertif_too_many_args(1, file, linenum, args, &err_code)) { + err_code |= ERR_ABORT; + goto out; + } + + err = invalid_char(args[1]); + if (err) { + ha_alert("parsing [%s:%d] : character '%c' is not permitted in '%s' name '%s'.\n", + file, linenum, *err, args[0], args[1]); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + list_for_each_entry(grp, &curgrps, list) { + if (strcmp(grp->id, args[1]) == 0) { + ha_alert("parsing [%s:%d]: spoe-group section '%s' has the same" + " name as another one declared at %s:%d.\n", + file, linenum, args[1], grp->conf.file, grp->conf.line); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + + if ((curgrp = calloc(1, sizeof(*curgrp))) == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + curgrp->id = strdup(args[1]); + curgrp->conf.file = strdup(file); + curgrp->conf.line = linenum; + LIST_INIT(&curgrp->phs); + LIST_INIT(&curgrp->messages); + LIST_APPEND(&curgrps, &curgrp->list); + } + else if (strcmp(args[0], "messages") == 0) { + int cur_arg = 1; + while (*args[cur_arg]) { + struct spoe_placeholder *ph = NULL; + + list_for_each_entry(ph, &curgrp->phs, list) { + if (strcmp(ph->id, args[cur_arg]) == 0) { + ha_alert("parsing [%s:%d]: spoe-message '%s' already used.\n", + file, linenum, args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + + if ((ph = calloc(1, sizeof(*ph))) == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + ph->id = strdup(args[cur_arg]); + LIST_APPEND(&curgrp->phs, &ph->list); + cur_arg++; + } + } + else if (*args[0]) { + ha_alert("parsing [%s:%d] : unknown keyword '%s' in spoe-group section.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + out: + return err_code; +} + +static int +cfg_parse_spoe_message(const char *file, int linenum, char **args, int kwm) +{ + struct spoe_message *msg; + struct spoe_arg *arg; + const char *err; + char *errmsg = NULL; + int err_code = 0; + + if ((cfg_scope == NULL && curengine != NULL) || + (cfg_scope != NULL && curengine == NULL) || + (curengine != NULL && cfg_scope != NULL && strcmp(curengine, cfg_scope) != 0)) + goto out; + + if (strcmp(args[0], "spoe-message") == 0) { /* new spoe-message section */ + if (!*args[1]) { + ha_alert("parsing [%s:%d] : missing name for spoe-message section.\n", + file, linenum); + err_code |= ERR_ALERT | 
+ goto out;
+ }
+ if (alertif_too_many_args(1, file, linenum, args, &err_code)) {
+ err_code |= ERR_ABORT;
+ goto out;
+ }
+
+ err = invalid_char(args[1]);
+ if (err) {
+ ha_alert("parsing [%s:%d] : character '%c' is not permitted in '%s' name '%s'.\n",
+ file, linenum, *err, args[0], args[1]);
+ err_code |= ERR_ALERT | ERR_ABORT;
+ goto out;
+ }
+
+ list_for_each_entry(msg, &curmsgs, list) {
+ if (strcmp(msg->id, args[1]) == 0) {
+ ha_alert("parsing [%s:%d]: spoe-message section '%s' has the same"
+ " name as another one declared at %s:%d.\n",
+ file, linenum, args[1], msg->conf.file, msg->conf.line);
+ err_code |= ERR_ALERT | ERR_FATAL;
+ goto out;
+ }
+ }
+
+ if ((curmsg = calloc(1, sizeof(*curmsg))) == NULL) {
+ ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum);
+ err_code |= ERR_ALERT | ERR_ABORT;
+ goto out;
+ }
+
+ curmsg->id = strdup(args[1]);
+ curmsg->id_len = strlen(curmsg->id);
+ curmsg->event = SPOE_EV_NONE;
+ curmsg->conf.file = strdup(file);
+ curmsg->conf.line = linenum;
+ curmsg->nargs = 0;
+ LIST_INIT(&curmsg->args);
+ LIST_INIT(&curmsg->acls);
+ LIST_INIT(&curmsg->by_evt);
+ LIST_INIT(&curmsg->by_grp);
+ LIST_APPEND(&curmsgs, &curmsg->list);
+ }
+ else if (strcmp(args[0], "args") == 0) {
+ int cur_arg = 1;
+
+ curproxy->conf.args.ctx = ARGC_SPOE;
+ curproxy->conf.args.file = file;
+ curproxy->conf.args.line = linenum;
+ while (*args[cur_arg]) {
+ char *delim = strchr(args[cur_arg], '=');
+ int idx = 0;
+
+ if ((arg = calloc(1, sizeof(*arg))) == NULL) {
+ ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum);
+ err_code |= ERR_ALERT | ERR_ABORT;
+ goto out;
+ }
+
+ if (!delim) {
+ arg->name = NULL;
+ arg->name_len = 0;
+ delim = args[cur_arg];
+ }
+ else {
+ arg->name = my_strndup(args[cur_arg], delim - args[cur_arg]);
+ arg->name_len = delim - args[cur_arg];
+ delim++;
+ }
+ arg->expr = sample_parse_expr((char*[]){delim, NULL},
+ &idx, file, linenum, &errmsg,
+ &curproxy->conf.args, NULL);
+ if (arg->expr == NULL) {
+ ha_alert("parsing [%s:%d] : '%s': %s.\n", file, linenum, args[0], errmsg);
+ err_code |= ERR_ALERT | ERR_FATAL;
+ free(arg->name);
+ free(arg);
+ goto out;
+ }
+ curmsg->nargs++;
+ LIST_APPEND(&curmsg->args, &arg->list);
+ cur_arg++;
+ }
+ curproxy->conf.args.file = NULL;
+ curproxy->conf.args.line = 0;
+ }
+ else if (strcmp(args[0], "acl") == 0) {
+ err = invalid_char(args[1]);
+ if (err) {
+ ha_alert("parsing [%s:%d] : character '%c' is not permitted in acl name '%s'.\n",
+ file, linenum, *err, args[1]);
+ err_code |= ERR_ALERT | ERR_FATAL;
+ goto out;
+ }
+ if (strcasecmp(args[1], "or") == 0) {
+ ha_alert("parsing [%s:%d] : acl name '%s' will never match. 'or' is used to express a "
'or' is used to express a " + "logical disjunction within a condition.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (parse_acl((const char **)args + 1, &curmsg->acls, &errmsg, &curproxy->conf.args, file, linenum) == NULL) { + ha_alert("parsing [%s:%d] : error detected while parsing ACL '%s' : %s.\n", + file, linenum, args[1], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "event") == 0) { + if (!*args[1]) { + ha_alert("parsing [%s:%d] : missing event name.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + /* if (alertif_too_many_args(1, file, linenum, args, &err_code)) */ + /* goto out; */ + + if (strcmp(args[1], spoe_event_str[SPOE_EV_ON_CLIENT_SESS]) == 0) + curmsg->event = SPOE_EV_ON_CLIENT_SESS; + else if (strcmp(args[1], spoe_event_str[SPOE_EV_ON_SERVER_SESS]) == 0) + curmsg->event = SPOE_EV_ON_SERVER_SESS; + + else if (strcmp(args[1], spoe_event_str[SPOE_EV_ON_TCP_REQ_FE]) == 0) + curmsg->event = SPOE_EV_ON_TCP_REQ_FE; + else if (strcmp(args[1], spoe_event_str[SPOE_EV_ON_TCP_REQ_BE]) == 0) + curmsg->event = SPOE_EV_ON_TCP_REQ_BE; + else if (strcmp(args[1], spoe_event_str[SPOE_EV_ON_TCP_RSP]) == 0) + curmsg->event = SPOE_EV_ON_TCP_RSP; + + else if (strcmp(args[1], spoe_event_str[SPOE_EV_ON_HTTP_REQ_FE]) == 0) + curmsg->event = SPOE_EV_ON_HTTP_REQ_FE; + else if (strcmp(args[1], spoe_event_str[SPOE_EV_ON_HTTP_REQ_BE]) == 0) + curmsg->event = SPOE_EV_ON_HTTP_REQ_BE; + else if (strcmp(args[1], spoe_event_str[SPOE_EV_ON_HTTP_RSP]) == 0) + curmsg->event = SPOE_EV_ON_HTTP_RSP; + else { + ha_alert("parsing [%s:%d] : unknown event '%s'.\n", + file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (strcmp(args[2], "if") == 0 || strcmp(args[2], "unless") == 0) { + struct acl_cond *cond; + + cond = build_acl_cond(file, linenum, &curmsg->acls, + curproxy, (const char **)args+2, + &errmsg); + if (cond == NULL) { + ha_alert("parsing [%s:%d] : error detected while " + "parsing an 'event %s' condition : %s.\n", + file, linenum, args[1], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + curmsg->cond = cond; + } + else if (*args[2]) { + ha_alert("parsing [%s:%d]: 'event %s' expects either 'if' " + "or 'unless' followed by a condition but found '%s'.\n", + file, linenum, args[1], args[2]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (*args[0]) { + ha_alert("parsing [%s:%d] : unknown keyword '%s' in spoe-message section.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + out: + free(errmsg); + return err_code; +} + +/* Return -1 on error, else 0 */ +static int +parse_spoe_flt(char **args, int *cur_arg, struct proxy *px, + struct flt_conf *fconf, char **err, void *private) +{ + struct list backup_sections; + struct spoe_config *conf; + struct spoe_message *msg, *msgback; + struct spoe_group *grp, *grpback; + struct spoe_placeholder *ph, *phback; + struct spoe_var_placeholder *vph, *vphback; + struct logger *logger, *loggerback; + char *file = NULL, *engine = NULL; + int ret, pos = *cur_arg + 1; + + LIST_INIT(&curmsgs); + LIST_INIT(&curgrps); + LIST_INIT(&curmphs); + LIST_INIT(&curgphs); + LIST_INIT(&curvars); + LIST_INIT(&curloggers); + curpxopts = 0; + curpxopts2 = 0; + + conf = calloc(1, sizeof(*conf)); + if (conf == NULL) { + memprintf(err, "%s: out of memory", args[*cur_arg]); + goto error; + } + conf->proxy = px; + + while (*args[pos]) { + if (strcmp(args[pos], "config") == 0) { + if 
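+ /* (illustrative sketch only: how this parser is reached from a proxy
+  *  section; the engine and file names are invented, "engine" and
+  *  "config" are the two options handled just below)
+  *
+  *     filter spoe engine my-spoe-engine config /etc/haproxy/spoe.conf
+  */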
(!*args[pos+1]) { + memprintf(err, "'%s' : '%s' option without value", + args[*cur_arg], args[pos]); + goto error; + } + file = args[pos+1]; + pos += 2; + } + else if (strcmp(args[pos], "engine") == 0) { + if (!*args[pos+1]) { + memprintf(err, "'%s' : '%s' option without value", + args[*cur_arg], args[pos]); + goto error; + } + engine = args[pos+1]; + pos += 2; + } + else { + memprintf(err, "unknown keyword '%s'", args[pos]); + goto error; + } + } + if (file == NULL) { + memprintf(err, "'%s' : missing config file", args[*cur_arg]); + goto error; + } + + /* backup sections and register SPOE sections */ + LIST_INIT(&backup_sections); + cfg_backup_sections(&backup_sections); + cfg_register_section("spoe-agent", cfg_parse_spoe_agent, NULL); + cfg_register_section("spoe-group", cfg_parse_spoe_group, NULL); + cfg_register_section("spoe-message", cfg_parse_spoe_message, NULL); + + /* Parse SPOE filter configuration file */ + BUG_ON(px != curproxy); + curengine = engine; + curagent = NULL; + curmsg = NULL; + ret = readcfgfile(file); + + /* unregister SPOE sections and restore previous sections */ + cfg_unregister_sections(); + cfg_restore_sections(&backup_sections); + + if (ret == -1) { + memprintf(err, "Could not open configuration file %s : %s", + file, strerror(errno)); + goto error; + } + if (ret & (ERR_ABORT|ERR_FATAL)) { + memprintf(err, "Error(s) found in configuration file %s", file); + goto error; + } + + /* Check SPOE agent */ + if (curagent == NULL) { + memprintf(err, "No SPOE agent found in file %s", file); + goto error; + } + if (curagent->b.name == NULL) { + memprintf(err, "No backend declared for SPOE agent '%s' declared at %s:%d", + curagent->id, curagent->conf.file, curagent->conf.line); + goto error; + } + if (curagent->timeout.hello == TICK_ETERNITY || + curagent->timeout.idle == TICK_ETERNITY || + curagent->timeout.processing == TICK_ETERNITY) { + ha_warning("Proxy '%s': missing timeouts for SPOE agent '%s' declared at %s:%d.\n" + " | While not strictly invalid, you will certainly encounter various problems\n" + " | with such a configuration. To fix this, please ensure that all the following\n" + " | timeouts are set to a non-zero value: 'hello', 'idle', 'processing'.\n", + px->id, curagent->id, curagent->conf.file, curagent->conf.line); + } + if (curagent->var_pfx == NULL) { + char *tmp = curagent->id; + + while (*tmp) { + if (!isalnum((unsigned char)*tmp) && *tmp != '_' && *tmp != '.') { + memprintf(err, "Invalid variable prefix '%s' for SPOE agent '%s' declared at %s:%d. " + "Use 'option var-prefix' to set it. Only [a-zA-Z0-9_.] 
chars are supported.\n", + curagent->id, curagent->id, curagent->conf.file, curagent->conf.line); + goto error; + } + tmp++; + } + curagent->var_pfx = strdup(curagent->id); + } + + if (curagent->var_on_error) { + struct arg arg; + + trash.data = snprintf(trash.area, trash.size, "txn.%s.%s", + curagent->var_pfx, curagent->var_on_error); + + arg.type = ARGT_STR; + arg.data.str.area = trash.area; + arg.data.str.data = trash.data; + arg.data.str.size = 0; /* Set it to 0 to not release it in vars_check_arg() */ + if (!vars_check_arg(&arg, err)) { + memprintf(err, "SPOE agent '%s': failed to register variable %s.%s (%s)", + curagent->id, curagent->var_pfx, curagent->var_on_error, *err); + goto error; + } + } + + if (curagent->var_t_process) { + struct arg arg; + + trash.data = snprintf(trash.area, trash.size, "txn.%s.%s", + curagent->var_pfx, curagent->var_t_process); + + arg.type = ARGT_STR; + arg.data.str.area = trash.area; + arg.data.str.data = trash.data; + arg.data.str.size = 0; /* Set it to 0 to not release it in vars_check_arg() */ + if (!vars_check_arg(&arg, err)) { + memprintf(err, "SPOE agent '%s': failed to register variable %s.%s (%s)", + curagent->id, curagent->var_pfx, curagent->var_t_process, *err); + goto error; + } + } + + if (curagent->var_t_total) { + struct arg arg; + + trash.data = snprintf(trash.area, trash.size, "txn.%s.%s", + curagent->var_pfx, curagent->var_t_total); + + arg.type = ARGT_STR; + arg.data.str.area = trash.area; + arg.data.str.data = trash.data; + arg.data.str.size = 0; /* Set it to 0 to not release it in vars_check_arg() */ + if (!vars_check_arg(&arg, err)) { + memprintf(err, "SPOE agent '%s': failed to register variable %s.%s (%s)", + curagent->id, curagent->var_pfx, curagent->var_t_total, *err); + goto error; + } + } + + if (LIST_ISEMPTY(&curmphs) && LIST_ISEMPTY(&curgphs)) { + ha_warning("Proxy '%s': No message/group used by SPOE agent '%s' declared at %s:%d.\n", + px->id, curagent->id, curagent->conf.file, curagent->conf.line); + goto finish; + } + + /* Replace placeholders by the corresponding messages for the SPOE + * agent */ + list_for_each_entry(ph, &curmphs, list) { + list_for_each_entry(msg, &curmsgs, list) { + struct spoe_arg *arg; + unsigned int where; + + if (strcmp(msg->id, ph->id) == 0) { + if ((px->cap & (PR_CAP_FE|PR_CAP_BE)) == (PR_CAP_FE|PR_CAP_BE)) { + if (msg->event == SPOE_EV_ON_TCP_REQ_BE) + msg->event = SPOE_EV_ON_TCP_REQ_FE; + if (msg->event == SPOE_EV_ON_HTTP_REQ_BE) + msg->event = SPOE_EV_ON_HTTP_REQ_FE; + } + if (!(px->cap & PR_CAP_FE) && (msg->event == SPOE_EV_ON_CLIENT_SESS || + msg->event == SPOE_EV_ON_TCP_REQ_FE || + msg->event == SPOE_EV_ON_HTTP_REQ_FE)) { + ha_warning("Proxy '%s': frontend event used on a backend proxy at %s:%d.\n", + px->id, msg->conf.file, msg->conf.line); + goto next_mph; + } + if (msg->event == SPOE_EV_NONE) { + ha_warning("Proxy '%s': Ignore SPOE message '%s' without event at %s:%d.\n", + px->id, msg->id, msg->conf.file, msg->conf.line); + goto next_mph; + } + + where = 0; + switch (msg->event) { + case SPOE_EV_ON_CLIENT_SESS: + where |= SMP_VAL_FE_CON_ACC; + break; + + case SPOE_EV_ON_TCP_REQ_FE: + where |= SMP_VAL_FE_REQ_CNT; + break; + + case SPOE_EV_ON_HTTP_REQ_FE: + where |= SMP_VAL_FE_HRQ_HDR; + break; + + case SPOE_EV_ON_TCP_REQ_BE: + if (px->cap & PR_CAP_FE) + where |= SMP_VAL_FE_REQ_CNT; + if (px->cap & PR_CAP_BE) + where |= SMP_VAL_BE_REQ_CNT; + break; + + case SPOE_EV_ON_HTTP_REQ_BE: + if (px->cap & PR_CAP_FE) + where |= SMP_VAL_FE_HRQ_HDR; + if (px->cap & PR_CAP_BE) + where |= SMP_VAL_BE_HRQ_HDR; + break; + + case SPOE_EV_ON_SERVER_SESS: + where |= SMP_VAL_BE_SRV_CON; + break; + + case SPOE_EV_ON_TCP_RSP: + if (px->cap & PR_CAP_FE) + where |= SMP_VAL_FE_RES_CNT; + if (px->cap & PR_CAP_BE) + where |= SMP_VAL_BE_RES_CNT; + break; + + case SPOE_EV_ON_HTTP_RSP: + if (px->cap & PR_CAP_FE) + where |= SMP_VAL_FE_HRS_HDR; + if (px->cap & PR_CAP_BE) + where |= SMP_VAL_BE_HRS_HDR; + break; + + default: + break; + } + + list_for_each_entry(arg, &msg->args, list) { + if (!(arg->expr->fetch->val & where)) { + memprintf(err, "Ignore SPOE message '%s' at %s:%d: " + "some args extract information from '%s', " + "none of which is available here ('%s')", + msg->id, msg->conf.file, msg->conf.line, + sample_ckp_names(arg->expr->fetch->use), + sample_ckp_names(where)); + goto error; + } + } + + msg->agent = curagent; + LIST_APPEND(&curagent->events[msg->event], &msg->by_evt); + goto next_mph; + } + } + memprintf(err, "SPOE agent '%s' tries to use undefined SPOE message '%s' at %s:%d", + curagent->id, ph->id, curagent->conf.file, curagent->conf.line); + goto error; + next_mph: + continue; + } + + /* Replace placeholders by the corresponding groups for the SPOE + * agent */ + list_for_each_entry(ph, &curgphs, list) { + list_for_each_entry_safe(grp, grpback, &curgrps, list) { + if (strcmp(grp->id, ph->id) == 0) { + grp->agent = curagent; + LIST_DELETE(&grp->list); + LIST_APPEND(&curagent->groups, &grp->list); + goto next_aph; + } + } + memprintf(err, "SPOE agent '%s' tries to use undefined SPOE group '%s' at %s:%d", + curagent->id, ph->id, curagent->conf.file, curagent->conf.line); + goto error; + next_aph: + continue; + } + + /* Replace placeholders by the corresponding message for each SPOE + * group of the SPOE agent */ + list_for_each_entry(grp, &curagent->groups, list) { + list_for_each_entry_safe(ph, phback, &grp->phs, list) { + list_for_each_entry(msg, &curmsgs, list) { + if (strcmp(msg->id, ph->id) == 0) { + if (msg->group != NULL) { + memprintf(err, "SPOE message '%s' already belongs to " + "the SPOE group '%s' declared at %s:%d", + msg->id, msg->group->id, + msg->group->conf.file, + msg->group->conf.line); + goto error; + } + + /* Scopes for arguments are not checked for now. We will check + * them only if a rule uses the corresponding SPOE group. */ + msg->agent = curagent; + msg->group = grp; + LIST_DELETE(&ph->list); + LIST_APPEND(&grp->messages, &msg->by_grp); + goto next_mph_grp; + } + } + memprintf(err, "SPOE group '%s' tries to use undefined SPOE message '%s' at %s:%d", + grp->id, ph->id, curagent->conf.file, curagent->conf.line); + goto error; + next_mph_grp: + continue; + } + } + + finish: + /* move curmsgs to the agent message list */ + curmsgs.n->p = &curagent->messages; + curmsgs.p->n = &curagent->messages; + curagent->messages = curmsgs; + LIST_INIT(&curmsgs); + + conf->id = strdup(engine ? engine : curagent->id); + conf->agent = curagent; + curagent->spoe_conf = conf; + + /* Start agent's proxy initialization here. It will be finished during + * the filter init. 
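+ *
+ * By now every "messages"/"groups" placeholder has been resolved to an
+ * actual spoe-message/spoe-group definition, so the only piece left is
+ * the internal frontend (agent_fe) carrying traffic to the agent.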
*/ + memset(&conf->agent_fe, 0, sizeof(conf->agent_fe)); + init_new_proxy(&conf->agent_fe); + conf->agent_fe.id = conf->agent->id; + conf->agent_fe.parent = conf->agent; + conf->agent_fe.options |= curpxopts; + conf->agent_fe.options2 |= curpxopts2; + + list_for_each_entry_safe(logger, loggerback, &curloggers, list) { + LIST_DELETE(&logger->list); + LIST_APPEND(&conf->agent_fe.loggers, &logger->list); + } + + list_for_each_entry_safe(ph, phback, &curmphs, list) { + LIST_DELETE(&ph->list); + spoe_release_placeholder(ph); + } + list_for_each_entry_safe(ph, phback, &curgphs, list) { + LIST_DELETE(&ph->list); + spoe_release_placeholder(ph); + } + list_for_each_entry_safe(vph, vphback, &curvars, list) { + struct arg arg; + + trash.data = snprintf(trash.area, trash.size, "proc.%s.%s", + curagent->var_pfx, vph->name); + + arg.type = ARGT_STR; + arg.data.str.area = trash.area; + arg.data.str.data = trash.data; + arg.data.str.size = 0; /* Set it to 0 to not release it in vars_check_arg() */ + if (!vars_check_arg(&arg, err)) { + memprintf(err, "SPOE agent '%s': failed to register variable %s.%s (%s)", + curagent->id, curagent->var_pfx, vph->name, *err); + goto error; + } + + LIST_DELETE(&vph->list); + free(vph->name); + free(vph); + } + list_for_each_entry_safe(grp, grpback, &curgrps, list) { + LIST_DELETE(&grp->list); + spoe_release_group(grp); + } + *cur_arg = pos; + fconf->id = spoe_filter_id; + fconf->ops = &spoe_ops; + fconf->conf = conf; + return 0; + + error: + spoe_release_agent(curagent); + list_for_each_entry_safe(ph, phback, &curmphs, list) { + LIST_DELETE(&ph->list); + spoe_release_placeholder(ph); + } + list_for_each_entry_safe(ph, phback, &curgphs, list) { + LIST_DELETE(&ph->list); + spoe_release_placeholder(ph); + } + list_for_each_entry_safe(vph, vphback, &curvars, list) { + LIST_DELETE(&vph->list); + free(vph->name); + free(vph); + } + list_for_each_entry_safe(grp, grpback, &curgrps, list) { + LIST_DELETE(&grp->list); + spoe_release_group(grp); + } + list_for_each_entry_safe(msg, msgback, &curmsgs, list) { + LIST_DELETE(&msg->list); + spoe_release_message(msg); + } + list_for_each_entry_safe(logger, loggerback, &curloggers, list) { + LIST_DELETE(&logger->list); + free(logger); + } + free(conf); + return -1; +} + +/* Send message of a SPOE group. This is the action_ptr callback of a rule + * associated to a "send-spoe-group" action. + * + * It returns ACT_RET_CONT if processing is finished (with error or not), it returns + * ACT_RET_YIELD if the action is in progress. 
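+ *
+ * (illustrative only: a rule that ends up calling this callback; the
+ * engine and group names are invented)
+ *
+ *     http-request send-spoe-group my-spoe-engine my-group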
*/ +static enum act_return +spoe_send_group(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct filter *filter; + struct spoe_agent *agent = NULL; + struct spoe_group *group = NULL; + struct spoe_context *ctx = NULL; + int ret, dir; + + list_for_each_entry(filter, &s->strm_flt.filters, list) { + if (filter->config == rule->arg.act.p[0]) { + agent = rule->arg.act.p[2]; + group = rule->arg.act.p[3]; + ctx = filter->ctx; + break; + } + } + if (agent == NULL || group == NULL || ctx == NULL) + return ACT_RET_CONT; + if (ctx->state == SPOE_CTX_ST_NONE) + return ACT_RET_CONT; + + switch (rule->from) { + case ACT_F_TCP_REQ_SES: dir = SMP_OPT_DIR_REQ; break; + case ACT_F_TCP_REQ_CNT: dir = SMP_OPT_DIR_REQ; break; + case ACT_F_TCP_RES_CNT: dir = SMP_OPT_DIR_RES; break; + case ACT_F_HTTP_REQ: dir = SMP_OPT_DIR_REQ; break; + case ACT_F_HTTP_RES: dir = SMP_OPT_DIR_RES; break; + default: + SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p" + " - internal error while executing spoe-send-group\n", + (int)date.tv_sec, (int)date.tv_usec, agent->id, + __FUNCTION__, s); + send_log(px, LOG_ERR, "SPOE: [%s] internal error while executing spoe-send-group\n", + agent->id); + return ACT_RET_CONT; + } + + ret = spoe_process_group(s, ctx, group, dir); + if (ret == 1) + return ACT_RET_CONT; + else if (ret == 0) { + if (flags & ACT_OPT_FINAL) { + SPOE_PRINTF(stderr, "%d.%06d [SPOE/%-15s] %s: stream=%p" + " - failed to process group '%s': interrupted by caller\n", + (int)date.tv_sec, (int)date.tv_usec, + agent->id, __FUNCTION__, s, group->id); + ctx->status_code = SPOE_CTX_ERR_INTERRUPT; + spoe_stop_processing(agent, ctx); + spoe_handle_processing_error(s, agent, ctx, dir); + return ACT_RET_CONT; + } + return ACT_RET_YIELD; + } + else + return ACT_RET_CONT; +} + +/* Check a "send-spoe-group" action. Here, we'll try to find the real SPOE + * group associated to <rule>. The format of a rule using the 'send-spoe-group' + * action should be: + * + * (http|tcp)-(request|response) send-spoe-group <engine-id> <group-id> + * + * So, we'll loop on each configured SPOE filter for the proxy <px> to find the + * SPOE engine matching <engine-id>. And then, we'll try to find the right group + * matching <group-id>. Finally, we'll check all messages referenced by the SPOE + * group. + * + * The function returns 1 on success, otherwise it returns 0 and err is + * filled. 
 + */ +static int +check_send_spoe_group(struct act_rule *rule, struct proxy *px, char **err) +{ + struct flt_conf *fconf; + struct spoe_config *conf; + struct spoe_agent *agent = NULL; + struct spoe_group *group; + struct spoe_message *msg; + char *engine_id = rule->arg.act.p[0]; + char *group_id = rule->arg.act.p[1]; + unsigned int where = 0; + + switch (rule->from) { + case ACT_F_TCP_REQ_SES: where = SMP_VAL_FE_SES_ACC; break; + case ACT_F_TCP_REQ_CNT: where = SMP_VAL_FE_REQ_CNT; break; + case ACT_F_TCP_RES_CNT: where = SMP_VAL_BE_RES_CNT; break; + case ACT_F_HTTP_REQ: where = SMP_VAL_FE_HRQ_HDR; break; + case ACT_F_HTTP_RES: where = SMP_VAL_BE_HRS_HDR; break; + default: + memprintf(err, + "internal error, unexpected rule->from=%d, please report this bug!", + rule->from); + goto error; + } + + /* Try to find the SPOE engine by checking all SPOE filters for proxy + * <px> */ + list_for_each_entry(fconf, &px->filter_configs, list) { + conf = fconf->conf; + + /* This is not an SPOE filter */ + if (fconf->id != spoe_filter_id) + continue; + + /* This is the right engine */ + if (strcmp(conf->id, engine_id) == 0) { + agent = conf->agent; + break; + } + } + if (agent == NULL) { + memprintf(err, "unable to find SPOE engine '%s' used by the send-spoe-group '%s'", + engine_id, group_id); + goto error; + } + + /* Try to find the right group */ + list_for_each_entry(group, &agent->groups, list) { + /* This is the right group */ + if (strcmp(group->id, group_id) == 0) + break; + } + if (&group->list == &agent->groups) { + memprintf(err, "unable to find SPOE group '%s' in SPOE engine '%s' configuration", + group_id, engine_id); + goto error; + } + + /* Ok, we found the group, we need to check messages and their + * arguments */ + list_for_each_entry(msg, &group->messages, by_grp) { + struct spoe_arg *arg; + + list_for_each_entry(arg, &msg->args, list) { + if (!(arg->expr->fetch->val & where)) { + memprintf(err, "Invalid SPOE message '%s' used by SPOE group '%s' at %s:%d: " + "some args extract information from '%s', " + "none of which is available here ('%s')", + msg->id, group->id, msg->conf.file, msg->conf.line, + sample_ckp_names(arg->expr->fetch->use), + sample_ckp_names(where)); + goto error; + } + } + } + + free(engine_id); + free(group_id); + rule->arg.act.p[0] = fconf; /* Associate filter config with the rule */ + rule->arg.act.p[1] = conf; /* Associate SPOE config with the rule */ + rule->arg.act.p[2] = agent; /* Associate SPOE agent with the rule */ + rule->arg.act.p[3] = group; /* Associate SPOE group with the rule */ + return 1; + + error: + free(engine_id); + free(group_id); + return 0; +} + +/* Parse 'send-spoe-group' action following the format: + * + * ... send-spoe-group <engine-id> <group-id> + * + * It returns ACT_RET_PRS_ERR if it fails and <err> is filled with an error + * message. Otherwise, it returns ACT_RET_PRS_OK and the parsed engine and group + * ids are saved and used later, when the rule is checked. 
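+ *
+ * At check time (see check_send_spoe_group() above), both strings are
+ * released and the argument slots are repurposed: p[0] then points to
+ * the filter config, p[1] to the SPOE config, p[2] to the agent and
+ * p[3] to the group. An optional 'if'/'unless' condition may follow
+ * the two ids; any other trailing argument is rejected.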
+ */ +static enum act_parse_ret +parse_send_spoe_group(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + if (!*args[*orig_arg] || !*args[*orig_arg+1] || + (*args[*orig_arg+2] && strcmp(args[*orig_arg+2], "if") != 0 && strcmp(args[*orig_arg+2], "unless") != 0)) { + memprintf(err, "expects 2 arguments: <engine-id> <group-id>"); + return ACT_RET_PRS_ERR; + } + rule->arg.act.p[0] = strdup(args[*orig_arg]); /* Copy the SPOE engine id */ + rule->arg.act.p[1] = strdup(args[*orig_arg+1]); /* Cope the SPOE group id */ + + (*orig_arg) += 2; + + rule->action = ACT_CUSTOM; + rule->action_ptr = spoe_send_group; + rule->check_ptr = check_send_spoe_group; + return ACT_RET_PRS_OK; +} + + +/* Declare the filter parser for "spoe" keyword */ +static struct flt_kw_list flt_kws = { "SPOE", { }, { + { "spoe", parse_spoe_flt, NULL }, + { NULL, NULL, NULL }, + } +}; + +INITCALL1(STG_REGISTER, flt_register_keywords, &flt_kws); + +/* Delcate the action parser for "spoe-action" keyword */ +static struct action_kw_list tcp_req_action_kws = { { }, { + { "send-spoe-group", parse_send_spoe_group }, + { /* END */ }, + } +}; + +INITCALL1(STG_REGISTER, tcp_req_cont_keywords_register, &tcp_req_action_kws); + +static struct action_kw_list tcp_res_action_kws = { { }, { + { "send-spoe-group", parse_send_spoe_group }, + { /* END */ }, + } +}; + +INITCALL1(STG_REGISTER, tcp_res_cont_keywords_register, &tcp_res_action_kws); + +static struct action_kw_list http_req_action_kws = { { }, { + { "send-spoe-group", parse_send_spoe_group }, + { /* END */ }, + } +}; + +INITCALL1(STG_REGISTER, http_req_keywords_register, &http_req_action_kws); + +static struct action_kw_list http_res_action_kws = { { }, { + { "send-spoe-group", parse_send_spoe_group }, + { /* END */ }, + } +}; + +INITCALL1(STG_REGISTER, http_res_keywords_register, &http_res_action_kws); diff --git a/src/flt_trace.c b/src/flt_trace.c new file mode 100644 index 0000000..bbadfe2 --- /dev/null +++ b/src/flt_trace.c @@ -0,0 +1,675 @@ +/* + * Stream filters related variables and functions. + * + * Copyright (C) 2015 Qualys Inc., Christopher Faulet <cfaulet@qualys.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <ctype.h> + +#include <haproxy/api.h> +#include <haproxy/channel-t.h> +#include <haproxy/errors.h> +#include <haproxy/filters.h> +#include <haproxy/global.h> +#include <haproxy/http_ana-t.h> +#include <haproxy/http_htx.h> +#include <haproxy/htx.h> +#include <haproxy/proxy-t.h> +#include <haproxy/stream.h> +#include <haproxy/time.h> +#include <haproxy/tools.h> + +const char *trace_flt_id = "trace filter"; + +struct flt_ops trace_ops; + +#define TRACE_F_QUIET 0x00000001 +#define TRACE_F_RAND_FWD 0x00000002 +#define TRACE_F_HEXDUMP 0x00000004 + +struct trace_config { + struct proxy *proxy; + char *name; + unsigned int flags; +}; + +#define FLT_TRACE(conf, fmt, ...) \ + do { \ + if (!(conf->flags & TRACE_F_QUIET)) \ + fprintf(stderr, "%d.%06d [%-20s] " fmt "\n", \ + (int)date.tv_sec, (int)date.tv_usec, (conf)->name,\ + ##__VA_ARGS__); \ + } while (0) + +#define FLT_STRM_TRACE(conf, strm, fmt, ...) \ + do { \ + if (!(conf->flags & TRACE_F_QUIET)) \ + fprintf(stderr, "%d.%06d [%-20s] [strm %p(%x) 0x%08x 0x%08x] " fmt "\n", \ + (int)date.tv_sec, (int)date.tv_usec, (conf)->name, \ + strm, (strm ? 
((struct stream *)strm)->uniq_id : ~0U), \ + (strm ? strm->req.analysers : 0), (strm ? strm->res.analysers : 0), \ + ##__VA_ARGS__); \ + } while (0) + + +static const char * +channel_label(const struct channel *chn) +{ + return (chn->flags & CF_ISRESP) ? "RESPONSE" : "REQUEST"; +} + +static const char * +proxy_mode(const struct stream *s) +{ + struct proxy *px = (s->flags & SF_BE_ASSIGNED ? s->be : strm_fe(s)); + + return ((px->mode == PR_MODE_HTTP) ? "HTTP" : "TCP"); +} + +static const char * +stream_pos(const struct stream *s) +{ + return (s->flags & SF_BE_ASSIGNED) ? "backend" : "frontend"; +} + +static const char * +filter_type(const struct filter *f) +{ + return (f->flags & FLT_FL_IS_BACKEND_FILTER) ? "backend" : "frontend"; +} + +static void +trace_hexdump(struct ist ist) +{ + int i, j, padding; + + padding = ((ist.len % 16) ? (16 - ist.len % 16) : 0); + for (i = 0; i < ist.len + padding; i++) { + if (!(i % 16)) + fprintf(stderr, "\t0x%06x: ", i); + else if (!(i % 8)) + fprintf(stderr, " "); + + if (i < ist.len) + fprintf(stderr, "%02x ", (unsigned char)*(ist.ptr+i)); + else + fprintf(stderr, " "); + + /* print ASCII dump */ + if (i % 16 == 15) { + fprintf(stderr, " |"); + for(j = i - 15; j <= i && j < ist.len; j++) + fprintf(stderr, "%c", (isprint((unsigned char)*(ist.ptr+j)) ? *(ist.ptr+j) : '.')); + fprintf(stderr, "|\n"); + } + } +} + +static void +trace_raw_hexdump(struct buffer *buf, unsigned int offset, unsigned int len) +{ + unsigned char p[len]; + int block1, block2; + + block1 = len; + if (block1 > b_contig_data(buf, offset)) + block1 = b_contig_data(buf, offset); + block2 = len - block1; + + memcpy(p, b_peek(buf, offset), block1); + memcpy(p+block1, b_orig(buf), block2); + trace_hexdump(ist2(p, len)); +} + +static void +trace_htx_hexdump(struct htx *htx, unsigned int offset, unsigned int len) +{ + struct htx_blk *blk; + + for (blk = htx_get_first_blk(htx); blk && len; blk = htx_get_next_blk(htx, blk)) { + enum htx_blk_type type = htx_get_blk_type(blk); + uint32_t sz = htx_get_blksz(blk); + struct ist v; + + if (offset >= sz) { + offset -= sz; + continue; + } + + v = htx_get_blk_value(htx, blk); + v = istadv(v, offset); + offset = 0; + + v = isttrim(v, len); + len -= v.len; + if (type == HTX_BLK_DATA) + trace_hexdump(v); + } +} + +static unsigned int +trace_get_htx_datalen(struct htx *htx, unsigned int offset, unsigned int len) +{ + struct htx_blk *blk; + struct htx_ret htxret = htx_find_offset(htx, offset); + uint32_t data = 0; + + blk = htxret.blk; + if (blk && htxret.ret && htx_get_blk_type(blk) == HTX_BLK_DATA) { + data += htxret.ret; + blk = htx_get_next_blk(htx, blk); + } + while (blk) { + if (htx_get_blk_type(blk) == HTX_BLK_UNUSED) + goto next; + else if (htx_get_blk_type(blk) != HTX_BLK_DATA) + break; + data += htx_get_blksz(blk); + next: + blk = htx_get_next_blk(htx, blk); + } + return data; +} + +/*************************************************************************** + * Hooks that manage the filter lifecycle (init/check/deinit) + **************************************************************************/ +/* Initialize the filter. Returns -1 on error, else 0. 
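+ *
+ * (illustrative only: a proxy section instantiating this filter; the
+ * proxy and trace names are invented, the options are those handled by
+ * parse_trace_flt() at the bottom of this file)
+ *
+ *     frontend fe_main
+ *         filter trace name FE-TRACE hexdump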
*/ +static int +trace_init(struct proxy *px, struct flt_conf *fconf) +{ + struct trace_config *conf = fconf->conf; + + if (conf->name) + memprintf(&conf->name, "%s/%s", conf->name, px->id); + else + memprintf(&conf->name, "TRACE/%s", px->id); + + fconf->flags |= FLT_CFG_FL_HTX; + fconf->conf = conf; + + FLT_TRACE(conf, "filter initialized [quiet=%s - fwd random=%s - hexdump=%s]", + ((conf->flags & TRACE_F_QUIET) ? "true" : "false"), + ((conf->flags & TRACE_F_RAND_FWD) ? "true" : "false"), + ((conf->flags & TRACE_F_HEXDUMP) ? "true" : "false")); + return 0; +} + +/* Free resources allocated by the trace filter. */ +static void +trace_deinit(struct proxy *px, struct flt_conf *fconf) +{ + struct trace_config *conf = fconf->conf; + + if (conf) { + FLT_TRACE(conf, "filter deinitialized"); + free(conf->name); + free(conf); + } + fconf->conf = NULL; +} + +/* Check configuration of a trace filter for a specified proxy. + * Return 1 on error, else 0. */ +static int +trace_check(struct proxy *px, struct flt_conf *fconf) +{ + return 0; +} + +/* Initialize the filter for each thread. Return -1 on error, else 0. */ +static int +trace_init_per_thread(struct proxy *px, struct flt_conf *fconf) +{ + struct trace_config *conf = fconf->conf; + + FLT_TRACE(conf, "filter initialized for thread tid %u", tid); + return 0; +} + +/* Free resources allocate by the trace filter for each thread. */ +static void +trace_deinit_per_thread(struct proxy *px, struct flt_conf *fconf) +{ + struct trace_config *conf = fconf->conf; + + if (conf) + FLT_TRACE(conf, "filter deinitialized for thread tid %u", tid); +} + +/************************************************************************** + * Hooks to handle start/stop of streams + *************************************************************************/ +/* Called when a filter instance is created and attach to a stream */ +static int +trace_attach(struct stream *s, struct filter *filter) +{ + struct trace_config *conf = FLT_CONF(filter); + + FLT_STRM_TRACE(conf, s, "%-25s: filter-type=%s", + __FUNCTION__, filter_type(filter)); + + return 1; +} + +/* Called when a filter instance is detach from a stream, just before its + * destruction */ +static void +trace_detach(struct stream *s, struct filter *filter) +{ + struct trace_config *conf = FLT_CONF(filter); + + FLT_STRM_TRACE(conf, s, "%-25s: filter-type=%s", + __FUNCTION__, filter_type(filter)); +} + +/* Called when a stream is created */ +static int +trace_stream_start(struct stream *s, struct filter *filter) +{ + struct trace_config *conf = FLT_CONF(filter); + + FLT_STRM_TRACE(conf, s, "%-25s", + __FUNCTION__); + return 0; +} + + +/* Called when a backend is set for a stream */ +static int +trace_stream_set_backend(struct stream *s, struct filter *filter, + struct proxy *be) +{ + struct trace_config *conf = FLT_CONF(filter); + + FLT_STRM_TRACE(conf, s, "%-25s: backend=%s", + __FUNCTION__, be->id); + return 0; +} + +/* Called when a stream is destroyed */ +static void +trace_stream_stop(struct stream *s, struct filter *filter) +{ + struct trace_config *conf = FLT_CONF(filter); + + FLT_STRM_TRACE(conf, s, "%-25s", + __FUNCTION__); +} + +/* Called when the stream is woken up because of an expired timer */ +static void +trace_check_timeouts(struct stream *s, struct filter *filter) +{ + struct trace_config *conf = FLT_CONF(filter); + + FLT_STRM_TRACE(conf, s, "%-25s", + __FUNCTION__); +} + +/************************************************************************** + * Hooks to handle channels activity + 
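+ * (channel_start_analyze registers the data filter and subscribes to all
+ * request/response analyzers; channel_pre_analyze/channel_post_analyze
+ * then fire around each AN_* analyzer and channel_end_analyze closes the
+ * cycle)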
*************************************************************************/ +/* Called when analyze starts for a given channel */ +static int +trace_chn_start_analyze(struct stream *s, struct filter *filter, + struct channel *chn) +{ + struct trace_config *conf = FLT_CONF(filter); + + FLT_STRM_TRACE(conf, s, "%-25s: channel=%-10s - mode=%-5s (%s)", + __FUNCTION__, + channel_label(chn), proxy_mode(s), stream_pos(s)); + filter->pre_analyzers |= (AN_REQ_ALL | AN_RES_ALL); + filter->post_analyzers |= (AN_REQ_ALL | AN_RES_ALL); + register_data_filter(s, chn, filter); + return 1; +} + +/* Called before a processing happens on a given channel */ +static int +trace_chn_analyze(struct stream *s, struct filter *filter, + struct channel *chn, unsigned an_bit) +{ + struct trace_config *conf = FLT_CONF(filter); + char *ana; + + switch (an_bit) { + case AN_REQ_INSPECT_FE: + ana = "AN_REQ_INSPECT_FE"; + break; + case AN_REQ_WAIT_HTTP: + ana = "AN_REQ_WAIT_HTTP"; + break; + case AN_REQ_HTTP_BODY: + ana = "AN_REQ_HTTP_BODY"; + break; + case AN_REQ_HTTP_PROCESS_FE: + ana = "AN_REQ_HTTP_PROCESS_FE"; + break; + case AN_REQ_SWITCHING_RULES: + ana = "AN_REQ_SWITCHING_RULES"; + break; + case AN_REQ_INSPECT_BE: + ana = "AN_REQ_INSPECT_BE"; + break; + case AN_REQ_HTTP_PROCESS_BE: + ana = "AN_REQ_HTTP_PROCESS_BE"; + break; + case AN_REQ_SRV_RULES: + ana = "AN_REQ_SRV_RULES"; + break; + case AN_REQ_HTTP_INNER: + ana = "AN_REQ_HTTP_INNER"; + break; + case AN_REQ_HTTP_TARPIT: + ana = "AN_REQ_HTTP_TARPIT"; + break; + case AN_REQ_STICKING_RULES: + ana = "AN_REQ_STICKING_RULES"; + break; + case AN_REQ_PRST_RDP_COOKIE: + ana = "AN_REQ_PRST_RDP_COOKIE"; + break; + case AN_REQ_HTTP_XFER_BODY: + ana = "AN_REQ_HTTP_XFER_BODY"; + break; + case AN_RES_INSPECT: + ana = "AN_RES_INSPECT"; + break; + case AN_RES_WAIT_HTTP: + ana = "AN_RES_WAIT_HTTP"; + break; + case AN_RES_HTTP_PROCESS_FE: // AN_RES_HTTP_PROCESS_BE + ana = "AN_RES_HTTP_PROCESS_FE/BE"; + break; + case AN_RES_STORE_RULES: + ana = "AN_RES_STORE_RULES"; + break; + case AN_RES_HTTP_XFER_BODY: + ana = "AN_RES_HTTP_XFER_BODY"; + break; + default: + ana = "unknown"; + } + + FLT_STRM_TRACE(conf, s, "%-25s: channel=%-10s - mode=%-5s (%s) - " + "analyzer=%s - step=%s", + __FUNCTION__, + channel_label(chn), proxy_mode(s), stream_pos(s), + ana, ((chn->analysers & an_bit) ? 
"PRE" : "POST")); + return 1; +} + +/* Called when analyze ends for a given channel */ +static int +trace_chn_end_analyze(struct stream *s, struct filter *filter, + struct channel *chn) +{ + struct trace_config *conf = FLT_CONF(filter); + + FLT_STRM_TRACE(conf, s, "%-25s: channel=%-10s - mode=%-5s (%s)", + __FUNCTION__, + channel_label(chn), proxy_mode(s), stream_pos(s)); + return 1; +} + +/************************************************************************** + * Hooks to filter HTTP messages + *************************************************************************/ +static int +trace_http_headers(struct stream *s, struct filter *filter, + struct http_msg *msg) +{ + struct trace_config *conf = FLT_CONF(filter); + struct htx *htx = htxbuf(&msg->chn->buf); + struct htx_sl *sl = http_get_stline(htx); + int32_t pos; + + FLT_STRM_TRACE(conf, s, "%-25s: channel=%-10s - mode=%-5s (%s)\t%.*s %.*s %.*s", + __FUNCTION__, + channel_label(msg->chn), proxy_mode(s), stream_pos(s), + HTX_SL_P1_LEN(sl), HTX_SL_P1_PTR(sl), + HTX_SL_P2_LEN(sl), HTX_SL_P2_PTR(sl), + HTX_SL_P3_LEN(sl), HTX_SL_P3_PTR(sl)); + + for (pos = htx_get_first(htx); pos != -1; pos = htx_get_next(htx, pos)) { + struct htx_blk *blk = htx_get_blk(htx, pos); + enum htx_blk_type type = htx_get_blk_type(blk); + struct ist n, v; + + if (type == HTX_BLK_EOH) + break; + if (type != HTX_BLK_HDR) + continue; + + n = htx_get_blk_name(htx, blk); + v = htx_get_blk_value(htx, blk); + FLT_STRM_TRACE(conf, s, "\t%.*s: %.*s", + (int)n.len, n.ptr, (int)v.len, v.ptr); + } + return 1; +} + +static int +trace_http_payload(struct stream *s, struct filter *filter, struct http_msg *msg, + unsigned int offset, unsigned int len) +{ + struct trace_config *conf = FLT_CONF(filter); + int ret = len; + + if (ret && (conf->flags & TRACE_F_RAND_FWD)) { + unsigned int data = trace_get_htx_datalen(htxbuf(&msg->chn->buf), offset, len); + + if (data) { + ret = ha_random() % (ret+1); + if (!ret || ret >= data) + ret = len; + } + } + + FLT_STRM_TRACE(conf, s, "%-25s: channel=%-10s - mode=%-5s (%s) - " + "offset=%u - len=%u - forward=%d", + __FUNCTION__, + channel_label(msg->chn), proxy_mode(s), stream_pos(s), + offset, len, ret); + + if (conf->flags & TRACE_F_HEXDUMP) + trace_htx_hexdump(htxbuf(&msg->chn->buf), offset, ret); + + if (ret != len) + task_wakeup(s->task, TASK_WOKEN_MSG); + return ret; +} + +static int +trace_http_end(struct stream *s, struct filter *filter, + struct http_msg *msg) +{ + struct trace_config *conf = FLT_CONF(filter); + + FLT_STRM_TRACE(conf, s, "%-25s: channel=%-10s - mode=%-5s (%s)", + __FUNCTION__, + channel_label(msg->chn), proxy_mode(s), stream_pos(s)); + return 1; +} + +static void +trace_http_reset(struct stream *s, struct filter *filter, + struct http_msg *msg) +{ + struct trace_config *conf = FLT_CONF(filter); + + FLT_STRM_TRACE(conf, s, "%-25s: channel=%-10s - mode=%-5s (%s)", + __FUNCTION__, + channel_label(msg->chn), proxy_mode(s), stream_pos(s)); +} + +static void +trace_http_reply(struct stream *s, struct filter *filter, short status, + const struct buffer *msg) +{ + struct trace_config *conf = FLT_CONF(filter); + + FLT_STRM_TRACE(conf, s, "%-25s: channel=%-10s - mode=%-5s (%s)", + __FUNCTION__, "-", proxy_mode(s), stream_pos(s)); +} + +/************************************************************************** + * Hooks to filter TCP data + *************************************************************************/ +static int +trace_tcp_payload(struct stream *s, struct filter *filter, struct channel *chn, + unsigned int offset, 
unsigned int len) +{ + struct trace_config *conf = FLT_CONF(filter); + int ret = len; + + if (s->flags & SF_HTX) { + if (ret && (conf->flags & TRACE_F_RAND_FWD)) { + unsigned int data = trace_get_htx_datalen(htxbuf(&chn->buf), offset, len); + + if (data) { + ret = ha_random() % (ret+1); + if (!ret || ret >= data) + ret = len; + } + } + + FLT_STRM_TRACE(conf, s, "%-25s: channel=%-10s - mode=%-5s (%s) - " + "offset=%u - len=%u - forward=%d", + __FUNCTION__, + channel_label(chn), proxy_mode(s), stream_pos(s), + offset, len, ret); + + if (conf->flags & TRACE_F_HEXDUMP) + trace_htx_hexdump(htxbuf(&chn->buf), offset, ret); + } + else { + + if (ret && (conf->flags & TRACE_F_RAND_FWD)) + ret = ha_random() % (ret+1); + + FLT_STRM_TRACE(conf, s, "%-25s: channel=%-10s - mode=%-5s (%s) - " + "offset=%u - len=%u - forward=%d", + __FUNCTION__, + channel_label(chn), proxy_mode(s), stream_pos(s), + offset, len, ret); + + if (conf->flags & TRACE_F_HEXDUMP) + trace_raw_hexdump(&chn->buf, offset, ret); + } + + if (ret != len) + task_wakeup(s->task, TASK_WOKEN_MSG); + return ret; +} +/******************************************************************** + * Functions that manage the filter initialization + ********************************************************************/ +struct flt_ops trace_ops = { + /* Manage trace filter, called for each filter declaration */ + .init = trace_init, + .deinit = trace_deinit, + .check = trace_check, + .init_per_thread = trace_init_per_thread, + .deinit_per_thread = trace_deinit_per_thread, + + /* Handle start/stop of streams */ + .attach = trace_attach, + .detach = trace_detach, + .stream_start = trace_stream_start, + .stream_set_backend = trace_stream_set_backend, + .stream_stop = trace_stream_stop, + .check_timeouts = trace_check_timeouts, + + /* Handle channels activity */ + .channel_start_analyze = trace_chn_start_analyze, + .channel_pre_analyze = trace_chn_analyze, + .channel_post_analyze = trace_chn_analyze, + .channel_end_analyze = trace_chn_end_analyze, + + /* Filter HTTP requests and responses */ + .http_headers = trace_http_headers, + .http_payload = trace_http_payload, + .http_end = trace_http_end, + .http_reset = trace_http_reset, + .http_reply = trace_http_reply, + + /* Filter TCP data */ + .tcp_payload = trace_tcp_payload, +}; + +/* Return -1 on error, else 0 */ +static int +parse_trace_flt(char **args, int *cur_arg, struct proxy *px, + struct flt_conf *fconf, char **err, void *private) +{ + struct trace_config *conf; + int pos = *cur_arg; + + conf = calloc(1, sizeof(*conf)); + if (!conf) { + memprintf(err, "%s: out of memory", args[*cur_arg]); + return -1; + } + conf->proxy = px; + conf->flags = 0; + if (strcmp(args[pos], "trace") == 0) { + pos++; + + while (*args[pos]) { + if (strcmp(args[pos], "name") == 0) { + if (!*args[pos + 1]) { + memprintf(err, "'%s' : '%s' option without value", + args[*cur_arg], args[pos]); + goto error; + } + conf->name = strdup(args[pos + 1]); + if (!conf->name) { + memprintf(err, "%s: out of memory", args[*cur_arg]); + goto error; + } + pos++; + } + else if (strcmp(args[pos], "quiet") == 0) + conf->flags |= TRACE_F_QUIET; + else if (strcmp(args[pos], "random-parsing") == 0) + ; // ignore + else if (strcmp(args[pos], "random-forwarding") == 0) + conf->flags |= TRACE_F_RAND_FWD; + else if (strcmp(args[pos], "hexdump") == 0) + conf->flags |= TRACE_F_HEXDUMP; + else + break; + pos++; + } + *cur_arg = pos; + fconf->id = trace_flt_id; + fconf->ops = &trace_ops; + } + + fconf->conf = conf; + return 0; + + error: + if (conf->name) + 
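+ /* note: free(NULL) is a no-op, so this check is purely defensive */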
free(conf->name); + free(conf); + return -1; +} + +/* Declare the filter parser for "trace" keyword */ +static struct flt_kw_list flt_kws = { "TRACE", { }, { + { "trace", parse_trace_flt, NULL }, + { NULL, NULL, NULL }, + } +}; + +INITCALL1(STG_REGISTER, flt_register_keywords, &flt_kws); diff --git a/src/freq_ctr.c b/src/freq_ctr.c new file mode 100644 index 0000000..1361333 --- /dev/null +++ b/src/freq_ctr.c @@ -0,0 +1,218 @@ +/* + * Event rate calculation functions. + * + * Copyright 2000-2010 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <haproxy/api.h> +#include <haproxy/freq_ctr.h> +#include <haproxy/tools.h> + +/* Update a frequency counter by <inc> incremental units. It is automatically + * rotated if the period is over. It is important that it correctly initializes + * a null area. This one works on frequency counters which have a period + * different from one second. It relies on the process-wide clock that is + * guaranteed to be monotonic. It's important to avoid forced rotates between + * threads. A faster wrapper (update_freq_ctr_period) should be used instead, + * which uses the thread's local time whenever possible and falls back to this + * one when needed (less than 0.003% of the time). + */ +uint update_freq_ctr_period_slow(struct freq_ctr *ctr, uint period, uint inc) +{ + uint curr_tick; + uint32_t now_ms_tmp; + + /* atomically update the counter if still within the period, even if + * a rotation is in progress (no big deal). + */ + for (;; __ha_cpu_relax()) { + curr_tick = HA_ATOMIC_LOAD(&ctr->curr_tick); + now_ms_tmp = HA_ATOMIC_LOAD(&global_now_ms); + + if (now_ms_tmp - curr_tick < period) + return HA_ATOMIC_ADD_FETCH(&ctr->curr_ctr, inc); + + /* a rotation is needed. While extremely rare, contention may + * happen because it will be triggered on time, and all threads + * see the time change simultaneously. + */ + if (!(curr_tick & 1) && + HA_ATOMIC_CAS(&ctr->curr_tick, &curr_tick, curr_tick | 0x1)) + break; + } + + /* atomically switch the new period into the old one without losing any + * potential concurrent update. We're the only one performing the rotate + * (locked above), others are only adding positive values to curr_ctr. + */ + HA_ATOMIC_STORE(&ctr->prev_ctr, HA_ATOMIC_XCHG(&ctr->curr_ctr, inc)); + curr_tick += period; + if (likely(now_ms_tmp - curr_tick >= period)) { + /* we missed at least two periods */ + HA_ATOMIC_STORE(&ctr->prev_ctr, 0); + curr_tick = now_ms_tmp; + } + + /* release the lock and update the time in case of rotate. */ + HA_ATOMIC_STORE(&ctr->curr_tick, curr_tick & ~1); + return inc; +} + +/* Returns the total number of events over the current + last period, including + * a number of already pending events <pend>. The average frequency will be + * obtained by dividing the output by <period>. This is essentially made to + * ease implementation of higher-level read functions. + * + * As a special case, if pend < 0, it's assumed there are no pending + * events and a flapping correction must be applied at the end. This is used by + * read_freq_ctr_period() to avoid reporting ups and downs on low-frequency + * events when the past value is <= 1. 
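+ *
+ * The total is past * remain + (curr + pend) * period, where <remain> is
+ * the share of the previous period still covered by the sliding window.
+ * For example (values invented): with period=1000ms, past=100, curr=40,
+ * pend=0 and 600ms elapsed in the current period (remain=400), the total
+ * is 100*400 + 40*1000 = 80000, i.e. 80 events/s once divided by the
+ * period.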
+ */ +ullong freq_ctr_total(const struct freq_ctr *ctr, uint period, int pend) +{ + ullong curr, past, old_curr, old_past; + uint tick, old_tick; + int remain; + + tick = HA_ATOMIC_LOAD(&ctr->curr_tick); + curr = HA_ATOMIC_LOAD(&ctr->curr_ctr); + past = HA_ATOMIC_LOAD(&ctr->prev_ctr); + + while (1) { + if (tick & 0x1) // change in progress + goto redo0; + + old_tick = tick; + old_curr = curr; + old_past = past; + + /* now let's load the values a second time and make sure they + * did not change, which will indicate it was a stable reading. + */ + + tick = HA_ATOMIC_LOAD(&ctr->curr_tick); + if (tick & 0x1) // change in progress + goto redo0; + + if (tick != old_tick) + goto redo1; + + curr = HA_ATOMIC_LOAD(&ctr->curr_ctr); + if (curr != old_curr) + goto redo2; + + past = HA_ATOMIC_LOAD(&ctr->prev_ctr); + if (past != old_past) + goto redo3; + + /* all values match between two loads, they're stable, let's + * quit now. + */ + break; + redo0: + tick = HA_ATOMIC_LOAD(&ctr->curr_tick); + redo1: + curr = HA_ATOMIC_LOAD(&ctr->curr_ctr); + redo2: + past = HA_ATOMIC_LOAD(&ctr->prev_ctr); + redo3: + __ha_cpu_relax(); + }; + + remain = tick + period - HA_ATOMIC_LOAD(&global_now_ms); + if (unlikely(remain < 0)) { + /* We're past the first period, check if we can still report a + * part of last period or if we're too far away. + */ + remain += period; + past = (remain >= 0) ? curr : 0; + curr = 0; + } + + if (pend < 0) { + /* enable flapping correction at very low rates */ + pend = 0; + if (!curr && past <= 1) + return past * period; + } + + /* compute the total number of confirmed events over the period */ + return past * remain + (curr + pend) * period; +} + +/* Returns the excess of events (may be negative) over the current period for + * target frequency <freq>. It returns 0 if the counter is in the future or if + * the counter is empty. The result considers the position of the current time + * within the current period. + * + * The caller may safely add new events if result is negative or null. + */ +int freq_ctr_overshoot_period(const struct freq_ctr *ctr, uint period, uint freq) +{ + ullong curr, old_curr; + uint tick, old_tick; + int elapsed; + + tick = HA_ATOMIC_LOAD(&ctr->curr_tick); + curr = HA_ATOMIC_LOAD(&ctr->curr_ctr); + + while (1) { + if (tick & 0x1) // change in progress + goto redo0; + + old_tick = tick; + old_curr = curr; + + /* now let's load the values a second time and make sure they + * did not change, which will indicate it was a stable reading. + */ + + tick = HA_ATOMIC_LOAD(&ctr->curr_tick); + if (tick & 0x1) // change in progress + goto redo0; + + if (tick != old_tick) + goto redo1; + + curr = HA_ATOMIC_LOAD(&ctr->curr_ctr); + if (curr != old_curr) + goto redo2; + + /* all values match between two loads, they're stable, let's + * quit now. 
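+ * (an odd curr_tick means a writer is rotating the counter, and any
+ * value that changed between the two loads restarts the sequence from
+ * the first divergent field: a seqlock-style consistent snapshot
+ * without taking a lock on the read side)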
+ */ + break; + redo0: + tick = HA_ATOMIC_LOAD(&ctr->curr_tick); + redo1: + curr = HA_ATOMIC_LOAD(&ctr->curr_ctr); + redo2: + __ha_cpu_relax(); + }; + + if (!curr && !tick) { + /* The counter is empty, there is no overshoot */ + return 0; + } + + elapsed = HA_ATOMIC_LOAD(&global_now_ms) - tick; + if (unlikely(elapsed < 0 || elapsed > period)) { + /* The counter is in the future or the elapsed time is higher than the period, there is no overshoot */ + return 0; + } + + return curr - div64_32((uint64_t)elapsed * freq, period); +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/frontend.c b/src/frontend.c new file mode 100644 index 0000000..ad2e39e --- /dev/null +++ b/src/frontend.c @@ -0,0 +1,339 @@ +/* + * Frontend variables and functions. + * + * Copyright 2000-2013 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> + +#include <netinet/tcp.h> + +#include <haproxy/acl.h> +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/chunk.h> +#include <haproxy/connection.h> +#include <haproxy/fd.h> +#include <haproxy/frontend.h> +#include <haproxy/global.h> +#include <haproxy/http_ana.h> +#include <haproxy/log.h> +#include <haproxy/proto_tcp.h> +#include <haproxy/proxy.h> +#include <haproxy/sample.h> +#include <haproxy/sc_strm.h> +#include <haproxy/stream.h> +#include <haproxy/task.h> +#include <haproxy/ticks.h> +#include <haproxy/tools.h> + + +/* Finish a stream accept() for a proxy (TCP or HTTP). It returns a negative + * value in case of a critical failure which must cause the listener to be + * disabled, a positive or null value in case of success. + */ +int frontend_accept(struct stream *s) +{ + const struct sockaddr_storage *src, *dst; + struct session *sess = s->sess; + struct connection *conn = objt_conn(sess->origin); + struct listener *l = sess->listener; + struct proxy *fe = sess->fe; + + if ((fe->mode == PR_MODE_TCP || fe->mode == PR_MODE_HTTP) + && (!LIST_ISEMPTY(&fe->loggers))) { + if (likely(!LIST_ISEMPTY(&fe->logformat))) { + /* we have the client ip */ + if (s->logs.logwait & LW_CLIP) + if (!(s->logs.logwait &= ~(LW_CLIP|LW_INIT))) + s->do_log(s); + } + else if (conn) { + src = sc_src(s->scf); + if (!src) + send_log(fe, LOG_INFO, "Connect from unknown source to listener %d (%s/%s)\n", + l->luid, fe->id, (fe->mode == PR_MODE_HTTP) ? "HTTP" : "TCP"); + else { + char pn[INET6_ADDRSTRLEN], sn[INET6_ADDRSTRLEN]; + int port; + + switch (addr_to_str(src, pn, sizeof(pn))) { + case AF_INET: + case AF_INET6: + dst = sc_dst(s->scf); + if (dst) { + addr_to_str(dst, sn, sizeof(sn)); + port = get_host_port(dst); + } else { + strlcpy2(sn, "undetermined address", sizeof(sn)); + port = 0; + } + send_log(fe, LOG_INFO, "Connect from %s:%d to %s:%d (%s/%s)\n", + pn, get_host_port(src), + sn, port, + fe->id, (fe->mode == PR_MODE_HTTP) ? "HTTP" : "TCP"); + break; + case AF_UNIX: + /* UNIX socket, only the destination is known */ + send_log(fe, LOG_INFO, "Connect to unix:%d (%s/%s)\n", + l->luid, + fe->id, (fe->mode == PR_MODE_HTTP) ? 
"HTTP" : "TCP"); + break; + } + } + } + } + + if (unlikely((global.mode & MODE_DEBUG) && conn && + (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)))) { + char pn[INET6_ADDRSTRLEN]; + char alpn[16] = "<none>"; + const char *alpn_str = NULL; + int alpn_len; + + /* try to report the ALPN value when available (also works for NPN) */ + if (conn == sc_conn(s->scf)) { + if (conn_get_alpn(conn, &alpn_str, &alpn_len) && alpn_str) { + int len = MIN(alpn_len, sizeof(alpn) - 1); + memcpy(alpn, alpn_str, len); + alpn[len] = 0; + } + } + + src = sc_src(s->scf); + if (!src) { + chunk_printf(&trash, "%08x:%s.accept(%04x)=%04x from [listener:%d] ALPN=%s\n", + s->uniq_id, fe->id, (unsigned short)l->rx.fd, (unsigned short)conn->handle.fd, + l->luid, alpn); + } + else switch (addr_to_str(src, pn, sizeof(pn))) { + case AF_INET: + case AF_INET6: + chunk_printf(&trash, "%08x:%s.accept(%04x)=%04x from [%s:%d] ALPN=%s\n", + s->uniq_id, fe->id, (unsigned short)l->rx.fd, (unsigned short)conn->handle.fd, + pn, get_host_port(src), alpn); + break; + case AF_UNIX: + /* UNIX socket, only the destination is known */ + chunk_printf(&trash, "%08x:%s.accept(%04x)=%04x from [unix:%d] ALPN=%s\n", + s->uniq_id, fe->id, (unsigned short)l->rx.fd, (unsigned short)conn->handle.fd, + l->luid, alpn); + break; + } + + DISGUISE(write(1, trash.area, trash.data)); + } + + if (fe->mode == PR_MODE_HTTP) + s->scf->flags |= SC_FL_RCV_ONCE; /* one read is usually enough */ + + if (unlikely(fe->nb_req_cap > 0)) { + if ((s->req_cap = pool_zalloc(fe->req_cap_pool)) == NULL) + goto out_return; /* no memory */ + } + + if (unlikely(fe->nb_rsp_cap > 0)) { + if ((s->res_cap = pool_zalloc(fe->rsp_cap_pool)) == NULL) + goto out_free_reqcap; /* no memory */ + } + + if ((fe->http_needed || IS_HTX_STRM(s)) && !http_create_txn(s)) + goto out_free_rspcap; + + /* everything's OK, let's go on */ + return 1; + + /* Error unrolling */ + out_free_rspcap: + pool_free(fe->rsp_cap_pool, s->res_cap); + out_free_reqcap: + pool_free(fe->req_cap_pool, s->req_cap); + out_return: + return -1; +} + +/* Increment current active connection counter. This ensures that global + * maxconn is not reached or exceeded. This must be done for every new frontend + * connection allocation. + * + * Returns the new actconn global value. If maxconn reached or exceeded, 0 is + * returned : the connection allocation should be cancelled. + */ +int increment_actconn() +{ + unsigned int count, next_actconn; + + do { + count = actconn; + if (unlikely(count >= global.maxconn)) { + /* maxconn reached */ + next_actconn = 0; + goto end; + } + + /* try to increment actconn */ + next_actconn = count + 1; + } while (!_HA_ATOMIC_CAS(&actconn, (int *)(&count), next_actconn) && __ha_cpu_relax()); + + end: + return next_actconn; +} + +/************************************************************************/ +/* All supported sample and ACL keywords must be declared here. 
*/ +/************************************************************************/ + +/* set temp integer to the id of the frontend */ +static int +smp_fetch_fe_id(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->flags = SMP_F_VOL_SESS; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = smp->sess->fe->uuid; + return 1; +} + +/* set string to the name of the frontend */ +static int +smp_fetch_fe_name(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->data.u.str.area = (char *)smp->sess->fe->id; + if (!smp->data.u.str.area) + return 0; + + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_CONST; + smp->data.u.str.data = strlen(smp->data.u.str.area); + return 1; +} + +/* set string to the name of the default backend */ +static int +smp_fetch_fe_defbe(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + if (!smp->sess->fe->defbe.be) + return 0; + smp->data.u.str.area = (char *)smp->sess->fe->defbe.be->id; + if (!smp->data.u.str.area) + return 0; + + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_CONST; + smp->data.u.str.data = strlen(smp->data.u.str.area); + return 1; +} + +/* set temp integer to the number of HTTP requests per second reaching the frontend. + * Accepts exactly 1 argument. Argument is a frontend, other types will cause + * an undefined behaviour. + */ +static int +smp_fetch_fe_req_rate(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct proxy *px = args->data.prx; + + if (px == NULL) + return 0; + if (px->cap & PR_CAP_DEF) + px = smp->px; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = read_freq_ctr(&px->fe_req_per_sec); + return 1; +} + +/* set temp integer to the number of connections per second reaching the frontend. + * Accepts exactly 1 argument. Argument is a frontend, other types will cause + * an undefined behaviour. + */ +static int +smp_fetch_fe_sess_rate(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct proxy *px = args->data.prx; + + if (px == NULL) + return 0; + if (px->cap & PR_CAP_DEF) + px = smp->px; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = read_freq_ctr(&px->fe_sess_per_sec); + return 1; +} + +/* set temp integer to the number of concurrent connections on the frontend + * Accepts exactly 1 argument. Argument is a frontend, other types will cause + * an undefined behaviour. + */ +static int +smp_fetch_fe_conn(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct proxy *px = args->data.prx; + + if (px == NULL) + return 0; + if (px->cap & PR_CAP_DEF) + px = smp->px; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = px->feconn; + return 1; +} + +static int +smp_fetch_fe_client_timeout(const struct arg *args, struct sample *smp, const char *km, void *private) +{ + smp->flags = SMP_F_VOL_TXN; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = TICKS_TO_MS(smp->sess->fe->timeout.client); + return 1; +} + + +/* Note: must not be declared <const> as its list will be overwritten. + * Please take care of keeping this list alphabetically sorted. 
+ */ +static struct sample_fetch_kw_list smp_kws = {ILH, { + { "fe_client_timeout", smp_fetch_fe_client_timeout, 0, NULL, SMP_T_SINT, SMP_USE_FTEND, }, + { "fe_conn", smp_fetch_fe_conn, ARG1(1,FE), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "fe_defbe", smp_fetch_fe_defbe, 0, NULL, SMP_T_STR, SMP_USE_FTEND, }, + { "fe_id", smp_fetch_fe_id, 0, NULL, SMP_T_SINT, SMP_USE_FTEND, }, + { "fe_name", smp_fetch_fe_name, 0, NULL, SMP_T_STR, SMP_USE_FTEND, }, + { "fe_req_rate", smp_fetch_fe_req_rate, ARG1(1,FE), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "fe_sess_rate", smp_fetch_fe_sess_rate, ARG1(1,FE), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &smp_kws); + +/* Note: must not be declared <const> as its list will be overwritten. + * Please take care of keeping this list alphabetically sorted. + */ +static struct acl_kw_list acl_kws = {ILH, { + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, acl_register_keywords, &acl_kws); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/h1.c b/src/h1.c new file mode 100644 index 0000000..e251e74 --- /dev/null +++ b/src/h1.c @@ -0,0 +1,1319 @@ +/* + * HTTP/1 protocol analyzer + * + * Copyright 2000-2017 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <ctype.h> + +#include <import/sha1.h> + +#include <haproxy/api.h> +#include <haproxy/base64.h> +#include <haproxy/h1.h> +#include <haproxy/http-hdr.h> +#include <haproxy/tools.h> + +/* Parse the Content-Length header field of an HTTP/1 request. The function + * checks all possible occurrences of a comma-delimited value, and verifies + * if any of them doesn't match a previous value. It returns <0 if a value + * differs, 0 if the whole header can be dropped (i.e. already known), or >0 + * if the value can be indexed (first one). In the last case, the value might + * be adjusted and the caller must only add the updated value. + */ +int h1_parse_cont_len_header(struct h1m *h1m, struct ist *value) +{ + char *e, *n; + long long cl; + int not_first = !!(h1m->flags & H1_MF_CLEN); + struct ist word; + + word.ptr = value->ptr; + e = value->ptr + value->len; + + while (1) { + if (word.ptr >= e) { + /* empty header or empty value */ + goto fail; + } + + /* skip leading delimiter and blanks */ + if (unlikely(HTTP_IS_LWS(*word.ptr))) { + word.ptr++; + continue; + } + + /* digits only now */ + for (cl = 0, n = word.ptr; n < e; n++) { + unsigned int c = *n - '0'; + if (unlikely(c > 9)) { + /* non-digit */ + if (unlikely(n == word.ptr)) // spaces only + goto fail; + break; + } + + if (unlikely(!cl && n > word.ptr)) { + /* There was a leading zero before this digit, + * let's trim it. 
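+ * (e.g. "Content-Length: 0042, 42" parses both words to the same value;
+ * the pointer adjustment below leaves the cleaned "42" as the value to
+ * be indexed)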
+ */
+ word.ptr = n;
+ }
+
+ if (unlikely(cl > ULLONG_MAX / 10ULL))
+ goto fail; /* multiply overflow */
+ cl = cl * 10ULL;
+ if (unlikely(cl + c < cl))
+ goto fail; /* addition overflow */
+ cl = cl + c;
+ }
+
+ /* keep a copy of the exact cleaned value */
+ word.len = n - word.ptr;
+
+ /* skip trailing LWS till next comma or EOL */
+ for (; n < e; n++) {
+ if (!HTTP_IS_LWS(*n)) {
+ if (unlikely(*n != ','))
+ goto fail;
+ break;
+ }
+ }
+
+ /* if duplicate, must be equal */
+ if (h1m->flags & H1_MF_CLEN && cl != h1m->body_len)
+ goto fail;
+
+ /* OK, store this result as the one to be indexed */
+ h1m->flags |= H1_MF_CLEN;
+ h1m->curr_len = h1m->body_len = cl;
+ *value = word;
+
+ /* Now either n==e and we're done, or n points to the comma,
+ * and we skip it and continue.
+ */
+ if (n++ == e)
+ break;
+
+ word.ptr = n;
+ }
+ /* here we've reached the end with a single value or a series of
+ * identical values, all matching previous series if any. The last
+ * parsed value was sent back into <value>. We just have to decide
+ * if this occurrence has to be indexed (it's the first one) or
+ * silently skipped (it's not the first one)
+ */
+ return !not_first;
+ fail:
+ return -1;
+}
+
+/* Parse the Transfer-Encoding: header field of an HTTP/1 request, looking for
+ * "chunked" encoding to perform some checks (it must be the last encoding for
+ * the request and must not be performed twice for any message). The
+ * H1_MF_TE_CHUNKED is set if a valid "chunked" encoding is found. The
+ * H1_MF_TE_OTHER flag is set if any other encoding is found. The H1_MF_XFER_ENC
+ * flag is always set. The H1_MF_CHNK is set when "chunked" encoding is the last
+ * one. Note that transfer codings are case-insensitive (cf RFC7230#4). This
+ * function returns <0 if an error is found, 0 if the whole header can be dropped
+ * (not used yet), or >0 if the value can be indexed.
+ */
+int h1_parse_xfer_enc_header(struct h1m *h1m, struct ist value)
+{
+ char *e, *n;
+ struct ist word;
+
+ h1m->flags |= H1_MF_XFER_ENC;
+
+ word.ptr = value.ptr - 1; // -1 for next loop's pre-increment
+ e = istend(value);
+
+ while (++word.ptr < e) {
+ /* skip leading delimiter and blanks */
+ if (HTTP_IS_LWS(*word.ptr))
+ continue;
+
+ n = http_find_hdr_value_end(word.ptr, e); // next comma or end of line
+ word.len = n - word.ptr;
+
+ /* trim trailing blanks */
+ while (word.len && HTTP_IS_LWS(word.ptr[word.len-1]))
+ word.len--;
+
+ h1m->flags &= ~H1_MF_CHNK;
+ if (isteqi(word, ist("chunked"))) {
+ if (h1m->flags & H1_MF_TE_CHUNKED) {
+ /* cf RFC7230#3.3.1 : A sender MUST NOT apply
+ * chunked more than once to a message body
+ * (i.e., chunking an already chunked message is
+ * not allowed)
+ */
+ goto fail;
+ }
+ h1m->flags |= (H1_MF_TE_CHUNKED|H1_MF_CHNK);
+ }
+ else {
+ if ((h1m->flags & (H1_MF_RESP|H1_MF_TE_CHUNKED)) == H1_MF_TE_CHUNKED) {
+ /* cf RFC7230#3.3.1 : If any transfer coding
+ * other than chunked is applied to a request
+ * payload body, the sender MUST apply chunked
+ * as the final transfer coding to ensure that
+ * the message is properly framed.
+ */
+ goto fail;
+ }
+ h1m->flags |= H1_MF_TE_OTHER;
+ }
+
+ word.ptr = n;
+ }
+
+ return 1;
+ fail:
+ return -1;
+}
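To make the return-value contracts of h1_parse_cont_len_header() and h1_parse_xfer_enc_header() concrete, here is a minimal sketch of a hypothetical caller, assuming HAProxy's internal headers (<haproxy/h1.h>, <import/ist.h>) are on the include path; the values are illustrative only:

#include <stdio.h>
#include <import/ist.h>
#include <haproxy/h1.h>

int main(void)
{
	struct h1m h1m = { .flags = 0 };
	struct ist cl = ist("1024");
	int ret;

	/* first Content-Length: indexable, returns 1 and records body_len */
	ret = h1_parse_cont_len_header(&h1m, &cl);
	printf("CL #1 -> %d (body_len=%llu)\n", ret, (unsigned long long)h1m.body_len);

	/* identical duplicate: droppable, returns 0 */
	cl = ist("1024");
	printf("CL #2 -> %d\n", h1_parse_cont_len_header(&h1m, &cl));

	/* conflicting duplicate: rejected, returns -1 */
	cl = ist("2048");
	printf("CL #3 -> %d\n", h1_parse_cont_len_header(&h1m, &cl));

	/* "gzip, chunked" is a valid coding list for a request: chunked is last */
	h1m.flags = 0;
	ret = h1_parse_xfer_enc_header(&h1m, ist("gzip, chunked"));
	printf("TE -> %d (chunked=%d)\n", ret, !!(h1m.flags & H1_MF_CHNK));
	return 0;
}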
+
+/* Validate the authority and the host header value for the CONNECT method. If
+ * there is a host header, its value is normalized. 0 is returned on success,
+ * -1 if the authority is invalid and -2 if the host is invalid.
+ */
+static int h1_validate_connect_authority(struct ist authority, struct ist *host_hdr)
+{
+ struct ist uri_host, uri_port, host, host_port;
+
+ if (!isttest(authority))
+ goto invalid_authority;
+ uri_host = authority;
+ uri_port = http_get_host_port(authority);
+ if (!istlen(uri_port))
+ goto invalid_authority;
+ uri_host.len -= (istlen(uri_port) + 1);
+
+ if (!host_hdr || !isttest(*host_hdr))
+ goto end;
+
+ /* Get the port of the host header value, if any */
+ host = *host_hdr;
+ host_port = http_get_host_port(*host_hdr);
+ if (isttest(host_port))
+ host.len -= (istlen(host_port) + 1);
+
+ if (istlen(host_port)) {
+ if (!isteqi(host, uri_host) || !isteq(host_port, uri_port))
+ goto invalid_host;
+ if (http_is_default_port(IST_NULL, uri_port))
+ *host_hdr = host; /* normalize */
+ }
+ else {
+ if (!http_is_default_port(IST_NULL, uri_port) || !isteqi(host, uri_host))
+ goto invalid_host;
+ }
+
+ end:
+ return 0;
+
+ invalid_authority:
+ return -1;
+
+ invalid_host:
+ return -2;
+}
+
+
+/* Validate the authority and the host header value for non-CONNECT methods,
+ * when an absolute-URI is detected but does not exactly match the host value.
+ * The idea is to detect the default port (http or https). The authority and
+ * the host are both defined here. 0 is returned on success, -1 if the host
+ * does not match the authority.
+ */
+static int h1_validate_mismatch_authority(struct ist scheme, struct ist authority, struct ist host_hdr)
+{
+ struct ist uri_host, uri_port, host, host_port;
+
+ if (!isttest(scheme))
+ goto mismatch;
+
+ uri_host = authority;
+ uri_port = http_get_host_port(authority);
+ if (isttest(uri_port))
+ uri_host.len -= (istlen(uri_port) + 1);
+
+ host = host_hdr;
+ host_port = http_get_host_port(host_hdr);
+ if (isttest(host_port))
+ host.len -= (istlen(host_port) + 1);
+
+ if (!isttest(uri_port) && !isttest(host_port)) {
+ /* No port on both: we already know the authority does not match
+ * the host value
+ */
+ goto mismatch;
+ }
+ else if (isttest(uri_port) && !http_is_default_port(scheme, uri_port)) {
+ /* here there is no port for the host value and the port for the
+ * authority is not the default one
+ */
+ goto mismatch;
+ }
+ else if (isttest(host_port) && !http_is_default_port(scheme, host_port)) {
+ /* here there is no port for the authority and the port for the
+ * host value is not the default one
+ */
+ goto mismatch;
+ }
+ else {
+ /* the authority or the host value contain a default port and
+ * there is no port on the other value
+ */
+ if (!isteqi(uri_host, host))
+ goto mismatch;
+ }
+
+ return 0;
+
+ mismatch:
+ return -1;
+}
+
+
+/* Parse the Connection: header of an HTTP/1 request, looking for "close",
+ * "keep-alive", and "upgrade" values, and updating h1m->flags according to
+ * what was found there. Note that flags are only added, not removed, so the
+ * function is safe for being called multiple times if multiple occurrences
+ * are found. If the flag H1_MF_CLEAN_CONN_HDR is set, the header value is
+ * cleaned up from "keep-alive" and "close" values. To do so, the header
+ * value is rewritten in place and its length is updated.
+ */
+void h1_parse_connection_header(struct h1m *h1m, struct ist *value)
+{
+ char *e, *n, *p;
+ struct ist word;
+
+ word.ptr = value->ptr - 1; // -1 for next loop's pre-increment
+ p = value->ptr;
+ e = value->ptr + value->len;
+ if (h1m->flags & H1_MF_CLEAN_CONN_HDR)
+ value->len = 0;
+
+ while (++word.ptr < e) {
+ /* skip leading delimiter and blanks */
+ if (HTTP_IS_LWS(*word.ptr))
+ continue;
+
+ n = http_find_hdr_value_end(word.ptr, e); // next comma or end of line
+ word.len = n - word.ptr;
+
+ /* trim trailing blanks */
+ while (word.len && HTTP_IS_LWS(word.ptr[word.len-1]))
+ word.len--;
+
+ if (isteqi(word, ist("keep-alive"))) {
+ h1m->flags |= H1_MF_CONN_KAL;
+ if (h1m->flags & H1_MF_CLEAN_CONN_HDR)
+ goto skip_val;
+ }
+ else if (isteqi(word, ist("close"))) {
+ h1m->flags |= H1_MF_CONN_CLO;
+ if (h1m->flags & H1_MF_CLEAN_CONN_HDR)
+ goto skip_val;
+ }
+ else if (isteqi(word, ist("upgrade")))
+ h1m->flags |= H1_MF_CONN_UPG;
+
+ if (h1m->flags & H1_MF_CLEAN_CONN_HDR) {
+ if (value->ptr + value->len == p) {
+ /* no rewrite done till now */
+ value->len = n - value->ptr;
+ }
+ else {
+ if (value->len)
+ value->ptr[value->len++] = ',';
+ istcat(value, word, e - value->ptr);
+ }
+ }
+
+ skip_val:
+ word.ptr = p = n;
+ }
+}
+
+/* Parse the Upgrade: header of an HTTP/1 request.
+ * If "websocket" is found, set H1_MF_UPG_WEBSOCKET flag
+ */
+void h1_parse_upgrade_header(struct h1m *h1m, struct ist value)
+{
+ char *e, *n;
+ struct ist word;
+
+ h1m->flags &= ~H1_MF_UPG_WEBSOCKET;
+
+ word.ptr = value.ptr - 1; // -1 for next loop's pre-increment
+ e = istend(value);
+
+ while (++word.ptr < e) {
+ /* skip leading delimiter and blanks */
+ if (HTTP_IS_LWS(*word.ptr))
+ continue;
+
+ n = http_find_hdr_value_end(word.ptr, e); // next comma or end of line
+ word.len = n - word.ptr;
+
+ /* trim trailing blanks */
+ while (word.len && HTTP_IS_LWS(word.ptr[word.len-1]))
+ word.len--;
+
+ if (isteqi(word, ist("websocket")))
+ h1m->flags |= H1_MF_UPG_WEBSOCKET;
+
+ word.ptr = n;
+ }
+}
+
+/* Macros used in the HTTP/1 parser, to check for the expected presence of
+ * certain bytes (e.g.: LF) or to skip to the next byte and yield in case of
+ * failure.
+ */
+
+/* Expects to find an LF at <ptr>. If not, set <state> to <where> and jump to
+ * <bad>.
+ */
+#define EXPECT_LF_HERE(ptr, bad, state, where) \
+ do { \
+ if (unlikely(*(ptr) != '\n')) { \
+ state = (where); \
+ goto bad; \
+ } \
+ } while (0)
+
+/* Increments pointer <ptr>, continues to label <more> if it's still below
+ * pointer <end>, or goes to <stop> and sets <state> to <where> if the end
+ * of buffer was reached.
+ */
+#define EAT_AND_JUMP_OR_RETURN(ptr, end, more, stop, state, where) \
+ do { \
+ if (likely(++(ptr) < (end))) \
+ goto more; \
+ else { \
+ state = (where); \
+ goto stop; \
+ } \
+ } while (0)
+
+/* This function parses a contiguous HTTP/1 headers block starting at <start>
+ * and ending before <stop>, at once, and converts it to a list of (name,value)
+ * pairs representing header fields into the array <hdr> of size <hdr_num>,
+ * whose last entry will have an empty name and an empty value. If <hdr_num> is
+ * too small to represent the whole message, an error is returned. Some
+ * protocol elements such as content-length and transfer-encoding will be
+ * parsed and stored into h1m as well. <hdr> may be null, in which case only
+ * the parsing state will be updated. This may be used to restart the parsing
+ * where it stopped for example.
+ *
+ * For now it's limited to the response.
If the header block is incomplete,
+ * 0 is returned, waiting to be called again with more data to try again.
+ * The caller is responsible for initializing h1m->state to H1_MSG_RPBEFORE,
+ * and h1m->next to zero on the first call; the parser will do the rest. If
+ * an incomplete message is seen, the caller only needs to present h1m->state
+ * and h1m->next again, with an empty header list so that the parser can start
+ * again. In this case, it will detect that it interrupted a previous session
+ * and will first look for the end of the message before parsing it again and
+ * indexing it at the same time. This ensures that incomplete messages fed 1
+ * character at a time are never processed entirely more than exactly twice,
+ * and that there is no need to store all the internal state and pre-parsed
+ * headers or start line between calls.
+ *
+ * A pointer to a start line descriptor may be passed in <slp>, in which case
+ * the parser will fill it with whatever it found.
+ *
+ * The code is derived from the main HTTP/1 parser above but was simplified and
+ * optimized to process responses produced or forwarded by haproxy. The caller
+ * is responsible for ensuring that the message doesn't wrap, and should ensure
+ * it is complete to avoid having to retry the operation after a failed
+ * attempt. The message is not supposed to be invalid, which is why a few
+ * properties such as the character set used in the header field names are not
+ * checked. In case of an unparsable response message, a negative value will be
+ * returned with h1m->err_pos and h1m->err_state matching the location and
+ * state where the error was met. Leading blank lines are tolerated but not
+ * recommended. If flag H1_MF_HDRS_ONLY is set in h1m->flags, only headers are
+ * parsed and the start line is skipped. It is not required to set h1m->state
+ * nor h1m->next in this case.
+ *
+ * This function returns :
+ *   -1 in case of error. In this case, h1m->err_state is filled (if h1m is
+ *      set) with the state the error occurred in and h1m->err_pos with the
+ *      position relative to <start>
+ *   -2 if the output is full (hdr_num reached). err_state and err_pos also
+ *      indicate where it failed.
+ *    0 in case of missing data.
+ *  > 0 on success, it then corresponds to the number of bytes read since
+ *      <start> so that the caller can go on with the payload.
+ */
+int h1_headers_to_hdr_list(char *start, const char *stop,
+                           struct http_hdr *hdr, unsigned int hdr_num,
+                           struct h1m *h1m, union h1_sl *slp)
+{
+ enum h1m_state state;
+ register char *ptr;
+ register const char *end;
+ unsigned int hdr_count;
+ unsigned int skip; /* number of bytes skipped at the beginning */
+ unsigned int sol;  /* start of line */
+ unsigned int col;  /* position of the colon */
+ unsigned int eol;  /* end of line */
+ unsigned int sov;  /* start of value */
+ union h1_sl sl;
+ int skip_update;
+ int restarting;
+ int host_idx;
+ struct ist n, v;   /* header name and value during parsing */
+
+ skip = 0; // do it only once to keep track of the leading CRLF.
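Before the state machine itself, a brief sketch of the initialization contract described above, as a hypothetical caller might apply it to a complete response head held in <rspbuf>/<rsplen> (both assumed):

static int parse_response_head(char *rspbuf, size_t rsplen)
{
	struct http_hdr hdr[64];
	union h1_sl sl;
	/* err_pos = -2 selects the strict mode (see the err_pos checks below) */
	struct h1m h1m = { .state = H1_MSG_RPBEFORE, .next = 0,
	                   .flags = H1_MF_RESP, .err_pos = -2 };

	int ret = h1_headers_to_hdr_list(rspbuf, rspbuf + rsplen,
	                                 hdr, sizeof(hdr) / sizeof(hdr[0]),
	                                 &h1m, &sl);
	if (ret > 0)
		return ret;  /* hdr[] filled, status in sl.st.status,
		              * payload starts at rspbuf + ret */
	if (ret == 0)
		return 0;    /* incomplete: present h1m.state/h1m.next again
		              * with more data */
	return ret;          /* -1: invalid message, -2: hdr[] too small */
}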
+ + try_again: + hdr_count = sol = col = eol = sov = 0; + sl.st.status = 0; + skip_update = restarting = 0; + host_idx = -1; + + if (h1m->flags & H1_MF_HDRS_ONLY) { + state = H1_MSG_HDR_FIRST; + h1m->next = 0; + } + else { + state = h1m->state; + if (h1m->state != H1_MSG_RQBEFORE && h1m->state != H1_MSG_RPBEFORE) + restarting = 1; + } + + ptr = start + h1m->next; + end = stop; + + if (unlikely(ptr >= end)) + goto http_msg_ood; + + /* don't update output if hdr is NULL or if we're restarting */ + if (!hdr || restarting) + skip_update = 1; + + switch (state) { + case H1_MSG_RQBEFORE: + http_msg_rqbefore: + if (likely(HTTP_IS_TOKEN(*ptr))) { + /* we have a start of message, we may have skipped some + * heading CRLF. Skip them now. + */ + skip += ptr - start; + start = ptr; + + sol = 0; + sl.rq.m.ptr = ptr; + hdr_count = 0; + state = H1_MSG_RQMETH; + goto http_msg_rqmeth; + } + + if (unlikely(!HTTP_IS_CRLF(*ptr))) { + state = H1_MSG_RQBEFORE; + goto http_msg_invalid; + } + + if (unlikely(*ptr == '\n')) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rqbefore, http_msg_ood, state, H1_MSG_RQBEFORE); + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rqbefore_cr, http_msg_ood, state, H1_MSG_RQBEFORE_CR); + /* stop here */ + + case H1_MSG_RQBEFORE_CR: + http_msg_rqbefore_cr: + EXPECT_LF_HERE(ptr, http_msg_invalid, state, H1_MSG_RQBEFORE_CR); + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rqbefore, http_msg_ood, state, H1_MSG_RQBEFORE); + /* stop here */ + + case H1_MSG_RQMETH: + http_msg_rqmeth: + if (likely(HTTP_IS_TOKEN(*ptr))) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rqmeth, http_msg_ood, state, H1_MSG_RQMETH); + + if (likely(HTTP_IS_SPHT(*ptr))) { + sl.rq.m.len = ptr - sl.rq.m.ptr; + sl.rq.meth = find_http_meth(start, sl.rq.m.len); + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rqmeth_sp, http_msg_ood, state, H1_MSG_RQMETH_SP); + } + + if (likely(HTTP_IS_CRLF(*ptr))) { + /* HTTP 0.9 request */ + sl.rq.m.len = ptr - sl.rq.m.ptr; + sl.rq.meth = find_http_meth(sl.rq.m.ptr, sl.rq.m.len); + http_msg_req09_uri: + sl.rq.u.ptr = ptr; + http_msg_req09_uri_e: + sl.rq.u.len = ptr - sl.rq.u.ptr; + http_msg_req09_ver: + sl.rq.v = ist2(ptr, 0); + goto http_msg_rqline_eol; + } + state = H1_MSG_RQMETH; + goto http_msg_invalid; + + case H1_MSG_RQMETH_SP: + http_msg_rqmeth_sp: + if (likely(!HTTP_IS_LWS(*ptr))) { + sl.rq.u.ptr = ptr; + goto http_msg_rquri; + } + if (likely(HTTP_IS_SPHT(*ptr))) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rqmeth_sp, http_msg_ood, state, H1_MSG_RQMETH_SP); + /* so it's a CR/LF, meaning an HTTP 0.9 request */ + goto http_msg_req09_uri; + + case H1_MSG_RQURI: + http_msg_rquri: +#ifdef HA_UNALIGNED_LE + /* speedup: skip bytes not between 0x24 and 0x7e inclusive */ + while (ptr <= end - sizeof(int)) { + int x = *(int *)ptr - 0x24242424; + if (x & 0x80808080) + break; + + x -= 0x5b5b5b5b; + if (!(x & 0x80808080)) + break; + + ptr += sizeof(int); + } +#endif + if (ptr >= end) { + state = H1_MSG_RQURI; + goto http_msg_ood; + } + http_msg_rquri2: + if (likely((unsigned char)(*ptr - 33) <= 93)) { /* 33 to 126 included */ + if (*ptr == '#') { + if (h1m->err_pos < -1) /* PR_O2_REQBUG_OK not set */ + goto invalid_char; + if (h1m->err_pos == -1) /* PR_O2_REQBUG_OK set: just log */ + h1m->err_pos = ptr - start + skip; + } + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rquri2, http_msg_ood, state, H1_MSG_RQURI); + } + + if (likely(HTTP_IS_SPHT(*ptr))) { + sl.rq.u.len = ptr - sl.rq.u.ptr; + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rquri_sp, http_msg_ood, state, H1_MSG_RQURI_SP); + } + if (likely((unsigned 
char)*ptr >= 128)) { + /* non-ASCII chars are forbidden unless option + * accept-invalid-http-request is enabled in the frontend. + * In any case, we capture the faulty char. + */ + if (h1m->err_pos < -1) + goto invalid_char; + if (h1m->err_pos == -1) + h1m->err_pos = ptr - start + skip; + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rquri, http_msg_ood, state, H1_MSG_RQURI); + } + + if (likely(HTTP_IS_CRLF(*ptr))) { + /* so it's a CR/LF, meaning an HTTP 0.9 request */ + goto http_msg_req09_uri_e; + } + + /* OK forbidden chars, 0..31 or 127 */ + invalid_char: + state = H1_MSG_RQURI; + goto http_msg_invalid; + + case H1_MSG_RQURI_SP: + http_msg_rquri_sp: + if (likely(!HTTP_IS_LWS(*ptr))) { + sl.rq.v.ptr = ptr; + goto http_msg_rqver; + } + if (likely(HTTP_IS_SPHT(*ptr))) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rquri_sp, http_msg_ood, state, H1_MSG_RQURI_SP); + /* so it's a CR/LF, meaning an HTTP 0.9 request */ + goto http_msg_req09_ver; + + + case H1_MSG_RQVER: + http_msg_rqver: + if (likely(HTTP_IS_VER_TOKEN(*ptr))) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rqver, http_msg_ood, state, H1_MSG_RQVER); + + if (likely(HTTP_IS_CRLF(*ptr))) { + sl.rq.v.len = ptr - sl.rq.v.ptr; + http_msg_rqline_eol: + /* We have seen the end of line. Note that we do not + * necessarily have the \n yet, but at least we know that we + * have EITHER \r OR \n, otherwise the request would not be + * complete. We can then record the request length and return + * to the caller which will be able to register it. + */ + + if (likely(!skip_update)) { + if ((sl.rq.v.len == 8) && + (*(sl.rq.v.ptr + 5) > '1' || + (*(sl.rq.v.ptr + 5) == '1' && *(sl.rq.v.ptr + 7) >= '1'))) + h1m->flags |= H1_MF_VER_11; + + if (unlikely(hdr_count >= hdr_num)) { + state = H1_MSG_RQVER; + goto http_output_full; + } + if (!(h1m->flags & H1_MF_NO_PHDR)) + http_set_hdr(&hdr[hdr_count++], ist(":method"), sl.rq.m); + + if (unlikely(hdr_count >= hdr_num)) { + state = H1_MSG_RQVER; + goto http_output_full; + } + if (!(h1m->flags & H1_MF_NO_PHDR)) + http_set_hdr(&hdr[hdr_count++], ist(":path"), sl.rq.u); + } + + sol = ptr - start; + if (likely(*ptr == '\r')) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rqline_end, http_msg_ood, state, H1_MSG_RQLINE_END); + goto http_msg_rqline_end; + } + + /* neither an HTTP_VER token nor a CRLF */ + state = H1_MSG_RQVER; + goto http_msg_invalid; + + case H1_MSG_RQLINE_END: + http_msg_rqline_end: + /* check for HTTP/0.9 request : no version information + * available. sol must point to the first of CR or LF. However + * since we don't save these elements between calls, if we come + * here from a restart, we don't necessarily know. Thus in this + * case we simply start over. + */ + if (restarting) + goto restart; + + if (unlikely(sl.rq.v.len == 0)) + goto http_msg_last_lf; + + EXPECT_LF_HERE(ptr, http_msg_invalid, state, H1_MSG_RQLINE_END); + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_first, http_msg_ood, state, H1_MSG_HDR_FIRST); + /* stop here */ + + /* + * Common states below + */ + case H1_MSG_RPBEFORE: + http_msg_rpbefore: + if (likely(HTTP_IS_TOKEN(*ptr))) { + /* we have a start of message, we may have skipped some + * heading CRLF. Skip them now. 
+ */ + skip += ptr - start; + start = ptr; + + sol = 0; + sl.st.v.ptr = ptr; + hdr_count = 0; + state = H1_MSG_RPVER; + goto http_msg_rpver; + } + + if (unlikely(!HTTP_IS_CRLF(*ptr))) { + state = H1_MSG_RPBEFORE; + goto http_msg_invalid; + } + + if (unlikely(*ptr == '\n')) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpbefore, http_msg_ood, state, H1_MSG_RPBEFORE); + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpbefore_cr, http_msg_ood, state, H1_MSG_RPBEFORE_CR); + /* stop here */ + + case H1_MSG_RPBEFORE_CR: + http_msg_rpbefore_cr: + EXPECT_LF_HERE(ptr, http_msg_invalid, state, H1_MSG_RPBEFORE_CR); + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpbefore, http_msg_ood, state, H1_MSG_RPBEFORE); + /* stop here */ + + case H1_MSG_RPVER: + http_msg_rpver: + if (likely(HTTP_IS_VER_TOKEN(*ptr))) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpver, http_msg_ood, state, H1_MSG_RPVER); + + if (likely(HTTP_IS_SPHT(*ptr))) { + sl.st.v.len = ptr - sl.st.v.ptr; + + if ((sl.st.v.len == 8) && + (*(sl.st.v.ptr + 5) > '1' || + (*(sl.st.v.ptr + 5) == '1' && *(sl.st.v.ptr + 7) >= '1'))) + h1m->flags |= H1_MF_VER_11; + + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpver_sp, http_msg_ood, state, H1_MSG_RPVER_SP); + } + state = H1_MSG_RPVER; + goto http_msg_invalid; + + case H1_MSG_RPVER_SP: + http_msg_rpver_sp: + if (likely(!HTTP_IS_LWS(*ptr))) { + sl.st.status = 0; + sl.st.c.ptr = ptr; + goto http_msg_rpcode; + } + if (likely(HTTP_IS_SPHT(*ptr))) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpver_sp, http_msg_ood, state, H1_MSG_RPVER_SP); + /* so it's a CR/LF, this is invalid */ + state = H1_MSG_RPVER_SP; + goto http_msg_invalid; + + case H1_MSG_RPCODE: + http_msg_rpcode: + if (likely(HTTP_IS_DIGIT(*ptr))) { + sl.st.status = sl.st.status * 10 + *ptr - '0'; + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpcode, http_msg_ood, state, H1_MSG_RPCODE); + } + + if (unlikely(!HTTP_IS_LWS(*ptr))) { + state = H1_MSG_RPCODE; + goto http_msg_invalid; + } + + if (likely(HTTP_IS_SPHT(*ptr))) { + sl.st.c.len = ptr - sl.st.c.ptr; + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpcode_sp, http_msg_ood, state, H1_MSG_RPCODE_SP); + } + + /* so it's a CR/LF, so there is no reason phrase */ + sl.st.c.len = ptr - sl.st.c.ptr; + + http_msg_rsp_reason: + sl.st.r = ist2(ptr, 0); + goto http_msg_rpline_eol; + + case H1_MSG_RPCODE_SP: + http_msg_rpcode_sp: + if (likely(!HTTP_IS_LWS(*ptr))) { + sl.st.r.ptr = ptr; + goto http_msg_rpreason; + } + if (likely(HTTP_IS_SPHT(*ptr))) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpcode_sp, http_msg_ood, state, H1_MSG_RPCODE_SP); + /* so it's a CR/LF, so there is no reason phrase */ + goto http_msg_rsp_reason; + + case H1_MSG_RPREASON: + http_msg_rpreason: + if (likely(!HTTP_IS_CRLF(*ptr))) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpreason, http_msg_ood, state, H1_MSG_RPREASON); + sl.st.r.len = ptr - sl.st.r.ptr; + http_msg_rpline_eol: + /* We have seen the end of line. Note that we do not + * necessarily have the \n yet, but at least we know that we + * have EITHER \r OR \n, otherwise the response would not be + * complete. We can then record the response length and return + * to the caller which will be able to register it. 
+ */ + + if (likely(!skip_update)) { + if (unlikely(hdr_count >= hdr_num)) { + state = H1_MSG_RPREASON; + goto http_output_full; + } + if (!(h1m->flags & H1_MF_NO_PHDR)) + http_set_hdr(&hdr[hdr_count++], ist(":status"), sl.st.c); + } + + sol = ptr - start; + if (likely(*ptr == '\r')) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_rpline_end, http_msg_ood, state, H1_MSG_RPLINE_END); + goto http_msg_rpline_end; + + case H1_MSG_RPLINE_END: + http_msg_rpline_end: + /* sol must point to the first of CR or LF. */ + EXPECT_LF_HERE(ptr, http_msg_invalid, state, H1_MSG_RPLINE_END); + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_first, http_msg_ood, state, H1_MSG_HDR_FIRST); + /* stop here */ + + case H1_MSG_HDR_FIRST: + http_msg_hdr_first: + sol = ptr - start; + if (likely(!HTTP_IS_CRLF(*ptr))) { + goto http_msg_hdr_name; + } + + if (likely(*ptr == '\r')) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_last_lf, http_msg_ood, state, H1_MSG_LAST_LF); + goto http_msg_last_lf; + + case H1_MSG_HDR_NAME: + http_msg_hdr_name: + /* assumes sol points to the first char */ + if (likely(HTTP_IS_TOKEN(*ptr))) { + if (!skip_update) { + /* turn it to lower case if needed */ + if (isupper((unsigned char)*ptr) && h1m->flags & H1_MF_TOLOWER) + *ptr = tolower((unsigned char)*ptr); + } + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_name, http_msg_ood, state, H1_MSG_HDR_NAME); + } + + if (likely(*ptr == ':')) { + col = ptr - start; + if (col <= sol) { + state = H1_MSG_HDR_NAME; + goto http_msg_invalid; + } + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_l1_sp, http_msg_ood, state, H1_MSG_HDR_L1_SP); + } + + if (likely(h1m->err_pos < -1) || *ptr == '\n') { + state = H1_MSG_HDR_NAME; + goto http_msg_invalid; + } + + if (h1m->err_pos == -1) /* capture the error pointer */ + h1m->err_pos = ptr - start + skip; /* >= 0 now */ + + /* and we still accept this non-token character */ + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_name, http_msg_ood, state, H1_MSG_HDR_NAME); + + case H1_MSG_HDR_L1_SP: + http_msg_hdr_l1_sp: + /* assumes sol points to the first char */ + if (likely(HTTP_IS_SPHT(*ptr))) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_l1_sp, http_msg_ood, state, H1_MSG_HDR_L1_SP); + + /* header value can be basically anything except CR/LF */ + sov = ptr - start; + + if (likely(!HTTP_IS_CRLF(*ptr))) { + goto http_msg_hdr_val; + } + + if (likely(*ptr == '\r')) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_l1_lf, http_msg_ood, state, H1_MSG_HDR_L1_LF); + goto http_msg_hdr_l1_lf; + + case H1_MSG_HDR_L1_LF: + http_msg_hdr_l1_lf: + EXPECT_LF_HERE(ptr, http_msg_invalid, state, H1_MSG_HDR_L1_LF); + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_l1_lws, http_msg_ood, state, H1_MSG_HDR_L1_LWS); + + case H1_MSG_HDR_L1_LWS: + http_msg_hdr_l1_lws: + if (likely(HTTP_IS_SPHT(*ptr))) { + if (!skip_update) { + /* replace HT,CR,LF with spaces */ + for (; start + sov < ptr; sov++) + start[sov] = ' '; + } + goto http_msg_hdr_l1_sp; + } + /* we had a header consisting only in spaces ! */ + eol = sov; + goto http_msg_complete_header; + + case H1_MSG_HDR_VAL: + http_msg_hdr_val: + /* assumes sol points to the first char, and sov + * points to the first character of the value. + */ + + /* speedup: we'll skip packs of 4 or 8 bytes not containing bytes 0x0D + * and lower. In fact since most of the time is spent in the loop, we + * also remove the sign bit test so that bytes 0x8e..0x0d break the + * loop, but we don't care since they're very rare in header values. 
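To make the byte trick below concrete: for any byte b, (b - 0x0e) has its top bit set exactly when b is below 0x0e (the subtraction borrows) or at/above 0x8e. So '\r' (0x0d) gives 0xff and stops the scan, 'A' (0x41) gives 0x33 and is skipped, and the rare high bytes 0x8e..0xff are the harmless false positives mentioned above.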
+ */ +#ifdef HA_UNALIGNED_LE64 + while (ptr <= end - sizeof(long)) { + if ((*(long *)ptr - 0x0e0e0e0e0e0e0e0eULL) & 0x8080808080808080ULL) + goto http_msg_hdr_val2; + ptr += sizeof(long); + } +#endif +#ifdef HA_UNALIGNED_LE + while (ptr <= end - sizeof(int)) { + if ((*(int*)ptr - 0x0e0e0e0e) & 0x80808080) + goto http_msg_hdr_val2; + ptr += sizeof(int); + } +#endif + if (ptr >= end) { + state = H1_MSG_HDR_VAL; + goto http_msg_ood; + } + http_msg_hdr_val2: + if (likely(!*ptr)) { + /* RFC9110 clarified that NUL is explicitly forbidden in header values + * (like CR and LF). + */ + if (h1m->err_pos < -1) { /* PR_O2_REQBUG_OK not set */ + state = H1_MSG_HDR_VAL; + goto http_msg_invalid; + } + if (h1m->err_pos == -1) /* PR_O2_REQBUG_OK set: just log */ + h1m->err_pos = ptr - start + skip; + } + if (likely(!HTTP_IS_CRLF(*ptr))) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_val2, http_msg_ood, state, H1_MSG_HDR_VAL); + + eol = ptr - start; + /* Note: we could also copy eol into ->eoh so that we have the + * real header end in case it ends with lots of LWS, but is this + * really needed ? + */ + if (likely(*ptr == '\r')) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_l2_lf, http_msg_ood, state, H1_MSG_HDR_L2_LF); + goto http_msg_hdr_l2_lf; + + case H1_MSG_HDR_L2_LF: + http_msg_hdr_l2_lf: + EXPECT_LF_HERE(ptr, http_msg_invalid, state, H1_MSG_HDR_L2_LF); + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_l2_lws, http_msg_ood, state, H1_MSG_HDR_L2_LWS); + + case H1_MSG_HDR_L2_LWS: + http_msg_hdr_l2_lws: + if (unlikely(HTTP_IS_SPHT(*ptr))) { + if (!skip_update) { + /* LWS: replace HT,CR,LF with spaces */ + for (; start + eol < ptr; eol++) + start[eol] = ' '; + } + goto http_msg_hdr_val; + } + http_msg_complete_header: + /* + * It was a new header, so the last one is finished. Assumes + * <sol> points to the first char of the name, <col> to the + * colon, <sov> points to the first character of the value and + * <eol> to the first CR or LF so we know how the line ends. We + * will trim spaces around the value. It's possible to do it by + * adjusting <eol> and <sov> which are no more used after this. + * We can add the header field to the list. 
+ */ + if (likely(!skip_update)) { + while (sov < eol && HTTP_IS_LWS(start[sov])) + sov++; + + while (eol - 1 > sov && HTTP_IS_LWS(start[eol - 1])) + eol--; + + + n = ist2(start + sol, col - sol); + v = ist2(start + sov, eol - sov); + + do { + int ret; + + if (unlikely(hdr_count >= hdr_num)) { + state = H1_MSG_HDR_L2_LWS; + goto http_output_full; + } + + if (isteqi(n, ist("transfer-encoding"))) { + ret = h1_parse_xfer_enc_header(h1m, v); + if (ret < 0) { + state = H1_MSG_HDR_L2_LWS; + ptr = v.ptr; /* Set ptr on the error */ + goto http_msg_invalid; + } + else if (ret == 0) { + /* skip it */ + break; + } + } + else if (isteqi(n, ist("content-length"))) { + ret = h1_parse_cont_len_header(h1m, &v); + + if (ret < 0) { + state = H1_MSG_HDR_L2_LWS; + ptr = v.ptr; /* Set ptr on the error */ + goto http_msg_invalid; + } + else if (ret == 0) { + /* skip it */ + break; + } + } + else if (isteqi(n, ist("connection"))) { + h1_parse_connection_header(h1m, &v); + if (!v.len) { + /* skip it */ + break; + } + } + else if (isteqi(n, ist("upgrade"))) { + h1_parse_upgrade_header(h1m, v); + } + else if (!(h1m->flags & H1_MF_RESP) && isteqi(n, ist("host"))) { + if (host_idx == -1) + host_idx = hdr_count; + else { + if (!isteqi(v, hdr[host_idx].v)) { + state = H1_MSG_HDR_L2_LWS; + ptr = v.ptr; /* Set ptr on the error */ + goto http_msg_invalid; + } + /* if the same host, skip it */ + break; + } + } + + http_set_hdr(&hdr[hdr_count++], n, v); + } while (0); + } + + sol = ptr - start; + + if (likely(!HTTP_IS_CRLF(*ptr))) + goto http_msg_hdr_name; + + if (likely(*ptr == '\r')) + EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_last_lf, http_msg_ood, state, H1_MSG_LAST_LF); + goto http_msg_last_lf; + + case H1_MSG_LAST_LF: + http_msg_last_lf: + EXPECT_LF_HERE(ptr, http_msg_invalid, state, H1_MSG_LAST_LF); + ptr++; + /* <ptr> now points to the first byte of payload. If needed sol + * still points to the first of either CR or LF of the empty + * line ending the headers block. + */ + if (likely(!skip_update)) { + if (unlikely(hdr_count >= hdr_num)) { + state = H1_MSG_LAST_LF; + goto http_output_full; + } + http_set_hdr(&hdr[hdr_count++], ist2(start+sol, 0), ist("")); + } + + /* reaching here we've parsed the whole message. We may detect + * that we were already continuing an interrupted parsing pass + * so we were silently looking for the end of message not + * updating anything before deciding to parse it fully at once. + * It's guaranteed that we won't match this test twice in a row + * since restarting will turn zero. + */ + if (restarting) + goto restart; + + + if (!(h1m->flags & (H1_MF_HDRS_ONLY|H1_MF_RESP))) { + struct http_uri_parser parser = http_uri_parser_init(sl.rq.u); + struct ist scheme, authority; + int ret; + + scheme = http_parse_scheme(&parser); + authority = http_parse_authority(&parser, 1); + if (sl.rq.meth == HTTP_METH_CONNECT) { + struct ist *host = ((host_idx != -1) ? &hdr[host_idx].v : NULL); + + ret = h1_validate_connect_authority(authority, host); + if (ret < 0) { + if (h1m->err_pos < -1) { + state = H1_MSG_LAST_LF; + /* WT: gcc seems to see a path where sl.rq.u.ptr was used + * uninitialized, but it doesn't know that the function is + * called with initial states making this impossible. + */ + ALREADY_CHECKED(sl.rq.u.ptr); + ptr = ((ret == -1) ? sl.rq.u.ptr : host->ptr); /* Set ptr on the error */ + goto http_msg_invalid; + } + if (h1m->err_pos == -1) /* capture the error pointer */ + h1m->err_pos = ((ret == -1) ? 
sl.rq.u.ptr : host->ptr) - start + skip; /* >= 0 now */ + } + } + else if (host_idx != -1 && istlen(authority)) { + struct ist host = hdr[host_idx].v; + + /* For non-CONNECT method, the authority must match the host header value */ + if (!isteqi(authority, host)) { + ret = h1_validate_mismatch_authority(scheme, authority, host); + if (ret < 0) { + if (h1m->err_pos < -1) { + state = H1_MSG_LAST_LF; + ptr = host.ptr; /* Set ptr on the error */ + goto http_msg_invalid; + } + if (h1m->err_pos == -1) /* capture the error pointer */ + h1m->err_pos = v.ptr - start + skip; /* >= 0 now */ + } + } + } + } + + state = H1_MSG_DATA; + if (h1m->flags & H1_MF_XFER_ENC) { + if (h1m->flags & H1_MF_CLEN) { + /* T-E + C-L: force close and remove C-L */ + h1m->flags |= H1_MF_CONN_CLO; + h1m->flags &= ~H1_MF_CLEN; + h1m->curr_len = h1m->body_len = 0; + hdr_count = http_del_hdr(hdr, ist("content-length")); + } + else if (!(h1m->flags & H1_MF_VER_11)) { + /* T-E + HTTP/1.0: force close */ + h1m->flags |= H1_MF_CONN_CLO; + } + + if (h1m->flags & H1_MF_CHNK) + state = H1_MSG_CHUNK_SIZE; + else if (!(h1m->flags & H1_MF_RESP)) { + /* cf RFC7230#3.3.3 : transfer-encoding in + * request without chunked encoding is invalid. + */ + goto http_msg_invalid; + } + } + + break; + + default: + /* impossible states */ + goto http_msg_invalid; + } + + /* Now we've left the headers state and are either in H1_MSG_DATA or + * H1_MSG_CHUNK_SIZE. + */ + + if (slp && !skip_update) + *slp = sl; + + h1m->state = state; + h1m->next = ptr - start + skip; + return h1m->next; + + http_msg_ood: + /* out of data at <ptr> during state <state> */ + if (slp && !skip_update) + *slp = sl; + + h1m->state = state; + h1m->next = ptr - start + skip; + return 0; + + http_msg_invalid: + /* invalid message, error at <ptr> */ + if (slp && !skip_update) + *slp = sl; + + h1m->err_state = h1m->state = state; + h1m->err_pos = h1m->next = ptr - start + skip; + return -1; + + http_output_full: + /* no more room to store the current header, error at <ptr> */ + if (slp && !skip_update) + *slp = sl; + + h1m->err_state = h1m->state = state; + h1m->err_pos = h1m->next = ptr - start + skip; + return -2; + + restart: + h1m->flags &= H1_MF_RESTART_MASK; + h1m->curr_len = h1m->body_len = h1m->next = 0; + if (h1m->flags & H1_MF_RESP) + h1m->state = H1_MSG_RPBEFORE; + else + h1m->state = H1_MSG_RQBEFORE; + goto try_again; +} + +/* This function performs a very minimal parsing of the trailers block present + * at offset <ofs> in <buf> for up to <max> bytes, and returns the number of + * bytes to delete to skip the trailers. It may return 0 if it's missing some + * input data, or < 0 in case of parse error (in which case the caller may have + * to decide how to proceed, possibly eating everything). + */ +int h1_measure_trailers(const struct buffer *buf, unsigned int ofs, unsigned int max) +{ + const char *stop = b_peek(buf, ofs + max); + int count = ofs; + + while (1) { + const char *p1 = NULL, *p2 = NULL; + const char *start = b_peek(buf, count); + const char *ptr = start; + + /* scan current line and stop at LF or CRLF */ + while (1) { + if (ptr == stop) + return 0; + + if (*ptr == '\n') { + if (!p1) + p1 = ptr; + p2 = ptr; + break; + } + + if (*ptr == '\r') { + if (p1) + return -1; + p1 = ptr; + } + + ptr = b_next(buf, ptr); + } + + /* after LF; point to beginning of next line */ + p2 = b_next(buf, p2); + count += b_dist(buf, start, p2); + + /* LF/CRLF at beginning of line => end of trailers at p2. 
+ * Everything was scheduled for forwarding, there's nothing left
+ * from this message. */
+ if (p1 == start)
+ break;
+ /* OK, next line then */
+ }
+ return count - ofs;
+}
+
+/* Generate a random key for a WebSocket handshake in accordance with RFC 6455.
+ * The key is 128 bits long and is encoded as a base64 string into the
+ * <key_out> parameter (25 bytes long).
+ */
+void h1_generate_random_ws_input_key(char key_out[25])
+{
+ /* generate a random websocket key */
+ const uint64_t rand1 = ha_random64(), rand2 = ha_random64();
+ char key[16];
+
+ memcpy(key, &rand1, 8);
+ memcpy(&key[8], &rand2, 8);
+ a2base64(key, 16, key_out, 25);
+}
+
+#define H1_WS_KEY_SUFFIX_GUID "258EAFA5-E914-47DA-95CA-C5AB0DC85B11"
+
+/*
+ * Calculate the WebSocket handshake response key from <key>. Following
+ * RFC 6455, <key> must be 24 bytes long. The result is stored in <result>
+ * as a 29 bytes long string.
+ */
+void h1_calculate_ws_output_key(const char *key, char *result)
+{
+ blk_SHA_CTX sha1_ctx;
+ char hash_in[60], hash_out[20];
+
+ /* concatenate the key with a fixed suffix */
+ memcpy(hash_in, key, 24);
+ memcpy(&hash_in[24], H1_WS_KEY_SUFFIX_GUID, 36);
+
+ /* sha1 the result */
+ blk_SHA1_Init(&sha1_ctx);
+ blk_SHA1_Update(&sha1_ctx, hash_in, 60);
+ blk_SHA1_Final((unsigned char *)hash_out, &sha1_ctx);
+
+ /* encode in base64 the hash */
+ a2base64(hash_out, 20, result, 29);
+}
diff --git a/src/h1_htx.c b/src/h1_htx.c
new file mode 100644
index 0000000..f4f13fc
--- /dev/null
+++ b/src/h1_htx.c
@@ -0,0 +1,1074 @@
+/*
+ * Functions to manipulate H1 messages using the internal representation.
+ *
+ * Copyright (C) 2019 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <haproxy/api.h>
+#include <haproxy/cfgparse.h>
+#include <haproxy/global.h>
+#include <haproxy/h1.h>
+#include <haproxy/h1_htx.h>
+#include <haproxy/http.h>
+#include <haproxy/http_htx.h>
+#include <haproxy/htx.h>
+#include <haproxy/tools.h>
+
+/* Estimate the size of the HTX headers after the parsing, including the EOH. */
+static size_t h1_eval_htx_hdrs_size(const struct http_hdr *hdrs)
+{
+ size_t sz = 0;
+ int i;
+
+ for (i = 0; hdrs[i].n.len; i++)
+ sz += sizeof(struct htx_blk) + hdrs[i].n.len + hdrs[i].v.len;
+ sz += sizeof(struct htx_blk) + 1;
+ return sz;
+}
+
+/* Estimate the size of the HTX request after the parsing. */
+static size_t h1_eval_htx_size(const struct ist p1, const struct ist p2, const struct ist p3,
+                               const struct http_hdr *hdrs)
+{
+ size_t sz;
+
+ /* size of the HTX start-line */
+ sz = sizeof(struct htx_blk) + sizeof(struct htx_sl) + p1.len + p2.len + p3.len;
+ sz += h1_eval_htx_hdrs_size(hdrs);
+ return sz;
+}
+
+/* Check the validity of the request version. If the version is valid, it
+ * returns 1. Otherwise, it returns 0.
+ */
+static int h1_process_req_vsn(struct h1m *h1m, union h1_sl *sl)
+{
+ /* RFC7230#2.6 has enforced the format of the HTTP version string to be
+ * exactly one digit "." one digit. This check may be disabled using
+ * option accept-invalid-http-request.
+ */
+ if (h1m->err_pos == -2) { /* PR_O2_REQBUG_OK not set */
+ if (sl->rq.v.len != 8)
+ return 0;
+
+ if (!istnmatch(sl->rq.v, ist("HTTP/"), 5) ||
+ !isdigit((unsigned char)*(sl->rq.v.ptr + 5)) ||
+ *(sl->rq.v.ptr + 6) != '.'
|| + !isdigit((unsigned char)*(sl->rq.v.ptr + 7))) + return 0; + } + else if (!sl->rq.v.len) { + /* try to convert HTTP/0.9 requests to HTTP/1.0 */ + + /* RFC 1945 allows only GET for HTTP/0.9 requests */ + if (sl->rq.meth != HTTP_METH_GET) + return 0; + + /* HTTP/0.9 requests *must* have a request URI, per RFC 1945 */ + if (!sl->rq.u.len) + return 0; + + /* Add HTTP version */ + sl->rq.v = ist("HTTP/1.0"); + return 1; + } + + if ((sl->rq.v.len == 8) && + ((*(sl->rq.v.ptr + 5) > '1') || + ((*(sl->rq.v.ptr + 5) == '1') && (*(sl->rq.v.ptr + 7) >= '1')))) + h1m->flags |= H1_MF_VER_11; + return 1; +} + +/* Check the validity of the response version. If the version is valid, it + * returns 1. Otherwise, it returns 0. + */ +static int h1_process_res_vsn(struct h1m *h1m, union h1_sl *sl) +{ + /* RFC7230#2.6 has enforced the format of the HTTP version string to be + * exactly one digit "." one digit. This check may be disabled using + * option accept-invalid-http-request. + */ + if (h1m->err_pos == -2) { /* PR_O2_REQBUG_OK not set */ + if (sl->st.v.len != 8) + return 0; + + if (*(sl->st.v.ptr + 4) != '/' || + !isdigit((unsigned char)*(sl->st.v.ptr + 5)) || + *(sl->st.v.ptr + 6) != '.' || + !isdigit((unsigned char)*(sl->st.v.ptr + 7))) + return 0; + } + + if ((sl->st.v.len == 8) && + ((*(sl->st.v.ptr + 5) > '1') || + ((*(sl->st.v.ptr + 5) == '1') && (*(sl->st.v.ptr + 7) >= '1')))) + h1m->flags |= H1_MF_VER_11; + + return 1; +} + +/* Convert H1M flags to HTX start-line flags. */ +static unsigned int h1m_htx_sl_flags(struct h1m *h1m) +{ + unsigned int flags = HTX_SL_F_NONE; + + if (h1m->flags & H1_MF_RESP) + flags |= HTX_SL_F_IS_RESP; + if (h1m->flags & H1_MF_VER_11) + flags |= HTX_SL_F_VER_11; + if (h1m->flags & H1_MF_XFER_ENC) + flags |= HTX_SL_F_XFER_ENC; + if (h1m->flags & H1_MF_XFER_LEN) { + flags |= HTX_SL_F_XFER_LEN; + if (h1m->flags & H1_MF_CHNK) + flags |= HTX_SL_F_CHNK; + else if (h1m->flags & H1_MF_CLEN) { + flags |= HTX_SL_F_CLEN; + if (h1m->body_len == 0) + flags |= HTX_SL_F_BODYLESS; + } + else + flags |= HTX_SL_F_BODYLESS; + } + if (h1m->flags & H1_MF_CONN_UPG) + flags |= HTX_SL_F_CONN_UPG; + return flags; +} + +/* Postprocess the parsed headers for a request and convert them into an htx + * message. It returns the number of bytes parsed if > 0, or 0 if it couldn't + * proceed. Parsing errors are reported by setting the htx flag + * HTX_FL_PARSING_ERROR and filling h1m->err_pos and h1m->err_state fields. + */ +static int h1_postparse_req_hdrs(struct h1m *h1m, union h1_sl *h1sl, struct htx *htx, + struct http_hdr *hdrs, size_t max) +{ + struct htx_sl *sl; + struct ist meth, uri, vsn; + unsigned int flags = 0; + + /* <h1sl> is always defined for a request */ + meth = h1sl->rq.m; + uri = h1sl->rq.u; + vsn = h1sl->rq.v; + + /* Be sure the message, once converted into HTX, will not exceed the max + * size allowed. 
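As a worked example of the estimate made by h1_eval_htx_size() above: a request "GET / HTTP/1.1" carrying the single header "Host: x" would count sizeof(struct htx_blk) + sizeof(struct htx_sl) + 3 + 1 + 8 for the start-line, sizeof(struct htx_blk) + 4 + 1 for the header, and sizeof(struct htx_blk) + 1 for the end-of-headers marker.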
+ */
+ if (h1_eval_htx_size(meth, uri, vsn, hdrs) > max) {
+ if (htx_is_empty(htx))
+ goto error;
+ goto output_full;
+ }
+
+ /* By default, requests always have a known length */
+ h1m->flags |= H1_MF_XFER_LEN;
+
+ if (h1sl->rq.meth == HTTP_METH_CONNECT) {
+ h1m->flags &= ~(H1_MF_CLEN|H1_MF_CHNK);
+ h1m->curr_len = h1m->body_len = 0;
+ }
+ else if (h1sl->rq.meth == HTTP_METH_HEAD)
+ flags |= HTX_SL_F_BODYLESS_RESP;
+
+
+ flags |= h1m_htx_sl_flags(h1m);
+ if ((flags & (HTX_SL_F_CONN_UPG|HTX_SL_F_BODYLESS)) == HTX_SL_F_CONN_UPG) {
+ int i;
+
+ for (i = 0; hdrs[i].n.len; i++) {
+ if (isteqi(hdrs[i].n, ist("upgrade")))
+ hdrs[i].v = IST_NULL;
+ }
+ h1m->flags &=~ H1_MF_CONN_UPG;
+ flags &= ~HTX_SL_F_CONN_UPG;
+ }
+ sl = htx_add_stline(htx, HTX_BLK_REQ_SL, flags, meth, uri, vsn);
+ if (!sl || !htx_add_all_headers(htx, hdrs))
+ goto error;
+ sl->info.req.meth = h1sl->rq.meth;
+
+ /* Check if the uri contains an authority. Also check if it contains an
+ * explicit scheme and if it is "http" or "https". */
+ if (h1sl->rq.meth == HTTP_METH_CONNECT)
+ sl->flags |= HTX_SL_F_HAS_AUTHORITY;
+ else if (uri.len && uri.ptr[0] != '/' && uri.ptr[0] != '*') {
+ sl->flags |= (HTX_SL_F_HAS_AUTHORITY|HTX_SL_F_HAS_SCHM);
+ if (uri.len > 4 && (uri.ptr[0] | 0x20) == 'h')
+ sl->flags |= ((uri.ptr[4] == ':') ? HTX_SL_F_SCHM_HTTP : HTX_SL_F_SCHM_HTTPS);
+
+ /* absolute-form target URI present, proceed to scheme-based
+ * normalization */
+ http_scheme_based_normalize(htx);
+ }
+
+ /* If body length cannot be determined, set htx->extra to
+ * HTX_UNKOWN_PAYLOAD_LENGTH. This value is impossible in other cases.
+ */
+ htx->extra = ((h1m->flags & H1_MF_XFER_LEN) ? h1m->curr_len : HTX_UNKOWN_PAYLOAD_LENGTH);
+
+ end:
+ return 1;
+ output_full:
+ h1m_init_req(h1m);
+ h1m->flags |= (H1_MF_NO_PHDR|H1_MF_CLEAN_CONN_HDR);
+ return -2;
+ error:
+ h1m->err_pos = h1m->next;
+ h1m->err_state = h1m->state;
+ htx->flags |= HTX_FL_PARSING_ERROR;
+ return -1;
+}
+
+/* Postprocess the parsed headers for a response and convert them into an htx
+ * message. It returns the number of bytes parsed if > 0, or 0 if it couldn't
+ * proceed. Parsing errors are reported by setting the htx flag
+ * HTX_FL_PARSING_ERROR and filling h1m->err_pos and h1m->err_state fields.
+ */
+static int h1_postparse_res_hdrs(struct h1m *h1m, union h1_sl *h1sl, struct htx *htx,
+                                 struct http_hdr *hdrs, size_t max)
+{
+ struct htx_sl *sl;
+ struct ist vsn, status, reason;
+ unsigned int flags = 0;
+ uint16_t code = 0;
+
+ if (h1sl) {
+ /* For HTTP responses, the start-line was parsed */
+ code = h1sl->st.status;
+ vsn = h1sl->st.v;
+ status = h1sl->st.c;
+ reason = h1sl->st.r;
+ }
+ else {
+ /* For FCGI responses, there is no start-line, but the "Status"
+ * header must be parsed, if found.
+ */
+ int hdr;
+
+ vsn = ((h1m->flags & H1_MF_VER_11) ? ist("HTTP/1.1") : ist("HTTP/1.0"));
+ for (hdr = 0; hdrs[hdr].n.len; hdr++) {
+ if (isteqi(hdrs[hdr].n, ist("status"))) {
+ code = http_parse_status_val(hdrs[hdr].v, &status, &reason);
+ }
+ else if (isteqi(hdrs[hdr].n, ist("location"))) {
+ code = 302;
+ status = ist("302");
+ reason = ist("Found");
+ }
+ }
+ if (!code) {
+ code = 200;
+ status = ist("200");
+ reason = ist("OK");
+ }
+ /* FIXME: Check the codes 1xx ? */
+ }
+
+ /* Be sure the message, once converted into HTX, will not exceed the max
+ * size allowed.
+ */
+ if (h1_eval_htx_size(vsn, status, reason, hdrs) > max) {
+ if (htx_is_empty(htx))
+ goto error;
+ goto output_full;
+ }
+
+ if ((h1m->flags & (H1_MF_CONN_UPG|H1_MF_UPG_WEBSOCKET)) && code != 101)
+ h1m->flags &= ~(H1_MF_CONN_UPG|H1_MF_UPG_WEBSOCKET);
+
+ if (((h1m->flags & H1_MF_METH_CONNECT) && code >= 200 && code < 300) || code == 101) {
+ h1m->flags &= ~(H1_MF_CLEN|H1_MF_CHNK);
+ h1m->flags |= H1_MF_XFER_LEN;
+ h1m->curr_len = h1m->body_len = 0;
+ flags |= HTX_SL_F_BODYLESS_RESP;
+ }
+ else if ((h1m->flags & H1_MF_METH_HEAD) || (code >= 100 && code < 200) ||
+ (code == 204) || (code == 304)) {
+ /* Responses known to have no body. */
+ h1m->flags |= H1_MF_XFER_LEN;
+ h1m->curr_len = h1m->body_len = 0;
+ flags |= HTX_SL_F_BODYLESS_RESP;
+ }
+ else if (h1m->flags & (H1_MF_CLEN|H1_MF_CHNK)) {
+ /* Responses with a known body length. */
+ h1m->flags |= H1_MF_XFER_LEN;
+ }
+
+ flags |= h1m_htx_sl_flags(h1m);
+ sl = htx_add_stline(htx, HTX_BLK_RES_SL, flags, vsn, status, reason);
+ if (!sl || !htx_add_all_headers(htx, hdrs))
+ goto error;
+ sl->info.res.status = code;
+
+ /* If body length cannot be determined, set htx->extra to
+ * HTX_UNKOWN_PAYLOAD_LENGTH. This value is impossible in other cases.
+ */
+ htx->extra = ((h1m->flags & H1_MF_XFER_LEN) ? h1m->curr_len : HTX_UNKOWN_PAYLOAD_LENGTH);
+
+ end:
+ return 1;
+ output_full:
+ h1m_init_res(h1m);
+ h1m->flags |= (H1_MF_NO_PHDR|H1_MF_CLEAN_CONN_HDR);
+ return -2;
+ error:
+ h1m->err_pos = h1m->next;
+ h1m->err_state = h1m->state;
+ htx->flags |= HTX_FL_PARSING_ERROR;
+ return -1;
+}
+
+/* Parse HTTP/1 headers. It returns the number of bytes parsed on success, 0 if
+ * headers are incomplete, -1 if an error occurred or -2 if it needs more space
+ * to proceed while the output buffer is not empty. Parsing errors are reported
+ * by setting the htx flag HTX_FL_PARSING_ERROR and filling h1m->err_pos and
+ * h1m->err_state fields. This function is responsible for updating the parser
+ * state <h1m> and the start-line <h1sl> if not NULL. For requests, <h1sl>
+ * must always be provided. For responses, <h1sl> may be NULL and the <h1m>
+ * flags H1_MF_METH_CONNECT or H1_MF_METH_HEAD may be set.
+ */
+int h1_parse_msg_hdrs(struct h1m *h1m, union h1_sl *h1sl, struct htx *dsthtx,
+                      struct buffer *srcbuf, size_t ofs, size_t max)
+{
+ struct http_hdr hdrs[global.tune.max_http_hdr];
+ int total = 0, ret = 0;
+
+ if (!max || !b_data(srcbuf))
+ goto end;
+
+ /* Realign the input buffer if necessary */
+ if (b_head(srcbuf) + b_data(srcbuf) > b_wrap(srcbuf))
+ b_slow_realign_ofs(srcbuf, trash.area, 0);
+
+ if (!h1sl) {
+ /* If there is no start-line, be sure to only parse the headers */
+ h1m->flags |= H1_MF_HDRS_ONLY;
+ }
+ ret = h1_headers_to_hdr_list(b_peek(srcbuf, ofs), b_tail(srcbuf),
+                              hdrs, sizeof(hdrs)/sizeof(hdrs[0]), h1m, h1sl);
+ if (ret <= 0) {
+ /* Incomplete or invalid message. If the input buffer only
+ * contains headers and is full, which is detected by it being
+ * full and the offset to be zero, it's an error because
+ * headers are too large to be handled by the parser. */
+ if (ret < 0 || (!ret && !ofs && !buf_room_for_htx_data(srcbuf)))
+ goto error;
+ goto end;
+ }
+ total = ret;
+
+ /* message headers fully parsed, do some checks to prepare the body
+ * parsing.
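In practice, a hypothetical caller reacts to the return values documented above as follows: on -2 it first ships what the destination HTX already contains to free room, then retries; on -1 it reports a parsing error; on 0 it waits for more input; and on success it advances its input offset by the returned byte count.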
+ */
+
+ if (!(h1m->flags & H1_MF_RESP)) {
+ if (!h1_process_req_vsn(h1m, h1sl)) {
+ h1m->err_pos = h1sl->rq.v.ptr - b_head(srcbuf);
+ h1m->err_state = h1m->state;
+ goto vsn_error;
+ }
+ ret = h1_postparse_req_hdrs(h1m, h1sl, dsthtx, hdrs, max);
+ if (ret < 0)
+ return ret;
+ }
+ else {
+ if (h1sl && !h1_process_res_vsn(h1m, h1sl)) {
+ h1m->err_pos = h1sl->st.v.ptr - b_head(srcbuf);
+ h1m->err_state = h1m->state;
+ goto vsn_error;
+ }
+ ret = h1_postparse_res_hdrs(h1m, h1sl, dsthtx, hdrs, max);
+ if (ret < 0)
+ return ret;
+ }
+
+ /* Switch messages without any payload to DONE state */
+ if (((h1m->flags & H1_MF_CLEN) && h1m->body_len == 0) ||
+ ((h1m->flags & (H1_MF_XFER_LEN|H1_MF_CLEN|H1_MF_CHNK)) == H1_MF_XFER_LEN)) {
+ h1m->state = H1_MSG_DONE;
+ dsthtx->flags |= HTX_FL_EOM;
+ }
+
+ end:
+ return total;
+ error:
+ h1m->err_pos = h1m->next;
+ h1m->err_state = h1m->state;
+ vsn_error:
+ dsthtx->flags |= HTX_FL_PARSING_ERROR;
+ return -1;
+
+}
+
+/* Copy data from <srcbuf> into a DATA block in <dsthtx>. If possible, a
+ * zero-copy is performed. It returns the number of bytes copied.
+ */
+static size_t h1_copy_msg_data(struct htx **dsthtx, struct buffer *srcbuf, size_t ofs,
+                               size_t count, size_t max, struct buffer *htxbuf)
+{
+ struct htx *tmp_htx = *dsthtx;
+ size_t block1, block2, ret = 0;
+
+ /* Be prepared to create at least one HTX block by reserving its size
+ * and adjust <count> accordingly.
+ */
+ if (max <= sizeof(struct htx_blk))
+ goto end;
+ max -= sizeof(struct htx_blk);
+ if (count > max)
+ count = max;
+
+ /* very often with large files we'll face the following
+ * situation :
+ *   - htx is empty and points to <htxbuf>
+ *   - count == srcbuf->data
+ *   - srcbuf->head == sizeof(struct htx)
+ * => we can swap the buffers and place an htx header into
+ *    the target buffer instead
+ */
+ if (unlikely(htx_is_empty(tmp_htx) && count == b_data(srcbuf) &&
+ !ofs && b_head_ofs(srcbuf) == sizeof(struct htx))) {
+ void *raw_area = srcbuf->area;
+ void *htx_area = htxbuf->area;
+ struct htx_blk *blk;
+
+ srcbuf->area = htx_area;
+ htxbuf->area = raw_area;
+ tmp_htx = (struct htx *)htxbuf->area;
+ tmp_htx->size = htxbuf->size - sizeof(*tmp_htx);
+ htx_reset(tmp_htx);
+ b_set_data(htxbuf, b_size(htxbuf));
+
+ blk = htx_add_blk(tmp_htx, HTX_BLK_DATA, count);
+ blk->info += count;
+
+ *dsthtx = tmp_htx;
+ /* nothing else to do, the old buffer now contains an
+ * empty pre-initialized HTX header
+ */
+ return count;
+ }
+
+ /* * First block is the copy of contiguous data starting at offset <ofs>
+ *   with <count> as max. <max> is updated accordingly
+ *
+ * * Second block is the remaining (count - block1) if <max> is large
+ *   enough. Another HTX block is reserved.
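For instance, if <count> is 100 bytes but only 60 of them are contiguous before the source buffer wraps, block1 is capped to 60 and copied from b_peek(srcbuf, ofs), then block2 covers the remaining 40 bytes starting at b_orig(srcbuf), each part consuming its own HTX block out of <max>.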
+ */
+ block1 = b_contig_data(srcbuf, ofs);
+ block2 = 0;
+ if (block1 > count)
+ block1 = count;
+ max -= block1;
+
+ if (max > sizeof(struct htx_blk)) {
+ block2 = count - block1;
+ max -= sizeof(struct htx_blk);
+ if (block2 > max)
+ block2 = max;
+ }
+
+ ret = htx_add_data(tmp_htx, ist2(b_peek(srcbuf, ofs), block1));
+ if (ret == block1 && block2)
+ ret += htx_add_data(tmp_htx, ist2(b_orig(srcbuf), block2));
+ end:
+ return ret;
+}
+
+static const char hextable[] = {
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,
+ -1,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
+};
+
+/* Generic function to parse the current HTTP chunk. It may be used to parse
+ * any kind of chunk, including incomplete HTTP chunks or chunks split because
+ * the buffer wraps. This version tries to perform zero-copy on large chunks
+ * if possible.
+ */
+static size_t h1_parse_chunk(struct h1m *h1m, struct htx **dsthtx,
+                             struct buffer *srcbuf, size_t ofs, size_t *max,
+                             struct buffer *htxbuf)
+{
+ uint64_t chksz;
+ size_t sz, used, lmax, total = 0;
+ int ret = 0;
+
+ lmax = *max;
+ switch (h1m->state) {
+ case H1_MSG_DATA:
+ new_chunk:
+ used = htx_used_space(*dsthtx);
+ if (b_data(srcbuf) == ofs || lmax <= sizeof(struct htx_blk))
+ break;
+
+ sz = b_data(srcbuf) - ofs;
+ if (unlikely(sz > h1m->curr_len))
+ sz = h1m->curr_len;
+ sz = h1_copy_msg_data(dsthtx, srcbuf, ofs, sz, lmax, htxbuf);
+ lmax -= htx_used_space(*dsthtx) - used;
+ ofs += sz;
+ total += sz;
+ h1m->curr_len -= sz;
+ if (h1m->curr_len)
+ break;
+
+ h1m->state = H1_MSG_CHUNK_CRLF;
+ __fallthrough;
+
+ case H1_MSG_CHUNK_CRLF:
+ ret = h1_skip_chunk_crlf(srcbuf, ofs, b_data(srcbuf));
+ if (ret <= 0)
+ break;
+ ofs += ret;
+ total += ret;
+
+ /* Don't parse next chunk to try to handle contiguous chunks if possible */
+ h1m->state = H1_MSG_CHUNK_SIZE;
+ break;
+
+ case H1_MSG_CHUNK_SIZE:
+ ret = h1_parse_chunk_size(srcbuf, ofs, b_data(srcbuf), &chksz);
+ if (ret <= 0)
+ break;
+ h1m->state = ((!chksz) ? H1_MSG_TRAILERS : H1_MSG_DATA);
+ h1m->curr_len = chksz;
+ h1m->body_len += chksz;
+ ofs += ret;
+ total += ret;
+
+ if (h1m->curr_len) {
+ h1m->state = H1_MSG_DATA;
+ goto new_chunk;
+ }
+ h1m->state = H1_MSG_TRAILERS;
+ break;
+
+ default:
+ /* unexpected */
+ ret = -1;
+ break;
+ }
+
+ if (ret < 0) {
+ (*dsthtx)->flags |= HTX_FL_PARSING_ERROR;
+ h1m->err_state = h1m->state;
+ h1m->err_pos = ofs;
+ total = 0;
+ }
+
+ /* Don't forget to update htx->extra */
+ (*dsthtx)->extra = h1m->curr_len;
+ *max = lmax;
+ return total;
+}
+
+/* Parses full contiguous HTTP chunks. This version is optimized for small
+ * chunks and does not perform zero-copy. It must be called in
+ * H1_MSG_CHUNK_SIZE state. Be careful if you change something in this
+ * function. It is really sensitive, any change may have an impact on
+ * performance.
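As a reminder of the framing this state machine consumes: a chunked body such as "4\r\nWiki\r\n5\r\npedia\r\n0\r\n\r\n" carries the 9-byte payload "Wikipedia" in two chunks, each introduced by its hexadecimal size and terminated by CRLF, the zero-sized chunk announcing the trailers section.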
+ */ +static size_t h1_parse_full_contig_chunks(struct h1m *h1m, struct htx **dsthtx, + struct buffer *srcbuf, size_t ofs, size_t *max, + struct buffer *htxbuf) +{ + char *start, *end, *dptr; + ssize_t dpos, ridx, save; + size_t lmax, total = 0; + uint64_t chksz; + struct htx_ret htxret; + + lmax = *max; + if (lmax <= sizeof(struct htx_blk)) + goto out; + + /* source info : + * start : pointer at <ofs> position + * end : pointer marking the end of data to parse + * ridx : the reverse index (negative) marking the parser position (end[ridx]) + */ + ridx = -b_contig_data(srcbuf, ofs); + if (!ridx) + goto out; + start = b_peek(srcbuf, ofs); + end = start - ridx; + + /* Reserve the maximum possible size for the data */ + htxret = htx_reserve_max_data(*dsthtx); + if (!htxret.blk) + goto out; + + /* destination info : + * dptr : pointer on the beginning of the data + * dpos : current position where to copy data + */ + dptr = htx_get_blk_ptr(*dsthtx, htxret.blk); + dpos = htxret.ret; + + /* Empty DATA block is not possible, thus if <dpos> is the beginning of + * the block, it means it is a new block. We can remove the block size + * from <max>. Then we must adjust it if it exceeds the free size in the + * block. + */ + if (!dpos) + lmax -= sizeof(struct htx_blk); + if (lmax > htx_get_blksz(htxret.blk) - dpos) + lmax = htx_get_blksz(htxret.blk) - dpos; + + while (1) { + /* The chunk size is in the following form, though we are only + * interested in the size and CRLF : + * 1*HEXDIGIT *WSP *[ ';' extensions ] CRLF + */ + chksz = 0; + save = ridx; /* Save the parser position to rewind if necessary */ + while (1) { + int c; + + if (!ridx) + goto end_parsing; + + /* Convert current character */ + c = hextable[(unsigned char)end[ridx]]; + + /* not a hex digit anymore */ + if (c & 0xF0) + break; + + /* Update current chunk size */ + chksz = (chksz << 4) + c; + + if (unlikely(chksz & 0xF0000000000000ULL)) { + /* Don't get more than 13 hexa-digit (2^52 - 1) + * to never fed possibly bogus values from + * languages that use floats for their integers + */ + goto parsing_error; + } + ++ridx; + } + + if (unlikely(chksz > lmax)) + goto end_parsing; + + if (unlikely(ridx == save)) { + /* empty size not allowed */ + goto parsing_error; + } + + /* Skip spaces */ + while (HTTP_IS_SPHT(end[ridx])) { + if (!++ridx) + goto end_parsing; + } + + /* Up to there, we know that at least one byte is present. Check + * for the end of chunk size. + */ + while (1) { + if (likely(end[ridx] == '\r')) { + /* Parse CRLF */ + if (!++ridx) + goto end_parsing; + if (unlikely(end[ridx] != '\n')) { + /* CR must be followed by LF */ + goto parsing_error; + } + + /* done */ + ++ridx; + break; + } + else if (likely(end[ridx] == ';')) { + /* chunk extension, ends at next CRLF */ + if (!++ridx) + goto end_parsing; + while (!HTTP_IS_CRLF(end[ridx])) { + if (!++ridx) + goto end_parsing; + } + /* we have a CRLF now, loop above */ + continue; + } + else { + /* all other characters are unexpected, especially LF alone */ + goto parsing_error; + } + } + + /* Exit if it is the last chunk */ + if (unlikely(!chksz)) { + h1m->state = H1_MSG_TRAILERS; + save = ridx; + goto end_parsing; + } + + /* Now check if the whole chunk is here (including the CRLF at + * the end), otherwise we switch in H1_MSG_DATA state. 
+                 */
+                if (chksz + 2 > -ridx) {
+                        h1m->curr_len = chksz;
+                        h1m->body_len += chksz;
+                        h1m->state = H1_MSG_DATA;
+                        (*dsthtx)->extra = h1m->curr_len;
+                        save = ridx;
+                        goto end_parsing;
+                }
+
+                memcpy(dptr + dpos, end + ridx, chksz);
+                h1m->body_len += chksz;
+                lmax -= chksz;
+                dpos += chksz;
+                ridx += chksz;
+
+                /* Parse CRLF */
+                if (unlikely(end[ridx] != '\r')) {
+                        h1m->state = H1_MSG_CHUNK_CRLF;
+                        goto parsing_error;
+                }
+                ++ridx;
+                if (end[ridx] != '\n') {
+                        h1m->state = H1_MSG_CHUNK_CRLF;
+                        goto parsing_error;
+                }
+                ++ridx;
+        }
+
+  end_parsing:
+        ridx = save;
+
+        /* Adjust the HTX block size, or remove the block if nothing was
+         * copied (empty HTX DATA blocks are not supported).
+         */
+        if (!dpos)
+                htx_remove_blk(*dsthtx, htxret.blk);
+        else
+                htx_change_blk_value_len(*dsthtx, htxret.blk, dpos);
+        total = end + ridx - start;
+        *max = lmax;
+
+  out:
+        return total;
+
+  parsing_error:
+        (*dsthtx)->flags |= HTX_FL_PARSING_ERROR;
+        h1m->err_state = h1m->state;
+        h1m->err_pos = ofs + end + ridx - start;
+        return 0;
+}
+
+/* Parse HTTP chunks. This function relies on an optimized function to parse
+ * contiguous chunks if possible. Otherwise, when a chunk is incomplete or when
+ * the underlying buffer is wrapping, a generic function is used.
+ */
+static size_t h1_parse_msg_chunks(struct h1m *h1m, struct htx **dsthtx,
+                                  struct buffer *srcbuf, size_t ofs, size_t max,
+                                  struct buffer *htxbuf)
+{
+        size_t ret, total = 0;
+
+        while (ofs < b_data(srcbuf)) {
+                ret = 0;
+
+                /* First parse full contiguous chunks. This is only possible
+                 * if we are waiting for the next chunk size.
+                 */
+                if (h1m->state == H1_MSG_CHUNK_SIZE) {
+                        ret = h1_parse_full_contig_chunks(h1m, dsthtx, srcbuf, ofs, &max, htxbuf);
+                        /* exit on error */
+                        if (!ret && (*dsthtx)->flags & HTX_FL_PARSING_ERROR) {
+                                total = 0;
+                                break;
+                        }
+                        /* otherwise give the generic parser a chance to
+                         * handle the remaining data */
+                        total += ret;
+                        ofs += ret;
+                        ret = 0;
+                }
+
+                /* If some data remain, try to parse them using the generic
+                 * function handling incomplete chunks and chunks that are
+                 * split because of a wrapping buffer.
+                 */
+                if (h1m->state < H1_MSG_TRAILERS && ofs < b_data(srcbuf)) {
+                        ret = h1_parse_chunk(h1m, dsthtx, srcbuf, ofs, &max, htxbuf);
+                        total += ret;
+                        ofs += ret;
+                }
+
+                /* If nothing more was parsed, or if parsing was stopped on an
+                 * incomplete chunk, we can exit, handling a parsing error if
+                 * necessary.
+                 */
+                if (!ret || h1m->state != H1_MSG_CHUNK_SIZE) {
+                        if ((*dsthtx)->flags & HTX_FL_PARSING_ERROR)
+                                total = 0;
+                        break;
+                }
+        }
+
+        return total;
+}
+
+/* Parse an HTTP/1 body. It returns the number of bytes parsed if > 0, or 0 if
+ * it couldn't proceed. Parsing errors are reported by setting the htx flag
+ * HTX_FL_PARSING_ERROR and filling the h1m->err_pos and h1m->err_state fields.
+ * This function is responsible for updating the parser state <h1m>.
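+ *
+ * Illustrative walk-through of the content-length case below: with
+ * "Content-Length: 5" announced (h1m->curr_len == 5) and only 3 bytes
+ * available, those 3 bytes are copied, <curr_len> drops to 2 and htx->extra
+ * is set to 2 so the consumer knows how much body is still expected; a later
+ * call with the last 2 bytes completes the message (H1_MSG_DONE, HTX_FL_EOM).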
+ */
+size_t h1_parse_msg_data(struct h1m *h1m, struct htx **dsthtx,
+                         struct buffer *srcbuf, size_t ofs, size_t max,
+                         struct buffer *htxbuf)
+{
+        size_t sz, total = 0;
+
+        if (b_data(srcbuf) == ofs)
+                return 0;
+
+        if (h1m->flags & H1_MF_CLEN) {
+                /* content-length: read only up to h1m->curr_len bytes */
+                sz = b_data(srcbuf) - ofs;
+                if (unlikely(sz > h1m->curr_len))
+                        sz = h1m->curr_len;
+                sz = h1_copy_msg_data(dsthtx, srcbuf, ofs, sz, max, htxbuf);
+                h1m->curr_len -= sz;
+                (*dsthtx)->extra = h1m->curr_len;
+                total += sz;
+                if (!h1m->curr_len) {
+                        h1m->state = H1_MSG_DONE;
+                        (*dsthtx)->flags |= HTX_FL_EOM;
+                }
+        }
+        else if (h1m->flags & H1_MF_CHNK) {
+                /* te:chunked : parse chunks */
+                total += h1_parse_msg_chunks(h1m, dsthtx, srcbuf, ofs, max, htxbuf);
+        }
+        else if (h1m->flags & H1_MF_XFER_LEN) {
+                /* XFER_LEN is set but neither CLEN nor CHNK, it means there
+                 * is no body. Switch the message to the DONE state.
+                 */
+                h1m->state = H1_MSG_DONE;
+                (*dsthtx)->flags |= HTX_FL_EOM;
+        }
+        else {
+                /* no content length, read till SHUTW */
+                sz = b_data(srcbuf) - ofs;
+                sz = h1_copy_msg_data(dsthtx, srcbuf, ofs, sz, max, htxbuf);
+                total += sz;
+        }
+
+        return total;
+}
+
+/* Parse HTTP/1 trailers. It returns the number of bytes parsed on success, 0
+ * if trailers are incomplete, -1 if an error occurred or -2 if it needs more
+ * space to proceed while the output buffer is not empty. Parsing errors are
+ * reported by setting the htx flag HTX_FL_PARSING_ERROR and filling the
+ * h1m->err_pos and h1m->err_state fields. This function is responsible for
+ * updating the parser state <h1m>.
+ */
+int h1_parse_msg_tlrs(struct h1m *h1m, struct htx *dsthtx,
+                      struct buffer *srcbuf, size_t ofs, size_t max)
+{
+        struct http_hdr hdrs[global.tune.max_http_hdr];
+        struct h1m tlr_h1m;
+        int ret = 0;
+
+        if (b_data(srcbuf) == ofs) {
+                /* Nothing to parse */
+                goto end;
+        }
+        if (!max) {
+                /* No more room */
+                goto output_full;
+        }
+
+        /* Realign the input buffer if necessary */
+        if (b_peek(srcbuf, ofs) > b_tail(srcbuf))
+                b_slow_realign_ofs(srcbuf, trash.area, 0);
+
+        tlr_h1m.flags = (H1_MF_NO_PHDR|H1_MF_HDRS_ONLY);
+        tlr_h1m.err_pos = h1m->err_pos;
+        ret = h1_headers_to_hdr_list(b_peek(srcbuf, ofs), b_tail(srcbuf),
+                                     hdrs, sizeof(hdrs)/sizeof(hdrs[0]), &tlr_h1m, NULL);
+        if (ret <= 0) {
+                /* Incomplete or invalid trailers. If the input buffer only
+                 * contains trailers and is full, which is detected by it
+                 * being full and the offset being zero, it's an error because
+                 * the trailers are too large to be handled by the parser. */
+                if (ret < 0 || (!ret && !ofs && !buf_room_for_htx_data(srcbuf)))
+                        goto error;
+                goto end;
+        }
+
+        /* message trailers fully parsed. */
+        if (h1_eval_htx_hdrs_size(hdrs) > max) {
+                if (htx_is_empty(dsthtx))
+                        goto error;
+                goto output_full;
+        }
+
+        if (!htx_add_all_trailers(dsthtx, hdrs))
+                goto error;
+
+        h1m->state = H1_MSG_DONE;
+        dsthtx->flags |= HTX_FL_EOM;
+
+  end:
+        return ret;
+  output_full:
+        return -2;
+  error:
+        h1m->err_state = h1m->state;
+        h1m->err_pos = h1m->next;
+        dsthtx->flags |= HTX_FL_PARSING_ERROR;
+        return -1;
+}
+
+/* Appends the H1 representation of the request line <sl> to the chunk <chk>.
+ * It returns 1 if data are successfully appended, otherwise it returns 0.
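+ *
+ * For example, a start line carrying the method "GET", the URI "/index.html"
+ * and the HTX_SL_F_VER_11 flag is serialized as:
+ *
+ *     GET /index.html HTTP/1.1\r\n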
+ */ +int h1_format_htx_reqline(const struct htx_sl *sl, struct buffer *chk) +{ + struct ist uri; + size_t sz = chk->data; + + uri = h1_get_uri(sl); + if (!chunk_memcat(chk, HTX_SL_REQ_MPTR(sl), HTX_SL_REQ_MLEN(sl)) || + !chunk_memcat(chk, " ", 1) || + !chunk_memcat(chk, uri.ptr, uri.len) || + !chunk_memcat(chk, " ", 1)) + goto full; + + if (sl->flags & HTX_SL_F_VER_11) { + if (!chunk_memcat(chk, "HTTP/1.1", 8)) + goto full; + } + else { + if (!chunk_memcat(chk, HTX_SL_REQ_VPTR(sl), HTX_SL_REQ_VLEN(sl))) + goto full; + } + + if (!chunk_memcat(chk, "\r\n", 2)) + goto full; + + return 1; + + full: + chk->data = sz; + return 0; +} + +/* Appends the H1 representation of the status line <sl> to the chunk <chk>. It + * returns 1 if data are successfully appended, otherwise it returns 0. + */ +int h1_format_htx_stline(const struct htx_sl *sl, struct buffer *chk) +{ + size_t sz = chk->data; + + if (HTX_SL_LEN(sl) + 4 > b_room(chk)) + return 0; + + if (sl->flags & HTX_SL_F_VER_11) { + if (!chunk_memcat(chk, "HTTP/1.1", 8)) + goto full; + } + else { + if (!chunk_memcat(chk, HTX_SL_RES_VPTR(sl), HTX_SL_RES_VLEN(sl))) + goto full; + } + if (!chunk_memcat(chk, " ", 1) || + !chunk_memcat(chk, HTX_SL_RES_CPTR(sl), HTX_SL_RES_CLEN(sl)) || + !chunk_memcat(chk, " ", 1) || + !chunk_memcat(chk, HTX_SL_RES_RPTR(sl), HTX_SL_RES_RLEN(sl)) || + !chunk_memcat(chk, "\r\n", 2)) + goto full; + + return 1; + + full: + chk->data = sz; + return 0; +} + +/* Appends the H1 representation of the header <n> with the value <v> to the + * chunk <chk>. It returns 1 if data are successfully appended, otherwise it + * returns 0. + */ +int h1_format_htx_hdr(const struct ist n, const struct ist v, struct buffer *chk) +{ + size_t sz = chk->data; + + if (n.len + v.len + 4 > b_room(chk)) + return 0; + + if (!chunk_memcat(chk, n.ptr, n.len) || + !chunk_memcat(chk, ": ", 2) || + !chunk_memcat(chk, v.ptr, v.len) || + !chunk_memcat(chk, "\r\n", 2)) + goto full; + + return 1; + + full: + chk->data = sz; + return 0; +} + +/* Appends the H1 representation of the data <data> to the chunk <chk>. If + * <chunked> is non-zero, it emits HTTP/1 chunk-encoded data. It returns 1 if + * data are successfully appended, otherwise it returns 0. 
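+ *
+ * For example, emitting the 5-byte payload "Hello" with a non-zero <chunked>
+ * argument produces the chunk-encoded sequence:
+ *
+ *     5\r\nHello\r\n
+ *
+ * while a zero <chunked> argument appends the payload verbatim.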
+ */ +int h1_format_htx_data(const struct ist data, struct buffer *chk, int chunked) +{ + size_t sz = chk->data; + + if (chunked) { + uint32_t chksz; + char tmp[10]; + char *beg, *end; + + chksz = data.len; + + beg = end = tmp+10; + *--beg = '\n'; + *--beg = '\r'; + do { + *--beg = hextab[chksz & 0xF]; + } while (chksz >>= 4); + + if (!chunk_memcat(chk, beg, end - beg) || + !chunk_memcat(chk, data.ptr, data.len) || + !chunk_memcat(chk, "\r\n", 2)) + goto full; + } + else { + if (!chunk_memcat(chk, data.ptr, data.len)) + return 0; + } + + return 1; + + full: + chk->data = sz; + return 0; +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/h2.c b/src/h2.c new file mode 100644 index 0000000..9c60cc6 --- /dev/null +++ b/src/h2.c @@ -0,0 +1,814 @@ +/* + * HTTP/2 protocol processing + * + * Copyright 2017 Willy Tarreau <w@1wt.eu> + * Copyright (C) 2017 HAProxy Technologies + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <inttypes.h> +#include <haproxy/api.h> +#include <haproxy/global.h> +#include <haproxy/h2.h> +#include <haproxy/http-hdr-t.h> +#include <haproxy/http.h> +#include <haproxy/http_htx.h> +#include <haproxy/htx.h> +#include <import/ist.h> + + +struct h2_frame_definition h2_frame_definition[H2_FT_ENTRIES] = { + [H2_FT_DATA ] = { .dir = 3, .min_id = 1, .max_id = H2_MAX_STREAM_ID, .min_len = 0, .max_len = H2_MAX_FRAME_LEN, }, + [H2_FT_HEADERS ] = { .dir = 3, .min_id = 1, .max_id = H2_MAX_STREAM_ID, .min_len = 1, .max_len = H2_MAX_FRAME_LEN, }, + [H2_FT_PRIORITY ] = { .dir = 3, .min_id = 1, .max_id = H2_MAX_STREAM_ID, .min_len = 5, .max_len = 5, }, + [H2_FT_RST_STREAM ] = { .dir = 3, .min_id = 1, .max_id = H2_MAX_STREAM_ID, .min_len = 4, .max_len = 4, }, + [H2_FT_SETTINGS ] = { .dir = 3, .min_id = 0, .max_id = 0, .min_len = 0, .max_len = H2_MAX_FRAME_LEN, }, + [H2_FT_PUSH_PROMISE ] = { .dir = 0, .min_id = 1, .max_id = H2_MAX_STREAM_ID, .min_len = 4, .max_len = H2_MAX_FRAME_LEN, }, + [H2_FT_PING ] = { .dir = 3, .min_id = 0, .max_id = 0, .min_len = 8, .max_len = 8, }, + [H2_FT_GOAWAY ] = { .dir = 3, .min_id = 0, .max_id = 0, .min_len = 8, .max_len = H2_MAX_FRAME_LEN, }, + [H2_FT_WINDOW_UPDATE] = { .dir = 3, .min_id = 0, .max_id = H2_MAX_STREAM_ID, .min_len = 4, .max_len = 4, }, + [H2_FT_CONTINUATION ] = { .dir = 3, .min_id = 1, .max_id = H2_MAX_STREAM_ID, .min_len = 0, .max_len = H2_MAX_FRAME_LEN, }, +}; + +/* Prepare the request line into <htx> from pseudo headers stored in <phdr[]>. 
+ * <fields> indicates what was found so far. This should be called once at the + * detection of the first general header field or at the end of the request if + * no general header field was found yet. Returns the created start line on + * success, or NULL on failure. Upon success, <msgf> is updated with a few + * H2_MSGF_* flags indicating what was found while parsing. + * + * The rules below deserve a bit of explanation. There tends to be some + * confusion regarding H2's authority vs the Host header. They are different + * though may sometimes be exchanged. In H2, the request line is broken into : + * - :method + * - :scheme + * - :authority + * - :path + * + * An equivalent HTTP/1.x absolute-form request would then look like : + * <:method> <:scheme>://<:authority><:path> HTTP/x.y + * + * Except for CONNECT which doesn't have scheme nor path and looks like : + * <:method> <:authority> HTTP/x.y + * + * It's worth noting that H2 still supports an encoding to map H1 origin-form + * and asterisk-form requests. These ones do not specify the authority. However + * in H2 they must still specify the scheme, which is not present in H1. Also, + * when encoding an absolute-form H1 request without a path, the path + * automatically becomes "/" except for the OPTIONS method where it + * becomes "*". + * + * As such it is explicitly permitted for an H2 client to send a request + * featuring a Host header and no :authority, though it's not the recommended + * way to use H2 for a client. It is however the only permitted way to encode + * an origin-form H1 request over H2. Thus we need to respect such differences + * as much as possible when re-encoding the H2 request into HTX. + */ +static struct htx_sl *h2_prepare_htx_reqline(uint32_t fields, struct ist *phdr, struct htx *htx, unsigned int *msgf) +{ + struct ist uri, meth_sl; + unsigned int flags = HTX_SL_F_NONE; + struct htx_sl *sl; + enum http_meth_t meth; + size_t i; + + if ((fields & H2_PHDR_FND_METH) && isteq(phdr[H2_PHDR_IDX_METH], ist("CONNECT"))) { + if (fields & H2_PHDR_FND_PROT) { + /* rfc 8441 Extended Connect Protocol + * #4 :scheme and :path must be present, as well as + * :authority like all h2 requests + */ + if (!(fields & H2_PHDR_FND_SCHM)) { + /* missing scheme */ + goto fail; + } + else if (!(fields & H2_PHDR_FND_PATH)) { + /* missing path */ + goto fail; + } + else if (!(fields & H2_PHDR_FND_AUTH)) { + /* missing authority */ + goto fail; + } + + flags |= HTX_SL_F_HAS_SCHM; + if (isteqi(phdr[H2_PHDR_IDX_SCHM], ist("http"))) + flags |= HTX_SL_F_SCHM_HTTP; + else if (isteqi(phdr[H2_PHDR_IDX_SCHM], ist("https"))) + flags |= HTX_SL_F_SCHM_HTTPS; + else if (!http_validate_scheme(phdr[H2_PHDR_IDX_SCHM])) + htx->flags |= HTX_FL_PARSING_ERROR; + + meth_sl = ist("GET"); + + *msgf |= H2_MSGF_EXT_CONNECT; + /* no ES on the HEADERS frame but no body either for + * Extended CONNECT */ + *msgf &= ~H2_MSGF_BODY; + } + else { + /* RFC 7540 #8.2.6 regarding CONNECT: ":scheme" and ":path" + * MUST be omitted ; ":authority" contains the host and port + * to connect to. 
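+ *
+ * For instance (illustrative values), a plain H2 CONNECT encoded as
+ *     :method = CONNECT, :authority = www.example.com:443
+ * maps to the HTTP/1.1 equivalent:
+ *     CONNECT www.example.com:443 HTTP/1.1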
+ */ + if (fields & H2_PHDR_FND_SCHM) { + /* scheme not allowed */ + goto fail; + } + else if (fields & H2_PHDR_FND_PATH) { + /* path not allowed */ + goto fail; + } + else if (!(fields & H2_PHDR_FND_AUTH)) { + /* missing authority */ + goto fail; + } + + meth_sl = phdr[H2_PHDR_IDX_METH]; + } + + *msgf |= H2_MSGF_BODY_TUNNEL; + } + else if ((fields & (H2_PHDR_FND_METH|H2_PHDR_FND_SCHM|H2_PHDR_FND_PATH)) != + (H2_PHDR_FND_METH|H2_PHDR_FND_SCHM|H2_PHDR_FND_PATH)) { + /* RFC 7540 #8.1.2.3 : all requests MUST include exactly one + * valid value for the ":method", ":scheme" and ":path" phdr + * unless it is a CONNECT request. + */ + if (!(fields & H2_PHDR_FND_METH)) { + /* missing method */ + goto fail; + } + else if (!(fields & H2_PHDR_FND_SCHM)) { + /* missing scheme */ + goto fail; + } + else { + /* missing path */ + goto fail; + } + } + else { /* regular methods */ + /* RFC3986#6.2.2.1: scheme is case-insensitive. We need to + * classify the scheme as "present/http", "present/https", + * "present/other", "absent" so as to decide whether or not + * we're facing a normalized URI that will have to be encoded + * in origin or absolute form. Indeed, 7540#8.1.2.3 says that + * clients should use the absolute form, thus we cannot infer + * whether or not the client wanted to use a proxy here. + */ + flags |= HTX_SL_F_HAS_SCHM; + if (isteqi(phdr[H2_PHDR_IDX_SCHM], ist("http"))) + flags |= HTX_SL_F_SCHM_HTTP; + else if (isteqi(phdr[H2_PHDR_IDX_SCHM], ist("https"))) + flags |= HTX_SL_F_SCHM_HTTPS; + else if (!http_validate_scheme(phdr[H2_PHDR_IDX_SCHM])) + htx->flags |= HTX_FL_PARSING_ERROR; + + meth_sl = phdr[H2_PHDR_IDX_METH]; + } + + if (fields & H2_PHDR_FND_PATH) { + /* 7540#8.1.2.3: :path must not be empty, and must be either + * '*' or an RFC3986 "path-absolute" starting with a "/" but + * not with "//". + * However, this "path-absolute" was a mistake which was + * later fixed in http2bis as "absolute-path" to match + * HTTP/1, thus also allowing "//". + */ + if (unlikely(!phdr[H2_PHDR_IDX_PATH].len)) + goto fail; + else if (unlikely(phdr[H2_PHDR_IDX_PATH].ptr[0] != '/')) { + if (!isteq(phdr[H2_PHDR_IDX_PATH], ist("*"))) + goto fail; + } + } + + if (!(flags & HTX_SL_F_HAS_SCHM)) { + /* no scheme, use authority only (CONNECT) */ + uri = phdr[H2_PHDR_IDX_AUTH]; + flags |= HTX_SL_F_HAS_AUTHORITY; + } + else if (fields & H2_PHDR_FND_AUTH) { + /* authority is present, let's use the absolute form. We simply + * use the trash to concatenate them since all of them MUST fit + * in a bufsize since it's where they come from. + */ + uri = ist2bin(trash.area, phdr[H2_PHDR_IDX_SCHM]); + istcat(&uri, ist("://"), trash.size); + istcat(&uri, phdr[H2_PHDR_IDX_AUTH], trash.size); + if (!isteq(phdr[H2_PHDR_IDX_PATH], ist("*"))) + istcat(&uri, phdr[H2_PHDR_IDX_PATH], trash.size); + flags |= HTX_SL_F_HAS_AUTHORITY; + + if (flags & (HTX_SL_F_SCHM_HTTP|HTX_SL_F_SCHM_HTTPS)) { + /* we don't know if it was originally an absolute or a + * relative request because newer versions of HTTP use + * the absolute URI format by default, which we call + * the normalized URI format internally. This is the + * strongly recommended way of sending a request for + * a regular client, so we cannot distinguish this + * from a request intended for a proxy. For other + * schemes however there is no doubt. 
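+ *
+ * For instance (illustrative values), the pseudo-headers
+ *     :scheme = https, :authority = www.example.com, :path = /
+ * were concatenated above into the absolute-form URI
+ *     https://www.example.com/
+ * which is what gets flagged as normalized below.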
+ */ + flags |= HTX_SL_F_NORMALIZED_URI; + } + } + else { + /* usual schemes with or without authority, use origin form */ + uri = phdr[H2_PHDR_IDX_PATH]; + if (fields & H2_PHDR_FND_AUTH) + flags |= HTX_SL_F_HAS_AUTHORITY; + } + + /* The method is a non-empty token (RFC7231#4.1) */ + if (!meth_sl.len) + goto fail; + for (i = 0; i < meth_sl.len; i++) { + if (!HTTP_IS_TOKEN(meth_sl.ptr[i])) + htx->flags |= HTX_FL_PARSING_ERROR; + } + + /* make sure the final URI isn't empty. Note that 7540#8.1.2.3 states + * that :path must not be empty. + */ + if (!uri.len) + goto fail; + + /* The final URI must not contain LWS nor CTL characters */ + for (i = 0; i < uri.len; i++) { + unsigned char c = uri.ptr[i]; + if (HTTP_IS_LWS(c) || HTTP_IS_CTL(c)) + htx->flags |= HTX_FL_PARSING_ERROR; + } + + /* Set HTX start-line flags */ + flags |= HTX_SL_F_VER_11; // V2 in fact + flags |= HTX_SL_F_XFER_LEN; // xfer len always known with H2 + + + meth = find_http_meth(meth_sl.ptr, meth_sl.len); + if (meth == HTTP_METH_HEAD) { + *msgf |= H2_MSGF_BODYLESS_RSP; + flags |= HTX_SL_F_BODYLESS_RESP; + } + + sl = htx_add_stline(htx, HTX_BLK_REQ_SL, flags, meth_sl, uri, ist("HTTP/2.0")); + if (!sl) + goto fail; + sl->info.req.meth = meth; + return sl; + fail: + return NULL; +} + +/* Takes an H2 request present in the headers list <list> terminated by a name + * being <NULL,0> and emits the equivalent HTX request according to the rules + * documented in RFC7540 #8.1.2. The output contents are emitted in <htx>, and + * non-zero is returned if some bytes were emitted. In case of error, a + * negative error code is returned. + * + * Upon success, <msgf> is filled with a few H2_MSGF_* flags indicating what + * was found while parsing. The caller must set it to zero in or H2_MSGF_BODY + * if a body is detected (!ES). + * + * The headers list <list> must be composed of : + * - n.name != NULL, n.len > 0 : literal header name + * - n.name == NULL, n.len > 0 : indexed pseudo header name number <n.len> + * among H2_PHDR_IDX_* + * - n.name ignored, n.len == 0 : end of list + * - in all cases except the end of list, v.name and v.len must designate a + * valid value. + * + * The Cookie header will be reassembled at the end, and for this, the <list> + * will be used to create a linked list, so its contents may be destroyed. + * + * When <relaxed> is non-nul, some non-dangerous checks will be ignored. This + * is in order to satisfy "option accept-invalid-http-request" for + * interoperability purposes. + */ +int h2_make_htx_request(struct http_hdr *list, struct htx *htx, unsigned int *msgf, unsigned long long *body_len, int relaxed) +{ + struct ist phdr_val[H2_PHDR_NUM_ENTRIES]; + uint32_t fields; /* bit mask of H2_PHDR_FND_* */ + uint32_t idx; + int ck, lck; /* cookie index and last cookie index */ + int phdr; + int ret; + int i; + struct htx_sl *sl = NULL; + unsigned int sl_flags = 0; + const char *ctl; + + lck = ck = -1; // no cookie for now + fields = 0; + for (idx = 0; list[idx].n.len != 0; idx++) { + if (!isttest(list[idx].n)) { + /* this is an indexed pseudo-header */ + phdr = list[idx].n.len; + } + else { + /* this can be any type of header */ + /* RFC7540#8.1.2: upper case not allowed in header field names. + * #10.3: header names must be valid (i.e. match a token). + * For pseudo-headers we check from 2nd char and for other ones + * from the first char, because HTTP_IS_TOKEN() also excludes + * the colon. 
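+ *
+ * As a worked example of the test below: for an uppercase 'G',
+ * (uint8_t)('G' - 'A') is 6, which is caught by the range check, while bytes
+ * below 'A' (such as digits or '-') wrap around to large unsigned values and
+ * are then only accepted if HTTP_IS_TOKEN() holds for them.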
+ */ + phdr = h2_str_to_phdr(list[idx].n); + + for (i = !!phdr; i < list[idx].n.len; i++) + if ((uint8_t)(list[idx].n.ptr[i] - 'A') < 'Z' - 'A' || !HTTP_IS_TOKEN(list[idx].n.ptr[i])) + goto fail; + } + + /* RFC7540#10.3: intermediaries forwarding to HTTP/1 must take care of + * rejecting NUL, CR and LF characters. For :path we reject all CTL + * chars, spaces, and '#'. + */ + if (phdr == H2_PHDR_IDX_PATH && !relaxed) { + ctl = ist_find_range(list[idx].v, 0, '#'); + if (unlikely(ctl) && http_path_has_forbidden_char(list[idx].v, ctl)) + goto fail; + } else { + ctl = ist_find_ctl(list[idx].v); + if (unlikely(ctl) && http_header_has_forbidden_char(list[idx].v, ctl)) + goto fail; + } + + if (phdr > 0 && phdr < H2_PHDR_NUM_ENTRIES) { + /* insert a pseudo header by its index (in phdr) and value (in value) */ + if (fields & ((1 << phdr) | H2_PHDR_FND_NONE)) { + if (fields & H2_PHDR_FND_NONE) { + /* pseudo header field after regular headers */ + goto fail; + } + else { + /* repeated pseudo header field */ + goto fail; + } + } + fields |= 1 << phdr; + phdr_val[phdr] = list[idx].v; + continue; + } + else if (phdr != 0) { + /* invalid pseudo header -- should never happen here */ + goto fail; + } + + /* regular header field in (name,value) */ + if (unlikely(!(fields & H2_PHDR_FND_NONE))) { + /* no more pseudo-headers, time to build the request line */ + sl = h2_prepare_htx_reqline(fields, phdr_val, htx, msgf); + if (!sl) + goto fail; + fields |= H2_PHDR_FND_NONE; + + /* http2bis draft recommends to drop Host in favor of :authority when + * the latter is present. This is required to make sure there is no + * discrepancy between the authority and the host header, especially + * since routing rules usually involve Host. Here we already know if + * :authority was found so we can emit it right now and mark the host + * as filled so that it's skipped later. + */ + if (fields & H2_PHDR_FND_AUTH) { + if (!htx_add_header(htx, ist("host"), phdr_val[H2_PHDR_IDX_AUTH])) + goto fail; + fields |= H2_PHDR_FND_HOST; + } + } + + if (isteq(list[idx].n, ist("host"))) { + if (fields & H2_PHDR_FND_HOST) + continue; + + fields |= H2_PHDR_FND_HOST; + } + + if (isteq(list[idx].n, ist("content-length"))) { + ret = http_parse_cont_len_header(&list[idx].v, body_len, + *msgf & H2_MSGF_BODY_CL); + if (ret < 0) + goto fail; + + *msgf |= H2_MSGF_BODY_CL; + sl_flags |= HTX_SL_F_CLEN; + if (ret == 0) + continue; // skip this duplicate + } + + /* these ones are forbidden in requests (RFC7540#8.1.2.2) */ + if (isteq(list[idx].n, ist("connection")) || + isteq(list[idx].n, ist("proxy-connection")) || + isteq(list[idx].n, ist("keep-alive")) || + isteq(list[idx].n, ist("upgrade")) || + isteq(list[idx].n, ist("transfer-encoding"))) + goto fail; + + if (isteq(list[idx].n, ist("te")) && !isteq(list[idx].v, ist("trailers"))) + goto fail; + + /* cookie requires special processing at the end */ + if (isteq(list[idx].n, ist("cookie"))) { + http_cookie_register(list, idx, &ck, &lck); + continue; + } + + if (!htx_add_header(htx, list[idx].n, list[idx].v)) + goto fail; + } + + /* RFC7540#8.1.2.1 mandates to reject response pseudo-headers (:status) */ + if (fields & H2_PHDR_FND_STAT) + goto fail; + + /* Let's dump the request now if not yet emitted. 
 */
+        if (!(fields & H2_PHDR_FND_NONE)) {
+                sl = h2_prepare_htx_reqline(fields, phdr_val, htx, msgf);
+                if (!sl)
+                        goto fail;
+        }
+
+        if (*msgf & H2_MSGF_BODY_TUNNEL)
+                *msgf &= ~(H2_MSGF_BODY|H2_MSGF_BODY_CL);
+
+        if (!(*msgf & H2_MSGF_BODY) || ((*msgf & H2_MSGF_BODY_CL) && *body_len == 0) ||
+            (*msgf & H2_MSGF_BODY_TUNNEL)) {
+                /* Request without body or tunnel requested */
+                sl_flags |= HTX_SL_F_BODYLESS;
+                htx->flags |= HTX_FL_EOM;
+        }
+
+        if (*msgf & H2_MSGF_EXT_CONNECT) {
+                if (!htx_add_header(htx, ist("upgrade"), phdr_val[H2_PHDR_IDX_PROT]))
+                        goto fail;
+                if (!htx_add_header(htx, ist("connection"), ist("upgrade")))
+                        goto fail;
+                sl_flags |= HTX_SL_F_CONN_UPG;
+        }
+
+        /* update the start line with last detected header info */
+        sl->flags |= sl_flags;
+
+        /* complete with missing Host if needed (we may validate this test if
+         * no regular header was found).
+         */
+        if ((fields & (H2_PHDR_FND_HOST|H2_PHDR_FND_AUTH)) == H2_PHDR_FND_AUTH) {
+                /* missing Host field, use :authority instead */
+                if (!htx_add_header(htx, ist("host"), phdr_val[H2_PHDR_IDX_AUTH]))
+                        goto fail;
+        }
+
+        /* now we may have to build a cookie list. We'll dump the values of all
+         * visited headers.
+         */
+        if (ck >= 0) {
+                if (http_cookie_merge(htx, list, ck))
+                        goto fail;
+        }
+
+        /* now send the end of headers marker */
+        if (!htx_add_endof(htx, HTX_BLK_EOH))
+                goto fail;
+
+        /* proceed to scheme-based normalization on target-URI */
+        if (fields & H2_PHDR_FND_SCHM)
+                http_scheme_based_normalize(htx);
+
+        ret = 1;
+        return ret;
+
+ fail:
+        return -1;
+}
+
+/* Prepare the status line into <htx> from pseudo headers stored in <phdr[]>.
+ * <fields> indicates what was found so far. This should be called once at the
+ * detection of the first general header field or at the end of the message if
+ * no general header field was found yet. Returns the created start line on
+ * success, or NULL on failure. Upon success, <msgf> is updated with a few
+ * H2_MSGF_* flags indicating what was found while parsing.
+ */
+static struct htx_sl *h2_prepare_htx_stsline(uint32_t fields, struct ist *phdr, struct htx *htx, unsigned int *msgf)
+{
+        unsigned int status, flags = HTX_SL_F_IS_RESP;
+        struct htx_sl *sl;
+        struct ist stat;
+
+        /* only :status is allowed as a pseudo header */
+        if (!(fields & H2_PHDR_FND_STAT))
+                goto fail;
+
+        if (phdr[H2_PHDR_IDX_STAT].len != 3)
+                goto fail;
+
+        /* if Extended CONNECT is used, convert the status code from 200 to an
+         * HTX 101, following RFC 8441 */
+        if (unlikely(*msgf & H2_MSGF_EXT_CONNECT) &&
+            isteq(phdr[H2_PHDR_IDX_STAT], ist("200"))) {
+                stat = ist("101");
+                status = 101;
+        }
+        else {
+                unsigned char h, t, u;
+
+                stat = phdr[H2_PHDR_IDX_STAT];
+
+                h = stat.ptr[0] - '0';
+                t = stat.ptr[1] - '0';
+                u = stat.ptr[2] - '0';
+                if (h > 9 || t > 9 || u > 9)
+                        goto fail;
+                status = h * 100 + t * 10 + u;
+        }
+
+        /* 101 responses are not supported in H2, so return an error.
+         * On 1xx responses there is no ES on the HEADERS frame, but there is
+         * no body either. So remove the H2_MSGF_BODY flag and add
+         * H2_MSGF_RSP_1XX to notify the decoder that another HEADERS frame is
+         * expected.
+         * 204/304 responses have no body by definition. So remove the flag
+         * H2_MSGF_BODY and set H2_MSGF_BODYLESS_RSP.
+         *
+         * Note however that there is a special condition for Extended CONNECT.
+ * In this case, we explicitly convert it to HTX 101 to mimic + * Get+Upgrade HTTP/1.1 mechanism + */ + if (status == 101) { + if (!(*msgf & H2_MSGF_EXT_CONNECT)) + goto fail; + } + else if (status < 200) { + *msgf |= H2_MSGF_RSP_1XX; + *msgf &= ~H2_MSGF_BODY; + } + else if (status == 204 || status == 304) { + *msgf &= ~H2_MSGF_BODY; + *msgf |= H2_MSGF_BODYLESS_RSP; + flags |= HTX_SL_F_BODYLESS_RESP; + } + + /* Set HTX start-line flags */ + flags |= HTX_SL_F_VER_11; // V2 in fact + flags |= HTX_SL_F_XFER_LEN; // xfer len always known with H2 + + sl = htx_add_stline(htx, HTX_BLK_RES_SL, flags, ist("HTTP/2.0"), stat, ist("")); + if (!sl) + goto fail; + sl->info.res.status = status; + return sl; + fail: + return NULL; +} + +/* Takes an H2 response present in the headers list <list> terminated by a name + * being <NULL,0> and emits the equivalent HTX response according to the rules + * documented in RFC7540 #8.1.2. The output contents are emitted in <htx>, and + * a positive value is returned if some bytes were emitted. In case of error, a + * negative error code is returned. + * + * Upon success, <msgf> is filled with a few H2_MSGF_* flags indicating what + * was found while parsing. The caller must set it to zero in or H2_MSGF_BODY + * if a body is detected (!ES). + * + * The headers list <list> must be composed of : + * - n.name != NULL, n.len > 0 : literal header name + * - n.name == NULL, n.len > 0 : indexed pseudo header name number <n.len> + * among H2_PHDR_IDX_* + * - n.name ignored, n.len == 0 : end of list + * - in all cases except the end of list, v.name and v.len must designate a + * valid value. + * + * <upgrade_protocol> is only used if the htx status code is 101 indicating a + * response to an upgrade or h2-equivalent request. + */ +int h2_make_htx_response(struct http_hdr *list, struct htx *htx, unsigned int *msgf, unsigned long long *body_len, char *upgrade_protocol) +{ + struct ist phdr_val[H2_PHDR_NUM_ENTRIES]; + uint32_t fields; /* bit mask of H2_PHDR_FND_* */ + uint32_t idx; + int phdr; + int ret; + int i; + struct htx_sl *sl = NULL; + unsigned int sl_flags = 0; + const char *ctl; + + fields = 0; + for (idx = 0; list[idx].n.len != 0; idx++) { + if (!isttest(list[idx].n)) { + /* this is an indexed pseudo-header */ + phdr = list[idx].n.len; + } + else { + /* this can be any type of header */ + /* RFC7540#8.1.2: upper case not allowed in header field names. + * #10.3: header names must be valid (i.e. match a token). + * For pseudo-headers we check from 2nd char and for other ones + * from the first char, because HTTP_IS_TOKEN() also excludes + * the colon. + */ + phdr = h2_str_to_phdr(list[idx].n); + + for (i = !!phdr; i < list[idx].n.len; i++) + if ((uint8_t)(list[idx].n.ptr[i] - 'A') < 'Z' - 'A' || !HTTP_IS_TOKEN(list[idx].n.ptr[i])) + goto fail; + } + + /* RFC7540#10.3: intermediaries forwarding to HTTP/1 must take care of + * rejecting NUL, CR and LF characters. 
+ */ + ctl = ist_find_ctl(list[idx].v); + if (unlikely(ctl) && http_header_has_forbidden_char(list[idx].v, ctl)) + goto fail; + + if (phdr > 0 && phdr < H2_PHDR_NUM_ENTRIES) { + /* insert a pseudo header by its index (in phdr) and value (in value) */ + if (fields & ((1 << phdr) | H2_PHDR_FND_NONE)) { + if (fields & H2_PHDR_FND_NONE) { + /* pseudo header field after regular headers */ + goto fail; + } + else { + /* repeated pseudo header field */ + goto fail; + } + } + fields |= 1 << phdr; + phdr_val[phdr] = list[idx].v; + continue; + } + else if (phdr != 0) { + /* invalid pseudo header -- should never happen here */ + goto fail; + } + + /* regular header field in (name,value) */ + if (!(fields & H2_PHDR_FND_NONE)) { + /* no more pseudo-headers, time to build the status line */ + sl = h2_prepare_htx_stsline(fields, phdr_val, htx, msgf); + if (!sl) + goto fail; + fields |= H2_PHDR_FND_NONE; + } + + if (isteq(list[idx].n, ist("content-length"))) { + ret = http_parse_cont_len_header(&list[idx].v, body_len, + *msgf & H2_MSGF_BODY_CL); + if (ret < 0) + goto fail; + + *msgf |= H2_MSGF_BODY_CL; + sl_flags |= HTX_SL_F_CLEN; + if (ret == 0) + continue; // skip this duplicate + } + + /* these ones are forbidden in responses (RFC7540#8.1.2.2) */ + if (isteq(list[idx].n, ist("connection")) || + isteq(list[idx].n, ist("proxy-connection")) || + isteq(list[idx].n, ist("keep-alive")) || + isteq(list[idx].n, ist("upgrade")) || + isteq(list[idx].n, ist("transfer-encoding"))) + goto fail; + + if (!htx_add_header(htx, list[idx].n, list[idx].v)) + goto fail; + } + + /* RFC7540#8.1.2.1 mandates to reject request pseudo-headers */ + if (fields & (H2_PHDR_FND_AUTH|H2_PHDR_FND_METH|H2_PHDR_FND_PATH|H2_PHDR_FND_SCHM)) + goto fail; + + /* Let's dump the request now if not yet emitted. */ + if (!(fields & H2_PHDR_FND_NONE)) { + sl = h2_prepare_htx_stsline(fields, phdr_val, htx, msgf); + if (!sl) + goto fail; + } + + if (sl->info.res.status == 101 && upgrade_protocol) { + if (!htx_add_header(htx, ist("connection"), ist("upgrade"))) + goto fail; + if (!htx_add_header(htx, ist("upgrade"), ist(upgrade_protocol))) + goto fail; + sl_flags |= HTX_SL_F_CONN_UPG; + } + + if ((*msgf & H2_MSGF_BODY_TUNNEL) && + ((sl->info.res.status >= 200 && sl->info.res.status < 300) || sl->info.res.status == 101)) + *msgf &= ~(H2_MSGF_BODY|H2_MSGF_BODY_CL); + else + *msgf &= ~H2_MSGF_BODY_TUNNEL; + + if (!(*msgf & H2_MSGF_BODY) || ((*msgf & H2_MSGF_BODY_CL) && *body_len == 0) || + (*msgf & H2_MSGF_BODY_TUNNEL)) { + /* Response without body or tunnel successfully established */ + sl_flags |= HTX_SL_F_BODYLESS; + htx->flags |= HTX_FL_EOM; + } + + /* update the start line with last detected header info */ + sl->flags |= sl_flags; + + if ((*msgf & (H2_MSGF_BODY|H2_MSGF_BODY_TUNNEL|H2_MSGF_BODY_CL)) == H2_MSGF_BODY) { + /* FIXME: Do we need to signal anything when we have a body and + * no content-length, to have the equivalent of H1's chunked + * encoding? + */ + } + + /* now send the end of headers marker */ + if (!htx_add_endof(htx, HTX_BLK_EOH)) + goto fail; + + ret = 1; + return ret; + + fail: + return -1; +} + +/* Takes an H2 headers list <list> terminated by a name being <NULL,0> and emits + * the equivalent HTX trailers blocks. The output contents are emitted in <htx>, + * and a positive value is returned if some bytes were emitted. In case of + * error, a negative error code is returned. The caller must have verified that + * the message in the buffer is compatible with receipt of trailers. 
+ * + * The headers list <list> must be composed of : + * - n.name != NULL, n.len > 0 : literal header name + * - n.name == NULL, n.len > 0 : indexed pseudo header name number <n.len> + * among H2_PHDR_IDX_* (illegal here) + * - n.name ignored, n.len == 0 : end of list + * - in all cases except the end of list, v.name and v.len must designate a + * valid value. + */ +int h2_make_htx_trailers(struct http_hdr *list, struct htx *htx) +{ + const char *ctl; + uint32_t idx; + int i; + + for (idx = 0; list[idx].n.len != 0; idx++) { + if (!isttest(list[idx].n)) { + /* This is an indexed pseudo-header (RFC7540#8.1.2.1) */ + goto fail; + } + + /* RFC7540#8.1.2: upper case not allowed in header field names. + * #10.3: header names must be valid (i.e. match a token). This + * also catches pseudo-headers which are forbidden in trailers. + */ + for (i = 0; i < list[idx].n.len; i++) + if ((uint8_t)(list[idx].n.ptr[i] - 'A') < 'Z' - 'A' || !HTTP_IS_TOKEN(list[idx].n.ptr[i])) + goto fail; + + /* these ones are forbidden in trailers (RFC7540#8.1.2.2) */ + if (isteq(list[idx].n, ist("host")) || + isteq(list[idx].n, ist("content-length")) || + isteq(list[idx].n, ist("connection")) || + isteq(list[idx].n, ist("proxy-connection")) || + isteq(list[idx].n, ist("keep-alive")) || + isteq(list[idx].n, ist("upgrade")) || + isteq(list[idx].n, ist("te")) || + isteq(list[idx].n, ist("transfer-encoding"))) + goto fail; + + /* RFC7540#10.3: intermediaries forwarding to HTTP/1 must take care of + * rejecting NUL, CR and LF characters. + */ + ctl = ist_find_ctl(list[idx].v); + if (unlikely(ctl) && http_header_has_forbidden_char(list[idx].v, ctl)) + goto fail; + + if (!htx_add_trailer(htx, list[idx].n, list[idx].v)) + goto fail; + } + + if (!htx_add_endof(htx, HTX_BLK_EOT)) + goto fail; + + return 1; + + fail: + return -1; +} diff --git a/src/h3.c b/src/h3.c new file mode 100644 index 0000000..4aa1a52 --- /dev/null +++ b/src/h3.c @@ -0,0 +1,2403 @@ +/* + * HTTP/3 protocol processing + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <import/ist.h> + +#include <haproxy/api.h> +#include <haproxy/buf.h> +#include <haproxy/chunk.h> +#include <haproxy/connection.h> +#include <haproxy/dynbuf.h> +#include <haproxy/h3.h> +#include <haproxy/h3_stats.h> +#include <haproxy/http.h> +#include <haproxy/http-hdr-t.h> +#include <haproxy/http_htx.h> +#include <haproxy/htx.h> +#include <haproxy/intops.h> +#include <haproxy/istbuf.h> +#include <haproxy/mux_quic.h> +#include <haproxy/pool.h> +#include <haproxy/qmux_http.h> +#include <haproxy/qpack-dec.h> +#include <haproxy/qpack-enc.h> +#include <haproxy/quic_conn-t.h> +#include <haproxy/quic_enc.h> +#include <haproxy/quic_frame.h> +#include <haproxy/stats-t.h> +#include <haproxy/tools.h> +#include <haproxy/trace.h> + +/* trace source and events */ +static void h3_trace(enum trace_level level, uint64_t mask, + const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4); + +static const struct trace_event h3_trace_events[] = { +#define H3_EV_RX_FRAME (1ULL << 0) + { .mask = H3_EV_RX_FRAME, .name = "rx_frame", .desc = "receipt of any H3 frame" }, +#define H3_EV_RX_DATA (1ULL << 1) + { .mask = H3_EV_RX_DATA, .name = "rx_data", .desc = "receipt of H3 DATA frame" }, +#define H3_EV_RX_HDR (1ULL << 2) + { .mask = H3_EV_RX_HDR, .name = "rx_hdr", .desc = "receipt of H3 HEADERS frame" }, +#define H3_EV_RX_SETTINGS (1ULL << 3) + { .mask = H3_EV_RX_SETTINGS, .name = "rx_settings", .desc = "receipt of H3 SETTINGS frame" }, +#define H3_EV_TX_DATA (1ULL << 4) + { .mask = H3_EV_TX_DATA, .name = "tx_data", .desc = "transmission of H3 DATA frame" }, +#define H3_EV_TX_HDR (1ULL << 5) + { .mask = H3_EV_TX_HDR, .name = "tx_hdr", .desc = "transmission of H3 HEADERS frame" }, +#define H3_EV_TX_SETTINGS (1ULL << 6) + { .mask = H3_EV_TX_SETTINGS, .name = "tx_settings", .desc = "transmission of H3 SETTINGS frame" }, +#define H3_EV_H3S_NEW (1ULL << 7) + { .mask = H3_EV_H3S_NEW, .name = "h3s_new", .desc = "new H3 stream" }, +#define H3_EV_H3S_END (1ULL << 8) + { .mask = H3_EV_H3S_END, .name = "h3s_end", .desc = "H3 stream terminated" }, +#define H3_EV_H3C_NEW (1ULL << 9) + { .mask = H3_EV_H3C_NEW, .name = "h3c_new", .desc = "new H3 connection" }, +#define H3_EV_H3C_END (1ULL << 10) + { .mask = H3_EV_H3C_END, .name = "h3c_end", .desc = "H3 connection terminated" }, +#define H3_EV_STRM_SEND (1ULL << 12) + { .mask = H3_EV_STRM_SEND, .name = "strm_send", .desc = "sending data for stream" }, + { } +}; + +static const struct name_desc h3_trace_lockon_args[4] = { + /* arg1 */ { /* already used by the connection */ }, + /* arg2 */ { .name="qcs", .desc="QUIC stream" }, + /* arg3 */ { }, + /* arg4 */ { } +}; + +static const struct name_desc h3_trace_decoding[] = { +#define H3_VERB_CLEAN 1 + { .name="clean", .desc="only user-friendly stuff, generally suitable for level \"user\"" }, +#define H3_VERB_MINIMAL 2 + { .name="minimal", .desc="report only qcc/qcs state and flags, no real decoding" }, + { /* end */ } +}; + +struct trace_source trace_h3 = { + .name = IST("h3"), + .desc = "HTTP/3 transcoder", + .arg_def = TRC_ARG1_CONN, /* TRACE()'s first argument is always a connection */ + .default_cb = h3_trace, + .known_events = h3_trace_events, + .lockon_args = h3_trace_lockon_args, + .decoding = 
h3_trace_decoding, + .report_events = ~0, /* report everything by default */ +}; + +#define TRACE_SOURCE &trace_h3 +INITCALL1(STG_REGISTER, trace_register_source, TRACE_SOURCE); + +#if defined(DEBUG_H3) +#define h3_debug_printf fprintf +#define h3_debug_hexdump debug_hexdump +#else +#define h3_debug_printf(...) do { } while (0) +#define h3_debug_hexdump(...) do { } while (0) +#endif + +#define H3_CF_SETTINGS_SENT 0x00000001 /* SETTINGS frame already sent on local control stream */ +#define H3_CF_SETTINGS_RECV 0x00000002 /* SETTINGS frame already received on remote control stream */ +#define H3_CF_UNI_CTRL_SET 0x00000004 /* Remote H3 Control stream opened */ +#define H3_CF_UNI_QPACK_DEC_SET 0x00000008 /* Remote QPACK decoder stream opened */ +#define H3_CF_UNI_QPACK_ENC_SET 0x00000010 /* Remote QPACK encoder stream opened */ +#define H3_CF_GOAWAY_SENT 0x00000020 /* GOAWAY sent on local control stream */ + +/* Default settings */ +static uint64_t h3_settings_qpack_max_table_capacity = 0; +static uint64_t h3_settings_qpack_blocked_streams = 4096; +static uint64_t h3_settings_max_field_section_size = QUIC_VARINT_8_BYTE_MAX; /* Unlimited */ + +struct h3c { + struct qcc *qcc; + struct qcs *ctrl_strm; /* Control stream */ + enum h3_err err; + uint32_t flags; + + /* Settings */ + uint64_t qpack_max_table_capacity; + uint64_t qpack_blocked_streams; + uint64_t max_field_section_size; + + uint64_t id_goaway; /* stream ID used for a GOAWAY frame */ + + struct buffer_wait buf_wait; /* wait list for buffer allocations */ + /* Stats counters */ + struct h3_counters *prx_counters; +}; + +DECLARE_STATIC_POOL(pool_head_h3c, "h3c", sizeof(struct h3c)); + +#define H3_SF_UNI_INIT 0x00000001 /* stream type not parsed for unidirectional stream */ +#define H3_SF_UNI_NO_H3 0x00000002 /* unidirectional stream does not carry H3 frames */ +#define H3_SF_HAVE_CLEN 0x00000004 /* content-length header is present */ + +struct h3s { + struct h3c *h3c; + + enum h3s_t type; + enum h3s_st_req st_req; /* only used for request streams */ + uint64_t demux_frame_len; + uint64_t demux_frame_type; + + unsigned long long body_len; /* known request body length from content-length header if present */ + unsigned long long data_len; /* total length of all parsed DATA */ + + int flags; + int err; /* used for stream reset */ +}; + +DECLARE_STATIC_POOL(pool_head_h3s, "h3s", sizeof(struct h3s)); + +/* Initialize an uni-stream <qcs> by reading its type from <b>. + * + * Returns the count of consumed bytes or a negative error code. + */ +static ssize_t h3_init_uni_stream(struct h3c *h3c, struct qcs *qcs, + struct buffer *b) +{ + /* decode unidirectional stream type */ + struct h3s *h3s = qcs->ctx; + uint64_t type; + size_t len = 0, ret; + + TRACE_ENTER(H3_EV_H3S_NEW, qcs->qcc->conn, qcs); + + /* Function reserved to uni streams. Must be called only once per stream instance. 
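+ * As a reminder, the stream type decoded below is a QUIC varint; per RFC
+ * 9114 and RFC 9204 the defined values are 0x00 (control), 0x01 (push),
+ * 0x02 (QPACK encoder) and 0x03 (QPACK decoder).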
*/ + BUG_ON(!quic_stream_is_uni(qcs->id) || h3s->flags & H3_SF_UNI_INIT); + + ret = b_quic_dec_int(&type, b, &len); + if (!ret) { + /* not enough data to decode uni stream type, retry later */ + TRACE_DATA("cannot decode uni stream type due to incomplete data", H3_EV_H3S_NEW, qcs->qcc->conn, qcs); + goto out; + } + + switch (type) { + case H3_UNI_S_T_CTRL: + if (h3c->flags & H3_CF_UNI_CTRL_SET) { + TRACE_ERROR("duplicated control stream", H3_EV_H3S_NEW, qcs->qcc->conn, qcs); + qcc_set_error(qcs->qcc, H3_STREAM_CREATION_ERROR, 1); + goto err; + } + h3c->flags |= H3_CF_UNI_CTRL_SET; + h3s->type = H3S_T_CTRL; + break; + + case H3_UNI_S_T_PUSH: + /* TODO not supported for the moment */ + h3s->type = H3S_T_PUSH; + break; + + case H3_UNI_S_T_QPACK_DEC: + if (h3c->flags & H3_CF_UNI_QPACK_DEC_SET) { + TRACE_ERROR("duplicated qpack decoder stream", H3_EV_H3S_NEW, qcs->qcc->conn, qcs); + qcc_set_error(qcs->qcc, H3_STREAM_CREATION_ERROR, 1); + goto err; + } + h3c->flags |= H3_CF_UNI_QPACK_DEC_SET; + h3s->type = H3S_T_QPACK_DEC; + h3s->flags |= H3_SF_UNI_NO_H3; + break; + + case H3_UNI_S_T_QPACK_ENC: + if (h3c->flags & H3_CF_UNI_QPACK_ENC_SET) { + TRACE_ERROR("duplicated qpack encoder stream", H3_EV_H3S_NEW, qcs->qcc->conn, qcs); + qcc_set_error(qcs->qcc, H3_STREAM_CREATION_ERROR, 1); + goto err; + } + h3c->flags |= H3_CF_UNI_QPACK_ENC_SET; + h3s->type = H3S_T_QPACK_ENC; + h3s->flags |= H3_SF_UNI_NO_H3; + break; + + default: + /* draft-ietf-quic-http34 9. Extensions to HTTP/3 + * + * Implementations MUST [...] abort reading on unidirectional + * streams that have unknown or unsupported types. + */ + TRACE_STATE("abort reading on unknown uni stream type", H3_EV_H3S_NEW, qcs->qcc->conn, qcs); + qcc_abort_stream_read(qcs); + goto err; + } + + h3s->flags |= H3_SF_UNI_INIT; + + out: + TRACE_LEAVE(H3_EV_H3S_NEW, qcs->qcc->conn, qcs); + return len; + + err: + TRACE_DEVEL("leaving on error", H3_EV_H3S_NEW, qcs->qcc->conn, qcs); + return -1; +} + +/* Parse a buffer <b> for a <qcs> uni-stream which does not contains H3 frames. + * This may be used for QPACK encoder/decoder streams for example. <fin> is set + * if this is the last frame of the stream. + * + * Returns the number of consumed bytes or a negative error code. + */ +static ssize_t h3_parse_uni_stream_no_h3(struct qcs *qcs, struct buffer *b, int fin) +{ + struct h3s *h3s = qcs->ctx; + + /* Function reserved to non-HTTP/3 unidirectional streams. */ + BUG_ON(!quic_stream_is_uni(qcs->id) || !(h3s->flags & H3_SF_UNI_NO_H3)); + + switch (h3s->type) { + case H3S_T_QPACK_DEC: + if (qpack_decode_dec(b, fin, qcs)) + return -1; + break; + case H3S_T_QPACK_ENC: + if (qpack_decode_enc(b, fin, qcs)) + return -1; + break; + case H3S_T_UNKNOWN: + default: + /* Unknown stream should be flagged with QC_SF_READ_ABORTED. */ + ABORT_NOW(); + } + + /* TODO adjust return code */ + return 0; +} + +/* Decode a H3 frame header from <rxbuf> buffer. The frame type is stored in + * <ftype> and length in <flen>. + * + * Returns the size of the H3 frame header. Note that the input buffer is not + * consumed. + */ +static inline size_t h3_decode_frm_header(uint64_t *ftype, uint64_t *flen, + struct buffer *b) +{ + size_t hlen; + + hlen = 0; + if (!b_quic_dec_int(ftype, b, &hlen) || + !b_quic_dec_int(flen, b, &hlen)) { + return 0; + } + + return hlen; +} + +/* Check if H3 frame of type <ftype> is valid when received on stream <qcs>. + * + * Returns 0 if frame valid, otherwise HTTP/3 error code. 
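+ *
+ * For reference, an H3 frame starts with two QUIC varints, the type then the
+ * length; e.g. the two bytes 0x00 0x05 announce a DATA frame (type 0x00)
+ * carrying 5 bytes of payload.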
+ */ +static int h3_check_frame_valid(struct h3c *h3c, struct qcs *qcs, uint64_t ftype) +{ + struct h3s *h3s = qcs->ctx; + int ret = 0; + + /* Stream type must be known to ensure frame is valid for this stream. */ + BUG_ON(h3s->type == H3S_T_UNKNOWN); + + switch (ftype) { + case H3_FT_DATA: + /* cf H3_FT_HEADERS case. */ + if (h3s->type == H3S_T_CTRL || + (h3s->st_req != H3S_ST_REQ_HEADERS && h3s->st_req != H3S_ST_REQ_DATA)) { + ret = H3_FRAME_UNEXPECTED; + } + + break; + + case H3_FT_HEADERS: + /* RFC 9114 4.1. HTTP Message Framing + * + * + * An HTTP message (request or response) consists of: + * 1. the header section, including message control data, sent as a + * single HEADERS frame, + * 2. optionally, the content, if present, sent as a series of DATA + * frames, and + * 3. optionally, the trailer section, if present, sent as a single + * HEADERS frame. + * + * [...] + * + * Receipt of an invalid sequence of frames MUST be treated as a + * connection error of type H3_FRAME_UNEXPECTED. In particular, a DATA + * frame before any HEADERS frame, or a HEADERS or DATA frame after the + * trailing HEADERS frame, is considered invalid. Other frame types, + * especially unknown frame types, might be permitted subject to their + * own rules; see Section 9. + */ + if (h3s->type == H3S_T_CTRL || h3s->st_req == H3S_ST_REQ_TRAILERS) + ret = H3_FRAME_UNEXPECTED; + break; + + case H3_FT_CANCEL_PUSH: + case H3_FT_GOAWAY: + case H3_FT_MAX_PUSH_ID: + /* RFC 9114 7.2.3. CANCEL_PUSH + * + * A CANCEL_PUSH frame is sent on the control stream. Receiving a + * CANCEL_PUSH frame on a stream other than the control stream MUST be + * treated as a connection error of type H3_FRAME_UNEXPECTED. + */ + + /* RFC 9114 7.2.6. GOAWAY + * + * A client MUST treat a GOAWAY frame on a stream other than the + * control stream as a connection error of type H3_FRAME_UNEXPECTED. + */ + + /* RFC 9114 7.2.7. MAX_PUSH_ID + * + * The MAX_PUSH_ID frame is always sent on the control stream. Receipt + * of a MAX_PUSH_ID frame on any other stream MUST be treated as a + * connection error of type H3_FRAME_UNEXPECTED. + */ + + if (h3s->type != H3S_T_CTRL) + ret = H3_FRAME_UNEXPECTED; + else if (!(h3c->flags & H3_CF_SETTINGS_RECV)) + ret = H3_MISSING_SETTINGS; + break; + + case H3_FT_SETTINGS: + /* RFC 9114 7.2.4. SETTINGS + * + * A SETTINGS frame MUST be sent as the first frame of + * each control stream (see Section 6.2.1) by each peer, and it MUST NOT + * be sent subsequently. If an endpoint receives a second SETTINGS frame + * on the control stream, the endpoint MUST respond with a connection + * error of type H3_FRAME_UNEXPECTED. + * + * SETTINGS frames MUST NOT be sent on any stream other than the control + * stream. If an endpoint receives a SETTINGS frame on a different + * stream, the endpoint MUST respond with a connection error of type + * H3_FRAME_UNEXPECTED. + */ + if (h3s->type != H3S_T_CTRL || h3c->flags & H3_CF_SETTINGS_RECV) + ret = H3_FRAME_UNEXPECTED; + break; + + case H3_FT_PUSH_PROMISE: + /* RFC 9114 7.2.5. PUSH_PROMISE + * + * A client MUST NOT send a PUSH_PROMISE frame. A server MUST treat the + * receipt of a PUSH_PROMISE frame as a connection error of type + * H3_FRAME_UNEXPECTED. + */ + + /* TODO server-side only. */ + ret = H3_FRAME_UNEXPECTED; + break; + + default: + /* RFC 9114 9. Extensions to HTTP/3 + * + * Implementations MUST ignore unknown or unsupported values in all + * extensible protocol elements. [...] 
+ * However, where a known frame type is required to be in a + * specific location, such as the SETTINGS frame as the first frame of + * the control stream (see Section 6.2.1), an unknown frame type does + * not satisfy that requirement and SHOULD be treated as an error. + */ + if (h3s->type == H3S_T_CTRL && !(h3c->flags & H3_CF_SETTINGS_RECV)) + ret = H3_MISSING_SETTINGS; + break; + } + + return ret; +} + +/* Check from stream <qcs> that length of all DATA frames does not exceed with + * a previously parsed content-length header. <fin> must be set for the last + * data of the stream so that length of DATA frames must be equal to the + * content-length. + * + * This must only be called for a stream with H3_SF_HAVE_CLEN flag. + * + * Return 0 on valid else non-zero. + */ +static int h3_check_body_size(struct qcs *qcs, int fin) +{ + struct h3s *h3s = qcs->ctx; + int ret = 0; + TRACE_ENTER(H3_EV_RX_FRAME, qcs->qcc->conn, qcs); + + /* Reserved for streams with a previously parsed content-length header. */ + BUG_ON(!(h3s->flags & H3_SF_HAVE_CLEN)); + + /* RFC 9114 4.1.2. Malformed Requests and Responses + * + * A request or response that is defined as having content when it + * contains a Content-Length header field (Section 8.6 of [HTTP]) is + * malformed if the value of the Content-Length header field does not + * equal the sum of the DATA frame lengths received. + * + * TODO for backend support + * A response that is + * defined as never having content, even when a Content-Length is + * present, can have a non-zero Content-Length header field even though + * no content is included in DATA frames. + */ + if (h3s->data_len > h3s->body_len || + (fin && h3s->data_len < h3s->body_len)) { + TRACE_ERROR("Content-length does not match DATA frame size", H3_EV_RX_FRAME|H3_EV_RX_DATA, qcs->qcc->conn, qcs); + h3s->err = H3_MESSAGE_ERROR; + ret = -1; + } + + TRACE_LEAVE(H3_EV_RX_FRAME, qcs->qcc->conn, qcs); + return ret; +} + +/* Set <auth> authority header to the new value <value> for <qcs> stream. This + * ensures that value is conformant to the specification. If <auth> is a + * non-null length string, it ensures that <value> is identical to it. + * + * Returns 0 on success else non-zero. + */ +static int h3_set_authority(struct qcs *qcs, struct ist *auth, const struct ist value) +{ + /* RFC 9114 4.3.1. Request Pseudo-Header Fields + * + * If the :scheme pseudo-header field identifies a scheme that has a + * mandatory authority component (including "http" and "https"), the + * request MUST contain either an :authority pseudo-header field or a + * Host header field. If these fields are present, they MUST NOT be + * empty. If both fields are present, they MUST contain the same value. + */ + + /* Check that if a previous value is set the new value is identical. */ + if (isttest(*auth) && !isteq(*auth, value)) { + TRACE_ERROR("difference between :authority and host headers", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + return 1; + } + + /* Check that value is not empty. */ + if (!istlen(value)) { + TRACE_ERROR("empty :authority/host header", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + return 1; + } + + *auth = value; + return 0; +} + +/* Parse from buffer <buf> a H3 HEADERS frame of length <len>. Data are copied + * in a local HTX buffer and transfer to the stream connector layer. <fin> must be + * set if this is the last data to transfer from this stream. + * + * Returns the number of consumed bytes or a negative error code. 
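+ * (For instance, a request whose QPACK-decoded field section is
+ *     :method = GET, :scheme = https, :authority = www.example.com, :path = /
+ * yields an HTX start line "GET / HTTP/3.0" plus a "host: www.example.com"
+ * header.)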
On error + * either the connection should be closed or the stream reset using codes + * provided in h3c.err / h3s.err. + */ +static ssize_t h3_headers_to_htx(struct qcs *qcs, const struct buffer *buf, + uint64_t len, char fin) +{ + struct h3s *h3s = qcs->ctx; + struct h3c *h3c = h3s->h3c; + struct buffer htx_buf = BUF_NULL; + struct buffer *tmp = get_trash_chunk(); + struct htx *htx = NULL; + struct htx_sl *sl; + struct http_hdr list[global.tune.max_http_hdr]; + unsigned int flags = HTX_SL_F_NONE; + struct ist meth = IST_NULL, path = IST_NULL; + struct ist scheme = IST_NULL, authority = IST_NULL; + int hdr_idx, ret; + int cookie = -1, last_cookie = -1, i; + const char *ctl; + int relaxed = !!(h3c->qcc->proxy->options2 & PR_O2_REQBUG_OK); + + /* RFC 9114 4.1.2. Malformed Requests and Responses + * + * A malformed request or response is one that is an otherwise valid + * sequence of frames but is invalid due to: + * - the presence of prohibited fields or pseudo-header fields, + * - the absence of mandatory pseudo-header fields, + * - invalid values for pseudo-header fields, + * - pseudo-header fields after fields, + * - an invalid sequence of HTTP messages, + * - the inclusion of uppercase field names, or + * - the inclusion of invalid characters in field names or values. + * + * [...] + * + * Intermediaries that process HTTP requests or responses (i.e., any + * intermediary not acting as a tunnel) MUST NOT forward a malformed + * request or response. Malformed requests or responses that are + * detected MUST be treated as a stream error of type H3_MESSAGE_ERROR. + */ + + TRACE_ENTER(H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + + /* TODO support trailer parsing in this function */ + + /* TODO support buffer wrapping */ + BUG_ON(b_head(buf) + len >= b_wrap(buf)); + ret = qpack_decode_fs((const unsigned char *)b_head(buf), len, tmp, + list, sizeof(list) / sizeof(list[0])); + if (ret < 0) { + TRACE_ERROR("QPACK decoding error", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3c->err = -ret; + len = -1; + goto out; + } + + if (!qcs_get_buf(qcs, &htx_buf)) { + TRACE_ERROR("HTX buffer alloc failure", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3c->err = H3_INTERNAL_ERROR; + len = -1; + goto out; + } + BUG_ON(!b_size(&htx_buf)); /* TODO */ + htx = htx_from_buf(&htx_buf); + + /* first treat pseudo-header to build the start line */ + hdr_idx = 0; + while (1) { + /* RFC 9114 4.3. HTTP Control Data + * + * Endpoints MUST treat a request or response that contains + * undefined or invalid pseudo-header fields as malformed. + * + * All pseudo-header fields MUST appear in the header section before + * regular header fields. Any request or response that contains a + * pseudo-header field that appears in a header section after a regular + * header field MUST be treated as malformed. + */ + + /* Stop at first non pseudo-header. */ + if (!istmatch(list[hdr_idx].n, ist(":"))) + break; + + /* RFC 9114 10.3 Intermediary-Encapsulation Attacks + * + * While most values that can be encoded will not alter field + * parsing, carriage return (ASCII 0x0d), line feed (ASCII 0x0a), + * and the null character (ASCII 0x00) might be exploited by an + * attacker if they are translated verbatim. 
Any request or + * response that contains a character not permitted in a field + * value MUST be treated as malformed + */ + + /* look for forbidden control characters in the pseudo-header value */ + ctl = ist_find_ctl(list[hdr_idx].v); + if (unlikely(ctl) && http_header_has_forbidden_char(list[hdr_idx].v, ctl)) { + TRACE_ERROR("control character present in pseudo-header value", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + + /* pseudo-header. Malformed name with uppercase character or + * invalid token will be rejected in the else clause. + */ + if (isteq(list[hdr_idx].n, ist(":method"))) { + if (isttest(meth)) { + TRACE_ERROR("duplicated method pseudo-header", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + meth = list[hdr_idx].v; + } + else if (isteq(list[hdr_idx].n, ist(":path"))) { + if (isttest(path)) { + TRACE_ERROR("duplicated path pseudo-header", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + + if (!relaxed) { + /* we need to reject any control chars or '#' from the path, + * unless option accept-invalid-http-request is set. + */ + ctl = ist_find_range(list[hdr_idx].v, 0, '#'); + if (unlikely(ctl) && http_path_has_forbidden_char(list[hdr_idx].v, ctl)) { + TRACE_ERROR("forbidden character in ':path' pseudo-header", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + } + + path = list[hdr_idx].v; + } + else if (isteq(list[hdr_idx].n, ist(":scheme"))) { + if (isttest(scheme)) { + /* duplicated pseudo-header */ + TRACE_ERROR("duplicated scheme pseudo-header", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + scheme = list[hdr_idx].v; + } + else if (isteq(list[hdr_idx].n, ist(":authority"))) { + if (isttest(authority)) { + TRACE_ERROR("duplicated authority pseudo-header", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + + if (h3_set_authority(qcs, &authority, list[hdr_idx].v)) { + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + } + else { + TRACE_ERROR("unknown pseudo-header", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + + ++hdr_idx; + } + + if (!istmatch(meth, ist("CONNECT"))) { + /* RFC 9114 4.3.1. Request Pseudo-Header Fields + * + * All HTTP/3 requests MUST include exactly one value for the :method, + * :scheme, and :path pseudo-header fields, unless the request is a + * CONNECT request; see Section 4.4. 
+ */ + if (!isttest(meth) || !isttest(scheme) || !isttest(path)) { + TRACE_ERROR("missing mandatory pseudo-header", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + } + + flags |= HTX_SL_F_VER_11; + flags |= HTX_SL_F_XFER_LEN; + + sl = htx_add_stline(htx, HTX_BLK_REQ_SL, flags, meth, path, ist("HTTP/3.0")); + if (!sl) { + h3c->err = H3_INTERNAL_ERROR; + len = -1; + goto out; + } + + if (fin) + sl->flags |= HTX_SL_F_BODYLESS; + + sl->info.req.meth = find_http_meth(meth.ptr, meth.len); + + if (isttest(authority)) { + if (!htx_add_header(htx, ist("host"), authority)) { + h3c->err = H3_INTERNAL_ERROR; + len = -1; + goto out; + } + } + + /* now treat standard headers */ + while (1) { + if (isteq(list[hdr_idx].n, ist(""))) + break; + + if (istmatch(list[hdr_idx].n, ist(":"))) { + TRACE_ERROR("pseudo-header field after fields", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + + for (i = 0; i < list[hdr_idx].n.len; ++i) { + const char c = list[hdr_idx].n.ptr[i]; + if ((uint8_t)(c - 'A') < 'Z' - 'A' || !HTTP_IS_TOKEN(c)) { + TRACE_ERROR("invalid characters in field name", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + } + + + /* RFC 9114 10.3 Intermediary-Encapsulation Attacks + * + * While most values that can be encoded will not alter field + * parsing, carriage return (ASCII 0x0d), line feed (ASCII 0x0a), + * and the null character (ASCII 0x00) might be exploited by an + * attacker if they are translated verbatim. Any request or + * response that contains a character not permitted in a field + * value MUST be treated as malformed + */ + + /* look for forbidden control characters in the header value */ + ctl = ist_find_ctl(list[hdr_idx].v); + if (unlikely(ctl) && http_header_has_forbidden_char(list[hdr_idx].v, ctl)) { + TRACE_ERROR("control character present in header value", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + + if (isteq(list[hdr_idx].n, ist("host"))) { + if (h3_set_authority(qcs, &authority, list[hdr_idx].v)) { + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + } + else if (isteq(list[hdr_idx].n, ist("cookie"))) { + http_cookie_register(list, hdr_idx, &cookie, &last_cookie); + ++hdr_idx; + continue; + } + else if (isteq(list[hdr_idx].n, ist("content-length"))) { + ret = http_parse_cont_len_header(&list[hdr_idx].v, + &h3s->body_len, + h3s->flags & H3_SF_HAVE_CLEN); + if (ret < 0) { + TRACE_ERROR("invalid content-length", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + else if (!ret) { + /* Skip duplicated value. */ + ++hdr_idx; + continue; + } + + h3s->flags |= H3_SF_HAVE_CLEN; + sl->flags |= HTX_SL_F_CLEN; + /* This will fail if current frame is the last one and + * content-length is not null. + */ + if (h3_check_body_size(qcs, fin)) { + len = -1; + goto out; + } + } + else if (isteq(list[hdr_idx].n, ist("connection")) || + isteq(list[hdr_idx].n, ist("proxy-connection")) || + isteq(list[hdr_idx].n, ist("keep-alive")) || + isteq(list[hdr_idx].n, ist("transfer-encoding"))) { + /* RFC 9114 4.2. HTTP Fields + * + * HTTP/3 does not use the Connection header field to indicate + * connection-specific fields; in this protocol, connection- + * specific metadata is conveyed by other means. 
An endpoint + * MUST NOT generate an HTTP/3 field section containing + * connection-specific fields; any message containing + * connection-specific fields MUST be treated as malformed. + */ + TRACE_ERROR("invalid connection header", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + else if (isteq(list[hdr_idx].n, ist("te")) && + !isteq(list[hdr_idx].v, ist("trailers"))) { + /* RFC 9114 4.2. HTTP Fields + * + * The only exception to this is the TE header field, which MAY + * be present in an HTTP/3 request header; when it is, it MUST + * NOT contain any value other than "trailers". + */ + TRACE_ERROR("invalid te header", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + + if (!htx_add_header(htx, list[hdr_idx].n, list[hdr_idx].v)) { + h3c->err = H3_INTERNAL_ERROR; + len = -1; + goto out; + } + ++hdr_idx; + } + + /* RFC 9114 4.3.1. Request Pseudo-Header Fields + * + * If the :scheme pseudo-header field identifies a scheme that has a + * mandatory authority component (including "http" and "https"), the + * request MUST contain either an :authority pseudo-header field or a + * Host header field. + */ + if (!isttest(authority)) { + TRACE_ERROR("missing mandatory pseudo-header", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + + if (cookie >= 0) { + if (http_cookie_merge(htx, list, cookie)) { + h3c->err = H3_INTERNAL_ERROR; + len = -1; + goto out; + } + } + + if (!htx_add_endof(htx, HTX_BLK_EOH)) { + h3c->err = H3_INTERNAL_ERROR; + len = -1; + goto out; + } + + if (fin) + htx->flags |= HTX_FL_EOM; + + htx_to_buf(htx, &htx_buf); + htx = NULL; + + if (!qcs_attach_sc(qcs, &htx_buf, fin)) { + h3c->err = H3_INTERNAL_ERROR; + len = -1; + goto out; + } + + /* RFC 9114 5.2. Connection Shutdown + * + * The GOAWAY frame contains an identifier that + * indicates to the receiver the range of requests or pushes that were + * or might be processed in this connection. The server sends a client- + * initiated bidirectional stream ID; the client sends a push ID. + * Requests or pushes with the indicated identifier or greater are + * rejected (Section 4.1.1) by the sender of the GOAWAY. This + * identifier MAY be zero if no requests or pushes were processed. + */ + if (qcs->id >= h3c->id_goaway) + h3c->id_goaway = qcs->id + 4; + + out: + /* HTX may be non NULL if error before previous htx_to_buf(). */ + if (htx) + htx_to_buf(htx, &htx_buf); + + /* buffer is transferred to the stream connector and set to NULL + * except on stream creation error. + */ + if (b_size(&htx_buf)) { + b_free(&htx_buf); + offer_buffers(NULL, 1); + } + + TRACE_LEAVE(H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + return len; +} + +/* Parse from buffer <buf> a H3 HEADERS frame of length <len> used as trailers. + * Data are copied in a local HTX buffer and transfer to the stream connector + * layer. <fin> must be set if this is the last data to transfer from this + * stream. + * + * Returns the number of consumed bytes or a negative error code. On error + * either the connection should be closed or the stream reset using codes + * provided in h3c.err / h3s.err. 
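+ * (For example, a request carrying trailers arrives as a HEADERS frame,
+ * zero or more DATA frames, then a final HEADERS frame on a
+ * FIN-terminated stream; only that last frame is routed here, the first
+ * one going through h3_headers_to_htx().)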
+ */
+static ssize_t h3_trailers_to_htx(struct qcs *qcs, const struct buffer *buf,
+                                  uint64_t len, char fin)
+{
+	struct h3s *h3s = qcs->ctx;
+	struct h3c *h3c = h3s->h3c;
+	struct buffer *tmp = get_trash_chunk();
+	struct buffer *appbuf = NULL;
+	struct htx *htx = NULL;
+	struct htx_sl *sl;
+	struct http_hdr list[global.tune.max_http_hdr];
+	int hdr_idx, ret;
+	const char *ctl;
+	int i;
+
+	TRACE_ENTER(H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs);
+
+	/* TODO support buffer wrapping */
+	BUG_ON(b_head(buf) + len >= b_wrap(buf));
+	ret = qpack_decode_fs((const unsigned char *)b_head(buf), len, tmp,
+	                      list, sizeof(list) / sizeof(list[0]));
+	if (ret < 0) {
+		TRACE_ERROR("QPACK decoding error", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs);
+		h3c->err = -ret;
+		len = -1;
+		goto out;
+	}
+
+	if (!(appbuf = qcs_get_buf(qcs, &qcs->rx.app_buf))) {
+		TRACE_ERROR("HTX buffer alloc failure", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs);
+		h3c->err = H3_INTERNAL_ERROR;
+		len = -1;
+		goto out;
+	}
+	BUG_ON(!b_size(appbuf)); /* TODO */
+	htx = htx_from_buf(appbuf);
+
+	if (!h3s->data_len) {
+		/* Notify that no body is present. This can only happen if an
+		 * H3 HEADERS frame is received as trailers without any
+		 * preceding, or with only empty, H3 DATA frames. So this is
+		 * probably not realistic?
+		 *
+		 * TODO if <sl> is NULL because it was already consumed, there
+		 * is no way to notify about the missing body.
+		 */
+		sl = http_get_stline(htx);
+		if (sl)
+			sl->flags |= HTX_SL_F_BODYLESS;
+		else
+			TRACE_ERROR("cannot notify missing body after trailers", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs);
+	}
+
+	hdr_idx = 0;
+	while (1) {
+		if (isteq(list[hdr_idx].n, ist("")))
+			break;
+
+		/* RFC 9114 4.3. HTTP Control Data
+		 *
+		 * Pseudo-header
+		 * fields MUST NOT appear in trailer sections.
+		 */
+		if (istmatch(list[hdr_idx].n, ist(":"))) {
+			TRACE_ERROR("pseudo-header field in trailers", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs);
+			h3s->err = H3_MESSAGE_ERROR;
+			len = -1;
+			goto out;
+		}
+
+		for (i = 0; i < list[hdr_idx].n.len; ++i) {
+			const char c = list[hdr_idx].n.ptr[i];
+			if ((uint8_t)(c - 'A') < 'Z' - 'A' || !HTTP_IS_TOKEN(c)) {
+				TRACE_ERROR("invalid characters in field name", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs);
+				h3s->err = H3_MESSAGE_ERROR;
+				len = -1;
+				goto out;
+			}
+		}
+
+		/* forbidden HTTP/3 headers, cf h3_headers_to_htx() */
+		if (isteq(list[hdr_idx].n, ist("host")) ||
+		    isteq(list[hdr_idx].n, ist("content-length")) ||
+		    isteq(list[hdr_idx].n, ist("connection")) ||
+		    isteq(list[hdr_idx].n, ist("proxy-connection")) ||
+		    isteq(list[hdr_idx].n, ist("keep-alive")) ||
+		    isteq(list[hdr_idx].n, ist("te")) ||
+		    isteq(list[hdr_idx].n, ist("transfer-encoding"))) {
+			TRACE_ERROR("forbidden HTTP/3 headers", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs);
+			h3s->err = H3_MESSAGE_ERROR;
+			len = -1;
+			goto out;
+		}
+
+		/* RFC 9114 10.3 Intermediary-Encapsulation Attacks
+		 *
+		 * While most values that can be encoded will not alter field
+		 * parsing, carriage return (ASCII 0x0d), line feed (ASCII 0x0a),
+		 * and the null character (ASCII 0x00) might be exploited by an
+		 * attacker if they are translated verbatim.
Any request or + * response that contains a character not permitted in a field + * value MUST be treated as malformed + */ + + /* look for forbidden control characters in the trailer value */ + ctl = ist_find_ctl(list[hdr_idx].v); + if (unlikely(ctl) && http_header_has_forbidden_char(list[hdr_idx].v, ctl)) { + TRACE_ERROR("control character present in trailer value", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3s->err = H3_MESSAGE_ERROR; + len = -1; + goto out; + } + + if (!htx_add_trailer(htx, list[hdr_idx].n, list[hdr_idx].v)) { + TRACE_ERROR("cannot add trailer", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3c->err = H3_INTERNAL_ERROR; + len = -1; + goto out; + } + + ++hdr_idx; + } + + if (!htx_add_endof(htx, HTX_BLK_EOT)) { + TRACE_ERROR("cannot add trailer", H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + h3c->err = H3_INTERNAL_ERROR; + len = -1; + goto out; + } + + if (fin) + htx->flags |= HTX_FL_EOM; + + out: + /* HTX may be non NULL if error before previous htx_to_buf(). */ + if (appbuf) + htx_to_buf(htx, appbuf); + + TRACE_LEAVE(H3_EV_RX_FRAME|H3_EV_RX_HDR, qcs->qcc->conn, qcs); + return len; +} + +/* Copy from buffer <buf> a H3 DATA frame of length <len> in QUIC stream <qcs> + * HTX buffer. <fin> must be set if this is the last data to transfer from this + * stream. + * + * Returns the number of consumed bytes or a negative error code. + */ +static ssize_t h3_data_to_htx(struct qcs *qcs, const struct buffer *buf, + uint64_t len, char fin) +{ + struct h3s *h3s = qcs->ctx; + struct h3c *h3c = h3s->h3c; + struct buffer *appbuf; + struct htx *htx = NULL; + size_t htx_sent = 0; + int htx_space; + char *head; + + TRACE_ENTER(H3_EV_RX_FRAME|H3_EV_RX_DATA, qcs->qcc->conn, qcs); + + if (!(appbuf = qcs_get_buf(qcs, &qcs->rx.app_buf))) { + TRACE_ERROR("data buffer alloc failure", H3_EV_RX_FRAME|H3_EV_RX_DATA, qcs->qcc->conn, qcs); + h3c->err = H3_INTERNAL_ERROR; + len = -1; + goto out; + } + + htx = htx_from_buf(appbuf); + + if (len > b_data(buf)) { + len = b_data(buf); + fin = 0; + } + + head = b_head(buf); + retry: + htx_space = htx_free_data_space(htx); + if (!htx_space) { + qcs->flags |= QC_SF_DEM_FULL; + goto out; + } + + if (len > htx_space) { + len = htx_space; + fin = 0; + } + + if (head + len > b_wrap(buf)) { + size_t contig = b_wrap(buf) - head; + htx_sent = htx_add_data(htx, ist2(b_head(buf), contig)); + if (htx_sent < contig) { + qcs->flags |= QC_SF_DEM_FULL; + goto out; + } + + len -= contig; + head = b_orig(buf); + goto retry; + } + + htx_sent += htx_add_data(htx, ist2(head, len)); + if (htx_sent < len) { + qcs->flags |= QC_SF_DEM_FULL; + goto out; + } + + if (fin && len == htx_sent) + htx->flags |= HTX_FL_EOM; + + out: + if (appbuf) + htx_to_buf(htx, appbuf); + + TRACE_LEAVE(H3_EV_RX_FRAME|H3_EV_RX_DATA, qcs->qcc->conn, qcs); + return htx_sent; +} + +/* Parse a SETTINGS frame of length <len> of payload <buf>. + * + * Returns the number of consumed bytes or a negative error code. + */ +static ssize_t h3_parse_settings_frm(struct h3c *h3c, const struct buffer *buf, + size_t len) +{ + struct buffer b; + uint64_t id, value; + size_t ret = 0; + long mask = 0; /* used to detect duplicated settings identifier */ + + TRACE_ENTER(H3_EV_RX_FRAME|H3_EV_RX_SETTINGS, h3c->qcc->conn); + + /* Work on a copy of <buf>. 
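+	 * Consuming the copy leaves <buf> untouched, only <ret> tracking the
+	 * number of parsed bytes. Each setting is an <id> <value> pair of
+	 * QUIC varints; e.g. the hypothetical bytes 07 40 64 decode to id
+	 * 0x07 (QPACK_BLOCKED_STREAMS) with value 100 (a 2-byte varint).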
*/ + b = b_make(b_orig(buf), b_size(buf), b_head_ofs(buf), len); + + while (b_data(&b)) { + if (!b_quic_dec_int(&id, &b, &ret) || !b_quic_dec_int(&value, &b, &ret)) { + h3c->err = H3_FRAME_ERROR; + return -1; + } + + h3_debug_printf(stderr, "%s id: %llu value: %llu\n", + __func__, (unsigned long long)id, (unsigned long long)value); + + /* draft-ietf-quic-http34 7.2.4. SETTINGS + * + * The same setting identifier MUST NOT occur more than once in the + * SETTINGS frame. A receiver MAY treat the presence of duplicate + * setting identifiers as a connection error of type H3_SETTINGS_ERROR. + */ + + /* Ignore duplicate check for ID too big used for GREASE. */ + if (id < sizeof(mask)) { + if (ha_bit_test(id, &mask)) { + h3c->err = H3_SETTINGS_ERROR; + return -1; + } + ha_bit_set(id, &mask); + } + + switch (id) { + case H3_SETTINGS_QPACK_MAX_TABLE_CAPACITY: + h3c->qpack_max_table_capacity = value; + break; + case H3_SETTINGS_MAX_FIELD_SECTION_SIZE: + h3c->max_field_section_size = value; + break; + case H3_SETTINGS_QPACK_BLOCKED_STREAMS: + h3c->qpack_blocked_streams = value; + break; + + case H3_SETTINGS_RESERVED_0: + case H3_SETTINGS_RESERVED_2: + case H3_SETTINGS_RESERVED_3: + case H3_SETTINGS_RESERVED_4: + case H3_SETTINGS_RESERVED_5: + /* draft-ietf-quic-http34 7.2.4.1. Defined SETTINGS Parameters + * + * Setting identifiers which were defined in [HTTP2] where there is no + * corresponding HTTP/3 setting have also been reserved + * (Section 11.2.2). These reserved settings MUST NOT be sent, and + * their receipt MUST be treated as a connection error of type + * H3_SETTINGS_ERROR. + */ + h3c->err = H3_SETTINGS_ERROR; + return -1; + default: + /* MUST be ignored */ + break; + } + } + + TRACE_LEAVE(H3_EV_RX_FRAME|H3_EV_RX_SETTINGS, h3c->qcc->conn); + return ret; +} + +/* Decode <qcs> remotely initiated bidi-stream. <fin> must be set to indicate + * that we received the last data of the stream. + * + * Returns 0 on success else non-zero. + */ +static ssize_t h3_decode_qcs(struct qcs *qcs, struct buffer *b, int fin) +{ + struct h3s *h3s = qcs->ctx; + struct h3c *h3c = h3s->h3c; + ssize_t total = 0, ret; + + TRACE_ENTER(H3_EV_RX_FRAME, qcs->qcc->conn, qcs); + + if (quic_stream_is_uni(qcs->id) && !(h3s->flags & H3_SF_UNI_INIT)) { + ret = h3_init_uni_stream(h3c, qcs, b); + if (ret < 0) { + TRACE_ERROR("cannot initialize uni stream", H3_EV_RX_FRAME, qcs->qcc->conn, qcs); + goto err; + } + else if (!ret) { + /* not enough data to initialize uni stream, retry later */ + goto done; + } + + total += ret; + } + + if (quic_stream_is_uni(qcs->id) && (h3s->flags & H3_SF_UNI_NO_H3)) { + /* For non-h3 STREAM, parse it and return immediately. */ + if ((ret = h3_parse_uni_stream_no_h3(qcs, b, fin)) < 0) { + TRACE_ERROR("error when parsing non-HTTP3 uni stream", H3_EV_RX_FRAME, qcs->qcc->conn, qcs); + goto err; + } + + total += ret; + goto done; + } + + /* RFC 9114 6.2.1. Control Streams + * + * The sender MUST NOT close the control stream, and the receiver MUST NOT + * request that the sender close the control stream. If either control + * stream is closed at any point, this MUST be treated as a connection + * error of type H3_CLOSED_CRITICAL_STREAM. 
+ */ + if (h3s->type == H3S_T_CTRL && fin) { + TRACE_ERROR("control stream closed by remote peer", H3_EV_RX_FRAME, qcs->qcc->conn, qcs); + qcc_set_error(qcs->qcc, H3_CLOSED_CRITICAL_STREAM, 1); + goto err; + } + + if (!b_data(b) && fin && quic_stream_is_bidi(qcs->id)) { + struct buffer *appbuf; + struct htx *htx; + + TRACE_PROTO("received FIN without data", H3_EV_RX_FRAME, qcs->qcc->conn, qcs); + if (!(appbuf = qcs_get_buf(qcs, &qcs->rx.app_buf))) { + TRACE_ERROR("data buffer alloc failure", H3_EV_RX_FRAME, qcs->qcc->conn, qcs); + h3c->err = H3_INTERNAL_ERROR; + goto err; + } + + htx = htx_from_buf(appbuf); + if (!htx_set_eom(htx)) { + TRACE_ERROR("cannot set EOM", H3_EV_RX_FRAME, qcs->qcc->conn, qcs); + h3c->err = H3_INTERNAL_ERROR; + } + htx_to_buf(htx, appbuf); + goto done; + } + + while (b_data(b) && !(qcs->flags & QC_SF_DEM_FULL) && !h3c->err && !h3s->err) { + uint64_t ftype, flen; + char last_stream_frame = 0; + + if (!h3s->demux_frame_len) { + /* Switch to a new frame. */ + size_t hlen = h3_decode_frm_header(&ftype, &flen, b); + if (!hlen) { + TRACE_PROTO("pause parsing on incomplete frame header", H3_EV_RX_FRAME, qcs->qcc->conn, qcs); + break; + } + + h3s->demux_frame_type = ftype; + h3s->demux_frame_len = flen; + total += hlen; + TRACE_PROTO("parsing a new frame", H3_EV_RX_FRAME, qcs->qcc->conn, qcs); + + /* Check that content-length is not exceeded on a new DATA frame. */ + if (ftype == H3_FT_DATA) { + h3s->data_len += flen; + if (h3s->flags & H3_SF_HAVE_CLEN && h3_check_body_size(qcs, (fin && flen == b_data(b)))) + break; + } + + if ((ret = h3_check_frame_valid(h3c, qcs, ftype))) { + TRACE_ERROR("received an invalid frame", H3_EV_RX_FRAME, qcs->qcc->conn, qcs); + qcc_set_error(qcs->qcc, ret, 1); + goto err; + } + + if (!b_data(b)) + break; + } + + flen = h3s->demux_frame_len; + ftype = h3s->demux_frame_type; + + /* Do not demux incomplete frames except H3 DATA which can be + * fragmented in multiple HTX blocks. + */ + if (flen > b_data(b) && ftype != H3_FT_DATA) { + /* Reject frames bigger than bufsize. + * + * TODO HEADERS should in complement be limited with H3 + * SETTINGS_MAX_FIELD_SECTION_SIZE parameter to prevent + * excessive decompressed size. + */ + if (flen > QC_S_RX_BUF_SZ) { + TRACE_ERROR("received a too big frame", H3_EV_RX_FRAME, qcs->qcc->conn, qcs); + qcc_set_error(qcs->qcc, H3_EXCESSIVE_LOAD, 1); + goto err; + } + break; + } + + last_stream_frame = (fin && flen == b_data(b)); + + /* Check content-length equality with DATA frames length on the last frame. 
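+	 * (E.g. with a content-length of 10, DATA frames of 4 then 6 bytes on
+	 * a FIN-terminated stream are valid, while 4 then 5 with FIN, or
+	 * 4 then 7, trigger an H3_MESSAGE_ERROR.)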
*/ + if (last_stream_frame && h3s->flags & H3_SF_HAVE_CLEN && h3_check_body_size(qcs, last_stream_frame)) + break; + + h3_inc_frame_type_cnt(h3c->prx_counters, ftype); + switch (ftype) { + case H3_FT_DATA: + ret = h3_data_to_htx(qcs, b, flen, last_stream_frame); + h3s->st_req = H3S_ST_REQ_DATA; + break; + case H3_FT_HEADERS: + if (h3s->st_req == H3S_ST_REQ_BEFORE) { + ret = h3_headers_to_htx(qcs, b, flen, last_stream_frame); + h3s->st_req = H3S_ST_REQ_HEADERS; + } + else { + ret = h3_trailers_to_htx(qcs, b, flen, last_stream_frame); + h3s->st_req = H3S_ST_REQ_TRAILERS; + } + break; + case H3_FT_CANCEL_PUSH: + case H3_FT_PUSH_PROMISE: + case H3_FT_MAX_PUSH_ID: + case H3_FT_GOAWAY: + /* Not supported */ + ret = flen; + break; + case H3_FT_SETTINGS: + ret = h3_parse_settings_frm(qcs->qcc->ctx, b, flen); + if (ret < 0) { + TRACE_ERROR("error on SETTINGS parsing", H3_EV_RX_FRAME, qcs->qcc->conn, qcs); + qcc_set_error(qcs->qcc, h3c->err, 1); + goto err; + } + h3c->flags |= H3_CF_SETTINGS_RECV; + break; + default: + /* draft-ietf-quic-http34 9. Extensions to HTTP/3 + * + * Implementations MUST discard frames [...] that have unknown + * or unsupported types. + */ + ret = flen; + break; + } + + if (ret > 0) { + BUG_ON(h3s->demux_frame_len < ret); + h3s->demux_frame_len -= ret; + b_del(b, ret); + total += ret; + } + } + + /* Reset demux frame type for traces. */ + if (!h3s->demux_frame_len) + h3s->demux_frame_type = H3_FT_UNINIT; + + /* Interrupt decoding on stream/connection error detected. */ + if (h3s->err) { + qcc_abort_stream_read(qcs); + qcc_reset_stream(qcs, h3s->err); + return b_data(b); + } + else if (h3c->err) { + qcc_set_error(qcs->qcc, h3c->err, 1); + return b_data(b); + } + + /* TODO may be useful to wakeup the MUX if blocked due to full buffer. + * However, currently, io-cb of MUX does not handle Rx. + */ + + done: + TRACE_LEAVE(H3_EV_RX_FRAME, qcs->qcc->conn, qcs); + return total; + + err: + TRACE_DEVEL("leaving on error", H3_EV_RX_FRAME, qcs->qcc->conn, qcs); + return -1; +} + +/* Returns buffer for data sending. + * May be NULL if the allocation failed. + */ +static struct buffer *mux_get_buf(struct qcs *qcs) +{ + if (!b_size(&qcs->tx.buf)) + b_alloc(&qcs->tx.buf); + + return &qcs->tx.buf; +} + +/* Function used to emit stream data from <qcs> control uni-stream. + * + * On success return the number of sent bytes. A negative code is used on + * error. 
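+ * (Illustrative output, assuming both QPACK settings are 0 and no max
+ * field section size is configured: the stream would start with 0x00
+ * (the control uni-stream type), then the SETTINGS frame 04 04 01 00
+ * 07 00, i.e. type 0x04, length 4, and two id/value varint pairs.)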
+ */ +static int h3_control_send(struct qcs *qcs, void *ctx) +{ + int ret; + struct h3c *h3c = ctx; + unsigned char data[(2 + 3) * 2 * QUIC_VARINT_MAX_SIZE]; /* enough for 3 settings */ + struct buffer pos, *res; + size_t frm_len; + + TRACE_ENTER(H3_EV_TX_SETTINGS, qcs->qcc->conn, qcs); + + BUG_ON_HOT(h3c->flags & H3_CF_SETTINGS_SENT); + + ret = 0; + pos = b_make((char *)data, sizeof(data), 0, 0); + + frm_len = quic_int_getsize(H3_SETTINGS_QPACK_MAX_TABLE_CAPACITY) + + quic_int_getsize(h3_settings_qpack_max_table_capacity) + + quic_int_getsize(H3_SETTINGS_QPACK_BLOCKED_STREAMS) + + quic_int_getsize(h3_settings_qpack_blocked_streams); + if (h3_settings_max_field_section_size) { + frm_len += quic_int_getsize(H3_SETTINGS_MAX_FIELD_SECTION_SIZE) + + quic_int_getsize(h3_settings_max_field_section_size); + } + + b_quic_enc_int(&pos, H3_UNI_S_T_CTRL, 0); + /* Build a SETTINGS frame */ + b_quic_enc_int(&pos, H3_FT_SETTINGS, 0); + b_quic_enc_int(&pos, frm_len, 0); + b_quic_enc_int(&pos, H3_SETTINGS_QPACK_MAX_TABLE_CAPACITY, 0); + b_quic_enc_int(&pos, h3_settings_qpack_max_table_capacity, 0); + b_quic_enc_int(&pos, H3_SETTINGS_QPACK_BLOCKED_STREAMS, 0); + b_quic_enc_int(&pos, h3_settings_qpack_blocked_streams, 0); + if (h3_settings_max_field_section_size) { + b_quic_enc_int(&pos, H3_SETTINGS_MAX_FIELD_SECTION_SIZE, 0); + b_quic_enc_int(&pos, h3_settings_max_field_section_size, 0); + } + + res = mux_get_buf(qcs); + if (b_is_null(res)) { + TRACE_ERROR("cannot allocate Tx buffer", H3_EV_TX_SETTINGS, qcs->qcc->conn, qcs); + goto err; + } + + if (b_room(res) < b_data(&pos)) { + // TODO the mux should be put in blocked state, with + // the stream in state waiting for settings to be sent + ABORT_NOW(); + } + + ret = b_force_xfer(res, &pos, b_data(&pos)); + if (ret > 0) { + /* Register qcs for sending before other streams. 
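+	 * RFC 9114 6.2.1 requires the SETTINGS frame to be the first frame
+	 * sent on the control stream, hence the non-zero flag passed to
+	 * qcc_send_stream() below so that this stream is scheduled first.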
*/ + qcc_send_stream(qcs, 1); + h3c->flags |= H3_CF_SETTINGS_SENT; + } + + TRACE_LEAVE(H3_EV_TX_SETTINGS, qcs->qcc->conn, qcs); + return ret; + + err: + TRACE_DEVEL("leaving on error", H3_EV_TX_SETTINGS, qcs->qcc->conn, qcs); + return -1; +} + +static int h3_resp_headers_send(struct qcs *qcs, struct htx *htx) +{ + struct h3s *h3s = qcs->ctx; + struct h3c *h3c = h3s->h3c; + struct buffer outbuf; + struct buffer headers_buf = BUF_NULL; + struct buffer *res; + struct http_hdr list[global.tune.max_http_hdr]; + struct htx_sl *sl; + struct htx_blk *blk; + enum htx_blk_type type; + int frame_length_size; /* size in bytes of frame length varint field */ + int ret = 0; + int hdr; + int status = 0; + + TRACE_ENTER(H3_EV_TX_HDR, qcs->qcc->conn, qcs); + + sl = NULL; + hdr = 0; + for (blk = htx_get_head_blk(htx); blk; blk = htx_get_next_blk(htx, blk)) { + type = htx_get_blk_type(blk); + + if (type == HTX_BLK_UNUSED) + continue; + + if (type == HTX_BLK_EOH) + break; + + if (type == HTX_BLK_RES_SL) { + /* start-line -> HEADERS h3 frame */ + BUG_ON(sl); + sl = htx_get_blk_ptr(htx, blk); + /* TODO should be on h3 layer */ + status = sl->info.res.status; + } + else if (type == HTX_BLK_HDR) { + if (unlikely(hdr >= sizeof(list) / sizeof(list[0]) - 1)) { + TRACE_ERROR("too many headers", H3_EV_TX_HDR, qcs->qcc->conn, qcs); + h3c->err = H3_INTERNAL_ERROR; + goto err; + } + list[hdr].n = htx_get_blk_name(htx, blk); + list[hdr].v = htx_get_blk_value(htx, blk); + hdr++; + } + else { + ABORT_NOW(); + goto err; + } + } + + BUG_ON(!sl); + + list[hdr].n = ist(""); + + res = mux_get_buf(qcs); + if (b_is_null(res)) { + TRACE_ERROR("cannot allocate Tx buffer", H3_EV_TX_HDR, qcs->qcc->conn, qcs); + h3c->err = H3_INTERNAL_ERROR; + goto err; + } + + /* At least 5 bytes to store frame type + length as a varint max size */ + if (b_room(res) < 5) + ABORT_NOW(); + + b_reset(&outbuf); + outbuf = b_make(b_tail(res), b_contig_space(res), 0, 0); + /* Start the headers after frame type + length */ + headers_buf = b_make(b_head(res) + 5, b_size(res) - 5, 0, 0); + + if (qpack_encode_field_section_line(&headers_buf)) + ABORT_NOW(); + if (qpack_encode_int_status(&headers_buf, status)) { + TRACE_ERROR("invalid status code", H3_EV_TX_HDR, qcs->qcc->conn, qcs); + h3c->err = H3_INTERNAL_ERROR; + goto err; + } + + for (hdr = 0; hdr < sizeof(list) / sizeof(list[0]); ++hdr) { + if (isteq(list[hdr].n, ist(""))) + break; + + /* RFC 9114 4.2. HTTP Fields + * + * An intermediary transforming an HTTP/1.x message to HTTP/3 + * MUST remove connection-specific header fields as discussed in + * Section 7.6.1 of [HTTP], or their messages will be treated by + * other HTTP/3 endpoints as malformed. + */ + if (isteq(list[hdr].n, ist("connection")) || + isteq(list[hdr].n, ist("proxy-connection")) || + isteq(list[hdr].n, ist("keep-alive")) || + isteq(list[hdr].n, ist("transfer-encoding"))) { + continue; + } + else if (isteq(list[hdr].n, ist("te"))) { + /* "te" may only be sent with "trailers" if this value + * is present, otherwise it must be deleted. 
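+			 * (E.g. "te: trailers, deflate" is rewritten to
+			 * "te: trailers", while "te: gzip" is dropped
+			 * entirely.)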
+ */ + const struct ist v = istist(list[hdr].v, ist("trailers")); + if (!isttest(v) || (v.len > 8 && v.ptr[8] != ',')) + continue; + list[hdr].v = ist("trailers"); + } + + if (qpack_encode_header(&headers_buf, list[hdr].n, list[hdr].v)) + ABORT_NOW(); + } + + /* Now that all headers are encoded, we are certain that res buffer is + * big enough + */ + frame_length_size = quic_int_getsize(b_data(&headers_buf)); + res->head += 4 - frame_length_size; + b_putchr(res, 0x01); /* h3 HEADERS frame type */ + if (!b_quic_enc_int(res, b_data(&headers_buf), 0)) + ABORT_NOW(); + b_add(res, b_data(&headers_buf)); + + ret = 0; + blk = htx_get_head_blk(htx); + while (blk) { + type = htx_get_blk_type(blk); + ret += htx_get_blksz(blk); + blk = htx_remove_blk(htx, blk); + if (type == HTX_BLK_EOH) + break; + } + + TRACE_LEAVE(H3_EV_TX_HDR, qcs->qcc->conn, qcs); + return ret; + + err: + TRACE_DEVEL("leaving on error", H3_EV_TX_HDR, qcs->qcc->conn, qcs); + return -1; +} + +/* Convert a series of HTX trailer blocks from <htx> buffer into <qcs> buffer + * as a H3 HEADERS frame. H3 forbidden trailers are skipped. HTX trailer blocks + * are removed from <htx> until EOT is found and itself removed. + * + * If only a EOT HTX block is present without trailer, no H3 frame is produced. + * Caller is responsible to emit an empty QUIC STREAM frame to signal the end + * of the stream. + * + * Returns the size of HTX blocks removed. + */ +static int h3_resp_trailers_send(struct qcs *qcs, struct htx *htx) +{ + struct h3s *h3s = qcs->ctx; + struct h3c *h3c = h3s->h3c; + struct buffer headers_buf = BUF_NULL; + struct buffer *res; + struct http_hdr list[global.tune.max_http_hdr]; + struct htx_blk *blk; + enum htx_blk_type type; + char *tail; + int ret = 0; + int hdr; + + TRACE_ENTER(H3_EV_TX_HDR, qcs->qcc->conn, qcs); + + hdr = 0; + for (blk = htx_get_head_blk(htx); blk; blk = htx_get_next_blk(htx, blk)) { + type = htx_get_blk_type(blk); + + if (type == HTX_BLK_UNUSED) + continue; + + if (type == HTX_BLK_EOT) + break; + + if (type == HTX_BLK_TLR) { + if (unlikely(hdr >= sizeof(list) / sizeof(list[0]) - 1)) { + TRACE_ERROR("too many headers", H3_EV_TX_HDR, qcs->qcc->conn, qcs); + h3c->err = H3_INTERNAL_ERROR; + goto err; + } + list[hdr].n = htx_get_blk_name(htx, blk); + list[hdr].v = htx_get_blk_value(htx, blk); + hdr++; + } + else { + TRACE_ERROR("unexpected HTX block", H3_EV_TX_HDR, qcs->qcc->conn, qcs); + h3c->err = H3_INTERNAL_ERROR; + goto err; + } + } + + if (!hdr) { + /* No headers encoded here so no need to generate a H3 HEADERS + * frame. Mux will send an empty QUIC STREAM frame with FIN. + */ + TRACE_DATA("skipping trailer", H3_EV_TX_HDR, qcs->qcc->conn, qcs); + goto end; + } + list[hdr].n = ist(""); + + res = mux_get_buf(qcs); + if (b_is_null(res)) { + TRACE_ERROR("cannot allocate Tx buffer", H3_EV_TX_HDR, qcs->qcc->conn, qcs); + h3c->err = H3_INTERNAL_ERROR; + goto err; + } + + /* At least 9 bytes to store frame type + length as a varint max size */ + if (b_room(res) < 9) { + qcs->flags |= QC_SF_BLK_MROOM; + goto err; + } + + /* Force buffer realignment as size required to encode headers is unknown. 
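+	 * b_slow_realign() moves pending output back to the start of the
+	 * storage area so that the remaining free space is contiguous.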
+	 */
+	if (b_space_wraps(res))
+		b_slow_realign(res, trash.area, b_data(res));
+	/* Start the headers after frame type + length */
+	headers_buf = b_make(b_peek(res, b_data(res) + 9), b_contig_space(res) - 9, 0, 0);
+
+	if (qpack_encode_field_section_line(&headers_buf)) {
+		qcs->flags |= QC_SF_BLK_MROOM;
+		goto err;
+	}
+
+	tail = b_tail(&headers_buf);
+	for (hdr = 0; hdr < sizeof(list) / sizeof(list[0]); ++hdr) {
+		if (isteq(list[hdr].n, ist("")))
+			break;
+
+		/* forbidden HTTP/3 headers, cf h3_resp_headers_send() */
+		if (isteq(list[hdr].n, ist("host")) ||
+		    isteq(list[hdr].n, ist("content-length")) ||
+		    isteq(list[hdr].n, ist("connection")) ||
+		    isteq(list[hdr].n, ist("proxy-connection")) ||
+		    isteq(list[hdr].n, ist("keep-alive")) ||
+		    isteq(list[hdr].n, ist("te")) ||
+		    isteq(list[hdr].n, ist("transfer-encoding"))) {
+			continue;
+		}
+
+		if (qpack_encode_header(&headers_buf, list[hdr].n, list[hdr].v)) {
+			qcs->flags |= QC_SF_BLK_MROOM;
+			goto err;
+		}
+	}
+
+	/* Check that at least one header was encoded in buffer. */
+	if (b_tail(&headers_buf) == tail) {
+		/* No headers encoded here so no need to generate an H3 HEADERS
+		 * frame. Mux will send an empty QUIC STREAM frame with FIN.
+		 */
+		TRACE_DATA("skipping trailer", H3_EV_TX_HDR, qcs->qcc->conn, qcs);
+		goto end;
+	}
+
+	/* Now that all headers are encoded, we are certain that res buffer is
+	 * big enough.
+	 */
+	b_putchr(res, 0x01); /* h3 HEADERS frame type */
+	if (!b_quic_enc_int(res, b_data(&headers_buf), 8))
+		ABORT_NOW();
+	b_add(res, b_data(&headers_buf));
+
+ end:
+	ret = 0;
+	blk = htx_get_head_blk(htx);
+	while (blk) {
+		type = htx_get_blk_type(blk);
+		ret += htx_get_blksz(blk);
+		blk = htx_remove_blk(htx, blk);
+		if (type == HTX_BLK_EOT)
+			break;
+	}
+
+	TRACE_LEAVE(H3_EV_TX_HDR, qcs->qcc->conn, qcs);
+	return ret;
+
+ err:
+	TRACE_DEVEL("leaving on error", H3_EV_TX_HDR, qcs->qcc->conn, qcs);
+	return -1;
+}
+
+/* Returns the total number of bytes sent. This corresponds to the total size
+ * of HTX blocks removed. A negative error code is returned in case of a fatal
+ * error, which should cause a connection closure.
+ */
+static int h3_resp_data_send(struct qcs *qcs, struct buffer *buf, size_t count)
+{
+	struct htx *htx;
+	struct h3s *h3s = qcs->ctx;
+	struct h3c *h3c = h3s->h3c;
+	struct buffer outbuf;
+	struct buffer *res;
+	size_t total = 0;
+	int bsize, fsize, hsize;
+	struct htx_blk *blk;
+	enum htx_blk_type type;
+
+	TRACE_ENTER(H3_EV_TX_DATA, qcs->qcc->conn, qcs);
+
+	htx = htx_from_buf(buf);
+
+ new_frame:
+	if (!count || htx_is_empty(htx))
+		goto end;
+
+	blk = htx_get_head_blk(htx);
+	type = htx_get_blk_type(blk);
+	fsize = bsize = htx_get_blksz(blk);
+
+	/* h3 DATA headers : 1-byte frame type + varint frame length */
+	hsize = 1 + QUIC_VARINT_MAX_SIZE;
+
+	if (type != HTX_BLK_DATA)
+		goto end;
+
+	res = mux_get_buf(qcs);
+	if (b_is_null(res)) {
+		TRACE_ERROR("cannot allocate Tx buffer", H3_EV_TX_DATA, qcs->qcc->conn, qcs);
+		h3c->err = H3_INTERNAL_ERROR;
+		goto err;
+	}
+
+	if (unlikely(fsize == count &&
+	             !b_data(res) &&
+	             htx_nbblks(htx) == 1 && type == HTX_BLK_DATA)) {
+		void *old_area = res->area;
+
+		/* map an H3 frame to the HTX block so that we can put the
+		 * frame header there.
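+		 * This is a zero-copy fast path: instead of copying the
+		 * payload, the HTX storage area itself is swapped into the
+		 * stream buffer, the 9 reserved bytes (1-byte type plus an
+		 * 8-byte padded length varint) being written just in front of
+		 * the data block.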
+		 */
+		*res = b_make(buf->area, buf->size, sizeof(struct htx) + blk->addr - hsize, fsize + hsize);
+		outbuf = b_make(b_head(res), hsize, 0, 0);
+		b_putchr(&outbuf, 0x00); /* h3 frame type = DATA */
+		b_quic_enc_int(&outbuf, fsize, QUIC_VARINT_MAX_SIZE); /* h3 frame length */
+
+		/* and exchange with our old area */
+		buf->area = old_area;
+		buf->data = buf->head = 0;
+		total += fsize;
+		fsize = 0;
+		goto end;
+	}
+
+	if (fsize > count)
+		fsize = count;
+
+	while (1) {
+		b_reset(&outbuf);
+		outbuf = b_make(b_tail(res), b_contig_space(res), 0, 0);
+		if (b_size(&outbuf) > hsize || !b_space_wraps(res))
+			break;
+		b_slow_realign(res, trash.area, b_data(res));
+	}
+
+	/* Not enough room for headers and at least one data byte, block the
+	 * stream. It is expected that the stream connector layer will subscribe
+	 * on SEND.
+	 */
+	if (b_size(&outbuf) <= hsize) {
+		TRACE_STATE("not enough room for data frame", H3_EV_TX_DATA, qcs->qcc->conn, qcs);
+		qcs->flags |= QC_SF_BLK_MROOM;
+		goto end;
+	}
+
+	if (b_size(&outbuf) < hsize + fsize)
+		fsize = b_size(&outbuf) - hsize;
+	BUG_ON(fsize <= 0);
+
+	b_putchr(&outbuf, 0x00); /* h3 frame type = DATA */
+	b_quic_enc_int(&outbuf, fsize, 0); /* h3 frame length */
+
+	b_putblk(&outbuf, htx_get_blk_ptr(htx, blk), fsize);
+	total += fsize;
+	count -= fsize;
+
+	if (fsize == bsize)
+		htx_remove_blk(htx, blk);
+	else
+		htx_cut_data_blk(htx, blk, fsize);
+
+	/* commit the buffer */
+	b_add(res, b_data(&outbuf));
+	goto new_frame;
+
+ end:
+	TRACE_LEAVE(H3_EV_TX_DATA, qcs->qcc->conn, qcs);
+	return total;
+
+ err:
+	BUG_ON(total); /* Must return the removed HTX size if at least one frame was encoded. */
+	TRACE_DEVEL("leaving on error", H3_EV_TX_DATA, qcs->qcc->conn, qcs);
+	return -1;
+}
+
+static size_t h3_snd_buf(struct qcs *qcs, struct buffer *buf, size_t count)
+{
+	struct h3s *h3s = qcs->ctx;
+	struct h3c *h3c = h3s->h3c;
+	size_t total = 0;
+	enum htx_blk_type btype;
+	struct htx *htx;
+	struct htx_blk *blk;
+	uint32_t bsize;
+	int32_t idx;
+	int ret = 0;
+
+	TRACE_ENTER(H3_EV_STRM_SEND, qcs->qcc->conn, qcs);
+
+	htx = htx_from_buf(buf);
+
+	if (htx->extra && htx->extra == HTX_UNKOWN_PAYLOAD_LENGTH)
+		qcs->flags |= QC_SF_UNKNOWN_PL_LENGTH;
+
+	while (count && !htx_is_empty(htx) &&
+	       !(qcs->flags & QC_SF_BLK_MROOM) && !h3c->err) {
+
+		idx = htx_get_head(htx);
+		blk = htx_get_blk(htx, idx);
+		btype = htx_get_blk_type(blk);
+		bsize = htx_get_blksz(blk);
+
+		/* Not implemented : QUIC on backend side */
+		BUG_ON(btype == HTX_BLK_REQ_SL);
+
+		switch (btype) {
+		case HTX_BLK_RES_SL:
+			/* start-line -> HEADERS h3 frame */
+			ret = h3_resp_headers_send(qcs, htx);
+			if (ret > 0) {
+				total += ret;
+				count -= ret;
+				if (ret < bsize)
+					goto out;
+			}
+			break;
+
+		case HTX_BLK_DATA:
+			ret = h3_resp_data_send(qcs, buf, count);
+			if (ret > 0) {
+				htx = htx_from_buf(buf);
+				total += ret;
+				count -= ret;
+				if (ret < bsize)
+					goto out;
+			}
+			break;
+
+		case HTX_BLK_TLR:
+		case HTX_BLK_EOT:
+			ret = h3_resp_trailers_send(qcs, htx);
+			if (ret > 0) {
+				total += ret;
+				count -= ret;
+				if (ret < bsize)
+					goto out;
+			}
+			break;
+
+		default:
+			htx_remove_blk(htx, blk);
+			total += bsize;
+			count -= bsize;
+			break;
+		}
+
+		/* If an error occurred, either buffer space or a connection
+		 * error must be set to break the current loop.
+		 */
+		BUG_ON(ret < 0 && !(qcs->flags & QC_SF_BLK_MROOM) && !h3c->err);
+	}
+
+	/* Interrupt sending on connection error. */
+	if (unlikely(h3c->err)) {
+		qcc_set_error(qcs->qcc, h3c->err, 1);
+		goto out;
+	}
+
+	/* RFC 9114 4.1.
HTTP Message Framing + * + * A server can send a complete response prior to the client sending an + * entire request if the response does not depend on any portion of the + * request that has not been sent and received. When the server does not + * need to receive the remainder of the request, it MAY abort reading + * the request stream, send a complete response, and cleanly close the + * sending part of the stream. The error code H3_NO_ERROR SHOULD be used + * when requesting that the client stop sending on the request stream. + * Clients MUST NOT discard complete responses as a result of having + * their request terminated abruptly, though clients can always discard + * responses at their discretion for other reasons. If the server sends + * a partial or complete response but does not abort reading the + * request, clients SHOULD continue sending the content of the request + * and close the stream normally. + */ + if (unlikely((htx->flags & HTX_FL_EOM) && htx_is_empty(htx)) && + !qcs_is_close_remote(qcs)) { + /* Generate a STOP_SENDING if full response transferred before + * receiving the full request. + */ + qcs->err = H3_NO_ERROR; + qcc_abort_stream_read(qcs); + } + + out: + htx_to_buf(htx, buf); + + TRACE_LEAVE(H3_EV_STRM_SEND, qcs->qcc->conn, qcs); + return total; +} + +static size_t h3_nego_ff(struct qcs *qcs, size_t count) +{ + struct buffer *res; + int hsize; + size_t sz, ret = 0; + + TRACE_ENTER(H3_EV_STRM_SEND, qcs->qcc->conn, qcs); + + res = mux_get_buf(qcs); + if (b_is_null(res)) { + qcs->sd->iobuf.flags |= IOBUF_FL_NO_FF; + goto end; + } + + /* h3 DATA headers : 1-byte frame type + varint frame length */ + hsize = 1 + QUIC_VARINT_MAX_SIZE; + while (1) { + if (b_contig_space(res) >= hsize || !b_space_wraps(res)) + break; + b_slow_realign(res, trash.area, b_data(res)); + } + + /* Not enough room for headers and at least one data byte, block the + * stream. It is expected that the stream connector layer will subscribe + * on SEND. + */ + if (b_contig_space(res) <= hsize) { + qcs->flags |= QC_SF_BLK_MROOM; + qcs->sd->iobuf.flags |= IOBUF_FL_FF_BLOCKED; + goto end; + } + + /* Cannot forward more than available room in output buffer */ + sz = b_contig_space(res) - hsize; + if (count > sz) + count = sz; + + qcs->sd->iobuf.buf = res; + qcs->sd->iobuf.offset = hsize; + qcs->sd->iobuf.data = 0; + + ret = count; + end: + TRACE_LEAVE(H3_EV_STRM_SEND, qcs->qcc->conn, qcs); + return ret; +} + +static size_t h3_done_ff(struct qcs *qcs) +{ + size_t total = qcs->sd->iobuf.data; + TRACE_ENTER(H3_EV_STRM_SEND, qcs->qcc->conn, qcs); + + h3_debug_printf(stderr, "%s\n", __func__); + + if (qcs->sd->iobuf.data) { + b_sub(qcs->sd->iobuf.buf, qcs->sd->iobuf.data); + b_putchr(qcs->sd->iobuf.buf, 0x00); /* h3 frame type = DATA */ + b_quic_enc_int(qcs->sd->iobuf.buf, qcs->sd->iobuf.data, QUIC_VARINT_MAX_SIZE); /* h3 frame length */ + b_add(qcs->sd->iobuf.buf, qcs->sd->iobuf.data); + } + + qcs->sd->iobuf.buf = NULL; + qcs->sd->iobuf.offset = 0; + qcs->sd->iobuf.data = 0; + + TRACE_LEAVE(H3_EV_STRM_SEND, qcs->qcc->conn, qcs); + return total; +} + +/* Notify about a closure on <qcs> stream requested by the remote peer. + * + * Stream channel <side> is explained relative to our endpoint : WR for + * STOP_SENDING or RD for RESET_STREAM reception. Callback decode_qcs() is used + * instead for closure performed using a STREAM frame with FIN bit. + * + * The main objective of this function is to check if closure is valid + * according to HTTP/3 specification. + * + * Returns 0 on success else non-zero. 
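+ * (Currently the only failure handled here is a closure attempt on the
+ * control stream, checked at the top of the function.)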
A CONNECTION_CLOSE is generated on
+ * error.
+ */
+static int h3_close(struct qcs *qcs, enum qcc_app_ops_close_side side)
+{
+	struct h3s *h3s = qcs->ctx;
+	struct h3c *h3c = h3s->h3c;
+
+	/* RFC 9114 6.2.1. Control Streams
+	 *
+	 * The sender
+	 * MUST NOT close the control stream, and the receiver MUST NOT
+	 * request that the sender close the control stream. If either
+	 * control stream is closed at any point, this MUST be treated
+	 * as a connection error of type H3_CLOSED_CRITICAL_STREAM.
+	 */
+	if (qcs == h3c->ctrl_strm || h3s->type == H3S_T_CTRL) {
+		TRACE_ERROR("closure detected on control stream", H3_EV_H3S_END, qcs->qcc->conn, qcs);
+		qcc_set_error(qcs->qcc, H3_CLOSED_CRITICAL_STREAM, 1);
+		return 1;
+	}
+
+	return 0;
+}
+
+static int h3_attach(struct qcs *qcs, void *conn_ctx)
+{
+	struct h3c *h3c = conn_ctx;
+	struct h3s *h3s = NULL;
+
+	TRACE_ENTER(H3_EV_H3S_NEW, qcs->qcc->conn, qcs);
+
+	/* RFC 9114 5.2. Connection Shutdown
+	 *
+	 * Upon sending
+	 * a GOAWAY frame, the endpoint SHOULD explicitly cancel (see
+	 * Sections 4.1.1 and 7.2.3) any requests or pushes that have
+	 * identifiers greater than or equal to the one indicated, in
+	 * order to clean up transport state for the affected streams.
+	 * The endpoint SHOULD continue to do so as more requests or
+	 * pushes arrive.
+	 */
+	if (h3c->flags & H3_CF_GOAWAY_SENT && qcs->id >= h3c->id_goaway &&
+	    quic_stream_is_bidi(qcs->id)) {
+		/* Reject the request and do not allocate an h3s context.
+		 * TODO support push uni-stream rejection.
+		 */
+		TRACE_STATE("reject stream higher than goaway", H3_EV_H3S_NEW, qcs->qcc->conn, qcs);
+		qcc_abort_stream_read(qcs);
+		qcc_reset_stream(qcs, H3_REQUEST_REJECTED);
+		goto done;
+	}
+
+	h3s = pool_alloc(pool_head_h3s);
+	if (!h3s) {
+		TRACE_ERROR("h3s allocation failure", H3_EV_H3S_NEW, qcs->qcc->conn, qcs);
+		goto err;
+	}
+
+	qcs->ctx = h3s;
+	h3s->h3c = conn_ctx;
+
+	h3s->demux_frame_len = 0;
+	h3s->demux_frame_type = H3_FT_UNINIT;
+	h3s->body_len = 0;
+	h3s->data_len = 0;
+	h3s->flags = 0;
+	h3s->err = 0;
+
+	if (quic_stream_is_bidi(qcs->id)) {
+		h3s->type = H3S_T_REQ;
+		h3s->st_req = H3S_ST_REQ_BEFORE;
+		qcs_wait_http_req(qcs);
+	}
+	else {
+		/* stream type must be decoded for unidirectional streams */
+		h3s->type = H3S_T_UNKNOWN;
+	}
+
+ done:
+	TRACE_LEAVE(H3_EV_H3S_NEW, qcs->qcc->conn, qcs);
+	return 0;
+
+ err:
+	TRACE_DEVEL("leaving in error", H3_EV_H3S_NEW, qcs->qcc->conn, qcs);
+	return 1;
+}
+
+static void h3_detach(struct qcs *qcs)
+{
+	struct h3s *h3s = qcs->ctx;
+
+	TRACE_ENTER(H3_EV_H3S_END, qcs->qcc->conn, qcs);
+
+	pool_free(pool_head_h3s, h3s);
+	qcs->ctx = NULL;
+
+	TRACE_LEAVE(H3_EV_H3S_END, qcs->qcc->conn, qcs);
+}
+
+/* Initialize H3 control stream and prepare SETTINGS emission.
+ *
+ * Returns 0 on success, else non-zero.
+ */
+static int h3_finalize(void *ctx)
+{
+	struct h3c *h3c = ctx;
+	struct qcc *qcc = h3c->qcc;
+	struct qcs *qcs;
+
+	TRACE_ENTER(H3_EV_H3C_NEW, qcc->conn);
+
+	qcs = qcc_init_stream_local(h3c->qcc, 0);
+	if (!qcs) {
+		TRACE_ERROR("cannot init control stream", H3_EV_H3C_NEW, qcc->conn);
+		goto err;
+	}
+
+	h3c->ctrl_strm = qcs;
+
+	if (h3_control_send(qcs, h3c) < 0)
+		goto err;
+
+	TRACE_LEAVE(H3_EV_H3C_NEW, qcc->conn);
+	return 0;
+
+ err:
+	TRACE_DEVEL("leaving on error", H3_EV_H3C_NEW, qcc->conn);
+	return 1;
+}
+
+/* Generate a GOAWAY frame for the <h3c> connection on the control stream.
+ *
+ * Returns 0 on success, else non-zero.
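+ * (Illustrative layout for a hypothetical id_goaway of 8: the frame is
+ * emitted as 07 01 08, i.e. type 0x07, a varint length of 1, then the
+ * identifier itself as a varint.)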
+ */ +static int h3_send_goaway(struct h3c *h3c) +{ + struct qcs *qcs = h3c->ctrl_strm; + struct buffer pos, *res; + unsigned char data[3 * QUIC_VARINT_MAX_SIZE]; + size_t frm_len = quic_int_getsize(h3c->id_goaway); + + TRACE_ENTER(H3_EV_H3C_END, h3c->qcc->conn); + + if (!qcs) { + TRACE_ERROR("control stream not initialized", H3_EV_H3C_END, h3c->qcc->conn); + goto err; + } + + pos = b_make((char *)data, sizeof(data), 0, 0); + + b_quic_enc_int(&pos, H3_FT_GOAWAY, 0); + b_quic_enc_int(&pos, frm_len, 0); + b_quic_enc_int(&pos, h3c->id_goaway, 0); + + res = mux_get_buf(qcs); + if (b_is_null(res) || b_room(res) < b_data(&pos)) { + /* Do not try forcefully to emit GOAWAY if no space left. */ + TRACE_ERROR("cannot send GOAWAY", H3_EV_H3C_END, h3c->qcc->conn, qcs); + goto err; + } + + b_force_xfer(res, &pos, b_data(&pos)); + qcc_send_stream(qcs, 1); + + h3c->flags |= H3_CF_GOAWAY_SENT; + TRACE_LEAVE(H3_EV_H3C_END, h3c->qcc->conn); + return 0; + + err: + /* Consider GOAWAY as sent even if not really the case. This will + * block future stream opening using H3_REQUEST_REJECTED reset. + */ + h3c->flags |= H3_CF_GOAWAY_SENT; + TRACE_DEVEL("leaving in error", H3_EV_H3C_END, h3c->qcc->conn); + return 1; +} + +/* Initialize the HTTP/3 context for <qcc> mux. + * Return 1 if succeeded, 0 if not. + */ +static int h3_init(struct qcc *qcc) +{ + struct h3c *h3c; + struct quic_conn *qc = qcc->conn->handle.qc; + + TRACE_ENTER(H3_EV_H3C_NEW, qcc->conn); + + h3c = pool_alloc(pool_head_h3c); + if (!h3c) { + TRACE_ERROR("cannot allocate h3c", H3_EV_H3C_NEW, qcc->conn); + goto fail_no_h3; + } + + h3c->qcc = qcc; + h3c->ctrl_strm = NULL; + h3c->err = 0; + h3c->flags = 0; + h3c->id_goaway = 0; + + qcc->ctx = h3c; + /* TODO cleanup only ref to quic_conn */ + h3c->prx_counters = + EXTRA_COUNTERS_GET(qc->li->bind_conf->frontend->extra_counters_fe, + &h3_stats_module); + LIST_INIT(&h3c->buf_wait.list); + + TRACE_LEAVE(H3_EV_H3C_NEW, qcc->conn); + return 1; + + fail_no_h3: + TRACE_DEVEL("leaving on error", H3_EV_H3C_NEW, qcc->conn); + return 0; +} + +/* Send a HTTP/3 GOAWAY followed by a CONNECTION_CLOSE_APP. */ +static void h3_shutdown(void *ctx) +{ + struct h3c *h3c = ctx; + + TRACE_ENTER(H3_EV_H3C_END, h3c->qcc->conn); + + /* RFC 9114 5.2. Connection Shutdown + * + * Even when a connection is not idle, either endpoint can decide to + * stop using the connection and initiate a graceful connection close. + * Endpoints initiate the graceful shutdown of an HTTP/3 connection by + * sending a GOAWAY frame. + */ + h3_send_goaway(h3c); + + /* RFC 9114 5.2. Connection Shutdown + * + * An endpoint that completes a + * graceful shutdown SHOULD use the H3_NO_ERROR error code when closing + * the connection. 
+ */ + h3c->qcc->err = quic_err_app(H3_NO_ERROR); + + TRACE_LEAVE(H3_EV_H3C_END, h3c->qcc->conn); +} + +static void h3_release(void *ctx) +{ + struct h3c *h3c = ctx; + pool_free(pool_head_h3c, h3c); +} + +/* Increment the h3 error code counters for <error_code> value */ +static void h3_stats_inc_err_cnt(void *ctx, int err_code) +{ + struct h3c *h3c = ctx; + + h3_inc_err_cnt(h3c->prx_counters, err_code); +} + +static inline const char *h3_ft_str(uint64_t type) +{ + switch (type) { + case H3_FT_DATA: return "DATA"; + case H3_FT_HEADERS: return "HEADERS"; + case H3_FT_SETTINGS: return "SETTINGS"; + case H3_FT_PUSH_PROMISE: return "PUSH_PROMISE"; + case H3_FT_MAX_PUSH_ID: return "MAX_PUSH_ID"; + case H3_FT_CANCEL_PUSH: return "CANCEL_PUSH"; + case H3_FT_GOAWAY: return "GOAWAY"; + default: return "_UNKNOWN_"; + } +} + +/* h3 trace handler */ +static void h3_trace(enum trace_level level, uint64_t mask, + const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4) +{ + const struct connection *conn = a1; + const struct qcc *qcc = conn ? conn->ctx : NULL; + const struct qcs *qcs = a2; + const struct h3s *h3s = qcs ? qcs->ctx : NULL; + + if (!qcc) + return; + + if (src->verbosity > H3_VERB_CLEAN) { + chunk_appendf(&trace_buf, " : qcc=%p(F)", qcc); + if (qcc->conn->handle.qc) + chunk_appendf(&trace_buf, " qc=%p", qcc->conn->handle.qc); + + if (qcs) + chunk_appendf(&trace_buf, " qcs=%p(%llu)", qcs, (ull)qcs->id); + + if (h3s && h3s->demux_frame_type != H3_FT_UNINIT) { + chunk_appendf(&trace_buf, " h3s.dem=%s/%llu", + h3_ft_str(h3s->demux_frame_type), (ull)h3s->demux_frame_len); + } + } +} + +/* HTTP/3 application layer operations */ +const struct qcc_app_ops h3_ops = { + .init = h3_init, + .attach = h3_attach, + .decode_qcs = h3_decode_qcs, + .snd_buf = h3_snd_buf, + .nego_ff = h3_nego_ff, + .done_ff = h3_done_ff, + .close = h3_close, + .detach = h3_detach, + .finalize = h3_finalize, + .shutdown = h3_shutdown, + .inc_err_cnt = h3_stats_inc_err_cnt, + .release = h3_release, +}; diff --git a/src/h3_stats.c b/src/h3_stats.c new file mode 100644 index 0000000..c96093f --- /dev/null +++ b/src/h3_stats.c @@ -0,0 +1,276 @@ +#include <haproxy/h3.h> +#include <haproxy/stats.h> + +enum { + /* h3 frame type counters */ + H3_ST_DATA, + H3_ST_HEADERS, + H3_ST_CANCEL_PUSH, + H3_ST_PUSH_PROMISE, + H3_ST_MAX_PUSH_ID, + H3_ST_GOAWAY, + H3_ST_SETTINGS, + /* h3 error counters */ + H3_ST_H3_NO_ERROR, + H3_ST_H3_GENERAL_PROTOCOL_ERROR, + H3_ST_H3_INTERNAL_ERROR, + H3_ST_H3_STREAM_CREATION_ERROR, + H3_ST_H3_CLOSED_CRITICAL_STREAM, + H3_ST_H3_FRAME_UNEXPECTED, + H3_ST_H3_FRAME_ERROR, + H3_ST_H3_EXCESSIVE_LOAD, + H3_ST_H3_ID_ERROR, + H3_ST_H3_SETTINGS_ERROR, + H3_ST_H3_MISSING_SETTINGS, + H3_ST_H3_REQUEST_REJECTED, + H3_ST_H3_REQUEST_CANCELLED, + H3_ST_H3_REQUEST_INCOMPLETE, + H3_ST_H3_MESSAGE_ERROR, + H3_ST_H3_CONNECT_ERROR, + H3_ST_H3_VERSION_FALLBACK, + /* QPACK error counters */ + H3_ST_QPACK_DECOMPRESSION_FAILED, + H3_ST_QPACK_ENCODER_STREAM_ERROR, + H3_ST_QPACK_DECODER_STREAM_ERROR, + H3_STATS_COUNT /* must be the last */ +}; + +static struct name_desc h3_stats[] = { + /* h3 frame type counters */ + [H3_ST_DATA] = { .name = "h3_data", + .desc = "Total number of DATA frames received" }, + [H3_ST_HEADERS] = { .name = "h3_headers", + .desc = "Total number of HEADERS frames received" }, + [H3_ST_CANCEL_PUSH] = { .name = "h3_cancel_push", + .desc = "Total number of CANCEL_PUSH frames received" }, + [H3_ST_PUSH_PROMISE] = { .name = 
"h3_push_promise", + .desc = "Total number of PUSH_PROMISE frames received" }, + [H3_ST_MAX_PUSH_ID] = { .name = "h3_max_push_id", + .desc = "Total number of MAX_PUSH_ID frames received" }, + [H3_ST_GOAWAY] = { .name = "h3_goaway", + .desc = "Total number of GOAWAY frames received" }, + [H3_ST_SETTINGS] = { .name = "h3_settings", + .desc = "Total number of SETTINGS frames received" }, + /* h3 error counters */ + [H3_ST_H3_NO_ERROR] = { .name = "h3_no_error", + .desc = "Total number of H3_NO_ERROR errors received" }, + [H3_ST_H3_GENERAL_PROTOCOL_ERROR] = { .name = "h3_general_protocol_error", + .desc = "Total number of H3_GENERAL_PROTOCOL_ERROR errors received" }, + [H3_ST_H3_INTERNAL_ERROR] = { .name = "h3_internal_error", + .desc = "Total number of H3_INTERNAL_ERROR errors received" }, + [H3_ST_H3_STREAM_CREATION_ERROR] = { .name = "h3_stream_creation_error", + .desc = "Total number of H3_STREAM_CREATION_ERROR errors received" }, + [H3_ST_H3_CLOSED_CRITICAL_STREAM] = { .name = "h3_closed_critical_stream", + .desc = "Total number of H3_CLOSED_CRITICAL_STREAM errors received" }, + [H3_ST_H3_FRAME_UNEXPECTED] = { .name = "h3_frame_unexpected", + .desc = "Total number of H3_FRAME_UNEXPECTED errors received" }, + [H3_ST_H3_FRAME_ERROR] = { .name = "h3_frame_error", + .desc = "Total number of H3_FRAME_ERROR errors received" }, + [H3_ST_H3_EXCESSIVE_LOAD] = { .name = "h3_excessive_load", + .desc = "Total number of H3_EXCESSIVE_LOAD errors received" }, + [H3_ST_H3_ID_ERROR] = { .name = "h3_id_error", + .desc = "Total number of H3_ID_ERROR errors received" }, + [H3_ST_H3_SETTINGS_ERROR] = { .name = "h3_settings_error", + .desc = "Total number of H3_SETTINGS_ERROR errors received" }, + [H3_ST_H3_MISSING_SETTINGS] = { .name = "h3_missing_settings", + .desc = "Total number of H3_MISSING_SETTINGS errors received" }, + [H3_ST_H3_REQUEST_REJECTED] = { .name = "h3_request_rejected", + .desc = "Total number of H3_REQUEST_REJECTED errors received" }, + [H3_ST_H3_REQUEST_CANCELLED] = { .name = "h3_request_cancelled", + .desc = "Total number of H3_REQUEST_CANCELLED errors received" }, + [H3_ST_H3_REQUEST_INCOMPLETE] = { .name = "h3_request_incomplete", + .desc = "Total number of H3_REQUEST_INCOMPLETE errors received" }, + [H3_ST_H3_MESSAGE_ERROR] = { .name = "h3_message_error", + .desc = "Total number of H3_MESSAGE_ERROR errors received" }, + [H3_ST_H3_CONNECT_ERROR] = { .name = "h3_connect_error", + .desc = "Total number of H3_CONNECT_ERROR errors received" }, + [H3_ST_H3_VERSION_FALLBACK] = { .name = "h3_version_fallback", + .desc = "Total number of H3_VERSION_FALLBACK errors received" }, + /* QPACK error counters */ + [H3_ST_QPACK_DECOMPRESSION_FAILED] = { .name = "pack_decompression_failed", + .desc = "Total number of QPACK_DECOMPRESSION_FAILED errors received" }, + [H3_ST_QPACK_ENCODER_STREAM_ERROR] = { .name = "qpack_encoder_stream_error", + .desc = "Total number of QPACK_ENCODER_STREAM_ERROR errors received" }, + [H3_ST_QPACK_DECODER_STREAM_ERROR] = { .name = "qpack_decoder_stream_error", + .desc = "Total number of QPACK_DECODER_STREAM_ERROR errors received" }, +}; + +static struct h3_counters { + /* h3 frame type counters */ + long long h3_data; /* total number of DATA frames received */ + long long h3_headers; /* total number of HEADERS frames received */ + long long h3_cancel_push; /* total number of CANCEL_PUSH frames received */ + long long h3_push_promise; /* total number of PUSH_PROMISE frames received */ + long long h3_max_push_id; /* total number of MAX_PUSH_ID frames received */ + long long 
h3_goaway; /* total number of GOAWAY frames received */ + long long h3_settings; /* total number of SETTINGS frames received */ + /* h3 error counters */ + long long h3_no_error; /* total number of H3_NO_ERROR errors received */ + long long h3_general_protocol_error; /* total number of H3_GENERAL_PROTOCOL_ERROR errors received */ + long long h3_internal_error; /* total number of H3_INTERNAL_ERROR errors received */ + long long h3_stream_creation_error; /* total number of H3_STREAM_CREATION_ERROR errors received */ + long long h3_closed_critical_stream; /* total number of H3_CLOSED_CRITICAL_STREAM errors received */ + long long h3_frame_unexpected; /* total number of H3_FRAME_UNEXPECTED errors received */ + long long h3_frame_error; /* total number of H3_FRAME_ERROR errors received */ + long long h3_excessive_load; /* total number of H3_EXCESSIVE_LOAD errors received */ + long long h3_id_error; /* total number of H3_ID_ERROR errors received */ + long long h3_settings_error; /* total number of H3_SETTINGS_ERROR errors received */ + long long h3_missing_settings; /* total number of H3_MISSING_SETTINGS errors received */ + long long h3_request_rejected; /* total number of H3_REQUEST_REJECTED errors received */ + long long h3_request_cancelled; /* total number of H3_REQUEST_CANCELLED errors received */ + long long h3_request_incomplete; /* total number of H3_REQUEST_INCOMPLETE errors received */ + long long h3_message_error; /* total number of H3_MESSAGE_ERROR errors received */ + long long h3_connect_error; /* total number of H3_CONNECT_ERROR errors received */ + long long h3_version_fallback; /* total number of H3_VERSION_FALLBACK errors received */ + /* QPACK error counters */ + long long qpack_decompression_failed; /* total number of QPACK_DECOMPRESSION_FAILED errors received */ + long long qpack_encoder_stream_error; /* total number of QPACK_ENCODER_STREAM_ERROR errors received */ + long long qpack_decoder_stream_error; /* total number of QPACK_DECODER_STREAM_ERROR errors received */ +} h3_counters; + +static void h3_fill_stats(void *data, struct field *stats) +{ + struct h3_counters *counters = data; + + /* h3 frame type counters */ + stats[H3_ST_DATA] = mkf_u64(FN_COUNTER, counters->h3_data); + stats[H3_ST_HEADERS] = mkf_u64(FN_COUNTER, counters->h3_headers); + stats[H3_ST_CANCEL_PUSH] = mkf_u64(FN_COUNTER, counters->h3_cancel_push); + stats[H3_ST_PUSH_PROMISE] = mkf_u64(FN_COUNTER, counters->h3_push_promise); + stats[H3_ST_MAX_PUSH_ID] = mkf_u64(FN_COUNTER, counters->h3_max_push_id); + stats[H3_ST_GOAWAY] = mkf_u64(FN_COUNTER, counters->h3_goaway); + stats[H3_ST_SETTINGS] = mkf_u64(FN_COUNTER, counters->h3_settings); + /* h3 error counters */ + stats[H3_ST_H3_NO_ERROR] = mkf_u64(FN_COUNTER, counters->h3_no_error); + stats[H3_ST_H3_GENERAL_PROTOCOL_ERROR] = mkf_u64(FN_COUNTER, counters->h3_general_protocol_error); + stats[H3_ST_H3_INTERNAL_ERROR] = mkf_u64(FN_COUNTER, counters->h3_internal_error); + stats[H3_ST_H3_STREAM_CREATION_ERROR] = mkf_u64(FN_COUNTER, counters->h3_stream_creation_error); + stats[H3_ST_H3_CLOSED_CRITICAL_STREAM] = mkf_u64(FN_COUNTER, counters->h3_closed_critical_stream); + stats[H3_ST_H3_FRAME_UNEXPECTED] = mkf_u64(FN_COUNTER, counters->h3_frame_unexpected); + stats[H3_ST_H3_FRAME_ERROR] = mkf_u64(FN_COUNTER, counters->h3_frame_error); + stats[H3_ST_H3_EXCESSIVE_LOAD] = mkf_u64(FN_COUNTER, counters->h3_excessive_load); + stats[H3_ST_H3_ID_ERROR] = mkf_u64(FN_COUNTER, counters->h3_id_error); + stats[H3_ST_H3_SETTINGS_ERROR] = mkf_u64(FN_COUNTER, 
counters->h3_settings_error); + stats[H3_ST_H3_MISSING_SETTINGS] = mkf_u64(FN_COUNTER, counters->h3_missing_settings); + stats[H3_ST_H3_REQUEST_REJECTED] = mkf_u64(FN_COUNTER, counters->h3_request_rejected); + stats[H3_ST_H3_REQUEST_CANCELLED] = mkf_u64(FN_COUNTER, counters->h3_request_cancelled); + stats[H3_ST_H3_REQUEST_INCOMPLETE] = mkf_u64(FN_COUNTER, counters->h3_request_incomplete); + stats[H3_ST_H3_MESSAGE_ERROR] = mkf_u64(FN_COUNTER, counters->h3_message_error); + stats[H3_ST_H3_CONNECT_ERROR] = mkf_u64(FN_COUNTER, counters->h3_connect_error); + stats[H3_ST_H3_VERSION_FALLBACK] = mkf_u64(FN_COUNTER, counters->h3_version_fallback); + /* QPACK error counters */ + stats[H3_ST_QPACK_DECOMPRESSION_FAILED] = mkf_u64(FN_COUNTER, counters->qpack_decompression_failed); + stats[H3_ST_QPACK_ENCODER_STREAM_ERROR] = mkf_u64(FN_COUNTER, counters->qpack_encoder_stream_error); + stats[H3_ST_QPACK_DECODER_STREAM_ERROR] = mkf_u64(FN_COUNTER, counters->qpack_decoder_stream_error); +} + +struct stats_module h3_stats_module = { + .name = "h3", + .fill_stats = h3_fill_stats, + .stats = h3_stats, + .stats_count = H3_STATS_COUNT, + .counters = &h3_counters, + .counters_size = sizeof(h3_counters), + .domain_flags = MK_STATS_PROXY_DOMAIN(STATS_PX_CAP_FE), + .clearable = 1, +}; + +INITCALL1(STG_REGISTER, stats_register_module, &h3_stats_module); + +void h3_inc_err_cnt(struct h3_counters *ctrs, int error_code) +{ + switch (error_code) { + case H3_NO_ERROR: + HA_ATOMIC_INC(&ctrs->h3_no_error); + break; + case H3_GENERAL_PROTOCOL_ERROR: + HA_ATOMIC_INC(&ctrs->h3_general_protocol_error); + break; + case H3_INTERNAL_ERROR: + HA_ATOMIC_INC(&ctrs->h3_internal_error); + break; + case H3_STREAM_CREATION_ERROR: + HA_ATOMIC_INC(&ctrs->h3_stream_creation_error); + break; + case H3_CLOSED_CRITICAL_STREAM: + HA_ATOMIC_INC(&ctrs->h3_closed_critical_stream); + break; + case H3_FRAME_UNEXPECTED: + HA_ATOMIC_INC(&ctrs->h3_frame_unexpected); + break; + case H3_FRAME_ERROR: + HA_ATOMIC_INC(&ctrs->h3_frame_error); + break; + case H3_EXCESSIVE_LOAD: + HA_ATOMIC_INC(&ctrs->h3_excessive_load); + break; + case H3_ID_ERROR: + HA_ATOMIC_INC(&ctrs->h3_id_error); + break; + case H3_SETTINGS_ERROR: + HA_ATOMIC_INC(&ctrs->h3_settings_error); + break; + case H3_MISSING_SETTINGS: + HA_ATOMIC_INC(&ctrs->h3_missing_settings); + break; + case H3_REQUEST_REJECTED: + HA_ATOMIC_INC(&ctrs->h3_request_rejected); + break; + case H3_REQUEST_CANCELLED: + HA_ATOMIC_INC(&ctrs->h3_request_cancelled); + break; + case H3_REQUEST_INCOMPLETE: + HA_ATOMIC_INC(&ctrs->h3_request_incomplete); + break; + case H3_MESSAGE_ERROR: + HA_ATOMIC_INC(&ctrs->h3_message_error); + break; + case H3_CONNECT_ERROR: + HA_ATOMIC_INC(&ctrs->h3_connect_error); + break; + case H3_VERSION_FALLBACK: + HA_ATOMIC_INC(&ctrs->h3_version_fallback); + break; + case QPACK_DECOMPRESSION_FAILED: + HA_ATOMIC_INC(&ctrs->qpack_decompression_failed); + break; + case QPACK_ENCODER_STREAM_ERROR: + HA_ATOMIC_INC(&ctrs->qpack_encoder_stream_error); + break; + case QPACK_DECODER_STREAM_ERROR: + HA_ATOMIC_INC(&ctrs->qpack_decoder_stream_error); + break; + default: + break; + + } +} + +void h3_inc_frame_type_cnt(struct h3_counters *ctrs, int frm_type) +{ + switch (frm_type) { + case H3_FT_DATA: + HA_ATOMIC_INC(&ctrs->h3_data); + break; + case H3_FT_HEADERS: + HA_ATOMIC_INC(&ctrs->h3_headers); + break; + case H3_FT_CANCEL_PUSH: + HA_ATOMIC_INC(&ctrs->h3_cancel_push); + break; + case H3_FT_PUSH_PROMISE: + HA_ATOMIC_INC(&ctrs->h3_push_promise); + break; + case H3_FT_MAX_PUSH_ID: + 
HA_ATOMIC_INC(&ctrs->h3_max_push_id); + break; + case H3_FT_GOAWAY: + HA_ATOMIC_INC(&ctrs->h3_goaway); + break; + case H3_FT_SETTINGS: + HA_ATOMIC_INC(&ctrs->h3_settings); + break; + default: + break; + } +} diff --git a/src/haproxy.c b/src/haproxy.c new file mode 100644 index 0000000..4c739f4 --- /dev/null +++ b/src/haproxy.c @@ -0,0 +1,3962 @@ +/* + * HAProxy : High Availability-enabled HTTP/TCP proxy + * Copyright 2000-2024 Willy Tarreau <willy@haproxy.org>. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <ctype.h> +#include <dirent.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/tcp.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <netdb.h> +#include <fcntl.h> +#include <errno.h> +#include <signal.h> +#include <stdarg.h> +#include <sys/resource.h> +#include <sys/utsname.h> +#include <sys/wait.h> +#include <time.h> +#include <syslog.h> +#include <grp.h> + +#ifdef USE_THREAD +#include <pthread.h> +#endif + +#ifdef USE_CPU_AFFINITY +#include <sched.h> +#if defined(__FreeBSD__) || defined(__DragonFly__) +#include <sys/param.h> +#ifdef __FreeBSD__ +#include <sys/cpuset.h> +#endif +#endif +#endif + +#if defined(USE_PRCTL) +#include <sys/prctl.h> +#endif + +#if defined(USE_PROCCTL) +#include <sys/procctl.h> +#endif + +#ifdef DEBUG_FULL +#include <assert.h> +#endif +#if defined(USE_SYSTEMD) +#include <systemd/sd-daemon.h> +#endif + +#include <import/sha1.h> + +#include <haproxy/acl.h> +#include <haproxy/action.h> +#include <haproxy/activity.h> +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/auth.h> +#include <haproxy/base64.h> +#include <haproxy/capture-t.h> +#include <haproxy/cfgcond.h> +#include <haproxy/cfgdiag.h> +#include <haproxy/cfgparse.h> +#include <haproxy/chunk.h> +#include <haproxy/cli.h> +#include <haproxy/clock.h> +#include <haproxy/connection.h> +#ifdef USE_CPU_AFFINITY +#include <haproxy/cpuset.h> +#endif +#include <haproxy/debug.h> +#include <haproxy/dns.h> +#include <haproxy/dynbuf.h> +#include <haproxy/errors.h> +#include <haproxy/fd.h> +#include <haproxy/filters.h> +#include <haproxy/global.h> +#include <haproxy/hlua.h> +#include <haproxy/http_rules.h> +#if defined(USE_LINUX_CAP) +#include <haproxy/linuxcap.h> +#endif +#include <haproxy/list.h> +#include <haproxy/listener.h> +#include <haproxy/log.h> +#include <haproxy/mworker.h> +#include <haproxy/namespace.h> +#include <haproxy/net_helper.h> +#include <haproxy/openssl-compat.h> +#include <haproxy/quic_conn.h> +#include <haproxy/quic_tp-t.h> +#include <haproxy/pattern.h> +#include <haproxy/peers.h> +#include <haproxy/pool.h> +#include <haproxy/protocol.h> +#include <haproxy/proto_tcp.h> +#include <haproxy/proxy.h> +#include <haproxy/regex.h> +#include <haproxy/sample.h> +#include <haproxy/server.h> +#include <haproxy/session.h> +#include <haproxy/signal.h> +#include <haproxy/sock.h> +#include <haproxy/sock_inet.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/stats-t.h> +#include <haproxy/stream.h> +#include <haproxy/task.h> +#include <haproxy/thread.h> +#include <haproxy/time.h> +#include <haproxy/tools.h> +#include <haproxy/trace.h> +#include <haproxy/uri_auth-t.h> +#include 
<haproxy/vars.h> +#include <haproxy/version.h> + + +/* array of init calls for older platforms */ +DECLARE_INIT_STAGES; + +/* create a read_mostly section to hold variables which are accessed a lot + * but which almost never change. The purpose is to isolate them in their + * own cache lines where they don't risk to be perturbated by write accesses + * to neighbor variables. We need to create an empty aligned variable for + * this. The fact that the variable is of size zero means that it will be + * eliminated at link time if no other variable uses it, but alignment will + * be respected. + */ +empty_t __read_mostly_align HA_SECTION("read_mostly") ALIGNED(64); + +#ifdef BUILD_FEATURES +char *build_features = BUILD_FEATURES; +#else +char *build_features = ""; +#endif + +/* list of config files */ +static struct list cfg_cfgfiles = LIST_HEAD_INIT(cfg_cfgfiles); +int pid; /* current process id */ + +static unsigned long stopping_tgroup_mask; /* Thread groups acknowledging stopping */ + +/* global options */ +struct global global = { + .hard_stop_after = TICK_ETERNITY, + .close_spread_time = TICK_ETERNITY, + .close_spread_end = TICK_ETERNITY, + .numa_cpu_mapping = 1, + .nbthread = 0, + .req_count = 0, + .loggers = LIST_HEAD_INIT(global.loggers), + .maxzlibmem = DEFAULT_MAXZLIBMEM * 1024U * 1024U, + .comp_rate_lim = 0, + .ssl_server_verify = SSL_SERVER_VERIFY_REQUIRED, + .unix_bind = { + .ux = { + .uid = -1, + .gid = -1, + .mode = 0, + } + }, + .tune = { + .options = GTUNE_LISTENER_MQ_OPT, + .bufsize = (BUFSIZE + 2*sizeof(void *) - 1) & -(2*sizeof(void *)), + .maxrewrite = MAXREWRITE, + .reserved_bufs = RESERVED_BUFS, + .pattern_cache = DEFAULT_PAT_LRU_SIZE, + .pool_low_ratio = 20, + .pool_high_ratio = 25, + .max_http_hdr = MAX_HTTP_HDR, +#ifdef USE_OPENSSL + .sslcachesize = SSLCACHESIZE, +#endif + .comp_maxlevel = 1, +#ifdef DEFAULT_IDLE_TIMER + .idle_timer = DEFAULT_IDLE_TIMER, +#else + .idle_timer = 1000, /* 1 second */ +#endif + .nb_stk_ctr = MAX_SESS_STKCTR, + .default_shards = -2, /* by-group */ +#ifdef USE_QUIC + .quic_backend_max_idle_timeout = QUIC_TP_DFLT_BACK_MAX_IDLE_TIMEOUT, + .quic_frontend_max_idle_timeout = QUIC_TP_DFLT_FRONT_MAX_IDLE_TIMEOUT, + .quic_frontend_max_streams_bidi = QUIC_TP_DFLT_FRONT_MAX_STREAMS_BIDI, + .quic_reorder_ratio = QUIC_DFLT_REORDER_RATIO, + .quic_retry_threshold = QUIC_DFLT_RETRY_THRESHOLD, + .quic_max_frame_loss = QUIC_DFLT_MAX_FRAME_LOSS, + .quic_streams_buf = 30, +#endif /* USE_QUIC */ + }, +#ifdef USE_OPENSSL +#ifdef DEFAULT_MAXSSLCONN + .maxsslconn = DEFAULT_MAXSSLCONN, +#endif +#endif + /* others NULL OK */ +}; + +/*********************************************************************/ + +int stopping; /* non zero means stopping in progress */ +int killed; /* non zero means a hard-stop is triggered */ +int jobs = 0; /* number of active jobs (conns, listeners, active tasks, ...) */ +int unstoppable_jobs = 0; /* number of active jobs that can't be stopped during a soft stop */ +int active_peers = 0; /* number of active peers (connection attempts and connected) */ +int connected_peers = 0; /* number of connected peers (verified ones) */ +int arg_mode = 0; /* MODE_DEBUG etc as passed on command line ... */ +char *change_dir = NULL; /* set when -C is passed */ +char *check_condition = NULL; /* check condition passed to -cc */ + +/* Here we store information about the pids of the processes we may pause + * or kill. We will send them a signal every 10 ms until we can bind to all + * our ports. With 200 retries, that's about 2 seconds. 
+ */ +#define MAX_START_RETRIES 200 +static int *oldpids = NULL; +static int oldpids_sig; /* use USR1 or TERM */ + +/* Path to the unix socket we use to retrieve listener sockets from the old process */ +static const char *old_unixsocket; + +int atexit_flag = 0; + +int nb_oldpids = 0; +const int zero = 0; +const int one = 1; +const struct linger nolinger = { .l_onoff = 1, .l_linger = 0 }; + +char hostname[MAX_HOSTNAME_LEN]; +char *localpeer = NULL; +static char *kwd_dump = NULL; // list of keyword dumps to produce + +static char **old_argv = NULL; /* previous argv but cleaned up */ + +struct list proc_list = LIST_HEAD_INIT(proc_list); + +int master = 0; /* 1 if in master, 0 if in child */ +unsigned int rlim_fd_cur_at_boot = 0; +unsigned int rlim_fd_max_at_boot = 0; + +/* per-boot randomness */ +unsigned char boot_seed[20]; /* per-boot random seed (160 bits initially) */ + +/* takes the thread config in argument or NULL for any thread */ +static void *run_thread_poll_loop(void *data); + +/* bitfield of a few warnings to emit just once (WARN_*) */ +unsigned int warned = 0; + +/* set if experimental features have been used for the current process */ +unsigned int tainted = 0; + +unsigned int experimental_directives_allowed = 0; + +int check_kw_experimental(struct cfg_keyword *kw, const char *file, int linenum, + char **errmsg) +{ + if (kw->flags & KWF_EXPERIMENTAL) { + if (!experimental_directives_allowed) { + memprintf(errmsg, "parsing [%s:%d] : '%s' directive is experimental, must be allowed via a global 'expose-experimental-directives'", + file, linenum, kw->kw); + return 1; + } + mark_tainted(TAINTED_CONFIG_EXP_KW_DECLARED); + } + + return 0; +} + +/* master CLI configuration (-S flag) */ +struct list mworker_cli_conf = LIST_HEAD_INIT(mworker_cli_conf); + +/* These are strings to be reported in the output of "haproxy -vv". They may + * either be constants (in which case must_free must be zero) or dynamically + * allocated strings to pass to free() on exit, and in this case must_free + * must be non-zero. + */ +struct list build_opts_list = LIST_HEAD_INIT(build_opts_list); +struct build_opts_str { + struct list list; + const char *str; + int must_free; +}; + +/*********************************************************************/ +/* general purpose functions ***************************************/ +/*********************************************************************/ + +/* used to register some build option strings at boot. Set must_free to + * non-zero if the string must be freed upon exit. + */ +void hap_register_build_opts(const char *str, int must_free) +{ + struct build_opts_str *b; + + b = calloc(1, sizeof(*b)); + if (!b) { + fprintf(stderr, "out of memory\n"); + exit(1); + } + b->str = str; + b->must_free = must_free; + LIST_APPEND(&build_opts_list, &b->list); +} + +/* returns the first build option when <curr> is NULL, or the next one when + * <curr> is passed the last returned value. NULL when there is no more entries + * in the list. Otherwise the returned pointer is &opt->str so the caller can + * print it as *ret. 
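+ *
+ * For example, the typical iteration (as done in display_build_opts() below),
+ * shown here as a short hypothetical sketch:
+ *
+ *   const char **opt;
+ *   for (opt = NULL; (opt = hap_get_next_build_opt(opt)); )
+ *           puts(*opt);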
+ */
+const char **hap_get_next_build_opt(const char **curr)
+{
+	struct build_opts_str *head, *start;
+
+	head = container_of(&build_opts_list, struct build_opts_str, list);
+
+	if (curr)
+		start = container_of(curr, struct build_opts_str, str);
+	else
+		start = head;
+
+	start = container_of(start->list.n, struct build_opts_str, list);
+
+	if (start == head)
+		return NULL;
+
+	return &start->str;
+}
+
+/* used to make a new feature appear in the build_features list at boot time.
+ * The feature must be in the format "XXX" without the leading "+" which will
+ * be automatically appended.
+ */
+void hap_register_feature(const char *name)
+{
+	static int must_free = 0;
+	int new_len = strlen(build_features) + 2 + strlen(name);
+	char *new_features;
+
+	new_features = malloc(new_len + 1);
+	if (!new_features)
+		return;
+
+	strlcpy2(new_features, build_features, new_len);
+	snprintf(new_features, new_len + 1, "%s +%s", build_features, name);
+
+	if (must_free)
+		ha_free(&build_features);
+
+	build_features = new_features;
+	must_free = 1;
+}
+
+#define VERSION_MAX_ELTS 7
+
+/* This function splits an haproxy version string into an array of integers.
+ * The syntax of the supported version string is the following:
+ *
+ *   <a>[.<b>[.<c>[.<d>]]][-{dev,pre,rc}<f>][-*][-<g>]
+ *
+ * This validates for example:
+ *   1.2.1-pre2, 1.2.1, 1.2.10.1, 1.3.16-rc1, 1.4-dev3, 1.5-dev18, 1.5-dev18-43
+ *   2.4-dev18-f6818d-20
+ *
+ * The result is set in an array of <VERSION_MAX_ELTS> elements. Each letter
+ * has one fixed place in the array. The tags take a numeric value called <e>
+ * which defaults to 3. "dev" is 1, "rc" and "pre" are 2. Numbers not
+ * encountered are considered as zero (hence 1.5 and 1.5.0 are the same).
+ *
+ * The resulting values are:
+ *   1.2.1-pre2            1, 2,  1, 0, 2,  2,  0
+ *   1.2.1                 1, 2,  1, 0, 3,  0,  0
+ *   1.2.10.1              1, 2, 10, 1, 3,  0,  0
+ *   1.3.16-rc1            1, 3, 16, 0, 2,  1,  0
+ *   1.4-dev3              1, 4,  0, 0, 1,  3,  0
+ *   1.5-dev18             1, 5,  0, 0, 1, 18,  0
+ *   1.5-dev18-43          1, 5,  0, 0, 1, 18, 43
+ *   2.4-dev18-f6818d-20   2, 4,  0, 0, 1, 18, 20
+ *
+ * The function returns non-zero if the conversion succeeded, or zero if it
+ * failed.
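+ *
+ * A minimal illustrative call (expected values taken from the table above):
+ *
+ *   unsigned int v[VERSION_MAX_ELTS];
+ *   int ok = split_version("1.5-dev18-43", v);
+ *   (on success, ok is non-zero and v = { 1, 5, 0, 0, 1, 18, 43 })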
+ */
+int split_version(const char *version, unsigned int *value)
+{
+	const char *p, *s;
+	char *error;
+	int nelts;
+
+	/* Initialize array with zeroes */
+	for (nelts = 0; nelts < VERSION_MAX_ELTS; nelts++)
+		value[nelts] = 0;
+	value[4] = 3;
+
+	p = version;
+
+	/* If the version number is empty, return false */
+	if (*p == '\0')
+		return 0;
+
+	/* Convert the first number <a> */
+	value[0] = strtol(p, &error, 10);
+	p = error + 1;
+	if (*error == '\0')
+		return 1;
+	if (*error == '-')
+		goto split_version_tag;
+	if (*error != '.')
+		return 0;
+
+	/* Convert the second number <b> */
+	value[1] = strtol(p, &error, 10);
+	p = error + 1;
+	if (*error == '\0')
+		return 1;
+	if (*error == '-')
+		goto split_version_tag;
+	if (*error != '.')
+		return 0;
+
+	/* Convert the third number <c> */
+	value[2] = strtol(p, &error, 10);
+	p = error + 1;
+	if (*error == '\0')
+		return 1;
+	if (*error == '-')
+		goto split_version_tag;
+	if (*error != '.')
+		return 0;
+
+	/* Convert the fourth number <d> */
+	value[3] = strtol(p, &error, 10);
+	p = error + 1;
+	if (*error == '\0')
+		return 1;
+	if (*error != '-')
+		return 0;
+
+ split_version_tag:
+	/* Check for commit number */
+	if (*p >= '0' && *p <= '9')
+		goto split_version_commit;
+
+	/* Read tag */
+	if (strncmp(p, "dev", 3) == 0) { value[4] = 1; p += 3; }
+	else if (strncmp(p, "rc", 2) == 0) { value[4] = 2; p += 2; }
+	else if (strncmp(p, "pre", 3) == 0) { value[4] = 2; p += 3; }
+	else
+		goto split_version_commit;
+
+	/* Convert tag number */
+	value[5] = strtol(p, &error, 10);
+	p = error + 1;
+	if (*error == '\0')
+		return 1;
+	if (*error != '-')
+		return 0;
+
+ split_version_commit:
+	/* Search the last "-" */
+	s = strrchr(p, '-');
+	if (s) {
+		s++;
+		if (*s == '\0')
+			return 0;
+		value[6] = strtol(s, &error, 10);
+		if (*error != '\0')
+			value[6] = 0;
+		return 1;
+	}
+
+	/* convert the version */
+	value[6] = strtol(p, &error, 10);
+	if (*error != '\0')
+		value[6] = 0;
+
+	return 1;
+}
+
+/* This function compares the current haproxy version with an arbitrary version string.
It returns: + * -1 : the version in argument is older than the current haproxy version + * 0 : the version in argument is the same as the current haproxy version + * 1 : the version in argument is newer than the current haproxy version + * + * Or some errors: + * -2 : the current haproxy version is not parsable + * -3 : the version in argument is not parsable + */ +int compare_current_version(const char *version) +{ + unsigned int loc[VERSION_MAX_ELTS]; + unsigned int mod[VERSION_MAX_ELTS]; + int i; + + /* split versions */ + if (!split_version(haproxy_version, loc)) + return -2; + if (!split_version(version, mod)) + return -3; + + /* compare versions */ + for (i = 0; i < VERSION_MAX_ELTS; i++) { + if (mod[i] < loc[i]) + return -1; + else if (mod[i] > loc[i]) + return 1; + } + return 0; +} + +void display_version() +{ + struct utsname utsname; + + printf("HAProxy version %s %s - https://haproxy.org/\n" + PRODUCT_STATUS "\n", haproxy_version, haproxy_date); + + if (strlen(PRODUCT_URL_BUGS) > 0) { + char base_version[20]; + int dots = 0; + char *del; + + /* only retrieve the base version without distro-specific extensions */ + for (del = haproxy_version; *del; del++) { + if (*del == '.') + dots++; + else if (*del < '0' || *del > '9') + break; + } + + strlcpy2(base_version, haproxy_version, del - haproxy_version + 1); + if (dots < 2) + printf("Known bugs: https://github.com/haproxy/haproxy/issues?q=is:issue+is:open\n"); + else + printf("Known bugs: " PRODUCT_URL_BUGS "\n", base_version); + } + + if (uname(&utsname) == 0) { + printf("Running on: %s %s %s %s\n", utsname.sysname, utsname.release, utsname.version, utsname.machine); + } +} + +static void display_build_opts() +{ + const char **opt; + + printf("Build options :" +#ifdef BUILD_TARGET + "\n TARGET = " BUILD_TARGET +#endif +#ifdef BUILD_CPU + "\n CPU = " BUILD_CPU +#endif +#ifdef BUILD_CC + "\n CC = " BUILD_CC +#endif +#ifdef BUILD_CFLAGS + "\n CFLAGS = " BUILD_CFLAGS +#endif +#ifdef BUILD_OPTIONS + "\n OPTIONS = " BUILD_OPTIONS +#endif +#ifdef BUILD_DEBUG + "\n DEBUG = " BUILD_DEBUG +#endif + "\n\nFeature list : %s" + "\n\nDefault settings :" + "\n bufsize = %d, maxrewrite = %d, maxpollevents = %d" + "\n\n", + build_features, BUFSIZE, MAXREWRITE, MAX_POLL_EVENTS); + + for (opt = NULL; (opt = hap_get_next_build_opt(opt)); puts(*opt)) + ; + + putchar('\n'); + + list_pollers(stdout); + putchar('\n'); + list_mux_proto(stdout); + putchar('\n'); + list_services(stdout); + putchar('\n'); + list_filters(stdout); + putchar('\n'); +} + +/* + * This function prints the command line usage and exits + */ +static void usage(char *name) +{ + display_version(); + fprintf(stderr, + "Usage : %s [-f <cfgfile|cfgdir>]* [ -vdV" + "D ] [ -n <maxconn> ] [ -N <maxpconn> ]\n" + " [ -p <pidfile> ] [ -m <max megs> ] [ -C <dir> ] [-- <cfgfile>*]\n" + " -v displays version ; -vv shows known build options.\n" + " -d enters debug mode ; -db only disables background mode.\n" + " -dM[<byte>,help,...] 
debug memory (default: poison with <byte>/0x50)\n"
+		" -dt activate traces on stderr\n"
+		" -V enters verbose mode (disables quiet mode)\n"
+		" -D goes daemon ; -C changes to <dir> before loading files.\n"
+		" -W master-worker mode.\n"
+#if defined(USE_SYSTEMD)
+		" -Ws master-worker mode with systemd notify support.\n"
+#endif
+		" -q quiet mode : don't display messages\n"
+		" -c check mode : only check config files and exit\n"
+		" -cc check condition : evaluate a condition and exit\n"
+		" -n sets the maximum total # of connections (uses ulimit -n)\n"
+		" -m limits the usable amount of memory (in MB)\n"
+		" -N sets the default, per-proxy maximum # of connections (%d)\n"
+		" -L set local peer name (default to hostname)\n"
+		" -p writes pids of all children to this file\n"
+		" -dC[[key],line] display the configuration file; if a key is given, the file will be anonymised\n"
+#if defined(USE_EPOLL)
+		" -de disables epoll() usage even when available\n"
+#endif
+#if defined(USE_KQUEUE)
+		" -dk disables kqueue() usage even when available\n"
+#endif
+#if defined(USE_EVPORTS)
+		" -dv disables event ports usage even when available\n"
+#endif
+#if defined(USE_POLL)
+		" -dp disables poll() usage even when available\n"
+#endif
+#if defined(USE_LINUX_SPLICE)
+		" -dS disables splice usage (broken on old kernels)\n"
+#endif
+#if defined(USE_GETADDRINFO)
+		" -dG disables getaddrinfo() usage\n"
+#endif
+#if defined(SO_REUSEPORT)
+		" -dR disables SO_REUSEPORT usage\n"
+#endif
+#if defined(HA_HAVE_DUMP_LIBS)
+		" -dL dumps loaded object files after config checks\n"
+#endif
+		" -dK{class[,...]} dump registered keywords (use 'help' for list)\n"
+		" -dr ignores server address resolution failures\n"
+		" -dV disables SSL verify on servers side\n"
+		" -dW fails if any warning is emitted\n"
+		" -dD diagnostic mode : warn about suspicious configuration statements\n"
+		" -dF disable fast-forward\n"
+		" -dZ disable zero-copy forwarding\n"
+		" -sf/-st [pid ]* finishes/terminates old pids.\n"
+		" -x <unix_socket> get listening sockets from a unix socket\n"
+		" -S <bind>[,<bind options>...] new master CLI\n"
+		"\n",
+		name, cfg_maxpconn);
+	exit(1);
+}
+
+
+
+/*********************************************************************/
+/*   more specific functions   ***************************************/
+/*********************************************************************/
+
+/* sends the signal <sig> to all pids found in <oldpids>. Returns the number of
+ * pids the signal was correctly delivered to.
+ */
+int tell_old_pids(int sig)
+{
+	int p;
+	int ret = 0;
+	for (p = 0; p < nb_oldpids; p++)
+		if (kill(oldpids[p], sig) == 0)
+			ret++;
+	return ret;
+}
+
+/*
+ * remove a pid from the oldpids array and decrease nb_oldpids.
+ * return 1 if the pid was found, otherwise return 0
+ */
+
+int delete_oldpid(int pid)
+{
+	int i;
+
+	for (i = 0; i < nb_oldpids; i++) {
+		if (oldpids[i] == pid) {
+			oldpids[i] = oldpids[nb_oldpids - 1];
+			oldpids[nb_oldpids - 1] = 0;
+			nb_oldpids--;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+
+/*
+ * When called, this function re-executes haproxy with -sf followed by the
+ * current children PIDs and possibly old children PIDs if they didn't leave yet.
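+ *
+ * For example (hypothetical PIDs and FD number), with workers 1234 and 1235
+ * still present, the master would re-execute itself roughly as:
+ *
+ *   haproxy -sf 1234 1235 -x sockpair@4 -W -f haproxy.cfg
+ *
+ * with "-st" instead of "-sf" for a hard reload, as built below.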
+ */
+static void mworker_reexec(int hardreload)
+{
+	char **next_argv = NULL;
+	int old_argc = 0; /* previous number of arguments */
+	int next_argc = 0;
+	int i = 0;
+	char *msg = NULL;
+	struct rlimit limit;
+	struct mworker_proc *current_child = NULL;
+
+	mworker_block_signals();
+	setenv("HAPROXY_MWORKER_REEXEC", "1", 1);
+
+	mworker_cleanup_proc();
+	mworker_proc_list_to_env(); /* put the children description in the env */
+
+	/* ensure that we correctly close every listener before re-executing */
+	mworker_cleanlisteners();
+
+	/* during the reload we must ensure that every FD that can't be
+	 * reused (i.e. those that are not referenced in the proc_list)
+	 * is closed, otherwise it will leak. */
+
+	/* close the listeners FD */
+	mworker_cli_proxy_stop();
+
+	if (fdtab)
+		deinit_pollers();
+
+#ifdef HAVE_SSL_RAND_KEEP_RANDOM_DEVICES_OPEN
+	/* close random device FDs */
+	RAND_keep_random_devices_open(0);
+#endif
+
+	/* restore the initial FD limits */
+	limit.rlim_cur = rlim_fd_cur_at_boot;
+	limit.rlim_max = rlim_fd_max_at_boot;
+	if (raise_rlim_nofile(&limit, &limit) != 0) {
+		ha_warning("Failed to restore initial FD limits (cur=%u max=%u), using cur=%u max=%u\n",
+			   rlim_fd_cur_at_boot, rlim_fd_max_at_boot,
+			   (unsigned int)limit.rlim_cur, (unsigned int)limit.rlim_max);
+	}
+
+	/* compute length */
+	while (old_argv[old_argc])
+		old_argc++;
+
+	/* 1 for haproxy -sf, 2 for -x /socket */
+	next_argv = calloc(old_argc + 1 + 2 + mworker_child_nb() + 1,
+			   sizeof(*next_argv));
+	if (next_argv == NULL)
+		goto alloc_error;
+
+	/* copy the program name */
+	next_argv[next_argc++] = old_argv[0];
+
+	/* insert the new options just after argv[0] in case we have a -- */
+
+	if (getenv("HAPROXY_MWORKER_WAIT_ONLY") == NULL) {
+		/* add -sf <PID>* to argv */
+		if (mworker_child_nb() > 0) {
+			struct mworker_proc *child;
+
+			if (hardreload)
+				next_argv[next_argc++] = "-st";
+			else
+				next_argv[next_argc++] = "-sf";
+
+			list_for_each_entry(child, &proc_list, list) {
+				if (!(child->options & PROC_O_LEAVING) && (child->options & PROC_O_TYPE_WORKER))
+					current_child = child;
+
+				if (!(child->options & (PROC_O_TYPE_WORKER|PROC_O_TYPE_PROG)) || child->pid <= -1)
+					continue;
+				if ((next_argv[next_argc++] = memprintf(&msg, "%d", child->pid)) == NULL)
+					goto alloc_error;
+				msg = NULL;
+			}
+		}
+
+		if (current_child) {
+			/* add the -x option with the socketpair of the current worker */
+			next_argv[next_argc++] = "-x";
+			if ((next_argv[next_argc++] = memprintf(&msg, "sockpair@%d", current_child->ipc_fd[0])) == NULL)
+				goto alloc_error;
+			msg = NULL;
+		}
+	}
+
+	/* copy the previous options */
+	for (i = 1; i < old_argc; i++)
+		next_argv[next_argc++] = old_argv[i];
+
+	signal(SIGPROF, SIG_IGN);
+	execvp(next_argv[0], next_argv);
+	ha_warning("Failed to reexecute the master process [%d]: %s\n", pid, strerror(errno));
+	ha_free(&next_argv);
+	return;
+
+alloc_error:
+	ha_free(&next_argv);
+	ha_warning("Failed to reexecute the master process [%d]: Cannot allocate memory\n", pid);
+	return;
+}
+
+/* re-exec haproxy in wait mode */
+static void mworker_reexec_waitmode()
+{
+	setenv("HAPROXY_MWORKER_WAIT_ONLY", "1", 1);
+	mworker_reexec(0);
+}
+
+/* reload haproxy and emit a warning */
+void mworker_reload(int hardreload)
+{
+	struct mworker_proc *child;
+	struct per_thread_deinit_fct *ptdf;
+
+	ha_notice("Reloading HAProxy%s\n", hardreload ? " (hard-reload)" : "");
(hard-reload)":""); + + /* close the poller FD and the thread waker pipe FD */ + list_for_each_entry(ptdf, &per_thread_deinit_list, list) + ptdf->fct(); + + /* increment the number of reloads */ + list_for_each_entry(child, &proc_list, list) { + child->reloads++; + } + +#if defined(USE_SYSTEMD) + if (global.tune.options & GTUNE_USE_SYSTEMD) + sd_notify(0, "RELOADING=1\nSTATUS=Reloading Configuration.\n"); +#endif + mworker_reexec(hardreload); +} + +static void mworker_loop() +{ + + /* Busy polling makes no sense in the master :-) */ + global.tune.options &= ~GTUNE_BUSY_POLLING; + + + signal_unregister(SIGTTIN); + signal_unregister(SIGTTOU); + signal_unregister(SIGUSR1); + signal_unregister(SIGHUP); + signal_unregister(SIGQUIT); + + signal_register_fct(SIGTERM, mworker_catch_sigterm, SIGTERM); + signal_register_fct(SIGUSR1, mworker_catch_sigterm, SIGUSR1); + signal_register_fct(SIGTTIN, mworker_broadcast_signal, SIGTTIN); + signal_register_fct(SIGTTOU, mworker_broadcast_signal, SIGTTOU); + signal_register_fct(SIGINT, mworker_catch_sigterm, SIGINT); + signal_register_fct(SIGHUP, mworker_catch_sighup, SIGHUP); + signal_register_fct(SIGUSR2, mworker_catch_sighup, SIGUSR2); + signal_register_fct(SIGCHLD, mworker_catch_sigchld, SIGCHLD); + + mworker_unblock_signals(); + mworker_cleantasks(); + + mworker_catch_sigchld(NULL); /* ensure we clean the children in case + some SIGCHLD were lost */ + + jobs++; /* this is the "master" job, we want to take care of the + signals even if there is no listener so the poll loop don't + leave */ + + fork_poller(); + run_thread_poll_loop(NULL); +} + +/* + * Reexec the process in failure mode, instead of exiting + */ +void reexec_on_failure() +{ + struct mworker_proc *child; + + if (!atexit_flag) + return; + + /* get the info of the children in the env */ + if (mworker_env_to_proc_list() < 0) { + exit(EXIT_FAILURE); + } + + /* increment the number of failed reloads */ + list_for_each_entry(child, &proc_list, list) { + child->failedreloads++; + } + + /* do not keep unused FDs retrieved from the previous process */ + sock_drop_unused_old_sockets(); + + usermsgs_clr(NULL); + setenv("HAPROXY_LOAD_SUCCESS", "0", 1); + ha_warning("Loading failure!\n"); +#if defined(USE_SYSTEMD) + /* the sd_notify API is not able to send a reload failure signal. So + * the READY=1 signal still need to be sent */ + if (global.tune.options & GTUNE_USE_SYSTEMD) + sd_notify(0, "READY=1\nSTATUS=Reload failed!\n"); +#endif + + mworker_reexec_waitmode(); +} + +/* + * Exit with an error message upon a wait-mode failure. + */ +void exit_on_waitmode_failure() +{ + if (!atexit_flag) + return; + + ha_alert("Non-recoverable mworker wait-mode error, exiting.\n"); +} + + +/* + * upon SIGUSR1, let's have a soft stop. Note that soft_stop() broadcasts + * a signal zero to all subscribers. This means that it's as easy as + * subscribing to signal 0 to get informed about an imminent shutdown. + */ +static void sig_soft_stop(struct sig_handler *sh) +{ + soft_stop(); + signal_unregister_handler(sh); + pool_gc(NULL); +} + +/* + * upon SIGTTOU, we pause everything + */ +static void sig_pause(struct sig_handler *sh) +{ + if (protocol_pause_all() & ERR_FATAL) { + const char *msg = "Some proxies refused to pause, performing soft stop now.\n"; + ha_warning("%s", msg); + send_log(NULL, LOG_WARNING, "%s", msg); + soft_stop(); + } + pool_gc(NULL); +} + +/* + * upon SIGTTIN, let's have a soft stop. 
+ */
+static void sig_listen(struct sig_handler *sh)
+{
+	if (protocol_resume_all() & ERR_FATAL) {
+		const char *msg = "Some proxies refused to resume, probably due to a conflict on a listening port. You may want to try again after the conflicting application is stopped, otherwise a restart might be needed to resume safe operations.\n";
+		ha_warning("%s", msg);
+		send_log(NULL, LOG_WARNING, "%s", msg);
+	}
+}
+
+/*
+ * this function dumps every server's state when the process receives SIGHUP.
+ */
+static void sig_dump_state(struct sig_handler *sh)
+{
+	struct proxy *p = proxies_list;
+
+	ha_warning("SIGHUP received, dumping servers states.\n");
+	while (p) {
+		struct server *s = p->srv;
+
+		send_log(p, LOG_NOTICE, "SIGHUP received, dumping servers states for proxy %s.\n", p->id);
+		while (s) {
+			chunk_printf(&trash,
+			             "SIGHUP: Server %s/%s is %s. Conn: %d act, %d pend, %lld tot.",
+			             p->id, s->id,
+			             (s->cur_state != SRV_ST_STOPPED) ? "UP" : "DOWN",
+			             s->cur_sess, s->queue.length, s->counters.cum_sess);
+			ha_warning("%s\n", trash.area);
+			send_log(p, LOG_NOTICE, "%s\n", trash.area);
+			s = s->next;
+		}
+
+		/* FIXME: this information is a bit outdated. We should be able to distinguish between FE and BE. */
+		if (!p->srv) {
+			chunk_printf(&trash,
+			             "SIGHUP: Proxy %s has no servers. Conn: act(FE+BE): %d+%d, %d pend (%d unass), tot(FE+BE): %lld+%lld.",
+			             p->id,
+			             p->feconn, p->beconn, p->totpend, p->queue.length, p->fe_counters.cum_conn, p->be_counters.cum_conn);
+		} else if (p->srv_act == 0) {
+			chunk_printf(&trash,
+			             "SIGHUP: Proxy %s %s ! Conn: act(FE+BE): %d+%d, %d pend (%d unass), tot(FE+BE): %lld+%lld.",
+			             p->id,
+			             (p->srv_bck) ? "is running on backup servers" : "has no server available",
+			             p->feconn, p->beconn, p->totpend, p->queue.length, p->fe_counters.cum_conn, p->be_counters.cum_conn);
+		} else {
+			chunk_printf(&trash,
+			             "SIGHUP: Proxy %s has %d active servers and %d backup servers available."
+			             " Conn: act(FE+BE): %d+%d, %d pend (%d unass), tot(FE+BE): %lld+%lld.",
+			             p->id, p->srv_act, p->srv_bck,
+			             p->feconn, p->beconn, p->totpend, p->queue.length, p->fe_counters.cum_conn, p->be_counters.cum_conn);
+		}
+		ha_warning("%s\n", trash.area);
+		send_log(p, LOG_NOTICE, "%s\n", trash.area);
+
+		p = p->next;
+	}
+}
+
+static void dump(struct sig_handler *sh)
+{
+	/* dump memory usage then free everything possible */
+	dump_pools();
+	pool_gc(NULL);
+}
+
+/*
+ * This function dup2()s <fd> onto the stdio FDs (0,1,2), then closes <fd>.
+ * If <fd> < 0, it opens /dev/null and uses that for the dup.
+ *
+ * In the case of chrooting, you have to open /dev/null before the chroot, and
+ * pass the <fd> to this function.
+ */
+static void stdio_quiet(int fd)
+{
+	if (fd < 0)
+		fd = open("/dev/null", O_RDWR, 0);
+
+	if (fd > -1) {
+		fclose(stdin);
+		fclose(stdout);
+		fclose(stderr);
+
+		dup2(fd, 0);
+		dup2(fd, 1);
+		dup2(fd, 2);
+		if (fd > 2)
+			close(fd);
+		return;
+	}
+
+	ha_alert("Cannot open /dev/null\n");
+	exit(EXIT_FAILURE);
+}
+
+
+/* This function checks if cfg_cfgfiles contains directories.
+ * If it finds one, it adds all the files (and only files) it contains
+ * in cfg_cfgfiles in place of the directory (and removes the directory).
+ * It adds the files in lexical order.
+ * It adds only files with the .cfg extension.
+ * It doesn't add files with a name starting with '.'
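+ *
+ * For example (hypothetical directory), "-f /etc/haproxy/conf.d" containing
+ * 00-global.cfg, 10-frontend.cfg, README and .hidden.cfg expands to
+ * 00-global.cfg followed by 10-frontend.cfg only, in this lexical order.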
+ */
+static void cfgfiles_expand_directories(void)
+{
+	struct wordlist *wl, *wlb;
+	char *err = NULL;
+
+	list_for_each_entry_safe(wl, wlb, &cfg_cfgfiles, list) {
+		struct stat file_stat;
+		struct dirent **dir_entries = NULL;
+		int dir_entries_nb;
+		int dir_entries_it;
+
+		if (stat(wl->s, &file_stat)) {
+			ha_alert("Cannot open configuration file/directory %s : %s\n",
+			         wl->s,
+			         strerror(errno));
+			exit(1);
+		}
+
+		if (!S_ISDIR(file_stat.st_mode))
+			continue;
+
+		/* from this point wl->s is a directory */
+
+		dir_entries_nb = scandir(wl->s, &dir_entries, NULL, alphasort);
+		if (dir_entries_nb < 0) {
+			ha_alert("Cannot open configuration directory %s : %s\n",
+			         wl->s,
+			         strerror(errno));
+			exit(1);
+		}
+
+		/* for each element in the directory wl->s */
+		for (dir_entries_it = 0; dir_entries_it < dir_entries_nb; dir_entries_it++) {
+			struct dirent *dir_entry = dir_entries[dir_entries_it];
+			char *filename = NULL;
+			char *d_name_cfgext = strstr(dir_entry->d_name, ".cfg");
+
+			/* don't add filenames that begin with '.',
+			 * only add filenames with the .cfg extension
+			 */
+			if (dir_entry->d_name[0] == '.' ||
+			    !(d_name_cfgext && d_name_cfgext[4] == '\0'))
+				goto next_dir_entry;
+
+			if (!memprintf(&filename, "%s/%s", wl->s, dir_entry->d_name)) {
+				ha_alert("Cannot load configuration file %s : out of memory.\n",
+				         filename);
+				exit(1);
+			}
+
+			if (stat(filename, &file_stat)) {
+				ha_alert("Cannot open configuration file %s : %s\n",
+				         wl->s,
+				         strerror(errno));
+				exit(1);
+			}
+
+			/* don't add anything other than regular files to cfg_cfgfiles,
+			 * this way we avoid loops
+			 */
+			if (!S_ISREG(file_stat.st_mode))
+				goto next_dir_entry;
+
+			if (!list_append_word(&wl->list, filename, &err)) {
+				ha_alert("Cannot load configuration file %s : %s\n",
+				         filename,
+				         err);
+				exit(1);
+			}
+
+next_dir_entry:
+			free(filename);
+			free(dir_entry);
+		}
+
+		free(dir_entries);
+
+		/* remove the current directory (wl) from cfg_cfgfiles */
+		free(wl->s);
+		LIST_DELETE(&wl->list);
+		free(wl);
+	}
+
+	free(err);
+}
+
+/*
+ * copy and clean up the current argv.
+ * Remove the -sf / -st / -x parameters.
+ * Return an allocated copy of argv.
+ */
+
+static char **copy_argv(int argc, char **argv)
+{
+	char **newargv, **retargv;
+
+	newargv = calloc(argc + 2, sizeof(*newargv));
+	if (newargv == NULL) {
+		ha_warning("Cannot allocate memory\n");
+		return NULL;
+	}
+	retargv = newargv;
+
+	/* first copy argv[0] */
+	*newargv++ = *argv++;
+	argc--;
+
+	while (argc > 0) {
+		if (**argv != '-') {
+			/* non-options are copied but will fail in the argument parser */
+			*newargv++ = *argv++;
+			argc--;
+
+		} else {
+			char *flag;
+
+			flag = *argv + 1;
+
+			if (flag[0] == '-' && flag[1] == 0) {
+				/* "--\0" copy every argument till the end of argv */
+				*newargv++ = *argv++;
+				argc--;
+
+				while (argc > 0) {
+					*newargv++ = *argv++;
+					argc--;
+				}
+			} else {
+				switch (*flag) {
+				case 's':
+					/* -sf / -st and their parameters are ignored */
+					if (flag[1] == 'f' || flag[1] == 't') {
+						argc--;
+						argv++;
+						/* The list can't contain a negative value since the only
+						   way to know the end of this list is by looking for the
+						   next option or the end of the options */
+						while (argc > 0 && argv[0][0] != '-') {
+							argc--;
+							argv++;
+						}
+					} else {
+						argc--;
+						argv++;
+
+					}
+					break;
+
+				case 'x':
+					/* this option and its parameter are ignored */
+					argc--;
+					argv++;
+					if (argc > 0) {
+						argc--;
+						argv++;
+					}
+					break;
+
+				case 'C':
+				case 'n':
+				case 'm':
+				case 'N':
+				case 'L':
+				case 'f':
+				case 'p':
+				case 'S':
+					/* these options have only 1 parameter which must be copied and can start with a '-' */
+					*newargv++ = *argv++;
+					argc--;
+					if (argc == 0)
+						goto error;
+					*newargv++ = *argv++;
+					argc--;
+					break;
+				default:
+					/* for other options just copy them without parameters, this is also done
+					 * for options like "--foo", but this will fail in the argument parser.
+					 */
+					*newargv++ = *argv++;
+					argc--;
+					break;
+				}
+			}
+		}
+	}
+
+	return retargv;
+
+error:
+	free(retargv);
+	return NULL;
+}
+
+
+/* Performs basic random seed initialization. The main issue with this is that
+ * srandom_r() only takes 32 bits and purposely provides a reproducible sequence,
+ * which means that there will only be 4 billion possible random sequences once
+ * srandom() is called, regardless of the internal state. Not calling it is
+ * even worse as we'll always produce the same random sequences. What we do
+ * here is to create an initial sequence from various entropy sources, hash it
+ * using SHA1 and keep the resulting 160 bits available globally.
+ *
+ * We initialize the current process with the first 32 bits before starting the
+ * polling loop, where all this will be changed to have process specific and
+ * thread specific sequences.
+ *
+ * Before starting threads, it's still possible to call random() as srandom()
+ * is initialized from this, but after threads and/or processes are started,
+ * only ha_random() is expected to be used to guarantee distinct sequences.
+ */
+static void ha_random_boot(char *const *argv)
+{
+	unsigned char message[256];
+	unsigned char *m = message;
+	struct timeval tv;
+	blk_SHA_CTX ctx;
+	unsigned long l;
+	int fd;
+	int i;
+
+	/* start with current time as pseudo-random seed */
+	gettimeofday(&tv, NULL);
+	write_u32(m, tv.tv_sec);  m += 4;
+	write_u32(m, tv.tv_usec); m += 4;
+
+	/* PID and PPID add some OS-based randomness */
+	write_u16(m, getpid());  m += 2;
+	write_u16(m, getppid()); m += 2;
+
+	/* take up to 160 bits (20 bytes) from /dev/urandom if available (non-blocking) */
+	fd = open("/dev/urandom", O_RDONLY);
+	if (fd >= 0) {
+		i = read(fd, m, 20);
+		if (i > 0)
+			m += i;
+		close(fd);
+	}
+
+	/* take up to 160 bits (20 bytes) from openssl (non-blocking) */
+#ifdef USE_OPENSSL
+	if (RAND_bytes(m, 20) == 1)
+		m += 20;
+#endif
+
+	/* take 160 bits from existing random in case it was already initialized */
+	for (i = 0; i < 5; i++) {
+		write_u32(m, random());
+		m += 4;
+	}
+
+	/* stack address (benefits from the operating system's ASLR) */
+	l = (unsigned long)&m;
+	memcpy(m, &l, sizeof(l)); m += sizeof(l);
+
+	/* argv address (benefits from the operating system's ASLR) */
+	l = (unsigned long)&argv;
+	memcpy(m, &l, sizeof(l)); m += sizeof(l);
+
+	/* use tv_usec again after all the operations above */
+	gettimeofday(&tv, NULL);
+	write_u32(m, tv.tv_usec); m += 4;
+
+	/*
+	 * At this point, ~84-92 bytes have been used
+	 */
+
+	/* finish with the hostname */
+	strncpy((char *)m, hostname, message + sizeof(message) - m);
+	m += strlen(hostname);
+
+	/* total message length */
+	l = m - message;
+
+	memset(&ctx, 0, sizeof(ctx));
+	blk_SHA1_Init(&ctx);
+	blk_SHA1_Update(&ctx, message, l);
+	blk_SHA1_Final(boot_seed, &ctx);
+
+	srandom(read_u32(boot_seed));
+	ha_random_seed(boot_seed, sizeof(boot_seed));
+}
+
+/* considers splicing proxies' maxconn, computes the ideal global.maxpipes
+ * setting, and returns it. It may return -1 meaning "unlimited" if some
+ * unlimited proxies have been found and the global.maxconn value is not yet
+ * set. It may also return a value greater than maxconn if it's not yet set.
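+ * As a worked example (hypothetical numbers): a single splicing frontend with
+ * maxconn 4000 and global.maxconn also set to 4000 gives pipes = 4000, hence
+ * a return value of 4000/4 = 1000 pipes.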
+ * Note that a value of zero means there is no need for pipes. -1 is never + * returned if global.maxconn is valid. + */ +static int compute_ideal_maxpipes() +{ + struct proxy *cur; + int nbfe = 0, nbbe = 0; + int unlimited = 0; + int pipes; + int max; + + for (cur = proxies_list; cur; cur = cur->next) { + if (cur->options2 & (PR_O2_SPLIC_ANY)) { + if (cur->cap & PR_CAP_FE) { + max = cur->maxconn; + nbfe += max; + if (!max) { + unlimited = 1; + break; + } + } + if (cur->cap & PR_CAP_BE) { + max = cur->fullconn ? cur->fullconn : global.maxconn; + nbbe += max; + if (!max) { + unlimited = 1; + break; + } + } + } + } + + pipes = MAX(nbfe, nbbe); + if (global.maxconn) { + if (pipes > global.maxconn || unlimited) + pipes = global.maxconn; + } else if (unlimited) { + pipes = -1; + } + + return pipes >= 4 ? pipes / 4 : pipes; +} + +/* considers global.maxsocks, global.maxpipes, async engines, SSL frontends and + * rlimits and computes an ideal maxconn. It's meant to be called only when + * maxsock contains the sum of listening FDs, before it is updated based on + * maxconn and pipes. If there are not enough FDs left, DEFAULT_MAXCONN (by + * default 100) is returned as it is expected that it will even run on tight + * environments, and will maintain compatibility with previous packages that + * used to rely on this value as the default one. The system will emit a + * warning indicating how many FDs are missing anyway if needed. + */ +static int compute_ideal_maxconn() +{ + int ssl_sides = !!global.ssl_used_frontend + !!global.ssl_used_backend; + int engine_fds = global.ssl_used_async_engines * ssl_sides; + int pipes = compute_ideal_maxpipes(); + int remain = MAX(rlim_fd_cur_at_boot, rlim_fd_max_at_boot); + int maxconn; + + /* we have to take into account these elements : + * - number of engine_fds, which inflates the number of FD needed per + * connection by this number. + * - number of pipes per connection on average : for the unlimited + * case, this is 0.5 pipe FDs per connection, otherwise it's a + * fixed value of 2*pipes. + * - two FDs per connection + */ + + if (global.fd_hard_limit && remain > global.fd_hard_limit) + remain = global.fd_hard_limit; + + /* subtract listeners and checks */ + remain -= global.maxsock; + + /* one epoll_fd/kqueue_fd per thread */ + remain -= global.nbthread; + + /* one wake-up pipe (2 fd) per thread */ + remain -= 2 * global.nbthread; + + /* Fixed pipes values : we only subtract them if they're not larger + * than the remaining FDs because pipes are optional. + */ + if (pipes >= 0 && pipes * 2 < remain) + remain -= pipes * 2; + + if (pipes < 0) { + /* maxsock = maxconn * 2 + maxconn/4 * 2 + maxconn * engine_fds. + * = maxconn * (2 + 0.5 + engine_fds) + * = maxconn * (4 + 1 + 2*engine_fds) / 2 + */ + maxconn = 2 * remain / (5 + 2 * engine_fds); + } else { + /* maxsock = maxconn * 2 + maxconn * engine_fds. + * = maxconn * (2 + engine_fds) + */ + maxconn = remain / (2 + engine_fds); + } + + return MAX(maxconn, DEFAULT_MAXCONN); +} + +/* computes the estimated maxsock value for the given maxconn based on the + * possibly set global.maxpipes and existing partial global.maxsock. It may + * temporarily change global.maxconn for the time needed to propagate the + * computations, and will reset it. 
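+ * As a quick illustration (hypothetical values): maxconn = 40000,
+ * maxpipes = 10000 and 4 threads add 40000*2 + 10000*2 + 4 + 2*4 = 100012
+ * FDs on top of the listening FDs already present in global.maxsock.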
+ */
+static int compute_ideal_maxsock(int maxconn)
+{
+	int maxpipes = global.maxpipes;
+	int maxsock = global.maxsock;
+
+
+	if (!maxpipes) {
+		int old_maxconn = global.maxconn;
+
+		global.maxconn = maxconn;
+		maxpipes = compute_ideal_maxpipes();
+		global.maxconn = old_maxconn;
+	}
+
+	maxsock += maxconn * 2;         /* each connection needs two sockets */
+	maxsock += maxpipes * 2;        /* each pipe needs two FDs */
+	maxsock += global.nbthread;     /* one epoll_fd/kqueue_fd per thread */
+	maxsock += 2 * global.nbthread; /* one wake-up pipe (2 fd) per thread */
+
+	/* compute fd used by async engines */
+	if (global.ssl_used_async_engines) {
+		int sides = !!global.ssl_used_frontend + !!global.ssl_used_backend;
+
+		maxsock += maxconn * sides * global.ssl_used_async_engines;
+	}
+	return maxsock;
+}
+
+/* Tests if it is possible to set the current process's RLIMIT_NOFILE to
+ * <maxsock>, then sets it back to the previous value. Returns non-zero if the
+ * value is accepted, zero otherwise. This is used to determine if an
+ * automatic limit may be applied or not. When it is not, the caller knows that
+ * the highest we can do is the rlim_max at boot. In case of error, we return
+ * that the setting is possible, so that we defer the error processing to the
+ * final stage in charge of enforcing this.
+ */
+static int check_if_maxsock_permitted(int maxsock)
+{
+	struct rlimit orig_limit, test_limit;
+	int ret;
+
+	if (global.fd_hard_limit && maxsock > global.fd_hard_limit)
+		return 0;
+
+	if (getrlimit(RLIMIT_NOFILE, &orig_limit) != 0)
+		return 1;
+
+	/* don't go further if we can't even set to what we have */
+	if (raise_rlim_nofile(NULL, &orig_limit) != 0)
+		return 1;
+
+	test_limit.rlim_max = MAX(maxsock, orig_limit.rlim_max);
+	test_limit.rlim_cur = test_limit.rlim_max;
+	ret = raise_rlim_nofile(NULL, &test_limit);
+
+	if (raise_rlim_nofile(NULL, &orig_limit) != 0)
+		return 1;
+
+	return ret == 0;
+}
+
+/* This performs the very basic early initialization at the end of the PREPARE
+ * init stage. It may only assume that list heads are initialized, but not that
+ * anything else is correct. It will initialize a number of variables that
+ * depend on the command line and will pre-parse the command line. If it fails,
+ * it directly exits.
+ */
+static void init_early(int argc, char **argv)
+{
+	char *progname;
+	char *tmp;
+	int len;
+
+	setenv("HAPROXY_STARTUP_VERSION", HAPROXY_VERSION, 0);
+
+	/* First, let's initialize most global variables */
+	totalconn = actconn = listeners = stopping = 0;
+	killed = pid = 0;
+
+	global.maxsock = 10; /* reserve 10 fds ; will be incremented by socket eaters */
+	global.rlimit_memmax_all = HAPROXY_MEMMAX;
+	global.mode = MODE_STARTING;
+
+	/* if we were in mworker mode, we should restart in mworker mode */
+	if (getenv("HAPROXY_MWORKER_REEXEC") != NULL)
+		global.mode |= MODE_MWORKER;
+
+	/* initialize date, time, and pid */
+	tzset();
+	clock_init_process_date();
+	start_date = date;
+	start_time_ns = now_ns;
+	pid = getpid();
+
+	/* Set local host name and adjust some environment variables.
+	 * NB: POSIX does not make it mandatory for gethostname() to
+	 * NULL-terminate the string in case of truncation, and at least
+	 * FreeBSD appears not to do it.
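+	 * (hence the memset() and the sizeof(hostname) - 1 below, which
+	 * guarantee a trailing zero byte in all cases)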
+ */ + memset(hostname, 0, sizeof(hostname)); + gethostname(hostname, sizeof(hostname) - 1); + + /* preset some environment variables */ + localpeer = strdup(hostname); + if (!localpeer || setenv("HAPROXY_LOCALPEER", localpeer, 1) < 0) { + ha_alert("Cannot allocate memory for local peer.\n"); + exit(EXIT_FAILURE); + } + + /* extract the program name from argv[0], it will be used for the logs + * and error messages. + */ + progname = *argv; + while ((tmp = strchr(progname, '/')) != NULL) + progname = tmp + 1; + + len = strlen(progname); + progname = strdup(progname); + if (!progname) { + ha_alert("Cannot allocate memory for log_tag.\n"); + exit(EXIT_FAILURE); + } + + chunk_initlen(&global.log_tag, progname, len, len); +} + +/* handles program arguments. Very minimal parsing is performed, variables are + * fed with some values, and lists are completed with other ones. In case of + * error, it will exit. + */ +static void init_args(int argc, char **argv) +{ + char *progname = global.log_tag.area; + char *err_msg = NULL; + + /* pre-fill in the global tuning options before we let the cmdline + * change them. + */ + global.tune.options |= GTUNE_USE_SELECT; /* select() is always available */ +#if defined(USE_POLL) + global.tune.options |= GTUNE_USE_POLL; +#endif +#if defined(USE_EPOLL) + global.tune.options |= GTUNE_USE_EPOLL; +#endif +#if defined(USE_KQUEUE) + global.tune.options |= GTUNE_USE_KQUEUE; +#endif +#if defined(USE_EVPORTS) + global.tune.options |= GTUNE_USE_EVPORTS; +#endif +#if defined(USE_LINUX_SPLICE) + global.tune.options |= GTUNE_USE_SPLICE; +#endif +#if defined(USE_GETADDRINFO) + global.tune.options |= GTUNE_USE_GAI; +#endif +#ifdef USE_THREAD + global.tune.options |= GTUNE_IDLE_POOL_SHARED; +#endif +#ifdef USE_QUIC + global.tune.options |= GTUNE_QUIC_SOCK_PER_CONN; +#endif + global.tune.options |= GTUNE_STRICT_LIMITS; + + global.tune.options |= GTUNE_USE_FAST_FWD; /* Use fast-forward by default */ + + /* Use zero-copy forwarding by default */ + global.tune.no_zero_copy_fwd = NO_ZERO_COPY_FWD_QUIC_SND; + + /* keep a copy of original arguments for the master process */ + old_argv = copy_argv(argc, argv); + if (!old_argv) { + ha_alert("failed to copy argv.\n"); + exit(EXIT_FAILURE); + } + + /* skip program name and start */ + argc--; argv++; + while (argc > 0) { + char *flag; + + if (**argv == '-') { + flag = *argv+1; + + /* 1 arg */ + if (*flag == 'v') { + display_version(); + if (flag[1] == 'v') /* -vv */ + display_build_opts(); + deinit_and_exit(0); + } +#if defined(USE_EPOLL) + else if (*flag == 'd' && flag[1] == 'e') + global.tune.options &= ~GTUNE_USE_EPOLL; +#endif +#if defined(USE_POLL) + else if (*flag == 'd' && flag[1] == 'p') + global.tune.options &= ~GTUNE_USE_POLL; +#endif +#if defined(USE_KQUEUE) + else if (*flag == 'd' && flag[1] == 'k') + global.tune.options &= ~GTUNE_USE_KQUEUE; +#endif +#if defined(USE_EVPORTS) + else if (*flag == 'd' && flag[1] == 'v') + global.tune.options &= ~GTUNE_USE_EVPORTS; +#endif +#if defined(USE_LINUX_SPLICE) + else if (*flag == 'd' && flag[1] == 'S') + global.tune.options &= ~GTUNE_USE_SPLICE; +#endif +#if defined(USE_GETADDRINFO) + else if (*flag == 'd' && flag[1] == 'G') + global.tune.options &= ~GTUNE_USE_GAI; +#endif +#if defined(SO_REUSEPORT) + else if (*flag == 'd' && flag[1] == 'R') + protocol_clrf_all(PROTO_F_REUSEPORT_SUPPORTED); +#endif + else if (*flag == 'd' && flag[1] == 'F') + global.tune.options &= ~GTUNE_USE_FAST_FWD; + else if (*flag == 'd' && flag[1] == 'V') + global.ssl_server_verify = SSL_SERVER_VERIFY_NONE; + else 
if (*flag == 'd' && flag[1] == 'Z') + global.tune.no_zero_copy_fwd |= NO_ZERO_COPY_FWD; + else if (*flag == 'V') + arg_mode |= MODE_VERBOSE; + else if (*flag == 'd' && flag[1] == 'C') { + char *end; + char *key; + + key = flag + 2; + for (;key && *key; key = end) { + end = strchr(key, ','); + if (end) + *(end++) = 0; + + if (strcmp(key, "line") == 0) + arg_mode |= MODE_DUMP_NB_L; + + } + arg_mode |= MODE_DUMP_CFG; + HA_ATOMIC_STORE(&global.anon_key, atoll(flag + 2)); + } + else if (*flag == 'd' && flag[1] == 'b') + arg_mode |= MODE_FOREGROUND; + else if (*flag == 'd' && flag[1] == 'D') + arg_mode |= MODE_DIAG; + else if (*flag == 'd' && flag[1] == 'W') + arg_mode |= MODE_ZERO_WARNING; + else if (*flag == 'd' && flag[1] == 'M') { + int ret = pool_parse_debugging(flag + 2, &err_msg); + + if (ret <= -1) { + if (ret < -1) + ha_alert("-dM: %s\n", err_msg); + else + printf("%s\n", err_msg); + ha_free(&err_msg); + exit(ret < -1 ? EXIT_FAILURE : 0); + } else if (ret == 0) { + ha_warning("-dM: %s\n", err_msg); + ha_free(&err_msg); + } + } + else if (*flag == 'd' && flag[1] == 'r') + global.tune.options |= GTUNE_RESOLVE_DONTFAIL; +#if defined(HA_HAVE_DUMP_LIBS) + else if (*flag == 'd' && flag[1] == 'L') + arg_mode |= MODE_DUMP_LIBS; +#endif + else if (*flag == 'd' && flag[1] == 'K') { + arg_mode |= MODE_DUMP_KWD; + kwd_dump = flag + 2; + } + else if (*flag == 'd' && flag[1] == 't') { + if (argc > 1 && argv[1][0] != '-') { + if (trace_parse_cmd(argv[1], &err_msg)) { + ha_alert("-dt: %s.\n", err_msg); + ha_free(&err_msg); + exit(EXIT_FAILURE); + } + argc--; argv++; + } + else { + trace_parse_cmd(NULL, NULL); + } + } + else if (*flag == 'd') + arg_mode |= MODE_DEBUG; + else if (*flag == 'c' && flag[1] == 'c') { + arg_mode |= MODE_CHECK_CONDITION; + argv++; + argc--; + check_condition = *argv; + } + else if (*flag == 'c') + arg_mode |= MODE_CHECK; + else if (*flag == 'D') + arg_mode |= MODE_DAEMON; + else if (*flag == 'W' && flag[1] == 's') { + arg_mode |= MODE_MWORKER | MODE_FOREGROUND; +#if defined(USE_SYSTEMD) + global.tune.options |= GTUNE_USE_SYSTEMD; +#else + ha_alert("master-worker mode with systemd support (-Ws) requested, but not compiled. 
Use master-worker mode (-W) if you are not using Type=notify in your unit file or recompile with USE_SYSTEMD=1.\n\n"); + usage(progname); +#endif + } + else if (*flag == 'W') + arg_mode |= MODE_MWORKER; + else if (*flag == 'q') + arg_mode |= MODE_QUIET; + else if (*flag == 'x') { + if (argc <= 1) { + ha_alert("Unix socket path expected with the -x flag\n\n"); + usage(progname); + } + if (old_unixsocket) + ha_warning("-x option already set, overwriting the value\n"); + old_unixsocket = argv[1]; + + argv++; + argc--; + } + else if (*flag == 'S') { + struct wordlist *c; + + if (argc <= 1) { + ha_alert("Socket and optional bind parameters expected with the -S flag\n"); + usage(progname); + } + if ((c = malloc(sizeof(*c))) == NULL || (c->s = strdup(argv[1])) == NULL) { + ha_alert("Cannot allocate memory\n"); + exit(EXIT_FAILURE); + } + LIST_INSERT(&mworker_cli_conf, &c->list); + + argv++; + argc--; + } + else if (*flag == 's' && (flag[1] == 'f' || flag[1] == 't')) { + /* list of pids to finish ('f') or terminate ('t') */ + + if (flag[1] == 'f') + oldpids_sig = SIGUSR1; /* finish then exit */ + else + oldpids_sig = SIGTERM; /* terminate immediately */ + while (argc > 1 && argv[1][0] != '-') { + char * endptr = NULL; + oldpids = realloc(oldpids, (nb_oldpids + 1) * sizeof(int)); + if (!oldpids) { + ha_alert("Cannot allocate old pid : out of memory.\n"); + exit(1); + } + argc--; argv++; + errno = 0; + oldpids[nb_oldpids] = strtol(*argv, &endptr, 10); + if (errno) { + ha_alert("-%2s option: failed to parse {%s}: %s\n", + flag, + *argv, strerror(errno)); + exit(1); + } else if (endptr && strlen(endptr)) { + while (isspace((unsigned char)*endptr)) endptr++; + if (*endptr != 0) { + ha_alert("-%2s option: some bytes unconsumed in PID list {%s}\n", + flag, endptr); + exit(1); + } + } + if (oldpids[nb_oldpids] <= 0) + usage(progname); + nb_oldpids++; + } + } + else if (flag[0] == '-' && flag[1] == 0) { /* "--" */ + /* now that's a cfgfile list */ + argv++; argc--; + while (argc > 0) { + if (!list_append_word(&cfg_cfgfiles, *argv, &err_msg)) { + ha_alert("Cannot load configuration file/directory %s : %s\n", + *argv, + err_msg); + exit(1); + } + argv++; argc--; + } + break; + } + else { /* >=2 args */ + argv++; argc--; + if (argc == 0) + usage(progname); + + switch (*flag) { + case 'C' : change_dir = *argv; break; + case 'n' : cfg_maxconn = atol(*argv); break; + case 'm' : global.rlimit_memmax_all = atol(*argv); break; + case 'N' : cfg_maxpconn = atol(*argv); break; + case 'L' : + free(localpeer); + if ((localpeer = strdup(*argv)) == NULL) { + ha_alert("Cannot allocate memory for local peer.\n"); + exit(EXIT_FAILURE); + } + setenv("HAPROXY_LOCALPEER", localpeer, 1); + global.localpeer_cmdline = 1; + break; + case 'f' : + if (!list_append_word(&cfg_cfgfiles, *argv, &err_msg)) { + ha_alert("Cannot load configuration file/directory %s : %s\n", + *argv, + err_msg); + exit(1); + } + break; + case 'p' : + free(global.pidfile); + if ((global.pidfile = strdup(*argv)) == NULL) { + ha_alert("Cannot allocate memory for pidfile.\n"); + exit(EXIT_FAILURE); + } + break; + default: usage(progname); + } + } + } + else + usage(progname); + argv++; argc--; + } + free(err_msg); +} + +/* call the various keyword dump functions based on the comma-delimited list of + * classes in kwd_dump. 
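+ *
+ * For example, "-dK help" prints the list of supported classes, and a
+ * hypothetical "-dK acl,smp" would dump the registered ACL keywords followed
+ * by the sample fetch functions.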
+ */
+static void dump_registered_keywords(void)
+{
+	char *end;
+	int all __maybe_unused = 0;
+
+	for (; kwd_dump && *kwd_dump; kwd_dump = end) {
+		end = strchr(kwd_dump, ',');
+		if (end)
+			*(end++) = 0;
+
+		if (strcmp(kwd_dump, "help") == 0) {
+			printf("# List of supported keyword classes:\n");
+			printf("all: list all keywords\n");
+			printf("acl: ACL keywords\n");
+			printf("cfg: configuration keywords\n");
+			printf("cli: CLI keywords\n");
+			printf("cnv: sample converter keywords\n");
+			printf("flt: filter names\n");
+			printf("smp: sample fetch functions\n");
+			printf("svc: service names\n");
+			continue;
+		}
+		else if (strcmp(kwd_dump, "all") == 0) {
+			all = 1;
+		}
+
+		if (all || strcmp(kwd_dump, "acl") == 0) {
+			printf("# List of registered ACL keywords:\n");
+			acl_dump_kwd();
+		}
+
+		if (all || strcmp(kwd_dump, "cfg") == 0) {
+			printf("# List of registered configuration keywords:\n");
+			cfg_dump_registered_keywords();
+		}
+
+		if (all || strcmp(kwd_dump, "cli") == 0) {
+			printf("# List of registered CLI keywords:\n");
+			cli_list_keywords();
+		}
+
+		if (all || strcmp(kwd_dump, "cnv") == 0) {
+			printf("# List of registered sample converter functions:\n");
+			smp_dump_conv_kw();
+		}
+
+		if (all || strcmp(kwd_dump, "flt") == 0) {
+			printf("# List of registered filter names:\n");
+			flt_dump_kws(NULL);
+		}
+
+		if (all || strcmp(kwd_dump, "smp") == 0) {
+			printf("# List of registered sample fetch functions:\n");
+			smp_dump_fetch_kw();
+		}
+
+		if (all || strcmp(kwd_dump, "svc") == 0) {
+			printf("# List of registered service names:\n");
+			list_services(NULL);
+		}
+	}
+}
+
+/* Generate a random cluster-secret in case the setting is not provided in the
+ * configuration. This allows features which rely on it to be used, albeit with
+ * some limitations.
+ */
+static void generate_random_cluster_secret()
+{
+	/* used as a default random cluster-secret if none defined. */
+	uint64_t rand;
+
+	/* The caller must not overwrite an already defined secret. */
+	BUG_ON(cluster_secret_isset);
+
+	rand = ha_random64();
+	memcpy(global.cluster_secret, &rand, sizeof(rand));
+	rand = ha_random64();
+	memcpy(global.cluster_secret + sizeof(rand), &rand, sizeof(rand));
+	cluster_secret_isset = 1;
+}
+
+/*
+ * This function initializes all the necessary variables. It only returns
+ * if everything is OK. If something fails, it exits.
+ */
+static void init(int argc, char **argv)
+{
+	char *progname = global.log_tag.area;
+	int err_code = 0;
+	struct wordlist *wl;
+	struct proxy *px;
+	struct post_check_fct *pcf;
+	struct pre_check_fct *prcf;
+	int ideal_maxconn;
+	const char *cc, *cflags, *opts;
+
+#ifdef USE_OPENSSL
+#ifdef USE_OPENSSL_WOLFSSL
+	wolfSSL_Init();
+	wolfSSL_Debugging_ON();
+#endif
+
+#ifdef USE_OPENSSL_AWSLC
+	const char *version_str = OpenSSL_version(OPENSSL_VERSION);
+	if (strncmp(version_str, "AWS-LC", 6) != 0) {
+		ha_alert("HAProxy built with AWS-LC but running with %s.\n", version_str);
+		exit(1);
+	}
+#endif
+
+#if (HA_OPENSSL_VERSION_NUMBER < 0x1010000fL)
+	/* Initialize the error strings of OpenSSL.
+	 * It only needs to be done explicitly with older versions of the SSL
+	 * library. On newer versions, error strings are loaded during start
+	 * up. */
+	SSL_load_error_strings();
+#endif
+#endif
+
+	startup_logs_init();
+
+	if (init_acl() != 0)
+		exit(1);
+
+	/* Initialise lua. */
*/ + hlua_init(); + + global.mode |= (arg_mode & (MODE_DAEMON | MODE_MWORKER | MODE_FOREGROUND | MODE_VERBOSE + | MODE_QUIET | MODE_CHECK | MODE_DEBUG | MODE_ZERO_WARNING + | MODE_DIAG | MODE_CHECK_CONDITION | MODE_DUMP_LIBS | MODE_DUMP_KWD + | MODE_DUMP_CFG | MODE_DUMP_NB_L)); + + if (getenv("HAPROXY_MWORKER_WAIT_ONLY")) { + unsetenv("HAPROXY_MWORKER_WAIT_ONLY"); + global.mode |= MODE_MWORKER_WAIT; + global.mode &= ~MODE_MWORKER; + } + + /* set the atexit functions when not doing configuration check */ + if (!(global.mode & (MODE_CHECK | MODE_CHECK_CONDITION)) + && (getenv("HAPROXY_MWORKER_REEXEC") != NULL)) { + + if (global.mode & MODE_MWORKER) { + atexit_flag = 1; + atexit(reexec_on_failure); + } else if (global.mode & MODE_MWORKER_WAIT) { + atexit_flag = 1; + atexit(exit_on_waitmode_failure); + } + } + + if (change_dir && chdir(change_dir) < 0) { + ha_alert("Could not change to directory %s : %s\n", change_dir, strerror(errno)); + exit(1); + } + + usermsgs_clr("config"); + + if (global.mode & MODE_CHECK_CONDITION) { + int result; + + uint32_t err; + const char *errptr; + char *errmsg = NULL; + + char *args[MAX_LINE_ARGS+1]; + int arg = sizeof(args) / sizeof(*args); + size_t outlen; + char *w; + + if (!check_condition) + usage(progname); + + outlen = strlen(check_condition) + 1; + err = parse_line(check_condition, check_condition, &outlen, args, &arg, + PARSE_OPT_ENV | PARSE_OPT_WORD_EXPAND | PARSE_OPT_DQUOTE | PARSE_OPT_SQUOTE | PARSE_OPT_BKSLASH, + &errptr); + + if (err & PARSE_ERR_QUOTE) { + ha_alert("Syntax Error in condition: Unmatched quote.\n"); + exit(2); + } + + if (err & PARSE_ERR_HEX) { + ha_alert("Syntax Error in condition: Truncated or invalid hexadecimal sequence.\n"); + exit(2); + } + + if (err & (PARSE_ERR_TOOLARGE|PARSE_ERR_OVERLAP)) { + ha_alert("Error in condition: Line too long.\n"); + exit(2); + } + + if (err & PARSE_ERR_TOOMANY) { + ha_alert("Error in condition: Too many words.\n"); + exit(2); + } + + if (err) { + ha_alert("Unhandled error in condition, please report this to the developers.\n"); + exit(2); + } + + /* remerge all words into a single expression */ + for (w = *args; (w += strlen(w)) < check_condition + outlen - 1; *w = ' ') + ; + + result = cfg_eval_condition(args, &errmsg, &errptr); + + if (result < 0) { + if (errmsg) + ha_alert("Failed to evaluate condition: %s\n", errmsg); + + exit(2); + } + + exit(result ? 0 : 1); + } + + /* in wait mode, we don't try to read the configuration files */ + if (!(global.mode & MODE_MWORKER_WAIT)) { + char *env_cfgfiles = NULL; + int env_err = 0; + + /* handle cfgfiles that are actually directories */ + cfgfiles_expand_directories(); + + if (LIST_ISEMPTY(&cfg_cfgfiles)) + usage(progname); + + /* temporary create environment variables with default + * values to ease user configuration. Do not forget to + * unset them after the list_for_each_entry loop. + */ + setenv("HAPROXY_HTTP_LOG_FMT", default_http_log_format, 1); + setenv("HAPROXY_HTTPS_LOG_FMT", default_https_log_format, 1); + setenv("HAPROXY_TCP_LOG_FMT", default_tcp_log_format, 1); + setenv("HAPROXY_BRANCH", PRODUCT_BRANCH, 1); + list_for_each_entry(wl, &cfg_cfgfiles, list) { + int ret; + + if (env_err == 0) { + if (!memprintf(&env_cfgfiles, "%s%s%s", + (env_cfgfiles ? env_cfgfiles : ""), + (env_cfgfiles ? 
";" : ""), wl->s)) + env_err = 1; + } + + ret = readcfgfile(wl->s); + if (ret == -1) { + ha_alert("Could not open configuration file %s : %s\n", + wl->s, strerror(errno)); + free(env_cfgfiles); + exit(1); + } + if (ret & (ERR_ABORT|ERR_FATAL)) + ha_alert("Error(s) found in configuration file : %s\n", wl->s); + err_code |= ret; + if (err_code & ERR_ABORT) { + free(env_cfgfiles); + exit(1); + } + } + /* remove temporary environment variables. */ + unsetenv("HAPROXY_BRANCH"); + unsetenv("HAPROXY_HTTP_LOG_FMT"); + unsetenv("HAPROXY_HTTPS_LOG_FMT"); + unsetenv("HAPROXY_TCP_LOG_FMT"); + + /* do not try to resolve arguments nor to spot inconsistencies when + * the configuration contains fatal errors caused by files not found + * or failed memory allocations. + */ + if (err_code & (ERR_ABORT|ERR_FATAL)) { + ha_alert("Fatal errors found in configuration.\n"); + free(env_cfgfiles); + exit(1); + } + if (env_err) { + ha_alert("Could not allocate memory for HAPROXY_CFGFILES env variable\n"); + exit(1); + } + setenv("HAPROXY_CFGFILES", env_cfgfiles, 1); + free(env_cfgfiles); + + } + if (global.mode & MODE_MWORKER) { + struct mworker_proc *tmproc; + + setenv("HAPROXY_MWORKER", "1", 1); + + if (getenv("HAPROXY_MWORKER_REEXEC") == NULL) { + + tmproc = mworker_proc_new(); + if (!tmproc) { + ha_alert("Cannot allocate process structures.\n"); + exit(EXIT_FAILURE); + } + tmproc->options |= PROC_O_TYPE_MASTER; /* master */ + tmproc->pid = pid; + tmproc->timestamp = start_date.tv_sec; + proc_self = tmproc; + + LIST_APPEND(&proc_list, &tmproc->list); + } + + tmproc = mworker_proc_new(); + if (!tmproc) { + ha_alert("Cannot allocate process structures.\n"); + exit(EXIT_FAILURE); + } + tmproc->options |= PROC_O_TYPE_WORKER; /* worker */ + + if (mworker_cli_sockpair_new(tmproc, 0) < 0) { + exit(EXIT_FAILURE); + } + + LIST_APPEND(&proc_list, &tmproc->list); + } + + if (global.mode & MODE_MWORKER_WAIT) { + /* in exec mode, there's always exactly one thread. Failure to + * set these ones now will result in nbthread being detected + * automatically. + */ + global.nbtgroups = 1; + global.nbthread = 1; + } + + if (global.mode & (MODE_MWORKER|MODE_MWORKER_WAIT)) { + struct wordlist *it, *c; + + master = 1; + /* get the info of the children in the env */ + if (mworker_env_to_proc_list() < 0) { + exit(EXIT_FAILURE); + } + + if (!LIST_ISEMPTY(&mworker_cli_conf)) { + char *path = NULL; + + if (mworker_cli_proxy_create() < 0) { + ha_alert("Can't create the master's CLI.\n"); + exit(EXIT_FAILURE); + } + + list_for_each_entry_safe(c, it, &mworker_cli_conf, list) { + + if (mworker_cli_proxy_new_listener(c->s) == NULL) { + ha_alert("Can't create the master's CLI.\n"); + exit(EXIT_FAILURE); + } + LIST_DELETE(&c->list); + free(c->s); + free(c); + } + /* Creates the mcli_reload listener, which is the listener used + * to retrieve the master CLI session which asked for the reload. + * + * ipc_fd[1] will be used as a listener, and ipc_fd[0] + * will be used to send the FD of the session. + * + * Both FDs will be kept in the master. The sockets are + * created only if they weren't inherited. 
+ */ + if ((proc_self->ipc_fd[1] == -1) && + socketpair(AF_UNIX, SOCK_STREAM, 0, proc_self->ipc_fd) < 0) { + ha_alert("cannot create the mcli_reload socketpair.\n"); + exit(EXIT_FAILURE); + } + + /* Create the mcli_reload listener from the proc_self struct */ + memprintf(&path, "sockpair@%d", proc_self->ipc_fd[1]); + mcli_reload_bind_conf = mworker_cli_proxy_new_listener(path); + if (mcli_reload_bind_conf == NULL) { + ha_alert("Cannot create the mcli_reload listener.\n"); + exit(EXIT_FAILURE); + } + ha_free(&path); + } + } + + if (!LIST_ISEMPTY(&mworker_cli_conf) && !(arg_mode & MODE_MWORKER)) { + ha_alert("a master CLI socket was defined, but master-worker mode (-W) is not enabled.\n"); + exit(EXIT_FAILURE); + } + + /* destroy unreferenced defaults proxies */ + proxy_destroy_all_unref_defaults(); + + list_for_each_entry(prcf, &pre_check_list, list) + err_code |= prcf->fct(); + + if (err_code & (ERR_ABORT|ERR_FATAL)) { + ha_alert("Fatal errors found in configuration.\n"); + exit(1); + } + + /* update the ready date that will be used to count the startup time + * during config checks (e.g. to schedule certain tasks if needed) + */ + clock_update_date(0, 1); + clock_adjust_now_offset(); + ready_date = date; + + + /* Note: global.nbthread will be initialized as part of this call */ + err_code |= check_config_validity(); + + /* update the ready date to also account for the check time */ + clock_update_date(0, 1); + clock_adjust_now_offset(); + ready_date = date; + + for (px = proxies_list; px; px = px->next) { + struct server *srv; + struct post_proxy_check_fct *ppcf; + struct post_server_check_fct *pscf; + + if (px->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) + continue; + + list_for_each_entry(pscf, &post_server_check_list, list) { + for (srv = px->srv; srv; srv = srv->next) + err_code |= pscf->fct(srv); + } + list_for_each_entry(ppcf, &post_proxy_check_list, list) + err_code |= ppcf->fct(px); + } + if (err_code & (ERR_ABORT|ERR_FATAL)) { + ha_alert("Fatal errors found in configuration.\n"); + exit(1); + } + + err_code |= pattern_finalize_config(); + if (err_code & (ERR_ABORT|ERR_FATAL)) { + ha_alert("Failed to finalize pattern config.\n"); + exit(1); + } + + if (global.rlimit_memmax_all) + global.rlimit_memmax = global.rlimit_memmax_all; + +#ifdef USE_NS + err_code |= netns_init(); + if (err_code & (ERR_ABORT|ERR_FATAL)) { + ha_alert("Failed to initialize namespace support.\n"); + exit(1); + } +#endif + + thread_detect_binding_discrepancies(); + thread_detect_more_than_cpus(); + + /* Apply server states */ + apply_server_state(); + + for (px = proxies_list; px; px = px->next) + srv_compute_all_admin_states(px); + + /* Apply servers' configured address */ + err_code |= srv_init_addr(); + if (err_code & (ERR_ABORT|ERR_FATAL)) { + ha_alert("Failed to initialize server(s) addr.\n"); + exit(1); + } + + if (warned & WARN_ANY && global.mode & MODE_ZERO_WARNING) { + ha_alert("Some warnings were found and 'zero-warning' is set. 
Aborting.\n"); + exit(1); + } + +#if defined(HA_HAVE_DUMP_LIBS) + if (global.mode & MODE_DUMP_LIBS) { + qfprintf(stdout, "List of loaded object files:\n"); + chunk_reset(&trash); + if (dump_libs(&trash, ((arg_mode & (MODE_QUIET|MODE_VERBOSE)) == MODE_VERBOSE))) + printf("%s", trash.area); + } +#endif + + if (global.mode & MODE_DUMP_KWD) + dump_registered_keywords(); + + if (global.mode & MODE_DIAG) { + cfg_run_diagnostics(); + } + + if (global.mode & MODE_CHECK) { + struct peers *pr; + struct proxy *px; + + if (warned & WARN_ANY) + qfprintf(stdout, "Warnings were found.\n"); + + for (pr = cfg_peers; pr; pr = pr->next) + if (pr->peers_fe) + break; + + for (px = proxies_list; px; px = px->next) + if (!(px->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) && px->li_all) + break; + + if (!px) { + /* We may only have log-forward section */ + for (px = cfg_log_forward; px; px = px->next) + if (!(px->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) && px->li_all) + break; + } + + if (pr || px) { + /* At least one peer or one listener has been found */ + if (global.mode & MODE_VERBOSE) + qfprintf(stdout, "Configuration file is valid\n"); + deinit_and_exit(0); + } + qfprintf(stdout, "Configuration file has no error but will not start (no listener) => exit(2).\n"); + exit(2); + } + + if (global.mode & MODE_DUMP_CFG) + deinit_and_exit(0); + +#ifdef USE_OPENSSL + + /* Initialize SSL random generator. Must be called before chroot for + * access to /dev/urandom, and before ha_random_boot() which may use + * RAND_bytes(). + */ + if (!ssl_initialize_random()) { + ha_alert("OpenSSL random data generator initialization failed.\n"); + exit(EXIT_FAILURE); + } +#endif + ha_random_boot(argv); // the argv pointer brings some kernel-fed entropy + + /* now we know the buffer size, we can initialize the channels and buffers */ + init_buffer(); + + list_for_each_entry(pcf, &post_check_list, list) { + err_code |= pcf->fct(); + if (err_code & (ERR_ABORT|ERR_FATAL)) + exit(1); + } + + /* set the default maxconn in the master, but let it be rewritable with -n */ + if (global.mode & MODE_MWORKER_WAIT) + global.maxconn = MASTER_MAXCONN; + + if (cfg_maxconn > 0) + global.maxconn = cfg_maxconn; + + if (global.cli_fe) + global.maxsock += global.cli_fe->maxconn; + + if (cfg_peers) { + /* peers also need to bypass global maxconn */ + struct peers *p = cfg_peers; + + for (p = cfg_peers; p; p = p->next) + if (p->peers_fe) + global.maxsock += p->peers_fe->maxconn; + } + + /* Now we want to compute the maxconn and possibly maxsslconn values. + * It's a bit tricky. Maxconn defaults to the pre-computed value based + * on rlim_fd_cur and the number of FDs in use due to the configuration, + * and maxsslconn defaults to DEFAULT_MAXSSLCONN. On top of that we can + * enforce a lower limit based on memmax. + * + * If memmax is set, then it depends on which values are set. If + * maxsslconn is set, we use memmax to determine how many cleartext + * connections may be added, and set maxconn to the sum of the two. + * If maxconn is set and not maxsslconn, maxsslconn is computed from + * the remaining amount of memory between memmax and the cleartext + * connections. If neither are set, then it is considered that all + * connections are SSL-capable, and maxconn is computed based on this, + * then maxsslconn accordingly. 
We need to know if SSL is used on the + * frontends, backends, or both, because when it's used on both sides, + * we need twice the value for maxsslconn, but we only count the + * handshake once since it is not performed on the two sides at the + * same time (frontend-side is terminated before backend-side begins). + * The SSL stack is supposed to have filled ssl_session_cost and + * ssl_handshake_cost during its initialization. In any case, if + * SYSTEM_MAXCONN is set, we still enforce it as an upper limit for + * maxconn in order to protect the system. + */ + ideal_maxconn = compute_ideal_maxconn(); + + if (!global.rlimit_memmax) { + if (global.maxconn == 0) { + global.maxconn = ideal_maxconn; + if (global.mode & (MODE_VERBOSE|MODE_DEBUG)) + fprintf(stderr, "Note: setting global.maxconn to %d.\n", global.maxconn); + } + } +#ifdef USE_OPENSSL + else if (!global.maxconn && !global.maxsslconn && + (global.ssl_used_frontend || global.ssl_used_backend)) { + /* memmax is set, compute everything automatically. Here we want + * to ensure that all SSL connections will be served. We take + * care of the number of sides where SSL is used, and consider + * the worst case : SSL used on both sides and doing a handshake + * simultaneously. Note that we can't have more than maxconn + * handshakes at a time by definition, so for the worst case of + * two SSL conns per connection, we count a single handshake. + */ + int sides = !!global.ssl_used_frontend + !!global.ssl_used_backend; + int64_t mem = global.rlimit_memmax * 1048576ULL; + int retried = 0; + + mem -= global.tune.sslcachesize * 200ULL; // about 200 bytes per SSL cache entry + mem -= global.maxzlibmem; + mem = mem * MEM_USABLE_RATIO; + + /* Principle: we test once to set maxconn according to the free + * memory. If it results in values the system rejects, we try a + * second time by respecting rlim_fd_max. If it fails again, we + * go back to the initial value and will let the final code + * dealing with rlimit report the error. That's up to 3 attempts. + */ + do { + global.maxconn = mem / + ((STREAM_MAX_COST + 2 * global.tune.bufsize) + // stream + 2 buffers per stream + sides * global.ssl_session_max_cost + // SSL buffers, one per side + global.ssl_handshake_max_cost); // 1 handshake per connection max + + if (retried == 1) + global.maxconn = MIN(global.maxconn, ideal_maxconn); + global.maxconn = round_2dig(global.maxconn); +#ifdef SYSTEM_MAXCONN + if (global.maxconn > SYSTEM_MAXCONN) + global.maxconn = SYSTEM_MAXCONN; +#endif /* SYSTEM_MAXCONN */ + global.maxsslconn = sides * global.maxconn; + + if (check_if_maxsock_permitted(compute_ideal_maxsock(global.maxconn))) + break; + } while (retried++ < 2); + + if (global.mode & (MODE_VERBOSE|MODE_DEBUG)) + fprintf(stderr, "Note: setting global.maxconn to %d and global.maxsslconn to %d.\n", + global.maxconn, global.maxsslconn); + } + else if (!global.maxsslconn && + (global.ssl_used_frontend || global.ssl_used_backend)) { + /* memmax and maxconn are known, compute maxsslconn automatically. + * maxsslconn being forced, we don't know how many of it will be + * on each side if both sides are being used. The worst case is + * when all connections use only one SSL instance because + * handshakes may be on two sides at the same time. 
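+ * Hedged worked example (every number below is hypothetical): with 512 MB
+ * of usable memory after the ratio is applied, maxconn = 10000, a 16 kB
+ * bufsize and STREAM_MAX_COST around 12 kB, the cleartext side costs about
+ * 10000 * (12 kB + 2 * 16 kB) ~= 430 MB, leaving roughly 80 MB of sslmem;
+ * at ~80 kB per SSL connection (session plus handshake costs) this yields
+ * a maxsslconn around 1000, before round_2dig() and the sides * maxconn cap
+ * applied below.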
+ */ + int sides = !!global.ssl_used_frontend + !!global.ssl_used_backend; + int64_t mem = global.rlimit_memmax * 1048576ULL; + int64_t sslmem; + + mem -= global.tune.sslcachesize * 200ULL; // about 200 bytes per SSL cache entry + mem -= global.maxzlibmem; + mem = mem * MEM_USABLE_RATIO; + + sslmem = mem - global.maxconn * (int64_t)(STREAM_MAX_COST + 2 * global.tune.bufsize); + global.maxsslconn = sslmem / (global.ssl_session_max_cost + global.ssl_handshake_max_cost); + global.maxsslconn = round_2dig(global.maxsslconn); + + if (sslmem <= 0 || global.maxsslconn < sides) { + ha_alert("Cannot compute the automatic maxsslconn because global.maxconn is already too " + "high for the global.memmax value (%d MB). The absolute maximum possible value " + "without SSL is %d, but %d was found and SSL is in use.\n", + global.rlimit_memmax, + (int)(mem / (STREAM_MAX_COST + 2 * global.tune.bufsize)), + global.maxconn); + exit(1); + } + + if (global.maxsslconn > sides * global.maxconn) + global.maxsslconn = sides * global.maxconn; + + if (global.mode & (MODE_VERBOSE|MODE_DEBUG)) + fprintf(stderr, "Note: setting global.maxsslconn to %d\n", global.maxsslconn); + } +#endif + else if (!global.maxconn) { + /* memmax and maxsslconn are known/unused, compute maxconn automatically */ + int sides = !!global.ssl_used_frontend + !!global.ssl_used_backend; + int64_t mem = global.rlimit_memmax * 1048576ULL; + int64_t clearmem; + int retried = 0; + + if (global.ssl_used_frontend || global.ssl_used_backend) + mem -= global.tune.sslcachesize * 200ULL; // about 200 bytes per SSL cache entry + + mem -= global.maxzlibmem; + mem = mem * MEM_USABLE_RATIO; + + clearmem = mem; + if (sides) + clearmem -= (global.ssl_session_max_cost + global.ssl_handshake_max_cost) * (int64_t)global.maxsslconn; + + /* Principle: we test once to set maxconn according to the free + * memory. If it results in values the system rejects, we try a + * second time by respecting rlim_fd_max. If it fails again, we + * go back to the initial value and will let the final code + * dealing with rlimit report the error. That's up to 3 attempts. + */ + do { + global.maxconn = clearmem / (STREAM_MAX_COST + 2 * global.tune.bufsize); + if (retried == 1) + global.maxconn = MIN(global.maxconn, ideal_maxconn); + global.maxconn = round_2dig(global.maxconn); +#ifdef SYSTEM_MAXCONN + if (global.maxconn > SYSTEM_MAXCONN) + global.maxconn = SYSTEM_MAXCONN; +#endif /* SYSTEM_MAXCONN */ + + if (clearmem <= 0 || !global.maxconn) { + ha_alert("Cannot compute the automatic maxconn because global.maxsslconn is already too " + "high for the global.memmax value (%d MB). The absolute maximum possible value " + "is %d, but %d was found.\n", + global.rlimit_memmax, + (int)(mem / (global.ssl_session_max_cost + global.ssl_handshake_max_cost)), + global.maxsslconn); + exit(1); + } + + if (check_if_maxsock_permitted(compute_ideal_maxsock(global.maxconn))) + break; + } while (retried++ < 2); + + if (global.mode & (MODE_VERBOSE|MODE_DEBUG)) { + if (sides && global.maxsslconn > sides * global.maxconn) { + fprintf(stderr, "Note: global.maxsslconn is forced to %d which causes global.maxconn " + "to be limited to %d. 
Better reduce global.maxsslconn to get more " + "room for extra connections.\n", global.maxsslconn, global.maxconn); + } + fprintf(stderr, "Note: setting global.maxconn to %d\n", global.maxconn); + } + } + + global.maxsock = compute_ideal_maxsock(global.maxconn); + global.hardmaxconn = global.maxconn; + if (!global.maxpipes) + global.maxpipes = compute_ideal_maxpipes(); + + /* update connection pool thresholds */ + global.tune.pool_low_count = ((long long)global.maxsock * global.tune.pool_low_ratio + 99) / 100; + global.tune.pool_high_count = ((long long)global.maxsock * global.tune.pool_high_ratio + 99) / 100; + + proxy_adjust_all_maxconn(); + + if (global.tune.maxpollevents <= 0) + global.tune.maxpollevents = MAX_POLL_EVENTS; + + if (global.tune.runqueue_depth <= 0) { + /* tests on various thread counts from 1 to 64 have shown an + * optimal queue depth following roughly 1/sqrt(threads). + */ + int s = my_flsl(global.nbthread); + s += (global.nbthread / s); // roughly twice the sqrt. + global.tune.runqueue_depth = RUNQUEUE_DEPTH * 2 / s; + } + + if (global.tune.recv_enough == 0) + global.tune.recv_enough = MIN_RECV_AT_ONCE_ENOUGH; + + if (global.tune.maxrewrite >= global.tune.bufsize / 2) + global.tune.maxrewrite = global.tune.bufsize / 2; + + usermsgs_clr(NULL); + + if (arg_mode & (MODE_DEBUG | MODE_FOREGROUND)) { + /* command line debug mode inhibits configuration mode */ + global.mode &= ~(MODE_DAEMON | MODE_QUIET); + global.mode |= (arg_mode & (MODE_DEBUG | MODE_FOREGROUND)); + } + + if (arg_mode & MODE_DAEMON) { + /* command line daemon mode inhibits foreground and debug modes mode */ + global.mode &= ~(MODE_DEBUG | MODE_FOREGROUND); + global.mode |= arg_mode & MODE_DAEMON; + } + + global.mode |= (arg_mode & (MODE_QUIET | MODE_VERBOSE)); + + if ((global.mode & MODE_DEBUG) && (global.mode & (MODE_DAEMON | MODE_QUIET))) { + ha_warning("<debug> mode incompatible with <quiet> and <daemon>. Keeping <debug> only.\n"); + global.mode &= ~(MODE_DAEMON | MODE_QUIET); + } + + /* Realloc trash buffers because global.tune.bufsize may have changed */ + if (!init_trash_buffers(0)) { + ha_alert("failed to initialize trash buffers.\n"); + exit(1); + } + + if (!init_log_buffers()) { + ha_alert("failed to initialize log buffers.\n"); + exit(1); + } + + if (!cluster_secret_isset) + generate_random_cluster_secret(); + + /* + * Note: we could register external pollers here. + * Built-in pollers have been registered before main(). + */ + + if (!(global.tune.options & GTUNE_USE_KQUEUE)) + disable_poller("kqueue"); + + if (!(global.tune.options & GTUNE_USE_EVPORTS)) + disable_poller("evports"); + + if (!(global.tune.options & GTUNE_USE_EPOLL)) + disable_poller("epoll"); + + if (!(global.tune.options & GTUNE_USE_POLL)) + disable_poller("poll"); + + if (!(global.tune.options & GTUNE_USE_SELECT)) + disable_poller("select"); + + /* Note: we could disable any poller by name here */ + + if (global.mode & (MODE_VERBOSE|MODE_DEBUG)) { + list_pollers(stderr); + fprintf(stderr, "\n"); + list_filters(stderr); + } + + if (!init_pollers()) { + ha_alert("No polling mechanism available.\n" + " This may happen when using thread-groups with old pollers (poll/select), or\n" + " it is possible that haproxy was built with TARGET=generic and that FD_SETSIZE\n" + " is too low on this platform to support maxconn and the number of listeners\n" + " and servers. 
You should rebuild haproxy specifying your system using TARGET=\n" + " in order to support other polling systems (poll, epoll, kqueue) or reduce the\n" + " global maxconn setting to accommodate the system's limitation. For reference,\n" + " FD_SETSIZE=%d on this system, global.maxconn=%d resulting in a maximum of\n" + " %d file descriptors. You should thus reduce global.maxconn by %d. Also,\n" + " check build settings using 'haproxy -vv'.\n\n", + FD_SETSIZE, global.maxconn, global.maxsock, (global.maxsock + 1 - FD_SETSIZE) / 2); + exit(1); + } + if (global.mode & (MODE_VERBOSE|MODE_DEBUG)) { + printf("Using %s() as the polling mechanism.\n", cur_poller.name); + } + + if (!global.node) + global.node = strdup(hostname); + + /* stop disabled proxies */ + for (px = proxies_list; px; px = px->next) { + if (px->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) + stop_proxy(px); + } + + if (!hlua_post_init()) + exit(1); + + /* Set the per-thread pool cache size to the default value if not set. + * This is the right place to decide to automatically adjust it (e.g. + * check L2 cache size, thread counts or take into account certain + * expensive pools). + */ + if (!global.tune.pool_cache_size) + global.tune.pool_cache_size = CONFIG_HAP_POOL_CACHE_SIZE; + + /* fill in a few info about our version and build options */ + chunk_reset(&trash); + + /* toolchain */ + cc = chunk_newstr(&trash); +#if defined(__clang_version__) + chunk_appendf(&trash, "clang-" __clang_version__); +#elif defined(__VERSION__) + chunk_appendf(&trash, "gcc-" __VERSION__); +#endif +#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) + chunk_appendf(&trash, "+asan"); +#endif + /* toolchain opts */ + cflags = chunk_newstr(&trash); +#ifdef BUILD_CC + chunk_appendf(&trash, "%s", BUILD_CC); +#endif +#ifdef BUILD_CFLAGS + chunk_appendf(&trash, " %s", BUILD_CFLAGS); +#endif +#ifdef BUILD_DEBUG + chunk_appendf(&trash, " %s", BUILD_DEBUG); +#endif + /* settings */ + opts = chunk_newstr(&trash); +#ifdef BUILD_TARGET + chunk_appendf(&trash, "TARGET='%s'", BUILD_TARGET); +#endif +#ifdef BUILD_CPU + chunk_appendf(&trash, " CPU='%s'", BUILD_CPU); +#endif +#ifdef BUILD_OPTIONS + chunk_appendf(&trash, " %s", BUILD_OPTIONS); +#endif + + post_mortem_add_component("haproxy", haproxy_version, cc, cflags, opts, argv[0]); +} + +void deinit(void) +{ + struct proxy *p = proxies_list, *p0; + struct wordlist *wl, *wlb; + struct uri_auth *uap, *ua = NULL; + struct logger *log, *logb; + struct build_opts_str *bol, *bolb; + struct post_deinit_fct *pdf, *pdfb; + struct proxy_deinit_fct *pxdf, *pxdfb; + struct server_deinit_fct *srvdf, *srvdfb; + struct per_thread_init_fct *tif, *tifb; + struct per_thread_deinit_fct *tdf, *tdfb; + struct per_thread_alloc_fct *taf, *tafb; + struct per_thread_free_fct *tff, *tffb; + struct post_server_check_fct *pscf, *pscfb; + struct post_check_fct *pcf, *pcfb; + struct post_proxy_check_fct *ppcf, *ppcfb; + struct pre_check_fct *prcf, *prcfb; + struct cfg_postparser *pprs, *pprsb; + int cur_fd; + + /* the user may want to skip this phase */ + if (global.tune.options & GTUNE_QUICK_EXIT) + return; + + /* At this point the listeners state is weird: + * - most listeners are still bound and referenced in their protocol + * - some might be zombies that are not in their proto anymore, but + * still appear in their proxy's listeners with a valid FD. 
+ * - some might be stopped and still appear in their proxy as FD #-1 + * - among all of them, some might be inherited hence shared and we're + * not allowed to pause them or whatever, we must just close them. + * - finally some are not listeners (pipes, logs, stdout, etc) and + * must be left intact. + * + * The safe way to proceed is to unbind (and close) whatever is not yet + * unbound so that no more receiver/listener remains alive. Then close + * remaining listener FDs, which correspond to zombie listeners (those + * belonging to disabled proxies that were in another process). + * objt_listener() would be cleaner here but not converted yet. + */ + protocol_unbind_all(); + + for (cur_fd = 0; cur_fd < global.maxsock; cur_fd++) { + if (!fdtab || !fdtab[cur_fd].owner) + continue; + + if (fdtab[cur_fd].iocb == &sock_accept_iocb) { + struct listener *l = fdtab[cur_fd].owner; + + BUG_ON(l->state != LI_INIT); + unbind_listener(l); + } + } + + deinit_signals(); + while (p) { + /* build a list of unique uri_auths */ + if (!ua) + ua = p->uri_auth; + else { + /* check if p->uri_auth is unique */ + for (uap = ua; uap; uap=uap->next) + if (uap == p->uri_auth) + break; + + if (!uap && p->uri_auth) { + /* add it, if it is */ + p->uri_auth->next = ua; + ua = p->uri_auth; + } + } + + p0 = p; + p = p->next; + free_proxy(p0); + }/* end while(p) */ + + /* we don't need to free sink_proxies_list nor cfg_log_forward proxies since + * they are respectively cleaned up in sink_deinit() and deinit_log_forward() + */ + + /* destroy all referenced defaults proxies */ + proxy_destroy_all_unref_defaults(); + + while (ua) { + struct stat_scope *scope, *scopep; + + uap = ua; + ua = ua->next; + + free(uap->uri_prefix); + free(uap->auth_realm); + free(uap->node); + free(uap->desc); + + userlist_free(uap->userlist); + free_act_rules(&uap->http_req_rules); + + scope = uap->scope; + while (scope) { + scopep = scope; + scope = scope->next; + + free(scopep->px_id); + free(scopep); + } + + free(uap); + } + + userlist_free(userlist); + + cfg_unregister_sections(); + + deinit_log_buffers(); + + list_for_each_entry(pdf, &post_deinit_list, list) + pdf->fct(); + + ha_free(&global.log_send_hostname); + chunk_destroy(&global.log_tag); + ha_free(&global.chroot); + ha_free(&global.pidfile); + ha_free(&global.node); + ha_free(&global.desc); + ha_free(&oldpids); + ha_free(&old_argv); + ha_free(&localpeer); + ha_free(&global.server_state_base); + ha_free(&global.server_state_file); + task_destroy(idle_conn_task); + idle_conn_task = NULL; + + list_for_each_entry_safe(log, logb, &global.loggers, list) { + LIST_DEL_INIT(&log->list); + free_logger(log); + } + + list_for_each_entry_safe(wl, wlb, &cfg_cfgfiles, list) { + free(wl->s); + LIST_DELETE(&wl->list); + free(wl); + } + + list_for_each_entry_safe(bol, bolb, &build_opts_list, list) { + if (bol->must_free) + free((void *)bol->str); + LIST_DELETE(&bol->list); + free(bol); + } + + list_for_each_entry_safe(pxdf, pxdfb, &proxy_deinit_list, list) { + LIST_DELETE(&pxdf->list); + free(pxdf); + } + + list_for_each_entry_safe(pdf, pdfb, &post_deinit_list, list) { + LIST_DELETE(&pdf->list); + free(pdf); + } + + list_for_each_entry_safe(srvdf, srvdfb, &server_deinit_list, list) { + LIST_DELETE(&srvdf->list); + free(srvdf); + } + + list_for_each_entry_safe(pcf, pcfb, &post_check_list, list) { + LIST_DELETE(&pcf->list); + free(pcf); + } + + list_for_each_entry_safe(pscf, pscfb, &post_server_check_list, list) { + LIST_DELETE(&pscf->list); + free(pscf); + } + + list_for_each_entry_safe(ppcf, ppcfb, 
&post_proxy_check_list, list) { + LIST_DELETE(&ppcf->list); + free(ppcf); + } + + list_for_each_entry_safe(prcf, prcfb, &pre_check_list, list) { + LIST_DELETE(&prcf->list); + free(prcf); + } + + list_for_each_entry_safe(tif, tifb, &per_thread_init_list, list) { + LIST_DELETE(&tif->list); + free(tif); + } + + list_for_each_entry_safe(tdf, tdfb, &per_thread_deinit_list, list) { + LIST_DELETE(&tdf->list); + free(tdf); + } + + list_for_each_entry_safe(taf, tafb, &per_thread_alloc_list, list) { + LIST_DELETE(&taf->list); + free(taf); + } + + list_for_each_entry_safe(tff, tffb, &per_thread_free_list, list) { + LIST_DELETE(&tff->list); + free(tff); + } + + list_for_each_entry_safe(pprs, pprsb, &postparsers, list) { + LIST_DELETE(&pprs->list); + free(pprs); + } + + vars_prune(&proc_vars, NULL, NULL); + pool_destroy_all(); + deinit_pollers(); +} /* end deinit() */ + +__attribute__((noreturn)) void deinit_and_exit(int status) +{ + global.mode |= MODE_STOPPING; + deinit(); + exit(status); +} + +/* Runs the polling loop */ +void run_poll_loop() +{ + int next, wake; + + _HA_ATOMIC_OR(&th_ctx->flags, TH_FL_IN_LOOP); + + clock_update_date(0,1); + while (1) { + wake_expired_tasks(); + + /* check if we caught some signals and process them in the + first thread */ + if (signal_queue_len && tid == 0) { + activity[tid].wake_signal++; + signal_process_queue(); + } + + /* Process a few tasks */ + process_runnable_tasks(); + + /* also stop if we failed to cleanly stop all tasks */ + if (killed > 1) + break; + + /* expire immediately if events or signals are pending */ + wake = 1; + if (thread_has_tasks()) + activity[tid].wake_tasks++; + else { + _HA_ATOMIC_OR(&th_ctx->flags, TH_FL_SLEEPING); + _HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_NOTIFIED); + __ha_barrier_atomic_store(); + if (thread_has_tasks()) { + activity[tid].wake_tasks++; + _HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_SLEEPING); + } else if (signal_queue_len) { + /* this check is required after setting TH_FL_SLEEPING to avoid + * a race with wakeup on signals using wake_threads() */ + _HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_SLEEPING); + } else + wake = 0; + } + + if (!wake) { + int i; + + if (stopping) { + /* stop muxes/quic-conns before acknowledging stopping */ + if (!(tg_ctx->stopping_threads & ti->ltid_bit)) { + task_wakeup(mux_stopping_data[tid].task, TASK_WOKEN_OTHER); + wake = 1; + } + + if (_HA_ATOMIC_OR_FETCH(&tg_ctx->stopping_threads, ti->ltid_bit) == ti->ltid_bit && + _HA_ATOMIC_OR_FETCH(&stopping_tgroup_mask, tg->tgid_bit) == tg->tgid_bit) { + /* first one to detect it, notify all threads that stopping was just set */ + for (i = 0; i < global.nbthread; i++) { + if (_HA_ATOMIC_LOAD(&ha_thread_info[i].tg->threads_enabled) & + ha_thread_info[i].ltid_bit & + ~_HA_ATOMIC_LOAD(&ha_thread_info[i].tg_ctx->stopping_threads)) + wake_thread(i); + } + } + } + + /* stop when there's nothing left to do */ + if ((jobs - unstoppable_jobs) == 0 && + (_HA_ATOMIC_LOAD(&stopping_tgroup_mask) & all_tgroups_mask) == all_tgroups_mask) { + /* check that all threads are aware of the stopping status */ + for (i = 0; i < global.nbtgroups; i++) + if ((_HA_ATOMIC_LOAD(&ha_tgroup_ctx[i].stopping_threads) & + _HA_ATOMIC_LOAD(&ha_tgroup_info[i].threads_enabled)) != + _HA_ATOMIC_LOAD(&ha_tgroup_info[i].threads_enabled)) + break; +#ifdef USE_THREAD + if (i == global.nbtgroups) { + /* all are OK, let's wake them all and stop */ + for (i = 0; i < global.nbthread; i++) + if (i != tid && _HA_ATOMIC_LOAD(&ha_thread_info[i].tg->threads_enabled) & ha_thread_info[i].ltid_bit) + wake_thread(i); 
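+ /* all other enabled threads have been woken to observe the stopping state, so this one can leave the polling loop as well */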
+ break; + } +#endif + } + } + + /* If we have to sleep, measure how long */ + next = wake ? TICK_ETERNITY : next_timer_expiry(); + + /* The poller will ensure it returns around <next> */ + cur_poller.poll(&cur_poller, next, wake); + + activity[tid].loops++; + } + + _HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_IN_LOOP); +} + +static void *run_thread_poll_loop(void *data) +{ + struct per_thread_alloc_fct *ptaf; + struct per_thread_init_fct *ptif; + struct per_thread_deinit_fct *ptdf; + struct per_thread_free_fct *ptff; + static int init_left = 0; + __decl_thread(static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER); + __decl_thread(static pthread_cond_t init_cond = PTHREAD_COND_INITIALIZER); + + ha_set_thread(data); + set_thread_cpu_affinity(); + clock_set_local_source(); + +#ifdef USE_THREAD + ha_thread_info[tid].pth_id = ha_get_pthread_id(tid); +#endif + ha_thread_info[tid].stack_top = __builtin_frame_address(0); + + /* thread is started, from now on it is not idle nor harmless */ + thread_harmless_end(); + thread_idle_end(); + _HA_ATOMIC_OR(&th_ctx->flags, TH_FL_STARTED); + + /* Now, initialize one thread init at a time. This is better since + * some init code is a bit tricky and may release global resources + * after reallocating them locally. This will also ensure there is + * no race on file descriptors allocation. + */ +#ifdef USE_THREAD + pthread_mutex_lock(&init_mutex); +#endif + /* The first thread must set the number of threads left */ + if (!init_left) + init_left = global.nbthread; + init_left--; + + clock_init_thread_date(); + + /* per-thread alloc calls performed here are not allowed to snoop on + * other threads, so they are free to initialize at their own rhythm + * as long as they act as if they were alone. None of them may rely + * on resources initialized by the other ones. + */ + list_for_each_entry(ptaf, &per_thread_alloc_list, list) { + if (!ptaf->fct()) { + ha_alert("failed to allocate resources for thread %u.\n", tid); +#ifdef USE_THREAD + pthread_mutex_unlock(&init_mutex); +#endif + exit(1); + } + } + + /* per-thread init calls performed here are not allowed to snoop on + * other threads, so they are free to initialize at their own rhythm + * as long as they act as if they were alone. + */ + list_for_each_entry(ptif, &per_thread_init_list, list) { + if (!ptif->fct()) { + ha_alert("failed to initialize thread %u.\n", tid); +#ifdef USE_THREAD + pthread_mutex_unlock(&init_mutex); +#endif + exit(1); + } + } + + /* enabling protocols will result in fd_insert() calls to be performed, + * we want all threads to have already allocated their local fd tables + * before doing so, thus only the last thread does it. + */ + if (init_left == 0) + protocol_enable_all(); + +#ifdef USE_THREAD + pthread_cond_broadcast(&init_cond); + pthread_mutex_unlock(&init_mutex); + + /* now wait for other threads to finish starting */ + pthread_mutex_lock(&init_mutex); + while (init_left) + pthread_cond_wait(&init_cond, &init_mutex); + pthread_mutex_unlock(&init_mutex); +#endif + +#if defined(PR_SET_NO_NEW_PRIVS) && defined(USE_PRCTL) + /* Let's refrain from using setuid executables. This way the impact of + * an eventual vulnerability in a library remains limited. It may + * impact external checks but who cares about them anyway ? In the + * worst case it's possible to disable the option. Obviously we do this + * in workers only. We can't hard-fail on this one as it really is + * implementation dependent though we're interested in feedback, hence + * the warning. 
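+ * For reference (standard Linux semantics, not something specific to this
+ * code): once prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) succeeds, execve() no
+ * longer grants extra privileges (setuid/setgid bits and file capabilities
+ * are ignored), the flag is inherited across fork() and execve(), and it
+ * can never be cleared again for the lifetime of the process.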
+ */ + if (!(global.tune.options & GTUNE_INSECURE_SETUID) && !master) { + static int warn_fail; + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1 && !_HA_ATOMIC_FETCH_ADD(&warn_fail, 1)) { + ha_warning("Failed to disable setuid, please report to developers with detailed " + "information about your operating system. You can silence this warning " + "by adding 'insecure-setuid-wanted' in the 'global' section.\n"); + } + } +#endif + +#if defined(RLIMIT_NPROC) + /* all threads have started, it's now time to prevent any new thread + * or process from starting. Obviously we do this in workers only. We + * can't hard-fail on this one as it really is implementation dependent + * though we're interested in feedback, hence the warning. + */ + if (!(global.tune.options & GTUNE_INSECURE_FORK) && !master) { + struct rlimit limit = { .rlim_cur = 0, .rlim_max = 0 }; + static int warn_fail; + + if (setrlimit(RLIMIT_NPROC, &limit) == -1 && !_HA_ATOMIC_FETCH_ADD(&warn_fail, 1)) { + ha_warning("Failed to disable forks, please report to developers with detailed " + "information about your operating system. You can silence this warning " + "by adding 'insecure-fork-wanted' in the 'global' section.\n"); + } + } +#endif + run_poll_loop(); + + list_for_each_entry(ptdf, &per_thread_deinit_list, list) + ptdf->fct(); + + list_for_each_entry(ptff, &per_thread_free_list, list) + ptff->fct(); + +#ifdef USE_THREAD + if (!_HA_ATOMIC_AND_FETCH(&ha_tgroup_info[ti->tgid-1].threads_enabled, ~ti->ltid_bit)) + _HA_ATOMIC_AND(&all_tgroups_mask, ~tg->tgid_bit); + if (!_HA_ATOMIC_AND_FETCH(&tg_ctx->stopping_threads, ~ti->ltid_bit)) + _HA_ATOMIC_AND(&stopping_tgroup_mask, ~tg->tgid_bit); + if (tid > 0) + pthread_exit(NULL); +#endif + return NULL; +} + +/* set uid/gid depending on global settings */ +static void set_identity(const char *program_name) +{ + int from_uid __maybe_unused = geteuid(); + + if (global.gid) { + if (getgroups(0, NULL) > 0 && setgroups(0, NULL) == -1) + ha_warning("[%s.main()] Failed to drop supplementary groups. Using 'gid'/'group'" + " without 'uid'/'user' is generally useless.\n", program_name); + + if (setgid(global.gid) == -1) { + ha_alert("[%s.main()] Cannot set gid %d.\n", program_name, global.gid); + protocol_unbind_all(); + exit(1); + } + } + +#if defined(USE_LINUX_CAP) + if (prepare_caps_for_setuid(from_uid, global.uid) < 0) { + ha_alert("[%s.main()] Cannot switch uid to %d.\n", program_name, global.uid); + protocol_unbind_all(); + exit(1); + } +#endif + + if (global.uid && setuid(global.uid) == -1) { + ha_alert("[%s.main()] Cannot set uid %d.\n", program_name, global.uid); + protocol_unbind_all(); + exit(1); + } + +#if defined(USE_LINUX_CAP) + if (finalize_caps_after_setuid(from_uid, global.uid) < 0) { + ha_alert("[%s.main()] Cannot switch uid to %d.\n", program_name, global.uid); + protocol_unbind_all(); + exit(1); + } +#endif +} + +int main(int argc, char **argv) +{ + int err, retry; + struct rlimit limit; + int pidfd = -1; + int intovf = (unsigned char)argc + 1; /* let the compiler know it's strictly positive */ + + /* Catch broken toolchains */ + if (sizeof(long) != sizeof(void *) || (intovf + 0x7FFFFFFF >= intovf)) { + const char *msg; + + if (sizeof(long) != sizeof(void *)) + /* Apparently MingW64 was not made for us and can also break openssl */ + msg = "The compiler this program was built with uses unsupported integral type sizes.\n" + "Most likely it follows the unsupported LLP64 model. Never try to link HAProxy\n" + "against libraries built with that compiler either! 
Please only use a compiler\n" + "producing ILP32 or LP64 programs for both programs and libraries.\n"; + else if (intovf + 0x7FFFFFFF >= intovf) + /* Catch forced CFLAGS that miss 2-complement integer overflow */ + msg = "The source code was miscompiled by the compiler, which usually indicates that\n" + "some of the CFLAGS needed to work around overzealous compiler optimizations\n" + "were overwritten at build time. Please do not force CFLAGS, and read Makefile\n" + "and INSTALL files to decide on the best way to pass your local build options.\n"; + else + msg = "Bug in the compiler bug detection code, please report it to developers!\n"; + + fprintf(stderr, + "FATAL ERROR: invalid code detected -- cannot go further, please recompile!\n" + "%s" + "\nBuild options :" +#ifdef BUILD_TARGET + "\n TARGET = " BUILD_TARGET +#endif +#ifdef BUILD_CPU + "\n CPU = " BUILD_CPU +#endif +#ifdef BUILD_CC + "\n CC = " BUILD_CC +#endif +#ifdef BUILD_CFLAGS + "\n CFLAGS = " BUILD_CFLAGS +#endif +#ifdef BUILD_OPTIONS + "\n OPTIONS = " BUILD_OPTIONS +#endif +#ifdef BUILD_DEBUG + "\n DEBUG = " BUILD_DEBUG +#endif + "\n\n", msg); + + return 1; + } + + setvbuf(stdout, NULL, _IONBF, 0); + + /* take a copy of initial limits before we possibly change them */ + getrlimit(RLIMIT_NOFILE, &limit); + + if (limit.rlim_max == RLIM_INFINITY) + limit.rlim_max = limit.rlim_cur; + rlim_fd_cur_at_boot = limit.rlim_cur; + rlim_fd_max_at_boot = limit.rlim_max; + + /* process all initcalls in order of potential dependency */ + RUN_INITCALLS(STG_PREPARE); + RUN_INITCALLS(STG_LOCK); + RUN_INITCALLS(STG_REGISTER); + + /* now's time to initialize early boot variables */ + init_early(argc, argv); + + /* handles argument parsing */ + init_args(argc, argv); + + RUN_INITCALLS(STG_ALLOC); + RUN_INITCALLS(STG_POOL); + + /* some code really needs to have the trash properly allocated */ + if (!trash.area) { + ha_alert("failed to initialize trash buffers.\n"); + exit(1); + } + + RUN_INITCALLS(STG_INIT); + + /* this is the late init where the config is parsed */ + init(argc, argv); + + signal_register_fct(SIGQUIT, dump, SIGQUIT); + signal_register_fct(SIGUSR1, sig_soft_stop, SIGUSR1); + signal_register_fct(SIGHUP, sig_dump_state, SIGHUP); + signal_register_fct(SIGUSR2, NULL, 0); + + /* Always catch SIGPIPE even on platforms which define MSG_NOSIGNAL. + * Some recent FreeBSD setups report broken pipes, and MSG_NOSIGNAL + * was defined there, so let's stay on the safe side. 
+ */ + signal_register_fct(SIGPIPE, NULL, 0); + + /* ulimits */ + if (!global.rlimit_nofile) + global.rlimit_nofile = global.maxsock; + + if (global.rlimit_nofile) { + limit.rlim_cur = global.rlimit_nofile; + limit.rlim_max = MAX(rlim_fd_max_at_boot, limit.rlim_cur); + + if ((global.fd_hard_limit && limit.rlim_cur > global.fd_hard_limit) || + raise_rlim_nofile(NULL, &limit) != 0) { + getrlimit(RLIMIT_NOFILE, &limit); + if (global.fd_hard_limit && limit.rlim_cur > global.fd_hard_limit) + limit.rlim_cur = global.fd_hard_limit; + + if (global.tune.options & GTUNE_STRICT_LIMITS) { + ha_alert("[%s.main()] Cannot raise FD limit to %d, limit is %d.\n", + argv[0], global.rlimit_nofile, (int)limit.rlim_cur); + exit(1); + } + else { + /* try to set it to the max possible at least */ + limit.rlim_cur = limit.rlim_max; + if (global.fd_hard_limit && limit.rlim_cur > global.fd_hard_limit) + limit.rlim_cur = global.fd_hard_limit; + + if (raise_rlim_nofile(&limit, &limit) == 0) + getrlimit(RLIMIT_NOFILE, &limit); + + ha_warning("[%s.main()] Cannot raise FD limit to %d, limit is %d.\n", + argv[0], global.rlimit_nofile, (int)limit.rlim_cur); + global.rlimit_nofile = limit.rlim_cur; + } + } + } + + if (global.rlimit_memmax) { + limit.rlim_cur = limit.rlim_max = + global.rlimit_memmax * 1048576ULL; +#ifdef RLIMIT_AS + if (setrlimit(RLIMIT_AS, &limit) == -1) { + if (global.tune.options & GTUNE_STRICT_LIMITS) { + ha_alert("[%s.main()] Cannot fix MEM limit to %d megs.\n", + argv[0], global.rlimit_memmax); + exit(1); + } + else + ha_warning("[%s.main()] Cannot fix MEM limit to %d megs.\n", + argv[0], global.rlimit_memmax); + } +#else + if (setrlimit(RLIMIT_DATA, &limit) == -1) { + if (global.tune.options & GTUNE_STRICT_LIMITS) { + ha_alert("[%s.main()] Cannot fix MEM limit to %d megs.\n", + argv[0], global.rlimit_memmax); + exit(1); + } + else + ha_warning("[%s.main()] Cannot fix MEM limit to %d megs.\n", + argv[0], global.rlimit_memmax); + } +#endif + } + + /* Try to get the listener FDs from the previous process using + * _getsocks on the stat socket; this must never be done in wait mode + * nor in check mode + */ + if (old_unixsocket && + !(global.mode & (MODE_MWORKER_WAIT|MODE_CHECK|MODE_CHECK_CONDITION))) { + if (strcmp("/dev/null", old_unixsocket) != 0) { + if (sock_get_old_sockets(old_unixsocket) != 0) { + ha_alert("Failed to get the sockets from the old process!\n"); + if (!(global.mode & MODE_MWORKER)) + exit(1); + } + } + } + + /* We will loop at most 100 times with 10 ms delay each time. + * That's at most 1 second. We only send a signal to old pids + * if we cannot grab at least one port. + */ + retry = MAX_START_RETRIES; + err = ERR_NONE; + while (retry >= 0) { + struct timeval w; + err = protocol_bind_all(retry == 0 || nb_oldpids == 0); + /* exit the loop on no error or fatal error */ + if ((err & (ERR_RETRYABLE|ERR_FATAL)) != ERR_RETRYABLE) + break; + if (nb_oldpids == 0 || retry == 0) + break; + + /* FIXME-20060514: Solaris and OpenBSD do not support shutdown() on + * listening sockets. So on those platforms, it would be wiser to + * simply send SIGUSR1, which cannot be undone. + */ + if (tell_old_pids(SIGTTOU) == 0) { + /* no need to wait if we can't contact old pids */ + retry = 0; + continue; + } + /* give some time to old processes to stop listening */ + w.tv_sec = 0; + w.tv_usec = 10*1000; + select(0, NULL, NULL, NULL, &w); + retry--; + } + + /* Note: protocol_bind_all() sends an alert when it fails.
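+ * Only warnings may remain past this point: the (err & ~ERR_WARN) test
+ * below treats any other remaining error bit as a fatal startup failure.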
*/ + if ((err & ~ERR_WARN) != ERR_NONE) { + ha_alert("[%s.main()] Some protocols failed to start their listeners! Exiting.\n", argv[0]); + if (retry != MAX_START_RETRIES && nb_oldpids) + tell_old_pids(SIGTTIN); + protocol_unbind_all(); /* cleanup everything we can */ + exit(1); + } + + if (!(global.mode & MODE_MWORKER_WAIT) && listeners == 0) { + ha_alert("[%s.main()] No enabled listener found (check for 'bind' directives) ! Exiting.\n", argv[0]); + /* Note: we don't have to send anything to the old pids because we + * never stopped them. */ + exit(1); + } + + /* Ok, all listeners should now be bound, close any leftover sockets + * the previous process gave us, we don't need them anymore + */ + sock_drop_unused_old_sockets(); + + /* prepare pause/play signals */ + signal_register_fct(SIGTTOU, sig_pause, SIGTTOU); + signal_register_fct(SIGTTIN, sig_listen, SIGTTIN); + + /* MODE_QUIET can inhibit alerts and warnings below this line */ + + if (getenv("HAPROXY_MWORKER_REEXEC") != NULL) { + /* either stdin/out/err are already closed or should stay as they are. */ + if ((global.mode & MODE_DAEMON)) { + /* daemon mode re-executing, stdin/stdout/stderr are already closed so keep quiet */ + global.mode &= ~MODE_VERBOSE; + global.mode |= MODE_QUIET; /* ensure that we won't say anything from now */ + } + } else { + if ((global.mode & MODE_QUIET) && !(global.mode & MODE_VERBOSE)) { + /* detach from the tty */ + stdio_quiet(-1); + } + } + + /* open log & pid files before the chroot */ + if ((global.mode & MODE_DAEMON || global.mode & MODE_MWORKER) && + !(global.mode & MODE_MWORKER_WAIT) && global.pidfile != NULL) { + unlink(global.pidfile); + pidfd = open(global.pidfile, O_CREAT | O_WRONLY | O_TRUNC, 0644); + if (pidfd < 0) { + ha_alert("[%s.main()] Cannot create pidfile %s\n", argv[0], global.pidfile); + if (nb_oldpids) + tell_old_pids(SIGTTIN); + protocol_unbind_all(); + exit(1); + } + } + + if ((global.last_checks & LSTCHK_NETADM) && global.uid) { + ha_alert("[%s.main()] Some configuration options require full privileges, so global.uid cannot be changed.\n" + "", argv[0]); + protocol_unbind_all(); + exit(1); + } + + /* If the user is not root, we'll still let them try the configuration + * but we inform them that unexpected behaviour may occur. + */ + if ((global.last_checks & LSTCHK_NETADM) && getuid()) + ha_warning("[%s.main()] Some options which require full privileges" + " might not work well.\n" + "", argv[0]); + + if ((global.mode & (MODE_MWORKER|MODE_DAEMON)) == 0) { + + /* chroot if needed */ + if (global.chroot != NULL) { + if (chroot(global.chroot) == -1 || chdir("/") == -1) { + ha_alert("[%s.main()] Cannot chroot(%s).\n", argv[0], global.chroot); + if (nb_oldpids) + tell_old_pids(SIGTTIN); + protocol_unbind_all(); + exit(1); + } + } + } + + if (nb_oldpids && !(global.mode & MODE_MWORKER_WAIT)) + nb_oldpids = tell_old_pids(oldpids_sig); + + /* send a SIGTERM to workers who have a too high reloads number */ + if ((global.mode & MODE_MWORKER) && !(global.mode & MODE_MWORKER_WAIT)) + mworker_kill_max_reloads(SIGTERM); + + /* Note that any error at this stage will be fatal because we will not + * be able to restart the old pids. + */ + + if ((global.mode & (MODE_MWORKER | MODE_DAEMON)) == 0) + set_identity(argv[0]); + + /* check ulimits */ + limit.rlim_cur = limit.rlim_max = 0; + getrlimit(RLIMIT_NOFILE, &limit); + if (limit.rlim_cur < global.maxsock) { + if (global.tune.options & GTUNE_STRICT_LIMITS) { + ha_alert("[%s.main()] FD limit (%d) too low for maxconn=%d/maxsock=%d. 
" + "Please raise 'ulimit-n' to %d or more to avoid any trouble.\n", + argv[0], (int)limit.rlim_cur, global.maxconn, global.maxsock, + global.maxsock); + exit(1); + } + else + ha_alert("[%s.main()] FD limit (%d) too low for maxconn=%d/maxsock=%d. " + "Please raise 'ulimit-n' to %d or more to avoid any trouble.\n", + argv[0], (int)limit.rlim_cur, global.maxconn, global.maxsock, + global.maxsock); + } + + if (global.prealloc_fd && fcntl((int)limit.rlim_cur - 1, F_GETFD) == -1) { + if (dup2(0, (int)limit.rlim_cur - 1) == -1) + ha_warning("[%s.main()] Unable to preallocate file descriptor %d : %s", + argv[0], (int)limit.rlim_cur - 1, strerror(errno)); + else + close((int)limit.rlim_cur - 1); + } + + /* update the ready date a last time to also account for final setup time */ + clock_update_date(0, 1); + clock_adjust_now_offset(); + ready_date = date; + + if (global.mode & (MODE_DAEMON | MODE_MWORKER | MODE_MWORKER_WAIT)) { + int ret = 0; + int in_parent = 0; + int devnullfd = -1; + + /* + * if daemon + mworker: must fork here to let a master + * process live in background before forking children + */ + + if ((getenv("HAPROXY_MWORKER_REEXEC") == NULL) + && (global.mode & MODE_MWORKER) + && (global.mode & MODE_DAEMON)) { + ret = fork(); + if (ret < 0) { + ha_alert("[%s.main()] Cannot fork.\n", argv[0]); + protocol_unbind_all(); + exit(1); /* there has been an error */ + } else if (ret > 0) { /* parent leave to daemonize */ + exit(0); + } else /* change the process group ID in the child (master process) */ + setsid(); + } + + + /* if in master-worker mode, write the PID of the father */ + if (global.mode & MODE_MWORKER) { + char pidstr[100]; + snprintf(pidstr, sizeof(pidstr), "%d\n", (int)getpid()); + if (pidfd >= 0) + DISGUISE(write(pidfd, pidstr, strlen(pidstr))); + } + + /* the father launches the required number of processes */ + if (!(global.mode & MODE_MWORKER_WAIT)) { + struct ring *tmp_startup_logs = NULL; + + if (global.mode & MODE_MWORKER) + mworker_ext_launch_all(); + + /* at this point the worker must have his own startup_logs buffer */ + tmp_startup_logs = startup_logs_dup(startup_logs); + ret = fork(); + if (ret < 0) { + ha_alert("[%s.main()] Cannot fork.\n", argv[0]); + protocol_unbind_all(); + exit(1); /* there has been an error */ + } + else if (ret == 0) { /* child breaks here */ + startup_logs_free(startup_logs); + startup_logs = tmp_startup_logs; + /* This one must not be exported, it's internal! 
*/ + unsetenv("HAPROXY_MWORKER_REEXEC"); + ha_random_jump96(1); + } + else { /* parent here */ + in_parent = 1; + + if (pidfd >= 0 && !(global.mode & MODE_MWORKER)) { + char pidstr[100]; + snprintf(pidstr, sizeof(pidstr), "%d\n", ret); + DISGUISE(write(pidfd, pidstr, strlen(pidstr))); + } + if (global.mode & MODE_MWORKER) { + struct mworker_proc *child; + + ha_notice("New worker (%d) forked\n", ret); + /* find the right mworker_proc */ + list_for_each_entry(child, &proc_list, list) { + if (child->reloads == 0 && + child->options & PROC_O_TYPE_WORKER && + child->pid == -1) { + child->timestamp = date.tv_sec; + child->pid = ret; + child->version = strdup(haproxy_version); + /* at this step the fd is bound for the worker, set it to -1 so + * it could be close in case of errors in mworker_cleanup_proc() */ + child->ipc_fd[1] = -1; + break; + } + } + } + } + + } else { + /* wait mode */ + in_parent = 1; + } + + /* close the pidfile both in children and father */ + if (pidfd >= 0) { + //lseek(pidfd, 0, SEEK_SET); /* debug: emulate eglibc bug */ + close(pidfd); + } + + /* We won't ever use this anymore */ + ha_free(&global.pidfile); + + if (in_parent) { + if (global.mode & (MODE_MWORKER|MODE_MWORKER_WAIT)) { + master = 1; + + if ((!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)) && + (global.mode & MODE_DAEMON)) { + /* detach from the tty, this is required to properly daemonize. */ + if ((getenv("HAPROXY_MWORKER_REEXEC") == NULL)) + stdio_quiet(-1); + + global.mode &= ~MODE_VERBOSE; + global.mode |= MODE_QUIET; /* ensure that we won't say anything from now */ + } + + if (global.mode & MODE_MWORKER_WAIT) { + /* only the wait mode handles the master CLI */ + mworker_loop(); + } else { + +#if defined(USE_SYSTEMD) + if (global.tune.options & GTUNE_USE_SYSTEMD) + sd_notifyf(0, "READY=1\nMAINPID=%lu\nSTATUS=Ready.\n", (unsigned long)getpid()); +#endif + /* if not in wait mode, reload in wait mode to free the memory */ + setenv("HAPROXY_LOAD_SUCCESS", "1", 1); + ha_notice("Loading success.\n"); + proc_self->failedreloads = 0; /* reset the number of failure */ + mworker_reexec_waitmode(); + } + /* should never get there */ + exit(EXIT_FAILURE); + } +#if defined(USE_OPENSSL) && !defined(OPENSSL_NO_DH) + ssl_free_dh(); +#endif + exit(0); /* parent must leave */ + } + + /* child must never use the atexit function */ + atexit_flag = 0; + + /* close useless master sockets */ + if (global.mode & MODE_MWORKER) { + struct mworker_proc *child, *it; + master = 0; + + mworker_cli_proxy_stop(); + + /* free proc struct of other processes */ + list_for_each_entry_safe(child, it, &proc_list, list) { + /* close the FD of the master side for all + * workers, we don't need to close the worker + * side of other workers since it's done with + * the bind_proc */ + if (child->ipc_fd[0] >= 0) { + close(child->ipc_fd[0]); + child->ipc_fd[0] = -1; + } + if (child->options & PROC_O_TYPE_WORKER && + child->reloads == 0 && + child->pid == -1) { + /* keep this struct if this is our pid */ + proc_self = child; + continue; + } + LIST_DELETE(&child->list); + mworker_free_child(child); + child = NULL; + } + } + + if (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)) { + devnullfd = open("/dev/null", O_RDWR, 0); + if (devnullfd < 0) { + ha_alert("Cannot open /dev/null\n"); + exit(EXIT_FAILURE); + } + } + + /* Must chroot and setgid/setuid in the children */ + /* chroot if needed */ + if (global.chroot != NULL) { + if (chroot(global.chroot) == -1 || chdir("/") == -1) { + ha_alert("[%s.main()] Cannot chroot(%s).\n", 
argv[0], global.chroot); + if (nb_oldpids) + tell_old_pids(SIGTTIN); + protocol_unbind_all(); + exit(1); + } + } + + ha_free(&global.chroot); + set_identity(argv[0]); + + /* + * This is only done in daemon mode because we might want the + * logs on stdout in mworker mode. If we're NOT in QUIET mode, + * we should now close the first 3 FDs to ensure that we can + * detach from the TTY. We MUST NOT do it in other cases since + * it would already have been done, and FDs 0-2 would have been + * assigned to listening sockets + */ + if ((global.mode & MODE_DAEMON) && + (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE))) { + /* detach from the tty */ + stdio_quiet(devnullfd); + global.mode &= ~MODE_VERBOSE; + global.mode |= MODE_QUIET; /* ensure that we won't say anything from now */ + } + pid = getpid(); /* update child's pid */ + if (!(global.mode & MODE_MWORKER)) /* in mworker mode we don't want a new pgid for the children */ + setsid(); + fork_poller(); + } + + /* pass through every cli socket, and check if it's bound to + * the current process and if it exposes listener sockets. + * Caution: the GTUNE_SOCKET_TRANSFER is now set after the fork. + */ + + if (global.cli_fe) { + struct bind_conf *bind_conf; + + list_for_each_entry(bind_conf, &global.cli_fe->conf.bind, by_fe) { + if (bind_conf->level & ACCESS_FD_LISTENERS) { + global.tune.options |= GTUNE_SOCKET_TRANSFER; + break; + } + } + } + + /* Note that here we can't be in the parent/master anymore */ +#if !defined(USE_THREAD) && defined(USE_CPU_AFFINITY) + if (ha_cpuset_count(&cpu_map[0].thread[0])) { /* only do this if the process has a CPU map */ + +#if defined(CPUSET_USE_CPUSET) || defined(__DragonFly__) + struct hap_cpuset *set = &cpu_map[0].thread[0]; + sched_setaffinity(0, sizeof(set->cpuset), &set->cpuset); +#elif defined(__FreeBSD__) + struct hap_cpuset *set = &cpu_map[0].thread[0]; + ret = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, sizeof(set->cpuset), &set->cpuset); +#endif + } +#endif + /* try our best to re-enable core dumps depending on system capabilities.
+ * What is addressed here:
+ *   - remove file size limits
+ *   - remove core size limits
+ *   - mark the process dumpable again if it lost it due to user/group
+ */
+	if (global.tune.options & GTUNE_SET_DUMPABLE) {
+		limit.rlim_cur = limit.rlim_max = RLIM_INFINITY;
+
+#if defined(RLIMIT_FSIZE)
+		if (setrlimit(RLIMIT_FSIZE, &limit) == -1) {
+			if (global.tune.options & GTUNE_STRICT_LIMITS) {
+				ha_alert("[%s.main()] Failed to raise the maximum "
+					 "file size.\n", argv[0]);
+				exit(1);
+			}
+			else
+				ha_warning("[%s.main()] Failed to raise the maximum "
+					   "file size.\n", argv[0]);
+		}
+#endif
+
+#if defined(RLIMIT_CORE)
+		if (setrlimit(RLIMIT_CORE, &limit) == -1) {
+			if (global.tune.options & GTUNE_STRICT_LIMITS) {
+				ha_alert("[%s.main()] Failed to raise the core "
+					 "dump size.\n", argv[0]);
+				exit(1);
+			}
+			else
+				ha_warning("[%s.main()] Failed to raise the core "
+					   "dump size.\n", argv[0]);
+		}
+#endif
+
+#if defined(USE_PRCTL)
+		if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) == -1)
+			ha_warning("[%s.main()] Failed to set the dumpable flag, "
+				   "no core will be dumped.\n", argv[0]);
+#elif defined(USE_PROCCTL)
+		{
+			int traceable = PROC_TRACE_CTL_ENABLE;
+			if (procctl(P_PID, getpid(), PROC_TRACE_CTL, &traceable) == -1)
+				ha_warning("[%s.main()] Failed to set the traceable flag, "
+					   "no core will be dumped.\n", argv[0]);
+		}
+#endif
+	}
+
+	global.mode &= ~MODE_STARTING;
+	reset_usermsgs_ctx();
+
+	/* start threads 2 and above */
+	setup_extra_threads(&run_thread_poll_loop);
+
+	/* when multithreading we need to let only the thread 0 handle the signals */
+	haproxy_unblock_signals();
+
+	/* Finally, start the poll loop for the first thread */
+	run_thread_poll_loop(&ha_thread_info[0]);
+
+	/* wait for all threads to terminate */
+	wait_for_threads_completion();
+
+	deinit_and_exit(0);
+}
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/src/hash.c b/src/hash.c
new file mode 100644
index 0000000..5c92e94
--- /dev/null
+++ b/src/hash.c
@@ -0,0 +1,190 @@
+/*
+ * Hash function implementation
+ *
+ * See mailing list thread on "Consistent hashing alternative to sdbm"
+ * http://marc.info/?l=haproxy&m=138213693909219
+ *
+ * Copyright 2000-2010 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+
+#include <haproxy/compiler.h>
+#include <haproxy/hash.h>
+
+
+unsigned int hash_wt6(const void *input, int len)
+{
+	const unsigned char *key = input;
+	unsigned h0 = 0xa53c965aUL;
+	unsigned h1 = 0x5ca6953aUL;
+	unsigned step0 = 6;
+	unsigned step1 = 18;
+
+	for (; len > 0; len--) {
+		unsigned int t;
+
+		t = *key;
+		key++;
+
+		h0 = ~(h0 ^ t);
+		h1 = ~(h1 + t);
+
+		t  = (h1 << step0) | (h1 >> (32-step0));
+		h1 = (h0 << step1) | (h0 >> (32-step1));
+		h0 = t;
+
+		t = ((h0 >> 16) ^ h1) & 0xffff;
+		step0 = t & 0x1F;
+		step1 = t >> 11;
+	}
+	return h0 ^ h1;
+}
+
+unsigned int hash_djb2(const void *input, int len)
+{
+	const unsigned char *key = input;
+	unsigned int hash = 5381;
+
+	/* the hash unrolled eight times */
+	for (; len >= 8; len -= 8) {
+		hash = ((hash << 5) + hash) + *key++;
+		hash = ((hash << 5) + hash) + *key++;
+		hash = ((hash << 5) + hash) + *key++;
+		hash = ((hash << 5) + hash) + *key++;
+		hash = ((hash << 5) + hash) + *key++;
+		hash = ((hash << 5) + hash) + *key++;
+		hash = ((hash << 5) + hash) + *key++;
+		hash = ((hash << 5) + hash) + *key++;
+	}
+	switch (len) {
+	case 7: hash = ((hash << 5) + hash) + *key++; __fallthrough;
+	case 6: hash = ((hash << 5) + hash) + *key++; __fallthrough;
+	case 5: hash = ((hash << 5) + hash) + *key++; __fallthrough;
+	case 4: hash = ((hash << 5) + hash) + *key++; __fallthrough;
+	case 3: hash = ((hash << 5) + hash) + *key++; __fallthrough;
+	case 2: hash = ((hash << 5) + hash) + *key++; __fallthrough;
+	case 1: hash = ((hash << 5) + hash) + *key++; break;
+	default: /* case 0: */ break;
+	}
+	return hash;
+}
+
+unsigned int hash_sdbm(const void *input, int len)
+{
+	const unsigned char *key = input;
+	unsigned int hash = 0;
+	int c;
+
+	while (len--) {
+		c = *key++;
+		hash = c + (hash << 6) + (hash << 16) - hash;
+	}
+
+	return hash;
+}
+
+/* Small yet efficient CRC32 calculation loosely inspired from crc32b found
+ * here : http://www.hackersdelight.org/hdcodetxt/crc.c.txt
+ * The magic value represents the polynomial with one bit per exponent. Much
+ * faster table-based versions exist but are pointless for our usage here:
+ * this hash already sustains gigabit speed, which is far faster than what
+ * we'd ever need. Better to preserve the CPU's cache instead.
+ */
+unsigned int hash_crc32(const void *input, int len)
+{
+	const unsigned char *key = input;
+	unsigned int hash;
+	int bit;
+
+	hash = ~0;
+	while (len--) {
+		hash ^= *key++;
+		for (bit = 0; bit < 8; bit++)
+			hash = (hash >> 1) ^ ((hash & 1) ? 0xedb88320 : 0);
+	}
+	return ~hash;
+}
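+/* Usage sketch (illustrative only, not part of the original file): all of
+ * the helpers above share the same prototype, so a caller can pick one at
+ * run time and derive a bucket from it. <hfn> and <nbuckets> are
+ * hypothetical names used for this example.
+ */
+static inline unsigned int hash_to_bucket_example(unsigned int (*hfn)(const void *, int),
+                                                  const void *key, int len,
+                                                  unsigned int nbuckets)
+{
+	/* any of hash_wt6/hash_djb2/hash_sdbm/hash_crc32 may be passed as
+	 * <hfn>; only the distribution of the values differs between them.
+	 * <nbuckets> must be non-zero.
+	 */
+	return hfn(key, len) % nbuckets;
+}
+
+/* CRC32c poly 0x11EDC6F41 (RFC4960, Appendix B [8].)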
*/ +static const uint32_t crctable[256] = { + 0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, + 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, + 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, + 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L, + 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL, + 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, + 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L, + 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, + 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL, + 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, + 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, + 0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, + 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, + 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL, + 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, + 0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L, + 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, + 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L, + 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, + 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L, + 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L, + 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, + 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L, + 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, + 0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L, + 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, + 0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L, + 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L, + 0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L, + 0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L, + 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, + 0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L, + 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL, + 0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L, + 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L, + 0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL, + 0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L, + 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL, + 0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL, + 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L, + 0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L, + 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL, + 0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL, + 0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L, + 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL, + 0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L, + 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, + 0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL, + 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L, + 0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL, + 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL, + 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L, + 0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL, + 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L, + 0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L, + 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL, + 0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL, + 0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L, + 0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L, + 0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL, + 0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L, + 0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL, + 0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL, + 0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L +}; + +uint32_t hash_crc32c(const void *input, int len) 
+{
+	const unsigned char *buf = input;
+	uint32_t crc = 0xffffffff;
+	while (len-- > 0) {
+		crc = (crc >> 8) ^ crctable[(crc ^ (*buf++)) & 0xff];
+	}
+	return (crc ^ 0xffffffff);
+}
diff --git a/src/hlua.c b/src/hlua.c
new file mode 100644
index 0000000..d1f5323
--- /dev/null
+++ b/src/hlua.c
@@ -0,0 +1,13961 @@
+/*
+ * Lua unsafe core engine
+ *
+ * Copyright 2015-2016 Thierry Fournier <tfournier@arpalert.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#define _GNU_SOURCE
+
+#include <ctype.h>
+#include <setjmp.h>
+
+#include <lauxlib.h>
+#include <lua.h>
+#include <lualib.h>
+
+#if !defined(LUA_VERSION_NUM) || LUA_VERSION_NUM < 503
+#error "Requires Lua 5.3 or later."
+#endif
+
+#include <import/ebpttree.h>
+
+#include <haproxy/api.h>
+#include <haproxy/applet.h>
+#include <haproxy/arg.h>
+#include <haproxy/auth.h>
+#include <haproxy/cfgparse.h>
+#include <haproxy/channel.h>
+#include <haproxy/cli.h>
+#include <haproxy/clock.h>
+#include <haproxy/connection.h>
+#include <haproxy/filters.h>
+#include <haproxy/h1.h>
+#include <haproxy/hlua.h>
+#include <haproxy/hlua_fcn.h>
+#include <haproxy/http_ana.h>
+#include <haproxy/http_client.h>
+#include <haproxy/http_fetch.h>
+#include <haproxy/http_htx.h>
+#include <haproxy/http_rules.h>
+#include <haproxy/log.h>
+#include <haproxy/map.h>
+#include <haproxy/obj_type.h>
+#include <haproxy/pattern.h>
+#include <haproxy/payload.h>
+#include <haproxy/proxy.h>
+#include <haproxy/regex.h>
+#include <haproxy/sample.h>
+#include <haproxy/sc_strm.h>
+#include <haproxy/server.h>
+#include <haproxy/session.h>
+#include <haproxy/ssl_ckch.h>
+#include <haproxy/ssl_sock.h>
+#include <haproxy/stats-t.h>
+#include <haproxy/stconn.h>
+#include <haproxy/stream.h>
+#include <haproxy/task.h>
+#include <haproxy/tcp_rules.h>
+#include <haproxy/thread.h>
+#include <haproxy/tools.h>
+#include <haproxy/vars.h>
+#include <haproxy/xref.h>
+#include <haproxy/event_hdl.h>
+#include <haproxy/check.h>
+#include <haproxy/mailers.h>
+
+/* Global LUA flags */
+
+enum hlua_log_opt {
+	/* tune.lua.log.loggers */
+	HLUA_LOG_LOGGERS_ON  = 0x00000001, /* forward logs to current loggers */
+
+	/* tune.lua.log.stderr */
+	HLUA_LOG_STDERR_ON   = 0x00000010, /* forward logs to stderr */
+	HLUA_LOG_STDERR_AUTO = 0x00000020, /* forward logs to stderr if no loggers */
+	HLUA_LOG_STDERR_MASK = 0x00000030,
+};
+/* default log options, made of flags in hlua_log_opt */
+static uint hlua_log_opts = HLUA_LOG_LOGGERS_ON | HLUA_LOG_STDERR_AUTO;
+
+/* Lua uses longjmp to perform yields or to throw errors. These macros
+ * are used only to identify functions that may not return because a
+ * longjmp is executed.
+ *   __LJMP marks a prototype of an hlua function that can use longjmp.
+ *   WILL_LJMP() marks a lua function that will use longjmp.
+ *   MAY_LJMP() marks a lua function that may use longjmp.
+ */
+#define __LJMP
+#define WILL_LJMP(func) do { func; my_unreachable(); } while(0)
+#define MAY_LJMP(func) func
+
+/* This pair of functions safely executes some Lua calls outside of the
+ * Lua runtime environment. Each Lua call can trigger a longjmp if it
+ * encounters a memory error.
+ *
+ * Lua documentation extract:
+ *
+ * If an error happens outside any protected environment, Lua calls
+ * a panic function (see lua_atpanic) and then calls abort, thus
+ * exiting the host application. Your panic function can avoid this
+ * exit by never returning (e.g., doing a long jump to your own
+ * recovery point outside Lua).
+ *
+ * The panic function runs as if it were a message handler (see
+ * #2.3); in particular, the error message is at the top of the
+ * stack. However, there is no guarantee about stack space. To push
+ * anything on the stack, the panic function must first check the
+ * available space (see #4.2).
+ *
+ * We must check all the Lua entry points. This includes:
+ *  - the include/proto/hlua.h exported functions
+ *  - the task wrapper function
+ *  - the action wrapper function
+ *  - the converters wrapper function
+ *  - the sample-fetch wrapper functions
+ *
+ * It is tolerated that the initialisation function aborts. Before each
+ * Lua abort, an error message is written on stderr.
+ *
+ * The macro SET_SAFE_LJMP initialises the longjmp, and the macro
+ * RESET_SAFE_LJMP resets it. These must be macros because they must
+ * exist in the program stack when the longjmp is called.
+ *
+ * Note that the Lua processing is not really thread safe. It provides
+ * a heavy mechanism which consists in adding our own lock function in
+ * the Lua code and recompiling the library. Such a system would
+ * probably not be accepted by the maintainers of various distros.
+ *
+ * Our main Lua execution point is the function lua_resume(). A quick
+ * look at the Lua sources shows a lua_lock() at the start of the
+ * function and a lua_unlock() at its end. So I conclude that the Lua
+ * thread-safe mode just performs a mutex around all execution. So I
+ * prefer to do this in the HAProxy code; it will be easier for distro
+ * maintainers.
+ *
+ * Note that the HAProxy Lua functions wrapped by the macros
+ * SET_SAFE_LJMP and RESET_SAFE_LJMP manipulate the Lua stack, so care
+ * must be taken to hold the mutex around these functions.
+ */
+__decl_spinlock(hlua_global_lock);
+THREAD_LOCAL jmp_buf safe_ljmp_env;
+static int hlua_panic_safe(lua_State *L) { return 0; }
+static int hlua_panic_ljmp(lua_State *L) { WILL_LJMP(longjmp(safe_ljmp_env, 1)); return 0; }
+
+/* This is the chained list of struct hlua_function referenced
+ * for haproxy actions, sample-fetches, converters, cli and
+ * applet bindings. It is used for a post-initialisation control.
+ */
+static struct list referenced_functions = LIST_HEAD_INIT(referenced_functions);
+
+/* This variable is used only during initialization to identify the Lua state
+ * currently being initialized. 0 is the common lua state, 1 to n are the Lua
+ * states dedicated to each thread (in this case hlua_state_id == tid + 1).
+ */
+static int hlua_state_id;
+
+/* This is a NULL-terminated list of lua files which are referenced to be loaded per thread */
+static char ***per_thread_load = NULL;
+
+lua_State *hlua_init_state(int thread_id);
+
+/* This function takes the Lua global lock. Keep this function's visibility
+ * global so that it can appear in stack dumps and performance profiles!
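+ * It is only needed for the shared state (state id 0): per-thread Lua
+ * states are never accessed concurrently, so hlua_lock() below simply
+ * skips the lock for them.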
+ */ +static inline void lua_take_global_lock() +{ + HA_SPIN_LOCK(LUA_LOCK, &hlua_global_lock); +} + +static inline void lua_drop_global_lock() +{ + HA_SPIN_UNLOCK(LUA_LOCK, &hlua_global_lock); +} + +/* lua lock helpers: only lock when required + * + * state_id == 0: we're operating on the main lua stack (shared between + * os threads), so we need to acquire the main lock + * + * If the thread already owns the lock (_hlua_locked != 0), skip the lock + * attempt. This could happen if we run under protected lua environment. + * Not doing this could result in deadlocks because of nested locking + * attempts from the same thread + */ +static THREAD_LOCAL int _hlua_locked = 0; +static inline void hlua_lock(struct hlua *hlua) +{ + if (hlua->state_id != 0) + return; + if (!_hlua_locked) + lua_take_global_lock(); + _hlua_locked += 1; +} +static inline void hlua_unlock(struct hlua *hlua) +{ + if (hlua->state_id != 0) + return; + BUG_ON(_hlua_locked <= 0); + _hlua_locked--; + /* drop the lock once the lock count reaches 0 */ + if (!_hlua_locked) + lua_drop_global_lock(); +} + +#define SET_SAFE_LJMP_L(__L, __HLUA) \ + ({ \ + int ret; \ + hlua_lock(__HLUA); \ + if (setjmp(safe_ljmp_env) != 0) { \ + lua_atpanic(__L, hlua_panic_safe); \ + ret = 0; \ + hlua_unlock(__HLUA); \ + } else { \ + lua_atpanic(__L, hlua_panic_ljmp); \ + ret = 1; \ + } \ + ret; \ + }) + +/* If we are the last function catching Lua errors, we + * must reset the panic function. + */ +#define RESET_SAFE_LJMP_L(__L, __HLUA) \ + do { \ + lua_atpanic(__L, hlua_panic_safe); \ + hlua_unlock(__HLUA); \ + } while(0) + +#define SET_SAFE_LJMP(__HLUA) \ + SET_SAFE_LJMP_L((__HLUA)->T, __HLUA) + +#define RESET_SAFE_LJMP(__HLUA) \ + RESET_SAFE_LJMP_L((__HLUA)->T, __HLUA) + +#define SET_SAFE_LJMP_PARENT(__HLUA) \ + SET_SAFE_LJMP_L(hlua_states[(__HLUA)->state_id], __HLUA) + +#define RESET_SAFE_LJMP_PARENT(__HLUA) \ + RESET_SAFE_LJMP_L(hlua_states[(__HLUA)->state_id], __HLUA) + +/* Applet status flags */ +#define APPLET_DONE 0x01 /* applet processing is done. */ +/* unused: 0x02 */ +#define APPLET_HDR_SENT 0x04 /* Response header sent. */ +/* unused: 0x08, 0x10 */ +#define APPLET_HTTP11 0x20 /* Last chunk sent. */ +#define APPLET_RSP_SENT 0x40 /* The response was fully sent */ + +/* The main Lua execution context. The 0 index is the + * common state shared by all threads. + */ +static lua_State *hlua_states[MAX_THREADS + 1]; + +#define HLUA_FLT_CB_FINAL 0x00000001 +#define HLUA_FLT_CB_RETVAL 0x00000002 +#define HLUA_FLT_CB_ARG_CHN 0x00000004 +#define HLUA_FLT_CB_ARG_HTTP_MSG 0x00000008 + +#define HLUA_FLT_CTX_FL_PAYLOAD 0x00000001 + +struct hlua_reg_filter { + char *name; + int flt_ref[MAX_THREADS + 1]; + int fun_ref[MAX_THREADS + 1]; + struct list l; +}; + +struct hlua_flt_config { + struct hlua_reg_filter *reg; + int ref[MAX_THREADS + 1]; + char **args; +}; + +struct hlua_flt_ctx { + int ref; /* ref to the filter lua object */ + struct hlua *hlua[2]; /* lua runtime context (0: request, 1: response) */ + unsigned int cur_off[2]; /* current offset (0: request, 1: response) */ + unsigned int cur_len[2]; /* current forwardable length (0: request, 1: response) */ + unsigned int flags; /* HLUA_FLT_CTX_FL_* */ +}; + +/* appctx context used by the cosockets */ +struct hlua_csk_ctx { + int connected; + struct xref xref; /* cross reference with the Lua object owner. 
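+			    * (see haproxy/xref.h: the peer side of the
+			    * reference lives in the Lua socket object)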
*/ + struct list wake_on_read; + struct list wake_on_write; + struct appctx *appctx; + struct server *srv; + int timeout; + int die; +}; + +/* appctx context used by TCP services */ +struct hlua_tcp_ctx { + struct hlua *hlua; + int flags; + struct task *task; +}; + +/* appctx context used by HTTP services */ +struct hlua_http_ctx { + struct hlua *hlua; + int left_bytes; /* The max amount of bytes that we can read. */ + int flags; + int status; + const char *reason; + struct task *task; +}; + +/* used by registered CLI keywords */ +struct hlua_cli_ctx { + struct hlua *hlua; + struct task *task; + struct hlua_function *fcn; +}; + +DECLARE_STATIC_POOL(pool_head_hlua_flt_ctx, "hlua_flt_ctx", sizeof(struct hlua_flt_ctx)); + +static int hlua_filter_from_payload(struct filter *filter); + +/* This is the chained list of struct hlua_flt referenced + * for haproxy filters. It is used for a post-initialisation control. + */ +static struct list referenced_filters = LIST_HEAD_INIT(referenced_filters); + + +/* This is the memory pool containing struct lua for applets + * (including cli). + */ +DECLARE_STATIC_POOL(pool_head_hlua, "hlua", sizeof(struct hlua)); + +/* Used for Socket connection. */ +static struct proxy *socket_proxy; +static struct server *socket_tcp; +#ifdef USE_OPENSSL +static struct server *socket_ssl; +#endif + +/* List head of the function called at the initialisation time. */ +struct list hlua_init_functions[MAX_THREADS + 1]; + +/* The following variables contains the reference of the different + * Lua classes. These references are useful for identify metadata + * associated with an object. + */ +static int class_txn_ref; +static int class_socket_ref; +static int class_channel_ref; +static int class_fetches_ref; +static int class_converters_ref; +static int class_http_ref; +static int class_http_msg_ref; +static int class_httpclient_ref; +static int class_map_ref; +static int class_applet_tcp_ref; +static int class_applet_http_ref; +static int class_txn_reply_ref; + +/* Lua max execution timeouts. By default, stream-related + * lua coroutines (e.g.: actions) have a short timeout. + * On the other hand tasks coroutines don't have a timeout because + * a task may remain alive during all the haproxy execution. + * + * Timeouts are expressed in milliseconds, they are meant to be used + * with hlua timer's API exclusively. + * 0 means no timeout + */ +static uint32_t hlua_timeout_burst = 1000; /* burst timeout. */ +static uint32_t hlua_timeout_session = 4000; /* session timeout. */ +static uint32_t hlua_timeout_task = 0; /* task timeout. */ +static uint32_t hlua_timeout_applet = 4000; /* applet timeout. */ + +/* hlua multipurpose timer: + * used to compute burst lua time (within a single hlua_ctx_resume()) + * and cumulative lua time for a given coroutine, and to check + * the lua coroutine against the configured timeouts + */ + +/* fetch per-thread cpu_time with ms precision (may wrap) */ +static inline uint32_t _hlua_time_ms() +{ + /* We're interested in the current cpu time in ms, which will be returned + * as a uint32_t to save some space. + * We must take the following into account: + * + * - now_cpu_time_fast() which returns the time in nanoseconds as a uint64_t + * will wrap every 585 years. + * - uint32_t may only contain 4294967295ms (~=49.7 days), so _hlua_time_ms() + * itself will also wrap every 49.7 days. 
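+ *   (2^32 ms = 4294967296 ms; 4294967296 / (1000 * 3600 * 24) ~= 49.7 days)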
+ *
+ * While we can safely ignore the now_cpu_time_fast() wrap, we must
+ * take care of the uint32_t wrap by making sure to exclusively
+ * manipulate the time using uint32_t everywhere _hlua_time_ms()
+ * is involved.
+ */
+	return (uint32_t)(now_cpu_time_fast() / 1000000ULL);
+}
+
+/* computes the time spent in a single lua execution (in ms) */
+static inline uint32_t _hlua_time_burst(const struct hlua_timer *timer)
+{
+	uint32_t burst_ms;
+
+	/* wrapping is expected and properly
+	 * handled thanks to _hlua_time_ms() and burst_ms
+	 * being of the same type
+	 */
+	burst_ms = _hlua_time_ms() - timer->start;
+	return burst_ms;
+}
+
+static inline void hlua_timer_init(struct hlua_timer *timer, unsigned int max)
+{
+	timer->cumulative = 0;
+	timer->burst = 0;
+	timer->max = max;
+}
+
+/* reset the timer ctx between 2 yields */
+static inline void hlua_timer_reset(struct hlua_timer *timer)
+{
+	timer->cumulative += timer->burst;
+	timer->burst = 0;
+}
+
+/* start the timer right before a new execution */
+static inline void hlua_timer_start(struct hlua_timer *timer)
+{
+	timer->start = _hlua_time_ms();
+}
+
+/* update hlua timer when finishing an execution */
+static inline void hlua_timer_stop(struct hlua_timer *timer)
+{
+	timer->burst += _hlua_time_burst(timer);
+}
+
+/* check the timers for the current hlua context:
+ * - first check for the burst timeout (max execution time for the current
+ *   hlua resume, ie: time between effective yields)
+ * - then check for the cumulative yield timeout
+ *
+ * Returns 1 if the check succeeded and 0 if it failed
+ * (ie: timeout exceeded)
+ */
+static inline int hlua_timer_check(const struct hlua_timer *timer)
+{
+	uint32_t pburst = _hlua_time_burst(timer); /* pending burst time in ms */
+
+	if (hlua_timeout_burst && (timer->burst + pburst) > hlua_timeout_burst)
+		return 0; /* burst timeout exceeded */
+	if (timer->max && (timer->cumulative + timer->burst + pburst) > timer->max)
+		return 0; /* cumulative timeout exceeded */
+	return 1; /* ok */
+}
+
+/* Interrupts the Lua processing every "hlua_nb_instruction" instructions.
+ * It is used to prevent infinite loops.
+ *
+ * I tested this threshold with an infinite loop containing one increment
+ * and one test. Running this loop for 10 seconds reaches a ceiling of
+ * 710M loops with one interrupt every 9000 instructions, so I fixed the
+ * value to one interrupt every 10 000 instructions.
+ *
+ *  configured     | Number of
+ *  instructions   | loops executed
+ *  between two    | in millions
+ *  forced yields  |
+ * ----------------+---------------
+ *  10             | 160
+ *  500            | 670
+ *  1000           | 680
+ *  5000           | 700
+ *  7000           | 700
+ *  8000           | 700
+ *  9000           | 710 <- ceiling
+ *  10000          | 710
+ *  100000         | 710
+ *  1000000        | 710
+ *
+ */
+static unsigned int hlua_nb_instruction = 10000;
+
+/* Descriptor for the memory allocation state. The limit is pre-initialised to
+ * 0 until it is replaced by "tune.lua.maxmem" during the config parsing, or it
+ * is replaced with ~0 during post_init after everything was loaded. This way
+ * it is guaranteed that if the limit is ~0 the boot is complete, and that if
+ * it is zero the allocations are not limited yet and proper accounting is
+ * required.
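+ * In short:
+ *   limit == 0   : still booting, only account for allocated memory
+ *   limit == ~0  : boot complete, no "tune.lua.maxmem" set, no limit
+ *   other values : enforce the configured "tune.lua.maxmem" limit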
+ */ +struct hlua_mem_allocator { + size_t allocated; + size_t limit; +}; + +static struct hlua_mem_allocator hlua_global_allocator THREAD_ALIGNED(64); + +/* hlua event subscription */ +struct hlua_event_sub { + int fcn_ref; + int state_id; + struct hlua *hlua; + struct task *task; + event_hdl_async_equeue equeue; + struct event_hdl_sub *sub; + uint8_t paused; +}; + +/* This is the memory pool containing struct hlua_event_sub + * for event subscriptions from lua + */ +DECLARE_STATIC_POOL(pool_head_hlua_event_sub, "hlua_esub", sizeof(struct hlua_event_sub)); + +/* These functions converts types between HAProxy internal args or + * sample and LUA types. Another function permits to check if the + * LUA stack contains arguments according with an required ARG_T + * format. + */ +__LJMP static int hlua_arg2lua(lua_State *L, const struct arg *arg); +static int hlua_lua2arg(lua_State *L, int ud, struct arg *arg); +__LJMP static int hlua_lua2arg_check(lua_State *L, int first, struct arg *argp, + uint64_t mask, struct proxy *p); +__LJMP static int hlua_smp2lua(lua_State *L, struct sample *smp); +__LJMP static int hlua_smp2lua_str(lua_State *L, struct sample *smp); +static int hlua_lua2smp(lua_State *L, int ud, struct sample *smp); + +__LJMP static int hlua_http_get_headers(lua_State *L, struct http_msg *msg); + +struct prepend_path { + struct list l; + char *type; + char *path; +}; + +static struct list prepend_path_list = LIST_HEAD_INIT(prepend_path_list); + +#define SEND_ERR(__be, __fmt, __args...) \ + do { \ + send_log(__be, LOG_ERR, __fmt, ## __args); \ + if (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)) \ + ha_alert(__fmt, ## __args); \ + } while (0) + +static inline struct hlua_function *new_hlua_function() +{ + struct hlua_function *fcn; + int i; + + fcn = calloc(1, sizeof(*fcn)); + if (!fcn) + return NULL; + LIST_APPEND(&referenced_functions, &fcn->l); + for (i = 0; i < MAX_THREADS + 1; i++) + fcn->function_ref[i] = -1; + return fcn; +} + +static inline void release_hlua_function(struct hlua_function *fcn) +{ + if (!fcn) + return; + if (fcn->name) + ha_free(&fcn->name); + LIST_DELETE(&fcn->l); + ha_free(&fcn); +} + +/* If the common state is set, the stack id is 0, otherwise it is the tid + 1 */ +static inline int fcn_ref_to_stack_id(struct hlua_function *fcn) +{ + if (fcn->function_ref[0] == -1) + return tid + 1; + return 0; +} + +/* Create a new registered filter. Only its name is filled */ +static inline struct hlua_reg_filter *new_hlua_reg_filter(const char *name) +{ + struct hlua_reg_filter *reg_flt; + int i; + + reg_flt = calloc(1, sizeof(*reg_flt)); + if (!reg_flt) + return NULL; + reg_flt->name = strdup(name); + if (!reg_flt->name) { + free(reg_flt); + return NULL; + } + LIST_APPEND(&referenced_filters, ®_flt->l); + for (i = 0; i < MAX_THREADS + 1; i++) { + reg_flt->flt_ref[i] = -1; + reg_flt->fun_ref[i] = -1; + } + return reg_flt; +} + +/* Release a registered filter */ +static inline void release_hlua_reg_filter(struct hlua_reg_filter *reg_flt) +{ + if (!reg_flt) + return; + if (reg_flt->name) + ha_free(®_flt->name); + LIST_DELETE(®_flt->l); + ha_free(®_flt); +} + +/* If the common state is set, the stack id is 0, otherwise it is the tid + 1 */ +static inline int reg_flt_to_stack_id(struct hlua_reg_filter *reg_flt) +{ + if (reg_flt->fun_ref[0] == -1) + return tid + 1; + return 0; +} + +/* Used to check an Lua function type in the stack. It creates and + * returns a reference of the function. This function throws an + * error if the argument is not a "function". 
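+ * Typical binding-side usage (sketch):
+ *
+ *	ref = MAY_LJMP(hlua_checkfunction(L, 1));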
+ * When no longer used, the ref must be released with hlua_unref() + */ +__LJMP int hlua_checkfunction(lua_State *L, int argno) +{ + if (!lua_isfunction(L, argno)) { + const char *msg = lua_pushfstring(L, "function expected, got %s", luaL_typename(L, argno)); + WILL_LJMP(luaL_argerror(L, argno, msg)); + } + lua_pushvalue(L, argno); + return luaL_ref(L, LUA_REGISTRYINDEX); +} + +/* Used to check an Lua table type in the stack. It creates and + * returns a reference of the table. This function throws an + * error if the argument is not a "table". + * When no longer used, the ref must be released with hlua_unref() + */ +__LJMP int hlua_checktable(lua_State *L, int argno) +{ + if (!lua_istable(L, argno)) { + const char *msg = lua_pushfstring(L, "table expected, got %s", luaL_typename(L, argno)); + WILL_LJMP(luaL_argerror(L, argno, msg)); + } + lua_pushvalue(L, argno); + return luaL_ref(L, LUA_REGISTRYINDEX); +} + +/* Get a reference to the object that is at the top of the stack + * The referenced object will be popped from the stack + * + * The function returns the reference to the object which must + * be cleared using hlua_unref() when no longer used + */ +__LJMP int hlua_ref(lua_State *L) +{ + return MAY_LJMP(luaL_ref(L, LUA_REGISTRYINDEX)); +} + +/* Pushes a reference previously created using luaL_ref(L, LUA_REGISTRYINDEX) + * on <L> stack + * (ie: hlua_checkfunction(), hlua_checktable() or hlua_ref()) + * + * When the reference is no longer used, it should be released by calling + * hlua_unref() + * + * <L> can be from any co-routine as long as it belongs to the same lua + * parent state that the one used to get the reference. + */ +void hlua_pushref(lua_State *L, int ref) +{ + lua_rawgeti(L, LUA_REGISTRYINDEX, ref); +} + +/* Releases a reference previously created using luaL_ref(L, LUA_REGISTRYINDEX) + * (ie: hlua_checkfunction(), hlua_checktable() or hlua_ref()) + * + * This will allow the reference to be reused and the referred object + * to be garbage collected. + * + * <L> can be from any co-routine as long as it belongs to the same lua + * parent state that the one used to get the reference. + */ +void hlua_unref(lua_State *L, int ref) +{ + luaL_unref(L, LUA_REGISTRYINDEX, ref); +} + +__LJMP const char *hlua_traceback(lua_State *L, const char* sep) +{ + lua_Debug ar; + int level = 0; + struct buffer *msg = get_trash_chunk(); + + while (lua_getstack(L, level++, &ar)) { + /* Fill fields: + * 'S': fills in the fields source, short_src, linedefined, lastlinedefined, and what; + * 'l': fills in the field currentline; + * 'n': fills in the field name and namewhat; + * 't': fills in the field istailcall; + */ + lua_getinfo(L, "Slnt", &ar); + + /* skip these empty entries, usually they come from deep C functions */ + if (ar.currentline < 0 && *ar.what == 'C' && !*ar.namewhat && !ar.name) + continue; + + /* Add separator */ + if (b_data(msg)) + chunk_appendf(msg, "%s", sep); + + /* Append code localisation */ + if (ar.currentline > 0) + chunk_appendf(msg, "%s:%d: ", ar.short_src, ar.currentline); + else + chunk_appendf(msg, "%s: ", ar.short_src); + + /* + * Get function name + * + * if namewhat is no empty, name is defined. + * what contains "Lua" for Lua function, "C" for C function, + * or "main" for main code. + */ + if (*ar.namewhat != '\0' && ar.name != NULL) /* is there a name from code? 
*/ + chunk_appendf(msg, "in %s '%s'", ar.namewhat, ar.name); /* use it */ + + else if (*ar.what == 'm') /* "main", the code is not executed in a function */ + chunk_appendf(msg, "in main chunk"); + + else if (*ar.what != 'C') /* for Lua functions, use <file:line> */ + chunk_appendf(msg, "in function line %d", ar.linedefined); + + else /* nothing left... */ + chunk_appendf(msg, "?"); + + + /* Display tailed call */ + if (ar.istailcall) + chunk_appendf(msg, " ..."); + } + + return msg->area; +} + + +/* This function check the number of arguments available in the + * stack. If the number of arguments available is not the same + * then <nb> an error is thrown. + */ +__LJMP static inline void check_args(lua_State *L, int nb, char *fcn) +{ + if (lua_gettop(L) == nb) + return; + WILL_LJMP(luaL_error(L, "'%s' needs %d arguments", fcn, nb)); +} + +/* This function pushes an error string prefixed by the file name + * and the line number where the error is encountered. + */ +static int hlua_pusherror(lua_State *L, const char *fmt, ...) +{ + va_list argp; + va_start(argp, fmt); + luaL_where(L, 1); + lua_pushvfstring(L, fmt, argp); + va_end(argp); + lua_concat(L, 2); + return 1; +} + +/* This functions is used with sample fetch and converters. It + * converts the HAProxy configuration argument in a lua stack + * values. + * + * It takes an array of "arg", and each entry of the array is + * converted and pushed in the LUA stack. + */ +__LJMP static int hlua_arg2lua(lua_State *L, const struct arg *arg) +{ + switch (arg->type) { + case ARGT_SINT: + case ARGT_TIME: + case ARGT_SIZE: + lua_pushinteger(L, arg->data.sint); + break; + + case ARGT_STR: + lua_pushlstring(L, arg->data.str.area, arg->data.str.data); + break; + + case ARGT_IPV4: + case ARGT_IPV6: + case ARGT_MSK4: + case ARGT_MSK6: + case ARGT_FE: + case ARGT_BE: + case ARGT_TAB: + case ARGT_SRV: + case ARGT_USR: + case ARGT_MAP: + default: + lua_pushnil(L); + break; + } + return 1; +} + +/* This function take one entry in an LUA stack at the index "ud", + * and try to convert it in an HAProxy argument entry. This is useful + * with sample fetch wrappers. The input arguments are given to the + * lua wrapper and converted as arg list by the function. + * + * Note: although lua_tolstring() may raise a memory error according to + * lua documentation, in practise this could only happen when using to + * use lua_tolstring() on a number (lua will try to push the number as a + * string on the stack, and this may result in memory failure), so here we + * assume that hlua_lua2arg() will never raise an exception since it is + * exclusively used with lua string inputs. + * + * Note2: You should be extra careful when using <arg> argument, since + * string arguments rely on lua_tolstring() which returns a pointer to lua + * object that may be garbage collected at any time when removed from lua + * stack, thus you should make sure that <arg> is only used from a local + * scope within lua context (and not exported or stored in a lua-independent + * ctx) and that related lua object still exists when accessing arg data. 
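+ * (in practice this means: do not pop the string from the Lua stack, and
+ * do not let the coroutine be resumed or collected, while arg->data.str
+ * is still being dereferenced)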
+ * See: https://www.lua.org/manual/5.4/manual.html#4.1.3 + */ +static int hlua_lua2arg(lua_State *L, int ud, struct arg *arg) +{ + switch (lua_type(L, ud)) { + + case LUA_TNUMBER: + case LUA_TBOOLEAN: + arg->type = ARGT_SINT; + arg->data.sint = lua_tointeger(L, ud); + break; + + case LUA_TSTRING: + arg->type = ARGT_STR; + arg->data.str.area = (char *)lua_tolstring(L, ud, &arg->data.str.data); + /* We don't know the actual size of the underlying allocation, so be conservative. */ + arg->data.str.size = arg->data.str.data+1; /* count the terminating null byte */ + arg->data.str.head = 0; + break; + + case LUA_TUSERDATA: + case LUA_TNIL: + case LUA_TTABLE: + case LUA_TFUNCTION: + case LUA_TTHREAD: + case LUA_TLIGHTUSERDATA: + arg->type = ARGT_SINT; + arg->data.sint = 0; + break; + } + return 1; +} + +/* the following functions are used to convert a struct sample + * in Lua type. This useful to convert the return of the + * fetches or converters. + */ +__LJMP static int hlua_smp2lua(lua_State *L, struct sample *smp) +{ + switch (smp->data.type) { + case SMP_T_SINT: + case SMP_T_BOOL: + lua_pushinteger(L, smp->data.u.sint); + break; + + case SMP_T_BIN: + case SMP_T_STR: + lua_pushlstring(L, smp->data.u.str.area, smp->data.u.str.data); + break; + + case SMP_T_METH: + switch (smp->data.u.meth.meth) { + case HTTP_METH_OPTIONS: lua_pushstring(L, "OPTIONS"); break; + case HTTP_METH_GET: lua_pushstring(L, "GET"); break; + case HTTP_METH_HEAD: lua_pushstring(L, "HEAD"); break; + case HTTP_METH_POST: lua_pushstring(L, "POST"); break; + case HTTP_METH_PUT: lua_pushstring(L, "PUT"); break; + case HTTP_METH_DELETE: lua_pushstring(L, "DELETE"); break; + case HTTP_METH_TRACE: lua_pushstring(L, "TRACE"); break; + case HTTP_METH_CONNECT: lua_pushstring(L, "CONNECT"); break; + case HTTP_METH_OTHER: + lua_pushlstring(L, smp->data.u.meth.str.area, smp->data.u.meth.str.data); + break; + default: + lua_pushnil(L); + break; + } + break; + + case SMP_T_IPV4: + case SMP_T_IPV6: + case SMP_T_ADDR: /* This type is never used to qualify a sample. */ + if (sample_casts[smp->data.type][SMP_T_STR] && + sample_casts[smp->data.type][SMP_T_STR](smp)) + lua_pushlstring(L, smp->data.u.str.area, smp->data.u.str.data); + else + lua_pushnil(L); + break; + default: + lua_pushnil(L); + break; + } + return 1; +} + +/* the following functions are used to convert a struct sample + * in Lua strings. This is useful to convert the return of the + * fetches or converters. + */ +__LJMP static int hlua_smp2lua_str(lua_State *L, struct sample *smp) +{ + switch (smp->data.type) { + + case SMP_T_BIN: + case SMP_T_STR: + lua_pushlstring(L, smp->data.u.str.area, smp->data.u.str.data); + break; + + case SMP_T_METH: + switch (smp->data.u.meth.meth) { + case HTTP_METH_OPTIONS: lua_pushstring(L, "OPTIONS"); break; + case HTTP_METH_GET: lua_pushstring(L, "GET"); break; + case HTTP_METH_HEAD: lua_pushstring(L, "HEAD"); break; + case HTTP_METH_POST: lua_pushstring(L, "POST"); break; + case HTTP_METH_PUT: lua_pushstring(L, "PUT"); break; + case HTTP_METH_DELETE: lua_pushstring(L, "DELETE"); break; + case HTTP_METH_TRACE: lua_pushstring(L, "TRACE"); break; + case HTTP_METH_CONNECT: lua_pushstring(L, "CONNECT"); break; + case HTTP_METH_OTHER: + lua_pushlstring(L, smp->data.u.meth.str.area, smp->data.u.meth.str.data); + break; + default: + lua_pushstring(L, ""); + break; + } + break; + + case SMP_T_SINT: + case SMP_T_BOOL: + case SMP_T_IPV4: + case SMP_T_IPV6: + case SMP_T_ADDR: /* This type is never used to qualify a sample. 
*/ + if (sample_casts[smp->data.type][SMP_T_STR] && + sample_casts[smp->data.type][SMP_T_STR](smp)) + lua_pushlstring(L, smp->data.u.str.area, smp->data.u.str.data); + else + lua_pushstring(L, ""); + break; + default: + lua_pushstring(L, ""); + break; + } + return 1; +} + +/* The following function is used to convert a Lua type to a + * struct sample. This is useful to provide data from LUA code to + * a converter. + * + * Note: although lua_tolstring() may raise a memory error according to + * lua documentation, in practise this could only happen when using to + * use lua_tolstring() on a number (lua will try to push the number as a + * string on the stack, and this may result in memory failure), so here we + * assume that hlua_lua2arg() will never raise an exception since it is + * exclusively used with lua string inputs. + * + * Note2: You should be extra careful when using <smp> argument, since + * string arguments rely on lua_tolstring() which returns a pointer to lua + * object that may be garbage collected at any time when removed from lua + * stack, thus you should make sure that <smp> is only used from a local + * scope within lua context (not exported or stored in a lua-independent + * ctx) and that related lua object still exists when accessing arg data. + * See: https://www.lua.org/manual/5.4/manual.html#4.1.3 + * + * If you don't comply with this usage restriction, then you should consider + * duplicating the smp using smp_dup() to make it portable (little overhead), + * as this will ensure that the smp always points to valid memory block. + */ +static int hlua_lua2smp(lua_State *L, int ud, struct sample *smp) +{ + switch (lua_type(L, ud)) { + + case LUA_TNUMBER: + smp->data.type = SMP_T_SINT; + smp->data.u.sint = lua_tointeger(L, ud); + break; + + + case LUA_TBOOLEAN: + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = lua_toboolean(L, ud); + break; + + case LUA_TSTRING: + smp->data.type = SMP_T_STR; + smp->flags |= SMP_F_CONST; + smp->data.u.str.area = (char *)lua_tolstring(L, ud, &smp->data.u.str.data); + /* We don't know the actual size of the underlying allocation, so be conservative. */ + smp->data.u.str.size = smp->data.u.str.data+1; /* count the terminating null byte */ + smp->data.u.str.head = 0; + break; + + case LUA_TUSERDATA: + case LUA_TNIL: + case LUA_TTABLE: + case LUA_TFUNCTION: + case LUA_TTHREAD: + case LUA_TLIGHTUSERDATA: + case LUA_TNONE: + default: + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = 0; + break; + } + return 1; +} + +/* This function check the "argp" built by another conversion function + * is in accord with the expected argp defined by the "mask". The function + * returns true or false. It can be adjust the types if there compatibles. + * + * This function assumes that the argp argument contains ARGM_NBARGS + 1 + * entries and that there is at least one stop at the last position. + */ +__LJMP int hlua_lua2arg_check(lua_State *L, int first, struct arg *argp, + uint64_t mask, struct proxy *p) +{ + int min_arg; + int idx; + struct proxy *px; + struct userlist *ul; + struct my_regex *reg; + const char *msg = NULL; + char *sname, *pname, *err = NULL; + + idx = 0; + min_arg = ARGM(mask); + mask >>= ARGM_BITS; + + while (1) { + struct buffer tmp = BUF_NULL; + + /* Check for mandatory arguments. */ + if (argp[idx].type == ARGT_STOP) { + if (idx < min_arg) { + + /* If miss other argument than the first one, we return an error. 
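+				 * (only the very first argument may be
+				 * defaulted below; any other missing
+				 * mandatory argument is fatal)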
*/ + if (idx > 0) { + msg = "Mandatory argument expected"; + goto error; + } + + /* If first argument have a certain type, some default values + * may be used. See the function smp_resolve_args(). + */ + switch (mask & ARGT_MASK) { + + case ARGT_FE: + if (!(p->cap & PR_CAP_FE)) { + msg = "Mandatory argument expected"; + goto error; + } + argp[idx].data.prx = p; + argp[idx].type = ARGT_FE; + argp[idx+1].type = ARGT_STOP; + break; + + case ARGT_BE: + if (!(p->cap & PR_CAP_BE)) { + msg = "Mandatory argument expected"; + goto error; + } + argp[idx].data.prx = p; + argp[idx].type = ARGT_BE; + argp[idx+1].type = ARGT_STOP; + break; + + case ARGT_TAB: + if (!p->table) { + msg = "Mandatory argument expected"; + goto error; + } + argp[idx].data.t = p->table; + argp[idx].type = ARGT_TAB; + argp[idx+1].type = ARGT_STOP; + break; + + default: + msg = "Mandatory argument expected"; + goto error; + break; + } + } + break; + } + + /* Check for exceed the number of required argument. */ + if ((mask & ARGT_MASK) == ARGT_STOP && + argp[idx].type != ARGT_STOP) { + msg = "Last argument expected"; + goto error; + } + + if ((mask & ARGT_MASK) == ARGT_STOP && + argp[idx].type == ARGT_STOP) { + break; + } + + /* Convert some argument types. All string in argp[] are for not + * duplicated yet. + */ + switch (mask & ARGT_MASK) { + case ARGT_SINT: + if (argp[idx].type != ARGT_SINT) { + msg = "integer expected"; + goto error; + } + argp[idx].type = ARGT_SINT; + break; + + case ARGT_TIME: + if (argp[idx].type != ARGT_SINT) { + msg = "integer expected"; + goto error; + } + argp[idx].type = ARGT_TIME; + break; + + case ARGT_SIZE: + if (argp[idx].type != ARGT_SINT) { + msg = "integer expected"; + goto error; + } + argp[idx].type = ARGT_SIZE; + break; + + case ARGT_FE: + if (argp[idx].type != ARGT_STR) { + msg = "string expected"; + goto error; + } + argp[idx].data.prx = proxy_fe_by_name(argp[idx].data.str.area); + if (!argp[idx].data.prx) { + msg = "frontend doesn't exist"; + goto error; + } + argp[idx].type = ARGT_FE; + break; + + case ARGT_BE: + if (argp[idx].type != ARGT_STR) { + msg = "string expected"; + goto error; + } + argp[idx].data.prx = proxy_be_by_name(argp[idx].data.str.area); + if (!argp[idx].data.prx) { + msg = "backend doesn't exist"; + goto error; + } + argp[idx].type = ARGT_BE; + break; + + case ARGT_TAB: + if (argp[idx].type != ARGT_STR) { + msg = "string expected"; + goto error; + } + argp[idx].data.t = stktable_find_by_name(argp[idx].data.str.area); + if (!argp[idx].data.t) { + msg = "table doesn't exist"; + goto error; + } + argp[idx].type = ARGT_TAB; + break; + + case ARGT_SRV: + if (argp[idx].type != ARGT_STR) { + msg = "string expected"; + goto error; + } + sname = strrchr(argp[idx].data.str.area, '/'); + if (sname) { + *sname++ = '\0'; + pname = argp[idx].data.str.area; + px = proxy_be_by_name(pname); + if (!px) { + msg = "backend doesn't exist"; + goto error; + } + } + else { + sname = argp[idx].data.str.area; + px = p; + } + argp[idx].data.srv = findserver(px, sname); + if (!argp[idx].data.srv) { + msg = "server doesn't exist"; + goto error; + } + argp[idx].type = ARGT_SRV; + break; + + case ARGT_IPV4: + if (argp[idx].type != ARGT_STR) { + msg = "string expected"; + goto error; + } + if (inet_pton(AF_INET, argp[idx].data.str.area, &argp[idx].data.ipv4)) { + msg = "invalid IPv4 address"; + goto error; + } + argp[idx].type = ARGT_IPV4; + break; + + case ARGT_MSK4: + if (argp[idx].type == ARGT_SINT) + len2mask4(argp[idx].data.sint, &argp[idx].data.ipv4); + else if (argp[idx].type == ARGT_STR) { + 
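/* string form: let str2mask() parse it
+					 * (e.g. "255.255.255.0") */
+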
if (!str2mask(argp[idx].data.str.area, &argp[idx].data.ipv4)) { + msg = "invalid IPv4 mask"; + goto error; + } + } + else { + msg = "integer or string expected"; + goto error; + } + argp[idx].type = ARGT_MSK4; + break; + + case ARGT_IPV6: + if (argp[idx].type != ARGT_STR) { + msg = "string expected"; + goto error; + } + if (inet_pton(AF_INET6, argp[idx].data.str.area, &argp[idx].data.ipv6)) { + msg = "invalid IPv6 address"; + goto error; + } + argp[idx].type = ARGT_IPV6; + break; + + case ARGT_MSK6: + if (argp[idx].type == ARGT_SINT) + len2mask6(argp[idx].data.sint, &argp[idx].data.ipv6); + else if (argp[idx].type == ARGT_STR) { + if (!str2mask6(argp[idx].data.str.area, &argp[idx].data.ipv6)) { + msg = "invalid IPv6 mask"; + goto error; + } + } + else { + msg = "integer or string expected"; + goto error; + } + argp[idx].type = ARGT_MSK6; + break; + + case ARGT_REG: + if (argp[idx].type != ARGT_STR) { + msg = "string expected"; + goto error; + } + reg = regex_comp(argp[idx].data.str.area, !(argp[idx].type_flags & ARGF_REG_ICASE), 1, &err); + if (!reg) { + msg = lua_pushfstring(L, "error compiling regex '%s' : '%s'", + argp[idx].data.str.area, err); + free(err); + goto error; + } + argp[idx].type = ARGT_REG; + argp[idx].data.reg = reg; + break; + + case ARGT_USR: + if (argp[idx].type != ARGT_STR) { + msg = "string expected"; + goto error; + } + if (p->uri_auth && p->uri_auth->userlist && + strcmp(p->uri_auth->userlist->name, argp[idx].data.str.area) == 0) + ul = p->uri_auth->userlist; + else + ul = auth_find_userlist(argp[idx].data.str.area); + + if (!ul) { + msg = lua_pushfstring(L, "unable to find userlist '%s'", argp[idx].data.str.area); + goto error; + } + argp[idx].type = ARGT_USR; + argp[idx].data.usr = ul; + break; + + case ARGT_STR: + if (!chunk_dup(&tmp, &argp[idx].data.str)) { + msg = "unable to duplicate string arg"; + goto error; + } + argp[idx].data.str = tmp; + break; + + case ARGT_MAP: + msg = "type not yet supported"; + goto error; + break; + + } + + /* Check for type of argument. */ + if ((mask & ARGT_MASK) != argp[idx].type) { + msg = lua_pushfstring(L, "'%s' expected, got '%s'", + arg_type_names[(mask & ARGT_MASK)], + arg_type_names[argp[idx].type & ARGT_MASK]); + goto error; + } + + /* Next argument. */ + mask >>= ARGT_BITS; + idx++; + } + return 0; + + error: + argp[idx].type = ARGT_STOP; + free_args(argp); + WILL_LJMP(luaL_argerror(L, first + idx, msg)); + return 0; /* Never reached */ +} + +/* + * The following functions are used to make correspondence between the the + * executed lua pointer and the "struct hlua *" that contain the context. + * + * - hlua_gethlua : return the hlua context associated with an lua_State. + * - hlua_sethlua : create the association between hlua context and lua_state. + */ +inline struct hlua *hlua_gethlua(lua_State *L) +{ + struct hlua **hlua = lua_getextraspace(L); + return *hlua; +} +static inline void hlua_sethlua(struct hlua *hlua) +{ + struct hlua **hlua_store = lua_getextraspace(hlua->T); + *hlua_store = hlua; +} + +/* Will return a non-NULL string indicating the Lua call trace if the caller + * currently is executing from within a Lua function. One line per entry will + * be emitted, and each extra line will be prefixed with <pfx>. If a current + * Lua function is not detected, NULL is returned. + */ +const char *hlua_show_current_location(const char *pfx) +{ + lua_State *L; + lua_Debug ar; + + /* global or per-thread stack initializing ? 
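+	 * (hlua_state_id designates the Lua state currently being
+	 * initialized, if any)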
*/ + if (hlua_state_id != -1 && (L = hlua_states[hlua_state_id]) && lua_getstack(L, 0, &ar)) + return hlua_traceback(L, pfx); + + /* per-thread stack running ? */ + if (hlua_states[tid + 1] && (L = hlua_states[tid + 1]) && lua_getstack(L, 0, &ar)) + return hlua_traceback(L, pfx); + + /* global stack running ? */ + if (hlua_states[0] && (L = hlua_states[0]) && lua_getstack(L, 0, &ar)) + return hlua_traceback(L, pfx); + + return NULL; +} + +/* This function is used to send logs. It tries to send them to: + * - the log target applicable in the current context, OR + * - stderr when no logger is in use for the current context + */ +static inline void hlua_sendlog(struct proxy *px, int level, const char *msg) +{ + struct tm tm; + char *p; + + /* Cleanup the log message. */ + p = trash.area; + for (; *msg != '\0'; msg++, p++) { + if (p >= trash.area + trash.size - 1) { + /* Break the message if exceed the buffer size. */ + *(p-4) = ' '; + *(p-3) = '.'; + *(p-2) = '.'; + *(p-1) = '.'; + break; + } + if (isprint((unsigned char)*msg)) + *p = *msg; + else + *p = '.'; + } + *p = '\0'; + + if (hlua_log_opts & HLUA_LOG_LOGGERS_ON) + send_log(px, level, "%s\n", trash.area); + + if (!(global.mode & MODE_QUIET) || (global.mode & (MODE_VERBOSE | MODE_STARTING))) { + if (!(hlua_log_opts & HLUA_LOG_STDERR_MASK)) + return; + + /* when logging via stderr is set to 'auto', it behaves like 'off' unless one of: + * - logging via loggers is disabled + * - this is a non-proxy context and there is no global logger configured + * - this is a proxy context and the proxy has no logger configured + */ + if ((hlua_log_opts & (HLUA_LOG_STDERR_MASK | HLUA_LOG_LOGGERS_ON)) == (HLUA_LOG_STDERR_AUTO | HLUA_LOG_LOGGERS_ON)) { + /* AUTO=OFF in non-proxy context only if at least one global logger is defined */ + if ((px == NULL) && (!LIST_ISEMPTY(&global.loggers))) + return; + + /* AUTO=OFF in proxy context only if at least one logger is configured for the proxy */ + if ((px != NULL) && (!LIST_ISEMPTY(&px->loggers))) + return; + } + + if (level == LOG_DEBUG && !(global.mode & MODE_DEBUG)) + return; + + get_localtime(date.tv_sec, &tm); + fprintf(stderr, "[%s] %03d/%02d%02d%02d (%d) : %s\n", + log_levels[level], tm.tm_yday, tm.tm_hour, tm.tm_min, tm.tm_sec, + (int)getpid(), trash.area); + fflush(stderr); + } +} + +/* This function just ensure that the yield will be always + * returned with a timeout and permit to set some flags + * <timeout> is a tick value + */ +__LJMP void hlua_yieldk(lua_State *L, int nresults, lua_KContext ctx, + lua_KFunction k, int timeout, unsigned int flags) +{ + struct hlua *hlua; + + /* Get hlua struct, or NULL if we execute from main lua state */ + hlua = hlua_gethlua(L); + if (!hlua) { + return; + } + + /* Set the wake timeout. If timeout is required, we set + * the expiration time. + */ + hlua->wake_time = timeout; + + hlua->flags |= flags; + + /* Process the yield. */ + MAY_LJMP(lua_yieldk(L, nresults, ctx, k)); +} + +/* This function initialises the Lua environment stored in the stream. + * It must be called at the start of the stream. This function creates + * an LUA coroutine. It can not be use to crete the main LUA context. + * + * This function is particular. it initialises a new Lua thread. If the + * initialisation fails (example: out of memory error), the lua function + * throws an error (longjmp). + * + * This function manipulates two Lua stacks: the main and the thread. Only + * the main stack can fail. The thread is not manipulated. 
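+ *
+ * A typical creation sequence looks like this (sketch; error handling
+ * elided):
+ *
+ *	hlua = pool_alloc(pool_head_hlua);
+ *	if (!hlua || !hlua_ctx_init(hlua, state_id, task))
+ *		goto error;
+ *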
+ * This function MUST NOT manipulate the created thread stack state,
+ * because it is not protected against errors thrown by the thread
+ * stack.
+ */
+int hlua_ctx_init(struct hlua *lua, int state_id, struct task *task)
+{
+	lua->Mref = LUA_REFNIL;
+	lua->flags = 0;
+	lua->gc_count = 0;
+	lua->wake_time = TICK_ETERNITY;
+	lua->state_id = state_id;
+	hlua_timer_init(&lua->timer, 0); /* default value, no timeout */
+	LIST_INIT(&lua->com);
+	MT_LIST_INIT(&lua->hc_list);
+	if (!SET_SAFE_LJMP_PARENT(lua)) {
+		lua->Tref = LUA_REFNIL;
+		return 0;
+	}
+	lua->T = lua_newthread(hlua_states[state_id]);
+	if (!lua->T) {
+		lua->Tref = LUA_REFNIL;
+		RESET_SAFE_LJMP_PARENT(lua);
+		return 0;
+	}
+	hlua_sethlua(lua);
+	lua->Tref = luaL_ref(hlua_states[state_id], LUA_REGISTRYINDEX);
+	lua->task = task;
+	RESET_SAFE_LJMP_PARENT(lua);
+	return 1;
+}
+
+/* Kill all httpclients associated with this hlua task.
+ * We must take extra precautions as we're manipulating lua-exposed
+ * objects without the main lua lock.
+ */
+static void hlua_httpclient_destroy_all(struct hlua *hlua)
+{
+	struct hlua_httpclient *hlua_hc;
+
+	/* use thread-safe accessors for hc_list since a GC cycle initiated by
+	 * another thread sharing the same main lua stack (lua coroutine)
+	 * could execute hlua_httpclient_gc() on the hlua->hc_list items
+	 * in parallel: the Lua GC applies to the main stack, it is not limited
+	 * to a single coroutine stack, see Github issue #2037 for reference.
+	 * Remember, coroutines created using lua_newthread() are not meant to
+	 * be thread safe in Lua. (From a lua co-author:
+	 * http://lua-users.org/lists/lua-l/2011-07/msg00072.html)
+	 *
+	 * This safety measure is superfluous when 'lua-load-per-thread' is used
+	 * since in this case coroutines exclusively run on the same thread
+	 * (the main stack is not shared between OS threads).
+	 */
+	while ((hlua_hc = MT_LIST_POP(&hlua->hc_list, typeof(hlua_hc), by_hlua))) {
+		httpclient_stop_and_destroy(hlua_hc->hc);
+		hlua_hc->hc = NULL;
+	}
+}
+
+
+/* Used to destroy the Lua coroutine when the attached stream or task
+ * is destroyed. It also destroys the memory context. The struct "lua"
+ * itself will be freed.
+ */
+void hlua_ctx_destroy(struct hlua *lua)
+{
+	if (!lua)
+		return;
+
+	if (!lua->T)
+		goto end;
+
+	/* clean all running httpclients */
+	hlua_httpclient_destroy_all(lua);
+
+	/* Purge all the pending signals. */
+	notification_purge(&lua->com);
+
+	if (!SET_SAFE_LJMP(lua))
+		return;
+	luaL_unref(lua->T, LUA_REGISTRYINDEX, lua->Mref);
+	RESET_SAFE_LJMP(lua);
+
+	if (!SET_SAFE_LJMP_PARENT(lua))
+		return;
+	luaL_unref(hlua_states[lua->state_id], LUA_REGISTRYINDEX, lua->Tref);
+	RESET_SAFE_LJMP_PARENT(lua);
+	/* Force a garbage collection pass. If the Lua program finished
+	 * without error, we run the GC on the thread pointer; this frees
+	 * all the unused memory.
+	 * If the thread finished with an error or is currently yielded,
+	 * it seems that a GC applied to the thread doesn't clean anything,
+	 * so we run the GC on the main thread instead.
+	 * NOTE: this action may lock all the Lua threads until the end of
+	 * the garbage collection.
+	 */
+	if (lua->gc_count) {
+		if (!SET_SAFE_LJMP_PARENT(lua))
+			return;
+		lua_gc(hlua_states[lua->state_id], LUA_GCCOLLECT, 0);
+		RESET_SAFE_LJMP_PARENT(lua);
+	}
+
+	lua->T = NULL;
+
+end:
+	pool_free(pool_head_hlua, lua);
+}
+
+/* This function is used to restore the Lua context when a coroutine
+ * fails. It copies the common memory between the old coroutine and the
+ * new one.
The old coroutine is destroyed, and its + * replaced by the new coroutine. + * If the flag "keep_msg" is set, the last entry of the old is assumed + * as string error message and it is copied in the new stack. + */ +static int hlua_ctx_renew(struct hlua *lua, int keep_msg) +{ + lua_State *T; + int new_ref; + + /* New Lua coroutine. */ + T = lua_newthread(hlua_states[lua->state_id]); + if (!T) + return 0; + + /* Copy last error message. */ + if (keep_msg) + lua_xmove(lua->T, T, 1); + + /* Copy data between the coroutines. */ + lua_rawgeti(lua->T, LUA_REGISTRYINDEX, lua->Mref); + lua_xmove(lua->T, T, 1); + new_ref = luaL_ref(T, LUA_REGISTRYINDEX); /* Value popped. */ + + /* Destroy old data. */ + luaL_unref(lua->T, LUA_REGISTRYINDEX, lua->Mref); + + /* The thread is garbage collected by Lua. */ + luaL_unref(hlua_states[lua->state_id], LUA_REGISTRYINDEX, lua->Tref); + + /* Fill the struct with the new coroutine values. */ + lua->Mref = new_ref; + lua->T = T; + lua->Tref = luaL_ref(hlua_states[lua->state_id], LUA_REGISTRYINDEX); + + /* Set context. */ + hlua_sethlua(lua); + + return 1; +} + +void hlua_hook(lua_State *L, lua_Debug *ar) +{ + struct hlua *hlua; + + /* Get hlua struct, or NULL if we execute from main lua state */ + hlua = hlua_gethlua(L); + if (!hlua) + return; + + if (hlua->T != L) { + /* We don't want to enforce a yield on a sub coroutine, since + * we have no guarantees that the yield will be handled properly. + * Indeed, only the hlua->T coroutine is being handled through + * hlua_ctx_resume() function. + * + * Instead, we simply check for timeouts and wait for the sub + * coroutine to finish.. + */ + goto check_timeout; + } + + /* Lua cannot yield when its returning from a function, + * so, we can fix the interrupt hook to 1 instruction, + * expecting that the function is finished. + */ + if (lua_gethookmask(L) & LUA_MASKRET) { + lua_sethook(hlua->T, hlua_hook, LUA_MASKCOUNT, 1); + return; + } + + /* If we interrupt the Lua processing in yieldable state, we yield. + * If the state is not yieldable, trying yield causes an error. + */ + if (lua_isyieldable(L)) { + /* note: for converters/fetches.. where yielding is not allowed + * hlua_ctx_resume() will simply perform a goto resume_execution + * instead of rescheduling hlua->task. + * also: hlua_ctx_resume() will take care of checking execution + * timeout and re-applying the hook as needed. + */ + MAY_LJMP(hlua_yieldk(L, 0, 0, NULL, TICK_ETERNITY, HLUA_CTRLYIELD)); + /* lua docs says that the hook should return immediately after lua_yieldk + * + * From: https://www.lua.org/manual/5.3/manual.html#lua_yieldk + * + * Moreover, it seems that we don't want to continue after the yield + * because the end of the function is about handling unyieldable function, + * which is not the case here. + * + * ->if we don't return lua_sethook gets incorrectly set with MASKRET later + * in the function. + */ + return; + } + + check_timeout: + /* If we cannot yield, check the timeout. */ + if (!hlua_timer_check(&hlua->timer)) { + lua_pushfstring(L, "execution timeout"); + WILL_LJMP(lua_error(L)); + } + + /* Try to interrupt the process at the end of the current + * unyieldable function. + */ + lua_sethook(hlua->T, hlua_hook, LUA_MASKRET|LUA_MASKCOUNT, hlua_nb_instruction); +} + +/* This function start or resumes the Lua stack execution. If the flag + * "yield_allowed" if no set and the LUA stack execution returns a yield + * The function return an error. 
 *
 * The function can return 4 values:
 *  - HLUA_E_OK     : The execution terminated without any errors.
 *  - HLUA_E_AGAIN  : The execution must continue at the next associated
 *                    task wakeup.
 *  - HLUA_E_ERRMSG : An error has occurred, and an error message is set at
 *                    the top of the stack.
 *  - HLUA_E_ERR    : An error has occurred without an error message.
 *
 * If an error occurred, the stack is renewed and it is ready to run new
 * Lua code.
 */
static enum hlua_exec hlua_ctx_resume(struct hlua *lua, int yield_allowed)
{
#if defined(LUA_VERSION_NUM) && LUA_VERSION_NUM >= 504
	int nres;
#endif
	int ret;
	const char *msg;
	const char *trace;

	/* Lock the whole Lua execution. This lock must be taken before the
	 * label "resume_execution".
	 */
	hlua_lock(lua);

	/* Reset the timer as we might be re-entering the function to
	 * resume the coroutine after a successful yield
	 * (cumulative time will be updated).
	 */
	hlua_timer_reset(&lua->timer);

resume_execution:

	/* This hook interrupts the Lua processing every 'hlua_nb_instruction'
	 * instructions. It is used to prevent infinite loops.
	 */
	lua_sethook(lua->T, hlua_hook, LUA_MASKCOUNT, hlua_nb_instruction);

	/* Remove all flags except the running flags. */
	HLUA_SET_RUN(lua);
	HLUA_CLR_CTRLYIELD(lua);
	HLUA_CLR_WAKERESWR(lua);
	HLUA_CLR_WAKEREQWR(lua);
	HLUA_CLR_NOYIELD(lua);
	if (!yield_allowed)
		HLUA_SET_NOYIELD(lua);

	/* Reset wake_time. */
	lua->wake_time = TICK_ETERNITY;

	/* Start the timer as we're about to start lua processing. */
	hlua_timer_start(&lua->timer);

	/* Call the function. */
#if defined(LUA_VERSION_NUM) && LUA_VERSION_NUM >= 504
	ret = lua_resume(lua->T, hlua_states[lua->state_id], lua->nargs, &nres);
#else
	ret = lua_resume(lua->T, hlua_states[lua->state_id], lua->nargs);
#endif

	/* Out of lua processing, stop the timer. */
	hlua_timer_stop(&lua->timer);

	/* Reset nargs because those possibly passed to the lua_resume() call
	 * were already consumed, and since we may call lua_resume() again
	 * after a successful yield, we don't want to pass a stale nargs hint
	 * to the Lua API. As such, nargs should be set explicitly before each
	 * lua_resume() (or hlua_ctx_resume()) invocation if needed.
	 */
	lua->nargs = 0;

	switch (ret) {

	case LUA_OK:
		ret = HLUA_E_OK;
		break;

	case LUA_YIELD:
		/* Check if the execution timeout has expired. If it is the case, we
		 * break the Lua execution.
		 */
		if (!hlua_timer_check(&lua->timer)) {
			lua_settop(lua->T, 0); /* Empty the stack. */
			ret = HLUA_E_ETMOUT;
			break;
		}
		/* Process the forced yield. If the general yield is not allowed, or
		 * if no task is associated with the current Lua execution coroutine,
		 * we resume the execution immediately. Otherwise we want to return
		 * to the scheduler and be woken up again to continue the current
		 * Lua execution, so we schedule our own task.
		 */
		if (HLUA_IS_CTRLYIELDING(lua)) {
			if (!yield_allowed || !lua->task)
				goto resume_execution;
			task_wakeup(lua->task, TASK_WOKEN_MSG);
		}
		if (!yield_allowed) {
			lua_settop(lua->T, 0); /* Empty the stack. */
			ret = HLUA_E_YIELD;
			break;
		}
		ret = HLUA_E_AGAIN;
		break;

	case LUA_ERRRUN:

		/* Special exit case. The traditional exit is returned as an error
		 * because errors are the only way to return immediately from
		 * a Lua execution.
		 */
		if (lua->flags & HLUA_EXIT) {
			ret = HLUA_E_OK;
			hlua_ctx_renew(lua, 1);
			break;
		}

		lua->wake_time = TICK_ETERNITY;
		if (!lua_checkstack(lua->T, 1)) {
			ret = HLUA_E_ERR;
			break;
		}
		msg = lua_tostring(lua->T, -1);
		lua_settop(lua->T, 0); /* Empty the stack. */
		trace = hlua_traceback(lua->T, ", ");
		if (msg)
			lua_pushfstring(lua->T, "[state-id %d] runtime error: %s from %s", lua->state_id, msg, trace);
		else
			lua_pushfstring(lua->T, "[state-id %d] unknown runtime error from %s", lua->state_id, trace);
		ret = HLUA_E_ERRMSG;
		break;

	case LUA_ERRMEM:
		lua->wake_time = TICK_ETERNITY;
		lua_settop(lua->T, 0); /* Empty the stack. */
		ret = HLUA_E_NOMEM;
		break;

	case LUA_ERRERR:
		lua->wake_time = TICK_ETERNITY;
		if (!lua_checkstack(lua->T, 1)) {
			ret = HLUA_E_ERR;
			break;
		}
		msg = lua_tostring(lua->T, -1);
		lua_settop(lua->T, 0); /* Empty the stack. */
		if (msg)
			lua_pushfstring(lua->T, "[state-id %d] message handler error: %s", lua->state_id, msg);
		else
			lua_pushfstring(lua->T, "[state-id %d] message handler error", lua->state_id);
		ret = HLUA_E_ERRMSG;
		break;

	default:
		lua->wake_time = TICK_ETERNITY;
		lua_settop(lua->T, 0); /* Empty the stack. */
		ret = HLUA_E_ERR;
		break;
	}

	switch (ret) {
	case HLUA_E_AGAIN:
		break;

	case HLUA_E_ERRMSG:
		notification_purge(&lua->com);
		hlua_ctx_renew(lua, 1);
		HLUA_CLR_RUN(lua);
		break;

	case HLUA_E_ETMOUT:
	case HLUA_E_NOMEM:
	case HLUA_E_YIELD:
	case HLUA_E_ERR:
		HLUA_CLR_RUN(lua);
		notification_purge(&lua->com);
		hlua_ctx_renew(lua, 0);
		break;

	case HLUA_E_OK:
		HLUA_CLR_RUN(lua);
		notification_purge(&lua->com);
		break;
	}

	/* This is the main exit point; remove the Lua lock. */
	hlua_unlock(lua);

	return ret;
}

/* This function exits the current Lua execution. */
__LJMP static int hlua_done(lua_State *L)
{
	struct hlua *hlua;

	/* Get the hlua struct, or NULL if we execute from the main lua state */
	hlua = hlua_gethlua(L);
	if (!hlua)
		return 0;

	hlua->flags |= HLUA_EXIT;
	WILL_LJMP(lua_error(L));

	return 0;
}

/* This function is a Lua binding. It provides a function
 * for deleting an ACL entry from a referenced ACL file.
 */
__LJMP static int hlua_del_acl(lua_State *L)
{
	const char *name;
	const char *key;
	struct pat_ref *ref;

	MAY_LJMP(check_args(L, 2, "del_acl"));

	name = MAY_LJMP(luaL_checkstring(L, 1));
	key = MAY_LJMP(luaL_checkstring(L, 2));

	ref = pat_ref_lookup(name);
	if (!ref)
		WILL_LJMP(luaL_error(L, "'del_acl': unknown acl file '%s'", name));

	HA_RWLOCK_WRLOCK(PATREF_LOCK, &ref->lock);
	pat_ref_delete(ref, key);
	HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ref->lock);
	return 0;
}

/* This function is a Lua binding. It provides a function
 * for deleting a map entry from a referenced map file.
 */
static int hlua_del_map(lua_State *L)
{
	const char *name;
	const char *key;
	struct pat_ref *ref;

	MAY_LJMP(check_args(L, 2, "del_map"));

	name = MAY_LJMP(luaL_checkstring(L, 1));
	key = MAY_LJMP(luaL_checkstring(L, 2));

	ref = pat_ref_lookup(name);
	if (!ref)
		WILL_LJMP(luaL_error(L, "'del_map': unknown map file '%s'", name));

	HA_RWLOCK_WRLOCK(PATREF_LOCK, &ref->lock);
	pat_ref_delete(ref, key);
	HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ref->lock);
	return 0;
}

/* This function is a Lua binding. It provides a function
 * for adding an ACL pattern to a referenced ACL file.
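 *
 * As a brief usage sketch from the Lua side (the ACL file name
 * "blocked.lst" below is illustrative; it must be referenced by the
 * running configuration):
 *
 *   core.register_action("ban-src", { "http-req" }, function(txn)
 *       core.add_acl("blocked.lst", txn.f:src())    -- add the client address
 *       -- core.del_acl("blocked.lst", "10.0.0.1")  -- remove an entry
 *   end)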
 */
static int hlua_add_acl(lua_State *L)
{
	const char *name;
	const char *key;
	struct pat_ref *ref;

	MAY_LJMP(check_args(L, 2, "add_acl"));

	name = MAY_LJMP(luaL_checkstring(L, 1));
	key = MAY_LJMP(luaL_checkstring(L, 2));

	ref = pat_ref_lookup(name);
	if (!ref)
		WILL_LJMP(luaL_error(L, "'add_acl': unknown acl file '%s'", name));

	HA_RWLOCK_WRLOCK(PATREF_LOCK, &ref->lock);
	if (pat_ref_find_elt(ref, key) == NULL)
		pat_ref_add(ref, key, NULL, NULL);
	HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ref->lock);
	return 0;
}

/* This function is a Lua binding. It provides a function
 * for setting a map pattern and its associated sample in a
 * referenced map file.
 */
static int hlua_set_map(lua_State *L)
{
	const char *name;
	const char *key;
	const char *value;
	struct pat_ref *ref;

	MAY_LJMP(check_args(L, 3, "set_map"));

	name = MAY_LJMP(luaL_checkstring(L, 1));
	key = MAY_LJMP(luaL_checkstring(L, 2));
	value = MAY_LJMP(luaL_checkstring(L, 3));

	ref = pat_ref_lookup(name);
	if (!ref)
		WILL_LJMP(luaL_error(L, "'set_map': unknown map file '%s'", name));

	HA_RWLOCK_WRLOCK(PATREF_LOCK, &ref->lock);
	if (pat_ref_find_elt(ref, key) != NULL)
		pat_ref_set(ref, key, value, NULL, NULL);
	else
		pat_ref_add(ref, key, value, NULL);
	HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ref->lock);
	return 0;
}

/* This function is a Lua binding. It provides a function
 * for retrieving a variable from the proc scope in core.
 */
static int hlua_core_get_var(lua_State *L)
{
	const char *name;
	size_t len;
	struct sample smp;

	MAY_LJMP(check_args(L, 1, "get_var"));

	name = MAY_LJMP(luaL_checklstring(L, 1, &len));

	/* We can only retrieve information from the proc. scope */
	/* FIXME: I didn't want to expose vars_hash_name from vars.c */
	if (len < 5 || strncmp(name, "proc.", 5) != 0)
		WILL_LJMP(luaL_error(L, "'get_var': Only 'proc.' scope allowed to be retrieved in 'core.get_var()'."));

	memset(&smp, 0, sizeof(smp));
	if (!vars_get_by_name(name, len, &smp, NULL)) {
		lua_pushnil(L);
		return 1;
	}

	return MAY_LJMP(hlua_smp2lua(L, &smp));
}

/* This function disables the sending of emails through the
 * legacy email alerting function, which is implemented using
 * checks.
 *
 * It may not be used during runtime.
 */
__LJMP static int hlua_disable_legacy_mailers(lua_State *L)
{
	if (hlua_gethlua(L))
		WILL_LJMP(luaL_error(L, "disable_legacy_mailers: "
		                        "not available outside of init or body context"));
	send_email_disabled = 1;
	return 0;
}

/* A class is a chunk of memory that contains data. This data can be a table,
 * an integer or userdata. This data is associated with a metatable. This
 * metatable has an original version registered in the global context under
 * the name of the object (_G[<name>] = <metatable>).
 *
 * A metatable is a table that modifies the standard behavior of accesses
 * to the associated data. The entries of this new metatable are
 * defined as follows:
 *
 * http://lua-users.org/wiki/MetatableEvents
 *
 * __index
 *
 * When we access an absent field in a table, the result is nil. This is
 * true, but it is not the whole truth. Actually, such access triggers
 * the interpreter to look for an __index metamethod: If there is no
 * such method, as usually happens, then the access results in nil;
 * otherwise, the metamethod will provide the result.
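 *
 * A minimal illustration in plain Lua (not HAProxy-specific):
 *
 *   local t = setmetatable({}, { __index = function(tbl, key)
 *       return "default:" .. key
 *   end })
 *   print(t.foo)   --> "default:foo"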
 *
 * Control 'prototype' inheritance. When accessing "myTable[key]" and
 * the key does not appear in the table, but the metatable has an __index
 * property:
 *
 * - if the value is a function, the function is called, passing in the
 *   table and the key; the return value of that function is returned as
 *   the result.
 *
 * - if the value is another table, the value of the key in that table is
 *   asked for and returned (and if it doesn't exist in that table, but that
 *   table's metatable has an __index property, then it continues on up)
 *
 * - Use "rawget(myTable,key)" to skip this metamethod.
 *
 * http://www.lua.org/pil/13.4.1.html
 *
 * __newindex
 *
 * Like __index, but controls property assignment.
 *
 * __mode - Control weak references. A string value with one or both
 *          of the characters 'k' and 'v' which specifies that the
 *          keys and/or values in the table are weak references.
 *
 * __call - Treat a table like a function. When a table is followed by
 *          parenthesis such as "myTable( 'foo' )" and the metatable has
 *          a __call key pointing to a function, that function is invoked
 *          (passing any specified arguments) and the return value is
 *          returned.
 *
 * __metatable - Hide the metatable. When "getmetatable( myTable )" is
 *               called, if the metatable for myTable has a __metatable
 *               key, the value of that key is returned instead of the
 *               actual metatable.
 *
 * __tostring - Control string representation. When the builtin
 *              "tostring( myTable )" function is called, if the metatable
 *              for myTable has a __tostring property set to a function,
 *              that function is invoked (passing myTable to it) and the
 *              return value is used as the string representation.
 *
 * __len - Control table length. When the table length is requested using
 *         the length operator ( '#' ), if the metatable for myTable has
 *         a __len key pointing to a function, that function is invoked
 *         (passing myTable to it) and the return value used as the value
 *         of "#myTable".
 *
 * __gc - Userdata finalizer code. When userdata is set to be garbage
 *        collected, if the metatable has a __gc field pointing to a
 *        function, that function is first invoked, passing the userdata
 *        to it. The __gc metamethod is not called for tables.
 *        (See http://lua-users.org/lists/lua-l/2006-11/msg00508.html)
 *
 * Special metamethods for redefining standard operators:
 * http://www.lua.org/pil/13.1.html
 *
 * __add    "+"
 * __sub    "-"
 * __mul    "*"
 * __div    "/"
 * __unm    "-" (unary minus)
 * __pow    "^"
 * __concat ".."
 *
 * Special methods for redefining standard relations
 * http://www.lua.org/pil/13.2.html
 *
 * __eq "=="
 * __lt "<"
 * __le "<="
 */

/*
 *
 *
 * Class Map
 *
 *
 */

/* Returns a struct map_descriptor if the stack entry "ud" is
 * a class Map, otherwise it throws an error.
 */
__LJMP static struct map_descriptor *hlua_checkmap(lua_State *L, int ud)
{
	return MAY_LJMP(hlua_checkudata(L, ud, class_map_ref));
}

/* This function is the map constructor. It doesn't need
 * the class Map object. It creates and returns a new Map
 * object. It must only be called from a "body" or "init"
 * context because it performs some filesystem accesses.
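 *
 * Usage sketch from a Lua script (the map file path is illustrative;
 * Map._str and the lookup()/slookup() methods belong to this class):
 *
 *   local colors = Map.new("/etc/haproxy/colors.map", Map._str)
 *   core.register_fetches("color", function(txn)
 *       return colors:slookup(txn.f:src())  -- "" when the key is missing
 *   end)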
+ */ +__LJMP static int hlua_map_new(struct lua_State *L) +{ + const char *fn; + int match = PAT_MATCH_STR; + struct sample_conv conv; + const char *file = ""; + int line = 0; + lua_Debug ar; + char *err = NULL; + struct arg args[2]; + + if (lua_gettop(L) < 1 || lua_gettop(L) > 2) + WILL_LJMP(luaL_error(L, "'new' needs at least 1 argument.")); + + fn = MAY_LJMP(luaL_checkstring(L, 1)); + + if (lua_gettop(L) >= 2) { + match = MAY_LJMP(luaL_checkinteger(L, 2)); + if (match < 0 || match >= PAT_MATCH_NUM) + WILL_LJMP(luaL_error(L, "'new' needs a valid match method.")); + } + + /* Get Lua filename and line number. */ + if (lua_getstack(L, 1, &ar)) { /* check function at level */ + lua_getinfo(L, "Sl", &ar); /* get info about it */ + if (ar.currentline > 0) { /* is there info? */ + file = ar.short_src; + line = ar.currentline; + } + } + + /* fill fake sample_conv struct. */ + conv.kw = ""; /* unused. */ + conv.process = NULL; /* unused. */ + conv.arg_mask = 0; /* unused. */ + conv.val_args = NULL; /* unused. */ + conv.out_type = SMP_T_STR; + conv.private = (void *)(long)match; + switch (match) { + case PAT_MATCH_STR: conv.in_type = SMP_T_STR; break; + case PAT_MATCH_BEG: conv.in_type = SMP_T_STR; break; + case PAT_MATCH_SUB: conv.in_type = SMP_T_STR; break; + case PAT_MATCH_DIR: conv.in_type = SMP_T_STR; break; + case PAT_MATCH_DOM: conv.in_type = SMP_T_STR; break; + case PAT_MATCH_END: conv.in_type = SMP_T_STR; break; + case PAT_MATCH_REG: conv.in_type = SMP_T_STR; break; + case PAT_MATCH_INT: conv.in_type = SMP_T_SINT; break; + case PAT_MATCH_IP: conv.in_type = SMP_T_ADDR; break; + default: + WILL_LJMP(luaL_error(L, "'new' doesn't support this match mode.")); + } + + /* fill fake args. */ + args[0].type = ARGT_STR; + args[0].data.str.area = strdup(fn); + args[0].data.str.data = strlen(fn); + args[0].data.str.size = args[0].data.str.data+1; + args[1].type = ARGT_STOP; + + /* load the map. */ + if (!sample_load_map(args, &conv, file, line, &err)) { + /* error case: we can't use luaL_error because we must + * free the err variable. + */ + luaL_where(L, 1); + lua_pushfstring(L, "'new': %s.", err); + lua_concat(L, 2); + free(err); + chunk_destroy(&args[0].data.str); + WILL_LJMP(lua_error(L)); + } + + /* create the lua object. */ + lua_newtable(L); + lua_pushlightuserdata(L, args[0].data.map); + lua_rawseti(L, -2, 0); + + /* Pop a class Map metatable and affect it to the userdata. 
*/ + lua_rawgeti(L, LUA_REGISTRYINDEX, class_map_ref); + lua_setmetatable(L, -2); + + + return 1; +} + +__LJMP static inline int _hlua_map_lookup(struct lua_State *L, int str) +{ + struct map_descriptor *desc; + struct pattern *pat; + struct sample smp; + + MAY_LJMP(check_args(L, 2, "lookup")); + desc = MAY_LJMP(hlua_checkmap(L, 1)); + if (desc->pat.expect_type == SMP_T_SINT) { + smp.data.type = SMP_T_SINT; + smp.data.u.sint = MAY_LJMP(luaL_checkinteger(L, 2)); + } + else { + smp.data.type = SMP_T_STR; + smp.flags = SMP_F_CONST; + smp.data.u.str.area = (char *)MAY_LJMP(luaL_checklstring(L, 2, (size_t *)&smp.data.u.str.data)); + smp.data.u.str.size = smp.data.u.str.data + 1; + } + + pat = pattern_exec_match(&desc->pat, &smp, 1); + if (!pat || !pat->data) { + if (str) + lua_pushstring(L, ""); + else + lua_pushnil(L); + return 1; + } + + /* The Lua pattern must return a string, so we can't check the returned type */ + lua_pushlstring(L, pat->data->u.str.area, pat->data->u.str.data); + return 1; +} + +__LJMP static int hlua_map_lookup(struct lua_State *L) +{ + return _hlua_map_lookup(L, 0); +} + +__LJMP static int hlua_map_slookup(struct lua_State *L) +{ + return _hlua_map_lookup(L, 1); +} + +/* + * + * + * Class Socket + * + * + */ + +__LJMP static struct hlua_socket *hlua_checksocket(lua_State *L, int ud) +{ + return MAY_LJMP(hlua_checkudata(L, ud, class_socket_ref)); +} + +/* This function is the handler called for each I/O on the established + * connection. It is used for notify space available to send or data + * received. + */ +static void hlua_socket_handler(struct appctx *appctx) +{ + struct hlua_csk_ctx *ctx = appctx->svcctx; + struct stconn *sc = appctx_sc(appctx); + + if (unlikely(se_fl_test(appctx->sedesc, (SE_FL_EOS|SE_FL_ERROR|SE_FL_SHR|SE_FL_SHW)))) { + co_skip(sc_oc(sc), co_data(sc_oc(sc))); + notification_wake(&ctx->wake_on_read); + notification_wake(&ctx->wake_on_write); + return; + } + + if (ctx->die) { + se_fl_set(appctx->sedesc, SE_FL_EOI|SE_FL_EOS); + notification_wake(&ctx->wake_on_read); + notification_wake(&ctx->wake_on_write); + return; + } + + /* If we can't write, wakeup the pending write signals. */ + if (channel_output_closed(sc_ic(sc))) + notification_wake(&ctx->wake_on_write); + + /* If we can't read, wakeup the pending read signals. */ + if (channel_input_closed(sc_oc(sc))) + notification_wake(&ctx->wake_on_read); + + /* if the connection is not established, inform the stream that we want + * to be notified whenever the connection completes. + */ + if (sc_opposite(sc)->state < SC_ST_EST) { + applet_need_more_data(appctx); + se_need_remote_conn(appctx->sedesc); + applet_have_more_data(appctx); + return; + } + + /* This function is called after the connect. */ + ctx->connected = 1; + + /* Wake the tasks which wants to write if the buffer have available space. */ + if (channel_may_recv(sc_ic(sc))) + notification_wake(&ctx->wake_on_write); + + /* Wake the tasks which wants to read if the buffer contains data. */ + if (co_data(sc_oc(sc))) + notification_wake(&ctx->wake_on_read); + + /* If write notifications are registered, we considers we want + * to write, so we clear the blocking flag. + */ + if (notification_registered(&ctx->wake_on_write)) + applet_have_more_data(appctx); +} + +static int hlua_socket_init(struct appctx *appctx) +{ + struct hlua_csk_ctx *csk_ctx = appctx->svcctx; + struct stream *s; + + if (appctx_finalize_startup(appctx, socket_proxy, &BUF_NULL) == -1) + goto error; + + s = appctx_strm(appctx); + + /* Configure "right" stream connector. 
This stconn is used to connect and retrieve data from the server. The
 * connection is initialized with the "struct server".
 */
	sc_set_state(s->scb, SC_ST_ASS);

	/* Force the destination server. */
	s->flags |= SF_DIRECT | SF_ASSIGNED | SF_BE_ASSIGNED;
	s->target = &csk_ctx->srv->obj_type;

	if (csk_ctx->timeout) {
		s->sess->fe->timeout.connect = csk_ctx->timeout;
		s->scf->ioto = csk_ctx->timeout;
		s->scb->ioto = csk_ctx->timeout;
	}

	return 0;

 error:
	return -1;
}

/* This function is called when the "struct stream" is destroyed.
 * It removes the link from the object to this stream and wakes
 * all the pending signals.
 */
static void hlua_socket_release(struct appctx *appctx)
{
	struct hlua_csk_ctx *ctx = appctx->svcctx;
	struct xref *peer;

	/* Remove my link in the original objects. */
	peer = xref_get_peer_and_lock(&ctx->xref);
	if (peer)
		xref_disconnect(&ctx->xref, peer);

	/* Wake all the tasks waiting for me. */
	notification_wake(&ctx->wake_on_read);
	notification_wake(&ctx->wake_on_write);
}

/* If the garbage collection of the object is launched, nobody
 * uses this object anymore. If the stream does not exist, just quit.
 * Otherwise, send the shutdown signal to the stream. In some cases,
 * pending signals can remain in the read and write lists; destroy
 * them.
 */
__LJMP static int hlua_socket_gc(lua_State *L)
{
	struct hlua_socket *socket;
	struct hlua_csk_ctx *ctx;
	struct xref *peer;

	MAY_LJMP(check_args(L, 1, "__gc"));

	socket = MAY_LJMP(hlua_checksocket(L, 1));
	peer = xref_get_peer_and_lock(&socket->xref);
	if (!peer)
		return 0;

	ctx = container_of(peer, struct hlua_csk_ctx, xref);

	/* Set the flag which destroys the session. */
	ctx->die = 1;
	appctx_wakeup(ctx->appctx);

	/* Remove all references between the Lua stack and the coroutine stream. */
	xref_disconnect(&socket->xref, peer);
	return 0;
}

/* The close function sends a shutdown signal and breaks the
 * links between the stream and the object.
 */
__LJMP static int hlua_socket_close_helper(lua_State *L)
{
	struct hlua_socket *socket;
	struct hlua_csk_ctx *ctx;
	struct xref *peer;
	struct hlua *hlua;

	/* Get the hlua struct, or NULL if we execute from the main lua state */
	hlua = hlua_gethlua(L);
	if (!hlua)
		return 0;

	socket = MAY_LJMP(hlua_checksocket(L, 1));

	/* Check if we run on the same thread as the creator thread.
	 * We cannot access the socket from a different thread.
	 */
	if (socket->tid != tid)
		WILL_LJMP(luaL_error(L, "connect: cannot use socket on other thread"));

	peer = xref_get_peer_and_lock(&socket->xref);
	if (!peer)
		return 0;

	hlua->gc_count--;
	ctx = container_of(peer, struct hlua_csk_ctx, xref);

	/* Set the flag which destroys the session. */
	ctx->die = 1;
	appctx_wakeup(ctx->appctx);

	/* Remove all references between the Lua stack and the coroutine stream. */
	xref_disconnect(&socket->xref, peer);
	return 0;
}

/* The close function calls close_helper.
 */
__LJMP static int hlua_socket_close(lua_State *L)
{
	MAY_LJMP(check_args(L, 1, "close"));
	return hlua_socket_close_helper(L);
}

/* This Lua function assumes that the stack contains three parameters:
 *  1 - USERDATA containing a struct socket
 *  2 - INTEGER with values of the macros defined below:
 *      If the integer is -1, we must read at most one line.
 *      If the integer is -2, we must read all the data until the
 *      end of the stream.
 *      If the integer is a positive value, we must read the number of
 *      bytes corresponding to this value.
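 *
 * From Lua these values correspond to the documented receive() patterns;
 * a short sketch, assuming "sock" is an already-connected socket object:
 *
 *   local line = sock:receive("*l")  -- one line (-1 internally)
 *   local all  = sock:receive("*a")  -- everything until close (-2)
 *   local blk  = sock:receive(16)    -- exactly 16 bytes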
+ */ +#define HLSR_READ_LINE (-1) +#define HLSR_READ_ALL (-2) +__LJMP static int hlua_socket_receive_yield(struct lua_State *L, int status, lua_KContext ctx) +{ + struct hlua_socket *socket = MAY_LJMP(hlua_checksocket(L, 1)); + int wanted = lua_tointeger(L, 2); + struct hlua *hlua; + struct hlua_csk_ctx *csk_ctx; + struct appctx *appctx; + size_t len; + int nblk; + const char *blk1; + size_t len1; + const char *blk2; + size_t len2; + int skip_at_end = 0; + struct channel *oc; + struct stream *s; + struct xref *peer; + int missing_bytes; + + /* Get hlua struct, or NULL if we execute from main lua state */ + hlua = hlua_gethlua(L); + + /* Check if this lua stack is schedulable. */ + if (!hlua || !hlua->task) + WILL_LJMP(luaL_error(L, "The 'receive' function is only allowed in " + "'frontend', 'backend' or 'task'")); + + /* Check if we run on the same thread than the xreator thread. + * We cannot access to the socket if the thread is different. + */ + if (socket->tid != tid) + WILL_LJMP(luaL_error(L, "connect: cannot use socket on other thread")); + + /* check for connection break. If some data where read, return it. */ + peer = xref_get_peer_and_lock(&socket->xref); + if (!peer) + goto no_peer; + + csk_ctx = container_of(peer, struct hlua_csk_ctx, xref); + if (!csk_ctx->connected) + goto connection_closed; + + appctx = csk_ctx->appctx; + s = appctx_strm(appctx); + + oc = &s->res; + if (wanted == HLSR_READ_LINE) { + /* Read line. */ + nblk = co_getline_nc(oc, &blk1, &len1, &blk2, &len2); + if (nblk < 0) /* Connection close. */ + goto connection_closed; + if (nblk == 0) /* No data available. */ + goto connection_empty; + + /* remove final \r\n. */ + if (nblk == 1) { + if (blk1[len1-1] == '\n') { + len1--; + skip_at_end++; + if (blk1[len1-1] == '\r') { + len1--; + skip_at_end++; + } + } + } + else { + if (blk2[len2-1] == '\n') { + len2--; + skip_at_end++; + if (blk2[len2-1] == '\r') { + len2--; + skip_at_end++; + } + } + } + } + + else if (wanted == HLSR_READ_ALL) { + /* Read all the available data. */ + nblk = co_getblk_nc(oc, &blk1, &len1, &blk2, &len2); + if (nblk < 0) /* Connection close. */ + goto connection_closed; + if (nblk == 0) /* No data available. */ + goto connection_empty; + } + + else { + /* Read a block of data. */ + nblk = co_getblk_nc(oc, &blk1, &len1, &blk2, &len2); + if (nblk < 0) /* Connection close. */ + goto connection_closed; + if (nblk == 0) /* No data available. */ + goto connection_empty; + + missing_bytes = wanted - socket->b.n; + if (len1 > missing_bytes) { + nblk = 1; + len1 = missing_bytes; + } if (nblk == 2 && len1 + len2 > missing_bytes) + len2 = missing_bytes - len1; + } + + len = len1; + + luaL_addlstring(&socket->b, blk1, len1); + if (nblk == 2) { + len += len2; + luaL_addlstring(&socket->b, blk2, len2); + } + + /* Consume data. */ + co_skip(oc, len + skip_at_end); + + /* Don't wait anything. */ + appctx_wakeup(appctx); + + /* If the pattern reclaim to read all the data + * in the connection, got out. + */ + if (wanted == HLSR_READ_ALL) + goto connection_empty; + else if (wanted >= 0 && socket->b.n < wanted) + goto connection_empty; + + /* Return result. */ + luaL_pushresult(&socket->b); + xref_unlock(&socket->xref, peer); + return 1; + +connection_closed: + + xref_unlock(&socket->xref, peer); + +no_peer: + + /* If the buffer containds data. 
*/ + if (socket->b.n > 0) { + luaL_pushresult(&socket->b); + return 1; + } + lua_pushnil(L); + lua_pushstring(L, "connection closed."); + return 2; + +connection_empty: + + if (!notification_new(&hlua->com, &csk_ctx->wake_on_read, hlua->task)) { + xref_unlock(&socket->xref, peer); + WILL_LJMP(luaL_error(L, "out of memory")); + } + xref_unlock(&socket->xref, peer); + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_socket_receive_yield, TICK_ETERNITY, 0)); + return 0; +} + +/* This Lua function gets two parameters. The first one can be string + * or a number. If the string is "*l", the user requires one line. If + * the string is "*a", the user requires all the contents of the stream. + * If the value is a number, the user require a number of bytes equal + * to the value. The default value is "*l" (a line). + * + * This parameter with a variable type is converted in integer. This + * integer takes this values: + * -1 : read a line + * -2 : read all the stream + * >0 : amount of bytes. + * + * The second parameter is optional. It contains a string that must be + * concatenated with the read data. + */ +__LJMP static int hlua_socket_receive(struct lua_State *L) +{ + int wanted = HLSR_READ_LINE; + const char *pattern; + int lastarg, type; + char *error; + size_t len; + struct hlua_socket *socket; + + if (lua_gettop(L) < 1 || lua_gettop(L) > 3) + WILL_LJMP(luaL_error(L, "The 'receive' function requires between 1 and 3 arguments.")); + + socket = MAY_LJMP(hlua_checksocket(L, 1)); + + /* Check if we run on the same thread than the xreator thread. + * We cannot access to the socket if the thread is different. + */ + if (socket->tid != tid) + WILL_LJMP(luaL_error(L, "connect: cannot use socket on other thread")); + + /* check for pattern. */ + if (lua_gettop(L) >= 2) { + type = lua_type(L, 2); + if (type == LUA_TSTRING) { + pattern = lua_tostring(L, 2); + if (strcmp(pattern, "*a") == 0) + wanted = HLSR_READ_ALL; + else if (strcmp(pattern, "*l") == 0) + wanted = HLSR_READ_LINE; + else { + wanted = strtoll(pattern, &error, 10); + if (*error != '\0') + WILL_LJMP(luaL_error(L, "Unsupported pattern.")); + } + } + else if (type == LUA_TNUMBER) { + wanted = lua_tointeger(L, 2); + if (wanted < 0) + WILL_LJMP(luaL_error(L, "Unsupported size.")); + } + } + + /* Set pattern. */ + lua_pushinteger(L, wanted); + + /* Check if we would replace the top by itself. */ + if (lua_gettop(L) != 2) + lua_replace(L, 2); + + /* Save index of the top of the stack because since buffers are used, it + * may change + */ + lastarg = lua_gettop(L); + + /* init buffer, and fill it with prefix. */ + luaL_buffinit(L, &socket->b); + + /* Check prefix. */ + if (lastarg >= 3) { + if (lua_type(L, 3) != LUA_TSTRING) + WILL_LJMP(luaL_error(L, "Expect a 'string' for the prefix")); + pattern = lua_tolstring(L, 3, &len); + luaL_addlstring(&socket->b, pattern, len); + } + + return __LJMP(hlua_socket_receive_yield(L, 0, 0)); +} + +/* Write the Lua input string in the output buffer. + * This function returns a yield if no space is available. + */ +static int hlua_socket_write_yield(struct lua_State *L,int status, lua_KContext ctx) +{ + struct hlua_socket *socket; + struct hlua *hlua; + struct hlua_csk_ctx *csk_ctx; + struct appctx *appctx; + size_t buf_len; + const char *buf; + int len; + int send_len; + int sent; + struct xref *peer; + struct stream *s; + struct stconn *sc; + + /* Get hlua struct, or NULL if we execute from main lua state */ + hlua = hlua_gethlua(L); + + /* Check if this lua stack is schedulable. 
	 */
	if (!hlua || !hlua->task)
		WILL_LJMP(luaL_error(L, "The 'write' function is only allowed in "
		                        "'frontend', 'backend' or 'task'"));

	/* Get the object. */
	socket = MAY_LJMP(hlua_checksocket(L, 1));
	buf = MAY_LJMP(luaL_checklstring(L, 2, &buf_len));
	sent = MAY_LJMP(luaL_checkinteger(L, 3));

	/* Check if we run on the same thread as the creator thread.
	 * We cannot access the socket from a different thread.
	 */
	if (socket->tid != tid)
		WILL_LJMP(luaL_error(L, "connect: cannot use socket on other thread"));

	/* Check for a connection break. If some data were read, return them. */
	peer = xref_get_peer_and_lock(&socket->xref);
	if (!peer) {
		lua_pushinteger(L, -1);
		return 1;
	}

	csk_ctx = container_of(peer, struct hlua_csk_ctx, xref);
	if (!csk_ctx->connected) {
		xref_unlock(&socket->xref, peer);
		lua_pushinteger(L, -1);
		return 1;
	}

	appctx = csk_ctx->appctx;
	sc = appctx_sc(appctx);
	s = __sc_strm(sc);

	/* Check for connection close. */
	if (channel_output_closed(&s->req)) {
		xref_unlock(&socket->xref, peer);
		lua_pushinteger(L, -1);
		return 1;
	}

	/* Update the input buffer data. */
	buf += sent;
	send_len = buf_len - sent;

	/* All the data was sent. */
	if (sent >= buf_len) {
		xref_unlock(&socket->xref, peer);
		return 1; /* Implicitly return the length sent. */
	}

	/* Check if the buffer is available, because HAProxy doesn't allocate
	 * the request buffer if it's not required.
	 */
	if (s->req.buf.size == 0) {
		if (!sc_alloc_ibuf(sc, &appctx->buffer_wait))
			goto hlua_socket_write_yield_return;
	}

	/* Check for available space. */
	len = b_room(&s->req.buf);
	if (len <= 0) {
		goto hlua_socket_write_yield_return;
	}

	/* Send data. */
	if (len < send_len)
		send_len = len;
	len = ci_putblk(&s->req, buf, send_len);

	/* "Not enough space" (-1) and "Buffer too small to contain
	 * the data" (-2) are not expected because the available length
	 * is tested. Other unknown errors are also not expected.
	 */
	if (len <= 0) {
		if (len == -1)
			s->req.flags |= CF_WAKE_WRITE;

		MAY_LJMP(hlua_socket_close_helper(L));
		lua_pop(L, 1);
		lua_pushinteger(L, -1);
		xref_unlock(&socket->xref, peer);
		return 1;
	}

	/* Update buffers. */
	appctx_wakeup(appctx);

	/* Update the length sent. */
	lua_pop(L, 1);
	lua_pushinteger(L, sent + len);

	/* Was the whole data buffer sent? */
	if (sent + len >= buf_len) {
		xref_unlock(&socket->xref, peer);
		return 1;
	}

hlua_socket_write_yield_return:
	if (!notification_new(&hlua->com, &csk_ctx->wake_on_write, hlua->task)) {
		xref_unlock(&socket->xref, peer);
		WILL_LJMP(luaL_error(L, "out of memory"));
	}
	xref_unlock(&socket->xref, peer);
	MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_socket_write_yield, TICK_ETERNITY, 0));
	return 0;
}

/* This function initiates the sending of data. It just checks the input
 * parameters and pushes onto the Lua stack an integer that contains the
 * amount of data written to the buffer. This is used by the function
 * "hlua_socket_write_yield", which can yield.
 *
 * The Lua function gets between 2 and 4 parameters. The first one is
 * the associated object. The second is a string buffer. The third is
 * an optional integer that gives the position in the buffer of the
 * first byte of the data to send. The first byte is at position "1",
 * which is the default value. The fourth argument is an optional
 * integer that gives the position in the buffer of the last byte of
 * the data to send. The default is the last byte.
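 *
 * A sketch of the corresponding Lua calls ("sock" and "buf" are
 * illustrative):
 *
 *   sock:send(buf)         -- send the whole string
 *   sock:send(buf, 3)      -- send from the 3rd byte to the end
 *   sock:send(buf, 1, 4)   -- send the first four bytes only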
+ */ +static int hlua_socket_send(struct lua_State *L) +{ + int i; + int j; + const char *buf; + size_t buf_len; + + /* Check number of arguments. */ + if (lua_gettop(L) < 2 || lua_gettop(L) > 4) + WILL_LJMP(luaL_error(L, "'send' needs between 2 and 4 arguments")); + + /* Get the string. */ + buf = MAY_LJMP(luaL_checklstring(L, 2, &buf_len)); + + /* Get and check j. */ + if (lua_gettop(L) == 4) { + j = MAY_LJMP(luaL_checkinteger(L, 4)); + if (j < 0) + j = buf_len + j + 1; + if (j > buf_len) + j = buf_len + 1; + lua_pop(L, 1); + } + else + j = buf_len; + + /* Get and check i. */ + if (lua_gettop(L) == 3) { + i = MAY_LJMP(luaL_checkinteger(L, 3)); + if (i < 0) + i = buf_len + i + 1; + if (i > buf_len) + i = buf_len + 1; + lua_pop(L, 1); + } else + i = 1; + + /* Check bth i and j. */ + if (i > j) { + lua_pushinteger(L, 0); + return 1; + } + if (i == 0 && j == 0) { + lua_pushinteger(L, 0); + return 1; + } + if (i == 0) + i = 1; + if (j == 0) + j = 1; + + /* Pop the string. */ + lua_pop(L, 1); + + /* Update the buffer length. */ + buf += i - 1; + buf_len = j - i + 1; + lua_pushlstring(L, buf, buf_len); + + /* This unsigned is used to remember the amount of sent data. */ + lua_pushinteger(L, 0); + + return MAY_LJMP(hlua_socket_write_yield(L, 0, 0)); +} + +#define SOCKET_INFO_MAX_LEN sizeof("[0000:0000:0000:0000:0000:0000:0000:0000]:12345") +__LJMP static inline int hlua_socket_info(struct lua_State *L, const struct sockaddr_storage *addr) +{ + static char buffer[SOCKET_INFO_MAX_LEN]; + int ret; + int len; + char *p; + + ret = addr_to_str(addr, buffer+1, SOCKET_INFO_MAX_LEN-1); + if (ret <= 0) { + lua_pushnil(L); + return 1; + } + + if (ret == AF_UNIX) { + lua_pushstring(L, buffer+1); + return 1; + } + else if (ret == AF_INET6) { + buffer[0] = '['; + len = strlen(buffer); + buffer[len] = ']'; + len++; + buffer[len] = ':'; + len++; + p = buffer; + } + else if (ret == AF_INET) { + p = buffer + 1; + len = strlen(p); + p[len] = ':'; + len++; + } + else { + lua_pushnil(L); + return 1; + } + + if (port_to_str(addr, p + len, SOCKET_INFO_MAX_LEN-1 - len) <= 0) { + lua_pushnil(L); + return 1; + } + + lua_pushstring(L, p); + return 1; +} + +/* Returns information about the peer of the connection. */ +__LJMP static int hlua_socket_getpeername(struct lua_State *L) +{ + struct hlua_socket *socket; + struct xref *peer; + struct hlua_csk_ctx *csk_ctx; + struct appctx *appctx; + struct stconn *sc; + const struct sockaddr_storage *dst; + int ret; + + MAY_LJMP(check_args(L, 1, "getpeername")); + + socket = MAY_LJMP(hlua_checksocket(L, 1)); + + /* Check if we run on the same thread than the xreator thread. + * We cannot access to the socket if the thread is different. + */ + if (socket->tid != tid) + WILL_LJMP(luaL_error(L, "connect: cannot use socket on other thread")); + + /* check for connection break. If some data where read, return it. */ + peer = xref_get_peer_and_lock(&socket->xref); + if (!peer) { + lua_pushnil(L); + return 1; + } + + csk_ctx = container_of(peer, struct hlua_csk_ctx, xref); + if (!csk_ctx->connected) { + xref_unlock(&socket->xref, peer); + lua_pushnil(L); + return 1; + } + + appctx = csk_ctx->appctx; + sc = appctx_sc(appctx); + dst = sc_dst(sc_opposite(sc)); + if (!dst) { + xref_unlock(&socket->xref, peer); + lua_pushnil(L); + return 1; + } + + ret = MAY_LJMP(hlua_socket_info(L, dst)); + xref_unlock(&socket->xref, peer); + return ret; +} + +/* Returns information about my connection side. 
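 *
 * Both introspection methods return an "address:port" string on success
 * and nil on failure, e.g. (a sketch; the addresses are illustrative):
 *
 *   local peer = sock:getpeername()  -- e.g. "192.168.0.1:80"
 *   local me   = sock:getsockname()  -- e.g. "127.0.0.1:49152"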
 */
static int hlua_socket_getsockname(struct lua_State *L)
{
	struct hlua_socket *socket;
	struct connection *conn;
	struct appctx *appctx;
	struct xref *peer;
	struct hlua_csk_ctx *csk_ctx;
	struct stream *s;
	int ret;

	MAY_LJMP(check_args(L, 1, "getsockname"));

	socket = MAY_LJMP(hlua_checksocket(L, 1));

	/* Check if we run on the same thread as the creator thread.
	 * We cannot access the socket from a different thread.
	 */
	if (socket->tid != tid)
		WILL_LJMP(luaL_error(L, "connect: cannot use socket on other thread"));

	/* Check for a connection break. */
	peer = xref_get_peer_and_lock(&socket->xref);
	if (!peer) {
		lua_pushnil(L);
		return 1;
	}

	csk_ctx = container_of(peer, struct hlua_csk_ctx, xref);
	if (!csk_ctx->connected) {
		xref_unlock(&socket->xref, peer);
		lua_pushnil(L);
		return 1;
	}

	appctx = csk_ctx->appctx;
	s = appctx_strm(appctx);

	conn = sc_conn(s->scb);
	if (!conn || !conn_get_src(conn)) {
		xref_unlock(&socket->xref, peer);
		lua_pushnil(L);
		return 1;
	}

	ret = hlua_socket_info(L, conn->src);
	xref_unlock(&socket->xref, peer);
	return ret;
}

/* This struct defines the applet. */
static struct applet update_applet = {
	.obj_type = OBJ_TYPE_APPLET,
	.name = "<LUA_TCP>",
	.fct = hlua_socket_handler,
	.init = hlua_socket_init,
	.release = hlua_socket_release,
};

__LJMP static int hlua_socket_connect_yield(struct lua_State *L, int status, lua_KContext ctx)
{
	struct hlua_socket *socket = MAY_LJMP(hlua_checksocket(L, 1));
	struct hlua *hlua;
	struct xref *peer;
	struct hlua_csk_ctx *csk_ctx;
	struct appctx *appctx;
	struct stream *s;

	/* Get the hlua struct, or NULL if we execute from the main lua state */
	hlua = hlua_gethlua(L);
	if (!hlua)
		return 0;

	/* Check if we run on the same thread as the creator thread.
	 * We cannot access the socket from a different thread.
	 */
	if (socket->tid != tid)
		WILL_LJMP(luaL_error(L, "connect: cannot use socket on other thread"));

	/* Check for a connection break. */
	peer = xref_get_peer_and_lock(&socket->xref);
	if (!peer) {
		lua_pushnil(L);
		lua_pushstring(L, "Can't connect");
		return 2;
	}

	csk_ctx = container_of(peer, struct hlua_csk_ctx, xref);
	appctx = csk_ctx->appctx;
	s = appctx_strm(appctx);

	/* Check if we run on the same thread as the creator thread.
	 * We cannot access the socket from a different thread.
	 */
	if (socket->tid != tid) {
		xref_unlock(&socket->xref, peer);
		WILL_LJMP(luaL_error(L, "connect: cannot use socket on other thread"));
	}

	/* Check for connection close. */
	if (!hlua || channel_output_closed(&s->req)) {
		xref_unlock(&socket->xref, peer);
		lua_pushnil(L);
		lua_pushstring(L, "Can't connect");
		return 2;
	}

	appctx = __sc_appctx(s->scf);

	/* Check whether the connection is established. */
	if (csk_ctx->connected) {
		xref_unlock(&socket->xref, peer);
		lua_pushinteger(L, 1);
		return 1;
	}

	if (!notification_new(&hlua->com, &csk_ctx->wake_on_write, hlua->task)) {
		xref_unlock(&socket->xref, peer);
		WILL_LJMP(luaL_error(L, "out of memory error"));
	}
	xref_unlock(&socket->xref, peer);
	MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_socket_connect_yield, TICK_ETERNITY, 0));
	return 0;
}

/* This function fails or initiates the connection.
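 *
 * A minimal end-to-end sketch from a Lua task (host and port are
 * illustrative):
 *
 *   core.register_task(function()
 *       local sock = core.tcp()
 *       sock:settimeout(5)  -- seconds
 *       local ok, err = sock:connect("127.0.0.1", 8080)
 *       if ok then
 *           sock:send("GET / HTTP/1.0\r\n\r\n")
 *           core.Info(sock:receive("*a") or "")
 *       end
 *       sock:close()
 *   end)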
*/ +__LJMP static int hlua_socket_connect(struct lua_State *L) +{ + struct hlua_socket *socket; + int port = -1; + const char *ip; + struct hlua *hlua; + struct hlua_csk_ctx *csk_ctx; + struct appctx *appctx; + int low, high; + struct sockaddr_storage *addr; + struct xref *peer; + struct stconn *sc; + + /* Get hlua struct, or NULL if we execute from main lua state */ + hlua = hlua_gethlua(L); + if (!hlua) + return 0; + + if (lua_gettop(L) < 2) + WILL_LJMP(luaL_error(L, "connect: need at least 2 arguments")); + + /* Get args. */ + socket = MAY_LJMP(hlua_checksocket(L, 1)); + + /* Check if we run on the same thread than the xreator thread. + * We cannot access to the socket if the thread is different. + */ + if (socket->tid != tid) + WILL_LJMP(luaL_error(L, "connect: cannot use socket on other thread")); + + ip = MAY_LJMP(luaL_checkstring(L, 2)); + if (lua_gettop(L) >= 3) { + luaL_Buffer b; + port = MAY_LJMP(luaL_checkinteger(L, 3)); + + /* Force the ip to end with a colon, to support IPv6 addresses + * that are not enclosed within square brackets. + */ + if (port > 0) { + luaL_buffinit(L, &b); + luaL_addstring(&b, ip); + luaL_addchar(&b, ':'); + luaL_pushresult(&b); + ip = lua_tolstring(L, lua_gettop(L), NULL); + } + } + + /* check for connection break. If some data where read, return it. */ + peer = xref_get_peer_and_lock(&socket->xref); + if (!peer) { + lua_pushnil(L); + return 1; + } + + csk_ctx = container_of(peer, struct hlua_csk_ctx, xref); + if (!csk_ctx->srv) + csk_ctx->srv = socket_tcp; + + /* Parse ip address. */ + addr = str2sa_range(ip, NULL, &low, &high, NULL, NULL, NULL, NULL, NULL, NULL, PA_O_PORT_OK | PA_O_STREAM); + if (!addr) { + xref_unlock(&socket->xref, peer); + WILL_LJMP(luaL_error(L, "connect: cannot parse destination address '%s'", ip)); + } + + /* Set port. */ + if (low == 0) { + if (addr->ss_family == AF_INET) { + if (port == -1) { + xref_unlock(&socket->xref, peer); + WILL_LJMP(luaL_error(L, "connect: port missing")); + } + ((struct sockaddr_in *)addr)->sin_port = htons(port); + } else if (addr->ss_family == AF_INET6) { + if (port == -1) { + xref_unlock(&socket->xref, peer); + WILL_LJMP(luaL_error(L, "connect: port missing")); + } + ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); + } + } + + appctx = csk_ctx->appctx; + if (appctx_sc(appctx)) { + xref_unlock(&socket->xref, peer); + WILL_LJMP(luaL_error(L, "connect: connect already performed\n")); + } + + if (appctx_init(appctx) == -1) { + xref_unlock(&socket->xref, peer); + WILL_LJMP(luaL_error(L, "connect: fail to init applet.")); + } + + sc = appctx_sc(appctx); + + if (!sockaddr_alloc(&sc_opposite(sc)->dst, addr, sizeof(*addr))) { + xref_unlock(&socket->xref, peer); + WILL_LJMP(luaL_error(L, "connect: internal error")); + } + + /* inform the stream that we want to be notified whenever the + * connection completes. + */ + applet_need_more_data(appctx); + applet_have_more_data(appctx); + appctx_wakeup(appctx); + + hlua->gc_count++; + + if (!notification_new(&hlua->com, &csk_ctx->wake_on_write, hlua->task)) { + xref_unlock(&socket->xref, peer); + WILL_LJMP(luaL_error(L, "out of memory")); + } + xref_unlock(&socket->xref, peer); + + /* Return yield waiting for connection. 
*/ + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_socket_connect_yield, TICK_ETERNITY, 0)); + + return 0; +} + +#ifdef USE_OPENSSL +__LJMP static int hlua_socket_connect_ssl(struct lua_State *L) +{ + struct hlua_socket *socket; + struct xref *peer; + + MAY_LJMP(check_args(L, 3, "connect_ssl")); + socket = MAY_LJMP(hlua_checksocket(L, 1)); + + /* check for connection break. If some data where read, return it. */ + peer = xref_get_peer_and_lock(&socket->xref); + if (!peer) { + lua_pushnil(L); + return 1; + } + + container_of(peer, struct hlua_csk_ctx, xref)->srv = socket_ssl; + + xref_unlock(&socket->xref, peer); + return MAY_LJMP(hlua_socket_connect(L)); +} +#endif + +__LJMP static int hlua_socket_setoption(struct lua_State *L) +{ + return 0; +} + +__LJMP static int hlua_socket_settimeout(struct lua_State *L) +{ + struct hlua_socket *socket; + int tmout; + double dtmout; + struct xref *peer; + struct hlua_csk_ctx *csk_ctx; + struct appctx *appctx; + struct stream *s; + + MAY_LJMP(check_args(L, 2, "settimeout")); + + socket = MAY_LJMP(hlua_checksocket(L, 1)); + + /* convert the timeout to millis */ + dtmout = MAY_LJMP(luaL_checknumber(L, 2)) * 1000; + + /* Check for negative values */ + if (dtmout < 0) + WILL_LJMP(luaL_error(L, "settimeout: cannot set negatives values")); + + if (dtmout > INT_MAX) /* overflow check */ + WILL_LJMP(luaL_error(L, "settimeout: cannot set values larger than %d ms", INT_MAX)); + + tmout = MS_TO_TICKS((int)dtmout); + if (tmout == 0) + tmout++; /* very small timeouts are adjusted to a minimum of 1ms */ + + /* Check if we run on the same thread than the xreator thread. + * We cannot access to the socket if the thread is different. + */ + if (socket->tid != tid) + WILL_LJMP(luaL_error(L, "connect: cannot use socket on other thread")); + + /* check for connection break. If some data were read, return it. */ + peer = xref_get_peer_and_lock(&socket->xref); + if (!peer) { + hlua_pusherror(L, "socket: not yet initialised, you can't set timeouts."); + WILL_LJMP(lua_error(L)); + return 0; + } + + csk_ctx = container_of(peer, struct hlua_csk_ctx, xref); + csk_ctx->timeout = tmout; + + appctx = csk_ctx->appctx; + if (!appctx_sc(appctx)) + goto end; + + s = appctx_strm(csk_ctx->appctx); + + s->sess->fe->timeout.connect = tmout; + s->scf->ioto = tmout; + s->scb->ioto = tmout; + + s->task->expire = (tick_is_expired(s->task->expire, now_ms) ? 0 : s->task->expire); + s->task->expire = tick_first(s->task->expire, tick_add_ifset(now_ms, tmout)); + task_queue(s->task); + + end: + xref_unlock(&socket->xref, peer); + lua_pushinteger(L, 1); + return 1; +} + +__LJMP static int hlua_socket_new(lua_State *L) +{ + struct hlua_socket *socket; + struct hlua_csk_ctx *ctx; + struct appctx *appctx; + + /* Check stack size. */ + if (!lua_checkstack(L, 3)) { + hlua_pusherror(L, "socket: full stack"); + goto out_fail_conf; + } + + /* Create the object: obj[0] = userdata. */ + lua_newtable(L); + socket = MAY_LJMP(lua_newuserdata(L, sizeof(*socket))); + lua_rawseti(L, -2, 0); + memset(socket, 0, sizeof(*socket)); + socket->tid = tid; + + /* Check if the various memory pools are initialized. */ + if (!pool_head_stream || !pool_head_buffer) { + hlua_pusherror(L, "socket: uninitialized pools."); + goto out_fail_conf; + } + + /* Pop a class stream metatable and affect it to the userdata. 
	 */
	lua_rawgeti(L, LUA_REGISTRYINDEX, class_socket_ref);
	lua_setmetatable(L, -2);

	/* Create the applet context. */
	appctx = appctx_new_here(&update_applet, NULL);
	if (!appctx) {
		hlua_pusherror(L, "socket: out of memory");
		goto out_fail_conf;
	}
	ctx = applet_reserve_svcctx(appctx, sizeof(*ctx));
	ctx->connected = 0;
	ctx->die = 0;
	ctx->srv = NULL;
	ctx->timeout = 0;
	ctx->appctx = appctx;
	LIST_INIT(&ctx->wake_on_write);
	LIST_INIT(&ctx->wake_on_read);

	/* Initialise the cross reference between the stream and the Lua socket object. */
	xref_create(&socket->xref, &ctx->xref);
	return 1;

 out_fail_conf:
	WILL_LJMP(lua_error(L));
	return 0;
}

/*
 *
 *
 * Class Channel
 *
 *
 */

/* Returns the struct channel joined to the class Channel in the
 * stack entry "ud", or throws an argument error.
 */
__LJMP static struct channel *hlua_checkchannel(lua_State *L, int ud)
{
	return MAY_LJMP(hlua_checkudata(L, ud, class_channel_ref));
}

/* Pushes the channel onto the top of the stack. If the stack does not
 * have a free slot, the function fails and returns 0.
 */
static int hlua_channel_new(lua_State *L, struct channel *channel)
{
	/* Check stack size. */
	if (!lua_checkstack(L, 3))
		return 0;

	lua_newtable(L);
	lua_pushlightuserdata(L, channel);
	lua_rawseti(L, -2, 0);

	/* Pop a class Channel metatable and assign it to the userdata. */
	lua_rawgeti(L, LUA_REGISTRYINDEX, class_channel_ref);
	lua_setmetatable(L, -2);
	return 1;
}

/* Helper function returning a filter attached to a channel at the position <ud>
 * in the stack, filling the current offset and length of the filter. If no
 * filter is attached, NULL is returned and <offset> and <len> are not
 * initialized.
 */
static struct filter *hlua_channel_filter(lua_State *L, int ud, struct channel *chn, size_t *offset, size_t *len)
{
	struct filter *filter = NULL;

	if (lua_getfield(L, ud, "__filter") == LUA_TLIGHTUSERDATA) {
		struct hlua_flt_ctx *flt_ctx;

		filter = lua_touserdata(L, -1);
		flt_ctx = filter->ctx;
		if (hlua_filter_from_payload(filter)) {
			*offset = flt_ctx->cur_off[CHN_IDX(chn)];
			*len = flt_ctx->cur_len[CHN_IDX(chn)];
		}
	}

	lua_pop(L, 1);
	return filter;
}

/* Copies <len> bytes of data present in the channel's buffer, starting at
 * offset <offset>, and puts them in a Lua string variable. It is the caller's
 * responsibility to ensure <len> and <offset> are valid. It always returns the
 * length of the built string. <len> may be 0; in this case an empty string is
 * created and 0 is returned.
 */
static inline int _hlua_channel_dup(struct channel *chn, lua_State *L, size_t offset, size_t len)
{
	size_t block1, block2;
	luaL_Buffer b;

	block1 = len;
	if (block1 > b_contig_data(&chn->buf, b_peek_ofs(&chn->buf, offset)))
		block1 = b_contig_data(&chn->buf, b_peek_ofs(&chn->buf, offset));
	block2 = len - block1;

	luaL_buffinit(L, &b);
	luaL_addlstring(&b, b_peek(&chn->buf, offset), block1);
	if (block2)
		luaL_addlstring(&b, b_orig(&chn->buf), block2);
	luaL_pushresult(&b);
	return len;
}

/* Inserts the string <str> into the channel's buffer at offset <offset>. This
 * function returns -1 if data cannot be copied. Otherwise, it returns the
 * number of bytes copied.
+ */ +static int _hlua_channel_insert(struct channel *chn, lua_State *L, struct ist str, size_t offset) +{ + int ret = 0; + + /* Nothing to do, just return */ + if (unlikely(istlen(str) == 0)) + goto end; + + if (istlen(str) > c_room(chn)) { + ret = -1; + goto end; + } + ret = b_insert_blk(&chn->buf, offset, istptr(str), istlen(str)); + + end: + return ret; +} + +/* Removes <len> bytes of data at the absolute position <offset>. + */ +static void _hlua_channel_delete(struct channel *chn, size_t offset, size_t len) +{ + size_t end = offset + len; + + if (b_peek(&chn->buf, end) != b_tail(&chn->buf)) + b_move(&chn->buf, b_peek_ofs(&chn->buf, end), + b_data(&chn->buf) - end, -len); + b_sub(&chn->buf, len); +} + +/* Copies input data in the channel's buffer. It is possible to set a specific + * offset (0 by default) and a length (all remaining input data starting for the + * offset by default). If there is not enough input data and more data can be + * received, this function yields. + * + * From an action, All input data are considered. For a filter, the offset and + * the length of input data to consider are retrieved from the filter context. + */ +__LJMP static int hlua_channel_get_data_yield(lua_State *L, int status, lua_KContext ctx) +{ + struct channel *chn; + struct filter *filter; + size_t input, output; + int offset, len; + + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + + output = co_data(chn); + input = ci_data(chn); + + filter = hlua_channel_filter(L, 1, chn, &output, &input); + if (filter && !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + offset = output; + if (lua_gettop(L) > 1) { + offset = MAY_LJMP(luaL_checkinteger(L, 2)); + if (offset < 0) + offset = MAX(0, (int)input + offset); + offset += output; + if (offset < output || offset > input + output) { + lua_pushfstring(L, "offset out of range."); + WILL_LJMP(lua_error(L)); + } + } + len = output + input - offset; + if (lua_gettop(L) == 3) { + len = MAY_LJMP(luaL_checkinteger(L, 3)); + if (!len) + goto dup; + if (len == -1) + len = global.tune.bufsize; + if (len < 0) { + lua_pushfstring(L, "length out of range."); + WILL_LJMP(lua_error(L)); + } + } + + /* Wait for more data if possible if no length was specified and there + * is no data or not enough data was received. + */ + if (!len || offset + len > output + input) { + if (!HLUA_CANT_YIELD(hlua_gethlua(L)) && !channel_input_closed(chn) && channel_may_recv(chn)) { + /* Yield waiting for more data, as requested */ + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_channel_get_data_yield, TICK_ETERNITY, 0)); + } + + /* Return 'nil' if there is no data and the channel can't receive more data */ + if (!len) { + lua_pushnil(L); + return -1; + } + + /* Otherwise, return all data */ + len = output + input - offset; + } + + dup: + _hlua_channel_dup(chn, L, offset, len); + return 1; +} + +/* Copies the first line (including the trailing LF) of input data in the + * channel's buffer. It is possible to set a specific offset (0 by default) and + * a length (all remaining input data starting for the offset by default). If + * there is not enough input data and more data can be received, the function + * yields. If a length is explicitly specified, no more data are + * copied. Otherwise, if no LF is found and more data can be received, this + * function yields. + * + * From an action, All input data are considered. For a filter, the offset and + * the length of input data to consider are retrieved from the filter context. 
+ */ +__LJMP static int hlua_channel_get_line_yield(lua_State *L, int status, lua_KContext ctx) +{ + struct channel *chn; + struct filter *filter; + size_t l, input, output; + int offset, len; + + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + output = co_data(chn); + input = ci_data(chn); + + filter = hlua_channel_filter(L, 1, chn, &output, &input); + if (filter && !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + offset = output; + if (lua_gettop(L) > 1) { + offset = MAY_LJMP(luaL_checkinteger(L, 2)); + if (offset < 0) + offset = MAX(0, (int)input + offset); + offset += output; + if (offset < output || offset > input + output) { + lua_pushfstring(L, "offset out of range."); + WILL_LJMP(lua_error(L)); + } + } + + len = output + input - offset; + if (lua_gettop(L) == 3) { + len = MAY_LJMP(luaL_checkinteger(L, 3)); + if (!len) + goto dup; + if (len == -1) + len = global.tune.bufsize; + if (len < 0) { + lua_pushfstring(L, "length out of range."); + WILL_LJMP(lua_error(L)); + } + } + + for (l = 0; l < len; l++) { + if (l + offset >= output + input) + break; + if (*(b_peek(&chn->buf, offset + l)) == '\n') { + len = l+1; + goto dup; + } + } + + /* Wait for more data if possible if no line is found and no length was + * specified or not enough data was received. + */ + if (lua_gettop(L) != 3 || offset + len > output + input) { + if (!HLUA_CANT_YIELD(hlua_gethlua(L)) && !channel_input_closed(chn) && channel_may_recv(chn)) { + /* Yield waiting for more data */ + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_channel_get_line_yield, TICK_ETERNITY, 0)); + } + + /* Return 'nil' if there is no data and the channel can't receive more data */ + if (!len) { + lua_pushnil(L); + return -1; + } + + /* Otherwise, return all data */ + len = output + input - offset; + } + + dup: + _hlua_channel_dup(chn, L, offset, len); + return 1; +} + +/* [ DEPRECATED ] + * + * Duplicate all input data foud in the channel's buffer. The data are not + * removed from the buffer. This function relies on _hlua_channel_dup(). + * + * From an action, All input data are considered. For a filter, the offset and + * the length of input data to consider are retrieved from the filter context. + */ +__LJMP static int hlua_channel_dup(lua_State *L) +{ + struct channel *chn; + struct filter *filter; + size_t offset, len; + + MAY_LJMP(check_args(L, 1, "dup")); + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + if (IS_HTX_STRM(chn_strm(chn))) { + lua_pushfstring(L, "Cannot manipulate HAProxy channels in HTTP mode."); + WILL_LJMP(lua_error(L)); + } + + offset = co_data(chn); + len = ci_data(chn); + + filter = hlua_channel_filter(L, 1, chn, &offset, &len); + if (filter && !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + if (!ci_data(chn) && channel_input_closed(chn)) { + lua_pushnil(L); + return 1; + } + + _hlua_channel_dup(chn, L, offset, len); + return 1; +} + +/* [ DEPRECATED ] + * + * Get all input data foud in the channel's buffer. The data are removed from + * the buffer after the copy. This function relies on _hlua_channel_dup() and + * _hlua_channel_delete(). + * + * From an action, All input data are considered. For a filter, the offset and + * the length of input data to consider are retrieved from the filter context. 
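 *
 * Deprecated usage vs. the newer accessor (a sketch):
 *
 *   local all  = chn:get()   -- deprecated: consumes all input data
 *   local copy = chn:data()  -- preferred: copies without consuming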
 */
__LJMP static int hlua_channel_get(lua_State *L)
{
	struct channel *chn;
	struct filter *filter;
	size_t offset, len;
	int ret;

	MAY_LJMP(check_args(L, 1, "get"));
	chn = MAY_LJMP(hlua_checkchannel(L, 1));
	if (IS_HTX_STRM(chn_strm(chn))) {
		lua_pushfstring(L, "Cannot manipulate HAProxy channels in HTTP mode.");
		WILL_LJMP(lua_error(L));
	}

	offset = co_data(chn);
	len = ci_data(chn);

	filter = hlua_channel_filter(L, 1, chn, &offset, &len);
	if (filter && !hlua_filter_from_payload(filter))
		WILL_LJMP(lua_error(L));

	if (!ci_data(chn) && channel_input_closed(chn)) {
		lua_pushnil(L);
		return 1;
	}

	ret = _hlua_channel_dup(chn, L, offset, len);
	_hlua_channel_delete(chn, offset, ret);
	return 1;
}

/* This function consumes and returns one line. If the channel is closed
 * and the last data does not contain a final '\n', the data is returned
 * without the final '\n'. When no more data is available, it returns a
 * nil value.
 *
 * From an action, all input data are considered. For a filter, the offset and
 * the length of input data to consider are retrieved from the filter context.
 */
__LJMP static int hlua_channel_getline_yield(lua_State *L, int status, lua_KContext ctx)
{
	struct channel *chn;
	struct filter *filter;
	size_t l, offset, len;
	int ret;

	chn = MAY_LJMP(hlua_checkchannel(L, 1));

	offset = co_data(chn);
	len = ci_data(chn);

	filter = hlua_channel_filter(L, 1, chn, &offset, &len);
	if (filter && !hlua_filter_from_payload(filter))
		WILL_LJMP(lua_error(L));

	if (!ci_data(chn) && channel_input_closed(chn)) {
		lua_pushnil(L);
		return 1;
	}

	for (l = 0; l < len; l++) {
		if (*(b_peek(&chn->buf, offset+l)) == '\n') {
			len = l+1;
			goto dup;
		}
	}

	if (!HLUA_CANT_YIELD(hlua_gethlua(L)) && !channel_input_closed(chn) && channel_may_recv(chn)) {
		/* Yield waiting for more data */
		MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_channel_getline_yield, TICK_ETERNITY, 0));
	}

 dup:
	ret = _hlua_channel_dup(chn, L, offset, len);
	_hlua_channel_delete(chn, offset, ret);
	return 1;
}

/* [ DEPRECATED ]
 *
 * Check arguments for the function "hlua_channel_getline_yield".
 */
__LJMP static int hlua_channel_getline(lua_State *L)
{
	struct channel *chn;

	MAY_LJMP(check_args(L, 1, "getline"));
	chn = MAY_LJMP(hlua_checkchannel(L, 1));
	if (IS_HTX_STRM(chn_strm(chn))) {
		lua_pushfstring(L, "Cannot manipulate HAProxy channels in HTTP mode.");
		WILL_LJMP(lua_error(L));
	}
	return MAY_LJMP(hlua_channel_getline_yield(L, 0, 0));
}

/* Retrieves a given amount of input data at the given offset. By default all
 * available input data are returned. The offset may be negative to start from
 * the end of input data. The length may be -1 to set it to the maximum buffer
 * size.
 */
__LJMP static int hlua_channel_get_data(lua_State *L)
{
	struct channel *chn;

	if (lua_gettop(L) < 1 || lua_gettop(L) > 3)
		WILL_LJMP(luaL_error(L, "'data' expects at most 2 arguments"));
	chn = MAY_LJMP(hlua_checkchannel(L, 1));
	if (IS_HTX_STRM(chn_strm(chn))) {
		lua_pushfstring(L, "Cannot manipulate HAProxy channels in HTTP mode.");
		WILL_LJMP(lua_error(L));
	}
	return MAY_LJMP(hlua_channel_get_data_yield(L, 0, 0));
}

/* Retrieves one line of input data at the given offset. By default the first
 * available line is returned. The offset may be negative to start from the
 * end of input data. The length may be -1 to set it to the maximum buffer
 * size.
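 *
 * A sketch of both accessors (offsets are relative to input data and may
 * be negative to count from the end):
 *
 *   local all   = chn:data()       -- every available input byte
 *   local tail  = chn:data(-4)     -- the last 4 bytes
 *   local first = chn:line(0, -1)  -- first line, up to a full buffer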
+/* Retrieves a given amount of input data at the given offset. By default all + * available input data are returned. The offset may be negative to start from + * the end of input data. The length may be -1 to set it to the maximum buffer + * size. + */ +__LJMP static int hlua_channel_get_line(lua_State *L) +{ + struct channel *chn; + + if (lua_gettop(L) < 1 || lua_gettop(L) > 3) + WILL_LJMP(luaL_error(L, "'line' expects at most 2 arguments")); + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + if (IS_HTX_STRM(chn_strm(chn))) { + lua_pushfstring(L, "Cannot manipulate HAProxy channels in HTTP mode."); + WILL_LJMP(lua_error(L)); + } + return MAY_LJMP(hlua_channel_get_line_yield(L, 0, 0)); +} + +/* Appends a string into the input side of the channel. It returns the length of + * the written string, or -1 if the channel is closed or if the buffer size is + * too small for the data. 0 may be returned if nothing is copied. This function + * does not yield. + * + * For a filter, the context is updated on success. + */ +__LJMP static int hlua_channel_append(lua_State *L) +{ + struct channel *chn; + struct filter *filter; + const char *str; + size_t sz, offset, len; + int ret; + + MAY_LJMP(check_args(L, 2, "append")); + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + str = MAY_LJMP(luaL_checklstring(L, 2, &sz)); + if (IS_HTX_STRM(chn_strm(chn))) { + lua_pushfstring(L, "Cannot manipulate HAProxy channels in HTTP mode."); + WILL_LJMP(lua_error(L)); + } + + offset = co_data(chn); + len = ci_data(chn); + + filter = hlua_channel_filter(L, 1, chn, &offset, &len); + if (filter && !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + ret = _hlua_channel_insert(chn, L, ist2(str, sz), offset); + if (ret > 0 && filter) { + struct hlua_flt_ctx *flt_ctx = filter->ctx; + + flt_update_offsets(filter, chn, ret); + flt_ctx->cur_len[CHN_IDX(chn)] += ret; + } + lua_pushinteger(L, ret); + return 1; +} + +/* Prepends a string into the input side of the channel. It returns the length of + * the written string, or -1 if the channel is closed or if the buffer size is + * too small for the data. 0 may be returned if nothing is copied. This function + * does not yield. + * + * For a filter, the context is updated on success. + */ +__LJMP static int hlua_channel_prepend(lua_State *L) +{ + struct channel *chn; + struct filter *filter; + const char *str; + size_t sz, offset, len; + int ret; + + MAY_LJMP(check_args(L, 2, "prepend")); + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + str = MAY_LJMP(luaL_checklstring(L, 2, &sz)); + if (IS_HTX_STRM(chn_strm(chn))) { + lua_pushfstring(L, "Cannot manipulate HAProxy channels in HTTP mode."); + WILL_LJMP(lua_error(L)); + } + + offset = co_data(chn); + len = ci_data(chn); + + filter = hlua_channel_filter(L, 1, chn, &offset, &len); + if (filter && !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + ret = _hlua_channel_insert(chn, L, ist2(str, sz), offset); + if (ret > 0 && filter) { + struct hlua_flt_ctx *flt_ctx = filter->ctx; + + flt_update_offsets(filter, chn, ret); + flt_ctx->cur_len[CHN_IDX(chn)] += ret; + } + + lua_pushinteger(L, ret); + return 1; +}
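A short Lua sketch of the two insertion helpers above, assuming the documented Channel:append()/Channel:prepend() API; both return the number of bytes copied, or -1 on failure:

    core.register_action("tag", { "tcp-req" }, function(txn)
        local chn = txn.req
        -- Both calls write on the input side and never yield.
        if chn:prepend("HELLO ") == -1 then
            core.Warning("buffer too small, nothing prepended")
        end
        chn:append("\r\n")
    end)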
+/* Inserts a string at a given offset in the input data. By default the string + * is inserted in front of input data. It returns the length of the written + * string, or -1 if the channel is closed or if the buffer size is too small + * for the data. + * + * For a filter, the context is updated on success. + */ +__LJMP static int hlua_channel_insert_data(lua_State *L) +{ + struct channel *chn; + struct filter *filter; + const char *str; + size_t sz, input, output; + int ret, offset; + + if (lua_gettop(L) < 2 || lua_gettop(L) > 3) + WILL_LJMP(luaL_error(L, "'insert' expects at least 1 argument and at most 2 arguments")); + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + str = MAY_LJMP(luaL_checklstring(L, 2, &sz)); + + output = co_data(chn); + input = ci_data(chn); + + filter = hlua_channel_filter(L, 1, chn, &output, &input); + if (filter && !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + offset = output; + if (lua_gettop(L) > 2) { + offset = MAY_LJMP(luaL_checkinteger(L, 3)); + if (offset < 0) + offset = MAX(0, (int)input + offset); + offset += output; + if (offset > output + input) { + lua_pushfstring(L, "offset out of range."); + WILL_LJMP(lua_error(L)); + } + } + if (IS_HTX_STRM(chn_strm(chn))) { + lua_pushfstring(L, "Cannot manipulate HAProxy channels in HTTP mode."); + WILL_LJMP(lua_error(L)); + } + + ret = _hlua_channel_insert(chn, L, ist2(str, sz), offset); + if (ret > 0 && filter) { + struct hlua_flt_ctx *flt_ctx = filter->ctx; + + flt_update_offsets(filter, chn, ret); + flt_ctx->cur_len[CHN_IDX(chn)] += ret; + } + + lua_pushinteger(L, ret); + return 1; +} +/* Replaces a given amount of input data at the given offset by a string + * content. By default all remaining data are removed (offset = 0 and len = + * -1). It returns the length of the written string, or -1 if the channel is + * closed or if the buffer size is too small for the data. + * + * For a filter, the context is updated on success. + */ +__LJMP static int hlua_channel_set_data(lua_State *L) +{ + struct channel *chn; + struct filter *filter; + const char *str; + size_t sz, input, output; + int ret, offset, len; + + if (lua_gettop(L) < 2 || lua_gettop(L) > 4) + WILL_LJMP(luaL_error(L, "'set' expects at least 1 argument and at most 3 arguments")); + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + str = MAY_LJMP(luaL_checklstring(L, 2, &sz)); + + if (IS_HTX_STRM(chn_strm(chn))) { + lua_pushfstring(L, "Cannot manipulate HAProxy channels in HTTP mode."); + WILL_LJMP(lua_error(L)); + } + + output = co_data(chn); + input = ci_data(chn); + + filter = hlua_channel_filter(L, 1, chn, &output, &input); + if (filter && !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + offset = output; + if (lua_gettop(L) > 2) { + offset = MAY_LJMP(luaL_checkinteger(L, 3)); + if (offset < 0) + offset = MAX(0, (int)input + offset); + offset += output; + if (offset < output || offset > input + output) { + lua_pushfstring(L, "offset out of range."); + WILL_LJMP(lua_error(L)); + } + } + + len = output + input - offset; + if (lua_gettop(L) == 4) { + len = MAY_LJMP(luaL_checkinteger(L, 4)); + if (!len) + goto set; + if (len == -1) + len = output + input - offset; + if (len < 0 || offset + len > output + input) { + lua_pushfstring(L, "length out of range."); + WILL_LJMP(lua_error(L)); + } + } + + set: + /* Be sure we can copy the string once input data has been removed. */ + if (sz > c_room(chn) + len) + lua_pushinteger(L, -1); + else { + _hlua_channel_delete(chn, offset, len); + ret = _hlua_channel_insert(chn, L, ist2(str, sz), offset); + if (filter) { + struct hlua_flt_ctx *flt_ctx = filter->ctx; + + len -= (ret > 0 ? ret : 0); + flt_update_offsets(filter, chn, -len); + flt_ctx->cur_len[CHN_IDX(chn)] -= len; + } + + lua_pushinteger(L, ret); + } + return 1; +}
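The same building blocks surface as Channel:insert() and Channel:set(). A hedged sketch of the Lua side, assuming the documented signatures (offsets are relative to input data, negative offsets count from its end):

    core.register_action("rewrite", { "tcp-req" }, function(txn)
        local chn = txn.req
        chn:insert("X-mark ", 0)      -- splice a string at a given offset
        chn:set("replaced entirely")  -- drop input data and write a new payload
    end)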
+/* Removes a given amount of input data at the given offset. By default all + * input data are removed (offset = 0 and len = -1). It returns the amount of + * the removed data. + * + * For a filter, the context is updated on success. + */ +__LJMP static int hlua_channel_del_data(lua_State *L) +{ + struct channel *chn; + struct filter *filter; + size_t input, output; + int offset, len; + + if (lua_gettop(L) < 1 || lua_gettop(L) > 3) + WILL_LJMP(luaL_error(L, "'remove' expects at most 2 arguments")); + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + + if (IS_HTX_STRM(chn_strm(chn))) { + lua_pushfstring(L, "Cannot manipulate HAProxy channels in HTTP mode."); + WILL_LJMP(lua_error(L)); + } + + output = co_data(chn); + input = ci_data(chn); + + filter = hlua_channel_filter(L, 1, chn, &output, &input); + if (filter && !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + offset = output; + if (lua_gettop(L) > 1) { + offset = MAY_LJMP(luaL_checkinteger(L, 2)); + if (offset < 0) + offset = MAX(0, (int)input + offset); + offset += output; + if (offset < output || offset > input + output) { + lua_pushfstring(L, "offset out of range."); + WILL_LJMP(lua_error(L)); + } + } + + len = output + input - offset; + if (lua_gettop(L) == 3) { + len = MAY_LJMP(luaL_checkinteger(L, 3)); + if (!len) + goto end; + if (len == -1) + len = output + input - offset; + if (len < 0 || offset + len > output + input) { + lua_pushfstring(L, "length out of range."); + WILL_LJMP(lua_error(L)); + } + } + + _hlua_channel_delete(chn, offset, len); + if (filter) { + struct hlua_flt_ctx *flt_ctx = filter->ctx; + + flt_update_offsets(filter, chn, -len); + flt_ctx->cur_len[CHN_IDX(chn)] -= len; + } + + end: + lua_pushinteger(L, len); + return 1; +}
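Channel:remove() is the deletion counterpart; a minimal sketch, assuming the documented API (the 4-byte prefix is illustrative):

    core.register_action("strip", { "tcp-req" }, function(txn)
        local chn = txn.req
        chn:remove(0, 4)  -- drop the first 4 bytes of input data
        chn:remove()      -- or drop all remaining input data
    end)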
+/* Append data in the output side of the buffer. This data is immediately + * sent. The function returns the amount of data written. If the buffer + * cannot contain the data, the function yields. The function returns -1 + * if the channel is closed. + */ +__LJMP static int hlua_channel_send_yield(lua_State *L, int status, lua_KContext ctx) +{ + struct channel *chn; + struct filter *filter; + const char *str; + size_t offset, len, sz; + int l, ret; + struct hlua *hlua; + + /* Get hlua struct, or NULL if we execute from main lua state */ + hlua = hlua_gethlua(L); + if (!hlua) { + lua_pushnil(L); + return 1; + } + + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + str = MAY_LJMP(luaL_checklstring(L, 2, &sz)); + l = MAY_LJMP(luaL_checkinteger(L, 3)); + + offset = co_data(chn); + len = ci_data(chn); + + filter = hlua_channel_filter(L, 1, chn, &offset, &len); + if (filter && !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + + if (unlikely(channel_output_closed(chn))) { + lua_pushinteger(L, -1); + return 1; + } + + len = c_room(chn); + if (len > sz - l) { + if (filter) { + lua_pushinteger(L, -1); + return 1; + } + len = sz - l; + } + + ret = _hlua_channel_insert(chn, L, ist2(str, len), offset); + if (ret == -1) { + lua_pop(L, 1); + lua_pushinteger(L, -1); + return 1; + } + if (ret) { + if (filter) { + struct hlua_flt_ctx *flt_ctx = filter->ctx; + + + flt_update_offsets(filter, chn, ret); + FLT_OFF(filter, chn) += ret; + flt_ctx->cur_off[CHN_IDX(chn)] += ret; + } + else + c_adv(chn, ret); + + l += ret; + lua_pop(L, 1); + lua_pushinteger(L, l); + } + + if (l < sz) { + /* Yield only if the channel's output is not empty. + * Otherwise it means we cannot add more data. */ + if (co_data(chn) == 0 || HLUA_CANT_YIELD(hlua_gethlua(L))) + return 1; + + /* If we are waiting for space in the response buffer, we + * must set the flag WAKERESWR. This flag requires the task to + * wake up if any activity is detected on the response buffer. + */ + if (chn->flags & CF_ISRESP) + HLUA_SET_WAKERESWR(hlua); + else + HLUA_SET_WAKEREQWR(hlua); + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_channel_send_yield, TICK_ETERNITY, 0)); + } + + return 1; +} + +/* Just a wrapper of "hlua_channel_send_yield". This wrapper permits + * yielding the Lua process, and resuming it without checking the + * input arguments. + * + * This function cannot be called from a filter. + */ +__LJMP static int hlua_channel_send(lua_State *L) +{ + struct channel *chn; + + MAY_LJMP(check_args(L, 2, "send")); + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + if (IS_HTX_STRM(chn_strm(chn))) { + lua_pushfstring(L, "Cannot manipulate HAProxy channels in HTTP mode."); + WILL_LJMP(lua_error(L)); + } + lua_pushinteger(L, 0); + return MAY_LJMP(hlua_channel_send_yield(L, 0, 0)); +} + +/* This function forwards an amount of bytes. The data passes from + * the input side of the buffer to the output side, and can then be + * sent. This function never fails. + * + * The Lua function takes an amount of bytes to be forwarded in + * input. It returns the number of bytes forwarded. + */ +__LJMP static int hlua_channel_forward_yield(lua_State *L, int status, lua_KContext ctx) +{ + struct channel *chn; + struct filter *filter; + size_t offset, len, fwd; + int l, max; + struct hlua *hlua; + + /* Get hlua struct, or NULL if we execute from main lua state */ + hlua = hlua_gethlua(L); + if (!hlua) { + lua_pushnil(L); + return 1; + } + + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + fwd = MAY_LJMP(luaL_checkinteger(L, 2)); + l = MAY_LJMP(luaL_checkinteger(L, -1)); + + offset = co_data(chn); + len = ci_data(chn); + + filter = hlua_channel_filter(L, 1, chn, &offset, &len); + if (filter && !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + max = fwd - l; + if (max > len) + max = len; + + if (filter) { + struct hlua_flt_ctx *flt_ctx = filter->ctx; + + FLT_OFF(filter, chn) += max; + flt_ctx->cur_off[CHN_IDX(chn)] += max; + flt_ctx->cur_len[CHN_IDX(chn)] -= max; + } + else + channel_forward(chn, max); + + l += max; + lua_pop(L, 1); + lua_pushinteger(L, l); + + /* Check if some bytes remain to be forwarded. */ + if (l < fwd) { + /* If the input channel or the output channel is closed, we + * must return the amount of data forwarded. + */ + if (channel_input_closed(chn) || channel_output_closed(chn) || HLUA_CANT_YIELD(hlua_gethlua(L))) + return 1; + + /* If we are waiting for space in the response buffer, we + * must set the flag WAKERESWR. This flag requires the task to + * wake up if any activity is detected on the response buffer. + */ + if (chn->flags & CF_ISRESP) + HLUA_SET_WAKERESWR(hlua); + else + HLUA_SET_WAKEREQWR(hlua); + + /* Otherwise, we can yield waiting for new data in the input side. */ + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_channel_forward_yield, TICK_ETERNITY, 0)); + } + + return 1; +}
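Channel:send() and Channel:forward() wrap the two yielding functions above. A sketch of the Lua side, assuming the documented API (send cannot be used from a filter, as noted above):

    core.register_action("drain", { "tcp-req" }, function(txn)
        local chn = txn.req
        chn:send("banner\r\n")       -- written to the output side, may yield
        local n = chn:forward(128)   -- schedule up to 128 input bytes, may yield
        core.Debug("forwarded " .. n .. " bytes")
    end)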
+/* Just check the input and prepare the stack for the previous + * function "hlua_channel_forward_yield". + * + * This function cannot be called from a filter. + */ +__LJMP static int hlua_channel_forward(lua_State *L) +{ + struct channel *chn; + + MAY_LJMP(check_args(L, 2, "forward")); + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + if (IS_HTX_STRM(chn_strm(chn))) { + lua_pushfstring(L, "Cannot manipulate HAProxy channels in HTTP mode."); + WILL_LJMP(lua_error(L)); + } + lua_pushinteger(L, 0); + return MAY_LJMP(hlua_channel_forward_yield(L, 0, 0)); +} + +/* Just returns the number of bytes available in the input + * side of the buffer. This function never fails. + */ +__LJMP static int hlua_channel_get_in_len(lua_State *L) +{ + struct channel *chn; + struct filter *filter; + size_t output, input; + + MAY_LJMP(check_args(L, 1, "input")); + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + + output = co_data(chn); + input = ci_data(chn); + filter = hlua_channel_filter(L, 1, chn, &output, &input); + if (filter || !IS_HTX_STRM(chn_strm(chn))) + lua_pushinteger(L, input); + else { + struct htx *htx = htxbuf(&chn->buf); + + lua_pushinteger(L, htx->data - co_data(chn)); + } + return 1; +} + +/* Returns true if the channel is full. */ +__LJMP static int hlua_channel_is_full(lua_State *L) +{ + struct channel *chn; + + MAY_LJMP(check_args(L, 1, "is_full")); + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + /* ignore the reserve, we are not on a producer side (ie in an + * applet). + */ + lua_pushboolean(L, channel_full(chn, 0)); + return 1; +} + +/* Returns true if the channel may still receive data. */ +__LJMP static int hlua_channel_may_recv(lua_State *L) +{ + struct channel *chn; + + MAY_LJMP(check_args(L, 1, "may_recv")); + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + lua_pushboolean(L, (!channel_input_closed(chn) && channel_may_recv(chn))); + return 1; +} + +/* Returns true if the channel is the response channel. */ +__LJMP static int hlua_channel_is_resp(lua_State *L) +{ + struct channel *chn; + + MAY_LJMP(check_args(L, 1, "is_resp")); + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + + lua_pushboolean(L, !!(chn->flags & CF_ISRESP)); + return 1; +} + +/* Just returns the number of bytes available in the output + * side of the buffer. This function never fails. + */ +__LJMP static int hlua_channel_get_out_len(lua_State *L) +{ + struct channel *chn; + size_t output, input; + + MAY_LJMP(check_args(L, 1, "output")); + chn = MAY_LJMP(hlua_checkchannel(L, 1)); + + output = co_data(chn); + input = ci_data(chn); + hlua_channel_filter(L, 1, chn, &output, &input); + + lua_pushinteger(L, output); + return 1; +}
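The introspection getters above map one-to-one onto Lua methods; a minimal sketch, assuming the documented names:

    core.register_action("inspect", { "tcp-req" }, function(txn)
        local chn = txn.req
        core.Debug(string.format("in=%d out=%d full=%s recv=%s resp=%s",
                                 chn:input(), chn:output(),
                                 tostring(chn:is_full()),
                                 tostring(chn:may_recv()),
                                 tostring(chn:is_resp())))
    end)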
+/* + * + * + * Class Fetches + * + * + */ + +/* Returns a struct hlua_smp if the stack entry "ud" is + * a class Fetches, otherwise it throws an error. + */ +__LJMP static struct hlua_smp *hlua_checkfetches(lua_State *L, int ud) +{ + return MAY_LJMP(hlua_checkudata(L, ud, class_fetches_ref)); +} + +/* This function creates and pushes in the stack a fetch object associated + * with the current TXN. + */ +static int hlua_fetches_new(lua_State *L, struct hlua_txn *txn, unsigned int flags) +{ + struct hlua_smp *hsmp; + + /* Check stack size. */ + if (!lua_checkstack(L, 3)) + return 0; + + /* Create the object: obj[0] = userdata. + * Note that the base of the Fetches object is the + * transaction object. + */ + lua_newtable(L); + hsmp = lua_newuserdata(L, sizeof(*hsmp)); + lua_rawseti(L, -2, 0); + + hsmp->s = txn->s; + hsmp->p = txn->p; + hsmp->dir = txn->dir; + hsmp->flags = flags; + + /* Pop a class session metatable and assign it to the userdata. */ + lua_rawgeti(L, LUA_REGISTRYINDEX, class_fetches_ref); + lua_setmetatable(L, -2); + + return 1; +} + +/* This function is a Lua binding. It is called with each sample-fetch. + * It uses a closure argument to store the associated sample-fetch. It + * returns only one argument or throws an error. An error is thrown + * only if an error is encountered during the argument parsing. If + * the "sample-fetch" function fails, nil is returned. + */ +__LJMP static int hlua_run_sample_fetch(lua_State *L) +{ + struct hlua_smp *hsmp; + struct sample_fetch *f; + struct arg args[ARGM_NBARGS + 1] = {{0}}; + int i; + struct sample smp; + + /* Get closure arguments. */ + f = lua_touserdata(L, lua_upvalueindex(1)); + + /* Get traditional arguments. */ + hsmp = MAY_LJMP(hlua_checkfetches(L, 1)); + + /* Check execution authorization. */ + if (f->use & SMP_USE_HTTP_ANY && + !(hsmp->flags & HLUA_F_MAY_USE_HTTP)) { + lua_pushfstring(L, "the sample-fetch '%s' needs an HTTP parser which " + "is not available in Lua services", f->kw); + WILL_LJMP(lua_error(L)); + } + + /* Get extra arguments. */ + for (i = 0; i < lua_gettop(L) - 1; i++) { + if (i >= ARGM_NBARGS) + break; + hlua_lua2arg(L, i + 2, &args[i]); + } + args[i].type = ARGT_STOP; + args[i].data.str.area = NULL; + + /* Check arguments. */ + MAY_LJMP(hlua_lua2arg_check(L, 2, args, f->arg_mask, hsmp->p)); + + /* Run the special args checker. */ + if (f->val_args && !f->val_args(args, NULL)) { + lua_pushfstring(L, "error in arguments"); + goto error; + } + + /* Initialise the sample. */ + memset(&smp, 0, sizeof(smp)); + + /* Run the sample fetch process. */ + smp_set_owner(&smp, hsmp->p, hsmp->s->sess, hsmp->s, hsmp->dir & SMP_OPT_DIR); + if (!f->process(args, &smp, f->kw, f->private)) { + if (hsmp->flags & HLUA_F_AS_STRING) + lua_pushstring(L, ""); + else + lua_pushnil(L); + goto end; + } + + /* Convert the returned sample in lua value. */ + if (hsmp->flags & HLUA_F_AS_STRING) + MAY_LJMP(hlua_smp2lua_str(L, &smp)); + else + MAY_LJMP(hlua_smp2lua(L, &smp)); + + end: + free_args(args); + return 1; + + error: + free_args(args); + WILL_LJMP(lua_error(L)); + return 0; /* Never reached */ +}
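On the Lua side, hlua_fetches_new() is what populates the txn.f/txn.sf objects (and their applet equivalents), each method being a closure built by hlua_run_sample_fetch(). A usage sketch, assuming the documented fetch names (dots in fetch names become underscores):

    core.register_action("who", { "http-req" }, function(txn)
        local ip   = txn.f:src()              -- typed result
        local host = txn.sf:req_fhdr("host")  -- string-safe result
        core.Info("client " .. tostring(ip) .. " asked for " .. tostring(host))
    end)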
+/* + * + * + * Class Converters + * + * + */ + +/* Returns a struct hlua_smp if the stack entry "ud" is + * a class Converters, otherwise it throws an error. + */ +__LJMP static struct hlua_smp *hlua_checkconverters(lua_State *L, int ud) +{ + return MAY_LJMP(hlua_checkudata(L, ud, class_converters_ref)); +} + +/* This function creates and pushes in the stack a Converters object + * associated with the current TXN. + */ +static int hlua_converters_new(lua_State *L, struct hlua_txn *txn, unsigned int flags) +{ + struct hlua_smp *hsmp; + + /* Check stack size. */ + if (!lua_checkstack(L, 3)) + return 0; + + /* Create the object: obj[0] = userdata. + * Note that the base of the Converters object is the + * same as the TXN object. + */ + lua_newtable(L); + hsmp = lua_newuserdata(L, sizeof(*hsmp)); + lua_rawseti(L, -2, 0); + + hsmp->s = txn->s; + hsmp->p = txn->p; + hsmp->dir = txn->dir; + hsmp->flags = flags; + + /* Pop a class stream metatable and assign it to the table. */ + lua_rawgeti(L, LUA_REGISTRYINDEX, class_converters_ref); + lua_setmetatable(L, -2); + + return 1; +} + +/* This function is a Lua binding. It is called with each converter. + * It uses a closure argument to store the associated converter. It + * returns only one argument or throws an error. An error is thrown + * only if an error is encountered during the argument parsing. If + * the converter function fails, nil is returned. + */ +__LJMP static int hlua_run_sample_conv(lua_State *L) +{ + struct hlua_smp *hsmp; + struct sample_conv *conv; + struct arg args[ARGM_NBARGS + 1] = {{0}}; + int i; + struct sample smp; + + /* Get closure arguments. */ + conv = lua_touserdata(L, lua_upvalueindex(1)); + + /* Get traditional arguments. */ + hsmp = MAY_LJMP(hlua_checkconverters(L, 1)); + + /* Get extra arguments. */ + for (i = 0; i < lua_gettop(L) - 2; i++) { + if (i >= ARGM_NBARGS) + break; + hlua_lua2arg(L, i + 3, &args[i]); + } + args[i].type = ARGT_STOP; + args[i].data.str.area = NULL; + + /* Check arguments. */ + MAY_LJMP(hlua_lua2arg_check(L, 3, args, conv->arg_mask, hsmp->p)); + + /* Run the special args checker. */ + if (conv->val_args && !conv->val_args(args, conv, "", 0, NULL)) { + hlua_pusherror(L, "error in arguments"); + goto error; + } + + /* Initialise the sample. */ + memset(&smp, 0, sizeof(smp)); + if (!hlua_lua2smp(L, 2, &smp)) { + hlua_pusherror(L, "error in the input argument"); + goto error; + } + + smp_set_owner(&smp, hsmp->p, hsmp->s->sess, hsmp->s, hsmp->dir & SMP_OPT_DIR); + + /* Apply expected cast. */ + if (!sample_casts[smp.data.type][conv->in_type]) { + hlua_pusherror(L, "invalid input argument: cannot cast '%s' to '%s'", + smp_to_type[smp.data.type], smp_to_type[conv->in_type]); + goto error; + } + if (sample_casts[smp.data.type][conv->in_type] != c_none && + !sample_casts[smp.data.type][conv->in_type](&smp)) { + hlua_pusherror(L, "error during the input argument casting"); + goto error; + } + + /* Run the sample conversion process. */ + if (!conv->process(args, &smp, conv->private)) { + if (hsmp->flags & HLUA_F_AS_STRING) + lua_pushstring(L, ""); + else + lua_pushnil(L); + goto end; + } + + /* Convert the returned sample in lua value. */ + if (hsmp->flags & HLUA_F_AS_STRING) + MAY_LJMP(hlua_smp2lua_str(L, &smp)); + else + MAY_LJMP(hlua_smp2lua(L, &smp)); + end: + free_args(args); + return 1; + + error: + free_args(args); + WILL_LJMP(lua_error(L)); + return 0; /* Never reached */ +}
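Converters follow the same pattern through txn.c and txn.sc. A hedged sketch using a converter from the stock set:

    core.register_action("norm", { "http-req" }, function(txn)
        local host = txn.sf:req_fhdr("host")
        local norm = txn.sc:lower(host)  -- string-safe converter call
        core.Debug("normalized host: " .. norm)
    end)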
+/* + * + * + * Class AppletTCP + * + * + */ + +/* Returns a struct hlua_appctx if the stack entry "ud" is + * a class AppletTCP, otherwise it throws an error. + */ +__LJMP static struct hlua_appctx *hlua_checkapplet_tcp(lua_State *L, int ud) +{ + return MAY_LJMP(hlua_checkudata(L, ud, class_applet_tcp_ref)); +} + +/* This function creates and pushes in the stack an AppletTCP object + * associated with the current TXN. + */ +static int hlua_applet_tcp_new(lua_State *L, struct appctx *ctx) +{ + struct hlua_appctx *luactx; + struct stream *s = appctx_strm(ctx); + struct proxy *p; + + ALREADY_CHECKED(s); + p = s->be; + + /* Check stack size. */ + if (!lua_checkstack(L, 3)) + return 0; + + /* Create the object: obj[0] = userdata. + * Note that the base of the Converters object is the + * same as the TXN object. + */ + lua_newtable(L); + luactx = lua_newuserdata(L, sizeof(*luactx)); + lua_rawseti(L, -2, 0); + luactx->appctx = ctx; + luactx->htxn.s = s; + luactx->htxn.p = p; + + /* Create the "f" field that contains a list of fetches. */ + lua_pushstring(L, "f"); + if (!hlua_fetches_new(L, &luactx->htxn, 0)) + return 0; + lua_settable(L, -3); + + /* Create the "sf" field that contains a list of stringsafe fetches. */ + lua_pushstring(L, "sf"); + if (!hlua_fetches_new(L, &luactx->htxn, HLUA_F_AS_STRING)) + return 0; + lua_settable(L, -3); + + /* Create the "c" field that contains a list of converters. */ + lua_pushstring(L, "c"); + if (!hlua_converters_new(L, &luactx->htxn, 0)) + return 0; + lua_settable(L, -3); + + /* Create the "sc" field that contains a list of stringsafe converters. */ + lua_pushstring(L, "sc"); + if (!hlua_converters_new(L, &luactx->htxn, HLUA_F_AS_STRING)) + return 0; + lua_settable(L, -3); + + /* Pop a class stream metatable and assign it to the table. */ + lua_rawgeti(L, LUA_REGISTRYINDEX, class_applet_tcp_ref); + lua_setmetatable(L, -2); + + return 1; +} + +__LJMP static int hlua_applet_tcp_set_var(lua_State *L) +{ + struct hlua_appctx *luactx; + struct stream *s; + const char *name; + size_t len; + struct sample smp; + + if (lua_gettop(L) < 3 || lua_gettop(L) > 4) + WILL_LJMP(luaL_error(L, "'set_var' needs between 3 and 4 arguments")); + + /* It is useless to retrieve the stream, but this function + * runs only in a stream context. + */ + luactx = MAY_LJMP(hlua_checkapplet_tcp(L, 1)); + name = MAY_LJMP(luaL_checklstring(L, 2, &len)); + s = luactx->htxn.s; + + /* Converts the third argument in a sample. */ + memset(&smp, 0, sizeof(smp)); + hlua_lua2smp(L, 3, &smp); + + /* Store the sample in a variable. We don't need to dup the smp, vars API + * already takes care of duplicating dynamic var data. + */ + smp_set_owner(&smp, s->be, s->sess, s, 0); + + if (lua_gettop(L) == 4 && lua_toboolean(L, 4)) + lua_pushboolean(L, vars_set_by_name_ifexist(name, len, &smp) != 0); + else + lua_pushboolean(L, vars_set_by_name(name, len, &smp) != 0); + + return 1; +} + +__LJMP static int hlua_applet_tcp_unset_var(lua_State *L) +{ + struct hlua_appctx *luactx; + struct stream *s; + const char *name; + size_t len; + struct sample smp; + + MAY_LJMP(check_args(L, 2, "unset_var")); + + /* It is useless to retrieve the stream, but this function + * runs only in a stream context. + */ + luactx = MAY_LJMP(hlua_checkapplet_tcp(L, 1)); + name = MAY_LJMP(luaL_checklstring(L, 2, &len)); + s = luactx->htxn.s; + + /* Unset the variable. */ + smp_set_owner(&smp, s->be, s->sess, s, 0); + lua_pushboolean(L, vars_unset_by_name_ifexist(name, len, &smp) != 0); + return 1; +} + +__LJMP static int hlua_applet_tcp_get_var(lua_State *L) +{ + struct hlua_appctx *luactx; + struct stream *s; + const char *name; + size_t len; + struct sample smp; + + MAY_LJMP(check_args(L, 2, "get_var")); + + /* It is useless to retrieve the stream, but this function + * runs only in a stream context. + */ + luactx = MAY_LJMP(hlua_checkapplet_tcp(L, 1)); + name = MAY_LJMP(luaL_checklstring(L, 2, &len)); + s = luactx->htxn.s; + + smp_set_owner(&smp, s->be, s->sess, s, 0); + if (!vars_get_by_name(name, len, &smp, NULL)) { + lua_pushnil(L); + return 1; + } + + return MAY_LJMP(hlua_smp2lua(L, &smp)); +} + +__LJMP static int hlua_applet_tcp_set_priv(lua_State *L) +{ + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_tcp(L, 1)); + struct stream *s = luactx->htxn.s; + struct hlua *hlua; + + /* Note that this hlua struct is from the session and not from the applet. */ + if (!s->hlua) + return 0; + hlua = s->hlua; + + MAY_LJMP(check_args(L, 2, "set_priv")); + + /* Remove previous value. */ + luaL_unref(L, LUA_REGISTRYINDEX, hlua->Mref); + + /* Get and store new value. */ + lua_pushvalue(L, 2); /* Copy the element 2 at the top of the stack. */ + hlua->Mref = luaL_ref(L, LUA_REGISTRYINDEX); /* pop the previously pushed value.
 */ + + return 0; +} + +__LJMP static int hlua_applet_tcp_get_priv(lua_State *L) +{ + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_tcp(L, 1)); + struct stream *s = luactx->htxn.s; + struct hlua *hlua; + + /* Note that this hlua struct is from the session and not from the applet. */ + if (!s->hlua) { + lua_pushnil(L); + return 1; + } + hlua = s->hlua; + + /* Push configuration index in the stack. */ + lua_rawgeti(L, LUA_REGISTRYINDEX, hlua->Mref); + + return 1; +} + +/* If the expected data is not yet available, it yields. This function + * consumes the data in the buffer. It returns a string containing the + * data. This string can be empty. + */ +__LJMP static int hlua_applet_tcp_getline_yield(lua_State *L, int status, lua_KContext ctx) +{ + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_tcp(L, 1)); + struct stconn *sc = appctx_sc(luactx->appctx); + int ret; + const char *blk1; + size_t len1; + const char *blk2; + size_t len2; + + /* Read the maximum amount of data available. */ + ret = co_getline_nc(sc_oc(sc), &blk1, &len1, &blk2, &len2); + + /* Data not yet available. Return a yield. */ + if (ret == 0) { + applet_need_more_data(luactx->appctx); + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_applet_tcp_getline_yield, TICK_ETERNITY, 0)); + } + + /* End of data: commit the total strings and return. */ + if (ret < 0) { + luaL_pushresult(&luactx->b); + return 1; + } + + /* Ensure that the block 2 length is usable. */ + if (ret == 1) + len2 = 0; + + /* No maximum line length is enforced here, append both blocks as they are. */ + luaL_addlstring(&luactx->b, blk1, len1); + luaL_addlstring(&luactx->b, blk2, len2); + + /* Consume input channel output buffer data. */ + co_skip(sc_oc(sc), len1 + len2); + luaL_pushresult(&luactx->b); + return 1; +} + +/* Check arguments for the function "hlua_applet_tcp_getline_yield". */ +__LJMP static int hlua_applet_tcp_getline(lua_State *L) +{ + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_tcp(L, 1)); + + /* Initialise the string concatenation. */ + luaL_buffinit(L, &luactx->b); + + return MAY_LJMP(hlua_applet_tcp_getline_yield(L, 0, 0)); +} + +/* If the expected data is not yet available, it yields. This function + * consumes the data in the buffer. It returns a string containing the + * data. This string can be empty. + */ +__LJMP static int hlua_applet_tcp_recv_yield(lua_State *L, int status, lua_KContext ctx) +{ + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_tcp(L, 1)); + struct stconn *sc = appctx_sc(luactx->appctx); + size_t len = MAY_LJMP(luaL_checkinteger(L, 2)); + int ret; + const char *blk1; + size_t len1; + const char *blk2; + size_t len2; + + /* Read the maximum amount of data available. */ + ret = co_getblk_nc(sc_oc(sc), &blk1, &len1, &blk2, &len2); + + /* Data not yet available. Return a yield. */ + if (ret == 0) { + applet_need_more_data(luactx->appctx); + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_applet_tcp_recv_yield, TICK_ETERNITY, 0)); + } + + /* End of data: commit the total strings and return. */ + if (ret < 0) { + luaL_pushresult(&luactx->b); + return 1; + } + + /* Ensure that the block 2 length is usable. */ + if (ret == 1) + len2 = 0; + + if (len == -1) { + + /* If len == -1, concatenate all the available data and + * yield because we want to get all the data until + * the end of data stream.
+ */ + luaL_addlstring(&luactx->b, blk1, len1); + luaL_addlstring(&luactx->b, blk2, len2); + co_skip(sc_oc(sc), len1 + len2); + applet_need_more_data(luactx->appctx); + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_applet_tcp_recv_yield, TICK_ETERNITY, 0)); + + } else { + + /* Copy the first block, capping to the length required. */ + if (len1 > len) + len1 = len; + luaL_addlstring(&luactx->b, blk1, len1); + len -= len1; + + /* Copy the second block. */ + if (len2 > len) + len2 = len; + luaL_addlstring(&luactx->b, blk2, len2); + len -= len2; + + /* Consume input channel output buffer data. */ + co_skip(sc_oc(sc), len1 + len2); + + /* If there is no other data available, yield waiting for new data. */ + if (len > 0) { + lua_pushinteger(L, len); + lua_replace(L, 2); + applet_need_more_data(luactx->appctx); + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_applet_tcp_recv_yield, TICK_ETERNITY, 0)); + } + + /* return the result. */ + luaL_pushresult(&luactx->b); + return 1; + } + + /* we never execute this */ + hlua_pusherror(L, "Lua: internal error"); + WILL_LJMP(lua_error(L)); + return 0; +} + +/* Check arguments for the function "hlua_applet_tcp_recv_yield". */ +__LJMP static int hlua_applet_tcp_recv(lua_State *L) +{ + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_tcp(L, 1)); + int len = -1; + + if (lua_gettop(L) > 2) + WILL_LJMP(luaL_error(L, "The 'recv' function requires between 1 and 2 arguments.")); + if (lua_gettop(L) >= 2) { + len = MAY_LJMP(luaL_checkinteger(L, 2)); + lua_pop(L, 1); + } + + /* Confirm or set the required length */ + lua_pushinteger(L, len); + + /* Initialise the string concatenation. */ + luaL_buffinit(L, &luactx->b); + + return MAY_LJMP(hlua_applet_tcp_recv_yield(L, 0, 0)); +} + +/* Append data in the output side of the buffer. This data is immediately + * sent. The function returns the amount of data written. If the buffer + * cannot contain the data, the function yields. The function returns -1 + * if the channel is closed. + */ +__LJMP static int hlua_applet_tcp_send_yield(lua_State *L, int status, lua_KContext ctx) +{ + size_t len; + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_tcp(L, 1)); + const char *str = MAY_LJMP(luaL_checklstring(L, 2, &len)); + int l = MAY_LJMP(luaL_checkinteger(L, 3)); + struct stconn *sc = appctx_sc(luactx->appctx); + struct channel *chn = sc_ic(sc); + int max; + + /* Get the max amount of data which can be written as input in the channel. */ + max = channel_recv_max(chn); + if (max > (len - l)) + max = len - l; + + /* Copy data. */ + ci_putblk(chn, str + l, max); + + /* update counters. */ + l += max; + lua_pop(L, 1); + lua_pushinteger(L, l); + + /* If some data is not sent, declare the situation to the + * applet and return a yield. + */ + if (l < len) { + sc_need_room(sc, channel_recv_max(chn) + 1); + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_applet_tcp_send_yield, TICK_ETERNITY, 0)); + } + + return 1; +} + +/* Just a wrapper of "hlua_applet_tcp_send_yield". This wrapper permits + * yielding the Lua process, and resuming it without checking the + * input arguments. + */ +__LJMP static int hlua_applet_tcp_send(lua_State *L) +{ + MAY_LJMP(check_args(L, 2, "send")); + lua_pushinteger(L, 0); + + return MAY_LJMP(hlua_applet_tcp_send_yield(L, 0, 0)); +}
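Put together, the AppletTCP methods above support line-oriented services. A minimal echo service sketch, assuming the documented core.register_service() API:

    core.register_service("echo", "tcp", function(applet)
        while true do
            local line = applet:getline()  -- yields until a full line or end of stream
            if not line or line == "" then
                break
            end
            applet:send(line)              -- yields until everything is written
        end
    end)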
+/* + * + * + * Class AppletHTTP + * + * + */ + +/* Returns a struct hlua_appctx if the stack entry "ud" is + * a class AppletHTTP, otherwise it throws an error. + */ +__LJMP static struct hlua_appctx *hlua_checkapplet_http(lua_State *L, int ud) +{ + return MAY_LJMP(hlua_checkudata(L, ud, class_applet_http_ref)); +} + +/* This function creates and pushes in the stack an AppletHTTP object + * associated with the current TXN. + * It relies on the caller to have already reserved the room in ctx->svcctx + * for the local storage of hlua_http_ctx. + */ +static int hlua_applet_http_new(lua_State *L, struct appctx *ctx) +{ + struct hlua_http_ctx *http_ctx = ctx->svcctx; + struct hlua_appctx *luactx; + struct hlua_txn htxn; + struct stream *s = appctx_strm(ctx); + struct proxy *px = s->be; + struct htx *htx; + struct htx_blk *blk; + struct htx_sl *sl; + struct ist path; + unsigned long long len = 0; + int32_t pos; + struct http_uri_parser parser; + + /* Check stack size. */ + if (!lua_checkstack(L, 3)) + return 0; + + /* Create the object: obj[0] = userdata. + * Note that the base of the Converters object is the + * same as the TXN object. + */ + lua_newtable(L); + luactx = lua_newuserdata(L, sizeof(*luactx)); + lua_rawseti(L, -2, 0); + luactx->appctx = ctx; + http_ctx->status = 200; /* Default status code returned. */ + http_ctx->reason = NULL; /* Use default reason based on status */ + luactx->htxn.s = s; + luactx->htxn.p = px; + + /* Create the "f" field that contains a list of fetches. */ + lua_pushstring(L, "f"); + if (!hlua_fetches_new(L, &luactx->htxn, 0)) + return 0; + lua_settable(L, -3); + + /* Create the "sf" field that contains a list of stringsafe fetches. */ + lua_pushstring(L, "sf"); + if (!hlua_fetches_new(L, &luactx->htxn, HLUA_F_AS_STRING)) + return 0; + lua_settable(L, -3); + + /* Create the "c" field that contains a list of converters. */ + lua_pushstring(L, "c"); + if (!hlua_converters_new(L, &luactx->htxn, 0)) + return 0; + lua_settable(L, -3); + + /* Create the "sc" field that contains a list of stringsafe converters. */ + lua_pushstring(L, "sc"); + if (!hlua_converters_new(L, &luactx->htxn, HLUA_F_AS_STRING)) + return 0; + lua_settable(L, -3); + + htx = htxbuf(&s->req.buf); + blk = htx_get_first_blk(htx); + BUG_ON(!blk || htx_get_blk_type(blk) != HTX_BLK_REQ_SL); + sl = htx_get_blk_ptr(htx, blk); + + /* Stores the request method. */ + lua_pushstring(L, "method"); + lua_pushlstring(L, HTX_SL_REQ_MPTR(sl), HTX_SL_REQ_MLEN(sl)); + lua_settable(L, -3); + + /* Stores the http version. */ + lua_pushstring(L, "version"); + lua_pushlstring(L, HTX_SL_REQ_VPTR(sl), HTX_SL_REQ_VLEN(sl)); + lua_settable(L, -3); + + /* Creates an array of headers. hlua_http_get_headers() creates and pushes + * the array on the top of the stack. + */ + lua_pushstring(L, "headers"); + htxn.s = s; + htxn.p = px; + htxn.dir = SMP_OPT_DIR_REQ; + if (!hlua_http_get_headers(L, &htxn.s->txn->req)) + return 0; + lua_settable(L, -3); + + parser = http_uri_parser_init(htx_sl_req_uri(sl)); + path = http_parse_path(&parser); + if (isttest(path)) { + char *p, *q, *end; + + p = path.ptr; + end = istend(path); + q = p; + while (q < end && *q != '?') + q++; + + /* Stores the request path. */ + lua_pushstring(L, "path"); + lua_pushlstring(L, p, q - p); + lua_settable(L, -3); + + /* Stores the query string.
 */ + lua_pushstring(L, "qs"); + if (*q == '?') + q++; + lua_pushlstring(L, q, end - q); + lua_settable(L, -3); + } + + for (pos = htx_get_first(htx); pos != -1; pos = htx_get_next(htx, pos)) { + struct htx_blk *blk = htx_get_blk(htx, pos); + enum htx_blk_type type = htx_get_blk_type(blk); + + if (type == HTX_BLK_TLR || type == HTX_BLK_EOT) + break; + if (type == HTX_BLK_DATA) + len += htx_get_blksz(blk); + } + if (htx->extra != HTX_UNKOWN_PAYLOAD_LENGTH) + len += htx->extra; + + /* Stores the request body length. */ + lua_pushstring(L, "length"); + lua_pushinteger(L, len); + lua_settable(L, -3); + + /* Create an empty array for the HTTP response headers. */ + lua_pushstring(L, "response"); + lua_newtable(L); + lua_settable(L, -3); + + /* Pop a class stream metatable and assign it to the table. */ + lua_rawgeti(L, LUA_REGISTRYINDEX, class_applet_http_ref); + lua_setmetatable(L, -2); + + return 1; +} + +__LJMP static int hlua_applet_http_set_var(lua_State *L) +{ + struct hlua_appctx *luactx; + struct stream *s; + const char *name; + size_t len; + struct sample smp; + + if (lua_gettop(L) < 3 || lua_gettop(L) > 4) + WILL_LJMP(luaL_error(L, "'set_var' needs between 3 and 4 arguments")); + + /* It is useless to retrieve the stream, but this function + * runs only in a stream context. + */ + luactx = MAY_LJMP(hlua_checkapplet_http(L, 1)); + name = MAY_LJMP(luaL_checklstring(L, 2, &len)); + s = luactx->htxn.s; + + /* Converts the third argument in a sample. */ + memset(&smp, 0, sizeof(smp)); + hlua_lua2smp(L, 3, &smp); + + /* Store the sample in a variable. We don't need to dup the smp, vars API + * already takes care of duplicating dynamic var data. + */ + smp_set_owner(&smp, s->be, s->sess, s, 0); + + if (lua_gettop(L) == 4 && lua_toboolean(L, 4)) + lua_pushboolean(L, vars_set_by_name_ifexist(name, len, &smp) != 0); + else + lua_pushboolean(L, vars_set_by_name(name, len, &smp) != 0); + + return 1; +} + +__LJMP static int hlua_applet_http_unset_var(lua_State *L) +{ + struct hlua_appctx *luactx; + struct stream *s; + const char *name; + size_t len; + struct sample smp; + + MAY_LJMP(check_args(L, 2, "unset_var")); + + /* It is useless to retrieve the stream, but this function + * runs only in a stream context. + */ + luactx = MAY_LJMP(hlua_checkapplet_http(L, 1)); + name = MAY_LJMP(luaL_checklstring(L, 2, &len)); + s = luactx->htxn.s; + + /* Unset the variable. */ + smp_set_owner(&smp, s->be, s->sess, s, 0); + lua_pushboolean(L, vars_unset_by_name_ifexist(name, len, &smp) != 0); + return 1; +} + +__LJMP static int hlua_applet_http_get_var(lua_State *L) +{ + struct hlua_appctx *luactx; + struct stream *s; + const char *name; + size_t len; + struct sample smp; + + MAY_LJMP(check_args(L, 2, "get_var")); + + /* It is useless to retrieve the stream, but this function + * runs only in a stream context. + */ + luactx = MAY_LJMP(hlua_checkapplet_http(L, 1)); + name = MAY_LJMP(luaL_checklstring(L, 2, &len)); + s = luactx->htxn.s; + + smp_set_owner(&smp, s->be, s->sess, s, 0); + if (!vars_get_by_name(name, len, &smp, NULL)) { + lua_pushnil(L); + return 1; + } + + return MAY_LJMP(hlua_smp2lua(L, &smp)); +}
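The variable and private-storage accessors behave like their TXN counterparts. A sketch, assuming the documented names ("txn.who" is an arbitrary variable name; response handling is omitted here):

    core.register_service("hello", "http", function(applet)
        applet:set_var("txn.who", "lua")  -- like http-request set-var(txn.who)
        applet:set_priv({ visits = 1 })   -- any Lua value, tied to the stream
        local who  = applet:get_var("txn.who")
        local priv = applet:get_priv()
    end)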
+__LJMP static int hlua_applet_http_set_priv(lua_State *L) +{ + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_http(L, 1)); + struct stream *s = luactx->htxn.s; + struct hlua *hlua; + + /* Note that this hlua struct is from the session and not from the applet. */ + if (!s->hlua) + return 0; + hlua = s->hlua; + + MAY_LJMP(check_args(L, 2, "set_priv")); + + /* Remove previous value. */ + luaL_unref(L, LUA_REGISTRYINDEX, hlua->Mref); + + /* Get and store new value. */ + lua_pushvalue(L, 2); /* Copy the element 2 at the top of the stack. */ + hlua->Mref = luaL_ref(L, LUA_REGISTRYINDEX); /* pop the previously pushed value. */ + + return 0; +} + +__LJMP static int hlua_applet_http_get_priv(lua_State *L) +{ + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_http(L, 1)); + struct stream *s = luactx->htxn.s; + struct hlua *hlua; + + /* Note that this hlua struct is from the session and not from the applet. */ + if (!s->hlua) { + lua_pushnil(L); + return 1; + } + hlua = s->hlua; + + /* Push configuration index in the stack. */ + lua_rawgeti(L, LUA_REGISTRYINDEX, hlua->Mref); + + return 1; +} + +/* If the expected data is not yet available, it yields. This function + * consumes the data in the buffer. It returns a string containing the + * data. This string can be empty. + */ +__LJMP static int hlua_applet_http_getline_yield(lua_State *L, int status, lua_KContext ctx) +{ + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_http(L, 1)); + struct stconn *sc = appctx_sc(luactx->appctx); + struct channel *req = sc_oc(sc); + struct htx *htx; + struct htx_blk *blk; + size_t count; + int stop = 0; + + htx = htx_from_buf(&req->buf); + count = co_data(req); + blk = htx_get_first_blk(htx); + + while (count && !stop && blk) { + enum htx_blk_type type = htx_get_blk_type(blk); + uint32_t sz = htx_get_blksz(blk); + struct ist v; + uint32_t vlen; + char *nl; + + vlen = sz; + if (vlen > count) { + if (type != HTX_BLK_DATA) + break; + vlen = count; + } + + switch (type) { + case HTX_BLK_UNUSED: + break; + + case HTX_BLK_DATA: + v = htx_get_blk_value(htx, blk); + v.len = vlen; + nl = istchr(v, '\n'); + if (nl != NULL) { + stop = 1; + vlen = nl - v.ptr + 1; + } + luaL_addlstring(&luactx->b, v.ptr, vlen); + break; + + case HTX_BLK_TLR: + case HTX_BLK_EOT: + stop = 1; + break; + + default: + break; + } + + c_rew(req, vlen); + count -= vlen; + if (sz == vlen) + blk = htx_remove_blk(htx, blk); + else { + htx_cut_data_blk(htx, blk, vlen); + break; + } + } + + /* The message was fully consumed and no more data are expected + * (EOM flag set). + */ + if (htx_is_empty(htx) && (sc_opposite(sc)->flags & SC_FL_EOI)) + stop = 1; + + htx_to_buf(htx, &req->buf); + if (!stop) { + applet_need_more_data(luactx->appctx); + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_applet_http_getline_yield, TICK_ETERNITY, 0)); + } + + /* return the result. */ + luaL_pushresult(&luactx->b); + return 1; +} + + +/* Check arguments for the function "hlua_applet_http_getline_yield". */ +__LJMP static int hlua_applet_http_getline(lua_State *L) +{ + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_http(L, 1)); + + /* Initialise the string concatenation. */ + luaL_buffinit(L, &luactx->b); + + return MAY_LJMP(hlua_applet_http_getline_yield(L, 0, 0)); +} + +/* If the expected data is not yet available, it yields. This function + * consumes the data in the buffer. It returns a string containing the + * data. This string can be empty.
+ */ +__LJMP static int hlua_applet_http_recv_yield(lua_State *L, int status, lua_KContext ctx) +{ + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_http(L, 1)); + struct stconn *sc = appctx_sc(luactx->appctx); + struct channel *req = sc_oc(sc); + struct htx *htx; + struct htx_blk *blk; + size_t count; + int len; + + htx = htx_from_buf(&req->buf); + len = MAY_LJMP(luaL_checkinteger(L, 2)); + count = co_data(req); + blk = htx_get_head_blk(htx); + while (count && len && blk) { + enum htx_blk_type type = htx_get_blk_type(blk); + uint32_t sz = htx_get_blksz(blk); + struct ist v; + uint32_t vlen; + + vlen = sz; + if (len > 0 && vlen > len) + vlen = len; + if (vlen > count) { + if (type != HTX_BLK_DATA) + break; + vlen = count; + } + + switch (type) { + case HTX_BLK_UNUSED: + break; + + case HTX_BLK_DATA: + v = htx_get_blk_value(htx, blk); + luaL_addlstring(&luactx->b, v.ptr, vlen); + break; + + case HTX_BLK_TLR: + case HTX_BLK_EOT: + len = 0; + break; + + default: + break; + } + + c_rew(req, vlen); + count -= vlen; + if (len > 0) + len -= vlen; + if (sz == vlen) + blk = htx_remove_blk(htx, blk); + else { + htx_cut_data_blk(htx, blk, vlen); + break; + } + } + + /* The message was fully consumed and no more data are expected + * (EOM flag set). + */ + if (htx_is_empty(htx) && (sc_opposite(sc)->flags & SC_FL_EOI)) + len = 0; + + htx_to_buf(htx, &req->buf); + + /* If no other data is available, yield waiting for new data. */ + if (len) { + if (len > 0) { + lua_pushinteger(L, len); + lua_replace(L, 2); + } + applet_need_more_data(luactx->appctx); + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_applet_http_recv_yield, TICK_ETERNITY, 0)); + } + + /* return the result. */ + luaL_pushresult(&luactx->b); + return 1; +} + +/* Check arguments for the function "hlua_applet_http_recv_yield". */ +__LJMP static int hlua_applet_http_recv(lua_State *L) +{ + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_http(L, 1)); + int len = -1; + + /* Check arguments. */ + if (lua_gettop(L) > 2) + WILL_LJMP(luaL_error(L, "The 'recv' function requires between 1 and 2 arguments.")); + if (lua_gettop(L) >= 2) { + len = MAY_LJMP(luaL_checkinteger(L, 2)); + lua_pop(L, 1); + } + + lua_pushinteger(L, len); + + /* Initialise the string concatenation. */ + luaL_buffinit(L, &luactx->b); + + return MAY_LJMP(hlua_applet_http_recv_yield(L, 0, 0)); +} + +/* Append data in the output side of the buffer. This data is immediately + * sent. The function returns the amount of data written. If the buffer + * cannot contain the data, the function yields. The function returns -1 + * if the channel is closed. + */ +__LJMP static int hlua_applet_http_send_yield(lua_State *L, int status, lua_KContext ctx) +{ + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_http(L, 1)); + struct stconn *sc = appctx_sc(luactx->appctx); + struct channel *res = sc_ic(sc); + struct htx *htx = htx_from_buf(&res->buf); + const char *data; + size_t len; + int l = MAY_LJMP(luaL_checkinteger(L, 3)); + int max; + + max = htx_get_max_blksz(htx, channel_htx_recv_max(res, htx)); + if (!max) + goto snd_yield; + + data = MAY_LJMP(luaL_checklstring(L, 2, &len)); + + /* Get the max amount of data which can be written as input in the channel. */ + if (max > (len - l)) + max = len - l; + + /* Copy data. */ + max = htx_add_data(htx, ist2(data + l, max)); + channel_add_input(res, max); + + /* update counters. */ + l += max; + lua_pop(L, 1); + lua_pushinteger(L, l); + + /* If some data is not sent, declare the situation to the + * applet and return a yield.
+ */ + if (l < len) { + snd_yield: + htx_to_buf(htx, &res->buf); + sc_need_room(sc, channel_recv_max(res) + 1); + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_applet_http_send_yield, TICK_ETERNITY, 0)); + } + + htx_to_buf(htx, &res->buf); + return 1; +} + +/* Just a wrapper of "hlua_applet_http_send_yield". This wrapper permits + * yielding the Lua process, and resuming it without checking the + * input arguments. + */ +__LJMP static int hlua_applet_http_send(lua_State *L) +{ + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_http(L, 1)); + struct hlua_http_ctx *http_ctx = luactx->appctx->svcctx; + + /* We want to send some data. Headers must be sent. */ + if (!(http_ctx->flags & APPLET_HDR_SENT)) { + hlua_pusherror(L, "Lua: 'send' you must call start_response() before sending data."); + WILL_LJMP(lua_error(L)); + } + + /* This integer is used for following the amount of data sent. */ + lua_pushinteger(L, 0); + + return MAY_LJMP(hlua_applet_http_send_yield(L, 0, 0)); +} + +__LJMP static int hlua_applet_http_addheader(lua_State *L) +{ + const char *name; + int ret; + + MAY_LJMP(hlua_checkapplet_http(L, 1)); + name = MAY_LJMP(luaL_checkstring(L, 2)); + MAY_LJMP(luaL_checkstring(L, 3)); + + /* Push in the stack the "response" entry. */ + ret = lua_getfield(L, 1, "response"); + if (ret != LUA_TTABLE) { + hlua_pusherror(L, "Lua: 'add_header' internal error: AppletHTTP['response'] " + "is expected as an array. %s found", lua_typename(L, ret)); + WILL_LJMP(lua_error(L)); + } + + /* Check if the header is already registered. If it is not + * the case, register it. + */ + ret = lua_getfield(L, -1, name); + if (ret == LUA_TNIL) { + + /* Entry not found. */ + lua_pop(L, 1); /* remove the nil. The "response" table is the top of the stack. */ + + /* Insert the new header name in the array in the top of the stack. + * It leaves the new array on the top of the stack. + */ + lua_newtable(L); + lua_pushvalue(L, 2); + lua_pushvalue(L, -2); + lua_settable(L, -4); + + } else if (ret != LUA_TTABLE) { + + /* corruption error. */ + hlua_pusherror(L, "Lua: 'add_header' internal error: AppletHTTP['response']['%s'] " + "is expected as an array. %s found", name, lua_typename(L, ret)); + WILL_LJMP(lua_error(L)); + } + + /* Now the top of the stack is an array of values. We push + * the header value as a new entry. + */ + lua_pushvalue(L, 3); + ret = lua_rawlen(L, -2); + lua_rawseti(L, -2, ret + 1); + lua_pushboolean(L, 1); + return 1; +} + +__LJMP static int hlua_applet_http_status(lua_State *L) +{ + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_http(L, 1)); + int status = MAY_LJMP(luaL_checkinteger(L, 2)); + const char *reason = MAY_LJMP(luaL_optlstring(L, 3, NULL, NULL)); + struct hlua_http_ctx *http_ctx = luactx->appctx->svcctx; + + if (status < 100 || status > 599) { + lua_pushboolean(L, 0); + return 1; + } + + http_ctx->status = status; + http_ctx->reason = reason; + lua_pushboolean(L, 1); + return 1; +} + + +__LJMP static int hlua_applet_http_send_response(lua_State *L) +{ + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_http(L, 1)); + struct hlua_http_ctx *http_ctx = luactx->appctx->svcctx; + struct stconn *sc = appctx_sc(luactx->appctx); + struct channel *res = sc_ic(sc); + struct htx *htx; + struct htx_sl *sl; + struct h1m h1m; + const char *status, *reason; + const char *name, *value; + size_t nlen, vlen; + unsigned int flags; + + /* Send the message at once. */ + htx = htx_from_buf(&res->buf); + h1m_init_res(&h1m); + + /* Use the same HTTP version as the request.
 */ + status = ultoa_r(http_ctx->status, trash.area, trash.size); + reason = http_ctx->reason; + if (reason == NULL) + reason = http_get_reason(http_ctx->status); + if (http_ctx->flags & APPLET_HTTP11) { + flags = (HTX_SL_F_IS_RESP|HTX_SL_F_VER_11); + sl = htx_add_stline(htx, HTX_BLK_RES_SL, flags, ist("HTTP/1.1"), ist(status), ist(reason)); + } + else { + flags = HTX_SL_F_IS_RESP; + sl = htx_add_stline(htx, HTX_BLK_RES_SL, flags, ist("HTTP/1.0"), ist(status), ist(reason)); + } + if (!sl) { + hlua_pusherror(L, "Lua applet http '%s': Failed to create response.\n", + luactx->appctx->rule->arg.hlua_rule->fcn->name); + WILL_LJMP(lua_error(L)); + } + sl->info.res.status = http_ctx->status; + + /* Get the array associated with the field "response" in the object AppletHTTP. */ + if (lua_getfield(L, 1, "response") != LUA_TTABLE) { + hlua_pusherror(L, "Lua applet http '%s': AppletHTTP['response'] missing.\n", + luactx->appctx->rule->arg.hlua_rule->fcn->name); + WILL_LJMP(lua_error(L)); + } + + /* Browse the list of headers. */ + lua_pushnil(L); + while(lua_next(L, -2) != 0) { + /* We expect a string as -2. */ + if (lua_type(L, -2) != LUA_TSTRING) { + hlua_pusherror(L, "Lua applet http '%s': AppletHTTP['response'][] element must be a string. got %s.\n", + luactx->appctx->rule->arg.hlua_rule->fcn->name, + lua_typename(L, lua_type(L, -2))); + WILL_LJMP(lua_error(L)); + } + name = lua_tolstring(L, -2, &nlen); + + /* We expect an array as -1. */ + if (lua_type(L, -1) != LUA_TTABLE) { + hlua_pusherror(L, "Lua applet http '%s': AppletHTTP['response']['%s'] element must be an table. got %s.\n", + luactx->appctx->rule->arg.hlua_rule->fcn->name, + name, + lua_typename(L, lua_type(L, -1))); + WILL_LJMP(lua_error(L)); + } + + /* Browse the table which is on the top of the stack. */ + lua_pushnil(L); + while(lua_next(L, -2) != 0) { + int id; + + /* We expect a number as -2. */ + if (lua_type(L, -2) != LUA_TNUMBER) { + hlua_pusherror(L, "Lua applet http '%s': AppletHTTP['response']['%s'][] element must be a number. got %s.\n", + luactx->appctx->rule->arg.hlua_rule->fcn->name, + name, + lua_typename(L, lua_type(L, -2))); + WILL_LJMP(lua_error(L)); + } + id = lua_tointeger(L, -2); + + /* We expect a string as -1. */ + if (lua_type(L, -1) != LUA_TSTRING) { + hlua_pusherror(L, "Lua applet http '%s': AppletHTTP['response']['%s'][%d] element must be a string. got %s.\n", + luactx->appctx->rule->arg.hlua_rule->fcn->name, + name, id, + lua_typename(L, lua_type(L, -1))); + WILL_LJMP(lua_error(L)); + } + value = lua_tolstring(L, -1, &vlen); + + /* Simple protocol checks.
 */ + if (isteqi(ist2(name, nlen), ist("transfer-encoding"))) { + int ret; + + ret = h1_parse_xfer_enc_header(&h1m, ist2(value, vlen)); + if (ret < 0) { + hlua_pusherror(L, "Lua applet http '%s': Invalid '%s' header.\n", + luactx->appctx->rule->arg.hlua_rule->fcn->name, + name); + WILL_LJMP(lua_error(L)); + } + else if (ret == 0) + goto next; /* Skip it */ + } + else if (isteqi(ist2(name, nlen), ist("content-length"))) { + struct ist v = ist2(value, vlen); + int ret; + + ret = h1_parse_cont_len_header(&h1m, &v); + if (ret < 0) { + hlua_pusherror(L, "Lua applet http '%s': Invalid '%s' header.\n", + luactx->appctx->rule->arg.hlua_rule->fcn->name, + name); + WILL_LJMP(lua_error(L)); + } + else if (ret == 0) + goto next; /* Skip it */ + } + + /* Add a new header */ + if (!htx_add_header(htx, ist2(name, nlen), ist2(value, vlen))) { + hlua_pusherror(L, "Lua applet http '%s': Failed to add header '%s' in the response.\n", + luactx->appctx->rule->arg.hlua_rule->fcn->name, + name); + WILL_LJMP(lua_error(L)); + } + next: + /* Remove the array from the stack, and get next element with a remaining string. */ + lua_pop(L, 1); + } + + /* Remove the array from the stack, and get next element with a remaining string. */ + lua_pop(L, 1); + } + + if (h1m.flags & H1_MF_CHNK) + h1m.flags &= ~H1_MF_CLEN; + if (h1m.flags & (H1_MF_CLEN|H1_MF_CHNK)) + h1m.flags |= H1_MF_XFER_LEN; + + /* Set HTX start-line flags */ + if (h1m.flags & H1_MF_XFER_ENC) + flags |= HTX_SL_F_XFER_ENC; + if (h1m.flags & H1_MF_XFER_LEN) { + flags |= HTX_SL_F_XFER_LEN; + if (h1m.flags & H1_MF_CHNK) + flags |= HTX_SL_F_CHNK; + else if (h1m.flags & H1_MF_CLEN) + flags |= HTX_SL_F_CLEN; + if (h1m.body_len == 0) + flags |= HTX_SL_F_BODYLESS; + } + sl->flags |= flags; + + /* If we don't have a content-length set, and the HTTP version is 1.1 + * and the status code implies the presence of a message body, we must + * announce a transfer encoding chunked. This is required by HAProxy + * for keep-alive compliance. If the applet announces a transfer-encoding + * chunked itself, don't do anything. + */ + if ((flags & (HTX_SL_F_VER_11|HTX_SL_F_XFER_LEN)) == HTX_SL_F_VER_11 && + http_ctx->status >= 200 && http_ctx->status != 204 && http_ctx->status != 304) { + /* Add a new header */ + sl->flags |= (HTX_SL_F_XFER_ENC|H1_MF_CHNK|H1_MF_XFER_LEN); + if (!htx_add_header(htx, ist("transfer-encoding"), ist("chunked"))) { + hlua_pusherror(L, "Lua applet http '%s': Failed to add header 'transfer-encoding' in the response.\n", + luactx->appctx->rule->arg.hlua_rule->fcn->name); + WILL_LJMP(lua_error(L)); + } + } + + /* Finalize headers. */ + if (!htx_add_endof(htx, HTX_BLK_EOH)) { + hlua_pusherror(L, "Lua applet http '%s': Failed create the response.\n", + luactx->appctx->rule->arg.hlua_rule->fcn->name); + WILL_LJMP(lua_error(L)); + } + + if (htx_used_space(htx) > b_size(&res->buf) - global.tune.maxrewrite) { + b_reset(&res->buf); + hlua_pusherror(L, "Lua: 'start_response': response header block too big"); + WILL_LJMP(lua_error(L)); + } + + htx_to_buf(htx, &res->buf); + channel_add_input(res, htx->data); + + /* Headers sent, set the flag. */ + http_ctx->flags |= APPLET_HDR_SENT; + return 0; + +}
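The full response workflow implemented above (status, headers, start_response(), then body) looks like this from Lua; this mirrors the documented AppletHTTP example, so only the service name is illustrative:

    core.register_service("hello-world", "http", function(applet)
        local response = "Hello World!\n"
        applet:set_status(200)
        applet:add_header("content-length", string.format("%d", #response))
        applet:add_header("content-type", "text/plain")
        applet:start_response()  -- builds and sends the status line + headers
        applet:send(response)    -- must come after start_response()
    end)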
+/* We will build the status line and the headers of the HTTP response. + * We will try to send everything at once; if it is not possible, we give + * back the hand, waiting for more room. + */ +__LJMP static int hlua_applet_http_start_response_yield(lua_State *L, int status, lua_KContext ctx) +{ + struct hlua_appctx *luactx = MAY_LJMP(hlua_checkapplet_http(L, 1)); + struct stconn *sc = appctx_sc(luactx->appctx); + struct channel *res = sc_ic(sc); + + if (co_data(res)) { + sc_need_room(sc, -1); + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_applet_http_start_response_yield, TICK_ETERNITY, 0)); + } + return MAY_LJMP(hlua_applet_http_send_response(L)); +} + + +__LJMP static int hlua_applet_http_start_response(lua_State *L) +{ + return MAY_LJMP(hlua_applet_http_start_response_yield(L, 0, 0)); +} + +/* + * + * + * Class HTTP + * + * + */ + +/* Returns a struct hlua_txn if the stack entry "ud" is + * a class HTTP, otherwise it throws an error. + */ +__LJMP static struct hlua_txn *hlua_checkhttp(lua_State *L, int ud) +{ + return MAY_LJMP(hlua_checkudata(L, ud, class_http_ref)); +} + +/* This function creates and pushes in the stack an HTTP object + * associated with the current TXN. + */ +static int hlua_http_new(lua_State *L, struct hlua_txn *txn) +{ + struct hlua_txn *htxn; + + /* Check stack size. */ + if (!lua_checkstack(L, 3)) + return 0; + + /* Create the object: obj[0] = userdata. + * Note that the base of the Converters object is the + * same as the TXN object. + */ + lua_newtable(L); + htxn = lua_newuserdata(L, sizeof(*htxn)); + lua_rawseti(L, -2, 0); + + htxn->s = txn->s; + htxn->p = txn->p; + htxn->dir = txn->dir; + htxn->flags = txn->flags; + + /* Pop a class stream metatable and assign it to the table. */ + lua_rawgeti(L, LUA_REGISTRYINDEX, class_http_ref); + lua_setmetatable(L, -2); + + return 1; +} + +/* This function creates and returns an array containing the status-line + * elements. This function does not fail. + */ +__LJMP static int hlua_http_get_stline(lua_State *L, struct htx_sl *sl) +{ + /* Create the table. */ + lua_newtable(L); + + if (sl->flags & HTX_SL_F_IS_RESP) { + lua_pushstring(L, "version"); + lua_pushlstring(L, HTX_SL_RES_VPTR(sl), HTX_SL_RES_VLEN(sl)); + lua_settable(L, -3); + lua_pushstring(L, "code"); + lua_pushlstring(L, HTX_SL_RES_CPTR(sl), HTX_SL_RES_CLEN(sl)); + lua_settable(L, -3); + lua_pushstring(L, "reason"); + lua_pushlstring(L, HTX_SL_RES_RPTR(sl), HTX_SL_RES_RLEN(sl)); + lua_settable(L, -3); + } + else { + lua_pushstring(L, "method"); + lua_pushlstring(L, HTX_SL_REQ_MPTR(sl), HTX_SL_REQ_MLEN(sl)); + lua_settable(L, -3); + lua_pushstring(L, "uri"); + lua_pushlstring(L, HTX_SL_REQ_UPTR(sl), HTX_SL_REQ_ULEN(sl)); + lua_settable(L, -3); + lua_pushstring(L, "version"); + lua_pushlstring(L, HTX_SL_REQ_VPTR(sl), HTX_SL_REQ_VLEN(sl)); + lua_settable(L, -3); + } + return 1; +} + +/* This function creates and returns an array of HTTP headers. + * This function does not fail. It is used as a wrapper by the + * 2 following functions. + */ +__LJMP static int hlua_http_get_headers(lua_State *L, struct http_msg *msg) +{ + struct htx *htx; + int32_t pos; + + /* Create the table.
*/ + lua_newtable(L); + + + htx = htxbuf(&msg->chn->buf); + for (pos = htx_get_first(htx); pos != -1; pos = htx_get_next(htx, pos)) { + struct htx_blk *blk = htx_get_blk(htx, pos); + enum htx_blk_type type = htx_get_blk_type(blk); + struct ist n, v; + int len; + + if (type == HTX_BLK_HDR) { + n = htx_get_blk_name(htx,blk); + v = htx_get_blk_value(htx, blk); + } + else if (type == HTX_BLK_EOH) + break; + else + continue; + + /* Check for existing entry: + * assume that the table is on the top of the stack, and + * push the key in the stack, the function lua_gettable() + * perform the lookup. + */ + lua_pushlstring(L, n.ptr, n.len); + lua_gettable(L, -2); + + switch (lua_type(L, -1)) { + case LUA_TNIL: + /* Table not found, create it. */ + lua_pop(L, 1); /* remove the nil value. */ + lua_pushlstring(L, n.ptr, n.len); /* push the header name as key. */ + lua_newtable(L); /* create and push empty table. */ + lua_pushlstring(L, v.ptr, v.len); /* push header value. */ + lua_rawseti(L, -2, 0); /* index header value (pop it). */ + lua_rawset(L, -3); /* index new table with header name (pop the values). */ + break; + + case LUA_TTABLE: + /* Entry found: push the value in the table. */ + len = lua_rawlen(L, -1); + lua_pushlstring(L, v.ptr, v.len); /* push header value. */ + lua_rawseti(L, -2, len+1); /* index header value (pop it). */ + lua_pop(L, 1); /* remove the table (it is stored in the main table). */ + break; + + default: + /* Other cases are errors. */ + hlua_pusherror(L, "internal error during the parsing of headers."); + WILL_LJMP(lua_error(L)); + } + } + return 1; +} + +__LJMP static int hlua_http_req_get_headers(lua_State *L) +{ + struct hlua_txn *htxn; + + MAY_LJMP(check_args(L, 1, "req_get_headers")); + htxn = MAY_LJMP(hlua_checkhttp(L, 1)); + + if (htxn->dir != SMP_OPT_DIR_REQ || !IS_HTX_STRM(htxn->s)) + WILL_LJMP(lua_error(L)); + + return hlua_http_get_headers(L, &htxn->s->txn->req); +} + +__LJMP static int hlua_http_res_get_headers(lua_State *L) +{ + struct hlua_txn *htxn; + + MAY_LJMP(check_args(L, 1, "res_get_headers")); + htxn = MAY_LJMP(hlua_checkhttp(L, 1)); + + if (htxn->dir != SMP_OPT_DIR_RES || !IS_HTX_STRM(htxn->s)) + WILL_LJMP(lua_error(L)); + + return hlua_http_get_headers(L, &htxn->s->txn->rsp); +} + +/* This function replace full header, or just a value in + * the request or in the response. It is a wrapper fir the + * 4 following functions. 
+ */
+__LJMP static inline int hlua_http_rep_hdr(lua_State *L, struct http_msg *msg, int full)
+{
+	size_t name_len;
+	const char *name = MAY_LJMP(luaL_checklstring(L, 2, &name_len));
+	const char *reg = MAY_LJMP(luaL_checkstring(L, 3));
+	const char *value = MAY_LJMP(luaL_checkstring(L, 4));
+	struct htx *htx;
+	struct my_regex *re;
+
+	if (!(re = regex_comp(reg, 1, 1, NULL)))
+		WILL_LJMP(luaL_argerror(L, 3, "invalid regex"));
+
+	htx = htxbuf(&msg->chn->buf);
+	http_replace_hdrs(chn_strm(msg->chn), htx, ist2(name, name_len), value, re, full);
+	regex_free(re);
+	return 0;
+}
+
+__LJMP static int hlua_http_req_rep_hdr(lua_State *L)
+{
+	struct hlua_txn *htxn;
+
+	MAY_LJMP(check_args(L, 4, "req_rep_hdr"));
+	htxn = MAY_LJMP(hlua_checkhttp(L, 1));
+
+	if (htxn->dir != SMP_OPT_DIR_REQ || !IS_HTX_STRM(htxn->s))
+		WILL_LJMP(lua_error(L));
+
+	return MAY_LJMP(hlua_http_rep_hdr(L, &htxn->s->txn->req, 1));
+}
+
+__LJMP static int hlua_http_res_rep_hdr(lua_State *L)
+{
+	struct hlua_txn *htxn;
+
+	MAY_LJMP(check_args(L, 4, "res_rep_hdr"));
+	htxn = MAY_LJMP(hlua_checkhttp(L, 1));
+
+	if (htxn->dir != SMP_OPT_DIR_RES || !IS_HTX_STRM(htxn->s))
+		WILL_LJMP(lua_error(L));
+
+	return MAY_LJMP(hlua_http_rep_hdr(L, &htxn->s->txn->rsp, 1));
+}
+
+__LJMP static int hlua_http_req_rep_val(lua_State *L)
+{
+	struct hlua_txn *htxn;
+
+	MAY_LJMP(check_args(L, 4, "req_rep_val"));
+	htxn = MAY_LJMP(hlua_checkhttp(L, 1));
+
+	if (htxn->dir != SMP_OPT_DIR_REQ || !IS_HTX_STRM(htxn->s))
+		WILL_LJMP(lua_error(L));
+
+	return MAY_LJMP(hlua_http_rep_hdr(L, &htxn->s->txn->req, 0));
+}
+
+__LJMP static int hlua_http_res_rep_val(lua_State *L)
+{
+	struct hlua_txn *htxn;
+
+	MAY_LJMP(check_args(L, 4, "res_rep_val"));
+	htxn = MAY_LJMP(hlua_checkhttp(L, 1));
+
+	if (htxn->dir != SMP_OPT_DIR_RES || !IS_HTX_STRM(htxn->s))
+		WILL_LJMP(lua_error(L));
+
+	return MAY_LJMP(hlua_http_rep_hdr(L, &htxn->s->txn->rsp, 0));
+}
+
+/* This function deletes all the occurrences of a header.
+ * It is a wrapper for the 2 following functions.
+ */
+__LJMP static inline int hlua_http_del_hdr(lua_State *L, struct http_msg *msg)
+{
+	size_t len;
+	const char *name = MAY_LJMP(luaL_checklstring(L, 2, &len));
+	struct htx *htx = htxbuf(&msg->chn->buf);
+	struct http_hdr_ctx ctx;
+
+	ctx.blk = NULL;
+	while (http_find_header(htx, ist2(name, len), &ctx, 1))
+		http_remove_header(htx, &ctx);
+	return 0;
+}
+
+__LJMP static int hlua_http_req_del_hdr(lua_State *L)
+{
+	struct hlua_txn *htxn;
+
+	MAY_LJMP(check_args(L, 2, "req_del_hdr"));
+	htxn = MAY_LJMP(hlua_checkhttp(L, 1));
+
+	if (htxn->dir != SMP_OPT_DIR_REQ || !IS_HTX_STRM(htxn->s))
+		WILL_LJMP(lua_error(L));
+
+	return hlua_http_del_hdr(L, &htxn->s->txn->req);
+}
+
+__LJMP static int hlua_http_res_del_hdr(lua_State *L)
+{
+	struct hlua_txn *htxn;
+
+	MAY_LJMP(check_args(L, 2, "res_del_hdr"));
+	htxn = MAY_LJMP(hlua_checkhttp(L, 1));
+
+	if (htxn->dir != SMP_OPT_DIR_RES || !IS_HTX_STRM(htxn->s))
+		WILL_LJMP(lua_error(L));
+
+	return hlua_http_del_hdr(L, &htxn->s->txn->rsp);
+}
+
+/* This function adds a header. It is a wrapper used by
+ * the 2 following functions.
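+ *
+ * Illustration (not upstream code): from a Lua action these wrappers are
+ * reached through the documented HTTP class, e.g.:
+ *
+ *   txn.http:req_add_header("x-trace", "1")
+ *   txn.http:req_del_header("x-debug")
+ *   txn.http:req_rep_header("user-agent", "curl/.*", "redacted")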
+ */
+__LJMP static inline int hlua_http_add_hdr(lua_State *L, struct http_msg *msg)
+{
+	size_t name_len;
+	const char *name = MAY_LJMP(luaL_checklstring(L, 2, &name_len));
+	size_t value_len;
+	const char *value = MAY_LJMP(luaL_checklstring(L, 3, &value_len));
+	struct htx *htx = htxbuf(&msg->chn->buf);
+
+	lua_pushboolean(L, http_add_header(htx, ist2(name, name_len),
+					   ist2(value, value_len)));
+	return 0;
+}
+
+__LJMP static int hlua_http_req_add_hdr(lua_State *L)
+{
+	struct hlua_txn *htxn;
+
+	MAY_LJMP(check_args(L, 3, "req_add_hdr"));
+	htxn = MAY_LJMP(hlua_checkhttp(L, 1));
+
+	if (htxn->dir != SMP_OPT_DIR_REQ || !IS_HTX_STRM(htxn->s))
+		WILL_LJMP(lua_error(L));
+
+	return hlua_http_add_hdr(L, &htxn->s->txn->req);
+}
+
+__LJMP static int hlua_http_res_add_hdr(lua_State *L)
+{
+	struct hlua_txn *htxn;
+
+	MAY_LJMP(check_args(L, 3, "res_add_hdr"));
+	htxn = MAY_LJMP(hlua_checkhttp(L, 1));
+
+	if (htxn->dir != SMP_OPT_DIR_RES || !IS_HTX_STRM(htxn->s))
+		WILL_LJMP(lua_error(L));
+
+	return hlua_http_add_hdr(L, &htxn->s->txn->rsp);
+}
+
+static int hlua_http_req_set_hdr(lua_State *L)
+{
+	struct hlua_txn *htxn;
+
+	MAY_LJMP(check_args(L, 3, "req_set_hdr"));
+	htxn = MAY_LJMP(hlua_checkhttp(L, 1));
+
+	if (htxn->dir != SMP_OPT_DIR_REQ || !IS_HTX_STRM(htxn->s))
+		WILL_LJMP(lua_error(L));
+
+	hlua_http_del_hdr(L, &htxn->s->txn->req);
+	return hlua_http_add_hdr(L, &htxn->s->txn->req);
+}
+
+static int hlua_http_res_set_hdr(lua_State *L)
+{
+	struct hlua_txn *htxn;
+
+	MAY_LJMP(check_args(L, 3, "res_set_hdr"));
+	htxn = MAY_LJMP(hlua_checkhttp(L, 1));
+
+	if (htxn->dir != SMP_OPT_DIR_RES || !IS_HTX_STRM(htxn->s))
+		WILL_LJMP(lua_error(L));
+
+	hlua_http_del_hdr(L, &htxn->s->txn->rsp);
+	return hlua_http_add_hdr(L, &htxn->s->txn->rsp);
+}
+
+/* This function sets the method. */
+static int hlua_http_req_set_meth(lua_State *L)
+{
+	struct hlua_txn *htxn = MAY_LJMP(hlua_checkhttp(L, 1));
+	size_t name_len;
+	const char *name = MAY_LJMP(luaL_checklstring(L, 2, &name_len));
+
+	if (htxn->dir != SMP_OPT_DIR_REQ || !IS_HTX_STRM(htxn->s))
+		WILL_LJMP(lua_error(L));
+
+	lua_pushboolean(L, http_req_replace_stline(0, name, name_len, htxn->p, htxn->s) != -1);
+	return 1;
+}
+
+/* This function sets the path. */
+static int hlua_http_req_set_path(lua_State *L)
+{
+	struct hlua_txn *htxn = MAY_LJMP(hlua_checkhttp(L, 1));
+	size_t name_len;
+	const char *name = MAY_LJMP(luaL_checklstring(L, 2, &name_len));
+
+	if (htxn->dir != SMP_OPT_DIR_REQ || !IS_HTX_STRM(htxn->s))
+		WILL_LJMP(lua_error(L));
+
+	lua_pushboolean(L, http_req_replace_stline(1, name, name_len, htxn->p, htxn->s) != -1);
+	return 1;
+}
+
+/* This function sets the query-string. */
+static int hlua_http_req_set_query(lua_State *L)
+{
+	struct hlua_txn *htxn = MAY_LJMP(hlua_checkhttp(L, 1));
+	size_t name_len;
+	const char *name = MAY_LJMP(luaL_checklstring(L, 2, &name_len));
+
+	if (htxn->dir != SMP_OPT_DIR_REQ || !IS_HTX_STRM(htxn->s))
+		WILL_LJMP(lua_error(L));
+
+	/* Check length. */
+	if (name_len > trash.size - 1) {
+		lua_pushboolean(L, 0);
+		return 1;
+	}
+
+	/* Add the question mark as a prefix. */
+	chunk_reset(&trash);
+	trash.area[trash.data++] = '?';
+	memcpy(trash.area + trash.data, name, name_len);
+	trash.data += name_len;
+
+	lua_pushboolean(L,
+			http_req_replace_stline(2, trash.area, trash.data, htxn->p, htxn->s) != -1);
+	return 1;
+}
+
+/* This function sets the URI.
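+ *
+ * Illustration (not upstream code): the start-line rewriters above map to
+ * the documented HTTP class methods, e.g.:
+ *
+ *   txn.http:req_set_method("POST")
+ *   txn.http:req_set_path("/v2/endpoint")
+ *   txn.http:req_set_query("a=1&b=2")   -- the '?' prefix is added here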
*/ +static int hlua_http_req_set_uri(lua_State *L) +{ + struct hlua_txn *htxn = MAY_LJMP(hlua_checkhttp(L, 1)); + size_t name_len; + const char *name = MAY_LJMP(luaL_checklstring(L, 2, &name_len)); + + if (htxn->dir != SMP_OPT_DIR_REQ || !IS_HTX_STRM(htxn->s)) + WILL_LJMP(lua_error(L)); + + lua_pushboolean(L, http_req_replace_stline(3, name, name_len, htxn->p, htxn->s) != -1); + return 1; +} + +/* This function set the response code & optionally reason. */ +static int hlua_http_res_set_status(lua_State *L) +{ + struct hlua_txn *htxn = MAY_LJMP(hlua_checkhttp(L, 1)); + unsigned int code = MAY_LJMP(luaL_checkinteger(L, 2)); + const char *str = MAY_LJMP(luaL_optlstring(L, 3, NULL, NULL)); + const struct ist reason = ist2(str, (str ? strlen(str) : 0)); + + if (htxn->dir != SMP_OPT_DIR_RES || !IS_HTX_STRM(htxn->s)) + WILL_LJMP(lua_error(L)); + + http_res_set_status(code, reason, htxn->s); + return 0; +} + +/* + * + * + * Class HTTPMessage + * + * + */ + +/* Returns a struct http_msg if the stack entry "ud" is a class HTTPMessage, + * otherwise it throws an error. + */ +__LJMP static struct http_msg *hlua_checkhttpmsg(lua_State *L, int ud) +{ + return MAY_LJMP(hlua_checkudata(L, ud, class_http_msg_ref)); +} + +/* Creates and pushes on the stack a HTTP object according with a current TXN. + */ +static int hlua_http_msg_new(lua_State *L, struct http_msg *msg) +{ + /* Check stack size. */ + if (!lua_checkstack(L, 3)) + return 0; + + lua_newtable(L); + lua_pushlightuserdata(L, msg); + lua_rawseti(L, -2, 0); + + /* Create the "channel" field that contains the request channel object. */ + lua_pushstring(L, "channel"); + if (!hlua_channel_new(L, msg->chn)) + return 0; + lua_rawset(L, -3); + + /* Pop a class stream metatable and affect it to the table. */ + lua_rawgeti(L, LUA_REGISTRYINDEX, class_http_msg_ref); + lua_setmetatable(L, -2); + + return 1; +} + +/* Helper function returning a filter attached to the HTTP message at the + * position <ud> in the stack, filling the current offset and length of the + * filter. If no filter is attached, NULL is returned and <offset> and <len> are + * filled with output and input length respectively. + */ +static struct filter *hlua_http_msg_filter(lua_State *L, int ud, struct http_msg *msg, size_t *offset, size_t *len) +{ + struct channel *chn = msg->chn; + struct htx *htx = htxbuf(&chn->buf); + struct filter *filter = NULL; + + *offset = co_data(msg->chn); + *len = htx->data - co_data(msg->chn); + + if (lua_getfield(L, ud, "__filter") == LUA_TLIGHTUSERDATA) { + filter = lua_touserdata (L, -1); + if (msg->msg_state >= HTTP_MSG_DATA) { + struct hlua_flt_ctx *flt_ctx = filter->ctx; + + *offset = flt_ctx->cur_off[CHN_IDX(chn)]; + *len = flt_ctx->cur_len[CHN_IDX(chn)]; + } + } + + lua_pop(L, 1); + return filter; +} + +/* Returns true if the channel attached to the HTTP message is the response + * channel. + */ +__LJMP static int hlua_http_msg_is_resp(lua_State *L) +{ + struct http_msg *msg; + + MAY_LJMP(check_args(L, 1, "is_resp")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + + lua_pushboolean(L, !!(msg->chn->flags & CF_ISRESP)); + return 1; +} + +/* Returns an array containing the elements status-line of the HTTP message. It relies + * on hlua_http_get_stline(). 
+ */ +__LJMP static int hlua_http_msg_get_stline(lua_State *L) +{ + struct http_msg *msg; + struct htx *htx; + struct htx_sl *sl; + + MAY_LJMP(check_args(L, 1, "get_stline")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + + if (msg->msg_state > HTTP_MSG_BODY) + WILL_LJMP(lua_error(L)); + + htx = htxbuf(&msg->chn->buf); + sl = http_get_stline(htx); + if (!sl) + return 0; + return hlua_http_get_stline(L, sl); +} + +/* Returns an array containing all headers of the HTTP message. it relies on + * hlua_http_get_headers(). + */ +__LJMP static int hlua_http_msg_get_headers(lua_State *L) +{ + struct http_msg *msg; + + MAY_LJMP(check_args(L, 1, "get_headers")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + + if (msg->msg_state > HTTP_MSG_BODY) + WILL_LJMP(lua_error(L)); + + return hlua_http_get_headers(L, msg); +} + +/* Deletes all occurrences of an header in the HTTP message matching on its + * name. It relies on hlua_http_del_hdr(). + */ +__LJMP static int hlua_http_msg_del_hdr(lua_State *L) +{ + struct http_msg *msg; + + MAY_LJMP(check_args(L, 2, "del_header")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + + if (msg->msg_state > HTTP_MSG_BODY) + WILL_LJMP(lua_error(L)); + + return hlua_http_del_hdr(L, msg); +} + +/* Matches the full value line of all occurrences of an header in the HTTP + * message given its name against a regex and replaces it if it matches. It + * relies on hlua_http_rep_hdr(). + */ +__LJMP static int hlua_http_msg_rep_hdr(lua_State *L) +{ + struct http_msg *msg; + + MAY_LJMP(check_args(L, 4, "rep_header")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + + if (msg->msg_state > HTTP_MSG_BODY) + WILL_LJMP(lua_error(L)); + + return hlua_http_rep_hdr(L, msg, 1); +} + +/* Matches all comma-separated values of all occurrences of an header in the HTTP + * message given its name against a regex and replaces it if it matches. It + * relies on hlua_http_rep_hdr(). + */ +__LJMP static int hlua_http_msg_rep_val(lua_State *L) +{ + struct http_msg *msg; + + MAY_LJMP(check_args(L, 4, "rep_value")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + + if (msg->msg_state > HTTP_MSG_BODY) + WILL_LJMP(lua_error(L)); + + return hlua_http_rep_hdr(L, msg, 0); +} + +/* Add an header in the HTTP message. It relies on hlua_http_add_hdr() */ +__LJMP static int hlua_http_msg_add_hdr(lua_State *L) +{ + struct http_msg *msg; + + MAY_LJMP(check_args(L, 3, "add_header")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + + if (msg->msg_state > HTTP_MSG_BODY) + WILL_LJMP(lua_error(L)); + + return hlua_http_add_hdr(L, msg); +} + +/* Add an header in the HTTP message removing existing headers with the same + * name. It relies on hlua_http_del_hdr() and hlua_http_add_hdr(). + */ +__LJMP static int hlua_http_msg_set_hdr(lua_State *L) +{ + struct http_msg *msg; + + MAY_LJMP(check_args(L, 3, "set_header")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + + if (msg->msg_state > HTTP_MSG_BODY) + WILL_LJMP(lua_error(L)); + + hlua_http_del_hdr(L, msg); + return hlua_http_add_hdr(L, msg); +} + +/* Rewrites the request method. It relies on http_req_replace_stline(). 
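+ *
+ * Illustration (not upstream code): the HTTPMessage class methods defined
+ * in this block are meant to be called from a Lua filter's callbacks, e.g.:
+ *
+ *   local MyFilt = {}
+ *   function MyFilt:http_headers(txn, http_msg)
+ *       http_msg:set_header("x-filtered", "yes")
+ *       return filter.CONTINUE
+ *   end
+ *   core.register_filter("myfilt", MyFilt, function(filt, args) return filt end)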
*/ +__LJMP static int hlua_http_msg_set_meth(lua_State *L) +{ + struct stream *s; + struct http_msg *msg; + const char *name; + size_t name_len; + + MAY_LJMP(check_args(L, 2, "set_method")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + name = MAY_LJMP(luaL_checklstring(L, 2, &name_len)); + + if ((msg->chn->flags & CF_ISRESP) || msg->msg_state > HTTP_MSG_BODY) + WILL_LJMP(lua_error(L)); + + s = chn_strm(msg->chn); + lua_pushboolean(L, http_req_replace_stline(0, name, name_len, s->be, s) != -1); + return 1; +} + +/* Rewrites the request path. It relies on http_req_replace_stline(). */ +__LJMP static int hlua_http_msg_set_path(lua_State *L) +{ + struct stream *s; + struct http_msg *msg; + const char *name; + size_t name_len; + + MAY_LJMP(check_args(L, 2, "set_path")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + name = MAY_LJMP(luaL_checklstring(L, 2, &name_len)); + + if ((msg->chn->flags & CF_ISRESP) || msg->msg_state > HTTP_MSG_BODY) + WILL_LJMP(lua_error(L)); + + s = chn_strm(msg->chn); + lua_pushboolean(L, http_req_replace_stline(1, name, name_len, s->be, s) != -1); + return 1; +} + +/* Rewrites the request query-string. It relies on http_req_replace_stline(). */ +__LJMP static int hlua_http_msg_set_query(lua_State *L) +{ + struct stream *s; + struct http_msg *msg; + const char *name; + size_t name_len; + + MAY_LJMP(check_args(L, 2, "set_query")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + name = MAY_LJMP(luaL_checklstring(L, 2, &name_len)); + + if ((msg->chn->flags & CF_ISRESP) || msg->msg_state > HTTP_MSG_BODY) + WILL_LJMP(lua_error(L)); + + /* Check length. */ + if (name_len > trash.size - 1) { + lua_pushboolean(L, 0); + return 1; + } + + /* Add the mark question as prefix. */ + chunk_reset(&trash); + trash.area[trash.data++] = '?'; + memcpy(trash.area + trash.data, name, name_len); + trash.data += name_len; + + s = chn_strm(msg->chn); + lua_pushboolean(L, http_req_replace_stline(2, trash.area, trash.data, s->be, s) != -1); + return 1; +} + +/* Rewrites the request URI. It relies on http_req_replace_stline(). */ +__LJMP static int hlua_http_msg_set_uri(lua_State *L) +{ + struct stream *s; + struct http_msg *msg; + const char *name; + size_t name_len; + + MAY_LJMP(check_args(L, 2, "set_uri")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + name = MAY_LJMP(luaL_checklstring(L, 2, &name_len)); + + if ((msg->chn->flags & CF_ISRESP) || msg->msg_state > HTTP_MSG_BODY) + WILL_LJMP(lua_error(L)); + + s = chn_strm(msg->chn); + lua_pushboolean(L, http_req_replace_stline(3, name, name_len, s->be, s) != -1); + return 1; +} + +/* Rewrites the response status code. It relies on http_res_set_status(). */ +__LJMP static int hlua_http_msg_set_status(lua_State *L) +{ + struct http_msg *msg; + unsigned int code; + const char *reason; + size_t reason_len; + + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + code = MAY_LJMP(luaL_checkinteger(L, 2)); + reason = MAY_LJMP(luaL_optlstring(L, 3, NULL, &reason_len)); + + if (!(msg->chn->flags & CF_ISRESP) || msg->msg_state > HTTP_MSG_BODY) + WILL_LJMP(lua_error(L)); + + lua_pushboolean(L, http_res_set_status(code, ist2(reason, reason_len), chn_strm(msg->chn)) != -1); + return 1; +} + +/* Returns true if the HTTP message is full. */ +__LJMP static int hlua_http_msg_is_full(lua_State *L) +{ + struct http_msg *msg; + + MAY_LJMP(check_args(L, 1, "is_full")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + lua_pushboolean(L, channel_full(msg->chn, 0)); + return 1; +} + +/* Returns true if the HTTP message may still receive data. 
 */
+__LJMP static int hlua_http_msg_may_recv(lua_State *L)
+{
+	struct http_msg *msg;
+	struct htx *htx;
+
+	MAY_LJMP(check_args(L, 1, "may_recv"));
+	msg = MAY_LJMP(hlua_checkhttpmsg(L, 1));
+	htx = htxbuf(&msg->chn->buf);
+	lua_pushboolean(L, (htx_expect_more(htx) && !channel_input_closed(msg->chn) && channel_may_recv(msg->chn)));
+	return 1;
+}
+
+/* Returns true if the HTTP message EOM was received */
+__LJMP static int hlua_http_msg_is_eom(lua_State *L)
+{
+	struct http_msg *msg;
+	struct htx *htx;
+
+	MAY_LJMP(check_args(L, 1, "eom"));
+	msg = MAY_LJMP(hlua_checkhttpmsg(L, 1));
+	htx = htxbuf(&msg->chn->buf);
+	lua_pushboolean(L, !htx_expect_more(htx));
+	return 1;
+}
+
+/* Returns the number of bytes available in the input side of the HTTP
+ * message. This function never fails.
+ */
+__LJMP static int hlua_http_msg_get_in_len(lua_State *L)
+{
+	struct http_msg *msg;
+	size_t output, input;
+
+	MAY_LJMP(check_args(L, 1, "input"));
+	msg = MAY_LJMP(hlua_checkhttpmsg(L, 1));
+	hlua_http_msg_filter(L, 1, msg, &output, &input);
+	lua_pushinteger(L, input);
+	return 1;
+}
+
+/* Returns the number of bytes available in the output side of the HTTP
+ * message. This function never fails.
+ */
+__LJMP static int hlua_http_msg_get_out_len(lua_State *L)
+{
+	struct http_msg *msg;
+	size_t output, input;
+
+	MAY_LJMP(check_args(L, 1, "output"));
+	msg = MAY_LJMP(hlua_checkhttpmsg(L, 1));
+	hlua_http_msg_filter(L, 1, msg, &output, &input);
+	lua_pushinteger(L, output);
+	return 1;
+}
+
+/* Copies at most <len> bytes of DATA blocks from the HTTP message <msg>,
+ * starting at the offset <offset>, and puts them in a Lua string variable. It
+ * returns the length of the built string. It stops on the first non-DATA HTX
+ * block. This function is called during the payload filtering, so the headers
+ * are already scheduled for output (from the filter point of view).
+ */
+static int _hlua_http_msg_dup(struct http_msg *msg, lua_State *L, size_t offset, size_t len)
+{
+	struct htx *htx = htxbuf(&msg->chn->buf);
+	struct htx_blk *blk;
+	struct htx_ret htxret;
+	luaL_Buffer b;
+	int ret = 0;
+
+	luaL_buffinit(L, &b);
+	htxret = htx_find_offset(htx, offset);
+	for (blk = htxret.blk, offset = htxret.ret; blk && len; blk = htx_get_next_blk(htx, blk)) {
+		enum htx_blk_type type = htx_get_blk_type(blk);
+		struct ist v;
+
+		switch (type) {
+		case HTX_BLK_UNUSED:
+			break;
+
+		case HTX_BLK_DATA:
+			v = htx_get_blk_value(htx, blk);
+			v = istadv(v, offset);
+			v = isttrim(v, len);
+
+			luaL_addlstring(&b, v.ptr, v.len);
+			ret += v.len;
+			break;
+
+		default:
+			if (!ret)
+				goto no_data;
+			goto end;
+		}
+		offset = 0;
+	}
+
+  end:
+	if (!ret && (htx->flags & HTX_FL_EOM))
+		goto no_data;
+	luaL_pushresult(&b);
+	return ret;
+
+  no_data:
+	/* Remove the empty string and push nil on the stack */
+	lua_pop(L, 1);
+	lua_pushnil(L);
+	return 0;
+}
+
+/* Copies the string <str> to the HTTP message <msg> at the offset
+ * <offset>. This function returns -1 if data cannot be copied. Otherwise, it
+ * returns the amount of data written. This function is responsible for
+ * updating the filter context.
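+ *
+ * Illustration (not upstream code): together with the dup helper above,
+ * these helpers back the HTTPMessage payload accessors used from a
+ * filter's http_payload callback, e.g.:
+ *
+ *   function MyFilt:http_payload(txn, http_msg)
+ *       local body = http_msg:body(0, http_msg:input())
+ *       if body then core.Debug("chunk of " .. #body .. " bytes") end
+ *   end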
+ */ +static int _hlua_http_msg_insert(struct http_msg *msg, struct filter *filter, struct ist str, size_t offset) +{ + struct htx *htx = htx_from_buf(&msg->chn->buf); + struct htx_ret htxret; + int /*max, */ret = 0; + + /* Nothing to do, just return */ + if (unlikely(istlen(str) == 0)) + goto end; + + if (istlen(str) > htx_free_data_space(htx)) { + ret = -1; + goto end; + } + + htxret = htx_find_offset(htx, offset); + if (!htxret.blk || htx_get_blk_type(htxret.blk) != HTX_BLK_DATA) { + if (!htx_add_last_data(htx, str)) + goto end; + } + else { + struct ist v = htx_get_blk_value(htx, htxret.blk); + v.ptr += htxret.ret; + v.len = 0; + if (!htx_replace_blk_value(htx, htxret.blk, v, str)) + goto end; + } + ret = str.len; + if (ret) { + struct hlua_flt_ctx *flt_ctx = filter->ctx; + flt_update_offsets(filter, msg->chn, ret); + flt_ctx->cur_len[CHN_IDX(msg->chn)] += ret; + } + + end: + htx_to_buf(htx, &msg->chn->buf); + return ret; +} + +/* Helper function removing at most <len> bytes of DATA blocks at the absolute + * position <offset>. It stops on the first non-DATA HTX block. This function is + * called during the payload filtering, so the headers are already scheduled for + * output (from the filter point of view). This function is responsible to + * update the filter context. + */ +static void _hlua_http_msg_delete(struct http_msg *msg, struct filter *filter, size_t offset, size_t len) +{ + struct hlua_flt_ctx *flt_ctx = filter->ctx; + struct htx *htx = htx_from_buf(&msg->chn->buf); + struct htx_blk *blk; + struct htx_ret htxret; + size_t ret = 0; + + /* Be sure <len> is always the amount of DATA to remove */ + if (htx->data == offset+len && htx_get_tail_type(htx) == HTX_BLK_DATA) { + /* When htx tail type == HTX_BLK_DATA, no need to take care + * of special blocks like HTX_BLK_EOT. + * We simply truncate after offset + * (truncate targeted blk and discard the following ones) + */ + htx_truncate(htx, offset); + ret = len; + goto end; + } + + htxret = htx_find_offset(htx, offset); + blk = htxret.blk; + if (htxret.ret) { + /* dealing with offset: we need to trim targeted blk */ + struct ist v; + + if (htx_get_blk_type(blk) != HTX_BLK_DATA) + goto end; + + v = htx_get_blk_value(htx, blk); + v = istadv(v, htxret.ret); + + v = isttrim(v, len); + /* trimming data in blk: discard everything after the offset + * (replace 'v' with 'IST_NULL') + */ + blk = htx_replace_blk_value(htx, blk, v, IST_NULL); + if (blk && v.len < len) { + /* In this case, caller wants to keep removing data, + * but we need to spare current blk + * because it was already trimmed + */ + blk = htx_get_next_blk(htx, blk); + } + len -= v.len; + ret += v.len; + } + + + while (blk && len) { + /* there is more data that needs to be discarded */ + enum htx_blk_type type = htx_get_blk_type(blk); + uint32_t sz = htx_get_blksz(blk); + + switch (type) { + case HTX_BLK_UNUSED: + break; + + case HTX_BLK_DATA: + if (len < sz) { + /* don't discard whole blk, only part of it + * (from the beginning) + */ + htx_cut_data_blk(htx, blk, len); + ret += len; + goto end; + } + break; + + default: + /* HTX_BLK_EOT blk won't be removed */ + goto end; + } + + /* Remove all the data block */ + len -= sz; + ret += sz; + blk = htx_remove_blk(htx, blk); + } + +end: + flt_update_offsets(filter, msg->chn, -ret); + flt_ctx->cur_len[CHN_IDX(msg->chn)] -= ret; + /* WARNING: we don't call htx_to_buf() on purpose, because we don't want + * to loose the EOM flag if the message is empty. + */ +} + +/* Copies input data found in an HTTP message. 
Unlike the channel function used + * to duplicate raw data, this one can only be called inside a filter, from + * http_payload callback. So it cannot yield. An exception is returned if it is + * called from another callback. If nothing was copied, a nil value is pushed on + * the stack. + */ +__LJMP static int hlua_http_msg_get_body(lua_State *L) +{ + struct http_msg *msg; + struct filter *filter; + size_t output, input; + int offset, len; + + if (lua_gettop(L) < 1 || lua_gettop(L) > 3) + WILL_LJMP(luaL_error(L, "'data' expects at most 2 arguments")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + + if (msg->msg_state < HTTP_MSG_DATA) + WILL_LJMP(lua_error(L)); + + filter = hlua_http_msg_filter(L, 1, msg, &output, &input); + if (!filter || !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + if (!ci_data(msg->chn) && channel_input_closed(msg->chn)) { + lua_pushnil(L); + return 1; + } + + offset = output; + if (lua_gettop(L) > 1) { + offset = MAY_LJMP(luaL_checkinteger(L, 2)); + if (offset < 0) + offset = MAX(0, (int)input + offset); + offset += output; + if (offset < output || offset > input + output) { + lua_pushfstring(L, "offset out of range."); + WILL_LJMP(lua_error(L)); + } + } + len = output + input - offset; + if (lua_gettop(L) == 3) { + len = MAY_LJMP(luaL_checkinteger(L, 3)); + if (!len) + goto dup; + if (len == -1) + len = global.tune.bufsize; + if (len < 0) { + lua_pushfstring(L, "length out of range."); + WILL_LJMP(lua_error(L)); + } + } + + dup: + _hlua_http_msg_dup(msg, L, offset, len); + return 1; +} + +/* Appends a string to the HTTP message, after all existing DATA blocks but + * before the trailers, if any. It returns the amount of data written or -1 if + * nothing was copied. Unlike the channel function used to append data, this one + * can only be called inside a filter, from http_payload callback. So it cannot + * yield. An exception is returned if it is called from another callback. + */ +__LJMP static int hlua_http_msg_append(lua_State *L) +{ + struct http_msg *msg; + struct filter *filter; + const char *str; + size_t offset, len, sz; + int ret; + + MAY_LJMP(check_args(L, 2, "append")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + + if (msg->msg_state < HTTP_MSG_DATA) + WILL_LJMP(lua_error(L)); + + str = MAY_LJMP(luaL_checklstring(L, 2, &sz)); + filter = hlua_http_msg_filter(L, 1, msg, &offset, &len); + if (!filter || !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + ret = _hlua_http_msg_insert(msg, filter, ist2(str, sz), offset+len); + lua_pushinteger(L, ret); + return 1; +} + +/* Prepends a string to the HTTP message, before all existing DATA blocks. It + * returns the amount of data written or -1 if nothing was copied. Unlike the + * channel function used to prepend data, this one can only be called inside a + * filter, from http_payload callback. So it cannot yield. An exception is + * returned if it is called from another callback. 
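+ *
+ * Illustration (not upstream code):
+ *
+ *   function MyFilt:http_payload(txn, http_msg)
+ *       http_msg:prepend("-- injected prologue --\n")
+ *       http_msg:append("-- injected epilogue --\n")
+ *   end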
+ */ +__LJMP static int hlua_http_msg_prepend(lua_State *L) +{ + struct http_msg *msg; + struct filter *filter; + const char *str; + size_t offset, len, sz; + int ret; + + MAY_LJMP(check_args(L, 2, "prepend")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + + if (msg->msg_state < HTTP_MSG_DATA) + WILL_LJMP(lua_error(L)); + + str = MAY_LJMP(luaL_checklstring(L, 2, &sz)); + filter = hlua_http_msg_filter(L, 1, msg, &offset, &len); + if (!filter || !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + ret = _hlua_http_msg_insert(msg, filter, ist2(str, sz), offset); + lua_pushinteger(L, ret); + return 1; +} + +/* Inserts a string to the HTTP message at a given offset. By default the string + * is appended at the end of DATA blocks. It returns the amount of data written + * or -1 if nothing was copied. Unlike the channel function used to insert data, + * this one can only be called inside a filter, from http_payload callback. So + * it cannot yield. An exception is returned if it is called from another + * callback. + */ +__LJMP static int hlua_http_msg_insert_data(lua_State *L) +{ + struct http_msg *msg; + struct filter *filter; + const char *str; + size_t input, output, sz; + int offset; + int ret; + + if (lua_gettop(L) < 2 || lua_gettop(L) > 3) + WILL_LJMP(luaL_error(L, "'insert' expects at least 1 argument and at most 2 arguments")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + + if (msg->msg_state < HTTP_MSG_DATA) + WILL_LJMP(lua_error(L)); + + str = MAY_LJMP(luaL_checklstring(L, 2, &sz)); + filter = hlua_http_msg_filter(L, 1, msg, &output, &input); + if (!filter || !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + offset = output; + if (lua_gettop(L) > 2) { + offset = MAY_LJMP(luaL_checkinteger(L, 3)); + if (offset < 0) + offset = MAX(0, (int)input + offset); + offset += output; + if (offset > output + input) { + lua_pushfstring(L, "offset out of range."); + WILL_LJMP(lua_error(L)); + } + } + + ret = _hlua_http_msg_insert(msg, filter, ist2(str, sz), offset); + lua_pushinteger(L, ret); + return 1; +} + +/* Removes a given amount of data from the HTTP message at a given offset. By + * default all DATA blocks are removed. It returns the amount of data + * removed. Unlike the channel function used to remove data, this one can only + * be called inside a filter, from http_payload callback. So it cannot yield. An + * exception is returned if it is called from another callback. 
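+ *
+ * Illustration (not upstream code):
+ *
+ *   function MyFilt:http_payload(txn, http_msg)
+ *       http_msg:remove()                  -- drop all input DATA blocks
+ *       http_msg:insert("replacement body")
+ *   end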
+ */ +__LJMP static int hlua_http_msg_del_data(lua_State *L) +{ + struct http_msg *msg; + struct filter *filter; + size_t input, output; + int offset, len; + + if (lua_gettop(L) < 1 || lua_gettop(L) > 3) + WILL_LJMP(luaL_error(L, "'remove' expects at most 2 arguments")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + + if (msg->msg_state < HTTP_MSG_DATA) + WILL_LJMP(lua_error(L)); + + filter = hlua_http_msg_filter(L, 1, msg, &output, &input); + if (!filter || !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + offset = output; + if (lua_gettop(L) > 1) { + offset = MAY_LJMP(luaL_checkinteger(L, 2)); + if (offset < 0) + offset = MAX(0, (int)input + offset); + offset += output; + if (offset > output + input) { + lua_pushfstring(L, "offset out of range."); + WILL_LJMP(lua_error(L)); + } + } + + len = output + input - offset; + if (lua_gettop(L) == 3) { + len = MAY_LJMP(luaL_checkinteger(L, 3)); + if (!len) + goto end; + if (len == -1) + len = output + input - offset; + if (len < 0 || offset + len > output + input) { + lua_pushfstring(L, "length out of range."); + WILL_LJMP(lua_error(L)); + } + } + + _hlua_http_msg_delete(msg, filter, offset, len); + + end: + lua_pushinteger(L, len); + return 1; +} + +/* Replaces a given amount of data at the given offset by a string. By default, + * all remaining data are removed, accordingly to the filter context. It returns + * the amount of data written or -1 if nothing was copied. Unlike the channel + * function used to replace data, this one can only be called inside a filter, + * from http_payload callback. So it cannot yield. An exception is returned if + * it is called from another callback. + */ +__LJMP static int hlua_http_msg_set_data(lua_State *L) +{ + struct http_msg *msg; + struct filter *filter; + struct htx *htx; + const char *str; + size_t input, output, sz; + int offset, len; + int ret; + + if (lua_gettop(L) < 2 || lua_gettop(L) > 4) + WILL_LJMP(luaL_error(L, "'set' expects at least 1 argument and at most 3 arguments")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + + if (msg->msg_state < HTTP_MSG_DATA) + WILL_LJMP(lua_error(L)); + + str = MAY_LJMP(luaL_checklstring(L, 2, &sz)); + filter = hlua_http_msg_filter(L, 1, msg, &output, &input); + if (!filter || !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + offset = output; + if (lua_gettop(L) > 2) { + offset = MAY_LJMP(luaL_checkinteger(L, 3)); + if (offset < 0) + offset = MAX(0, (int)input + offset); + offset += output; + if (offset < output || offset > input + output) { + lua_pushfstring(L, "offset out of range."); + WILL_LJMP(lua_error(L)); + } + } + + len = output + input - offset; + if (lua_gettop(L) == 4) { + len = MAY_LJMP(luaL_checkinteger(L, 4)); + if (!len) + goto set; + if (len == -1) + len = output + input - offset; + if (len < 0 || offset + len > output + input) { + lua_pushfstring(L, "length out of range."); + WILL_LJMP(lua_error(L)); + } + } + + set: + /* Be sure we can copied the string once input data will be removed. */ + htx = htx_from_buf(&msg->chn->buf); + if (sz > htx_free_data_space(htx) + len) + lua_pushinteger(L, -1); + else { + _hlua_http_msg_delete(msg, filter, offset, len); + ret = _hlua_http_msg_insert(msg, filter, ist2(str, sz), offset); + lua_pushinteger(L, ret); + } + return 1; +} + +/* Prepends data into an HTTP message and forward it, from the filter point of + * view. It returns the amount of data written or -1 if nothing was sent. 
Unlike + * the channel function used to send data, this one can only be called inside a + * filter, from http_payload callback. So it cannot yield. An exception is + * returned if it is called from another callback. + */ +__LJMP static int hlua_http_msg_send(lua_State *L) +{ + struct http_msg *msg; + struct filter *filter; + struct htx *htx; + const char *str; + size_t offset, len, sz; + int ret; + + MAY_LJMP(check_args(L, 2, "send")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + + if (msg->msg_state < HTTP_MSG_DATA) + WILL_LJMP(lua_error(L)); + + str = MAY_LJMP(luaL_checklstring(L, 2, &sz)); + filter = hlua_http_msg_filter(L, 1, msg, &offset, &len); + if (!filter || !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + /* Return an error if the channel's output is closed */ + if (unlikely(channel_output_closed(msg->chn))) { + lua_pushinteger(L, -1); + return 1; + } + + htx = htx_from_buf(&msg->chn->buf); + if (sz > htx_free_data_space(htx)) { + lua_pushinteger(L, -1); + return 1; + } + + ret = _hlua_http_msg_insert(msg, filter, ist2(str, sz), offset); + if (ret > 0) { + struct hlua_flt_ctx *flt_ctx = filter->ctx; + + FLT_OFF(filter, msg->chn) += ret; + flt_ctx->cur_len[CHN_IDX(msg->chn)] -= ret; + flt_ctx->cur_off[CHN_IDX(msg->chn)] += ret; + } + + lua_pushinteger(L, ret); + return 1; +} + +/* Forwards a given amount of bytes. It return -1 if the channel's output is + * closed. Otherwise, it returns the number of bytes forwarded. Unlike the + * channel function used to forward data, this one can only be called inside a + * filter, from http_payload callback. So it cannot yield. An exception is + * returned if it is called from another callback. All other functions deal with + * DATA block, this one not. +*/ +__LJMP static int hlua_http_msg_forward(lua_State *L) +{ + struct http_msg *msg; + struct filter *filter; + size_t offset, len; + int fwd, ret = 0; + + MAY_LJMP(check_args(L, 2, "forward")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + + if (msg->msg_state < HTTP_MSG_DATA) + WILL_LJMP(lua_error(L)); + + fwd = MAY_LJMP(luaL_checkinteger(L, 2)); + filter = hlua_http_msg_filter(L, 1, msg, &offset, &len); + if (!filter || !hlua_filter_from_payload(filter)) + WILL_LJMP(lua_error(L)); + + /* Nothing to do, just return */ + if (!fwd) + goto end; + + /* Return an error if the channel's output is closed */ + if (unlikely(channel_output_closed(msg->chn))) { + ret = -1; + goto end; + } + + ret = fwd; + if (ret > len) + ret = len; + + if (ret) { + struct hlua_flt_ctx *flt_ctx = filter->ctx; + + FLT_OFF(filter, msg->chn) += ret; + flt_ctx->cur_off[CHN_IDX(msg->chn)] += ret; + flt_ctx->cur_len[CHN_IDX(msg->chn)] -= ret; + } + + end: + lua_pushinteger(L, ret); + return 1; +} + +/* Set EOM flag on the HTX message. + * + * NOTE: Not sure it is a good idea to manipulate this flag but for now I don't + * really know how to do without this feature. + */ +__LJMP static int hlua_http_msg_set_eom(lua_State *L) +{ + struct http_msg *msg; + struct htx *htx; + + MAY_LJMP(check_args(L, 1, "set_eom")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + htx = htxbuf(&msg->chn->buf); + htx->flags |= HTX_FL_EOM; + return 0; +} + +/* Unset EOM flag on the HTX message. + * + * NOTE: Not sure it is a good idea to manipulate this flag but for now I don't + * really know how to do without this feature. 
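+ *
+ * Illustration (not upstream code): a filter buffering and rewriting the
+ * whole body may clear the flag while it accumulates data and set it back
+ * once done, e.g.:
+ *
+ *   http_msg:unset_eom()   -- keep the message open while buffering
+ *   -- ... transform the buffered payload ...
+ *   http_msg:set_eom()     -- declare the end of the rewritten message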
+ */ +__LJMP static int hlua_http_msg_unset_eom(lua_State *L) +{ + struct http_msg *msg; + struct htx *htx; + + MAY_LJMP(check_args(L, 1, "set_eom")); + msg = MAY_LJMP(hlua_checkhttpmsg(L, 1)); + htx = htxbuf(&msg->chn->buf); + htx->flags &= ~HTX_FL_EOM; + return 0; +} + +/* + * + * + * Class HTTPClient + * + * + */ +__LJMP static struct hlua_httpclient *hlua_checkhttpclient(lua_State *L, int ud) +{ + return MAY_LJMP(hlua_checkudata(L, ud, class_httpclient_ref)); +} + + +/* stops the httpclient and ask it to kill itself */ +__LJMP static int hlua_httpclient_gc(lua_State *L) +{ + struct hlua_httpclient *hlua_hc; + + MAY_LJMP(check_args(L, 1, "__gc")); + + hlua_hc = MAY_LJMP(hlua_checkhttpclient(L, 1)); + + if (MT_LIST_DELETE(&hlua_hc->by_hlua)) { + /* we won the race against hlua_httpclient_destroy_all() */ + httpclient_stop_and_destroy(hlua_hc->hc); + hlua_hc->hc = NULL; + } + + return 0; +} + + +__LJMP static int hlua_httpclient_new(lua_State *L) +{ + struct hlua_httpclient *hlua_hc; + struct hlua *hlua; + + /* Get hlua struct, or NULL if we execute from main lua state */ + hlua = hlua_gethlua(L); + if (!hlua) + return 0; + + /* Check stack size. */ + if (!lua_checkstack(L, 3)) { + hlua_pusherror(L, "httpclient: full stack"); + goto err; + } + /* Create the object: obj[0] = userdata. */ + lua_newtable(L); + hlua_hc = MAY_LJMP(lua_newuserdata(L, sizeof(*hlua_hc))); + lua_rawseti(L, -2, 0); + memset(hlua_hc, 0, sizeof(*hlua_hc)); + + hlua_hc->hc = httpclient_new(hlua, 0, IST_NULL); + if (!hlua_hc->hc) + goto err; + + MT_LIST_APPEND(&hlua->hc_list, &hlua_hc->by_hlua); + + /* Pop a class stream metatable and affect it to the userdata. */ + lua_rawgeti(L, LUA_REGISTRYINDEX, class_httpclient_ref); + lua_setmetatable(L, -2); + + return 1; + + err: + WILL_LJMP(lua_error(L)); + return 0; +} + + +/* + * Callback of the httpclient, this callback wakes the lua task up, once the + * httpclient receives some data + * + */ + +static void hlua_httpclient_cb(struct httpclient *hc) +{ + struct hlua *hlua = hc->caller; + + if (!hlua || !hlua->task) + return; + + task_wakeup(hlua->task, TASK_WOKEN_MSG); +} + +/* + * Fill the lua stack with headers from the httpclient response + * This works the same way as the hlua_http_get_headers() function + */ +__LJMP static int hlua_httpclient_get_headers(lua_State *L, struct hlua_httpclient *hlua_hc) +{ + struct http_hdr *hdr; + + lua_newtable(L); + + for (hdr = hlua_hc->hc->res.hdrs; hdr && isttest(hdr->n); hdr++) { + struct ist n, v; + int len; + + n = hdr->n; + v = hdr->v; + + /* Check for existing entry: + * assume that the table is on the top of the stack, and + * push the key in the stack, the function lua_gettable() + * perform the lookup. + */ + + lua_pushlstring(L, n.ptr, n.len); + lua_gettable(L, -2); + + switch (lua_type(L, -1)) { + case LUA_TNIL: + /* Table not found, create it. */ + lua_pop(L, 1); /* remove the nil value. */ + lua_pushlstring(L, n.ptr, n.len); /* push the header name as key. */ + lua_newtable(L); /* create and push empty table. */ + lua_pushlstring(L, v.ptr, v.len); /* push header value. */ + lua_rawseti(L, -2, 0); /* index header value (pop it). */ + lua_rawset(L, -3); /* index new table with header name (pop the values). */ + break; + + case LUA_TTABLE: + /* Entry found: push the value in the table. */ + len = lua_rawlen(L, -1); + lua_pushlstring(L, v.ptr, v.len); /* push header value. */ + lua_rawseti(L, -2, len+1); /* index header value (pop it). */ + lua_pop(L, 1); /* remove the table (it is stored in the main table). 
*/ + break; + + default: + /* Other cases are errors. */ + hlua_pusherror(L, "internal error during the parsing of headers."); + WILL_LJMP(lua_error(L)); + } + } + return 1; +} + +/* + * Allocate and return an array of http_hdr ist extracted from the <headers> lua table + * + * Caller must free the result + */ +struct http_hdr *hlua_httpclient_table_to_hdrs(lua_State *L) +{ + struct http_hdr hdrs[global.tune.max_http_hdr]; + struct http_hdr *result = NULL; + uint32_t hdr_num = 0; + + lua_pushnil(L); + while (lua_next(L, -2) != 0) { + struct ist name, value; + const char *n, *v; + size_t nlen, vlen; + + if (!lua_isstring(L, -2) || !lua_istable(L, -1)) { + /* Skip element if the key is not a string or if the value is not a table */ + goto next_hdr; + } + + n = lua_tolstring(L, -2, &nlen); + name = ist2(n, nlen); + + /* Loop on header's values */ + lua_pushnil(L); + while (lua_next(L, -2)) { + if (!lua_isstring(L, -1)) { + /* Skip the value if it is not a string */ + goto next_value; + } + + v = lua_tolstring(L, -1, &vlen); + value = ist2(v, vlen); + name = ist2(n, nlen); + + hdrs[hdr_num].n = istdup(name); + hdrs[hdr_num].v = istdup(value); + + hdr_num++; + + next_value: + lua_pop(L, 1); + } + + next_hdr: + lua_pop(L, 1); + + } + + if (hdr_num) { + /* alloc and copy the headers in the httpclient struct */ + result = calloc((hdr_num + 1), sizeof(*result)); + if (!result) + goto skip_headers; + memcpy(result, hdrs, sizeof(struct http_hdr) * (hdr_num + 1)); + + result[hdr_num].n = IST_NULL; + result[hdr_num].v = IST_NULL; + } + +skip_headers: + + return result; +} + + +/* + * For each yield, checks if there is some data in the httpclient and push them + * in the lua buffer, once the httpclient finished its job, push the result on + * the stack + */ +__LJMP static int hlua_httpclient_rcv_yield(lua_State *L, int status, lua_KContext ctx) +{ + struct buffer *tr; + int res; + struct hlua *hlua = hlua_gethlua(L); + struct hlua_httpclient *hlua_hc = hlua_checkhttpclient(L, 1); + + + tr = get_trash_chunk(); + + res = httpclient_res_xfer(hlua_hc->hc, tr); + luaL_addlstring(&hlua_hc->b, b_orig(tr), res); + + if (!httpclient_data(hlua_hc->hc) && httpclient_ended(hlua_hc->hc)) { + + luaL_pushresult(&hlua_hc->b); + lua_settable(L, -3); + + lua_pushstring(L, "status"); + lua_pushinteger(L, hlua_hc->hc->res.status); + lua_settable(L, -3); + + + lua_pushstring(L, "reason"); + lua_pushlstring(L, hlua_hc->hc->res.reason.ptr, hlua_hc->hc->res.reason.len); + lua_settable(L, -3); + + lua_pushstring(L, "headers"); + hlua_httpclient_get_headers(L, hlua_hc); + lua_settable(L, -3); + + return 1; + } + + if (httpclient_data(hlua_hc->hc)) + task_wakeup(hlua->task, TASK_WOKEN_MSG); + + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_httpclient_rcv_yield, TICK_ETERNITY, 0)); + + return 0; +} + +/* + * Call this when trying to stream a body during a request + */ +__LJMP static int hlua_httpclient_snd_yield(lua_State *L, int status, lua_KContext ctx) +{ + struct hlua *hlua; + struct hlua_httpclient *hlua_hc = hlua_checkhttpclient(L, 1); + const char *body_str = NULL; + int ret; + int end = 0; + size_t buf_len; + size_t to_send = 0; + + hlua = hlua_gethlua(L); + + if (!hlua || !hlua->task) + WILL_LJMP(luaL_error(L, "The 'get' function is only allowed in " + "'frontend', 'backend' or 'task'")); + + ret = lua_getfield(L, -1, "body"); + if (ret != LUA_TSTRING) + goto rcv; + + body_str = lua_tolstring(L, -1, &buf_len); + lua_pop(L, 1); + + to_send = buf_len - hlua_hc->sent; + + if ((hlua_hc->sent + to_send) >= buf_len) + end = 1; + + /* the 
end flag is always set since we are using the whole remaining size */ + hlua_hc->sent += httpclient_req_xfer(hlua_hc->hc, ist2(body_str + hlua_hc->sent, to_send), end); + + if (buf_len > hlua_hc->sent) { + /* still need to process the buffer */ + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_httpclient_snd_yield, TICK_ETERNITY, 0)); + } else { + goto rcv; + /* we sent the whole request buffer we can recv */ + } + return 0; + +rcv: + + /* we return a "res" object */ + lua_newtable(L); + + lua_pushstring(L, "body"); + luaL_buffinit(L, &hlua_hc->b); + + task_wakeup(hlua->task, TASK_WOKEN_MSG); + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_httpclient_rcv_yield, TICK_ETERNITY, 0)); + + return 1; +} + +/* + * Send an HTTP request and wait for a response + */ + +__LJMP static int hlua_httpclient_send(lua_State *L, enum http_meth_t meth) +{ + struct hlua_httpclient *hlua_hc; + struct http_hdr *hdrs = NULL; + struct http_hdr *hdrs_i = NULL; + struct hlua *hlua; + const char *url_str = NULL; + const char *body_str = NULL; + size_t buf_len = 0; + int ret; + + hlua = hlua_gethlua(L); + + if (!hlua || !hlua->task) + WILL_LJMP(luaL_error(L, "The 'get' function is only allowed in " + "'frontend', 'backend' or 'task'")); + + if (lua_gettop(L) != 2 || lua_type(L, -1) != LUA_TTABLE) + WILL_LJMP(luaL_error(L, "'get' needs a table as argument")); + + hlua_hc = hlua_checkhttpclient(L, 1); + + lua_pushnil(L); /* first key */ + while (lua_next(L, 2)) { + if (strcmp(lua_tostring(L, -2), "dst") == 0) { + if (httpclient_set_dst(hlua_hc->hc, lua_tostring(L, -1)) < 0) + WILL_LJMP(luaL_error(L, "Can't use the 'dst' argument")); + + } else if (strcmp(lua_tostring(L, -2), "url") == 0) { + if (lua_type(L, -1) != LUA_TSTRING) + WILL_LJMP(luaL_error(L, "invalid parameter in 'url', must be a string")); + url_str = lua_tostring(L, -1); + + } else if (strcmp(lua_tostring(L, -2), "timeout") == 0) { + if (lua_type(L, -1) != LUA_TNUMBER) + WILL_LJMP(luaL_error(L, "invalid parameter in 'timeout', must be a number")); + httpclient_set_timeout(hlua_hc->hc, lua_tointeger(L, -1)); + + } else if (strcmp(lua_tostring(L, -2), "headers") == 0) { + if (lua_type(L, -1) != LUA_TTABLE) + WILL_LJMP(luaL_error(L, "invalid parameter in 'headers', must be a table")); + hdrs = hlua_httpclient_table_to_hdrs(L); + + } else if (strcmp(lua_tostring(L, -2), "body") == 0) { + if (lua_type(L, -1) != LUA_TSTRING) + WILL_LJMP(luaL_error(L, "invalid parameter in 'body', must be a string")); + body_str = lua_tolstring(L, -1, &buf_len); + + } else { + WILL_LJMP(luaL_error(L, "'%s' invalid parameter name", lua_tostring(L, -2))); + } + /* removes 'value'; keeps 'key' for next iteration */ + lua_pop(L, 1); + } + + if (!url_str) { + WILL_LJMP(luaL_error(L, "'get' need a 'url' argument")); + return 0; + } + + hlua_hc->sent = 0; + + istfree(&hlua_hc->hc->req.url); + hlua_hc->hc->req.url = istdup(ist(url_str)); + hlua_hc->hc->req.meth = meth; + + /* update the httpclient callbacks */ + hlua_hc->hc->ops.res_stline = hlua_httpclient_cb; + hlua_hc->hc->ops.res_headers = hlua_httpclient_cb; + hlua_hc->hc->ops.res_payload = hlua_httpclient_cb; + hlua_hc->hc->ops.res_end = hlua_httpclient_cb; + + /* a body is available, it will use the request callback */ + if (body_str && buf_len) { + hlua_hc->hc->ops.req_payload = hlua_httpclient_cb; + } + + ret = httpclient_req_gen(hlua_hc->hc, hlua_hc->hc->req.url, meth, hdrs, IST_NULL); + + /* free the temporary headers array */ + hdrs_i = hdrs; + while (hdrs_i && isttest(hdrs_i->n)) { + istfree(&hdrs_i->n); + istfree(&hdrs_i->v); + hdrs_i++; + } + 
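+	/* Note (illustration, not upstream code): from Lua, this request path
+	 * is typically driven through the documented httpclient class:
+	 *
+	 *   local hc  = core.httpclient()
+	 *   local res = hc:get{url = "http://127.0.0.1:8000/",
+	 *                      headers = { ["accept"] = { "text/plain" } }}
+	 *   core.Debug(res.status .. " " .. (res.body or ""))
+	 *
+	 * The hdrs array built above holds istdup()'ed copies; once
+	 * httpclient_req_gen() has consumed them, each ist and the array
+	 * itself can be released here.
+	 */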
ha_free(&hdrs); + + + if (ret != ERR_NONE) { + WILL_LJMP(luaL_error(L, "Can't generate the HTTP request")); + return 0; + } + + if (!httpclient_start(hlua_hc->hc)) + WILL_LJMP(luaL_error(L, "couldn't start the httpclient")); + + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_httpclient_snd_yield, TICK_ETERNITY, 0)); + + return 0; +} + +/* + * Sends an HTTP HEAD request and wait for a response + * + * httpclient:head(url, headers, payload) + */ +__LJMP static int hlua_httpclient_head(lua_State *L) +{ + return hlua_httpclient_send(L, HTTP_METH_HEAD); +} + +/* + * Send an HTTP GET request and wait for a response + * + * httpclient:get(url, headers, payload) + */ +__LJMP static int hlua_httpclient_get(lua_State *L) +{ + return hlua_httpclient_send(L, HTTP_METH_GET); + +} + +/* + * Sends an HTTP PUT request and wait for a response + * + * httpclient:put(url, headers, payload) + */ +__LJMP static int hlua_httpclient_put(lua_State *L) +{ + return hlua_httpclient_send(L, HTTP_METH_PUT); +} + +/* + * Send an HTTP POST request and wait for a response + * + * httpclient:post(url, headers, payload) + */ +__LJMP static int hlua_httpclient_post(lua_State *L) +{ + return hlua_httpclient_send(L, HTTP_METH_POST); +} + + +/* + * Sends an HTTP DELETE request and wait for a response + * + * httpclient:delete(url, headers, payload) + */ +__LJMP static int hlua_httpclient_delete(lua_State *L) +{ + return hlua_httpclient_send(L, HTTP_METH_DELETE); +} + +/* + * + * + * Class TXN + * + * + */ + +/* Returns a struct hlua_session if the stack entry "ud" is + * a class stream, otherwise it throws an error. + */ +__LJMP static struct hlua_txn *hlua_checktxn(lua_State *L, int ud) +{ + return MAY_LJMP(hlua_checkudata(L, ud, class_txn_ref)); +} + +__LJMP static int hlua_set_var(lua_State *L) +{ + struct hlua_txn *htxn; + const char *name; + size_t len; + struct sample smp; + + if (lua_gettop(L) < 3 || lua_gettop(L) > 4) + WILL_LJMP(luaL_error(L, "'set_var' needs between 3 and 4 arguments")); + + /* It is useles to retrieve the stream, but this function + * runs only in a stream context. + */ + htxn = MAY_LJMP(hlua_checktxn(L, 1)); + name = MAY_LJMP(luaL_checklstring(L, 2, &len)); + + /* Converts the third argument in a sample. */ + memset(&smp, 0, sizeof(smp)); + hlua_lua2smp(L, 3, &smp); + + /* Store the sample in a variable. We don't need to dup the smp, vars API + * already takes care of duplicating dynamic var data. + */ + smp_set_owner(&smp, htxn->p, htxn->s->sess, htxn->s, htxn->dir & SMP_OPT_DIR); + + if (lua_gettop(L) == 4 && lua_toboolean(L, 4)) + lua_pushboolean(L, vars_set_by_name_ifexist(name, len, &smp) != 0); + else + lua_pushboolean(L, vars_set_by_name(name, len, &smp) != 0); + + return 1; +} + +__LJMP static int hlua_unset_var(lua_State *L) +{ + struct hlua_txn *htxn; + const char *name; + size_t len; + struct sample smp; + + MAY_LJMP(check_args(L, 2, "unset_var")); + + /* It is useles to retrieve the stream, but this function + * runs only in a stream context. + */ + htxn = MAY_LJMP(hlua_checktxn(L, 1)); + name = MAY_LJMP(luaL_checklstring(L, 2, &len)); + + /* Unset the variable. */ + smp_set_owner(&smp, htxn->p, htxn->s->sess, htxn->s, htxn->dir & SMP_OPT_DIR); + lua_pushboolean(L, vars_unset_by_name_ifexist(name, len, &smp) != 0); + return 1; +} + +__LJMP static int hlua_get_var(lua_State *L) +{ + struct hlua_txn *htxn; + const char *name; + size_t len; + struct sample smp; + + MAY_LJMP(check_args(L, 2, "get_var")); + + /* It is useles to retrieve the stream, but this function + * runs only in a stream context. 
+ */
+	htxn = MAY_LJMP(hlua_checktxn(L, 1));
+	name = MAY_LJMP(luaL_checklstring(L, 2, &len));
+
+	smp_set_owner(&smp, htxn->p, htxn->s->sess, htxn->s, htxn->dir & SMP_OPT_DIR);
+	if (!vars_get_by_name(name, len, &smp, NULL)) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	return MAY_LJMP(hlua_smp2lua(L, &smp));
+}
+
+__LJMP static int hlua_set_priv(lua_State *L)
+{
+	struct hlua *hlua;
+
+	MAY_LJMP(check_args(L, 2, "set_priv"));
+
+	/* It is useless to retrieve the stream, but this function
+	 * runs only in a stream context.
+	 */
+	MAY_LJMP(hlua_checktxn(L, 1));
+
+	/* Get hlua struct, or NULL if we execute from main lua state */
+	hlua = hlua_gethlua(L);
+	if (!hlua)
+		return 0;
+
+	/* Remove previous value. */
+	luaL_unref(L, LUA_REGISTRYINDEX, hlua->Mref);
+
+	/* Get and store new value. */
+	lua_pushvalue(L, 2); /* Copy the element 2 at the top of the stack. */
+	hlua->Mref = luaL_ref(L, LUA_REGISTRYINDEX); /* pop the previously pushed value. */
+
+	return 0;
+}
+
+__LJMP static int hlua_get_priv(lua_State *L)
+{
+	struct hlua *hlua;
+
+	MAY_LJMP(check_args(L, 1, "get_priv"));
+
+	/* It is useless to retrieve the stream, but this function
+	 * runs only in a stream context.
+	 */
+	MAY_LJMP(hlua_checktxn(L, 1));
+
+	/* Get hlua struct, or NULL if we execute from main lua state */
+	hlua = hlua_gethlua(L);
+	if (!hlua) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	/* Push the configuration index onto the stack. */
+	lua_rawgeti(L, LUA_REGISTRYINDEX, hlua->Mref);
+
+	return 1;
+}
+
+/* Creates a stack entry containing a class TXN. This function
+ * returns 0 if the stack does not contain free slots,
+ * otherwise it returns 1.
+ */
+static int hlua_txn_new(lua_State *L, struct stream *s, struct proxy *p, int dir, int flags)
+{
+	struct hlua_txn *htxn;
+
+	/* Check stack size. */
+	if (!lua_checkstack(L, 3))
+		return 0;
+
+	/* NOTE: The allocation never fails: a failure throws an error,
+	 * and the function never returns. If the throw is not available,
+	 * the process is aborted.
+	 */
+	/* Create the object: obj[0] = userdata. */
+	lua_newtable(L);
+	htxn = lua_newuserdata(L, sizeof(*htxn));
+	lua_rawseti(L, -2, 0);
+
+	htxn->s = s;
+	htxn->p = p;
+	htxn->dir = dir;
+	htxn->flags = flags;
+
+	/* Create the "f" field that contains a list of fetches. */
+	lua_pushstring(L, "f");
+	if (!hlua_fetches_new(L, htxn, HLUA_F_MAY_USE_HTTP))
+		return 0;
+	lua_rawset(L, -3);
+
+	/* Create the "sf" field that contains a list of stringsafe fetches. */
+	lua_pushstring(L, "sf");
+	if (!hlua_fetches_new(L, htxn, HLUA_F_MAY_USE_HTTP | HLUA_F_AS_STRING))
+		return 0;
+	lua_rawset(L, -3);
+
+	/* Create the "c" field that contains a list of converters. */
+	lua_pushstring(L, "c");
+	if (!hlua_converters_new(L, htxn, 0))
+		return 0;
+	lua_rawset(L, -3);
+
+	/* Create the "sc" field that contains a list of stringsafe converters. */
+	lua_pushstring(L, "sc");
+	if (!hlua_converters_new(L, htxn, HLUA_F_AS_STRING))
+		return 0;
+	lua_rawset(L, -3);
+
+	/* Create the "req" field that contains the request channel object. */
+	lua_pushstring(L, "req");
+	if (!hlua_channel_new(L, &s->req))
+		return 0;
+	lua_rawset(L, -3);
+
+	/* Create the "res" field that contains the response channel object. */
+	lua_pushstring(L, "res");
+	if (!hlua_channel_new(L, &s->res))
+		return 0;
+	lua_rawset(L, -3);
+
+	/* Creates the HTTP object if the current proxy allows HTTP.
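+	 *
+	 * Illustration (not upstream code): the fields built above are what a
+	 * Lua action sees on its txn argument, e.g.:
+	 *
+	 *   core.register_action("tag", { "http-req" }, function(txn)
+	 *       local ua = txn.sf:req_fhdr("user-agent")  -- string-safe fetch
+	 *       txn.http:req_set_header("x-ua", ua or "unknown")
+	 *   end)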
*/ + lua_pushstring(L, "http"); + if (IS_HTX_STRM(s)) { + if (!hlua_http_new(L, htxn)) + return 0; + } + else + lua_pushnil(L); + lua_rawset(L, -3); + + if ((htxn->flags & HLUA_TXN_CTX_MASK) == HLUA_TXN_FLT_CTX) { + /* HTTPMessage object are created when a lua TXN is created from + * a filter context only + */ + + /* Creates the HTTP-Request object is the current proxy allows http. */ + lua_pushstring(L, "http_req"); + if (p->mode == PR_MODE_HTTP) { + if (!hlua_http_msg_new(L, &s->txn->req)) + return 0; + } + else + lua_pushnil(L); + lua_rawset(L, -3); + + /* Creates the HTTP-Response object is the current proxy allows http. */ + lua_pushstring(L, "http_res"); + if (p->mode == PR_MODE_HTTP) { + if (!hlua_http_msg_new(L, &s->txn->rsp)) + return 0; + } + else + lua_pushnil(L); + lua_rawset(L, -3); + } + + /* Pop a class sesison metatable and affect it to the userdata. */ + lua_rawgeti(L, LUA_REGISTRYINDEX, class_txn_ref); + lua_setmetatable(L, -2); + + return 1; +} + +__LJMP static int hlua_txn_deflog(lua_State *L) +{ + const char *msg; + struct hlua_txn *htxn; + + MAY_LJMP(check_args(L, 2, "deflog")); + htxn = MAY_LJMP(hlua_checktxn(L, 1)); + msg = MAY_LJMP(luaL_checkstring(L, 2)); + + hlua_sendlog(htxn->s->be, htxn->s->logs.level, msg); + return 0; +} + +__LJMP static int hlua_txn_log(lua_State *L) +{ + int level; + const char *msg; + struct hlua_txn *htxn; + + MAY_LJMP(check_args(L, 3, "log")); + htxn = MAY_LJMP(hlua_checktxn(L, 1)); + level = MAY_LJMP(luaL_checkinteger(L, 2)); + msg = MAY_LJMP(luaL_checkstring(L, 3)); + + if (level < 0 || level >= NB_LOG_LEVELS) + WILL_LJMP(luaL_argerror(L, 1, "Invalid loglevel.")); + + hlua_sendlog(htxn->s->be, level, msg); + return 0; +} + +__LJMP static int hlua_txn_log_debug(lua_State *L) +{ + const char *msg; + struct hlua_txn *htxn; + + MAY_LJMP(check_args(L, 2, "Debug")); + htxn = MAY_LJMP(hlua_checktxn(L, 1)); + msg = MAY_LJMP(luaL_checkstring(L, 2)); + hlua_sendlog(htxn->s->be, LOG_DEBUG, msg); + return 0; +} + +__LJMP static int hlua_txn_log_info(lua_State *L) +{ + const char *msg; + struct hlua_txn *htxn; + + MAY_LJMP(check_args(L, 2, "Info")); + htxn = MAY_LJMP(hlua_checktxn(L, 1)); + msg = MAY_LJMP(luaL_checkstring(L, 2)); + hlua_sendlog(htxn->s->be, LOG_INFO, msg); + return 0; +} + +__LJMP static int hlua_txn_log_warning(lua_State *L) +{ + const char *msg; + struct hlua_txn *htxn; + + MAY_LJMP(check_args(L, 2, "Warning")); + htxn = MAY_LJMP(hlua_checktxn(L, 1)); + msg = MAY_LJMP(luaL_checkstring(L, 2)); + hlua_sendlog(htxn->s->be, LOG_WARNING, msg); + return 0; +} + +__LJMP static int hlua_txn_log_alert(lua_State *L) +{ + const char *msg; + struct hlua_txn *htxn; + + MAY_LJMP(check_args(L, 2, "Alert")); + htxn = MAY_LJMP(hlua_checktxn(L, 1)); + msg = MAY_LJMP(luaL_checkstring(L, 2)); + hlua_sendlog(htxn->s->be, LOG_ALERT, msg); + return 0; +} + +__LJMP static int hlua_txn_set_loglevel(lua_State *L) +{ + struct hlua_txn *htxn; + int ll; + + MAY_LJMP(check_args(L, 2, "set_loglevel")); + htxn = MAY_LJMP(hlua_checktxn(L, 1)); + ll = MAY_LJMP(luaL_checkinteger(L, 2)); + + if (ll < 0 || ll > 7) + WILL_LJMP(luaL_argerror(L, 2, "Bad log level. 
+ +__LJMP static int hlua_txn_set_tos(lua_State *L) +{ + struct hlua_txn *htxn; + int tos; + + MAY_LJMP(check_args(L, 2, "set_tos")); + htxn = MAY_LJMP(hlua_checktxn(L, 1)); + tos = MAY_LJMP(luaL_checkinteger(L, 2)); + + conn_set_tos(objt_conn(htxn->s->sess->origin), tos); + return 0; +} + +__LJMP static int hlua_txn_set_mark(lua_State *L) +{ + struct hlua_txn *htxn; + int mark; + + MAY_LJMP(check_args(L, 2, "set_mark")); + htxn = MAY_LJMP(hlua_checktxn(L, 1)); + mark = MAY_LJMP(luaL_checkinteger(L, 2)); + + conn_set_mark(objt_conn(htxn->s->sess->origin), mark); + return 0; +} + +__LJMP static int hlua_txn_set_priority_class(lua_State *L) +{ + struct hlua_txn *htxn; + + MAY_LJMP(check_args(L, 2, "set_priority_class")); + htxn = MAY_LJMP(hlua_checktxn(L, 1)); + htxn->s->priority_class = queue_limit_class(MAY_LJMP(luaL_checkinteger(L, 2))); + return 0; +} + +__LJMP static int hlua_txn_set_priority_offset(lua_State *L) +{ + struct hlua_txn *htxn; + + MAY_LJMP(check_args(L, 2, "set_priority_offset")); + htxn = MAY_LJMP(hlua_checktxn(L, 1)); + htxn->s->priority_offset = queue_limit_offset(MAY_LJMP(luaL_checkinteger(L, 2))); + return 0; +} + +/* Forward the Reply object to the client. This function converts the reply + * into HTX and pushes it into the response channel. It is responsible for + * forwarding the message and terminating the transaction. It returns 1 on + * success and 0 on error. The Reply must be on top of the stack. + */ +__LJMP static int hlua_txn_forward_reply(lua_State *L, struct stream *s) +{ + struct htx *htx; + struct htx_sl *sl; + struct h1m h1m; + const char *status, *reason, *body; + size_t status_len, reason_len, body_len; + int ret, code, flags; + + code = 200; + status = "200"; + status_len = 3; + ret = lua_getfield(L, -1, "status"); + if (ret == LUA_TNUMBER) { + code = lua_tointeger(L, -1); + status = lua_tolstring(L, -1, &status_len); + } + lua_pop(L, 1); + + reason = http_get_reason(code); + reason_len = strlen(reason); + ret = lua_getfield(L, -1, "reason"); + if (ret == LUA_TSTRING) + reason = lua_tolstring(L, -1, &reason_len); + lua_pop(L, 1); + + body = NULL; + body_len = 0; + ret = lua_getfield(L, -1, "body"); + if (ret == LUA_TSTRING) + body = lua_tolstring(L, -1, &body_len); + lua_pop(L, 1); + + /* Prepare the response before inserting the headers */ + h1m_init_res(&h1m); + htx = htx_from_buf(&s->res.buf); + channel_htx_truncate(&s->res, htx); + if (s->txn->req.flags & HTTP_MSGF_VER_11) { + flags = (HTX_SL_F_IS_RESP|HTX_SL_F_VER_11); + sl = htx_add_stline(htx, HTX_BLK_RES_SL, flags, ist("HTTP/1.1"), + ist2(status, status_len), ist2(reason, reason_len)); + } + else { + flags = HTX_SL_F_IS_RESP; + sl = htx_add_stline(htx, HTX_BLK_RES_SL, flags, ist("HTTP/1.0"), + ist2(status, status_len), ist2(reason, reason_len)); + } + if (!sl) + goto fail; + sl->info.res.status = code; + + /* Push in the stack the "headers" entry. */ + ret = lua_getfield(L, -1, "headers"); + if (ret != LUA_TTABLE) + goto skip_headers; + + lua_pushnil(L); + while (lua_next(L, -2) != 0) { + struct ist name, value; + const char *n, *v; + size_t nlen, vlen; + + if (!lua_isstring(L, -2) || !lua_istable(L, -1)) { + /* Skip element if the key is not a string or if the value is not a table */ + goto next_hdr; + } + + n = lua_tolstring(L, -2, &nlen); + name = ist2(n, nlen); + if (isteqi(name, ist("content-length"))) { + /* Always skip the content-length header. It will be added + * later with the correct length. + */
+ goto next_hdr; + } + + /* Loop on header's values */ + lua_pushnil(L); + while (lua_next(L, -2)) { + if (!lua_isstring(L, -1)) { + /* Skip the value if it is not a string */ + goto next_value; + } + + v = lua_tolstring(L, -1, &vlen); + value = ist2(v, vlen); + + if (isteqi(name, ist("transfer-encoding"))) + h1_parse_xfer_enc_header(&h1m, value); + if (!htx_add_header(htx, ist2(n, nlen), ist2(v, vlen))) + goto fail; + + next_value: + lua_pop(L, 1); + } + + next_hdr: + lua_pop(L, 1); + } + skip_headers: + lua_pop(L, 1); + + /* Update h1m flags: CLEN is set if CHNK is not present */ + if (!(h1m.flags & H1_MF_CHNK)) { + const char *clen = ultoa(body_len); + + h1m.flags |= H1_MF_CLEN; + if (!htx_add_header(htx, ist("content-length"), ist(clen))) + goto fail; + } + if (h1m.flags & (H1_MF_CLEN|H1_MF_CHNK)) + h1m.flags |= H1_MF_XFER_LEN; + + /* Update HTX start-line flags */ + if (h1m.flags & H1_MF_XFER_ENC) + flags |= HTX_SL_F_XFER_ENC; + if (h1m.flags & H1_MF_XFER_LEN) { + flags |= HTX_SL_F_XFER_LEN; + if (h1m.flags & H1_MF_CHNK) + flags |= HTX_SL_F_CHNK; + else if (h1m.flags & H1_MF_CLEN) + flags |= HTX_SL_F_CLEN; + if (h1m.body_len == 0) + flags |= HTX_SL_F_BODYLESS; + } + sl->flags |= flags; + + + if (!htx_add_endof(htx, HTX_BLK_EOH) || + (body_len && !htx_add_data_atonce(htx, ist2(body, body_len)))) + goto fail; + + htx->flags |= HTX_FL_EOM; + + /* Now, forward the response and terminate the transaction */ + s->txn->status = code; + htx_to_buf(htx, &s->res.buf); + if (!http_forward_proxy_resp(s, 1)) + goto fail; + + return 1; + + fail: + channel_htx_truncate(&s->res, htx); + return 0; +} + +/* Terminate a transaction if called from a lua action. For TCP streams, + * processing is just aborted. Nothing is returned to the client and all + * arguments are ignored. For HTTP streams, if a reply is passed as argument, + * it is forwarded to the client before terminating the transaction. On + * success, the function exits with the ACT_RET_DONE code. If an error occurs, + * it exits with the ACT_RET_ERR code. If this function is not called from a + * lua action, it just exits without any processing. + */ +__LJMP static int hlua_txn_done(lua_State *L) +{ + struct hlua_txn *htxn; + struct stream *s; + int finst; + + htxn = MAY_LJMP(hlua_checktxn(L, 1)); + + /* If the NOTERM flag is set, we cannot terminate the session, so we + * just end the execution of the current lua code. */ + if (htxn->flags & HLUA_TXN_NOTERM) + WILL_LJMP(hlua_done(L)); + + s = htxn->s; + if (!IS_HTX_STRM(htxn->s)) { + struct channel *req = &s->req; + struct channel *res = &s->res; + + channel_auto_read(req); + channel_abort(req); + channel_erase(req); + + channel_auto_read(res); + channel_auto_close(res); + sc_schedule_abort(s->scb); + + finst = ((htxn->dir == SMP_OPT_DIR_REQ) ? SF_FINST_R : SF_FINST_D); + goto done; + } + + if (lua_gettop(L) == 1 || !lua_istable(L, 2)) { + /* No reply or invalid reply */ + s->txn->status = 0; + http_reply_and_close(s, 0, NULL); + } + else { + /* Remove extra args to have the reply on top of the stack */ + if (lua_gettop(L) > 2) + lua_pop(L, lua_gettop(L) - 2); + + if (!hlua_txn_forward_reply(L, s)) { + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_PRXCOND; + lua_pushinteger(L, ACT_RET_ERR); + WILL_LJMP(hlua_done(L)); + return 0; /* Never reached */ + } + } + + finst = ((htxn->dir == SMP_OPT_DIR_REQ) ? SF_FINST_R : SF_FINST_H);
+ if (htxn->dir == SMP_OPT_DIR_REQ) { + /* let's log the request time */ + s->logs.request_ts = now_ns; + if (s->sess->fe == s->be) /* report it if the request was intercepted by the frontend */ + _HA_ATOMIC_INC(&s->sess->fe->fe_counters.intercepted_req); + } + + done: + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_LOCAL; + if (!(s->flags & SF_FINST_MASK)) + s->flags |= finst; + + if ((htxn->flags & HLUA_TXN_CTX_MASK) == HLUA_TXN_FLT_CTX) + lua_pushinteger(L, -1); + else + lua_pushinteger(L, ACT_RET_ABRT); + WILL_LJMP(hlua_done(L)); + return 0; +} + +/* + * + * + * Class REPLY + * + * + */ + +/* Pushes the TXN reply onto the top of the stack. If the stack does not have + * free slots, the function fails and returns 0. + */ +static int hlua_txn_reply_new(lua_State *L) +{ + struct hlua_txn *htxn; + const char *reason, *body = NULL; + int ret, status; + + htxn = MAY_LJMP(hlua_checktxn(L, 1)); + if (!IS_HTX_STRM(htxn->s)) { + hlua_pusherror(L, "txn object is not an HTTP transaction."); + WILL_LJMP(lua_error(L)); + } + + /* Default value */ + status = 200; + reason = http_get_reason(status); + + if (lua_istable(L, 2)) { + /* load status and reason from the table argument at index 2 */ + ret = lua_getfield(L, 2, "status"); + if (ret == LUA_TNIL) + goto reason; + else if (ret != LUA_TNUMBER) { + /* invalid status: ignore the reason */ + goto body; + } + status = lua_tointeger(L, -1); + + reason: + lua_pop(L, 1); /* restore the stack: remove status */ + ret = lua_getfield(L, 2, "reason"); + if (ret == LUA_TSTRING) + reason = lua_tostring(L, -1); + + body: + lua_pop(L, 1); /* restore the stack: remove invalid status or reason */ + ret = lua_getfield(L, 2, "body"); + if (ret == LUA_TSTRING) + body = lua_tostring(L, -1); + lua_pop(L, 1); /* restore the stack: remove body */ + } + + /* Create the Reply table */ + lua_newtable(L); + + /* Add status element */ + lua_pushstring(L, "status"); + lua_pushinteger(L, status); + lua_settable(L, -3); + + /* Add reason element */ + reason = http_get_reason(status); + lua_pushstring(L, "reason"); + lua_pushstring(L, reason); + lua_settable(L, -3); + + /* Add body element, nil if undefined */ + lua_pushstring(L, "body"); + if (body) + lua_pushstring(L, body); + else + lua_pushnil(L); + lua_settable(L, -3); + + /* Add headers element */ + lua_pushstring(L, "headers"); + lua_newtable(L); + + /* stack: [ txn, <Arg:table>, <Reply:table>, "headers", <headers:table> ] */ + if (lua_istable(L, 2)) { + /* load headers from the table argument at index 2. If it is a table, copy it. */ + ret = lua_getfield(L, 2, "headers"); + if (ret == LUA_TTABLE) { + /* stack: [ ... <headers:table>, <table> ] */ + lua_pushnil(L); + while (lua_next(L, -2) != 0) { + /* stack: [ ... <headers:table>, <table>, k, v ] */ + if (!lua_isstring(L, -1) && !lua_istable(L, -1)) { + /* invalid value type, skip it */ + lua_pop(L, 1); + continue; + } + + + /* Duplicate the key and swap it with the value. */ + lua_pushvalue(L, -2); + lua_insert(L, -2); + /* stack: [ ... <headers:table>, <table>, k, k, v ] */ + + lua_newtable(L); + lua_insert(L, -2); + /* stack: [ ... <headers:table>, <table>, k, k, <inner:table>, v ] */ + + if (lua_isstring(L, -1)) { + /* push the value in the inner table */ + lua_rawseti(L, -2, 1); + } + else { /* table */ + lua_pushnil(L); + while (lua_next(L, -2) != 0) { + /* stack: [ ... <headers:table>, <table>, k, k, <inner:table>, <v:table>, k2, v2 ] */
+ if (!lua_isstring(L, -1)) { + /* invalid value type, skip it */ + lua_pop(L, 1); + continue; + } + /* push the value in the inner table */ + lua_rawseti(L, -4, lua_rawlen(L, -4) + 1); + /* stack: [ ... <headers:table>, <table>, k, k, <inner:table>, <v:table>, k2 ] */ + } + lua_pop(L, 1); + /* stack: [ ... <headers:table>, <table>, k, k, <inner:table> ] */ + } + + /* push (k,v) on the stack in the headers table: + * stack: [ ... <headers:table>, <table>, k, k, v ] + */ + lua_settable(L, -5); + /* stack: [ ... <headers:table>, <table>, k ] */ + } + } + lua_pop(L, 1); + } + /* stack: [ txn, <Arg:table>, <Reply:table>, "headers", <headers:table> ] */ + lua_settable(L, -3); + /* stack: [ txn, <Arg:table>, <Reply:table> ] */ + + /* Retrieve the class Reply metatable and assign it to the object. */ + lua_rawgeti(L, LUA_REGISTRYINDEX, class_txn_reply_ref); + lua_setmetatable(L, -2); + return 1; +} + +/* Set the reply status code, and optionally the reason. If no reason is + * provided, the default one corresponding to the status code is used. + */ +__LJMP static int hlua_txn_reply_set_status(lua_State *L) +{ + int status = MAY_LJMP(luaL_checkinteger(L, 2)); + const char *reason = MAY_LJMP(luaL_optlstring(L, 3, NULL, NULL)); + + /* First argument (self) must be a table */ + MAY_LJMP(luaL_checktype(L, 1, LUA_TTABLE)); + + if (status < 100 || status > 599) { + lua_pushboolean(L, 0); + return 1; + } + if (!reason) + reason = http_get_reason(status); + + lua_pushinteger(L, status); + lua_setfield(L, 1, "status"); + + lua_pushstring(L, reason); + lua_setfield(L, 1, "reason"); + + lua_pushboolean(L, 1); + return 1; +} + +/* Add a header into the reply object. Each header name is associated to an + * array of values in the "headers" table. If the header name is not found, a + * new entry is created. + */ +__LJMP static int hlua_txn_reply_add_header(lua_State *L) +{ + const char *name = MAY_LJMP(luaL_checkstring(L, 2)); + const char *value = MAY_LJMP(luaL_checkstring(L, 3)); + int ret; + + /* First argument (self) must be a table */ + MAY_LJMP(luaL_checktype(L, 1, LUA_TTABLE)); + + /* Push in the stack the "headers" entry. */ + ret = lua_getfield(L, 1, "headers"); + if (ret != LUA_TTABLE) { + hlua_pusherror(L, "Reply['headers'] is expected to be an array. %s found", lua_typename(L, ret)); + WILL_LJMP(lua_error(L)); + } + + /* check if the header is already registered. If not, register it. */ + ret = lua_getfield(L, -1, name); + if (ret == LUA_TNIL) { + /* Entry not found. */ + lua_pop(L, 1); /* remove the nil. The "headers" table is the top of the stack. */ + + /* Insert the new header name in the "headers" table. This + * leaves the new array on top of the stack. + */ + lua_newtable(L); + lua_pushstring(L, name); + lua_pushvalue(L, -2); + lua_settable(L, -4); + } + else if (ret != LUA_TTABLE) { + hlua_pusherror(L, "Reply['headers']['%s'] is expected to be an array. %s found", name, lua_typename(L, ret)); + WILL_LJMP(lua_error(L)); + } + + /* Now the top of the stack is an array of values. We push + * the header value as a new entry. + */ + lua_pushstring(L, value); + ret = lua_rawlen(L, -2); + lua_rawseti(L, -2, ret + 1); + + lua_pushboolean(L, 1); + return 1; +}
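+ +/* Illustrative use of the Reply class from a Lua action (a sketch; + * method names follow the bindings in this file): + * + * local reply = txn:reply({ status = 403, body = "forbidden", + * headers = { ["cache-control"] = { "no-cache" } } }) + * reply:set_status(429, "Too Many Requests") + * reply:add_header("retry-after", "60") + * reply:set_body("slow down") + * txn:done(reply) -- forwards the reply and ends the transaction + * + * Note that header values in the constructor table are arrays, since + * a header name may carry several values. + */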
+ +/* Remove all occurrences of a given header name. */ +__LJMP static int hlua_txn_reply_del_header(lua_State *L) +{ + const char *name = MAY_LJMP(luaL_checkstring(L, 2)); + int ret; + + /* First argument (self) must be a table */ + MAY_LJMP(luaL_checktype(L, 1, LUA_TTABLE)); + + /* Push in the stack the "headers" entry. */ + ret = lua_getfield(L, 1, "headers"); + if (ret != LUA_TTABLE) { + hlua_pusherror(L, "Reply['headers'] is expected to be an array. %s found", lua_typename(L, ret)); + WILL_LJMP(lua_error(L)); + } + + lua_pushstring(L, name); + lua_pushnil(L); + lua_settable(L, -3); + + lua_pushboolean(L, 1); + return 1; +} + +/* Set the reply's body. Overwrite any existing entry. */ +__LJMP static int hlua_txn_reply_set_body(lua_State *L) +{ + const char *payload = MAY_LJMP(luaL_checkstring(L, 2)); + + /* First argument (self) must be a table */ + MAY_LJMP(luaL_checktype(L, 1, LUA_TTABLE)); + + lua_pushstring(L, payload); + lua_setfield(L, 1, "body"); + + lua_pushboolean(L, 1); + return 1; +} + +__LJMP static int hlua_log(lua_State *L) +{ + int level; + const char *msg; + + MAY_LJMP(check_args(L, 2, "log")); + level = MAY_LJMP(luaL_checkinteger(L, 1)); + msg = MAY_LJMP(luaL_checkstring(L, 2)); + + if (level < 0 || level >= NB_LOG_LEVELS) + WILL_LJMP(luaL_argerror(L, 1, "Invalid loglevel.")); + + hlua_sendlog(NULL, level, msg); + return 0; +} + +__LJMP static int hlua_log_debug(lua_State *L) +{ + const char *msg; + + MAY_LJMP(check_args(L, 1, "debug")); + msg = MAY_LJMP(luaL_checkstring(L, 1)); + hlua_sendlog(NULL, LOG_DEBUG, msg); + return 0; +} + +__LJMP static int hlua_log_info(lua_State *L) +{ + const char *msg; + + MAY_LJMP(check_args(L, 1, "info")); + msg = MAY_LJMP(luaL_checkstring(L, 1)); + hlua_sendlog(NULL, LOG_INFO, msg); + return 0; +} + +__LJMP static int hlua_log_warning(lua_State *L) +{ + const char *msg; + + MAY_LJMP(check_args(L, 1, "warning")); + msg = MAY_LJMP(luaL_checkstring(L, 1)); + hlua_sendlog(NULL, LOG_WARNING, msg); + return 0; +} + +__LJMP static int hlua_log_alert(lua_State *L) +{ + const char *msg; + + MAY_LJMP(check_args(L, 1, "alert")); + msg = MAY_LJMP(luaL_checkstring(L, 1)); + hlua_sendlog(NULL, LOG_ALERT, msg); + return 0; +} + +__LJMP static int hlua_sleep_yield(lua_State *L, int status, lua_KContext ctx) +{ + int wakeup_ms = lua_tointeger(L, -1); + if (!tick_is_expired(wakeup_ms, now_ms)) + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_sleep_yield, wakeup_ms, 0)); + return 0; +} + +__LJMP static int hlua_sleep(lua_State *L) +{ + unsigned int delay; + int wakeup_ms; // tick value + + MAY_LJMP(check_args(L, 1, "sleep")); + + delay = MAY_LJMP(luaL_checkinteger(L, 1)) * 1000; + wakeup_ms = tick_add(now_ms, delay); + lua_pushinteger(L, wakeup_ms); + + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_sleep_yield, wakeup_ms, 0)); + return 0; +} + +__LJMP static int hlua_msleep(lua_State *L) +{ + unsigned int delay; + int wakeup_ms; // tick value + + MAY_LJMP(check_args(L, 1, "msleep")); + + delay = MAY_LJMP(luaL_checkinteger(L, 1)); + wakeup_ms = tick_add(now_ms, delay); + lua_pushinteger(L, wakeup_ms); + + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_sleep_yield, wakeup_ms, 0)); + return 0; +}
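+ +/* Illustrative Lua usage of the bindings above (a sketch): + * + * core.log(core.err, "something went wrong") -- explicit level + * core.Debug("dbg") core.Info("info") -- level shortcuts + * core.sleep(1) -- yield for one second + * core.msleep(10) -- yield for 10 milliseconds + * + * Both sleep variants rely on hlua_sleep_yield() being resumed until + * the computed tick expires. + */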
+ +/* This function is a Lua binding. It gives the hand back to the + * HAProxy scheduler. It is used when the Lua processing consumes + * a lot of time. + */ +__LJMP static int hlua_yield_yield(lua_State *L, int status, lua_KContext ctx) +{ + return 0; +} + +__LJMP static int hlua_yield(lua_State *L) +{ + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_yield_yield, TICK_ETERNITY, HLUA_CTRLYIELD)); + return 0; +} + +/* This function changes the nice value of the currently executing + * task. It is used to set a low or high priority for the current + * task. + */ +__LJMP static int hlua_set_nice(lua_State *L) +{ + struct hlua *hlua; + int nice; + + MAY_LJMP(check_args(L, 1, "set_nice")); + nice = MAY_LJMP(luaL_checkinteger(L, 1)); + + /* Get hlua struct, or NULL if we execute from main lua state */ + hlua = hlua_gethlua(L); + + /* If the task is not set, I'm in a start mode. */ + if (!hlua || !hlua->task) + return 0; + + if (nice < -1024) + nice = -1024; + else if (nice > 1024) + nice = 1024; + + hlua->task->nice = nice; + return 0; +} + +/* safe lua coroutine.create() function: + * + * This is a simple wrapper for coroutine.create() that + * ensures the current hlua state ctx is available from + * the new subroutine state + */ +__LJMP static int hlua_coroutine_create(lua_State *L) +{ + lua_State *new; /* new coroutine state */ + struct hlua **hlua_store; + struct hlua *hlua = hlua_gethlua(L); + + new = lua_newthread(L); + if (!new) + return 0; + + hlua_store = lua_getextraspace(new); + /* Expose current hlua ctx on new lua thread + * (hlua_gethlua() will properly return the last "known" + * hlua ctx instead of NULL when it is called from such coroutines) + */ + *hlua_store = hlua; + + /* new lua thread is on the top of the stack, we + * need to duplicate first stack argument (<f> from coroutine.create(<f>)) + * on the top of the stack to be able to use xmove() to move it on the new + * stack + */ + lua_pushvalue(L, 1); + /* move <f> function to the new stack */ + lua_xmove(L, new, 1); + /* new lua thread is back at the top of the stack */ + return 1; +} + +/* This function is used as a callback of a task. It is called by the + * HAProxy task subsystem when the task is woken up. The Lua runtime can + * return an E_AGAIN signal; the emitter of this signal must set a + * signal to wake the task. + * + * The task wrapper is longjmp-safe because the only Lua code + * executed is the safe hlua_ctx_resume(); + */ +struct task *hlua_process_task(struct task *task, void *context, unsigned int state) +{ + struct hlua *hlua = context; + enum hlua_exec status; + + if (task->tid < 0) + task->tid = tid; + + /* If it is the first call to the task, we must initialize the + * execution timeouts. + */ + if (!HLUA_IS_RUNNING(hlua)) + hlua_timer_init(&hlua->timer, hlua_timeout_task); + + /* Execute the Lua code. */ + status = hlua_ctx_resume(hlua, 1); + + switch (status) { + /* finished or yield */ + case HLUA_E_OK: + hlua_ctx_destroy(hlua); + task_destroy(task); + task = NULL; + break; + + case HLUA_E_AGAIN: /* co process or timeout wake me later. */ + notification_gc(&hlua->com); + task->expire = hlua->wake_time; + break; + + /* finished with error. */ + case HLUA_E_ETMOUT: + SEND_ERR(NULL, "Lua task: execution timeout.\n"); + goto err_task_abort; + case HLUA_E_ERRMSG: + SEND_ERR(NULL, "Lua task: %s.\n", lua_tostring(hlua->T, -1)); + goto err_task_abort; + case HLUA_E_ERR: + default: + SEND_ERR(NULL, "Lua task: unknown error.\n"); + err_task_abort: + hlua_ctx_destroy(hlua); + task_destroy(task); + task = NULL; + break; + } + return task; +} + +/* Helper function to prepare the lua ctx for a given stream + * + * The ctx will be created in the <state_id> parent stack on initial creation. 
+ * If s->hlua->state_id differs from <state_id>, which may happen at + * runtime since an existing stream hlua ctx will be reused for other + * "independent" (but stream-related) lua executions, hlua will be + * recreated with the expected state id. + * + * Returns 1 for success and 0 for failure + */ +static int hlua_stream_ctx_prepare(struct stream *s, int state_id) +{ + /* In the execution wrappers linked with a stream, the + * Lua context may not be initialized yet. This behavior + * saves performance, because a systematic Lua + * initialization causes a 5% performance loss. + */ + ctx_renew: + if (!s->hlua) { + struct hlua *hlua; + + hlua = pool_alloc(pool_head_hlua); + if (!hlua) + return 0; + HLUA_INIT(hlua); + if (!hlua_ctx_init(hlua, state_id, s->task)) { + pool_free(pool_head_hlua, hlua); + return 0; + } + s->hlua = hlua; + } + else if (s->hlua->state_id != state_id) { + /* ctx already created, but not in the proper state. + * It should only happen after the previous execution is + * finished, otherwise it's probably a bug since we don't + * want to abort an unfinished job. + */ + BUG_ON(HLUA_IS_RUNNING(s->hlua)); + hlua_ctx_destroy(s->hlua); + s->hlua = NULL; + goto ctx_renew; + } + return 1; +} + +/* This function is an LUA binding that registers a LUA function to be + * executed after the HAProxy configuration parsing and before the + * HAProxy scheduler starts. This function expects only one LUA + * argument, which is a function. This function returns nothing, but + * throws if an error is encountered. + */ +__LJMP static int hlua_register_init(lua_State *L) +{ + struct hlua_init_function *init; + int ref; + + MAY_LJMP(check_args(L, 1, "register_init")); + + if (hlua_gethlua(L)) { + /* runtime processing */ + WILL_LJMP(luaL_error(L, "register_init: not available outside of body context")); + } + + ref = MAY_LJMP(hlua_checkfunction(L, 1)); + + init = calloc(1, sizeof(*init)); + if (!init) { + hlua_unref(L, ref); + WILL_LJMP(luaL_error(L, "Lua out of memory error.")); + } + + init->function_ref = ref; + LIST_APPEND(&hlua_init_functions[hlua_state_id], &init->l); + return 0; +} + +/* This function is an LUA binding. It registers a task executed in + * parallel with the main HAProxy activity. The task is created and it + * is set in the HAProxy scheduler. It can be called from the "init" + * section, "post init" or during the runtime. + * + * Lua prototype: + * + * <none> core.register_task(<function>[, <arg1>[, <arg2>[, ...[, <arg4>]]]]) + * + * <arg1..4> are optional arguments that will be provided to <function> + */ +__LJMP static int hlua_register_task(lua_State *L) +{ + struct hlua *hlua = NULL; + struct task *task = NULL; + int ref; + int nb_arg; + int it; + int arg_ref[4]; /* optional arguments */ + int state_id; + + nb_arg = lua_gettop(L); + if (nb_arg < 1) + WILL_LJMP(luaL_error(L, "register_task: <func> argument is required")); + else if (nb_arg > 5) + WILL_LJMP(luaL_error(L, "register_task: no more than 4 optional arguments may be provided")); + + /* first arg: function ref */ + ref = MAY_LJMP(hlua_checkfunction(L, 1)); + + /* extract optional args (if any) */ + it = 0; + while (--nb_arg) { + lua_pushvalue(L, 2 + it); + arg_ref[it] = hlua_ref(L); /* get arg reference */ + it += 1; + } + nb_arg = it; + + /* Get the reference state. If the reference is NULL, L is the master + * state, otherwise hlua->T is. 
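+ * + * For illustration (a sketch): a body-context call such as + * core.register_task(function(a, b) core.Info(a .. b) end, "x", "y") + * reaches this point with hlua == NULL, so state_id falls back to + * hlua_state_id below.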
+ */ + hlua = hlua_gethlua(L); + if (hlua) + /* we are in runtime processing */ + state_id = hlua->state_id; + else + /* we are in initialization mode */ + state_id = hlua_state_id; + + hlua = pool_alloc(pool_head_hlua); + if (!hlua) + goto alloc_error; + HLUA_INIT(hlua); + + /* If we are in the common lua state, execute the task anywhere; + * otherwise, inherit the current thread identifier + */ + if (state_id == 0) + task = task_new_anywhere(); + else + task = task_new_here(); + if (!task) + goto alloc_error; + + task->context = hlua; + task->process = hlua_process_task; + + if (!hlua_ctx_init(hlua, state_id, task)) + goto alloc_error; + + /* Ensure there is enough space on the stack for the function + * plus optional arguments + */ + if (!lua_checkstack(hlua->T, (1 + nb_arg))) + goto alloc_error; + + /* Restore the function in the stack. */ + hlua_pushref(hlua->T, ref); + /* function ref not needed anymore since it was pushed to the substack */ + hlua_unref(L, ref); + + hlua->nargs = nb_arg; + + /* push optional arguments to the function */ + for (it = 0; it < nb_arg; it++) { + /* push arg to the stack */ + hlua_pushref(hlua->T, arg_ref[it]); + /* arg ref not needed anymore since it was pushed to the substack */ + hlua_unref(L, arg_ref[it]); + } + + /* Schedule task. */ + task_wakeup(task, TASK_WOKEN_INIT); + + return 0; + + alloc_error: + task_destroy(task); + hlua_unref(L, ref); + for (it = 0; it < nb_arg; it++) { + hlua_unref(L, arg_ref[it]); + } + hlua_ctx_destroy(hlua); + WILL_LJMP(luaL_error(L, "Lua out of memory error.")); + return 0; /* Never reached */ +} + +/* called from unsafe location */ +static void hlua_event_subscription_destroy(struct hlua_event_sub *hlua_sub) +{ + /* hlua cleanup */ + + hlua_lock(hlua_sub->hlua); + /* registry is shared between coroutines */ + hlua_unref(hlua_sub->hlua->T, hlua_sub->fcn_ref); + hlua_unlock(hlua_sub->hlua); + + hlua_ctx_destroy(hlua_sub->hlua); + + /* free */ + pool_free(pool_head_hlua_event_sub, hlua_sub); +} + +/* single event handler: hlua ctx is shared between multiple event handlers + * issued from the same subscription. Thus, it is not destroyed when the event + * is processed: it is destroyed when no more events are expected for the + * subscription (ie: when the subscription ends). + * + * Moreover, events are processed sequentially within the subscription: + * one event must be fully processed before another one may be processed. + * This ensures proper consistency for lua event handling from an ordering + * point of view. This is especially useful with server events for example + * where ADD/DEL/UP/DOWN events ordering really matters to trigger specific + * actions from lua (e.g.: sending emails or making API calls). + * + * Due to this design, each lua event handler is expected to process the event + * as fast as possible to prevent the event queue from growing up. + * Strictly speaking, there is no runtime limit for the callback function + * (timeout set to default task timeout), but if the event queue goes past + * the limit of unconsumed events an error will be reported and the + * subscription will pause itself for as long as it takes for the handler to + * catch up (events will be lost as a result). + * If the event handler does not need the sequential ordering and wants to + * process multiple events at a time, it may spawn a new side-task using + * 'core.register_task' to delegate the event handling and make parallel event + * processing within the same subscription set. 
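+ * + * A minimal sketch of that delegation pattern (illustrative only): + * + * core.event_sub({"SERVER_DOWN"}, function(event, data, sub, when) + * core.register_task(function(ev, d) + * -- slow, order-insensitive processing happens here + * end, event, data) + * end)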
+ */ +static void hlua_event_handler(struct hlua *hlua) +{ + enum hlua_exec status; + + /* If it is the first call to the task, we must initialize the + * execution timeouts. + */ + if (!HLUA_IS_RUNNING(hlua)) + hlua_timer_init(&hlua->timer, hlua_timeout_task); + + /* make sure to reset the task expiry before each hlua_ctx_resume() + * since the task is re-used for multiple cb function calls. + * We cannot risk having t->expire point to a past date because + * it was set during the last function invocation but was never reset + * since (ie: E_AGAIN) + */ + hlua->task->expire = TICK_ETERNITY; + + /* Execute the Lua code. */ + status = hlua_ctx_resume(hlua, 1); + + switch (status) { + /* finished or yield */ + case HLUA_E_OK: + break; + + case HLUA_E_AGAIN: /* co process or timeout wake me later. */ + notification_gc(&hlua->com); + hlua->task->expire = hlua->wake_time; + break; + + /* finished with error. */ + case HLUA_E_ETMOUT: + SEND_ERR(NULL, "Lua event_hdl: execution timeout.\n"); + break; + + case HLUA_E_ERRMSG: + SEND_ERR(NULL, "Lua event_hdl: %s.\n", lua_tostring(hlua->T, -1)); + break; + + case HLUA_E_ERR: + default: + SEND_ERR(NULL, "Lua event_hdl: unknown error.\n"); + break; + } +} + +__LJMP static void hlua_event_hdl_cb_push_event_checkres(lua_State *L, + struct event_hdl_cb_data_server_checkres *check) +{ + lua_pushstring(L, "agent"); + lua_pushboolean(L, check->agent); + lua_settable(L, -3); + lua_pushstring(L, "result"); + switch (check->result) { + case CHK_RES_FAILED: + lua_pushstring(L, "FAILED"); + break; + case CHK_RES_PASSED: + lua_pushstring(L, "PASSED"); + break; + case CHK_RES_CONDPASS: + lua_pushstring(L, "CONDPASS"); + break; + default: + lua_pushnil(L); + break; + } + lua_settable(L, -3); + + lua_pushstring(L, "duration"); + lua_pushinteger(L, check->duration); + lua_settable(L, -3); + + lua_pushstring(L, "reason"); + lua_newtable(L); + + lua_pushstring(L, "short"); + lua_pushstring(L, get_check_status_info(check->reason.status)); + lua_settable(L, -3); + lua_pushstring(L, "desc"); + lua_pushstring(L, get_check_status_description(check->reason.status)); + lua_settable(L, -3); + if (check->reason.status >= HCHK_STATUS_L57DATA) { + /* code only available when the check reached data analysis stage */ + lua_pushstring(L, "code"); + lua_pushinteger(L, check->reason.code); + lua_settable(L, -3); + } + + lua_settable(L, -3); /* reason table */ + + lua_pushstring(L, "health"); + lua_newtable(L); + + lua_pushstring(L, "cur"); + lua_pushinteger(L, check->health.cur); + lua_settable(L, -3); + lua_pushstring(L, "rise"); + lua_pushinteger(L, check->health.rise); + lua_settable(L, -3); + lua_pushstring(L, "fall"); + lua_pushinteger(L, check->health.fall); + lua_settable(L, -3); + + lua_settable(L, -3); /* health table */ +} + +/* This function pushes various arguments such as event type and event data to + * the lua function that will be called to consume the event. 
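+ * + * As pushed below, the handler ends up being called as (a sketch): + * + * function(event, data, sub, when) + * -- event: type string, e.g. "SERVER_UP" + * -- data: table with name/puid/rid/proxy_name/proxy_uuid and, + * -- depending on the event type, state/admin/check subtables + * -- plus a "reference" server object when it still exists + * -- sub: subscription management object + * -- when: event timestamp (in seconds) + * end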
+ */ +__LJMP static void hlua_event_hdl_cb_push_args(struct hlua_event_sub *hlua_sub, + struct event_hdl_async_event *e) +{ + struct hlua *hlua = hlua_sub->hlua; + struct event_hdl_sub_type event = e->type; + void *data = e->data; + + /* push event type */ + hlua->nargs = 1; + lua_pushstring(hlua->T, event_hdl_sub_type_to_string(event)); + + /* push event data (according to event type) */ + if (event_hdl_sub_family_equal(EVENT_HDL_SUB_SERVER, event)) { + struct event_hdl_cb_data_server *e_server = data; + struct proxy *px; + struct server *server; + + hlua->nargs += 1; + lua_newtable(hlua->T); + /* Add server name */ + lua_pushstring(hlua->T, "name"); + lua_pushstring(hlua->T, e_server->safe.name); + lua_settable(hlua->T, -3); + /* Add server puid */ + lua_pushstring(hlua->T, "puid"); + lua_pushinteger(hlua->T, e_server->safe.puid); + lua_settable(hlua->T, -3); + /* Add server rid */ + lua_pushstring(hlua->T, "rid"); + lua_pushinteger(hlua->T, e_server->safe.rid); + lua_settable(hlua->T, -3); + /* Add server proxy name */ + lua_pushstring(hlua->T, "proxy_name"); + lua_pushstring(hlua->T, e_server->safe.proxy_name); + lua_settable(hlua->T, -3); + /* Add server proxy uuid */ + lua_pushstring(hlua->T, "proxy_uuid"); + lua_pushinteger(hlua->T, e_server->safe.proxy_uuid); + lua_settable(hlua->T, -3); + + /* special events, fetch additional info with explicit type casting */ + if (event_hdl_sub_type_equal(EVENT_HDL_SUB_SERVER_STATE, event)) { + struct event_hdl_cb_data_server_state *state = data; + int it; + + if (!lua_checkstack(hlua->T, 20)) + WILL_LJMP(luaL_error(hlua->T, "Lua out of memory error.")); + + /* state subclass */ + lua_pushstring(hlua->T, "state"); + lua_newtable(hlua->T); + + lua_pushstring(hlua->T, "admin"); + lua_pushboolean(hlua->T, state->safe.type); + lua_settable(hlua->T, -3); + + /* is it because of a check ? */ + if (!state->safe.type && + (state->safe.op_st_chg.cause == SRV_OP_STCHGC_HEALTH || + state->safe.op_st_chg.cause == SRV_OP_STCHGC_AGENT)) { + /* yes, provide check result */ + lua_pushstring(hlua->T, "check"); + lua_newtable(hlua->T); + hlua_event_hdl_cb_push_event_checkres(hlua->T, &state->safe.op_st_chg.check); + lua_settable(hlua->T, -3); /* check table */ + } + + lua_pushstring(hlua->T, "cause"); + if (state->safe.type) + lua_pushstring(hlua->T, srv_adm_st_chg_cause(state->safe.adm_st_chg.cause)); + else + lua_pushstring(hlua->T, srv_op_st_chg_cause(state->safe.op_st_chg.cause)); + lua_settable(hlua->T, -3); + + /* old_state, new_state */ + for (it = 0; it < 2; it++) { + enum srv_state srv_state = (!it) ? state->safe.old_state : state->safe.new_state; + + lua_pushstring(hlua->T, (!it) ? 
"old_state" : "new_state"); + switch (srv_state) { + case SRV_ST_STOPPED: + lua_pushstring(hlua->T, "STOPPED"); + break; + case SRV_ST_STOPPING: + lua_pushstring(hlua->T, "STOPPING"); + break; + case SRV_ST_STARTING: + lua_pushstring(hlua->T, "STARTING"); + break; + case SRV_ST_RUNNING: + lua_pushstring(hlua->T, "RUNNING"); + break; + default: + lua_pushnil(hlua->T); + break; + } + lua_settable(hlua->T, -3); + } + + /* requeued */ + lua_pushstring(hlua->T, "requeued"); + lua_pushinteger(hlua->T, state->safe.requeued); + lua_settable(hlua->T, -3); + + lua_settable(hlua->T, -3); /* state table */ + } + else if (event_hdl_sub_type_equal(EVENT_HDL_SUB_SERVER_ADMIN, event)) { + struct event_hdl_cb_data_server_admin *admin = data; + int it; + + if (!lua_checkstack(hlua->T, 20)) + WILL_LJMP(luaL_error(hlua->T, "Lua out of memory error.")); + + /* admin subclass */ + lua_pushstring(hlua->T, "admin"); + lua_newtable(hlua->T); + + lua_pushstring(hlua->T, "cause"); + lua_pushstring(hlua->T, srv_adm_st_chg_cause(admin->safe.cause)); + lua_settable(hlua->T, -3); + + /* old_admin, new_admin */ + for (it = 0; it < 2; it++) { + enum srv_admin srv_admin = (!it) ? admin->safe.old_admin : admin->safe.new_admin; + + lua_pushstring(hlua->T, (!it) ? "old_admin" : "new_admin"); + + /* admin state matrix */ + lua_newtable(hlua->T); + + lua_pushstring(hlua->T, "MAINT"); + lua_pushboolean(hlua->T, srv_admin & SRV_ADMF_MAINT); + lua_settable(hlua->T, -3); + lua_pushstring(hlua->T, "FMAINT"); + lua_pushboolean(hlua->T, srv_admin & SRV_ADMF_FMAINT); + lua_settable(hlua->T, -3); + lua_pushstring(hlua->T, "IMAINT"); + lua_pushboolean(hlua->T, srv_admin & SRV_ADMF_IMAINT); + lua_settable(hlua->T, -3); + lua_pushstring(hlua->T, "RMAINT"); + lua_pushboolean(hlua->T, srv_admin & SRV_ADMF_RMAINT); + lua_settable(hlua->T, -3); + lua_pushstring(hlua->T, "CMAINT"); + lua_pushboolean(hlua->T, srv_admin & SRV_ADMF_CMAINT); + lua_settable(hlua->T, -3); + + lua_pushstring(hlua->T, "DRAIN"); + lua_pushboolean(hlua->T, srv_admin & SRV_ADMF_DRAIN); + lua_settable(hlua->T, -3); + lua_pushstring(hlua->T, "FDRAIN"); + lua_pushboolean(hlua->T, srv_admin & SRV_ADMF_FDRAIN); + lua_settable(hlua->T, -3); + lua_pushstring(hlua->T, "IDRAIN"); + lua_pushboolean(hlua->T, srv_admin & SRV_ADMF_IDRAIN); + lua_settable(hlua->T, -3); + + lua_settable(hlua->T, -3); /* matrix table */ + } + /* requeued */ + lua_pushstring(hlua->T, "requeued"); + lua_pushinteger(hlua->T, admin->safe.requeued); + lua_settable(hlua->T, -3); + + lua_settable(hlua->T, -3); /* admin table */ + } + else if (event_hdl_sub_type_equal(EVENT_HDL_SUB_SERVER_CHECK, event)) { + struct event_hdl_cb_data_server_check *check = data; + + if (!lua_checkstack(hlua->T, 20)) + WILL_LJMP(luaL_error(hlua->T, "Lua out of memory error.")); + + /* check subclass */ + lua_pushstring(hlua->T, "check"); + lua_newtable(hlua->T); + + /* check result snapshot */ + hlua_event_hdl_cb_push_event_checkres(hlua->T, &check->safe.res); + + lua_settable(hlua->T, -3); /* check table */ + } + + /* attempt to provide reference server object + * (if it wasn't removed yet, SERVER_DEL will never succeed here) + */ + px = proxy_find_by_id(e_server->safe.proxy_uuid, PR_CAP_BE, 0); + BUG_ON(!px); + server = findserver_unique_id(px, e_server->safe.puid, e_server->safe.rid); + if (server) { + lua_pushstring(hlua->T, "reference"); + hlua_fcn_new_server(hlua->T, server); + lua_settable(hlua->T, -3); + } + } + /* sub mgmt */ + hlua->nargs += 1; + hlua_fcn_new_event_sub(hlua->T, hlua_sub->sub); + + /* when? 
*/ + hlua->nargs += 1; + lua_pushinteger(hlua->T, e->when.tv_sec); +} + +/* events runner: if there's an ongoing hlua event handling process, finish it. + * Then, check if there are new events waiting to be processed + * (events are processed sequentially) + * + * We have a safety measure to warn/guard if the event queue is growing + * too much due to many events being generated while the lua handler is unable + * to keep up the pace (e.g.: when the event queue grows past 100 unconsumed events). + * TODO: make it tunable + */ +static struct task *hlua_event_runner(struct task *task, void *context, unsigned int state) +{ + struct hlua_event_sub *hlua_sub = context; + struct event_hdl_async_event *event; + const char *error = NULL; + + if (!hlua_sub->paused && event_hdl_async_equeue_size(&hlua_sub->equeue) > 100) { + const char *trace = NULL; + + /* We reached the limit of pending events in the queue: we should + * warn the user and temporarily pause the subscription to give the + * handler a chance to catch up (this also prevents resource shortage + * since the queue could grow indefinitely otherwise). + * TODO: find a way to inform the handler that it missed some events + * (example: stats within the subscription in event_hdl api exposed via lua api?) + * + * Nonetheless, reaching this limit means that the handler is not fast enough + * and/or that it subscribed to events that happen too frequently and did not + * expect it. This could come from an inadequate design in the user's script. + */ + event_hdl_pause(hlua_sub->sub); + hlua_sub->paused = 1; + + if (SET_SAFE_LJMP(hlua_sub->hlua)) { + /* The following Lua call may fail. */ + trace = hlua_traceback(hlua_sub->hlua->T, ", "); + /* At this point the execution is safe. */ + RESET_SAFE_LJMP(hlua_sub->hlua); + } else { + /* Lua error was raised while fetching lua trace from current ctx */ + SEND_ERR(NULL, "Lua event_hdl: unexpected error (memory failure?).\n"); + } + ha_warning("Lua event_hdl: pausing the subscription because the handler fails " + "to keep up the pace (%u unconsumed events) from %s.\n", + event_hdl_async_equeue_size(&hlua_sub->equeue), + (trace) ? trace : "[unknown]"); + } + + if (HLUA_IS_RUNNING(hlua_sub->hlua)) { + /* ongoing hlua event handler, resume it */ + hlua_event_handler(hlua_sub->hlua); + } else if ((event = event_hdl_async_equeue_pop(&hlua_sub->equeue))) { /* check for new events */ + if (event_hdl_sub_type_equal(event->type, EVENT_HDL_SUB_END)) { + /* ending event: no more events to come */ + event_hdl_async_free_event(event); + task_destroy(task); + hlua_event_subscription_destroy(hlua_sub); + return NULL; + } + /* new event: start processing it */ + + /* The following Lua calls can fail. */ + if (!SET_SAFE_LJMP(hlua_sub->hlua)) { + if (lua_type(hlua_sub->hlua->T, -1) == LUA_TSTRING) + error = lua_tostring(hlua_sub->hlua->T, -1); + else + error = "critical error"; + ha_alert("Lua event_hdl: %s.\n", error); + goto skip_event; + } + + /* Check stack available size. */ + if (!lua_checkstack(hlua_sub->hlua->T, 5)) { + ha_alert("Lua event_hdl: full stack.\n"); + RESET_SAFE_LJMP(hlua_sub->hlua); + goto skip_event; + } + + /* Restore the function in the stack. */ + hlua_pushref(hlua_sub->hlua->T, hlua_sub->fcn_ref); + + /* push args */ + hlua_sub->hlua->nargs = 0; + MAY_LJMP(hlua_event_hdl_cb_push_args(hlua_sub, event)); + + /* At this point the execution is safe. 
*/ + RESET_SAFE_LJMP(hlua_sub->hlua); + + /* At this point the event was successfully translated into hlua ctx, + * or hlua error occurred, so we can safely discard it + */ + event_hdl_async_free_event(event); + event = NULL; + + hlua_event_handler(hlua_sub->hlua); + skip_event: + if (event) + event_hdl_async_free_event(event); + + } + + if (!HLUA_IS_RUNNING(hlua_sub->hlua)) { + /* we just finished the processing of one event.. + * check for new events before becoming idle + */ + if (!event_hdl_async_equeue_isempty(&hlua_sub->equeue)) { + /* more events to process, make sure the task + * will be resumed ASAP to process pending events + */ + task_wakeup(task, TASK_WOKEN_OTHER); + } + else if (hlua_sub->paused) { + /* empty queue, the handler caught up: resume the subscription */ + event_hdl_resume(hlua_sub->sub); + hlua_sub->paused = 0; + } + } + + return task; +} + +/* Must be called directly under lua protected/safe environment + * (not from external callback) + * <fcn_ref> should NOT be dropped after the function successfully returns: + * it will be done automatically in hlua_event_subscription_destroy() when the + * subscription ends. + * + * Returns the new subscription on success and NULL on failure (memory error) + */ +static struct event_hdl_sub *hlua_event_subscribe(event_hdl_sub_list *list, struct event_hdl_sub_type e_type, + int state_id, int fcn_ref) +{ + struct hlua_event_sub *hlua_sub; + struct task *task = NULL; + + hlua_sub = pool_alloc(pool_head_hlua_event_sub); + if (!hlua_sub) + goto mem_error; + hlua_sub->task = NULL; + hlua_sub->hlua = NULL; + hlua_sub->paused = 0; + if ((task = task_new_here()) == NULL) { + ha_alert("out of memory while allocating hlua event task"); + goto mem_error; + } + task->process = hlua_event_runner; + task->context = hlua_sub; + event_hdl_async_equeue_init(&hlua_sub->equeue); + hlua_sub->task = task; + hlua_sub->fcn_ref = fcn_ref; + hlua_sub->state_id = state_id; + hlua_sub->hlua = pool_alloc(pool_head_hlua); + if (!hlua_sub->hlua) + goto mem_error; + HLUA_INIT(hlua_sub->hlua); + if (!hlua_ctx_init(hlua_sub->hlua, hlua_sub->state_id, task)) + goto mem_error; + + hlua_sub->sub = event_hdl_subscribe_ptr(list, e_type, + EVENT_HDL_ASYNC_TASK(&hlua_sub->equeue, + task, + hlua_sub, + NULL)); + if (!hlua_sub->sub) + goto mem_error; + + return hlua_sub->sub; /* returns pointer to event_hdl_sub struct */ + + mem_error: + if (hlua_sub) { + task_destroy(hlua_sub->task); + if (hlua_sub->hlua) + hlua_ctx_destroy(hlua_sub->hlua); + pool_free(pool_head_hlua_event_sub, hlua_sub); + } + + return NULL; +} + +/* looks for an array of strings referring to a composition of event_hdl subscription + * types at <index> in <L> stack + */ +__LJMP static struct event_hdl_sub_type hlua_check_event_sub_types(lua_State *L, int index) +{ + struct event_hdl_sub_type subscriptions; + const char *msg; + + if (lua_type(L, index) != LUA_TTABLE) { + msg = lua_pushfstring(L, "table of strings expected, got %s", luaL_typename(L, index)); + luaL_argerror(L, index, msg); + } + + subscriptions = EVENT_HDL_SUB_NONE; + + /* browse the argument as an array. 
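+ * e.g. a Lua caller typically passes something like + * {"SERVER_UP", "SERVER_DOWN"} (illustrative; any event type name + * known to event_hdl is accepted)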
*/ + lua_pushnil(L); + while (lua_next(L, index) != 0) { + if (lua_type(L, -1) != LUA_TSTRING) { + msg = lua_pushfstring(L, "table of strings expected, got %s", luaL_typename(L, index)); + luaL_argerror(L, index, msg); + } + + if (event_hdl_sub_type_equal(EVENT_HDL_SUB_NONE, event_hdl_string_to_sub_type(lua_tostring(L, -1)))) { + msg = lua_pushfstring(L, "'%s' event type is unknown", lua_tostring(L, -1)); + luaL_argerror(L, index, msg); + } + + /* perform subscriptions |= current sub */ + subscriptions = event_hdl_sub_type_add(subscriptions, event_hdl_string_to_sub_type(lua_tostring(L, -1))); + + /* pop the current value. */ + lua_pop(L, 1); + } + + return subscriptions; +} + +/* Wrapper for hlua_fcn_new_event_sub(): catch errors raised by + * the function to prevent LJMP + * + * If no error occurred, the function returns 1, else it returns 0 and + * the error message is pushed at the top of the stack + */ +__LJMP static int _hlua_new_event_sub_safe(lua_State *L) +{ + struct event_hdl_sub *sub = lua_touserdata(L, 1); + + /* this function may raise errors */ + return MAY_LJMP(hlua_fcn_new_event_sub(L, sub)); +} +static int hlua_new_event_sub_safe(lua_State *L, struct event_hdl_sub *sub) +{ + if (!lua_checkstack(L, 2)) + return 0; + lua_pushcfunction(L, _hlua_new_event_sub_safe); + lua_pushlightuserdata(L, sub); + switch (lua_pcall(L, 1, 1, 0)) { + case LUA_OK: + return 1; + default: + /* error was caught */ + return 0; + } +} + +/* This function is a LUA helper used for registering lua event callbacks. + * It expects an event subscription array and the function to be executed + * when subscribed events occur (stack arguments). + * It can be called from the "init" section, "post init" or during the runtime. + * + * <sub_list> is the subscription list where the subscription will be attempted + * + * Pushes the newly allocated subscription on the stack on success + */ +__LJMP int hlua_event_sub(lua_State *L, event_hdl_sub_list *sub_list) +{ + struct hlua *hlua; + struct event_hdl_sub *sub; + struct event_hdl_sub_type subscriptions; + int fcn_ref; + int state_id; + + MAY_LJMP(check_args(L, 2, "event_sub")); + + /* Get the reference state */ + hlua = hlua_gethlua(L); + if (hlua) + /* we are in runtime processing, any thread may subscribe to events: + * subscription events will be handled by the thread who performed + * the registration. + */ + state_id = hlua->state_id; + else { + /* we are in initialization mode, only thread 0 (actual calling thread) + * may subscribe to events to prevent the same handler (from different lua + * stacks) from being registered multiple times + * + * hlua_state_id == 0: monostack (lua-load) + * hlua_state_id > 0: hlua_state_id=tid+1, multi-stack (lua-load-per-thread) + * (thus if hlua_state_id > 1, it means we are not in primary thread ctx) + */ + if (hlua_state_id > 1) + return 0; /* skip registration */ + state_id = hlua_state_id; + } + + /* First argument : event subscriptions. */ + subscriptions = MAY_LJMP(hlua_check_event_sub_types(L, 1)); + + if (event_hdl_sub_type_equal(subscriptions, EVENT_HDL_SUB_NONE)) { + WILL_LJMP(luaL_error(L, "event_sub: no valid event types were provided")); + return 0; /* Never reached */ + } + + /* Second argument : lua function. 
*/ + fcn_ref = MAY_LJMP(hlua_checkfunction(L, 2)); + + /* try to subscribe */ + sub = hlua_event_subscribe(sub_list, subscriptions, state_id, fcn_ref); + if (!sub) { + hlua_unref(L, fcn_ref); + WILL_LJMP(luaL_error(L, "event_sub: lua out of memory error")); + return 0; /* Never reached */ + } + + /* push the subscription to the stack + * + * Here we use the safe function so that lua errors will be + * handled explicitly to prevent 'sub' from being lost + */ + if (!hlua_new_event_sub_safe(L, sub)) { + /* Some events could already be pending in the handler's queue. + * However it is wiser to cancel the subscription since we are unable to + * provide a valid reference to it. + * Pending events will be delivered (unless lua keeps raising errors). + */ + event_hdl_unsubscribe(sub); /* cancel the subscription */ + WILL_LJMP(luaL_error(L, "event_sub: cannot push the subscription (%s)", lua_tostring(L, -1))); + return 0; /* Never reached */ + } + event_hdl_drop(sub); /* sub has been duplicated, discard old ref */ + + return 1; +} + +/* This function is a LUA wrapper used for registering global lua event callbacks. + * The new subscription is pushed onto the stack on success. + * Returns the number of arguments pushed to the stack (1 for success). + */ +__LJMP static int hlua_event_global_sub(lua_State *L) +{ + /* NULL <sub_list> = global subscription list */ + return MAY_LJMP(hlua_event_sub(L, NULL)); +} + +/* Wrapper called by HAProxy to execute an LUA converter. This wrapper + * doesn't allow "yield" functions because the HAProxy engine cannot + * resume converters. + */ +static int hlua_sample_conv_wrapper(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct hlua_function *fcn = private; + struct stream *stream = smp->strm; + const char *error; + + if (!stream) + return 0; + + if (!hlua_stream_ctx_prepare(stream, fcn_ref_to_stack_id(fcn))) { + SEND_ERR(stream->be, "Lua converter '%s': can't initialize Lua context.\n", fcn->name); + return 0; + } + + /* If it is the first run, initialize the data for the call. */ + if (!HLUA_IS_RUNNING(stream->hlua)) { + + /* The following Lua calls can fail. */ + if (!SET_SAFE_LJMP(stream->hlua)) { + if (lua_type(stream->hlua->T, -1) == LUA_TSTRING) + error = lua_tostring(stream->hlua->T, -1); + else + error = "critical error"; + SEND_ERR(stream->be, "Lua converter '%s': %s.\n", fcn->name, error); + return 0; + } + + /* Check stack available size. */ + if (!lua_checkstack(stream->hlua->T, 1)) { + SEND_ERR(stream->be, "Lua converter '%s': full stack.\n", fcn->name); + RESET_SAFE_LJMP(stream->hlua); + return 0; + } + + /* Restore the function in the stack. */ + hlua_pushref(stream->hlua->T, fcn->function_ref[stream->hlua->state_id]); + + /* convert the input sample and push it onto the stack. */ + if (!lua_checkstack(stream->hlua->T, 1)) { + SEND_ERR(stream->be, "Lua converter '%s': full stack.\n", fcn->name); + RESET_SAFE_LJMP(stream->hlua); + return 0; + } + MAY_LJMP(hlua_smp2lua(stream->hlua->T, smp)); + stream->hlua->nargs = 1; + + /* push keywords in the stack. */ + if (arg_p) { + for (; arg_p->type != ARGT_STOP; arg_p++) { + if (!lua_checkstack(stream->hlua->T, 1)) { + SEND_ERR(stream->be, "Lua converter '%s': full stack.\n", fcn->name); + RESET_SAFE_LJMP(stream->hlua); + return 0; + } + MAY_LJMP(hlua_arg2lua(stream->hlua->T, arg_p)); + stream->hlua->nargs++; + } + } + + /* We must initialize the execution timeouts. */ + hlua_timer_init(&stream->hlua->timer, hlua_timeout_session); + + /* At this point the execution is safe. 
*/ + RESET_SAFE_LJMP(stream->hlua); + } + + /* Execute the function. */ + switch (hlua_ctx_resume(stream->hlua, 0)) { + /* finished. */ + case HLUA_E_OK: + /* If the stack is empty, the function fails. */ + if (lua_gettop(stream->hlua->T) <= 0) + return 0; + + /* Convert the returned value into a sample. */ + hlua_lua2smp(stream->hlua->T, -1, smp); + /* dup the smp before popping the related lua value and + * returning it to haproxy + */ + smp_dup(smp); + lua_pop(stream->hlua->T, 1); + return 1; + + /* yield. */ + case HLUA_E_AGAIN: + SEND_ERR(stream->be, "Lua converter '%s': cannot use yielded functions.\n", fcn->name); + return 0; + + /* finished with error. */ + case HLUA_E_ERRMSG: + /* Display log. */ + SEND_ERR(stream->be, "Lua converter '%s': %s.\n", + fcn->name, lua_tostring(stream->hlua->T, -1)); + lua_pop(stream->hlua->T, 1); + return 0; + + case HLUA_E_ETMOUT: + SEND_ERR(stream->be, "Lua converter '%s': execution timeout.\n", fcn->name); + return 0; + + case HLUA_E_NOMEM: + SEND_ERR(stream->be, "Lua converter '%s': out of memory error.\n", fcn->name); + return 0; + + case HLUA_E_YIELD: + SEND_ERR(stream->be, "Lua converter '%s': yield functions like core.tcp() or core.sleep() are not allowed.\n", fcn->name); + return 0; + + case HLUA_E_ERR: + /* Display log. */ + SEND_ERR(stream->be, "Lua converter '%s' returns an unknown error.\n", fcn->name); + __fallthrough; + + default: + return 0; + } +} + +/* Wrapper called by HAProxy to execute a sample-fetch. This wrapper + * doesn't allow "yield" functions because the HAProxy engine cannot + * resume sample-fetches. This function will be called by the sample + * fetch engine to invoke lua-based fetch operations. + */ +static int hlua_sample_fetch_wrapper(const struct arg *arg_p, struct sample *smp, + const char *kw, void *private) +{ + struct hlua_function *fcn = private; + struct stream *stream = smp->strm; + const char *error; + unsigned int hflags = HLUA_TXN_NOTERM | HLUA_TXN_SMP_CTX; + + if (!stream) + return 0; + + if (!hlua_stream_ctx_prepare(stream, fcn_ref_to_stack_id(fcn))) { + SEND_ERR(stream->be, "Lua sample-fetch '%s': can't initialize Lua context.\n", fcn->name); + return 0; + } + + /* If it is the first run, initialize the data for the call. */ + if (!HLUA_IS_RUNNING(stream->hlua)) { + + /* The following Lua calls can fail. */ + if (!SET_SAFE_LJMP(stream->hlua)) { + if (lua_type(stream->hlua->T, -1) == LUA_TSTRING) + error = lua_tostring(stream->hlua->T, -1); + else + error = "critical error"; + SEND_ERR(smp->px, "Lua sample-fetch '%s': %s.\n", fcn->name, error); + return 0; + } + + /* Check stack available size. */ + if (!lua_checkstack(stream->hlua->T, 2)) { + SEND_ERR(smp->px, "Lua sample-fetch '%s': full stack.\n", fcn->name); + RESET_SAFE_LJMP(stream->hlua); + return 0; + } + + /* Restore the function in the stack. */ + hlua_pushref(stream->hlua->T, fcn->function_ref[stream->hlua->state_id]); + + /* push arguments in the stack. */ + if (!hlua_txn_new(stream->hlua->T, stream, smp->px, smp->opt & SMP_OPT_DIR, hflags)) { + SEND_ERR(smp->px, "Lua sample-fetch '%s': full stack.\n", fcn->name); + RESET_SAFE_LJMP(stream->hlua); + return 0; + } + stream->hlua->nargs = 1; + + /* push keywords in the stack. */ + for (; arg_p && arg_p->type != ARGT_STOP; arg_p++) { + /* Check stack available size. 
*/ + if (!lua_checkstack(stream->hlua->T, 1)) { + SEND_ERR(smp->px, "Lua sample-fetch '%s': full stack.\n", fcn->name); + RESET_SAFE_LJMP(stream->hlua); + return 0; + } + MAY_LJMP(hlua_arg2lua(stream->hlua->T, arg_p)); + stream->hlua->nargs++; + } + + /* We must initialize the execution timeouts. */ + hlua_timer_init(&stream->hlua->timer, hlua_timeout_session); + + /* At this point the execution is safe. */ + RESET_SAFE_LJMP(stream->hlua); + } + + /* Execute the function. */ + switch (hlua_ctx_resume(stream->hlua, 0)) { + /* finished. */ + case HLUA_E_OK: + /* If the stack is empty, the function fails. */ + if (lua_gettop(stream->hlua->T) <= 0) + return 0; + + /* Convert the returned value into a sample. */ + hlua_lua2smp(stream->hlua->T, -1, smp); + /* dup the smp before popping the related lua value and + * returning it to haproxy + */ + smp_dup(smp); + lua_pop(stream->hlua->T, 1); + + /* Set the end of execution flag. */ + smp->flags &= ~SMP_F_MAY_CHANGE; + return 1; + + /* yield. */ + case HLUA_E_AGAIN: + SEND_ERR(smp->px, "Lua sample-fetch '%s': cannot use yielded functions.\n", fcn->name); + return 0; + + /* finished with error. */ + case HLUA_E_ERRMSG: + /* Display log. */ + SEND_ERR(smp->px, "Lua sample-fetch '%s': %s.\n", + fcn->name, lua_tostring(stream->hlua->T, -1)); + lua_pop(stream->hlua->T, 1); + return 0; + + case HLUA_E_ETMOUT: + SEND_ERR(smp->px, "Lua sample-fetch '%s': execution timeout.\n", fcn->name); + return 0; + + case HLUA_E_NOMEM: + SEND_ERR(smp->px, "Lua sample-fetch '%s': out of memory error.\n", fcn->name); + return 0; + + case HLUA_E_YIELD: + SEND_ERR(smp->px, "Lua sample-fetch '%s': yield not allowed.\n", fcn->name); + return 0; + + case HLUA_E_ERR: + /* Display log. */ + SEND_ERR(smp->px, "Lua sample-fetch '%s' returns an unknown error.\n", fcn->name); + __fallthrough; + + default: + return 0; + } +} + +/* This function is an LUA binding used for registering + * "sample-conv" functions. It expects a converter name used + * in the haproxy configuration file, and an LUA function. + */ +__LJMP static int hlua_register_converters(lua_State *L) +{ + struct sample_conv_kw_list *sck; + const char *name; + int ref; + int len; + struct hlua_function *fcn = NULL; + struct sample_conv *sc; + struct buffer *trash; + + MAY_LJMP(check_args(L, 2, "register_converters")); + + if (hlua_gethlua(L)) { + /* runtime processing */ + WILL_LJMP(luaL_error(L, "register_converters: not available outside of body context")); + } + + /* First argument : converter name. */ + name = MAY_LJMP(luaL_checkstring(L, 1)); + + /* Second argument : lua function. */ + ref = MAY_LJMP(hlua_checkfunction(L, 2)); + + /* Check if the converter is already registered */ + trash = get_trash_chunk(); + chunk_printf(trash, "lua.%s", name); + sc = find_sample_conv(trash->area, trash->data); + if (sc != NULL) { + fcn = sc->private; + if (fcn->function_ref[hlua_state_id] != -1) { + ha_warning("Trying to register converter 'lua.%s' more than once. " + "This will become a hard error in version 2.5.\n", name); + hlua_unref(L, fcn->function_ref[hlua_state_id]); + } + fcn->function_ref[hlua_state_id] = ref; + return 0; + } + + /* Allocate and fill the sample fetch keyword struct. */ + sck = calloc(1, sizeof(*sck) + sizeof(struct sample_conv) * 2); + if (!sck) + goto alloc_error; + fcn = new_hlua_function(); + if (!fcn) + goto alloc_error; + + /* Fill fcn. 
*/ + fcn->name = strdup(name); + if (!fcn->name) + goto alloc_error; + fcn->function_ref[hlua_state_id] = ref; + + /* List head */ + sck->list.n = sck->list.p = NULL; + + /* converter keyword. */ + len = strlen("lua.") + strlen(name) + 1; + sck->kw[0].kw = calloc(1, len); + if (!sck->kw[0].kw) + goto alloc_error; + + snprintf((char *)sck->kw[0].kw, len, "lua.%s", name); + sck->kw[0].process = hlua_sample_conv_wrapper; + sck->kw[0].arg_mask = ARG12(0,STR,STR,STR,STR,STR,STR,STR,STR,STR,STR,STR,STR); + sck->kw[0].val_args = NULL; + sck->kw[0].in_type = SMP_T_STR; + sck->kw[0].out_type = SMP_T_STR; + sck->kw[0].private = fcn; + + /* Register this new converter */ + sample_register_convs(sck); + + return 0; + + alloc_error: + release_hlua_function(fcn); + hlua_unref(L, ref); + ha_free(&sck); + WILL_LJMP(luaL_error(L, "Lua out of memory error.")); + return 0; /* Never reached */ +} + +/* This function is an LUA binding used for registering + * "sample-fetch" functions. It expects a sample-fetch name used + * in the haproxy configuration file, and an LUA function. + */ +__LJMP static int hlua_register_fetches(lua_State *L) +{ + const char *name; + int ref; + int len; + struct sample_fetch_kw_list *sfk; + struct hlua_function *fcn = NULL; + struct sample_fetch *sf; + struct buffer *trash; + + MAY_LJMP(check_args(L, 2, "register_fetches")); + + if (hlua_gethlua(L)) { + /* runtime processing */ + WILL_LJMP(luaL_error(L, "register_fetches: not available outside of body context")); + } + + /* First argument : sample-fetch name. */ + name = MAY_LJMP(luaL_checkstring(L, 1)); + + /* Second argument : lua function. */ + ref = MAY_LJMP(hlua_checkfunction(L, 2)); + + /* Check if the sample-fetch is already registered */ + trash = get_trash_chunk(); + chunk_printf(trash, "lua.%s", name); + sf = find_sample_fetch(trash->area, trash->data); + if (sf != NULL) { + fcn = sf->private; + if (fcn->function_ref[hlua_state_id] != -1) { + ha_warning("Trying to register sample-fetch 'lua.%s' more than once. " + "This will become a hard error in version 2.5.\n", name); + hlua_unref(L, fcn->function_ref[hlua_state_id]); + } + fcn->function_ref[hlua_state_id] = ref; + return 0; + } + + /* Allocate and fill the sample fetch keyword struct. */ + sfk = calloc(1, sizeof(*sfk) + sizeof(struct sample_fetch) * 2); + if (!sfk) + goto alloc_error; + fcn = new_hlua_function(); + if (!fcn) + goto alloc_error; + + /* Fill fcn. */ + fcn->name = strdup(name); + if (!fcn->name) + goto alloc_error; + fcn->function_ref[hlua_state_id] = ref; + + /* List head */ + sfk->list.n = sfk->list.p = NULL; + + /* sample-fetch keyword. */ + len = strlen("lua.") + strlen(name) + 1; + sfk->kw[0].kw = calloc(1, len); + if (!sfk->kw[0].kw) + goto alloc_error; + + snprintf((char *)sfk->kw[0].kw, len, "lua.%s", name); + sfk->kw[0].process = hlua_sample_fetch_wrapper; + sfk->kw[0].arg_mask = ARG12(0,STR,STR,STR,STR,STR,STR,STR,STR,STR,STR,STR,STR); + sfk->kw[0].val_args = NULL; + sfk->kw[0].out_type = SMP_T_STR; + sfk->kw[0].use = SMP_USE_HTTP_ANY; + sfk->kw[0].val = 0; + sfk->kw[0].private = fcn; + + /* Register this new fetch. */ + sample_register_fetches(sfk); + + return 0; + + alloc_error: + release_hlua_function(fcn); + hlua_unref(L, ref); + ha_free(&sfk); + WILL_LJMP(luaL_error(L, "Lua out of memory error.")); + return 0; /* Never reached */ +}
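+ +/* Illustrative registration from a Lua body context (a sketch, not + * taken from this file): + * + * core.register_fetches("my-fetch", function(txn, arg) + * return "value" + * end) + * + * The fetch is then available in the configuration under the name + * "lua.my-fetch"; converters registered with core.register_converters + * follow the same "lua.<name>" naming scheme. + */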
+ */
+__LJMP static int hlua_set_wake_time(lua_State *L)
+{
+ struct hlua *hlua;
+ unsigned int delay;
+ int wakeup_ms; // tick value
+
+ /* Get hlua struct, or NULL if we execute from main lua state */
+ hlua = hlua_gethlua(L);
+ if (!hlua) {
+ return 0;
+ }
+
+ MAY_LJMP(check_args(L, 1, "wake_time"));
+
+ delay = MAY_LJMP(luaL_checkinteger(L, 1));
+ wakeup_ms = tick_add(now_ms, delay);
+ hlua->wake_time = wakeup_ms;
+ return 0;
+}
+
+/* This function is a wrapper used to execute each Lua function registered as
+ * an action. It may return any ACT_RET_* value. On error, ACT_RET_CONT is
+ * returned and the action is ignored. If the Lua action yields, ACT_RET_YIELD
+ * is returned. On success, the return value is the first element on the stack.
+ */
+static enum act_return hlua_action(struct act_rule *rule, struct proxy *px,
+ struct session *sess, struct stream *s, int flags)
+{
+ char **arg;
+ unsigned int hflags = HLUA_TXN_ACT_CTX;
+ int dir, act_ret = ACT_RET_CONT;
+ const char *error;
+
+ switch (rule->from) {
+ case ACT_F_TCP_REQ_CNT: dir = SMP_OPT_DIR_REQ; break;
+ case ACT_F_TCP_RES_CNT: dir = SMP_OPT_DIR_RES; break;
+ case ACT_F_HTTP_REQ: dir = SMP_OPT_DIR_REQ; break;
+ case ACT_F_HTTP_RES: dir = SMP_OPT_DIR_RES; break;
+ default:
+ SEND_ERR(px, "Lua: internal error while executing action.\n");
+ goto end;
+ }
+
+ if (!hlua_stream_ctx_prepare(s, fcn_ref_to_stack_id(rule->arg.hlua_rule->fcn))) {
+ SEND_ERR(px, "Lua action '%s': can't initialize Lua context.\n",
+ rule->arg.hlua_rule->fcn->name);
+ goto end;
+ }
+
+ /* If it is the first run, initialize the data for the call. */
+ if (!HLUA_IS_RUNNING(s->hlua)) {
+
+ /* The following Lua calls can fail. */
+ if (!SET_SAFE_LJMP(s->hlua)) {
+ if (lua_type(s->hlua->T, -1) == LUA_TSTRING)
+ error = lua_tostring(s->hlua->T, -1);
+ else
+ error = "critical error";
+ SEND_ERR(px, "Lua function '%s': %s.\n",
+ rule->arg.hlua_rule->fcn->name, error);
+ goto end;
+ }
+
+ /* Check stack available size. */
+ if (!lua_checkstack(s->hlua->T, 1)) {
+ SEND_ERR(px, "Lua function '%s': full stack.\n",
+ rule->arg.hlua_rule->fcn->name);
+ RESET_SAFE_LJMP(s->hlua);
+ goto end;
+ }
+
+ /* Restore the function in the stack. */
+ hlua_pushref(s->hlua->T, rule->arg.hlua_rule->fcn->function_ref[s->hlua->state_id]);
+
+ /* Create and push object stream in the stack. */
+ if (!hlua_txn_new(s->hlua->T, s, px, dir, hflags)) {
+ SEND_ERR(px, "Lua function '%s': full stack.\n",
+ rule->arg.hlua_rule->fcn->name);
+ RESET_SAFE_LJMP(s->hlua);
+ goto end;
+ }
+ s->hlua->nargs = 1;
+
+ /* push keywords in the stack. */
+ for (arg = rule->arg.hlua_rule->args; arg && *arg; arg++) {
+ if (!lua_checkstack(s->hlua->T, 1)) {
+ SEND_ERR(px, "Lua function '%s': full stack.\n",
+ rule->arg.hlua_rule->fcn->name);
+ RESET_SAFE_LJMP(s->hlua);
+ goto end;
+ }
+ lua_pushstring(s->hlua->T, *arg);
+ s->hlua->nargs++;
+ }
+
+ /* Now the execution is safe. */
+ RESET_SAFE_LJMP(s->hlua);
+
+ /* We must initialize the execution timeouts. */
+ hlua_timer_init(&s->hlua->timer, hlua_timeout_session);
+ }
+
+ /* Execute the function. */
+ switch (hlua_ctx_resume(s->hlua, !(flags & ACT_OPT_FINAL))) {
+ /* finished. */
+ case HLUA_E_OK:
+ /* Catch the return value */
+ if (lua_gettop(s->hlua->T) > 0)
+ act_ret = lua_tointeger(s->hlua->T, -1);
+
+ /* Set timeout in the required channel. 
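+ *
+ * For illustration only (hedged sketch, not part of the sources): a
+ * yielding action registered from a Lua body section could look like
+ * this, with a made-up name "pause":
+ *
+ *   core.register_action("pause", { "http-req" }, function(txn)
+ *       core.msleep(100) -- yields; hlua_ctx_resume() reports HLUA_E_AGAIN
+ *   end, 0)
+ *
+ * While such an action is yielded, the code below arms the channel's
+ * analyse_exp timer from hlua->wake_time so the stream is woken up in
+ * time.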
*/
+ if (act_ret == ACT_RET_YIELD) {
+ if (flags & ACT_OPT_FINAL)
+ goto err_yield;
+
+ if (dir == SMP_OPT_DIR_REQ)
+ s->req.analyse_exp = tick_first((tick_is_expired(s->req.analyse_exp, now_ms) ? 0 : s->req.analyse_exp),
+ s->hlua->wake_time);
+ else
+ s->res.analyse_exp = tick_first((tick_is_expired(s->res.analyse_exp, now_ms) ? 0 : s->res.analyse_exp),
+ s->hlua->wake_time);
+ }
+ goto end;
+
+ /* yield. */
+ case HLUA_E_AGAIN:
+ /* Set timeout in the required channel. */
+ if (dir == SMP_OPT_DIR_REQ)
+ s->req.analyse_exp = tick_first((tick_is_expired(s->req.analyse_exp, now_ms) ? 0 : s->req.analyse_exp),
+ s->hlua->wake_time);
+ else
+ s->res.analyse_exp = tick_first((tick_is_expired(s->res.analyse_exp, now_ms) ? 0 : s->res.analyse_exp),
+ s->hlua->wake_time);
+
+ /* Some actions can be woken up when a "write" event
+ * is detected on a response channel. This is useful
+ * only for actions targeted on the requests.
+ */
+ if (HLUA_IS_WAKERESWR(s->hlua))
+ s->res.flags |= CF_WAKE_WRITE;
+ if (HLUA_IS_WAKEREQWR(s->hlua))
+ s->req.flags |= CF_WAKE_WRITE;
+ act_ret = ACT_RET_YIELD;
+ goto end;
+
+ /* finished with error. */
+ case HLUA_E_ERRMSG:
+ /* Display log. */
+ SEND_ERR(px, "Lua function '%s': %s.\n",
+ rule->arg.hlua_rule->fcn->name, lua_tostring(s->hlua->T, -1));
+ lua_pop(s->hlua->T, 1);
+ goto end;
+
+ case HLUA_E_ETMOUT:
+ SEND_ERR(px, "Lua function '%s': execution timeout.\n", rule->arg.hlua_rule->fcn->name);
+ goto end;
+
+ case HLUA_E_NOMEM:
+ SEND_ERR(px, "Lua function '%s': out of memory error.\n", rule->arg.hlua_rule->fcn->name);
+ goto end;
+
+ case HLUA_E_YIELD:
+ err_yield:
+ act_ret = ACT_RET_CONT;
+ SEND_ERR(px, "Lua function '%s': yield not allowed.\n",
+ rule->arg.hlua_rule->fcn->name);
+ goto end;
+
+ case HLUA_E_ERR:
+ /* Display log. */
+ SEND_ERR(px, "Lua function '%s' returns an unknown error.\n",
+ rule->arg.hlua_rule->fcn->name);
+
+ default:
+ goto end;
+ }
+
+ end:
+ if (act_ret != ACT_RET_YIELD && s->hlua)
+ s->hlua->wake_time = TICK_ETERNITY;
+ return act_ret;
+}
+
+struct task *hlua_applet_wakeup(struct task *t, void *context, unsigned int state)
+{
+ struct appctx *ctx = context;
+
+ appctx_wakeup(ctx);
+ t->expire = TICK_ETERNITY;
+ return t;
+}
+
+static int hlua_applet_tcp_init(struct appctx *ctx)
+{
+ struct hlua_tcp_ctx *tcp_ctx = applet_reserve_svcctx(ctx, sizeof(*tcp_ctx));
+ struct stconn *sc = appctx_sc(ctx);
+ struct stream *strm = __sc_strm(sc);
+ struct hlua *hlua;
+ struct task *task;
+ char **arg;
+ const char *error;
+
+ hlua = pool_alloc(pool_head_hlua);
+ if (!hlua) {
+ SEND_ERR(strm->be, "Lua applet tcp '%s': out of memory.\n",
+ ctx->rule->arg.hlua_rule->fcn->name);
+ return -1;
+ }
+ HLUA_INIT(hlua);
+ tcp_ctx->hlua = hlua;
+ tcp_ctx->flags = 0;
+
+ /* Create task used by signal to wakeup applets. */
+ task = task_new_here();
+ if (!task) {
+ SEND_ERR(strm->be, "Lua applet tcp '%s': out of memory.\n",
+ ctx->rule->arg.hlua_rule->fcn->name);
+ return -1;
+ }
+ task->nice = 0;
+ task->context = ctx;
+ task->process = hlua_applet_wakeup;
+ tcp_ctx->task = task;
+
+ /* In the execution wrappers linked with a stream, the
+ * Lua context may not be initialized yet. This lazy
+ * behavior saves performance, because a systematic
+ * Lua initialization costs about 5% of performance. 
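+ *
+ * For reference (illustrative, names are made up): this applet backs
+ * services registered with core.register_service(name, "tcp", fct) and
+ * wired from the configuration roughly as:
+ *
+ *   listen echo
+ *       bind :9000
+ *       tcp-request content use-service lua.echo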
+ */
+ if (!hlua_ctx_init(hlua, fcn_ref_to_stack_id(ctx->rule->arg.hlua_rule->fcn), task)) {
+ SEND_ERR(strm->be, "Lua applet tcp '%s': can't initialize Lua context.\n",
+ ctx->rule->arg.hlua_rule->fcn->name);
+ return -1;
+ }
+
+ /* Set timeout according to the applet configuration. */
+ hlua_timer_init(&hlua->timer, ctx->applet->timeout);
+
+ /* The following Lua calls can fail. */
+ if (!SET_SAFE_LJMP(hlua)) {
+ if (lua_type(hlua->T, -1) == LUA_TSTRING)
+ error = lua_tostring(hlua->T, -1);
+ else
+ error = "critical error";
+ SEND_ERR(strm->be, "Lua applet tcp '%s': %s.\n",
+ ctx->rule->arg.hlua_rule->fcn->name, error);
+ return -1;
+ }
+
+ /* Check stack available size. */
+ if (!lua_checkstack(hlua->T, 1)) {
+ SEND_ERR(strm->be, "Lua applet tcp '%s': full stack.\n",
+ ctx->rule->arg.hlua_rule->fcn->name);
+ RESET_SAFE_LJMP(hlua);
+ return -1;
+ }
+
+ /* Restore the function in the stack. */
+ hlua_pushref(hlua->T, ctx->rule->arg.hlua_rule->fcn->function_ref[hlua->state_id]);
+
+ /* Create and push object stream in the stack. */
+ if (!hlua_applet_tcp_new(hlua->T, ctx)) {
+ SEND_ERR(strm->be, "Lua applet tcp '%s': full stack.\n",
+ ctx->rule->arg.hlua_rule->fcn->name);
+ RESET_SAFE_LJMP(hlua);
+ return -1;
+ }
+ hlua->nargs = 1;
+
+ /* push keywords in the stack. */
+ for (arg = ctx->rule->arg.hlua_rule->args; arg && *arg; arg++) {
+ if (!lua_checkstack(hlua->T, 1)) {
+ SEND_ERR(strm->be, "Lua applet tcp '%s': full stack.\n",
+ ctx->rule->arg.hlua_rule->fcn->name);
+ RESET_SAFE_LJMP(hlua);
+ return -1;
+ }
+ lua_pushstring(hlua->T, *arg);
+ hlua->nargs++;
+ }
+
+ RESET_SAFE_LJMP(hlua);
+
+ /* Wakeup the applet ASAP. */
+ applet_need_more_data(ctx);
+ applet_have_more_data(ctx);
+
+ return 0;
+}
+
+void hlua_applet_tcp_fct(struct appctx *ctx)
+{
+ struct hlua_tcp_ctx *tcp_ctx = ctx->svcctx;
+ struct stconn *sc = appctx_sc(ctx);
+ struct stream *strm = __sc_strm(sc);
+ struct act_rule *rule = ctx->rule;
+ struct proxy *px = strm->be;
+ struct hlua *hlua = tcp_ctx->hlua;
+
+ if (unlikely(se_fl_test(ctx->sedesc, (SE_FL_EOS|SE_FL_ERROR|SE_FL_SHR|SE_FL_SHW))))
+ goto out;
+
+ /* The applet execution is already done. */
+ if (tcp_ctx->flags & APPLET_DONE)
+ goto out;
+
+ /* Execute the function. */
+ switch (hlua_ctx_resume(hlua, 1)) {
+ /* finished. */
+ case HLUA_E_OK:
+ tcp_ctx->flags |= APPLET_DONE;
+ se_fl_set(ctx->sedesc, SE_FL_EOI|SE_FL_EOS);
+ break;
+
+ /* yield. */
+ case HLUA_E_AGAIN:
+ if (hlua->wake_time != TICK_ETERNITY)
+ task_schedule(tcp_ctx->task, hlua->wake_time);
+ break;
+
+ /* finished with error. */
+ case HLUA_E_ERRMSG:
+ /* Display log. */
+ SEND_ERR(px, "Lua applet tcp '%s': %s.\n",
+ rule->arg.hlua_rule->fcn->name, lua_tostring(hlua->T, -1));
+ lua_pop(hlua->T, 1);
+ goto error;
+
+ case HLUA_E_ETMOUT:
+ SEND_ERR(px, "Lua applet tcp '%s': execution timeout.\n",
+ rule->arg.hlua_rule->fcn->name);
+ goto error;
+
+ case HLUA_E_NOMEM:
+ SEND_ERR(px, "Lua applet tcp '%s': out of memory error.\n",
+ rule->arg.hlua_rule->fcn->name);
+ goto error;
+
+ case HLUA_E_YIELD: /* unexpected */
+ SEND_ERR(px, "Lua applet tcp '%s': yield not allowed.\n",
+ rule->arg.hlua_rule->fcn->name);
+ goto error;
+
+ case HLUA_E_ERR:
+ /* Display log. 
*/
+ SEND_ERR(px, "Lua applet tcp '%s' returns an unknown error.\n",
+ rule->arg.hlua_rule->fcn->name);
+ goto error;
+
+ default:
+ goto error;
+ }
+
+out:
+ /* eat the whole request */
+ co_skip(sc_oc(sc), co_data(sc_oc(sc)));
+ return;
+
+error:
+ se_fl_set(ctx->sedesc, SE_FL_ERROR);
+ tcp_ctx->flags |= APPLET_DONE;
+ goto out;
+}
+
+static void hlua_applet_tcp_release(struct appctx *ctx)
+{
+ struct hlua_tcp_ctx *tcp_ctx = ctx->svcctx;
+
+ task_destroy(tcp_ctx->task);
+ tcp_ctx->task = NULL;
+ hlua_ctx_destroy(tcp_ctx->hlua);
+ tcp_ctx->hlua = NULL;
+}
+
+/* The function returns 0 if the initialisation is complete or -1 if
+ * an error occurs. It also reserves the appctx for an hlua_http_ctx.
+ */
+static int hlua_applet_http_init(struct appctx *ctx)
+{
+ struct hlua_http_ctx *http_ctx = applet_reserve_svcctx(ctx, sizeof(*http_ctx));
+ struct stconn *sc = appctx_sc(ctx);
+ struct stream *strm = __sc_strm(sc);
+ struct http_txn *txn;
+ struct hlua *hlua;
+ char **arg;
+ struct task *task;
+ const char *error;
+
+ txn = strm->txn;
+ hlua = pool_alloc(pool_head_hlua);
+ if (!hlua) {
+ SEND_ERR(strm->be, "Lua applet http '%s': out of memory.\n",
+ ctx->rule->arg.hlua_rule->fcn->name);
+ return -1;
+ }
+ HLUA_INIT(hlua);
+ http_ctx->hlua = hlua;
+ http_ctx->left_bytes = -1;
+ http_ctx->flags = 0;
+
+ if (txn->req.flags & HTTP_MSGF_VER_11)
+ http_ctx->flags |= APPLET_HTTP11;
+
+ /* Create task used by signal to wakeup applets. */
+ task = task_new_here();
+ if (!task) {
+ SEND_ERR(strm->be, "Lua applet http '%s': out of memory.\n",
+ ctx->rule->arg.hlua_rule->fcn->name);
+ return -1;
+ }
+ task->nice = 0;
+ task->context = ctx;
+ task->process = hlua_applet_wakeup;
+ http_ctx->task = task;
+
+ /* In the execution wrappers linked with a stream, the
+ * Lua context may not be initialized yet. This lazy
+ * behavior saves performance, because a systematic
+ * Lua initialization costs about 5% of performance.
+ */
+ if (!hlua_ctx_init(hlua, fcn_ref_to_stack_id(ctx->rule->arg.hlua_rule->fcn), task)) {
+ SEND_ERR(strm->be, "Lua applet http '%s': can't initialize Lua context.\n",
+ ctx->rule->arg.hlua_rule->fcn->name);
+ return -1;
+ }
+
+ /* Set timeout according to the applet configuration. */
+ hlua_timer_init(&hlua->timer, ctx->applet->timeout);
+
+ /* The following Lua calls can fail. */
+ if (!SET_SAFE_LJMP(hlua)) {
+ if (lua_type(hlua->T, -1) == LUA_TSTRING)
+ error = lua_tostring(hlua->T, -1);
+ else
+ error = "critical error";
+ SEND_ERR(strm->be, "Lua applet http '%s': %s.\n",
+ ctx->rule->arg.hlua_rule->fcn->name, error);
+ return -1;
+ }
+
+ /* Check stack available size. */
+ if (!lua_checkstack(hlua->T, 1)) {
+ SEND_ERR(strm->be, "Lua applet http '%s': full stack.\n",
+ ctx->rule->arg.hlua_rule->fcn->name);
+ RESET_SAFE_LJMP(hlua);
+ return -1;
+ }
+
+ /* Restore the function in the stack. */
+ hlua_pushref(hlua->T, ctx->rule->arg.hlua_rule->fcn->function_ref[hlua->state_id]);
+
+ /* Create and push object stream in the stack. */
+ if (!hlua_applet_http_new(hlua->T, ctx)) {
+ SEND_ERR(strm->be, "Lua applet http '%s': full stack.\n",
+ ctx->rule->arg.hlua_rule->fcn->name);
+ RESET_SAFE_LJMP(hlua);
+ return -1;
+ }
+ hlua->nargs = 1;
+
+ /* push keywords in the stack. 
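+ *
+ * Services currently take no extra keywords (see the TODO in the parsers
+ * below), so this loop is effectively a no-op for them. As a hedged
+ * sketch of the Lua side this applet executes (the name "hello" is made
+ * up):
+ *
+ *   core.register_service("hello", "http", function(applet)
+ *       local body = "Hello from Lua\n"
+ *       applet:set_status(200)
+ *       applet:add_header("content-length", string.format("%d", #body))
+ *       applet:add_header("content-type", "text/plain")
+ *       applet:start_response()
+ *       applet:send(body)
+ *   end)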
*/ + for (arg = ctx->rule->arg.hlua_rule->args; arg && *arg; arg++) { + if (!lua_checkstack(hlua->T, 1)) { + SEND_ERR(strm->be, "Lua applet http '%s': full stack.\n", + ctx->rule->arg.hlua_rule->fcn->name); + RESET_SAFE_LJMP(hlua); + return -1; + } + lua_pushstring(hlua->T, *arg); + hlua->nargs++; + } + + RESET_SAFE_LJMP(hlua); + + /* Wakeup the applet when data is ready for read. */ + applet_need_more_data(ctx); + + return 0; +} + +void hlua_applet_http_fct(struct appctx *ctx) +{ + struct hlua_http_ctx *http_ctx = ctx->svcctx; + struct stconn *sc = appctx_sc(ctx); + struct stream *strm = __sc_strm(sc); + struct channel *req = sc_oc(sc); + struct channel *res = sc_ic(sc); + struct act_rule *rule = ctx->rule; + struct proxy *px = strm->be; + struct hlua *hlua = http_ctx->hlua; + struct htx *req_htx, *res_htx; + + res_htx = htx_from_buf(&res->buf); + + if (unlikely(se_fl_test(ctx->sedesc, (SE_FL_EOS|SE_FL_ERROR|SE_FL_SHR|SE_FL_SHW)))) + goto out; + + /* The applet execution is already done. */ + if (http_ctx->flags & APPLET_DONE) + goto out; + + /* Check if the input buffer is available. */ + if (!b_size(&res->buf)) { + sc_need_room(sc, 0); + goto out; + } + + /* Set the currently running flag. */ + if (!HLUA_IS_RUNNING(hlua) && + !(http_ctx->flags & APPLET_DONE)) { + if (!co_data(req)) { + applet_need_more_data(ctx); + goto out; + } + } + + /* Execute the function. */ + switch (hlua_ctx_resume(hlua, 1)) { + /* finished. */ + case HLUA_E_OK: + http_ctx->flags |= APPLET_DONE; + break; + + /* yield. */ + case HLUA_E_AGAIN: + if (hlua->wake_time != TICK_ETERNITY) + task_schedule(http_ctx->task, hlua->wake_time); + goto out; + + /* finished with error. */ + case HLUA_E_ERRMSG: + /* Display log. */ + SEND_ERR(px, "Lua applet http '%s': %s.\n", + rule->arg.hlua_rule->fcn->name, lua_tostring(hlua->T, -1)); + lua_pop(hlua->T, 1); + goto error; + + case HLUA_E_ETMOUT: + SEND_ERR(px, "Lua applet http '%s': execution timeout.\n", + rule->arg.hlua_rule->fcn->name); + goto error; + + case HLUA_E_NOMEM: + SEND_ERR(px, "Lua applet http '%s': out of memory error.\n", + rule->arg.hlua_rule->fcn->name); + goto error; + + case HLUA_E_YIELD: /* unexpected */ + SEND_ERR(px, "Lua applet http '%s': yield not allowed.\n", + rule->arg.hlua_rule->fcn->name); + goto error; + + case HLUA_E_ERR: + /* Display log. */ + SEND_ERR(px, "Lua applet http '%s' return an unknown error.\n", + rule->arg.hlua_rule->fcn->name); + goto error; + + default: + goto error; + } + + if (http_ctx->flags & APPLET_DONE) { + if (http_ctx->flags & APPLET_RSP_SENT) + goto out; + + if (!(http_ctx->flags & APPLET_HDR_SENT)) + goto error; + + /* no more data are expected. If the response buffer is empty + * for a chunked message, be sure to add something (EOT block in + * this case) to have something to send. It is important to be + * sure the EOM flags will be handled by the endpoint. 
+ */
+ if (htx_is_empty(res_htx) && (strm->txn->rsp.flags & (HTTP_MSGF_XFER_LEN|HTTP_MSGF_CNT_LEN)) == HTTP_MSGF_XFER_LEN) {
+ if (!htx_add_endof(res_htx, HTX_BLK_EOT)) {
+ sc_need_room(sc, sizeof(struct htx_blk)+1);
+ goto out;
+ }
+ channel_add_input(res, 1);
+ }
+
+ res_htx->flags |= HTX_FL_EOM;
+ se_fl_set(ctx->sedesc, SE_FL_EOI|SE_FL_EOS);
+ strm->txn->status = http_ctx->status;
+ http_ctx->flags |= APPLET_RSP_SENT;
+ }
+
+ out:
+ htx_to_buf(res_htx, &res->buf);
+ /* eat the whole request */
+ if (co_data(req)) {
+ req_htx = htx_from_buf(&req->buf);
+ co_htx_skip(req, req_htx, co_data(req));
+ htx_to_buf(req_htx, &req->buf);
+ }
+ return;
+
+ error:
+
+ /* If we are in HTTP mode and have not sent any
+ * data yet, return a 500 server error as best effort:
+ * if there is no room available in the buffer,
+ * just close the connection.
+ */
+ if (!(http_ctx->flags & APPLET_HDR_SENT)) {
+ struct buffer *err = &http_err_chunks[HTTP_ERR_500];
+
+ channel_erase(res);
+ res->buf.data = b_data(err);
+ memcpy(res->buf.area, b_head(err), b_data(err));
+ res_htx = htx_from_buf(&res->buf);
+ channel_add_input(res, res_htx->data);
+ se_fl_set(ctx->sedesc, SE_FL_EOI|SE_FL_EOS);
+ }
+ else
+ se_fl_set(ctx->sedesc, SE_FL_ERROR);
+
+ if (!(strm->flags & SF_ERR_MASK))
+ strm->flags |= SF_ERR_RESOURCE;
+ http_ctx->flags |= APPLET_DONE;
+ goto out;
+}
+
+static void hlua_applet_http_release(struct appctx *ctx)
+{
+ struct hlua_http_ctx *http_ctx = ctx->svcctx;
+
+ task_destroy(http_ctx->task);
+ http_ctx->task = NULL;
+ hlua_ctx_destroy(http_ctx->hlua);
+ http_ctx->hlua = NULL;
+}
+
+/* global {tcp|http}-request parser. Returns ACT_RET_PRS_OK on
+ * success, else returns ACT_RET_PRS_ERR.
+ *
+ * This function can fail with an abort() due to a Lua critical error.
+ * We are in the configuration parsing process of HAProxy, this abort() is
+ * tolerated.
+ */
+static enum act_parse_ret action_register_lua(const char **args, int *cur_arg, struct proxy *px,
+ struct act_rule *rule, char **err)
+{
+ struct hlua_function *fcn = rule->kw->private;
+ int i;
+
+ /* Memory for the rule. */
+ rule->arg.hlua_rule = calloc(1, sizeof(*rule->arg.hlua_rule));
+ if (!rule->arg.hlua_rule) {
+ memprintf(err, "out of memory error");
+ goto error;
+ }
+
+ /* Memory for arguments. */
+ rule->arg.hlua_rule->args = calloc(fcn->nargs + 1,
+ sizeof(*rule->arg.hlua_rule->args));
+ if (!rule->arg.hlua_rule->args) {
+ memprintf(err, "out of memory error");
+ goto error;
+ }
+
+ /* Reference the Lua function and store the reference. 
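+ *
+ * The mandatory arguments declared at registration time (fcn->nargs) are
+ * consumed below from the configuration line. For illustration (names
+ * are hypothetical), an action registered with nargs=2 as "lua.set-pair"
+ * would be written:
+ *
+ *   http-request lua.set-pair foo bar
+ *
+ * and the Lua function then receives "foo" and "bar" after the txn
+ * object.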
*/
+ rule->arg.hlua_rule->fcn = fcn;
+
+ /* Expect some arguments */
+ for (i = 0; i < fcn->nargs; i++) {
+ if (*args[*cur_arg] == '\0') {
+ memprintf(err, "expect %d arguments", fcn->nargs);
+ goto error;
+ }
+ rule->arg.hlua_rule->args[i] = strdup(args[*cur_arg]);
+ if (!rule->arg.hlua_rule->args[i]) {
+ memprintf(err, "out of memory error");
+ goto error;
+ }
+ (*cur_arg)++;
+ }
+ rule->arg.hlua_rule->args[i] = NULL;
+
+ rule->action = ACT_CUSTOM;
+ rule->action_ptr = hlua_action;
+ return ACT_RET_PRS_OK;
+
+ error:
+ if (rule->arg.hlua_rule) {
+ if (rule->arg.hlua_rule->args) {
+ for (i = 0; i < fcn->nargs; i++)
+ ha_free(&rule->arg.hlua_rule->args[i]);
+ ha_free(&rule->arg.hlua_rule->args);
+ }
+ ha_free(&rule->arg.hlua_rule);
+ }
+ return ACT_RET_PRS_ERR;
+}
+
+static enum act_parse_ret action_register_service_http(const char **args, int *cur_arg, struct proxy *px,
+ struct act_rule *rule, char **err)
+{
+ struct hlua_function *fcn = rule->kw->private;
+
+ /* HTTP applets are forbidden in tcp-request rules.
+ * An HTTP applet request requires everything initialized by
+ * "http_process_request" (analyzer flag AN_REQ_HTTP_INNER).
+ * The applet would be initialized immediately, that is before
+ * this analyzer gets called.
+ */
+ if (rule->from != ACT_F_HTTP_REQ) {
+ memprintf(err, "HTTP applets are forbidden from 'tcp-request' rulesets");
+ return ACT_RET_PRS_ERR;
+ }
+
+ /* Memory for the rule. */
+ rule->arg.hlua_rule = calloc(1, sizeof(*rule->arg.hlua_rule));
+ if (!rule->arg.hlua_rule) {
+ memprintf(err, "out of memory error");
+ return ACT_RET_PRS_ERR;
+ }
+
+ /* Reference the Lua function and store the reference. */
+ rule->arg.hlua_rule->fcn = fcn;
+
+ /* TODO: later accept arguments. */
+ rule->arg.hlua_rule->args = NULL;
+
+ /* Add applet pointer in the rule. */
+ rule->applet.obj_type = OBJ_TYPE_APPLET;
+ rule->applet.name = fcn->name;
+ rule->applet.init = hlua_applet_http_init;
+ rule->applet.fct = hlua_applet_http_fct;
+ rule->applet.release = hlua_applet_http_release;
+ rule->applet.timeout = hlua_timeout_applet;
+
+ return ACT_RET_PRS_OK;
+}
+
+/* This function is a Lua binding used for registering actions. It expects
+ * an action name used in the haproxy configuration file, and a Lua function.
+ */
+__LJMP static int hlua_register_action(lua_State *L)
+{
+ struct action_kw_list *akl = NULL;
+ const char *name;
+ int ref;
+ int len;
+ struct hlua_function *fcn = NULL;
+ int nargs;
+ struct buffer *trash;
+ struct action_kw *akw;
+
+ /* Initialise the number of expected arguments at 0. */
+ nargs = 0;
+
+ if (lua_gettop(L) < 3 || lua_gettop(L) > 4)
+ WILL_LJMP(luaL_error(L, "'register_action' needs between 3 and 4 arguments"));
+
+ if (hlua_gethlua(L)) {
+ /* runtime processing */
+ WILL_LJMP(luaL_error(L, "register_action: not available outside of body context"));
+ }
+
+ /* First argument : action name. */
+ name = MAY_LJMP(luaL_checkstring(L, 1));
+
+ /* Second argument : environment. */
+ if (lua_type(L, 2) != LUA_TTABLE)
+ WILL_LJMP(luaL_error(L, "register_action: second argument must be a table of strings"));
+
+ /* Third argument : lua function. */
+ ref = MAY_LJMP(hlua_checkfunction(L, 3));
+
+ /* Fourth argument : number of mandatory arguments expected on the configuration line. */
+ if (lua_gettop(L) >= 4)
+ nargs = MAY_LJMP(luaL_checkinteger(L, 4));
+
+ /* browse the second argument as an array. 
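+ *
+ * Each string in that table selects a rule set the action is attached
+ * to. Hedged example (names made up):
+ *
+ *   core.register_action("track-src", { "tcp-req", "http-req" },
+ *       function(txn)
+ *           txn:set_var("txn.src", txn.f:src())
+ *       end, 0)
+ *
+ * registers "lua.track-src" for both "tcp-request content" and
+ * "http-request" rules.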
*/
+ lua_pushnil(L);
+ while (lua_next(L, 2) != 0) {
+ if (lua_type(L, -1) != LUA_TSTRING) {
+ hlua_unref(L, ref);
+ WILL_LJMP(luaL_error(L, "register_action: second argument must be a table of strings"));
+ }
+
+ /* Check if action exists */
+ trash = get_trash_chunk();
+ chunk_printf(trash, "lua.%s", name);
+ if (strcmp(lua_tostring(L, -1), "tcp-req") == 0) {
+ akw = tcp_req_cont_action(trash->area);
+ } else if (strcmp(lua_tostring(L, -1), "tcp-res") == 0) {
+ akw = tcp_res_cont_action(trash->area);
+ } else if (strcmp(lua_tostring(L, -1), "http-req") == 0) {
+ akw = action_http_req_custom(trash->area);
+ } else if (strcmp(lua_tostring(L, -1), "http-res") == 0) {
+ akw = action_http_res_custom(trash->area);
+ } else if (strcmp(lua_tostring(L, -1), "http-after-res") == 0) {
+ akw = action_http_after_res_custom(trash->area);
+ } else {
+ akw = NULL;
+ }
+ if (akw != NULL) {
+ fcn = akw->private;
+ if (fcn->function_ref[hlua_state_id] != -1) {
+ ha_warning("Trying to register action 'lua.%s' more than once. "
+ "This will become a hard error in version 2.5.\n", name);
+ hlua_unref(L, fcn->function_ref[hlua_state_id]);
+ }
+ fcn->function_ref[hlua_state_id] = ref;
+
+ /* pop the environment string. */
+ lua_pop(L, 1);
+ continue;
+ }
+
+ /* Allocate and fill the action keyword struct. */
+ akl = calloc(1, sizeof(*akl) + sizeof(struct action_kw) * 2);
+ if (!akl)
+ goto alloc_error;
+ fcn = new_hlua_function();
+ if (!fcn)
+ goto alloc_error;
+
+ /* Fill fcn. */
+ fcn->name = strdup(name);
+ if (!fcn->name)
+ goto alloc_error;
+ fcn->function_ref[hlua_state_id] = ref;
+
+ /* Set the expected number of arguments. */
+ fcn->nargs = nargs;
+
+ /* List head */
+ akl->list.n = akl->list.p = NULL;
+
+ /* action keyword. */
+ len = strlen("lua.") + strlen(name) + 1;
+ akl->kw[0].kw = calloc(1, len);
+ if (!akl->kw[0].kw)
+ goto alloc_error;
+
+ snprintf((char *)akl->kw[0].kw, len, "lua.%s", name);
+
+ akl->kw[0].flags = 0;
+ akl->kw[0].private = fcn;
+ akl->kw[0].parse = action_register_lua;
+
+ /* select the action registering point. */
+ if (strcmp(lua_tostring(L, -1), "tcp-req") == 0)
+ tcp_req_cont_keywords_register(akl);
+ else if (strcmp(lua_tostring(L, -1), "tcp-res") == 0)
+ tcp_res_cont_keywords_register(akl);
+ else if (strcmp(lua_tostring(L, -1), "http-req") == 0)
+ http_req_keywords_register(akl);
+ else if (strcmp(lua_tostring(L, -1), "http-res") == 0)
+ http_res_keywords_register(akl);
+ else if (strcmp(lua_tostring(L, -1), "http-after-res") == 0)
+ http_after_res_keywords_register(akl);
+ else {
+ release_hlua_function(fcn);
+ hlua_unref(L, ref);
+ if (akl)
+ ha_free((char **)&(akl->kw[0].kw));
+ ha_free(&akl);
+ WILL_LJMP(luaL_error(L, "Lua action environment '%s' is unknown. "
+ "'tcp-req', 'tcp-res', 'http-req', 'http-res' "
+ "or 'http-after-res' "
+ "are expected.", lua_tostring(L, -1)));
+ }
+
+ /* pop the environment string. 
*/
+ lua_pop(L, 1);
+
+ /* reset for next loop */
+ akl = NULL;
+ fcn = NULL;
+ }
+ return 0; /* number of Lua results */
+
+ alloc_error:
+ release_hlua_function(fcn);
+ hlua_unref(L, ref);
+ ha_free(&akl);
+ WILL_LJMP(luaL_error(L, "Lua out of memory error."));
+ return 0; /* Never reached */
+}
+
+static enum act_parse_ret action_register_service_tcp(const char **args, int *cur_arg, struct proxy *px,
+ struct act_rule *rule, char **err)
+{
+ struct hlua_function *fcn = rule->kw->private;
+
+ if (px->mode == PR_MODE_HTTP) {
+ memprintf(err, "Lua TCP services cannot be used on HTTP proxies");
+ return ACT_RET_PRS_ERR;
+ }
+
+ /* Memory for the rule. */
+ rule->arg.hlua_rule = calloc(1, sizeof(*rule->arg.hlua_rule));
+ if (!rule->arg.hlua_rule) {
+ memprintf(err, "out of memory error");
+ return ACT_RET_PRS_ERR;
+ }
+
+ /* Reference the Lua function and store the reference. */
+ rule->arg.hlua_rule->fcn = fcn;
+
+ /* TODO: later accept arguments. */
+ rule->arg.hlua_rule->args = NULL;
+
+ /* Add applet pointer in the rule. */
+ rule->applet.obj_type = OBJ_TYPE_APPLET;
+ rule->applet.name = fcn->name;
+ rule->applet.init = hlua_applet_tcp_init;
+ rule->applet.fct = hlua_applet_tcp_fct;
+ rule->applet.release = hlua_applet_tcp_release;
+ rule->applet.timeout = hlua_timeout_applet;
+
+ return ACT_RET_PRS_OK;
+}
+
+/* This function is a Lua binding used for registering services. It expects
+ * a service name used in the haproxy configuration file, and a Lua function.
+ */
+__LJMP static int hlua_register_service(lua_State *L)
+{
+ struct action_kw_list *akl;
+ const char *name;
+ const char *env;
+ int ref;
+ int len;
+ struct hlua_function *fcn = NULL;
+ struct buffer *trash;
+ struct action_kw *akw;
+
+ MAY_LJMP(check_args(L, 3, "register_service"));
+
+ if (hlua_gethlua(L)) {
+ /* runtime processing */
+ WILL_LJMP(luaL_error(L, "register_service: not available outside of body context"));
+ }
+
+ /* First argument : service name. */
+ name = MAY_LJMP(luaL_checkstring(L, 1));
+
+ /* Second argument : environment. */
+ env = MAY_LJMP(luaL_checkstring(L, 2));
+
+ /* Third argument : lua function. */
+ ref = MAY_LJMP(hlua_checkfunction(L, 3));
+
+ /* Check for service already registered */
+ trash = get_trash_chunk();
+ chunk_printf(trash, "lua.%s", name);
+ akw = service_find(trash->area);
+ if (akw != NULL) {
+ fcn = akw->private;
+ if (fcn->function_ref[hlua_state_id] != -1) {
+ ha_warning("Trying to register service 'lua.%s' more than once. "
+ "This will become a hard error in version 2.5.\n", name);
+ hlua_unref(L, fcn->function_ref[hlua_state_id]);
+ }
+ fcn->function_ref[hlua_state_id] = ref;
+ return 0;
+ }
+
+ /* Allocate and fill the action keyword struct. */
+ akl = calloc(1, sizeof(*akl) + sizeof(struct action_kw) * 2);
+ if (!akl)
+ goto alloc_error;
+ fcn = new_hlua_function();
+ if (!fcn)
+ goto alloc_error;
+
+ /* Fill fcn. */
+ len = strlen("<lua.>") + strlen(name) + 1;
+ fcn->name = calloc(1, len);
+ if (!fcn->name)
+ goto alloc_error;
+ snprintf((char *)fcn->name, len, "<lua.%s>", name);
+ fcn->function_ref[hlua_state_id] = ref;
+
+ /* List head */
+ akl->list.n = akl->list.p = NULL;
+
+ /* service keyword. */
+ len = strlen("lua.") + strlen(name) + 1;
+ akl->kw[0].kw = calloc(1, len);
+ if (!akl->kw[0].kw)
+ goto alloc_error;
+
+ snprintf((char *)akl->kw[0].kw, len, "lua.%s", name);
+
+ /* Check required environment. Only "http" or "tcp" are accepted. 
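+ *
+ * Hedged illustration (the service name is made up): registering
+ *
+ *   core.register_service("echo", "tcp", function(applet)
+ *       applet:send(applet:getline())
+ *   end)
+ *
+ * makes "lua.echo" usable with "tcp-request content use-service", while
+ * an "http" service is used through "http-request use-service".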
*/ + if (strcmp(env, "tcp") == 0) + akl->kw[0].parse = action_register_service_tcp; + else if (strcmp(env, "http") == 0) + akl->kw[0].parse = action_register_service_http; + else { + release_hlua_function(fcn); + hlua_unref(L, ref); + if (akl) + ha_free((char **)&(akl->kw[0].kw)); + ha_free(&akl); + WILL_LJMP(luaL_error(L, "Lua service environment '%s' is unknown. " + "'tcp' or 'http' are expected.", env)); + } + + akl->kw[0].flags = 0; + akl->kw[0].private = fcn; + + /* End of array. */ + memset(&akl->kw[1], 0, sizeof(*akl->kw)); + + /* Register this new converter */ + service_keywords_register(akl); + + return 0; + + alloc_error: + release_hlua_function(fcn); + hlua_unref(L, ref); + ha_free(&akl); + WILL_LJMP(luaL_error(L, "Lua out of memory error.")); + return 0; /* Never reached */ +} + +/* This function initialises Lua cli handler. It copies the + * arguments in the Lua stack and create channel IO objects. + */ +static int hlua_cli_parse_fct(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct hlua_cli_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + struct hlua *hlua; + struct hlua_function *fcn; + int i; + const char *error; + + fcn = private; + ctx->fcn = private; + + hlua = pool_alloc(pool_head_hlua); + if (!hlua) { + SEND_ERR(NULL, "Lua cli '%s': out of memory.\n", fcn->name); + return 1; + } + HLUA_INIT(hlua); + ctx->hlua = hlua; + + /* Create task used by signal to wakeup applets. + * We use the same wakeup function than the Lua applet_tcp and + * applet_http. It is absolutely compatible. + */ + ctx->task = task_new_here(); + if (!ctx->task) { + SEND_ERR(NULL, "Lua cli '%s': out of memory.\n", fcn->name); + goto error; + } + ctx->task->nice = 0; + ctx->task->context = appctx; + ctx->task->process = hlua_applet_wakeup; + + /* Initialises the Lua context */ + if (!hlua_ctx_init(hlua, fcn_ref_to_stack_id(fcn), ctx->task)) { + SEND_ERR(NULL, "Lua cli '%s': can't initialize Lua context.\n", fcn->name); + goto error; + } + + /* The following Lua calls can fail. */ + if (!SET_SAFE_LJMP(hlua)) { + if (lua_type(hlua->T, -1) == LUA_TSTRING) + error = lua_tostring(hlua->T, -1); + else + error = "critical error"; + SEND_ERR(NULL, "Lua cli '%s': %s.\n", fcn->name, error); + goto error; + } + + /* Check stack available size. */ + if (!lua_checkstack(hlua->T, 2)) { + SEND_ERR(NULL, "Lua cli '%s': full stack.\n", fcn->name); + goto error; + } + + /* Restore the function in the stack. */ + hlua_pushref(hlua->T, fcn->function_ref[hlua->state_id]); + + /* Once the arguments parsed, the CLI is like an AppletTCP, + * so push AppletTCP in the stack. + */ + if (!hlua_applet_tcp_new(hlua->T, appctx)) { + SEND_ERR(NULL, "Lua cli '%s': full stack.\n", fcn->name); + goto error; + } + hlua->nargs = 1; + + /* push keywords in the stack. */ + for (i = 0; *args[i]; i++) { + /* Check stack available size. */ + if (!lua_checkstack(hlua->T, 1)) { + SEND_ERR(NULL, "Lua cli '%s': full stack.\n", fcn->name); + goto error; + } + lua_pushstring(hlua->T, args[i]); + hlua->nargs++; + } + + /* We must initialize the execution timeouts. */ + hlua_timer_init(&hlua->timer, hlua_timeout_session); + + /* At this point the execution is safe. */ + RESET_SAFE_LJMP(hlua); + + /* It's ok */ + return 0; + + /* It's not ok. 
*/
+error:
+ RESET_SAFE_LJMP(hlua);
+ hlua_ctx_destroy(hlua);
+ ctx->hlua = NULL;
+ return 1;
+}
+
+static int hlua_cli_io_handler_fct(struct appctx *appctx)
+{
+ struct hlua_cli_ctx *ctx = appctx->svcctx;
+ struct hlua *hlua;
+ struct stconn *sc;
+ struct hlua_function *fcn;
+
+ hlua = ctx->hlua;
+ sc = appctx_sc(appctx);
+ fcn = ctx->fcn;
+
+ /* Execute the function. */
+ switch (hlua_ctx_resume(hlua, 1)) {
+
+ /* finished. */
+ case HLUA_E_OK:
+ return 1;
+
+ /* yield. */
+ case HLUA_E_AGAIN:
+ /* We want write. */
+ if (HLUA_IS_WAKERESWR(hlua))
+ sc_need_room(sc, -1);
+ /* Set the timeout. */
+ if (hlua->wake_time != TICK_ETERNITY)
+ task_schedule(hlua->task, hlua->wake_time);
+ return 0;
+
+ /* finished with error. */
+ case HLUA_E_ERRMSG:
+ /* Display log. */
+ SEND_ERR(NULL, "Lua cli '%s': %s.\n",
+ fcn->name, lua_tostring(hlua->T, -1));
+ lua_pop(hlua->T, 1);
+ return 1;
+
+ case HLUA_E_ETMOUT:
+ SEND_ERR(NULL, "Lua cli '%s': execution timeout.\n",
+ fcn->name);
+ return 1;
+
+ case HLUA_E_NOMEM:
+ SEND_ERR(NULL, "Lua cli '%s': out of memory error.\n",
+ fcn->name);
+ return 1;
+
+ case HLUA_E_YIELD: /* unexpected */
+ SEND_ERR(NULL, "Lua cli '%s': yield not allowed.\n",
+ fcn->name);
+ return 1;
+
+ case HLUA_E_ERR:
+ /* Display log. */
+ SEND_ERR(NULL, "Lua cli '%s' returns an unknown error.\n",
+ fcn->name);
+ return 1;
+
+ default:
+ return 1;
+ }
+
+ return 1;
+}
+
+static void hlua_cli_io_release_fct(struct appctx *appctx)
+{
+ struct hlua_cli_ctx *ctx = appctx->svcctx;
+
+ hlua_ctx_destroy(ctx->hlua);
+ ctx->hlua = NULL;
+}
+
+/* This function is a Lua binding used for registering new keywords
+ * in the cli. It expects a list of keywords which form the "path"
+ * (limited to 5 entries), a description of the command, and a
+ * function implementing the io handler.
+ */
+__LJMP static int hlua_register_cli(lua_State *L)
+{
+ struct cli_kw_list *cli_kws;
+ const char *message;
+ int ref_io;
+ int len;
+ struct hlua_function *fcn = NULL;
+ int index;
+ int i;
+ struct buffer *trash;
+ const char *kw[5];
+ struct cli_kw *cli_kw;
+ const char *errmsg;
+ char *end;
+
+ MAY_LJMP(check_args(L, 3, "register_cli"));
+
+ if (hlua_gethlua(L)) {
+ /* runtime processing */
+ WILL_LJMP(luaL_error(L, "register_cli: not available outside of body context"));
+ }
+
+ /* First argument : an array of maximum 5 keywords. */
+ if (!lua_istable(L, 1))
+ WILL_LJMP(luaL_argerror(L, 1, "1st argument must be a table"));
+
+ /* Second argument : string with contextual message. */
+ message = MAY_LJMP(luaL_checkstring(L, 2));
+
+ /* Third argument : lua function. 
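+ *
+ * Hedged sketch of the Lua side (keywords are made up):
+ *
+ *   core.register_cli({ "show", "lua-info" }, "Show some Lua info",
+ *       function(applet)
+ *           applet:send("it works\n")
+ *       end)
+ *
+ * Once the command line is parsed, the handler runs like an AppletTCP,
+ * as set up in hlua_cli_parse_fct() above.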
*/ + ref_io = MAY_LJMP(hlua_checkfunction(L, 3)); + + /* Check for CLI service already registered */ + trash = get_trash_chunk(); + index = 0; + lua_pushnil(L); + memset(kw, 0, sizeof(kw)); + while (lua_next(L, 1) != 0) { + if (index >= CLI_PREFIX_KW_NB) { + hlua_unref(L, ref_io); + WILL_LJMP(luaL_argerror(L, 1, "1st argument must be a table with a maximum of 5 entries")); + } + if (lua_type(L, -1) != LUA_TSTRING) { + hlua_unref(L, ref_io); + WILL_LJMP(luaL_argerror(L, 1, "1st argument must be a table filled with strings")); + } + kw[index] = lua_tostring(L, -1); + if (index == 0) + chunk_printf(trash, "%s", kw[index]); + else + chunk_appendf(trash, " %s", kw[index]); + index++; + lua_pop(L, 1); + } + cli_kw = cli_find_kw_exact((char **)kw); + if (cli_kw != NULL) { + fcn = cli_kw->private; + if (fcn->function_ref[hlua_state_id] != -1) { + ha_warning("Trying to register CLI keyword 'lua.%s' more than once. " + "This will become a hard error in version 2.5.\n", trash->area); + hlua_unref(L, fcn->function_ref[hlua_state_id]); + } + fcn->function_ref[hlua_state_id] = ref_io; + return 0; + } + + /* Allocate and fill the sample fetch keyword struct. */ + cli_kws = calloc(1, sizeof(*cli_kws) + sizeof(struct cli_kw) * 2); + if (!cli_kws) { + errmsg = "Lua out of memory error."; + goto error; + } + fcn = new_hlua_function(); + if (!fcn) { + errmsg = "Lua out of memory error."; + goto error; + } + + /* Fill path. */ + index = 0; + lua_pushnil(L); + while(lua_next(L, 1) != 0) { + if (index >= 5) { + errmsg = "1st argument must be a table with a maximum of 5 entries"; + goto error; + } + if (lua_type(L, -1) != LUA_TSTRING) { + errmsg = "1st argument must be a table filled with strings"; + goto error; + } + cli_kws->kw[0].str_kw[index] = strdup(lua_tostring(L, -1)); + if (!cli_kws->kw[0].str_kw[index]) { + errmsg = "Lua out of memory error."; + goto error; + } + index++; + lua_pop(L, 1); + } + + /* Copy help message. */ + cli_kws->kw[0].usage = strdup(message); + if (!cli_kws->kw[0].usage) { + errmsg = "Lua out of memory error."; + goto error; + } + + /* Fill fcn io handler. */ + len = strlen("<lua.cli>") + 1; + for (i = 0; i < index; i++) + len += strlen(cli_kws->kw[0].str_kw[i]) + 1; + fcn->name = calloc(1, len); + if (!fcn->name) { + errmsg = "Lua out of memory error."; + goto error; + } + + end = fcn->name; + len = 8; + memcpy(end, "<lua.cli", len); + end += len; + + for (i = 0; i < index; i++) { + *(end++) = '.'; + len = strlen(cli_kws->kw[0].str_kw[i]); + memcpy(end, cli_kws->kw[0].str_kw[i], len); + end += len; + } + *(end++) = '>'; + *(end++) = 0; + + fcn->function_ref[hlua_state_id] = ref_io; + + /* Fill last entries. 
*/ + cli_kws->kw[0].private = fcn; + cli_kws->kw[0].parse = hlua_cli_parse_fct; + cli_kws->kw[0].io_handler = hlua_cli_io_handler_fct; + cli_kws->kw[0].io_release = hlua_cli_io_release_fct; + + /* Register this new converter */ + cli_register_kw(cli_kws); + + return 0; + + error: + release_hlua_function(fcn); + hlua_unref(L, ref_io); + if (cli_kws) { + for (i = 0; i < index; i++) + ha_free((char **)&(cli_kws->kw[0].str_kw[i])); + ha_free((char **)&(cli_kws->kw[0].usage)); + } + ha_free(&cli_kws); + WILL_LJMP(luaL_error(L, errmsg)); + return 0; /* Never reached */ +} + +static int hlua_filter_init_per_thread(struct proxy *px, struct flt_conf *fconf) +{ + struct hlua_flt_config *conf = fconf->conf; + lua_State *L; + int error, pos, state_id, flt_ref; + + state_id = reg_flt_to_stack_id(conf->reg); + L = hlua_states[state_id]; + pos = lua_gettop(L); + + /* The filter parsing function */ + hlua_pushref(L, conf->reg->fun_ref[state_id]); + + /* Push the filter class on the stack and resolve all callbacks */ + hlua_pushref(L, conf->reg->flt_ref[state_id]); + + /* Duplicate the filter class so each filter will have its own copy */ + lua_newtable(L); + lua_pushnil(L); + + while (lua_next(L, pos+2)) { + lua_pushvalue(L, -2); + lua_insert(L, -2); + lua_settable(L, -4); + } + flt_ref = hlua_ref(L); + + /* Remove the original lua filter class from the stack */ + lua_pop(L, 1); + + /* Push the copy on the stack */ + hlua_pushref(L, flt_ref); + + /* extra args are pushed in a table */ + lua_newtable(L); + for (pos = 0; conf->args[pos]; pos++) { + /* Check stack available size. */ + if (!lua_checkstack(L, 1)) { + ha_alert("Lua filter '%s' : Lua error : full stack.", conf->reg->name); + goto error; + } + lua_pushstring(L, conf->args[pos]); + lua_rawseti(L, -2, lua_rawlen(L, -2) + 1); + } + + error = lua_pcall(L, 2, LUA_MULTRET, 0); + switch (error) { + case LUA_OK: + /* replace the filter ref */ + conf->ref[state_id] = flt_ref; + break; + case LUA_ERRRUN: + ha_alert("Lua filter '%s' : runtime error : %s", conf->reg->name, lua_tostring(L, -1)); + goto error; + case LUA_ERRMEM: + ha_alert("Lua filter '%s' : out of memory error", conf->reg->name); + goto error; + case LUA_ERRERR: + ha_alert("Lua filter '%s' : message handler error : %s", conf->reg->name, lua_tostring(L, -1)); + goto error; +#if defined(LUA_VERSION_NUM) && LUA_VERSION_NUM <= 503 + case LUA_ERRGCMM: + ha_alert("Lua filter '%s' : garbage collector error : %s", conf->reg->name, lua_tostring(L, -1)); + goto error; +#endif + default: + ha_alert("Lua filter '%s' : unknown error : %s", conf->reg->name, lua_tostring(L, -1)); + goto error; + } + + lua_settop(L, 0); + return 0; + + error: + lua_settop(L, 0); + return -1; +} + +static void hlua_filter_deinit_per_thread(struct proxy *px, struct flt_conf *fconf) +{ + struct hlua_flt_config *conf = fconf->conf; + lua_State *L; + int state_id; + + if (!conf) + return; + + state_id = reg_flt_to_stack_id(conf->reg); + L = hlua_states[state_id]; + hlua_unref(L, conf->ref[state_id]); +} + +static int hlua_filter_init(struct proxy *px, struct flt_conf *fconf) +{ + struct hlua_flt_config *conf = fconf->conf; + int state_id = reg_flt_to_stack_id(conf->reg); + + /* Rely on per-thread init for global scripts */ + if (!state_id) + return hlua_filter_init_per_thread(px, fconf); + return 0; +} + +static void hlua_filter_deinit(struct proxy *px, struct flt_conf *fconf) +{ + + if (fconf->conf) { + struct hlua_flt_config *conf = fconf->conf; + int state_id = reg_flt_to_stack_id(conf->reg); + int pos; + + /* Rely on 
per-thread deinit for global scripts */ + if (!state_id) + hlua_filter_deinit_per_thread(px, fconf); + + for (pos = 0; conf->args[pos]; pos++) + free(conf->args[pos]); + free(conf->args); + } + ha_free(&fconf->conf); + ha_free((char **)&fconf->id); + ha_free(&fconf->ops); +} + +static int hlua_filter_new(struct stream *s, struct filter *filter) +{ + struct hlua_flt_config *conf = FLT_CONF(filter); + struct hlua_flt_ctx *flt_ctx = NULL; + int ret = 1; + + if (!hlua_stream_ctx_prepare(s, reg_flt_to_stack_id(conf->reg))) { + SEND_ERR(s->be, "Lua filter '%s': can't initialize filter Lua context.\n", + conf->reg->name); + ret = 0; + goto end; + } + + flt_ctx = pool_zalloc(pool_head_hlua_flt_ctx); + if (!flt_ctx) { + SEND_ERR(s->be, "Lua filter '%s': can't initialize filter Lua context.\n", + conf->reg->name); + ret = 0; + goto end; + } + flt_ctx->hlua[0] = pool_alloc(pool_head_hlua); + flt_ctx->hlua[1] = pool_alloc(pool_head_hlua); + if (!flt_ctx->hlua[0] || !flt_ctx->hlua[1]) { + SEND_ERR(s->be, "Lua filter '%s': can't initialize filter Lua context.\n", + conf->reg->name); + ret = 0; + goto end; + } + HLUA_INIT(flt_ctx->hlua[0]); + HLUA_INIT(flt_ctx->hlua[1]); + if (!hlua_ctx_init(flt_ctx->hlua[0], reg_flt_to_stack_id(conf->reg), s->task) || + !hlua_ctx_init(flt_ctx->hlua[1], reg_flt_to_stack_id(conf->reg), s->task)) { + SEND_ERR(s->be, "Lua filter '%s': can't initialize filter Lua context.\n", + conf->reg->name); + ret = 0; + goto end; + } + + if (!HLUA_IS_RUNNING(s->hlua)) { + /* The following Lua calls can fail. */ + if (!SET_SAFE_LJMP(s->hlua)) { + const char *error; + + if (lua_type(s->hlua->T, -1) == LUA_TSTRING) + error = lua_tostring(s->hlua->T, -1); + else + error = "critical error"; + SEND_ERR(s->be, "Lua filter '%s': %s.\n", conf->reg->name, error); + ret = 0; + goto end; + } + + /* Check stack size. */ + if (!lua_checkstack(s->hlua->T, 1)) { + SEND_ERR(s->be, "Lua filter '%s': full stack.\n", conf->reg->name); + RESET_SAFE_LJMP(s->hlua); + ret = 0; + goto end; + } + + hlua_pushref(s->hlua->T, conf->ref[s->hlua->state_id]); + if (lua_getfield(s->hlua->T, -1, "new") != LUA_TFUNCTION) { + SEND_ERR(s->be, "Lua filter '%s': 'new' field is not a function.\n", + conf->reg->name); + RESET_SAFE_LJMP(s->hlua); + ret = 0; + goto end; + } + lua_insert(s->hlua->T, -2); + + /* Push the copy on the stack */ + s->hlua->nargs = 1; + + /* We must initialize the execution timeouts. */ + hlua_timer_init(&s->hlua->timer, hlua_timeout_session); + + /* At this point the execution is safe. 
*/ + RESET_SAFE_LJMP(s->hlua); + } + + switch (hlua_ctx_resume(s->hlua, 0)) { + case HLUA_E_OK: + /* Nothing returned or not a table, ignore the filter for current stream */ + if (!lua_gettop(s->hlua->T) || !lua_istable(s->hlua->T, 1)) { + ret = 0; + goto end; + } + + /* Attached the filter pointer to the ctx */ + lua_pushstring(s->hlua->T, "__filter"); + lua_pushlightuserdata(s->hlua->T, filter); + lua_settable(s->hlua->T, -3); + + /* Save a ref on the filter ctx */ + lua_pushvalue(s->hlua->T, 1); + flt_ctx->ref = hlua_ref(s->hlua->T); + filter->ctx = flt_ctx; + break; + case HLUA_E_ERRMSG: + SEND_ERR(s->be, "Lua filter '%s' : %s.\n", conf->reg->name, lua_tostring(s->hlua->T, -1)); + ret = -1; + goto end; + case HLUA_E_ETMOUT: + SEND_ERR(s->be, "Lua filter '%s' : 'new' execution timeout.\n", conf->reg->name); + ret = 0; + goto end; + case HLUA_E_NOMEM: + SEND_ERR(s->be, "Lua filter '%s' : out of memory error.\n", conf->reg->name); + ret = 0; + goto end; + case HLUA_E_AGAIN: + case HLUA_E_YIELD: + SEND_ERR(s->be, "Lua filter '%s': yield functions like core.tcp() or core.sleep()" + " are not allowed from 'new' function.\n", conf->reg->name); + ret = 0; + goto end; + case HLUA_E_ERR: + SEND_ERR(s->be, "Lua filter '%s': 'new' returns an unknown error.\n", conf->reg->name); + ret = 0; + goto end; + default: + ret = 0; + goto end; + } + + end: + if (s->hlua) + lua_settop(s->hlua->T, 0); + if (ret <= 0) { + if (flt_ctx) { + hlua_ctx_destroy(flt_ctx->hlua[0]); + hlua_ctx_destroy(flt_ctx->hlua[1]); + pool_free(pool_head_hlua_flt_ctx, flt_ctx); + } + } + return ret; +} + +static void hlua_filter_delete(struct stream *s, struct filter *filter) +{ + struct hlua_flt_ctx *flt_ctx = filter->ctx; + + hlua_unref(s->hlua->T, flt_ctx->ref); + hlua_ctx_destroy(flt_ctx->hlua[0]); + hlua_ctx_destroy(flt_ctx->hlua[1]); + pool_free(pool_head_hlua_flt_ctx, flt_ctx); + filter->ctx = NULL; +} + +static int hlua_filter_from_payload(struct filter *filter) +{ + struct hlua_flt_ctx *flt_ctx = filter->ctx; + + return (flt_ctx && !!(flt_ctx->flags & HLUA_FLT_CTX_FL_PAYLOAD)); +} + +static int hlua_filter_callback(struct stream *s, struct filter *filter, const char *fun, + int dir, unsigned int flags) +{ + struct hlua *flt_hlua; + struct hlua_flt_config *conf = FLT_CONF(filter); + struct hlua_flt_ctx *flt_ctx = filter->ctx; + unsigned int hflags = HLUA_TXN_FLT_CTX; + int ret = 1; + + flt_hlua = flt_ctx->hlua[(dir == SMP_OPT_DIR_REQ ? 0 : 1)]; + if (!flt_hlua) + goto end; + + if (!HLUA_IS_RUNNING(flt_hlua)) { + int extra_idx = lua_gettop(flt_hlua->T); + + /* The following Lua calls can fail. */ + if (!SET_SAFE_LJMP(flt_hlua)) { + const char *error; + + if (lua_type(flt_hlua->T, -1) == LUA_TSTRING) + error = lua_tostring(flt_hlua->T, -1); + else + error = "critical error"; + SEND_ERR(s->be, "Lua filter '%s': %s.\n", conf->reg->name, error); + goto end; + } + + /* Check stack size. 
*/ + if (!lua_checkstack(flt_hlua->T, 3)) { + SEND_ERR(s->be, "Lua filter '%s': full stack.\n", conf->reg->name); + RESET_SAFE_LJMP(flt_hlua); + goto end; + } + + hlua_pushref(flt_hlua->T, flt_ctx->ref); + if (lua_getfield(flt_hlua->T, -1, fun) != LUA_TFUNCTION) { + RESET_SAFE_LJMP(flt_hlua); + goto end; + } + lua_insert(flt_hlua->T, -2); + + if (!hlua_txn_new(flt_hlua->T, s, s->be, dir, hflags)) { + SEND_ERR(s->be, "Lua filter '%s': full stack.\n", conf->reg->name); + RESET_SAFE_LJMP(flt_hlua); + goto end; + } + flt_hlua->nargs = 2; + + if (flags & HLUA_FLT_CB_ARG_CHN) { + if (dir == SMP_OPT_DIR_REQ) + lua_getfield(flt_hlua->T, -1, "req"); + else + lua_getfield(flt_hlua->T, -1, "res"); + if (lua_type(flt_hlua->T, -1) == LUA_TTABLE) { + lua_pushstring(flt_hlua->T, "__filter"); + lua_pushlightuserdata(flt_hlua->T, filter); + lua_settable(flt_hlua->T, -3); + } + flt_hlua->nargs++; + } + else if (flags & HLUA_FLT_CB_ARG_HTTP_MSG) { + if (dir == SMP_OPT_DIR_REQ) + lua_getfield(flt_hlua->T, -1, "http_req"); + else + lua_getfield(flt_hlua->T, -1, "http_res"); + if (lua_type(flt_hlua->T, -1) == LUA_TTABLE) { + lua_pushstring(flt_hlua->T, "__filter"); + lua_pushlightuserdata(flt_hlua->T, filter); + lua_settable(flt_hlua->T, -3); + } + flt_hlua->nargs++; + } + + /* Check stack size. */ + if (!lua_checkstack(flt_hlua->T, 1)) { + SEND_ERR(s->be, "Lua filter '%s': full stack.\n", conf->reg->name); + RESET_SAFE_LJMP(flt_hlua); + goto end; + } + + while (extra_idx--) { + lua_pushvalue(flt_hlua->T, 1); + lua_remove(flt_hlua->T, 1); + flt_hlua->nargs++; + } + + /* We must initialize the execution timeouts. */ + hlua_timer_init(&flt_hlua->timer, hlua_timeout_session); + + /* At this point the execution is safe. */ + RESET_SAFE_LJMP(flt_hlua); + } + + switch (hlua_ctx_resume(flt_hlua, !(flags & HLUA_FLT_CB_FINAL))) { + case HLUA_E_OK: + /* Catch the return value if it required */ + if ((flags & HLUA_FLT_CB_RETVAL) && lua_gettop(flt_hlua->T) > 0) { + ret = lua_tointeger(flt_hlua->T, -1); + lua_settop(flt_hlua->T, 0); /* Empty the stack. */ + } + + /* Set timeout in the required channel. */ + if (flt_hlua->wake_time != TICK_ETERNITY) { + if (dir == SMP_OPT_DIR_REQ) + s->req.analyse_exp = flt_hlua->wake_time; + else + s->res.analyse_exp = flt_hlua->wake_time; + } + break; + case HLUA_E_AGAIN: + /* Set timeout in the required channel. */ + if (flt_hlua->wake_time != TICK_ETERNITY) { + if (dir == SMP_OPT_DIR_REQ) + s->req.analyse_exp = flt_hlua->wake_time; + else + s->res.analyse_exp = flt_hlua->wake_time; + } + /* Some actions can be wake up when a "write" event + * is detected on a response channel. This is useful + * only for actions targeted on the requests. 
+ */ + if (HLUA_IS_WAKERESWR(flt_hlua)) + s->res.flags |= CF_WAKE_WRITE; + if (HLUA_IS_WAKEREQWR(flt_hlua)) + s->req.flags |= CF_WAKE_WRITE; + ret = 0; + goto end; + case HLUA_E_ERRMSG: + SEND_ERR(s->be, "Lua filter '%s' : %s.\n", conf->reg->name, lua_tostring(flt_hlua->T, -1)); + ret = -1; + goto end; + case HLUA_E_ETMOUT: + SEND_ERR(s->be, "Lua filter '%s' : '%s' callback execution timeout.\n", conf->reg->name, fun); + goto end; + case HLUA_E_NOMEM: + SEND_ERR(s->be, "Lua filter '%s' : out of memory error.\n", conf->reg->name); + goto end; + case HLUA_E_YIELD: + SEND_ERR(s->be, "Lua filter '%s': yield functions like core.tcp() or core.sleep()" + " are not allowed from '%s' callback.\n", conf->reg->name, fun); + goto end; + case HLUA_E_ERR: + SEND_ERR(s->be, "Lua filter '%s': '%s' returns an unknown error.\n", conf->reg->name, fun); + goto end; + default: + goto end; + } + + + end: + return ret; +} + +static int hlua_filter_start_analyze(struct stream *s, struct filter *filter, struct channel *chn) +{ + struct hlua_flt_ctx *flt_ctx = filter->ctx; + + flt_ctx->flags = 0; + return hlua_filter_callback(s, filter, "start_analyze", + (!(chn->flags & CF_ISRESP) ? SMP_OPT_DIR_REQ : SMP_OPT_DIR_RES), + (HLUA_FLT_CB_FINAL | HLUA_FLT_CB_RETVAL | HLUA_FLT_CB_ARG_CHN)); +} + +static int hlua_filter_end_analyze(struct stream *s, struct filter *filter, struct channel *chn) +{ + struct hlua_flt_ctx *flt_ctx = filter->ctx; + + flt_ctx->flags &= ~HLUA_FLT_CTX_FL_PAYLOAD; + return hlua_filter_callback(s, filter, "end_analyze", + (!(chn->flags & CF_ISRESP) ? SMP_OPT_DIR_REQ : SMP_OPT_DIR_RES), + (HLUA_FLT_CB_FINAL | HLUA_FLT_CB_RETVAL | HLUA_FLT_CB_ARG_CHN)); +} + +static int hlua_filter_http_headers(struct stream *s, struct filter *filter, struct http_msg *msg) +{ + struct hlua_flt_ctx *flt_ctx = filter->ctx; + + flt_ctx->flags &= ~HLUA_FLT_CTX_FL_PAYLOAD; + return hlua_filter_callback(s, filter, "http_headers", + (!(msg->chn->flags & CF_ISRESP) ? SMP_OPT_DIR_REQ : SMP_OPT_DIR_RES), + (HLUA_FLT_CB_FINAL | HLUA_FLT_CB_RETVAL | HLUA_FLT_CB_ARG_HTTP_MSG)); +} + +static int hlua_filter_http_payload(struct stream *s, struct filter *filter, struct http_msg *msg, + unsigned int offset, unsigned int len) +{ + struct hlua_flt_ctx *flt_ctx = filter->ctx; + struct hlua *flt_hlua; + int dir = (!(msg->chn->flags & CF_ISRESP) ? SMP_OPT_DIR_REQ : SMP_OPT_DIR_RES); + int idx = (dir == SMP_OPT_DIR_REQ ? 0 : 1); + int ret; + + flt_hlua = flt_ctx->hlua[idx]; + flt_ctx->cur_off[idx] = offset; + flt_ctx->cur_len[idx] = len; + flt_ctx->flags |= HLUA_FLT_CTX_FL_PAYLOAD; + ret = hlua_filter_callback(s, filter, "http_payload", dir, (HLUA_FLT_CB_FINAL | HLUA_FLT_CB_ARG_HTTP_MSG)); + if (ret != -1) { + ret = flt_ctx->cur_len[idx]; + if (lua_gettop(flt_hlua->T) > 0) { + ret = lua_tointeger(flt_hlua->T, -1); + if (ret > flt_ctx->cur_len[idx]) + ret = flt_ctx->cur_len[idx]; + lua_settop(flt_hlua->T, 0); /* Empty the stack. */ + } + } + return ret; +} + +static int hlua_filter_http_end(struct stream *s, struct filter *filter, struct http_msg *msg) +{ + struct hlua_flt_ctx *flt_ctx = filter->ctx; + + flt_ctx->flags &= ~HLUA_FLT_CTX_FL_PAYLOAD; + return hlua_filter_callback(s, filter, "http_end", + (!(msg->chn->flags & CF_ISRESP) ? 
SMP_OPT_DIR_REQ : SMP_OPT_DIR_RES), + (HLUA_FLT_CB_FINAL | HLUA_FLT_CB_RETVAL | HLUA_FLT_CB_ARG_HTTP_MSG)); +} + +static int hlua_filter_tcp_payload(struct stream *s, struct filter *filter, struct channel *chn, + unsigned int offset, unsigned int len) +{ + struct hlua_flt_ctx *flt_ctx = filter->ctx; + struct hlua *flt_hlua; + int dir = (!(chn->flags & CF_ISRESP) ? SMP_OPT_DIR_REQ : SMP_OPT_DIR_RES); + int idx = (dir == SMP_OPT_DIR_REQ ? 0 : 1); + int ret; + + flt_hlua = flt_ctx->hlua[idx]; + flt_ctx->cur_off[idx] = offset; + flt_ctx->cur_len[idx] = len; + flt_ctx->flags |= HLUA_FLT_CTX_FL_PAYLOAD; + ret = hlua_filter_callback(s, filter, "tcp_payload", dir, (HLUA_FLT_CB_FINAL | HLUA_FLT_CB_ARG_CHN)); + if (ret != -1) { + ret = flt_ctx->cur_len[idx]; + if (lua_gettop(flt_hlua->T) > 0) { + ret = lua_tointeger(flt_hlua->T, -1); + if (ret > flt_ctx->cur_len[idx]) + ret = flt_ctx->cur_len[idx]; + lua_settop(flt_hlua->T, 0); /* Empty the stack. */ + } + } + return ret; +} + +static int hlua_filter_parse_fct(char **args, int *cur_arg, struct proxy *px, + struct flt_conf *fconf, char **err, void *private) +{ + struct hlua_reg_filter *reg_flt = private; + lua_State *L; + struct hlua_flt_config *conf = NULL; + const char *flt_id = NULL; + int state_id, pos, flt_flags = 0; + struct flt_ops *hlua_flt_ops = NULL; + + state_id = reg_flt_to_stack_id(reg_flt); + L = hlua_states[state_id]; + + /* Initialize the filter ops with default callbacks */ + hlua_flt_ops = calloc(1, sizeof(*hlua_flt_ops)); + if (!hlua_flt_ops) + goto error; + hlua_flt_ops->init = hlua_filter_init; + hlua_flt_ops->deinit = hlua_filter_deinit; + if (state_id) { + /* Set per-thread callback if script is loaded per-thread */ + hlua_flt_ops->init_per_thread = hlua_filter_init_per_thread; + hlua_flt_ops->deinit_per_thread = hlua_filter_deinit_per_thread; + } + hlua_flt_ops->attach = hlua_filter_new; + hlua_flt_ops->detach = hlua_filter_delete; + + /* Push the filter class on the stack and resolve all callbacks */ + hlua_pushref(L, reg_flt->flt_ref[state_id]); + + if (lua_getfield(L, -1, "start_analyze") == LUA_TFUNCTION) + hlua_flt_ops->channel_start_analyze = hlua_filter_start_analyze; + lua_pop(L, 1); + if (lua_getfield(L, -1, "end_analyze") == LUA_TFUNCTION) + hlua_flt_ops->channel_end_analyze = hlua_filter_end_analyze; + lua_pop(L, 1); + if (lua_getfield(L, -1, "http_headers") == LUA_TFUNCTION) + hlua_flt_ops->http_headers = hlua_filter_http_headers; + lua_pop(L, 1); + if (lua_getfield(L, -1, "http_payload") == LUA_TFUNCTION) + hlua_flt_ops->http_payload = hlua_filter_http_payload; + lua_pop(L, 1); + if (lua_getfield(L, -1, "http_end") == LUA_TFUNCTION) + hlua_flt_ops->http_end = hlua_filter_http_end; + lua_pop(L, 1); + if (lua_getfield(L, -1, "tcp_payload") == LUA_TFUNCTION) + hlua_flt_ops->tcp_payload = hlua_filter_tcp_payload; + lua_pop(L, 1); + + /* Get id and flags of the filter class */ + if (lua_getfield(L, -1, "id") == LUA_TSTRING) + flt_id = lua_tostring(L, -1); + lua_pop(L, 1); + if (lua_getfield(L, -1, "flags") == LUA_TNUMBER) + flt_flags = lua_tointeger(L, -1); + lua_pop(L, 1); + + /* Create the filter config */ + conf = calloc(1, sizeof(*conf)); + if (!conf) + goto error; + conf->reg = reg_flt; + + /* duplicate args */ + for (pos = 0; *args[*cur_arg + 1 + pos]; pos++); + conf->args = calloc(pos + 1, sizeof(*conf->args)); + if (!conf->args) + goto error; + for (pos = 0; *args[*cur_arg + 1 + pos]; pos++) { + conf->args[pos] = strdup(args[*cur_arg + 1 + pos]); + if (!conf->args[pos]) + goto error; + } + conf->args[pos] = 
NULL; + *cur_arg += pos + 1; + + if (flt_id) { + fconf->id = strdup(flt_id); + if (!fconf->id) + goto error; + } + fconf->flags = flt_flags; + fconf->conf = conf; + fconf->ops = hlua_flt_ops; + + lua_settop(L, 0); + return 0; + + error: + memprintf(err, "Lua filter '%s' : Lua out of memory error", reg_flt->name); + free(hlua_flt_ops); + if (conf && conf->args) { + for (pos = 0; conf->args[pos]; pos++) + free(conf->args[pos]); + free(conf->args); + } + free(conf); + free((char *)fconf->id); + lua_settop(L, 0); + return -1; +} + +__LJMP static int hlua_register_data_filter(lua_State *L) +{ + struct filter *filter; + struct channel *chn; + + MAY_LJMP(check_args(L, 2, "register_data_filter")); + MAY_LJMP(luaL_checktype(L, 1, LUA_TTABLE)); + chn = MAY_LJMP(hlua_checkchannel(L, 2)); + + lua_getfield(L, 1, "__filter"); + MAY_LJMP(luaL_checktype(L, -1, LUA_TLIGHTUSERDATA)); + filter = lua_touserdata (L, -1); + lua_pop(L, 1); + + register_data_filter(chn_strm(chn), chn, filter); + return 1; +} + +__LJMP static int hlua_unregister_data_filter(lua_State *L) +{ + struct filter *filter; + struct channel *chn; + + MAY_LJMP(check_args(L, 2, "unregister_data_filter")); + MAY_LJMP(luaL_checktype(L, 1, LUA_TTABLE)); + chn = MAY_LJMP(hlua_checkchannel(L, 2)); + + lua_getfield(L, 1, "__filter"); + MAY_LJMP(luaL_checktype(L, -1, LUA_TLIGHTUSERDATA)); + filter = lua_touserdata (L, -1); + lua_pop(L, 1); + + unregister_data_filter(chn_strm(chn), chn, filter); + return 1; +} + +/* This function is an LUA binding used for registering a filter. It expects a + * filter name used in the haproxy configuration file and a LUA function to + * parse configuration arguments. + */ +__LJMP static int hlua_register_filter(lua_State *L) +{ + struct buffer *trash; + struct flt_kw_list *fkl; + struct flt_kw *fkw; + const char *name; + struct hlua_reg_filter *reg_flt= NULL; + int flt_ref, fun_ref; + int len; + + MAY_LJMP(check_args(L, 3, "register_filter")); + + if (hlua_gethlua(L)) { + /* runtime processing */ + WILL_LJMP(luaL_error(L, "register_filter: not available outside of body context")); + } + + /* First argument : filter name. */ + name = MAY_LJMP(luaL_checkstring(L, 1)); + + /* Second argument : The filter class */ + flt_ref = MAY_LJMP(hlua_checktable(L, 2)); + + /* Third argument : lua function. */ + fun_ref = MAY_LJMP(hlua_checkfunction(L, 3)); + + trash = get_trash_chunk(); + chunk_printf(trash, "lua.%s", name); + fkw = flt_find_kw(trash->area); + if (fkw != NULL) { + reg_flt = fkw->private; + if (reg_flt->flt_ref[hlua_state_id] != -1 || reg_flt->fun_ref[hlua_state_id] != -1) { + ha_warning("Trying to register filter 'lua.%s' more than once. 
" + "This will become a hard error in version 2.5.\n", name); + if (reg_flt->flt_ref[hlua_state_id] != -1) + hlua_unref(L, reg_flt->flt_ref[hlua_state_id]); + if (reg_flt->fun_ref[hlua_state_id] != -1) + hlua_unref(L, reg_flt->fun_ref[hlua_state_id]); + } + reg_flt->flt_ref[hlua_state_id] = flt_ref; + reg_flt->fun_ref[hlua_state_id] = fun_ref; + return 0; + } + + fkl = calloc(1, sizeof(*fkl) + sizeof(struct flt_kw) * 2); + if (!fkl) + goto alloc_error; + fkl->scope = "HLUA"; + + reg_flt = new_hlua_reg_filter(name); + if (!reg_flt) + goto alloc_error; + + reg_flt->flt_ref[hlua_state_id] = flt_ref; + reg_flt->fun_ref[hlua_state_id] = fun_ref; + + /* The filter keyword */ + len = strlen("lua.") + strlen(name) + 1; + fkl->kw[0].kw = calloc(1, len); + if (!fkl->kw[0].kw) + goto alloc_error; + + snprintf((char *)fkl->kw[0].kw, len, "lua.%s", name); + + fkl->kw[0].parse = hlua_filter_parse_fct; + fkl->kw[0].private = reg_flt; + memset(&fkl->kw[1], 0, sizeof(*fkl->kw)); + + /* Register this new filter */ + flt_register_keywords(fkl); + + return 0; + + alloc_error: + release_hlua_reg_filter(reg_flt); + hlua_unref(L, flt_ref); + hlua_unref(L, fun_ref); + ha_free(&fkl); + WILL_LJMP(luaL_error(L, "Lua out of memory error.")); + return 0; /* Never reached */ +} + +static int hlua_read_timeout(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err, unsigned int *timeout) +{ + const char *error; + + error = parse_time_err(args[1], timeout, TIME_UNIT_MS); + if (error == PARSE_TIME_OVER) { + memprintf(err, "timer overflow in argument <%s> to <%s> (maximum value is 2147483647 ms or ~24.8 days)", + args[1], args[0]); + return -1; + } + else if (error == PARSE_TIME_UNDER) { + memprintf(err, "timer underflow in argument <%s> to <%s> (minimum non-null value is 1 ms)", + args[1], args[0]); + return -1; + } + else if (error) { + memprintf(err, "%s: invalid timeout", args[0]); + return -1; + } + return 0; +} + +static int hlua_burst_timeout(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + return hlua_read_timeout(args, section_type, curpx, defpx, + file, line, err, &hlua_timeout_burst); +} + +static int hlua_session_timeout(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + return hlua_read_timeout(args, section_type, curpx, defpx, + file, line, err, &hlua_timeout_session); +} + +static int hlua_task_timeout(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + return hlua_read_timeout(args, section_type, curpx, defpx, + file, line, err, &hlua_timeout_task); +} + +static int hlua_applet_timeout(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + return hlua_read_timeout(args, section_type, curpx, defpx, + file, line, err, &hlua_timeout_applet); +} + +static int hlua_forced_yield(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + char *error; + + hlua_nb_instruction = strtoll(args[1], &error, 10); + if (*error != '\0') { + memprintf(err, "%s: invalid number", args[0]); + return -1; + } + return 0; +} + +static int hlua_parse_maxmem(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + char *error; + + 
if (*(args[1]) == 0) { + memprintf(err, "'%s' expects an integer argument (Lua memory size in MB).", args[0]); + return -1; + } + hlua_global_allocator.limit = strtoll(args[1], &error, 10) * 1024L * 1024L; + if (*error != '\0') { + memprintf(err, "%s: invalid number %s (error at '%c')", args[0], args[1], *error); + return -1; + } + return 0; +} + +static int hlua_cfg_parse_log_loggers(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (strcmp(args[1], "on") == 0) + hlua_log_opts |= HLUA_LOG_LOGGERS_ON; + else if (strcmp(args[1], "off") == 0) + hlua_log_opts &= ~HLUA_LOG_LOGGERS_ON; + else { + memprintf(err, "'%s' expects either 'on' or 'off' but got '%s'.", args[0], args[1]); + return -1; + } + return 0; +} + +static int hlua_cfg_parse_log_stderr(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (strcmp(args[1], "on") == 0) + hlua_log_opts = (hlua_log_opts & ~HLUA_LOG_STDERR_MASK) | HLUA_LOG_STDERR_ON; + else if (strcmp(args[1], "auto") == 0) + hlua_log_opts = (hlua_log_opts & ~HLUA_LOG_STDERR_MASK) | HLUA_LOG_STDERR_AUTO; + else if (strcmp(args[1], "off") == 0) + hlua_log_opts &= ~HLUA_LOG_STDERR_MASK; + else { + memprintf(err, "'%s' expects either 'on', 'auto', or 'off' but got '%s'.", args[0], args[1]); + return -1; + } + return 0; +} + +/* This function is called by the main configuration key "lua-load". It loads and + * executes a Lua file during the parsing of the HAProxy configuration file. It is + * the main Lua entry point. + * + * This function runs with the HAProxy keywords API. It returns -1 if an error + * occurs, otherwise it returns 0. + * + * On some errors, Lua sets an error message on top of the stack. This function + * reports this error message in the HAProxy logs and pops it from the stack. + * + * This function can fail with an abort() due to a Lua critical error. + * We are in the configuration parsing process of HAProxy, this abort() is + * tolerated. + */ +static int hlua_load_state(char **args, lua_State *L, char **err) +{ + int error; + int nargs; + + /* Just load and compile the file. */ + error = luaL_loadfile(L, args[0]); + if (error) { + memprintf(err, "error in Lua file '%s': %s", args[0], lua_tostring(L, -1)); + lua_pop(L, 1); + return -1; + } + + /* Push args on the Lua stack, except the first one which is the filename */ + for (nargs = 1; *(args[nargs]) != 0; nargs++) { + /* Check stack size. */ + if (!lua_checkstack(L, 1)) { + memprintf(err, "Lua runtime error while loading arguments: stack is full."); + return -1; + } + lua_pushstring(L, args[nargs]); + } + nargs--; + + /* If no syntax error was detected, execute the code.
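+ * As an illustration (file name and argument values are arbitrary), a line + * such as "lua-load /etc/haproxy/init.lua foo bar" makes "foo" and "bar" + * available to the loaded chunk through the vararg expression: local a, b = ...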
*/ + error = lua_pcall(L, nargs, LUA_MULTRET, 0); + switch (error) { + case LUA_OK: + break; + case LUA_ERRRUN: + memprintf(err, "Lua runtime error: %s", lua_tostring(L, -1)); + lua_pop(L, 1); + return -1; + case LUA_ERRMEM: + memprintf(err, "Lua out of memory error"); + return -1; + case LUA_ERRERR: + memprintf(err, "Lua message handler error: %s", lua_tostring(L, -1)); + lua_pop(L, 1); + return -1; +#if defined(LUA_VERSION_NUM) && LUA_VERSION_NUM <= 503 + case LUA_ERRGCMM: + memprintf(err, "Lua garbage collector error: %s", lua_tostring(L, -1)); + lua_pop(L, 1); + return -1; +#endif + default: + memprintf(err, "Lua unknown error: %s", lua_tostring(L, -1)); + lua_pop(L, 1); + return -1; + } + + return 0; +} + +static int hlua_load(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (*(args[1]) == 0) { + memprintf(err, "'%s' expects a file name as parameter.", args[0]); + return -1; + } + + /* loading for global state */ + hlua_state_id = 0; + ha_set_thread(NULL); + return hlua_load_state(&args[1], hlua_states[0], err); +} + +static int hlua_load_per_thread(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + int len; + int i; + + if (*(args[1]) == 0) { + memprintf(err, "'%s' expects a file as parameter.", args[0]); + return -1; + } + + if (per_thread_load == NULL) { + /* allocate the first entry large enough to store the final NULL */ + per_thread_load = calloc(1, sizeof(*per_thread_load)); + if (per_thread_load == NULL) { + memprintf(err, "out of memory error"); + return -1; + } + } + + /* count used entries */ + for (len = 0; per_thread_load[len] != NULL; len++) + ; + + per_thread_load = realloc(per_thread_load, (len + 2) * sizeof(*per_thread_load)); + if (per_thread_load == NULL) { + memprintf(err, "out of memory error"); + return -1; + } + per_thread_load[len + 1] = NULL; + + /* count args excepting the first, allocate array and copy args */ + for (i = 0; *(args[i + 1]) != 0; i++); + per_thread_load[len] = calloc(i + 1, sizeof(*per_thread_load[len])); + if (per_thread_load[len] == NULL) { + memprintf(err, "out of memory error"); + return -1; + } + for (i = 1; *(args[i]) != 0; i++) { + per_thread_load[len][i - 1] = strdup(args[i]); + if (per_thread_load[len][i - 1] == NULL) { + memprintf(err, "out of memory error"); + return -1; + } + } + per_thread_load[len][i - 1] = strdup(""); + if (per_thread_load[len][i - 1] == NULL) { + memprintf(err, "out of memory error"); + return -1; + } + + /* loading for thread 1 only */ + hlua_state_id = 1; + ha_set_thread(NULL); + return hlua_load_state(per_thread_load[len], hlua_states[1], err); +} + +/* Prepend the given <path> followed by a semicolon to the `package.<type>` variable + * in the given <ctx>. 
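+ * For instance (illustrative path), "lua-prepend-path /opt/lua/?.lua path" in + * the global section behaves like running: + * package.path = "/opt/lua/?.lua;" .. package.path + * before any Lua file is loaded.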
+ */ +static int hlua_prepend_path(lua_State *L, char *type, char *path) +{ + lua_getglobal(L, "package"); /* push package variable */ + lua_pushstring(L, path); /* push given path */ + lua_pushstring(L, ";"); /* push semicolon */ + lua_getfield(L, -3, type); /* push old path */ + lua_concat(L, 3); /* concatenate to new path */ + lua_setfield(L, -2, type); /* store new path */ + lua_pop(L, 1); /* pop package variable */ + + return 0; +} + +static int hlua_config_prepend_path(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + char *path; + char *type = "path"; + struct prepend_path *p = NULL; + size_t i; + + if (too_many_args(2, args, err, NULL)) { + goto err; + } + + if (!(*args[1])) { + memprintf(err, "'%s' expects to receive a <path> as argument", args[0]); + goto err; + } + path = args[1]; + + if (*args[2]) { + if (strcmp(args[2], "path") != 0 && strcmp(args[2], "cpath") != 0) { + memprintf(err, "'%s' expects <type> to either be 'path' or 'cpath'", args[0]); + goto err; + } + type = args[2]; + } + + p = calloc(1, sizeof(*p)); + if (p == NULL) { + memprintf(err, "memory allocation failed"); + goto err; + } + p->path = strdup(path); + if (p->path == NULL) { + memprintf(err, "memory allocation failed"); + goto err2; + } + p->type = strdup(type); + if (p->type == NULL) { + memprintf(err, "memory allocation failed"); + goto err2; + } + LIST_APPEND(&prepend_path_list, &p->l); + + /* Handle the global state and the per-thread state for the first + * thread. The remaining threads will be initialized based on + * prepend_path_list. + */ + for (i = 0; i < 2; i++) { + lua_State *L = hlua_states[i]; + const char *error; + + if (setjmp(safe_ljmp_env) != 0) { + lua_atpanic(L, hlua_panic_safe); + if (lua_type(L, -1) == LUA_TSTRING) + error = lua_tostring(L, -1); + else + error = "critical error"; + fprintf(stderr, "lua-prepend-path: %s.\n", error); + exit(1); + } else { + lua_atpanic(L, hlua_panic_ljmp); + } + + hlua_prepend_path(L, type, path); + + lua_atpanic(L, hlua_panic_safe); + } + + return 0; + +err2: + free(p->type); + free(p->path); +err: + free(p); + return -1; +} + +/* configuration keywords declaration */ +static struct cfg_kw_list cfg_kws = {{ },{ + { CFG_GLOBAL, "lua-prepend-path", hlua_config_prepend_path }, + { CFG_GLOBAL, "lua-load", hlua_load }, + { CFG_GLOBAL, "lua-load-per-thread", hlua_load_per_thread }, + { CFG_GLOBAL, "tune.lua.session-timeout", hlua_session_timeout }, + { CFG_GLOBAL, "tune.lua.task-timeout", hlua_task_timeout }, + { CFG_GLOBAL, "tune.lua.service-timeout", hlua_applet_timeout }, + { CFG_GLOBAL, "tune.lua.burst-timeout", hlua_burst_timeout }, + { CFG_GLOBAL, "tune.lua.forced-yield", hlua_forced_yield }, + { CFG_GLOBAL, "tune.lua.maxmem", hlua_parse_maxmem }, + { CFG_GLOBAL, "tune.lua.log.loggers", hlua_cfg_parse_log_loggers }, + { CFG_GLOBAL, "tune.lua.log.stderr", hlua_cfg_parse_log_stderr }, + { 0, NULL, NULL }, +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); + +#ifdef USE_OPENSSL + +/* + * This function replaces a ckch_store by another one, and rebuilds the ckch_inst and all its dependencies. + * It does the same as "cli_io_handler_commit_cert" but for Lua; the major + * difference is that the yield in Lua and for the CLI is not handled the same + * way.
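+ * The continuation is driven by hlua_yieldk(): when the coroutine is resumed, + * Lua re-enters hlua_ckch_commit_yield() with the userdata still on the stack, + * so the walk restarts from the saved ckch_inst.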
+ */ +__LJMP static int hlua_ckch_commit_yield(lua_State *L, int status, lua_KContext ctx) +{ + struct ckch_inst **lua_ckchi = lua_touserdata(L, -1); + struct ckch_store **lua_ckchs = lua_touserdata(L, -2); + struct ckch_inst *ckchi = *lua_ckchi; + struct ckch_store *old_ckchs = lua_ckchs[0]; + struct ckch_store *new_ckchs = lua_ckchs[1]; + struct hlua *hlua; + char *err = NULL; + int y = 1; + + hlua = hlua_gethlua(L); + + /* get the first ckchi to copy */ + if (ckchi == NULL) + ckchi = LIST_ELEM(old_ckchs->ckch_inst.n, typeof(ckchi), by_ckchs); + + /* walk through the old ckch_insts and create new ones using the updated ckchs */ + list_for_each_entry_from(ckchi, &old_ckchs->ckch_inst, by_ckchs) { + struct ckch_inst *new_inst; + + /* it takes a lot of CPU to create SSL_CTXs, so we yield every 10 CKCH instances */ + if (y % 10 == 0) { + + *lua_ckchi = ckchi; + + task_wakeup(hlua->task, TASK_WOKEN_MSG); + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_ckch_commit_yield, TICK_ETERNITY, 0)); + } + + if (ckch_inst_rebuild(new_ckchs, ckchi, &new_inst, &err)) + goto error; + + /* link the new ckch_inst to the duplicate */ + LIST_APPEND(&new_ckchs->ckch_inst, &new_inst->by_ckchs); + y++; + } + + /* The generation is finished, we can insert everything */ + ckch_store_replace(old_ckchs, new_ckchs); + + lua_pop(L, 2); /* pop the lua_ckchs and ckchi */ + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + + return 0; + +error: + ckch_store_free(new_ckchs); + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + WILL_LJMP(luaL_error(L, "%s", err)); + free(err); + + return 0; +} + +/* + * Replace a ckch_store <filename> in the ckchs_tree with a ckch_store created + * from the table given as parameter. + * + * This is equivalent to "set ssl cert" + "commit ssl cert" over the CLI, which + * means it does not need to have a transaction since everything is done in the + * same function. + * + * CertCache.set{filename="", crt="", key="", sctl="", ocsp="", issuer=""} + * + */ +__LJMP static int hlua_ckch_set(lua_State *L) +{ + struct hlua *hlua; + struct ckch_inst **lua_ckchi; + struct ckch_store **lua_ckchs; + struct ckch_store *old_ckchs = NULL; + struct ckch_store *new_ckchs = NULL; + int errcode = 0; + char *err = NULL; + struct cert_exts *cert_ext = NULL; + char *filename; + struct ckch_data *data; + int ret; + + if (lua_type(L, -1) != LUA_TTABLE) + WILL_LJMP(luaL_error(L, "'CertCache.set' needs a table as argument")); + + hlua = hlua_gethlua(L); + + /* FIXME: this should not return an error but should come back later */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + WILL_LJMP(luaL_error(L, "CertCache already under lock")); + + ret = lua_getfield(L, -1, "filename"); + if (ret != LUA_TSTRING) { + memprintf(&err, "%sNo filename specified!", err ? err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + filename = (char *)lua_tostring(L, -1); + + + /* look for the filename in the tree */ + old_ckchs = ckchs_lookup(filename); + if (!old_ckchs) { + memprintf(&err, "%sCan't replace a certificate which is not referenced by the configuration!", err ? err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + /* TODO: handle extra_files_noext */ + + new_ckchs = ckchs_dup(old_ckchs); + if (!new_ckchs) { + memprintf(&err, "%sCannot allocate memory!", err ?
err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + data = new_ckchs->data; + + /* loop on the field in the table, which have the same name as the + * possible extensions of files */ + lua_pushnil(L); + while (lua_next(L, 1)) { + int i; + const char *field = lua_tostring(L, -2); + char *payload = (char *)lua_tostring(L, -1); + + if (!field || strcmp(field, "filename") == 0) { + lua_pop(L, 1); + continue; + } + + for (i = 0; field && cert_exts[i].ext != NULL; i++) { + if (strcmp(field, cert_exts[i].ext) == 0) { + cert_ext = &cert_exts[i]; + break; + } + } + + /* this is the default type, the field is not supported */ + if (cert_ext == NULL) { + memprintf(&err, "%sUnsupported field '%s'", err ? err : "", field); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + /* Reset the OCSP CID */ + if (cert_ext->type == CERT_TYPE_PEM || cert_ext->type == CERT_TYPE_KEY || + cert_ext->type == CERT_TYPE_ISSUER) { + OCSP_CERTID_free(new_ckchs->data->ocsp_cid); + new_ckchs->data->ocsp_cid = NULL; + } + + /* apply the change on the duplicate */ + if (cert_ext->load(filename, payload, data, &err) != 0) { + memprintf(&err, "%sCan't load the payload for '%s'", err ? err : "", cert_ext->ext); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + lua_pop(L, 1); + } + + /* store the pointers on the lua stack */ + lua_ckchs = lua_newuserdata(L, sizeof(struct ckch_store *) * 2); + lua_ckchs[0] = old_ckchs; + lua_ckchs[1] = new_ckchs; + lua_ckchi = lua_newuserdata(L, sizeof(struct ckch_inst *)); + *lua_ckchi = NULL; + + task_wakeup(hlua->task, TASK_WOKEN_MSG); + MAY_LJMP(hlua_yieldk(L, 0, 0, hlua_ckch_commit_yield, TICK_ETERNITY, 0)); + +end: + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + + if (errcode & ERR_CODE) { + ckch_store_free(new_ckchs); + WILL_LJMP(luaL_error(L, "%s", err)); + } + free(err); + + return 0; +} + +#else + +__LJMP static int hlua_ckch_set(lua_State *L) +{ + WILL_LJMP(luaL_error(L, "'CertCache.set' needs an HAProxy built with OpenSSL")); + + return 0; +} +#endif /* ! USE_OPENSSL */ + + + +/* This function can fail with an abort() due to an Lua critical error. + * We are in the initialisation process of HAProxy, this abort() is + * tolerated. + */ +int hlua_post_init_state(lua_State *L) +{ + struct hlua_init_function *init; + const char *msg; + enum hlua_exec ret; + const char *error; + const char *kind; + const char *trace; + int return_status = 1; +#if defined(LUA_VERSION_NUM) && LUA_VERSION_NUM >= 504 + int nres; +#endif + + /* disable memory limit checks if limit is not set */ + if (!hlua_global_allocator.limit) + hlua_global_allocator.limit = ~hlua_global_allocator.limit; + + /* Call post initialisation function in safe environment. 
*/ + if (setjmp(safe_ljmp_env) != 0) { + lua_atpanic(L, hlua_panic_safe); + if (lua_type(L, -1) == LUA_TSTRING) + error = lua_tostring(L, -1); + else + error = "critical error"; + fprintf(stderr, "Lua post-init: %s.\n", error); + exit(1); + } else { + lua_atpanic(L, hlua_panic_ljmp); + } + + list_for_each_entry(init, &hlua_init_functions[hlua_state_id], l) { + hlua_pushref(L, init->function_ref); + /* function ref should be released right away since it was pushed + * on the stack and will not be used anymore + */ + hlua_unref(L, init->function_ref); + +#if defined(LUA_VERSION_NUM) && LUA_VERSION_NUM >= 504 + ret = lua_resume(L, NULL, 0, &nres); +#else + ret = lua_resume(L, NULL, 0); +#endif + kind = NULL; + switch (ret) { + + case LUA_OK: + lua_pop(L, -1); + break; + + case LUA_ERRERR: + kind = "message handler error"; + __fallthrough; + case LUA_ERRRUN: + if (!kind) + kind = "runtime error"; + msg = lua_tostring(L, -1); + lua_settop(L, 0); /* Empty the stack. */ + trace = hlua_traceback(L, ", "); + if (msg) + ha_alert("Lua init: %s: '%s' from %s\n", kind, msg, trace); + else + ha_alert("Lua init: unknown %s from %s\n", kind, trace); + return_status = 0; + break; + + default: + /* Unknown error */ + kind = "Unknown error"; + __fallthrough; + case LUA_YIELD: + /* yield is not configured at this step, this state doesn't happen */ + if (!kind) + kind = "yield not allowed"; + __fallthrough; + case LUA_ERRMEM: + if (!kind) + kind = "out of memory error"; + lua_settop(L, 0); /* Empty the stack. */ + trace = hlua_traceback(L, ", "); + ha_alert("Lua init: %s: %s\n", kind, trace); + return_status = 0; + break; + } + if (!return_status) + break; + } + + lua_atpanic(L, hlua_panic_safe); + return return_status; +} + +int hlua_post_init() +{ + int ret; + int i; + int errors; + char *err = NULL; + struct hlua_function *fcn; + struct hlua_reg_filter *reg_flt; + +#if defined(USE_OPENSSL) + /* Initialize SSL server. */ + if (socket_ssl->xprt->prepare_srv) { + int saved_used_backed = global.ssl_used_backend; + // don't affect maxconn automatic computation + socket_ssl->xprt->prepare_srv(socket_ssl); + global.ssl_used_backend = saved_used_backed; + } +#endif + + /* Perform post init of common thread */ + hlua_state_id = 0; + ha_set_thread(&ha_thread_info[0]); + ret = hlua_post_init_state(hlua_states[hlua_state_id]); + if (ret == 0) + return 0; + + /* init remaining lua states and load files */ + for (hlua_state_id = 2; hlua_state_id < global.nbthread + 1; hlua_state_id++) { + + /* set thread context */ + ha_set_thread(&ha_thread_info[hlua_state_id - 1]); + + /* Init lua state */ + hlua_states[hlua_state_id] = hlua_init_state(hlua_state_id); + + /* Load lua files */ + for (i = 0; per_thread_load && per_thread_load[i]; i++) { + ret = hlua_load_state(per_thread_load[i], hlua_states[hlua_state_id], &err); + if (ret != 0) { + ha_alert("Lua init: %s\n", err); + return 0; + } + } + } + + /* Reset thread context */ + ha_set_thread(NULL); + + /* Execute post init for all states */ + for (hlua_state_id = 1; hlua_state_id < global.nbthread + 1; hlua_state_id++) { + + /* set thread context */ + ha_set_thread(&ha_thread_info[hlua_state_id - 1]); + + /* run post init */ + ret = hlua_post_init_state(hlua_states[hlua_state_id]); + if (ret == 0) + return 0; + } + + /* Reset thread context */ + ha_set_thread(NULL); + + /* control functions registering. 
Each function must have: + * - either only function_ref[0] set to a positive value and all the others + * set to -1, + * - or only function_ref[0] set to -1 and all the others positive. + * This ensures the same reference is not used both in the shared + * Lua state and in a thread-dedicated Lua state. Note: should this case + * be reached anyway, the shared state takes priority, but the bug would be + * complicated to track down for the end user. + */ + errors = 0; + list_for_each_entry(fcn, &referenced_functions, l) { + ret = 0; + for (i = 1; i < global.nbthread + 1; i++) { + if (fcn->function_ref[i] == -1) + ret--; + else + ret++; + } + if (abs(ret) != global.nbthread) { + ha_alert("Lua function '%s' is not referenced in all threads. " + "A function must be referenced either in all threads or in none.\n", fcn->name); + errors++; + continue; + } + + if ((fcn->function_ref[0] == -1) == (ret < 0)) { + ha_alert("Lua function '%s' is referenced both in the shared Lua context (through lua-load) " + "and in the per-thread Lua context (through lua-load-per-thread). These two contexts " + "are mutually exclusive.\n", fcn->name); + errors++; + } + } + + /* Do the same with registered filters */ + list_for_each_entry(reg_flt, &referenced_filters, l) { + ret = 0; + for (i = 1; i < global.nbthread + 1; i++) { + if (reg_flt->flt_ref[i] == -1) + ret--; + else + ret++; + } + if (abs(ret) != global.nbthread) { + ha_alert("Lua filter '%s' is not referenced in all threads. " + "A filter must be referenced either in all threads or in none.\n", reg_flt->name); + errors++; + continue; + } + + if ((reg_flt->flt_ref[0] == -1) == (ret < 0)) { + ha_alert("Lua filter '%s' is referenced both in the shared Lua context (through lua-load) " + "and in the per-thread Lua context (through lua-load-per-thread). These two contexts " + "are mutually exclusive.\n", reg_flt->name); + errors++; + } + } + + + if (errors > 0) + return 0; + + /* after this point, this global will no longer be used, so set it to + * -1 so that any later use will most likely cause a segfault + */ + hlua_state_id = -1; + + return 1; +} + +/* The memory allocator used by the Lua stack. <ud> is a pointer to the + * allocator's context. <ptr> is the pointer to alloc/free/realloc. <osize> + * is the previously allocated size or the kind of object in case of a new + * allocation. <nsize> is the requested new size. A new allocation is + * indicated by <ptr> being NULL. A free is indicated by <nsize> being + * zero. This one verifies that the limits are respected but is optimized + * for the fast case where limits are not used, hence stats are not updated. + * + * Warning: while this API resembles glibc's realloc() a lot, glibc surpasses + * POSIX by making realloc(ptr,0) an effective free(), but others do not do + * that and will simply allocate zero as if it were the result of malloc(0), + * so mapping this onto realloc() will lead to memory leaks on non-glibc + * systems. + */ +static void *hlua_alloc(void *ud, void *ptr, size_t osize, size_t nsize) +{ + struct hlua_mem_allocator *zone = ud; + size_t limit, old, new; + + /* a limit of ~0 means unlimited and boot complete, so there's no need + * for accounting anymore.
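+ * The slow path below first reserves the new size in zone->allocated with a + * CAS loop, refuses the request if the limit would be exceeded, and rolls the + * reservation back when realloc() itself fails.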
+ */ + if (likely(~zone->limit == 0)) { + if (!nsize) + ha_free(&ptr); + else + ptr = realloc(ptr, nsize); + return ptr; + } + + if (!ptr) + osize = 0; + + /* enforce strict limits across all threads */ + limit = zone->limit; + old = _HA_ATOMIC_LOAD(&zone->allocated); + do { + new = old + nsize - osize; + if (unlikely(nsize && limit && new > limit)) + return NULL; + } while (!_HA_ATOMIC_CAS(&zone->allocated, &old, new)); + + if (!nsize) + ha_free(&ptr); + else + ptr = realloc(ptr, nsize); + + if (unlikely(!ptr && nsize)) // failed + _HA_ATOMIC_SUB(&zone->allocated, nsize - osize); + + __ha_barrier_atomic_store(); + return ptr; +} + +/* This function can fail with an abort() due to a Lua critical error. + * We are in the initialisation process of HAProxy, this abort() is + * tolerated. + */ +lua_State *hlua_init_state(int thread_num) +{ + int i; + int idx; + struct sample_fetch *sf; + struct sample_conv *sc; + char *p; + const char *error_msg; + void **context; + lua_State *L; + struct prepend_path *pp; + + /* Init main lua stack. */ + L = lua_newstate(hlua_alloc, &hlua_global_allocator); + + if (!L) { + fprintf(stderr, + "Lua init: critical error: lua_newstate() returned NULL." + " This may possibly be caused by a memory allocation error.\n"); + exit(1); + } + + /* Initialise Lua context to NULL */ + context = lua_getextraspace(L); + *context = NULL; + + /* From this point, until the end of the initialisation function, + * the Lua function can fail with an abort. We are in the initialisation + * process of HAProxy, this abort() is tolerated. + */ + + /* Call post initialisation function in safe environment. */ + if (setjmp(safe_ljmp_env) != 0) { + lua_atpanic(L, hlua_panic_safe); + if (lua_type(L, -1) == LUA_TSTRING) + error_msg = lua_tostring(L, -1); + else + error_msg = "critical error"; + fprintf(stderr, "Lua init: %s.\n", error_msg); + exit(1); + } else { + lua_atpanic(L, hlua_panic_ljmp); + } + + /* Initialise lua. */ + luaL_openlibs(L); +#define HLUA_PREPEND_PATH_TOSTRING1(x) #x +#define HLUA_PREPEND_PATH_TOSTRING(x) HLUA_PREPEND_PATH_TOSTRING1(x) +#ifdef HLUA_PREPEND_PATH + hlua_prepend_path(L, "path", HLUA_PREPEND_PATH_TOSTRING(HLUA_PREPEND_PATH)); +#endif +#ifdef HLUA_PREPEND_CPATH + hlua_prepend_path(L, "cpath", HLUA_PREPEND_PATH_TOSTRING(HLUA_PREPEND_CPATH)); +#endif +#undef HLUA_PREPEND_PATH_TOSTRING +#undef HLUA_PREPEND_PATH_TOSTRING1 + + /* Apply configured prepend path */ + list_for_each_entry(pp, &prepend_path_list, l) + hlua_prepend_path(L, pp->type, pp->path); + + /* + * Override some lua functions. + * + */ + + /* push our "safe" coroutine.create() function */ + lua_getglobal(L, "coroutine"); + lua_pushcclosure(L, hlua_coroutine_create, 0); + lua_setfield(L, -2, "create"); + + /* + * + * Create "core" object. + * + */ + + /* This table entry is the object "core" base. */ + lua_newtable(L); + + /* set the thread id */ + hlua_class_const_int(L, "thread", thread_num); + + /* Push the loglevel constants. */ + for (i = 0; i < NB_LOG_LEVELS; i++) + hlua_class_const_int(L, log_levels[i], i); + + /* Register special functions. 
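+ * Each of these becomes a method of the global "core" object; an illustrative + * use from a script body: core.register_task(function() core.msleep(1000) end)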
*/ + hlua_class_function(L, "register_init", hlua_register_init); + hlua_class_function(L, "register_task", hlua_register_task); + hlua_class_function(L, "register_fetches", hlua_register_fetches); + hlua_class_function(L, "register_converters", hlua_register_converters); + hlua_class_function(L, "register_action", hlua_register_action); + hlua_class_function(L, "register_service", hlua_register_service); + hlua_class_function(L, "register_cli", hlua_register_cli); + hlua_class_function(L, "register_filter", hlua_register_filter); + hlua_class_function(L, "yield", hlua_yield); + hlua_class_function(L, "set_nice", hlua_set_nice); + hlua_class_function(L, "sleep", hlua_sleep); + hlua_class_function(L, "msleep", hlua_msleep); + hlua_class_function(L, "add_acl", hlua_add_acl); + hlua_class_function(L, "del_acl", hlua_del_acl); + hlua_class_function(L, "set_map", hlua_set_map); + hlua_class_function(L, "del_map", hlua_del_map); + hlua_class_function(L, "get_var", hlua_core_get_var); + hlua_class_function(L, "tcp", hlua_socket_new); + hlua_class_function(L, "httpclient", hlua_httpclient_new); + hlua_class_function(L, "event_sub", hlua_event_global_sub); + hlua_class_function(L, "log", hlua_log); + hlua_class_function(L, "Debug", hlua_log_debug); + hlua_class_function(L, "Info", hlua_log_info); + hlua_class_function(L, "Warning", hlua_log_warning); + hlua_class_function(L, "Alert", hlua_log_alert); + hlua_class_function(L, "done", hlua_done); + hlua_class_function(L, "disable_legacy_mailers", hlua_disable_legacy_mailers); + hlua_fcn_reg_core_fcn(L); + + lua_setglobal(L, "core"); + + /* + * + * Create "act" object. + * + */ + + /* This table entry is the object "act" base. */ + lua_newtable(L); + + /* push action return constants */ + hlua_class_const_int(L, "CONTINUE", ACT_RET_CONT); + hlua_class_const_int(L, "STOP", ACT_RET_STOP); + hlua_class_const_int(L, "YIELD", ACT_RET_YIELD); + hlua_class_const_int(L, "ERROR", ACT_RET_ERR); + hlua_class_const_int(L, "DONE", ACT_RET_DONE); + hlua_class_const_int(L, "DENY", ACT_RET_DENY); + hlua_class_const_int(L, "ABORT", ACT_RET_ABRT); + hlua_class_const_int(L, "INVALID", ACT_RET_INV); + + hlua_class_function(L, "wake_time", hlua_set_wake_time); + + lua_setglobal(L, "act"); + + /* + * + * Create "Filter" object. + * + */ + + /* This table entry is the object "filter" base. */ + lua_newtable(L); + + /* push flags and constants */ + hlua_class_const_int(L, "CONTINUE", 1); + hlua_class_const_int(L, "WAIT", 0); + hlua_class_const_int(L, "ERROR", -1); + + hlua_class_const_int(L, "FLT_CFG_FL_HTX", FLT_CFG_FL_HTX); + + hlua_class_function(L, "wake_time", hlua_set_wake_time); + hlua_class_function(L, "register_data_filter", hlua_register_data_filter); + hlua_class_function(L, "unregister_data_filter", hlua_unregister_data_filter); + + lua_setglobal(L, "filter"); + + /* + * + * Register class Map + * + */ + + /* This table entry is the object "Map" base. */ + lua_newtable(L); + + /* register pattern types. */ + for (i=0; i<PAT_MATCH_NUM; i++) + hlua_class_const_int(L, pat_match_names[i], i); + for (i=0; i<PAT_MATCH_NUM; i++) { + snprintf(trash.area, trash.size, "_%s", pat_match_names[i]); + hlua_class_const_int(L, trash.area, i); + } + + /* register constructor. */ + hlua_class_function(L, "new", hlua_map_new); + + /* Create and fill the metatable. */ + lua_newtable(L); + + /* Create and fill the __index entry. */ + lua_pushstring(L, "__index"); + lua_newtable(L); + + /* Register . 
*/ + hlua_class_function(L, "lookup", hlua_map_lookup); + hlua_class_function(L, "slookup", hlua_map_slookup); + + lua_rawset(L, -3); + + /* Register previous table in the registry with reference and named entry. + * The function hlua_register_metatable() pops the stack, so we + * previously create a copy of the table. + */ + lua_pushvalue(L, -1); /* Copy the -1 entry and push it on the stack. */ + class_map_ref = hlua_register_metatable(L, CLASS_MAP); + + /* Assign the metatable to the mai Map object. */ + lua_setmetatable(L, -2); + + /* Set a name to the table. */ + lua_setglobal(L, "Map"); + + /* + * + * Register "CertCache" class + * + */ + + /* Create and fill the metatable. */ + lua_newtable(L); + /* Register */ + hlua_class_function(L, "set", hlua_ckch_set); + lua_setglobal(L, CLASS_CERTCACHE); /* Create global object called Regex */ + + /* + * + * Register class Channel + * + */ + + /* Create and fill the metatable. */ + lua_newtable(L); + + /* Create and fill the __index entry. */ + lua_pushstring(L, "__index"); + lua_newtable(L); + + /* Register . */ + hlua_class_function(L, "data", hlua_channel_get_data); + hlua_class_function(L, "line", hlua_channel_get_line); + hlua_class_function(L, "set", hlua_channel_set_data); + hlua_class_function(L, "remove", hlua_channel_del_data); + hlua_class_function(L, "append", hlua_channel_append); + hlua_class_function(L, "prepend", hlua_channel_prepend); + hlua_class_function(L, "insert", hlua_channel_insert_data); + hlua_class_function(L, "send", hlua_channel_send); + hlua_class_function(L, "forward", hlua_channel_forward); + hlua_class_function(L, "input", hlua_channel_get_in_len); + hlua_class_function(L, "output", hlua_channel_get_out_len); + hlua_class_function(L, "may_recv", hlua_channel_may_recv); + hlua_class_function(L, "is_full", hlua_channel_is_full); + hlua_class_function(L, "is_resp", hlua_channel_is_resp); + + /* Deprecated API */ + hlua_class_function(L, "get", hlua_channel_get); + hlua_class_function(L, "dup", hlua_channel_dup); + hlua_class_function(L, "getline", hlua_channel_getline); + hlua_class_function(L, "get_in_len", hlua_channel_get_in_len); + hlua_class_function(L, "get_out_len", hlua_channel_get_out_len); + + lua_rawset(L, -3); + + /* Register previous table in the registry with reference and named entry. */ + class_channel_ref = hlua_register_metatable(L, CLASS_CHANNEL); + + /* + * + * Register class Fetches + * + */ + + /* Create and fill the metatable. */ + lua_newtable(L); + + /* Create and fill the __index entry. */ + lua_pushstring(L, "__index"); + lua_newtable(L); + + /* Browse existing fetches and create the associated + * object method. + */ + sf = NULL; + while ((sf = sample_fetch_getnext(sf, &idx)) != NULL) { + /* gL.Tua doesn't support '.' and '-' in the function names, replace it + * by an underscore. + */ + strlcpy2(trash.area, sf->kw, trash.size); + for (p = trash.area; *p; p++) + if (*p == '.' || *p == '-' || *p == '+') + *p = '_'; + + /* Register the function. */ + lua_pushstring(L, trash.area); + lua_pushlightuserdata(L, sf); + lua_pushcclosure(L, hlua_run_sample_fetch, 1); + lua_rawset(L, -3); + } + + lua_rawset(L, -3); + + /* Register previous table in the registry with reference and named entry. */ + class_fetches_ref = hlua_register_metatable(L, CLASS_FETCHES); + + /* + * + * Register class Converters + * + */ + + /* Create and fill the metatable. */ + lua_newtable(L); + + /* Create and fill the __index entry. 
*/ + lua_pushstring(L, "__index"); + lua_newtable(L); + + /* Browse existing converters and create the associated + * object method. + */ + sc = NULL; + while ((sc = sample_conv_getnext(sc, &idx)) != NULL) { + /* gL.Tua doesn't support '.' and '-' in the function names, replace it + * by an underscore. + */ + strlcpy2(trash.area, sc->kw, trash.size); + for (p = trash.area; *p; p++) + if (*p == '.' || *p == '-' || *p == '+') + *p = '_'; + + /* Register the function. */ + lua_pushstring(L, trash.area); + lua_pushlightuserdata(L, sc); + lua_pushcclosure(L, hlua_run_sample_conv, 1); + lua_rawset(L, -3); + } + + lua_rawset(L, -3); + + /* Register previous table in the registry with reference and named entry. */ + class_converters_ref = hlua_register_metatable(L, CLASS_CONVERTERS); + + /* + * + * Register class HTTP + * + */ + + /* Create and fill the metatable. */ + lua_newtable(L); + + /* Create and fill the __index entry. */ + lua_pushstring(L, "__index"); + lua_newtable(L); + + /* Register Lua functions. */ + hlua_class_function(L, "req_get_headers",hlua_http_req_get_headers); + hlua_class_function(L, "req_del_header", hlua_http_req_del_hdr); + hlua_class_function(L, "req_rep_header", hlua_http_req_rep_hdr); + hlua_class_function(L, "req_rep_value", hlua_http_req_rep_val); + hlua_class_function(L, "req_add_header", hlua_http_req_add_hdr); + hlua_class_function(L, "req_set_header", hlua_http_req_set_hdr); + hlua_class_function(L, "req_set_method", hlua_http_req_set_meth); + hlua_class_function(L, "req_set_path", hlua_http_req_set_path); + hlua_class_function(L, "req_set_query", hlua_http_req_set_query); + hlua_class_function(L, "req_set_uri", hlua_http_req_set_uri); + + hlua_class_function(L, "res_get_headers",hlua_http_res_get_headers); + hlua_class_function(L, "res_del_header", hlua_http_res_del_hdr); + hlua_class_function(L, "res_rep_header", hlua_http_res_rep_hdr); + hlua_class_function(L, "res_rep_value", hlua_http_res_rep_val); + hlua_class_function(L, "res_add_header", hlua_http_res_add_hdr); + hlua_class_function(L, "res_set_header", hlua_http_res_set_hdr); + hlua_class_function(L, "res_set_status", hlua_http_res_set_status); + + lua_rawset(L, -3); + + /* Register previous table in the registry with reference and named entry. */ + class_http_ref = hlua_register_metatable(L, CLASS_HTTP); + + /* + * + * Register class HTTPMessage + * + */ + + /* Create and fill the metatable. */ + lua_newtable(L); + + /* Create and fill the __index entry. */ + lua_pushstring(L, "__index"); + lua_newtable(L); + + /* Register Lua functions. 
*/ + hlua_class_function(L, "is_resp", hlua_http_msg_is_resp); + hlua_class_function(L, "get_stline", hlua_http_msg_get_stline); + hlua_class_function(L, "get_headers", hlua_http_msg_get_headers); + hlua_class_function(L, "del_header", hlua_http_msg_del_hdr); + hlua_class_function(L, "rep_header", hlua_http_msg_rep_hdr); + hlua_class_function(L, "rep_value", hlua_http_msg_rep_val); + hlua_class_function(L, "add_header", hlua_http_msg_add_hdr); + hlua_class_function(L, "set_header", hlua_http_msg_set_hdr); + hlua_class_function(L, "set_method", hlua_http_msg_set_meth); + hlua_class_function(L, "set_path", hlua_http_msg_set_path); + hlua_class_function(L, "set_query", hlua_http_msg_set_query); + hlua_class_function(L, "set_uri", hlua_http_msg_set_uri); + hlua_class_function(L, "set_status", hlua_http_msg_set_status); + hlua_class_function(L, "is_full", hlua_http_msg_is_full); + hlua_class_function(L, "may_recv", hlua_http_msg_may_recv); + hlua_class_function(L, "eom", hlua_http_msg_is_eom); + hlua_class_function(L, "input", hlua_http_msg_get_in_len); + hlua_class_function(L, "output", hlua_http_msg_get_out_len); + + hlua_class_function(L, "body", hlua_http_msg_get_body); + hlua_class_function(L, "set", hlua_http_msg_set_data); + hlua_class_function(L, "remove", hlua_http_msg_del_data); + hlua_class_function(L, "append", hlua_http_msg_append); + hlua_class_function(L, "prepend", hlua_http_msg_prepend); + hlua_class_function(L, "insert", hlua_http_msg_insert_data); + hlua_class_function(L, "set_eom", hlua_http_msg_set_eom); + hlua_class_function(L, "unset_eom", hlua_http_msg_unset_eom); + + hlua_class_function(L, "send", hlua_http_msg_send); + hlua_class_function(L, "forward", hlua_http_msg_forward); + + lua_rawset(L, -3); + + /* Register previous table in the registry with reference and named entry. */ + class_http_msg_ref = hlua_register_metatable(L, CLASS_HTTP_MSG); + + /* + * + * Register class HTTPClient + * + */ + + /* Create and fill the metatable. */ + lua_newtable(L); + lua_pushstring(L, "__index"); + lua_newtable(L); + hlua_class_function(L, "get", hlua_httpclient_get); + hlua_class_function(L, "head", hlua_httpclient_head); + hlua_class_function(L, "put", hlua_httpclient_put); + hlua_class_function(L, "post", hlua_httpclient_post); + hlua_class_function(L, "delete", hlua_httpclient_delete); + lua_settable(L, -3); /* Sets the __index entry. */ + /* Register the garbage collector entry. */ + lua_pushstring(L, "__gc"); + lua_pushcclosure(L, hlua_httpclient_gc, 0); + lua_settable(L, -3); /* Push the last 2 entries in the table at index -3 */ + + + + class_httpclient_ref = hlua_register_metatable(L, CLASS_HTTPCLIENT); + /* + * + * Register class AppletTCP + * + */ + + /* Create and fill the metatable. */ + lua_newtable(L); + + /* Create and fill the __index entry. */ + lua_pushstring(L, "__index"); + lua_newtable(L); + + /* Register Lua functions. */ + hlua_class_function(L, "getline", hlua_applet_tcp_getline); + hlua_class_function(L, "receive", hlua_applet_tcp_recv); + hlua_class_function(L, "send", hlua_applet_tcp_send); + hlua_class_function(L, "set_priv", hlua_applet_tcp_set_priv); + hlua_class_function(L, "get_priv", hlua_applet_tcp_get_priv); + hlua_class_function(L, "set_var", hlua_applet_tcp_set_var); + hlua_class_function(L, "unset_var", hlua_applet_tcp_unset_var); + hlua_class_function(L, "get_var", hlua_applet_tcp_get_var); + + lua_settable(L, -3); + + /* Register previous table in the registry with reference and named entry. 
*/ + class_applet_tcp_ref = hlua_register_metatable(L, CLASS_APPLET_TCP); + + /* + * + * Register class AppletHTTP + * + */ + + /* Create and fill the metatable. */ + lua_newtable(L); + + /* Create and fill the __index entry. */ + lua_pushstring(L, "__index"); + lua_newtable(L); + + /* Register Lua functions. */ + hlua_class_function(L, "set_priv", hlua_applet_http_set_priv); + hlua_class_function(L, "get_priv", hlua_applet_http_get_priv); + hlua_class_function(L, "set_var", hlua_applet_http_set_var); + hlua_class_function(L, "unset_var", hlua_applet_http_unset_var); + hlua_class_function(L, "get_var", hlua_applet_http_get_var); + hlua_class_function(L, "getline", hlua_applet_http_getline); + hlua_class_function(L, "receive", hlua_applet_http_recv); + hlua_class_function(L, "send", hlua_applet_http_send); + hlua_class_function(L, "add_header", hlua_applet_http_addheader); + hlua_class_function(L, "set_status", hlua_applet_http_status); + hlua_class_function(L, "start_response", hlua_applet_http_start_response); + + lua_settable(L, -3); + + /* Register previous table in the registry with reference and named entry. */ + class_applet_http_ref = hlua_register_metatable(L, CLASS_APPLET_HTTP); + + /* + * + * Register class TXN + * + */ + + /* Create and fill the metatable. */ + lua_newtable(L); + + /* Create and fill the __index entry. */ + lua_pushstring(L, "__index"); + lua_newtable(L); + + /* Register Lua functions. */ + hlua_class_function(L, "set_priv", hlua_set_priv); + hlua_class_function(L, "get_priv", hlua_get_priv); + hlua_class_function(L, "set_var", hlua_set_var); + hlua_class_function(L, "unset_var", hlua_unset_var); + hlua_class_function(L, "get_var", hlua_get_var); + hlua_class_function(L, "done", hlua_txn_done); + hlua_class_function(L, "reply", hlua_txn_reply_new); + hlua_class_function(L, "set_loglevel", hlua_txn_set_loglevel); + hlua_class_function(L, "set_tos", hlua_txn_set_tos); + hlua_class_function(L, "set_mark", hlua_txn_set_mark); + hlua_class_function(L, "set_priority_class", hlua_txn_set_priority_class); + hlua_class_function(L, "set_priority_offset", hlua_txn_set_priority_offset); + hlua_class_function(L, "deflog", hlua_txn_deflog); + hlua_class_function(L, "log", hlua_txn_log); + hlua_class_function(L, "Debug", hlua_txn_log_debug); + hlua_class_function(L, "Info", hlua_txn_log_info); + hlua_class_function(L, "Warning", hlua_txn_log_warning); + hlua_class_function(L, "Alert", hlua_txn_log_alert); + + lua_rawset(L, -3); + + /* Register previous table in the registry with reference and named entry. */ + class_txn_ref = hlua_register_metatable(L, CLASS_TXN); + + /* + * + * Register class reply + * + */ + lua_newtable(L); + lua_pushstring(L, "__index"); + lua_newtable(L); + hlua_class_function(L, "set_status", hlua_txn_reply_set_status); + hlua_class_function(L, "add_header", hlua_txn_reply_add_header); + hlua_class_function(L, "del_header", hlua_txn_reply_del_header); + hlua_class_function(L, "set_body", hlua_txn_reply_set_body); + lua_settable(L, -3); /* Sets the __index entry. */ + class_txn_reply_ref = luaL_ref(L, LUA_REGISTRYINDEX); + + + /* + * + * Register class Socket + * + */ + + /* Create and fill the metatable. */ + lua_newtable(L); + + /* Create and fill the __index entry. 
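+ * An illustrative client-side use of this class from a task (address and port + * are arbitrary): local s = core.tcp(); s:connect("127.0.0.1", 80); s:send("ping")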
*/ + lua_pushstring(L, "__index"); + lua_newtable(L); + +#ifdef USE_OPENSSL + hlua_class_function(L, "connect_ssl", hlua_socket_connect_ssl); +#endif + hlua_class_function(L, "connect", hlua_socket_connect); + hlua_class_function(L, "send", hlua_socket_send); + hlua_class_function(L, "receive", hlua_socket_receive); + hlua_class_function(L, "close", hlua_socket_close); + hlua_class_function(L, "getpeername", hlua_socket_getpeername); + hlua_class_function(L, "getsockname", hlua_socket_getsockname); + hlua_class_function(L, "setoption", hlua_socket_setoption); + hlua_class_function(L, "settimeout", hlua_socket_settimeout); + + lua_rawset(L, -3); /* Push the last 2 entries in the table at index -3 */ + + /* Register the garbage collector entry. */ + lua_pushstring(L, "__gc"); + lua_pushcclosure(L, hlua_socket_gc, 0); + lua_rawset(L, -3); /* Push the last 2 entries in the table at index -3 */ + + /* Register previous table in the registry with reference and named entry. */ + class_socket_ref = hlua_register_metatable(L, CLASS_SOCKET); + + lua_atpanic(L, hlua_panic_safe); + + return L; +} + +void hlua_init(void) { + int i; + char *errmsg; +#ifdef USE_OPENSSL + struct srv_kw *kw; + int tmp_error; + char *error; + char *args[] = { /* SSL client configuration. */ + "ssl", + "verify", + "none", + NULL + }; +#endif + + /* Init post init function list head */ + for (i = 0; i < MAX_THREADS + 1; i++) + LIST_INIT(&hlua_init_functions[i]); + + /* Init state for common/shared lua parts */ + hlua_state_id = 0; + ha_set_thread(NULL); + hlua_states[0] = hlua_init_state(0); + + /* Init state 1 for thread 0. We have at least one thread. */ + hlua_state_id = 1; + ha_set_thread(NULL); + hlua_states[1] = hlua_init_state(1); + + /* Proxy and server configuration initialisation. */ + socket_proxy = alloc_new_proxy("LUA-SOCKET", PR_CAP_FE|PR_CAP_BE|PR_CAP_INT, &errmsg); + if (!socket_proxy) { + fprintf(stderr, "Lua init: %s\n", errmsg); + exit(1); + } + + /* Init TCP server: unchanged parameters */ + socket_tcp = new_server(socket_proxy); + if (!socket_tcp) { + fprintf(stderr, "Lua init: failed to allocate tcp server socket\n"); + exit(1); + } + +#ifdef USE_OPENSSL + /* Init TCP server: unchanged parameters */ + socket_ssl = new_server(socket_proxy); + if (!socket_ssl) { + fprintf(stderr, "Lua init: failed to allocate ssl server socket\n"); + exit(1); + } + + socket_ssl->use_ssl = 1; + socket_ssl->xprt = xprt_get(XPRT_SSL); + + for (i = 0; args[i] != NULL; i++) { + if ((kw = srv_find_kw(args[i])) != NULL) { /* Maybe it's registered server keyword */ + /* + * + * If the keyword is not known, we can search in the registered + * server keywords. This is useful to configure special SSL + * features like client certificates and ssl_verify. + * + */ + tmp_error = kw->parse(args, &i, socket_proxy, socket_ssl, &error); + if (tmp_error != 0) { + fprintf(stderr, "INTERNAL ERROR: %s\n", error); + abort(); /* This must be never arrives because the command line + not editable by the user. 
*/ + i += kw->skip; + } + } +#endif + +} + +static void hlua_deinit() +{ + int thr; + struct hlua_reg_filter *reg_flt, *reg_flt_bck; + + list_for_each_entry_safe(reg_flt, reg_flt_bck, &referenced_filters, l) + release_hlua_reg_filter(reg_flt); + + for (thr = 0; thr < MAX_THREADS+1; thr++) { + if (hlua_states[thr]) + lua_close(hlua_states[thr]); + } + + srv_drop(socket_tcp); + +#ifdef USE_OPENSSL + srv_drop(socket_ssl); +#endif + + free_proxy(socket_proxy); +} + +REGISTER_POST_DEINIT(hlua_deinit); + +static void hlua_register_build_options(void) +{ + char *ptr = NULL; + + memprintf(&ptr, "Built with Lua version : %s", LUA_RELEASE); + hap_register_build_opts(ptr, 1); +} + +INITCALL0(STG_REGISTER, hlua_register_build_options); diff --git a/src/hlua_fcn.c b/src/hlua_fcn.c new file mode 100644 index 0000000..d8dcdfd --- /dev/null +++ b/src/hlua_fcn.c @@ -0,0 +1,2721 @@ +/* + * Lua safe functions + * + * Copyright 2015-2016 Thierry Fournier <tfournier@arpalert.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * + * All the functions in this file run with a Lua stack, and can + * return with a longjmp. All of these functions must be launched + * in an environment able to catch a longjmp, otherwise a + * critical error can be raised. + */ + +#define _GNU_SOURCE + +#include <lauxlib.h> +#include <lua.h> +#include <lualib.h> + +#include <import/ebmbtree.h> + +#include <haproxy/cli-t.h> +#include <haproxy/errors.h> +#include <haproxy/hlua.h> +#include <haproxy/hlua_fcn.h> +#include <haproxy/http.h> +#include <haproxy/net_helper.h> +#include <haproxy/pattern-t.h> +#include <haproxy/proxy.h> +#include <haproxy/regex.h> +#include <haproxy/server.h> +#include <haproxy/stats.h> +#include <haproxy/stick_table.h> +#include <haproxy/event_hdl.h> +#include <haproxy/stream-t.h> +#include <haproxy/time.h> +#include <haproxy/tools.h> +#include <haproxy/mailers.h> + +/* Contains the class references of the objects created below, the first one + * being the concat object.
*/ +static int class_concat_ref; +static int class_queue_ref; +static int class_proxy_ref; +static int class_server_ref; +static int class_listener_ref; +static int class_event_sub_ref; +static int class_regex_ref; +static int class_stktable_ref; +static int class_proxy_list_ref; +static int class_server_list_ref; + +#define STATS_LEN (MAX((int)ST_F_TOTAL_FIELDS, (int)INF_TOTAL_FIELDS)) + +static THREAD_LOCAL struct field stats[STATS_LEN]; + +int hlua_checkboolean(lua_State *L, int index) +{ + if (!lua_isboolean(L, index)) + luaL_argerror(L, index, "boolean expected"); + return lua_toboolean(L, index); +} + +/* Helper to push unsigned integers to Lua stack, respecting Lua limitations */ +static int hlua_fcn_pushunsigned(lua_State *L, unsigned int val) +{ +#if (LUA_MAXINTEGER == LLONG_MAX || ((LUA_MAXINTEGER == LONG_MAX) && (__WORDSIZE == 64))) + lua_pushinteger(L, val); +#else + if (val > INT_MAX) + lua_pushnumber(L, (lua_Number)val); + else + lua_pushinteger(L, (int)val); +#endif + return 1; +} + +/* Helper to push unsigned long long to Lua stack, respecting Lua limitations */ +static int hlua_fcn_pushunsigned_ll(lua_State *L, unsigned long long val) { +#if (LUA_MAXINTEGER == LLONG_MAX || ((LUA_MAXINTEGER == LONG_MAX) && (__WORDSIZE == 64))) + /* 64 bits case, U64 is supported until LLONG_MAX */ + if (val > LLONG_MAX) + lua_pushnumber(L, (lua_Number)val); + else + lua_pushinteger(L, val); +#else + /* 32 bits case, U64 is supported until INT_MAX */ + if (val > INT_MAX) + lua_pushnumber(L, (lua_Number)val); + else + lua_pushinteger(L, (int)val); +#endif + return 1; +} + +/* This function gets a struct field and converts it into a Lua + * variable. The variable is pushed on top of the stack. + */ +int hlua_fcn_pushfield(lua_State *L, struct field *field) +{ + /* The lua_Integer is always signed. Its length depends on + * compilation options, so the following code is conditioned + * by some macros. Windows macros are not supported. + * If the number cannot be represented as an integer, we fall + * back to a float. + */ + switch (field_format(field, 0)) { + + case FF_EMPTY: + lua_pushnil(L); + return 1; + + case FF_S32: + /* S32 is always supported. */ + lua_pushinteger(L, field->u.s32); + return 1; + + case FF_U32: +#if (LUA_MAXINTEGER == LLONG_MAX || ((LUA_MAXINTEGER == LONG_MAX) && (__WORDSIZE == 64))) + /* 64 bits case, U32 is always supported */ + lua_pushinteger(L, field->u.u32); +#else + /* 32 bits case, U32 is supported until INT_MAX.
*/ + if (field->u.u32 > INT_MAX) + lua_pushnumber(L, (lua_Number)field->u.u32); + else + lua_pushinteger(L, field->u.u32); +#endif + return 1; + + case FF_S64: +#if (LUA_MAXINTEGER == LLONG_MAX || ((LUA_MAXINTEGER == LONG_MAX) && (__WORDSIZE == 64))) + /* 64 bits case, S64 is always supported */ + lua_pushinteger(L, field->u.s64); +#else + /* 32 bits case, S64 is supported between INT_MIN and INT_MAX */ + if (field->u.s64 < INT_MIN || field->u.s64 > INT_MAX) + lua_pushnumber(L, (lua_Number)field->u.s64); + else + lua_pushinteger(L, (int)field->u.s64); +#endif + return 1; + + case FF_U64: +#if (LUA_MAXINTEGER == LLONG_MAX || ((LUA_MAXINTEGER == LONG_MAX) && (__WORDSIZE == 64))) + /* 64 bits case, U64 is supported until LLONG_MAX */ + if (field->u.u64 > LLONG_MAX) + lua_pushnumber(L, (lua_Number)field->u.u64); + else + lua_pushinteger(L, field->u.u64); +#else + /* 32 bits case, U64 is supported until INT_MAX */ + if (field->u.u64 > INT_MAX) + lua_pushnumber(L, (lua_Number)field->u.u64); + else + lua_pushinteger(L, (int)field->u.u64); +#endif + return 1; + + case FF_STR: + lua_pushstring(L, field->u.str); + return 1; + + default: + break; + } + + /* Default case, never reached. */ + lua_pushnil(L); + return 1; +} + +/* Some strings start or end with blank chars. This function removes + * the spaces, tabs, \r and \n at the beginning and at the end of the + * string "str", and pushes the result on the Lua stack. + * Returns a pointer to the Lua internal copy of the string. + */ +const char *hlua_pushstrippedstring(lua_State *L, const char *str) +{ + const char *p; + int l; + + for (p = str; HTTP_IS_LWS(*p); p++); + + for (l = strlen(p); l && HTTP_IS_LWS(p[l-1]); l--); + + return lua_pushlstring(L, p, l); +} + +/* The three following functions are useful for adding entries + * in a table. These functions take a string and respectively an + * integer, a string or a function, and add it to the table at the + * top of the stack. + * + * These functions throw an error if no more stack space is + * available. + */ +void hlua_class_const_int(lua_State *L, const char *name, int value) +{ + lua_pushstring(L, name); + lua_pushinteger(L, value); + lua_rawset(L, -3); +} +void hlua_class_const_str(lua_State *L, const char *name, const char *value) +{ + lua_pushstring(L, name); + lua_pushstring(L, value); + lua_rawset(L, -3); +} +void hlua_class_function(lua_State *L, const char *name, int (*function)(lua_State *L)) +{ + lua_pushstring(L, name); + lua_pushcclosure(L, function, 0); + lua_rawset(L, -3); +} + +/* This function returns a string containing the HAProxy object name. */ +int hlua_dump_object(struct lua_State *L) +{ + const char *name = (const char *)lua_tostring(L, lua_upvalueindex(1)); + lua_pushfstring(L, "HAProxy class %s", name); + return 1; +} + +/* This function registers a table as a metatable. It names + * the metatable and returns the associated reference. + * The original table is popped from the top of the stack. + * "name" is the referenced class name. + */ +int hlua_register_metatable(struct lua_State *L, char *name) +{ + /* Check the type of the top element. It must be + * a table. + */ + if (lua_type(L, -1) != LUA_TTABLE) + luaL_error(L, "hlua_register_metatable() requires a type Table " + "at the top of the stack"); + + /* Add the __tostring function which identifies the + * created object. + */ + lua_pushstring(L, "__tostring"); + lua_pushstring(L, name); + lua_pushcclosure(L, hlua_dump_object, 1); + lua_rawset(L, -3); + + /* Register a named entry for the table.
The table + * reference is copied first because the function + * lua_setfield() pops the entry. + */ + lua_pushvalue(L, -1); + lua_setfield(L, LUA_REGISTRYINDEX, name); + + /* Create the reference of the object. The + * function luaL_ref() pops the top of the stack. + */ + return luaL_ref(L, LUA_REGISTRYINDEX); +} + +/* Returns an object of the expected type, or throws an error. */ +void *hlua_checkudata(lua_State *L, int ud, int class_ref) +{ + void *p; + int ret; + + /* Check if the stack entry is an array. */ + if (!lua_istable(L, ud)) + luaL_argerror(L, ud, NULL); + + /* pop the metatable of the referenced object. */ + if (!lua_getmetatable(L, ud)) + luaL_argerror(L, ud, NULL); + + /* pop the expected metatable. */ + lua_rawgeti(L, LUA_REGISTRYINDEX, class_ref); + + /* Check if the metatable has the expected type. */ + ret = lua_rawequal(L, -1, -2); + lua_pop(L, 2); + if (!ret) + luaL_argerror(L, ud, NULL); + + /* Push the entry [0] of the table on the stack. */ + lua_rawgeti(L, ud, 0); + + /* Check if this entry is userdata. */ + p = lua_touserdata(L, -1); + if (!p) + luaL_argerror(L, ud, NULL); + + /* Remove the entry returned by lua_rawgeti(). */ + lua_pop(L, 1); + + /* Return the associated struct. */ + return p; +} + +/* This function returns the current date in epoch format, as a table with + * "sec" and "usec" entries. */ +int hlua_now(lua_State *L) +{ + /* WT: the doc says "returns the current time" and later says that it's + * monotonic. So the best fit is to use start_date+(now-start_time). + */ + struct timeval tv; + + tv = NS_TO_TV(now_ns - start_time_ns); + tv_add(&tv, &tv, &start_date); + + lua_newtable(L); + lua_pushstring(L, "sec"); + lua_pushinteger(L, tv.tv_sec); + lua_rawset(L, -3); + lua_pushstring(L, "usec"); + lua_pushinteger(L, tv.tv_usec); + lua_rawset(L, -3); + return 1; +} + +/* This function expects a Lua string containing an HTTP date, parses it and + * returns an integer containing the epoch format of the date, or + * nil if the parsing fails. + */ +static int hlua_parse_date(lua_State *L, int (*fcn)(const char *, int, struct tm*)) +{ + const char *str; + size_t len; + struct tm tm; + time_t time; + + str = luaL_checklstring(L, 1, &len); + + if (!fcn(str, len, &tm)) { + lua_pushnil(L); + return 1; + } + + /* This function considers the content of the broken-down time + * is expressed in the UTC timezone. timegm doesn't care about + * the GNU variable tm_gmtoff. If gmtoff is set, or if you know + * the timezone from the broken-down time, it must be fixed + * after the conversion.
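+ * Illustrative call once this is exposed as core.http_date() by the core + * function registration further down this file: + * core.http_date("Thu, 01 Jan 1970 00:00:00 GMT") returns 0.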
+ */ + time = my_timegm(&tm); + if (time == -1) { + lua_pushnil(L); + return 1; + } + + lua_pushinteger(L, (int)time); + return 1; +} +static int hlua_http_date(lua_State *L) +{ + return hlua_parse_date(L, parse_http_date); +} +static int hlua_imf_date(lua_State *L) +{ + return hlua_parse_date(L, parse_imf_date); +} +static int hlua_rfc850_date(lua_State *L) +{ + return hlua_parse_date(L, parse_rfc850_date); +} +static int hlua_asctime_date(lua_State *L) +{ + return hlua_parse_date(L, parse_asctime_date); +} + +static int hlua_get_info(lua_State *L) +{ + int i; + + stats_fill_info(stats, STATS_LEN, 0); + + lua_newtable(L); + for (i=0; i<INF_TOTAL_FIELDS; i++) { + lua_pushstring(L, info_fields[i].name); + hlua_fcn_pushfield(L, &stats[i]); + lua_settable(L, -3); + } + return 1; +} + +static struct hlua_concat *hlua_check_concat(lua_State *L, int ud) +{ + return (hlua_checkudata(L, ud, class_concat_ref)); +} + +static int hlua_concat_add(lua_State *L) +{ + struct hlua_concat *b; + char *buffer; + char *new; + const char *str; + size_t l; + + /* First arg must be a concat object. */ + b = hlua_check_concat(L, 1); + + /* Second arg must be a string. */ + str = luaL_checklstring(L, 2, &l); + + /* Get the buffer. */ + lua_rawgeti(L, 1, 1); + buffer = lua_touserdata(L, -1); + lua_pop(L, 1); + + /* Update the buffer size if it s required. The old buffer + * is crushed by the new in the object array, so it will + * be deleted by the GC. + * Note that in the first loop, the "new" variable is only + * used as a flag. + */ + new = NULL; + while (b->size - b->len < l) { + b->size += HLUA_CONCAT_BLOCSZ; + new = buffer; + } + if (new) { + new = lua_newuserdata(L, b->size); + memcpy(new, buffer, b->len); + lua_rawseti(L, 1, 1); + buffer = new; + } + + /* Copy string, and update metadata. */ + memcpy(buffer + b->len, str, l); + b->len += l; + return 0; +} + +static int hlua_concat_dump(lua_State *L) +{ + struct hlua_concat *b; + char *buffer; + + /* First arg must be a concat object. */ + b = hlua_check_concat(L, 1); + + /* Get the buffer. */ + lua_rawgeti(L, 1, 1); + buffer = lua_touserdata(L, -1); + lua_pop(L, 1); + + /* Push the soncatenated string in the stack. */ + lua_pushlstring(L, buffer, b->len); + return 1; +} + +int hlua_concat_new(lua_State *L) +{ + struct hlua_concat *b; + + lua_newtable(L); + b = lua_newuserdata(L, sizeof(*b)); + b->size = HLUA_CONCAT_BLOCSZ; + b->len = 0; + lua_rawseti(L, -2, 0); + lua_newuserdata(L, HLUA_CONCAT_BLOCSZ); + lua_rawseti(L, -2, 1); + + lua_rawgeti(L, LUA_REGISTRYINDEX, class_concat_ref); + lua_setmetatable(L, -2); + + return 1; +} + +static int concat_tostring(lua_State *L) +{ + const void *ptr = lua_topointer(L, 1); + lua_pushfstring(L, "Concat object: %p", ptr); + return 1; +} + +static void hlua_concat_init(lua_State *L) +{ + /* Creates the buffered concat object. */ + lua_newtable(L); + + lua_pushstring(L, "__tostring"); + lua_pushcclosure(L, concat_tostring, 0); + lua_settable(L, -3); + + lua_pushstring(L, "__index"); /* Creates the index entry. */ + lua_newtable(L); /* The "__index" content. */ + + lua_pushstring(L, "add"); + lua_pushcclosure(L, hlua_concat_add, 0); + lua_settable(L, -3); + + lua_pushstring(L, "dump"); + lua_pushcclosure(L, hlua_concat_dump, 0); + lua_settable(L, -3); + + lua_settable(L, -3); /* Sets the __index entry. 
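+	 *
+	 * From Lua, the resulting Concat object is then used roughly as
+	 * follows (a hypothetical sketch, not taken from the source;
+	 * core.concat() is registered above, core.Info() is assumed from
+	 * the usual hlua logging helpers):
+	 *
+	 *   local c = core.concat()
+	 *   c:add("Hello, ")
+	 *   c:add("world")
+	 *   core.Info(c:dump())   -- logs "Hello, world"
+	 *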
*/ + class_concat_ref = luaL_ref(L, LUA_REGISTRYINDEX); +} + +/* C backing storage for lua Queue class */ +struct hlua_queue { + uint32_t size; + struct mt_list list; + struct mt_list wait_tasks; +}; + +/* used to store lua objects in queue->list */ +struct hlua_queue_item { + int ref; /* lua object reference id */ + struct mt_list list; +}; + +/* used to store wait entries in queue->wait_tasks */ +struct hlua_queue_wait +{ + struct task *task; + struct mt_list entry; +}; + +/* This is the memory pool containing struct hlua_queue_item (queue items) + */ +DECLARE_STATIC_POOL(pool_head_hlua_queue, "hlua_queue", sizeof(struct hlua_queue_item)); + +/* This is the memory pool containing struct hlua_queue_wait + * (queue waiting tasks) + */ +DECLARE_STATIC_POOL(pool_head_hlua_queuew, "hlua_queuew", sizeof(struct hlua_queue_wait)); + +static struct hlua_queue *hlua_check_queue(lua_State *L, int ud) +{ + return hlua_checkudata(L, ud, class_queue_ref); +} + +/* queue:size(): returns an integer containing the current number of queued + * items. + */ +static int hlua_queue_size(lua_State *L) +{ + struct hlua_queue *queue = hlua_check_queue(L, 1); + + BUG_ON(!queue); + lua_pushinteger(L, HA_ATOMIC_LOAD(&queue->size)); + + return 1; +} + +/* queue:push(): push an item (any type, except nil) at the end of the queue + * + * Returns boolean:true for success and boolean:false on error + */ +static int hlua_queue_push(lua_State *L) +{ + struct hlua_queue *queue = hlua_check_queue(L, 1); + struct hlua_queue_item *item; + struct mt_list *elt1, elt2; + struct hlua_queue_wait *waiter; + + if (lua_gettop(L) != 2 || lua_isnoneornil(L, 2)) { + luaL_error(L, "unexpected argument"); + /* not reached */ + return 0; + } + BUG_ON(!queue); + + item = pool_alloc(pool_head_hlua_queue); + if (!item) { + /* memory error */ + lua_pushboolean(L, 0); + return 1; + } + + /* get a reference from lua object at the top of the stack */ + item->ref = hlua_ref(L); + + /* push new entry to the queue */ + MT_LIST_INIT(&item->list); + HA_ATOMIC_INC(&queue->size); + MT_LIST_APPEND(&queue->list, &item->list); + + /* notify tasks waiting on queue:pop_wait() (if any) */ + mt_list_for_each_entry_safe(waiter, &queue->wait_tasks, entry, elt1, elt2) { + task_wakeup(waiter->task, TASK_WOKEN_MSG); + } + + lua_pushboolean(L, 1); + return 1; +} + +/* internal queue pop helper, returns 1 if it successfully popped an item + * from the queue and pushed it on lua stack. + * + * Else it returns 0 (nothing is pushed on the stack) + */ +static int _hlua_queue_pop(lua_State *L, struct hlua_queue *queue) +{ + struct hlua_queue_item *item; + + item = MT_LIST_POP(&queue->list, typeof(item), list); + if (!item) + return 0; /* nothing in queue */ + + HA_ATOMIC_DEC(&queue->size); + /* push lua obj on the stack */ + hlua_pushref(L, item->ref); + + /* obj ref should be released right away since it was pushed + * on the stack and will not be used anymore + */ + hlua_unref(L, item->ref); + + /* free the queue item */ + pool_free(pool_head_hlua_queue, item); + + return 1; +} + +/* queue:pop(): returns the first item at the top of que queue or nil if + * the queue is empty. + */ +static int hlua_queue_pop(lua_State *L) +{ + struct hlua_queue *queue = hlua_check_queue(L, 1); + + BUG_ON(!queue); + if (!_hlua_queue_pop(L, queue)) { + /* nothing in queue, push nil */ + lua_pushnil(L); + } + return 1; /* either item or nil is at the top of the stack */ +} + +/* queue:pop_wait(): same as queue:pop() but doesn't return on empty queue. 
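+ *
+ * A typical Lua pattern is a producer/consumer pair sharing one queue.
+ * The following is a hypothetical sketch (names are illustrative;
+ * core.register_task(), core.msleep() and core.Info() are assumed from
+ * the usual hlua core API):
+ *
+ *   local q = core.queue()
+ *   core.register_task(function()
+ *       while true do
+ *           core.Info("consumed: " .. tostring(q:pop_wait()))
+ *       end
+ *   end)
+ *   core.register_task(function()
+ *       while true do
+ *           q:push(core.now().sec)
+ *           core.msleep(1000)
+ *       end
+ *   end)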
+ * + * Aborts if used incorrectly and returns nil in case of memory error. + */ +static int _hlua_queue_pop_wait(lua_State *L, int status, lua_KContext ctx) +{ + struct hlua_queue *queue = hlua_check_queue(L, 1); + struct hlua_queue_wait *wait = lua_touserdata(L, 2); + + /* new pop attempt */ + if (!_hlua_queue_pop(L, queue)) { + hlua_yieldk(L, 0, 0, _hlua_queue_pop_wait, TICK_ETERNITY, 0); // wait retry + return 0; // never reached, yieldk won't return + } + + /* remove task from waiting list */ + MT_LIST_DELETE(&wait->entry); + pool_free(pool_head_hlua_queuew, wait); + + return 1; // success +} +static int hlua_queue_pop_wait(lua_State *L) +{ + struct hlua_queue *queue = hlua_check_queue(L, 1); + struct hlua_queue_wait *wait; + struct hlua *hlua; + + BUG_ON(!queue); + + /* Get hlua struct, or NULL if we execute from main lua state */ + hlua = hlua_gethlua(L); + + if (!hlua || HLUA_CANT_YIELD(hlua)) { + luaL_error(L, "pop_wait() may only be used within task context " + "(requires yielding)"); + return 0; /* not reached */ + } + + /* try opportunistic pop (there could already be pending items) */ + if (_hlua_queue_pop(L, queue)) + return 1; // success + + /* no pending items, waiting required */ + + wait = pool_alloc(pool_head_hlua_queuew); + if (!wait) { + lua_pushnil(L); + return 1; /* memory error, return nil */ + } + + wait->task = hlua->task; + MT_LIST_INIT(&wait->entry); + + /* add task to queue's wait list */ + MT_LIST_TRY_APPEND(&queue->wait_tasks, &wait->entry); + + /* push wait entry at index 2 on the stack (queue is already there) */ + lua_pushlightuserdata(L, wait); + + /* Go to waiting loop which immediately performs a new attempt to make + * sure we didn't miss a push during the wait entry initialization. + * + * _hlua_queue_pop_wait() won't return to us if it has to yield, which + * is the most likely scenario. What happens in this case is that yieldk + * call never returns, and instead Lua will call the continuation + * function after a successful resume, so the calling function will + * no longer be us, but Lua instead. And when the continuation function + * eventually returns (because it successfully popped an item), Lua will + * directly give the hand back to the Lua function that called us. + * + * More info here: https://www.lua.org/manual/5.4/manual.html#4.7 + */ + return _hlua_queue_pop_wait(L, LUA_OK, 0); +} + +static int hlua_queue_new(lua_State *L) +{ + struct hlua_queue *q; + + lua_newtable(L); + + /* set class metatable */ + lua_rawgeti(L, LUA_REGISTRYINDEX, class_queue_ref); + lua_setmetatable(L, -2); + + /* index:0 is queue userdata (c data) */ + q = lua_newuserdata(L, sizeof(*q)); + MT_LIST_INIT(&q->list); + MT_LIST_INIT(&q->wait_tasks); + q->size = 0; + lua_rawseti(L, -2, 0); + + /* class methods */ + hlua_class_function(L, "size", hlua_queue_size); + hlua_class_function(L, "pop", hlua_queue_pop); + hlua_class_function(L, "pop_wait", hlua_queue_pop_wait); + hlua_class_function(L, "push", hlua_queue_push); + + return 1; +} + +static int hlua_queue_gc(struct lua_State *L) +{ + struct hlua_queue *queue = hlua_check_queue(L, 1); + struct hlua_queue_wait *wait; + struct hlua_queue_item *item; + + /* Purge waiting tasks (if any) + * + * It is normally not expected to have waiting tasks, except if such + * task has been aborted while in the middle of a queue:pop_wait() + * function call. 
+ */ + while ((wait = MT_LIST_POP(&queue->wait_tasks, typeof(wait), entry))) { + /* free the wait entry */ + pool_free(pool_head_hlua_queuew, wait); + } + + /* purge remaining (unconsumed) items in the queue */ + while ((item = MT_LIST_POP(&queue->list, typeof(item), list))) { + /* free the queue item */ + pool_free(pool_head_hlua_queue, item); + } + + /* queue (userdata) will automatically be freed by lua gc */ + + return 0; +} + +static void hlua_queue_init(lua_State *L) +{ + /* Creates the queue object. */ + lua_newtable(L); + + hlua_class_function(L, "__gc", hlua_queue_gc); + + class_queue_ref = luaL_ref(L, LUA_REGISTRYINDEX); +} + +int hlua_fcn_new_stktable(lua_State *L, struct stktable *tbl) +{ + lua_newtable(L); + + /* Pop a class stktbl metatable and affect it to the userdata. */ + lua_rawgeti(L, LUA_REGISTRYINDEX, class_stktable_ref); + lua_setmetatable(L, -2); + + lua_pushlightuserdata(L, tbl); + lua_rawseti(L, -2, 0); + return 1; +} + +static struct stktable *hlua_check_stktable(lua_State *L, int ud) +{ + return hlua_checkudata(L, ud, class_stktable_ref); +} + +/* Extract stick table attributes into Lua table */ +int hlua_stktable_info(lua_State *L) +{ + struct stktable *tbl; + int dt; + + tbl = hlua_check_stktable(L, 1); + + if (!tbl->id) { + lua_pushnil(L); + return 1; + } + + lua_newtable(L); + + lua_pushstring(L, "type"); + lua_pushstring(L, stktable_types[tbl->type].kw); + lua_settable(L, -3); + + lua_pushstring(L, "length"); + lua_pushinteger(L, tbl->key_size); + lua_settable(L, -3); + + lua_pushstring(L, "size"); + hlua_fcn_pushunsigned(L, tbl->size); + lua_settable(L, -3); + + lua_pushstring(L, "used"); + hlua_fcn_pushunsigned(L, tbl->current); + lua_settable(L, -3); + + lua_pushstring(L, "nopurge"); + lua_pushboolean(L, tbl->nopurge > 0); + lua_settable(L, -3); + + lua_pushstring(L, "expire"); + lua_pushinteger(L, tbl->expire); + lua_settable(L, -3); + + /* Save data types periods (if applicable) in 'data' table */ + lua_pushstring(L, "data"); + lua_newtable(L); + + for (dt = 0; dt < STKTABLE_DATA_TYPES; dt++) { + if (tbl->data_ofs[dt] == 0) + continue; + + lua_pushstring(L, stktable_data_types[dt].name); + + if (stktable_data_types[dt].arg_type == ARG_T_DELAY) + lua_pushinteger(L, tbl->data_arg[dt].u); + else + lua_pushinteger(L, -1); + + lua_settable(L, -3); + } + + lua_settable(L, -3); + + return 1; +} + +/* Helper to get extract stick table entry into Lua table */ +static void hlua_stktable_entry(lua_State *L, struct stktable *t, struct stksess *ts) +{ + int dt; + void *ptr; + + for (dt = 0; dt < STKTABLE_DATA_TYPES; dt++) { + + ptr = stktable_data_ptr(t, ts, dt); + if (!ptr) + continue; + + lua_pushstring(L, stktable_data_types[dt].name); + + switch (stktable_data_types[dt].std_type) { + case STD_T_SINT: + lua_pushinteger(L, stktable_data_cast(ptr, std_t_sint)); + break; + case STD_T_UINT: + hlua_fcn_pushunsigned(L, stktable_data_cast(ptr, std_t_uint)); + break; + case STD_T_ULL: + hlua_fcn_pushunsigned_ll(L, stktable_data_cast(ptr, std_t_ull)); + break; + case STD_T_FRQP: + lua_pushinteger(L, read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + t->data_arg[dt].u)); + break; + case STD_T_DICT: { + struct dict_entry *de; + de = stktable_data_cast(ptr, std_t_dict); + lua_pushstring(L, de ? (char *)de->value.key : "-"); + break; + } + } + + lua_settable(L, -3); + } +} + +/* Looks in table <t> for a sticky session matching key <key> + * Returns table with session data or nil + * + * The returned table always contains 'use' and 'expire' (integer) fields. 
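+ *
+ * A hypothetical Lua sketch of a lookup and a filtered dump ("st" is
+ * the "stktable" field of a proxy object that declares a stick-table;
+ * all names are illustrative):
+ *
+ *   local e = st:lookup("127.0.0.1")
+ *   if e then
+ *       core.Info("expires in " .. e["expire"] .. "ms")
+ *   end
+ *   local entries = st:dump({ { "conn_cur", "gt", 10 } })
+ *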
+ * For frequency/rate counters, each data entry is returned as table with + * 'value' and 'period' fields. + */ +int hlua_stktable_lookup(lua_State *L) +{ + struct stktable *t; + struct sample smp; + struct stktable_key *skey; + struct stksess *ts; + + t = hlua_check_stktable(L, 1); + smp.data.type = SMP_T_STR; + smp.flags = SMP_F_CONST; + smp.data.u.str.area = (char *)lua_tolstring(L, 2, &smp.data.u.str.data); + + skey = smp_to_stkey(&smp, t); + if (!skey) { + lua_pushnil(L); + return 1; + } + + ts = stktable_lookup_key(t, skey); + if (!ts) { + lua_pushnil(L); + return 1; + } + + lua_newtable(L); + lua_pushstring(L, "use"); + lua_pushinteger(L, HA_ATOMIC_LOAD(&ts->ref_cnt) - 1); + lua_settable(L, -3); + + lua_pushstring(L, "expire"); + lua_pushinteger(L, tick_remain(now_ms, ts->expire)); + lua_settable(L, -3); + + hlua_stktable_entry(L, t, ts); + HA_ATOMIC_DEC(&ts->ref_cnt); + + return 1; +} + +struct stk_filter { + long long val; + int type; + int op; +}; + + +/* Helper for returning errors to callers using Lua convention (nil, err) */ +static int hlua_error(lua_State *L, const char *fmt, ...) { + char buf[256]; + int len; + va_list args; + va_start(args, fmt); + len = vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + if (len < 0) { + ha_alert("hlua_error(): Could not write error message.\n"); + lua_pushnil(L); + return 1; + } else if (len >= sizeof(buf)) + ha_alert("hlua_error(): Error message was truncated.\n"); + + lua_pushnil(L); + lua_pushstring(L, buf); + + return 2; +} + +/* Dump the contents of stick table <t>*/ +int hlua_stktable_dump(lua_State *L) +{ + struct stktable *t; + struct ebmb_node *eb; + struct ebmb_node *n; + struct stksess *ts; + int type; + int op; + int dt; + long long val; + struct stk_filter filter[STKTABLE_FILTER_LEN]; + int filter_count = 0; + int i; + int skip_entry; + void *ptr; + + t = hlua_check_stktable(L, 1); + type = lua_type(L, 2); + + switch (type) { + case LUA_TNONE: + case LUA_TNIL: + break; + case LUA_TTABLE: + lua_pushnil(L); + while (lua_next(L, 2) != 0) { + int entry_idx = 0; + + if (filter_count >= STKTABLE_FILTER_LEN) + return hlua_error(L, "Filter table too large (len > %d)", STKTABLE_FILTER_LEN); + + if (lua_type(L, -1) != LUA_TTABLE || lua_rawlen(L, -1) != 3) + return hlua_error(L, "Filter table entry must be a triplet: {\"data_col\", \"op\", val} (entry #%d)", filter_count + 1); + + lua_pushnil(L); + while (lua_next(L, -2) != 0) { + switch (entry_idx) { + case 0: + if (lua_type(L, -1) != LUA_TSTRING) + return hlua_error(L, "Filter table data column must be string (entry #%d)", filter_count + 1); + + dt = stktable_get_data_type((char *)lua_tostring(L, -1)); + if (dt < 0 || t->data_ofs[dt] == 0) + return hlua_error(L, "Filter table data column not present in stick table (entry #%d)", filter_count + 1); + filter[filter_count].type = dt; + break; + case 1: + if (lua_type(L, -1) != LUA_TSTRING) + return hlua_error(L, "Filter table operator must be string (entry #%d)", filter_count + 1); + + op = get_std_op(lua_tostring(L, -1)); + if (op < 0) + return hlua_error(L, "Unknown operator in filter table (entry #%d)", filter_count + 1); + filter[filter_count].op = op; + break; + case 2: + val = lua_tointeger(L, -1); + filter[filter_count].val = val; + filter_count++; + break; + default: + break; + } + entry_idx++; + lua_pop(L, 1); + } + lua_pop(L, 1); + } + break; + default: + return hlua_error(L, "filter table expected"); + } + + lua_newtable(L); + + HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &t->lock); + eb = ebmb_first(&t->keys); + for (n = eb; n; 
n = ebmb_next(n)) { + ts = ebmb_entry(n, struct stksess, key); + if (!ts) { + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &t->lock); + return 1; + } + HA_ATOMIC_INC(&ts->ref_cnt); + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &t->lock); + + /* multi condition/value filter */ + skip_entry = 0; + for (i = 0; i < filter_count; i++) { + ptr = stktable_data_ptr(t, ts, filter[i].type); + if (!ptr) + continue; + + switch (stktable_data_types[filter[i].type].std_type) { + case STD_T_SINT: + val = stktable_data_cast(ptr, std_t_sint); + break; + case STD_T_UINT: + val = stktable_data_cast(ptr, std_t_uint); + break; + case STD_T_ULL: + val = stktable_data_cast(ptr, std_t_ull); + break; + case STD_T_FRQP: + val = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + t->data_arg[filter[i].type].u); + break; + default: + continue; + break; + } + + op = filter[i].op; + + if ((val < filter[i].val && (op == STD_OP_EQ || op == STD_OP_GT || op == STD_OP_GE)) || + (val == filter[i].val && (op == STD_OP_NE || op == STD_OP_GT || op == STD_OP_LT)) || + (val > filter[i].val && (op == STD_OP_EQ || op == STD_OP_LT || op == STD_OP_LE))) { + skip_entry = 1; + break; + } + } + + if (skip_entry) { + HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &t->lock); + HA_ATOMIC_DEC(&ts->ref_cnt); + continue; + } + + if (t->type == SMP_T_IPV4) { + char addr[INET_ADDRSTRLEN]; + inet_ntop(AF_INET, (const void *)&ts->key.key, addr, sizeof(addr)); + lua_pushstring(L, addr); + } else if (t->type == SMP_T_IPV6) { + char addr[INET6_ADDRSTRLEN]; + inet_ntop(AF_INET6, (const void *)&ts->key.key, addr, sizeof(addr)); + lua_pushstring(L, addr); + } else if (t->type == SMP_T_SINT) { + lua_pushinteger(L, *ts->key.key); + } else if (t->type == SMP_T_STR) { + lua_pushstring(L, (const char *)ts->key.key); + } else { + return hlua_error(L, "Unsupported stick table key type"); + } + + lua_newtable(L); + hlua_stktable_entry(L, t, ts); + lua_settable(L, -3); + HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &t->lock); + HA_ATOMIC_DEC(&ts->ref_cnt); + } + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &t->lock); + + return 1; +} + +int hlua_fcn_new_listener(lua_State *L, struct listener *lst) +{ + lua_newtable(L); + + /* Pop a class sesison metatable and affect it to the userdata. 
*/ + lua_rawgeti(L, LUA_REGISTRYINDEX, class_listener_ref); + lua_setmetatable(L, -2); + + lua_pushlightuserdata(L, lst); + lua_rawseti(L, -2, 0); + return 1; +} + +static struct listener *hlua_check_listener(lua_State *L, int ud) +{ + return hlua_checkudata(L, ud, class_listener_ref); +} + +int hlua_listener_get_stats(lua_State *L) +{ + struct listener *li; + int i; + + li = hlua_check_listener(L, 1); + + if (!li->bind_conf->frontend) { + lua_pushnil(L); + return 1; + } + + stats_fill_li_stats(li->bind_conf->frontend, li, STAT_SHLGNDS, stats, + STATS_LEN, NULL); + + lua_newtable(L); + for (i=0; i<ST_F_TOTAL_FIELDS; i++) { + lua_pushstring(L, stat_fields[i].name); + hlua_fcn_pushfield(L, &stats[i]); + lua_settable(L, -3); + } + return 1; + +} + +int hlua_server_gc(lua_State *L) +{ + struct server *srv = hlua_checkudata(L, 1, class_server_ref); + + srv_drop(srv); /* srv_drop allows NULL srv */ + return 0; +} + +static struct server *hlua_check_server(lua_State *L, int ud) +{ + struct server *srv = hlua_checkudata(L, ud, class_server_ref); + if (srv->flags & SRV_F_DELETED) { + return NULL; + } + return srv; +} + +int hlua_server_get_stats(lua_State *L) +{ + struct server *srv; + int i; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + lua_pushnil(L); + return 1; + } + + if (!srv->proxy) { + lua_pushnil(L); + return 1; + } + + stats_fill_sv_stats(srv->proxy, srv, STAT_SHLGNDS, stats, + STATS_LEN, NULL); + + lua_newtable(L); + for (i=0; i<ST_F_TOTAL_FIELDS; i++) { + lua_pushstring(L, stat_fields[i].name); + hlua_fcn_pushfield(L, &stats[i]); + lua_settable(L, -3); + } + return 1; + +} + +int hlua_server_get_proxy(lua_State *L) +{ + struct server *srv; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + lua_pushnil(L); + return 1; + } + + if (!srv->proxy) { + lua_pushnil(L); + return 1; + } + + hlua_fcn_new_proxy(L, srv->proxy); + return 1; +} + +int hlua_server_get_addr(lua_State *L) +{ + struct server *srv; + char addr[INET6_ADDRSTRLEN]; + luaL_Buffer b; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + lua_pushnil(L); + return 1; + } + + luaL_buffinit(L, &b); + + switch (srv->addr.ss_family) { + case AF_INET: + inet_ntop(AF_INET, &((struct sockaddr_in *)&srv->addr)->sin_addr, + addr, INET_ADDRSTRLEN); + luaL_addstring(&b, addr); + luaL_addstring(&b, ":"); + snprintf(addr, INET_ADDRSTRLEN, "%d", srv->svc_port); + luaL_addstring(&b, addr); + break; + case AF_INET6: + inet_ntop(AF_INET6, &((struct sockaddr_in6 *)&srv->addr)->sin6_addr, + addr, INET6_ADDRSTRLEN); + luaL_addstring(&b, addr); + luaL_addstring(&b, ":"); + snprintf(addr, INET_ADDRSTRLEN, "%d", srv->svc_port); + luaL_addstring(&b, addr); + break; + case AF_UNIX: + luaL_addstring(&b, (char *)((struct sockaddr_un *)&srv->addr)->sun_path); + break; + default: + luaL_addstring(&b, "<unknown>"); + break; + } + + luaL_pushresult(&b); + return 1; +} + +int hlua_server_get_puid(lua_State *L) +{ + struct server *srv; + char buffer[12]; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + lua_pushnil(L); + return 1; + } + + snprintf(buffer, sizeof(buffer), "%d", srv->puid); + lua_pushstring(L, buffer); + return 1; +} + +int hlua_server_get_rid(lua_State *L) +{ + struct server *srv; + char buffer[12]; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + lua_pushnil(L); + return 1; + } + + snprintf(buffer, sizeof(buffer), "%d", srv->rid); + lua_pushstring(L, buffer); + return 1; +} + +int hlua_server_get_name(lua_State *L) +{ + struct server *srv; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + 
lua_pushnil(L); + return 1; + } + + lua_pushstring(L, srv->id); + return 1; +} + +/* __index metamethod for server class + * support for additional keys that are missing from the main table + * stack:1 = table (server class), stack:2 = requested key + * Returns 1 if key is supported + * else returns 0 to make lua return NIL value to the caller + */ +static int hlua_server_index(struct lua_State *L) +{ + const char *key = lua_tostring(L, 2); + + if (!strcmp(key, "name")) { + if (ONLY_ONCE()) + ha_warning("hlua: use of server 'name' attribute is deprecated and will eventually be removed, please use get_name() function instead: %s\n", hlua_traceback(L, ", ")); + lua_pushvalue(L, 1); + hlua_server_get_name(L); + return 1; + } + if (!strcmp(key, "puid")) { + if (ONLY_ONCE()) + ha_warning("hlua: use of server 'puid' attribute is deprecated and will eventually be removed, please use get_puid() function instead: %s\n", hlua_traceback(L, ", ")); + lua_pushvalue(L, 1); + hlua_server_get_puid(L); + return 1; + } + /* unknown attribute */ + return 0; +} + +int hlua_server_is_draining(lua_State *L) +{ + struct server *srv; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + lua_pushnil(L); + return 1; + } + + lua_pushboolean(L, server_is_draining(srv)); + return 1; +} + +int hlua_server_is_backup(lua_State *L) +{ + struct server *srv; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + lua_pushnil(L); + return 1; + } + + lua_pushboolean(L, (srv->flags & SRV_F_BACKUP)); + return 1; +} + +int hlua_server_is_dynamic(lua_State *L) +{ + struct server *srv; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + lua_pushnil(L); + return 1; + } + + lua_pushboolean(L, (srv->flags & SRV_F_DYNAMIC)); + return 1; +} + +int hlua_server_get_cur_sess(lua_State *L) +{ + struct server *srv; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + lua_pushnil(L); + return 1; + } + + lua_pushinteger(L, srv->cur_sess); + return 1; +} + +int hlua_server_get_pend_conn(lua_State *L) +{ + struct server *srv; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + lua_pushnil(L); + return 1; + } + + lua_pushinteger(L, srv->queue.length); + return 1; +} + +int hlua_server_set_maxconn(lua_State *L) +{ + struct server *srv; + const char *maxconn; + const char *err; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + lua_pushnil(L); + return 1; + } + + maxconn = luaL_checkstring(L, 2); + + HA_SPIN_LOCK(SERVER_LOCK, &srv->lock); + err = server_parse_maxconn_change_request(srv, maxconn); + HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock); + if (!err) + lua_pushnil(L); + else + hlua_pushstrippedstring(L, err); + return 1; +} + +int hlua_server_get_maxconn(lua_State *L) +{ + struct server *srv; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + lua_pushnil(L); + return 1; + } + + lua_pushinteger(L, srv->maxconn); + return 1; +} + +int hlua_server_set_weight(lua_State *L) +{ + struct server *srv; + const char *weight; + const char *err; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + lua_pushnil(L); + return 1; + } + + weight = luaL_checkstring(L, 2); + + HA_SPIN_LOCK(SERVER_LOCK, &srv->lock); + err = server_parse_weight_change_request(srv, weight); + HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock); + if (!err) + lua_pushnil(L); + else + hlua_pushstrippedstring(L, err); + return 1; +} + +int hlua_server_get_weight(lua_State *L) +{ + struct server *srv; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + lua_pushnil(L); + return 1; + } + + lua_pushinteger(L, srv->uweight); + return 1; +} + +int 
hlua_server_set_addr(lua_State *L) +{ + struct server *srv; + const char *addr; + const char *port; + const char *err; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + lua_pushnil(L); + return 1; + } + + addr = luaL_checkstring(L, 2); + if (lua_gettop(L) >= 3) + port = luaL_checkstring(L, 3); + else + port = NULL; + + HA_SPIN_LOCK(SERVER_LOCK, &srv->lock); + err = srv_update_addr_port(srv, addr, port, "Lua script"); + HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock); + if (!err) + lua_pushnil(L); + else + hlua_pushstrippedstring(L, err); + return 1; +} + +int hlua_server_shut_sess(lua_State *L) +{ + struct server *srv; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + return 0; + } + HA_SPIN_LOCK(SERVER_LOCK, &srv->lock); + srv_shutdown_streams(srv, SF_ERR_KILLED); + HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock); + return 0; +} + +int hlua_server_set_drain(lua_State *L) +{ + struct server *srv; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + return 0; + } + HA_SPIN_LOCK(SERVER_LOCK, &srv->lock); + srv_adm_set_drain(srv); + HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock); + return 0; +} + +int hlua_server_set_maint(lua_State *L) +{ + struct server *srv; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + return 0; + } + HA_SPIN_LOCK(SERVER_LOCK, &srv->lock); + srv_adm_set_maint(srv); + HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock); + return 0; +} + +int hlua_server_set_ready(lua_State *L) +{ + struct server *srv; + + srv = hlua_check_server(L, 1); + if (srv == NULL) { + return 0; + } + HA_SPIN_LOCK(SERVER_LOCK, &srv->lock); + srv_adm_set_ready(srv); + HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock); + return 0; +} + +int hlua_server_check_enable(lua_State *L) +{ + struct server *sv; + + sv = hlua_check_server(L, 1); + if (sv == NULL) { + return 0; + } + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + if (sv->check.state & CHK_ST_CONFIGURED) { + sv->check.state |= CHK_ST_ENABLED; + } + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + return 0; +} + +int hlua_server_check_disable(lua_State *L) +{ + struct server *sv; + + sv = hlua_check_server(L, 1); + if (sv == NULL) { + return 0; + } + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + if (sv->check.state & CHK_ST_CONFIGURED) { + sv->check.state &= ~CHK_ST_ENABLED; + } + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + return 0; +} + +int hlua_server_check_force_up(lua_State *L) +{ + struct server *sv; + + sv = hlua_check_server(L, 1); + if (sv == NULL) { + return 0; + } + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + if (!(sv->track)) { + sv->check.health = sv->check.rise + sv->check.fall - 1; + srv_set_running(sv, SRV_OP_STCHGC_LUA); + } + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + return 0; +} + +int hlua_server_check_force_nolb(lua_State *L) +{ + struct server *sv; + + sv = hlua_check_server(L, 1); + if (sv == NULL) { + return 0; + } + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + if (!(sv->track)) { + sv->check.health = sv->check.rise + sv->check.fall - 1; + srv_set_stopping(sv, SRV_OP_STCHGC_LUA); + } + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + return 0; +} + +int hlua_server_check_force_down(lua_State *L) +{ + struct server *sv; + + sv = hlua_check_server(L, 1); + if (sv == NULL) { + return 0; + } + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + if (!(sv->track)) { + sv->check.health = 0; + srv_set_stopped(sv, SRV_OP_STCHGC_LUA); + } + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + return 0; +} + +int hlua_server_agent_enable(lua_State *L) +{ + struct server *sv; + + sv = hlua_check_server(L, 1); + if (sv == NULL) { + return 0; + } + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + if (sv->agent.state & 
CHK_ST_CONFIGURED) { + sv->agent.state |= CHK_ST_ENABLED; + } + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + return 0; +} + +int hlua_server_agent_disable(lua_State *L) +{ + struct server *sv; + + sv = hlua_check_server(L, 1); + if (sv == NULL) { + return 0; + } + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + if (sv->agent.state & CHK_ST_CONFIGURED) { + sv->agent.state &= ~CHK_ST_ENABLED; + } + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + return 0; +} + +int hlua_server_agent_force_up(lua_State *L) +{ + struct server *sv; + + sv = hlua_check_server(L, 1); + if (sv == NULL) { + return 0; + } + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + if (sv->agent.state & CHK_ST_ENABLED) { + sv->agent.health = sv->agent.rise + sv->agent.fall - 1; + srv_set_running(sv, SRV_OP_STCHGC_LUA); + } + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + return 0; +} + +int hlua_server_agent_force_down(lua_State *L) +{ + struct server *sv; + + sv = hlua_check_server(L, 1); + if (sv == NULL) { + return 0; + } + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + if (sv->agent.state & CHK_ST_ENABLED) { + sv->agent.health = 0; + srv_set_stopped(sv, SRV_OP_STCHGC_LUA); + } + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + return 0; +} + +/* returns the tracked server, if any */ +int hlua_server_tracking(lua_State *L) +{ + struct server *sv; + struct server *tracked; + + sv = hlua_check_server(L, 1); + if (sv == NULL) { + return 0; + } + + tracked = sv->track; + if (tracked == NULL) + lua_pushnil(L); + else + hlua_fcn_new_server(L, tracked); + + return 1; +} + +/* returns an array of servers tracking the current server */ +int hlua_server_get_trackers(lua_State *L) +{ + struct server *sv; + struct server *cur_tracker; + int index; + + sv = hlua_check_server(L, 1); + if (sv == NULL) { + return 0; + } + + lua_newtable(L); + cur_tracker = sv->trackers; + for (index = 1; cur_tracker; cur_tracker = cur_tracker->tracknext, index++) { + if (!lua_checkstack(L, 5)) + luaL_error(L, "Lua out of memory error."); + hlua_fcn_new_server(L, cur_tracker); + /* array index starts at 1 in Lua */ + lua_rawseti(L, -2, index); + } + return 1; +} + +/* hlua_event_sub wrapper for per-server subscription: + * + * hlua_event_sub() is called with sv->e_subs subscription list and + * lua arguments are passed as-is (skipping the first argument which + * is the server ctx) + */ +int hlua_server_event_sub(lua_State *L) +{ + struct server *sv; + + sv = hlua_check_server(L, 1); + if (sv == NULL) { + return 0; + } + /* remove first argument from the stack (server) */ + lua_remove(L, 1); + + /* try to subscribe within server's subscription list */ + return hlua_event_sub(L, &sv->e_subs); +} + +int hlua_fcn_new_server(lua_State *L, struct server *srv) +{ + lua_newtable(L); + + /* Pop a class server metatable and affect it to the userdata. 
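+	 *
+	 * The resulting object exposes the methods registered below. From
+	 * a Lua script this looks like (hypothetical sketch, names are
+	 * illustrative):
+	 *
+	 *   local srv = core.backends["mybackend"].servers["srv1"]
+	 *   if srv and srv:is_draining() then
+	 *       srv:set_ready()
+	 *   end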
*/ + lua_rawgeti(L, LUA_REGISTRYINDEX, class_server_ref); + lua_setmetatable(L, -2); + + lua_pushlightuserdata(L, srv); + lua_rawseti(L, -2, 0); + + /* userdata is affected: increment server refcount */ + srv_take(srv); + + /* set public methods */ + hlua_class_function(L, "get_name", hlua_server_get_name); + hlua_class_function(L, "get_puid", hlua_server_get_puid); + hlua_class_function(L, "get_rid", hlua_server_get_rid); + hlua_class_function(L, "is_draining", hlua_server_is_draining); + hlua_class_function(L, "is_backup", hlua_server_is_backup); + hlua_class_function(L, "is_dynamic", hlua_server_is_dynamic); + hlua_class_function(L, "get_cur_sess", hlua_server_get_cur_sess); + hlua_class_function(L, "get_pend_conn", hlua_server_get_pend_conn); + hlua_class_function(L, "set_maxconn", hlua_server_set_maxconn); + hlua_class_function(L, "get_maxconn", hlua_server_get_maxconn); + hlua_class_function(L, "set_weight", hlua_server_set_weight); + hlua_class_function(L, "get_weight", hlua_server_get_weight); + hlua_class_function(L, "set_addr", hlua_server_set_addr); + hlua_class_function(L, "get_addr", hlua_server_get_addr); + hlua_class_function(L, "get_stats", hlua_server_get_stats); + hlua_class_function(L, "get_proxy", hlua_server_get_proxy); + hlua_class_function(L, "shut_sess", hlua_server_shut_sess); + hlua_class_function(L, "set_drain", hlua_server_set_drain); + hlua_class_function(L, "set_maint", hlua_server_set_maint); + hlua_class_function(L, "set_ready", hlua_server_set_ready); + hlua_class_function(L, "check_enable", hlua_server_check_enable); + hlua_class_function(L, "check_disable", hlua_server_check_disable); + hlua_class_function(L, "check_force_up", hlua_server_check_force_up); + hlua_class_function(L, "check_force_nolb", hlua_server_check_force_nolb); + hlua_class_function(L, "check_force_down", hlua_server_check_force_down); + hlua_class_function(L, "agent_enable", hlua_server_agent_enable); + hlua_class_function(L, "agent_disable", hlua_server_agent_disable); + hlua_class_function(L, "agent_force_up", hlua_server_agent_force_up); + hlua_class_function(L, "agent_force_down", hlua_server_agent_force_down); + hlua_class_function(L, "tracking", hlua_server_tracking); + hlua_class_function(L, "get_trackers", hlua_server_get_trackers); + hlua_class_function(L, "event_sub", hlua_server_event_sub); + + return 1; +} + +static struct hlua_server_list *hlua_check_server_list(lua_State *L, int ud) +{ + return hlua_checkudata(L, ud, class_server_list_ref); +} + +/* does nothing and returns 0, only prevents insertions in the + * table which represents the list of servers + */ +int hlua_listable_servers_newindex(lua_State *L) { + return 0; +} + +/* first arg is the table (struct hlua_server_list * in metadata) + * second arg is the required index + */ +int hlua_listable_servers_index(lua_State *L) +{ + struct hlua_server_list *hlua_srv; + const char *name; + struct server *srv; + + hlua_srv = hlua_check_server_list(L, 1); + name = luaL_checkstring(L, 2); + + /* Perform a server lookup in px list */ + srv = server_find_by_name(hlua_srv->px, name); + if (srv == NULL) { + lua_pushnil(L); + return 1; + } + + hlua_fcn_new_server(L, srv); + return 1; +} + +/* iterator must return key as string and value as server + * object, if we reach end of list, it returns nil. + * The context knows the last returned server. if the + * context contains srv == NULL, we start enumeration. 
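+ *
+ * (Lua-side, this iterator backs the generic for loop, e.g. as a
+ * hypothetical sketch:
+ *
+ *   for name, srv in pairs(px.servers) do
+ *       core.Info(name)
+ *   end
+ * )
+ *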
+ * Then, use 'srv->next' ptr to iterate through the list + */ +int hlua_listable_servers_pairs_iterator(lua_State *L) +{ + int context_index; + struct hlua_server_list_iterator_context *ctx; + + context_index = lua_upvalueindex(1); + ctx = lua_touserdata(L, context_index); + + if (ctx->cur == NULL) { + /* First iteration, initialize list on the first server */ + ctx->cur = ctx->px->srv; + } else { + + /* Next server (next ptr is always valid, even if current + * server has the SRV_F_DELETED flag set) + */ + ctx->cur = ctx->cur->next; + } + + /* next server is null, end of iteration */ + if (ctx->cur == NULL) { + lua_pushnil(L); + return 1; + } + + lua_pushstring(L, ctx->cur->id); + hlua_fcn_new_server(L, ctx->cur); + return 2; +} + +/* init the iterator context, return iterator function + * with context as closure. The only argument is a + * server list object. + */ +int hlua_listable_servers_pairs(lua_State *L) +{ + struct hlua_server_list_iterator_context *ctx; + struct hlua_server_list *hlua_srv_list; + + hlua_srv_list = hlua_check_server_list(L, 1); + + ctx = lua_newuserdata(L, sizeof(*ctx)); + ctx->px = hlua_srv_list->px; + ctx->cur = NULL; + + lua_pushcclosure(L, hlua_listable_servers_pairs_iterator, 1); + return 1; +} + +void hlua_listable_servers(lua_State *L, struct proxy *px) +{ + struct hlua_server_list *list; + + lua_newtable(L); + list = lua_newuserdata(L, sizeof(*list)); + list->px = px; + lua_rawseti(L, -2, 0); + lua_rawgeti(L, LUA_REGISTRYINDEX, class_server_list_ref); + lua_setmetatable(L, -2); +} + +static struct proxy *hlua_check_proxy(lua_State *L, int ud) +{ + return hlua_checkudata(L, ud, class_proxy_ref); +} + +int hlua_proxy_get_name(lua_State *L) +{ + struct proxy *px; + + px = hlua_check_proxy(L, 1); + lua_pushstring(L, px->id); + return 1; +} + +int hlua_proxy_get_uuid(lua_State *L) +{ + struct proxy *px; + char buffer[17]; + + px = hlua_check_proxy(L, 1); + snprintf(buffer, sizeof(buffer), "%d", px->uuid); + lua_pushstring(L, buffer); + return 1; +} + +/* __index metamethod for proxy class + * support for additional keys that are missing from the main table + * stack:1 = table (proxy class), stack:2 = requested key + * Returns 1 if key is supported + * else returns 0 to make lua return NIL value to the caller + */ +static int hlua_proxy_index(struct lua_State *L) +{ + const char *key = lua_tostring(L, 2); + + if (!strcmp(key, "name")) { + if (ONLY_ONCE()) + ha_warning("hlua: use of proxy 'name' attribute is deprecated and will eventually be removed, please use get_name() function instead: %s\n", hlua_traceback(L, ", ")); + lua_pushvalue(L, 1); + hlua_proxy_get_name(L); + return 1; + } + if (!strcmp(key, "uuid")) { + if (ONLY_ONCE()) + ha_warning("hlua: use of proxy 'uuid' attribute is deprecated and will eventually be removed, please use get_uuid() function instead: %s\n", hlua_traceback(L, ", ")); + lua_pushvalue(L, 1); + hlua_proxy_get_uuid(L); + return 1; + } + /* unknown attribute */ + return 0; +} + +int hlua_proxy_pause(lua_State *L) +{ + struct proxy *px; + + px = hlua_check_proxy(L, 1); + /* safe to call without PROXY_LOCK - pause_proxy takes it */ + pause_proxy(px); + return 0; +} + +int hlua_proxy_resume(lua_State *L) +{ + struct proxy *px; + + px = hlua_check_proxy(L, 1); + /* safe to call without PROXY_LOCK - resume_proxy takes it */ + resume_proxy(px); + return 0; +} + +int hlua_proxy_stop(lua_State *L) +{ + struct proxy *px; + + px = hlua_check_proxy(L, 1); + /* safe to call without PROXY_LOCK - stop_proxy takes it */ + stop_proxy(px); + return 0; 
+} + +int hlua_proxy_get_cap(lua_State *L) +{ + struct proxy *px; + const char *str; + + px = hlua_check_proxy(L, 1); + str = proxy_cap_str(px->cap); + lua_pushstring(L, str); + return 1; +} + +int hlua_proxy_get_stats(lua_State *L) +{ + struct proxy *px; + int i; + + px = hlua_check_proxy(L, 1); + if (px->cap & PR_CAP_BE) + stats_fill_be_stats(px, STAT_SHLGNDS, stats, STATS_LEN, NULL); + else + stats_fill_fe_stats(px, stats, STATS_LEN, NULL); + lua_newtable(L); + for (i=0; i<ST_F_TOTAL_FIELDS; i++) { + lua_pushstring(L, stat_fields[i].name); + hlua_fcn_pushfield(L, &stats[i]); + lua_settable(L, -3); + } + return 1; +} + +int hlua_proxy_get_mode(lua_State *L) +{ + struct proxy *px; + const char *str; + + px = hlua_check_proxy(L, 1); + str = proxy_mode_str(px->mode); + lua_pushstring(L, str); + return 1; +} + +int hlua_proxy_shut_bcksess(lua_State *L) +{ + struct proxy *px; + + px = hlua_check_proxy(L, 1); + srv_shutdown_backup_streams(px, SF_ERR_KILLED); + return 0; +} + +int hlua_proxy_get_srv_act(lua_State *L) +{ + struct proxy *px; + + px = hlua_check_proxy(L, 1); + lua_pushinteger(L, px->srv_act); + return 1; +} + +int hlua_proxy_get_srv_bck(lua_State *L) +{ + struct proxy *px; + + px = hlua_check_proxy(L, 1); + lua_pushinteger(L, px->srv_bck); + return 1; +} + +/* Get mailers config info, used to implement email alert sending + * according to mailers config from lua. + */ +int hlua_proxy_get_mailers(lua_State *L) +{ + struct proxy *px; + int it; + struct mailer *mailer; + + px = hlua_check_proxy(L, 1); + + if (!px->email_alert.mailers.m) + return 0; /* email-alert mailers not found on proxy */ + + lua_newtable(L); + + /* option log-health-checks */ + lua_pushstring(L, "track_server_health"); + lua_pushboolean(L, (px->options2 & PR_O2_LOGHCHKS)); + lua_settable(L, -3); + + /* email-alert level */ + lua_pushstring(L, "log_level"); + lua_pushinteger(L, px->email_alert.level); + lua_settable(L, -3); + + /* email-alert mailers */ + lua_pushstring(L, "mailservers"); + lua_newtable(L); + for (it = 0, mailer = px->email_alert.mailers.m->mailer_list; + it < px->email_alert.mailers.m->count; it++, mailer = mailer->next) { + char *srv_address; + + lua_pushstring(L, mailer->id); + + /* For now, we depend on mailer->addr to restore mailer's address which + * was converted using str2sa_range() on startup. + * + * FIXME?: + * It could be a good idea to pass the raw address (unparsed) to allow fqdn + * to be resolved at runtime, unless we consider this as a pure legacy mode + * and mailers config support is going to be removed in the future? 
+ */ + srv_address = sa2str(&mailer->addr, get_host_port(&mailer->addr), 0); + if (srv_address) { + lua_pushstring(L, srv_address); + ha_free(&srv_address); + lua_settable(L, -3); + } + } + lua_settable(L, -3); + + /* mailers timeout (from mailers section) */ + lua_pushstring(L, "mailservers_timeout"); + lua_pushinteger(L, px->email_alert.mailers.m->timeout.mail); + lua_settable(L, -3); + + /* email-alert myhostname */ + lua_pushstring(L, "smtp_hostname"); + lua_pushstring(L, px->email_alert.myhostname); + lua_settable(L, -3); + + /* email-alert from */ + lua_pushstring(L, "smtp_from"); + lua_pushstring(L, px->email_alert.from); + lua_settable(L, -3); + + /* email-alert to */ + lua_pushstring(L, "smtp_to"); + lua_pushstring(L, px->email_alert.to); + lua_settable(L, -3); + + return 1; +} + +int hlua_fcn_new_proxy(lua_State *L, struct proxy *px) +{ + struct listener *lst; + int lid; + char buffer[17]; + + lua_newtable(L); + + /* Pop a class proxy metatable and affect it to the userdata. */ + lua_rawgeti(L, LUA_REGISTRYINDEX, class_proxy_ref); + lua_setmetatable(L, -2); + + lua_pushlightuserdata(L, px); + lua_rawseti(L, -2, 0); + + /* set public methods */ + hlua_class_function(L, "get_name", hlua_proxy_get_name); + hlua_class_function(L, "get_uuid", hlua_proxy_get_uuid); + hlua_class_function(L, "pause", hlua_proxy_pause); + hlua_class_function(L, "resume", hlua_proxy_resume); + hlua_class_function(L, "stop", hlua_proxy_stop); + hlua_class_function(L, "shut_bcksess", hlua_proxy_shut_bcksess); + hlua_class_function(L, "get_cap", hlua_proxy_get_cap); + hlua_class_function(L, "get_mode", hlua_proxy_get_mode); + hlua_class_function(L, "get_srv_act", hlua_proxy_get_srv_act); + hlua_class_function(L, "get_srv_bck", hlua_proxy_get_srv_bck); + hlua_class_function(L, "get_stats", hlua_proxy_get_stats); + hlua_class_function(L, "get_mailers", hlua_proxy_get_mailers); + + /* Browse and register servers. */ + lua_pushstring(L, "servers"); + hlua_listable_servers(L, px); + lua_settable(L, -3); + + /* Browse and register listeners. 
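+	 *
+	 * They end up under the "listeners" key of the proxy object; a
+	 * hypothetical Lua sketch:
+	 *
+	 *   for name, li in pairs(px.listeners) do
+	 *       local st = li:get_stats()
+	 *   end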
*/ + lua_pushstring(L, "listeners"); + lua_newtable(L); + lid = 1; + list_for_each_entry(lst, &px->conf.listeners, by_fe) { + if (lst->name) + lua_pushstring(L, lst->name); + else { + snprintf(buffer, sizeof(buffer), "sock-%d", lid); + lid++; + lua_pushstring(L, buffer); + } + hlua_fcn_new_listener(L, lst); + lua_settable(L, -3); + } + lua_settable(L, -3); + + if (px->table && px->table->id) { + lua_pushstring(L, "stktable"); + hlua_fcn_new_stktable(L, px->table); + lua_settable(L, -3); + } + + return 1; +} + +static struct hlua_proxy_list *hlua_check_proxy_list(lua_State *L, int ud) +{ + return hlua_checkudata(L, ud, class_proxy_list_ref); +} + +/* does nothing and returns 0, only prevents insertions in the + * table which represent list of proxies + */ +int hlua_listable_proxies_newindex(lua_State *L) { + return 0; +} + +/* first arg is the table (struct hlua_proxy_list * in metadata) + * second arg is the required index + */ +int hlua_listable_proxies_index(lua_State *L) +{ + struct hlua_proxy_list *hlua_px; + const char *name; + struct proxy *px; + + hlua_px = hlua_check_proxy_list(L, 1); + name = luaL_checkstring(L, 2); + + px = NULL; + if (hlua_px->capabilities & PR_CAP_FE) { + px = proxy_find_by_name(name, PR_CAP_FE, 0); + } + if (!px && hlua_px->capabilities & PR_CAP_BE) { + px = proxy_find_by_name(name, PR_CAP_BE, 0); + } + if (px == NULL) { + lua_pushnil(L); + return 1; + } + + hlua_fcn_new_proxy(L, px); + return 1; +} + +static inline int hlua_listable_proxies_match(struct proxy *px, char cap) { + return ((px->cap & cap) && !(px->cap & (PR_CAP_DEF | PR_CAP_INT))); +} + +/* iterator must return key as string and value as proxy + * object, if we reach end of list, it returns nil + */ +int hlua_listable_proxies_pairs_iterator(lua_State *L) +{ + int context_index; + struct hlua_proxy_list_iterator_context *ctx; + + context_index = lua_upvalueindex(1); + ctx = lua_touserdata(L, context_index); + + if (ctx->next == NULL) { + lua_pushnil(L); + return 1; + } + + lua_pushstring(L, ctx->next->id); + hlua_fcn_new_proxy(L, ctx->next); + + for (ctx->next = ctx->next->next; + ctx->next && !hlua_listable_proxies_match(ctx->next, ctx->capabilities); + ctx->next = ctx->next->next); + + return 2; +} + +/* init the iterator context, return iterator function + * with context as closure. The only argument is a + * proxy object. 
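+ *
+ * Lua-side, this backs the generic for loop over core.proxies,
+ * core.frontends or core.backends, e.g. (hypothetical sketch):
+ *
+ *   for name, px in pairs(core.proxies) do
+ *       core.Info(name .. " (" .. px:get_mode() .. ")")
+ *   end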
+ */ +int hlua_listable_proxies_pairs(lua_State *L) +{ + struct hlua_proxy_list_iterator_context *ctx; + struct hlua_proxy_list *hlua_px; + + hlua_px = hlua_check_proxy_list(L, 1); + + ctx = lua_newuserdata(L, sizeof(*ctx)); + + ctx->capabilities = hlua_px->capabilities; + for (ctx->next = proxies_list; + ctx->next && !hlua_listable_proxies_match(ctx->next, ctx->capabilities); + ctx->next = ctx->next->next); + lua_pushcclosure(L, hlua_listable_proxies_pairs_iterator, 1); + return 1; +} + +void hlua_listable_proxies(lua_State *L, char capabilities) +{ + struct hlua_proxy_list *list; + + lua_newtable(L); + list = lua_newuserdata(L, sizeof(*list)); + list->capabilities = capabilities; + lua_rawseti(L, -2, 0); + lua_rawgeti(L, LUA_REGISTRYINDEX, class_proxy_list_ref); + lua_setmetatable(L, -2); +} + +int hlua_event_sub_unsub(lua_State *L) +{ + struct event_hdl_sub *sub = hlua_checkudata(L, 1, class_event_sub_ref); + + BUG_ON(!sub); + event_hdl_take(sub); /* keep a reference on sub until the item is GCed */ + event_hdl_unsubscribe(sub); /* will automatically call event_hdl_drop() */ + return 0; +} + +int hlua_event_sub_gc(lua_State *L) +{ + struct event_hdl_sub *sub = hlua_checkudata(L, 1, class_event_sub_ref); + + BUG_ON(!sub); + event_hdl_drop(sub); /* final drop of the reference */ + return 0; +} + +int hlua_fcn_new_event_sub(lua_State *L, struct event_hdl_sub *sub) +{ + lua_newtable(L); + + /* Pop a class event_sub metatable and affect it to the userdata. */ + lua_rawgeti(L, LUA_REGISTRYINDEX, class_event_sub_ref); + lua_setmetatable(L, -2); + + lua_pushlightuserdata(L, sub); + lua_rawseti(L, -2, 0); + + /* userdata is affected: increment sub refcount */ + event_hdl_take(sub); + + /* set public methods */ + hlua_class_function(L, "unsub", hlua_event_sub_unsub); + + return 1; +} + +/* This Lua function take a string, a list of separators. + * It tokenize the input string using the list of separators + * as separator. + * + * The functionreturns a table filled with tokens. 
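+ *
+ * For example (hypothetical sketch; the third argument asks for empty
+ * tokens to be ignored):
+ *
+ *   local parts = core.tokenize("a,b;;c", ",;", true)
+ *   -- parts == { "a", "b", "c" }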
+ */ +int hlua_tokenize(lua_State *L) +{ + const char *str; + const char *sep; + int index; + const char *token; + const char *p; + const char *c; + int ignore_empty; + + ignore_empty = 0; + + str = luaL_checkstring(L, 1); + sep = luaL_checkstring(L, 2); + if (lua_gettop(L) == 3) + ignore_empty = hlua_checkboolean(L, 3); + + lua_newtable(L); + index = 1; + token = str; + p = str; + while(1) { + for (c = sep; *c != '\0'; c++) + if (*p == *c) + break; + if (*p == *c) { + if ((!ignore_empty) || (p - token > 0)) { + lua_pushlstring(L, token, p - token); + lua_rawseti(L, -2, index); + index++; + } + token = p + 1; + } + if (*p == '\0') + break; + p++; + } + + return 1; +} + +int hlua_parse_addr(lua_State *L) +{ + struct net_addr *addr; + const char *str = luaL_checkstring(L, 1); + unsigned char mask; + + addr = lua_newuserdata(L, sizeof(struct net_addr)); + if (!addr) { + lua_pushnil(L); + return 1; + } + + if (str2net(str, PAT_MF_NO_DNS, &addr->addr.v4.ip, &addr->addr.v4.mask)) { + addr->family = AF_INET; + return 1; + } + + if (str62net(str, &addr->addr.v6.ip, &mask)) { + len2mask6(mask, &addr->addr.v6.mask); + addr->family = AF_INET6; + return 1; + } + + lua_pop(L, 1); + lua_pushnil(L); + return 1; +} + +int hlua_match_addr(lua_State *L) +{ + struct net_addr *addr1; + struct net_addr *addr2; + + if (!lua_isuserdata(L, 1) || + !lua_isuserdata(L, 2)) { + lua_pushboolean(L, 0); + return 1; + } + + addr1 = lua_touserdata(L, 1); + addr2 = lua_touserdata(L, 2); + + if (addr1->family != addr2->family) { + lua_pushboolean(L, 0); + return 1; + } + + if (addr1->family == AF_INET) { + if ((addr1->addr.v4.ip.s_addr & addr2->addr.v4.mask.s_addr) == + (addr2->addr.v4.ip.s_addr & addr1->addr.v4.mask.s_addr)) { + lua_pushboolean(L, 1); + return 1; + } + } else { + int i; + + for (i = 0; i < 16; i += 4) { + if ((read_u32(&addr1->addr.v6.ip.s6_addr[i]) & + read_u32(&addr2->addr.v6.mask.s6_addr[i])) != + (read_u32(&addr2->addr.v6.ip.s6_addr[i]) & + read_u32(&addr1->addr.v6.mask.s6_addr[i]))) + break; + } + if (i == 16) { + lua_pushboolean(L, 1); + return 1; + } + } + + lua_pushboolean(L, 0); + return 1; +} + +static struct my_regex **hlua_check_regex(lua_State *L, int ud) +{ + return (hlua_checkudata(L, ud, class_regex_ref)); +} + +static int hlua_regex_comp(struct lua_State *L) +{ + struct my_regex **regex; + const char *str; + int cs; + char *err; + + str = luaL_checkstring(L, 1); + luaL_argcheck(L, lua_isboolean(L, 2), 2, NULL); + cs = lua_toboolean(L, 2); + + regex = lua_newuserdata(L, sizeof(*regex)); + + err = NULL; + if (!(*regex = regex_comp(str, cs, 1, &err))) { + lua_pushboolean(L, 0); /* status error */ + lua_pushstring(L, err); /* Reason */ + free(err); + return 2; + } + + lua_pushboolean(L, 1); /* Status ok */ + + /* Create object */ + lua_newtable(L); + lua_pushvalue(L, -3); /* Get the userdata pointer. */ + lua_rawseti(L, -2, 0); + lua_rawgeti(L, LUA_REGISTRYINDEX, class_regex_ref); + lua_setmetatable(L, -2); + return 2; +} + +static int hlua_regex_exec(struct lua_State *L) +{ + struct my_regex **regex; + const char *str; + size_t len; + struct buffer *tmp; + + regex = hlua_check_regex(L, 1); + str = luaL_checklstring(L, 2, &len); + + if (!*regex) { + lua_pushboolean(L, 0); + return 1; + } + + /* Copy the string because regex_exec2 require a 'char *' + * and not a 'const char *'. 
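+	 *
+	 * (Lua-side, the Regex class registered below is used e.g. as a
+	 * hypothetical sketch; the second argument of new() selects
+	 * case-sensitive matching:
+	 *
+	 *   local ok, r = Regex.new("^foo[0-9]+", true)
+	 *   if ok and r:exec("foo123") then
+	 *       core.Info("matched")
+	 *   end
+	 * )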
+ */ + tmp = get_trash_chunk(); + if (len >= tmp->size) { + lua_pushboolean(L, 0); + return 1; + } + memcpy(tmp->area, str, len); + + lua_pushboolean(L, regex_exec2(*regex, tmp->area, len)); + + return 1; +} + +static int hlua_regex_match(struct lua_State *L) +{ + struct my_regex **regex; + const char *str; + size_t len; + regmatch_t pmatch[20]; + int ret; + int i; + struct buffer *tmp; + + regex = hlua_check_regex(L, 1); + str = luaL_checklstring(L, 2, &len); + + if (!*regex) { + lua_pushboolean(L, 0); + return 1; + } + + /* Copy the string because regex_exec2 require a 'char *' + * and not a 'const char *'. + */ + tmp = get_trash_chunk(); + if (len >= tmp->size) { + lua_pushboolean(L, 0); + return 1; + } + memcpy(tmp->area, str, len); + + ret = regex_exec_match2(*regex, tmp->area, len, 20, pmatch, 0); + lua_pushboolean(L, ret); + lua_newtable(L); + if (ret) { + for (i = 0; i < 20 && pmatch[i].rm_so != -1; i++) { + lua_pushlstring(L, str + pmatch[i].rm_so, pmatch[i].rm_eo - pmatch[i].rm_so); + lua_rawseti(L, -2, i + 1); + } + } + return 2; +} + +static int hlua_regex_free(struct lua_State *L) +{ + struct my_regex **regex; + + regex = hlua_check_regex(L, 1); + regex_free(*regex); + *regex = NULL; + return 0; +} + +void hlua_fcn_reg_core_fcn(lua_State *L) +{ + hlua_concat_init(L); + hlua_queue_init(L); + + hlua_class_function(L, "now", hlua_now); + hlua_class_function(L, "http_date", hlua_http_date); + hlua_class_function(L, "imf_date", hlua_imf_date); + hlua_class_function(L, "rfc850_date", hlua_rfc850_date); + hlua_class_function(L, "asctime_date", hlua_asctime_date); + hlua_class_function(L, "concat", hlua_concat_new); + hlua_class_function(L, "queue", hlua_queue_new); + hlua_class_function(L, "get_info", hlua_get_info); + hlua_class_function(L, "parse_addr", hlua_parse_addr); + hlua_class_function(L, "match_addr", hlua_match_addr); + hlua_class_function(L, "tokenize", hlua_tokenize); + + /* Create regex object. */ + lua_newtable(L); + hlua_class_function(L, "new", hlua_regex_comp); + + lua_newtable(L); /* The metatable. */ + lua_pushstring(L, "__index"); + lua_newtable(L); + hlua_class_function(L, "exec", hlua_regex_exec); + hlua_class_function(L, "match", hlua_regex_match); + lua_rawset(L, -3); /* -> META["__index"] = TABLE */ + hlua_class_function(L, "__gc", hlua_regex_free); + + lua_pushvalue(L, -1); /* Duplicate the metatable reference. */ + class_regex_ref = hlua_register_metatable(L, CLASS_REGEX); + + lua_setmetatable(L, -2); + lua_setglobal(L, CLASS_REGEX); /* Create global object called Regex */ + + /* Create stktable object. */ + lua_newtable(L); + lua_pushstring(L, "__index"); + lua_newtable(L); + hlua_class_function(L, "info", hlua_stktable_info); + hlua_class_function(L, "lookup", hlua_stktable_lookup); + hlua_class_function(L, "dump", hlua_stktable_dump); + lua_settable(L, -3); /* -> META["__index"] = TABLE */ + class_stktable_ref = hlua_register_metatable(L, CLASS_STKTABLE); + + /* Create listener object. */ + lua_newtable(L); + lua_pushstring(L, "__index"); + lua_newtable(L); + hlua_class_function(L, "get_stats", hlua_listener_get_stats); + lua_settable(L, -3); /* -> META["__index"] = TABLE */ + class_listener_ref = hlua_register_metatable(L, CLASS_LISTENER); + + /* Create event_sub object. */ + lua_newtable(L); + hlua_class_function(L, "__gc", hlua_event_sub_gc); + class_event_sub_ref = hlua_register_metatable(L, CLASS_EVENT_SUB); + + /* Create server object. 
*/ + lua_newtable(L); + hlua_class_function(L, "__gc", hlua_server_gc); + hlua_class_function(L, "__index", hlua_server_index); + class_server_ref = hlua_register_metatable(L, CLASS_SERVER); + + /* Create proxy object. */ + lua_newtable(L); + hlua_class_function(L, "__index", hlua_proxy_index); + class_proxy_ref = hlua_register_metatable(L, CLASS_PROXY); + + /* list of proxy objects. Instead of having a static array + * of proxies, we use special metamethods that rely on internal + * proxies list so that the array is resolved at runtime. + * + * To emulate the same behavior than Lua array, we implement some + * metatable functions: + * - __newindex : prevent the insertion of a new item in the array + * - __index : find a proxy in the list using "name" index + * - __pairs : iterate through available proxies in the list + */ + lua_newtable(L); + hlua_class_function(L, "__index", hlua_listable_proxies_index); + hlua_class_function(L, "__newindex", hlua_listable_proxies_newindex); + hlua_class_function(L, "__pairs", hlua_listable_proxies_pairs); + class_proxy_list_ref = hlua_register_metatable(L, CLASS_PROXY_LIST); + + /* Create proxies entry. */ + lua_pushstring(L, "proxies"); + hlua_listable_proxies(L, PR_CAP_LISTEN); + lua_settable(L, -3); + + /* Create frontends entry. */ + lua_pushstring(L, "frontends"); + hlua_listable_proxies(L, PR_CAP_FE); + lua_settable(L, -3); + + /* Create backends entry. */ + lua_pushstring(L, "backends"); + hlua_listable_proxies(L, PR_CAP_BE); + lua_settable(L, -3); + + /* list of server. This object is similar to + * CLASS_PROXY_LIST + */ + lua_newtable(L); + hlua_class_function(L, "__index", hlua_listable_servers_index); + hlua_class_function(L, "__newindex", hlua_listable_servers_newindex); + hlua_class_function(L, "__pairs", hlua_listable_servers_pairs); + class_server_list_ref = hlua_register_metatable(L, CLASS_SERVER_LIST); +} diff --git a/src/hpack-dec.c b/src/hpack-dec.c new file mode 100644 index 0000000..052a7c3 --- /dev/null +++ b/src/hpack-dec.c @@ -0,0 +1,475 @@ +/* + * HPACK decompressor (RFC7541) + * + * Copyright (C) 2014-2017 Willy Tarreau <willy@haproxy.org> + * Copyright (C) 2017 HAProxy Technologies + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <inttypes.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <import/ist.h> +#include <haproxy/chunk.h> +#include <haproxy/global.h> +#include <haproxy/h2.h> +#include <haproxy/hpack-dec.h> +#include <haproxy/hpack-huff.h> +#include <haproxy/hpack-tbl.h> +#include <haproxy/tools.h> + + +#if defined(DEBUG_HPACK) +#define hpack_debug_printf printf +#define hpack_debug_hexdump debug_hexdump +#else +#define hpack_debug_printf(...) do { } while (0) +#define hpack_debug_hexdump(...) do { } while (0) +#endif + +/* reads a varint from <raw>'s lowest <b> bits and <len> bytes max (raw included). + * returns the 32-bit value on success after updating raw_in and len_in. Forces + * len_in to (uint32_t)-1 on truncated input. + */ +static uint32_t get_var_int(const uint8_t **raw_in, uint32_t *len_in, int b) +{ + uint32_t ret = 0; + int len = *len_in; + const uint8_t *raw = *raw_in; + uint8_t shift = 0; + + len--; + ret = *(raw++) & ((1 << b) - 1); + if (ret != (uint32_t)((1 << b) - 1)) + goto end; + + while (len && (*raw & 128)) { + ret += ((uint32_t)(*raw++) & 127) << shift; + shift += 7; + len--; + } + + /* last 7 bits */ + if (!len) + goto too_short; + len--; + ret += ((uint32_t)(*raw++) & 127) << shift; + + end: + *raw_in = raw; + *len_in = len; + return ret; + + too_short: + *len_in = (uint32_t)-1; + return 0; +} + +/* returns the pseudo-header <idx> corresponds to among the following values : + * - 0 = unknown, the header's string needs to be used instead + * - 1 = ":authority" + * - 2 = ":method" + * - 3 = ":path" + * - 4 = ":scheme" + * - 5 = ":status" + */ +static inline int hpack_idx_to_phdr(uint32_t idx) +{ + if (idx > 14) + return 0; + + /* static indexes 1..14 come in pairs sharing the same pseudo-header, + * so each pair's answer is packed into one nibble of this constant. + */ + idx >>= 1; + idx <<= 2; + return (0x55554321U >> idx) & 0xF; +} + +/* If <idx> designates a static header, returns <in>. Otherwise allocates some + * room from chunk <store> to duplicate <in> into it and returns the string + * allocated there. In case of allocation failure, returns a string whose + * pointer is NULL. + */ +static inline struct ist hpack_alloc_string(struct buffer *store, uint32_t idx, + struct ist in) +{ + struct ist out; + + if (idx < HPACK_SHT_SIZE) + return in; + + out.len = in.len; + out.ptr = chunk_newstr(store); + if (unlikely(!isttest(out))) + return out; + + if (unlikely(store->data + out.len > store->size)) { + out.ptr = NULL; + return out; + } + + store->data += out.len; + memcpy(out.ptr, in.ptr, out.len); + return out; +} + +/* decode an HPACK frame starting at <raw> for <len> bytes, using the dynamic + * headers table <dht>, produces the output into list <list> of <list_size> + * entries max, and uses pre-allocated buffer <tmp> for temporary storage (some + * list elements will point to it). Some <list> name entries may be made of a + * NULL pointer and a len, in which case they will designate a pseudo header + * index according to the values returned by hpack_idx_to_phdr() above. The + * number of <list> entries used is returned on success, or <0 on failure, the + * negated value of one of the HPACK_ERR_* codes. A last element is always zeroed + * and is not counted in the number of returned entries. This way the caller + * can use list[].n.len == 0 as a marker for the end of list. 
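 *
 * The integer coding parsed by get_var_int() above is the RFC 7541 #5.1
 * scheme: a <b>-bit prefix that saturates, followed by 7-bit continuation
 * bytes. A hedged sketch of the emitting direction (the helper name
 * put_var_int is ours; the writer actually used by HAProxy's encoder is
 * hpack_encode_len()):
 *
 *   static int put_var_int(uint8_t *out, uint32_t v, int b)
 *   {
 *           uint32_t max = (1U << b) - 1;
 *           int len = 0;
 *
 *           if (v < max) {
 *                   out[len++] = v;                  // fits in the prefix
 *                   return len;
 *           }
 *           out[len++] = max;                        // saturated prefix
 *           for (v -= max; v >= 128; v >>= 7)
 *                   out[len++] = 0x80 | (v & 0x7f);  // 7 bits + continuation
 *           out[len++] = v;                          // last byte, MSB clear
 *           return len;
 *   }
 *
 * E.g. v=1337 with b=5 yields 0x1f 0x9a 0x0a (the example of RFC 7541
 * #C.1.2), which get_var_int() reads back as 31 + 26 + (10 << 7) = 1337.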
+ */ +int hpack_decode_frame(struct hpack_dht *dht, const uint8_t *raw, uint32_t len, + struct http_hdr *list, int list_size, + struct buffer *tmp) +{ + uint32_t idx; + uint32_t nlen; + uint32_t vlen; + uint8_t huff; + struct ist name; + struct ist value; + int must_index; + int ret; + + hpack_debug_hexdump(stderr, "[HPACK-DEC] ", (const char *)raw, 0, len); + + chunk_reset(tmp); + ret = 0; + while (len) { + int __maybe_unused code = *raw; /* first byte, only for debugging */ + + must_index = 0; + if (*raw >= 0x80) { + /* indexed header field */ + if (*raw == 0x80) { + hpack_debug_printf("unhandled code 0x%02x (raw=%p, len=%u)\n", *raw, raw, len); + ret = -HPACK_ERR_UNKNOWN_OPCODE; + goto leave; + } + + hpack_debug_printf("%02x: p14: indexed header field : ", code); + + idx = get_var_int(&raw, &len, 7); + if (len == (uint32_t)-1) { // truncated + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_TRUNCATED; + goto leave; + } + + hpack_debug_printf(" idx=%u ", idx); + + if (!hpack_valid_idx(dht, idx)) { + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_TOO_LARGE; + goto leave; + } + + value = hpack_alloc_string(tmp, idx, hpack_idx_to_value(dht, idx)); + if (!isttest(value)) { + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_TOO_LARGE; + goto leave; + } + + /* here we don't index so we can always keep the pseudo header number */ + name = ist2(NULL, hpack_idx_to_phdr(idx)); + + if (!name.len) { + name = hpack_alloc_string(tmp, idx, hpack_idx_to_name(dht, idx)); + if (!isttest(name)) { + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_TOO_LARGE; + goto leave; + } + } + /* <name> and <value> are now set and point to stable values */ + } + else if (*raw >= 0x20 && *raw <= 0x3f) { + /* max dyn table size change */ + hpack_debug_printf("%02x: p18: dynamic table size update : ", code); + + if (ret) { + /* 7541#4.2.1 : DHT size update must only be at the beginning */ + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_TOO_LARGE; + goto leave; + } + + idx = get_var_int(&raw, &len, 5); + if (len == (uint32_t)-1) { // truncated + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_TRUNCATED; + goto leave; + } + hpack_debug_printf(" new len=%u\n", idx); + + if (idx > dht->size) { + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_INVALID_ARGUMENT; + goto leave; + } + continue; + } + else if (!(*raw & (*raw - 0x10))) { + /* 0x00, 0x10, and 0x40 (0x20 and 0x80 were already handled above) */ + + /* literal header field without/never/with incremental indexing -- literal name */ + if (*raw == 0x00) + hpack_debug_printf("%02x: p17: literal without indexing : ", code); + else if (*raw == 0x10) + hpack_debug_printf("%02x: p18: literal never indexed : ", code); + else if (*raw == 0x40) + hpack_debug_printf("%02x: p16: literal with indexing : ", code); + + if (*raw == 0x40) + must_index = 1; + + raw++; len--; + + /* retrieve name */ + if (!len) { // truncated + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_TRUNCATED; + goto leave; + } + + huff = *raw & 0x80; + nlen = get_var_int(&raw, &len, 7); + if (len == (uint32_t)-1 || len < nlen) { // truncated + hpack_debug_printf("##ERR@%d## (truncated): nlen=%d len=%d\n", + __LINE__, (int)nlen, (int)len); + ret = -HPACK_ERR_TRUNCATED; + goto leave; + } + + name = ist2(raw, nlen); + + raw += nlen; + len -= nlen; + + if (huff) { + char *ntrash = chunk_newstr(tmp); + if (!ntrash) { + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = 
-HPACK_ERR_TOO_LARGE; + goto leave; + } + + nlen = huff_dec((const uint8_t *)name.ptr, name.len, ntrash, + tmp->size - tmp->data); + if (nlen == (uint32_t)-1) { + hpack_debug_printf("2: can't decode huffman.\n"); + ret = -HPACK_ERR_HUFFMAN; + goto leave; + } + hpack_debug_printf(" [name huff %d->%d] ", (int)name.len, (int)nlen); + + tmp->data += nlen; // make room for the value + name = ist2(ntrash, nlen); + } + + /* retrieve value */ + if (!len) { // truncated + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_TRUNCATED; + goto leave; + } + + huff = *raw & 0x80; + vlen = get_var_int(&raw, &len, 7); + if (len == (uint32_t)-1 || len < vlen) { // truncated + hpack_debug_printf("##ERR@%d## : vlen=%d len=%d\n", + __LINE__, (int)vlen, (int)len); + ret = -HPACK_ERR_TRUNCATED; + goto leave; + } + + value = ist2(raw, vlen); + raw += vlen; + len -= vlen; + + if (huff) { + char *vtrash = chunk_newstr(tmp); + if (!vtrash) { + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_TOO_LARGE; + goto leave; + } + + vlen = huff_dec((const uint8_t *)value.ptr, value.len, vtrash, + tmp->size - tmp->data); + if (vlen == (uint32_t)-1) { + hpack_debug_printf("3: can't decode huffman.\n"); + ret = -HPACK_ERR_HUFFMAN; + goto leave; + } + hpack_debug_printf(" [value huff %d->%d] ", (int)value.len, (int)vlen); + + tmp->data += vlen; // make room for the value + value = ist2(vtrash, vlen); + } + + /* <name> and <value> are correctly filled here */ + } + else { + /* 0x01..0x0f : literal header field without indexing -- indexed name */ + /* 0x11..0x1f : literal header field never indexed -- indexed name */ + /* 0x41..0x7f : literal header field with incremental indexing -- indexed name */ + + if (*raw <= 0x0f) + hpack_debug_printf("%02x: p16: literal without indexing -- indexed name : ", code); + else if (*raw >= 0x41) + hpack_debug_printf("%02x: p15: literal with indexing -- indexed name : ", code); + else + hpack_debug_printf("%02x: p16: literal never indexed -- indexed name : ", code); + + /* retrieve name index */ + if (*raw >= 0x41) { + must_index = 1; + idx = get_var_int(&raw, &len, 6); + } + else + idx = get_var_int(&raw, &len, 4); + + hpack_debug_printf(" idx=%u ", idx); + + if (len == (uint32_t)-1 || !len) { // truncated + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_TRUNCATED; + goto leave; + } + + if (!hpack_valid_idx(dht, idx)) { + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_TOO_LARGE; + goto leave; + } + + /* retrieve value */ + huff = *raw & 0x80; + vlen = get_var_int(&raw, &len, 7); + if (len == (uint32_t)-1 || len < vlen) { // truncated + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_TRUNCATED; + goto leave; + } + + value = ist2(raw, vlen); + raw += vlen; + len -= vlen; + + if (huff) { + char *vtrash = chunk_newstr(tmp); + if (!vtrash) { + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_TOO_LARGE; + goto leave; + } + + vlen = huff_dec((const uint8_t *)value.ptr, value.len, vtrash, + tmp->size - tmp->data); + if (vlen == (uint32_t)-1) { + hpack_debug_printf("##ERR@%d## can't decode huffman : ilen=%d osize=%d\n", + __LINE__, (int)value.len, + (int)(tmp->size - tmp->data)); + hpack_debug_hexdump(stderr, "[HUFFMAN] ", value.ptr, 0, value.len); + ret = -HPACK_ERR_HUFFMAN; + goto leave; + } + tmp->data += vlen; // make room for the value + value = ist2(vtrash, vlen); + } + + name = IST_NULL; + if (!must_index) + name.len = hpack_idx_to_phdr(idx); + + if (!name.len) { + name = hpack_alloc_string(tmp, idx, 
hpack_idx_to_name(dht, idx)); + if (!isttest(name)) { + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_TOO_LARGE; + goto leave; + } + } + /* <name> and <value> are correctly filled here */ + } + + /* We must not accept empty header names (forbidden by the spec and used + * as a list termination). + */ + if (!name.len) { + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_INVALID_ARGUMENT; + goto leave; + } + + /* here's what we have here : + * - name.len > 0 + * - value is filled with either const data or data allocated from tmp + * - name.ptr == NULL && !must_index : known pseudo-header #name.len + * - name.ptr != NULL || must_index : general header, unknown pseudo-header or index needed + */ + if (ret >= list_size) { + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_TOO_LARGE; + goto leave; + } + + list[ret].n = name; + list[ret].v = value; + ret++; + + if (must_index && hpack_dht_insert(dht, name, value) < 0) { + hpack_debug_printf("failed to find some room in the dynamic table\n"); + ret = -HPACK_ERR_DHT_INSERT_FAIL; + goto leave; + } + + hpack_debug_printf("\e[1;34m%s\e[0m: ", + isttest(name) ? istpad(trash.area, name).ptr : h2_phdr_to_str(name.len)); + + hpack_debug_printf("\e[1;35m%s\e[0m [mustidx=%d, used=%d] [n=(%p,%d) v=(%p,%d)]\n", + istpad(trash.area, value).ptr, must_index, + dht->used, + name.ptr, (int)name.len, value.ptr, (int)value.len); + } + + if (ret >= list_size) { + ret = -HPACK_ERR_TOO_LARGE; + goto leave; + } + + /* put an end marker */ + list[ret].n = list[ret].v = IST_NULL; + ret++; + + leave: + hpack_debug_printf("-- done: ret=%d list_size=%d --\n", (int)ret, (int)list_size); + return ret; +} diff --git a/src/hpack-enc.c b/src/hpack-enc.c new file mode 100644 index 0000000..3ab21bc --- /dev/null +++ b/src/hpack-enc.c @@ -0,0 +1,210 @@ +/* + * HPACK encoder (RFC7541) + * + * Copyright (C) 2014-2017 Willy Tarreau <willy@haproxy.org> + * Copyright (C) 2017 HAProxy Technologies + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <inttypes.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <import/ist.h> +#include <haproxy/hpack-enc.h> +#include <haproxy/http-hdr-t.h> + +/* + * HPACK encoding: these tables were generated using gen-enc.c + */ + +/* encoding of the stream of compressed headers. This stream is composed of a + * series of <len:8b> <index:8b> <name:<len>*8b>. 
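 *
 * For example, the first record below is 0x03 0x15 0x61 0x67 0x65: a 3-byte
 * name "age" whose static table index is 0x15 (21). A 0x00 length byte
 * terminates the stream, so a hedged walker sketch boils down to:
 *
 *   const unsigned char *p = (const unsigned char *)hpack_enc_stream;
 *   while (*p) {
 *           unsigned int len = *p++;   // name length
 *           unsigned int idx = *p++;   // static table index
 *           // the name itself is the next <len> bytes
 *           p += len;
 *   }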
+ */ +const char hpack_enc_stream[666] = { + /* 0: */ 0x03, 0x15, 0x61, 0x67, 0x65, 0x03, 0x3c, 0x76, + /* 8: */ 0x69, 0x61, 0x04, 0x21, 0x64, 0x61, 0x74, 0x65, + /* 16: */ 0x04, 0x26, 0x68, 0x6f, 0x73, 0x74, 0x04, 0x22, + /* 24: */ 0x65, 0x74, 0x61, 0x67, 0x04, 0x25, 0x66, 0x72, + /* 32: */ 0x6f, 0x6d, 0x04, 0x2d, 0x6c, 0x69, 0x6e, 0x6b, + /* 40: */ 0x04, 0x3b, 0x76, 0x61, 0x72, 0x79, 0x05, 0x04, + /* 48: */ 0x3a, 0x70, 0x61, 0x74, 0x68, 0x05, 0x16, 0x61, + /* 56: */ 0x6c, 0x6c, 0x6f, 0x77, 0x05, 0x32, 0x72, 0x61, + /* 64: */ 0x6e, 0x67, 0x65, 0x06, 0x13, 0x61, 0x63, 0x63, + /* 72: */ 0x65, 0x70, 0x74, 0x06, 0x36, 0x73, 0x65, 0x72, + /* 80: */ 0x76, 0x65, 0x72, 0x06, 0x20, 0x63, 0x6f, 0x6f, + /* 88: */ 0x6b, 0x69, 0x65, 0x06, 0x23, 0x65, 0x78, 0x70, + /* 96: */ 0x65, 0x63, 0x74, 0x07, 0x33, 0x72, 0x65, 0x66, + /* 104: */ 0x65, 0x72, 0x65, 0x72, 0x07, 0x24, 0x65, 0x78, + /* 112: */ 0x70, 0x69, 0x72, 0x65, 0x73, 0x07, 0x02, 0x3a, + /* 120: */ 0x6d, 0x65, 0x74, 0x68, 0x6f, 0x64, 0x07, 0x06, + /* 128: */ 0x3a, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x65, 0x07, + /* 136: */ 0x08, 0x3a, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, + /* 144: */ 0x07, 0x34, 0x72, 0x65, 0x66, 0x72, 0x65, 0x73, + /* 152: */ 0x68, 0x08, 0x2e, 0x6c, 0x6f, 0x63, 0x61, 0x74, + /* 160: */ 0x69, 0x6f, 0x6e, 0x08, 0x27, 0x69, 0x66, 0x2d, + /* 168: */ 0x6d, 0x61, 0x74, 0x63, 0x68, 0x08, 0x2a, 0x69, + /* 176: */ 0x66, 0x2d, 0x72, 0x61, 0x6e, 0x67, 0x65, 0x0a, + /* 184: */ 0x3a, 0x75, 0x73, 0x65, 0x72, 0x2d, 0x61, 0x67, + /* 192: */ 0x65, 0x6e, 0x74, 0x0a, 0x37, 0x73, 0x65, 0x74, + /* 200: */ 0x2d, 0x63, 0x6f, 0x6f, 0x6b, 0x69, 0x65, 0x0a, + /* 208: */ 0x01, 0x3a, 0x61, 0x75, 0x74, 0x68, 0x6f, 0x72, + /* 216: */ 0x69, 0x74, 0x79, 0x0b, 0x35, 0x72, 0x65, 0x74, + /* 224: */ 0x72, 0x79, 0x2d, 0x61, 0x66, 0x74, 0x65, 0x72, + /* 232: */ 0x0c, 0x1f, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, + /* 240: */ 0x74, 0x2d, 0x74, 0x79, 0x70, 0x65, 0x0c, 0x2f, + /* 248: */ 0x6d, 0x61, 0x78, 0x2d, 0x66, 0x6f, 0x72, 0x77, + /* 256: */ 0x61, 0x72, 0x64, 0x73, 0x0d, 0x18, 0x63, 0x61, + /* 264: */ 0x63, 0x68, 0x65, 0x2d, 0x63, 0x6f, 0x6e, 0x74, + /* 272: */ 0x72, 0x6f, 0x6c, 0x0d, 0x2c, 0x6c, 0x61, 0x73, + /* 280: */ 0x74, 0x2d, 0x6d, 0x6f, 0x64, 0x69, 0x66, 0x69, + /* 288: */ 0x65, 0x64, 0x0d, 0x12, 0x61, 0x63, 0x63, 0x65, + /* 296: */ 0x70, 0x74, 0x2d, 0x72, 0x61, 0x6e, 0x67, 0x65, + /* 304: */ 0x73, 0x0d, 0x29, 0x69, 0x66, 0x2d, 0x6e, 0x6f, + /* 312: */ 0x6e, 0x65, 0x2d, 0x6d, 0x61, 0x74, 0x63, 0x68, + /* 320: */ 0x0d, 0x17, 0x61, 0x75, 0x74, 0x68, 0x6f, 0x72, + /* 328: */ 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x0d, + /* 336: */ 0x1e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, + /* 344: */ 0x2d, 0x72, 0x61, 0x6e, 0x67, 0x65, 0x0e, 0x1c, + /* 352: */ 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, + /* 360: */ 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x0e, 0x0f, + /* 368: */ 0x61, 0x63, 0x63, 0x65, 0x70, 0x74, 0x2d, 0x63, + /* 376: */ 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x0f, 0x10, + /* 384: */ 0x61, 0x63, 0x63, 0x65, 0x70, 0x74, 0x2d, 0x65, + /* 392: */ 0x6e, 0x63, 0x6f, 0x64, 0x69, 0x6e, 0x67, 0x0f, + /* 400: */ 0x11, 0x61, 0x63, 0x63, 0x65, 0x70, 0x74, 0x2d, + /* 408: */ 0x6c, 0x61, 0x6e, 0x67, 0x75, 0x61, 0x67, 0x65, + /* 416: */ 0x10, 0x1a, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, + /* 424: */ 0x74, 0x2d, 0x65, 0x6e, 0x63, 0x6f, 0x64, 0x69, + /* 432: */ 0x6e, 0x67, 0x10, 0x1b, 0x63, 0x6f, 0x6e, 0x74, + /* 440: */ 0x65, 0x6e, 0x74, 0x2d, 0x6c, 0x61, 0x6e, 0x67, + /* 448: */ 0x75, 0x61, 0x67, 0x65, 0x10, 0x1d, 0x63, 0x6f, + /* 456: */ 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 
0x6c, 0x6f, + /* 464: */ 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x10, 0x3d, + /* 472: */ 0x77, 0x77, 0x77, 0x2d, 0x61, 0x75, 0x74, 0x68, + /* 480: */ 0x65, 0x6e, 0x74, 0x69, 0x63, 0x61, 0x74, 0x65, + /* 488: */ 0x11, 0x39, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x66, + /* 496: */ 0x65, 0x72, 0x2d, 0x65, 0x6e, 0x63, 0x6f, 0x64, + /* 504: */ 0x69, 0x6e, 0x67, 0x11, 0x28, 0x69, 0x66, 0x2d, + /* 512: */ 0x6d, 0x6f, 0x64, 0x69, 0x66, 0x69, 0x65, 0x64, + /* 520: */ 0x2d, 0x73, 0x69, 0x6e, 0x63, 0x65, 0x12, 0x30, + /* 528: */ 0x70, 0x72, 0x6f, 0x78, 0x79, 0x2d, 0x61, 0x75, + /* 536: */ 0x74, 0x68, 0x65, 0x6e, 0x74, 0x69, 0x63, 0x61, + /* 544: */ 0x74, 0x65, 0x13, 0x19, 0x63, 0x6f, 0x6e, 0x74, + /* 552: */ 0x65, 0x6e, 0x74, 0x2d, 0x64, 0x69, 0x73, 0x70, + /* 560: */ 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x13, + /* 568: */ 0x2b, 0x69, 0x66, 0x2d, 0x75, 0x6e, 0x6d, 0x6f, + /* 576: */ 0x64, 0x69, 0x66, 0x69, 0x65, 0x64, 0x2d, 0x73, + /* 584: */ 0x69, 0x6e, 0x63, 0x65, 0x13, 0x31, 0x70, 0x72, + /* 592: */ 0x6f, 0x78, 0x79, 0x2d, 0x61, 0x75, 0x74, 0x68, + /* 600: */ 0x6f, 0x72, 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, + /* 608: */ 0x6e, 0x19, 0x38, 0x73, 0x74, 0x72, 0x69, 0x63, + /* 616: */ 0x74, 0x2d, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x70, + /* 624: */ 0x6f, 0x72, 0x74, 0x2d, 0x73, 0x65, 0x63, 0x75, + /* 632: */ 0x72, 0x69, 0x74, 0x79, 0x1b, 0x14, 0x61, 0x63, + /* 640: */ 0x63, 0x65, 0x73, 0x73, 0x2d, 0x63, 0x6f, 0x6e, + /* 648: */ 0x74, 0x72, 0x6f, 0x6c, 0x2d, 0x61, 0x6c, 0x6c, + /* 656: */ 0x6f, 0x77, 0x2d, 0x6f, 0x72, 0x69, 0x67, 0x69, + /* 664: */ 0x6e, 0x00, +}; + +/* This points to the first position in table hpack_enc_stream[] of a header + * of the same length. + */ +const signed short hpack_pos_len[32] = { + /* 0: */ -1, -1, -1, 0, 10, 46, 67, 99, + /* 8: */ 153, -1, 183, 219, 232, 260, 350, 382, + /* 16: */ 416, 488, 526, 546, -1, -1, -1, -1, + /* 24: */ -1, 609, -1, 636, -1, -1, -1, -1, +}; + +/* Tries to encode header whose name is <n> and value <v> into the chunk <out>. + * Returns non-zero on success, 0 on failure (buffer full). 
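 *
 * For example, hpack_encode_header(out, ist("content-type"), ist("text/html"))
 * matches the record 0x0c 0x1f "content-type" in hpack_enc_stream above
 * (hpack_pos_len[12] == 232 points straight at it), so it emits the
 * indexed-name byte 0x5f (0x1f | 0x40), then the value length 0x09 and the
 * nine literal bytes of "text/html".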
+ */ +int hpack_encode_header(struct buffer *out, const struct ist n, + const struct ist v) +{ + int len = out->data; + int size = out->size; + int pos; + + if (len >= size) + return 0; + + /* look for the header field <n> in the static table */ + if (n.len >= sizeof(hpack_pos_len) / sizeof(hpack_pos_len[0])) + goto make_literal; + + pos = hpack_pos_len[n.len]; + if (pos >= 0) { + /* At least one header field of this length exists */ + do { + char idx; + + pos++; + idx = hpack_enc_stream[pos++]; + pos += n.len; + if (isteq(ist2(&hpack_enc_stream[pos - n.len], n.len), n)) { + /* emit literal with indexing (7541#6.2.1) : + * [ 0 | 1 | Index (6+) ] + */ + out->area[len++] = idx | 0x40; + goto emit_value; + } + } while ((unsigned char)hpack_enc_stream[pos] == n.len); + } + + make_literal: + if (likely(n.len < 127 && len + 2 + n.len <= size)) { + out->area[len++] = 0x00; /* literal without indexing -- new name */ + out->area[len++] = n.len; /* single-byte length encoding */ + ist2bin(out->area + len, n); + len += n.len; + } + else if (hpack_len_to_bytes(n.len) && + len + 1 + hpack_len_to_bytes(n.len) + n.len <= size) { + out->area[len++] = 0x00; /* literal without indexing -- new name */ + len = hpack_encode_len(out->area, len, n.len); + ist2bin(out->area + len, n); + len += n.len; + } + else { + /* header field name too large for the buffer */ + return 0; + } + + emit_value: + /* copy literal header field value */ + if (!hpack_len_to_bytes(v.len) || + len + hpack_len_to_bytes(v.len) + v.len > size) { + /* header value too large for the buffer */ + return 0; + } + + len = hpack_encode_len(out->area, len, v.len); + memcpy(out->area + len, v.ptr, v.len); + len += v.len; + + out->data = len; + return 1; +} diff --git a/src/hpack-huff.c b/src/hpack-huff.c new file mode 100644 index 0000000..77743be --- /dev/null +++ b/src/hpack-huff.c @@ -0,0 +1,861 @@ +/* + * Huffman decoding and encoding for HPACK (RFC7541) + * + * Copyright (C) 2014-2017 Willy Tarreau <willy@haproxy.org> + * Copyright (C) 2017 HAProxy Technologies + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <stdio.h> +#include <inttypes.h> +#include <string.h> + +#include <haproxy/api.h> +#include <haproxy/hpack-huff.h> +#include <haproxy/net_helper.h> + +struct huff { + uint32_t c; /* code point */ + int b; /* bits */ +}; + +/* huffman table as per RFC7541 appendix B */ +static const struct huff ht[257] = { + [ 0] = { .c = 0x00001ff8, .b = 13 }, + [ 1] = { .c = 0x007fffd8, .b = 23 }, + [ 2] = { .c = 0x0fffffe2, .b = 28 }, + [ 3] = { .c = 0x0fffffe3, .b = 28 }, + [ 4] = { .c = 0x0fffffe4, .b = 28 }, + [ 5] = { .c = 0x0fffffe5, .b = 28 }, + [ 6] = { .c = 0x0fffffe6, .b = 28 }, + [ 7] = { .c = 0x0fffffe7, .b = 28 }, + [ 8] = { .c = 0x0fffffe8, .b = 28 }, + [ 9] = { .c = 0x00ffffea, .b = 24 }, + [ 10] = { .c = 0x3ffffffc, .b = 30 }, + [ 11] = { .c = 0x0fffffe9, .b = 28 }, + [ 12] = { .c = 0x0fffffea, .b = 28 }, + [ 13] = { .c = 0x3ffffffd, .b = 30 }, + [ 14] = { .c = 0x0fffffeb, .b = 28 }, + [ 15] = { .c = 0x0fffffec, .b = 28 }, + [ 16] = { .c = 0x0fffffed, .b = 28 }, + [ 17] = { .c = 0x0fffffee, .b = 28 }, + [ 18] = { .c = 0x0fffffef, .b = 28 }, + [ 19] = { .c = 0x0ffffff0, .b = 28 }, + [ 20] = { .c = 0x0ffffff1, .b = 28 }, + [ 21] = { .c = 0x0ffffff2, .b = 28 }, + [ 22] = { .c = 0x3ffffffe, .b = 30 }, + [ 23] = { .c = 0x0ffffff3, .b = 28 }, + [ 24] = { .c = 0x0ffffff4, .b = 28 }, + [ 25] = { .c = 0x0ffffff5, .b = 28 }, + [ 26] = { .c = 0x0ffffff6, .b = 28 }, + [ 27] = { .c = 0x0ffffff7, .b = 28 }, + [ 28] = { .c = 0x0ffffff8, .b = 28 }, + [ 29] = { .c = 0x0ffffff9, .b = 28 }, + [ 30] = { .c = 0x0ffffffa, .b = 28 }, + [ 31] = { .c = 0x0ffffffb, .b = 28 }, + [ 32] = { .c = 0x00000014, .b = 6 }, + [ 33] = { .c = 0x000003f8, .b = 10 }, + [ 34] = { .c = 0x000003f9, .b = 10 }, + [ 35] = { .c = 0x00000ffa, .b = 12 }, + [ 36] = { .c = 0x00001ff9, .b = 13 }, + [ 37] = { .c = 0x00000015, .b = 6 }, + [ 38] = { .c = 0x000000f8, .b = 8 }, + [ 39] = { .c = 0x000007fa, .b = 11 }, + [ 40] = { .c = 0x000003fa, .b = 10 }, + [ 41] = { .c = 0x000003fb, .b = 10 }, + [ 42] = { .c = 0x000000f9, .b = 8 }, + [ 43] = { .c = 0x000007fb, .b = 11 }, + [ 44] = { .c = 0x000000fa, .b = 8 }, + [ 45] = { .c = 0x00000016, .b = 6 }, + [ 46] = { .c = 0x00000017, .b = 6 }, + [ 47] = { .c = 0x00000018, .b = 6 }, + [ 48] = { .c = 0x00000000, .b = 5 }, + [ 49] = { .c = 0x00000001, .b = 5 }, + [ 50] = { .c = 0x00000002, .b = 5 }, + [ 51] = { .c = 0x00000019, .b = 6 }, + [ 52] = { .c = 0x0000001a, .b = 6 }, + [ 53] = { .c = 0x0000001b, .b = 6 }, + [ 54] = { .c = 0x0000001c, .b = 6 }, + [ 55] = { .c = 0x0000001d, .b = 6 }, + [ 56] = { .c = 0x0000001e, .b = 6 }, + [ 57] = { .c = 0x0000001f, .b = 6 }, + [ 58] = { .c = 0x0000005c, .b = 7 }, + [ 59] = { .c = 0x000000fb, .b = 8 }, + [ 60] = { .c = 0x00007ffc, .b = 15 }, + [ 61] = { .c = 0x00000020, .b = 6 }, + [ 62] = { .c = 0x00000ffb, .b = 12 }, + [ 63] = { .c = 0x000003fc, .b = 10 }, + [ 64] = { .c = 0x00001ffa, .b = 13 }, + [ 65] = { .c = 0x00000021, .b = 6 }, + [ 66] = { .c = 0x0000005d, .b = 7 }, + [ 67] = { .c = 0x0000005e, .b = 7 }, + [ 68] = { .c = 0x0000005f, .b = 7 }, + [ 69] = { .c = 0x00000060, .b = 7 }, + [ 70] = { .c = 0x00000061, .b = 7 }, + [ 71] = { .c = 0x00000062, .b = 7 }, + [ 72] = { .c = 0x00000063, .b = 7 }, + [ 73] = { .c = 0x00000064, .b = 7 }, + [ 74] = { .c = 0x00000065, .b = 7 }, + [ 75] = { .c = 0x00000066, .b = 7 }, + [ 76] = { .c = 0x00000067, .b = 7 }, + [ 77] = { .c = 0x00000068, .b = 7 }, + [ 78] = { .c = 0x00000069, .b = 7 }, + [ 79] = { .c = 0x0000006a, .b = 7 }, + [ 80] = { .c = 0x0000006b, .b = 7 }, + [ 81] = { .c = 0x0000006c, .b = 7 
}, + [ 82] = { .c = 0x0000006d, .b = 7 }, + [ 83] = { .c = 0x0000006e, .b = 7 }, + [ 84] = { .c = 0x0000006f, .b = 7 }, + [ 85] = { .c = 0x00000070, .b = 7 }, + [ 86] = { .c = 0x00000071, .b = 7 }, + [ 87] = { .c = 0x00000072, .b = 7 }, + [ 88] = { .c = 0x000000fc, .b = 8 }, + [ 89] = { .c = 0x00000073, .b = 7 }, + [ 90] = { .c = 0x000000fd, .b = 8 }, + [ 91] = { .c = 0x00001ffb, .b = 13 }, + [ 92] = { .c = 0x0007fff0, .b = 19 }, + [ 93] = { .c = 0x00001ffc, .b = 13 }, + [ 94] = { .c = 0x00003ffc, .b = 14 }, + [ 95] = { .c = 0x00000022, .b = 6 }, + [ 96] = { .c = 0x00007ffd, .b = 15 }, + [ 97] = { .c = 0x00000003, .b = 5 }, + [ 98] = { .c = 0x00000023, .b = 6 }, + [ 99] = { .c = 0x00000004, .b = 5 }, + [100] = { .c = 0x00000024, .b = 6 }, + [101] = { .c = 0x00000005, .b = 5 }, + [102] = { .c = 0x00000025, .b = 6 }, + [103] = { .c = 0x00000026, .b = 6 }, + [104] = { .c = 0x00000027, .b = 6 }, + [105] = { .c = 0x00000006, .b = 5 }, + [106] = { .c = 0x00000074, .b = 7 }, + [107] = { .c = 0x00000075, .b = 7 }, + [108] = { .c = 0x00000028, .b = 6 }, + [109] = { .c = 0x00000029, .b = 6 }, + [110] = { .c = 0x0000002a, .b = 6 }, + [111] = { .c = 0x00000007, .b = 5 }, + [112] = { .c = 0x0000002b, .b = 6 }, + [113] = { .c = 0x00000076, .b = 7 }, + [114] = { .c = 0x0000002c, .b = 6 }, + [115] = { .c = 0x00000008, .b = 5 }, + [116] = { .c = 0x00000009, .b = 5 }, + [117] = { .c = 0x0000002d, .b = 6 }, + [118] = { .c = 0x00000077, .b = 7 }, + [119] = { .c = 0x00000078, .b = 7 }, + [120] = { .c = 0x00000079, .b = 7 }, + [121] = { .c = 0x0000007a, .b = 7 }, + [122] = { .c = 0x0000007b, .b = 7 }, + [123] = { .c = 0x00007ffe, .b = 15 }, + [124] = { .c = 0x000007fc, .b = 11 }, + [125] = { .c = 0x00003ffd, .b = 14 }, + [126] = { .c = 0x00001ffd, .b = 13 }, + [127] = { .c = 0x0ffffffc, .b = 28 }, + [128] = { .c = 0x000fffe6, .b = 20 }, + [129] = { .c = 0x003fffd2, .b = 22 }, + [130] = { .c = 0x000fffe7, .b = 20 }, + [131] = { .c = 0x000fffe8, .b = 20 }, + [132] = { .c = 0x003fffd3, .b = 22 }, + [133] = { .c = 0x003fffd4, .b = 22 }, + [134] = { .c = 0x003fffd5, .b = 22 }, + [135] = { .c = 0x007fffd9, .b = 23 }, + [136] = { .c = 0x003fffd6, .b = 22 }, + [137] = { .c = 0x007fffda, .b = 23 }, + [138] = { .c = 0x007fffdb, .b = 23 }, + [139] = { .c = 0x007fffdc, .b = 23 }, + [140] = { .c = 0x007fffdd, .b = 23 }, + [141] = { .c = 0x007fffde, .b = 23 }, + [142] = { .c = 0x00ffffeb, .b = 24 }, + [143] = { .c = 0x007fffdf, .b = 23 }, + [144] = { .c = 0x00ffffec, .b = 24 }, + [145] = { .c = 0x00ffffed, .b = 24 }, + [146] = { .c = 0x003fffd7, .b = 22 }, + [147] = { .c = 0x007fffe0, .b = 23 }, + [148] = { .c = 0x00ffffee, .b = 24 }, + [149] = { .c = 0x007fffe1, .b = 23 }, + [150] = { .c = 0x007fffe2, .b = 23 }, + [151] = { .c = 0x007fffe3, .b = 23 }, + [152] = { .c = 0x007fffe4, .b = 23 }, + [153] = { .c = 0x001fffdc, .b = 21 }, + [154] = { .c = 0x003fffd8, .b = 22 }, + [155] = { .c = 0x007fffe5, .b = 23 }, + [156] = { .c = 0x003fffd9, .b = 22 }, + [157] = { .c = 0x007fffe6, .b = 23 }, + [158] = { .c = 0x007fffe7, .b = 23 }, + [159] = { .c = 0x00ffffef, .b = 24 }, + [160] = { .c = 0x003fffda, .b = 22 }, + [161] = { .c = 0x001fffdd, .b = 21 }, + [162] = { .c = 0x000fffe9, .b = 20 }, + [163] = { .c = 0x003fffdb, .b = 22 }, + [164] = { .c = 0x003fffdc, .b = 22 }, + [165] = { .c = 0x007fffe8, .b = 23 }, + [166] = { .c = 0x007fffe9, .b = 23 }, + [167] = { .c = 0x001fffde, .b = 21 }, + [168] = { .c = 0x007fffea, .b = 23 }, + [169] = { .c = 0x003fffdd, .b = 22 }, + [170] = { .c = 0x003fffde, .b = 22 }, + [171] = { .c = 
0x00fffff0, .b = 24 }, + [172] = { .c = 0x001fffdf, .b = 21 }, + [173] = { .c = 0x003fffdf, .b = 22 }, + [174] = { .c = 0x007fffeb, .b = 23 }, + [175] = { .c = 0x007fffec, .b = 23 }, + [176] = { .c = 0x001fffe0, .b = 21 }, + [177] = { .c = 0x001fffe1, .b = 21 }, + [178] = { .c = 0x003fffe0, .b = 22 }, + [179] = { .c = 0x001fffe2, .b = 21 }, + [180] = { .c = 0x007fffed, .b = 23 }, + [181] = { .c = 0x003fffe1, .b = 22 }, + [182] = { .c = 0x007fffee, .b = 23 }, + [183] = { .c = 0x007fffef, .b = 23 }, + [184] = { .c = 0x000fffea, .b = 20 }, + [185] = { .c = 0x003fffe2, .b = 22 }, + [186] = { .c = 0x003fffe3, .b = 22 }, + [187] = { .c = 0x003fffe4, .b = 22 }, + [188] = { .c = 0x007ffff0, .b = 23 }, + [189] = { .c = 0x003fffe5, .b = 22 }, + [190] = { .c = 0x003fffe6, .b = 22 }, + [191] = { .c = 0x007ffff1, .b = 23 }, + [192] = { .c = 0x03ffffe0, .b = 26 }, + [193] = { .c = 0x03ffffe1, .b = 26 }, + [194] = { .c = 0x000fffeb, .b = 20 }, + [195] = { .c = 0x0007fff1, .b = 19 }, + [196] = { .c = 0x003fffe7, .b = 22 }, + [197] = { .c = 0x007ffff2, .b = 23 }, + [198] = { .c = 0x003fffe8, .b = 22 }, + [199] = { .c = 0x01ffffec, .b = 25 }, + [200] = { .c = 0x03ffffe2, .b = 26 }, + [201] = { .c = 0x03ffffe3, .b = 26 }, + [202] = { .c = 0x03ffffe4, .b = 26 }, + [203] = { .c = 0x07ffffde, .b = 27 }, + [204] = { .c = 0x07ffffdf, .b = 27 }, + [205] = { .c = 0x03ffffe5, .b = 26 }, + [206] = { .c = 0x00fffff1, .b = 24 }, + [207] = { .c = 0x01ffffed, .b = 25 }, + [208] = { .c = 0x0007fff2, .b = 19 }, + [209] = { .c = 0x001fffe3, .b = 21 }, + [210] = { .c = 0x03ffffe6, .b = 26 }, + [211] = { .c = 0x07ffffe0, .b = 27 }, + [212] = { .c = 0x07ffffe1, .b = 27 }, + [213] = { .c = 0x03ffffe7, .b = 26 }, + [214] = { .c = 0x07ffffe2, .b = 27 }, + [215] = { .c = 0x00fffff2, .b = 24 }, + [216] = { .c = 0x001fffe4, .b = 21 }, + [217] = { .c = 0x001fffe5, .b = 21 }, + [218] = { .c = 0x03ffffe8, .b = 26 }, + [219] = { .c = 0x03ffffe9, .b = 26 }, + [220] = { .c = 0x0ffffffd, .b = 28 }, + [221] = { .c = 0x07ffffe3, .b = 27 }, + [222] = { .c = 0x07ffffe4, .b = 27 }, + [223] = { .c = 0x07ffffe5, .b = 27 }, + [224] = { .c = 0x000fffec, .b = 20 }, + [225] = { .c = 0x00fffff3, .b = 24 }, + [226] = { .c = 0x000fffed, .b = 20 }, + [227] = { .c = 0x001fffe6, .b = 21 }, + [228] = { .c = 0x003fffe9, .b = 22 }, + [229] = { .c = 0x001fffe7, .b = 21 }, + [230] = { .c = 0x001fffe8, .b = 21 }, + [231] = { .c = 0x007ffff3, .b = 23 }, + [232] = { .c = 0x003fffea, .b = 22 }, + [233] = { .c = 0x003fffeb, .b = 22 }, + [234] = { .c = 0x01ffffee, .b = 25 }, + [235] = { .c = 0x01ffffef, .b = 25 }, + [236] = { .c = 0x00fffff4, .b = 24 }, + [237] = { .c = 0x00fffff5, .b = 24 }, + [238] = { .c = 0x03ffffea, .b = 26 }, + [239] = { .c = 0x007ffff4, .b = 23 }, + [240] = { .c = 0x03ffffeb, .b = 26 }, + [241] = { .c = 0x07ffffe6, .b = 27 }, + [242] = { .c = 0x03ffffec, .b = 26 }, + [243] = { .c = 0x03ffffed, .b = 26 }, + [244] = { .c = 0x07ffffe7, .b = 27 }, + [245] = { .c = 0x07ffffe8, .b = 27 }, + [246] = { .c = 0x07ffffe9, .b = 27 }, + [247] = { .c = 0x07ffffea, .b = 27 }, + [248] = { .c = 0x07ffffeb, .b = 27 }, + [249] = { .c = 0x0ffffffe, .b = 28 }, + [250] = { .c = 0x07ffffec, .b = 27 }, + [251] = { .c = 0x07ffffed, .b = 27 }, + [252] = { .c = 0x07ffffee, .b = 27 }, + [253] = { .c = 0x07ffffef, .b = 27 }, + [254] = { .c = 0x07fffff0, .b = 27 }, + [255] = { .c = 0x03ffffee, .b = 26 }, + [256] = { .c = 0x3fffffff, .b = 30 }, /* EOS */ +}; + + +/* Reversed huffman codes, generated by dev/hpack/gen-rht.c from the table + * above, then simplified by hand by 
extracting the few different length + * values and writing code to produce them instead. + * + * The codes are aligned on the MSB since that's how they appear in the stream. + * + * Quick summary below of the way the tables work. They're based on how the + * prefixes are organized, starting from the MSB. + * + * These codes fit in a single octet (5 to 8 bits) : + * 00/5 08/5 10/5 18/5 20/5 28/5 30/5 38/5 + * 40/5 48/5 + * + * 50/6 54/6 58/6 5c/6 60/6 64/6 68/6 6c/6 + * 70/6 74/6 78/6 7c/6 80/6 84/6 88/6 8c/6 + * 90/6 94/6 98/6 9c/6 a0/6 a4/6 a8/6 ac/6 + * b0/6 b4/6 + * + * b8/7 ba/7 bc/7 be/7 c0/7 c2/7 c4/7 c6/7 + * c8/7 ca/7 cc/7 ce/7 d0/7 d2/7 d4/7 d6/7 + * d8/7 da/7 dc/7 de/7 e0/7 e2/7 e4/7 e6/7 + * e8/7 ea/7 ec/7 ee/7 f0/7 f2/7 f4/7 f6/7 + * + * f8/8 f9/8 fa/8 fb/8 fc/8 fd/8 + * + * ==> a single 256-symbol table based on the full byte provides a direct + * access and the bit count + * + * These codes fit in two octets (10 to 15 bits, neither 9 nor 16 bits code) : + * + * fe + 2 bits: + * 00/2 40/2 80/2 c0/2 + * + * ff + 2..7 bits : + * 00/2 + * 40/3 60/3 80/3 + * a0/4 b0/4 + * c0/5 c8/5 d0/5 d8/5 e0/5 e8/5 + * f0/6 f4/6 + * f8/7 fa/7 fc/7 + * + * ==> a single 256-symbol table made of b0.0 and b1.7-1 provides a direct + * access and the bit count after a miss on the first one above. + * + * These ones fit in three octets : + * ff fe + 3..5 bits : + * 00/3 20/3 40/3 60/4 70/4 80/4 90/4 a0/4 + * b0/4 c0/4 d0/4 + * e0/5 e8/5 f0/5 f8/5 + * + * ff ff + 5..8 bits : + * 00/5 08/5 10/5 18/5 20/5 28/5 30/5 38/5 + * 40/5 + * 48/6 4c/6 50/6 54/6 58/6 5c/6 60/6 64/6 + * 68/6 6c/6 70/6 74/6 78/6 7c/6 80/6 84/6 + * 88/6 8c/6 90/6 94/6 98/6 9c/6 a0/6 a4/6 + * a8/6 ac/6 + * b0/7 b2/7 b4/7 b6/7 b8/7 ba/7 bc/7 be/7 + * c0/7 c2/7 c4/7 c6/7 c8/7 ca/7 cc/7 ce/7 + * d0/7 d2/7 d4/7 d6/7 d8/7 da/7 dc/7 de/7 + * e0/7 e2/7 e4/7 e6/7 e8/7 + * ea/8 eb/8 ec/8 ed/8 ee/8 ef/8 f0/8 f1/8 + * f2/8 f3/8 f4/8 f5/8 + * + * ==> a 32-symbol table has to be applied to 0xfffe + * ==> a 256-symbol table has to be applied to 0xffff + * + * The other ones fit in four octets with 1 to 6 bits in the last one : + * ff ff f6 : 00/1 80/1 + * ff ff f7 : 00/1 80/1 + * ff ff f8 : 00/2 40/2 80/2 c0/2 + * ff ff f9 : 00/2 40/2 80/2 c0/2 + * ff ff fa : 00/2 40/2 80/2 c0/2 + * ff ff fb : 00/2 40/2 80/2 + * ff ff fb : c0/3 e0/3 + * ff ff fc : 00/3 20/3 40/3 60/3 80/3 a0/3 c0/3 e0/3 + * ff ff fd : 00/3 20/3 40/3 60/3 80/3 a0/3 c0/3 e0/3 + * ff ff fe : 00/3 + * ff ff fe : 20/4 30/4 40/4 50/4 60/4 70/4 80/4 90/4 a0/4 b0/4 c0/4 d0/4 e0/4 f0/4 + * ff ff ff : 00/4 10/4 20/4 30/4 40/4 50/4 60/4 70/4 80/4 90/4 a0/4 b0/4 c0/4 d0/4 e0/4 + * ff ff ff : f0/6 f4/6 f8/6 fc/6 + * + * ==> a 256-symbol table with b2.0-3,b3.7-4 gives all of them except the + * distinction between ffffff{f0,f4,f8,fc} which is rare enough + * and can be done by hand when bit count == 30. 
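 *
 * Worked example of the single-byte case: 'a' is coded 00011 on 5 bits
 * (ht[97] = 0x03/5), so once MSB-aligned its top byte is 0x18..0x1f; all
 * eight of those rht_bit31_24[] entries map back to 0x61, and since
 * 0x18..0x1f < 0x50 the decoder charges 5 bits before fetching the next
 * code.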
+ * + * + * Code lengths : + * 5..8 : 0x00..0xfe + * 10..15 : 0xfe + * 0xff 0x00..0xfe + * 19..20 : 0xff 0xfe 0x00..0xdf + * 21 : 0xff 0xfe 0xe0..0xff + * 21 : 0xff 0xff 0x00..0x40 + * 22..24 : 0xff 0xff 0x00..0xf5 + * 24..28 : 0xff 0xff 0xf5..0xff + * 30 : 0xff 0xff 0xff 0xf0..0xff + * + * + * if b0 < 0xfe ==> 5..8 bits (74 codes) + * if b0 == 0xfe or 0xff : 10..15 + * => if b0 == 0xfe || b1 < 0xfe : lookup (b0:0|b1:7..1) (21 codes) + * + * -- b0 = 0xff -- + * if b1 == 0xfe : 19..21 bits + * => lookup b2:7..3 (15 codes) + * + * -- b0 = 0xff, b1 = 0xff : 147 codes -- + * if b2 < 0xf6 : 21..24 bits (76 codes) + * if b2 >= 0xf6 : 25..30 bits (71 codes) + * + * Algorithm: + * - if > 24 and < 32, read missing bits. + * - if less than 24 bits, read 1 byte. If past end, insert 0xff instead. + * - if b0 < 0xfe lookup b0 in table0[0..255] + * - else if b0 == 0xfe, manual lookup + * - else if b0 == 0xff, lookup b1 in table1[0..255] + * ... + */ + +uint8_t rht_bit31_24[256] = { + /* 0x00 */ 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, + /* 0x08 */ 0x31, 0x31, 0x31, 0x31, 0x31, 0x31, 0x31, 0x31, + /* 0x10 */ 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, + /* 0x18 */ 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, 0x61, + /* 0x20 */ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, + /* 0x28 */ 0x65, 0x65, 0x65, 0x65, 0x65, 0x65, 0x65, 0x65, + /* 0x30 */ 0x69, 0x69, 0x69, 0x69, 0x69, 0x69, 0x69, 0x69, + /* 0x38 */ 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, + /* 0x40 */ 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, 0x73, + /* 0x48 */ 0x74, 0x74, 0x74, 0x74, 0x74, 0x74, 0x74, 0x74, + /* 0x50 */ 0x20, 0x20, 0x20, 0x20, + /* 0x54 */ 0x25, 0x25, 0x25, 0x25, + /* 0x58 */ 0x2d, 0x2d, 0x2d, 0x2d, + /* 0x5c */ 0x2e, 0x2e, 0x2e, 0x2e, + /* 0x60 */ 0x2f, 0x2f, 0x2f, 0x2f, + /* 0x64 */ 0x33, 0x33, 0x33, 0x33, + /* 0x68 */ 0x34, 0x34, 0x34, 0x34, + /* 0x6c */ 0x35, 0x35, 0x35, 0x35, + /* 0x70 */ 0x36, 0x36, 0x36, 0x36, + /* 0x74 */ 0x37, 0x37, 0x37, 0x37, + /* 0x78 */ 0x38, 0x38, 0x38, 0x38, + /* 0x7c */ 0x39, 0x39, 0x39, 0x39, + /* 0x80 */ 0x3d, 0x3d, 0x3d, 0x3d, + /* 0x84 */ 0x41, 0x41, 0x41, 0x41, + /* 0x88 */ 0x5f, 0x5f, 0x5f, 0x5f, + /* 0x8c */ 0x62, 0x62, 0x62, 0x62, + /* 0x90 */ 0x64, 0x64, 0x64, 0x64, + /* 0x94 */ 0x66, 0x66, 0x66, 0x66, + /* 0x98 */ 0x67, 0x67, 0x67, 0x67, + /* 0x9c */ 0x68, 0x68, 0x68, 0x68, + /* 0xa0 */ 0x6c, 0x6c, 0x6c, 0x6c, + /* 0xa4 */ 0x6d, 0x6d, 0x6d, 0x6d, + /* 0xa8 */ 0x6e, 0x6e, 0x6e, 0x6e, + /* 0xac */ 0x70, 0x70, 0x70, 0x70, + /* 0xb0 */ 0x72, 0x72, 0x72, 0x72, + /* 0xb4 */ 0x75, 0x75, 0x75, 0x75, + /* 0xb8 */ 0x3a, 0x3a, + /* 0xba */ 0x42, 0x42, + /* 0xbc */ 0x43, 0x43, + /* 0xbe */ 0x44, 0x44, + /* 0xc0 */ 0x45, 0x45, + /* 0xc2 */ 0x46, 0x46, + /* 0xc4 */ 0x47, 0x47, + /* 0xc6 */ 0x48, 0x48, + /* 0xc8 */ 0x49, 0x49, + /* 0xca */ 0x4a, 0x4a, + /* 0xcc */ 0x4b, 0x4b, + /* 0xce */ 0x4c, 0x4c, + /* 0xd0 */ 0x4d, 0x4d, + /* 0xd2 */ 0x4e, 0x4e, + /* 0xd4 */ 0x4f, 0x4f, + /* 0xd6 */ 0x50, 0x50, + /* 0xd8 */ 0x51, 0x51, + /* 0xda */ 0x52, 0x52, + /* 0xdc */ 0x53, 0x53, + /* 0xde */ 0x54, 0x54, + /* 0xe0 */ 0x55, 0x55, + /* 0xe2 */ 0x56, 0x56, + /* 0xe4 */ 0x57, 0x57, + /* 0xe6 */ 0x59, 0x59, + /* 0xe8 */ 0x6a, 0x6a, + /* 0xea */ 0x6b, 0x6b, + /* 0xec */ 0x71, 0x71, + /* 0xee */ 0x76, 0x76, + /* 0xf0 */ 0x77, 0x77, + /* 0xf2 */ 0x78, 0x78, + /* 0xf4 */ 0x79, 0x79, + /* 0xf6 */ 0x7a, 0x7a, + /* 0xf8 */ 0x26, + /* 0xf9 */ 0x2a, + /* 0xfa */ 0x2c, + /* 0xfb */ 0x3b, + /* 0xfc */ 0x58, + /* 0xfd */ 0x5a, +}; + +uint8_t rht_bit24_17[256] = { + /* 0x00 */ 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 
0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, + /* 0x10 */ 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, + /* 0x20 */ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, + /* 0x30 */ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, + /* 0x40 */ 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, + /* 0x50 */ 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, + /* 0x60 */ 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, + /* 0x70 */ 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, 0x29, + /* 0x80 */ 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, + /* 0x90 */ 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, + /* 0xa0 */ 0x27, 0x27, 0x27, 0x27, 0x27, 0x27, 0x27, 0x27, 0x27, 0x27, 0x27, 0x27, 0x27, 0x27, 0x27, 0x27, + /* 0xb0 */ 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, + /* 0xc0 */ 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, 0x7c, + /* 0xd0 */ 0x23, 0x23, 0x23, 0x23, 0x23, 0x23, 0x23, 0x23, + /* 0xd8 */ 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, + /* 0xe0 */ 0x00, 0x00, 0x00, 0x00, + /* 0xe4 */ 0x24, 0x24, 0x24, 0x24, + /* 0xe8 */ 0x40, 0x40, 0x40, 0x40, + /* 0xec */ 0x5b, 0x5b, 0x5b, 0x5b, + /* 0xf0 */ 0x5d, 0x5d, 0x5d, 0x5d, + /* 0xf4 */ 0x7e, 0x7e, 0x7e, 0x7e, + /* 0xf8 */ 0x5e, 0x5e, + /* 0xfa */ 0x7d, 0x7d, + /* 0xfc */ 0x3c, + /* 0xfd */ 0x60, + /* 0xfe */ 0x7b, +}; + +uint8_t rht_bit15_8[256] = { + /* 0x00 */ 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, + /* 0x08 */ 0xb1, 0xb1, 0xb1, 0xb1, 0xb1, 0xb1, 0xb1, 0xb1, + /* 0x10 */ 0xb3, 0xb3, 0xb3, 0xb3, 0xb3, 0xb3, 0xb3, 0xb3, + /* 0x18 */ 0xd1, 0xd1, 0xd1, 0xd1, 0xd1, 0xd1, 0xd1, 0xd1, + /* 0x20 */ 0xd8, 0xd8, 0xd8, 0xd8, 0xd8, 0xd8, 0xd8, 0xd8, + /* 0x28 */ 0xd9, 0xd9, 0xd9, 0xd9, 0xd9, 0xd9, 0xd9, 0xd9, + /* 0x30 */ 0xe3, 0xe3, 0xe3, 0xe3, 0xe3, 0xe3, 0xe3, 0xe3, + /* 0x38 */ 0xe5, 0xe5, 0xe5, 0xe5, 0xe5, 0xe5, 0xe5, 0xe5, + /* 0x40 */ 0xe6, 0xe6, 0xe6, 0xe6, 0xe6, 0xe6, 0xe6, 0xe6, + /* 0x48 */ 0x81, 0x81, 0x81, 0x81, + /* 0x4c */ 0x84, 0x84, 0x84, 0x84, + /* 0x50 */ 0x85, 0x85, 0x85, 0x85, + /* 0x54 */ 0x86, 0x86, 0x86, 0x86, + /* 0x58 */ 0x88, 0x88, 0x88, 0x88, + /* 0x5c */ 0x92, 0x92, 0x92, 0x92, + /* 0x60 */ 0x9a, 0x9a, 0x9a, 0x9a, + /* 0x64 */ 0x9c, 0x9c, 0x9c, 0x9c, + /* 0x68 */ 0xa0, 0xa0, 0xa0, 0xa0, + /* 0x6c */ 0xa3, 0xa3, 0xa3, 0xa3, + /* 0x70 */ 0xa4, 0xa4, 0xa4, 0xa4, + /* 0x74 */ 0xa9, 0xa9, 0xa9, 0xa9, + /* 0x78 */ 0xaa, 0xaa, 0xaa, 0xaa, + /* 0x7c */ 0xad, 0xad, 0xad, 0xad, + /* 0x80 */ 0xb2, 0xb2, 0xb2, 0xb2, + /* 0x84 */ 0xb5, 0xb5, 0xb5, 0xb5, + /* 0x88 */ 0xb9, 0xb9, 0xb9, 0xb9, + /* 0x8c */ 0xba, 0xba, 0xba, 0xba, + /* 0x90 */ 0xbb, 0xbb, 0xbb, 0xbb, + /* 0x94 */ 0xbd, 0xbd, 0xbd, 0xbd, + /* 0x98 */ 0xbe, 0xbe, 0xbe, 0xbe, + /* 0x9c */ 0xc4, 0xc4, 0xc4, 0xc4, + /* 0xa0 */ 0xc6, 0xc6, 0xc6, 0xc6, + /* 0xa4 */ 0xe4, 0xe4, 0xe4, 0xe4, + /* 0xa8 */ 0xe8, 0xe8, 0xe8, 0xe8, + /* 0xac */ 0xe9, 0xe9, 0xe9, 0xe9, + /* 0xb0 */ 0x01, 0x01, + /* 0xb2 */ 0x87, 0x87, + /* 0xb4 */ 0x89, 0x89, + /* 0xb6 */ 0x8a, 0x8a, + /* 0xb8 */ 0x8b, 0x8b, + /* 0xba */ 0x8c, 0x8c, + /* 0xbc */ 0x8d, 0x8d, + /* 0xbe */ 
0x8f, 0x8f, + /* 0xc0 */ 0x93, 0x93, + /* 0xc2 */ 0x95, 0x95, + /* 0xc4 */ 0x96, 0x96, + /* 0xc6 */ 0x97, 0x97, + /* 0xc8 */ 0x98, 0x98, + /* 0xca */ 0x9b, 0x9b, + /* 0xcc */ 0x9d, 0x9d, + /* 0xce */ 0x9e, 0x9e, + /* 0xd0 */ 0xa5, 0xa5, + /* 0xd2 */ 0xa6, 0xa6, + /* 0xd4 */ 0xa8, 0xa8, + /* 0xd6 */ 0xae, 0xae, + /* 0xd8 */ 0xaf, 0xaf, + /* 0xda */ 0xb4, 0xb4, + /* 0xdc */ 0xb6, 0xb6, + /* 0xde */ 0xb7, 0xb7, + /* 0xe0 */ 0xbc, 0xbc, + /* 0xe2 */ 0xbf, 0xbf, + /* 0xe4 */ 0xc5, 0xc5, + /* 0xe6 */ 0xe7, 0xe7, + /* 0xe8 */ 0xef, 0xef, + /* 0xea */ 0x09, + /* 0xeb */ 0x8e, + /* 0xec */ 0x90, + /* 0xed */ 0x91, + /* 0xee */ 0x94, + /* 0xef */ 0x9f, + /* 0xf0 */ 0xab, + /* 0xf1 */ 0xce, + /* 0xf2 */ 0xd7, + /* 0xf3 */ 0xe1, + /* 0xf4 */ 0xec, + /* 0xf5 */ 0xed, +}; + +/* below two non-overlapping tables are merged in order to save on L1D: + * - bits 15-11 for values 0x00-0x1f + * - bits 11-4 for values 0x60-0xff + * Note that there's no data between 0x20 and 0x5f, the caller must + * adjust its offsets by subtracting 0x40 for values 0x60 and above. + */ +uint8_t rht_bit15_11_11_4[192] = { + /* part used for bits 15-11 (0x00-0x1f) */ + /* 0x00 */ 0x5c, 0x5c, 0x5c, 0x5c, + /* 0x04 */ 0xc3, 0xc3, 0xc3, 0xc3, + /* 0x08 */ 0xd0, 0xd0, 0xd0, 0xd0, + /* 0x0c */ 0x80, 0x80, + /* 0x0e */ 0x82, 0x82, + /* 0x10 */ 0x83, 0x83, + /* 0x12 */ 0xa2, 0xa2, + /* 0x14 */ 0xb8, 0xb8, + /* 0x16 */ 0xc2, 0xc2, + /* 0x18 */ 0xe0, 0xe0, + /* 0x1a */ 0xe2, 0xe2, + /* 0x1c */ 0x99, + /* 0x1d */ 0xa1, + /* 0x1e */ 0xa7, + /* 0x1f */ 0xac, + + /* part used for bits 11-4 for 0xf600 (0x60-0xff), starting @0x20 */ + /* 0x60 */ 0xc7, 0xc7, 0xc7, 0xc7, 0xc7, 0xc7, 0xc7, 0xc7, + /* 0x68 */ 0xcf, 0xcf, 0xcf, 0xcf, 0xcf, 0xcf, 0xcf, 0xcf, + /* 0x70 */ 0xea, 0xea, 0xea, 0xea, 0xea, 0xea, 0xea, 0xea, + /* 0x78 */ 0xeb, 0xeb, 0xeb, 0xeb, 0xeb, 0xeb, 0xeb, 0xeb, + /* 0x80 */ 0xc0, 0xc0, 0xc0, 0xc0, + /* 0x84 */ 0xc1, 0xc1, 0xc1, 0xc1, + /* 0x88 */ 0xc8, 0xc8, 0xc8, 0xc8, + /* 0x8c */ 0xc9, 0xc9, 0xc9, 0xc9, + /* 0x90 */ 0xca, 0xca, 0xca, 0xca, + /* 0x94 */ 0xcd, 0xcd, 0xcd, 0xcd, + /* 0x98 */ 0xd2, 0xd2, 0xd2, 0xd2, + /* 0x9c */ 0xd5, 0xd5, 0xd5, 0xd5, + /* 0xa0 */ 0xda, 0xda, 0xda, 0xda, + /* 0xa4 */ 0xdb, 0xdb, 0xdb, 0xdb, + /* 0xa8 */ 0xee, 0xee, 0xee, 0xee, + /* 0xac */ 0xf0, 0xf0, 0xf0, 0xf0, + /* 0xb0 */ 0xf2, 0xf2, 0xf2, 0xf2, + /* 0xb4 */ 0xf3, 0xf3, 0xf3, 0xf3, + /* 0xb8 */ 0xff, 0xff, 0xff, 0xff, + /* 0xbc */ 0xcb, 0xcb, + /* 0xbe */ 0xcc, 0xcc, + /* 0xc0 */ 0xd3, 0xd3, + /* 0xc2 */ 0xd4, 0xd4, + /* 0xc4 */ 0xd6, 0xd6, + /* 0xc6 */ 0xdd, 0xdd, + /* 0xc8 */ 0xde, 0xde, + /* 0xca */ 0xdf, 0xdf, + /* 0xcc */ 0xf1, 0xf1, + /* 0xce */ 0xf4, 0xf4, + /* 0xd0 */ 0xf5, 0xf5, + /* 0xd2 */ 0xf6, 0xf6, + /* 0xd4 */ 0xf7, 0xf7, + /* 0xd6 */ 0xf8, 0xf8, + /* 0xd8 */ 0xfa, 0xfa, + /* 0xda */ 0xfb, 0xfb, + /* 0xdc */ 0xfc, 0xfc, + /* 0xde */ 0xfd, 0xfd, + /* 0xe0 */ 0xfe, 0xfe, + /* 0xe2 */ 0x02, + /* 0xe3 */ 0x03, + /* 0xe4 */ 0x04, + /* 0xe5 */ 0x05, + /* 0xe6 */ 0x06, + /* 0xe7 */ 0x07, + /* 0xe8 */ 0x08, + /* 0xe9 */ 0x0b, + /* 0xea */ 0x0c, + /* 0xeb */ 0x0e, + /* 0xec */ 0x0f, + /* 0xed */ 0x10, + /* 0xee */ 0x11, + /* 0xef */ 0x12, + /* 0xf0 */ 0x13, + /* 0xf1 */ 0x14, + /* 0xf2 */ 0x15, + /* 0xf3 */ 0x17, + /* 0xf4 */ 0x18, + /* 0xf5 */ 0x19, + /* 0xf6 */ 0x1a, + /* 0xf7 */ 0x1b, + /* 0xf8 */ 0x1c, + /* 0xf9 */ 0x1d, + /* 0xfa */ 0x1e, + /* 0xfb */ 0x1f, + /* 0xfc */ 0x7f, + /* 0xfd */ 0xdc, + /* 0xfe */ 0xf9, + /* 0xff */ 0x0a, + /* Note, for [0xff], l==30 and bits 2..3 give 00:0x0a, 01:0x0d, 10:0x16, 11:EOS */ +}; + +/* 
huffman-encode string <s> into buffer <out> and returns the number + * of output bytes. The caller must ensure the output is large enough (ie at + * least 4 times as long as <s>). + * + * FIXME: bits are only counted for now, no code is emitted! + */ +int huff_enc(const char *s, char *out) +{ + int bits = 0; + + while (*s) { + bits += ht[(uint8_t)*s].b; + s++; + } + bits += 7; + + /* FIXME: huffman code is not emitted yet. */ + //memset(out, 'H', bits / 8); + return bits / 8; +} + +/* pass a huffman string, it will decode it and return the new output size or + * -1 in case of error. + * + * The principle of the decoder is to lookup full bytes in reverse-huffman + * tables. Since we may need up to 30 bits and the word positions are not + * always multiples of 8, we build the code word by shifting the "current" + * 32-bit word and the "next" one by the appropriate number of bits. Once + * the shift goes beyond 32, words are swapped and the "next" one is refilled + * with new bytes. Shift operations are cheap when done a single time like this. + * On 64-bit platforms it is possible to further improve this by storing both + * of them in a single word. + */ +int huff_dec(const uint8_t *huff, int hlen, char *out, int olen) +{ + char *out_start = out; + char *out_end = out + olen; + const uint8_t *huff_end = huff + hlen; + uint32_t curr = 0; + uint32_t next = 0; + uint32_t shift; + uint32_t code; /* The 30-bit code being looked up, MSB-aligned */ + uint8_t sym; + int bleft; /* bits left */ + int l; + + code = 0; + shift = 64; // start with an empty buffer + bleft = hlen << 3; + while (bleft > 0 && out != out_end) { + while (shift >= 32) { + curr = next; + + /* read up to 4 bytes into next */ + next = 0; + + if (huff + 4 <= huff_end) { + next = read_n32(huff); + huff += 4; + } + else { + /* note: we append 0 and not 0xff so that we can + * distinguish shifted bits from a really inserted + * EOS. + */ + next = (((huff + 0 < huff_end) ? (uint32_t)huff[0] : 0x00) << 24) + + (((huff + 1 < huff_end) ? (uint32_t)huff[1] : 0x00) << 16) + + (((huff + 2 < huff_end) ? (uint32_t)huff[2] : 0x00) << 8) + + ((huff + 3 < huff_end) ? (uint32_t)huff[3] : 0x00); + huff = huff_end; + } + + shift -= 32; + } + + /* curr:next contain 64 bit of huffman code */ + code = curr; + if (shift) + code = (code << shift) + (next >> (32 - shift)); + + /* now we necessarily have 32 bits available */ + if (code < 0xfe000000) { + /* single byte */ + sym = code >> 24; + l = sym < 0xb8 ? + sym < 0x50 ? 5 : 6 : + sym < 0xf8 ? 7 : 8; + sym = rht_bit31_24[code >> 24]; + } + else if (code < 0xfffe0000) { + /* two bytes, 0xfe + 2 bits or 0xff + 2..7 bits */ + sym = code >> 17; + l = sym < 0xe0 ? + sym < 0xa0 ? 10 : sym < 0xd0 ? 11 : 12 : + sym < 0xf8 ? 13 : sym < 0xfc ? 14 : 15; + + sym = rht_bit24_17[(code >> 17) & 0xff]; + } + else if (code < 0xffff0000) { /* 3..5 bits */ + /* 0xff + 0xfe + 3..5 bits or + * 0xff + 0xff + 5..8 bits for values till 0xf5 + */ + sym = (code >> 11) & 0x1f; + l = sym < 0x0c ? 19 : sym < 0x1c ? 20 : 21; + sym = rht_bit15_11_11_4[(code >> 11) & 0x1f]; + } + else if (code < 0xfffff600) { /* 5..8 bits */ + /* that's 0xff + 0xff */ + sym = code >> 8; + + l = sym < 0xb0 ? + sym < 0x48 ? 21 : 22 : + sym < 0xea ? 23 : 24; + sym = rht_bit15_8[(code >> 8) & 0xff]; + } + else { + /* 0xff 0xff 0xf6..0xff */ + sym = code >> 4; /* sym = 0x60..0xff */ + l = sym < 0xbc ? + sym < 0x80 ? 25 : 26 : + sym < 0xe2 ? 27 : sym < 0xff ? 
28 : 30; + if (sym < 0xff) + sym = rht_bit15_11_11_4[((code >> 4) & 0xff) - 0x40L]; + else if ((code & 0xff) == 0xf0) + sym = 10; + else if ((code & 0xff) == 0xf4) + sym = 13; + else if ((code & 0xff) == 0xf8) + sym = 22; + else { // 0xfc : EOS + break; + } + } + + if (!l || bleft - l < 0) + break; + + bleft -= l; + shift += l; + *out++ = sym; + } + + if (bleft > 0) { + /* some bits were not consumed after the last code, they must + * match EOS (ie: all ones) and there must be 7 bits or less. + * (7541#5.2). + */ + if (bleft > 7) + return -1; + + if ((code & -(1 << (32 - bleft))) != (uint32_t)-(1 << (32 - bleft))) + return -1; + } + + if (out < out_end) + *out = 0; // end of string whenever possible + return out - out_start; +} diff --git a/src/hpack-tbl.c b/src/hpack-tbl.c new file mode 100644 index 0000000..990d2f7 --- /dev/null +++ b/src/hpack-tbl.c @@ -0,0 +1,372 @@ +/* + * HPACK header table management (RFC7541) + * + * Copyright (C) 2014-2017 Willy Tarreau <willy@haproxy.org> + * Copyright (C) 2017 HAProxy Technologies + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <inttypes.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <import/ist.h> +#include <haproxy/hpack-huff.h> +#include <haproxy/hpack-tbl.h> + +/* static header table as in RFC7541 Appendix A. [0] unused. 
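 *
 * A decoder resolves indexes 1..61 directly in this table and anything
 * above in the dynamic table; a hedged sketch of the static half (the
 * real accessor is hpack_idx_to_name(), declared in hpack-tbl.h):
 *
 *   if (idx > 0 && idx < HPACK_SHT_SIZE)
 *           name = hpack_sht[idx].n;   // e.g. idx 31 -> "content-type"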
*/ +const struct http_hdr hpack_sht[HPACK_SHT_SIZE] = { + [ 1] = { .n = IST(":authority"), .v = IST("") }, + [ 2] = { .n = IST(":method"), .v = IST("GET") }, + [ 3] = { .n = IST(":method"), .v = IST("POST") }, + [ 4] = { .n = IST(":path"), .v = IST("/") }, + [ 5] = { .n = IST(":path"), .v = IST("/index.html") }, + [ 6] = { .n = IST(":scheme"), .v = IST("http") }, + [ 7] = { .n = IST(":scheme"), .v = IST("https") }, + [ 8] = { .n = IST(":status"), .v = IST("200") }, + [ 9] = { .n = IST(":status"), .v = IST("204") }, + [10] = { .n = IST(":status"), .v = IST("206") }, + [11] = { .n = IST(":status"), .v = IST("304") }, + [12] = { .n = IST(":status"), .v = IST("400") }, + [13] = { .n = IST(":status"), .v = IST("404") }, + [14] = { .n = IST(":status"), .v = IST("500") }, + [15] = { .n = IST("accept-charset"), .v = IST("") }, + [16] = { .n = IST("accept-encoding"), .v = IST("gzip, deflate") }, + [17] = { .n = IST("accept-language"), .v = IST("") }, + [18] = { .n = IST("accept-ranges"), .v = IST("") }, + [19] = { .n = IST("accept"), .v = IST("") }, + [20] = { .n = IST("access-control-allow-origin"), .v = IST("") }, + [21] = { .n = IST("age"), .v = IST("") }, + [22] = { .n = IST("allow"), .v = IST("") }, + [23] = { .n = IST("authorization"), .v = IST("") }, + [24] = { .n = IST("cache-control"), .v = IST("") }, + [25] = { .n = IST("content-disposition"), .v = IST("") }, + [26] = { .n = IST("content-encoding"), .v = IST("") }, + [27] = { .n = IST("content-language"), .v = IST("") }, + [28] = { .n = IST("content-length"), .v = IST("") }, + [29] = { .n = IST("content-location"), .v = IST("") }, + [30] = { .n = IST("content-range"), .v = IST("") }, + [31] = { .n = IST("content-type") , .v = IST("") }, + [32] = { .n = IST("cookie"), .v = IST("") }, + [33] = { .n = IST("date"), .v = IST("") }, + [34] = { .n = IST("etag"), .v = IST("") }, + [35] = { .n = IST("expect"), .v = IST("") }, + [36] = { .n = IST("expires"), .v = IST("") }, + [37] = { .n = IST("from"), .v = IST("") }, + [38] = { .n = IST("host"), .v = IST("") }, + [39] = { .n = IST("if-match"), .v = IST("") }, + [40] = { .n = IST("if-modified-since"), .v = IST("") }, + [41] = { .n = IST("if-none-match"), .v = IST("") }, + [42] = { .n = IST("if-range"), .v = IST("") }, + [43] = { .n = IST("if-unmodified-since"), .v = IST("") }, + [44] = { .n = IST("last-modified"), .v = IST("") }, + [45] = { .n = IST("link"), .v = IST("") }, + [46] = { .n = IST("location"), .v = IST("") }, + [47] = { .n = IST("max-forwards"), .v = IST("") }, + [48] = { .n = IST("proxy-authenticate"), .v = IST("") }, + [49] = { .n = IST("proxy-authorization"), .v = IST("") }, + [50] = { .n = IST("range"), .v = IST("") }, + [51] = { .n = IST("referer"), .v = IST("") }, + [52] = { .n = IST("refresh"), .v = IST("") }, + [53] = { .n = IST("retry-after"), .v = IST("") }, + [54] = { .n = IST("server"), .v = IST("") }, + [55] = { .n = IST("set-cookie"), .v = IST("") }, + [56] = { .n = IST("strict-transport-security"), .v = IST("") }, + [57] = { .n = IST("transfer-encoding"), .v = IST("") }, + [58] = { .n = IST("user-agent"), .v = IST("") }, + [59] = { .n = IST("vary"), .v = IST("") }, + [60] = { .n = IST("via"), .v = IST("") }, + [61] = { .n = IST("www-authenticate"), .v = IST("") }, +}; + +struct pool_head *pool_head_hpack_tbl __read_mostly = NULL; + +#ifdef DEBUG_HPACK +/* dump the whole dynamic header table */ +void hpack_dht_dump(FILE *out, const struct hpack_dht *dht) +{ + unsigned int i; + unsigned int slot; + char name[4096], value[4096]; + + for (i = HPACK_SHT_SIZE; i < 
HPACK_SHT_SIZE + dht->used; i++) { + slot = (hpack_get_dte(dht, i - HPACK_SHT_SIZE + 1) - dht->dte); + fprintf(out, "idx=%u slot=%u name=<%s> value=<%s> addr=%u-%u\n", + i, slot, + istpad(name, hpack_idx_to_name(dht, i)).ptr, + istpad(value, hpack_idx_to_value(dht, i)).ptr, + dht->dte[slot].addr, dht->dte[slot].addr+dht->dte[slot].nlen+dht->dte[slot].vlen-1); + } +} + +/* check for the whole dynamic header table consistency, abort on failures */ +void hpack_dht_check_consistency(const struct hpack_dht *dht) +{ + unsigned slot = hpack_dht_get_tail(dht); + unsigned used2 = dht->used; + unsigned total = 0; + + if (!dht->used) + return; + + if (dht->front >= dht->wrap) + abort(); + + if (dht->used > dht->wrap) + abort(); + + if (dht->head >= dht->wrap) + abort(); + + while (used2--) { + total += dht->dte[slot].nlen + dht->dte[slot].vlen; + slot++; + if (slot >= dht->wrap) + slot = 0; + } + + if (total != dht->total) { + fprintf(stderr, "%d: total=%u dht=%u\n", __LINE__, total, dht->total); + abort(); + } +} +#endif // DEBUG_HPACK + +/* rebuild a new dynamic header table from <dht> with an unwrapped index and + * contents at the end. The new table is returned, the caller must not use the + * previous one anymore. NULL may be returned if no table could be allocated. + */ +static struct hpack_dht *hpack_dht_defrag(struct hpack_dht *dht) +{ + struct hpack_dht *alt_dht; + uint16_t old, new; + uint32_t addr; + + /* Note: for small tables we could use alloca() instead but + * portability especially for large tables can be problematic. + */ + alt_dht = hpack_dht_alloc(); + if (!alt_dht) + return NULL; + + alt_dht->total = dht->total; + alt_dht->used = dht->used; + alt_dht->wrap = dht->used; + + new = 0; + addr = alt_dht->size; + + if (dht->used) { + /* start from the tail */ + old = hpack_dht_get_tail(dht); + do { + alt_dht->dte[new].nlen = dht->dte[old].nlen; + alt_dht->dte[new].vlen = dht->dte[old].vlen; + addr -= dht->dte[old].nlen + dht->dte[old].vlen; + alt_dht->dte[new].addr = addr; + + memcpy((void *)alt_dht + alt_dht->dte[new].addr, + (void *)dht + dht->dte[old].addr, + dht->dte[old].nlen + dht->dte[old].vlen); + + old++; + if (old >= dht->wrap) + old = 0; + new++; + } while (new < dht->used); + } + + alt_dht->front = alt_dht->head = new - 1; + + memcpy(dht, alt_dht, dht->size); + hpack_dht_free(alt_dht); + + return dht; +} + +/* Purges table dht until a header field of <needed> bytes fits according to + * the protocol (adding 32 bytes overhead). Returns non-zero on success, zero + * on failure (ie: table empty but still not sufficient). It must only be + * called when the table is not large enough to suit the new entry and there + * are some entries left. In case of doubt, use dht_make_room() instead. + */ +int __hpack_dht_make_room(struct hpack_dht *dht, unsigned int needed) +{ + unsigned int used = dht->used; + unsigned int wrap = dht->wrap; + unsigned int tail; + + do { + tail = ((dht->head + 1U < used) ? 
wrap : 0) + dht->head + 1U - used; + dht->total -= dht->dte[tail].nlen + dht->dte[tail].vlen; + if (tail == dht->front) + dht->front = dht->head; + used--; + } while (used && used * 32 + dht->total + needed + 32 > dht->size); + + dht->used = used; + + /* realign if empty */ + if (!used) + dht->front = dht->head = 0; + + /* pack the table if it doesn't wrap anymore */ + if (dht->head + 1U >= used) + dht->wrap = dht->head + 1; + + /* no need to check for 'used' here as if it doesn't fit, used==0 */ + return needed + 32 <= dht->size; +} + +/* tries to insert a new header <name>:<value> in front of the current head. A + * negative value is returned on error. + */ +int hpack_dht_insert(struct hpack_dht *dht, struct ist name, struct ist value) +{ + unsigned int used; + unsigned int head; + unsigned int prev; + unsigned int wrap; + unsigned int tail; + uint32_t headroom, tailroom; + + if (!hpack_dht_make_room(dht, name.len + value.len)) + return 0; + + /* Now there is enough room in the table, that's guaranteed by the + * protocol, but not necessarily where we need it. + */ + + used = dht->used; + if (!used) { + /* easy, the table was empty */ + dht->front = dht->head = 0; + dht->wrap = dht->used = 1; + dht->total = 0; + head = 0; + dht->dte[head].addr = dht->size - (name.len + value.len); + goto copy; + } + + /* compute the new head, used and wrap position */ + prev = head = dht->head; + wrap = dht->wrap; + tail = hpack_dht_get_tail(dht); + + used++; + head++; + + if (head >= wrap) { + /* head is leading the entries, we either need to push the + * table further or to loop back to released entries. We could + * force to loop back when at least half of the allocatable + * entries are free but in practice it never happens. + */ + if ((sizeof(*dht) + (wrap + 1) * sizeof(dht->dte[0]) <= dht->dte[dht->front].addr)) + wrap++; + else if (head >= used) /* there's a hole at the beginning */ + head = 0; + else { + /* no more room, head hits tail and the index cannot be + * extended, we have to realign the whole table. + */ + if (!hpack_dht_defrag(dht)) + return -1; + + wrap = dht->wrap + 1; + head = dht->head + 1; + prev = head - 1; + tail = 0; + } + } + else if (used >= wrap) { + /* we've hit the tail, we need to reorganize the index so that + * the head is at the end (but not necessarily move the data). + */ + if (!hpack_dht_defrag(dht)) + return -1; + + wrap = dht->wrap + 1; + head = dht->head + 1; + prev = head - 1; + tail = 0; + } + + /* Now we have updated head, used and wrap, we know that there is some + * available room at least from the protocol's perspective. This space + * is split in two areas : + * + * 1: if the previous head was the front cell, the space between the + * end of the index table and the front cell's address. + * 2: if the previous head was the front cell, the space between the + * end of the tail and the end of the table ; or if the previous + * head was not the front cell, the space between the end of the + * tail and the head's address. 
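+ *
+ * As a rough layout sketch (illustrative only; per the allocation code
+ * above, entry data grows downwards from the end of the area while the
+ * index table follows the struct):
+ *
+ *   [ struct hpack_dht | dte[0..wrap-1] | free | entry data ... ] <- size
+ *
+ * Area 1 is the gap right after the index table, area 2 is the free
+ * space left around the tail entry's data.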
+ */ + if (prev == dht->front) { + /* the area was contiguous */ + headroom = dht->dte[dht->front].addr - (sizeof(*dht) + wrap * sizeof(dht->dte[0])); + tailroom = dht->size - dht->dte[tail].addr - dht->dte[tail].nlen - dht->dte[tail].vlen; + } + else { + /* it's already wrapped so we can't store anything in the headroom */ + headroom = 0; + tailroom = dht->dte[prev].addr - dht->dte[tail].addr - dht->dte[tail].nlen - dht->dte[tail].vlen; + } + + /* We can decide to stop filling the headroom as soon as there's enough + * room left in the tail to suit the protocol, but tests show that in + * practice it almost never happens in other situations so the extra + * test is useless and we simply fill the headroom as long as it's + * available and we don't wrap. + */ + if (prev == dht->front && headroom >= name.len + value.len) { + /* install upfront and update ->front */ + dht->dte[head].addr = dht->dte[dht->front].addr - (name.len + value.len); + dht->front = head; + } + else if (tailroom >= name.len + value.len) { + dht->dte[head].addr = dht->dte[tail].addr + dht->dte[tail].nlen + dht->dte[tail].vlen + tailroom - (name.len + value.len); + } + else { + /* need to defragment the table before inserting upfront */ + dht = hpack_dht_defrag(dht); + wrap = dht->wrap + 1; + head = dht->head + 1; + dht->dte[head].addr = dht->dte[dht->front].addr - (name.len + value.len); + dht->front = head; + } + + dht->wrap = wrap; + dht->head = head; + dht->used = used; + + copy: + dht->total += name.len + value.len; + dht->dte[head].nlen = name.len; + dht->dte[head].vlen = value.len; + + memcpy((void *)dht + dht->dte[head].addr, name.ptr, name.len); + memcpy((void *)dht + dht->dte[head].addr + name.len, value.ptr, value.len); + return 0; +} diff --git a/src/hq_interop.c b/src/hq_interop.c new file mode 100644 index 0000000..31c2101 --- /dev/null +++ b/src/hq_interop.c @@ -0,0 +1,174 @@ +#include <haproxy/hq_interop.h> + +#include <import/ist.h> +#include <haproxy/buf.h> +#include <haproxy/connection.h> +#include <haproxy/dynbuf.h> +#include <haproxy/htx.h> +#include <haproxy/http.h> +#include <haproxy/mux_quic.h> +#include <haproxy/qmux_http.h> + +static ssize_t hq_interop_decode_qcs(struct qcs *qcs, struct buffer *b, int fin) +{ + struct htx *htx; + struct htx_sl *sl; + struct buffer htx_buf = BUF_NULL; + struct ist path; + char *ptr = b_head(b); + size_t data = b_data(b); + + /* hq-interop parser does not support buffer wrapping. */ + BUG_ON(b_data(b) != b_contig_data(b, 0)); + + /* hq-interop parser is only done once full message is received. 
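+ * An hq-interop request (HTTP/0.9 over QUIC, as used by the interop
+ * runner) is typically a single line such as "GET /index.html\r\n"
+ * terminated by the stream FIN, hence the wait for <fin> below
+ * (example request shown for illustration only).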
*/ + if (!fin) + return 0; + + b_alloc(&htx_buf); + htx = htx_from_buf(&htx_buf); + + /* skip method */ + while (data && HTTP_IS_TOKEN(*ptr)) { + ptr++; + data--; + } + + if (!data || !HTTP_IS_SPHT(*ptr)) { + fprintf(stderr, "truncated stream\n"); + return -1; + } + + ptr++; + if (!--data) { + fprintf(stderr, "truncated stream\n"); + return -1; + } + + if (HTTP_IS_LWS(*ptr)) { + fprintf(stderr, "malformed stream\n"); + return -1; + } + + /* extract path */ + path.ptr = ptr; + while (data && !HTTP_IS_LWS(*ptr)) { + ptr++; + data--; + } + + if (!data) { + fprintf(stderr, "truncated stream\n"); + return -1; + } + + path.len = ptr - path.ptr; + + sl = htx_add_stline(htx, HTX_BLK_REQ_SL, 0, ist("GET"), path, ist("HTTP/1.0")); + if (!sl) + return -1; + + sl->flags |= HTX_SL_F_BODYLESS; + sl->info.req.meth = find_http_meth("GET", 3); + + htx_add_endof(htx, HTX_BLK_EOH); + htx->flags |= HTX_FL_EOM; + htx_to_buf(htx, &htx_buf); + + if (!qcs_attach_sc(qcs, &htx_buf, fin)) + return -1; + + b_free(&htx_buf); + + return b_data(b); +} + +static struct buffer *mux_get_buf(struct qcs *qcs) +{ + if (!b_size(&qcs->tx.buf)) + b_alloc(&qcs->tx.buf); + + return &qcs->tx.buf; +} + +static size_t hq_interop_snd_buf(struct qcs *qcs, struct buffer *buf, + size_t count) +{ + enum htx_blk_type btype; + struct htx *htx; + struct htx_blk *blk; + int32_t idx; + uint32_t bsize, fsize; + struct buffer *res, outbuf; + size_t total = 0; + + res = mux_get_buf(qcs); + outbuf = b_make(b_tail(res), b_contig_space(res), 0, 0); + + htx = htx_from_buf(buf); + + if (htx->extra && htx->extra == HTX_UNKOWN_PAYLOAD_LENGTH) + qcs->flags |= QC_SF_UNKNOWN_PL_LENGTH; + + while (count && !htx_is_empty(htx) && !(qcs->flags & QC_SF_BLK_MROOM)) { + /* Not implemented : QUIC on backend side */ + idx = htx_get_head(htx); + blk = htx_get_blk(htx, idx); + btype = htx_get_blk_type(blk); + fsize = bsize = htx_get_blksz(blk); + + BUG_ON(btype == HTX_BLK_REQ_SL); + + switch (btype) { + case HTX_BLK_DATA: + if (fsize > count) + fsize = count; + + if (b_room(&outbuf) < fsize) + fsize = b_room(&outbuf); + + if (!fsize) { + qcs->flags |= QC_SF_BLK_MROOM; + goto end; + } + + b_putblk(&outbuf, htx_get_blk_ptr(htx, blk), fsize); + total += fsize; + count -= fsize; + + if (fsize == bsize) + htx_remove_blk(htx, blk); + else + htx_cut_data_blk(htx, blk, fsize); + break; + + /* only body is transferred on HTTP/0.9 */ + case HTX_BLK_RES_SL: + case HTX_BLK_TLR: + case HTX_BLK_EOT: + default: + htx_remove_blk(htx, blk); + total += bsize; + count -= bsize; + break; + } + } + + end: + b_add(res, b_data(&outbuf)); + htx_to_buf(htx, buf); + + return total; +} + +static int hq_interop_attach(struct qcs *qcs, void *conn_ctx) +{ + qcs_wait_http_req(qcs); + return 0; +} + +const struct qcc_app_ops hq_interop_ops = { + .decode_qcs = hq_interop_decode_qcs, + .snd_buf = hq_interop_snd_buf, + .attach = hq_interop_attach, +}; diff --git a/src/http.c b/src/http.c new file mode 100644 index 0000000..9599e0e --- /dev/null +++ b/src/http.c @@ -0,0 +1,1433 @@ +/* + * HTTP semantics + * + * Copyright 2000-2018 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <ctype.h> +#include <haproxy/api.h> +#include <haproxy/http.h> +#include <haproxy/tools.h> + +/* It is about twice as fast on recent architectures to lookup a byte in a + * table than to perform a boolean AND or OR between two tests. Refer to + * RFC2616/RFC5234/RFC7230 for those chars. A token is any ASCII char that is + * neither a separator nor a CTL char. An http ver_token is any ASCII which can + * be found in an HTTP version, which includes 'H', 'T', 'P', '/', '.' and any + * digit. Note: please do not overwrite values in assignment since gcc-2.95 + * will not handle them correctly. It's worth noting that chars 128..255 are + * nothing, not even control chars. + */ +const unsigned char http_char_classes[256] = { + [ 0] = HTTP_FLG_CTL, + [ 1] = HTTP_FLG_CTL, + [ 2] = HTTP_FLG_CTL, + [ 3] = HTTP_FLG_CTL, + [ 4] = HTTP_FLG_CTL, + [ 5] = HTTP_FLG_CTL, + [ 6] = HTTP_FLG_CTL, + [ 7] = HTTP_FLG_CTL, + [ 8] = HTTP_FLG_CTL, + [ 9] = HTTP_FLG_SPHT | HTTP_FLG_LWS | HTTP_FLG_SEP | HTTP_FLG_CTL, + [ 10] = HTTP_FLG_CRLF | HTTP_FLG_LWS | HTTP_FLG_CTL, + [ 11] = HTTP_FLG_CTL, + [ 12] = HTTP_FLG_CTL, + [ 13] = HTTP_FLG_CRLF | HTTP_FLG_LWS | HTTP_FLG_CTL, + [ 14] = HTTP_FLG_CTL, + [ 15] = HTTP_FLG_CTL, + [ 16] = HTTP_FLG_CTL, + [ 17] = HTTP_FLG_CTL, + [ 18] = HTTP_FLG_CTL, + [ 19] = HTTP_FLG_CTL, + [ 20] = HTTP_FLG_CTL, + [ 21] = HTTP_FLG_CTL, + [ 22] = HTTP_FLG_CTL, + [ 23] = HTTP_FLG_CTL, + [ 24] = HTTP_FLG_CTL, + [ 25] = HTTP_FLG_CTL, + [ 26] = HTTP_FLG_CTL, + [ 27] = HTTP_FLG_CTL, + [ 28] = HTTP_FLG_CTL, + [ 29] = HTTP_FLG_CTL, + [ 30] = HTTP_FLG_CTL, + [ 31] = HTTP_FLG_CTL, + [' '] = HTTP_FLG_SPHT | HTTP_FLG_LWS | HTTP_FLG_SEP, + ['!'] = HTTP_FLG_TOK, + ['"'] = HTTP_FLG_SEP, + ['#'] = HTTP_FLG_TOK, + ['$'] = HTTP_FLG_TOK, + ['%'] = HTTP_FLG_TOK, + ['&'] = HTTP_FLG_TOK, + [ 39] = HTTP_FLG_TOK, + ['('] = HTTP_FLG_SEP, + [')'] = HTTP_FLG_SEP, + ['*'] = HTTP_FLG_TOK, + ['+'] = HTTP_FLG_TOK, + [','] = HTTP_FLG_SEP, + ['-'] = HTTP_FLG_TOK, + ['.'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['/'] = HTTP_FLG_SEP | HTTP_FLG_VER, + ['0'] = HTTP_FLG_TOK | HTTP_FLG_VER | HTTP_FLG_DIG, + ['1'] = HTTP_FLG_TOK | HTTP_FLG_VER | HTTP_FLG_DIG, + ['2'] = HTTP_FLG_TOK | HTTP_FLG_VER | HTTP_FLG_DIG, + ['3'] = HTTP_FLG_TOK | HTTP_FLG_VER | HTTP_FLG_DIG, + ['4'] = HTTP_FLG_TOK | HTTP_FLG_VER | HTTP_FLG_DIG, + ['5'] = HTTP_FLG_TOK | HTTP_FLG_VER | HTTP_FLG_DIG, + ['6'] = HTTP_FLG_TOK | HTTP_FLG_VER | HTTP_FLG_DIG, + ['7'] = HTTP_FLG_TOK | HTTP_FLG_VER | HTTP_FLG_DIG, + ['8'] = HTTP_FLG_TOK | HTTP_FLG_VER | HTTP_FLG_DIG, + ['9'] = HTTP_FLG_TOK | HTTP_FLG_VER | HTTP_FLG_DIG, + [':'] = HTTP_FLG_SEP, + [';'] = HTTP_FLG_SEP, + ['<'] = HTTP_FLG_SEP, + ['='] = HTTP_FLG_SEP, + ['>'] = HTTP_FLG_SEP, + ['?'] = HTTP_FLG_SEP, + ['@'] = HTTP_FLG_SEP, + ['A'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['B'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['C'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['D'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['E'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['F'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['G'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['H'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['I'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['J'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['K'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['L'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['M'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['N'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['O'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['P'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['Q'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['R'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['S'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['T'] = HTTP_FLG_TOK | HTTP_FLG_VER, + 
['U'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['V'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['W'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['X'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['Y'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['Z'] = HTTP_FLG_TOK | HTTP_FLG_VER, + ['['] = HTTP_FLG_SEP, + [ 92] = HTTP_FLG_SEP, + [']'] = HTTP_FLG_SEP, + ['^'] = HTTP_FLG_TOK, + ['_'] = HTTP_FLG_TOK, + ['`'] = HTTP_FLG_TOK, + ['a'] = HTTP_FLG_TOK, + ['b'] = HTTP_FLG_TOK, + ['c'] = HTTP_FLG_TOK, + ['d'] = HTTP_FLG_TOK, + ['e'] = HTTP_FLG_TOK, + ['f'] = HTTP_FLG_TOK, + ['g'] = HTTP_FLG_TOK, + ['h'] = HTTP_FLG_TOK, + ['i'] = HTTP_FLG_TOK, + ['j'] = HTTP_FLG_TOK, + ['k'] = HTTP_FLG_TOK, + ['l'] = HTTP_FLG_TOK, + ['m'] = HTTP_FLG_TOK, + ['n'] = HTTP_FLG_TOK, + ['o'] = HTTP_FLG_TOK, + ['p'] = HTTP_FLG_TOK, + ['q'] = HTTP_FLG_TOK, + ['r'] = HTTP_FLG_TOK, + ['s'] = HTTP_FLG_TOK, + ['t'] = HTTP_FLG_TOK, + ['u'] = HTTP_FLG_TOK, + ['v'] = HTTP_FLG_TOK, + ['w'] = HTTP_FLG_TOK, + ['x'] = HTTP_FLG_TOK, + ['y'] = HTTP_FLG_TOK, + ['z'] = HTTP_FLG_TOK, + ['{'] = HTTP_FLG_SEP, + ['|'] = HTTP_FLG_TOK, + ['}'] = HTTP_FLG_SEP, + ['~'] = HTTP_FLG_TOK, + [127] = HTTP_FLG_CTL, +}; + +const int http_err_codes[HTTP_ERR_SIZE] = { + [HTTP_ERR_200] = 200, /* used by "monitor-uri" */ + [HTTP_ERR_400] = 400, + [HTTP_ERR_401] = 401, + [HTTP_ERR_403] = 403, + [HTTP_ERR_404] = 404, + [HTTP_ERR_405] = 405, + [HTTP_ERR_407] = 407, + [HTTP_ERR_408] = 408, + [HTTP_ERR_410] = 410, + [HTTP_ERR_413] = 413, + [HTTP_ERR_421] = 421, + [HTTP_ERR_422] = 422, + [HTTP_ERR_425] = 425, + [HTTP_ERR_429] = 429, + [HTTP_ERR_500] = 500, + [HTTP_ERR_501] = 501, + [HTTP_ERR_502] = 502, + [HTTP_ERR_503] = 503, + [HTTP_ERR_504] = 504, +}; + +const char *http_err_msgs[HTTP_ERR_SIZE] = { + [HTTP_ERR_200] = + "HTTP/1.1 200 OK\r\n" + "Content-length: 58\r\n" + "Cache-Control: no-cache\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>200 OK</h1>\nService ready.\n</body></html>\n", + + [HTTP_ERR_400] = + "HTTP/1.1 400 Bad request\r\n" + "Content-length: 90\r\n" + "Cache-Control: no-cache\r\n" + "Connection: close\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>400 Bad request</h1>\nYour browser sent an invalid request.\n</body></html>\n", + + [HTTP_ERR_401] = + "HTTP/1.1 401 Unauthorized\r\n" + "Content-length: 112\r\n" + "Cache-Control: no-cache\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>401 Unauthorized</h1>\nYou need a valid user and password to access this content.\n</body></html>\n", + + [HTTP_ERR_403] = + "HTTP/1.1 403 Forbidden\r\n" + "Content-length: 93\r\n" + "Cache-Control: no-cache\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>403 Forbidden</h1>\nRequest forbidden by administrative rules.\n</body></html>\n", + + [HTTP_ERR_404] = + "HTTP/1.1 404 Not Found\r\n" + "Content-length: 83\r\n" + "Cache-Control: no-cache\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>404 Not Found</h1>\nThe resource could not be found.\n</body></html>\n", + + [HTTP_ERR_405] = + "HTTP/1.1 405 Method Not Allowed\r\n" + "Content-length: 146\r\n" + "Cache-Control: no-cache\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>405 Method Not Allowed</h1>\nA request was made of a resource using a request method not supported by that resource\n</body></html>\n", + + [HTTP_ERR_407] = + "HTTP/1.1 407 Unauthorized\r\n" + "Content-length: 112\r\n" + "Cache-Control: no-cache\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>407 Unauthorized</h1>\nYou need a valid user and password to access this 
content.\n</body></html>\n", + + [HTTP_ERR_408] = + "HTTP/1.1 408 Request Time-out\r\n" + "Content-length: 110\r\n" + "Cache-Control: no-cache\r\n" + "Connection: close\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>408 Request Time-out</h1>\nYour browser didn't send a complete request in time.\n</body></html>\n", + + [HTTP_ERR_410] = + "HTTP/1.1 410 Gone\r\n" + "Content-length: 114\r\n" + "Cache-Control: no-cache\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>410 Gone</h1>\nThe resource is no longer available and will not be available again.\n</body></html>\n", + + [HTTP_ERR_413] = + "HTTP/1.1 413 Payload Too Large\r\n" + "Content-length: 106\r\n" + "Cache-Control: no-cache\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>413 Payload Too Large</h1>\nThe request entity exceeds the maximum allowed.\n</body></html>\n", + + [HTTP_ERR_421] = + "HTTP/1.1 421 Misdirected Request\r\n" + "Content-length: 104\r\n" + "Cache-Control: no-cache\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>421 Misdirected Request</h1>\nRequest sent to a non-authoritative server.\n</body></html>\n", + + [HTTP_ERR_422] = + "HTTP/1.1 422 Unprocessable Content\r\n" + "Content-length: 116\r\n" + "Cache-Control: no-cache\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>422 Unprocessable Content</h1>\nThe server cannot process the contained instructions.\n</body></html>\n", + + [HTTP_ERR_425] = + "HTTP/1.1 425 Too Early\r\n" + "Content-length: 80\r\n" + "Cache-Control: no-cache\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>425 Too Early</h1>\nYour browser sent early data.\n</body></html>\n", + + [HTTP_ERR_429] = + "HTTP/1.1 429 Too Many Requests\r\n" + "Content-length: 117\r\n" + "Cache-Control: no-cache\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>429 Too Many Requests</h1>\nYou have sent too many requests in a given amount of time.\n</body></html>\n", + + [HTTP_ERR_500] = + "HTTP/1.1 500 Internal Server Error\r\n" + "Content-length: 97\r\n" + "Cache-Control: no-cache\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>500 Internal Server Error</h1>\nAn internal server error occurred.\n</body></html>\n", + + [HTTP_ERR_501] = + "HTTP/1.1 501 Not Implemented\r\n" + "Content-length: 136\r\n" + "Cache-Control: no-cache\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>501 Not Implemented</h1>\n.The server does not support the functionality required to fulfill the request.\n</body></html>\n", + + [HTTP_ERR_502] = + "HTTP/1.1 502 Bad Gateway\r\n" + "Content-length: 107\r\n" + "Cache-Control: no-cache\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>502 Bad Gateway</h1>\nThe server returned an invalid or incomplete response.\n</body></html>\n", + + [HTTP_ERR_503] = + "HTTP/1.1 503 Service Unavailable\r\n" + "Content-length: 107\r\n" + "Cache-Control: no-cache\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>503 Service Unavailable</h1>\nNo server is available to handle this request.\n</body></html>\n", + + [HTTP_ERR_504] = + "HTTP/1.1 504 Gateway Time-out\r\n" + "Content-length: 92\r\n" + "Cache-Control: no-cache\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "<html><body><h1>504 Gateway Time-out</h1>\nThe server didn't respond in time.\n</body></html>\n", +}; + +const struct ist http_known_methods[HTTP_METH_OTHER] = { + [HTTP_METH_OPTIONS] = IST("OPTIONS"), + [HTTP_METH_GET] = IST("GET"), + [HTTP_METH_HEAD] = IST("HEAD"), + [HTTP_METH_POST] = 
IST("POST"), + [HTTP_METH_PUT] = IST("PUT"), + [HTTP_METH_DELETE] = IST("DELETE"), + [HTTP_METH_TRACE] = IST("TRACE"), + [HTTP_METH_CONNECT] = IST("CONNECT"), +}; + +/* + * returns a known method among HTTP_METH_* or HTTP_METH_OTHER for all unknown + * ones. + */ +enum http_meth_t find_http_meth(const char *str, const int len) +{ + const struct ist m = ist2(str, len); + + if (isteq(m, ist("GET"))) return HTTP_METH_GET; + else if (isteq(m, ist("HEAD"))) return HTTP_METH_HEAD; + else if (isteq(m, ist("POST"))) return HTTP_METH_POST; + else if (isteq(m, ist("CONNECT"))) return HTTP_METH_CONNECT; + else if (isteq(m, ist("PUT"))) return HTTP_METH_PUT; + else if (isteq(m, ist("OPTIONS"))) return HTTP_METH_OPTIONS; + else if (isteq(m, ist("DELETE"))) return HTTP_METH_DELETE; + else if (isteq(m, ist("TRACE"))) return HTTP_METH_TRACE; + else return HTTP_METH_OTHER; +} + +/* This function returns HTTP_ERR_<num> (enum) matching http status code. + * Returned value should match codes from http_err_codes. + */ +int http_get_status_idx(unsigned int status) +{ + switch (status) { + case 200: return HTTP_ERR_200; + case 400: return HTTP_ERR_400; + case 401: return HTTP_ERR_401; + case 403: return HTTP_ERR_403; + case 404: return HTTP_ERR_404; + case 405: return HTTP_ERR_405; + case 407: return HTTP_ERR_407; + case 408: return HTTP_ERR_408; + case 410: return HTTP_ERR_410; + case 413: return HTTP_ERR_413; + case 421: return HTTP_ERR_421; + case 422: return HTTP_ERR_422; + case 425: return HTTP_ERR_425; + case 429: return HTTP_ERR_429; + case 500: return HTTP_ERR_500; + case 501: return HTTP_ERR_501; + case 502: return HTTP_ERR_502; + case 503: return HTTP_ERR_503; + case 504: return HTTP_ERR_504; + default: return HTTP_ERR_500; + } +} + +/* This function returns a reason associated with the HTTP status. + * This function never fails, a message is always returned. 
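+ * For example, http_get_reason(404) yields "Not Found", while a status
+ * without a dedicated entry such as 123 falls back to its class reason,
+ * here "Informational".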
+ */ +const char *http_get_reason(unsigned int status) +{ + switch (status) { + case 100: return "Continue"; + case 101: return "Switching Protocols"; + case 102: return "Processing"; + case 200: return "OK"; + case 201: return "Created"; + case 202: return "Accepted"; + case 203: return "Non-Authoritative Information"; + case 204: return "No Content"; + case 205: return "Reset Content"; + case 206: return "Partial Content"; + case 207: return "Multi-Status"; + case 210: return "Content Different"; + case 226: return "IM Used"; + case 300: return "Multiple Choices"; + case 301: return "Moved Permanently"; + case 302: return "Found"; + case 303: return "See Other"; + case 304: return "Not Modified"; + case 305: return "Use Proxy"; + case 307: return "Temporary Redirect"; + case 308: return "Permanent Redirect"; + case 310: return "Too many Redirects"; + case 400: return "Bad Request"; + case 401: return "Unauthorized"; + case 402: return "Payment Required"; + case 403: return "Forbidden"; + case 404: return "Not Found"; + case 405: return "Method Not Allowed"; + case 406: return "Not Acceptable"; + case 407: return "Proxy Authentication Required"; + case 408: return "Request Time-out"; + case 409: return "Conflict"; + case 410: return "Gone"; + case 411: return "Length Required"; + case 412: return "Precondition Failed"; + case 413: return "Request Entity Too Large"; + case 414: return "Request-URI Too Long"; + case 415: return "Unsupported Media Type"; + case 416: return "Requested range unsatisfiable"; + case 417: return "Expectation failed"; + case 418: return "I'm a teapot"; + case 421: return "Misdirected Request"; + case 422: return "Unprocessable Content"; + case 423: return "Locked"; + case 424: return "Method failure"; + case 425: return "Too Early"; + case 426: return "Upgrade Required"; + case 428: return "Precondition Required"; + case 429: return "Too Many Requests"; + case 431: return "Request Header Fields Too Large"; + case 449: return "Retry With"; + case 450: return "Blocked by Windows Parental Controls"; + case 451: return "Unavailable For Legal Reasons"; + case 456: return "Unrecoverable Error"; + case 499: return "client has closed connection"; + case 500: return "Internal Server Error"; + case 501: return "Not Implemented"; + case 502: return "Bad Gateway or Proxy Error"; + case 503: return "Service Unavailable"; + case 504: return "Gateway Time-out"; + case 505: return "HTTP Version not supported"; + case 506: return "Variant also negotiate"; + case 507: return "Insufficient storage"; + case 508: return "Loop detected"; + case 509: return "Bandwidth Limit Exceeded"; + case 510: return "Not extended"; + case 511: return "Network authentication required"; + case 520: return "Web server is returning an unknown error"; + default: + switch (status) { + case 100 ... 199: return "Informational"; + case 200 ... 299: return "Success"; + case 300 ... 399: return "Redirection"; + case 400 ... 499: return "Client Error"; + case 500 ... 599: return "Server Error"; + default: return "Other"; + } + } +} + +/* Returns the ist string corresponding to port part (without ':') in the host + * <host>, IST_NULL if no ':' is found or an empty IST if there is no digit. In + * the last case, the result is the original ist trimmed to 0. So be sure to test + * the result length before doing any pointer arithmetic. 
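+ * For example, "example.org:8080" yields "8080", "example.org" yields
+ * IST_NULL and "example.org:" yields an empty ist.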
+*/
+struct ist http_get_host_port(const struct ist host)
+{
+	char *start, *end, *ptr;
+
+	start = istptr(host);
+	end = istend(host);
+	for (ptr = end; ptr > start && isdigit((unsigned char)*--ptr););
+
+	/* no port found */
+	if (likely(*ptr != ':'))
+		return IST_NULL;
+	if (ptr+1 == end)
+		return isttrim(host, 0);
+
+	return istnext(ist2(ptr, end - ptr));
+}
+
+
+/* Return non-zero if the port <port> is a default port. If the scheme <schm>
+ * is set, it is used to detect default ports (HTTP => 80 and HTTPS => 443).
+ * Otherwise, both are considered as default ports.
+ */
+int http_is_default_port(const struct ist schm, const struct ist port)
+{
+	if (!istlen(port))
+		return 1;
+
+	if (!isttest(schm))
+		return (isteq(port, ist("443")) || isteq(port, ist("80")));
+	else
+		return (isteq(port, ist("443")) && isteqi(schm, ist("https://"))) ||
+		       (isteq(port, ist("80")) && isteqi(schm, ist("http://")));
+}
+
+/* Returns non-zero if the scheme <schm> is syntactically correct according to
+ * RFC3986#3.1, otherwise zero. It expects only the scheme and nothing else
+ * (particularly not the following "://").
+ *     Scheme = alpha *(alpha|digit|'+'|'-'|'.')
+ */
+int http_validate_scheme(const struct ist schm)
+{
+	size_t i;
+
+	for (i = 0; i < schm.len; i++) {
+		if (likely((schm.ptr[i] >= 'a' && schm.ptr[i] <= 'z') ||
+			   (schm.ptr[i] >= 'A' && schm.ptr[i] <= 'Z')))
+			continue;
+		if (unlikely(!i)) // first char must be alpha
+			return 0;
+		if ((schm.ptr[i] >= '0' && schm.ptr[i] <= '9') ||
+		    schm.ptr[i] == '+' || schm.ptr[i] == '-' || schm.ptr[i] == '.')
+			continue;
+		return 0;
+	}
+	return !!i;
+}
+
+/* Parses the URI and looks for the scheme. If not found, an empty ist is
+ * returned. Otherwise, the ist pointing to the scheme is returned.
+ *
+ * <parser> must have been initialized via http_uri_parser_init. See the
+ * related http_uri_parser documentation for the specific API usage.
+ */
+struct ist http_parse_scheme(struct http_uri_parser *parser)
+{
+	const char *ptr, *start, *end;
+
+	if (parser->state >= URI_PARSER_STATE_SCHEME_DONE)
+		goto not_found;
+
+	if (parser->format != URI_PARSER_FORMAT_ABSURI_OR_AUTHORITY)
+		goto not_found;
+
+	ptr = start = istptr(parser->uri);
+	end = istend(parser->uri);
+
+	if (isalpha((unsigned char)*ptr)) {
+		/* this is a scheme as described by RFC3986, par. 3.1, or only
+		 * an authority (in case of a CONNECT method).
+		 */
+		ptr++;
+		/* retrieve the scheme up to the suffix '://'. If the suffix is
+		 * not found, this means there is no scheme and it is an
+		 * authority-only uri.
+		 */
+		while (ptr < end &&
+		       (isalnum((unsigned char)*ptr) || *ptr == '+' || *ptr == '-' || *ptr == '.'))
+			ptr++;
+		if (ptr == end || *ptr++ != ':')
+			goto not_found;
+		if (ptr == end || *ptr++ != '/')
+			goto not_found;
+		if (ptr == end || *ptr++ != '/')
+			goto not_found;
+	}
+	else {
+		goto not_found;
+	}
+
+	parser->uri = ist2(ptr, end - ptr);
+	parser->state = URI_PARSER_STATE_SCHEME_DONE;
+	return ist2(start, ptr - start);
+
+ not_found:
+	parser->state = URI_PARSER_STATE_SCHEME_DONE;
+	return IST_NULL;
+}
+
+/* Parses the URI and looks for the authority, located between the scheme and
+ * the path. If no_userinfo is not zero, the part before the '@' (including it)
+ * is skipped. If not found, an empty ist is returned. Otherwise, the ist
+ * pointing on the authority is returned.
+ *
+ * <parser> must have been initialized via http_uri_parser_init. See the
+ * related http_uri_parser documentation for the specific API usage.
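+ *
+ * For example, once the scheme of "http://user@www.example.org/p/q" has
+ * been consumed, calling this function with no_userinfo set returns
+ * "www.example.org" and leaves the parser on "/p/q".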
+ */ +struct ist http_parse_authority(struct http_uri_parser *parser, int no_userinfo) +{ + const char *ptr, *start, *end; + + if (parser->state >= URI_PARSER_STATE_AUTHORITY_DONE) + goto not_found; + + if (parser->format != URI_PARSER_FORMAT_ABSURI_OR_AUTHORITY) + goto not_found; + + if (parser->state < URI_PARSER_STATE_SCHEME_DONE) + http_parse_scheme(parser); + + ptr = start = istptr(parser->uri); + end = istend(parser->uri); + + while (ptr < end && *ptr != '/') { + if (*ptr++ == '@' && no_userinfo) + start = ptr; + } + + /* OK, ptr point on the '/' or the end */ + + authority: + parser->uri = ist2(ptr, end - ptr); + parser->state = URI_PARSER_STATE_AUTHORITY_DONE; + return ist2(start, ptr - start); + + not_found: + parser->state = URI_PARSER_STATE_AUTHORITY_DONE; + return IST_NULL; +} + +/* Parse the URI from the given transaction (which is assumed to be in request + * phase) and look for the "/" beginning the PATH. If not found, ist2(0,0) is + * returned. Otherwise the pointer and length are returned. + * + * <parser> must have been initialized via http_uri_parser_init. See the + * related http_uri_parser documentation for the specific API usage. + */ +struct ist http_parse_path(struct http_uri_parser *parser) +{ + const char *ptr, *end; + + if (parser->state >= URI_PARSER_STATE_PATH_DONE) + goto not_found; + + if (parser->format == URI_PARSER_FORMAT_EMPTY || + parser->format == URI_PARSER_FORMAT_ASTERISK) { + goto not_found; + } + + ptr = istptr(parser->uri); + end = istend(parser->uri); + + /* If the uri is in absolute-path format, first skip the scheme and + * authority parts. No scheme will be found if the uri is in authority + * format, which indicates that the path won't be present. + */ + if (parser->format == URI_PARSER_FORMAT_ABSURI_OR_AUTHORITY) { + if (parser->state < URI_PARSER_STATE_SCHEME_DONE) { + /* If no scheme found, uri is in authority format. No + * path is present. + */ + if (!isttest(http_parse_scheme(parser))) + goto not_found; + } + + if (parser->state < URI_PARSER_STATE_AUTHORITY_DONE) + http_parse_authority(parser, 1); + + ptr = istptr(parser->uri); + + if (ptr == end) + goto not_found; + } + + parser->state = URI_PARSER_STATE_PATH_DONE; + return ist2(ptr, end - ptr); + + not_found: + parser->state = URI_PARSER_STATE_PATH_DONE; + return IST_NULL; +} + +/* Parse <value> Content-Length header field of an HTTP request. The function + * checks all possible occurrences of a comma-delimited value, and verifies if + * any of them doesn't match a previous value. <value> is sanitized on return + * to contain a single value if several identical values were found. + * + * <body_len> must be a valid pointer and is used to return the parsed length + * unless values differ. Also if <not_first> is true, <body_len> is assumed to + * point to previously parsed value and which must be equal to the new length. + * This is useful if an HTTP message contains several Content-Length headers. + * + * Returns <0 if a value differs, 0 if the whole header can be dropped (i.e. + * already known), or >0 if the value can be indexed (first one). In the last + * case, the value might be adjusted and the caller must only add the updated + * value. 
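+ *
+ * For example, "0042, 42" is accepted: both occurrences parse to 42 and
+ * <value> is rewritten to the cleaned token "42", while "42, 43" makes
+ * the function return -1.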
+ */ +int http_parse_cont_len_header(struct ist *value, unsigned long long *body_len, + int not_first) +{ + char *e, *n; + unsigned long long cl; + struct ist word; + int check_prev = not_first; + + word.ptr = value->ptr; + e = value->ptr + value->len; + + while (1) { + if (word.ptr >= e) { + /* empty header or empty value */ + goto fail; + } + + /* skip leading delimiter and blanks */ + if (unlikely(HTTP_IS_LWS(*word.ptr))) { + word.ptr++; + continue; + } + + /* digits only now */ + for (cl = 0, n = word.ptr; n < e; n++) { + unsigned int c = *n - '0'; + if (unlikely(c > 9)) { + /* non-digit */ + if (unlikely(n == word.ptr)) // spaces only + goto fail; + break; + } + + if (unlikely(!cl && n > word.ptr)) { + /* There was a leading zero before this digit, + * let's trim it. + */ + word.ptr = n; + } + + if (unlikely(cl > ULLONG_MAX / 10ULL)) + goto fail; /* multiply overflow */ + cl = cl * 10ULL; + if (unlikely(cl + c < cl)) + goto fail; /* addition overflow */ + cl = cl + c; + } + + /* keep a copy of the exact cleaned value */ + word.len = n - word.ptr; + + /* skip trailing LWS till next comma or EOL */ + for (; n < e; n++) { + if (!HTTP_IS_LWS(*n)) { + if (unlikely(*n != ',')) + goto fail; + break; + } + } + + /* if duplicate, must be equal */ + if (check_prev && cl != *body_len) + goto fail; + + /* OK, store this result as the one to be indexed */ + *body_len = cl; + *value = word; + + /* Now either n==e and we're done, or n points to the comma, + * and we skip it and continue. + */ + if (n++ == e) + break; + + word.ptr = n; + check_prev = 1; + } + + /* here we've reached the end with a single value or a series of + * identical values, all matching previous series if any. The last + * parsed value was sent back into <value>. We just have to decide + * if this occurrence has to be indexed (it's the first one) or + * silently skipped (it's not the first one) + */ + return !not_first; + fail: + return -1; +} + +/* + * Checks if <hdr> is exactly <name> for <len> chars, and ends with a colon. + * If so, returns the position of the first non-space character relative to + * <hdr>, or <end>-<hdr> if not found before. If no value is found, it tries + * to return a pointer to the place after the first space. Returns 0 if the + * header name does not match. Checks are case-insensitive. + */ +int http_header_match2(const char *hdr, const char *end, + const char *name, int len) +{ + const char *val; + + if (hdr + len >= end) + return 0; + if (hdr[len] != ':') + return 0; + if (strncasecmp(hdr, name, len) != 0) + return 0; + val = hdr + len + 1; + while (val < end && HTTP_IS_SPHT(*val)) + val++; + if ((val >= end) && (len + 2 <= end - hdr)) + return len + 2; /* we may replace starting from second space */ + return val - hdr; +} + +/* Find the end of the header value contained between <s> and <e>. See RFC7230, + * par 3.2 for more information. Note that it requires a valid header to return + * a valid result. This works for headers defined as comma-separated lists. 
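+ * For example, in "gzip;q=1.0, identity" it returns a pointer to the
+ * comma, while a comma inside a quoted string, as in an etag list like
+ * W/"1,2", W/"3", is skipped.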
+ */ +char *http_find_hdr_value_end(char *s, const char *e) +{ + int quoted, qdpair; + + quoted = qdpair = 0; + +#ifdef HA_UNALIGNED_LE + /* speedup: skip everything not a comma nor a double quote */ + for (; s <= e - sizeof(int); s += sizeof(int)) { + unsigned int c = *(int *)s; // comma + unsigned int q = c; // quote + + c ^= 0x2c2c2c2c; // contains one zero on a comma + q ^= 0x22222222; // contains one zero on a quote + + c = (c - 0x01010101) & ~c; // contains 0x80 below a comma + q = (q - 0x01010101) & ~q; // contains 0x80 below a quote + + if ((c | q) & 0x80808080) + break; // found a comma or a quote + } +#endif + for (; s < e; s++) { + if (qdpair) qdpair = 0; + else if (quoted) { + if (*s == '\\') qdpair = 1; + else if (*s == '"') quoted = 0; + } + else if (*s == '"') quoted = 1; + else if (*s == ',') return s; + } + return s; +} + +/* Find the end of a cookie value contained between <s> and <e>. It works the + * same way as with headers above except that the semi-colon also ends a token. + * See RFC2965 for more information. Note that it requires a valid header to + * return a valid result. + */ +char *http_find_cookie_value_end(char *s, const char *e) +{ + int quoted, qdpair; + + quoted = qdpair = 0; + for (; s < e; s++) { + if (qdpair) qdpair = 0; + else if (quoted) { + if (*s == '\\') qdpair = 1; + else if (*s == '"') quoted = 0; + } + else if (*s == '"') quoted = 1; + else if (*s == ',' || *s == ';') return s; + } + return s; +} + +/* Try to find the next occurrence of a cookie name in a cookie header value. + * To match on any cookie name, <cookie_name_l> must be set to 0. + * The lookup begins at <hdr>. The pointer and size of the next occurrence of + * the cookie value is returned into *value and *value_l, and the function + * returns a pointer to the next pointer to search from if the value was found. + * Otherwise if the cookie was not found, NULL is returned and neither value + * nor value_l are touched. The input <hdr> string should first point to the + * header's value, and the <hdr_end> pointer must point to the first character + * not part of the value. <list> must be non-zero if value may represent a list + * of values (cookie headers). This makes it faster to abort parsing when no + * list is expected. + */ +char *http_extract_cookie_value(char *hdr, const char *hdr_end, + char *cookie_name, size_t cookie_name_l, + int list, char **value, size_t *value_l) +{ + char *equal, *att_end, *att_beg, *val_beg, *val_end; + char *next; + + /* we search at least a cookie name followed by an equal, and more + * generally something like this : + * Cookie: NAME1 = VALUE 1 ; NAME2 = VALUE2 ; NAME3 = VALUE3\r\n + */ + for (att_beg = hdr; att_beg + cookie_name_l + 1 < hdr_end; att_beg = next + 1) { + /* Iterate through all cookies on this line */ + + while (att_beg < hdr_end && HTTP_IS_SPHT(*att_beg)) + att_beg++; + + /* find att_end : this is the first character after the last non + * space before the equal. It may be equal to hdr_end. + */ + equal = att_end = att_beg; + + while (equal < hdr_end) { + if (*equal == '=' || *equal == ';' || (list && *equal == ',')) + break; + if (HTTP_IS_SPHT(*equal++)) + continue; + att_end = equal; + } + + /* here, <equal> points to '=', a delimiter or the end. <att_end> + * is between <att_beg> and <equal>, both may be identical. 
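+		 * For example, on "NAME1 = VALUE 1 ;", <att_beg> points to
+		 * 'N', <att_end> points just past "NAME1" and <equal> to
+		 * the '='.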
+ */ + + /* look for end of cookie if there is an equal sign */ + if (equal < hdr_end && *equal == '=') { + /* look for the beginning of the value */ + val_beg = equal + 1; + while (val_beg < hdr_end && HTTP_IS_SPHT(*val_beg)) + val_beg++; + + /* find the end of the value, respecting quotes */ + next = http_find_cookie_value_end(val_beg, hdr_end); + + /* make val_end point to the first white space or delimiter after the value */ + val_end = next; + while (val_end > val_beg && HTTP_IS_SPHT(*(val_end - 1))) + val_end--; + } else { + val_beg = val_end = next = equal; + } + + /* We have nothing to do with attributes beginning with '$'. However, + * they will automatically be removed if a header before them is removed, + * since they're supposed to be linked together. + */ + if (*att_beg == '$') + continue; + + /* Ignore cookies with no equal sign */ + if (equal == next) + continue; + + /* Now we have the cookie name between att_beg and att_end, and + * its value between val_beg and val_end. + */ + + if (cookie_name_l == 0 || (att_end - att_beg == cookie_name_l && + memcmp(att_beg, cookie_name, cookie_name_l) == 0)) { + /* let's return this value and indicate where to go on from */ + *value = val_beg; + *value_l = val_end - val_beg; + return next + 1; + } + + /* Set-Cookie headers only have the name in the first attr=value part */ + if (!list) + break; + } + + return NULL; +} + +/* Try to find the next cookie name in a cookie header given a pointer + * <hdr_beg> to the starting position, a pointer <hdr_end> to the ending + * position to search in the cookie and a boolean <is_req> of type int that + * indicates if the stream direction is for request or response. + * The lookup begins at <hdr_beg>, which is assumed to be in + * Cookie / Set-Cookie header, and the function returns a pointer to the next + * position to search from if a valid cookie k-v pair is found for Cookie + * request header (<is_req> is non-zero) and <hdr_end> for Set-Cookie response + * header (<is_req> is zero). When the next cookie name is found, <ptr> will + * be pointing to the start of the cookie name, and <len> will be the length + * of the cookie name. + * Otherwise if there is no valid cookie k-v pair, NULL is returned. + * The <hdr_end> pointer must point to the first character + * not part of the Cookie / Set-Cookie header. + */ +char *http_extract_next_cookie_name(char *hdr_beg, char *hdr_end, int is_req, + char **ptr, size_t *len) +{ + char *equal, *att_end, *att_beg, *val_beg; + char *next; + + /* We search a valid cookie name between hdr_beg and hdr_end, + * followed by an equal. For example for the following cookie: + * Cookie: NAME1 = VALUE 1 ; NAME2 = VALUE2 ; NAME3 = VALUE3\r\n + * We want to find NAME1, NAME2, or NAME3 depending on where we start our search + * according to <hdr_beg> + */ + for (att_beg = hdr_beg; att_beg + 1 < hdr_end; att_beg = next + 1) { + while (att_beg < hdr_end && HTTP_IS_SPHT(*att_beg)) + att_beg++; + + /* find <att_end> : this is the first character after the last non + * space before the equal. It may be equal to <hdr_end>. + */ + equal = att_end = att_beg; + + while (equal < hdr_end) { + if (*equal == '=' || *equal == ';') + break; + if (HTTP_IS_SPHT(*equal++)) + continue; + att_end = equal; + } + + /* Here, <equal> points to '=', a delimiter or the end. <att_end> + * is between <att_beg> and <equal>, both may be identical. 
+	 */
+
+	/* Look for end of cookie if there is an equal sign */
+	if (equal < hdr_end && *equal == '=') {
+		/* Look for the beginning of the value */
+		val_beg = equal + 1;
+		while (val_beg < hdr_end && HTTP_IS_SPHT(*val_beg))
+			val_beg++;
+
+		/* Find the end of the value, respecting quotes */
+		next = http_find_cookie_value_end(val_beg, hdr_end);
+	} else {
+		next = equal;
+	}
+
+	/* We have nothing to do with attributes beginning with '$'. However,
+	 * they will automatically be removed if a header before them is removed,
+	 * since they're supposed to be linked together.
+	 */
+	if (*att_beg == '$')
+		continue;
+
+	/* Ignore cookies with no equal sign */
+	if (equal == next)
+		continue;
+
+	/* Now we have the cookie name between <att_beg> and <att_end>, and
+	 * <next> points to the end of the cookie value
+	 */
+	*ptr = att_beg;
+	*len = att_end - att_beg;
+
+	/* Return the next position for a Cookie request header and <hdr_end>
+	 * for a Set-Cookie response header, as each Set-Cookie header is
+	 * assumed to contain only 1 cookie
+	 */
+	if (is_req)
+		return next + 1;
+	return hdr_end;
+	}
+
+	return NULL;
+}
+
+/* Parses a qvalue and returns it multiplied by 1000, from 0 to 1000. If the
+ * value is larger than 1000, it is bound to 1000. The parser consumes up to
+ * 1 digit, one dot and 3 digits and stops on the first invalid character.
+ * Unparsable qvalues return 1000 as "q=1.000".
+ */
+int http_parse_qvalue(const char *qvalue, const char **end)
+{
+	int q = 1000;
+
+	if (!isdigit((unsigned char)*qvalue))
+		goto out;
+	q = (*qvalue++ - '0') * 1000;
+
+	if (*qvalue++ != '.')
+		goto out;
+
+	if (!isdigit((unsigned char)*qvalue))
+		goto out;
+	q += (*qvalue++ - '0') * 100;
+
+	if (!isdigit((unsigned char)*qvalue))
+		goto out;
+	q += (*qvalue++ - '0') * 10;
+
+	if (!isdigit((unsigned char)*qvalue))
+		goto out;
+	q += (*qvalue++ - '0') * 1;
+ out:
+	if (q > 1000)
+		q = 1000;
+	if (end)
+		*end = qvalue;
+	return q;
+}
+
+/*
+ * Given a url parameter, find the starting position of the first occurrence,
+ * or NULL if the parameter is not found.
+ *
+ * Example: if query_string is "yo=mama;ye=daddy" and url_param_name is "ye",
+ * the function will return query_string+8.
+ *
+ * Warning: this function returns a pointer that can point to the first chunk
+ * or the second chunk. The caller must check the position before using the
+ * result.
+ */
+const char *http_find_url_param_pos(const char **chunks,
+                                    const char* url_param_name, size_t url_param_name_l,
+                                    char delim, char insensitive)
+{
+	const char *pos, *last, *equal;
+	const char **bufs = chunks;
+	int l1, l2;
+
+
+	pos = bufs[0];
+	last = bufs[1];
+	while (pos < last) {
+		/* Check the equal. */
+		equal = pos + url_param_name_l;
+		if (fix_pointer_if_wrap(chunks, &equal)) {
+			if (equal >= chunks[3])
+				return NULL;
+		} else {
+			if (equal >= chunks[1])
+				return NULL;
+		}
+		if (*equal == '=') {
+			if (pos + url_param_name_l > last) {
+				/* A wrap was detected: in this case the
+				 * comparison is performed in two parts.
+				 */
+
+				/* This is the end, we don't have any other chunk. */
+				if (bufs != chunks || !bufs[2])
+					return NULL;
+
+				/* Compute the length of each part of the comparison. */
+				l1 = last - pos;
+				l2 = url_param_name_l - l1;
+
+				/* The second buffer is too short to contain the compared string.
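+				 * For example, when looking for "sess" with
+				 * only "se" left in the first chunk, l1=2 and
+				 * l2=2, and the two halves are compared
+				 * separately against both chunks.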
+				 */
+				if (bufs[2] + l2 > bufs[3])
+					return NULL;
+
+				if (insensitive) {
+					if (strncasecmp(pos, url_param_name, l1) == 0 &&
+					    strncasecmp(bufs[2], url_param_name+l1, l2) == 0)
+						return pos;
+				}
+				else {
+					if (memcmp(pos, url_param_name, l1) == 0 &&
+					    memcmp(bufs[2], url_param_name+l1, l2) == 0)
+						return pos;
+				}
+
+				/* Perform the wrap and skip the string that failed the comparison. */
+				bufs += 2;
+				pos = bufs[0] + l2;
+				last = bufs[1];
+
+			} else {
+				/* process a simple comparison. */
+				if (insensitive) {
+					if (strncasecmp(pos, url_param_name, url_param_name_l) == 0)
+						return pos;
+				} else {
+					if (memcmp(pos, url_param_name, url_param_name_l) == 0)
+						return pos;
+				}
+				pos += url_param_name_l + 1;
+				if (fix_pointer_if_wrap(chunks, &pos))
+					last = bufs[2];
+			}
+		}
+
+		while (1) {
+			/* Look for the next delimiter. */
+			while (pos < last && !http_is_param_delimiter(*pos, delim))
+				pos++;
+			if (pos < last)
+				break;
+			/* process buffer wrapping. */
+			if (bufs != chunks || !bufs[2])
+				return NULL;
+			bufs += 2;
+			pos = bufs[0];
+			last = bufs[1];
+		}
+		pos++;
+	}
+	return NULL;
+}
+
+/*
+ * Given a url parameter name and a query string, find the next value.
+ * An empty url_param_name matches the first available parameter.
+ * If the parameter is found, 1 is returned and *vstart / *vend are updated to
+ * respectively provide a pointer to the value and its end.
+ * Otherwise, 0 is returned and vstart/vend are not modified.
+ */
+int http_find_next_url_param(const char **chunks,
+                             const char* url_param_name, size_t url_param_name_l,
+                             const char **vstart, const char **vend, char delim, char insensitive)
+{
+	const char *arg_start, *qs_end;
+	const char *value_start, *value_end;
+
+	arg_start = chunks[0];
+	qs_end = chunks[1];
+	if (url_param_name_l) {
+		/* Looks for an argument name. */
+		arg_start = http_find_url_param_pos(chunks,
+		                                    url_param_name, url_param_name_l,
+		                                    delim, insensitive);
+		/* Check for wrapping. */
+		if (arg_start >= qs_end)
+			qs_end = chunks[3];
+	}
+	if (!arg_start)
+		return 0;
+
+	if (!url_param_name_l) {
+		while (1) {
+			/* looks for the first argument. */
+			value_start = memchr(arg_start, '=', qs_end - arg_start);
+			if (!value_start) {
+				/* Check for wrapping. */
+				if (arg_start >= chunks[0] &&
+				    arg_start < chunks[1] &&
+				    chunks[2]) {
+					arg_start = chunks[2];
+					qs_end = chunks[3];
+					continue;
+				}
+				return 0;
+			}
+			break;
+		}
+		value_start++;
+	}
+	else {
+		/* Skip the argument name and the '='. */
+		value_start = arg_start + url_param_name_l + 1;
+
+		/* Check for pointer wrapping. */
+		if (fix_pointer_if_wrap(chunks, &value_start)) {
+			/* Update the end pointer. */
+			qs_end = chunks[3];
+
+			/* Check for overflow. */
+			if (value_start >= qs_end)
+				return 0;
+		}
+	}
+
+	value_end = value_start;
+
+	while (1) {
+		while ((value_end < qs_end) && !http_is_param_delimiter(*value_end, delim))
+			value_end++;
+		if (value_end < qs_end)
+			break;
+		/* process buffer wrapping. */
+		if (value_end >= chunks[0] &&
+		    value_end < chunks[1] &&
+		    chunks[2]) {
+			value_end = chunks[2];
+			qs_end = chunks[3];
+			continue;
+		}
+		break;
+	}
+
+	*vstart = value_start;
+	*vend = value_end;
+	return 1;
+}
+
+/* Parses a single header line (without the CRLF) and splits it into its name
+ * and its value. The parsing is pretty naive and just skips spaces.
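+ * For example, "Host: www.example.org" yields the name "Host" and the
+ * value "www.example.org".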
+ */
+int http_parse_header(const struct ist hdr, struct ist *name, struct ist *value)
+{
+	char *p = hdr.ptr;
+	char *end = p + hdr.len;
+
+	name->len = value->len = 0;
+
+	/* Skip leading spaces */
+	for (; p < end && HTTP_IS_SPHT(*p); p++);
+
+	/* Set the header name */
+	name->ptr = p;
+	for (; p < end && HTTP_IS_TOKEN(*p); p++);
+	name->len = p - name->ptr;
+
+	/* Skip the ':' and spaces before and after it */
+	for (; p < end && HTTP_IS_SPHT(*p); p++);
+	if (p < end && *p == ':') p++;
+	for (; p < end && HTTP_IS_SPHT(*p); p++);
+
+	/* Set the header value */
+	value->ptr = p;
+	value->len = end - p;
+
+	return 1;
+}
+
+/* Parses a single start line (without the CRLF) and splits it into 3 parts.
+ * The parsing is pretty naive and just skips spaces.
+ */
+int http_parse_stline(const struct ist line, struct ist *p1, struct ist *p2, struct ist *p3)
+{
+	char *p = line.ptr;
+	char *end = p + line.len;
+
+	p1->len = p2->len = p3->len = 0;
+
+	/* Skip leading spaces */
+	for (; p < end && HTTP_IS_SPHT(*p); p++);
+
+	/* Set the first part */
+	p1->ptr = p;
+	for (; p < end && HTTP_IS_TOKEN(*p); p++);
+	p1->len = p - p1->ptr;
+
+	/* Skip spaces between p1 and p2 */
+	for (; p < end && HTTP_IS_SPHT(*p); p++);
+
+	/* Set the second part */
+	p2->ptr = p;
+	for (; p < end && !HTTP_IS_SPHT(*p); p++);
+	p2->len = p - p2->ptr;
+
+	/* Skip spaces between p2 and p3 */
+	for (; p < end && HTTP_IS_SPHT(*p); p++);
+
+	/* The remaining is the third value */
+	p3->ptr = p;
+	p3->len = end - p;
+
+	return 1;
+}
+
+/* Parses the value of a Status header with the following format: "Status:
+ * Code[ Reason]". The parsing is pretty naive and just skips spaces. It
+ * returns the numeric value of the status code.
+ */
+int http_parse_status_val(const struct ist value, struct ist *status, struct ist *reason)
+{
+	char *p = value.ptr;
+	char *end = p + value.len;
+	uint16_t code;
+
+	status->len = reason->len = 0;
+
+	/* Skip leading spaces */
+	for (; p < end && HTTP_IS_SPHT(*p); p++);
+
+	/* Set the status part */
+	status->ptr = p;
+	for (; p < end && HTTP_IS_TOKEN(*p); p++);
+	status->len = p - status->ptr;
+
+	/* Skip spaces between status and reason */
+	for (; p < end && HTTP_IS_SPHT(*p); p++);
+
+	/* the remaining is the reason */
+	reason->ptr = p;
+	reason->len = end - p;
+
+	code = strl2ui(status->ptr, status->len);
+	return code;
+}
+
+
+/* Returns non-zero if the two ETags are comparable (see RFC 7232#2.3.2).
+ * If any of them is a weak ETag, we discard the weakness prefix and perform
+ * a strict string comparison.
+ * Returns 0 otherwise.
+ */
+int http_compare_etags(struct ist etag1, struct ist etag2)
+{
+	enum http_etag_type etag_type1;
+	enum http_etag_type etag_type2;
+
+	etag_type1 = http_get_etag_type(etag1);
+	etag_type2 = http_get_etag_type(etag2);
+
+	if (etag_type1 == ETAG_INVALID || etag_type2 == ETAG_INVALID)
+		return 0;
+
+	/* Discard the 'W/' prefix if an ETag is a weak one. */
+	if (etag_type1 == ETAG_WEAK)
+		etag1 = istadv(etag1, 2);
+	if (etag_type2 == ETAG_WEAK)
+		etag2 = istadv(etag2, 2);
+
+	return isteq(etag1, etag2);
+}
+
+
+/*
+ * Trim leading space or horizontal tab characters from <value> string.
+ * Returns the trimmed string.
+ */
+struct ist http_trim_leading_spht(struct ist value)
+{
+	struct ist ret = value;
+
+	while (ret.len && HTTP_IS_SPHT(ret.ptr[0])) {
+		++ret.ptr;
+		--ret.len;
+	}
+
+	return ret;
+}
+
+/*
+ * Trim trailing space or horizontal tab characters from <value> string.
+ * Returns the trimmed string.
+ */
+struct ist http_trim_trailing_spht(struct ist value)
+{
+	struct ist ret = value;
+
+	while (ret.len && HTTP_IS_SPHT(ret.ptr[-1]))
+		--ret.len;
+
+	return ret;
+}
diff --git a/src/http_acl.c b/src/http_acl.c
new file mode 100644
index 0000000..bf29fc3
--- /dev/null
+++ b/src/http_acl.c
@@ -0,0 +1,185 @@
+/*
+ * HTTP ACLs declaration
+ *
+ * Copyright 2000-2018 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <sys/types.h>
+
+#include <ctype.h>
+#include <string.h>
+#include <time.h>
+
+#include <haproxy/acl.h>
+#include <haproxy/api.h>
+#include <haproxy/arg.h>
+#include <haproxy/auth.h>
+#include <haproxy/chunk.h>
+#include <haproxy/http.h>
+#include <haproxy/pattern.h>
+#include <haproxy/pool.h>
+#include <haproxy/tools.h>
+#include <haproxy/version.h>
+
+
+/* We use the pre-parsed method if it is known, and store its number as an
+ * integer. If it is unknown, we use the pointer and the length.
+ */
+static int pat_parse_meth(const char *text, struct pattern *pattern, int mflags, char **err)
+{
+	int len, meth;
+
+	len = strlen(text);
+	meth = find_http_meth(text, len);
+
+	pattern->val.i = meth;
+	if (meth == HTTP_METH_OTHER) {
+		pattern->ptr.str = (char *)text;
+		pattern->len = len;
+	}
+	else {
+		pattern->ptr.str = NULL;
+		pattern->len = 0;
+	}
+	return 1;
+}
+
+/* See above for how the method is stored in the global pattern */
+static struct pattern *pat_match_meth(struct sample *smp, struct pattern_expr *expr, int fill)
+{
+	int icase;
+	struct pattern_list *lst;
+	struct pattern *pattern;
+
+	list_for_each_entry(lst, &expr->patterns, list) {
+		pattern = &lst->pat;
+
+		/* well-known method */
+		if (pattern->val.i != HTTP_METH_OTHER) {
+			if (smp->data.u.meth.meth == pattern->val.i)
+				return pattern;
+			else
+				continue;
+		}
+
+		/* Other method, we must compare the strings */
+		if (pattern->len != smp->data.u.meth.str.data)
+			continue;
+
+		icase = expr->mflags & PAT_MF_IGNORE_CASE;
+		if ((icase && strncasecmp(pattern->ptr.str, smp->data.u.meth.str.area, smp->data.u.meth.str.data) == 0) ||
+		    (!icase && strncmp(pattern->ptr.str, smp->data.u.meth.str.area, smp->data.u.meth.str.data) == 0))
+			return pattern;
+	}
+	return NULL;
+}
+
+/************************************************************************/
+/*       All supported ACL keywords must be declared here.              */
+/************************************************************************/
+
+/* Note: must not be declared <const> as its list will be overwritten.
+ * Please take care of keeping this list alphabetically sorted.
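+ *
+ * For example, an illustrative configuration line such as
+ *     acl is_static path_beg /static/ /images/
+ * resolves to the "path_beg" entry below, i.e. the "path" sample fetch
+ * combined with a PAT_MATCH_BEG prefix match.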
+ */ +static struct acl_kw_list acl_kws = {ILH, { + { "base", "base", PAT_MATCH_STR }, + { "base_beg", "base", PAT_MATCH_BEG }, + { "base_dir", "base", PAT_MATCH_DIR }, + { "base_dom", "base", PAT_MATCH_DOM }, + { "base_end", "base", PAT_MATCH_END }, + { "base_len", "base", PAT_MATCH_LEN }, + { "base_reg", "base", PAT_MATCH_REG }, + { "base_sub", "base", PAT_MATCH_SUB }, + + { "cook", "req.cook", PAT_MATCH_STR }, + { "cook_beg", "req.cook", PAT_MATCH_BEG }, + { "cook_dir", "req.cook", PAT_MATCH_DIR }, + { "cook_dom", "req.cook", PAT_MATCH_DOM }, + { "cook_end", "req.cook", PAT_MATCH_END }, + { "cook_len", "req.cook", PAT_MATCH_LEN }, + { "cook_reg", "req.cook", PAT_MATCH_REG }, + { "cook_sub", "req.cook", PAT_MATCH_SUB }, + + { "hdr", "req.hdr", PAT_MATCH_STR }, + { "hdr_beg", "req.hdr", PAT_MATCH_BEG }, + { "hdr_dir", "req.hdr", PAT_MATCH_DIR }, + { "hdr_dom", "req.hdr", PAT_MATCH_DOM }, + { "hdr_end", "req.hdr", PAT_MATCH_END }, + { "hdr_len", "req.hdr", PAT_MATCH_LEN }, + { "hdr_reg", "req.hdr", PAT_MATCH_REG }, + { "hdr_sub", "req.hdr", PAT_MATCH_SUB }, + + /* these two declarations uses strings with list storage (in place + * of tree storage). The basic match is PAT_MATCH_STR, but the indexation + * and delete functions are relative to the list management. The parse + * and match method are related to the corresponding fetch methods. This + * is very particular ACL declaration mode. + */ + { "http_auth_group", NULL, PAT_MATCH_STR, NULL, pat_idx_list_str, NULL, NULL, pat_match_auth }, + { "method", NULL, PAT_MATCH_STR, pat_parse_meth, pat_idx_list_str, NULL, NULL, pat_match_meth }, + + { "path", "path", PAT_MATCH_STR }, + { "path_beg", "path", PAT_MATCH_BEG }, + { "path_dir", "path", PAT_MATCH_DIR }, + { "path_dom", "path", PAT_MATCH_DOM }, + { "path_end", "path", PAT_MATCH_END }, + { "path_len", "path", PAT_MATCH_LEN }, + { "path_reg", "path", PAT_MATCH_REG }, + { "path_sub", "path", PAT_MATCH_SUB }, + + { "req_ver", "req.ver", PAT_MATCH_STR }, + { "resp_ver", "res.ver", PAT_MATCH_STR }, + + { "scook", "res.cook", PAT_MATCH_STR }, + { "scook_beg", "res.cook", PAT_MATCH_BEG }, + { "scook_dir", "res.cook", PAT_MATCH_DIR }, + { "scook_dom", "res.cook", PAT_MATCH_DOM }, + { "scook_end", "res.cook", PAT_MATCH_END }, + { "scook_len", "res.cook", PAT_MATCH_LEN }, + { "scook_reg", "res.cook", PAT_MATCH_REG }, + { "scook_sub", "res.cook", PAT_MATCH_SUB }, + + { "shdr", "res.hdr", PAT_MATCH_STR }, + { "shdr_beg", "res.hdr", PAT_MATCH_BEG }, + { "shdr_dir", "res.hdr", PAT_MATCH_DIR }, + { "shdr_dom", "res.hdr", PAT_MATCH_DOM }, + { "shdr_end", "res.hdr", PAT_MATCH_END }, + { "shdr_len", "res.hdr", PAT_MATCH_LEN }, + { "shdr_reg", "res.hdr", PAT_MATCH_REG }, + { "shdr_sub", "res.hdr", PAT_MATCH_SUB }, + + { "url", "url", PAT_MATCH_STR }, + { "url_beg", "url", PAT_MATCH_BEG }, + { "url_dir", "url", PAT_MATCH_DIR }, + { "url_dom", "url", PAT_MATCH_DOM }, + { "url_end", "url", PAT_MATCH_END }, + { "url_len", "url", PAT_MATCH_LEN }, + { "url_reg", "url", PAT_MATCH_REG }, + { "url_sub", "url", PAT_MATCH_SUB }, + + { "urlp", "urlp", PAT_MATCH_STR }, + { "urlp_beg", "urlp", PAT_MATCH_BEG }, + { "urlp_dir", "urlp", PAT_MATCH_DIR }, + { "urlp_dom", "urlp", PAT_MATCH_DOM }, + { "urlp_end", "urlp", PAT_MATCH_END }, + { "urlp_len", "urlp", PAT_MATCH_LEN }, + { "urlp_reg", "urlp", PAT_MATCH_REG }, + { "urlp_sub", "urlp", PAT_MATCH_SUB }, + + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, acl_register_keywords, &acl_kws); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff 
--git a/src/http_act.c b/src/http_act.c new file mode 100644 index 0000000..7d45780 --- /dev/null +++ b/src/http_act.c @@ -0,0 +1,2501 @@ +/* + * HTTP actions + * + * Copyright 2000-2018 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <sys/types.h> + +#include <ctype.h> +#include <string.h> +#include <time.h> + +#include <haproxy/acl.h> +#include <haproxy/action.h> +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/capture-t.h> +#include <haproxy/cfgparse.h> +#include <haproxy/chunk.h> +#include <haproxy/global.h> +#include <haproxy/http.h> +#include <haproxy/http_ana.h> +#include <haproxy/http_htx.h> +#include <haproxy/http_rules.h> +#include <haproxy/log.h> +#include <haproxy/pattern.h> +#include <haproxy/pool.h> +#include <haproxy/regex.h> +#include <haproxy/sample.h> +#include <haproxy/sc_strm.h> +#include <haproxy/stconn.h> +#include <haproxy/tools.h> +#include <haproxy/uri_auth-t.h> +#include <haproxy/uri_normalizer.h> +#include <haproxy/version.h> + + +/* Release memory allocated by most HTTP actions. Concretely, it releases + * <arg.http>. + */ +static void release_http_action(struct act_rule *rule) +{ + struct logformat_node *lf, *lfb; + + istfree(&rule->arg.http.str); + if (rule->arg.http.re) + regex_free(rule->arg.http.re); + list_for_each_entry_safe(lf, lfb, &rule->arg.http.fmt, list) { + LIST_DELETE(&lf->list); + release_sample_expr(lf->expr); + free(lf->arg); + free(lf); + } +} + +/* Release memory allocated by HTTP actions relying on an http reply. Concretely, + * it releases <.arg.http_reply> + */ +static void release_act_http_reply(struct act_rule *rule) +{ + release_http_reply(rule->arg.http_reply); + rule->arg.http_reply = NULL; +} + + +/* Check function for HTTP actions relying on an http reply. The function + * returns 1 on success; otherwise it returns 0 and <err> is filled. + */ +static int check_act_http_reply(struct act_rule *rule, struct proxy *px, char **err) +{ + struct http_reply *reply = rule->arg.http_reply; + + if (!http_check_http_reply(reply, px, err)) { + release_act_http_reply(rule); + return 0; + } + return 1; +} + + +/* This function executes one of the set-{method,path,query,uri} actions. It + * builds a string in the trash from the specified format string. It finds + * the action to be performed in <.action>, previously filled by function + * parse_set_req_line(). The replacement action is executed by the function + * http_action_set_req_line(). On success, it returns ACT_RET_CONT. If an error + * occurs while soft rewrites are enabled, the action is canceled, but the rule + * processing continues. Otherwise ACT_RET_ERR is returned. + */ +static enum act_return http_action_set_req_line(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct buffer *replace; + enum act_return ret = ACT_RET_CONT; + + replace = alloc_trash_chunk(); + if (!replace) + goto fail_alloc; + + /* If we have to create a query string, prepare a '?'. 
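+ * For example (an illustrative configuration line, assumed here and not part + * of this patch), "http-request set-query foo=%[src]" reaches this point + * with <.action> set to 2, so the '?' is prepended before the log-format + * output is appended.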
*/ + if (rule->action == 2) // set-query + replace->area[replace->data++] = '?'; + replace->data += build_logline(s, replace->area + replace->data, + replace->size - replace->data, + &rule->arg.http.fmt); + + if (http_req_replace_stline(rule->action, replace->area, replace->data, px, s) == -1) + goto fail_rewrite; + + leave: + free_trash_chunk(replace); + return ret; + + fail_alloc: + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_RESOURCE; + ret = ACT_RET_ERR; + goto leave; + + fail_rewrite: + _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_rewrites); + if (s->flags & SF_BE_ASSIGNED) + _HA_ATOMIC_INC(&s->be->be_counters.failed_rewrites); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->failed_rewrites); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.failed_rewrites); + + if (!(s->txn->req.flags & HTTP_MSGF_SOFT_RW)) { + ret = ACT_RET_ERR; + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_PRXCOND; + } + goto leave; +} + +/* parse an http-request action among : + * set-method + * set-path + * set-pathq + * set-query + * set-uri + * + * All of them accept a single argument of type string representing a log-format. + * The resulting rule makes use of <http.fmt> to store the log-format list head, + * and <.action> to store the action type as an int (0=method, 1=path, 2=query, + * 3=uri, 4=pathq). It returns ACT_RET_PRS_OK on success, ACT_RET_PRS_ERR on error. + */ +static enum act_parse_ret parse_set_req_line(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + int cur_arg = *orig_arg; + int cap = 0; + + switch (args[0][4]) { + case 'm' : + rule->action = 0; // set-method + break; + case 'p' : + if (args[0][8] == 'q') + rule->action = 4; // set-pathq + else + rule->action = 1; // set-path + break; + case 'q' : + rule->action = 2; // set-query + break; + case 'u' : + rule->action = 3; // set-uri + break; + default: + memprintf(err, "internal error: unhandled action '%s'", args[0]); + return ACT_RET_PRS_ERR; + } + rule->action_ptr = http_action_set_req_line; + rule->release_ptr = release_http_action; + LIST_INIT(&rule->arg.http.fmt); + + if (!*args[cur_arg] || + (*args[cur_arg + 1] && strcmp(args[cur_arg + 1], "if") != 0 && strcmp(args[cur_arg + 1], "unless") != 0)) { + memprintf(err, "expects exactly 1 argument <format>"); + return ACT_RET_PRS_ERR; + } + + px->conf.args.ctx = ARGC_HRQ; + if (px->cap & PR_CAP_FE) + cap |= SMP_VAL_FE_HRQ_HDR; + if (px->cap & PR_CAP_BE) + cap |= SMP_VAL_BE_HRQ_HDR; + if (!parse_logformat_string(args[cur_arg], px, &rule->arg.http.fmt, LOG_OPT_HTTP, cap, err)) { + return ACT_RET_PRS_ERR; + } + + (*orig_arg)++; + return ACT_RET_PRS_OK; +} + +/* This function executes the http-request normalize-uri action. + * `rule->action` is expected to be a value from `enum act_normalize_uri`. + * + * On success, it returns ACT_RET_CONT. If an error + * occurs while soft rewrites are enabled, the action is canceled, but the rule + * processing continues. Otherwise ACT_RET_ERR is returned. 
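+ * + * Illustrative usage (assumed configuration lines, not part of this patch), + * each mapping to one value of `enum act_normalize_uri`: + * http-request normalize-uri path-merge-slashes + * http-request normalize-uri percent-to-uppercase strict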
+ */ +static enum act_return http_action_normalize_uri(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + enum act_return ret = ACT_RET_CONT; + struct htx *htx = htxbuf(&s->req.buf); + const struct ist uri = htx_sl_req_uri(http_get_stline(htx)); + struct buffer *replace = alloc_trash_chunk(); + enum uri_normalizer_err err = URI_NORMALIZER_ERR_INTERNAL_ERROR; + + if (!replace) + goto fail_alloc; + + switch ((enum act_normalize_uri) rule->action) { + case ACT_NORMALIZE_URI_PATH_MERGE_SLASHES: { + struct http_uri_parser parser = http_uri_parser_init(uri); + const struct ist path = http_parse_path(&parser); + struct ist newpath = ist2(replace->area, replace->size); + + if (!isttest(path)) + goto leave; + + err = uri_normalizer_path_merge_slashes(iststop(path, '?'), &newpath); + + if (err != URI_NORMALIZER_ERR_NONE) + break; + + if (!http_replace_req_path(htx, newpath, 0)) + goto fail_rewrite; + + break; + } + case ACT_NORMALIZE_URI_PATH_STRIP_DOT: { + struct http_uri_parser parser = http_uri_parser_init(uri); + const struct ist path = http_parse_path(&parser); + struct ist newpath = ist2(replace->area, replace->size); + + if (!isttest(path)) + goto leave; + + err = uri_normalizer_path_dot(iststop(path, '?'), &newpath); + + if (err != URI_NORMALIZER_ERR_NONE) + break; + + if (!http_replace_req_path(htx, newpath, 0)) + goto fail_rewrite; + + break; + } + case ACT_NORMALIZE_URI_PATH_STRIP_DOTDOT: + case ACT_NORMALIZE_URI_PATH_STRIP_DOTDOT_FULL: { + struct http_uri_parser parser = http_uri_parser_init(uri); + const struct ist path = http_parse_path(&parser); + struct ist newpath = ist2(replace->area, replace->size); + + if (!isttest(path)) + goto leave; + + err = uri_normalizer_path_dotdot(iststop(path, '?'), rule->action == ACT_NORMALIZE_URI_PATH_STRIP_DOTDOT_FULL, &newpath); + + if (err != URI_NORMALIZER_ERR_NONE) + break; + + if (!http_replace_req_path(htx, newpath, 0)) + goto fail_rewrite; + + break; + } + case ACT_NORMALIZE_URI_QUERY_SORT_BY_NAME: { + struct http_uri_parser parser = http_uri_parser_init(uri); + const struct ist path = http_parse_path(&parser); + struct ist newquery = ist2(replace->area, replace->size); + + if (!isttest(path)) + goto leave; + + err = uri_normalizer_query_sort(istfind(path, '?'), '&', &newquery); + + if (err != URI_NORMALIZER_ERR_NONE) + break; + + if (!http_replace_req_query(htx, newquery)) + goto fail_rewrite; + + break; + } + case ACT_NORMALIZE_URI_PERCENT_TO_UPPERCASE: + case ACT_NORMALIZE_URI_PERCENT_TO_UPPERCASE_STRICT: { + struct http_uri_parser parser = http_uri_parser_init(uri); + const struct ist path = http_parse_path(&parser); + struct ist newpath = ist2(replace->area, replace->size); + + if (!isttest(path)) + goto leave; + + err = uri_normalizer_percent_upper(path, rule->action == ACT_NORMALIZE_URI_PERCENT_TO_UPPERCASE_STRICT, &newpath); + + if (err != URI_NORMALIZER_ERR_NONE) + break; + + if (!http_replace_req_path(htx, newpath, 1)) + goto fail_rewrite; + + break; + } + case ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED: + case ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED_STRICT: { + struct http_uri_parser parser = http_uri_parser_init(uri); + const struct ist path = http_parse_path(&parser); + struct ist newpath = ist2(replace->area, replace->size); + + if (!isttest(path)) + goto leave; + + err = uri_normalizer_percent_decode_unreserved(path, rule->action == ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED_STRICT, &newpath); + + if (err != URI_NORMALIZER_ERR_NONE) + break; + + if 
(!http_replace_req_path(htx, newpath, 1)) + goto fail_rewrite; + + break; + } + case ACT_NORMALIZE_URI_FRAGMENT_STRIP: { + struct http_uri_parser parser = http_uri_parser_init(uri); + const struct ist path = http_parse_path(&parser); + struct ist newpath = ist2(replace->area, replace->size); + + if (!isttest(path)) + goto leave; + + err = uri_normalizer_fragment_strip(path, &newpath); + + if (err != URI_NORMALIZER_ERR_NONE) + break; + + if (!http_replace_req_path(htx, newpath, 1)) + goto fail_rewrite; + + break; + } + case ACT_NORMALIZE_URI_FRAGMENT_ENCODE: { + struct http_uri_parser parser = http_uri_parser_init(uri); + const struct ist path = http_parse_path(&parser); + struct ist newpath = ist2(replace->area, replace->size); + + if (!isttest(path)) + goto leave; + + err = uri_normalizer_fragment_encode(path, &newpath); + + if (err != URI_NORMALIZER_ERR_NONE) + break; + + if (!http_replace_req_path(htx, newpath, 1)) + goto fail_rewrite; + + break; + } + } + + switch (err) { + case URI_NORMALIZER_ERR_NONE: + break; + case URI_NORMALIZER_ERR_INTERNAL_ERROR: + ret = ACT_RET_ERR; + break; + case URI_NORMALIZER_ERR_INVALID_INPUT: + ret = ACT_RET_INV; + break; + case URI_NORMALIZER_ERR_ALLOC: + goto fail_alloc; + } + + leave: + free_trash_chunk(replace); + return ret; + + fail_alloc: + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_RESOURCE; + ret = ACT_RET_ERR; + goto leave; + + fail_rewrite: + _HA_ATOMIC_ADD(&sess->fe->fe_counters.failed_rewrites, 1); + if (s->flags & SF_BE_ASSIGNED) + _HA_ATOMIC_ADD(&s->be->be_counters.failed_rewrites, 1); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_ADD(&sess->listener->counters->failed_rewrites, 1); + if (objt_server(s->target)) + _HA_ATOMIC_ADD(&__objt_server(s->target)->counters.failed_rewrites, 1); + + if (!(s->txn->req.flags & HTTP_MSGF_SOFT_RW)) { + ret = ACT_RET_ERR; + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_PRXCOND; + } + goto leave; +} + +/* Parses the http-request normalize-uri action. It expects a single <normalizer> + * argument, corresponding to a value in `enum act_normalize_uri`. + * + * It returns ACT_RET_PRS_OK on success, ACT_RET_PRS_ERR on error. 
+ */ +static enum act_parse_ret parse_http_normalize_uri(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + int cur_arg = *orig_arg; + + rule->action_ptr = http_action_normalize_uri; + rule->release_ptr = NULL; + + if (!*args[cur_arg]) { + memprintf(err, "missing argument <normalizer>"); + return ACT_RET_PRS_ERR; + } + + if (strcmp(args[cur_arg], "path-merge-slashes") == 0) { + cur_arg++; + + rule->action = ACT_NORMALIZE_URI_PATH_MERGE_SLASHES; + } + else if (strcmp(args[cur_arg], "path-strip-dot") == 0) { + cur_arg++; + + rule->action = ACT_NORMALIZE_URI_PATH_STRIP_DOT; + } + else if (strcmp(args[cur_arg], "path-strip-dotdot") == 0) { + cur_arg++; + + if (strcmp(args[cur_arg], "full") == 0) { + cur_arg++; + rule->action = ACT_NORMALIZE_URI_PATH_STRIP_DOTDOT_FULL; + } + else if (!*args[cur_arg]) { + rule->action = ACT_NORMALIZE_URI_PATH_STRIP_DOTDOT; + } + else if (strcmp(args[cur_arg], "if") != 0 && strcmp(args[cur_arg], "unless") != 0) { + memprintf(err, "unknown argument '%s' for 'path-strip-dotdot' normalizer", args[cur_arg]); + return ACT_RET_PRS_ERR; + } + } + else if (strcmp(args[cur_arg], "query-sort-by-name") == 0) { + cur_arg++; + + rule->action = ACT_NORMALIZE_URI_QUERY_SORT_BY_NAME; + } + else if (strcmp(args[cur_arg], "percent-to-uppercase") == 0) { + cur_arg++; + + if (strcmp(args[cur_arg], "strict") == 0) { + cur_arg++; + rule->action = ACT_NORMALIZE_URI_PERCENT_TO_UPPERCASE_STRICT; + } + else if (!*args[cur_arg]) { + rule->action = ACT_NORMALIZE_URI_PERCENT_TO_UPPERCASE; + } + else if (strcmp(args[cur_arg], "if") != 0 && strcmp(args[cur_arg], "unless") != 0) { + memprintf(err, "unknown argument '%s' for 'percent-to-uppercase' normalizer", args[cur_arg]); + return ACT_RET_PRS_ERR; + } + } + else if (strcmp(args[cur_arg], "percent-decode-unreserved") == 0) { + cur_arg++; + + if (strcmp(args[cur_arg], "strict") == 0) { + cur_arg++; + rule->action = ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED_STRICT; + } + else if (!*args[cur_arg]) { + rule->action = ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED; + } + else if (strcmp(args[cur_arg], "if") != 0 && strcmp(args[cur_arg], "unless") != 0) { + memprintf(err, "unknown argument '%s' for 'percent-decode-unreserved' normalizer", args[cur_arg]); + return ACT_RET_PRS_ERR; + } + } + else if (strcmp(args[cur_arg], "fragment-strip") == 0) { + cur_arg++; + + rule->action = ACT_NORMALIZE_URI_FRAGMENT_STRIP; + } + else if (strcmp(args[cur_arg], "fragment-encode") == 0) { + cur_arg++; + + rule->action = ACT_NORMALIZE_URI_FRAGMENT_ENCODE; + } + else { + memprintf(err, "unknown normalizer '%s'", args[cur_arg]); + return ACT_RET_PRS_ERR; + } + + *orig_arg = cur_arg; + return ACT_RET_PRS_OK; +} + +/* This function executes a replace-uri action. It finds its arguments in + * <rule>.arg.http. It builds a string in the trash from the format string + * previously filled by function parse_replace_uri() and will execute the regex + * in <http.re> to replace the URI. It uses the format string present in + * <http.fmt>. The component to act on (path/uri) is taken from <.action> which + * contains 1 for the path, 3 for the URI, or 4 for the path with its + * query-string (values used by http_req_replace_stline()). On success, it + * returns ACT_RET_CONT. If an error occurs while soft rewrites are enabled, + * the action is canceled, but the rule processing continues. Otherwise + * ACT_RET_ERR is returned. 
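+ * + * For illustration (an assumed configuration line, not part of this patch): + * http-request replace-path (.*) /prefix\1 + * matches the whole path and rewrites it with the '\1' back-reference + * resolved through exp_replace() below.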
+ */ +static enum act_return http_action_replace_uri(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + enum act_return ret = ACT_RET_CONT; + struct buffer *replace, *output; + struct ist uri; + int len; + + replace = alloc_trash_chunk(); + output = alloc_trash_chunk(); + if (!replace || !output) + goto fail_alloc; + uri = htx_sl_req_uri(http_get_stline(htxbuf(&s->req.buf))); + + if (rule->action == 1) { // replace-path + struct http_uri_parser parser = http_uri_parser_init(uri); + uri = iststop(http_parse_path(&parser), '?'); + } + else if (rule->action == 4) { // replace-pathq + struct http_uri_parser parser = http_uri_parser_init(uri); + uri = http_parse_path(&parser); + } + + if (!istlen(uri)) + goto leave; + + if (!regex_exec_match2(rule->arg.http.re, uri.ptr, uri.len, MAX_MATCH, pmatch, 0)) + goto leave; + + replace->data = build_logline(s, replace->area, replace->size, &rule->arg.http.fmt); + + /* note: uri.ptr doesn't need to be zero-terminated because it will + * only be used to pick pmatch references. + */ + len = exp_replace(output->area, output->size, uri.ptr, replace->area, pmatch); + if (len == -1) + goto fail_rewrite; + + if (http_req_replace_stline(rule->action, output->area, len, px, s) == -1) + goto fail_rewrite; + + leave: + free_trash_chunk(output); + free_trash_chunk(replace); + return ret; + + fail_alloc: + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_RESOURCE; + ret = ACT_RET_ERR; + goto leave; + + fail_rewrite: + _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_rewrites); + if (s->flags & SF_BE_ASSIGNED) + _HA_ATOMIC_INC(&s->be->be_counters.failed_rewrites); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->failed_rewrites); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.failed_rewrites); + + if (!(s->txn->req.flags & HTTP_MSGF_SOFT_RW)) { + ret = ACT_RET_ERR; + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_PRXCOND; + } + goto leave; +} + +/* parse a "replace-uri", "replace-path" or "replace-pathq" + * http-request action. + * This action takes 2 arguments (a regex and a replacement format string). + * The resulting rule makes use of <.action> to store the action (1, 3 or 4), + * <http.re> to store the compiled regex, and <http.fmt> to store the log-format + * list head. It returns ACT_RET_PRS_OK on success, ACT_RET_PRS_ERR on error. 
+ */ +static enum act_parse_ret parse_replace_uri(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + int cur_arg = *orig_arg; + int cap = 0; + char *error = NULL; + + switch (args[0][8]) { + case 'p': + if (args[0][12] == 'q') + rule->action = 4; // replace-pathq, same as set-pathq + else + rule->action = 1; // replace-path, same as set-path + break; + case 'u': + rule->action = 3; // replace-uri, same as set-uri + break; + default: + memprintf(err, "internal error: unhandled action '%s'", args[0]); + return ACT_RET_PRS_ERR; + } + + rule->action_ptr = http_action_replace_uri; + rule->release_ptr = release_http_action; + LIST_INIT(&rule->arg.http.fmt); + + if (!*args[cur_arg] || !*args[cur_arg+1] || + (*args[cur_arg+2] && strcmp(args[cur_arg+2], "if") != 0 && strcmp(args[cur_arg+2], "unless") != 0)) { + memprintf(err, "expects exactly 2 arguments <match-regex> and <replace-format>"); + return ACT_RET_PRS_ERR; + } + + if (!(rule->arg.http.re = regex_comp(args[cur_arg], 1, 1, &error))) { + memprintf(err, "failed to parse the regex : %s", error); + free(error); + return ACT_RET_PRS_ERR; + } + + px->conf.args.ctx = ARGC_HRQ; + if (px->cap & PR_CAP_FE) + cap |= SMP_VAL_FE_HRQ_HDR; + if (px->cap & PR_CAP_BE) + cap |= SMP_VAL_BE_HRQ_HDR; + if (!parse_logformat_string(args[cur_arg + 1], px, &rule->arg.http.fmt, LOG_OPT_HTTP, cap, err)) { + regex_free(rule->arg.http.re); + return ACT_RET_PRS_ERR; + } + + (*orig_arg) += 2; + return ACT_RET_PRS_OK; +} + +/* This function is just a compliant action wrapper for "set-status". */ +static enum act_return action_http_set_status(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + if (http_res_set_status(rule->arg.http.i, rule->arg.http.str, s) == -1) { + _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_rewrites); + if (s->flags & SF_BE_ASSIGNED) + _HA_ATOMIC_INC(&s->be->be_counters.failed_rewrites); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->failed_rewrites); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.failed_rewrites); + + if (!(s->txn->req.flags & HTTP_MSGF_SOFT_RW)) { + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_PRXCOND; + return ACT_RET_ERR; + } + } + + return ACT_RET_CONT; +} + +/* parse set-status action: + * This action accepts a single argument of type int representing + * an http status code. It returns ACT_RET_PRS_OK on success, + * ACT_RET_PRS_ERR on error. + */ +static enum act_parse_ret parse_http_set_status(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + char *error; + + rule->action = ACT_CUSTOM; + rule->action_ptr = action_http_set_status; + rule->release_ptr = release_http_action; + LIST_INIT(&rule->arg.http.fmt); + + /* Check if an argument is available */ + if (!*args[*orig_arg]) { + memprintf(err, "expects 1 argument: <status>; or 3 arguments: <status> reason <fmt>"); + return ACT_RET_PRS_ERR; + } + + /* convert status code as integer */ + rule->arg.http.i = strtol(args[*orig_arg], &error, 10); + if (*error != '\0' || rule->arg.http.i < 100 || rule->arg.http.i > 999) { + memprintf(err, "expects an integer status code between 100 and 999"); + return ACT_RET_PRS_ERR; + } + + (*orig_arg)++; + + /* set custom reason string */ + rule->arg.http.str = ist(NULL); // If null, we use the default reason for the status code. 
+ if (*args[*orig_arg] && strcmp(args[*orig_arg], "reason") == 0 && + (*args[*orig_arg + 1] && strcmp(args[*orig_arg + 1], "if") != 0 && strcmp(args[*orig_arg + 1], "unless") != 0)) { + (*orig_arg)++; + rule->arg.http.str = ist(strdup(args[*orig_arg])); + (*orig_arg)++; + } + + return ACT_RET_PRS_OK; +} + +/* This function executes the "reject" HTTP action. It clears the request and + * response buffer without sending any response. It can be useful as an HTTP + * alternative to the silent-drop action to defend against DoS attacks, and may + * also be used with HTTP/2 to close a connection instead of just a stream. + * The txn status is unchanged, indicating no response was sent. The termination + * flags will indicate "PR". It always returns ACT_RET_ABRT. + */ +static enum act_return http_action_reject(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + sc_must_kill_conn(s->scf); + stream_abort(s); + s->req.analysers &= AN_REQ_FLT_END; + s->res.analysers &= AN_RES_FLT_END; + + _HA_ATOMIC_INC(&s->be->be_counters.denied_req); + _HA_ATOMIC_INC(&sess->fe->fe_counters.denied_req); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->denied_req); + + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_PRXCOND; + if (!(s->flags & SF_FINST_MASK)) + s->flags |= SF_FINST_R; + + return ACT_RET_ABRT; +} + +/* parse the "reject" action: + * This action takes no argument and returns ACT_RET_PRS_OK on success, + * ACT_RET_PRS_ERR on error. + */ +static enum act_parse_ret parse_http_action_reject(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + rule->action = ACT_CUSTOM; + rule->action_ptr = http_action_reject; + return ACT_RET_PRS_OK; +} + +/* This function executes the "disable-l7-retry" HTTP action. + * It disables L7 retries (all retries except for a connection failure). This + * can be useful for example to avoid retrying on POST requests. + * It just removes the L7 retry flag on the HTTP transaction, and always + * returns ACT_RET_CONT. + */ +static enum act_return http_req_disable_l7_retry(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + /* In theory, the TX_L7_RETRY flag isn't set at this point, but + * let's be future-proof and remove it anyway. + */ + s->txn->flags &= ~TX_L7_RETRY; + s->txn->flags |= TX_D_L7_RETRY; + return ACT_RET_CONT; +} + +/* parse the "disable-l7-retry" action: + * This action takes no argument and returns ACT_RET_PRS_OK on success, + * ACT_RET_PRS_ERR on error. + */ +static enum act_parse_ret parse_http_req_disable_l7_retry(const char **args, + int *orig_args, struct proxy *px, + struct act_rule *rule, char **err) +{ + rule->action = ACT_CUSTOM; + rule->action_ptr = http_req_disable_l7_retry; + return ACT_RET_PRS_OK; +} + +/* This function executes the "capture" action. It executes a fetch expression, + * turns the result into a string and puts it in a capture slot. It always + * returns ACT_RET_CONT. If an error occurs the action is cancelled, but the + * rule processing continues. 
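+ * + * A typical use (an assumed configuration line, not part of this patch) is: + * http-request capture req.hdr(User-Agent) len 64 + * which allocates a 64-byte capture slot via the parser below and fills it + * here at run time.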
+ */ +static enum act_return http_action_req_capture(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct sample *key; + struct cap_hdr *h = rule->arg.cap.hdr; + char **cap = s->req_cap; + int len; + + key = sample_fetch_as_type(s->be, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL, rule->arg.cap.expr, SMP_T_STR); + if (!key) + return ACT_RET_CONT; + + if (cap[h->index] == NULL) + cap[h->index] = pool_alloc(h->pool); + + if (cap[h->index] == NULL) /* no more capture memory */ + return ACT_RET_CONT; + + len = key->data.u.str.data; + if (len > h->len) + len = h->len; + + memcpy(cap[h->index], key->data.u.str.area, len); + cap[h->index][len] = 0; + return ACT_RET_CONT; +} + +/* This function executes the "capture" action and stores the result in a + * capture slot if one exists. It executes a fetch expression, turns the result + * into a string and puts it in a capture slot. It always returns ACT_RET_CONT. + * If an error occurs the action is cancelled, but the rule processing + * continues. + */ +static enum act_return http_action_req_capture_by_id(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct sample *key; + struct cap_hdr *h; + char **cap = s->req_cap; + struct proxy *fe = strm_fe(s); + int len; + int i; + + /* Look for the original configuration. */ + for (h = fe->req_cap, i = fe->nb_req_cap - 1; + h != NULL && i != rule->arg.capid.idx ; + i--, h = h->next); + if (!h) + return ACT_RET_CONT; + + key = sample_fetch_as_type(s->be, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL, rule->arg.capid.expr, SMP_T_STR); + if (!key) + return ACT_RET_CONT; + + if (cap[h->index] == NULL) + cap[h->index] = pool_alloc(h->pool); + + if (cap[h->index] == NULL) /* no more capture memory */ + return ACT_RET_CONT; + + len = key->data.u.str.data; + if (len > h->len) + len = h->len; + + memcpy(cap[h->index], key->data.u.str.area, len); + cap[h->index][len] = 0; + return ACT_RET_CONT; +} + +/* Check an "http-request capture" action. + * + * The function returns 1 on success; otherwise, it returns 0 and err is + * filled. + */ +static int check_http_req_capture(struct act_rule *rule, struct proxy *px, char **err) +{ + if (rule->action_ptr != http_action_req_capture_by_id) + return 1; + + /* capture slots can only be declared in frontends, so we can't check their + * existence in backends at configuration parsing step + */ + if (px->cap & PR_CAP_FE && rule->arg.capid.idx >= px->nb_req_cap) { + memprintf(err, "unable to find capture id '%d' referenced by http-request capture rule", + rule->arg.capid.idx); + return 0; + } + + return 1; +} + +/* Release memory allocated by an http capture action */ +static void release_http_capture(struct act_rule *rule) +{ + if (rule->action_ptr == http_action_req_capture) + release_sample_expr(rule->arg.cap.expr); + else + release_sample_expr(rule->arg.capid.expr); +} + +/* parse an "http-request capture" action. It takes a single argument which is + * a sample fetch expression. It stores the expression and the allocated + * hdr_cap struct into <arg.cap>, or the expression and the preallocated "id" + * into <arg.capid>. + * It returns ACT_RET_PRS_OK on success, ACT_RET_PRS_ERR on error. 
+ */ +static enum act_parse_ret parse_http_req_capture(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + struct sample_expr *expr; + struct cap_hdr *hdr; + int cur_arg; + int len = 0; + + for (cur_arg = *orig_arg; cur_arg < *orig_arg + 3 && *args[cur_arg]; cur_arg++) + if (strcmp(args[cur_arg], "if") == 0 || + strcmp(args[cur_arg], "unless") == 0) + break; + + if (cur_arg < *orig_arg + 3) { + memprintf(err, "expects <expression> [ 'len' <length> | 'id' <idx> ]"); + return ACT_RET_PRS_ERR; + } + + cur_arg = *orig_arg; + expr = sample_parse_expr((char **)args, &cur_arg, px->conf.args.file, px->conf.args.line, err, &px->conf.args, NULL); + if (!expr) + return ACT_RET_PRS_ERR; + + if (!(expr->fetch->val & SMP_VAL_FE_HRQ_HDR)) { + memprintf(err, + "fetch method '%s' extracts information from '%s', none of which is available here", + args[cur_arg-1], sample_src_names(expr->fetch->use)); + release_sample_expr(expr); + return ACT_RET_PRS_ERR; + } + + if (!args[cur_arg] || !*args[cur_arg]) { + memprintf(err, "expects 'len' or 'id'"); + release_sample_expr(expr); + return ACT_RET_PRS_ERR; + } + + if (strcmp(args[cur_arg], "len") == 0) { + cur_arg++; + + if (!(px->cap & PR_CAP_FE)) { + memprintf(err, "proxy '%s' has no frontend capability", px->id); + release_sample_expr(expr); + return ACT_RET_PRS_ERR; + } + + px->conf.args.ctx = ARGC_CAP; + + if (!args[cur_arg]) { + memprintf(err, "missing length value"); + release_sample_expr(expr); + return ACT_RET_PRS_ERR; + } + /* parse and validate the capture length */ + len = atoi(args[cur_arg]); + if (len <= 0) { + memprintf(err, "length must be > 0"); + release_sample_expr(expr); + return ACT_RET_PRS_ERR; + } + cur_arg++; + + hdr = calloc(1, sizeof(*hdr)); + if (!hdr) { + memprintf(err, "out of memory"); + release_sample_expr(expr); + return ACT_RET_PRS_ERR; + } + hdr->next = px->req_cap; + hdr->name = NULL; /* not a header capture */ + hdr->namelen = 0; + hdr->len = len; + hdr->pool = create_pool("caphdr", hdr->len + 1, MEM_F_SHARED); + hdr->index = px->nb_req_cap++; + + px->req_cap = hdr; + px->to_log |= LW_REQHDR; + + rule->action = ACT_CUSTOM; + rule->action_ptr = http_action_req_capture; + rule->release_ptr = release_http_capture; + rule->arg.cap.expr = expr; + rule->arg.cap.hdr = hdr; + } + + else if (strcmp(args[cur_arg], "id") == 0) { + int id; + char *error; + + cur_arg++; + + if (!args[cur_arg]) { + memprintf(err, "missing id value"); + release_sample_expr(expr); + return ACT_RET_PRS_ERR; + } + + id = strtol(args[cur_arg], &error, 10); + if (*error != '\0') { + memprintf(err, "cannot parse id '%s'", args[cur_arg]); + release_sample_expr(expr); + return ACT_RET_PRS_ERR; + } + cur_arg++; + + px->conf.args.ctx = ARGC_CAP; + + rule->action = ACT_CUSTOM; + rule->action_ptr = http_action_req_capture_by_id; + rule->check_ptr = check_http_req_capture; + rule->release_ptr = release_http_capture; + rule->arg.capid.expr = expr; + rule->arg.capid.idx = id; + } + + else { + memprintf(err, "expects 'len' or 'id', found '%s'", args[cur_arg]); + release_sample_expr(expr); + return ACT_RET_PRS_ERR; + } + + *orig_arg = cur_arg; + return ACT_RET_PRS_OK; +} + +/* This function executes the "capture" action and stores the result in a + * capture slot if one exists. It executes a fetch expression, turns the result + * into a string and puts it in a capture slot. It always returns ACT_RET_CONT. + * If an error occurs the action is cancelled, but the rule processing + * continues. 
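+ * + * For illustration (assumed configuration lines, not part of this patch): + * declare capture response len 64 + * http-response capture res.hdr(Server) id 0 + * stores the value of the Server response header into the preallocated + * slot 0.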
+ */ +static enum act_return http_action_res_capture_by_id(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct sample *key; + struct cap_hdr *h; + char **cap = s->res_cap; + struct proxy *fe = strm_fe(s); + int len; + int i; + + /* Look for the original configuration. */ + for (h = fe->rsp_cap, i = fe->nb_rsp_cap - 1; + h != NULL && i != rule->arg.capid.idx ; + i--, h = h->next); + if (!h) + return ACT_RET_CONT; + + key = sample_fetch_as_type(s->be, sess, s, SMP_OPT_DIR_RES|SMP_OPT_FINAL, rule->arg.capid.expr, SMP_T_STR); + if (!key) + return ACT_RET_CONT; + + if (cap[h->index] == NULL) + cap[h->index] = pool_alloc(h->pool); + + if (cap[h->index] == NULL) /* no more capture memory */ + return ACT_RET_CONT; + + len = key->data.u.str.data; + if (len > h->len) + len = h->len; + + memcpy(cap[h->index], key->data.u.str.area, len); + cap[h->index][len] = 0; + return ACT_RET_CONT; +} + +/* Check an "http-response capture" action. + * + * The function returns 1 on success; otherwise, it returns 0 and err is + * filled. + */ +static int check_http_res_capture(struct act_rule *rule, struct proxy *px, char **err) +{ + if (rule->action_ptr != http_action_res_capture_by_id) + return 1; + + /* capture slots can only be declared in frontends, so we can't check their + * existence in backends at configuration parsing step + */ + if (px->cap & PR_CAP_FE && rule->arg.capid.idx >= px->nb_rsp_cap) { + memprintf(err, "unable to find capture id '%d' referenced by http-response capture rule", + rule->arg.capid.idx); + return 0; + } + + return 1; +} + +/* parse an "http-response capture" action. It takes a single argument which is + * a sample fetch expression. It stores the expression into <arg.capid.expr> + * and the preallocated capture id into <arg.capid.idx>. + * It returns ACT_RET_PRS_OK on success, ACT_RET_PRS_ERR on error. 
+ */ +static enum act_parse_ret parse_http_res_capture(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + struct sample_expr *expr; + int cur_arg; + int id; + char *error; + + for (cur_arg = *orig_arg; cur_arg < *orig_arg + 3 && *args[cur_arg]; cur_arg++) + if (strcmp(args[cur_arg], "if") == 0 || + strcmp(args[cur_arg], "unless") == 0) + break; + + if (cur_arg < *orig_arg + 3) { + memprintf(err, "expects <expression> 'id' <idx>"); + return ACT_RET_PRS_ERR; + } + + cur_arg = *orig_arg; + expr = sample_parse_expr((char **)args, &cur_arg, px->conf.args.file, px->conf.args.line, err, &px->conf.args, NULL); + if (!expr) + return ACT_RET_PRS_ERR; + + if (!(expr->fetch->val & SMP_VAL_FE_HRS_HDR)) { + memprintf(err, + "fetch method '%s' extracts information from '%s', none of which is available here", + args[cur_arg-1], sample_src_names(expr->fetch->use)); + release_sample_expr(expr); + return ACT_RET_PRS_ERR; + } + + if (!args[cur_arg] || !*args[cur_arg]) { + memprintf(err, "expects 'id'"); + release_sample_expr(expr); + return ACT_RET_PRS_ERR; + } + + if (strcmp(args[cur_arg], "id") != 0) { + memprintf(err, "expects 'id', found '%s'", args[cur_arg]); + release_sample_expr(expr); + return ACT_RET_PRS_ERR; + } + + cur_arg++; + + if (!args[cur_arg]) { + memprintf(err, "missing id value"); + release_sample_expr(expr); + return ACT_RET_PRS_ERR; + } + + id = strtol(args[cur_arg], &error, 10); + if (*error != '\0') { + memprintf(err, "cannot parse id '%s'", args[cur_arg]); + release_sample_expr(expr); + return ACT_RET_PRS_ERR; + } + cur_arg++; + + px->conf.args.ctx = ARGC_CAP; + + rule->action = ACT_CUSTOM; + rule->action_ptr = http_action_res_capture_by_id; + rule->check_ptr = check_http_res_capture; + rule->release_ptr = release_http_capture; + rule->arg.capid.expr = expr; + rule->arg.capid.idx = id; + + *orig_arg = cur_arg; + return ACT_RET_PRS_OK; +} + +/* Parse an "allow" action for a request or a response rule. It takes no argument. It + * returns ACT_RET_PRS_OK on success, ACT_RET_PRS_ERR on error. + */ +static enum act_parse_ret parse_http_allow(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + rule->action = ACT_ACTION_ALLOW; + rule->flags |= ACT_FLAG_FINAL; + return ACT_RET_PRS_OK; +} + +/* Parse "deny" or "tarpit" actions for a request rule or "deny" action for a + * response rule. It returns ACT_RET_PRS_OK on success, ACT_RET_PRS_ERR on + * error. It relies on http_parse_http_reply() to set + * <.arg.http_reply>. + */ +static enum act_parse_ret parse_http_deny(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + int default_status; + int cur_arg, arg = 0; + + cur_arg = *orig_arg; + if (rule->from == ACT_F_HTTP_REQ) { + if (strcmp(args[cur_arg - 1], "tarpit") == 0) { + rule->action = ACT_HTTP_REQ_TARPIT; + default_status = 500; + } + else { + rule->action = ACT_ACTION_DENY; + default_status = 403; + } + } + else { + rule->action = ACT_ACTION_DENY; + default_status = 502; + } + + /* If no args or only a deny_status specified, fallback on the legacy + * mode and use default error files despite the fact that + * default-errorfiles is not used. Otherwise, parse an http reply. + */ + + /* Prepare parsing of log-format strings */ + px->conf.args.ctx = ((rule->from == ACT_F_HTTP_REQ) ? 
ARGC_HRQ : ARGC_HRS); + + if (!*(args[cur_arg]) || strcmp(args[cur_arg], "if") == 0 || strcmp(args[cur_arg], "unless") == 0) { + rule->arg.http_reply = http_parse_http_reply((const char *[]){"default-errorfiles", ""}, &arg, px, default_status, err); + goto end; + } + + if (strcmp(args[cur_arg], "deny_status") == 0) { + if (!*(args[cur_arg+2]) || strcmp(args[cur_arg+2], "if") == 0 || strcmp(args[cur_arg+2], "unless") == 0) { + rule->arg.http_reply = http_parse_http_reply((const char *[]){"status", args[cur_arg+1], "default-errorfiles", ""}, + &arg, px, default_status, err); + *orig_arg += 2; + goto end; + } + args[cur_arg] += 5; /* skip "deny_" for the parsing */ + } + + rule->arg.http_reply = http_parse_http_reply(args, orig_arg, px, default_status, err); + + end: + if (!rule->arg.http_reply) + return ACT_RET_PRS_ERR; + + rule->flags |= ACT_FLAG_FINAL; + rule->check_ptr = check_act_http_reply; + rule->release_ptr = release_act_http_reply; + return ACT_RET_PRS_OK; +} + + +/* This function executes an auth action. It builds a 401/407 HTX message using + * the corresponding proxy's error message. On success, it returns + * ACT_RET_ABRT. If an error occurs ACT_RET_ERR is returned. + */ +static enum act_return http_action_auth(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct channel *req = &s->req; + struct channel *res = &s->res; + struct htx *htx = htx_from_buf(&res->buf); + struct http_reply *reply; + const char *auth_realm; + struct http_hdr_ctx ctx; + struct ist hdr; + + /* Auth might be performed on regular http-req rules as well as on stats */ + auth_realm = rule->arg.http.str.ptr; + if (!auth_realm) { + if (px->uri_auth && s->current_rule_list == &px->uri_auth->http_req_rules) + auth_realm = STATS_DEFAULT_REALM; + else + auth_realm = px->id; + } + + if (!(s->txn->flags & TX_USE_PX_CONN)) { + s->txn->status = 401; + hdr = ist("WWW-Authenticate"); + } + else { + s->txn->status = 407; + hdr = ist("Proxy-Authenticate"); + } + reply = http_error_message(s); + channel_htx_truncate(res, htx); + + if (chunk_printf(&trash, "Basic realm=\"%s\"", auth_realm) == -1) + goto fail; + + /* Write the generic 40x message */ + if (http_reply_to_htx(s, htx, reply) == -1) + goto fail; + + /* Remove all existing occurrences of the XXX-Authenticate header */ + ctx.blk = NULL; + while (http_find_header(htx, hdr, &ctx, 1)) + http_remove_header(htx, &ctx); + + /* Now add the right XXX-Authenticate header */ + if (!http_add_header(htx, hdr, ist2(b_orig(&trash), b_data(&trash)))) + goto fail; + + /* Finally forward the reply */ + htx_to_buf(htx, &res->buf); + if (!http_forward_proxy_resp(s, 1)) + goto fail; + + /* Note: Only eval on the request */ + s->logs.request_ts = now_ns; + req->analysers &= AN_REQ_FLT_END; + + if (s->sess->fe == s->be) /* report it if the request was intercepted by the frontend */ + _HA_ATOMIC_INC(&s->sess->fe->fe_counters.intercepted_req); + + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_LOCAL; + if (!(s->flags & SF_FINST_MASK)) + s->flags |= SF_FINST_R; + + stream_inc_http_err_ctr(s); + return ACT_RET_ABRT; + + fail: + /* If an error occurred, remove the incomplete HTTP response from the + * buffer */ + channel_htx_truncate(res, htx); + return ACT_RET_ERR; +} + +/* Parse an "auth" action. It may take 2 optional arguments to define a "realm" + * parameter. It returns ACT_RET_PRS_OK on success, ACT_RET_PRS_ERR on error. 
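+ * + * For illustration (an assumed configuration line, with "mycreds" standing + * for a hypothetical userlist; not part of this patch): + * http-request auth realm MyRealm unless { http_auth(mycreds) } + * replies with a 401 carrying 'WWW-Authenticate: Basic realm="MyRealm"'.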
+ */ +static enum act_parse_ret parse_http_auth(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + int cur_arg; + + rule->action = ACT_CUSTOM; + rule->flags |= ACT_FLAG_FINAL; + rule->action_ptr = http_action_auth; + rule->release_ptr = release_http_action; + LIST_INIT(&rule->arg.http.fmt); + + cur_arg = *orig_arg; + if (strcmp(args[cur_arg], "realm") == 0) { + cur_arg++; + if (!*args[cur_arg]) { + memprintf(err, "missing realm value"); + return ACT_RET_PRS_ERR; + } + rule->arg.http.str = ist(strdup(args[cur_arg])); + cur_arg++; + } + + *orig_arg = cur_arg; + return ACT_RET_PRS_OK; +} + +/* This function executes an early-hint action. It adds an HTTP 103 Early Hints + * response header with <.arg.http.str> name and with a value built + * according to <.arg.http.fmt> log line format. If it is the first early-hint + * rule of a series, the 103 response start-line is added first. At the end, if + * the next rule is not an early-hint rule or if it is the last rule, the EOH + * block is added to terminate the response. On success, it returns + * ACT_RET_CONT. If an error occurs while soft rewrites are enabled, the action + * is canceled, but the rule processing continues. Otherwise ACT_RET_ERR is + * returned. + */ +static enum act_return http_action_early_hint(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct act_rule *next_rule; + struct channel *res = &s->res; + struct htx *htx = htx_from_buf(&res->buf); + struct buffer *value = alloc_trash_chunk(); + enum act_return ret = ACT_RET_CONT; + + if (!(s->txn->req.flags & HTTP_MSGF_VER_11)) + goto leave; + + if (!value) { + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_RESOURCE; + goto error; + } + + /* if there is no pending 103 response, start a new response. Otherwise, + * continue adding headers to a previously started response + */ + if (s->txn->status != 103) { + struct htx_sl *sl; + unsigned int flags = (HTX_SL_F_IS_RESP|HTX_SL_F_VER_11| + HTX_SL_F_XFER_LEN|HTX_SL_F_BODYLESS); + + sl = htx_add_stline(htx, HTX_BLK_RES_SL, flags, + ist("HTTP/1.1"), ist("103"), ist("Early Hints")); + if (!sl) + goto error; + sl->info.res.status = 103; + s->txn->status = 103; + } + + /* Add the header to the HTTP 103 Early Hints response */ + value->data = build_logline(s, b_tail(value), b_room(value), &rule->arg.http.fmt); + if (!htx_add_header(htx, rule->arg.http.str, ist2(b_head(value), b_data(value)))) + goto error; + + /* if it is the last rule or the next one is not an early-hint or a + * conditional early-hint, terminate the current response. + */ + next_rule = LIST_NEXT(&rule->list, typeof(rule), list); + if (&next_rule->list == s->current_rule_list || next_rule->action_ptr != http_action_early_hint || next_rule->cond) { + if (!htx_add_endof(htx, HTX_BLK_EOH)) + goto error; + if (!http_forward_proxy_resp(s, 0)) + goto error; + s->txn->status = 0; + } + + leave: + free_trash_chunk(value); + return ret; + + error: + /* If an error occurred during an Early-hint rule, remove the incomplete + * HTTP 103 response from the buffer */ + channel_htx_truncate(res, htx); + ret = ACT_RET_ERR; + s->txn->status = 0; + goto leave; +} + +/* This function executes a set-header or add-header action. It builds a string + * in the trash from the specified format string. It finds the action to be + * performed in <.action>, previously filled by function parse_http_set_header(). + * The replacement action is executed by the function http_action_set_header(). 
On + * success, it returns ACT_RET_CONT. If an error occurs while soft rewrites are + * enabled, the action is canceled, but the rule processing continues. Otherwise + * ACT_RET_ERR is returned. + */ +static enum act_return http_action_set_header(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct http_msg *msg = ((rule->from == ACT_F_HTTP_REQ) ? &s->txn->req : &s->txn->rsp); + struct htx *htx = htxbuf(&msg->chn->buf); + enum act_return ret = ACT_RET_CONT; + struct buffer *replace; + struct http_hdr_ctx ctx; + struct ist n, v; + + replace = alloc_trash_chunk(); + if (!replace) + goto fail_alloc; + + replace->data = build_logline(s, replace->area, replace->size, &rule->arg.http.fmt); + n = rule->arg.http.str; + v = ist2(replace->area, replace->data); + + if (rule->action == 0) { // set-header + /* remove all occurrences of the header */ + ctx.blk = NULL; + while (http_find_header(htx, n, &ctx, 1)) + http_remove_header(htx, &ctx); + } + + /* Now add header */ + if (!http_add_header(htx, n, v)) + goto fail_rewrite; + + leave: + free_trash_chunk(replace); + return ret; + + fail_alloc: + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_RESOURCE; + ret = ACT_RET_ERR; + goto leave; + + fail_rewrite: + _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_rewrites); + if (s->flags & SF_BE_ASSIGNED) + _HA_ATOMIC_INC(&s->be->be_counters.failed_rewrites); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->failed_rewrites); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.failed_rewrites); + + if (!(msg->flags & HTTP_MSGF_SOFT_RW)) { + ret = ACT_RET_ERR; + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_PRXCOND; + } + goto leave; +} + +/* Parse a "set-header", "add-header" or "early-hint" action. It takes a + * header name and a log-format string as arguments. It returns ACT_RET_PRS_OK + * on success, ACT_RET_PRS_ERR on error. + * + * Note: same function is used for the request and the response. However + * "early-hint" rules are only supported for request rules. 
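+ * + * Illustrative examples (assumed configuration lines, not part of this patch): + * http-request set-header X-Forwarded-Proto https + * http-response add-header X-Served-By %[env(HOSTNAME)] + * http-request early-hint Link "</style.css>; rel=preload; as=style"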
+ */ +static enum act_parse_ret parse_http_set_header(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + int cap = 0, cur_arg; + const char *p; + + if (args[*orig_arg-1][0] == 'e') { + rule->action = ACT_CUSTOM; + rule->action_ptr = http_action_early_hint; + } + else { + if (args[*orig_arg-1][0] == 's') + rule->action = 0; // set-header + else + rule->action = 1; // add-header + rule->action_ptr = http_action_set_header; + } + rule->release_ptr = release_http_action; + LIST_INIT(&rule->arg.http.fmt); + + cur_arg = *orig_arg; + if (!*args[cur_arg] || !*args[cur_arg+1]) { + memprintf(err, "expects exactly 2 arguments"); + return ACT_RET_PRS_ERR; + } + + + rule->arg.http.str = ist(strdup(args[cur_arg])); + + if (rule->from == ACT_F_HTTP_REQ) { + px->conf.args.ctx = ARGC_HRQ; + if (px->cap & PR_CAP_FE) + cap |= SMP_VAL_FE_HRQ_HDR; + if (px->cap & PR_CAP_BE) + cap |= SMP_VAL_BE_HRQ_HDR; + } + else{ + px->conf.args.ctx = ARGC_HRS; + if (px->cap & PR_CAP_FE) + cap |= SMP_VAL_FE_HRS_HDR; + if (px->cap & PR_CAP_BE) + cap |= SMP_VAL_BE_HRS_HDR; + } + + cur_arg++; + if (!parse_logformat_string(args[cur_arg], px, &rule->arg.http.fmt, LOG_OPT_HTTP, cap, err)) { + istfree(&rule->arg.http.str); + return ACT_RET_PRS_ERR; + } + + free(px->conf.lfs_file); + px->conf.lfs_file = strdup(px->conf.args.file); + px->conf.lfs_line = px->conf.args.line; + + /* some characters are totally forbidden in header names and + * may happen by accident when writing configs, causing strange + * failures in the field. Better catch these ones early, nobody will + * miss them. In particular, a colon at the end (or anywhere + * after the first char) or a space/cr anywhere due to misplaced + * quotes are hard to spot. + */ + for (p = istptr(rule->arg.http.str); p < istend(rule->arg.http.str); p++) { + if (HTTP_IS_TOKEN(*p)) + continue; + if (p == istptr(rule->arg.http.str) && *p == ':') + continue; + /* we only report this as-is but it will not cause an error */ + memprintf(err, "header name '%s' contains forbidden character '%c'", istptr(rule->arg.http.str), *p); + break; + } + + *orig_arg = cur_arg + 1; + return ACT_RET_PRS_OK; +} + +/* This function executes a replace-header or replace-value action. It + * builds a string in the trash from the specified format string. It finds + * the action to be performed in <.action>, previously filled by function + * parse_http_replace_header(). The replacement action is executed by the + * function http_action_replace_header(). On success, it returns ACT_RET_CONT. + * If an error occurs while soft rewrites are enabled, the action is canceled, + * but the rule processing continues. Otherwise ACT_RET_ERR is returned. + */ +static enum act_return http_action_replace_header(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct http_msg *msg = ((rule->from == ACT_F_HTTP_REQ) ? 
&s->txn->req : &s->txn->rsp); + struct htx *htx = htxbuf(&msg->chn->buf); + enum act_return ret = ACT_RET_CONT; + struct buffer *replace; + int r; + + replace = alloc_trash_chunk(); + if (!replace) + goto fail_alloc; + + replace->data = build_logline(s, replace->area, replace->size, &rule->arg.http.fmt); + + r = http_replace_hdrs(s, htx, rule->arg.http.str, replace->area, rule->arg.http.re, (rule->action == 0)); + if (r == -1) + goto fail_rewrite; + + leave: + free_trash_chunk(replace); + return ret; + + fail_alloc: + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_RESOURCE; + ret = ACT_RET_ERR; + goto leave; + + fail_rewrite: + _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_rewrites); + if (s->flags & SF_BE_ASSIGNED) + _HA_ATOMIC_INC(&s->be->be_counters.failed_rewrites); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->failed_rewrites); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.failed_rewrites); + + if (!(msg->flags & HTTP_MSGF_SOFT_RW)) { + ret = ACT_RET_ERR; + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_PRXCOND; + } + goto leave; +} + +/* Parse a "replace-header" or "replace-value" action. It takes a header name, + * a regex and a replacement string as arguments. It returns ACT_RET_PRS_OK on + * success, ACT_RET_PRS_ERR on error. + */ +static enum act_parse_ret parse_http_replace_header(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + int cap = 0, cur_arg; + + if (args[*orig_arg-1][8] == 'h') + rule->action = 0; // replace-header + else + rule->action = 1; // replace-value + rule->action_ptr = http_action_replace_header; + rule->release_ptr = release_http_action; + LIST_INIT(&rule->arg.http.fmt); + + cur_arg = *orig_arg; + if (!*args[cur_arg] || !*args[cur_arg+1] || !*args[cur_arg+2]) { + memprintf(err, "expects exactly 3 arguments"); + return ACT_RET_PRS_ERR; + } + + rule->arg.http.str = ist(strdup(args[cur_arg])); + + cur_arg++; + if (!(rule->arg.http.re = regex_comp(args[cur_arg], 1, 1, err))) { + istfree(&rule->arg.http.str); + return ACT_RET_PRS_ERR; + } + + if (rule->from == ACT_F_HTTP_REQ) { + px->conf.args.ctx = ARGC_HRQ; + if (px->cap & PR_CAP_FE) + cap |= SMP_VAL_FE_HRQ_HDR; + if (px->cap & PR_CAP_BE) + cap |= SMP_VAL_BE_HRQ_HDR; + } + else{ + px->conf.args.ctx = ARGC_HRS; + if (px->cap & PR_CAP_FE) + cap |= SMP_VAL_FE_HRS_HDR; + if (px->cap & PR_CAP_BE) + cap |= SMP_VAL_BE_HRS_HDR; + } + + cur_arg++; + if (!parse_logformat_string(args[cur_arg], px, &rule->arg.http.fmt, LOG_OPT_HTTP, cap, err)) { + istfree(&rule->arg.http.str); + regex_free(rule->arg.http.re); + return ACT_RET_PRS_ERR; + } + + free(px->conf.lfs_file); + px->conf.lfs_file = strdup(px->conf.args.file); + px->conf.lfs_line = px->conf.args.line; + + *orig_arg = cur_arg + 1; + return ACT_RET_PRS_OK; +} + +/* This function executes a del-header action with the selected matching mode + * for the header name. It finds the matching method to be performed in <.action>, + * previously filled by function parse_http_del_header(). On success, it returns + * ACT_RET_CONT. Otherwise ACT_RET_ERR is returned. + */ +static enum act_return http_action_del_header(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct http_hdr_ctx ctx; + struct http_msg *msg = ((rule->from == ACT_F_HTTP_REQ) ? 
&s->txn->req : &s->txn->rsp); + struct htx *htx = htxbuf(&msg->chn->buf); + enum act_return ret = ACT_RET_CONT; + + /* remove all occurrences of the header */ + ctx.blk = NULL; + switch (rule->action) { + case PAT_MATCH_STR: + while (http_find_header(htx, rule->arg.http.str, &ctx, 1)) + http_remove_header(htx, &ctx); + break; + case PAT_MATCH_BEG: + while (http_find_pfx_header(htx, rule->arg.http.str, &ctx, 1)) + http_remove_header(htx, &ctx); + break; + case PAT_MATCH_END: + while (http_find_sfx_header(htx, rule->arg.http.str, &ctx, 1)) + http_remove_header(htx, &ctx); + break; + case PAT_MATCH_SUB: + while (http_find_sub_header(htx, rule->arg.http.str, &ctx, 1)) + http_remove_header(htx, &ctx); + break; + case PAT_MATCH_REG: + while (http_match_header(htx, rule->arg.http.re, &ctx, 1)) + http_remove_header(htx, &ctx); + break; + default: + return ACT_RET_ERR; + } + return ret; +} + +/* Parse a "del-header" action. It takes a string as a required argument, an + * optional flag (currently only -m) and an optional matching method used to + * match the input string against the header names to be deleted. The default + * matching method is an exact match (-m str). + * It returns ACT_RET_PRS_OK on success, ACT_RET_PRS_ERR on error. + */ +static enum act_parse_ret parse_http_del_header(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + int cur_arg; + int pat_idx; + + /* set exact matching (-m str) as default */ + rule->action = PAT_MATCH_STR; + rule->action_ptr = http_action_del_header; + rule->release_ptr = release_http_action; + LIST_INIT(&rule->arg.http.fmt); + + cur_arg = *orig_arg; + if (!*args[cur_arg]) { + memprintf(err, "expects at least 1 argument"); + return ACT_RET_PRS_ERR; + } + + rule->arg.http.str = ist(strdup(args[cur_arg])); + px->conf.args.ctx = (rule->from == ACT_F_HTTP_REQ ? ARGC_HRQ : ARGC_HRS); + + if (strcmp(args[cur_arg+1], "-m") == 0) { + cur_arg++; + if (!*args[cur_arg+1]) { + memprintf(err, "-m flag expects exactly 1 argument"); + return ACT_RET_PRS_ERR; + } + + cur_arg++; + pat_idx = pat_find_match_name(args[cur_arg]); + switch (pat_idx) { + case PAT_MATCH_REG: + if (!(rule->arg.http.re = regex_comp(rule->arg.http.str.ptr, 1, 1, err))) + return ACT_RET_PRS_ERR; + __fallthrough; + case PAT_MATCH_STR: + case PAT_MATCH_BEG: + case PAT_MATCH_END: + case PAT_MATCH_SUB: + rule->action = pat_idx; + break; + default: + memprintf(err, "-m with unsupported matching method '%s'", args[cur_arg]); + return ACT_RET_PRS_ERR; + } + } + + *orig_arg = cur_arg + 1; + return ACT_RET_PRS_OK; +} + +/* Release memory allocated by an http redirect action. */ +static void release_http_redir(struct act_rule *rule) +{ + struct redirect_rule *redir; + + redir = rule->arg.redir; + if (!redir) + return; + + LIST_DELETE(&redir->list); + http_free_redirect_rule(redir); +} + +/* Parse a "redirect" action. It returns ACT_RET_PRS_OK on success, + * ACT_RET_PRS_ERR on error. + */ +static enum act_parse_ret parse_http_redirect(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + struct redirect_rule *redir; + int dir, cur_arg; + + rule->action = ACT_HTTP_REDIR; + rule->release_ptr = release_http_redir; + + cur_arg = *orig_arg; + + dir = (rule->from == ACT_F_HTTP_REQ ? 
0 : 1); + if ((redir = http_parse_redirect_rule(px->conf.args.file, px->conf.args.line, px, &args[cur_arg], err, 1, dir)) == NULL) + return ACT_RET_PRS_ERR; + + if (!(redir->flags & REDIRECT_FLAG_IGNORE_EMPTY)) + rule->flags |= ACT_FLAG_FINAL; + + rule->arg.redir = redir; + rule->cond = redir->cond; + redir->cond = NULL; + + /* skip all arguments */ + while (*args[cur_arg]) + cur_arg++; + + *orig_arg = cur_arg; + return ACT_RET_PRS_OK; +} + +/* This function executes an add-acl, del-acl, set-map or del-map action. On + * success, it returns ACT_RET_CONT. Otherwise ACT_RET_ERR is returned. + */ +static enum act_return http_action_set_map(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct pat_ref *ref; + struct buffer *key = NULL, *value = NULL; + enum act_return ret = ACT_RET_CONT; + + /* collect reference */ + ref = pat_ref_lookup(rule->arg.map.ref); + if (!ref) + goto leave; + + /* allocate key */ + key = alloc_trash_chunk(); + if (!key) + goto fail_alloc; + + /* collect key */ + key->data = build_logline(s, key->area, key->size, &rule->arg.map.key); + key->area[key->data] = '\0'; + + switch (rule->action) { + case 0: // add-acl + /* add entry only if it does not already exist */ + HA_RWLOCK_WRLOCK(PATREF_LOCK, &ref->lock); + if (pat_ref_find_elt(ref, key->area) == NULL) + pat_ref_add(ref, key->area, NULL, NULL); + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ref->lock); + break; + + case 1: // set-map + { + struct pat_ref_elt *elt; + + /* allocate value */ + value = alloc_trash_chunk(); + if (!value) + goto fail_alloc; + + /* collect value */ + value->data = build_logline(s, value->area, value->size, &rule->arg.map.value); + value->area[value->data] = '\0'; + + HA_RWLOCK_WRLOCK(PATREF_LOCK, &ref->lock); + elt = pat_ref_find_elt(ref, key->area); + if (elt) { + /* update entry if it exists */ + pat_ref_set(ref, key->area, value->area, NULL, elt); + } + else { + /* insert a new entry */ + pat_ref_add(ref, key->area, value->area, NULL); + } + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ref->lock); + break; + } + + case 2: // del-acl + case 3: // del-map + /* returned code: 1=ok, 0=ko */ + HA_RWLOCK_WRLOCK(PATREF_LOCK, &ref->lock); + pat_ref_delete(ref, key->area); + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ref->lock); + break; + + default: + ret = ACT_RET_ERR; + } + + + leave: + free_trash_chunk(key); + free_trash_chunk(value); + return ret; + + fail_alloc: + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_RESOURCE; + ret = ACT_RET_ERR; + goto leave; +} + +/* Release memory allocated by an http map/acl action. */ +static void release_http_map(struct act_rule *rule) +{ + struct logformat_node *lf, *lfb; + + free(rule->arg.map.ref); + list_for_each_entry_safe(lf, lfb, &rule->arg.map.key, list) { + LIST_DELETE(&lf->list); + release_sample_expr(lf->expr); + free(lf->arg); + free(lf); + } + if (rule->action == 1) { + list_for_each_entry_safe(lf, lfb, &rule->arg.map.value, list) { + LIST_DELETE(&lf->list); + release_sample_expr(lf->expr); + free(lf->arg); + free(lf); + } + } +} + +/* Parse an "add-acl", "del-acl", "set-map" or "del-map" action. It takes one or + * two log-format strings as arguments depending on the action. The action is + * stored in <.action> as an int (0=add-acl, 1=set-map, 2=del-acl, + * 3=del-map). It returns ACT_RET_PRS_OK on success, ACT_RET_PRS_ERR on error. 
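+ * + * For illustration (assumed configuration lines with hypothetical file names, + * not part of this patch): + * http-request set-map(/etc/haproxy/rates.map) %[src] %[req.hdr(X-Value)] + * http-request del-acl(/etc/haproxy/blocked.acl) %[src] + * where the file name between parentheses is extracted as <arg.map.ref>.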
+ */ +static enum act_parse_ret parse_http_set_map(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + int cap = 0, cur_arg; + + if (args[*orig_arg-1][0] == 'a') // add-acl + rule->action = 0; + else if (args[*orig_arg-1][0] == 's') // set-map + rule->action = 1; + else if (args[*orig_arg-1][4] == 'a') // del-acl + rule->action = 2; + else if (args[*orig_arg-1][4] == 'm') // del-map + rule->action = 3; + else { + memprintf(err, "internal error: unhandled action '%s'", args[0]); + return ACT_RET_PRS_ERR; + } + rule->action_ptr = http_action_set_map; + rule->release_ptr = release_http_map; + + cur_arg = *orig_arg; + if (rule->action == 1 && (!*args[cur_arg] || !*args[cur_arg+1])) { + /* 2 args for set-map */ + memprintf(err, "expects exactly 2 arguments"); + return ACT_RET_PRS_ERR; + } + else if (!*args[cur_arg]) { + /* only one arg for other actions */ + memprintf(err, "expects exactly 1 argument"); + return ACT_RET_PRS_ERR; + } + + /* + * '+ 8' for 'set-map(' (same for del-map) + * '- 9' for 'set-map(' + trailing ')' (same for del-map) + */ + rule->arg.map.ref = my_strndup(args[cur_arg-1] + 8, strlen(args[cur_arg-1]) - 9); + + if (rule->from == ACT_F_HTTP_REQ) { + px->conf.args.ctx = ARGC_HRQ; + if (px->cap & PR_CAP_FE) + cap |= SMP_VAL_FE_HRQ_HDR; + if (px->cap & PR_CAP_BE) + cap |= SMP_VAL_BE_HRQ_HDR; + } + else { + px->conf.args.ctx = ARGC_HRS; + if (px->cap & PR_CAP_FE) + cap |= SMP_VAL_FE_HRS_HDR; + if (px->cap & PR_CAP_BE) + cap |= SMP_VAL_BE_HRS_HDR; + } + + /* key pattern */ + LIST_INIT(&rule->arg.map.key); + if (!parse_logformat_string(args[cur_arg], px, &rule->arg.map.key, LOG_OPT_HTTP, cap, err)) { + free(rule->arg.map.ref); + return ACT_RET_PRS_ERR; + } + + if (rule->action == 1) { + /* value pattern for set-map only */ + cur_arg++; + LIST_INIT(&rule->arg.map.value); + if (!parse_logformat_string(args[cur_arg], px, &rule->arg.map.value, LOG_OPT_HTTP, cap, err)) { + free(rule->arg.map.ref); + return ACT_RET_PRS_ERR; + } + } + + free(px->conf.lfs_file); + px->conf.lfs_file = strdup(px->conf.args.file); + px->conf.lfs_line = px->conf.args.line; + + *orig_arg = cur_arg + 1; + return ACT_RET_PRS_OK; +} + +/* This function executes a track-sc* action. On success, it returns + * ACT_RET_CONT. Otherwise ACT_RET_ERR is returned. + */ +static enum act_return http_action_track_sc(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct stktable *t; + struct stksess *ts; + struct stktable_key *key; + void *ptr1, *ptr2, *ptr3, *ptr4, *ptr5, *ptr6; + int opt; + + ptr1 = ptr2 = ptr3 = ptr4 = ptr5 = ptr6 = NULL; + opt = ((rule->from == ACT_F_HTTP_REQ) ? SMP_OPT_DIR_REQ : SMP_OPT_DIR_RES) | SMP_OPT_FINAL; + + t = rule->arg.trk_ctr.table.t; + + if (stkctr_entry(&s->stkctr[rule->action])) + goto end; + + key = stktable_fetch_key(t, s->be, sess, s, opt, rule->arg.trk_ctr.expr, NULL); + + if (!key) + goto end; + ts = stktable_get_entry(t, key); + if (!ts) + goto end; + + stream_track_stkctr(&s->stkctr[rule->action], t, ts); + + /* let's count a new HTTP request as it's the first time we do it */ + ptr1 = stktable_data_ptr(t, ts, STKTABLE_DT_HTTP_REQ_CNT); + ptr2 = stktable_data_ptr(t, ts, STKTABLE_DT_HTTP_REQ_RATE); + + /* When the client triggers a 4xx from the server, it's most often due + * to a missing object or permission. These events should be tracked + * because if they happen often, it may indicate a brute force or a + * vulnerability scan. 
Normally this is done when receiving the response, + * but here we're tracking after the point where it ought to have been + * done, so we have to do it on purpose. + */ + if (rule->from == ACT_F_HTTP_RES && (unsigned)(s->txn->status - 400) < 100) { + ptr3 = stktable_data_ptr(t, ts, STKTABLE_DT_HTTP_ERR_CNT); + ptr4 = stktable_data_ptr(t, ts, STKTABLE_DT_HTTP_ERR_RATE); + } + + if (rule->from == ACT_F_HTTP_RES && (unsigned)(s->txn->status - 500) < 100 && + s->txn->status != 501 && s->txn->status != 505) { + ptr5 = stktable_data_ptr(t, ts, STKTABLE_DT_HTTP_FAIL_CNT); + ptr6 = stktable_data_ptr(t, ts, STKTABLE_DT_HTTP_FAIL_RATE); + } + + if (ptr1 || ptr2 || ptr3 || ptr4 || ptr5 || ptr6) { + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock); + + if (ptr1) + stktable_data_cast(ptr1, std_t_uint)++; + if (ptr2) + update_freq_ctr_period(&stktable_data_cast(ptr2, std_t_frqp), + t->data_arg[STKTABLE_DT_HTTP_REQ_RATE].u, 1); + if (ptr3) + stktable_data_cast(ptr3, std_t_uint)++; + if (ptr4) + update_freq_ctr_period(&stktable_data_cast(ptr4, std_t_frqp), + t->data_arg[STKTABLE_DT_HTTP_ERR_RATE].u, 1); + if (ptr5) + stktable_data_cast(ptr5, std_t_uint)++; + if (ptr6) + update_freq_ctr_period(&stktable_data_cast(ptr6, std_t_frqp), + t->data_arg[STKTABLE_DT_HTTP_FAIL_RATE].u, 1); + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock); + + /* If data was modified, we need to touch to re-schedule sync */ + stktable_touch_local(t, ts, 0); + } + + stkctr_set_flags(&s->stkctr[rule->action], STKCTR_TRACK_CONTENT); + if (sess->fe != s->be) + stkctr_set_flags(&s->stkctr[rule->action], STKCTR_TRACK_BACKEND); + + end: + return ACT_RET_CONT; +} + +static void release_http_track_sc(struct act_rule *rule) +{ + release_sample_expr(rule->arg.trk_ctr.expr); +} + +/* Parse a "track-sc*" action. It returns ACT_RET_PRS_OK on success, + * ACT_RET_PRS_ERR on error. + */ +static enum act_parse_ret parse_http_track_sc(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + struct sample_expr *expr; + unsigned int where; + unsigned int tsc_num; + const char *tsc_num_str; + int cur_arg; + + tsc_num_str = &args[*orig_arg-1][8]; + if (cfg_parse_track_sc_num(&tsc_num, tsc_num_str, tsc_num_str + strlen(tsc_num_str), err) == -1) + return ACT_RET_PRS_ERR; + + cur_arg = *orig_arg; + expr = sample_parse_expr((char **)args, &cur_arg, px->conf.args.file, px->conf.args.line, + err, &px->conf.args, NULL); + if (!expr) + return ACT_RET_PRS_ERR; + + where = 0; + if (px->cap & PR_CAP_FE) + where |= (rule->from == ACT_F_HTTP_REQ ? SMP_VAL_FE_HRQ_HDR : SMP_VAL_FE_HRS_HDR); + if (px->cap & PR_CAP_BE) + where |= (rule->from == ACT_F_HTTP_REQ ? 
SMP_VAL_BE_HRQ_HDR : SMP_VAL_BE_HRS_HDR); + + if (!(expr->fetch->val & where)) { + memprintf(err, "fetch method '%s' extracts information from '%s', none of which is available here", + args[cur_arg-1], sample_src_names(expr->fetch->use)); + release_sample_expr(expr); + return ACT_RET_PRS_ERR; + } + + if (strcmp(args[cur_arg], "table") == 0) { + cur_arg++; + if (!*args[cur_arg]) { + memprintf(err, "missing table name"); + release_sample_expr(expr); + return ACT_RET_PRS_ERR; + } + + /* we copy the table name for now, it will be resolved later */ + rule->arg.trk_ctr.table.n = strdup(args[cur_arg]); + cur_arg++; + } + + rule->action = tsc_num; + rule->arg.trk_ctr.expr = expr; + rule->action_ptr = http_action_track_sc; + rule->release_ptr = release_http_track_sc; + rule->check_ptr = check_trk_action; + + *orig_arg = cur_arg; + return ACT_RET_PRS_OK; +} + +static enum act_return action_timeout_set_stream_timeout(struct act_rule *rule, + struct proxy *px, + struct session *sess, + struct stream *s, + int flags) +{ + struct sample *key; + + if (rule->arg.timeout.expr) { + key = sample_fetch_as_type(px, sess, s, SMP_OPT_FINAL, rule->arg.timeout.expr, SMP_T_SINT); + if (!key) + return ACT_RET_CONT; + + stream_set_timeout(s, rule->arg.timeout.type, MS_TO_TICKS(key->data.u.sint)); + } + else { + stream_set_timeout(s, rule->arg.timeout.type, MS_TO_TICKS(rule->arg.timeout.value)); + } + + return ACT_RET_CONT; +} + +/* Parse a "set-timeout" action. Returns ACT_RET_PRS_ERR on parsing error. + */ +static enum act_parse_ret parse_http_set_timeout(const char **args, + int *orig_arg, + struct proxy *px, + struct act_rule *rule, char **err) +{ + int cur_arg; + + rule->action = ACT_CUSTOM; + rule->action_ptr = action_timeout_set_stream_timeout; + rule->release_ptr = release_timeout_action; + + cur_arg = *orig_arg; + if (!*args[cur_arg] || !*args[cur_arg + 1]) { + memprintf(err, "expects exactly 2 arguments"); + return ACT_RET_PRS_ERR; + } + + if (cfg_parse_rule_set_timeout(args, cur_arg, rule, px, err) == -1) { + return ACT_RET_PRS_ERR; + } + + *orig_arg = cur_arg + 2; + + return ACT_RET_PRS_OK; +} + +/* This function executes a strict-mode action. On success, it always returns + * ACT_RET_CONT. + */ +static enum act_return http_action_strict_mode(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct http_msg *msg = ((rule->from == ACT_F_HTTP_REQ) ? &s->txn->req : &s->txn->rsp); + + if (rule->action == 0) // strict-mode on + msg->flags &= ~HTTP_MSGF_SOFT_RW; + else // strict-mode off + msg->flags |= HTTP_MSGF_SOFT_RW; + return ACT_RET_CONT; +} + +/* Parse a "strict-mode" action. It returns ACT_RET_PRS_OK on success, + * ACT_RET_PRS_ERR on error. + */ +static enum act_parse_ret parse_http_strict_mode(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + int cur_arg; + + cur_arg = *orig_arg; + if (!*args[cur_arg]) { + memprintf(err, "expects exactly 1 argument"); + return ACT_RET_PRS_ERR; + } + + if (strcasecmp(args[cur_arg], "on") == 0) + rule->action = 0; // strict-mode on + else if (strcasecmp(args[cur_arg], "off") == 0) + rule->action = 1; // strict-mode off + else { + memprintf(err, "Unexpected value '%s'. Only 'on' and 'off' are supported", args[cur_arg]); + return ACT_RET_PRS_ERR; + } + rule->action_ptr = http_action_strict_mode; + + *orig_arg = cur_arg + 1; + return ACT_RET_PRS_OK; +} + +/* This function executes a return action. 
It builds an HTX message from an + * errorfile, a raw file or a log-format string, depending on <.action> + * value. On success, it returns ACT_RET_ABRT. If an error occurs ACT_RET_ERR is + * returned. + */ +static enum act_return http_action_return(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct channel *req = &s->req; + + s->txn->status = rule->arg.http_reply->status; + + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_LOCAL; + if (!(s->flags & SF_FINST_MASK)) + s->flags |= ((rule->from == ACT_F_HTTP_REQ) ? SF_FINST_R : SF_FINST_H); + + if (http_reply_message(s, rule->arg.http_reply) == -1) + return ACT_RET_ERR; + + if (rule->from == ACT_F_HTTP_REQ) { + /* let's log the request time */ + s->logs.request_ts = now_ns; + req->analysers &= AN_REQ_FLT_END; + + if (s->sess->fe == s->be) /* report it if the request was intercepted by the frontend */ + _HA_ATOMIC_INC(&s->sess->fe->fe_counters.intercepted_req); + } + + return ACT_RET_ABRT; +} + +/* Parse a "return" action. It returns ACT_RET_PRS_OK on success, + * ACT_RET_PRS_ERR on error. It relies on http_parse_http_reply() to set + * <.arg.http_reply>. + */ +static enum act_parse_ret parse_http_return(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + /* Prepare parsing of log-format strings */ + px->conf.args.ctx = ((rule->from == ACT_F_HTTP_REQ) ? ARGC_HRQ : ARGC_HRS); + rule->arg.http_reply = http_parse_http_reply(args, orig_arg, px, 200, err); + if (!rule->arg.http_reply) + return ACT_RET_PRS_ERR; + + rule->flags |= ACT_FLAG_FINAL; + rule->action = ACT_CUSTOM; + rule->check_ptr = check_act_http_reply; + rule->action_ptr = http_action_return; + rule->release_ptr = release_act_http_reply; + return ACT_RET_PRS_OK; +} + + + +/* This function executes a wait-for-body action. It waits for the message + * payload for a configured maximum time (.arg.p[0]) and optionally for only + * the first <arg.p[1]> bytes (0 means no limit). It relies on the + * http_wait_for_msg_body() function. It returns ACT_RET_CONT when the + * conditions to stop waiting are met. Otherwise ACT_RET_YIELD is returned to + * wait for more data. ACT_RET_INV is returned if a parsing error is raised by + * the lower level and ACT_RET_ERR if an internal error occurred. Finally + * ACT_RET_ABRT is returned when a timeout occurs. + */ +static enum act_return http_action_wait_for_body(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct channel *chn = ((rule->from == ACT_F_HTTP_REQ) ? &s->req : &s->res); + unsigned int time = (uintptr_t)rule->arg.act.p[0]; + unsigned int bytes = (uintptr_t)rule->arg.act.p[1]; + + switch (http_wait_for_msg_body(s, chn, time, bytes)) { + case HTTP_RULE_RES_CONT: + return ACT_RET_CONT; + case HTTP_RULE_RES_YIELD: + return ACT_RET_YIELD; + case HTTP_RULE_RES_BADREQ: + return ACT_RET_INV; + case HTTP_RULE_RES_ERROR: + return ACT_RET_ERR; + case HTTP_RULE_RES_ABRT: + return ACT_RET_ABRT; + default: + return ACT_RET_ERR; + } +} + +/* Parse a "wait-for-body" action. It returns ACT_RET_PRS_OK on success, + * ACT_RET_PRS_ERR on error. 
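+ *
+ * Illustrative haproxy.cfg usage, a sketch matching the syntax parsed below:
+ *   http-request wait-for-body time 1s at-least 1000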
+ */ +static enum act_parse_ret parse_http_wait_for_body(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + int cur_arg; + unsigned int time, bytes; + const char *res; + + cur_arg = *orig_arg; + if (!*args[cur_arg]) { + memprintf(err, "expects time <time> [ at-least <bytes> ]"); + return ACT_RET_PRS_ERR; + } + + time = UINT_MAX; /* To be sure it is set */ + bytes = 0; /* Default value: wait for the whole body */ + while (*(args[cur_arg])) { + if (strcmp(args[cur_arg], "time") == 0) { + if (!*args[cur_arg + 1]) { + memprintf(err, "missing argument for '%s'", args[cur_arg]); + return ACT_RET_PRS_ERR; + } + res = parse_time_err(args[cur_arg+1], &time, TIME_UNIT_MS); + if (res == PARSE_TIME_OVER) { + memprintf(err, "time overflow (maximum value is 2147483647 ms or ~24.8 days)"); + return ACT_RET_PRS_ERR; + } + if (res == PARSE_TIME_UNDER) { + memprintf(err, "time underflow (minimum non-null value is 1 ms)"); + return ACT_RET_PRS_ERR; + } + if (res) { + memprintf(err, "unexpected character '%c'", *res); + return ACT_RET_PRS_ERR; + } + cur_arg++; + } + else if (strcmp(args[cur_arg], "at-least") == 0) { + if (!*args[cur_arg + 1]) { + memprintf(err, "missing argument for '%s'", args[cur_arg]); + return ACT_RET_PRS_ERR; + } + res = parse_size_err(args[cur_arg+1], &bytes); + if (res) { + memprintf(err, "unexpected character '%c'", *res); + return ACT_RET_PRS_ERR; + } + cur_arg++; + } + else + break; + cur_arg++; + } + + if (time == UINT_MAX) { + memprintf(err, "expects time <time> [ at-least <bytes> ]"); + return ACT_RET_PRS_ERR; + } + + rule->arg.act.p[0] = (void *)(uintptr_t)time; + rule->arg.act.p[1] = (void *)(uintptr_t)bytes; + + *orig_arg = cur_arg; + + rule->action = ACT_CUSTOM; + rule->action_ptr = http_action_wait_for_body; + return ACT_RET_PRS_OK; +} + +/************************************************************************/ +/* All supported http-request action keywords must be declared here. 
*/ +/************************************************************************/ + +static struct action_kw_list http_req_actions = { + .kw = { + { "add-acl", parse_http_set_map, KWF_MATCH_PREFIX }, + { "add-header", parse_http_set_header, 0 }, + { "allow", parse_http_allow, 0 }, + { "auth", parse_http_auth, 0 }, + { "capture", parse_http_req_capture, 0 }, + { "del-acl", parse_http_set_map, KWF_MATCH_PREFIX }, + { "del-header", parse_http_del_header, 0 }, + { "del-map", parse_http_set_map, KWF_MATCH_PREFIX }, + { "deny", parse_http_deny, 0 }, + { "disable-l7-retry", parse_http_req_disable_l7_retry, 0 }, + { "early-hint", parse_http_set_header, 0 }, + { "normalize-uri", parse_http_normalize_uri, KWF_EXPERIMENTAL }, + { "redirect", parse_http_redirect, 0 }, + { "reject", parse_http_action_reject, 0 }, + { "replace-header", parse_http_replace_header, 0 }, + { "replace-path", parse_replace_uri, 0 }, + { "replace-pathq", parse_replace_uri, 0 }, + { "replace-uri", parse_replace_uri, 0 }, + { "replace-value", parse_http_replace_header, 0 }, + { "return", parse_http_return, 0 }, + { "set-header", parse_http_set_header, 0 }, + { "set-map", parse_http_set_map, KWF_MATCH_PREFIX }, + { "set-method", parse_set_req_line, 0 }, + { "set-path", parse_set_req_line, 0 }, + { "set-pathq", parse_set_req_line, 0 }, + { "set-query", parse_set_req_line, 0 }, + { "set-uri", parse_set_req_line, 0 }, + { "strict-mode", parse_http_strict_mode, 0 }, + { "tarpit", parse_http_deny, 0 }, + { "track-sc", parse_http_track_sc, KWF_MATCH_PREFIX }, + { "set-timeout", parse_http_set_timeout, 0 }, + { "wait-for-body", parse_http_wait_for_body, 0 }, + { NULL, NULL } + } +}; + +INITCALL1(STG_REGISTER, http_req_keywords_register, &http_req_actions); + +static struct action_kw_list http_res_actions = { + .kw = { + { "add-acl", parse_http_set_map, KWF_MATCH_PREFIX }, + { "add-header", parse_http_set_header, 0 }, + { "allow", parse_http_allow, 0 }, + { "capture", parse_http_res_capture, 0 }, + { "del-acl", parse_http_set_map, KWF_MATCH_PREFIX }, + { "del-header", parse_http_del_header, 0 }, + { "del-map", parse_http_set_map, KWF_MATCH_PREFIX }, + { "deny", parse_http_deny, 0 }, + { "redirect", parse_http_redirect, 0 }, + { "replace-header", parse_http_replace_header, 0 }, + { "replace-value", parse_http_replace_header, 0 }, + { "return", parse_http_return, 0 }, + { "set-header", parse_http_set_header, 0 }, + { "set-map", parse_http_set_map, KWF_MATCH_PREFIX }, + { "set-status", parse_http_set_status, 0 }, + { "strict-mode", parse_http_strict_mode, 0 }, + { "track-sc", parse_http_track_sc, KWF_MATCH_PREFIX }, + { "set-timeout", parse_http_set_timeout, 0 }, + { "wait-for-body", parse_http_wait_for_body, 0 }, + { NULL, NULL } + } +}; + +INITCALL1(STG_REGISTER, http_res_keywords_register, &http_res_actions); + +static struct action_kw_list http_after_res_actions = { + .kw = { + { "add-header", parse_http_set_header, 0 }, + { "allow", parse_http_allow, 0 }, + { "capture", parse_http_res_capture, 0 }, + { "del-acl", parse_http_set_map, KWF_MATCH_PREFIX }, + { "del-header", parse_http_del_header, 0 }, + { "del-map", parse_http_set_map, KWF_MATCH_PREFIX }, + { "replace-header", parse_http_replace_header, 0 }, + { "replace-value", parse_http_replace_header, 0 }, + { "set-header", parse_http_set_header, 0 }, + { "set-map", parse_http_set_map, KWF_MATCH_PREFIX }, + { "set-status", parse_http_set_status, 0 }, + { "strict-mode", parse_http_strict_mode, 0 }, + { NULL, NULL } + } +}; + +INITCALL1(STG_REGISTER, http_after_res_keywords_register, 
&http_after_res_actions); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/http_ana.c b/src/http_ana.c new file mode 100644 index 0000000..178f874 --- /dev/null +++ b/src/http_ana.c @@ -0,0 +1,5153 @@ +/* + * HTTP protocol analyzer + * + * Copyright (C) 2018 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <haproxy/acl.h> +#include <haproxy/action-t.h> +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/backend.h> +#include <haproxy/base64.h> +#include <haproxy/capture-t.h> +#include <haproxy/cfgparse.h> +#include <haproxy/channel.h> +#include <haproxy/check.h> +#include <haproxy/connection.h> +#include <haproxy/errors.h> +#include <haproxy/filters.h> +#include <haproxy/http.h> +#include <haproxy/http_ana.h> +#include <haproxy/http_htx.h> +#include <haproxy/http_ext.h> +#include <haproxy/htx.h> +#include <haproxy/log.h> +#include <haproxy/net_helper.h> +#include <haproxy/proxy.h> +#include <haproxy/regex.h> +#include <haproxy/sc_strm.h> +#include <haproxy/server-t.h> +#include <haproxy/stats.h> +#include <haproxy/stconn.h> +#include <haproxy/stream.h> +#include <haproxy/trace.h> +#include <haproxy/uri_auth-t.h> +#include <haproxy/vars.h> + + +#define TRACE_SOURCE &trace_strm + +extern const char *stat_status_codes[]; + +struct pool_head *pool_head_requri __read_mostly = NULL; +struct pool_head *pool_head_capture __read_mostly = NULL; + + +static void http_end_request(struct stream *s); +static void http_end_response(struct stream *s); + +static void http_capture_headers(struct htx *htx, char **cap, struct cap_hdr *cap_hdr); +static int http_del_hdr_value(char *start, char *end, char **from, char *next); +static size_t http_fmt_req_line(const struct htx_sl *sl, char *str, size_t len); +static void http_debug_stline(const char *dir, struct stream *s, const struct htx_sl *sl); +static void http_debug_hdr(const char *dir, struct stream *s, const struct ist n, const struct ist v); + +static enum rule_result http_req_get_intercept_rule(struct proxy *px, struct list *def_rules, struct list *rules, struct stream *s); +static enum rule_result http_res_get_intercept_rule(struct proxy *px, struct list *def_rules, struct list *rules, struct stream *s, uint8_t final); +static enum rule_result http_req_restrict_header_names(struct stream *s, struct htx *htx, struct proxy *px); + +static void http_manage_client_side_cookies(struct stream *s, struct channel *req); +static void http_manage_server_side_cookies(struct stream *s, struct channel *res); + +static int http_stats_check_uri(struct stream *s, struct http_txn *txn, struct proxy *px); +static int http_handle_stats(struct stream *s, struct channel *req, struct proxy *px); + +static int http_handle_expect_hdr(struct stream *s, struct htx *htx, struct http_msg *msg); +static int http_reply_100_continue(struct stream *s); + +/* This stream analyser waits for a complete HTTP request. It returns 1 if the + * processing can continue on next analysers, or zero if it either needs more + * data or wants to immediately abort the request (eg: timeout, error, ...). 
It + * is tied to AN_REQ_WAIT_HTTP and may remove itself from s->req.analysers + * when it has nothing left to do, and may remove any analyser when it wants to + * abort. + */ +int http_wait_for_request(struct stream *s, struct channel *req, int an_bit) +{ + + /* + * We will analyze a complete HTTP request to check its syntax. + * + * Once the start line and all headers are received, we may perform a + * capture of the error (if any), and we will set a few fields. We also + * check for monitor-uri, logging and finally headers capture. + */ + struct session *sess = s->sess; + struct http_txn *txn = s->txn; + struct http_msg *msg = &txn->req; + struct htx *htx; + struct htx_sl *sl; + char http_ver; + int len; + + DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn, msg); + + if (unlikely(!IS_HTX_STRM(s))) { + /* This is only possible when a TCP stream is upgraded to HTTP. + * There is a transition period during which there is no + * data. The stream is still in raw mode and SF_IGNORE flag is + * still set. When this happens, the new mux is responsible for + * handling all errors. Thus we may leave immediately. + */ + BUG_ON(!(s->flags & SF_IGNORE) || !c_empty(&s->req)); + + /* Don't connect for now */ + channel_dont_connect(req); + + /* An abort at this stage means we are performing a "destructive" + * HTTP upgrade (TCP>H2). In this case, we can leave. + */ + if (s->scf->flags & (SC_FL_ABRT_DONE|SC_FL_EOS)) { + s->logs.logwait = 0; + s->logs.level = 0; + stream_abort(s); + req->analysers &= AN_REQ_FLT_END; + req->analyse_exp = TICK_ETERNITY; + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA, s); + return 1; + } + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA, s); + return 0; + } + + htx = htxbuf(&req->buf); + sl = http_get_stline(htx); + len = HTX_SL_REQ_VLEN(sl); + if (len < 6) { + http_ver = 0; + } + else { + char *ptr; + + ptr = HTX_SL_REQ_VPTR(sl); + http_ver = ptr[5] - '0'; + } + + /* Parsing errors are caught here */ + if (htx->flags & (HTX_FL_PARSING_ERROR|HTX_FL_PROCESSING_ERROR)) { + stream_inc_http_req_ctr(s); + proxy_inc_fe_req_ctr(sess->listener, sess->fe, http_ver); + if (htx->flags & HTX_FL_PARSING_ERROR) { + stream_inc_http_err_ctr(s); + goto return_bad_req; + } + else + goto return_int_err; + } + + /* we're speaking HTTP here, so let's speak HTTP to the client */ + s->srv_error = http_return_srv_error; + + msg->msg_state = HTTP_MSG_BODY; + stream_inc_http_req_ctr(s); + proxy_inc_fe_req_ctr(sess->listener, sess->fe, http_ver); /* one more valid request for this FE */ + + /* kill the pending keep-alive timeout */ + req->analyse_exp = TICK_ETERNITY; + + BUG_ON(htx_get_first_type(htx) != HTX_BLK_REQ_SL); + + /* 0: we might have to print this header in debug mode */ + if (unlikely((global.mode & MODE_DEBUG) && + (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)))) { + int32_t pos; + + http_debug_stline("clireq", s, sl); + + for (pos = htx_get_first(htx); pos != -1; pos = htx_get_next(htx, pos)) { + struct htx_blk *blk = htx_get_blk(htx, pos); + enum htx_blk_type type = htx_get_blk_type(blk); + + if (type == HTX_BLK_EOH) + break; + if (type != HTX_BLK_HDR) + continue; + + http_debug_hdr("clihdr", s, + htx_get_blk_name(htx, blk), + htx_get_blk_value(htx, blk)); + } + } + + /* + * 1: identify the method and the version. 
Also set HTTP flags + */ + txn->meth = sl->info.req.meth; + if (sl->flags & HTX_SL_F_VER_11) + msg->flags |= HTTP_MSGF_VER_11; + msg->flags |= HTTP_MSGF_XFER_LEN; + if (sl->flags & HTX_SL_F_CLEN) + msg->flags |= HTTP_MSGF_CNT_LEN; + else if (sl->flags & HTX_SL_F_CHNK) + msg->flags |= HTTP_MSGF_TE_CHNK; + if (sl->flags & HTX_SL_F_BODYLESS) + msg->flags |= HTTP_MSGF_BODYLESS; + if (sl->flags & HTX_SL_F_CONN_UPG) + msg->flags |= HTTP_MSGF_CONN_UPG; + + /* we can make use of server redirect on GET and HEAD */ + if (txn->meth == HTTP_METH_GET || txn->meth == HTTP_METH_HEAD) + s->flags |= SF_REDIRECTABLE; + else if (txn->meth == HTTP_METH_OTHER && isteqi(htx_sl_req_meth(sl), ist("PRI"))) { + /* PRI is reserved for the HTTP/2 preface */ + goto return_bad_req; + } + + /* + * 2: check if the URI matches the monitor_uri. We have to do this for + * every request which gets in, because the monitor-uri is defined by + * the frontend. If the monitor-uri starts with a '/', the matching is + * done against the request's path. Otherwise, the request's uri is + * used. It is a workaround to let HTTP/2 health-checks work as + * expected. + */ + if (unlikely(isttest(sess->fe->monitor_uri))) { + const struct ist monitor_uri = sess->fe->monitor_uri; + struct http_uri_parser parser = http_uri_parser_init(htx_sl_req_uri(sl)); + + if ((istptr(monitor_uri)[0] == '/' && + isteq(http_parse_path(&parser), monitor_uri)) || + isteq(htx_sl_req_uri(sl), monitor_uri)) { + /* + * We have found the monitor URI + */ + struct acl_cond *cond; + + s->flags |= SF_MONITOR; + _HA_ATOMIC_INC(&sess->fe->fe_counters.intercepted_req); + + /* Check if we want to fail this monitor request or not */ + list_for_each_entry(cond, &sess->fe->mon_fail_cond, list) { + int ret = acl_exec_cond(cond, sess->fe, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL); + + ret = acl_pass(ret); + if (cond->pol == ACL_COND_UNLESS) + ret = !ret; + + if (ret) { + /* we fail this request, let's return 503 service unavail */ + txn->status = 503; + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_LOCAL; /* we don't want a real error here */ + goto return_prx_cond; + } + } + + /* nothing to fail, let's reply normally */ + txn->status = 200; + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_LOCAL; /* we don't want a real error here */ + goto return_prx_cond; + } + } + + /* + * 3: Maybe we have to copy the original REQURI for the logs ? + * Note: we cannot log anymore if the request has been + * classified as invalid. + */ + if (unlikely(s->logs.logwait & LW_REQ)) { + /* we have a complete HTTP request that we must log */ + if ((txn->uri = pool_alloc(pool_head_requri)) != NULL) { + size_t len; + + len = http_fmt_req_line(sl, txn->uri, global.tune.requri_len - 1); + txn->uri[len] = 0; + + if (!(s->logs.logwait &= ~(LW_REQ|LW_INIT))) + s->do_log(s); + } else { + ha_alert("HTTP logging : out of memory.\n"); + } + } + + /* if the frontend has "option http-use-proxy-header", we'll check if + * we have what looks like a proxied connection instead of a connection, + * and in this case set the TX_USE_PX_CONN flag to use Proxy-connection. + * Note that this is *not* RFC-compliant, however browsers and proxies + * happen to do that despite being non-standard :-( + * We consider that a request not beginning with either '/' or '*' is + * a proxied connection, which covers both "scheme://location" and + * CONNECT ip:port. 
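+ *
+ * e.g. (illustrative): request lines such as "GET http://example.com/ HTTP/1.1"
+ * or "CONNECT example.com:443 HTTP/1.1" are treated as proxied connections,
+ * while "GET / HTTP/1.1" or "OPTIONS * HTTP/1.1" are not.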
+ */ + if ((sess->fe->options2 & PR_O2_USE_PXHDR) && + *HTX_SL_REQ_UPTR(sl) != '/' && *HTX_SL_REQ_UPTR(sl) != '*') + txn->flags |= TX_USE_PX_CONN; + + /* 5: we may need to capture headers */ + if (unlikely((s->logs.logwait & LW_REQHDR) && s->req_cap)) + http_capture_headers(htx, s->req_cap, sess->fe->req_cap); + + /* we may have to wait for the request's body */ + if (s->be->options & PR_O_WREQ_BODY) + req->analysers |= AN_REQ_HTTP_BODY; + + /* + * RFC7234#4: + * A cache MUST write through requests with methods + * that are unsafe (Section 4.2.1 of [RFC7231]) to + * the origin server; i.e., a cache is not allowed + * to generate a reply to such a request before + * having forwarded the request and having received + * a corresponding response. + * + * RFC7231#4.2.1: + * Of the request methods defined by this + * specification, the GET, HEAD, OPTIONS, and TRACE + * methods are defined to be safe. + */ + if (likely(txn->meth == HTTP_METH_GET || + txn->meth == HTTP_METH_HEAD || + txn->meth == HTTP_METH_OPTIONS || + txn->meth == HTTP_METH_TRACE)) + txn->flags |= TX_CACHEABLE | TX_CACHE_COOK; + + /* end of job, return OK */ + req->analysers &= ~an_bit; + req->analyse_exp = TICK_ETERNITY; + + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 1; + + return_int_err: + txn->status = 500; + s->flags |= SF_ERR_INTERNAL; + _HA_ATOMIC_INC(&sess->fe->fe_counters.internal_errors); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->internal_errors); + goto return_prx_cond; + + return_bad_req: + txn->status = 400; + _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_req); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->failed_req); + /* fall through */ + + return_prx_cond: + http_set_term_flags(s); + http_reply_and_close(s, txn->status, http_error_message(s)); + + DBG_TRACE_DEVEL("leaving on error", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_HTTP_ERR, s, txn); + return 0; +} + + +/* This stream analyser runs all HTTP request processing which is common to + * frontends and backends, which means blocking ACLs, filters, connection-close, + * reqadd, stats and redirects. This is performed for the designated proxy. + * It returns 1 if the processing can continue on next analysers, or zero if it + * either needs more data or wants to immediately abort the request (eg: deny, + * error, ...). + */ +int http_process_req_common(struct stream *s, struct channel *req, int an_bit, struct proxy *px) +{ + struct list *def_rules, *rules; + struct session *sess = s->sess; + struct http_txn *txn = s->txn; + struct http_msg *msg = &txn->req; + struct htx *htx; + struct redirect_rule *rule; + enum rule_result verdict; + struct connection *conn = objt_conn(sess->origin); + + DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn, msg); + + htx = htxbuf(&req->buf); + + /* just in case we have some per-backend tracking. Only done on the first + * execution of the analyser. */ + if (!s->current_rule && !s->current_rule_list) + stream_inc_be_http_req_ctr(s); + + def_rules = ((px->defpx && (an_bit == AN_REQ_HTTP_PROCESS_FE || px != sess->fe)) ? &px->defpx->http_req_rules : NULL); + rules = &px->http_req_rules; + + /* evaluate http-request rules */ + if ((def_rules && !LIST_ISEMPTY(def_rules)) || !LIST_ISEMPTY(rules)) { + verdict = http_req_get_intercept_rule(px, def_rules, rules, s); + + switch (verdict) { + case HTTP_RULE_RES_YIELD: /* some data is missing, call the function later. 
*/ + goto return_prx_yield; + + case HTTP_RULE_RES_CONT: + case HTTP_RULE_RES_STOP: /* nothing to do */ + break; + + case HTTP_RULE_RES_DENY: /* deny or tarpit */ + if (txn->flags & TX_CLTARPIT) + goto tarpit; + goto deny; + + case HTTP_RULE_RES_ABRT: /* abort request, response already sent. Eg: auth */ + goto return_prx_cond; + + case HTTP_RULE_RES_DONE: /* OK, but terminate request processing (eg: redirect) */ + goto done; + + case HTTP_RULE_RES_BADREQ: /* failed with a bad request */ + goto return_bad_req; + + case HTTP_RULE_RES_ERROR: /* failed with an internal error */ + goto return_int_err; + } + } + + if (px->options2 & (PR_O2_RSTRICT_REQ_HDR_NAMES_BLK|PR_O2_RSTRICT_REQ_HDR_NAMES_DEL)) { + verdict = http_req_restrict_header_names(s, htx, px); + if (verdict == HTTP_RULE_RES_DENY) + goto deny; + } + + if (conn && (conn->flags & CO_FL_EARLY_DATA) && + (conn->flags & (CO_FL_EARLY_SSL_HS | CO_FL_SSL_WAIT_HS))) { + struct http_hdr_ctx ctx; + + ctx.blk = NULL; + if (!http_find_header(htx, ist("Early-Data"), &ctx, 0)) { + if (unlikely(!http_add_header(htx, ist("Early-Data"), ist("1")))) + goto return_fail_rewrite; + } + } + + /* OK at this stage, we know that the request was accepted according to + * the http-request rules, we can check for the stats. Note that the + * URI is detected *before* the req* rules in order not to be affected + * by a possible reqrep, while they are processed *after* so that a + * reqdeny can still block them. This clearly needs to change in 1.6! + */ + if (!s->target && http_stats_check_uri(s, txn, px)) { + s->target = &http_stats_applet.obj_type; + if (unlikely(!sc_applet_create(s->scb, objt_applet(s->target)))) { + s->logs.request_ts = now_ns; + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_RESOURCE; + goto return_int_err; + } + + /* parse the whole stats request and extract the relevant information */ + http_handle_stats(s, req, px); + verdict = http_req_get_intercept_rule(px, NULL, &px->uri_auth->http_req_rules, s); + /* not all actions implemented: deny, allow, auth */ + + if (verdict == HTTP_RULE_RES_DENY) /* stats http-request deny */ + goto deny; + + if (verdict == HTTP_RULE_RES_ABRT) /* stats auth / stats http-request auth */ + goto return_prx_cond; + + if (verdict == HTTP_RULE_RES_BADREQ) /* failed with a bad request */ + goto return_bad_req; + + if (verdict == HTTP_RULE_RES_ERROR) /* failed with an internal error */ + goto return_int_err; + } + + /* Proceed with the applets now. 
*/ + if (unlikely(objt_applet(s->target))) { + if (sess->fe == s->be) /* report it if the request was intercepted by the frontend */ + _HA_ATOMIC_INC(&sess->fe->fe_counters.intercepted_req); + + if (http_handle_expect_hdr(s, htx, msg) == -1) + goto return_int_err; + + if (!(s->flags & SF_ERR_MASK)) // this is not really an error but it is + s->flags |= SF_ERR_LOCAL; // to mark that it comes from the proxy + http_set_term_flags(s); + + if (HAS_FILTERS(s)) + req->analysers |= AN_REQ_FLT_HTTP_HDRS; + + /* enable the minimally required analyzers to handle keep-alive and compression on the HTTP response */ + req->analysers &= (AN_REQ_HTTP_BODY | AN_REQ_FLT_HTTP_HDRS | AN_REQ_FLT_END); + req->analysers &= ~AN_REQ_FLT_XFER_DATA; + req->analysers |= AN_REQ_HTTP_XFER_BODY; + + s->scb->flags |= SC_FL_SND_ASAP; + s->flags |= SF_ASSIGNED; + goto done; + } + + /* check whether we have some ACLs set to redirect this request */ + list_for_each_entry(rule, &px->redirect_rules, list) { + if (rule->cond) { + int ret; + + ret = acl_exec_cond(rule->cond, px, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL); + ret = acl_pass(ret); + if (rule->cond->pol == ACL_COND_UNLESS) + ret = !ret; + if (!ret) + continue; + } + if (!http_apply_redirect_rule(rule, s, txn)) + goto return_int_err; + goto done; + } + + /* POST requests may be accompanied with an "Expect: 100-Continue" header. + * If this happens, then the data will not come immediately, so we must + * send everything we have without waiting. Note that due to the small gain + * in waiting for the body of the request, it's easier to simply put the + * SC_FL_SND_ASAP flag on the back SC any time. It's a one-shot flag so it + * will remove itself once used. + */ + s->scb->flags |= SC_FL_SND_ASAP; + + done: /* done with this analyser, continue with next ones that the calling + * points will have set, if any. + */ + req->analyse_exp = TICK_ETERNITY; + done_without_exp: /* done with this analyser, but don't reset the analyse_exp. */ + req->analysers &= ~an_bit; + s->current_rule = s->current_rule_list = NULL; + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 1; + + tarpit: + /* Allow cookie logging + */ + if (s->be->cookie_name || sess->fe->capture_name) + http_manage_client_side_cookies(s, req); + + /* When a connection is tarpitted, we use the tarpit timeout, + * which may be the same as the connect timeout if unspecified. + * If unset, then set it to zero because we really want it to + * eventually expire. We build the tarpit as an analyser. + */ + channel_htx_erase(&s->req, htx); + + /* wipe the request out so that we can drop the connection early + * if the client closes first. + */ + channel_dont_connect(req); + + req->analysers &= AN_REQ_FLT_END; /* remove switching rules etc... 
*/ + req->analysers |= AN_REQ_HTTP_TARPIT; + req->analyse_exp = tick_add_ifset(now_ms, s->be->timeout.tarpit); + if (!req->analyse_exp) + req->analyse_exp = tick_add(now_ms, 0); + stream_inc_http_err_ctr(s); + _HA_ATOMIC_INC(&sess->fe->fe_counters.denied_req); + if (s->flags & SF_BE_ASSIGNED) + _HA_ATOMIC_INC(&s->be->be_counters.denied_req); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->denied_req); + goto done_without_exp; + + deny: /* this request was blocked (denied) */ + + /* Allow cookie logging + */ + if (s->be->cookie_name || sess->fe->capture_name) + http_manage_client_side_cookies(s, req); + + s->logs.request_ts = now_ns; + stream_inc_http_err_ctr(s); + _HA_ATOMIC_INC(&sess->fe->fe_counters.denied_req); + if (s->flags & SF_BE_ASSIGNED) + _HA_ATOMIC_INC(&s->be->be_counters.denied_req); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->denied_req); + goto return_prx_err; + + return_fail_rewrite: + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_PRXCOND; + _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_rewrites); + if (s->flags & SF_BE_ASSIGNED) + _HA_ATOMIC_INC(&s->be->be_counters.failed_rewrites); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->failed_rewrites); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.failed_rewrites); + /* fall through */ + + return_int_err: + txn->status = 500; + s->flags |= SF_ERR_INTERNAL; + _HA_ATOMIC_INC(&sess->fe->fe_counters.internal_errors); + if (s->flags & SF_BE_ASSIGNED) + _HA_ATOMIC_INC(&s->be->be_counters.internal_errors); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->internal_errors); + goto return_prx_err; + + return_bad_req: + txn->status = 400; + _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_req); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->failed_req); + /* fall through */ + + return_prx_err: + http_set_term_flags(s); + http_reply_and_close(s, txn->status, http_error_message(s)); + /* fall through */ + + return_prx_cond: + http_set_term_flags(s); + + req->analysers &= AN_REQ_FLT_END; + req->analyse_exp = TICK_ETERNITY; + s->current_rule = s->current_rule_list = NULL; + DBG_TRACE_DEVEL("leaving on error", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_HTTP_ERR, s, txn); + return 0; + + return_prx_yield: + channel_dont_connect(req); + DBG_TRACE_DEVEL("waiting for more data", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 0; +} + +/* This function performs all the processing enabled for the current request. + * It returns 1 if the processing can continue on next analysers, or zero if it + * needs more data, encounters an error, or wants to immediately abort the + * request. It relies on buffers flags, and updates s->req.analysers. + */ +int http_process_request(struct stream *s, struct channel *req, int an_bit) +{ + struct session *sess = s->sess; + struct http_txn *txn = s->txn; + struct htx *htx; + struct connection *cli_conn = objt_conn(strm_sess(s)->origin); + + DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + + /* + * Right now, we know that we have processed the entire headers + * and that unwanted requests have been filtered out. We can do + * whatever we want with the remaining request. Also, now we + * may have separate values for ->fe, ->be. + */ + htx = htxbuf(&req->buf); + + /* + * 7: Now we can work with the cookies. 
+ * Note that doing so might move headers in the request, but + * the fields will stay coherent and the URI will not move. + * This should only be performed in the backend. + */ + if (s->be->cookie_name || sess->fe->capture_name) + http_manage_client_side_cookies(s, req); + + /* 8: Generate unique ID if a "unique-id-format" is defined. + * + * A unique ID is generated even when it is not sent to ensure that the ID can make use of + * fetches only available in the HTTP request processing stage. + */ + if (!LIST_ISEMPTY(&sess->fe->format_unique_id)) { + struct ist unique_id = stream_generate_unique_id(s, &sess->fe->format_unique_id); + + if (!isttest(unique_id)) { + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_RESOURCE; + goto return_int_err; + } + + /* send unique ID if a "unique-id-header" is defined */ + if (isttest(sess->fe->header_unique_id) && + unlikely(!http_add_header(htx, sess->fe->header_unique_id, unique_id))) + goto return_fail_rewrite; + } + + /* handle http extensions (if configured) */ + if (unlikely(!http_handle_7239_header(s, req))) + goto return_fail_rewrite; + if (unlikely(!http_handle_xff_header(s, req))) + goto return_fail_rewrite; + if (unlikely(!http_handle_xot_header(s, req))) + goto return_fail_rewrite; + + /* Filter the request headers if there are filters attached to the + * stream. + */ + if (HAS_FILTERS(s)) + req->analysers |= AN_REQ_FLT_HTTP_HDRS; + + /* If we have no server assigned yet and we're balancing on url_param + * with a POST request, we may be interested in checking the body for + * that parameter. This will be done in another analyser. + */ + if (!(s->flags & (SF_ASSIGNED|SF_DIRECT)) && + s->txn->meth == HTTP_METH_POST && + (s->be->lbprm.algo & BE_LB_ALGO) == BE_LB_ALGO_PH) { + channel_dont_connect(req); + req->analysers |= AN_REQ_HTTP_BODY; + } + + req->analysers &= ~AN_REQ_FLT_XFER_DATA; + req->analysers |= AN_REQ_HTTP_XFER_BODY; + + /* We expect some data from the client. Unless we know for sure + * we already have a full request, we have to re-enable quick-ack + * in case we previously disabled it, otherwise we might cause + * the client to delay further data. + */ + if ((sess->listener && (sess->listener->bind_conf->options & BC_O_NOQUICKACK)) && !(htx->flags & HTX_FL_EOM)) + conn_set_quickack(cli_conn, 1); + + /************************************************************* + * OK, that's finished for the headers. We have done what we * + * could. Let's switch to the DATA state. 
* + ************************************************************/ + req->analyse_exp = TICK_ETERNITY; + req->analysers &= ~an_bit; + + s->logs.request_ts = now_ns; + /* OK let's go on with the BODY now */ + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 1; + + return_fail_rewrite: + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_PRXCOND; + _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_rewrites); + if (s->flags & SF_BE_ASSIGNED) + _HA_ATOMIC_INC(&s->be->be_counters.failed_rewrites); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->failed_rewrites); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.failed_rewrites); + /* fall through */ + + return_int_err: + txn->status = 500; + s->flags |= SF_ERR_INTERNAL; + _HA_ATOMIC_INC(&sess->fe->fe_counters.internal_errors); + if (s->flags & SF_BE_ASSIGNED) + _HA_ATOMIC_INC(&s->be->be_counters.internal_errors); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->internal_errors); + + http_set_term_flags(s); + http_reply_and_close(s, txn->status, http_error_message(s)); + + DBG_TRACE_DEVEL("leaving on error", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_HTTP_ERR, s, txn); + return 0; +} + +/* This function is an analyser which processes the HTTP tarpit. It always + * returns zero, at the beginning because it prevents any other processing + * from occurring, and at the end because it terminates the request. + */ +int http_process_tarpit(struct stream *s, struct channel *req, int an_bit) +{ + struct http_txn *txn = s->txn; + + DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn, &txn->req); + /* This connection is being tarpitted. The CLIENT side has + * already set the connect expiration date to the right + * timeout. We just have to check that the client is still + * there and that the timeout has not expired. + */ + channel_dont_connect(req); + if (!(s->scf->flags & (SC_FL_ABRT_DONE|SC_FL_EOS)) && + !tick_is_expired(req->analyse_exp, now_ms)) { + /* Be sure to drain all data from the request channel */ + channel_htx_erase(req, htxbuf(&req->buf)); + DBG_TRACE_DEVEL("waiting for tarpit timeout expiry", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 0; + } + + + /* We will set the queue timer to the time spent, just for + * logging purposes. We fake a 500 server error, so that the + * attacker will not suspect his connection has been tarpitted. + * It will not cause trouble to the logs because we can exclude + * the tarpitted connections by filtering on the 'PT' status flags. + */ + s->logs.t_queue = ns_to_ms(now_ns - s->logs.accept_ts); + + http_set_term_flags(s); + http_reply_and_close(s, txn->status, (!(s->scf->flags & SC_FL_ERROR) ? http_error_message(s) : NULL)); + + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 0; +} + +/* This function is an analyser which waits for the HTTP request body. It waits + * for either the buffer to be full, or the full advertised contents to have + * reached the buffer. It must only be called after the standard HTTP request + * processing has occurred, because it expects the request to be parsed and will + * look for the Expect header. It may send a 100-Continue interim response. It + * returns zero if it needs to read more data, or 1 once it has completed its + * analysis. 
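+ *
+ * Sketch of how this analyser is commonly enabled (assuming PR_O_WREQ_BODY,
+ * checked in http_wait_for_request() above, is the flag behind this option):
+ *   option http-buffer-request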
+ */ +int http_wait_for_request_body(struct stream *s, struct channel *req, int an_bit) +{ + struct session *sess = s->sess; + struct http_txn *txn = s->txn; + + DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn, &s->txn->req); + + + switch (http_wait_for_msg_body(s, req, s->be->timeout.httpreq, 0)) { + case HTTP_RULE_RES_CONT: + goto http_end; + case HTTP_RULE_RES_YIELD: + goto missing_data_or_waiting; + case HTTP_RULE_RES_BADREQ: + goto return_bad_req; + case HTTP_RULE_RES_ERROR: + goto return_int_err; + case HTTP_RULE_RES_ABRT: + goto return_prx_cond; + default: + goto return_int_err; + } + + http_end: + /* The situation will not evolve, so let's give up on the analysis. */ + s->logs.request_ts = now_ns; /* update the request timer to reflect full request */ + req->analysers &= ~an_bit; + req->analyse_exp = TICK_ETERNITY; + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 1; + + missing_data_or_waiting: + channel_dont_connect(req); + DBG_TRACE_DEVEL("waiting for more data", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 0; + + return_int_err: + txn->status = 500; + s->flags |= SF_ERR_INTERNAL; + _HA_ATOMIC_INC(&sess->fe->fe_counters.internal_errors); + if (s->flags & SF_BE_ASSIGNED) + _HA_ATOMIC_INC(&s->be->be_counters.internal_errors); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->internal_errors); + goto return_prx_err; + + return_bad_req: /* let's centralize all bad requests */ + txn->status = 400; + _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_req); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->failed_req); + /* fall through */ + + return_prx_err: + http_set_term_flags(s); + http_reply_and_close(s, txn->status, http_error_message(s)); + /* fall through */ + + return_prx_cond: + http_set_term_flags(s); + + req->analysers &= AN_REQ_FLT_END; + req->analyse_exp = TICK_ETERNITY; + DBG_TRACE_DEVEL("leaving on error", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_HTTP_ERR, s, txn); + return 0; +} + +/* This function is an analyser which forwards request body (including chunk + * sizes if any). It is called as soon as we must forward, even if we forward + * zero byte. The only situation where it must not be called is when we're in + * tunnel mode and we want to forward till the close. It's used both to forward + * remaining data and to resync after end of body. It expects the msg_state to + * be between MSG_BODY and MSG_DONE (inclusive). It returns zero if it needs to + * read more data, or 1 once we can go on with next request or end the stream. + * When in MSG_DATA or MSG_TRAILERS, it will automatically forward chunk_len + * bytes of pending data + the headers if not already done. + */ +int http_request_forward_body(struct stream *s, struct channel *req, int an_bit) +{ + struct session *sess = s->sess; + struct http_txn *txn = s->txn; + struct http_msg *msg = &txn->req; + struct htx *htx; + short status = 0; + int ret; + + DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn, msg); + + htx = htxbuf(&req->buf); + + if (htx->flags & HTX_FL_PARSING_ERROR) + goto return_bad_req; + if (htx->flags & HTX_FL_PROCESSING_ERROR) + goto return_int_err; + + /* Note that we don't have to send 100-continue back because we don't + * need the data to complete our job, and it's up to the server to + * decide whether to return 100, 417 or anything else in return of + * an "Expect: 100-continue" header. 
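+ *
+ * e.g. (sketch of the usual exchange): a client sending "Expect: 100-continue"
+ * has that header forwarded untouched; the server then replies with an interim
+ * "100 Continue", a "417 Expectation Failed", or directly with a final status.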
+ */ + if (msg->msg_state == HTTP_MSG_BODY) + msg->msg_state = HTTP_MSG_DATA; + + /* in most states, we should abort in case of early close */ + channel_auto_close(req); + + if (req->to_forward) { + if (req->to_forward == CHN_INFINITE_FORWARD) { + if (s->scf->flags & SC_FL_EOI) + msg->msg_state = HTTP_MSG_ENDING; + } + else { + /* We can't process the buffer's contents yet */ + req->flags |= CF_WAKE_WRITE; + goto missing_data_or_waiting; + } + } + + if (msg->msg_state >= HTTP_MSG_ENDING) + goto ending; + + if (txn->meth == HTTP_METH_CONNECT) { + msg->msg_state = HTTP_MSG_ENDING; + goto ending; + } + + /* Forward input data. We get it by removing all outgoing data not + * forwarded yet from HTX data size. If there are some data filters, we + * let them decide the amount of data to forward. + */ + if (HAS_REQ_DATA_FILTERS(s)) { + ret = flt_http_payload(s, msg, htx->data); + if (ret < 0) + goto return_bad_req; + c_adv(req, ret); + } + else { + c_adv(req, htx->data - co_data(req)); + if ((global.tune.options & GTUNE_USE_FAST_FWD) && (msg->flags & HTTP_MSGF_XFER_LEN)) + channel_htx_forward_forever(req, htx); + } + + if (htx->data != co_data(req)) + goto missing_data_or_waiting; + + /* Check if the end-of-message is reached and if so, switch the message + * in HTTP_MSG_ENDING state. Then if all data was marked to be + * forwarded, set the state to HTTP_MSG_DONE. + */ + if (!(htx->flags & HTX_FL_EOM)) + goto missing_data_or_waiting; + + msg->msg_state = HTTP_MSG_ENDING; + + ending: + s->scb->flags &= ~SC_FL_SND_EXP_MORE; /* no more data are expected to be sent */ + + /* other states, ENDING...TUNNEL */ + if (msg->msg_state >= HTTP_MSG_DONE) + goto done; + + if (HAS_REQ_DATA_FILTERS(s)) { + ret = flt_http_end(s, msg); + if (ret <= 0) { + if (!ret) + goto missing_data_or_waiting; + goto return_bad_req; + } + } + + if (txn->meth == HTTP_METH_CONNECT) + msg->msg_state = HTTP_MSG_TUNNEL; + else { + msg->msg_state = HTTP_MSG_DONE; + req->to_forward = 0; + } + + done: + /* we don't want to forward closes on DONE except in tunnel mode. */ + if (!(txn->flags & TX_CON_WANT_TUN)) + channel_dont_close(req); + + if ((s->scb->flags & SC_FL_SHUT_DONE) && co_data(req)) { + /* request errors are most likely due to the server aborting the + * transfer. But handle server aborts only if there is no + * response. Otherwise, give the response a chance to be + * forwarded first. + */ + if (htx_is_empty(htxbuf(&s->res.buf))) + goto return_srv_abort; + } + + http_end_request(s); + if (!(req->analysers & an_bit)) { + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 1; + } + + /* If "option abortonclose" is set on the backend, we want to monitor + * the client's connection and forward any shutdown notification to the + * server, which will decide whether to close or to go on processing the + * request. We only do that in tunnel mode, and not in other modes since + * it can be abused to exhaust source ports. */ + if (s->be->options & PR_O_ABRT_CLOSE) { + channel_auto_read(req); + if ((s->scf->flags & (SC_FL_ABRT_DONE|SC_FL_EOS)) && !(txn->flags & TX_CON_WANT_TUN)) + s->scb->flags |= SC_FL_NOLINGER; + channel_auto_close(req); + } + else if (s->txn->meth == HTTP_METH_POST) { + /* POST requests may require reading an extra CRLF sent by broken + * browsers, which could cause an RST to be sent upon close + * on some systems (eg: Linux). 
*/ + channel_auto_read(req); + } + DBG_TRACE_DEVEL("waiting for the end of the HTTP txn", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 0; + + missing_data_or_waiting: + /* stop waiting for data if the input is closed before the end */ + if (msg->msg_state < HTTP_MSG_ENDING && (s->scf->flags & (SC_FL_ABRT_DONE|SC_FL_EOS))) + goto return_cli_abort; + + waiting: + /* waiting for the last bits to leave the buffer */ + if (s->scb->flags & SC_FL_SHUT_DONE) { + /* Handle server aborts only if there is no response. Otherwise, + * give the response a chance to be forwarded first. + */ + if (htx_is_empty(htxbuf(&s->res.buf))) + goto return_srv_abort; + } + + /* When TE: chunked is used, we need to get there again to parse remaining + * chunks even if the client has closed, so we don't want to set CF_DONTCLOSE. + * And when content-length is used, we never want to let the possible + * shutdown be forwarded to the other side, as the state machine will + * take care of it once the client responds. It's also important to + * prevent TIME_WAITs from accumulating on the backend side, and for + * HTTP/2 where the last frame comes with a shutdown. + */ + if (msg->flags & HTTP_MSGF_XFER_LEN) + channel_dont_close(req); + + /* We know that more data are expected, but we couldn't send more than + * what we did. So we always set the SC_FL_SND_EXP_MORE flag so that the + * system knows it must not set a PUSH on this first part. Interactive + * modes are already handled by the stream sock layer. We must not do + * this in content-length mode because it could present the MSG_MORE + * flag with the last block of forwarded data, which would cause an + * additional delay to be observed by the receiver. + */ + if (HAS_REQ_DATA_FILTERS(s)) + s->scb->flags |= SC_FL_SND_EXP_MORE; + + DBG_TRACE_DEVEL("waiting for more data to forward", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 0; + + return_cli_abort: + _HA_ATOMIC_INC(&sess->fe->fe_counters.cli_aborts); + _HA_ATOMIC_INC(&s->be->be_counters.cli_aborts); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->cli_aborts); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.cli_aborts); + if (!(s->flags & SF_ERR_MASK)) + s->flags |= ((req->flags & CF_READ_TIMEOUT) ? SF_ERR_CLITO : SF_ERR_CLICL); + status = 400; + goto return_prx_cond; + + return_srv_abort: + _HA_ATOMIC_INC(&sess->fe->fe_counters.srv_aborts); + _HA_ATOMIC_INC(&s->be->be_counters.srv_aborts); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->srv_aborts); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.srv_aborts); + if (!(s->flags & SF_ERR_MASK)) + s->flags |= ((req->flags & CF_WRITE_TIMEOUT) ? 
SF_ERR_SRVTO : SF_ERR_SRVCL); + status = 502; + goto return_prx_cond; + + return_int_err: + s->flags |= SF_ERR_INTERNAL; + _HA_ATOMIC_INC(&sess->fe->fe_counters.internal_errors); + _HA_ATOMIC_INC(&s->be->be_counters.internal_errors); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->internal_errors); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.internal_errors); + status = 500; + goto return_prx_cond; + + return_bad_req: + _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_req); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->failed_req); + status = 400; + /* fall through */ + + return_prx_cond: + http_set_term_flags(s); + if (txn->status > 0) { + /* Note: we don't send any error if some data were already sent */ + http_reply_and_close(s, txn->status, NULL); + } else { + txn->status = status; + http_reply_and_close(s, txn->status, http_error_message(s)); + } + DBG_TRACE_DEVEL("leaving on error ", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_HTTP_ERR, s, txn); + return 0; +} + +/* Reset the stream and the backend stream connector to a situation suitable for attempting a new connection */ +/* Returns 0 if we can attempt to retry, -1 otherwise */ +static __inline int do_l7_retry(struct stream *s, struct stconn *sc) +{ + struct channel *req, *res; + int co_data; + + if (s->conn_retries >= s->be->conn_retries) + return -1; + s->conn_retries++; + if (objt_server(s->target)) { + if (s->flags & SF_CURR_SESS) { + s->flags &= ~SF_CURR_SESS; + _HA_ATOMIC_DEC(&__objt_server(s->target)->cur_sess); + } + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.retries); + } + _HA_ATOMIC_INC(&s->be->be_counters.retries); + + req = &s->req; + res = &s->res; + + /* Remove any write error from the request, and read error from the response */ + s->scf->flags &= ~(SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_ABRT_WANTED); + req->flags &= ~CF_WRITE_TIMEOUT; + res->flags &= ~(CF_READ_TIMEOUT | CF_READ_EVENT); + res->analysers &= AN_RES_FLT_END; + s->conn_err_type = STRM_ET_NONE; + s->flags &= ~(SF_CONN_EXP | SF_ERR_MASK | SF_FINST_MASK); + s->conn_exp = TICK_ETERNITY; + stream_choose_redispatch(s); + res->to_forward = 0; + res->analyse_exp = TICK_ETERNITY; + res->total = 0; + + s->scb->flags &= ~(SC_FL_ERROR|SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED); + if (sc_reset_endp(s->scb) < 0) { + s->flags |= SF_ERR_INTERNAL; + return -1; + } + + b_free(&req->buf); + /* Swap the L7 buffer with the channel buffer */ + /* We know we stored the co_data as b_data, so get it there */ + co_data = b_data(&s->txn->l7_buffer); + b_set_data(&s->txn->l7_buffer, b_size(&s->txn->l7_buffer)); + b_xfer(&req->buf, &s->txn->l7_buffer, b_data(&s->txn->l7_buffer)); + co_set_data(req, co_data); + + DBG_TRACE_DEVEL("perform a L7 retry", STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, s->txn); + + b_reset(&res->buf); + co_set_data(res, 0); + return 0; +} + +/* This stream analyser waits for a complete HTTP response. It returns 1 if the + * processing can continue on next analysers, or zero if it either needs more + * data or wants to immediately abort the response (eg: timeout, error, ...). It + * is tied to AN_RES_WAIT_HTTP and may remove itself from s->res.analysers + * when it has nothing left to do, and may remove any analyser when it wants to + * abort. + */ +int http_wait_for_response(struct stream *s, struct channel *rep, int an_bit) +{ + /* + * We will analyze a complete HTTP response to check its syntax. 
+ * + * Once the start line and all headers are received, we may perform a + * capture of the error (if any), and we will set a few fields. We also + * prepare logging and finally capture headers. + */ + struct session *sess = s->sess; + struct http_txn *txn = s->txn; + struct http_msg *msg = &txn->rsp; + struct htx *htx; + struct connection *srv_conn; + struct htx_sl *sl; + int n; + + DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn, msg); + + htx = htxbuf(&rep->buf); + + /* Parsing errors are caught here */ + if (htx->flags & HTX_FL_PARSING_ERROR) + goto return_bad_res; + if (htx->flags & HTX_FL_PROCESSING_ERROR) + goto return_int_err; + + /* + * Now we quickly check if we have found a full valid response. + * If not, we check the FD and buffer states before leaving. + * A full response is indicated by the fact that we have seen + * the double LF/CRLF, so the state is >= HTTP_MSG_BODY. Invalid + * responses are checked first. + * + * Depending on whether the client is still there or not, we + * may send an error response back or not. Note that normally + * we should only check for HTTP status there, and check I/O + * errors somewhere else. + */ + next_one: + if (unlikely(htx_is_empty(htx) || htx->first == -1)) { + /* 1: have we encountered a read error ? */ + if (s->scb->flags & SC_FL_ERROR) { + struct connection *conn = sc_conn(s->scb); + + + if ((txn->flags & TX_L7_RETRY) && + (s->be->retry_type & PR_RE_DISCONNECTED) && + (!conn || conn->err_code != CO_ER_SSL_EARLY_FAILED)) { + if (co_data(rep) || do_l7_retry(s, s->scb) == 0) + return 0; + } + + /* Perform an L7 retry on an empty response or when the server refuses the early data. */ + if ((txn->flags & TX_L7_RETRY) && + (s->be->retry_type & PR_RE_EARLY_ERROR) && + conn && conn->err_code == CO_ER_SSL_EARLY_FAILED && + do_l7_retry(s, s->scb) == 0) { + DBG_TRACE_DEVEL("leaving on L7 retry", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 0; + } + + if (txn->flags & TX_NOT_FIRST) + goto abort_keep_alive; + + _HA_ATOMIC_INC(&s->be->be_counters.failed_resp); + if (objt_server(s->target)) { + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.failed_resp); + health_adjust(__objt_server(s->target), HANA_STATUS_HTTP_READ_ERROR); + } + + /* if the server refused the early data, just send a 425 */ + if (conn && conn->err_code == CO_ER_SSL_EARLY_FAILED) + txn->status = 425; + else { + txn->status = 502; + stream_inc_http_fail_ctr(s); + } + + s->scb->flags |= SC_FL_NOLINGER; + + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_SRVCL; + http_set_term_flags(s); + + http_reply_and_close(s, txn->status, http_error_message(s)); + DBG_TRACE_DEVEL("leaving on error", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_HTTP_ERR, s, txn); + return 0; + } + + /* 2: read timeout : return a 504 to the client.
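+ * As an illustration (sketch, section and server names are hypothetical), the retry attempted below is armed by the backend configuration, e.g.: + * + * backend be_app + * retries 3 + * retry-on response-timeout conn-failure + * + * where "response-timeout" sets PR_RE_TIMEOUT; the 504 below only reaches the client once no L7 retry can be performed anymore.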
*/ + else if (rep->flags & CF_READ_TIMEOUT) { + if ((txn->flags & TX_L7_RETRY) && + (s->be->retry_type & PR_RE_TIMEOUT)) { + if (co_data(rep) || do_l7_retry(s, s->scb) == 0) { + DBG_TRACE_DEVEL("leaving on L7 retry", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 0; + } + } + _HA_ATOMIC_INC(&s->be->be_counters.failed_resp); + if (objt_server(s->target)) { + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.failed_resp); + health_adjust(__objt_server(s->target), HANA_STATUS_HTTP_READ_TIMEOUT); + } + + txn->status = 504; + stream_inc_http_fail_ctr(s); + s->scb->flags |= SC_FL_NOLINGER; + + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_SRVTO; + http_set_term_flags(s); + + http_reply_and_close(s, txn->status, http_error_message(s)); + DBG_TRACE_DEVEL("leaving on error", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_HTTP_ERR, s, txn); + return 0; + } + + /* 3: client abort with an abortonclose */ + else if ((s->scb->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) && (s->scb->flags & SC_FL_SHUT_DONE) && + (s->scf->flags & (SC_FL_EOS|SC_FL_ABRT_DONE))) { + _HA_ATOMIC_INC(&sess->fe->fe_counters.cli_aborts); + _HA_ATOMIC_INC(&s->be->be_counters.cli_aborts); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->cli_aborts); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.cli_aborts); + + txn->status = 400; + + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_CLICL; + http_set_term_flags(s); + + http_reply_and_close(s, txn->status, http_error_message(s)); + + /* process_stream() will take care of the error */ + DBG_TRACE_DEVEL("leaving on error", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_HTTP_ERR, s, txn); + return 0; + } + + /* 4: close from server, capture the response if the server has started to respond */ + else if (s->scb->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) { + if ((txn->flags & TX_L7_RETRY) && + (s->be->retry_type & PR_RE_DISCONNECTED)) { + if (co_data(rep) || do_l7_retry(s, s->scb) == 0) { + DBG_TRACE_DEVEL("leaving on L7 retry", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 0; + } + } + + if (txn->flags & TX_NOT_FIRST) + goto abort_keep_alive; + + _HA_ATOMIC_INC(&s->be->be_counters.failed_resp); + if (objt_server(s->target)) { + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.failed_resp); + health_adjust(__objt_server(s->target), HANA_STATUS_HTTP_BROKEN_PIPE); + } + + txn->status = 502; + stream_inc_http_fail_ctr(s); + s->scb->flags |= SC_FL_NOLINGER; + + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_SRVCL; + http_set_term_flags(s); + + http_reply_and_close(s, txn->status, http_error_message(s)); + DBG_TRACE_DEVEL("leaving on error", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_HTTP_ERR, s, txn); + return 0; + } + + /* 5: write error to client (we don't send any message then) */ + else if (sc_ep_test(s->scf, SE_FL_ERR_PENDING)) { + if (txn->flags & TX_NOT_FIRST) + goto abort_keep_alive; + + _HA_ATOMIC_INC(&s->be->be_counters.failed_resp); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.failed_resp); + rep->analysers &= AN_RES_FLT_END; + + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_CLICL; + http_set_term_flags(s); + + /* process_stream() will take care of the error */ + DBG_TRACE_DEVEL("leaving on error", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_HTTP_ERR, s, txn); + return 0; + } + + channel_dont_close(rep); + s->scb->flags |= SC_FL_RCV_ONCE; /* try to get back here ASAP */ + DBG_TRACE_DEVEL("waiting for more data", + 
STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 0; + } + + /* More interesting part now : we know that we have a complete + * response which at least looks like HTTP. We have an indicator + * of each header's length, so we can parse them quickly. + */ + BUG_ON(htx_get_first_type(htx) != HTX_BLK_RES_SL); + sl = http_get_stline(htx); + + /* Perform a L7 retry because of the status code */ + if ((txn->flags & TX_L7_RETRY) && + l7_status_match(s->be, sl->info.res.status) && + do_l7_retry(s, s->scb) == 0) { + DBG_TRACE_DEVEL("leaving on L7 retry", STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 0; + } + + /* Now, L7 buffer is useless, it can be released */ + b_free(&txn->l7_buffer); + + msg->msg_state = HTTP_MSG_BODY; + + + /* 0: we might have to print this header in debug mode */ + if (unlikely((global.mode & MODE_DEBUG) && + (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)))) { + int32_t pos; + + http_debug_stline("srvrep", s, sl); + + for (pos = htx_get_first(htx); pos != -1; pos = htx_get_next(htx, pos)) { + struct htx_blk *blk = htx_get_blk(htx, pos); + enum htx_blk_type type = htx_get_blk_type(blk); + + if (type == HTX_BLK_EOH) + break; + if (type != HTX_BLK_HDR) + continue; + + http_debug_hdr("srvhdr", s, + htx_get_blk_name(htx, blk), + htx_get_blk_value(htx, blk)); + } + } + + /* 1: get the status code and the version. Also set HTTP flags */ + txn->server_status = txn->status = sl->info.res.status; + if (sl->flags & HTX_SL_F_VER_11) + msg->flags |= HTTP_MSGF_VER_11; + if (sl->flags & HTX_SL_F_XFER_LEN) { + msg->flags |= HTTP_MSGF_XFER_LEN; + if (sl->flags & HTX_SL_F_CLEN) + msg->flags |= HTTP_MSGF_CNT_LEN; + else if (sl->flags & HTX_SL_F_CHNK) + msg->flags |= HTTP_MSGF_TE_CHNK; + } + if (sl->flags & HTX_SL_F_BODYLESS) + msg->flags |= HTTP_MSGF_BODYLESS; + if (sl->flags & HTX_SL_F_CONN_UPG) + msg->flags |= HTTP_MSGF_CONN_UPG; + + n = txn->status / 100; + if (n < 1 || n > 5) + n = 0; + + /* when the client triggers a 4xx from the server, it's most often due + * to a missing object or permission. These events should be tracked + * because if they happen often, it may indicate a brute force or a + * vulnerability scan. + */ + if (n == 4) + stream_inc_http_err_ctr(s); + + if (n == 5 && txn->status != 501 && txn->status != 505) + stream_inc_http_fail_ctr(s); + + if (objt_server(s->target)) { + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.p.http.rsp[n]); + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.p.http.cum_req); + } + + /* Adjust server's health based on status code. Note: status codes 501 + * and 505 are triggered on demand by client request, so we must not + * count them as server failures. + */ + if (objt_server(s->target)) { + if (txn->status >= 100 && (txn->status < 500 || txn->status == 501 || txn->status == 505)) + health_adjust(__objt_server(s->target), HANA_STATUS_HTTP_OK); + else + health_adjust(__objt_server(s->target), HANA_STATUS_HTTP_STS); + } + + /* + * We may be facing a 100-continue response, or any other informational + * 1xx response which is non-final, in which case this is not the right + * response, and we're waiting for the next one. Let's allow this response + * to go to the client and wait for the next one. There's an exception for + * 101 which is used later in the code to switch protocols. 
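+ * As an illustration of the test below: + * + * status 100 -> forward it, loop back and wait for the next response + * status 101 -> fall through (protocol switch, handled later) + * status 102..199 -> forward it, loop back (eg: 103 early hints) + *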
+ */ + if (txn->status < 200 && + (txn->status == 100 || txn->status >= 102)) { + FLT_STRM_CB(s, flt_http_reset(s, msg)); + htx->first = channel_htx_fwd_headers(rep, htx); + msg->msg_state = HTTP_MSG_RPBEFORE; + msg->flags = 0; + txn->server_status = txn->status = 0; + s->logs.t_data = -1; /* was not a response yet */ + s->scf->flags |= SC_FL_SND_ASAP; /* Send ASAP informational messages */ + goto next_one; + } + + /* A 101-switching-protocols response must contain a Connection header with the + * "upgrade" option, and so must the request. It means both sides agree to + * upgrade. The check is not very strict because there is no test on the Upgrade + * header content, but it is probably strong enough for now. + */ + if (txn->status == 101 && + (!(txn->req.flags & HTTP_MSGF_CONN_UPG) || !(txn->rsp.flags & HTTP_MSGF_CONN_UPG))) + goto return_bad_res; + + /* + * 2: check for cacheability. + */ + + switch (txn->status) { + case 200: + case 203: + case 204: + case 206: + case 300: + case 301: + case 404: + case 405: + case 410: + case 414: + case 501: + break; + default: + /* RFC7231#6.1: + * Responses with status codes that are defined as + * cacheable by default (e.g., 200, 203, 204, 206, + * 300, 301, 404, 405, 410, 414, and 501 in this + * specification) can be reused by a cache with + * heuristic expiration unless otherwise indicated + * by the method definition or explicit cache + * controls [RFC7234]; all other status codes are + * not cacheable by default. + */ + txn->flags &= ~(TX_CACHEABLE | TX_CACHE_COOK); + break; + } + + /* + * 3: we may need to capture headers + */ + s->logs.logwait &= ~LW_RESP; + if (unlikely((s->logs.logwait & LW_RSPHDR) && s->res_cap)) + http_capture_headers(htx, s->res_cap, sess->fe->rsp_cap); + + /* Skip parsing if no content length is possible. */ + if (unlikely((txn->meth == HTTP_METH_CONNECT && txn->status >= 200 && txn->status < 300) || + txn->status == 101)) { + /* Either we've established an explicit tunnel, or we're + * switching the protocol. In both cases, we're very unlikely + * to understand the next protocols. We have to switch to tunnel + * mode, so that we transfer the request and responses then let + * this protocol pass unmodified. When we later implement specific + * parsers for such protocols, we'll want to check the Upgrade + * header which contains information about that protocol for + * responses with status 101 (eg: see RFC2817 about TLS). + */ + txn->flags |= TX_CON_WANT_TUN; + } + + /* check for NTLM authentication headers in 401 (WWW-Authenticate) and + * 407 (Proxy-Authenticate) responses and set the connection to private + */ + srv_conn = sc_conn(s->scb); + if (srv_conn) { + struct ist hdr; + struct http_hdr_ctx ctx; + + if (txn->status == 401) + hdr = ist("WWW-Authenticate"); + else if (txn->status == 407) + hdr = ist("Proxy-Authenticate"); + else + goto end; + + ctx.blk = NULL; + while (http_find_header(htx, hdr, &ctx, 0)) { + /* If www-authenticate contains "Negotiate", "Nego2", or "NTLM", + * possibly followed by blanks and a base64 string, the connection + * is private. Since it's a mess to deal with, we only check for + * values starting with "NTLM" or "Nego". Note that often multiple + * headers are sent by the server there.
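+ * For example (illustrative values), each of the following marks the connection as private: + * + * WWW-Authenticate: NTLM + * WWW-Authenticate: Negotiate + * Proxy-Authenticate: NTLM TlRMTVNTUAABAAAA... (base64 challenge elided) + * + * while a value such as "Basic realm=..." does not match.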
+ */ + if ((ctx.value.len >= 4 && strncasecmp(ctx.value.ptr, "Nego", 4) == 0) || + (ctx.value.len >= 4 && strncasecmp(ctx.value.ptr, "NTLM", 4) == 0)) { + sess->flags |= SESS_FL_PREFER_LAST; + conn_set_owner(srv_conn, sess, NULL); + conn_set_private(srv_conn); + /* If it fails now, the same will be done in the mux->detach() callback */ + session_add_conn(srv_conn->owner, srv_conn, srv_conn->target); + break; + } + } + } + + end: + /* we want to have the response time before we start processing it */ + s->logs.t_data = ns_to_ms(now_ns - s->logs.accept_ts); + + /* end of job, return OK */ + rep->analysers &= ~an_bit; + rep->analyse_exp = TICK_ETERNITY; + channel_auto_close(rep); + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 1; + + return_int_err: + _HA_ATOMIC_INC(&sess->fe->fe_counters.internal_errors); + _HA_ATOMIC_INC(&s->be->be_counters.internal_errors); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->internal_errors); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.internal_errors); + txn->status = 500; + s->flags |= SF_ERR_INTERNAL; + goto return_prx_cond; + + return_bad_res: + _HA_ATOMIC_INC(&s->be->be_counters.failed_resp); + if (objt_server(s->target)) { + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.failed_resp); + health_adjust(__objt_server(s->target), HANA_STATUS_HTTP_HDRRSP); + } + if ((s->be->retry_type & PR_RE_JUNK_REQUEST) && + (txn->flags & TX_L7_RETRY) && + do_l7_retry(s, s->scb) == 0) { + DBG_TRACE_DEVEL("leaving on L7 retry", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 0; + } + txn->status = 502; + stream_inc_http_fail_ctr(s); + /* fall through */ + + return_prx_cond: + http_set_term_flags(s); + http_reply_and_close(s, txn->status, http_error_message(s)); + + s->scb->flags |= SC_FL_NOLINGER; + DBG_TRACE_DEVEL("leaving on error", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_HTTP_ERR, s, txn); + return 0; + + abort_keep_alive: + /* A keep-alive request to the server failed on a network error. + * The client is required to retry. We need to close without returning + * any other information so that the client retries. + */ + txn->status = 0; + s->logs.logwait = 0; + s->logs.level = 0; + s->scf->flags &= ~SC_FL_SND_EXP_MORE; /* speed up sending a previous response */ + http_reply_and_close(s, txn->status, NULL); + DBG_TRACE_DEVEL("leaving by closing K/A connection", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 0; +} + +/* This function performs all the processing enabled for the current response. + * It normally returns 1 unless it wants to break. It relies on buffer flags, + * and updates s->res.analysers. It might make sense to explode it into several + * other functions. It works like process_request (see indications above). + */ +int http_process_res_common(struct stream *s, struct channel *rep, int an_bit, struct proxy *px) +{ + struct session *sess = s->sess; + struct http_txn *txn = s->txn; + struct http_msg *msg = &txn->rsp; + struct htx *htx; + struct proxy *cur_proxy; + enum rule_result ret = HTTP_RULE_RES_CONT; + + if (unlikely(msg->msg_state < HTTP_MSG_BODY)) /* we need more data */ + return 0; + + DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn, msg); + + htx = htxbuf(&rep->buf); + + /* The stats applet needs to adjust the Connection header but we don't + * apply any filter there.
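+ * Further below, http-response rules are evaluated backend first, then frontend; as a sketch (illustrative section names): + * + * backend be_app + * http-response set-header X-Stage be # evaluated first + * frontend fe_main + * http-response set-header X-Stage fe # evaluated second, wins + *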
+ */ + if (unlikely(objt_applet(s->target) == &http_stats_applet)) { + rep->analysers &= ~an_bit; + rep->analyse_exp = TICK_ETERNITY; + goto end; + } + + /* + * We will have to evaluate the filters. + * As opposed to version 1.2, now they will be evaluated in the + * filters order and not in the header order. This means that + * each filter has to be validated among all headers. + * + * Filters are tried with ->be first, then with ->fe if it is + * different from ->be. + * + * Maybe we are in a resume condition. In this case I choose the + * "struct proxy" which contains the rule list matching the resume + * pointer. If none of these "struct proxy" match, I initialise + * the process with the first one. + * + * In fact, I check only correspondence between the current list + * pointer and the ->fe rule list. If it doesn't match, I initialize + * the loop with the ->be. + */ + if (s->current_rule_list == &sess->fe->http_res_rules || + (sess->fe->defpx && s->current_rule_list == &sess->fe->defpx->http_res_rules)) + cur_proxy = sess->fe; + else + cur_proxy = s->be; + + while (1) { + /* evaluate http-response rules */ + if (ret == HTTP_RULE_RES_CONT || ret == HTTP_RULE_RES_STOP) { + struct list *def_rules, *rules; + + def_rules = ((cur_proxy->defpx && (cur_proxy == s->be || cur_proxy->defpx != s->be->defpx)) ? &cur_proxy->defpx->http_res_rules : NULL); + rules = &cur_proxy->http_res_rules; + + ret = http_res_get_intercept_rule(cur_proxy, def_rules, rules, s, 0); + + switch (ret) { + case HTTP_RULE_RES_YIELD: /* some data is missing, call the function again later. */ + goto return_prx_yield; + + case HTTP_RULE_RES_CONT: + case HTTP_RULE_RES_STOP: /* nothing to do */ + break; + + case HTTP_RULE_RES_DENY: /* deny or tarpit */ + goto deny; + + case HTTP_RULE_RES_ABRT: /* abort request, response already sent */ + goto return_prx_cond; + + case HTTP_RULE_RES_DONE: /* OK, but terminate request processing (eg: redirect) */ + goto done; + + case HTTP_RULE_RES_BADREQ: /* failed with a bad request */ + goto return_bad_res; + + case HTTP_RULE_RES_ERROR: /* failed with an internal error */ + goto return_int_err; + } + + } + + /* check whether we're already working on the frontend */ + if (cur_proxy == sess->fe) + break; + cur_proxy = sess->fe; + } + + /* OK that's all we can do for 1xx responses */ + if (unlikely(txn->status < 200 && txn->status != 101)) + goto end; + + /* + * Now check for a server cookie. + */ + if (s->be->cookie_name || sess->fe->capture_name || (s->be->options & PR_O_CHK_CACHE)) + http_manage_server_side_cookies(s, rep); + + /* + * Check for cache-control or pragma headers if required. + */ + if ((s->be->options & PR_O_CHK_CACHE) || (s->be->ck_opts & PR_CK_NOC)) + http_check_response_for_cacheability(s, rep); + + /* + * Add server cookie in the response if needed + */ + if (objt_server(s->target) && (s->be->ck_opts & PR_CK_INS) && + !((txn->flags & TX_SCK_FOUND) && (s->be->ck_opts & PR_CK_PSV)) && + (!(s->flags & SF_DIRECT) || + ((s->be->cookie_maxidle || txn->cookie_last_date) && + (!txn->cookie_last_date || (txn->cookie_last_date - date.tv_sec) < 0)) || + (s->be->cookie_maxlife && !txn->cookie_first_date) || // set the first_date + (!s->be->cookie_maxlife && txn->cookie_first_date)) && // remove the first_date + (!(s->be->ck_opts & PR_CK_POST) || (txn->meth == HTTP_METH_POST)) && + !(s->flags & SF_IGNORE_PRST)) { + /* the server is known, it's not the one the client requested, or the + * cookie's last seen date needs to be refreshed.
We have to + * insert a set-cookie here, except if we want to insert only on POST + * requests and this one isn't. Note that servers which don't have cookies + * (eg: some backup servers) will return a full cookie removal request. + */ + if (!__objt_server(s->target)->cookie) { + chunk_printf(&trash, + "%s=; Expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/", + s->be->cookie_name); + } + else { + chunk_printf(&trash, "%s=%s", s->be->cookie_name, __objt_server(s->target)->cookie); + + if (s->be->cookie_maxidle || s->be->cookie_maxlife) { + /* emit last_date, which is mandatory */ + trash.area[trash.data++] = COOKIE_DELIM_DATE; + s30tob64((date.tv_sec+3) >> 2, + trash.area + trash.data); + trash.data += 5; + + if (s->be->cookie_maxlife) { + /* emit first_date, which is either the original one or + * the current date. + */ + trash.area[trash.data++] = COOKIE_DELIM_DATE; + s30tob64(txn->cookie_first_date ? + txn->cookie_first_date >> 2 : + (date.tv_sec+3) >> 2, + trash.area + trash.data); + trash.data += 5; + } + } + chunk_appendf(&trash, "; path=/"); + } + + if (s->be->cookie_domain) + chunk_appendf(&trash, "; domain=%s", s->be->cookie_domain); + + if (s->be->ck_opts & PR_CK_HTTPONLY) + chunk_appendf(&trash, "; HttpOnly"); + + if (s->be->ck_opts & PR_CK_SECURE) + chunk_appendf(&trash, "; Secure"); + + if (s->be->cookie_attrs) + chunk_appendf(&trash, "; %s", s->be->cookie_attrs); + + if (unlikely(!http_add_header(htx, ist("Set-Cookie"), ist2(trash.area, trash.data)))) + goto return_fail_rewrite; + + txn->flags &= ~TX_SCK_MASK; + if (__objt_server(s->target)->cookie && (s->flags & SF_DIRECT)) + /* the server did not change, only the date was updated */ + txn->flags |= TX_SCK_UPDATED; + else + txn->flags |= TX_SCK_INSERTED; + + /* Here, we will tell an eventual cache on the client side that we don't + * want it to cache this reply because HTTP/1.0 caches also cache cookies ! + * Some caches understand the correct form: 'no-cache="set-cookie"', but + * others don't (eg: apache <= 1.3.26). So we use 'private' instead. + */ + if ((s->be->ck_opts & PR_CK_NOC) && (txn->flags & TX_CACHEABLE)) { + + txn->flags &= ~TX_CACHEABLE & ~TX_CACHE_COOK; + + if (unlikely(!http_add_header(htx, ist("Cache-control"), ist("private")))) + goto return_fail_rewrite; + } + } + + /* + * Check if result will be cacheable with a cookie. + * We'll block the response if security checks have caught + * nasty things such as a cacheable cookie. + */ + if (((txn->flags & (TX_CACHEABLE | TX_CACHE_COOK | TX_SCK_PRESENT)) == + (TX_CACHEABLE | TX_CACHE_COOK | TX_SCK_PRESENT)) && + (s->be->options & PR_O_CHK_CACHE)) { + /* we're in presence of a cacheable response containing + * a set-cookie header. We'll block it as requested by + * the 'checkcache' option, and send an alert. + */ + ha_alert("Blocking cacheable cookie in response from instance %s, server %s.\n", + s->be->id, objt_server(s->target) ? __objt_server(s->target)->id : "<dispatch>"); + send_log(s->be, LOG_ALERT, + "Blocking cacheable cookie in response from instance %s, server %s.\n", + s->be->id, objt_server(s->target) ? __objt_server(s->target)->id : "<dispatch>"); + goto deny; + } + + end: + /* + * Evaluate after-response rules before forwarding the response. rules + * from the backend are evaluated first, then one from the frontend if + * it differs. + */ + if (!http_eval_after_res_rules(s)) + goto return_int_err; + + /* Filter the response headers if there are filters attached to the + * stream. 
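+ * For reference, the insertion code above emits a header of the form (sketch, assuming "cookie SRV insert indirect" on the backend and a server named s1): + * + * Set-Cookie: SRV=s1; path=/ + * + * optionally with '|'-delimited 5-char base64 date fields after the value when maxidle/maxlife are configured.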
+ */ + if (HAS_FILTERS(s)) + rep->analysers |= AN_RES_FLT_HTTP_HDRS; + + /* Always enter in the body analyzer */ + rep->analysers &= ~AN_RES_FLT_XFER_DATA; + rep->analysers |= AN_RES_HTTP_XFER_BODY; + + /* if the user wants to log as soon as possible, without counting + * bytes from the server, then this is the right moment. We have + * to temporarily assign bytes_out to log what we currently have. + */ + if (!LIST_ISEMPTY(&sess->fe->logformat) && !(s->logs.logwait & LW_BYTES)) { + s->logs.t_close = s->logs.t_data; /* to get a valid end date */ + s->logs.bytes_out = htx->data; + s->do_log(s); + s->logs.bytes_out = 0; + } + + done: + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + rep->analysers &= ~an_bit; + rep->analyse_exp = TICK_ETERNITY; + s->current_rule = s->current_rule_list = NULL; + return 1; + + deny: + _HA_ATOMIC_INC(&sess->fe->fe_counters.denied_resp); + _HA_ATOMIC_INC(&s->be->be_counters.denied_resp); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->denied_resp); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.denied_resp); + goto return_prx_err; + + return_fail_rewrite: + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_PRXCOND; + _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_rewrites); + _HA_ATOMIC_INC(&s->be->be_counters.failed_rewrites); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->failed_rewrites); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.failed_rewrites); + /* fall through */ + + return_int_err: + txn->status = 500; + s->flags |= SF_ERR_INTERNAL; + _HA_ATOMIC_INC(&sess->fe->fe_counters.internal_errors); + _HA_ATOMIC_INC(&s->be->be_counters.internal_errors); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->internal_errors); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.internal_errors); + goto return_prx_err; + + return_bad_res: + txn->status = 502; + stream_inc_http_fail_ctr(s); + _HA_ATOMIC_INC(&s->be->be_counters.failed_resp); + if (objt_server(s->target)) { + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.failed_resp); + health_adjust(__objt_server(s->target), HANA_STATUS_HTTP_RSP); + } + /* fall through */ + + return_prx_err: + http_set_term_flags(s); + http_reply_and_close(s, txn->status, http_error_message(s)); + /* fall through */ + + return_prx_cond: + s->logs.t_data = -1; /* was not a valid response */ + s->scb->flags |= SC_FL_NOLINGER; + + http_set_term_flags(s); + + rep->analysers &= AN_RES_FLT_END; + s->req.analysers &= AN_REQ_FLT_END; + rep->analyse_exp = TICK_ETERNITY; + s->current_rule = s->current_rule_list = NULL; + DBG_TRACE_DEVEL("leaving on error", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_HTTP_ERR, s, txn); + return 0; + + return_prx_yield: + channel_dont_close(rep); + DBG_TRACE_DEVEL("waiting for more data", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 0; +} + +/* This function is an analyser which forwards response body (including chunk + * sizes if any). It is called as soon as we must forward, even if we forward + * zero byte. The only situation where it must not be called is when we're in + * tunnel mode and we want to forward till the close. It's used both to forward + * remaining data and to resync after end of body. It expects the msg_state to + * be between MSG_BODY and MSG_DONE (inclusive). 
It returns zero if it needs to + * read more data, or 1 once we can go on with next request or end the stream. + * + * It is capable of compressing response data both in content-length mode and + * in chunked mode. The state machines follows different flows depending on + * whether content-length and chunked modes are used, since there are no + * trailers in content-length : + * + * chk-mode cl-mode + * ,----- BODY -----. + * / \ + * V size > 0 V chk-mode + * .--> SIZE -------------> DATA -------------> CRLF + * | | size == 0 | last byte | + * | v final crlf v inspected | + * | TRAILERS -----------> DONE | + * | | + * `----------------------------------------------' + * + * Compression only happens in the DATA state, and must be flushed in final + * states (TRAILERS/DONE) or when leaving on missing data. Normal forwarding + * is performed at once on final states for all bytes parsed, or when leaving + * on missing data. + */ +int http_response_forward_body(struct stream *s, struct channel *res, int an_bit) +{ + struct session *sess = s->sess; + struct http_txn *txn = s->txn; + struct http_msg *msg = &s->txn->rsp; + struct htx *htx; + int ret; + + DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn, msg); + + htx = htxbuf(&res->buf); + + if (htx->flags & HTX_FL_PARSING_ERROR) + goto return_bad_res; + if (htx->flags & HTX_FL_PROCESSING_ERROR) + goto return_int_err; + + if (msg->msg_state == HTTP_MSG_BODY) + msg->msg_state = HTTP_MSG_DATA; + + /* in most states, we should abort in case of early close */ + channel_auto_close(res); + + if (res->to_forward) { + if (res->to_forward == CHN_INFINITE_FORWARD) { + if (s->scb->flags & SC_FL_EOI) + msg->msg_state = HTTP_MSG_ENDING; + } + else { + /* We can't process the buffer's contents yet */ + res->flags |= CF_WAKE_WRITE; + goto missing_data_or_waiting; + } + } + + if (msg->msg_state >= HTTP_MSG_ENDING) + goto ending; + + if ((txn->meth == HTTP_METH_CONNECT && txn->status >= 200 && txn->status < 300) || txn->status == 101 || + (!(msg->flags & HTTP_MSGF_XFER_LEN) && !HAS_RSP_DATA_FILTERS(s))) { + msg->msg_state = HTTP_MSG_ENDING; + goto ending; + } + + /* Forward input data. We get it by removing all outgoing data not + * forwarded yet from HTX data size. If there are some data filters, we + * let them decide the amount of data to forward. + */ + if (HAS_RSP_DATA_FILTERS(s)) { + ret = flt_http_payload(s, msg, htx->data); + if (ret < 0) + goto return_bad_res; + c_adv(res, ret); + } + else { + c_adv(res, htx->data - co_data(res)); + if ((global.tune.options & GTUNE_USE_FAST_FWD) && (msg->flags & HTTP_MSGF_XFER_LEN)) + channel_htx_forward_forever(res, htx); + } + + if (htx->data != co_data(res)) + goto missing_data_or_waiting; + + if (!(msg->flags & HTTP_MSGF_XFER_LEN) && (s->scb->flags & (SC_FL_EOS|SC_FL_ABRT_DONE))) { + msg->msg_state = HTTP_MSG_ENDING; + goto ending; + } + + /* Check if the end-of-message is reached and if so, switch the message + * in HTTP_MSG_ENDING state. Then if all data was marked to be + * forwarded, set the state to HTTP_MSG_DONE. 
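+ * Schematically (sketch of the transitions coded below): + * + * HTX_FL_EOM seen ------------------------> HTTP_MSG_ENDING + * no transfer length (one-side tunnel) ---> HTTP_MSG_TUNNEL + * otherwise, once all data are forwarded -> HTTP_MSG_DONE + *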
+ */ + if (!(htx->flags & HTX_FL_EOM)) + goto missing_data_or_waiting; + + msg->msg_state = HTTP_MSG_ENDING; + + ending: + s->scf->flags &= ~SC_FL_SND_EXP_MORE; /* no more data are expected to be sent */ + + /* other states, ENDING...TUNNEL */ + if (msg->msg_state >= HTTP_MSG_DONE) + goto done; + + if (HAS_RSP_DATA_FILTERS(s)) { + ret = flt_http_end(s, msg); + if (ret <= 0) { + if (!ret) + goto missing_data_or_waiting; + goto return_bad_res; + } + } + + if (!(txn->flags & TX_CON_WANT_TUN) && !(msg->flags & HTTP_MSGF_XFER_LEN)) { + /* One-side tunnel */ + msg->msg_state = HTTP_MSG_TUNNEL; + } + else { + msg->msg_state = HTTP_MSG_DONE; + res->to_forward = 0; + } + + done: + + channel_dont_close(res); + + if ((s->scf->flags & SC_FL_SHUT_DONE) && co_data(res)) { + /* response errors are most likely due to the client aborting + * the transfer. */ + goto return_cli_abort; + } + + http_end_response(s); + if (!(res->analysers & an_bit)) { + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 1; + } + DBG_TRACE_DEVEL("waiting for the end of the HTTP txn", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 0; + + missing_data_or_waiting: + if (s->scf->flags & SC_FL_SHUT_DONE) + goto return_cli_abort; + + /* stop waiting for data if the input is closed before the end. If the + * client side was already closed, it means that the client has aborted, + * so we don't want to count this as a server abort. Otherwise it's a + * server abort. + */ + if (msg->msg_state < HTTP_MSG_ENDING && (s->scb->flags & (SC_FL_EOS|SC_FL_ABRT_DONE))) { + if ((s->scf->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) && + (s->scb->flags & SC_FL_SHUT_DONE)) + goto return_cli_abort; + /* If we have some pending data, we continue the processing */ + if (htx_is_empty(htx)) + goto return_srv_abort; + } + + /* When TE: chunked is used, we need to get there again to parse + * remaining chunks even if the server has closed, so we don't want to + * set CF_DONTCLOSE. Similarly when there is a content-length or if there + * are filters registered on the stream, we don't want to forward a + * close. + */ + if ((msg->flags & HTTP_MSGF_XFER_LEN) || HAS_RSP_DATA_FILTERS(s)) + channel_dont_close(res); + + /* We know that more data are expected, but we couldn't send more than + * what we did. So we always set the SC_FL_SND_EXP_MORE flag so that the + * system knows it must not set a PUSH on this first part. Interactive + * modes are already handled by the stream sock layer. We must not do + * this in content-length mode because it could present the MSG_MORE + * flag with the last block of forwarded data, which would cause an + * additional delay to be observed by the receiver. + */ + if (HAS_RSP_DATA_FILTERS(s)) + s->scf->flags |= SC_FL_SND_EXP_MORE; + + /* the stream handler will take care of timeouts and errors */ + DBG_TRACE_DEVEL("waiting for more data to forward", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA, s, txn); + return 0; + + return_srv_abort: + _HA_ATOMIC_INC(&sess->fe->fe_counters.srv_aborts); + _HA_ATOMIC_INC(&s->be->be_counters.srv_aborts); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->srv_aborts); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.srv_aborts); + stream_inc_http_fail_ctr(s); + if (!(s->flags & SF_ERR_MASK)) + s->flags |= ((res->flags & CF_READ_TIMEOUT) ?
SF_ERR_SRVTO : SF_ERR_SRVCL); + goto return_error; + + return_cli_abort: + _HA_ATOMIC_INC(&sess->fe->fe_counters.cli_aborts); + _HA_ATOMIC_INC(&s->be->be_counters.cli_aborts); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->cli_aborts); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.cli_aborts); + if (!(s->flags & SF_ERR_MASK)) + s->flags |= ((res->flags & CF_WRITE_TIMEOUT) ? SF_ERR_CLITO : SF_ERR_CLICL); + goto return_error; + + return_int_err: + _HA_ATOMIC_INC(&sess->fe->fe_counters.internal_errors); + _HA_ATOMIC_INC(&s->be->be_counters.internal_errors); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->internal_errors); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.internal_errors); + s->flags |= SF_ERR_INTERNAL; + goto return_error; + + return_bad_res: + _HA_ATOMIC_INC(&s->be->be_counters.failed_resp); + if (objt_server(s->target)) { + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.failed_resp); + health_adjust(__objt_server(s->target), HANA_STATUS_HTTP_RSP); + } + stream_inc_http_fail_ctr(s); + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_SRVCL; + /* fall through */ + + return_error: + /* don't send any error message as we're in the body */ + http_set_term_flags(s); + http_reply_and_close(s, txn->status, NULL); + stream_inc_http_fail_ctr(s); + DBG_TRACE_DEVEL("leaving on error", + STRM_EV_STRM_ANA|STRM_EV_HTTP_ANA|STRM_EV_HTTP_ERR, s, txn); + return 0; +} + +/* Perform an HTTP redirect based on the information in <rule>. The function + * returns zero in case of an irrecoverable error such as too large a request + * to build a valid response, 1 in case of successful redirect (hence the rule + * is final), or 2 if the rule has to be silently skipped. 
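+ * + * As a sketch, the three rule->type cases handled below map to the usual configuration lines (paths are illustrative): + * + * http-request redirect scheme https code 301 # REDIRECT_TYPE_SCHEME + * http-request redirect prefix /new drop-query # REDIRECT_TYPE_PREFIX + * http-request redirect location /maint.html # REDIRECT_TYPE_LOCATION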
+ */ +int http_apply_redirect_rule(struct redirect_rule *rule, struct stream *s, struct http_txn *txn) +{ + struct channel *req = &s->req; + struct channel *res = &s->res; + struct htx *htx; + struct htx_sl *sl; + struct buffer *chunk; + struct ist status, reason, location; + unsigned int flags; + int ret = 1, close = 0; /* Try to keep the connection alive byt default */ + + chunk = alloc_trash_chunk(); + if (!chunk) { + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_RESOURCE; + goto fail; + } + + /* + * Create the location + */ + htx = htxbuf(&req->buf); + switch(rule->type) { + case REDIRECT_TYPE_SCHEME: { + struct http_hdr_ctx ctx; + struct ist path, host; + struct http_uri_parser parser; + + host = ist(""); + ctx.blk = NULL; + if (http_find_header(htx, ist("Host"), &ctx, 0)) + host = ctx.value; + + sl = http_get_stline(htx); + parser = http_uri_parser_init(htx_sl_req_uri(sl)); + path = http_parse_path(&parser); + /* build message using path */ + if (isttest(path)) { + if (rule->flags & REDIRECT_FLAG_DROP_QS) { + int qs = 0; + while (qs < path.len) { + if (*(path.ptr + qs) == '?') { + path.len = qs; + break; + } + qs++; + } + } + } + else + path = ist("/"); + + if (rule->rdr_str) { /* this is an old "redirect" rule */ + /* add scheme */ + if (!chunk_memcat(chunk, rule->rdr_str, rule->rdr_len)) + goto fail; + } + else { + /* add scheme with executing log format */ + chunk->data += build_logline(s, chunk->area + chunk->data, + chunk->size - chunk->data, + &rule->rdr_fmt); + } + /* add "://" + host + path */ + if (!chunk_memcat(chunk, "://", 3) || + !chunk_memcat(chunk, host.ptr, host.len) || + !chunk_memcat(chunk, path.ptr, path.len)) + goto fail; + + /* append a slash at the end of the location if needed and missing */ + if (chunk->data && chunk->area[chunk->data - 1] != '/' && + (rule->flags & REDIRECT_FLAG_APPEND_SLASH)) { + if (chunk->data + 1 >= chunk->size) + goto fail; + chunk->area[chunk->data++] = '/'; + } + break; + } + + case REDIRECT_TYPE_PREFIX: { + struct ist path; + struct http_uri_parser parser; + + sl = http_get_stline(htx); + parser = http_uri_parser_init(htx_sl_req_uri(sl)); + path = http_parse_path(&parser); + /* build message using path */ + if (isttest(path)) { + if (rule->flags & REDIRECT_FLAG_DROP_QS) { + int qs = 0; + while (qs < path.len) { + if (*(path.ptr + qs) == '?') { + path.len = qs; + break; + } + qs++; + } + } + } + else + path = ist("/"); + + if (rule->rdr_str) { /* this is an old "redirect" rule */ + /* add prefix. Note that if prefix == "/", we don't want to + * add anything, otherwise it makes it hard for the user to + * configure a self-redirection. 
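+ * For example (sketch): with "redirect prefix /new drop-query" and a request for "/old/page?x=1", the location built here is "/new/old/page", while a prefix of "/" yields just "/old/page".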
+ */ + if (rule->rdr_len != 1 || *rule->rdr_str != '/') { + if (!chunk_memcat(chunk, rule->rdr_str, rule->rdr_len)) + goto fail; + } + } + else { + /* add prefix with executing log format */ + chunk->data += build_logline(s, chunk->area + chunk->data, + chunk->size - chunk->data, + &rule->rdr_fmt); + } + + /* add path */ + if (!chunk_memcat(chunk, path.ptr, path.len)) + goto fail; + + /* append a slash at the end of the location if needed and missing */ + if (chunk->data && chunk->area[chunk->data - 1] != '/' && + (rule->flags & REDIRECT_FLAG_APPEND_SLASH)) { + if (chunk->data + 1 >= chunk->size) + goto fail; + chunk->area[chunk->data++] = '/'; + } + break; + } + case REDIRECT_TYPE_LOCATION: + default: + if (rule->rdr_str) { /* this is an old "redirect" rule */ + /* add location */ + if (!chunk_memcat(chunk, rule->rdr_str, rule->rdr_len)) + goto fail; + } + else { + /* add location with executing log format */ + int len = build_logline(s, chunk->area + chunk->data, + chunk->size - chunk->data, + &rule->rdr_fmt); + if (!len && rule->flags & REDIRECT_FLAG_IGNORE_EMPTY) { + ret = 2; + goto out; + } + + chunk->data += len; + } + break; + } + location = ist2(chunk->area, chunk->data); + + /* + * Create the 30x response + */ + switch (rule->code) { + case 308: + status = ist("308"); + reason = ist("Permanent Redirect"); + break; + case 307: + status = ist("307"); + reason = ist("Temporary Redirect"); + break; + case 303: + status = ist("303"); + reason = ist("See Other"); + break; + case 301: + status = ist("301"); + reason = ist("Moved Permanently"); + break; + case 302: + default: + status = ist("302"); + reason = ist("Found"); + break; + } + + if (!(txn->req.flags & HTTP_MSGF_BODYLESS) && txn->req.msg_state != HTTP_MSG_DONE) + close = 1; + + htx = htx_from_buf(&res->buf); + /* Trim any possible response */ + channel_htx_truncate(&s->res, htx); + flags = (HTX_SL_F_IS_RESP|HTX_SL_F_VER_11|HTX_SL_F_XFER_LEN|HTX_SL_F_CLEN|HTX_SL_F_BODYLESS); + sl = htx_add_stline(htx, HTX_BLK_RES_SL, flags, ist("HTTP/1.1"), status, reason); + if (!sl) + goto fail; + sl->info.res.status = rule->code; + s->txn->status = rule->code; + + if (close && !htx_add_header(htx, ist("Connection"), ist("close"))) + goto fail; + + if (!htx_add_header(htx, ist("Content-length"), ist("0")) || + !htx_add_header(htx, ist("Location"), location)) + goto fail; + + if (rule->code == 302 || rule->code == 303 || rule->code == 307) { + if (!htx_add_header(htx, ist("Cache-Control"), ist("no-cache"))) + goto fail; + } + + if (rule->cookie_len) { + if (!htx_add_header(htx, ist("Set-Cookie"), ist2(rule->cookie_str, rule->cookie_len))) + goto fail; + } + + if (!htx_add_endof(htx, HTX_BLK_EOH)) + goto fail; + + htx->flags |= HTX_FL_EOM; + htx_to_buf(htx, &res->buf); + + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_LOCAL; + http_set_term_flags(s); + + if (!http_forward_proxy_resp(s, 1)) + goto fail; + + if (rule->flags & REDIRECT_FLAG_FROM_REQ) { + /* let's log the request time */ + s->logs.request_ts = now_ns; + req->analysers &= AN_REQ_FLT_END; + + if (s->sess->fe == s->be) /* report it if the request was intercepted by the frontend */ + _HA_ATOMIC_INC(&s->sess->fe->fe_counters.intercepted_req); + } + + out: + free_trash_chunk(chunk); + return ret; + + fail: + /* If an error occurred, remove the incomplete HTTP response from the + * buffer */ + channel_htx_truncate(res, htxbuf(&res->buf)); + ret = 0; + goto out; +} + +/* This function filters the request header names to only allow [0-9a-zA-Z-] + * characters. 
Depending on the proxy configuration, headers with a name not + * matching this charset are removed or the request is rejected with a + * 403-Forbidden response if such names are found. It returns HTTP_RULE_RES_CONT + * to continue the request processing or HTTP_RULE_RES_DENY if the request is + * rejected. + */ +static enum rule_result http_req_restrict_header_names(struct stream *s, struct htx *htx, struct proxy *px) +{ + struct htx_blk *blk; + enum rule_result rule_ret = HTTP_RULE_RES_CONT; + + blk = htx_get_first_blk(htx); + while (blk) { + enum htx_blk_type type = htx_get_blk_type(blk); + + if (type == HTX_BLK_HDR) { + struct ist n = htx_get_blk_name(htx, blk); + int i, end = istlen(n); + + for (i = 0; i < end; i++) { + if (!isalnum((unsigned char)n.ptr[i]) && n.ptr[i] != '-') { + break; + } + } + + if (i < end) { + /* Disallowed character found - block the request or remove the header */ + if (px->options2 & PR_O2_RSTRICT_REQ_HDR_NAMES_BLK) + goto block; + blk = htx_remove_blk(htx, blk); + continue; + } + } + if (type == HTX_BLK_EOH) + break; + + blk = htx_get_next_blk(htx, blk); + } + out: + return rule_ret; + block: + /* Block the request returning a 403-Forbidden response */ + s->txn->status = 403; + rule_ret = HTTP_RULE_RES_DENY; + goto out; +} + +/* Replace all headers matching the name <name>. The header value is replaced if + * it matches the regex <re>. <str> is used for the replacement. If <full> is + * set to 1, the full line is matched and replaced. Otherwise, comma-separated + * values are evaluated one by one. It returns 0 on success and -1 on error. + */ +int http_replace_hdrs(struct stream* s, struct htx *htx, struct ist name, + const char *str, struct my_regex *re, int full) +{ + struct http_hdr_ctx ctx; + + ctx.blk = NULL; + while (http_find_header(htx, name, &ctx, full)) { + struct buffer *output = get_trash_chunk(); + + if (!regex_exec_match2(re, ctx.value.ptr, ctx.value.len, MAX_MATCH, pmatch, 0)) + continue; + + output->data = exp_replace(output->area, output->size, ctx.value.ptr, str, pmatch); + if (output->data == -1) + return -1; + if (!http_replace_header_value(htx, &ctx, ist2(output->area, output->data))) + return -1; + } + return 0; +} + +/* This function executes one of the set-{method,path,query,uri} actions. It + * takes the string from the variable 'replace' with length 'len', then modifies + * the relevant part of the request line accordingly. Then it updates various + * pointers to the next elements which were moved, and the total buffer length. + * It finds the action to be performed in p[2], previously filled by function + * parse_set_req_line(). It returns 0 in case of success, -1 in case of internal + * error, though this can be revisited when this code is finally exploited. + * + * 'action' can be '0' to replace method, '1' to replace path, '2' to replace + * query string, 3 to replace uri or 4 to replace the path+query. + * + * In the query string case, the question mark '?' must be set at the start of the + * string by the caller, even if the replacement query string is empty.
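+ * + * Usage sketch (hypothetical values): replace the path, then the query string: + * + * http_req_replace_stline(1, "/index.html", 11, px, s); (action 1: path) + * http_req_replace_stline(2, "?id=3", 5, px, s); (action 2: query, '?' included)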
+ */ +int http_req_replace_stline(int action, const char *replace, int len, + struct proxy *px, struct stream *s) +{ + struct htx *htx = htxbuf(&s->req.buf); + + switch (action) { + case 0: // method + if (!http_replace_req_meth(htx, ist2(replace, len))) + return -1; + break; + + case 1: // path + if (!http_replace_req_path(htx, ist2(replace, len), 0)) + return -1; + break; + + case 2: // query + if (!http_replace_req_query(htx, ist2(replace, len))) + return -1; + break; + + case 3: // uri + if (!http_replace_req_uri(htx, ist2(replace, len))) + return -1; + break; + + case 4: // path + query + if (!http_replace_req_path(htx, ist2(replace, len), 1)) + return -1; + break; + + default: + return -1; + } + return 0; +} + +/* This function replaces the HTTP status code and the associated message. The + * variable <status> contains the new status code. It + * returns 0 in case of success, -1 in case of internal error. + */ +int http_res_set_status(unsigned int status, struct ist reason, struct stream *s) +{ + struct htx *htx = htxbuf(&s->res.buf); + char *res; + + chunk_reset(&trash); + res = ultoa_o(status, trash.area, trash.size); + trash.data = res - trash.area; + + /* Do we have a custom reason format string? */ + if (!isttest(reason)) { + const char *str = http_get_reason(status); + reason = ist(str); + } + + if (!http_replace_res_status(htx, ist2(trash.area, trash.data), reason)) + return -1; + s->txn->status = status; + return 0; +} + +/* Executes the http-request rules <rules> for stream <s>, proxy <px> and + * transaction <txn>. Returns the verdict of the first rule that prevents + * further processing of the request (auth, deny, ...), and defaults to + * HTTP_RULE_RES_STOP if it executed all rules or stopped on an allow, or + * HTTP_RULE_RES_CONT if the last rule was reached. It may set the TX_CLTARPIT + * on txn->flags if it encounters a tarpit rule. If <deny_status> is not NULL + * and a deny/tarpit rule is matched, it will be filled with this rule's deny + * status. + */ +static enum rule_result http_req_get_intercept_rule(struct proxy *px, struct list *def_rules, + struct list *rules, struct stream *s) +{ + struct session *sess = strm_sess(s); + struct http_txn *txn = s->txn; + struct act_rule *rule; + enum rule_result rule_ret = HTTP_RULE_RES_CONT; + int act_opts = 0; + + /* If "the current_rule_list" matches the executed rule list, we are in + * a resume condition. If a resume is needed it is always in the action + * and never in the ACL or converters. In this case, we initialise the + * current rule, and go to the action execution point. + */ + if (s->current_rule) { + rule = s->current_rule; + s->current_rule = NULL; + if (s->current_rule_list == rules || (def_rules && s->current_rule_list == def_rules)) + goto resume_execution; + } + s->current_rule_list = ((!def_rules || s->current_rule_list == def_rules) ?
rules : def_rules); + + restart: + /* start the ruleset evaluation in strict mode */ + txn->req.flags &= ~HTTP_MSGF_SOFT_RW; + + list_for_each_entry(rule, s->current_rule_list, list) { + /* check optional condition */ + if (rule->cond) { + int ret; + + ret = acl_exec_cond(rule->cond, px, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL); + ret = acl_pass(ret); + + if (rule->cond->pol == ACL_COND_UNLESS) + ret = !ret; + + if (!ret) /* condition not matched */ + continue; + } + + act_opts |= ACT_OPT_FIRST; + resume_execution: + if (rule->kw->flags & KWF_EXPERIMENTAL) + mark_tainted(TAINTED_ACTION_EXP_EXECUTED); + + /* Always call the action function if defined */ + if (rule->action_ptr) { + if ((s->scf->flags & SC_FL_ERROR) || + ((s->scf->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) && + (px->options & PR_O_ABRT_CLOSE))) + act_opts |= ACT_OPT_FINAL; + + switch (rule->action_ptr(rule, px, sess, s, act_opts)) { + case ACT_RET_CONT: + break; + case ACT_RET_STOP: + rule_ret = HTTP_RULE_RES_STOP; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + case ACT_RET_YIELD: + s->current_rule = rule; + rule_ret = HTTP_RULE_RES_YIELD; + goto end; + case ACT_RET_ERR: + rule_ret = HTTP_RULE_RES_ERROR; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + case ACT_RET_DONE: + rule_ret = HTTP_RULE_RES_DONE; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + case ACT_RET_DENY: + if (txn->status == -1) + txn->status = 403; + rule_ret = HTTP_RULE_RES_DENY; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + case ACT_RET_ABRT: + rule_ret = HTTP_RULE_RES_ABRT; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + case ACT_RET_INV: + rule_ret = HTTP_RULE_RES_BADREQ; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + } + continue; /* eval the next rule */ + } + + /* If no action function is defined, check for known actions */ + switch (rule->action) { + case ACT_ACTION_ALLOW: + rule_ret = HTTP_RULE_RES_STOP; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + + case ACT_ACTION_DENY: + txn->status = rule->arg.http_reply->status; + txn->http_reply = rule->arg.http_reply; + rule_ret = HTTP_RULE_RES_DENY; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + + case ACT_HTTP_REQ_TARPIT: + txn->flags |= TX_CLTARPIT; + txn->status = rule->arg.http_reply->status; + txn->http_reply = rule->arg.http_reply; + rule_ret = HTTP_RULE_RES_DENY; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + + case ACT_HTTP_REDIR: { + int ret = http_apply_redirect_rule(rule->arg.redir, s, txn); + + if (ret == 2) // 2 == skip + break; + + rule_ret = ret ? HTTP_RULE_RES_ABRT : HTTP_RULE_RES_ERROR; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + } + + /* other actions exist, but normally they are never matched. */ + default: + break; + } + } + + if (def_rules && s->current_rule_list == def_rules) { + s->current_rule_list = rules; + goto restart; + } + + end: + /* if the ruleset evaluation is finished, reset the strict mode */ + if (rule_ret != HTTP_RULE_RES_YIELD) + txn->req.flags &= ~HTTP_MSGF_SOFT_RW; + + /* we reached the end of the rules, nothing to report */ + return rule_ret; +} + +/* Executes the http-response rules <rules> for stream <s> and proxy <px>.
It + * returns one of 5 possible statuses: HTTP_RULE_RES_CONT, HTTP_RULE_RES_STOP, + * HTTP_RULE_RES_DONE, HTTP_RULE_RES_YIELD, or HTTP_RULE_RES_BADREQ. If *CONT + * is returned, the process can continue the evaluation of next rule list. If + * *STOP or *DONE is returned, the process must stop the evaluation. If *BADREQ + * is returned, it means the operation could not be processed and a server error + * must be returned. If *YIELD is returned, the caller must call the + * function again with the same context. + */ +static enum rule_result http_res_get_intercept_rule(struct proxy *px, struct list *def_rules, + struct list *rules, struct stream *s, uint8_t final) +{ + struct session *sess = strm_sess(s); + struct http_txn *txn = s->txn; + struct act_rule *rule; + enum rule_result rule_ret = HTTP_RULE_RES_CONT; + int act_opts = 0; + + if (final) + act_opts |= ACT_OPT_FINAL; + /* If "the current_rule_list" matches the executed rule list, we are in + * a resume condition. If a resume is needed it is always in the action + * and never in the ACL or converters. In this case, we initialise the + * current rule, and go to the action execution point. + */ + if (s->current_rule) { + rule = s->current_rule; + s->current_rule = NULL; + if (s->current_rule_list == rules || (def_rules && s->current_rule_list == def_rules)) + goto resume_execution; + } + s->current_rule_list = ((!def_rules || s->current_rule_list == def_rules) ? rules : def_rules); + + restart: + + /* start the ruleset evaluation in strict mode */ + txn->rsp.flags &= ~HTTP_MSGF_SOFT_RW; + + list_for_each_entry(rule, s->current_rule_list, list) { + /* check optional condition */ + if (rule->cond) { + int ret; + + ret = acl_exec_cond(rule->cond, px, sess, s, SMP_OPT_DIR_RES|SMP_OPT_FINAL); + ret = acl_pass(ret); + + if (rule->cond->pol == ACL_COND_UNLESS) + ret = !ret; + + if (!ret) /* condition not matched */ + continue; + } + + act_opts |= ACT_OPT_FIRST; +resume_execution: + if (rule->kw->flags & KWF_EXPERIMENTAL) + mark_tainted(TAINTED_ACTION_EXP_EXECUTED); + + /* Always call the action function if defined */ + if (rule->action_ptr) { + if ((s->scf->flags & SC_FL_ERROR) || + ((s->scf->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) && + (px->options & PR_O_ABRT_CLOSE))) + act_opts |= ACT_OPT_FINAL; + + switch (rule->action_ptr(rule, px, sess, s, act_opts)) { + case ACT_RET_CONT: + break; + case ACT_RET_STOP: + rule_ret = HTTP_RULE_RES_STOP; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + case ACT_RET_YIELD: + s->current_rule = rule; + rule_ret = HTTP_RULE_RES_YIELD; + goto end; + case ACT_RET_ERR: + rule_ret = HTTP_RULE_RES_ERROR; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + case ACT_RET_DONE: + rule_ret = HTTP_RULE_RES_DONE; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + case ACT_RET_DENY: + if (txn->status == -1) + txn->status = 502; + rule_ret = HTTP_RULE_RES_DENY; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + case ACT_RET_ABRT: + rule_ret = HTTP_RULE_RES_ABRT; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + case ACT_RET_INV: + rule_ret = HTTP_RULE_RES_BADREQ; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + } + continue; /* eval the next rule */ + } + + /* If no action function is defined, check for known actions */ + switch (rule->action) { + case ACT_ACTION_ALLOW: + rule_ret =
HTTP_RULE_RES_STOP; /* "allow" rules are OK */ + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + + case ACT_ACTION_DENY: + txn->status = rule->arg.http_reply->status; + txn->http_reply = rule->arg.http_reply; + rule_ret = HTTP_RULE_RES_DENY; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + + case ACT_HTTP_REDIR: { + int ret = http_apply_redirect_rule(rule->arg.redir, s, txn); + + if (ret == 2) // 2 == skip + break; + + rule_ret = ret ? HTTP_RULE_RES_ABRT : HTTP_RULE_RES_ERROR; + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + } + /* other actions exist, but normally they are never matched. */ + default: + break; + } + } + + if (def_rules && s->current_rule_list == def_rules) { + s->current_rule_list = rules; + goto restart; + } + + end: + /* if the ruleset evaluation is finished, reset the strict mode */ + if (rule_ret != HTTP_RULE_RES_YIELD) + txn->rsp.flags &= ~HTTP_MSGF_SOFT_RW; + + /* we reached the end of the rules, nothing to report */ + return rule_ret; +} + +/* Executes backend and frontend http-after-response rules for the stream <s>, + * in that order. It returns 1 on success and 0 on error. It is the caller's + * responsibility to catch the error or ignore it. If it catches it, this function + * may be called a second time, for the internal error. + */ +int http_eval_after_res_rules(struct stream *s) +{ + struct list *def_rules, *rules; + struct session *sess = s->sess; + enum rule_result ret = HTTP_RULE_RES_CONT; + + /* Eval after-response ruleset only if the reply is not const */ + if (s->txn->flags & TX_CONST_REPLY) + goto end; + + /* prune the request variables if not already done and swap to the response variables. */ + if (s->vars_reqres.scope != SCOPE_RES) { + if (!LIST_ISEMPTY(&s->vars_reqres.head)) + vars_prune(&s->vars_reqres, s->sess, s); + vars_init_head(&s->vars_reqres, SCOPE_RES); + } + + def_rules = (s->be->defpx ? &s->be->defpx->http_after_res_rules : NULL); + rules = &s->be->http_after_res_rules; + + ret = http_res_get_intercept_rule(s->be, def_rules, rules, s, 1); + if ((ret == HTTP_RULE_RES_CONT || ret == HTTP_RULE_RES_STOP) && sess->fe != s->be) { + def_rules = ((sess->fe->defpx && sess->fe->defpx != s->be->defpx) ? &sess->fe->defpx->http_after_res_rules : NULL); + rules = &sess->fe->http_after_res_rules; + ret = http_res_get_intercept_rule(sess->fe, def_rules, rules, s, 1); + } + + end: + /* All other codes than CONTINUE, STOP or DONE are forbidden */ + return (ret == HTTP_RULE_RES_CONT || ret == HTTP_RULE_RES_STOP || ret == HTTP_RULE_RES_DONE); +} + +/* + * Manage client-side cookies. It can impact performance by about 2% so it is + * desirable to call it only when needed. This code is quite complex because + * of the multiple very crappy and ambiguous syntaxes we have to support. It is + * highly recommended not to touch this part without a good reason! + */ +static void http_manage_client_side_cookies(struct stream *s, struct channel *req) +{ + struct session *sess = s->sess; + struct http_txn *txn = s->txn; + struct htx *htx; + struct http_hdr_ctx ctx; + char *hdr_beg, *hdr_end, *del_from; + char *prev, *att_beg, *att_end, *equal, *val_beg, *val_end, *next; + int preserve_hdr; + + htx = htxbuf(&req->buf); + ctx.blk = NULL; + while (http_find_header(htx, ist("Cookie"), &ctx, 1)) { + int is_first = 1; + del_from = NULL; /* nothing to be deleted */ + preserve_hdr = 0; /* assume we may kill the whole header */ + + /* Now look for cookies.
Conforming to RFC2109, we have to support + * attributes whose names begin with a '$', and associate them with + * the right cookie, if we want to delete this cookie. + * So there are 3 cases for each cookie read : + * 1) it's a special attribute, beginning with a '$' : ignore it. + * 2) it's a server id cookie that we *MAY* want to delete : save + * some pointers on it (last semi-colon, beginning of cookie...) + * 3) it's an application cookie : we *MAY* have to delete a previous + * "special" cookie. + * At the end of the loop, if a "special" cookie remains, we may have to + * remove it. If no application cookie persists in the header, we + * *MUST* delete it. + * + * Note: RFC2965 is unclear about the processing of spaces around + * the equal sign in the ATTR=VALUE form. A careful inspection of + * the RFC explicitly allows spaces before it, and not within the + * tokens (attrs or values). An inspection of RFC2109 allows that + * too but section 10.1.3 lets one think that spaces may be allowed + * after the equal sign too, resulting in some (rare) buggy + * implementations trying to do that. So let's do what servers do. + * Latest ietf draft forbids spaces all around. Also, earlier RFCs + * allowed quoted strings in values, with any possible character + * after a backslash, including control chars and delimiters, which + * causes parsing to become ambiguous. Browsers also allow spaces + * within values even without quotes. + * + * We have to keep multiple pointers in order to support cookie + * removal at the beginning, middle or end of header without + * corrupting the header. All of these headers are valid : + * + * hdr_beg hdr_end + * | | + * v | + * NAME1=VALUE1;NAME2=VALUE2;NAME3=VALUE3 | + * NAME1=VALUE1;NAME2_ONLY ;NAME3=VALUE3 v + * NAME1 = VALUE 1 ; NAME2 = VALUE2 ; NAME3 = VALUE3 + * | | | | | | | + * | | | | | | | + * | | | | | | +--> next + * | | | | | +----> val_end + * | | | | +-----------> val_beg + * | | | +--------------> equal + * | | +----------------> att_end + * | +---------------------> att_beg + * +--------------------------> prev + * + */ + hdr_beg = ctx.value.ptr; + hdr_end = hdr_beg + ctx.value.len; + for (prev = hdr_beg; prev < hdr_end; prev = next) { + /* Iterate through all cookies on this line */ + + /* find att_beg */ + att_beg = prev; + if (!is_first) + att_beg++; + is_first = 0; + + while (att_beg < hdr_end && HTTP_IS_SPHT(*att_beg)) + att_beg++; + + /* find att_end : this is the first character after the last non + * space before the equal. It may be equal to hdr_end. + */ + equal = att_end = att_beg; + while (equal < hdr_end) { + if (*equal == '=' || *equal == ',' || *equal == ';') + break; + if (HTTP_IS_SPHT(*equal++)) + continue; + att_end = equal; + } + + /* here, <equal> points to '=', a delimiter or the end. <att_end> + * is between <att_beg> and <equal>, both may be identical. + */ + /* look for end of cookie if there is an equal sign */ + if (equal < hdr_end && *equal == '=') { + /* look for the beginning of the value */ + val_beg = equal + 1; + while (val_beg < hdr_end && HTTP_IS_SPHT(*val_beg)) + val_beg++; + + /* find the end of the value, respecting quotes */ + next = http_find_cookie_value_end(val_beg, hdr_end); + + /* make val_end point to the first white space or delimiter after the value */ + val_end = next; + while (val_end > val_beg && HTTP_IS_SPHT(*(val_end - 1))) + val_end--; + } + else + val_beg = val_end = next = equal; + + /* We have nothing to do with attributes beginning with + * '$'. 
However, they will automatically be removed if a + * header before them is removed, since they're supposed + * to be linked together. + */ + if (*att_beg == '$') + continue; + + /* Ignore cookies with no equal sign */ + if (equal == next) { + /* This is not our cookie, so we must preserve it. But if we already + * scheduled another cookie for removal, we cannot remove the + * complete header, but we can remove the previous block itself. + */ + preserve_hdr = 1; + if (del_from != NULL) { + int delta = http_del_hdr_value(hdr_beg, hdr_end, &del_from, prev); + val_end += delta; + next += delta; + hdr_end += delta; + prev = del_from; + del_from = NULL; + } + continue; + } + + /* if there are spaces around the equal sign, we need to + * strip them otherwise we'll get trouble for cookie captures, + * or even for rewrites. Since this happens extremely rarely, + * it does not hurt performance. + */ + if (unlikely(att_end != equal || val_beg > equal + 1)) { + int stripped_before = 0; + int stripped_after = 0; + + if (att_end != equal) { + memmove(att_end, equal, hdr_end - equal); + stripped_before = (att_end - equal); + equal += stripped_before; + val_beg += stripped_before; + } + + if (val_beg > equal + 1) { + memmove(equal + 1, val_beg, hdr_end + stripped_before - val_beg); + stripped_after = (equal + 1) - val_beg; + val_beg += stripped_after; + stripped_before += stripped_after; + } + + val_end += stripped_before; + next += stripped_before; + hdr_end += stripped_before; + } + /* now everything is as on the diagram above */ + + /* First, let's see if we want to capture this cookie. We check + * that we don't already have a client side cookie, because we + * can only capture one. Also as an optimisation, we ignore + * cookies shorter than the declared name. + */ + if (sess->fe->capture_name != NULL && txn->cli_cookie == NULL && + (val_end - att_beg >= sess->fe->capture_namelen) && + memcmp(att_beg, sess->fe->capture_name, sess->fe->capture_namelen) == 0) { + int log_len = val_end - att_beg; + + if ((txn->cli_cookie = pool_alloc(pool_head_capture)) == NULL) { + ha_alert("HTTP logging : out of memory.\n"); + } else { + if (log_len > sess->fe->capture_len) + log_len = sess->fe->capture_len; + memcpy(txn->cli_cookie, att_beg, log_len); + txn->cli_cookie[log_len] = 0; + } + } + + /* Persistence cookies in passive, rewrite or insert mode have the + * following form : + * + * Cookie: NAME=SRV[|<lastseen>[|<firstseen>]] + * + * For cookies in prefix mode, the form is : + * + * Cookie: NAME=SRV~VALUE + */ + if ((att_end - att_beg == s->be->cookie_len) && (s->be->cookie_name != NULL) && + (memcmp(att_beg, s->be->cookie_name, att_end - att_beg) == 0)) { + struct server *srv = s->be->srv; + char *delim; + + /* if we're in cookie prefix mode, we'll search the delimiter so that we + * have the server ID between val_beg and delim, and the original cookie between + * delim+1 and val_end. 
Otherwise, delim==val_end : + * + * hdr_beg + * | + * v + * NAME=SRV; # in all but prefix modes + * NAME=SRV~OPAQUE ; # in prefix mode + * || || | |+-> next + * || || | +--> val_end + * || || +---------> delim + * || |+------------> val_beg + * || +-------------> att_end = equal + * |+-----------------> att_beg + * +------------------> prev + * + */ + if (s->be->ck_opts & PR_CK_PFX) { + for (delim = val_beg; delim < val_end; delim++) + if (*delim == COOKIE_DELIM) + break; + } + else { + char *vbar1; + delim = val_end; + /* Now check if the cookie contains a date field, which would + * appear after a vertical bar ('|') just after the server name + * and before the delimiter. + */ + vbar1 = memchr(val_beg, COOKIE_DELIM_DATE, val_end - val_beg); + if (vbar1) { + /* OK, so left of the bar is the server's cookie and + * right is the last seen date. It is a base64 encoded + * 30-bit value representing the UNIX time since the + * epoch in 4-second quantities. + */ + int val; + delim = vbar1++; + if (val_end - vbar1 >= 5) { + val = b64tos30(vbar1); + if (val > 0) + txn->cookie_last_date = val << 2; + } + /* look for a second vertical bar */ + vbar1 = memchr(vbar1, COOKIE_DELIM_DATE, val_end - vbar1); + if (vbar1 && (val_end - vbar1 > 5)) { + val = b64tos30(vbar1 + 1); + if (val > 0) + txn->cookie_first_date = val << 2; + } + } + } + + /* if the cookie has an expiration date and the proxy wants to check + * it, then we do that now. We first check if the cookie is too old, + * then only if it has expired. We detect strict overflow because the + * time resolution here is not great (4 seconds). Cookies with dates + * in the future are ignored if their offset is beyond one day. This + * allows an admin to fix timezone issues without expiring everyone + * and at the same time avoids keeping unwanted side effects for too + * long. + */ + if (txn->cookie_first_date && s->be->cookie_maxlife && + (((signed)(date.tv_sec - txn->cookie_first_date) > (signed)s->be->cookie_maxlife) || + ((signed)(txn->cookie_first_date - date.tv_sec) > 86400))) { + txn->flags &= ~TX_CK_MASK; + txn->flags |= TX_CK_OLD; + delim = val_beg; // let's pretend we have not found the cookie + txn->cookie_first_date = 0; + txn->cookie_last_date = 0; + } + else if (txn->cookie_last_date && s->be->cookie_maxidle && + (((signed)(date.tv_sec - txn->cookie_last_date) > (signed)s->be->cookie_maxidle) || + ((signed)(txn->cookie_last_date - date.tv_sec) > 86400))) { + txn->flags &= ~TX_CK_MASK; + txn->flags |= TX_CK_EXPIRED; + delim = val_beg; // let's pretend we have not found the cookie + txn->cookie_first_date = 0; + txn->cookie_last_date = 0; + } + + /* Here, we'll look for the first running server which supports the cookie. + * This makes it possible to share the same cookie between several servers, + * for example to dedicate backup servers to specific servers only. + * However, to prevent clients from sticking to a cookie-less backup server + * when they have incidentally learned an empty cookie, we simply ignore + * empty cookies and mark them as invalid. + * The same behaviour is applied when persistence must be ignored. 
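+ * + * As a concrete illustration (hypothetical names and values): with "cookie SRVID" configured and a client sending "Cookie: SRVID=s2|XXXXX", the part before the bar selects the server declaring "cookie s2", and the five base64 chars after it decode through b64tos30() into a 30-bit count of 4-second units, e.g. a decoded value of 400000000 yields 400000000 << 2 = 1600000000 seconds since the epoch (2020-09-13). If the matched server is running it is used with TX_CK_VALID; if it is stopped it is still used when "option persist" or force-persist applies (flagged TX_CK_DOWN), and otherwise the cookie is noted TX_CK_DOWN and the load balancing algorithm picks another server.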
+ */ + if ((delim == val_beg) || (s->flags & (SF_IGNORE_PRST | SF_ASSIGNED))) + srv = NULL; + + while (srv) { + if (srv->cookie && (srv->cklen == delim - val_beg) && + !memcmp(val_beg, srv->cookie, delim - val_beg)) { + if ((srv->cur_state != SRV_ST_STOPPED) || + (s->be->options & PR_O_PERSIST) || + (s->flags & SF_FORCE_PRST)) { + /* we found the server and we can use it */ + txn->flags &= ~TX_CK_MASK; + txn->flags |= (srv->cur_state != SRV_ST_STOPPED) ? TX_CK_VALID : TX_CK_DOWN; + s->flags |= SF_DIRECT | SF_ASSIGNED; + s->target = &srv->obj_type; + break; + } else { + /* we found a server, but it's down, + * mark it as such and go on in case + * another one is available. + */ + txn->flags &= ~TX_CK_MASK; + txn->flags |= TX_CK_DOWN; + } + } + srv = srv->next; + } + + if (!srv && !(txn->flags & (TX_CK_DOWN|TX_CK_EXPIRED|TX_CK_OLD))) { + /* no server matched this cookie or we deliberately skipped it */ + txn->flags &= ~TX_CK_MASK; + if ((s->flags & (SF_IGNORE_PRST | SF_ASSIGNED))) + txn->flags |= TX_CK_UNUSED; + else + txn->flags |= TX_CK_INVALID; + } + + /* depending on the cookie mode, we may have to either : + * - delete the complete cookie if we're in insert+indirect mode, so that + * the server never sees it ; + * - remove the server id from the cookie value, and tag the cookie as an + * application cookie so that it does not get accidentally removed later, + * if we're in cookie prefix mode + */ + if ((s->be->ck_opts & PR_CK_PFX) && (delim != val_end)) { + int delta; /* negative */ + + memmove(val_beg, delim + 1, hdr_end - (delim + 1)); + delta = val_beg - (delim + 1); + val_end += delta; + next += delta; + hdr_end += delta; + del_from = NULL; + preserve_hdr = 1; /* we want to keep this cookie */ + } + else if (del_from == NULL && + (s->be->ck_opts & (PR_CK_INS | PR_CK_IND)) == (PR_CK_INS | PR_CK_IND)) { + del_from = prev; + } + } + else { + /* This is not our cookie, so we must preserve it. But if we already + * scheduled another cookie for removal, we cannot remove the + * complete header, but we can remove the previous block itself. + */ + preserve_hdr = 1; + + if (del_from != NULL) { + int delta = http_del_hdr_value(hdr_beg, hdr_end, &del_from, prev); + if (att_beg >= del_from) + att_beg += delta; + if (att_end >= del_from) + att_end += delta; + val_beg += delta; + val_end += delta; + next += delta; + hdr_end += delta; + prev = del_from; + del_from = NULL; + } + } + + } /* for each cookie */ + + + /* There are no more cookies on this line. + * We may still have one (or several) marked for deletion at the + * end of the line. We must do this now, in one of two ways : + * - if some cookies must be preserved, we only delete from the + * mark to the end of line ; + * - if nothing needs to be preserved, simply delete the whole header + */ + if (del_from) { + hdr_end = (preserve_hdr ? del_from : hdr_beg); + } + if ((hdr_end - hdr_beg) != ctx.value.len) { + if (hdr_beg != hdr_end) + htx_change_blk_value_len(htx, ctx.blk, hdr_end - hdr_beg); + else + http_remove_header(htx, &ctx); + } + } /* for each "Cookie" header */ +} + +/* + * Manage server-side cookies. It can impact performance by about 2% so it is + * desirable to call it only when needed. This function is also used when we + * just need to know if there is a cookie (eg: for check-cache). 
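+ * + * As an illustration of the modes handled below (hypothetical config): with "cookie SRVID insert indirect", a server's "Set-Cookie: SRVID=s1" is deleted from the response since haproxy will insert its own; with "cookie SRVID rewrite", the value is overwritten with the cookie declared on the matched server line; and with "cookie SRVID prefix", an application's "SRVID=VALUE" becomes "SRVID=s1~VALUE", the part before '~' being stripped from requests again by the client-side function above.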
+ */ +static void http_manage_server_side_cookies(struct stream *s, struct channel *res) +{ + struct session *sess = s->sess; + struct http_txn *txn = s->txn; + struct htx *htx; + struct http_hdr_ctx ctx; + struct server *srv; + char *hdr_beg, *hdr_end; + char *prev, *att_beg, *att_end, *equal, *val_beg, *val_end, *next; + + htx = htxbuf(&res->buf); + + ctx.blk = NULL; + while (http_find_header(htx, ist("Set-Cookie"), &ctx, 1)) { + int is_first = 1; + + /* OK, right now we know we have a Set-Cookie* at hdr_beg, and + * <prev> points to the colon. + */ + txn->flags |= TX_SCK_PRESENT; + + /* Maybe we only wanted to see if there was a Set-Cookie (eg: + * check-cache is enabled) and we are not interested in checking + * them. Warning, the cookie capture is declared in the frontend. + */ + if (s->be->cookie_name == NULL && sess->fe->capture_name == NULL) + break; + + /* OK so now we know we have to process this response cookie. + * The format of the Set-Cookie header is slightly different + * from the format of the Cookie header in that it does not + * support the comma as a cookie delimiter (thus the header + * cannot be folded) because the Expires attribute described in + * the original Netscape spec may contain an unquoted date + * with a comma inside. We have to live with this because + * many browsers don't support Max-Age and some browsers don't + * support quoted strings. However, the Set-Cookie2 header is + * clean, but basically nobody supports it. + * + * We have to keep multiple pointers in order to support cookie + * removal at the beginning, middle or end of header without + * corrupting the header (in case of set-cookie2). A special + * pointer, <scav> points to the beginning of the set-cookie-av + * fields after the first semi-colon. The <next> pointer points + * either to the end of line (set-cookie) or next unquoted comma + * (set-cookie2). All of these headers are valid : + * + * hdr_beg hdr_end + * | | + * v | + * NAME1 = VALUE 1 ; Secure; Path="/" | + * NAME=VALUE; Secure; Expires=Thu, 01-Jan-1970 00:00:01 GMT v + * NAME = VALUE ; Secure; Expires=Thu, 01-Jan-1970 00:00:01 GMT + * NAME1 = VALUE 1 ; Max-Age=0, NAME2=VALUE2; Discard + * | | | | | | | | + * | | | | | | | +-> next + * | | | | | | +------------> scav + * | | | | | +--------------> val_end + * | | | | +--------------------> val_beg + * | | | +----------------------> equal + * | | +------------------------> att_end + * | +----------------------------> att_beg + * +------------------------------> prev + * -------------------------------> hdr_beg + */ + hdr_beg = ctx.value.ptr; + hdr_end = hdr_beg + ctx.value.len; + for (prev = hdr_beg; prev < hdr_end; prev = next) { + + /* Iterate through all cookies on this line */ + + /* find att_beg */ + att_beg = prev; + if (!is_first) + att_beg++; + is_first = 0; + + while (att_beg < hdr_end && HTTP_IS_SPHT(*att_beg)) + att_beg++; + + /* find att_end : this is the first character after the last non + * space before the equal. It may be equal to hdr_end. + */ + equal = att_end = att_beg; + + while (equal < hdr_end) { + if (*equal == '=' || *equal == ';') + break; + if (HTTP_IS_SPHT(*equal++)) + continue; + att_end = equal; + } + + /* here, <equal> points to '=', a delimiter or the end. <att_end> + * is between <att_beg> and <equal>, both may be identical. 
+ */ + + /* look for end of cookie if there is an equal sign */ + if (equal < hdr_end && *equal == '=') { + /* look for the beginning of the value */ + val_beg = equal + 1; + while (val_beg < hdr_end && HTTP_IS_SPHT(*val_beg)) + val_beg++; + + /* find the end of the value, respecting quotes */ + next = http_find_cookie_value_end(val_beg, hdr_end); + + /* make val_end point to the first white space or delimiter after the value */ + val_end = next; + while (val_end > val_beg && HTTP_IS_SPHT(*(val_end - 1))) + val_end--; + } + else { + /* <equal> points to next comma, semi-colon or EOL */ + val_beg = val_end = next = equal; + } + + if (next < hdr_end) { + /* For Set-Cookie, since commas are permitted + * in values, skip to the end. + */ + next = hdr_end; + } + + /* Now everything is as on the diagram above */ + + /* Ignore cookies with no equal sign */ + if (equal == val_end) + continue; + + /* If there are spaces around the equal sign, we need to + * strip them otherwise we'll get trouble for cookie captures, + * or even for rewrites. Since this happens extremely rarely, + * it does not hurt performance. + */ + if (unlikely(att_end != equal || val_beg > equal + 1)) { + int stripped_before = 0; + int stripped_after = 0; + + if (att_end != equal) { + memmove(att_end, equal, hdr_end - equal); + stripped_before = (att_end - equal); + equal += stripped_before; + val_beg += stripped_before; + } + + if (val_beg > equal + 1) { + memmove(equal + 1, val_beg, hdr_end + stripped_before - val_beg); + stripped_after = (equal + 1) - val_beg; + val_beg += stripped_after; + stripped_before += stripped_after; + } + + val_end += stripped_before; + next += stripped_before; + hdr_end += stripped_before; + + htx_change_blk_value_len(htx, ctx.blk, hdr_end - hdr_beg); + ctx.value.len = hdr_end - hdr_beg; + } + + /* First, let's see if we want to capture this cookie. We check + * that we don't already have a server side cookie, because we + * can only capture one. Also as an optimisation, we ignore + * cookies shorter than the declared name. + */ + if (sess->fe->capture_name != NULL && + txn->srv_cookie == NULL && + (val_end - att_beg >= sess->fe->capture_namelen) && + memcmp(att_beg, sess->fe->capture_name, sess->fe->capture_namelen) == 0) { + int log_len = val_end - att_beg; + if ((txn->srv_cookie = pool_alloc(pool_head_capture)) == NULL) { + ha_alert("HTTP logging : out of memory.\n"); + } + else { + if (log_len > sess->fe->capture_len) + log_len = sess->fe->capture_len; + memcpy(txn->srv_cookie, att_beg, log_len); + txn->srv_cookie[log_len] = 0; + } + } + + srv = objt_server(s->target); + /* now check if we need to process it for persistence */ + if (!(s->flags & SF_IGNORE_PRST) && + (att_end - att_beg == s->be->cookie_len) && (s->be->cookie_name != NULL) && + (memcmp(att_beg, s->be->cookie_name, att_end - att_beg) == 0)) { + /* assume passive cookie by default */ + txn->flags &= ~TX_SCK_MASK; + txn->flags |= TX_SCK_FOUND; + + /* If the cookie is in insert mode on a known server, we'll delete + * this occurrence because we'll insert another one later. + * We'll delete it too if the "indirect" option is set and we're in + * a direct access. + */ + if (s->be->ck_opts & PR_CK_PSV) { + /* The "preserve" flag was set, we don't want to touch the + * server's cookie. 
+ */ + } + else if ((srv && (s->be->ck_opts & PR_CK_INS)) || + ((s->flags & SF_DIRECT) && (s->be->ck_opts & PR_CK_IND))) { + /* this cookie must be deleted */ + if (prev == hdr_beg && next == hdr_end) { + /* whole header */ + http_remove_header(htx, &ctx); + /* note: while both invalid now, <next> and <hdr_end> + * are still equal, so the for() will stop as expected. + */ + } else { + /* just remove the value */ + int delta = http_del_hdr_value(hdr_beg, hdr_end, &prev, next); + next = prev; + hdr_end += delta; + } + txn->flags &= ~TX_SCK_MASK; + txn->flags |= TX_SCK_DELETED; + /* and go on with next cookie */ + } + else if (srv && srv->cookie && (s->be->ck_opts & PR_CK_RW)) { + /* replace bytes val_beg->val_end with the cookie name associated + * with this server since we know it. + */ + int sliding, delta; + + ctx.value = ist2(val_beg, val_end - val_beg); + ctx.lws_before = ctx.lws_after = 0; + http_replace_header_value(htx, &ctx, ist2(srv->cookie, srv->cklen)); + delta = srv->cklen - (val_end - val_beg); + sliding = (ctx.value.ptr - val_beg); + hdr_beg += sliding; + val_beg += sliding; + next += sliding + delta; + hdr_end += sliding + delta; + + txn->flags &= ~TX_SCK_MASK; + txn->flags |= TX_SCK_REPLACED; + } + else if (srv && srv->cookie && (s->be->ck_opts & PR_CK_PFX)) { + /* insert the cookie name associated with this server + * before the existing cookie, and insert a delimiter + * between them. + */ + int sliding, delta; + ctx.value = ist2(val_beg, 0); + ctx.lws_before = ctx.lws_after = 0; + http_replace_header_value(htx, &ctx, ist2(srv->cookie, srv->cklen + 1)); + delta = srv->cklen + 1; + sliding = (ctx.value.ptr - val_beg); + hdr_beg += sliding; + val_beg += sliding; + next += sliding + delta; + hdr_end += sliding + delta; + + val_beg[srv->cklen] = COOKIE_DELIM; + txn->flags &= ~TX_SCK_MASK; + txn->flags |= TX_SCK_REPLACED; + } + } + /* that's done for this cookie, check the next one on the same + * line when next != hdr_end (which should normally not happen + * with set-cookie2 support removed). + */ + } + } +} + +/* + * Parses the Cache-Control and Pragma request header fields to determine if + * the request may be served from the cache and/or if it is cacheable. Updates + * s->txn->flags. + */ +void http_check_request_for_cacheability(struct stream *s, struct channel *req) +{ + struct http_txn *txn = s->txn; + struct htx *htx; + struct http_hdr_ctx ctx = { .blk = NULL }; + int pragma_found, cc_found; + + if ((txn->flags & (TX_CACHEABLE|TX_CACHE_IGNORE)) == TX_CACHE_IGNORE) + return; /* nothing more to do here */ + + htx = htxbuf(&req->buf); + pragma_found = cc_found = 0; + + /* Check "pragma" header for HTTP/1.0 compatibility. */ + if (http_find_header(htx, ist("pragma"), &ctx, 1)) { + if (isteqi(ctx.value, ist("no-cache"))) { + pragma_found = 1; + } + } + + ctx.blk = NULL; + /* Don't use the cache and don't try to store if we found the + * Authorization header */ + if (http_find_header(htx, ist("authorization"), &ctx, 1)) { + txn->flags &= ~TX_CACHEABLE & ~TX_CACHE_COOK; + txn->flags |= TX_CACHE_IGNORE; + } + + + /* Look for "cache-control" header and iterate over all the values + * until we find one that specifies that caching is possible or not. */ + ctx.blk = NULL; + while (http_find_header(htx, ist("cache-control"), &ctx, 0)) { + cc_found = 1; + /* We don't check the values after max-age, max-stale nor min-fresh, + * we simply don't use the cache when they're specified. 
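+ * + * For example (illustrative requests): "Cache-Control: max-age=0" or "Cache-Control: no-cache" only sets TX_CACHE_IGNORE, so the cache lookup is bypassed while the response may still be stored, whereas "Cache-Control: no-store" clears TX_CACHEABLE and TX_CACHE_COOK so nothing is stored at all. A bare "Pragma: no-cache" is only honoured when no Cache-Control header is present, per RFC7234#5.4 below.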
*/ + if (istmatchi(ctx.value, ist("max-age")) || + istmatchi(ctx.value, ist("no-cache")) || + istmatchi(ctx.value, ist("max-stale")) || + istmatchi(ctx.value, ist("min-fresh"))) { + txn->flags |= TX_CACHE_IGNORE; + continue; + } + if (istmatchi(ctx.value, ist("no-store"))) { + txn->flags &= ~TX_CACHEABLE & ~TX_CACHE_COOK; + continue; + } + } + + /* RFC7234#5.4: + * When the Cache-Control header field is also present and + * understood in a request, Pragma is ignored. + * When the Cache-Control header field is not present in a + * request, caches MUST consider the no-cache request + * pragma-directive as having the same effect as if + * "Cache-Control: no-cache" were present. + */ + if (!cc_found && pragma_found) + txn->flags |= TX_CACHE_IGNORE; +} + +/* + * Check if response is cacheable or not. Updates s->txn->flags. + */ +void http_check_response_for_cacheability(struct stream *s, struct channel *res) +{ + struct http_txn *txn = s->txn; + struct http_hdr_ctx ctx = { .blk = NULL }; + struct htx *htx; + int has_freshness_info = 0; + int has_validator = 0; + int has_null_maxage = 0; + + if (txn->status < 200) { + /* do not try to cache interim responses! */ + txn->flags &= ~TX_CACHEABLE & ~TX_CACHE_COOK; + return; + } + + htx = htxbuf(&res->buf); + /* Check "pragma" header for HTTP/1.0 compatibility. */ + if (http_find_header(htx, ist("pragma"), &ctx, 1)) { + if (isteqi(ctx.value, ist("no-cache"))) { + txn->flags &= ~TX_CACHEABLE & ~TX_CACHE_COOK; + return; + } + } + + /* Look for "cache-control" header and iterate over all the values + * until we find one that specifies that caching is possible or not. */ + ctx.blk = NULL; + while (http_find_header(htx, ist("cache-control"), &ctx, 0)) { + if (isteqi(ctx.value, ist("public"))) { + txn->flags |= TX_CACHEABLE | TX_CACHE_COOK; + continue; + } + /* This max-age might be overridden by an s-maxage directive, do + * not unset the TX_CACHEABLE yet. */ + if (isteqi(ctx.value, ist("max-age=0"))) { + has_null_maxage = 1; + continue; + } + + if (isteqi(ctx.value, ist("private")) || + isteqi(ctx.value, ist("no-cache")) || + isteqi(ctx.value, ist("no-store")) || + isteqi(ctx.value, ist("s-maxage=0"))) { + txn->flags &= ~TX_CACHEABLE & ~TX_CACHE_COOK; + continue; + } + /* We might have a no-cache="set-cookie" form. */ + if (istmatchi(ctx.value, ist("no-cache=\"set-cookie"))) { + txn->flags &= ~TX_CACHE_COOK; + continue; + } + + if (istmatchi(ctx.value, ist("s-maxage"))) { + has_freshness_info = 1; + has_null_maxage = 0; /* The null max-age is overridden, ignore it */ + continue; + } + if (istmatchi(ctx.value, ist("max-age"))) { + has_freshness_info = 1; + continue; + } + } + + /* We had a 'max-age=0' directive but no extra s-maxage, do not cache + * the response. */ + if (has_null_maxage) { + txn->flags &= ~TX_CACHEABLE & ~TX_CACHE_COOK; + } + + /* If no freshness information could be found in Cache-Control values, + * look for an Expires header. */ + if (!has_freshness_info) { + ctx.blk = NULL; + has_freshness_info = http_find_header(htx, ist("expires"), &ctx, 0); + } + + /* If no freshness information could be found in Cache-Control or Expires + * values, look for an explicit validator. */ + if (!has_freshness_info) { + ctx.blk = NULL; + has_validator = 1; + if (!http_find_header(htx, ist("etag"), &ctx, 0)) { + ctx.blk = NULL; + if (!http_find_header(htx, ist("last-modified"), &ctx, 0)) + has_validator = 0; + } + } + + /* We won't store an entry that has neither a cache validator nor an + * explicit expiration time, as suggested in RFC 7234#3. 
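+ * + * For example (illustrative responses): "Cache-Control: max-age=0" alone makes the response non-cacheable, but "Cache-Control: max-age=0, s-maxage=60" remains cacheable because s-maxage overrides the null max-age; likewise, a response with no Cache-Control freshness information, no Expires header and no ETag or Last-Modified validator is not stored.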
*/ + if (!has_freshness_info && !has_validator) + txn->flags &= ~TX_CACHEABLE; +} + +/* + * In a GET, HEAD or POST request, check if the requested URI matches the stats uri + * for the current proxy. + * + * It is assumed that the request is either a HEAD, GET, or POST and that the + * uri_auth field is valid. + * + * Returns 1 if stats should be provided, otherwise 0. + */ +static int http_stats_check_uri(struct stream *s, struct http_txn *txn, struct proxy *px) +{ + struct uri_auth *uri_auth = px->uri_auth; + struct htx *htx; + struct htx_sl *sl; + struct ist uri; + + if (!uri_auth) + return 0; + + if (txn->meth != HTTP_METH_GET && txn->meth != HTTP_METH_HEAD && txn->meth != HTTP_METH_POST) + return 0; + + htx = htxbuf(&s->req.buf); + sl = http_get_stline(htx); + uri = htx_sl_req_uri(sl); + if (*uri_auth->uri_prefix == '/') { + struct http_uri_parser parser = http_uri_parser_init(uri); + uri = http_parse_path(&parser); + } + + /* check URI size */ + if (uri_auth->uri_len > uri.len) + return 0; + + if (memcmp(uri.ptr, uri_auth->uri_prefix, uri_auth->uri_len) != 0) + return 0; + + return 1; +} + +/* This function prepares an applet to handle the stats. It can deal with the + * "100-continue" expectation, check that admin rules are met for POST requests, + * and program a response message if something was unexpected. It cannot fail + * and always relies on the stats applet to complete the job. It does not touch + * analysers nor counters, which are left to the caller. It does not touch + * s->target which is supposed to already point to the stats applet. The caller + * is expected to have already assigned an appctx to the stream. + */ +static int http_handle_stats(struct stream *s, struct channel *req, struct proxy *px) +{ + struct stats_admin_rule *stats_admin_rule; + struct session *sess = s->sess; + struct http_txn *txn = s->txn; + struct http_msg *msg = &txn->req; + struct uri_auth *uri_auth = px->uri_auth; + const char *h, *lookup, *end; + struct appctx *appctx = __sc_appctx(s->scb); + struct show_stat_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + struct htx *htx; + struct htx_sl *sl; + + appctx->st1 = 0; + ctx->state = STAT_STATE_INIT; + ctx->st_code = STAT_STATUS_INIT; + ctx->http_px = px; + ctx->flags |= uri_auth->flags; + ctx->flags |= STAT_FMT_HTML; /* assume HTML mode by default */ + if ((msg->flags & HTTP_MSGF_VER_11) && (txn->meth != HTTP_METH_HEAD)) + ctx->flags |= STAT_CHUNKED; + + htx = htxbuf(&req->buf); + sl = http_get_stline(htx); + lookup = HTX_SL_REQ_UPTR(sl) + uri_auth->uri_len; + end = HTX_SL_REQ_UPTR(sl) + HTX_SL_REQ_ULEN(sl); + + for (h = lookup; h <= end - 3; h++) { + if (memcmp(h, ";up", 3) == 0) { + ctx->flags |= STAT_HIDE_DOWN; + break; + } + } + + for (h = lookup; h <= end - 9; h++) { + if (memcmp(h, ";no-maint", 9) == 0) { + ctx->flags |= STAT_HIDE_MAINT; + break; + } + } + + if (uri_auth->refresh) { + for (h = lookup; h <= end - 10; h++) { + if (memcmp(h, ";norefresh", 10) == 0) { + ctx->flags |= STAT_NO_REFRESH; + break; + } + } + } + + for (h = lookup; h <= end - 4; h++) { + if (memcmp(h, ";csv", 4) == 0) { + ctx->flags &= ~(STAT_FMT_MASK|STAT_JSON_SCHM); + break; + } + } + + for (h = lookup; h <= end - 6; h++) { + if (memcmp(h, ";typed", 6) == 0) { + ctx->flags &= ~(STAT_FMT_MASK|STAT_JSON_SCHM); + ctx->flags |= STAT_FMT_TYPED; + break; + } + } + + for (h = lookup; h <= end - 5; h++) { + if (memcmp(h, ";json", 5) == 0) { + ctx->flags &= ~(STAT_FMT_MASK|STAT_JSON_SCHM); + ctx->flags |= STAT_FMT_JSON; + break; + } + } + + for (h = lookup; h <= 
end - 12; h++) { + if (memcmp(h, ";json-schema", 12) == 0) { + ctx->flags &= ~STAT_FMT_MASK; + ctx->flags |= STAT_JSON_SCHM; + break; + } + } + + for (h = lookup; h <= end - 8; h++) { + if (memcmp(h, ";st=", 4) == 0) { + int i; + h += 4; + ctx->st_code = STAT_STATUS_UNKN; + for (i = STAT_STATUS_INIT + 1; i < STAT_STATUS_SIZE; i++) { + if (strncmp(stat_status_codes[i], h, 4) == 0) { + ctx->st_code = i; + break; + } + } + break; + } + } + + ctx->scope_str = 0; + ctx->scope_len = 0; + for (h = lookup; h <= end - 8; h++) { + if (memcmp(h, STAT_SCOPE_INPUT_NAME "=", strlen(STAT_SCOPE_INPUT_NAME) + 1) == 0) { + int itx = 0; + const char *h2; + char scope_txt[STAT_SCOPE_TXT_MAXLEN + 1]; + const char *err; + + h += strlen(STAT_SCOPE_INPUT_NAME) + 1; + h2 = h; + ctx->scope_str = h2 - HTX_SL_REQ_UPTR(sl); + while (h < end) { + if (*h == ';' || *h == '&' || *h == ' ') + break; + itx++; + h++; + } + + if (itx > STAT_SCOPE_TXT_MAXLEN) + itx = STAT_SCOPE_TXT_MAXLEN; + ctx->scope_len = itx; + + /* scope_txt = search query, ctx->scope_len is always <= STAT_SCOPE_TXT_MAXLEN */ + memcpy(scope_txt, h2, itx); + scope_txt[itx] = '\0'; + err = invalid_char(scope_txt); + if (err) { + /* bad char in search text => clear scope */ + ctx->scope_str = 0; + ctx->scope_len = 0; + } + break; + } + } + + /* now check whether we have some admin rules for this request */ + list_for_each_entry(stats_admin_rule, &uri_auth->admin_rules, list) { + int ret = 1; + + if (stats_admin_rule->cond) { + ret = acl_exec_cond(stats_admin_rule->cond, s->be, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL); + ret = acl_pass(ret); + if (stats_admin_rule->cond->pol == ACL_COND_UNLESS) + ret = !ret; + } + + if (ret) { + /* no rule, or the rule matches */ + ctx->flags |= STAT_ADMIN; + break; + } + } + + if (txn->meth == HTTP_METH_GET || txn->meth == HTTP_METH_HEAD) + appctx->st0 = STAT_HTTP_HEAD; + else if (txn->meth == HTTP_METH_POST) { + if (ctx->flags & STAT_ADMIN) { + appctx->st0 = STAT_HTTP_POST; + if (msg->msg_state < HTTP_MSG_DATA) + req->analysers |= AN_REQ_HTTP_BODY; + } + else { + /* POST without admin level */ + ctx->flags &= ~STAT_CHUNKED; + ctx->st_code = STAT_STATUS_DENY; + appctx->st0 = STAT_HTTP_LAST; + } + } + else { + /* Unsupported method */ + ctx->flags &= ~STAT_CHUNKED; + ctx->st_code = STAT_STATUS_IVAL; + appctx->st0 = STAT_HTTP_LAST; + } + + s->task->nice = -32; /* small boost for HTTP statistics */ + return 1; +} + +/* This function waits for the message payload for at most <time> milliseconds + * (which may be set to TICK_ETERNITY). It stops waiting once at least <bytes> + * bytes of the payload have been received (0 means no limit). It returns + * HTTP_RULE_* depending on the result: + * + * - HTTP_RULE_RES_CONT when conditions are met to stop waiting + * - HTTP_RULE_RES_YIELD to wait for more data + * - HTTP_RULE_RES_ABRT when a timeout occurs + * - HTTP_RULE_RES_BADREQ if a parsing error is raised by the lower level + * - HTTP_RULE_RES_ERROR if an internal error occurs + * + * If a timeout occurs, this function is responsible for emitting the right + * response to the client, depending on the channel (408 on the request side, + * 504 on the response side). All other errors must be handled by the caller. + */ +enum rule_result http_wait_for_msg_body(struct stream *s, struct channel *chn, + unsigned int time, unsigned int bytes) +{ + struct session *sess = s->sess; + struct http_txn *txn = s->txn; + struct http_msg *msg = ((chn->flags & CF_ISRESP) ? 
&txn->rsp : &txn->req); + struct htx *htx; + enum rule_result ret = HTTP_RULE_RES_CONT; + + htx = htxbuf(&chn->buf); + + if (htx->flags & HTX_FL_PARSING_ERROR) { + ret = HTTP_RULE_RES_BADREQ; + goto end; + } + if (htx->flags & HTX_FL_PROCESSING_ERROR) { + ret = HTTP_RULE_RES_ERROR; + goto end; + } + + /* Do nothing for bodyless and CONNECT requests */ + if (txn->meth == HTTP_METH_CONNECT || (msg->flags & HTTP_MSGF_BODYLESS)) + goto end; + + if (!(chn->flags & CF_ISRESP)) { + if (http_handle_expect_hdr(s, htx, msg) == -1) { + ret = HTTP_RULE_RES_ERROR; + goto end; + } + } + + /* Now we're waiting for the payload. We just need to know if all + * data have been received or if the buffer is full. + */ + if ((htx->flags & HTX_FL_EOM) || + htx_get_tail_type(htx) > HTX_BLK_DATA || + channel_htx_full(chn, htx, global.tune.maxrewrite) || + sc_waiting_room(chn_prod(chn))) + goto end; + + if (bytes) { + struct htx_blk *blk; + unsigned int len = 0; + + for (blk = htx_get_first_blk(htx); blk; blk = htx_get_next_blk(htx, blk)) { + if (htx_get_blk_type(blk) != HTX_BLK_DATA) + continue; + len += htx_get_blksz(blk); + if (len >= bytes) + goto end; + } + } + + if ((chn->flags & CF_READ_TIMEOUT) || tick_is_expired(chn->analyse_exp, now_ms)) { + if (!(chn->flags & CF_ISRESP)) + goto abort_req; + goto abort_res; + } + + /* we get here if we need to wait for more data */ + if (!(chn_prod(chn)->flags & (SC_FL_EOS|SC_FL_ABRT_DONE))) { + if (!tick_isset(chn->analyse_exp)) + chn->analyse_exp = tick_add_ifset(now_ms, time); + ret = HTTP_RULE_RES_YIELD; + } + + end: + return ret; + + abort: + http_set_term_flags(s); + http_reply_and_close(s, txn->status, http_error_message(s)); + ret = HTTP_RULE_RES_ABRT; + goto end; + + abort_req: + txn->status = 408; + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_CLITO; + _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_req); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->failed_req); + goto abort; + + abort_res: + txn->status = 504; + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_SRVTO; + stream_inc_http_fail_ctr(s); + goto abort; +} + +void http_perform_server_redirect(struct stream *s, struct stconn *sc) +{ + struct channel *req = &s->req; + struct channel *res = &s->res; + struct server *srv; + struct htx *htx; + struct htx_sl *sl; + struct ist path, location; + unsigned int flags; + struct http_uri_parser parser; + + /* + * Create the location + */ + chunk_reset(&trash); + + /* 1: add the server's prefix */ + /* special prefix "/" means don't change URL */ + srv = __objt_server(s->target); + if (srv->rdr_len != 1 || *srv->rdr_pfx != '/') { + if (!chunk_memcat(&trash, srv->rdr_pfx, srv->rdr_len)) + return; + } + + /* 2: add the request path */ + htx = htxbuf(&req->buf); + sl = http_get_stline(htx); + parser = http_uri_parser_init(htx_sl_req_uri(sl)); + path = http_parse_path(&parser); + if (!isttest(path)) + return; + + if (!chunk_memcat(&trash, path.ptr, path.len)) + return; + location = ist2(trash.area, trash.data); + + /* + * Create the 302 response + */ + htx = htx_from_buf(&res->buf); + flags = (HTX_SL_F_IS_RESP|HTX_SL_F_VER_11|HTX_SL_F_XFER_LEN|HTX_SL_F_CLEN|HTX_SL_F_BODYLESS); + sl = htx_add_stline(htx, HTX_BLK_RES_SL, flags, + ist("HTTP/1.1"), ist("302"), ist("Found")); + if (!sl) + goto fail; + sl->info.res.status = 302; + s->txn->status = 302; + + if (!htx_add_header(htx, ist("Cache-Control"), ist("no-cache")) || + !htx_add_header(htx, ist("Connection"), ist("close")) || + !htx_add_header(htx, 
ist("Content-length"), ist("0")) || + !htx_add_header(htx, ist("Location"), location)) + goto fail; + + if (!htx_add_endof(htx, HTX_BLK_EOH)) + goto fail; + + htx->flags |= HTX_FL_EOM; + htx_to_buf(htx, &res->buf); + + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_LOCAL; + if (!(s->flags & SF_FINST_MASK)) + s->flags |= SF_FINST_C; + + if (!http_forward_proxy_resp(s, 1)) + goto fail; + + /* return without error. */ + sc_abort(sc); + sc_shutdown(sc); + s->conn_err_type = STRM_ET_NONE; + sc->state = SC_ST_CLO; + + + /* FIXME: we should increase a counter of redirects per server and per backend. */ + srv_inc_sess_ctr(srv); + srv_set_sess_last(srv); + return; + + fail: + /* If an error occurred, remove the incomplete HTTP response from the + * buffer */ + channel_htx_truncate(res, htx); +} + +/* This function terminates the request because it was completely analyzed or + * because an error was triggered during the body forwarding. + */ +static void http_end_request(struct stream *s) +{ + struct channel *chn = &s->req; + struct http_txn *txn = s->txn; + + DBG_TRACE_ENTER(STRM_EV_HTTP_ANA, s, txn); + + if (unlikely(txn->req.msg_state < HTTP_MSG_DONE)) { + DBG_TRACE_DEVEL("waiting end of the request", STRM_EV_HTTP_ANA, s, txn); + return; + } + + if (txn->req.msg_state == HTTP_MSG_DONE) { + /* No need to read anymore, the request was completely parsed. + * We can shut the read side unless we want to abort_on_close, + * or we have a POST request. The issue with POST requests is + * that some browsers still send a CRLF after the request, and + * this CRLF must be read so that it does not remain in the kernel + * buffers, otherwise a close could cause an RST on some systems + * (eg: Linux). + */ + if (!(s->be->options & PR_O_ABRT_CLOSE) && txn->meth != HTTP_METH_POST) + channel_dont_read(chn); + + /* if the server closes the connection, we want to immediately react + * and close the socket to save packets and syscalls. + */ + s->scb->flags |= SC_FL_NOHALF; + + /* In any case we've finished parsing the request so we must + * disable Nagle when sending data because 1) we're not going + * to shut this side, and 2) the server is waiting for us to + * send pending data. + */ + s->scb->flags |= SC_FL_SND_NEVERWAIT; + + if (txn->rsp.msg_state < HTTP_MSG_BODY || + (txn->rsp.msg_state < HTTP_MSG_DONE && s->scb->state != SC_ST_CLO)) { + /* The server has not finished to respond and the + * backend SC is not closed, so we don't want to move in + * order not to upset it. + */ + DBG_TRACE_DEVEL("waiting end of the response", STRM_EV_HTTP_ANA, s, txn); + return; + } + + /* When we get here, it means that both the request and the + * response have finished receiving. Depending on the connection + * mode, we'll have to wait for the last bytes to leave in either + * direction, and sometimes for a close to be effective. + */ + if (txn->flags & TX_CON_WANT_TUN) { + /* Tunnel mode will not have any analyser so it needs to + * poll for reads. + */ + channel_auto_read(&s->req); + txn->req.msg_state = HTTP_MSG_TUNNEL; + if (txn->rsp.msg_state != HTTP_MSG_TUNNEL) + s->res.flags |= CF_WAKE_ONCE; + } + else { + /* we're not expecting any new data to come for this + * transaction, so we can close it. + * + * However, there is an exception if the response + * length is undefined. In this case, we need to wait + * the close from the server. The response will be + * switched in TUNNEL mode until the end. 
+ */ + if (!(txn->rsp.flags & HTTP_MSGF_XFER_LEN) && + txn->rsp.msg_state != HTTP_MSG_CLOSED) + goto check_channel_flags; + + if (!(s->scb->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED))) { + sc_schedule_abort(s->scf); + sc_schedule_shutdown(s->scb); + } + } + goto check_channel_flags; + } + + if (txn->req.msg_state == HTTP_MSG_CLOSING) { + http_msg_closing: + /* nothing else to forward, just waiting for the output buffer + * to be empty and for the shut_wanted to take effect. + */ + if (!co_data(chn)) { + txn->req.msg_state = HTTP_MSG_CLOSED; + goto http_msg_closed; + } + DBG_TRACE_LEAVE(STRM_EV_HTTP_ANA, s, txn); + return; + } + + if (txn->req.msg_state == HTTP_MSG_CLOSED) { + http_msg_closed: + /* if we don't know whether the server will close, we need to hard close */ + if (txn->rsp.flags & HTTP_MSGF_XFER_LEN) + s->scb->flags |= SC_FL_NOLINGER; /* we want to close ASAP */ + /* see above in MSG_DONE why we only do this in these states */ + if (!(s->be->options & PR_O_ABRT_CLOSE)) + channel_dont_read(chn); + goto end; + } + + check_channel_flags: + /* Here, we are in HTTP_MSG_DONE or HTTP_MSG_TUNNEL */ + if (s->scb->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) { + /* if we've just closed an output, let's switch */ + txn->req.msg_state = HTTP_MSG_CLOSING; + goto http_msg_closing; + } + + end: + chn->analysers &= AN_REQ_FLT_END; + if (txn->req.msg_state == HTTP_MSG_TUNNEL) { + s->scb->flags |= SC_FL_SND_NEVERWAIT; + if (HAS_REQ_DATA_FILTERS(s)) + chn->analysers |= AN_REQ_FLT_XFER_DATA; + else + c_adv(chn, htxbuf(&chn->buf)->data - co_data(chn)); + } + channel_auto_close(chn); + channel_auto_read(chn); + DBG_TRACE_LEAVE(STRM_EV_HTTP_ANA, s, txn); +} + + +/* This function terminates the response because it was completely analyzed or + * because an error was triggered during the body forwarding. + */ +static void http_end_response(struct stream *s) +{ + struct channel *chn = &s->res; + struct http_txn *txn = s->txn; + + DBG_TRACE_ENTER(STRM_EV_HTTP_ANA, s, txn); + + if (unlikely(txn->rsp.msg_state < HTTP_MSG_DONE)) { + DBG_TRACE_DEVEL("waiting end of the response", STRM_EV_HTTP_ANA, s, txn); + return; + } + + if (txn->rsp.msg_state == HTTP_MSG_DONE) { + /* In theory, we don't need to read anymore, but we must + * still monitor the server connection for a possible close + * while the request is being uploaded, so we don't disable + * reading. + */ + /* channel_dont_read(chn); */ + + if (txn->req.msg_state < HTTP_MSG_DONE && s->scf->state != SC_ST_CLO) { + /* The client seems to still be sending data, probably + * because we got an error response during an upload. + * We have the choice of either breaking the connection + * or letting it pass through. Let's do the latter. + */ + DBG_TRACE_DEVEL("waiting end of the request", STRM_EV_HTTP_ANA, s, txn); + return; + } + + /* When we get here, it means that both the request and the + * response have finished receiving. Depending on the connection + * mode, we'll have to wait for the last bytes to leave in either + * direction, and sometimes for a close to be effective. + */ + if (txn->flags & TX_CON_WANT_TUN) { + channel_auto_read(&s->res); + txn->rsp.msg_state = HTTP_MSG_TUNNEL; + if (txn->req.msg_state != HTTP_MSG_TUNNEL) + s->req.flags |= CF_WAKE_ONCE; + } + else { + /* we're not expecting any new data to come for this + * transaction, so we can close it. 
+ */ + if (!(s->scf->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED))) { + sc_schedule_abort(s->scb); + sc_schedule_shutdown(s->scf); + } + } + goto check_channel_flags; + } + + if (txn->rsp.msg_state == HTTP_MSG_CLOSING) { + http_msg_closing: + /* nothing else to forward, just waiting for the output buffer + * to be empty and for the shut_wanted to take effect. + */ + if (!co_data(chn)) { + txn->rsp.msg_state = HTTP_MSG_CLOSED; + goto http_msg_closed; + } + DBG_TRACE_LEAVE(STRM_EV_HTTP_ANA, s, txn); + return; + } + + if (txn->rsp.msg_state == HTTP_MSG_CLOSED) { + http_msg_closed: + /* drop any pending data */ + channel_htx_truncate(&s->req, htxbuf(&s->req.buf)); + channel_abort(&s->req); + goto end; + } + + check_channel_flags: + /* Here, we are in HTTP_MSG_DONE or HTTP_MSG_TUNNEL */ + if (s->scf->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) { + /* if we've just closed an output, let's switch */ + txn->rsp.msg_state = HTTP_MSG_CLOSING; + goto http_msg_closing; + } + + end: + chn->analysers &= AN_RES_FLT_END; + if (txn->rsp.msg_state == HTTP_MSG_TUNNEL) { + s->scf->flags |= SC_FL_SND_NEVERWAIT; + if (HAS_RSP_DATA_FILTERS(s)) + chn->analysers |= AN_RES_FLT_XFER_DATA; + else + c_adv(chn, htxbuf(&chn->buf)->data - co_data(chn)); + } + channel_auto_close(chn); + channel_auto_read(chn); + DBG_TRACE_LEAVE(STRM_EV_HTTP_ANA, s, txn); +} + +/* Forward a response generated by HAProxy (error/redirect/return). This + * function forwards all pending incoming data. If <final> is set to 0, nothing + * more is performed. It is used for 1xx informational messages. Otherwise, the + * transaction is terminated and the request is emptied. On success, 1 is + * returned. If an error occurs, 0 is returned; in that case this function + * performs no cleanup, which is the caller's responsibility. + */ +int http_forward_proxy_resp(struct stream *s, int final) +{ + struct channel *req = &s->req; + struct channel *res = &s->res; + struct htx *htx = htxbuf(&res->buf); + size_t data; + + if (final) { + htx->flags |= HTX_FL_PROXY_RESP; + + if (!htx_is_empty(htx) && !http_eval_after_res_rules(s)) + return 0; + + if (s->txn->meth == HTTP_METH_HEAD) + htx_skip_msg_payload(htx); + + channel_auto_read(req); + channel_abort(req); + channel_htx_erase(req, htxbuf(&req->buf)); + + channel_auto_read(res); + channel_auto_close(res); + sc_schedule_abort(s->scb); + s->scb->flags |= SC_FL_EOI; /* The response is terminated, add EOI */ + htxbuf(&res->buf)->flags |= HTX_FL_EOM; /* no more data are expected */ + } + else { + /* Send ASAP informational messages. Rely on SC_FL_EOI for final + * response. + */ + s->scf->flags |= SC_FL_SND_ASAP; + } + + data = htx->data - co_data(res); + c_adv(res, data); + htx->first = -1; + res->total += data; + return 1; +} + +void http_server_error(struct stream *s, struct stconn *sc, int err, + int finst, struct http_reply *msg) +{ + if (!(s->flags & SF_ERR_MASK)) + s->flags |= err; + if (!(s->flags & SF_FINST_MASK)) + s->flags |= finst; + + http_reply_and_close(s, s->txn->status, msg); +} + +void http_reply_and_close(struct stream *s, short status, struct http_reply *msg) +{ + if (!msg) { + channel_htx_truncate(&s->res, htxbuf(&s->res.buf)); + goto end; + } + + if (http_reply_message(s, msg) == -1) { + /* On error, return a 500 error message, but don't rewrite it if + * it is already an internal error. If it was already a "const" + * 500 error, just fail. 
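+ * + * For instance (illustration): if formatting a configured 503 reply fails, the function retries with the internal 500 message; if that 500 cannot be produced either, TX_CONST_REPLY is set so that a further failure gives up and closes the stream without a body instead of looping forever.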
+ */ + if (s->txn->status == 500) { + if (s->txn->flags & TX_CONST_REPLY) + goto end; + s->txn->flags |= TX_CONST_REPLY; + } + s->txn->status = 500; + s->txn->http_reply = NULL; + return http_reply_and_close(s, s->txn->status, http_error_message(s)); + } + +end: + /* At this stage, the HTTP analysis is finished */ + s->req.analysers &= AN_REQ_FLT_END; + s->req.analyse_exp = TICK_ETERNITY; + + s->res.analysers &= AN_RES_FLT_END; + s->res.analyse_exp = TICK_ETERNITY; + + channel_auto_read(&s->req); + channel_abort(&s->req); + channel_htx_erase(&s->req, htxbuf(&s->req.buf)); + channel_auto_read(&s->res); + channel_auto_close(&s->res); + sc_schedule_abort(s->scb); +} + +struct http_reply *http_error_message(struct stream *s) +{ + const int msgnum = http_get_status_idx(s->txn->status); + + if (s->txn->http_reply) + return s->txn->http_reply; + else if (s->be->replies[msgnum]) + return s->be->replies[msgnum]; + else if (strm_fe(s)->replies[msgnum]) + return strm_fe(s)->replies[msgnum]; + else + return &http_err_replies[msgnum]; +} + +/* Produces an HTX message from an http reply. Depending on the http reply + * type, an errorfile, a raw file or a log-format string is used. On success, + * it returns 0. If an error occurs, -1 is returned; in that case this function + * performs no cleanup, which is the caller's responsibility. + */ +int http_reply_to_htx(struct stream *s, struct htx *htx, struct http_reply *reply) +{ + struct buffer *errmsg; + struct htx_sl *sl; + struct buffer *body = NULL; + const char *status, *reason, *clen, *ctype; + unsigned int slflags; + int ret = 0; + + /* + * - HTTP_REPLY_ERRFILES is unexpected here; it is handled as no payload + * if encountered + * + * - HTTP_REPLY_INDIRECT: switch on another reply if defined or handled + * as no payload if NULL. The TXN status code is set to the status + * of the original reply. + */ + + if (reply->type == HTTP_REPLY_INDIRECT) { + if (reply->body.reply) + reply = reply->body.reply; + } + if (reply->type == HTTP_REPLY_ERRMSG && !reply->body.errmsg) { + /* get default error message */ + if (reply == s->txn->http_reply) + s->txn->http_reply = NULL; + reply = http_error_message(s); + if (reply->type == HTTP_REPLY_INDIRECT) { + if (reply->body.reply) + reply = reply->body.reply; + } + } + + if (reply->type == HTTP_REPLY_ERRMSG) { + /* implicit or explicit error message */ + errmsg = reply->body.errmsg; + if (errmsg && !b_is_null(errmsg)) { + if (!htx_copy_msg(htx, errmsg)) + goto fail; + } + } + else { + /* no payload, file or log-format string */ + if (reply->type == HTTP_REPLY_RAW) { + /* file */ + body = &reply->body.obj; + } + else if (reply->type == HTTP_REPLY_LOGFMT) { + /* log-format string */ + body = alloc_trash_chunk(); + if (!body) + goto fail_alloc; + body->data = build_logline(s, body->area, body->size, &reply->body.fmt); + } + /* else no payload */ + + status = ultoa(reply->status); + reason = http_get_reason(reply->status); + slflags = (HTX_SL_F_IS_RESP|HTX_SL_F_VER_11|HTX_SL_F_XFER_LEN|HTX_SL_F_CLEN); + if (!body || !b_data(body)) + slflags |= HTX_SL_F_BODYLESS; + sl = htx_add_stline(htx, HTX_BLK_RES_SL, slflags, ist("HTTP/1.1"), ist(status), ist(reason)); + if (!sl) + goto fail; + sl->info.res.status = reply->status; + + clen = (body ? 
ultoa(b_data(body)) : "0"); + ctype = reply->ctype; + + if (!LIST_ISEMPTY(&reply->hdrs)) { + struct http_reply_hdr *hdr; + struct buffer *value = alloc_trash_chunk(); + + if (!value) + goto fail; + + list_for_each_entry(hdr, &reply->hdrs, list) { + chunk_reset(value); + value->data = build_logline(s, value->area, value->size, &hdr->value); + if (b_data(value) && !htx_add_header(htx, hdr->name, ist2(b_head(value), b_data(value)))) { + free_trash_chunk(value); + goto fail; + } + chunk_reset(value); + } + free_trash_chunk(value); + } + + if (!htx_add_header(htx, ist("content-length"), ist(clen)) || + (body && b_data(body) && ctype && !htx_add_header(htx, ist("content-type"), ist(ctype))) || + !htx_add_endof(htx, HTX_BLK_EOH) || + (body && b_data(body) && !htx_add_data_atonce(htx, ist2(b_head(body), b_data(body))))) + goto fail; + + htx->flags |= HTX_FL_EOM; + } + + leave: + if (reply->type == HTTP_REPLY_LOGFMT) + free_trash_chunk(body); + return ret; + + fail_alloc: + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_RESOURCE; + /* fall through */ + fail: + ret = -1; + goto leave; +} + +/* Send an http reply to the client. On success, it returns 0. If an error + * occurs, -1 is returned and the response channel is truncated, thus removing + * the faulty reply. This function may fail when the reply is formatted + * (http_reply_to_htx) or when the reply is forwarded + * (http_forward_proxy_resp). In the latter case, it is because an + * http-after-response rule failed. + */ +int http_reply_message(struct stream *s, struct http_reply *reply) +{ + struct channel *res = &s->res; + struct htx *htx = htx_from_buf(&res->buf); + + if (s->txn->status == -1) + s->txn->status = reply->status; + channel_htx_truncate(res, htx); + + if (http_reply_to_htx(s, htx, reply) == -1) + goto fail; + + htx_to_buf(htx, &s->res.buf); + if (!http_forward_proxy_resp(s, 1)) + goto fail; + return 0; + + fail: + channel_htx_truncate(res, htx); + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_PRXCOND; + return -1; +} + +/* Return the error message corresponding to s->conn_err_type. It is assumed + * that the server side is closed. Note that err_type is actually a + * bitmask, where mostly only aborts may be combined with other + * values. We consider that aborted operations are more important + * than timeouts or errors, because nothing else in the logs would + * explain incomplete retries. All others should avoid + * being combined. It should normally not be possible to have multiple + * aborts at once, but just in case, the first one in sequence is reported. + * Note that connection errors appearing on the second request of a keep-alive + * connection are not reported since this allows the client to retry. 
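+ * + * For example (illustration): a connect timeout on the first request of a connection (STRM_ET_CONN_TO with TX_NOT_FIRST unset) produces a 503 with SF_ERR_SRVTO/SF_FINST_C and the configured 503 error message, while the same timeout on a subsequent keep-alive request keeps the 503 status but sends no message body, letting the client retry transparently.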
+ */ +void http_return_srv_error(struct stream *s, struct stconn *sc) +{ + int err_type = s->conn_err_type; + + /* set s->txn->status for http_error_message(s) */ + if (err_type & STRM_ET_QUEUE_ABRT) { + s->txn->status = -1; + http_server_error(s, sc, SF_ERR_CLICL, SF_FINST_Q, NULL); + } + else if (err_type & STRM_ET_CONN_ABRT) { + s->txn->status = -1; + http_server_error(s, sc, SF_ERR_CLICL, SF_FINST_C, NULL); + } + else if (err_type & STRM_ET_QUEUE_TO) { + s->txn->status = 503; + http_server_error(s, sc, SF_ERR_SRVTO, SF_FINST_Q, + http_error_message(s)); + } + else if (err_type & STRM_ET_QUEUE_ERR) { + s->txn->status = 503; + http_server_error(s, sc, SF_ERR_SRVCL, SF_FINST_Q, + http_error_message(s)); + } + else if (err_type & STRM_ET_CONN_TO) { + s->txn->status = 503; + http_server_error(s, sc, SF_ERR_SRVTO, SF_FINST_C, + (s->txn->flags & TX_NOT_FIRST) ? NULL : + http_error_message(s)); + } + else if (err_type & STRM_ET_CONN_ERR) { + s->txn->status = 503; + http_server_error(s, sc, SF_ERR_SRVCL, SF_FINST_C, + (s->flags & SF_SRV_REUSED) ? NULL : + http_error_message(s)); + } + else if (err_type & STRM_ET_CONN_RES) { + s->txn->status = 503; + http_server_error(s, sc, SF_ERR_RESOURCE, SF_FINST_C, + (s->txn->flags & TX_NOT_FIRST) ? NULL : + http_error_message(s)); + } + else { /* STRM_ET_CONN_OTHER and others */ + s->txn->status = 500; + http_server_error(s, sc, SF_ERR_INTERNAL, SF_FINST_C, + http_error_message(s)); + } +} + + +/* Handle Expect: 100-continue for HTTP/1.1 messages if necessary. It returns 0 + * on success and -1 on error. + */ +static int http_handle_expect_hdr(struct stream *s, struct htx *htx, struct http_msg *msg) +{ + /* If we have HTTP/1.1 message with a body and Expect: 100-continue, + * then we must send an HTTP/1.1 100 Continue intermediate response. + */ + if (!(msg->flags & HTTP_MSGF_EXPECT_CHECKED) && + (msg->flags & HTTP_MSGF_VER_11) && + (msg->flags & (HTTP_MSGF_CNT_LEN|HTTP_MSGF_TE_CHNK))) { + struct ist hdr = { .ptr = "Expect", .len = 6 }; + struct http_hdr_ctx ctx; + + ctx.blk = NULL; + /* Expect is allowed in 1.1, look for it */ + if (http_find_header(htx, hdr, &ctx, 0) && + unlikely(isteqi(ctx.value, ist2("100-continue", 12)))) { + if (http_reply_100_continue(s) == -1) + return -1; + http_remove_header(htx, &ctx); + } + } + msg->flags |= HTTP_MSGF_EXPECT_CHECKED; + return 0; +} + +/* Send a 100-Continue response to the client. It returns 0 on success and -1 + * on error. The response channel is updated accordingly. + */ +static int http_reply_100_continue(struct stream *s) +{ + struct channel *res = &s->res; + struct htx *htx = htx_from_buf(&res->buf); + struct htx_sl *sl; + unsigned int flags = (HTX_SL_F_IS_RESP|HTX_SL_F_VER_11| + HTX_SL_F_XFER_LEN|HTX_SL_F_BODYLESS); + + sl = htx_add_stline(htx, HTX_BLK_RES_SL, flags, + ist("HTTP/1.1"), ist("100"), ist("Continue")); + if (!sl) + goto fail; + sl->info.res.status = 100; + + if (!htx_add_endof(htx, HTX_BLK_EOH)) + goto fail; + + if (!http_forward_proxy_resp(s, 0)) + goto fail; + return 0; + + fail: + /* If an error occurred, remove the incomplete HTTP response from the + * buffer */ + channel_htx_truncate(res, htx); + return -1; +} + + +/* + * Capture headers from message <htx> according to header list <cap_hdr>, and + * fill the <cap> pointers appropriately. 
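+ * + * For example (hypothetical config): with "capture request header Host len 16" declared in the frontend, a request carrying "Host: www.example.com" stores that value, truncated to 16 bytes, in the slot reserved for this capture (cap[0] for the first declared one), and the captured headers later appear between braces in the logs.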
+ */ +static void http_capture_headers(struct htx *htx, char **cap, struct cap_hdr *cap_hdr) +{ + struct cap_hdr *h; + int32_t pos; + + for (pos = htx_get_first(htx); pos != -1; pos = htx_get_next(htx, pos)) { + struct htx_blk *blk = htx_get_blk(htx, pos); + enum htx_blk_type type = htx_get_blk_type(blk); + struct ist n, v; + + if (type == HTX_BLK_EOH) + break; + if (type != HTX_BLK_HDR) + continue; + + n = htx_get_blk_name(htx, blk); + + for (h = cap_hdr; h; h = h->next) { + if (h->namelen && (h->namelen == n.len) && + (strncasecmp(n.ptr, h->name, h->namelen) == 0)) { + if (cap[h->index] == NULL) + cap[h->index] = + pool_alloc(h->pool); + + if (cap[h->index] == NULL) { + ha_alert("HTTP capture : out of memory.\n"); + break; + } + + v = htx_get_blk_value(htx, blk); + v = isttrim(v, h->len); + + memcpy(cap[h->index], v.ptr, v.len); + cap[h->index][v.len]=0; + } + } + } +} + +/* Delete a value in a header between delimiters <from> and <next>. The header + * itself is delimited by <start> and <end> pointers. The number of characters + * displaced is returned, and the pointer to the first delimiter is updated if + * required. The function tries as much as possible to respect the following + * principles : + * - replace the <from> delimiter by the <next> one unless <from> points to + * <start>, in which case <next> is simply removed + * - set exactly one space character after the new first delimiter, unless there + * are not enough characters in the block being moved to do so. + * - remove unneeded spaces before the previous delimiter and after the new + * one. + * + * It is the caller's responsibility to ensure that : + * - <from> points to a valid delimiter or <start> ; + * - <next> points to a valid delimiter or <end> ; + * - there are non-space chars before <from>. + */ +static int http_del_hdr_value(char *start, char *end, char **from, char *next) +{ + char *prev = *from; + + if (prev == start) { + /* We're removing the first value. Eat the semicolon, if <next> + * is lower than <end> */ + if (next < end) + next++; + + while (next < end && HTTP_IS_SPHT(*next)) + next++; + } + else { + /* Remove useless spaces before the old delimiter. */ + while (HTTP_IS_SPHT(*(prev-1))) + prev--; + *from = prev; + + /* copy the delimiter and if possible a space if we're + * not at the end of the line. + */ + if (next < end) { + *prev++ = *next++; + if (prev + 1 < next) + *prev++ = ' '; + while (next < end && HTTP_IS_SPHT(*next)) + next++; + } + } + memmove(prev, next, end - next); + return (prev - next); +} + + +/* Formats the start line of the request (without CRLF) and puts it in <str>, + * then returns the written length. The line can be truncated if it exceeds + * <len>. + */ +static size_t http_fmt_req_line(const struct htx_sl *sl, char *str, size_t len) +{ + struct ist dst = ist2(str, 0); + + if (istcat(&dst, htx_sl_req_meth(sl), len) == -1) + goto end; + if (dst.len + 1 > len) + goto end; + dst.ptr[dst.len++] = ' '; + + if (istcat(&dst, htx_sl_req_uri(sl), len) == -1) + goto end; + if (dst.len + 1 > len) + goto end; + dst.ptr[dst.len++] = ' '; + + istcat(&dst, htx_sl_req_vsn(sl), len); + end: + return dst.len; +} + +/* + * Print a debug line with a start line. + */ +static void http_debug_stline(const char *dir, struct stream *s, const struct htx_sl *sl) +{ + struct session *sess = strm_sess(s); + int max; + + chunk_printf(&trash, "%08x:%s.%s[%04x:%04x]: ", s->uniq_id, s->be->id, + dir, + objt_conn(sess->origin) ? (unsigned short)__objt_conn(sess->origin)->handle.fd : -1, + sc_conn(s->scb) ? 
(unsigned short)(__sc_conn(s->scb))->handle.fd : -1); + + max = HTX_SL_P1_LEN(sl); + UBOUND(max, trash.size - trash.data - 3); + chunk_memcat(&trash, HTX_SL_P1_PTR(sl), max); + trash.area[trash.data++] = ' '; + + max = HTX_SL_P2_LEN(sl); + UBOUND(max, trash.size - trash.data - 2); + chunk_memcat(&trash, HTX_SL_P2_PTR(sl), max); + trash.area[trash.data++] = ' '; + + max = HTX_SL_P3_LEN(sl); + UBOUND(max, trash.size - trash.data - 1); + chunk_memcat(&trash, HTX_SL_P3_PTR(sl), max); + trash.area[trash.data++] = '\n'; + + DISGUISE(write(1, trash.area, trash.data)); +} + +/* + * Print a debug line with a header. + */ +static void http_debug_hdr(const char *dir, struct stream *s, const struct ist n, const struct ist v) +{ + struct session *sess = strm_sess(s); + int max; + + chunk_printf(&trash, "%08x:%s.%s[%04x:%04x]: ", s->uniq_id, s->be->id, + dir, + objt_conn(sess->origin) ? (unsigned short)__objt_conn(sess->origin)->handle.fd : -1, + sc_conn(s->scb) ? (unsigned short)(__sc_conn(s->scb))->handle.fd : -1); + + max = n.len; + UBOUND(max, trash.size - trash.data - 3); + chunk_memcat(&trash, n.ptr, max); + trash.area[trash.data++] = ':'; + trash.area[trash.data++] = ' '; + + max = v.len; + UBOUND(max, trash.size - trash.data - 1); + chunk_memcat(&trash, v.ptr, max); + trash.area[trash.data++] = '\n'; + + DISGUISE(write(1, trash.area, trash.data)); +} + +void http_txn_reset_req(struct http_txn *txn) +{ + txn->req.flags = 0; + txn->req.msg_state = HTTP_MSG_RQBEFORE; /* at the very beginning of the request */ +} + +void http_txn_reset_res(struct http_txn *txn) +{ + txn->rsp.flags = 0; + txn->rsp.msg_state = HTTP_MSG_RPBEFORE; /* at the very beginning of the response */ +} + +/* + * Create and initialize a new HTTP transaction for stream <s>. This should be + * used before processing any new request. It returns the transaction or NULL + * on error. + */ +struct http_txn *http_create_txn(struct stream *s) +{ + struct http_txn *txn; + struct stconn *sc = s->scf; + + txn = pool_alloc(pool_head_http_txn); + if (!txn) + return NULL; + s->txn = txn; + + txn->meth = HTTP_METH_OTHER; + txn->flags = ((sc && sc_ep_test(sc, SE_FL_NOT_FIRST)) ?
TX_NOT_FIRST : 0); + txn->status = -1; + txn->server_status = -1; + txn->http_reply = NULL; + txn->l7_buffer = BUF_NULL; + write_u32(txn->cache_hash, 0); + + txn->cookie_first_date = 0; + txn->cookie_last_date = 0; + + txn->srv_cookie = NULL; + txn->cli_cookie = NULL; + txn->uri = NULL; + + http_txn_reset_req(txn); + http_txn_reset_res(txn); + + txn->req.chn = &s->req; + txn->rsp.chn = &s->res; + + txn->auth.method = HTTP_AUTH_UNKNOWN; + + /* here we don't want to re-initialize s->vars_txn and s->vars_reqres + * variable lists, because they were already initialized upon stream + * creation in stream_new(), and thus may already contain some variables + */ + + return txn; +} + +/* to be used at the end of a transaction */ +void http_destroy_txn(struct stream *s) +{ + struct http_txn *txn = s->txn; + + /* these ones will have been dynamically allocated */ + pool_free(pool_head_requri, txn->uri); + pool_free(pool_head_capture, txn->cli_cookie); + pool_free(pool_head_capture, txn->srv_cookie); + pool_free(pool_head_uniqueid, s->unique_id.ptr); + + s->unique_id = IST_NULL; + txn->uri = NULL; + txn->srv_cookie = NULL; + txn->cli_cookie = NULL; + + if (!LIST_ISEMPTY(&s->vars_txn.head)) + vars_prune(&s->vars_txn, s->sess, s); + if (!LIST_ISEMPTY(&s->vars_reqres.head)) + vars_prune(&s->vars_reqres, s->sess, s); + + b_free(&txn->l7_buffer); + + pool_free(pool_head_http_txn, txn); + s->txn = NULL; +} + + +void http_set_term_flags(struct stream *s) +{ + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_PRXCOND; + + if (!(s->flags & SF_FINST_MASK)) { + if (s->scb->state == SC_ST_INI) { + /* Before any connection attempt on the server side, we + * are still in the request analysis. Just take case to + * detect tarpit error + */ + if (s->req.analysers & AN_REQ_HTTP_TARPIT) + s->flags |= SF_FINST_T; + else + s->flags |= SF_FINST_R; + } + else if (s->scb->state == SC_ST_QUE) + s->flags |= SF_FINST_Q; + else if (sc_state_in(s->scb->state, SC_SB_REQ|SC_SB_TAR|SC_SB_ASS|SC_SB_CON|SC_SB_CER|SC_SB_RDY)) { + if (unlikely(objt_applet(s->target))) { + s->flags |= SF_FINST_R; + } + else + s->flags |= SF_FINST_C; + } + else { + if (s->txn->rsp.msg_state < HTTP_MSG_DATA) { + /* We are still processing the response headers */ + s->flags |= SF_FINST_H; + } + // (res == (done|closing|closed)) & (res->flags & shutw) + else if (s->txn->rsp.msg_state >= HTTP_MSG_DONE && s->txn->rsp.msg_state < HTTP_MSG_TUNNEL && + (s->flags & (SF_ERR_CLITO|SF_ERR_CLICL))) { + /* A client error was reported and we are + * transmitting the last block of data + */ + s->flags |= SF_FINST_L; + } + else { + /* Otherwise we are in DATA phase on both sides */ + s->flags |= SF_FINST_D; + } + } + } +} + + +DECLARE_POOL(pool_head_http_txn, "http_txn", sizeof(struct http_txn)); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/http_client.c b/src/http_client.c new file mode 100644 index 0000000..d7e50c0 --- /dev/null +++ b/src/http_client.c @@ -0,0 +1,1598 @@ +/* + * HTTP Client + * + * Copyright (C) 2021 HAProxy Technologies, William Lallemand <wlallemand@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * This file implements an HTTP Client API. 
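+ *
+ * A typical caller follows the same sequence as the CLI handler below
+ * (sketch only, error handling elided):
+ *
+ *   struct httpclient *hc = httpclient_new(caller, HTTP_METH_GET,
+ *                                          ist("https://example.com/"));
+ *   if (hc && httpclient_req_gen(hc, hc->req.url, hc->req.meth,
+ *                                NULL, IST_NULL) == ERR_NONE)
+ *           httpclient_start(hc);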
+ * + */ + +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/cli.h> +#include <haproxy/ssl_ckch.h> +#include <haproxy/dynbuf.h> +#include <haproxy/cfgparse.h> +#include <haproxy/global.h> +#include <haproxy/istbuf.h> +#include <haproxy/h1_htx.h> +#include <haproxy/http.h> +#include <haproxy/http_ana-t.h> +#include <haproxy/http_client.h> +#include <haproxy/http_htx.h> +#include <haproxy/http_rules.h> +#include <haproxy/htx.h> +#include <haproxy/log.h> +#include <haproxy/proxy.h> +#include <haproxy/resolvers.h> +#include <haproxy/sc_strm.h> +#include <haproxy/server.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/sock_inet.h> +#include <haproxy/stconn.h> +#include <haproxy/tools.h> + +#include <string.h> + +static struct proxy *httpclient_proxy; + +#ifdef USE_OPENSSL +/* if the httpclient is not configured, errors are ignored and features are limited */ +static int hard_error_ssl = 0; +static int httpclient_ssl_verify = SSL_SOCK_VERIFY_REQUIRED; +static char *httpclient_ssl_ca_file = NULL; +#endif +static struct applet httpclient_applet; + +/* if the httpclient is not configured, errors are ignored and features are limited */ +static int hard_error_resolvers = 0; +static char *resolvers_id = NULL; +static char *resolvers_prefer = NULL; +static int resolvers_disabled = 0; + +static int httpclient_retries = CONN_RETRIES; +static int httpclient_timeout_connect = MS_TO_TICKS(5000); + +/* --- This part of the file implements an HTTP client over the CLI --- + * The function names all start with "hc_cli" for "httpclient cli" + */ + +/* the CLI context for the httpclient command */ +struct hcli_svc_ctx { + struct httpclient *hc; /* the httpclient instance */ + uint flags; /* flags from HC_CLI_F_* above */ +}; + +/* These are the callbacks used by the HTTP Client when it needs to notify new + * data; we only set a flag in the IO handler via the svcctx.
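+ *
+ * Wiring sketch, mirroring what hc_cli_parse() does below:
+ *
+ *   hc->ops.res_stline  = hc_cli_res_stline_cb;
+ *   hc->ops.res_headers = hc_cli_res_headers_cb;
+ *   hc->ops.res_payload = hc_cli_res_body_cb;
+ *   hc->ops.res_end     = hc_cli_res_end_cb;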
+ */ +void hc_cli_res_stline_cb(struct httpclient *hc) +{ + struct appctx *appctx = hc->caller; + struct hcli_svc_ctx *ctx; + + if (!appctx) + return; + + ctx = appctx->svcctx; + ctx->flags |= HC_F_RES_STLINE; + appctx_wakeup(appctx); +} + +void hc_cli_res_headers_cb(struct httpclient *hc) +{ + struct appctx *appctx = hc->caller; + struct hcli_svc_ctx *ctx; + + if (!appctx) + return; + + ctx = appctx->svcctx; + ctx->flags |= HC_F_RES_HDR; + appctx_wakeup(appctx); +} + +void hc_cli_res_body_cb(struct httpclient *hc) +{ + struct appctx *appctx = hc->caller; + struct hcli_svc_ctx *ctx; + + if (!appctx) + return; + + ctx = appctx->svcctx; + ctx->flags |= HC_F_RES_BODY; + appctx_wakeup(appctx); +} + +void hc_cli_res_end_cb(struct httpclient *hc) +{ + struct appctx *appctx = hc->caller; + struct hcli_svc_ctx *ctx; + + if (!appctx) + return; + + ctx = appctx->svcctx; + ctx->flags |= HC_F_RES_END; + appctx_wakeup(appctx); +} + +/* + * Parse an httpclient keyword on the cli: + * httpclient <ID> <method> <URI> + */ +static int hc_cli_parse(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct hcli_svc_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + struct httpclient *hc; + char *err = NULL; + enum http_meth_t meth; + char *meth_str; + struct ist uri; + struct ist body = IST_NULL; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[1] || !*args[2]) { + memprintf(&err, ": not enough parameters"); + goto err; + } + + meth_str = args[1]; + uri = ist(args[2]); + + if (payload) + body = ist(payload); + + meth = find_http_meth(meth_str, strlen(meth_str)); + + hc = httpclient_new(appctx, meth, uri); + if (!hc) { + goto err; + } + + /* update the httpclient callbacks */ + hc->ops.res_stline = hc_cli_res_stline_cb; + hc->ops.res_headers = hc_cli_res_headers_cb; + hc->ops.res_payload = hc_cli_res_body_cb; + hc->ops.res_end = hc_cli_res_end_cb; + + ctx->hc = hc; /* store the httpclient ptr in the applet */ + ctx->flags = 0; + + if (httpclient_req_gen(hc, hc->req.url, hc->req.meth, NULL, body) != ERR_NONE) + goto err; + + + if (!httpclient_start(hc)) + goto err; + + return 0; + +err: + memprintf(&err, "Can't start the HTTP client%s.\n", err ? 
err : ""); + return cli_err(appctx, err); +} + +/* This function dumps the content of the httpclient receive buffer + * on the CLI output + * + * Return 1 when the processing is finished + * return 0 if it needs to be called again + */ +static int hc_cli_io_handler(struct appctx *appctx) +{ + struct hcli_svc_ctx *ctx = appctx->svcctx; + struct stconn *sc = appctx_sc(appctx); + struct httpclient *hc = ctx->hc; + struct http_hdr *hdrs, *hdr; + + if (ctx->flags & HC_F_RES_STLINE) { + chunk_printf(&trash, "%.*s %d %.*s\n", (unsigned int)istlen(hc->res.vsn), istptr(hc->res.vsn), + hc->res.status, (unsigned int)istlen(hc->res.reason), istptr(hc->res.reason)); + if (applet_putchk(appctx, &trash) == -1) + goto more; + ctx->flags &= ~HC_F_RES_STLINE; + } + + if (ctx->flags & HC_F_RES_HDR) { + chunk_reset(&trash); + hdrs = hc->res.hdrs; + for (hdr = hdrs; isttest(hdr->v); hdr++) { + if (!h1_format_htx_hdr(hdr->n, hdr->v, &trash)) + goto too_many_hdrs; + } + if (!chunk_memcat(&trash, "\r\n", 2)) + goto too_many_hdrs; + if (applet_putchk(appctx, &trash) == -1) + goto more; + ctx->flags &= ~HC_F_RES_HDR; + } + + if (ctx->flags & HC_F_RES_BODY) { + int ret; + + ret = httpclient_res_xfer(hc, sc_ib(sc)); + channel_add_input(sc_ic(sc), ret); /* forward what we put in the buffer channel */ + + /* remove the flag if the buffer was emptied */ + if (httpclient_data(hc)) + goto more; + ctx->flags &= ~HC_F_RES_BODY; + } + + /* we must close only if F_END is the last flag */ + if (ctx->flags == HC_F_RES_END) { + ctx->flags &= ~HC_F_RES_END; + goto end; + } + +more: + if (!ctx->flags) + applet_have_no_more_data(appctx); + return 0; +end: + return 1; + +too_many_hdrs: + return cli_err(appctx, "Too many headers.\n"); +} + +static void hc_cli_release(struct appctx *appctx) +{ + struct hcli_svc_ctx *ctx = appctx->svcctx; + struct httpclient *hc = ctx->hc; + + /* Everything possible was printed on the CLI, we can destroy the client */ + httpclient_stop_and_destroy(hc); + + return; +} + +/* register cli keywords */ +static struct cli_kw_list cli_kws = {{ },{ + { { "httpclient", NULL }, "httpclient <method> <URI> : launch an HTTP request", hc_cli_parse, hc_cli_io_handler, hc_cli_release, NULL, ACCESS_EXPERT}, + { { NULL }, NULL, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + + +/* --- This part of the file implements the actual HTTP client API --- */ + +/* + * Generate a simple request and fill the httpclient request buffer with it. + * The request contains a request line generated from the absolute <url> and + * <meth> as well as list of headers <hdrs>. + * + * If the buffer was filled correctly the function returns 0, if not it returns + * an error_code but there is no guarantee that the buffer wasn't modified. 
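+ *
+ * Usage sketch (the extra header is purely illustrative):
+ *
+ *   struct http_hdr hdrs[] = {
+ *           { .n = IST("X-Example"), .v = IST("1") },
+ *           { .n = IST_NULL, .v = IST_NULL }
+ *   };
+ *   err = httpclient_req_gen(hc, hc->req.url, HTTP_METH_POST, hdrs,
+ *                            ist("hello"));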
+ */ +int httpclient_req_gen(struct httpclient *hc, const struct ist url, enum http_meth_t meth, const struct http_hdr *hdrs, const struct ist payload) +{ + struct htx_sl *sl; + struct htx *htx; + int err_code = 0; + struct ist meth_ist, vsn; + unsigned int flags = HTX_SL_F_VER_11 | HTX_SL_F_NORMALIZED_URI | HTX_SL_F_HAS_SCHM; + int i; + int foundhost = 0, foundaccept = 0, foundua = 0; + + if (!b_alloc(&hc->req.buf)) + goto error; + + if (meth >= HTTP_METH_OTHER) + goto error; + + meth_ist = http_known_methods[meth]; + + vsn = ist("HTTP/1.1"); + + htx = htx_from_buf(&hc->req.buf); + if (!htx) + goto error; + + if (!hc->ops.req_payload && !isttest(payload)) + flags |= HTX_SL_F_BODYLESS; + + sl = htx_add_stline(htx, HTX_BLK_REQ_SL, flags, meth_ist, url, vsn); + if (!sl) { + goto error; + } + sl->info.req.meth = meth; + + for (i = 0; hdrs && hdrs[i].n.len; i++) { + /* Don't check the value length because a header value may be empty */ + if (isttest(hdrs[i].v) == 0) + continue; + + if (isteqi(hdrs[i].n, ist("host"))) + foundhost = 1; + else if (isteqi(hdrs[i].n, ist("accept"))) + foundaccept = 1; + else if (isteqi(hdrs[i].n, ist("user-agent"))) + foundua = 1; + + if (!htx_add_header(htx, hdrs[i].n, hdrs[i].v)) + goto error; + } + + if (!foundhost) { + /* Add Host Header from URL */ + if (!htx_add_header(htx, ist("Host"), ist("h"))) + goto error; + if (!http_update_host(htx, sl, url)) + goto error; + } + + if (!foundaccept) { + if (!htx_add_header(htx, ist("Accept"), ist("*/*"))) + goto error; + } + + if (!foundua) { + if (!htx_add_header(htx, ist("User-Agent"), ist(HTTPCLIENT_USERAGENT))) + goto error; + } + + + if (!htx_add_endof(htx, HTX_BLK_EOH)) + goto error; + + if (isttest(payload) && istlen(payload)) { + /* add the payload if it can feat in the buffer, no need to set + * the Content-Length, the data will be sent chunked */ + if (!htx_add_data_atonce(htx, payload)) + goto error; + } + + /* If req.payload was set, does not set the end of stream which *MUST* + * be set in the callback */ + if (!hc->ops.req_payload) + htx->flags |= HTX_FL_EOM; + + htx_to_buf(htx, &hc->req.buf); + + return 0; +error: + err_code |= ERR_ALERT | ERR_ABORT; + return err_code; +} + +/* + * transfer the response to the destination buffer and wakeup the HTTP client + * applet so it could fill again its buffer. + * + * Return the number of bytes transferred. + */ +int httpclient_res_xfer(struct httpclient *hc, struct buffer *dst) +{ + size_t room = b_room(dst); + int ret; + + ret = b_force_xfer(dst, &hc->res.buf, MIN(room, b_data(&hc->res.buf))); + /* call the client once we consumed all data */ + if (!b_data(&hc->res.buf)) { + b_free(&hc->res.buf); + if (hc->appctx) + appctx_wakeup(hc->appctx); + } + return ret; +} + +/* + * Transfer raw HTTP payload from src, and insert it into HTX format in the + * httpclient. + * + * Must be used to transfer the request body. + * Then wakeup the httpclient so it can transfer it. + * + * <end> tries to add the ending data flag if it succeed to copy all data. + * + * Return the number of bytes copied from src. + */ +int httpclient_req_xfer(struct httpclient *hc, struct ist src, int end) +{ + int ret = 0; + struct htx *htx; + + if (!b_alloc(&hc->req.buf)) + goto error; + + htx = htx_from_buf(&hc->req.buf); + if (!htx) + goto error; + + if (hc->appctx) + appctx_wakeup(hc->appctx); + + ret += htx_add_data(htx, src); + + + /* if we copied all the data and the end flag is set */ + if ((istlen(src) == ret) && end) { + /* no more data are expected. 
If the HTX buffer is empty, be + * sure to add something (EOT block in this case) to have + * something to send. It is important to be sure the EOM flags + * will be handled by the endpoint. Because the message is + * empty, this should not fail. Otherwise it is an error + */ + if (htx_is_empty(htx)) { + if (!htx_add_endof(htx, HTX_BLK_EOT)) + goto error; + } + htx->flags |= HTX_FL_EOM; + } + htx_to_buf(htx, &hc->req.buf); + +error: + + return ret; +} + +/* Set the 'timeout server' in ms for the next httpclient request */ +void httpclient_set_timeout(struct httpclient *hc, int timeout) +{ + hc->timeout_server = timeout; +} + +/* + * Sets a destination for the httpclient from an HAProxy addr format + * This will prevent to determine the destination from the URL + * Return 0 in case of success or -1 otherwise. + */ +int httpclient_set_dst(struct httpclient *hc, const char *dst) +{ + struct sockaddr_storage *sk; + char *errmsg = NULL; + + sockaddr_free(&hc->dst); + /* 'sk' is statically allocated (no need to be freed). */ + sk = str2sa_range(dst, NULL, NULL, NULL, NULL, NULL, NULL, + &errmsg, NULL, NULL, + PA_O_PORT_OK | PA_O_STREAM | PA_O_XPRT | PA_O_CONNECT); + if (!sk) { + ha_alert("httpclient: Failed to parse destination address in %s\n", errmsg); + free(errmsg); + return -1; + } + + if (!sockaddr_alloc(&hc->dst, sk, sizeof(*sk))) { + ha_alert("httpclient: Failed to allocate sockaddr in %s:%d.\n", __FUNCTION__, __LINE__); + return -1; + } + + return 0; +} + +/* + * Split <url> in <scheme>, <host>, <port> + */ +static int httpclient_spliturl(struct ist url, enum http_scheme *scheme, + struct ist *host, int *port) +{ + enum http_scheme scheme_tmp = SCH_HTTP; + int port_tmp = 0; + struct ist scheme_ist, authority_ist, host_ist, port_ist; + char *p, *end; + struct http_uri_parser parser; + + parser = http_uri_parser_init(url); + scheme_ist = http_parse_scheme(&parser); + if (!isttest(scheme_ist)) { + return 0; + } + + if (isteqi(scheme_ist, ist("http://"))){ + scheme_tmp = SCH_HTTP; + port_tmp = 80; + } else if (isteqi(scheme_ist, ist("https://"))) { + scheme_tmp = SCH_HTTPS; + port_tmp = 443; + } + + authority_ist = http_parse_authority(&parser, 1); + if (!isttest(authority_ist)) { + return 0; + } + p = end = istend(authority_ist); + + /* look for a port at the end of the authority */ + while (p > istptr(authority_ist) && isdigit((unsigned char)*--p)) + ; + + if (*p == ':') { + host_ist = ist2(istptr(authority_ist), p - istptr(authority_ist)); + port_ist = istnext(ist2(p, end - p)); + ist2str(trash.area, port_ist); + port_tmp = atoi(trash.area); + } else { + host_ist = authority_ist; + } + + if (scheme) + *scheme = scheme_tmp; + if (host) + *host = host_ist; + if (port) + *port = port_tmp; + + return 1; +} + +/* + * Start the HTTP client + * Create the appctx, session, stream and wakeup the applet + * + * Return the <appctx> or NULL if it failed + */ +struct appctx *httpclient_start(struct httpclient *hc) +{ + struct applet *applet = &httpclient_applet; + struct appctx *appctx; + + /* if the client was started and not ended, an applet is already + * running, we shouldn't try anything */ + if (httpclient_started(hc) && !httpclient_ended(hc)) + return NULL; + + /* The HTTP client will be created in the same thread as the caller, + * avoiding threading issues */ + appctx = appctx_new_here(applet, NULL); + if (!appctx) + goto out; + appctx->svcctx = hc; + hc->flags = 0; + + if (appctx_init(appctx) == -1) { + ha_alert("httpclient: Failed to initialize appctx %s:%d.\n", __FUNCTION__, __LINE__); + 
goto out_free_appctx; + } + + return appctx; + +out_free_appctx: + appctx_free_on_early_error(appctx); +out: + + return NULL; +} + +/* + * This function tries to destroy the httpclient if it wasn't running. + * If it was running, stop the client and ask it to autodestroy itself. + * + * Once this function is used, all pointers to the client must be removed + * + */ +void httpclient_stop_and_destroy(struct httpclient *hc) +{ + + /* The httpclient was already stopped or never started, we can safely destroy it */ + if (hc->flags & HTTPCLIENT_FS_ENDED || !(hc->flags & HTTPCLIENT_FS_STARTED)) { + httpclient_destroy(hc); + } else { + /* if the client wasn't stopped, ask for a stop and destroy */ + hc->flags |= (HTTPCLIENT_FA_AUTOKILL | HTTPCLIENT_FA_STOP); + /* the calling applet doesn't exist anymore */ + hc->caller = NULL; + if (hc->appctx) + appctx_wakeup(hc->appctx); + } +} + +/* Free the httpclient */ +void httpclient_destroy(struct httpclient *hc) +{ + struct http_hdr *hdrs; + + + if (!hc) + return; + + /* we should never destroy a client which was started but not stopped */ + BUG_ON(httpclient_started(hc) && !httpclient_ended(hc)); + + /* request */ + istfree(&hc->req.url); + b_free(&hc->req.buf); + /* response */ + istfree(&hc->res.vsn); + istfree(&hc->res.reason); + hdrs = hc->res.hdrs; + while (hdrs && isttest(hdrs->n)) { + istfree(&hdrs->n); + istfree(&hdrs->v); + hdrs++; + } + ha_free(&hc->res.hdrs); + b_free(&hc->res.buf); + sockaddr_free(&hc->dst); + + free(hc); + + return; +} + +/* Allocate an httpclient and its buffers + * Use the default httpclient_proxy + * + * Return NULL on failure */ +struct httpclient *httpclient_new(void *caller, enum http_meth_t meth, struct ist url) +{ + struct httpclient *hc; + + hc = calloc(1, sizeof(*hc)); + if (!hc) + goto err; + + hc->req.buf = BUF_NULL; + hc->res.buf = BUF_NULL; + hc->caller = caller; + hc->req.url = istdup(url); + hc->req.meth = meth; + httpclient_set_proxy(hc, httpclient_proxy); + + return hc; + +err: + httpclient_destroy(hc); + return NULL; +} + +/* Allocate an httpclient and its buffers, + * Use the proxy <px> + * + * Return an httpclient or NULL. + */ +struct httpclient *httpclient_new_from_proxy(struct proxy *px, void *caller, enum http_meth_t meth, struct ist url) +{ + struct httpclient *hc; + + hc = httpclient_new(caller, meth, url); + if (!hc) + return NULL; + + httpclient_set_proxy(hc, px); + + return hc; +} + +/* + * Configure an httpclient with a specific proxy <px> + * + * The proxy <px> must contain two servers, one configured for clear connections, the other for SSL.
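+ * Selection is made on the transport: the first XPRT_RAW server becomes
+ * <srv_raw> and, when built with USE_OPENSSL, the first XPRT_SSL server
+ * becomes <srv_ssl>. Sketch (assuming <px> was built the way
+ * httpclient_create_proxy() below builds its proxy):
+ *
+ *   struct httpclient *hc = httpclient_new_from_proxy(px, caller,
+ *                                                     HTTP_METH_GET, url);
+ *
+ * after which hc->srv_raw and hc->srv_ssl point into <px>'s server list.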
+ * + */ +int httpclient_set_proxy(struct httpclient *hc, struct proxy *px) +{ + struct server *srv; + + hc->px = px; + + for (srv = px->srv; srv != NULL; srv = srv->next) { + if (srv->xprt == xprt_get(XPRT_RAW)) { + hc->srv_raw = srv; +#ifdef USE_OPENSSL + } else if (srv->xprt == xprt_get(XPRT_SSL)) { + hc->srv_ssl = srv; +#endif + } + } + + return 0; +} + +void httpclient_applet_io_handler(struct appctx *appctx) +{ + struct httpclient *hc = appctx->svcctx; + struct stconn *sc = appctx_sc(appctx); + struct stream *s = __sc_strm(sc); + struct channel *req = &s->req; + struct channel *res = &s->res; + struct htx_blk *blk = NULL; + struct htx *htx; + struct htx_sl *sl = NULL; + uint32_t hdr_num; + uint32_t sz; + int ret; + + if (unlikely(se_fl_test(appctx->sedesc, (SE_FL_EOS|SE_FL_ERROR|SE_FL_SHR|SE_FL_SHW)))) { + if (co_data(res)) { + htx = htx_from_buf(&res->buf); + co_htx_skip(res, htx, co_data(res)); + htx_to_buf(htx, &res->buf); + } + goto out; + } + /* The IO handler could be called after the release, so we need to + * check if hc is still there to run the IO handler */ + if (!hc) + goto out; + + while (1) { + + /* required to stop */ + if (hc->flags & HTTPCLIENT_FA_STOP) + goto error; + + switch(appctx->st0) { + + case HTTPCLIENT_S_REQ: + /* we know that the buffer is empty here, since + * it's the first call, we can freely copy the + * request from the httpclient buffer */ + ret = b_xfer(&req->buf, &hc->req.buf, b_data(&hc->req.buf)); + if (!ret) { + sc_need_room(sc, 0); + goto out; + } + + if (!b_data(&hc->req.buf)) + b_free(&hc->req.buf); + + htx = htx_from_buf(&req->buf); + if (!htx) { + sc_need_room(sc, 0); + goto out; + } + + channel_add_input(req, htx->data); + + if (htx->flags & HTX_FL_EOM) /* check if a body need to be added */ + appctx->st0 = HTTPCLIENT_S_RES_STLINE; + else + appctx->st0 = HTTPCLIENT_S_REQ_BODY; + + goto out; /* we need to leave the IO handler once we wrote the request */ + break; + + case HTTPCLIENT_S_REQ_BODY: + /* call the payload callback */ + { + if (hc->ops.req_payload) { + struct htx *hc_htx; + + /* call the request callback */ + hc->ops.req_payload(hc); + + hc_htx = htxbuf(&hc->req.buf); + if (htx_is_empty(hc_htx)) + goto out; + + htx = htx_from_buf(&req->buf); + if (htx_is_empty(htx)) { + size_t data = hc_htx->data; + + /* Here htx_to_buf() will set buffer data to 0 because + * the HTX is empty, and allow us to do an xfer. + */ + htx_to_buf(hc_htx, &hc->req.buf); + htx_to_buf(htx, &req->buf); + b_xfer(&req->buf, &hc->req.buf, b_data(&hc->req.buf)); + channel_add_input(req, data); + } else { + struct htx_ret ret; + + ret = htx_xfer_blks(htx, hc_htx, htx_used_space(hc_htx), HTX_BLK_UNUSED); + channel_add_input(req, ret.ret); + + /* we must copy the EOM if we empty the buffer */ + if (htx_is_empty(hc_htx)) { + htx->flags |= (hc_htx->flags & HTX_FL_EOM); + } + htx_to_buf(htx, &req->buf); + htx_to_buf(hc_htx, &hc->req.buf); + } + + + if (!b_data(&hc->req.buf)) + b_free(&hc->req.buf); + } + + htx = htxbuf(&req->buf); + + /* if the request contains the HTX_FL_EOM, we finished the request part. 
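+ * In this state the flag can only come from httpclient_req_xfer(),
+ * which sets it once the caller pushed its last chunk with <end> set.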
*/ + if (htx->flags & HTX_FL_EOM) + appctx->st0 = HTTPCLIENT_S_RES_STLINE; + + goto process_data; /* we need to leave the IO handler once we wrote the request */ + } + break; + + case HTTPCLIENT_S_RES_STLINE: + /* Request is finished, report EOI */ + se_fl_set(appctx->sedesc, SE_FL_EOI); + + /* copy the start line in the hc structure,then remove the htx block */ + if (!co_data(res)) + goto out; + htx = htxbuf(&res->buf); + if (htx_is_empty(htx)) + goto out; + blk = htx_get_head_blk(htx); + if (blk && (htx_get_blk_type(blk) == HTX_BLK_RES_SL)) + sl = htx_get_blk_ptr(htx, blk); + if (!sl || (!(sl->flags & HTX_SL_F_IS_RESP))) + goto out; + + /* copy the status line in the httpclient */ + hc->res.status = sl->info.res.status; + hc->res.vsn = istdup(htx_sl_res_vsn(sl)); + hc->res.reason = istdup(htx_sl_res_reason(sl)); + sz = htx_get_blksz(blk); + c_rew(res, sz); + htx_remove_blk(htx, blk); + /* caller callback */ + if (hc->ops.res_stline) + hc->ops.res_stline(hc); + + htx_to_buf(htx, &res->buf); + + /* if there is no HTX data anymore and the EOM flag is + * set, leave (no body) */ + if (htx_is_empty(htx) && htx->flags & HTX_FL_EOM) + appctx->st0 = HTTPCLIENT_S_RES_END; + else + appctx->st0 = HTTPCLIENT_S_RES_HDR; + + break; + + case HTTPCLIENT_S_RES_HDR: + /* first copy the headers in a local hdrs + * structure, once we the total numbers of the + * header we allocate the right size and copy + * them. The htx block of the headers are + * removed each time one is read */ + { + struct http_hdr hdrs[global.tune.max_http_hdr]; + + if (!co_data(res)) + goto out; + htx = htxbuf(&res->buf); + if (htx_is_empty(htx)) + goto out; + + hdr_num = 0; + blk = htx_get_head_blk(htx); + while (blk) { + enum htx_blk_type type = htx_get_blk_type(blk); + uint32_t sz = htx_get_blksz(blk); + + c_rew(res, sz); + + if (type == HTX_BLK_HDR) { + hdrs[hdr_num].n = istdup(htx_get_blk_name(htx, blk)); + hdrs[hdr_num].v = istdup(htx_get_blk_value(htx, blk)); + hdr_num++; + } + else if (type == HTX_BLK_EOH) { + /* create a NULL end of array and leave the loop */ + hdrs[hdr_num].n = IST_NULL; + hdrs[hdr_num].v = IST_NULL; + htx_remove_blk(htx, blk); + break; + } + blk = htx_remove_blk(htx, blk); + } + htx_to_buf(htx, &res->buf); + + if (hdr_num) { + /* alloc and copy the headers in the httpclient struct */ + hc->res.hdrs = calloc((hdr_num + 1), sizeof(*hc->res.hdrs)); + if (!hc->res.hdrs) + goto error; + memcpy(hc->res.hdrs, hdrs, sizeof(struct http_hdr) * (hdr_num + 1)); + + /* caller callback */ + if (hc->ops.res_headers) + hc->ops.res_headers(hc); + } + + /* if there is no HTX data anymore and the EOM flag is + * set, leave (no body) */ + if (htx_is_empty(htx) && htx->flags & HTX_FL_EOM) { + appctx->st0 = HTTPCLIENT_S_RES_END; + } else { + appctx->st0 = HTTPCLIENT_S_RES_BODY; + } + } + break; + + case HTTPCLIENT_S_RES_BODY: + /* + * The IO handler removes the htx blocks in the response buffer and + * push them in the hc->res.buf buffer in a raw format. 
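+ * Only HTX_BLK_DATA blocks are copied; other block types are simply
+ * consumed. The res_payload callback, when set, is expected to drain
+ * hc->res.buf afterwards, typically with httpclient_res_xfer().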
+ */ + if (!co_data(res)) + goto out; + + htx = htxbuf(&res->buf); + if (htx_is_empty(htx)) + goto out; + + if (!b_alloc(&hc->res.buf)) + goto out; + + if (b_full(&hc->res.buf)) + goto process_data; + + /* decapsule the htx data to raw data */ + blk = htx_get_head_blk(htx); + while (blk) { + enum htx_blk_type type = htx_get_blk_type(blk); + size_t count = co_data(res); + uint32_t blksz = htx_get_blksz(blk); + uint32_t room = b_room(&hc->res.buf); + uint32_t vlen; + + /* we should try to copy the maximum output data in a block, which fit + * the destination buffer */ + vlen = MIN(count, blksz); + vlen = MIN(vlen, room); + + if (vlen == 0) { + htx_to_buf(htx, &res->buf); + goto process_data; + } + + if (type == HTX_BLK_DATA) { + struct ist v = htx_get_blk_value(htx, blk); + + __b_putblk(&hc->res.buf, v.ptr, vlen); + c_rew(res, vlen); + + if (vlen == blksz) + blk = htx_remove_blk(htx, blk); + else + htx_cut_data_blk(htx, blk, vlen); + + /* the data must be processed by the caller in the receive phase */ + if (hc->ops.res_payload) + hc->ops.res_payload(hc); + + /* cannot copy everything, need to process */ + if (vlen != blksz) { + htx_to_buf(htx, &res->buf); + goto process_data; + } + } else { + if (vlen != blksz) { + htx_to_buf(htx, &res->buf); + goto process_data; + } + + /* remove any block which is not a data block */ + c_rew(res, blksz); + blk = htx_remove_blk(htx, blk); + } + } + + htx_to_buf(htx, &res->buf); + + /* if not finished, should be called again */ + if (!(htx_is_empty(htx) && (htx->flags & HTX_FL_EOM))) + goto out; + + + /* end of message, we should quit */ + appctx->st0 = HTTPCLIENT_S_RES_END; + break; + + case HTTPCLIENT_S_RES_END: + se_fl_set(appctx->sedesc, SE_FL_EOS); + goto out; + break; + } + } + +out: + return; + +process_data: + sc_will_read(sc); + goto out; + +error: + se_fl_set(appctx->sedesc, SE_FL_ERROR); + goto out; +} + +int httpclient_applet_init(struct appctx *appctx) +{ + struct httpclient *hc = appctx->svcctx; + struct stream *s; + struct sockaddr_storage *addr = NULL; + struct sockaddr_storage ss_url = {}; + struct sockaddr_storage *ss_dst; + enum obj_type *target = NULL; + struct ist host = IST_NULL; + enum http_scheme scheme; + int port; + int doresolve = 0; + + + /* parse the URL and */ + if (!httpclient_spliturl(hc->req.url, &scheme, &host, &port)) + goto out_error; + + if (hc->dst) { + /* if httpclient_set_dst() was used, sets the alternative address */ + ss_dst = hc->dst; + } else { + /* set the dst using the host, or 0.0.0.0 to resolve */ + ist2str(trash.area, host); + ss_dst = str2ip2(trash.area, &ss_url, 0); + if (!ss_dst) { /* couldn't get an IP from that, try to resolve */ + doresolve = 1; + ss_dst = str2ip2("0.0.0.0", &ss_url, 0); + } + sock_inet_set_port(ss_dst, port); + } + + if (!sockaddr_alloc(&addr, ss_dst, sizeof(*ss_dst))) + goto out_error; + + /* choose the SSL server or not */ + switch (scheme) { + case SCH_HTTP: + target = &hc->srv_raw->obj_type; + break; + case SCH_HTTPS: +#ifdef USE_OPENSSL + if (hc->srv_ssl) { + target = &hc->srv_ssl->obj_type; + } else { + ha_alert("httpclient: SSL was disabled (wrong verify/ca-file)!\n"); + goto out_free_addr; + } +#else + ha_alert("httpclient: OpenSSL is not available %s:%d.\n", __FUNCTION__, __LINE__); + goto out_free_addr; +#endif + break; + } + + if (appctx_finalize_startup(appctx, hc->px, &hc->req.buf) == -1) { + ha_alert("httpclient: Failed to initialize appctx %s:%d.\n", __FUNCTION__, __LINE__); + goto out_free_addr; + } + + s = appctx_strm(appctx); + s->target = target; + /* set the 
"timeout server" */ + s->scb->ioto = hc->timeout_server; + + if (doresolve) { + /* in order to do the set-dst we need to put the address on the front */ + s->scf->dst = addr; + } else { + /* in cases we don't use the resolve we already have the address + * and must put it on the backend side, some of the cases are + * not meant to be used on the frontend (sockpair, unix socket etc.) */ + s->scb->dst = addr; + } + + s->scb->flags |= (SC_FL_RCV_ONCE|SC_FL_NOLINGER); + s->flags |= SF_ASSIGNED; + + /* applet is waiting for data */ + applet_need_more_data(appctx); + appctx_wakeup(appctx); + + hc->appctx = appctx; + hc->flags |= HTTPCLIENT_FS_STARTED; + + /* The request was transferred when the stream was created. So switch + * directly to REQ_BODY or RES_STLINE state + */ + appctx->st0 = (hc->ops.req_payload ? HTTPCLIENT_S_REQ_BODY : HTTPCLIENT_S_RES_STLINE); + return 0; + + out_free_addr: + sockaddr_free(&addr); + out_error: + return -1; +} + +void httpclient_applet_release(struct appctx *appctx) +{ + struct httpclient *hc = appctx->svcctx; + + /* mark the httpclient as ended */ + hc->flags |= HTTPCLIENT_FS_ENDED; + /* the applet is leaving, remove the ptr so we don't try to call it + * again from the caller */ + hc->appctx = NULL; + + if (hc->ops.res_end) + hc->ops.res_end(hc); + + /* destroy the httpclient when set to autotokill */ + if (hc->flags & HTTPCLIENT_FA_AUTOKILL) { + httpclient_destroy(hc); + } + + /* be sure not to use this ptr anymore if the IO handler is called a + * last time */ + appctx->svcctx = NULL; + + return; +} + +/* HTTP client applet */ +static struct applet httpclient_applet = { + .obj_type = OBJ_TYPE_APPLET, + .name = "<HTTPCLIENT>", + .fct = httpclient_applet_io_handler, + .init = httpclient_applet_init, + .release = httpclient_applet_release, +}; + + +static int httpclient_resolve_init(struct proxy *px) +{ + struct act_rule *rule; + int i; + char *do_resolve = NULL; + char *http_rules[][11] = { + { "set-var(txn.hc_ip)", "dst", "" }, + { do_resolve, "hdr(Host),host_only", "if", "{", "var(txn.hc_ip)", "-m", "ip", "0.0.0.0", "}", "" }, + { "return", "status", "503", "if", "{", "var(txn.hc_ip)", "-m", "ip", "0.0.0.0", "}", "" }, + { "capture", "var(txn.hc_ip)", "len", "40", "" }, + { "set-dst", "var(txn.hc_ip)", "" }, + { "" } + }; + + + if (resolvers_disabled) + return 0; + + if (!resolvers_id) + resolvers_id = strdup("default"); + + memprintf(&do_resolve, "do-resolve(txn.hc_ip,%s%s%s)", resolvers_id, resolvers_prefer ? "," : "", resolvers_prefer ? resolvers_prefer : ""); + http_rules[1][0] = do_resolve; + + /* Try to create the default resolvers section */ + resolvers_create_default(); + + /* if the resolver does not exist and no hard_error was set, simply ignore resolving */ + if (!find_resolvers_by_id(resolvers_id) && !hard_error_resolvers) { + free(do_resolve); + return 0; + } + + + for (i = 0; *http_rules[i][0] != '\0'; i++) { + rule = parse_http_req_cond((const char **)http_rules[i], "httpclient", 0, px); + if (!rule) { + free(do_resolve); + ha_alert("Couldn't setup the httpclient resolver.\n"); + return 1; + } + LIST_APPEND(&px->http_req_rules, &rule->list); + } + + free(do_resolve); + return 0; +} + +/* + * Creates an internal proxy which will be used for httpclient. + * This will allocate 2 servers (raw and ssl) and 1 proxy. + * + * This function must be called from a precheck callback. + * + * Return a proxy or NULL. 
+ */ +struct proxy *httpclient_create_proxy(const char *id) +{ + int err_code = ERR_NONE; + char *errmsg = NULL; + struct proxy *px = NULL; + struct server *srv_raw = NULL; +#ifdef USE_OPENSSL + struct server *srv_ssl = NULL; +#endif + + if (global.mode & MODE_MWORKER_WAIT) + return ERR_NONE; + + px = alloc_new_proxy(id, PR_CAP_LISTEN|PR_CAP_INT|PR_CAP_HTTPCLIENT, &errmsg); + if (!px) { + memprintf(&errmsg, "couldn't allocate proxy."); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + px->options |= PR_O_WREQ_BODY; + px->retry_type |= PR_RE_CONN_FAILED | PR_RE_DISCONNECTED | PR_RE_TIMEOUT; + px->options2 |= PR_O2_INDEPSTR; + px->mode = PR_MODE_HTTP; + px->maxconn = 0; + px->accept = NULL; + px->conn_retries = httpclient_retries; + px->timeout.connect = httpclient_timeout_connect; + px->timeout.client = TICK_ETERNITY; + /* The HTTP Client use the "option httplog" with the global loggers */ + px->conf.logformat_string = httpclient_log_format; + px->http_needed = 1; + + /* clear HTTP server */ + srv_raw = new_server(px); + if (!srv_raw) { + memprintf(&errmsg, "out of memory."); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + srv_settings_cpy(srv_raw, &px->defsrv, 0); + srv_raw->iweight = 0; + srv_raw->uweight = 0; + srv_raw->xprt = xprt_get(XPRT_RAW); + srv_raw->flags |= SRV_F_MAPPORTS; /* needed to apply the port change with resolving */ + srv_raw->id = strdup("<HTTPCLIENT>"); + if (!srv_raw->id) { + memprintf(&errmsg, "out of memory."); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + +#ifdef USE_OPENSSL + /* SSL HTTP server */ + srv_ssl = new_server(px); + if (!srv_ssl) { + memprintf(&errmsg, "out of memory."); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + srv_settings_cpy(srv_ssl, &px->defsrv, 0); + srv_ssl->iweight = 0; + srv_ssl->uweight = 0; + srv_ssl->xprt = xprt_get(XPRT_SSL); + srv_ssl->use_ssl = 1; + srv_ssl->flags |= SRV_F_MAPPORTS; /* needed to apply the port change with resolving */ + srv_ssl->id = strdup("<HTTPSCLIENT>"); + if (!srv_ssl->id) { + memprintf(&errmsg, "out of memory."); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation + if (ssl_sock_parse_alpn("h2,http/1.1", &srv_ssl->ssl_ctx.alpn_str, &srv_ssl->ssl_ctx.alpn_len, &errmsg) != 0) { + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } +#endif + srv_ssl->ssl_ctx.verify = httpclient_ssl_verify; + /* if the verify is required, try to load the system CA */ + if (httpclient_ssl_verify == SSL_SOCK_VERIFY_REQUIRED) { + + srv_ssl->ssl_ctx.ca_file = strdup(httpclient_ssl_ca_file ? httpclient_ssl_ca_file : "@system-ca"); + if (!__ssl_store_load_locations_file(srv_ssl->ssl_ctx.ca_file, 1, CAFILE_CERT, !hard_error_ssl)) { + /* if we failed to load the ca-file, only quits in + * error with hard_error, otherwise just disable the + * feature. 
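+ * In that soft-failure case the SSL server is dropped entirely, and
+ * httpclient_applet_init() will later reject "https://" URLs with the
+ * "SSL was disabled" alert.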
*/ + if (hard_error_ssl) { + memprintf(&errmsg, "cannot initialize SSL verify with 'ca-file \"%s\"'.", srv_ssl->ssl_ctx.ca_file); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } else { + ha_free(&srv_ssl->ssl_ctx.ca_file); + srv_drop(srv_ssl); + srv_ssl = NULL; + } + } + } + +#endif + + /* add the proxy in the proxy list only if everything is successful */ + px->next = proxies_list; + proxies_list = px; + + if (httpclient_resolve_init(px) != 0) { + memprintf(&errmsg, "cannot initialize resolvers."); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + /* link the 2 servers in the proxy */ + srv_raw->next = px->srv; + px->srv = srv_raw; + +#ifdef USE_OPENSSL + if (srv_ssl) { + srv_ssl->next = px->srv; + px->srv = srv_ssl; + } +#endif + + +err: + if (err_code & ERR_CODE) { + ha_alert("httpclient: cannot initialize: %s\n", errmsg); + free(errmsg); + srv_drop(srv_raw); +#ifdef USE_OPENSSL + srv_drop(srv_ssl); +#endif + free_proxy(px); + + return NULL; + } + return px; +} + +/* + * Initialize the proxy for the HTTP client with 2 servers, one for raw HTTP, + * the other for HTTPS. + */ +static int httpclient_precheck() +{ + /* initialize the default httpclient_proxy which is used for the CLI and the lua */ + + httpclient_proxy = httpclient_create_proxy("<HTTPCLIENT>"); + if (!httpclient_proxy) + return 1; + + return 0; +} + +/* Initialize the logs for every proxy dedicated to the httpclient */ +static int httpclient_postcheck_proxy(struct proxy *curproxy) +{ + int err_code = ERR_NONE; + struct logger *logger; + char *errmsg = NULL; +#ifdef USE_OPENSSL + struct server *srv = NULL; + struct server *srv_ssl = NULL; +#endif + + if (global.mode & MODE_MWORKER_WAIT) + return ERR_NONE; + + if (!(curproxy->cap & PR_CAP_HTTPCLIENT)) + return ERR_NONE; /* nothing to do */ + + /* copy logs from "global" log list */ + list_for_each_entry(logger, &global.loggers, list) { + struct logger *node = dup_logger(logger); + + if (!node) { + memprintf(&errmsg, "out of memory."); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + LIST_APPEND(&curproxy->loggers, &node->list); + } + if (curproxy->conf.logformat_string) { + curproxy->conf.args.ctx = ARGC_LOG; + if (!parse_logformat_string(curproxy->conf.logformat_string, curproxy, &curproxy->logformat, + LOG_OPT_MANDATORY|LOG_OPT_MERGE_SPACES, + SMP_VAL_FE_LOG_END, &errmsg)) { + memprintf(&errmsg, "failed to parse log-format : %s.", errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + curproxy->conf.args.file = NULL; + curproxy->conf.args.line = 0; + } + +#ifdef USE_OPENSSL + /* initialize the SNI for the SSL servers */ + + for (srv = curproxy->srv; srv != NULL; srv = srv->next) { + if (srv->xprt == xprt_get(XPRT_SSL)) { + srv_ssl = srv; + } + } + if (srv_ssl && !srv_ssl->sni_expr) { + /* init the SNI expression */ + /* always use the host header as SNI, without the port */ + srv_ssl->sni_expr = strdup("req.hdr(host),field(1,:)"); + err_code |= server_parse_sni_expr(srv_ssl, curproxy, &errmsg); + if (err_code & ERR_CODE) { + memprintf(&errmsg, "failed to configure sni: %s.", errmsg); + goto err; + } + } +#endif + +err: + if (err_code & ERR_CODE) { + ha_alert("httpclient: failed to initialize: %s\n", errmsg); + free(errmsg); + + } + return err_code; +} + +/* initialize the proxy and servers for the HTTP client */ + +REGISTER_PRE_CHECK(httpclient_precheck); +REGISTER_POST_PROXY_CHECK(httpclient_postcheck_proxy); + +static int httpclient_parse_global_resolvers(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char 
*file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + /* any configuration should set the hard_error flag */ + hard_error_resolvers = 1; + + free(resolvers_id); + resolvers_id = strdup(args[1]); + + return 0; +} + +/* config parser for global "httpclient.resolvers.disabled", accepts "on" or "off" */ +static int httpclient_parse_global_resolvers_disabled(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (strcmp(args[1], "on") == 0) + resolvers_disabled = 1; + else if (strcmp(args[1], "off") == 0) + resolvers_disabled = 0; + else { + memprintf(err, "'%s' expects either 'on' or 'off' but got '%s'.", args[0], args[1]); + return -1; + } + return 0; +} + +static int httpclient_parse_global_prefer(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + /* any configuration should set the hard_error flag */ + hard_error_resolvers = 1; + + + if (strcmp(args[1],"ipv4") == 0) + resolvers_prefer = "ipv4"; + else if (strcmp(args[1],"ipv6") == 0) + resolvers_prefer = "ipv6"; + else { + ha_alert("parsing [%s:%d] : '%s' expects 'ipv4' or 'ipv6' as argument.\n", file, line, args[0]); + return -1; + } + + return 0; +} + + +#ifdef USE_OPENSSL +static int httpclient_parse_global_ca_file(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + /* any configuration should set the hard_error flag */ + hard_error_ssl = 1; + + free(httpclient_ssl_ca_file); + httpclient_ssl_ca_file = strdup(args[1]); + + return 0; +} + +static int httpclient_parse_global_verify(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + /* any configuration should set the hard_error flag */ + hard_error_ssl = 1; + + if (strcmp(args[1],"none") == 0) + httpclient_ssl_verify = SSL_SOCK_VERIFY_NONE; + else if (strcmp(args[1],"required") == 0) + httpclient_ssl_verify = SSL_SOCK_VERIFY_REQUIRED; + else { + ha_alert("parsing [%s:%d] : '%s' expects 'none' or 'required' as argument.\n", file, line, args[0]); + return -1; + } + + return 0; +} +#endif /* ! 
USE_OPENSSL */ + +static int httpclient_parse_global_retries(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", + file, line, args[0]); + return -1; + } + httpclient_retries = atol(args[1]); + + return 0; +} + +static int httpclient_parse_global_timeout_connect(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + const char *res; + unsigned timeout; + + if (too_many_args(1, args, err, NULL)) + return -1; + + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", + file, line, args[0]); + return -1; + } + + res = parse_time_err(args[1], &timeout, TIME_UNIT_MS); + if (res == PARSE_TIME_OVER) { + memprintf(err, "timer overflow in argument '%s' to '%s' (maximum value is 2147483647 ms or ~24.8 days)", + args[1], args[0]); + return -1; + } + else if (res == PARSE_TIME_UNDER) { + memprintf(err, "timer underflow in argument '%s' to '%s' (minimum non-null value is 1 ms)", + args[1], args[0]); + return -1; + } + else if (res) { + memprintf(err, "unexpected character '%c' in '%s'", *res, args[0]); + return -1; + } + + if (*args[2] != 0) { + memprintf(err, "'%s' : unexpected extra argument '%s' after value '%s'.", args[0], args[2], args[1]); + return -1; + } + + httpclient_timeout_connect = MS_TO_TICKS(timeout); + + return 0; +} + + +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_GLOBAL, "httpclient.resolvers.disabled", httpclient_parse_global_resolvers_disabled }, + { CFG_GLOBAL, "httpclient.resolvers.id", httpclient_parse_global_resolvers }, + { CFG_GLOBAL, "httpclient.resolvers.prefer", httpclient_parse_global_prefer }, + { CFG_GLOBAL, "httpclient.retries", httpclient_parse_global_retries }, + { CFG_GLOBAL, "httpclient.timeout.connect", httpclient_parse_global_timeout_connect }, +#ifdef USE_OPENSSL + { CFG_GLOBAL, "httpclient.ssl.verify", httpclient_parse_global_verify }, + { CFG_GLOBAL, "httpclient.ssl.ca-file", httpclient_parse_global_ca_file }, +#endif + { 0, NULL, NULL }, +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); diff --git a/src/http_conv.c b/src/http_conv.c new file mode 100644 index 0000000..cf515a8 --- /dev/null +++ b/src/http_conv.c @@ -0,0 +1,453 @@ +/* + * HTTP sample conversion + * + * Copyright 2000-2018 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <sys/types.h> + +#include <ctype.h> +#include <string.h> +#include <time.h> + +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/capture-t.h> +#include <haproxy/chunk.h> +#include <haproxy/http.h> +#include <haproxy/pool.h> +#include <haproxy/sample.h> +#include <haproxy/stream.h> +#include <haproxy/tools.h> +#include <haproxy/version.h> + +static int smp_check_http_date_unit(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err) +{ + return smp_check_date_unit(args, err); +} + +/* takes an UINT value on input supposed to represent the time since EPOCH, + * adds an optional offset found in args[0] and emits a string representing + * the date in RFC-1123/5322 format. 
If optional unit param in args[1] is + * provided, decode timestamp in milliseconds ("ms") or microseconds("us"), + * and use relevant output date format. + */ +static int sample_conv_http_date(const struct arg *args, struct sample *smp, void *private) +{ + const char day[7][4] = { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" }; + const char mon[12][4] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" }; + struct buffer *temp; + struct tm tm; + int sec_frac = 0; + time_t curr_date; + + /* add offset */ + if (args[0].type == ARGT_SINT) + smp->data.u.sint += args[0].data.sint; + + /* report in milliseconds */ + if (args[1].type == ARGT_SINT && args[1].data.sint == TIME_UNIT_MS) { + sec_frac = smp->data.u.sint % 1000; + smp->data.u.sint /= 1000; + } + /* report in microseconds */ + else if (args[1].type == ARGT_SINT && args[1].data.sint == TIME_UNIT_US) { + sec_frac = smp->data.u.sint % 1000000; + smp->data.u.sint /= 1000000; + } + + /* With high numbers, the date returned can be negative, the 55 bits mask prevent this. */ + curr_date = smp->data.u.sint & 0x007fffffffffffffLL; + + get_gmtime(curr_date, &tm); + + temp = get_trash_chunk(); + if (args[1].type == ARGT_SINT && args[1].data.sint != TIME_UNIT_S) { + temp->data = snprintf(temp->area, temp->size - temp->data, + "%s, %02d %s %04d %02d:%02d:%02d.%d GMT", + day[tm.tm_wday], tm.tm_mday, mon[tm.tm_mon], + 1900+tm.tm_year, + tm.tm_hour, tm.tm_min, tm.tm_sec, sec_frac); + } else { + temp->data = snprintf(temp->area, temp->size - temp->data, + "%s, %02d %s %04d %02d:%02d:%02d GMT", + day[tm.tm_wday], tm.tm_mday, mon[tm.tm_mon], + 1900+tm.tm_year, + tm.tm_hour, tm.tm_min, tm.tm_sec); + } + + smp->data.u.str = *temp; + smp->data.type = SMP_T_STR; + return 1; +} + +/* Arguments: The list of expected value, the number of parts returned and the separator */ +static int sample_conv_q_preferred(const struct arg *args, struct sample *smp, void *private) +{ + const char *al = smp->data.u.str.area; + const char *end = al + smp->data.u.str.data; + const char *token; + int toklen; + int qvalue; + const char *str; + const char *w; + int best_q = 0; + + /* Set the constant to the sample, because the output of the + * function will be peek in the constant configuration string. + */ + smp->flags |= SMP_F_CONST; + smp->data.u.str.size = 0; + smp->data.u.str.area = ""; + smp->data.u.str.data = 0; + + /* Parse the accept language */ + while (1) { + + /* Jump spaces, quit if the end is detected. */ + while (al < end && isspace((unsigned char)*al)) + al++; + if (al >= end) + break; + + /* Start of the first word. */ + token = al; + + /* Look for separator: isspace(), ',' or ';'. Next value if 0 length word. */ + while (al < end && *al != ';' && *al != ',' && !isspace((unsigned char)*al)) + al++; + if (al == token) + goto expect_comma; + + /* Length of the token. */ + toklen = al - token; + qvalue = 1000; + + /* Check if the token exists in the list. If the token not exists, + * jump to the next token. + */ + str = args[0].data.str.area; + w = str; + while (1) { + if (*str == ';' || *str == '\0') { + if (http_language_range_match(token, toklen, w, str - w)) + goto look_for_q; + if (*str == '\0') + goto expect_comma; + w = str + 1; + } + str++; + } + goto expect_comma; + +look_for_q: + + /* Jump spaces, quit if the end is detected. 
*/ + while (al < end && isspace((unsigned char)*al)) + al++; + if (al >= end) + goto process_value; + + /* If ',' is found, process the result */ + if (*al == ',') + goto process_value; + + /* If the character is different from ';', look + * for the end of the header part in best effort. + */ + if (*al != ';') + goto expect_comma; + + /* Assumes that the char is ';', now expect "q=". */ + al++; + + /* Jump spaces, process value if the end is detected. */ + while (al < end && isspace((unsigned char)*al)) + al++; + if (al >= end) + goto process_value; + + /* Expect 'q'. If no 'q', continue in best effort */ + if (*al != 'q') + goto process_value; + al++; + + /* Jump spaces, process value if the end is detected. */ + while (al < end && isspace((unsigned char)*al)) + al++; + if (al >= end) + goto process_value; + + /* Expect '='. If no '=', continue in best effort */ + if (*al != '=') + goto process_value; + al++; + + /* Jump spaces, process value if the end is detected. */ + while (al < end && isspace((unsigned char)*al)) + al++; + if (al >= end) + goto process_value; + + /* Parse the q value. */ + qvalue = http_parse_qvalue(al, &al); + +process_value: + + /* If the new q value is the best q value, then store the associated + * language in the response. If qvalue is the biggest value (1000), + * break the process. + */ + if (qvalue > best_q) { + smp->data.u.str.area = (char *)w; + smp->data.u.str.data = str - w; + if (qvalue >= 1000) + break; + best_q = qvalue; + } + +expect_comma: + + /* Expect comma or end. If the end is detected, quit the loop. */ + while (al < end && *al != ',') + al++; + if (al >= end) + break; + + /* Comma is found, jump it and restart the analyzer. */ + al++; + } + + /* Set default value if required. */ + if (smp->data.u.str.data == 0 && args[1].type == ARGT_STR) { + smp->data.u.str.area = args[1].data.str.area; + smp->data.u.str.data = args[1].data.str.data; + } + + /* Return true only if a matching language was found. */ + return smp->data.u.str.data != 0; +} + +/* This fetch url-decode any input string. */ +static int sample_conv_url_dec(const struct arg *args, struct sample *smp, void *private) +{ + int in_form = 0; + int len; + + /* If the constant flag is set or if not size is available at + * the end of the buffer, copy the string in other buffer + * before decoding. + */ + if (smp->flags & SMP_F_CONST || smp->data.u.str.size <= smp->data.u.str.data) { + struct buffer *str = get_trash_chunk(); + memcpy(str->area, smp->data.u.str.area, smp->data.u.str.data); + smp->data.u.str.area = str->area; + smp->data.u.str.size = str->size; + smp->flags &= ~SMP_F_CONST; + } + + /* Add final \0 required by url_decode(), and convert the input string. */ + smp->data.u.str.area[smp->data.u.str.data] = '\0'; + + if (args[0].type == ARGT_SINT) + in_form = !!args[0].data.sint; + + len = url_decode(smp->data.u.str.area, in_form); + if (len < 0) + return 0; + smp->data.u.str.data = len; + return 1; +} + +/* url-encode types and encode maps */ +enum encode_type { + ENC_QUERY = 0, +}; +long query_encode_map[(256 / 8) / sizeof(long)]; + +/* Check url-encode type */ +static int sample_conv_url_enc_check(struct arg *arg, struct sample_conv *conv, + const char *file, int line, char **err) +{ + enum encode_type enc_type; + + if (strcmp(arg->data.str.area, "") == 0) + enc_type = ENC_QUERY; + else if (strcmp(arg->data.str.area, "query") == 0) + enc_type = ENC_QUERY; + else { + memprintf(err, "Unexpected encode type. 
" + "Allowed value is 'query'"); + return 0; + } + + chunk_destroy(&arg->data.str); + arg->type = ARGT_SINT; + arg->data.sint = enc_type; + return 1; +} + +/* Initializes some url encode data at boot */ +static void sample_conf_url_enc_init() +{ + int i; + + memset(query_encode_map, 0, sizeof(query_encode_map)); + /* use rfc3986 to determine list of characters to keep unchanged for + * query string */ + for (i = 0; i < 256; i++) { + if (!((i >= 'a' && i <= 'z') || (i >= 'A' && i <= 'Z') + || (i >= '0' && i <= '9') || + i == '-' || i == '.' || i == '_' || i == '~')) + ha_bit_set(i, query_encode_map); + } +} + +INITCALL0(STG_PREPARE, sample_conf_url_enc_init); + +/* This fetch url-encode any input string. Only support query string for now */ +static int sample_conv_url_enc(const struct arg *args, struct sample *smp, void + *private) +{ + enum encode_type enc_type; + struct buffer *trash = get_trash_chunk(); + long *encode_map; + char *ret; + + enc_type = ENC_QUERY; + enc_type = args->data.sint; + + if (enc_type == ENC_QUERY) + encode_map = query_encode_map; + else + return 0; + + ret = encode_chunk(trash->area, trash->area + trash->size, '%', + encode_map, &smp->data.u.str); + if (ret == NULL || *ret != '\0') + return 0; + trash->data = ret - trash->area; + smp->data.u.str = *trash; + return 1; +} + +static int smp_conv_req_capture(const struct arg *args, struct sample *smp, void *private) +{ + struct proxy *fe; + int idx, i; + struct cap_hdr *hdr; + int len; + + if (args->type != ARGT_SINT) + return 0; + + if (!smp->strm) + return 0; + + fe = strm_fe(smp->strm); + idx = args->data.sint; + + /* Check the availability of the capture id. */ + if (idx > fe->nb_req_cap - 1) + return 0; + + /* Look for the original configuration. */ + for (hdr = fe->req_cap, i = fe->nb_req_cap - 1; + hdr != NULL && i != idx ; + i--, hdr = hdr->next); + if (!hdr) + return 0; + + /* check for the memory allocation */ + if (smp->strm->req_cap[hdr->index] == NULL) + smp->strm->req_cap[hdr->index] = pool_alloc(hdr->pool); + if (smp->strm->req_cap[hdr->index] == NULL) + return 0; + + /* Check length. */ + len = smp->data.u.str.data; + if (len > hdr->len) + len = hdr->len; + + /* Capture input data. */ + memcpy(smp->strm->req_cap[idx], smp->data.u.str.area, len); + smp->strm->req_cap[idx][len] = '\0'; + + return 1; +} + +static int smp_conv_res_capture(const struct arg *args, struct sample *smp, void *private) +{ + struct proxy *fe; + int idx, i; + struct cap_hdr *hdr; + int len; + + if (args->type != ARGT_SINT) + return 0; + + if (!smp->strm) + return 0; + + fe = strm_fe(smp->strm); + idx = args->data.sint; + + /* Check the availability of the capture id. */ + if (idx > fe->nb_rsp_cap - 1) + return 0; + + /* Look for the original configuration. */ + for (hdr = fe->rsp_cap, i = fe->nb_rsp_cap - 1; + hdr != NULL && i != idx ; + i--, hdr = hdr->next); + if (!hdr) + return 0; + + /* check for the memory allocation */ + if (smp->strm->res_cap[hdr->index] == NULL) + smp->strm->res_cap[hdr->index] = pool_alloc(hdr->pool); + if (smp->strm->res_cap[hdr->index] == NULL) + return 0; + + /* Check length. */ + len = smp->data.u.str.data; + if (len > hdr->len) + len = hdr->len; + + /* Capture input data. */ + memcpy(smp->strm->res_cap[idx], smp->data.u.str.area, len); + smp->strm->res_cap[idx][len] = '\0'; + + return 1; +} + +/************************************************************************/ +/* All supported converter keywords must be declared here. 
*/ +/************************************************************************/ + +/* Note: must not be declared <const> as its list will be overwritten */ +static struct sample_conv_kw_list sample_conv_kws = {ILH, { + { "http_date", sample_conv_http_date, ARG2(0,SINT,STR), smp_check_http_date_unit, SMP_T_SINT, SMP_T_STR}, + { "language", sample_conv_q_preferred, ARG2(1,STR,STR), NULL, SMP_T_STR, SMP_T_STR}, + { "capture-req", smp_conv_req_capture, ARG1(1,SINT), NULL, SMP_T_STR, SMP_T_STR}, + { "capture-res", smp_conv_res_capture, ARG1(1,SINT), NULL, SMP_T_STR, SMP_T_STR}, + { "url_dec", sample_conv_url_dec, ARG1(0,SINT), NULL, SMP_T_STR, SMP_T_STR}, + { "url_enc", sample_conv_url_enc, ARG1(1,STR), sample_conv_url_enc_check, SMP_T_STR, SMP_T_STR}, + { NULL, NULL, 0, 0, 0 }, +}}; + +INITCALL1(STG_REGISTER, sample_register_convs, &sample_conv_kws); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/http_ext.c b/src/http_ext.c new file mode 100644 index 0000000..a367519 --- /dev/null +++ b/src/http_ext.c @@ -0,0 +1,1881 @@ +/* + * HTTP extensions logic and helpers + * + * Copyright 2022 HAProxy Technologies + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2.1 of the License, or (at your option) any later version. + * + */ + +#include <haproxy/sample.h> +#include <haproxy/http_htx.h> +#include <haproxy/http_ext.h> +#include <haproxy/chunk.h> +#include <haproxy/stream.h> +#include <haproxy/proxy.h> +#include <haproxy/sc_strm.h> +#include <haproxy/obj_type.h> +#include <haproxy/cfgparse.h> +#include <haproxy/arg.h> +#include <haproxy/initcall.h> +#include <haproxy/tools.h> + +/* + * =========== ANALYZE =========== + * below are http process/ana helpers + */ + +/* checks if <input> contains rfc7239 compliant port + * Returns 1 for success and 0 for failure + * if <port> is not NULL, it will be set to the extracted value contained + * in <input> + * <input> will be consumed accordingly (parsed/extracted characters are + * removed from <input>) + */ +static inline int http_7239_extract_port(struct ist *input, uint16_t *port) +{ + char *start = istptr(*input); + uint32_t port_cast = 0; + int it = 0; + + /* strtol does not support non-null terminated str, + * we extract port ourselves + */ + while (it < istlen(*input) && + isdigit((unsigned char)start[it])) { + port_cast = (port_cast * 10) + (start[it] - '0'); + if (port_cast > 65535) + return 0; /* invalid port */ + it += 1; + } + if (!port_cast) + return 0; /* invalid port */ + /* ok */ + if (port) + *port = (uint16_t)port_cast; + *input = istadv(*input, it); + return 1; +} + +/* check if char is a valid obfuscated identifier char + * (according to 7239 RFC) + * Returns non zero value for valid char + */ +static inline int http_7239_valid_obfsc(char c) +{ + return (isalnum((unsigned char)c) || + (c == '.' 
|| c == '-' || c == '_')); +} + +/* checks if <input> contains rfc7239 compliant obfuscated identifier + * Returns 1 for success and 0 for failure + * if <obfs> is not NULL, it will be set to the extracted value contained + * in <input> + * <input> will be consumed accordingly (parsed/extracted characters are + * removed from <input>) + */ +static inline int http_7239_extract_obfs(struct ist *input, struct ist *obfs) +{ + int it = 0; + + if (obfs) + obfs->ptr = input->ptr; + + while (it < istlen(*input) && istptr(*input)[it] != ';') { + if (!http_7239_valid_obfsc(istptr(*input)[it])) + break; /* end of obfs token */ + it += 1; + } + if (obfs) + obfs->len = it; + *input = istadv(*input, it); + return !!it; +} + +/* checks if <input> contains rfc7239 compliant IPV4 address + * Returns 1 for success and 0 for failure + * if <ip> is not NULL, it will be set to the extracted value contained + * in <input> + * <input> will be consumed accordingly (parsed/extracted characters are + * removed from <input>) + */ +static inline int http_7239_extract_ipv4(struct ist *input, struct in_addr *ip) +{ + char ip4[INET_ADDRSTRLEN]; + unsigned char buf[sizeof(struct in_addr)]; + int it = 0; + + /* extract ipv4 addr */ + while (it < istlen(*input) && it < (sizeof(ip4) - 1)) { + if (!isdigit((unsigned char)istptr(*input)[it]) && + istptr(*input)[it] != '.') + break; /* no more ip4 char */ + ip4[it] = istptr(*input)[it]; + it += 1; + } + ip4[it] = 0; + if (inet_pton(AF_INET, ip4, buf) != 1) + return 0; /* invalid ip4 addr */ + /* ok */ + if (ip) + memcpy(ip, buf, sizeof(buf)); + *input = istadv(*input, it); + return 1; +} + +/* checks if <input> contains rfc7239 compliant IPV6 address + * assuming input.len >= 1 and first char is '[' + * Returns 1 for success and 0 for failure + * if <ip> is not NULL, it will be set to the extracted value contained + * in <input> + * <input> will be consumed accordingly (parsed/extracted characters are + * removed from <input>) + */ +static inline int http_7239_extract_ipv6(struct ist *input, struct in6_addr *ip) +{ + char ip6[INET6_ADDRSTRLEN]; + unsigned char buf[sizeof(struct in6_addr)]; + int it = 0; + + *input = istnext(*input); /* skip '[' leading char */ + /* extract ipv6 addr */ + while (it < istlen(*input) && + it < (sizeof(ip6) - 1)) { + if (!isalnum((unsigned char)istptr(*input)[it]) && + istptr(*input)[it] != ':') + break; /* no more ip6 char */ + ip6[it] = istptr(*input)[it]; + it += 1; + } + ip6[it] = 0; + if ((istlen(*input)-it) < 1 || istptr(*input)[it] != ']') + return 0; /* missing ending "]" char */ + it += 1; + if (inet_pton(AF_INET6, ip6, buf) != 1) + return 0; /* invalid ip6 addr */ + /* ok */ + if (ip) + memcpy(ip, buf, sizeof(buf)); + *input = istadv(*input, it); + return 1; +} + +/* checks if <input> contains rfc7239 compliant host + * <quoted> is used to determine if the current input is being extracted + * from a quoted (non zero) or unquoted (zero) token, as the parsing rules + * differ whether the input is quoted or not according to the rfc. 
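+ * In practice, a bracketed IPv6 address or a trailing ':port' is only
+ * accepted when the token is quoted.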
+ * Returns 1 for success and 0 for failure + * if <host> is not NULL, it will be set to the extracted value contained + * in <input> + * <input> will be consumed accordingly (parsed/extracted characters are + * removed from <input>) + */ +static inline int http_7239_extract_host(struct ist *input, struct ist *host, int quoted) +{ + if (istlen(*input) < 1) + return 0; /* invalid input */ + + if (host) + host->ptr = input->ptr; + + if (quoted && *istptr(*input) == '[') { + /* raw ipv6 address */ + if (!http_7239_extract_ipv6(input, NULL)) + return 0; /* invalid addr */ + } + else { + /* ipv4 or dns */ + while (istlen(*input)) { + if (!isalnum((unsigned char)*istptr(*input)) && + *istptr(*input) != '.') + break; /* end of hostname token */ + *input = istnext(*input); + } + } + if (istlen(*input) < 1 || *istptr(*input) != ':') { + goto out; /* no optional port provided */ + } + if (!quoted) + return 0; /* not supported */ + *input = istnext(*input); /* skip ':' */ + /* validate port */ + if (!http_7239_extract_port(input, NULL)) + return 0; /* invalid port */ + out: + if (host) + host->len = (input->ptr - host->ptr); + return 1; +} + +/* checks if <input> contains rfc7239 compliant nodename + * <quoted> is used to determine if the current input is being extracted + * from a quoted (non zero) or unquoted (zero) token, as the parsing rules + * differ whether the input is quoted or not according to the rfc. + * Returns 1 for success and 0 for failure + * if <nodename> is not NULL, it will be set to the extracted value contained + * in <input> + * <input> will be consumed accordingly (parsed/extracted characters are + * removed from <input>) + */ +static inline int http_7239_extract_nodename(struct ist *input, struct forwarded_header_nodename *nodename, int quoted) +{ + if (istlen(*input) < 1) + return 0; /* invalid input */ + if (*istptr(*input) == '_') { + struct ist *obfs = NULL; + + /* obfuscated nodename */ + *input = istnext(*input); /* skip '_' */ + if (nodename) { + nodename->type = FORWARDED_HEADER_OBFS; + obfs = &nodename->obfs; + } + if (!http_7239_extract_obfs(input, obfs)) + return 0; /* invalid obfs */ + } else if (*istptr(*input) == 'u') { + /* "unknown" nodename? */ + if (istlen(*input) < 7 || + strncmp("unknown", istptr(*input), 7)) + return 0; /* syntax error */ + *input = istadv(*input, 7); /* skip "unknown" */ + if (nodename) + nodename->type = FORWARDED_HEADER_UNK; + } else if (quoted && *istptr(*input) == '[') { + struct in6_addr *ip6 = NULL; + + /* ipv6 address */ + if (nodename) { + struct sockaddr_in6 *addr = (void *)&nodename->ip; + + ip6 = &addr->sin6_addr; + addr->sin6_family = AF_INET6; + nodename->type = FORWARDED_HEADER_IP; + } + if (!http_7239_extract_ipv6(input, ip6)) + return 0; /* invalid ip6 */ + } else if (*istptr(*input)) { + struct in_addr *ip = NULL; + + /* ipv4 address */ + if (nodename) { + struct sockaddr_in *addr = (void *)&nodename->ip; + + ip = &addr->sin_addr; + addr->sin_family = AF_INET; + nodename->type = FORWARDED_HEADER_IP; + } + if (!http_7239_extract_ipv4(input, ip)) + return 0; /* invalid ip */ + } else + return 0; /* unexpected char */ + + /* ok */ + return 1; +} + +/* checks if <input> contains rfc7239 compliant nodeport + * <quoted> is used to determine if the current input is being extracted + * from a quoted (non zero) or unquoted (zero) token, as the parsing rules + * differ whether the input is quoted or not according to the rfc. 
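+ * A nodeport is either a plain port number or an obfuscated token
+ * introduced by a '_' prefix.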
+ * Returns 1 for success and 0 for failure + * if <nodeport> is not NULL, it will be set to the extracted value contained + * in <input> + * <input> will be consumed accordingly (parsed/extracted characters are + * removed from <input>) + */ +static inline int http_7239_extract_nodeport(struct ist *input, struct forwarded_header_nodeport *nodeport) +{ + if (*istptr(*input) == '_') { + struct ist *obfs = NULL; + + /* obfuscated nodeport */ + *input = istnext(*input); /* skip '_' */ + if (nodeport) { + nodeport->type = FORWARDED_HEADER_OBFS; + obfs = &nodeport->obfs; + } + if (!http_7239_extract_obfs(input, obfs)) + return 0; /* invalid obfs */ + } else { + uint16_t *port = NULL; + + /* normal port */ + if (nodeport) { + nodeport->type = FORWARDED_HEADER_PORT; + port = &nodeport->port; + } + if (!http_7239_extract_port(input, port)) + return 0; /* invalid port */ + } + /* ok */ + return 1; +} + +/* checks if <input> contains rfc7239 compliant node (nodename:nodeport token) + * <quoted> is used to determine if the current input is being extracted + * from a quoted (non zero) or unquoted (zero) token, as the parsing rules + * differ whether the input is quoted or not according to the rfc. + * Returns 1 for success and 0 for failure + * if <node> is not NULL, it will be set to the extracted value contained + * in <input> + * <input> will be consumed accordingly (parsed/extracted characters are + * removed from <input>) + */ +static inline int http_7239_extract_node(struct ist *input, struct forwarded_header_node *node, int quoted) +{ + struct forwarded_header_nodename *nodename = NULL; + struct forwarded_header_nodeport *nodeport = NULL; + + if (node) { + nodename = &node->nodename; + nodeport = &node->nodeport; + node->raw.ptr = input->ptr; + } + if (!http_7239_extract_nodename(input, nodename, quoted)) + return 0; /* invalid nodename */ + if (istlen(*input) < 1 || *istptr(*input) != ':') { + if (node) + node->nodeport.type = FORWARDED_HEADER_UNK; + goto out; /* no optional port provided */ + } + if (!quoted) + return 0; /* not supported */ + *input = istnext(*input); + if (!http_7239_extract_nodeport(input, nodeport)) + return 0; /* invalid nodeport */ + out: + /* ok */ + if (node) + node->raw.len = input->ptr - node->raw.ptr; + return 1; +} + +static inline int _forwarded_header_save_ctx(struct forwarded_header_ctx *ctx, int current_step, int required_steps) +{ + return (ctx && (current_step & required_steps)); +} + +static inline void _forwarded_header_quote_expected(struct ist *hdr, uint8_t *quoted) +{ + if (istlen(*hdr) > 0 && *istptr(*hdr) == '"') { + *quoted = 1; + /* node is quoted, we must find corresponding + * ending quote at the end of the token + */ + *hdr = istnext(*hdr); /* skip quote */ + } +} + +/* checks if current header <hdr> is RFC 7239 compliant and can be "trusted". + * function will stop parsing as soon as every <required_steps> have + * been validated or error is encountered. + * Provide FORWARDED_HEADER_ALL for a full header validating spectrum. + * You may provide limited scope to perform quick searches on specific attributes + * If <ctx> is provided (not NULL), parsed attributes will be stored according to + * their types, allowing you to extract some useful information from the header. + * Returns 0 on failure and <validated_steps> bitfield on success. 
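+ * A minimal usage sketch (hypothetical caller, <value> holds a single
+ * header value; shown for illustration only):
+ *
+ *   struct forwarded_header_ctx ctx;
+ *
+ *   if (http_validate_7239_header(value, FORWARDED_HEADER_FOR, &ctx) &
+ *       FORWARDED_HEADER_FOR)
+ *           ... ctx.nfor now describes the "for" node of <value> ...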
+ */ +int http_validate_7239_header(struct ist hdr, int required_steps, struct forwarded_header_ctx *ctx) +{ + int validated_steps = 0; + int current_step = 0; + uint8_t first = 1; + uint8_t quoted = 0; + + while (istlen(hdr) && (required_steps & ~validated_steps)) { + if (!first) { + if (*istptr(hdr) == ';') + hdr = istnext(hdr); /* skip ';' */ + else + goto not_ok; /* unexpected char */ + } + else + first = 0; + + if (!(validated_steps & FORWARDED_HEADER_FOR) && istlen(hdr) > 4 && + strncmp("for=", istptr(hdr), 4) == 0) { + struct forwarded_header_node *node = NULL; + + /* for parameter */ + current_step = FORWARDED_HEADER_FOR; + hdr = istadv(hdr, 4); /* skip "for=" */ + _forwarded_header_quote_expected(&hdr, "ed); + if (_forwarded_header_save_ctx(ctx, current_step, required_steps)) + node = &ctx->nfor; + /* validate node */ + if (!http_7239_extract_node(&hdr, node, quoted)) + goto not_ok; /* invalid node */ + } + else if (!(validated_steps & FORWARDED_HEADER_BY) && istlen(hdr) > 3 && + strncmp("by=", istptr(hdr), 3) == 0) { + struct forwarded_header_node *node = NULL; + + /* by parameter */ + current_step = FORWARDED_HEADER_BY; + hdr = istadv(hdr, 3); /* skip "by=" */ + _forwarded_header_quote_expected(&hdr, "ed); + if (_forwarded_header_save_ctx(ctx, current_step, required_steps)) + node = &ctx->nby; + /* validate node */ + if (!http_7239_extract_node(&hdr, node, quoted)) + goto not_ok; /* invalid node */ + } + else if (!(validated_steps & FORWARDED_HEADER_HOST) && istlen(hdr) > 5 && + strncmp("host=", istptr(hdr), 5) == 0) { + struct ist *host = NULL; + + /* host parameter */ + current_step = FORWARDED_HEADER_HOST; + hdr = istadv(hdr, 5); /* skip "host=" */ + _forwarded_header_quote_expected(&hdr, "ed); + if (_forwarded_header_save_ctx(ctx, current_step, required_steps)) + host = &ctx->host; + /* validate host */ + if (!http_7239_extract_host(&hdr, host, quoted)) + goto not_ok; /* invalid host */ + } + else if (!(validated_steps & FORWARDED_HEADER_PROTO) && istlen(hdr) > 6 && + strncmp("proto=", istptr(hdr), 6) == 0) { + /* proto parameter */ + current_step = FORWARDED_HEADER_PROTO; + hdr = istadv(hdr, 6); /* skip "proto=" */ + /* validate proto (only common used http|https are supported for now) */ + if (istlen(hdr) < 4 || strncmp("http", istptr(hdr), 4)) + goto not_ok; + hdr = istadv(hdr, 4); /* skip "http" */ + if (istlen(hdr) && *istptr(hdr) == 's') { + hdr = istnext(hdr); + if (_forwarded_header_save_ctx(ctx, current_step, required_steps)) + ctx->proto = FORWARDED_HEADER_HTTPS; + } else if (_forwarded_header_save_ctx(ctx, current_step, required_steps)) + ctx->proto = FORWARDED_HEADER_HTTP; + /* rfc allows for potential proto quoting, but we don't support + * it: it is not common usage + */ + } + else { + /* not supported + * rfc allows for upcoming extensions + * but obviously, we can't trust them + * as they are not yet standardized + */ + + goto not_ok; + } + /* quote check */ + if (quoted) { + if (istlen(hdr) < 1 || *istptr(hdr) != '"') { + /* matching ending quote not found */ + goto not_ok; + } + hdr = istnext(hdr); /* skip ending quote */ + quoted = 0; /* reset */ + } + validated_steps |= current_step; + } + + return validated_steps; + + not_ok: + return 0; +} + +static inline void _7239_print_ip6(struct buffer *out, struct in6_addr *ip6_addr, int quoted) +{ + char pn[INET6_ADDRSTRLEN]; + + inet_ntop(AF_INET6, + ip6_addr, + pn, sizeof(pn)); + if (!quoted) + chunk_appendf(out, "\""); /* explicit quoting required for ipv6 */ + chunk_appendf(out, "[%s]", pn); +} + +static 
inline void http_build_7239_header_nodename(struct buffer *out, + struct stream *s, struct proxy *curproxy, + const struct sockaddr_storage *addr, + struct http_ext_7239_forby *forby) +{ + struct in6_addr *ip6_addr; + int quoted = !!forby->np_mode; + + if (forby->nn_mode == HTTP_7239_FORBY_ORIG) { + if (addr && addr->ss_family == AF_INET) { + unsigned char *pn = (unsigned char *)&((struct sockaddr_in *)addr)->sin_addr; + + chunk_appendf(out, "%d.%d.%d.%d", pn[0], pn[1], pn[2], pn[3]); + } + else if (addr && addr->ss_family == AF_INET6) { + ip6_addr = &((struct sockaddr_in6 *)addr)->sin6_addr; + _7239_print_ip6(out, ip6_addr, quoted); + } + /* else: not supported */ + } + else if (forby->nn_mode == HTTP_7239_FORBY_SMP && forby->nn_expr) { + struct sample *smp; + + smp = sample_process(curproxy, s->sess, s, + SMP_OPT_DIR_REQ | SMP_OPT_FINAL, forby->nn_expr, NULL); + + if (smp) { + if (smp->data.type == SMP_T_IPV6) { + /* smp is valid IP6, print with RFC compliant output */ + ip6_addr = &smp->data.u.ipv6; + _7239_print_ip6(out, ip6_addr, quoted); + } + else if (sample_casts[smp->data.type][SMP_T_STR] && + sample_casts[smp->data.type][SMP_T_STR](smp)) { + struct ist validate_n = ist2(smp->data.u.str.area, smp->data.u.str.data); + struct ist validate_o = ist2(smp->data.u.str.area, smp->data.u.str.data); + struct forwarded_header_nodename nodename; + + /* validate nodename */ + if (http_7239_extract_nodename(&validate_n, &nodename, 1) && + !istlen(validate_n)) { + if (nodename.type == FORWARDED_HEADER_IP && + nodename.ip.ss_family == AF_INET6) { + /* special care needed for valid ip6 nodename (quoting) */ + ip6_addr = &((struct sockaddr_in6 *)&nodename.ip)->sin6_addr; + _7239_print_ip6(out, ip6_addr, quoted); + } else { + /* no special care needed, input is already rfc compliant, + * just print as regular non quoted string + */ + chunk_cat(out, &smp->data.u.str); + } + } + else if (http_7239_extract_obfs(&validate_o, NULL) && + !istlen(validate_o)) { + /* raw user input that should be printed as 7239 obfs */ + chunk_appendf(out, "_%.*s", (int)smp->data.u.str.data, smp->data.u.str.area); + } + /* else: not compliant */ + } + /* else: cannot be casted to str */ + } + /* else: smp error */ + } +} + +static inline void http_build_7239_header_nodeport(struct buffer *out, + struct stream *s, struct proxy *curproxy, + const struct sockaddr_storage *addr, + struct http_ext_7239_forby *forby) +{ + if (forby->np_mode == HTTP_7239_FORBY_ORIG) { + if (addr && addr->ss_family == AF_INET) + chunk_appendf(out, "%d", ntohs(((struct sockaddr_in *)addr)->sin_port)); + else if (addr && addr->ss_family == AF_INET6) + chunk_appendf(out, "%d", ntohs(((struct sockaddr_in6 *)addr)->sin6_port)); + /* else: not supported */ + } + else if (forby->np_mode == HTTP_7239_FORBY_SMP && forby->np_expr) { + struct sample *smp; + + smp = sample_fetch_as_type(curproxy, s->sess, s, + SMP_OPT_DIR_REQ | SMP_OPT_FINAL, forby->np_expr, SMP_T_STR); + if (smp) { + struct ist validate_n = ist2(smp->data.u.str.area, smp->data.u.str.data); + struct ist validate_o = ist2(smp->data.u.str.area, smp->data.u.str.data); + + /* validate nodeport */ + if (http_7239_extract_nodeport(&validate_n, NULL) && + !istlen(validate_n)) { + /* no special care needed, input is already rfc compliant, + * just print as regular non quoted string + */ + chunk_cat(out, &smp->data.u.str); + } + else if (http_7239_extract_obfs(&validate_o, NULL) && + !istlen(validate_o)) { + /* raw user input that should be printed as 7239 obfs */ + chunk_appendf(out, "_%.*s", 
(int)smp->data.u.str.data, smp->data.u.str.area); + } + /* else: not compliant */ + } + /* else: smp error */ + } +} + +static inline void http_build_7239_header_node(struct buffer *out, + struct stream *s, struct proxy *curproxy, + const struct sockaddr_storage *addr, + struct http_ext_7239_forby *forby) +{ + size_t offset_start; + size_t offset_save; + + offset_start = out->data; + if (forby->np_mode) + chunk_appendf(out, "\""); + offset_save = out->data; + http_build_7239_header_nodename(out, s, curproxy, addr, forby); + if (offset_save == out->data) { + /* could not build nodename, either because some + * data is not available or user is providing bad input + */ + chunk_appendf(out, "unknown"); + } + if (forby->np_mode) { + chunk_appendf(out, ":"); + offset_save = out->data; + http_build_7239_header_nodeport(out, s, curproxy, addr, forby); + if (offset_save == out->data) { + /* could not build nodeport, either because some data is + * not available or user is providing bad input + */ + out->data = offset_save - 1; + } + } + if (out->data != offset_start && out->area[offset_start] == '"') + chunk_appendf(out, "\""); /* add matching end quote */ +} + +static inline void http_build_7239_header_host(struct buffer *out, + struct stream *s, struct proxy *curproxy, + struct htx *htx, struct http_ext_7239_host *host) +{ + struct http_hdr_ctx ctx = { .blk = NULL }; + char *str = NULL; + int str_len = 0; + + if (host->mode == HTTP_7239_HOST_ORIG && + http_find_header(htx, ist("host"), &ctx, 0)) { + str = ctx.value.ptr; + str_len = ctx.value.len; + print_host: + { + struct ist validate = ist2(str, str_len); + /* host check, to ensure rfc compliant output + * (assuming host is quoted/escaped) + */ + if (http_7239_extract_host(&validate, NULL, 1) && !istlen(validate)) + chunk_memcat(out, str, str_len); + /* else: not compliant or partially compliant */ + } + + } + else if (host->mode == HTTP_7239_HOST_SMP && host->expr) { + struct sample *smp; + + smp = sample_fetch_as_type(curproxy, s->sess, s, + SMP_OPT_DIR_REQ | SMP_OPT_FINAL, host->expr, SMP_T_STR); + if (smp) { + str = smp->data.u.str.area; + str_len = smp->data.u.str.data; + goto print_host; + } + /* else: smp error */ + } +} + +/* Tries build 7239 header according to <curproxy> parameters and <s> context + * It both depends on <curproxy>->http_ext->fwd for config and <s> for request + * context data. + * The function will write output to <out> buffer + * Returns 1 for success and 0 for error (ie: not enough space in buffer) + */ +static int http_build_7239_header(struct buffer *out, + struct stream *s, struct proxy *curproxy, struct htx *htx) +{ + struct connection *cli_conn = objt_conn(strm_sess(s)->origin); + + if (curproxy->http_ext->fwd->p_proto) { + chunk_appendf(out, "%sproto=%s", ((out->data) ? ";" : ""), + ((conn_is_ssl(cli_conn)) ? "https" : "http")); + } + if (curproxy->http_ext->fwd->p_host.mode) { + /* always add quotes for host parameter to make output compliance checks simpler */ + chunk_appendf(out, "%shost=\"", ((out->data) ? ";" : "")); + /* ignore return value for now, but could be useful some day */ + http_build_7239_header_host(out, s, curproxy, htx, &curproxy->http_ext->fwd->p_host); + chunk_appendf(out, "\""); + } + + if (curproxy->http_ext->fwd->p_by.nn_mode) { + const struct sockaddr_storage *dst = sc_dst(s->scf); + + chunk_appendf(out, "%sby=", ((out->data) ? 
";" : "")); + http_build_7239_header_node(out, s, curproxy, dst, &curproxy->http_ext->fwd->p_by); + } + + if (curproxy->http_ext->fwd->p_for.nn_mode) { + const struct sockaddr_storage *src = sc_src(s->scf); + + chunk_appendf(out, "%sfor=", ((out->data) ? ";" : "")); + http_build_7239_header_node(out, s, curproxy, src, &curproxy->http_ext->fwd->p_for); + } + if (unlikely(out->data == out->size)) { + /* not enough space in buffer, error */ + return 0; + } + return 1; +} + +/* This function will try to inject RFC 7239 forwarded header if + * configured on the backend (ignored for frontends). + * Will do nothing if the option is not enabled on the proxy. + * Returns 1 for success and 0 for failure + */ +int http_handle_7239_header(struct stream *s, struct channel *req) +{ + struct proxy *curproxy = s->be; /* ignore frontend */ + + if (curproxy->http_ext && curproxy->http_ext->fwd) { + struct htx *htx = htxbuf(&req->buf); + int validate = 1; + struct http_hdr_ctx find = { .blk = NULL }; + struct http_hdr_ctx last = { .blk = NULL}; + struct ist hdr = ist("forwarded"); + + /* ok, let's build forwarded header */ + chunk_reset(&trash); + if (unlikely(!http_build_7239_header(&trash, s, curproxy, htx))) + return 0; /* error when building header (bad user conf or memory error) */ + + /* validate existing forwarded header (including multiple values), + * hard stop if error is encountered + */ + while (http_find_header(htx, hdr, &find, 0)) { + /* validate current header chunk */ + if (!http_validate_7239_header(find.value, FORWARDED_HEADER_ALL, NULL)) { + /* at least one error, existing forwarded header not OK, add our own + * forwarded header, so that it can be trusted + */ + validate = 0; + break; + } + last = find; + } + /* no errors, append our data at the end of existing header */ + if (last.blk && validate) { + if (unlikely(!http_append_header_value(htx, &last, ist2(trash.area, trash.data)))) + return 0; /* htx error */ + } + else { + if (unlikely(!http_add_header(htx, hdr, ist2(trash.area, trash.data)))) + return 0; /* htx error */ + } + } + return 1; +} + +/* + * add X-Forwarded-For if either the frontend or the backend + * asks for it. + * Returns 1 for success and 0 for failure + */ +int http_handle_xff_header(struct stream *s, struct channel *req) +{ + struct session *sess = s->sess; + struct http_ext_xff *f_xff = NULL; + struct http_ext_xff *b_xff = NULL; + + if (sess->fe->http_ext && sess->fe->http_ext->xff) { + /* frontend */ + f_xff = sess->fe->http_ext->xff; + } + if (s->be->http_ext && s->be->http_ext->xff) { + /* backend */ + b_xff = s->be->http_ext->xff; + } + + if (f_xff || b_xff) { + struct htx *htx = htxbuf(&req->buf); + const struct sockaddr_storage *src = sc_src(s->scf); + struct http_hdr_ctx ctx = { .blk = NULL }; + struct ist hdr = ((b_xff) ? b_xff->hdr_name : f_xff->hdr_name); + + if ((!f_xff || f_xff->mode == HTTP_XFF_IFNONE) && + (!b_xff || b_xff->mode == HTTP_XFF_IFNONE) && + http_find_header(htx, hdr, &ctx, 0)) { + /* The header is set to be added only if none is present + * and we found it, so don't do anything. + */ + } + else if (src && src->ss_family == AF_INET) { + /* Add an X-Forwarded-For header unless the source IP is + * in the 'except' network range. 
+			 */
+			if ((!f_xff || ipcmp2net(src, &f_xff->except_net)) &&
+			    (!b_xff || ipcmp2net(src, &b_xff->except_net))) {
+				unsigned char *pn = (unsigned char *)&((struct sockaddr_in *)src)->sin_addr;
+
+				/* Note: we rely on the backend to get the header name to be used for
+				 * x-forwarded-for, because the header is really meant for the backends.
+				 * However, if the backend did not specify any option, we have to rely
+				 * on the frontend's header name.
+				 */
+				chunk_printf(&trash, "%d.%d.%d.%d", pn[0], pn[1], pn[2], pn[3]);
+				if (unlikely(!http_add_header(htx, hdr, ist2(trash.area, trash.data))))
+					return 0;
+			}
+		}
+		else if (src && src->ss_family == AF_INET6) {
+			/* Add an X-Forwarded-For header unless the source IP is
+			 * in the 'except' network range.
+			 */
+			if ((!f_xff || ipcmp2net(src, &f_xff->except_net)) &&
+			    (!b_xff || ipcmp2net(src, &b_xff->except_net))) {
+				char pn[INET6_ADDRSTRLEN];
+
+				inet_ntop(AF_INET6,
+					  (const void *)&((struct sockaddr_in6 *)(src))->sin6_addr,
+					  pn, sizeof(pn));
+
+				/* Note: we rely on the backend to get the header name to be used for
+				 * x-forwarded-for, because the header is really meant for the backends.
+				 * However, if the backend did not specify any option, we have to rely
+				 * on the frontend's header name.
+				 */
+				chunk_printf(&trash, "%s", pn);
+				if (unlikely(!http_add_header(htx, hdr, ist2(trash.area, trash.data))))
+					return 0;
+			}
+		}
+	}
+	return 1;
+}
+
+/*
+ * add X-Original-To if either the frontend or the backend
+ * asks for it.
+ * Returns 1 for success and 0 for failure
+ */
+int http_handle_xot_header(struct stream *s, struct channel *req)
+{
+	struct session *sess = s->sess;
+	struct http_ext_xot *f_xot = NULL;
+	struct http_ext_xot *b_xot = NULL;
+
+	if (sess->fe->http_ext && sess->fe->http_ext->xot) {
+		/* frontend */
+		f_xot = sess->fe->http_ext->xot;
+	}
+	if (s->be->http_ext && s->be->http_ext->xot) {
+		/* backend */
+		b_xot = s->be->http_ext->xot;
+	}
+
+	if (f_xot || b_xot) {
+		struct htx *htx = htxbuf(&req->buf);
+		const struct sockaddr_storage *dst = sc_dst(s->scf);
+		struct ist hdr = ((b_xot) ? b_xot->hdr_name : f_xot->hdr_name);
+
+		if (dst && dst->ss_family == AF_INET) {
+			/* Add an X-Original-To header unless the destination IP is
+			 * in the 'except' network range.
+			 */
+			if ((!f_xot || ipcmp2net(dst, &f_xot->except_net)) &&
+			    (!b_xot || ipcmp2net(dst, &b_xot->except_net))) {
+				unsigned char *pn = (unsigned char *)&((struct sockaddr_in *)dst)->sin_addr;
+
+				/* Note: we rely on the backend to get the header name to be used for
+				 * x-original-to, because the header is really meant for the backends.
+				 * However, if the backend did not specify any option, we have to rely
+				 * on the frontend's header name.
+				 */
+				chunk_printf(&trash, "%d.%d.%d.%d", pn[0], pn[1], pn[2], pn[3]);
+				if (unlikely(!http_add_header(htx, hdr, ist2(trash.area, trash.data))))
+					return 0;
+			}
+		}
+		else if (dst && dst->ss_family == AF_INET6) {
+			/* Add an X-Original-To header unless the destination IP is
+			 * in the 'except' network range.
+			 */
+			if ((!f_xot || ipcmp2net(dst, &f_xot->except_net)) &&
+			    (!b_xot || ipcmp2net(dst, &b_xot->except_net))) {
+				char pn[INET6_ADDRSTRLEN];
+
+				inet_ntop(AF_INET6,
+					  (const void *)&((struct sockaddr_in6 *)dst)->sin6_addr,
+					  pn, sizeof(pn));
+
+				/* Note: we rely on the backend to get the header name to be used for
+				 * x-original-to, because the header is really meant for the backends.
+				 * However, if the backend did not specify any option, we have to rely
+				 * on the frontend's header name.
+ */ + chunk_printf(&trash, "%s", pn); + if (unlikely(!http_add_header(htx, hdr, ist2(trash.area, trash.data)))) + return 0; + } + } + } + return 1; +} + +/* + * =========== CONFIG =========== + * below are helpers to parse http ext options from the config + */ +static int proxy_http_parse_oom(const char *file, int linenum) +{ + int err_code = 0; + + ha_alert("parsing [%s:%d]: out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + return err_code; +} + +static inline int _proxy_http_parse_7239_expr(char **args, int *cur_arg, + const char *file, int linenum, + char **expr_s) +{ + int err_code = 0; + + if (!*args[*cur_arg + 1]) { + ha_alert("parsing [%s:%d]: '%s' expects <expr> as argument.\n", + file, linenum, args[*cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + *cur_arg += 1; + ha_free(expr_s); + *expr_s = strdup(args[*cur_arg]); + if (!*expr_s) + return proxy_http_parse_oom(file, linenum); + *cur_arg += 1; + out: + return err_code; +} + +/* forwarded/7239 RFC: tries to parse "option forwarded" config keyword + * Returns a composition of ERR_ABORT, ERR_ALERT, ERR_FATAL, ERR_WARN + */ +int proxy_http_parse_7239(char **args, int cur_arg, + struct proxy *curproxy, const struct proxy *defpx, + const char *file, int linenum) +{ + struct http_ext_7239 *fwd; + int err_code = 0; + + if (warnifnotcap(curproxy, PR_CAP_BE, file, linenum, "option forwarded", NULL)) { + /* option is ignored for frontends */ + err_code |= ERR_WARN; + goto out; + } + + if (!http_ext_7239_prepare(curproxy)) + return proxy_http_parse_oom(file, linenum); + + fwd = curproxy->http_ext->fwd; + + fwd->p_proto = 0; + fwd->p_host.mode = 0; + fwd->p_for.nn_mode = 0; + fwd->p_for.np_mode = 0; + fwd->p_by.nn_mode = 0; + fwd->p_by.np_mode = 0; + ha_free(&fwd->c_file); + fwd->c_file = strdup(file); + fwd->c_line = linenum; + + /* start at 2, since 0+1 = "option" "forwarded" */ + cur_arg = 2; + if (!*(args[cur_arg])) { + /* no optional argument provided, use default settings */ + fwd->p_for.nn_mode = HTTP_7239_FORBY_ORIG; /* enable for and mimic xff */ + fwd->p_proto = 1; /* enable proto */ + goto out; + } + /* loop to go through optional arguments */ + while (*(args[cur_arg])) { + if (strcmp(args[cur_arg], "proto") == 0) { + fwd->p_proto = 1; + cur_arg += 1; + } else if (strcmp(args[cur_arg], "host") == 0) { + fwd->p_host.mode = HTTP_7239_HOST_ORIG; + cur_arg += 1; + } else if (strcmp(args[cur_arg], "host-expr") == 0) { + fwd->p_host.mode = HTTP_7239_HOST_SMP; + err_code |= _proxy_http_parse_7239_expr(args, &cur_arg, file, linenum, + &fwd->p_host.expr_s); + if (err_code & ERR_CODE) + goto out; + } else if (strcmp(args[cur_arg], "by") == 0) { + fwd->p_by.nn_mode = HTTP_7239_FORBY_ORIG; + cur_arg += 1; + } else if (strcmp(args[cur_arg], "by-expr") == 0) { + fwd->p_by.nn_mode = HTTP_7239_FORBY_SMP; + err_code |= _proxy_http_parse_7239_expr(args, &cur_arg, file, linenum, + &fwd->p_by.nn_expr_s); + if (err_code & ERR_CODE) + goto out; + } else if (strcmp(args[cur_arg], "for") == 0) { + fwd->p_for.nn_mode = HTTP_7239_FORBY_ORIG; + cur_arg += 1; + } else if (strcmp(args[cur_arg], "for-expr") == 0) { + fwd->p_for.nn_mode = HTTP_7239_FORBY_SMP; + err_code |= _proxy_http_parse_7239_expr(args, &cur_arg, file, linenum, + &fwd->p_for.nn_expr_s); + if (err_code & ERR_CODE) + goto out; + } else if (strcmp(args[cur_arg], "by_port") == 0) { + fwd->p_by.np_mode = HTTP_7239_FORBY_ORIG; + cur_arg += 1; + } else if (strcmp(args[cur_arg], "by_port-expr") == 0) { + fwd->p_by.np_mode = HTTP_7239_FORBY_SMP; + err_code |= 
_proxy_http_parse_7239_expr(args, &cur_arg, file, linenum,
+								    &fwd->p_by.np_expr_s);
+			if (err_code & ERR_CODE)
+				goto out;
+		} else if (strcmp(args[cur_arg], "for_port") == 0) {
+			fwd->p_for.np_mode = HTTP_7239_FORBY_ORIG;
+			cur_arg += 1;
+		} else if (strcmp(args[cur_arg], "for_port-expr") == 0) {
+			fwd->p_for.np_mode = HTTP_7239_FORBY_SMP;
+			err_code |= _proxy_http_parse_7239_expr(args, &cur_arg, file, linenum,
+								&fwd->p_for.np_expr_s);
+			if (err_code & ERR_CODE)
+				goto out;
+		} else {
+			/* unknown suboption - catchall */
+			ha_alert("parsing [%s:%d] : '%s %s' only supports optional values: 'proto', 'host', "
+				 "'host-expr', 'by', 'by-expr', 'by_port', 'by_port-expr', "
+				 "'for', 'for-expr', 'for_port' and 'for_port-expr'.\n",
+				 file, linenum, args[0], args[1]);
+			err_code |= ERR_ALERT | ERR_FATAL;
+			goto out;
+		}
+	} /* end while loop */
+
+	/* consistency check: a nodeport is meaningless without a nodename.
+	 * The warning must be emitted before np_mode is cleared, else the
+	 * reported keyword would always be the '-expr' variant.
+	 */
+	if (fwd->p_by.np_mode &&
+	    !fwd->p_by.nn_mode) {
+		ha_warning("parsing [%s:%d] : '%s %s' : '%s' will be ignored because neither 'by' "
+			   "nor 'by-expr' is set\n",
+			   file, linenum, args[0], args[1],
+			   ((fwd->p_by.np_mode == HTTP_7239_FORBY_ORIG) ? "by_port" : "by_port-expr"));
+		fwd->p_by.np_mode = 0;
+		ha_free(&fwd->p_by.np_expr_s);
+		err_code |= ERR_WARN;
+	}
+	if (fwd->p_for.np_mode &&
+	    !fwd->p_for.nn_mode) {
+		ha_warning("parsing [%s:%d] : '%s %s' : '%s' will be ignored because neither 'for' "
+			   "nor 'for-expr' is set\n",
+			   file, linenum, args[0], args[1],
+			   ((fwd->p_for.np_mode == HTTP_7239_FORBY_ORIG) ? "for_port" : "for_port-expr"));
+		fwd->p_for.np_mode = 0;
+		ha_free(&fwd->p_for.np_expr_s);
+		err_code |= ERR_WARN;
+	}
+
+ out:
+	return err_code;
+}
+
+/* The rfc7239 forwarded option needs a post-parsing step to convert
+ * parsing hints into runtime-usable sample expressions.
+ * Returns a composition of ERR_NONE, ERR_FATAL, ERR_ALERT, ERR_WARN
+ */
+int proxy_http_compile_7239(struct proxy *curproxy)
+{
+	struct http_ext_7239 *fwd;
+	int err = ERR_NONE;
+	int loop;
+
+	if (!(curproxy->cap & PR_CAP_BE)) {
+		/* no backend cap: not supported (ie: frontend) */
+		goto out;
+	}
+
+	/* should not happen (test should be performed after BE cap test) */
+	BUG_ON(!curproxy->http_ext || !curproxy->http_ext->fwd);
+
+	curproxy->conf.args.ctx = ARGC_OPT; /* option */
+	curproxy->conf.args.file = curproxy->http_ext->fwd->c_file;
+	curproxy->conf.args.line = curproxy->http_ext->fwd->c_line;
+	fwd = curproxy->http_ext->fwd;
+
+	/* it is important that we keep iterating on error to make sure
+	 * all fwd config fields are in the same state (post-parsing state)
+	 */
+	for (loop = 0; loop < 5; loop++) {
+		char **expr_str = NULL;
+		struct sample_expr **expr = NULL;
+		struct sample_expr *cur_expr;
+		char *err_str = NULL;
+		int smp = 0;
+		int idx = 0;
+
+		switch (loop) {
+			case 0:
+				/* host */
+				expr_str = &fwd->p_host.expr_s;
+				expr = &fwd->p_host.expr;
+				smp = (fwd->p_host.mode == HTTP_7239_HOST_SMP);
+				break;
+			case 1:
+				/* by->node */
+				expr_str = &fwd->p_by.nn_expr_s;
+				expr = &fwd->p_by.nn_expr;
+				smp = (fwd->p_by.nn_mode == HTTP_7239_FORBY_SMP);
+				break;
+			case 2:
+				/* by->nodeport */
+				expr_str = &fwd->p_by.np_expr_s;
+				expr = &fwd->p_by.np_expr;
+				smp = (fwd->p_by.np_mode == HTTP_7239_FORBY_SMP);
+				break;
+			case 3:
+				/* for->node */
+				expr_str = &fwd->p_for.nn_expr_s;
+				expr = &fwd->p_for.nn_expr;
+				smp = (fwd->p_for.nn_mode == HTTP_7239_FORBY_SMP);
+				break;
+			case 4:
+				/* for->nodeport */
+				expr_str = &fwd->p_for.np_expr_s;
+				expr = &fwd->p_for.np_expr;
+				smp = (fwd->p_for.np_mode == HTTP_7239_FORBY_SMP);
+				break;
+		}
+		if (!smp)
+			
continue; /* no expr */ + + /* expr and expr_str cannot be NULL past this point */ + BUG_ON(!expr || !expr_str); + + if (!*expr_str) { + /* should not happen unless system memory exhaustion */ + ha_alert("%s '%s' [%s:%d]: failed to parse 'option forwarded' expression : %s.\n", + proxy_type_str(curproxy), curproxy->id, + fwd->c_file, fwd->c_line, + "memory error"); + err |= ERR_ALERT | ERR_FATAL; + continue; + } + + cur_expr = + sample_parse_expr((char*[]){*expr_str, NULL}, &idx, + fwd->c_file, + fwd->c_line, + &err_str, &curproxy->conf.args, NULL); + + if (!cur_expr) { + ha_alert("%s '%s' [%s:%d]: failed to parse 'option forwarded' expression '%s' in : %s.\n", + proxy_type_str(curproxy), curproxy->id, + fwd->c_file, fwd->c_line, + *expr_str, err_str); + ha_free(&err_str); + err |= ERR_ALERT | ERR_FATAL; + } + else if (!(cur_expr->fetch->val & SMP_VAL_BE_HRQ_HDR)) { + /* fetch not available in this context: sample expr is resolved + * within backend right after headers are processed. + * (in http_process_request()) + * -> we simply warn the user about the misuse + */ + ha_warning("%s '%s' [%s:%d]: in 'option forwarded' sample expression '%s' : " + "some args extract information from '%s', " + "none of which is available here.\n", + proxy_type_str(curproxy), curproxy->id, + fwd->c_file, fwd->c_line, + *expr_str, sample_ckp_names(cur_expr->fetch->use)); + err |= ERR_WARN; + } + /* post parsing individual expr cleanup */ + ha_free(expr_str); + + /* expr assignment */ + *expr = cur_expr; + } + curproxy->conf.args.file = NULL; + curproxy->conf.args.line = 0; + + /* post parsing general cleanup */ + ha_free(&fwd->c_file); + fwd->c_line = 0; + + fwd->c_mode = 1; /* parsing completed */ + + out: + return err; +} + +/* x-forwarded-for: tries to parse "option forwardfor" config keyword + * Returns a composition of ERR_NONE, ERR_FATAL, ERR_ALERT + */ +int proxy_http_parse_xff(char **args, int cur_arg, + struct proxy *curproxy, const struct proxy *defpx, + const char *file, int linenum) +{ + struct http_ext_xff *xff; + int err_code = 0; + + if (!http_ext_xff_prepare(curproxy)) + return proxy_http_parse_oom(file, linenum); + + xff = curproxy->http_ext->xff; + + /* insert x-forwarded-for field, but not for the IP address listed as an except. 
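+	 * For instance (illustrative config line, not taken from this source):
+	 *   option forwardfor except 127.0.0.0/8 header X-Client-IP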
+ * set default options (ie: bitfield, header name, etc) + */ + + xff->mode = HTTP_XFF_ALWAYS; + + istfree(&xff->hdr_name); + xff->hdr_name = istdup(ist(DEF_XFORWARDFOR_HDR)); + if (!isttest(xff->hdr_name)) + return proxy_http_parse_oom(file, linenum); + xff->except_net.family = AF_UNSPEC; + + /* loop to go through arguments - start at 2, since 0+1 = "option" "forwardfor" */ + cur_arg = 2; + while (*(args[cur_arg])) { + if (strcmp(args[cur_arg], "except") == 0) { + unsigned char mask; + int i; + + /* suboption except - needs additional argument for it */ + if (*(args[cur_arg+1]) && + str2net(args[cur_arg+1], 1, &xff->except_net.addr.v4.ip, &xff->except_net.addr.v4.mask)) { + xff->except_net.family = AF_INET; + xff->except_net.addr.v4.ip.s_addr &= xff->except_net.addr.v4.mask.s_addr; + } + else if (*(args[cur_arg+1]) && + str62net(args[cur_arg+1], &xff->except_net.addr.v6.ip, &mask)) { + xff->except_net.family = AF_INET6; + len2mask6(mask, &xff->except_net.addr.v6.mask); + for (i = 0; i < 16; i++) + xff->except_net.addr.v6.ip.s6_addr[i] &= xff->except_net.addr.v6.mask.s6_addr[i]; + } + else { + ha_alert("parsing [%s:%d] : '%s %s %s' expects <address>[/mask] as argument.\n", + file, linenum, args[0], args[1], args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + /* flush useless bits */ + cur_arg += 2; + } else if (strcmp(args[cur_arg], "header") == 0) { + /* suboption header - needs additional argument for it */ + if (*(args[cur_arg+1]) == 0) { + ha_alert("parsing [%s:%d] : '%s %s %s' expects <header_name> as argument.\n", + file, linenum, args[0], args[1], args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + istfree(&xff->hdr_name); + xff->hdr_name = istdup(ist(args[cur_arg+1])); + if (!isttest(xff->hdr_name)) + return proxy_http_parse_oom(file, linenum); + cur_arg += 2; + } else if (strcmp(args[cur_arg], "if-none") == 0) { + xff->mode = HTTP_XFF_IFNONE; + cur_arg += 1; + } else { + /* unknown suboption - catchall */ + ha_alert("parsing [%s:%d] : '%s %s' only supports optional values: 'except', 'header' and 'if-none'.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } /* end while loop */ + out: + return err_code; +} + +/* x-original-to: tries to parse "option originalto" config keyword + * Returns a composition of ERR_NONE, ERR_FATAL, ERR_ALERT + */ +int proxy_http_parse_xot(char **args, int cur_arg, + struct proxy *curproxy, const struct proxy *defpx, + const char *file, int linenum) +{ + struct http_ext_xot *xot; + int err_code = 0; + + if (!http_ext_xot_prepare(curproxy)) + return proxy_http_parse_oom(file, linenum); + + xot = curproxy->http_ext->xot; + + /* insert x-original-to field, but not for the IP address listed as an except. 
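+	 * For instance (illustrative config line, not taken from this source):
+	 *   option originalto except 127.0.0.0/8 header X-Server-IP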
+ * set default options (ie: bitfield, header name, etc) + */ + + istfree(&xot->hdr_name); + xot->hdr_name = istdup(ist(DEF_XORIGINALTO_HDR)); + if (!isttest(xot->hdr_name)) + return proxy_http_parse_oom(file, linenum); + xot->except_net.family = AF_UNSPEC; + + /* loop to go through arguments - start at 2, since 0+1 = "option" "originalto" */ + cur_arg = 2; + while (*(args[cur_arg])) { + if (strcmp(args[cur_arg], "except") == 0) { + unsigned char mask; + int i; + + /* suboption except - needs additional argument for it */ + if (*(args[cur_arg+1]) && + str2net(args[cur_arg+1], 1, &xot->except_net.addr.v4.ip, &xot->except_net.addr.v4.mask)) { + xot->except_net.family = AF_INET; + xot->except_net.addr.v4.ip.s_addr &= xot->except_net.addr.v4.mask.s_addr; + } + else if (*(args[cur_arg+1]) && + str62net(args[cur_arg+1], &xot->except_net.addr.v6.ip, &mask)) { + xot->except_net.family = AF_INET6; + len2mask6(mask, &xot->except_net.addr.v6.mask); + for (i = 0; i < 16; i++) + xot->except_net.addr.v6.ip.s6_addr[i] &= xot->except_net.addr.v6.mask.s6_addr[i]; + } + else { + ha_alert("parsing [%s:%d] : '%s %s %s' expects <address>[/mask] as argument.\n", + file, linenum, args[0], args[1], args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + cur_arg += 2; + } else if (strcmp(args[cur_arg], "header") == 0) { + /* suboption header - needs additional argument for it */ + if (*(args[cur_arg+1]) == 0) { + ha_alert("parsing [%s:%d] : '%s %s %s' expects <header_name> as argument.\n", + file, linenum, args[0], args[1], args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + istfree(&xot->hdr_name); + xot->hdr_name = istdup(ist(args[cur_arg+1])); + if (!isttest(xot->hdr_name)) + return proxy_http_parse_oom(file, linenum); + cur_arg += 2; + } else { + /* unknown suboption - catchall */ + ha_alert("parsing [%s:%d] : '%s %s' only supports optional values: 'except' and 'header'.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } /* end while loop */ + + out: + return err_code; +} + +/* + * =========== MGMT =========== + * below are helpers to manage http ext options + */ + +/* Ensure http_ext->fwd is properly allocated and + * initialized for <curproxy>. + * The function will leverage http_ext_prepare() to make + * sure http_ext is properly allocated and initialized as well. + * Returns 1 for success and 0 for failure (memory error) + */ +int http_ext_7239_prepare(struct proxy *curproxy) +{ + struct http_ext_7239 *fwd; + + if (!http_ext_prepare(curproxy)) + return 0; + if (curproxy->http_ext->fwd) + return 1; /* nothing to do */ + + fwd = malloc(sizeof(*fwd)); + if (!fwd) + return 0; + /* initialize fwd mandatory fields */ + fwd->c_mode = 0; /* pre-compile (parse) time */ + fwd->c_file = NULL; + fwd->p_host.expr_s = NULL; + fwd->p_by.nn_expr_s = NULL; + fwd->p_by.np_expr_s = NULL; + fwd->p_for.nn_expr_s = NULL; + fwd->p_for.np_expr_s = NULL; + /* assign */ + curproxy->http_ext->fwd = fwd; + return 1; +} + +/* Ensure http_ext->xff is properly allocated and + * initialized for <curproxy>. + * The function will leverage http_ext_prepare() to make + * sure http_ext is properly allocated and initialized as well. 
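+ * Allocation only happens on the first call for a given proxy; later
+ * calls simply return success.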
+ * Returns 1 for success and 0 for failure (memory error) + */ +int http_ext_xff_prepare(struct proxy *curproxy) +{ + struct http_ext_xff *xff; + + if (!http_ext_prepare(curproxy)) + return 0; + if (curproxy->http_ext->xff) + return 1; /* nothing to do */ + + xff = malloc(sizeof(*xff)); + if (!xff) + return 0; + /* initialize xff mandatory fields */ + xff->hdr_name = IST_NULL; + /* assign */ + curproxy->http_ext->xff = xff; + return 1; +} + +/* Ensure http_ext->xot is properly allocated and + * initialized for <curproxy>. + * The function will leverage http_ext_prepare() to make + * sure http_ext is properly allocated and initialized as well. + * Returns 1 for success and 0 for failure (memory error) + */ +int http_ext_xot_prepare(struct proxy *curproxy) +{ + struct http_ext_xot *xot; + + if (!http_ext_prepare(curproxy)) + return 0; + if (curproxy->http_ext->xot) + return 1; /* nothing to do */ + + xot = malloc(sizeof(*xot)); + if (!xot) + return 0; + /* initialize xot mandatory fields */ + xot->hdr_name = IST_NULL; + /* assign */ + curproxy->http_ext->xot = xot; + return 1; +} + +/* deep clean http_ext->fwd parameter for <curproxy> + * http_ext->fwd will be freed + * clean behavior will differ depending on http_ext->fwd + * state. If fwd is in 'parsed' state, parsing hints will be + * cleaned. Else, it means fwd is in 'compiled' state, in this + * case we're cleaning compiled results. + * This is because parse and compile memory areas are shared in + * a single union to optimize struct http_ext_7239 size. + */ +void http_ext_7239_clean(struct proxy *curproxy) +{ + struct http_ext_7239 *clean; + + if (!curproxy->http_ext) + return; + clean = curproxy->http_ext->fwd; + if (!clean) + return; /* nothing to do */ + if (!clean->c_mode) { + /* parsed */ + ha_free(&clean->c_file); + ha_free(&clean->p_host.expr_s); + ha_free(&clean->p_by.nn_expr_s); + ha_free(&clean->p_by.np_expr_s); + ha_free(&clean->p_for.nn_expr_s); + ha_free(&clean->p_for.np_expr_s); + } + else { + /* compiled */ + release_sample_expr(clean->p_host.expr); + clean->p_host.expr = NULL; + release_sample_expr(clean->p_by.nn_expr); + clean->p_by.nn_expr = NULL; + release_sample_expr(clean->p_by.np_expr); + clean->p_by.np_expr = NULL; + release_sample_expr(clean->p_for.nn_expr); + clean->p_for.nn_expr = NULL; + release_sample_expr(clean->p_for.np_expr); + clean->p_for.np_expr = NULL; + } + /* free fwd */ + ha_free(&curproxy->http_ext->fwd); +} + +/* deep clean http_ext->xff parameter for <curproxy> + * http_ext->xff will be freed + */ +void http_ext_xff_clean(struct proxy *curproxy) +{ + struct http_ext_xff *clean; + + if (!curproxy->http_ext) + return; + clean = curproxy->http_ext->xff; + if (!clean) + return; /* nothing to do */ + istfree(&clean->hdr_name); + /* free xff */ + ha_free(&curproxy->http_ext->xff); +} + +/* deep clean http_ext->xot parameter for <curproxy> + * http_ext->xot will be freed + */ +void http_ext_xot_clean(struct proxy *curproxy) +{ + struct http_ext_xot *clean; + + if (!curproxy->http_ext) + return; + clean = curproxy->http_ext->xot; + if (!clean) + return; /* nothing to do */ + istfree(&clean->hdr_name); + /* free xot */ + ha_free(&curproxy->http_ext->xot); +} + +/* duplicate http_ext->fwd parameters from <def> to <cpy> + * performs the required memory allocation and initialization + */ +void http_ext_7239_dup(const struct proxy *def, struct proxy *cpy) +{ + struct http_ext_7239 *dest = NULL; + struct http_ext_7239 *orig = NULL; + + /* feature requires backend cap */ + if (!(cpy->cap & PR_CAP_BE)) + 
return; + + if (def->http_ext == NULL || def->http_ext->fwd == NULL) + return; + + orig = def->http_ext->fwd; + + if (orig->c_mode) + return; /* copy not supported once compiled */ + + if (!http_ext_7239_prepare(cpy)) + return; + + dest = cpy->http_ext->fwd; + + if (orig->c_file) + dest->c_file = strdup(orig->c_file); + dest->c_line = orig->c_line; + /* proto */ + dest->p_proto = orig->p_proto; + /* host */ + dest->p_host.mode = orig->p_host.mode; + if (orig->p_host.expr_s) + dest->p_host.expr_s = strdup(orig->p_host.expr_s); + /* by - nodename */ + dest->p_by.nn_mode = orig->p_by.nn_mode; + if (orig->p_by.nn_expr_s) + dest->p_by.nn_expr_s = strdup(orig->p_by.nn_expr_s); + /* by - nodeport */ + dest->p_by.np_mode = orig->p_by.np_mode; + if (orig->p_by.np_expr_s) + dest->p_by.np_expr_s = strdup(orig->p_by.np_expr_s); + /* for - nodename */ + dest->p_for.nn_mode = orig->p_for.nn_mode; + if (orig->p_for.nn_expr_s) + dest->p_for.nn_expr_s = strdup(orig->p_for.nn_expr_s); + /* for - nodeport */ + dest->p_for.np_mode = orig->p_for.np_mode; + if (orig->p_for.np_expr_s) + dest->p_for.np_expr_s = strdup(orig->p_for.np_expr_s); +} + +/* duplicate http_ext->xff parameters from <def> to <cpy> + * performs the required memory allocation and initialization + */ +void http_ext_xff_dup(const struct proxy *def, struct proxy *cpy) +{ + struct http_ext_xff *dest = NULL; + struct http_ext_xff *orig = NULL; + + if (def->http_ext == NULL || def->http_ext->xff == NULL || + !http_ext_xff_prepare(cpy)) + return; + + orig = def->http_ext->xff; + dest = cpy->http_ext->xff; + + if (isttest(orig->hdr_name)) + dest->hdr_name = istdup(orig->hdr_name); + dest->mode = orig->mode; + dest->except_net = orig->except_net; +} + +/* duplicate http_ext->xot parameters from <def> to <cpy> + * performs the required memory allocation and initialization + */ +void http_ext_xot_dup(const struct proxy *def, struct proxy *cpy) +{ + struct http_ext_xot *dest = NULL; + struct http_ext_xot *orig = NULL; + + if (def->http_ext == NULL || def->http_ext->xot == NULL || + !http_ext_xot_prepare(cpy)) + return; + + orig = def->http_ext->xot; + dest = cpy->http_ext->xot; + + if (isttest(orig->hdr_name)) + dest->hdr_name = istdup(orig->hdr_name); + dest->except_net = orig->except_net; +} + +/* Allocate new http_ext and initialize it + * if needed + * Returns 1 for success and 0 for failure + */ +int http_ext_prepare(struct proxy *curproxy) +{ + if (curproxy->http_ext) + return 1; /* nothing to do */ + + curproxy->http_ext = malloc(sizeof(*curproxy->http_ext)); + if (!curproxy->http_ext) + return 0; /* failure */ + /* first init, set supported ext to NULL */ + curproxy->http_ext->fwd = NULL; + curproxy->http_ext->xff = NULL; + curproxy->http_ext->xot = NULL; + return 1; +} + +/* duplicate existing http_ext from <defproxy> to <curproxy> + */ +void http_ext_dup(const struct proxy *defproxy, struct proxy *curproxy) +{ + /* copy defproxy.http_ext members */ + http_ext_7239_dup(defproxy, curproxy); + http_ext_xff_dup(defproxy, curproxy); + http_ext_xot_dup(defproxy, curproxy); +} + +/* deep clean http_ext for <curproxy> (if previously allocated) + */ +void http_ext_clean(struct proxy *curproxy) +{ + if (!curproxy->http_ext) + return; /* nothing to do */ + /* first, free supported ext */ + http_ext_7239_clean(curproxy); + http_ext_xff_clean(curproxy); + http_ext_xot_clean(curproxy); + + /* then, free http_ext */ + ha_free(&curproxy->http_ext); +} + +/* soft clean (only clean http_ext if no more options are used) */ +void http_ext_softclean(struct proxy 
*curproxy) +{ + if (!curproxy->http_ext) + return; /* nothing to do */ + if (!curproxy->http_ext->fwd && + !curproxy->http_ext->xff && + !curproxy->http_ext->xot) { + /* no more use for http_ext, all options are disabled */ + http_ext_clean(curproxy); + } +} + +/* Perform some consistency checks on px.http_ext after parsing + * is completed. + * We make sure to perform a softclean in case some options were + * to be disabled in this check. This way we can release some memory. + * Returns a composition of ERR_NONE, ERR_ALERT, ERR_FATAL, ERR_WARN + */ +static int check_http_ext_postconf(struct proxy *px) { + int err = ERR_NONE; + + if (px->http_ext) { + /* consistency check for http_ext */ + if (px->mode != PR_MODE_HTTP && !(px->options & PR_O_HTTP_UPG)) { + /* http is disabled on px, yet it is required by http_ext */ + if (px->http_ext->fwd) { + ha_warning("'option %s' ignored for %s '%s' as it requires HTTP mode.\n", + "forwarded", proxy_type_str(px), px->id); + err |= ERR_WARN; + http_ext_7239_clean(px); + } + if (px->http_ext->xff) { + ha_warning("'option %s' ignored for %s '%s' as it requires HTTP mode.\n", + "forwardfor", proxy_type_str(px), px->id); + err |= ERR_WARN; + http_ext_xff_clean(px); + } + if (px->http_ext->xot) { + ha_warning("'option %s' ignored for %s '%s' as it requires HTTP mode.\n", + "originalto", proxy_type_str(px), px->id); + err |= ERR_WARN; + http_ext_xot_clean(px); + } + } else if (px->http_ext->fwd) { + /* option "forwarded" may need to compile its expressions */ + err |= proxy_http_compile_7239(px); + } + /* http_ext post init early cleanup */ + http_ext_softclean(px); + + } + return err; +} + +REGISTER_POST_PROXY_CHECK(check_http_ext_postconf); +/* + * =========== CONV =========== + * related converters + */ + +/* input: string representing 7239 forwarded header single value + * does not take arguments + * output: 1 if header is RFC compliant, 0 otherwise + */ +static int sample_conv_7239_valid(const struct arg *args, struct sample *smp, void *private) +{ + struct ist input = ist2(smp->data.u.str.area, smp->data.u.str.data); + + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = !!http_validate_7239_header(input, FORWARDED_HEADER_ALL, NULL); + return 1; +} + +/* input: string representing 7239 forwarded header single value + * argument: parameter name to look for in the header + * output: header parameter raw value, as a string + */ +static int sample_conv_7239_field(const struct arg *args, struct sample *smp, void *private) +{ + struct ist input = ist2(smp->data.u.str.area, smp->data.u.str.data); + struct buffer *output; + struct forwarded_header_ctx ctx; + int validate; + int field = 0; + + if (strcmp(args->data.str.area, "proto") == 0) + field = FORWARDED_HEADER_PROTO; + else if (strcmp(args->data.str.area, "host") == 0) + field = FORWARDED_HEADER_HOST; + else if (strcmp(args->data.str.area, "for") == 0) + field = FORWARDED_HEADER_FOR; + else if (strcmp(args->data.str.area, "by") == 0) + field = FORWARDED_HEADER_BY; + + validate = http_validate_7239_header(input, FORWARDED_HEADER_ALL, &ctx); + if (!(validate & field)) + return 0; /* invalid header or header does not contain field */ + output = get_trash_chunk(); + switch (field) { + case FORWARDED_HEADER_PROTO: + if (ctx.proto == FORWARDED_HEADER_HTTP) + chunk_appendf(output, "http"); + else if (ctx.proto == FORWARDED_HEADER_HTTPS) + chunk_appendf(output, "https"); + break; + case FORWARDED_HEADER_HOST: + chunk_istcat(output, ctx.host); + break; + case FORWARDED_HEADER_FOR: + chunk_istcat(output, 
ctx.nfor.raw); + break; + case FORWARDED_HEADER_BY: + chunk_istcat(output, ctx.nby.raw); + break; + default: + break; + } + smp->flags &= ~SMP_F_CONST; + smp->data.type = SMP_T_STR; + smp->data.u.str = *output; + return 1; +} + +/* input: substring representing 7239 forwarded header node + * output: forwarded header nodename translated to either + * ipv4 address, ipv6 address or str + * ('_' prefix if obfuscated, or "unknown" if unknown) + */ +static int sample_conv_7239_n2nn(const struct arg *args, struct sample *smp, void *private) +{ + struct ist input = ist2(smp->data.u.str.area, smp->data.u.str.data); + struct forwarded_header_node ctx; + struct buffer *output; + + if (http_7239_extract_node(&input, &ctx, 1) == 0) + return 0; /* could not extract node */ + switch (ctx.nodename.type) { + case FORWARDED_HEADER_UNK: + output = get_trash_chunk(); + chunk_appendf(output, "unknown"); + smp->flags &= ~SMP_F_CONST; + smp->data.type = SMP_T_STR; + smp->data.u.str = *output; + break; + case FORWARDED_HEADER_OBFS: + output = get_trash_chunk(); + chunk_appendf(output, "_"); /* append obfs prefix */ + chunk_istcat(output, ctx.nodename.obfs); + smp->flags &= ~SMP_F_CONST; + smp->data.type = SMP_T_STR; + smp->data.u.str = *output; + break; + case FORWARDED_HEADER_IP: + if (ctx.nodename.ip.ss_family == AF_INET) { + smp->data.type = SMP_T_IPV4; + smp->data.u.ipv4 = ((struct sockaddr_in *)&ctx.nodename.ip)->sin_addr; + } + else if (ctx.nodename.ip.ss_family == AF_INET6) { + smp->data.type = SMP_T_IPV6; + smp->data.u.ipv6 = ((struct sockaddr_in6 *)&ctx.nodename.ip)->sin6_addr; + } + else + return 0; /* unsupported */ + break; + default: + return 0; /* unsupported */ + } + return 1; +} + +/* input: substring representing 7239 forwarded header node + * output: forwarded header nodeport translated to either + * integer or str for obfuscated ('_' prefix) + */ +static int sample_conv_7239_n2np(const struct arg *args, struct sample *smp, void *private) +{ + struct ist input = ist2(smp->data.u.str.area, smp->data.u.str.data); + struct forwarded_header_node ctx; + struct buffer *output; + + if (http_7239_extract_node(&input, &ctx, 1) == 0) + return 0; /* could not extract node */ + + switch (ctx.nodeport.type) { + case FORWARDED_HEADER_UNK: + return 0; /* not provided */ + case FORWARDED_HEADER_OBFS: + output = get_trash_chunk(); + chunk_appendf(output, "_"); /* append obfs prefix */ + chunk_istcat(output, ctx.nodeport.obfs); + smp->flags &= ~SMP_F_CONST; + smp->data.type = SMP_T_STR; + smp->data.u.str = *output; + break; + case FORWARDED_HEADER_PORT: + smp->data.type = SMP_T_SINT; + smp->data.u.sint = ctx.nodeport.port; + break; + default: + return 0; /* unsupported */ + } + + return 1; +} + +/* Note: must not be declared <const> as its list will be overwritten */ +static struct sample_conv_kw_list sample_conv_kws = {ILH, { + { "rfc7239_is_valid", sample_conv_7239_valid, 0, NULL, SMP_T_STR, SMP_T_BOOL}, + { "rfc7239_field", sample_conv_7239_field, ARG1(1,STR), NULL, SMP_T_STR, SMP_T_STR}, + { "rfc7239_n2nn", sample_conv_7239_n2nn, 0, NULL, SMP_T_STR, SMP_T_ANY}, + { "rfc7239_n2np", sample_conv_7239_n2np, 0, NULL, SMP_T_STR, SMP_T_ANY}, + { NULL, NULL, 0, 0, 0 }, +}}; + +INITCALL1(STG_REGISTER, sample_register_convs, &sample_conv_kws); diff --git a/src/http_fetch.c b/src/http_fetch.c new file mode 100644 index 0000000..1f3e4a0 --- /dev/null +++ b/src/http_fetch.c @@ -0,0 +1,2368 @@ +/* + * HTTP samples fetching + * + * Copyright 2000-2018 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can 
redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <sys/types.h> + +#include <ctype.h> +#include <string.h> +#include <time.h> + +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/auth.h> +#include <haproxy/base64.h> +#include <haproxy/channel.h> +#include <haproxy/chunk.h> +#include <haproxy/connection.h> +#include <haproxy/global.h> +#include <haproxy/h1.h> +#include <haproxy/h1_htx.h> +#include <haproxy/http.h> +#include <haproxy/http_ana.h> +#include <haproxy/http_fetch.h> +#include <haproxy/http_htx.h> +#include <haproxy/htx.h> +#include <haproxy/obj_type.h> +#include <haproxy/pool.h> +#include <haproxy/sample.h> +#include <haproxy/sc_strm.h> +#include <haproxy/stream.h> +#include <haproxy/tools.h> +#include <haproxy/version.h> + + +/* this struct is used between calls to smp_fetch_hdr() or smp_fetch_cookie() */ +static THREAD_LOCAL struct http_hdr_ctx static_http_hdr_ctx; +/* this is used to convert raw connection buffers to htx */ +static THREAD_LOCAL struct buffer static_raw_htx_chunk; +static THREAD_LOCAL char *static_raw_htx_buf; + +#define SMP_REQ_CHN(smp) (smp->strm ? &smp->strm->req : NULL) +#define SMP_RES_CHN(smp) (smp->strm ? &smp->strm->res : NULL) + +/* This function returns the static htx chunk, where raw connections get + * converted to HTX as needed for sampling. + */ +struct buffer *get_raw_htx_chunk(void) +{ + chunk_reset(&static_raw_htx_chunk); + return &static_raw_htx_chunk; +} + +static int alloc_raw_htx_chunk_per_thread() +{ + static_raw_htx_buf = malloc(global.tune.bufsize); + if (!static_raw_htx_buf) + return 0; + chunk_init(&static_raw_htx_chunk, static_raw_htx_buf, global.tune.bufsize); + return 1; +} + +static void free_raw_htx_chunk_per_thread() +{ + ha_free(&static_raw_htx_buf); +} + +REGISTER_PER_THREAD_ALLOC(alloc_raw_htx_chunk_per_thread); +REGISTER_PER_THREAD_FREE(free_raw_htx_chunk_per_thread); + +/* + * Returns the data from the Authorization header. The function may be called + * more than once, so data is stored in txn->auth_data. When no header is + * found or the auth method is unknown, auth_method is set to HTTP_AUTH_WRONG + * to avoid searching again for something we are unable to find anyway. + * However, if the result is valid, the cache is not reused because we would + * risk having the credentials overwritten by another stream in parallel. + * The caller is responsible for passing a sample with a valid stream/txn, + * and a valid htx.
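+ * As a purely illustrative example (hypothetical header, not taken from the + * original source): given "Authorization: Basic dXNlcjpwYXNz", the scheme + * "Basic" is isolated first, the remainder is base64-decoded into "user:pass" + * and split on the first colon, so txn->auth.user ends up as "user" and + * txn->auth.pass as "pass".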
+ */ + +static int get_http_auth(struct sample *smp, struct htx *htx) +{ + struct stream *s = smp->strm; + struct http_txn *txn = s->txn; + struct http_hdr_ctx ctx = { .blk = NULL }; + struct ist hdr; + struct buffer auth_method; + char *p; + int len; + +#ifdef DEBUG_AUTH + printf("Auth for stream %p: %d\n", s, txn->auth.method); +#endif + if (txn->auth.method == HTTP_AUTH_WRONG) + return 0; + + txn->auth.method = HTTP_AUTH_WRONG; + + if (txn->flags & TX_USE_PX_CONN) + hdr = ist("Proxy-Authorization"); + else + hdr = ist("Authorization"); + + ctx.blk = NULL; + if (!http_find_header(htx, hdr, &ctx, 0)) + return 0; + + p = memchr(ctx.value.ptr, ' ', ctx.value.len); + if (!p || p == ctx.value.ptr) /* if no space was found or if the space is the first character */ + return 0; + len = p - ctx.value.ptr; + + if (chunk_initlen(&auth_method, ctx.value.ptr, 0, len) != 1) + return 0; + + /* According to RFC7235, there could be multiple spaces between the + * scheme and its value, we must skip all of them. + */ + while (p < istend(ctx.value) && *p == ' ') + ++p; + + chunk_initlen(&txn->auth.method_data, p, 0, istend(ctx.value) - p); + + if (!strncasecmp("Basic", auth_method.area, auth_method.data)) { + struct buffer *http_auth = get_trash_chunk(); + + len = base64dec(txn->auth.method_data.area, + txn->auth.method_data.data, + http_auth->area, global.tune.bufsize - 1); + + if (len < 0) + return 0; + + + http_auth->area[len] = '\0'; + + p = strchr(http_auth->area, ':'); + + if (!p) + return 0; + + txn->auth.user = http_auth->area; + *p = '\0'; + txn->auth.pass = p+1; + + txn->auth.method = HTTP_AUTH_BASIC; + return 1; + } else if (!strncasecmp("Bearer", auth_method.area, auth_method.data)) { + txn->auth.method = HTTP_AUTH_BEARER; + return 1; + } + + return 0; +} + +/* This function ensures that the prerequisites for an L7 fetch are ready, + * which means that a request or response is ready. If some data is missing, + * a parsing attempt is made. This is useful in TCP-based ACLs which are able + * to extract data from L7. If <vol> is non-null during a prefetch, another + * test is made to ensure the required information is not gone. + * + * The function returns : + * NULL with SMP_F_MAY_CHANGE in the sample flags if some data is missing to + * decide whether or not an HTTP message is present ; + * NULL if the requested data cannot be fetched or if it is certain that + * we'll never have any HTTP message there; this includes null strm or chn. + * NULL if the sample's direction does not match the channel's (i.e. the + * function was asked to work on the wrong channel) + * The HTX message if ready + */ +struct htx *smp_prefetch_htx(struct sample *smp, struct channel *chn, struct check *check, int vol) +{ + struct stream *s = smp->strm; + struct http_txn *txn = NULL; + struct htx *htx = NULL; + struct http_msg *msg; + struct htx_sl *sl; + + if (chn && + (((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_REQ && (chn->flags & CF_ISRESP)) || + ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES && !(chn->flags & CF_ISRESP)))) + return 0; + + /* Note: it is possible that <s> is NULL when called before stream + * initialization (eg: tcp-request connection), so this function is the + * one responsible for guarding against this case for all HTTP users. + * + * In the health check context, the stream and the channel must be NULL + * and <check> must be set. In this case, only the input buffer, + * corresponding to the response, is considered. It is the caller + * responsibility to provide <check>. 
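+ * As a sketch of the expected calling convention (mirroring the fetch + * functions below), a stream-based caller typically does: + * + * struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + * if (!htx) + * return 0; + * + * while a health-check caller passes a NULL stream/channel and its <check> + * instead.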
+ */ + BUG_ON(check && (s || chn)); + if (!s || !chn) { + if (check) { + htx = htxbuf(&check->bi); + + /* Analyse not yet started */ + if (htx_is_empty(htx) || htx->first == -1) + return NULL; + + sl = http_get_stline(htx); + if (vol && !sl) { + /* The start-line was already forwarded, it is too late to fetch anything */ + return NULL; + } + goto end; + } + + return NULL; + } + + if (!s->txn && !http_create_txn(s)) + return NULL; + txn = s->txn; + msg = (!(chn->flags & CF_ISRESP) ? &txn->req : &txn->rsp); + + if (IS_HTX_STRM(s)) { + htx = htxbuf(&chn->buf); + + if (htx->flags & HTX_FL_PARSING_ERROR) + return NULL; + + if (msg->msg_state < HTTP_MSG_BODY) { + /* Analyse not yet started */ + if (htx_is_empty(htx) || htx->first == -1) { + /* Parsing is done by the mux, just wait */ + smp->flags |= SMP_F_MAY_CHANGE; + return NULL; + } + } + sl = http_get_stline(htx); + if (vol && !sl) { + /* The start-line was already forwarded, it is too late to fetch anything */ + return NULL; + } + } + else { /* RAW mode */ + struct buffer *buf; + struct h1m h1m; + struct http_hdr hdrs[global.tune.max_http_hdr]; + union h1_sl h1sl; + unsigned int flags = HTX_FL_NONE; + int ret; + + /* no HTTP fetch on the response in TCP mode */ + if (chn->flags & CF_ISRESP) + return NULL; + + /* Now we are working on the request only */ + buf = &chn->buf; + if (b_head(buf) + b_data(buf) > b_wrap(buf)) + b_slow_realign(buf, trash.area, 0); + + h1m_init_req(&h1m); + ret = h1_headers_to_hdr_list(b_head(buf), b_stop(buf), + hdrs, sizeof(hdrs)/sizeof(hdrs[0]), &h1m, &h1sl); + if (ret <= 0) { + /* Invalid or too big*/ + if (ret < 0 || channel_full(&s->req, global.tune.maxrewrite)) + return NULL; + + /* wait for a full request */ + smp->flags |= SMP_F_MAY_CHANGE; + return NULL; + } + + /* OK we just got a valid HTTP message. We have to convert it + * into an HTX message. + */ + if (unlikely(h1sl.rq.v.len == 0)) { + /* try to convert HTTP/0.9 requests to HTTP/1.0 */ + if (h1sl.rq.meth != HTTP_METH_GET || !h1sl.rq.u.len) + return NULL; + h1sl.rq.v = ist("HTTP/1.0"); + } + + /* Set HTX start-line flags */ + if (h1m.flags & H1_MF_VER_11) + flags |= HTX_SL_F_VER_11; + if (h1m.flags & H1_MF_XFER_ENC) + flags |= HTX_SL_F_XFER_ENC; + flags |= HTX_SL_F_XFER_LEN; + if (h1m.flags & H1_MF_CHNK) + flags |= HTX_SL_F_CHNK; + else if (h1m.flags & H1_MF_CLEN) + flags |= HTX_SL_F_CLEN; + + htx = htx_from_buf(get_raw_htx_chunk()); + sl = htx_add_stline(htx, HTX_BLK_REQ_SL, flags, h1sl.rq.m, h1sl.rq.u, h1sl.rq.v); + if (!sl || !htx_add_all_headers(htx, hdrs)) + return NULL; + sl->info.req.meth = h1sl.rq.meth; + } + + /* OK we just got a valid HTTP message. If not already done by + * HTTP analyzers, we have some minor preparation to perform so + * that further checks can rely on HTTP tests. + */ + if (sl && msg->msg_state < HTTP_MSG_BODY) { + if (!(chn->flags & CF_ISRESP)) { + txn->meth = sl->info.req.meth; + if (txn->meth == HTTP_METH_GET || txn->meth == HTTP_METH_HEAD) + s->flags |= SF_REDIRECTABLE; + } + else { + if (txn->status == -1) + txn->status = sl->info.res.status; + if (!(htx->flags & HTX_FL_PROXY_RESP) && txn->server_status == -1) + txn->server_status = sl->info.res.status; + } + if (sl->flags & HTX_SL_F_VER_11) + msg->flags |= HTTP_MSGF_VER_11; + } + + /* everything's OK */ + end: + return htx; +} + +/* This function fetches the method of current HTTP request and stores + * it in the global pattern struct as a chunk. 
There are two possibilities : + * - if the method is known (not HTTP_METH_OTHER), its identifier is stored + * in <len> and <ptr> is NULL ; + * - if the method is unknown (HTTP_METH_OTHER), <ptr> points to the text and + * <len> to its length. + * This is intended to be used with pat_match_meth() only. + */ +static int smp_fetch_meth(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + struct http_txn *txn; + struct htx *htx = NULL; + int meth; + + txn = (smp->strm ? smp->strm->txn : NULL); + if (!txn) + return 0; + + meth = txn->meth; + if (meth == HTTP_METH_OTHER) { + htx = smp_prefetch_htx(smp, chn, NULL, 1); + if (!htx) + return 0; + meth = txn->meth; + } + + smp->data.type = SMP_T_METH; + smp->data.u.meth.meth = meth; + if (meth == HTTP_METH_OTHER) { + struct htx_sl *sl; + + sl = http_get_stline(htx); + smp->flags |= SMP_F_CONST; + smp->data.u.meth.str.area = HTX_SL_REQ_MPTR(sl); + smp->data.u.meth.str.data = HTX_SL_REQ_MLEN(sl); + } + smp->flags |= SMP_F_VOL_1ST; + return 1; +} + +static int smp_fetch_rqver(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + struct htx_sl *sl; + char *ptr; + int len; + + if (!htx) + return 0; + + sl = http_get_stline(htx); + len = HTX_SL_REQ_VLEN(sl); + ptr = HTX_SL_REQ_VPTR(sl); + + while ((len-- > 0) && (*ptr++ != '/')); + if (len <= 0) + return 0; + + smp->data.type = SMP_T_STR; + smp->data.u.str.area = ptr; + smp->data.u.str.data = len; + + smp->flags = SMP_F_VOL_1ST | SMP_F_CONST; + return 1; +} + +static int smp_fetch_stver(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_RES_CHN(smp); + struct check *check = objt_check(smp->sess->origin); + struct htx *htx = smp_prefetch_htx(smp, chn, check, 1); + struct htx_sl *sl; + char *ptr; + int len; + + if (!htx) + return 0; + + sl = http_get_stline(htx); + len = HTX_SL_RES_VLEN(sl); + ptr = HTX_SL_RES_VPTR(sl); + + while ((len-- > 0) && (*ptr++ != '/')); + if (len <= 0) + return 0; + + smp->data.type = SMP_T_STR; + smp->data.u.str.area = ptr; + smp->data.u.str.data = len; + + smp->flags = SMP_F_VOL_1ST | SMP_F_CONST; + return 1; +} + +/* 3. Check on Status Code. We manipulate integers here. */ +static int smp_fetch_stcode(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_RES_CHN(smp); + struct check *check = objt_check(smp->sess->origin); + struct htx *htx = smp_prefetch_htx(smp, chn, check, 1); + struct htx_sl *sl; + char *ptr; + int len; + + if (!htx) + return 0; + + sl = http_get_stline(htx); + len = HTX_SL_RES_CLEN(sl); + ptr = HTX_SL_RES_CPTR(sl); + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = __strl2ui(ptr, len); + smp->flags = SMP_F_VOL_1ST; + return 1; +} + +/* It returns the server or the txn status code, depending on the keyword */ +static int smp_fetch_srv_status(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct http_txn *txn; + short status; + + txn = (smp->strm ? smp->strm->txn : NULL); + if (!txn) + return 0; + + status = (kw[0] == 't' ? txn->status : txn->server_status); + if (status == -1) { + struct channel *chn = SMP_RES_CHN(smp); + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + + if (!htx) + return 0; + + status = (kw[0] == 't' ? 
txn->status : txn->server_status); + } + + if (kw[0] != 't') + smp->flags = SMP_F_VOL_1ST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = status; + return 1; +} + +static int smp_fetch_uniqueid(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct ist unique_id; + + if (LIST_ISEMPTY(&smp->sess->fe->format_unique_id)) + return 0; + + if (!smp->strm) + return 0; + + unique_id = stream_generate_unique_id(smp->strm, &smp->sess->fe->format_unique_id); + if (!isttest(unique_id)) + return 0; + + smp->data.u.str.area = smp->strm->unique_id.ptr; + smp->data.u.str.data = smp->strm->unique_id.len; + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_CONST; + return 1; +} + +/* Returns a string block containing all headers including the + * empty line which separates headers from the body. This is useful + * for some headers analysis. + */ +static int smp_fetch_hdrs(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + /* possible keywords: req.hdrs, res.hdrs */ + struct channel *chn = ((kw[2] == 'q') ? SMP_REQ_CHN(smp) : SMP_RES_CHN(smp)); + struct check *check = ((kw[2] == 's') ? objt_check(smp->sess->origin) : NULL); + struct htx *htx = smp_prefetch_htx(smp, chn, check, 1); + struct buffer *temp; + int32_t pos; + + if (!htx) + return 0; + temp = get_trash_chunk(); + for (pos = htx_get_first(htx); pos != -1; pos = htx_get_next(htx, pos)) { + struct htx_blk *blk = htx_get_blk(htx, pos); + enum htx_blk_type type = htx_get_blk_type(blk); + + if (type == HTX_BLK_HDR) { + struct ist n = htx_get_blk_name(htx, blk); + struct ist v = htx_get_blk_value(htx, blk); + + if (!h1_format_htx_hdr(n, v, temp)) + return 0; + } + else if (type == HTX_BLK_EOH) { + if (!chunk_memcat(temp, "\r\n", 2)) + return 0; + break; + } + } + smp->data.type = SMP_T_STR; + smp->data.u.str = *temp; + return 1; +} + +/* Returns the header request in a length/value encoded format. + * This is useful for exchanges with the SPOE. + * + * A "length value" is a multibyte code encoding numbers. It uses the + * SPOE format. The encoding is the following: + * + * Each couple "header name" / "header value" is composed + * like this: + * "length value" "header name bytes" + * "length value" "header value bytes" + * When the last header is reached, the header name and the header + * value are empty. Their length are 0 + */ +static int smp_fetch_hdrs_bin(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + /* possible keywords: req.hdrs_bin, res.hdrs_bin */ + struct channel *chn = ((kw[2] == 'q') ? SMP_REQ_CHN(smp) : SMP_RES_CHN(smp)); + struct check *check = ((kw[2] == 's') ? objt_check(smp->sess->origin) : NULL); + struct htx *htx = smp_prefetch_htx(smp, chn, check, 1); + struct buffer *temp; + char *p, *end; + int32_t pos; + int ret; + + if (!htx) + return 0; + temp = get_trash_chunk(); + p = temp->area; + end = temp->area + temp->size; + for (pos = htx_get_first(htx); pos != -1; pos = htx_get_next(htx, pos)) { + struct htx_blk *blk = htx_get_blk(htx, pos); + enum htx_blk_type type = htx_get_blk_type(blk); + struct ist n, v; + + if (type == HTX_BLK_HDR) { + n = htx_get_blk_name(htx,blk); + v = htx_get_blk_value(htx, blk); + + /* encode the header name. */ + ret = encode_varint(n.len, &p, end); + if (ret == -1) + return 0; + if (p + n.len > end) + return 0; + memcpy(p, n.ptr, n.len); + p += n.len; + + /* encode the header value. 
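+ * As a hedged aside on the length prefix (same scheme as for the name + * above): the SPOE varint encoding stores lengths below 240 in a single + * byte, so a 10-byte value is emitted as 0x0A followed by the 10 raw bytes, + * while longer values spill into continuation bytes inside encode_varint().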
*/ + ret = encode_varint(v.len, &p, end); + if (ret == -1) + return 0; + if (p + v.len > end) + return 0; + memcpy(p, v.ptr, v.len); + p += v.len; + + } + else if (type == HTX_BLK_EOH) { + /* encode the end of the header list with empty + * header name and header value. + */ + ret = encode_varint(0, &p, end); + if (ret == -1) + return 0; + ret = encode_varint(0, &p, end); + if (ret == -1) + return 0; + break; + } + } + + /* Initialise sample data which will be filled. */ + smp->data.type = SMP_T_BIN; + smp->data.u.str.area = temp->area; + smp->data.u.str.data = p - temp->area; + smp->data.u.str.size = temp->size; + return 1; +} + +/* returns the longest available part of the body. This requires that the body + * has been waited for using http-buffer-request. + */ +static int smp_fetch_body(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + /* possible keywords: req.body, res.body */ + struct channel *chn = ((kw[2] == 'q') ? SMP_REQ_CHN(smp) : SMP_RES_CHN(smp)); + struct check *check = ((kw[2] == 's') ? objt_check(smp->sess->origin) : NULL); + struct htx *htx = smp_prefetch_htx(smp, chn, check, 1); + struct buffer *temp; + int32_t pos; + int finished = 0; + + if (!htx) + return 0; + + temp = get_trash_chunk(); + for (pos = htx_get_first(htx); pos != -1; pos = htx_get_next(htx, pos)) { + struct htx_blk *blk = htx_get_blk(htx, pos); + enum htx_blk_type type = htx_get_blk_type(blk); + + if (type == HTX_BLK_TLR || type == HTX_BLK_EOT) { + finished = 1; + break; + } + if (type == HTX_BLK_DATA) { + if (!h1_format_htx_data(htx_get_blk_value(htx, blk), temp, 0)) + return 0; + } + } + + smp->data.type = SMP_T_BIN; + smp->data.u.str = *temp; + smp->flags = SMP_F_VOL_TEST; + + if (!finished && (check || (chn && !channel_full(chn, global.tune.maxrewrite) && + !(chn_prod(chn)->flags & (SC_FL_EOI|SC_FL_EOS|SC_FL_ABRT_DONE))))) + smp->flags |= SMP_F_MAY_CHANGE; + + return 1; +} + + +/* returns the available length of the body. This requires that the body + * has been waited for using http-buffer-request. + */ +static int smp_fetch_body_len(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + /* possible keywords: req.body_len, res.body_len */ + struct channel *chn = ((kw[2] == 'q') ? SMP_REQ_CHN(smp) : SMP_RES_CHN(smp)); + struct check *check = ((kw[2] == 's') ? objt_check(smp->sess->origin) : NULL); + struct htx *htx = smp_prefetch_htx(smp, chn, check, 1); + int32_t pos; + unsigned long long len = 0; + + if (!htx) + return 0; + + for (pos = htx_get_first(htx); pos != -1; pos = htx_get_next(htx, pos)) { + struct htx_blk *blk = htx_get_blk(htx, pos); + enum htx_blk_type type = htx_get_blk_type(blk); + + if (type == HTX_BLK_TLR || type == HTX_BLK_EOT) + break; + if (type == HTX_BLK_DATA) + len += htx_get_blksz(blk); + } + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = len; + smp->flags = SMP_F_VOL_TEST; + return 1; +} + + +/* returns the advertised length of the body, or the advertised size of the + * chunks available in the buffer. This requires that the body has been waited + * for using http-buffer-request. + */ +static int smp_fetch_body_size(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + /* possible keywords: req.body_size, res.body_size */ + struct channel *chn = ((kw[2] == 'q') ? SMP_REQ_CHN(smp) : SMP_RES_CHN(smp)); + struct check *check = ((kw[2] == 's') ? 
objt_check(smp->sess->origin) : NULL); + struct htx *htx = smp_prefetch_htx(smp, chn, check, 1); + int32_t pos; + unsigned long long len = 0; + + if (!htx) + return 0; + + for (pos = htx_get_first(htx); pos != -1; pos = htx_get_next(htx, pos)) { + struct htx_blk *blk = htx_get_blk(htx, pos); + enum htx_blk_type type = htx_get_blk_type(blk); + + if (type == HTX_BLK_TLR || type == HTX_BLK_EOT) + break; + if (type == HTX_BLK_DATA) + len += htx_get_blksz(blk); + } + if (htx->extra != HTX_UNKOWN_PAYLOAD_LENGTH) + len += htx->extra; + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = len; + smp->flags = SMP_F_VOL_TEST; + return 1; +} + + +/* 4. Check on URL/URI. A pointer to the URI is stored. */ +static int smp_fetch_url(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + struct htx_sl *sl; + + if (!htx) + return 0; + sl = http_get_stline(htx); + smp->data.type = SMP_T_STR; + smp->data.u.str.area = HTX_SL_REQ_UPTR(sl); + smp->data.u.str.data = HTX_SL_REQ_ULEN(sl); + smp->flags = SMP_F_VOL_1ST | SMP_F_CONST; + return 1; +} + +static int smp_fetch_url_ip(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + struct htx_sl *sl; + struct sockaddr_storage addr; + + memset(&addr, 0, sizeof(addr)); + + if (!htx) + return 0; + sl = http_get_stline(htx); + if (url2sa(HTX_SL_REQ_UPTR(sl), HTX_SL_REQ_ULEN(sl), &addr, NULL) < 0) + return 0; + + if (addr.ss_family != AF_INET) + return 0; + + smp->data.type = SMP_T_IPV4; + smp->data.u.ipv4 = ((struct sockaddr_in *)&addr)->sin_addr; + smp->flags = 0; + return 1; +} + +static int smp_fetch_url_port(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + struct htx_sl *sl; + struct sockaddr_storage addr; + + memset(&addr, 0, sizeof(addr)); + + if (!htx) + return 0; + sl = http_get_stline(htx); + if (url2sa(HTX_SL_REQ_UPTR(sl), HTX_SL_REQ_ULEN(sl), &addr, NULL) < 0) + return 0; + + if (addr.ss_family != AF_INET) + return 0; + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = get_host_port(&addr); + smp->flags = 0; + return 1; +} + +/* Fetch an HTTP header. A pointer to the beginning of the value is returned. + * Accepts an optional argument of type string containing the header field name, + * and an optional argument of type signed or unsigned integer to request an + * explicit occurrence of the header. Note that in the event of a missing name, + * headers are considered from the first one. It does not stop on commas and + * returns full lines instead (useful for User-Agent or Date for example). + */ +static int smp_fetch_fhdr(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + /* possible keywords: req.fhdr, res.fhdr */ + struct channel *chn = ((kw[2] == 'q') ? SMP_REQ_CHN(smp) : SMP_RES_CHN(smp)); + struct check *check = ((kw[2] == 's') ? 
objt_check(smp->sess->origin) : NULL); + struct htx *htx = smp_prefetch_htx(smp, chn, check, 1); + struct http_hdr_ctx *ctx = smp->ctx.a[0]; + struct ist name; + int occ = 0; + + if (!ctx) { + /* first call */ + ctx = &static_http_hdr_ctx; + ctx->blk = NULL; + smp->ctx.a[0] = ctx; + } + + if (args[0].type != ARGT_STR) + return 0; + name = ist2(args[0].data.str.area, args[0].data.str.data); + + if (args[1].type == ARGT_SINT) + occ = args[1].data.sint; + + if (!htx) + return 0; + + if (ctx && !(smp->flags & SMP_F_NOT_LAST)) + /* search for header from the beginning */ + ctx->blk = NULL; + + if (!occ && !(smp->opt & SMP_OPT_ITERATE)) + /* no explicit occurrence and single fetch => last header by default */ + occ = -1; + + if (!occ) + /* prepare to report multiple occurrences for ACL fetches */ + smp->flags |= SMP_F_NOT_LAST; + + smp->data.type = SMP_T_STR; + smp->flags |= SMP_F_VOL_HDR | SMP_F_CONST; + if (http_get_htx_fhdr(htx, name, occ, ctx, &smp->data.u.str.area, &smp->data.u.str.data)) + return 1; + smp->flags &= ~SMP_F_NOT_LAST; + return 0; +} + +/* 6. Check on HTTP header count. The number of occurrences is returned. + * Accepts exactly 1 argument of type string. It does not stop on commas and + * returns full lines instead (useful for User-Agent or Date for example). + */ +static int smp_fetch_fhdr_cnt(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + /* possible keywords: req.fhdr_cnt, res.fhdr_cnt */ + struct channel *chn = ((kw[2] == 'q') ? SMP_REQ_CHN(smp) : SMP_RES_CHN(smp)); + struct check *check = ((kw[2] == 's') ? objt_check(smp->sess->origin) : NULL); + struct htx *htx = smp_prefetch_htx(smp, chn, check, 1); + struct http_hdr_ctx ctx; + struct ist name; + int cnt; + + if (!htx) + return 0; + + if (args->type == ARGT_STR) { + name = ist2(args->data.str.area, args->data.str.data); + } else { + name = IST_NULL; + } + + ctx.blk = NULL; + cnt = 0; + while (http_find_header(htx, name, &ctx, 1)) + cnt++; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = cnt; + smp->flags = SMP_F_VOL_HDR; + return 1; +} + +static int smp_fetch_hdr_names(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + /* possible keywords: req.hdr_names, res.hdr_names */ + struct channel *chn = ((kw[2] == 'q') ? SMP_REQ_CHN(smp) : SMP_RES_CHN(smp)); + struct check *check = ((kw[2] == 's') ? objt_check(smp->sess->origin) : NULL); + struct htx *htx = smp_prefetch_htx(smp, chn, check, 1); + struct buffer *temp; + char del = ','; + + int32_t pos; + + if (!htx) + return 0; + + if (args->type == ARGT_STR) + del = *args[0].data.str.area; + + temp = get_trash_chunk(); + for (pos = htx_get_first(htx); pos != -1; pos = htx_get_next(htx, pos)) { + struct htx_blk *blk = htx_get_blk(htx, pos); + enum htx_blk_type type = htx_get_blk_type(blk); + struct ist n; + + if (type == HTX_BLK_EOH) + break; + if (type != HTX_BLK_HDR) + continue; + n = htx_get_blk_name(htx, blk); + + if (temp->data) + temp->area[temp->data++] = del; + chunk_istcat(temp, n); + } + + smp->data.type = SMP_T_STR; + smp->data.u.str = *temp; + smp->flags = SMP_F_VOL_HDR; + return 1; +} + +/* Fetch an HTTP header. A pointer to the beginning of the value is returned. + * Accepts an optional argument of type string containing the header field name, + * and an optional argument of type signed or unsigned integer to request an + * explicit occurrence of the header. Note that in the event of a missing name, + * headers are considered from the first one. 
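+ * As an illustration of the occurrence argument (hypothetical values): with + * three X-Forwarded-For entries present, req.hdr(x-forwarded-for,1) returns + * the first one, req.hdr(x-forwarded-for,-1) the last one, and a plain + * req.hdr(x-forwarded-for) defaults to the last occurrence when not iterating.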
+ */ +static int smp_fetch_hdr(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + /* possible keywords: req.hdr / hdr, res.hdr / shdr */ + struct channel *chn = ((kw[0] == 'h' || kw[2] == 'q') ? SMP_REQ_CHN(smp) : SMP_RES_CHN(smp)); + struct check *check = ((kw[0] == 's' || kw[2] == 's') ? objt_check(smp->sess->origin) : NULL); + struct htx *htx = smp_prefetch_htx(smp, chn, check, 1); + struct http_hdr_ctx *ctx = smp->ctx.a[0]; + struct ist name; + int occ = 0; + + if (!ctx) { + /* first call */ + ctx = &static_http_hdr_ctx; + ctx->blk = NULL; + smp->ctx.a[0] = ctx; + } + + if (args[0].type != ARGT_STR) + return 0; + name = ist2(args[0].data.str.area, args[0].data.str.data); + + if (args[1].type == ARGT_SINT) + occ = args[1].data.sint; + + if (!htx) + return 0; + + if (ctx && !(smp->flags & SMP_F_NOT_LAST)) + /* search for header from the beginning */ + ctx->blk = NULL; + + if (!occ && !(smp->opt & SMP_OPT_ITERATE)) + /* no explicit occurrence and single fetch => last header by default */ + occ = -1; + + if (!occ) + /* prepare to report multiple occurrences for ACL fetches */ + smp->flags |= SMP_F_NOT_LAST; + + smp->data.type = SMP_T_STR; + smp->flags |= SMP_F_VOL_HDR | SMP_F_CONST; + if (http_get_htx_hdr(htx, name, occ, ctx, &smp->data.u.str.area, &smp->data.u.str.data)) + return 1; + + smp->flags &= ~SMP_F_NOT_LAST; + return 0; +} + +/* Same as smp_fetch_hdr() but only relies on the sample direction to choose + * the right channel. So instead of duplicating the code, we just change the + * keyword and then fall back on smp_fetch_hdr(). + */ +static int smp_fetch_chn_hdr(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + kw = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_REQ ? "req.hdr" : "res.hdr"); + return smp_fetch_hdr(args, smp, kw, private); +} + +/* 6. Check on HTTP header count. The number of occurrences is returned. + * Accepts exactly 1 argument of type string. + */ +static int smp_fetch_hdr_cnt(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + /* possible keywords: req.hdr_cnt / hdr_cnt, res.hdr_cnt / shdr_cnt */ + struct channel *chn = ((kw[0] == 'h' || kw[2] == 'q') ? SMP_REQ_CHN(smp) : SMP_RES_CHN(smp)); + struct check *check = ((kw[0] == 's' || kw[2] == 's') ? objt_check(smp->sess->origin) : NULL); + struct htx *htx = smp_prefetch_htx(smp, chn, check, 1); + struct http_hdr_ctx ctx; + struct ist name; + int cnt; + + if (!htx) + return 0; + + if (args->type == ARGT_STR) { + name = ist2(args->data.str.area, args->data.str.data); + } else { + name = IST_NULL; + } + + ctx.blk = NULL; + cnt = 0; + while (http_find_header(htx, name, &ctx, 0)) + cnt++; + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = cnt; + smp->flags = SMP_F_VOL_HDR; + return 1; +} + +/* Fetch an HTTP header's integer value. The integer value is returned. It + * takes a mandatory argument of type string and an optional one of type int + * to designate a specific occurrence. It returns an unsigned integer, which + * may or may not be appropriate for everything. + */ +static int smp_fetch_hdr_val(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int ret = smp_fetch_hdr(args, smp, kw, private); + + if (ret > 0) { + smp->data.type = SMP_T_SINT; + smp->data.u.sint = strl2ic(smp->data.u.str.area, + smp->data.u.str.data); + } + + return ret; +} + +/* Fetch an HTTP header's IP value. It takes a mandatory argument of type string + * and an optional one of type int to designate a specific occurrence.
+ * It returns an IPv4 or IPv6 address. Addresses surrounded by invalid chars + * are rejected. However IPv4 addresses may be followed with a colon and a + * valid port number. + */ +static int smp_fetch_hdr_ip(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct buffer *temp = get_trash_chunk(); + int ret, len; + int port; + + while ((ret = smp_fetch_hdr(args, smp, kw, private)) > 0) { + if (smp->data.u.str.data < temp->size - 1) { + memcpy(temp->area, smp->data.u.str.area, + smp->data.u.str.data); + temp->area[smp->data.u.str.data] = '\0'; + len = url2ipv4((char *) temp->area, &smp->data.u.ipv4); + if (len > 0 && len == smp->data.u.str.data) { + /* plain IPv4 address */ + smp->data.type = SMP_T_IPV4; + break; + } else if (len > 0 && temp->area[len] == ':' && + strl2irc(temp->area + len + 1, smp->data.u.str.data - len - 1, &port) == 0 && + port >= 0 && port <= 65535) { + /* IPv4 address suffixed with ':' followed by a valid port number */ + smp->data.type = SMP_T_IPV4; + break; + } else if (temp->area[0] == '[' && temp->area[smp->data.u.str.data-1] == ']') { + /* IPv6 address enclosed in square brackets */ + temp->area[smp->data.u.str.data-1] = '\0'; + if (inet_pton(AF_INET6, temp->area+1, &smp->data.u.ipv6)) { + smp->data.type = SMP_T_IPV6; + break; + } + } else if (inet_pton(AF_INET6, temp->area, &smp->data.u.ipv6)) { + /* plain IPv6 address */ + smp->data.type = SMP_T_IPV6; + break; + } + } + + /* if the header doesn't match an IP address, fetch next one */ + if (!(smp->flags & SMP_F_NOT_LAST)) + return 0; + } + return ret; +} + +/* 8. Check on URI PATH. A pointer to the PATH is stored. The path starts at the + * first '/' after the possible hostname. It ends before the possible '?' except + * for 'pathq' keyword. + */ +static int smp_fetch_path(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + struct htx_sl *sl; + struct ist path; + struct http_uri_parser parser; + + if (!htx) + return 0; + + sl = http_get_stline(htx); + parser = http_uri_parser_init(htx_sl_req_uri(sl)); + + if (kw[4] == 'q' && (kw[0] == 'p' || kw[0] == 'b')) // pathq or baseq + path = http_parse_path(&parser); + else + path = iststop(http_parse_path(&parser), '?'); + + if (!isttest(path)) + return 0; + + /* OK, we got the '/' ! */ + smp->data.type = SMP_T_STR; + smp->data.u.str.area = path.ptr; + smp->data.u.str.data = path.len; + smp->flags = SMP_F_VOL_1ST | SMP_F_CONST; + return 1; +} + +/* This produces a concatenation of the first occurrence of the Host header + * followed by the path component if it begins with a slash ('/'). This means + * that '*' will not be added, resulting in exactly the first Host entry. + * If no Host header is found, then the path is returned as-is. The returned + * value is stored in the trash so it does not need to be marked constant. + * The returned sample is of type string. 
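+ * For instance (hypothetical request): with "Host: www.example.com" and a + * request for "/index.html?p=1", base yields "www.example.com/index.html", + * while the baseq variant keeps the query string as well.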
+ */ +static int smp_fetch_base(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + struct htx_sl *sl; + struct buffer *temp; + struct http_hdr_ctx ctx; + struct ist path; + struct http_uri_parser parser; + + if (!htx) + return 0; + + ctx.blk = NULL; + if (!http_find_header(htx, ist("Host"), &ctx, 0) || !ctx.value.len) + return smp_fetch_path(args, smp, kw, private); + + /* OK we have the header value in ctx.value */ + temp = get_trash_chunk(); + chunk_istcat(temp, ctx.value); + + /* now retrieve the path */ + sl = http_get_stline(htx); + parser = http_uri_parser_init(htx_sl_req_uri(sl)); + path = http_parse_path(&parser); + if (isttest(path)) { + size_t len; + + if (kw[4] == 'q' && kw[0] == 'b') { // baseq + len = path.len; + } else { + for (len = 0; len < path.len && *(path.ptr + len) != '?'; len++) + ; + } + + if (len && *(path.ptr) == '/') + chunk_memcat(temp, path.ptr, len); + } + + smp->data.type = SMP_T_STR; + smp->data.u.str = *temp; + smp->flags = SMP_F_VOL_1ST; + return 1; +} + +/* This produces a 32-bit hash of the concatenation of the first occurrence of + * the Host header followed by the path component if it begins with a slash ('/'). + * This means that '*' will not be added, resulting in exactly the first Host + * entry. If no Host header is found, then the path is used. The resulting value + * is hashed using the path hash followed by a full avalanche hash and provides a + * 32-bit integer value. This fetch is useful for tracking per-path activity on + * high-traffic sites without having to store whole paths. + */ +static int smp_fetch_base32(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + struct htx_sl *sl; + struct http_hdr_ctx ctx; + struct ist path; + unsigned int hash = 0; + struct http_uri_parser parser; + + if (!htx) + return 0; + + ctx.blk = NULL; + if (http_find_header(htx, ist("Host"), &ctx, 0)) { + /* OK we have the header value in ctx.value */ + while (ctx.value.len--) + hash = *(ctx.value.ptr++) + (hash << 6) + (hash << 16) - hash; + } + + /* now retrieve the path */ + sl = http_get_stline(htx); + parser = http_uri_parser_init(htx_sl_req_uri(sl)); + path = http_parse_path(&parser); + if (isttest(path)) { + size_t len; + + for (len = 0; len < path.len && *(path.ptr + len) != '?'; len++) + ; + + if (len && *(path.ptr) == '/') { + while (len--) + hash = *(path.ptr++) + (hash << 6) + (hash << 16) - hash; + } + } + + hash = full_hash(hash); + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = hash; + smp->flags = SMP_F_VOL_1ST; + return 1; +} + +/* This concatenates the source address with the 32-bit hash of the Host and + * path as returned by smp_fetch_base32(). The idea is to have per-source and + * per-path counters. The result is a binary block from 8 to 20 bytes depending + * on the source address length. The path hash is stored before the address so + * that in environments where IPv6 is insignificant, truncating the output to + * 8 bytes would still work. + */ +static int smp_fetch_base32_src(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + const struct sockaddr_storage *src = (smp->strm ? 
sc_src(smp->strm->scf) : NULL); + struct buffer *temp; + + if (!src) + return 0; + + if (!smp_fetch_base32(args, smp, kw, private)) + return 0; + + temp = get_trash_chunk(); + *(unsigned int *) temp->area = htonl(smp->data.u.sint); + temp->data += sizeof(unsigned int); + + switch (src->ss_family) { + case AF_INET: + memcpy(temp->area + temp->data, + &((struct sockaddr_in *)src)->sin_addr, + 4); + temp->data += 4; + break; + case AF_INET6: + memcpy(temp->area + temp->data, + &((struct sockaddr_in6 *)src)->sin6_addr, + 16); + temp->data += 16; + break; + default: + return 0; + } + + smp->data.u.str = *temp; + smp->data.type = SMP_T_BIN; + return 1; +} + +/* Extracts the query string, which comes after the question mark '?'. If no + * question mark is found, nothing is returned. Otherwise it returns a sample + * of type string carrying the whole query string. + */ +static int smp_fetch_query(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + struct htx_sl *sl; + char *ptr, *end; + + if (!htx) + return 0; + + sl = http_get_stline(htx); + ptr = HTX_SL_REQ_UPTR(sl); + end = HTX_SL_REQ_UPTR(sl) + HTX_SL_REQ_ULEN(sl); + + /* look up the '?' */ + do { + if (ptr == end) + return 0; + } while (*ptr++ != '?'); + + smp->data.type = SMP_T_STR; + smp->data.u.str.area = ptr; + smp->data.u.str.data = end - ptr; + smp->flags = SMP_F_VOL_1ST | SMP_F_CONST; + return 1; +} + +static int smp_fetch_proto_http(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 0); + + if (!htx) + return 0; + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = 1; + return 1; +} + +/* return a valid test if the current request is the first one on the connection */ +static int smp_fetch_http_first_req(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + if (!smp->strm) + return 0; + + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = !(smp->strm->txn->flags & TX_NOT_FIRST); + return 1; +} + +/* Fetch the authentication method if there is an Authorization header. It + * relies on get_http_auth() + */ +static int smp_fetch_http_auth_type(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + struct http_txn *txn; + + if (!htx) + return 0; + + txn = smp->strm->txn; + if (!get_http_auth(smp, htx)) + return 0; + + switch (txn->auth.method) { + case HTTP_AUTH_BASIC: + smp->data.u.str.area = "Basic"; + smp->data.u.str.data = 5; + break; + case HTTP_AUTH_DIGEST: + /* Unexpected because not supported */ + smp->data.u.str.area = "Digest"; + smp->data.u.str.data = 6; + break; + case HTTP_AUTH_BEARER: + smp->data.u.str.area = "Bearer"; + smp->data.u.str.data = 6; + break; + default: + return 0; + } + + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_CONST; + return 1; +} + +/* Fetch the user supplied if there is an Authorization header. 
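+ * (With the hypothetical "Authorization: Basic dXNlcjpwYXNz" example above, + * this would yield "user".)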
It relies on + * get_http_auth() + */ +static int smp_fetch_http_auth_user(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + struct http_txn *txn; + + if (!htx) + return 0; + + txn = smp->strm->txn; + if (!get_http_auth(smp, htx) || txn->auth.method != HTTP_AUTH_BASIC) + return 0; + + smp->data.type = SMP_T_STR; + smp->data.u.str.area = txn->auth.user; + smp->data.u.str.data = strlen(txn->auth.user); + smp->flags = SMP_F_CONST; + return 1; +} + +/* Fetch the password supplied if there is an Authorization header. It relies on + * get_http_auth() + */ +static int smp_fetch_http_auth_pass(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + struct http_txn *txn; + + if (!htx) + return 0; + + txn = smp->strm->txn; + if (!get_http_auth(smp, htx) || txn->auth.method != HTTP_AUTH_BASIC) + return 0; + + smp->data.type = SMP_T_STR; + smp->data.u.str.area = txn->auth.pass; + smp->data.u.str.data = strlen(txn->auth.pass); + smp->flags = SMP_F_CONST; + return 1; +} + +static int smp_fetch_http_auth_bearer(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + struct http_txn *txn; + struct buffer bearer_val = {}; + + if (!htx) + return 0; + + if (args->type == ARGT_STR) { + struct http_hdr_ctx ctx; + struct ist hdr_name = ist2(args->data.str.area, args->data.str.data); + + ctx.blk = NULL; + if (http_find_header(htx, hdr_name, &ctx, 0)) { + struct ist type = istsplit(&ctx.value, ' '); + + /* There must be "at least" one space character between + * the scheme and the following value so ctx.value might + * still have leading spaces here (see RFC7235). + */ + ctx.value = istskip(ctx.value, ' '); + + if (isteqi(type, ist("Bearer")) && istlen(ctx.value)) + chunk_initlen(&bearer_val, istptr(ctx.value), 0, istlen(ctx.value)); + } + } + else { + txn = smp->strm->txn; + if (!get_http_auth(smp, htx) || txn->auth.method != HTTP_AUTH_BEARER) + return 0; + + bearer_val = txn->auth.method_data; + } + + smp->data.type = SMP_T_STR; + smp->data.u.str = bearer_val; + smp->flags = SMP_F_CONST; + return 1; +} + +/* Accepts exactly 1 argument of type userlist */ +static int smp_fetch_http_auth(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + + if (args->type != ARGT_USR) + return 0; + + if (!htx) + return 0; + if (!get_http_auth(smp, htx) || smp->strm->txn->auth.method != HTTP_AUTH_BASIC) + return 0; + + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = check_user(args->data.usr, smp->strm->txn->auth.user, + smp->strm->txn->auth.pass); + return 1; +} + +/* Accepts exactly 1 argument of type userlist */ +static int smp_fetch_http_auth_grp(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + + if (args->type != ARGT_USR) + return 0; + + if (!htx) + return 0; + if (!get_http_auth(smp, htx) || smp->strm->txn->auth.method != HTTP_AUTH_BASIC) + return 0; + + /* if the user does not belong to the userlist or has a wrong password, + * report that it unconditionally does not match. 
Otherwise we return + * a string containing the username. + */ + if (!check_user(args->data.usr, smp->strm->txn->auth.user, + smp->strm->txn->auth.pass)) + return 0; + + /* pat_match_auth() will need the user list */ + smp->ctx.a[0] = args->data.usr; + + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_CONST; + smp->data.u.str.area = smp->strm->txn->auth.user; + smp->data.u.str.data = strlen(smp->strm->txn->auth.user); + + return 1; +} + +/* Fetch a captured HTTP request header. The index is the position of + * the "capture" option in the configuration file + */ +static int smp_fetch_capture_req_hdr(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct proxy *fe; + int idx; + + if (args->type != ARGT_SINT) + return 0; + + if (!smp->strm) + return 0; + + fe = strm_fe(smp->strm); + idx = args->data.sint; + + if (idx > (fe->nb_req_cap - 1) || smp->strm->req_cap == NULL || smp->strm->req_cap[idx] == NULL) + return 0; + + smp->data.type = SMP_T_STR; + smp->flags |= SMP_F_CONST; + smp->data.u.str.area = smp->strm->req_cap[idx]; + smp->data.u.str.data = strlen(smp->strm->req_cap[idx]); + + return 1; +} + +/* Fetch a captured HTTP response header. The index is the position of + * the "capture" option in the configuration file + */ +static int smp_fetch_capture_res_hdr(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct proxy *fe; + int idx; + + if (args->type != ARGT_SINT) + return 0; + + if (!smp->strm) + return 0; + + fe = strm_fe(smp->strm); + idx = args->data.sint; + + if (idx > (fe->nb_rsp_cap - 1) || smp->strm->res_cap == NULL || smp->strm->res_cap[idx] == NULL) + return 0; + + smp->data.type = SMP_T_STR; + smp->flags |= SMP_F_CONST; + smp->data.u.str.area = smp->strm->res_cap[idx]; + smp->data.u.str.data = strlen(smp->strm->res_cap[idx]); + + return 1; +} + +/* Extracts the METHOD in the HTTP request, the txn->uri should be filled before the call */ +static int smp_fetch_capture_req_method(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct buffer *temp; + struct http_txn *txn; + char *ptr; + + if (!smp->strm) + return 0; + + txn = smp->strm->txn; + if (!txn || !txn->uri) + return 0; + + ptr = txn->uri; + + while (*ptr != ' ' && *ptr != '\0') /* find first space */ + ptr++; + + temp = get_trash_chunk(); + temp->area = txn->uri; + temp->data = ptr - txn->uri; + smp->data.u.str = *temp; + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_CONST; + + return 1; + +} + +/* Extracts the path in the HTTP request, the txn->uri should be filled before the call */ +static int smp_fetch_capture_req_uri(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct http_txn *txn; + struct ist path; + const char *ptr; + struct http_uri_parser parser; + + if (!smp->strm) + return 0; + + txn = smp->strm->txn; + if (!txn || !txn->uri) + return 0; + + ptr = txn->uri; + + while (*ptr != ' ' && *ptr != '\0') /* find first space */ + ptr++; + + if (!*ptr) + return 0; + + /* skip the first space and find space after URI */ + path = ist2(++ptr, 0); + while (*ptr != ' ' && *ptr != '\0') + ptr++; + path.len = ptr - path.ptr; + + parser = http_uri_parser_init(path); + path = http_parse_path(&parser); + if (!isttest(path)) + return 0; + + smp->data.u.str.area = path.ptr; + smp->data.u.str.data = path.len; + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_CONST; + + return 1; +} + +/* Retrieves the HTTP version from the request (either 1.0 or 1.1) and emits it + * as a string (either 
"HTTP/1.0" or "HTTP/1.1"). + */ +static int smp_fetch_capture_req_ver(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct http_txn *txn; + + if (!smp->strm) + return 0; + + txn = smp->strm->txn; + if (!txn || txn->req.msg_state < HTTP_MSG_BODY) + return 0; + + if (txn->req.flags & HTTP_MSGF_VER_11) + smp->data.u.str.area = "HTTP/1.1"; + else + smp->data.u.str.area = "HTTP/1.0"; + + smp->data.u.str.data = 8; + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_CONST; + return 1; + +} + +/* Retrieves the HTTP version from the response (either 1.0 or 1.1) and emits it + * as a string (either "HTTP/1.0" or "HTTP/1.1"). + */ +static int smp_fetch_capture_res_ver(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct http_txn *txn; + + if (!smp->strm) + return 0; + + txn = smp->strm->txn; + if (!txn || txn->rsp.msg_state < HTTP_MSG_BODY) + return 0; + + if (txn->rsp.flags & HTTP_MSGF_VER_11) + smp->data.u.str.area = "HTTP/1.1"; + else + smp->data.u.str.area = "HTTP/1.0"; + + smp->data.u.str.data = 8; + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_CONST; + return 1; + +} + +/* Iterate over all cookies present in a message. The context is stored in + * smp->ctx.a[0] for the in-header position, smp->ctx.a[1] for the + * end-of-header-value, and smp->ctx.a[2] for the hdr_ctx. Depending on + * the direction, multiple cookies may be parsed on the same line or not. + * If provided, the searched cookie name is in args, in args->data.str. If + * the input options indicate that no iterating is desired, then only last + * value is fetched if any. If no cookie name is provided, the first cookie + * value found is fetched. The returned sample is of type CSTR. Can be used + * to parse cookies in other files. + */ +static int smp_fetch_cookie(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + /* possible keywords: req.cookie / cookie / cook, res.cookie / scook / set-cookie */ + struct channel *chn = ((kw[0] == 'c' || kw[2] == 'q') ? SMP_REQ_CHN(smp) : SMP_RES_CHN(smp)); + struct check *check = ((kw[0] == 's' || kw[2] == 's') ? objt_check(smp->sess->origin) : NULL); + struct htx *htx = smp_prefetch_htx(smp, chn, check, 1); + struct http_hdr_ctx *ctx = smp->ctx.a[2]; + struct ist hdr; + char *cook = NULL; + size_t cook_l = 0; + int found = 0; + + if (args->type == ARGT_STR) { + cook = args->data.str.area; + cook_l = args->data.str.data; + } + + if (!ctx) { + /* first call */ + ctx = &static_http_hdr_ctx; + ctx->blk = NULL; + smp->ctx.a[2] = ctx; + } + + if (!htx) + return 0; + + hdr = (!(check || (chn && chn->flags & CF_ISRESP)) ? ist("Cookie") : ist("Set-Cookie")); + + /* OK so basically here, either we want only one value or we want to + * iterate over all of them and we fetch the next one. In this last case + * SMP_OPT_ITERATE option is set. + */ + + if (!(smp->flags & SMP_F_NOT_LAST)) { + /* search for the header from the beginning, we must first initialize + * the search parameters. 
+ */ + smp->ctx.a[0] = NULL; + ctx->blk = NULL; + } + + smp->flags |= SMP_F_VOL_HDR; + while (1) { + /* Note: smp->ctx.a[0] == NULL every time we need to fetch a new header */ + if (!smp->ctx.a[0]) { + if (!http_find_header(htx, hdr, ctx, 0)) + goto out; + + if (ctx->value.len < cook_l + 1) + continue; + + smp->ctx.a[0] = ctx->value.ptr; + smp->ctx.a[1] = smp->ctx.a[0] + ctx->value.len; + } + + smp->data.type = SMP_T_STR; + smp->flags |= SMP_F_CONST; + smp->ctx.a[0] = http_extract_cookie_value(smp->ctx.a[0], smp->ctx.a[1], + cook, cook_l, + (smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_REQ, + &smp->data.u.str.area, + &smp->data.u.str.data); + if (smp->ctx.a[0]) { + found = 1; + if (smp->opt & SMP_OPT_ITERATE) { + /* iterate on cookie value */ + smp->flags |= SMP_F_NOT_LAST; + return 1; + } + if (args->data.str.data == 0) { + /* No cookie name, first occurrence returned */ + break; + } + } + /* if we're looking for last occurrence, let's loop */ + } + + /* all cookie headers and values were scanned. If we're looking for the + * last occurrence, we may return it now. + */ + out: + smp->flags &= ~SMP_F_NOT_LAST; + return found; +} + +/* Same as smp_fetch_cookie() but only relies on the sample direction to + * choose the right channel. So instead of duplicating the code, we just change + * the keyword and then fall back on smp_fetch_cookie(). + */ +static int smp_fetch_chn_cookie(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + kw = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_REQ ? "req.cook" : "res.cook"); + return smp_fetch_cookie(args, smp, kw, private); +} + +/* Iterate over all cookies present in a request to count how many occurrences + * match the name in args and args->data.str.len. Depending on the direction, + * multiple cookies may be parsed on the same line. The returned sample is of + * type UINT. Accepts exactly 1 argument of type string. + */ +static int smp_fetch_cookie_cnt(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + /* possible keywords: req.cook_cnt / cook_cnt, res.cook_cnt / scook_cnt */ + struct channel *chn = ((kw[0] == 'c' || kw[2] == 'q') ? SMP_REQ_CHN(smp) : SMP_RES_CHN(smp)); + struct check *check = ((kw[0] == 's' || kw[2] == 's') ? objt_check(smp->sess->origin) : NULL); + struct htx *htx = smp_prefetch_htx(smp, chn, check, 1); + struct http_hdr_ctx ctx; + struct ist hdr; + char *val_beg, *val_end; + char *cook = NULL; + size_t cook_l = 0; + int cnt; + + if (args->type == ARGT_STR) { + cook = args->data.str.area; + cook_l = args->data.str.data; + } + + if (!htx) + return 0; + + hdr = (!(check || (chn && chn->flags & CF_ISRESP)) ? ist("Cookie") : ist("Set-Cookie")); + + val_end = val_beg = NULL; + ctx.blk = NULL; + cnt = 0; + while (1) { + /* Note: val_beg == NULL every time we need to fetch a new header */ + if (!val_beg) { + if (!http_find_header(htx, hdr, &ctx, 0)) + break; + + if (ctx.value.len < cook_l + 1) + continue; + + val_beg = ctx.value.ptr; + val_end = val_beg + ctx.value.len; + } + + smp->data.type = SMP_T_STR; + smp->flags |= SMP_F_CONST; + while ((val_beg = http_extract_cookie_value(val_beg, val_end, + cook, cook_l, + (smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_REQ, + &smp->data.u.str.area, + &smp->data.u.str.data))) { + cnt++; + } + } + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = cnt; + smp->flags |= SMP_F_VOL_HDR; + return 1; +} + +/* Fetch a cookie's integer value. The integer value is returned. It + * takes a mandatory argument of type string. It relies on smp_fetch_cookie().
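+ * For example (hypothetical cookie), "Cookie: cnt=42" makes + * req.cook_val(cnt) return the integer 42, via strl2ic() applied to the + * extracted string value.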
+ */ +static int smp_fetch_cookie_val(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int ret = smp_fetch_cookie(args, smp, kw, private); + + if (ret > 0) { + smp->data.type = SMP_T_SINT; + smp->data.u.sint = strl2ic(smp->data.u.str.area, + smp->data.u.str.data); + } + + return ret; +} + +/* Iterate over all cookies present in a message, + * and return the list of cookie names separated by + * the input argument character. + * If no input argument is provided, + * the default delimiter is ','. + * The returned sample is of type CSTR. + */ +static int smp_fetch_cookie_names(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + /* possible keywords: req.cook_names, res.cook_names */ + struct channel *chn = ((kw[2] == 'q') ? SMP_REQ_CHN(smp) : SMP_RES_CHN(smp)); + struct check *check = ((kw[2] == 's') ? objt_check(smp->sess->origin) : NULL); + struct htx *htx = smp_prefetch_htx(smp, chn, check, 1); + struct http_hdr_ctx ctx; + struct ist hdr; + struct buffer *temp; + char del = ','; + char *ptr, *attr_beg, *attr_end; + size_t len = 0; + int is_req = !(check || (chn && chn->flags & CF_ISRESP)); + + if (!htx) + return 0; + + if (args->type == ARGT_STR) + del = *args[0].data.str.area; + + hdr = (is_req ? ist("Cookie") : ist("Set-Cookie")); + temp = get_trash_chunk(); + + smp->flags |= SMP_F_VOL_HDR; + attr_end = attr_beg = NULL; + ctx.blk = NULL; + /* Scan through all headers and extract all cookie names from + * 1. Cookie header(s) for request channel OR + * 2. Set-Cookie header(s) for response channel + */ + while (1) { + /* Note: attr_beg == NULL every time we need to fetch a new header */ + if (!attr_beg) { + /* For Set-Cookie, we need to fetch the entire header line (set flag to 1) */ + if (!http_find_header(htx, hdr, &ctx, !is_req)) + break; + attr_beg = ctx.value.ptr; + attr_end = attr_beg + ctx.value.len; + } + + while (1) { + attr_beg = http_extract_next_cookie_name(attr_beg, attr_end, is_req, &ptr, &len); + if (!attr_beg) + break; + + /* prepend delimiter if this is not the first cookie name found */ + if (temp->data) + temp->area[temp->data++] = del; + + /* At this point ptr should point to the start of the cookie name and len would be the length of the cookie name */ + if (!chunk_memcat(temp, ptr, len)) + return 0; + } + } + smp->data.type = SMP_T_STR; + smp->data.u.str = *temp; + return 1; +} + +/************************************************************************/ +/* The code below is dedicated to sample fetches */ +/************************************************************************/ + +/* This scans a URL-encoded query string. It takes an optionally wrapping + * string whose first contiguous chunk has its beginning in ctx->a[0] and end + * in ctx->a[1], and the optional second part in (ctx->a[2]..ctx->a[3]). The + * pointers are updated for next iteration before leaving. + */ +static int smp_fetch_param(char delim, const char *name, int name_len, const struct arg *args, struct sample *smp, const char *kw, void *private, char insensitive) +{ + const char *vstart, *vend; + struct buffer *temp; + const char **chunks = (const char **)smp->ctx.a; + + if (!http_find_next_url_param(chunks, name, name_len, + &vstart, &vend, delim, insensitive)) + return 0; + + /* Create sample. If the value is contiguous, return the pointer as CONST, + * if the value is wrapped, copy-it in a buffer. 
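+ * (Illustration of the wrapped case: if the buffer wrapped in the middle of + * "name=value", the head of the value sits at the end of the first chunk and + * its tail at the start of the second one; both parts are then glued into + * the trash chunk so the caller always sees a contiguous string.)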
+ */ + smp->data.type = SMP_T_STR; + if (chunks[2] && + vstart >= chunks[0] && vstart <= chunks[1] && + vend >= chunks[2] && vend <= chunks[3]) { + /* Wrapped case. */ + temp = get_trash_chunk(); + memcpy(temp->area, vstart, chunks[1] - vstart); + memcpy(temp->area + ( chunks[1] - vstart ), chunks[2], + vend - chunks[2]); + smp->data.u.str.area = temp->area; + smp->data.u.str.data = ( chunks[1] - vstart ) + ( vend - chunks[2] ); + } else { + /* Contiguous case. */ + smp->data.u.str.area = (char *)vstart; + smp->data.u.str.data = vend - vstart; + smp->flags = SMP_F_VOL_1ST | SMP_F_CONST; + } + + /* Update context, check wrapping. */ + chunks[0] = vend; + if (chunks[2] && vend >= chunks[2] && vend <= chunks[3]) { + chunks[1] = chunks[3]; + chunks[2] = NULL; + } + + if (chunks[0] < chunks[1]) + smp->flags |= SMP_F_NOT_LAST; + + return 1; +} + +/* This function iterates over each parameter of the query string. It uses + * ctx->a[0] and ctx->a[1] to store the beginning and end of the current + * parameter. Since it uses smp_fetch_param(), ctx->a[2..3] are both NULL. + * An optional parameter name is passed in args[0], otherwise any parameter is + * considered. It supports an optional delimiter argument for the beginning of + * the string in args[1], which defaults to "?". + */ +static int smp_fetch_url_param(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + char delim = '?'; + const char *name; + int name_len; + char insensitive = 0; + + if ((args[0].type && args[0].type != ARGT_STR) || + (args[1].type && args[1].type != ARGT_STR) || + (args[2].type && args[2].type != ARGT_STR)) + return 0; + + name = ""; + name_len = 0; + if (args->type == ARGT_STR) { + name = args->data.str.area; + name_len = args->data.str.data; + } + + if (args[1].type && *args[1].data.str.area) + delim = *args[1].data.str.area; + if (args[2].type && *args[2].data.str.area == 'i') + insensitive = 1; + + if (!smp->ctx.a[0]) { // first call, find the query string + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + struct htx_sl *sl; + + if (!htx) + return 0; + + sl = http_get_stline(htx); + smp->ctx.a[0] = http_find_param_list(HTX_SL_REQ_UPTR(sl), HTX_SL_REQ_ULEN(sl), delim); + if (!smp->ctx.a[0]) + return 0; + + smp->ctx.a[1] = HTX_SL_REQ_UPTR(sl) + HTX_SL_REQ_ULEN(sl); + + /* Assume that the context is filled with NULL pointer + * before the first call. + * smp->ctx.a[2] = NULL; + * smp->ctx.a[3] = NULL; + */ + } + + return smp_fetch_param(delim, name, name_len, args, smp, kw, private, insensitive); +} + +/* This function iterates over each parameter of the body. This requires + * that the body has been waited for using http-buffer-request. It uses + * ctx->a[0] and ctx->a[1] to store the beginning and end of the first + * contiguous part of the body, and optionally ctx->a[2..3] to reference the + * optional second part if the body wraps at the end of the buffer. An optional + * parameter name is passed in args[0], otherwise any parameter is considered. 
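+ * For example (hypothetical body), a urlencoded body such as + * "name=alice&age=30" makes req.body_param(age) return "30"; parameters are + * scanned with '&' as the hard-coded delimiter.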
+ */ +static int smp_fetch_body_param(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + const char *name; + int name_len; + char insensitive = 0; + + if ((args[0].type && args[0].type != ARGT_STR) || + (args[1].type && args[1].type != ARGT_STR)) + return 0; + + name = ""; + name_len = 0; + if (args[0].type == ARGT_STR) { + name = args[0].data.str.area; + name_len = args[0].data.str.data; + } + + if (args[1].type && *args[1].data.str.area == 'i') + insensitive = 1; + + if (!smp->ctx.a[0]) { // first call, find the query string + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + struct buffer *temp; + int32_t pos; + + if (!htx) + return 0; + + temp = get_trash_chunk(); + for (pos = htx_get_first(htx); pos != -1; pos = htx_get_next(htx, pos)) { + struct htx_blk *blk = htx_get_blk(htx, pos); + enum htx_blk_type type = htx_get_blk_type(blk); + + if (type == HTX_BLK_TLR || type == HTX_BLK_EOT) + break; + if (type == HTX_BLK_DATA) { + if (!h1_format_htx_data(htx_get_blk_value(htx, blk), temp, 0)) + return 0; + } + } + + smp->ctx.a[0] = temp->area; + smp->ctx.a[1] = temp->area + temp->data; + + /* Assume that the context is filled with NULL pointer + * before the first call. + * smp->ctx.a[2] = NULL; + * smp->ctx.a[3] = NULL; + */ + + } + + return smp_fetch_param('&', name, name_len, args, smp, kw, private, insensitive); +} + +/* Return the signed integer value for the specified url parameter (see url_param + * above). + */ +static int smp_fetch_url_param_val(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int ret = smp_fetch_url_param(args, smp, kw, private); + + if (ret > 0) { + smp->data.type = SMP_T_SINT; + smp->data.u.sint = strl2ic(smp->data.u.str.area, + smp->data.u.str.data); + } + + return ret; +} + +/* This produces a 32-bit hash of the concatenation of the first occurrence of + * the Host header followed by the path component if it begins with a slash ('/'). + * This means that '*' will not be added, resulting in exactly the first Host + * entry. If no Host header is found, then the path is used. The resulting value + * is hashed using the url hash followed by a full avalanche hash and provides a + * 32-bit integer value. This fetch is useful for tracking per-URL activity on + * high-traffic sites without having to store whole paths. 
+ * this differs from the base32 functions in that it includes the url parameters + * as well as the path + */ +static int smp_fetch_url32(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn = SMP_REQ_CHN(smp); + struct htx *htx = smp_prefetch_htx(smp, chn, NULL, 1); + struct http_hdr_ctx ctx; + struct htx_sl *sl; + struct ist path; + unsigned int hash = 0; + struct http_uri_parser parser; + + if (!htx) + return 0; + + ctx.blk = NULL; + if (http_find_header(htx, ist("Host"), &ctx, 1)) { + /* OK we have the header value in ctx.value */ + while (ctx.value.len--) + hash = *(ctx.value.ptr++) + (hash << 6) + (hash << 16) - hash; + } + + /* now retrieve the path */ + sl = http_get_stline(htx); + parser = http_uri_parser_init(htx_sl_req_uri(sl)); + path = http_parse_path(&parser); + if (path.len && *(path.ptr) == '/') { + while (path.len--) + hash = *(path.ptr++) + (hash << 6) + (hash << 16) - hash; + } + + hash = full_hash(hash); + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = hash; + smp->flags = SMP_F_VOL_1ST; + return 1; +} + +/* This concatenates the source address with the 32-bit hash of the Host and + * URL as returned by smp_fetch_base32(). The idea is to have per-source and + * per-url counters. The result is a binary block from 8 to 20 bytes depending + * on the source address length. The URL hash is stored before the address so + * that in environments where IPv6 is insignificant, truncating the output to + * 8 bytes would still work. + */ +static int smp_fetch_url32_src(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + const struct sockaddr_storage *src = (smp->strm ? sc_src(smp->strm->scf) : NULL); + struct buffer *temp; + + if (!src) + return 0; + + if (!smp_fetch_url32(args, smp, kw, private)) + return 0; + + temp = get_trash_chunk(); + *(unsigned int *) temp->area = htonl(smp->data.u.sint); + temp->data += sizeof(unsigned int); + + switch (src->ss_family) { + case AF_INET: + memcpy(temp->area + temp->data, + &((struct sockaddr_in *)src)->sin_addr, + 4); + temp->data += 4; + break; + case AF_INET6: + memcpy(temp->area + temp->data, + &((struct sockaddr_in6 *)src)->sin6_addr, + 16); + temp->data += 16; + break; + default: + return 0; + } + + smp->data.u.str = *temp; + smp->data.type = SMP_T_BIN; + return 1; +} + +/************************************************************************/ +/* Other utility functions */ +/************************************************************************/ + +/* This function is used to validate the arguments passed to any "hdr" fetch + * keyword. These keywords support an optional positive or negative occurrence + * number. We must ensure that the number is greater than -MAX_HDR_HISTORY. It + * is assumed that the types are already the correct ones. Returns 0 on error, + * non-zero if OK. If <err> is not NULL, it will be filled with a pointer to an + * error message in case of error, that the caller is responsible for freeing. + * The initial location must either be freeable or NULL. + * Note: this function's pointer is checked from Lua. + */ +int val_hdr(struct arg *arg, char **err_msg) +{ + if (arg && arg[1].type == ARGT_SINT && arg[1].data.sint < -MAX_HDR_HISTORY) { + memprintf(err_msg, "header occurrence must be >= %d", -MAX_HDR_HISTORY); + return 0; + } + return 1; +} + +/************************************************************************/ +/* All supported sample fetch keywords must be declared here. 
 */
+/************************************************************************/
+
+/* Note: must not be declared <const> as its list will be overwritten */
+static struct sample_fetch_kw_list sample_fetch_keywords = {ILH, {
+	{ "base", smp_fetch_base, 0, NULL, SMP_T_STR, SMP_USE_HRQHV },
+	{ "base32", smp_fetch_base32, 0, NULL, SMP_T_SINT, SMP_USE_HRQHV },
+	{ "base32+src", smp_fetch_base32_src, 0, NULL, SMP_T_BIN, SMP_USE_HRQHV },
+	{ "baseq", smp_fetch_base, 0, NULL, SMP_T_STR, SMP_USE_HRQHV },
+
+	/* captures are allocated and are permanent in the stream */
+	{ "capture.req.hdr", smp_fetch_capture_req_hdr, ARG1(1,SINT), NULL, SMP_T_STR, SMP_USE_HRQHP },
+
+	/* retrieve these captures from the HTTP logs */
+	{ "capture.req.method", smp_fetch_capture_req_method, 0, NULL, SMP_T_STR, SMP_USE_HRQHP },
+	{ "capture.req.uri", smp_fetch_capture_req_uri, 0, NULL, SMP_T_STR, SMP_USE_HRQHP },
+	{ "capture.req.ver", smp_fetch_capture_req_ver, 0, NULL, SMP_T_STR, SMP_USE_HRQHP },
+
+	{ "capture.res.hdr", smp_fetch_capture_res_hdr, ARG1(1,SINT), NULL, SMP_T_STR, SMP_USE_HRSHP },
+	{ "capture.res.ver", smp_fetch_capture_res_ver, 0, NULL, SMP_T_STR, SMP_USE_HRQHP },
+
+	/* cookie is valid in both directions (eg: for "stick ...") but cook*
+	 * are only here to match the ACL's name, are request-only and are used
+	 * for ACL compatibility only.
+	 */
+	{ "cook", smp_fetch_cookie, ARG1(0,STR), NULL, SMP_T_STR, SMP_USE_HRQHV },
+	{ "cookie", smp_fetch_chn_cookie, ARG1(0,STR), NULL, SMP_T_STR, SMP_USE_HRQHV|SMP_USE_HRSHV },
+	{ "cook_cnt", smp_fetch_cookie_cnt, ARG1(0,STR), NULL, SMP_T_SINT, SMP_USE_HRQHV },
+	{ "cook_val", smp_fetch_cookie_val, ARG1(0,STR), NULL, SMP_T_SINT, SMP_USE_HRQHV },
+
+	/* hdr is valid in both directions (eg: for "stick ...") but hdr_* are
+	 * only here to match the ACL's name, are request-only and are used for
+	 * ACL compatibility only.
+ */ + { "hdr", smp_fetch_chn_hdr, ARG2(0,STR,SINT), val_hdr, SMP_T_STR, SMP_USE_HRQHV|SMP_USE_HRSHV }, + { "hdr_cnt", smp_fetch_hdr_cnt, ARG1(0,STR), NULL, SMP_T_SINT, SMP_USE_HRQHV }, + { "hdr_ip", smp_fetch_hdr_ip, ARG2(0,STR,SINT), val_hdr, SMP_T_ADDR, SMP_USE_HRQHV }, + { "hdr_val", smp_fetch_hdr_val, ARG2(0,STR,SINT), val_hdr, SMP_T_SINT, SMP_USE_HRQHV }, + + { "http_auth_type", smp_fetch_http_auth_type, 0, NULL, SMP_T_STR, SMP_USE_HRQHV }, + { "http_auth_user", smp_fetch_http_auth_user, 0, NULL, SMP_T_STR, SMP_USE_HRQHV }, + { "http_auth_pass", smp_fetch_http_auth_pass, 0, NULL, SMP_T_STR, SMP_USE_HRQHV }, + { "http_auth_bearer", smp_fetch_http_auth_bearer, ARG1(0,STR), NULL, SMP_T_STR, SMP_USE_HRQHV }, + { "http_auth", smp_fetch_http_auth, ARG1(1,USR), NULL, SMP_T_BOOL, SMP_USE_HRQHV }, + { "http_auth_group", smp_fetch_http_auth_grp, ARG1(1,USR), NULL, SMP_T_STR, SMP_USE_HRQHV }, + { "http_first_req", smp_fetch_http_first_req, 0, NULL, SMP_T_BOOL, SMP_USE_HRQHP }, + { "method", smp_fetch_meth, 0, NULL, SMP_T_METH, SMP_USE_HRQHP }, + { "path", smp_fetch_path, 0, NULL, SMP_T_STR, SMP_USE_HRQHV }, + { "pathq", smp_fetch_path, 0, NULL, SMP_T_STR, SMP_USE_HRQHV }, + { "query", smp_fetch_query, 0, NULL, SMP_T_STR, SMP_USE_HRQHV }, + + /* HTTP protocol on the request path */ + { "req.proto_http", smp_fetch_proto_http, 0, NULL, SMP_T_BOOL, SMP_USE_HRQHP }, + { "req_proto_http", smp_fetch_proto_http, 0, NULL, SMP_T_BOOL, SMP_USE_HRQHP }, + + /* HTTP version on the request path */ + { "req.ver", smp_fetch_rqver, 0, NULL, SMP_T_STR, SMP_USE_HRQHV }, + { "req_ver", smp_fetch_rqver, 0, NULL, SMP_T_STR, SMP_USE_HRQHV }, + + { "req.body", smp_fetch_body, 0, NULL, SMP_T_BIN, SMP_USE_HRQHV }, + { "req.body_len", smp_fetch_body_len, 0, NULL, SMP_T_SINT, SMP_USE_HRQHV }, + { "req.body_size", smp_fetch_body_size, 0, NULL, SMP_T_SINT, SMP_USE_HRQHV }, + { "req.body_param", smp_fetch_body_param, ARG2(0,STR,STR), NULL, SMP_T_BIN, SMP_USE_HRQHV }, + + { "req.hdrs", smp_fetch_hdrs, 0, NULL, SMP_T_BIN, SMP_USE_HRQHV }, + { "req.hdrs_bin", smp_fetch_hdrs_bin, 0, NULL, SMP_T_BIN, SMP_USE_HRQHV }, + + /* HTTP version on the response path */ + { "res.ver", smp_fetch_stver, 0, NULL, SMP_T_STR, SMP_USE_HRSHV }, + { "resp_ver", smp_fetch_stver, 0, NULL, SMP_T_STR, SMP_USE_HRSHV }, + + { "res.body", smp_fetch_body, 0, NULL, SMP_T_BIN, SMP_USE_HRSHV }, + { "res.body_len", smp_fetch_body_len, 0, NULL, SMP_T_SINT, SMP_USE_HRSHV }, + { "res.body_size", smp_fetch_body_size, 0, NULL, SMP_T_SINT, SMP_USE_HRSHV }, + + { "res.hdrs", smp_fetch_hdrs, 0, NULL, SMP_T_BIN, SMP_USE_HRSHV }, + { "res.hdrs_bin", smp_fetch_hdrs_bin, 0, NULL, SMP_T_BIN, SMP_USE_HRSHV }, + + /* explicit req.{cook,hdr} are used to force the fetch direction to be request-only */ + { "req.cook", smp_fetch_cookie, ARG1(0,STR), NULL, SMP_T_STR, SMP_USE_HRQHV }, + { "req.cook_cnt", smp_fetch_cookie_cnt, ARG1(0,STR), NULL, SMP_T_SINT, SMP_USE_HRQHV }, + { "req.cook_val", smp_fetch_cookie_val, ARG1(0,STR), NULL, SMP_T_SINT, SMP_USE_HRQHV }, + { "req.cook_names", smp_fetch_cookie_names, ARG1(0,STR), NULL, SMP_T_STR, SMP_USE_HRQHV }, + + { "req.fhdr", smp_fetch_fhdr, ARG2(0,STR,SINT), val_hdr, SMP_T_STR, SMP_USE_HRQHV }, + { "req.fhdr_cnt", smp_fetch_fhdr_cnt, ARG1(0,STR), NULL, SMP_T_SINT, SMP_USE_HRQHV }, + { "req.hdr", smp_fetch_hdr, ARG2(0,STR,SINT), val_hdr, SMP_T_STR, SMP_USE_HRQHV }, + { "req.hdr_cnt", smp_fetch_hdr_cnt, ARG1(0,STR), NULL, SMP_T_SINT, SMP_USE_HRQHV }, + { "req.hdr_ip", smp_fetch_hdr_ip, ARG2(0,STR,SINT), val_hdr, SMP_T_ADDR, 
SMP_USE_HRQHV },
+	{ "req.hdr_names", smp_fetch_hdr_names, ARG1(0,STR), NULL, SMP_T_STR, SMP_USE_HRQHV },
+	{ "req.hdr_val", smp_fetch_hdr_val, ARG2(0,STR,SINT), val_hdr, SMP_T_SINT, SMP_USE_HRQHV },
+
+	/* explicit res.{cook,hdr} are used to force the fetch direction to be response-only */
+	{ "res.cook", smp_fetch_cookie, ARG1(0,STR), NULL, SMP_T_STR, SMP_USE_HRSHV },
+	{ "res.cook_cnt", smp_fetch_cookie_cnt, ARG1(0,STR), NULL, SMP_T_SINT, SMP_USE_HRSHV },
+	{ "res.cook_val", smp_fetch_cookie_val, ARG1(0,STR), NULL, SMP_T_SINT, SMP_USE_HRSHV },
+	{ "res.cook_names", smp_fetch_cookie_names, ARG1(0,STR), NULL, SMP_T_STR, SMP_USE_HRSHV },
+
+	{ "res.fhdr", smp_fetch_fhdr, ARG2(0,STR,SINT), val_hdr, SMP_T_STR, SMP_USE_HRSHV },
+	{ "res.fhdr_cnt", smp_fetch_fhdr_cnt, ARG1(0,STR), NULL, SMP_T_SINT, SMP_USE_HRSHV },
+	{ "res.hdr", smp_fetch_hdr, ARG2(0,STR,SINT), val_hdr, SMP_T_STR, SMP_USE_HRSHV },
+	{ "res.hdr_cnt", smp_fetch_hdr_cnt, ARG1(0,STR), NULL, SMP_T_SINT, SMP_USE_HRSHV },
+	{ "res.hdr_ip", smp_fetch_hdr_ip, ARG2(0,STR,SINT), val_hdr, SMP_T_ADDR, SMP_USE_HRSHV },
+	{ "res.hdr_names", smp_fetch_hdr_names, ARG1(0,STR), NULL, SMP_T_STR, SMP_USE_HRSHV },
+	{ "res.hdr_val", smp_fetch_hdr_val, ARG2(0,STR,SINT), val_hdr, SMP_T_SINT, SMP_USE_HRSHV },
+
+	{ "server_status", smp_fetch_srv_status, 0, NULL, SMP_T_SINT, SMP_USE_HRSHP },
+
+	/* scook is valid only on the response and is used for ACL compatibility */
+	{ "scook", smp_fetch_cookie, ARG1(0,STR), NULL, SMP_T_STR, SMP_USE_HRSHV },
+	{ "scook_cnt", smp_fetch_cookie_cnt, ARG1(0,STR), NULL, SMP_T_SINT, SMP_USE_HRSHV },
+	{ "scook_val", smp_fetch_cookie_val, ARG1(0,STR), NULL, SMP_T_SINT, SMP_USE_HRSHV },
+
+	/* shdr is valid only on the response and is used for ACL compatibility */
+	{ "shdr", smp_fetch_hdr, ARG2(0,STR,SINT), val_hdr, SMP_T_STR, SMP_USE_HRSHV },
+	{ "shdr_cnt", smp_fetch_hdr_cnt, ARG1(0,STR), NULL, SMP_T_SINT, SMP_USE_HRSHV },
+	{ "shdr_ip", smp_fetch_hdr_ip, ARG2(0,STR,SINT), val_hdr, SMP_T_ADDR, SMP_USE_HRSHV },
+	{ "shdr_val", smp_fetch_hdr_val, ARG2(0,STR,SINT), val_hdr, SMP_T_SINT, SMP_USE_HRSHV },
+
+	{ "status", smp_fetch_stcode, 0, NULL, SMP_T_SINT, SMP_USE_HRSHP },
+	{ "txn.status", smp_fetch_srv_status, 0, NULL, SMP_T_SINT, SMP_USE_HRSHP },
+	{ "unique-id", smp_fetch_uniqueid, 0, NULL, SMP_T_STR, SMP_SRC_L4SRV },
+	{ "url", smp_fetch_url, 0, NULL, SMP_T_STR, SMP_USE_HRQHV },
+	{ "url32", smp_fetch_url32, 0, NULL, SMP_T_SINT, SMP_USE_HRQHV },
+	{ "url32+src", smp_fetch_url32_src, 0, NULL, SMP_T_BIN, SMP_USE_HRQHV },
+	{ "url_ip", smp_fetch_url_ip, 0, NULL, SMP_T_IPV4, SMP_USE_HRQHV },
+	{ "url_port", smp_fetch_url_port, 0, NULL, SMP_T_SINT, SMP_USE_HRQHV },
+	{ "url_param", smp_fetch_url_param, ARG3(0,STR,STR,STR), NULL, SMP_T_STR, SMP_USE_HRQHV },
+	{ "urlp", smp_fetch_url_param, ARG3(0,STR,STR,STR), NULL, SMP_T_STR, SMP_USE_HRQHV },
+	{ "urlp_val", smp_fetch_url_param_val, ARG3(0,STR,STR,STR), NULL, SMP_T_SINT, SMP_USE_HRQHV },
+
+	{ /* END */ },
+}};
+
+INITCALL1(STG_REGISTER, sample_register_fetches, &sample_fetch_keywords);
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
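Registering a new fetch keyword follows exactly the pattern of the list above. A minimal sketch (the "req.answer" keyword and smp_fetch_answer() are hypothetical names, not part of this import):

static int smp_fetch_answer(const struct arg *args, struct sample *smp,
                            const char *kw, void *private)
{
	/* constant integer sample, usable wherever SMP_USE_HRQHP applies */
	smp->data.type = SMP_T_SINT;
	smp->data.u.sint = 42;
	return 1;
}

static struct sample_fetch_kw_list answer_fetch_keywords = {ILH, {
	{ "req.answer", smp_fetch_answer, 0, NULL, SMP_T_SINT, SMP_USE_HRQHP },
	{ /* END */ },
}};

INITCALL1(STG_REGISTER, sample_register_fetches, &answer_fetch_keywords);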
diff --git a/src/http_htx.c b/src/http_htx.c
new file mode 100644
index 0000000..004d343
--- /dev/null
+++ b/src/http_htx.c
@@ -0,0 +1,3028 @@
+/*
+ * Functions to manipulate HTTP messages using the internal representation.
+ *
+ * Copyright (C) 2018 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <haproxy/api.h>
+#include <haproxy/arg.h>
+#include <haproxy/cfgparse.h>
+#include <haproxy/global.h>
+#include <haproxy/h1.h>
+#include <haproxy/http.h>
+#include <haproxy/http-hdr.h>
+#include <haproxy/http_fetch.h>
+#include <haproxy/http_htx.h>
+#include <haproxy/htx.h>
+#include <haproxy/log.h>
+#include <haproxy/regex.h>
+#include <haproxy/sample.h>
+#include <haproxy/tools.h>
+
+
+struct buffer http_err_chunks[HTTP_ERR_SIZE];
+struct http_reply http_err_replies[HTTP_ERR_SIZE];
+
+struct eb_root http_error_messages = EB_ROOT;
+struct list http_errors_list = LIST_HEAD_INIT(http_errors_list);
+struct list http_replies_list = LIST_HEAD_INIT(http_replies_list);
+
+/* The declaration of the errorfiles/errorfile directives. Used during config
+ * parsing only. */
+struct conf_errors {
+	char type;                                  /* directive type (0: errorfiles, 1: errorfile) */
+	union {
+		struct {
+			int status;                 /* the status code associated to this error */
+			struct http_reply *reply;   /* the http reply for the errorfile */
+		} errorfile;                        /* describe an "errorfile" directive */
+		struct {
+			char *name;                 /* the http-errors section name */
+			char status[HTTP_ERR_SIZE]; /* list of status to import (0: ignore, 1: implicit import, 2: explicit import) */
+		} errorfiles;                       /* describe an "errorfiles" directive */
+	} info;
+
+	char *file;                                 /* file where the directive appears */
+	int line;                                   /* line where the directive appears */
+
+	struct list list;                           /* next conf_errors */
+};
+
+/* Returns the next unprocessed start line in the HTX message. It returns NULL
+ * if the start-line is undefined (first == -1). Otherwise, it returns a
+ * pointer to the htx_sl structure.
+ */
+struct htx_sl *http_get_stline(const struct htx *htx)
+{
+	struct htx_blk *blk;
+
+	blk = htx_get_first_blk(htx);
+	if (!blk || (htx_get_blk_type(blk) != HTX_BLK_REQ_SL && htx_get_blk_type(blk) != HTX_BLK_RES_SL))
+		return NULL;
+	return htx_get_blk_ptr(htx, blk);
+}
+
+/* Returns the total size of the headers in the HTX message */
+size_t http_get_hdrs_size(struct htx *htx)
+{
+	struct htx_blk *blk;
+	size_t sz = 0;
+
+	blk = htx_get_first_blk(htx);
+	if (!blk || htx_get_blk_type(blk) > HTX_BLK_EOH)
+		return sz;
+
+	for (; blk; blk = htx_get_next_blk(htx, blk)) {
+		sz += htx_get_blksz(blk);
+		if (htx_get_blk_type(blk) == HTX_BLK_EOH)
+			break;
+	}
+	return sz;
+}
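A minimal usage sketch for the two helpers above, assuming <htx> already holds a parsed request; dump_req_info() is a hypothetical caller and printf() stands in for real output:

#include <stdio.h>

static void dump_req_info(struct htx *htx)
{
	struct htx_sl *sl = http_get_stline(htx);

	if (!sl)
		return;
	/* e.g. "method=GET hdrs=312 bytes" */
	printf("method=%.*s hdrs=%zu bytes\n",
	       (int)HTX_SL_REQ_MLEN(sl), HTX_SL_REQ_MPTR(sl),
	       http_get_hdrs_size(htx));
}

+
+/* Finds the first or next occurrence of header matching <pattern> in the HTX
+ * message <htx> using the context <ctx>. This structure holds everything
+ * necessary to use the header and find the next occurrence. If its <blk> member
+ * is NULL, the header is searched from the beginning. Otherwise, the next
+ * occurrence is returned. The function returns 1 when it finds a value, and 0
+ * when there is no more. It is designed to work with headers defined as
+ * comma-separated lists. If the HTTP_FIND_FL_FULL flag is set, it works on
+ * full-line headers in which the comma is not a delimiter but part of the
+ * syntax. A special case: if ctx->value is NULL when searching for a new value
+ * of a header, the current header is rescanned.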
This allows rescanning after a + * header deletion. + * + * The matching method is chosen by checking the flags : + * + * * HTTP_FIND_FL_MATCH_REG : <pattern> is a regex. header names matching + * the regex are evaluated. + * * HTTP_FIND_FL_MATCH_STR : <pattern> is a string. The header names equal + * to the string are evaluated. + * * HTTP_FIND_FL_MATCH_PFX : <pattern> is a string. The header names + * starting by the string are evaluated. + * * HTTP_FIND_FL_MATCH_SFX : <pattern> is a string. The header names + * ending by the string are evaluated. + * * HTTP_FIND_FL_MATCH_SUB : <pattern> is a string. The header names + * containing the string are evaluated. + */ + +#define HTTP_FIND_FL_MATCH_STR 0x0001 +#define HTTP_FIND_FL_MATCH_PFX 0x0002 +#define HTTP_FIND_FL_MATCH_SFX 0x0003 +#define HTTP_FIND_FL_MATCH_SUB 0x0004 +#define HTTP_FIND_FL_MATCH_REG 0x0005 +/* 0x0006..0x000f: for other matching methods */ +#define HTTP_FIND_FL_MATCH_TYPE 0x000F +#define HTTP_FIND_FL_FULL 0x0010 + +static int __http_find_header(const struct htx *htx, const void *pattern, struct http_hdr_ctx *ctx, int flags) +{ + struct htx_blk *blk = ctx->blk; + struct ist n, v; + enum htx_blk_type type; + + if (blk) { + char *p; + + if (!isttest(ctx->value)) + goto rescan_hdr; + if (flags & HTTP_FIND_FL_FULL) + goto next_blk; + v = htx_get_blk_value(htx, blk); + p = istend(ctx->value) + ctx->lws_after; + v.len -= (p - v.ptr); + v.ptr = p; + if (!v.len) + goto next_blk; + /* Skip comma */ + if (*(v.ptr) == ',') { + v = istnext(v); + } + + goto return_hdr; + } + + if (htx_is_empty(htx)) + return 0; + + for (blk = htx_get_first_blk(htx); blk; blk = htx_get_next_blk(htx, blk)) { + rescan_hdr: + type = htx_get_blk_type(blk); + if (type == HTX_BLK_EOH) + break; + if (type != HTX_BLK_HDR) + continue; + + if ((flags & HTTP_FIND_FL_MATCH_TYPE) == HTTP_FIND_FL_MATCH_REG) { + const struct my_regex *re = pattern; + + n = htx_get_blk_name(htx, blk); + if (!regex_exec2(re, n.ptr, n.len)) + goto next_blk; + } + else { + const struct ist name = *(const struct ist *)(pattern); + + /* If no name was passed, we want any header. 
So skip the comparison */
+			if (!istlen(name))
+				goto match;
+
+			n = htx_get_blk_name(htx, blk);
+			switch (flags & HTTP_FIND_FL_MATCH_TYPE) {
+			case HTTP_FIND_FL_MATCH_STR:
+				if (!isteqi(n, name))
+					goto next_blk;
+				break;
+			case HTTP_FIND_FL_MATCH_PFX:
+				if (istlen(n) < istlen(name))
+					goto next_blk;
+
+				n = ist2(istptr(n), istlen(name));
+				if (!isteqi(n, name))
+					goto next_blk;
+				break;
+			case HTTP_FIND_FL_MATCH_SFX:
+				if (istlen(n) < istlen(name))
+					goto next_blk;
+
+				n = ist2(istend(n) - istlen(name),
+					 istlen(name));
+				if (!isteqi(n, name))
+					goto next_blk;
+				break;
+			case HTTP_FIND_FL_MATCH_SUB:
+				if (!strnistr(n.ptr, n.len, name.ptr, name.len))
+					goto next_blk;
+				break;
+			default:
+				goto next_blk;
+				break;
+			}
+		}
+	  match:
+		v = htx_get_blk_value(htx, blk);
+
+	  return_hdr:
+		ctx->lws_before = 0;
+		ctx->lws_after = 0;
+		while (v.len && HTTP_IS_LWS(*v.ptr)) {
+			v = istnext(v);
+			ctx->lws_before++;
+		}
+		if (!(flags & HTTP_FIND_FL_FULL))
+			v.len = http_find_hdr_value_end(v.ptr, istend(v)) - v.ptr;
+
+		while (v.len && HTTP_IS_LWS(*(istend(v) - 1))) {
+			v.len--;
+			ctx->lws_after++;
+		}
+		ctx->blk = blk;
+		ctx->value = v;
+		return 1;
+
+	  next_blk:
+		;
+	}
+
+	ctx->blk = NULL;
+	ctx->value = ist("");
+	ctx->lws_before = ctx->lws_after = 0;
+	return 0;
+}
+
+
+/* Header names must match <name> */
+int http_find_header(const struct htx *htx, const struct ist name, struct http_hdr_ctx *ctx, int full)
+{
+	return __http_find_header(htx, &name, ctx, HTTP_FIND_FL_MATCH_STR | (full ? HTTP_FIND_FL_FULL : 0));
+}
+
+/* Header names must match <name>. Same as http_find_header(). */
+int http_find_str_header(const struct htx *htx, const struct ist name, struct http_hdr_ctx *ctx, int full)
+{
+	return __http_find_header(htx, &name, ctx, HTTP_FIND_FL_MATCH_STR | (full ? HTTP_FIND_FL_FULL : 0));
+}
+
+
+/* Header names must start with <prefix> */
+int http_find_pfx_header(const struct htx *htx, const struct ist prefix, struct http_hdr_ctx *ctx, int full)
+{
+	return __http_find_header(htx, &prefix, ctx, HTTP_FIND_FL_MATCH_PFX | (full ? HTTP_FIND_FL_FULL : 0));
+}
+
+/* Header names must end with <suffix> */
+int http_find_sfx_header(const struct htx *htx, const struct ist suffix, struct http_hdr_ctx *ctx, int full)
+{
+	return __http_find_header(htx, &suffix, ctx, HTTP_FIND_FL_MATCH_SFX | (full ? HTTP_FIND_FL_FULL : 0));
+}
+
+/* Header names must contain <sub> */
+int http_find_sub_header(const struct htx *htx, const struct ist sub, struct http_hdr_ctx *ctx, int full)
+{
+	return __http_find_header(htx, &sub, ctx, HTTP_FIND_FL_MATCH_SUB | (full ? HTTP_FIND_FL_FULL : 0));
+}
+
+/* Header names must match <re> regex */
+int http_match_header(const struct htx *htx, const struct my_regex *re, struct http_hdr_ctx *ctx, int full)
+{
+	return __http_find_header(htx, re, ctx, HTTP_FIND_FL_MATCH_REG | (full ? HTTP_FIND_FL_FULL : 0));
+}
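A minimal usage sketch for this lookup family, assuming an existing request message; list_accept_values() is a hypothetical caller, and printf() stands in for real output. Matching is case-insensitive, so ist("accept") also matches "Accept":

#include <stdio.h>

static void list_accept_values(const struct htx *htx)
{
	struct http_hdr_ctx ctx;

	/* blk == NULL means: start from the first header */
	ctx.blk = NULL;
	while (http_find_header(htx, ist("accept"), &ctx, 0))
		printf("value: %.*s\n", (int)ctx.value.len, ctx.value.ptr);
}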
+
+/* Adds a header block into the HTX message <htx>, just before the EOH block. It
+ * returns 1 on success, otherwise it returns 0.
+ */
+int http_add_header(struct htx *htx, const struct ist n, const struct ist v)
+{
+	struct htx_blk *blk;
+	struct htx_sl *sl;
+	enum htx_blk_type type = htx_get_tail_type(htx);
+	int32_t prev;
+
+	blk = htx_add_header(htx, n, v);
+	if (!blk)
+		goto fail;
+
+	if (unlikely(type < HTX_BLK_EOH))
+		goto end;
+
+	/* <blk> is the head, swap it iteratively with its predecessor to place
+	 * it just before the end-of-header block. So blocks remain ordered.
+	 */
+	for (prev = htx_get_prev(htx, htx->tail); prev != htx->first; prev = htx_get_prev(htx, prev)) {
+		struct htx_blk *pblk = htx_get_blk(htx, prev);
+		enum htx_blk_type type = htx_get_blk_type(pblk);
+
+		/* Swap .addr and .info fields */
+		blk->addr ^= pblk->addr; pblk->addr ^= blk->addr; blk->addr ^= pblk->addr;
+		blk->info ^= pblk->info; pblk->info ^= blk->info; blk->info ^= pblk->info;
+
+		if (blk->addr == pblk->addr)
+			blk->addr += htx_get_blksz(pblk);
+
+		/* Stop when end-of-header is reached */
+		if (type == HTX_BLK_EOH)
+			break;
+
+		blk = pblk;
+	}
+
+  end:
+	sl = http_get_stline(htx);
+	if (sl && (sl->flags & HTX_SL_F_HAS_AUTHORITY) && isteqi(n, ist("host"))) {
+		if (!http_update_authority(htx, sl, v))
+			goto fail;
+	}
+	return 1;
+
+  fail:
+	return 0;
+}
+
+/* Replaces parts of the start-line of the HTX message <htx>. It returns 1 on
+ * success, otherwise it returns 0.
+ */
+int http_replace_stline(struct htx *htx, const struct ist p1, const struct ist p2, const struct ist p3)
+{
+	struct htx_blk *blk;
+
+	blk = htx_get_first_blk(htx);
+	if (!blk || !htx_replace_stline(htx, blk, p1, p2, p3))
+		return 0;
+	return 1;
+}
+
+/* Replace the request method in the HTX message <htx> by <meth>. It returns 1
+ * on success, otherwise 0.
+ */
+int http_replace_req_meth(struct htx *htx, const struct ist meth)
+{
+	struct buffer *temp = get_trash_chunk();
+	struct htx_sl *sl = http_get_stline(htx);
+	struct ist uri, vsn;
+
+	if (!sl)
+		return 0;
+
+	/* Start by copying old uri and version */
+	chunk_memcat(temp, HTX_SL_REQ_UPTR(sl), HTX_SL_REQ_ULEN(sl)); /* uri */
+	uri = ist2(temp->area, HTX_SL_REQ_ULEN(sl));
+
+	chunk_memcat(temp, HTX_SL_REQ_VPTR(sl), HTX_SL_REQ_VLEN(sl)); /* vsn */
+	vsn = ist2(temp->area + uri.len, HTX_SL_REQ_VLEN(sl));
+
+	/* create the new start line */
+	sl->info.req.meth = find_http_meth(meth.ptr, meth.len);
+	return http_replace_stline(htx, meth, uri, vsn);
+}
+
+/* Replace the request uri in the HTX message <htx> by <uri>. It returns 1 on
+ * success, otherwise 0.
+ */
+int http_replace_req_uri(struct htx *htx, const struct ist uri)
+{
+	struct buffer *temp = get_trash_chunk();
+	struct htx_sl *sl = http_get_stline(htx);
+	struct ist meth, vsn;
+
+	if (!sl)
+		goto fail;
+
+	/* Start by copying old method and version */
+	chunk_memcat(temp, HTX_SL_REQ_MPTR(sl), HTX_SL_REQ_MLEN(sl)); /* meth */
+	meth = ist2(temp->area, HTX_SL_REQ_MLEN(sl));
+
+	chunk_memcat(temp, HTX_SL_REQ_VPTR(sl), HTX_SL_REQ_VLEN(sl)); /* vsn */
+	vsn = ist2(temp->area + meth.len, HTX_SL_REQ_VLEN(sl));
+
+	/* create the new start line */
+	if (!http_replace_stline(htx, meth, uri, vsn))
+		goto fail;
+
+	sl = http_get_stline(htx);
+	ALREADY_CHECKED(sl); /* the stline exists because http_replace_stline() succeeded */
+
+	sl->flags &= ~HTX_SL_F_NORMALIZED_URI;
+	if (!http_update_host(htx, sl, uri))
+		goto fail;
+
+	return 1;
+  fail:
+	return 0;
+}
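A minimal usage sketch; rewrite_uri() is a hypothetical caller and the path is just an example. For an origin-form target like this one, http_update_host() leaves the Host header untouched; with an absolute-form URI it would resynchronize it:

static int rewrite_uri(struct htx *htx)
{
	/* swap the request-target of the start line in place */
	return http_replace_req_uri(htx, ist("/new/path?x=1"));
}

+
+/* Replace the request path in the HTX message <htx> by <path>. The host part is
+ * preserved. If <with_qs> is set, the query string is evaluated as part of the
+ * path and replaced. Otherwise, it is preserved too. It returns 1 on success,
+ * otherwise 0.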
+ */ +int http_replace_req_path(struct htx *htx, const struct ist path, int with_qs) +{ + struct buffer *temp = get_trash_chunk(); + struct htx_sl *sl = http_get_stline(htx); + struct ist meth, uri, vsn, p; + size_t plen = 0; + struct http_uri_parser parser; + + if (!sl) + return 0; + + uri = htx_sl_req_uri(sl); + parser = http_uri_parser_init(uri); + p = http_parse_path(&parser); + if (!isttest(p)) + p = uri; + if (with_qs) + plen = p.len; + else { + while (plen < p.len && *(p.ptr + plen) != '?') + plen++; + } + + /* Start by copying old method and version and create the new uri */ + chunk_memcat(temp, HTX_SL_REQ_MPTR(sl), HTX_SL_REQ_MLEN(sl)); /* meth */ + meth = ist2(temp->area, HTX_SL_REQ_MLEN(sl)); + + chunk_memcat(temp, HTX_SL_REQ_VPTR(sl), HTX_SL_REQ_VLEN(sl)); /* vsn */ + vsn = ist2(temp->area + meth.len, HTX_SL_REQ_VLEN(sl)); + + chunk_memcat(temp, uri.ptr, p.ptr - uri.ptr); /* uri: host part */ + chunk_istcat(temp, path); /* uri: new path */ + chunk_memcat(temp, p.ptr + plen, p.len - plen); /* uri: QS part */ + uri = ist2(temp->area + meth.len + vsn.len, uri.len - plen + path.len); + + /* create the new start line */ + return http_replace_stline(htx, meth, uri, vsn); +} + +/* Replace the request query-string in the HTX message <htx> by <query>. The + * host part and the path are preserved. It returns 1 on success, otherwise + * 0. + */ +int http_replace_req_query(struct htx *htx, const struct ist query) +{ + struct buffer *temp = get_trash_chunk(); + struct htx_sl *sl = http_get_stline(htx); + struct ist meth, uri, vsn, q; + int offset = 1; + + if (!sl) + return 0; + + uri = htx_sl_req_uri(sl); + q = uri; + while (q.len > 0 && *(q.ptr) != '?') { + q = istnext(q); + } + + /* skip the question mark or indicate that we must insert it + * (but only if the format string is not empty then). + */ + if (q.len) { + q = istnext(q); + } + else if (query.len > 1) + offset = 0; + + /* Start by copying old method and version and create the new uri */ + chunk_memcat(temp, HTX_SL_REQ_MPTR(sl), HTX_SL_REQ_MLEN(sl)); /* meth */ + meth = ist2(temp->area, HTX_SL_REQ_MLEN(sl)); + + chunk_memcat(temp, HTX_SL_REQ_VPTR(sl), HTX_SL_REQ_VLEN(sl)); /* vsn */ + vsn = ist2(temp->area + meth.len, HTX_SL_REQ_VLEN(sl)); + + chunk_memcat(temp, uri.ptr, q.ptr - uri.ptr); /* uri: host + path part */ + chunk_memcat(temp, query.ptr + offset, query.len - offset); /* uri: new QS */ + uri = ist2(temp->area + meth.len + vsn.len, uri.len - q.len + query.len - offset); + + /* create the new start line */ + return http_replace_stline(htx, meth, uri, vsn); +} + +/* Replace the response status in the HTX message <htx> by <status>. It returns + * 1 on success, otherwise 0. +*/ +int http_replace_res_status(struct htx *htx, const struct ist status, const struct ist reason) +{ + struct buffer *temp = get_trash_chunk(); + struct htx_sl *sl = http_get_stline(htx); + struct ist vsn, r; + + if (!sl) + return 0; + + /* Start by copying old uri and version */ + chunk_memcat(temp, HTX_SL_RES_VPTR(sl), HTX_SL_RES_VLEN(sl)); /* vsn */ + vsn = ist2(temp->area, HTX_SL_RES_VLEN(sl)); + r = reason; + if (!isttest(r)) { + chunk_memcat(temp, HTX_SL_RES_RPTR(sl), HTX_SL_RES_RLEN(sl)); /* reason */ + r = ist2(temp->area + vsn.len, HTX_SL_RES_RLEN(sl)); + } + + /* create the new start line */ + sl->info.res.status = strl2ui(status.ptr, status.len); + return http_replace_stline(htx, vsn, status, r); +} + +/* Replace the response reason in the HTX message <htx> by <reason>. It returns + * 1 on success, otherwise 0. 
+*/ +int http_replace_res_reason(struct htx *htx, const struct ist reason) +{ + struct buffer *temp = get_trash_chunk(); + struct htx_sl *sl = http_get_stline(htx); + struct ist vsn, status; + + if (!sl) + return 0; + + /* Start by copying old uri and version */ + chunk_memcat(temp, HTX_SL_RES_VPTR(sl), HTX_SL_RES_VLEN(sl)); /* vsn */ + vsn = ist2(temp->area, HTX_SL_RES_VLEN(sl)); + + chunk_memcat(temp, HTX_SL_RES_CPTR(sl), HTX_SL_RES_CLEN(sl)); /* code */ + status = ist2(temp->area + vsn.len, HTX_SL_RES_CLEN(sl)); + + /* create the new start line */ + return http_replace_stline(htx, vsn, status, reason); +} + +/* Append new value <data> after <ctx> value in header + * if header is not empty (at least one value exists): + * - ',' delimiter is added before <data> is appended + * - <ctx> must be valid and must point to an existing value, + * else it is an error and prepend_value should be used instead. + * + * ctx is updated to point to new value + * + * Returns 1 on success and 0 on failure. + */ +int http_append_header_value(struct htx *htx, struct http_hdr_ctx *ctx, const struct ist data) +{ + char *start; + struct htx_blk *blk = ctx->blk; + struct ist v; + uint32_t off = 0; + + if (!blk) + goto fail; + + v = htx_get_blk_value(htx, blk); + + if (!istlen(v)) { + start = v.ptr; + goto empty; /* header is empty, append without ',' */ + } + if (unlikely(!istlen(ctx->value))) + goto fail; /* invalid: value is empty, not supported */ + + start = istend(ctx->value) + ctx->lws_after; + off = start - v.ptr; + + blk = htx_replace_blk_value(htx, blk, ist2(start, 0), ist(",")); + if (!blk) + goto fail; + off += 1; /* add 1 for ',' */ + v = htx_get_blk_value(htx, blk); + start = v.ptr + off; + + empty: + blk = htx_replace_blk_value(htx, blk, ist2(start, 0), data); + if (!blk) + goto fail; + v = htx_get_blk_value(htx, blk); + + ctx->blk = blk; + ctx->value = ist2(v.ptr + off, data.len); + ctx->lws_before = ctx->lws_after = 0; + + return 1; + fail: + return 0; +} + +/* Prepend new value <data> before <ctx> value in header + * if <ctx> is not first value (at least one value exists): + * - ',' delimiter is added after <data> is prepended + * + * ctx is updated to point to new value + * + * Returns 1 on success and 0 on failure. + */ +int http_prepend_header_value(struct htx *htx, struct http_hdr_ctx *ctx, const struct ist data) +{ + char *start; + struct htx_blk *blk = ctx->blk; + struct ist v; + uint32_t off = 0; + uint8_t first; + + if (!blk) + goto fail; + + v = htx_get_blk_value(htx, blk); + + first = !istlen(v); + start = first ? v.ptr : istptr(ctx->value) - ctx->lws_before; + + if (unlikely(!istlen(ctx->value))) + goto fail; /* invalid: value is empty, not supported */ + + off = start - v.ptr; + + blk = htx_replace_blk_value(htx, blk, ist2(start, 0), data); + if (!blk) + goto fail; + v = htx_get_blk_value(htx, blk); + + if (first) + goto end; /* header is empty, don't append ',' */ + + start = v.ptr + off + data.len; + + blk = htx_replace_blk_value(htx, blk, ist2(start, 0), ist(",")); + if (!blk) + goto fail; + v = htx_get_blk_value(htx, blk); + + end: + ctx->blk = blk; + ctx->value = ist2(v.ptr + off, data.len); + ctx->lws_before = ctx->lws_after = 0; + + return 1; + fail: + return 0; +} + +/* Replaces a part of a header value referenced in the context <ctx> by + * <data>. It returns 1 on success, otherwise it returns 0. The context is + * updated if necessary. 
+ */ +int http_replace_header_value(struct htx *htx, struct http_hdr_ctx *ctx, const struct ist data) +{ + struct htx_blk *blk = ctx->blk; + struct htx_sl *sl; + char *start; + struct ist v; + uint32_t len, off; + + if (!blk) + goto fail; + + v = htx_get_blk_value(htx, blk); + start = ctx->value.ptr - ctx->lws_before; + len = ctx->lws_before + ctx->value.len + ctx->lws_after; + off = start - v.ptr; + + blk = htx_replace_blk_value(htx, blk, ist2(start, len), data); + if (!blk) + goto fail; + + v = htx_get_blk_value(htx, blk); + + sl = http_get_stline(htx); + if (sl && (sl->flags & HTX_SL_F_HAS_AUTHORITY)) { + struct ist n = htx_get_blk_name(htx, blk); + + if (isteq(n, ist("host"))) { + if (!http_update_authority(htx, sl, v)) + goto fail; + ctx->blk = NULL; + http_find_header(htx, ist("host"), ctx, 1); + blk = ctx->blk; + v = htx_get_blk_value(htx, blk); + } + } + + ctx->blk = blk; + ctx->value = ist2(v.ptr + off, data.len); + ctx->lws_before = ctx->lws_after = 0; + + return 1; + fail: + return 0; +} + +/* Fully replaces a header referenced in the context <ctx> by the name <name> + * with the value <value>. It returns 1 on success, otherwise it returns 0. The + * context is updated if necessary. + */ +int http_replace_header(struct htx *htx, struct http_hdr_ctx *ctx, + const struct ist name, const struct ist value) +{ + struct htx_blk *blk = ctx->blk; + struct htx_sl *sl; + + if (!blk) + goto fail; + + blk = htx_replace_header(htx, blk, name, value); + if (!blk) + goto fail; + + sl = http_get_stline(htx); + if (sl && (sl->flags & HTX_SL_F_HAS_AUTHORITY) && isteqi(name, ist("host"))) { + if (!http_update_authority(htx, sl, value)) + goto fail; + ctx->blk = NULL; + http_find_header(htx, ist("host"), ctx, 1); + blk = ctx->blk; + } + + ctx->blk = blk; + ctx->value = ist(NULL); + ctx->lws_before = ctx->lws_after = 0; + + return 1; + fail: + return 0; +} + +/* Remove one value of a header. This only works on a <ctx> returned by + * http_find_header function. The value is removed, as well as surrounding commas + * if any. If the removed value was alone, the whole header is removed. The + * <ctx> is always updated accordingly, as well as the HTX message <htx>. It + * returns 1 on success. Otherwise, it returns 0. The <ctx> is always left in a + * form that can be handled by http_find_header() to find next occurrence. + */ +int http_remove_header(struct htx *htx, struct http_hdr_ctx *ctx) +{ + struct htx_blk *blk = ctx->blk; + char *start; + struct ist v; + uint32_t len; + + if (!blk) + return 0; + + start = ctx->value.ptr - ctx->lws_before; + len = ctx->lws_before + ctx->value.len + ctx->lws_after; + + v = htx_get_blk_value(htx, blk); + if (len == v.len) { + blk = htx_remove_blk(htx, blk); + if (blk || htx_is_empty(htx)) { + ctx->blk = blk; + ctx->value = IST_NULL; + ctx->lws_before = ctx->lws_after = 0; + } + else { + ctx->blk = htx_get_blk(htx, htx->tail); + ctx->value = htx_get_blk_value(htx, ctx->blk); + ctx->lws_before = ctx->lws_after = 0; + } + return 1; + } + + /* This was not the only value of this header. We have to remove the + * part pointed by ctx->value. If it is the last entry of the list, we + * remove the last separator. + */ + if (start == v.ptr) { + /* It's the first header part but not the only one. So remove + * the comma after it. */ + len++; + } + else { + /* There is at least one header part before the removed one. So + * remove the comma between them. 
 */
+		start--;
+		len++;
+	}
+	/* Update the block content and its len */
+	memmove(start, start+len, v.len-len);
+	htx_change_blk_value_len(htx, blk, v.len-len);
+
+	/* Finally update the ctx */
+	ctx->value = ist2(start, 0);
+	ctx->lws_before = ctx->lws_after = 0;
+
+	return 1;
+}
+
+/* Updates the authority part of the uri with the value <host>. It happens when
+ * the header host is modified. It returns 0 on failure and 1 on success. It is
+ * the caller's responsibility to provide the start-line and to be sure the uri
+ * contains an authority. Thus, if no authority is found in the uri, an error is
+ * returned.
+ */
+int http_update_authority(struct htx *htx, struct htx_sl *sl, const struct ist host)
+{
+	struct buffer *temp = get_trash_chunk();
+	struct ist meth, vsn, uri, authority;
+	struct http_uri_parser parser;
+
+	uri = htx_sl_req_uri(sl);
+	parser = http_uri_parser_init(uri);
+	authority = http_parse_authority(&parser, 1);
+	if (!authority.len)
+		return 0;
+
+	/* Don't update the uri if there is no change */
+	if (isteq(host, authority))
+		return 1;
+
+	/* Start by copying old method and version */
+	chunk_memcat(temp, HTX_SL_REQ_MPTR(sl), HTX_SL_REQ_MLEN(sl)); /* meth */
+	meth = ist2(temp->area, HTX_SL_REQ_MLEN(sl));
+
+	chunk_memcat(temp, HTX_SL_REQ_VPTR(sl), HTX_SL_REQ_VLEN(sl)); /* vsn */
+	vsn = ist2(temp->area + meth.len, HTX_SL_REQ_VLEN(sl));
+
+	chunk_memcat(temp, uri.ptr, authority.ptr - uri.ptr);
+	chunk_istcat(temp, host);
+	chunk_memcat(temp, istend(authority), istend(uri) - istend(authority));
+	uri = ist2(temp->area + meth.len + vsn.len, host.len + uri.len - authority.len); /* uri */
+
+	return http_replace_stline(htx, meth, uri, vsn);
+
+}
+
+/* Update the header host by extracting the authority of the uri <uri>. Flags of
+ * the start-line are also updated accordingly. For origin-form and asterisk-form
+ * uri, the header host is not changed and the flag HTX_SL_F_HAS_AUTHORITY is
+ * removed from the flags of the start-line. Otherwise, this flag is set and the
+ * authority is used to set the value of the header host. This function returns
+ * 0 on failure and 1 on success.
+*/
+int http_update_host(struct htx *htx, struct htx_sl *sl, const struct ist uri)
+{
+	struct ist authority;
+	struct http_hdr_ctx ctx;
+	struct http_uri_parser parser = http_uri_parser_init(uri);
+
+	if (parser.format == URI_PARSER_FORMAT_EMPTY ||
+	    parser.format == URI_PARSER_FORMAT_ASTERISK ||
+	    parser.format == URI_PARSER_FORMAT_ABSPATH) {
+		sl->flags &= ~HTX_SL_F_HAS_AUTHORITY;
+	}
+	else {
+		sl->flags |= HTX_SL_F_HAS_AUTHORITY;
+		if (sl->info.req.meth != HTTP_METH_CONNECT) {
+			// absolute-form (RFC7230 #5.3.2)
+			sl->flags |= HTX_SL_F_HAS_SCHM;
+			if (uri.len > 4 && (uri.ptr[0] | 0x20) == 'h')
+				sl->flags |= ((uri.ptr[4] == ':') ? HTX_SL_F_SCHM_HTTP : HTX_SL_F_SCHM_HTTPS);
+
+			authority = http_parse_authority(&parser, 1);
+			if (!authority.len)
+				goto fail;
+		}
+		else {
+			// authority-form (RFC7230 #5.3.3)
+			authority = uri;
+		}
+
+		/* Replace header host value */
+		ctx.blk = NULL;
+		while (http_find_header(htx, ist("host"), &ctx, 1)) {
+			if (!http_replace_header_value(htx, &ctx, authority))
+				goto fail;
+		}
+
+	}
+	return 1;
+  fail:
+	return 0;
+}
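A minimal usage sketch; set_host() is a hypothetical caller, and per the comment above, the request URI must already carry an authority for the call to succeed:

static int set_host(struct htx *htx, const struct ist host)
{
	struct htx_sl *sl = http_get_stline(htx);

	if (!sl)
		return 0;
	/* rewrite the authority of an absolute-form URI to <host> */
	return http_update_authority(htx, sl, host);
}

+
+/* Return in <vptr> and <vlen> the pointer and length of occurrence <occ> of
+ * header whose name is <hname> of length <hlen>. If <ctx> is null, lookup is
+ * performed over the whole headers. Otherwise it must contain a valid header
+ * context, initialised with ctx->blk=NULL for the first lookup in a series.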
If + * <occ> is positive or null, occurrence #occ from the beginning (or last ctx) + * is returned. Occ #0 and #1 are equivalent. If <occ> is negative (and no less + * than -MAX_HDR_HISTORY), the occurrence is counted from the last one which is + * -1. The value fetch stops at commas, so this function is suited for use with + * list headers. + * The return value is 0 if nothing was found, or non-zero otherwise. + */ +unsigned int http_get_htx_hdr(const struct htx *htx, const struct ist hdr, + int occ, struct http_hdr_ctx *ctx, char **vptr, size_t *vlen) +{ + struct http_hdr_ctx local_ctx; + struct ist val_hist[MAX_HDR_HISTORY]; + unsigned int hist_idx; + int found; + + if (!ctx) { + local_ctx.blk = NULL; + ctx = &local_ctx; + } + + if (occ >= 0) { + /* search from the beginning */ + while (http_find_header(htx, hdr, ctx, 0)) { + occ--; + if (occ <= 0) { + *vptr = ctx->value.ptr; + *vlen = ctx->value.len; + return 1; + } + } + return 0; + } + + /* negative occurrence, we scan all the list then walk back */ + if (-occ > MAX_HDR_HISTORY) + return 0; + + found = hist_idx = 0; + while (http_find_header(htx, hdr, ctx, 0)) { + val_hist[hist_idx] = ctx->value; + if (++hist_idx >= MAX_HDR_HISTORY) + hist_idx = 0; + found++; + } + if (-occ > found) + return 0; + + /* OK now we have the last occurrence in [hist_idx-1], and we need to + * find occurrence -occ. 0 <= hist_idx < MAX_HDR_HISTORY, and we have + * -10 <= occ <= -1. So we have to check [hist_idx%MAX_HDR_HISTORY+occ] + * to remain in the 0..9 range. + */ + hist_idx += occ + MAX_HDR_HISTORY; + if (hist_idx >= MAX_HDR_HISTORY) + hist_idx -= MAX_HDR_HISTORY; + *vptr = val_hist[hist_idx].ptr; + *vlen = val_hist[hist_idx].len; + return 1; +} + +/* Return in <vptr> and <vlen> the pointer and length of occurrence <occ> of + * header whose name is <hname> of length <hlen>. If <ctx> is null, lookup is + * performed over the whole headers. Otherwise it must contain a valid header + * context, initialised with ctx->blk=NULL for the first lookup in a series. If + * <occ> is positive or null, occurrence #occ from the beginning (or last ctx) + * is returned. Occ #0 and #1 are equivalent. If <occ> is negative (and no less + * than -MAX_HDR_HISTORY), the occurrence is counted from the last one which is + * -1. This function differs from http_get_hdr() in that it only returns full + * line header values and does not stop at commas. + * The return value is 0 if nothing was found, or non-zero otherwise. + */ +unsigned int http_get_htx_fhdr(const struct htx *htx, const struct ist hdr, + int occ, struct http_hdr_ctx *ctx, char **vptr, size_t *vlen) +{ + struct http_hdr_ctx local_ctx; + struct ist val_hist[MAX_HDR_HISTORY]; + unsigned int hist_idx; + int found; + + if (!ctx) { + local_ctx.blk = NULL; + ctx = &local_ctx; + } + + if (occ >= 0) { + /* search from the beginning */ + while (http_find_header(htx, hdr, ctx, 1)) { + occ--; + if (occ <= 0) { + *vptr = ctx->value.ptr; + *vlen = ctx->value.len; + return 1; + } + } + return 0; + } + + /* negative occurrence, we scan all the list then walk back */ + if (-occ > MAX_HDR_HISTORY) + return 0; + + found = hist_idx = 0; + while (http_find_header(htx, hdr, ctx, 1)) { + val_hist[hist_idx] = ctx->value; + if (++hist_idx >= MAX_HDR_HISTORY) + hist_idx = 0; + found++; + } + if (-occ > found) + return 0; + + /* OK now we have the last occurrence in [hist_idx-1], and we need to + * find occurrence -occ. 0 <= hist_idx < MAX_HDR_HISTORY, and we have + * -10 <= occ <= -1. 
So we have to check [hist_idx%MAX_HDR_HISTORY+occ] + * to remain in the 0..9 range. + */ + hist_idx += occ + MAX_HDR_HISTORY; + if (hist_idx >= MAX_HDR_HISTORY) + hist_idx -= MAX_HDR_HISTORY; + *vptr = val_hist[hist_idx].ptr; + *vlen = val_hist[hist_idx].len; + return 1; +} + +int http_str_to_htx(struct buffer *buf, struct ist raw, char **errmsg) +{ + struct htx *htx; + struct htx_sl *sl; + struct h1m h1m; + struct http_hdr hdrs[global.tune.max_http_hdr]; + union h1_sl h1sl; + unsigned int flags = HTX_SL_F_IS_RESP; + int ret = 0; + + b_reset(buf); + if (!raw.len) { + buf->size = 0; + buf->area = NULL; + return 1; + } + + buf->size = global.tune.bufsize; + buf->area = malloc(buf->size); + if (!buf->area) + goto error; + + h1m_init_res(&h1m); + h1m.flags |= H1_MF_NO_PHDR; + ret = h1_headers_to_hdr_list(raw.ptr, istend(raw), + hdrs, sizeof(hdrs)/sizeof(hdrs[0]), &h1m, &h1sl); + if (ret <= 0) { + memprintf(errmsg, "unable to parse headers (error offset: %d)", h1m.err_pos); + goto error; + } + + if (unlikely(h1sl.st.v.len != 8)) { + memprintf(errmsg, "invalid http version (%.*s)", (int)h1sl.st.v.len, h1sl.st.v.ptr); + goto error; + } + if ((*(h1sl.st.v.ptr + 5) > '1') || + ((*(h1sl.st.v.ptr + 5) == '1') && (*(h1sl.st.v.ptr + 7) >= '1'))) + h1m.flags |= H1_MF_VER_11; + + if (h1sl.st.status < 200 && (h1sl.st.status == 100 || h1sl.st.status >= 102)) { + memprintf(errmsg, "invalid http status code for an error message (%u)", + h1sl.st.status); + goto error; + } + + if (h1sl.st.status == 204 || h1sl.st.status == 304) { + /* Responses known to have no body. */ + h1m.flags &= ~(H1_MF_CLEN|H1_MF_CHNK); + h1m.flags |= H1_MF_XFER_LEN; + h1m.curr_len = h1m.body_len = 0; + } + else if (h1m.flags & (H1_MF_CLEN|H1_MF_CHNK)) + h1m.flags |= H1_MF_XFER_LEN; + + if (h1m.flags & H1_MF_VER_11) + flags |= HTX_SL_F_VER_11; + if (h1m.flags & H1_MF_XFER_ENC) + flags |= HTX_SL_F_XFER_ENC; + if (h1m.flags & H1_MF_XFER_LEN) { + flags |= HTX_SL_F_XFER_LEN; + if (h1m.flags & H1_MF_CHNK) { + memprintf(errmsg, "chunk-encoded payload not supported"); + goto error; + } + else if (h1m.flags & H1_MF_CLEN) { + flags |= HTX_SL_F_CLEN; + if (h1m.body_len == 0) + flags |= HTX_SL_F_BODYLESS; + } + else + flags |= HTX_SL_F_BODYLESS; + } + + if ((flags & HTX_SL_F_BODYLESS) && raw.len > ret) { + memprintf(errmsg, "message payload not expected"); + goto error; + } + if ((flags & HTX_SL_F_CLEN) && h1m.body_len != (raw.len - ret)) { + memprintf(errmsg, "payload size does not match the announced content-length (%lu != %lu)", + (unsigned long)(raw.len - ret), (unsigned long)h1m.body_len); + goto error; + } + + htx = htx_from_buf(buf); + sl = htx_add_stline(htx, HTX_BLK_RES_SL, flags, h1sl.st.v, h1sl.st.c, h1sl.st.r); + if (!sl || !htx_add_all_headers(htx, hdrs)) { + memprintf(errmsg, "unable to add headers into the HTX message"); + goto error; + } + sl->info.res.status = h1sl.st.status; + + while (raw.len > ret) { + int sent = htx_add_data(htx, ist2(raw.ptr + ret, raw.len - ret)); + if (!sent) { + memprintf(errmsg, "unable to add payload into the HTX message"); + goto error; + } + ret += sent; + } + + htx->flags |= HTX_FL_EOM; + + return 1; + +error: + if (buf->size) + free(buf->area); + return 0; +} + +void release_http_reply(struct http_reply *http_reply) +{ + struct logformat_node *lf, *lfb; + struct http_reply_hdr *hdr, *hdrb; + + if (!http_reply) + return; + + ha_free(&http_reply->ctype); + list_for_each_entry_safe(hdr, hdrb, &http_reply->hdrs, list) { + LIST_DELETE(&hdr->list); + list_for_each_entry_safe(lf, lfb, &hdr->value, list) { + 
LIST_DELETE(&lf->list); + release_sample_expr(lf->expr); + free(lf->arg); + free(lf); + } + istfree(&hdr->name); + free(hdr); + } + + if (http_reply->type == HTTP_REPLY_ERRFILES) { + ha_free(&http_reply->body.http_errors); + } + else if (http_reply->type == HTTP_REPLY_RAW) + chunk_destroy(&http_reply->body.obj); + else if (http_reply->type == HTTP_REPLY_LOGFMT) { + list_for_each_entry_safe(lf, lfb, &http_reply->body.fmt, list) { + LIST_DELETE(&lf->list); + release_sample_expr(lf->expr); + free(lf->arg); + free(lf); + } + } + free(http_reply); +} + +static int http_htx_init(void) +{ + struct buffer chk; + struct ist raw; + char *errmsg = NULL; + int rc; + int err_code = 0; + + for (rc = 0; rc < HTTP_ERR_SIZE; rc++) { + if (!http_err_msgs[rc]) { + ha_alert("Internal error: no default message defined for HTTP return code %d", rc); + err_code |= ERR_ALERT | ERR_FATAL; + continue; + } + + raw = ist(http_err_msgs[rc]); + if (!http_str_to_htx(&chk, raw, &errmsg)) { + ha_alert("Internal error: invalid default message for HTTP return code %d: %s.\n", + http_err_codes[rc], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + } + else if (errmsg) { + ha_warning("invalid default message for HTTP return code %d: %s.\n", http_err_codes[rc], errmsg); + err_code |= ERR_WARN; + } + + /* Reset errmsg */ + ha_free(&errmsg); + + http_err_chunks[rc] = chk; + http_err_replies[rc].type = HTTP_REPLY_ERRMSG; + http_err_replies[rc].status = http_err_codes[rc]; + http_err_replies[rc].ctype = NULL; + LIST_INIT(&http_err_replies[rc].hdrs); + http_err_replies[rc].body.errmsg = &http_err_chunks[rc]; + } +end: + return err_code; +} + +static void http_htx_deinit(void) +{ + struct http_errors *http_errs, *http_errsb; + struct http_reply *http_rep, *http_repb; + struct ebpt_node *node, *next; + struct http_error_msg *http_errmsg; + int rc; + + node = ebpt_first(&http_error_messages); + while (node) { + next = ebpt_next(node); + ebpt_delete(node); + http_errmsg = container_of(node, typeof(*http_errmsg), node); + chunk_destroy(&http_errmsg->msg); + free(node->key); + free(http_errmsg); + node = next; + } + + list_for_each_entry_safe(http_errs, http_errsb, &http_errors_list, list) { + free(http_errs->conf.file); + free(http_errs->id); + for (rc = 0; rc < HTTP_ERR_SIZE; rc++) + release_http_reply(http_errs->replies[rc]); + LIST_DELETE(&http_errs->list); + free(http_errs); + } + + list_for_each_entry_safe(http_rep, http_repb, &http_replies_list, list) { + LIST_DELETE(&http_rep->list); + release_http_reply(http_rep); + } + + for (rc = 0; rc < HTTP_ERR_SIZE; rc++) + chunk_destroy(&http_err_chunks[rc]); +} + +REGISTER_CONFIG_POSTPARSER("http_htx", http_htx_init); +REGISTER_POST_DEINIT(http_htx_deinit); + +/* Reads content of the error file <file> and convert it into an HTX message. On + * success, the HTX message is returned. On error, NULL is returned and an error + * message is written into the <errmsg> buffer. 
+ */
+struct buffer *http_load_errorfile(const char *file, char **errmsg)
+{
+	struct buffer *buf = NULL;
+	struct buffer chk;
+	struct ebpt_node *node;
+	struct http_error_msg *http_errmsg;
+	struct stat stat;
+	char *err = NULL;
+	int errnum, errlen;
+	int fd = -1;
+
+	/* already loaded */
+	node = ebis_lookup_len(&http_error_messages, file, strlen(file));
+	if (node) {
+		http_errmsg = container_of(node, typeof(*http_errmsg), node);
+		buf = &http_errmsg->msg;
+		goto out;
+	}
+
+	/* Read the error file content */
+	fd = open(file, O_RDONLY);
+	if ((fd < 0) || (fstat(fd, &stat) < 0)) {
+		memprintf(errmsg, "error opening file '%s'.", file);
+		goto out;
+	}
+
+	if (stat.st_size <= global.tune.bufsize)
+		errlen = stat.st_size;
+	else {
+		ha_warning("custom error message file '%s' larger than %d bytes. Truncating.\n",
+			   file, global.tune.bufsize);
+		errlen = global.tune.bufsize;
+	}
+
+	err = malloc(errlen);
+	if (!err) {
+		memprintf(errmsg, "out of memory.");
+		goto out;
+	}
+
+	errnum = read(fd, err, errlen);
+	if (errnum != errlen) {
+		memprintf(errmsg, "error reading file '%s'.", file);
+		goto out;
+	}
+
+	/* Create the node corresponding to the error file */
+	http_errmsg = calloc(1, sizeof(*http_errmsg));
+	if (!http_errmsg) {
+		memprintf(errmsg, "out of memory.");
+		goto out;
+	}
+	http_errmsg->node.key = strdup(file);
+	if (!http_errmsg->node.key) {
+		memprintf(errmsg, "out of memory.");
+		free(http_errmsg);
+		goto out;
+	}
+
+	/* Convert the error file into an HTX message */
+	if (!http_str_to_htx(&chk, ist2(err, errlen), errmsg)) {
+		memprintf(errmsg, "'%s': %s", file, *errmsg);
+		free(http_errmsg->node.key);
+		free(http_errmsg);
+		goto out;
+	}
+
+	/* Insert the node in the tree and return the HTX message */
+	http_errmsg->msg = chk;
+	ebis_insert(&http_error_messages, &http_errmsg->node);
+	buf = &http_errmsg->msg;
+
+  out:
+	if (fd >= 0)
+		close(fd);
+	free(err);
+	return buf;
+}
+
+/* Convert the raw http message <msg> into an HTX message. On success, the HTX
+ * message is returned. On error, NULL is returned and an error message is
+ * written into the <errmsg> buffer.
+ */
+struct buffer *http_load_errormsg(const char *key, const struct ist msg, char **errmsg)
+{
+	struct buffer *buf = NULL;
+	struct buffer chk;
+	struct ebpt_node *node;
+	struct http_error_msg *http_errmsg;
+
+	/* already loaded */
+	node = ebis_lookup_len(&http_error_messages, key, strlen(key));
+	if (node) {
+		http_errmsg = container_of(node, typeof(*http_errmsg), node);
+		buf = &http_errmsg->msg;
+		goto out;
+	}
+	/* Create the node corresponding to the error file */
+	http_errmsg = calloc(1, sizeof(*http_errmsg));
+	if (!http_errmsg) {
+		memprintf(errmsg, "out of memory.");
+		goto out;
+	}
+	http_errmsg->node.key = strdup(key);
+	if (!http_errmsg->node.key) {
+		memprintf(errmsg, "out of memory.");
+		free(http_errmsg);
+		goto out;
+	}
+
+	/* Convert the error file into an HTX message */
+	if (!http_str_to_htx(&chk, msg, errmsg)) {
+		memprintf(errmsg, "invalid error message: %s", *errmsg);
+		free(http_errmsg->node.key);
+		free(http_errmsg);
+		goto out;
+	}
+
+	/* Insert the node in the tree and return the HTX message */
+	http_errmsg->msg = chk;
+	ebis_insert(&http_error_messages, &http_errmsg->node);
+	buf = &http_errmsg->msg;
+  out:
+	return buf;
+}
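A minimal usage sketch; load_503_page() is a hypothetical caller and the path is only an example. Note that the returned buffer is cached in the tree, so repeated calls with the same path are cheap:

static struct buffer *load_503_page(void)
{
	char *errmsg = NULL;
	struct buffer *msg;

	msg = http_load_errorfile("/etc/haproxy/errors/503.http", &errmsg);
	if (!msg) {
		/* errmsg was allocated by memprintf() on failure */
		ha_alert("cannot load error page: %s\n", errmsg);
		free(errmsg);
	}
	return msg;
}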
+
+/* This function parses the raw HTTP error file <file> for the status code
+ * <status>. It returns NULL if there is any error, otherwise it returns the
+ * corresponding HTX message.
+ */
+struct buffer *http_parse_errorfile(int status, const char *file, char **errmsg)
+{
+	struct buffer *buf = NULL;
+	int rc;
+
+	for (rc = 0; rc < HTTP_ERR_SIZE; rc++) {
+		if (http_err_codes[rc] == status) {
+			buf = http_load_errorfile(file, errmsg);
+			break;
+		}
+	}
+
+	if (rc >= HTTP_ERR_SIZE)
+		memprintf(errmsg, "status code '%d' not handled.", status);
+	return buf;
+}
+
+/* This function creates an HTX error message corresponding to a redirect message
+ * for the status code <status>. <url> is used as location url for the
+ * redirect. <errloc> is used to know if it is a 302 or a 303 redirect. It
+ * returns NULL if there is any error, otherwise it returns the corresponding
+ * HTX message.
+ */
+struct buffer *http_parse_errorloc(int errloc, int status, const char *url, char **errmsg)
+{
+	static const char *HTTP_302 =
+		"HTTP/1.1 302 Found\r\n"
+		"Cache-Control: no-cache\r\n"
+		"Content-length: 0\r\n"
+		"Location: "; /* not terminated since it will be concatenated with the URL */
+	static const char *HTTP_303 =
+		"HTTP/1.1 303 See Other\r\n"
+		"Cache-Control: no-cache\r\n"
+		"Content-length: 0\r\n"
+		"Location: "; /* not terminated since it will be concatenated with the URL */
+
+	struct buffer *buf = NULL;
+	const char *msg;
+	char *key = NULL, *err = NULL;
+	int rc, errlen;
+
+	for (rc = 0; rc < HTTP_ERR_SIZE; rc++) {
+		if (http_err_codes[rc] == status) {
+			/* Create the error key */
+			if (!memprintf(&key, "errorloc%d %s", errloc, url)) {
+				memprintf(errmsg, "out of memory.");
+				goto out;
+			}
+			/* Create the error message */
+			msg = (errloc == 302 ? HTTP_302 : HTTP_303);
+			errlen = strlen(msg) + strlen(url) + 5;
+			err = malloc(errlen);
+			if (!err) {
+				memprintf(errmsg, "out of memory.");
+				goto out;
+			}
+			errlen = snprintf(err, errlen, "%s%s\r\n\r\n", msg, url);
+
+			/* Load it */
+			buf = http_load_errormsg(key, ist2(err, errlen), errmsg);
+			break;
+		}
+	}
+
+	if (rc >= HTTP_ERR_SIZE)
+		memprintf(errmsg, "status code '%d' not handled.", status);
+out:
+	free(key);
+	free(err);
+	return buf;
+}
+
+/* Check an "http reply" and, for replies referencing an http-errors section,
+ * try to find the right section and the right error message in this section. If
+ * found, the reply is updated. If the http-errors section exists but the error
+ * message is not found, no error message is set, so that the default ones will
+ * be used as a fallback. Otherwise (unknown section) an error is returned.
+ *
+ * The function returns 1 on success; otherwise it returns 0 and errmsg is
+ * filled.
+ */
+int http_check_http_reply(struct http_reply *reply, struct proxy *px, char **errmsg)
+{
+	struct http_errors *http_errs;
+	int ret = 1;
+
+	if (reply->type != HTTP_REPLY_ERRFILES)
+		goto end;
+
+	list_for_each_entry(http_errs, &http_errors_list, list) {
+		if (strcmp(http_errs->id, reply->body.http_errors) == 0) {
+			reply->type = HTTP_REPLY_INDIRECT;
+			free(reply->body.http_errors);
+			reply->body.reply = http_errs->replies[http_get_status_idx(reply->status)];
+			if (!reply->body.reply)
+				ha_warning("Proxy '%s': status '%d' referenced by an http reply "
+					   "not declared in http-errors section '%s'.\n",
+					   px->id, reply->status, http_errs->id);
+			break;
+		}
+	}
+
+	if (&http_errs->list == &http_errors_list) {
+		memprintf(errmsg, "unknown http-errors section '%s' referenced by an http reply ",
+			  reply->body.http_errors);
+		ret = 0;
+	}
+
+  end:
+	return ret;
+}
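A minimal sketch of the expected call sequence; parse_and_check() is a hypothetical wrapper, and 503 is just an example default status. Parsing comes first, then the possible http-errors reference is resolved:

static struct http_reply *parse_and_check(const char **args, int *orig_arg,
                                          struct proxy *px, char **errmsg)
{
	struct http_reply *reply;

	reply = http_parse_http_reply(args, orig_arg, px, 503, errmsg);
	if (reply && !http_check_http_reply(reply, px, errmsg)) {
		/* unknown http-errors section: discard the parsed reply */
		release_http_reply(reply);
		return NULL;
	}
	return reply;
}

+
+/* Parse an "http reply". It returns the reply on success or NULL on error.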
This + * function creates one of the following http replies : + * + * - HTTP_REPLY_EMPTY : dummy response, no payload + * - HTTP_REPLY_ERRMSG : implicit error message depending on the status code or explicit one + * - HTTP_REPLY_ERRFILES : points on an http-errors section (resolved during post-parsing) + * - HTTP_REPLY_RAW : explicit file object ('file' argument) + * - HTTP_REPLY_LOGFMT : explicit log-format string ('content' argument) + * + * The content-type must be defined for non-empty payload. It is ignored for + * error messages (implicit or explicit). When an http-errors section is + * referenced (HTTP_REPLY_ERRFILES), the real error message should be resolved + * during the configuration validity check or dynamically. It is the caller + * responsibility to choose. If no status code is configured, <default_status> + * is set. + */ +struct http_reply *http_parse_http_reply(const char **args, int *orig_arg, struct proxy *px, + int default_status, char **errmsg) +{ + struct logformat_node *lf, *lfb; + struct http_reply *reply = NULL; + struct http_reply_hdr *hdr, *hdrb; + struct stat stat; + const char *act_arg = NULL; + char *obj = NULL; + int cur_arg, cap = 0, objlen = 0, fd = -1; + + + reply = calloc(1, sizeof(*reply)); + if (!reply) { + memprintf(errmsg, "out of memory"); + goto error; + } + LIST_INIT(&reply->hdrs); + reply->type = HTTP_REPLY_EMPTY; + reply->status = default_status; + + if (px->conf.args.ctx == ARGC_HERR) + cap = (SMP_VAL_REQUEST | SMP_VAL_RESPONSE); + else { + if (px->cap & PR_CAP_FE) + cap |= ((px->conf.args.ctx == ARGC_HRQ) ? SMP_VAL_FE_HRQ_HDR : SMP_VAL_FE_HRS_HDR); + if (px->cap & PR_CAP_BE) + cap |= ((px->conf.args.ctx == ARGC_HRQ) ? SMP_VAL_BE_HRQ_HDR : SMP_VAL_BE_HRS_HDR); + } + + cur_arg = *orig_arg; + while (*args[cur_arg]) { + if (strcmp(args[cur_arg], "status") == 0) { + cur_arg++; + if (!*args[cur_arg]) { + memprintf(errmsg, "'%s' expects <status_code> as argument", args[cur_arg-1]); + goto error; + } + reply->status = atol(args[cur_arg]); + if (reply->status < 200 || reply->status > 599) { + memprintf(errmsg, "Unexpected status code '%d'", reply->status); + goto error; + } + cur_arg++; + } + else if (strcmp(args[cur_arg], "content-type") == 0) { + cur_arg++; + if (!*args[cur_arg]) { + memprintf(errmsg, "'%s' expects <ctype> as argument", args[cur_arg-1]); + goto error; + } + free(reply->ctype); + reply->ctype = strdup(args[cur_arg]); + cur_arg++; + } + else if (strcmp(args[cur_arg], "errorfiles") == 0) { + if (reply->type != HTTP_REPLY_EMPTY) { + memprintf(errmsg, "unexpected '%s' argument, '%s' already defined", args[cur_arg], act_arg); + goto error; + } + act_arg = args[cur_arg]; + cur_arg++; + if (!*args[cur_arg]) { + memprintf(errmsg, "'%s' expects <name> as argument", args[cur_arg-1]); + goto error; + } + reply->body.http_errors = strdup(args[cur_arg]); + if (!reply->body.http_errors) { + memprintf(errmsg, "out of memory"); + goto error; + } + reply->type = HTTP_REPLY_ERRFILES; + cur_arg++; + } + else if (strcmp(args[cur_arg], "default-errorfiles") == 0) { + if (reply->type != HTTP_REPLY_EMPTY) { + memprintf(errmsg, "unexpected '%s' argument, '%s' already defined", args[cur_arg], act_arg); + goto error; + } + act_arg = args[cur_arg]; + reply->type = HTTP_REPLY_ERRMSG; + cur_arg++; + } + else if (strcmp(args[cur_arg], "errorfile") == 0) { + if (reply->type != HTTP_REPLY_EMPTY) { + memprintf(errmsg, "unexpected '%s' argument, '%s' already defined", args[cur_arg], act_arg); + goto error; + } + act_arg = args[cur_arg]; + cur_arg++; + if 
(!*args[cur_arg]) { + memprintf(errmsg, "'%s' expects <fmt> as argument", args[cur_arg-1]); + goto error; + } + reply->body.errmsg = http_load_errorfile(args[cur_arg], errmsg); + if (!reply->body.errmsg) { + goto error; + } + reply->type = HTTP_REPLY_ERRMSG; + cur_arg++; + } + else if (strcmp(args[cur_arg], "file") == 0) { + if (reply->type != HTTP_REPLY_EMPTY) { + memprintf(errmsg, "unexpected '%s' argument, '%s' already defined", args[cur_arg], act_arg); + goto error; + } + act_arg = args[cur_arg]; + cur_arg++; + if (!*args[cur_arg]) { + memprintf(errmsg, "'%s' expects <file> as argument", args[cur_arg-1]); + goto error; + } + fd = open(args[cur_arg], O_RDONLY); + if ((fd < 0) || (fstat(fd, &stat) < 0)) { + memprintf(errmsg, "error opening file '%s'", args[cur_arg]); + goto error; + } + if (stat.st_size > global.tune.bufsize) { + memprintf(errmsg, "file '%s' exceeds the buffer size (%lld > %d)", + args[cur_arg], (long long)stat.st_size, global.tune.bufsize); + goto error; + } + objlen = stat.st_size; + obj = malloc(objlen); + if (!obj || read(fd, obj, objlen) != objlen) { + memprintf(errmsg, "error reading file '%s'", args[cur_arg]); + goto error; + } + close(fd); + fd = -1; + reply->type = HTTP_REPLY_RAW; + chunk_initlen(&reply->body.obj, obj, global.tune.bufsize, objlen); + obj = NULL; + cur_arg++; + } + else if (strcmp(args[cur_arg], "string") == 0) { + if (reply->type != HTTP_REPLY_EMPTY) { + memprintf(errmsg, "unexpected '%s' argument, '%s' already defined", args[cur_arg], act_arg); + goto error; + } + act_arg = args[cur_arg]; + cur_arg++; + if (!*args[cur_arg]) { + memprintf(errmsg, "'%s' expects <str> as argument", args[cur_arg-1]); + goto error; + } + obj = strdup(args[cur_arg]); + objlen = strlen(args[cur_arg]); + if (!obj) { + memprintf(errmsg, "out of memory"); + goto error; + } + reply->type = HTTP_REPLY_RAW; + chunk_initlen(&reply->body.obj, obj, global.tune.bufsize, objlen); + obj = NULL; + cur_arg++; + } + else if (strcmp(args[cur_arg], "lf-file") == 0) { + if (reply->type != HTTP_REPLY_EMPTY) { + memprintf(errmsg, "unexpected '%s' argument, '%s' already defined", args[cur_arg], act_arg); + goto error; + } + act_arg = args[cur_arg]; + cur_arg++; + if (!*args[cur_arg]) { + memprintf(errmsg, "'%s' expects <file> as argument", args[cur_arg-1]); + goto error; + } + fd = open(args[cur_arg], O_RDONLY); + if ((fd < 0) || (fstat(fd, &stat) < 0)) { + memprintf(errmsg, "error opening file '%s'", args[cur_arg]); + goto error; + } + if (stat.st_size > global.tune.bufsize) { + memprintf(errmsg, "file '%s' exceeds the buffer size (%lld > %d)", + args[cur_arg], (long long)stat.st_size, global.tune.bufsize); + goto error; + } + objlen = stat.st_size; + obj = malloc(objlen + 1); + if (!obj || read(fd, obj, objlen) != objlen) { + memprintf(errmsg, "error reading file '%s'", args[cur_arg]); + goto error; + } + close(fd); + fd = -1; + obj[objlen] = '\0'; + reply->type = HTTP_REPLY_LOGFMT; + LIST_INIT(&reply->body.fmt); + cur_arg++; + } + else if (strcmp(args[cur_arg], "lf-string") == 0) { + if (reply->type != HTTP_REPLY_EMPTY) { + memprintf(errmsg, "unexpected '%s' argument, '%s' already defined", args[cur_arg], act_arg); + goto error; + } + act_arg = args[cur_arg]; + cur_arg++; + if (!*args[cur_arg]) { + memprintf(errmsg, "'%s' expects <fmt> as argument", args[cur_arg-1]); + goto error; + } + obj = strdup(args[cur_arg]); + objlen = strlen(args[cur_arg]); + reply->type = HTTP_REPLY_LOGFMT; + LIST_INIT(&reply->body.fmt); + cur_arg++; + } + else if (strcmp(args[cur_arg], "hdr") == 0) { + 
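/* each "hdr" occurrence appends one header to the reply; the value is
+ * parsed as a log-format string and may thus reference sample fetches */
+ 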
cur_arg++; + if (!*args[cur_arg] || !*args[cur_arg+1]) { + memprintf(errmsg, "'%s' expects <name> and <value> as arguments", args[cur_arg-1]); + goto error; + } + if (strcasecmp(args[cur_arg], "content-length") == 0 || + strcasecmp(args[cur_arg], "transfer-encoding") == 0 || + strcasecmp(args[cur_arg], "content-type") == 0) { + ha_warning("parsing [%s:%d] : header '%s' always ignored by the http reply.\n", + px->conf.args.file, px->conf.args.line, args[cur_arg]); + cur_arg += 2; + continue; + } + hdr = calloc(1, sizeof(*hdr)); + if (!hdr) { + memprintf(errmsg, "'%s' : out of memory", args[cur_arg-1]); + goto error; + } + LIST_APPEND(&reply->hdrs, &hdr->list); + LIST_INIT(&hdr->value); + hdr->name = ist(strdup(args[cur_arg])); + if (!isttest(hdr->name)) { + memprintf(errmsg, "out of memory"); + goto error; + } + if (!parse_logformat_string(args[cur_arg+1], px, &hdr->value, LOG_OPT_HTTP, cap, errmsg)) + goto error; + + free(px->conf.lfs_file); + px->conf.lfs_file = strdup(px->conf.args.file); + px->conf.lfs_line = px->conf.args.line; + cur_arg += 2; + } + else + break; + } + + if (reply->type == HTTP_REPLY_EMPTY) { /* no payload */ + if (reply->ctype) { + ha_warning("parsing [%s:%d] : content-type '%s' ignored by the http reply because" + " neither errorfile nor payload defined.\n", + px->conf.args.file, px->conf.args.line, reply->ctype); + ha_free(&reply->ctype); + } + } + else if (reply->type == HTTP_REPLY_ERRFILES || reply->type == HTTP_REPLY_ERRMSG) { /* errorfiles or errorfile */ + + if (reply->type != HTTP_REPLY_ERRMSG || !reply->body.errmsg) { + /* default errorfile or errorfiles: check the status */ + int rc; + + for (rc = 0; rc < HTTP_ERR_SIZE; rc++) { + if (http_err_codes[rc] == reply->status) + break; + } + + if (rc >= HTTP_ERR_SIZE) { + memprintf(errmsg, "status code '%d' not handled by default with '%s' argument.", + reply->status, act_arg); + goto error; + } + } + + if (reply->ctype) { + ha_warning("parsing [%s:%d] : content-type '%s' ignored by the http reply when used " + "with an errorfile.\n", + px->conf.args.file, px->conf.args.line, reply->ctype); + ha_free(&reply->ctype); + } + if (!LIST_ISEMPTY(&reply->hdrs)) { + ha_warning("parsing [%s:%d] : hdr parameters ignored by the http reply when used " + "with an errorfile.\n", + px->conf.args.file, px->conf.args.line); + list_for_each_entry_safe(hdr, hdrb, &reply->hdrs, list) { + LIST_DELETE(&hdr->list); + list_for_each_entry_safe(lf, lfb, &hdr->value, list) { + LIST_DELETE(&lf->list); + release_sample_expr(lf->expr); + free(lf->arg); + free(lf); + } + istfree(&hdr->name); + free(hdr); + } + } + } + else if (reply->type == HTTP_REPLY_RAW) { /* explicit payload using 'file' or 'string' parameter */ + if ((reply->status == 204 || reply->status == 304) && objlen) { + memprintf(errmsg, "No body expected for %d responses", reply->status); + goto error; + } + if (!reply->ctype && objlen) { + memprintf(errmsg, "a content type must be defined when non-empty payload is configured"); + goto error; + } + if (reply->ctype && !b_data(&reply->body.obj)) { + ha_warning("parsing [%s:%d] : content-type '%s' ignored by the http reply when used " + "with an empty payload.\n", + px->conf.args.file, px->conf.args.line, reply->ctype); + ha_free(&reply->ctype); + } + if (b_room(&reply->body.obj) < global.tune.maxrewrite) { + ha_warning("parsing [%s:%d] : http reply payload runs over the buffer space reserved to headers rewriting." 
+ " It may lead to internal errors if strict rewriting mode is enabled.\n", + px->conf.args.file, px->conf.args.line); + } + } + else if (reply->type == HTTP_REPLY_LOGFMT) { /* log-format payload using 'lf-file' of 'lf-string' parameter */ + LIST_INIT(&reply->body.fmt); + if ((reply->status == 204 || reply->status == 304)) { + memprintf(errmsg, "No body expected for %d responses", reply->status); + goto error; + } + if (!reply->ctype) { + memprintf(errmsg, "a content type must be defined with a log-format payload"); + goto error; + } + if (!parse_logformat_string(obj, px, &reply->body.fmt, LOG_OPT_HTTP, cap, errmsg)) + goto error; + + free(px->conf.lfs_file); + px->conf.lfs_file = strdup(px->conf.args.file); + px->conf.lfs_line = px->conf.args.line; + } + + free(obj); + *orig_arg = cur_arg; + return reply; + + error: + free(obj); + if (fd >= 0) + close(fd); + release_http_reply(reply); + return NULL; +} + +/* Apply schemed-based normalization as described on rfc3986 on section 6.3.2. + * Returns 0 if no error has been found else non-zero. + * + * The normalization is processed on the target-uri at the condition that it is + * in absolute-form. In the case where the target-uri was normalized, every + * host headers values found are also replaced by the normalized hostname. This + * assumes that the target-uri and host headers were properly identify as + * similar before calling this function. + */ +int http_scheme_based_normalize(struct htx *htx) +{ + struct http_hdr_ctx ctx; + struct htx_sl *sl; + struct ist uri, scheme, authority, host, port; + struct http_uri_parser parser; + + sl = http_get_stline(htx); + + if (!sl || !(sl->flags & (HTX_SL_F_HAS_SCHM|HTX_SL_F_HAS_AUTHORITY))) + return 0; + + uri = htx_sl_req_uri(sl); + + parser = http_uri_parser_init(uri); + scheme = http_parse_scheme(&parser); + /* if no scheme found, no normalization to proceed */ + if (!isttest(scheme)) + return 0; + + /* Extract the port if present in authority */ + authority = http_parse_authority(&parser, 1); + port = http_get_host_port(authority); + if (!isttest(port)) { + /* if no port found, no normalization to proceed */ + return 0; + } + host = isttrim(authority, istlen(authority) - istlen(port) - 1); + + if (http_is_default_port(scheme, port)) { + /* reconstruct the uri with removal of the port */ + struct buffer *temp = get_trash_chunk(); + struct ist meth, vsn; + + /* meth */ + chunk_memcat(temp, HTX_SL_REQ_MPTR(sl), HTX_SL_REQ_MLEN(sl)); + meth = ist2(temp->area, HTX_SL_REQ_MLEN(sl)); + + /* vsn */ + chunk_memcat(temp, HTX_SL_REQ_VPTR(sl), HTX_SL_REQ_VLEN(sl)); + vsn = ist2(temp->area + meth.len, HTX_SL_REQ_VLEN(sl)); + + /* reconstruct uri without port */ + chunk_memcat(temp, uri.ptr, authority.ptr - uri.ptr); + chunk_istcat(temp, host); + chunk_memcat(temp, istend(authority), istend(uri) - istend(authority)); + uri = ist2(temp->area + meth.len + vsn.len, host.len + uri.len - authority.len); /* uri */ + + http_replace_stline(htx, meth, uri, vsn); + + /* replace every host headers values by the normalized host */ + ctx.blk = NULL; + while (http_find_header(htx, ist("host"), &ctx, 0)) { + if (!http_replace_header_value(htx, &ctx, host)) + goto fail; + } + } + + return 0; + + fail: + return 1; +} + +/* First step function to merge multiple cookie headers in a single entry. + * + * Use it for each cookie header at <idx> index over HTTP headers in <list>. + * <first> and <last> are state variables used internally and must be + * initialized to -1 before the first invocation. 
+ */ +void http_cookie_register(struct http_hdr *list, int idx, int *first, int *last) +{ + /* Build a linked list of cookie headers. Use header length to point to + * the next one. The last entry will contain -1. + */ + + /* Caller is responsible for initializing *first and *last to -1 on first + * invocation. Both will thus be set to a valid index after it. + */ + BUG_ON(*first > 0 && *last < 0); + + /* Mark the current end of cookie linked list. */ + list[idx].n.len = -1; + if (*first < 0) { + /* Save first found cookie for http_cookie_merge call. */ + *first = idx; + } + else { + /* Update linked list of cookies. */ + list[*last].n.len = idx; + } + + *last = idx; +} + +/* Second step to merge multiple cookie headers into a single entry. + * + * Use it when looping over HTTP headers is done and <htx> message is built. + * This will concatenate each cookie header present in <list> directly into + * <htx> message. <first> is reused from previous http_cookie_register + * invocation. + * + * Returns 0 on success else non-zero. + */ +int http_cookie_merge(struct htx *htx, struct http_hdr *list, int first) +{ + uint32_t fs; /* free space */ + uint32_t bs; /* block size */ + uint32_t vl; /* value len */ + uint32_t tl; /* total length */ + struct htx_blk *blk; + + if (first < 0) + return 0; + + blk = htx_add_header(htx, ist("cookie"), list[first].v); + if (!blk) + return 1; + + tl = list[first].v.len; + fs = htx_free_data_space(htx); + bs = htx_get_blksz(blk); + + /* for each extra cookie, we'll extend the cookie's value and insert + * ";" before the new value. + */ + fs += tl; /* first one is already counted */ + + /* Loop over cookies linked list built from http_cookie_register. */ + while ((first = list[first].n.len) >= 0) { + vl = list[first].v.len; + tl += vl + 2; + if (tl > fs) + return 1; + + htx_change_blk_value_len(htx, blk, tl); + *(char *)(htx_get_blk_ptr(htx, blk) + bs + 0) = ';'; + *(char *)(htx_get_blk_ptr(htx, blk) + bs + 1) = ' '; + memcpy(htx_get_blk_ptr(htx, blk) + bs + 2, + list[first].v.ptr, vl); + bs += vl + 2; + } + + return 0; +} + +/* Parses the "errorloc[302|303]" proxy keyword */ +static int proxy_parse_errorloc(char **args, int section, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **errmsg) +{ + struct conf_errors *conf_err; + struct http_reply *reply; + struct buffer *msg; + int errloc, status; + int ret = 0; + + if (warnifnotcap(curpx, PR_CAP_FE | PR_CAP_BE, file, line, args[0], NULL)) { + ret = 1; + goto out; + } + + if (*(args[1]) == 0 || *(args[2]) == 0) { + memprintf(errmsg, "%s : expects <status_code> and <url> as arguments.\n", args[0]); + ret = -1; + goto out; + } + + status = atol(args[1]); + errloc = (strcmp(args[0], "errorloc303") == 0 ? 
303 : 302); + msg = http_parse_errorloc(errloc, status, args[2], errmsg); + if (!msg) { + memprintf(errmsg, "%s : %s", args[0], *errmsg); + ret = -1; + goto out; + } + + reply = calloc(1, sizeof(*reply)); + if (!reply) { + memprintf(errmsg, "%s : out of memory.", args[0]); + ret = -1; + goto out; + } + reply->type = HTTP_REPLY_ERRMSG; + reply->status = status; + reply->ctype = NULL; + LIST_INIT(&reply->hdrs); + reply->body.errmsg = msg; + LIST_APPEND(&http_replies_list, &reply->list); + + conf_err = calloc(1, sizeof(*conf_err)); + if (!conf_err) { + memprintf(errmsg, "%s : out of memory.", args[0]); + free(reply); + ret = -1; + goto out; + } + conf_err->type = 1; + conf_err->info.errorfile.status = status; + conf_err->info.errorfile.reply = reply; + + conf_err->file = strdup(file); + conf_err->line = line; + LIST_APPEND(&curpx->conf.errors, &conf_err->list); + + /* handle warning message */ + if (*errmsg) + ret = 1; + out: + return ret; + +} + +/* Parses the "errorfile" proxy keyword */ +static int proxy_parse_errorfile(char **args, int section, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **errmsg) +{ + struct conf_errors *conf_err; + struct http_reply *reply; + struct buffer *msg; + int status; + int ret = 0; + + if (warnifnotcap(curpx, PR_CAP_FE | PR_CAP_BE, file, line, args[0], NULL)) { + ret = 1; + goto out; + } + + if (*(args[1]) == 0 || *(args[2]) == 0) { + memprintf(errmsg, "%s : expects <status_code> and <file> as arguments.\n", args[0]); + ret = -1; + goto out; + } + + status = atol(args[1]); + msg = http_parse_errorfile(status, args[2], errmsg); + if (!msg) { + memprintf(errmsg, "%s : %s", args[0], *errmsg); + ret = -1; + goto out; + } + + reply = calloc(1, sizeof(*reply)); + if (!reply) { + memprintf(errmsg, "%s : out of memory.", args[0]); + ret = -1; + goto out; + } + reply->type = HTTP_REPLY_ERRMSG; + reply->status = status; + reply->ctype = NULL; + LIST_INIT(&reply->hdrs); + reply->body.errmsg = msg; + LIST_APPEND(&http_replies_list, &reply->list); + + conf_err = calloc(1, sizeof(*conf_err)); + if (!conf_err) { + memprintf(errmsg, "%s : out of memory.", args[0]); + free(reply); + ret = -1; + goto out; + } + conf_err->type = 1; + conf_err->info.errorfile.status = status; + conf_err->info.errorfile.reply = reply; + conf_err->file = strdup(file); + conf_err->line = line; + LIST_APPEND(&curpx->conf.errors, &conf_err->list); + + /* handle warning message */ + if (*errmsg) + ret = 1; + out: + return ret; + +} + +/* Parses the "errorfiles" proxy keyword */ +static int proxy_parse_errorfiles(char **args, int section, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + struct conf_errors *conf_err = NULL; + char *name = NULL; + int rc, ret = 0; + + if (warnifnotcap(curpx, PR_CAP_FE | PR_CAP_BE, file, line, args[0], NULL)) { + ret = 1; + goto out; + } + + if (!*(args[1])) { + memprintf(err, "%s : expects <name> as argument.", args[0]); + ret = -1; + goto out; + } + + name = strdup(args[1]); + conf_err = calloc(1, sizeof(*conf_err)); + if (!name || !conf_err) { + memprintf(err, "%s : out of memory.", args[0]); + goto error; + } + conf_err->type = 0; + + conf_err->info.errorfiles.name = name; + if (!*(args[2])) { + for (rc = 0; rc < HTTP_ERR_SIZE; rc++) + conf_err->info.errorfiles.status[rc] = 1; + } + else { + int cur_arg, status; + for (cur_arg = 2; *(args[cur_arg]); cur_arg++) { + status = atol(args[cur_arg]); + + for (rc = 0; rc < HTTP_ERR_SIZE; rc++) { + if (http_err_codes[rc] == status) { + 
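/* 2 = status explicitly listed on the "errorfiles" line; the implicit
+ * full-range case above uses 1. proxy_check_errors() only warns for 2. */
+ 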
conf_err->info.errorfiles.status[rc] = 2; + break; + } + } + if (rc >= HTTP_ERR_SIZE) { + memprintf(err, "%s : status code '%d' not handled.", args[0], status); + goto error; + } + } + } + conf_err->file = strdup(file); + conf_err->line = line; + LIST_APPEND(&curpx->conf.errors, &conf_err->list); + out: + return ret; + + error: + free(name); + free(conf_err); + ret = -1; + goto out; +} + +/* Parses the "http-error" proxy keyword */ +static int proxy_parse_http_error(char **args, int section, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **errmsg) +{ + struct conf_errors *conf_err; + struct http_reply *reply = NULL; + int rc, cur_arg, ret = 0; + + if (warnifnotcap(curpx, PR_CAP_FE | PR_CAP_BE, file, line, args[0], NULL)) { + ret = 1; + goto out; + } + + cur_arg = 1; + curpx->conf.args.ctx = ARGC_HERR; + reply = http_parse_http_reply((const char **)args, &cur_arg, curpx, 0, errmsg); + if (!reply) { + memprintf(errmsg, "%s : %s", args[0], *errmsg); + goto error; + } + else if (!reply->status) { + memprintf(errmsg, "%s : expects at least a <status> as arguments.\n", args[0]); + goto error; + } + + for (rc = 0; rc < HTTP_ERR_SIZE; rc++) { + if (http_err_codes[rc] == reply->status) + break; + } + + if (rc >= HTTP_ERR_SIZE) { + memprintf(errmsg, "%s: status code '%d' not handled.", args[0], reply->status); + goto error; + } + if (*args[cur_arg]) { + memprintf(errmsg, "%s : unknown keyword '%s'.", args[0], args[cur_arg]); + goto error; + } + + conf_err = calloc(1, sizeof(*conf_err)); + if (!conf_err) { + memprintf(errmsg, "%s : out of memory.", args[0]); + goto error; + } + if (reply->type == HTTP_REPLY_ERRFILES) { + int rc = http_get_status_idx(reply->status); + + conf_err->type = 2; + conf_err->info.errorfiles.name = reply->body.http_errors; + conf_err->info.errorfiles.status[rc] = 2; + reply->body.http_errors = NULL; + release_http_reply(reply); + } + else { + conf_err->type = 1; + conf_err->info.errorfile.status = reply->status; + conf_err->info.errorfile.reply = reply; + LIST_APPEND(&http_replies_list, &reply->list); + } + conf_err->file = strdup(file); + conf_err->line = line; + LIST_APPEND(&curpx->conf.errors, &conf_err->list); + + /* handle warning message */ + if (*errmsg) + ret = 1; + out: + return ret; + + error: + release_http_reply(reply); + ret = -1; + goto out; + +} + +/* Check "errorfiles" proxy keyword */ +static int proxy_check_errors(struct proxy *px) +{ + struct conf_errors *conf_err, *conf_err_back; + struct http_errors *http_errs; + int rc, err = ERR_NONE; + + list_for_each_entry_safe(conf_err, conf_err_back, &px->conf.errors, list) { + if (conf_err->type == 1) { + /* errorfile */ + rc = http_get_status_idx(conf_err->info.errorfile.status); + px->replies[rc] = conf_err->info.errorfile.reply; + + /* For proxy, to rely on default replies, just don't reference a reply */ + if (px->replies[rc]->type == HTTP_REPLY_ERRMSG && !px->replies[rc]->body.errmsg) + px->replies[rc] = NULL; + } + else { + /* errorfiles */ + list_for_each_entry(http_errs, &http_errors_list, list) { + if (strcmp(http_errs->id, conf_err->info.errorfiles.name) == 0) + break; + } + + /* unknown http-errors section */ + if (&http_errs->list == &http_errors_list) { + ha_alert("proxy '%s': unknown http-errors section '%s' (at %s:%d).\n", + px->id, conf_err->info.errorfiles.name, conf_err->file, conf_err->line); + err |= ERR_ALERT | ERR_FATAL; + free(conf_err->info.errorfiles.name); + goto next; + } + + free(conf_err->info.errorfiles.name); + for (rc = 0; rc < HTTP_ERR_SIZE; 
rc++) { + if (conf_err->info.errorfiles.status[rc] > 0) { + if (http_errs->replies[rc]) + px->replies[rc] = http_errs->replies[rc]; + else if (conf_err->info.errorfiles.status[rc] == 2) + ha_warning("config: proxy '%s' : status '%d' not declared in" + " http-errors section '%s' (at %s:%d).\n", + px->id, http_err_codes[rc], http_errs->id, + conf_err->file, conf_err->line); + } + } + } + next: + LIST_DELETE(&conf_err->list); + free(conf_err->file); + free(conf_err); + } + + out: + return err; +} + +static int post_check_errors() +{ + struct ebpt_node *node; + struct http_error_msg *http_errmsg; + struct htx *htx; + int err_code = ERR_NONE; + + node = ebpt_first(&http_error_messages); + while (node) { + http_errmsg = container_of(node, typeof(*http_errmsg), node); + if (b_is_null(&http_errmsg->msg)) + goto next; + htx = htxbuf(&http_errmsg->msg); + if (htx_free_data_space(htx) < global.tune.maxrewrite) { + ha_warning("config: errorfile '%s' runs over the buffer space" + " reserved to headers rewriting. It may lead to internal errors if " + " http-after-response rules are evaluated on this message.\n", + (char *)node->key); + err_code |= ERR_WARN; + } + next: + node = ebpt_next(node); + } + + return err_code; +} + +int proxy_dup_default_conf_errors(struct proxy *curpx, const struct proxy *defpx, char **errmsg) +{ + struct conf_errors *conf_err, *new_conf_err = NULL; + int ret = 0; + + list_for_each_entry(conf_err, &defpx->conf.errors, list) { + new_conf_err = calloc(1, sizeof(*new_conf_err)); + if (!new_conf_err) { + memprintf(errmsg, "unable to duplicate default errors (out of memory)."); + goto out; + } + new_conf_err->type = conf_err->type; + if (conf_err->type == 1) { + new_conf_err->info.errorfile.status = conf_err->info.errorfile.status; + new_conf_err->info.errorfile.reply = conf_err->info.errorfile.reply; + } + else { + new_conf_err->info.errorfiles.name = strdup(conf_err->info.errorfiles.name); + if (!new_conf_err->info.errorfiles.name) { + memprintf(errmsg, "unable to duplicate default errors (out of memory)."); + goto out; + } + memcpy(&new_conf_err->info.errorfiles.status, &conf_err->info.errorfiles.status, + sizeof(conf_err->info.errorfiles.status)); + } + new_conf_err->file = strdup(conf_err->file); + new_conf_err->line = conf_err->line; + LIST_APPEND(&curpx->conf.errors, &new_conf_err->list); + new_conf_err = NULL; + } + ret = 1; + + out: + free(new_conf_err); + return ret; +} + +void proxy_release_conf_errors(struct proxy *px) +{ + struct conf_errors *conf_err, *conf_err_back; + + list_for_each_entry_safe(conf_err, conf_err_back, &px->conf.errors, list) { + if (conf_err->type == 0) + free(conf_err->info.errorfiles.name); + LIST_DELETE(&conf_err->list); + free(conf_err->file); + free(conf_err); + } +} + +/* + * Parse an <http-errors> section. + * Returns the error code, 0 if OK, or any combination of : + * - ERR_ABORT: must abort ASAP + * - ERR_FATAL: we can continue parsing but not start the service + * - ERR_WARN: a warning has been emitted + * - ERR_ALERT: an alert has been emitted + * Only the two first ones can stop processing, the two others are just + * indicators. 
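+ *
+ * For instance, a duplicated section name below yields ERR_ALERT | ERR_FATAL:
+ * parsing goes on so that further errors can be reported, but the service
+ * will not be started.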
+ */ +static int cfg_parse_http_errors(const char *file, int linenum, char **args, int kwm) +{ + static struct http_errors *curr_errs = NULL; + int err_code = 0; + const char *err; + char *errmsg = NULL; + + if (strcmp(args[0], "http-errors") == 0) { /* new errors section */ + if (!*args[1]) { + ha_alert("parsing [%s:%d] : missing name for http-errors section.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + err = invalid_char(args[1]); + if (err) { + ha_alert("parsing [%s:%d] : character '%c' is not permitted in '%s' name '%s'.\n", + file, linenum, *err, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + } + + list_for_each_entry(curr_errs, &http_errors_list, list) { + /* Error if two errors sections own the same name */ + if (strcmp(curr_errs->id, args[1]) == 0) { + ha_alert("parsing [%s:%d]: http-errors section '%s' already exists (declared at %s:%d).\n", + file, linenum, args[1], curr_errs->conf.file, curr_errs->conf.line); + err_code |= ERR_ALERT | ERR_FATAL; + } + } + + if ((curr_errs = calloc(1, sizeof(*curr_errs))) == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + LIST_APPEND(&http_errors_list, &curr_errs->list); + curr_errs->id = strdup(args[1]); + curr_errs->conf.file = strdup(file); + curr_errs->conf.line = linenum; + } + else if (strcmp(args[0], "errorfile") == 0) { /* error message from a file */ + struct http_reply *reply; + struct buffer *msg; + int status, rc; + + if (*(args[1]) == 0 || *(args[2]) == 0) { + ha_alert("parsing [%s:%d] : %s: expects <status_code> and <file> as arguments.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + status = atol(args[1]); + msg = http_parse_errorfile(status, args[2], &errmsg); + if (!msg) { + ha_alert("parsing [%s:%d] : %s : %s\n", file, linenum, args[0], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (errmsg) { + ha_warning("parsing [%s:%d] : %s: %s\n", file, linenum, args[0], errmsg); + err_code |= ERR_WARN; + } + + reply = calloc(1, sizeof(*reply)); + if (!reply) { + ha_alert("parsing [%s:%d] : %s : out of memory.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + reply->type = HTTP_REPLY_ERRMSG; + reply->status = status; + reply->ctype = NULL; + LIST_INIT(&reply->hdrs); + reply->body.errmsg = msg; + + rc = http_get_status_idx(status); + curr_errs->replies[rc] = reply; + } + else if (*args[0] != 0) { + ha_alert("parsing [%s:%d] : unknown keyword '%s' in '%s' section\n", file, linenum, args[0], cursection); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + +out: + free(errmsg); + return err_code; +} + +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_LISTEN, "errorloc", proxy_parse_errorloc }, + { CFG_LISTEN, "errorloc302", proxy_parse_errorloc }, + { CFG_LISTEN, "errorloc303", proxy_parse_errorloc }, + { CFG_LISTEN, "errorfile", proxy_parse_errorfile }, + { CFG_LISTEN, "errorfiles", proxy_parse_errorfiles }, + { CFG_LISTEN, "http-error", proxy_parse_http_error }, + { 0, NULL, NULL }, +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); +REGISTER_POST_PROXY_CHECK(proxy_check_errors); +REGISTER_POST_CHECK(post_check_errors); + +REGISTER_CONFIG_SECTION("http-errors", cfg_parse_http_errors, NULL); + +/************************************************************************/ +/* HTX sample fetches */ +/************************************************************************/ + +/* Returns 1 if a stream is an HTX 
stream. Otherwise, it returns 0. */ +static int +smp_fetch_is_htx(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + if (!smp->strm) + return 0; + + smp->data.u.sint = !!IS_HTX_STRM(smp->strm); + smp->data.type = SMP_T_BOOL; + return 1; +} + +/* Returns the number of blocks in an HTX message. The channel is chosen + * depending on the sample direction. */ +static int +smp_fetch_htx_nbblks(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn; + struct htx *htx; + + if (!smp->strm) + return 0; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? &smp->strm->res : &smp->strm->req; + htx = smp_prefetch_htx(smp, chn, NULL, 0); + if (!htx) + return 0; + + smp->data.u.sint = htx_nbblks(htx); + smp->data.type = SMP_T_SINT; + smp->flags = SMP_F_VOLATILE | SMP_F_MAY_CHANGE; + return 1; +} + +/* Returns the size of an HTX message. The channel is chosen depending on the + * sample direction. */ +static int +smp_fetch_htx_size(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn; + struct htx *htx; + + if (!smp->strm) + return 0; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? &smp->strm->res : &smp->strm->req; + htx = smp_prefetch_htx(smp, chn, NULL, 0); + if (!htx) + return 0; + + smp->data.u.sint = htx->size; + smp->data.type = SMP_T_SINT; + smp->flags = SMP_F_VOLATILE | SMP_F_MAY_CHANGE; + return 1; +} + +/* Returns the data size of an HTX message. The channel is chosen depending on the + * sample direction. */ +static int +smp_fetch_htx_data(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn; + struct htx *htx; + + if (!smp->strm) + return 0; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? &smp->strm->res : &smp->strm->req; + htx = smp_prefetch_htx(smp, chn, NULL, 0); + if (!htx) + return 0; + + smp->data.u.sint = htx->data; + smp->data.type = SMP_T_SINT; + smp->flags = SMP_F_VOLATILE | SMP_F_MAY_CHANGE; + return 1; +} + +/* Returns the used space (data+meta) of an HTX message. The channel is chosen + * depending on the sample direction. */ +static int +smp_fetch_htx_used(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn; + struct htx *htx; + + if (!smp->strm) + return 0; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? &smp->strm->res : &smp->strm->req; + htx = smp_prefetch_htx(smp, chn, NULL, 0); + if (!htx) + return 0; + + smp->data.u.sint = htx_used_space(htx); + smp->data.type = SMP_T_SINT; + smp->flags = SMP_F_VOLATILE | SMP_F_MAY_CHANGE; + return 1; +} + +/* Returns the free space (size-used) of an HTX message. The channel is chosen + * depending on the sample direction. */ +static int +smp_fetch_htx_free(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn; + struct htx *htx; + + if (!smp->strm) + return 0; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? &smp->strm->res : &smp->strm->req; + htx = smp_prefetch_htx(smp, chn, NULL, 0); + if (!htx) + return 0; + + smp->data.u.sint = htx_free_space(htx); + smp->data.type = SMP_T_SINT; + smp->flags = SMP_F_VOLATILE | SMP_F_MAY_CHANGE; + return 1; +} + +/* Returns the free space for data (free-sizeof(blk)) of an HTX message. The + * channel is chosen depending on the sample direction. 
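+ * Hypothetical usage, as these fetches exist for debugging purposes:
+ *     http-request set-var(txn.htx_room) internal.htx.free_data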
*/ +static int +smp_fetch_htx_free_data(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn; + struct htx *htx; + + if (!smp->strm) + return 0; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? &smp->strm->res : &smp->strm->req; + htx = smp_prefetch_htx(smp, chn, NULL, 0); + if (!htx) + return 0; + + smp->data.u.sint = htx_free_data_space(htx); + smp->data.type = SMP_T_SINT; + smp->flags = SMP_F_VOLATILE | SMP_F_MAY_CHANGE; + return 1; +} + +/* Returns 1 if the HTX message contains the EOM flag. Otherwise it returns 0. The + * channel is chosen depending on the sample direction. + */ +static int +smp_fetch_htx_has_eom(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn; + struct htx *htx; + + if (!smp->strm) + return 0; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? &smp->strm->res : &smp->strm->req; + htx = smp_prefetch_htx(smp, chn, NULL, 0); + if (!htx) + return 0; + + smp->data.u.sint = !!(htx->flags & HTX_FL_EOM); + smp->data.type = SMP_T_BOOL; + smp->flags = SMP_F_VOLATILE | SMP_F_MAY_CHANGE; + return 1; +} + +/* Returns the type of a specific HTX block, if found in the message. Otherwise + * HTX_BLK_UNUSED is returned. Any positive integer (>= 0) is supported or + * "head", "tail" or "first". The channel is chosen depending on the sample + * direction. */ +static int +smp_fetch_htx_blk_type(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn; + struct htx *htx; + enum htx_blk_type type; + int32_t pos; + + if (!smp->strm || !arg_p) + return 0; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? &smp->strm->res : &smp->strm->req; + htx = smp_prefetch_htx(smp, chn, NULL, 0); + if (!htx) + return 0; + + pos = arg_p[0].data.sint; + if (pos == -1) + type = htx_get_head_type(htx); + else if (pos == -2) + type = htx_get_tail_type(htx); + else if (pos == -3) + type = htx_get_first_type(htx); + else + type = ((pos >= htx->head && pos <= htx->tail) + ? htx_get_blk_type(htx_get_blk(htx, pos)) + : HTX_BLK_UNUSED); + + chunk_initstr(&smp->data.u.str, htx_blk_type_str(type)); + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_CONST | SMP_F_VOLATILE | SMP_F_MAY_CHANGE; + return 1; +} + +/* Returns the size of a specific HTX block, if found in the message. Otherwise + * 0 is returned. Any positive integer (>= 0) is supported or "head", "tail" or + * "first". The channel is chosen depending on the sample direction. */ +static int +smp_fetch_htx_blk_size(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn; + struct htx *htx; + struct htx_blk *blk; + int32_t pos; + + if (!smp->strm || !arg_p) + return 0; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? &smp->strm->res : &smp->strm->req; + htx = smp_prefetch_htx(smp, chn, NULL, 0); + if (!htx) + return 0; + + pos = arg_p[0].data.sint; + if (pos == -1) + blk = htx_get_head_blk(htx); + else if (pos == -2) + blk = htx_get_tail_blk(htx); + else if (pos == -3) + blk = htx_get_first_blk(htx); + else + blk = ((pos >= htx->head && pos <= htx->tail) ? htx_get_blk(htx, pos) : NULL); + + smp->data.u.sint = (blk ? htx_get_blksz(blk) : 0); + smp->data.type = SMP_T_SINT; + smp->flags = SMP_F_VOLATILE | SMP_F_MAY_CHANGE; + return 1; +} + +/* Returns the start-line if the selected HTX block exists and is a + * start-line. Otherwise an empty string is returned. Any positive integer (>= 0) is + * supported or "head", "tail" or "first". The channel is chosen depending on + * the sample direction. */ +static int +smp_fetch_htx_blk_stline(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + struct buffer *temp; + struct channel *chn; + struct htx *htx; + struct htx_blk *blk; + struct htx_sl *sl; + int32_t pos; + + if (!smp->strm || !arg_p) + return 0; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? &smp->strm->res : &smp->strm->req; + htx = smp_prefetch_htx(smp, chn, NULL, 0); + if (!htx) + return 0; + + pos = arg_p[0].data.sint; + if (pos == -1) + blk = htx_get_head_blk(htx); + else if (pos == -2) + blk = htx_get_tail_blk(htx); + else if (pos == -3) + blk = htx_get_first_blk(htx); + else + blk = ((pos >= htx->head && pos <= htx->tail) ? htx_get_blk(htx, pos) : NULL); + + if (!blk || (htx_get_blk_type(blk) != HTX_BLK_REQ_SL && htx_get_blk_type(blk) != HTX_BLK_RES_SL)) { + smp->data.u.str.size = 0; + smp->data.u.str.area = ""; + smp->data.u.str.data = 0; + } + else { + sl = htx_get_blk_ptr(htx, blk); + + temp = get_trash_chunk(); + chunk_istcat(temp, htx_sl_p1(sl)); + temp->area[temp->data++] = ' '; + chunk_istcat(temp, htx_sl_p2(sl)); + temp->area[temp->data++] = ' '; + chunk_istcat(temp, htx_sl_p3(sl)); + + smp->data.u.str = *temp; + } + + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_VOLATILE | SMP_F_MAY_CHANGE; + return 1; +} + +/* Returns the header name if the selected HTX block exists and is a header or a + * trailer. Otherwise an empty string is returned. Any positive integer (>= 0) is + * supported or "head", "tail" or "first". The channel is chosen depending on + * the sample direction. */ +static int +smp_fetch_htx_blk_hdrname(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn; + struct htx *htx; + struct htx_blk *blk; + int32_t pos; + + if (!smp->strm || !arg_p) + return 0; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? &smp->strm->res : &smp->strm->req; + htx = smp_prefetch_htx(smp, chn, NULL, 0); + if (!htx) + return 0; + + pos = arg_p[0].data.sint; + if (pos == -1) + blk = htx_get_head_blk(htx); + else if (pos == -2) + blk = htx_get_tail_blk(htx); + else if (pos == -3) + blk = htx_get_first_blk(htx); + else + blk = ((pos >= htx->head && pos <= htx->tail) ? htx_get_blk(htx, pos) : NULL); + + if (!blk || (htx_get_blk_type(blk) != HTX_BLK_HDR && htx_get_blk_type(blk) != HTX_BLK_TLR)) { + smp->data.u.str.size = 0; + smp->data.u.str.area = ""; + smp->data.u.str.data = 0; + } + else { + struct ist name = htx_get_blk_name(htx, blk); + + chunk_initlen(&smp->data.u.str, name.ptr, name.len, name.len); + } + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_CONST | SMP_F_VOLATILE | SMP_F_MAY_CHANGE; + return 1; +} + +/* Returns the header value if the selected HTX block exists and is a header or + * a trailer. Otherwise an empty string is returned. Any positive integer (>= 0) is + * supported or "head", "tail" or "first". The channel is chosen depending on + * the sample direction. */ +static int +smp_fetch_htx_blk_hdrval(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn; + struct htx *htx; + struct htx_blk *blk; + int32_t pos; + + if (!smp->strm || !arg_p) + return 0; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? &smp->strm->res : &smp->strm->req; + htx = smp_prefetch_htx(smp, chn, NULL, 0); + if (!htx) + return 0; + + pos = arg_p[0].data.sint; + if (pos == -1) + blk = htx_get_head_blk(htx); + else if (pos == -2) + blk = htx_get_tail_blk(htx); + else if (pos == -3) + blk = htx_get_first_blk(htx); + else + blk = ((pos >= htx->head && pos <= htx->tail) ? htx_get_blk(htx, pos) : NULL); + + if (!blk || (htx_get_blk_type(blk) != HTX_BLK_HDR && htx_get_blk_type(blk) != HTX_BLK_TLR)) { + smp->data.u.str.size = 0; + smp->data.u.str.area = ""; + smp->data.u.str.data = 0; + } + else { + struct ist val = htx_get_blk_value(htx, blk); + + chunk_initlen(&smp->data.u.str, val.ptr, val.len, val.len); + } + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_CONST | SMP_F_VOLATILE | SMP_F_MAY_CHANGE; + return 1; +} + +/* Returns the value if the selected HTX block exists and is a data + * block. Otherwise an empty string is returned. Any positive integer (>= 0) is supported + * or "head", "tail" or "first". The channel is chosen depending on the sample + * direction. */ +static int +smp_fetch_htx_blk_data(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + struct channel *chn; + struct htx *htx; + struct htx_blk *blk; + int32_t pos; + + if (!smp->strm || !arg_p) + return 0; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? &smp->strm->res : &smp->strm->req; + htx = smp_prefetch_htx(smp, chn, NULL, 0); + if (!htx) + return 0; + + pos = arg_p[0].data.sint; + if (pos == -1) + blk = htx_get_head_blk(htx); + else if (pos == -2) + blk = htx_get_tail_blk(htx); + else if (pos == -3) + blk = htx_get_first_blk(htx); + else + blk = ((pos >= htx->head && pos <= htx->tail) ? htx_get_blk(htx, pos) : NULL); + + if (!blk || htx_get_blk_type(blk) != HTX_BLK_DATA) { + smp->data.u.str.size = 0; + smp->data.u.str.area = ""; + smp->data.u.str.data = 0; + } + else { + struct ist val = htx_get_blk_value(htx, blk); + + chunk_initlen(&smp->data.u.str, val.ptr, val.len, val.len); + } + smp->data.type = SMP_T_BIN; + smp->flags = SMP_F_CONST | SMP_F_VOLATILE | SMP_F_MAY_CHANGE; + return 1; +} + +/* This function is used to validate the arguments passed to any "htx_blk" fetch + * keywords. An argument is expected by these keywords. It must be a positive + * integer or one of the following strings: "head", "tail" or "first". It returns + * 0 on error, and a non-zero value if OK. 
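+ *
+ * The special names are converted to negative sentinels consumed by the
+ * fetches above: "head" becomes -1, "tail" -2 and "first" -3. For example,
+ * internal.htx_blk.type(head) ends up with an integer argument of -1.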
+ */ +int val_blk_arg(struct arg *arg, char **err_msg) +{ + if (arg[0].type != ARGT_STR || !arg[0].data.str.data) { + memprintf(err_msg, "a block position is expected (> 0) or a special block name (head, tail, first)"); + return 0; + } + if (arg[0].data.str.data == 4 && !strncmp(arg[0].data.str.area, "head", 4)) { + chunk_destroy(&arg[0].data.str); + arg[0].type = ARGT_SINT; + arg[0].data.sint = -1; + } + else if (arg[0].data.str.data == 4 && !strncmp(arg[0].data.str.area, "tail", 4)) { + chunk_destroy(&arg[0].data.str); + arg[0].type = ARGT_SINT; + arg[0].data.sint = -2; + } + else if (arg[0].data.str.data == 5 && !strncmp(arg[0].data.str.area, "first", 5)) { + chunk_destroy(&arg[0].data.str); + arg[0].type = ARGT_SINT; + arg[0].data.sint = -3; + } + else { + int pos; + + for (pos = 0; pos < arg[0].data.str.data; pos++) { + if (!isdigit((unsigned char)arg[0].data.str.area[pos])) { + memprintf(err_msg, "invalid block position"); + return 0; + } + } + + pos = strl2uic(arg[0].data.str.area, arg[0].data.str.data); + if (pos < 0) { + memprintf(err_msg, "block position must not be negative"); + return 0; + } + chunk_destroy(&arg[0].data.str); + arg[0].type = ARGT_SINT; + arg[0].data.sint = pos; + } + + return 1; +} + + +/* Note: must not be declared <const> as its list will be overwritten. + * Note: htx sample fetches should only be used for development purposes. + */ +static struct sample_fetch_kw_list sample_fetch_keywords = {ILH, { + { "internal.strm.is_htx", smp_fetch_is_htx, 0, NULL, SMP_T_BOOL, SMP_USE_INTRN }, + + { "internal.htx.nbblks", smp_fetch_htx_nbblks, 0, NULL, SMP_T_SINT, SMP_USE_HRQHV|SMP_USE_HRSHV}, + { "internal.htx.size", smp_fetch_htx_size, 0, NULL, SMP_T_SINT, SMP_USE_HRQHV|SMP_USE_HRSHV}, + { "internal.htx.data", smp_fetch_htx_data, 0, NULL, SMP_T_SINT, SMP_USE_HRQHV|SMP_USE_HRSHV}, + { "internal.htx.used", smp_fetch_htx_used, 0, NULL, SMP_T_SINT, SMP_USE_HRQHV|SMP_USE_HRSHV}, + { "internal.htx.free", smp_fetch_htx_free, 0, NULL, SMP_T_SINT, SMP_USE_HRQHV|SMP_USE_HRSHV}, + { "internal.htx.free_data", smp_fetch_htx_free_data, 0, NULL, SMP_T_SINT, SMP_USE_HRQHV|SMP_USE_HRSHV}, + { "internal.htx.has_eom", smp_fetch_htx_has_eom, 0, NULL, SMP_T_BOOL, SMP_USE_HRQHV|SMP_USE_HRSHV}, + + { "internal.htx_blk.type", smp_fetch_htx_blk_type, ARG1(1,STR), val_blk_arg, SMP_T_STR, SMP_USE_HRQHV|SMP_USE_HRSHV}, + { "internal.htx_blk.size", smp_fetch_htx_blk_size, ARG1(1,STR), val_blk_arg, SMP_T_SINT, SMP_USE_HRQHV|SMP_USE_HRSHV}, + { "internal.htx_blk.start_line", smp_fetch_htx_blk_stline, ARG1(1,STR), val_blk_arg, SMP_T_STR, SMP_USE_HRQHV|SMP_USE_HRSHV}, + { "internal.htx_blk.hdrname", smp_fetch_htx_blk_hdrname, ARG1(1,STR), val_blk_arg, SMP_T_STR, SMP_USE_HRQHV|SMP_USE_HRSHV}, + { "internal.htx_blk.hdrval", smp_fetch_htx_blk_hdrval, ARG1(1,STR), val_blk_arg, SMP_T_STR, SMP_USE_HRQHV|SMP_USE_HRSHV}, + { "internal.htx_blk.data", smp_fetch_htx_blk_data, ARG1(1,STR), val_blk_arg, SMP_T_BIN, SMP_USE_HRQHV|SMP_USE_HRSHV}, + + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &sample_fetch_keywords); diff --git a/src/http_rules.c b/src/http_rules.c new file mode 100644 index 0000000..192f0c7 --- /dev/null +++ b/src/http_rules.c @@ -0,0 +1,530 @@ +/* + * HTTP rules parsing and registration + * + * Copyright 2000-2018 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your 
option) any later version. + * + */ + +#include <sys/types.h> + +#include <ctype.h> +#include <string.h> +#include <time.h> + +#include <haproxy/acl.h> +#include <haproxy/action.h> +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/capture-t.h> +#include <haproxy/cfgparse.h> +#include <haproxy/chunk.h> +#include <haproxy/global.h> +#include <haproxy/http.h> +#include <haproxy/http_ana-t.h> +#include <haproxy/http_rules.h> +#include <haproxy/log.h> +#include <haproxy/pool.h> +#include <haproxy/proxy.h> +#include <haproxy/sample.h> +#include <haproxy/tools.h> +#include <haproxy/version.h> + + +/* List head of all known action keywords for "http-request" */ +struct action_kw_list http_req_keywords = { + .list = LIST_HEAD_INIT(http_req_keywords.list) +}; + +/* List head of all known action keywords for "http-response" */ +struct action_kw_list http_res_keywords = { + .list = LIST_HEAD_INIT(http_res_keywords.list) +}; + +/* List head of all known action keywords for "http-after-response" */ +struct action_kw_list http_after_res_keywords = { + .list = LIST_HEAD_INIT(http_after_res_keywords.list) +}; + +void http_req_keywords_register(struct action_kw_list *kw_list) +{ + LIST_APPEND(&http_req_keywords.list, &kw_list->list); +} + +void http_res_keywords_register(struct action_kw_list *kw_list) +{ + LIST_APPEND(&http_res_keywords.list, &kw_list->list); +} + +void http_after_res_keywords_register(struct action_kw_list *kw_list) +{ + LIST_APPEND(&http_after_res_keywords.list, &kw_list->list); +} + +/* + * Return the struct http_req_action_kw associated to a keyword. + */ +struct action_kw *action_http_req_custom(const char *kw) +{ + return action_lookup(&http_req_keywords.list, kw); +} + +/* + * Return the struct http_res_action_kw associated to a keyword. + */ +struct action_kw *action_http_res_custom(const char *kw) +{ + return action_lookup(&http_res_keywords.list, kw); +} + +/* + * Return the struct http_after_res_action_kw associated to a keyword. 
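+ *
+ * For example, looking up "set-status" (assuming such an action was
+ * registered through http_after_res_keywords_register()) returns its
+ * action_kw; NULL is returned when no keyword matches.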
+ */ +struct action_kw *action_http_after_res_custom(const char *kw) +{ + return action_lookup(&http_after_res_keywords.list, kw); +} + +/* parse an "http-request" rule */ +struct act_rule *parse_http_req_cond(const char **args, const char *file, int linenum, struct proxy *proxy) +{ + struct act_rule *rule; + const struct action_kw *custom = NULL; + int cur_arg; + + rule = new_act_rule(ACT_F_HTTP_REQ, file, linenum); + if (!rule) { + ha_alert("parsing [%s:%d]: out of memory.\n", file, linenum); + goto out; + } + + if (((custom = action_http_req_custom(args[0])) != NULL)) { + char *errmsg = NULL; + + cur_arg = 1; + /* try in the module list */ + rule->kw = custom; + + if (custom->flags & KWF_EXPERIMENTAL) { + if (!experimental_directives_allowed) { + ha_alert("parsing [%s:%d] : '%s' action is experimental, must be allowed via a global 'expose-experimental-directives'\n", + file, linenum, custom->kw); + goto out_err; + } + mark_tainted(TAINTED_CONFIG_EXP_KW_DECLARED); + } + + if (custom->parse(args, &cur_arg, proxy, rule, &errmsg) == ACT_RET_PRS_ERR) { + ha_alert("parsing [%s:%d] : error detected in %s '%s' while parsing 'http-request %s' rule : %s.\n", + file, linenum, proxy_type_str(proxy), proxy->id, args[0], errmsg); + free(errmsg); + goto out_err; + } + else if (errmsg) { + ha_warning("parsing [%s:%d] : %s.\n", file, linenum, errmsg); + free(errmsg); + } + } + else { + const char *best = action_suggest(args[0], &http_req_keywords.list, NULL); + + action_build_list(&http_req_keywords.list, &trash); + ha_alert("parsing [%s:%d]: 'http-request' expects %s, but got '%s'%s.%s%s%s\n", + file, linenum, trash.area, + args[0], *args[0] ? "" : " (missing argument)", + best ? " Did you mean '" : "", + best ? best : "", + best ? "' maybe ?" : ""); + goto out_err; + } + + if (strcmp(args[cur_arg], "if") == 0 || strcmp(args[cur_arg], "unless") == 0) { + struct acl_cond *cond; + char *errmsg = NULL; + + if ((cond = build_acl_cond(file, linenum, &proxy->acl, proxy, args+cur_arg, &errmsg)) == NULL) { + ha_alert("parsing [%s:%d] : error detected while parsing an 'http-request %s' condition : %s.\n", + file, linenum, args[0], errmsg); + free(errmsg); + goto out_err; + } + rule->cond = cond; + } + else if (*args[cur_arg]) { + ha_alert("parsing [%s:%d]: 'http-request %s' expects" + " either 'if' or 'unless' followed by a condition but found '%s'.\n", + file, linenum, args[0], args[cur_arg]); + goto out_err; + } + + return rule; + out_err: + free_act_rule(rule); + out: + return NULL; +} + +/* parse an "http-response" rule */ +struct act_rule *parse_http_res_cond(const char **args, const char *file, int linenum, struct proxy *proxy) +{ + struct act_rule *rule; + const struct action_kw *custom = NULL; + int cur_arg; + + rule = new_act_rule(ACT_F_HTTP_RES, file, linenum); + if (!rule) { + ha_alert("parsing [%s:%d]: out of memory.\n", file, linenum); + goto out; + } + + if (((custom = action_http_res_custom(args[0])) != NULL)) { + char *errmsg = NULL; + + cur_arg = 1; + /* try in the module list */ + rule->kw = custom; + + if (custom->flags & KWF_EXPERIMENTAL) { + if (!experimental_directives_allowed) { + ha_alert("parsing [%s:%d] : '%s' action is experimental, must be allowed via a global 'expose-experimental-directives'\n", + file, linenum, custom->kw); + goto out_err; + } + mark_tainted(TAINTED_CONFIG_EXP_KW_DECLARED); + } + + if (custom->parse(args, &cur_arg, proxy, rule, &errmsg) == ACT_RET_PRS_ERR) { + ha_alert("parsing [%s:%d] : error detected in %s '%s' while parsing 'http-response %s' rule : %s.\n", + 
file, linenum, proxy_type_str(proxy), proxy->id, args[0], errmsg); + free(errmsg); + goto out_err; + } + else if (errmsg) { + ha_warning("parsing [%s:%d] : %s.\n", file, linenum, errmsg); + free(errmsg); + } + } + else { + const char *best = action_suggest(args[0], &http_res_keywords.list, NULL); + + action_build_list(&http_res_keywords.list, &trash); + ha_alert("parsing [%s:%d]: 'http-response' expects %s, but got '%s'%s.%s%s%s\n", + file, linenum, trash.area, + args[0], *args[0] ? "" : " (missing argument)", + best ? " Did you mean '" : "", + best ? best : "", + best ? "' maybe ?" : ""); + goto out_err; + } + + if (strcmp(args[cur_arg], "if") == 0 || strcmp(args[cur_arg], "unless") == 0) { + struct acl_cond *cond; + char *errmsg = NULL; + + if ((cond = build_acl_cond(file, linenum, &proxy->acl, proxy, args+cur_arg, &errmsg)) == NULL) { + ha_alert("parsing [%s:%d] : error detected while parsing an 'http-response %s' condition : %s.\n", + file, linenum, args[0], errmsg); + free(errmsg); + goto out_err; + } + rule->cond = cond; + } + else if (*args[cur_arg]) { + ha_alert("parsing [%s:%d]: 'http-response %s' expects" + " either 'if' or 'unless' followed by a condition but found '%s'.\n", + file, linenum, args[0], args[cur_arg]); + goto out_err; + } + + return rule; + out_err: + free_act_rule(rule); + out: + return NULL; +} + + +/* parse an "http-after-response" rule */ +struct act_rule *parse_http_after_res_cond(const char **args, const char *file, int linenum, struct proxy *proxy) +{ + struct act_rule *rule; + const struct action_kw *custom = NULL; + int cur_arg; + + rule = new_act_rule(ACT_F_HTTP_RES, file, linenum); + if (!rule) { + ha_alert("parsing [%s:%d]: out of memory.\n", file, linenum); + goto out; + } + + if (((custom = action_http_after_res_custom(args[0])) != NULL)) { + char *errmsg = NULL; + + cur_arg = 1; + /* try in the module list */ + rule->kw = custom; + if (custom->parse(args, &cur_arg, proxy, rule, &errmsg) == ACT_RET_PRS_ERR) { + ha_alert("parsing [%s:%d] : error detected in %s '%s' while parsing 'http-after-response %s' rule : %s.\n", + file, linenum, proxy_type_str(proxy), proxy->id, args[0], errmsg); + free(errmsg); + goto out_err; + } + else if (errmsg) { + ha_warning("parsing [%s:%d] : %s.\n", file, linenum, errmsg); + free(errmsg); + } + } + else { + const char *best = action_suggest(args[0], &http_after_res_keywords.list, NULL); + + action_build_list(&http_after_res_keywords.list, &trash); + ha_alert("parsing [%s:%d]: 'http-after-response' expects %s, but got '%s'%s.%s%s%s\n", + file, linenum, trash.area, + args[0], *args[0] ? "" : " (missing argument)", + best ? " Did you mean '" : "", + best ? best : "", + best ? "' maybe ?" 
: ""); + goto out_err; + } + + if (strcmp(args[cur_arg], "if") == 0 || strcmp(args[cur_arg], "unless") == 0) { + struct acl_cond *cond; + char *errmsg = NULL; + + if ((cond = build_acl_cond(file, linenum, &proxy->acl, proxy, args+cur_arg, &errmsg)) == NULL) { + ha_alert("parsing [%s:%d] : error detected while parsing an 'http-after-response %s' condition : %s.\n", + file, linenum, args[0], errmsg); + free(errmsg); + goto out_err; + } + rule->cond = cond; + } + else if (*args[cur_arg]) { + ha_alert("parsing [%s:%d]: 'http-after-response %s' expects" + " either 'if' or 'unless' followed by a condition but found '%s'.\n", + file, linenum, args[0], args[cur_arg]); + goto out_err; + } + + return rule; + out_err: + free_act_rule(rule); + out: + return NULL; +} + +/* completely free redirect rule */ +void http_free_redirect_rule(struct redirect_rule *rdr) +{ + struct logformat_node *lf, *lfb; + + free_acl_cond(rdr->cond); + free(rdr->rdr_str); + free(rdr->cookie_str); + list_for_each_entry_safe(lf, lfb, &rdr->rdr_fmt, list) { + LIST_DELETE(&lf->list); + release_sample_expr(lf->expr); + free(lf->arg); + free(lf); + } + free(rdr); +} + +/* Parses a redirect rule. Returns the redirect rule on success or NULL on error, + * with <err> filled with the error message. If <use_fmt> is not null, builds a + * dynamic log-format rule instead of a static string. Parameter <dir> indicates + * the direction of the rule, and equals 0 for request, non-zero for responses. + */ +struct redirect_rule *http_parse_redirect_rule(const char *file, int linenum, struct proxy *curproxy, + const char **args, char **errmsg, int use_fmt, int dir) +{ + struct redirect_rule *rule = NULL; + int cur_arg; + int type = REDIRECT_TYPE_NONE; + int code = 302; + const char *destination = NULL; + const char *cookie = NULL; + int cookie_set = 0; + unsigned int flags = (!dir ? 
REDIRECT_FLAG_FROM_REQ : REDIRECT_FLAG_NONE); + struct acl_cond *cond = NULL; + + cur_arg = 0; + while (*(args[cur_arg])) { + if (strcmp(args[cur_arg], "location") == 0) { + if (!*args[cur_arg + 1]) + goto missing_arg; + + type = REDIRECT_TYPE_LOCATION; + cur_arg++; + destination = args[cur_arg]; + } + else if (strcmp(args[cur_arg], "prefix") == 0) { + if (!*args[cur_arg + 1]) + goto missing_arg; + type = REDIRECT_TYPE_PREFIX; + cur_arg++; + destination = args[cur_arg]; + } + else if (strcmp(args[cur_arg], "scheme") == 0) { + if (!*args[cur_arg + 1]) + goto missing_arg; + + type = REDIRECT_TYPE_SCHEME; + cur_arg++; + destination = args[cur_arg]; + } + else if (strcmp(args[cur_arg], "set-cookie") == 0) { + if (!*args[cur_arg + 1]) + goto missing_arg; + + cur_arg++; + cookie = args[cur_arg]; + cookie_set = 1; + } + else if (strcmp(args[cur_arg], "clear-cookie") == 0) { + if (!*args[cur_arg + 1]) + goto missing_arg; + + cur_arg++; + cookie = args[cur_arg]; + cookie_set = 0; + } + else if (strcmp(args[cur_arg], "code") == 0) { + if (!*args[cur_arg + 1]) + goto missing_arg; + + cur_arg++; + code = atol(args[cur_arg]); + if (code < 301 || code > 308 || (code > 303 && code < 307)) { + memprintf(errmsg, + "'%s': unsupported HTTP code '%s' (must be one of 301, 302, 303, 307 or 308)", + args[cur_arg - 1], args[cur_arg]); + goto err; + } + } + else if (strcmp(args[cur_arg], "drop-query") == 0) { + flags |= REDIRECT_FLAG_DROP_QS; + } + else if (strcmp(args[cur_arg], "append-slash") == 0) { + flags |= REDIRECT_FLAG_APPEND_SLASH; + } + else if (strcmp(args[cur_arg], "ignore-empty") == 0) { + flags |= REDIRECT_FLAG_IGNORE_EMPTY; + } + else if (strcmp(args[cur_arg], "if") == 0 || + strcmp(args[cur_arg], "unless") == 0) { + cond = build_acl_cond(file, linenum, &curproxy->acl, curproxy, (const char **)args + cur_arg, errmsg); + if (!cond) { + memprintf(errmsg, "error in condition: %s", *errmsg); + goto err; + } + break; + } + else { + memprintf(errmsg, + "expects 'code', 'prefix', 'location', 'scheme', 'set-cookie', 'clear-cookie', 'drop-query', 'ignore-empty' or 'append-slash' (was '%s')", + args[cur_arg]); + goto err; + } + cur_arg++; + } + + if (type == REDIRECT_TYPE_NONE) { + memprintf(errmsg, "redirection type expected ('prefix', 'location', or 'scheme')"); + goto err; + } + + if (dir && type != REDIRECT_TYPE_LOCATION) { + memprintf(errmsg, "response only supports redirect type 'location'"); + goto err; + } + + rule = calloc(1, sizeof(*rule)); + if (!rule) + goto out_of_memory; + rule->cond = cond; + LIST_INIT(&rule->rdr_fmt); + + if (!use_fmt) { + /* old-style static redirect rule */ + rule->rdr_str = strdup(destination); + if (!rule->rdr_str) + goto out_of_memory; + rule->rdr_len = strlen(destination); + } + else { + /* log-format based redirect rule */ + int cap = 0; + + /* Parse destination. Note that in the REDIRECT_TYPE_PREFIX case, + * if prefix == "/", we don't want to add anything, otherwise it + * makes it hard for the user to configure a self-redirection. + */ + curproxy->conf.args.ctx = ARGC_RDR; + if (curproxy->cap & PR_CAP_FE) + cap |= (dir ? SMP_VAL_FE_HRS_HDR : SMP_VAL_FE_HRQ_HDR); + if (curproxy->cap & PR_CAP_BE) + cap |= (dir ? 
SMP_VAL_BE_HRS_HDR : SMP_VAL_BE_HRQ_HDR); + if (!(type == REDIRECT_TYPE_PREFIX && destination[0] == '/' && destination[1] == '\0')) { + if (!parse_logformat_string(destination, curproxy, &rule->rdr_fmt, LOG_OPT_HTTP, cap, errmsg)) { + goto err; + } + free(curproxy->conf.lfs_file); + curproxy->conf.lfs_file = strdup(curproxy->conf.args.file); + curproxy->conf.lfs_line = curproxy->conf.args.line; + } + } + + if (cookie) { + /* depending on cookie_set, either we want to set the cookie, or to clear it. + * a clear consists in appending "; path=/; Max-Age=0;" at the end. + */ + rule->cookie_len = strlen(cookie); + if (cookie_set) { + rule->cookie_str = malloc(rule->cookie_len + 10); + if (!rule->cookie_str) + goto out_of_memory; + memcpy(rule->cookie_str, cookie, rule->cookie_len); + memcpy(rule->cookie_str + rule->cookie_len, "; path=/;", 10); + rule->cookie_len += 9; + } else { + rule->cookie_str = malloc(rule->cookie_len + 21); + if (!rule->cookie_str) + goto out_of_memory; + memcpy(rule->cookie_str, cookie, rule->cookie_len); + memcpy(rule->cookie_str + rule->cookie_len, "; path=/; Max-Age=0;", 21); + rule->cookie_len += 20; + } + } + rule->type = type; + rule->code = code; + rule->flags = flags; + LIST_INIT(&rule->list); + return rule; + + missing_arg: + memprintf(errmsg, "missing argument for '%s'", args[cur_arg]); + goto err; + out_of_memory: + memprintf(errmsg, "parsing [%s:%d]: out of memory.", file, linenum); + err: + if (rule) + http_free_redirect_rule(rule); + else if (cond) { + /* rule not yet allocated, but cond already is */ + free_acl_cond(cond); + } + + return NULL; +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/htx.c b/src/htx.c new file mode 100644 index 0000000..feb7eec --- /dev/null +++ b/src/htx.c @@ -0,0 +1,1099 @@ +/* + * internal HTTP message + * + * Copyright 2018 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <haproxy/chunk.h> +#include <haproxy/htx.h> +#include <haproxy/net_helper.h> + +struct htx htx_empty = { .size = 0, .data = 0, .head = -1, .tail = -1, .first = -1 }; + +/* tests show that 63% of these calls are for 64-bit chunks, so better avoid calling + * memcpy() for that! + */ +static inline __attribute__((always_inline)) void htx_memcpy(void *dst, void *src, size_t len) +{ + if (likely(len == 8)) + write_u64(dst, read_u64(src)); + else + memcpy(dst, src, len); +} + +/* Defragments an HTX message. It removes unused blocks and unwraps the payloads + * part. A temporary buffer is used to do so. This function never fails. Most of + * time, we need keep a ref on a specific HTX block. Thus is <blk> is set, the + * pointer on its new position, after defrag, is returned. In addition, if the + * size of the block must be altered, <blkinfo> info must be provided (!= + * 0). But in this case, it remains the caller responsibility to update the + * block content. 
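As a minimal caller sketch (an editor's illustration, not part of this patch; <blk> is assumed to be a valid DATA block of <htx> and <new_sz> its intended new payload size):

      uint32_t info = (HTX_BLK_DATA << 28) + new_sz;  // new size encoded in the info word

      blk = htx_defrag(htx, blk, info);  // <blk> now points at the block's new position
      // the payload itself must still be rewritten by the caller to match <info>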
+ */ +/* TODO: merge data blocks into one */ +struct htx_blk *htx_defrag(struct htx *htx, struct htx_blk *blk, uint32_t blkinfo) +{ + struct buffer *chunk = get_trash_chunk(); + struct htx *tmp = htxbuf(chunk); + struct htx_blk *newblk, *oldblk; + uint32_t new, old, blkpos; + uint32_t addr, blksz; + int32_t first = -1; + + if (htx->head == -1) + return NULL; + + blkpos = -1; + + new = 0; + addr = 0; + tmp->size = htx->size; + tmp->data = 0; + + /* start from the head */ + for (old = htx_get_head(htx); old != -1; old = htx_get_next(htx, old)) { + oldblk = htx_get_blk(htx, old); + if (htx_get_blk_type(oldblk) == HTX_BLK_UNUSED) + continue; + + blksz = htx_get_blksz(oldblk); + htx_memcpy((void *)tmp->blocks + addr, htx_get_blk_ptr(htx, oldblk), blksz); + + /* update the start-line position */ + if (htx->first == old) + first = new; + + newblk = htx_get_blk(tmp, new); + newblk->addr = addr; + newblk->info = oldblk->info; + + /* if <blk> is defined, save its new position */ + if (blk != NULL && blk == oldblk) { + if (blkinfo) + newblk->info = blkinfo; + blkpos = new; + } + + blksz = htx_get_blksz(newblk); + addr += blksz; + tmp->data += blksz; + new++; + } + + htx->data = tmp->data; + htx->first = first; + htx->head = 0; + htx->tail = new - 1; + htx->head_addr = htx->end_addr = 0; + htx->tail_addr = addr; + htx->flags &= ~HTX_FL_FRAGMENTED; + htx_memcpy((void *)htx->blocks, (void *)tmp->blocks, htx->size); + + return ((blkpos == -1) ? NULL : htx_get_blk(htx, blkpos)); +} + +/* Degragments HTX blocks of an HTX message. Payloads part is keep untouched + * here. This function will move back all blocks starting at the position 0, + * removing unused blocks. It must never be called with an empty message. + */ +static void htx_defrag_blks(struct htx *htx) +{ + int32_t pos, new; + + new = 0; + for (pos = htx_get_head(htx); pos != -1; pos = htx_get_next(htx, pos)) { + struct htx_blk *posblk, *newblk; + + if (pos == new) { + new++; + continue; + } + + posblk = htx_get_blk(htx, pos); + if (htx_get_blk_type(posblk) == HTX_BLK_UNUSED) + continue; + + if (htx->first == pos) + htx->first = new; + newblk = htx_get_blk(htx, new++); + newblk->info = posblk->info; + newblk->addr = posblk->addr; + } + BUG_ON(!new); + htx->head = 0; + htx->tail = new - 1; +} + +/* Reserves a new block in the HTX message <htx> with a content of <blksz> + * bytes. If there is not enough space, NULL is returned. Otherwise the reserved + * block is returned and the HTX message is updated. Space for this new block is + * reserved in the HTX message. But it is the caller responsibility to set right + * info in the block to reflect the stored data. + */ +static struct htx_blk *htx_reserve_nxblk(struct htx *htx, uint32_t blksz) +{ + struct htx_blk *blk; + uint32_t tail, headroom, tailroom; + + if (blksz > htx_free_data_space(htx)) + return NULL; /* full */ + + if (htx->head == -1) { + /* Empty message */ + htx->head = htx->tail = htx->first = 0; + blk = htx_get_blk(htx, htx->tail); + blk->addr = 0; + htx->data = blksz; + htx->tail_addr = blksz; + return blk; + } + + /* Find the block's position. First, we try to get the next position in + * the message, increasing the tail by one. If this position is not + * available with some holes, we try to defrag the blocks without + * touching their paylood. If it is impossible, we fully defrag the + * message. 
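In short: first try tail + 1 directly; if blocks were removed from the head (htx->head > 0), compact the block table with htx_defrag_blks(), which leaves payloads untouched; otherwise fall back to a full htx_defrag().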
+ */ + tail = htx->tail + 1; + if (htx_pos_to_addr(htx, tail) >= htx->tail_addr) + ; + else if (htx->head > 0) { + htx_defrag_blks(htx); + tail = htx->tail + 1; + BUG_ON(htx_pos_to_addr(htx, tail) < htx->tail_addr); + } + else + goto defrag; + + /* Now, we have found the block's position. Try to find where to put its + * payload. The free space is split in two areas: + * + * * The free space in front of the blocks table. This one is used if and + * only if the other one was not used yet. + * + * * The free space at the beginning of the message. Once this one is + * used, the other one is never used again, until the next defrag. + */ + headroom = (htx->end_addr - htx->head_addr); + tailroom = (!htx->head_addr ? htx_pos_to_addr(htx, tail) - htx->tail_addr : 0); + BUG_ON((int32_t)headroom < 0); + BUG_ON((int32_t)tailroom < 0); + + if (blksz <= tailroom) { + blk = htx_get_blk(htx, tail); + blk->addr = htx->tail_addr; + htx->tail_addr += blksz; + } + else if (blksz <= headroom) { + blk = htx_get_blk(htx, tail); + blk->addr = htx->head_addr; + htx->head_addr += blksz; + } + else { + defrag: + /* need to defragment the message before inserting upfront */ + htx_defrag(htx, NULL, 0); + tail = htx->tail + 1; + blk = htx_get_blk(htx, tail); + blk->addr = htx->tail_addr; + htx->tail_addr += blksz; + } + + htx->tail = tail; + htx->data += blksz; + /* Set first position if not already set */ + if (htx->first == -1) + htx->first = tail; + + BUG_ON((int32_t)htx->tail_addr < 0); + BUG_ON((int32_t)htx->head_addr < 0); + BUG_ON(htx->end_addr > htx->tail_addr); + BUG_ON(htx->head_addr > htx->end_addr); + + return blk; +} + +/* Prepares the block to an expansion of its payload. The payload will be + * expanded by <delta> bytes and we need find where this expansion will be + * performed. It can be a compression if <delta> is negative. This function only + * updates all addresses. The caller have the responsibility to perform the + * expansion and update the block and the HTX message accordingly. No error must + * occur. It returns following values: + * + * 0: The expansion cannot be performed, there is not enough space. + * + * 1: the expansion must be performed in place, there is enough space after + * the block's payload to handle it. This is especially true if it is a + * compression and not an expansion. + * + * 2: the block's payload must be moved at the new block address before doing + * the expansion. 
+ * + * 3: the HTX message message must be defragmented + */ +static int htx_prepare_blk_expansion(struct htx *htx, struct htx_blk *blk, int32_t delta) +{ + uint32_t sz, tailroom, headroom; + int ret = 3; + + BUG_ON(htx->head == -1); + + headroom = (htx->end_addr - htx->head_addr); + tailroom = (htx_pos_to_addr(htx, htx->tail) - htx->tail_addr); + BUG_ON((int32_t)headroom < 0); + BUG_ON((int32_t)tailroom < 0); + + sz = htx_get_blksz(blk); + if (delta <= 0) { + /* It is a compression, it can be performed in place */ + if (blk->addr+sz == htx->tail_addr) + htx->tail_addr += delta; + else if (blk->addr+sz == htx->head_addr) + htx->head_addr += delta; + ret = 1; + } + else if (delta > htx_free_space(htx)) { + /* There is not enough space to handle the expansion */ + ret = 0; + } + else if (blk->addr+sz == htx->tail_addr) { + /* The block's payload is just before the tail room */ + if (delta < tailroom) { + /* Expand the block's payload */ + htx->tail_addr += delta; + ret = 1; + } + else if ((sz + delta) < headroom) { + uint32_t oldaddr = blk->addr; + + /* Move the block's payload into the headroom */ + blk->addr = htx->head_addr; + htx->tail_addr -= sz; + htx->head_addr += sz + delta; + if (oldaddr == htx->end_addr) { + if (htx->end_addr == htx->tail_addr) { + htx->tail_addr = htx->head_addr; + htx->head_addr = htx->end_addr = 0; + } + else + htx->end_addr += sz; + } + ret = 2; + } + } + else if (blk->addr+sz == htx->head_addr) { + /* The block's payload is just before the head room */ + if (delta < headroom) { + /* Expand the block's payload */ + htx->head_addr += delta; + ret = 1; + } + } + else { + /* The block's payload is not at the rooms edge */ + if (!htx->head_addr && sz+delta < tailroom) { + /* Move the block's payload into the tailroom */ + if (blk->addr == htx->end_addr) + htx->end_addr += sz; + blk->addr = htx->tail_addr; + htx->tail_addr += sz + delta; + ret = 2; + } + else if (sz+delta < headroom) { + /* Move the block's payload into the headroom */ + if (blk->addr == htx->end_addr) + htx->end_addr += sz; + blk->addr = htx->head_addr; + htx->head_addr += sz + delta; + ret = 2; + } + } + /* Otherwise defrag the HTX message */ + + BUG_ON((int32_t)htx->tail_addr < 0); + BUG_ON((int32_t)htx->head_addr < 0); + BUG_ON(htx->end_addr > htx->tail_addr); + BUG_ON(htx->head_addr > htx->end_addr); + return ret; +} + +/* Adds a new block of type <type> in the HTX message <htx>. Its content size is + * passed but it is the caller responsibility to do the copy. + */ +struct htx_blk *htx_add_blk(struct htx *htx, enum htx_blk_type type, uint32_t blksz) +{ + struct htx_blk *blk; + + BUG_ON(blksz >= 256 << 20); + blk = htx_reserve_nxblk(htx, blksz); + if (!blk) + return NULL; + BUG_ON(blk->addr > htx->size); + + blk->info = (type << 28); + return blk; +} + +/* Removes the block <blk> from the HTX message <htx>. The function returns the + * block following <blk> or NULL if <blk> is the last block or the last inserted + * one. 
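Since the next block is returned, the function conveniently feeds a removal loop; htx_truncate() below uses exactly this idiom:

      while (blk)
              blk = htx_remove_blk(htx, blk);  // drops <blk> and every following block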
+ */ +struct htx_blk *htx_remove_blk(struct htx *htx, struct htx_blk *blk) +{ + enum htx_blk_type type; + uint32_t pos, addr, sz; + + BUG_ON(!blk || htx->head == -1); + + /* This is the last block in use */ + if (htx->head == htx->tail) { + uint32_t flags = (htx->flags & ~HTX_FL_FRAGMENTED); /* Preserve flags except FRAGMENTED */ + + htx_reset(htx); + htx->flags = flags; /* restore flags */ + return NULL; + } + + type = htx_get_blk_type(blk); + pos = htx_get_blk_pos(htx, blk); + sz = htx_get_blksz(blk); + addr = blk->addr; + if (type != HTX_BLK_UNUSED) { + /* Mark the block as unused, decrement allocated size */ + htx->data -= htx_get_blksz(blk); + blk->info = ((uint32_t)HTX_BLK_UNUSED << 28); + } + + /* There is at least 2 blocks, so tail is always > 0 */ + if (pos == htx->head) { + /* move the head forward */ + htx->head++; + } + else if (pos == htx->tail) { + /* remove the tail. this was the last inserted block so + * return NULL. */ + htx->tail--; + blk = NULL; + goto end; + } + else + htx->flags |= HTX_FL_FRAGMENTED; + + blk = htx_get_blk(htx, pos+1); + + end: + if (pos == htx->first) + htx->first = (blk ? htx_get_blk_pos(htx, blk) : -1); + + if (htx->head == htx->tail) { + /* If there is just one block in the HTX message, free space can + * be adjusted. This operation could save some defrags. */ + struct htx_blk *lastblk = htx_get_blk(htx, htx->tail); + + htx->head_addr = 0; + htx->end_addr = lastblk->addr; + htx->tail_addr = lastblk->addr+htx->data; + } + else { + if (addr+sz == htx->tail_addr) + htx->tail_addr = addr; + else if (addr+sz == htx->head_addr) + htx->head_addr = addr; + if (addr == htx->end_addr) { + if (htx->tail_addr == htx->end_addr) { + htx->tail_addr = htx->head_addr; + htx->head_addr = htx->end_addr = 0; + } + else + htx->end_addr += sz; + } + } + + BUG_ON((int32_t)htx->tail_addr < 0); + BUG_ON((int32_t)htx->head_addr < 0); + BUG_ON(htx->end_addr > htx->tail_addr); + BUG_ON(htx->head_addr > htx->end_addr); + return blk; +} + +/* Looks for the HTX block containing the offset <offset>, starting at the HTX + * message's head. The function returns an htx_ret with the found HTX block and + * the position inside this block where the offset is. If the offset <offset> is + * outside of the HTX message, htx_ret.blk is set to NULL. + */ +struct htx_ret htx_find_offset(struct htx *htx, uint32_t offset) +{ + struct htx_blk *blk; + struct htx_ret htxret = { .blk = NULL, .ret = 0 }; + + if (offset >= htx->data) + return htxret; + + for (blk = htx_get_head_blk(htx); blk && offset; blk = htx_get_next_blk(htx, blk)) { + uint32_t sz = htx_get_blksz(blk); + + if (offset < sz) + break; + offset -= sz; + } + htxret.blk = blk; + htxret.ret = offset; + return htxret; +} + +/* Removes all blocks after the one containing the offset <offset>. This last + * one may be truncated if it is a DATA block. + */ +void htx_truncate(struct htx *htx, uint32_t offset) +{ + struct htx_blk *blk; + struct htx_ret htxret = htx_find_offset(htx, offset); + + blk = htxret.blk; + if (blk && htxret.ret && htx_get_blk_type(blk) == HTX_BLK_DATA) { + htx_change_blk_value_len(htx, blk, htxret.ret); + blk = htx_get_next_blk(htx, blk); + } + while (blk) + blk = htx_remove_blk(htx, blk); +} + +/* Drains <count> bytes from the HTX message <htx>. If the last block is a DATA + * block, it will be cut if necessary. Others blocks will be removed at once if + * <count> is large enough. The function returns an htx_ret with the first block + * remaining in the message and the amount of data drained. 
If everything is + * removed, htx_ret.blk is set to NULL. + */ +struct htx_ret htx_drain(struct htx *htx, uint32_t count) +{ + struct htx_blk *blk; + struct htx_ret htxret = { .blk = NULL, .ret = 0 }; + + if (count == htx->data) { + uint32_t flags = (htx->flags & ~HTX_FL_FRAGMENTED); /* Preserve flags except FRAGMENTED */ + + htx_reset(htx); + htx->flags = flags; /* restore flags */ + htxret.ret = count; + return htxret; + } + + blk = htx_get_head_blk(htx); + while (count && blk) { + uint32_t sz = htx_get_blksz(blk); + enum htx_blk_type type = htx_get_blk_type(blk); + + /* Ignore unused block */ + if (type == HTX_BLK_UNUSED) + goto next; + + if (sz > count) { + if (type == HTX_BLK_DATA) { + htx_cut_data_blk(htx, blk, count); + htxret.ret += count; + } + break; + } + count -= sz; + htxret.ret += sz; + next: + blk = htx_remove_blk(htx, blk); + } + htxret.blk = blk; + + return htxret; +} + +/* Tries to append data to the last inserted block, if the type matches and if + * there is enough space to take it all. If the space wraps, the buffer is + * defragmented and a new block is inserted. If an error occurred, NULL is + * returned. Otherwise, on success, the updated block (or the new one) is + * returned. Due to its nature this function can be expensive and should be + * avoided whenever possible. + */ +struct htx_blk *htx_add_data_atonce(struct htx *htx, struct ist data) +{ + struct htx_blk *blk, *tailblk; + void *ptr; + uint32_t len, sz, tailroom, headroom; + + if (htx->head == -1) + goto add_new_block; + + /* Not enough space to store data */ + if (data.len > htx_free_data_space(htx)) + return NULL; + + /* get the tail block and its size */ + tailblk = htx_get_tail_blk(htx); + if (tailblk == NULL) + goto add_new_block; + sz = htx_get_blksz(tailblk); + + /* Don't try to append data if the last inserted block is not of the + * same type */ + if (htx_get_blk_type(tailblk) != HTX_BLK_DATA) + goto add_new_block; + + /* + * Same type and enough space: append data + */ + headroom = (htx->end_addr - htx->head_addr); + tailroom = (htx_pos_to_addr(htx, htx->tail) - htx->tail_addr); + BUG_ON((int32_t)headroom < 0); + BUG_ON((int32_t)tailroom < 0); + + len = data.len; + if (tailblk->addr+sz == htx->tail_addr) { + if (data.len <= tailroom) + goto append_data; + else if (!htx->head_addr) { + len = tailroom; + goto append_data; + } + } + else if (tailblk->addr+sz == htx->head_addr && data.len <= headroom) + goto append_data; + + goto add_new_block; + + append_data: + /* Append data and update the block itself */ + ptr = htx_get_blk_ptr(htx, tailblk); + htx_memcpy(ptr+sz, data.ptr, len); + htx_change_blk_value_len(htx, tailblk, sz+len); + + if (data.len == len) { + blk = tailblk; + goto end; + } + data = istadv(data, len); + + add_new_block: + blk = htx_add_blk(htx, HTX_BLK_DATA, data.len); + if (!blk) + return NULL; + + blk->info += data.len; + htx_memcpy(htx_get_blk_ptr(htx, blk), data.ptr, data.len); + + end: + BUG_ON((int32_t)htx->tail_addr < 0); + BUG_ON((int32_t)htx->head_addr < 0); + BUG_ON(htx->end_addr > htx->tail_addr); + BUG_ON(htx->head_addr > htx->end_addr); + return blk; +} + +/* Replaces a value part of a block by a new one. The new part can be smaller or + * larger than the old one. This function works for any kind of block with + * attached data. It returns the new block on success, otherwise it returns + * NULL. 
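A hedged usage sketch (hypothetical lengths and replacement text; note that <old> must point inside the block's current value, it is not searched for):

      struct ist v   = htx_get_blk_value(htx, blk);
      struct ist old = ist2(v.ptr, 4);  // replace the first 4 bytes of the value

      blk = htx_replace_blk_value(htx, blk, old, ist("gzip"));
      if (!blk)
              return 0;  // not enough space in the message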
+ */ +struct htx_blk *htx_replace_blk_value(struct htx *htx, struct htx_blk *blk, + const struct ist old, const struct ist new) +{ + struct ist n, v; + int32_t delta; + int ret; + + n = htx_get_blk_name(htx, blk); + v = htx_get_blk_value(htx, blk); + delta = new.len - old.len; + ret = htx_prepare_blk_expansion(htx, blk, delta); + if (!ret) + return NULL; /* not enough space */ + + if (ret == 1) { /* Replace in place */ + if (delta <= 0) { + /* compression: copy new data first then move the end */ + htx_memcpy(old.ptr, new.ptr, new.len); + memmove(old.ptr + new.len, istend(old), + istend(v) - istend(old)); + } + else { + /* expansion: move the end first then copy new data */ + memmove(old.ptr + new.len, istend(old), + istend(v) - istend(old)); + htx_memcpy(old.ptr, new.ptr, new.len); + } + + /* set the new block size and update HTX message */ + htx_set_blk_value_len(blk, v.len + delta); + htx->data += delta; + } + else if (ret == 2) { /* New address but no defrag */ + void *ptr = htx_get_blk_ptr(htx, blk); + + /* Copy the name, if any */ + htx_memcpy(ptr, n.ptr, n.len); + ptr += n.len; + + /* Copy value before old part, if any */ + htx_memcpy(ptr, v.ptr, old.ptr - v.ptr); + ptr += old.ptr - v.ptr; + + /* Copy new value */ + htx_memcpy(ptr, new.ptr, new.len); + ptr += new.len; + + /* Copy value after old part, if any */ + htx_memcpy(ptr, istend(old), istend(v) - istend(old)); + + /* set the new block size and update HTX message */ + htx_set_blk_value_len(blk, v.len + delta); + htx->data += delta; + } + else { /* Do a degrag first (it is always an expansion) */ + struct htx_blk tmpblk; + int32_t offset; + + /* use tmpblk to set new block size before defrag and to compute + * the offset after defrag + */ + tmpblk.addr = blk->addr; + tmpblk.info = blk->info; + htx_set_blk_value_len(&tmpblk, v.len + delta); + + /* htx_defrag() will take care to update the block size and the htx message */ + blk = htx_defrag(htx, blk, tmpblk.info); + + /* newblk is now the new HTX block. Compute the offset to copy/move payload */ + offset = blk->addr - tmpblk.addr; + + /* move the end first and copy new data + */ + memmove(old.ptr + offset + new.len, old.ptr + offset + old.len, + istend(v) - istend(old)); + htx_memcpy(old.ptr + offset, new.ptr, new.len); + } + return blk; +} + +/* Transfer HTX blocks from <src> to <dst>, stopping on the first block of the + * type <mark> (typically EOH or EOT) or when <count> bytes were moved + * (including payload and meta-data). It returns the number of bytes moved and + * the last HTX block inserted in <dst>. + */ +struct htx_ret htx_xfer_blks(struct htx *dst, struct htx *src, uint32_t count, + enum htx_blk_type mark) +{ + struct htx_blk *blk, *dstblk; + struct htx_blk *srcref, *dstref; + enum htx_blk_type type; + uint32_t info, max, sz, ret; + + ret = htx_used_space(dst); + srcref = dstref = dstblk = NULL; + + /* blocks are not removed yet from <src> HTX message to be able to + * rollback the transfer if all the headers/trailers are not copied. 
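They are only removed from <src> in a second pass at the end of this function, once the amount actually transferred is known.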
+ */ + for (blk = htx_get_head_blk(src); blk && count; blk = htx_get_next_blk(src, blk)) { + type = htx_get_blk_type(blk); + + /* Ignore unused block */ + if (type == HTX_BLK_UNUSED) + continue; + + + max = htx_get_max_blksz(dst, count); + if (!max) + break; + + sz = htx_get_blksz(blk); + info = blk->info; + if (sz > max) { + /* Only DATA blocks can be partially xferred */ + if (type != HTX_BLK_DATA) + break; + sz = max; + info = (type << 28) + sz; + } + + dstblk = htx_reserve_nxblk(dst, sz); + if (!dstblk) + break; + dstblk->info = info; + htx_memcpy(htx_get_blk_ptr(dst, dstblk), htx_get_blk_ptr(src, blk), sz); + + count -= sizeof(dstblk) + sz; + if (blk->info != info) { + /* Partial xfer: don't remove <blk> from <src> but + * resize its content */ + htx_cut_data_blk(src, blk, sz); + break; + } + + if (type == mark) { + blk = htx_get_next_blk(src, blk); + srcref = dstref = NULL; + break; + } + + /* Save <blk> to <srcref> and <dstblk> to <dstref> when we start + * to xfer headers or trailers. When EOH/EOT block is reached, + * both are reset. It is mandatory to be able to rollback a + * partial transfer. + */ + if (!srcref && !dstref && + (type == HTX_BLK_REQ_SL || type == HTX_BLK_RES_SL || type == HTX_BLK_TLR)) { + srcref = blk; + dstref = dstblk; + } + else if (type == HTX_BLK_EOH || type == HTX_BLK_EOT) + srcref = dstref = NULL; + } + + if (unlikely(dstref)) { + /* Headers or trailers part was partially xferred, so rollback + * the copy by removing all block between <dstref> and <dstblk>, + * both included. <dstblk> may be NULL. + */ + while (dstref && dstref != dstblk) + dstref = htx_remove_blk(dst, dstref); + if (dstblk) + htx_remove_blk(dst, dstblk); + + /* <dst> HTX message is empty, it means the headers or trailers + * part is too big to be copied at once. + */ + if (htx_is_empty(dst)) + src->flags |= HTX_FL_PARSING_ERROR; + } + + /* Now, remove xferred blocks from <src> htx message */ + if (!blk && !srcref) { + /* End of src reached, all blocks were consumed, drain all data */ + htx_drain(src, src->data); + } + else { + /* Remove all block from the head to <blk>, or <srcref> if defined, excluded */ + srcref = (srcref ? srcref : blk); + for (blk = htx_get_head_blk(src); blk && blk != srcref; blk = htx_remove_blk(src, blk)); + } + + end: + ret = htx_used_space(dst) - ret; + return (struct htx_ret){.ret = ret, .blk = dstblk}; +} + +/* Replaces an header by a new one. The new header can be smaller or larger than + * the old one. It returns the new block on success, otherwise it returns NULL. + * The header name is always lower cased. + */ +struct htx_blk *htx_replace_header(struct htx *htx, struct htx_blk *blk, + const struct ist name, const struct ist value) +{ + enum htx_blk_type type; + void *ptr; + int32_t delta; + int ret; + + type = htx_get_blk_type(blk); + if (type != HTX_BLK_HDR) + return NULL; + + delta = name.len + value.len - htx_get_blksz(blk); + ret = htx_prepare_blk_expansion(htx, blk, delta); + if (!ret) + return NULL; /* not enough space */ + + + /* Replace in place or at a new address is the same. We replace all the + * header (name+value). Only take care to defrag the message if + * necessary. */ + if (ret == 3) + blk = htx_defrag(htx, blk, (type << 28) + (value.len << 8) + name.len); + else { + /* Set the new block size and update HTX message */ + blk->info = (type << 28) + (value.len << 8) + name.len; + htx->data += delta; + } + + /* Finally, copy data. 
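The name is written lower-cased via ist2bin_lc() since HTX stores header names in their canonical lower-case form.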
*/ + ptr = htx_get_blk_ptr(htx, blk); + ist2bin_lc(ptr, name); + htx_memcpy(ptr + name.len, value.ptr, value.len); + return blk; +} + +/* Replaces the parts of the start-line. It returns the new start-line on + * success, otherwise it returns NULL. It is the caller responsibility to update + * sl->info, if necessary. + */ +struct htx_sl *htx_replace_stline(struct htx *htx, struct htx_blk *blk, const struct ist p1, + const struct ist p2, const struct ist p3) +{ + enum htx_blk_type type; + struct htx_sl *sl; + struct htx_sl tmp; /* used to save sl->info and sl->flags */ + uint32_t sz; + int32_t delta; + int ret; + + type = htx_get_blk_type(blk); + if (type != HTX_BLK_REQ_SL && type != HTX_BLK_RES_SL) + return NULL; + + /* Save start-line info and flags */ + sl = htx_get_blk_ptr(htx, blk); + tmp.info = sl->info; + tmp.flags = sl->flags; + + sz = htx_get_blksz(blk); + delta = sizeof(*sl) + p1.len + p2.len + p3.len - sz; + ret = htx_prepare_blk_expansion(htx, blk, delta); + if (!ret) + return NULL; /* not enough space */ + + /* Replace in place or at a new address is the same. We replace all the + * start-line. Only take care to defrag the message if necessary. */ + if (ret == 3) { + blk = htx_defrag(htx, blk, (type << 28) + sz + delta); + } + else { + /* Set the new block size and update HTX message */ + blk->info = (type << 28) + sz + delta; + htx->data += delta; + } + + /* Restore start-line info and flags and copy parts of the start-line */ + sl = htx_get_blk_ptr(htx, blk); + sl->info = tmp.info; + sl->flags = tmp.flags; + + HTX_SL_P1_LEN(sl) = p1.len; + HTX_SL_P2_LEN(sl) = p2.len; + HTX_SL_P3_LEN(sl) = p3.len; + + htx_memcpy(HTX_SL_P1_PTR(sl), p1.ptr, p1.len); + htx_memcpy(HTX_SL_P2_PTR(sl), p2.ptr, p2.len); + htx_memcpy(HTX_SL_P3_PTR(sl), p3.ptr, p3.len); + + return sl; +} + +/* Reserves the maximum possible size for an HTX data block, by extending an + * existing one or by creating a now one. It returns a compound result with the + * HTX block and the position where new data must be inserted (0 for a new + * block). If an error occurs or if there is no space left, NULL is returned + * instead of a pointer on an HTX block. 
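A hedged receive-path sketch (an editor's illustration; <input> and <input_len> are assumptions):

      struct htx_ret r = htx_reserve_max_data(htx);

      if (r.blk) {
              size_t room   = htx_get_blksz(r.blk) - r.ret;  // space reserved past existing data
              size_t copied = MIN(room, input_len);

              memcpy(htx_get_blk_ptr(htx, r.blk) + r.ret, input, copied);
              htx_change_blk_value_len(htx, r.blk, r.ret + copied);  // shrink to what was used
      }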
+ */ +struct htx_ret htx_reserve_max_data(struct htx *htx) +{ + struct htx_blk *blk, *tailblk; + uint32_t sz, room; + int32_t len = htx_free_data_space(htx); + + if (htx->head == -1) + goto rsv_new_block; + + if (!len) + return (struct htx_ret){.ret = 0, .blk = NULL}; + + /* get the tail and head block */ + tailblk = htx_get_tail_blk(htx); + if (tailblk == NULL) + goto rsv_new_block; + sz = htx_get_blksz(tailblk); + + /* Don't try to append data if the last inserted block is not of the + * same type */ + if (htx_get_blk_type(tailblk) != HTX_BLK_DATA) + goto rsv_new_block; + + /* + * Same type and enough space: append data + */ + if (!htx->head_addr) { + if (tailblk->addr+sz != htx->tail_addr) + goto rsv_new_block; + room = (htx_pos_to_addr(htx, htx->tail) - htx->tail_addr); + } + else { + if (tailblk->addr+sz != htx->head_addr) + goto rsv_new_block; + room = (htx->end_addr - htx->head_addr); + } + BUG_ON((int32_t)room < 0); + if (room < len) + len = room; + +append_data: + htx_change_blk_value_len(htx, tailblk, sz+len); + + BUG_ON((int32_t)htx->tail_addr < 0); + BUG_ON((int32_t)htx->head_addr < 0); + BUG_ON(htx->end_addr > htx->tail_addr); + BUG_ON(htx->head_addr > htx->end_addr); + return (struct htx_ret){.ret = sz, .blk = tailblk}; + +rsv_new_block: + blk = htx_add_blk(htx, HTX_BLK_DATA, len); + if (!blk) + return (struct htx_ret){.ret = 0, .blk = NULL}; + blk->info += len; + return (struct htx_ret){.ret = 0, .blk = blk}; +} + +/* Adds an HTX block of type DATA in <htx>. It first tries to append data if + * possible. It returns the number of bytes consumed from <data>, which may be + * zero if nothing could be copied. + */ +size_t htx_add_data(struct htx *htx, const struct ist data) +{ + struct htx_blk *blk, *tailblk; + void *ptr; + uint32_t sz, room; + int32_t len = data.len; + + /* Not enough space to store data */ + if (len > htx_free_data_space(htx)) + len = htx_free_data_space(htx); + + if (!len) + return 0; + + if (htx->head == -1) + goto add_new_block; + + /* get the tail and head block */ + tailblk = htx_get_tail_blk(htx); + if (tailblk == NULL) + goto add_new_block; + sz = htx_get_blksz(tailblk); + + /* Don't try to append data if the last inserted block is not of the + * same type */ + if (htx_get_blk_type(tailblk) != HTX_BLK_DATA) + goto add_new_block; + + /* + * Same type and enough space: append data + */ + if (!htx->head_addr) { + if (tailblk->addr+sz != htx->tail_addr) + goto add_new_block; + room = (htx_pos_to_addr(htx, htx->tail) - htx->tail_addr); + } + else { + if (tailblk->addr+sz != htx->head_addr) + goto add_new_block; + room = (htx->end_addr - htx->head_addr); + } + BUG_ON((int32_t)room < 0); + if (room < len) + len = room; + + append_data: + /* Append data and update the block itself */ + ptr = htx_get_blk_ptr(htx, tailblk); + htx_memcpy(ptr + sz, data.ptr, len); + htx_change_blk_value_len(htx, tailblk, sz+len); + + BUG_ON((int32_t)htx->tail_addr < 0); + BUG_ON((int32_t)htx->head_addr < 0); + BUG_ON(htx->end_addr > htx->tail_addr); + BUG_ON(htx->head_addr > htx->end_addr); + return len; + + add_new_block: + blk = htx_add_blk(htx, HTX_BLK_DATA, len); + if (!blk) + return 0; + + blk->info += len; + htx_memcpy(htx_get_blk_ptr(htx, blk), data.ptr, len); + return len; +} + + +/* Adds an HTX block of type DATA in <htx> just after all other DATA + * blocks. Because it relies on htx_add_data_atonce(), It may be happened to a + * DATA block if possible. But, if the function succeeds, it will be the last + * DATA block in all cases. If an error occurred, NULL is returned. 
Otherwise, + * on success, the updated block (or the new one) is returned. + */ +struct htx_blk *htx_add_last_data(struct htx *htx, struct ist data) +{ + struct htx_blk *blk, *pblk; + + blk = htx_add_data_atonce(htx, data); + if (!blk) + return NULL; + + for (pblk = htx_get_prev_blk(htx, blk); pblk; pblk = htx_get_prev_blk(htx, pblk)) { + if (htx_get_blk_type(pblk) <= HTX_BLK_DATA) + break; + + /* Swap .addr and .info fields */ + blk->addr ^= pblk->addr; pblk->addr ^= blk->addr; blk->addr ^= pblk->addr; + blk->info ^= pblk->info; pblk->info ^= blk->info; blk->info ^= pblk->info; + + if (blk->addr == pblk->addr) + blk->addr += htx_get_blksz(pblk); + blk = pblk; + } + + return blk; +} + +/* Moves the block <blk> just before the block <ref>. Both blocks must be in the + * HTX message <htx> and <blk> must be placed after <ref>. pointer to these + * blocks are updated to remain valid after the move. */ +void htx_move_blk_before(struct htx *htx, struct htx_blk **blk, struct htx_blk **ref) +{ + struct htx_blk *cblk, *pblk; + + cblk = *blk; + for (pblk = htx_get_prev_blk(htx, cblk); pblk; pblk = htx_get_prev_blk(htx, pblk)) { + /* Swap .addr and .info fields */ + cblk->addr ^= pblk->addr; pblk->addr ^= cblk->addr; cblk->addr ^= pblk->addr; + cblk->info ^= pblk->info; pblk->info ^= cblk->info; cblk->info ^= pblk->info; + + if (cblk->addr == pblk->addr) + cblk->addr += htx_get_blksz(pblk); + if (pblk == *ref) + break; + cblk = pblk; + } + *blk = cblk; + *ref = pblk; +} + +/* Append the HTX message <src> to the HTX message <dst>. It returns 1 on + * success and 0 on error. All the message or nothing is copied. If an error + * occurred, all blocks from <src> already appended to <dst> are truncated. + */ +int htx_append_msg(struct htx *dst, const struct htx *src) +{ + struct htx_blk *blk, *newblk; + enum htx_blk_type type; + uint32_t blksz, offset = dst->data; + + for (blk = htx_get_head_blk(src); blk; blk = htx_get_next_blk(src, blk)) { + type = htx_get_blk_type(blk); + + if (type == HTX_BLK_UNUSED) + continue; + + blksz = htx_get_blksz(blk); + newblk = htx_add_blk(dst, type, blksz); + if (!newblk) + goto error; + newblk->info = blk->info; + htx_memcpy(htx_get_blk_ptr(dst, newblk), htx_get_blk_ptr(src, blk), blksz); + } + + return 1; + + error: + htx_truncate(dst, offset); + return 0; +} diff --git a/src/init.c b/src/init.c new file mode 100644 index 0000000..6367ac5 --- /dev/null +++ b/src/init.c @@ -0,0 +1,249 @@ +#include <stdio.h> +#include <stdlib.h> + +#include <haproxy/init.h> +#include <haproxy/list.h> + +/* These functions are called just before a config validity check, which mean + * they are suited to use them in case we need to generate part of the + * configuration. It could be used for example to generate a proxy with + * multiple servers using the configuration parser itself. At this step the + * trash buffers are allocated. + * The functions must return 0 on success, or a combination + * of ERR_* flags (ERR_WARN, ERR_ABORT, ERR_FATAL, ...). The 2 latter cause + * and immediate exit, so the function must have emitted any useful error. + */ +struct list pre_check_list = LIST_HEAD_INIT(pre_check_list); + +/* These functions are called just after the point where the program exits + * after a config validity check, so they are generally suited for resource + * allocation and slow initializations that should be skipped during basic + * config checks. The functions must return 0 on success, or a combination + * of ERR_* flags (ERR_WARN, ERR_ABORT, ERR_FATAL, ...). 
The two latter cause + an immediate exit, so the function must have emitted any useful error. + */ +struct list post_check_list = LIST_HEAD_INIT(post_check_list); + +/* These functions are called for each proxy just after the config validity + * check. The functions must return 0 on success, or a combination of ERR_* + * flags (ERR_WARN, ERR_ABORT, ERR_FATAL, ...). The two latter cause an immediate + * exit, so the function must have emitted any useful error. + */ +struct list post_proxy_check_list = LIST_HEAD_INIT(post_proxy_check_list); + +/* These functions are called for each server just after the config validity + * check. The functions must return 0 on success, or a combination of ERR_* + * flags (ERR_WARN, ERR_ABORT, ERR_FATAL, ...). The two latter cause an immediate + * exit, so the function must have emitted any useful error. + */ +struct list post_server_check_list = LIST_HEAD_INIT(post_server_check_list); + +/* These functions are called for each thread just after the thread creation + * and before running the init functions. They should be used to do per-thread + * (re-)allocations that are needed by subsequent functions. They must return 0 + * if an error occurred. */ +struct list per_thread_alloc_list = LIST_HEAD_INIT(per_thread_alloc_list); + +/* These functions are called for each thread just after the thread creation + * and before running the scheduler. They should be used to do per-thread + * initializations. They must return 0 if an error occurred. */ +struct list per_thread_init_list = LIST_HEAD_INIT(per_thread_init_list); + +/* These functions are called when freeing the global sections at the end of + * deinit, after everything is stopped. They don't return anything. They should + * not release shared resources that are possibly used by other deinit + * functions, only close/release what is private. Use the per_thread_free_list + * to release shared resources. + */ +struct list post_deinit_list = LIST_HEAD_INIT(post_deinit_list); + +/* These functions are called when freeing a proxy during the deinit, after + * everything is stopped. They don't return anything. They should not release + * the proxy itself or any shared resources that are possibly used by other + * deinit functions, only close/release what is private. + */ +struct list proxy_deinit_list = LIST_HEAD_INIT(proxy_deinit_list); + +/* These functions are called when freeing a server during the deinit, after + * everything is stopped. They don't return anything. They should not release + * the server itself or any shared resources that are possibly used by other + * deinit functions, only close/release what is private. + */ +struct list server_deinit_list = LIST_HEAD_INIT(server_deinit_list); + +/* These functions are called when freeing the global sections at the end of + * deinit, after the thread deinit functions, to release unneeded memory + * allocations. They don't return anything, and they work in best effort mode + * as their sole goal is to make valgrind mostly happy. + */ +struct list per_thread_free_list = LIST_HEAD_INIT(per_thread_free_list); + +/* These functions are called for each thread just after the scheduler loop and + * before exiting the thread. They don't return anything and, as for post-deinit + * functions, they work in best effort mode as their sole goal is to make + * valgrind mostly happy. */ +struct list per_thread_deinit_list = LIST_HEAD_INIT(per_thread_deinit_list); + +/* used to register some initialization functions to call before the checks. 
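These are normally wired in via the constructor macros from haproxy/init.h, e.g. REGISTER_PRE_CHECK(), the same family as the REGISTER_POST_DEINIT() used by src/jwt.c below.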
*/ +void hap_register_pre_check(int (*fct)()) +{ + struct pre_check_fct *b; + + b = calloc(1, sizeof(*b)); + if (!b) { + fprintf(stderr, "out of memory\n"); + exit(1); + } + b->fct = fct; + LIST_APPEND(&pre_check_list, &b->list); +} + +/* used to register some initialization functions to call after the checks. */ +void hap_register_post_check(int (*fct)()) +{ + struct post_check_fct *b; + + b = calloc(1, sizeof(*b)); + if (!b) { + fprintf(stderr, "out of memory\n"); + exit(1); + } + b->fct = fct; + LIST_APPEND(&post_check_list, &b->list); +} + +/* used to register some initialization functions to call for each proxy after + * the checks. + */ +void hap_register_post_proxy_check(int (*fct)(struct proxy *)) +{ + struct post_proxy_check_fct *b; + + b = calloc(1, sizeof(*b)); + if (!b) { + fprintf(stderr, "out of memory\n"); + exit(1); + } + b->fct = fct; + LIST_APPEND(&post_proxy_check_list, &b->list); +} + +/* used to register some initialization functions to call for each server after + * the checks. + */ +void hap_register_post_server_check(int (*fct)(struct server *)) +{ + struct post_server_check_fct *b; + + b = calloc(1, sizeof(*b)); + if (!b) { + fprintf(stderr, "out of memory\n"); + exit(1); + } + b->fct = fct; + LIST_APPEND(&post_server_check_list, &b->list); +} + +/* used to register some de-initialization functions to call after everything + * has stopped. + */ +void hap_register_post_deinit(void (*fct)()) +{ + struct post_deinit_fct *b; + + b = calloc(1, sizeof(*b)); + if (!b) { + fprintf(stderr, "out of memory\n"); + exit(1); + } + b->fct = fct; + LIST_APPEND(&post_deinit_list, &b->list); +} + +/* used to register some per proxy de-initialization functions to call after + * everything has stopped. + */ +void hap_register_proxy_deinit(void (*fct)(struct proxy *)) +{ + struct proxy_deinit_fct *b; + + b = calloc(1, sizeof(*b)); + if (!b) { + fprintf(stderr, "out of memory\n"); + exit(1); + } + b->fct = fct; + LIST_APPEND(&proxy_deinit_list, &b->list); +} + +/* used to register some per server de-initialization functions to call after + * everything has stopped. + */ +void hap_register_server_deinit(void (*fct)(struct server *)) +{ + struct server_deinit_fct *b; + + b = calloc(1, sizeof(*b)); + if (!b) { + fprintf(stderr, "out of memory\n"); + exit(1); + } + b->fct = fct; + LIST_APPEND(&server_deinit_list, &b->list); +} + +/* used to register some allocation functions to call for each thread. */ +void hap_register_per_thread_alloc(int (*fct)()) +{ + struct per_thread_alloc_fct *b; + + b = calloc(1, sizeof(*b)); + if (!b) { + fprintf(stderr, "out of memory\n"); + exit(1); + } + b->fct = fct; + LIST_APPEND(&per_thread_alloc_list, &b->list); +} + +/* used to register some initialization functions to call for each thread. */ +void hap_register_per_thread_init(int (*fct)()) +{ + struct per_thread_init_fct *b; + + b = calloc(1, sizeof(*b)); + if (!b) { + fprintf(stderr, "out of memory\n"); + exit(1); + } + b->fct = fct; + LIST_APPEND(&per_thread_init_list, &b->list); +} + +/* used to register some de-initialization functions to call for each thread. */ +void hap_register_per_thread_deinit(void (*fct)()) +{ + struct per_thread_deinit_fct *b; + + b = calloc(1, sizeof(*b)); + if (!b) { + fprintf(stderr, "out of memory\n"); + exit(1); + } + b->fct = fct; + LIST_APPEND(&per_thread_deinit_list, &b->list); +} + +/* used to register some free functions to call for each thread. 
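For example (my_thread_free is a hypothetical name):

      static void my_thread_free(void)
      {
              // release this thread's private scratch buffers
      }

      hap_register_per_thread_free(my_thread_free);

or, presumably equivalently, through the REGISTER_PER_THREAD_FREE() initcall macro.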
*/ +void hap_register_per_thread_free(void (*fct)()) +{ + struct per_thread_free_fct *b; + + b = calloc(1, sizeof(*b)); + if (!b) { + fprintf(stderr, "out of memory\n"); + exit(1); + } + b->fct = fct; + LIST_APPEND(&per_thread_free_list, &b->list); +} diff --git a/src/jwt.c b/src/jwt.c new file mode 100644 index 0000000..6c4cbd3 --- /dev/null +++ b/src/jwt.c @@ -0,0 +1,478 @@ +/* + * JSON Web Token (JWT) processing + * + * Copyright 2021 HAProxy Technologies + * Remi Tricot-Le Breton <rlebreton@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <import/ebmbtree.h> +#include <import/ebsttree.h> + +#include <haproxy/api.h> +#include <haproxy/tools.h> +#include <haproxy/openssl-compat.h> +#include <haproxy/base64.h> +#include <haproxy/jwt.h> +#include <haproxy/buf.h> + + +#ifdef USE_OPENSSL +/* Tree into which the public certificates used to validate JWTs will be stored. */ +static struct eb_root jwt_cert_tree = EB_ROOT_UNIQUE; + +/* + * The possible algorithm strings that can be found in a JWS's JOSE header are + * defined in section 3.1 of RFC7518. + */ +enum jwt_alg jwt_parse_alg(const char *alg_str, unsigned int alg_len) +{ + enum jwt_alg alg = JWT_ALG_DEFAULT; + + /* Algorithms are all 5 characters long apart from "none". */ + if (alg_len < sizeof("HS256")-1) { + if (alg_len == sizeof("none")-1 && strcmp("none", alg_str) == 0) + alg = JWS_ALG_NONE; + return alg; + } + + if (alg == JWT_ALG_DEFAULT) { + switch(*alg_str++) { + case 'H': + if (strncmp(alg_str, "S256", alg_len-1) == 0) + alg = JWS_ALG_HS256; + else if (strncmp(alg_str, "S384", alg_len-1) == 0) + alg = JWS_ALG_HS384; + else if (strncmp(alg_str, "S512", alg_len-1) == 0) + alg = JWS_ALG_HS512; + break; + case 'R': + if (strncmp(alg_str, "S256", alg_len-1) == 0) + alg = JWS_ALG_RS256; + else if (strncmp(alg_str, "S384", alg_len-1) == 0) + alg = JWS_ALG_RS384; + else if (strncmp(alg_str, "S512", alg_len-1) == 0) + alg = JWS_ALG_RS512; + break; + case 'E': + if (strncmp(alg_str, "S256", alg_len-1) == 0) + alg = JWS_ALG_ES256; + else if (strncmp(alg_str, "S384", alg_len-1) == 0) + alg = JWS_ALG_ES384; + else if (strncmp(alg_str, "S512", alg_len-1) == 0) + alg = JWS_ALG_ES512; + break; + case 'P': + if (strncmp(alg_str, "S256", alg_len-1) == 0) + alg = JWS_ALG_PS256; + else if (strncmp(alg_str, "S384", alg_len-1) == 0) + alg = JWS_ALG_PS384; + else if (strncmp(alg_str, "S512", alg_len-1) == 0) + alg = JWS_ALG_PS512; + break; + default: + break; + } + } + + return alg; +} + +/* + * Split a JWT into its separate dot-separated parts. + * Since only JWS following the Compact Serialization format are managed for + * now, we don't need to manage more than three subparts in the tokens. + * See section 3.1 of RFC7515 for more information about JWS Compact + * Serialization. + * Returns 0 in case of success. 
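Typical use, as in jwt_verify() below:

      struct jwt_item items[JWT_ELT_MAX] = { { 0 } };
      unsigned int num = JWT_ELT_MAX;

      if (jwt_tokenize(token, items, &num) == 0 && num == JWT_ELT_MAX) {
              // items[JWT_ELT_JOSE], items[JWT_ELT_CLAIMS] and items[JWT_ELT_SIG]
              // now delimit the three dot-separated parts of the token
      }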
+ */ +int jwt_tokenize(const struct buffer *jwt, struct jwt_item *items, unsigned int *item_num) +{ + char *ptr = jwt->area; + char *jwt_end = jwt->area + jwt->data; + unsigned int index = 0; + unsigned int length = 0; + + if (index < *item_num) { + items[index].start = ptr; + items[index].length = 0; + } + + while (index < *item_num && ptr < jwt_end) { + if (*ptr++ == '.') { + items[index++].length = length; + + if (index == *item_num) + return -1; + items[index].start = ptr; + items[index].length = 0; + length = 0; + } else + ++length; + } + + if (index < *item_num) + items[index].length = length; + + *item_num = (index+1); + + return (ptr != jwt_end); +} + +/* + * Parse a public certificate and insert it into the jwt_cert_tree. + * Returns 0 in case of success. + */ +int jwt_tree_load_cert(char *path, int pathlen, char **err) +{ + int retval = -1; + struct jwt_cert_tree_entry *entry = NULL; + EVP_PKEY *pkey = NULL; + BIO *bio = NULL; + + entry = calloc(1, sizeof(*entry) + pathlen + 1); + if (!entry) { + memprintf(err, "%sunable to allocate memory (jwt_cert_tree_entry).\n", err && *err ? *err : ""); + return -1; + } + memcpy(entry->path, path, pathlen + 1); + + if (ebst_insert(&jwt_cert_tree, &entry->node) != &entry->node) { + free(entry); + return 0; /* Entry already in the tree */ + } + + bio = BIO_new(BIO_s_file()); + if (!bio) { + memprintf(err, "%sunable to allocate memory (BIO).\n", err && *err ? *err : ""); + goto end; + } + + if (BIO_read_filename(bio, path) == 1) { + + pkey = PEM_read_bio_PUBKEY(bio, NULL, NULL, NULL); + + if (!pkey) { + memprintf(err, "%sfile not found (%s)\n", err && *err ? *err : "", path); + goto end; + } + + entry->pkey = pkey; + retval = 0; + } + +end: + if (retval) { + /* Some error happened during pkey parsing, remove the already + * inserted node from the tree and free it. + */ + ebmb_delete(&entry->node); + free(entry); + } + BIO_free(bio); + return retval; +} + +/* + * Calculate the HMAC signature of a specific JWT and check that it matches the + * one included in the token. + * Returns 1 in case of success. + */ +static enum jwt_vrfy_status +jwt_jwsverify_hmac(const struct jwt_ctx *ctx, const struct buffer *decoded_signature) +{ + const EVP_MD *evp = NULL; + unsigned char signature[EVP_MAX_MD_SIZE]; + unsigned int signature_length = 0; + unsigned char *hmac_res = NULL; + enum jwt_vrfy_status retval = JWT_VRFY_KO; + + switch(ctx->alg) { + case JWS_ALG_HS256: + evp = EVP_sha256(); + break; + case JWS_ALG_HS384: + evp = EVP_sha384(); + break; + case JWS_ALG_HS512: + evp = EVP_sha512(); + break; + default: break; + } + + hmac_res = HMAC(evp, ctx->key, ctx->key_length, (const unsigned char*)ctx->jose.start, + ctx->jose.length + ctx->claims.length + 1, signature, &signature_length); + + if (hmac_res && signature_length == decoded_signature->data && + (CRYPTO_memcmp(decoded_signature->area, signature, signature_length) == 0)) + retval = JWT_VRFY_OK; + + return retval; +} + +/* + * Convert a JWT ECDSA signature (R and S parameters concatenatedi, see section + * 3.4 of RFC7518) into an ECDSA_SIG that can be fed back into OpenSSL's digest + * verification functions. + * Returns 0 in case of success. 
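For ES256, for instance, the input is 64 bytes, two 32-byte big-endian integers R then S, and the DER-encoded ECDSA_SIG is written back over <signature> in place.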
+ */ +static int convert_ecdsa_sig(const struct jwt_ctx *ctx, EVP_PKEY *pkey, struct buffer *signature) +{ + int retval = 0; + ECDSA_SIG *ecdsa_sig = NULL; + BIGNUM *ec_R = NULL, *ec_S = NULL; + unsigned int bignum_len; + unsigned char *p; + + ecdsa_sig = ECDSA_SIG_new(); + if (!ecdsa_sig) { + retval = JWT_VRFY_OUT_OF_MEMORY; + goto end; + } + + if (b_data(signature) % 2) { + retval = JWT_VRFY_INVALID_TOKEN; + goto end; + } + + bignum_len = b_data(signature) / 2; + + ec_R = BN_bin2bn((unsigned char*)b_orig(signature), bignum_len, NULL); + ec_S = BN_bin2bn((unsigned char *)(b_orig(signature) + bignum_len), bignum_len, NULL); + + if (!ec_R || !ec_S) { + retval = JWT_VRFY_INVALID_TOKEN; + goto end; + } + + /* Build ecdsa out of R and S values. */ + ECDSA_SIG_set0(ecdsa_sig, ec_R, ec_S); + + p = (unsigned char*)signature->area; + + signature->data = i2d_ECDSA_SIG(ecdsa_sig, &p); + if (signature->data == 0) { + retval = JWT_VRFY_INVALID_TOKEN; + goto end; + } + +end: + ECDSA_SIG_free(ecdsa_sig); + return retval; +} + +/* + * Check that the signature included in a JWT signed via RSA or ECDSA is valid + * and can be verified thanks to a given public certificate. + * Returns 1 in case of success. + */ +static enum jwt_vrfy_status +jwt_jwsverify_rsa_ecdsa(const struct jwt_ctx *ctx, struct buffer *decoded_signature) +{ + const EVP_MD *evp = NULL; + EVP_MD_CTX *evp_md_ctx; + EVP_PKEY_CTX *pkey_ctx = NULL; + enum jwt_vrfy_status retval = JWT_VRFY_KO; + struct ebmb_node *eb; + struct jwt_cert_tree_entry *entry = NULL; + int is_ecdsa = 0; + int padding = RSA_PKCS1_PADDING; + + switch(ctx->alg) { + case JWS_ALG_RS256: + evp = EVP_sha256(); + break; + case JWS_ALG_RS384: + evp = EVP_sha384(); + break; + case JWS_ALG_RS512: + evp = EVP_sha512(); + break; + + case JWS_ALG_ES256: + evp = EVP_sha256(); + is_ecdsa = 1; + break; + case JWS_ALG_ES384: + evp = EVP_sha384(); + is_ecdsa = 1; + break; + case JWS_ALG_ES512: + evp = EVP_sha512(); + is_ecdsa = 1; + break; + + case JWS_ALG_PS256: + evp = EVP_sha256(); + padding = RSA_PKCS1_PSS_PADDING; + break; + case JWS_ALG_PS384: + evp = EVP_sha384(); + padding = RSA_PKCS1_PSS_PADDING; + break; + case JWS_ALG_PS512: + evp = EVP_sha512(); + padding = RSA_PKCS1_PSS_PADDING; + break; + default: break; + } + + evp_md_ctx = EVP_MD_CTX_new(); + if (!evp_md_ctx) + return JWT_VRFY_OUT_OF_MEMORY; + + eb = ebst_lookup(&jwt_cert_tree, ctx->key); + + if (!eb) { + retval = JWT_VRFY_UNKNOWN_CERT; + goto end; + } + + entry = ebmb_entry(eb, struct jwt_cert_tree_entry, node); + + if (!entry->pkey) { + retval = JWT_VRFY_UNKNOWN_CERT; + goto end; + } + + /* + * ECXXX signatures are a direct concatenation of the (R, S) pair and + * need to be converted back to asn.1 in order for verify operations to + * work with OpenSSL. 
+ */ + if (is_ecdsa) { + int conv_retval = convert_ecdsa_sig(ctx, entry->pkey, decoded_signature); + if (conv_retval != 0) { + retval = conv_retval; + goto end; + } + } + + if (EVP_DigestVerifyInit(evp_md_ctx, &pkey_ctx, evp, NULL, entry->pkey) == 1) { + if (is_ecdsa || EVP_PKEY_CTX_set_rsa_padding(pkey_ctx, padding) > 0) { + if (EVP_DigestVerifyUpdate(evp_md_ctx, (const unsigned char*)ctx->jose.start, + ctx->jose.length + ctx->claims.length + 1) == 1 && + EVP_DigestVerifyFinal(evp_md_ctx, (const unsigned char*)decoded_signature->area, decoded_signature->data) == 1) { + retval = JWT_VRFY_OK; + } + } + } + +end: + EVP_MD_CTX_free(evp_md_ctx); + return retval; +} + +/* + * Check that the <token> that was signed via algorithm <alg> using the <key> + * (either an HMAC secret or the path to a public certificate) has a valid + * signature. + * Returns 1 in case of success. + */ +enum jwt_vrfy_status jwt_verify(const struct buffer *token, const struct buffer *alg, + const struct buffer *key) +{ + struct jwt_item items[JWT_ELT_MAX] = { { 0 } }; + unsigned int item_num = JWT_ELT_MAX; + struct buffer *decoded_sig = NULL; + struct jwt_ctx ctx = {}; + enum jwt_vrfy_status retval = JWT_VRFY_KO; + int ret; + + ctx.alg = jwt_parse_alg(alg->area, alg->data); + + if (ctx.alg == JWT_ALG_DEFAULT) + return JWT_VRFY_UNKNOWN_ALG; + + if (jwt_tokenize(token, items, &item_num)) + return JWT_VRFY_INVALID_TOKEN; + + if (item_num != JWT_ELT_MAX) + if (ctx.alg != JWS_ALG_NONE || item_num != JWT_ELT_SIG) + return JWT_VRFY_INVALID_TOKEN; + + ctx.jose = items[JWT_ELT_JOSE]; + ctx.claims = items[JWT_ELT_CLAIMS]; + ctx.signature = items[JWT_ELT_SIG]; + + /* "alg" is "none", the signature must be empty for the JWS to be valid. */ + if (ctx.alg == JWS_ALG_NONE) { + return (ctx.signature.length == 0) ? JWT_VRFY_OK : JWT_VRFY_KO; + } + + if (ctx.signature.length == 0) + return JWT_VRFY_INVALID_TOKEN; + + decoded_sig = alloc_trash_chunk(); + if (!decoded_sig) + return JWT_VRFY_OUT_OF_MEMORY; + + ret = base64urldec(ctx.signature.start, ctx.signature.length, + decoded_sig->area, decoded_sig->size); + if (ret == -1) { + retval = JWT_VRFY_INVALID_TOKEN; + goto end; + } + + decoded_sig->data = ret; + ctx.key = key->area; + ctx.key_length = key->data; + + /* We have all three sections, signature calculation can begin. 
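For every algorithm below, the signed input is the ASCII string "<jose>.<claims>", i.e. jose.length + claims.length + 1 bytes starting at jose.start.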
*/ + + switch(ctx.alg) { + + case JWS_ALG_HS256: + case JWS_ALG_HS384: + case JWS_ALG_HS512: + /* HMAC + SHA-XXX */ + retval = jwt_jwsverify_hmac(&ctx, decoded_sig); + break; + case JWS_ALG_RS256: + case JWS_ALG_RS384: + case JWS_ALG_RS512: + case JWS_ALG_ES256: + case JWS_ALG_ES384: + case JWS_ALG_ES512: + case JWS_ALG_PS256: + case JWS_ALG_PS384: + case JWS_ALG_PS512: + /* RSASSA-PKCS1-v1_5 + SHA-XXX */ + /* ECDSA using P-XXX and SHA-XXX */ + /* RSASSA-PSS using SHA-XXX and MGF1 with SHA-XXX */ + retval = jwt_jwsverify_rsa_ecdsa(&ctx, decoded_sig); + break; + default: + /* Not managed yet */ + retval = JWT_VRFY_UNMANAGED_ALG; + break; + } + +end: + free_trash_chunk(decoded_sig); + + return retval; +} + +static void jwt_deinit(void) +{ + struct ebmb_node *node = NULL; + struct jwt_cert_tree_entry *entry = NULL; + + node = ebmb_first(&jwt_cert_tree); + while (node) { + entry = ebmb_entry(node, struct jwt_cert_tree_entry, node); + ebmb_delete(node); + EVP_PKEY_free(entry->pkey); + ha_free(&entry); + node = ebmb_first(&jwt_cert_tree); + } +} +REGISTER_POST_DEINIT(jwt_deinit); + + +#endif /* USE_OPENSSL */ diff --git a/src/lb_chash.c b/src/lb_chash.c new file mode 100644 index 0000000..4e8fb15 --- /dev/null +++ b/src/lb_chash.c @@ -0,0 +1,517 @@ +/* + * Consistent Hash implementation + * Please consult this very well detailed article for more information : + * http://www.spiteful.com/2008/03/17/programmers-toolbox-part-3-consistent-hashing/ + * + * Our implementation has to support both weighted hashing and weighted round + * robin because we'll use it to replace the previous map-based implementation + * which offered both algorithms. + * + * Copyright 2000-2010 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <import/eb32tree.h> +#include <haproxy/api.h> +#include <haproxy/backend.h> +#include <haproxy/errors.h> +#include <haproxy/queue.h> +#include <haproxy/server-t.h> +#include <haproxy/tools.h> + +/* Return next tree node after <node> which must still be in the tree, or be + * NULL. Lookup wraps around the end to the beginning. If the next node is the + * same node, return NULL. This is designed to find a valid next node before + * deleting one from the tree. + */ +static inline struct eb32_node *chash_skip_node(struct eb_root *root, struct eb32_node *node) +{ + struct eb32_node *stop = node; + + if (!node) + return NULL; + node = eb32_next(node); + if (!node) + node = eb32_first(root); + if (node == stop) + return NULL; + return node; +} + +/* Remove all of a server's entries from its tree. This may be used when + * setting a server down. + */ +static inline void chash_dequeue_srv(struct server *s) +{ + while (s->lb_nodes_now > 0) { + if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway + s->lb_nodes_now = s->lb_nodes_tot; + s->lb_nodes_now--; + if (s->proxy->lbprm.chash.last == &s->lb_nodes[s->lb_nodes_now].node) + s->proxy->lbprm.chash.last = chash_skip_node(s->lb_tree, s->proxy->lbprm.chash.last); + eb32_delete(&s->lb_nodes[s->lb_nodes_now].node); + } +} + +/* Adjust the number of entries of a server in its tree. The server must appear + * as many times as its weight indicates it. If it's there too often, we remove + * the last occurrences. If it's not there enough, we add more occurrences. 
To + * remove a server from the tree, normally call this with eweight=0. + * + * The server's lock and the lbprm's lock must be held. + */ +static inline void chash_queue_dequeue_srv(struct server *s) +{ + while (s->lb_nodes_now > s->next_eweight) { + if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway + s->lb_nodes_now = s->lb_nodes_tot; + s->lb_nodes_now--; + if (s->proxy->lbprm.chash.last == &s->lb_nodes[s->lb_nodes_now].node) + s->proxy->lbprm.chash.last = chash_skip_node(s->lb_tree, s->proxy->lbprm.chash.last); + eb32_delete(&s->lb_nodes[s->lb_nodes_now].node); + } + + /* Attempt to increase the total number of nodes, if the user + * increased the weight beyond the original weight + */ + if (s->lb_nodes_tot < s->next_eweight) { + struct tree_occ *new_nodes; + + /* First we need to remove all server's entries from its tree + * because the realloc will change all nodes pointers */ + chash_dequeue_srv(s); + + new_nodes = realloc(s->lb_nodes, s->next_eweight * sizeof(*new_nodes)); + if (new_nodes) { + unsigned int j; + + s->lb_nodes = new_nodes; + memset(&s->lb_nodes[s->lb_nodes_tot], 0, + (s->next_eweight - s->lb_nodes_tot) * sizeof(*s->lb_nodes)); + for (j = s->lb_nodes_tot; j < s->next_eweight; j++) { + s->lb_nodes[j].server = s; + s->lb_nodes[j].node.key = full_hash(s->puid * SRV_EWGHT_RANGE + j); + } + s->lb_nodes_tot = s->next_eweight; + } + } + while (s->lb_nodes_now < s->next_eweight) { + if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway + break; + if (s->proxy->lbprm.chash.last == &s->lb_nodes[s->lb_nodes_now].node) + s->proxy->lbprm.chash.last = chash_skip_node(s->lb_tree, s->proxy->lbprm.chash.last); + eb32_insert(s->lb_tree, &s->lb_nodes[s->lb_nodes_now].node); + s->lb_nodes_now++; + } +} + +/* This function updates the server trees according to server <srv>'s new + * state. It should be called when server <srv>'s status changes to down. + * It is not important whether the server was already down or not. It is not + * important either that the new state is completely down (the caller may not + * know all the variables of a server's state). + * + * The server's lock must be held. The lbprm lock will be used. + */ +static void chash_set_server_status_down(struct server *srv) +{ + struct proxy *p = srv->proxy; + + if (!srv_lb_status_changed(srv)) + return; + + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + + if (srv_willbe_usable(srv)) + goto out_update_state; + + if (!srv_currently_usable(srv)) + /* server was already down */ + goto out_update_backend; + + if (srv->flags & SRV_F_BACKUP) { + p->lbprm.tot_wbck -= srv->cur_eweight; + p->srv_bck--; + + if (srv == p->lbprm.fbck) { + /* we lost the first backup server in a single-backup + * configuration, we must search another one. + */ + struct server *srv2 = p->lbprm.fbck; + do { + srv2 = srv2->next; + } while (srv2 && + !((srv2->flags & SRV_F_BACKUP) && + srv_willbe_usable(srv2))); + p->lbprm.fbck = srv2; + } + } else { + p->lbprm.tot_wact -= srv->cur_eweight; + p->srv_act--; + } + + chash_dequeue_srv(srv); + +out_update_backend: + /* check/update tot_used, tot_weight */ + update_backend_weight(p); + out_update_state: + srv_lb_commit_status(srv); + + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); +} + +/* This function updates the server trees according to server <srv>'s new + * state. It should be called when server <srv>'s status changes to up. + * It is not important whether the server was already down or not. 
It is not + * important either that the new state is completely UP (the caller may not + * know all the variables of a server's state). This function will not change + * the weight of a server which was already up. + * + * The server's lock must be held. The lbprm lock will be used. + */ +static void chash_set_server_status_up(struct server *srv) +{ + struct proxy *p = srv->proxy; + + if (!srv_lb_status_changed(srv)) + return; + + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + + if (!srv_willbe_usable(srv)) + goto out_update_state; + + if (srv_currently_usable(srv)) + /* server was already up */ + goto out_update_backend; + + if (srv->flags & SRV_F_BACKUP) { + p->lbprm.tot_wbck += srv->next_eweight; + p->srv_bck++; + + if (!(p->options & PR_O_USE_ALL_BK)) { + if (!p->lbprm.fbck) { + /* there was no backup server anymore */ + p->lbprm.fbck = srv; + } else { + /* we may have restored a backup server prior to fbck, + * in which case it should replace it. + */ + struct server *srv2 = srv; + do { + srv2 = srv2->next; + } while (srv2 && (srv2 != p->lbprm.fbck)); + if (srv2) + p->lbprm.fbck = srv; + } + } + } else { + p->lbprm.tot_wact += srv->next_eweight; + p->srv_act++; + } + + /* note that eweight cannot be 0 here */ + chash_queue_dequeue_srv(srv); + + out_update_backend: + /* check/update tot_used, tot_weight */ + update_backend_weight(p); + out_update_state: + srv_lb_commit_status(srv); + + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); +} + +/* This function must be called after an update to server <srv>'s effective + * weight. It may be called after a state change too. + * + * The server's lock must be held. The lbprm lock may be used. + */ +static void chash_update_server_weight(struct server *srv) +{ + int old_state, new_state; + struct proxy *p = srv->proxy; + + if (!srv_lb_status_changed(srv)) + return; + + /* If changing the server's weight changes its state, we simply apply + * the procedures we already have for status change. If the state + * remains down, the server is not in any tree, so it's as easy as + * updating its values. If the state remains up with different weights, + * there are some computations to perform to find a new place and + * possibly a new tree for this server. + */ + + old_state = srv_currently_usable(srv); + new_state = srv_willbe_usable(srv); + + if (!old_state && !new_state) { + srv_lb_commit_status(srv); + return; + } + else if (!old_state && new_state) { + chash_set_server_status_up(srv); + return; + } + else if (old_state && !new_state) { + chash_set_server_status_down(srv); + return; + } + + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + + /* only adjust the server's presence in the tree */ + chash_queue_dequeue_srv(srv); + + if (srv->flags & SRV_F_BACKUP) + p->lbprm.tot_wbck += srv->next_eweight - srv->cur_eweight; + else + p->lbprm.tot_wact += srv->next_eweight - srv->cur_eweight; + + update_backend_weight(p); + srv_lb_commit_status(srv); + + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); +} + +/* + * This function implements the "Consistent Hashing with Bounded Loads" algorithm + * of Mirrokni, Thorup, and Zadimoghaddam (arxiv:1608.01350), adapted for use with + * unequal server weights. + */ +int chash_server_is_eligible(struct server *s) +{ + /* The total number of slots to allocate is the total number of outstanding requests + * (including the one we're about to make) times the load-balance-factor, rounded up. 
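+ * For example, with hypothetical numbers: 9 requests currently served, a + * hash-balance-factor of 150 and a tot_weight of 3 give + * tot_slots = (10 * 150 + 99) / 100 = 15, i.e. 5 whole slots per weight unit + * and no remainder left to spread below.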
+ */ + unsigned tot_slots = ((s->proxy->served + 1) * s->proxy->lbprm.hash_balance_factor + 99) / 100; + unsigned slots_per_weight = tot_slots / s->proxy->lbprm.tot_weight; + unsigned remainder = tot_slots % s->proxy->lbprm.tot_weight; + + /* Allocate a whole number of slots per weight unit... */ + unsigned slots = s->cur_eweight * slots_per_weight; + + /* And then distribute the rest among servers proportionally to their weight. */ + slots += ((s->cumulative_weight + s->cur_eweight) * remainder) / s->proxy->lbprm.tot_weight + - (s->cumulative_weight * remainder) / s->proxy->lbprm.tot_weight; + + /* But never leave a server with 0. */ + if (slots == 0) + slots = 1; + + return s->served < slots; +} + +/* + * This function returns the running server from the CHASH tree, which is at + * the closest distance from the value of <hash>. Doing so ensures that even + * with a well imbalanced hash, if some servers are close to each other, they + * will still both receive traffic. If any server is found, it will be returned. + * It will also skip server <avoid> if the hash result ends on this one. + * If no valid server is found, NULL is returned. + * + * The lbprm's lock will be used in R/O mode. The server's lock is not used. + */ +struct server *chash_get_server_hash(struct proxy *p, unsigned int hash, const struct server *avoid) +{ + struct eb32_node *next, *prev; + struct server *nsrv, *psrv; + struct eb_root *root; + unsigned int dn, dp; + int loop; + + HA_RWLOCK_RDLOCK(LBPRM_LOCK, &p->lbprm.lock); + + if (p->srv_act) + root = &p->lbprm.chash.act; + else if (p->lbprm.fbck) { + nsrv = p->lbprm.fbck; + goto out; + } + else if (p->srv_bck) + root = &p->lbprm.chash.bck; + else { + nsrv = NULL; + goto out; + } + + /* find the node after and the node before */ + next = eb32_lookup_ge(root, hash); + if (!next) + next = eb32_first(root); + if (!next) { + nsrv = NULL; /* tree is empty */ + goto out; + } + + prev = eb32_prev(next); + if (!prev) + prev = eb32_last(root); + + nsrv = eb32_entry(next, struct tree_occ, node)->server; + psrv = eb32_entry(prev, struct tree_occ, node)->server; + + /* OK we're located between two servers, let's + * compare distances between hash and the two servers + * and select the closest server. + */ + dp = hash - prev->key; + dn = next->key - hash; + + if (dp <= dn) { + next = prev; + nsrv = psrv; + } + + loop = 0; + while (nsrv == avoid || (p->lbprm.hash_balance_factor && !chash_server_is_eligible(nsrv))) { + next = eb32_next(next); + if (!next) { + next = eb32_first(root); + if (++loop > 1) // protection against accidental loop + break; + } + nsrv = eb32_entry(next, struct tree_occ, node)->server; + } + + out: + HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &p->lbprm.lock); + return nsrv; +} + +/* Return next server from the CHASH tree in backend <p>. If the tree is empty, + * return NULL. Saturated servers are skipped. + * + * The lbprm's lock will be used in R/W mode. The server's lock is not used. 
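+ * + * A configuration exercising this selector could look like this (a sketch, + * not taken from any shipped example): + * + * backend app + * balance uri + * hash-type consistent + * hash-balance-factor 150 + * server s1 192.0.2.1:80 weight 10 + * server s2 192.0.2.2:80 weight 20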
+ */ +struct server *chash_get_next_server(struct proxy *p, struct server *srvtoavoid) +{ + struct server *srv, *avoided; + struct eb32_node *node, *stop, *avoided_node; + struct eb_root *root; + + srv = avoided = NULL; + avoided_node = NULL; + + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + if (p->srv_act) + root = &p->lbprm.chash.act; + else if (p->lbprm.fbck) { + srv = p->lbprm.fbck; + goto out; + } + else if (p->srv_bck) + root = &p->lbprm.chash.bck; + else { + srv = NULL; + goto out; + } + + stop = node = p->lbprm.chash.last; + do { + struct server *s; + + if (node) + node = eb32_next(node); + if (!node) + node = eb32_first(root); + + p->lbprm.chash.last = node; + if (!node) { + /* no node is available */ + srv = NULL; + goto out; + } + + /* Note: if we came here after a down/up cycle with no last + * pointer, and after a redispatch (srvtoavoid is set), we + * must set stop to non-null otherwise we can loop forever. + */ + if (!stop) + stop = node; + + /* OK, we have a server. However, it may be saturated, in which + * case we don't want to reconsider it for now, so we'll simply + * skip it. Same if it's the server we try to avoid, in which + * case we simply remember it for later use if needed. + */ + s = eb32_entry(node, struct tree_occ, node)->server; + if (!s->maxconn || (!s->queue.length && s->served < srv_dynamic_maxconn(s))) { + if (s != srvtoavoid) { + srv = s; + break; + } + avoided = s; + avoided_node = node; + } + } while (node != stop); + + if (!srv) { + srv = avoided; + p->lbprm.chash.last = avoided_node; + } + + out: + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); + return srv; +} + +/* This function is responsible for building the active and backup trees for + * consistent hashing. The servers receive an array of initialized nodes + * with their assigned keys. It also sets p->lbprm.wdiv to the eweight to + * uweight ratio. + * Return 0 in case of success, -1 in case of allocation failure. + */ +int chash_init_server_tree(struct proxy *p) +{ + struct server *srv; + struct eb_root init_head = EB_ROOT; + int node; + + p->lbprm.set_server_status_up = chash_set_server_status_up; + p->lbprm.set_server_status_down = chash_set_server_status_down; + p->lbprm.update_server_eweight = chash_update_server_weight; + p->lbprm.server_take_conn = NULL; + p->lbprm.server_drop_conn = NULL; + + p->lbprm.wdiv = BE_WEIGHT_SCALE; + for (srv = p->srv; srv; srv = srv->next) { + srv->next_eweight = (srv->uweight * p->lbprm.wdiv + p->lbprm.wmult - 1) / p->lbprm.wmult; + srv_lb_commit_status(srv); + } + + recount_servers(p); + update_backend_weight(p); + + p->lbprm.chash.act = init_head; + p->lbprm.chash.bck = init_head; + p->lbprm.chash.last = NULL; + + /* queue active and backup servers in two distinct groups */ + for (srv = p->srv; srv; srv = srv->next) { + srv->lb_tree = (srv->flags & SRV_F_BACKUP) ? 
&p->lbprm.chash.bck : &p->lbprm.chash.act; + srv->lb_nodes_tot = srv->uweight * BE_WEIGHT_SCALE; + srv->lb_nodes_now = 0; + srv->lb_nodes = calloc(srv->lb_nodes_tot, + sizeof(*srv->lb_nodes)); + if (!srv->lb_nodes) { + ha_alert("failed to allocate lb_nodes for server %s.\n", srv->id); + return -1; + } + for (node = 0; node < srv->lb_nodes_tot; node++) { + srv->lb_nodes[node].server = srv; + srv->lb_nodes[node].node.key = full_hash(srv->puid * SRV_EWGHT_RANGE + node); + } + + if (srv_currently_usable(srv)) + chash_queue_dequeue_srv(srv); + } + return 0; +} diff --git a/src/lb_fas.c b/src/lb_fas.c new file mode 100644 index 0000000..d90388b --- /dev/null +++ b/src/lb_fas.c @@ -0,0 +1,348 @@ +/* + * First Available Server load balancing algorithm. + * + * This file implements an algorithm which emerged during a discussion with + * Steen Larsen, initially inspired by Anshul Gandhi et al.'s work, now + * described as "packing" in section 3.5: + * + * http://reports-archive.adm.cs.cmu.edu/anon/2012/CMU-CS-12-109.pdf + * + * Copyright 2000-2012 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <import/eb32tree.h> +#include <haproxy/api.h> +#include <haproxy/backend.h> +#include <haproxy/queue.h> +#include <haproxy/server-t.h> + + +/* Remove a server from a tree. It must have previously been dequeued. This + * function is meant to be called when a server is going down or has its + * weight disabled. + * + * The server's lock and the lbprm's lock must be held. + */ +static inline void fas_remove_from_tree(struct server *s) +{ + s->lb_tree = NULL; +} + +/* simply removes a server from a tree. + * + * The lbprm's lock must be held. + */ +static inline void fas_dequeue_srv(struct server *s) +{ + eb32_delete(&s->lb_node); +} + +/* Queue a server in its associated tree, assuming the weight is >0. + * Servers are sorted by unique ID so that we send all connections to the first + * available server in declaration order (or ID order) until its maxconn is + * reached. It is important to understand that the server weight is not used + * here. + * + * The lbprm's lock must be held. + */ +static inline void fas_queue_srv(struct server *s) +{ + s->lb_node.key = s->puid; + eb32_insert(s->lb_tree, &s->lb_node); +} + +/* Re-position the server in the FS tree after it has been assigned one + * connection or after it has released one. Note that it is possible that + * the server has been moved out of the tree due to failed health-checks. + * The lbprm's lock will be used. + */ +static void fas_srv_reposition(struct server *s) +{ + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock); + if (s->lb_tree) { + fas_dequeue_srv(s); + fas_queue_srv(s); + } + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock); +} + +/* This function updates the server trees according to server <srv>'s new + * state. It should be called when server <srv>'s status changes to down. + * It is not important whether the server was already down or not. It is not + * important either that the new state is completely down (the caller may not + * know all the variables of a server's state). + * + * The server's lock must be held. The lbprm's lock will be used.
+ */ +static void fas_set_server_status_down(struct server *srv) +{ + struct proxy *p = srv->proxy; + + if (!srv_lb_status_changed(srv)) + return; + + if (srv_willbe_usable(srv)) + goto out_update_state; + + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + + if (!srv_currently_usable(srv)) + /* server was already down */ + goto out_update_backend; + + if (srv->flags & SRV_F_BACKUP) { + p->lbprm.tot_wbck -= srv->cur_eweight; + p->srv_bck--; + + if (srv == p->lbprm.fbck) { + /* we lost the first backup server in a single-backup + * configuration, we must search another one. + */ + struct server *srv2 = p->lbprm.fbck; + do { + srv2 = srv2->next; + } while (srv2 && + !((srv2->flags & SRV_F_BACKUP) && + srv_willbe_usable(srv2))); + p->lbprm.fbck = srv2; + } + } else { + p->lbprm.tot_wact -= srv->cur_eweight; + p->srv_act--; + } + + fas_dequeue_srv(srv); + fas_remove_from_tree(srv); + + out_update_backend: + /* check/update tot_used, tot_weight */ + update_backend_weight(p); + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); + + out_update_state: + srv_lb_commit_status(srv); +} + +/* This function updates the server trees according to server <srv>'s new + * state. It should be called when server <srv>'s status changes to up. + * It is not important whether the server was already down or not. It is not + * important either that the new state is completely UP (the caller may not + * know all the variables of a server's state). This function will not change + * the weight of a server which was already up. + * + * The server's lock must be held. The lbprm's lock will be used. + */ +static void fas_set_server_status_up(struct server *srv) +{ + struct proxy *p = srv->proxy; + + if (!srv_lb_status_changed(srv)) + return; + + if (!srv_willbe_usable(srv)) + goto out_update_state; + + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + + if (srv_currently_usable(srv)) + /* server was already up */ + goto out_update_backend; + + if (srv->flags & SRV_F_BACKUP) { + srv->lb_tree = &p->lbprm.fas.bck; + p->lbprm.tot_wbck += srv->next_eweight; + p->srv_bck++; + + if (!(p->options & PR_O_USE_ALL_BK)) { + if (!p->lbprm.fbck) { + /* there was no backup server anymore */ + p->lbprm.fbck = srv; + } else { + /* we may have restored a backup server prior to fbck, + * in which case it should replace it. + */ + struct server *srv2 = srv; + do { + srv2 = srv2->next; + } while (srv2 && (srv2 != p->lbprm.fbck)); + if (srv2) + p->lbprm.fbck = srv; + } + } + } else { + srv->lb_tree = &p->lbprm.fas.act; + p->lbprm.tot_wact += srv->next_eweight; + p->srv_act++; + } + + /* note that eweight cannot be 0 here */ + fas_queue_srv(srv); + + out_update_backend: + /* check/update tot_used, tot_weight */ + update_backend_weight(p); + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); + + out_update_state: + srv_lb_commit_status(srv); +} + +/* This function must be called after an update to server <srv>'s effective + * weight. It may be called after a state change too. + * + * The server's lock must be held. The lbprm's lock will be used. + */ +static void fas_update_server_weight(struct server *srv) +{ + int old_state, new_state; + struct proxy *p = srv->proxy; + + if (!srv_lb_status_changed(srv)) + return; + + /* If changing the server's weight changes its state, we simply apply + * the procedures we already have for status change. If the state + * remains down, the server is not in any tree, so it's as easy as + * updating its values. 
If the state remains up with different weights, + * there are some computations to perform to find a new place and + * possibly a new tree for this server. + */ + + old_state = srv_currently_usable(srv); + new_state = srv_willbe_usable(srv); + + if (!old_state && !new_state) { + srv_lb_commit_status(srv); + return; + } + else if (!old_state && new_state) { + fas_set_server_status_up(srv); + return; + } + else if (old_state && !new_state) { + fas_set_server_status_down(srv); + return; + } + + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + + if (srv->lb_tree) + fas_dequeue_srv(srv); + + if (srv->flags & SRV_F_BACKUP) { + p->lbprm.tot_wbck += srv->next_eweight - srv->cur_eweight; + srv->lb_tree = &p->lbprm.fas.bck; + } else { + p->lbprm.tot_wact += srv->next_eweight - srv->cur_eweight; + srv->lb_tree = &p->lbprm.fas.act; + } + + fas_queue_srv(srv); + + update_backend_weight(p); + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); + + srv_lb_commit_status(srv); +} + +/* This function is responsible for building the trees for the first + * available server algorithm. It also sets p->lbprm.wdiv to the eweight to + * uweight ratio. Both active and backup groups are initialized. + */ +void fas_init_server_tree(struct proxy *p) +{ + struct server *srv; + struct eb_root init_head = EB_ROOT; + + p->lbprm.set_server_status_up = fas_set_server_status_up; + p->lbprm.set_server_status_down = fas_set_server_status_down; + p->lbprm.update_server_eweight = fas_update_server_weight; + p->lbprm.server_take_conn = fas_srv_reposition; + p->lbprm.server_drop_conn = fas_srv_reposition; + + p->lbprm.wdiv = BE_WEIGHT_SCALE; + for (srv = p->srv; srv; srv = srv->next) { + srv->next_eweight = (srv->uweight * p->lbprm.wdiv + p->lbprm.wmult - 1) / p->lbprm.wmult; + srv_lb_commit_status(srv); + } + + recount_servers(p); + update_backend_weight(p); + + p->lbprm.fas.act = init_head; + p->lbprm.fas.bck = init_head; + + /* queue active and backup servers in two distinct groups */ + for (srv = p->srv; srv; srv = srv->next) { + if (!srv_currently_usable(srv)) + continue; + srv->lb_tree = (srv->flags & SRV_F_BACKUP) ? &p->lbprm.fas.bck : &p->lbprm.fas.act; + fas_queue_srv(srv); + } +} + +/* Return next server from the FS tree in backend <p>. If the tree is empty, + * return NULL. Saturated servers are skipped. + * + * The lbprm's lock will be used. The server's lock is not used. + */ +struct server *fas_get_next_server(struct proxy *p, struct server *srvtoavoid) +{ + struct server *srv, *avoided; + struct eb32_node *node; + + srv = avoided = NULL; + + HA_RWLOCK_RDLOCK(LBPRM_LOCK, &p->lbprm.lock); + if (p->srv_act) + node = eb32_first(&p->lbprm.fas.act); + else if (p->lbprm.fbck) { + srv = p->lbprm.fbck; + goto out; + } + else if (p->srv_bck) + node = eb32_first(&p->lbprm.fas.bck); + else { + srv = NULL; + goto out; + } + + while (node) { + /* OK, we have a server. However, it may be saturated, in which + * case we don't want to reconsider it for now, so we'll simply + * skip it. Same if it's the server we try to avoid, in which + * case we simply remember it for later use if needed.
+ */ + struct server *s; + + s = eb32_entry(node, struct server, lb_node); + if (!s->maxconn || (!s->queue.length && s->served < srv_dynamic_maxconn(s))) { + if (s != srvtoavoid) { + srv = s; + break; + } + avoided = s; + } + node = eb32_next(node); + } + + if (!srv) + srv = avoided; + out: + HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &p->lbprm.lock); + return srv; +} + + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/lb_fwlc.c b/src/lb_fwlc.c new file mode 100644 index 0000000..8e913d4 --- /dev/null +++ b/src/lb_fwlc.c @@ -0,0 +1,375 @@ +/* + * Fast Weighted Least Connection load balancing algorithm. + * + * Copyright 2000-2009 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <import/eb32tree.h> +#include <haproxy/api.h> +#include <haproxy/backend.h> +#include <haproxy/queue.h> +#include <haproxy/server-t.h> + + +/* Remove a server from a tree. It must have previously been dequeued. This + * function is meant to be called when a server is going down or has its + * weight disabled. + * + * The server's lock and the lbprm's lock must be held. + */ +static inline void fwlc_remove_from_tree(struct server *s) +{ + s->lb_tree = NULL; +} + +/* simply removes a server from a tree. + * + * The lbprm's lock must be held. + */ +static inline void fwlc_dequeue_srv(struct server *s) +{ + eb32_delete(&s->lb_node); +} + +/* Queue a server in its associated tree, assuming the <eweight> is >0. + * Servers are sorted by (#conns+1)/weight. To ensure maximum accuracy, + * we use (#conns+1)*SRV_EWGHT_MAX/eweight as the sorting key. The reason + * for using #conns+1 is to sort by weights in case the server is picked + * and not before it is picked. This provides a better load accuracy for + * low connection counts when weights differ and makes sure the round-robin + * applies between servers of highest weight first. However servers with no + * connection are always picked first so that under low loads, it's not + * always the single server with the highest weight that gets picked. + * + * NOTE: Depending on the calling context, we use s->next_eweight or + * s->cur_eweight. The next value is used when the server state is updated + * (because the weight changed for instance). During this step, the server + * state is not yet committed. The current value is used to reposition the + * server in the tree. This happens when the server is used. + * + * The lbprm's lock must be held. + */ +static inline void fwlc_queue_srv(struct server *s, unsigned int eweight) +{ + unsigned int inflight = _HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queue.length); + + s->lb_node.key = inflight ? (inflight + 1) * SRV_EWGHT_MAX / eweight : 0; + eb32_insert(s->lb_tree, &s->lb_node); +} + +/* Re-position the server in the FWLC tree after it has been assigned one + * connection or after it has released one. Note that it is possible that + * the server has been moved out of the tree due to failed health-checks. + * The lbprm's lock will be used. + */ +static void fwlc_srv_reposition(struct server *s) +{ + unsigned int inflight = _HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queue.length); + unsigned int eweight = _HA_ATOMIC_LOAD(&s->cur_eweight); + unsigned int new_key = inflight ? (inflight + 1) * SRV_EWGHT_MAX / (eweight ? 
eweight : 1) : 0; + + /* some calls will be made for no change (e.g. connect_server() after + * assign_server()). Let's check that first. + */ + if (s->lb_node.node.leaf_p && eweight && s->lb_node.key == new_key) + return; + + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock); + if (s->lb_tree) { + /* we might have been waiting for a while on the lock above + * so it's worth testing again because other threads are very + * likely to have released a connection or taken one leading + * to our target value (50% of the cases in measurements). + */ + inflight = _HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queue.length); + eweight = _HA_ATOMIC_LOAD(&s->cur_eweight); + new_key = inflight ? (inflight + 1) * SRV_EWGHT_MAX / (eweight ? eweight : 1) : 0; + if (!s->lb_node.node.leaf_p || s->lb_node.key != new_key) { + eb32_delete(&s->lb_node); + s->lb_node.key = new_key; + eb32_insert(s->lb_tree, &s->lb_node); + } + } + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock); +} + +/* This function updates the server trees according to server <srv>'s new + * state. It should be called when server <srv>'s status changes to down. + * It is not important whether the server was already down or not. It is not + * important either that the new state is completely down (the caller may not + * know all the variables of a server's state). + * + * The server's lock must be held. The lbprm's lock will be used. + */ +static void fwlc_set_server_status_down(struct server *srv) +{ + struct proxy *p = srv->proxy; + + if (!srv_lb_status_changed(srv)) + return; + + if (srv_willbe_usable(srv)) + goto out_update_state; + + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + + if (!srv_currently_usable(srv)) + /* server was already down */ + goto out_update_backend; + + if (srv->flags & SRV_F_BACKUP) { + p->lbprm.tot_wbck -= srv->cur_eweight; + p->srv_bck--; + + if (srv == p->lbprm.fbck) { + /* we lost the first backup server in a single-backup + * configuration, we must search another one. + */ + struct server *srv2 = p->lbprm.fbck; + do { + srv2 = srv2->next; + } while (srv2 && + !((srv2->flags & SRV_F_BACKUP) && + srv_willbe_usable(srv2))); + p->lbprm.fbck = srv2; + } + } else { + p->lbprm.tot_wact -= srv->cur_eweight; + p->srv_act--; + } + + fwlc_dequeue_srv(srv); + fwlc_remove_from_tree(srv); + +out_update_backend: + /* check/update tot_used, tot_weight */ + update_backend_weight(p); + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); + + out_update_state: + srv_lb_commit_status(srv); +} + +/* This function updates the server trees according to server <srv>'s new + * state. It should be called when server <srv>'s status changes to up. + * It is not important whether the server was already down or not. It is not + * important either that the new state is completely UP (the caller may not + * know all the variables of a server's state). This function will not change + * the weight of a server which was already up. + * + * The server's lock must be held. The lbprm's lock will be used.
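+ * + * (When the server is queued again, the sorting key documented above + * fwlc_queue_srv() applies; with hypothetical values, assuming SRV_EWGHT_MAX + * is 4080, a server with eweight 16 and 3 requests in flight sorts at + * (3 + 1) * 4080 / 16 = 1020, and fwlc_srv_reposition() re-sorts it at 765 + * once one request completes.)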
+ */ +static void fwlc_set_server_status_up(struct server *srv) +{ + struct proxy *p = srv->proxy; + + if (!srv_lb_status_changed(srv)) + return; + + if (!srv_willbe_usable(srv)) + goto out_update_state; + + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + + if (srv_currently_usable(srv)) + /* server was already up */ + goto out_update_backend; + + if (srv->flags & SRV_F_BACKUP) { + srv->lb_tree = &p->lbprm.fwlc.bck; + p->lbprm.tot_wbck += srv->next_eweight; + p->srv_bck++; + + if (!(p->options & PR_O_USE_ALL_BK)) { + if (!p->lbprm.fbck) { + /* there was no backup server anymore */ + p->lbprm.fbck = srv; + } else { + /* we may have restored a backup server prior to fbck, + * in which case it should replace it. + */ + struct server *srv2 = srv; + do { + srv2 = srv2->next; + } while (srv2 && (srv2 != p->lbprm.fbck)); + if (srv2) + p->lbprm.fbck = srv; + } + } + } else { + srv->lb_tree = &p->lbprm.fwlc.act; + p->lbprm.tot_wact += srv->next_eweight; + p->srv_act++; + } + + /* note that eweight cannot be 0 here */ + fwlc_queue_srv(srv, srv->next_eweight); + + out_update_backend: + /* check/update tot_used, tot_weight */ + update_backend_weight(p); + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); + + out_update_state: + srv_lb_commit_status(srv); +} + +/* This function must be called after an update to server <srv>'s effective + * weight. It may be called after a state change too. + * + * The server's lock must be held. The lbprm's lock will be used. + */ +static void fwlc_update_server_weight(struct server *srv) +{ + int old_state, new_state; + struct proxy *p = srv->proxy; + + if (!srv_lb_status_changed(srv)) + return; + + /* If changing the server's weight changes its state, we simply apply + * the procedures we already have for status change. If the state + * remains down, the server is not in any tree, so it's as easy as + * updating its values. If the state remains up with different weights, + * there are some computations to perform to find a new place and + * possibly a new tree for this server. + */ + + old_state = srv_currently_usable(srv); + new_state = srv_willbe_usable(srv); + + if (!old_state && !new_state) { + srv_lb_commit_status(srv); + return; + } + else if (!old_state && new_state) { + fwlc_set_server_status_up(srv); + return; + } + else if (old_state && !new_state) { + fwlc_set_server_status_down(srv); + return; + } + + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + + if (srv->lb_tree) + fwlc_dequeue_srv(srv); + + if (srv->flags & SRV_F_BACKUP) { + p->lbprm.tot_wbck += srv->next_eweight - srv->cur_eweight; + srv->lb_tree = &p->lbprm.fwlc.bck; + } else { + p->lbprm.tot_wact += srv->next_eweight - srv->cur_eweight; + srv->lb_tree = &p->lbprm.fwlc.act; + } + + fwlc_queue_srv(srv, srv->next_eweight); + + update_backend_weight(p); + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); + + srv_lb_commit_status(srv); +} + +/* This function is responsible for building the trees in case of fast + * weighted least-conns. It also sets p->lbprm.wdiv to the eweight to + * uweight ratio. Both active and backup groups are initialized. 
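+ * + * This is the code path behind "balance leastconn" (a sketch): + * + * backend app + * balance leastconn + * server s1 192.0.2.1:80 weight 10 + * server s2 192.0.2.2:80 weight 20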
+ */ +void fwlc_init_server_tree(struct proxy *p) +{ + struct server *srv; + struct eb_root init_head = EB_ROOT; + + p->lbprm.set_server_status_up = fwlc_set_server_status_up; + p->lbprm.set_server_status_down = fwlc_set_server_status_down; + p->lbprm.update_server_eweight = fwlc_update_server_weight; + p->lbprm.server_take_conn = fwlc_srv_reposition; + p->lbprm.server_drop_conn = fwlc_srv_reposition; + + p->lbprm.wdiv = BE_WEIGHT_SCALE; + for (srv = p->srv; srv; srv = srv->next) { + srv->next_eweight = (srv->uweight * p->lbprm.wdiv + p->lbprm.wmult - 1) / p->lbprm.wmult; + srv_lb_commit_status(srv); + } + + recount_servers(p); + update_backend_weight(p); + + p->lbprm.fwlc.act = init_head; + p->lbprm.fwlc.bck = init_head; + + /* queue active and backup servers in two distinct groups */ + for (srv = p->srv; srv; srv = srv->next) { + if (!srv_currently_usable(srv)) + continue; + srv->lb_tree = (srv->flags & SRV_F_BACKUP) ? &p->lbprm.fwlc.bck : &p->lbprm.fwlc.act; + fwlc_queue_srv(srv, srv->next_eweight); + } +} + +/* Return next server from the FWLC tree in backend <p>. If the tree is empty, + * return NULL. Saturated servers are skipped. + * + * The lbprm's lock will be used in R/O mode. The server's lock is not used. + */ +struct server *fwlc_get_next_server(struct proxy *p, struct server *srvtoavoid) +{ + struct server *srv, *avoided; + struct eb32_node *node; + + srv = avoided = NULL; + + HA_RWLOCK_RDLOCK(LBPRM_LOCK, &p->lbprm.lock); + if (p->srv_act) + node = eb32_first(&p->lbprm.fwlc.act); + else if (p->lbprm.fbck) { + srv = p->lbprm.fbck; + goto out; + } + else if (p->srv_bck) + node = eb32_first(&p->lbprm.fwlc.bck); + else { + srv = NULL; + goto out; + } + + while (node) { + /* OK, we have a server. However, it may be saturated, in which + * case we don't want to reconsider it for now, so we'll simply + * skip it. Same if it's the server we try to avoid, in which + * case we simply remember it for later use if needed. + */ + struct server *s; + + s = eb32_entry(node, struct server, lb_node); + if (!s->maxconn || s->served + s->queue.length < srv_dynamic_maxconn(s) + s->maxqueue) { + if (s != srvtoavoid) { + srv = s; + break; + } + avoided = s; + } + node = eb32_next(node); + } + + if (!srv) + srv = avoided; + out: + HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &p->lbprm.lock); + return srv; +} + + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/lb_fwrr.c b/src/lb_fwrr.c new file mode 100644 index 0000000..a762623 --- /dev/null +++ b/src/lb_fwrr.c @@ -0,0 +1,623 @@ +/* + * Fast Weighted Round Robin load balancing algorithm. + * + * Copyright 2000-2009 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <import/eb32tree.h> +#include <haproxy/api.h> +#include <haproxy/backend.h> +#include <haproxy/queue.h> +#include <haproxy/server-t.h> + + +static inline void fwrr_remove_from_tree(struct server *s); +static inline void fwrr_queue_by_weight(struct eb_root *root, struct server *s); +static inline void fwrr_dequeue_srv(struct server *s); +static void fwrr_get_srv(struct server *s); +static void fwrr_queue_srv(struct server *s); + + +/* This function updates the server trees according to server <srv>'s new + * state. It should be called when server <srv>'s status changes to down. 
+ * It is not important whether the server was already down or not. It is not + * important either that the new state is completely down (the caller may not + * know all the variables of a server's state). + * + * The server's lock must be held. The lbprm's lock will be used. + */ +static void fwrr_set_server_status_down(struct server *srv) +{ + struct proxy *p = srv->proxy; + struct fwrr_group *grp; + + if (!srv_lb_status_changed(srv)) + return; + + if (srv_willbe_usable(srv)) + goto out_update_state; + + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + + if (!srv_currently_usable(srv)) + /* server was already down */ + goto out_update_backend; + + grp = (srv->flags & SRV_F_BACKUP) ? &p->lbprm.fwrr.bck : &p->lbprm.fwrr.act; + grp->next_weight -= srv->cur_eweight; + + if (srv->flags & SRV_F_BACKUP) { + p->lbprm.tot_wbck = p->lbprm.fwrr.bck.next_weight; + p->srv_bck--; + + if (srv == p->lbprm.fbck) { + /* we lost the first backup server in a single-backup + * configuration, we must search another one. + */ + struct server *srv2 = p->lbprm.fbck; + do { + srv2 = srv2->next; + } while (srv2 && + !((srv2->flags & SRV_F_BACKUP) && + srv_willbe_usable(srv2))); + p->lbprm.fbck = srv2; + } + } else { + p->lbprm.tot_wact = p->lbprm.fwrr.act.next_weight; + p->srv_act--; + } + + fwrr_dequeue_srv(srv); + fwrr_remove_from_tree(srv); + +out_update_backend: + /* check/update tot_used, tot_weight */ + update_backend_weight(p); + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); + + out_update_state: + srv_lb_commit_status(srv); +} + +/* This function updates the server trees according to server <srv>'s new + * state. It should be called when server <srv>'s status changes to up. + * It is not important whether the server was already down or not. It is not + * important either that the new state is completely UP (the caller may not + * know all the variables of a server's state). This function will not change + * the weight of a server which was already up. + * + * The server's lock must be held. The lbprm's lock will be used. + */ +static void fwrr_set_server_status_up(struct server *srv) +{ + struct proxy *p = srv->proxy; + struct fwrr_group *grp; + + if (!srv_lb_status_changed(srv)) + return; + + if (!srv_willbe_usable(srv)) + goto out_update_state; + + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + + if (srv_currently_usable(srv)) + /* server was already up */ + goto out_update_backend; + + grp = (srv->flags & SRV_F_BACKUP) ? &p->lbprm.fwrr.bck : &p->lbprm.fwrr.act; + grp->next_weight += srv->next_eweight; + + if (srv->flags & SRV_F_BACKUP) { + p->lbprm.tot_wbck = p->lbprm.fwrr.bck.next_weight; + p->srv_bck++; + + if (!(p->options & PR_O_USE_ALL_BK)) { + if (!p->lbprm.fbck) { + /* there was no backup server anymore */ + p->lbprm.fbck = srv; + } else { + /* we may have restored a backup server prior to fbck, + * in which case it should replace it. 
+ */ + struct server *srv2 = srv; + do { + srv2 = srv2->next; + } while (srv2 && (srv2 != p->lbprm.fbck)); + if (srv2) + p->lbprm.fbck = srv; + } + } + } else { + p->lbprm.tot_wact = p->lbprm.fwrr.act.next_weight; + p->srv_act++; + } + + /* note that eweight cannot be 0 here */ + fwrr_get_srv(srv); + srv->npos = grp->curr_pos + (grp->next_weight + grp->curr_weight - grp->curr_pos) / srv->next_eweight; + fwrr_queue_srv(srv); + +out_update_backend: + /* check/update tot_used, tot_weight */ + update_backend_weight(p); + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); + + out_update_state: + srv_lb_commit_status(srv); +} + +/* This function must be called after an update to server <srv>'s effective + * weight. It may be called after a state change too. + * + * The server's lock must be held. The lbprm's lock will be used. + */ +static void fwrr_update_server_weight(struct server *srv) +{ + int old_state, new_state; + struct proxy *p = srv->proxy; + struct fwrr_group *grp; + + if (!srv_lb_status_changed(srv)) + return; + + /* If changing the server's weight changes its state, we simply apply + * the procedures we already have for status change. If the state + * remains down, the server is not in any tree, so it's as easy as + * updating its values. If the state remains up with different weights, + * there are some computations to perform to find a new place and + * possibly a new tree for this server. + */ + + old_state = srv_currently_usable(srv); + new_state = srv_willbe_usable(srv); + + if (!old_state && !new_state) { + srv_lb_commit_status(srv); + return; + } + else if (!old_state && new_state) { + fwrr_set_server_status_up(srv); + return; + } + else if (old_state && !new_state) { + fwrr_set_server_status_down(srv); + return; + } + + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + + grp = (srv->flags & SRV_F_BACKUP) ? &p->lbprm.fwrr.bck : &p->lbprm.fwrr.act; + grp->next_weight = grp->next_weight - srv->cur_eweight + srv->next_eweight; + + p->lbprm.tot_wact = p->lbprm.fwrr.act.next_weight; + p->lbprm.tot_wbck = p->lbprm.fwrr.bck.next_weight; + + if (srv->lb_tree == grp->init) { + fwrr_dequeue_srv(srv); + fwrr_queue_by_weight(grp->init, srv); + } + else if (!srv->lb_tree) { + /* FIXME: server was down. This is not possible right now but + * may be needed soon for slowstart or graceful shutdown. + */ + fwrr_dequeue_srv(srv); + fwrr_get_srv(srv); + srv->npos = grp->curr_pos + (grp->next_weight + grp->curr_weight - grp->curr_pos) / srv->next_eweight; + fwrr_queue_srv(srv); + } else { + /* The server is either active or in the next queue. If it's + * still in the active queue and it has not consumed all of its + * places, let's adjust its next position. + */ + fwrr_get_srv(srv); + + if (srv->next_eweight > 0) { + int prev_next = srv->npos; + int step = grp->next_weight / srv->next_eweight; + + srv->npos = srv->lpos + step; + srv->rweight = 0; + + if (srv->npos > prev_next) + srv->npos = prev_next; + if (srv->npos < grp->curr_pos + 2) + srv->npos = grp->curr_pos + step; + } else { + /* push it into the next tree */ + srv->npos = grp->curr_pos + grp->curr_weight; + } + + fwrr_dequeue_srv(srv); + fwrr_queue_srv(srv); + } + + update_backend_weight(p); + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); + + srv_lb_commit_status(srv); +} + +/* Remove a server from a tree. It must have previously been dequeued. This + * function is meant to be called when a server is going down or has its + * weight disabled. + * + * The lbprm's lock must be held. The server's lock is not used. 
+ */ +static inline void fwrr_remove_from_tree(struct server *s) +{ + s->lb_tree = NULL; +} + +/* Queue a server in the weight tree <root>, assuming the weight is >0. + * We want to sort them by inverted weights, because we need to place + * heavy servers first in order to get a smooth distribution. + * + * The lbprm's lock must be held. The server's lock is not used. + */ +static inline void fwrr_queue_by_weight(struct eb_root *root, struct server *s) +{ + s->lb_node.key = SRV_EWGHT_MAX - s->next_eweight; + eb32_insert(root, &s->lb_node); + s->lb_tree = root; +} + +/* This function is responsible for building the weight trees in case of fast + * weighted round-robin. It also sets p->lbprm.wdiv to the eweight to uweight + * ratio. Both active and backup groups are initialized. + */ +void fwrr_init_server_groups(struct proxy *p) +{ + struct server *srv; + struct eb_root init_head = EB_ROOT; + + p->lbprm.set_server_status_up = fwrr_set_server_status_up; + p->lbprm.set_server_status_down = fwrr_set_server_status_down; + p->lbprm.update_server_eweight = fwrr_update_server_weight; + + p->lbprm.wdiv = BE_WEIGHT_SCALE; + for (srv = p->srv; srv; srv = srv->next) { + srv->next_eweight = (srv->uweight * p->lbprm.wdiv + p->lbprm.wmult - 1) / p->lbprm.wmult; + srv_lb_commit_status(srv); + } + + recount_servers(p); + update_backend_weight(p); + + /* prepare the active servers group */ + p->lbprm.fwrr.act.curr_pos = p->lbprm.fwrr.act.curr_weight = + p->lbprm.fwrr.act.next_weight = p->lbprm.tot_wact; + p->lbprm.fwrr.act.curr = p->lbprm.fwrr.act.t0 = + p->lbprm.fwrr.act.t1 = init_head; + p->lbprm.fwrr.act.init = &p->lbprm.fwrr.act.t0; + p->lbprm.fwrr.act.next = &p->lbprm.fwrr.act.t1; + + /* prepare the backup servers group */ + p->lbprm.fwrr.bck.curr_pos = p->lbprm.fwrr.bck.curr_weight = + p->lbprm.fwrr.bck.next_weight = p->lbprm.tot_wbck; + p->lbprm.fwrr.bck.curr = p->lbprm.fwrr.bck.t0 = + p->lbprm.fwrr.bck.t1 = init_head; + p->lbprm.fwrr.bck.init = &p->lbprm.fwrr.bck.t0; + p->lbprm.fwrr.bck.next = &p->lbprm.fwrr.bck.t1; + + /* queue active and backup servers in two distinct groups */ + for (srv = p->srv; srv; srv = srv->next) { + if (!srv_currently_usable(srv)) + continue; + fwrr_queue_by_weight((srv->flags & SRV_F_BACKUP) ? + p->lbprm.fwrr.bck.init : + p->lbprm.fwrr.act.init, + srv); + } +} + +/* simply removes a server from a weight tree. + * + * The lbprm's lock must be held. The server's lock is not used. + */ +static inline void fwrr_dequeue_srv(struct server *s) +{ + eb32_delete(&s->lb_node); +} + +/* queues a server into the appropriate group and tree depending on its + * backup status, and ->npos. If the server is disabled, simply assign + * it to the NULL tree. + * + * The lbprm's lock must be held. The server's lock is not used. + */ +static void fwrr_queue_srv(struct server *s) +{ + struct proxy *p = s->proxy; + struct fwrr_group *grp; + + grp = (s->flags & SRV_F_BACKUP) ? &p->lbprm.fwrr.bck : &p->lbprm.fwrr.act; + + /* Delay everything which does not fit into the window and everything + * which does not fit into the theoretical new window. + */ + if (!srv_willbe_usable(s)) { + fwrr_remove_from_tree(s); + } + else if (s->next_eweight <= 0 || + s->npos >= 2 * grp->curr_weight || + s->npos >= grp->curr_weight + grp->next_weight) { + /* put into next tree, and readjust npos in case we could + * finally take this back to current. 
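+ * (Illustration of the npos accounting used in this file, with hypothetical + * numbers: a server of eweight 3 in a group whose next_weight is 4 advances + * npos by 4/3 = 1 per pick and accumulates the remainder 1 into rweight; + * every third pick rweight reaches eweight and npos takes one extra step, + * so the server holds 3 of every 4 positions over a full cycle.)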
*/ + s->npos -= grp->curr_weight; + fwrr_queue_by_weight(grp->next, s); + } + else { + /* The sorting key is stored in units of s->npos * user_weight + * in order to avoid overflows. As stated in backend.h, the + * lower the scale, the rougher the weights modulation, and the + * higher the scale, the lower the number of servers without + * overflow. With this formula, the result is always positive, + * so we can use eb32_insert(). + */ + s->lb_node.key = SRV_UWGHT_RANGE * s->npos + + (unsigned)(SRV_EWGHT_MAX + s->rweight - s->next_eweight) / BE_WEIGHT_SCALE; + + eb32_insert(&grp->curr, &s->lb_node); + s->lb_tree = &grp->curr; + } +} + +/* prepares a server when extracting it from the "init" tree. + * + * The lbprm's lock must be held. The server's lock is not used. + */ +static inline void fwrr_get_srv_init(struct server *s) +{ + s->npos = s->rweight = 0; +} + +/* prepares a server when extracting it from the "next" tree. + * + * The lbprm's lock must be held. The server's lock is not used. + */ +static inline void fwrr_get_srv_next(struct server *s) +{ + struct fwrr_group *grp = (s->flags & SRV_F_BACKUP) ? + &s->proxy->lbprm.fwrr.bck : + &s->proxy->lbprm.fwrr.act; + + s->npos += grp->curr_weight; +} + +/* prepares a server when it was marked down. + * + * The lbprm's lock must be held. The server's lock is not used. + */ +static inline void fwrr_get_srv_down(struct server *s) +{ + struct fwrr_group *grp = (s->flags & SRV_F_BACKUP) ? + &s->proxy->lbprm.fwrr.bck : + &s->proxy->lbprm.fwrr.act; + + s->npos = grp->curr_pos; +} + +/* prepares a server when extracting it from its tree. + * + * The lbprm's lock must be held. The server's lock is not used. + */ +static void fwrr_get_srv(struct server *s) +{ + struct proxy *p = s->proxy; + struct fwrr_group *grp = (s->flags & SRV_F_BACKUP) ? + &p->lbprm.fwrr.bck : + &p->lbprm.fwrr.act; + + if (s->lb_tree == grp->init) { + fwrr_get_srv_init(s); + } + else if (s->lb_tree == grp->next) { + fwrr_get_srv_next(s); + } + else if (s->lb_tree == NULL) { + fwrr_get_srv_down(s); + } +} + +/* switches trees "init" and "next" for FWRR group <grp>. "init" should be empty + * when this happens, and "next" filled with servers sorted by weights. + * + * The lbprm's lock must be held. The server's lock is not used. + */ +static inline void fwrr_switch_trees(struct fwrr_group *grp) +{ + struct eb_root *swap; + swap = grp->init; + grp->init = grp->next; + grp->next = swap; + grp->curr_weight = grp->next_weight; + grp->curr_pos = grp->curr_weight; +} + +/* return next server from the current tree in FWRR group <grp>, or a server + * from the "init" tree if appropriate. If both trees are empty, return NULL. + * + * The lbprm's lock must be held. The server's lock is not used. + */ +static struct server *fwrr_get_server_from_group(struct fwrr_group *grp) +{ + struct eb32_node *node1; + struct eb32_node *node2; + struct server *s1 = NULL; + struct server *s2 = NULL; + + node1 = eb32_first(&grp->curr); + if (node1) { + s1 = eb32_entry(node1, struct server, lb_node); + if (s1->cur_eweight && s1->npos <= grp->curr_pos) + return s1; + } + + /* Either we have no server left, or we have a hole. We'll look in the + * init tree for a better proposal. At this point, if <s1> is non-null, + * it is guaranteed to remain available as the tree is locked.
+ */ + node2 = eb32_first(grp->init); + if (node2) { + s2 = eb32_entry(node2, struct server, lb_node); + if (s2->cur_eweight) { + fwrr_get_srv_init(s2); + return s2; + } + } + return s1; +} + +/* Computes next position of server <s> in the group. Nothing is done if <s> + * has a zero weight. + * + * The lbprm's lock must be held to protect lpos/npos/rweight. + */ +static inline void fwrr_update_position(struct fwrr_group *grp, struct server *s) +{ + unsigned int eweight = *(volatile unsigned int *)&s->cur_eweight; + + if (!eweight) + return; + + if (!s->npos) { + /* first time ever for this server */ + s->npos = grp->curr_pos; + } + + s->lpos = s->npos; + s->npos += grp->next_weight / eweight; + s->rweight += grp->next_weight % eweight; + + if (s->rweight >= eweight) { + s->rweight -= eweight; + s->npos++; + } +} + +/* Return next server from the current tree in backend <p>, or a server from + * the init tree if appropriate. If both trees are empty, return NULL. + * Saturated servers are skipped and requeued. + * + * The lbprm's lock will be used in R/W mode. The server's lock is not used. + */ +struct server *fwrr_get_next_server(struct proxy *p, struct server *srvtoavoid) +{ + struct server *srv, *full, *avoided; + struct fwrr_group *grp; + int switched; + + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + if (p->srv_act) + grp = &p->lbprm.fwrr.act; + else if (p->lbprm.fbck) { + srv = p->lbprm.fbck; + goto out; + } + else if (p->srv_bck) + grp = &p->lbprm.fwrr.bck; + else { + srv = NULL; + goto out; + } + + switched = 0; + avoided = NULL; + full = NULL; /* NULL-terminated list of saturated servers */ + while (1) { + /* if we see an empty group, let's first try to collect weights + * which might have recently changed. + */ + if (!grp->curr_weight) + grp->curr_pos = grp->curr_weight = grp->next_weight; + + /* get first server from the "current" tree. When the end of + * the tree is reached, we may have to switch, but only once. + */ + while (1) { + srv = fwrr_get_server_from_group(grp); + if (srv) + break; + if (switched) { + if (avoided) { + srv = avoided; + goto take_this_one; + } + goto requeue_servers; + } + switched = 1; + fwrr_switch_trees(grp); + } + + /* OK, we have a server. However, it may be saturated, in which + * case we don't want to reconsider it for now. We'll update + * its position and dequeue it anyway, so that we can move it + * to a better place afterwards. + */ + fwrr_update_position(grp, srv); + fwrr_dequeue_srv(srv); + grp->curr_pos++; + if (!srv->maxconn || (!srv->queue.length && srv->served < srv_dynamic_maxconn(srv))) { + /* make sure it is not the server we are trying to exclude... */ + if (srv != srvtoavoid || avoided) + break; + + avoided = srv; /* ...but remember that it was selected yet avoided */ + } + + /* the server is saturated or avoided, let's chain it for later reinsertion. + */ + srv->next_full = full; + full = srv; + } + + take_this_one: + /* OK, we got the best server, let's update it */ + fwrr_queue_srv(srv); + + requeue_servers: + /* Requeue all extracted servers. If full==srv then it was + * avoided (unsuccessfully) and chained, omit it now. The + * only way to get there is by having <avoided>==NULL or + * <avoided>==<srv>. + */ + if (unlikely(full != NULL)) { + if (switched) { + /* the tree has switched, requeue all extracted servers + * into "init", because their place was lost, and only + * their weight matters.
+ */ + do { + if (likely(full != srv)) + fwrr_queue_by_weight(grp->init, full); + full = full->next_full; + } while (full); + } else { + /* requeue all extracted servers just as if they were consumed + * so that they regain their expected place. + */ + do { + if (likely(full != srv)) + fwrr_queue_srv(full); + full = full->next_full; + } while (full); + } + } + out: + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); + return srv; +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/lb_map.c b/src/lb_map.c new file mode 100644 index 0000000..592df91 --- /dev/null +++ b/src/lb_map.c @@ -0,0 +1,281 @@ +/* + * Map-based load-balancing (RR and HASH) + * + * Copyright 2000-2009 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <import/eb32tree.h> +#include <haproxy/api.h> +#include <haproxy/backend.h> +#include <haproxy/lb_map.h> +#include <haproxy/queue.h> +#include <haproxy/server-t.h> + +/* this function updates the map according to server <srv>'s new state. + * + * The server's lock must be held. The lbprm's lock will be used. + */ +static void map_set_server_status_down(struct server *srv) +{ + struct proxy *p = srv->proxy; + + if (!srv_lb_status_changed(srv)) + return; + + if (srv_willbe_usable(srv)) + goto out_update_state; + + /* FIXME: could be optimized since we know what changed */ + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + recount_servers(p); + update_backend_weight(p); + recalc_server_map(p); + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); + out_update_state: + srv_lb_commit_status(srv); +} + +/* This function updates the map according to server <srv>'s new state. + * + * The server's lock must be held. The lbprm's lock will be used. + */ +static void map_set_server_status_up(struct server *srv) +{ + struct proxy *p = srv->proxy; + + if (!srv_lb_status_changed(srv)) + return; + + if (!srv_willbe_usable(srv)) + goto out_update_state; + + /* FIXME: could be optimized since we know what changed */ + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + recount_servers(p); + update_backend_weight(p); + recalc_server_map(p); + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); + out_update_state: + srv_lb_commit_status(srv); +} + +/* This function recomputes the server map for proxy px. It relies on + * px->lbprm.tot_wact, tot_wbck, tot_used, tot_weight, so it must be + * called after recount_servers(). It also expects px->lbprm.map.srv + * to be allocated with the largest size needed. It updates tot_weight. + * + * The lbprm's lock must be held. + */ +void recalc_server_map(struct proxy *px) +{ + int o, tot, flag; + struct server *cur, *best; + + switch (px->lbprm.tot_used) { + case 0: /* no server */ + return; + default: + tot = px->lbprm.tot_weight; + break; + } + + /* here we *know* that we have some servers */ + if (px->srv_act) + flag = 0; + else + flag = SRV_F_BACKUP; + + /* this algorithm gives priority to the first server, which means that + * it will respect the declaration order for equivalent weights, and + * that whatever the weights, the first server called will always be + * the first declared. This is an important assumption for the backup + * case, where we want the first server only. 
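+ * (Worked example with hypothetical weights: two active servers s1 and s2 + * with weights 3 and 1 give tot = 4, and the loop below fills the map as + * s1, s1, s1, s2; ties are broken in favor of the earlier server, so s2 + * only wins the last slot.)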
+ */ + for (cur = px->srv; cur; cur = cur->next) + cur->wscore = 0; + + for (o = 0; o < tot; o++) { + int max = 0; + best = NULL; + for (cur = px->srv; cur; cur = cur->next) { + if ((cur->flags & SRV_F_BACKUP) == flag && + srv_willbe_usable(cur)) { + int v; + + /* If we are forced to return only one server, we don't want to + * go further, because we would return the wrong one due to + * divide overflow. + */ + if (tot == 1) { + best = cur; + /* note that best->wscore will be wrong but we don't care */ + break; + } + + _HA_ATOMIC_ADD(&cur->wscore, cur->next_eweight); + v = (cur->wscore + tot) / tot; /* result between 0 and 3 */ + if (best == NULL || v > max) { + max = v; + best = cur; + } + } + } + px->lbprm.map.srv[o] = best; + if (best) + _HA_ATOMIC_SUB(&best->wscore, tot); + } +} + +/* This function is responsible for building the server MAP for map-based LB + * algorithms, allocating the map, and setting p->lbprm.wmult to the GCD of the + * weights if applicable. It should be called only once per proxy, at config + * time. + */ +void init_server_map(struct proxy *p) +{ + struct server *srv; + int pgcd; + int act, bck; + + p->lbprm.set_server_status_up = map_set_server_status_up; + p->lbprm.set_server_status_down = map_set_server_status_down; + p->lbprm.update_server_eweight = NULL; + + if (!p->srv) + return; + + /* We will factor the weights to reduce the table, + * using Euclid's greatest common divisor algorithm. + * Since we may have zero weights, we have to first + * find a non-zero weight server. + */ + pgcd = 1; + srv = p->srv; + while (srv && !srv->uweight) + srv = srv->next; + + if (srv) { + pgcd = srv->uweight; /* note: cannot be zero */ + while (pgcd > 1 && (srv = srv->next)) { + int w = srv->uweight; + while (w) { + int t = pgcd % w; + pgcd = w; + w = t; + } + } + } + + /* It is sometimes useful to know what factor to apply + * to the backend's effective weight to know its real + * weight. + */ + p->lbprm.wmult = pgcd; + + act = bck = 0; + for (srv = p->srv; srv; srv = srv->next) { + srv->next_eweight = (srv->uweight * p->lbprm.wdiv + p->lbprm.wmult - 1) / p->lbprm.wmult; + + if (srv->flags & SRV_F_BACKUP) + bck += srv->next_eweight; + else + act += srv->next_eweight; + srv_lb_commit_status(srv); + } + + /* this is the largest map we will ever need for this server list */ + if (act < bck) + act = bck; + + if (!act) + act = 1; + + p->lbprm.map.srv = calloc(act, sizeof(*p->lbprm.map.srv)); + /* recounts servers and their weights */ + recount_servers(p); + update_backend_weight(p); + recalc_server_map(p); +} + +/* + * This function tries to find a running server with free connection slots for + * the proxy <px> following the round-robin method. + * If any server is found, it will be returned and px->lbprm.map.rr_idx will be updated + * to point to the next server. If no valid server is found, NULL is returned. + * + * The lbprm's lock will be used.
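+ * + * This is the selection path behind "balance static-rr" (a sketch): + * + * backend app + * balance static-rr + * server s1 192.0.2.1:80 weight 3 + * server s2 192.0.2.2:80 weight 1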
+ */ +struct server *map_get_server_rr(struct proxy *px, struct server *srvtoavoid) +{ + int newidx, avoididx; + struct server *srv, *avoided; + + HA_RWLOCK_SKLOCK(LBPRM_LOCK, &px->lbprm.lock); + if (px->lbprm.tot_weight == 0) { + avoided = NULL; + goto out; + } + + if (px->lbprm.map.rr_idx < 0 || px->lbprm.map.rr_idx >= px->lbprm.tot_weight) + px->lbprm.map.rr_idx = 0; + newidx = px->lbprm.map.rr_idx; + + avoided = NULL; + avoididx = 0; /* shut a gcc warning */ + do { + srv = px->lbprm.map.srv[newidx++]; + if (!srv->maxconn || (!srv->queue.length && srv->served < srv_dynamic_maxconn(srv))) { + /* make sure it is not the server we are trying to exclude... */ + /* ...but remember that it was selected yet avoided */ + avoided = srv; + avoididx = newidx; + if (srv != srvtoavoid) { + px->lbprm.map.rr_idx = newidx; + goto out; + } + } + if (newidx == px->lbprm.tot_weight) + newidx = 0; + } while (newidx != px->lbprm.map.rr_idx); + + if (avoided) + px->lbprm.map.rr_idx = avoididx; + + out: + HA_RWLOCK_SKUNLOCK(LBPRM_LOCK, &px->lbprm.lock); + /* return NULL or srvtoavoid if found */ + return avoided; +} + +/* + * This function returns the running server from the map at the location + * pointed to by the result of a modulo operation on <hash>. The server map may + * be recomputed if required before being looked up. If any server is found, it + * will be returned. If no valid server is found, NULL is returned. + * + * The lbprm's lock will be used. + */ +struct server *map_get_server_hash(struct proxy *px, unsigned int hash) +{ + struct server *srv = NULL; + + HA_RWLOCK_RDLOCK(LBPRM_LOCK, &px->lbprm.lock); + if (px->lbprm.tot_weight) + srv = px->lbprm.map.srv[hash % px->lbprm.tot_weight]; + HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &px->lbprm.lock); + return srv; +} + + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/linuxcap.c b/src/linuxcap.c new file mode 100644 index 0000000..919086c --- /dev/null +++ b/src/linuxcap.c @@ -0,0 +1,191 @@ +/* + * Minimal handling of Linux kernel capabilities + * + * Copyright 2000-2023 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +/* Depending on distros, some have capset(), others use the more complicated + * libcap. Let's stick to what we need and what the kernel documents (capset). + * Note that prctl is needed here. + */ +#include <linux/capability.h> +#include <sys/prctl.h> +#include <errno.h> +#include <unistd.h> +#include <syscall.h> + +#include <haproxy/api.h> +#include <haproxy/cfgparse.h> +#include <haproxy/errors.h> +#include <haproxy/tools.h> + +/* supported names, zero-terminated */ +static const struct { + int cap; + const char *name; +} known_caps[] = { +#ifdef CAP_NET_RAW + { CAP_NET_RAW, "cap_net_raw" }, +#endif +#ifdef CAP_NET_ADMIN + { CAP_NET_ADMIN, "cap_net_admin" }, +#endif +#ifdef CAP_NET_BIND_SERVICE + { CAP_NET_BIND_SERVICE, "cap_net_bind_service" }, +#endif + /* must be last */ + { 0, 0 } +}; + +/* provided by sys/capability.h on some distros */ +static inline int capset(cap_user_header_t hdrp, const cap_user_data_t datap) +{ + return syscall(SYS_capset, hdrp, datap); +} + +/* defaults to zero, i.e. we don't keep any cap after setuid() */ +static uint32_t caplist;
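+ +/* Example usage (a sketch, not a copy of any shipped configuration): putting + * "setcap cap_net_bind_service,cap_net_raw" in the global section keeps the + * two listed capabilities across the uid switch triggered by the "user" + * directive, e.g. to keep binding to ports below 1024 or to keep using + * transparent proxying without running as root. The keyword is parsed by + * cfg_parse_global_setcap() at the bottom of this file. + */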
+ * In practice we need to do this in 4 steps:
+ *   - set PR_SET_KEEPCAPS to preserve caps across the final setuid()
+ *   - set the effective and permitted caps;
+ *   - switch euid to non-zero
+ *   - set the effective and permitted caps again
+ * after which the caller can safely call setuid(). We don't do this if the
+ * current euid is not zero or if the target uid is zero. Returns >=0 on
+ * success, negative on failure. Alerts or warnings may be emitted.
+ */
+int prepare_caps_for_setuid(int from_uid, int to_uid)
+{
+	struct __user_cap_data_struct cap_data = { };
+	struct __user_cap_header_struct cap_hdr = {
+		.pid = 0, /* current process */
+		.version = _LINUX_CAPABILITY_VERSION_1,
+	};
+
+	if (from_uid != 0)
+		return 0;
+
+	if (!to_uid)
+		return 0;
+
+	if (!caplist)
+		return 0;
+
+	if (prctl(PR_SET_KEEPCAPS, 1) == -1) {
+		ha_alert("Failed to preserve capabilities using prctl(): %s\n", strerror(errno));
+		return -1;
+	}
+
+	cap_data.effective = cap_data.permitted = caplist | (1 << CAP_SETUID);
+	if (capset(&cap_hdr, &cap_data) == -1) {
+		ha_alert("Failed to preset the capabilities to preserve using capset(): %s\n", strerror(errno));
+		return -1;
+	}
+
+	if (seteuid(to_uid) == -1) {
+		ha_alert("Failed to set effective uid to %d: %s\n", to_uid, strerror(errno));
+		return -1;
+	}
+
+	cap_data.effective = cap_data.permitted = caplist | (1 << CAP_SETUID);
+	if (capset(&cap_hdr, &cap_data) == -1) {
+		ha_alert("Failed to set the final capabilities using capset(): %s\n", strerror(errno));
+		return -1;
+	}
+	/* all's good */
+	return 0;
+}
+
+/* finalize the capabilities after setuid(). The most important part is to drop
+ * the CAP_SETUID capability, which would otherwise allow switching back to any
+ * UID and recovering everything.
+ */
+int finalize_caps_after_setuid(int from_uid, int to_uid)
+{
+	struct __user_cap_data_struct cap_data = { };
+	struct __user_cap_header_struct cap_hdr = {
+		.pid = 0, /* current process */
+		.version = _LINUX_CAPABILITY_VERSION_1,
+	};
+
+	if (from_uid != 0)
+		return 0;
+
+	if (!to_uid)
+		return 0;
+
+	if (!caplist)
+		return 0;
+
+	cap_data.effective = cap_data.permitted = caplist;
+	if (capset(&cap_hdr, &cap_data) == -1) {
+		ha_alert("Failed to drop the setuid capability using capset(): %s\n", strerror(errno));
+		return -1;
+	}
+	/* all's good */
+	return 0;
+}
+
+/* parse the "setcap" global keyword. Returns -1 on failure, 0 on success. */
+static int cfg_parse_global_setcap(char **args, int section_type,
+                                   struct proxy *curpx, const struct proxy *defpx,
+                                   const char *file, int line, char **err)
+{
+	char *name = args[1];
+	char *next;
+	uint32_t caps = 0;
+	int id;
+
+	if (!*name) {
+		memprintf(err, "'%s' : missing capability name(s). ", args[0]);
+		goto dump_caps;
+	}
+
+	while (name && *name) {
+		next = strchr(name, ',');
+		if (next)
+			*(next++) = '\0';
+
+		for (id = 0; known_caps[id].cap; id++) {
+			if (strcmp(name, known_caps[id].name) == 0) {
+				caps |= 1U << known_caps[id].cap;
+				break;
+			}
+		}
+
+		if (!known_caps[id].cap) {
+			memprintf(err, "'%s' : unsupported capability '%s'. ", args[0], name);
+			goto dump_caps;
+		}
+		name = next;
+	}
+
+	caplist |= caps;
+	return 0;
+
+
+ dump_caps:
+	memprintf(err, "%s Supported ones are: ", *err);
+
+	for (id = 0; known_caps[id].cap; id++)
+		memprintf(err, "%s%s%s%s", *err,
+		          id ? known_caps[id+1].cap ? ", " : " and " : "",
+		          known_caps[id].name, known_caps[id+1].cap ?
"" : "."); + return -1; +} + +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_GLOBAL, "setcap", cfg_parse_global_setcap }, + { 0, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); diff --git a/src/listener.c b/src/listener.c new file mode 100644 index 0000000..86d0945 --- /dev/null +++ b/src/listener.c @@ -0,0 +1,2487 @@ +/* + * Listener management functions. + * + * Copyright 2000-2013 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <ctype.h> +#include <errno.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include <haproxy/acl.h> +#include <haproxy/api.h> +#include <haproxy/activity.h> +#include <haproxy/cfgparse.h> +#include <haproxy/cli-t.h> +#include <haproxy/connection.h> +#include <haproxy/errors.h> +#include <haproxy/fd.h> +#include <haproxy/freq_ctr.h> +#include <haproxy/frontend.h> +#include <haproxy/global.h> +#include <haproxy/list.h> +#include <haproxy/listener.h> +#include <haproxy/log.h> +#include <haproxy/protocol.h> +#include <haproxy/proxy.h> +#include <haproxy/quic_tp.h> +#include <haproxy/sample.h> +#include <haproxy/stream.h> +#include <haproxy/task.h> +#include <haproxy/ticks.h> +#include <haproxy/tools.h> + + +/* List head of all known bind keywords */ +struct bind_kw_list bind_keywords = { + .list = LIST_HEAD_INIT(bind_keywords.list) +}; + +/* list of the temporarily limited listeners because of lack of resource */ +static struct mt_list global_listener_queue = MT_LIST_HEAD_INIT(global_listener_queue); +static struct task *global_listener_queue_task; +/* number of times an accepted connection resulted in maxconn being reached */ +ullong maxconn_reached = 0; +__decl_thread(static HA_RWLOCK_T global_listener_rwlock); + +/* listener status for stats */ +const char* li_status_st[LI_STATE_COUNT] = { + [LI_STATUS_WAITING] = "WAITING", + [LI_STATUS_OPEN] = "OPEN", + [LI_STATUS_FULL] = "FULL", +}; + +#if defined(USE_THREAD) + +struct accept_queue_ring accept_queue_rings[MAX_THREADS] __attribute__((aligned(64))) = { }; + +/* dequeue and process a pending connection from the local accept queue (single + * consumer). Returns the accepted connection or NULL if none was found. + */ +struct connection *accept_queue_pop_sc(struct accept_queue_ring *ring) +{ + unsigned int pos, next; + struct connection *ptr; + struct connection **e; + uint32_t idx = _HA_ATOMIC_LOAD(&ring->idx); /* (head << 16) + tail */ + + pos = idx >> 16; + if (pos == (uint16_t)idx) + return NULL; + + next = pos + 1; + if (next >= ACCEPT_QUEUE_SIZE) + next = 0; + + e = &ring->entry[pos]; + + /* wait for the producer to update the listener's pointer */ + while (1) { + ptr = *e; + __ha_barrier_load(); + if (ptr) + break; + pl_cpu_relax(); + } + + /* release the entry */ + *e = NULL; + + __ha_barrier_store(); + do { + pos = (next << 16) | (idx & 0xffff); + } while (unlikely(!HA_ATOMIC_CAS(&ring->idx, &idx, pos) && __ha_cpu_relax())); + + return ptr; +} + + +/* tries to push a new accepted connection <conn> into ring <ring>. Returns + * non-zero if it succeeds, or zero if the ring is full. Supports multiple + * producers. 
+ */
+int accept_queue_push_mp(struct accept_queue_ring *ring, struct connection *conn)
+{
+	unsigned int pos, next;
+	uint32_t idx = _HA_ATOMIC_LOAD(&ring->idx);  /* (head << 16) + tail */
+
+	do {
+		pos = (uint16_t)idx;
+		next = pos + 1;
+		if (next >= ACCEPT_QUEUE_SIZE)
+			next = 0;
+		if (next == (idx >> 16))
+			return 0; // ring full
+		next |= (idx & 0xffff0000U);
+	} while (unlikely(!_HA_ATOMIC_CAS(&ring->idx, &idx, next) && __ha_cpu_relax()));
+
+	ring->entry[pos] = conn;
+	__ha_barrier_store();
+	return 1;
+}
+
+/* proceed with accepting new connections. Don't mark it static so that it appears
+ * in task dumps.
+ */
+struct task *accept_queue_process(struct task *t, void *context, unsigned int state)
+{
+	struct accept_queue_ring *ring = context;
+	struct connection *conn;
+	struct listener *li;
+	unsigned int max_accept;
+	int ret;
+
+	/* if global.tune.maxaccept is -1, then max_accept is UINT_MAX. It
+	 * is not really unlimited, but it is probably enough.
+	 */
+	max_accept = global.tune.maxaccept ? global.tune.maxaccept : MAX_ACCEPT;
+	for (; max_accept; max_accept--) {
+		conn = accept_queue_pop_sc(ring);
+		if (!conn)
+			break;
+
+		li = __objt_listener(conn->target);
+		_HA_ATOMIC_INC(&li->thr_conn[ti->ltid]);
+		ret = li->bind_conf->accept(conn);
+		if (ret <= 0) {
+			/* connection was terminated by the application */
+			continue;
+		}
+
+		/* increase the per-process number of cumulated sessions, this
+		 * may only be done once l->bind_conf->accept() has accepted the
+		 * connection.
+		 */
+		if (!(li->bind_conf->options & BC_O_UNLIMITED)) {
+			HA_ATOMIC_UPDATE_MAX(&global.sps_max,
+			                     update_freq_ctr(&global.sess_per_sec, 1));
+			if (li->bind_conf->options & BC_O_USE_SSL) {
+				HA_ATOMIC_UPDATE_MAX(&global.ssl_max,
+				                     update_freq_ctr(&global.ssl_per_sec, 1));
+			}
+		}
+	}
+
+	/* ran out of budget? Let's come back here ASAP */
+	if (!max_accept)
+		tasklet_wakeup(ring->tasklet);
+
+	return NULL;
+}
+
+/* Initializes the accept-queues. Returns 0 on success, otherwise ERR_* flags */
+static int accept_queue_init()
+{
+	struct tasklet *t;
+	int i;
+
+	for (i = 0; i < global.nbthread; i++) {
+		t = tasklet_new();
+		if (!t) {
+			ha_alert("Out of memory while initializing accept queue for thread %d\n", i);
+			return ERR_FATAL|ERR_ABORT;
+		}
+		t->tid = i;
+		t->process = accept_queue_process;
+		t->context = &accept_queue_rings[i];
+		accept_queue_rings[i].tasklet = t;
+	}
+	return 0;
+}
+
+REGISTER_CONFIG_POSTPARSER("multi-threaded accept queue", accept_queue_init);
+
+static void accept_queue_deinit()
+{
+	int i;
+
+	for (i = 0; i < global.nbthread; i++) {
+		tasklet_free(accept_queue_rings[i].tasklet);
+	}
+}
+
+REGISTER_POST_DEINIT(accept_queue_deinit);
+
+#endif // USE_THREAD
+
+/* Memory allocation and initialization of the per_thr field (one entry per
+ * bound thread).
+ * Returns 0 if the field has been successfully initialized, -1 on failure.
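+ *
+ * Editor's note: a minimal caller sketch (illustrative only; the error
+ * handling shown is hypothetical, not taken from the upstream sources):
+ *
+ *	if (li_init_per_thr(l) < 0) {
+ *		ha_alert("out of memory while initializing listener\n");
+ *		return ERR_FATAL | ERR_ABORT;
+ *	}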
+ */ +int li_init_per_thr(struct listener *li) +{ + int nbthr = MIN(global.nbthread, MAX_THREADS_PER_GROUP); + int i; + + /* allocate per-thread elements for listener */ + li->per_thr = calloc(nbthr, sizeof(*li->per_thr)); + if (!li->per_thr) + return -1; + + for (i = 0; i < nbthr; ++i) { + MT_LIST_INIT(&li->per_thr[i].quic_accept.list); + MT_LIST_INIT(&li->per_thr[i].quic_accept.conns); + + li->per_thr[i].li = li; + } + + return 0; +} + +/* helper to get listener status for stats */ +enum li_status get_li_status(struct listener *l) +{ + if (!l->bind_conf->maxconn || l->nbconn < l->bind_conf->maxconn) { + if (l->state == LI_LIMITED) + return LI_STATUS_WAITING; + else + return LI_STATUS_OPEN; + } + return LI_STATUS_FULL; +} + +/* adjust the listener's state and its proxy's listener counters if needed. + * It must be called under the listener's lock, but uses atomic ops to change + * the proxy's counters so that the proxy lock is not needed. + */ +void listener_set_state(struct listener *l, enum li_state st) +{ + struct proxy *px = l->bind_conf->frontend; + + if (px) { + /* from state */ + switch (l->state) { + case LI_NEW: /* first call */ + _HA_ATOMIC_INC(&px->li_all); + break; + case LI_INIT: + case LI_ASSIGNED: + break; + case LI_PAUSED: + _HA_ATOMIC_DEC(&px->li_paused); + break; + case LI_LISTEN: + _HA_ATOMIC_DEC(&px->li_bound); + break; + case LI_READY: + case LI_FULL: + case LI_LIMITED: + _HA_ATOMIC_DEC(&px->li_ready); + break; + } + + /* to state */ + switch (st) { + case LI_NEW: + case LI_INIT: + case LI_ASSIGNED: + break; + case LI_PAUSED: + BUG_ON(l->rx.fd == -1); + _HA_ATOMIC_INC(&px->li_paused); + break; + case LI_LISTEN: + BUG_ON(l->rx.fd == -1 && !l->rx.rhttp.task); + _HA_ATOMIC_INC(&px->li_bound); + break; + case LI_READY: + case LI_FULL: + case LI_LIMITED: + BUG_ON(l->rx.fd == -1 && !l->rx.rhttp.task); + _HA_ATOMIC_INC(&px->li_ready); + l->flags |= LI_F_FINALIZED; + break; + } + } + l->state = st; +} + +/* This function adds the specified listener's file descriptor to the polling + * lists if it is in the LI_LISTEN state. The listener enters LI_READY or + * LI_FULL state depending on its number of connections. In daemon mode, we + * also support binding only the relevant processes to their respective + * listeners. We don't do that in debug mode however. + */ +void enable_listener(struct listener *listener) +{ + HA_RWLOCK_WRLOCK(LISTENER_LOCK, &listener->lock); + + /* If this listener is supposed to be only in the master, close it in + * the workers. Conversely, if it's supposed to be only in the workers + * close it in the master. + */ + if (!!master != !!(listener->rx.flags & RX_F_MWORKER)) + do_unbind_listener(listener); + + if (listener->state == LI_LISTEN) { + BUG_ON(listener->rx.fd == -1 && !listener->rx.rhttp.task); + if ((global.mode & (MODE_DAEMON | MODE_MWORKER)) && + (!!master != !!(listener->rx.flags & RX_F_MWORKER))) { + /* we don't want to enable this listener and don't + * want any fd event to reach it. + */ + do_unbind_listener(listener); + } + else if (!listener->bind_conf->maxconn || listener->nbconn < listener->bind_conf->maxconn) { + listener->rx.proto->enable(listener); + listener_set_state(listener, LI_READY); + } + else { + listener_set_state(listener, LI_FULL); + } + } + + HA_RWLOCK_WRUNLOCK(LISTENER_LOCK, &listener->lock); +} + +/* + * This function completely stops a listener. + * The proxy's listeners count is updated and the proxy is + * disabled and woken up after the last one is gone. 
+ * It will need to operate under the proxy's lock, the protocol's lock and
+ * the listener's lock. The caller is responsible for indicating in lpx,
+ * lpr, lli whether the respective locks are already held (non-zero) or
+ * not (zero) so that the function picks the missing ones, in this order.
+ */
+void stop_listener(struct listener *l, int lpx, int lpr, int lli)
+{
+	struct proxy *px = l->bind_conf->frontend;
+
+	if (l->bind_conf->options & BC_O_NOSTOP) {
+		/* master-worker sockpairs are never closed but don't count as a
+		 * job.
+		 */
+		return;
+	}
+
+	if (!lpx && px)
+		HA_RWLOCK_WRLOCK(PROXY_LOCK, &px->lock);
+
+	if (!lpr)
+		HA_SPIN_LOCK(PROTO_LOCK, &proto_lock);
+
+	if (!lli)
+		HA_RWLOCK_WRLOCK(LISTENER_LOCK, &l->lock);
+
+	if (l->state > LI_INIT) {
+		do_unbind_listener(l);
+
+		if (l->state >= LI_ASSIGNED)
+			__delete_listener(l);
+
+		if (px)
+			proxy_cond_disable(px);
+	}
+
+	if (!lli)
+		HA_RWLOCK_WRUNLOCK(LISTENER_LOCK, &l->lock);
+
+	if (!lpr)
+		HA_SPIN_UNLOCK(PROTO_LOCK, &proto_lock);
+
+	if (!lpx && px)
+		HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &px->lock);
+}
+
+/* This function adds the specified <listener> to the protocol <proto>. It
+ * does nothing if the protocol was already added. The listener's state is
+ * automatically updated from LI_INIT to LI_ASSIGNED. The number of listeners
+ * for the protocol is updated. This must be called with the proto lock held.
+ */
+void default_add_listener(struct protocol *proto, struct listener *listener)
+{
+	if (listener->state != LI_INIT)
+		return;
+	listener_set_state(listener, LI_ASSIGNED);
+	listener->rx.proto = proto;
+	LIST_APPEND(&proto->receivers, &listener->rx.proto_list);
+	proto->nb_receivers++;
+}
+
+/* default function called to suspend a listener: it simply passes the call to
+ * the underlying receiver. This is fine for most socket-based protocols. This
+ * must be called under the listener's lock. It will return < 0 in case of
+ * failure, 0 if the listener was totally stopped, or > 0 if correctly paused.
+ * If no receiver-level suspend is provided, the operation is assumed
+ * to succeed.
+ */
+int default_suspend_listener(struct listener *l)
+{
+	if (!l->rx.proto->rx_suspend)
+		return 1;
+
+	return l->rx.proto->rx_suspend(&l->rx);
+}
+
+
+/* Tries to resume a suspended listener, and returns non-zero on success or
+ * zero on failure. On certain errors, an alert or a warning might be displayed.
+ * It must be called with the listener's lock held. Depending on the listener's
+ * state and protocol, a listen() call might be used to resume operations, or a
+ * call to the receiver's resume() function might be used as well. This is
+ * suitable as a default function for TCP and UDP.
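+ *
+ * Editor's note (summary of the code below, not part of the upstream
+ * sources): the two resume paths are
+ *
+ *	LI_ASSIGNED: fam->bind() then proto->listen()   (full rebind)
+ *	LI_PAUSED  : proto->rx_resume()                 (simple unpause)
+ *
+ * and any state still below LI_PAUSED after the rebind attempt is reported
+ * as a failure (ret = 0).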
+ */
+int default_resume_listener(struct listener *l)
+{
+	int ret = 1;
+
+	if (l->state == LI_ASSIGNED) {
+		char msg[100];
+		char *errmsg;
+		int err;
+
+		/* first, try to bind the receiver */
+		err = l->rx.proto->fam->bind(&l->rx, &errmsg);
+		if (err != ERR_NONE) {
+			if (err & ERR_WARN)
+				ha_warning("Resuming listener: %s\n", errmsg);
+			else if (err & ERR_ALERT)
+				ha_alert("Resuming listener: %s\n", errmsg);
+			ha_free(&errmsg);
+			if (err & (ERR_FATAL | ERR_ABORT)) {
+				ret = 0;
+				goto end;
+			}
+		}
+
+		/* then, try to listen:
+		 * for now there's still always a listening function
+		 * (same check performed in protocol_bind_all())
+		 */
+		BUG_ON(!l->rx.proto->listen);
+		err = l->rx.proto->listen(l, msg, sizeof(msg));
+		if (err & ERR_ALERT)
+			ha_alert("Resuming listener: %s\n", msg);
+		else if (err & ERR_WARN)
+			ha_warning("Resuming listener: %s\n", msg);
+
+		if (err & (ERR_FATAL | ERR_ABORT)) {
+			ret = 0;
+			goto end;
+		}
+	}
+
+	if (l->state < LI_PAUSED) {
+		ret = 0;
+		goto end;
+	}
+
+	if (l->state == LI_PAUSED && l->rx.proto->rx_resume &&
+	    l->rx.proto->rx_resume(&l->rx) <= 0)
+		ret = 0;
+  end:
+	return ret;
+}
+
+
+/* This function tries to temporarily disable a listener, depending on the OS
+ * capabilities. Linux unbinds the listen socket after a SHUT_RD, and ignores
+ * SHUT_WR. Solaris refuses either shutdown(). OpenBSD ignores SHUT_RD but
+ * closes upon SHUT_WR and refuses to rebind. So a common validation path
+ * involves SHUT_WR && listen && SHUT_RD. In case of success, the FD's polling
+ * is disabled. It normally returns non-zero, unless an error is reported.
+ * suspend() may totally stop a listener if it doesn't support the PAUSED
+ * state, in which case state will be set to ASSIGNED.
+ * It will need to operate under the proxy's lock and the listener's lock.
+ * The caller is responsible for indicating in lpx, lli whether the respective
+ * locks are already held (non-zero) or not (zero) so that the function picks
+ * the missing ones, in this order.
+ */
+int suspend_listener(struct listener *l, int lpx, int lli)
+{
+	struct proxy *px = l->bind_conf->frontend;
+	int ret = 1;
+
+	if (!lpx && px)
+		HA_RWLOCK_WRLOCK(PROXY_LOCK, &px->lock);
+
+	if (!lli)
+		HA_RWLOCK_WRLOCK(LISTENER_LOCK, &l->lock);
+
+	if (!(l->flags & LI_F_FINALIZED) || l->state <= LI_PAUSED)
+		goto end;
+
+	if (l->rx.proto->suspend) {
+		ret = l->rx.proto->suspend(l);
+		/* if the suspend() fails, we don't want to change the
+		 * current listener state
+		 */
+		if (ret < 0)
+			goto end;
+	}
+
+	MT_LIST_DELETE(&l->wait_queue);
+
+	/* ret == 0 means that the suspend() has been turned into
+	 * an unbind(), meaning the listener is now stopped (ie: ABNS), we need
+	 * to report this state change properly
+	 */
+	listener_set_state(l, ((ret) ? LI_PAUSED : LI_ASSIGNED));
+
+	if (px && !(l->flags & LI_F_SUSPENDED))
+		px->li_suspended++;
+	l->flags |= LI_F_SUSPENDED;
+
+	/* at this point, everything is under control, no error should be
+	 * returned to calling function
+	 */
+	ret = 1;
+
+	if (px && !(px->flags & PR_FL_PAUSED) && !px->li_ready) {
+		/* PROXY_LOCK is required */
+		proxy_cond_pause(px);
+		ha_warning("Paused %s %s.\n", proxy_cap_str(px->cap), px->id);
+		send_log(px, LOG_WARNING, "Paused %s %s.\n", proxy_cap_str(px->cap), px->id);
+	}
+  end:
+	if (!lli)
+		HA_RWLOCK_WRUNLOCK(LISTENER_LOCK, &l->lock);
+
+	if (!lpx && px)
+		HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &px->lock);
+
+	return ret;
+}
+
+/* This function tries to resume a temporarily disabled listener.
+ * Paused, full, limited and disabled listeners are handled, which means that
+ * this function may replace enable_listener(). The resulting state will either
+ * be LI_READY or LI_FULL. 0 is returned in case of failure to resume (eg: dead
+ * socket).
+ * Listeners bound to a different process are not woken up unless we're in
+ * foreground mode, and are ignored. If the listener was only in the assigned
+ * state, it's totally rebound. This can happen if a suspend() has completely
+ * stopped it. If the resume fails, 0 is returned and an error might be
+ * displayed.
+ * It will need to operate under the proxy's lock and the listener's lock.
+ * The caller is responsible for indicating in lpx, lli whether the respective
+ * locks are already held (non-zero) or not (zero) so that the function picks
+ * the missing ones, in this order.
+ */
+int resume_listener(struct listener *l, int lpx, int lli)
+{
+	struct proxy *px = l->bind_conf->frontend;
+	int ret = 1;
+
+	if (!lpx && px)
+		HA_RWLOCK_WRLOCK(PROXY_LOCK, &px->lock);
+
+	if (!lli)
+		HA_RWLOCK_WRLOCK(LISTENER_LOCK, &l->lock);
+
+	/* check that another thread didn't do the job in parallel (e.g. at the
+	 * end of listener_accept() while we'd come from dequeue_all_listeners()).
+	 */
+	if (MT_LIST_INLIST(&l->wait_queue))
+		goto end;
+
+	if (!(l->flags & LI_F_FINALIZED) || l->state == LI_READY)
+		goto end;
+
+	if (l->rx.proto->resume) {
+		ret = l->rx.proto->resume(l);
+		if (!ret)
+			goto end; /* failure to resume */
+	}
+
+	if (l->bind_conf->maxconn && l->nbconn >= l->bind_conf->maxconn) {
+		l->rx.proto->disable(l);
+		listener_set_state(l, LI_FULL);
+		goto done;
+	}
+
+	l->rx.proto->enable(l);
+	listener_set_state(l, LI_READY);
+
+  done:
+	if (px && (l->flags & LI_F_SUSPENDED))
+		px->li_suspended--;
+	l->flags &= ~LI_F_SUSPENDED;
+
+	if (px && (px->flags & PR_FL_PAUSED) && !px->li_suspended) {
+		/* PROXY_LOCK is required */
+		proxy_cond_resume(px);
+		ha_warning("Resumed %s %s.\n", proxy_cap_str(px->cap), px->id);
+		send_log(px, LOG_WARNING, "Resumed %s %s.\n", proxy_cap_str(px->cap), px->id);
+	}
+  end:
+	if (!lli)
+		HA_RWLOCK_WRUNLOCK(LISTENER_LOCK, &l->lock);
+
+	if (!lpx && px)
+		HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &px->lock);
+
+	return ret;
+}
+
+/* Same as resume_listener(), but will only work to resume from
+ * LI_FULL or LI_LIMITED states because we try to relax listeners that
+ * were temporarily restricted and not to resume inactive listeners that
+ * may have been paused or completely stopped in the meantime.
+ * Returns positive value for success and 0 for failure.
+ * It will need to operate under the proxy's lock and the listener's lock.
+ * The caller is responsible for indicating in lpx, lli whether the respective
+ * locks are already held (non-zero) or not (zero) so that the function picks
+ * the missing ones, in this order.
+ */
+int relax_listener(struct listener *l, int lpx, int lli)
+{
+	struct proxy *px = l->bind_conf->frontend;
+	int ret = 1;
+
+	if (!lpx && px)
+		HA_RWLOCK_WRLOCK(PROXY_LOCK, &px->lock);
+
+	if (!lli)
+		HA_RWLOCK_WRLOCK(LISTENER_LOCK, &l->lock);
+
+	if (l->state != LI_FULL && l->state != LI_LIMITED)
+		goto end; /* listener may be suspended or even stopped */
+	ret = resume_listener(l, 1, 1);
+
+  end:
+	if (!lli)
+		HA_RWLOCK_WRUNLOCK(LISTENER_LOCK, &l->lock);
+
+	if (!lpx && px)
+		HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &px->lock);
+
+	return ret;
+}
+
+/* Marks a ready listener as full so that the stream code tries to re-enable
+ * it upon next close() using relax_listener().
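+ *
+ * Editor's note: the typical cycle, as a sketch (illustrative only, not part
+ * of the upstream sources):
+ *
+ *	listener_full(l);         // LI_READY -> LI_FULL, polling disabled
+ *	...                       // a connection ends, freeing a slot
+ *	relax_listener(l, 0, 0);  // LI_FULL -> LI_READY, polling re-enabled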
+ */
+static void listener_full(struct listener *l)
+{
+	HA_RWLOCK_WRLOCK(LISTENER_LOCK, &l->lock);
+	if (l->state >= LI_READY) {
+		MT_LIST_DELETE(&l->wait_queue);
+		if (l->state != LI_FULL) {
+			l->rx.proto->disable(l);
+			listener_set_state(l, LI_FULL);
+		}
+	}
+	HA_RWLOCK_WRUNLOCK(LISTENER_LOCK, &l->lock);
+}
+
+/* Marks a ready listener as limited so that we only try to re-enable it when
+ * resources are free again. It will be queued into the specified queue.
+ */
+static void limit_listener(struct listener *l, struct mt_list *list)
+{
+	HA_RWLOCK_WRLOCK(LISTENER_LOCK, &l->lock);
+	if (l->state == LI_READY) {
+		MT_LIST_TRY_APPEND(list, &l->wait_queue);
+		l->rx.proto->disable(l);
+		listener_set_state(l, LI_LIMITED);
+	}
+	HA_RWLOCK_WRUNLOCK(LISTENER_LOCK, &l->lock);
+}
+
+/* Dequeues all listeners waiting for a resource in the global wait queue */
+void dequeue_all_listeners()
+{
+	struct listener *listener;
+
+	while ((listener = MT_LIST_POP(&global_listener_queue, struct listener *, wait_queue))) {
+		/* This cannot fail because the listeners are by definition in
+		 * the LI_LIMITED state.
+		 */
+		relax_listener(listener, 0, 0);
+	}
+}
+
+/* Dequeues all listeners waiting for a resource in proxy <px>'s queue */
+void dequeue_proxy_listeners(struct proxy *px)
+{
+	struct listener *listener;
+
+	while ((listener = MT_LIST_POP(&px->listener_queue, struct listener *, wait_queue))) {
+		/* This cannot fail because the listeners are by definition in
+		 * the LI_LIMITED state.
+		 */
+		relax_listener(listener, 0, 0);
+	}
+}
+
+
+/* default function used to unbind a listener. This is for use by standard
+ * protocols working on top of accepted sockets. The receiver's rx_unbind()
+ * will automatically be used after the listener is disabled if the socket is
+ * still bound. This must be used under the listener's lock.
+ */
+void default_unbind_listener(struct listener *listener)
+{
+	if (listener->state <= LI_ASSIGNED)
+		goto out_close;
+
+	if (listener->rx.fd == -1) {
+		listener_set_state(listener, LI_ASSIGNED);
+		goto out_close;
+	}
+
+	if (listener->state >= LI_READY) {
+		listener->rx.proto->disable(listener);
+		if (listener->rx.flags & RX_F_BOUND)
+			listener_set_state(listener, LI_LISTEN);
+	}
+
+ out_close:
+	if (listener->rx.flags & RX_F_BOUND)
+		listener->rx.proto->rx_unbind(&listener->rx);
+}
+
+/* This function closes the listening socket for the specified listener,
+ * provided that it's already in a listening state. The protocol's unbind()
+ * is called to put the listener into LI_ASSIGNED or LI_LISTEN and handle
+ * the unbinding tasks. The listener then enters the LI_ASSIGNED state if
+ * the receiver is unbound. Must be called with the lock held.
+ */
+void do_unbind_listener(struct listener *listener)
+{
+	MT_LIST_DELETE(&listener->wait_queue);
+
+	if (listener->rx.proto->unbind)
+		listener->rx.proto->unbind(listener);
+
+	/* we may have to downgrade the listener if the rx was closed */
+	if (!(listener->rx.flags & RX_F_BOUND) && listener->state > LI_ASSIGNED)
+		listener_set_state(listener, LI_ASSIGNED);
+}
+
+/* This function closes the listening socket for the specified listener,
+ * provided that it's already in a listening state. The listener enters the
+ * LI_ASSIGNED state, except if the FD is not closed, in which case it may
+ * remain in LI_LISTEN. This function is intended to be used as a generic
+ * function for standard protocols.
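+ *
+ * Editor's note (summary of the outcomes, not part of the upstream sources):
+ *
+ *	receiver unbound (FD closed)  -> LI_ASSIGNED
+ *	FD still bound                -> LI_LISTEN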
+ */ +void unbind_listener(struct listener *listener) +{ + HA_RWLOCK_WRLOCK(LISTENER_LOCK, &listener->lock); + do_unbind_listener(listener); + HA_RWLOCK_WRUNLOCK(LISTENER_LOCK, &listener->lock); +} + +/* creates one or multiple listeners for bind_conf <bc> on sockaddr <ss> on port + * range <portl> to <porth>, and possibly attached to fd <fd> (or -1 for auto + * allocation). The address family is taken from ss->ss_family, and the protocol + * passed in <proto> must be usable on this family. The protocol's default iocb + * is automatically preset as the receivers' iocb. The number of jobs and + * listeners is automatically increased by the number of listeners created. It + * returns non-zero on success, zero on error with the error message set in <err>. + */ +int create_listeners(struct bind_conf *bc, const struct sockaddr_storage *ss, + int portl, int porth, int fd, struct protocol *proto, char **err) +{ + struct listener *l; + int port; + + for (port = portl; port <= porth; port++) { + l = calloc(1, sizeof(*l)); + if (!l) { + memprintf(err, "out of memory"); + return 0; + } + l->obj_type = OBJ_TYPE_LISTENER; + LIST_APPEND(&bc->frontend->conf.listeners, &l->by_fe); + LIST_APPEND(&bc->listeners, &l->by_bind); + l->bind_conf = bc; + l->rx.settings = &bc->settings; + l->rx.owner = l; + l->rx.iocb = proto->default_iocb; + l->rx.fd = fd; + + l->rx.rhttp.task = NULL; + l->rx.rhttp.srv = NULL; + l->rx.rhttp.pend_conn = NULL; + + memcpy(&l->rx.addr, ss, sizeof(*ss)); + if (proto->fam->set_port) + proto->fam->set_port(&l->rx.addr, port); + + MT_LIST_INIT(&l->wait_queue); + listener_set_state(l, LI_INIT); + + proto->add(proto, l); + + if (fd != -1) + l->rx.flags |= RX_F_INHERITED; + + l->extra_counters = NULL; + + HA_RWLOCK_INIT(&l->lock); + _HA_ATOMIC_INC(&jobs); + _HA_ATOMIC_INC(&listeners); + } + return 1; +} + +/* Optionally allocates a new shard info (if si == NULL) for receiver rx and + * assigns it to it, or attaches to an existing one. If the rx already had a + * shard_info, it is simply returned. It is illegal to call this function with + * an rx that's part of a group that is already attached. Attaching means the + * shard_info's thread count and group count are updated so the rx's group is + * added to the shard_info's group mask. The rx are added to the members in the + * attachment order, though it must not matter. It is meant for boot time setup + * and is not thread safe. NULL is returned on allocation failure. + */ +struct shard_info *shard_info_attach(struct receiver *rx, struct shard_info *si) +{ + if (rx->shard_info) + return rx->shard_info; + + if (!si) { + si = calloc(1, sizeof(*si)); + if (!si) + return NULL; + + si->ref = rx; + } + + rx->shard_info = si; + BUG_ON (si->tgroup_mask & 1UL << (rx->bind_tgroup - 1)); + si->tgroup_mask |= 1UL << (rx->bind_tgroup - 1); + si->nbgroups = my_popcountl(si->tgroup_mask); + si->nbthreads += my_popcountl(rx->bind_thread); + si->members[si->nbgroups - 1] = rx; + return si; +} + +/* Detaches the rx from an optional shard_info it may be attached to. If so, + * the thread counts, group masks and refcounts are updated. The members list + * remains contiguous by replacing the current entry with the last one. The + * reference continues to point to the first receiver. If the group count + * reaches zero, the shard_info is automatically released. 
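+ *
+ * Editor's note: an attach/detach pairing sketch (illustrative only, not part
+ * of the upstream sources; assumes rx1 and rx2 are bound to two different
+ * thread groups):
+ *
+ *	si = shard_info_attach(rx1, NULL);	// allocates si, ref = rx1
+ *	shard_info_attach(rx2, si);		// joins: nbgroups = 2
+ *	shard_info_detach(rx2);			// leaves: nbgroups = 1
+ *	shard_info_detach(rx1);			// last member: si is freed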
+ */ +void shard_info_detach(struct receiver *rx) +{ + struct shard_info *si = rx->shard_info; + uint gr; + + if (!si) + return; + + rx->shard_info = NULL; + + /* find the member slot this rx was attached to */ + for (gr = 0; gr < MAX_TGROUPS && si->members[gr] != rx; gr++) + ; + + BUG_ON(gr == MAX_TGROUPS); + + si->nbthreads -= my_popcountl(rx->bind_thread); + si->tgroup_mask &= ~(1UL << (rx->bind_tgroup - 1)); + si->nbgroups = my_popcountl(si->tgroup_mask); + + /* replace the member by the last one. If we removed the reference, we + * have to switch to another one. It's always the first entry so we can + * simply enforce it upon every removal. + */ + si->members[gr] = si->members[si->nbgroups]; + si->members[si->nbgroups] = NULL; + si->ref = si->members[0]; + + if (!si->nbgroups) + free(si); +} + +/* clones listener <src> and returns the new one. All dynamically allocated + * fields are reallocated (name for now). The new listener is inserted before + * the original one in the bind_conf and frontend lists. This allows it to be + * duplicated while iterating over the current list. The original listener must + * only be in the INIT or ASSIGNED states, and the new listener will only be + * placed into the INIT state. The counters are always set to NULL. Maxsock is + * updated. Returns NULL on allocation error. The shard_info is never taken so + * that the caller can decide what to do with it depending on how it intends to + * clone the listener. + */ +struct listener *clone_listener(struct listener *src) +{ + struct listener *l; + + l = calloc(1, sizeof(*l)); + if (!l) + goto oom1; + memcpy(l, src, sizeof(*l)); + + if (l->name) { + l->name = strdup(l->name); + if (!l->name) + goto oom2; + } + + l->rx.owner = l; + l->rx.shard_info = NULL; + l->state = LI_INIT; + l->counters = NULL; + l->extra_counters = NULL; + + LIST_APPEND(&src->by_fe, &l->by_fe); + LIST_APPEND(&src->by_bind, &l->by_bind); + + MT_LIST_INIT(&l->wait_queue); + + l->rx.proto->add(l->rx.proto, l); + + HA_RWLOCK_INIT(&l->lock); + _HA_ATOMIC_INC(&jobs); + _HA_ATOMIC_INC(&listeners); + global.maxsock++; + return l; + + oom2: + free(l); + oom1: + return NULL; +} + +/* Delete a listener from its protocol's list of listeners. The listener's + * state is automatically updated from LI_ASSIGNED to LI_INIT. The protocol's + * number of listeners is updated, as well as the global number of listeners + * and jobs. Note that the listener must have previously been unbound. This + * is a low-level function expected to be called with the proto_lock and the + * listener's lock held. + */ +void __delete_listener(struct listener *listener) +{ + if (listener->state == LI_ASSIGNED) { + listener_set_state(listener, LI_INIT); + LIST_DELETE(&listener->rx.proto_list); + shard_info_detach(&listener->rx); + listener->rx.proto->nb_receivers--; + _HA_ATOMIC_DEC(&jobs); + _HA_ATOMIC_DEC(&listeners); + } +} + +/* Delete a listener from its protocol's list of listeners (please check + * __delete_listener() above). The proto_lock and the listener's lock will + * be grabbed in this order. + */ +void delete_listener(struct listener *listener) +{ + HA_SPIN_LOCK(PROTO_LOCK, &proto_lock); + HA_RWLOCK_WRLOCK(LISTENER_LOCK, &listener->lock); + __delete_listener(listener); + HA_RWLOCK_WRUNLOCK(LISTENER_LOCK, &listener->lock); + HA_SPIN_UNLOCK(PROTO_LOCK, &proto_lock); +} + +/* Returns a suitable value for a listener's backlog. 
It uses the listener's,
+ * otherwise the frontend's backlog, otherwise the listener's maxconn,
+ * otherwise the frontend's maxconn, otherwise 1024.
+ */
+int listener_backlog(const struct listener *l)
+{
+	if (l->bind_conf->backlog)
+		return l->bind_conf->backlog;
+
+	if (l->bind_conf->frontend->backlog)
+		return l->bind_conf->frontend->backlog;
+
+	if (l->bind_conf->maxconn)
+		return l->bind_conf->maxconn;
+
+	if (l->bind_conf->frontend->maxconn)
+		return l->bind_conf->frontend->maxconn;
+
+	return 1024;
+}
+
+/* Returns true if listener <l> must check maxconn limit prior to accept. */
+static inline int listener_uses_maxconn(const struct listener *l)
+{
+	return !(l->bind_conf->options & (BC_O_UNLIMITED|BC_O_XPRT_MAXCONN));
+}
+
+/* This function is called on a read event from a listening socket, corresponding
+ * to an accept. It tries to accept as many connections as possible, and for each
+ * calls the listener's accept handler (generally the frontend's accept handler).
+ */
+void listener_accept(struct listener *l)
+{
+	struct connection *cli_conn;
+	struct proxy *p;
+	unsigned int max_accept;
+	int next_conn = 0;
+	int next_feconn = 0;
+	int next_actconn = 0;
+	int expire;
+	int ret;
+
+	p = l->bind_conf->frontend;
+
+	/* if l->bind_conf->maxaccept is -1, then max_accept is UINT_MAX. It is
+	 * not really unlimited, but it is probably enough.
+	 */
+	max_accept = l->bind_conf->maxaccept ? l->bind_conf->maxaccept : 1;
+
+	if (!(l->bind_conf->options & BC_O_UNLIMITED) && global.sps_lim) {
+		int max = freq_ctr_remain(&global.sess_per_sec, global.sps_lim, 0);
+
+		if (unlikely(!max)) {
+			/* frontend accept rate limit was reached */
+			expire = tick_add(now_ms, next_event_delay(&global.sess_per_sec, global.sps_lim, 0));
+			goto limit_global;
+		}
+
+		if (max_accept > max)
+			max_accept = max;
+	}
+
+	if (!(l->bind_conf->options & BC_O_UNLIMITED) && global.cps_lim) {
+		int max = freq_ctr_remain(&global.conn_per_sec, global.cps_lim, 0);
+
+		if (unlikely(!max)) {
+			/* frontend accept rate limit was reached */
+			expire = tick_add(now_ms, next_event_delay(&global.conn_per_sec, global.cps_lim, 0));
+			goto limit_global;
+		}
+
+		if (max_accept > max)
+			max_accept = max;
+	}
+#ifdef USE_OPENSSL
+	if (!(l->bind_conf->options & BC_O_UNLIMITED) && global.ssl_lim &&
+	    l->bind_conf && l->bind_conf->options & BC_O_USE_SSL) {
+		int max = freq_ctr_remain(&global.ssl_per_sec, global.ssl_lim, 0);
+
+		if (unlikely(!max)) {
+			/* frontend accept rate limit was reached */
+			expire = tick_add(now_ms, next_event_delay(&global.ssl_per_sec, global.ssl_lim, 0));
+			goto limit_global;
+		}
+
+		if (max_accept > max)
+			max_accept = max;
+	}
+#endif
+	if (p && p->fe_sps_lim) {
+		int max = freq_ctr_remain(&p->fe_sess_per_sec, p->fe_sps_lim, 0);
+
+		if (unlikely(!max)) {
+			/* frontend accept rate limit was reached */
+			expire = tick_add(now_ms, next_event_delay(&p->fe_sess_per_sec, p->fe_sps_lim, 0));
+			goto limit_proxy;
+		}
+
+		if (max_accept > max)
+			max_accept = max;
+	}
+
+	/* Note: if we fail to allocate a connection because of configured
+	 * limits, we'll schedule a new attempt at worst 1 second later. If
+	 * we fail due to system limits or temporary resource shortage, we
+	 * try again 100ms later in the worst case.
+	 */
+	for (; max_accept; next_conn = next_feconn = next_actconn = 0, max_accept--) {
+		unsigned int count;
+		int status;
+		__decl_thread(unsigned long mask);
+
+		/* pre-increase the number of connections without going too far.
+		 * We process the listener, then the proxy, then the process.
+ * We know which ones to unroll based on the next_xxx value. + */ + do { + count = l->nbconn; + if (unlikely(l->bind_conf->maxconn && count >= l->bind_conf->maxconn)) { + /* the listener was marked full or another + * thread is going to do it. + */ + next_conn = 0; + listener_full(l); + goto end; + } + next_conn = count + 1; + } while (!_HA_ATOMIC_CAS(&l->nbconn, (int *)(&count), next_conn)); + + if (p) { + do { + count = p->feconn; + if (unlikely(count >= p->maxconn)) { + /* the frontend was marked full or another + * thread is going to do it. + */ + next_feconn = 0; + expire = TICK_ETERNITY; + goto limit_proxy; + } + next_feconn = count + 1; + } while (!_HA_ATOMIC_CAS(&p->feconn, &count, next_feconn)); + } + + if (listener_uses_maxconn(l)) { + next_actconn = increment_actconn(); + if (!next_actconn) { + /* the process was marked full or another + * thread is going to do it. + */ + expire = tick_add(now_ms, 1000); /* try again in 1 second */ + goto limit_global; + } + } + + /* be careful below, the listener might be shutting down in + * another thread on error and we must not dereference its + * FD without a bit of protection. + */ + cli_conn = NULL; + status = CO_AC_PERMERR; + + HA_RWLOCK_RDLOCK(LISTENER_LOCK, &l->lock); + if (l->rx.flags & RX_F_BOUND) + cli_conn = l->rx.proto->accept_conn(l, &status); + HA_RWLOCK_RDUNLOCK(LISTENER_LOCK, &l->lock); + + if (!cli_conn) { + switch (status) { + case CO_AC_DONE: + goto end; + + case CO_AC_RETRY: /* likely a signal */ + _HA_ATOMIC_DEC(&l->nbconn); + if (p) + _HA_ATOMIC_DEC(&p->feconn); + if (listener_uses_maxconn(l)) + _HA_ATOMIC_DEC(&actconn); + continue; + + case CO_AC_YIELD: + max_accept = 0; + goto end; + + default: + goto transient_error; + } + } + + /* The connection was accepted, it must be counted as such */ + if (l->counters) + HA_ATOMIC_UPDATE_MAX(&l->counters->conn_max, next_conn); + + if (p) { + HA_ATOMIC_UPDATE_MAX(&p->fe_counters.conn_max, next_feconn); + proxy_inc_fe_conn_ctr(l, p); + } + + if (!(l->bind_conf->options & BC_O_UNLIMITED)) { + count = update_freq_ctr(&global.conn_per_sec, 1); + HA_ATOMIC_UPDATE_MAX(&global.cps_max, count); + } + + _HA_ATOMIC_INC(&activity[tid].accepted); + + /* count the number of times an accepted connection resulted in + * maxconn being reached. + */ + if (unlikely(_HA_ATOMIC_LOAD(&actconn) + 1 >= global.maxconn)) + _HA_ATOMIC_INC(&maxconn_reached); + + /* past this point, l->bind_conf->accept() will automatically decrement + * l->nbconn, feconn and actconn once done. Setting next_*conn=0 + * allows the error path not to rollback on nbconn. It's more + * convenient than duplicating all exit labels. + */ + next_conn = 0; + next_feconn = 0; + next_actconn = 0; + + +#if defined(USE_THREAD) + if (!(global.tune.options & GTUNE_LISTENER_MQ_ANY) || stopping) + goto local_accept; + + /* we want to perform thread rebalancing if the listener is + * bound to more than one thread or if it's part of a shard + * with more than one listener. + */ + mask = l->rx.bind_thread & _HA_ATOMIC_LOAD(&tg->threads_enabled); + if (l->rx.shard_info || atleast2(mask)) { + struct accept_queue_ring *ring; + struct listener *new_li; + uint r1, r2, t, t1, t2; + ulong n0, n1; + const struct tgroup_info *g1, *g2; + ulong m1, m2; + ulong *thr_idx_ptr; + + /* The principle is that we have two running indexes, + * each visiting in turn all threads bound to this + * listener's shard. The connection will be assigned to + * the one with the least connections, and the other + * one will be updated. 
This provides a good fairness + * on short connections (round robin) and on long ones + * (conn count), without ever missing any idle thread. + * Each thread number is encoded as a combination of + * times the receiver number and its local thread + * number from 0 to MAX_THREADS_PER_GROUP - 1. The two + * indexes are stored as 10/12 bit numbers in the thr_idx + * array, since there are up to LONGBITS threads and + * groups that can be represented. They are represented + * like this: + * 31:20 19:15 14:10 9:5 4:0 + * 32b: [ counter | r2num | t2num | r1num | t1num ] + * + * 63:24 23:18 17:12 11:6 5:0 + * 64b: [ counter | r2num | t2num | r1num | t1num ] + * + * The change counter is only used to avoid swapping too + * old a value when the value loops back. + * + * In the loop below we have this for each index: + * - n is the thread index + * - r is the receiver number + * - g is the receiver's thread group + * - t is the thread number in this receiver + * - m is the receiver's thread mask shifted by the thread number + */ + + /* keep a copy for the final update. thr_idx is composite + * and made of (n2<<16) + n1. + */ + thr_idx_ptr = l->rx.shard_info ? &((struct listener *)(l->rx.shard_info->ref->owner))->thr_idx : &l->thr_idx; + while (1) { + int q0, q1, q2; + + /* calculate r1/g1/t1 first (ascending idx) */ + n0 = _HA_ATOMIC_LOAD(thr_idx_ptr); + new_li = NULL; + + t1 = (uint)n0 & (LONGBITS - 1); + r1 = ((uint)n0 / LONGBITS) & (LONGBITS - 1); + + while (1) { + if (l->rx.shard_info) { + /* multiple listeners, take the group into account */ + if (r1 >= l->rx.shard_info->nbgroups) + r1 = 0; + + g1 = &ha_tgroup_info[l->rx.shard_info->members[r1]->bind_tgroup - 1]; + m1 = l->rx.shard_info->members[r1]->bind_thread; + } else { + /* single listener */ + r1 = 0; + g1 = tg; + m1 = l->rx.bind_thread; + } + m1 &= _HA_ATOMIC_LOAD(&g1->threads_enabled); + m1 >>= t1; + + /* find first existing thread */ + if (unlikely(!(m1 & 1))) { + m1 &= ~1UL; + if (!m1) { + /* no more threads here, switch to + * first thread of next group. + */ + t1 = 0; + if (l->rx.shard_info) + r1++; + /* loop again */ + continue; + } + t1 += my_ffsl(m1) - 1; + } + /* done: r1 and t1 are OK */ + break; + } + + /* now r2/g2/t2 (descending idx) */ + t2 = ((uint)n0 / LONGBITS / LONGBITS) & (LONGBITS - 1); + r2 = ((uint)n0 / LONGBITS / LONGBITS / LONGBITS) & (LONGBITS - 1); + + /* if running in round-robin mode ("fair"), we don't need + * to go further. + */ + if ((global.tune.options & GTUNE_LISTENER_MQ_ANY) == GTUNE_LISTENER_MQ_FAIR) { + t = g1->base + t1; + if (l->rx.shard_info && t != tid) + new_li = l->rx.shard_info->members[r1]->owner; + goto updt_t1; + } + + while (1) { + if (l->rx.shard_info) { + /* multiple listeners, take the group into account */ + if (r2 >= l->rx.shard_info->nbgroups) + r2 = l->rx.shard_info->nbgroups - 1; + + g2 = &ha_tgroup_info[l->rx.shard_info->members[r2]->bind_tgroup - 1]; + m2 = l->rx.shard_info->members[r2]->bind_thread; + } else { + /* single listener */ + r2 = 0; + g2 = tg; + m2 = l->rx.bind_thread; + } + m2 &= _HA_ATOMIC_LOAD(&g2->threads_enabled); + m2 &= nbits(t2 + 1); + + /* find previous existing thread */ + if (unlikely(!(m2 & (1UL << t2)) || (g1 == g2 && t1 == t2))) { + /* highest bit not set or colliding threads, let's check + * if we still have other threads available after this + * one. + */ + m2 &= ~(1UL << t2); + if (!m2) { + /* no more threads here, switch to + * last thread of previous group. 
+ */ + t2 = MAX_THREADS_PER_GROUP - 1; + if (l->rx.shard_info) + r2--; + /* loop again */ + continue; + } + t2 = my_flsl(m2) - 1; + } + /* done: r2 and t2 are OK */ + break; + } + + /* tests show that it's worth checking that other threads have not + * already changed the index to save the rest of the calculation, + * or we'd have to redo it anyway. + */ + if (n0 != _HA_ATOMIC_LOAD(thr_idx_ptr)) + continue; + + /* here we have (r1,g1,t1) that designate the first receiver, its + * thread group and local thread, and (r2,g2,t2) that designate + * the second receiver, its thread group and local thread. We'll + * also consider the local thread with q0. + */ + q0 = accept_queue_ring_len(&accept_queue_rings[tid]); + q1 = accept_queue_ring_len(&accept_queue_rings[g1->base + t1]); + q2 = accept_queue_ring_len(&accept_queue_rings[g2->base + t2]); + + /* add to this the currently active connections */ + q0 += _HA_ATOMIC_LOAD(&l->thr_conn[ti->ltid]); + if (l->rx.shard_info) { + q1 += _HA_ATOMIC_LOAD(&((struct listener *)l->rx.shard_info->members[r1]->owner)->thr_conn[t1]); + q2 += _HA_ATOMIC_LOAD(&((struct listener *)l->rx.shard_info->members[r2]->owner)->thr_conn[t2]); + } else { + q1 += _HA_ATOMIC_LOAD(&l->thr_conn[t1]); + q2 += _HA_ATOMIC_LOAD(&l->thr_conn[t2]); + } + + /* we have 3 possibilities now : + * q1 < q2 : t1 is less loaded than t2, so we pick it + * and update t2 (since t1 might still be + * lower than another thread) + * q1 > q2 : t2 is less loaded than t1, so we pick it + * and update t1 (since t2 might still be + * lower than another thread) + * q1 = q2 : both are equally loaded, thus we pick t1 + * and update t1 as it will become more loaded + * than t2. + * On top of that, if in the end the current thread appears + * to be as good of a deal, we'll prefer it over a foreign + * one as it will improve locality and avoid a migration. + */ + + if (q1 - q2 < 0) { + t = g1->base + t1; + if (q0 <= q1) + t = tid; + + if (l->rx.shard_info && t != tid) + new_li = l->rx.shard_info->members[r1]->owner; + + t2--; + if (t2 >= MAX_THREADS_PER_GROUP) { + if (l->rx.shard_info) + r2--; + t2 = MAX_THREADS_PER_GROUP - 1; + } + } + else if (q1 - q2 > 0) { + t = g2->base + t2; + if (q0 <= q2) + t = tid; + + if (l->rx.shard_info && t != tid) + new_li = l->rx.shard_info->members[r2]->owner; + goto updt_t1; + } + else { // q1 == q2 + t = g1->base + t1; + if (q0 < q1) // local must be strictly better than both + t = tid; + + if (l->rx.shard_info && t != tid) + new_li = l->rx.shard_info->members[r1]->owner; + updt_t1: + t1++; + if (t1 >= MAX_THREADS_PER_GROUP) { + if (l->rx.shard_info) + r1++; + t1 = 0; + } + } + + /* The target thread number is in <t> now. Let's + * compute the new index and try to update it. + */ + + /* take previous counter and increment it */ + n1 = n0 & -(ulong)(LONGBITS * LONGBITS * LONGBITS * LONGBITS); + n1 += LONGBITS * LONGBITS * LONGBITS * LONGBITS; + n1 += (((r2 * LONGBITS) + t2) * LONGBITS * LONGBITS); + n1 += (r1 * LONGBITS) + t1; + if (likely(_HA_ATOMIC_CAS(thr_idx_ptr, &n0, n1))) + break; + + /* bah we lost the race, try again */ + __ha_cpu_relax(); + } /* end of main while() loop */ + + /* we may need to update the listener in the connection + * if we switched to another group. + */ + if (new_li) + cli_conn->target = &new_li->obj_type; + + /* here we have the target thread number in <t> and we hold a + * reservation in the target ring. 
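+ *
+ * Editor's note (illustrative recap, not part of the upstream sources): with
+ * LONGBITS == 64, the composite index rebuilt in <n1> above decodes as
+ *
+ *	t1      =  n1                 % 64;
+ *	r1      = (n1 / 64)           % 64;
+ *	t2      = (n1 / 64 / 64)      % 64;
+ *	r2      = (n1 / 64 / 64 / 64) % 64;
+ *	counter =  n1 / 64 / 64 / 64 / 64;   (incremented on every update)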
+		 */
+
+		if (l->rx.proto && l->rx.proto->set_affinity) {
+			if (l->rx.proto->set_affinity(cli_conn, t)) {
+				/* Failed migration, stay on the same thread. */
+				goto local_accept;
+			}
+		}
+
+		/* We successfully selected the best thread "t" for this
+		 * connection. We use deferred accepts even if it's the
+		 * local thread because tests show that it's the best
+		 * performing model, likely due to better cache locality
+		 * when processing this loop.
+		 */
+		ring = &accept_queue_rings[t];
+		if (accept_queue_push_mp(ring, cli_conn)) {
+			_HA_ATOMIC_INC(&activity[t].accq_pushed);
+			tasklet_wakeup(ring->tasklet);
+			continue;
+		}
+		/* If the ring is full we do a synchronous accept on
+		 * the local thread here.
+		 */
+		_HA_ATOMIC_INC(&activity[t].accq_full);
+	}
+#endif // USE_THREAD
+
+ local_accept:
+		/* restore the connection's listener in case we failed to migrate above */
+		cli_conn->target = &l->obj_type;
+		_HA_ATOMIC_INC(&l->thr_conn[ti->ltid]);
+		ret = l->bind_conf->accept(cli_conn);
+		if (unlikely(ret <= 0)) {
+			/* The connection was closed by stream_accept(). Either
+			 * we just have to ignore it (ret == 0) or it's a critical
+			 * error due to a resource shortage, and we must stop the
+			 * listener (ret < 0).
+			 */
+			if (ret == 0) /* successful termination */
+				continue;
+
+			goto transient_error;
+		}
+
+		/* increase the per-process number of cumulated sessions, this
+		 * may only be done once l->bind_conf->accept() has accepted the
+		 * connection.
+		 */
+		if (!(l->bind_conf->options & BC_O_UNLIMITED)) {
+			count = update_freq_ctr(&global.sess_per_sec, 1);
+			HA_ATOMIC_UPDATE_MAX(&global.sps_max, count);
+		}
+#ifdef USE_OPENSSL
+		if (!(l->bind_conf->options & BC_O_UNLIMITED) &&
+		    l->bind_conf && l->bind_conf->options & BC_O_USE_SSL) {
+			count = update_freq_ctr(&global.ssl_per_sec, 1);
+			HA_ATOMIC_UPDATE_MAX(&global.ssl_max, count);
+		}
+#endif
+
+		_HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_STUCK); // this thread is still running
+	} /* end of for (max_accept--) */
+
+ end:
+	if (next_conn)
+		_HA_ATOMIC_DEC(&l->nbconn);
+
+	if (p && next_feconn)
+		_HA_ATOMIC_DEC(&p->feconn);
+
+	if (next_actconn)
+		_HA_ATOMIC_DEC(&actconn);
+
+	if ((l->state == LI_FULL && (!l->bind_conf->maxconn || l->nbconn < l->bind_conf->maxconn)) ||
+	    (l->state == LI_LIMITED &&
+	     ((!p || p->feconn < p->maxconn) && (actconn < global.maxconn) &&
+	      (!tick_isset(global_listener_queue_task->expire) ||
+	       tick_is_expired(global_listener_queue_task->expire, now_ms))))) {
+		/* at least one thread has to do this when quitting */
+		relax_listener(l, 0, 0);
+
+		/* Dequeues all of the listeners waiting for a resource */
+		dequeue_all_listeners();
+
+		if (p && !MT_LIST_ISEMPTY(&p->listener_queue) &&
+		    (!p->fe_sps_lim || freq_ctr_remain(&p->fe_sess_per_sec, p->fe_sps_lim, 0) > 0))
+			dequeue_proxy_listeners(p);
+	}
+	return;
+
+ transient_error:
+	/* pause the listener for up to 100 ms */
+	expire = tick_add(now_ms, 100);
+
+	/* This may be a shared socket that was paused by another process.
+	 * Let's put it to pause in this case.
+	 */
+	if (l->rx.proto && l->rx.proto->rx_listening(&l->rx) == 0) {
+		suspend_listener(l, 0, 0);
+		goto end;
+	}
+
+ limit_global:
+	/* (re-)queue the listener to the global queue and set it to expire no
+	 * later than <expire> ahead. The listener turns to LI_LIMITED.
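+	 *
+	 * Editor's note: the full limit/unlimit cycle, as a sketch
+	 * (illustrative only, not part of the upstream sources):
+	 *
+	 *	limit_listener(l, &global_listener_queue); // LI_READY -> LI_LIMITED
+	 *	task_schedule(global_listener_queue_task, expire);
+	 *	...                                        // resources freed later
+	 *	dequeue_all_listeners();                   // relax -> LI_READY again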
+	 */
+	limit_listener(l, &global_listener_queue);
+	HA_RWLOCK_RDLOCK(LISTENER_LOCK, &global_listener_rwlock);
+	task_schedule(global_listener_queue_task, expire);
+	HA_RWLOCK_RDUNLOCK(LISTENER_LOCK, &global_listener_rwlock);
+	goto end;
+
+ limit_proxy:
+	/* (re-)queue the listener to the proxy's queue and set it to expire no
+	 * later than <expire> ahead. The listener turns to LI_LIMITED.
+	 */
+	limit_listener(l, &p->listener_queue);
+	if (p->task && tick_isset(expire))
+		task_schedule(p->task, expire);
+	goto end;
+}
+
+/* Notify the listener that a connection initiated from it was released. This
+ * is used to keep the connection count consistent and to possibly re-open
+ * listening when it was limited.
+ */
+void listener_release(struct listener *l)
+{
+	struct proxy *fe = l->bind_conf->frontend;
+
+	if (listener_uses_maxconn(l))
+		_HA_ATOMIC_DEC(&actconn);
+	if (fe)
+		_HA_ATOMIC_DEC(&fe->feconn);
+	_HA_ATOMIC_DEC(&l->nbconn);
+	_HA_ATOMIC_DEC(&l->thr_conn[ti->ltid]);
+
+	if (l->state == LI_FULL || l->state == LI_LIMITED)
+		relax_listener(l, 0, 0);
+
+	/* Dequeues all of the listeners waiting for a resource */
+	dequeue_all_listeners();
+
+	if (fe && !MT_LIST_ISEMPTY(&fe->listener_queue) &&
+	    (!fe->fe_sps_lim || freq_ctr_remain(&fe->fe_sess_per_sec, fe->fe_sps_lim, 0) > 0))
+		dequeue_proxy_listeners(fe);
+}
+
+/* Initializes the listener queues. Returns 0 on success, otherwise ERR_* flags */
+static int listener_queue_init()
+{
+	global_listener_queue_task = task_new_anywhere();
+	if (!global_listener_queue_task) {
+		ha_alert("Out of memory when initializing global listener queue\n");
+		return ERR_FATAL|ERR_ABORT;
+	}
+	/* very simple initialization, users will queue the task if needed */
+	global_listener_queue_task->context = NULL; /* not even a context! */
+	global_listener_queue_task->process = manage_global_listener_queue;
+	HA_RWLOCK_INIT(&global_listener_rwlock);
+
+	return 0;
+}
+
+static void listener_queue_deinit()
+{
+	task_destroy(global_listener_queue_task);
+	global_listener_queue_task = NULL;
+}
+
+REGISTER_CONFIG_POSTPARSER("multi-threaded listener queue", listener_queue_init);
+REGISTER_POST_DEINIT(listener_queue_deinit);
+
+
+/* This is the global management task for listeners. It enables listeners waiting
+ * for global resources when there are enough free resources, or at least once in
+ * a while. It is designed to be called as a task. It's exported so that it's easy
+ * to spot in "show tasks" or "show profiling".
+ */
+struct task *manage_global_listener_queue(struct task *t, void *context, unsigned int state)
+{
+	/* If there are still too many concurrent connections, let's wait for
+	 * some of them to go away. We don't need to re-arm the timer because
+	 * each of them will scan the queue anyway.
+	 */
+	if (unlikely(actconn >= global.maxconn))
+		goto out;
+
+	/* We should periodically try to enable listeners waiting for a global
+	 * resource here, because it is possible, though very unlikely, that
+	 * they have been blocked by a temporary lack of global resource such
+	 * as a file descriptor or memory and that the temporary condition has
+	 * disappeared.
+	 */
+	dequeue_all_listeners();
+
+  out:
+	HA_RWLOCK_WRLOCK(LISTENER_LOCK, &global_listener_rwlock);
+	t->expire = TICK_ETERNITY;
+	HA_RWLOCK_WRUNLOCK(LISTENER_LOCK, &global_listener_rwlock);
+	return t;
+}
+
+/* Applies the thread mask, shards etc. to the bind_conf. It normally returns 0,
+ * otherwise the number of errors. Upon error it may set error codes (ERR_*) in
+ * err_code.
It is supposed to be called only once very late in the boot process + * after the bind_conf's thread_set is fixed. The function may emit warnings and + * alerts. Extra listeners may be created on the fly. + */ +int bind_complete_thread_setup(struct bind_conf *bind_conf, int *err_code) +{ + struct proxy *fe = bind_conf->frontend; + struct listener *li, *new_li, *ref; + struct thread_set new_ts; + int shard, shards, todo, done, grp, dups; + ulong mask, gmask, bit; + int cfgerr = 0; + char *err; + + err = NULL; + if (thread_resolve_group_mask(&bind_conf->thread_set, 0, &err) < 0) { + ha_alert("%s '%s': %s in 'bind %s' at [%s:%d].\n", + proxy_type_str(fe), + fe->id, err, bind_conf->arg, bind_conf->file, bind_conf->line); + free(err); + cfgerr++; + return cfgerr; + } + + /* apply thread masks and groups to all receivers */ + list_for_each_entry(li, &bind_conf->listeners, by_bind) { + shards = bind_conf->settings.shards; + todo = thread_set_count(&bind_conf->thread_set); + + /* special values: -1 = "by-thread", -2 = "by-group" */ + if (shards == -1) { + if (protocol_supports_flag(li->rx.proto, PROTO_F_REUSEPORT_SUPPORTED)) + shards = todo; + else { + if (fe != global.cli_fe) + ha_diag_warning("[%s:%d]: Disabling per-thread sharding for listener in" + " %s '%s' because SO_REUSEPORT is disabled\n", + bind_conf->file, bind_conf->line, proxy_type_str(fe), fe->id); + shards = 1; + } + } + else if (shards == -2) + shards = protocol_supports_flag(li->rx.proto, PROTO_F_REUSEPORT_SUPPORTED) ? my_popcountl(bind_conf->thread_set.grps) : 1; + + /* no more shards than total threads */ + if (shards > todo) + shards = todo; + + /* We also need to check if an explicit shards count was set and cannot be honored */ + if (shards > 1 && !protocol_supports_flag(li->rx.proto, PROTO_F_REUSEPORT_SUPPORTED)) { + ha_warning("[%s:%d]: Disabling sharding for listener in %s '%s' because SO_REUSEPORT is disabled\n", + bind_conf->file, bind_conf->line, proxy_type_str(fe), fe->id); + shards = 1; + } + + shard = done = grp = bit = mask = 0; + new_li = li; + + while (shard < shards) { + memset(&new_ts, 0, sizeof(new_ts)); + while (grp < global.nbtgroups && done < todo) { + /* enlarge mask to cover next bit of bind_thread till we + * have enough bits for one shard. We restart from the + * current grp+bit. + */ + + /* first let's find the first non-empty group starting at <mask> */ + if (!(bind_conf->thread_set.rel[grp] & ha_tgroup_info[grp].threads_enabled & ~mask)) { + grp++; + mask = 0; + continue; + } + + /* take next unassigned bit */ + bit = (bind_conf->thread_set.rel[grp] & ~mask) & -(bind_conf->thread_set.rel[grp] & ~mask); + new_ts.rel[grp] |= bit; + mask |= bit; + new_ts.grps |= 1UL << grp; + + done += shards; + }; + + BUG_ON(!new_ts.grps); // no more bits left unassigned + + /* Create all required listeners for all bound groups. If more than one group is + * needed, the first receiver serves as a reference, and subsequent ones point to + * it. We already have a listener available in new_li() so we only allocate a new + * one if we're not on the last one. We count the remaining groups by copying their + * mask into <gmask> and dropping the lowest bit at the end of the loop until there + * is no more. 
Ah yes, it's not pretty :-/ + */ + ref = new_li; + gmask = new_ts.grps; + for (dups = 0; gmask; dups++) { + /* assign the first (and only) thread and group */ + new_li->rx.bind_thread = thread_set_nth_tmask(&new_ts, dups); + new_li->rx.bind_tgroup = thread_set_nth_group(&new_ts, dups); + + if (dups) { + /* it has been allocated already in the previous round */ + shard_info_attach(&new_li->rx, ref->rx.shard_info); + new_li->rx.flags |= RX_F_MUST_DUP; + } + + gmask &= gmask - 1; // drop lowest bit + if (gmask) { + /* yet another listener expected in this shard, let's + * chain it. + */ + struct listener *tmp_li = clone_listener(new_li); + + if (!tmp_li) { + ha_alert("Out of memory while trying to allocate extra listener for group %u of shard %d in %s %s\n", + new_li->rx.bind_tgroup, shard, proxy_type_str(fe), fe->id); + cfgerr++; + *err_code |= ERR_FATAL | ERR_ALERT; + return cfgerr; + } + + /* if we're forced to create at least two listeners, we have to + * allocate a shared shard_info that's linked to from the reference + * and each other listener, so we'll create it here. + */ + if (!shard_info_attach(&ref->rx, NULL)) { + ha_alert("Out of memory while trying to allocate shard_info for listener for group %u of shard %d in %s %s\n", + new_li->rx.bind_tgroup, shard, proxy_type_str(fe), fe->id); + cfgerr++; + *err_code |= ERR_FATAL | ERR_ALERT; + return cfgerr; + } + new_li = tmp_li; + } + } + done -= todo; + + shard++; + if (shard >= shards) + break; + + /* create another listener for new shards */ + new_li = clone_listener(li); + if (!new_li) { + ha_alert("Out of memory while trying to allocate extra listener for shard %d in %s %s\n", + shard, proxy_type_str(fe), fe->id); + cfgerr++; + *err_code |= ERR_FATAL | ERR_ALERT; + return cfgerr; + } + } + } + + /* success */ + return cfgerr; +} + +/* + * Registers the bind keyword list <kwl> as a list of valid keywords for next + * parsing sessions. + */ +void bind_register_keywords(struct bind_kw_list *kwl) +{ + LIST_APPEND(&bind_keywords.list, &kwl->list); +} + +/* Return a pointer to the bind keyword <kw>, or NULL if not found. If the + * keyword is found with a NULL ->parse() function, then an attempt is made to + * find one with a valid ->parse() function. This way it is possible to declare + * platform-dependent known keywords as NULL, then only declare them as valid + * if some options are met. Note that if the requested keyword contains an + * opening parenthesis, everything from this point is ignored. + */ +struct bind_kw *bind_find_kw(const char *kw) +{ + int index; + const char *kwend; + struct bind_kw_list *kwl; + struct bind_kw *ret = NULL; + + kwend = strchr(kw, '('); + if (!kwend) + kwend = kw + strlen(kw); + + list_for_each_entry(kwl, &bind_keywords.list, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) { + if ((strncmp(kwl->kw[index].kw, kw, kwend - kw) == 0) && + kwl->kw[index].kw[kwend-kw] == 0) { + if (kwl->kw[index].parse) + return &kwl->kw[index]; /* found it! */ + else + ret = &kwl->kw[index]; /* may be OK */ + } + } + } + return ret; +} + +/* Dumps all registered "bind" keywords to the <out> string pointer. The + * unsupported keywords are only dumped if their supported form was not + * found. 
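+ * For illustration (output shape inferred from the memprintf format below), a + * dumped line looks like "[ ALL] accept-proxy", with the keyword's scope + * between brackets, " <arg>" appended for keywords taking an argument, and + * " (not supported)" appended for keywords disabled in this build. 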
+ */ +void bind_dump_kws(char **out) +{ + struct bind_kw_list *kwl; + int index; + + if (!out) + return; + + *out = NULL; + list_for_each_entry(kwl, &bind_keywords.list, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) { + if (kwl->kw[index].parse || + bind_find_kw(kwl->kw[index].kw) == &kwl->kw[index]) { + memprintf(out, "%s[%4s] %s%s%s\n", *out ? *out : "", + kwl->scope, + kwl->kw[index].kw, + kwl->kw[index].skip ? " <arg>" : "", + kwl->kw[index].parse ? "" : " (not supported)"); + } + } + } +} + +/* Try to find, among the registered bind keywords, the word that looks closest + * to <word> by counting transitions between letters, digits and other characters. + * Will return the best matching word if found, otherwise NULL. + */ +const char *bind_find_best_kw(const char *word) +{ + uint8_t word_sig[1024]; + uint8_t list_sig[1024]; + const struct bind_kw_list *kwl; + const char *best_ptr = NULL; + int dist, best_dist = INT_MAX; + int index; + + make_word_fingerprint(word_sig, word); + list_for_each_entry(kwl, &bind_keywords.list, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) { + make_word_fingerprint(list_sig, kwl->kw[index].kw); + dist = word_fingerprint_distance(word_sig, list_sig); + if (dist < best_dist) { + best_dist = dist; + best_ptr = kwl->kw[index].kw; + } + } + } + + if (best_dist > 2 * strlen(word) || (best_ptr && best_dist > 2 * strlen(best_ptr))) + best_ptr = NULL; + + return best_ptr; +} + +/* allocate a bind_conf struct for a bind line, and chain it to the frontend <fe>. + * If <arg> is not NULL, it is duplicated into ->arg to store useful config + * information for error reporting. NULL is returned on error. + */ +struct bind_conf *bind_conf_alloc(struct proxy *fe, const char *file, + int line, const char *arg, struct xprt_ops *xprt) +{ + struct bind_conf *bind_conf = calloc(1, sizeof(*bind_conf)); + + if (!bind_conf) + goto err; + + bind_conf->file = strdup(file); + if (!bind_conf->file) + goto err; + bind_conf->line = line; + if (arg) { + bind_conf->arg = strdup(arg); + if (!bind_conf->arg) + goto err; + } + + LIST_APPEND(&fe->conf.bind, &bind_conf->by_fe); + bind_conf->settings.ux.uid = -1; + bind_conf->settings.ux.gid = -1; + bind_conf->settings.ux.mode = 0; + bind_conf->settings.shards = global.tune.default_shards; + bind_conf->xprt = xprt; + bind_conf->frontend = fe; + bind_conf->analysers = fe->fe_req_ana; + bind_conf->severity_output = CLI_SEVERITY_NONE; +#ifdef USE_OPENSSL + HA_RWLOCK_INIT(&bind_conf->sni_lock); + bind_conf->sni_ctx = EB_ROOT; + bind_conf->sni_w_ctx = EB_ROOT; +#endif +#ifdef USE_QUIC + /* Use connection socket for QUIC by default. */ + bind_conf->quic_mode = QUIC_SOCK_MODE_CONN; + bind_conf->max_cwnd = + global.tune.bufsize * global.tune.quic_streams_buf; +#endif + LIST_INIT(&bind_conf->listeners); + + bind_conf->rhttp_srvname = NULL; + + return bind_conf; + + err: + if (bind_conf) { + ha_free(&bind_conf->file); + ha_free(&bind_conf->arg); + } + ha_free(&bind_conf); + return NULL; +} + +const char *listener_state_str(const struct listener *l) +{ + static const char *states[8] = { + "NEW", "INI", "ASS", "PAU", "LIS", "RDY", "FUL", "LIM", + }; + unsigned int st = l->state; + + if (st >= sizeof(states) / sizeof(*states)) + return "INVALID"; + return states[st]; +} + +/************************************************************************/ +/* All supported sample and ACL keywords must be declared here. 
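+ * For illustration: once registered at the end of this file, a sample fetch + * such as "dst_conn" below can be referenced from a configuration rule, e.g. + * (hypothetical example): tcp-request connection reject if { dst_conn ge 1000 } 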
*/ +/************************************************************************/ + +/* set temp integer to the number of connections to the same listening socket */ +static int +smp_fetch_dconn(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->data.type = SMP_T_SINT; + smp->data.u.sint = smp->sess->listener->nbconn; + return 1; +} + +/* set temp integer to the id of the socket (listener) */ +static int +smp_fetch_so_id(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->data.type = SMP_T_SINT; + smp->data.u.sint = smp->sess->listener->luid; + return 1; +} +static int +smp_fetch_so_name(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->data.u.str.area = smp->sess->listener->name; + if (!smp->data.u.str.area) + return 0; + + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_CONST; + smp->data.u.str.data = strlen(smp->data.u.str.area); + return 1; +} + +/* parse the "accept-proxy" bind keyword */ +static int bind_parse_accept_proxy(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + conf->options |= BC_O_ACC_PROXY; + return 0; +} + +/* parse the "accept-netscaler-cip" bind keyword */ +static int bind_parse_accept_netscaler_cip(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + uint32_t val; + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing value", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + val = atol(args[cur_arg + 1]); + if (val <= 0) { + memprintf(err, "'%s' : invalid value %d, must be > 0", args[cur_arg], val); + return ERR_ALERT | ERR_FATAL; + } + + conf->options |= BC_O_ACC_CIP; + conf->ns_cip_magic = val; + return 0; +} + +/* parse the "backlog" bind keyword */ +static int bind_parse_backlog(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + int val; + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing value", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + val = atol(args[cur_arg + 1]); + if (val < 0) { + memprintf(err, "'%s' : invalid value %d, must be >= 0", args[cur_arg], val); + return ERR_ALERT | ERR_FATAL; + } + + conf->backlog = val; + return 0; +} + +/* parse the "id" bind keyword */ +static int bind_parse_id(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + struct eb32_node *node; + struct listener *l, *new; + char *error; + + if (conf->listeners.n != conf->listeners.p) { + memprintf(err, "'%s' can only be used with a single socket", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : expects an integer argument", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + new = LIST_NEXT(&conf->listeners, struct listener *, by_bind); + new->luid = strtol(args[cur_arg + 1], &error, 10); + if (*error != '\0') { + memprintf(err, "'%s' : expects an integer argument, found '%s'", args[cur_arg], args[cur_arg + 1]); + return ERR_ALERT | ERR_FATAL; + } + new->conf.id.key = new->luid; + + if (new->luid <= 0) { + memprintf(err, "'%s' : custom id has to be > 0", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + node = eb32_lookup(&px->conf.used_listener_id, new->luid); + if (node) { + l = container_of(node, struct listener, conf.id); + memprintf(err, "'%s' : custom id %d already used at %s:%d ('bind %s')", + args[cur_arg], l->luid, l->bind_conf->file, l->bind_conf->line, + l->bind_conf->arg); + return ERR_ALERT | ERR_FATAL; + } + + 
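+ /* the id is valid and unique at this point: index it in the proxy's + * used_listener_id tree so that the duplicate check above keeps working + * for later "bind" lines. + */ 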
eb32_insert(&px->conf.used_listener_id, &new->conf.id); + return 0; +} + +/* Complete a bind_conf by parsing the args after the address. <args> is the + * arguments array, <cur_arg> is the first one to be considered. <section> is + * the section name to report in error messages, and <file> and <linenum> are + * the file name and line number respectively. Note that args[0..1] are used + * in error messages to provide some context. The return value is an error + * code, zero on success or an OR of ERR_{FATAL,ABORT,ALERT,WARN}. + */ +int bind_parse_args_list(struct bind_conf *bind_conf, char **args, int cur_arg, const char *section, const char *file, int linenum) +{ + int err_code = 0; + + while (*(args[cur_arg])) { + struct bind_kw *kw; + const char *best; + + kw = bind_find_kw(args[cur_arg]); + if (kw) { + char *err = NULL; + int code; + + if (!kw->parse) { + ha_alert("parsing [%s:%d] : '%s %s' in section '%s' : '%s' option is not implemented in this version (check build options).\n", + file, linenum, args[0], args[1], section, args[cur_arg]); + cur_arg += 1 + kw->skip ; + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if ((bind_conf->options & BC_O_REVERSE_HTTP) && !kw->rhttp_ok) { + ha_alert("'%s' option is not accepted for reverse HTTP\n", + args[cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + code = kw->parse(args, cur_arg, bind_conf->frontend, bind_conf, &err); + err_code |= code; + + if (code) { + if (err && *err) { + indent_msg(&err, 2); + if (((code & (ERR_WARN|ERR_ALERT)) == ERR_WARN)) + ha_warning("parsing [%s:%d] : '%s %s' in section '%s' : %s\n", file, linenum, args[0], args[1], section, err); + else + ha_alert("parsing [%s:%d] : '%s %s' in section '%s' : %s\n", file, linenum, args[0], args[1], section, err); + } + else + ha_alert("parsing [%s:%d] : '%s %s' in section '%s' : error encountered while processing '%s'.\n", + file, linenum, args[0], args[1], section, args[cur_arg]); + if (code & ERR_FATAL) { + free(err); + cur_arg += 1 + kw->skip; + goto out; + } + } + free(err); + cur_arg += 1 + kw->skip; + continue; + } + + best = bind_find_best_kw(args[cur_arg]); + if (best) + ha_alert("parsing [%s:%d] : '%s %s' in section '%s': unknown keyword '%s'; did you mean '%s' maybe ?\n", + file, linenum, args[0], args[1], section, args[cur_arg], best); + else + ha_alert("parsing [%s:%d] : '%s %s' in section '%s': unknown keyword '%s'.\n", + file, linenum, args[0], args[1], section, args[cur_arg]); + + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if ((bind_conf->options & (BC_O_USE_SOCK_DGRAM|BC_O_USE_SOCK_STREAM)) == (BC_O_USE_SOCK_DGRAM|BC_O_USE_SOCK_STREAM) || + (bind_conf->options & (BC_O_USE_XPRT_DGRAM|BC_O_USE_XPRT_STREAM)) == (BC_O_USE_XPRT_DGRAM|BC_O_USE_XPRT_STREAM)) { + ha_alert("parsing [%s:%d] : '%s %s' in section '%s' : cannot mix datagram and stream protocols.\n", + file, linenum, args[0], args[1], section); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + /* The transport layer automatically switches to QUIC when QUIC is + * selected, regardless of bind_conf settings. We then need to + * initialize QUIC params. + */ + if ((bind_conf->options & (BC_O_USE_SOCK_DGRAM|BC_O_USE_XPRT_STREAM)) == (BC_O_USE_SOCK_DGRAM|BC_O_USE_XPRT_STREAM)) { +#ifdef USE_QUIC + bind_conf->xprt = xprt_get(XPRT_QUIC); + if (!(bind_conf->options & BC_O_USE_SSL)) { + bind_conf->options |= BC_O_USE_SSL; + ha_warning("parsing [%s:%d] : '%s %s' in section '%s' : QUIC protocol detected, enabling ssl. 
Use 'ssl' to shut this warning.\n", + file, linenum, args[0], args[1], section); + } + quic_transport_params_init(&bind_conf->quic_params, 1); +#else + ha_alert("parsing [%s:%d] : '%s %s' in section '%s' : QUIC protocol selected but support not compiled in (check build options).\n", + file, linenum, args[0], args[1], section); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +#endif + } + else if (bind_conf->options & BC_O_USE_SSL) { + bind_conf->xprt = xprt_get(XPRT_SSL); + } + + out: + return err_code; +} + +/* parse the "maxconn" bind keyword */ +static int bind_parse_maxconn(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + int val; + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing value", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + val = atol(args[cur_arg + 1]); + if (val < 0) { + memprintf(err, "'%s' : invalid value %d, must be >= 0", args[cur_arg], val); + return ERR_ALERT | ERR_FATAL; + } + + conf->maxconn = val; + return 0; +} + +/* parse the "name" bind keyword */ +static int bind_parse_name(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + struct listener *l; + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing name", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + list_for_each_entry(l, &conf->listeners, by_bind) + l->name = strdup(args[cur_arg + 1]); + + return 0; +} + +/* parse the "nbconn" bind keyword */ +static int bind_parse_nbconn(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + int val; + const struct listener *l; + + /* TODO duplicated code from check_kw_experimental() */ + if (!experimental_directives_allowed) { + memprintf(err, "'%s' is experimental, must be allowed via a global 'expose-experimental-directives'", + args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + mark_tainted(TAINTED_CONFIG_EXP_KW_DECLARED); + + l = LIST_NEXT(&conf->listeners, struct listener *, by_bind); + if (l->rx.addr.ss_family != AF_CUST_RHTTP_SRV) { + memprintf(err, "'%s' : only valid for reverse HTTP listeners.", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing value.", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + val = atol(args[cur_arg + 1]); + if (val <= 0) { + memprintf(err, "'%s' : invalid value %d, must be > 0.", args[cur_arg], val); + return ERR_ALERT | ERR_FATAL; + } + + conf->rhttp_nbconn = val; + return 0; +} + +/* parse the "nice" bind keyword */ +static int bind_parse_nice(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + int val; + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing value", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + val = atol(args[cur_arg + 1]); + if (val < -1024 || val > 1024) { + memprintf(err, "'%s' : invalid value %d, allowed range is -1024..1024", args[cur_arg], val); + return ERR_ALERT | ERR_FATAL; + } + + conf->nice = val; + return 0; +} + +/* parse the "process" bind keyword */ +static int bind_parse_process(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + memprintf(err, "'process %s' on 'bind' lines is not supported anymore, please use 'thread' instead.", args[cur_arg+1]); + return ERR_ALERT | ERR_FATAL; +} + +/* parse the "proto" bind keyword */ +static int bind_parse_proto(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + struct ist proto; + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : 
missing value", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + proto = ist(args[cur_arg + 1]); + conf->mux_proto = get_mux_proto(proto); + if (!conf->mux_proto) { + memprintf(err, "'%s' : unknown MUX protocol '%s'", args[cur_arg], args[cur_arg+1]); + return ERR_ALERT | ERR_FATAL; + } + return 0; +} + +/* parse the "shards" bind keyword. Takes an integer, "by-thread", or "by-group" */ +static int bind_parse_shards(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + int val; + + if (!*args[cur_arg + 1]) { + memprintf(err, "'%s' : missing value", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if (strcmp(args[cur_arg + 1], "by-thread") == 0) { + val = -1; /* -1 = "by-thread", will be fixed in check_config_validity() */ + } else if (strcmp(args[cur_arg + 1], "by-group") == 0) { + val = -2; /* -2 = "by-group", will be fixed in check_config_validity() */ + } else { + val = atol(args[cur_arg + 1]); + if (val < 1 || val > MAX_THREADS) { + memprintf(err, "'%s' : invalid value %d, allowed range is %d..%d or 'by-thread'", args[cur_arg], val, 1, MAX_THREADS); + return ERR_ALERT | ERR_FATAL; + } + } + + conf->settings.shards = val; + return 0; +} + +/* parse the "thread" bind keyword. This will replace any preset thread_set */ +static int bind_parse_thread(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err) +{ + const struct listener *l; + + /* note that the thread set is zeroed before first call, and we don't + * want to reset it so that it remains possible to chain multiple + * "thread" directives. + */ + if (parse_thread_set(args[cur_arg+1], &conf->thread_set, err) < 0) + return ERR_ALERT | ERR_FATAL; + + l = LIST_NEXT(&conf->listeners, struct listener *, by_bind); + if (l->rx.addr.ss_family == AF_CUST_RHTTP_SRV && + atleast2(conf->thread_set.grps)) { + memprintf(err, "'%s' : reverse HTTP bind cannot span multiple thread groups.", args[cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +/* config parser for global "tune.listener.default-shards" */ +static int cfg_parse_tune_listener_shards(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (strcmp(args[1], "by-thread") == 0) + global.tune.default_shards = -1; + else if (strcmp(args[1], "by-group") == 0) + global.tune.default_shards = -2; + else if (strcmp(args[1], "by-process") == 0) + global.tune.default_shards = 1; + else { + memprintf(err, "'%s' expects either 'by-process', 'by-group', or 'by-thread' but got '%s'.", args[0], args[1]); + return -1; + } + return 0; +} + +/* config parser for global "tune.listener.multi-queue", accepts "on", "fair" or "off" */ +static int cfg_parse_tune_listener_mq(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (strcmp(args[1], "on") == 0) + global.tune.options = (global.tune.options & ~GTUNE_LISTENER_MQ_ANY) | GTUNE_LISTENER_MQ_OPT; + else if (strcmp(args[1], "fair") == 0) + global.tune.options = (global.tune.options & ~GTUNE_LISTENER_MQ_ANY) | GTUNE_LISTENER_MQ_FAIR; + else if (strcmp(args[1], "off") == 0) + global.tune.options &= ~GTUNE_LISTENER_MQ_ANY; + else { + memprintf(err, "'%s' expects either 'on', 'fair', or 'off' but got '%s'.", args[0], args[1]); + return -1; + } + return 0; +} + +/* Note: must not be declared <const> as its list will be 
overwritten. + * Please take care of keeping this list alphabetically sorted. + */ +static struct sample_fetch_kw_list smp_kws = {ILH, { + { "dst_conn", smp_fetch_dconn, 0, NULL, SMP_T_SINT, SMP_USE_FTEND, }, + { "so_id", smp_fetch_so_id, 0, NULL, SMP_T_SINT, SMP_USE_FTEND, }, + { "so_name", smp_fetch_so_name, 0, NULL, SMP_T_STR, SMP_USE_FTEND, }, + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &smp_kws); + +/* Note: must not be declared <const> as its list will be overwritten. + * Please take care of keeping this list alphabetically sorted. + */ +static struct acl_kw_list acl_kws = {ILH, { + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, acl_register_keywords, &acl_kws); + +/* Note: must not be declared <const> as its list will be overwritten. + * Please take care of keeping this list alphabetically sorted, doing so helps + * all code contributors. + * Optional keywords are also declared with a NULL ->parse() function so that + * the config parser can report an appropriate error when a known keyword was + * not enabled. + */ +static struct bind_kw_list bind_kws = { "ALL", { }, { + { "accept-netscaler-cip", bind_parse_accept_netscaler_cip, 1, 0 }, /* enable NetScaler Client IP insertion protocol */ + { "accept-proxy", bind_parse_accept_proxy, 0, 0 }, /* enable PROXY protocol */ + { "backlog", bind_parse_backlog, 1, 0 }, /* set backlog of listening socket */ + { "id", bind_parse_id, 1, 1 }, /* set id of listening socket */ + { "maxconn", bind_parse_maxconn, 1, 0 }, /* set maxconn of listening socket */ + { "name", bind_parse_name, 1, 1 }, /* set name of listening socket */ + { "nbconn", bind_parse_nbconn, 1, 1 }, /* set number of connection on active preconnect */ + { "nice", bind_parse_nice, 1, 0 }, /* set nice of listening socket */ + { "process", bind_parse_process, 1, 0 }, /* set list of allowed process for this socket */ + { "proto", bind_parse_proto, 1, 0 }, /* set the proto to use for all incoming connections */ + { "shards", bind_parse_shards, 1, 0 }, /* set number of shards */ + { "thread", bind_parse_thread, 1, 1 }, /* set list of allowed threads for this socket */ + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, bind_register_keywords, &bind_kws); + +/* config keyword parsers */ +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_GLOBAL, "tune.listener.default-shards", cfg_parse_tune_listener_shards }, + { CFG_GLOBAL, "tune.listener.multi-queue", cfg_parse_tune_listener_mq }, + { 0, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/log.c b/src/log.c new file mode 100644 index 0000000..010ace9 --- /dev/null +++ b/src/log.c @@ -0,0 +1,4659 @@ +/* + * General logging functions. + * + * Copyright 2000-2008 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <ctype.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syslog.h> +#include <time.h> +#include <unistd.h> +#include <errno.h> + +#include <sys/time.h> +#include <sys/uio.h> + +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/cfgparse.h> +#include <haproxy/clock.h> +#include <haproxy/fd.h> +#include <haproxy/frontend.h> +#include <haproxy/global.h> +#include <haproxy/http.h> +#include <haproxy/http_ana.h> +#include <haproxy/listener.h> +#include <haproxy/log.h> +#include <haproxy/proxy.h> +#include <haproxy/sample.h> +#include <haproxy/sc_strm.h> +#include <haproxy/sink.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/stconn.h> +#include <haproxy/stream.h> +#include <haproxy/action.h> +#include <haproxy/time.h> +#include <haproxy/hash.h> +#include <haproxy/tools.h> + +/* global recv logs counter */ +int cum_log_messages; + +/* log forward proxy list */ +struct proxy *cfg_log_forward; + +struct log_fmt_st { + char *name; +}; + +static const struct log_fmt_st log_formats[LOG_FORMATS] = { + [LOG_FORMAT_LOCAL] = { + .name = "local", + }, + [LOG_FORMAT_RFC3164] = { + .name = "rfc3164", + }, + [LOG_FORMAT_RFC5424] = { + .name = "rfc5424", + }, + [LOG_FORMAT_PRIO] = { + .name = "priority", + }, + [LOG_FORMAT_SHORT] = { + .name = "short", + }, + [LOG_FORMAT_TIMED] = { + .name = "timed", + }, + [LOG_FORMAT_ISO] = { + .name = "iso", + }, + [LOG_FORMAT_RAW] = { + .name = "raw", + }, +}; + +/* + * This map is used with all the FD_* macros to check whether a particular bit + * is set or not. Each bit represents an ASCII code. ha_bit_set() sets the bits + * corresponding to the bytes which should be escaped. When ha_bit_test() returns + * non-zero, it means that the byte should be escaped. Be careful to only ever + * pass bytes from 0 to 255 to the macros. + */ +long rfc5424_escape_map[(256/8) / sizeof(long)]; +long hdr_encode_map[(256/8) / sizeof(long)]; +long url_encode_map[(256/8) / sizeof(long)]; +long http_encode_map[(256/8) / sizeof(long)]; + + +const char *log_facilities[NB_LOG_FACILITIES] = { + "kern", "user", "mail", "daemon", + "auth", "syslog", "lpr", "news", + "uucp", "cron", "auth2", "ftp", + "ntp", "audit", "alert", "cron2", + "local0", "local1", "local2", "local3", + "local4", "local5", "local6", "local7" +}; + +const char *log_levels[NB_LOG_LEVELS] = { + "emerg", "alert", "crit", "err", + "warning", "notice", "info", "debug" +}; + +const char sess_term_cond[16] = "-LcCsSPRIDKUIIII"; /* normal, Local, CliTo, CliErr, SrvTo, SrvErr, PxErr, Resource, Internal, Down, Killed, Up, -- */ +const char sess_fin_state[8] = "-RCHDLQT"; /* cliRequest, srvConnect, srvHeader, Data, Last, Queue, Tarpit */ + + +/* log_format */ +struct logformat_type { + char *name; + int type; + int mode; + int lw; /* logwait bitfield */ + int (*config_callback)(struct logformat_node *node, struct proxy *curproxy); +}; + +int prepare_addrsource(struct logformat_node *node, struct proxy *curproxy); + +/* log_format variable names */ +static const struct logformat_type logformat_keywords[] = { + { "o", LOG_FMT_GLOBAL, PR_MODE_TCP, 0, NULL }, /* global option */ + + /* please keep these lines sorted ! 
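+ * (each entry below maps a log-format variable to its LOG_FMT_* type; as an + * example, "%ci:%cp" in a log-format string expands to the client address + * and port at logging time) 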
*/ + { "B", LOG_FMT_BYTES, PR_MODE_TCP, LW_BYTES, NULL }, /* bytes from server to client */ + { "CC", LOG_FMT_CCLIENT, PR_MODE_HTTP, LW_REQHDR, NULL }, /* client cookie */ + { "CS", LOG_FMT_CSERVER, PR_MODE_HTTP, LW_RSPHDR, NULL }, /* server cookie */ + { "H", LOG_FMT_HOSTNAME, PR_MODE_TCP, LW_INIT, NULL }, /* Hostname */ + { "ID", LOG_FMT_UNIQUEID, PR_MODE_TCP, LW_BYTES, NULL }, /* Unique ID */ + { "ST", LOG_FMT_STATUS, PR_MODE_TCP, LW_RESP, NULL }, /* status code */ + { "T", LOG_FMT_DATEGMT, PR_MODE_TCP, LW_INIT, NULL }, /* date GMT */ + { "Ta", LOG_FMT_Ta, PR_MODE_HTTP, LW_BYTES, NULL }, /* Time active (tr to end) */ + { "Tc", LOG_FMT_TC, PR_MODE_TCP, LW_BYTES, NULL }, /* Tc */ + { "Th", LOG_FMT_Th, PR_MODE_TCP, LW_BYTES, NULL }, /* Time handshake */ + { "Ti", LOG_FMT_Ti, PR_MODE_HTTP, LW_BYTES, NULL }, /* Time idle */ + { "Tl", LOG_FMT_DATELOCAL, PR_MODE_TCP, LW_INIT, NULL }, /* date local timezone */ + { "Tq", LOG_FMT_TQ, PR_MODE_HTTP, LW_BYTES, NULL }, /* Tq=Th+Ti+TR */ + { "Tr", LOG_FMT_Tr, PR_MODE_HTTP, LW_BYTES, NULL }, /* Tr */ + { "TR", LOG_FMT_TR, PR_MODE_HTTP, LW_BYTES, NULL }, /* Time to receive a valid request */ + { "Td", LOG_FMT_TD, PR_MODE_TCP, LW_BYTES, NULL }, /* Td = Tt - (Tq + Tw + Tc + Tr) */ + { "Ts", LOG_FMT_TS, PR_MODE_TCP, LW_INIT, NULL }, /* timestamp GMT */ + { "Tt", LOG_FMT_TT, PR_MODE_TCP, LW_BYTES, NULL }, /* Tt */ + { "Tu", LOG_FMT_TU, PR_MODE_TCP, LW_BYTES, NULL }, /* Tu = Tt -Ti */ + { "Tw", LOG_FMT_TW, PR_MODE_TCP, LW_BYTES, NULL }, /* Tw */ + { "U", LOG_FMT_BYTES_UP, PR_MODE_TCP, LW_BYTES, NULL }, /* bytes from client to server */ + { "ac", LOG_FMT_ACTCONN, PR_MODE_TCP, LW_BYTES, NULL }, /* actconn */ + { "b", LOG_FMT_BACKEND, PR_MODE_TCP, LW_INIT, NULL }, /* backend */ + { "bc", LOG_FMT_BECONN, PR_MODE_TCP, LW_BYTES, NULL }, /* beconn */ + { "bi", LOG_FMT_BACKENDIP, PR_MODE_TCP, LW_BCKIP, prepare_addrsource }, /* backend source ip */ + { "bp", LOG_FMT_BACKENDPORT, PR_MODE_TCP, LW_BCKIP, prepare_addrsource }, /* backend source port */ + { "bq", LOG_FMT_BCKQUEUE, PR_MODE_TCP, LW_BYTES, NULL }, /* backend_queue */ + { "ci", LOG_FMT_CLIENTIP, PR_MODE_TCP, LW_CLIP | LW_XPRT, NULL }, /* client ip */ + { "cp", LOG_FMT_CLIENTPORT, PR_MODE_TCP, LW_CLIP | LW_XPRT, NULL }, /* client port */ + { "f", LOG_FMT_FRONTEND, PR_MODE_TCP, LW_INIT, NULL }, /* frontend */ + { "fc", LOG_FMT_FECONN, PR_MODE_TCP, LW_BYTES, NULL }, /* feconn */ + { "fi", LOG_FMT_FRONTENDIP, PR_MODE_TCP, LW_FRTIP | LW_XPRT, NULL }, /* frontend ip */ + { "fp", LOG_FMT_FRONTENDPORT, PR_MODE_TCP, LW_FRTIP | LW_XPRT, NULL }, /* frontend port */ + { "ft", LOG_FMT_FRONTEND_XPRT, PR_MODE_TCP, LW_INIT, NULL }, /* frontend with transport mode */ + { "hr", LOG_FMT_HDRREQUEST, PR_MODE_TCP, LW_REQHDR, NULL }, /* header request */ + { "hrl", LOG_FMT_HDRREQUESTLIST, PR_MODE_TCP, LW_REQHDR, NULL }, /* header request list */ + { "hs", LOG_FMT_HDRRESPONS, PR_MODE_TCP, LW_RSPHDR, NULL }, /* header response */ + { "hsl", LOG_FMT_HDRRESPONSLIST, PR_MODE_TCP, LW_RSPHDR, NULL }, /* header response list */ + { "HM", LOG_FMT_HTTP_METHOD, PR_MODE_HTTP, LW_REQ, NULL }, /* HTTP method */ + { "HP", LOG_FMT_HTTP_PATH, PR_MODE_HTTP, LW_REQ, NULL }, /* HTTP relative or absolute path */ + { "HPO", LOG_FMT_HTTP_PATH_ONLY, PR_MODE_HTTP, LW_REQ, NULL }, /* HTTP path only (without host nor query string) */ + { "HQ", LOG_FMT_HTTP_QUERY, PR_MODE_HTTP, LW_REQ, NULL }, /* HTTP query */ + { "HU", LOG_FMT_HTTP_URI, PR_MODE_HTTP, LW_REQ, NULL }, /* HTTP full URI */ + { "HV", LOG_FMT_HTTP_VERSION, PR_MODE_HTTP, LW_REQ, NULL }, /* HTTP 
version */ + { "lc", LOG_FMT_LOGCNT, PR_MODE_TCP, LW_INIT, NULL }, /* log counter */ + { "ms", LOG_FMT_MS, PR_MODE_TCP, LW_INIT, NULL }, /* accept date millisecond */ + { "pid", LOG_FMT_PID, PR_MODE_TCP, LW_INIT, NULL }, /* log pid */ + { "r", LOG_FMT_REQ, PR_MODE_HTTP, LW_REQ, NULL }, /* request */ + { "rc", LOG_FMT_RETRIES, PR_MODE_TCP, LW_BYTES, NULL }, /* retries */ + { "rt", LOG_FMT_COUNTER, PR_MODE_TCP, LW_REQ, NULL }, /* request counter (HTTP or TCP session) */ + { "s", LOG_FMT_SERVER, PR_MODE_TCP, LW_SVID, NULL }, /* server */ + { "sc", LOG_FMT_SRVCONN, PR_MODE_TCP, LW_BYTES, NULL }, /* srv_conn */ + { "si", LOG_FMT_SERVERIP, PR_MODE_TCP, LW_SVIP, NULL }, /* server destination ip */ + { "sp", LOG_FMT_SERVERPORT, PR_MODE_TCP, LW_SVIP, NULL }, /* server destination port */ + { "sq", LOG_FMT_SRVQUEUE, PR_MODE_TCP, LW_BYTES, NULL }, /* srv_queue */ + { "sslc", LOG_FMT_SSL_CIPHER, PR_MODE_TCP, LW_XPRT, NULL }, /* client-side SSL ciphers */ + { "sslv", LOG_FMT_SSL_VERSION, PR_MODE_TCP, LW_XPRT, NULL }, /* client-side SSL protocol version */ + { "t", LOG_FMT_DATE, PR_MODE_TCP, LW_INIT, NULL }, /* date */ + { "tr", LOG_FMT_tr, PR_MODE_HTTP, LW_INIT, NULL }, /* date of start of request */ + { "trg",LOG_FMT_trg, PR_MODE_HTTP, LW_INIT, NULL }, /* date of start of request, GMT */ + { "trl",LOG_FMT_trl, PR_MODE_HTTP, LW_INIT, NULL }, /* date of start of request, local */ + { "ts", LOG_FMT_TERMSTATE, PR_MODE_TCP, LW_BYTES, NULL },/* termination state */ + { "tsc", LOG_FMT_TERMSTATE_CK, PR_MODE_TCP, LW_INIT, NULL },/* termination state */ + { 0, 0, 0, 0, NULL } +}; + +char httpclient_log_format[] = "%ci:%cp [%tr] %ft -/- %TR/%Tw/%Tc/%Tr/%Ta %ST %B %CC %CS %tsc %ac/%fc/%bc/%sc/%rc %sq/%bq %hr %hs %{+Q}r"; +char default_http_log_format[] = "%ci:%cp [%tr] %ft %b/%s %TR/%Tw/%Tc/%Tr/%Ta %ST %B %CC %CS %tsc %ac/%fc/%bc/%sc/%rc %sq/%bq %hr %hs %{+Q}r"; // default format +char default_https_log_format[] = "%ci:%cp [%tr] %ft %b/%s %TR/%Tw/%Tc/%Tr/%Ta %ST %B %CC %CS %tsc %ac/%fc/%bc/%sc/%rc %sq/%bq %hr %hs %{+Q}r %[fc_err]/%[ssl_fc_err,hex]/%[ssl_c_err]/%[ssl_c_ca_err]/%[ssl_fc_is_resumed] %[ssl_fc_sni]/%sslv/%sslc"; +char clf_http_log_format[] = "%{+Q}o %{-Q}ci - - [%trg] %r %ST %B \"\" \"\" %cp %ms %ft %b %s %TR %Tw %Tc %Tr %Ta %tsc %ac %fc %bc %sc %rc %sq %bq %CC %CS %hrl %hsl"; +char default_tcp_log_format[] = "%ci:%cp [%t] %ft %b/%s %Tw/%Tc/%Tt %B %ts %ac/%fc/%bc/%sc/%rc %sq/%bq"; +char *log_format = NULL; + +/* Default string used for structured-data part in RFC5424 formatted + * syslog messages. + */ +char default_rfc5424_sd_log_format[] = "- "; + +/* total number of dropped logs */ +unsigned int dropped_logs = 0; + +/* This is a global syslog message buffer, common to all outgoing + * messages. It contains only the data part. + */ +THREAD_LOCAL char *logline = NULL; + +/* A global syslog message buffer, common to all RFC5424 syslog messages. + * Currently, it is used for generating the structured-data part. + */ +THREAD_LOCAL char *logline_rfc5424 = NULL; + +struct logformat_var_args { + char *name; + int mask; +}; + +struct logformat_var_args var_args_list[] = { +// global + { "M", LOG_OPT_MANDATORY }, + { "Q", LOG_OPT_QUOTE }, + { "X", LOG_OPT_HEXA }, + { "E", LOG_OPT_ESC }, + { 0, 0 } +}; + +/* + * callback used to configure addr source retrieval + */ +int prepare_addrsource(struct logformat_node *node, struct proxy *curproxy) +{ + curproxy->options2 |= PR_O2_SRC_ADDR; + + return 0; +} + + +/* + * Parse args in a logformat_var. Returns 0 in error + * case, otherwise, it returns 1. 
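+ * For example, for a node written '%{+Q,-M}r', <args> is "+Q,-M": the 'Q' + * (quote) option gets set on the node and the 'M' (mandatory) option gets + * cleared. 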
+ */ +int parse_logformat_var_args(char *args, struct logformat_node *node, char **err) +{ + int i = 0; + int end = 0; + int flags = 0; // 1 = + 2 = - + char *sp = NULL; // start pointer + + if (args == NULL) { + memprintf(err, "internal error: parse_logformat_var_args() expects non null 'args'"); + return 0; + } + + while (1) { + if (*args == '\0') + end = 1; + + if (*args == '+') { + // add flag + sp = args + 1; + flags = 1; + } + if (*args == '-') { + // delete flag + sp = args + 1; + flags = 2; + } + + if (*args == '\0' || *args == ',') { + *args = '\0'; + for (i = 0; sp && var_args_list[i].name; i++) { + if (strcmp(sp, var_args_list[i].name) == 0) { + if (flags == 1) { + node->options |= var_args_list[i].mask; + break; + } else if (flags == 2) { + node->options &= ~var_args_list[i].mask; + break; + } + } + } + sp = NULL; + if (end) + break; + } + args++; + } + return 1; +} + +/* + * Parse a variable '%varname' or '%{args}varname' in log-format. The caller + * must pass the args part in the <arg> pointer with its length in <arg_len>, + * and varname with its length in <var> and <var_len> respectively. <arg> is + * ignored when arg_len is 0. Neither <var> nor <var_len> may be null. + * Returns false in error case and err is filled, otherwise returns true. + */ +int parse_logformat_var(char *arg, int arg_len, char *var, int var_len, struct proxy *curproxy, struct list *list_format, int *defoptions, char **err) +{ + int j; + struct logformat_node *node = NULL; + + for (j = 0; logformat_keywords[j].name; j++) { // search a log type + if (strlen(logformat_keywords[j].name) == var_len && + strncmp(var, logformat_keywords[j].name, var_len) == 0) { + if (logformat_keywords[j].mode != PR_MODE_HTTP || curproxy->mode == PR_MODE_HTTP) { + node = calloc(1, sizeof(*node)); + if (!node) { + memprintf(err, "out of memory error"); + goto error_free; + } + node->type = logformat_keywords[j].type; + node->options = *defoptions; + if (arg_len) { + node->arg = my_strndup(arg, arg_len); + if (!parse_logformat_var_args(node->arg, node, err)) + goto error_free; + } + if (node->type == LOG_FMT_GLOBAL) { + *defoptions = node->options; + free(node->arg); + free(node); + } else { + if (logformat_keywords[j].config_callback && + logformat_keywords[j].config_callback(node, curproxy) != 0) { + goto error_free; + } + curproxy->to_log |= logformat_keywords[j].lw; + LIST_APPEND(list_format, &node->list); + } + return 1; + } else { + memprintf(err, "format variable '%s' is reserved for HTTP mode", + logformat_keywords[j].name); + goto error_free; + } + } + } + + j = var[var_len]; + var[var_len] = 0; + memprintf(err, "no such format variable '%s'. If you wanted to emit the '%%' character verbatim, you need to use '%%%%'", var); + var[var_len] = j; + + error_free: + if (node) { + free(node->arg); + free(node); + } + return 0; +} + +/* + * push to the logformat linked list + * + * start: start pointer + * end: end text pointer + * type: string type + * list_format: destination list + * + * LOG_TEXT: copy chars from start to end excluding end. 
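+ * For example, given the format string "in: %ci" (without the merge-spaces + * option), the literal "in: " is pushed here as a text node, while "%ci" is + * handled separately by parse_logformat_var(). 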
+ +*/ +int add_to_logformat_list(char *start, char *end, int type, struct list *list_format, char **err) +{ + char *str; + + if (type == LF_TEXT) { /* type text */ + struct logformat_node *node = calloc(1, sizeof(*node)); + if (!node) { + memprintf(err, "out of memory error"); + return 0; + } + str = calloc(1, end - start + 1); + if (!str) { + memprintf(err, "out of memory error"); + free(node); + return 0; + } + strncpy(str, start, end - start); + str[end - start] = '\0'; + node->arg = str; + node->type = LOG_FMT_TEXT; // type string + LIST_APPEND(list_format, &node->list); + } else if (type == LF_SEPARATOR) { + struct logformat_node *node = calloc(1, sizeof(*node)); + if (!node) { + memprintf(err, "out of memory error"); + return 0; + } + node->type = LOG_FMT_SEPARATOR; + LIST_APPEND(list_format, &node->list); + } + return 1; +} + +/* + * Parse the sample fetch expression <text> and add a node to <list_format> upon + * success. At the moment, sample converters are not yet supported but fetch arguments + * should work. The curpx->conf.args.ctx must be set by the caller. If an end pointer + * is passed in <endptr>, it will be updated with the pointer to the first character + * not part of the sample expression. + * + * On error, the function returns 0, otherwise it returns 1. + */ +int add_sample_to_logformat_list(char *text, char *arg, int arg_len, struct proxy *curpx, struct list *list_format, int options, int cap, char **err, char **endptr) +{ + char *cmd[2]; + struct sample_expr *expr = NULL; + struct logformat_node *node = NULL; + int cmd_arg; + + cmd[0] = text; + cmd[1] = ""; + cmd_arg = 0; + + expr = sample_parse_expr(cmd, &cmd_arg, curpx->conf.args.file, curpx->conf.args.line, err, + &curpx->conf.args, endptr); + if (!expr) { + memprintf(err, "failed to parse sample expression <%s> : %s", text, *err); + goto error_free; + } + + node = calloc(1, sizeof(*node)); + if (!node) { + memprintf(err, "out of memory error"); + goto error_free; + } + node->type = LOG_FMT_EXPR; + node->expr = expr; + node->options = options; + + if (arg_len) { + node->arg = my_strndup(arg, arg_len); + if (!parse_logformat_var_args(node->arg, node, err)) + goto error_free; + } + if (expr->fetch->val & cap & SMP_VAL_REQUEST) + node->options |= LOG_OPT_REQ_CAP; /* fetch method is request-compatible */ + + if (expr->fetch->val & cap & SMP_VAL_RESPONSE) + node->options |= LOG_OPT_RES_CAP; /* fetch method is response-compatible */ + + if (!(expr->fetch->val & cap)) { + memprintf(err, "sample fetch <%s> may not be reliably used here because it needs '%s', which is not available in this context", + text, sample_src_names(expr->fetch->use)); + goto error_free; + } + + if ((options & LOG_OPT_HTTP) && (expr->fetch->use & (SMP_USE_L6REQ|SMP_USE_L6RES))) { + ha_warning("parsing [%s:%d] : L6 sample fetch <%s> ignored in HTTP log-format string.\n", + curpx->conf.args.file, curpx->conf.args.line, text); + } + + /* check if we need to allocate an http_txn struct for HTTP parsing */ + /* Note, we may also need to set curpx->to_log with certain fetches */ + curpx->http_needed |= !!(expr->fetch->use & SMP_USE_HTTP_ANY); + + /* FIXME: temporary workaround for missing LW_XPRT and LW_REQ flags + * needed with some sample fetches (e.g.: ssl*). We always set them for + * now, but this workaround should go away once sample capabilities are + * relied upon instead. 
+ */ + curpx->to_log |= LW_XPRT; + if (curpx->http_needed) + curpx->to_log |= LW_REQ; + LIST_APPEND(list_format, &node->list); + return 1; + + error_free: + release_sample_expr(expr); + if (node) { + free(node->arg); + free(node); + } + return 0; +} + +/* + * Parse the log_format string and fill a linked list. + * Variable name are preceded by % and composed by characters [a-zA-Z0-9]* : %varname + * You can set arguments using { } : %{many arguments}varname. + * The curproxy->conf.args.ctx must be set by the caller. + * + * fmt: the string to parse + * curproxy: the proxy affected + * list_format: the destination list + * options: LOG_OPT_* to force on every node + * cap: all SMP_VAL_* flags supported by the consumer + * + * The function returns 1 in success case, otherwise, it returns 0 and err is filled. + */ +int parse_logformat_string(const char *fmt, struct proxy *curproxy, struct list *list_format, int options, int cap, char **err) +{ + char *sp, *str, *backfmt; /* start pointer for text parts */ + char *arg = NULL; /* start pointer for args */ + char *var = NULL; /* start pointer for vars */ + int arg_len = 0; + int var_len = 0; + int cformat; /* current token format */ + int pformat; /* previous token format */ + struct logformat_node *tmplf, *back; + + sp = str = backfmt = strdup(fmt); + if (!str) { + memprintf(err, "out of memory error"); + return 0; + } + curproxy->to_log |= LW_INIT; + + /* flush the list first. */ + list_for_each_entry_safe(tmplf, back, list_format, list) { + LIST_DELETE(&tmplf->list); + release_sample_expr(tmplf->expr); + free(tmplf->arg); + free(tmplf); + } + + for (cformat = LF_INIT; cformat != LF_END; str++) { + pformat = cformat; + + if (!*str) + cformat = LF_END; // preset it to save all states from doing this + + /* The principle of the two-step state machine below is to first detect a change, and + * second have all common paths processed at one place. The common paths are the ones + * encountered in text areas (LF_INIT, LF_TEXT, LF_SEPARATOR) and at the end (LF_END). + * We use the common LF_INIT state to dispatch to the different final states. + */ + switch (pformat) { + case LF_STARTVAR: // text immediately following a '%' + arg = NULL; var = NULL; + arg_len = var_len = 0; + if (*str == '{') { // optional argument + cformat = LF_STARG; + arg = str + 1; + } + else if (*str == '[') { + cformat = LF_STEXPR; + var = str + 1; // store expr in variable name + } + else if (isalpha((unsigned char)*str)) { // variable name + cformat = LF_VAR; + var = str; + } + else if (*str == '%') + cformat = LF_TEXT; // convert this character to a literal (useful for '%') + else if (isdigit((unsigned char)*str) || *str == ' ' || *str == '\t') { + /* single '%' followed by blank or digit, send them both */ + cformat = LF_TEXT; + pformat = LF_TEXT; /* finally we include the previous char as well */ + sp = str - 1; /* send both the '%' and the current char */ + memprintf(err, "unexpected variable name near '%c' at position %d line : '%s'. 
Maybe you want to write a single '%%', use the syntax '%%%%'", + *str, (int)(str - backfmt), fmt); + goto fail; + + } + else + cformat = LF_INIT; // handle other cases of literals + break; + + case LF_STARG: // text immediately following '%{' + if (*str == '}') { // end of arg + cformat = LF_EDARG; + arg_len = str - arg; + *str = 0; // used for reporting errors + } + break; + + case LF_EDARG: // text immediately following '%{arg}' + if (*str == '[') { + cformat = LF_STEXPR; + var = str + 1; // store expr in variable name + break; + } + else if (isalnum((unsigned char)*str)) { // variable name + cformat = LF_VAR; + var = str; + break; + } + memprintf(err, "parse argument modifier without variable name near '%%{%s}'", arg); + goto fail; + + case LF_STEXPR: // text immediately following '%[' + /* the whole sample expression is parsed at once, + * returning the pointer to the first character not + * part of the expression, which MUST be the closing + * square bracket. + */ + if (!add_sample_to_logformat_list(var, arg, arg_len, curproxy, list_format, options, cap, err, &str)) + goto fail; + + if (*str == ']') { + // end of arg, go on with next state + cformat = pformat = LF_EDEXPR; + sp = str; + } + else { + char c = *str; + *str = 0; + if (isprint((unsigned char)c)) + memprintf(err, "expected ']' after '%s', but found '%c'", var, c); + else + memprintf(err, "missing ']' after '%s'", var); + goto fail; + } + break; + + case LF_VAR: // text part of a variable name + var_len = str - var; + if (!isalnum((unsigned char)*str)) + cformat = LF_INIT; // not variable name anymore + break; + + default: // LF_INIT, LF_TEXT, LF_SEPARATOR, LF_END, LF_EDEXPR + cformat = LF_INIT; + } + + if (cformat == LF_INIT) { /* resynchronize state to text/sep/startvar */ + switch (*str) { + case '%': cformat = LF_STARTVAR; break; + case 0 : cformat = LF_END; break; + case ' ': + if (options & LOG_OPT_MERGE_SPACES) { + cformat = LF_SEPARATOR; + break; + } + __fallthrough; + default : cformat = LF_TEXT; break; + } + } + + if (cformat != pformat || pformat == LF_SEPARATOR) { + switch (pformat) { + case LF_VAR: + if (!parse_logformat_var(arg, arg_len, var, var_len, curproxy, list_format, &options, err)) + goto fail; + break; + case LF_TEXT: + case LF_SEPARATOR: + if (!add_to_logformat_list(sp, str, pformat, list_format, err)) + goto fail; + break; + } + sp = str; /* new start of text at every state switch and at every separator */ + } + } + + if (pformat == LF_STARTVAR || pformat == LF_STARG || pformat == LF_STEXPR) { + memprintf(err, "truncated line after '%s'", var ? var : arg ? arg : "%"); + goto fail; + } + free(backfmt); + + return 1; + fail: + free(backfmt); + return 0; +} + +/* + * Parse the first range of indexes from a string made of a list of comma-separated + * ranges of indexes. Note that an index may be considered as a particular range + * with a high limit equal to the low limit. 
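+ * For example, with the string "2-4,6", a first call returns low=2, high=4 + * and leaves <arg> pointing to "6"; the next call then yields low=high=6. 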
+ */ +int get_logger_smp_range(unsigned int *low, unsigned int *high, char **arg, char **err) +{ + char *end, *p; + + *low = *high = 0; + + p = *arg; + end = strchr(p, ','); + if (!end) + end = p + strlen(p); + + *high = *low = read_uint((const char **)&p, end); + if (!*low || (p != end && *p != '-')) + goto err; + + if (p == end) + goto done; + + p++; + *high = read_uint((const char **)&p, end); + if (!*high || *high <= *low || p != end) + goto err; + + done: + if (*end == ',') + end++; + *arg = end; + return 1; + + err: + memprintf(err, "wrong sample range '%s'", *arg); + return 0; +} + +/* + * Returns 1 if the range defined by <low> and <high> overlaps + * one of them in <rgs> array of ranges with <sz> the size of this + * array, 0 if not. + */ +int smp_log_ranges_overlap(struct smp_log_range *rgs, size_t sz, + unsigned int low, unsigned int high, char **err) +{ + size_t i; + + for (i = 0; i < sz; i++) { + if ((low >= rgs[i].low && low <= rgs[i].high) || + (high >= rgs[i].low && high <= rgs[i].high)) { + memprintf(err, "ranges are overlapping"); + return 1; + } + } + + return 0; +} + +int smp_log_range_cmp(const void *a, const void *b) +{ + const struct smp_log_range *rg_a = a; + const struct smp_log_range *rg_b = b; + + if (rg_a->high < rg_b->low) + return -1; + else if (rg_a->low > rg_b->high) + return 1; + + return 0; +} + +/* helper func */ +static inline void init_log_target(struct log_target *target) +{ + target->type = 0; + target->flags = LOG_TARGET_FL_NONE; + target->addr = NULL; + target->resolv_name = NULL; +} + +void deinit_log_target(struct log_target *target) +{ + ha_free(&target->addr); + if (!(target->flags & LOG_TARGET_FL_RESOLVED)) + ha_free(&target->resolv_name); +} + +/* returns 0 on failure and positive value on success */ +static int dup_log_target(struct log_target *def, struct log_target *cpy) +{ + BUG_ON((def->flags & LOG_TARGET_FL_RESOLVED)); /* postparsing already done, invalid use */ + init_log_target(cpy); + if (def->addr) { + cpy->addr = malloc(sizeof(*cpy->addr)); + if (!cpy->addr) + goto error; + *cpy->addr = *def->addr; + } + if (def->resolv_name) { + cpy->resolv_name = strdup(def->resolv_name); + if (!cpy->resolv_name) + goto error; + } + cpy->type = def->type; + return 1; + error: + deinit_log_target(cpy); + return 0; +} + +/* must be called under the lbprm lock */ +static void _log_backend_srv_queue(struct server *srv) +{ + struct proxy *p = srv->proxy; + + /* queue the server in the proxy lb array to make it easily searchable by + * log-balance algorithms. Here we use the srv array as a general server + * pool of in-use servers, lookup is done using a relative positional id + * (array is contiguous) + * + * We use the avail server list to get a quick hand on available servers + * (those that are UP) + */ + if (srv->flags & SRV_F_BACKUP) { + if (!p->srv_act) + p->lbprm.log.srv[p->srv_bck] = srv; + p->srv_bck++; + } + else { + if (!p->srv_act) { + /* we will be switching to act tree in LB logic, thus we need to + * reset the lastid + */ + HA_ATOMIC_STORE(&p->lbprm.log.lastid, 0); + } + p->lbprm.log.srv[p->srv_act] = srv; + p->srv_act++; + } + /* append the server to the list of available servers */ + LIST_APPEND(&p->lbprm.log.avail, &srv->lb_list); + + p->lbprm.tot_weight = (p->srv_act) ? 
p->srv_act : p->srv_bck; +} + +static void log_backend_srv_up(struct server *srv) +{ + struct proxy *p __maybe_unused = srv->proxy; + + if (!srv_lb_status_changed(srv)) + return; /* nothing to do */ + if (srv_currently_usable(srv) || !srv_willbe_usable(srv)) + return; /* false alarm */ + + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + _log_backend_srv_queue(srv); + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); +} + +/* must be called under lbprm lock */ +static void _log_backend_srv_recalc(struct proxy *p) +{ + unsigned int it = 0; + struct server *cur_srv; + + list_for_each_entry(cur_srv, &p->lbprm.log.avail, lb_list) { + uint8_t backup = cur_srv->flags & SRV_F_BACKUP; + + if ((!p->srv_act && backup) || + (p->srv_act && !backup)) + p->lbprm.log.srv[it++] = cur_srv; + } +} + +/* must be called under the lbprm lock */ +static void _log_backend_srv_dequeue(struct server *srv) +{ + struct proxy *p = srv->proxy; + + if (srv->flags & SRV_F_BACKUP) { + p->srv_bck--; + } + else { + p->srv_act--; + if (!p->srv_act) { + /* we will be switching to bck tree in LB logic, thus we need to + * reset the lastid + */ + HA_ATOMIC_STORE(&p->lbprm.log.lastid, 0); + } + } + + /* remove the srv from the list of available (UP) servers */ + LIST_DELETE(&srv->lb_list); + + /* reconstruct the array of usable servers */ + _log_backend_srv_recalc(p); + + p->lbprm.tot_weight = (p->srv_act) ? p->srv_act : p->srv_bck; +} + +static void log_backend_srv_down(struct server *srv) +{ + struct proxy *p __maybe_unused = srv->proxy; + + if (!srv_lb_status_changed(srv)) + return; /* nothing to do */ + if (!srv_currently_usable(srv) || srv_willbe_usable(srv)) + return; /* false alarm */ + + HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock); + _log_backend_srv_dequeue(srv); + HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock); +} + +/* check that current configuration is compatible with "mode log" */ +static int _postcheck_log_backend_compat(struct proxy *be) +{ + int err_code = ERR_NONE; + + if (!LIST_ISEMPTY(&be->tcp_req.inspect_rules) || + !LIST_ISEMPTY(&be->tcp_req.l4_rules) || + !LIST_ISEMPTY(&be->tcp_req.l5_rules)) { + ha_warning("Cannot use tcp-request rules with 'mode log' in %s '%s'. They will be ignored.\n", + proxy_type_str(be), be->id); + + err_code |= ERR_WARN; + free_act_rules(&be->tcp_req.inspect_rules); + free_act_rules(&be->tcp_req.l4_rules); + free_act_rules(&be->tcp_req.l5_rules); + } + if (!LIST_ISEMPTY(&be->tcp_rep.inspect_rules)) { + ha_warning("Cannot use tcp-response rules with 'mode log' in %s '%s'. They will be ignored.\n", + proxy_type_str(be), be->id); + + err_code |= ERR_WARN; + free_act_rules(&be->tcp_rep.inspect_rules); + } + if (be->table) { + ha_warning("Cannot use stick table with 'mode log' in %s '%s'. It will be ignored.\n", + proxy_type_str(be), be->id); + + err_code |= ERR_WARN; + stktable_deinit(be->table); + ha_free(&be->table); + } + if (!LIST_ISEMPTY(&be->storersp_rules) || + !LIST_ISEMPTY(&be->sticking_rules)) { + ha_warning("Cannot use sticking rules with 'mode log' in %s '%s'. They will be ignored.\n", + proxy_type_str(be), be->id); + + err_code |= ERR_WARN; + free_stick_rules(&be->storersp_rules); + free_stick_rules(&be->sticking_rules); + } + if (isttest(be->server_id_hdr_name)) { + ha_warning("Cannot set \"server_id_hdr_name\" with 'mode log' in %s '%s'. It will be ignored.\n", + proxy_type_str(be), be->id); + + err_code |= ERR_WARN; + istfree(&be->server_id_hdr_name); + } + if (be->dyncookie_key) { + ha_warning("Cannot set \"dynamic-cookie-key\" with 'mode log' in %s '%s'. 
It will be ignored.\n", + proxy_type_str(be), be->id); + + err_code |= ERR_WARN; + ha_free(&be->dyncookie_key); + } + if (!LIST_ISEMPTY(&be->server_rules)) { + ha_warning("Cannot use \"use-server\" rules with 'mode log' in %s '%s'. They will be ignored.\n", + proxy_type_str(be), be->id); + + err_code |= ERR_WARN; + free_server_rules(&be->server_rules); + } + return err_code; +} + +static int postcheck_log_backend(struct proxy *be) +{ + char *msg = NULL; + struct server *srv; + int err_code = ERR_NONE; + int target_type = -1; // -1 is unused in log_tgt enum + + if (be->mode != PR_MODE_SYSLOG || + (be->flags & (PR_FL_DISABLED|PR_FL_STOPPED))) + return ERR_NONE; /* nothing to do */ + + err_code |= _postcheck_log_backend_compat(be); + if (err_code & ERR_CODE) + return err_code; + + /* First time encountering this log backend, perform some init + */ + be->lbprm.set_server_status_up = log_backend_srv_up; + be->lbprm.set_server_status_down = log_backend_srv_down; + be->lbprm.log.lastid = 0; /* initial value */ + LIST_INIT(&be->lbprm.log.avail); + + /* alloc srv array (it will be used for active and backup server lists in turn, + * so we ensure that the longest list will fit + */ + be->lbprm.log.srv = calloc(MAX(be->srv_act, be->srv_bck), + sizeof(*be->lbprm.log.srv)); + + if (!be->lbprm.log.srv ) { + memprintf(&msg, "memory error when allocating server array (%d entries)", + MAX(be->srv_act, be->srv_bck)); + err_code |= ERR_ALERT | ERR_FATAL; + goto end; + } + + /* reinit srv counters, lbprm queueing will recount */ + be->srv_act = 0; + be->srv_bck = 0; + + /* "log-balance hash" needs to compile its expression */ + if ((be->lbprm.algo & BE_LB_ALGO) == BE_LB_ALGO_LH) { + struct sample_expr *expr; + char *expr_str = NULL; + char *err_str = NULL; + int idx = 0; + + /* only map-based hash method is supported for now */ + if ((be->lbprm.algo & BE_LB_HASH_TYPE) != BE_LB_HASH_MAP) { + memprintf(&msg, "unsupported hash method (from \"hash-type\")"); + err_code |= ERR_ALERT | ERR_FATAL; + goto end; + } + + /* a little bit of explanation about what we're going to do here: + * as the user gave us a list of converters, instead of the fetch+conv list + * tuple as we're used to, we need to insert a dummy fetch at the start of + * the converter list so that sample_parse_expr() is able to properly parse + * the expr. We're explicitly using str() as dummy fetch, since the input + * sample that will be passed to the converter list at runtime will be a + * string (the log message about to be sent). Doing so allows sample_parse_expr() + * to ensure that the provided converters will be compatible with string type. + */ + memprintf(&expr_str, "str(dummy),%s", be->lbprm.arg_str); + if (!expr_str) { + memprintf(&msg, "memory error during converter list argument parsing (from \"log-balance hash\")"); + err_code |= ERR_ALERT | ERR_FATAL; + goto end; + } + expr = sample_parse_expr((char*[]){expr_str, NULL}, &idx, + be->conf.file, + be->conf.line, + &err_str, NULL, NULL); + if (!expr) { + memprintf(&msg, "%s (from converter list argument in \"log-balance hash\")", err_str); + ha_free(&err_str); + err_code |= ERR_ALERT | ERR_FATAL; + ha_free(&expr_str); + goto end; + } + + /* We expect the log_message->conv_list expr to resolve as a binary-compatible + * value because its output will be passed to gen_hash() to compute the hash. + * + * So we check the last converter's output type to ensure that it can be + * converted into the expected type. 
Invalid output type will result in an + * error to prevent unexpected results during runtime. + */ + if (sample_casts[smp_expr_output_type(expr)][SMP_T_BIN] == NULL) { + memprintf(&msg, "invalid output type at the end of converter list for \"log-balance hash\" directive"); + err_code |= ERR_ALERT | ERR_FATAL; + release_sample_expr(expr); + ha_free(&expr_str); + goto end; + } + ha_free(&expr_str); + be->lbprm.expr = expr; + } + + /* finish the initialization of the proxy's servers */ + srv = be->srv; + while (srv) { + BUG_ON(srv->log_target); + BUG_ON(srv->addr_type.proto_type != PROTO_TYPE_DGRAM && + srv->addr_type.proto_type != PROTO_TYPE_STREAM); + + srv->log_target = malloc(sizeof(*srv->log_target)); + if (!srv->log_target) { + memprintf(&msg, "memory error when allocating log server '%s'", srv->id); + err_code |= ERR_ALERT | ERR_FATAL; + goto end; + } + init_log_target(srv->log_target); + if (srv->addr_type.proto_type == PROTO_TYPE_DGRAM) { + srv->log_target->type = LOG_TARGET_DGRAM; + /* Try to allocate log target addr (only used in DGRAM mode) */ + srv->log_target->addr = calloc(1, sizeof(*srv->log_target->addr)); + if (!srv->log_target->addr) { + memprintf(&msg, "memory error when allocating log server '%s'", srv->id); + err_code |= ERR_ALERT | ERR_FATAL; + goto end; + } + /* We must initialize it with known addr:svc_port, it will then + * be updated automatically by the server API for runtime changes + */ + ipcpy(&srv->addr, srv->log_target->addr); + set_host_port(srv->log_target->addr, srv->svc_port); + } + else { + /* for now the BUFFER type only supports TCP servers, so it's + * almost explicit + */ + srv->log_target->type = LOG_TARGET_BUFFER; + srv->log_target->sink = sink_new_from_srv(srv, "log backend"); + if (!srv->log_target->sink) { + memprintf(&msg, "error when creating sink from '%s' log server", srv->id); + err_code |= ERR_ALERT | ERR_FATAL; + goto end; + } + } + + if (target_type == -1) + target_type = srv->log_target->type; + + if (target_type != srv->log_target->type) { + memprintf(&msg, "cannot mix server types within a log backend, '%s' srv's network type differs from previous server", srv->id); + err_code |= ERR_ALERT | ERR_FATAL; + goto end; + } + srv->log_target->flags |= LOG_TARGET_FL_RESOLVED; + srv->cur_eweight = 1; /* ignore weights, all servers have the same weight */ + _log_backend_srv_queue(srv); + srv = srv->next; + } + end: + if (err_code & ERR_CODE) { + ha_free(&be->lbprm.log.srv); /* free log servers array */ + ha_alert("log backend '%s': failed to initialize: %s.\n", be->id, msg); + ha_free(&msg); + } + + return err_code; +} + +/* resolves a single logger entry (it is expected to be called + * at postparsing stage) + * + * <logger> is the parent logger used for implicit settings + * + * Returns err_code which defaults to ERR_NONE and can be set to a combination + * of ERR_WARN, ERR_ALERT, ERR_FATAL and ERR_ABORT in case of errors. + * <msg> could be set at any time (it will usually be set on error, but + * could also be set when no error occurred to report a diag warning), thus it is + * up to the caller to check it and to free it. 
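+ * For example, a directive such as "log backend@mylog ..." (name chosen for + * illustration) only records the name at parse time; resolving it to the + * actual backend proxy happens here. 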
+ */ +int resolve_logger(struct logger *logger, char **msg) +{ + struct log_target *target = &logger->target; + int err_code = ERR_NONE; + + if (target->type == LOG_TARGET_BUFFER) + err_code = sink_resolve_logger_buffer(logger, msg); + else if (target->type == LOG_TARGET_BACKEND) { + struct proxy *be; + + /* special case */ + be = proxy_find_by_name(target->be_name, PR_CAP_BE, 0); + if (!be) { + memprintf(msg, "uses unknown log backend '%s'", target->be_name); + err_code |= ERR_ALERT | ERR_FATAL; + goto end; + } + else if (be->mode != PR_MODE_SYSLOG) { + memprintf(msg, "uses incompatible log backend '%s'", target->be_name); + err_code |= ERR_ALERT | ERR_FATAL; + goto end; + } + ha_free(&target->be_name); /* backend is resolved and will replace name hint */ + target->be = be; + } + + end: + target->flags |= LOG_TARGET_FL_RESOLVED; + + return err_code; +} + +/* tries to duplicate <def> logger + * + * Returns the newly allocated and duplicated logger or NULL + * in case of error. + */ +struct logger *dup_logger(struct logger *def) +{ + struct logger *cpy = malloc(sizeof(*cpy)); + + if (!cpy) + return NULL; + + /* copy everything that can be easily copied */ + memcpy(cpy, def, sizeof(*cpy)); + + /* default values */ + cpy->conf.file = NULL; + LIST_INIT(&cpy->list); + + /* special members */ + if (dup_log_target(&def->target, &cpy->target) == 0) + goto error; + if (def->conf.file) { + cpy->conf.file = strdup(def->conf.file); + if (!cpy->conf.file) + goto error; + } + + /* inherit from original reference if set */ + cpy->ref = (def->ref) ? def->ref : def; + + return cpy; + + error: + free_logger(cpy); + return NULL; +} + +/* frees <logger> after freeing all of its allocated fields. The + * logger must not belong to a list anymore. <logger> may be NULL, which is + * silently ignored. + */ +void free_logger(struct logger *logger) +{ + if (!logger) + return; + + BUG_ON(LIST_INLIST(&logger->list)); + ha_free(&logger->conf.file); + deinit_log_target(&logger->target); + free(logger); +} + +/* Parses a single log target. + * Returns 0 on failure and a positive value on success. + */ +static int parse_log_target(char *raw, struct log_target *target, char **err) +{ + int port1, port2, fd; + struct protocol *proto; + struct sockaddr_storage *sk; + + init_log_target(target); + // target addr is NULL at this point + + if (strncmp(raw, "ring@", 5) == 0) { + target->type = LOG_TARGET_BUFFER; + target->ring_name = strdup(raw + 5); + goto done; + } + else if (strncmp(raw, "backend@", 8) == 0) { + target->type = LOG_TARGET_BACKEND; + target->be_name = strdup(raw + 8); + goto done; + } + + /* try to allocate log target addr */ + target->addr = malloc(sizeof(*target->addr)); + if (!target->addr) { + memprintf(err, "memory error"); + goto error; + } + + target->type = LOG_TARGET_DGRAM; // default type + + /* parse the target address */ + sk = str2sa_range(raw, NULL, &port1, &port2, &fd, &proto, NULL, + err, NULL, NULL, + PA_O_RESOLVE | PA_O_PORT_OK | PA_O_RAW_FD | PA_O_DGRAM | PA_O_STREAM | PA_O_DEFAULT_DGRAM); + if (!sk) + goto error; + if (fd != -1) + target->type = LOG_TARGET_FD; + *target->addr = *sk; + + if (sk->ss_family == AF_INET || sk->ss_family == AF_INET6) { + if (!port1) + set_host_port(target->addr, SYSLOG_PORT); + } + + if (proto && proto->xprt_type == PROTO_TYPE_STREAM) { + static unsigned long ring_ids; + + /* Implicit sink buffer will be initialized in post_check + * (target->addr is set in this case) + */ + target->type = LOG_TARGET_BUFFER; + /* compute unique name for the ring */ + memprintf(&target->ring_name, "ring#%lu",
++ring_ids); + } + + done: + return 1; + error: + deinit_log_target(target); + return 0; +} + +/* + * Parses the "log" keyword and updates the <loggers> list accordingly. + * + * When <do_del> is set, it means the "no log" line was parsed, so all log + * servers in <loggers> are released. + * + * Otherwise, we try to parse the "log" line. First of all, when the list is not + * the global one, we look for the parameter "global". If we find it, + * global.loggers is copied. Otherwise we parse each argument. + * + * The function returns 1 on success; otherwise it returns 0 and <err> is + * filled. + */ +int parse_logger(char **args, struct list *loggers, int do_del, const char *file, int linenum, char **err) +{ + struct smp_log_range *smp_rgs = NULL; + struct logger *logger = NULL; + int cur_arg; + + /* + * "no log": delete previously inherited or defined syslog + * servers. + */ + if (do_del) { + struct logger *back; + + if (*(args[1]) != 0) { + memprintf(err, "'no log' does not expect arguments"); + goto error; + } + + list_for_each_entry_safe(logger, back, loggers, list) { + LIST_DEL_INIT(&logger->list); + free_logger(logger); + } + return 1; + } + + /* + * "log global": copy the global.loggers linked list to the end of the loggers + * list. But first, we check that (loggers != global.loggers). + */ + if (*(args[1]) && *(args[2]) == 0 && strcmp(args[1], "global") == 0) { + if (loggers == &global.loggers) { + memprintf(err, "'global' is not supported for a global syslog server"); + goto error; + } + list_for_each_entry(logger, &global.loggers, list) { + struct logger *node; + + list_for_each_entry(node, loggers, list) { + if (node->ref == logger) + goto skip_logger; + } + + /* duplicate logger from global */ + node = dup_logger(logger); + if (!node) { + memprintf(err, "out of memory error"); + goto error; + } + + /* manually override some values */ + ha_free(&node->conf.file); + node->conf.file = strdup(file); + node->conf.line = linenum; + + /* add to list */ + LIST_APPEND(loggers, &node->list); + + skip_logger: + continue; + } + return 1; + } + + /* + * "log <address> ...": parse a syslog server line + */ + if (*(args[1]) == 0 || *(args[2]) == 0) { + memprintf(err, "expects <address> and <facility> %s as arguments", + ((loggers == &global.loggers) ?
"" : "or global")); + goto error; + } + + /* take care of "stdout" and "stderr" as regular aliases for fd@1 / fd@2 */ + if (strcmp(args[1], "stdout") == 0) + args[1] = "fd@1"; + else if (strcmp(args[1], "stderr") == 0) + args[1] = "fd@2"; + + logger = calloc(1, sizeof(*logger)); + if (!logger) { + memprintf(err, "out of memory"); + goto error; + } + LIST_INIT(&logger->list); + logger->conf.file = strdup(file); + logger->conf.line = linenum; + + /* skip address for now, it will be parsed at the end */ + cur_arg = 2; + + /* just after the address, a length may be specified */ + logger->maxlen = MAX_SYSLOG_LEN; + if (strcmp(args[cur_arg], "len") == 0) { + int len = atoi(args[cur_arg+1]); + if (len < 80 || len > 65535) { + memprintf(err, "invalid log length '%s', must be between 80 and 65535", + args[cur_arg+1]); + goto error; + } + logger->maxlen = len; + cur_arg += 2; + } + if (logger->maxlen > global.max_syslog_len) + global.max_syslog_len = logger->maxlen; + + /* after the length, a format may be specified */ + if (strcmp(args[cur_arg], "format") == 0) { + logger->format = get_log_format(args[cur_arg+1]); + if (logger->format == LOG_FORMAT_UNSPEC) { + memprintf(err, "unknown log format '%s'", args[cur_arg+1]); + goto error; + } + cur_arg += 2; + } + + if (strcmp(args[cur_arg], "sample") == 0) { + unsigned low, high; + char *p, *beg, *end, *smp_sz_str; + size_t smp_rgs_sz = 0, smp_sz = 0, new_smp_sz; + + p = args[cur_arg+1]; + smp_sz_str = strchr(p, ':'); + if (!smp_sz_str) { + memprintf(err, "Missing sample size"); + goto error; + } + + *smp_sz_str++ = '\0'; + + end = p + strlen(p); + + while (p != end) { + if (!get_logger_smp_range(&low, &high, &p, err)) + goto error; + + if (smp_rgs && smp_log_ranges_overlap(smp_rgs, smp_rgs_sz, low, high, err)) + goto error; + + smp_rgs = my_realloc2(smp_rgs, (smp_rgs_sz + 1) * sizeof *smp_rgs); + if (!smp_rgs) { + memprintf(err, "out of memory error"); + goto error; + } + + smp_rgs[smp_rgs_sz].low = low; + smp_rgs[smp_rgs_sz].high = high; + smp_rgs[smp_rgs_sz].sz = high - low + 1; + if (smp_rgs[smp_rgs_sz].high > smp_sz) + smp_sz = smp_rgs[smp_rgs_sz].high; + smp_rgs_sz++; + } + + if (smp_rgs == NULL) { + memprintf(err, "no sampling ranges given"); + goto error; + } + + beg = smp_sz_str; + end = beg + strlen(beg); + new_smp_sz = read_uint((const char **)&beg, end); + if (!new_smp_sz || beg != end) { + memprintf(err, "wrong sample size '%s' for sample range '%s'", + smp_sz_str, args[cur_arg+1]); + goto error; + } + + if (new_smp_sz < smp_sz) { + memprintf(err, "sample size %zu should be greater or equal to " + "%zu the maximum of the high ranges limits", + new_smp_sz, smp_sz); + goto error; + } + smp_sz = new_smp_sz; + + /* Let's order <smp_rgs> array. 
*/ + qsort(smp_rgs, smp_rgs_sz, sizeof(struct smp_log_range), smp_log_range_cmp); + + logger->lb.smp_rgs = smp_rgs; + logger->lb.smp_rgs_sz = smp_rgs_sz; + logger->lb.smp_sz = smp_sz; + + cur_arg += 2; + } + + /* parse the facility */ + logger->facility = get_log_facility(args[cur_arg]); + if (logger->facility < 0) { + memprintf(err, "unknown log facility '%s'", args[cur_arg]); + goto error; + } + cur_arg++; + + /* parse the max syslog level (default: debug) */ + logger->level = 7; + if (*(args[cur_arg])) { + logger->level = get_log_level(args[cur_arg]); + if (logger->level < 0) { + memprintf(err, "unknown optional log level '%s'", args[cur_arg]); + goto error; + } + cur_arg++; + } + + /* parse the limit syslog level (default: emerg) */ + logger->minlvl = 0; + if (*(args[cur_arg])) { + logger->minlvl = get_log_level(args[cur_arg]); + if (logger->minlvl < 0) { + memprintf(err, "unknown optional minimum log level '%s'", args[cur_arg]); + goto error; + } + cur_arg++; + } + + /* Too many args */ + if (*(args[cur_arg])) { + memprintf(err, "cannot handle unexpected argument '%s'", args[cur_arg]); + goto error; + } + + /* now, back to the log target */ + if (!parse_log_target(args[1], &logger->target, err)) + goto error; + + done: + LIST_APPEND(loggers, &logger->list); + return 1; + + error: + free(smp_rgs); + free_logger(logger); + return 0; +} + + +/* + * returns log format, LOG_FORMAT_UNSPEC is return if not found. + */ +enum log_fmt get_log_format(const char *fmt) +{ + enum log_fmt format; + + format = LOG_FORMATS - 1; + while (format > 0 && log_formats[format].name + && strcmp(log_formats[format].name, fmt) != 0) + format--; + + /* Note: 0 is LOG_FORMAT_UNSPEC */ + return format; +} + +/* + * returns log level for <lev> or -1 if not found. + */ +int get_log_level(const char *lev) +{ + int level; + + level = NB_LOG_LEVELS - 1; + while (level >= 0 && strcmp(log_levels[level], lev) != 0) + level--; + + return level; +} + +/* + * returns log facility for <fac> or -1 if not found. + */ +int get_log_facility(const char *fac) +{ + int facility; + + facility = NB_LOG_FACILITIES - 1; + while (facility >= 0 && strcmp(log_facilities[facility], fac) != 0) + facility--; + + return facility; +} + +/* + * Encode the string. + * + * When using the +E log format option, it will try to escape '"\]' + * characters with '\' as prefix. The same prefix should not be used as + * <escape>. + */ +static char *lf_encode_string(char *start, char *stop, + const char escape, const long *map, + const char *string, + struct logformat_node *node) +{ + if (node->options & LOG_OPT_ESC) { + if (start < stop) { + stop--; /* reserve one byte for the final '\0' */ + while (start < stop && *string != '\0') { + if (!ha_bit_test((unsigned char)(*string), map)) { + if (!ha_bit_test((unsigned char)(*string), rfc5424_escape_map)) + *start++ = *string; + else { + if (start + 2 >= stop) + break; + *start++ = '\\'; + *start++ = *string; + } + } + else { + if (start + 3 >= stop) + break; + *start++ = escape; + *start++ = hextab[(*string >> 4) & 15]; + *start++ = hextab[*string & 15]; + } + string++; + } + *start = '\0'; + } + } + else { + return encode_string(start, stop, escape, map, string); + } + + return start; +} + +/* + * Encode the chunk. + * + * When using the +E log format option, it will try to escape '"\]' + * characters with '\' as prefix. The same prefix should not be used as + * <escape>. 
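A toy standalone version of the two-level escaping these lf_encode_* helpers perform: mapped characters are hex-encoded with the escape character as prefix, while the RFC5424 specials '"', '\' and ']' get a backslash. The real code drives this from precomputed bitmaps (ha_bit_test) and bounds the output differently:

#include <stdio.h>
#include <string.h>

static const char hextab[] = "0123456789ABCDEF";

static void toy_escape(const char *in, char *out, size_t outsz)
{
    size_t o = 0;

    for (; *in && o + 4 < outsz; in++) {
        unsigned char c = *in;

        if (c < 0x20) {                  /* control char: hex-encode */
            out[o++] = '#';
            out[o++] = hextab[(c >> 4) & 15];
            out[o++] = hextab[c & 15];
        } else if (strchr("\"\\]", c)) { /* RFC5424 escape set */
            out[o++] = '\\';
            out[o++] = c;
        } else {
            out[o++] = c;
        }
    }
    out[o] = '\0';
}

int main(void)
{
    char buf[64];

    toy_escape("a\"b]c\n", buf, sizeof(buf));
    printf("%s\n", buf); /* a\"b\]c#0A */
    return 0;
}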
+ */ +static char *lf_encode_chunk(char *start, char *stop, + const char escape, const long *map, + const struct buffer *chunk, + struct logformat_node *node) +{ + char *str, *end; + + if (node->options & LOG_OPT_ESC) { + if (start < stop) { + str = chunk->area; + end = chunk->area + chunk->data; + + stop--; /* reserve one byte for the final '\0' */ + while (start < stop && str < end) { + if (!ha_bit_test((unsigned char)(*str), map)) { + if (!ha_bit_test((unsigned char)(*str), rfc5424_escape_map)) + *start++ = *str; + else { + if (start + 2 >= stop) + break; + *start++ = '\\'; + *start++ = *str; + } + } + else { + if (start + 3 >= stop) + break; + *start++ = escape; + *start++ = hextab[(*str >> 4) & 15]; + *start++ = hextab[*str & 15]; + } + str++; + } + *start = '\0'; + } + } + else { + return encode_chunk(start, stop, escape, map, chunk); + } + + return start; +} + +/* + * Write a string in the log string + * Take cares of quote and escape options + * + * Return the address of the \0 character, or NULL on error + */ +char *lf_text_len(char *dst, const char *src, size_t len, size_t size, const struct logformat_node *node) +{ + if (size < 2) + return NULL; + + if (node->options & LOG_OPT_QUOTE) { + *(dst++) = '"'; + size--; + } + + if (src && len) { + /* escape_string and strlcpy2 will both try to add terminating NULL-byte + * to dst, so we need to make sure that extra byte will fit into dst + * before calling them + */ + if (node->options & LOG_OPT_ESC) { + char *ret; + + ret = escape_string(dst, (dst + size - 1), '\\', rfc5424_escape_map, src, src + len); + if (ret == NULL || *ret != '\0') + return NULL; + len = ret - dst; + } + else { + if (++len > size) + len = size; + len = strlcpy2(dst, src, len); + } + + size -= len; + dst += len; + } + else if ((node->options & (LOG_OPT_QUOTE|LOG_OPT_MANDATORY)) == LOG_OPT_MANDATORY) { + if (size < 2) + return NULL; + *(dst++) = '-'; + size -= 1; + } + + if (node->options & LOG_OPT_QUOTE) { + if (size < 2) + return NULL; + *(dst++) = '"'; + } + + *dst = '\0'; + return dst; +} + +static inline char *lf_text(char *dst, const char *src, size_t size, const struct logformat_node *node) +{ + return lf_text_len(dst, src, size, size, node); +} + +/* + * Write a IP address to the log string + * +X option write in hexadecimal notation, most significant byte on the left + */ +char *lf_ip(char *dst, const struct sockaddr *sockaddr, size_t size, const struct logformat_node *node) +{ + char *ret = dst; + int iret; + char pn[INET6_ADDRSTRLEN]; + + if (node->options & LOG_OPT_HEXA) { + unsigned char *addr = NULL; + switch (sockaddr->sa_family) { + case AF_INET: + addr = (unsigned char *)&((struct sockaddr_in *)sockaddr)->sin_addr.s_addr; + iret = snprintf(dst, size, "%02X%02X%02X%02X", addr[0], addr[1], addr[2], addr[3]); + break; + case AF_INET6: + addr = (unsigned char *)&((struct sockaddr_in6 *)sockaddr)->sin6_addr.s6_addr; + iret = snprintf(dst, size, "%02X%02X%02X%02X%02X%02X%02X%02X%02X%02X%02X%02X%02X%02X%02X%02X", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], addr[6], addr[7], + addr[8], addr[9], addr[10], addr[11], addr[12], addr[13], addr[14], addr[15]); + break; + default: + return NULL; + } + if (iret < 0 || iret > size) + return NULL; + ret += iret; + } else { + addr_to_str((struct sockaddr_storage *)sockaddr, pn, sizeof(pn)); + ret = lf_text(dst, pn, size, node); + if (ret == NULL) + return NULL; + } + return ret; +} + +/* + * Write a port to the log + * +X option write in hexadecimal notation, most significant byte on the left + */ +char 
*lf_port(char *dst, const struct sockaddr *sockaddr, size_t size, const struct logformat_node *node) +{ + char *ret = dst; + int iret; + + if (node->options & LOG_OPT_HEXA) { + const unsigned char *port = (const unsigned char *)&((struct sockaddr_in *)sockaddr)->sin_port; + iret = snprintf(dst, size, "%02X%02X", port[0], port[1]); + if (iret < 0 || iret > size) + return NULL; + ret += iret; + } else { + ret = ltoa_o(get_host_port((struct sockaddr_storage *)sockaddr), dst, size); + if (ret == NULL) + return NULL; + } + return ret; +} + + +/* + * This function sends the syslog message using a printf format string. It + * expects an LF-terminated message. + */ +void send_log(struct proxy *p, int level, const char *format, ...) +{ + va_list argp; + int data_len; + + if (level < 0 || format == NULL || logline == NULL) + return; + + va_start(argp, format); + data_len = vsnprintf(logline, global.max_syslog_len, format, argp); + if (data_len < 0 || data_len > global.max_syslog_len) + data_len = global.max_syslog_len; + va_end(argp); + + __send_log((p ? &p->loggers : NULL), (p ? &p->log_tag : NULL), level, + logline, data_len, default_rfc5424_sd_log_format, 2); +} +/* + * This function builds a log header according to <hdr> settings. + * + * If hdr.format is set to LOG_FORMAT_UNSPEC, it tries to determine the + * format based on hdr.metadata. This is useful for log-forwarding to be + * able to forward any format without settings. + * + * This function returns a struct ist array of header elements; + * <nbelem> is set to the number of available elements. + * It currently returns a maximum of NB_LOG_HDR_MAX_ELEMENTS + * elements. + */ +struct ist *build_log_header(struct log_header hdr, size_t *nbelem) +{ + static THREAD_LOCAL struct { + struct ist ist_vector[NB_LOG_HDR_MAX_ELEMENTS]; + char timestamp_buffer[LOG_LEGACYTIME_LEN+1+1]; + time_t cur_legacy_time; + char priority_buffer[6]; + } hdr_ctx = { .priority_buffer = "<<<<>" }; + + struct tm logtime; + int len; + int fac_level = 0; + time_t time = date.tv_sec; + struct ist *metadata = hdr.metadata; + enum log_fmt format = hdr.format; + int facility = hdr.facility; + int level = hdr.level; + + *nbelem = 0; + + + if (format == LOG_FORMAT_UNSPEC) { + format = LOG_FORMAT_RAW; + if (metadata) { + /* If a hostname is set, it appears we want to perform syslog + * because only rfc5424 or rfc3164 support a hostname. + */ + if (metadata[LOG_META_HOST].len) { + /* If an rfc5424-compliant timestamp is used we consider + * that the output format is rfc5424, else the legacy format + * is used, as documented as the default for local logs. + */ + if ((metadata[LOG_META_TIME].len == 1 && metadata[LOG_META_TIME].ptr[0] == '-') + || (metadata[LOG_META_TIME].len >= LOG_ISOTIME_MINLEN)) + format = LOG_FORMAT_RFC5424; + else + format = LOG_FORMAT_RFC3164; + } + else if (metadata[LOG_META_TAG].len) { + /* A tag is present but no hostname, so we should + * consider that we are trying to emit a local log + * in legacy format (analog to RFC3164 but + * with a stripped hostname). + */ + format = LOG_FORMAT_LOCAL; + } + else if (metadata[LOG_META_PRIO].len) { + /* the source seems to be a parsed message + * offering a valid level/prio prefix + * so we consider this format.
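The auto-detection logic above condenses to a small decision tree; the sketch below uses illustrative flags rather than the real struct ist metadata array:

enum fmt { RAW, RFC3164, RFC5424, LOCAL, PRIO };

static enum fmt guess_format(int has_host, int has_iso_time,
                             int has_tag, int has_prio)
{
    if (has_host)
        return has_iso_time ? RFC5424 : RFC3164;
    if (has_tag)
        return LOCAL;  /* RFC3164-like, hostname stripped */
    if (has_prio)
        return PRIO;   /* message already carries a <prio> prefix */
    return RAW;
}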
+ */ + format = LOG_FORMAT_PRIO; + } + } + } + + /* prepare priority, stored into 1 single elem */ + switch (format) { + case LOG_FORMAT_LOCAL: + case LOG_FORMAT_RFC3164: + case LOG_FORMAT_RFC5424: + case LOG_FORMAT_PRIO: + fac_level = facility << 3; + /* further format ignore the facility */ + __fallthrough; + case LOG_FORMAT_TIMED: + case LOG_FORMAT_SHORT: + fac_level += level; + hdr_ctx.ist_vector[*nbelem].ptr = &hdr_ctx.priority_buffer[3]; /* last digit of the log level */ + do { + *hdr_ctx.ist_vector[*nbelem].ptr = '0' + fac_level % 10; + fac_level /= 10; + hdr_ctx.ist_vector[*nbelem].ptr--; + } while (fac_level && hdr_ctx.ist_vector[*nbelem].ptr > &hdr_ctx.priority_buffer[0]); + *hdr_ctx.ist_vector[*nbelem].ptr = '<'; + hdr_ctx.ist_vector[(*nbelem)++].len = &hdr_ctx.priority_buffer[5] - hdr_ctx.ist_vector[0].ptr; + break; + case LOG_FORMAT_ISO: + case LOG_FORMAT_RAW: + break; + case LOG_FORMAT_UNSPEC: + case LOG_FORMATS: + ABORT_NOW(); + } + + + /* prepare timestamp, stored into a max of 4 elems */ + switch (format) { + case LOG_FORMAT_LOCAL: + case LOG_FORMAT_RFC3164: + /* rfc3164 ex: 'Jan 1 00:00:00 ' */ + if (metadata && metadata[LOG_META_TIME].len == LOG_LEGACYTIME_LEN) { + hdr_ctx.ist_vector[(*nbelem)++] = metadata[LOG_META_TIME]; + hdr_ctx.ist_vector[(*nbelem)++] = ist2(" ", 1); + /* time is set, break immediately */ + break; + } + else if (metadata && metadata[LOG_META_TIME].len >= LOG_ISOTIME_MINLEN) { + int month; + char *timestamp = metadata[LOG_META_TIME].ptr; + + /* iso time always begins like this: '1970-01-01T00:00:00' */ + + /* compute month */ + month = 10*(timestamp[5] - '0') + (timestamp[6] - '0'); + if (month) + month--; + if (month <= 11) { + /* builds log prefix ex: 'Jan 1 ' */ + len = snprintf(hdr_ctx.timestamp_buffer, sizeof(hdr_ctx.timestamp_buffer), + "%s %c%c ", monthname[month], + timestamp[8] != '0' ? 
timestamp[8] : ' ', + timestamp[9]); + /* we reused the timestamp_buffer, signal that it does not + * contain local time anymore + */ + hdr_ctx.cur_legacy_time = 0; + if (len == 7) { + hdr_ctx.ist_vector[(*nbelem)++] = ist2(&hdr_ctx.timestamp_buffer[0], len); + /* adds 'HH:MM:SS' from iso time */ + hdr_ctx.ist_vector[(*nbelem)++] = ist2(&timestamp[11], 8); + hdr_ctx.ist_vector[(*nbelem)++] = ist2(" ", 1); + /* we successfully reuse iso time, we can break */ + break; + } + } + /* Failed to reuse iso time, fall back to local legacy time */ + } + + if (unlikely(time != hdr_ctx.cur_legacy_time)) { + /* re-builds timestamp from the current local time */ + get_localtime(time, &logtime); + + len = snprintf(hdr_ctx.timestamp_buffer, sizeof(hdr_ctx.timestamp_buffer), + "%s %2d %02d:%02d:%02d ", + monthname[logtime.tm_mon], + logtime.tm_mday, logtime.tm_hour, logtime.tm_min, logtime.tm_sec); + if (len != LOG_LEGACYTIME_LEN+1) + hdr_ctx.cur_legacy_time = 0; + else + hdr_ctx.cur_legacy_time = time; + } + if (likely(hdr_ctx.cur_legacy_time)) + hdr_ctx.ist_vector[(*nbelem)++] = ist2(&hdr_ctx.timestamp_buffer[0], LOG_LEGACYTIME_LEN+1); + else + hdr_ctx.ist_vector[(*nbelem)++] = ist2("Jan 1 00:00:00 ", LOG_LEGACYTIME_LEN+1); + break; + case LOG_FORMAT_RFC5424: + /* adds the rfc5424 version prefix */ + hdr_ctx.ist_vector[(*nbelem)++] = ist2("1 ", 2); + if (metadata && metadata[LOG_META_TIME].len == 1 && metadata[LOG_META_TIME].ptr[0] == '-') { + /* submitted len is NILVALUE, it is a valid timestamp for rfc5424 */ + hdr_ctx.ist_vector[(*nbelem)++] = metadata[LOG_META_TIME]; + hdr_ctx.ist_vector[(*nbelem)++] = ist2(" ", 1); + break; + } + /* let it continue as the 'timed' and 'iso' formats for a usual timestamp */ + __fallthrough; + case LOG_FORMAT_TIMED: + case LOG_FORMAT_ISO: + /* ISO format ex: '1900-01-01T12:00:00.123456Z' + * '1900-01-01T14:00:00+02:00' + * '1900-01-01T10:00:00.123456-02:00' + */ + if (metadata && metadata[LOG_META_TIME].len >= LOG_ISOTIME_MINLEN) { + hdr_ctx.ist_vector[(*nbelem)++] = metadata[LOG_META_TIME]; + hdr_ctx.ist_vector[(*nbelem)++] = ist2(" ", 1); + /* time is set, break immediately */ + break; + } + else if (metadata && metadata[LOG_META_TIME].len == LOG_LEGACYTIME_LEN) { + int month; + char *timestamp = metadata[LOG_META_TIME].ptr; + + for (month = 0; month < 12; month++) + if (!memcmp(monthname[month], timestamp, 3)) + break; + + if (month < 12) { + + /* get local time to retrieve year */ + get_localtime(time, &logtime); + + /* year seems changed since log */ + if (logtime.tm_mon < month) + logtime.tm_year--; + + /* builds rfc5424 prefix ex: '1900-01-01T' */ + len = snprintf(hdr_ctx.timestamp_buffer, sizeof(hdr_ctx.timestamp_buffer), + "%4d-%02d-%c%cT", + logtime.tm_year+1900, month+1, + timestamp[4] != ' ' ?
timestamp[4] : '0', + timestamp[5]); + + /* we reused the timestamp_buffer, signal that it does not + * contain local time anymore + */ + hdr_ctx.cur_legacy_time = 0; + if (len == 11) { + hdr_ctx.ist_vector[(*nbelem)++] = ist2(&hdr_ctx.timestamp_buffer[0], len); + /* adds HH:MM:SS from legacy timestamp */ + hdr_ctx.ist_vector[(*nbelem)++] = ist2(&timestamp[7], 8); + /* skip secfrac because it is optional */ + /* according to the rfc: -00:00 means we don't know the timezone */ + hdr_ctx.ist_vector[(*nbelem)++] = ist2("-00:00 ", 7); + /* we successfully reuse legacy time, we can break */ + break; + } + } + /* Failed to reuse legacy time, fall back to local iso time */ + } + hdr_ctx.ist_vector[(*nbelem)++] = ist2(timeofday_as_iso_us(1), LOG_ISOTIME_MAXLEN + 1); + break; + case LOG_FORMAT_PRIO: + case LOG_FORMAT_SHORT: + case LOG_FORMAT_RAW: + break; + case LOG_FORMAT_UNSPEC: + case LOG_FORMATS: + ABORT_NOW(); + } + + /* prepare other metadata, stored into a max of 10 elems */ + switch (format) { + case LOG_FORMAT_RFC3164: + if (metadata && metadata[LOG_META_HOST].len) { + hdr_ctx.ist_vector[(*nbelem)++] = metadata[LOG_META_HOST]; + hdr_ctx.ist_vector[(*nbelem)++] = ist2(" ", 1); + } + else /* the caller MUST fill the hostname, this field is mandatory */ + hdr_ctx.ist_vector[(*nbelem)++] = ist2("localhost ", 10); + __fallthrough; + case LOG_FORMAT_LOCAL: + if (!metadata || !metadata[LOG_META_TAG].len) + break; + + hdr_ctx.ist_vector[(*nbelem)++] = metadata[LOG_META_TAG]; + if (metadata[LOG_META_PID].len) { + hdr_ctx.ist_vector[(*nbelem)++] = ist2("[", 1); + hdr_ctx.ist_vector[(*nbelem)++] = metadata[LOG_META_PID]; + hdr_ctx.ist_vector[(*nbelem)++] = ist2("]", 1); + } + hdr_ctx.ist_vector[(*nbelem)++] = ist2(": ", 2); + break; + case LOG_FORMAT_RFC5424: + if (metadata && metadata[LOG_META_HOST].len) { + hdr_ctx.ist_vector[(*nbelem)++] = metadata[LOG_META_HOST]; + hdr_ctx.ist_vector[(*nbelem)++] = ist2(" ", 1); + } + else + hdr_ctx.ist_vector[(*nbelem)++] = ist2("- ", 2); + + if (metadata && metadata[LOG_META_TAG].len) { + hdr_ctx.ist_vector[(*nbelem)++] = metadata[LOG_META_TAG]; + hdr_ctx.ist_vector[(*nbelem)++] = ist2(" ", 1); + } + else + hdr_ctx.ist_vector[(*nbelem)++] = ist2("- ", 2); + + if (metadata && metadata[LOG_META_PID].len) { + hdr_ctx.ist_vector[(*nbelem)++] = metadata[LOG_META_PID]; + hdr_ctx.ist_vector[(*nbelem)++] = ist2(" ", 1); + } + else + hdr_ctx.ist_vector[(*nbelem)++] = ist2("- ", 2); + + if (metadata && metadata[LOG_META_MSGID].len) { + hdr_ctx.ist_vector[(*nbelem)++] = metadata[LOG_META_MSGID]; + hdr_ctx.ist_vector[(*nbelem)++] = ist2(" ", 1); + } + else + hdr_ctx.ist_vector[(*nbelem)++] = ist2("- ", 2); + + if (metadata && metadata[LOG_META_STDATA].len) { + hdr_ctx.ist_vector[(*nbelem)++] = metadata[LOG_META_STDATA]; + hdr_ctx.ist_vector[(*nbelem)++] = ist2(" ", 1); + } + else + hdr_ctx.ist_vector[(*nbelem)++] = ist2("- ", 2); + break; + case LOG_FORMAT_PRIO: + case LOG_FORMAT_SHORT: + case LOG_FORMAT_TIMED: + case LOG_FORMAT_ISO: + case LOG_FORMAT_RAW: + break; + case LOG_FORMAT_UNSPEC: + case LOG_FORMATS: + ABORT_NOW(); + } + + return hdr_ctx.ist_vector; +} + +/* + * This function sends a syslog message. + * <target> is the actual log target where the log will be sent. + * + * The message will be prefixed by a header according to the <hdr> settings. + * The final message will be truncated to <maxlen> and will be + * terminated with an LF character.
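For the legacy-to-RFC5424 fallback shown above, a standalone sketch (using plain libc time functions and skipping the year-wrap adjustment) of how a 'Mmm dd HH:MM:SS' timestamp is recombined into an RFC5424 prefix:

#include <stdio.h>
#include <string.h>
#include <time.h>

static const char *monthname[12] = {
    "Jan","Feb","Mar","Apr","May","Jun",
    "Jul","Aug","Sep","Oct","Nov","Dec"
};

int main(void)
{
    const char *legacy = "Jan  1 00:00:00"; /* legacy syslog timestamp */
    time_t now = time(NULL);
    struct tm *lt = localtime(&now);        /* only used for the year */
    char out[64];
    int month;

    for (month = 0; month < 12; month++)
        if (!memcmp(monthname[month], legacy, 3))
            break;
    if (month < 12 && lt) {
        /* '%4d-%02d-%c%cT' date prefix, 'HH:MM:SS' reused verbatim,
         * then '-00:00 ' meaning "timezone unknown" per the RFC */
        snprintf(out, sizeof(out), "%4d-%02d-%c%cT%.8s-00:00 ",
                 lt->tm_year + 1900, month + 1,
                 legacy[4] != ' ' ? legacy[4] : '0', legacy[5],
                 legacy + 7);
        printf("%s\n", out);
    }
    return 0;
}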
+ * + * Does not return any error + */ +static inline void __do_send_log(struct log_target *target, struct log_header hdr, + int nblogger, size_t maxlen, + char *message, size_t size) +{ + static THREAD_LOCAL struct iovec iovec[NB_LOG_HDR_MAX_ELEMENTS+1+1] = { }; /* header elements + message + LF */ + static THREAD_LOCAL struct msghdr msghdr = { + //.msg_iov = iovec, + .msg_iovlen = NB_LOG_HDR_MAX_ELEMENTS+2 + }; + static THREAD_LOCAL int logfdunix = -1; /* syslog to AF_UNIX socket */ + static THREAD_LOCAL int logfdinet = -1; /* syslog to AF_INET socket */ + int *plogfd; + int sent; + size_t nbelem; + struct ist *msg_header = NULL; + + msghdr.msg_iov = iovec; + + /* historically some messages used to already contain the trailing LF + * or Zero. Let's remove all trailing LF or Zero + */ + while (size && (message[size-1] == '\n' || (message[size-1] == 0))) + size--; + + if (target->type == LOG_TARGET_BUFFER) { + plogfd = NULL; + goto send; + } + else if (target->addr->ss_family == AF_CUST_EXISTING_FD) { + /* the socket's address is a file descriptor */ + plogfd = (int *)&((struct sockaddr_in *)target->addr)->sin_addr.s_addr; + } + else if (target->addr->ss_family == AF_UNIX) + plogfd = &logfdunix; + else + plogfd = &logfdinet; + + if (plogfd && unlikely(*plogfd < 0)) { + /* socket not successfully initialized yet */ + if ((*plogfd = socket(target->addr->ss_family, SOCK_DGRAM, + (target->addr->ss_family == AF_UNIX) ? 0 : IPPROTO_UDP)) < 0) { + static char once; + + if (!once) { + once = 1; /* note: no need for atomic ops here */ + ha_alert("socket() failed in logger #%d: %s (errno=%d)\n", + nblogger, strerror(errno), errno); + } + return; + } else { + /* we don't want to receive anything on this socket */ + setsockopt(*plogfd, SOL_SOCKET, SO_RCVBUF, &zero, sizeof(zero)); + /* we may want to adjust the output buffer (tune.sndbuf.backend) */ + if (global.tune.backend_sndbuf) + setsockopt(*plogfd, SOL_SOCKET, SO_SNDBUF, &global.tune.backend_sndbuf, sizeof(global.tune.backend_sndbuf)); + /* does nothing under Linux, maybe needed for others */ + shutdown(*plogfd, SHUT_RD); + fd_set_cloexec(*plogfd); + } + } + + msg_header = build_log_header(hdr, &nbelem); + send: + if (target->type == LOG_TARGET_BUFFER) { + struct ist msg; + size_t e_maxlen = maxlen; + + msg = ist2(message, size); + + /* make room for the final '\n' which may be forcefully inserted + * by tcp forwarder applet (sink_forward_io_handler) + */ + e_maxlen -= 1; + + sent = sink_write(target->sink, hdr, e_maxlen, &msg, 1); + } + else if (target->addr->ss_family == AF_CUST_EXISTING_FD) { + struct ist msg; + + msg = ist2(message, size); + + sent = fd_write_frag_line(*plogfd, maxlen, msg_header, nbelem, &msg, 1, 1); + } + else { + int i = 0; + int totlen = maxlen - 1; /* save space for the final '\n' */ + + for (i = 0 ; i < nbelem ; i++ ) { + iovec[i].iov_base = msg_header[i].ptr; + iovec[i].iov_len = msg_header[i].len; + if (totlen <= iovec[i].iov_len) { + iovec[i].iov_len = totlen; + totlen = 0; + break; + } + totlen -= iovec[i].iov_len; + } + if (totlen) { + iovec[i].iov_base = message; + iovec[i].iov_len = size; + if (totlen <= iovec[i].iov_len) + iovec[i].iov_len = totlen; + i++; + } + iovec[i].iov_base = "\n"; /* insert a \n at the end of the message */ + iovec[i].iov_len = 1; + i++; + + msghdr.msg_iovlen = i; + msghdr.msg_name = (struct sockaddr *)target->addr; + msghdr.msg_namelen = get_addr_len(target->addr); + + sent = sendmsg(*plogfd, &msghdr, MSG_DONTWAIT | MSG_NOSIGNAL); + } + + if (sent < 0) { + static char once; + + if (errno 
== EAGAIN || errno == EWOULDBLOCK) + _HA_ATOMIC_INC(&dropped_logs); + else if (!once) { + once = 1; /* note: no need for atomic ops here */ + ha_alert("sendmsg()/writev() failed in logger #%d: %s (errno=%d)\n", + nblogger, strerror(errno), errno); + } + } +} + +/* does the same as __do_send_log() does for a single target, but here the log + * will be sent according to the log backend's lb settings. The function will + * leverage __do_send_log() function to actually send the log messages. + */ +static inline void __do_send_log_backend(struct proxy *be, struct log_header hdr, + int nblogger, size_t maxlen, + char *message, size_t size) +{ + struct server *srv; + uint32_t targetid = ~0; /* default value to check if it was explicitly assigned */ + uint32_t nb_srv; + + HA_RWLOCK_RDLOCK(LBPRM_LOCK, &be->lbprm.lock); + + if (be->srv_act) { + nb_srv = be->srv_act; + } + else if (be->srv_bck) { + /* no more active servers but backup ones are, switch to backup farm */ + nb_srv = be->srv_bck; + if (!(be->options & PR_O_USE_ALL_BK)) { + /* log balancing disabled on backup farm */ + targetid = 0; /* use first server */ + goto skip_lb; + } + } + else { + /* no srv available, can't log */ + goto drop; + } + + /* log-balancing logic: */ + + if ((be->lbprm.algo & BE_LB_ALGO) == BE_LB_ALGO_RR) { + /* Atomically load and update lastid since it's not protected + * by any write lock + * + * Wrapping is expected and could lead to unexpected ID reset in the + * middle of a cycle, but given that this only happens once in every + * 4 billions it is quite negligible + */ + targetid = HA_ATOMIC_FETCH_ADD(&be->lbprm.log.lastid, 1) % nb_srv; + } + else if ((be->lbprm.algo & BE_LB_ALGO) == BE_LB_ALGO_LS) { + /* sticky mode: use first server in the pool, which will always stay + * first during dequeuing and requeuing, unless it becomes unavailable + * and will be replaced by another one + */ + targetid = 0; + } + else if ((be->lbprm.algo & BE_LB_ALGO) == BE_LB_ALGO_RND) { + /* random mode */ + targetid = statistical_prng() % nb_srv; + } + else if ((be->lbprm.algo & BE_LB_ALGO) == BE_LB_ALGO_LH) { + struct sample result; + + /* log-balance hash */ + memset(&result, 0, sizeof(result)); + result.data.type = SMP_T_STR; + result.flags = SMP_F_CONST; + result.data.u.str.area = message; + result.data.u.str.data = size; + result.data.u.str.size = size + 1; /* with terminating NULL byte */ + if (sample_process_cnv(be->lbprm.expr, &result)) { + /* gen_hash takes binary input, ensure that we provide such value to it */ + if (result.data.type == SMP_T_BIN || sample_casts[result.data.type][SMP_T_BIN]) { + sample_casts[result.data.type][SMP_T_BIN](&result); + targetid = gen_hash(be, result.data.u.str.area, result.data.u.str.data) % nb_srv; + } + } + } + + skip_lb: + + if (targetid == ~0) { + /* no target assigned, nothing to do */ + goto drop; + } + + /* find server based on targetid */ + srv = be->lbprm.log.srv[targetid]; + HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &be->lbprm.lock); + + __do_send_log(srv->log_target, hdr, nblogger, maxlen, message, size); + return; + + drop: + HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &be->lbprm.lock); + _HA_ATOMIC_INC(&dropped_logs); +} + +/* + * This function sends a syslog message. + * It doesn't care about errors nor does it report them. + * The argument <metadata> MUST be an array of size + * LOG_META_FIELDS*sizeof(struct ist) containing + * data to build the header. 
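The balancing performed by __do_send_log_backend above reduces to the following per-algorithm mapping; the enum, the plain counter and pick_target() are illustrative stand-ins for the real lbprm fields and atomic operations:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

enum lb_algo { LB_RR, LB_STICKY, LB_RND, LB_HASH };

static uint32_t pick_target(enum lb_algo algo, uint32_t *lastid,
                            uint32_t nb_srv, uint64_t msg_hash)
{
    switch (algo) {
    case LB_RR:      /* HA_ATOMIC_FETCH_ADD on lastid in the real code */
        return (*lastid)++ % nb_srv;
    case LB_STICKY:  /* always the first server in the pool */
        return 0;
    case LB_RND:     /* statistical_prng() in the real code */
        return (uint32_t)rand() % nb_srv;
    case LB_HASH:    /* converter output hashed, then mapped to a slot */
    default:
        return (uint32_t)(msg_hash % nb_srv);
    }
}

int main(void)
{
    uint32_t lastid = 0, i;

    for (i = 0; i < 5; i++)
        printf("rr pick: %u\n", pick_target(LB_RR, &lastid, 3, 0));
    return 0;
}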
+ */ +void process_send_log(struct list *loggers, int level, int facility, + struct ist *metadata, char *message, size_t size) +{ + struct logger *logger; + int nblogger; + + /* Send log messages to syslog server. */ + nblogger = 0; + list_for_each_entry(logger, loggers, list) { + int in_range = 1; + + /* we can filter the level of the messages that are sent to each logger */ + if (level > logger->level) + continue; + + if (logger->lb.smp_rgs) { + struct smp_log_range *smp_rg; + uint next_idx, curr_rg; + ullong curr_rg_idx, next_rg_idx; + + curr_rg_idx = _HA_ATOMIC_LOAD(&logger->lb.curr_rg_idx); + do { + next_idx = (curr_rg_idx & 0xFFFFFFFFU) + 1; + curr_rg = curr_rg_idx >> 32; + smp_rg = &logger->lb.smp_rgs[curr_rg]; + + /* check if the index we're going to take is within range */ + in_range = smp_rg->low <= next_idx && next_idx <= smp_rg->high; + if (in_range) { + /* Let's consume this range. */ + if (next_idx == smp_rg->high) { + /* If consumed, let's select the next range. */ + curr_rg = (curr_rg + 1) % logger->lb.smp_rgs_sz; + } + } + + next_idx = next_idx % logger->lb.smp_sz; + next_rg_idx = ((ullong)curr_rg << 32) + next_idx; + } while (!_HA_ATOMIC_CAS(&logger->lb.curr_rg_idx, &curr_rg_idx, next_rg_idx) && + __ha_cpu_relax()); + } + if (in_range) { + struct log_header hdr; + + hdr.level = MAX(level, logger->minlvl); + hdr.facility = (facility == -1) ? logger->facility : facility; + hdr.format = logger->format; + hdr.metadata = metadata; + + nblogger += 1; + if (logger->target.type == LOG_TARGET_BACKEND) { + __do_send_log_backend(logger->target.be, hdr, nblogger, logger->maxlen, message, size); + } + else { + /* normal target */ + __do_send_log(&logger->target, hdr, nblogger, logger->maxlen, message, size); + } + } + } +} + +/* + * This function sends a syslog message. + * It doesn't care about errors nor does it report them. + * The arguments <sd> and <sd_size> are used for the structured-data part + * in RFC5424 formatted syslog messages. 
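The CAS loop in process_send_log above keeps both sampling cursors in a single 64-bit word so they can advance atomically; a minimal sketch of that packing (the helper names are mine, not HAProxy's):

#include <stdint.h>
#include <stdio.h>

/* range index in the high 32 bits, message index in the low 32 bits */
static uint64_t state_pack(uint32_t range, uint32_t msg_idx)
{
    return ((uint64_t)range << 32) | msg_idx;
}

static uint32_t state_range(uint64_t st)   { return (uint32_t)(st >> 32); }
static uint32_t state_msg_idx(uint64_t st) { return (uint32_t)st; }

int main(void)
{
    uint64_t st = state_pack(2, 41);

    printf("range=%u msg_idx=%u\n", state_range(st), state_msg_idx(st));
    return 0;
}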
+ */ +void __send_log(struct list *loggers, struct buffer *tagb, int level, + char *message, size_t size, char *sd, size_t sd_size) +{ + static THREAD_LOCAL pid_t curr_pid; + static THREAD_LOCAL char pidstr[16]; + static THREAD_LOCAL struct ist metadata[LOG_META_FIELDS]; + + if (loggers == NULL) { + if (!LIST_ISEMPTY(&global.loggers)) { + loggers = &global.loggers; + } + } + if (!loggers || LIST_ISEMPTY(loggers)) + return; + + if (!metadata[LOG_META_HOST].len) { + if (global.log_send_hostname) + metadata[LOG_META_HOST] = ist(global.log_send_hostname); + } + + if (!tagb || !tagb->area) + tagb = &global.log_tag; + + if (tagb) + metadata[LOG_META_TAG] = ist2(tagb->area, tagb->data); + + if (unlikely(curr_pid != getpid())) + metadata[LOG_META_PID].len = 0; + + if (!metadata[LOG_META_PID].len) { + curr_pid = getpid(); + ltoa_o(curr_pid, pidstr, sizeof(pidstr)); + metadata[LOG_META_PID] = ist2(pidstr, strlen(pidstr)); + } + + metadata[LOG_META_STDATA] = ist2(sd, sd_size); + + /* Remove trailing space of structured data */ + while (metadata[LOG_META_STDATA].len && metadata[LOG_META_STDATA].ptr[metadata[LOG_META_STDATA].len-1] == ' ') + metadata[LOG_META_STDATA].len--; + + return process_send_log(loggers, level, -1, metadata, message, size); +} + +const char sess_cookie[8] = "NIDVEOU7"; /* No cookie, Invalid cookie, cookie for a Down server, Valid cookie, Expired cookie, Old cookie, Unused, unknown */ +const char sess_set_cookie[8] = "NPDIRU67"; /* No set-cookie, Set-cookie found and left unchanged (passive), + Set-cookie Deleted, Set-Cookie Inserted, Set-cookie Rewritten, + Set-cookie Updated, unknown, unknown */ + +/* + * try to write a character if there is enough space, or goto out + */ +#define LOGCHAR(x) do { \ + if (tmplog < dst + maxsize - 1) { \ + *(tmplog++) = (x); \ + } else { \ + goto out; \ + } \ + } while(0) + + +/* Initializes some log data at boot */ +static void init_log() +{ + char *tmp; + int i; + + /* Initialize the escape map for the RFC5424 structured-data : '"\]' + * inside PARAM-VALUE should be escaped with '\' as prefix. + * See https://tools.ietf.org/html/rfc5424#section-6.3.3 for more + * details. + */ + memset(rfc5424_escape_map, 0, sizeof(rfc5424_escape_map)); + + tmp = "\"\\]"; + while (*tmp) { + ha_bit_set(*tmp, rfc5424_escape_map); + tmp++; + } + + /* initialize the log header encoding map : '{|}"#' should be encoded with + * '#' as prefix, as well as non-printable characters ( <32 or >= 127 ). + * URL encoding only requires '"', '#' to be encoded as well as non- + * printable characters above. + */ + memset(hdr_encode_map, 0, sizeof(hdr_encode_map)); + memset(url_encode_map, 0, sizeof(url_encode_map)); + for (i = 0; i < 32; i++) { + ha_bit_set(i, hdr_encode_map); + ha_bit_set(i, url_encode_map); + } + for (i = 127; i < 256; i++) { + ha_bit_set(i, hdr_encode_map); + ha_bit_set(i, url_encode_map); + } + + tmp = "\"#{|}"; + while (*tmp) { + ha_bit_set(*tmp, hdr_encode_map); + tmp++; + } + + tmp = "\"#"; + while (*tmp) { + ha_bit_set(*tmp, url_encode_map); + tmp++; + } + + /* initialize the http header encoding map. 
The draft httpbis define the + * header content as: + * + * HTTP-message = start-line + * *( header-field CRLF ) + * CRLF + * [ message-body ] + * header-field = field-name ":" OWS field-value OWS + * field-value = *( field-content / obs-fold ) + * field-content = field-vchar [ 1*( SP / HTAB ) field-vchar ] + * obs-fold = CRLF 1*( SP / HTAB ) + * field-vchar = VCHAR / obs-text + * VCHAR = %x21-7E + * obs-text = %x80-FF + * + * All the chars are encoded except "VCHAR", "obs-text", SP and HTAB. + * The encoded chars are form 0x00 to 0x08, 0x0a to 0x1f and 0x7f. The + * "obs-fold" is voluntarily forgotten because haproxy remove this. + */ + memset(http_encode_map, 0, sizeof(http_encode_map)); + for (i = 0x00; i <= 0x08; i++) + ha_bit_set(i, http_encode_map); + for (i = 0x0a; i <= 0x1f; i++) + ha_bit_set(i, http_encode_map); + ha_bit_set(0x7f, http_encode_map); +} + +INITCALL0(STG_PREPARE, init_log); + +/* Initialize log buffers used for syslog messages */ +int init_log_buffers() +{ + logline = my_realloc2(logline, global.max_syslog_len + 1); + logline_rfc5424 = my_realloc2(logline_rfc5424, global.max_syslog_len + 1); + if (!logline || !logline_rfc5424) + return 0; + return 1; +} + +/* Deinitialize log buffers used for syslog messages */ +void deinit_log_buffers() +{ + free(logline); + free(logline_rfc5424); + logline = NULL; + logline_rfc5424 = NULL; +} + +/* Deinitialize log forwarder proxies used for syslog messages */ +void deinit_log_forward() +{ + struct proxy *p, *p0; + + p = cfg_log_forward; + /* we need to manually clean cfg_log_forward proxy list */ + while (p) { + p0 = p; + p = p->next; + free_proxy(p0); + } +} + +/* Builds a log line in <dst> based on <list_format>, and stops before reaching + * <maxsize> characters. Returns the size of the output string in characters, + * not counting the trailing zero which is always added if the resulting size + * is not zero. It requires a valid session and optionally a stream. If the + * stream is NULL, default values will be assumed for the stream part. + */ +int sess_build_logline(struct session *sess, struct stream *s, char *dst, size_t maxsize, struct list *list_format) +{ + struct proxy *fe = sess->fe; + struct proxy *be; + struct http_txn *txn; + const struct strm_logs *logs; + struct connection *fe_conn, *be_conn; + unsigned int s_flags; + unsigned int uniq_id; + struct buffer chunk; + char *uri; + char *spc; + char *qmark; + char *end; + struct tm tm; + int t_request; + int hdr; + int last_isspace = 1; + int nspaces = 0; + char *tmplog; + char *ret; + int iret; + int status; + struct logformat_node *tmp; + struct timeval tv; + struct strm_logs tmp_strm_log; + struct ist path; + struct http_uri_parser parser; + + /* FIXME: let's limit ourselves to frontend logging for now. */ + + if (likely(s)) { + be = s->be; + txn = s->txn; + be_conn = sc_conn(s->scb); + status = (txn ? txn->status : 0); + s_flags = s->flags; + uniq_id = s->uniq_id; + logs = &s->logs; + } else { + /* we have no stream so we first need to initialize a few + * things that are needed later. We do increment the request + * ID so that it's uniquely assigned to this request just as + * if the request had reached the point of being processed. + * A request error is reported as it's the only element we have + * here and which justifies emitting such a log. + */ + be = ((obj_type(sess->origin) == OBJ_TYPE_CHECK) ? __objt_check(sess->origin)->proxy : fe); + txn = NULL; + fe_conn = objt_conn(sess->origin); + be_conn = ((obj_type(sess->origin) == OBJ_TYPE_CHECK) ? 
sc_conn(__objt_check(sess->origin)->sc) : NULL); + status = 0; + s_flags = SF_ERR_PRXCOND | SF_FINST_R; + uniq_id = _HA_ATOMIC_FETCH_ADD(&global.req_count, 1); + + /* prepare a valid log structure */ + tmp_strm_log.accept_ts = sess->accept_ts; + tmp_strm_log.accept_date = sess->accept_date; + tmp_strm_log.t_handshake = sess->t_handshake; + tmp_strm_log.t_idle = (sess->t_idle >= 0 ? sess->t_idle : 0); + tmp_strm_log.request_ts = 0; + tmp_strm_log.t_queue = -1; + tmp_strm_log.t_connect = -1; + tmp_strm_log.t_data = -1; + tmp_strm_log.t_close = ns_to_ms(now_ns - sess->accept_ts); + tmp_strm_log.bytes_in = 0; + tmp_strm_log.bytes_out = 0; + tmp_strm_log.prx_queue_pos = 0; + tmp_strm_log.srv_queue_pos = 0; + + logs = &tmp_strm_log; + + if ((fe->mode == PR_MODE_HTTP) && fe_conn && fe_conn->mux && fe_conn->mux->ctl) { + enum mux_exit_status es = fe_conn->mux->ctl(fe_conn, MUX_CTL_EXIT_STATUS, &status); + + switch (es) { + case MUX_ES_SUCCESS: + break; + case MUX_ES_INVALID_ERR: + status = (status ? status : 400); + if ((fe_conn->flags & CO_FL_ERROR) || conn_xprt_read0_pending(fe_conn)) + s_flags = SF_ERR_CLICL | SF_FINST_R; + else + s_flags = SF_ERR_PRXCOND | SF_FINST_R; + break; + case MUX_ES_TOUT_ERR: + status = (status ? status : 408); + s_flags = SF_ERR_CLITO | SF_FINST_R; + break; + case MUX_ES_NOTIMPL_ERR: + status = (status ? status : 501); + s_flags = SF_ERR_PRXCOND | SF_FINST_R; + break; + case MUX_ES_INTERNAL_ERR: + status = (status ? status : 500); + s_flags = SF_ERR_INTERNAL | SF_FINST_R; + break; + default: + break; + } + } + } + + t_request = -1; + if ((llong)(logs->request_ts - logs->accept_ts) >= 0) + t_request = ns_to_ms(logs->request_ts - logs->accept_ts); + + tmplog = dst; + + /* fill logbuffer */ + if (LIST_ISEMPTY(list_format)) + return 0; + + list_for_each_entry(tmp, list_format, list) { +#ifdef USE_OPENSSL + struct connection *conn; +#endif + const struct sockaddr_storage *addr; + const char *src = NULL; + struct sample *key; + const struct buffer empty = { }; + + switch (tmp->type) { + case LOG_FMT_SEPARATOR: + if (!last_isspace) { + LOGCHAR(' '); + last_isspace = 1; + } + break; + + case LOG_FMT_TEXT: // text + src = tmp->arg; + iret = strlcpy2(tmplog, src, dst + maxsize - tmplog); + if (iret == 0) + goto out; + tmplog += iret; + last_isspace = 0; + break; + + case LOG_FMT_EXPR: // sample expression, may be request or response + key = NULL; + if (tmp->options & LOG_OPT_REQ_CAP) + key = sample_fetch_as_type(be, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL, tmp->expr, SMP_T_STR); + + if (!key && (tmp->options & LOG_OPT_RES_CAP)) + key = sample_fetch_as_type(be, sess, s, SMP_OPT_DIR_RES|SMP_OPT_FINAL, tmp->expr, SMP_T_STR); + + if (!key && !(tmp->options & (LOG_OPT_REQ_CAP|LOG_OPT_RES_CAP))) // cfg, cli + key = sample_fetch_as_type(be, sess, s, SMP_OPT_FINAL, tmp->expr, SMP_T_STR); + + if (tmp->options & LOG_OPT_HTTP) + ret = lf_encode_chunk(tmplog, dst + maxsize, + '%', http_encode_map, key ? &key->data.u.str : &empty, tmp); + else + ret = lf_text_len(tmplog, + key ? key->data.u.str.area : NULL, + key ? key->data.u.str.data : 0, + dst + maxsize - tmplog, + tmp); + if (ret == 0) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_CLIENTIP: // %ci + addr = (s ? 
sc_src(s->scf) : sess_src(sess)); + if (addr) + ret = lf_ip(tmplog, (struct sockaddr *)addr, dst + maxsize - tmplog, tmp); + else + ret = lf_text_len(tmplog, NULL, 0, dst + maxsize - tmplog, tmp); + + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_CLIENTPORT: // %cp + addr = (s ? sc_src(s->scf) : sess_src(sess)); + if (addr) { + /* sess->listener is always defined when the session's owner is an inbound connections */ + if (addr->ss_family == AF_UNIX) + ret = ltoa_o(sess->listener->luid, tmplog, dst + maxsize - tmplog); + else + ret = lf_port(tmplog, (struct sockaddr *)addr, dst + maxsize - tmplog, tmp); + } + else + ret = lf_text_len(tmplog, NULL, 0, dst + maxsize - tmplog, tmp); + + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_FRONTENDIP: // %fi + addr = (s ? sc_dst(s->scf) : sess_dst(sess)); + if (addr) + ret = lf_ip(tmplog, (struct sockaddr *)addr, dst + maxsize - tmplog, tmp); + else + ret = lf_text_len(tmplog, NULL, 0, dst + maxsize - tmplog, tmp); + + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_FRONTENDPORT: // %fp + addr = (s ? sc_dst(s->scf) : sess_dst(sess)); + if (addr) { + /* sess->listener is always defined when the session's owner is an inbound connections */ + if (addr->ss_family == AF_UNIX) + ret = ltoa_o(sess->listener->luid, tmplog, dst + maxsize - tmplog); + else + ret = lf_port(tmplog, (struct sockaddr *)addr, dst + maxsize - tmplog, tmp); + } + else + ret = lf_text_len(tmplog, NULL, 0, dst + maxsize - tmplog, tmp); + + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_BACKENDIP: // %bi + if (be_conn && conn_get_src(be_conn)) + ret = lf_ip(tmplog, (const struct sockaddr *)be_conn->src, dst + maxsize - tmplog, tmp); + else + ret = lf_text_len(tmplog, NULL, 0, dst + maxsize - tmplog, tmp); + + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_BACKENDPORT: // %bp + if (be_conn && conn_get_src(be_conn)) + ret = lf_port(tmplog, (struct sockaddr *)be_conn->src, dst + maxsize - tmplog, tmp); + else + ret = lf_text_len(tmplog, NULL, 0, dst + maxsize - tmplog, tmp); + + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_SERVERIP: // %si + if (be_conn && conn_get_dst(be_conn)) + ret = lf_ip(tmplog, (struct sockaddr *)be_conn->dst, dst + maxsize - tmplog, tmp); + else + ret = lf_text_len(tmplog, NULL, 0, dst + maxsize - tmplog, tmp); + + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_SERVERPORT: // %sp + if (be_conn && conn_get_dst(be_conn)) + ret = lf_port(tmplog, (struct sockaddr *)be_conn->dst, dst + maxsize - tmplog, tmp); + else + ret = lf_text_len(tmplog, NULL, 0, dst + maxsize - tmplog, tmp); + + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_DATE: // %t = accept date + get_localtime(logs->accept_date.tv_sec, &tm); + ret = date2str_log(tmplog, &tm, &logs->accept_date, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_tr: // %tr = start of request date + /* Note that the timers are valid if we get here */ + tv_ms_add(&tv, &logs->accept_date, logs->t_idle >= 0 ? 
logs->t_idle + logs->t_handshake : 0); + get_localtime(tv.tv_sec, &tm); + ret = date2str_log(tmplog, &tm, &tv, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_DATEGMT: // %T = accept date, GMT + get_gmtime(logs->accept_date.tv_sec, &tm); + ret = gmt2str_log(tmplog, &tm, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_trg: // %trg = start of request date, GMT + tv_ms_add(&tv, &logs->accept_date, logs->t_idle >= 0 ? logs->t_idle + logs->t_handshake : 0); + get_gmtime(tv.tv_sec, &tm); + ret = gmt2str_log(tmplog, &tm, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_DATELOCAL: // %Tl = accept date, local + get_localtime(logs->accept_date.tv_sec, &tm); + ret = localdate2str_log(tmplog, logs->accept_date.tv_sec, &tm, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_trl: // %trl = start of request date, local + tv_ms_add(&tv, &logs->accept_date, logs->t_idle >= 0 ? logs->t_idle + logs->t_handshake : 0); + get_localtime(tv.tv_sec, &tm); + ret = localdate2str_log(tmplog, tv.tv_sec, &tm, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_TS: // %Ts + if (tmp->options & LOG_OPT_HEXA) { + iret = snprintf(tmplog, dst + maxsize - tmplog, "%04X", (unsigned int)logs->accept_date.tv_sec); + if (iret < 0 || iret > dst + maxsize - tmplog) + goto out; + last_isspace = 0; + tmplog += iret; + } else { + ret = ltoa_o(logs->accept_date.tv_sec, tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + } + break; + + case LOG_FMT_MS: // %ms + if (tmp->options & LOG_OPT_HEXA) { + iret = snprintf(tmplog, dst + maxsize - tmplog, "%02X",(unsigned int)logs->accept_date.tv_usec/1000); + if (iret < 0 || iret > dst + maxsize - tmplog) + goto out; + last_isspace = 0; + tmplog += iret; + } else { + if ((dst + maxsize - tmplog) < 4) + goto out; + ret = utoa_pad((unsigned int)logs->accept_date.tv_usec/1000, + tmplog, 4); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + } + break; + + case LOG_FMT_FRONTEND: // %f + src = fe->id; + ret = lf_text(tmplog, src, dst + maxsize - tmplog, tmp); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_FRONTEND_XPRT: // %ft + src = fe->id; + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + iret = strlcpy2(tmplog, src, dst + maxsize - tmplog); + if (iret == 0) + goto out; + tmplog += iret; + + /* sess->listener may be undefined if the session's owner is a health-check */ + if (sess->listener && sess->listener->bind_conf->xprt->get_ssl_sock_ctx) + LOGCHAR('~'); + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + last_isspace = 0; + break; +#ifdef USE_OPENSSL + case LOG_FMT_SSL_CIPHER: // %sslc + src = NULL; + conn = objt_conn(sess->origin); + if (conn) { + src = ssl_sock_get_cipher_name(conn); + } + ret = lf_text(tmplog, src, dst + maxsize - tmplog, tmp); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_SSL_VERSION: // %sslv + src = NULL; + conn = objt_conn(sess->origin); + if (conn) { + src = ssl_sock_get_proto_version(conn); + } + ret = lf_text(tmplog, src, dst + maxsize - tmplog, tmp); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; +#endif + case LOG_FMT_BACKEND: // %b + 
src = be->id; + ret = lf_text(tmplog, src, dst + maxsize - tmplog, tmp); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_SERVER: // %s + switch (obj_type(s ? s->target : sess->origin)) { + case OBJ_TYPE_SERVER: + src = __objt_server(s->target)->id; + break; + case OBJ_TYPE_APPLET: + src = __objt_applet(s->target)->name; + break; + case OBJ_TYPE_CHECK: + src = (__objt_check(sess->origin)->server + ? __objt_check(sess->origin)->server->id + : "<NOSRV>"); + break; + default: + src = "<NOSRV>"; + break; + } + ret = lf_text(tmplog, src, dst + maxsize - tmplog, tmp); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_Th: // %Th = handshake time + ret = ltoa_o(logs->t_handshake, tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_Ti: // %Ti = HTTP idle time + ret = ltoa_o(logs->t_idle, tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_TR: // %TR = HTTP request time + ret = ltoa_o((t_request >= 0) ? t_request - logs->t_idle - logs->t_handshake : -1, + tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_TQ: // %Tq = Th + Ti + TR + ret = ltoa_o(t_request, tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_TW: // %Tw + ret = ltoa_o((logs->t_queue >= 0) ? logs->t_queue - t_request : -1, + tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_TC: // %Tc + ret = ltoa_o((logs->t_connect >= 0) ? logs->t_connect - logs->t_queue : -1, + tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_Tr: // %Tr + ret = ltoa_o((logs->t_data >= 0) ? logs->t_data - logs->t_connect : -1, + tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_TD: // %Td + if (be->mode == PR_MODE_HTTP) + ret = ltoa_o((logs->t_data >= 0) ? logs->t_close - logs->t_data : -1, + tmplog, dst + maxsize - tmplog); + else + ret = ltoa_o((logs->t_connect >= 0) ? logs->t_close - logs->t_connect : -1, + tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_Ta: // %Ta = active time = Tt - Th - Ti + if (!(fe->to_log & LW_BYTES)) + LOGCHAR('+'); + ret = ltoa_o(logs->t_close - (logs->t_idle >= 0 ? logs->t_idle + logs->t_handshake : 0), + tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_TT: // %Tt = total time + if (!(fe->to_log & LW_BYTES)) + LOGCHAR('+'); + ret = ltoa_o(logs->t_close, tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_TU: // %Tu = total time seen by user = Tt - Ti + if (!(fe->to_log & LW_BYTES)) + LOGCHAR('+'); + ret = ltoa_o(logs->t_close - (logs->t_idle >= 0 ? 
logs->t_idle : 0), + tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_STATUS: // %ST + ret = ltoa_o(status, tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_BYTES: // %B + if (!(fe->to_log & LW_BYTES)) + LOGCHAR('+'); + ret = lltoa(logs->bytes_out, tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_BYTES_UP: // %U + ret = lltoa(logs->bytes_in, tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_CCLIENT: // %CC + src = txn ? txn->cli_cookie : NULL; + ret = lf_text(tmplog, src, dst + maxsize - tmplog, tmp); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_CSERVER: // %CS + src = txn ? txn->srv_cookie : NULL; + ret = lf_text(tmplog, src, dst + maxsize - tmplog, tmp); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_TERMSTATE: // %ts + LOGCHAR(sess_term_cond[(s_flags & SF_ERR_MASK) >> SF_ERR_SHIFT]); + LOGCHAR(sess_fin_state[(s_flags & SF_FINST_MASK) >> SF_FINST_SHIFT]); + *tmplog = '\0'; + last_isspace = 0; + break; + + case LOG_FMT_TERMSTATE_CK: // %tsc, same as TS with cookie state (for mode HTTP) + LOGCHAR(sess_term_cond[(s_flags & SF_ERR_MASK) >> SF_ERR_SHIFT]); + LOGCHAR(sess_fin_state[(s_flags & SF_FINST_MASK) >> SF_FINST_SHIFT]); + LOGCHAR((txn && (be->ck_opts & PR_CK_ANY)) ? sess_cookie[(txn->flags & TX_CK_MASK) >> TX_CK_SHIFT] : '-'); + LOGCHAR((txn && (be->ck_opts & PR_CK_ANY)) ? sess_set_cookie[(txn->flags & TX_SCK_MASK) >> TX_SCK_SHIFT] : '-'); + last_isspace = 0; + break; + + case LOG_FMT_ACTCONN: // %ac + ret = ltoa_o(actconn, tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_FECONN: // %fc + ret = ltoa_o(fe->feconn, tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_BECONN: // %bc + ret = ltoa_o(be->beconn, tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_SRVCONN: // %sc + switch (obj_type(s ? s->target : sess->origin)) { + case OBJ_TYPE_SERVER: + ret = ultoa_o(__objt_server(s->target)->cur_sess, + tmplog, dst + maxsize - tmplog); + break; + case OBJ_TYPE_CHECK: + ret = ultoa_o(__objt_check(sess->origin)->server + ? __objt_check(sess->origin)->server->cur_sess + : 0, tmplog, dst + maxsize - tmplog); + break; + default: + ret = ultoa_o(0, tmplog, dst + maxsize - tmplog); + break; + } + + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_RETRIES: // %rc + if (s_flags & SF_REDISP) + LOGCHAR('+'); + ret = ltoa_o((s ? 
s->conn_retries : 0), tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_SRVQUEUE: // %sq + ret = ltoa_o(logs->srv_queue_pos, tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_BCKQUEUE: // %bq + ret = ltoa_o(logs->prx_queue_pos, tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + case LOG_FMT_HDRREQUEST: // %hr + /* request header */ + if (fe->nb_req_cap && s && s->req_cap) { + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + LOGCHAR('{'); + for (hdr = 0; hdr < fe->nb_req_cap; hdr++) { + if (hdr) + LOGCHAR('|'); + if (s->req_cap[hdr] != NULL) { + ret = lf_encode_string(tmplog, dst + maxsize, + '#', hdr_encode_map, s->req_cap[hdr], tmp); + if (ret == NULL || *ret != '\0') + goto out; + tmplog = ret; + } + } + LOGCHAR('}'); + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + last_isspace = 0; + } + break; + + case LOG_FMT_HDRREQUESTLIST: // %hrl + /* request header list */ + if (fe->nb_req_cap && s && s->req_cap) { + for (hdr = 0; hdr < fe->nb_req_cap; hdr++) { + if (hdr > 0) + LOGCHAR(' '); + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + if (s->req_cap[hdr] != NULL) { + ret = lf_encode_string(tmplog, dst + maxsize, + '#', hdr_encode_map, s->req_cap[hdr], tmp); + if (ret == NULL || *ret != '\0') + goto out; + tmplog = ret; + } else if (!(tmp->options & LOG_OPT_QUOTE)) + LOGCHAR('-'); + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + last_isspace = 0; + } + } + break; + + + case LOG_FMT_HDRRESPONS: // %hs + /* response header */ + if (fe->nb_rsp_cap && s && s->res_cap) { + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + LOGCHAR('{'); + for (hdr = 0; hdr < fe->nb_rsp_cap; hdr++) { + if (hdr) + LOGCHAR('|'); + if (s->res_cap[hdr] != NULL) { + ret = lf_encode_string(tmplog, dst + maxsize, + '#', hdr_encode_map, s->res_cap[hdr], tmp); + if (ret == NULL || *ret != '\0') + goto out; + tmplog = ret; + } + } + LOGCHAR('}'); + last_isspace = 0; + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + } + break; + + case LOG_FMT_HDRRESPONSLIST: // %hsl + /* response header list */ + if (fe->nb_rsp_cap && s && s->res_cap) { + for (hdr = 0; hdr < fe->nb_rsp_cap; hdr++) { + if (hdr > 0) + LOGCHAR(' '); + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + if (s->res_cap[hdr] != NULL) { + ret = lf_encode_string(tmplog, dst + maxsize, + '#', hdr_encode_map, s->res_cap[hdr], tmp); + if (ret == NULL || *ret != '\0') + goto out; + tmplog = ret; + } else if (!(tmp->options & LOG_OPT_QUOTE)) + LOGCHAR('-'); + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + last_isspace = 0; + } + } + break; + + case LOG_FMT_REQ: // %r + /* Request */ + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + uri = txn && txn->uri ? txn->uri : "<BADREQ>"; + ret = lf_encode_string(tmplog, dst + maxsize, + '#', url_encode_map, uri, tmp); + if (ret == NULL || *ret != '\0') + goto out; + tmplog = ret; + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + last_isspace = 0; + break; + + case LOG_FMT_HTTP_PATH: // %HP + uri = txn && txn->uri ? 
txn->uri : "<BADREQ>"; + + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + + end = uri + strlen(uri); + // look for the first whitespace character + while (uri < end && !HTTP_IS_SPHT(*uri)) + uri++; + + // keep advancing past multiple spaces + while (uri < end && HTTP_IS_SPHT(*uri)) { + uri++; nspaces++; + } + + // look for first space or question mark after url + spc = uri; + while (spc < end && *spc != '?' && !HTTP_IS_SPHT(*spc)) + spc++; + + if (!txn || !txn->uri || nspaces == 0) { + chunk.area = "<BADREQ>"; + chunk.data = strlen("<BADREQ>"); + } else { + chunk.area = uri; + chunk.data = spc - uri; + } + + ret = lf_encode_chunk(tmplog, dst + maxsize, '#', url_encode_map, &chunk, tmp); + if (ret == NULL || *ret != '\0') + goto out; + + tmplog = ret; + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + + last_isspace = 0; + break; + + case LOG_FMT_HTTP_PATH_ONLY: // %HPO + uri = txn && txn->uri ? txn->uri : "<BADREQ>"; + + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + + end = uri + strlen(uri); + + // look for the first whitespace character + while (uri < end && !HTTP_IS_SPHT(*uri)) + uri++; + + // keep advancing past multiple spaces + while (uri < end && HTTP_IS_SPHT(*uri)) { + uri++; nspaces++; + } + + // look for first space after url + spc = uri; + while (spc < end && !HTTP_IS_SPHT(*spc)) + spc++; + + path = ist2(uri, spc - uri); + + // extract relative path without query params from url + parser = http_uri_parser_init(path); + path = iststop(http_parse_path(&parser), '?'); + if (!txn || !txn->uri || nspaces == 0) { + chunk.area = "<BADREQ>"; + chunk.data = strlen("<BADREQ>"); + } else { + chunk.area = path.ptr; + chunk.data = path.len; + } + + ret = lf_encode_chunk(tmplog, dst + maxsize, '#', url_encode_map, &chunk, tmp); + if (ret == NULL || *ret != '\0') + goto out; + + tmplog = ret; + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + + last_isspace = 0; + break; + + case LOG_FMT_HTTP_QUERY: // %HQ + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + + if (!txn || !txn->uri) { + chunk.area = "<BADREQ>"; + chunk.data = strlen("<BADREQ>"); + } else { + uri = txn->uri; + end = uri + strlen(uri); + // look for the first question mark + while (uri < end && *uri != '?') + uri++; + + qmark = uri; + // look for first space or question mark after url + while (uri < end && !HTTP_IS_SPHT(*uri)) + uri++; + + chunk.area = qmark; + chunk.data = uri - qmark; + } + + ret = lf_encode_chunk(tmplog, dst + maxsize, '#', url_encode_map, &chunk, tmp); + if (ret == NULL || *ret != '\0') + goto out; + + tmplog = ret; + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + + last_isspace = 0; + break; + + case LOG_FMT_HTTP_URI: // %HU + uri = txn && txn->uri ? 
txn->uri : "<BADREQ>"; + + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + + end = uri + strlen(uri); + // look for the first whitespace character + while (uri < end && !HTTP_IS_SPHT(*uri)) + uri++; + + // keep advancing past multiple spaces + while (uri < end && HTTP_IS_SPHT(*uri)) { + uri++; nspaces++; + } + + // look for first space after url + spc = uri; + while (spc < end && !HTTP_IS_SPHT(*spc)) + spc++; + + if (!txn || !txn->uri || nspaces == 0) { + chunk.area = "<BADREQ>"; + chunk.data = strlen("<BADREQ>"); + } else { + chunk.area = uri; + chunk.data = spc - uri; + } + + ret = lf_encode_chunk(tmplog, dst + maxsize, '#', url_encode_map, &chunk, tmp); + if (ret == NULL || *ret != '\0') + goto out; + + tmplog = ret; + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + + last_isspace = 0; + break; + + case LOG_FMT_HTTP_METHOD: // %HM + uri = txn && txn->uri ? txn->uri : "<BADREQ>"; + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + + end = uri + strlen(uri); + // look for the first whitespace character + spc = uri; + while (spc < end && !HTTP_IS_SPHT(*spc)) + spc++; + + if (spc == end) { // odd case, we have txn->uri, but we only got a verb + chunk.area = "<BADREQ>"; + chunk.data = strlen("<BADREQ>"); + } else { + chunk.area = uri; + chunk.data = spc - uri; + } + + ret = lf_encode_chunk(tmplog, dst + maxsize, '#', url_encode_map, &chunk, tmp); + if (ret == NULL || *ret != '\0') + goto out; + + tmplog = ret; + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + + last_isspace = 0; + break; + + case LOG_FMT_HTTP_VERSION: // %HV + uri = txn && txn->uri ? txn->uri : "<BADREQ>"; + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + + end = uri + strlen(uri); + // look for the first whitespace character + while (uri < end && !HTTP_IS_SPHT(*uri)) + uri++; + + // keep advancing past multiple spaces + while (uri < end && HTTP_IS_SPHT(*uri)) { + uri++; nspaces++; + } + + // look for the next whitespace character + while (uri < end && !HTTP_IS_SPHT(*uri)) + uri++; + + // keep advancing past multiple spaces + while (uri < end && HTTP_IS_SPHT(*uri)) + uri++; + + if (!txn || !txn->uri || nspaces == 0) { + chunk.area = "<BADREQ>"; + chunk.data = strlen("<BADREQ>"); + } else if (uri == end) { + chunk.area = "HTTP/0.9"; + chunk.data = strlen("HTTP/0.9"); + } else { + chunk.area = uri; + chunk.data = end - uri; + } + + ret = lf_encode_chunk(tmplog, dst + maxsize, '#', url_encode_map, &chunk, tmp); + if (ret == NULL || *ret != '\0') + goto out; + + tmplog = ret; + if (tmp->options & LOG_OPT_QUOTE) + LOGCHAR('"'); + + last_isspace = 0; + break; + + case LOG_FMT_COUNTER: // %rt + if (tmp->options & LOG_OPT_HEXA) { + iret = snprintf(tmplog, dst + maxsize - tmplog, "%04X", uniq_id); + if (iret < 0 || iret > dst + maxsize - tmplog) + goto out; + last_isspace = 0; + tmplog += iret; + } else { + ret = ltoa_o(uniq_id, tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + } + break; + + case LOG_FMT_LOGCNT: // %lc + if (tmp->options & LOG_OPT_HEXA) { + iret = snprintf(tmplog, dst + maxsize - tmplog, "%04X", fe->log_count); + if (iret < 0 || iret > dst + maxsize - tmplog) + goto out; + last_isspace = 0; + tmplog += iret; + } else { + ret = ultoa_o(fe->log_count, tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + } + break; + + case LOG_FMT_HOSTNAME: // %H + src = hostname; + ret = lf_text(tmplog, src, dst + maxsize - tmplog, tmp); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + 
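+			/* Illustrative sketch (with a hypothetical <value>) of the
+			 * common emit contract that the numeric cases in this switch
+			 * all follow:
+			 *
+			 *   ret = ltoa_o(value, tmplog, dst + maxsize - tmplog);
+			 *   if (ret == NULL)
+			 *           goto out;     // no room left in the output buffer
+			 *   tmplog = ret;         // advance past the emitted digits
+			 *   last_isspace = 0;     // a non-blank field was written
+			 */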
case LOG_FMT_PID: // %pid + if (tmp->options & LOG_OPT_HEXA) { + iret = snprintf(tmplog, dst + maxsize - tmplog, "%04X", pid); + if (iret < 0 || iret > dst + maxsize - tmplog) + goto out; + last_isspace = 0; + tmplog += iret; + } else { + ret = ltoa_o(pid, tmplog, dst + maxsize - tmplog); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + } + break; + + case LOG_FMT_UNIQUEID: // %ID + ret = NULL; + if (s) + ret = lf_text_len(tmplog, s->unique_id.ptr, s->unique_id.len, maxsize - (tmplog - dst), tmp); + else + ret = lf_text_len(tmplog, NULL, 0, maxsize - (tmplog - dst), tmp); + if (ret == NULL) + goto out; + tmplog = ret; + last_isspace = 0; + break; + + } + } + +out: + /* *tmplog is a unused character */ + *tmplog = '\0'; + return tmplog - dst; + +} + +/* + * send a log for the stream when we have enough info about it. + * Will not log if the frontend has no log defined. + */ +void strm_log(struct stream *s) +{ + struct session *sess = s->sess; + int size, err, level; + int sd_size = 0; + + /* if we don't want to log normal traffic, return now */ + err = (s->flags & SF_REDISP) || + ((s->flags & SF_ERR_MASK) > SF_ERR_LOCAL) || + (((s->flags & SF_ERR_MASK) == SF_ERR_NONE) && s->conn_retries) || + ((sess->fe->mode == PR_MODE_HTTP) && s->txn && s->txn->status >= 500); + + if (!err && (sess->fe->options2 & PR_O2_NOLOGNORM)) + return; + + if (LIST_ISEMPTY(&sess->fe->loggers)) + return; + + if (s->logs.level) { /* loglevel was overridden */ + if (s->logs.level == -1) { + s->logs.logwait = 0; /* logs disabled */ + return; + } + level = s->logs.level - 1; + } + else { + level = LOG_INFO; + if (err && (sess->fe->options2 & PR_O2_LOGERRORS)) + level = LOG_ERR; + } + + /* if unique-id was not generated */ + if (!isttest(s->unique_id) && !LIST_ISEMPTY(&sess->fe->format_unique_id)) { + stream_generate_unique_id(s, &sess->fe->format_unique_id); + } + + if (!LIST_ISEMPTY(&sess->fe->logformat_sd)) { + sd_size = build_logline(s, logline_rfc5424, global.max_syslog_len, + &sess->fe->logformat_sd); + } + + size = build_logline(s, logline, global.max_syslog_len, &sess->fe->logformat); + if (size > 0) { + _HA_ATOMIC_INC(&sess->fe->log_count); + __send_log(&sess->fe->loggers, &sess->fe->log_tag, level, + logline, size + 1, logline_rfc5424, sd_size); + s->logs.logwait = 0; + } +} + +/* + * send a minimalist log for the session. Will not log if the frontend has no + * log defined. It is assumed that this is only used to report anomalies that + * cannot lead to the creation of a regular stream. Because of this the log + * level is LOG_INFO or LOG_ERR depending on the "log-separate-error" setting + * in the frontend. The caller must simply know that it should not call this + * function to report unimportant events. It is safe to call this function with + * sess==NULL (will not do anything). 
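+ *
+ * A minimal illustrative call site (the condition and its name are
+ * placeholders, not upstream code):
+ *
+ *   if (stream_creation_failed)     // some anomaly before any stream exists
+ *           sess_log(sess);         // sess may even be NULL, it is checked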
+ */
+void sess_log(struct session *sess)
+{
+	int size, level;
+	int sd_size = 0;
+
+	if (!sess)
+		return;
+
+	if (LIST_ISEMPTY(&sess->fe->loggers))
+		return;
+
+	level = LOG_INFO;
+	if (sess->fe->options2 & PR_O2_LOGERRORS)
+		level = LOG_ERR;
+
+	if (!LIST_ISEMPTY(&sess->fe->logformat_sd)) {
+		sd_size = sess_build_logline(sess, NULL,
+		                             logline_rfc5424, global.max_syslog_len,
+		                             &sess->fe->logformat_sd);
+	}
+
+	if (!LIST_ISEMPTY(&sess->fe->logformat_error))
+		size = sess_build_logline(sess, NULL, logline, global.max_syslog_len, &sess->fe->logformat_error);
+	else
+		size = sess_build_logline(sess, NULL, logline, global.max_syslog_len, &sess->fe->logformat);
+	if (size > 0) {
+		_HA_ATOMIC_INC(&sess->fe->log_count);
+		__send_log(&sess->fe->loggers, &sess->fe->log_tag, level,
+		           logline, size + 1, logline_rfc5424, sd_size);
+	}
+}
+
+void app_log(struct list *loggers, struct buffer *tag, int level, const char *format, ...)
+{
+	va_list argp;
+	int data_len;
+
+	if (level < 0 || format == NULL || logline == NULL)
+		return;
+
+	va_start(argp, format);
+	data_len = vsnprintf(logline, global.max_syslog_len, format, argp);
+	if (data_len < 0 || data_len > global.max_syslog_len)
+		data_len = global.max_syslog_len;
+	va_end(argp);
+
+	__send_log(loggers, tag, level, logline, data_len, default_rfc5424_sd_log_format, 2);
+}
+
+/*
+ * This function parses a received log message <buf> of size <buflen>. It
+ * fills <level>, <facility> and <metadata> depending on the detected header
+ * format, and makes <message> point to the remaining payload of <size> bytes.
+ *
+ * <metadata> must point to a preallocated array of LOG_META_FIELDS*sizeof(struct ist).
+ * The len of a struct ist will be set to 0 if the corresponding field is not
+ * found, and <level> and <facility> will be set to -1 if not found.
+ */
+void parse_log_message(char *buf, size_t buflen, int *level, int *facility,
+                       struct ist *metadata, char **message, size_t *size)
+{
+
+	char *p;
+	int fac_level = 0;
+
+	*level = *facility = -1;
+
+	*message = buf;
+	*size = buflen;
+
+	memset(metadata, 0, LOG_META_FIELDS*sizeof(struct ist));
+
+	p = buf;
+	if (*size < 2 || *p != '<')
+		return;
+
+	p++;
+	while (*p != '>') {
+		if (*p > '9' || *p < '0')
+			return;
+		fac_level = 10*fac_level + (*p - '0');
+		p++;
+		if ((p - buf) > buflen)
+			return;
+	}
+
+	*facility = fac_level >> 3;
+	*level = fac_level & 0x7;
+	p++;
+
+	metadata[LOG_META_PRIO] = ist2(buf, p - buf);
+
+	buflen -= p - buf;
+	buf = p;
+
+	*size = buflen;
+	*message = buf;
+
+	/* for rfc5424, prio is always followed by '1' and ' ' */
+	if ((*size > 2) && (p[0] == '1') && (p[1] == ' ')) {
+		/* format is always '1 TIMESTAMP HOSTNAME TAG PID MSGID STDATA '
+		 * followed by the message.
+		 * Each header field may be the NILVALUE: '-'
+		 */
+
+		p += 2;
+		*size -= 2;
+		/* timestamp is NILVALUE '-' */
+		if (*size > 2 && (p[0] == '-') && p[1] == ' ') {
+			metadata[LOG_META_TIME] = ist2(p, 1);
+			p++;
+		}
+		else if (*size > LOG_ISOTIME_MINLEN) {
+			metadata[LOG_META_TIME].ptr = p;
+
+			/* check if the optional secfrac is present
+			 * in the timestamp.
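+			 * The fixed 19-byte skip below (p += 19) corresponds to the
+			 * mandatory 'YYYY-MM-DDTHH:MM:SS' prefix of an RFC 5424
+			 * TIMESTAMP;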
+ * possible format are: + * ex: '1970-01-01T00:00:00.000000Z' + * '1970-01-01T00:00:00.000000+00:00' + * '1970-01-01T00:00:00.000000-00:00' + * '1970-01-01T00:00:00Z' + * '1970-01-01T00:00:00+00:00' + * '1970-01-01T00:00:00-00:00' + */ + p += 19; + if (*p == '.') { + p++; + if ((p - buf) >= buflen) + goto bad_format; + while (*p != 'Z' && *p != '+' && *p != '-') { + if ((unsigned char)(*p - '0') > 9) + goto bad_format; + + p++; + if ((p - buf) >= buflen) + goto bad_format; + } + } + + if (*p == 'Z') + p++; + else + p += 6; /* case of '+00:00 or '-00:00' */ + + if ((p - buf) >= buflen || *p != ' ') + goto bad_format; + metadata[LOG_META_TIME].len = p - metadata[LOG_META_TIME].ptr; + } + else + goto bad_format; + + + p++; + if ((p - buf) >= buflen || *p == ' ') + goto bad_format; + + metadata[LOG_META_HOST].ptr = p; + while (*p != ' ') { + p++; + if ((p - buf) >= buflen) + goto bad_format; + } + metadata[LOG_META_HOST].len = p - metadata[LOG_META_HOST].ptr; + if (metadata[LOG_META_HOST].len == 1 && metadata[LOG_META_HOST].ptr[0] == '-') + metadata[LOG_META_HOST].len = 0; + + p++; + if ((p - buf) >= buflen || *p == ' ') + goto bad_format; + + metadata[LOG_META_TAG].ptr = p; + while (*p != ' ') { + p++; + if ((p - buf) >= buflen) + goto bad_format; + } + metadata[LOG_META_TAG].len = p - metadata[LOG_META_TAG].ptr; + if (metadata[LOG_META_TAG].len == 1 && metadata[LOG_META_TAG].ptr[0] == '-') + metadata[LOG_META_TAG].len = 0; + + p++; + if ((p - buf) >= buflen || *p == ' ') + goto bad_format; + + metadata[LOG_META_PID].ptr = p; + while (*p != ' ') { + p++; + if ((p - buf) >= buflen) + goto bad_format; + } + metadata[LOG_META_PID].len = p - metadata[LOG_META_PID].ptr; + if (metadata[LOG_META_PID].len == 1 && metadata[LOG_META_PID].ptr[0] == '-') + metadata[LOG_META_PID].len = 0; + + p++; + if ((p - buf) >= buflen || *p == ' ') + goto bad_format; + + metadata[LOG_META_MSGID].ptr = p; + while (*p != ' ') { + p++; + if ((p - buf) >= buflen) + goto bad_format; + } + metadata[LOG_META_MSGID].len = p - metadata[LOG_META_MSGID].ptr; + if (metadata[LOG_META_MSGID].len == 1 && metadata[LOG_META_MSGID].ptr[0] == '-') + metadata[LOG_META_MSGID].len = 0; + + p++; + if ((p - buf) >= buflen || *p == ' ') + goto bad_format; + + /* structured data format is: + * ex: + * '[key1=value1 key2=value2][key3=value3]' + * + * space is invalid outside [] because + * considered as the end of structured data field + */ + metadata[LOG_META_STDATA].ptr = p; + if (*p == '[') { + int elem = 0; + + while (1) { + if (elem) { + /* according to rfc this char is escaped in param values */ + if (*p == ']' && *(p-1) != '\\') + elem = 0; + } + else { + if (*p == '[') + elem = 1; + else if (*p == ' ') + break; + else + goto bad_format; + } + p++; + if ((p - buf) >= buflen) + goto bad_format; + } + } + else if (*p == '-') { + /* case of NILVALUE */ + p++; + if ((p - buf) >= buflen || *p != ' ') + goto bad_format; + } + else + goto bad_format; + + metadata[LOG_META_STDATA].len = p - metadata[LOG_META_STDATA].ptr; + if (metadata[LOG_META_STDATA].len == 1 && metadata[LOG_META_STDATA].ptr[0] == '-') + metadata[LOG_META_STDATA].len = 0; + + p++; + + buflen -= p - buf; + buf = p; + + *size = buflen; + *message = p; + } + else if (*size > LOG_LEGACYTIME_LEN) { + int m; + + /* supported header format according to rfc3164. 
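+ * (i.e. legacy BSD syslog, whose timestamp has the fixed width
+ * LOG_LEGACYTIME_LEN)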
+ * ex: + * 'Jan 1 00:00:00 HOSTNAME TAG[PID]: ' + * or 'Jan 1 00:00:00 HOSTNAME TAG: ' + * or 'Jan 1 00:00:00 HOSTNAME ' + * Note: HOSTNAME is mandatory, and day + * of month uses a single space prefix if + * less than 10 to ensure hour offset is + * always the same. + */ + + /* Check month to see if it correspond to a rfc3164 + * header ex 'Jan 1 00:00:00' */ + for (m = 0; m < 12; m++) + if (!memcmp(monthname[m], p, 3)) + break; + /* Month not found */ + if (m == 12) + goto bad_format; + + metadata[LOG_META_TIME] = ist2(p, LOG_LEGACYTIME_LEN); + + p += LOG_LEGACYTIME_LEN; + if ((p - buf) >= buflen || *p != ' ') + goto bad_format; + + p++; + if ((p - buf) >= buflen || *p == ' ') + goto bad_format; + + metadata[LOG_META_HOST].ptr = p; + while (*p != ' ') { + p++; + if ((p - buf) >= buflen) + goto bad_format; + } + metadata[LOG_META_HOST].len = p - metadata[LOG_META_HOST].ptr; + + /* TAG seems to no be mandatory */ + p++; + + buflen -= p - buf; + buf = p; + + *size = buflen; + *message = buf; + + if (!buflen) + return; + + while (((p - buf) < buflen) && *p != ' ' && *p != ':') + p++; + + /* a tag must present a trailing ':' */ + if (((p - buf) >= buflen) || *p != ':') + return; + p++; + /* followed by a space */ + if (((p - buf) >= buflen) || *p != ' ') + return; + + /* rewind to parse tag and pid */ + p = buf; + metadata[LOG_META_TAG].ptr = p; + /* we have the guarantee that ':' will be reach before size limit */ + while (*p != ':') { + if (*p == '[') { + metadata[LOG_META_TAG].len = p - metadata[LOG_META_TAG].ptr; + metadata[LOG_META_PID].ptr = p + 1; + } + else if (*p == ']' && isttest(metadata[LOG_META_PID])) { + if (p[1] != ':') + return; + metadata[LOG_META_PID].len = p - metadata[LOG_META_PID].ptr; + } + p++; + } + if (!metadata[LOG_META_TAG].len) + metadata[LOG_META_TAG].len = p - metadata[LOG_META_TAG].ptr; + + /* let pass ':' and ' ', we still have warranty size is large enough */ + p += 2; + + buflen -= p - buf; + buf = p; + + *size = buflen; + *message = buf; + } + + return; + +bad_format: + /* bad syslog format, we reset all parsed syslog fields + * but priority is kept because we are able to re-build + * this message using LOF_FORMAT_PRIO. + */ + metadata[LOG_META_TIME].len = 0; + metadata[LOG_META_HOST].len = 0; + metadata[LOG_META_TAG].len = 0; + metadata[LOG_META_PID].len = 0; + metadata[LOG_META_MSGID].len = 0; + metadata[LOG_META_STDATA].len = 0; + + return; +} + +/* + * UDP syslog fd handler + */ +void syslog_fd_handler(int fd) +{ + static THREAD_LOCAL struct ist metadata[LOG_META_FIELDS]; + ssize_t ret = 0; + struct buffer *buf = get_trash_chunk(); + size_t size; + char *message; + int level; + int facility; + struct listener *l = objt_listener(fdtab[fd].owner); + int max_accept; + + BUG_ON(!l); + + if (fdtab[fd].state & FD_POLL_IN) { + + if (!fd_recv_ready(fd)) + return; + + max_accept = l->bind_conf->maxaccept ? 
l->bind_conf->maxaccept : 1; + + do { + /* Source address */ + struct sockaddr_storage saddr = {0}; + socklen_t saddrlen; + + saddrlen = sizeof(saddr); + + ret = recvfrom(fd, buf->area, buf->size, 0, (struct sockaddr *)&saddr, &saddrlen); + if (ret < 0) { + if (errno == EINTR) + continue; + if (errno == EAGAIN || errno == EWOULDBLOCK) + fd_cant_recv(fd); + goto out; + } + buf->data = ret; + + /* update counters */ + _HA_ATOMIC_INC(&cum_log_messages); + proxy_inc_fe_req_ctr(l, l->bind_conf->frontend, 0); + + parse_log_message(buf->area, buf->data, &level, &facility, metadata, &message, &size); + + process_send_log(&l->bind_conf->frontend->loggers, level, facility, metadata, message, size); + + } while (--max_accept); + } + +out: + return; +} + +/* + * IO Handler to handle message exchange with a syslog tcp client + */ +static void syslog_io_handler(struct appctx *appctx) +{ + static THREAD_LOCAL struct ist metadata[LOG_META_FIELDS]; + struct stconn *sc = appctx_sc(appctx); + struct stream *s = __sc_strm(sc); + struct proxy *frontend = strm_fe(s); + struct listener *l = strm_li(s); + struct buffer *buf = get_trash_chunk(); + int max_accept; + int to_skip; + int facility; + int level; + char *message; + size_t size; + + if (unlikely(se_fl_test(appctx->sedesc, (SE_FL_EOS|SE_FL_ERROR|SE_FL_SHR|SE_FL_SHW)))) { + co_skip(sc_oc(sc), co_data(sc_oc(sc))); + goto out; + } + + max_accept = l->bind_conf->maxaccept ? l->bind_conf->maxaccept : 1; + while (1) { + char c; + + if (max_accept <= 0) + goto missing_budget; + max_accept--; + + to_skip = co_getchar(sc_oc(sc), &c); + if (!to_skip) + goto missing_data; + else if (to_skip < 0) + goto cli_abort; + + if (c == '<') { + /* rfc-6587, Non-Transparent-Framing: messages separated by + * a trailing LF or CR LF + */ + to_skip = co_getline(sc_oc(sc), buf->area, buf->size); + if (!to_skip) + goto missing_data; + else if (to_skip < 0) + goto cli_abort; + + if (buf->area[to_skip - 1] != '\n') + goto parse_error; + + buf->data = to_skip - 1; + + /* according to rfc-6587, some devices adds CR before LF */ + if (buf->data && buf->area[buf->data - 1] == '\r') + buf->data--; + + } + else if ((unsigned char)(c - '1') <= 8) { + /* rfc-6587, Octet-Counting: message length in ASCII + * (first digit can not be ZERO), followed by a space + * and message length + */ + char *p = NULL; + int msglen; + + to_skip = co_getword(sc_oc(sc), buf->area, buf->size, ' '); + if (!to_skip) + goto missing_data; + else if (to_skip < 0) + goto cli_abort; + + if (buf->area[to_skip - 1] != ' ') + goto parse_error; + + msglen = strtol(buf->area, &p, 10); + if (!msglen || p != &buf->area[to_skip - 1]) + goto parse_error; + + /* message seems too large */ + if (msglen > buf->size) + goto parse_error; + + msglen = co_getblk(sc_oc(sc), buf->area, msglen, to_skip); + if (!msglen) + goto missing_data; + else if (msglen < 0) + goto cli_abort; + + + buf->data = msglen; + to_skip += msglen; + } + else + goto parse_error; + + co_skip(sc_oc(sc), to_skip); + + /* update counters */ + _HA_ATOMIC_INC(&cum_log_messages); + proxy_inc_fe_req_ctr(l, frontend, 0); + + parse_log_message(buf->area, buf->data, &level, &facility, metadata, &message, &size); + + process_send_log(&frontend->loggers, level, facility, metadata, message, size); + + } + +missing_data: + /* we need more data to read */ + applet_need_more_data(appctx); + return; + +missing_budget: + /* it may remain some stuff to do, let's retry later */ + appctx_wakeup(appctx); + return; + +parse_error: + if (l->counters) + 
_HA_ATOMIC_INC(&l->counters->failed_req); + _HA_ATOMIC_INC(&frontend->fe_counters.failed_req); + + goto error; + +cli_abort: + if (l->counters) + _HA_ATOMIC_INC(&l->counters->cli_aborts); + _HA_ATOMIC_INC(&frontend->fe_counters.cli_aborts); + +error: + se_fl_set(appctx->sedesc, SE_FL_ERROR); + +out: + return; +} + +static struct applet syslog_applet = { + .obj_type = OBJ_TYPE_APPLET, + .name = "<SYSLOG>", /* used for logging */ + .fct = syslog_io_handler, + .release = NULL, +}; + +/* + * Parse "log-forward" section and create corresponding sink buffer. + * + * The function returns 0 in success case, otherwise, it returns error + * flags. + */ +int cfg_parse_log_forward(const char *file, int linenum, char **args, int kwm) +{ + int err_code = ERR_NONE; + struct proxy *px; + char *errmsg = NULL; + const char *err = NULL; + + if (strcmp(args[0], "log-forward") == 0) { + if (!*args[1]) { + ha_alert("parsing [%s:%d] : missing name for log-forward section.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + + err = invalid_char(args[1]); + if (err) { + ha_alert("parsing [%s:%d] : character '%c' is not permitted in '%s' name '%s'.\n", + file, linenum, *err, args[0], args[1]); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + px = log_forward_by_name(args[1]); + if (px) { + ha_alert("Parsing [%s:%d]: log-forward section '%s' has the same name as another log-forward section declared at %s:%d.\n", + file, linenum, args[1], px->conf.file, px->conf.line); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + px = proxy_find_by_name(args[1], 0, 0); + if (px) { + ha_alert("Parsing [%s:%d]: log forward section '%s' has the same name as %s '%s' declared at %s:%d.\n", + file, linenum, args[1], proxy_type_str(px), + px->id, px->conf.file, px->conf.line); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + px = calloc(1, sizeof *px); + if (!px) { + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + init_new_proxy(px); + px->next = cfg_log_forward; + cfg_log_forward = px; + px->conf.file = strdup(file); + px->conf.line = linenum; + px->mode = PR_MODE_SYSLOG; + px->last_change = ns_to_sec(now_ns); + px->cap = PR_CAP_FE; + px->maxconn = 10; + px->timeout.client = TICK_ETERNITY; + px->accept = frontend_accept; + px->default_target = &syslog_applet.obj_type; + px->id = strdup(args[1]); + } + else if (strcmp(args[0], "maxconn") == 0) { /* maxconn */ + if (warnifnotcap(cfg_log_forward, PR_CAP_FE, file, linenum, args[0], " Maybe you want 'fullconn' instead ?")) + err_code |= ERR_WARN; + + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + cfg_log_forward->maxconn = atol(args[1]); + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + } + else if (strcmp(args[0], "backlog") == 0) { /* backlog */ + if (warnifnotcap(cfg_log_forward, PR_CAP_FE, file, linenum, args[0], NULL)) + err_code |= ERR_WARN; + + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d] : '%s' expects an integer argument.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + cfg_log_forward->backlog = atol(args[1]); + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + } + else if (strcmp(args[0], "bind") == 0) { + int cur_arg; + struct bind_conf *bind_conf; + struct listener *l; + int ret; + + cur_arg = 1; + + bind_conf = 
bind_conf_alloc(cfg_log_forward, file, linenum, + NULL, xprt_get(XPRT_RAW)); + if (!bind_conf) { + ha_alert("parsing [%s:%d] : out of memory error.", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + bind_conf->maxaccept = global.tune.maxaccept ? global.tune.maxaccept : MAX_ACCEPT; + bind_conf->accept = session_accept_fd; + + if (!str2listener(args[1], cfg_log_forward, bind_conf, file, linenum, &errmsg)) { + if (errmsg && *errmsg) { + indent_msg(&errmsg, 2); + ha_alert("parsing [%s:%d] : '%s %s' : %s\n", file, linenum, args[0], args[1], errmsg); + } + else { + ha_alert("parsing [%s:%d] : '%s %s' : error encountered while parsing listening address %s.\n", + file, linenum, args[0], args[1], args[2]); + } + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + list_for_each_entry(l, &bind_conf->listeners, by_bind) { + global.maxsock++; + } + cur_arg++; + + ret = bind_parse_args_list(bind_conf, args, cur_arg, cursection, file, linenum); + err_code |= ret; + if (ret != 0) { + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "dgram-bind") == 0) { + int cur_arg; + struct bind_conf *bind_conf; + struct bind_kw *kw; + struct listener *l; + + cur_arg = 1; + + bind_conf = bind_conf_alloc(cfg_log_forward, file, linenum, + NULL, xprt_get(XPRT_RAW)); + if (!bind_conf) { + ha_alert("parsing [%s:%d] : out of memory error.", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + bind_conf->maxaccept = global.tune.maxaccept ? global.tune.maxaccept : MAX_ACCEPT; + + if (!str2receiver(args[1], cfg_log_forward, bind_conf, file, linenum, &errmsg)) { + if (errmsg && *errmsg) { + indent_msg(&errmsg, 2); + ha_alert("parsing [%s:%d] : '%s %s' : %s\n", file, linenum, args[0], args[1], errmsg); + } + else { + ha_alert("parsing [%s:%d] : '%s %s' : error encountered while parsing listening address %s.\n", + file, linenum, args[0], args[1], args[2]); + } + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + list_for_each_entry(l, &bind_conf->listeners, by_bind) { + /* the fact that the sockets are of type dgram is guaranteed by str2receiver() */ + l->rx.iocb = syslog_fd_handler; + global.maxsock++; + } + cur_arg++; + + while (*args[cur_arg] && (kw = bind_find_kw(args[cur_arg]))) { + int ret; + + ret = kw->parse(args, cur_arg, cfg_log_forward, bind_conf, &errmsg); + err_code |= ret; + if (ret) { + if (errmsg && *errmsg) { + indent_msg(&errmsg, 2); + ha_alert("parsing [%s:%d] : %s\n", file, linenum, errmsg); + } + else + ha_alert("parsing [%s:%d]: error encountered while processing '%s'\n", + file, linenum, args[cur_arg]); + if (ret & ERR_FATAL) + goto out; + } + cur_arg += 1 + kw->skip; + } + if (*args[cur_arg] != 0) { + const char *best = bind_find_best_kw(args[cur_arg]); + if (best) + ha_alert("parsing [%s:%d] : unknown keyword '%s' in '%s' section; did you mean '%s' maybe ?\n", + file, linenum, args[cur_arg], cursection, best); + else + ha_alert("parsing [%s:%d] : unknown keyword '%s' in '%s' section.\n", + file, linenum, args[cur_arg], cursection); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "log") == 0) { + if (!parse_logger(args, &cfg_log_forward->loggers, (kwm == KWM_NO), file, linenum, &errmsg)) { + ha_alert("parsing [%s:%d] : %s : %s\n", file, linenum, args[0], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[0], "timeout") == 0) { + const char *res; + unsigned timeout; + + if (strcmp(args[1], "client") != 0) { + ha_alert("parsing [%s:%d] : unknown keyword '%s 
%s' in log-forward section.\n", file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (*args[2] == 0) { + ha_alert("parsing [%s:%d] : missing timeout client value.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + res = parse_time_err(args[2], &timeout, TIME_UNIT_MS); + if (res == PARSE_TIME_OVER) { + memprintf(&errmsg, "timer overflow in argument '%s' to 'timeout client' (maximum value is 2147483647 ms or ~24.8 days)", args[2]); + } + else if (res == PARSE_TIME_UNDER) { + memprintf(&errmsg, "timer underflow in argument '%s' to 'timeout client' (minimum non-null value is 1 ms)", args[2]); + } + else if (res) { + memprintf(&errmsg, "unexpected character '%c' in 'timeout client'", *res); + } + + if (res) { + ha_alert("parsing [%s:%d] : %s : %s\n", file, linenum, args[0], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + cfg_log_forward->timeout.client = MS_TO_TICKS(timeout); + } + else { + ha_alert("parsing [%s:%d] : unknown keyword '%s' in log-forward section.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } +out: + ha_free(&errmsg); + return err_code; +} + +/* function: post-resolve a single list of loggers + * + * Returns err_code which defaults to ERR_NONE and can be set to a combination + * of ERR_WARN, ERR_ALERT, ERR_FATAL and ERR_ABORT in case of errors. + */ +int postresolve_logger_list(struct list *loggers, const char *section, const char *section_name) +{ + int err_code = ERR_NONE; + struct logger *logger; + + list_for_each_entry(logger, loggers, list) { + int cur_code; + char *msg = NULL; + + cur_code = resolve_logger(logger, &msg); + if (msg) { + void (*e_func)(const char *fmt, ...) = NULL; + + if (cur_code & ERR_ALERT) + e_func = ha_alert; + else if (cur_code & ERR_WARN) + e_func = ha_warning; + else + e_func = ha_diag_warning; + if (!section) + e_func("global log directive declared in file %s at line '%d' %s.\n", + logger->conf.file, logger->conf.line, msg); + else + e_func("log directive declared in %s section '%s' in file '%s' at line %d %s.\n", + section, section_name, logger->conf.file, logger->conf.line, msg); + ha_free(&msg); + } + err_code |= cur_code; + } + return err_code; +} + +/* resolve default log directives at end of config. Returns 0 on success + * otherwise error flags. 
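+ * It is registered below through REGISTER_POST_CHECK(), so it only runs
+ * once the whole configuration has been parsed.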
+*/ +static int postresolve_loggers() +{ + struct proxy *px; + int err_code = ERR_NONE; + + /* global log directives */ + err_code |= postresolve_logger_list(&global.loggers, NULL, NULL); + /* proxy log directives */ + for (px = proxies_list; px; px = px->next) + err_code |= postresolve_logger_list(&px->loggers, "proxy", px->id); + /* log-forward log directives */ + for (px = cfg_log_forward; px; px = px->next) + err_code |= postresolve_logger_list(&px->loggers, "log-forward", px->id); + + return err_code; +} + + +/* config parsers for this section */ +REGISTER_CONFIG_SECTION("log-forward", cfg_parse_log_forward, NULL); +REGISTER_POST_CHECK(postresolve_loggers); +REGISTER_POST_PROXY_CHECK(postcheck_log_backend); + +REGISTER_PER_THREAD_ALLOC(init_log_buffers); +REGISTER_PER_THREAD_FREE(deinit_log_buffers); + +REGISTER_POST_DEINIT(deinit_log_forward); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/lru.c b/src/lru.c new file mode 100644 index 0000000..07ef50c --- /dev/null +++ b/src/lru.c @@ -0,0 +1,305 @@ +/* + * Copyright (C) 2015 Willy Tarreau <w@1wt.eu> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <import/lru.h> + +/* Minimal list manipulation macros for lru64_list */ +#define LIST_INSERT(lh, el) ({ (el)->n = (lh)->n; (el)->n->p = (lh)->n = (el); (el)->p = (lh); }) +#define LIST_DELETE(el) ({ (el)->n->p = (el)->p; (el)->p->n = (el)->n; }) + + +/* Lookup key <key> in LRU cache <lru> for use with domain <domain> whose data's + * current version is <revision>. It differs from lru64_get as it does not + * create missing keys. The function returns NULL if an error or a cache miss + * occurs. */ +struct lru64 *lru64_lookup(unsigned long long key, struct lru64_head *lru, + void *domain, unsigned long long revision) +{ + struct eb64_node *node; + struct lru64 *elem; + + node = __eb64_lookup(&lru->keys, key); + elem = container_of(node, typeof(*elem), node); + if (elem) { + /* Existing entry found, check validity then move it at the + * head of the LRU list. + */ + if (elem->domain == domain && elem->revision == revision) { + LIST_DELETE(&elem->lru); + LIST_INSERT(&lru->list, &elem->lru); + return elem; + } + } + return NULL; +} + +/* Get key <key> from LRU cache <lru> for use with domain <domain> whose data's + * current revision is <revision>. If the key doesn't exist it's first created + * with ->domain = NULL. 
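+ * A typical usage pattern (see also get_value() in the STANDALONE test at
+ * the end of this file; compute() and free_fn are placeholder names):
+ *
+ *   item = lru64_get(key, lru, domain, revision);
+ *   if (item && item->domain)
+ *           return item->data;       // valid cached entry
+ *   data = compute(key);             // the expensive operation
+ *   if (item)
+ *           lru64_commit(item, data, domain, revision, free_fn);
+ *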
The caller detects this situation by checking ->domain + * and must perform the operation to be cached then call lru64_commit() to + * complete the operation. A lock (mutex or spinlock) may be added around the + * function to permit use in a multi-threaded environment. The function may + * return NULL upon memory allocation failure. + */ +struct lru64 *lru64_get(unsigned long long key, struct lru64_head *lru, + void *domain, unsigned long long revision) +{ + struct eb64_node *node; + struct lru64 *elem; + + if (!lru->spare) { + if (!lru->cache_size) + return NULL; + lru->spare = malloc(sizeof(*lru->spare)); + if (!lru->spare) + return NULL; + lru->spare->domain = NULL; + } + + /* Lookup or insert */ + lru->spare->node.key = key; + node = __eb64_insert(&lru->keys, &lru->spare->node); + elem = container_of(node, typeof(*elem), node); + + if (elem != lru->spare) { + /* Existing entry found, check validity then move it at the + * head of the LRU list. + */ + if (elem->domain == domain && elem->revision == revision) { + LIST_DELETE(&elem->lru); + LIST_INSERT(&lru->list, &elem->lru); + return elem; + } + + if (!elem->domain) + return NULL; // currently locked + + /* recycle this entry */ + LIST_DELETE(&elem->lru); + } + else { + /* New entry inserted, initialize and move to the head of the + * LRU list, and lock it until commit. + */ + lru->cache_usage++; + lru->spare = NULL; // used, need a new one next time + } + + elem->domain = NULL; + LIST_INSERT(&lru->list, &elem->lru); + + if (lru->cache_usage > lru->cache_size) { + /* try to kill oldest entry */ + struct lru64 *old; + + old = container_of(lru->list.p, typeof(*old), lru); + if (old->domain) { + /* not locked */ + LIST_DELETE(&old->lru); + __eb64_delete(&old->node); + if (old->data && old->free) + old->free(old->data); + if (!lru->spare) + lru->spare = old; + else { + free(old); + } + lru->cache_usage--; + } + } + return elem; +} + +/* Commit element <elem> with data <data>, domain <domain> and revision + * <revision>. <elem> is checked for NULL so that it's possible to call it + * with the result from a call to lru64_get(). The caller might lock it using a + * spinlock or mutex shared with the one around lru64_get(). + */ +void lru64_commit(struct lru64 *elem, void *data, void *domain, + unsigned long long revision, void (*free)(void *)) +{ + if (!elem) + return; + + elem->data = data; + elem->revision = revision; + elem->domain = domain; + elem->free = free; +} + +/* Create a new LRU cache of <size> entries. Returns the new cache or NULL in + * case of allocation failure. + */ +struct lru64_head *lru64_new(int size) +{ + struct lru64_head *lru; + + lru = malloc(sizeof(*lru)); + if (lru) { + lru->list.p = lru->list.n = &lru->list; + lru->keys = EB_ROOT_UNIQUE; + lru->spare = NULL; + lru->cache_size = size; + lru->cache_usage = 0; + } + return lru; +} + +/* Tries to destroy the LRU cache <lru>. Returns the number of locked entries + * that prevent it from being destroyed, or zero meaning everything was done. 
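+ * Callers that need to dispose of the cache unconditionally may simply
+ * retry until it succeeds, as the STANDALONE test below does:
+ *
+ *   while (lru64_destroy(lru));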
+ */ +int lru64_destroy(struct lru64_head *lru) +{ + struct lru64 *elem, *next; + + if (!lru) + return 0; + + elem = container_of(lru->list.p, typeof(*elem), lru); + while (&elem->lru != &lru->list) { + next = container_of(elem->lru.p, typeof(*next), lru); + if (elem->domain) { + /* not locked */ + LIST_DELETE(&elem->lru); + eb64_delete(&elem->node); + if (elem->data && elem->free) + elem->free(elem->data); + free(elem); + lru->cache_usage--; + lru->cache_size--; + } + elem = next; + } + + if (lru->cache_usage) + return lru->cache_usage; + + free(lru); + return 0; +} + +/* kill the <nb> least used entries from the <lru> cache */ +void lru64_kill_oldest(struct lru64_head *lru, unsigned long int nb) +{ + struct lru64 *elem, *next; + + for (elem = container_of(lru->list.p, typeof(*elem), lru); + nb && (&elem->lru != &lru->list); + elem = next) { + next = container_of(elem->lru.p, typeof(*next), lru); + if (!elem->domain) + continue; /* locked entry */ + + LIST_DELETE(&elem->lru); + eb64_delete(&elem->node); + if (elem->data && elem->free) + elem->free(elem->data); + if (!lru->spare) + lru->spare = elem; + else + free(elem); + lru->cache_usage--; + nb--; + } +} + +/* The code below is just for validation and performance testing. It's an + * example of a function taking some time to return results that could be + * cached. + */ +#ifdef STANDALONE + +#include <stdio.h> + +static unsigned int misses; + +static unsigned long long sum(unsigned long long x) +{ +#ifndef TEST_LRU_FAST_OPERATION + if (x < 1) + return 0; + return x + sum(x * 99 / 100 - 1); +#else + return (x << 16) - (x << 8) - 1; +#endif +} + +static long get_value(struct lru64_head *lru, long a) +{ + struct lru64 *item = NULL; + + if (lru) { + item = lru64_get(a, lru, lru, 0); + if (item && item->domain) + return (long)item->data; + } + misses++; + /* do the painful work here */ + a = sum(a); + if (item) + lru64_commit(item, (void *)a, lru, 1, 0); + return a; +} + +static inline unsigned int statistical_prng() +{ + static unsigned int statistical_prng_state = 0x12345678; + unsigned int x = statistical_prng_state; + + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + return statistical_prng_state = x; +} + +/* pass #of loops in argv[1] and set argv[2] to something to use the LRU */ +int main(int argc, char **argv) +{ + struct lru64_head *lru = NULL; + long long ret; + int total, loops; + + if (argc < 2) { + printf("Need a number of rounds and optionally an LRU cache size (0..65536)\n"); + exit(1); + } + + total = atoi(argv[1]); + + if (argc > 2) /* cache size */ + lru = lru64_new(atoi(argv[2])); + + ret = 0; + for (loops = 0; loops < total; loops++) { + ret += get_value(lru, statistical_prng() & 65535); + } + /* just for accuracy control */ + printf("ret=%llx, hits=%u, misses=%u (%d %% hits)\n", ret, (unsigned)(total-misses), misses, (int)((float)(total-misses) * 100.0 / total)); + + while (lru64_destroy(lru)); + + return 0; +} + +#endif diff --git a/src/mailers.c b/src/mailers.c new file mode 100644 index 0000000..c09e73c --- /dev/null +++ b/src/mailers.c @@ -0,0 +1,329 @@ +/* + * Mailer management. + * + * Copyright 2015 Horms Solutions Ltd, Simon Horman <horms@verge.net.au> + * Copyright 2020 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ *
+ */
+
+#include <stdlib.h>
+
+#include <haproxy/action-t.h>
+#include <haproxy/api.h>
+#include <haproxy/check.h>
+#include <haproxy/errors.h>
+#include <haproxy/list.h>
+#include <haproxy/mailers.h>
+#include <haproxy/pool.h>
+#include <haproxy/proxy-t.h>
+#include <haproxy/server-t.h>
+#include <haproxy/task.h>
+#include <haproxy/tcpcheck.h>
+#include <haproxy/thread.h>
+#include <haproxy/time.h>
+#include <haproxy/tools.h>
+
+
+struct mailers *mailers = NULL;
+
+/* Set to 1 to disable email sending through checks even if the
+ * mailers are configured to do so. (e.g.: disabled from Lua)
+ */
+int send_email_disabled = 0;
+
+DECLARE_STATIC_POOL(pool_head_email_alert, "email_alert", sizeof(struct email_alert));
+
+/****************************** Email alerts ******************************/
+/* NOTE: It may be pertinent to use an applet to handle email alerts */
+/* instead of a tcp-check ruleset */
+/**************************************************************************/
+void email_alert_free(struct email_alert *alert)
+{
+	struct tcpcheck_rule *rule, *back;
+
+	if (!alert)
+		return;
+
+	if (alert->rules.list) {
+		list_for_each_entry_safe(rule, back, alert->rules.list, list) {
+			LIST_DELETE(&rule->list);
+			free_tcpcheck(rule, 1);
+		}
+		free_tcpcheck_vars(&alert->rules.preset_vars);
+		ha_free(&alert->rules.list);
+	}
+	pool_free(pool_head_email_alert, alert);
+}
+
+static struct task *process_email_alert(struct task *t, void *context, unsigned int state)
+{
+	struct check *check = context;
+	struct email_alertq *q;
+	struct email_alert *alert;
+
+	q = container_of(check, typeof(*q), check);
+
+	HA_SPIN_LOCK(EMAIL_ALERTS_LOCK, &q->lock);
+	while (1) {
+		if (!(check->state & CHK_ST_ENABLED)) {
+			if (LIST_ISEMPTY(&q->email_alerts)) {
+				/* All alerts processed, queue the task */
+				t->expire = TICK_ETERNITY;
+				task_queue(t);
+				goto end;
+			}
+
+			alert = LIST_NEXT(&q->email_alerts, typeof(alert), list);
+			LIST_DELETE(&alert->list);
+			t->expire = now_ms;
+			check->tcpcheck_rules = &alert->rules;
+			check->status = HCHK_STATUS_INI;
+			check->state |= CHK_ST_ENABLED;
+		}
+
+		process_chk(t, context, state);
+		if (check->state & CHK_ST_INPROGRESS)
+			break;
+
+		alert = container_of(check->tcpcheck_rules, typeof(*alert), rules);
+		email_alert_free(alert);
+		check->tcpcheck_rules = NULL;
+		check->server = NULL;
+		check->state &= ~CHK_ST_ENABLED;
+	}
+  end:
+	HA_SPIN_UNLOCK(EMAIL_ALERTS_LOCK, &q->lock);
+	return t;
+}
+
+/* Initializes mailer alerts for the proxy <p> using <mls> parameters.
+ *
+ * The function returns 0 on success, or 1 with <err> filled in case of
+ * failure.
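+ * One <email_alertq>, with its own embedded check and task, is allocated
+ * per mailer in <mls>, so each alert is sent to every configured mailer.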
+ */ +int init_email_alert(struct mailers *mls, struct proxy *p, char **err) +{ + struct mailer *mailer; + struct email_alertq *queues; + const char *err_str; + int i = 0; + + if ((queues = calloc(mls->count, sizeof(*queues))) == NULL) { + memprintf(err, "out of memory while allocating mailer alerts queues"); + goto fail_no_queue; + } + + for (mailer = mls->mailer_list; mailer; i++, mailer = mailer->next) { + struct email_alertq *q = &queues[i]; + struct check *check = &q->check; + struct task *t; + + LIST_INIT(&q->email_alerts); + HA_SPIN_INIT(&q->lock); + check->obj_type = OBJ_TYPE_CHECK; + check->inter = mls->timeout.mail; + check->rise = DEF_AGENT_RISETIME; + check->proxy = p; + check->fall = DEF_AGENT_FALLTIME; + if ((err_str = init_check(check, PR_O2_TCPCHK_CHK))) { + memprintf(err, "%s", err_str); + goto error; + } + + check->xprt = mailer->xprt; + check->addr = mailer->addr; + check->port = get_host_port(&mailer->addr); + + if ((t = task_new_anywhere()) == NULL) { + memprintf(err, "out of memory while allocating mailer alerts task"); + goto error; + } + + check->task = t; + t->process = process_email_alert; + t->context = check; + + /* check this in one ms */ + t->expire = TICK_ETERNITY; + check->start = now_ns; + task_queue(t); + } + + mls->users++; + free(p->email_alert.mailers.name); + p->email_alert.mailers.m = mls; + p->email_alert.queues = queues; + return 0; + + error: + for (i = 0; i < mls->count; i++) { + struct email_alertq *q = &queues[i]; + struct check *check = &q->check; + + free_check(check); + } + free(queues); + fail_no_queue: + return 1; +} + +static int enqueue_one_email_alert(struct proxy *p, struct server *s, + struct email_alertq *q, const char *msg) +{ + struct email_alert *alert; + struct tcpcheck_rule *tcpcheck; + struct check *check = &q->check; + + if ((alert = pool_alloc(pool_head_email_alert)) == NULL) + goto error; + LIST_INIT(&alert->list); + alert->rules.flags = TCPCHK_RULES_TCP_CHK; + alert->rules.list = calloc(1, sizeof(*alert->rules.list)); + if (!alert->rules.list) + goto error; + LIST_INIT(alert->rules.list); + LIST_INIT(&alert->rules.preset_vars); /* unused for email alerts */ + alert->srv = s; + + if ((tcpcheck = pool_zalloc(pool_head_tcpcheck_rule)) == NULL) + goto error; + tcpcheck->action = TCPCHK_ACT_CONNECT; + tcpcheck->comment = NULL; + + LIST_APPEND(alert->rules.list, &tcpcheck->list); + + if (!add_tcpcheck_expect_str(&alert->rules, "220 ")) + goto error; + + { + const char * const strs[4] = { "HELO ", p->email_alert.myhostname, "\r\n" }; + if (!add_tcpcheck_send_strs(&alert->rules, strs)) + goto error; + } + + if (!add_tcpcheck_expect_str(&alert->rules, "250 ")) + goto error; + + { + const char * const strs[4] = { "MAIL FROM:<", p->email_alert.from, ">\r\n" }; + if (!add_tcpcheck_send_strs(&alert->rules, strs)) + goto error; + } + + if (!add_tcpcheck_expect_str(&alert->rules, "250 ")) + goto error; + + { + const char * const strs[4] = { "RCPT TO:<", p->email_alert.to, ">\r\n" }; + if (!add_tcpcheck_send_strs(&alert->rules, strs)) + goto error; + } + + if (!add_tcpcheck_expect_str(&alert->rules, "250 ")) + goto error; + + { + const char * const strs[2] = { "DATA\r\n" }; + if (!add_tcpcheck_send_strs(&alert->rules, strs)) + goto error; + } + + if (!add_tcpcheck_expect_str(&alert->rules, "354 ")) + goto error; + + { + struct tm tm; + char datestr[48]; + const char * const strs[18] = { + "From: ", p->email_alert.from, "\r\n", + "To: ", p->email_alert.to, "\r\n", + "Date: ", datestr, "\r\n", + "Subject: [HAProxy Alert] ", msg, "\r\n", + 
"\r\n", + msg, "\r\n", + "\r\n", + ".\r\n", + NULL + }; + + get_localtime(date.tv_sec, &tm); + + if (strftime(datestr, sizeof(datestr), "%a, %d %b %Y %T %z (%Z)", &tm) == 0) { + goto error; + } + + if (!add_tcpcheck_send_strs(&alert->rules, strs)) + goto error; + } + + if (!add_tcpcheck_expect_str(&alert->rules, "250 ")) + goto error; + + { + const char * const strs[2] = { "QUIT\r\n" }; + if (!add_tcpcheck_send_strs(&alert->rules, strs)) + goto error; + } + + if (!add_tcpcheck_expect_str(&alert->rules, "221 ")) + goto error; + + HA_SPIN_LOCK(EMAIL_ALERTS_LOCK, &q->lock); + task_wakeup(check->task, TASK_WOKEN_MSG); + LIST_APPEND(&q->email_alerts, &alert->list); + HA_SPIN_UNLOCK(EMAIL_ALERTS_LOCK, &q->lock); + return 1; + +error: + email_alert_free(alert); + return 0; +} + +static void enqueue_email_alert(struct proxy *p, struct server *s, const char *msg) +{ + int i; + struct mailer *mailer; + + for (i = 0, mailer = p->email_alert.mailers.m->mailer_list; + i < p->email_alert.mailers.m->count; i++, mailer = mailer->next) { + if (!enqueue_one_email_alert(p, s, &p->email_alert.queues[i], msg)) { + ha_alert("Email alert [%s] could not be enqueued: out of memory\n", p->id); + return; + } + } + + return; +} + +/* + * Send email alert if configured. + */ +void send_email_alert(struct server *s, int level, const char *format, ...) +{ + va_list argp; + char buf[1024]; + int len; + struct proxy *p = s->proxy; + + if (send_email_disabled) + return; + + if (!p->email_alert.mailers.m || level > p->email_alert.level || format == NULL) + return; + + va_start(argp, format); + len = vsnprintf(buf, sizeof(buf), format, argp); + va_end(argp); + + if (len < 0 || len >= sizeof(buf)) { + ha_alert("Email alert [%s] could not format message\n", p->id); + return; + } + + enqueue_email_alert(p, s, buf); +} diff --git a/src/map.c b/src/map.c new file mode 100644 index 0000000..ba7fd81 --- /dev/null +++ b/src/map.c @@ -0,0 +1,1232 @@ +/* + * MAP management functions. + * + * Copyright 2000-2013 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <stdio.h> +#include <syslog.h> + +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/arg.h> +#include <haproxy/cli.h> +#include <haproxy/map.h> +#include <haproxy/pattern.h> +#include <haproxy/regex.h> +#include <haproxy/sample.h> +#include <haproxy/sc_strm.h> +#include <haproxy/stats-t.h> +#include <haproxy/stconn.h> +#include <haproxy/tools.h> + + +/* Parse an IPv4 or IPv6 address and store it into the sample. + * The output type is IPv4 or IPv6. + */ +int map_parse_ip(const char *text, struct sample_data *data) +{ + int len = strlen(text); + + if (buf2ip(text, len, &data->u.ipv4)) { + data->type = SMP_T_IPV4; + return 1; + } + if (buf2ip6(text, len, &data->u.ipv6)) { + data->type = SMP_T_IPV6; + return 1; + } + return 0; +} + +/* Parse a string and store a pointer to it into the sample. The original + * string must be left in memory because we return a direct memory reference. + * The output type is SMP_T_STR. There is no risk that the data will be + * overwritten because sample_conv_map() makes a const sample with this + * output. 
+ */
+int map_parse_str(const char *text, struct sample_data *data)
+{
+	data->u.str.area = (char *)text;
+	data->u.str.data = strlen(text);
+	data->u.str.size = data->u.str.data + 1;
+	data->type = SMP_T_STR;
+	return 1;
+}
+
+/* Parse an integer and convert it to a sample. The output type is SINT if the
+ * number is negative, or UINT if it is positive or null. The function returns
+ * zero (error) if the number is too large.
+ */
+int map_parse_int(const char *text, struct sample_data *data)
+{
+	data->type = SMP_T_SINT;
+	data->u.sint = read_int64(&text, text + strlen(text));
+	if (*text != '\0')
+		return 0;
+	return 1;
+}
+
+/* This creates and initializes a map descriptor.
+ * Returns NULL in case of out of memory error.
+ */
+static struct map_descriptor *map_create_descriptor(struct sample_conv *conv)
+{
+	struct map_descriptor *desc;
+
+	desc = calloc(1, sizeof(*desc));
+	if (!desc)
+		return NULL;
+
+	desc->conv = conv;
+
+	return desc;
+}
+
+/* This function loads the map file according to the data type declared in
+ * the "struct sample_conv".
+ *
+ * It chooses the indexation type (ebtree or list) according to the type of
+ * match needed.
+ */
+int sample_load_map(struct arg *arg, struct sample_conv *conv,
+                    const char *file, int line, char **err)
+{
+	struct map_descriptor *desc;
+
+	if (!(global.mode & MODE_STARTING)) {
+		memprintf(err, "map: cannot load map at runtime");
+		return 0;
+	}
+
+	/* create a new map descriptor */
+	desc = map_create_descriptor(conv);
+	if (!desc) {
+		memprintf(err, "out of memory");
+		return 0;
+	}
+
+	/* Initialize the pattern */
+	pattern_init_head(&desc->pat);
+
+	/* This is the original pattern, it must be freed */
+	desc->do_free = 1;
+
+	/* Set the match method. */
+	desc->pat.match = pat_match_fcts[(long)conv->private];
+	desc->pat.parse = pat_parse_fcts[(long)conv->private];
+	desc->pat.index = pat_index_fcts[(long)conv->private];
+	desc->pat.prune = pat_prune_fcts[(long)conv->private];
+	desc->pat.expect_type = pat_match_types[(long)conv->private];
+
+	/* Set the output parse method. */
+	switch (desc->conv->out_type) {
+	case SMP_T_STR:  desc->pat.parse_smp = map_parse_str; break;
+	case SMP_T_SINT: desc->pat.parse_smp = map_parse_int; break;
+	case SMP_T_ADDR: desc->pat.parse_smp = map_parse_ip; break;
+	default:
+		memprintf(err, "map: internal haproxy error: no default parse case for the input type <%d>.",
+		          conv->out_type);
+		free(desc);
+		return 0;
+	}
+
+	/* Load the map. */
+	if (!pattern_read_from_file(&desc->pat, PAT_REF_MAP, arg[0].data.str.area, PAT_MF_NO_DNS,
+	                            1, err, file, line))
+		return 0;
+
+	/* maps of type IP support a string as default value. This
+	 * string can be an IPv4 or an IPv6 address, so we must convert it.
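+	 * For example, with the converter "map_ip(addrs.map,10.0.0.1)" (the
+	 * file name is hypothetical), the default "10.0.0.1" is pre-parsed
+	 * here into an IPv4 sample.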
+ */ + if (arg[1].type != ARGT_STOP && desc->conv->out_type == SMP_T_ADDR) { + struct sample_data data; + if (!map_parse_ip(arg[1].data.str.area, &data)) { + memprintf(err, "map: cannot parse default ip <%s>.", + arg[1].data.str.area); + return 0; + } + chunk_destroy(&arg[1].data.str); + if (data.type == SMP_T_IPV4) { + arg[1].type = ARGT_IPV4; + arg[1].data.ipv4 = data.u.ipv4; + } else { + arg[1].type = ARGT_IPV6; + arg[1].data.ipv6 = data.u.ipv6; + } + } + + /* replace the first argument by this definition */ + chunk_destroy(&arg[0].data.str); + arg[0].type = ARGT_MAP; + arg[0].data.map = desc; + + return 1; +} + +static int sample_conv_map(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct map_descriptor *desc; + struct pattern *pat; + struct buffer *str; + + /* get config */ + desc = arg_p[0].data.map; + + /* Execute the match function. */ + pat = pattern_exec_match(&desc->pat, smp, 1); + + /* Match case. */ + if (pat) { + if (pat->data) { + /* In the regm case, merge the sample with the input. */ + if ((long)private == PAT_MATCH_REGM) { + struct buffer *tmptrash; + int len; + + /* Copy the content of the sample because it could + be scratched by incoming get_trash_chunk */ + tmptrash = alloc_trash_chunk(); + if (!tmptrash) + return 0; + + tmptrash->data = smp->data.u.str.data; + if (tmptrash->data > (tmptrash->size-1)) + tmptrash->data = tmptrash->size-1; + + memcpy(tmptrash->area, smp->data.u.str.area, tmptrash->data); + tmptrash->area[tmptrash->data] = 0; + + str = get_trash_chunk(); + len = exp_replace(str->area, str->size, + tmptrash->area, + pat->data->u.str.area, + (regmatch_t *)smp->ctx.a[0]); + free_trash_chunk(tmptrash); + + if (len == -1) + return 0; + + str->data = len; + smp->data.u.str = *str; + return 1; + } + /* Copy sample. */ + smp->data = *pat->data; + smp->flags |= SMP_F_CONST; + return 1; + } + + /* Return just int sample containing 1. */ + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 1; + return 1; + } + + /* If no default value available, the converter fails. */ + if (arg_p[1].type == ARGT_STOP) + return 0; + + /* Return the default value. */ + switch (desc->conv->out_type) { + + case SMP_T_STR: + smp->data.type = SMP_T_STR; + smp->flags |= SMP_F_CONST; + smp->data.u.str = arg_p[1].data.str; + break; + + case SMP_T_SINT: + smp->data.type = SMP_T_SINT; + smp->data.u.sint = arg_p[1].data.sint; + break; + + case SMP_T_ADDR: + if (arg_p[1].type == ARGT_IPV4) { + smp->data.type = SMP_T_IPV4; + smp->data.u.ipv4 = arg_p[1].data.ipv4; + } else { + smp->data.type = SMP_T_IPV6; + smp->data.u.ipv6 = arg_p[1].data.ipv6; + } + break; + } + + return 1; +} + +/* This function is used with map and acl management. It permits to browse + * each reference. The variable <getnext> must contain the current node, + * <end> point to the root node and the <flags> permit to filter required + * nodes. + */ +static inline +struct pat_ref *pat_list_get_next(struct pat_ref *getnext, struct list *end, + unsigned int flags) +{ + struct pat_ref *ref = getnext; + + while (1) { + + /* Get next list entry. */ + ref = LIST_NEXT(&ref->list, struct pat_ref *, list); + + /* If the entry is the last of the list, return NULL. */ + if (&ref->list == end) + return NULL; + + /* If the entry match the flag, return it. */ + if (ref->flags & flags) + return ref; + } +} + +static inline +struct pat_ref *pat_ref_lookup_ref(const char *reference) +{ + int id; + char *error; + + /* If the reference starts by a '#', this is numeric id. 
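+	 * (this is how CLI users may address a reference by its unique id
+	 * instead of its file name, e.g. "show map #0" or "show acl #0")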
*/ + if (reference[0] == '#') { + /* Try to convert the numeric id. If the conversion fails, the lookup fails. */ + id = strtol(reference + 1, &error, 10); + if (*error != '\0') + return NULL; + + /* Perform the unique id lookup. */ + return pat_ref_lookupid(id); + } + + /* Perform the string lookup. */ + return pat_ref_lookup(reference); +} + +/* This function is used for map and ACL management. It allows browsing + * each reference. + */ +static inline +struct pattern_expr *pat_expr_get_next(struct pattern_expr *getnext, struct list *end) +{ + struct pattern_expr *expr; + expr = LIST_NEXT(&getnext->list, struct pattern_expr *, list); + if (&expr->list == end) + return NULL; + return expr; +} + +/* appctx context for the "{show|get|add|del|*} {map|acl}" commands. This is + * used even by commands that only have a parser and no I/O handler because + * it provides a unified way to manipulate some fields and will allow expanding + * some of them more easily later if needed. + */ +struct show_map_ctx { + struct pat_ref *ref; + struct bref bref; /* back-reference from the pat_ref_elt being dumped */ + struct pattern_expr *expr; + struct buffer chunk; + unsigned int display_flags; + unsigned int curr_gen; /* current/latest generation, for show/clear */ + unsigned int prev_gen; /* prev generation, for clear */ + enum { + STATE_INIT = 0, /* initialize list and backrefs */ + STATE_LIST, /* list entries */ + STATE_DONE, /* finished */ + } state; /* state of the dump */ +}; + +/* expects the current generation ID in ctx->curr_gen */ +static int cli_io_handler_pat_list(struct appctx *appctx) +{ + struct show_map_ctx *ctx = appctx->svcctx; + struct stconn *sc = appctx_sc(appctx); + struct pat_ref_elt *elt; + + /* FIXME: Don't watch the other side !*/ + if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE)) { + /* If we're forced to shut down, we might have to remove our + * reference to the last ref_elt being dumped. + */ + if (!LIST_ISEMPTY(&ctx->bref.users)) { + HA_RWLOCK_WRLOCK(PATREF_LOCK, &ctx->ref->lock); + LIST_DEL_INIT(&ctx->bref.users); + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ctx->ref->lock); + } + return 1; + } + + switch (ctx->state) { + case STATE_INIT: + ctx->state = STATE_LIST; + __fallthrough; + + case STATE_LIST: + HA_RWLOCK_WRLOCK(PATREF_LOCK, &ctx->ref->lock); + + if (!LIST_ISEMPTY(&ctx->bref.users)) { + LIST_DELETE(&ctx->bref.users); + LIST_INIT(&ctx->bref.users); + } else { + ctx->bref.ref = ctx->ref->head.n; + } + + while (ctx->bref.ref != &ctx->ref->head) { + chunk_reset(&trash); + + elt = LIST_ELEM(ctx->bref.ref, struct pat_ref_elt *, list); + + if (elt->gen_id != ctx->curr_gen) + goto skip; + + /* build messages */ + if (elt->sample) + chunk_appendf(&trash, "%p %s %s\n", + elt, elt->pattern, + elt->sample); + else + chunk_appendf(&trash, "%p %s\n", + elt, elt->pattern); + + if (applet_putchk(appctx, &trash) == -1) { + /* let's try again later from this stream. We add ourselves into + * this stream's users so that it can remove us upon termination. + */ + LIST_APPEND(&elt->back_refs, &ctx->bref.users); + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ctx->ref->lock); + return 0; + } + skip: + /* get next list entry and check the end of the list */ + ctx->bref.ref = elt->list.n; + } + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ctx->ref->lock); + __fallthrough; + + default: + ctx->state = STATE_DONE; + return 1; + } +} + +static int cli_io_handler_pats_list(struct appctx *appctx) +{ + struct show_map_ctx *ctx = appctx->svcctx; + + switch (ctx->state) { + case STATE_INIT: + /* Display the column headers.
If the message cannot be sent, + * return 0; the function will be called again later and will restart + * at the state "STATE_INIT". + */ + chunk_reset(&trash); + chunk_appendf(&trash, "# id (file) description\n"); + if (applet_putchk(appctx, &trash) == -1) + return 0; + + /* Now we start browsing the references lists. + * Note that the following call to LIST_ELEM returns a bad pointer. The only + * available field of this pointer is <list>. It is used by the function + * pat_list_get_next() to return the first available entry. + */ + ctx->ref = LIST_ELEM(&pattern_reference, struct pat_ref *, list); + ctx->ref = pat_list_get_next(ctx->ref, &pattern_reference, + ctx->display_flags); + ctx->state = STATE_LIST; + __fallthrough; + + case STATE_LIST: + while (ctx->ref) { + chunk_reset(&trash); + + /* Build messages. If the reference is used by a category other + * than the listed ones, display that information in the message. + */ + chunk_appendf(&trash, "%d (%s) %s. curr_ver=%u next_ver=%u entry_cnt=%llu\n", ctx->ref->unique_id, + ctx->ref->reference ? ctx->ref->reference : "", + ctx->ref->display, ctx->ref->curr_gen, ctx->ref->next_gen, + ctx->ref->entry_cnt); + + if (applet_putchk(appctx, &trash) == -1) { + /* let's try again later from this stream. We add ourselves into + * this stream's users so that it can remove us upon termination. + */ + return 0; + } + + /* get next list entry and check the end of the list */ + ctx->ref = pat_list_get_next(ctx->ref, &pattern_reference, + ctx->display_flags); + } + + __fallthrough; + + default: + ctx->state = STATE_DONE; + return 1; + } + return 0; +} + +static int cli_io_handler_map_lookup(struct appctx *appctx) +{ + struct show_map_ctx *ctx = appctx->svcctx; + struct sample sample; + struct pattern *pat; + int match_method; + + switch (ctx->state) { + case STATE_INIT: + /* Init to the first entry.
The list cannot be changed. */ + ctx->expr = LIST_ELEM(&ctx->ref->pat, struct pattern_expr *, list); + ctx->expr = pat_expr_get_next(ctx->expr, &ctx->ref->pat); + ctx->state = STATE_LIST; + __fallthrough; + + case STATE_LIST: + HA_RWLOCK_RDLOCK(PATREF_LOCK, &ctx->ref->lock); + /* for each lookup type */ + while (ctx->expr) { + /* initialise chunk to build new message */ + chunk_reset(&trash); + + /* execute pattern matching */ + sample.data.type = SMP_T_STR; + sample.flags = SMP_F_CONST; + sample.data.u.str.data = ctx->chunk.data; + sample.data.u.str.area = ctx->chunk.area; + + if (ctx->expr->pat_head->match && + sample_convert(&sample, ctx->expr->pat_head->expect_type)) + pat = ctx->expr->pat_head->match(&sample, ctx->expr, 1); + else + pat = NULL; + + /* build return message: set type of match */ + for (match_method=0; match_method<PAT_MATCH_NUM; match_method++) + if (ctx->expr->pat_head->match == pat_match_fcts[match_method]) + break; + if (match_method >= PAT_MATCH_NUM) + chunk_appendf(&trash, "type=unknown(%p)", ctx->expr->pat_head->match); + else + chunk_appendf(&trash, "type=%s", pat_match_names[match_method]); + + /* case sensitive */ + if (ctx->expr->mflags & PAT_MF_IGNORE_CASE) + chunk_appendf(&trash, ", case=insensitive"); + else + chunk_appendf(&trash, ", case=sensitive"); + + /* Display no match, and set default value */ + if (!pat) { + if (ctx->display_flags == PAT_REF_MAP) + chunk_appendf(&trash, ", found=no"); + else + chunk_appendf(&trash, ", match=no"); + } + + /* Display match and match info */ + else { + /* display match */ + if (ctx->display_flags == PAT_REF_MAP) + chunk_appendf(&trash, ", found=yes"); + else + chunk_appendf(&trash, ", match=yes"); + + /* display index mode */ + if (pat->sflags & PAT_SF_TREE) + chunk_appendf(&trash, ", idx=tree"); + else + chunk_appendf(&trash, ", idx=list"); + + /* display pattern */ + if (ctx->display_flags == PAT_REF_MAP) { + if (pat->ref) + chunk_appendf(&trash, ", key=\"%s\"", pat->ref->pattern); + else + chunk_appendf(&trash, ", key=unknown"); + } + else { + if (pat->ref) + chunk_appendf(&trash, ", pattern=\"%s\"", pat->ref->pattern); + else + chunk_appendf(&trash, ", pattern=unknown"); + } + + /* display return value */ + if (ctx->display_flags == PAT_REF_MAP) { + if (pat->data && pat->ref && pat->ref->sample) + chunk_appendf(&trash, ", value=\"%s\", type=\"%s\"", pat->ref->sample, + smp_to_type[pat->data->type]); + else + chunk_appendf(&trash, ", value=none"); + } + } + + chunk_appendf(&trash, "\n"); + + /* display response */ + if (applet_putchk(appctx, &trash) == -1) { + /* let's try again later from this stream. We add ourselves into + * this stream's users so that it can remove us upon termination. + */ + HA_RWLOCK_RDUNLOCK(PATREF_LOCK, &ctx->ref->lock); + return 0; + } + + /* get next entry */ + ctx->expr = pat_expr_get_next(ctx->expr, + &ctx->ref->pat); + } + HA_RWLOCK_RDUNLOCK(PATREF_LOCK, &ctx->ref->lock); + __fallthrough; + + default: + ctx->state = STATE_DONE; + return 1; + } +} + +static void cli_release_mlook(struct appctx *appctx) +{ + struct show_map_ctx *ctx = appctx->svcctx; + + ha_free(&ctx->chunk.area); +} + + +static int cli_parse_get_map(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_map_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + if (strcmp(args[1], "map") == 0 || strcmp(args[1], "acl") == 0) { + /* Set flags. */ + if (args[1][0] == 'm') + ctx->display_flags = PAT_REF_MAP; + else + ctx->display_flags = PAT_REF_ACL; + + /* No parameter.
*/ + if (!*args[2] || !*args[3]) { + if (ctx->display_flags == PAT_REF_MAP) + return cli_err(appctx, "Missing map identifier and/or key.\n"); + else + return cli_err(appctx, "Missing ACL identifier and/or key.\n"); + } + + /* lookup into the maps */ + ctx->ref = pat_ref_lookup_ref(args[2]); + if (!ctx->ref) { + if (ctx->display_flags == PAT_REF_MAP) + return cli_err(appctx, "Unknown map identifier. Please use #<id> or <file>.\n"); + else + return cli_err(appctx, "Unknown ACL identifier. Please use #<id> or <file>.\n"); + } + + /* copy input string. The string must be allocated because + * it may be used over multiple iterations. It's released + * at the end and upon abort anyway. + */ + ctx->chunk.data = strlen(args[3]); + ctx->chunk.size = ctx->chunk.data + 1; + ctx->chunk.area = strdup(args[3]); + if (!ctx->chunk.area) + return cli_err(appctx, "Out of memory error.\n"); + + return 0; + } + return 1; +} + +static int cli_parse_prepare_map(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_map_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + if (strcmp(args[1], "map") == 0 || + strcmp(args[1], "acl") == 0) { + uint next_gen; + char *msg = NULL; + + /* Set ACL or MAP flags. */ + if (args[1][0] == 'm') + ctx->display_flags = PAT_REF_MAP; + else + ctx->display_flags = PAT_REF_ACL; + + /* lookup into the refs and check the map flag */ + ctx->ref = pat_ref_lookup_ref(args[2]); + if (!ctx->ref || + !(ctx->ref->flags & ctx->display_flags)) { + if (ctx->display_flags == PAT_REF_MAP) + return cli_err(appctx, "Unknown map identifier. Please use #<id> or <file>.\n"); + else + return cli_err(appctx, "Unknown ACL identifier. Please use #<id> or <file>.\n"); + } + next_gen = pat_ref_newgen(ctx->ref); + return cli_dynmsg(appctx, LOG_INFO, memprintf(&msg, "New version created: %u\n", next_gen)); + } + + return 0; +} + +static void cli_release_show_map(struct appctx *appctx) +{ + struct show_map_ctx *ctx = appctx->svcctx; + + if (!LIST_ISEMPTY(&ctx->bref.users)) { + HA_RWLOCK_WRLOCK(PATREF_LOCK, &ctx->ref->lock); + LIST_DEL_INIT(&ctx->bref.users); + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ctx->ref->lock); + } +} + +static int cli_parse_show_map(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_map_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + if (strcmp(args[1], "map") == 0 || + strcmp(args[1], "acl") == 0) { + const char *gen = NULL; + + /* Set ACL or MAP flags. */ + if (args[1][0] == 'm') + ctx->display_flags = PAT_REF_MAP; + else + ctx->display_flags = PAT_REF_ACL; + + /* no parameter: display all available maps */ + if (!*args[2]) { + appctx->io_handler = cli_io_handler_pats_list; + return 0; + } + + /* For both "map" and "acl" we may have an optional generation + * number specified using a "@" character before the pattern + * file name. + */ + if (*args[2] == '@') { + gen = args[2] + 1; + args++; + } + + /* lookup into the refs and check the map flag */ + ctx->ref = pat_ref_lookup_ref(args[2]); + if (!ctx->ref || + !(ctx->ref->flags & ctx->display_flags)) { + if (ctx->display_flags == PAT_REF_MAP) + return cli_err(appctx, "Unknown map identifier. Please use #<id> or <file>.\n"); + else + return cli_err(appctx, "Unknown ACL identifier.
Please use #<id> or <file>.\n"); + } + + /* set the desired generation id in curr_gen */ + if (gen) + ctx->curr_gen = str2uic(gen); + else + ctx->curr_gen = ctx->ref->curr_gen; + + LIST_INIT(&ctx->bref.users); + appctx->io_handler = cli_io_handler_pat_list; + appctx->io_release = cli_release_show_map; + return 0; + } + + return 0; +} + +static int cli_parse_set_map(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_map_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + if (strcmp(args[1], "map") == 0) { + char *err; + + /* Set flags. */ + ctx->display_flags = PAT_REF_MAP; + + /* Expect three parameters: map name, key and new value. */ + if (!*args[2] || !*args[3] || !*args[4]) + return cli_err(appctx, "'set map' expects three parameters: map identifier, key and value.\n"); + + /* Lookup the reference in the maps. */ + ctx->ref = pat_ref_lookup_ref(args[2]); + if (!ctx->ref) + return cli_err(appctx, "Unknown map identifier. Please use #<id> or <file>.\n"); + + /* If the entry identifier starts with a '#', it is considered as a + * pointer id. + */ + if (args[3][0] == '#' && args[3][1] == '0' && args[3][2] == 'x') { + struct pat_ref_elt *ref; + long long int conv; + char *error; + + /* Convert argument to integer value. */ + conv = strtoll(&args[3][1], &error, 16); + if (*error != '\0') + return cli_err(appctx, "Malformed identifier. Please use #<id> or <file>.\n"); + + /* Convert and check integer to pointer. */ + ref = (struct pat_ref_elt *)(long)conv; + if ((long long int)(long)ref != conv) + return cli_err(appctx, "Malformed identifier. Please use #<id> or <file>.\n"); + + /* Try to modify the entry. */ + err = NULL; + HA_RWLOCK_WRLOCK(PATREF_LOCK, &ctx->ref->lock); + if (!pat_ref_set_by_id(ctx->ref, ref, args[4], &err)) { + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ctx->ref->lock); + if (err) + return cli_dynerr(appctx, memprintf(&err, "%s.\n", err)); + else + return cli_err(appctx, "Failed to update an entry.\n"); + } + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ctx->ref->lock); + } + else { + /* Else, use the entry identifier as pattern + * string, and update the value. + */ + err = NULL; + HA_RWLOCK_WRLOCK(PATREF_LOCK, &ctx->ref->lock); + if (!pat_ref_set(ctx->ref, args[3], args[4], &err, NULL)) { + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ctx->ref->lock); + if (err) + return cli_dynerr(appctx, memprintf(&err, "%s.\n", err)); + else + return cli_err(appctx, "Failed to update an entry.\n"); + } + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ctx->ref->lock); + } + + /* The set is done, send message. */ + appctx->st0 = CLI_ST_PROMPT; + return 0; + } + return 1; +} + +static int cli_parse_add_map(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_map_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + if (strcmp(args[1], "map") == 0 || + strcmp(args[1], "acl") == 0) { + const char *gen = NULL; + uint genid = 0; + int ret; + char *err; + + /* Set flags. */ + if (args[1][0] == 'm') + ctx->display_flags = PAT_REF_MAP; + else + ctx->display_flags = PAT_REF_ACL; + + /* For both "map" and "acl" we may have an optional generation + * number specified using a "@" character before the pattern + * file name.
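+ * For example (illustrative): "add map @2 #0 mykey myvalue" inserts the + * entry into pending version 2, and with a payload ("add map @2 #0 <<" + * followed by one "key value" pair per line) several entries can be + * added at once.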
+ */ + if (*args[2] == '@') { + gen = args[2] + 1; + args++; + } + + /* If the keyword is "map", we expect: + * - three parameters if there is no payload + * - one parameter if there is a payload + * If it is "acl", we expect only two parameters + */ + if (ctx->display_flags == PAT_REF_MAP) { + if ((!payload && (!*args[2] || !*args[3] || !*args[4])) || + (payload && !*args[2])) + return cli_err(appctx, + "'add map' expects three parameters (map identifier, key and value)" + " or one parameter (map identifier) and a payload\n"); + } + else if (!*args[2] || !*args[3]) + return cli_err(appctx, "'add acl' expects two parameters: ACL identifier and pattern.\n"); + + /* Look up the reference. */ + ctx->ref = pat_ref_lookup_ref(args[2]); + if (!ctx->ref) { + if (ctx->display_flags == PAT_REF_MAP) + return cli_err(appctx, "Unknown map identifier. Please use #<id> or <file>.\n"); + else + return cli_err(appctx, "Unknown ACL identifier. Please use #<id> or <file>.\n"); + } + + if (gen) { + genid = str2uic(gen); + if ((int)(genid - ctx->ref->next_gen) > 0) { + if (ctx->display_flags == PAT_REF_MAP) + return cli_err(appctx, "Version number in the future, please use 'prepare map' before.\n"); + else + return cli_err(appctx, "Version number in the future, please use 'prepare acl' before.\n"); + } + } + + /* The command "add acl" is prohibited if the reference + * uses samples. + */ + if ((ctx->display_flags & PAT_REF_ACL) && + (ctx->ref->flags & PAT_REF_SMP)) { + return cli_err(appctx, + "This ACL is shared with a map containing samples. " + "You must use the command 'add map' to add values.\n"); + } + + /* Add value(s). If no payload is used, key and value are read + * from the command line and only one key is set. If a payload + * is passed, one key/value pair is read per line until the end + * of the payload is reached. + */ + err = NULL; + + do { + char *key = args[3]; + char *value = args[4]; + size_t l; + + if (payload) { + /* key and value passed as payload, one pair per line */ + if (!*payload) + break; + + key = payload; + l = strcspn(key, " \t"); + payload += l; + + if (!*payload && ctx->display_flags == PAT_REF_MAP) + return cli_dynerr(appctx, memprintf(&err, "Missing value for key '%s'.\n", key)); + + key[l] = 0; + payload++; + + /* value */ + payload += strspn(payload, " \t"); + value = payload; + l = strcspn(value, "\n"); + payload += l; + if (*payload) + payload++; + value[l] = 0; + } + + if (ctx->display_flags != PAT_REF_MAP) + value = NULL; + + HA_RWLOCK_WRLOCK(PATREF_LOCK, &ctx->ref->lock); + ret = !!pat_ref_load(ctx->ref, gen ? genid : ctx->ref->curr_gen, key, value, -1, &err); + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ctx->ref->lock); + + if (!ret) { + if (err) + return cli_dynerr(appctx, memprintf(&err, "%s.\n", err)); + else + return cli_err(appctx, "Failed to add a key.\n"); + } + } while (payload && *payload); + + /* The add is done, send message. */ + appctx->st0 = CLI_ST_PROMPT; + return 1; + } + + return 0; +} + +static int cli_parse_del_map(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_map_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + if (args[1][0] == 'm') + ctx->display_flags = PAT_REF_MAP; + else + ctx->display_flags = PAT_REF_ACL; + + /* Expect two parameters: map name and key.
*/ + if (!*args[2] || !*args[3]) { + if (ctx->display_flags == PAT_REF_MAP) + return cli_err(appctx, "This command expects two parameters: map identifier and key.\n"); + else + return cli_err(appctx, "This command expects two parameters: ACL identifier and key.\n"); + } + + /* Lookup the reference in the maps. */ + ctx->ref = pat_ref_lookup_ref(args[2]); + if (!ctx->ref || + !(ctx->ref->flags & ctx->display_flags)) + return cli_err(appctx, "Unknown map identifier. Please use #<id> or <file>.\n"); + + /* If the entry identifier starts with a '#', it is considered as a + * pointer id. + */ + if (args[3][0] == '#' && args[3][1] == '0' && args[3][2] == 'x') { + struct pat_ref_elt *ref; + long long int conv; + char *error; + + /* Convert argument to integer value. */ + conv = strtoll(&args[3][1], &error, 16); + if (*error != '\0') + return cli_err(appctx, "Malformed identifier. Please use #<id> or <file>.\n"); + + /* Convert and check integer to pointer. */ + ref = (struct pat_ref_elt *)(long)conv; + if ((long long int)(long)ref != conv) + return cli_err(appctx, "Malformed identifier. Please use #<id> or <file>.\n"); + + /* Try to delete the entry. */ + HA_RWLOCK_WRLOCK(PATREF_LOCK, &ctx->ref->lock); + if (!pat_ref_delete_by_id(ctx->ref, ref)) { + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ctx->ref->lock); + /* The entry is not found, send message. */ + return cli_err(appctx, "Key not found.\n"); + } + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ctx->ref->lock); + } + else { + /* Else, use the entry identifier as pattern + * string and try to delete the entry. + */ + HA_RWLOCK_WRLOCK(PATREF_LOCK, &ctx->ref->lock); + if (!pat_ref_delete(ctx->ref, args[3])) { + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ctx->ref->lock); + /* The entry is not found, send message. */ + return cli_err(appctx, "Key not found.\n"); + } + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ctx->ref->lock); + } + + /* The deletion is done, send message. */ + appctx->st0 = CLI_ST_PROMPT; + return 1; +} + +/* continue to clear a map which was started in the parser. The range of + * generations this applies to is taken from ctx->curr_gen for the oldest + * and ctx->prev_gen for the latest. + */ +static int cli_io_handler_clear_map(struct appctx *appctx) +{ + struct show_map_ctx *ctx = appctx->svcctx; + int finished; + + HA_RWLOCK_WRLOCK(PATREF_LOCK, &ctx->ref->lock); + finished = pat_ref_purge_range(ctx->ref, ctx->curr_gen, ctx->prev_gen, 100); + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ctx->ref->lock); + + if (!finished) { + /* let's come back later */ + applet_have_more_data(appctx); + return 0; + } + + trim_all_pools(); + return 1; +} + +/* note: sets ctx->curr_gen and ctx->prev_gen to the oldest and + * latest generations to clear, respectively, and will call the clear_map + * handler. + */ +static int cli_parse_clear_map(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_map_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + if (strcmp(args[1], "map") == 0 || strcmp(args[1], "acl") == 0) { + const char *gen = NULL; + + /* Set ACL or MAP flags. */ + if (args[1][0] == 'm') + ctx->display_flags = PAT_REF_MAP; + else + ctx->display_flags = PAT_REF_ACL; + + /* For both "map" and "acl" we may have an optional generation + * number specified using a "@" character before the pattern + * file name.
+ */ + if (*args[2] == '@') { + gen = args[2] + 1; + args++; + } + + /* no parameter */ + if (!*args[2]) { + if (ctx->display_flags == PAT_REF_MAP) + return cli_err(appctx, "Missing map identifier.\n"); + else + return cli_err(appctx, "Missing ACL identifier.\n"); + } + + /* lookup into the refs and check the map flag */ + ctx->ref = pat_ref_lookup_ref(args[2]); + if (!ctx->ref || + !(ctx->ref->flags & ctx->display_flags)) { + if (ctx->display_flags == PAT_REF_MAP) + return cli_err(appctx, "Unknown map identifier. Please use #<id> or <file>.\n"); + else + return cli_err(appctx, "Unknown ACL identifier. Please use #<id> or <file>.\n"); + } + + /* set the desired generation id in curr_gen/prev_gen */ + if (gen) + ctx->prev_gen = ctx->curr_gen = str2uic(gen); + else + ctx->prev_gen = ctx->curr_gen = ctx->ref->curr_gen; + + /* delegate the clearing to the I/O handler which can yield */ + return 0; + } + return 1; +} + +/* note: sets ctx->curr_gen and ctx->prev_gen to the oldest and + * latest generations to clear, respectively, and will call the clear_map + * handler. + */ +static int cli_parse_commit_map(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_map_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + if (strcmp(args[1], "map") == 0 || strcmp(args[1], "acl") == 0) { + const char *gen = NULL; + uint genid; + uint ret; + + /* Set ACL or MAP flags. */ + if (args[1][0] == 'm') + ctx->display_flags = PAT_REF_MAP; + else + ctx->display_flags = PAT_REF_ACL; + + if (*args[2] != '@') + return cli_err(appctx, "Missing version number.\n"); + + /* The generation number is mandatory for a commit. The range + * of generations that get trashed by a commit starts from the + * opposite of the current one and ends at the previous one. + */ + gen = args[2] + 1; + genid = str2uic(gen); + ctx->prev_gen = genid - 1; + ctx->curr_gen = ctx->prev_gen - ((~0U) >> 1); + + /* no parameter */ + if (!*args[3]) { + if (ctx->display_flags == PAT_REF_MAP) + return cli_err(appctx, "Missing map identifier.\n"); + else + return cli_err(appctx, "Missing ACL identifier.\n"); + } + + /* lookup into the refs and check the map flag */ + ctx->ref = pat_ref_lookup_ref(args[3]); + if (!ctx->ref || + !(ctx->ref->flags & ctx->display_flags)) { + if (ctx->display_flags == PAT_REF_MAP) + return cli_err(appctx, "Unknown map identifier. Please use #<id> or <file>.\n"); + else + return cli_err(appctx, "Unknown ACL identifier. 
Please use #<id> or <file>.\n"); + } + + HA_RWLOCK_WRLOCK(PATREF_LOCK, &ctx->ref->lock); + if (genid - (ctx->ref->curr_gen + 1) < + ctx->ref->next_gen - ctx->ref->curr_gen) + ret = pat_ref_commit(ctx->ref, genid); + else + ret = 1; + HA_RWLOCK_WRUNLOCK(PATREF_LOCK, &ctx->ref->lock); + + if (ret != 0) + return cli_err(appctx, "Version number out of range.\n"); + + /* delegate the clearing to the I/O handler which can yield */ + return 0; + } + return 1; +} + +/* register cli keywords */ + +static struct cli_kw_list cli_kws = {{ },{ + { { "add", "acl", NULL }, "add acl [@<ver>] <acl> <pattern> : add an acl entry", cli_parse_add_map, NULL }, + { { "clear", "acl", NULL }, "clear acl [@<ver>] <acl> : clear the contents of this acl", cli_parse_clear_map, cli_io_handler_clear_map, NULL }, + { { "commit","acl", NULL }, "commit acl @<ver> <acl> : commit the ACL at this version", cli_parse_commit_map, cli_io_handler_clear_map, NULL }, + { { "del", "acl", NULL }, "del acl <acl> [<key>|#<ref>] : delete acl entries matching <key>", cli_parse_del_map, NULL }, + { { "get", "acl", NULL }, "get acl <acl> <value> : report the patterns matching a sample for an ACL", cli_parse_get_map, cli_io_handler_map_lookup, cli_release_mlook }, + { { "prepare","acl",NULL }, "prepare acl <acl> : prepare a new version for atomic ACL replacement", cli_parse_prepare_map, NULL }, + { { "show", "acl", NULL }, "show acl [@<ver>] [<acl>] : report available acls or dump an acl's contents", cli_parse_show_map, NULL }, + { { "add", "map", NULL }, "add map [@<ver>] <map> <key> <val> : add a map entry (payload supported instead of key/val)", cli_parse_add_map, NULL }, + { { "clear", "map", NULL }, "clear map [@<ver>] <map> : clear the contents of this map", cli_parse_clear_map, cli_io_handler_clear_map, NULL }, + { { "commit","map", NULL }, "commit map @<ver> <map> : commit the map at this version", cli_parse_commit_map, cli_io_handler_clear_map, NULL }, + { { "del", "map", NULL }, "del map <map> [<key>|#<ref>] : delete map entries matching <key>", cli_parse_del_map, NULL }, + { { "get", "map", NULL }, "get map <map> <value> : report the keys and values matching a sample for a map", cli_parse_get_map, cli_io_handler_map_lookup, cli_release_mlook }, + { { "prepare","map",NULL }, "prepare map <map> : prepare a new version for atomic map replacement", cli_parse_prepare_map, NULL }, + { { "set", "map", NULL }, "set map <map> [<key>|#<ref>] <value> : modify a map entry", cli_parse_set_map, NULL }, + { { "show", "map", NULL }, "show map [@<ver>] [<map>] : report available maps or dump a map's contents", cli_parse_show_map, NULL }, + { { NULL }, NULL, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + +/* Note: must not be declared <const> as its list will be overwritten + * + * For the map_*_int keywords, the output is declared as SMP_T_SINT, and the converter + * function provides a signed integer parsed from the patterns found in the file. + * + * For the map_*_ip keywords, the output is declared as SMP_T_ADDR, but the converter function + * can provide SMP_T_IPV4 or SMP_T_IPV6 depending on the patterns found in the file. + * + * The map_* keywords only emit strings. + * + * The output type is only used during the configuration parsing. It is used for detecting + * compatibility problems.
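+ * + * For example (illustrative configuration, not part of this file): + * + *   use_backend %[req.hdr(host),lower,map(/etc/haproxy/hosts.map,be_default)] + * + * looks the lowercased Host header up in hosts.map and falls back to + * "be_default" when no entry matches.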
+ * + * The arguments are: <file>[,<default value>] + */ +static struct sample_conv_kw_list sample_conv_kws = {ILH, { + { "map", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_STR, SMP_T_STR, (void *)PAT_MATCH_STR }, + { "map_str", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_STR, SMP_T_STR, (void *)PAT_MATCH_STR }, + { "map_beg", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_STR, SMP_T_STR, (void *)PAT_MATCH_BEG }, + { "map_sub", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_STR, SMP_T_STR, (void *)PAT_MATCH_SUB }, + { "map_dir", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_STR, SMP_T_STR, (void *)PAT_MATCH_DIR }, + { "map_dom", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_STR, SMP_T_STR, (void *)PAT_MATCH_DOM }, + { "map_end", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_STR, SMP_T_STR, (void *)PAT_MATCH_END }, + { "map_reg", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_STR, SMP_T_STR, (void *)PAT_MATCH_REG }, + { "map_regm", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_STR, SMP_T_STR, (void *)PAT_MATCH_REGM}, + { "map_int", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_SINT, SMP_T_STR, (void *)PAT_MATCH_INT }, + { "map_ip", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_ADDR, SMP_T_STR, (void *)PAT_MATCH_IP }, + + { "map_str_int", sample_conv_map, ARG2(1,STR,SINT), sample_load_map, SMP_T_STR, SMP_T_SINT, (void *)PAT_MATCH_STR }, + { "map_beg_int", sample_conv_map, ARG2(1,STR,SINT), sample_load_map, SMP_T_STR, SMP_T_SINT, (void *)PAT_MATCH_BEG }, + { "map_sub_int", sample_conv_map, ARG2(1,STR,SINT), sample_load_map, SMP_T_STR, SMP_T_SINT, (void *)PAT_MATCH_SUB }, + { "map_dir_int", sample_conv_map, ARG2(1,STR,SINT), sample_load_map, SMP_T_STR, SMP_T_SINT, (void *)PAT_MATCH_DIR }, + { "map_dom_int", sample_conv_map, ARG2(1,STR,SINT), sample_load_map, SMP_T_STR, SMP_T_SINT, (void *)PAT_MATCH_DOM }, + { "map_end_int", sample_conv_map, ARG2(1,STR,SINT), sample_load_map, SMP_T_STR, SMP_T_SINT, (void *)PAT_MATCH_END }, + { "map_reg_int", sample_conv_map, ARG2(1,STR,SINT), sample_load_map, SMP_T_STR, SMP_T_SINT, (void *)PAT_MATCH_REG }, + { "map_int_int", sample_conv_map, ARG2(1,STR,SINT), sample_load_map, SMP_T_SINT, SMP_T_SINT, (void *)PAT_MATCH_INT }, + { "map_ip_int", sample_conv_map, ARG2(1,STR,SINT), sample_load_map, SMP_T_ADDR, SMP_T_SINT, (void *)PAT_MATCH_IP }, + + { "map_str_ip", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_STR, SMP_T_ADDR, (void *)PAT_MATCH_STR }, + { "map_beg_ip", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_STR, SMP_T_ADDR, (void *)PAT_MATCH_BEG }, + { "map_sub_ip", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_STR, SMP_T_ADDR, (void *)PAT_MATCH_SUB }, + { "map_dir_ip", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_STR, SMP_T_ADDR, (void *)PAT_MATCH_DIR }, + { "map_dom_ip", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_STR, SMP_T_ADDR, (void *)PAT_MATCH_DOM }, + { "map_end_ip", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_STR, SMP_T_ADDR, (void *)PAT_MATCH_END }, + { "map_reg_ip", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_STR, SMP_T_ADDR, (void *)PAT_MATCH_REG }, + { "map_int_ip", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_SINT, SMP_T_ADDR, (void *)PAT_MATCH_INT }, + { "map_ip_ip", sample_conv_map, ARG2(1,STR,STR), sample_load_map, SMP_T_ADDR, SMP_T_ADDR, (void *)PAT_MATCH_IP }, + + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, 
sample_register_convs, &sample_conv_kws); diff --git a/src/mjson.c b/src/mjson.c new file mode 100644 index 0000000..73b7a57 --- /dev/null +++ b/src/mjson.c @@ -0,0 +1,1048 @@ +// Copyright (c) 2018-2020 Cesanta Software Limited +// All rights reserved +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include <float.h> +#include <math.h> + +#include <import/mjson.h> + +#if defined(_MSC_VER) +#define alloca(x) _alloca(x) +#endif + +#if defined(_MSC_VER) && _MSC_VER < 1700 +#define va_copy(x, y) (x) = (y) +#define isinf(x) !_finite(x) +#define isnan(x) _isnan(x) +#endif + +static double mystrtod(const char *str, char **end); + +static int mjson_esc(int c, int esc) { + const char *p, *esc1 = "\b\f\n\r\t\\\"", *esc2 = "bfnrt\\\""; + for (p = esc ? esc1 : esc2; *p != '\0'; p++) { + if (*p == c) return esc ? esc2[p - esc1] : esc1[p - esc2]; + } + return 0; +} + +static int mjson_escape(int c) { + return mjson_esc(c, 1); +} + +static int mjson_pass_string(const char *s, int len) { + int i; + for (i = 0; i < len; i++) { + if (s[i] == '\\' && i + 1 < len && mjson_escape(s[i + 1])) { + i++; + } else if (s[i] == '\0') { + return MJSON_ERROR_INVALID_INPUT; + } else if (s[i] == '"') { + return i; + } + } + return MJSON_ERROR_INVALID_INPUT; +} + +int mjson(const char *s, int len, mjson_cb_t cb, void *ud) { + enum { S_VALUE, S_KEY, S_COLON, S_COMMA_OR_EOO } expecting = S_VALUE; + unsigned char nesting[MJSON_MAX_DEPTH]; + int i, depth = 0; +#define MJSONCALL(ev) \ + if (cb != NULL && cb(ev, s, start, i - start + 1, ud)) return i + 1; + +// In the ascii table, the distance between `[` and `]` is 2. +// Ditto for `{` and `}`. Hence +2 in the code below. 
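+// (Illustrative: '[' is 0x5B and ']' is 0x5D, while '{' is 0x7B and '}' is +// 0x7D, so `nesting[depth - 1] + 2` is exactly the closing character that +// matches the stored opener.)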
+#define MJSONEOO() \ + do { \ + if (c != nesting[depth - 1] + 2) return MJSON_ERROR_INVALID_INPUT; \ + depth--; \ + if (depth == 0) { \ + MJSONCALL(tok); \ + return i + 1; \ + } \ + } while (0) + + for (i = 0; i < len; i++) { + int start = i; + unsigned char c = ((unsigned char *) s)[i]; + int tok = c; + if (c == ' ' || c == '\t' || c == '\n' || c == '\r') continue; + // printf("- %c [%.*s] %d %d\n", c, i, s, depth, expecting); + switch (expecting) { + case S_VALUE: + if (c == '{') { + if (depth >= (int) sizeof(nesting)) return MJSON_ERROR_TOO_DEEP; + nesting[depth++] = c; + expecting = S_KEY; + break; + } else if (c == '[') { + if (depth >= (int) sizeof(nesting)) return MJSON_ERROR_TOO_DEEP; + nesting[depth++] = c; + break; + } else if (c == ']' && depth > 0) { // Empty array + MJSONEOO(); + } else if (c == 't' && i + 3 < len && memcmp(&s[i], "true", 4) == 0) { + i += 3; + tok = MJSON_TOK_TRUE; + } else if (c == 'n' && i + 3 < len && memcmp(&s[i], "null", 4) == 0) { + i += 3; + tok = MJSON_TOK_NULL; + } else if (c == 'f' && i + 4 < len && memcmp(&s[i], "false", 5) == 0) { + i += 4; + tok = MJSON_TOK_FALSE; + } else if (c == '-' || ((c >= '0' && c <= '9'))) { + char *end = NULL; + mystrtod(&s[i], &end); + if (end != NULL) i += (int) (end - &s[i] - 1); + tok = MJSON_TOK_NUMBER; + } else if (c == '"') { + int n = mjson_pass_string(&s[i + 1], len - i - 1); + if (n < 0) return n; + i += n + 1; + tok = MJSON_TOK_STRING; + } else { + return MJSON_ERROR_INVALID_INPUT; + } + if (depth == 0) { + MJSONCALL(tok); + return i + 1; + } + expecting = S_COMMA_OR_EOO; + break; + + case S_KEY: + if (c == '"') { + int n = mjson_pass_string(&s[i + 1], len - i - 1); + if (n < 0) return n; + i += n + 1; + tok = MJSON_TOK_KEY; + expecting = S_COLON; + } else if (c == '}') { // Empty object + MJSONEOO(); + expecting = S_COMMA_OR_EOO; + } else { + return MJSON_ERROR_INVALID_INPUT; + } + break; + + case S_COLON: + if (c == ':') { + expecting = S_VALUE; + } else { + return MJSON_ERROR_INVALID_INPUT; + } + break; + + case S_COMMA_OR_EOO: + if (depth <= 0) return MJSON_ERROR_INVALID_INPUT; + if (c == ',') { + expecting = (nesting[depth - 1] == '{') ? S_KEY : S_VALUE; + } else if (c == ']' || c == '}') { + MJSONEOO(); + } else { + return MJSON_ERROR_INVALID_INPUT; + } + break; + } + MJSONCALL(tok); + } + return MJSON_ERROR_INVALID_INPUT; +} + +struct msjon_get_data { + const char *path; // Lookup json path + int pos; // Current path index + int d1; // Current depth of traversal + int d2; // Expected depth of traversal + int i1; // Index in an array + int i2; // Expected index in an array + int obj; // If the value is array/object, offset where it starts + const char **tokptr; // Destination + int *toklen; // Destination length + int tok; // Returned token +}; + +#include <stdio.h> + +static int plen1(const char *s) { + int i = 0, n = 0; + while (s[i] != '\0' && s[i] != '.' && s[i] != '[') + n++, i += s[i] == '\\' ? 2 : 1; + // printf("PLEN: s: [%s], [%.*s] => %d\n", s, i, s, n); + return n; +} + +static int plen2(const char *s) { + int i = 0, __attribute__((unused)) n = 0; + while (s[i] != '\0' && s[i] != '.' && s[i] != '[') + n++, i += s[i] == '\\' ? 
2 : 1; + // printf("PLEN: s: [%s], [%.*s] => %d\n", s, i, s, n); + return i; +} + +static int kcmp(const char *a, const char *b, int n) { + int i = 0, j = 0, r = 0; + for (i = 0, j = 0; j < n; i++, j++) { + if (b[i] == '\\') i++; + if ((r = a[j] - b[i]) != 0) return r; + } + // printf("KCMP: a: [%.*s], b:[%.*s] ==> %d\n", n, a, i, b, r); + return r; +} + +static int mjson_get_cb(int tok, const char *s, int off, int len, void *ud) { + struct msjon_get_data *data = (struct msjon_get_data *) ud; + // printf("--> %2x %2d %2d %2d %2d\t'%s'\t'%.*s'\t\t'%.*s'\n", tok, data->d1, + // data->d2, data->i1, data->i2, data->path + data->pos, off, s, len, + // s + off); + if (data->tok != MJSON_TOK_INVALID) return 1; // Found + + if (tok == '{') { + if (!data->path[data->pos] && data->d1 == data->d2) data->obj = off; + data->d1++; + } else if (tok == '[') { + if (data->d1 == data->d2 && data->path[data->pos] == '[') { + data->i1 = 0; + data->i2 = (int) mystrtod(&data->path[data->pos + 1], NULL); + if (data->i1 == data->i2) { + data->d2++; + data->pos += 3; + } + } + if (!data->path[data->pos] && data->d1 == data->d2) data->obj = off; + data->d1++; + } else if (tok == ',') { + if (data->d1 == data->d2 + 1) { + data->i1++; + if (data->i1 == data->i2) { + while (data->path[data->pos] != ']') data->pos++; + data->pos++; + data->d2++; + } + } + } else if (tok == MJSON_TOK_KEY && data->d1 == data->d2 + 1 && + data->path[data->pos] == '.' && s[off] == '"' && + s[off + len - 1] == '"' && + plen1(&data->path[data->pos + 1]) == len - 2 && + kcmp(s + off + 1, &data->path[data->pos + 1], len - 2) == 0) { + data->d2++; + data->pos += plen2(&data->path[data->pos + 1]) + 1; + } else if (tok == MJSON_TOK_KEY && data->d1 == data->d2) { + return 1; // Exhausted path, not found + } else if (tok == '}' || tok == ']') { + data->d1--; + // data->d2--; + if (!data->path[data->pos] && data->d1 == data->d2 && data->obj != -1) { + data->tok = tok - 2; + if (data->tokptr) *data->tokptr = s + data->obj; + if (data->toklen) *data->toklen = off - data->obj + 1; + return 1; + } + } else if (MJSON_TOK_IS_VALUE(tok)) { + // printf("TOK --> %d\n", tok); + if (data->d1 == data->d2 && !data->path[data->pos]) { + data->tok = tok; + if (data->tokptr) *data->tokptr = s + off; + if (data->toklen) *data->toklen = len; + return 1; + } + } + return 0; +} + +enum mjson_tok mjson_find(const char *s, int len, const char *jp, + const char **tokptr, int *toklen) { + struct msjon_get_data data = {jp, 1, 0, 0, 0, + 0, -1, tokptr, toklen, MJSON_TOK_INVALID}; + if (jp[0] != '$') return MJSON_TOK_INVALID; + if (mjson(s, len, mjson_get_cb, &data) < 0) return MJSON_TOK_INVALID; + return (enum mjson_tok) data.tok; +} + +int mjson_get_number(const char *s, int len, const char *path, double *v) { + const char *p; + int tok, n; + if ((tok = mjson_find(s, len, path, &p, &n)) == MJSON_TOK_NUMBER) { + if (v != NULL) *v = mystrtod(p, NULL); + } + return tok == MJSON_TOK_NUMBER ? 1 : 0; +} + +int mjson_get_bool(const char *s, int len, const char *path, int *v) { + int tok = mjson_find(s, len, path, NULL, NULL); + if (tok == MJSON_TOK_TRUE && v != NULL) *v = 1; + if (tok == MJSON_TOK_FALSE && v != NULL) *v = 0; + return tok == MJSON_TOK_TRUE || tok == MJSON_TOK_FALSE ? 1 : 0; +} + +static unsigned char mjson_unhex_nimble(const char *s) { + unsigned char i, v = 0; + for (i = 0; i < 2; i++) { + int c = s[i]; + if (i > 0) v <<= 4; + v |= (c >= '0' && c <= '9') ? c - '0' + : (c >= 'A' && c <= 'F') ? 
c - '7' : c - 'W'; + } + return v; +} + +static int mjson_unescape(const char *s, int len, char *to, int n) { + int i, j; + for (i = 0, j = 0; i < len && j < n; i++, j++) { + if (s[i] == '\\' && i + 5 < len && s[i + 1] == 'u') { + // \uXXXX escape. We could process a simple one-byte chars + // \u00xx from the ASCII range. More complex chars would require + // dragging in a UTF8 library, which is too much for us + if (s[i + 2] != '0' || s[i + 3] != '0') return -1; // Too much, give up + to[j] = mjson_unhex_nimble(s + i + 4); + i += 5; + } else if (s[i] == '\\' && i + 1 < len) { + int c = mjson_esc(s[i + 1], 0); + if (c == 0) return -1; + to[j] = c; + i++; + } else { + to[j] = s[i]; + } + } + if (j >= n) return -1; + if (n > 0) to[j] = '\0'; + return j; +} + +int mjson_get_string(const char *s, int len, const char *path, char *to, + int n) { + const char *p; + int sz; + if (mjson_find(s, len, path, &p, &sz) != MJSON_TOK_STRING) return -1; + return mjson_unescape(p + 1, sz - 2, to, n); +} + +int mjson_get_hex(const char *s, int len, const char *x, char *to, int n) { + const char *p; + int i, j, sz; + if (mjson_find(s, len, x, &p, &sz) != MJSON_TOK_STRING) return -1; + for (i = j = 0; i < sz - 3 && j < n; i += 2, j++) { + ((unsigned char *) to)[j] = mjson_unhex_nimble(p + i + 1); + } + if (j < n) to[j] = '\0'; + return j; +} + +#if MJSON_ENABLE_BASE64 +static int mjson_base64rev(int c) { + if (c >= 'A' && c <= 'Z') { + return c - 'A'; + } else if (c >= 'a' && c <= 'z') { + return c + 26 - 'a'; + } else if (c >= '0' && c <= '9') { + return c + 52 - '0'; + } else if (c == '+') { + return 62; + } else if (c == '/') { + return 63; + } else { + return 64; + } +} + +int mjson_base64_dec(const char *src, int n, char *dst, int dlen) { + const char *end = src + n; + int len = 0; + while (src + 3 < end && len < dlen) { + int a = mjson_base64rev(src[0]), b = mjson_base64rev(src[1]), + c = mjson_base64rev(src[2]), d = mjson_base64rev(src[3]); + dst[len++] = (a << 2) | (b >> 4); + if (src[2] != '=' && len < dlen) { + dst[len++] = (b << 4) | (c >> 2); + if (src[3] != '=' && len < dlen) { + dst[len++] = (c << 6) | d; + } + } + src += 4; + } + if (len < dlen) dst[len] = '\0'; + return len; +} + +int mjson_get_base64(const char *s, int len, const char *path, char *to, + int n) { + const char *p; + int sz; + if (mjson_find(s, len, path, &p, &sz) != MJSON_TOK_STRING) return 0; + return mjson_base64_dec(p + 1, sz - 2, to, n); +} +#endif // MJSON_ENABLE_BASE64 + +#if MJSON_ENABLE_NEXT +struct nextdata { + int off, len, depth, t, vo, arrayindex; + int *koff, *klen, *voff, *vlen, *vtype; +}; + +static int next_cb(int tok, const char *s, int off, int len, void *ud) { + struct nextdata *d = (struct nextdata *) ud; + // int i; + switch (tok) { + case '{': + case '[': + if (d->depth == 0 && tok == '[') d->arrayindex = 0; + if (d->depth == 1 && off > d->off) { + d->vo = off; + d->t = tok == '{' ? 
MJSON_TOK_OBJECT : MJSON_TOK_ARRAY; + if (d->voff) *d->voff = off; + if (d->vtype) *d->vtype = d->t; + } + d->depth++; + break; + case '}': + case ']': + d->depth--; + if (d->depth == 1 && d->vo) { + d->len = off + len; + if (d->vlen) *d->vlen = d->len - d->vo; + if (d->arrayindex >= 0) { + if (d->koff) *d->koff = d->arrayindex; // koff holds array index + if (d->klen) *d->klen = 0; // klen holds 0 + } + return 1; + } + if (d->depth == 1 && d->arrayindex >= 0) d->arrayindex++; + break; + case ',': + case ':': + break; + case MJSON_TOK_KEY: + if (d->depth == 1 && d->off < off) { + if (d->koff) *d->koff = off; // And report back to the user + if (d->klen) *d->klen = len; // If we have to + } + break; + default: + if (d->depth != 1) break; + // If we're iterating over the array + if (off > d->off) { + d->len = off + len; + if (d->vlen) *d->vlen = len; // value length + if (d->voff) *d->voff = off; // value offset + if (d->vtype) *d->vtype = tok; // value type + if (d->arrayindex >= 0) { + if (d->koff) *d->koff = d->arrayindex; // koff holds array index + if (d->klen) *d->klen = 0; // klen holds 0 + } + return 1; + } + if (d->arrayindex >= 0) d->arrayindex++; + break; + } + (void) s; + return 0; +} + +int mjson_next(const char *s, int n, int off, int *koff, int *klen, int *voff, + int *vlen, int *vtype) { + struct nextdata d = {off, 0, 0, 0, 0, -1, koff, klen, voff, vlen, vtype}; + mjson(s, n, next_cb, &d); + return d.len; +} +#endif + +#if MJSON_ENABLE_PRINT +int mjson_print_fixed_buf(const char *ptr, int len, void *fndata) { + struct mjson_fixedbuf *fb = (struct mjson_fixedbuf *) fndata; + int i, left = fb->size - 1 - fb->len; + if (left < len) len = left; + for (i = 0; i < len; i++) fb->ptr[fb->len + i] = ptr[i]; + fb->len += len; + fb->ptr[fb->len] = '\0'; + return len; +} + +// This function allocates memory in chunks of size MJSON_DYNBUF_CHUNK +// to decrease memory fragmentation, when many calls are executed to +// print e.g. a base64 string or a hex string. +int mjson_print_dynamic_buf(const char *ptr, int len, void *fndata) { + char *s, *buf = *(char **) fndata; + size_t curlen = buf == NULL ? 0 : strlen(buf); + size_t new_size = curlen + len + 1 + MJSON_DYNBUF_CHUNK; + new_size -= new_size % MJSON_DYNBUF_CHUNK; + + if ((s = (char *) realloc(buf, new_size)) == NULL) { + return 0; + } else { + memcpy(s + curlen, ptr, len); + s[curlen + len] = '\0'; + *(char **) fndata = s; + return len; + } +} + +int mjson_print_null(const char *ptr, int len, void *userdata) { + (void) ptr; + (void) userdata; + return len; +} + +int mjson_print_buf(mjson_print_fn_t fn, void *fnd, const char *buf, int len) { + return fn(buf, len, fnd); +} + +int mjson_print_long(mjson_print_fn_t fn, void *fnd, long val, int is_signed) { + unsigned long v = val, s = 0, n, i; + char buf[20], t; + if (is_signed && val < 0) { + buf[s++] = '-', v = -val; + } + // This loop prints a number in reverse order. I guess this is because we + // write numbers from right to left: least significant digit comes last. + // Maybe because we use Arabic numbers, and Arabs write RTL? + for (n = 0; v > 0; v /= 10) buf[s + n++] = "0123456789"[v % 10]; + // Reverse a string + for (i = 0; i < n / 2; i++) + t = buf[s + i], buf[s + i] = buf[s + n - i - 1], buf[s + n - i - 1] = t; + if (val == 0) buf[n++] = '0'; // Handle special case + return fn(buf, s + n, fnd); +} + +int mjson_print_int(mjson_print_fn_t fn, void *fnd, int v, int s) { + return mjson_print_long(fn, fnd, s ? 
(long) v : (unsigned) v, s); +} + +static int addexp(char *buf, int e, int sign) { + int n = 0; + buf[n++] = 'e'; + buf[n++] = sign; + if (e > 400) return 0; + if (e < 10) buf[n++] = '0'; + if (e >= 100) buf[n++] = (e / 100) + '0', e -= 100 * (e / 100); + if (e >= 10) buf[n++] = (e / 10) + '0', e -= 10 * (e / 10); + buf[n++] = e + '0'; + return n; +} + +int mjson_print_dbl(mjson_print_fn_t fn, void *fnd, double d, int width) { + char buf[40]; + int i, s = 0, n = 0, e = 0; + double t, mul, saved; + if (d == 0.0) return fn("0", 1, fnd); + if (isinf(d)) return fn(d > 0 ? "inf" : "-inf", d > 0 ? 3 : 4, fnd); + if (isnan(d)) return fn("nan", 3, fnd); + if (d < 0.0) d = -d, buf[s++] = '-'; + + // Round + saved = d; + mul = 1.0; + while (d >= 10.0 && d / mul >= 10.0) mul *= 10.0; + while (d <= 1.0 && d / mul <= 1.0) mul /= 10.0; + for (i = 0, t = mul * 5; i < width; i++) t /= 10.0; + d += t; + // Calculate exponent, and 'mul' for scientific representation + mul = 1.0; + while (d >= 10.0 && d / mul >= 10.0) mul *= 10.0, e++; + while (d < 1.0 && d / mul < 1.0) mul /= 10.0, e--; + // printf(" --> %g %d %g %g\n", saved, e, t, mul); + + if (e >= width) { + struct mjson_fixedbuf fb = {buf + s, (int) sizeof(buf) - s, 0}; + n = mjson_print_dbl(mjson_print_fixed_buf, &fb, saved / mul, width); + // printf(" --> %.*g %d [%.*s]\n", 10, d / t, e, fb.len, fb.ptr); + n += addexp(buf + s + n, e, '+'); + return fn(buf, s + n, fnd); + } else if (e <= -width) { + struct mjson_fixedbuf fb = {buf + s, (int) sizeof(buf) - s, 0}; + n = mjson_print_dbl(mjson_print_fixed_buf, &fb, saved / mul, width); + // printf(" --> %.*g %d [%.*s]\n", 10, d / mul, e, fb.len, fb.ptr); + n += addexp(buf + s + n, -e, '-'); + return fn(buf, s + n, fnd); + } else { + for (i = 0, t = mul; d >= 1.0 && s + n < (int) sizeof(buf); i++) { + int ch = (int) (d / t); + if (n > 0 || ch > 0) buf[s + n++] = ch + '0'; + d -= ch * t; + t /= 10.0; + } + // printf(" --> [%g] -> %g %g (%d) [%.*s]\n", saved, d, t, n, s + n, buf); + if (n == 0) buf[s++] = '0'; + while (t >= 1.0 && n + s < (int) sizeof(buf)) buf[n++] = '0', t /= 10.0; + if (s + n < (int) sizeof(buf)) buf[n + s++] = '.'; + // printf(" 1--> [%g] -> [%.*s]\n", saved, s + n, buf); + for (i = 0, t = 0.1; s + n < (int) sizeof(buf) && n < width; i++) { + int ch = (int) (d / t); + buf[s + n++] = ch + '0'; + d -= ch * t; + t /= 10.0; + } + } + while (n > 0 && buf[s + n - 1] == '0') n--; // Trim trailing zeros + if (n > 0 && buf[s + n - 1] == '.') n--; // Trim trailing dot + return fn(buf, s + n, fnd); +} + +int mjson_print_str(mjson_print_fn_t fn, void *fnd, const char *s, int len) { + int i, n = fn("\"", 1, fnd); + for (i = 0; i < len; i++) { + char c = mjson_escape(s[i]); + if (c) { + n += fn("\\", 1, fnd); + n += fn(&c, 1, fnd); + } else { + n += fn(&s[i], 1, fnd); + } + } + return n + fn("\"", 1, fnd); +} + +#if MJSON_ENABLE_BASE64 +int mjson_print_b64(mjson_print_fn_t fn, void *fnd, const unsigned char *s, + int n) { + const char *t = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + int i, len = fn("\"", 1, fnd); + for (i = 0; i < n; i += 3) { + int a = s[i], b = i + 1 < n ? s[i + 1] : 0, c = i + 2 < n ? 
s[i + 2] : 0; + char buf[4] = {t[a >> 2], t[(a & 3) << 4 | (b >> 4)], '=', '='}; + if (i + 1 < n) buf[2] = t[(b & 15) << 2 | (c >> 6)]; + if (i + 2 < n) buf[3] = t[c & 63]; + len += fn(buf, sizeof(buf), fnd); + } + return len + fn("\"", 1, fnd); +} +#endif /* MJSON_ENABLE_BASE64 */ + +int mjson_vprintf(mjson_print_fn_t fn, void *fnd, const char *fmt, + va_list xap) { + int i = 0, n = 0; + va_list ap; + va_copy(ap, xap); + while (fmt[i] != '\0') { + if (fmt[i] == '%') { + char fc = fmt[++i]; + int is_long = 0; + if (fc == 'l') { + is_long = 1; + fc = fmt[i + 1]; + } + if (fc == 'Q') { + char *buf = va_arg(ap, char *); + n += mjson_print_str(fn, fnd, buf ? buf : "", + buf ? (int) strlen(buf) : 0); + } else if (strncmp(&fmt[i], ".*Q", 3) == 0) { + int len = va_arg(ap, int); + char *buf = va_arg(ap, char *); + n += mjson_print_str(fn, fnd, buf, len); + i += 2; + } else if (fc == 'd' || fc == 'u') { + int is_signed = (fc == 'd'); + if (is_long) { + long val = va_arg(ap, long); + n += mjson_print_long(fn, fnd, val, is_signed); + i++; + } else { + int val = va_arg(ap, int); + n += mjson_print_int(fn, fnd, val, is_signed); + } + } else if (fc == 'B') { + const char *s = va_arg(ap, int) ? "true" : "false"; + n += mjson_print_buf(fn, fnd, s, (int) strlen(s)); + } else if (fc == 's') { + char *buf = va_arg(ap, char *); + n += mjson_print_buf(fn, fnd, buf, (int) strlen(buf)); + } else if (strncmp(&fmt[i], ".*s", 3) == 0) { + int len = va_arg(ap, int); + char *buf = va_arg(ap, char *); + n += mjson_print_buf(fn, fnd, buf, len); + i += 2; + } else if (fc == 'g') { + n += mjson_print_dbl(fn, fnd, va_arg(ap, double), 6); + } else if (strncmp(&fmt[i], ".*g", 3) == 0) { + int width = va_arg(ap, int); + n += mjson_print_dbl(fn, fnd, va_arg(ap, double), width); + i += 2; +#if MJSON_ENABLE_BASE64 + } else if (fc == 'V') { + int len = va_arg(ap, int); + const char *buf = va_arg(ap, const char *); + n += mjson_print_b64(fn, fnd, (unsigned char *) buf, len); +#endif + } else if (fc == 'H') { + const char *hex = "0123456789abcdef"; + int i, len = va_arg(ap, int); + const unsigned char *p = va_arg(ap, const unsigned char *); + n += fn("\"", 1, fnd); + for (i = 0; i < len; i++) { + n += fn(&hex[(p[i] >> 4) & 15], 1, fnd); + n += fn(&hex[p[i] & 15], 1, fnd); + } + n += fn("\"", 1, fnd); + } else if (fc == 'M') { + mjson_vprint_fn_t vfn = va_arg(ap, mjson_vprint_fn_t); + n += vfn(fn, fnd, &ap); + } + i++; + } else { + n += mjson_print_buf(fn, fnd, &fmt[i++], 1); + } + } + va_end(xap); + va_end(ap); + return n; +} + +int mjson_printf(mjson_print_fn_t fn, void *fnd, const char *fmt, ...) { + va_list ap; + int len; + va_start(ap, fmt); + len = mjson_vprintf(fn, fnd, fmt, ap); + va_end(ap); + return len; +} +#endif /* MJSON_ENABLE_PRINT */ + +static int is_digit(int c) { + return c >= '0' && c <= '9'; +} + +/* NOTE: strtod() implementation by Yasuhiro Matsumoto. 
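+ * A local implementation is presumably used instead of libc strtod() so that + * number parsing stays locale-independent, JSON requiring '.' as the decimal + * separator regardless of the current locale.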
*/ +static double mystrtod(const char *str, char **end) { + double d = 0.0; + int sign = 1, __attribute__((unused)) n = 0; + const char *p = str, *a = str; + + /* decimal part */ + if (*p == '-') { + sign = -1; + ++p; + } else if (*p == '+') { + ++p; + } + if (is_digit(*p)) { + d = (double) (*p++ - '0'); + while (*p && is_digit(*p)) { + d = d * 10.0 + (double) (*p - '0'); + ++p; + ++n; + } + a = p; + } else if (*p != '.') { + goto done; + } + d *= sign; + + /* fraction part */ + if (*p == '.') { + double f = 0.0; + double base = 0.1; + ++p; + + if (is_digit(*p)) { + while (*p && is_digit(*p)) { + f += base * (*p - '0'); + base /= 10.0; + ++p; + ++n; + } + } + d += f * sign; + a = p; + } + + /* exponential part */ + if ((*p == 'E') || (*p == 'e')) { + int i, e = 0, neg = 0; + p++; + if (*p == '-') p++, neg++; + if (*p == '+') p++; + while (is_digit(*p)) e = e * 10 + *p++ - '0'; + if (neg) e = -e; +#if 0 + if (d == 2.2250738585072011 && e == -308) { + d = 0.0; + a = p; + goto done; + } + if (d == 2.2250738585072012 && e <= -308) { + d *= 1.0e-308; + a = p; + goto done; + } +#endif + for (i = 0; i < e; i++) d *= 10; + for (i = 0; i < -e; i++) d /= 10; + a = p; + } else if (p > str && !is_digit(*(p - 1))) { + a = str; + goto done; + } + +done: + if (end) *end = (char *) a; + return d; +} + +#if MJSON_ENABLE_MERGE +int mjson_merge(const char *s, int n, const char *s2, int n2, + mjson_print_fn_t fn, void *userdata) { + int koff, klen, voff, vlen, t, t2, k, off = 0, len = 0, comma = 0; + if (n < 2) return len; + len += fn("{", 1, userdata); + while ((off = mjson_next(s, n, off, &koff, &klen, &voff, &vlen, &t)) != 0) { + char *path = (char *) alloca(klen + 1); + const char *val; + memcpy(path, "$.", 2); + memcpy(path + 2, s + koff + 1, klen - 2); + path[klen] = '\0'; + if ((t2 = mjson_find(s2, n2, path, &val, &k)) != MJSON_TOK_INVALID) { + if (t2 == MJSON_TOK_NULL) continue; // null deletes the key + } else { + val = s + voff; // Key is not found in the update. Copy the old value. 
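+ // (Illustrative merge semantics: merging {"a":1,"b":2} with + // {"b":null,"c":3} keeps "a", deletes "b" because of the null, and + // adds "c", yielding {"a":1,"c":3}.)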
+ } + if (comma) len += fn(",", 1, userdata); + len += fn(s + koff, klen, userdata); + len += fn(":", 1, userdata); + if (t == MJSON_TOK_OBJECT && t2 == MJSON_TOK_OBJECT) { + len += mjson_merge(s + voff, vlen, val, k, fn, userdata); + } else { + if (t2 != MJSON_TOK_INVALID) vlen = k; + len += fn(val, vlen, userdata); + } + comma = 1; + } + // Add missing keys + off = 0; + while ((off = mjson_next(s2, n2, off, &koff, &klen, &voff, &vlen, &t)) != 0) { + char *path = (char *) alloca(klen + 1); + const char *val; + if (t == MJSON_TOK_NULL) continue; + memcpy(path, "$.", 2); + memcpy(path + 2, s2 + koff + 1, klen - 2); + path[klen] = '\0'; + if (mjson_find(s, n, path, &val, &vlen) != MJSON_TOK_INVALID) continue; + if (comma) len += fn(",", 1, userdata); + len += fn(s2 + koff, klen, userdata); + len += fn(":", 1, userdata); + len += fn(s2 + voff, vlen, userdata); + comma = 1; + } + len += fn("}", 1, userdata); + return len; +} +#endif // MJSON_ENABLE_MERGE + +#if MJSON_ENABLE_PRETTY +struct prettydata { + int level; + int len; + int prev; + const char *pad; + int padlen; + mjson_print_fn_t fn; + void *userdata; +}; + +static int pretty_cb(int ev, const char *s, int off, int len, void *ud) { + struct prettydata *d = (struct prettydata *) ud; + int i; + switch (ev) { + case '{': + case '[': + d->level++; + d->len += d->fn(s + off, len, d->userdata); + break; + case '}': + case ']': + d->level--; + if (d->prev != '[' && d->prev != '{' && d->padlen > 0) { + d->len += d->fn("\n", 1, d->userdata); + for (i = 0; i < d->level; i++) + d->len += d->fn(d->pad, d->padlen, d->userdata); + } + d->len += d->fn(s + off, len, d->userdata); + break; + case ',': + d->len += d->fn(s + off, len, d->userdata); + if (d->padlen > 0) { + d->len += d->fn("\n", 1, d->userdata); + for (i = 0; i < d->level; i++) + d->len += d->fn(d->pad, d->padlen, d->userdata); + } + break; + case ':': + d->len += d->fn(s + off, len, d->userdata); + if (d->padlen > 0) d->len += d->fn(" ", 1, d->userdata); + break; + case MJSON_TOK_KEY: + if (d->prev == '{' && d->padlen > 0) { + d->len += d->fn("\n", 1, d->userdata); + for (i = 0; i < d->level; i++) + d->len += d->fn(d->pad, d->padlen, d->userdata); + } + d->len += d->fn(s + off, len, d->userdata); + break; + default: + if (d->prev == '[' && d->padlen > 0) { + d->len += d->fn("\n", 1, d->userdata); + for (i = 0; i < d->level; i++) + d->len += d->fn(d->pad, d->padlen, d->userdata); + } + d->len += d->fn(s + off, len, d->userdata); + break; + } + d->prev = ev; + return 0; +} + +int mjson_pretty(const char *s, int n, const char *pad, mjson_print_fn_t fn, + void *userdata) { + struct prettydata d = {0, 0, 0, pad, (int) strlen(pad), fn, userdata}; + if (mjson(s, n, pretty_cb, &d) < 0) return -1; + return d.len; +} +#endif // MJSON_ENABLE_PRETTY + +#if MJSON_ENABLE_RPC +struct jsonrpc_ctx jsonrpc_default_context; + +int mjson_globmatch(const char *s1, int n1, const char *s2, int n2) { + int i = 0, j = 0, ni = 0, nj = 0; + while (i < n1 || j < n2) { + if (i < n1 && j < n2 && (s1[i] == '?' 
|| s2[j] == s1[i])) {
+      i++, j++;
+    } else if (i < n1 && (s1[i] == '*' || s1[i] == '#')) {
+      ni = i, nj = j + 1, i++;
+    } else if (nj > 0 && nj <= n2 && (s1[i - 1] == '#' || s2[j] != '/')) {
+      i = ni, j = nj;
+    } else {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+void jsonrpc_return_errorv(struct jsonrpc_request *r, int code,
+                           const char *message, const char *data_fmt,
+                           va_list ap) {
+  if (r->id_len == 0) return;
+  mjson_printf(r->fn, r->fndata,
+               "{\"id\":%.*s,\"error\":{\"code\":%d,\"message\":%Q", r->id_len,
+               r->id, code, message == NULL ? "" : message);
+  if (data_fmt != NULL) {
+    mjson_printf(r->fn, r->fndata, ",\"data\":");
+    mjson_vprintf(r->fn, r->fndata, data_fmt, ap);
+  }
+  mjson_printf(r->fn, r->fndata, "}}\n");
+}
+
+void jsonrpc_return_error(struct jsonrpc_request *r, int code,
+                          const char *message, const char *data_fmt, ...) {
+  va_list ap;
+  va_start(ap, data_fmt);
+  jsonrpc_return_errorv(r, code, message, data_fmt, ap);
+  va_end(ap);
+}
+
+void jsonrpc_return_successv(struct jsonrpc_request *r, const char *result_fmt,
+                             va_list ap) {
+  if (r->id_len == 0) return;
+  mjson_printf(r->fn, r->fndata, "{\"id\":%.*s,\"result\":", r->id_len, r->id);
+  if (result_fmt != NULL) {
+    mjson_vprintf(r->fn, r->fndata, result_fmt, ap);
+  } else {
+    mjson_printf(r->fn, r->fndata, "%s", "null");
+  }
+  mjson_printf(r->fn, r->fndata, "}\n");
+}
+
+void jsonrpc_return_success(struct jsonrpc_request *r, const char *result_fmt,
+                            ...) {
+  va_list ap;
+  va_start(ap, result_fmt);
+  jsonrpc_return_successv(r, result_fmt, ap);
+  va_end(ap);
+}
+
+void jsonrpc_ctx_process(struct jsonrpc_ctx *ctx, const char *buf, int len,
+                         mjson_print_fn_t fn, void *fndata, void *ud) {
+  const char *result = NULL, *error = NULL;
+  int result_sz = 0, error_sz = 0;
+  struct jsonrpc_method *m = NULL;
+  struct jsonrpc_request r = {ctx, buf, len, 0, 0, 0, 0, 0, 0, fn, fndata, ud};
+
+  // Is it a response frame?
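+  // (a response carries "result" or "error", e.g. {"id":1,"result":true},
+  // while a request carries "method", e.g. {"id":1,"method":"sum"})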
+ mjson_find(buf, len, "$.result", &result, &result_sz); + if (result == NULL) mjson_find(buf, len, "$.error", &error, &error_sz); + if (result_sz > 0 || error_sz > 0) { + if (ctx->response_cb) ctx->response_cb(buf, len, ctx->response_cb_data); + return; + } + + // Method must exist and must be a string + if (mjson_find(buf, len, "$.method", &r.method, &r.method_len) != + MJSON_TOK_STRING) { + mjson_printf(fn, fndata, "{\"error\":{\"code\":-32700,\"message\":%.*Q}}\n", + len, buf); + return; + } + + // id and params are optional + mjson_find(buf, len, "$.id", &r.id, &r.id_len); + mjson_find(buf, len, "$.params", &r.params, &r.params_len); + + for (m = ctx->methods; m != NULL; m = m->next) { + if (mjson_globmatch(m->method, m->method_sz, r.method + 1, + r.method_len - 2) > 0) { + if (r.params == NULL) r.params = ""; + m->cb(&r); + break; + } + } + if (m == NULL) { + jsonrpc_return_error(&r, JSONRPC_ERROR_NOT_FOUND, "method not found", NULL); + } +} + +static int jsonrpc_print_methods(mjson_print_fn_t fn, void *fndata, + va_list *ap) { + struct jsonrpc_ctx *ctx = va_arg(*ap, struct jsonrpc_ctx *); + struct jsonrpc_method *m; + int len = 0; + for (m = ctx->methods; m != NULL; m = m->next) { + if (m != ctx->methods) len += mjson_print_buf(fn, fndata, ",", 1); + len += mjson_print_str(fn, fndata, m->method, (int) strlen(m->method)); + } + return len; +} + +static void rpclist(struct jsonrpc_request *r) { + jsonrpc_return_success(r, "[%M]", jsonrpc_print_methods, r->ctx); +} + +void jsonrpc_ctx_init(struct jsonrpc_ctx *ctx, mjson_print_fn_t response_cb, + void *response_cb_data) { + ctx->response_cb = response_cb; + ctx->response_cb_data = response_cb_data; + jsonrpc_ctx_export(ctx, MJSON_RPC_LIST_NAME, rpclist); +} + +void jsonrpc_init(mjson_print_fn_t response_cb, void *userdata) { + jsonrpc_ctx_init(&jsonrpc_default_context, response_cb, userdata); +} +#endif // MJSON_ENABLE_RPC diff --git a/src/mqtt.c b/src/mqtt.c new file mode 100644 index 0000000..5688296 --- /dev/null +++ b/src/mqtt.c @@ -0,0 +1,1281 @@ +/* + * MQTT Protocol + * + * Copyright 2020 Baptiste Assmann <bedis9@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ *
+ */
+
+#include <haproxy/chunk.h>
+#include <haproxy/mqtt.h>
+
+uint8_t mqtt_cpt_flags[MQTT_CPT_ENTRIES] = {
+	[MQTT_CPT_INVALID]     = 0x00,
+	[MQTT_CPT_CONNECT]     = 0x00,
+	[MQTT_CPT_CONNACK]     = 0x00,
+
+	/* MQTT_CPT_PUBLISH flags can have different values (DUP, QoS, RETAIN) and
+	 * must be checked more carefully
+	 */
+	[MQTT_CPT_PUBLISH]     = 0x0F,
+
+	[MQTT_CPT_PUBACK]      = 0x00,
+	[MQTT_CPT_PUBREC]      = 0x00,
+	[MQTT_CPT_PUBREL]      = 0x02,
+	[MQTT_CPT_PUBCOMP]     = 0x00,
+	[MQTT_CPT_SUBSCRIBE]   = 0x02,
+	[MQTT_CPT_SUBACK]      = 0x00,
+	[MQTT_CPT_UNSUBSCRIBE] = 0x02,
+	[MQTT_CPT_UNSUBACK]    = 0x00,
+	[MQTT_CPT_PINGREQ]     = 0x00,
+	[MQTT_CPT_PINGRESP]    = 0x00,
+	[MQTT_CPT_DISCONNECT]  = 0x00,
+	[MQTT_CPT_AUTH]        = 0x00,
+};
+
+const struct ist mqtt_fields_string[MQTT_FN_ENTRIES] = {
+	[MQTT_FN_INVALID]                            = IST(""),
+
+	/* fields common to MQTT 3.1, 3.1.1 and 5.0: they have no unique id, so we use strings */
+	[MQTT_FN_FLAGS]                              = IST("flags"),
+	[MQTT_FN_REASON_CODE]                        = IST("reason_code"),      /* MQTT 3.1 and 3.1.1: return_code */
+	[MQTT_FN_PROTOCOL_NAME]                      = IST("protocol_name"),
+	[MQTT_FN_PROTOCOL_VERSION]                   = IST("protocol_version"), /* MQTT 3.1.1: protocol_level */
+	[MQTT_FN_CLIENT_IDENTIFIER]                  = IST("client_identifier"),
+	[MQTT_FN_WILL_TOPIC]                         = IST("will_topic"),
+	[MQTT_FN_WILL_PAYLOAD]                       = IST("will_payload"),     /* MQTT 3.1 and 3.1.1: will_message */
+	[MQTT_FN_USERNAME]                           = IST("username"),
+	[MQTT_FN_PASSWORD]                           = IST("password"),
+	[MQTT_FN_KEEPALIVE]                          = IST("keepalive"),
+	/* from here on, it's MQTT 5.0 only */
+	[MQTT_FN_PAYLOAD_FORMAT_INDICATOR]           = IST("1"),
+	[MQTT_FN_MESSAGE_EXPIRY_INTERVAL]            = IST("2"),
+	[MQTT_FN_CONTENT_TYPE]                       = IST("3"),
+	[MQTT_FN_RESPONSE_TOPIC]                     = IST("8"),
+	[MQTT_FN_CORRELATION_DATA]                   = IST("9"),
+	[MQTT_FN_SUBSCRIPTION_IDENTIFIER]            = IST("11"),
+	[MQTT_FN_SESSION_EXPIRY_INTERVAL]            = IST("17"),
+	[MQTT_FN_ASSIGNED_CLIENT_IDENTIFIER]         = IST("18"),
+	[MQTT_FN_SERVER_KEEPALIVE]                   = IST("19"),
+	[MQTT_FN_AUTHENTICATION_METHOD]              = IST("21"),
+	[MQTT_FN_AUTHENTICATION_DATA]                = IST("22"),
+	[MQTT_FN_REQUEST_PROBLEM_INFORMATION]        = IST("23"),
+	[MQTT_FN_DELAY_INTERVAL]                     = IST("24"),
+	[MQTT_FN_REQUEST_RESPONSE_INFORMATION]       = IST("25"),
+	[MQTT_FN_RESPONSE_INFORMATION]               = IST("26"),
+	[MQTT_FN_SERVER_REFERENCE]                   = IST("28"),
+	[MQTT_FN_REASON_STRING]                      = IST("31"),
+	[MQTT_FN_RECEIVE_MAXIMUM]                    = IST("33"),
+	[MQTT_FN_TOPIC_ALIAS_MAXIMUM]                = IST("34"),
+	[MQTT_FN_TOPIC_ALIAS]                        = IST("35"),
+	[MQTT_FN_MAXIMUM_QOS]                        = IST("36"),
+	[MQTT_FN_RETAIN_AVAILABLE]                   = IST("37"),
+	[MQTT_FN_USER_PROPERTY]                      = IST("38"),
+	[MQTT_FN_MAXIMUM_PACKET_SIZE]                = IST("39"),
+	[MQTT_FN_WILDCARD_SUBSCRIPTION_AVAILABLE]    = IST("40"),
+	[MQTT_FN_SUBSCRIPTION_IDENTIFIERS_AVAILABLE] = IST("41"),
+	[MQTT_FN_SHARED_SUBSCRIPTION_AVAILABLE]      = IST("42"),
+};
+
+/* list of supported capturable field names for each MQTT control packet type */
+const uint64_t mqtt_fields_per_packet[MQTT_CPT_ENTRIES] = {
+	[MQTT_CPT_INVALID]     = 0,
+
+	[MQTT_CPT_CONNECT]     = MQTT_FN_BIT_PROTOCOL_NAME | MQTT_FN_BIT_PROTOCOL_VERSION |
+	                         MQTT_FN_BIT_FLAGS | MQTT_FN_BIT_KEEPALIVE |
+	                         MQTT_FN_BIT_SESSION_EXPIRY_INTERVAL | MQTT_FN_BIT_RECEIVE_MAXIMUM |
+	                         MQTT_FN_BIT_MAXIMUM_PACKET_SIZE | MQTT_FN_BIT_TOPIC_ALIAS_MAXIMUM |
+	                         MQTT_FN_BIT_REQUEST_RESPONSE_INFORMATION | MQTT_FN_BIT_REQUEST_PROBLEM_INFORMATION |
+	                         MQTT_FN_BIT_USER_PROPERTY | MQTT_FN_BIT_AUTHENTICATION_METHOD |
+	                         MQTT_FN_BIT_AUTHENTICATION_DATA | MQTT_FN_BIT_CLIENT_IDENTIFIER |
+	                         MQTT_FN_BIT_DELAY_INTERVAL | MQTT_FN_BIT_PAYLOAD_FORMAT_INDICATOR |
+	                         MQTT_FN_BIT_MESSAGE_EXPIRY_INTERVAL | MQTT_FN_BIT_CONTENT_TYPE |
+	                         MQTT_FN_BIT_RESPONSE_TOPIC | MQTT_FN_BIT_CORRELATION_DATA |
+	                         MQTT_FN_BIT_USER_PROPERTY | MQTT_FN_BIT_WILL_TOPIC |
+	                         MQTT_FN_BIT_WILL_PAYLOAD | MQTT_FN_BIT_USERNAME |
+	                         MQTT_FN_BIT_PASSWORD,
+
+	[MQTT_CPT_CONNACK]     = MQTT_FN_BIT_FLAGS | MQTT_FN_BIT_PROTOCOL_VERSION |
+	                         MQTT_FN_BIT_REASON_CODE | MQTT_FN_BIT_SESSION_EXPIRY_INTERVAL |
+	                         MQTT_FN_BIT_RECEIVE_MAXIMUM | MQTT_FN_BIT_MAXIMUM_QOS |
+	                         MQTT_FN_BIT_RETAIN_AVAILABLE | MQTT_FN_BIT_MAXIMUM_PACKET_SIZE |
+	                         MQTT_FN_BIT_ASSIGNED_CLIENT_IDENTIFIER | MQTT_FN_BIT_TOPIC_ALIAS_MAXIMUM |
+	                         MQTT_FN_BIT_REASON_STRING | MQTT_FN_BIT_WILDCARD_SUBSCRIPTION_AVAILABLE |
+	                         MQTT_FN_BIT_SUBSCRIPTION_IDENTIFIERS_AVAILABLE | MQTT_FN_BIT_SHARED_SUBSCRIPTION_AVAILABLE |
+	                         MQTT_FN_BIT_SERVER_KEEPALIVE | MQTT_FN_BIT_RESPONSE_INFORMATION |
+	                         MQTT_FN_BIT_SERVER_REFERENCE | MQTT_FN_BIT_USER_PROPERTY |
+	                         MQTT_FN_BIT_AUTHENTICATION_METHOD | MQTT_FN_BIT_AUTHENTICATION_DATA,
+
+	[MQTT_CPT_PUBLISH]     = MQTT_FN_BIT_PAYLOAD_FORMAT_INDICATOR | MQTT_FN_BIT_MESSAGE_EXPIRY_INTERVAL |
+	                         MQTT_FN_BIT_CONTENT_TYPE | MQTT_FN_BIT_RESPONSE_TOPIC |
+	                         MQTT_FN_BIT_CORRELATION_DATA | MQTT_FN_BIT_SUBSCRIPTION_IDENTIFIER |
+	                         MQTT_FN_BIT_TOPIC_ALIAS | MQTT_FN_BIT_USER_PROPERTY,
+
+	[MQTT_CPT_PUBACK]      = MQTT_FN_BIT_REASON_CODE | MQTT_FN_BIT_REASON_STRING | MQTT_FN_BIT_USER_PROPERTY,
+
+	[MQTT_CPT_PUBREC]      = MQTT_FN_BIT_REASON_CODE | MQTT_FN_BIT_REASON_STRING | MQTT_FN_BIT_USER_PROPERTY,
+
+	[MQTT_CPT_PUBREL]      = MQTT_FN_BIT_REASON_CODE | MQTT_FN_BIT_REASON_STRING | MQTT_FN_BIT_USER_PROPERTY,
+
+	[MQTT_CPT_PUBCOMP]     = MQTT_FN_BIT_REASON_CODE | MQTT_FN_BIT_REASON_STRING | MQTT_FN_BIT_USER_PROPERTY,
+
+	[MQTT_CPT_SUBSCRIBE]   = MQTT_FN_BIT_SUBSCRIPTION_IDENTIFIER | MQTT_FN_BIT_USER_PROPERTY,
+
+	[MQTT_CPT_SUBACK]      = MQTT_FN_BIT_REASON_STRING | MQTT_FN_BIT_USER_PROPERTY,
+
+	[MQTT_CPT_UNSUBSCRIBE] = MQTT_FN_BIT_USER_PROPERTY,
+
+	[MQTT_CPT_UNSUBACK]    = MQTT_FN_BIT_REASON_STRING | MQTT_FN_BIT_USER_PROPERTY,
+
+	[MQTT_CPT_PINGREQ]     = 0,
+
+	[MQTT_CPT_PINGRESP]    = 0,
+
+	[MQTT_CPT_DISCONNECT]  = MQTT_FN_BIT_REASON_CODE | MQTT_FN_BIT_SESSION_EXPIRY_INTERVAL |
+	                         MQTT_FN_BIT_SERVER_REFERENCE | MQTT_FN_BIT_REASON_STRING |
+	                         MQTT_FN_BIT_USER_PROPERTY,
+
+	[MQTT_CPT_AUTH]        = MQTT_FN_BIT_AUTHENTICATION_METHOD | MQTT_FN_BIT_AUTHENTICATION_DATA |
+	                         MQTT_FN_BIT_REASON_STRING | MQTT_FN_BIT_USER_PROPERTY,
+};
+
+/* Checks the first byte of a message to read the fixed header and extract the
+ * packet type and flags. <parser> is supposed to point to the fixed header byte.
+ *
+ * The fixed header looks like:
+ *  +-------+-----------+-----------+-----------+---------+----------+----------+---------+------------+
+ *  |  bit  |     7     |     6     |     5     |    4    |     3    |     2    |    1    |      0     |
+ *  +-------+-----------+-----------+-----------+---------+----------+----------+---------+------------+
+ *  | field |          MQTT Control Packet Type           | Flags specific to each Control Packet type |
+ *  +-------+---------------------------------------------+---------------------------------------------+
+ *
+ * On success, <pkt> is updated with the packet type and flags and the new parser
+ * state is returned. On error, IST_NULL is returned.
+ */
+static inline struct ist mqtt_read_fixed_hdr(struct ist parser, struct mqtt_pkt *pkt)
+{
+	uint8_t type = (uint8_t)*istptr(parser);
+	uint8_t ptype = (type & 0xF0) >> 4;
+	uint8_t flags = type & 0x0F;
+
+	if (ptype == MQTT_CPT_INVALID || ptype >= MQTT_CPT_ENTRIES || flags != mqtt_cpt_flags[ptype])
+		return IST_NULL;
+
+	pkt->fixed_hdr.type = ptype;
+	pkt->fixed_hdr.flags = flags;
+	return istnext(parser);
+}
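+
+/* Worked example (illustrative, not part of the parser): a first byte of 0x82
+ * yields ptype 8 (MQTT_CPT_SUBSCRIBE in the numbering above) and flags 0x02,
+ * which matches mqtt_cpt_flags[MQTT_CPT_SUBSCRIBE], so the byte is accepted;
+ * 0x83 would carry flags 0x03 and be rejected.
+ */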
+
+/* Reads a one byte integer. More information here:
+ * https://docs.oasis-open.org/mqtt/mqtt/v5.0/os/mqtt-v5.0-os.html#_Toc3901007
+ *
+ * <parser> is supposed to point to the first byte of the integer. On success
+ * the integer is stored in <*i>, if provided, and the new parser state is
+ * returned. On error, IST_NULL is returned.
+ */
+static inline struct ist mqtt_read_1byte_int(struct ist parser, uint8_t *i)
+{
+	if (istlen(parser) < 1)
+		return IST_NULL;
+	if (i)
+		*i = (uint8_t)*istptr(parser);
+	parser = istnext(parser);
+	return parser;
+}
+
+/* Reads a two byte integer. More information here:
+ * https://docs.oasis-open.org/mqtt/mqtt/v5.0/os/mqtt-v5.0-os.html#_Toc3901008
+ *
+ * <parser> is supposed to point to the first byte of the integer. On success
+ * the integer is stored in <*i>, if provided, and the new parser state is
+ * returned. On error, IST_NULL is returned.
+ */
+static inline struct ist mqtt_read_2byte_int(struct ist parser, uint16_t *i)
+{
+	if (istlen(parser) < 2)
+		return IST_NULL;
+	if (i) {
+		*i  = (uint8_t)*istptr(parser) << 8;
+		*i += (uint8_t)*(istptr(parser) + 1);
+	}
+	parser = istadv(parser, 2);
+	return parser;
+}
+
+/* Reads a four byte integer. More information here:
+ * https://docs.oasis-open.org/mqtt/mqtt/v5.0/os/mqtt-v5.0-os.html#_Toc3901009
+ *
+ * <parser> is supposed to point to the first byte of the integer. On success
+ * the integer is stored in <*i>, if provided, and the new parser state is
+ * returned. On error, IST_NULL is returned.
+ */
+static inline struct ist mqtt_read_4byte_int(struct ist parser, uint32_t *i)
+{
+	if (istlen(parser) < 4)
+		return IST_NULL;
+	if (i) {
+		*i  = (uint8_t)*istptr(parser) << 24;
+		*i += (uint8_t)*(istptr(parser) + 1) << 16;
+		*i += (uint8_t)*(istptr(parser) + 2) << 8;
+		*i += (uint8_t)*(istptr(parser) + 3);
+	}
+	parser = istadv(parser, 4);
+	return parser;
+}
+
+/* Reads a variable byte integer. More information here:
+ * https://docs.oasis-open.org/mqtt/mqtt/v3.1.1/os/mqtt-v3.1.1-os.html#_Toc398718023
+ * https://docs.oasis-open.org/mqtt/mqtt/v5.0/os/mqtt-v5.0-os.html#_Toc3901011
+ *
+ * It is encoded using a variable length encoding scheme which uses a single
+ * byte for values up to 127. Larger values are handled as follows. The least
+ * significant seven bits of each byte encode the data, and the most significant
+ * bit is used to indicate whether there are following bytes in the representation.
+ * Thus each byte encodes 128 values and a "continuation bit".
+ *
+ * The maximum number of bytes in the Remaining Length field is four
+ * (MQTT_REMAINING_LENGHT_MAX_SIZE).
+ *
+ * <parser> is supposed to point to the first byte of the integer. On success
+ * the integer is stored in <*i> and the new parser state is returned. On
+ * error, IST_NULL is returned.
+ */
+static inline struct ist mqtt_read_varint(struct ist parser, uint32_t *i)
+{
+	int off, m;
+
+	off = m = 0;
+	if (i)
+		*i = 0;
+	for (off = 0; off < MQTT_REMAINING_LENGHT_MAX_SIZE && istlen(parser); off++) {
+		uint8_t byte = (uint8_t)*istptr(parser);
+
+		if (i) {
+			*i += (byte & 127) << m;
+			m += 7; /* preparing <m> for the next byte */
+		}
+		parser = istnext(parser);
+
+		/* we just read the last byte of the remaining length field */
+		if (byte <= 127)
+			break;
+	}
+
+	if (off == MQTT_REMAINING_LENGHT_MAX_SIZE)
+		return IST_NULL;
+	return parser;
+}
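+
+/* Worked example: the value 321 (0x141) is encoded on two bytes as 0xC1 0x02:
+ * 0xC1 carries the low 7 data bits (0x41) plus the continuation bit (0x80),
+ * and 0x02 carries the remaining bits (321 >> 7 == 2). A minimal standalone
+ * encoder for this scheme could look like the sketch below (illustrative
+ * only, not used by the parser; the function name is hypothetical):
+ */
+#if 0
+static size_t example_mqtt_write_varint(uint8_t *out, uint32_t i)
+{
+	size_t len = 0;
+
+	do {
+		uint8_t byte = i & 127;
+
+		i >>= 7;
+		if (i)
+			byte |= 128; /* continuation bit: more bytes follow */
+		out[len++] = byte;
+	} while (i);
+	return len; /* 1 to 4 bytes for values up to 268435455 */
+}
+#endif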
+
+/* Reads an MQTT string. More information here:
+ * http://docs.oasis-open.org/mqtt/mqtt/v3.1.1/os/mqtt-v3.1.1-os.html#_Toc398718016
+ * https://docs.oasis-open.org/mqtt/mqtt/v5.0/os/mqtt-v5.0-os.html#_Toc3901010
+ *
+ * In MQTT, strings are prefixed by their size, encoded over 2 bytes:
+ *    byte 1: length MSB
+ *    byte 2: length LSB
+ *    byte 3: string
+ *    ...
+ *
+ * string size is MSB * 256 + LSB
+ *
+ * <parser> is supposed to point to the first byte of the string. On success the
+ * string is stored in <*str>, if provided, and the new parser state is
+ * returned. On error, IST_NULL is returned.
+ */
+static inline struct ist mqtt_read_string(struct ist parser, struct ist *str)
+{
+	uint16_t len = 0;
+
+	/* read and compute the string length */
+	if (istlen(parser) < 2)
+		goto error;
+
+	parser = mqtt_read_2byte_int(parser, &len);
+	if (!isttest(parser) || istlen(parser) < len)
+		goto error;
+
+	if (str) {
+		str->ptr = istptr(parser);
+		str->len = len;
+	}
+
+	return istadv(parser, len);
+
+  error:
+	return IST_NULL;
+}
+
+/* Helper function to convert an unsigned integer to a string. The result is
+ * written in <buf>. On success, the written size is returned, otherwise, on
+ * error, 0 is returned.
+ */
+static inline size_t mqtt_uint2str(struct buffer *buf, uint32_t i)
+{
+	char *end;
+
+	end = ultoa_o(i, buf->area, buf->size);
+	if (!end)
+		return 0;
+	buf->data = end - buf->area;
+	return buf->data;
+}
+
+/* Extracts the value of field <fieldname_id> from a given MQTT message <msg>
+ * of type <type>. IST_NULL is returned if an error occurred while parsing or
+ * if the field could not be found. If more data is required, an ist pointing
+ * to the message with a length set to 0 is returned. If the field is found,
+ * its value is returned as a struct ist.
+ */
+struct ist mqtt_field_value(struct ist msg, int type, int fieldname_id)
+{
+	struct buffer *trash = get_trash_chunk();
+	struct mqtt_pkt mpkt;
+	struct ist res;
+
+	switch (mqtt_validate_message(msg, &mpkt)) {
+	case MQTT_VALID_MESSAGE:
+		if (mpkt.fixed_hdr.type != type)
+			goto not_found_or_invalid;
+		break;
+	case MQTT_NEED_MORE_DATA:
+		goto need_more;
+	case MQTT_INVALID_MESSAGE:
+		goto not_found_or_invalid;
+	}
+
+	switch (type) {
+	case MQTT_CPT_CONNECT:
+		switch (fieldname_id) {
+		case MQTT_FN_FLAGS:
+			if (!mqtt_uint2str(trash, mpkt.data.connect.var_hdr.flags))
+				goto not_found_or_invalid;
+			res = ist2(trash->area, trash->data);
+			goto end;
+
+		case MQTT_FN_PROTOCOL_NAME:
+			if (!istlen(mpkt.data.connect.var_hdr.protocol_name))
+				goto not_found_or_invalid;
+			res = mpkt.data.connect.var_hdr.protocol_name;
+			goto end;
+
+		case MQTT_FN_PROTOCOL_VERSION:
+			if (!mqtt_uint2str(trash, mpkt.data.connect.var_hdr.protocol_version))
+				goto not_found_or_invalid;
+			res = ist2(trash->area, trash->data);
+			goto end;
+
+		case MQTT_FN_CLIENT_IDENTIFIER:
+			if (!istlen(mpkt.data.connect.payload.client_identifier))
+				goto not_found_or_invalid;
+			res = mpkt.data.connect.payload.client_identifier;
+			goto end;
+
+		case MQTT_FN_WILL_TOPIC:
+			if (!istlen(mpkt.data.connect.payload.will_topic))
+				goto not_found_or_invalid;
+			res = mpkt.data.connect.payload.will_topic;
+			goto end;
+
+		case MQTT_FN_WILL_PAYLOAD:
+			if (!istlen(mpkt.data.connect.payload.will_payload))
+				goto not_found_or_invalid;
+			res = mpkt.data.connect.payload.will_payload;
+			goto end;
+
+		case MQTT_FN_USERNAME:
+			if (!istlen(mpkt.data.connect.payload.username))
+				goto not_found_or_invalid;
+			res = mpkt.data.connect.payload.username;
+			goto end;
+
+		case MQTT_FN_PASSWORD:
+			if (!istlen(mpkt.data.connect.payload.password))
goto not_found_or_invalid; + res = mpkt.data.connect.payload.password; + goto end; + + case MQTT_FN_KEEPALIVE: + if (!mqtt_uint2str(trash, mpkt.data.connect.var_hdr.keepalive)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_PAYLOAD_FORMAT_INDICATOR: + if ((mpkt.data.connect.var_hdr.protocol_version != MQTT_VERSION_5_0) || + !(mpkt.data.connect.var_hdr.flags & MQTT_CONNECT_FL_WILL)) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connect.payload.will_props.payload_format_indicator)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_MESSAGE_EXPIRY_INTERVAL: + if ((mpkt.data.connect.var_hdr.protocol_version != MQTT_VERSION_5_0) || + !(mpkt.data.connect.var_hdr.flags & MQTT_CONNECT_FL_WILL)) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connect.payload.will_props.message_expiry_interval)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_CONTENT_TYPE: + if ((mpkt.data.connect.var_hdr.protocol_version != MQTT_VERSION_5_0) || + !(mpkt.data.connect.var_hdr.flags & MQTT_CONNECT_FL_WILL)) + goto not_found_or_invalid; + if (!istlen(mpkt.data.connect.payload.will_props.content_type)) + goto not_found_or_invalid; + res = mpkt.data.connect.payload.will_props.content_type; + goto end; + + case MQTT_FN_RESPONSE_TOPIC: + if ((mpkt.data.connect.var_hdr.protocol_version != MQTT_VERSION_5_0) || + !(mpkt.data.connect.var_hdr.flags & MQTT_CONNECT_FL_WILL)) + goto not_found_or_invalid; + if (!istlen(mpkt.data.connect.payload.will_props.response_topic)) + goto not_found_or_invalid; + res = mpkt.data.connect.payload.will_props.response_topic; + goto end; + + case MQTT_FN_CORRELATION_DATA: + if ((mpkt.data.connect.var_hdr.protocol_version != MQTT_VERSION_5_0) || + !(mpkt.data.connect.var_hdr.flags & MQTT_CONNECT_FL_WILL)) + goto not_found_or_invalid; + if (!istlen(mpkt.data.connect.payload.will_props.correlation_data)) + goto not_found_or_invalid; + res = mpkt.data.connect.payload.will_props.correlation_data; + goto end; + + case MQTT_FN_SESSION_EXPIRY_INTERVAL: + if (mpkt.data.connect.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connect.var_hdr.props.session_expiry_interval)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_AUTHENTICATION_METHOD: + if (mpkt.data.connect.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!istlen(mpkt.data.connect.var_hdr.props.authentication_method)) + goto not_found_or_invalid; + res = mpkt.data.connect.var_hdr.props.authentication_method; + goto end; + + case MQTT_FN_AUTHENTICATION_DATA: + if (mpkt.data.connect.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!istlen(mpkt.data.connect.var_hdr.props.authentication_data)) + goto not_found_or_invalid; + res = mpkt.data.connect.var_hdr.props.authentication_data; + goto end; + + case MQTT_FN_REQUEST_PROBLEM_INFORMATION: + if (mpkt.data.connect.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connect.var_hdr.props.request_problem_information)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_DELAY_INTERVAL: + if ((mpkt.data.connect.var_hdr.protocol_version != MQTT_VERSION_5_0) || + !(mpkt.data.connect.var_hdr.flags & MQTT_CONNECT_FL_WILL)) + goto 
not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connect.payload.will_props.delay_interval)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_REQUEST_RESPONSE_INFORMATION: + if (mpkt.data.connect.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connect.var_hdr.props.request_response_information)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_RECEIVE_MAXIMUM: + if (mpkt.data.connect.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connect.var_hdr.props.receive_maximum)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_TOPIC_ALIAS_MAXIMUM: + if (mpkt.data.connect.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connect.var_hdr.props.topic_alias_maximum)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_MAXIMUM_PACKET_SIZE: + if (mpkt.data.connect.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connect.var_hdr.props.maximum_packet_size)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + default: + goto not_found_or_invalid; + } + break; + + case MQTT_CPT_CONNACK: + switch (fieldname_id) { + case MQTT_FN_FLAGS: + if (!mqtt_uint2str(trash, mpkt.data.connack.var_hdr.flags)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_REASON_CODE: + if (!mqtt_uint2str(trash, mpkt.data.connack.var_hdr.reason_code)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_PROTOCOL_VERSION: + if (!mqtt_uint2str(trash, mpkt.data.connack.var_hdr.protocol_version)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_SESSION_EXPIRY_INTERVAL: + if (mpkt.data.connack.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connack.var_hdr.props.session_expiry_interval)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_ASSIGNED_CLIENT_IDENTIFIER: + if (mpkt.data.connack.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!istlen(mpkt.data.connack.var_hdr.props.assigned_client_identifier)) + goto not_found_or_invalid; + res = mpkt.data.connack.var_hdr.props.assigned_client_identifier; + goto end; + + case MQTT_FN_SERVER_KEEPALIVE: + if (mpkt.data.connack.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connack.var_hdr.props.server_keepalive)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_AUTHENTICATION_METHOD: + if (mpkt.data.connack.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!istlen(mpkt.data.connack.var_hdr.props.authentication_method)) + goto not_found_or_invalid; + res = mpkt.data.connack.var_hdr.props.authentication_method; + goto end; + + case MQTT_FN_AUTHENTICATION_DATA: + if (mpkt.data.connack.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!istlen(mpkt.data.connack.var_hdr.props.authentication_data)) + goto not_found_or_invalid; + res = 
mpkt.data.connack.var_hdr.props.authentication_data; + goto end; + + case MQTT_FN_RESPONSE_INFORMATION: + if (mpkt.data.connack.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!istlen(mpkt.data.connack.var_hdr.props.response_information)) + goto not_found_or_invalid; + res = mpkt.data.connack.var_hdr.props.response_information; + goto end; + + case MQTT_FN_SERVER_REFERENCE: + if (mpkt.data.connack.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!istlen(mpkt.data.connack.var_hdr.props.server_reference)) + goto not_found_or_invalid; + res = mpkt.data.connack.var_hdr.props.server_reference; + goto end; + + case MQTT_FN_REASON_STRING: + if (mpkt.data.connack.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!istlen(mpkt.data.connack.var_hdr.props.reason_string)) + goto not_found_or_invalid; + res = mpkt.data.connack.var_hdr.props.reason_string; + goto end; + + case MQTT_FN_RECEIVE_MAXIMUM: + if (mpkt.data.connack.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connack.var_hdr.props.receive_maximum)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_TOPIC_ALIAS_MAXIMUM: + if (mpkt.data.connack.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connack.var_hdr.props.topic_alias_maximum)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_MAXIMUM_QOS: + if (mpkt.data.connack.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connack.var_hdr.props.maximum_qos)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_RETAIN_AVAILABLE: + if (mpkt.data.connack.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connack.var_hdr.props.retain_available)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_MAXIMUM_PACKET_SIZE: + if (mpkt.data.connack.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connack.var_hdr.props.maximum_packet_size)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_WILDCARD_SUBSCRIPTION_AVAILABLE: + if (mpkt.data.connack.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connack.var_hdr.props.wildcard_subscription_available)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_SUBSCRIPTION_IDENTIFIERS_AVAILABLE: + if (mpkt.data.connack.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connack.var_hdr.props.subscription_identifiers_available)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + case MQTT_FN_SHARED_SUBSCRIPTION_AVAILABLE: + if (mpkt.data.connack.var_hdr.protocol_version != MQTT_VERSION_5_0) + goto not_found_or_invalid; + if (!mqtt_uint2str(trash, mpkt.data.connack.var_hdr.props.shared_subsription_available)) + goto not_found_or_invalid; + res = ist2(trash->area, trash->data); + goto end; + + default: + goto not_found_or_invalid; + } + break; + + default: + goto not_found_or_invalid; + } + + end: + return res; + + need_more: + return 
ist2(istptr(msg), 0);
+
+  not_found_or_invalid:
+	return IST_NULL;
+}
+
+/* Parses a CONNECT packet :
+ * https://public.dhe.ibm.com/software/dw/webservices/ws-mqtt/mqtt-v3r1.html#connect
+ * https://docs.oasis-open.org/mqtt/mqtt/v3.1.1/os/mqtt-v3.1.1-os.html#_Toc398718028
+ * https://docs.oasis-open.org/mqtt/mqtt/v5.0/os/mqtt-v5.0-os.html#_Toc3901033
+ *
+ * <parser> should point right after the MQTT fixed header. The remaining length
+ * was already checked, thus missing data is an error. On success, the result of
+ * the parsing is stored in <mpkt>.
+ *
+ * Returns:
+ *  MQTT_INVALID_MESSAGE if the CONNECT message is invalid
+ *  MQTT_VALID_MESSAGE   if the CONNECT message looks valid
+ */
+static int mqtt_parse_connect(struct ist parser, struct mqtt_pkt *mpkt)
+{
+	/* The parser length is stored to make sure we consumed exactly the
+	 * announced remaining length. */
+	size_t orig_len = istlen(parser);
+	int ret = MQTT_INVALID_MESSAGE;
+
+	/*
+	 * parsing variable header
+	 */
+	/* read protocol_name */
+	parser = mqtt_read_string(parser, &mpkt->data.connect.var_hdr.protocol_name);
+	if (!isttest(parser) ||
+	    !(isteqi(mpkt->data.connect.var_hdr.protocol_name, ist("MQTT")) ||
+	      isteqi(mpkt->data.connect.var_hdr.protocol_name, ist("MQIsdp"))))
+		goto end;
+
+	/* read protocol_version */
+	parser = mqtt_read_1byte_int(parser, &mpkt->data.connect.var_hdr.protocol_version);
+	if (!isttest(parser))
+		goto end;
+	if (mpkt->data.connect.var_hdr.protocol_version != MQTT_VERSION_3_1 &&
+	    mpkt->data.connect.var_hdr.protocol_version != MQTT_VERSION_3_1_1 &&
+	    mpkt->data.connect.var_hdr.protocol_version != MQTT_VERSION_5_0)
+		goto end;
+
+	/* read flags */
+	/* bit 1 is 'reserved' and must be set to 0 in CONNECT message flags */
+	parser = mqtt_read_1byte_int(parser, &mpkt->data.connect.var_hdr.flags);
+	if (!isttest(parser) || (mpkt->data.connect.var_hdr.flags & MQTT_CONNECT_FL_RESERVED))
+		goto end;
+
+	/* the WILL flag must be set to have the WILL_QOS or WILL_RETAIN flags set */
+	if ((mpkt->data.connect.var_hdr.flags & (MQTT_CONNECT_FL_WILL|MQTT_CONNECT_FL_WILL_QOS|MQTT_CONNECT_FL_WILL_RETAIN)) == MQTT_CONNECT_FL_WILL_QOS)
+		goto end;
+
+	/* read keepalive */
+	parser = mqtt_read_2byte_int(parser, &mpkt->data.connect.var_hdr.keepalive);
+	if (!isttest(parser))
+		goto end;
+
+	/* read properties, only available in MQTT_VERSION_5_0 */
+	if (mpkt->data.connect.var_hdr.protocol_version == MQTT_VERSION_5_0) {
+		struct ist props;
+		unsigned int user_prop_idx = 0;
+		uint64_t fields = 0;
+		uint32_t plen = 0;
+
+		parser = mqtt_read_varint(parser, &plen);
+		if (!isttest(parser) || istlen(parser) < plen)
+			goto end;
+		props = ist2(istptr(parser), plen);
+		parser = istadv(parser, props.len);
+
+		while (istlen(props) > 0) {
+			switch (*istptr(props)) {
+			case MQTT_PROP_SESSION_EXPIRY_INTERVAL:
+				if (fields & MQTT_FN_BIT_SESSION_EXPIRY_INTERVAL)
+					goto end;
+				props = mqtt_read_4byte_int(istnext(props), &mpkt->data.connect.var_hdr.props.session_expiry_interval);
+				fields |= MQTT_FN_BIT_SESSION_EXPIRY_INTERVAL;
+				break;
+
+			case MQTT_PROP_RECEIVE_MAXIMUM:
+				if (fields & MQTT_FN_BIT_RECEIVE_MAXIMUM)
+					goto end;
+				props = mqtt_read_2byte_int(istnext(props), &mpkt->data.connect.var_hdr.props.receive_maximum);
+				/* cannot be 0 */
+				if (!mpkt->data.connect.var_hdr.props.receive_maximum)
+					goto end;
+				fields |= MQTT_FN_BIT_RECEIVE_MAXIMUM;
+				break;
+
+			case MQTT_PROP_MAXIMUM_PACKET_SIZE:
+				if (fields & MQTT_FN_BIT_MAXIMUM_PACKET_SIZE)
+					goto end;
+				props = mqtt_read_4byte_int(istnext(props),
&mpkt->data.connect.var_hdr.props.maximum_packet_size); + /* cannot be 0 */ + if (!mpkt->data.connect.var_hdr.props.maximum_packet_size) + goto end; + fields |= MQTT_FN_BIT_MAXIMUM_PACKET_SIZE; + break; + + case MQTT_PROP_TOPIC_ALIAS_MAXIMUM: + if (fields & MQTT_FN_BIT_TOPIC_ALIAS) + goto end; + props = mqtt_read_2byte_int(istnext(props), &mpkt->data.connect.var_hdr.props.topic_alias_maximum); + fields |= MQTT_FN_BIT_TOPIC_ALIAS; + break; + + case MQTT_PROP_REQUEST_RESPONSE_INFORMATION: + if (fields & MQTT_FN_BIT_REQUEST_RESPONSE_INFORMATION) + goto end; + props = mqtt_read_1byte_int(istnext(props), &mpkt->data.connect.var_hdr.props.request_response_information); + /* can have only 2 values: 0 or 1 */ + if (mpkt->data.connect.var_hdr.props.request_response_information > 1) + goto end; + fields |= MQTT_FN_BIT_REQUEST_RESPONSE_INFORMATION; + break; + + case MQTT_PROP_REQUEST_PROBLEM_INFORMATION: + if (fields & MQTT_FN_BIT_REQUEST_PROBLEM_INFORMATION) + goto end; + props = mqtt_read_1byte_int(istnext(props), &mpkt->data.connect.var_hdr.props.request_problem_information); + /* can have only 2 values: 0 or 1 */ + if (mpkt->data.connect.var_hdr.props.request_problem_information > 1) + goto end; + fields |= MQTT_FN_BIT_REQUEST_PROBLEM_INFORMATION; + break; + + case MQTT_PROP_USER_PROPERTIES: + /* if we reached MQTT_PROP_USER_PROPERTY_ENTRIES already, then + * we start writing over the first property */ + if (user_prop_idx >= MQTT_PROP_USER_PROPERTY_ENTRIES) + user_prop_idx = 0; + + /* read user property name and value */ + props = mqtt_read_string(istnext(props), &mpkt->data.connect.var_hdr.props.user_props[user_prop_idx].name); + if (!isttest(props)) + goto end; + props = mqtt_read_string(props, &mpkt->data.connect.var_hdr.props.user_props[user_prop_idx].value); + ++user_prop_idx; + break; + + case MQTT_PROP_AUTHENTICATION_METHOD: + if (fields & MQTT_FN_BIT_AUTHENTICATION_METHOD) + goto end; + props = mqtt_read_string(istnext(props), &mpkt->data.connect.var_hdr.props.authentication_method); + fields |= MQTT_FN_BIT_AUTHENTICATION_METHOD; + break; + + case MQTT_PROP_AUTHENTICATION_DATA: + if (fields & MQTT_FN_BIT_AUTHENTICATION_DATA) + goto end; + props = mqtt_read_string(istnext(props), &mpkt->data.connect.var_hdr.props.authentication_data); + fields |= MQTT_FN_BIT_AUTHENTICATION_DATA; + break; + + default: + goto end; + } + + if (!isttest(props)) + goto end; + } + } + + /* cannot have auth data without auth method */ + if (!istlen(mpkt->data.connect.var_hdr.props.authentication_method) && + istlen(mpkt->data.connect.var_hdr.props.authentication_data)) + goto end; + + /* parsing payload + * + * Content of payload is related to flags parsed above and the field order is pre-defined: + * Client Identifier, Will Topic, Will Message, User Name, Password + */ + /* read client identifier */ + parser = mqtt_read_string(parser, &mpkt->data.connect.payload.client_identifier); + if (!isttest(parser)) + goto end; + + /* read Will Properties, for MQTT v5 only + * https://docs.oasis-open.org/mqtt/mqtt/v5.0/os/mqtt-v5.0-os.html#_Toc3901060 + */ + if ((mpkt->data.connect.var_hdr.protocol_version == MQTT_VERSION_5_0) && + (mpkt->data.connect.var_hdr.flags & MQTT_CONNECT_FL_WILL)) { + struct ist props; + unsigned int user_prop_idx = 0; + uint64_t fields = 0; + uint32_t plen = 0; + + parser = mqtt_read_varint(parser, &plen); + if (!isttest(parser) || istlen(parser) < plen) + goto end; + props = ist2(istptr(parser), plen); + parser = istadv(parser, props.len); + + while (istlen(props) > 0) { + switch 
(*istptr(props)) { + case MQTT_PROP_WILL_DELAY_INTERVAL: + if (fields & MQTT_FN_BIT_DELAY_INTERVAL) + goto end; + props = mqtt_read_4byte_int(istnext(props), &mpkt->data.connect.payload.will_props.delay_interval); + fields |= MQTT_FN_BIT_DELAY_INTERVAL; + break; + + case MQTT_PROP_PAYLOAD_FORMAT_INDICATOR: + if (fields & MQTT_FN_BIT_PAYLOAD_FORMAT_INDICATOR) + goto end; + props = mqtt_read_1byte_int(istnext(props), &mpkt->data.connect.payload.will_props.payload_format_indicator); + /* can have only 2 values: 0 or 1 */ + if (mpkt->data.connect.payload.will_props.payload_format_indicator > 1) + goto end; + fields |= MQTT_FN_BIT_PAYLOAD_FORMAT_INDICATOR; + break; + + case MQTT_PROP_MESSAGE_EXPIRY_INTERVAL: + if (fields & MQTT_FN_BIT_MESSAGE_EXPIRY_INTERVAL) + goto end; + props = mqtt_read_4byte_int(istnext(props), &mpkt->data.connect.payload.will_props.message_expiry_interval); + fields |= MQTT_FN_BIT_MESSAGE_EXPIRY_INTERVAL; + break; + + case MQTT_PROP_CONTENT_TYPE: + if (fields & MQTT_FN_BIT_CONTENT_TYPE) + goto end; + props = mqtt_read_string(istnext(props), &mpkt->data.connect.payload.will_props.content_type); + fields |= MQTT_FN_BIT_CONTENT_TYPE; + break; + + case MQTT_PROP_RESPONSE_TOPIC: + if (fields & MQTT_FN_BIT_RESPONSE_TOPIC) + goto end; + props = mqtt_read_string(istnext(props), &mpkt->data.connect.payload.will_props.response_topic); + fields |= MQTT_FN_BIT_RESPONSE_TOPIC; + break; + + case MQTT_PROP_CORRELATION_DATA: + if (fields & MQTT_FN_BIT_CORRELATION_DATA) + goto end; + props = mqtt_read_string(istnext(props), &mpkt->data.connect.payload.will_props.correlation_data); + fields |= MQTT_FN_BIT_CORRELATION_DATA; + break; + + case MQTT_PROP_USER_PROPERTIES: + /* if we reached MQTT_PROP_USER_PROPERTY_ENTRIES already, then + * we start writing over the first property */ + if (user_prop_idx >= MQTT_PROP_USER_PROPERTY_ENTRIES) + user_prop_idx = 0; + + /* read user property name and value */ + props = mqtt_read_string(istnext(props), &mpkt->data.connect.payload.will_props.user_props[user_prop_idx].name); + if (!isttest(props)) + goto end; + props = mqtt_read_string(props, &mpkt->data.connect.payload.will_props.user_props[user_prop_idx].value); + ++user_prop_idx; + break; + + default: + goto end; + } + + if (!isttest(props)) + goto end; + } + } + + /* read Will Topic and Will Message (MQTT 3.1.1) or Payload (MQTT 5.0) */ + if (mpkt->data.connect.var_hdr.flags & MQTT_CONNECT_FL_WILL) { + parser = mqtt_read_string(parser, &mpkt->data.connect.payload.will_topic); + if (!isttest(parser)) + goto end; + parser = mqtt_read_string(parser, &mpkt->data.connect.payload.will_payload); + if (!isttest(parser)) + goto end; + } + + /* read User Name */ + if (mpkt->data.connect.var_hdr.flags & MQTT_CONNECT_FL_USERNAME) { + parser = mqtt_read_string(parser, &mpkt->data.connect.payload.username); + if (!isttest(parser)) + goto end; + } + + /* read Password */ + if (mpkt->data.connect.var_hdr.flags & MQTT_CONNECT_FL_PASSWORD) { + parser = mqtt_read_string(parser, &mpkt->data.connect.payload.password); + if (!isttest(parser)) + goto end; + } + + if ((orig_len - istlen(parser)) == mpkt->fixed_hdr.remaining_length) + ret = MQTT_VALID_MESSAGE; + + end: + return ret; +} + +/* Parses a CONNACK packet : + * https://docs.oasis-open.org/mqtt/mqtt/v3.1.1/os/mqtt-v3.1.1-os.html#_Toc398718033 + * https://docs.oasis-open.org/mqtt/mqtt/v5.0/os/mqtt-v5.0-os.html#_Toc3901074 + * + * <parser> should point right after the MQTT fixed header. The remaining length + * was already checked, thus missing data is an error. 
On success, the result of
+ * the parsing is stored in <mpkt>.
+ *
+ * Returns:
+ *  MQTT_INVALID_MESSAGE if the CONNACK message is invalid
+ *  MQTT_VALID_MESSAGE   if the CONNACK message looks valid
+ */
+static int mqtt_parse_connack(struct ist parser, struct mqtt_pkt *mpkt)
+{
+	/* The parser length is stored to make sure we consumed exactly the
+	 * announced remaining length. */
+	size_t orig_len = istlen(parser);
+	int ret = MQTT_INVALID_MESSAGE;
+
+	if (istlen(parser) < 2)
+		goto end;
+	else if (istlen(parser) == 2)
+		mpkt->data.connack.var_hdr.protocol_version = MQTT_VERSION_3_1_1;
+	else
+		mpkt->data.connack.var_hdr.protocol_version = MQTT_VERSION_5_0;
+
+	/*
+	 * parsing variable header
+	 */
+	/* read flags */
+	/* bits 7 to 1 on flags are reserved and must be 0 */
+	parser = mqtt_read_1byte_int(parser, &mpkt->data.connack.var_hdr.flags);
+	if (!isttest(parser) || (mpkt->data.connack.var_hdr.flags & 0xFE))
+		goto end;
+
+	/* read reason_code */
+	parser = mqtt_read_1byte_int(parser, &mpkt->data.connack.var_hdr.reason_code);
+	if (!isttest(parser))
+		goto end;
+
+	/* we can leave here for MQTT 3.1.1 */
+	if (mpkt->data.connack.var_hdr.protocol_version == MQTT_VERSION_3_1_1) {
+		if ((orig_len - istlen(parser)) == mpkt->fixed_hdr.remaining_length)
+			ret = MQTT_VALID_MESSAGE;
+		goto end;
+	}
+
+	/* read properties, only available in MQTT_VERSION_5_0 */
+	if (mpkt->data.connack.var_hdr.protocol_version == MQTT_VERSION_5_0) {
+		struct ist props;
+		unsigned int user_prop_idx = 0;
+		uint64_t fields = 0;
+		uint32_t plen = 0;
+
+		parser = mqtt_read_varint(parser, &plen);
+		if (!isttest(parser) || istlen(parser) < plen)
+			goto end;
+		props = ist2(istptr(parser), plen);
+		parser = istadv(parser, props.len);
+
+		while (istlen(props) > 0) {
+			switch (*istptr(props)) {
+			case MQTT_PROP_SESSION_EXPIRY_INTERVAL:
+				if (fields & MQTT_FN_BIT_SESSION_EXPIRY_INTERVAL)
+					goto end;
+				props = mqtt_read_4byte_int(istnext(props), &mpkt->data.connack.var_hdr.props.session_expiry_interval);
+				fields |= MQTT_FN_BIT_SESSION_EXPIRY_INTERVAL;
+				break;
+
+			case MQTT_PROP_RECEIVE_MAXIMUM:
+				if (fields & MQTT_FN_BIT_RECEIVE_MAXIMUM)
+					goto end;
+				props = mqtt_read_2byte_int(istnext(props), &mpkt->data.connack.var_hdr.props.receive_maximum);
+				/* cannot be 0 */
+				if (!mpkt->data.connack.var_hdr.props.receive_maximum)
+					goto end;
+				fields |= MQTT_FN_BIT_RECEIVE_MAXIMUM;
+				break;
+
+			case MQTT_PROP_MAXIMUM_QOS:
+				if (fields & MQTT_FN_BIT_MAXIMUM_QOS)
+					goto end;
+				props = mqtt_read_1byte_int(istnext(props), &mpkt->data.connack.var_hdr.props.maximum_qos);
+				/* can have only 2 values: 0 or 1 */
+				if (mpkt->data.connack.var_hdr.props.maximum_qos > 1)
+					goto end;
+				fields |= MQTT_FN_BIT_MAXIMUM_QOS;
+				break;
+
+			case MQTT_PROP_RETAIN_AVAILABLE:
+				if (fields & MQTT_FN_BIT_RETAIN_AVAILABLE)
+					goto end;
+				props = mqtt_read_1byte_int(istnext(props), &mpkt->data.connack.var_hdr.props.retain_available);
+				/* can have only 2 values: 0 or 1 */
+				if (mpkt->data.connack.var_hdr.props.retain_available > 1)
+					goto end;
+				fields |= MQTT_FN_BIT_RETAIN_AVAILABLE;
+				break;
+
+			case MQTT_PROP_MAXIMUM_PACKET_SIZE:
+				if (fields & MQTT_FN_BIT_MAXIMUM_PACKET_SIZE)
+					goto end;
+				props = mqtt_read_4byte_int(istnext(props), &mpkt->data.connack.var_hdr.props.maximum_packet_size);
+				/* cannot be 0 */
+				if (!mpkt->data.connack.var_hdr.props.maximum_packet_size)
+					goto end;
+				fields |= MQTT_FN_BIT_MAXIMUM_PACKET_SIZE;
+				break;
+
+			case MQTT_PROP_ASSIGNED_CLIENT_IDENTIFIER:
+				if (fields & MQTT_FN_BIT_ASSIGNED_CLIENT_IDENTIFIER)
+					goto end;
+				props =
mqtt_read_string(istnext(props), &mpkt->data.connack.var_hdr.props.assigned_client_identifier); + if (!istlen(mpkt->data.connack.var_hdr.props.assigned_client_identifier)) + goto end; + fields |= MQTT_FN_BIT_ASSIGNED_CLIENT_IDENTIFIER; + break; + + case MQTT_PROP_TOPIC_ALIAS_MAXIMUM: + if (fields & MQTT_FN_BIT_TOPIC_ALIAS_MAXIMUM) + goto end; + props = mqtt_read_2byte_int(istnext(props), &mpkt->data.connack.var_hdr.props.topic_alias_maximum); + fields |= MQTT_FN_BIT_TOPIC_ALIAS_MAXIMUM; + break; + + case MQTT_PROP_REASON_STRING: + if (fields & MQTT_FN_BIT_REASON_STRING) + goto end; + props = mqtt_read_string(istnext(props), &mpkt->data.connack.var_hdr.props.reason_string); + fields |= MQTT_FN_BIT_REASON_STRING; + break; + + case MQTT_PROP_WILDCARD_SUBSCRIPTION_AVAILABLE: + if (fields & MQTT_FN_BIT_WILDCARD_SUBSCRIPTION_AVAILABLE) + goto end; + props = mqtt_read_1byte_int(istnext(props), &mpkt->data.connack.var_hdr.props.wildcard_subscription_available); + /* can have only 2 values: 0 or 1 */ + if (mpkt->data.connack.var_hdr.props.wildcard_subscription_available > 1) + goto end; + fields |= MQTT_FN_BIT_WILDCARD_SUBSCRIPTION_AVAILABLE; + break; + + case MQTT_PROP_SUBSCRIPTION_IDENTIFIERS_AVAILABLE: + if (fields & MQTT_FN_BIT_SUBSCRIPTION_IDENTIFIER) + goto end; + props = mqtt_read_1byte_int(istnext(props), &mpkt->data.connack.var_hdr.props.subscription_identifiers_available); + /* can have only 2 values: 0 or 1 */ + if (mpkt->data.connack.var_hdr.props.subscription_identifiers_available > 1) + goto end; + fields |= MQTT_FN_BIT_SUBSCRIPTION_IDENTIFIER; + break; + + case MQTT_PROP_SHARED_SUBSRIPTION_AVAILABLE: + if (fields & MQTT_FN_BIT_SHARED_SUBSCRIPTION_AVAILABLE) + goto end; + props = mqtt_read_1byte_int(istnext(props), &mpkt->data.connack.var_hdr.props.shared_subsription_available); + /* can have only 2 values: 0 or 1 */ + if (mpkt->data.connack.var_hdr.props.shared_subsription_available > 1) + goto end; + fields |= MQTT_FN_BIT_SHARED_SUBSCRIPTION_AVAILABLE; + break; + + case MQTT_PROP_SERVER_KEEPALIVE: + if (fields & MQTT_FN_BIT_SERVER_KEEPALIVE) + goto end; + props = mqtt_read_2byte_int(istnext(props), &mpkt->data.connack.var_hdr.props.server_keepalive); + fields |= MQTT_FN_BIT_SERVER_KEEPALIVE; + break; + + case MQTT_PROP_RESPONSE_INFORMATION: + if (fields & MQTT_FN_BIT_RESPONSE_INFORMATION) + goto end; + props = mqtt_read_string(istnext(props), &mpkt->data.connack.var_hdr.props.response_information); + fields |= MQTT_FN_BIT_RESPONSE_INFORMATION; + break; + + case MQTT_PROP_SERVER_REFERENCE: + if (fields & MQTT_FN_BIT_SERVER_REFERENCE) + goto end; + props = mqtt_read_string(istnext(props), &mpkt->data.connack.var_hdr.props.server_reference); + fields |= MQTT_FN_BIT_SERVER_REFERENCE; + break; + + case MQTT_PROP_USER_PROPERTIES: + /* if we reached MQTT_PROP_USER_PROPERTY_ENTRIES already, then + * we start writing over the first property */ + if (user_prop_idx >= MQTT_PROP_USER_PROPERTY_ENTRIES) + user_prop_idx = 0; + + /* read user property name and value */ + props = mqtt_read_string(istnext(props), &mpkt->data.connack.var_hdr.props.user_props[user_prop_idx].name); + if (!isttest(props)) + goto end; + props = mqtt_read_string(props, &mpkt->data.connack.var_hdr.props.user_props[user_prop_idx].value); + ++user_prop_idx; + break; + + case MQTT_PROP_AUTHENTICATION_METHOD: + if (fields & MQTT_FN_BIT_AUTHENTICATION_METHOD) + goto end; + props = mqtt_read_string(istnext(props), &mpkt->data.connack.var_hdr.props.authentication_method); + fields |= MQTT_FN_BIT_AUTHENTICATION_METHOD; + break; + + 
case MQTT_PROP_AUTHENTICATION_DATA: + if (fields & MQTT_FN_BIT_AUTHENTICATION_DATA) + goto end; + props = mqtt_read_string(istnext(props), &mpkt->data.connack.var_hdr.props.authentication_data); + fields |= MQTT_FN_BIT_AUTHENTICATION_DATA; + break; + + default: + return 0; + } + + if (!isttest(props)) + goto end; + } + } + + if ((orig_len - istlen(parser)) == mpkt->fixed_hdr.remaining_length) + ret = MQTT_VALID_MESSAGE; + end: + return ret; +} + + +/* Parses and validates a MQTT packet + * https://docs.oasis-open.org/mqtt/mqtt/v3.1.1/os/mqtt-v3.1.1-os.html#_Toc398718028 + * + * For now, due to HAProxy limitation, only validation of CONNECT and CONNACK packets + * are supported. + * + * - check FIXED_HDR + * - check remaining length + * - check variable headers and payload + * + * if <mpkt> is not NULL, then this structure will be filled up as well. An + * unsupported packet type is considered as invalid. It is not a problem for now + * because only the first packet on each side can be parsed (CONNECT for the + * client and CONNACK for the server). + * + * Returns: + * MQTT_INVALID_MESSAGE if the message is invalid + * MQTT_NEED_MORE_DATA if we need more data to fully validate the message + * MQTT_VALID_MESSAGE if the message looks valid + */ +int mqtt_validate_message(const struct ist msg, struct mqtt_pkt *mpkt) +{ + struct ist parser; + struct mqtt_pkt tmp_mpkt; + int ret = MQTT_INVALID_MESSAGE; + + if (!mpkt) + mpkt = &tmp_mpkt; + memset(mpkt, 0, sizeof(*mpkt)); + + parser = msg; + if (istlen(msg) < MQTT_MIN_PKT_SIZE) { + ret = MQTT_NEED_MORE_DATA; + goto end; + } + + /* parse the MQTT fixed header */ + parser = mqtt_read_fixed_hdr(parser, mpkt); + if (!isttest(parser)) { + ret = MQTT_INVALID_MESSAGE; + goto end; + } + + /* Now parsing "remaining length" field */ + parser = mqtt_read_varint(parser, &mpkt->fixed_hdr.remaining_length); + if (!isttest(parser)) { + ret = MQTT_INVALID_MESSAGE; + goto end; + } + + if (istlen(parser) < mpkt->fixed_hdr.remaining_length) + return MQTT_NEED_MORE_DATA; + + /* Now parsing the variable header and payload, which is based on the packet type */ + switch (mpkt->fixed_hdr.type) { + case MQTT_CPT_CONNECT: + ret = mqtt_parse_connect(parser, mpkt); + break; + case MQTT_CPT_CONNACK: + ret = mqtt_parse_connack(parser, mpkt); + break; + default: + break; + } + + end: + return ret; +} diff --git a/src/mux_fcgi.c b/src/mux_fcgi.c new file mode 100644 index 0000000..0230e6b --- /dev/null +++ b/src/mux_fcgi.c @@ -0,0 +1,4268 @@ +/* + * FastCGI mux-demux for connections + * + * Copyright (C) 2019 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <import/ist.h> +#include <import/eb32tree.h> +#include <import/ebmbtree.h> + +#include <haproxy/api.h> +#include <haproxy/cfgparse.h> +#include <haproxy/connection.h> +#include <haproxy/dynbuf.h> +#include <haproxy/errors.h> +#include <haproxy/fcgi-app.h> +#include <haproxy/fcgi.h> +#include <haproxy/h1.h> +#include <haproxy/h1_htx.h> +#include <haproxy/http_htx.h> +#include <haproxy/htx.h> +#include <haproxy/list.h> +#include <haproxy/log.h> +#include <haproxy/mux_fcgi-t.h> +#include <haproxy/net_helper.h> +#include <haproxy/proxy.h> +#include <haproxy/regex.h> +#include <haproxy/sc_strm.h> +#include <haproxy/server.h> +#include <haproxy/session-t.h> +#include <haproxy/stconn.h> +#include <haproxy/stream.h> +#include <haproxy/trace.h> +#include <haproxy/version.h> + +/* 32 buffers: one for the ring's root, rest for the mbuf itself */ +#define FCGI_C_MBUF_CNT 32 + +/* Size for a record header (also size of empty record) */ +#define FCGI_RECORD_HEADER_SZ 8 + +/* FCGI connection descriptor */ +struct fcgi_conn { + struct connection *conn; + + enum fcgi_conn_st state; /* FCGI connection state */ + int16_t max_id; /* highest ID known on this connection, <0 before mgmt records */ + uint32_t streams_limit; /* maximum number of concurrent streams the peer supports */ + uint32_t flags; /* Connection flags: FCGI_CF_* */ + + int16_t dsi; /* dmux stream ID (<0 = idle ) */ + uint16_t drl; /* demux record length (if dsi >= 0) */ + uint8_t drt; /* demux record type (if dsi >= 0) */ + uint8_t drp; /* demux record padding (if dsi >= 0) */ + + struct buffer dbuf; /* demux buffer */ + struct buffer mbuf[FCGI_C_MBUF_CNT]; /* mux buffers (ring) */ + + int timeout; /* idle timeout duration in ticks */ + int shut_timeout; /* idle timeout duration in ticks after shutdown */ + unsigned int nb_streams; /* number of streams in the tree */ + unsigned int nb_sc; /* number of attached stream connectors */ + unsigned int nb_reserved; /* number of reserved streams */ + unsigned int stream_cnt; /* total number of streams seen */ + + struct proxy *proxy; /* the proxy this connection was created for */ + struct fcgi_app *app; /* FCGI application used by this mux */ + struct task *task; /* timeout management task */ + struct eb_root streams_by_id; /* all active streams by their ID */ + + struct list send_list; /* list of blocked streams requesting to send */ + + struct buffer_wait buf_wait; /* Wait list for buffer allocation */ + struct wait_event wait_event; /* To be used if we're waiting for I/Os */ +}; + + +/* FCGI stream descriptor */ +struct fcgi_strm { + struct sedesc *sd; + struct session *sess; + struct fcgi_conn *fconn; + + int32_t id; /* stream ID */ + + uint32_t flags; /* Connection flags: FCGI_SF_* */ + enum fcgi_strm_st state; /* FCGI stream state */ + int proto_status; /* FCGI_PS_* */ + + struct h1m h1m; /* response parser state for H1 */ + + struct buffer rxbuf; /* receive buffer, always valid (buf_empty or real buffer) */ + + struct eb32_node by_id; /* place in fcgi_conn's streams_by_id */ + struct wait_event *subs; /* Address of the wait_event the stream connector associated is waiting on */ + struct list send_list; /* To be used when adding in fcgi_conn->send_list */ + struct tasklet *shut_tl; /* deferred shutdown tasklet, to retry to close after we failed to by lack of space */ +}; + +/* Flags representing all default FCGI parameters */ +#define FCGI_SP_CGI_GATEWAY 0x00000001 +#define FCGI_SP_DOC_ROOT 0x00000002 +#define FCGI_SP_SCRIPT_NAME 0x00000004 +#define FCGI_SP_PATH_INFO 
0x00000008 +#define FCGI_SP_REQ_URI 0x00000010 +#define FCGI_SP_REQ_METH 0x00000020 +#define FCGI_SP_REQ_QS 0x00000040 +#define FCGI_SP_SRV_PORT 0x00000080 +#define FCGI_SP_SRV_PROTO 0x00000100 +#define FCGI_SP_SRV_NAME 0x00000200 +#define FCGI_SP_REM_ADDR 0x00000400 +#define FCGI_SP_REM_PORT 0x00000800 +#define FCGI_SP_SCRIPT_FILE 0x00001000 +#define FCGI_SP_PATH_TRANS 0x00002000 +#define FCGI_SP_CONT_LEN 0x00004000 +#define FCGI_SP_HTTPS 0x00008000 +#define FCGI_SP_SRV_SOFT 0x00010000 +#define FCGI_SP_MASK 0x0001FFFF +#define FCGI_SP_URI_MASK (FCGI_SP_SCRIPT_NAME|FCGI_SP_PATH_INFO|FCGI_SP_REQ_QS) + +/* FCGI parameters used when PARAMS record is sent */ +struct fcgi_strm_params { + uint32_t mask; + struct ist docroot; + struct ist scriptname; + struct ist pathinfo; + struct ist meth; + struct ist uri; + struct ist vsn; + struct ist qs; + struct ist srv_name; + struct ist srv_port; + struct ist rem_addr; + struct ist rem_port; + struct ist cont_len; + struct ist srv_soft; + int https; + struct buffer *p; +}; + +/* Maximum amount of data we're OK with re-aligning for buffer optimizations */ +#define MAX_DATA_REALIGN 1024 + +/* trace source and events */ +static void fcgi_trace(enum trace_level level, uint64_t mask, + const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4); + +/* The event representation is split like this : + * fconn - internal FCGI connection + * fstrm - internal FCGI stream + * strm - application layer + * rx - data receipt + * tx - data transmission + * rsp - response parsing + */ +static const struct trace_event fcgi_trace_events[] = { +#define FCGI_EV_FCONN_NEW (1ULL << 0) + { .mask = FCGI_EV_FCONN_NEW, .name = "fconn_new", .desc = "new FCGI connection" }, +#define FCGI_EV_FCONN_RECV (1ULL << 1) + { .mask = FCGI_EV_FCONN_RECV, .name = "fconn_recv", .desc = "Rx on FCGI connection" }, +#define FCGI_EV_FCONN_SEND (1ULL << 2) + { .mask = FCGI_EV_FCONN_SEND, .name = "fconn_send", .desc = "Tx on FCGI connection" }, +#define FCGI_EV_FCONN_BLK (1ULL << 3) + { .mask = FCGI_EV_FCONN_BLK, .name = "fconn_blk", .desc = "FCGI connection blocked" }, +#define FCGI_EV_FCONN_WAKE (1ULL << 4) + { .mask = FCGI_EV_FCONN_WAKE, .name = "fconn_wake", .desc = "FCGI connection woken up" }, +#define FCGI_EV_FCONN_END (1ULL << 5) + { .mask = FCGI_EV_FCONN_END, .name = "fconn_end", .desc = "FCGI connection terminated" }, +#define FCGI_EV_FCONN_ERR (1ULL << 6) + { .mask = FCGI_EV_FCONN_ERR, .name = "fconn_err", .desc = "error on FCGI connection" }, + +#define FCGI_EV_RX_FHDR (1ULL << 7) + { .mask = FCGI_EV_RX_FHDR, .name = "rx_fhdr", .desc = "FCGI record header received" }, +#define FCGI_EV_RX_RECORD (1ULL << 8) + { .mask = FCGI_EV_RX_RECORD, .name = "rx_record", .desc = "receipt of any FCGI record" }, +#define FCGI_EV_RX_EOI (1ULL << 9) + { .mask = FCGI_EV_RX_EOI, .name = "rx_eoi", .desc = "receipt of end of FCGI input" }, +#define FCGI_EV_RX_GETVAL (1ULL << 10) + { .mask = FCGI_EV_RX_GETVAL, .name = "rx_get_values", .desc = "receipt of FCGI GET_VALUES_RESULT record" }, +#define FCGI_EV_RX_STDOUT (1ULL << 11) + { .mask = FCGI_EV_RX_STDOUT, .name = "rx_stdout", .desc = "receipt of FCGI STDOUT record" }, +#define FCGI_EV_RX_STDERR (1ULL << 12) + { .mask = FCGI_EV_RX_STDERR, .name = "rx_stderr", .desc = "receipt of FCGI STDERR record" }, +#define FCGI_EV_RX_ENDREQ (1ULL << 13) + { .mask = FCGI_EV_RX_ENDREQ, .name = "rx_end_req", .desc = "receipt of FCGI END_REQUEST record" }, + +#define FCGI_EV_TX_RECORD (1ULL 
<< 14)
+	{ .mask = FCGI_EV_TX_RECORD,  .name = "tx_record",        .desc = "transmission of any FCGI record" },
+#define FCGI_EV_TX_EOI      (1ULL << 15)
+	{ .mask = FCGI_EV_TX_EOI,     .name = "tx_eoi",           .desc = "transmission of FCGI end of input" },
+#define FCGI_EV_TX_BEGREQ   (1ULL << 16)
+	{ .mask = FCGI_EV_TX_BEGREQ,  .name = "tx_begin_request", .desc = "transmission of FCGI BEGIN_REQUEST record" },
+#define FCGI_EV_TX_GETVAL   (1ULL << 17)
+	{ .mask = FCGI_EV_TX_GETVAL,  .name = "tx_get_values",    .desc = "transmission of FCGI GET_VALUES record" },
+#define FCGI_EV_TX_PARAMS   (1ULL << 18)
+	{ .mask = FCGI_EV_TX_PARAMS,  .name = "tx_params",        .desc = "transmission of FCGI PARAMS record" },
+#define FCGI_EV_TX_STDIN    (1ULL << 19)
+	{ .mask = FCGI_EV_TX_STDIN,   .name = "tx_stdin",         .desc = "transmission of FCGI STDIN record" },
+#define FCGI_EV_TX_ABORT    (1ULL << 20)
+	{ .mask = FCGI_EV_TX_ABORT,   .name = "tx_abort",         .desc = "transmission of FCGI ABORT record" },
+
+#define FCGI_EV_RSP_DATA    (1ULL << 21)
+	{ .mask = FCGI_EV_RSP_DATA,   .name = "rsp_data",         .desc = "parse any data of H1 response" },
+#define FCGI_EV_RSP_EOM     (1ULL << 22)
+	{ .mask = FCGI_EV_RSP_EOM,    .name = "rsp_eom",          .desc = "reach the end of message of H1 response" },
+#define FCGI_EV_RSP_HDRS    (1ULL << 23)
+	{ .mask = FCGI_EV_RSP_HDRS,   .name = "rsp_headers",      .desc = "parse headers of H1 response" },
+#define FCGI_EV_RSP_BODY    (1ULL << 24)
+	{ .mask = FCGI_EV_RSP_BODY,   .name = "rsp_body",         .desc = "parse body part of H1 response" },
+#define FCGI_EV_RSP_TLRS    (1ULL << 25)
+	{ .mask = FCGI_EV_RSP_TLRS,   .name = "rsp_trailers",     .desc = "parse trailers of H1 response" },
+
+#define FCGI_EV_FSTRM_NEW   (1ULL << 26)
+	{ .mask = FCGI_EV_FSTRM_NEW,  .name = "fstrm_new",        .desc = "new FCGI stream" },
+#define FCGI_EV_FSTRM_BLK   (1ULL << 27)
+	{ .mask = FCGI_EV_FSTRM_BLK,  .name = "fstrm_blk",        .desc = "FCGI stream blocked" },
+#define FCGI_EV_FSTRM_END   (1ULL << 28)
+	{ .mask = FCGI_EV_FSTRM_END,  .name = "fstrm_end",        .desc = "FCGI stream terminated" },
+#define FCGI_EV_FSTRM_ERR   (1ULL << 29)
+	{ .mask = FCGI_EV_FSTRM_ERR,  .name = "fstrm_err",        .desc = "error on FCGI stream" },
+
+#define FCGI_EV_STRM_NEW    (1ULL << 30)
+	{ .mask = FCGI_EV_STRM_NEW,   .name = "strm_new",         .desc = "app-layer stream creation" },
+#define FCGI_EV_STRM_RECV   (1ULL << 31)
+	{ .mask = FCGI_EV_STRM_RECV,  .name = "strm_recv",        .desc = "receiving data for stream" },
+#define FCGI_EV_STRM_SEND   (1ULL << 32)
+	{ .mask = FCGI_EV_STRM_SEND,  .name = "strm_send",        .desc = "sending data for stream" },
+#define FCGI_EV_STRM_FULL   (1ULL << 33)
+	{ .mask = FCGI_EV_STRM_FULL,  .name = "strm_full",        .desc = "stream buffer full" },
+#define FCGI_EV_STRM_WAKE   (1ULL << 34)
+	{ .mask = FCGI_EV_STRM_WAKE,  .name = "strm_wake",        .desc = "stream woken up" },
+#define FCGI_EV_STRM_SHUT   (1ULL << 35)
+	{ .mask = FCGI_EV_STRM_SHUT,  .name = "strm_shut",        .desc = "stream shutdown" },
+#define FCGI_EV_STRM_END    (1ULL << 36)
+	{ .mask = FCGI_EV_STRM_END,   .name = "strm_end",         .desc = "detaching app-layer stream" },
+#define FCGI_EV_STRM_ERR    (1ULL << 37)
+	{ .mask = FCGI_EV_STRM_ERR,   .name = "strm_err",         .desc = "stream error" },
+
+	{ }
+};
+
+static const struct name_desc fcgi_trace_lockon_args[4] = {
+	/* arg1 */ { /* already used by the connection */ },
+	/* arg2 */ { .name="fstrm", .desc="FCGI stream" },
+	/* arg3 */ { },
+	/* arg4 */ { }
+};
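+
+/* Note: the verbosity names declared below are meant to be selected at runtime
+ * through the generic trace infrastructure, typically from the CLI with
+ * something like:
+ *   trace fcgi sink buf0
+ *   trace fcgi level developer
+ *   trace fcgi verbosity advanced
+ *   trace fcgi start now
+ * (see doc/management.txt for the exact "trace" command syntax)
+ */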
FCGI_VERB_MINIMAL 2 + { .name="minimal", .desc="report only fconn/fstrm state and flags, no real decoding" }, +#define FCGI_VERB_SIMPLE 3 + { .name="simple", .desc="add request/response status line or htx info when available" }, +#define FCGI_VERB_ADVANCED 4 + { .name="advanced", .desc="add header fields or record decoding when available" }, +#define FCGI_VERB_COMPLETE 5 + { .name="complete", .desc="add full data dump when available" }, + { /* end */ } +}; + +static struct trace_source trace_fcgi __read_mostly = { + .name = IST("fcgi"), + .desc = "FastCGI multiplexer", + .arg_def = TRC_ARG1_CONN, // TRACE()'s first argument is always a connection + .default_cb = fcgi_trace, + .known_events = fcgi_trace_events, + .lockon_args = fcgi_trace_lockon_args, + .decoding = fcgi_trace_decoding, + .report_events = ~0, // report everything by default +}; + +#define TRACE_SOURCE &trace_fcgi +INITCALL1(STG_REGISTER, trace_register_source, TRACE_SOURCE); + +/* FCGI connection and stream pools */ +DECLARE_STATIC_POOL(pool_head_fcgi_conn, "fcgi_conn", sizeof(struct fcgi_conn)); +DECLARE_STATIC_POOL(pool_head_fcgi_strm, "fcgi_strm", sizeof(struct fcgi_strm)); + +struct task *fcgi_timeout_task(struct task *t, void *context, unsigned int state); +static int fcgi_process(struct fcgi_conn *fconn); +/* fcgi_io_cb is exported to see it resolved in "show fd" */ +struct task *fcgi_io_cb(struct task *t, void *ctx, unsigned int state); +static inline struct fcgi_strm *fcgi_conn_st_by_id(struct fcgi_conn *fconn, int id); +struct task *fcgi_deferred_shut(struct task *t, void *ctx, unsigned int state); +static struct fcgi_strm *fcgi_stconn_new(struct fcgi_conn *fconn, struct stconn *sc, struct session *sess); +static void fcgi_strm_notify_recv(struct fcgi_strm *fstrm); +static void fcgi_strm_notify_send(struct fcgi_strm *fstrm); +static void fcgi_strm_alert(struct fcgi_strm *fstrm); +static int fcgi_strm_send_abort(struct fcgi_conn *fconn, struct fcgi_strm *fstrm); + +/* a dummy closed endpoint */ +static const struct sedesc closed_ep = { + .sc = NULL, + .flags = SE_FL_DETACHED, +}; + +/* a dummy management stream */ +static const struct fcgi_strm *fcgi_mgmt_stream = &(const struct fcgi_strm){ + .sd = (struct sedesc*)&closed_ep, + .fconn = NULL, + .state = FCGI_SS_CLOSED, + .flags = FCGI_SF_NONE, + .id = 0, +}; + +/* and a dummy idle stream for use with any unknown stream */ +static const struct fcgi_strm *fcgi_unknown_stream = &(const struct fcgi_strm){ + .sd = (struct sedesc*)&closed_ep, + .fconn = NULL, + .state = FCGI_SS_IDLE, + .flags = FCGI_SF_NONE, + .id = 0, +}; + +/* returns the stconn associated to the FCGI stream */ +static forceinline struct stconn *fcgi_strm_sc(const struct fcgi_strm *fstrm) +{ + return fstrm->sd->sc; +} + + +/* the FCGI traces always expect that arg1, if non-null, is of type connection + * (from which we can derive fconn), that arg2, if non-null, is of type fstrm, + * and that arg3, if non-null, is a htx for rx/tx headers. + */ +static void fcgi_trace(enum trace_level level, uint64_t mask, const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4) +{ + const struct connection *conn = a1; + struct fcgi_conn *fconn = conn ? conn->ctx : NULL; + const struct fcgi_strm *fstrm = a2; + const struct htx *htx = a3; + const size_t *val = a4; + + if (!fconn) + fconn = (fstrm ? 
fstrm->fconn : NULL); + + if (!fconn || src->verbosity < FCGI_VERB_CLEAN) + return; + + /* Display the response state if fstrm is defined */ + if (fstrm) + chunk_appendf(&trace_buf, " [rsp:%s]", h1m_state_str(fstrm->h1m.state)); + + if (src->verbosity == FCGI_VERB_CLEAN) + return; + + /* Display the value to the 4th argument (level > STATE) */ + if (src->level > TRACE_LEVEL_STATE && val) + chunk_appendf(&trace_buf, " - VAL=%lu", (long)*val); + + /* Display status-line if possible (verbosity > MINIMAL) */ + if (src->verbosity > FCGI_VERB_MINIMAL && htx && htx_nbblks(htx)) { + const struct htx_blk *blk = __htx_get_head_blk(htx); + const struct htx_sl *sl = htx_get_blk_ptr(htx, blk); + enum htx_blk_type type = htx_get_blk_type(blk); + + if (type == HTX_BLK_REQ_SL || type == HTX_BLK_RES_SL) + chunk_appendf(&trace_buf, " - \"%.*s %.*s %.*s\"", + HTX_SL_P1_LEN(sl), HTX_SL_P1_PTR(sl), + HTX_SL_P2_LEN(sl), HTX_SL_P2_PTR(sl), + HTX_SL_P3_LEN(sl), HTX_SL_P3_PTR(sl)); + } + + /* Display fconn info and, if defined, fstrm info */ + chunk_appendf(&trace_buf, " - fconn=%p(%s,0x%08x)", fconn, fconn_st_to_str(fconn->state), fconn->flags); + if (fstrm) + chunk_appendf(&trace_buf, " fstrm=%p(%d,%s,0x%08x)", fstrm, fstrm->id, fstrm_st_to_str(fstrm->state), fstrm->flags); + + if (!fstrm || fstrm->id <= 0) + chunk_appendf(&trace_buf, " dsi=%d", fconn->dsi); + if (fconn->dsi >= 0 && (mask & FCGI_EV_RX_FHDR)) + chunk_appendf(&trace_buf, " drt=%s", fcgi_rt_str(fconn->drt)); + + if (src->verbosity == FCGI_VERB_MINIMAL) + return; + + /* Display mbuf and dbuf info (level > USER & verbosity > SIMPLE) */ + if (src->level > TRACE_LEVEL_USER) { + if (src->verbosity == FCGI_VERB_COMPLETE || + (src->verbosity == FCGI_VERB_ADVANCED && (mask & (FCGI_EV_FCONN_RECV|FCGI_EV_RX_RECORD)))) + chunk_appendf(&trace_buf, " dbuf=%u@%p+%u/%u", + (unsigned int)b_data(&fconn->dbuf), b_orig(&fconn->dbuf), + (unsigned int)b_head_ofs(&fconn->dbuf), (unsigned int)b_size(&fconn->dbuf)); + if (src->verbosity == FCGI_VERB_COMPLETE || + (src->verbosity == FCGI_VERB_ADVANCED && (mask & (FCGI_EV_FCONN_SEND|FCGI_EV_TX_RECORD)))) { + struct buffer *hmbuf = br_head(fconn->mbuf); + struct buffer *tmbuf = br_tail(fconn->mbuf); + + chunk_appendf(&trace_buf, " .mbuf=[%u..%u|%u],h=[%u@%p+%u/%u],t=[%u@%p+%u/%u]", + br_head_idx(fconn->mbuf), br_tail_idx(fconn->mbuf), br_size(fconn->mbuf), + (unsigned int)b_data(hmbuf), b_orig(hmbuf), + (unsigned int)b_head_ofs(hmbuf), (unsigned int)b_size(hmbuf), + (unsigned int)b_data(tmbuf), b_orig(tmbuf), + (unsigned int)b_head_ofs(tmbuf), (unsigned int)b_size(tmbuf)); + } + + if (fstrm && (src->verbosity == FCGI_VERB_COMPLETE || + (src->verbosity == FCGI_VERB_ADVANCED && (mask & (FCGI_EV_STRM_RECV|FCGI_EV_RSP_DATA))))) + chunk_appendf(&trace_buf, " rxbuf=%u@%p+%u/%u", + (unsigned int)b_data(&fstrm->rxbuf), b_orig(&fstrm->rxbuf), + (unsigned int)b_head_ofs(&fstrm->rxbuf), (unsigned int)b_size(&fstrm->rxbuf)); + } + + /* Display htx info if defined (level > USER) */ + if (src->level > TRACE_LEVEL_USER && htx) { + int full = 0; + + /* Full htx info (level > STATE && verbosity > SIMPLE) */ + if (src->level > TRACE_LEVEL_STATE) { + if (src->verbosity == FCGI_VERB_COMPLETE) + full = 1; + else if (src->verbosity == FCGI_VERB_ADVANCED && (mask & (FCGI_EV_RSP_HDRS|FCGI_EV_TX_PARAMS))) + full = 1; + } + + chunk_memcat(&trace_buf, "\n\t", 2); + htx_dump(&trace_buf, htx, full); + } +} + +/*****************************************************/ +/* functions below are for dynamic buffer management */ 
+/*****************************************************/ + +/* Indicates whether or not we may call the fcgi_recv() function to attempt + * to receive data into the buffer and/or demux pending data. The condition is + * a bit complex due to some API limits for now. The rules are the following : + * - if an error or a shutdown was detected on the connection and the buffer + * is empty, we must not attempt to receive + * - if the demux buf failed to be allocated, we must not try to receive and + * we know there is nothing pending + * - if no flag indicates a blocking condition, we may attempt to receive, + * regardless of whether the demux buffer is full or not, so that only + * the demux part decides whether or not to block. This is needed because + * the connection API indeed prevents us from re-enabling receipt that is + * already enabled in a polled state, so we must always immediately stop + * as soon as the demux can't proceed so as never to hit an end of read + * with data pending in the buffers. + * - otherwise we may not attempt to receive + */ +static inline int fcgi_recv_allowed(const struct fcgi_conn *fconn) +{ + if (fconn->flags & (FCGI_CF_EOS|FCGI_CF_ERROR)) + return 0; + + if (b_data(&fconn->dbuf) == 0 && fconn->state == FCGI_CS_CLOSED) + return 0; + + if (!(fconn->flags & FCGI_CF_DEM_DALLOC) && + !(fconn->flags & FCGI_CF_DEM_BLOCK_ANY)) + return 1; + + return 0; +} + +/* Restarts reading on the connection if it was not enabled */ +static inline void fcgi_conn_restart_reading(const struct fcgi_conn *fconn, int consider_buffer) +{ + if (!fcgi_recv_allowed(fconn)) + return; + if ((!consider_buffer || !b_data(&fconn->dbuf)) && + (fconn->wait_event.events & SUB_RETRY_RECV)) + return; + tasklet_wakeup(fconn->wait_event.tasklet); +} + + +/* Tries to grab a buffer and to re-enable processing on mux <target>. The + * fcgi_conn flags are used to figure what buffer was requested. It returns 1 if + * the allocation succeeds, in which case the connection is woken up, or 0 if + * it's impossible to wake up and we prefer to be woken up later.
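+ * Three cases are handled in order: the demux buffer (dbuf), the tail of the
+ * mux ring buffer (mbuf), then the rx buffer of the stream currently being
+ * demuxed (rxbuf), as can be seen in the body below.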
+ */ +static int fcgi_buf_available(void *target) +{ + struct fcgi_conn *fconn = target; + struct fcgi_strm *fstrm; + + if ((fconn->flags & FCGI_CF_DEM_DALLOC) && b_alloc(&fconn->dbuf)) { + TRACE_STATE("unblocking fconn, dbuf allocated", FCGI_EV_FCONN_RECV|FCGI_EV_FCONN_BLK|FCGI_EV_FCONN_WAKE, fconn->conn); + fconn->flags &= ~FCGI_CF_DEM_DALLOC; + fcgi_conn_restart_reading(fconn, 1); + return 1; + } + + if ((fconn->flags & FCGI_CF_MUX_MALLOC) && b_alloc(br_tail(fconn->mbuf))) { + TRACE_STATE("unblocking fconn, mbuf allocated", FCGI_EV_FCONN_SEND|FCGI_EV_FCONN_BLK|FCGI_EV_FCONN_WAKE, fconn->conn); + fconn->flags &= ~FCGI_CF_MUX_MALLOC; + if (fconn->flags & FCGI_CF_DEM_MROOM) { + fconn->flags &= ~FCGI_CF_DEM_MROOM; + fcgi_conn_restart_reading(fconn, 1); + } + return 1; + } + + if ((fconn->flags & FCGI_CF_DEM_SALLOC) && + (fstrm = fcgi_conn_st_by_id(fconn, fconn->dsi)) && fcgi_strm_sc(fstrm) && + b_alloc(&fstrm->rxbuf)) { + TRACE_STATE("unblocking fstrm, rxbuf allocated", FCGI_EV_STRM_RECV|FCGI_EV_FSTRM_BLK|FCGI_EV_STRM_WAKE, fconn->conn, fstrm); + fconn->flags &= ~FCGI_CF_DEM_SALLOC; + fcgi_conn_restart_reading(fconn, 1); + fcgi_strm_notify_recv(fstrm); + return 1; + } + + return 0; +} + +static inline struct buffer *fcgi_get_buf(struct fcgi_conn *fconn, struct buffer *bptr) +{ + struct buffer *buf = NULL; + + if (likely(!LIST_INLIST(&fconn->buf_wait.list)) && + unlikely((buf = b_alloc(bptr)) == NULL)) { + fconn->buf_wait.target = fconn; + fconn->buf_wait.wakeup_cb = fcgi_buf_available; + LIST_APPEND(&th_ctx->buffer_wq, &fconn->buf_wait.list); + } + return buf; +} + +static inline void fcgi_release_buf(struct fcgi_conn *fconn, struct buffer *bptr) +{ + if (bptr->size) { + b_free(bptr); + offer_buffers(NULL, 1); + } +} + +static inline void fcgi_release_mbuf(struct fcgi_conn *fconn) +{ + struct buffer *buf; + unsigned int count = 0; + + while (b_size(buf = br_head_pick(fconn->mbuf))) { + b_free(buf); + count++; + } + if (count) + offer_buffers(NULL, count); +} + +/* Returns the number of allocatable outgoing streams for the connection taking + * the number of reserved streams into account. + */ +static inline int fcgi_streams_left(const struct fcgi_conn *fconn) +{ + int ret; + + ret = (unsigned int)(0x7FFF - fconn->max_id) - fconn->nb_reserved - 1; + if (ret < 0) + ret = 0; + return ret; +} + +/* Returns the number of streams in use on a connection to figure if it's + * idle or not. We check nb_sc and not nb_streams as the caller will want + * to know if it was the last one after a detach(). + */ +static int fcgi_used_streams(struct connection *conn) +{ + struct fcgi_conn *fconn = conn->ctx; + + return fconn->nb_sc; +} + +/* Returns the number of concurrent streams available on the connection */ +static int fcgi_avail_streams(struct connection *conn) +{ + struct server *srv = objt_server(conn->target); + struct fcgi_conn *fconn = conn->ctx; + int ret1, ret2; + + /* Don't open new stream if the connection is closed */ + if (fconn->state == FCGI_CS_CLOSED) + return 0; + + /* May be negative if this setting has changed */ + ret1 = (fconn->streams_limit - fconn->nb_streams); + + /* we must also consider the limit imposed by stream IDs */ + ret2 = fcgi_streams_left(fconn); + ret1 = MIN(ret1, ret2); + if (ret1 > 0 && srv && srv->max_reuse >= 0) { + ret2 = ((fconn->stream_cnt <= srv->max_reuse) ? 
srv->max_reuse - fconn->stream_cnt + 1: 0); + ret1 = MIN(ret1, ret2); + } + return ret1; +} + +/*****************************************************************/ +/* functions below are dedicated to the mux setup and management */ +/*****************************************************************/ + +/* Initializes the mux once it's attached. Only outgoing connections are + * supported. So the context is already initialized before installing the + * mux. <input> is always used as Input buffer and may contain data. It is the + * caller's responsibility not to reuse it anymore. Returns < 0 on error. + */ +static int fcgi_init(struct connection *conn, struct proxy *px, struct session *sess, + struct buffer *input) +{ + struct fcgi_conn *fconn; + struct fcgi_strm *fstrm; + struct fcgi_app *app = get_px_fcgi_app(px); + struct task *t = NULL; + void *conn_ctx = conn->ctx; + + TRACE_ENTER(FCGI_EV_FSTRM_NEW); + + if (!app) { + TRACE_ERROR("No FCGI app found, don't create fconn", FCGI_EV_FCONN_NEW|FCGI_EV_FCONN_END|FCGI_EV_FCONN_ERR); + goto fail_conn; + } + + fconn = pool_alloc(pool_head_fcgi_conn); + if (!fconn) { + TRACE_ERROR("fconn allocation failure", FCGI_EV_FCONN_NEW|FCGI_EV_FCONN_END|FCGI_EV_FCONN_ERR); + goto fail_conn; + } + + fconn->shut_timeout = fconn->timeout = px->timeout.server; + if (tick_isset(px->timeout.serverfin)) + fconn->shut_timeout = px->timeout.serverfin; + + fconn->flags = FCGI_CF_NONE; + + /* Retrieve useful info from the FCGI app */ + if (app->flags & FCGI_APP_FL_KEEP_CONN) + fconn->flags |= FCGI_CF_KEEP_CONN; + if (app->flags & FCGI_APP_FL_GET_VALUES) + fconn->flags |= FCGI_CF_GET_VALUES; + if (app->flags & FCGI_APP_FL_MPXS_CONNS) + fconn->flags |= FCGI_CF_MPXS_CONNS; + + fconn->proxy = px; + fconn->app = app; + fconn->task = NULL; + if (tick_isset(fconn->timeout)) { + t = task_new_here(); + if (!t) { + TRACE_ERROR("fconn task allocation failure", FCGI_EV_FCONN_NEW|FCGI_EV_FCONN_END|FCGI_EV_FCONN_ERR); + goto fail; + } + + fconn->task = t; + t->process = fcgi_timeout_task; + t->context = fconn; + t->expire = tick_add(now_ms, fconn->timeout); + } + + fconn->wait_event.tasklet = tasklet_new(); + if (!fconn->wait_event.tasklet) + goto fail; + fconn->wait_event.tasklet->process = fcgi_io_cb; + fconn->wait_event.tasklet->context = fconn; + fconn->wait_event.events = 0; + + /* Initialise the context. */ + fconn->state = FCGI_CS_INIT; + fconn->conn = conn; + fconn->streams_limit = app->maxreqs; + fconn->max_id = -1; + fconn->nb_streams = 0; + fconn->nb_sc = 0; + fconn->nb_reserved = 0; + fconn->stream_cnt = 0; + + fconn->dbuf = *input; + fconn->dsi = -1; + + br_init(fconn->mbuf, sizeof(fconn->mbuf) / sizeof(fconn->mbuf[0])); + fconn->streams_by_id = EB_ROOT; + LIST_INIT(&fconn->send_list); + LIST_INIT(&fconn->buf_wait.list); + + conn->ctx = fconn; + + if (t) + task_queue(t); + + /* FIXME: this is temporary, for outgoing connections we need to + * immediately allocate a stream until the code is modified so that the + * caller calls ->attach(). For now the outgoing sc is stored as + * conn->ctx by the caller and saved in conn_ctx.
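+ * On the error paths below, the saved conn_ctx is restored into conn->ctx so
+ * that the caller finds its stream connector back where it left it.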
+ */ + fstrm = fcgi_stconn_new(fconn, conn_ctx, sess); + if (!fstrm) + goto fail; + + + /* Prepare to read something */ + fcgi_conn_restart_reading(fconn, 1); + TRACE_LEAVE(FCGI_EV_FCONN_NEW, conn); + return 0; + + fail: + task_destroy(t); + tasklet_free(fconn->wait_event.tasklet); + pool_free(pool_head_fcgi_conn, fconn); + fail_conn: + conn->ctx = conn_ctx; // restore saved ctx + TRACE_DEVEL("leaving in error", FCGI_EV_FCONN_NEW|FCGI_EV_FCONN_END|FCGI_EV_FCONN_ERR); + return -1; +} + +/* Returns the next allocatable outgoing stream ID for the FCGI connection, or + * -1 if no more is allocatable. + */ +static inline int32_t fcgi_conn_get_next_sid(const struct fcgi_conn *fconn) +{ + int32_t id = (fconn->max_id + 1) | 1; + + if ((id & 0x80000000U)) + id = -1; + return id; +} + +/* Returns the stream associated with id <id> or NULL if not found */ +static inline struct fcgi_strm *fcgi_conn_st_by_id(struct fcgi_conn *fconn, int id) +{ + struct eb32_node *node; + + if (id == 0) + return (struct fcgi_strm *)fcgi_mgmt_stream; + + if (id > fconn->max_id) + return (struct fcgi_strm *)fcgi_unknown_stream; + + node = eb32_lookup(&fconn->streams_by_id, id); + if (!node) + return (struct fcgi_strm *)fcgi_unknown_stream; + return container_of(node, struct fcgi_strm, by_id); +} + + +/* Release function. This one should be called to free all resources allocated + * to the mux. + */ +static void fcgi_release(struct fcgi_conn *fconn) +{ + struct connection *conn = fconn->conn; + + TRACE_POINT(FCGI_EV_FCONN_END); + + if (LIST_INLIST(&fconn->buf_wait.list)) + LIST_DEL_INIT(&fconn->buf_wait.list); + + fcgi_release_buf(fconn, &fconn->dbuf); + fcgi_release_mbuf(fconn); + + if (fconn->task) { + fconn->task->context = NULL; + task_wakeup(fconn->task, TASK_WOKEN_OTHER); + fconn->task = NULL; + } + tasklet_free(fconn->wait_event.tasklet); + if (conn && fconn->wait_event.events != 0) + conn->xprt->unsubscribe(conn, conn->xprt_ctx, fconn->wait_event.events, + &fconn->wait_event); + + pool_free(pool_head_fcgi_conn, fconn); + + if (conn) { + conn->mux = NULL; + conn->ctx = NULL; + TRACE_DEVEL("freeing conn", FCGI_EV_FCONN_END, conn); + + conn_stop_tracking(conn); + conn_full_close(conn); + if (conn->destroy_cb) + conn->destroy_cb(conn); + conn_free(conn); + } +} + +/* Detect a pending read0 for a FCGI connection. It happens if a read0 is + * pending on the connection AND if there is no more data in the demux + * buffer. The function returns 1 to report a read0 or 0 otherwise. + */ +static int fcgi_conn_read0_pending(struct fcgi_conn *fconn) +{ + if ((fconn->flags & FCGI_CF_EOS) && !b_data(&fconn->dbuf)) + return 1; + return 0; +} + + +/* Returns true if the FCGI connection must be released */ +static inline int fcgi_conn_is_dead(struct fcgi_conn *fconn) +{ + if (eb_is_empty(&fconn->streams_by_id) && /* don't close if streams exist */ + (!(fconn->flags & FCGI_CF_KEEP_CONN) || /* don't keep the connection alive */ + (fconn->flags & FCGI_CF_ERROR) || /* errors close immediately */ + (fconn->state == FCGI_CS_CLOSED && !fconn->task) ||/* a timeout struck earlier */ + (!(fconn->conn->owner)) || /* Nobody's left to take care of the connection, drop it now */ + (!br_data(fconn->mbuf) && /* mux buffer empty, also process clean events below */ + (fconn->flags & FCGI_CF_EOS)))) + return 1; + return 0; +} + + +/********************************************************/ +/* functions below are for the FCGI protocol processing */ +/********************************************************/ + +/* Marks an error on the stream.
*/ +static inline void fcgi_strm_error(struct fcgi_strm *fstrm) +{ + if (fstrm->id && fstrm->state != FCGI_SS_ERROR) { + TRACE_POINT(FCGI_EV_FSTRM_ERR, fstrm->fconn->conn, fstrm); + if (fstrm->state < FCGI_SS_ERROR) { + fstrm->state = FCGI_SS_ERROR; + TRACE_STATE("switching to ERROR", FCGI_EV_FSTRM_ERR, fstrm->fconn->conn, fstrm); + } + se_fl_set_error(fstrm->sd); + } +} + +/* Attempts to notify the data layer of recv availability */ +static void fcgi_strm_notify_recv(struct fcgi_strm *fstrm) +{ + if (fstrm->subs && (fstrm->subs->events & SUB_RETRY_RECV)) { + TRACE_POINT(FCGI_EV_STRM_WAKE, fstrm->fconn->conn, fstrm); + tasklet_wakeup(fstrm->subs->tasklet); + fstrm->subs->events &= ~SUB_RETRY_RECV; + if (!fstrm->subs->events) + fstrm->subs = NULL; + } +} + +/* Attempts to notify the data layer of send availability */ +static void fcgi_strm_notify_send(struct fcgi_strm *fstrm) +{ + if (fstrm->subs && (fstrm->subs->events & SUB_RETRY_SEND)) { + TRACE_POINT(FCGI_EV_STRM_WAKE, fstrm->fconn->conn, fstrm); + fstrm->flags |= FCGI_SF_NOTIFIED; + tasklet_wakeup(fstrm->subs->tasklet); + fstrm->subs->events &= ~SUB_RETRY_SEND; + if (!fstrm->subs->events) + fstrm->subs = NULL; + } + else if (fstrm->flags & (FCGI_SF_WANT_SHUTR | FCGI_SF_WANT_SHUTW)) { + TRACE_POINT(FCGI_EV_STRM_WAKE, fstrm->fconn->conn, fstrm); + tasklet_wakeup(fstrm->shut_tl); + } +} + +/* Alerts the data layer, trying to wake it up by all means, following + * this sequence : + * - if the fcgi stream's data layer is subscribed to recv, then it's woken up + * for recv + * - if it's subscribed to send, then it's woken up for send + * - if it was subscribed to neither, its ->wake() callback is called + * It is safe to call this function with a closed stream which doesn't have a + * stream connector anymore. + */ +static void fcgi_strm_alert(struct fcgi_strm *fstrm) +{ + TRACE_POINT(FCGI_EV_STRM_WAKE, fstrm->fconn->conn, fstrm); + if (fstrm->subs || + (fstrm->flags & (FCGI_SF_WANT_SHUTR|FCGI_SF_WANT_SHUTW))) { + fcgi_strm_notify_recv(fstrm); + fcgi_strm_notify_send(fstrm); + } + else if (fcgi_strm_sc(fstrm) && fcgi_strm_sc(fstrm)->app_ops->wake != NULL) { + TRACE_POINT(FCGI_EV_STRM_WAKE, fstrm->fconn->conn, fstrm); + fcgi_strm_sc(fstrm)->app_ops->wake(fcgi_strm_sc(fstrm)); + } +} + +/* Writes the 16-bit record size <len> at address <record> */ +static inline void fcgi_set_record_size(void *record, uint16_t len) +{ + uint8_t *out = (record + 4); + + *out = (len >> 8); + *(out + 1) = (len & 0xff); +} + +/* Writes the 16-bit stream id <id> at address <record> */ +static inline void fcgi_set_record_id(void *record, uint16_t id) +{ + uint8_t *out = (record + 2); + + *out = (id >> 8); + *(out + 1) = (id & 0xff); +} + +/* Marks a FCGI stream as CLOSED and decrements the number of active streams for + * its connection if the stream was not yet closed. Please use this exclusively + * before closing a stream to ensure stream count is well maintained.
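+ * Note: closing a still-unassigned stream (id == 0) also releases its slot in
+ * the nb_reserved count.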
+ */ +static inline void fcgi_strm_close(struct fcgi_strm *fstrm) +{ + if (fstrm->state != FCGI_SS_CLOSED) { + TRACE_ENTER(FCGI_EV_FSTRM_END, fstrm->fconn->conn, fstrm); + fstrm->fconn->nb_streams--; + if (!fstrm->id) + fstrm->fconn->nb_reserved--; + if (fcgi_strm_sc(fstrm)) { + if (!se_fl_test(fstrm->sd, SE_FL_EOS) && !b_data(&fstrm->rxbuf)) + fcgi_strm_notify_recv(fstrm); + } + fstrm->state = FCGI_SS_CLOSED; + TRACE_STATE("switching to CLOSED", FCGI_EV_FSTRM_END, fstrm->fconn->conn, fstrm); + TRACE_LEAVE(FCGI_EV_FSTRM_END, fstrm->fconn->conn, fstrm); + } +} + +/* Detaches a FCGI stream from its FCGI connection and releases it to the + * fcgi_strm pool. + */ +static void fcgi_strm_destroy(struct fcgi_strm *fstrm) +{ + struct connection *conn = fstrm->fconn->conn; + + TRACE_ENTER(FCGI_EV_FSTRM_END, conn, fstrm); + + fcgi_strm_close(fstrm); + eb32_delete(&fstrm->by_id); + if (b_size(&fstrm->rxbuf)) { + b_free(&fstrm->rxbuf); + offer_buffers(NULL, 1); + } + if (fstrm->subs) + fstrm->subs->events = 0; + /* There's no need to explicitly call unsubscribe here, the only + * reference left would be in the fconn send_list/fctl_list, and if + * we're in it, we're getting out anyway + */ + LIST_DEL_INIT(&fstrm->send_list); + tasklet_free(fstrm->shut_tl); + BUG_ON(fstrm->sd && !se_fl_test(fstrm->sd, SE_FL_ORPHAN)); + sedesc_free(fstrm->sd); + pool_free(pool_head_fcgi_strm, fstrm); + + TRACE_LEAVE(FCGI_EV_FSTRM_END, conn); +} + +/* Allocates a new stream <id> for connection <fconn> and adds it into fconn's + * stream tree. In case of error, nothing is added and NULL is returned. The + * causes of errors can be any failed memory allocation. The caller is + * responsible for checking if the connection may support an extra stream prior + * to calling this function. + */ +static struct fcgi_strm *fcgi_strm_new(struct fcgi_conn *fconn, int id) +{ + struct fcgi_strm *fstrm; + + TRACE_ENTER(FCGI_EV_FSTRM_NEW, fconn->conn); + + fstrm = pool_alloc(pool_head_fcgi_strm); + if (!fstrm) { + TRACE_ERROR("fstrm allocation failure", FCGI_EV_FSTRM_NEW|FCGI_EV_FSTRM_ERR|FCGI_EV_FSTRM_END, fconn->conn); + goto out; + } + + fstrm->shut_tl = tasklet_new(); + if (!fstrm->shut_tl) { + TRACE_ERROR("fstrm shut tasklet allocation failure", FCGI_EV_FSTRM_NEW|FCGI_EV_FSTRM_ERR|FCGI_EV_FSTRM_END, fconn->conn); + pool_free(pool_head_fcgi_strm, fstrm); + goto out; + } + fstrm->subs = NULL; + fstrm->shut_tl->process = fcgi_deferred_shut; + fstrm->shut_tl->context = fstrm; + LIST_INIT(&fstrm->send_list); + fstrm->fconn = fconn; + fstrm->sd = NULL; + fstrm->flags = FCGI_SF_NONE; + fstrm->proto_status = 0; + fstrm->state = FCGI_SS_IDLE; + fstrm->rxbuf = BUF_NULL; + + h1m_init_res(&fstrm->h1m); + fstrm->h1m.err_pos = -1; // don't care about errors on the request path + fstrm->h1m.flags |= (H1_MF_NO_PHDR|H1_MF_CLEAN_CONN_HDR); + + fstrm->by_id.key = fstrm->id = id; + if (id > 0) + fconn->max_id = id; + else + fconn->nb_reserved++; + + eb32_insert(&fconn->streams_by_id, &fstrm->by_id); + fconn->nb_streams++; + fconn->stream_cnt++; + + TRACE_LEAVE(FCGI_EV_FSTRM_NEW, fconn->conn, fstrm); + return fstrm; + + out: + TRACE_DEVEL("leaving in error", FCGI_EV_FSTRM_NEW|FCGI_EV_FSTRM_ERR|FCGI_EV_FSTRM_END, fconn->conn); + return NULL; +} + +/* Allocates a new stream associated to stream connector <sc> on the FCGI connection + * <fconn> and returns it, or NULL in case of memory allocation error or if the + * highest possible stream ID was reached. 
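+ * The stream is created with an unassigned ID (0); the effective ID is only
+ * expected to be chosen when the first record is sent (see
+ * fcgi_conn_get_next_sid() above).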
+ */ +static struct fcgi_strm *fcgi_stconn_new(struct fcgi_conn *fconn, struct stconn *sc, + struct session *sess) +{ + struct fcgi_strm *fstrm = NULL; + + TRACE_ENTER(FCGI_EV_FSTRM_NEW, fconn->conn); + if (fconn->nb_streams >= fconn->streams_limit) { + TRACE_ERROR("streams_limit reached", FCGI_EV_FSTRM_NEW|FCGI_EV_FSTRM_END|FCGI_EV_FSTRM_ERR, fconn->conn); + goto out; + } + + if (fcgi_streams_left(fconn) < 1) { + TRACE_ERROR("!streams_left", FCGI_EV_FSTRM_NEW|FCGI_EV_FSTRM_END|FCGI_EV_FSTRM_ERR, fconn->conn); + goto out; + } + + /* Defer choosing the ID until we send the first message to create the stream */ + fstrm = fcgi_strm_new(fconn, 0); + if (!fstrm) { + TRACE_ERROR("fstream allocation failure", FCGI_EV_FSTRM_NEW|FCGI_EV_FSTRM_END|FCGI_EV_FSTRM_ERR, fconn->conn); + goto out; + } + if (sc_attach_mux(sc, fstrm, fconn->conn) < 0) + goto out; + fstrm->sd = sc->sedesc; + fstrm->sess = sess; + fconn->nb_sc++; + + TRACE_LEAVE(FCGI_EV_FSTRM_NEW, fconn->conn, fstrm); + return fstrm; + + out: + TRACE_DEVEL("leaving on error", FCGI_EV_FSTRM_NEW|FCGI_EV_FSTRM_END|FCGI_EV_FSTRM_ERR, fconn->conn); + fcgi_strm_destroy(fstrm); + return NULL; +} + +/* Wakes a specific stream and assigns its stream connector some SE_FL_* flags among + * SE_FL_ERR_PENDING and SE_FL_ERROR if needed. The stream's state is + * automatically updated accordingly. If the stream is orphaned, it is + * destroyed. + */ +static void fcgi_strm_wake_one_stream(struct fcgi_strm *fstrm) +{ + struct fcgi_conn *fconn = fstrm->fconn; + + TRACE_ENTER(FCGI_EV_STRM_WAKE, fconn->conn, fstrm); + + if (!fcgi_strm_sc(fstrm)) { + /* this stream was already orphaned */ + fcgi_strm_destroy(fstrm); + TRACE_DEVEL("leaving with no fstrm", FCGI_EV_STRM_WAKE, fconn->conn); + return; + } + + if (fcgi_conn_read0_pending(fconn)) { + if (fstrm->state == FCGI_SS_OPEN) { + fstrm->state = FCGI_SS_HREM; + TRACE_STATE("switching to HREM", FCGI_EV_STRM_WAKE|FCGI_EV_FSTRM_END, fconn->conn, fstrm); + } + else if (fstrm->state == FCGI_SS_HLOC) + fcgi_strm_close(fstrm); + } + + if (fconn->state == FCGI_CS_CLOSED || (fconn->flags & (FCGI_CF_ERR_PENDING|FCGI_CF_ERROR))) { + se_fl_set_error(fstrm->sd); + + if (fstrm->state < FCGI_SS_ERROR) { + fstrm->state = FCGI_SS_ERROR; + TRACE_STATE("switching to ERROR", FCGI_EV_STRM_WAKE|FCGI_EV_FSTRM_END, fconn->conn, fstrm); + } + } + + fcgi_strm_alert(fstrm); + + TRACE_LEAVE(FCGI_EV_STRM_WAKE, fconn->conn, fstrm); +} + +/* Wakes unassigned streams (ID == 0) attached to the connection. */ +static void fcgi_wake_unassigned_streams(struct fcgi_conn *fconn) +{ + struct eb32_node *node; + struct fcgi_strm *fstrm; + + node = eb32_lookup(&fconn->streams_by_id, 0); + while (node) { + fstrm = container_of(node, struct fcgi_strm, by_id); + if (fstrm->id > 0) + break; + node = eb32_next(node); + fcgi_strm_wake_one_stream(fstrm); + } +} + +/* Wakes the streams attached to the connection, whose id is greater than <last> + * or unassigned.
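+ * Each stream goes through fcgi_strm_wake_one_stream(), so orphaned streams
+ * may be destroyed as a side effect.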
+ */ +static void fcgi_wake_some_streams(struct fcgi_conn *fconn, int last) +{ + struct eb32_node *node; + struct fcgi_strm *fstrm; + + TRACE_ENTER(FCGI_EV_STRM_WAKE, fconn->conn); + + /* Wake all streams with ID > last */ + node = eb32_lookup_ge(&fconn->streams_by_id, last + 1); + while (node) { + fstrm = container_of(node, struct fcgi_strm, by_id); + node = eb32_next(node); + fcgi_strm_wake_one_stream(fstrm); + } + fcgi_wake_unassigned_streams(fconn); + + TRACE_LEAVE(FCGI_EV_STRM_WAKE, fconn->conn); +} + +static int fcgi_set_default_param(struct fcgi_conn *fconn, struct fcgi_strm *fstrm, + struct htx *htx, struct htx_sl *sl, + struct fcgi_strm_params *params) +{ + struct connection *cli_conn = objt_conn(fstrm->sess->origin); + const struct sockaddr_storage *src = (sc_check(fcgi_strm_sc(fstrm)) ? conn_src(fconn->conn) : sc_src(sc_opposite(fcgi_strm_sc(fstrm)))); + const struct sockaddr_storage *dst = (sc_check(fcgi_strm_sc(fstrm)) ? conn_dst(fconn->conn) : sc_dst(sc_opposite(fcgi_strm_sc(fstrm)))); + struct ist p; + + if (!sl) + goto error; + + if (!(params->mask & FCGI_SP_DOC_ROOT)) + params->docroot = fconn->app->docroot; + + if (!(params->mask & FCGI_SP_REQ_METH)) { + p = htx_sl_req_meth(sl); + params->meth = ist2(b_tail(params->p), p.len); + chunk_istcat(params->p, p); + } + if (!(params->mask & FCGI_SP_REQ_URI)) { + p = h1_get_uri(sl); + params->uri = ist2(b_tail(params->p), p.len); + chunk_istcat(params->p, p); + } + if (!(params->mask & FCGI_SP_SRV_PROTO)) { + p = htx_sl_req_vsn(sl); + params->vsn = ist2(b_tail(params->p), p.len); + chunk_istcat(params->p, p); + } + if (!(params->mask & FCGI_SP_SRV_PORT)) { + char *end; + int port = 0; + if (dst) + port = get_host_port(dst); + end = ultoa_o(port, b_tail(params->p), b_room(params->p)); + if (!end) + goto error; + params->srv_port = ist2(b_tail(params->p), end - b_tail(params->p)); + params->p->data += params->srv_port.len; + } + if (!(params->mask & FCGI_SP_SRV_NAME)) { + /* If no Host header found, use the server address to fill + * srv_name */ + if (!istlen(params->srv_name)) { + char *ptr = NULL; + + if (dst) + if (addr_to_str(dst, b_tail(params->p), b_room(params->p)) != -1) + ptr = b_tail(params->p); + if (ptr) { + params->srv_name = ist(ptr); + params->p->data += params->srv_name.len; + } + } + } + if (!(params->mask & FCGI_SP_REM_ADDR)) { + char *ptr = NULL; + + if (src) + if (addr_to_str(src, b_tail(params->p), b_room(params->p)) != -1) + ptr = b_tail(params->p); + if (ptr) { + params->rem_addr = ist(ptr); + params->p->data += params->rem_addr.len; + } + } + if (!(params->mask & FCGI_SP_REM_PORT)) { + char *end; + int port = 0; + if (src) + port = get_host_port(src); + end = ultoa_o(port, b_tail(params->p), b_room(params->p)); + if (!end) + goto error; + params->rem_port = ist2(b_tail(params->p), end - b_tail(params->p)); + params->p->data += params->rem_port.len; + } + if (!(params->mask & FCGI_SP_CONT_LEN)) { + struct htx_blk *blk; + enum htx_blk_type type; + char *end; + size_t len = 0; + + for (blk = htx_get_head_blk(htx); blk; blk = htx_get_next_blk(htx, blk)) { + type = htx_get_blk_type(blk); + + if (type == HTX_BLK_TLR || type == HTX_BLK_EOT) + break; + if (type == HTX_BLK_DATA) + len += htx_get_blksz(blk); + } + end = ultoa_o(len, b_tail(params->p), b_room(params->p)); + if (!end) + goto error; + params->cont_len = ist2(b_tail(params->p), end - b_tail(params->p)); + params->p->data += params->cont_len.len; + } + + if (!(params->mask & FCGI_SP_HTTPS)) { + if (cli_conn) + params->https = conn_is_ssl(cli_conn); + } + + 
if ((params->mask & FCGI_SP_URI_MASK) != FCGI_SP_URI_MASK) { + /* one of scriptname, pathinfo or query_string is not set */ + struct http_uri_parser parser = http_uri_parser_init(params->uri); + struct ist path = http_parse_path(&parser); + int len; + + /* No script_name set and no valid path ==> error */ + if (!(params->mask & FCGI_SP_SCRIPT_NAME) && !istlen(path)) + goto error; + + /* If there is a query-string, set it if not already set */ + if (!(params->mask & FCGI_SP_REQ_QS)) { + struct ist qs = istfind(path, '?'); + + /* Update the path length */ + path.len -= qs.len; + + /* Set the query-string skipping the '?', if any */ + if (istlen(qs)) + params->qs = istnext(qs); + } + + /* If the script_name is set, don't try to deduce the path_info + * too. The opposite is not true. + */ + if (params->mask & FCGI_SP_SCRIPT_NAME) { + params->mask |= FCGI_SP_PATH_INFO; + goto end; + } + + /* Decode the path. It must first be copied to keep the URI + * untouched. + */ + chunk_istcat(params->p, path); + path.ptr = b_tail(params->p) - path.len; + len = url_decode(ist0(path), 0); + if (len < 0) + goto error; + path.len = len; + + /* script_name not set, preset it with the path for now */ + params->scriptname = path; + + /* If there is no regex to match the pathinfo, jump to the last + * part and see if the index must be used. + */ + if (!fconn->app->pathinfo_re) + goto check_index; + + /* If some special characters are found in the decoded path (\n + * or \0), the PATH_INFO regex cannot match. It is theoretically + * valid, but probably unexpected, to have such characters. So, + * to avoid any surprises, an error is triggered in this + * case. + */ + if (istchr(path, '\n') || istchr(path, '\0')) + goto error; + + /* If the regex does not match, jump to the last part and see if + * the index must be used. + */ + if (!regex_exec_match2(fconn->app->pathinfo_re, path.ptr, len, MAX_MATCH, pmatch, 0)) + goto check_index; + + /* We must have at least 1 capture for the script name, + * otherwise we do nothing and jump to the last part. + */ + if (pmatch[1].rm_so == -1 || pmatch[1].rm_eo == -1) + goto check_index; + + /* Finally we can set the script_name and the path_info. The + * path_info is set if not already defined, and if it was + * captured + */ + params->scriptname = ist2(path.ptr + pmatch[1].rm_so, pmatch[1].rm_eo - pmatch[1].rm_so); + if (!(params->mask & FCGI_SP_PATH_INFO) && !(pmatch[2].rm_so == -1 || pmatch[2].rm_eo == -1)) + params->pathinfo = ist2(path.ptr + pmatch[2].rm_so, pmatch[2].rm_eo - pmatch[2].rm_so); + + check_index: + len = params->scriptname.len; + /* the script_name is finished by a '/' so we can add the index + * part, if any.
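+ * For example (purely illustrative values): with an app index of "index.php",
+ * a script name of "/app/" would become "/app/index.php".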
+ */ + if (istlen(fconn->app->index) && params->scriptname.ptr[len-1] == '/') { + struct ist sn = params->scriptname; + + params->scriptname = ist2(b_tail(params->p), len+fconn->app->index.len); + chunk_istcat(params->p, sn); + chunk_istcat(params->p, fconn->app->index); + } + } + + if (!(params->mask & FCGI_SP_SRV_SOFT)) { + params->srv_soft = ist2(b_tail(params->p), 0); + chunk_appendf(params->p, "HAProxy %s", haproxy_version); + params->srv_soft.len = b_tail(params->p) - params->srv_soft.ptr; + } + + end: + return 1; + error: + return 0; +} + +static int fcgi_encode_default_param(struct fcgi_conn *fconn, struct fcgi_strm *fstrm, + struct fcgi_strm_params *params, struct buffer *outbuf, int flag) +{ + struct fcgi_param p; + + if (params->mask & flag) + return 1; + + chunk_reset(&trash); + + switch (flag) { + case FCGI_SP_CGI_GATEWAY: + p.n = ist("GATEWAY_INTERFACE"); + p.v = ist("CGI/1.1"); + goto encode; + case FCGI_SP_DOC_ROOT: + p.n = ist("DOCUMENT_ROOT"); + p.v = params->docroot; + goto encode; + case FCGI_SP_SCRIPT_NAME: + p.n = ist("SCRIPT_NAME"); + p.v = params->scriptname; + goto encode; + case FCGI_SP_PATH_INFO: + p.n = ist("PATH_INFO"); + p.v = params->pathinfo; + goto encode; + case FCGI_SP_REQ_URI: + p.n = ist("REQUEST_URI"); + p.v = params->uri; + goto encode; + case FCGI_SP_REQ_METH: + p.n = ist("REQUEST_METHOD"); + p.v = params->meth; + goto encode; + case FCGI_SP_REQ_QS: + p.n = ist("QUERY_STRING"); + p.v = params->qs; + goto encode; + case FCGI_SP_SRV_NAME: + p.n = ist("SERVER_NAME"); + p.v = params->srv_name; + goto encode; + case FCGI_SP_SRV_PORT: + p.n = ist("SERVER_PORT"); + p.v = params->srv_port; + goto encode; + case FCGI_SP_SRV_PROTO: + p.n = ist("SERVER_PROTOCOL"); + p.v = params->vsn; + goto encode; + case FCGI_SP_REM_ADDR: + p.n = ist("REMOTE_ADDR"); + p.v = params->rem_addr; + goto encode; + case FCGI_SP_REM_PORT: + p.n = ist("REMOTE_PORT"); + p.v = params->rem_port; + goto encode; + case FCGI_SP_SCRIPT_FILE: + p.n = ist("SCRIPT_FILENAME"); + chunk_istcat(&trash, params->docroot); + chunk_istcat(&trash, params->scriptname); + p.v = ist2(b_head(&trash), b_data(&trash)); + goto encode; + case FCGI_SP_PATH_TRANS: + if (!istlen(params->pathinfo)) + goto skip; + p.n = ist("PATH_TRANSLATED"); + chunk_istcat(&trash, params->docroot); + chunk_istcat(&trash, params->pathinfo); + p.v = ist2(b_head(&trash), b_data(&trash)); + goto encode; + case FCGI_SP_CONT_LEN: + p.n = ist("CONTENT_LENGTH"); + p.v = params->cont_len; + goto encode; + case FCGI_SP_HTTPS: + if (!params->https) + goto skip; + p.n = ist("HTTPS"); + p.v = ist("on"); + goto encode; + case FCGI_SP_SRV_SOFT: + p.n = ist("SERVER_SOFTWARE"); + p.v = params->srv_soft; + goto encode; + default: + goto skip; + } + + encode: + if (!istlen(p.v)) + goto skip; + if (!fcgi_encode_param(outbuf, &p)) + return 0; + skip: + params->mask |= flag; + return 1; +} + +/* Sends a GET_VALUES record. Returns > 0 on success, 0 if it couldn't do + * anything. It is highly unexpected, but if the record is larger than a buffer + * and cannot be encoded in one time, an error is triggered and the connection is + * closed. GET_VALUES record cannot be split. 
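+ * Only the FCGI_MAX_REQS and FCGI_MPXS_CONNS variables are requested here;
+ * FCGI_MAX_CONNS is deliberately skipped (see the note in the body).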
+ */ +static int fcgi_conn_send_get_values(struct fcgi_conn *fconn) +{ + struct buffer outbuf; + struct buffer *mbuf; + struct fcgi_param max_reqs = { .n = ist("FCGI_MAX_REQS"), .v = ist("")}; + struct fcgi_param mpxs_conns = { .n = ist("FCGI_MPXS_CONNS"), .v = ist("")}; + int ret = 0; + + TRACE_ENTER(FCGI_EV_TX_RECORD|FCGI_EV_TX_GETVAL, fconn->conn); + + mbuf = br_tail(fconn->mbuf); + retry: + if (!fcgi_get_buf(fconn, mbuf)) { + fconn->flags |= FCGI_CF_MUX_MALLOC; + fconn->flags |= FCGI_CF_DEM_MROOM; + TRACE_STATE("waiting for fconn mbuf ring allocation", FCGI_EV_TX_RECORD|FCGI_EV_FCONN_BLK, fconn->conn); + ret = 0; + goto end; + } + + while (1) { + outbuf = b_make(b_tail(mbuf), b_contig_space(mbuf), 0, 0); + if (outbuf.size >= FCGI_RECORD_HEADER_SZ || !b_space_wraps(mbuf)) + break; + realign_again: + b_slow_realign(mbuf, trash.area, b_data(mbuf)); + } + + if (outbuf.size < FCGI_RECORD_HEADER_SZ) + goto full; + + /* vsn: 1(FCGI_VERSION), type: (9)FCGI_GET_VALUES, id: 0x0000, + * len: 0x0000 (fill later), padding: 0x00, rsv: 0x00 */ + memcpy(outbuf.area, "\x01\x09\x00\x00\x00\x00\x00\x00", FCGI_RECORD_HEADER_SZ); + outbuf.data = FCGI_RECORD_HEADER_SZ; + + /* Note: Don't send the param FCGI_MAX_CONNS because its value cannot be + * handled by HAProxy. + */ + if (!fcgi_encode_param(&outbuf, &max_reqs) || !fcgi_encode_param(&outbuf, &mpxs_conns)) + goto full; + + /* update the record's size now */ + TRACE_PROTO("FCGI GET_VALUES record xferred", FCGI_EV_TX_RECORD|FCGI_EV_TX_GETVAL, fconn->conn, 0, 0, (size_t[]){outbuf.data-8}); + fcgi_set_record_size(outbuf.area, outbuf.data - FCGI_RECORD_HEADER_SZ); + b_add(mbuf, outbuf.data); + ret = 1; + + end: + TRACE_LEAVE(FCGI_EV_TX_RECORD|FCGI_EV_TX_GETVAL, fconn->conn); + return ret; + full: + /* Too large to be encoded. For GET_VALUES records, it is an error */ + if (!b_data(mbuf)) { + TRACE_ERROR("GET_VALUES record too large", FCGI_EV_TX_RECORD|FCGI_EV_TX_GETVAL|FCGI_EV_FCONN_ERR, fconn->conn); + goto fail; + } + + if ((mbuf = br_tail_add(fconn->mbuf)) != NULL) + goto retry; + fconn->flags |= FCGI_CF_MUX_MFULL; + fconn->flags |= FCGI_CF_DEM_MROOM; + TRACE_STATE("mbuf ring full", FCGI_EV_TX_RECORD|FCGI_EV_FCONN_BLK, fconn->conn); + ret = 0; + goto end; + fail: + fconn->state = FCGI_CS_CLOSED; + TRACE_STATE("switching to CLOSED", FCGI_EV_TX_RECORD|FCGI_EV_TX_GETVAL|FCGI_EV_FCONN_END, fconn->conn); + TRACE_DEVEL("leaving on error", FCGI_EV_TX_RECORD|FCGI_EV_TX_GETVAL|FCGI_EV_FCONN_ERR, fconn->conn); + return 0; +} + +/* Processes a GET_VALUES_RESULT record. Returns > 0 on success, 0 if it + * couldn't do anything. It is highly unexpected, but if the record is larger + * than a buffer and cannot be decoded in one time, an error is triggered and + * the connection is closed. GET_VALUES_RESULT record cannot be split. + */ +static int fcgi_conn_handle_values_result(struct fcgi_conn *fconn) +{ + struct buffer inbuf; + struct buffer *dbuf; + size_t offset; + + TRACE_ENTER(FCGI_EV_RX_RECORD|FCGI_EV_RX_GETVAL, fconn->conn); + + dbuf = &fconn->dbuf; + + /* Record too large to be fully decoded */ + if (b_size(dbuf) < (fconn->drl + fconn->drp)) + goto fail; + + /* process full record only */ + if (b_data(dbuf) < (fconn->drl + fconn->drp)) { + TRACE_DEVEL("leaving on missing data", FCGI_EV_RX_RECORD|FCGI_EV_RX_GETVAL, fconn->conn); + return 0; + } + + if (unlikely(b_contig_data(dbuf, b_head_ofs(dbuf)) < fconn->drl)) { + /* Realign the demux buffer if the record wraps. 
It is unexpected + * at this stage because it should be the first record received + * from the FCGI application. + */ + b_slow_realign_ofs(dbuf, trash.area, 0); + } + + inbuf = b_make(b_head(dbuf), b_data(dbuf), 0, fconn->drl); + + for (offset = 0; offset < b_data(&inbuf); ) { + struct fcgi_param p; + size_t ret; + + ret = fcgi_aligned_decode_param(&inbuf, offset, &p); + if (!ret) { + /* name or value too large to be decoded at once */ + TRACE_ERROR("error decoding GET_VALUES_RESULT param", FCGI_EV_RX_RECORD|FCGI_EV_RX_GETVAL|FCGI_EV_FCONN_ERR, fconn->conn); + goto fail; + } + offset += ret; + + if (isteqi(p.n, ist("FCGI_MPXS_CONNS"))) { + if (isteq(p.v, ist("1"))) { + TRACE_STATE("set mpxs param", FCGI_EV_RX_RECORD|FCGI_EV_RX_GETVAL, fconn->conn, 0, 0, (size_t[]){1}); + fconn->flags |= FCGI_CF_MPXS_CONNS; + } + else { + TRACE_STATE("set mpxs param", FCGI_EV_RX_RECORD|FCGI_EV_RX_GETVAL, fconn->conn, 0, 0, (size_t[]){0}); + fconn->flags &= ~FCGI_CF_MPXS_CONNS; + } + } + else if (isteqi(p.n, ist("FCGI_MAX_REQS"))) { + fconn->streams_limit = strl2ui(p.v.ptr, p.v.len); + TRACE_STATE("set streams_limit", FCGI_EV_RX_RECORD|FCGI_EV_RX_GETVAL, fconn->conn, 0, 0, (size_t[]){fconn->streams_limit}); + } + /* + * Ignore all other params + */ + } + + /* Reset the number of concurrent streams supported if the FCGI + * application does not support connection multiplexing + */ + if (!(fconn->flags & FCGI_CF_MPXS_CONNS)) { + fconn->streams_limit = 1; + TRACE_STATE("no mpxs for streams_limit to 1", FCGI_EV_RX_RECORD|FCGI_EV_RX_GETVAL, fconn->conn); + } + + /* We must be sure to have read exactly the announced record length, no + * more no less + */ + if (offset != fconn->drl) { + TRACE_ERROR("invalid GET_VALUES_RESULT record length", FCGI_EV_RX_RECORD|FCGI_EV_RX_GETVAL|FCGI_EV_FCONN_ERR, fconn->conn); + goto fail; + } + + TRACE_PROTO("FCGI GET_VALUES_RESULT record rcvd", FCGI_EV_RX_RECORD|FCGI_EV_RX_GETVAL, fconn->conn, 0, 0, (size_t[]){fconn->drl}); + b_del(&fconn->dbuf, fconn->drl + fconn->drp); + fconn->drl = 0; + fconn->drp = 0; + fconn->state = FCGI_CS_RECORD_H; + fcgi_wake_unassigned_streams(fconn); + TRACE_STATE("switching to RECORD_H", FCGI_EV_RX_RECORD|FCGI_EV_RX_FHDR, fconn->conn); + TRACE_LEAVE(FCGI_EV_RX_RECORD|FCGI_EV_RX_GETVAL, fconn->conn); + return 1; + fail: + fconn->state = FCGI_CS_CLOSED; + TRACE_STATE("switching to CLOSED", FCGI_EV_RX_RECORD|FCGI_EV_RX_GETVAL, fconn->conn); + TRACE_DEVEL("leaving on error", FCGI_EV_RX_RECORD|FCGI_EV_RX_GETVAL|FCGI_EV_FCONN_ERR, fconn->conn); + return 0; +} + +/* Sends an ABORT_REQUEST record for each active stream. Closed streams are + * excluded, as are the streams which already received the end-of-stream. It returns + * > 0 if the record was sent to all streams. Otherwise it returns 0. + */ +static int fcgi_conn_send_aborts(struct fcgi_conn *fconn) +{ + struct eb32_node *node; + struct fcgi_strm *fstrm; + + TRACE_ENTER(FCGI_EV_TX_RECORD, fconn->conn); + + node = eb32_lookup_ge(&fconn->streams_by_id, 1); + while (node) { + fstrm = container_of(node, struct fcgi_strm, by_id); + node = eb32_next(node); + if (fstrm->state != FCGI_SS_CLOSED && + !(fstrm->flags & (FCGI_SF_ES_RCVD|FCGI_SF_ABRT_SENT)) && + !fcgi_strm_send_abort(fconn, fstrm)) + return 0; + } + fconn->flags |= FCGI_CF_ABRTS_SENT; + TRACE_STATE("aborts sent to all fstrms", FCGI_EV_TX_RECORD, fconn->conn); + TRACE_LEAVE(FCGI_EV_TX_RECORD, fconn->conn); + return 1; +} + +/* Sends a BEGIN_REQUEST record. It returns > 0 on success, 0 if it couldn't do + * anything. BEGIN_REQUEST record cannot be split. 
So we wait to have enough + * space to proceed. It is small enough to be encoded in an empty buffer. + */ +static int fcgi_strm_send_begin_request(struct fcgi_conn *fconn, struct fcgi_strm *fstrm) +{ + struct buffer outbuf; + struct buffer *mbuf; + struct fcgi_begin_request rec = { .role = FCGI_RESPONDER, .flags = 0}; + int ret; + + TRACE_ENTER(FCGI_EV_TX_RECORD|FCGI_EV_TX_BEGREQ, fconn->conn, fstrm); + + mbuf = br_tail(fconn->mbuf); + retry: + if (!fcgi_get_buf(fconn, mbuf)) { + fconn->flags |= FCGI_CF_MUX_MALLOC; + fstrm->flags |= FCGI_SF_BLK_MROOM; + TRACE_STATE("waiting for fconn mbuf ring allocation", FCGI_EV_TX_RECORD|FCGI_EV_FSTRM_BLK|FCGI_EV_FCONN_BLK, fconn->conn, fstrm); + ret = 0; + goto end; + } + + while (1) { + outbuf = b_make(b_tail(mbuf), b_contig_space(mbuf), 0, 0); + if (outbuf.size >= FCGI_RECORD_HEADER_SZ || !b_space_wraps(mbuf)) + break; + realign_again: + b_slow_realign(mbuf, trash.area, b_data(mbuf)); + } + + if (outbuf.size < FCGI_RECORD_HEADER_SZ) + goto full; + + /* vsn: 1(FCGI_VERSION), type: (1)FCGI_BEGIN_REQUEST, id: fstrm->id, + * len: 0x0008, padding: 0x00, rsv: 0x00 */ + memcpy(outbuf.area, "\x01\x01\x00\x00\x00\x08\x00\x00", FCGI_RECORD_HEADER_SZ); + fcgi_set_record_id(outbuf.area, fstrm->id); + outbuf.data = FCGI_RECORD_HEADER_SZ; + + if (fconn->flags & FCGI_CF_KEEP_CONN) { + TRACE_STATE("keep connection opened", FCGI_EV_TX_RECORD|FCGI_EV_TX_BEGREQ, fconn->conn, fstrm); + rec.flags |= FCGI_KEEP_CONN; + } + if (!fcgi_encode_begin_request(&outbuf, &rec)) + goto full; + + /* commit the record */ + TRACE_PROTO("FCGI BEGIN_REQUEST record xferred", FCGI_EV_TX_RECORD|FCGI_EV_TX_BEGREQ, fconn->conn, fstrm, 0, (size_t[]){0}); + b_add(mbuf, outbuf.data); + fstrm->flags |= FCGI_SF_BEGIN_SENT; + fstrm->state = FCGI_SS_OPEN; + TRACE_STATE("switching to OPEN", FCGI_EV_TX_RECORD|FCGI_EV_TX_BEGREQ, fconn->conn, fstrm); + ret = 1; + + end: + TRACE_LEAVE(FCGI_EV_TX_RECORD|FCGI_EV_TX_BEGREQ, fconn->conn, fstrm); + return ret; + full: + if ((mbuf = br_tail_add(fconn->mbuf)) != NULL) + goto retry; + fconn->flags |= FCGI_CF_MUX_MFULL; + fstrm->flags |= FCGI_SF_BLK_MROOM; + TRACE_STATE("mbuf ring full", FCGI_EV_TX_RECORD|FCGI_EV_FSTRM_BLK|FCGI_EV_FCONN_BLK, fconn->conn); + ret = 0; + goto end; +} + +/* Sends an empty record of type <rtype>. It returns > 0 on success, 0 if it + * couldn't do anything. Empty record cannot be split. So we wait to have enough + * space to proceed. It is small enough to be encoded in an empty buffer. 
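+ * The header is first built as a STDIN record, then the type byte is
+ * overwritten with <rtype> (see outbuf.area[1] below).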
+ */ +static int fcgi_strm_send_empty_record(struct fcgi_conn *fconn, struct fcgi_strm *fstrm, + enum fcgi_record_type rtype) +{ + struct buffer outbuf; + struct buffer *mbuf; + int ret; + + TRACE_ENTER(FCGI_EV_TX_RECORD, fconn->conn, fstrm); + mbuf = br_tail(fconn->mbuf); + retry: + if (!fcgi_get_buf(fconn, mbuf)) { + fconn->flags |= FCGI_CF_MUX_MALLOC; + fstrm->flags |= FCGI_SF_BLK_MROOM; + TRACE_STATE("waiting for fconn mbuf ring allocation", FCGI_EV_TX_RECORD|FCGI_EV_FSTRM_BLK|FCGI_EV_FCONN_BLK, fconn->conn, fstrm); + ret = 0; + goto end; + } + + while (1) { + outbuf = b_make(b_tail(mbuf), b_contig_space(mbuf), 0, 0); + if (outbuf.size >= FCGI_RECORD_HEADER_SZ || !b_space_wraps(mbuf)) + break; + realign_again: + b_slow_realign(mbuf, trash.area, b_data(mbuf)); + } + + if (outbuf.size < FCGI_RECORD_HEADER_SZ) + goto full; + + /* vsn: 1(FCGI_VERSION), type: rtype, id: fstrm->id, + * len: 0x0000, padding: 0x00, rsv: 0x00 */ + memcpy(outbuf.area, "\x01\x05\x00\x00\x00\x00\x00\x00", FCGI_RECORD_HEADER_SZ); + outbuf.area[1] = rtype; + fcgi_set_record_id(outbuf.area, fstrm->id); + outbuf.data = FCGI_RECORD_HEADER_SZ; + + /* commit the record */ + b_add(mbuf, outbuf.data); + ret = 1; + + end: + TRACE_LEAVE(FCGI_EV_TX_RECORD, fconn->conn, fstrm); + return ret; + full: + if ((mbuf = br_tail_add(fconn->mbuf)) != NULL) + goto retry; + fconn->flags |= FCGI_CF_MUX_MFULL; + fstrm->flags |= FCGI_SF_BLK_MROOM; + TRACE_STATE("mbuf ring full", FCGI_EV_TX_RECORD|FCGI_EV_FSTRM_BLK|FCGI_EV_FCONN_BLK, fconn->conn, fstrm); + ret = 0; + goto end; +} + + +/* Sends an empty PARAMS record. It relies on fcgi_strm_send_empty_record(). It + * marks the end of params. + */ +static int fcgi_strm_send_empty_params(struct fcgi_conn *fconn, struct fcgi_strm *fstrm) +{ + int ret; + + TRACE_POINT(FCGI_EV_TX_RECORD|FCGI_EV_TX_PARAMS, fconn->conn, fstrm); + ret = fcgi_strm_send_empty_record(fconn, fstrm, FCGI_PARAMS); + if (ret) { + fstrm->flags |= FCGI_SF_EP_SENT; + TRACE_PROTO("FCGI PARAMS record xferred", FCGI_EV_TX_RECORD|FCGI_EV_TX_PARAMS, fconn->conn, fstrm, 0, (size_t[]){0}); + } + return ret; +} + +/* Sends an empty STDIN record. It relies on fcgi_strm_send_empty_record(). It + * marks the end of input. On success, the whole request was successfully sent. + */ +static int fcgi_strm_send_empty_stdin(struct fcgi_conn *fconn, struct fcgi_strm *fstrm) +{ + int ret; + + TRACE_POINT(FCGI_EV_TX_RECORD|FCGI_EV_TX_STDIN|FCGI_EV_TX_EOI, fconn->conn, fstrm); + ret = fcgi_strm_send_empty_record(fconn, fstrm, FCGI_STDIN); + if (ret) { + fstrm->flags |= FCGI_SF_ES_SENT; + TRACE_PROTO("FCGI STDIN record xferred", FCGI_EV_TX_RECORD|FCGI_EV_TX_STDIN, fconn->conn, fstrm, 0, (size_t[]){0}); + TRACE_USER("FCGI request fully xferred", FCGI_EV_TX_RECORD|FCGI_EV_TX_STDIN|FCGI_EV_TX_EOI, fconn->conn, fstrm); + TRACE_STATE("stdin data fully sent", FCGI_EV_TX_RECORD|FCGI_EV_TX_STDIN|FCGI_EV_TX_EOI, fconn->conn, fstrm); + } + return ret; +} + +/* Sends an ABORT_REQUEST record. It relies on fcgi_strm_send_empty_record(). It + * stops the request processing.
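+ * On success the FCGI_SF_ABRT_SENT flag is set, which is also what
+ * fcgi_conn_send_aborts() checks to avoid aborting the same stream twice.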
+ */ +static int fcgi_strm_send_abort(struct fcgi_conn *fconn, struct fcgi_strm *fstrm) +{ + int ret; + + TRACE_POINT(FCGI_EV_TX_RECORD|FCGI_EV_TX_ABORT, fconn->conn, fstrm); + ret = fcgi_strm_send_empty_record(fconn, fstrm, FCGI_ABORT_REQUEST); + if (ret) { + fstrm->flags |= FCGI_SF_ABRT_SENT; + TRACE_PROTO("FCGI ABORT record xferred", FCGI_EV_TX_RECORD|FCGI_EV_TX_ABORT, fconn->conn, fstrm, 0, (size_t[]){0}); + TRACE_USER("FCGI request aborted", FCGI_EV_TX_RECORD|FCGI_EV_TX_ABORT, fconn->conn, fstrm); + TRACE_STATE("abort sent", FCGI_EV_TX_RECORD|FCGI_EV_TX_ABORT, fconn->conn, fstrm); + } + return ret; +} + +/* Sends a PARAMS record. Returns > 0 on success, 0 if it couldn't do + * anything. If there are too many K/V params to be encoded in a PARAMS record, + * several records are sent. However, a K/V param cannot be split between 2 + * records. + */ +static size_t fcgi_strm_send_params(struct fcgi_conn *fconn, struct fcgi_strm *fstrm, + struct htx *htx) +{ + struct buffer outbuf; + struct buffer *mbuf; + struct htx_blk *blk; + struct htx_sl *sl = NULL; + struct fcgi_strm_params params; + size_t total = 0; + + TRACE_ENTER(FCGI_EV_TX_RECORD|FCGI_EV_TX_PARAMS, fconn->conn, fstrm, htx); + + memset(&params, 0, sizeof(params)); + params.p = get_trash_chunk(); + + mbuf = br_tail(fconn->mbuf); + retry: + if (!fcgi_get_buf(fconn, mbuf)) { + fconn->flags |= FCGI_CF_MUX_MALLOC; + fstrm->flags |= FCGI_SF_BLK_MROOM; + TRACE_STATE("waiting for fconn mbuf ring allocation", FCGI_EV_TX_RECORD|FCGI_EV_FSTRM_BLK|FCGI_EV_FCONN_BLK, fconn->conn, fstrm); + goto end; + } + + while (1) { + outbuf = b_make(b_tail(mbuf), b_contig_space(mbuf), 0, 0); + if (outbuf.size >= FCGI_RECORD_HEADER_SZ || !b_space_wraps(mbuf)) + break; + realign_again: + b_slow_realign(mbuf, trash.area, b_data(mbuf)); + } + + if (outbuf.size < FCGI_RECORD_HEADER_SZ) + goto full; + + /* vsn: 1(FCGI_VERSION), type: (4)FCGI_PARAMS, id: fstrm->id, + * len: 0x0000 (fill later), padding: 0x00, rsv: 0x00 */ + memcpy(outbuf.area, "\x01\x04\x00\x00\x00\x00\x00\x00", FCGI_RECORD_HEADER_SZ); + fcgi_set_record_id(outbuf.area, fstrm->id); + outbuf.data = FCGI_RECORD_HEADER_SZ; + + blk = htx_get_head_blk(htx); + while (blk) { + enum htx_blk_type type; + uint32_t size = htx_get_blksz(blk); + struct fcgi_param p; + + type = htx_get_blk_type(blk); + switch (type) { + case HTX_BLK_REQ_SL: + sl = htx_get_blk_ptr(htx, blk); + if (sl->info.req.meth == HTTP_METH_HEAD) + fstrm->h1m.flags |= H1_MF_METH_HEAD; + if (sl->flags & HTX_SL_F_VER_11) + fstrm->h1m.flags |= H1_MF_VER_11; + break; + + case HTX_BLK_HDR: + p.n = htx_get_blk_name(htx, blk); + p.v = htx_get_blk_value(htx, blk); + + if (istmatch(p.n, ist(":fcgi-"))) { + p.n = istadv(p.n, 6); + if (isteq(p.n, ist("gateway_interface"))) + params.mask |= FCGI_SP_CGI_GATEWAY; + else if (isteq(p.n, ist("document_root"))) { + params.mask |= FCGI_SP_DOC_ROOT; + params.docroot = p.v; + } + else if (isteq(p.n, ist("script_name"))) { + params.mask |= FCGI_SP_SCRIPT_NAME; + params.scriptname = p.v; + } + else if (isteq(p.n, ist("path_info"))) { + params.mask |= FCGI_SP_PATH_INFO; + params.pathinfo = p.v; + } + else if (isteq(p.n, ist("request_uri"))) { + params.mask |= FCGI_SP_REQ_URI; + params.uri = p.v; + } + else if (isteq(p.n, ist("request_meth"))) + params.mask |= FCGI_SP_REQ_METH; + else if (isteq(p.n, ist("query_string"))) + params.mask |= FCGI_SP_REQ_QS; + else if (isteq(p.n, ist("server_name"))) + params.mask |= FCGI_SP_SRV_NAME; + else if (isteq(p.n, ist("server_port"))) + params.mask |= FCGI_SP_SRV_PORT; + else if 
(isteq(p.n, ist("server_protocol"))) + params.mask |= FCGI_SP_SRV_PROTO; + else if (isteq(p.n, ist("remote_addr"))) + params.mask |= FCGI_SP_REM_ADDR; + else if (isteq(p.n, ist("remote_port"))) + params.mask |= FCGI_SP_REM_PORT; + else if (isteq(p.n, ist("script_filename"))) + params.mask |= FCGI_SP_SCRIPT_FILE; + else if (isteq(p.n, ist("path_translated"))) + params.mask |= FCGI_SP_PATH_TRANS; + else if (isteq(p.n, ist("https"))) + params.mask |= FCGI_SP_HTTPS; + else if (isteq(p.n, ist("server_software"))) + params.mask |= FCGI_SP_SRV_SOFT; + } + else if (isteq(p.n, ist("content-length"))) { + p.n = ist("CONTENT_LENGTH"); + params.mask |= FCGI_SP_CONT_LEN; + } + else if (isteq(p.n, ist("content-type"))) + p.n = ist("CONTENT_TYPE"); + else { + struct ist n; + + if (isteq(p.n, ist("host"))) + params.srv_name = p.v; + else if (isteq(p.n, ist("te"))) { + /* "te" may only be sent with "trailers" if this value + * is present, otherwise it must be deleted. + */ + p.v = istist(p.v, ist("trailers")); + if (!isttest(p.v) || (p.v.len > 8 && p.v.ptr[8] != ',')) + break; + p.v = ist("trailers"); + } + + /* Skip header if same name is used to add the server name */ + if (isttest(fconn->proxy->server_id_hdr_name) && isteq(p.n, fconn->proxy->server_id_hdr_name)) + break; + + n = ist2(trash.area, 0); + istcat(&n, ist("http_"), trash.size); + istcat(&n, p.n, trash.size); + p.n = n; + } + + if (!fcgi_encode_param(&outbuf, &p)) { + if (b_space_wraps(mbuf)) + goto realign_again; + if (outbuf.data == FCGI_RECORD_HEADER_SZ) + goto full; + goto done; + } + break; + + case HTX_BLK_EOH: + if (isttest(fconn->proxy->server_id_hdr_name)) { + struct server *srv = objt_server(fconn->conn->target); + + if (!srv) + goto done; + + p.n = ist2(trash.area, 0); + istcat(&p.n, ist("http_"), trash.size); + istcat(&p.n, fconn->proxy->server_id_hdr_name, trash.size); + p.v = ist(srv->id); + + if (!fcgi_encode_param(&outbuf, &p)) { + if (b_space_wraps(mbuf)) + goto realign_again; + if (outbuf.data == FCGI_RECORD_HEADER_SZ) + goto full; + } + TRACE_STATE("add server name header", FCGI_EV_TX_RECORD|FCGI_EV_TX_PARAMS, fconn->conn, fstrm); + } + goto done; + + default: + break; + } + total += size; + blk = htx_remove_blk(htx, blk); + } + + done: + if (!fcgi_set_default_param(fconn, fstrm, htx, sl, ¶ms)) { + TRACE_ERROR("error setting default params", FCGI_EV_TX_RECORD|FCGI_EV_STRM_ERR, fconn->conn, fstrm); + goto error; + } + + if (!fcgi_encode_default_param(fconn, fstrm, ¶ms, &outbuf, FCGI_SP_CGI_GATEWAY) || + !fcgi_encode_default_param(fconn, fstrm, ¶ms, &outbuf, FCGI_SP_DOC_ROOT) || + !fcgi_encode_default_param(fconn, fstrm, ¶ms, &outbuf, FCGI_SP_SCRIPT_NAME) || + !fcgi_encode_default_param(fconn, fstrm, ¶ms, &outbuf, FCGI_SP_PATH_INFO) || + !fcgi_encode_default_param(fconn, fstrm, ¶ms, &outbuf, FCGI_SP_REQ_URI) || + !fcgi_encode_default_param(fconn, fstrm, ¶ms, &outbuf, FCGI_SP_REQ_METH) || + !fcgi_encode_default_param(fconn, fstrm, ¶ms, &outbuf, FCGI_SP_REQ_QS) || + !fcgi_encode_default_param(fconn, fstrm, ¶ms, &outbuf, FCGI_SP_SRV_NAME) || + !fcgi_encode_default_param(fconn, fstrm, ¶ms, &outbuf, FCGI_SP_SRV_PORT) || + !fcgi_encode_default_param(fconn, fstrm, ¶ms, &outbuf, FCGI_SP_SRV_PROTO) || + !fcgi_encode_default_param(fconn, fstrm, ¶ms, &outbuf, FCGI_SP_REM_ADDR) || + !fcgi_encode_default_param(fconn, fstrm, ¶ms, &outbuf, FCGI_SP_REM_PORT) || + !fcgi_encode_default_param(fconn, fstrm, ¶ms, &outbuf, FCGI_SP_SCRIPT_FILE) || + !fcgi_encode_default_param(fconn, fstrm, ¶ms, &outbuf, FCGI_SP_PATH_TRANS) || + 
!fcgi_encode_default_param(fconn, fstrm, &params, &outbuf, FCGI_SP_CONT_LEN) || + !fcgi_encode_default_param(fconn, fstrm, &params, &outbuf, FCGI_SP_SRV_SOFT) || + !fcgi_encode_default_param(fconn, fstrm, &params, &outbuf, FCGI_SP_HTTPS)) { + TRACE_ERROR("error encoding default params", FCGI_EV_TX_RECORD|FCGI_EV_STRM_ERR, fconn->conn, fstrm); + goto error; + } + + /* update the record's size */ + TRACE_PROTO("FCGI PARAMS record xferred", FCGI_EV_TX_RECORD|FCGI_EV_TX_PARAMS, fconn->conn, fstrm, 0, (size_t[]){outbuf.data - FCGI_RECORD_HEADER_SZ}); + fcgi_set_record_size(outbuf.area, outbuf.data - FCGI_RECORD_HEADER_SZ); + b_add(mbuf, outbuf.data); + + end: + TRACE_LEAVE(FCGI_EV_TX_RECORD|FCGI_EV_TX_PARAMS, fconn->conn, fstrm, htx, (size_t[]){total}); + return total; + full: + if ((mbuf = br_tail_add(fconn->mbuf)) != NULL) + goto retry; + fconn->flags |= FCGI_CF_MUX_MFULL; + fstrm->flags |= FCGI_SF_BLK_MROOM; + TRACE_STATE("mbuf ring full", FCGI_EV_TX_RECORD|FCGI_EV_FSTRM_BLK|FCGI_EV_FCONN_BLK, fconn->conn, fstrm); + if (total) + goto error; + goto end; + + error: + htx->flags |= HTX_FL_PROCESSING_ERROR; + TRACE_ERROR("processing error sending PARAMS record", FCGI_EV_TX_RECORD|FCGI_EV_STRM_ERR, fconn->conn, fstrm); + fcgi_strm_error(fstrm); + goto end; +} + +/* Sends a STDIN record. Returns > 0 on success, 0 if it couldn't do + * anything. STDIN records contain the request body. + */ +static size_t fcgi_strm_send_stdin(struct fcgi_conn *fconn, struct fcgi_strm *fstrm, + struct htx *htx, size_t count, struct buffer *buf) +{ + struct buffer outbuf; + struct buffer *mbuf; + struct htx_blk *blk; + enum htx_blk_type type; + uint32_t size, extra_bytes; + size_t total = 0; + + extra_bytes = 0; + + TRACE_ENTER(FCGI_EV_TX_RECORD|FCGI_EV_TX_STDIN, fconn->conn, fstrm, htx, (size_t[]){count}); + if (!count) + goto end; + + mbuf = br_tail(fconn->mbuf); + retry: + if (!fcgi_get_buf(fconn, mbuf)) { + fconn->flags |= FCGI_CF_MUX_MALLOC; + fstrm->flags |= FCGI_SF_BLK_MROOM; + TRACE_STATE("waiting for fconn mbuf ring allocation", FCGI_EV_TX_RECORD|FCGI_EV_FSTRM_BLK|FCGI_EV_FCONN_BLK, fconn->conn, fstrm); + goto end; + } + + /* Perform some optimizations to reduce the number of buffer copies. + * First, if the mux's buffer is empty and the htx area contains exactly + * one data block of the same size as the requested count, and this + * count fits within the record size, then it's possible to simply swap + * the caller's buffer with the mux's output buffer and adjust offsets + * and length to match the entire DATA HTX block in the middle. In this + * case we perform a true zero-copy operation from end-to-end. This is + * the situation that happens all the time with large files. Second, if + * this is not possible, but the mux's output buffer is empty, we still + * have an opportunity to avoid the copy to the intermediary buffer, by + * making the intermediary buffer's area point to the output buffer's + * area. In this case we want to skip the HTX header to make sure that + * copies remain aligned and that this operation remains possible all + * the time. This goes for headers, data blocks and any data extracted + * from the HTX blocks. 
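+ * + * As an illustration of the first (zero-copy) case: the mux buffer is + * rebuilt to alias the HTX area, with its head placed exactly + * FCGI_RECORD_HEADER_SZ bytes before the DATA payload so that the record + * header can be written immediately in front of it, as done below: + * + * *mbuf = b_make(buf->area, buf->size, + * sizeof(struct htx) + blk->addr - FCGI_RECORD_HEADER_SZ, + * size + FCGI_RECORD_HEADER_SZ); + * + * after which the caller's area and the old mbuf area are simply exchanged, + * so the payload bytes themselves are never copied.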
+ */ + blk = htx_get_head_blk(htx); + if (!blk) + goto end; + type = htx_get_blk_type(blk); + size = htx_get_blksz(blk); + if (unlikely(size == count && htx_nbblks(htx) == 1 && type == HTX_BLK_DATA)) { + void *old_area = mbuf->area; + int eom = (htx->flags & HTX_FL_EOM); + + /* Last block of the message: Reserve the size for the empty stdin record */ + if (eom) + extra_bytes = FCGI_RECORD_HEADER_SZ; + + if (b_data(mbuf)) { + /* Too bad there are data left there. We're willing to memcpy/memmove + * up to 1/4 of the buffer, which means that it's OK to copy a large + * record into a buffer containing few data if it needs to be realigned, + * and that it's also OK to copy few data without realigning. Otherwise + * we'll pretend the mbuf is full and wait for it to become empty. + */ + if (size + FCGI_RECORD_HEADER_SZ + extra_bytes <= b_room(mbuf) && + (b_data(mbuf) <= b_size(mbuf) / 4 || + (size <= b_size(mbuf) / 4 && size + FCGI_RECORD_HEADER_SZ + extra_bytes <= b_contig_space(mbuf)))) + goto copy; + goto full; + } + + TRACE_PROTO("sending stdin data (zero-copy)", FCGI_EV_TX_RECORD|FCGI_EV_TX_STDIN, fconn->conn, fstrm, htx, (size_t[]){size}); + /* map a FCGI record to the HTX block so that we can put the + * record header there. + */ + *mbuf = b_make(buf->area, buf->size, sizeof(struct htx) + blk->addr - FCGI_RECORD_HEADER_SZ, size + FCGI_RECORD_HEADER_SZ); + outbuf.area = b_head(mbuf); + + /* prepend a FCGI record header just before the DATA block */ + memcpy(outbuf.area, "\x01\x05\x00\x00\x00\x00\x00\x00", FCGI_RECORD_HEADER_SZ); + fcgi_set_record_id(outbuf.area, fstrm->id); + fcgi_set_record_size(outbuf.area, size); + + /* and exchange with our old area */ + buf->area = old_area; + buf->data = buf->head = 0; + total += size; + + htx = (struct htx *)buf->area; + htx_reset(htx); + if (eom) + goto empty_stdin; + goto end; + } + + copy: + while (1) { + outbuf = b_make(b_tail(mbuf), b_contig_space(mbuf), 0, 0); + if (outbuf.size >= FCGI_RECORD_HEADER_SZ + extra_bytes || !b_space_wraps(mbuf)) + break; + realign_again: + b_slow_realign(mbuf, trash.area, b_data(mbuf)); + } + + if (outbuf.size < FCGI_RECORD_HEADER_SZ + extra_bytes) + goto full; + + /* vsn: 1(FCGI_VERSION), type: (5)FCGI_STDIN, id: fstrm->id, + * len: 0x0000 (fill later), padding: 0x00, rsv: 0x00 */ + memcpy(outbuf.area, "\x01\x05\x00\x00\x00\x00\x00\x00", FCGI_RECORD_HEADER_SZ); + fcgi_set_record_id(outbuf.area, fstrm->id); + outbuf.data = FCGI_RECORD_HEADER_SZ; + + blk = htx_get_head_blk(htx); + while (blk && count) { + enum htx_blk_type type = htx_get_blk_type(blk); + uint32_t size = htx_get_blksz(blk); + struct ist v; + + switch (type) { + case HTX_BLK_DATA: + TRACE_PROTO("sending stdin data", FCGI_EV_TX_RECORD|FCGI_EV_TX_STDIN, fconn->conn, fstrm, htx, (size_t[]){size}); + v = htx_get_blk_value(htx, blk); + + if (htx_is_unique_blk(htx, blk) && (htx->flags & HTX_FL_EOM)) + extra_bytes = FCGI_RECORD_HEADER_SZ; /* Last block of the message */ + + if (v.len > count) { + v.len = count; + extra_bytes = 0; + } + + if (v.len + FCGI_RECORD_HEADER_SZ + extra_bytes > b_room(&outbuf)) { + /* It doesn't fit at once. If it would at least fit once the + * buffer is defragmented and the amount of data to move is low, + * let's defragment the buffer now. + */ + if (b_space_wraps(mbuf) && + b_data(&outbuf) + v.len + extra_bytes <= b_room(mbuf) && + b_data(mbuf) <= MAX_DATA_REALIGN) + goto realign_again; + v.len = (FCGI_RECORD_HEADER_SZ + extra_bytes > b_room(&outbuf) + ? 
0 + : b_room(&outbuf) - FCGI_RECORD_HEADER_SZ - extra_bytes); + } + if (!v.len || !chunk_memcat(&outbuf, v.ptr, v.len)) { + if (outbuf.data == FCGI_RECORD_HEADER_SZ) + goto full; + goto done; + } + if (v.len != size) { + total += v.len; + count -= v.len; + htx_cut_data_blk(htx, blk, v.len); + goto done; + } + break; + + default: + break; + } + total += size; + count -= size; + blk = htx_remove_blk(htx, blk); + } + + done: + /* update the record's size */ + TRACE_PROTO("FCGI STDIN record xferred", FCGI_EV_TX_RECORD|FCGI_EV_TX_STDIN, fconn->conn, fstrm, 0, (size_t[]){outbuf.data - FCGI_RECORD_HEADER_SZ}); + fcgi_set_record_size(outbuf.area, outbuf.data - FCGI_RECORD_HEADER_SZ); + b_add(mbuf, outbuf.data); + + /* Send the empty STDIN record here to finish the message */ + if (htx_is_empty(htx) && (htx->flags & HTX_FL_EOM)) { + empty_stdin: + TRACE_PROTO("sending FCGI STDIN record", FCGI_EV_TX_RECORD|FCGI_EV_TX_STDIN, fconn->conn, fstrm, htx); + if (!fcgi_strm_send_empty_stdin(fconn, fstrm)) { + /* bytes already reserved for this record. It should not fail */ + htx->flags |= HTX_FL_PROCESSING_ERROR; + TRACE_ERROR("processing error sending empty STDIN record", FCGI_EV_TX_RECORD|FCGI_EV_STRM_ERR, fconn->conn, fstrm); + fcgi_strm_error(fstrm); + } + } + + end: + TRACE_LEAVE(FCGI_EV_TX_RECORD|FCGI_EV_TX_STDIN, fconn->conn, fstrm, htx, (size_t[]){total}); + return total; + full: + if ((mbuf = br_tail_add(fconn->mbuf)) != NULL) + goto retry; + fconn->flags |= FCGI_CF_MUX_MFULL; + fstrm->flags |= FCGI_SF_BLK_MROOM; + TRACE_STATE("mbuf ring full", FCGI_EV_TX_RECORD|FCGI_EV_FSTRM_BLK|FCGI_EV_FCONN_BLK, fconn->conn, fstrm); + goto end; +} + +/* Processes a STDOUT record. Returns > 0 on success, 0 if it couldn't do + * anything. STDOUT records contain the entire response. All the content is + * copied in the stream's rxbuf. The parsing will be handled in fcgi_rcv_buf(). 
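+ * For context, by the time this handler runs, fcgi_process_demux() below has + * already consumed the 8-byte record header and loaded fconn->drl / + * fconn->drp with the record's content and padding lengths. Per the FastCGI + * 1.0 specification that header is laid out as: + * + * byte 0: version (1) byte 1: type (6 for FCGI_STDOUT) + * bytes 2-3: requestId (big endian) + * bytes 4-5: contentLength (big endian) + * byte 6: paddingLength byte 7: reserved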
+ */ +static int fcgi_strm_handle_stdout(struct fcgi_conn *fconn, struct fcgi_strm *fstrm) +{ + struct buffer *dbuf; + size_t ret; + size_t max; + + TRACE_ENTER(FCGI_EV_RX_RECORD|FCGI_EV_RX_STDOUT, fconn->conn, fstrm); + + dbuf = &fconn->dbuf; + + /* Only padding remains */ + if (fconn->state == FCGI_CS_RECORD_P) + goto end_transfer; + + if (b_data(dbuf) < (fconn->drl + fconn->drp) && + b_size(dbuf) > (fconn->drl + fconn->drp) && + buf_room_for_htx_data(dbuf)) + goto fail; // incomplete record + + if (!fcgi_get_buf(fconn, &fstrm->rxbuf)) { + fconn->flags |= FCGI_CF_DEM_SALLOC; + TRACE_STATE("waiting for fstrm rxbuf allocation", FCGI_EV_RX_RECORD|FCGI_EV_FSTRM_BLK, fconn->conn, fstrm); + goto fail; + } + + /*max = MIN(b_room(&fstrm->rxbuf), fconn->drl);*/ + max = buf_room_for_htx_data(&fstrm->rxbuf); + if (!b_data(&fstrm->rxbuf)) + fstrm->rxbuf.head = sizeof(struct htx); + if (max > fconn->drl) + max = fconn->drl; + + ret = b_xfer(&fstrm->rxbuf, dbuf, max); + if (!ret) + goto fail; + fconn->drl -= ret; + TRACE_DATA("move some data to fstrm rxbuf", FCGI_EV_RX_RECORD|FCGI_EV_RX_STDOUT, fconn->conn, fstrm, 0, (size_t[]){ret}); + TRACE_PROTO("FCGI STDOUT record rcvd", FCGI_EV_RX_RECORD|FCGI_EV_RX_STDOUT, fconn->conn, fstrm, 0, (size_t[]){ret}); + + if (!buf_room_for_htx_data(&fstrm->rxbuf)) { + fconn->flags |= FCGI_CF_DEM_SFULL; + TRACE_STATE("fstrm rxbuf full", FCGI_EV_RX_RECORD|FCGI_EV_FSTRM_BLK, fconn->conn, fstrm); + } + + if (fconn->drl) + goto fail; + + end_transfer: + fconn->state = FCGI_CS_RECORD_P; + fconn->drl += fconn->drp; + fconn->drp = 0; + ret = MIN(b_data(&fconn->dbuf), fconn->drl); + b_del(&fconn->dbuf, ret); + fconn->drl -= ret; + if (fconn->drl) + goto fail; + + fconn->state = FCGI_CS_RECORD_H; + TRACE_STATE("switching to RECORD_H", FCGI_EV_RX_RECORD|FCGI_EV_RX_FHDR, fconn->conn, fstrm); + TRACE_LEAVE(FCGI_EV_RX_RECORD|FCGI_EV_RX_STDOUT, fconn->conn, fstrm); + return 1; + fail: + TRACE_DEVEL("leaving on missing data or error", FCGI_EV_RX_RECORD|FCGI_EV_RX_STDOUT, fconn->conn, fstrm); + return 0; +} + + +/* Processes an empty STDOUT. Returns > 0 on success, 0 if it couldn't do + * anything. In fact it only skips the padding; there is no payload for such + * records. It marks the end of the response. + */ +static int fcgi_strm_handle_empty_stdout(struct fcgi_conn *fconn, struct fcgi_strm *fstrm) +{ + int ret; + + TRACE_ENTER(FCGI_EV_RX_RECORD|FCGI_EV_RX_STDOUT, fconn->conn, fstrm); + + fconn->state = FCGI_CS_RECORD_P; + TRACE_STATE("switching to RECORD_P", FCGI_EV_RX_RECORD|FCGI_EV_RX_STDOUT, fconn->conn, fstrm); + fconn->drl += fconn->drp; + fconn->drp = 0; + ret = MIN(b_data(&fconn->dbuf), fconn->drl); + b_del(&fconn->dbuf, ret); + fconn->drl -= ret; + if (fconn->drl) { + TRACE_DEVEL("leaving on missing data or error", FCGI_EV_RX_RECORD|FCGI_EV_RX_STDOUT, fconn->conn, fstrm); + return 0; + } + fconn->state = FCGI_CS_RECORD_H; + fstrm->flags |= FCGI_SF_ES_RCVD; + TRACE_PROTO("FCGI STDOUT record rcvd", FCGI_EV_RX_RECORD|FCGI_EV_RX_STDOUT, fconn->conn, fstrm, 0, (size_t[]){0}); + TRACE_STATE("stdout data fully received, switching to RECORD_H", FCGI_EV_RX_RECORD|FCGI_EV_RX_FHDR|FCGI_EV_RX_EOI, fconn->conn, fstrm); + TRACE_LEAVE(FCGI_EV_RX_RECORD|FCGI_EV_RX_STDOUT, fconn->conn, fstrm); + return 1; +} + +/* Processes a STDERR record. Returns > 0 on success, 0 if it couldn't do + * anything. 
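+ * As a worked example of the end_transfer sequence shared with the handlers + * above: for a 5-byte STDERR payload padded to an 8-byte boundary, the record + * header yields drl=5 and drp=3; once the 5 payload bytes are consumed, drl + * becomes 0+3=3 and those padding bytes are deleted from the demux buffer + * (possibly over several calls if they have not all arrived yet) before the + * state switches back to FCGI_CS_RECORD_H.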
+ */ +static int fcgi_strm_handle_stderr(struct fcgi_conn *fconn, struct fcgi_strm *fstrm) +{ + struct buffer *dbuf; + struct buffer tag; + size_t ret; + + TRACE_ENTER(FCGI_EV_RX_RECORD|FCGI_EV_RX_STDERR, fconn->conn, fstrm); + dbuf = &fconn->dbuf; + + /* Only padding remains */ + if (fconn->state == FCGI_CS_RECORD_P || !fconn->drl) + goto end_transfer; + + if (b_data(dbuf) < (fconn->drl + fconn->drp) && + b_size(dbuf) > (fconn->drl + fconn->drp) && + buf_room_for_htx_data(dbuf)) + goto fail; // incomplete record + + chunk_reset(&trash); + ret = b_force_xfer(&trash, dbuf, MIN(b_room(&trash), fconn->drl)); + if (!ret) + goto fail; + fconn->drl -= ret; + TRACE_PROTO("FCGI STDERR record rcvd", FCGI_EV_RX_RECORD|FCGI_EV_RX_STDERR, fconn->conn, fstrm, 0, (size_t[]){ret}); + + trash.area[ret] = '\n'; + trash.area[ret+1] = '\0'; + tag.area = fconn->app->name; tag.data = strlen(fconn->app->name); + app_log(&fconn->app->loggers, &tag, LOG_ERR, "%s", trash.area); + + if (fconn->drl) + goto fail; + + end_transfer: + fconn->state = FCGI_CS_RECORD_P; + fconn->drl += fconn->drp; + fconn->drp = 0; + ret = MIN(b_data(&fconn->dbuf), fconn->drl); + b_del(&fconn->dbuf, ret); + fconn->drl -= ret; + if (fconn->drl) + goto fail; + fconn->state = FCGI_CS_RECORD_H; + TRACE_STATE("switching to RECORD_H", FCGI_EV_RX_RECORD|FCGI_EV_RX_FHDR, fconn->conn, fstrm); + TRACE_LEAVE(FCGI_EV_RX_RECORD|FCGI_EV_RX_STDERR, fconn->conn, fstrm); + return 1; + fail: + TRACE_DEVEL("leaving on missing data or error", FCGI_EV_RX_RECORD|FCGI_EV_RX_STDERR, fconn->conn, fstrm); + return 0; +} + +/* Processes an END_REQUEST record. Returns > 0 on success, 0 if it couldn't do + * anything. If the empty STDOUT record has not already been received, this one + * marks the end of the response. It is highly unexpected, but if the record is + * larger than a buffer and cannot be decoded at once, an error is triggered and + * the connection is closed. An END_REQUEST record cannot be split. + */ +static int fcgi_strm_handle_end_request(struct fcgi_conn *fconn, struct fcgi_strm *fstrm) +{ + struct buffer inbuf; + struct buffer *dbuf; + struct fcgi_end_request endreq; + + TRACE_ENTER(FCGI_EV_RX_RECORD|FCGI_EV_RX_ENDREQ, fconn->conn, fstrm); + dbuf = &fconn->dbuf; + + /* Record too large to be fully decoded */ + if (b_size(dbuf) < (fconn->drl + fconn->drp)) { + TRACE_ERROR("END_REQUEST record too large", FCGI_EV_RX_RECORD|FCGI_EV_RX_ENDREQ|FCGI_EV_FSTRM_ERR, fconn->conn, fstrm); + goto fail; + } + + /* process full record only */ + if (b_data(dbuf) < (fconn->drl + fconn->drp)) { + TRACE_DEVEL("leaving on missing data", FCGI_EV_RX_RECORD|FCGI_EV_RX_ENDREQ, fconn->conn); + return 0; + } + + if (unlikely(b_contig_data(dbuf, b_head_ofs(dbuf)) < fconn->drl)) { + /* Realign the dmux buffer if the record wraps. It is unexpected + * at this stage because it should be the first record received + * from the FCGI application. 
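+ * + * For reference, the 8-byte END_REQUEST body decoded just below is laid out, + * per the FastCGI 1.0 specification, as: + * + * bytes 0-3: appStatus (big endian application exit code) + * byte 4: protocolStatus + * bytes 5-7: reserved + * + * where protocolStatus is 0 (FCGI_REQUEST_COMPLETE), 1 (FCGI_CANT_MPX_CONN), + * 2 (FCGI_OVERLOADED) or 3 (FCGI_UNKNOWN_ROLE); this is the value saved in + * fstrm->proto_status and later examined by fcgi_detach().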
+ */ + b_slow_realign_ofs(dbuf, trash.area, 0); + } + + inbuf = b_make(b_head(dbuf), b_data(dbuf), 0, fconn->drl); + + if (!fcgi_decode_end_request(&inbuf, 0, &endreq)) { + TRACE_ERROR("END_REQUEST record decoding failure", FCGI_EV_RX_RECORD|FCGI_EV_RX_ENDREQ|FCGI_EV_FSTRM_ERR, fconn->conn, fstrm); + goto fail; + } + + fstrm->flags |= FCGI_SF_ES_RCVD; + TRACE_STATE("end of script reported", FCGI_EV_RX_RECORD|FCGI_EV_RX_ENDREQ|FCGI_EV_RX_EOI, fconn->conn, fstrm); + TRACE_PROTO("FCGI END_REQUEST record rcvd", FCGI_EV_RX_RECORD|FCGI_EV_RX_ENDREQ, fconn->conn, fstrm, 0, (size_t[]){fconn->drl}); + fstrm->proto_status = endreq.errcode; + fcgi_strm_close(fstrm); + + b_del(&fconn->dbuf, fconn->drl + fconn->drp); + fconn->drl = 0; + fconn->drp = 0; + fconn->state = FCGI_CS_RECORD_H; + TRACE_STATE("switching to RECORD_H", FCGI_EV_RX_RECORD|FCGI_EV_RX_FHDR, fconn->conn, fstrm); + TRACE_LEAVE(FCGI_EV_RX_RECORD|FCGI_EV_RX_ENDREQ, fconn->conn, fstrm); + return 1; + + fail: + fcgi_strm_error(fstrm); + TRACE_DEVEL("leaving on error", FCGI_EV_RX_RECORD|FCGI_EV_RX_ENDREQ|FCGI_EV_FSTRM_ERR, fconn->conn, fstrm); + return 0; +} + +/* process Rx records to be demultiplexed */ +static void fcgi_process_demux(struct fcgi_conn *fconn) +{ + struct fcgi_strm *fstrm = NULL, *tmp_fstrm; + struct fcgi_header hdr; + int ret; + + TRACE_ENTER(FCGI_EV_FCONN_WAKE, fconn->conn); + + if (fconn->state == FCGI_CS_CLOSED) + return; + + if (unlikely(fconn->state < FCGI_CS_RECORD_H)) { + if (fconn->state == FCGI_CS_INIT) { + TRACE_STATE("waiting FCGI GET_VALUES to be sent", FCGI_EV_RX_RECORD|FCGI_EV_RX_FHDR|FCGI_EV_RX_GETVAL, fconn->conn); + return; + } + if (fconn->state == FCGI_CS_SETTINGS) { + /* ensure that what is pending is a valid GET_VALUES_RESULT record. */ + TRACE_STATE("receiving FCGI record header", FCGI_EV_RX_RECORD|FCGI_EV_RX_FHDR, fconn->conn); + ret = fcgi_decode_record_hdr(&fconn->dbuf, 0, &hdr); + if (!ret) { + TRACE_ERROR("header record decoding failure", FCGI_EV_RX_RECORD|FCGI_EV_RX_ENDREQ|FCGI_EV_FSTRM_ERR, fconn->conn, fstrm); + goto fail; + } + b_del(&fconn->dbuf, ret); + + if (hdr.id || (hdr.type != FCGI_GET_VALUES_RESULT && hdr.type != FCGI_UNKNOWN_TYPE)) { + fconn->state = FCGI_CS_CLOSED; + TRACE_ERROR("unexpected record type or flags", FCGI_EV_RX_RECORD|FCGI_EV_RX_FHDR|FCGI_EV_RX_GETVAL|FCGI_EV_FCONN_ERR, fconn->conn); + TRACE_STATE("switching to CLOSED", FCGI_EV_RX_RECORD|FCGI_EV_RX_FHDR|FCGI_EV_RX_GETVAL|FCGI_EV_FCONN_ERR, fconn->conn); + goto fail; + } + goto new_record; + } + } + + /* process as many incoming records as possible below */ + while (1) { + if (!b_data(&fconn->dbuf)) { + TRACE_DEVEL("no more Rx data", FCGI_EV_RX_RECORD, fconn->conn); + break; + } + + if (fconn->state == FCGI_CS_CLOSED) { + TRACE_STATE("end of connection reported", FCGI_EV_RX_RECORD|FCGI_EV_RX_EOI, fconn->conn); + break; + } + + if (fconn->state == FCGI_CS_RECORD_H) { + TRACE_PROTO("receiving FCGI record header", FCGI_EV_RX_RECORD|FCGI_EV_RX_FHDR, fconn->conn); + ret = fcgi_decode_record_hdr(&fconn->dbuf, 0, &hdr); + if (!ret) + break; + b_del(&fconn->dbuf, ret); + + new_record: + fconn->dsi = hdr.id; + fconn->drt = hdr.type; + fconn->drl = hdr.len; + fconn->drp = hdr.padding; + fconn->state = FCGI_CS_RECORD_D; + TRACE_STATE("FCGI record header rcvd, switching to RECORD_D", FCGI_EV_RX_RECORD|FCGI_EV_RX_FHDR, fconn->conn); + } + + /* Only FCGI_CS_RECORD_D or FCGI_CS_RECORD_P */ + tmp_fstrm = fcgi_conn_st_by_id(fconn, fconn->dsi); + + if (tmp_fstrm != fstrm && fstrm && fcgi_strm_sc(fstrm) && + (b_data(&fstrm->rxbuf) || + 
fcgi_conn_read0_pending(fconn) || + fstrm->state == FCGI_SS_CLOSED || + (fstrm->flags & FCGI_SF_ES_RCVD) || + se_fl_test(fstrm->sd, SE_FL_ERROR | SE_FL_ERR_PENDING | SE_FL_EOS))) { + /* we may have to signal the upper layers */ + TRACE_DEVEL("notifying stream before switching SID", FCGI_EV_RX_RECORD|FCGI_EV_STRM_WAKE, fconn->conn, fstrm); + se_fl_set(fstrm->sd, SE_FL_RCV_MORE); + fcgi_strm_notify_recv(fstrm); + } + fstrm = tmp_fstrm; + + if (fstrm->state == FCGI_SS_CLOSED && fconn->dsi != 0) { + /* ignore all records for closed streams */ + goto ignore_record; + } + if (fstrm->state == FCGI_SS_IDLE) { + /* ignore all records for unknown streams */ + goto ignore_record; + } + + switch (fconn->drt) { + case FCGI_GET_VALUES_RESULT: + TRACE_PROTO("receiving FCGI GET_VALUES_RESULT record", FCGI_EV_RX_RECORD|FCGI_EV_RX_GETVAL, fconn->conn); + ret = fcgi_conn_handle_values_result(fconn); + break; + + case FCGI_STDOUT: + if (fstrm->flags & FCGI_SF_ES_RCVD) + goto ignore_record; + + TRACE_PROTO("receiving FCGI STDOUT record", FCGI_EV_RX_RECORD|FCGI_EV_RX_STDOUT, fconn->conn, fstrm); + if (fconn->drl) + ret = fcgi_strm_handle_stdout(fconn, fstrm); + else + ret = fcgi_strm_handle_empty_stdout(fconn, fstrm); + break; + + case FCGI_STDERR: + TRACE_PROTO("receiving FCGI STDERR record", FCGI_EV_RX_RECORD|FCGI_EV_RX_STDERR, fconn->conn, fstrm); + ret = fcgi_strm_handle_stderr(fconn, fstrm); + break; + + case FCGI_END_REQUEST: + TRACE_PROTO("receiving FCGI END_REQUEST record", FCGI_EV_RX_RECORD|FCGI_EV_RX_ENDREQ, fconn->conn, fstrm); + ret = fcgi_strm_handle_end_request(fconn, fstrm); + break; + + /* implement all extra record types here */ + default: + ignore_record: + /* drop records that we ignore. They may be + * larger than the buffer so we drain all of + * their contents until we reach the end. + */ + fconn->state = FCGI_CS_RECORD_P; + fconn->drl += fconn->drp; + fconn->drp = 0; + ret = MIN(b_data(&fconn->dbuf), fconn->drl); + TRACE_PROTO("receiving FCGI ignored record", FCGI_EV_RX_RECORD, fconn->conn, fstrm, 0, (size_t[]){ret}); + TRACE_STATE("switching to RECORD_P", FCGI_EV_RX_RECORD, fconn->conn, fstrm); + b_del(&fconn->dbuf, ret); + fconn->drl -= ret; + ret = (fconn->drl == 0); + } + + /* error or missing data condition met above ? */ + if (ret <= 0) { + TRACE_DEVEL("insufficient data to proceed", FCGI_EV_RX_RECORD, fconn->conn, fstrm); + break; + } + + if (fconn->state != FCGI_CS_RECORD_H && !(fconn->drl+fconn->drp)) { + fconn->state = FCGI_CS_RECORD_H; + TRACE_STATE("switching to RECORD_H", FCGI_EV_RX_RECORD|FCGI_EV_RX_FHDR, fconn->conn); + } + } + + fail: + /* we can go here on missing data, blocked response or error */ + if (fstrm && fcgi_strm_sc(fstrm) && + (b_data(&fstrm->rxbuf) || + fcgi_conn_read0_pending(fconn) || + fstrm->state == FCGI_SS_CLOSED || + (fstrm->flags & FCGI_SF_ES_RCVD) || + se_fl_test(fstrm->sd, SE_FL_ERROR | SE_FL_ERR_PENDING | SE_FL_EOS))) { + /* we may have to signal the upper layers */ + TRACE_DEVEL("notifying stream before switching SID", FCGI_EV_RX_RECORD|FCGI_EV_STRM_WAKE, fconn->conn, fstrm); + se_fl_set(fstrm->sd, SE_FL_RCV_MORE); + fcgi_strm_notify_recv(fstrm); + } + + fcgi_conn_restart_reading(fconn, 0); +} + +/* process Tx records from streams to be multiplexed. Returns > 0 if it reached + * the end. 
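+ * (For reference, the record types dispatched by the demux switch above are + * numbered as follows in the FastCGI 1.0 specification: 1 BEGIN_REQUEST, + * 2 ABORT_REQUEST, 3 END_REQUEST, 4 PARAMS, 5 STDIN, 6 STDOUT, 7 STDERR, + * 8 DATA, 9 GET_VALUES, 10 GET_VALUES_RESULT, 11 UNKNOWN_TYPE.)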
+ */ +static int fcgi_process_mux(struct fcgi_conn *fconn) +{ + struct fcgi_strm *fstrm, *fstrm_back; + + TRACE_ENTER(FCGI_EV_FCONN_WAKE, fconn->conn); + + if (unlikely(fconn->state < FCGI_CS_RECORD_H)) { + if (unlikely(fconn->state == FCGI_CS_INIT)) { + if (!(fconn->flags & FCGI_CF_GET_VALUES)) { + fconn->state = FCGI_CS_RECORD_H; + TRACE_STATE("switching to RECORD_H", FCGI_EV_TX_RECORD|FCGI_EV_RX_RECORD|FCGI_EV_RX_FHDR, fconn->conn); + fcgi_wake_unassigned_streams(fconn); + goto mux; + } + TRACE_PROTO("sending FCGI GET_VALUES record", FCGI_EV_TX_RECORD|FCGI_EV_TX_GETVAL, fconn->conn); + if (unlikely(!fcgi_conn_send_get_values(fconn))) + goto fail; + fconn->state = FCGI_CS_SETTINGS; + TRACE_STATE("switching to SETTINGS", FCGI_EV_TX_RECORD|FCGI_EV_RX_RECORD|FCGI_EV_RX_GETVAL, fconn->conn); + } + /* need to wait for the other side */ + if (fconn->state < FCGI_CS_RECORD_H) + goto done; + } + + mux: + list_for_each_entry_safe(fstrm, fstrm_back, &fconn->send_list, send_list) { + if (fconn->state == FCGI_CS_CLOSED || fconn->flags & FCGI_CF_MUX_BLOCK_ANY) + break; + + if (fstrm->flags & FCGI_SF_NOTIFIED) + continue; + + /* If the sender changed his mind and unsubscribed, let's just + * remove the stream from the send_list. + */ + if (!(fstrm->flags & (FCGI_SF_WANT_SHUTR|FCGI_SF_WANT_SHUTW)) && + (!fstrm->subs || !(fstrm->subs->events & SUB_RETRY_SEND))) { + LIST_DEL_INIT(&fstrm->send_list); + continue; + } + + if (fstrm->subs && fstrm->subs->events & SUB_RETRY_SEND) { + TRACE_POINT(FCGI_EV_STRM_WAKE, fconn->conn, fstrm); + fstrm->flags &= ~FCGI_SF_BLK_ANY; + fstrm->flags |= FCGI_SF_NOTIFIED; + tasklet_wakeup(fstrm->subs->tasklet); + fstrm->subs->events &= ~SUB_RETRY_SEND; + if (!fstrm->subs->events) + fstrm->subs = NULL; + } else { + /* it's the shut request that was queued */ + TRACE_POINT(FCGI_EV_STRM_WAKE, fconn->conn, fstrm); + tasklet_wakeup(fstrm->shut_tl); + } + } + + fail: + if (fconn->state == FCGI_CS_CLOSED) { + if (fconn->stream_cnt - fconn->nb_reserved > 0) { + fcgi_conn_send_aborts(fconn); + if (fconn->flags & FCGI_CF_MUX_BLOCK_ANY) { + TRACE_DEVEL("leaving in blocked situation", FCGI_EV_FCONN_WAKE|FCGI_EV_FCONN_BLK, fconn->conn); + return 0; + } + } + } + + done: + TRACE_LEAVE(FCGI_EV_FCONN_WAKE, fconn->conn); + return 1; +} + + +/* Attempt to read data, and subscribe if none available. + * The function returns 1 if data has been received, otherwise zero. + */ +static int fcgi_recv(struct fcgi_conn *fconn) +{ + struct connection *conn = fconn->conn; + struct buffer *buf; + int max; + size_t ret; + + TRACE_ENTER(FCGI_EV_FCONN_RECV, conn); + + if (fconn->wait_event.events & SUB_RETRY_RECV) { + TRACE_DEVEL("leaving on sub_recv", FCGI_EV_FCONN_RECV, conn); + return (b_data(&fconn->dbuf)); + } + + if (!fcgi_recv_allowed(fconn)) { + TRACE_DEVEL("leaving on !recv_allowed", FCGI_EV_FCONN_RECV, conn); + return 1; + } + + buf = fcgi_get_buf(fconn, &fconn->dbuf); + if (!buf) { + TRACE_DEVEL("waiting for fconn dbuf allocation", FCGI_EV_FCONN_RECV|FCGI_EV_FCONN_BLK, conn); + fconn->flags |= FCGI_CF_DEM_DALLOC; + return 0; + } + + if (!b_data(buf)) { + /* try to pre-align the buffer like the + * rxbufs will be to optimize memory copies. We'll make + * sure that the record header lands at the end of the + * HTX block to alias it upon recv. We cannot use the + * head because rcv_buf() will realign the buffer if + * it's empty. Thus we cheat and pretend we already + * have a few bytes there. + */ + max = buf_room_for_htx_data(buf) + (fconn->state == FCGI_CS_RECORD_H ? 
FCGI_RECORD_HEADER_SZ : 0); + buf->head = sizeof(struct htx) - (fconn->state == FCGI_CS_RECORD_H ? FCGI_RECORD_HEADER_SZ : 0); + } + else + max = buf_room_for_htx_data(buf); + + ret = max ? conn->xprt->rcv_buf(conn, conn->xprt_ctx, buf, max, 0) : 0; + + if (max && !ret && fcgi_recv_allowed(fconn)) { + TRACE_DATA("failed to receive data, subscribing", FCGI_EV_FCONN_RECV, conn); + conn->xprt->subscribe(conn, conn->xprt_ctx, SUB_RETRY_RECV, &fconn->wait_event); + } + else + TRACE_DATA("recv data", FCGI_EV_FCONN_RECV, conn, 0, 0, (size_t[]){ret}); + + if (conn_xprt_read0_pending(conn)) { + TRACE_DATA("received read0", FCGI_EV_FCONN_RECV, conn); + fconn->flags |= FCGI_CF_EOS; + } + if (conn->flags & CO_FL_ERROR) { + TRACE_DATA("connection error", FCGI_EV_FCONN_RECV, conn); + fconn->flags |= FCGI_CF_ERROR; + } + + if (!b_data(buf)) { + fcgi_release_buf(fconn, &fconn->dbuf); + goto end; + } + + if (ret == max) { + TRACE_DEVEL("fconn dbuf full", FCGI_EV_FCONN_RECV|FCGI_EV_FCONN_BLK, conn); + fconn->flags |= FCGI_CF_DEM_DFULL; + } + +end: + TRACE_LEAVE(FCGI_EV_FCONN_RECV, conn); + return !!ret || (fconn->flags & (FCGI_CF_EOS|FCGI_CF_ERROR)); +} + + +/* Try to send data if possible. + * The function returns 1 if data have been sent, otherwise zero. + */ +static int fcgi_send(struct fcgi_conn *fconn) +{ + struct connection *conn = fconn->conn; + int done; + int sent = 0; + + TRACE_ENTER(FCGI_EV_FCONN_SEND, conn); + + if (fconn->flags & (FCGI_CF_ERROR|FCGI_CF_ERR_PENDING)) { + TRACE_DEVEL("leaving on connection error", FCGI_EV_FCONN_SEND, conn); + if (fconn->flags & FCGI_CF_EOS) + fconn->flags |= FCGI_CF_ERROR; + b_reset(br_tail(fconn->mbuf)); + return 1; + } + + + if (conn->flags & CO_FL_WAIT_XPRT) { + /* a handshake was requested */ + goto schedule; + } + + /* This loop is quite simple : it tries to fill as much as it can from + * pending streams into the existing buffer until it's reportedly full + * or the end of send requests is reached. Then it tries to send this + * buffer's contents out, marks it not full if at least one byte could + * be sent, and tries again. + * + * The snd_buf() function normally takes a "flags" argument which may + * be made of a combination of CO_SFL_MSG_MORE to indicate that more + * data immediately comes and CO_SFL_STREAMER to indicate that the + * connection is streaming lots of data (used to increase TLS record + * size at the expense of latency). The former can be sent any time + * there's a buffer full flag, as it indicates at least one stream + * attempted to send and failed so there are pending data. An + * alternative would be to set it as long as there's an active stream + * but that would be problematic for ACKs until we have an absolute + * guarantee that all waiters have at least one byte to send. The + * latter should possibly not be set for now. 
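+ * + * As an illustration, these flags only decorate the transport call made + * below: + * + * ret = conn->xprt->snd_buf(conn, conn->xprt_ctx, buf, b_data(buf), flags); + * + * and raw sockets typically translate CO_SFL_MSG_MORE into MSG_MORE on + * send() while TLS may use it to aggregate records, so a spurious flag only + * costs a missed coalescing opportunity rather than correctness.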
+ */ + + done = 0; + while (!done) { + unsigned int flags = 0; + unsigned int released = 0; + struct buffer *buf; + + /* fill as much as we can into the current buffer */ + while (((fconn->flags & (FCGI_CF_MUX_MFULL|FCGI_CF_MUX_MALLOC)) == 0) && !done) + done = fcgi_process_mux(fconn); + + if (fconn->flags & FCGI_CF_MUX_MALLOC) + done = 1; // we won't go further without extra buffers + + if (conn->flags & CO_FL_ERROR) + break; + + if (fconn->flags & (FCGI_CF_MUX_MFULL | FCGI_CF_DEM_MROOM)) + flags |= CO_SFL_MSG_MORE; + + for (buf = br_head(fconn->mbuf); b_size(buf); buf = br_del_head(fconn->mbuf)) { + if (b_data(buf)) { + int ret; + + ret = conn->xprt->snd_buf(conn, conn->xprt_ctx, buf, b_data(buf), flags); + if (!ret) { + done = 1; + break; + } + sent = 1; + TRACE_DATA("send data", FCGI_EV_FCONN_SEND, conn, 0, 0, (size_t[]){ret}); + b_del(buf, ret); + if (b_data(buf)) { + done = 1; + break; + } + } + b_free(buf); + released++; + } + + if (released) + offer_buffers(NULL, released); + + /* wrote at least one byte, the buffer is not full anymore */ + if (fconn->flags & (FCGI_CF_MUX_MFULL | FCGI_CF_DEM_MROOM)) + TRACE_STATE("fconn mbuf ring not full anymore", FCGI_EV_FCONN_SEND|FCGI_EV_FCONN_BLK, conn); + fconn->flags &= ~(FCGI_CF_MUX_MFULL | FCGI_CF_DEM_MROOM); + } + + if (conn->flags & CO_FL_ERROR) { + fconn->flags |= FCGI_CF_ERR_PENDING; + if (fconn->flags & FCGI_CF_EOS) + fconn->flags |= FCGI_CF_ERROR; + b_reset(br_tail(fconn->mbuf)); + } + + /* We're not full anymore, so we can wake any tasks that are waiting + * for us. + */ + if (!(fconn->flags & (FCGI_CF_MUX_MFULL | FCGI_CF_DEM_MROOM)) && fconn->state >= FCGI_CS_RECORD_H) { + struct fcgi_strm *fstrm; + + list_for_each_entry(fstrm, &fconn->send_list, send_list) { + if (fconn->state == FCGI_CS_CLOSED || fconn->flags & FCGI_CF_MUX_BLOCK_ANY) + break; + + if (fstrm->flags & FCGI_SF_NOTIFIED) + continue; + + /* If the sender changed his mind and unsubscribed, let's just + * remove the stream from the send_list. 
+ */ + if (!(fstrm->flags & (FCGI_SF_WANT_SHUTR|FCGI_SF_WANT_SHUTW)) && + (!fstrm->subs || !(fstrm->subs->events & SUB_RETRY_SEND))) { + LIST_DEL_INIT(&fstrm->send_list); + continue; + } + + if (fstrm->subs && fstrm->subs->events & SUB_RETRY_SEND) { + TRACE_DEVEL("waking up pending stream", FCGI_EV_FCONN_SEND|FCGI_EV_STRM_WAKE, conn, fstrm); + fstrm->flags &= ~FCGI_SF_BLK_ANY; + fstrm->flags |= FCGI_SF_NOTIFIED; + tasklet_wakeup(fstrm->subs->tasklet); + fstrm->subs->events &= ~SUB_RETRY_SEND; + if (!fstrm->subs->events) + fstrm->subs = NULL; + } else { + /* it's the shut request that was queued */ + TRACE_POINT(FCGI_EV_STRM_WAKE, fconn->conn, fstrm); + tasklet_wakeup(fstrm->shut_tl); + } + } + } + /* We're done, no more to send */ + if (!br_data(fconn->mbuf)) { + TRACE_DEVEL("leaving with everything sent", FCGI_EV_FCONN_SEND, conn); + goto end; + } +schedule: + if (!(conn->flags & CO_FL_ERROR) && !(fconn->wait_event.events & SUB_RETRY_SEND)) { + TRACE_STATE("more data to send, subscribing", FCGI_EV_FCONN_SEND, conn); + conn->xprt->subscribe(conn, conn->xprt_ctx, SUB_RETRY_SEND, &fconn->wait_event); + } + + TRACE_DEVEL("leaving with some data left to send", FCGI_EV_FCONN_SEND, conn); +end: + return sent || (fconn->flags & (FCGI_CF_ERR_PENDING|FCGI_CF_ERROR)); +} + +/* this is the tasklet referenced in fconn->wait_event.tasklet */ +struct task *fcgi_io_cb(struct task *t, void *ctx, unsigned int state) +{ + struct connection *conn; + struct fcgi_conn *fconn = ctx; + struct tasklet *tl = (struct tasklet *)t; + int conn_in_list; + int ret = 0; + + if (state & TASK_F_USR1) { + /* the tasklet was idling on an idle connection, it might have + * been stolen, let's be careful! + */ + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + if (tl->context == NULL) { + /* The connection has been taken over by another thread, + * we're no longer responsible for it, so just free the + * tasklet, and do nothing. + */ + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + tasklet_free(tl); + return NULL; + } + conn = fconn->conn; + TRACE_POINT(FCGI_EV_FCONN_WAKE, conn); + + conn_in_list = conn->flags & CO_FL_LIST_MASK; + if (conn_in_list) + conn_delete_from_tree(conn); + + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + } else { + /* we're certain the connection was not in an idle list */ + conn = fconn->conn; + TRACE_ENTER(FCGI_EV_FCONN_WAKE, conn); + conn_in_list = 0; + } + + if (!(fconn->wait_event.events & SUB_RETRY_SEND)) + ret = fcgi_send(fconn); + if (!(fconn->wait_event.events & SUB_RETRY_RECV)) + ret |= fcgi_recv(fconn); + if (ret || b_data(&fconn->dbuf)) + ret = fcgi_process(fconn); + + /* If we were in an idle list, we want to add it back into it, + * unless fcgi_process() returned -1, which means it has destroyed + * the connection (testing !ret is enough: if fcgi_process() wasn't + * called then ret will be 0 anyway). + */ + if (ret < 0) + t = NULL; + + if (!ret && conn_in_list) { + struct server *srv = objt_server(conn->target); + + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + _srv_add_idle(srv, conn, conn_in_list == CO_FL_SAFE_LIST); + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + } + return t; +} + +/* callback called on any event by the connection handler. + * It applies changes and returns zero, or < 0 if it wants immediate + * destruction of the connection (which normally does not happen in FCGI). 
+ */ +static int fcgi_process(struct fcgi_conn *fconn) +{ + struct connection *conn = fconn->conn; + + TRACE_POINT(FCGI_EV_FCONN_WAKE, conn); + + if (b_data(&fconn->dbuf) && !(fconn->flags & FCGI_CF_DEM_BLOCK_ANY)) { + fcgi_process_demux(fconn); + + if (fconn->state == FCGI_CS_CLOSED || (fconn->flags & FCGI_CF_ERROR)) + b_reset(&fconn->dbuf); + + if (buf_room_for_htx_data(&fconn->dbuf)) + fconn->flags &= ~FCGI_CF_DEM_DFULL; + } + fcgi_send(fconn); + + if (unlikely(fconn->proxy->flags & (PR_FL_DISABLED|PR_FL_STOPPED))) { + /* frontend is stopping, reload likely in progress, let's try + * to announce a graceful shutdown if not yet done. We don't + * care if it fails, it will be tried again later. + */ + TRACE_STATE("proxy stopped, sending ABORT to all streams", FCGI_EV_FCONN_WAKE|FCGI_EV_TX_RECORD, conn); + if (!(fconn->flags & (FCGI_CF_ABRTS_SENT|FCGI_CF_ABRTS_FAILED))) { + if (fconn->stream_cnt - fconn->nb_reserved > 0) + fcgi_conn_send_aborts(fconn); + } + } + + /* + * If we received early data, and the handshake is done, wake + * any stream that was waiting for it. + */ + if (!(fconn->flags & FCGI_CF_WAIT_FOR_HS) && + (conn->flags & (CO_FL_EARLY_SSL_HS | CO_FL_WAIT_XPRT | CO_FL_EARLY_DATA)) == CO_FL_EARLY_DATA) { + struct eb32_node *node; + struct fcgi_strm *fstrm; + + fconn->flags |= FCGI_CF_WAIT_FOR_HS; + node = eb32_lookup_ge(&fconn->streams_by_id, 1); + + while (node) { + fstrm = container_of(node, struct fcgi_strm, by_id); + if (fcgi_strm_sc(fstrm) && se_fl_test(fstrm->sd, SE_FL_WAIT_FOR_HS)) + fcgi_strm_notify_recv(fstrm); + node = eb32_next(node); + } + } + + if ((fconn->flags & FCGI_CF_ERROR) || fcgi_conn_read0_pending(fconn) || + fconn->state == FCGI_CS_CLOSED || (fconn->flags & FCGI_CF_ABRTS_FAILED) || + eb_is_empty(&fconn->streams_by_id)) { + fcgi_wake_some_streams(fconn, 0); + + if (eb_is_empty(&fconn->streams_by_id)) { + /* no more stream, kill the connection now */ + fcgi_release(fconn); + TRACE_DEVEL("leaving after releasing the connection", FCGI_EV_FCONN_WAKE); + return -1; + } + } + + if (!b_data(&fconn->dbuf)) + fcgi_release_buf(fconn, &fconn->dbuf); + + if (fconn->state == FCGI_CS_CLOSED || (fconn->flags & FCGI_CF_ABRTS_FAILED) || + (!br_data(fconn->mbuf) && ((fconn->flags & FCGI_CF_MUX_BLOCK_ANY) || LIST_ISEMPTY(&fconn->send_list)))) + fcgi_release_mbuf(fconn); + + if (fconn->task) { + fconn->task->expire = tick_add(now_ms, (fconn->state == FCGI_CS_CLOSED ? fconn->shut_timeout : fconn->timeout)); + task_queue(fconn->task); + } + + fcgi_send(fconn); + TRACE_LEAVE(FCGI_EV_FCONN_WAKE, conn); + return 0; +} + + +/* wake-up function called by the connection layer (mux_ops.wake) */ +static int fcgi_wake(struct connection *conn) +{ + struct fcgi_conn *fconn = conn->ctx; + + TRACE_POINT(FCGI_EV_FCONN_WAKE, conn); + return (fcgi_process(fconn)); +} + + +static int fcgi_ctl(struct connection *conn, enum mux_ctl_type mux_ctl, void *output) +{ + int ret = 0; + switch (mux_ctl) { + case MUX_CTL_STATUS: + if (!(conn->flags & CO_FL_WAIT_XPRT)) + ret |= MUX_STATUS_READY; + return ret; + case MUX_CTL_EXIT_STATUS: + return MUX_ES_UNKNOWN; + default: + return -1; + } +} + +static int fcgi_sctl(struct stconn *sc, enum mux_sctl_type mux_sctl, void *output) +{ + int ret = 0; + struct fcgi_strm *fstrm = __sc_mux_strm(sc); + + switch (mux_sctl) { + case MUX_SCTL_SID: + if (output) + *((int64_t *)output) = fstrm->id; + return ret; + + default: + return -1; + } +} + +/* Connection timeout management. 
The principle is that if there is neither receipt + * nor sending for a certain amount of time, the connection is closed. If the + * MUX buffer still has pending data or is not allocatable, the connection is + * immediately killed. If it's allocatable and empty, we attempt to send + * ABORT records. + */ +struct task *fcgi_timeout_task(struct task *t, void *context, unsigned int state) +{ + struct fcgi_conn *fconn = context; + int expired = tick_is_expired(t->expire, now_ms); + + TRACE_ENTER(FCGI_EV_FCONN_WAKE, (fconn ? fconn->conn : NULL)); + + if (fconn) { + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + + /* Somebody already stole the connection from us, so we should not + * free it, we just have to free the task. + */ + if (!t->context) { + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + fconn = NULL; + goto do_leave; + } + + if (!expired) { + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + TRACE_DEVEL("leaving (not expired)", FCGI_EV_FCONN_WAKE, fconn->conn); + return t; + } + + /* We're about to destroy the connection, so make sure nobody attempts + * to steal it from us. + */ + if (fconn->conn->flags & CO_FL_LIST_MASK) + conn_delete_from_tree(fconn->conn); + + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + } + +do_leave: + task_destroy(t); + + if (!fconn) { + /* resources were already deleted */ + TRACE_DEVEL("leaving (no more fconn)", FCGI_EV_FCONN_WAKE); + return NULL; + } + + fconn->task = NULL; + fconn->state = FCGI_CS_CLOSED; + fcgi_wake_some_streams(fconn, 0); + + if (br_data(fconn->mbuf)) { + /* don't even try to send aborts, the buffer is stuck */ + fconn->flags |= FCGI_CF_ABRTS_FAILED; + goto end; + } + + /* try to send but no need to insist */ + if (!fcgi_conn_send_aborts(fconn)) + fconn->flags |= FCGI_CF_ABRTS_FAILED; + + if (br_data(fconn->mbuf) && !(fconn->flags & FCGI_CF_ABRTS_FAILED) && + conn_xprt_ready(fconn->conn)) { + unsigned int released = 0; + struct buffer *buf; + + for (buf = br_head(fconn->mbuf); b_size(buf); buf = br_del_head(fconn->mbuf)) { + if (b_data(buf)) { + int ret = fconn->conn->xprt->snd_buf(fconn->conn, fconn->conn->xprt_ctx, + buf, b_data(buf), 0); + if (!ret) + break; + b_del(buf, ret); + if (b_data(buf)) + break; + b_free(buf); + released++; + } + } + + if (released) + offer_buffers(NULL, released); + } + + end: + /* either we can release everything now or it will be done later once + * the last stream closes. + */ + if (eb_is_empty(&fconn->streams_by_id)) + fcgi_release(fconn); + + TRACE_LEAVE(FCGI_EV_FCONN_WAKE); + return NULL; +} + + +/*******************************************/ +/* functions below are used by the streams */ +/*******************************************/ + +/* Append the description of what is present in error snapshot <es> into <out>. + * The description must be small enough to always fit in a buffer. The output + * buffer may be the trash so the trash must not be used inside this function. + */ +static void fcgi_show_error_snapshot(struct buffer *out, const struct error_snapshot *es) +{ + chunk_appendf(out, + " FCGI connection flags 0x%08x, FCGI stream flags 0x%08x\n" + " H1 msg state %s(%d), H1 msg flags 0x%08x\n" + " H1 chunk len %lld bytes, H1 body len %lld bytes :\n", + es->ctx.h1.c_flags, es->ctx.h1.s_flags, + h1m_state_str(es->ctx.h1.state), es->ctx.h1.state, + es->ctx.h1.m_flags, es->ctx.h1.m_clen, es->ctx.h1.m_blen); +} +/* + * Capture a bad response and archive it in the proxy's structure. 
By default + * it tries to report the error position as h1m->err_pos. However if this one is + * not set, it will then report h1m->next, which is the last known parsing + * point. The function is able to deal with wrapping buffers. It always displays + * buffers as a contiguous area starting at buf->p. The direction is determined + * thanks to the h1m's flags. + */ +static void fcgi_strm_capture_bad_message(struct fcgi_conn *fconn, struct fcgi_strm *fstrm, + struct h1m *h1m, struct buffer *buf) +{ + struct session *sess = fstrm->sess; + struct proxy *proxy = fconn->proxy; + struct proxy *other_end; + union error_snapshot_ctx ctx; + + if (fcgi_strm_sc(fstrm) && sc_strm(fcgi_strm_sc(fstrm))) { + if (sess == NULL) + sess = __sc_strm(fcgi_strm_sc(fstrm))->sess; + if (!(h1m->flags & H1_MF_RESP)) + other_end = __sc_strm(fcgi_strm_sc(fstrm))->be; + else + other_end = sess->fe; + } else + other_end = NULL; + /* http-specific part now */ + ctx.h1.state = h1m->state; + ctx.h1.c_flags = fconn->flags; + ctx.h1.s_flags = fstrm->flags; + ctx.h1.m_flags = h1m->flags; + ctx.h1.m_clen = h1m->curr_len; + ctx.h1.m_blen = h1m->body_len; + + proxy_capture_error(proxy, 1, other_end, fconn->conn->target, sess, buf, 0, 0, + (h1m->err_pos >= 0) ? h1m->err_pos : h1m->next, + &ctx, fcgi_show_error_snapshot); +} + +static size_t fcgi_strm_parse_headers(struct fcgi_strm *fstrm, struct h1m *h1m, struct htx *htx, + struct buffer *buf, size_t *ofs, size_t max) +{ + int ret; + + TRACE_ENTER(FCGI_EV_RSP_DATA|FCGI_EV_RSP_HDRS, fstrm->fconn->conn, fstrm, 0, (size_t[]){max}); + ret = h1_parse_msg_hdrs(h1m, NULL, htx, buf, *ofs, max); + if (ret <= 0) { + TRACE_DEVEL("leaving on missing data or error", FCGI_EV_RSP_DATA|FCGI_EV_RSP_HDRS, fstrm->fconn->conn, fstrm); + if (htx->flags & HTX_FL_PARSING_ERROR) { + TRACE_ERROR("parsing error, reject H1 response", FCGI_EV_RSP_DATA|FCGI_EV_RSP_HDRS|FCGI_EV_FSTRM_ERR, fstrm->fconn->conn, fstrm); + fcgi_strm_error(fstrm); + fcgi_strm_capture_bad_message(fstrm->fconn, fstrm, h1m, buf); + } + ret = 0; + goto end; + } + + /* Reject any message with an unknown transfer-encoding; in fact, any + * encoding other than "chunked" is rejected. A 422-Unprocessable-Content is + * returned for an invalid request, a 502-Bad-Gateway for an invalid + * response. 
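+ * + * For context, the parsers below drive fstrm->h1m through the usual H1 state + * ladder, which is how fcgi_strm_parse_response() dispatches (a rough sketch + * assuming the standard h1m states): + * + * up to H1_MSG_LAST_LF -> fcgi_strm_parse_headers() + * H1_MSG_DATA and chunked states -> fcgi_strm_parse_data() + * H1_MSG_TRAILERS -> fcgi_strm_parse_trailers() + * H1_MSG_DONE -> message complete, EOM reported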
+ */ + if (h1m->flags & H1_MF_TE_OTHER) { + htx->flags |= HTX_FL_PARSING_ERROR; + TRACE_ERROR("Unknown transfer-encoding", FCGI_EV_RSP_DATA|FCGI_EV_RSP_HDRS|FCGI_EV_FSTRM_ERR, fstrm->fconn->conn, fstrm); + fcgi_strm_error(fstrm); + fcgi_strm_capture_bad_message(fstrm->fconn, fstrm, h1m, buf); + ret = 0; + goto end; + } + + *ofs += ret; + end: + TRACE_LEAVE(FCGI_EV_RSP_DATA|FCGI_EV_RSP_HDRS, fstrm->fconn->conn, fstrm, 0, (size_t[]){ret}); + return ret; + +} + +static size_t fcgi_strm_parse_data(struct fcgi_strm *fstrm, struct h1m *h1m, struct htx **htx, + struct buffer *buf, size_t *ofs, size_t max, struct buffer *htxbuf) +{ + size_t ret; + + TRACE_ENTER(FCGI_EV_RSP_DATA|FCGI_EV_RSP_BODY, fstrm->fconn->conn, fstrm, 0, (size_t[]){max}); + ret = h1_parse_msg_data(h1m, htx, buf, *ofs, max, htxbuf); + if (!ret) { + TRACE_DEVEL("leaving on missing data or error", FCGI_EV_RSP_DATA|FCGI_EV_RSP_BODY, fstrm->fconn->conn, fstrm); + if ((*htx)->flags & HTX_FL_PARSING_ERROR) { + TRACE_ERROR("parsing error, reject H1 response", FCGI_EV_RSP_DATA|FCGI_EV_RSP_BODY|FCGI_EV_FSTRM_ERR, fstrm->fconn->conn, fstrm); + fcgi_strm_error(fstrm); + fcgi_strm_capture_bad_message(fstrm->fconn, fstrm, h1m, buf); + } + goto end; + } + *ofs += ret; + end: + TRACE_LEAVE(FCGI_EV_RSP_DATA|FCGI_EV_RSP_BODY, fstrm->fconn->conn, fstrm, 0, (size_t[]){ret}); + return ret; +} + +static size_t fcgi_strm_parse_trailers(struct fcgi_strm *fstrm, struct h1m *h1m, struct htx *htx, + struct buffer *buf, size_t *ofs, size_t max) +{ + int ret; + + TRACE_ENTER(FCGI_EV_RSP_DATA|FCGI_EV_RSP_TLRS, fstrm->fconn->conn, fstrm, 0, (size_t[]){max}); + ret = h1_parse_msg_tlrs(h1m, htx, buf, *ofs, max); + if (ret <= 0) { + TRACE_DEVEL("leaving on missing data or error", FCGI_EV_RSP_DATA|FCGI_EV_RSP_TLRS, fstrm->fconn->conn, fstrm); + if (htx->flags & HTX_FL_PARSING_ERROR) { + TRACE_ERROR("parsing error, reject H1 response", FCGI_EV_RSP_DATA|FCGI_EV_RSP_TLRS|FCGI_EV_FSTRM_ERR, fstrm->fconn->conn, fstrm); + fcgi_strm_error(fstrm); + fcgi_strm_capture_bad_message(fstrm->fconn, fstrm, h1m, buf); + } + ret = 0; + goto end; + } + *ofs += ret; + end: + TRACE_LEAVE(FCGI_EV_RSP_DATA|FCGI_EV_RSP_TLRS, fstrm->fconn->conn, fstrm, 0, (size_t[]){ret}); + return ret; +} + +static size_t fcgi_strm_parse_response(struct fcgi_strm *fstrm, struct buffer *buf, size_t count) +{ + struct fcgi_conn *fconn = fstrm->fconn; + struct htx *htx; + struct h1m *h1m = &fstrm->h1m; + size_t ret, data, total = 0; + + htx = htx_from_buf(buf); + TRACE_ENTER(FCGI_EV_RSP_DATA, fconn->conn, fstrm, htx, (size_t[]){count}); + + data = htx->data; + if (fstrm->state == FCGI_SS_ERROR) + goto end; + + do { + size_t used = htx_used_space(htx); + + if (h1m->state <= H1_MSG_LAST_LF) { + TRACE_PROTO("parsing response headers", FCGI_EV_RSP_DATA|FCGI_EV_RSP_HDRS, fconn->conn, fstrm); + ret = fcgi_strm_parse_headers(fstrm, h1m, htx, &fstrm->rxbuf, &total, count); + if (!ret) + break; + + TRACE_USER("rcvd H1 response headers", FCGI_EV_RSP_DATA|FCGI_EV_RSP_HDRS, fconn->conn, fstrm, htx); + + if ((h1m->flags & (H1_MF_VER_11|H1_MF_XFER_LEN)) == H1_MF_VER_11) { + struct htx_blk *blk = htx_get_head_blk(htx); + struct htx_sl *sl; + + if (!blk) + break; + sl = htx_get_blk_ptr(htx, blk); + sl->flags |= HTX_SL_F_XFER_LEN; + htx->extra = 0; + } + } + else if (h1m->state < H1_MSG_TRAILERS) { + TRACE_PROTO("parsing response payload", FCGI_EV_RSP_DATA|FCGI_EV_RSP_BODY, fconn->conn, fstrm); + fcgi_strm_parse_data(fstrm, h1m, &htx, &fstrm->rxbuf, &total, count, buf); + + if (!(h1m->flags & H1_MF_XFER_LEN) && 
fstrm->state != FCGI_SS_ERROR && + (fstrm->flags & FCGI_SF_ES_RCVD) && b_data(&fstrm->rxbuf) == total) { + TRACE_DEVEL("end of data", FCGI_EV_RSP_DATA, fconn->conn, fstrm); + if (htx_is_empty(htx) && !htx_add_endof(htx, HTX_BLK_EOT)) + break; + htx->flags |= HTX_FL_EOM; + h1m->state = H1_MSG_DONE; + TRACE_USER("H1 response fully rcvd", FCGI_EV_RSP_DATA|FCGI_EV_RSP_EOM, fconn->conn, fstrm, htx); + } + + if (h1m->state < H1_MSG_TRAILERS) + break; + + TRACE_PROTO("rcvd response payload data", FCGI_EV_RSP_DATA|FCGI_EV_RSP_BODY, fconn->conn, fstrm, htx); + } + else if (h1m->state == H1_MSG_TRAILERS) { + TRACE_PROTO("parsing response trailers", FCGI_EV_RSP_DATA|FCGI_EV_RSP_TLRS, fconn->conn, fstrm); + fcgi_strm_parse_trailers(fstrm, h1m, htx, &fstrm->rxbuf, &total, count); + if (h1m->state != H1_MSG_DONE) + break; + + TRACE_PROTO("rcvd H1 response trailers", FCGI_EV_RSP_DATA|FCGI_EV_RSP_TLRS, fconn->conn, fstrm, htx); + } + else if (h1m->state == H1_MSG_DONE) { + TRACE_USER("H1 response fully rcvd", FCGI_EV_RSP_DATA|FCGI_EV_RSP_EOM, fconn->conn, fstrm, htx); + if (b_data(&fstrm->rxbuf) > total) { + htx->flags |= HTX_FL_PARSING_ERROR; + TRACE_PROTO("too much data, parsing error", FCGI_EV_RSP_DATA, fconn->conn, fstrm); + fcgi_strm_error(fstrm); + } + break; + } + else { + htx->flags |= HTX_FL_PROCESSING_ERROR; + TRACE_ERROR("unexpected processing error", FCGI_EV_RSP_DATA|FCGI_EV_STRM_ERR, fconn->conn, fstrm); + fcgi_strm_error(fstrm); + break; + } + + count -= htx_used_space(htx) - used; + } while (fstrm->state != FCGI_SS_ERROR); + + if (fstrm->state == FCGI_SS_ERROR) { + b_reset(&fstrm->rxbuf); + htx_to_buf(htx, buf); + TRACE_DEVEL("leaving on error", FCGI_EV_RSP_DATA|FCGI_EV_STRM_ERR, fconn->conn, fstrm); + return 0; + } + + b_del(&fstrm->rxbuf, total); + + end: + htx_to_buf(htx, buf); + ret = htx->data - data; + TRACE_LEAVE(FCGI_EV_RSP_DATA, fconn->conn, fstrm, htx, (size_t[]){ret}); + return ret; +} + +/* + * Attach a new stream to a connection + * (Used for outgoing connections) + */ +static int fcgi_attach(struct connection *conn, struct sedesc *sd, struct session *sess) +{ + struct fcgi_strm *fstrm; + struct fcgi_conn *fconn = conn->ctx; + + TRACE_ENTER(FCGI_EV_FSTRM_NEW, conn); + fstrm = fcgi_stconn_new(fconn, sd->sc, sess); + if (!fstrm) + goto err; + + /* the connection is not idle anymore, let's mark this */ + HA_ATOMIC_AND(&fconn->wait_event.tasklet->state, ~TASK_F_USR1); + xprt_set_used(conn, conn->xprt, conn->xprt_ctx); + + TRACE_LEAVE(FCGI_EV_FSTRM_NEW, conn, fstrm); + return 0; + + err: + TRACE_DEVEL("leaving on error", FCGI_EV_FSTRM_NEW|FCGI_EV_FSTRM_ERR, conn); + return -1; +} + +/* Retrieves the first valid stream connector from this connection, or returns NULL. + * We have to scan because we may have some orphan streams. It might be + * beneficial to scan backwards from the end to reduce the likeliness to find + * orphans. 
+ */ +static struct stconn *fcgi_get_first_sc(const struct connection *conn) +{ + struct fcgi_conn *fconn = conn->ctx; + struct fcgi_strm *fstrm; + struct eb32_node *node; + + node = eb32_first(&fconn->streams_by_id); + while (node) { + fstrm = container_of(node, struct fcgi_strm, by_id); + if (fcgi_strm_sc(fstrm)) + return fcgi_strm_sc(fstrm); + node = eb32_next(node); + } + return NULL; +} + +/* + * Destroy the mux and the associated connection, if it is no longer used + */ +static void fcgi_destroy(void *ctx) +{ + struct fcgi_conn *fconn = ctx; + + TRACE_POINT(FCGI_EV_FCONN_END, fconn->conn); + if (eb_is_empty(&fconn->streams_by_id)) { + BUG_ON(fconn->conn->ctx != fconn); + fcgi_release(fconn); + } +} + +/* + * Detach the stream from the connection and possibly release the connection. + */ +static void fcgi_detach(struct sedesc *sd) +{ + struct fcgi_strm *fstrm = sd->se; + struct fcgi_conn *fconn; + struct session *sess; + + TRACE_ENTER(FCGI_EV_STRM_END, (fstrm ? fstrm->fconn->conn : NULL), fstrm); + + if (!fstrm) { + TRACE_LEAVE(FCGI_EV_STRM_END); + return; + } + + /* there's no txbuf so we're certain not to be able to send anything */ + fstrm->flags &= ~FCGI_SF_NOTIFIED; + + sess = fstrm->sess; + fconn = fstrm->fconn; + fconn->nb_sc--; + + if (fstrm->proto_status == FCGI_PS_CANT_MPX_CONN) { + fconn->flags &= ~FCGI_CF_MPXS_CONNS; + fconn->streams_limit = 1; + } + else if (fstrm->proto_status == FCGI_PS_OVERLOADED || + fstrm->proto_status == FCGI_PS_UNKNOWN_ROLE) { + fconn->flags &= ~FCGI_CF_KEEP_CONN; + fconn->state = FCGI_CS_CLOSED; + } + + /* this stream may be blocked waiting for some data to leave, so orphan + * it in this case. + */ + if (!(fconn->flags & (FCGI_CF_ERR_PENDING|FCGI_CF_ERROR)) && // FIXME: Be sure for ERR_PENDING + (fconn->state != FCGI_CS_CLOSED) && + (fstrm->flags & (FCGI_SF_BLK_MBUSY|FCGI_SF_BLK_MROOM)) && + (fstrm->subs || (fstrm->flags & (FCGI_SF_WANT_SHUTR|FCGI_SF_WANT_SHUTW)))) { + TRACE_DEVEL("leaving on stream blocked", FCGI_EV_STRM_END|FCGI_EV_FSTRM_BLK, fconn->conn, fstrm); + return; + } + + if ((fconn->flags & FCGI_CF_DEM_BLOCK_ANY && fstrm->id == fconn->dsi)) { + /* unblock the connection if it was blocked on this stream. */ + fconn->flags &= ~FCGI_CF_DEM_BLOCK_ANY; + fcgi_conn_restart_reading(fconn, 1); + } + + fcgi_strm_destroy(fstrm); + + if (!(fconn->flags & (FCGI_CF_EOS|FCGI_CF_ERR_PENDING|FCGI_CF_ERROR)) && + (fconn->flags & FCGI_CF_KEEP_CONN)) { + if (fconn->conn->flags & CO_FL_PRIVATE) { + /* Add the connection in the session serverlist, if not already done */ + if (!session_add_conn(sess, fconn->conn, fconn->conn->target)) { + fconn->conn->owner = NULL; + if (eb_is_empty(&fconn->streams_by_id)) { + /* let's kill the connection right away */ + fconn->conn->mux->destroy(fconn); + TRACE_DEVEL("outgoing connection killed", FCGI_EV_STRM_END|FCGI_EV_FCONN_ERR); + return; + } + } + if (eb_is_empty(&fconn->streams_by_id)) { + if (session_check_idle_conn(fconn->conn->owner, fconn->conn) != 0) { + /* The connection is destroyed, let's leave */ + TRACE_DEVEL("outgoing connection killed", FCGI_EV_STRM_END|FCGI_EV_FCONN_ERR); + return; + } + } + } + else { + if (eb_is_empty(&fconn->streams_by_id)) { + /* If the connection is owned by the session, first remove it + * from its list + */ + if (fconn->conn->owner) { + session_unown_conn(fconn->conn->owner, fconn->conn); + fconn->conn->owner = NULL; + } + + /* mark that the tasklet may lose its context to another thread and + * that the handler needs to check it under the idle conns lock. 
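+ * + * The companion of this flag lives in fcgi_io_cb() above: a tasklet woken + * with TASK_F_USR1 must re-check its context under the idle-conns lock and + * give up when the connection was stolen, roughly: + * + * HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + * if (tl->context == NULL) { unlock, tasklet_free(tl) and return } + * + * so setting the flag before xprt_set_idle() and before the insertion in the + * idle list below is what makes the cross-thread takeover safe.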
+ */ + HA_ATOMIC_OR(&fconn->wait_event.tasklet->state, TASK_F_USR1); + xprt_set_idle(fconn->conn, fconn->conn->xprt, fconn->conn->xprt_ctx); + + if (!srv_add_to_idle_list(objt_server(fconn->conn->target), fconn->conn, 1)) { + /* The server doesn't want it, let's kill the connection right away */ + fconn->conn->mux->destroy(fconn); + TRACE_DEVEL("outgoing connection killed", FCGI_EV_STRM_END|FCGI_EV_FCONN_ERR); + return; + } + /* At this point, the connection has been added to the + * server idle list, so another thread may already have + * hijacked it, so we can't do anything with it. + */ + TRACE_DEVEL("reusable idle connection", FCGI_EV_STRM_END, fconn->conn); + return; + } + else if (!fconn->conn->hash_node->node.node.leaf_p && + fcgi_avail_streams(fconn->conn) > 0 && objt_server(fconn->conn->target) && + !LIST_INLIST(&fconn->conn->session_list)) { + srv_add_to_avail_list(__objt_server(fconn->conn->target), fconn->conn); + } + } + } + + /* We don't want to close right now unless we're removing the last + * stream and the connection is in error. + */ + if (fcgi_conn_is_dead(fconn)) { + /* no more stream will come, kill it now */ + TRACE_DEVEL("leaving, killing dead connection", FCGI_EV_STRM_END, fconn->conn); + fcgi_release(fconn); + } + else if (fconn->task) { + fconn->task->expire = tick_add(now_ms, (fconn->state == FCGI_CS_CLOSED ? fconn->shut_timeout : fconn->timeout)); + task_queue(fconn->task); + TRACE_DEVEL("leaving, refreshing connection's timeout", FCGI_EV_STRM_END, fconn->conn); + } + else + TRACE_DEVEL("leaving", FCGI_EV_STRM_END, fconn->conn); +} + + +/* Performs a synchronous or asynchronous shutr(). */ +static void fcgi_do_shutr(struct fcgi_strm *fstrm) +{ + struct fcgi_conn *fconn = fstrm->fconn; + + TRACE_ENTER(FCGI_EV_STRM_SHUT, fconn->conn, fstrm); + + if (fstrm->state == FCGI_SS_CLOSED) + goto done; + + /* a connstream may require us to immediately kill the whole connection + * for example because of a "tcp-request content reject" rule that is + * normally used to limit abuse. + */ + if (se_fl_test(fstrm->sd, SE_FL_KILL_CONN) && + !(fconn->flags & (FCGI_CF_ABRTS_SENT|FCGI_CF_ABRTS_FAILED))) { + TRACE_STATE("stream wants to kill the connection", FCGI_EV_STRM_SHUT, fconn->conn, fstrm); + fconn->state = FCGI_CS_CLOSED; + } + else if (fstrm->flags & FCGI_SF_BEGIN_SENT) { + TRACE_STATE("no headers sent yet, trying a retryable abort", FCGI_EV_STRM_SHUT, fconn->conn, fstrm); + if (!(fstrm->flags & (FCGI_SF_ES_SENT|FCGI_SF_ABRT_SENT)) && + !fcgi_strm_send_abort(fconn, fstrm)) + goto add_to_list; + } + + fcgi_strm_close(fstrm); + + if (!(fconn->wait_event.events & SUB_RETRY_SEND)) + tasklet_wakeup(fconn->wait_event.tasklet); + done: + fstrm->flags &= ~FCGI_SF_WANT_SHUTR; + TRACE_LEAVE(FCGI_EV_STRM_SHUT, fconn->conn, fstrm); + return; + + add_to_list: + /* Let the handler know we want to shutr, and add ourselves to the + * send list if not yet done. fcgi_deferred_shut() will be + * automatically called via the shut_tl tasklet when there's room + * again. + */ + if (!LIST_INLIST(&fstrm->send_list)) { + if (fstrm->flags & (FCGI_SF_BLK_MBUSY|FCGI_SF_BLK_MROOM)) { + LIST_APPEND(&fconn->send_list, &fstrm->send_list); + } + } + fstrm->flags |= FCGI_SF_WANT_SHUTR; + TRACE_LEAVE(FCGI_EV_STRM_SHUT, fconn->conn, fstrm); + return; +} + +/* Performs a synchronous or asynchronous shutw(). 
*/ +static void fcgi_do_shutw(struct fcgi_strm *fstrm) +{ + struct fcgi_conn *fconn = fstrm->fconn; + + TRACE_ENTER(FCGI_EV_STRM_SHUT, fconn->conn, fstrm); + + if (fstrm->state == FCGI_SS_HLOC || fstrm->state == FCGI_SS_CLOSED) + goto done; + + if (fstrm->state != FCGI_SS_ERROR && (fstrm->flags & FCGI_SF_BEGIN_SENT)) { + if (!(fstrm->flags & (FCGI_SF_ES_SENT|FCGI_SF_ABRT_SENT)) && + !fcgi_strm_send_abort(fconn, fstrm)) + goto add_to_list; + + if (fstrm->state == FCGI_SS_HREM) + fcgi_strm_close(fstrm); + else + fstrm->state = FCGI_SS_HLOC; + } else { + /* a connstream may require us to immediately kill the whole connection + * for example because of a "tcp-request content reject" rule that is + * normally used to limit abuse. + */ + if (se_fl_test(fstrm->sd, SE_FL_KILL_CONN) && + !(fconn->flags & (FCGI_CF_ABRTS_SENT|FCGI_CF_ABRTS_FAILED))) { + TRACE_STATE("stream wants to kill the connection", FCGI_EV_STRM_SHUT, fconn->conn, fstrm); + fconn->state = FCGI_CS_CLOSED; + } + + fcgi_strm_close(fstrm); + } + + if (!(fconn->wait_event.events & SUB_RETRY_SEND)) + tasklet_wakeup(fconn->wait_event.tasklet); + done: + fstrm->flags &= ~FCGI_SF_WANT_SHUTW; + TRACE_LEAVE(FCGI_EV_STRM_SHUT, fconn->conn, fstrm); + return; + + add_to_list: + /* Let the handler know we want to shutw, and add ourselves to the + * send list if not yet done. fcgi_deferred_shut() will be + * automatically called via the shut_tl tasklet when there's room + * again. + */ + if (!LIST_INLIST(&fstrm->send_list)) { + if (fstrm->flags & (FCGI_SF_BLK_MBUSY|FCGI_SF_BLK_MROOM)) { + LIST_APPEND(&fconn->send_list, &fstrm->send_list); + } + } + fstrm->flags |= FCGI_SF_WANT_SHUTW; + TRACE_LEAVE(FCGI_EV_STRM_SHUT, fconn->conn, fstrm); + return; +} + +/* This is the tasklet referenced in fstrm->shut_tl, it is used for + * deferred shutdowns when the fcgi_detach() was done but the mux buffer was full + * and prevented the last record from being emitted. + */ +struct task *fcgi_deferred_shut(struct task *t, void *ctx, unsigned int state) +{ + struct fcgi_strm *fstrm = ctx; + struct fcgi_conn *fconn = fstrm->fconn; + + TRACE_ENTER(FCGI_EV_STRM_SHUT, fconn->conn, fstrm); + + if (fstrm->flags & FCGI_SF_NOTIFIED) { + /* some data processing remains to be done first */ + goto end; + } + + if (fstrm->flags & FCGI_SF_WANT_SHUTW) + fcgi_do_shutw(fstrm); + + if (fstrm->flags & FCGI_SF_WANT_SHUTR) + fcgi_do_shutr(fstrm); + + if (!(fstrm->flags & (FCGI_SF_WANT_SHUTR|FCGI_SF_WANT_SHUTW))) { + /* We're done trying to send, remove ourselves from the send_list */ + LIST_DEL_INIT(&fstrm->send_list); + + if (!fcgi_strm_sc(fstrm)) { + fcgi_strm_destroy(fstrm); + if (fcgi_conn_is_dead(fconn)) + fcgi_release(fconn); + } + } + end: + TRACE_LEAVE(FCGI_EV_STRM_SHUT); + return NULL; +} + +/* shutr() called by the stream connector (mux_ops.shutr) */ +static void fcgi_shutr(struct stconn *sc, enum co_shr_mode mode) +{ + struct fcgi_strm *fstrm = __sc_mux_strm(sc); + + TRACE_POINT(FCGI_EV_STRM_SHUT, fstrm->fconn->conn, fstrm); + if (!mode) + return; + fcgi_do_shutr(fstrm); +} + +/* shutw() called by the stream connector (mux_ops.shutw) */ +static void fcgi_shutw(struct stconn *sc, enum co_shw_mode mode) +{ + struct fcgi_strm *fstrm = __sc_mux_strm(sc); + + TRACE_POINT(FCGI_EV_STRM_SHUT, fstrm->fconn->conn, fstrm); + fcgi_do_shutw(fstrm); +} + +/* Called from the upper layer, to subscribe <es> to events <event_type>. The + * event subscriber <es> is not allowed to change from a previous call as long + * as at least one event is still subscribed. 
The <event_type> must only be a + * combination of SUB_RETRY_RECV and SUB_RETRY_SEND. It always returns 0. + */ +static int fcgi_subscribe(struct stconn *sc, int event_type, struct wait_event *es) +{ + struct fcgi_strm *fstrm = __sc_mux_strm(sc); + struct fcgi_conn *fconn = fstrm->fconn; + + BUG_ON(event_type & ~(SUB_RETRY_SEND|SUB_RETRY_RECV)); + BUG_ON(fstrm->subs && fstrm->subs != es); + + es->events |= event_type; + fstrm->subs = es; + + if (event_type & SUB_RETRY_RECV) + TRACE_DEVEL("subscribe(recv)", FCGI_EV_STRM_RECV, fconn->conn, fstrm); + + if (event_type & SUB_RETRY_SEND) { + TRACE_DEVEL("subscribe(send)", FCGI_EV_STRM_SEND, fconn->conn, fstrm); + if (!LIST_INLIST(&fstrm->send_list)) + LIST_APPEND(&fconn->send_list, &fstrm->send_list); + } + return 0; +} + +/* Called from the upper layer, to unsubscribe <es> from events <event_type> + * (undo fcgi_subscribe). The <es> pointer is not allowed to differ from the one + * passed to the subscribe() call. It always returns zero. + */ +static int fcgi_unsubscribe(struct stconn *sc, int event_type, struct wait_event *es) +{ + struct fcgi_strm *fstrm = __sc_mux_strm(sc); + struct fcgi_conn *fconn = fstrm->fconn; + + BUG_ON(event_type & ~(SUB_RETRY_SEND|SUB_RETRY_RECV)); + BUG_ON(fstrm->subs && fstrm->subs != es); + + es->events &= ~event_type; + if (!es->events) + fstrm->subs = NULL; + + if (event_type & SUB_RETRY_RECV) + TRACE_DEVEL("unsubscribe(recv)", FCGI_EV_STRM_RECV, fconn->conn, fstrm); + + if (event_type & SUB_RETRY_SEND) { + TRACE_DEVEL("unsubscribe(send)", FCGI_EV_STRM_SEND, fconn->conn, fstrm); + fstrm->flags &= ~FCGI_SF_NOTIFIED; + if (!(fstrm->flags & (FCGI_SF_WANT_SHUTR|FCGI_SF_WANT_SHUTW))) + LIST_DEL_INIT(&fstrm->send_list); + } + return 0; +} + +/* Called from the upper layer, to receive data + * + * The caller is responsible for defragmenting <buf> if necessary. But <flags> + * must be tested to know the calling context. If CO_RFL_BUF_FLUSH is set, it + * means the caller wants to flush input data (from the mux buffer and the + * channel buffer) to be able to use kernel splicing or any kind of mux-to-mux + * xfer. If CO_RFL_KEEP_RECV is set, the mux must always subscribe for read + * events before giving back. CO_RFL_BUF_WET is set if <buf> is congested with + * data scheduled for leaving soon. CO_RFL_BUF_NOT_STUCK is set to instruct the + * mux it may optimize the data copy to <buf> if necessary. Otherwise, it should + * copy as much data as possible. + */ +static size_t fcgi_rcv_buf(struct stconn *sc, struct buffer *buf, size_t count, int flags) +{ + struct fcgi_strm *fstrm = __sc_mux_strm(sc); + struct fcgi_conn *fconn = fstrm->fconn; + size_t ret = 0; + + TRACE_ENTER(FCGI_EV_STRM_RECV, fconn->conn, fstrm); + + if (!(fconn->flags & FCGI_CF_DEM_SALLOC)) + ret = fcgi_strm_parse_response(fstrm, buf, count); + else + TRACE_STATE("fstrm rxbuf not allocated", FCGI_EV_STRM_RECV|FCGI_EV_FSTRM_BLK, fconn->conn, fstrm); + + if (b_data(&fstrm->rxbuf)) { + /* If the channel buffer is not empty, consider the mux is + * blocked because it needs more room. But if the channel buffer + * is empty, it means partial data were received and the mux + * needs to receive more data to be able to parse it. 
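+ * In the first case both SE_FL_RCV_MORE and SE_FL_WANT_ROOM are set on
+ * the stream endpoint so that the caller comes back once part of <buf>
+ * has been consumed; in the second case these flags are cleared and the
+ * stream's rxbuf is released.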
+ */ + if (b_data(buf)) + se_fl_set(fstrm->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + } + else { + se_fl_clr(fstrm->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + if (fstrm->state == FCGI_SS_ERROR || (fstrm->h1m.state == H1_MSG_DONE)) { + se_fl_set(fstrm->sd, SE_FL_EOI); + if (!(fstrm->h1m.flags & (H1_MF_VER_11|H1_MF_XFER_LEN))) + se_fl_set(fstrm->sd, SE_FL_EOS); + } + if (fcgi_conn_read0_pending(fconn)) { + se_fl_set(fstrm->sd, SE_FL_EOS); + if (!se_fl_test(fstrm->sd, SE_FL_EOI)) + se_fl_set(fstrm->sd, SE_FL_ERROR); + } + if (se_fl_test(fstrm->sd, SE_FL_ERR_PENDING)) + se_fl_set(fstrm->sd, SE_FL_ERROR); + fcgi_release_buf(fconn, &fstrm->rxbuf); + } + + if (ret && fconn->dsi == fstrm->id) { + /* demux is blocking on this stream's buffer */ + fconn->flags &= ~FCGI_CF_DEM_SFULL; + fcgi_conn_restart_reading(fconn, 1); + } + + TRACE_LEAVE(FCGI_EV_STRM_RECV, fconn->conn, fstrm); + return ret; +} + + +/* Called from the upper layer, to send data from buffer <buf> for no more than + * <count> bytes. Returns the number of bytes effectively sent. Some status + * flags may be updated on the stream connector. + */ +static size_t fcgi_snd_buf(struct stconn *sc, struct buffer *buf, size_t count, int flags) +{ + struct fcgi_strm *fstrm = __sc_mux_strm(sc); + struct fcgi_conn *fconn = fstrm->fconn; + size_t total = 0; + size_t ret; + struct htx *htx = NULL; + struct htx_sl *sl; + struct htx_blk *blk; + uint32_t bsize; + + TRACE_ENTER(FCGI_EV_STRM_SEND, fconn->conn, fstrm, 0, (size_t[]){count}); + + /* If we were not just woken because we wanted to send but couldn't, + * and there's somebody else that is waiting to send, do nothing, + * we will subscribe later and be put at the end of the list + */ + if (!(fstrm->flags & FCGI_SF_NOTIFIED) && !LIST_ISEMPTY(&fconn->send_list)) { + TRACE_STATE("other streams already waiting, going to the queue and leaving", FCGI_EV_STRM_SEND|FCGI_EV_FSTRM_BLK, fconn->conn, fstrm); + return 0; + } + fstrm->flags &= ~FCGI_SF_NOTIFIED; + + if (fconn->state < FCGI_CS_RECORD_H) { + TRACE_STATE("connection not ready, leaving", FCGI_EV_STRM_SEND|FCGI_EV_FSTRM_BLK, fconn->conn, fstrm); + return 0; + } + + htx = htxbuf(buf); + if (fstrm->id == 0) { + int32_t id = fcgi_conn_get_next_sid(fconn); + + if (id < 0) { + fcgi_strm_close(fstrm); + se_fl_set(fstrm->sd, SE_FL_ERROR); + TRACE_DEVEL("couldn't get a stream ID, leaving in error", FCGI_EV_STRM_SEND|FCGI_EV_FSTRM_ERR|FCGI_EV_STRM_ERR, fconn->conn, fstrm); + return 0; + } + + eb32_delete(&fstrm->by_id); + fstrm->by_id.key = fstrm->id = id; + fconn->max_id = id; + fconn->nb_reserved--; + eb32_insert(&fconn->streams_by_id, &fstrm->by_id); + + + /* Check if length of the body is known or if the message is + * full. Otherwise, the request is invalid. 
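+ * (FastCGI requires this: the body size is advertised to the application
+ * through the CONTENT_LENGTH parameter, sent within the PARAMS records
+ * before any STDIN data, so it must be known when the request starts,
+ * either from a content-length header or because the message is already
+ * complete.)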
+ */ + sl = http_get_stline(htx); + if (!sl || (!(sl->flags & HTX_SL_F_CLEN) && !(htx->flags & HTX_FL_EOM))) { + htx->flags |= HTX_FL_PARSING_ERROR; + fcgi_strm_error(fstrm); + goto done; + } + } + + if (!(fstrm->flags & FCGI_SF_BEGIN_SENT)) { + TRACE_PROTO("sending FCGI BEGIN_REQUEST record", FCGI_EV_TX_RECORD|FCGI_EV_TX_BEGREQ, fconn->conn, fstrm); + if (!fcgi_strm_send_begin_request(fconn, fstrm)) + goto done; + } + + if (!(fstrm->flags & FCGI_SF_OUTGOING_DATA) && count) + fstrm->flags |= FCGI_SF_OUTGOING_DATA; + + while (fstrm->state < FCGI_SS_HLOC && !(fstrm->flags & FCGI_SF_BLK_ANY) && + count && !htx_is_empty(htx)) { + blk = htx_get_head_blk(htx); + ALREADY_CHECKED(blk); + bsize = htx_get_blksz(blk); + + switch (htx_get_blk_type(blk)) { + case HTX_BLK_REQ_SL: + case HTX_BLK_HDR: + TRACE_USER("sending FCGI PARAMS record", FCGI_EV_TX_RECORD|FCGI_EV_TX_PARAMS, fconn->conn, fstrm, htx); + ret = fcgi_strm_send_params(fconn, fstrm, htx); + if (!ret) { + goto done; + } + total += ret; + count -= ret; + break; + + case HTX_BLK_EOH: + if (!(fstrm->flags & FCGI_SF_EP_SENT)) { + TRACE_PROTO("sending FCGI PARAMS record", FCGI_EV_TX_RECORD|FCGI_EV_TX_PARAMS, fconn->conn, fstrm, htx); + ret = fcgi_strm_send_empty_params(fconn, fstrm); + if (!ret) + goto done; + } + if (htx_is_unique_blk(htx, blk) && (htx->flags & HTX_FL_EOM)) { + TRACE_PROTO("sending FCGI STDIN record", FCGI_EV_TX_RECORD|FCGI_EV_TX_STDIN, fconn->conn, fstrm, htx); + ret = fcgi_strm_send_empty_stdin(fconn, fstrm); + if (!ret) + goto done; + } + goto remove_blk; + + case HTX_BLK_DATA: + TRACE_PROTO("sending FCGI STDIN record", FCGI_EV_TX_RECORD|FCGI_EV_TX_STDIN, fconn->conn, fstrm, htx); + ret = fcgi_strm_send_stdin(fconn, fstrm, htx, count, buf); + if (ret > 0) { + htx = htx_from_buf(buf); + total += ret; + count -= ret; + if (ret < bsize) + goto done; + } + break; + + default: + remove_blk: + htx_remove_blk(htx, blk); + total += bsize; + count -= bsize; + break; + } + } + + done: + if (fstrm->state >= FCGI_SS_HLOC) { + /* trim any possibly pending data after we close (extra CR-LF, + * unprocessed trailers, abnormal extra data, ...) 
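+ * The trimmed bytes are still accounted for in <total> so that the
+ * caller sees them as consumed and does not keep retrying on a locally
+ * closed stream.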
+ */ + total += count; + count = 0; + } + + if (fstrm->state == FCGI_SS_ERROR) { + TRACE_DEVEL("reporting error to the app-layer stream", FCGI_EV_STRM_SEND|FCGI_EV_FSTRM_ERR|FCGI_EV_STRM_ERR, fconn->conn, fstrm); + se_fl_set_error(fstrm->sd); + if (!(fstrm->flags & FCGI_SF_BEGIN_SENT) || fcgi_strm_send_abort(fconn, fstrm)) + fcgi_strm_close(fstrm); + } + + if (htx) + htx_to_buf(htx, buf); + + if (total > 0) { + if (!(fconn->wait_event.events & SUB_RETRY_SEND)) { + TRACE_DEVEL("data queued, waking up fconn sender", FCGI_EV_STRM_SEND|FCGI_EV_FCONN_SEND|FCGI_EV_FCONN_WAKE, fconn->conn, fstrm); + tasklet_wakeup(fconn->wait_event.tasklet); + } + + /* Ok we managed to send something, leave the send_list */ + if (!(fstrm->flags & (FCGI_SF_WANT_SHUTR|FCGI_SF_WANT_SHUTW))) + LIST_DEL_INIT(&fstrm->send_list); + } + + TRACE_LEAVE(FCGI_EV_STRM_SEND, fconn->conn, fstrm, htx, (size_t[]){total}); + return total; +} + +/* for debugging with CLI's "show fd" command */ +static int fcgi_show_fd(struct buffer *msg, struct connection *conn) +{ + struct fcgi_conn *fconn = conn->ctx; + struct fcgi_strm *fstrm = NULL; + struct eb32_node *node; + int send_cnt = 0; + int tree_cnt = 0; + int orph_cnt = 0; + struct buffer *hmbuf, *tmbuf; + + if (!fconn) + return 0; + + list_for_each_entry(fstrm, &fconn->send_list, send_list) + send_cnt++; + + fstrm = NULL; + node = eb32_first(&fconn->streams_by_id); + while (node) { + fstrm = container_of(node, struct fcgi_strm, by_id); + tree_cnt++; + if (!fcgi_strm_sc(fstrm)) + orph_cnt++; + node = eb32_next(node); + } + + hmbuf = br_head(fconn->mbuf); + tmbuf = br_tail(fconn->mbuf); + chunk_appendf(msg, " fconn.st0=%d .maxid=%d .flg=0x%04x .nbst=%u" + " .nbcs=%u .send_cnt=%d .tree_cnt=%d .orph_cnt=%d .sub=%d " + ".dsi=%d .dbuf=%u@%p+%u/%u .mbuf=[%u..%u|%u],h=[%u@%p+%u/%u],t=[%u@%p+%u/%u]", + fconn->state, fconn->max_id, fconn->flags, + fconn->nb_streams, fconn->nb_sc, send_cnt, tree_cnt, orph_cnt, + fconn->wait_event.events, fconn->dsi, + (unsigned int)b_data(&fconn->dbuf), b_orig(&fconn->dbuf), + (unsigned int)b_head_ofs(&fconn->dbuf), (unsigned int)b_size(&fconn->dbuf), + br_head_idx(fconn->mbuf), br_tail_idx(fconn->mbuf), br_size(fconn->mbuf), + (unsigned int)b_data(hmbuf), b_orig(hmbuf), + (unsigned int)b_head_ofs(hmbuf), (unsigned int)b_size(hmbuf), + (unsigned int)b_data(tmbuf), b_orig(tmbuf), + (unsigned int)b_head_ofs(tmbuf), (unsigned int)b_size(tmbuf)); + + if (fstrm) { + chunk_appendf(msg, " last_fstrm=%p .id=%d .flg=0x%04x .rxbuf=%u@%p+%u/%u .sc=%p", + fstrm, fstrm->id, fstrm->flags, + (unsigned int)b_data(&fstrm->rxbuf), b_orig(&fstrm->rxbuf), + (unsigned int)b_head_ofs(&fstrm->rxbuf), (unsigned int)b_size(&fstrm->rxbuf), + fcgi_strm_sc(fstrm)); + + chunk_appendf(msg, " .sd.flg=0x%08x", se_fl_get(fstrm->sd)); + if (!se_fl_test(fstrm->sd, SE_FL_ORPHAN)) + chunk_appendf(msg, " .sc.flg=0x%08x .sc.app=%p", + fcgi_strm_sc(fstrm)->flags, fcgi_strm_sc(fstrm)->app); + + chunk_appendf(msg, " .subs=%p", fstrm->subs); + if (fstrm->subs) { + chunk_appendf(msg, "(ev=%d tl=%p", fstrm->subs->events, fstrm->subs->tasklet); + chunk_appendf(msg, " tl.calls=%d tl.ctx=%p tl.fct=", + fstrm->subs->tasklet->calls, + fstrm->subs->tasklet->context); + resolve_sym_name(msg, NULL, fstrm->subs->tasklet->process); + chunk_appendf(msg, ")"); + } + } + return 0; +} + +/* Migrate the connection to the current thread. + * Return 0 if successful, non-zero otherwise. + * Expected to be called with the old thread lock held. 
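+ * The replacement task and tasklet are allocated before anything is
+ * modified, so a failed allocation leaves the connection untouched on
+ * its original thread.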
+ */ +static int fcgi_takeover(struct connection *conn, int orig_tid) +{ + struct fcgi_conn *fcgi = conn->ctx; + struct task *task; + struct task *new_task; + struct tasklet *new_tasklet; + + /* Pre-allocate tasks so that we don't have to roll back after the xprt + * has been migrated. + */ + new_task = task_new_here(); + new_tasklet = tasklet_new(); + if (!new_task || !new_tasklet) + goto fail; + + if (fd_takeover(conn->handle.fd, conn) != 0) + goto fail; + + if (conn->xprt->takeover && conn->xprt->takeover(conn, conn->xprt_ctx, orig_tid) != 0) { + /* We failed to takeover the xprt, even if the connection may + * still be valid, flag it as error'd, as we have already + * taken over the fd, and wake the tasklet, so that it will + * destroy it. + */ + conn->flags |= CO_FL_ERROR; + tasklet_wakeup_on(fcgi->wait_event.tasklet, orig_tid); + goto fail; + } + + if (fcgi->wait_event.events) + fcgi->conn->xprt->unsubscribe(fcgi->conn, fcgi->conn->xprt_ctx, + fcgi->wait_event.events, &fcgi->wait_event); + + task = fcgi->task; + if (task) { + /* only assign a task if there was already one, otherwise + * the preallocated new task will be released. + */ + task->context = NULL; + fcgi->task = NULL; + __ha_barrier_store(); + task_kill(task); + + fcgi->task = new_task; + new_task = NULL; + fcgi->task->process = fcgi_timeout_task; + fcgi->task->context = fcgi; + } + + /* To let the tasklet know it should free itself, and do nothing else, + * set its context to NULL; + */ + fcgi->wait_event.tasklet->context = NULL; + tasklet_wakeup_on(fcgi->wait_event.tasklet, orig_tid); + + fcgi->wait_event.tasklet = new_tasklet; + fcgi->wait_event.tasklet->process = fcgi_io_cb; + fcgi->wait_event.tasklet->context = fcgi; + fcgi->conn->xprt->subscribe(fcgi->conn, fcgi->conn->xprt_ctx, + SUB_RETRY_RECV, &fcgi->wait_event); + + if (new_task) + __task_free(new_task); + return 0; + fail: + if (new_task) + __task_free(new_task); + tasklet_free(new_tasklet); + return -1; +} + +/****************************************/ +/* MUX initialization and instantiation */ +/****************************************/ + +/* The mux operations */ +static const struct mux_ops mux_fcgi_ops = { + .init = fcgi_init, + .wake = fcgi_wake, + .attach = fcgi_attach, + .get_first_sc = fcgi_get_first_sc, + .detach = fcgi_detach, + .destroy = fcgi_destroy, + .avail_streams = fcgi_avail_streams, + .used_streams = fcgi_used_streams, + .rcv_buf = fcgi_rcv_buf, + .snd_buf = fcgi_snd_buf, + .subscribe = fcgi_subscribe, + .unsubscribe = fcgi_unsubscribe, + .shutr = fcgi_shutr, + .shutw = fcgi_shutw, + .ctl = fcgi_ctl, + .sctl = fcgi_sctl, + .show_fd = fcgi_show_fd, + .takeover = fcgi_takeover, + .flags = MX_FL_HTX|MX_FL_HOL_RISK|MX_FL_NO_UPG, + .name = "FCGI", +}; + + +/* this mux registers FCGI proto */ +static struct mux_proto_list mux_proto_fcgi = +{ .token = IST("fcgi"), .mode = PROTO_MODE_HTTP, .side = PROTO_SIDE_BE, .mux = &mux_fcgi_ops }; + +INITCALL1(STG_REGISTER, register_mux_proto, &mux_proto_fcgi); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/mux_h1.c b/src/mux_h1.c new file mode 100644 index 0000000..455ebeb --- /dev/null +++ b/src/mux_h1.c @@ -0,0 +1,5374 @@ +/* + * HTTP/1 mux-demux for connections + * + * Copyright 2018 Christopher Faulet <cfaulet@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your 
option) any later version. + * + */ +#include <import/ebistree.h> +#include <import/ebmbtree.h> + +#include <haproxy/api.h> +#include <haproxy/cfgparse.h> +#include <haproxy/connection.h> +#include <haproxy/dynbuf.h> +#include <haproxy/h1.h> +#include <haproxy/h1_htx.h> +#include <haproxy/h2.h> +#include <haproxy/http_htx.h> +#include <haproxy/htx.h> +#include <haproxy/istbuf.h> +#include <haproxy/log.h> +#include <haproxy/mux_h1-t.h> +#include <haproxy/pipe.h> +#include <haproxy/proxy.h> +#include <haproxy/session-t.h> +#include <haproxy/stats.h> +#include <haproxy/stconn.h> +#include <haproxy/stream.h> +#include <haproxy/trace.h> +#include <haproxy/xref.h> + +/* H1 connection descriptor */ +struct h1c { + struct connection *conn; + struct h1s *h1s; /* H1 stream descriptor */ + struct task *task; /* timeout management task */ + + uint32_t flags; /* Connection flags: H1C_F_* */ + enum h1_cs state; /* Connection state */ + + + struct buffer ibuf; /* Input buffer to store data before parsing */ + struct buffer obuf; /* Output buffer to store data after reformatting */ + struct proxy *px; + + unsigned int errcode; /* Status code when an error occurred at the H1 connection level */ + + int idle_exp; /* idle expiration date (http-keep-alive or http-request timeout) */ + int timeout; /* client/server timeout duration */ + int shut_timeout; /* client-fin/server-fin timeout duration */ + + unsigned int req_count; /* The number of requests handled by this H1 connection */ + + struct h1_counters *px_counters; /* h1 counters attached to proxy */ + struct buffer_wait buf_wait; /* Wait list for buffer allocation */ + struct wait_event wait_event; /* To be used if we're waiting for I/Os */ +}; + +/* H1 stream descriptor */ +struct h1s { + struct h1c *h1c; + struct sedesc *sd; + uint32_t flags; /* Connection flags: H1S_F_* */ + + struct wait_event *subs; /* Address of the wait_event the stream connector associated is waiting on */ + + struct session *sess; /* Associated session */ + struct buffer rxbuf; /* receive buffer, always valid (buf_empty or real buffer) */ + struct h1m req; + struct h1m res; + + enum http_meth_t meth; /* HTTP request method */ + uint16_t status; /* HTTP response status */ + + char ws_key[25]; /* websocket handshake key */ +}; + +/* Map of headers used to convert outgoing headers */ +struct h1_hdrs_map { + char *name; + struct eb_root map; +}; + +/* An entry in a headers map */ +struct h1_hdr_entry { + struct ist name; + struct ebpt_node node; +}; + +/* Declare the headers map */ +static struct h1_hdrs_map hdrs_map = { .name = NULL, .map = EB_ROOT }; +static int accept_payload_with_any_method = 0; + +/* trace source and events */ +static void h1_trace(enum trace_level level, uint64_t mask, + const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4); + +/* The event representation is split like this : + * h1c - internal H1 connection + * h1s - internal H1 stream + * strm - application layer + * rx - data receipt + * tx - data transmission + * + */ +static const struct trace_event h1_trace_events[] = { +#define H1_EV_H1C_NEW (1ULL << 0) + { .mask = H1_EV_H1C_NEW, .name = "h1c_new", .desc = "new H1 connection" }, +#define H1_EV_H1C_RECV (1ULL << 1) + { .mask = H1_EV_H1C_RECV, .name = "h1c_recv", .desc = "Rx on H1 connection" }, +#define H1_EV_H1C_SEND (1ULL << 2) + { .mask = H1_EV_H1C_SEND, .name = "h1c_send", .desc = "Tx on H1 connection" }, +#define H1_EV_H1C_BLK (1ULL << 3) + { .mask = 
H1_EV_H1C_BLK, .name = "h1c_blk", .desc = "H1 connection blocked" }, +#define H1_EV_H1C_WAKE (1ULL << 4) + { .mask = H1_EV_H1C_WAKE, .name = "h1c_wake", .desc = "H1 connection woken up" }, +#define H1_EV_H1C_END (1ULL << 5) + { .mask = H1_EV_H1C_END, .name = "h1c_end", .desc = "H1 connection terminated" }, +#define H1_EV_H1C_ERR (1ULL << 6) + { .mask = H1_EV_H1C_ERR, .name = "h1c_err", .desc = "error on H1 connection" }, + +#define H1_EV_RX_DATA (1ULL << 7) + { .mask = H1_EV_RX_DATA, .name = "rx_data", .desc = "receipt of any H1 data" }, +#define H1_EV_RX_EOI (1ULL << 8) + { .mask = H1_EV_RX_EOI, .name = "rx_eoi", .desc = "receipt of end of H1 input" }, +#define H1_EV_RX_HDRS (1ULL << 9) + { .mask = H1_EV_RX_HDRS, .name = "rx_headers", .desc = "receipt of H1 headers" }, +#define H1_EV_RX_BODY (1ULL << 10) + { .mask = H1_EV_RX_BODY, .name = "rx_body", .desc = "receipt of H1 body" }, +#define H1_EV_RX_TLRS (1ULL << 11) + { .mask = H1_EV_RX_TLRS, .name = "rx_trailers", .desc = "receipt of H1 trailers" }, + +#define H1_EV_TX_DATA (1ULL << 12) + { .mask = H1_EV_TX_DATA, .name = "tx_data", .desc = "transmission of any H1 data" }, +#define H1_EV_TX_EOI (1ULL << 13) + { .mask = H1_EV_TX_EOI, .name = "tx_eoi", .desc = "transmission of end of H1 input" }, +#define H1_EV_TX_HDRS (1ULL << 14) + { .mask = H1_EV_TX_HDRS, .name = "tx_headers", .desc = "transmission of all headers" }, +#define H1_EV_TX_BODY (1ULL << 15) + { .mask = H1_EV_TX_BODY, .name = "tx_body", .desc = "transmission of H1 body" }, +#define H1_EV_TX_TLRS (1ULL << 16) + { .mask = H1_EV_TX_TLRS, .name = "tx_trailers", .desc = "transmission of H1 trailers" }, + +#define H1_EV_H1S_NEW (1ULL << 17) + { .mask = H1_EV_H1S_NEW, .name = "h1s_new", .desc = "new H1 stream" }, +#define H1_EV_H1S_BLK (1ULL << 18) + { .mask = H1_EV_H1S_BLK, .name = "h1s_blk", .desc = "H1 stream blocked" }, +#define H1_EV_H1S_END (1ULL << 19) + { .mask = H1_EV_H1S_END, .name = "h1s_end", .desc = "H1 stream terminated" }, +#define H1_EV_H1S_ERR (1ULL << 20) + { .mask = H1_EV_H1S_ERR, .name = "h1s_err", .desc = "error on H1 stream" }, + +#define H1_EV_STRM_NEW (1ULL << 21) + { .mask = H1_EV_STRM_NEW, .name = "strm_new", .desc = "app-layer stream creation" }, +#define H1_EV_STRM_RECV (1ULL << 22) + { .mask = H1_EV_STRM_RECV, .name = "strm_recv", .desc = "receiving data for stream" }, +#define H1_EV_STRM_SEND (1ULL << 23) + { .mask = H1_EV_STRM_SEND, .name = "strm_send", .desc = "sending data for stream" }, +#define H1_EV_STRM_WAKE (1ULL << 24) + { .mask = H1_EV_STRM_WAKE, .name = "strm_wake", .desc = "stream woken up" }, +#define H1_EV_STRM_SHUT (1ULL << 25) + { .mask = H1_EV_STRM_SHUT, .name = "strm_shut", .desc = "stream shutdown" }, +#define H1_EV_STRM_END (1ULL << 26) + { .mask = H1_EV_STRM_END, .name = "strm_end", .desc = "detaching app-layer stream" }, +#define H1_EV_STRM_ERR (1ULL << 27) + { .mask = H1_EV_STRM_ERR, .name = "strm_err", .desc = "stream error" }, + + { } +}; + +static const struct name_desc h1_trace_lockon_args[4] = { + /* arg1 */ { /* already used by the connection */ }, + /* arg2 */ { .name="h1s", .desc="H1 stream" }, + /* arg3 */ { }, + /* arg4 */ { } +}; + +static const struct name_desc h1_trace_decoding[] = { +#define H1_VERB_CLEAN 1 + { .name="clean", .desc="only user-friendly stuff, generally suitable for level \"user\"" }, +#define H1_VERB_MINIMAL 2 + { .name="minimal", .desc="report only h1c/h1s state and flags, no real decoding" }, +#define H1_VERB_SIMPLE 3 + { .name="simple", .desc="add request/response status line or htx info when 
available" }, +#define H1_VERB_ADVANCED 4 + { .name="advanced", .desc="add header fields or frame decoding when available" }, +#define H1_VERB_COMPLETE 5 + { .name="complete", .desc="add full data dump when available" }, + { /* end */ } +}; + +static struct trace_source trace_h1 __read_mostly = { + .name = IST("h1"), + .desc = "HTTP/1 multiplexer", + .arg_def = TRC_ARG1_CONN, // TRACE()'s first argument is always a connection + .default_cb = h1_trace, + .known_events = h1_trace_events, + .lockon_args = h1_trace_lockon_args, + .decoding = h1_trace_decoding, + .report_events = ~0, // report everything by default +}; + +#define TRACE_SOURCE &trace_h1 +INITCALL1(STG_REGISTER, trace_register_source, TRACE_SOURCE); + + +/* h1 stats module */ +enum { + H1_ST_OPEN_CONN, + H1_ST_OPEN_STREAM, + H1_ST_TOTAL_CONN, + H1_ST_TOTAL_STREAM, + + H1_ST_BYTES_IN, + H1_ST_BYTES_OUT, +#if defined(USE_LINUX_SPLICE) + H1_ST_SPLICED_BYTES_IN, + H1_ST_SPLICED_BYTES_OUT, +#endif + H1_STATS_COUNT /* must be the last member of the enum */ +}; + + +static struct name_desc h1_stats[] = { + [H1_ST_OPEN_CONN] = { .name = "h1_open_connections", + .desc = "Count of currently open connections" }, + [H1_ST_OPEN_STREAM] = { .name = "h1_open_streams", + .desc = "Count of currently open streams" }, + [H1_ST_TOTAL_CONN] = { .name = "h1_total_connections", + .desc = "Total number of connections" }, + [H1_ST_TOTAL_STREAM] = { .name = "h1_total_streams", + .desc = "Total number of streams" }, + + [H1_ST_BYTES_IN] = { .name = "h1_bytes_in", + .desc = "Total number of bytes received" }, + [H1_ST_BYTES_OUT] = { .name = "h1_bytes_out", + .desc = "Total number of bytes send" }, +#if defined(USE_LINUX_SPLICE) + [H1_ST_SPLICED_BYTES_IN] = { .name = "h1_spliced_bytes_in", + .desc = "Total number of bytes received using kernel splicing" }, + [H1_ST_SPLICED_BYTES_OUT] = { .name = "h1_spliced_bytes_out", + .desc = "Total number of bytes sendusing kernel splicing" }, +#endif + +}; + +static struct h1_counters { + long long open_conns; /* count of currently open connections */ + long long open_streams; /* count of currently open streams */ + long long total_conns; /* total number of connections */ + long long total_streams; /* total number of streams */ + + long long bytes_in; /* number of bytes received */ + long long bytes_out; /* number of bytes sent */ +#if defined(USE_LINUX_SPLICE) + long long spliced_bytes_in; /* number of bytes received using kernel splicing */ + long long spliced_bytes_out; /* number of bytes sent using kernel splicing */ +#endif +} h1_counters; + +static void h1_fill_stats(void *data, struct field *stats) +{ + struct h1_counters *counters = data; + + stats[H1_ST_OPEN_CONN] = mkf_u64(FN_GAUGE, counters->open_conns); + stats[H1_ST_OPEN_STREAM] = mkf_u64(FN_GAUGE, counters->open_streams); + stats[H1_ST_TOTAL_CONN] = mkf_u64(FN_COUNTER, counters->total_conns); + stats[H1_ST_TOTAL_STREAM] = mkf_u64(FN_COUNTER, counters->total_streams); + + stats[H1_ST_BYTES_IN] = mkf_u64(FN_COUNTER, counters->bytes_in); + stats[H1_ST_BYTES_OUT] = mkf_u64(FN_COUNTER, counters->bytes_out); +#if defined(USE_LINUX_SPLICE) + stats[H1_ST_SPLICED_BYTES_IN] = mkf_u64(FN_COUNTER, counters->spliced_bytes_in); + stats[H1_ST_SPLICED_BYTES_OUT] = mkf_u64(FN_COUNTER, counters->spliced_bytes_out); +#endif +} + +static struct stats_module h1_stats_module = { + .name = "h1", + .fill_stats = h1_fill_stats, + .stats = h1_stats, + .stats_count = H1_STATS_COUNT, + .counters = &h1_counters, + .counters_size = sizeof(h1_counters), + .domain_flags = 
MK_STATS_PROXY_DOMAIN(STATS_PX_CAP_FE|STATS_PX_CAP_BE), + .clearable = 1, +}; + +INITCALL1(STG_REGISTER, stats_register_module, &h1_stats_module); + + +/* the h1c and h1s pools */ +DECLARE_STATIC_POOL(pool_head_h1c, "h1c", sizeof(struct h1c)); +DECLARE_STATIC_POOL(pool_head_h1s, "h1s", sizeof(struct h1s)); + +static int h1_recv(struct h1c *h1c); +static int h1_send(struct h1c *h1c); +static int h1_process(struct h1c *h1c); +/* h1_io_cb is exported to see it resolved in "show fd" */ +struct task *h1_io_cb(struct task *t, void *ctx, unsigned int state); +struct task *h1_timeout_task(struct task *t, void *context, unsigned int state); +static void h1_shutw_conn(struct connection *conn); +static void h1_wake_stream_for_recv(struct h1s *h1s); +static void h1_wake_stream_for_send(struct h1s *h1s); +static void h1s_destroy(struct h1s *h1s); + +/* returns the stconn associated to the H1 stream */ +static forceinline struct stconn *h1s_sc(const struct h1s *h1s) +{ + return h1s->sd->sc; +} + +/* the H1 traces always expect that arg1, if non-null, is of type connection + * (from which we can derive h1c), that arg2, if non-null, is of type h1s, and + * that arg3, if non-null, is a htx for rx/tx headers. + */ +static void h1_trace(enum trace_level level, uint64_t mask, const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4) +{ + const struct connection *conn = a1; + const struct h1c *h1c = conn ? conn->ctx : NULL; + const struct h1s *h1s = a2; + const struct htx *htx = a3; + const size_t *val = a4; + + if (!h1c) + h1c = (h1s ? h1s->h1c : NULL); + + if (!h1c || src->verbosity < H1_VERB_CLEAN) + return; + + /* Display frontend/backend info by default */ + chunk_appendf(&trace_buf, " : [%c,%s]", ((h1c->flags & H1C_F_IS_BACK) ? 
'B' : 'F'), h1c_st_to_str(h1c->state)); + + /* Display request and response states if h1s is defined */ + if (h1s) { + chunk_appendf(&trace_buf, " [%s, %s]", + h1m_state_str(h1s->req.state), h1m_state_str(h1s->res.state)); + + if (src->verbosity > H1_VERB_SIMPLE) { + chunk_appendf(&trace_buf, " - req=(.fl=0x%08x .curr_len=%lu .body_len=%lu)", + h1s->req.flags, (unsigned long)h1s->req.curr_len, (unsigned long)h1s->req.body_len); + chunk_appendf(&trace_buf, " res=(.fl=0x%08x .curr_len=%lu .body_len=%lu)", + h1s->res.flags, (unsigned long)h1s->res.curr_len, (unsigned long)h1s->res.body_len); + } + + } + + if (src->verbosity == H1_VERB_CLEAN) + return; + + /* Display the value to the 4th argument (level > STATE) */ + if (src->level > TRACE_LEVEL_STATE && val) + chunk_appendf(&trace_buf, " - VAL=%lu", (long)*val); + + /* Display status-line if possible (verbosity > MINIMAL) */ + if (src->verbosity > H1_VERB_MINIMAL && htx && htx_nbblks(htx)) { + const struct htx_blk *blk = htx_get_head_blk(htx); + const struct htx_sl *sl = htx_get_blk_ptr(htx, blk); + enum htx_blk_type type = htx_get_blk_type(blk); + + if (type == HTX_BLK_REQ_SL || type == HTX_BLK_RES_SL) + chunk_appendf(&trace_buf, " - \"%.*s %.*s %.*s\"", + HTX_SL_P1_LEN(sl), HTX_SL_P1_PTR(sl), + HTX_SL_P2_LEN(sl), HTX_SL_P2_PTR(sl), + HTX_SL_P3_LEN(sl), HTX_SL_P3_PTR(sl)); + } + + /* Display h1c info and, if defined, h1s info (pointer + flags) */ + chunk_appendf(&trace_buf, " - h1c=%p(0x%08x)", h1c, h1c->flags); + if (h1c->conn) + chunk_appendf(&trace_buf, " conn=%p(0x%08x)", h1c->conn, h1c->conn->flags); + if (h1s) { + chunk_appendf(&trace_buf, " h1s=%p(0x%08x)", h1s, h1s->flags); + if (h1s->sd) + chunk_appendf(&trace_buf, " sd=%p(0x%08x)", h1s->sd, se_fl_get(h1s->sd)); + if (h1s->sd && h1s_sc(h1s)) + chunk_appendf(&trace_buf, " sc=%p(0x%08x)", h1s_sc(h1s), h1s_sc(h1s)->flags); + } + + if (src->verbosity == H1_VERB_MINIMAL) + return; + + /* Display input and output buffer info (level > USER & verbosity > SIMPLE) */ + if (src->level > TRACE_LEVEL_USER) { + if (src->verbosity == H1_VERB_COMPLETE || + (src->verbosity == H1_VERB_ADVANCED && (mask & (H1_EV_H1C_RECV|H1_EV_STRM_RECV)))) + chunk_appendf(&trace_buf, " ibuf=%u@%p+%u/%u", + (unsigned int)b_data(&h1c->ibuf), b_orig(&h1c->ibuf), + (unsigned int)b_head_ofs(&h1c->ibuf), (unsigned int)b_size(&h1c->ibuf)); + if (src->verbosity == H1_VERB_COMPLETE || + (src->verbosity == H1_VERB_ADVANCED && (mask & (H1_EV_H1C_SEND|H1_EV_STRM_SEND)))) + chunk_appendf(&trace_buf, " obuf=%u@%p+%u/%u", + (unsigned int)b_data(&h1c->obuf), b_orig(&h1c->obuf), + (unsigned int)b_head_ofs(&h1c->obuf), (unsigned int)b_size(&h1c->obuf)); + } + + /* Display htx info if defined (level > USER) */ + if (src->level > TRACE_LEVEL_USER && htx) { + int full = 0; + + /* Full htx info (level > STATE && verbosity > SIMPLE) */ + if (src->level > TRACE_LEVEL_STATE) { + if (src->verbosity == H1_VERB_COMPLETE) + full = 1; + else if (src->verbosity == H1_VERB_ADVANCED && (mask & (H1_EV_RX_HDRS|H1_EV_TX_HDRS))) + full = 1; + } + + chunk_memcat(&trace_buf, "\n\t", 2); + htx_dump(&trace_buf, htx, full); + } +} + + +/*****************************************************/ +/* functions below are for dynamic buffer management */ +/*****************************************************/ +/* + * Indicates whether or not we may receive data. 
The rules are the following: + * - if an error or a shutdown for reads was detected on the H1 connection, we + * must not attempt to receive + * - if we are waiting for the connection establishment, we must not attempt + * to receive + * - if reads are explicitly disabled, we must not attempt to receive + * - if the input buffer failed to be allocated or is full, we must not try + * to receive + * - if the mux is blocked on an input condition, we must not attempt to + * receive + * - otherwise we may attempt to receive + */ +static inline int h1_recv_allowed(const struct h1c *h1c) +{ + if (h1c->flags & (H1C_F_EOS|H1C_F_ERROR)) { + TRACE_DEVEL("recv not allowed because of (eos|error) on h1c", H1_EV_H1C_RECV|H1_EV_H1C_BLK, h1c->conn); + return 0; + } + + if (h1c->conn->flags & (CO_FL_WAIT_L4_CONN|CO_FL_WAIT_L6_CONN)) { + TRACE_DEVEL("recv not allowed because of (waitl4|waitl6) on connection", H1_EV_H1C_RECV|H1_EV_H1C_BLK, h1c->conn); + return 0; + } + + if ((h1c->flags & (H1C_F_IN_ALLOC|H1C_F_IN_FULL|H1C_F_IN_SALLOC))) { + TRACE_DEVEL("recv not allowed because input is blocked", H1_EV_H1C_RECV|H1_EV_H1C_BLK, h1c->conn); + return 0; + } + + return 1; +} + +/* + * Tries to grab a buffer and to re-enable processing on mux <target>. The h1 + * flags are used to figure what buffer was requested. It returns 1 if the + * allocation succeeds, in which case the connection is woken up, or 0 if it's + * impossible to wake up and we prefer to be woken up later. + */ +static int h1_buf_available(void *target) +{ + struct h1c *h1c = target; + + if ((h1c->flags & H1C_F_IN_ALLOC) && b_alloc(&h1c->ibuf)) { + TRACE_STATE("unblocking h1c, ibuf allocated", H1_EV_H1C_RECV|H1_EV_H1C_BLK|H1_EV_H1C_WAKE, h1c->conn); + h1c->flags &= ~H1C_F_IN_ALLOC; + if (h1_recv_allowed(h1c)) + tasklet_wakeup(h1c->wait_event.tasklet); + return 1; + } + + if ((h1c->flags & H1C_F_OUT_ALLOC) && b_alloc(&h1c->obuf)) { + TRACE_STATE("unblocking h1s, obuf allocated", H1_EV_TX_DATA|H1_EV_H1S_BLK|H1_EV_STRM_WAKE, h1c->conn, h1c->h1s); + h1c->flags &= ~H1C_F_OUT_ALLOC; + if (h1c->h1s) + h1_wake_stream_for_send(h1c->h1s); + return 1; + } + + if ((h1c->flags & H1C_F_IN_SALLOC) && h1c->h1s && b_alloc(&h1c->h1s->rxbuf)) { + TRACE_STATE("unblocking h1c, stream rxbuf allocated", H1_EV_H1C_RECV|H1_EV_H1C_BLK|H1_EV_H1C_WAKE, h1c->conn); + h1c->flags &= ~H1C_F_IN_SALLOC; + tasklet_wakeup(h1c->wait_event.tasklet); + return 1; + } + + return 0; +} + +/* + * Allocate a buffer. If it fails, it adds the mux to the buffer wait queue. + */ +static inline struct buffer *h1_get_buf(struct h1c *h1c, struct buffer *bptr) +{ + struct buffer *buf = NULL; + + if (likely(!LIST_INLIST(&h1c->buf_wait.list)) && + unlikely((buf = b_alloc(bptr)) == NULL)) { + h1c->buf_wait.target = h1c; + h1c->buf_wait.wakeup_cb = h1_buf_available; + LIST_APPEND(&th_ctx->buffer_wq, &h1c->buf_wait.list); + } + return buf; +} + +/* + * Release a buffer, if any, and try to wake up entities waiting in the buffer + * wait queue. + */ +static inline void h1_release_buf(struct h1c *h1c, struct buffer *bptr) +{ + if (bptr->size) { + b_free(bptr); + offer_buffers(h1c->buf_wait.target, 1); + } +} + +/* Returns 1 if the H1 connection is alive (IDLE, EMBRYONIC, UPGRADING or + * RUNNING). Otherwise 0 is returned. + */ +static inline int h1_is_alive(const struct h1c *h1c) +{ + return (h1c->state <= H1_CS_RUNNING); +} + +/* Switch the H1 connection to CLOSING or CLOSED mode, depending on the output + * buffer state and if there is still a H1 stream or not. If there are still + * pending outgoing data or if there is still a H1 stream, it is set to CLOSING + * state. Otherwise it is set to CLOSED mode. */ +static inline void h1_close(struct h1c *h1c) +{ + h1c->state = ((h1c->h1s || b_data(&h1c->obuf)) ? H1_CS_CLOSING : H1_CS_CLOSED); +} + +/* returns the number of streams in use on a connection to figure if it's idle + * or not. We rely on H1C state to know if the connection is in-use or not. It + * is IDLE only when no H1 stream is attached and when the previous stream, if + * any, was fully terminated without any error and in K/A mode. + */ +static int h1_used_streams(struct connection *conn) +{ + struct h1c *h1c = conn->ctx; + + return ((h1c->state == H1_CS_IDLE) ? 0 : 1); +} + +/* returns the number of streams still available on a connection */ +static int h1_avail_streams(struct connection *conn) +{ + return 1 - h1_used_streams(conn); +} + +/* Refresh the h1c task timeout if necessary */ +static void h1_refresh_timeout(struct h1c *h1c) +{ + int is_idle_conn = 0; + + if (h1c->task) { + if (!h1_is_alive(h1c)) { + /* half-closed or dead connections: switch to clientfin/serverfin + * timeouts so that we don't hang too long on clients that have + * gone away (especially in tunnel mode). + */ + h1c->task->expire = tick_add(now_ms, h1c->shut_timeout); + TRACE_DEVEL("refreshing connection's timeout (dead or half-closed)", H1_EV_H1C_SEND|H1_EV_H1C_RECV, h1c->conn); + is_idle_conn = 1; + } + else if (b_data(&h1c->obuf)) { + /* alive connection with pending outgoing data, need a timeout (server or client). */ + h1c->task->expire = tick_add(now_ms, h1c->timeout); + TRACE_DEVEL("refreshing connection's timeout (pending outgoing data)", H1_EV_H1C_SEND|H1_EV_H1C_RECV, h1c->conn); + } + else if (!(h1c->flags & H1C_F_IS_BACK) && (h1c->state == H1_CS_IDLE)) { + /* idle front connections. */ + h1c->task->expire = (tick_isset(h1c->idle_exp) ? h1c->idle_exp : tick_add(now_ms, h1c->timeout)); + TRACE_DEVEL("refreshing connection's timeout (idle front h1c)", H1_EV_H1C_SEND|H1_EV_H1C_RECV, h1c->conn); + is_idle_conn = 1; + } + else if (!(h1c->flags & H1C_F_IS_BACK) && (h1c->state != H1_CS_RUNNING)) { + /* alive front connections waiting for a fully usable stream need a timeout. */ + h1c->task->expire = tick_add(now_ms, h1c->timeout); + TRACE_DEVEL("refreshing connection's timeout (alive front h1c but not ready)", H1_EV_H1C_SEND|H1_EV_H1C_RECV, h1c->conn); + /* A frontend connection not yet ready could be treated the same way as an idle + * one in case of soft-close. + */ + is_idle_conn = 1; + } + else { + /* alive back connections of front connections with a stream connector attached */ + h1c->task->expire = TICK_ETERNITY; + TRACE_DEVEL("no connection timeout (alive back h1c or front h1c with an SC)", H1_EV_H1C_SEND|H1_EV_H1C_RECV, h1c->conn); + } + + /* Finally set the idle expiration date if shorter */ + h1c->task->expire = tick_first(h1c->task->expire, h1c->idle_exp); + + if ((h1c->px->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) && + is_idle_conn && tick_isset(global.close_spread_end)) { + /* If a soft-stop is in progress and a close-spread-time + * is set, we want to spread idle connection closing roughly + * evenly across the defined window. This should only + * act on idle frontend connections. + * If the window end is already in the past, we wake the + * timeout task up immediately so that it can be closed. 
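+ * As an example, with close-spread-time 10s and 4s left in the window,
+ * an idle connection gets an expire drawn by statistical_prng_range()
+ * from the remaining 4s, rather than all such connections closing at
+ * the same instant.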
+ */ + int remaining_window = tick_remain(now_ms, global.close_spread_end); + if (remaining_window) { + /* We don't need to reset the expire if it would + * already happen before the close window end. + */ + if (tick_is_le(global.close_spread_end, h1c->task->expire)) { + /* Set an expire value shorter than the current value + * because the close spread window end comes earlier. + */ + h1c->task->expire = tick_add(now_ms, statistical_prng_range(remaining_window)); + TRACE_DEVEL("connection timeout set to value before close-spread window end", H1_EV_H1C_SEND|H1_EV_H1C_RECV, h1c->conn); + } + } + else { + /* We are past the soft close window end, wake the timeout + * task up immediately. + */ + task_wakeup(h1c->task, TASK_WOKEN_TIMER); + } + } + TRACE_DEVEL("new expiration date", H1_EV_H1C_SEND|H1_EV_H1C_RECV, h1c->conn, 0, 0, (size_t[]){h1c->task->expire}); + task_queue(h1c->task); + } +} + +static void h1_set_idle_expiration(struct h1c *h1c) +{ + if (h1c->flags & H1C_F_IS_BACK || !h1c->task) { + TRACE_DEVEL("no idle expiration (backend connection || no task)", H1_EV_H1C_RECV, h1c->conn); + h1c->idle_exp = TICK_ETERNITY; + return; + } + if (h1c->state == H1_CS_IDLE) { + if (!tick_isset(h1c->idle_exp)) { + if ((h1c->flags & H1C_F_WAIT_NEXT_REQ) && /* Not the first request */ + !b_data(&h1c->ibuf) && /* No input data */ + tick_isset(h1c->px->timeout.httpka)) { /* K-A timeout set */ + h1c->idle_exp = tick_add_ifset(now_ms, h1c->px->timeout.httpka); + TRACE_DEVEL("set idle expiration (keep-alive timeout)", H1_EV_H1C_RECV, h1c->conn); + } + else { + h1c->idle_exp = tick_add_ifset(now_ms, h1c->px->timeout.httpreq); + TRACE_DEVEL("set idle expiration (http-request timeout)", H1_EV_H1C_RECV, h1c->conn); + } + } + } + else if (h1c->state < H1_CS_RUNNING) { + if (!tick_isset(h1c->idle_exp)) { + h1c->idle_exp = tick_add_ifset(now_ms, h1c->px->timeout.httpreq); + TRACE_DEVEL("set idle expiration (http-request timeout)", H1_EV_H1C_RECV, h1c->conn); + } + } + else { + h1c->idle_exp = TICK_ETERNITY; + TRACE_DEVEL("unset idle expiration (running or closing)", H1_EV_H1C_RECV, h1c->conn); + } +} +/*****************************************************************/ +/* functions below are dedicated to the mux setup and management */ +/*****************************************************************/ + +/* returns non-zero if there are input data pending for stream h1s. */ +static inline size_t h1s_data_pending(const struct h1s *h1s) +{ + const struct h1m *h1m; + + h1m = ((h1s->h1c->flags & H1C_F_IS_BACK) ? &h1s->res : &h1s->req); + return ((h1m->state == H1_MSG_DONE) ? 0 : b_data(&h1s->h1c->ibuf)); +} + +/* Creates a new stream connector and the associated stream. <input> is used as input + * buffer for the stream. On success, it is transferred to the stream and the + * mux is no longer responsible for it. On error, <input> is unchanged, thus the + * mux must still take care of it. However, there is nothing special to do + * because, on success, <input> is updated to point on BUF_NULL. Thus, calling + * b_free() on it is always safe. This function returns the stream connector on + * success or NULL on error. 
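+ * On success the connection also switches to the H1_CS_RUNNING state. In
+ * practice a caller may thus do (illustration only):
+ *
+ *   if (!h1s_new_sc(h1s, &buf))
+ *       goto err;
+ *   b_free(&buf);   // safe: <buf> now points on BUF_NULL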
*/ +static struct stconn *h1s_new_sc(struct h1s *h1s, struct buffer *input) +{ + struct h1c *h1c = h1s->h1c; + + TRACE_ENTER(H1_EV_STRM_NEW, h1c->conn, h1s); + + if (h1s->flags & H1S_F_NOT_FIRST) + se_fl_set(h1s->sd, SE_FL_NOT_FIRST); + if (h1s->req.flags & H1_MF_UPG_WEBSOCKET) + se_fl_set(h1s->sd, SE_FL_WEBSOCKET); + + if (!sc_new_from_endp(h1s->sd, h1c->conn->owner, input)) { + TRACE_ERROR("SC allocation failure", H1_EV_STRM_NEW|H1_EV_STRM_END|H1_EV_STRM_ERR, h1c->conn, h1s); + goto err; + } + + h1c->state = H1_CS_RUNNING; + TRACE_LEAVE(H1_EV_STRM_NEW, h1c->conn, h1s); + return h1s_sc(h1s); + + err: + TRACE_DEVEL("leaving on error", H1_EV_STRM_NEW|H1_EV_STRM_ERR, h1c->conn, h1s); + return NULL; +} + +static struct stconn *h1s_upgrade_sc(struct h1s *h1s, struct buffer *input) +{ + TRACE_ENTER(H1_EV_STRM_NEW, h1s->h1c->conn, h1s); + + if (stream_upgrade_from_sc(h1s_sc(h1s), input) < 0) { + TRACE_ERROR("stream upgrade failure", H1_EV_STRM_NEW|H1_EV_STRM_END|H1_EV_STRM_ERR, h1s->h1c->conn, h1s); + goto err; + } + + h1s->h1c->state = H1_CS_RUNNING; + TRACE_LEAVE(H1_EV_STRM_NEW, h1s->h1c->conn, h1s); + return h1s_sc(h1s); + + err: + TRACE_DEVEL("leaving on error", H1_EV_STRM_NEW|H1_EV_STRM_ERR, h1s->h1c->conn, h1s); + return NULL; +} + +static struct h1s *h1s_new(struct h1c *h1c) +{ + struct h1s *h1s; + + TRACE_ENTER(H1_EV_H1S_NEW, h1c->conn); + + h1s = pool_alloc(pool_head_h1s); + if (!h1s) { + TRACE_ERROR("H1S allocation failure", H1_EV_H1S_NEW|H1_EV_H1S_END|H1_EV_H1S_ERR, h1c->conn); + goto fail; + } + h1s->h1c = h1c; + h1c->h1s = h1s; + h1s->sess = NULL; + h1s->sd = NULL; + h1s->flags = H1S_F_WANT_KAL; + h1s->subs = NULL; + h1s->rxbuf = BUF_NULL; + memset(h1s->ws_key, 0, sizeof(h1s->ws_key)); + + h1m_init_req(&h1s->req); + h1s->req.flags |= (H1_MF_NO_PHDR|H1_MF_CLEAN_CONN_HDR); + + h1m_init_res(&h1s->res); + h1s->res.flags |= (H1_MF_NO_PHDR|H1_MF_CLEAN_CONN_HDR); + + h1s->status = 0; + h1s->meth = HTTP_METH_OTHER; + + if (h1c->flags & H1C_F_WAIT_NEXT_REQ) + h1s->flags |= H1S_F_NOT_FIRST; + h1s->h1c->state = H1_CS_EMBRYONIC; + h1s->h1c->flags &= ~H1C_F_WAIT_NEXT_REQ; + TRACE_LEAVE(H1_EV_H1S_NEW, h1c->conn, h1s); + return h1s; + + fail: + TRACE_DEVEL("leaving on error", H1_EV_STRM_NEW|H1_EV_STRM_ERR, h1c->conn); + return NULL; +} + +static struct h1s *h1c_frt_stream_new(struct h1c *h1c, struct stconn *sc, struct session *sess) +{ + struct h1s *h1s; + + TRACE_ENTER(H1_EV_H1S_NEW, h1c->conn); + + h1s = h1s_new(h1c); + if (!h1s) + goto fail; + + if (sc) { + if (sc_attach_mux(sc, h1s, h1c->conn) < 0) + goto fail; + h1s->sd = sc->sedesc; + } + else { + h1s->sd = sedesc_new(); + if (!h1s->sd) + goto fail; + h1s->sd->se = h1s; + h1s->sd->conn = h1c->conn; + se_fl_set(h1s->sd, SE_FL_T_MUX | SE_FL_ORPHAN); + } + /* When a request starts, the H1S does not expect data while the request + * is not finished. It does not mean the response must not be received, + * especially if headers were already forwarded. But it is not + * mandatory. 
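+ * This expectation is recorded on the stream endpoint descriptor by the
+ * se_expect_no_data() call below.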
+ */ + if (!(global.tune.no_zero_copy_fwd & NO_ZERO_COPY_FWD_H1_SND)) + se_fl_set(h1s->sd, SE_FL_MAY_FASTFWD_CONS); + se_expect_no_data(h1s->sd); + h1s->sess = sess; + + if (h1c->px->options2 & PR_O2_REQBUG_OK) + h1s->req.err_pos = -1; + + HA_ATOMIC_INC(&h1c->px_counters->open_streams); + HA_ATOMIC_INC(&h1c->px_counters->total_streams); + + h1c->idle_exp = TICK_ETERNITY; + h1_set_idle_expiration(h1c); + TRACE_LEAVE(H1_EV_H1S_NEW, h1c->conn, h1s); + return h1s; + + fail: + TRACE_DEVEL("leaving on error", H1_EV_STRM_NEW|H1_EV_STRM_ERR, h1c->conn); + h1s_destroy(h1s); + return NULL; +} + +static struct h1s *h1c_bck_stream_new(struct h1c *h1c, struct stconn *sc, struct session *sess) +{ + struct h1s *h1s; + + TRACE_ENTER(H1_EV_H1S_NEW, h1c->conn); + + h1s = h1s_new(h1c); + if (!h1s) + goto fail; + + if (sc_attach_mux(sc, h1s, h1c->conn) < 0) + goto fail; + + h1s->flags |= H1S_F_RX_BLK; + h1s->sd = sc->sedesc; + h1s->sess = sess; + + if (!(global.tune.no_zero_copy_fwd & NO_ZERO_COPY_FWD_H1_SND)) + se_fl_set(h1s->sd, SE_FL_MAY_FASTFWD_CONS); + h1c->state = H1_CS_RUNNING; + + if (h1c->px->options2 & PR_O2_RSPBUG_OK) + h1s->res.err_pos = -1; + + HA_ATOMIC_INC(&h1c->px_counters->open_streams); + HA_ATOMIC_INC(&h1c->px_counters->total_streams); + + TRACE_LEAVE(H1_EV_H1S_NEW, h1c->conn, h1s); + return h1s; + + fail: + TRACE_DEVEL("leaving on error", H1_EV_STRM_NEW|H1_EV_STRM_ERR, h1c->conn); + h1s_destroy(h1s); + return NULL; +} + +static void h1s_destroy(struct h1s *h1s) +{ + if (h1s) { + struct h1c *h1c = h1s->h1c; + + TRACE_POINT(H1_EV_H1S_END, h1c->conn, h1s); + h1c->h1s = NULL; + + if (h1s->subs) + h1s->subs->events = 0; + + h1_release_buf(h1c, &h1s->rxbuf); + + h1c->flags &= ~(H1C_F_WANT_FASTFWD| + H1C_F_OUT_FULL|H1C_F_OUT_ALLOC|H1C_F_IN_SALLOC| + H1C_F_CO_MSG_MORE|H1C_F_CO_STREAMER); + + if (!(h1c->flags & (H1C_F_EOS|H1C_F_ERR_PENDING|H1C_F_ERROR|H1C_F_ABRT_PENDING|H1C_F_ABRTED)) && /* No error/read0/abort */ + h1_is_alive(h1c) && /* still alive */ + (h1s->flags & H1S_F_WANT_KAL) && /* K/A possible */ + h1s->req.state == H1_MSG_DONE && h1s->res.state == H1_MSG_DONE) { /* req/res in DONE state */ + h1c->state = H1_CS_IDLE; + h1c->flags |= H1C_F_WAIT_NEXT_REQ; + h1c->req_count++; + TRACE_STATE("set idle mode on h1c, waiting for the next request", H1_EV_H1C_ERR, h1c->conn, h1s); + } + else { + h1_close(h1c); + TRACE_STATE("close h1c", H1_EV_H1S_END, h1c->conn, h1s); + } + + HA_ATOMIC_DEC(&h1c->px_counters->open_streams); + BUG_ON(h1s->sd && !se_fl_test(h1s->sd, SE_FL_ORPHAN)); + sedesc_free(h1s->sd); + pool_free(pool_head_h1s, h1s); + } +} + +/* + * Initialize the mux once it's attached. It is expected that conn->ctx points + * to the existing stream connector (for outgoing connections or for incoming + * ones during a mux upgrade) or NULL (for incoming ones during the connection + * establishment). <input> is always used as Input buffer and may contain + * data. It is the caller responsibility to not reuse it anymore. Returns < 0 on + * error. 
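+ * A minimal caller sketch (hypothetical, for illustration only; real
+ * callers reach this through the mux_ops init entry point):
+ *
+ *   struct buffer input = BUF_NULL;
+ *
+ *   if (h1_init(conn, px, sess, &input) < 0)
+ *       return -1;   // mux not installed, <input> is still the caller's
+ *   // on success <input> was taken over by the mux (it became h1c->ibuf)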
+ */ +static int h1_init(struct connection *conn, struct proxy *proxy, struct session *sess, + struct buffer *input) +{ + struct h1c *h1c; + struct task *t = NULL; + void *conn_ctx = conn->ctx; + + TRACE_ENTER(H1_EV_H1C_NEW); + + h1c = pool_alloc(pool_head_h1c); + if (!h1c) { + TRACE_ERROR("H1C allocation failure", H1_EV_H1C_NEW|H1_EV_H1C_END|H1_EV_H1C_ERR); + goto fail_h1c; + } + h1c->conn = conn; + h1c->px = proxy; + + h1c->state = H1_CS_IDLE; + h1c->flags = H1C_F_NONE; + h1c->errcode = 0; + h1c->ibuf = *input; + h1c->obuf = BUF_NULL; + h1c->h1s = NULL; + h1c->task = NULL; + h1c->req_count = 0; + + LIST_INIT(&h1c->buf_wait.list); + h1c->wait_event.tasklet = tasklet_new(); + if (!h1c->wait_event.tasklet) + goto fail; + h1c->wait_event.tasklet->process = h1_io_cb; + h1c->wait_event.tasklet->context = h1c; + h1c->wait_event.events = 0; + h1c->idle_exp = TICK_ETERNITY; + + if (conn_is_back(conn)) { + h1c->flags |= H1C_F_IS_BACK; + h1c->shut_timeout = h1c->timeout = proxy->timeout.server; + if (tick_isset(proxy->timeout.serverfin)) + h1c->shut_timeout = proxy->timeout.serverfin; + + h1c->px_counters = EXTRA_COUNTERS_GET(proxy->extra_counters_be, + &h1_stats_module); + } else { + h1c->shut_timeout = h1c->timeout = proxy->timeout.client; + if (tick_isset(proxy->timeout.clientfin)) + h1c->shut_timeout = proxy->timeout.clientfin; + + h1c->px_counters = EXTRA_COUNTERS_GET(proxy->extra_counters_fe, + &h1_stats_module); + + LIST_APPEND(&mux_stopping_data[tid].list, + &h1c->conn->stopping_list); + } + if (tick_isset(h1c->timeout)) { + t = task_new_here(); + if (!t) { + TRACE_ERROR("H1C task allocation failure", H1_EV_H1C_NEW|H1_EV_H1C_END|H1_EV_H1C_ERR); + goto fail; + } + + h1c->task = t; + t->process = h1_timeout_task; + t->context = h1c; + + t->expire = tick_add(now_ms, h1c->timeout); + } + + conn->ctx = h1c; + + if (h1c->flags & H1C_F_IS_BACK) { + /* Create a new H1S now for backend connection only */ + if (!h1c_bck_stream_new(h1c, conn_ctx, sess)) + goto fail; + } + else if (conn_ctx) { + /* Upgraded frontend connection (from TCP) */ + if (!h1c_frt_stream_new(h1c, conn_ctx, h1c->conn->owner)) + goto fail; + + /* Attach the SC but Not ready yet */ + h1c->state = H1_CS_UPGRADING; + TRACE_DEVEL("Inherit the SC from TCP connection to perform an upgrade", + H1_EV_H1C_NEW|H1_EV_STRM_NEW, h1c->conn, h1c->h1s); + } + + if (t) { + h1_set_idle_expiration(h1c); + t->expire = tick_first(t->expire, h1c->idle_exp); + task_queue(t); + } + + /* prepare to read something */ + if (b_data(&h1c->ibuf)) + tasklet_wakeup(h1c->wait_event.tasklet); + else if (h1_recv_allowed(h1c)) + h1c->conn->xprt->subscribe(h1c->conn, h1c->conn->xprt_ctx, SUB_RETRY_RECV, &h1c->wait_event); + + if (!conn_is_back(conn)) + proxy_inc_fe_cum_sess_ver_ctr(sess->listener, proxy, 1); + HA_ATOMIC_INC(&h1c->px_counters->open_conns); + HA_ATOMIC_INC(&h1c->px_counters->total_conns); + + /* mux->wake will be called soon to complete the operation */ + TRACE_LEAVE(H1_EV_H1C_NEW, conn, h1c->h1s); + return 0; + + fail: + task_destroy(t); + tasklet_free(h1c->wait_event.tasklet); + pool_free(pool_head_h1c, h1c); + fail_h1c: + if (!conn_is_back(conn)) + LIST_DEL_INIT(&conn->stopping_list); + conn->ctx = conn_ctx; // restore saved context + TRACE_DEVEL("leaving in error", H1_EV_H1C_NEW|H1_EV_H1C_END|H1_EV_H1C_ERR); + return -1; +} + +/* release function. This one should be called to free all resources allocated + * to the mux. 
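+ * As a special case, when the H1C_F_UPG_H2C flag is set, an upgrade to
+ * the H2 mux is attempted first; when it succeeds, this mux instance has
+ * already been released by the upgrade itself and the function returns
+ * immediately.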
+ */ +static void h1_release(struct h1c *h1c) +{ + struct connection *conn = NULL; + + TRACE_POINT(H1_EV_H1C_END); + + /* The connection must be attached to this mux to be released */ + if (h1c->conn && h1c->conn->ctx == h1c) + conn = h1c->conn; + + if (conn && h1c->flags & H1C_F_UPG_H2C) { + TRACE_DEVEL("upgrading H1 to H2", H1_EV_H1C_END, conn); + /* Make sure we're no longer subscribed to anything */ + if (h1c->wait_event.events) + conn->xprt->unsubscribe(conn, conn->xprt_ctx, + h1c->wait_event.events, &h1c->wait_event); + if (conn_upgrade_mux_fe(conn, NULL, &h1c->ibuf, ist("h2"), PROTO_MODE_HTTP) != -1) { + /* connection successfully upgraded to H2, this + * mux was already released */ + return; + } + TRACE_ERROR("h2 upgrade failed", H1_EV_H1C_END|H1_EV_H1C_ERR, conn); + sess_log(conn->owner); /* Log if the upgrade failed */ + } + + + if (LIST_INLIST(&h1c->buf_wait.list)) + LIST_DEL_INIT(&h1c->buf_wait.list); + + h1_release_buf(h1c, &h1c->ibuf); + h1_release_buf(h1c, &h1c->obuf); + + if (h1c->task) { + h1c->task->context = NULL; + task_wakeup(h1c->task, TASK_WOKEN_OTHER); + h1c->task = NULL; + } + + if (h1c->wait_event.tasklet) { + tasklet_free(h1c->wait_event.tasklet); + h1c->wait_event.tasklet = NULL; + } + + h1s_destroy(h1c->h1s); + if (conn) { + if (h1c->wait_event.events != 0) + conn->xprt->unsubscribe(conn, conn->xprt_ctx, h1c->wait_event.events, + &h1c->wait_event); + h1_shutw_conn(conn); + } + + HA_ATOMIC_DEC(&h1c->px_counters->open_conns); + pool_free(pool_head_h1c, h1c); + + if (conn) { + if (!conn_is_back(conn)) + LIST_DEL_INIT(&conn->stopping_list); + + conn->mux = NULL; + conn->ctx = NULL; + TRACE_DEVEL("freeing conn", H1_EV_H1C_END, conn); + + conn_stop_tracking(conn); + conn_full_close(conn); + if (conn->destroy_cb) + conn->destroy_cb(conn); + conn_free(conn); + } +} + +/******************************************************/ +/* functions below are for the H1 protocol processing */ +/******************************************************/ +/* Parse the request version and set H1_MF_VER_11 on <h1m> if the version is + * greater or equal to 1.1 + */ +static void h1_parse_req_vsn(struct h1m *h1m, const struct htx_sl *sl) +{ + const char *p = HTX_SL_REQ_VPTR(sl); + + if ((HTX_SL_REQ_VLEN(sl) == 8) && + (*(p + 5) > '1' || + (*(p + 5) == '1' && *(p + 7) >= '1'))) + h1m->flags |= H1_MF_VER_11; +} + +/* Parse the response version and set H1_MF_VER_11 on <h1m> if the version is + * greater or equal to 1.1 + */ +static void h1_parse_res_vsn(struct h1m *h1m, const struct htx_sl *sl) +{ + const char *p = HTX_SL_RES_VPTR(sl); + + if ((HTX_SL_RES_VLEN(sl) == 8) && + (*(p + 5) > '1' || + (*(p + 5) == '1' && *(p + 7) >= '1'))) + h1m->flags |= H1_MF_VER_11; +} + +/* Deduce the connection mode of the client connection, depending on the + * configuration and the H1 message flags. This function is called twice, the + * first time when the request is parsed and the second time when the response + * is parsed. + */ +static void h1_set_cli_conn_mode(struct h1s *h1s, struct h1m *h1m) +{ + struct proxy *fe = h1s->h1c->px; + + if (h1m->flags & H1_MF_RESP) { + /* Output direction: second pass */ + if ((h1s->meth == HTTP_METH_CONNECT && h1s->status >= 200 && h1s->status < 300) || + h1s->status == 101) { + /* Either we've established an explicit tunnel, or we're + * switching the protocol. In both cases, we're very unlikely to + * understand the next protocols. We have to switch to tunnel + * mode, so that we transfer the request and responses then let + * this protocol pass unmodified. 
When we later implement + * specific parsers for such protocols, we'll want to check the + * Upgrade header which contains information about that protocol + * for responses with status 101 (eg: see RFC2817 about TLS). + */ + h1s->flags = (h1s->flags & ~H1S_F_WANT_MSK) | H1S_F_WANT_TUN; + TRACE_STATE("set tunnel mode (resp)", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1s->h1c->conn, h1s); + } + else if (h1s->flags & H1S_F_WANT_KAL) { + /* By default the client is in KAL mode. CLOSE mode means + * it is imposed by the client itself. So only change + * KAL mode here. */ + if (!(h1m->flags & H1_MF_XFER_LEN) || (h1m->flags & H1_MF_CONN_CLO)) { + /* no length known or explicit close => close */ + h1s->flags = (h1s->flags & ~H1S_F_WANT_MSK) | H1S_F_WANT_CLO; + TRACE_STATE("detect close mode (resp)", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1s->h1c->conn, h1s); + } + else if (!(h1m->flags & H1_MF_CONN_KAL) && + (fe->options & PR_O_HTTP_MODE) == PR_O_HTTP_CLO) { + /* no explicit keep-alive and option httpclose => close */ + h1s->flags = (h1s->flags & ~H1S_F_WANT_MSK) | H1S_F_WANT_CLO; + TRACE_STATE("force close mode (resp)", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1s->h1c->conn, h1s); + } + } + } + else { + /* Input direction: first pass */ + if (!(h1m->flags & (H1_MF_VER_11|H1_MF_CONN_KAL)) || h1m->flags & H1_MF_CONN_CLO) { + /* no explicit keep-alive in HTTP/1.0 or explicit close => close */ + h1s->flags = (h1s->flags & ~H1S_F_WANT_MSK) | H1S_F_WANT_CLO; + TRACE_STATE("detect close mode (req)", H1_EV_RX_DATA|H1_EV_RX_HDRS, h1s->h1c->conn, h1s); + } + } + + /* If KAL, check if the frontend is stopping. If yes, switch to CLO mode + * unless a 'close-spread-time' option is set (either to define a + * soft-close window or to disable active closing (close-spread-time + * option set to 0)). + */ + if (h1s->flags & H1S_F_WANT_KAL && (fe->flags & (PR_FL_DISABLED|PR_FL_STOPPED))) { + int want_clo = 1; + /* If a close-spread-time option is set, we want to avoid + * closing all the active HTTP connections at once so we add a + * random factor that will spread the closing. + */ + if (tick_isset(global.close_spread_end)) { + int remaining_window = tick_remain(now_ms, global.close_spread_end); + if (remaining_window) { + /* This should increase the closing rate the further along + * the window we are. + */ + want_clo = (remaining_window <= statistical_prng_range(global.close_spread_time)); + } + } + else if (global.tune.options & GTUNE_DISABLE_ACTIVE_CLOSE) + want_clo = 0; + + if (want_clo) { + h1s->flags = (h1s->flags & ~H1S_F_WANT_MSK) | H1S_F_WANT_CLO; + TRACE_STATE("stopping, set close mode", H1_EV_RX_DATA|H1_EV_RX_HDRS|H1_EV_TX_DATA|H1_EV_TX_HDRS, h1s->h1c->conn, h1s); + } + } +} + +/* Deduce the connection mode of the server connection, depending on the + * configuration and the H1 message flags. This function is called twice, the + * first time when the request is parsed and the second time when the response + * is parsed. + */ +static void h1_set_srv_conn_mode(struct h1s *h1s, struct h1m *h1m) +{ + struct session *sess = h1s->sess; + struct proxy *be = h1s->h1c->px; + int fe_flags = sess ? sess->fe->options : 0; + + if (h1m->flags & H1_MF_RESP) { + /* Input direction: second pass */ + if ((h1s->meth == HTTP_METH_CONNECT && h1s->status >= 200 && h1s->status < 300) || + h1s->status == 101) { + /* Either we've established an explicit tunnel, or we're + * switching the protocol. In both cases, we're very unlikely to + * understand the next protocols. We have to switch to tunnel + * mode, so that we transfer the request and responses then let + * this protocol pass unmodified. When we later implement + * specific parsers for such protocols, we'll want to check the + * Upgrade header which contains information about that protocol + * for responses with status 101 (eg: see RFC2817 about TLS). + */ + h1s->flags = (h1s->flags & ~H1S_F_WANT_MSK) | H1S_F_WANT_TUN; + TRACE_STATE("set tunnel mode (resp)", H1_EV_RX_DATA|H1_EV_RX_HDRS, h1s->h1c->conn, h1s); + } + else if (h1s->flags & H1S_F_WANT_KAL) { + /* By default the server is in KAL mode. CLOSE mode means + * it is imposed by haproxy itself. So only change KAL + * mode here. */ + if (!(h1m->flags & H1_MF_XFER_LEN) || h1m->flags & H1_MF_CONN_CLO || + !(h1m->flags & (H1_MF_VER_11|H1_MF_CONN_KAL))){ + /* no length known or explicit close or no explicit keep-alive in HTTP/1.0 => close */ + h1s->flags = (h1s->flags & ~H1S_F_WANT_MSK) | H1S_F_WANT_CLO; + TRACE_STATE("detect close mode (resp)", H1_EV_RX_DATA|H1_EV_RX_HDRS, h1s->h1c->conn, h1s); + } + } + } + else { + /* Output direction: first pass */ + if (h1m->flags & H1_MF_CONN_CLO) { + /* explicit close => close */ + h1s->flags = (h1s->flags & ~H1S_F_WANT_MSK) | H1S_F_WANT_CLO; + TRACE_STATE("detect close mode (req)", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1s->h1c->conn, h1s); + } + else if (!(h1m->flags & H1_MF_CONN_KAL) && + ((fe_flags & PR_O_HTTP_MODE) == PR_O_HTTP_SCL || + (be->options & PR_O_HTTP_MODE) == PR_O_HTTP_SCL || + (be->options & PR_O_HTTP_MODE) == PR_O_HTTP_CLO)) { + /* no explicit keep-alive and option httpclose/server-close => close */ + h1s->flags = (h1s->flags & ~H1S_F_WANT_MSK) | H1S_F_WANT_CLO; + TRACE_STATE("force close mode (req)", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1s->h1c->conn, h1s); + } + } + + /* If KAL, check if the backend is stopping. 
If yes, switch in CLO mode */ + if (h1s->flags & H1S_F_WANT_KAL && (be->flags & (PR_FL_DISABLED|PR_FL_STOPPED))) { + h1s->flags = (h1s->flags & ~H1S_F_WANT_MSK) | H1S_F_WANT_CLO; + TRACE_STATE("stopping, set close mode", H1_EV_RX_DATA|H1_EV_RX_HDRS|H1_EV_TX_DATA|H1_EV_TX_HDRS, h1s->h1c->conn, h1s); + } +} + +static void h1_update_req_conn_value(struct h1s *h1s, struct h1m *h1m, struct ist *conn_val) +{ + struct proxy *px = h1s->h1c->px; + + /* Don't update "Connection:" header in TUNNEL mode or if "Upgrage" + * token is found + */ + if (h1s->flags & H1S_F_WANT_TUN || h1m->flags & H1_MF_CONN_UPG) + return; + + if (h1s->flags & H1S_F_WANT_KAL || px->options2 & PR_O2_FAKE_KA) { + if (!(h1m->flags & H1_MF_VER_11)) { + TRACE_STATE("add \"Connection: keep-alive\"", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1s->h1c->conn, h1s); + *conn_val = ist("keep-alive"); + } + } + else { /* H1S_F_WANT_CLO && !PR_O2_FAKE_KA */ + if (h1m->flags & H1_MF_VER_11) { + TRACE_STATE("add \"Connection: close\"", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1s->h1c->conn, h1s); + *conn_val = ist("close"); + } + } +} + +static void h1_update_res_conn_value(struct h1s *h1s, struct h1m *h1m, struct ist *conn_val) +{ + /* Don't update "Connection:" header in TUNNEL mode or if "Upgrage" + * token is found + */ + if (h1s->flags & H1S_F_WANT_TUN || h1m->flags & H1_MF_CONN_UPG) + return; + + if (h1s->flags & H1S_F_WANT_KAL) { + if (!(h1m->flags & H1_MF_VER_11) || + !((h1m->flags & h1s->req.flags) & H1_MF_VER_11)) { + TRACE_STATE("add \"Connection: keep-alive\"", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1s->h1c->conn, h1s); + *conn_val = ist("keep-alive"); + } + } + else { /* H1S_F_WANT_CLO */ + if (h1m->flags & H1_MF_VER_11) { + TRACE_STATE("add \"Connection: close\"", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1s->h1c->conn, h1s); + *conn_val = ist("close"); + } + } +} + +static void h1_process_input_conn_mode(struct h1s *h1s, struct h1m *h1m, struct htx *htx) +{ + if (!(h1s->h1c->flags & H1C_F_IS_BACK)) + h1_set_cli_conn_mode(h1s, h1m); + else + h1_set_srv_conn_mode(h1s, h1m); +} + +static void h1_process_output_conn_mode(struct h1s *h1s, struct h1m *h1m, struct ist *conn_val) +{ + if (!(h1s->h1c->flags & H1C_F_IS_BACK)) + h1_set_cli_conn_mode(h1s, h1m); + else + h1_set_srv_conn_mode(h1s, h1m); + + if (!(h1m->flags & H1_MF_RESP)) + h1_update_req_conn_value(h1s, h1m, conn_val); + else + h1_update_res_conn_value(h1s, h1m, conn_val); +} + +/* Try to adjust the case of the message header name using the global map + * <hdrs_map>. + */ +static void h1_adjust_case_outgoing_hdr(struct h1s *h1s, struct h1m *h1m, struct ist *name) +{ + struct ebpt_node *node; + struct h1_hdr_entry *entry; + + /* No entry in the map, do nothing */ + if (eb_is_empty(&hdrs_map.map)) + return; + + /* No conversion for the request headers */ + if (!(h1m->flags & H1_MF_RESP) && !(h1s->h1c->px->options2 & PR_O2_H1_ADJ_BUGSRV)) + return; + + /* No conversion for the response headers */ + if ((h1m->flags & H1_MF_RESP) && !(h1s->h1c->px->options2 & PR_O2_H1_ADJ_BUGCLI)) + return; + + node = ebis_lookup_len(&hdrs_map.map, name->ptr, name->len); + if (!node) + return; + entry = container_of(node, struct h1_hdr_entry, node); + name->ptr = entry->name.ptr; + name->len = entry->name.len; +} + +/* Append the description of what is present in error snapshot <es> into <out>. + * The description must be small enough to always fit in a buffer. The output + * buffer may be the trash so the trash must not be used inside this function. 
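+ *
+ * As an illustration only, the appended description has this shape (the
+ * values below are hypothetical, derived from the format string used in the
+ * function):
+ *
+ *     H1 connection flags 0x00000000, H1 stream flags 0x00000000
+ *     H1 msg state MSG_RQBEFORE(0), H1 msg flags 0x00000000
+ *     H1 chunk len 0 bytes, H1 body len 0 bytes :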
+ */ +static void h1_show_error_snapshot(struct buffer *out, const struct error_snapshot *es) +{ + chunk_appendf(out, + " H1 connection flags 0x%08x, H1 stream flags 0x%08x\n" + " H1 msg state %s(%d), H1 msg flags 0x%08x\n" + " H1 chunk len %lld bytes, H1 body len %lld bytes :\n", + es->ctx.h1.c_flags, es->ctx.h1.s_flags, + h1m_state_str(es->ctx.h1.state), es->ctx.h1.state, + es->ctx.h1.m_flags, es->ctx.h1.m_clen, es->ctx.h1.m_blen); +} +/* + * Capture a bad request or response and archive it in the proxy's structure. + * By default it tries to report the error position as h1m->err_pos. However if + * this one is not set, it will then report h1m->next, which is the last known + * parsing point. The function is able to deal with wrapping buffers. It always + * displays buffers as a contiguous area starting at buf->p. The direction is + * determined thanks to the h1m's flags. + */ +static void h1_capture_bad_message(struct h1c *h1c, struct h1s *h1s, + struct h1m *h1m, struct buffer *buf) +{ + struct session *sess = h1s->sess; + struct proxy *proxy = h1c->px; + struct proxy *other_end; + union error_snapshot_ctx ctx; + + if (h1c->state == H1_CS_UPGRADING || h1c->state == H1_CS_RUNNING) { + if (sess == NULL) + sess = __sc_strm(h1s_sc(h1s))->sess; + if (!(h1m->flags & H1_MF_RESP)) + other_end = __sc_strm(h1s_sc(h1s))->be; + else + other_end = sess->fe; + } else + other_end = NULL; + + /* http-specific part now */ + ctx.h1.state = h1m->state; + ctx.h1.c_flags = h1c->flags; + ctx.h1.s_flags = h1s->flags; + ctx.h1.m_flags = h1m->flags; + ctx.h1.m_clen = h1m->curr_len; + ctx.h1.m_blen = h1m->body_len; + + proxy_capture_error(proxy, !!(h1m->flags & H1_MF_RESP), other_end, + h1c->conn->target, sess, buf, 0, 0, + (h1m->err_pos >= 0) ? h1m->err_pos : h1m->next, + &ctx, h1_show_error_snapshot); +} + +/* Emit the chunksize followed by a CRLF in front of data of the buffer + * <buf>. It goes backwards and starts with the byte before the buffer's + * head. The caller is responsible for ensuring there is enough room left before + * the buffer's head for the string. + */ +static void h1_prepend_chunk_size(struct buffer *buf, size_t chksz) +{ + char *beg, *end; + + beg = end = b_head(buf); + *--beg = '\n'; + *--beg = '\r'; + do { + *--beg = hextab[chksz & 0xF]; + } while (chksz >>= 4); + buf->head -= (end - beg); + b_add(buf, end - beg); +} + +/* Emit the chunksize followed by a CRLF after the data of the buffer + * <buf>. Returns 0 on error. + */ +static int h1_append_chunk_size(struct buffer *buf, size_t chksz) +{ + char tmp[10]; + char *beg, *end; + + beg = end = tmp+10; + *--beg = '\n'; + *--beg = '\r'; + do { + *--beg = hextab[chksz & 0xF]; + } while (chksz >>= 4); + + return chunk_memcat(buf, beg, end - beg); +} + +/* Emit a CRLF in front of data of the buffer <buf>. It goes backwards and + * starts with the byte before the buffer's head. The caller is responsible for + * ensuring there is enough room left before the buffer's head for the string. + */ +static void h1_prepend_chunk_crlf(struct buffer *buf) +{ + char *head; + + head = b_head(buf); + *--head = '\n'; + *--head = '\r'; + buf->head -= 2; + b_add(buf, 2); +} + + +/* Emit a CRLF after the data of the buffer <buf>. The caller is responsible for + * ensuring there is enough room left in the buffer for the string. */ +static void h1_append_chunk_crlf(struct buffer *buf) +{ + *(b_peek(buf, b_data(buf))) = '\r'; + *(b_peek(buf, b_data(buf) + 1)) = '\n'; + b_add(buf, 2); +} + +/* + * Switch the stream to tunnel mode. 
This function must only be called on 2xx
+ * (successful) replies to CONNECT requests or on 101 (switching protocol).
+ */
+static void h1_set_tunnel_mode(struct h1s *h1s)
+{
+	struct h1c *h1c = h1s->h1c;
+
+	h1s->req.state = H1_MSG_TUNNEL;
+	h1s->req.flags &= ~(H1_MF_XFER_LEN|H1_MF_CLEN|H1_MF_CHNK);
+
+	h1s->res.state = H1_MSG_TUNNEL;
+	h1s->res.flags &= ~(H1_MF_XFER_LEN|H1_MF_CLEN|H1_MF_CHNK);
+
+	TRACE_STATE("switch H1 stream in tunnel mode", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1c->conn, h1s);
+
+	if (h1s->flags & H1S_F_RX_BLK) {
+		h1s->flags &= ~H1S_F_RX_BLK;
+		h1_wake_stream_for_recv(h1s);
+		TRACE_STATE("Re-enable input processing", H1_EV_RX_DATA|H1_EV_H1S_BLK|H1_EV_STRM_WAKE, h1c->conn, h1s);
+	}
+	if (h1s->flags & H1S_F_TX_BLK) {
+		h1s->flags &= ~H1S_F_TX_BLK;
+		h1_wake_stream_for_send(h1s);
+		TRACE_STATE("Re-enable output processing", H1_EV_TX_DATA|H1_EV_H1S_BLK|H1_EV_STRM_WAKE, h1c->conn, h1s);
+	}
+}
+
+/* Search for a websocket key header. The message should have been identified
+ * as a valid websocket handshake.
+ *
+ * On the request side, if found the key is stored in the session. It might be
+ * needed to calculate the response key if the server side is using http/2.
+ *
+ * On the response side, the key might be verified if haproxy has been
+ * responsible for the generation of a key. This happens when a h2 client is
+ * interfaced with a h1 server.
+ *
+ * Returns 0 if no key found or invalid key
+ */
+static int h1_search_websocket_key(struct h1s *h1s, struct h1m *h1m, struct htx *htx)
+{
+	struct htx_blk *blk;
+	enum htx_blk_type type;
+	struct ist n, v;
+	int ws_key_found = 0, idx;
+
+	idx = htx_get_head(htx); // returns the SL that we skip
+	while ((idx = htx_get_next(htx, idx)) != -1) {
+		blk = htx_get_blk(htx, idx);
+		type = htx_get_blk_type(blk);
+
+		if (type == HTX_BLK_UNUSED)
+			continue;
+
+		if (type != HTX_BLK_HDR)
+			break;
+
+		n = htx_get_blk_name(htx, blk);
+		v = htx_get_blk_value(htx, blk);
+
+		/* The websocket key is the base64 encoding of 16 bytes */
+		if (isteqi(n, ist("sec-websocket-key")) && v.len == 24 &&
+		    !(h1m->flags & H1_MF_RESP)) {
+			/* Copy the key on request side
+			 * we might need it if the server is using h2 and does
+			 * not provide the response
+			 */
+			memcpy(h1s->ws_key, v.ptr, 24);
+			ws_key_found = 1;
+			break;
+		}
+		else if (isteqi(n, ist("sec-websocket-accept")) &&
+			 h1m->flags & H1_MF_RESP) {
+			/* Need to verify the response key if the input was
+			 * generated by haproxy
+			 */
+			if (h1s->ws_key[0]) {
+				char key[29];
+				h1_calculate_ws_output_key(h1s->ws_key, key);
+				if (!isteqi(ist(key), v))
+					break;
+			}
+			ws_key_found = 1;
+			break;
+		}
+	}
+
+	/* missing websocket key, reject the message */
+	if (!ws_key_found) {
+		htx->flags |= HTX_FL_PARSING_ERROR;
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Parse HTTP/1 headers. It returns the number of bytes parsed if > 0, or 0 if
+ * it couldn't proceed. Parsing errors are reported by setting H1S_F_*_ERROR
+ * flag. If more room is requested, H1S_F_RX_CONGESTED flag is set. It relies on
+ * the function h1_parse_msg_hdrs() to do the parsing.
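+ *
+ * A typical invocation, as performed by h1_process_demux() below, looks like
+ * this sketch (error handling elided):
+ *
+ *     ret = h1_handle_headers(h1s, h1m, htx, &h1c->ibuf, &total, count);
+ *     if (!ret)
+ *         break; // missing data, parsing error or RX congestion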
+ */
+static size_t h1_handle_headers(struct h1s *h1s, struct h1m *h1m, struct htx *htx,
+				struct buffer *buf, size_t *ofs, size_t max)
+{
+	union h1_sl h1sl;
+	int ret = 0;
+
+	TRACE_ENTER(H1_EV_RX_DATA|H1_EV_RX_HDRS, h1s->h1c->conn, h1s, 0, (size_t[]){max});
+
+	if (h1s->meth == HTTP_METH_CONNECT)
+		h1m->flags |= H1_MF_METH_CONNECT;
+	if (h1s->meth == HTTP_METH_HEAD)
+		h1m->flags |= H1_MF_METH_HEAD;
+
+	ret = h1_parse_msg_hdrs(h1m, &h1sl, htx, buf, *ofs, max);
+	if (ret <= 0) {
+		TRACE_DEVEL("leaving on missing data or error", H1_EV_RX_DATA|H1_EV_RX_HDRS, h1s->h1c->conn, h1s);
+		if (ret == -1) {
+			h1s->flags |= H1S_F_PARSING_ERROR;
+			TRACE_ERROR("parsing error, reject H1 message", H1_EV_RX_DATA|H1_EV_RX_HDRS|H1_EV_H1S_ERR, h1s->h1c->conn, h1s);
+			h1_capture_bad_message(h1s->h1c, h1s, h1m, buf);
+		}
+		else if (ret == -2) {
+			TRACE_STATE("RX path congested, waiting for more space", H1_EV_RX_DATA|H1_EV_RX_HDRS|H1_EV_H1S_BLK, h1s->h1c->conn, h1s);
+			h1s->flags |= H1S_F_RX_CONGESTED;
+		}
+		ret = 0;
+		goto end;
+	}
+
+
+	/* Reject HTTP/1.0 GET/HEAD/DELETE requests with a payload except if
+	 * accept_payload_with_any_method global option is set.
+	 * There is a payload if the C-L is not null or the payload is
+	 * chunk-encoded. A parsing error is reported but a
+	 * 413-Payload-Too-Large is returned instead of a 400-Bad-Request.
+	 */
+	if (!accept_payload_with_any_method &&
+	    !(h1m->flags & (H1_MF_RESP|H1_MF_VER_11)) &&
+	    (((h1m->flags & H1_MF_CLEN) && h1m->body_len) || (h1m->flags & H1_MF_CHNK)) &&
+	    (h1sl.rq.meth == HTTP_METH_GET || h1sl.rq.meth == HTTP_METH_HEAD || h1sl.rq.meth == HTTP_METH_DELETE)) {
+		h1s->flags |= H1S_F_PARSING_ERROR;
+		htx->flags |= HTX_FL_PARSING_ERROR;
+		h1s->h1c->errcode = 413;
+		TRACE_ERROR("HTTP/1.0 GET/HEAD/DELETE request with a payload forbidden", H1_EV_RX_DATA|H1_EV_RX_HDRS|H1_EV_H1S_ERR, h1s->h1c->conn, h1s);
+		h1_capture_bad_message(h1s->h1c, h1s, h1m, buf);
+		ret = 0;
+		goto end;
+	}
+
+	/* Reject any message with an unknown transfer-encoding, in fact any
+	 * encoding other than "chunked". A 422-Unprocessable-Content is
+	 * returned for an invalid request, a 502-Bad-Gateway for an invalid
+	 * response.
+	 */
+	if (h1m->flags & H1_MF_TE_OTHER) {
+		h1s->flags |= H1S_F_PARSING_ERROR;
+		htx->flags |= HTX_FL_PARSING_ERROR;
+		if (!(h1m->flags & H1_MF_RESP))
+			h1s->h1c->errcode = 422;
+		TRACE_ERROR("Unknown transfer-encoding", H1_EV_RX_DATA|H1_EV_RX_HDRS|H1_EV_H1S_ERR, h1s->h1c->conn, h1s);
+		h1_capture_bad_message(h1s->h1c, h1s, h1m, buf);
+		ret = 0;
+		goto end;
+	}
+
+	/* If websocket handshake, search for the websocket key */
+	if ((h1m->flags & (H1_MF_CONN_UPG|H1_MF_UPG_WEBSOCKET)) ==
+	    (H1_MF_CONN_UPG|H1_MF_UPG_WEBSOCKET)) {
+		int ws_ret = h1_search_websocket_key(h1s, h1m, htx);
+		if (!ws_ret) {
+			h1s->flags |= H1S_F_PARSING_ERROR;
+			TRACE_ERROR("missing/invalid websocket key, reject H1 message", H1_EV_RX_DATA|H1_EV_RX_HDRS|H1_EV_H1S_ERR, h1s->h1c->conn, h1s);
+			h1_capture_bad_message(h1s->h1c, h1s, h1m, buf);
+
+			ret = 0;
+			goto end;
+		}
+	}
+
+	if (h1m->err_pos >= 0) {
+		/* Maybe we found an error during the parsing while we were
+		 * configured not to block on that, so we have to capture it
+		 * now.
+ */ + TRACE_STATE("Ignored parsing error", H1_EV_RX_DATA|H1_EV_RX_HDRS, h1s->h1c->conn, h1s); + h1_capture_bad_message(h1s->h1c, h1s, h1m, buf); + } + + if (!(h1m->flags & H1_MF_RESP)) { + h1s->meth = h1sl.rq.meth; + if (h1s->meth == HTTP_METH_HEAD) + h1s->flags |= H1S_F_BODYLESS_RESP; + } + else { + h1s->status = h1sl.st.status; + if (h1s->status == 204 || h1s->status == 304) + h1s->flags |= H1S_F_BODYLESS_RESP; + } + h1_process_input_conn_mode(h1s, h1m, htx); + *ofs += ret; + + end: + TRACE_LEAVE(H1_EV_RX_DATA|H1_EV_RX_HDRS, h1s->h1c->conn, h1s, 0, (size_t[]){ret}); + return ret; +} + +/* + * Parse HTTP/1 body. It returns the number of bytes parsed if > 0, or 0 if it + * couldn't proceed. Parsing errors are reported by setting H1S_F_*_ERROR flag. + * If relies on the function http_parse_msg_data() to do the parsing. + */ +static size_t h1_handle_data(struct h1s *h1s, struct h1m *h1m, struct htx **htx, + struct buffer *buf, size_t *ofs, size_t max, + struct buffer *htxbuf) +{ + size_t ret; + + TRACE_ENTER(H1_EV_RX_DATA|H1_EV_RX_BODY, h1s->h1c->conn, h1s, 0, (size_t[]){max}); + ret = h1_parse_msg_data(h1m, htx, buf, *ofs, max, htxbuf); + if (!ret) { + TRACE_DEVEL("leaving on missing data or error", H1_EV_RX_DATA|H1_EV_RX_BODY, h1s->h1c->conn, h1s); + if ((*htx)->flags & HTX_FL_PARSING_ERROR) { + h1s->flags |= H1S_F_PARSING_ERROR; + TRACE_ERROR("parsing error, reject H1 message", H1_EV_RX_DATA|H1_EV_RX_BODY|H1_EV_H1S_ERR, h1s->h1c->conn, h1s); + h1_capture_bad_message(h1s->h1c, h1s, h1m, buf); + } + goto end; + } + + *ofs += ret; + + end: + if (b_data(buf) != *ofs && (h1m->state == H1_MSG_DATA || h1m->state == H1_MSG_TUNNEL)) { + TRACE_STATE("RX path congested, waiting for more space", H1_EV_RX_DATA|H1_EV_RX_BODY|H1_EV_H1S_BLK, h1s->h1c->conn, h1s); + h1s->flags |= H1S_F_RX_CONGESTED; + } + + TRACE_LEAVE(H1_EV_RX_DATA|H1_EV_RX_BODY, h1s->h1c->conn, h1s, 0, (size_t[]){ret}); + return ret; +} + +/* + * Parse HTTP/1 trailers. It returns the number of bytes parsed if > 0, or 0 if + * it couldn't proceed. Parsing errors are reported by setting H1S_F_*_ERROR + * flag and filling h1s->err_pos and h1s->err_state fields. This functions is + * responsible to update the parser state <h1m>. If more room is requested, + * H1S_F_RX_CONGESTED flag is set. + */ +static size_t h1_handle_trailers(struct h1s *h1s, struct h1m *h1m, struct htx *htx, + struct buffer *buf, size_t *ofs, size_t max) +{ + int ret; + + TRACE_ENTER(H1_EV_RX_DATA|H1_EV_RX_TLRS, h1s->h1c->conn, h1s, 0, (size_t[]){max}); + ret = h1_parse_msg_tlrs(h1m, htx, buf, *ofs, max); + if (ret <= 0) { + TRACE_DEVEL("leaving on missing data or error", H1_EV_RX_DATA|H1_EV_RX_BODY, h1s->h1c->conn, h1s); + if (ret == -1) { + h1s->flags |= H1S_F_PARSING_ERROR; + TRACE_ERROR("parsing error, reject H1 message", H1_EV_RX_DATA|H1_EV_RX_TLRS|H1_EV_H1S_ERR, h1s->h1c->conn, h1s); + h1_capture_bad_message(h1s->h1c, h1s, h1m, buf); + } + else if (ret == -2) { + TRACE_STATE("RX path congested, waiting for more space", H1_EV_RX_DATA|H1_EV_RX_TLRS|H1_EV_H1S_BLK, h1s->h1c->conn, h1s); + h1s->flags |= H1S_F_RX_CONGESTED; + } + ret = 0; + goto end; + } + + *ofs += ret; + + end: + TRACE_LEAVE(H1_EV_RX_DATA|H1_EV_RX_TLRS, h1s->h1c->conn, h1s, 0, (size_t[]){ret}); + return ret; +} + +/* + * Process incoming data. It parses data and transfer them from h1c->ibuf into + * <buf>. It returns the number of bytes parsed and transferred if > 0, or 0 if + * it couldn't proceed. + * + * WARNING: H1S_F_RX_CONGESTED flag must be removed before processing input data. 
+ */ +static size_t h1_process_demux(struct h1c *h1c, struct buffer *buf, size_t count) +{ + struct h1s *h1s = h1c->h1s; + struct h1m *h1m; + struct htx *htx; + size_t data; + size_t ret = 0; + size_t total = 0; + + htx = htx_from_buf(buf); + TRACE_ENTER(H1_EV_RX_DATA, h1c->conn, h1s, htx, (size_t[]){count}); + + h1m = (!(h1c->flags & H1C_F_IS_BACK) ? &h1s->req : &h1s->res); + data = htx->data; + + if (h1s->flags & (H1S_F_INTERNAL_ERROR|H1S_F_PARSING_ERROR|H1S_F_NOT_IMPL_ERROR)) + goto end; + + if (h1s->flags & H1S_F_RX_BLK) + goto out; + + /* Always remove congestion flags and try to process more input data */ + h1s->flags &= ~H1S_F_RX_CONGESTED; + + do { + size_t used = htx_used_space(htx); + + if (h1m->state <= H1_MSG_LAST_LF) { + TRACE_PROTO("parsing message headers", H1_EV_RX_DATA|H1_EV_RX_HDRS, h1c->conn, h1s); + ret = h1_handle_headers(h1s, h1m, htx, &h1c->ibuf, &total, count); + if (!ret) + break; + + TRACE_USER((!(h1m->flags & H1_MF_RESP) ? "rcvd H1 request headers" : "rcvd H1 response headers"), + H1_EV_RX_DATA|H1_EV_RX_HDRS, h1c->conn, h1s, htx, (size_t[]){ret}); + + if ((h1m->flags & H1_MF_RESP) && + h1s->status < 200 && (h1s->status == 100 || h1s->status >= 102)) { + h1m_init_res(&h1s->res); + h1m->flags |= (H1_MF_NO_PHDR|H1_MF_CLEAN_CONN_HDR); + TRACE_STATE("1xx response rcvd", H1_EV_RX_DATA|H1_EV_RX_HDRS, h1c->conn, h1s); + } + } + else if (h1m->state < H1_MSG_TRAILERS) { + TRACE_PROTO("parsing message payload", H1_EV_RX_DATA|H1_EV_RX_BODY, h1c->conn, h1s); + ret = h1_handle_data(h1s, h1m, &htx, &h1c->ibuf, &total, count, buf); + if (h1m->state < H1_MSG_TRAILERS) + break; + + TRACE_PROTO((!(h1m->flags & H1_MF_RESP) ? "rcvd H1 request payload data" : "rcvd H1 response payload data"), + H1_EV_RX_DATA|H1_EV_RX_BODY, h1c->conn, h1s, htx, (size_t[]){ret}); + } + else if (h1m->state == H1_MSG_TRAILERS) { + TRACE_PROTO("parsing message trailers", H1_EV_RX_DATA|H1_EV_RX_TLRS, h1c->conn, h1s); + ret = h1_handle_trailers(h1s, h1m, htx, &h1c->ibuf, &total, count); + if (h1m->state != H1_MSG_DONE) + break; + + TRACE_PROTO((!(h1m->flags & H1_MF_RESP) ? "rcvd H1 request trailers" : "rcvd H1 response trailers"), + H1_EV_RX_DATA|H1_EV_RX_TLRS, h1c->conn, h1s, htx, (size_t[]){ret}); + } + else if (h1m->state == H1_MSG_DONE) { + TRACE_USER((!(h1m->flags & H1_MF_RESP) ? "H1 request fully rcvd" : "H1 response fully rcvd"), + H1_EV_RX_DATA|H1_EV_RX_EOI, h1c->conn, h1s, htx); + + if (!(h1c->flags & H1C_F_IS_BACK)) { + /* The request was fully received. It means the H1S now + * expect data from the opposite side + */ + se_expect_data(h1s->sd); + } + + if ((h1m->flags & H1_MF_RESP) && + ((h1s->meth == HTTP_METH_CONNECT && h1s->status >= 200 && h1s->status < 300) || h1s->status == 101)) + h1_set_tunnel_mode(h1s); + else { + if (h1s->req.state < H1_MSG_DONE || h1s->res.state < H1_MSG_DONE) { + /* Unfinished transaction: block this input side waiting the end of the output side */ + h1s->flags |= H1S_F_RX_BLK; + TRACE_STATE("Disable input processing", H1_EV_RX_DATA|H1_EV_H1S_BLK, h1c->conn, h1s); + } + if (h1s->flags & H1S_F_TX_BLK) { + h1s->flags &= ~H1S_F_TX_BLK; + h1_wake_stream_for_send(h1s); + TRACE_STATE("Re-enable output processing", H1_EV_TX_DATA|H1_EV_H1S_BLK|H1_EV_STRM_WAKE, h1c->conn, h1s); + } + break; + } + } + else if (h1m->state == H1_MSG_TUNNEL) { + TRACE_PROTO("parsing tunneled data", H1_EV_RX_DATA, h1c->conn, h1s); + ret = h1_handle_data(h1s, h1m, &htx, &h1c->ibuf, &total, count, buf); + if (!ret) + break; + + TRACE_PROTO((!(h1m->flags & H1_MF_RESP) ? 
"rcvd H1 request tunneled data" : "rcvd H1 response tunneled data"), + H1_EV_RX_DATA|H1_EV_RX_EOI, h1c->conn, h1s, htx, (size_t[]){ret}); + } + else { + h1s->flags |= H1S_F_PARSING_ERROR; + break; + } + + count -= htx_used_space(htx) - used; + } while (!(h1s->flags & (H1S_F_PARSING_ERROR|H1S_F_NOT_IMPL_ERROR|H1S_F_RX_BLK|H1S_F_RX_CONGESTED))); + + + if (h1s->flags & (H1S_F_PARSING_ERROR|H1S_F_NOT_IMPL_ERROR)) { + TRACE_ERROR("parsing or not-implemented error", H1_EV_RX_DATA|H1_EV_H1S_ERR, h1c->conn, h1s); + goto err; + } + + b_del(&h1c->ibuf, total); + + TRACE_DEVEL("incoming data parsed", H1_EV_RX_DATA, h1c->conn, h1s, htx, (size_t[]){ret}); + + ret = htx->data - data; + if ((h1c->flags & H1C_F_IN_FULL) && buf_room_for_htx_data(&h1c->ibuf)) { + h1c->flags &= ~H1C_F_IN_FULL; + TRACE_STATE("h1c ibuf not full anymore", H1_EV_RX_DATA|H1_EV_H1C_BLK|H1_EV_H1C_WAKE, h1c->conn, h1s); + h1c->conn->xprt->subscribe(h1c->conn, h1c->conn->xprt_ctx, SUB_RETRY_RECV, &h1c->wait_event); + } + + if (!b_data(&h1c->ibuf)) + h1_release_buf(h1c, &h1c->ibuf); + + if (h1m->state <= H1_MSG_LAST_LF) + goto out; + + if (h1c->state < H1_CS_RUNNING) { + /* The H1 connection is not ready. Most of time, there is no SC + * attached, except for TCP>H1 upgrade, from a TCP frontend. In both + * cases, it is only possible on the client side. + */ + BUG_ON(h1c->flags & H1C_F_IS_BACK); + + if (h1c->state == H1_CS_EMBRYONIC) { + TRACE_DEVEL("request headers fully parsed, create and attach the SC", H1_EV_RX_DATA, h1c->conn, h1s); + BUG_ON(h1s_sc(h1s)); + if (!h1s_new_sc(h1s, buf)) { + h1s->flags |= H1S_F_INTERNAL_ERROR; + goto err; + } + } + else { + TRACE_DEVEL("request headers fully parsed, upgrade the inherited SC", H1_EV_RX_DATA, h1c->conn, h1s); + BUG_ON(h1s_sc(h1s) == NULL); + if (!h1s_upgrade_sc(h1s, buf)) { + h1s->flags |= H1S_F_INTERNAL_ERROR; + TRACE_ERROR("H1S upgrade failure", H1_EV_RX_DATA|H1_EV_H1S_ERR, h1c->conn, h1s); + goto err; + } + } + } + + /* Here h1s_sc(h1s) is always defined */ + if (!(h1c->flags & H1C_F_CANT_FASTFWD) && + (!(h1m->flags & H1_MF_RESP) || !(h1s->flags & H1S_F_BODYLESS_RESP)) && + (h1m->state == H1_MSG_DATA || h1m->state == H1_MSG_TUNNEL) && + !(global.tune.no_zero_copy_fwd & NO_ZERO_COPY_FWD_H1_RCV)) { + TRACE_STATE("notify the mux can use fast-forward", H1_EV_RX_DATA|H1_EV_RX_BODY, h1c->conn, h1s); + se_fl_set(h1s->sd, SE_FL_MAY_FASTFWD_PROD); + } + else { + TRACE_STATE("notify the mux can't use fast-forward anymore", H1_EV_RX_DATA|H1_EV_RX_BODY, h1c->conn, h1s); + se_fl_clr(h1s->sd, SE_FL_MAY_FASTFWD_PROD); + h1c->flags &= ~H1C_F_WANT_FASTFWD; + } + + /* Set EOI on stream connector in DONE state iff: + * - it is a response + * - it is a request but no a protocol upgrade nor a CONNECT + * + * If not set, Wait the response to do so or not depending on the status + * code. + */ + if (((h1m->state == H1_MSG_DONE) && (h1m->flags & H1_MF_RESP)) || + ((h1m->state == H1_MSG_DONE) && (h1s->meth != HTTP_METH_CONNECT) && !(h1m->flags & H1_MF_CONN_UPG))) + se_fl_set(h1s->sd, SE_FL_EOI); + + out: + /* When Input data are pending for this message, notify upper layer that + * the mux need more space in the HTX buffer to continue if : + * + * - The parser is blocked in MSG_DATA or MSG_TUNNEL state + * - Headers or trailers are pending to be copied. 
+ */ + if (h1s->flags & (H1S_F_RX_CONGESTED)) { + se_fl_set(h1s->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + TRACE_STATE("waiting for more room", H1_EV_RX_DATA|H1_EV_H1S_BLK, h1c->conn, h1s); + } + else { + se_fl_clr(h1s->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + if (h1c->flags & H1C_F_EOS) { + se_fl_set(h1s->sd, SE_FL_EOS); + TRACE_STATE("report EOS to SE", H1_EV_RX_DATA, h1c->conn, h1s); + if (h1m->state >= H1_MSG_DONE || (h1m->state > H1_MSG_LAST_LF && !(h1m->flags & H1_MF_XFER_LEN))) { + /* DONE or TUNNEL or SHUTR without XFER_LEN, set + * EOI on the stream connector */ + se_fl_set(h1s->sd, SE_FL_EOI); + TRACE_STATE("report EOI to SE", H1_EV_RX_DATA, h1c->conn, h1s); + } + else if (h1m->state < H1_MSG_DONE) { + if (h1m->state <= H1_MSG_LAST_LF && b_data(&h1c->ibuf)) + htx->flags |= HTX_FL_PARSING_ERROR; + se_fl_set(h1s->sd, SE_FL_ERROR); + TRACE_ERROR("message aborted, set error on SC", H1_EV_RX_DATA|H1_EV_H1S_ERR, h1c->conn, h1s); + } + + if (h1s->flags & H1S_F_TX_BLK) { + h1s->flags &= ~H1S_F_TX_BLK; + h1_wake_stream_for_send(h1s); + TRACE_STATE("Re-enable output processing", H1_EV_TX_DATA|H1_EV_H1S_BLK|H1_EV_STRM_WAKE, h1c->conn, h1s); + } + } + if (h1c->flags & H1C_F_ERROR) { + /* Report a terminal error to the SE if a previous read error was detected */ + se_fl_set(h1s->sd, SE_FL_ERROR); + TRACE_STATE("report ERROR to SE", H1_EV_RX_DATA|H1_EV_H1S_ERR, h1c->conn, h1s); + } + } + + end: + htx_to_buf(htx, buf); + TRACE_LEAVE(H1_EV_RX_DATA, h1c->conn, h1s, htx, (size_t[]){ret}); + return ret; + + err: + htx_to_buf(htx, buf); + se_fl_set(h1s->sd, SE_FL_EOI); + if (h1c->state < H1_CS_RUNNING) { + h1c->flags |= H1C_F_EOS; + se_fl_set(h1s->sd, SE_FL_EOS); + } + TRACE_DEVEL("leaving on error", H1_EV_RX_DATA|H1_EV_STRM_ERR, h1c->conn, h1s); + return 0; +} + +/* Try to send the request line from the HTX message <htx> for the stream + * <h1s>. It returns the number of bytes consumed or zero if nothing was done or + * if an error occurred. No more than <count> bytes can be sent. 
+ */ +static size_t h1_make_reqline(struct h1s *h1s, struct h1m *h1m, struct htx *htx, size_t count) +{ + struct h1c *h1c = h1s->h1c; + struct htx_blk *blk; + struct htx_sl *sl; + enum htx_blk_type type; + uint32_t sz; + size_t ret = 0; + + TRACE_ENTER(H1_EV_TX_DATA|H1_EV_TX_HDRS, h1c->conn, h1s, htx, (size_t[]){count}); + + while (1) { + blk = htx_get_head_blk(htx); + if (!blk) + goto end; + type = htx_get_blk_type(blk); + sz = htx_get_blksz(blk); + if (type == HTX_BLK_UNUSED) + continue; + if (type != HTX_BLK_REQ_SL || sz > count) + goto error; + break; + } + + TRACE_USER("sending request headers", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1c->conn, h1s, htx); + + if (b_space_wraps(&h1c->obuf)) + b_slow_realign(&h1c->obuf, trash.area, b_data(&h1c->obuf)); + + sl = htx_get_blk_ptr(htx, blk); + if (!h1_format_htx_reqline(sl, &h1c->obuf)) + goto full; + + h1s->meth = sl->info.req.meth; + h1_parse_req_vsn(h1m, sl); + + h1m->flags |= H1_MF_XFER_LEN; + if (sl->flags & HTX_SL_F_CHNK) + h1m->flags |= H1_MF_CHNK; + else if (sl->flags & HTX_SL_F_CLEN) + h1m->flags |= H1_MF_CLEN; + if (sl->flags & HTX_SL_F_XFER_ENC) + h1m->flags |= H1_MF_XFER_ENC; + + if (sl->flags & HTX_SL_F_BODYLESS && !(h1m->flags & H1_MF_CLEN)) { + h1m->flags = (h1m->flags & ~H1_MF_CHNK) | H1_MF_CLEN; + h1s->flags |= H1S_F_HAVE_CLEN; + } + if ((sl->flags & HTX_SL_F_BODYLESS_RESP) || h1s->meth == HTTP_METH_HEAD) + h1s->flags |= H1S_F_BODYLESS_RESP; + + if (h1s->flags & H1S_F_RX_BLK) { + h1s->flags &= ~H1S_F_RX_BLK; + h1_wake_stream_for_recv(h1s); + TRACE_STATE("Re-enable input processing", H1_EV_TX_DATA|H1_EV_H1S_BLK|H1_EV_STRM_WAKE, h1c->conn, h1s); + } + + h1m->state = H1_MSG_HDR_NAME; + ret += sz; + htx_remove_blk(htx, blk); + + end: + TRACE_LEAVE(H1_EV_TX_DATA|H1_EV_TX_HDRS, h1c->conn, h1s, htx, (size_t[]){ret}); + return ret; + + full: + TRACE_STATE("h1c obuf full", H1_EV_TX_DATA|H1_EV_H1S_BLK, h1c->conn, h1s); + h1c->flags |= H1C_F_OUT_FULL; + goto end; + + error: + htx->flags |= HTX_FL_PROCESSING_ERROR; + h1s->flags |= H1S_F_PROCESSING_ERROR; + se_fl_set(h1s->sd, SE_FL_ERROR); + TRACE_ERROR("processing error on request start-line", + H1_EV_TX_DATA|H1_EV_STRM_ERR|H1_EV_H1S_ERR, h1c->conn, h1s); + goto end; +} + +/* Try to send the status line from the HTX message <htx> for the stream + * <h1s>. It returns the number of bytes consumed or zero if nothing was done or + * if an error occurred. No more than <count> bytes can be sent. 
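+ *
+ * The emitted status line follows the classical HTTP/1 form, e.g.
+ * (illustrative): "HTTP/1.1 200 OK\r\n".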
+ */ +static size_t h1_make_stline(struct h1s *h1s, struct h1m *h1m, struct htx *htx, size_t count) +{ + struct h1c *h1c = h1s->h1c; + struct htx_blk *blk; + struct htx_sl *sl; + enum htx_blk_type type; + uint32_t sz; + size_t ret = 0; + + TRACE_ENTER(H1_EV_TX_DATA|H1_EV_TX_HDRS, h1c->conn, h1s, htx, (size_t[]){count}); + + while (1) { + blk = htx_get_head_blk(htx); + if (!blk) + goto end; + + type = htx_get_blk_type(blk); + sz = htx_get_blksz(blk); + + if (type == HTX_BLK_UNUSED) + continue; + if (type != HTX_BLK_RES_SL || sz > count) + goto error; + break; + } + + TRACE_USER("sending response headers", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1c->conn, h1s, htx); + + if (b_space_wraps(&h1c->obuf)) + b_slow_realign(&h1c->obuf, trash.area, b_data(&h1c->obuf)); + + sl = htx_get_blk_ptr(htx, blk); + if (!h1_format_htx_stline(sl, &h1c->obuf)) + goto full; + + h1s->status = sl->info.res.status; + h1_parse_res_vsn(h1m, sl); + + if (sl->flags & HTX_SL_F_XFER_LEN) { + h1m->flags |= H1_MF_XFER_LEN; + if (sl->flags & HTX_SL_F_CHNK) + h1m->flags |= H1_MF_CHNK; + else if (sl->flags & HTX_SL_F_CLEN) + h1m->flags |= H1_MF_CLEN; + if (sl->flags & HTX_SL_F_XFER_ENC) + h1m->flags |= H1_MF_XFER_ENC; + } + if (h1s->status < 200) + h1s->flags |= H1S_F_HAVE_O_CONN; + else if ((sl->flags & HTX_SL_F_BODYLESS_RESP) || h1s->status == 204 || h1s->status == 304) + h1s->flags |= H1S_F_BODYLESS_RESP; + + h1m->state = H1_MSG_HDR_NAME; + ret += sz; + htx_remove_blk(htx, blk); + + end: + TRACE_LEAVE(H1_EV_TX_DATA|H1_EV_TX_HDRS, h1c->conn, h1s, htx, (size_t[]){ret}); + return ret; + + full: + TRACE_STATE("h1c obuf full", H1_EV_TX_DATA|H1_EV_H1S_BLK, h1c->conn, h1s); + h1c->flags |= H1C_F_OUT_FULL; + goto end; + + error: + htx->flags |= HTX_FL_PROCESSING_ERROR; + h1s->flags |= H1S_F_PROCESSING_ERROR; + se_fl_set(h1s->sd, SE_FL_ERROR); + TRACE_ERROR("processing error on response start-line", + H1_EV_TX_DATA|H1_EV_STRM_ERR|H1_EV_H1S_ERR, h1c->conn, h1s); + goto end; +} + +/* Try to send the message headers from the HTX message <htx> for the stream + * <h1s>. It returns the number of bytes consumed or zero if nothing was done or + * if an error occurred. No more than <count> bytes can be sent. 
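+ *
+ * Each HTX header block is emitted as a classical "name: value\r\n" line,
+ * after the filtering performed below (pseudo-headers skipped, "te" reduced
+ * to "trailers", invalid "content-length" values rejected, etc).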
+ */ +static size_t h1_make_headers(struct h1s *h1s, struct h1m *h1m, struct htx *htx, size_t count) +{ + struct h1c *h1c = h1s->h1c; + struct htx_blk *blk; + struct buffer outbuf; + enum htx_blk_type type; + struct ist n, v; + uint32_t sz; + size_t ret = 0; + + TRACE_ENTER(H1_EV_TX_DATA|H1_EV_TX_HDRS, h1c->conn, h1s, htx, (size_t[]){count}); + + if (b_space_wraps(&h1c->obuf)) + b_slow_realign(&h1c->obuf, trash.area, b_data(&h1c->obuf)); + outbuf = b_make(b_tail(&h1c->obuf), b_contig_space(&h1c->obuf), 0, 0); + + blk = htx_get_head_blk(htx); + while (blk) { + type = htx_get_blk_type(blk); + sz = htx_get_blksz(blk); + + if (type == HTX_BLK_HDR) { + if (sz > count) + goto error; + + n = htx_get_blk_name(htx, blk); + v = htx_get_blk_value(htx, blk); + + /* Skip all pseudo-headers */ + if (*(n.ptr) == ':') + goto nextblk; + + if (isteq(n, ist("transfer-encoding"))) { + if ((h1m->flags & H1_MF_RESP) && (h1s->status < 200 || h1s->status == 204)) + goto nextblk; + if (!(h1m->flags & H1_MF_CHNK)) + goto nextblk; + if (h1_parse_xfer_enc_header(h1m, v) < 0) + goto error; + h1s->flags |= H1S_F_HAVE_CHNK; + } + else if (isteq(n, ist("content-length"))) { + if ((h1m->flags & H1_MF_RESP) && (h1s->status < 200 || h1s->status == 204)) + goto nextblk; + if (!(h1m->flags & H1_MF_CLEN)) + goto nextblk; + if (!(h1s->flags & H1S_F_HAVE_CLEN)) + h1m->flags &= ~H1_MF_CLEN; + /* Only skip C-L header with invalid value. */ + if (h1_parse_cont_len_header(h1m, &v) < 0) + goto error; + if (h1s->flags & H1S_F_HAVE_CLEN) + goto nextblk; + h1s->flags |= H1S_F_HAVE_CLEN; + } + else if (isteq(n, ist("connection"))) { + h1_parse_connection_header(h1m, &v); + if (!v.len) + goto nextblk; + } + else if (isteq(n, ist("upgrade"))) { + h1_parse_upgrade_header(h1m, v); + } + else if ((isteq(n, ist("sec-websocket-accept")) && h1m->flags & H1_MF_RESP) || + (isteq(n, ist("sec-websocket-key")) && !(h1m->flags & H1_MF_RESP))) { + h1s->flags |= H1S_F_HAVE_WS_KEY; + } + else if (isteq(n, ist("te"))) { + /* "te" may only be sent with "trailers" if this value + * is present, otherwise it must be deleted. + */ + v = istist(v, ist("trailers")); + if (!isttest(v) || (v.len > 8 && v.ptr[8] != ',')) + goto nextblk; + v = ist("trailers"); + } + + /* Skip header if same name is used to add the server name */ + if (!(h1m->flags & H1_MF_RESP) && isttest(h1c->px->server_id_hdr_name) && + isteqi(n, h1c->px->server_id_hdr_name)) + goto nextblk; + + /* Try to adjust the case of the header name */ + if (h1c->px->options2 & (PR_O2_H1_ADJ_BUGCLI|PR_O2_H1_ADJ_BUGSRV)) + h1_adjust_case_outgoing_hdr(h1s, h1m, &n); + if (!h1_format_htx_hdr(n, v, &outbuf)) + goto full; + } + else if (type == HTX_BLK_EOH) { + h1m->state = H1_MSG_LAST_LF; + break; /* Do not consume this block */ + } + else if (type == HTX_BLK_UNUSED) + goto nextblk; + else + goto error; + + nextblk: + ret += sz; + count -= sz; + blk = htx_remove_blk(htx, blk); + } + + copy: + b_add(&h1c->obuf, outbuf.data); + + end: + TRACE_LEAVE(H1_EV_TX_DATA|H1_EV_TX_HDRS, h1c->conn, h1s, htx, (size_t[]){ret}); + return ret; + + full: + TRACE_STATE("h1c obuf full", H1_EV_TX_DATA|H1_EV_H1S_BLK, h1c->conn, h1s); + h1c->flags |= H1C_F_OUT_FULL; + goto copy; + + error: + ret = 0; + htx->flags |= HTX_FL_PROCESSING_ERROR; + h1s->flags |= H1S_F_PROCESSING_ERROR; + se_fl_set(h1s->sd, SE_FL_ERROR); + TRACE_ERROR("processing error on message headers", + H1_EV_TX_DATA|H1_EV_STRM_ERR|H1_EV_H1S_ERR, h1c->conn, h1s); + goto end; +} + +/* Handle the EOH and perform last processing before sending the data. 
It + * returns the number of bytes consumed or zero if nothing was done or if an + * error occurred. No more than <count> bytes can be sent. + */ +static size_t h1_make_eoh(struct h1s *h1s, struct h1m *h1m, struct htx *htx, size_t count) +{ + struct h1c *h1c = h1s->h1c; + struct htx_blk *blk; + struct buffer outbuf; + enum htx_blk_type type; + struct ist n, v; + uint32_t sz; + size_t ret = 0; + + TRACE_ENTER(H1_EV_TX_DATA|H1_EV_TX_HDRS, h1c->conn, h1s, htx, (size_t[]){count}); + + while (1) { + blk = htx_get_head_blk(htx); + if (!blk) + goto end; + + type = htx_get_blk_type(blk); + sz = htx_get_blksz(blk); + + if (type == HTX_BLK_UNUSED) + continue; + if (type != HTX_BLK_EOH || sz > count) + goto error; + break; + } + + if (b_space_wraps(&h1c->obuf)) + b_slow_realign(&h1c->obuf, trash.area, b_data(&h1c->obuf)); + outbuf = b_make(b_tail(&h1c->obuf), b_contig_space(&h1c->obuf), 0, 0); + + /* Deal with "Connection" header */ + if (!(h1s->flags & H1S_F_HAVE_O_CONN)) { + if ((htx->flags & HTX_FL_PROXY_RESP) && h1s->req.state != H1_MSG_DONE) { + /* If the reply comes from haproxy while the request is + * not finished, we force the connection close. */ + h1s->flags = (h1s->flags & ~H1S_F_WANT_MSK) | H1S_F_WANT_CLO; + TRACE_STATE("force close mode (resp)", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1s->h1c->conn, h1s); + } + else if ((h1m->flags & (H1_MF_XFER_ENC|H1_MF_CLEN)) == (H1_MF_XFER_ENC|H1_MF_CLEN)) { + /* T-E + C-L: force close */ + h1s->flags = (h1s->flags & ~H1S_F_WANT_MSK) | H1S_F_WANT_CLO; + h1m->flags &= ~H1_MF_CLEN; + TRACE_STATE("force close mode (T-E + C-L)", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1s->h1c->conn, h1s); + } + else if ((h1m->flags & (H1_MF_VER_11|H1_MF_XFER_ENC)) == H1_MF_XFER_ENC) { + /* T-E + HTTP/1.0: force close */ + h1s->flags = (h1s->flags & ~H1S_F_WANT_MSK) | H1S_F_WANT_CLO; + TRACE_STATE("force close mode (T-E + HTTP/1.0)", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1s->h1c->conn, h1s); + } + + /* the conn_mode must be processed. 
So do it */
+		n = ist("connection");
+		v = ist("");
+		h1_process_output_conn_mode(h1s, h1m, &v);
+		if (v.len) {
+			/* Try to adjust the case of the header name */
+			if (h1c->px->options2 & (PR_O2_H1_ADJ_BUGCLI|PR_O2_H1_ADJ_BUGSRV))
+				h1_adjust_case_outgoing_hdr(h1s, h1m, &n);
+			if (!h1_format_htx_hdr(n, v, &outbuf))
+				goto full;
+		}
+		h1s->flags |= H1S_F_HAVE_O_CONN;
+	}
+
+	/* Deal with "Transfer-Encoding" header */
+	if ((h1s->meth != HTTP_METH_CONNECT &&
+	     (h1m->flags & (H1_MF_VER_11|H1_MF_RESP|H1_MF_CLEN|H1_MF_CHNK|H1_MF_XFER_LEN)) ==
+	     (H1_MF_VER_11|H1_MF_XFER_LEN)) ||
+	    (h1s->status >= 200 && !(h1s->flags & H1S_F_BODYLESS_RESP) &&
+	     !(h1s->meth == HTTP_METH_CONNECT && h1s->status >= 200 && h1s->status < 300) &&
+	     (h1m->flags & (H1_MF_VER_11|H1_MF_RESP|H1_MF_CLEN|H1_MF_CHNK|H1_MF_XFER_LEN)) ==
+	     (H1_MF_VER_11|H1_MF_RESP|H1_MF_XFER_LEN)))
+		h1m->flags |= H1_MF_CHNK;
+
+	if ((h1m->flags & H1_MF_CHNK) && !(h1s->flags & H1S_F_HAVE_CHNK)) {
+		/* chunking needed but header not seen */
+		n = ist("transfer-encoding");
+		v = ist("chunked");
+		if (h1c->px->options2 & (PR_O2_H1_ADJ_BUGCLI|PR_O2_H1_ADJ_BUGSRV))
+			h1_adjust_case_outgoing_hdr(h1s, h1m, &n);
+		if (!h1_format_htx_hdr(n, v, &outbuf))
+			goto full;
+		TRACE_STATE("add \"Transfer-Encoding: chunked\"", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1c->conn, h1s);
+		h1s->flags |= H1S_F_HAVE_CHNK;
+	}
+
+	/* Deal with the "Content-Length" header */
+	if ((h1m->flags & H1_MF_CLEN) && !(h1s->flags & H1S_F_HAVE_CLEN)) {
+		char *end;
+
+		h1m->curr_len = h1m->body_len = htx->data + htx->extra - sz;
+		end = DISGUISE(ulltoa(h1m->body_len, trash.area, b_size(&trash)));
+
+		n = ist("content-length");
+		v = ist2(trash.area, end-trash.area);
+		if (h1c->px->options2 & (PR_O2_H1_ADJ_BUGCLI|PR_O2_H1_ADJ_BUGSRV))
+			h1_adjust_case_outgoing_hdr(h1s, h1m, &n);
+		if (!h1_format_htx_hdr(n, v, &outbuf))
+			goto full;
+		TRACE_STATE("add \"Content-Length: <LEN>\"", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1c->conn, h1s);
+		h1s->flags |= H1S_F_HAVE_CLEN;
+	}
+
+	/* Add the server name to a header (if requested) */
+	if (!(h1s->flags & H1S_F_HAVE_SRV_NAME) &&
+	    !(h1m->flags & H1_MF_RESP) && isttest(h1c->px->server_id_hdr_name)) {
+		struct server *srv = objt_server(h1c->conn->target);
+
+		if (srv) {
+			n = h1c->px->server_id_hdr_name;
+			v = ist(srv->id);
+
+			/* Try to adjust the case of the header name */
+			if (h1c->px->options2 & (PR_O2_H1_ADJ_BUGCLI|PR_O2_H1_ADJ_BUGSRV))
+				h1_adjust_case_outgoing_hdr(h1s, h1m, &n);
+			if (!h1_format_htx_hdr(n, v, &outbuf))
+				goto full;
+		}
+		TRACE_STATE("add server name header", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1c->conn, h1s);
+		h1s->flags |= H1S_F_HAVE_SRV_NAME;
+	}
+
+	/* Add websocket handshake key if needed */
+	if (!(h1s->flags & H1S_F_HAVE_WS_KEY) &&
+	    (h1m->flags & (H1_MF_CONN_UPG|H1_MF_UPG_WEBSOCKET)) == (H1_MF_CONN_UPG|H1_MF_UPG_WEBSOCKET)) {
+		if (!(h1m->flags & H1_MF_RESP)) {
+			/* generate a random websocket key
+			 * stored in the session to
+			 * verify it on the response side
+			 */
+			h1_generate_random_ws_input_key(h1s->ws_key);
+
+			if (!h1_format_htx_hdr(ist("Sec-Websocket-Key"),
+					       ist(h1s->ws_key),
+					       &outbuf)) {
+				goto full;
+			}
+		}
+		else {
+			/* add the response header key */
+			char key[29];
+
+			h1_calculate_ws_output_key(h1s->ws_key, key);
+			if (!h1_format_htx_hdr(ist("Sec-Websocket-Accept"),
+					       ist(key),
+					       &outbuf)) {
+				goto full;
+			}
+		}
+		h1s->flags |= H1S_F_HAVE_WS_KEY;
+	}
+
+	/*
+	 * All headers were sent, now process the EOH
+	 */
+	if (!(h1m->flags & H1_MF_RESP) && h1s->meth == HTTP_METH_CONNECT) {
+		if (!chunk_memcat(&outbuf, "\r\n", 2))
+			goto full;
+		/*
a CONNECT request was sent. Output processing is now blocked + * waiting the server response. + */ + h1m->state = H1_MSG_DONE; + h1s->flags |= H1S_F_TX_BLK; + TRACE_STATE("CONNECT request waiting for tunnel mode", H1_EV_TX_DATA|H1_EV_H1S_BLK, h1c->conn, h1s); + } + else if ((h1m->flags & H1_MF_RESP) && + ((h1s->meth == HTTP_METH_CONNECT && h1s->status >= 200 && h1s->status < 300) || h1s->status == 101)) { + if (!chunk_memcat(&outbuf, "\r\n", 2)) + goto full; + /* a successful reply to a CONNECT or a protocol switching is sent + * to the client. Switch the response to tunnel mode. + */ + h1_set_tunnel_mode(h1s); + } + else if ((h1m->flags & H1_MF_RESP) && + h1s->status < 200 && (h1s->status == 100 || h1s->status >= 102)) { + if (!chunk_memcat(&outbuf, "\r\n", 2)) + goto full; + /* 1xx response was sent, reset response processing */ + h1m_init_res(h1m); + h1m->flags |= (H1_MF_NO_PHDR|H1_MF_CLEAN_CONN_HDR); + h1s->flags &= ~H1S_F_HAVE_O_CONN; + TRACE_STATE("1xx response xferred", H1_EV_TX_DATA|H1_EV_TX_HDRS, h1c->conn, h1s); + } + else if (htx_is_unique_blk(htx, blk) && + ((htx->flags & HTX_FL_EOM) || ((h1m->flags & H1_MF_CLEN) && !h1m->curr_len))) { + /* EOM flag is set and it is the last block or there is no + * payload. If cannot be removed now. We must emit the end of + * the message first to be sure the output buffer is not full + */ + if ((h1m->flags & H1_MF_CHNK) && !(h1s->flags & H1S_F_BODYLESS_RESP)) { + if (!chunk_memcat(&outbuf, "\r\n0\r\n\r\n", 7)) + goto full; + } + else if (!chunk_memcat(&outbuf, "\r\n", 2)) + goto full; + h1m->state = ((htx->flags & HTX_FL_EOM) ? H1_MSG_DONE : H1_MSG_TRAILERS); + } + else { + if (!chunk_memcat(&outbuf, "\r\n", 2)) + goto full; + h1m->state = ((h1m->flags & H1_MF_CHNK) ? H1_MSG_CHUNK_SIZE: H1_MSG_DATA); + } + + TRACE_PROTO((!(h1m->flags & H1_MF_RESP) ? "H1 request headers xferred" : "H1 response headers xferred"), + H1_EV_TX_DATA|H1_EV_TX_HDRS, h1c->conn, h1s); + ret += sz; + htx_remove_blk(htx, blk); + + copy: + b_add(&h1c->obuf, outbuf.data); + end: + TRACE_LEAVE(H1_EV_TX_DATA|H1_EV_TX_HDRS, h1c->conn, h1s, htx, (size_t[]){ret}); + return ret; + + full: + TRACE_STATE("h1c obuf full", H1_EV_TX_DATA|H1_EV_H1S_BLK, h1c->conn, h1s); + h1c->flags |= H1C_F_OUT_FULL; + goto copy; + + error: + htx->flags |= HTX_FL_PROCESSING_ERROR; + h1s->flags |= H1S_F_PROCESSING_ERROR; + se_fl_set(h1s->sd, SE_FL_ERROR); + TRACE_ERROR("processing error on message EOH", + H1_EV_TX_DATA|H1_EV_STRM_ERR|H1_EV_H1S_ERR, h1c->conn, h1s); + goto end; +} + +/* Try to send the message payload from the HTX message <htx> for the stream + * <h1s>. In this case, we are not in TUNNEL mode. It returns the number of + * bytes consumed or zero if nothing was done or if an error occurred. No more + * than <count> bytes can be sent. + */ +static size_t h1_make_data(struct h1s *h1s, struct h1m *h1m, struct buffer *buf, size_t count) +{ + struct h1c *h1c = h1s->h1c; + struct htx *htx = htx_from_buf(buf); + struct htx_blk *blk; + struct buffer outbuf; + enum htx_blk_type type; + struct ist v; + uint32_t sz; + size_t ret = 0; + + TRACE_ENTER(H1_EV_TX_DATA|H1_EV_TX_BODY, h1c->conn, h1s, htx, (size_t[]){count}); + blk = htx_get_head_blk(htx); + + /* Perform some optimizations to reduce the number of buffer copies. 
If + * the mux's buffer is empty and the htx area contains exactly one data + * block of the same size as the requested count, then it's possible to + * simply swap the caller's buffer with the mux's output buffer and + * adjust offsets and length to match the entire DATA HTX block in the + * middle. In this case we perform a true zero-copy operation from + * end-to-end. This is the situation that happens all the time with + * large files. + */ + if ((!(h1m->flags & H1_MF_RESP) || !(h1s->flags & H1S_F_BODYLESS_RESP)) && + !b_data(&h1c->obuf) && + (!(h1m->flags & H1_MF_CHNK) || ((h1m->flags & H1_MF_CHNK) && (!h1m->curr_len || count == h1m->curr_len))) && + htx_nbblks(htx) == 1 && + htx_get_blk_type(blk) == HTX_BLK_DATA && + htx_get_blk_value(htx, blk).len == count) { + void *old_area; + uint64_t extra; + int eom = (htx->flags & HTX_FL_EOM); + + extra = htx->extra; + old_area = h1c->obuf.area; + h1c->obuf.area = buf->area; + h1c->obuf.head = sizeof(struct htx) + blk->addr; + h1c->obuf.data = count; + + buf->area = old_area; + buf->data = buf->head = 0; + + htx = (struct htx *)buf->area; + htx_reset(htx); + htx->extra = extra; + + if (h1m->flags & H1_MF_CLEN) { + if (count > h1m->curr_len) { + TRACE_ERROR("more payload than announced", + H1_EV_TX_DATA|H1_EV_STRM_ERR|H1_EV_H1C_ERR|H1_EV_H1S_ERR, h1c->conn, h1s); + goto error; + } + h1m->curr_len -= count; + if (!h1m->curr_len) + h1m->state = (eom ? H1_MSG_DONE : H1_MSG_TRAILERS); + } + else if (h1m->flags & H1_MF_CHNK) { + /* The message is chunked. We need to check if we must + * emit the chunk size, the CRLF marking the end of the + * current chunk and eventually the CRLF marking the end + * of the previous chunk (because of fast-forwarding). + * If it is the end of the message, we must + * also emit the last chunk. + * + * We have at least the size of the struct htx to write + * the chunk envelope. It should be enough. + */ + + /* If is a new chunk, prepend the chunk size */ + if (h1m->state == H1_MSG_CHUNK_CRLF || h1m->state == H1_MSG_CHUNK_SIZE) { + if (h1m->curr_len) { + TRACE_ERROR("chunk bigger than announced", + H1_EV_TX_DATA|H1_EV_STRM_ERR|H1_EV_H1C_ERR|H1_EV_H1S_ERR, h1c->conn, h1s); + goto error; + } + h1m->curr_len = count + (htx->extra != HTX_UNKOWN_PAYLOAD_LENGTH ? htx->extra : 0); + + /* Because chunk meta-data are prepended, the chunk size of the current chunk + * must be handled before the end of the previous chunk. 
+				 */
+				h1_prepend_chunk_size(&h1c->obuf, h1m->curr_len);
+				if (h1m->state == H1_MSG_CHUNK_CRLF)
+					h1_prepend_chunk_crlf(&h1c->obuf);
+
+				h1m->state = H1_MSG_DATA;
+			}
+
+			h1m->curr_len -= count;
+
+			/* It is the end of the chunk, append the CRLF */
+			if (!h1m->curr_len) {
+				h1_append_chunk_crlf(&h1c->obuf);
+				h1m->state = H1_MSG_CHUNK_SIZE;
+			}
+
+			/* It is the end of the message, add the last chunk with the extra CRLF */
+			if (eom) {
+				if (h1m->curr_len) {
+					TRACE_ERROR("chunk smaller than announced",
+						    H1_EV_TX_DATA|H1_EV_STRM_ERR|H1_EV_H1C_ERR|H1_EV_H1S_ERR, h1c->conn, h1s);
+					goto error;
+				}
+				/* Emit the last chunk too at the buffer's end */
+				b_putblk(&h1c->obuf, "0\r\n\r\n", 5);
+				h1m->state = H1_MSG_DONE;
+			}
+		}
+		/* Nothing to do if XFER len is unknown */
+
+		ret = count;
+		TRACE_PROTO("H1 message payload data xferred (zero-copy)", H1_EV_TX_DATA|H1_EV_TX_BODY, h1c->conn, h1s, 0, (size_t[]){ret});
+		goto end;
+	}
+
+	if (b_space_wraps(&h1c->obuf))
+		b_slow_realign(&h1c->obuf, trash.area, b_data(&h1c->obuf));
+	outbuf = b_make(b_tail(&h1c->obuf), b_contig_space(&h1c->obuf), 0, 0);
+
+
+	/* Handle now the case of the CRLF at the end of a chunk. */
+	if ((h1m->flags & H1_MF_CHNK) && h1m->state == H1_MSG_CHUNK_CRLF) {
+		if (h1m->curr_len) {
+			TRACE_ERROR("chunk bigger than announced",
+				    H1_EV_TX_DATA|H1_EV_STRM_ERR|H1_EV_H1C_ERR|H1_EV_H1S_ERR, h1c->conn, h1s);
+			goto error;
+		}
+		if (!chunk_memcat(&outbuf, "\r\n", 2))
+			goto full;
+		h1m->state = H1_MSG_CHUNK_SIZE;
+	}
+
+	while (blk && count) {
+		uint32_t vlen, chklen;
+		int last_data = 0;
+
+		type = htx_get_blk_type(blk);
+		sz = htx_get_blksz(blk);
+		vlen = sz;
+		if (type == HTX_BLK_DATA) {
+			if (vlen > count) {
+				/* Get the maximum amount of data we can transfer */
+				vlen = count;
+			}
+			else if (htx_is_unique_blk(htx, blk) && (htx->flags & HTX_FL_EOM)) {
+				/* It is the last block of this message. After this one,
+				 * only tunneled data may be forwarded. */
+				TRACE_DEVEL("last message block", H1_EV_TX_DATA|H1_EV_TX_BODY, h1c->conn, h1s);
+				last_data = 1;
+			}
+
+			if ((h1m->flags & H1_MF_RESP) && (h1s->flags & H1S_F_BODYLESS_RESP)) {
+				TRACE_PROTO("Skip data for bodyless response", H1_EV_TX_DATA|H1_EV_TX_BODY, h1c->conn, h1s, htx);
+				goto nextblk;
+			}
+
+			chklen = 0;
+			if (h1m->flags & H1_MF_CHNK) {
+				/* If it is a new chunk, prepend the chunk size */
+				if (h1m->state == H1_MSG_CHUNK_SIZE) {
+					h1m->curr_len = (htx->extra && htx->extra != HTX_UNKOWN_PAYLOAD_LENGTH ?
htx->data + htx->extra : vlen); + if (!h1_append_chunk_size(&outbuf, h1m->curr_len)) { + h1m->curr_len = 0; + goto full; + } + h1m->state = H1_MSG_DATA; + } + + if (vlen > h1m->curr_len) { + vlen = h1m->curr_len; + last_data = 0; + } + + chklen = 0; + if (h1m->curr_len == vlen) + chklen += 2; + if (last_data) + chklen += 5; + } + + if (vlen + chklen > b_room(&outbuf)) { + /* too large for the buffer */ + if (chklen >= b_room(&outbuf)) + goto full; + vlen = b_room(&outbuf) - chklen; + last_data = 0; + } + + v = htx_get_blk_value(htx, blk); + v.len = vlen; + if (!h1_format_htx_data(v, &outbuf, 0)) + goto full; + + if (h1m->flags & H1_MF_CLEN) { + if (vlen > h1m->curr_len) { + TRACE_ERROR("more payload than announced", + H1_EV_TX_DATA|H1_EV_STRM_ERR|H1_EV_H1C_ERR|H1_EV_H1S_ERR, h1c->conn, h1s); + goto error; + } + h1m->curr_len -= vlen; + } + else if (h1m->flags & H1_MF_CHNK) { + h1m->curr_len -= vlen; + /* Space already reserved, so it must succeed */ + if (!h1m->curr_len) { + if (!chunk_memcat(&outbuf, "\r\n", 2)) + goto error; + h1m->state = H1_MSG_CHUNK_SIZE; + } + if (last_data) { + if (h1m->curr_len) { + TRACE_ERROR("chunk smaller than announced", + H1_EV_TX_DATA|H1_EV_STRM_ERR|H1_EV_H1C_ERR|H1_EV_H1S_ERR, h1c->conn, h1s); + goto error; + } + if (!chunk_memcat(&outbuf, "0\r\n\r\n", 5)) + goto error; + } + } + + + } + else if (type == HTX_BLK_EOT || type == HTX_BLK_TLR) { + if ((h1m->flags & H1_MF_RESP) && (h1s->flags & H1S_F_BODYLESS_RESP)) { + /* Do nothing the payload must be skipped + * because it is a bodyless response + */ + } + else if (h1m->flags & H1_MF_CHNK) { + /* Emit last chunk for chunked messages only */ + if (!chunk_memcat(&outbuf, "0\r\n", 3)) + goto full; + } + h1m->state = H1_MSG_TRAILERS; + break; + } + else if (type == HTX_BLK_UNUSED) + goto nextblk; + else + goto error; + + nextblk: + ret += vlen; + count -= vlen; + if (sz == vlen) + blk = htx_remove_blk(htx, blk); + else { + htx_cut_data_blk(htx, blk, vlen); + if (!b_room(&outbuf)) + goto full; + } + + if (last_data) + h1m->state = H1_MSG_DONE; + } + + copy: + TRACE_PROTO("H1 message payload data xferred", H1_EV_TX_DATA|H1_EV_TX_BODY, h1c->conn, h1s, 0, (size_t[]){ret}); + b_add(&h1c->obuf, outbuf.data); + end: + TRACE_LEAVE(H1_EV_TX_DATA|H1_EV_TX_BODY, h1c->conn, h1s, htx, (size_t[]){ret}); + return ret; + full: + TRACE_STATE("h1c obuf full", H1_EV_TX_DATA|H1_EV_H1S_BLK, h1c->conn, h1s); + h1c->flags |= H1C_F_OUT_FULL; + goto copy; + error: + ret = 0; + htx->flags |= HTX_FL_PROCESSING_ERROR; + h1s->flags |= H1S_F_PROCESSING_ERROR; + se_fl_set(h1s->sd, SE_FL_ERROR); + TRACE_ERROR("processing error on message payload", + H1_EV_TX_DATA|H1_EV_STRM_ERR|H1_EV_H1C_ERR|H1_EV_H1S_ERR, h1c->conn, h1s); + goto end; +} + +/* Try to send the tunneled data from the HTX message <htx> for the stream + * <h1s>. In this case, we are in TUNNEL mode. It returns the number of bytes + * consumed or zero if nothing was done or if an error occurred. No more than + * <count> bytes can be sent. + */ +static size_t h1_make_tunnel(struct h1s *h1s, struct h1m *h1m, struct buffer *buf, size_t count) +{ + struct h1c *h1c = h1s->h1c; + struct htx *htx = htx_from_buf(buf); + struct htx_blk *blk; + struct buffer outbuf; + enum htx_blk_type type; + struct ist v; + uint32_t sz; + size_t ret = 0; + + TRACE_ENTER(H1_EV_TX_DATA|H1_EV_TX_BODY, h1c->conn, h1s, htx, (size_t[]){count}); + + blk = htx_get_head_blk(htx); + + /* Perform some optimizations to reduce the number of buffer copies. 
If + * the mux's buffer is empty and the htx area contains exactly one data + * block of the same size as the requested count, then it's possible to + * simply swap the caller's buffer with the mux's output buffer and + * adjust offsets and length to match the entire DATA HTX block in the + * middle. In this case we perform a true zero-copy operation from + * end-to-end. This is the situation that happens all the time with + * large files. + */ + if (!b_data(&h1c->obuf) && + htx_nbblks(htx) == 1 && + htx_get_blk_type(blk) == HTX_BLK_DATA && + htx_get_blksz(blk) == count) { + void *old_area; + + old_area = h1c->obuf.area; + h1c->obuf.area = buf->area; + h1c->obuf.head = sizeof(struct htx) + blk->addr; + h1c->obuf.data = count; + + buf->area = old_area; + buf->data = buf->head = 0; + + htx = (struct htx *)buf->area; + htx_reset(htx); + + ret = count; + TRACE_PROTO("H1 tunneled data xferred (zero-copy)", H1_EV_TX_DATA|H1_EV_TX_BODY, h1c->conn, h1s, 0, (size_t[]){ret}); + goto end; + } + + if (b_space_wraps(&h1c->obuf)) + b_slow_realign(&h1c->obuf, trash.area, b_data(&h1c->obuf)); + outbuf = b_make(b_tail(&h1c->obuf), b_contig_space(&h1c->obuf), 0, 0); + + while (blk) { + uint32_t vlen; + + type = htx_get_blk_type(blk); + sz = htx_get_blksz(blk); + vlen = sz; + + if (type == HTX_BLK_DATA) { + if (vlen > count) { + /* Get the maximum amount of data we can xferred */ + vlen = count; + } + + if (vlen > b_room(&outbuf)) { + /* too large for the buffer */ + vlen = b_room(&outbuf); + } + + v = htx_get_blk_value(htx, blk); + v.len = vlen; + if (!h1_format_htx_data(v, &outbuf, 0)) + goto full; + } + else if (type == HTX_BLK_UNUSED) + goto nextblk; + else + goto error; + + nextblk: + ret += vlen; + count -= vlen; + if (sz == vlen) + blk = htx_remove_blk(htx, blk); + else { + htx_cut_data_blk(htx, blk, vlen); + break; + } + } + + copy: + TRACE_PROTO("H1 tunneled data xferred", H1_EV_TX_DATA|H1_EV_TX_BODY, h1c->conn, h1s, 0, (size_t[]){ret}); + b_add(&h1c->obuf, outbuf.data); + + end: + TRACE_LEAVE(H1_EV_TX_DATA|H1_EV_TX_BODY, h1c->conn, h1s, htx, (size_t[]){ret}); + return ret; + + full: + TRACE_STATE("h1c obuf full", H1_EV_TX_DATA|H1_EV_H1S_BLK, h1c->conn, h1s); + h1c->flags |= H1C_F_OUT_FULL; + goto copy; + + error: + ret = 0; + htx->flags |= HTX_FL_PROCESSING_ERROR; + h1s->flags |= H1S_F_PROCESSING_ERROR; + se_fl_set(h1s->sd, SE_FL_ERROR); + TRACE_ERROR("processing error on tunneled", + H1_EV_TX_DATA|H1_EV_STRM_ERR|H1_EV_H1C_ERR|H1_EV_H1S_ERR, h1c->conn, h1s); + goto end; +} + +/* Try to send the trailers from the HTX message <htx> for the stream <h1s>. It + * returns the number of bytes consumed or zero if nothing was done or if an + * error occurred. No more than <count> bytes can be sent. 
+ */
+static size_t h1_make_trailers(struct h1s *h1s, struct h1m *h1m, struct htx *htx, size_t count)
+{
+	struct h1c *h1c = h1s->h1c;
+	struct htx_blk *blk;
+	struct buffer outbuf;
+	enum htx_blk_type type;
+	struct ist n, v;
+	uint32_t sz;
+	size_t ret = 0;
+
+	TRACE_ENTER(H1_EV_TX_DATA|H1_EV_TX_TLRS, h1c->conn, h1s, htx, (size_t[]){count});
+
+	if (b_space_wraps(&h1c->obuf))
+		b_slow_realign(&h1c->obuf, trash.area, b_data(&h1c->obuf));
+	chunk_reset(&outbuf);
+	outbuf = b_make(b_tail(&h1c->obuf), b_contig_space(&h1c->obuf), 0, 0);
+
+	blk = htx_get_head_blk(htx);
+	while (blk) {
+		type = htx_get_blk_type(blk);
+		sz = htx_get_blksz(blk);
+
+		if (type == HTX_BLK_TLR) {
+			if (sz > count)
+				goto error;
+
+			if (!(h1m->flags & H1_MF_CHNK) || ((h1m->flags & H1_MF_RESP) && (h1s->flags & H1S_F_BODYLESS_RESP)))
+				goto nextblk;
+
+			n = htx_get_blk_name(htx, blk);
+			v = htx_get_blk_value(htx, blk);
+
+			/* Try to adjust the case of the header name */
+			if (h1c->px->options2 & (PR_O2_H1_ADJ_BUGCLI|PR_O2_H1_ADJ_BUGSRV))
+				h1_adjust_case_outgoing_hdr(h1s, h1m, &n);
+			if (!h1_format_htx_hdr(n, v, &outbuf))
+				goto full;
+		}
+		else if (type == HTX_BLK_EOT) {
+			if (!(h1m->flags & H1_MF_CHNK) || ((h1m->flags & H1_MF_RESP) && (h1s->flags & H1S_F_BODYLESS_RESP))) {
+				TRACE_PROTO((!(h1m->flags & H1_MF_RESP) ? "H1 request trailers skipped" : "H1 response trailers skipped"),
+					    H1_EV_TX_DATA|H1_EV_TX_TLRS, h1c->conn, h1s);
+			}
+			else {
+				if (!chunk_memcat(&outbuf, "\r\n", 2))
+					goto full;
+				TRACE_PROTO((!(h1m->flags & H1_MF_RESP) ? "H1 request trailers xferred" : "H1 response trailers xferred"),
+					    H1_EV_TX_DATA|H1_EV_TX_TLRS, h1c->conn, h1s);
+			}
+			h1m->state = H1_MSG_DONE;
+		}
+		else if (type == HTX_BLK_UNUSED)
+			goto nextblk;
+		else
+			goto error;
+
+	  nextblk:
+		ret += sz;
+		count -= sz;
+		blk = htx_remove_blk(htx, blk);
+	}
+
+  copy:
+	b_add(&h1c->obuf, outbuf.data);
+
+  end:
+	TRACE_LEAVE(H1_EV_TX_DATA|H1_EV_TX_TLRS, h1c->conn, h1s, htx, (size_t[]){ret});
+	return ret;
+
+  full:
+	TRACE_STATE("h1c obuf full", H1_EV_TX_DATA|H1_EV_H1S_BLK, h1c->conn, h1s);
+	h1c->flags |= H1C_F_OUT_FULL;
+	goto copy;
+
+  error:
+	ret = 0;
+	htx->flags |= HTX_FL_PROCESSING_ERROR;
+	h1s->flags |= H1S_F_PROCESSING_ERROR;
+	se_fl_set(h1s->sd, SE_FL_ERROR);
+	TRACE_ERROR("processing error on message trailers",
+		    H1_EV_TX_DATA|H1_EV_STRM_ERR|H1_EV_H1C_ERR|H1_EV_H1S_ERR, h1c->conn, h1s);
+	goto end;
+}
+
+/* Try to send the header for a chunk of <len> bytes. It returns the number of
+ * bytes consumed or zero if nothing was done or if an error occurred.
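+ *
+ * For example, assuming the output buffer is available, a call for a
+ * 256-byte chunk emits "100\r\n" (preceded by the CRLF terminating the
+ * previous chunk when the parser was left in the H1_MSG_CHUNK_CRLF state).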
+ */ +static size_t h1_make_chunk(struct h1s *h1s, struct h1m *h1m, size_t len) +{ + struct h1c *h1c = h1s->h1c; + struct buffer outbuf; + size_t ret = 0; + + TRACE_ENTER(H1_EV_TX_DATA|H1_EV_TX_BODY, h1c->conn, h1s); + + if (!h1_get_buf(h1c, &h1c->obuf)) { + h1c->flags |= H1C_F_OUT_ALLOC; + TRACE_STATE("waiting for h1c obuf allocation", H1_EV_TX_DATA|H1_EV_H1S_BLK, h1c->conn, h1s); + goto end; + } + + if (b_space_wraps(&h1c->obuf)) + b_slow_realign(&h1c->obuf, trash.area, b_data(&h1c->obuf)); + outbuf = b_make(b_tail(&h1c->obuf), b_contig_space(&h1c->obuf), 0, 0); + + if (h1m->state == H1_MSG_CHUNK_CRLF) { + if (!chunk_memcat(&outbuf, "\r\n", 2)) + goto full; + h1m->state = H1_MSG_CHUNK_SIZE; + } + if (!h1_append_chunk_size(&outbuf, len)) + goto full; + + h1m->state = H1_MSG_DATA; + + TRACE_PROTO("H1 chunk info xferred", H1_EV_TX_DATA|H1_EV_TX_BODY, h1c->conn, h1s, 0, (size_t[]){ret}); + b_add(&h1c->obuf, outbuf.data); + ret = outbuf.data; + +end: + TRACE_LEAVE(H1_EV_TX_DATA|H1_EV_TX_BODY, h1c->conn, h1s, NULL, (size_t[]){ret}); + return ret; +full: + TRACE_STATE("h1c obuf full", H1_EV_TX_DATA|H1_EV_H1S_BLK, h1c->conn, h1s); + h1c->flags |= H1C_F_OUT_FULL; + goto end; +} + +/* + * Process outgoing data. It parses data and transfers them from the channel buffer into + * h1c->obuf. It returns the number of bytes parsed and transferred if > 0, or + * 0 if it couldn't proceed. + */ +static size_t h1_process_mux(struct h1c *h1c, struct buffer *buf, size_t count) +{ + struct h1s *h1s = h1c->h1s; + struct h1m *h1m; + struct htx *htx; + size_t ret, total = 0; + + htx = htxbuf(buf); + TRACE_ENTER(H1_EV_TX_DATA, h1c->conn, h1s, htx, (size_t[]){count}); + + if (htx_is_empty(htx)) + goto end; + + if (h1s->flags & (H1S_F_INTERNAL_ERROR|H1S_F_PROCESSING_ERROR|H1S_F_TX_BLK)) + goto end; + + if (!h1_get_buf(h1c, &h1c->obuf)) { + h1c->flags |= H1C_F_OUT_ALLOC; + TRACE_STATE("waiting for h1c obuf allocation", H1_EV_TX_DATA|H1_EV_H1S_BLK, h1c->conn, h1s); + goto end; + } + h1m = (!(h1c->flags & H1C_F_IS_BACK) ? 
&h1s->res : &h1s->req); + + while (!(h1c->flags & H1C_F_OUT_FULL) && + !(h1s->flags & (H1S_F_PROCESSING_ERROR|H1S_F_TX_BLK)) && + !htx_is_empty(htx) && count) { + switch (h1m->state) { + case H1_MSG_RQBEFORE: + ret = h1_make_reqline(h1s, h1m, htx, count); + break; + + case H1_MSG_RPBEFORE: + ret = h1_make_stline(h1s, h1m, htx, count); + break; + + case H1_MSG_HDR_NAME: + ret = h1_make_headers(h1s, h1m, htx, count); + if (unlikely(h1m->state == H1_MSG_LAST_LF)) // in case of no header + ret += h1_make_eoh(h1s, h1m, htx, count); + break; + + case H1_MSG_LAST_LF: + ret = h1_make_eoh(h1s, h1m, htx, count); + break; + + case H1_MSG_CHUNK_SIZE: + case H1_MSG_CHUNK_CRLF: + case H1_MSG_DATA: + ret = h1_make_data(h1s, h1m, buf, count); + if (ret > 0) + htx = htx_from_buf(buf); + if (unlikely(h1m->state == H1_MSG_TRAILERS)) // in case of no data + ret += h1_make_trailers(h1s, h1m, htx, count); + break; + + case H1_MSG_TUNNEL: + ret = h1_make_tunnel(h1s, h1m, buf, count); + if (ret > 0) + htx = htx_from_buf(buf); + break; + + case H1_MSG_TRAILERS: + ret = h1_make_trailers(h1s, h1m, htx, count); + break; + + case H1_MSG_DONE: + TRACE_STATE("unexpected data xferred in done state", H1_EV_TX_DATA|H1_EV_H1C_ERR|H1_EV_H1S_ERR, h1c->conn, h1s); + __fallthrough; + + default: + ret = 0; + htx->flags |= HTX_FL_PROCESSING_ERROR; + h1s->flags |= H1S_F_PROCESSING_ERROR; + se_fl_set(h1s->sd, SE_FL_ERROR); + TRACE_ERROR("processing error", H1_EV_TX_DATA|H1_EV_STRM_ERR|H1_EV_H1C_ERR|H1_EV_H1S_ERR, h1c->conn, h1s); + break; + } + + if (!ret) + break; + total += ret; + count -= ret; + + if (h1m->state == H1_MSG_DONE) { + TRACE_USER((!(h1m->flags & H1_MF_RESP) ? "H1 request fully xferred" : "H1 response fully xferred"), + H1_EV_TX_DATA, h1c->conn, h1s); + + if (h1s->flags & H1S_F_RX_BLK) { + h1s->flags &= ~H1S_F_RX_BLK; + h1_wake_stream_for_recv(h1s); + TRACE_STATE("Re-enable input processing", H1_EV_TX_DATA|H1_EV_H1S_BLK|H1_EV_STRM_WAKE, h1c->conn, h1s); + } + } + } + + htx_to_buf(htx, buf); + if (!buf_room_for_htx_data(&h1c->obuf)) { + TRACE_STATE("h1c obuf full", H1_EV_TX_DATA|H1_EV_H1S_BLK, h1c->conn, h1s); + h1c->flags |= H1C_F_OUT_FULL; + } + + end: + + /* Both the request and the response reached the DONE state. So set EOI + * flag on the conn-stream. Most of time, the flag will already be set, + * except for protocol upgrades. Report an error if data remains blocked + * in the output buffer. 
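+ * Leftover HTX blocks at this point can no longer be sent, so they
+ * indicate a desynchronized transfer, hence the processing error raised
+ * below.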
+ */ + if (h1s->req.state == H1_MSG_DONE && h1s->res.state == H1_MSG_DONE) { + se_fl_set(h1s->sd, SE_FL_EOI); + if (!htx_is_empty(htx)) { + htx->flags |= HTX_FL_PROCESSING_ERROR; + h1s->flags |= H1S_F_PROCESSING_ERROR; + se_fl_set(h1s->sd, SE_FL_ERROR); + TRACE_ERROR("txn done but data waiting to be sent, set error on h1c", H1_EV_H1C_ERR, h1c->conn, h1s); + } + } + + TRACE_LEAVE(H1_EV_TX_DATA, h1c->conn, h1s, htx, (size_t[]){total}); + return total; +} + +/*********************************************************/ +/* functions below are I/O callbacks from the connection */ +/*********************************************************/ +static void h1_wake_stream_for_recv(struct h1s *h1s) +{ + if (h1s && h1s->subs && h1s->subs->events & SUB_RETRY_RECV) { + TRACE_POINT(H1_EV_STRM_WAKE, h1s->h1c->conn, h1s); + tasklet_wakeup(h1s->subs->tasklet); + h1s->subs->events &= ~SUB_RETRY_RECV; + if (!h1s->subs->events) + h1s->subs = NULL; + } +} +static void h1_wake_stream_for_send(struct h1s *h1s) +{ + if (h1s && h1s->subs && h1s->subs->events & SUB_RETRY_SEND) { + TRACE_POINT(H1_EV_STRM_WAKE, h1s->h1c->conn, h1s); + tasklet_wakeup(h1s->subs->tasklet); + h1s->subs->events &= ~SUB_RETRY_SEND; + if (!h1s->subs->events) + h1s->subs = NULL; + } +} + +/* alerts the data layer following this sequence : + * - if the h1s' data layer is subscribed to recv, then it's woken up for recv + * - if its subscribed to send, then it's woken up for send + * - if it was subscribed to neither, its ->wake() callback is called + */ +static void h1_alert(struct h1s *h1s) +{ + if (h1s->subs) { + h1_wake_stream_for_recv(h1s); + h1_wake_stream_for_send(h1s); + } + else if (h1s_sc(h1s) && h1s_sc(h1s)->app_ops->wake != NULL) { + TRACE_POINT(H1_EV_STRM_WAKE, h1s->h1c->conn, h1s); + h1s_sc(h1s)->app_ops->wake(h1s_sc(h1s)); + } +} + +/* Try to send an HTTP error with h1c->errcode status code. It returns 1 on success + * and 0 on error. The flag H1C_F_ABRT_PENDING is set on the H1 connection for + * retryable errors (allocation error or buffer full). On success, the error is + * copied in the output buffer. +*/ +static int h1_send_error(struct h1c *h1c) +{ + int rc = http_get_status_idx(h1c->errcode); + int ret = 0; + + TRACE_ENTER(H1_EV_H1C_ERR, h1c->conn, 0, 0, (size_t[]){h1c->errcode}); + + /* Verify if the error is mapped on /dev/null or any empty file */ + /// XXX: do a function ! 
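+ /* A possible shape for such a helper (purely illustrative, the name is
+ * made up), mirroring the open-coded test below:
+ *
+ * static inline int h1_errmsg_is_empty(const struct proxy *px, int rc)
+ * {
+ * return px->replies[rc] &&
+ * px->replies[rc]->type == HTTP_REPLY_ERRMSG &&
+ * px->replies[rc]->body.errmsg &&
+ * b_is_null(px->replies[rc]->body.errmsg);
+ * }
+ */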
+ if (h1c->px->replies[rc] && + h1c->px->replies[rc]->type == HTTP_REPLY_ERRMSG && + h1c->px->replies[rc]->body.errmsg && + b_is_null(h1c->px->replies[rc]->body.errmsg)) { + /* Empty error, so claim a success */ + ret = 1; + goto out; + } + + if (h1c->flags & (H1C_F_OUT_ALLOC|H1C_F_OUT_FULL)) { + h1c->flags |= H1C_F_ABRT_PENDING; + goto out; + } + + if (!h1_get_buf(h1c, &h1c->obuf)) { + h1c->flags |= (H1C_F_OUT_ALLOC|H1C_F_ABRT_PENDING); + TRACE_STATE("waiting for h1c obuf allocation", H1_EV_H1C_ERR|H1_EV_H1C_BLK, h1c->conn); + goto out; + } + ret = b_istput(&h1c->obuf, ist(http_err_msgs[rc])); + if (unlikely(ret <= 0)) { + if (!ret) { + h1c->flags |= (H1C_F_OUT_FULL|H1C_F_ABRT_PENDING); + TRACE_STATE("h1c obuf full", H1_EV_H1C_ERR|H1_EV_H1C_BLK, h1c->conn); + goto out; + } + else { + /* we cannot report this error, so claim a success */ + ret = 1; + } + } + + if (h1c->state == H1_CS_EMBRYONIC) { + BUG_ON(h1c->h1s == NULL || h1s_sc(h1c->h1s) != NULL); + TRACE_DEVEL("Abort embryonic H1S", H1_EV_H1C_ERR, h1c->conn, h1c->h1s); + h1s_destroy(h1c->h1s); + } + + h1c->flags = (h1c->flags & ~(H1C_F_WAIT_NEXT_REQ|H1C_F_ABRT_PENDING)) | H1C_F_ABRTED; + h1_close(h1c); + out: + TRACE_LEAVE(H1_EV_H1C_ERR, h1c->conn); + return ret; +} + +/* Try to send a 500 internal error. It relies on h1_send_error to send the + * error. This function takes care of incrementing stats and tracked counters. + */ +static int h1_handle_internal_err(struct h1c *h1c) +{ + struct session *sess = h1c->conn->owner; + int ret = 0; + + session_inc_http_req_ctr(sess); + proxy_inc_fe_req_ctr(sess->listener, sess->fe, 1); + _HA_ATOMIC_INC(&sess->fe->fe_counters.p.http.rsp[5]); + _HA_ATOMIC_INC(&sess->fe->fe_counters.internal_errors); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->internal_errors); + + h1c->errcode = 500; + ret = h1_send_error(h1c); + sess_log(sess); + return ret; +} + +/* Try to send an error because of a parsing error. By default a 400 bad request + * error is returned. But the status code may be specified by setting + * h1c->errcode. It relies on h1_send_error to send the error. This function + * takes care of incrementing stats and tracked counters. + */ +static int h1_handle_parsing_error(struct h1c *h1c) +{ + struct session *sess = h1c->conn->owner; + int ret = 0; + + if (!b_data(&h1c->ibuf) && ((h1c->flags & H1C_F_WAIT_NEXT_REQ) || (sess->fe->options & PR_O_IGNORE_PRB))) { + h1c->flags = (h1c->flags & ~H1C_F_WAIT_NEXT_REQ) | H1C_F_ABRTED; + h1_close(h1c); + goto end; + } + + session_inc_http_req_ctr(sess); + session_inc_http_err_ctr(sess); + proxy_inc_fe_req_ctr(sess->listener, sess->fe, 1); + _HA_ATOMIC_INC(&sess->fe->fe_counters.p.http.rsp[4]); + _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_req); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->failed_req); + + if (!h1c->errcode) + h1c->errcode = 400; + ret = h1_send_error(h1c); + if (b_data(&h1c->ibuf) || !(sess->fe->options & PR_O_NULLNOLOG)) + sess_log(sess); + + end: + return ret; +} + +/* Try to send a 501 not implemented error. It relies on h1_send_error to send + * the error. This function takes care of incrementing stats and tracked + * counters. 
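+ * The 501 typically follows a parsing stage that flagged the request as
+ * relying on a feature the mux does not implement (H1S_F_NOT_IMPL_ERROR).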
+ */ +static int h1_handle_not_impl_err(struct h1c *h1c) +{ + struct session *sess = h1c->conn->owner; + int ret = 0; + + if (!b_data(&h1c->ibuf) && ((h1c->flags & H1C_F_WAIT_NEXT_REQ) || (sess->fe->options & PR_O_IGNORE_PRB))) { + h1c->flags = (h1c->flags & ~H1C_F_WAIT_NEXT_REQ) | H1C_F_ABRTED; + h1_close(h1c); + goto end; + } + + session_inc_http_req_ctr(sess); + proxy_inc_fe_req_ctr(sess->listener, sess->fe, 1); + _HA_ATOMIC_INC(&sess->fe->fe_counters.p.http.rsp[4]); + _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_req); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->failed_req); + + h1c->errcode = 501; + ret = h1_send_error(h1c); + if (b_data(&h1c->ibuf) || !(sess->fe->options & PR_O_NULLNOLOG)) + sess_log(sess); + + end: + return ret; +} + +/* Try to send a 408 timeout error. It relies on h1_send_error to send the + * error. This function takes care of incrementing stats and tracked counters. + */ +static int h1_handle_req_tout(struct h1c *h1c) +{ + struct session *sess = h1c->conn->owner; + int ret = 0; + + if (!b_data(&h1c->ibuf) && ((h1c->flags & H1C_F_WAIT_NEXT_REQ) || (sess->fe->options & PR_O_IGNORE_PRB))) { + h1c->flags = (h1c->flags & ~H1C_F_WAIT_NEXT_REQ) | H1C_F_ABRTED; + h1_close(h1c); + goto end; + } + + session_inc_http_req_ctr(sess); + proxy_inc_fe_req_ctr(sess->listener, sess->fe, 1); + _HA_ATOMIC_INC(&sess->fe->fe_counters.p.http.rsp[4]); + _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_req); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->failed_req); + + h1c->errcode = 408; + ret = h1_send_error(h1c); + if (b_data(&h1c->ibuf) || !(sess->fe->options & PR_O_NULLNOLOG)) + sess_log(sess); + + end: + return ret; +} + + +/* + * Attempt to read data, and subscribe if none available + */ +static int h1_recv(struct h1c *h1c) +{ + struct connection *conn = h1c->conn; + size_t ret = 0, max; + int flags = 0; + + TRACE_ENTER(H1_EV_H1C_RECV, h1c->conn); + + if (h1c->wait_event.events & SUB_RETRY_RECV) { + TRACE_DEVEL("leaving on sub_recv", H1_EV_H1C_RECV, h1c->conn); + return (b_data(&h1c->ibuf)); + } + + if ((h1c->flags & H1C_F_WANT_FASTFWD) || !h1_recv_allowed(h1c)) { + TRACE_DEVEL("leaving on (want_fastfwd|!recv_allowed)", H1_EV_H1C_RECV, h1c->conn); + return 1; + } + + if (!h1_get_buf(h1c, &h1c->ibuf)) { + h1c->flags |= H1C_F_IN_ALLOC; + TRACE_STATE("waiting for h1c ibuf allocation", H1_EV_H1C_RECV|H1_EV_H1C_BLK, h1c->conn); + return 0; + } + + /* + * If we only have a small amount of data, realign it, + * it's probably cheaper than doing 2 recv() calls. + */ + if (b_data(&h1c->ibuf) > 0 && b_data(&h1c->ibuf) < 128) + b_slow_realign_ofs(&h1c->ibuf, trash.area, sizeof(struct htx)); + + max = buf_room_for_htx_data(&h1c->ibuf); + + /* avoid useless reads after first responses */ + if (!h1c->h1s || + (!(h1c->flags & H1C_F_IS_BACK) && h1c->h1s->req.state == H1_MSG_RQBEFORE) || + ((h1c->flags & H1C_F_IS_BACK) && h1c->h1s->res.state == H1_MSG_RPBEFORE)) { + flags |= CO_RFL_READ_ONCE; + + /* we know that the first read will be constrained to a smaller + * read by the stream layer in order to respect the reserve. + * Reading too much will result in global.tune.maxrewrite being + * left at the end of the buffer, and in a very small read + * being performed again to complete them (typically 16 bytes + * freed in the index after headers were consumed) before + * another larger read. 
Instead, given that we know we're + * waiting for a header and we'll be limited, let's perform a + * shorter first read that the upper layer can retrieve by just + * a pointer swap and the next read will be doable at once in + * an empty buffer. + */ + if (max > global.tune.bufsize - global.tune.maxrewrite) + max = global.tune.bufsize - global.tune.maxrewrite; + } + + if (max) { + if (h1c->flags & H1C_F_IN_FULL) { + h1c->flags &= ~H1C_F_IN_FULL; + TRACE_STATE("h1c ibuf not full anymore", H1_EV_H1C_RECV|H1_EV_H1C_BLK); + } + + if (!b_data(&h1c->ibuf)) { + /* try to pre-align the buffer like the rxbufs will be + * to optimize memory copies. + */ + h1c->ibuf.head = sizeof(struct htx); + } + ret = conn->xprt->rcv_buf(conn, conn->xprt_ctx, &h1c->ibuf, max, flags); + HA_ATOMIC_ADD(&h1c->px_counters->bytes_in, ret); + } + + if (conn_xprt_read0_pending(conn)) { + TRACE_DEVEL("read0 on connection", H1_EV_H1C_RECV, h1c->conn); + h1c->flags |= H1C_F_EOS; + } + if (h1c->conn->flags & CO_FL_ERROR) { + TRACE_DEVEL("connection error", H1_EV_H1C_RECV, h1c->conn); + h1c->flags |= H1C_F_ERROR; + } + + if (max && !ret && h1_recv_allowed(h1c)) { + TRACE_STATE("failed to receive data, subscribing", H1_EV_H1C_RECV, h1c->conn); + conn->xprt->subscribe(conn, conn->xprt_ctx, SUB_RETRY_RECV, &h1c->wait_event); + } + else { + TRACE_DATA("data received or pending or connection error", H1_EV_H1C_RECV, h1c->conn, 0, 0, (size_t[]){ret}); + h1_wake_stream_for_recv(h1c->h1s); + } + + if (!b_data(&h1c->ibuf)) + h1_release_buf(h1c, &h1c->ibuf); + else if (!buf_room_for_htx_data(&h1c->ibuf)) { + h1c->flags |= H1C_F_IN_FULL; + TRACE_STATE("h1c ibuf full", H1_EV_H1C_RECV|H1_EV_H1C_BLK); + } + + TRACE_LEAVE(H1_EV_H1C_RECV, h1c->conn); + return !!ret || (h1c->flags & (H1C_F_EOS|H1C_F_ERROR)); +} + + +/* + * Try to send data if possible + */ +static int h1_send(struct h1c *h1c) +{ + struct connection *conn = h1c->conn; + unsigned int flags = 0; + size_t ret; + int sent = 0; + + TRACE_ENTER(H1_EV_H1C_SEND, h1c->conn); + + if (h1c->flags & (H1C_F_ERROR|H1C_F_ERR_PENDING)) { + TRACE_DEVEL("leaving on H1C error|err_pending", H1_EV_H1C_SEND, h1c->conn); + b_reset(&h1c->obuf); + if (h1c->flags & H1C_F_EOS) + h1c->flags |= H1C_F_ERROR; + return 1; + } + + if (!b_data(&h1c->obuf)) + goto end; + + if (h1c->flags & H1C_F_CO_MSG_MORE) + flags |= CO_SFL_MSG_MORE; + if (h1c->flags & H1C_F_CO_STREAMER) + flags |= CO_SFL_STREAMER; + + ret = conn->xprt->snd_buf(conn, conn->xprt_ctx, &h1c->obuf, b_data(&h1c->obuf), flags); + if (ret > 0) { + TRACE_DATA("data sent", H1_EV_H1C_SEND, h1c->conn, 0, 0, (size_t[]){ret}); + if (h1c->flags & H1C_F_OUT_FULL) { + h1c->flags &= ~H1C_F_OUT_FULL; + TRACE_STATE("h1c obuf not full anymore", H1_EV_STRM_SEND|H1_EV_H1S_BLK, h1c->conn); + } + HA_ATOMIC_ADD(&h1c->px_counters->bytes_out, ret); + b_del(&h1c->obuf, ret); + sent = 1; + } + + if (conn->flags & CO_FL_ERROR) { + /* connection error, nothing to send, clear the buffer to release it */ + TRACE_DEVEL("connection error", H1_EV_H1C_SEND, h1c->conn); + h1c->flags |= H1C_F_ERR_PENDING; + if (h1c->flags & H1C_F_EOS) + h1c->flags |= H1C_F_ERROR; + else if (!(h1c->wait_event.events & SUB_RETRY_RECV)) { + /* EOS not seen, so subscribe for reads to be able to + * catch the error on the reading path. It is especially + * important if EOI was reached. 
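+ * Without this subscription, the final error might only be noticed on a
+ * later wakeup or timeout.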
+ */ + h1c->conn->xprt->subscribe(h1c->conn, h1c->conn->xprt_ctx, SUB_RETRY_RECV, &h1c->wait_event); + } + b_reset(&h1c->obuf); + } + + end: + if (!(h1c->flags & (H1C_F_OUT_FULL|H1C_F_OUT_ALLOC))) + h1_wake_stream_for_send(h1c->h1s); + + /* We're done, no more to send */ + if (!b_data(&h1c->obuf)) { + TRACE_DEVEL("leaving with everything sent", H1_EV_H1C_SEND, h1c->conn); + h1_release_buf(h1c, &h1c->obuf); + if (h1c->state == H1_CS_CLOSING) { + TRACE_STATE("process pending shutdown for writes", H1_EV_H1C_SEND, h1c->conn); + h1_shutw_conn(conn); + } + } + else if (!(h1c->wait_event.events & SUB_RETRY_SEND)) { + TRACE_STATE("more data to send, subscribing", H1_EV_H1C_SEND, h1c->conn); + conn->xprt->subscribe(conn, conn->xprt_ctx, SUB_RETRY_SEND, &h1c->wait_event); + } + + TRACE_LEAVE(H1_EV_H1C_SEND, h1c->conn); + return sent || (h1c->flags & (H1C_F_ERR_PENDING|H1C_F_ERROR)) || (h1c->state == H1_CS_CLOSED); +} + +/* callback called on any event by the connection handler. + * It applies changes and returns zero, or < 0 if it wants immediate + * destruction of the connection. + */ +static int h1_process(struct h1c * h1c) +{ + struct connection *conn = h1c->conn; + + TRACE_ENTER(H1_EV_H1C_WAKE, conn); + + /* Try to parse now the first block of a request, creating the H1 stream if necessary */ + if (b_data(&h1c->ibuf) && /* Input data to be processed */ + (h1c->state < H1_CS_RUNNING) && /* IDLE, EMBRYONIC or UPGRADING */ + !(h1c->flags & (H1C_F_IN_SALLOC|H1C_F_ABRT_PENDING))) { /* No allocation failure on the stream rxbuf and no ERROR on the H1C */ + struct h1s *h1s = h1c->h1s; + struct buffer *buf; + size_t count; + + /* When it happens for a backend connection, we may release it (it is probably a 408) */ + if (h1c->flags & H1C_F_IS_BACK) + goto release; + + /* First of all handle H1 to H2 upgrade (no need to create the H1 stream) */ + if (!(h1c->flags & H1C_F_WAIT_NEXT_REQ) && /* First request */ + !(h1c->px->options2 & PR_O2_NO_H2_UPGRADE) && /* H2 upgrade supported by the proxy */ + !(conn->mux->flags & MX_FL_NO_UPG)) { /* the current mux supports upgrades */ + /* Try to match H2 preface before parsing the request headers. 
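+ * (i.e. the 24-byte preface "PRI * HTTP/2.0\r\n\r\nSM\r\n\r\n" defined by
+ * the H2 specification).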
*/ + if (b_isteq(&h1c->ibuf, 0, b_data(&h1c->ibuf), ist(H2_CONN_PREFACE)) > 0) { + h1c->flags |= H1C_F_UPG_H2C; + if (h1c->state == H1_CS_UPGRADING) { + BUG_ON(!h1s); + se_fl_set(h1s->sd, SE_FL_EOI|SE_FL_EOS); /* Set EOS here to release the SC */ + } + TRACE_STATE("release h1c to perform H2 upgrade ", H1_EV_RX_DATA|H1_EV_H1C_WAKE); + goto release; + } + } + + /* Create the H1 stream if not already there */ + if (!h1s) { + h1s = h1c_frt_stream_new(h1c, NULL, h1c->conn->owner); + if (!h1s) { + b_reset(&h1c->ibuf); + h1_handle_internal_err(h1c); + TRACE_ERROR("alloc error", H1_EV_H1C_WAKE|H1_EV_H1C_ERR); + goto no_parsing; + } + } + + if (h1s->sess->t_idle == -1) + h1s->sess->t_idle = ns_to_ms(now_ns - h1s->sess->accept_ts) - h1s->sess->t_handshake; + + /* Get the stream rxbuf */ + buf = h1_get_buf(h1c, &h1s->rxbuf); + if (!buf) { + h1c->flags |= H1C_F_IN_SALLOC; + TRACE_STATE("waiting for stream rxbuf allocation", H1_EV_H1C_WAKE|H1_EV_H1C_BLK, h1c->conn); + return 0; + } + + count = (buf->size - sizeof(struct htx) - global.tune.maxrewrite); + h1_process_demux(h1c, buf, count); + h1_release_buf(h1c, &h1s->rxbuf); + h1_set_idle_expiration(h1c); + if (h1c->state < H1_CS_RUNNING) { + if (h1s->flags & H1S_F_INTERNAL_ERROR) { + h1_handle_internal_err(h1c); + TRACE_ERROR("internal error detected", H1_EV_H1C_WAKE|H1_EV_H1C_ERR); + } + else if (h1s->flags & H1S_F_NOT_IMPL_ERROR) { + h1_handle_not_impl_err(h1c); + TRACE_ERROR("not-implemented error detected", H1_EV_H1C_WAKE|H1_EV_H1C_ERR); + } + else if (h1s->flags & H1S_F_PARSING_ERROR || se_fl_test(h1s->sd, SE_FL_ERROR)) { + h1_handle_parsing_error(h1c); + TRACE_ERROR("parsing error detected", H1_EV_H1C_WAKE|H1_EV_H1C_ERR); + } + else { + TRACE_STATE("Incomplete message, subscribing", H1_EV_RX_DATA|H1_EV_H1C_BLK|H1_EV_H1C_WAKE, h1c->conn, h1s); + h1c->conn->xprt->subscribe(h1c->conn, h1c->conn->xprt_ctx, SUB_RETRY_RECV, &h1c->wait_event); + } + } + } + + no_parsing: + h1_send(h1c); + + /* H1 connection must be released ASAP if: + * - an error occurred on the H1C or + * - a read0 was received or + * - a silent shutdown was emitted and all outgoing data sent + */ + if ((h1c->flags & (H1C_F_EOS|H1C_F_ERROR|H1C_F_ABRT_PENDING|H1C_F_ABRTED)) || + (h1c->state >= H1_CS_CLOSING && (h1c->flags & H1C_F_SILENT_SHUT) && !b_data(&h1c->obuf))) { + if (h1c->state != H1_CS_RUNNING) { + /* No stream connector or upgrading */ + if (h1c->state < H1_CS_RUNNING && !(h1c->flags & (H1C_F_IS_BACK|H1C_F_ABRT_PENDING))) { + /* shutdown for reads and no error on the frontend connection: Send an error */ + if (h1_handle_parsing_error(h1c)) + h1_send(h1c); + } + else if (h1c->flags & H1C_F_ABRT_PENDING) { + /* Handle pending error, if any (only possible on frontend connection) */ + BUG_ON(h1c->flags & H1C_F_IS_BACK); + if (h1_send_error(h1c)) + h1_send(h1c); + } + else { + h1_close(h1c); + TRACE_STATE("close h1c", H1_EV_H1S_END, h1c->conn); + } + + /* If there is some pending outgoing data or error, just wait */ + if (h1c->state == H1_CS_CLOSING || (h1c->flags & H1C_F_ABRT_PENDING)) + goto end; + + /* Otherwise we can release the H1 connection */ + goto release; + } + else { + struct h1s *h1s = h1c->h1s; + + /* Here there is still a H1 stream with a stream connector. 
+ * Report an error at the stream level and wake up the stream + */ + BUG_ON(!h1s); + + if (h1c->flags & (H1C_F_ERR_PENDING|H1C_F_ERROR)) { + se_fl_set_error(h1s->sd); + TRACE_STATE("report (ERR_PENDING|ERROR) to SE", H1_EV_H1C_RECV, conn, h1s); + } + TRACE_POINT(H1_EV_STRM_WAKE, h1c->conn, h1s); + h1_alert(h1s); + } + } + + if (!b_data(&h1c->ibuf)) + h1_release_buf(h1c, &h1c->ibuf); + + /* Check if a soft-stop is in progress. + * Release idling front connection if this is the case. + */ + if (!(h1c->flags & H1C_F_IS_BACK)) { + if (unlikely(h1c->px->flags & (PR_FL_DISABLED|PR_FL_STOPPED))) { + if (!(h1c->px->options & PR_O_IDLE_CLOSE_RESP) && + h1c->flags & H1C_F_WAIT_NEXT_REQ) { + + int send_close = 1; + /* If a close-spread-time option is set, we want to avoid + * closing all the active HTTP2 connections at once so we add a + * random factor that will spread the closing. + */ + if (tick_isset(global.close_spread_end)) { + int remaining_window = tick_remain(now_ms, global.close_spread_end); + if (remaining_window) { + /* This should increase the closing rate the + * further along the window we are. + */ + send_close = (remaining_window <= statistical_prng_range(global.close_spread_time)); + } + } + else if (global.tune.options & GTUNE_DISABLE_ACTIVE_CLOSE) + send_close = 0; /* let the client close his connection himself */ + if (send_close) + goto release; + } + } + } + + if (h1c->state == H1_CS_RUNNING && (h1c->flags & H1C_F_WANT_FASTFWD) && !h1s_data_pending(h1c->h1s)) { + TRACE_DEVEL("xprt rcv_buf blocked (want_fastfwd), notify h1s for recv", H1_EV_H1C_RECV, h1c->conn); + h1_wake_stream_for_recv(h1c->h1s); + } + + end: + h1_refresh_timeout(h1c); + TRACE_LEAVE(H1_EV_H1C_WAKE, conn); + return 0; + + release: + if (h1c->state == H1_CS_UPGRADING) { + struct h1s *h1s = h1c->h1s; + + /* Don't release the H1 connection right now, we must destroy + * the attached SC first */ + BUG_ON(!h1s); + + if (h1c->flags & H1C_F_EOS) { + se_fl_set(h1s->sd, SE_FL_EOI|SE_FL_EOS); + TRACE_STATE("report EOS to SE", H1_EV_H1C_RECV, conn, h1s); + } + if (h1c->flags & (H1C_F_ERR_PENDING|H1C_F_ERROR)) { + se_fl_set_error(h1s->sd); + TRACE_STATE("report (ERR_PENDING|ERROR) to SE", H1_EV_H1C_RECV, conn, h1s); + } + h1_alert(h1s); + TRACE_DEVEL("waiting to release the SC before releasing the connection", H1_EV_H1C_WAKE); + } + else { + h1_release(h1c); + TRACE_DEVEL("leaving after releasing the connection", H1_EV_H1C_WAKE); + } + return -1; +} + +struct task *h1_io_cb(struct task *t, void *ctx, unsigned int state) +{ + struct connection *conn; + struct tasklet *tl = (struct tasklet *)t; + int conn_in_list; + struct h1c *h1c = ctx; + int ret = 0; + + if (state & TASK_F_USR1) { + /* the tasklet was idling on an idle connection, it might have + * been stolen, let's be careful! + */ + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + if (tl->context == NULL) { + /* The connection has been taken over by another thread, + * we're no longer responsible for it, so just free the + * tasklet, and do nothing. 
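+ * The stealing thread installs its own task and tasklet (see
+ * h1_takeover()), so nothing else here belongs to us.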
+ */ + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + tasklet_free(tl); + return NULL; + } + conn = h1c->conn; + TRACE_POINT(H1_EV_H1C_WAKE, conn); + + /* Remove the connection from the list, to be sure nobody attempts + * to use it while we handle the I/O events + */ + conn_in_list = conn->flags & CO_FL_LIST_MASK; + if (conn_in_list) + conn_delete_from_tree(conn); + + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + } else { + /* we're certain the connection was not in an idle list */ + conn = h1c->conn; + TRACE_ENTER(H1_EV_H1C_WAKE, conn); + conn_in_list = 0; + } + + if (!(h1c->wait_event.events & SUB_RETRY_SEND)) + ret = h1_send(h1c); + if (!(h1c->wait_event.events & SUB_RETRY_RECV)) + ret |= h1_recv(h1c); + if (ret || b_data(&h1c->ibuf)) + ret = h1_process(h1c); + + /* If we were in an idle list, we want to add it back into it, + * unless h1_process() returned -1, which means it has destroyed + * the connection (testing !ret is enough: if h1_process() wasn't + * called then ret will be 0 anyway). + */ + if (ret < 0) + t = NULL; + + if (!ret && conn_in_list) { + struct server *srv = objt_server(conn->target); + + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + _srv_add_idle(srv, conn, conn_in_list == CO_FL_SAFE_LIST); + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + } + return t; +} + +static int h1_wake(struct connection *conn) +{ + struct h1c *h1c = conn->ctx; + int ret; + + TRACE_POINT(H1_EV_H1C_WAKE, conn); + + h1_send(h1c); + ret = h1_process(h1c); + if (ret == 0) { + struct h1s *h1s = h1c->h1s; + + if (h1c->state == H1_CS_UPGRADING || h1c->state == H1_CS_RUNNING) + h1_alert(h1s); + } + return ret; +} + +/* Connection timeout management. The principle is that if there's no reception + * nor sending for a certain amount of time, the connection is closed. + */ +struct task *h1_timeout_task(struct task *t, void *context, unsigned int state) +{ + struct h1c *h1c = context; + int expired = tick_is_expired(t->expire, now_ms); + + TRACE_ENTER(H1_EV_H1C_WAKE, h1c ? h1c->conn : NULL); + + if (h1c) { + /* Make sure nobody stole the connection from us */ + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + + /* Somebody already stole the connection from us, so we should not + * free it, we just have to free the task. 
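+ * This is the same takeover situation as the one handled in h1_io_cb()
+ * above.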
+ */ + if (!t->context) { + h1c = NULL; + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + goto do_leave; + } + + if (!expired) { + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + TRACE_DEVEL("leaving (not expired)", H1_EV_H1C_WAKE, h1c->conn, h1c->h1s); + return t; + } + + /* If a stream connector is still attached to the mux and ready, wait for the + * stream's timeout + */ + if (h1c->state == H1_CS_RUNNING) { + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + t->expire = TICK_ETERNITY; + TRACE_DEVEL("leaving (SC still attached)", H1_EV_H1C_WAKE, h1c->conn, h1c->h1s); + return t; + } + + /* Try to send an error to the client */ + if (h1c->state != H1_CS_CLOSING && !(h1c->flags & (H1C_F_IS_BACK|H1C_F_ERROR|H1C_F_ABRT_PENDING))) { + TRACE_DEVEL("timeout error detected", H1_EV_H1C_WAKE|H1_EV_H1C_ERR, h1c->conn, h1c->h1s); + if (h1_handle_req_tout(h1c)) + h1_send(h1c); + if (b_data(&h1c->obuf) || (h1c->flags & H1C_F_ABRT_PENDING)) { + h1_refresh_timeout(h1c); + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + return t; + } + } + + if (h1c->h1s && !se_fl_test(h1c->h1s->sd, SE_FL_ORPHAN)) { + /* Don't release the H1 connection right now, we must destroy the + * attached SC first. */ + se_fl_set(h1c->h1s->sd, SE_FL_EOS | SE_FL_ERROR); + h1_alert(h1c->h1s); + h1_refresh_timeout(h1c); + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + TRACE_DEVEL("waiting to release the SC before releasing the connection", H1_EV_H1C_WAKE); + return t; + } + + /* We're about to destroy the connection, so make sure nobody attempts + * to steal it from us. + */ + if (h1c->conn->flags & CO_FL_LIST_MASK) + conn_delete_from_tree(h1c->conn); + + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + } + + do_leave: + task_destroy(t); + + if (!h1c) { + /* resources were already deleted */ + TRACE_DEVEL("leaving (no more h1c)", H1_EV_H1C_WAKE); + return NULL; + } + + h1c->task = NULL; + h1_release(h1c); + TRACE_LEAVE(H1_EV_H1C_WAKE); + return NULL; +} + +/*******************************************/ +/* functions below are used by the streams */ +/*******************************************/ + +/* + * Attach a new stream to a connection + * (Used for outgoing connections) + */ +static int h1_attach(struct connection *conn, struct sedesc *sd, struct session *sess) +{ + struct h1c *h1c = conn->ctx; + struct h1s *h1s; + + /* this connection is no longer idle (if it ever was) */ + h1c->flags &= ~H1C_F_SILENT_SHUT; + + TRACE_ENTER(H1_EV_STRM_NEW, conn); + if (h1c->flags & (H1C_F_ERR_PENDING|H1C_F_ERROR)) { + TRACE_ERROR("h1c on error", H1_EV_STRM_NEW|H1_EV_STRM_END|H1_EV_STRM_ERR, conn); + goto err; + } + + h1s = h1c_bck_stream_new(h1c, sd->sc, sess); + if (h1s == NULL) { + TRACE_ERROR("h1s creation failure", H1_EV_STRM_NEW|H1_EV_STRM_END|H1_EV_STRM_ERR, conn); + goto err; + } + + /* the connection is not idle anymore, let's mark this */ + HA_ATOMIC_AND(&h1c->wait_event.tasklet->state, ~TASK_F_USR1); + xprt_set_used(conn, conn->xprt, conn->xprt_ctx); + + TRACE_LEAVE(H1_EV_STRM_NEW, conn, h1s); + return 0; + err: + TRACE_DEVEL("leaving on error", H1_EV_STRM_NEW|H1_EV_STRM_END|H1_EV_STRM_ERR, conn); + return -1; +} + +/* Retrieves a valid stream connector from this connection, or returns NULL. + * For this mux, it's easy as we can only store a single stream connector. 
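+ * (an H1 connection carries at most one stream, unlike H2 where several
+ * streams may share the connection).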
+ */ +static struct stconn *h1_get_first_sc(const struct connection *conn) +{ + struct h1c *h1c = conn->ctx; + struct h1s *h1s = h1c->h1s; + + if (h1s) + return h1s_sc(h1s); + + return NULL; +} + +static void h1_destroy(void *ctx) +{ + struct h1c *h1c = ctx; + + TRACE_POINT(H1_EV_H1C_END, h1c->conn); + if (!h1c->h1s || h1c->conn->ctx != h1c) + h1_release(h1c); +} + +/* + * Detach the stream from the connection and possibly release the connection. + */ +static void h1_detach(struct sedesc *sd) +{ + struct h1s *h1s = sd->se; + struct h1c *h1c; + struct session *sess; + int is_not_first; + + TRACE_ENTER(H1_EV_STRM_END, h1s ? h1s->h1c->conn : NULL, h1s); + + if (!h1s) { + TRACE_LEAVE(H1_EV_STRM_END); + return; + } + + sess = h1s->sess; + h1c = h1s->h1c; + + sess->accept_date = date; + sess->accept_ts = now_ns; + sess->t_handshake = 0; + sess->t_idle = -1; + + is_not_first = h1s->flags & H1S_F_NOT_FIRST; + h1s_destroy(h1s); + + if (h1c->state == H1_CS_IDLE && (h1c->flags & H1C_F_IS_BACK)) { + /* this connection may be killed at any moment, we want it to + * die "cleanly" (i.e. only an RST). + */ + h1c->flags |= H1C_F_SILENT_SHUT; + + /* If there are any excess server data in the input buffer, + * release it and close the connection ASAP (some data may + * remain in the output buffer). This happens if a server sends + * invalid responses. So in such case, we don't want to reuse + * the connection + */ + if (b_data(&h1c->ibuf)) { + h1_release_buf(h1c, &h1c->ibuf); + h1_close(h1c); + TRACE_DEVEL("remaining data on detach, kill connection", H1_EV_STRM_END|H1_EV_H1C_END); + goto release; + } + + if (h1c->conn->flags & CO_FL_PRIVATE) { + /* Add the connection in the session server list, if not already done */ + if (!session_add_conn(sess, h1c->conn, h1c->conn->target)) { + h1c->conn->owner = NULL; + h1c->conn->mux->destroy(h1c); + goto end; + } + /* Always idle at this step */ + if (session_check_idle_conn(sess, h1c->conn)) { + /* The connection got destroyed, let's leave */ + TRACE_DEVEL("outgoing connection killed", H1_EV_STRM_END|H1_EV_H1C_END); + goto end; + } + } + else { + if (h1c->conn->owner == sess) + h1c->conn->owner = NULL; + + /* mark that the tasklet may lose its context to another thread and + * that the handler needs to check it under the idle conns lock. + */ + HA_ATOMIC_OR(&h1c->wait_event.tasklet->state, TASK_F_USR1); + h1c->conn->xprt->subscribe(h1c->conn, h1c->conn->xprt_ctx, SUB_RETRY_RECV, &h1c->wait_event); + xprt_set_idle(h1c->conn, h1c->conn->xprt, h1c->conn->xprt_ctx); + + if (!srv_add_to_idle_list(objt_server(h1c->conn->target), h1c->conn, is_not_first)) { + /* The server doesn't want it, let's kill the connection right away */ + h1c->conn->mux->destroy(h1c); + TRACE_DEVEL("outgoing connection killed", H1_EV_STRM_END|H1_EV_H1C_END); + goto end; + } + /* At this point, the connection has been added to the + * server idle list, so another thread may already have + * hijacked it, so we can't do anything with it. 
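+ * Any further access would require taking the idle conns lock again, so
+ * we simply return.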
+ */ + return; + } + } + + release: + /* We don't want to close right now unless the connection is in error or shut down for writes */ + if ((h1c->flags & H1C_F_ERROR) || + (h1c->state == H1_CS_CLOSED) || + (h1c->state == H1_CS_CLOSING && !b_data(&h1c->obuf)) || + !h1c->conn->owner) { + TRACE_DEVEL("killing dead connection", H1_EV_STRM_END, h1c->conn); + h1_release(h1c); + } + else { + if (h1c->state == H1_CS_IDLE) { + /* If we have a new request, process it immediately or + * subscribe for reads waiting for new data + */ + if (unlikely(b_data(&h1c->ibuf))) { + if (h1_process(h1c) == -1) + goto end; + } + else + h1c->conn->xprt->subscribe(h1c->conn, h1c->conn->xprt_ctx, SUB_RETRY_RECV, &h1c->wait_event); + } + h1_set_idle_expiration(h1c); + h1_refresh_timeout(h1c); + } + end: + TRACE_LEAVE(H1_EV_STRM_END); +} + + +static void h1_shutr(struct stconn *sc, enum co_shr_mode mode) +{ + struct h1s *h1s = __sc_mux_strm(sc); + struct h1c *h1c; + + if (!h1s) + return; + h1c = h1s->h1c; + + TRACE_POINT(H1_EV_STRM_SHUT, h1c->conn, h1s, 0, (size_t[]){mode}); +} + +static void h1_shutw(struct stconn *sc, enum co_shw_mode mode) +{ + struct h1s *h1s = __sc_mux_strm(sc); + struct h1c *h1c; + + if (!h1s) + return; + h1c = h1s->h1c; + + TRACE_ENTER(H1_EV_STRM_SHUT, h1c->conn, h1s, 0, (size_t[]){mode}); + + if (se_fl_test(h1s->sd, SE_FL_KILL_CONN)) { + TRACE_STATE("stream wants to kill the connection", H1_EV_STRM_SHUT, h1c->conn, h1s); + goto do_shutw; + } + if (h1c->state == H1_CS_CLOSING || (h1c->flags & (H1C_F_EOS|H1C_F_ERR_PENDING|H1C_F_ERROR))) { + TRACE_STATE("shutdown on connection (EOS || CLOSING || ERROR)", H1_EV_STRM_SHUT, h1c->conn, h1s); + goto do_shutw; + } + + if (h1c->state == H1_CS_UPGRADING) { + TRACE_STATE("keep connection alive (UPGRADING)", H1_EV_STRM_SHUT, h1c->conn, h1s); + goto end; + } + if (((h1s->flags & H1S_F_WANT_KAL) && h1s->req.state == H1_MSG_DONE && h1s->res.state == H1_MSG_DONE)) { + TRACE_STATE("keep connection alive (want_kal)", H1_EV_STRM_SHUT, h1c->conn, h1s); + goto end; + } + + do_shutw: + h1_close(h1c); + if (mode != CO_SHW_NORMAL) + h1c->flags |= H1C_F_SILENT_SHUT; + + if (!b_data(&h1c->obuf)) + h1_shutw_conn(h1c->conn); + end: + TRACE_LEAVE(H1_EV_STRM_SHUT, h1c->conn, h1s); +} + +static void h1_shutw_conn(struct connection *conn) +{ + struct h1c *h1c = conn->ctx; + + TRACE_ENTER(H1_EV_H1C_END, conn); + h1_close(h1c); + if (conn->flags & CO_FL_SOCK_WR_SH) + return; + + conn_xprt_shutw(conn); + conn_sock_shutw(conn, !(h1c->flags & H1C_F_SILENT_SHUT)); + + if (h1c->wait_event.tasklet && !h1c->wait_event.events) + tasklet_wakeup(h1c->wait_event.tasklet); + + TRACE_LEAVE(H1_EV_H1C_END, conn); +} + +/* Called from the upper layer, to unsubscribe <es> from events <event_type> + * The <es> pointer is not allowed to differ from the one passed to the + * subscribe() call. It always returns zero. + */ +static int h1_unsubscribe(struct stconn *sc, int event_type, struct wait_event *es) +{ + struct h1s *h1s = __sc_mux_strm(sc); + + if (!h1s) + return 0; + + BUG_ON(event_type & ~(SUB_RETRY_SEND|SUB_RETRY_RECV)); + BUG_ON(h1s->subs && h1s->subs != es); + + es->events &= ~event_type; + if (!es->events) + h1s->subs = NULL; + + if (event_type & SUB_RETRY_RECV) + TRACE_DEVEL("unsubscribe(recv)", H1_EV_STRM_RECV, h1s->h1c->conn, h1s); + + if (event_type & SUB_RETRY_SEND) + TRACE_DEVEL("unsubscribe(send)", H1_EV_STRM_SEND, h1s->h1c->conn, h1s); + + return 0; +} + +/* Called from the upper layer, to subscribe <es> to events <event_type>. 
The + * event subscriber <es> is not allowed to change from a previous call as long + * as at least one event is still subscribed. The <event_type> must only be a + * combination of SUB_RETRY_RECV and SUB_RETRY_SEND. It always returns 0, unless + * the stream connector <sc> was already detached, in which case it will return + * -1. + */ +static int h1_subscribe(struct stconn *sc, int event_type, struct wait_event *es) +{ + struct h1s *h1s = __sc_mux_strm(sc); + struct h1c *h1c; + + if (!h1s) + return -1; + + BUG_ON(event_type & ~(SUB_RETRY_SEND|SUB_RETRY_RECV)); + BUG_ON(h1s->subs && h1s->subs != es); + + es->events |= event_type; + h1s->subs = es; + + if (event_type & SUB_RETRY_RECV) + TRACE_DEVEL("subscribe(recv)", H1_EV_STRM_RECV, h1s->h1c->conn, h1s); + + + if (event_type & SUB_RETRY_SEND) { + TRACE_DEVEL("subscribe(send)", H1_EV_STRM_SEND, h1s->h1c->conn, h1s); + /* + * If the stconn attempts to subscribe, and the + * mux isn't subscribed to the connection, then it + * probably means the connection wasn't established + * yet, so we have to subscribe. + */ + h1c = h1s->h1c; + if (!(h1c->wait_event.events & SUB_RETRY_SEND)) + h1c->conn->xprt->subscribe(h1c->conn, + h1c->conn->xprt_ctx, + SUB_RETRY_SEND, + &h1c->wait_event); + } + return 0; +} + +/* Called from the upper layer, to receive data. + * + * The caller is responsible for defragmenting <buf> if necessary. But <flags> + * must be tested to know the calling context. If CO_RFL_BUF_FLUSH is set, it + * means the caller wants to flush input data (from the mux buffer and the + * channel buffer) to be able to use fast-forwarding. + * If CO_RFL_KEEP_RECV is set, the mux must always subscribe for read + * events before giving back. CO_RFL_BUF_WET is set if <buf> is congested with + * data scheduled for leaving soon. CO_RFL_BUF_NOT_STUCK is set to instruct the + * mux it may optimize the data copy to <buf> if necessary. Otherwise, it should + * copy as much data as possible. + */ +static size_t h1_rcv_buf(struct stconn *sc, struct buffer *buf, size_t count, int flags) +{ + struct h1s *h1s = __sc_mux_strm(sc); + struct h1c *h1c = h1s->h1c; + struct h1m *h1m = (!(h1c->flags & H1C_F_IS_BACK) ? 
&h1s->req : &h1s->res); + size_t ret = 0; + + TRACE_ENTER(H1_EV_STRM_RECV, h1c->conn, h1s, 0, (size_t[]){count}); + + /* Do nothing for now if not RUNNING (implies UPGRADING) */ + if (h1c->state < H1_CS_RUNNING) { + TRACE_DEVEL("h1c not ready yet", H1_EV_H1C_RECV|H1_EV_H1C_BLK, h1c->conn); + goto end; + } + + if (!(h1c->flags & H1C_F_IN_ALLOC)) + ret = h1_process_demux(h1c, buf, count); + else + TRACE_DEVEL("h1c ibuf not allocated", H1_EV_H1C_RECV|H1_EV_H1C_BLK, h1c->conn); + + if ((flags & CO_RFL_BUF_FLUSH) && se_fl_test(h1s->sd, SE_FL_MAY_FASTFWD_PROD)) { + h1c->flags |= H1C_F_WANT_FASTFWD; + TRACE_STATE("Block xprt rcv_buf to flush stream's buffer (want_fastfwd)", H1_EV_STRM_RECV, h1c->conn, h1s); + } + else { + if (((flags & CO_RFL_KEEP_RECV) || (h1m->state != H1_MSG_DONE)) && !(h1c->wait_event.events & SUB_RETRY_RECV)) + h1c->conn->xprt->subscribe(h1c->conn, h1c->conn->xprt_ctx, SUB_RETRY_RECV, &h1c->wait_event); + } + + end: + TRACE_LEAVE(H1_EV_STRM_RECV, h1c->conn, h1s, 0, (size_t[]){ret}); + return ret; +} + + +/* Called from the upper layer, to send data */ +static size_t h1_snd_buf(struct stconn *sc, struct buffer *buf, size_t count, int flags) +{ + struct h1s *h1s = __sc_mux_strm(sc); + struct h1c *h1c; + size_t total = 0; + + if (!h1s) + return 0; + h1c = h1s->h1c; + + TRACE_ENTER(H1_EV_STRM_SEND, h1c->conn, h1s, 0, (size_t[]){count}); + + /* If we're not connected yet, or we're waiting for a handshake, stop + * now, as we don't want to remove everything from the channel buffer + * before we're sure we can send it. + */ + if (h1c->conn->flags & CO_FL_WAIT_XPRT) { + TRACE_LEAVE(H1_EV_STRM_SEND, h1c->conn, h1s); + return 0; + } + + if (h1c->flags & (H1C_F_ERR_PENDING|H1C_F_ERROR)) { + se_fl_set_error(h1s->sd); + TRACE_ERROR("H1C on error, leaving in error", H1_EV_STRM_SEND|H1_EV_H1C_ERR|H1_EV_H1S_ERR|H1_EV_STRM_ERR, h1c->conn, h1s); + return 0; + } + + /* Inherit some flags from the upper layer */ + h1c->flags &= ~(H1C_F_CO_MSG_MORE|H1C_F_CO_STREAMER); + if (flags & CO_SFL_MSG_MORE) + h1c->flags |= H1C_F_CO_MSG_MORE; + if (flags & CO_SFL_STREAMER) + h1c->flags |= H1C_F_CO_STREAMER; + + while (count) { + size_t ret = 0; + + if (!(h1c->flags & (H1C_F_OUT_FULL|H1C_F_OUT_ALLOC))) + ret = h1_process_mux(h1c, buf, count); + else + TRACE_DEVEL("h1c obuf not allocated", H1_EV_STRM_SEND|H1_EV_H1S_BLK, h1c->conn, h1s); + + if (!ret) + break; + + if ((count - ret) > 0) + h1c->flags |= H1C_F_CO_MSG_MORE; + + total += ret; + count -= ret; + + if ((h1c->wait_event.events & SUB_RETRY_SEND) || !h1_send(h1c)) + break; + + if ((h1c->conn->flags & (CO_FL_ERROR|CO_FL_SOCK_WR_SH))) + break; + } + + if (h1c->flags & (H1C_F_ERR_PENDING|H1C_F_ERROR)) { + // FIXME: following test was removed : + // ((h1c->conn->flags & CO_FL_ERROR) && (se_fl_test(h1s->sd, SE_FL_EOI | SE_FL_EOS) || !b_data(&h1c->ibuf)))) { + se_fl_set_error(h1s->sd); + TRACE_ERROR("reporting error to the app-layer stream", H1_EV_STRM_SEND|H1_EV_H1S_ERR|H1_EV_STRM_ERR, h1c->conn, h1s); + } + + h1_refresh_timeout(h1c); + TRACE_LEAVE(H1_EV_STRM_SEND, h1c->conn, h1s, 0, (size_t[]){total}); + return total; +} + +static inline struct sedesc *h1s_opposite_sd(struct h1s *h1s) +{ + struct xref *peer; + struct sedesc *sdo; + + peer = xref_get_peer_and_lock(&h1s->sd->xref); + if (!peer) + return NULL; + + sdo = container_of(peer, struct sedesc, xref); + xref_unlock(&h1s->sd->xref, peer); + return sdo; +} + +static size_t h1_nego_ff(struct stconn *sc, struct buffer *input, size_t count, unsigned int may_splice) +{ + struct h1s *h1s = 
__sc_mux_strm(sc); + struct h1c *h1c = h1s->h1c; + struct h1m *h1m = (!(h1c->flags & H1C_F_IS_BACK) ? &h1s->res : &h1s->req); + size_t ret = 0; + + TRACE_ENTER(H1_EV_STRM_SEND, h1c->conn, h1s, 0, (size_t[]){count}); + + + if (global.tune.no_zero_copy_fwd & NO_ZERO_COPY_FWD_H1_SND) { + h1s->sd->iobuf.flags |= IOBUF_FL_NO_FF; + goto out; + } + + /* TODO: add check on curr_len if CLEN */ + + if (h1m->flags & H1_MF_CHNK) { + if (h1m->curr_len) { + BUG_ON(h1m->state != H1_MSG_DATA); + if (count > h1m->curr_len) + count = h1m->curr_len; + } + else { + BUG_ON(h1m->state != H1_MSG_CHUNK_CRLF && h1m->state != H1_MSG_CHUNK_SIZE); + if (!h1_make_chunk(h1s, h1m, count)) + goto out; + h1m->curr_len = count; + } + } + + /* Use kernel splicing if it is supported by the sender and if there + * are no input data _AND_ no output data. + * + * TODO: It may be good to add a flag to send obuf data first if any, + * and then data in pipe, or the opposite. For now, it is not + * supported to mix data. + */ + if (!b_data(input) && !b_data(&h1c->obuf) && may_splice) { +#if defined(USE_LINUX_SPLICE) + if (h1c->conn->xprt->snd_pipe && (h1s->sd->iobuf.pipe || (pipes_used < global.maxpipes && (h1s->sd->iobuf.pipe = get_pipe())))) { + h1s->sd->iobuf.offset = 0; + h1s->sd->iobuf.data = 0; + ret = count; + goto out; + } +#endif + h1s->sd->iobuf.flags |= IOBUF_FL_NO_SPLICING; + TRACE_DEVEL("Unable to allocate pipe for splicing, fallback to buffer", H1_EV_STRM_SEND, h1c->conn, h1s); + } + + if (!h1_get_buf(h1c, &h1c->obuf)) { + h1c->flags |= H1C_F_OUT_ALLOC; + TRACE_STATE("waiting for opposite h1c obuf allocation", H1_EV_STRM_SEND|H1_EV_H1S_BLK, h1c->conn, h1s); + goto out; + } + + if (b_space_wraps(&h1c->obuf)) + b_slow_realign(&h1c->obuf, trash.area, b_data(&h1c->obuf)); + + h1s->sd->iobuf.buf = &h1c->obuf; + h1s->sd->iobuf.offset = 0; + h1s->sd->iobuf.data = 0; + + /* Cannot forward more than available room in output buffer */ + if (count > b_room(&h1c->obuf)) + count = b_room(&h1c->obuf); + + if (!count) { + h1c->flags |= H1C_F_OUT_FULL; + h1s->sd->iobuf.flags |= IOBUF_FL_FF_BLOCKED; + TRACE_STATE("output buffer full", H1_EV_STRM_SEND|H1_EV_H1S_BLK, h1c->conn, h1s); + goto out; + } + + /* forward remaining input data */ + if (b_data(input)) { + size_t xfer = count; + + if (xfer > b_data(input)) + xfer = b_data(input); + h1s->sd->iobuf.data = b_xfer(&h1c->obuf, input, xfer); + + /* Cannot forward more data, wait for room */ + if (b_data(input)) + goto out; + } + + ret = count - h1s->sd->iobuf.data; + + out: + TRACE_LEAVE(H1_EV_STRM_SEND, h1c->conn, h1s, 0, (size_t[]){ret}); + return ret; +} + +static size_t h1_done_ff(struct stconn *sc) +{ + struct h1s *h1s = __sc_mux_strm(sc); + struct h1c *h1c = h1s->h1c; + struct h1m *h1m = (!(h1c->flags & H1C_F_IS_BACK) ? 
&h1s->res : &h1s->req); + struct sedesc *sd = h1s->sd; + size_t total = 0; + + TRACE_ENTER(H1_EV_STRM_SEND, h1c->conn, h1s); + +#if defined(USE_LINUX_SPLICE) + if (sd->iobuf.pipe) { + total = h1c->conn->xprt->snd_pipe(h1c->conn, h1c->conn->xprt_ctx, sd->iobuf.pipe, sd->iobuf.pipe->data); + if (total > 0) + HA_ATOMIC_ADD(&h1c->px_counters->spliced_bytes_out, total); + if (!sd->iobuf.pipe->data) { + put_pipe(sd->iobuf.pipe); + sd->iobuf.pipe = NULL; + } + goto out; + } +#endif + if (!sd->iobuf.pipe) { + if (b_room(&h1c->obuf) == sd->iobuf.offset) + h1c->flags |= H1C_F_OUT_FULL; + + total = sd->iobuf.data; + sd->iobuf.buf = NULL; + sd->iobuf.offset = 0; + sd->iobuf.data = 0; + + if (sd->iobuf.flags & IOBUF_FL_EOI) + h1c->flags &= ~H1C_F_CO_MSG_MORE; + + /* Perform a synchronous send but in all cases, consider + * everything was already sent from the SC point of view. + */ + h1_send(h1c); + } + + out: + if (h1m->curr_len) + h1m->curr_len -= total; + + if (!h1m->curr_len && (h1m->flags & H1_MF_CLEN)) + h1m->state = ((sd->iobuf.flags & IOBUF_FL_EOI) ? H1_MSG_DONE : H1_MSG_TRAILERS); + else if (!h1m->curr_len && (h1m->flags & H1_MF_CHNK)) { + if (h1m->state == H1_MSG_DATA) + h1m->state = H1_MSG_CHUNK_CRLF; + } + + HA_ATOMIC_ADD(&h1c->px_counters->bytes_out, total); + + // TODO: should we call h1_process() instead ? + if (h1c->conn->flags & CO_FL_ERROR) { + h1c->flags = (h1c->flags & ~H1C_F_WANT_FASTFWD) | H1C_F_ERR_PENDING; + if (h1c->flags & H1C_F_EOS) + h1c->flags |= H1C_F_ERROR; + else if (!(h1c->wait_event.events & SUB_RETRY_RECV)) { + /* EOS not seen, so subscribe for reads to be able to + * catch the error on the reading path. It is especially + * important if EOI was reached. + */ + h1c->conn->xprt->subscribe(h1c->conn, h1c->conn->xprt_ctx, SUB_RETRY_RECV, &h1c->wait_event); + } + se_fl_set_error(h1s->sd); + TRACE_DEVEL("connection error", H1_EV_STRM_ERR|H1_EV_H1C_ERR|H1_EV_H1S_ERR, h1c->conn, h1s); + } + + TRACE_LEAVE(H1_EV_STRM_RECV, h1c->conn, h1s, 0, (size_t[]){total}); + return total; +} + +static int h1_fastfwd(struct stconn *sc, unsigned int count, unsigned int flags) +{ + struct h1s *h1s = __sc_mux_strm(sc); + struct h1c *h1c = h1s->h1c; + struct h1m *h1m = (!(h1c->flags & H1C_F_IS_BACK) ? 
&h1s->req : &h1s->res); + struct sedesc *sdo = NULL; + size_t total = 0, try = 0; + int ret = 0; + + TRACE_ENTER(H1_EV_STRM_RECV, h1c->conn, h1s, 0, (size_t[]){count}); + + if (h1m->state != H1_MSG_DATA && h1m->state != H1_MSG_TUNNEL) { + h1c->flags &= ~H1C_F_WANT_FASTFWD; + TRACE_STATE("Cannot fast-forward data now !(msg_data|msg_tunnel)", H1_EV_STRM_RECV, h1c->conn, h1s); + goto end; + } + + se_fl_clr(h1s->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + h1c->conn->flags &= ~CO_FL_WAIT_ROOM; + h1c->flags |= H1C_F_WANT_FASTFWD; + + if (h1c->flags & (H1C_F_EOS|H1C_F_ERROR)) { + h1c->flags &= ~H1C_F_WANT_FASTFWD; + TRACE_DEVEL("leaving on (EOS|ERROR)", H1_EV_STRM_RECV, h1c->conn, h1s); + goto end; + } + + sdo = h1s_opposite_sd(h1s); + if (!sdo) { + TRACE_STATE("Opposite endpoint not available yet", H1_EV_STRM_RECV, h1c->conn, h1s); + goto out; + } + + retry: + ret = 0; + + if (h1m->state == H1_MSG_DATA && (h1m->flags & (H1_MF_CHNK|H1_MF_CLEN)) && count > h1m->curr_len) + count = h1m->curr_len; + + try = se_nego_ff(sdo, &h1c->ibuf, count, h1c->conn->xprt->rcv_pipe && !!(flags & CO_RFL_MAY_SPLICE) && !(sdo->iobuf.flags & IOBUF_FL_NO_SPLICING)); + if (b_room(&h1c->ibuf) && (h1c->flags & H1C_F_IN_FULL)) { + h1c->flags &= ~H1C_F_IN_FULL; + TRACE_STATE("h1c ibuf not full anymore", H1_EV_STRM_RECV|H1_EV_H1C_BLK); + } + if (!b_data(&h1c->ibuf)) + h1_release_buf(h1c, &h1c->ibuf); + + if (sdo->iobuf.flags & IOBUF_FL_NO_FF) { + /* Fast forwarding is not supported by the consumer */ + h1c->flags = (h1c->flags & ~H1C_F_WANT_FASTFWD) | H1C_F_CANT_FASTFWD; + TRACE_DEVEL("Fast-forwarding not supported by opposite endpoint, disable it", H1_EV_STRM_RECV, h1c->conn, h1s); + goto end; + } + if (sdo->iobuf.flags & IOBUF_FL_FF_BLOCKED) { + se_fl_set(h1s->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + TRACE_STATE("waiting for more room", H1_EV_STRM_RECV|H1_EV_H1S_ERR, h1c->conn, h1s); + goto out; + } + + total += sdo->iobuf.data; + count -= sdo->iobuf.data; +#if defined(USE_LINUX_SPLICE) + if (sdo->iobuf.pipe) { + /* Here, no data was xferred */ + ret = h1c->conn->xprt->rcv_pipe(h1c->conn, h1c->conn->xprt_ctx, sdo->iobuf.pipe, try); + if (ret < 0) { + h1c->flags = (h1c->flags & ~H1C_F_WANT_FASTFWD) | H1C_F_CANT_FASTFWD; + TRACE_ERROR("Error when trying to fast-forward data, disable it and abort", + H1_EV_STRM_RECV|H1_EV_STRM_ERR|H1_EV_H1C_ERR|H1_EV_H1S_ERR, h1c->conn, h1s); + BUG_ON(sdo->iobuf.pipe->data); + put_pipe(sdo->iobuf.pipe); + sdo->iobuf.pipe = NULL; + goto end; + } + total += ret; + count -= ret; + if (!ret) { + TRACE_STATE("failed to receive data, subscribing", H1_EV_STRM_RECV, h1c->conn); + h1c->conn->xprt->subscribe(h1c->conn, h1c->conn->xprt_ctx, SUB_RETRY_RECV, &h1c->wait_event); + } + HA_ATOMIC_ADD(&h1c->px_counters->spliced_bytes_in, ret); + } +#endif + if (!sdo->iobuf.pipe) { + b_add(sdo->iobuf.buf, sdo->iobuf.offset); + ret = h1c->conn->xprt->rcv_buf(h1c->conn, h1c->conn->xprt_ctx, sdo->iobuf.buf, try, flags); + if (ret < try) { + TRACE_STATE("failed to receive data, subscribing", H1_EV_STRM_RECV, h1c->conn); + h1c->conn->xprt->subscribe(h1c->conn, h1c->conn->xprt_ctx, SUB_RETRY_RECV, &h1c->wait_event); + } + b_sub(sdo->iobuf.buf, sdo->iobuf.offset); + total += ret; + count -= ret; + sdo->iobuf.data += ret; + } + + /* So far we have forwarded less than a buffer, so we can immediately retry + * to fast-forward more data. Instruct the consumer it is an interim + * fast-forward. 
It is of course only possible if there is still data to + * fast-forward (count > 0), if the previous attempt was a full success + * (0 < ret == try) and if we are not splicing (iobuf.buf != NULL). + */ + if (ret > 0 && ret == try && count && sdo->iobuf.buf && total < b_size(sdo->iobuf.buf)) { + sdo->iobuf.flags |= IOBUF_FL_INTERIM_FF; + se_done_ff(sdo); + goto retry; + } + + out: + if (h1m->state == H1_MSG_DATA && (h1m->flags & (H1_MF_CHNK|H1_MF_CLEN))) { + if (total > h1m->curr_len) { + h1s->flags |= H1S_F_PARSING_ERROR; + se_fl_set(h1s->sd, SE_FL_ERROR); + TRACE_ERROR("too much payload, more than announced", + H1_EV_STRM_RECV|H1_EV_STRM_ERR|H1_EV_H1C_ERR|H1_EV_H1S_ERR, h1c->conn, h1s); + goto end; + } + h1m->curr_len -= total; + if (!h1m->curr_len) { + if (h1m->flags & H1_MF_CLEN) { + h1m->state = H1_MSG_DONE; + se_fl_set(h1s->sd, SE_FL_EOI); /* TODO: this line is tricky and must be evaluated first + * Its purpose is to avoid setting CO_SFL_MSG_MORE on the + * next calls to ->complete_fastfwd(). + */ + } + else + h1m->state = H1_MSG_CHUNK_CRLF; + h1c->flags &= ~H1C_F_WANT_FASTFWD; + + if (!(h1c->flags & H1C_F_IS_BACK)) { + /* The request was fully received. It means the H1S now + * expects data from the opposite side + */ + se_expect_data(h1s->sd); + } + + TRACE_STATE("payload fully received", H1_EV_STRM_RECV, h1c->conn, h1s); + } + } + + if (conn_xprt_read0_pending(h1c->conn)) { + se_fl_set(h1s->sd, SE_FL_EOS); + TRACE_STATE("report EOS to SE", H1_EV_STRM_RECV, h1c->conn, h1s); + if (h1m->state >= H1_MSG_DONE || !(h1m->flags & H1_MF_XFER_LEN)) { + /* DONE or TUNNEL or SHUTR without XFER_LEN, set + * EOI on the stream connector */ + se_fl_set(h1s->sd, SE_FL_EOI); + TRACE_STATE("report EOI to SE", H1_EV_STRM_RECV, h1c->conn, h1s); + } + else { + se_fl_set(h1s->sd, SE_FL_ERROR); + h1c->flags = (h1c->flags & ~H1C_F_WANT_FASTFWD) | H1C_F_ERROR; + TRACE_ERROR("message aborted, set error on SC", H1_EV_STRM_RECV|H1_EV_H1S_ERR, h1c->conn, h1s); + } + h1c->flags = (h1c->flags & ~H1C_F_WANT_FASTFWD) | H1C_F_EOS; + TRACE_STATE("Allow xprt rcv_buf on read0", H1_EV_STRM_RECV, h1c->conn, h1s); + } + if (h1c->conn->flags & CO_FL_ERROR) { + se_fl_set(h1s->sd, SE_FL_ERROR); + h1c->flags = (h1c->flags & ~H1C_F_WANT_FASTFWD) | H1C_F_ERROR; + TRACE_DEVEL("connection error", H1_EV_STRM_ERR|H1_EV_H1C_ERR|H1_EV_H1S_ERR, h1c->conn, h1s); + } + + + sdo->iobuf.flags &= ~IOBUF_FL_INTERIM_FF; + if (se_fl_test(h1s->sd, SE_FL_EOI)) { + sdo->iobuf.flags |= IOBUF_FL_EOI; /* TODO: it may be good to have a flag to be sure we can + * forward the EOI to the consumer side + */ + } + se_done_ff(sdo); + + ret = total; + HA_ATOMIC_ADD(&h1c->px_counters->bytes_in, total); + + if (sdo->iobuf.pipe) { + se_fl_set(h1s->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + } + + end: + + if (!(h1c->flags & H1C_F_WANT_FASTFWD)) { + TRACE_STATE("notify the mux can't use fast-forward anymore", H1_EV_STRM_RECV, h1c->conn, h1s); + se_fl_clr(h1s->sd, SE_FL_MAY_FASTFWD_PROD); + if (!(h1c->wait_event.events & SUB_RETRY_RECV)) { + TRACE_STATE("restart receiving data, subscribing", H1_EV_STRM_RECV, h1c->conn, h1s); + h1c->conn->xprt->subscribe(h1c->conn, h1c->conn->xprt_ctx, SUB_RETRY_RECV, &h1c->wait_event); + } + } + + TRACE_LEAVE(H1_EV_STRM_RECV, h1c->conn, h1s, 0, (size_t[]){ret}); + return ret; +} + +static int h1_resume_fastfwd(struct stconn *sc, unsigned int flags) +{ + struct h1s *h1s = __sc_mux_strm(sc); + struct h1c *h1c = h1s->h1c; + int ret = 0; + + TRACE_ENTER(H1_EV_STRM_SEND, h1c->conn, h1s, 0, (size_t[]){flags}); + +#if 
defined(USE_LINUX_SPLICE) + if (h1s->sd->iobuf.pipe) { + struct h1m *h1m = (!(h1c->flags & H1C_F_IS_BACK) ? &h1s->res : &h1s->req); + struct sedesc *sd = h1s->sd; + + ret = h1c->conn->xprt->snd_pipe(h1c->conn, h1c->conn->xprt_ctx, sd->iobuf.pipe, sd->iobuf.pipe->data); + if (ret > 0) + HA_ATOMIC_ADD(&h1c->px_counters->spliced_bytes_out, ret); + if (!sd->iobuf.pipe->data) { + put_pipe(sd->iobuf.pipe); + sd->iobuf.pipe = NULL; + } + + h1m->curr_len -= ret; + + if (!h1m->curr_len && (h1m->flags & H1_MF_CLEN)) + h1m->state = H1_MSG_DONE; + else if (!h1m->curr_len && (h1m->flags & H1_MF_CHNK)) { + if (h1m->state == H1_MSG_DATA) + h1m->state = H1_MSG_CHUNK_CRLF; + } + + HA_ATOMIC_ADD(&h1c->px_counters->bytes_out, ret); + } +#endif + + // TODO: should we call h1_process() instead ? + if (h1c->conn->flags & CO_FL_ERROR) { + h1c->flags = (h1c->flags & ~H1C_F_WANT_FASTFWD) | H1C_F_ERR_PENDING; + if (h1c->flags & H1C_F_EOS) + h1c->flags |= H1C_F_ERROR; + else if (!(h1c->wait_event.events & SUB_RETRY_RECV)) { + /* EOS not seen, so subscribe for reads to be able to + * catch the error on the reading path. It is especially + * important if EOI was reached. + */ + h1c->conn->xprt->subscribe(h1c->conn, h1c->conn->xprt_ctx, SUB_RETRY_RECV, &h1c->wait_event); + } + se_fl_set_error(h1s->sd); + TRACE_DEVEL("connection error", H1_EV_STRM_ERR|H1_EV_H1C_ERR|H1_EV_H1S_ERR, h1c->conn, h1s); + } + + TRACE_LEAVE(H1_EV_STRM_RECV, h1c->conn, h1s, 0, (size_t[]){ret}); + return ret; +} + +static int h1_ctl(struct connection *conn, enum mux_ctl_type mux_ctl, void *output) +{ + struct h1c *h1c = conn->ctx; + int ret = 0; + + switch (mux_ctl) { + case MUX_CTL_STATUS: + if (!(conn->flags & CO_FL_WAIT_XPRT)) + ret |= MUX_STATUS_READY; + return ret; + case MUX_CTL_EXIT_STATUS: + if (output) + *((int *)output) = h1c->errcode; + ret = (h1c->errcode == 408 ? MUX_ES_TOUT_ERR : + (h1c->errcode == 501 ? MUX_ES_NOTIMPL_ERR : + (h1c->errcode == 500 ? MUX_ES_INTERNAL_ERR : + ((h1c->errcode >= 400 && h1c->errcode <= 499) ? MUX_ES_INVALID_ERR : + MUX_ES_SUCCESS)))); + return ret; + case MUX_CTL_SUBS_RECV: + if (!(h1c->wait_event.events & SUB_RETRY_RECV)) + h1c->conn->xprt->subscribe(h1c->conn, h1c->conn->xprt_ctx, SUB_RETRY_RECV, &h1c->wait_event); + return 0; + default: + return -1; + } +} + +static int h1_sctl(struct stconn *sc, enum mux_sctl_type mux_sctl, void *output) +{ + int ret = 0; + struct h1s *h1s = __sc_mux_strm(sc); + + switch (mux_sctl) { + case MUX_SCTL_SID: + if (output) + *((int64_t *)output) = h1s->h1c->req_count; + return ret; + + default: + return -1; + } +} + +/* appends some info about connection <h1c> to buffer <msg>, or does nothing if + * <h1c> is NULL. Returns non-zero if the connection is considered suspicious. + * May emit multiple lines, each new one being prefixed with <pfx>, if <pfx> is + * not NULL, otherwise a single line is used. + */ +static int h1_dump_h1c_info(struct buffer *msg, struct h1c *h1c, const char *pfx) +{ + int ret = 0; + + if (!h1c) + return ret; + + chunk_appendf(msg, " h1c.flg=0x%x .sub=%d .ibuf=%u@%p+%u/%u .obuf=%u@%p+%u/%u", + h1c->flags, h1c->wait_event.events, + (unsigned int)b_data(&h1c->ibuf), b_orig(&h1c->ibuf), + (unsigned int)b_head_ofs(&h1c->ibuf), (unsigned int)b_size(&h1c->ibuf), + (unsigned int)b_data(&h1c->obuf), b_orig(&h1c->obuf), + (unsigned int)b_head_ofs(&h1c->obuf), (unsigned int)b_size(&h1c->obuf)); + + chunk_appendf(msg, " .task=%p", h1c->task); + if (h1c->task) { + chunk_appendf(msg, " .exp=%s", + h1c->task->expire ? tick_is_expired(h1c->task->expire, now_ms) ? 
"<PAST>" : + human_time(TICKS_TO_MS(h1c->task->expire - now_ms), TICKS_TO_MS(1000)) : "<NEVER>"); + } + + return ret; +} + +/* appends some info about stream <h1s> to buffer <msg>, or does nothing if + * <h1s> is NULL. Returns non-zero if the stream is considered suspicious. May + * emit multiple lines, each new one being prefixed with <pfx>, if <pfx> is not + * NULL, otherwise a single line is used. + */ +static int h1_dump_h1s_info(struct buffer *msg, const struct h1s *h1s, const char *pfx) +{ + const char *method; + int ret = 0; + + if (!h1s) + return ret; + + if (h1s->meth < HTTP_METH_OTHER) + method = http_known_methods[h1s->meth].ptr; + else + method = "UNKNOWN"; + + chunk_appendf(msg, " h1s=%p h1s.flg=0x%x .sd.flg=0x%x .req.state=%s .res.state=%s", + h1s, h1s->flags, se_fl_get(h1s->sd), + h1m_state_str(h1s->req.state), h1m_state_str(h1s->res.state)); + + if (pfx) + chunk_appendf(msg, "\n%s", pfx); + + chunk_appendf(msg, " .meth=%s status=%d", + method, h1s->status); + + chunk_appendf(msg, " .sd.flg=0x%08x", se_fl_get(h1s->sd)); + if (!se_fl_test(h1s->sd, SE_FL_ORPHAN)) + chunk_appendf(msg, " .sc.flg=0x%08x .sc.app=%p", + h1s_sc(h1s)->flags, h1s_sc(h1s)->app); + + if (pfx && h1s->subs) + chunk_appendf(msg, "\n%s", pfx); + + chunk_appendf(msg, " .subs=%p", h1s->subs); + if (h1s->subs) { + chunk_appendf(msg, "(ev=%d tl=%p", h1s->subs->events, h1s->subs->tasklet); + chunk_appendf(msg, " tl.calls=%d tl.ctx=%p tl.fct=", + h1s->subs->tasklet->calls, + h1s->subs->tasklet->context); + if (h1s->subs->tasklet->calls >= 1000000) + ret = 1; + resolve_sym_name(msg, NULL, h1s->subs->tasklet->process); + chunk_appendf(msg, ")"); + } + return ret; +} + +/* for debugging with CLI's "show fd" command */ +static int h1_show_fd(struct buffer *msg, struct connection *conn) +{ + struct h1c *h1c = conn->ctx; + struct h1s *h1s = h1c->h1s; + int ret = 0; + + ret |= h1_dump_h1c_info(msg, h1c, NULL); + + if (h1s) + ret |= h1_dump_h1s_info(msg, h1s, NULL); + + return ret; +} + +/* for debugging with CLI's "show sess" command. May emit multiple lines, each + * new one being prefixed with <pfx>, if <pfx> is not NULL, otherwise a single + * line is used. Each field starts with a space so it's safe to print it after + * existing fields. + */ +static int h1_show_sd(struct buffer *msg, struct sedesc *sd, const char *pfx) +{ + struct h1s *h1s = sd->se; + int ret = 0; + + if (!h1s) + return ret; + + ret |= h1_dump_h1s_info(msg, h1s, pfx); + if (pfx) + chunk_appendf(msg, "\n%s", pfx); + chunk_appendf(msg, " h1c=%p", h1s->h1c); + ret |= h1_dump_h1c_info(msg, h1s->h1c, pfx); + return ret; +} + + +/* Add an entry in the headers map. Returns -1 on error and 0 on success. 
 */ +static int add_hdr_case_adjust(const char *from, const char *to, char **err) +{ + struct h1_hdr_entry *entry; + + /* Be sure there is a non-empty <to> */ + if (!strlen(to)) { + memprintf(err, "expect <to>"); + return -1; + } + + /* Be sure only the case differs between <from> and <to> */ + if (strcasecmp(from, to) != 0) { + memprintf(err, "<from> and <to> must not differ except in case"); + return -1; + } + + /* Be sure <from> does not already exist in the tree */ + if (ebis_lookup(&hdrs_map.map, from)) { + memprintf(err, "duplicate entry '%s'", from); + return -1; + } + + /* Create the entry and insert it in the tree */ + entry = malloc(sizeof(*entry)); + if (!entry) { + memprintf(err, "out of memory"); + return -1; + } + + entry->node.key = strdup(from); + entry->name = ist(strdup(to)); + if (!entry->node.key || !isttest(entry->name)) { + free(entry->node.key); + istfree(&entry->name); + free(entry); + memprintf(err, "out of memory"); + return -1; + } + ebis_insert(&hdrs_map.map, &entry->node); + return 0; +} + +/* Migrate the connection to the current thread. + * Return 0 if successful, non-zero otherwise. + * Expected to be called with the old thread lock held. + */ +static int h1_takeover(struct connection *conn, int orig_tid) +{ + struct h1c *h1c = conn->ctx; + struct task *task; + struct task *new_task; + struct tasklet *new_tasklet; + + /* Pre-allocate tasks so that we don't have to roll back after the xprt + * has been migrated. + */ + new_task = task_new_here(); + new_tasklet = tasklet_new(); + if (!new_task || !new_tasklet) + goto fail; + + if (fd_takeover(conn->handle.fd, conn) != 0) + goto fail; + + if (conn->xprt->takeover && conn->xprt->takeover(conn, conn->xprt_ctx, orig_tid) != 0) { + /* We failed to take over the xprt; even if the connection may + * still be valid, flag it as error'd, as we have already + * taken over the fd, and wake the tasklet, so that it will + * destroy it. + */ + conn->flags |= CO_FL_ERROR; + tasklet_wakeup_on(h1c->wait_event.tasklet, orig_tid); + goto fail; + } + + if (h1c->wait_event.events) + h1c->conn->xprt->unsubscribe(h1c->conn, h1c->conn->xprt_ctx, + h1c->wait_event.events, &h1c->wait_event); + + task = h1c->task; + if (task) { + /* only assign a task if there was already one, otherwise + * the preallocated new task will be released. + */ + task->context = NULL; + h1c->task = NULL; + __ha_barrier_store(); + task_kill(task); + + h1c->task = new_task; + new_task = NULL; + h1c->task->process = h1_timeout_task; + h1c->task->context = h1c; + } + + /* To let the tasklet know it should free itself, and do nothing else, + * set its context to NULL. 
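 * The tasklet woken on <orig_tid> is then expected to do nothing but free + * itself, while the freshly allocated <new_tasklet>, installed just below, + * takes over I/O handling on the new thread (editor's note derived from the + * code below).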
 + */ + h1c->wait_event.tasklet->context = NULL; + tasklet_wakeup_on(h1c->wait_event.tasklet, orig_tid); + + h1c->wait_event.tasklet = new_tasklet; + h1c->wait_event.tasklet->process = h1_io_cb; + h1c->wait_event.tasklet->context = h1c; + h1c->conn->xprt->subscribe(h1c->conn, h1c->conn->xprt_ctx, + SUB_RETRY_RECV, &h1c->wait_event); + + if (new_task) + __task_free(new_task); + return 0; + fail: + if (new_task) + __task_free(new_task); + tasklet_free(new_tasklet); + return -1; +} + + +static void h1_headers_case_adjust_deinit() +{ + struct ebpt_node *node, *next; + struct h1_hdr_entry *entry; + + node = ebpt_first(&hdrs_map.map); + while (node) { + next = ebpt_next(node); + ebpt_delete(node); + entry = container_of(node, struct h1_hdr_entry, node); + free(entry->node.key); + istfree(&entry->name); + free(entry); + node = next; + } + free(hdrs_map.name); +} + +static int cfg_h1_headers_case_adjust_postparser() +{ + FILE *file = NULL; + char *c, *key_beg, *key_end, *value_beg, *value_end; + char *err; + int rc, line = 0, err_code = 0; + + if (!hdrs_map.name) + goto end; + + file = fopen(hdrs_map.name, "r"); + if (!file) { + ha_alert("h1-headers-case-adjust-file '%s': failed to open file.\n", + hdrs_map.name); + err_code |= ERR_ALERT | ERR_FATAL; + goto end; + } + + /* now parse all lines. The file may only contain two header names per + * line, separated by spaces. All leading and trailing spaces will be + * ignored. Lines starting with a # are ignored. + */ + while (fgets(trash.area, trash.size, file) != NULL) { + line++; + c = trash.area; + + /* strip leading spaces and tabs */ + while (*c == ' ' || *c == '\t') + c++; + + /* ignore empty lines, or lines beginning with a sharp ('#') */ + if (*c == '#' || *c == '\0' || *c == '\r' || *c == '\n') + continue; + + /* look for the end of the key */ + key_beg = c; + while (*c != '\0' && *c != ' ' && *c != '\t' && *c != '\n' && *c != '\r') + c++; + key_end = c; + + /* strip middle spaces and tabs */ + while (*c == ' ' || *c == '\t') + c++; + + /* look for the end of the value, it is the end of the line */ + value_beg = c; + while (*c && *c != '\n' && *c != '\r') + c++; + value_end = c; + + /* trim possibly trailing spaces and tabs */ + while (value_end > value_beg && (value_end[-1] == ' ' || value_end[-1] == '\t')) + value_end--; + + /* set final \0 and check entries */ + *key_end = '\0'; + *value_end = '\0'; + + err = NULL; + rc = add_hdr_case_adjust(key_beg, value_beg, &err); + if (rc < 0) { + ha_alert("h1-headers-case-adjust-file '%s' : %s at line %d.\n", + hdrs_map.name, err, line); + err_code |= ERR_ALERT | ERR_FATAL; + free(err); + goto end; + } + if (rc > 0) { + ha_warning("h1-headers-case-adjust-file '%s' : %s at line %d.\n", + hdrs_map.name, err, line); + err_code |= ERR_WARN; + free(err); + } + } + + end: + if (file) + fclose(file); + hap_register_post_deinit(h1_headers_case_adjust_deinit); + return err_code; +} + +/* config parser for global "h1-accept-payload-with-any-method" */ +static int cfg_parse_h1_accept_payload_with_any_method(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(0, args, err, NULL)) + return -1; + accept_payload_with_any_method = 1; + return 0; +} + + +/* config parser for global "h1-case-adjust" */ +static int cfg_parse_h1_header_case_adjust(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(2, args, err, NULL)) + return -1; + 
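 /* args[1] is <from> and args[2] is <to>; an illustrative configuration + * line (editor's example): "h1-case-adjust content-length Content-Length". + */ + 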
if (!*(args[1]) || !*(args[2])) { + memprintf(err, "'%s' expects <from> and <to> as arguments.", args[0]); + return -1; + } + return add_hdr_case_adjust(args[1], args[2], err); +} + +/* config parser for global "h1-case-adjust-file" */ +static int cfg_parse_h1_headers_case_adjust_file(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + if (!*(args[1])) { + memprintf(err, "'%s' expects <file> as argument.", args[0]); + return -1; + } + free(hdrs_map.name); + hdrs_map.name = strdup(args[1]); + return 0; +} + +/* config parser for global "tune.h1.zero-copy-fwd-recv" */ +static int cfg_parse_h1_zero_copy_fwd_rcv(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (strcmp(args[1], "on") == 0) + global.tune.no_zero_copy_fwd &= ~NO_ZERO_COPY_FWD_H1_RCV; + else if (strcmp(args[1], "off") == 0) + global.tune.no_zero_copy_fwd |= NO_ZERO_COPY_FWD_H1_RCV; + else { + memprintf(err, "'%s' expects 'on' or 'off'.", args[0]); + return -1; + } + return 0; +} + +/* config parser for global "tune.h1.zero-copy-fwd-send" */ +static int cfg_parse_h1_zero_copy_fwd_snd(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (strcmp(args[1], "on") == 0) + global.tune.no_zero_copy_fwd &= ~NO_ZERO_COPY_FWD_H1_SND; + else if (strcmp(args[1], "off") == 0) + global.tune.no_zero_copy_fwd |= NO_ZERO_COPY_FWD_H1_SND; + else { + memprintf(err, "'%s' expects 'on' or 'off'.", args[0]); + return -1; + } + return 0; +} + +/* config keyword parsers */ +static struct cfg_kw_list cfg_kws = {{ }, { + { CFG_GLOBAL, "h1-accept-payload-with-any-method", cfg_parse_h1_accept_payload_with_any_method }, + { CFG_GLOBAL, "h1-case-adjust", cfg_parse_h1_header_case_adjust }, + { CFG_GLOBAL, "h1-case-adjust-file", cfg_parse_h1_headers_case_adjust_file }, + { CFG_GLOBAL, "tune.h1.zero-copy-fwd-recv", cfg_parse_h1_zero_copy_fwd_rcv }, + { CFG_GLOBAL, "tune.h1.zero-copy-fwd-send", cfg_parse_h1_zero_copy_fwd_snd }, + { 0, NULL, NULL }, + } +}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); +REGISTER_CONFIG_POSTPARSER("h1-headers-map", cfg_h1_headers_case_adjust_postparser); + + +/****************************************/ +/* MUX initialization and instantiation */ +/****************************************/ + +/* The mux operations */ +static const struct mux_ops mux_http_ops = { + .init = h1_init, + .wake = h1_wake, + .attach = h1_attach, + .get_first_sc = h1_get_first_sc, + .detach = h1_detach, + .destroy = h1_destroy, + .avail_streams = h1_avail_streams, + .used_streams = h1_used_streams, + .rcv_buf = h1_rcv_buf, + .snd_buf = h1_snd_buf, + .nego_fastfwd = h1_nego_ff, + .done_fastfwd = h1_done_ff, + .fastfwd = h1_fastfwd, + .resume_fastfwd = h1_resume_fastfwd, + .subscribe = h1_subscribe, + .unsubscribe = h1_unsubscribe, + .shutr = h1_shutr, + .shutw = h1_shutw, + .show_fd = h1_show_fd, + .show_sd = h1_show_sd, + .ctl = h1_ctl, + .sctl = h1_sctl, + .takeover = h1_takeover, + .flags = MX_FL_HTX, + .name = "H1", +}; + +static const struct mux_ops mux_h1_ops = { + .init = h1_init, + .wake = h1_wake, + .attach = h1_attach, + .get_first_sc = h1_get_first_sc, + .detach = h1_detach, + .destroy = h1_destroy, + .avail_streams = h1_avail_streams, 
 + .used_streams = h1_used_streams, + .rcv_buf = h1_rcv_buf, + .snd_buf = h1_snd_buf, + .nego_fastfwd = h1_nego_ff, + .done_fastfwd = h1_done_ff, + .fastfwd = h1_fastfwd, + .resume_fastfwd = h1_resume_fastfwd, + .subscribe = h1_subscribe, + .unsubscribe = h1_unsubscribe, + .shutr = h1_shutr, + .shutw = h1_shutw, + .show_fd = h1_show_fd, + .show_sd = h1_show_sd, + .ctl = h1_ctl, + .sctl = h1_sctl, + .takeover = h1_takeover, + .flags = MX_FL_HTX|MX_FL_NO_UPG, + .name = "H1", +}; + +/* this mux registers the default HTX proto but also the h1 proto (to be referenced in the conf) */ +static struct mux_proto_list mux_proto_h1 = + { .token = IST("h1"), .mode = PROTO_MODE_HTTP, .side = PROTO_SIDE_BOTH, .mux = &mux_h1_ops }; +static struct mux_proto_list mux_proto_http = + { .token = IST(""), .mode = PROTO_MODE_HTTP, .side = PROTO_SIDE_BOTH, .mux = &mux_http_ops }; + +INITCALL1(STG_REGISTER, register_mux_proto, &mux_proto_h1); +INITCALL1(STG_REGISTER, register_mux_proto, &mux_proto_http); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/mux_h2.c b/src/mux_h2.c new file mode 100644 index 0000000..273e1f5 --- /dev/null +++ b/src/mux_h2.c @@ -0,0 +1,7598 @@ +/* + * HTTP/2 mux-demux for connections + * + * Copyright 2017 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <import/eb32tree.h> +#include <import/ebmbtree.h> +#include <haproxy/api.h> +#include <haproxy/cfgparse.h> +#include <haproxy/connection.h> +#include <haproxy/dynbuf.h> +#include <haproxy/h2.h> +#include <haproxy/hpack-dec.h> +#include <haproxy/hpack-enc.h> +#include <haproxy/hpack-tbl.h> +#include <haproxy/http_htx.h> +#include <haproxy/htx.h> +#include <haproxy/istbuf.h> +#include <haproxy/log.h> +#include <haproxy/mux_h2-t.h> +#include <haproxy/net_helper.h> +#include <haproxy/proxy.h> +#include <haproxy/server.h> +#include <haproxy/session-t.h> +#include <haproxy/stats.h> +#include <haproxy/stconn.h> +#include <haproxy/stream.h> +#include <haproxy/trace.h> +#include <haproxy/xref.h> + + +/* dummy streams returned for the closed, error, refused and idle states */ +static const struct h2s *h2_closed_stream; +static const struct h2s *h2_error_stream; +static const struct h2s *h2_refused_stream; +static const struct h2s *h2_idle_stream; + + +/**** H2 connection descriptor ****/ +struct h2c { + struct connection *conn; + + enum h2_cs st0; /* mux state */ + enum h2_err errcode; /* H2 err code (H2_ERR_*) */ + + /* 16 bit hole here */ + uint32_t flags; /* connection flags: H2_CF_* */ + uint32_t streams_limit; /* maximum number of concurrent streams the peer supports */ + int32_t max_id; /* highest ID known on this connection, <0 before preface */ + uint32_t rcvd_c; /* newly received data to ACK for the connection */ + uint32_t rcvd_s; /* newly received data to ACK for the current stream (dsi) or zero */ + + /* states for the demux direction */ + struct hpack_dht *ddht; /* demux dynamic header table */ + struct buffer dbuf; /* demux buffer */ + + int32_t dsi; /* demux stream ID (<0 = idle) */ + int32_t dfl; /* demux frame length (if dsi >= 0) */ + int8_t dft; /* demux frame type (if dsi >= 0) */ + int8_t dff; /* demux frame flags (if dsi >= 0) */ + uint8_t dpl; /* demux pad length (part of dfl), init to 0 */ + /* 8 bit hole here */ + int32_t last_sid 
/* last processed stream ID for GOAWAY, <0 before preface */ + + /* states for the mux direction */ + struct buffer mbuf[H2C_MBUF_CNT]; /* mux buffers (ring) */ + int32_t miw; /* mux initial window size for all new streams */ + int32_t mws; /* mux window size. Can be negative. */ + int32_t mfs; /* mux's max frame size */ + + int timeout; /* idle timeout duration in ticks */ + int shut_timeout; /* idle timeout duration in ticks after GOAWAY was sent */ + int idle_start; /* date of the last time the connection went idle (no stream + empty mbuf), or the start of current http req */ + /* 32-bit hole here */ + unsigned int nb_streams; /* number of streams in the tree */ + unsigned int nb_sc; /* number of attached stream connectors */ + unsigned int nb_reserved; /* number of reserved streams */ + unsigned int stream_cnt; /* total number of streams seen */ + struct proxy *proxy; /* the proxy this connection was created for */ + struct task *task; /* timeout management task */ + struct h2_counters *px_counters; /* h2 counters attached to proxy */ + struct eb_root streams_by_id; /* all active streams by their ID */ + struct list send_list; /* list of blocked streams requesting to send */ + struct list fctl_list; /* list of streams blocked by connection's fctl */ + struct list blocked_list; /* list of streams blocked for other reasons (e.g. sfctl, dep) */ + struct buffer_wait buf_wait; /* wait list for buffer allocations */ + struct wait_event wait_event; /* To be used if we're waiting for I/Os */ +}; + + +/* H2 stream descriptor, describing the stream as it appears in the H2C, and as + * it is being processed in the internal HTTP representation (HTX). + */ +struct h2s { + struct sedesc *sd; + struct session *sess; + struct h2c *h2c; + struct eb32_node by_id; /* place in h2c's streams_by_id */ + int32_t id; /* stream ID */ + uint32_t flags; /* H2_SF_* */ + int sws; /* stream window size, to be added to the mux's initial window size */ + enum h2_err errcode; /* H2 err code (H2_ERR_*) */ + enum h2_ss st; + uint16_t status; /* HTTP response status */ + unsigned long long body_len; /* remaining body length according to content-length if H2_SF_DATA_CLEN */ + struct buffer rxbuf; /* receive buffer, always valid (buf_empty or real buffer) */ + struct wait_event *subs; /* recv wait_event the stream connector associated is waiting on (via h2_subscribe) */ + struct list list; /* To be used when adding in h2c->send_list or h2c->fctl_list */ + struct tasklet *shut_tl; /* deferred shutdown tasklet, to retry to send an RST after we failed to, + * in case there's no other subscription to do it */ + + char upgrade_protocol[16]; /* rfc 8441: requested protocol on Extended CONNECT */ +}; + +/* descriptor for an h2 frame header */ +struct h2_fh { + uint32_t len; /* length, host order, 24 bits */ + uint32_t sid; /* stream id, host order, 31 bits */ + uint8_t ft; /* frame type */ + uint8_t ff; /* frame flags */ +}; + +/* trace source and events */ +static void h2_trace(enum trace_level level, uint64_t mask, \ + const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4); + +/* The event representation is split like this : + * strm - application layer + * h2s - internal H2 stream + * h2c - internal H2 connection + * conn - external connection + * + */ +static const struct trace_event h2_trace_events[] = { +#define H2_EV_H2C_NEW (1ULL << 0) + { .mask = H2_EV_H2C_NEW, .name = "h2c_new", .desc = "new H2 connection" }, +#define 
H2_EV_H2C_RECV (1ULL << 1) + { .mask = H2_EV_H2C_RECV, .name = "h2c_recv", .desc = "Rx on H2 connection" }, +#define H2_EV_H2C_SEND (1ULL << 2) + { .mask = H2_EV_H2C_SEND, .name = "h2c_send", .desc = "Tx on H2 connection" }, +#define H2_EV_H2C_FCTL (1ULL << 3) + { .mask = H2_EV_H2C_FCTL, .name = "h2c_fctl", .desc = "H2 connection flow-controlled" }, +#define H2_EV_H2C_BLK (1ULL << 4) + { .mask = H2_EV_H2C_BLK, .name = "h2c_blk", .desc = "H2 connection blocked" }, +#define H2_EV_H2C_WAKE (1ULL << 5) + { .mask = H2_EV_H2C_WAKE, .name = "h2c_wake", .desc = "H2 connection woken up" }, +#define H2_EV_H2C_END (1ULL << 6) + { .mask = H2_EV_H2C_END, .name = "h2c_end", .desc = "H2 connection terminated" }, +#define H2_EV_H2C_ERR (1ULL << 7) + { .mask = H2_EV_H2C_ERR, .name = "h2c_err", .desc = "error on H2 connection" }, +#define H2_EV_RX_FHDR (1ULL << 8) + { .mask = H2_EV_RX_FHDR, .name = "rx_fhdr", .desc = "H2 frame header received" }, +#define H2_EV_RX_FRAME (1ULL << 9) + { .mask = H2_EV_RX_FRAME, .name = "rx_frame", .desc = "receipt of any H2 frame" }, +#define H2_EV_RX_EOI (1ULL << 10) + { .mask = H2_EV_RX_EOI, .name = "rx_eoi", .desc = "receipt of end of H2 input (ES or RST)" }, +#define H2_EV_RX_PREFACE (1ULL << 11) + { .mask = H2_EV_RX_PREFACE, .name = "rx_preface", .desc = "receipt of H2 preface" }, +#define H2_EV_RX_DATA (1ULL << 12) + { .mask = H2_EV_RX_DATA, .name = "rx_data", .desc = "receipt of H2 DATA frame" }, +#define H2_EV_RX_HDR (1ULL << 13) + { .mask = H2_EV_RX_HDR, .name = "rx_hdr", .desc = "receipt of H2 HEADERS frame" }, +#define H2_EV_RX_PRIO (1ULL << 14) + { .mask = H2_EV_RX_PRIO, .name = "rx_prio", .desc = "receipt of H2 PRIORITY frame" }, +#define H2_EV_RX_RST (1ULL << 15) + { .mask = H2_EV_RX_RST, .name = "rx_rst", .desc = "receipt of H2 RST_STREAM frame" }, +#define H2_EV_RX_SETTINGS (1ULL << 16) + { .mask = H2_EV_RX_SETTINGS, .name = "rx_settings", .desc = "receipt of H2 SETTINGS frame" }, +#define H2_EV_RX_PUSH (1ULL << 17) + { .mask = H2_EV_RX_PUSH, .name = "rx_push", .desc = "receipt of H2 PUSH_PROMISE frame" }, +#define H2_EV_RX_PING (1ULL << 18) + { .mask = H2_EV_RX_PING, .name = "rx_ping", .desc = "receipt of H2 PING frame" }, +#define H2_EV_RX_GOAWAY (1ULL << 19) + { .mask = H2_EV_RX_GOAWAY, .name = "rx_goaway", .desc = "receipt of H2 GOAWAY frame" }, +#define H2_EV_RX_WU (1ULL << 20) + { .mask = H2_EV_RX_WU, .name = "rx_wu", .desc = "receipt of H2 WINDOW_UPDATE frame" }, +#define H2_EV_RX_CONT (1ULL << 21) + { .mask = H2_EV_RX_CONT, .name = "rx_cont", .desc = "receipt of H2 CONTINUATION frame" }, +#define H2_EV_TX_FRAME (1ULL << 22) + { .mask = H2_EV_TX_FRAME, .name = "tx_frame", .desc = "transmission of any H2 frame" }, +#define H2_EV_TX_EOI (1ULL << 23) + { .mask = H2_EV_TX_EOI, .name = "tx_eoi", .desc = "transmission of H2 end of input (ES or RST)" }, +#define H2_EV_TX_PREFACE (1ULL << 24) + { .mask = H2_EV_TX_PREFACE, .name = "tx_preface", .desc = "transmission of H2 preface" }, +#define H2_EV_TX_DATA (1ULL << 25) + { .mask = H2_EV_TX_DATA, .name = "tx_data", .desc = "transmission of H2 DATA frame" }, +#define H2_EV_TX_HDR (1ULL << 26) + { .mask = H2_EV_TX_HDR, .name = "tx_hdr", .desc = "transmission of H2 HEADERS frame" }, +#define H2_EV_TX_PRIO (1ULL << 27) + { .mask = H2_EV_TX_PRIO, .name = "tx_prio", .desc = "transmission of H2 PRIORITY frame" }, +#define H2_EV_TX_RST (1ULL << 28) + { .mask = H2_EV_TX_RST, .name = "tx_rst", .desc = "transmission of H2 RST_STREAM frame" }, +#define H2_EV_TX_SETTINGS (1ULL << 29) + { .mask = H2_EV_TX_SETTINGS, .name = 
"tx_settings", .desc = "transmission of H2 SETTINGS frame" }, +#define H2_EV_TX_PUSH (1ULL << 30) + { .mask = H2_EV_TX_PUSH, .name = "tx_push", .desc = "transmission of H2 PUSH_PROMISE frame" }, +#define H2_EV_TX_PING (1ULL << 31) + { .mask = H2_EV_TX_PING, .name = "tx_ping", .desc = "transmission of H2 PING frame" }, +#define H2_EV_TX_GOAWAY (1ULL << 32) + { .mask = H2_EV_TX_GOAWAY, .name = "tx_goaway", .desc = "transmission of H2 GOAWAY frame" }, +#define H2_EV_TX_WU (1ULL << 33) + { .mask = H2_EV_TX_WU, .name = "tx_wu", .desc = "transmission of H2 WINDOW_UPDATE frame" }, +#define H2_EV_TX_CONT (1ULL << 34) + { .mask = H2_EV_TX_CONT, .name = "tx_cont", .desc = "transmission of H2 CONTINUATION frame" }, +#define H2_EV_H2S_NEW (1ULL << 35) + { .mask = H2_EV_H2S_NEW, .name = "h2s_new", .desc = "new H2 stream" }, +#define H2_EV_H2S_RECV (1ULL << 36) + { .mask = H2_EV_H2S_RECV, .name = "h2s_recv", .desc = "Rx for H2 stream" }, +#define H2_EV_H2S_SEND (1ULL << 37) + { .mask = H2_EV_H2S_SEND, .name = "h2s_send", .desc = "Tx for H2 stream" }, +#define H2_EV_H2S_FCTL (1ULL << 38) + { .mask = H2_EV_H2S_FCTL, .name = "h2s_fctl", .desc = "H2 stream flow-controlled" }, +#define H2_EV_H2S_BLK (1ULL << 39) + { .mask = H2_EV_H2S_BLK, .name = "h2s_blk", .desc = "H2 stream blocked" }, +#define H2_EV_H2S_WAKE (1ULL << 40) + { .mask = H2_EV_H2S_WAKE, .name = "h2s_wake", .desc = "H2 stream woken up" }, +#define H2_EV_H2S_END (1ULL << 41) + { .mask = H2_EV_H2S_END, .name = "h2s_end", .desc = "H2 stream terminated" }, +#define H2_EV_H2S_ERR (1ULL << 42) + { .mask = H2_EV_H2S_ERR, .name = "h2s_err", .desc = "error on H2 stream" }, +#define H2_EV_STRM_NEW (1ULL << 43) + { .mask = H2_EV_STRM_NEW, .name = "strm_new", .desc = "app-layer stream creation" }, +#define H2_EV_STRM_RECV (1ULL << 44) + { .mask = H2_EV_STRM_RECV, .name = "strm_recv", .desc = "receiving data for stream" }, +#define H2_EV_STRM_SEND (1ULL << 45) + { .mask = H2_EV_STRM_SEND, .name = "strm_send", .desc = "sending data for stream" }, +#define H2_EV_STRM_FULL (1ULL << 46) + { .mask = H2_EV_STRM_FULL, .name = "strm_full", .desc = "stream buffer full" }, +#define H2_EV_STRM_WAKE (1ULL << 47) + { .mask = H2_EV_STRM_WAKE, .name = "strm_wake", .desc = "stream woken up" }, +#define H2_EV_STRM_SHUT (1ULL << 48) + { .mask = H2_EV_STRM_SHUT, .name = "strm_shut", .desc = "stream shutdown" }, +#define H2_EV_STRM_END (1ULL << 49) + { .mask = H2_EV_STRM_END, .name = "strm_end", .desc = "detaching app-layer stream" }, +#define H2_EV_STRM_ERR (1ULL << 50) + { .mask = H2_EV_STRM_ERR, .name = "strm_err", .desc = "stream error" }, +#define H2_EV_PROTO_ERR (1ULL << 51) + { .mask = H2_EV_PROTO_ERR, .name = "proto_err", .desc = "protocol error" }, + { } +}; + +static const struct name_desc h2_trace_lockon_args[4] = { + /* arg1 */ { /* already used by the connection */ }, + /* arg2 */ { .name="h2s", .desc="H2 stream" }, + /* arg3 */ { }, + /* arg4 */ { } +}; + +static const struct name_desc h2_trace_decoding[] = { +#define H2_VERB_CLEAN 1 + { .name="clean", .desc="only user-friendly stuff, generally suitable for level \"user\"" }, +#define H2_VERB_MINIMAL 2 + { .name="minimal", .desc="report only h2c/h2s state and flags, no real decoding" }, +#define H2_VERB_SIMPLE 3 + { .name="simple", .desc="add request/response status line or frame info when available" }, +#define H2_VERB_ADVANCED 4 + { .name="advanced", .desc="add header fields or frame decoding when available" }, +#define H2_VERB_COMPLETE 5 + { .name="complete", .desc="add full data dump when available" }, + { /* 
end */ } +}; + +static struct trace_source trace_h2 __read_mostly = { + .name = IST("h2"), + .desc = "HTTP/2 multiplexer", + .arg_def = TRC_ARG1_CONN, // TRACE()'s first argument is always a connection + .default_cb = h2_trace, + .known_events = h2_trace_events, + .lockon_args = h2_trace_lockon_args, + .decoding = h2_trace_decoding, + .report_events = ~0, // report everything by default +}; + +#define TRACE_SOURCE &trace_h2 +INITCALL1(STG_REGISTER, trace_register_source, TRACE_SOURCE); + +/* h2 stats module */ +enum { + H2_ST_HEADERS_RCVD, + H2_ST_DATA_RCVD, + H2_ST_SETTINGS_RCVD, + H2_ST_RST_STREAM_RCVD, + H2_ST_GOAWAY_RCVD, + + H2_ST_CONN_PROTO_ERR, + H2_ST_STRM_PROTO_ERR, + H2_ST_RST_STREAM_RESP, + H2_ST_GOAWAY_RESP, + + H2_ST_OPEN_CONN, + H2_ST_OPEN_STREAM, + H2_ST_TOTAL_CONN, + H2_ST_TOTAL_STREAM, + + H2_STATS_COUNT /* must be the last member of the enum */ +}; + +static struct name_desc h2_stats[] = { + [H2_ST_HEADERS_RCVD] = { .name = "h2_headers_rcvd", + .desc = "Total number of received HEADERS frames" }, + [H2_ST_DATA_RCVD] = { .name = "h2_data_rcvd", + .desc = "Total number of received DATA frames" }, + [H2_ST_SETTINGS_RCVD] = { .name = "h2_settings_rcvd", + .desc = "Total number of received SETTINGS frames" }, + [H2_ST_RST_STREAM_RCVD] = { .name = "h2_rst_stream_rcvd", + .desc = "Total number of received RST_STREAM frames" }, + [H2_ST_GOAWAY_RCVD] = { .name = "h2_goaway_rcvd", + .desc = "Total number of received GOAWAY frames" }, + + [H2_ST_CONN_PROTO_ERR] = { .name = "h2_detected_conn_protocol_errors", + .desc = "Total number of connection protocol errors" }, + [H2_ST_STRM_PROTO_ERR] = { .name = "h2_detected_strm_protocol_errors", + .desc = "Total number of stream protocol errors" }, + [H2_ST_RST_STREAM_RESP] = { .name = "h2_rst_stream_resp", + .desc = "Total number of RST_STREAM sent on detected error" }, + [H2_ST_GOAWAY_RESP] = { .name = "h2_goaway_resp", + .desc = "Total number of GOAWAY sent on detected error" }, + + [H2_ST_OPEN_CONN] = { .name = "h2_open_connections", + .desc = "Count of currently open connections" }, + [H2_ST_OPEN_STREAM] = { .name = "h2_backend_open_streams", + .desc = "Count of currently open streams" }, + [H2_ST_TOTAL_CONN] = { .name = "h2_total_connections", + .desc = "Total number of connections" }, + [H2_ST_TOTAL_STREAM] = { .name = "h2_backend_total_streams", + .desc = "Total number of streams" }, +}; + +static struct h2_counters { + long long headers_rcvd; /* total number of HEADERS frame received */ + long long data_rcvd; /* total number of DATA frame received */ + long long settings_rcvd; /* total number of SETTINGS frame received */ + long long rst_stream_rcvd; /* total number of RST_STREAM frame received */ + long long goaway_rcvd; /* total number of GOAWAY frame received */ + + long long conn_proto_err; /* total number of protocol errors detected */ + long long strm_proto_err; /* total number of protocol errors detected */ + long long rst_stream_resp; /* total number of RST_STREAM frame sent on error */ + long long goaway_resp; /* total number of GOAWAY frame sent on error */ + + long long open_conns; /* count of currently open connections */ + long long open_streams; /* count of currently open streams */ + long long total_conns; /* total number of connections */ + long long total_streams; /* total number of streams */ +} h2_counters; + +static void h2_fill_stats(void *data, struct field *stats) +{ + struct h2_counters *counters = data; + + stats[H2_ST_HEADERS_RCVD] = mkf_u64(FN_COUNTER, counters->headers_rcvd); + stats[H2_ST_DATA_RCVD] = 
mkf_u64(FN_COUNTER, counters->data_rcvd); + stats[H2_ST_SETTINGS_RCVD] = mkf_u64(FN_COUNTER, counters->settings_rcvd); + stats[H2_ST_RST_STREAM_RCVD] = mkf_u64(FN_COUNTER, counters->rst_stream_rcvd); + stats[H2_ST_GOAWAY_RCVD] = mkf_u64(FN_COUNTER, counters->goaway_rcvd); + + stats[H2_ST_CONN_PROTO_ERR] = mkf_u64(FN_COUNTER, counters->conn_proto_err); + stats[H2_ST_STRM_PROTO_ERR] = mkf_u64(FN_COUNTER, counters->strm_proto_err); + stats[H2_ST_RST_STREAM_RESP] = mkf_u64(FN_COUNTER, counters->rst_stream_resp); + stats[H2_ST_GOAWAY_RESP] = mkf_u64(FN_COUNTER, counters->goaway_resp); + + stats[H2_ST_OPEN_CONN] = mkf_u64(FN_GAUGE, counters->open_conns); + stats[H2_ST_OPEN_STREAM] = mkf_u64(FN_GAUGE, counters->open_streams); + stats[H2_ST_TOTAL_CONN] = mkf_u64(FN_COUNTER, counters->total_conns); + stats[H2_ST_TOTAL_STREAM] = mkf_u64(FN_COUNTER, counters->total_streams); +} + +static struct stats_module h2_stats_module = { + .name = "h2", + .fill_stats = h2_fill_stats, + .stats = h2_stats, + .stats_count = H2_STATS_COUNT, + .counters = &h2_counters, + .counters_size = sizeof(h2_counters), + .domain_flags = MK_STATS_PROXY_DOMAIN(STATS_PX_CAP_FE|STATS_PX_CAP_BE), + .clearable = 1, +}; + +INITCALL1(STG_REGISTER, stats_register_module, &h2_stats_module); + +/* the h2c connection pool */ +DECLARE_STATIC_POOL(pool_head_h2c, "h2c", sizeof(struct h2c)); + +/* the h2s stream pool */ +DECLARE_STATIC_POOL(pool_head_h2s, "h2s", sizeof(struct h2s)); + +/* The default connection window size is 65535, it may only be enlarged using + * a WINDOW_UPDATE message. Since the window must never be larger than 2G-1, + * we'll pretend we already received the difference between the two to send + * an equivalent window update to enlarge it to 2G-1. + */ +#define H2_INITIAL_WINDOW_INCREMENT ((1U<<31)-1 - 65535) + +/* maximum amount of data we're OK with re-aligning for buffer optimizations */ +#define MAX_DATA_REALIGN 1024 + +/* a few settings from the global section */ +static int h2_settings_header_table_size = 4096; /* initial value */ +static int h2_settings_initial_window_size = 65536; /* default initial value */ +static int h2_be_settings_initial_window_size = 0; /* backend's default initial value */ +static int h2_fe_settings_initial_window_size = 0; /* frontend's default initial value */ +static unsigned int h2_settings_max_concurrent_streams = 100; /* default value */ +static unsigned int h2_be_settings_max_concurrent_streams = 0; /* backend value */ +static unsigned int h2_fe_settings_max_concurrent_streams = 0; /* frontend value */ +static int h2_settings_max_frame_size = 0; /* unset */ + +/* other non-protocol settings */ +static unsigned int h2_fe_max_total_streams = 0; /* frontend value */ + +/* a dummy closed endpoint */ +static const struct sedesc closed_ep = { + .sc = NULL, + .flags = SE_FL_DETACHED, +}; + +/* a dummy closed stream */ +static const struct h2s *h2_closed_stream = &(const struct h2s){ + .sd = (struct sedesc *)&closed_ep, + .h2c = NULL, + .st = H2_SS_CLOSED, + .errcode = H2_ERR_STREAM_CLOSED, + .flags = H2_SF_RST_RCVD, + .id = 0, +}; + +/* a dummy closed stream returning a PROTOCOL_ERROR error */ +static const struct h2s *h2_error_stream = &(const struct h2s){ + .sd = (struct sedesc *)&closed_ep, + .h2c = NULL, + .st = H2_SS_CLOSED, + .errcode = H2_ERR_PROTOCOL_ERROR, + .flags = 0, + .id = 0, +}; + +/* a dummy closed stream returning a REFUSED_STREAM error */ +static const struct h2s *h2_refused_stream = &(const struct h2s){ + .sd = (struct sedesc *)&closed_ep, + .h2c = NULL, + .st = 
H2_SS_CLOSED, + .errcode = H2_ERR_REFUSED_STREAM, + .flags = 0, + .id = 0, +}; + +/* and a dummy idle stream for use with any unannounced stream */ +static const struct h2s *h2_idle_stream = &(const struct h2s){ + .sd = (struct sedesc *)&closed_ep, + .h2c = NULL, + .st = H2_SS_IDLE, + .errcode = H2_ERR_STREAM_CLOSED, + .id = 0, +}; + + +struct task *h2_timeout_task(struct task *t, void *context, unsigned int state); +static int h2_send(struct h2c *h2c); +static int h2_recv(struct h2c *h2c); +static int h2_process(struct h2c *h2c); +/* h2_io_cb is exported to see it resolved in "show fd" */ +struct task *h2_io_cb(struct task *t, void *ctx, unsigned int state); +static inline struct h2s *h2c_st_by_id(struct h2c *h2c, int id); +static int h2c_dec_hdrs(struct h2c *h2c, struct buffer *rxbuf, uint32_t *flags, unsigned long long *body_len, char *upgrade_protocol); +static int h2_frt_transfer_data(struct h2s *h2s); +struct task *h2_deferred_shut(struct task *t, void *ctx, unsigned int state); +static struct h2s *h2c_bck_stream_new(struct h2c *h2c, struct stconn *sc, struct session *sess); +static void h2s_alert(struct h2s *h2s); +static inline void h2_remove_from_list(struct h2s *h2s); + +/* returns the stconn associated to the H2 stream */ +static forceinline struct stconn *h2s_sc(const struct h2s *h2s) +{ + return h2s->sd->sc; +} + +/* the H2 traces always expect that arg1, if non-null, is of type connection + * (from which we can derive h2c), that arg2, if non-null, is of type h2s, and + * that arg3, if non-null, is either of type htx for tx headers, or of type + * buffer for everything else. + */ +static void h2_trace(enum trace_level level, uint64_t mask, const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4) +{ + const struct connection *conn = a1; + const struct h2c *h2c = conn ? conn->ctx : NULL; + const struct h2s *h2s = a2; + const struct buffer *buf = a3; + const struct htx *htx; + int pos; + + if (!h2c) // nothing to add + return; + + if (src->verbosity > H2_VERB_CLEAN) { + chunk_appendf(&trace_buf, " : h2c=%p(%c,%s)", h2c, conn_is_back(conn) ? 'B' : 'F', h2c_st_to_str(h2c->st0)); + + if (mask & H2_EV_H2C_NEW) // inside h2_init, otherwise it's hard to match conn & h2c + conn_append_debug_info(&trace_buf, conn, " : "); + + if (h2c->errcode) + chunk_appendf(&trace_buf, " err=%s/%02x", h2_err_str(h2c->errcode), h2c->errcode); + + if (h2c->flags & H2_CF_DEM_IN_PROGRESS && // frame processing has started, type and length are valid + (mask & (H2_EV_RX_FRAME|H2_EV_RX_FHDR)) == (H2_EV_RX_FRAME|H2_EV_RX_FHDR)) { + chunk_appendf(&trace_buf, " dft=%s/%02x dfl=%d", h2_ft_str(h2c->dft), h2c->dff, h2c->dfl); + } + + if (h2s) { + if (h2s->id <= 0) + chunk_appendf(&trace_buf, " dsi=%d", h2c->dsi); + if (h2s == h2_idle_stream) + chunk_appendf(&trace_buf, " h2s=IDL"); + else if (h2s != h2_closed_stream && h2s != h2_refused_stream && h2s != h2_error_stream) + chunk_appendf(&trace_buf, " h2s=%p(%d,%s)", h2s, h2s->id, h2s_st_to_str(h2s->st)); + else if (h2c->dsi > 0) // don't show that before sid is known + chunk_appendf(&trace_buf, " h2s=CLO"); + if (h2s->id && h2s->errcode) + chunk_appendf(&trace_buf, " err=%s/%02x", h2_err_str(h2s->errcode), h2s->errcode); + } + } + + /* Let's dump decoded requests and responses right after parsing. They + * are traced at level USER with a few recognizable flags. 
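 * For instance a decoded request start line would be reported roughly as + * " : [1] H2 REQ: GET / HTTP/2.0" (an illustrative example matching the + * format strings below, not an actual capture).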
+ */ + if ((mask == (H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_STRM_NEW) || + mask == (H2_EV_RX_FRAME|H2_EV_RX_HDR)) && buf) + htx = htxbuf(buf); // recv req/res + else if (mask == (H2_EV_TX_FRAME|H2_EV_TX_HDR)) + htx = a3; // send req/res + else + htx = NULL; + + if (level == TRACE_LEVEL_USER && src->verbosity != H2_VERB_MINIMAL && htx && (pos = htx_get_head(htx)) != -1) { + const struct htx_blk *blk = htx_get_blk(htx, pos); + const struct htx_sl *sl = htx_get_blk_ptr(htx, blk); + enum htx_blk_type type = htx_get_blk_type(blk); + + if (type == HTX_BLK_REQ_SL) + chunk_appendf(&trace_buf, " : [%d] H2 REQ: %.*s %.*s %.*s", + h2s ? h2s->id : h2c->dsi, + HTX_SL_P1_LEN(sl), HTX_SL_P1_PTR(sl), + HTX_SL_P2_LEN(sl), HTX_SL_P2_PTR(sl), + HTX_SL_P3_LEN(sl), HTX_SL_P3_PTR(sl)); + else if (type == HTX_BLK_RES_SL) + chunk_appendf(&trace_buf, " : [%d] H2 RES: %.*s %.*s %.*s", + h2s ? h2s->id : h2c->dsi, + HTX_SL_P1_LEN(sl), HTX_SL_P1_PTR(sl), + HTX_SL_P2_LEN(sl), HTX_SL_P2_PTR(sl), + HTX_SL_P3_LEN(sl), HTX_SL_P3_PTR(sl)); + } +} + + +/* Detect a pending read0 for a H2 connection. It happens if a read0 was + * already reported on a previous xprt->rcvbuf() AND a frame parser failed + * to parse pending data, confirming no more progress is possible because + * we're facing a truncated frame. The function returns 1 to report a read0 + * or 0 otherwise. + */ +static inline int h2c_read0_pending(struct h2c *h2c) +{ + return !!(h2c->flags & H2_CF_END_REACHED); +} + +/* returns true if the connection is allowed to expire, false otherwise. A + * connection may expire when it has no attached streams. As long as streams + * are attached, the application layer is responsible for timeout management, + * and each layer will detach when it doesn't want to wait anymore. When the + * last one leaves, the connection must take over timeout management. + */ +static inline int h2c_may_expire(const struct h2c *h2c) +{ + return !h2c->nb_sc; +} + +/* returns the number of max concurrent streams permitted on a connection, + * depending on its side (frontend or backend), falling back to the default + * h2_settings_max_concurrent_streams. It may even be zero. + */ +static inline int h2c_max_concurrent_streams(const struct h2c *h2c) +{ + int ret; + + ret = (h2c->flags & H2_CF_IS_BACK) ? + h2_be_settings_max_concurrent_streams : + h2_fe_settings_max_concurrent_streams; + + ret = ret ? ret : h2_settings_max_concurrent_streams; + return ret; +} + + +/* update h2c timeout if needed */ +static void h2c_update_timeout(struct h2c *h2c) +{ + int is_idle_conn = 0; + + TRACE_ENTER(H2_EV_H2C_WAKE, h2c->conn); + + if (!h2c->task) + goto leave; + + if (h2c_may_expire(h2c)) { + /* no more streams attached */ + if (br_data(h2c->mbuf)) { + /* pending output data: always the regular data timeout */ + h2c->task->expire = tick_add_ifset(now_ms, h2c->timeout); + } else { + /* no stream, no output data */ + if (!(h2c->flags & H2_CF_IS_BACK)) { + int to; + + if (h2c->max_id > 0 && !b_data(&h2c->dbuf) && + tick_isset(h2c->proxy->timeout.httpka)) { + /* idle after having seen one stream => keep-alive */ + to = h2c->proxy->timeout.httpka; + } else { + /* before first request, or started to deserialize a + * new req => http-request. 
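 * As an illustrative example (editor's note based on the code below): with + * "timeout http-keep-alive 10s", a front connection that already served a + * stream and went idle would expire roughly 10s after idle_start.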
 + */ + to = h2c->proxy->timeout.httpreq; + } + + h2c->task->expire = tick_add_ifset(h2c->idle_start, to); + is_idle_conn = 1; + } + + if (h2c->flags & (H2_CF_GOAWAY_SENT|H2_CF_GOAWAY_FAILED)) { + /* GOAWAY sent (or failed), closing in progress */ + int exp = tick_add_ifset(now_ms, h2c->shut_timeout); + + h2c->task->expire = tick_first(h2c->task->expire, exp); + is_idle_conn = 1; + } + + /* if a timeout above was not set, fall back to the default one */ + if (!tick_isset(h2c->task->expire)) + h2c->task->expire = tick_add_ifset(now_ms, h2c->timeout); + } + + if ((h2c->proxy->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) && + is_idle_conn && tick_isset(global.close_spread_end)) { + /* If a soft-stop is in progress and a close-spread-time + * is set, we want to spread idle connection closing roughly + * evenly across the defined window. This should only + * act on idle frontend connections. + * If the window end is already in the past, we wake the + * timeout task up immediately so that it can be closed. + */ + int remaining_window = tick_remain(now_ms, global.close_spread_end); + if (remaining_window) { + /* We don't need to reset the expire if it would + * already happen before the close window end. + */ + if (tick_isset(h2c->task->expire) && + tick_is_le(global.close_spread_end, h2c->task->expire)) { + /* Set an expire value shorter than the current value + * because the close spread window end comes earlier. + */ + h2c->task->expire = tick_add(now_ms, statistical_prng_range(remaining_window)); + } + } + else { + /* We are past the soft close window end, wake the timeout + * task up immediately. + */ + task_wakeup(h2c->task, TASK_WOKEN_TIMER); + } + } + + } else { + h2c->task->expire = TICK_ETERNITY; + } + task_queue(h2c->task); + leave: + TRACE_LEAVE(H2_EV_H2C_WAKE); +} + +static __inline int +h2c_is_dead(const struct h2c *h2c) +{ + if (eb_is_empty(&h2c->streams_by_id) && /* don't close if streams exist */ + ((h2c->flags & H2_CF_ERROR) || /* errors close immediately */ + (h2c->flags & H2_CF_ERR_PENDING && h2c->st0 < H2_CS_FRAME_H) || /* early error during connect */ + (h2c->st0 >= H2_CS_ERROR && !h2c->task) || /* a timeout stroke earlier */ + (!(h2c->conn->owner) && !conn_is_reverse(h2c->conn)) || /* Nobody's left to take care of the connection, drop it now */ + (!br_data(h2c->mbuf) && /* mux buffer empty, also process clean events below */ + ((h2c->flags & H2_CF_RCVD_SHUT) || + (h2c->last_sid >= 0 && h2c->max_id >= h2c->last_sid))))) + return 1; + + return 0; +} + +/*****************************************************/ +/* functions below are for dynamic buffer management */ +/*****************************************************/ + +/* indicates whether or not we may call the h2_recv() function to attempt + * to receive data into the buffer and/or demux pending data. The condition is + * a bit complex due to some API limits for now. The rules are the following : + * - if an error or a shutdown was detected on the connection and the buffer + * is empty, we must not attempt to receive + * - if the demux buf failed to be allocated, we must not try to receive and + * we know there is nothing pending + * - if no flag indicates a blocking condition, we may attempt to receive, + * regardless of whether the demux buffer is full or not, so that only + * the demux part decides whether or not to block. 
This is needed because + * the connection API indeed prevents us from re-enabling receipt that is + * already enabled in a polled state, so we must always immediately stop + * as soon as the demux can't proceed so as never to hit an end of read + * with data pending in the buffers. + * - otherwise we may not attempt + */ +static inline int h2_recv_allowed(const struct h2c *h2c) +{ + if (b_data(&h2c->dbuf) == 0 && + ((h2c->flags & (H2_CF_RCVD_SHUT|H2_CF_ERROR)) || h2c->st0 >= H2_CS_ERROR)) + return 0; + + if (!(h2c->flags & H2_CF_DEM_DALLOC) && + !(h2c->flags & H2_CF_DEM_BLOCK_ANY)) + return 1; + + return 0; +} + +/* restarts reading on the connection if it was not enabled */ +static inline void h2c_restart_reading(const struct h2c *h2c, int consider_buffer) +{ + if (!h2_recv_allowed(h2c)) + return; + if ((!consider_buffer || !b_data(&h2c->dbuf)) + && (h2c->wait_event.events & SUB_RETRY_RECV)) + return; + tasklet_wakeup(h2c->wait_event.tasklet); +} + + +/* returns true if the front connection has too many stream connectors attached */ +static inline int h2_frt_has_too_many_sc(const struct h2c *h2c) +{ + return h2c->nb_sc > h2c_max_concurrent_streams(h2c) || + unlikely(conn_reverse_in_preconnect(h2c->conn)); +} + +/* Tries to grab a buffer and to re-enable processing on mux <target>. The h2c + * flags are used to figure what buffer was requested. It returns 1 if the + * allocation succeeds, in which case the connection is woken up, or 0 if it's + * impossible to wake up and we prefer to be woken up later. + */ +static int h2_buf_available(void *target) +{ + struct h2c *h2c = target; + struct h2s *h2s; + + if ((h2c->flags & H2_CF_DEM_DALLOC) && b_alloc(&h2c->dbuf)) { + h2c->flags &= ~H2_CF_DEM_DALLOC; + h2c_restart_reading(h2c, 1); + return 1; + } + + if ((h2c->flags & H2_CF_MUX_MALLOC) && b_alloc(br_tail(h2c->mbuf))) { + h2c->flags &= ~H2_CF_MUX_MALLOC; + + if (h2c->flags & H2_CF_DEM_MROOM) { + h2c->flags &= ~H2_CF_DEM_MROOM; + h2c_restart_reading(h2c, 1); + } + return 1; + } + + if ((h2c->flags & H2_CF_DEM_SALLOC) && + (h2s = h2c_st_by_id(h2c, h2c->dsi)) && h2s_sc(h2s) && + b_alloc(&h2s->rxbuf)) { + h2c->flags &= ~H2_CF_DEM_SALLOC; + h2c_restart_reading(h2c, 1); + return 1; + } + + return 0; +} + +static inline struct buffer *h2_get_buf(struct h2c *h2c, struct buffer *bptr) +{ + struct buffer *buf = NULL; + + if (likely(!LIST_INLIST(&h2c->buf_wait.list)) && + unlikely((buf = b_alloc(bptr)) == NULL)) { + h2c->buf_wait.target = h2c; + h2c->buf_wait.wakeup_cb = h2_buf_available; + LIST_APPEND(&th_ctx->buffer_wq, &h2c->buf_wait.list); + } + return buf; +} + +static inline void h2_release_buf(struct h2c *h2c, struct buffer *bptr) +{ + if (bptr->size) { + b_free(bptr); + offer_buffers(NULL, 1); + } +} + +static inline void h2_release_mbuf(struct h2c *h2c) +{ + struct buffer *buf; + unsigned int count = 0; + + while (b_size(buf = br_head_pick(h2c->mbuf))) { + b_free(buf); + count++; + } + if (count) + offer_buffers(NULL, count); +} + +/* returns the number of allocatable outgoing streams for the connection taking + * the last_sid and the reserved ones into account. + */ +static inline int h2_streams_left(const struct h2c *h2c) +{ + int ret; + + /* consider the number of outgoing streams we're allowed to create before + * reaching the last GOAWAY frame seen. max_id is the last assigned id, + * nb_reserved is the number of streams which don't yet have an ID. + */ + ret = (h2c->last_sid >= 0) ? 
h2c->last_sid : 0x7FFFFFFF; + ret = (unsigned int)(ret - h2c->max_id) / 2 - h2c->nb_reserved - 1; + if (ret < 0) + ret = 0; + return ret; +} + +/* returns the number of streams in use on a connection to figure if it's + * idle or not. We check nb_sc and not nb_streams as the caller will want + * to know if it was the last one after a detach(). + */ +static int h2_used_streams(struct connection *conn) +{ + struct h2c *h2c = conn->ctx; + + return h2c->nb_sc; +} + +/* returns the number of concurrent streams available on the connection */ +static int h2_avail_streams(struct connection *conn) +{ + struct server *srv = objt_server(conn->target); + struct h2c *h2c = conn->ctx; + int ret1, ret2; + + /* RFC7540#6.8: Receivers of a GOAWAY frame MUST NOT open additional + * streams on the connection. + */ + if (h2c->last_sid >= 0) + return 0; + + if (h2c->st0 >= H2_CS_ERROR) + return 0; + + /* note: may be negative if a SETTINGS frame changes the limit */ + ret1 = h2c->streams_limit - h2c->nb_streams; + + /* we must also consider the limit imposed by stream IDs */ + ret2 = h2_streams_left(h2c); + ret1 = MIN(ret1, ret2); + if (ret1 > 0 && srv && srv->max_reuse >= 0) { + ret2 = h2c->stream_cnt <= srv->max_reuse ? srv->max_reuse - h2c->stream_cnt + 1: 0; + ret1 = MIN(ret1, ret2); + } + return ret1; +} + +/* Unconditionally produce a trace of the header. Please do not call this one + * and use h2_trace_header() instead which first checks if traces are enabled. + */ +void _h2_trace_header(const struct ist hn, const struct ist hv, + uint64_t mask, const struct ist trc_loc, const char *func, + const struct h2c *h2c, const struct h2s *h2s) +{ + struct ist n_ist, v_ist; + const char *c_str, *s_str; + + chunk_reset(&trash); + c_str = chunk_newstr(&trash); + if (h2c) { + chunk_appendf(&trash, "h2c=%p(%c,%s) ", + h2c, (h2c->flags & H2_CF_IS_BACK) ? 'B' : 'F', h2c_st_to_str(h2c->st0)); + } + + s_str = chunk_newstr(&trash); + if (h2s) { + if (h2s->id <= 0) + chunk_appendf(&trash, "dsi=%d ", h2s->h2c->dsi); + chunk_appendf(&trash, "h2s=%p(%d,%s) ", h2s, h2s->id, h2s_st_to_str(h2s->st)); + } + else if (h2c) + chunk_appendf(&trash, "dsi=%d ", h2c->dsi); + + n_ist = ist2(chunk_newstr(&trash), 0); + istscpy(&n_ist, hn, 256); + trash.data += n_ist.len; + if (n_ist.len != hn.len) + chunk_appendf(&trash, " (... +%ld)", (long)(hn.len - n_ist.len)); + + v_ist = ist2(chunk_newstr(&trash), 0); + istscpy(&v_ist, hv, 1024); + trash.data += v_ist.len; + if (v_ist.len != hv.len) + chunk_appendf(&trash, " (... +%ld)", (long)(hv.len - v_ist.len)); + + TRACE_PRINTF_LOC(TRACE_LEVEL_USER, mask, trc_loc, func, + (h2c ? h2c->conn : 0), 0, 0, 0, + "%s%s%s %s: %s", c_str, s_str, + (mask & H2_EV_TX_HDR) ? "sndh" : "rcvh", + n_ist.ptr, v_ist.ptr); +} + +/* produce a trace of the header after checking that tracing is enabled */ +static inline void h2_trace_header(const struct ist hn, const struct ist hv, + uint64_t mask, const struct ist trc_loc, const char *func, + const struct h2c *h2c, const struct h2s *h2s) +{ + if ((TRACE_SOURCE)->verbosity >= H2_VERB_ADVANCED && + TRACE_ENABLED(TRACE_LEVEL_USER, mask, h2c ? h2c->conn : 0, h2s, 0, 0)) + _h2_trace_header(hn, hv, mask, trc_loc, func, h2c, h2s); +} + +/* hpack-encode header name <hn> and value <hv>, possibly emitting a trace if + * currently enabled. This is done on behalf of function <func> at <trc_loc> + * passed as ist(TRC_LOC), h2c <h2c>, and h2s <h2s>, all of which may be NULL. + * The trace is only emitted if the header is emitted (in which case non-zero + * is returned). 
The trash is modified. In the traces, the header's name will + * be truncated to 256 chars and the header's value to 1024 chars. + */ +static inline int h2_encode_header(struct buffer *buf, const struct ist hn, const struct ist hv, + uint64_t mask, const struct ist trc_loc, const char *func, + const struct h2c *h2c, const struct h2s *h2s) +{ + int ret; + + ret = hpack_encode_header(buf, hn, hv); + if (ret) + h2_trace_header(hn, hv, mask, trc_loc, func, h2c, h2s); + + return ret; +} + +/*****************************************************************/ +/* functions below are dedicated to the mux setup and management */ +/*****************************************************************/ + +/* Initialize the mux once it's attached. For outgoing connections, the context + * is already initialized before installing the mux, so we detect incoming + * connections from the fact that the context is still NULL (even during mux + * upgrades). <input> is always used as input buffer and may contain data. It is + * the caller's responsibility not to reuse it anymore. Returns < 0 on error. + */ +static int h2_init(struct connection *conn, struct proxy *prx, struct session *sess, + struct buffer *input) +{ + struct h2c *h2c; + struct task *t = NULL; + void *conn_ctx = conn->ctx; + + TRACE_ENTER(H2_EV_H2C_NEW); + + h2c = pool_alloc(pool_head_h2c); + if (!h2c) + goto fail_no_h2c; + + if (conn_is_back(conn)) { + h2c->flags = H2_CF_IS_BACK; + h2c->shut_timeout = h2c->timeout = prx->timeout.server; + if (tick_isset(prx->timeout.serverfin)) + h2c->shut_timeout = prx->timeout.serverfin; + + h2c->px_counters = EXTRA_COUNTERS_GET(prx->extra_counters_be, + &h2_stats_module); + } else { + h2c->flags = H2_CF_NONE; + h2c->shut_timeout = h2c->timeout = prx->timeout.client; + if (tick_isset(prx->timeout.clientfin)) + h2c->shut_timeout = prx->timeout.clientfin; + + h2c->px_counters = EXTRA_COUNTERS_GET(prx->extra_counters_fe, + &h2_stats_module); + } + + h2c->proxy = prx; + h2c->task = NULL; + h2c->wait_event.tasklet = NULL; + h2c->idle_start = now_ms; + if (tick_isset(h2c->timeout)) { + t = task_new_here(); + if (!t) + goto fail; + + h2c->task = t; + t->process = h2_timeout_task; + t->context = h2c; + t->expire = tick_add(now_ms, h2c->timeout); + } + + h2c->wait_event.tasklet = tasklet_new(); + if (!h2c->wait_event.tasklet) + goto fail; + h2c->wait_event.tasklet->process = h2_io_cb; + h2c->wait_event.tasklet->context = h2c; + h2c->wait_event.events = 0; + if (!conn_is_back(conn)) { + /* Connection might already be in the stopping_list if subject + * to h1->h2 upgrade. + */ + if (!LIST_INLIST(&conn->stopping_list)) { + LIST_APPEND(&mux_stopping_data[tid].list, + &conn->stopping_list); + } + } + + h2c->ddht = hpack_dht_alloc(); + if (!h2c->ddht) + goto fail; + + /* Initialise the context. 
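 * The 65535-byte initial window and 16384-byte max frame size set below + * are the SETTINGS defaults mandated by RFC7540 (editor's note).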
*/ + h2c->st0 = H2_CS_PREFACE; + h2c->conn = conn; + h2c->streams_limit = h2c_max_concurrent_streams(h2c); + h2c->max_id = -1; + h2c->errcode = H2_ERR_NO_ERROR; + h2c->rcvd_c = 0; + h2c->rcvd_s = 0; + h2c->nb_streams = 0; + h2c->nb_sc = 0; + h2c->nb_reserved = 0; + h2c->stream_cnt = 0; + + h2c->dbuf = *input; + h2c->dsi = -1; + + h2c->last_sid = -1; + + br_init(h2c->mbuf, sizeof(h2c->mbuf) / sizeof(h2c->mbuf[0])); + h2c->miw = 65535; /* mux initial window size */ + h2c->mws = 65535; /* mux window size */ + h2c->mfs = 16384; /* initial max frame size */ + h2c->streams_by_id = EB_ROOT; + LIST_INIT(&h2c->send_list); + LIST_INIT(&h2c->fctl_list); + LIST_INIT(&h2c->blocked_list); + LIST_INIT(&h2c->buf_wait.list); + + conn->ctx = h2c; + + TRACE_USER("new H2 connection", H2_EV_H2C_NEW, conn); + + if (t) + task_queue(t); + + if (h2c->flags & H2_CF_IS_BACK && likely(!conn_is_reverse(h2c->conn))) { + /* FIXME: this is temporary, for outgoing connections we need + * to immediately allocate a stream until the code is modified + * so that the caller calls ->attach(). For now the outgoing sc + * is stored as conn->ctx by the caller and saved in conn_ctx. + */ + struct h2s *h2s; + + h2s = h2c_bck_stream_new(h2c, conn_ctx, sess); + if (!h2s) + goto fail_stream; + } + + if (sess) + proxy_inc_fe_cum_sess_ver_ctr(sess->listener, prx, 2); + HA_ATOMIC_INC(&h2c->px_counters->open_conns); + HA_ATOMIC_INC(&h2c->px_counters->total_conns); + + /* prepare to read something */ + h2c_restart_reading(h2c, 1); + TRACE_LEAVE(H2_EV_H2C_NEW, conn); + return 0; + fail_stream: + hpack_dht_free(h2c->ddht); + fail: + task_destroy(t); + tasklet_free(h2c->wait_event.tasklet); + pool_free(pool_head_h2c, h2c); + fail_no_h2c: + if (!conn_is_back(conn)) + LIST_DEL_INIT(&conn->stopping_list); + conn->ctx = conn_ctx; /* restore saved ctx */ + TRACE_DEVEL("leaving in error", H2_EV_H2C_NEW|H2_EV_H2C_END|H2_EV_H2C_ERR); + return -1; +} + +/* returns the next allocatable outgoing stream ID for the H2 connection, or + * -1 if no more is allocatable. + */ +static inline int32_t h2c_get_next_sid(const struct h2c *h2c) +{ + int32_t id = (h2c->max_id + 1) | 1; + + if ((id & 0x80000000U) || (h2c->last_sid >= 0 && id > h2c->last_sid)) + id = -1; + return id; +} + +/* returns the stream associated with id <id> or NULL if not found */ +static inline struct h2s *h2c_st_by_id(struct h2c *h2c, int id) +{ + struct eb32_node *node; + + if (id == 0) + return (struct h2s *)h2_closed_stream; + + if (id > h2c->max_id) + return (struct h2s *)h2_idle_stream; + + node = eb32_lookup(&h2c->streams_by_id, id); + if (!node) + return (struct h2s *)h2_closed_stream; + + return container_of(node, struct h2s, by_id); +} + +/* release function. This one should be called to free all resources allocated + * to the mux. 
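 * It is typically invoked once the connection is no longer usable, e.g. + * when h2c_is_dead() above reports it (editor's note).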
 + */ +static void h2_release(struct h2c *h2c) +{ + struct connection *conn = h2c->conn; + + TRACE_ENTER(H2_EV_H2C_END); + + hpack_dht_free(h2c->ddht); + + if (LIST_INLIST(&h2c->buf_wait.list)) + LIST_DEL_INIT(&h2c->buf_wait.list); + + h2_release_buf(h2c, &h2c->dbuf); + h2_release_mbuf(h2c); + + if (h2c->task) { + h2c->task->context = NULL; + task_wakeup(h2c->task, TASK_WOKEN_OTHER); + h2c->task = NULL; + } + tasklet_free(h2c->wait_event.tasklet); + if (conn && h2c->wait_event.events != 0) + conn->xprt->unsubscribe(conn, conn->xprt_ctx, h2c->wait_event.events, + &h2c->wait_event); + + HA_ATOMIC_DEC(&h2c->px_counters->open_conns); + + pool_free(pool_head_h2c, h2c); + + if (conn) { + if (!conn_is_back(conn)) + LIST_DEL_INIT(&conn->stopping_list); + + conn->mux = NULL; + conn->ctx = NULL; + TRACE_DEVEL("freeing conn", H2_EV_H2C_END, conn); + + conn_stop_tracking(conn); + + /* there might be a GOAWAY frame still pending in the TCP + * stack, and if the peer continues to send (i.e. window + * updates etc), this can result in losing the GOAWAY. For + * this reason we try to drain anything received in between. + */ + conn->flags |= CO_FL_WANT_DRAIN; + + conn_xprt_shutw(conn); + conn_xprt_close(conn); + conn_sock_shutw(conn, !conn_is_back(conn)); + conn_ctrl_close(conn); + + if (conn->destroy_cb) + conn->destroy_cb(conn); + conn_free(conn); + } + + TRACE_LEAVE(H2_EV_H2C_END); +} + + +/******************************************************/ +/* functions below are for the H2 protocol processing */ +/******************************************************/ + +/* returns the stream id of stream <h2s> or 0 if <h2s> is NULL */ +static inline __maybe_unused int h2s_id(const struct h2s *h2s) +{ + return h2s ? h2s->id : 0; +} + +/* returns the sum of the stream's own window size and the mux's initial + * window, which together form the stream's effective window size. + */ +static inline int h2s_mws(const struct h2s *h2s) +{ + return h2s->sws + h2s->h2c->miw; +} + +/* marks an error on the connection. Before settings are sent, we must not send + * a GOAWAY frame, and the error state will prevent h2c_send_goaway_error() + * from verifying this so we set H2_CF_GOAWAY_FAILED to make sure it will not + * even try. + */ +static inline __maybe_unused void h2c_error(struct h2c *h2c, enum h2_err err) +{ + TRACE_POINT(H2_EV_H2C_ERR, h2c->conn, 0, 0, (void *)(long)(err)); + h2c->errcode = err; + if (h2c->st0 < H2_CS_SETTINGS1) + h2c->flags |= H2_CF_GOAWAY_FAILED; + h2c->st0 = H2_CS_ERROR; +} + +/* marks an error on the stream. It may also update an already closed stream + * (e.g. to report an error after an RST was received). 
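 * An illustrative call (editor's hypothetical example) would be + * h2s_error(h2s, H2_ERR_PROTOCOL_ERROR), after which the stored errcode + * may be reused when an RST_STREAM is later emitted for this stream.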
+ */ +static inline __maybe_unused void h2s_error(struct h2s *h2s, enum h2_err err) +{ + if (h2s->id && h2s->st != H2_SS_ERROR) { + TRACE_POINT(H2_EV_H2S_ERR, h2s->h2c->conn, h2s, 0, (void *)(long)(err)); + h2s->errcode = err; + if (h2s->st < H2_SS_ERROR) + h2s->st = H2_SS_ERROR; + se_fl_set_error(h2s->sd); + } +} + +/* attempt to notify the data layer of recv availability */ +static void __maybe_unused h2s_notify_recv(struct h2s *h2s) +{ + if (h2s->subs && h2s->subs->events & SUB_RETRY_RECV) { + TRACE_POINT(H2_EV_STRM_WAKE, h2s->h2c->conn, h2s); + tasklet_wakeup(h2s->subs->tasklet); + h2s->subs->events &= ~SUB_RETRY_RECV; + if (!h2s->subs->events) + h2s->subs = NULL; + } +} + +/* attempt to notify the data layer of send availability */ +static void __maybe_unused h2s_notify_send(struct h2s *h2s) +{ + if (h2s->subs && h2s->subs->events & SUB_RETRY_SEND) { + TRACE_POINT(H2_EV_STRM_WAKE, h2s->h2c->conn, h2s); + h2s->flags |= H2_SF_NOTIFIED; + tasklet_wakeup(h2s->subs->tasklet); + h2s->subs->events &= ~SUB_RETRY_SEND; + if (!h2s->subs->events) + h2s->subs = NULL; + } + else if (h2s->flags & (H2_SF_WANT_SHUTR | H2_SF_WANT_SHUTW)) { + TRACE_POINT(H2_EV_STRM_WAKE, h2s->h2c->conn, h2s); + tasklet_wakeup(h2s->shut_tl); + } +} + +/* alerts the data layer, trying to wake it up by all means, following + * this sequence : + * - if the h2s' data layer is subscribed to recv, then it's woken up for recv + * - if it's subscribed to send, then it's woken up for send + * - if it was subscribed to neither, its ->wake() callback is called + * It is safe to call this function with a closed stream which doesn't have a + * stream connector anymore. + */ +static void __maybe_unused h2s_alert(struct h2s *h2s) +{ + TRACE_ENTER(H2_EV_H2S_WAKE, h2s->h2c->conn, h2s); + + if (h2s->subs || + (h2s->flags & (H2_SF_WANT_SHUTR | H2_SF_WANT_SHUTW))) { + h2s_notify_recv(h2s); + h2s_notify_send(h2s); + } + else if (h2s_sc(h2s) && h2s_sc(h2s)->app_ops->wake != NULL) { + TRACE_POINT(H2_EV_STRM_WAKE, h2s->h2c->conn, h2s); + h2s_sc(h2s)->app_ops->wake(h2s_sc(h2s)); + } + + TRACE_LEAVE(H2_EV_H2S_WAKE, h2s->h2c->conn, h2s); +} + +/* writes the 24-bit frame size <len> at address <frame> */ +static inline __maybe_unused void h2_set_frame_size(void *frame, uint32_t len) +{ + uint8_t *out = frame; + + *out = len >> 16; + write_n16(out + 1, len); +} + +/* reads <bytes> bytes from buffer <b> starting at relative offset <o> from the + * current pointer, dealing with wrapping, and stores the result in <dst>. It's + * the caller's responsibility to verify that there are at least <bytes> bytes + * available in the buffer's input prior to calling this function. The buffer + * is assumed not to hold any output data. + */ +static inline __maybe_unused void h2_get_buf_bytes(void *dst, size_t bytes, + const struct buffer *b, int o) +{ + readv_bytes(dst, bytes, b_peek(b, o), b_wrap(b) - b_peek(b, o), b_orig(b)); +} + +static inline __maybe_unused uint16_t h2_get_n16(const struct buffer *b, int o) +{ + return readv_n16(b_peek(b, o), b_wrap(b) - b_peek(b, o), b_orig(b)); +} + +static inline __maybe_unused uint32_t h2_get_n32(const struct buffer *b, int o) +{ + return readv_n32(b_peek(b, o), b_wrap(b) - b_peek(b, o), b_orig(b)); +} + +static inline __maybe_unused uint64_t h2_get_n64(const struct buffer *b, int o) +{ + return readv_n64(b_peek(b, o), b_wrap(b) - b_peek(b, o), b_orig(b)); +} + + +/* Peeks an H2 frame header from offset <o> of buffer <b> into descriptor <h>. + * The algorithm is not obvious.
It turns out that H2 headers are neither + * aligned nor do they use regular sizes. And to add to the trouble, the buffer + * may wrap so each byte read must be checked. The header is formed like this : + * + * b0 b1 b2 b3 b4 b5..b8 + * +----------+---------+--------+----+----+----------------------+ + * |len[23:16]|len[15:8]|len[7:0]|type|flag|sid[31:0] (big endian)| + * +----------+---------+--------+----+----+----------------------+ + * + * Here we read a big-endian 64 bit word from h[1]. This way in a single read + * we get the sid properly aligned and ordered, and 16 bits of len properly + * ordered as well. The type and flags can be extracted using bit shifts from + * the word, and only one extra read is needed to fetch len[16:23]. + * Returns zero if some bytes are missing, otherwise non-zero on success. The + * buffer is assumed not to contain any output data. + */ +static __maybe_unused int h2_peek_frame_hdr(const struct buffer *b, int o, struct h2_fh *h) +{ + uint64_t w; + + if (b_data(b) < o + 9) + return 0; + + w = h2_get_n64(b, o + 1); + h->len = *(uint8_t*)b_peek(b, o) << 16; + h->sid = w & 0x7FFFFFFF; /* RFC7540#4.1: R bit must be ignored */ + h->ff = w >> 32; + h->ft = w >> 40; + h->len += w >> 48; + return 1; +} + +/* skip the next 9 bytes corresponding to the frame header possibly parsed by + * h2_peek_frame_hdr() above. + */ +static inline __maybe_unused void h2_skip_frame_hdr(struct buffer *b) +{ + b_del(b, 9); +} + +/* same as above, automatically advances the buffer on success */ +static inline __maybe_unused int h2_get_frame_hdr(struct buffer *b, struct h2_fh *h) +{ + int ret; + + ret = h2_peek_frame_hdr(b, 0, h); + if (ret > 0) + h2_skip_frame_hdr(b); + return ret; +} + + +/* try to fragment the headers frame present at the beginning of buffer <b>, + * enforcing a limit of <mfs> bytes per frame. Returns 0 on failure, 1 on + * success. Typical causes of failure include a buffer not large enough to + * add extra frame headers. The existing frame size is read from the current + * frame. Its EH flag will be cleared if CONTINUATION frames need to be added, + * and its length will be adjusted. The stream ID for continuation frames will + * be copied from the initial frame's. + */ +static int h2_fragment_headers(struct buffer *b, uint32_t mfs) +{ + size_t remain = b->data - 9; + int extra_frames = (remain - 1) / mfs; + size_t fsize; + char *fptr; + int frame; + + if (b->data <= mfs + 9) + return 1; + + /* Too large a frame, we need to fragment it using CONTINUATION + * frames. We start from the end and move tails as needed. + */ + if (b->data + extra_frames * 9 > b->size) + return 0; + + for (frame = extra_frames; frame; frame--) { + fsize = ((remain - 1) % mfs) + 1; + remain -= fsize; + + /* move data */ + fptr = b->area + 9 + remain + (frame - 1) * 9; + memmove(fptr + 9, b->area + 9 + remain, fsize); + b->data += 9; + + /* write new frame header */ + h2_set_frame_size(fptr, fsize); + fptr[3] = H2_FT_CONTINUATION; + fptr[4] = (frame == extra_frames) ? H2_F_HEADERS_END_HEADERS : 0; + write_n32(fptr + 5, read_n32(b->area + 5)); + } + + b->area[4] &= ~H2_F_HEADERS_END_HEADERS; + h2_set_frame_size(b->area, remain); + return 1; +} + + +/* marks stream <h2s> as CLOSED and decrements the number of active streams for + * its connection if the stream was not yet closed. Please use this exclusively + * before closing a stream to ensure stream count is well maintained. Note that + * it does explicitly support being called with a partially initialized h2s + * (e.g. sd==NULL).
+ */ +static inline void h2s_close(struct h2s *h2s) +{ + if (h2s->st != H2_SS_CLOSED) { + TRACE_ENTER(H2_EV_H2S_END, h2s->h2c->conn, h2s); + h2s->h2c->nb_streams--; + if (!h2s->id) + h2s->h2c->nb_reserved--; + if (h2s->sd && h2s_sc(h2s)) { + if (!se_fl_test(h2s->sd, SE_FL_EOS) && !b_data(&h2s->rxbuf)) + h2s_notify_recv(h2s); + } + HA_ATOMIC_DEC(&h2s->h2c->px_counters->open_streams); + + TRACE_LEAVE(H2_EV_H2S_END, h2s->h2c->conn, h2s); + } + h2s->st = H2_SS_CLOSED; +} + +/* Check h2c and h2s flags to evaluate if EOI/EOS/ERR_PENDING/ERROR flags must + * be set on the SE. + */ +static inline void h2s_propagate_term_flags(struct h2c *h2c, struct h2s *h2s) +{ + if (h2s->flags & H2_SF_ES_RCVD) { + se_fl_set(h2s->sd, SE_FL_EOI); + /* Add EOS flag for tunnel */ + if (h2s->flags & H2_SF_BODY_TUNNEL) + se_fl_set(h2s->sd, SE_FL_EOS); + } + if (h2c_read0_pending(h2c) || h2s->st == H2_SS_CLOSED) { + se_fl_set(h2s->sd, SE_FL_EOS); + if (!se_fl_test(h2s->sd, SE_FL_EOI)) + se_fl_set(h2s->sd, SE_FL_ERROR); + } + if (se_fl_test(h2s->sd, SE_FL_ERR_PENDING)) + se_fl_set(h2s->sd, SE_FL_ERROR); +} + +/* detaches an H2 stream from its H2C and releases it to the H2S pool. */ +/* h2s_destroy should only ever be called by the thread that owns the stream, + * that means that a tasklet should be used if we want to destroy the h2s + * from another thread + */ +static void h2s_destroy(struct h2s *h2s) +{ + struct connection *conn = h2s->h2c->conn; + + TRACE_ENTER(H2_EV_H2S_END, conn, h2s); + + h2s_close(h2s); + eb32_delete(&h2s->by_id); + if (b_size(&h2s->rxbuf)) { + b_free(&h2s->rxbuf); + offer_buffers(NULL, 1); + } + + if (h2s->subs) + h2s->subs->events = 0; + + /* There's no need to explicitly call unsubscribe here, the only + * reference left would be in the h2c send_list/fctl_list, and if + * we're in it, we're getting out anyway + */ + h2_remove_from_list(h2s); + + /* ditto, calling tasklet_free() here should be ok */ + tasklet_free(h2s->shut_tl); + BUG_ON(h2s->sd && !se_fl_test(h2s->sd, SE_FL_ORPHAN)); + sedesc_free(h2s->sd); + pool_free(pool_head_h2s, h2s); + + TRACE_LEAVE(H2_EV_H2S_END, conn); +} + +/* allocates a new stream <id> for connection <h2c> and adds it into h2c's + * stream tree. In case of error, nothing is added and NULL is returned. The + * causes of errors can be any failed memory allocation. The caller is + * responsible for checking if the connection may support an extra stream + * prior to calling this function. 
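+ * Note that a stream created with id 0 is a reserved backend stream: it is + * accounted in nb_reserved and only receives its real (odd) identifier via + * h2c_get_next_sid() when its first HEADERS frame is about to be sent, + * i.e. (sketch): + *   h2s = h2s_new(h2c, 0);         // reserved, nb_reserved++, no ID yet + *   sid = h2c_get_next_sid(h2c);   // later, at send time: 1, 3, 5, ...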
+ */ +static struct h2s *h2s_new(struct h2c *h2c, int id) +{ + struct h2s *h2s; + + TRACE_ENTER(H2_EV_H2S_NEW, h2c->conn); + + h2s = pool_alloc(pool_head_h2s); + if (!h2s) + goto out; + + h2s->shut_tl = tasklet_new(); + if (!h2s->shut_tl) { + pool_free(pool_head_h2s, h2s); + goto out; + } + h2s->subs = NULL; + h2s->shut_tl->process = h2_deferred_shut; + h2s->shut_tl->context = h2s; + LIST_INIT(&h2s->list); + h2s->h2c = h2c; + h2s->sd = NULL; + h2s->sws = 0; + h2s->flags = H2_SF_NONE; + h2s->errcode = H2_ERR_NO_ERROR; + h2s->st = H2_SS_IDLE; + h2s->status = 0; + h2s->body_len = 0; + h2s->rxbuf = BUF_NULL; + memset(h2s->upgrade_protocol, 0, sizeof(h2s->upgrade_protocol)); + + h2s->by_id.key = h2s->id = id; + if (id > 0) + h2c->max_id = id; + else + h2c->nb_reserved++; + + eb32_insert(&h2c->streams_by_id, &h2s->by_id); + h2c->nb_streams++; + + HA_ATOMIC_INC(&h2c->px_counters->open_streams); + HA_ATOMIC_INC(&h2c->px_counters->total_streams); + + TRACE_LEAVE(H2_EV_H2S_NEW, h2c->conn, h2s); + return h2s; + out: + TRACE_DEVEL("leaving in error", H2_EV_H2S_ERR|H2_EV_H2S_END, h2c->conn); + return NULL; +} + +/* creates a new stream <id> on the h2c connection and returns it, or NULL in + * case of memory allocation error. <input> is used as input buffer for the new + * stream. On success, it is transferred to the stream and the mux is no longer + * responsible for it. On error, <input> is unchanged, thus the mux must still + * take care of it. + */ +static struct h2s *h2c_frt_stream_new(struct h2c *h2c, int id, struct buffer *input, uint32_t flags) +{ + struct session *sess = h2c->conn->owner; + struct h2s *h2s; + + TRACE_ENTER(H2_EV_H2S_NEW, h2c->conn); + + /* Cannot handle stream if active reversed connection is not yet accepted. */ + BUG_ON(conn_reverse_in_preconnect(h2c->conn)); + + if (h2c->nb_streams >= h2c_max_concurrent_streams(h2c)) { + TRACE_ERROR("HEADERS frame causing MAX_CONCURRENT_STREAMS to be exceeded", H2_EV_H2S_NEW|H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn); + session_inc_http_req_ctr(sess); + session_inc_http_err_ctr(sess); + goto out; + } + + h2s = h2s_new(h2c, id); + if (!h2s) + goto out_alloc; + + h2s->sd = sedesc_new(); + if (!h2s->sd) + goto out_close; + h2s->sd->se = h2s; + h2s->sd->conn = h2c->conn; + se_fl_set(h2s->sd, SE_FL_T_MUX | SE_FL_ORPHAN | SE_FL_NOT_FIRST); + + if (!(global.tune.no_zero_copy_fwd & NO_ZERO_COPY_FWD_H2_SND)) + se_fl_set(h2s->sd, SE_FL_MAY_FASTFWD_CONS); + + /* The request is not finished, don't expect data from the opposite side + * yet + */ + if (!(h2c->dff & (H2_F_HEADERS_END_STREAM| H2_F_DATA_END_STREAM)) && !(flags & H2_SF_BODY_TUNNEL)) + se_expect_no_data(h2s->sd); + + /* FIXME wrong analogy between ext-connect and websocket, this needs to + * be refined. + */ + if (flags & H2_SF_EXT_CONNECT_RCVD) + se_fl_set(h2s->sd, SE_FL_WEBSOCKET); + + /* The stream will record the request's accept date (which is either the + * end of the connection's or the date immediately after the previous + * request) and the idle time, which is the delay since the previous + * request. We can set the value now, it will be copied by stream_new(). + */ + sess->t_idle = ns_to_ms(now_ns - sess->accept_ts) - sess->t_handshake; + + if (!sc_new_from_endp(h2s->sd, sess, input)) + goto out_close; + + h2c->nb_sc++; + + /* We want the accept date presented to the next stream to be the one + * we have now, the handshake time to be null (since the next stream + * is not delayed by a handshake), and the idle time to count since + * right now.
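+ * For example, if the next HEADERS frame shows up 50ms from now, the next + * stream will report t_handshake=0 and t_idle close to 50ms, both measured + * from the accept_ts reset below (figures illustrative).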
+ */ + sess->accept_date = date; + sess->accept_ts = now_ns; + sess->t_handshake = 0; + sess->t_idle = 0; + + /* OK done, the stream lives its own life now */ + if (h2_frt_has_too_many_sc(h2c)) + h2c->flags |= H2_CF_DEM_TOOMANY; + TRACE_LEAVE(H2_EV_H2S_NEW, h2c->conn); + return h2s; + + out_close: + h2s_destroy(h2s); + out_alloc: + TRACE_ERROR("Failed to allocate a new stream", H2_EV_H2S_NEW|H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn); + out: + sess_log(sess); + TRACE_LEAVE(H2_EV_H2S_NEW|H2_EV_H2S_ERR|H2_EV_H2S_END, h2c->conn); + return NULL; +} + +/* allocates a new stream associated to stream connector <sc> on the h2c + * connection and returns it, or NULL in case of memory allocation error or if + * the highest possible stream ID was reached. + */ +static struct h2s *h2c_bck_stream_new(struct h2c *h2c, struct stconn *sc, struct session *sess) +{ + struct h2s *h2s = NULL; + + TRACE_ENTER(H2_EV_H2S_NEW, h2c->conn); + + /* Cannot handle stream if connection waiting to be reversed. */ + BUG_ON(conn_reverse_in_preconnect(h2c->conn)); + + if (h2c->nb_streams >= h2c->streams_limit) { + TRACE_ERROR("Aborting stream since negotiated limit is too low", H2_EV_H2S_NEW, h2c->conn); + goto out; + } + + if (h2_streams_left(h2c) < 1) { + TRACE_ERROR("Aborting stream since no more streams left", H2_EV_H2S_NEW, h2c->conn); + goto out; + } + + /* Defer choosing the ID until we send the first message to create the stream */ + h2s = h2s_new(h2c, 0); + if (!h2s) { + TRACE_ERROR("Failed to allocate a new stream", H2_EV_H2S_NEW, h2c->conn); + goto out; + } + + if (sc_attach_mux(sc, h2s, h2c->conn) < 0) { + TRACE_ERROR("Failed to allocate a new stream", H2_EV_H2S_NEW, h2c->conn); + h2s_destroy(h2s); + h2s = NULL; + goto out; + } + h2s->sd = sc->sedesc; + h2s->sess = sess; + h2c->nb_sc++; + + if (!(global.tune.no_zero_copy_fwd & NO_ZERO_COPY_FWD_H2_SND)) + se_fl_set(h2s->sd, SE_FL_MAY_FASTFWD_CONS); + /* on the backend we can afford to only count total streams upon success */ + h2c->stream_cnt++; + + out: + if (likely(h2s)) + TRACE_LEAVE(H2_EV_H2S_NEW, h2c->conn, h2s); + else + TRACE_LEAVE(H2_EV_H2S_NEW|H2_EV_H2S_ERR|H2_EV_H2S_END, h2c->conn, h2s); + return h2s; +} + +/* try to send a settings frame on the connection. Returns > 0 on success, 0 if + * it couldn't do anything. It may return an error in h2c. See RFC7540#11.3 for + * the various settings codes. + */ +static int h2c_send_settings(struct h2c *h2c) +{ + struct buffer *res; + char buf_data[100]; // enough for 15 settings + struct buffer buf; + int iws; + int mfs; + int mcs; + int ret = 0; + + TRACE_ENTER(H2_EV_TX_FRAME|H2_EV_TX_SETTINGS, h2c->conn); + + chunk_init(&buf, buf_data, sizeof(buf_data)); + chunk_memcpy(&buf, + "\x00\x00\x00" /* length : 0 for now */ + "\x04\x00" /* type : 4 (settings), flags : 0 */ + "\x00\x00\x00\x00", /* stream ID : 0 */ + 9); + + if (h2c->flags & H2_CF_IS_BACK) { + /* send settings_enable_push=0 */ + chunk_memcat(&buf, "\x00\x02\x00\x00\x00\x00", 6); + } + + /* rfc 8441 #3 SETTINGS_ENABLE_CONNECT_PROTOCOL=1, + * sent automatically unless disabled in the global config */ + if (!(global.tune.options & GTUNE_DISABLE_H2_WEBSOCKET)) + chunk_memcat(&buf, "\x00\x08\x00\x00\x00\x01", 6); + + if (h2_settings_header_table_size != 4096) { + char str[6] = "\x00\x01"; /* header_table_size */ + + write_n32(str + 2, h2_settings_header_table_size); + chunk_memcat(&buf, str, 6); + } + + iws = (h2c->flags & H2_CF_IS_BACK) ? + h2_be_settings_initial_window_size: + h2_fe_settings_initial_window_size; + iws = iws ? 
iws : h2_settings_initial_window_size; + + if (iws != 65535) { + char str[6] = "\x00\x04"; /* initial_window_size */ + + write_n32(str + 2, iws); + chunk_memcat(&buf, str, 6); + } + + mcs = h2c_max_concurrent_streams(h2c); + if (mcs != 0) { + char str[6] = "\x00\x03"; /* max_concurrent_streams */ + + /* Note: 0 means "unlimited" for haproxy's config but not for + * the protocol, so never send this value! + */ + write_n32(str + 2, mcs); + chunk_memcat(&buf, str, 6); + } + + mfs = h2_settings_max_frame_size; + if (mfs > global.tune.bufsize) + mfs = global.tune.bufsize; + + if (!mfs) + mfs = global.tune.bufsize; + + if (mfs != 16384) { + char str[6] = "\x00\x05"; /* max_frame_size */ + + /* note: similarly we could also emit MAX_HEADER_LIST_SIZE to + * match bufsize - rewrite size, but at the moment it seems + * that clients don't take care of it. + */ + write_n32(str + 2, mfs); + chunk_memcat(&buf, str, 6); + } + + h2_set_frame_size(buf.area, buf.data - 9); + + res = br_tail(h2c->mbuf); + retry: + if (!h2_get_buf(h2c, res)) { + h2c->flags |= H2_CF_MUX_MALLOC; + h2c->flags |= H2_CF_DEM_MROOM; + goto out; + } + + ret = b_istput(res, ist2(buf.area, buf.data)); + if (unlikely(ret <= 0)) { + if (!ret) { + if ((res = br_tail_add(h2c->mbuf)) != NULL) + goto retry; + h2c->flags |= H2_CF_MUX_MFULL; + h2c->flags |= H2_CF_DEM_MROOM; + } + else { + h2c_error(h2c, H2_ERR_INTERNAL_ERROR); + ret = 0; + } + } + out: + TRACE_LEAVE(H2_EV_TX_FRAME|H2_EV_TX_SETTINGS, h2c->conn); + return ret; +} + +/* Try to receive a connection preface, then upon success try to send our + * preface which is a SETTINGS frame. Returns > 0 on success or zero on + * missing data. It may return an error in h2c. + */ +static int h2c_frt_recv_preface(struct h2c *h2c) +{ + int ret1; + int ret2; + + TRACE_ENTER(H2_EV_RX_FRAME|H2_EV_RX_PREFACE, h2c->conn); + + ret1 = b_isteq(&h2c->dbuf, 0, b_data(&h2c->dbuf), ist(H2_CONN_PREFACE)); + + if (unlikely(ret1 <= 0)) { + if (!ret1) + h2c->flags |= H2_CF_DEM_SHORT_READ; + if (ret1 < 0 || (h2c->flags & H2_CF_RCVD_SHUT)) { + TRACE_ERROR("I/O error or short read", H2_EV_RX_FRAME|H2_EV_RX_PREFACE, h2c->conn); + h2c_error(h2c, H2_ERR_PROTOCOL_ERROR); + if (b_data(&h2c->dbuf) || + !(((const struct session *)h2c->conn->owner)->fe->options & PR_O_IGNORE_PRB)) + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + } + ret2 = 0; + goto out; + } + + ret2 = h2c_send_settings(h2c); + if (ret2 > 0) + b_del(&h2c->dbuf, ret1); + out: + TRACE_LEAVE(H2_EV_RX_FRAME|H2_EV_RX_PREFACE, h2c->conn); + return ret2; +} + +/* Try to send a connection preface, then upon success try to send our + * SETTINGS frame. Returns > 0 on success or zero when nothing could be + * sent (e.g. no room left in the mux buffer). It may return an error in h2c.
+ */ +static int h2c_bck_send_preface(struct h2c *h2c) +{ + struct buffer *res; + int ret = 0; + + TRACE_ENTER(H2_EV_TX_FRAME|H2_EV_TX_PREFACE, h2c->conn); + + res = br_tail(h2c->mbuf); + retry: + if (!h2_get_buf(h2c, res)) { + h2c->flags |= H2_CF_MUX_MALLOC; + h2c->flags |= H2_CF_DEM_MROOM; + goto out; + } + + if (!b_data(res)) { + /* preface not yet sent */ + ret = b_istput(res, ist(H2_CONN_PREFACE)); + if (unlikely(ret <= 0)) { + if (!ret) { + if ((res = br_tail_add(h2c->mbuf)) != NULL) + goto retry; + h2c->flags |= H2_CF_MUX_MFULL; + h2c->flags |= H2_CF_DEM_MROOM; + goto out; + } + else { + h2c_error(h2c, H2_ERR_INTERNAL_ERROR); + ret = 0; + goto out; + } + } + } + ret = h2c_send_settings(h2c); + out: + TRACE_LEAVE(H2_EV_TX_FRAME|H2_EV_TX_PREFACE, h2c->conn); + return ret; +} + +/* try to send a GOAWAY frame on the connection to report an error or a graceful + * shutdown, with h2c->errcode as the error code. Returns > 0 on success or zero + * if nothing was done. It uses h2c->last_sid as the advertised ID, or copies it + * from h2c->max_id if it's not set yet (<0). In case of lack of room to write + * the message, it subscribes the requester (either <h2s> or <h2c>) to future + * notifications. It sets H2_CF_GOAWAY_SENT on success, and H2_CF_GOAWAY_FAILED + * on unrecoverable failure. It will not attempt to send one again in this last + * case, nor will it send one if settings were not sent (e.g. still waiting for + * a preface) so that it is safe to use h2c_error() to report such errors. + */ +static int h2c_send_goaway_error(struct h2c *h2c, struct h2s *h2s) +{ + struct buffer *res; + char str[17]; + int ret = 0; + + TRACE_ENTER(H2_EV_TX_FRAME|H2_EV_TX_GOAWAY, h2c->conn); + + if ((h2c->flags & H2_CF_GOAWAY_FAILED) || h2c->st0 < H2_CS_SETTINGS1) { + ret = 1; // claim that it worked + goto out; + } + + /* len: 8, type: 7, flags: none, sid: 0 */ + memcpy(str, "\x00\x00\x08\x07\x00\x00\x00\x00\x00", 9); + + if (h2c->last_sid < 0) + h2c->last_sid = h2c->max_id; + + write_n32(str + 9, h2c->last_sid); + write_n32(str + 13, h2c->errcode); + + res = br_tail(h2c->mbuf); + retry: + if (!h2_get_buf(h2c, res)) { + h2c->flags |= H2_CF_MUX_MALLOC; + if (h2s) + h2s->flags |= H2_SF_BLK_MROOM; + else + h2c->flags |= H2_CF_DEM_MROOM; + goto out; + } + + ret = b_istput(res, ist2(str, 17)); + if (unlikely(ret <= 0)) { + if (!ret) { + if ((res = br_tail_add(h2c->mbuf)) != NULL) + goto retry; + h2c->flags |= H2_CF_MUX_MFULL; + if (h2s) + h2s->flags |= H2_SF_BLK_MROOM; + else + h2c->flags |= H2_CF_DEM_MROOM; + goto out; + } + else { + /* we cannot report this error using GOAWAY, so we mark + * it and claim a success. + */ + h2c_error(h2c, H2_ERR_INTERNAL_ERROR); + h2c->flags |= H2_CF_GOAWAY_FAILED; + ret = 1; + goto out; + } + } + h2c->flags |= H2_CF_GOAWAY_SENT; + + /* some codes are not for real errors, just attempts to close cleanly */ + switch (h2c->errcode) { + case H2_ERR_NO_ERROR: + case H2_ERR_ENHANCE_YOUR_CALM: + case H2_ERR_REFUSED_STREAM: + case H2_ERR_CANCEL: + break; + default: + HA_ATOMIC_INC(&h2c->px_counters->goaway_resp); + } + out: + TRACE_LEAVE(H2_EV_TX_FRAME|H2_EV_TX_GOAWAY, h2c->conn); + return ret; +} + +/* Try to send an RST_STREAM frame on the connection for the indicated stream + * during mux operations. This stream must be valid and cannot be closed + * already. h2s->id will be used for the stream ID and h2s->errcode will be + * used for the error code. h2s->st will be updated to H2_SS_CLOSED if it was + * not already. + * + * Returns > 0 on success or zero if nothing was done.
In case of lack of room + * to write the message, it subscribes the stream to future notifications. + */ +static int h2s_send_rst_stream(struct h2c *h2c, struct h2s *h2s) +{ + struct buffer *res; + char str[13]; + int ret = 0; + + TRACE_ENTER(H2_EV_TX_FRAME|H2_EV_TX_RST, h2c->conn, h2s); + + if (!h2s || h2s->st == H2_SS_CLOSED) { + ret = 1; + goto out; + } + + /* RFC7540#5.4.2: To avoid looping, an endpoint MUST NOT send a + * RST_STREAM in response to a RST_STREAM frame. + */ + if (h2c->dsi == h2s->id && h2c->dft == H2_FT_RST_STREAM) { + ret = 1; + goto ignore; + } + + /* len: 4, type: 3, flags: none */ + memcpy(str, "\x00\x00\x04\x03\x00", 5); + write_n32(str + 5, h2s->id); + write_n32(str + 9, h2s->errcode); + + res = br_tail(h2c->mbuf); + retry: + if (!h2_get_buf(h2c, res)) { + h2c->flags |= H2_CF_MUX_MALLOC; + h2s->flags |= H2_SF_BLK_MROOM; + goto out; + } + + ret = b_istput(res, ist2(str, 13)); + if (unlikely(ret <= 0)) { + if (!ret) { + if ((res = br_tail_add(h2c->mbuf)) != NULL) + goto retry; + h2c->flags |= H2_CF_MUX_MFULL; + h2s->flags |= H2_SF_BLK_MROOM; + goto out; + } + else { + h2c_error(h2c, H2_ERR_INTERNAL_ERROR); + ret = 0; + goto out; + } + } + + ignore: + h2s->flags |= H2_SF_RST_SENT; + h2s_close(h2s); + out: + TRACE_LEAVE(H2_EV_TX_FRAME|H2_EV_TX_RST, h2c->conn, h2s); + return ret; +} + +/* Try to send an RST_STREAM frame on the connection for the stream being + * demuxed using h2c->dsi for the stream ID. It will use h2s->errcode as the + * error code, even if the stream is one of the dummy ones, and will update + * h2s->st to H2_SS_CLOSED if it was not yet. + * + * Returns > 0 on success or zero if nothing was done. In case of lack of room + * to write the message, it blocks the demuxer and subscribes it to future + * notifications. It's worth mentioning that an RST may even be sent for a + * closed stream. + */ +static int h2c_send_rst_stream(struct h2c *h2c, struct h2s *h2s) +{ + struct buffer *res; + char str[13]; + int ret = 0; + + TRACE_ENTER(H2_EV_TX_FRAME|H2_EV_TX_RST, h2c->conn, h2s); + + /* RFC7540#5.4.2: To avoid looping, an endpoint MUST NOT send a + * RST_STREAM in response to a RST_STREAM frame. + */ + if (h2c->dft == H2_FT_RST_STREAM) { + ret = 1; + goto ignore; + } + + /* len: 4, type: 3, flags: none */ + memcpy(str, "\x00\x00\x04\x03\x00", 5); + + write_n32(str + 5, h2c->dsi); + write_n32(str + 9, h2s->errcode); + + res = br_tail(h2c->mbuf); + retry: + if (!h2_get_buf(h2c, res)) { + h2c->flags |= H2_CF_MUX_MALLOC; + h2c->flags |= H2_CF_DEM_MROOM; + goto out; + } + + ret = b_istput(res, ist2(str, 13)); + if (unlikely(ret <= 0)) { + if (!ret) { + if ((res = br_tail_add(h2c->mbuf)) != NULL) + goto retry; + h2c->flags |= H2_CF_MUX_MFULL; + h2c->flags |= H2_CF_DEM_MROOM; + goto out; + } + else { + h2c_error(h2c, H2_ERR_INTERNAL_ERROR); + ret = 0; + goto out; + } + } + + ignore: + if (h2s->id) { + h2s->flags |= H2_SF_RST_SENT; + h2s_close(h2s); + } + + out: + HA_ATOMIC_INC(&h2c->px_counters->rst_stream_resp); + TRACE_LEAVE(H2_EV_TX_FRAME|H2_EV_TX_RST, h2c->conn, h2s); + return ret; +} + +/* try to send an empty DATA frame with the ES flag set to notify about the + * end of stream and match a shutdown(write). If an ES was already sent as + * indicated by HLOC/ERROR/RESET/CLOSED states, nothing is done. Returns > 0 + * on success or zero if nothing was done. In case of lack of room to write the + * message, it subscribes the requesting stream to future notifications. 
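+ * On the wire this is the smallest legal DATA frame, 9 bytes in total, + * matching the memcpy below (sketch of the layout): + *   00 00 00  00  01  ss ss ss ss   (len=0, type=DATA, flags=ES, sid)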
+ */ +static int h2_send_empty_data_es(struct h2s *h2s) +{ + struct h2c *h2c = h2s->h2c; + struct buffer *res; + char str[9]; + int ret = 0; + + TRACE_ENTER(H2_EV_TX_FRAME|H2_EV_TX_DATA|H2_EV_TX_EOI, h2c->conn, h2s); + + if (h2s->st == H2_SS_HLOC || h2s->st == H2_SS_ERROR || h2s->st == H2_SS_CLOSED) { + ret = 1; + goto out; + } + + /* len: 0x000000, type: 0(DATA), flags: ES=1 */ + memcpy(str, "\x00\x00\x00\x00\x01", 5); + write_n32(str + 5, h2s->id); + + res = br_tail(h2c->mbuf); + retry: + if (!h2_get_buf(h2c, res)) { + h2c->flags |= H2_CF_MUX_MALLOC; + h2s->flags |= H2_SF_BLK_MROOM; + goto out; + } + + ret = b_istput(res, ist2(str, 9)); + if (likely(ret > 0)) { + h2s->flags |= H2_SF_ES_SENT; + } + else if (!ret) { + if ((res = br_tail_add(h2c->mbuf)) != NULL) + goto retry; + h2c->flags |= H2_CF_MUX_MFULL; + h2s->flags |= H2_SF_BLK_MROOM; + } + else { + h2c_error(h2c, H2_ERR_INTERNAL_ERROR); + ret = 0; + } + out: + TRACE_LEAVE(H2_EV_TX_FRAME|H2_EV_TX_DATA|H2_EV_TX_EOI, h2c->conn, h2s); + return ret; +} + +/* wake a specific stream and assign its stream connector some SE_FL_* flags + * among SE_FL_ERR_PENDING and SE_FL_ERROR if needed. The stream's state + * is automatically updated accordingly. If the stream is orphaned, it is + * destroyed. + */ +static void h2s_wake_one_stream(struct h2s *h2s) +{ + struct h2c *h2c = h2s->h2c; + + TRACE_ENTER(H2_EV_H2S_WAKE, h2c->conn, h2s); + + if (!h2s_sc(h2s)) { + /* this stream was already orphaned */ + h2s_destroy(h2s); + TRACE_DEVEL("leaving with no h2s", H2_EV_H2S_WAKE, h2c->conn); + return; + } + + if (h2c_read0_pending(h2s->h2c)) { + if (h2s->st == H2_SS_OPEN) + h2s->st = H2_SS_HREM; + else if (h2s->st == H2_SS_HLOC) + h2s_close(h2s); + } + + if ((h2s->st != H2_SS_CLOSED) && + (h2s->h2c->st0 >= H2_CS_ERROR || (h2s->h2c->flags & H2_CF_ERROR) || + (h2s->h2c->last_sid > 0 && (!h2s->id || h2s->id > h2s->h2c->last_sid)))) { + se_fl_set_error(h2s->sd); + + if (h2s->st < H2_SS_ERROR) + h2s->st = H2_SS_ERROR; + } + + h2s_alert(h2s); + TRACE_LEAVE(H2_EV_H2S_WAKE, h2c->conn); +} + +/* wake the streams attached to the connection, whose id is greater than <last> + * or unassigned. + */ +static void h2_wake_some_streams(struct h2c *h2c, int last) +{ + struct eb32_node *node; + struct h2s *h2s; + + TRACE_ENTER(H2_EV_H2S_WAKE, h2c->conn); + + /* Wake all streams with ID > last */ + node = eb32_lookup_ge(&h2c->streams_by_id, last + 1); + while (node) { + h2s = container_of(node, struct h2s, by_id); + node = eb32_next(node); + h2s_wake_one_stream(h2s); + } + + /* Wake all streams with unassigned ID (ID == 0) */ + node = eb32_lookup(&h2c->streams_by_id, 0); + while (node) { + h2s = container_of(node, struct h2s, by_id); + if (h2s->id > 0) + break; + node = eb32_next(node); + h2s_wake_one_stream(h2s); + } + + TRACE_LEAVE(H2_EV_H2S_WAKE, h2c->conn); +} + +/* Wake up all blocked streams whose window size has become positive after the + * mux's initial window was adjusted. This should be done after having processed + * SETTINGS frames which have updated the mux's initial window size. 
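+ * Example: if a SETTINGS frame raises miw from 65535 to 131070, a stream + * stalled at sws=-65535 (effective window h2s_mws()==0) becomes writable + * again with a 65535-byte window and is appended back to the send list.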
+ */ +static void h2c_unblock_sfctl(struct h2c *h2c) +{ + struct h2s *h2s; + struct eb32_node *node; + + TRACE_ENTER(H2_EV_H2C_WAKE, h2c->conn); + + node = eb32_first(&h2c->streams_by_id); + while (node) { + h2s = container_of(node, struct h2s, by_id); + if (h2s->flags & H2_SF_BLK_SFCTL && h2s_mws(h2s) > 0) { + h2s->flags &= ~H2_SF_BLK_SFCTL; + LIST_DEL_INIT(&h2s->list); + if ((h2s->subs && h2s->subs->events & SUB_RETRY_SEND) || + h2s->flags & (H2_SF_WANT_SHUTR|H2_SF_WANT_SHUTW)) + LIST_APPEND(&h2c->send_list, &h2s->list); + } + node = eb32_next(node); + } + + TRACE_LEAVE(H2_EV_H2C_WAKE, h2c->conn); +} + +/* processes a SETTINGS frame from the demux buffer, and ACKs it if needed. + * Returns > 0 on success or zero on missing data. It may return an error in + * h2c. The caller must have already verified frame length and stream ID + * validity. Described in RFC7540#6.5. + */ +static int h2c_handle_settings(struct h2c *h2c) +{ + unsigned int offset; + int error; + + TRACE_ENTER(H2_EV_RX_FRAME|H2_EV_RX_SETTINGS, h2c->conn); + + if (h2c->dff & H2_F_SETTINGS_ACK) { + if (h2c->dfl) { + error = H2_ERR_FRAME_SIZE_ERROR; + goto fail; + } + goto done; + } + + /* process full frame only */ + if (b_data(&h2c->dbuf) < h2c->dfl) { + h2c->flags |= H2_CF_DEM_SHORT_READ; + goto out0; + } + + /* parse the frame */ + for (offset = 0; offset < h2c->dfl; offset += 6) { + uint16_t type = h2_get_n16(&h2c->dbuf, offset); + int32_t arg = h2_get_n32(&h2c->dbuf, offset + 2); + + switch (type) { + case H2_SETTINGS_INITIAL_WINDOW_SIZE: + /* we need to update all existing streams with the + * difference from the previous iws. + */ + if (arg < 0) { // RFC7540#6.5.2 + error = H2_ERR_FLOW_CONTROL_ERROR; + goto fail; + } + h2c->miw = arg; + break; + case H2_SETTINGS_MAX_FRAME_SIZE: + if (arg < 16384 || arg > 16777215) { // RFC7540#6.5.2 + TRACE_ERROR("MAX_FRAME_SIZE out of range", H2_EV_RX_FRAME|H2_EV_RX_SETTINGS, h2c->conn); + error = H2_ERR_PROTOCOL_ERROR; + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + goto fail; + } + h2c->mfs = arg; + break; + case H2_SETTINGS_HEADER_TABLE_SIZE: + h2c->flags |= H2_CF_SHTS_UPDATED; + break; + case H2_SETTINGS_ENABLE_PUSH: + if (arg < 0 || arg > 1) { // RFC7540#6.5.2 + TRACE_ERROR("ENABLE_PUSH out of range", H2_EV_RX_FRAME|H2_EV_RX_SETTINGS, h2c->conn); + error = H2_ERR_PROTOCOL_ERROR; + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + goto fail; + } + break; + case H2_SETTINGS_MAX_CONCURRENT_STREAMS: + if (h2c->flags & H2_CF_IS_BACK) { + /* the limit is only for the backend; for the frontend it is our limit */ + if ((unsigned int)arg > h2c_max_concurrent_streams(h2c)) + arg = h2c_max_concurrent_streams(h2c); + h2c->streams_limit = arg; + } + break; + case H2_SETTINGS_ENABLE_CONNECT_PROTOCOL: + if (arg == 1) + h2c->flags |= H2_CF_RCVD_RFC8441; + break; + } + } + + /* need to ACK this frame now */ + h2c->st0 = H2_CS_FRAME_A; + done: + TRACE_LEAVE(H2_EV_RX_FRAME|H2_EV_RX_SETTINGS, h2c->conn); + return 1; + fail: + if (!(h2c->flags & H2_CF_IS_BACK)) + sess_log(h2c->conn->owner); + h2c_error(h2c, error); + out0: + TRACE_DEVEL("leaving with missing data or error", H2_EV_RX_FRAME|H2_EV_RX_SETTINGS, h2c->conn); + return 0; +} + +/* try to send an ACK for a settings frame on the connection. Returns > 0 on + * success or zero if nothing was done.
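+ * The ACK itself is the fixed 9-byte frame built below: length 0, type 4 + * (SETTINGS), flags 0x01 (ACK), stream ID 0, and never carries a payload.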
+ */ +static int h2c_ack_settings(struct h2c *h2c) +{ + struct buffer *res; + char str[9]; + int ret = 0; + + TRACE_ENTER(H2_EV_TX_FRAME|H2_EV_TX_SETTINGS, h2c->conn); + + memcpy(str, + "\x00\x00\x00" /* length : 0 (no data) */ + "\x04" "\x01" /* type : 4, flags : ACK */ + "\x00\x00\x00\x00" /* stream ID */, 9); + + res = br_tail(h2c->mbuf); + retry: + if (!h2_get_buf(h2c, res)) { + h2c->flags |= H2_CF_MUX_MALLOC; + h2c->flags |= H2_CF_DEM_MROOM; + goto out; + } + + ret = b_istput(res, ist2(str, 9)); + if (unlikely(ret <= 0)) { + if (!ret) { + if ((res = br_tail_add(h2c->mbuf)) != NULL) + goto retry; + h2c->flags |= H2_CF_MUX_MFULL; + h2c->flags |= H2_CF_DEM_MROOM; + } + else { + h2c_error(h2c, H2_ERR_INTERNAL_ERROR); + ret = 0; + } + } + out: + TRACE_LEAVE(H2_EV_TX_FRAME|H2_EV_TX_SETTINGS, h2c->conn); + return ret; +} + +/* processes a PING frame and schedules an ACK if needed. Returns > 0 on + * success or zero on missing data. The caller must have already verified + * frame length and stream ID validity. + */ +static int h2c_handle_ping(struct h2c *h2c) +{ + /* schedule a response */ + if (!(h2c->dff & H2_F_PING_ACK)) + h2c->st0 = H2_CS_FRAME_A; + return 1; +} + +/* Try to send a window update for stream id <sid> and value <increment>. + * Returns > 0 on success or zero on missing room or failure. It may return an + * error in h2c. + */ +static int h2c_send_window_update(struct h2c *h2c, int sid, uint32_t increment) +{ + struct buffer *res; + char str[13]; + int ret = 0; + + TRACE_ENTER(H2_EV_TX_FRAME|H2_EV_TX_WU, h2c->conn); + + /* length: 4, type: 8, flags: none */ + memcpy(str, "\x00\x00\x04\x08\x00", 5); + write_n32(str + 5, sid); + write_n32(str + 9, increment); + + res = br_tail(h2c->mbuf); + retry: + if (!h2_get_buf(h2c, res)) { + h2c->flags |= H2_CF_MUX_MALLOC; + h2c->flags |= H2_CF_DEM_MROOM; + goto out; + } + + ret = b_istput(res, ist2(str, 13)); + if (unlikely(ret <= 0)) { + if (!ret) { + if ((res = br_tail_add(h2c->mbuf)) != NULL) + goto retry; + h2c->flags |= H2_CF_MUX_MFULL; + h2c->flags |= H2_CF_DEM_MROOM; + } + else { + h2c_error(h2c, H2_ERR_INTERNAL_ERROR); + ret = 0; + } + } + out: + TRACE_LEAVE(H2_EV_TX_FRAME|H2_EV_TX_WU, h2c->conn); + return ret; +} + +/* try to send pending window update for the connection. It's safe to call it + * with no pending updates. Returns > 0 on success or zero on missing room or + * failure. It may return an error in h2c. + */ +static int h2c_send_conn_wu(struct h2c *h2c) +{ + int ret = 1; + + TRACE_ENTER(H2_EV_TX_FRAME|H2_EV_TX_WU, h2c->conn); + + if (h2c->rcvd_c <= 0) + goto out; + + if (!(h2c->flags & H2_CF_WINDOW_OPENED)) { + /* increase the advertised connection window to 2G on + * first update. + */ + h2c->flags |= H2_CF_WINDOW_OPENED; + h2c->rcvd_c += H2_INITIAL_WINDOW_INCREMENT; + } + + /* send WU for the connection */ + ret = h2c_send_window_update(h2c, 0, h2c->rcvd_c); + if (ret > 0) + h2c->rcvd_c = 0; + + out: + TRACE_LEAVE(H2_EV_TX_FRAME|H2_EV_TX_WU, h2c->conn); + return ret; +} + +/* try to send pending window update for the current dmux stream. It's safe to + * call it with no pending updates. Returns > 0 on success or zero on missing + * room or failure. It may return an error in h2c.
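+ * Example: once the demuxer has consumed 16384 DATA bytes on stream 5, + * rcvd_s=16384 and this function flushes it as WINDOW_UPDATE(sid=5, + * inc=16384) before resetting rcvd_s to 0 (figures illustrative).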
+ */ +static int h2c_send_strm_wu(struct h2c *h2c) +{ + int ret = 1; + + TRACE_ENTER(H2_EV_TX_FRAME|H2_EV_TX_WU, h2c->conn); + + if (h2c->rcvd_s <= 0) + goto out; + + /* send WU for the stream */ + ret = h2c_send_window_update(h2c, h2c->dsi, h2c->rcvd_s); + if (ret > 0) + h2c->rcvd_s = 0; + out: + TRACE_LEAVE(H2_EV_TX_FRAME|H2_EV_TX_WU, h2c->conn); + return ret; +} + +/* try to send an ACK for a ping frame on the connection. Returns > 0 on + * success, 0 on missing data or lack of room. + */ +static int h2c_ack_ping(struct h2c *h2c) +{ + struct buffer *res; + char str[17]; + int ret = 0; + + TRACE_ENTER(H2_EV_TX_FRAME|H2_EV_TX_PING, h2c->conn); + + if (b_data(&h2c->dbuf) < 8) + goto out; + + memcpy(str, + "\x00\x00\x08" /* length : 8 (same payload) */ + "\x06" "\x01" /* type : 6, flags : ACK */ + "\x00\x00\x00\x00" /* stream ID */, 9); + + /* copy the original payload */ + h2_get_buf_bytes(str + 9, 8, &h2c->dbuf, 0); + + res = br_tail(h2c->mbuf); + retry: + if (!h2_get_buf(h2c, res)) { + h2c->flags |= H2_CF_MUX_MALLOC; + h2c->flags |= H2_CF_DEM_MROOM; + goto out; + } + + ret = b_istput(res, ist2(str, 17)); + if (unlikely(ret <= 0)) { + if (!ret) { + if ((res = br_tail_add(h2c->mbuf)) != NULL) + goto retry; + h2c->flags |= H2_CF_MUX_MFULL; + h2c->flags |= H2_CF_DEM_MROOM; + } + else { + h2c_error(h2c, H2_ERR_INTERNAL_ERROR); + ret = 0; + } + } + out: + TRACE_LEAVE(H2_EV_TX_FRAME|H2_EV_TX_PING, h2c->conn); + return ret; +} + +/* processes a WINDOW_UPDATE frame from the demux buffer. Returns > 0 on + * success or zero on missing data. It may return an error in h2c or h2s. The + * caller must have already verified frame length and stream ID validity. + * Described in RFC7540#6.9. + */ +static int h2c_handle_window_update(struct h2c *h2c, struct h2s *h2s) +{ + int32_t inc; + int error; + + TRACE_ENTER(H2_EV_RX_FRAME|H2_EV_RX_WU, h2c->conn); + + /* process full frame only */ + if (b_data(&h2c->dbuf) < h2c->dfl) { + h2c->flags |= H2_CF_DEM_SHORT_READ; + goto out0; + } + + inc = h2_get_n32(&h2c->dbuf, 0); + + if (h2c->dsi != 0) { + /* stream window update */ + + /* it's not an error to receive WU on a closed stream */ + if (h2s->st == H2_SS_CLOSED) + goto done; + + if (!inc) { + TRACE_ERROR("stream WINDOW_UPDATE inc=0", H2_EV_RX_FRAME|H2_EV_RX_WU, h2c->conn, h2s); + error = H2_ERR_PROTOCOL_ERROR; + HA_ATOMIC_INC(&h2c->px_counters->strm_proto_err); + goto strm_err; + } + + if (h2s_mws(h2s) >= 0 && h2s_mws(h2s) + inc < 0) { + TRACE_ERROR("stream WINDOW_UPDATE inc<0", H2_EV_RX_FRAME|H2_EV_RX_WU, h2c->conn, h2s); + error = H2_ERR_FLOW_CONTROL_ERROR; + HA_ATOMIC_INC(&h2c->px_counters->strm_proto_err); + goto strm_err; + } + + h2s->sws += inc; + if (h2s_mws(h2s) > 0 && (h2s->flags & H2_SF_BLK_SFCTL)) { + h2s->flags &= ~H2_SF_BLK_SFCTL; + LIST_DEL_INIT(&h2s->list); + if ((h2s->subs && h2s->subs->events & SUB_RETRY_SEND) || + h2s->flags & (H2_SF_WANT_SHUTR|H2_SF_WANT_SHUTW)) + LIST_APPEND(&h2c->send_list, &h2s->list); + } + } + else { + /* connection window update */ + if (!inc) { + TRACE_ERROR("conn WINDOW_UPDATE inc=0", H2_EV_RX_FRAME|H2_EV_RX_WU, h2c->conn); + error = H2_ERR_PROTOCOL_ERROR; + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + goto conn_err; + } + + if (h2c->mws >= 0 && h2c->mws + inc < 0) { + TRACE_ERROR("conn WINDOW_UPDATE inc<0", H2_EV_RX_FRAME|H2_EV_RX_WU, h2c->conn); + error = H2_ERR_FLOW_CONTROL_ERROR; + goto conn_err; + } + + h2c->mws += inc; + } + + done: + TRACE_LEAVE(H2_EV_RX_FRAME|H2_EV_RX_WU, h2c->conn); + return 1; + + conn_err: +
h2c_error(h2c, error); + out0: + TRACE_DEVEL("leaving on missing data or error", H2_EV_RX_FRAME|H2_EV_RX_WU, h2c->conn); + return 0; + + strm_err: + h2s_error(h2s, error); + h2c->st0 = H2_CS_FRAME_E; + TRACE_DEVEL("leaving on stream error", H2_EV_RX_FRAME|H2_EV_RX_WU, h2c->conn); + return 0; +} + +/* processes a GOAWAY frame, and signals all streams whose ID is greater than + * the last ID. Returns > 0 on success or zero on missing data. The caller must + * have already verified frame length and stream ID validity. Described in + * RFC7540#6.8. + */ +static int h2c_handle_goaway(struct h2c *h2c) +{ + int last; + + TRACE_ENTER(H2_EV_RX_FRAME|H2_EV_RX_GOAWAY, h2c->conn); + /* process full frame only */ + if (b_data(&h2c->dbuf) < h2c->dfl) { + TRACE_DEVEL("leaving on missing data", H2_EV_RX_FRAME|H2_EV_RX_GOAWAY, h2c->conn); + h2c->flags |= H2_CF_DEM_SHORT_READ; + return 0; + } + + last = h2_get_n32(&h2c->dbuf, 0); + h2c->errcode = h2_get_n32(&h2c->dbuf, 4); + if (h2c->last_sid < 0) + h2c->last_sid = last; + h2_wake_some_streams(h2c, last); + TRACE_LEAVE(H2_EV_RX_FRAME|H2_EV_RX_GOAWAY, h2c->conn); + return 1; +} + +/* processes a PRIORITY frame, and either skips it or rejects if it is + * invalid. Returns > 0 on success or zero on missing data. It may return an + * error in h2c. The caller must have already verified frame length and stream + * ID validity. Described in RFC7540#6.3. + */ +static int h2c_handle_priority(struct h2c *h2c) +{ + TRACE_ENTER(H2_EV_RX_FRAME|H2_EV_RX_PRIO, h2c->conn); + + /* process full frame only */ + if (b_data(&h2c->dbuf) < h2c->dfl) { + TRACE_DEVEL("leaving on missing data", H2_EV_RX_FRAME|H2_EV_RX_PRIO, h2c->conn); + h2c->flags |= H2_CF_DEM_SHORT_READ; + return 0; + } + + if (h2_get_n32(&h2c->dbuf, 0) == h2c->dsi) { + /* 7540#5.3 : can't depend on itself */ + TRACE_ERROR("PRIORITY depends on itself", H2_EV_RX_FRAME|H2_EV_RX_WU, h2c->conn); + h2c_error(h2c, H2_ERR_PROTOCOL_ERROR); + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + TRACE_DEVEL("leaving on error", H2_EV_RX_FRAME|H2_EV_RX_PRIO, h2c->conn); + return 0; + } + TRACE_LEAVE(H2_EV_RX_FRAME|H2_EV_RX_PRIO, h2c->conn); + return 1; +} + +/* processes an RST_STREAM frame, and sets the 32-bit error code on the stream. + * Returns > 0 on success or zero on missing data. The caller must have already + * verified frame length and stream ID validity. Described in RFC7540#6.4. + */ +static int h2c_handle_rst_stream(struct h2c *h2c, struct h2s *h2s) +{ + TRACE_ENTER(H2_EV_RX_FRAME|H2_EV_RX_RST|H2_EV_RX_EOI, h2c->conn, h2s); + + /* process full frame only */ + if (b_data(&h2c->dbuf) < h2c->dfl) { + TRACE_DEVEL("leaving on missing data", H2_EV_RX_FRAME|H2_EV_RX_RST|H2_EV_RX_EOI, h2c->conn, h2s); + h2c->flags |= H2_CF_DEM_SHORT_READ; + return 0; + } + + /* late RST, already handled */ + if (h2s->st == H2_SS_CLOSED) { + TRACE_DEVEL("leaving on stream closed", H2_EV_RX_FRAME|H2_EV_RX_RST|H2_EV_RX_EOI, h2c->conn, h2s); + return 1; + } + + h2s->errcode = h2_get_n32(&h2c->dbuf, 0); + h2s_close(h2s); + + if (h2s_sc(h2s)) { + se_fl_set_error(h2s->sd); + h2s_alert(h2s); + } + + h2s->flags |= H2_SF_RST_RCVD; + TRACE_LEAVE(H2_EV_RX_FRAME|H2_EV_RX_RST|H2_EV_RX_EOI, h2c->conn, h2s); + return 1; +} + +/* processes a HEADERS frame. Returns h2s on success or NULL on missing data. + * It may return an error in h2c or h2s. The caller must consider that the + * return value is the new h2s in case one was allocated (most common case). + * Described in RFC7540#6.2. 
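+ * A HEADERS frame carries an HPACK-encoded block whose decoding mutates the + * connection-wide dynamic table, so a block that cannot be decoded leaves + * the decompressor desynchronized for every subsequent stream.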
Most of the + * errors here are reported as connection errors since it's impossible to + * recover from such errors after the compression context has been altered. + */ +static struct h2s *h2c_frt_handle_headers(struct h2c *h2c, struct h2s *h2s) +{ + struct buffer rxbuf = BUF_NULL; + unsigned long long body_len = 0; + uint32_t flags = 0; + int error; + + TRACE_ENTER(H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, h2s); + + if (!b_size(&h2c->dbuf)) { + h2c->flags |= H2_CF_DEM_SHORT_READ; + goto out; // empty buffer + } + + if (b_data(&h2c->dbuf) < h2c->dfl && !b_full(&h2c->dbuf)) { + h2c->flags |= H2_CF_DEM_SHORT_READ; + goto out; // incomplete frame + } + + /* now either the frame is complete or the buffer is complete */ + if (h2s->st != H2_SS_IDLE) { + /* The stream exists/existed, this must be a trailers frame */ + if (h2s->st != H2_SS_CLOSED) { + error = h2c_dec_hdrs(h2c, &h2s->rxbuf, &h2s->flags, &body_len, NULL); + /* unrecoverable error ? */ + if (h2c->st0 >= H2_CS_ERROR) { + TRACE_USER("Unrecoverable error decoding H2 trailers", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_STRM_NEW|H2_EV_STRM_END, h2c->conn, 0, &rxbuf); + sess_log(h2c->conn->owner); + goto out; + } + + if (error == 0) { + /* Demux not blocked because of the stream, it is an incomplete frame */ + if (!(h2c->flags &H2_CF_DEM_BLOCK_ANY)) + h2c->flags |= H2_CF_DEM_SHORT_READ; + goto out; // missing data + } + + if (error < 0) { + /* Failed to decode this frame (e.g. too large request) + * but the HPACK decompressor is still synchronized. + */ + sess_log(h2c->conn->owner); + h2s_error(h2s, H2_ERR_INTERNAL_ERROR); + TRACE_USER("Stream error decoding H2 trailers", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_STRM_NEW|H2_EV_STRM_END, h2c->conn, 0, &rxbuf); + h2c->st0 = H2_CS_FRAME_E; + goto out; + } + goto done; + } + /* the stream was already killed by an RST, let's consume + * the data and send another RST. + */ + error = h2c_dec_hdrs(h2c, &rxbuf, &flags, &body_len, NULL); + sess_log(h2c->conn->owner); + h2s = (struct h2s*)h2_error_stream; + TRACE_USER("rcvd H2 trailers on closed stream", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_STRM_NEW|H2_EV_STRM_END, h2c->conn, h2s, &rxbuf); + goto send_rst; + } + else if (h2c->dsi <= h2c->max_id || !(h2c->dsi & 1)) { + /* RFC7540#5.1.1 stream id > prev ones, and must be odd here */ + error = H2_ERR_PROTOCOL_ERROR; + TRACE_ERROR("HEADERS on invalid stream ID", H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn); + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + sess_log(h2c->conn->owner); + session_inc_http_req_ctr(h2c->conn->owner); + session_inc_http_err_ctr(h2c->conn->owner); + goto conn_err; + } + else if (h2c->flags & H2_CF_DEM_TOOMANY) { + goto out; // IDLE but too many sc still present + } + else if (h2_fe_max_total_streams && + h2c->stream_cnt >= h2_fe_max_total_streams + h2c_max_concurrent_streams(h2c)) { + /* We've already told this client we were going to close a + * while ago and apparently it didn't care, so it's time to + * stop processing its requests for real. 
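+ * Example with h2_fe_max_total_streams=1000 and 100 concurrent streams: + * the GOAWAY emitted around the 1000th stream advertises + * last_sid=max_id+200, and anything beyond 1100 attempted streams is + * refused below with ENHANCE_YOUR_CALM (figures illustrative).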
+ */ + error = H2_ERR_ENHANCE_YOUR_CALM; + TRACE_STATE("Stream limit violated", H2_EV_STRM_SHUT, h2c->conn); + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + sess_log(h2c->conn->owner); + session_inc_http_req_ctr(h2c->conn->owner); + session_inc_http_err_ctr(h2c->conn->owner); + goto conn_err; + } + + error = h2c_dec_hdrs(h2c, &rxbuf, &flags, &body_len, NULL); + + if (error == 0) { + /* No error but missing data for demuxing, it is an incomplete frame */ + if (!(h2c->flags &H2_CF_DEM_BLOCK_ANY)) + h2c->flags |= H2_CF_DEM_SHORT_READ; + goto out; + } + + /* Now we cannot roll back and we won't come back here anymore for this + * stream, so this stream ID is open from a protocol perspective, even + * if incomplete or broken, we want to count it as attempted. + */ + if (h2c->dsi > h2c->max_id) + h2c->max_id = h2c->dsi; + h2c->stream_cnt++; + + if (error < 0) { + /* Failed to decode this stream. This might be due to a + * recoverable error affecting only the stream (e.g. too large + * request for buffer, that leaves the HPACK decompressor still + * synchronized), or a non-recoverable error such as an invalid + * frame type sequence (e.g. other frame type interleaved with + * CONTINUATION), in which case h2c_dec_hdrs() has already set the + * error code in the connection and counted it in the relevant + * stats. We still count a req error in both cases. + */ + sess_log(h2c->conn->owner); + session_inc_http_req_ctr(h2c->conn->owner); + session_inc_http_err_ctr(h2c->conn->owner); + + if (h2c->st0 >= H2_CS_ERROR) { + TRACE_USER("Unrecoverable error decoding H2 request", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_STRM_NEW|H2_EV_STRM_END, h2c->conn, 0, &rxbuf); + goto out; + } + + /* recoverable stream error (e.g. too large request) */ + TRACE_USER("rcvd unparsable H2 request", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_STRM_NEW|H2_EV_STRM_END, h2c->conn, h2s, &rxbuf); + goto strm_err; + } + + TRACE_USER("rcvd H2 request ", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_STRM_NEW, h2c->conn, 0, &rxbuf); + + /* Note: we don't emit any other logs below because if we return + * positively from h2c_frt_stream_new(), the stream will report the error, + * and if we return in error, h2c_frt_stream_new() will emit the error. + * + * Xfer the rxbuf to the stream. On success, the new stream owns the + * rxbuf. On error, it is released here. + */ + h2s = h2c_frt_stream_new(h2c, h2c->dsi, &rxbuf, flags); + if (!h2s) { + h2s = (struct h2s*)h2_refused_stream; + TRACE_USER("refused H2 req. ", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_STRM_NEW|H2_EV_STRM_END, h2c->conn, h2s, &rxbuf); + goto send_rst; + } + + h2s->st = H2_SS_OPEN; + h2s->flags |= flags; + h2s->body_len = body_len; + h2s_propagate_term_flags(h2c, h2s); + + done: + if (h2s->flags & H2_SF_ES_RCVD) { + if (h2s->st == H2_SS_OPEN) + h2s->st = H2_SS_HREM; + else + h2s_close(h2s); + } + TRACE_LEAVE(H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, h2s); + goto leave; + + conn_err: + h2c_error(h2c, error); + out: + h2_release_buf(h2c, &rxbuf); + TRACE_DEVEL("leaving on missing data or error", H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, h2s); + h2s = NULL; + goto leave; + + strm_err: + h2s = (struct h2s*)h2_error_stream; + + send_rst: + /* make the demux send an RST for the current stream. We may only + * do this if we're certain that the HEADERS frame was properly + * decompressed so that the HPACK decoder is still kept up to date.
+ */ + h2_release_buf(h2c, &rxbuf); + h2c->st0 = H2_CS_FRAME_E; + + TRACE_DEVEL("leaving on error", H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, h2s); + + leave: + if (h2_fe_max_total_streams && h2c->stream_cnt >= h2_fe_max_total_streams) { + /* we've had enough streams on this connection, time to renew it. + * In order to gracefully do this, we'll advertise a stream limit + * of the current one plus the max concurrent streams value in the + * GOAWAY frame, so that we're certain that the client is aware of + * the limit before creating a new stream, but knows we won't harm + * the streams in flight. Remember that client stream IDs are odd + * so we apply twice the concurrent streams value to the current + * ID. + */ + if (h2c->last_sid <= 0 || + h2c->last_sid > h2c->max_id + 2 * h2c_max_concurrent_streams(h2c)) { + /* not set yet or was too high */ + h2c->last_sid = h2c->max_id + 2 * h2c_max_concurrent_streams(h2c); + h2c_send_goaway_error(h2c, NULL); + } + } + + return h2s; +} + +/* processes a HEADERS frame. Returns h2s on success or NULL on missing data. + * It may return an error in h2c or h2s. Described in RFC7540#6.2. Most of the + * errors here are reported as connection errors since it's impossible to + * recover from such errors after the compression context has been altered. + */ +static struct h2s *h2c_bck_handle_headers(struct h2c *h2c, struct h2s *h2s) +{ + struct buffer rxbuf = BUF_NULL; + unsigned long long body_len = 0; + uint32_t flags = 0; + int error; + + TRACE_ENTER(H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, h2s); + + if (!b_size(&h2c->dbuf)) { + h2c->flags |= H2_CF_DEM_SHORT_READ; + goto fail; // empty buffer + } + + if (b_data(&h2c->dbuf) < h2c->dfl && !b_full(&h2c->dbuf)) { + h2c->flags |= H2_CF_DEM_SHORT_READ; + goto fail; // incomplete frame + } + + if (h2s->st != H2_SS_CLOSED) { + error = h2c_dec_hdrs(h2c, &h2s->rxbuf, &h2s->flags, &h2s->body_len, h2s->upgrade_protocol); + } + else { + /* the connection was already killed by an RST, let's consume + * the data and send another RST. + */ + error = h2c_dec_hdrs(h2c, &rxbuf, &flags, &body_len, NULL); + h2s = (struct h2s*)h2_error_stream; + h2c->st0 = H2_CS_FRAME_E; + goto send_rst; + } + + /* unrecoverable error ? 
*/ + if (h2c->st0 >= H2_CS_ERROR) { + TRACE_USER("Unrecoverable error decoding H2 HEADERS", H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, h2s); + goto fail; + } + + if (h2s->st != H2_SS_OPEN && h2s->st != H2_SS_HLOC) { + /* RFC7540#5.1 */ + TRACE_ERROR("response HEADERS in invalid state", H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, h2s); + h2s_error(h2s, H2_ERR_STREAM_CLOSED); + h2c->st0 = H2_CS_FRAME_E; + HA_ATOMIC_INC(&h2c->px_counters->strm_proto_err); + goto fail; + } + + if (error <= 0) { + if (error == 0) { + /* Demux not blocked because of the stream, it is an incomplete frame */ + if (!(h2c->flags &H2_CF_DEM_BLOCK_ANY)) + h2c->flags |= H2_CF_DEM_SHORT_READ; + goto fail; // missing data + } + + /* stream error : send RST_STREAM */ + TRACE_ERROR("couldn't decode response HEADERS", H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, h2s); + h2s_error(h2s, H2_ERR_PROTOCOL_ERROR); + h2c->st0 = H2_CS_FRAME_E; + HA_ATOMIC_INC(&h2c->px_counters->strm_proto_err); + goto fail; + } + + if (se_fl_test(h2s->sd, SE_FL_ERROR) && h2s->st < H2_SS_ERROR) + h2s->st = H2_SS_ERROR; + else if (h2s->flags & H2_SF_ES_RCVD) { + if (h2s->st == H2_SS_OPEN) + h2s->st = H2_SS_HREM; + else if (h2s->st == H2_SS_HLOC) + h2s_close(h2s); + } + + /* Unblock busy server h2s waiting for the response headers to validate + * the tunnel establishment or the end of the response of an aborted + * tunnel + */ + if ((h2s->flags & (H2_SF_BODY_TUNNEL|H2_SF_BLK_MBUSY)) == (H2_SF_BODY_TUNNEL|H2_SF_BLK_MBUSY) || + (h2s->flags & (H2_SF_TUNNEL_ABRT|H2_SF_ES_RCVD|H2_SF_BLK_MBUSY)) == (H2_SF_TUNNEL_ABRT|H2_SF_ES_RCVD|H2_SF_BLK_MBUSY)) { + TRACE_STATE("Unblock h2s blocked on tunnel establishment/abort", H2_EV_RX_FRAME|H2_EV_RX_DATA, h2c->conn, h2s); + h2s->flags &= ~H2_SF_BLK_MBUSY; + } + + TRACE_USER("rcvd H2 response ", H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, 0, &h2s->rxbuf); + TRACE_LEAVE(H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, h2s); + return h2s; + fail: + TRACE_DEVEL("leaving on missing data or error", H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, h2s); + return NULL; + + send_rst: + /* make the demux send an RST for the current stream. We may only + * do this if we're certain that the HEADERS frame was properly + * decompressed so that the HPACK decoder is still kept up to date. + */ + h2_release_buf(h2c, &rxbuf); + h2c->st0 = H2_CS_FRAME_E; + + TRACE_USER("rejected H2 response", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_STRM_NEW|H2_EV_STRM_END, h2c->conn, 0, &rxbuf); + TRACE_DEVEL("leaving on error", H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, h2s); + return h2s; +} + +/* processes a DATA frame. Returns > 0 on success or zero on missing data. + * It may return an error in h2c or h2s. Described in RFC7540#6.1. + */ +static int h2c_handle_data(struct h2c *h2c, struct h2s *h2s) +{ + int error; + + TRACE_ENTER(H2_EV_RX_FRAME|H2_EV_RX_DATA, h2c->conn, h2s); + + /* note that empty DATA frames are perfectly valid and sometimes used + * to signal an end of stream (with the ES flag).
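+ * Such a frame is the exact counterpart of what h2_send_empty_data_es() + * emits (len=0, flags=ES) and still goes through all the state checks + * below.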
+ */ + + if (!b_size(&h2c->dbuf) && h2c->dfl) { + h2c->flags |= H2_CF_DEM_SHORT_READ; + goto fail; // empty buffer + } + + if (b_data(&h2c->dbuf) < h2c->dfl && !b_full(&h2c->dbuf)) { + h2c->flags |= H2_CF_DEM_SHORT_READ; + goto fail; // incomplete frame + } + + /* now either the frame is complete or the buffer is complete */ + + if (h2s->st != H2_SS_OPEN && h2s->st != H2_SS_HLOC) { + /* RFC7540#6.1 */ + error = H2_ERR_STREAM_CLOSED; + goto strm_err; + } + + if (!(h2s->flags & H2_SF_HEADERS_RCVD)) { + /* RFC9113#8.1: The header section must be received before the message content */ + TRACE_ERROR("Unexpected DATA frame before the message headers", H2_EV_RX_FRAME|H2_EV_RX_DATA, h2c->conn, h2s); + error = H2_ERR_PROTOCOL_ERROR; + HA_ATOMIC_INC(&h2c->px_counters->strm_proto_err); + goto strm_err; + } + if ((h2s->flags & H2_SF_DATA_CLEN) && (h2c->dfl - h2c->dpl) > h2s->body_len) { + /* RFC7540#8.1.2 */ + TRACE_ERROR("DATA frame larger than content-length", H2_EV_RX_FRAME|H2_EV_RX_DATA, h2c->conn, h2s); + error = H2_ERR_PROTOCOL_ERROR; + HA_ATOMIC_INC(&h2c->px_counters->strm_proto_err); + goto strm_err; + } + if (!(h2c->flags & H2_CF_IS_BACK) && + (h2s->flags & (H2_SF_TUNNEL_ABRT|H2_SF_ES_SENT)) == (H2_SF_TUNNEL_ABRT|H2_SF_ES_SENT) && + ((h2c->dfl - h2c->dpl) || !(h2c->dff & H2_F_DATA_END_STREAM))) { + /* a tunnel attempt was aborted but the client still tries to send some raw data. + * Thus the stream is closed with the CANCEL error. Here we take care that it is not + * an empty DATA frame with the ES flag. The error is only handled if ES was + * already sent to the client because, depending on the scheduling, these data may + * have been sent before the server response but not handled here. + */ + TRACE_ERROR("Request DATA frame for aborted tunnel", H2_EV_RX_FRAME|H2_EV_RX_DATA, h2c->conn, h2s); + error = H2_ERR_CANCEL; + goto strm_err; + } + + if (!h2_frt_transfer_data(h2s)) + goto fail; + + /* call the upper layers to process the frame, then let the upper layer + * notify the stream about any change. + */ + if (!h2s_sc(h2s)) { + /* The upper layer has already closed, this may happen on + * 4xx/redirects during POST, or when receiving a response + * from an H2 server after the client has aborted. + */ + error = H2_ERR_CANCEL; + goto strm_err; + } + + if (h2c->st0 >= H2_CS_ERROR) + goto fail; + + if (h2s->st >= H2_SS_ERROR) { + /* stream error : send RST_STREAM */ + h2c->st0 = H2_CS_FRAME_E; + } + + /* check for completion : the callee will change this to FRAME_A or + * FRAME_H once done.
+ */ + if (h2c->st0 == H2_CS_FRAME_P) + goto fail; + + /* last frame */ + if (h2c->dff & H2_F_DATA_END_STREAM) { + h2s->flags |= H2_SF_ES_RCVD; + if (h2s->st == H2_SS_OPEN) + h2s->st = H2_SS_HREM; + else + h2s_close(h2s); + + if (h2s->flags & H2_SF_DATA_CLEN && h2s->body_len) { + /* RFC7540#8.1.2 */ + TRACE_ERROR("ES on DATA frame before content-length", H2_EV_RX_FRAME|H2_EV_RX_DATA, h2c->conn, h2s); + error = H2_ERR_PROTOCOL_ERROR; + HA_ATOMIC_INC(&h2c->px_counters->strm_proto_err); + goto strm_err; + } + } + + /* Unblock busy server h2s waiting for the end of the response for an + * aborted tunnel + */ + if ((h2c->flags & H2_CF_IS_BACK) && + (h2s->flags & (H2_SF_TUNNEL_ABRT|H2_SF_ES_RCVD|H2_SF_BLK_MBUSY)) == (H2_SF_TUNNEL_ABRT|H2_SF_ES_RCVD|H2_SF_BLK_MBUSY)) { + TRACE_STATE("Unblock h2s blocked on tunnel abort", H2_EV_RX_FRAME|H2_EV_RX_DATA, h2c->conn, h2s); + h2s->flags &= ~H2_SF_BLK_MBUSY; + } + + TRACE_LEAVE(H2_EV_RX_FRAME|H2_EV_RX_DATA, h2c->conn, h2s); + return 1; + + strm_err: + h2s_error(h2s, error); + h2c->st0 = H2_CS_FRAME_E; + fail: + TRACE_DEVEL("leaving on missing data or error", H2_EV_RX_FRAME|H2_EV_RX_DATA, h2c->conn, h2s); + return 0; +} + +/* check that the current frame described in h2c->{dsi,dft,dfl,dff,...} is + * valid for the current stream state. This is needed only after parsing the + * frame header but in practice it can be performed at any time during + * H2_CS_FRAME_P since no state transition happens there. Returns >0 on success + * or 0 in case of error, in which case either h2s or h2c will carry an error. + */ +static int h2_frame_check_vs_state(struct h2c *h2c, struct h2s *h2s) +{ + TRACE_ENTER(H2_EV_RX_FRAME|H2_EV_RX_FHDR, h2c->conn, h2s); + + if (h2s->st == H2_SS_IDLE && + h2c->dft != H2_FT_HEADERS && h2c->dft != H2_FT_PRIORITY) { + /* RFC7540#5.1: any frame other than HEADERS or PRIORITY in + * this state MUST be treated as a connection error + */ + TRACE_ERROR("invalid frame type for IDLE state", H2_EV_RX_FRAME|H2_EV_RX_FHDR, h2c->conn, h2s); + h2c_error(h2c, H2_ERR_PROTOCOL_ERROR); + if (!h2c->nb_streams && !(h2c->flags & H2_CF_IS_BACK)) { + /* only log if no other stream can report the error */ + sess_log(h2c->conn->owner); + } + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + TRACE_DEVEL("leaving in error (idle&!hdrs&!prio)", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_PROTO_ERR, h2c->conn, h2s); + return 0; + } + + if (h2s->st == H2_SS_IDLE && (h2c->flags & H2_CF_IS_BACK)) { + /* only PUSH_PROMISE would be permitted here */ + TRACE_ERROR("invalid frame type for IDLE state (back)", H2_EV_RX_FRAME|H2_EV_RX_FHDR, h2c->conn, h2s); + h2c_error(h2c, H2_ERR_PROTOCOL_ERROR); + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + TRACE_DEVEL("leaving in error (idle&back)", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_PROTO_ERR, h2c->conn, h2s); + return 0; + } + + if (h2s->st == H2_SS_HREM && h2c->dft != H2_FT_WINDOW_UPDATE && + h2c->dft != H2_FT_RST_STREAM && h2c->dft != H2_FT_PRIORITY) { + /* RFC7540#5.1: any frame other than WU/PRIO/RST in + * this state MUST be treated as a stream error. + * 6.2, 6.6 and 6.10 further mandate that HEADERS/ + * PUSH_PROMISE/CONTINUATION cause connection errors. 
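+ * (H2_FT_HDR_MASK tested below covers exactly these header-block + * carrying frame types, hence the connection-vs-stream error split.)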
+ */ + if (h2_ft_bit(h2c->dft) & H2_FT_HDR_MASK) { + TRACE_ERROR("invalid frame type for HREM state", H2_EV_RX_FRAME|H2_EV_RX_FHDR, h2c->conn, h2s); + h2c_error(h2c, H2_ERR_PROTOCOL_ERROR); + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + } + else { + h2s_error(h2s, H2_ERR_STREAM_CLOSED); + } + TRACE_DEVEL("leaving in error (hrem&!wu&!rst&!prio)", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_PROTO_ERR, h2c->conn, h2s); + return 0; + } + + /* Below the management of frames received in closed state is a + * bit hackish because the spec makes strong differences between + * streams closed by receiving RST, sending RST, and seeing ES + * in both directions. In addition to this, the creation of a + * new stream reusing the identifier of a closed one will be + * detected here. Given that we cannot keep track of all closed + * streams forever, we consider that unknown closed streams were + * closed on RST received, which allows us to respond with an + * RST without breaking the connection (eg: to abort a transfer). + * Some frames have to be silently ignored as well. + */ + if (h2s->st == H2_SS_CLOSED && h2c->dsi) { + if (!(h2c->flags & H2_CF_IS_BACK) && h2_ft_bit(h2c->dft) & H2_FT_HDR_MASK) { + /* #5.1.1: The identifier of a newly + * established stream MUST be numerically + * greater than all streams that the initiating + * endpoint has opened or reserved. This + * governs streams that are opened using a + * HEADERS frame and streams that are reserved + * using PUSH_PROMISE. An endpoint that + * receives an unexpected stream identifier + * MUST respond with a connection error. + */ + h2c_error(h2c, H2_ERR_STREAM_CLOSED); + TRACE_DEVEL("leaving in error (closed&hdrmask)", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_PROTO_ERR, h2c->conn, h2s); + return 0; + } + + if (h2s->flags & H2_SF_RST_RCVD && + !(h2_ft_bit(h2c->dft) & (H2_FT_HDR_MASK | H2_FT_RST_STREAM_BIT | H2_FT_PRIORITY_BIT | H2_FT_WINDOW_UPDATE_BIT))) { + /* RFC7540#5.1:closed: an endpoint that + * receives any frame other than PRIORITY after + * receiving a RST_STREAM MUST treat that as a + * stream error of type STREAM_CLOSED. + * + * Note that old streams fall into this category + * and will lead to an RST being sent. + * + * However, we cannot generalize this to all frame types. Those + * carrying compression state must still be processed before + * being dropped or we'll desynchronize the decoder. This can + * happen with request trailers received after sending an + * RST_STREAM, or with header/trailers responses received after + * sending RST_STREAM (aborted stream). + * + * In addition, since our CLOSED streams always carry the + * RST_RCVD bit, we don't want to accidentally catch valid + * frames for a closed stream, i.e. RST/PRIO/WU. + */ + h2s_error(h2s, H2_ERR_STREAM_CLOSED); + h2c->st0 = H2_CS_FRAME_E; + TRACE_DEVEL("leaving in error (rst_rcvd&!hdrmask)", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_PROTO_ERR, h2c->conn, h2s); + return 0; + } + + /* RFC7540#5.1:closed: if this state is reached as a + * result of sending a RST_STREAM frame, the peer that + * receives the RST_STREAM might have already sent + * frames on the stream that cannot be withdrawn. An + * endpoint MUST ignore frames that it receives on + * closed streams after it has sent a RST_STREAM + * frame. An endpoint MAY choose to limit the period + * over which it ignores frames and treat frames that + * arrive after this time as being in error. 
+ */ + if (h2s->id && !(h2s->flags & H2_SF_RST_SENT)) { + /* RFC7540#5.1:closed: any frame other than + * PRIO/WU/RST in this state MUST be treated as + * a connection error + */ + if (h2c->dft != H2_FT_RST_STREAM && + h2c->dft != H2_FT_PRIORITY && + h2c->dft != H2_FT_WINDOW_UPDATE) { + h2c_error(h2c, H2_ERR_STREAM_CLOSED); + TRACE_DEVEL("leaving in error (rst_sent&!rst&!prio&!wu)", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_PROTO_ERR, h2c->conn, h2s); + return 0; + } + } + } + TRACE_LEAVE(H2_EV_RX_FRAME|H2_EV_RX_FHDR, h2c->conn, h2s); + return 1; +} + +/* Reverse the connection <h2c>. Common operations are done for both active and + * passive reversal. Timeouts are inverted and H2_CF_IS_BACK is set or unset + * depending on the reversal direction. + * + * For active reversal, only minor steps are required. The connection should + * then be accepted by its listener before being able to use it for transfers. + * + * For passive reversal, connection is inserted in its targeted server idle + * pool. It can thus be reused immediately for future transfers on this server. + * + * Returns 1 on success else 0. + */ +static int h2_conn_reverse(struct h2c *h2c) +{ + struct connection *conn = h2c->conn; + + TRACE_ENTER(H2_EV_H2C_WAKE, h2c->conn); + + if (conn_reverse(conn)) { + TRACE_ERROR("reverse connection failed", H2_EV_H2C_WAKE, conn); + goto err; + } + + TRACE_USER("reverse connection", H2_EV_H2C_WAKE, conn); + + /* Check the connection new side after reversal. */ + if (conn_is_back(conn)) { + struct server *srv = __objt_server(h2c->conn->target); + struct proxy *prx = srv->proxy; + + h2c->flags |= H2_CF_IS_BACK; + + h2c->shut_timeout = h2c->timeout = prx->timeout.server; + if (tick_isset(prx->timeout.serverfin)) + h2c->shut_timeout = prx->timeout.serverfin; + + h2c->px_counters = EXTRA_COUNTERS_GET(prx->extra_counters_be, + &h2_stats_module); + + HA_ATOMIC_OR(&h2c->wait_event.tasklet->state, TASK_F_USR1); + xprt_set_idle(conn, conn->xprt, conn->xprt_ctx); + if (!srv_add_to_idle_list(srv, conn, 1)) + goto err; + } + else { + struct listener *l = __objt_listener(h2c->conn->target); + struct proxy *prx = l->bind_conf->frontend; + + h2c->flags &= ~H2_CF_IS_BACK; + + h2c->shut_timeout = h2c->timeout = prx->timeout.client; + if (tick_isset(prx->timeout.clientfin)) + h2c->shut_timeout = prx->timeout.clientfin; + + h2c->px_counters = EXTRA_COUNTERS_GET(prx->extra_counters_fe, + &h2_stats_module); + + proxy_inc_fe_cum_sess_ver_ctr(l, prx, 2); + + BUG_ON(LIST_INLIST(&h2c->conn->stopping_list)); + LIST_APPEND(&mux_stopping_data[tid].list, + &h2c->conn->stopping_list); + } + + /* Check if stream creation is initially forbidden. This is the case + * for active preconnect until reversal is done. + */ + if (conn_reverse_in_preconnect(h2c->conn)) { + TRACE_DEVEL("prevent stream demux until accept is done", H2_EV_H2C_WAKE, conn); + h2c->flags |= H2_CF_DEM_TOOMANY; + } + + /* If only the new side has a defined timeout, task must be allocated. + * On the contrary, if only old side has a timeout, it must be freed. + */ + if (!h2c->task && tick_isset(h2c->timeout)) { + h2c->task = task_new_here(); + if (!h2c->task) + goto err; + + h2c->task->process = h2_timeout_task; + h2c->task->context = h2c; + } + else if (!tick_isset(h2c->timeout)) { + task_destroy(h2c->task); + h2c->task = NULL; + } + + /* Requeue task if instantiated with the new timeout value. 
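+ * (tick_add(now_ms, h2c->timeout) below yields the new absolute + * expiration date; a timeout left at TICK_ETERNITY would have led to + * the task being destroyed above instead.)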
*/ + if (h2c->task) { + h2c->task->expire = tick_add(now_ms, h2c->timeout); + task_queue(h2c->task); + } + + TRACE_LEAVE(H2_EV_H2C_WAKE, h2c->conn); + return 1; + + err: + h2c_error(h2c, H2_ERR_INTERNAL_ERROR); + TRACE_DEVEL("leaving on error", H2_EV_H2C_WAKE); + return 0; +} + +/* process Rx frames to be demultiplexed */ +static void h2_process_demux(struct h2c *h2c) +{ + struct h2s *h2s = NULL, *tmp_h2s; + struct h2_fh hdr; + unsigned int padlen = 0; + int32_t old_iw = h2c->miw; + + TRACE_ENTER(H2_EV_H2C_WAKE, h2c->conn); + + if (h2c->st0 >= H2_CS_ERROR) + goto out; + + if (unlikely(h2c->st0 < H2_CS_FRAME_H)) { + if (h2c->st0 == H2_CS_PREFACE) { + TRACE_STATE("expecting preface", H2_EV_RX_PREFACE, h2c->conn); + if (h2c->flags & H2_CF_IS_BACK) + goto out; + + if (unlikely(h2c_frt_recv_preface(h2c) <= 0)) { + /* RFC7540#3.5: a GOAWAY frame MAY be omitted */ + if (h2c->st0 == H2_CS_ERROR) { + TRACE_PROTO("failed to receive preface", H2_EV_RX_PREFACE|H2_EV_PROTO_ERR, h2c->conn); + h2c->st0 = H2_CS_ERROR2; + if (b_data(&h2c->dbuf) || + !(((const struct session *)h2c->conn->owner)->fe->options & (PR_O_NULLNOLOG|PR_O_IGNORE_PRB))) + sess_log(h2c->conn->owner); + } + goto done; + } + TRACE_PROTO("received preface", H2_EV_RX_PREFACE, h2c->conn); + + h2c->max_id = 0; + TRACE_STATE("switching to SETTINGS1", H2_EV_RX_PREFACE, h2c->conn); + h2c->st0 = H2_CS_SETTINGS1; + } + + if (h2c->st0 == H2_CS_SETTINGS1) { + /* ensure that what is pending is a valid SETTINGS frame + * without an ACK. + */ + TRACE_STATE("expecting settings", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_RX_SETTINGS, h2c->conn); + if (!h2_get_frame_hdr(&h2c->dbuf, &hdr)) { + /* RFC7540#3.5: a GOAWAY frame MAY be omitted */ + h2c->flags |= H2_CF_DEM_SHORT_READ; + if (h2c->st0 == H2_CS_ERROR) { + TRACE_ERROR("failed to receive settings", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_RX_SETTINGS|H2_EV_PROTO_ERR, h2c->conn); + h2c->st0 = H2_CS_ERROR2; + if (!(h2c->flags & H2_CF_IS_BACK)) + sess_log(h2c->conn->owner); + } + goto done; + } + + if (hdr.sid || hdr.ft != H2_FT_SETTINGS || hdr.ff & H2_F_SETTINGS_ACK) { + /* RFC7540#3.5: a GOAWAY frame MAY be omitted */ + TRACE_ERROR("unexpected frame type or flags", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_RX_SETTINGS|H2_EV_PROTO_ERR, h2c->conn); + h2c_error(h2c, H2_ERR_PROTOCOL_ERROR); + h2c->st0 = H2_CS_ERROR2; + if (!(h2c->flags & H2_CF_IS_BACK)) + sess_log(h2c->conn->owner); + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + goto done; + } + + if ((int)hdr.len < 0 || (int)hdr.len > global.tune.bufsize) { + /* RFC7540#3.5: a GOAWAY frame MAY be omitted */ + TRACE_ERROR("invalid settings frame length", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_RX_SETTINGS|H2_EV_PROTO_ERR, h2c->conn); + h2c_error(h2c, H2_ERR_FRAME_SIZE_ERROR); + h2c->st0 = H2_CS_ERROR2; + if (!(h2c->flags & H2_CF_IS_BACK)) + sess_log(h2c->conn->owner); + goto done; + } + + /* that's OK, switch to FRAME_P to process it. This is + * a SETTINGS frame whose header has already been + * deleted above. 
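+ * For reference (RFC7540#4.1), that header is length(24), type(8), + * flags(8), R(1)+stream-id(31), so a client's initial SETTINGS frame + * carrying two settings would start with e.g. + * 00 00 0c 04 00 00 00 00 00 (12-octet payload, type 0x4, sid 0).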
+ */ + padlen = 0; + HA_ATOMIC_INC(&h2c->px_counters->settings_rcvd); + goto new_frame; + } + } + + /* process as many incoming frames as possible below */ + while (1) { + int ret = 0; + + if (!b_data(&h2c->dbuf)) { + TRACE_DEVEL("no more Rx data", H2_EV_RX_FRAME, h2c->conn); + h2c->flags |= H2_CF_DEM_SHORT_READ; + break; + } + + if (h2c->st0 >= H2_CS_ERROR) { + TRACE_STATE("end of connection reported", H2_EV_RX_FRAME|H2_EV_RX_EOI, h2c->conn); + break; + } + + if (h2c->st0 == H2_CS_FRAME_H) { + TRACE_STATE("expecting H2 frame header", H2_EV_RX_FRAME|H2_EV_RX_FHDR, h2c->conn); + if (!h2_peek_frame_hdr(&h2c->dbuf, 0, &hdr)) { + h2c->flags |= H2_CF_DEM_SHORT_READ; + break; + } + + if ((int)hdr.len < 0 || (int)hdr.len > global.tune.bufsize) { + TRACE_ERROR("invalid H2 frame length", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_PROTO_ERR, h2c->conn); + h2c_error(h2c, H2_ERR_FRAME_SIZE_ERROR); + if (!h2c->nb_streams && !(h2c->flags & H2_CF_IS_BACK)) { + /* only log if no other stream can report the error */ + sess_log(h2c->conn->owner); + } + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + break; + } + + if (h2c->rcvd_s && h2c->dsi != hdr.sid) { + /* changed stream with a pending WU, need to + * send it now. + */ + TRACE_PROTO("sending stream WINDOW_UPDATE frame on stream switch", H2_EV_TX_FRAME|H2_EV_TX_WU, h2c->conn); + ret = h2c_send_strm_wu(h2c); + if (ret <= 0) + break; + } + + padlen = 0; + if (h2_ft_bit(hdr.ft) & H2_FT_PADDED_MASK && hdr.ff & H2_F_PADDED) { + /* If the frame is padded (HEADERS, PUSH_PROMISE or DATA), + * we read the pad length and drop it from the remaining + * payload (one byte + the 9 remaining ones = 10 total + * removed), so we have a frame payload starting after the + * pad len. Flow controlled frames (DATA) also count the + * padlen in the flow control, so it must be adjusted. + */ + if (hdr.len < 1) { + TRACE_ERROR("invalid H2 padded frame length", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_PROTO_ERR, h2c->conn); + h2c_error(h2c, H2_ERR_FRAME_SIZE_ERROR); + if (!(h2c->flags & H2_CF_IS_BACK)) + sess_log(h2c->conn->owner); + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + goto done; + } + hdr.len--; + + if (b_data(&h2c->dbuf) < 10) { + h2c->flags |= H2_CF_DEM_SHORT_READ; + break; // missing padlen + } + + padlen = *(uint8_t *)b_peek(&h2c->dbuf, 9); + + if (padlen > hdr.len) { + TRACE_ERROR("invalid H2 padding length", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_PROTO_ERR, h2c->conn); + /* RFC7540#6.1 : pad length = length of + * frame payload or greater => error. 
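+ * E.g. with an advertised payload of 6 octets (1 padlen octet + 5 + * remaining ones), padlen may be at most 5 (all padding, no data); + * any larger value could not fit its own pad octets and is rejected.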
+ */ + h2c_error(h2c, H2_ERR_PROTOCOL_ERROR); + if (!(h2c->flags & H2_CF_IS_BACK)) + sess_log(h2c->conn->owner); + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + goto done; + } + + if (h2_ft_bit(hdr.ft) & H2_FT_FC_MASK) { + h2c->rcvd_c++; + h2c->rcvd_s++; + } + b_del(&h2c->dbuf, 1); + } + h2_skip_frame_hdr(&h2c->dbuf); + + new_frame: + h2c->dfl = hdr.len; + h2c->dsi = hdr.sid; + h2c->dft = hdr.ft; + h2c->dff = hdr.ff; + h2c->dpl = padlen; + h2c->flags |= H2_CF_DEM_IN_PROGRESS; + TRACE_STATE("rcvd H2 frame header, switching to FRAME_P state", H2_EV_RX_FRAME|H2_EV_RX_FHDR, h2c->conn); + h2c->st0 = H2_CS_FRAME_P; + + /* check for minimum basic frame format validity */ + ret = h2_frame_check(h2c->dft, 1, h2c->dsi, h2c->dfl, global.tune.bufsize); + if (ret != H2_ERR_NO_ERROR) { + TRACE_ERROR("received invalid H2 frame header", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_PROTO_ERR, h2c->conn); + h2c_error(h2c, ret); + if (!(h2c->flags & H2_CF_IS_BACK)) + sess_log(h2c->conn->owner); + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + goto done; + } + + /* transition to HEADERS frame ends the keep-alive idle + * timer and starts the http-request idle delay. It uses + * the idle_start timer as well. + */ + if (hdr.ft == H2_FT_HEADERS) + h2c->idle_start = now_ms; + } + + /* Only H2_CS_FRAME_P, H2_CS_FRAME_A and H2_CS_FRAME_E here. + * H2_CS_FRAME_P indicates an incomplete previous operation + * (most often the first attempt) and requires some validity + * checks for the frame and the current state. The two other + * ones are set after completion (or abortion) and must skip + * validity checks. + */ + tmp_h2s = h2c_st_by_id(h2c, h2c->dsi); + + if (tmp_h2s != h2s && h2s && h2s_sc(h2s) && + (b_data(&h2s->rxbuf) || + h2c_read0_pending(h2c) || + h2s->st == H2_SS_CLOSED || + (h2s->flags & H2_SF_ES_RCVD) || + se_fl_test(h2s->sd, SE_FL_ERROR | SE_FL_ERR_PENDING | SE_FL_EOS))) { + /* we may have to signal the upper layers */ + TRACE_DEVEL("notifying stream before switching SID", H2_EV_RX_FRAME|H2_EV_STRM_WAKE, h2c->conn, h2s); + se_fl_set(h2s->sd, SE_FL_RCV_MORE); + h2s_notify_recv(h2s); + } + h2s = tmp_h2s; + + if (h2c->st0 == H2_CS_FRAME_E || + (h2c->st0 == H2_CS_FRAME_P && !h2_frame_check_vs_state(h2c, h2s))) { + TRACE_PROTO("stream error reported", H2_EV_RX_FRAME|H2_EV_PROTO_ERR, h2c->conn, h2s); + goto strm_err; + } + + switch (h2c->dft) { + case H2_FT_SETTINGS: + if (h2c->st0 == H2_CS_FRAME_P) { + TRACE_PROTO("receiving H2 SETTINGS frame", H2_EV_RX_FRAME|H2_EV_RX_SETTINGS, h2c->conn, h2s); + ret = h2c_handle_settings(h2c); + } + HA_ATOMIC_INC(&h2c->px_counters->settings_rcvd); + + if (h2c->st0 == H2_CS_FRAME_A) { + TRACE_PROTO("sending H2 SETTINGS ACK frame", H2_EV_TX_FRAME|H2_EV_RX_SETTINGS, h2c->conn, h2s); + ret = h2c_ack_settings(h2c); + + if (ret > 0 && conn_is_reverse(h2c->conn)) { + /* Initiate connection reversal after SETTINGS reception. 
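+ * (This is the "reverse HTTP" case: once the SETTINGS exchange proves + * the peer speaks H2, h2_conn_reverse() above flips the connection's + * side; see that function for the details.)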
*/ + ret = h2_conn_reverse(h2c); + } + } + break; + + case H2_FT_PING: + if (h2c->st0 == H2_CS_FRAME_P) { + TRACE_PROTO("receiving H2 PING frame", H2_EV_RX_FRAME|H2_EV_RX_PING, h2c->conn, h2s); + ret = h2c_handle_ping(h2c); + } + + if (h2c->st0 == H2_CS_FRAME_A) { + TRACE_PROTO("sending H2 PING ACK frame", H2_EV_TX_FRAME|H2_EV_TX_SETTINGS, h2c->conn, h2s); + ret = h2c_ack_ping(h2c); + } + break; + + case H2_FT_WINDOW_UPDATE: + if (h2c->st0 == H2_CS_FRAME_P) { + TRACE_PROTO("receiving H2 WINDOW_UPDATE frame", H2_EV_RX_FRAME|H2_EV_RX_WU, h2c->conn, h2s); + ret = h2c_handle_window_update(h2c, h2s); + } + break; + + case H2_FT_CONTINUATION: + /* RFC7540#6.10: CONTINUATION may only be preceded by + * a HEADERS/PUSH_PROMISE/CONTINUATION frame. These + * frames' parsers consume all following CONTINUATION + * frames so this one is out of sequence. + */ + TRACE_ERROR("received unexpected H2 CONTINUATION frame", H2_EV_RX_FRAME|H2_EV_RX_CONT|H2_EV_H2C_ERR, h2c->conn, h2s); + h2c_error(h2c, H2_ERR_PROTOCOL_ERROR); + if (!(h2c->flags & H2_CF_IS_BACK)) + sess_log(h2c->conn->owner); + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + goto done; + + case H2_FT_HEADERS: + if (h2c->st0 == H2_CS_FRAME_P) { + TRACE_PROTO("receiving H2 HEADERS frame", H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, h2s); + if (h2c->flags & H2_CF_IS_BACK) + tmp_h2s = h2c_bck_handle_headers(h2c, h2s); + else + tmp_h2s = h2c_frt_handle_headers(h2c, h2s); + if (tmp_h2s) { + h2s = tmp_h2s; + ret = 1; + } + } + HA_ATOMIC_INC(&h2c->px_counters->headers_rcvd); + break; + + case H2_FT_DATA: + if (h2c->st0 == H2_CS_FRAME_P) { + TRACE_PROTO("receiving H2 DATA frame", H2_EV_RX_FRAME|H2_EV_RX_DATA, h2c->conn, h2s); + ret = h2c_handle_data(h2c, h2s); + } + HA_ATOMIC_INC(&h2c->px_counters->data_rcvd); + + if (h2c->st0 == H2_CS_FRAME_A) { + /* rcvd_s will suffice to trigger the sending of a WU */ + h2c->st0 = H2_CS_FRAME_H; + } + break; + + case H2_FT_PRIORITY: + if (h2c->st0 == H2_CS_FRAME_P) { + TRACE_PROTO("receiving H2 PRIORITY frame", H2_EV_RX_FRAME|H2_EV_RX_PRIO, h2c->conn, h2s); + ret = h2c_handle_priority(h2c); + } + break; + + case H2_FT_RST_STREAM: + if (h2c->st0 == H2_CS_FRAME_P) { + TRACE_PROTO("receiving H2 RST_STREAM frame", H2_EV_RX_FRAME|H2_EV_RX_RST|H2_EV_RX_EOI, h2c->conn, h2s); + ret = h2c_handle_rst_stream(h2c, h2s); + } + HA_ATOMIC_INC(&h2c->px_counters->rst_stream_rcvd); + break; + + case H2_FT_GOAWAY: + if (h2c->st0 == H2_CS_FRAME_P) { + TRACE_PROTO("receiving H2 GOAWAY frame", H2_EV_RX_FRAME|H2_EV_RX_GOAWAY, h2c->conn, h2s); + ret = h2c_handle_goaway(h2c); + } + HA_ATOMIC_INC(&h2c->px_counters->goaway_rcvd); + break; + + /* implement all extra frame types here */ + default: + TRACE_PROTO("receiving H2 ignored frame", H2_EV_RX_FRAME, h2c->conn, h2s); + /* drop frames that we ignore. They may be larger than + * the buffer so we drain all of their contents until + * we reach the end. + */ + ret = MIN(b_data(&h2c->dbuf), h2c->dfl); + b_del(&h2c->dbuf, ret); + h2c->dfl -= ret; + ret = h2c->dfl == 0; + } + + strm_err: + /* We may have to send an RST if not done yet */ + if (h2s->st == H2_SS_ERROR) { + TRACE_STATE("stream error, switching to FRAME_E", H2_EV_RX_FRAME|H2_EV_H2S_ERR, h2c->conn, h2s); + h2c->st0 = H2_CS_FRAME_E; + } + + if (h2c->st0 == H2_CS_FRAME_E) { + TRACE_PROTO("sending H2 RST_STREAM frame", H2_EV_TX_FRAME|H2_EV_TX_RST|H2_EV_TX_EOI, h2c->conn, h2s); + ret = h2c_send_rst_stream(h2c, h2s); + } + + /* error or missing data condition met above ? 
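+ * (the handlers above return ret <= 0 both for incomplete frames and + * for errors; in the error cases the state was already switched to + * FRAME_E or a connection error was reported)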
*/ + if (ret <= 0) + break; + + if (h2c->st0 != H2_CS_FRAME_H) { + if (h2c->dfl) + TRACE_DEVEL("skipping remaining frame payload", H2_EV_RX_FRAME, h2c->conn, h2s); + ret = MIN(b_data(&h2c->dbuf), h2c->dfl); + b_del(&h2c->dbuf, ret); + h2c->dfl -= ret; + if (!h2c->dfl) { + h2c->flags &= ~H2_CF_DEM_IN_PROGRESS; + TRACE_STATE("switching to FRAME_H", H2_EV_RX_FRAME|H2_EV_RX_FHDR, h2c->conn); + h2c->st0 = H2_CS_FRAME_H; + } + } + } + + if (h2c->rcvd_s > 0 && + !(h2c->flags & (H2_CF_MUX_MFULL | H2_CF_DEM_MROOM))) { + TRACE_PROTO("sending stream WINDOW_UPDATE frame", H2_EV_TX_FRAME|H2_EV_TX_WU, h2c->conn, h2s); + h2c_send_strm_wu(h2c); + } + + if (h2c->rcvd_c > 0 && + !(h2c->flags & (H2_CF_MUX_MFULL | H2_CF_DEM_MROOM))) { + TRACE_PROTO("sending H2 WINDOW_UPDATE frame", H2_EV_TX_FRAME|H2_EV_TX_WU, h2c->conn); + h2c_send_conn_wu(h2c); + } + + done: + if (h2c->st0 >= H2_CS_ERROR || (h2c->flags & H2_CF_DEM_SHORT_READ)) { + if (h2c->flags & H2_CF_RCVD_SHUT) + h2c->flags |= H2_CF_END_REACHED; + } + + if (h2s && h2s_sc(h2s) && + (b_data(&h2s->rxbuf) || + h2c_read0_pending(h2c) || + h2s->st == H2_SS_CLOSED || + (h2s->flags & H2_SF_ES_RCVD) || + se_fl_test(h2s->sd, SE_FL_ERROR | SE_FL_ERR_PENDING | SE_FL_EOS))) { + /* we may have to signal the upper layers */ + TRACE_DEVEL("notifying stream before switching SID", H2_EV_RX_FRAME|H2_EV_H2S_WAKE, h2c->conn, h2s); + se_fl_set(h2s->sd, SE_FL_RCV_MORE); + h2s_notify_recv(h2s); + } + + if (old_iw != h2c->miw) { + TRACE_STATE("notifying streams about SFCTL increase", H2_EV_RX_FRAME|H2_EV_H2S_WAKE, h2c->conn); + h2c_unblock_sfctl(h2c); + } + + h2c_restart_reading(h2c, 0); + out: + TRACE_LEAVE(H2_EV_H2C_WAKE, h2c->conn); + return; +} + +/* resume each h2s eligible for sending in list head <head> */ +static void h2_resume_each_sending_h2s(struct h2c *h2c, struct list *head) +{ + struct h2s *h2s, *h2s_back; + + TRACE_ENTER(H2_EV_H2C_SEND|H2_EV_H2S_WAKE, h2c->conn); + + list_for_each_entry_safe(h2s, h2s_back, head, list) { + if (h2c->mws <= 0 || + h2c->flags & H2_CF_MUX_BLOCK_ANY || + h2c->st0 >= H2_CS_ERROR) + break; + + h2s->flags &= ~H2_SF_BLK_ANY; + + if (h2s->flags & H2_SF_NOTIFIED) + continue; + + /* If the sender changed his mind and unsubscribed, let's just + * remove the stream from the send_list. + */ + if (!(h2s->flags & (H2_SF_WANT_SHUTR|H2_SF_WANT_SHUTW)) && + (!h2s->subs || !(h2s->subs->events & SUB_RETRY_SEND))) { + LIST_DEL_INIT(&h2s->list); + continue; + } + + if (h2s->subs && h2s->subs->events & SUB_RETRY_SEND) { + h2s->flags |= H2_SF_NOTIFIED; + tasklet_wakeup(h2s->subs->tasklet); + h2s->subs->events &= ~SUB_RETRY_SEND; + if (!h2s->subs->events) + h2s->subs = NULL; + } + else if (h2s->flags & (H2_SF_WANT_SHUTR|H2_SF_WANT_SHUTW)) { + tasklet_wakeup(h2s->shut_tl); + } + } + + TRACE_LEAVE(H2_EV_H2C_SEND|H2_EV_H2S_WAKE, h2c->conn); +} + +/* removes a stream from the list it may be in. If a stream has recently been + * appended to the send_list, it might have been waiting on this one when + * entering h2_snd_buf() and expecting it to complete before starting to send + * in turn. For this reason we check (and clear) H2_CF_WAIT_INLIST to detect + * this condition, and we try to resume sending streams if it happens. Note + * that we don't need to do it for fctl_list as this list is relevant before + * (only consulted after) a window update on the connection, and not because + * of any competition with other streams. 
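+ * A typical scenario: stream B enters h2_snd_buf() while stream A is + * still queued in the send_list; B subscribes and H2_CF_WAIT_INLIST + * is set, so removing A below resumes B and the other waiters.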
+ */ +static inline void h2_remove_from_list(struct h2s *h2s) +{ + struct h2c *h2c = h2s->h2c; + + if (!LIST_INLIST(&h2s->list)) + return; + + LIST_DEL_INIT(&h2s->list); + if (h2c->flags & H2_CF_WAIT_INLIST) { + h2c->flags &= ~H2_CF_WAIT_INLIST; + h2_resume_each_sending_h2s(h2c, &h2c->send_list); + } +} + +/* process Tx frames from streams to be multiplexed. Returns > 0 if it reached + * the end. + */ +static int h2_process_mux(struct h2c *h2c) +{ + TRACE_ENTER(H2_EV_H2C_WAKE, h2c->conn); + + if (unlikely(h2c->st0 < H2_CS_FRAME_H)) { + if (unlikely(h2c->st0 == H2_CS_PREFACE && (h2c->flags & H2_CF_IS_BACK))) { + if (unlikely(h2c_bck_send_preface(h2c) <= 0)) { + /* RFC7540#3.5: a GOAWAY frame MAY be omitted */ + if (h2c->st0 == H2_CS_ERROR) + h2c->st0 = H2_CS_ERROR2; + goto fail; + } + h2c->st0 = H2_CS_SETTINGS1; + } + /* need to wait for the other side */ + if (h2c->st0 < H2_CS_FRAME_H) + goto done; + } + + /* start by sending possibly pending window updates */ + if (h2c->rcvd_s > 0 && + !(h2c->flags & (H2_CF_MUX_MFULL | H2_CF_MUX_MALLOC)) && + h2c_send_strm_wu(h2c) < 0) + goto fail; + + if (h2c->rcvd_c > 0 && + !(h2c->flags & (H2_CF_MUX_MFULL | H2_CF_MUX_MALLOC)) && + h2c_send_conn_wu(h2c) < 0) + goto fail; + + /* First we always process the flow control list because the streams + * waiting there were already elected for immediate emission but were + * blocked just on this. + */ + h2c->flags &= ~H2_CF_WAIT_INLIST; + h2_resume_each_sending_h2s(h2c, &h2c->fctl_list); + h2_resume_each_sending_h2s(h2c, &h2c->send_list); + + fail: + if (unlikely(h2c->st0 >= H2_CS_ERROR)) { + if (h2c->st0 == H2_CS_ERROR) { + if (h2c->max_id >= 0) { + h2c_send_goaway_error(h2c, NULL); + if (h2c->flags & H2_CF_MUX_BLOCK_ANY) + goto out0; + } + + h2c->st0 = H2_CS_ERROR2; // sent (or failed hard) ! + } + } + done: + TRACE_LEAVE(H2_EV_H2C_WAKE, h2c->conn); + return 1; + out0: + TRACE_DEVEL("leaving in blocked situation", H2_EV_H2C_WAKE, h2c->conn); + return 0; +} + + +/* Attempt to read data, and subscribe if none available. + * The function returns 1 if data has been received, otherwise zero. + */ +static int h2_recv(struct h2c *h2c) +{ + struct connection *conn = h2c->conn; + struct buffer *buf; + int max; + size_t ret; + + TRACE_ENTER(H2_EV_H2C_RECV, h2c->conn); + + if (h2c->wait_event.events & SUB_RETRY_RECV) { + TRACE_DEVEL("leaving on sub_recv", H2_EV_H2C_RECV, h2c->conn); + return (b_data(&h2c->dbuf)); + } + + if (!h2_recv_allowed(h2c)) { + TRACE_DEVEL("leaving on !recv_allowed", H2_EV_H2C_RECV, h2c->conn); + return 1; + } + + buf = h2_get_buf(h2c, &h2c->dbuf); + if (!buf) { + h2c->flags |= H2_CF_DEM_DALLOC; + TRACE_DEVEL("leaving on !alloc", H2_EV_H2C_RECV, h2c->conn); + return 0; + } + + if (!b_data(buf)) { + /* try to pre-align the buffer like the + * rxbufs will be to optimize memory copies. We'll make + * sure that the frame header lands at the end of the + * HTX block to alias it upon recv. We cannot use the + * head because rcv_buf() will realign the buffer if + * it's empty. Thus we cheat and pretend we already + * have a few bytes there. + */ + max = buf_room_for_htx_data(buf) + 9; + buf->head = sizeof(struct htx) - 9; + } + else + max = b_room(buf); + + ret = max ? 
conn->xprt->rcv_buf(conn, conn->xprt_ctx, buf, max, 0) : 0; + + if (max && !ret && h2_recv_allowed(h2c)) { + TRACE_DATA("failed to receive data, subscribing", H2_EV_H2C_RECV, h2c->conn); + conn->xprt->subscribe(conn, conn->xprt_ctx, SUB_RETRY_RECV, &h2c->wait_event); + } else if (ret) { + TRACE_DATA("received data", H2_EV_H2C_RECV, h2c->conn, 0, 0, (void*)(long)ret); + h2c->flags &= ~H2_CF_DEM_SHORT_READ; + } + + if (conn_xprt_read0_pending(h2c->conn)) { + TRACE_DATA("received read0", H2_EV_H2C_RECV, h2c->conn); + h2c->flags |= H2_CF_RCVD_SHUT; + } + if (h2c->conn->flags & CO_FL_ERROR && !b_data(&h2c->dbuf)) { + TRACE_DATA("connection error", H2_EV_H2C_RECV, h2c->conn); + h2c->flags |= H2_CF_ERROR; + } + + if (!b_data(buf)) { + h2_release_buf(h2c, &h2c->dbuf); + goto end; + } + + if (b_data(buf) == buf->size) { + h2c->flags |= H2_CF_DEM_DFULL; + TRACE_STATE("demux buffer full", H2_EV_H2C_RECV|H2_EV_H2C_BLK, h2c->conn); + } + + end: + TRACE_LEAVE(H2_EV_H2C_RECV, h2c->conn); + return !!ret || (h2c->flags & (H2_CF_RCVD_SHUT|H2_CF_ERROR)); +} + +/* Try to send data if possible. + * The function returns 1 if data have been sent, otherwise zero. + */ +static int h2_send(struct h2c *h2c) +{ + struct connection *conn = h2c->conn; + int done; + int sent = 0; + + TRACE_ENTER(H2_EV_H2C_SEND, h2c->conn); + + if (h2c->flags & (H2_CF_ERROR|H2_CF_ERR_PENDING)) { + TRACE_DEVEL("leaving on error", H2_EV_H2C_SEND, h2c->conn); + if (h2c->flags & H2_CF_END_REACHED) + h2c->flags |= H2_CF_ERROR; + b_reset(br_tail(h2c->mbuf)); + h2c->idle_start = now_ms; + return 1; + } + + /* This loop is quite simple : it tries to fill as much as it can from + * pending streams into the existing buffer until it's reportedly full + * or the end of send requests is reached. Then it tries to send this + * buffer's contents out, marks it not full if at least one byte could + * be sent, and tries again. + * + * The snd_buf() function normally takes a "flags" argument which may + * be made of a combination of CO_SFL_MSG_MORE to indicate that more + * data immediately comes and CO_SFL_STREAMER to indicate that the + * connection is streaming lots of data (used to increase TLS record + * size at the expense of latency). The former can be sent any time + * there's a buffer full flag, as it indicates at least one stream + * attempted to send and failed so there are pending data. An + * alternative would be to set it as long as there's an active stream + * but that would be problematic for ACKs until we have an absolute + * guarantee that all waiters have at least one byte to send. The + * latter should possibly not be set for now. + */ + + done = 0; + while (!(conn->flags & CO_FL_WAIT_XPRT) && !done) { + unsigned int flags = 0; + unsigned int released = 0; + struct buffer *buf; + uint to_send; + + /* fill as much as we can into the current buffer */ + while (((h2c->flags & (H2_CF_MUX_MFULL|H2_CF_MUX_MALLOC)) == 0) && !done) + done = h2_process_mux(h2c); + + if (h2c->flags & H2_CF_MUX_MALLOC) + done = 1; // we won't go further without extra buffers + + if ((conn->flags & (CO_FL_SOCK_WR_SH|CO_FL_ERROR)) || + (h2c->flags & H2_CF_GOAWAY_FAILED)) + break; + + if (h2c->flags & (H2_CF_MUX_MFULL | H2_CF_DEM_MROOM)) + flags |= CO_SFL_MSG_MORE; + + to_send = br_count(h2c->mbuf); + if (to_send > 1) { + /* usually we want to emit small TLS records to speed + * up the decoding on the client. That's what is being + * done by default. 
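+ * (a record must be fully received and deciphered before its bytes + * can be delivered, so smaller records improve time-to-first-byte on + * slow links).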
However if there is more than one + * buffer being allocated, we're streaming large data + * so we stick to large records. + */ + flags |= CO_SFL_STREAMER; + } + + for (buf = br_head(h2c->mbuf); b_size(buf); buf = br_del_head(h2c->mbuf)) { + if (b_data(buf)) { + int ret = conn->xprt->snd_buf(conn, conn->xprt_ctx, buf, b_data(buf), + flags | (to_send > 1 ? CO_SFL_MSG_MORE : 0)); + if (!ret) { + done = 1; + break; + } + sent = 1; + to_send--; + TRACE_DATA("sent data", H2_EV_H2C_SEND, h2c->conn, 0, buf, (void*)(long)ret); + b_del(buf, ret); + if (b_data(buf)) { + done = 1; + break; + } + } + b_free(buf); + released++; + } + + if (released) + offer_buffers(NULL, released); + + /* Normally if we wrote at least one byte, the buffer is not full + * anymore. However, if it was marked full because all of its + * buffers were used, we don't want to instantly wake up many + * streams because we'd create a thundering herd effect, notably + * when data are flushed in small chunks. Instead we wait for + * the buffer to be decongested again before allowing to send + * again. It also has the added benefit of not pumping more + * data from the other side when it's known that this one is + * still congested. + */ + if (sent && br_single(h2c->mbuf)) + h2c->flags &= ~(H2_CF_MUX_MFULL | H2_CF_DEM_MROOM); + } + + if (conn->flags & CO_FL_ERROR) { + h2c->flags |= H2_CF_ERR_PENDING; + if (h2c->flags & H2_CF_END_REACHED) + h2c->flags |= H2_CF_ERROR; + b_reset(br_tail(h2c->mbuf)); + } + + /* We're not full anymore, so we can wake any tasks that are waiting + * for us. + */ + if (!(h2c->flags & (H2_CF_MUX_MFULL | H2_CF_DEM_MROOM)) && h2c->st0 >= H2_CS_FRAME_H) { + h2c->flags &= ~H2_CF_WAIT_INLIST; + h2_resume_each_sending_h2s(h2c, &h2c->send_list); + } + + /* We're done, no more to send */ + if (!(conn->flags & CO_FL_WAIT_XPRT) && !br_data(h2c->mbuf)) { + TRACE_DEVEL("leaving with everything sent", H2_EV_H2C_SEND, h2c->conn); + if (h2c->flags & H2_CF_MBUF_HAS_DATA && !h2c->nb_sc) { + h2c->flags &= ~H2_CF_MBUF_HAS_DATA; + h2c->idle_start = now_ms; + } + goto end; + } + + if (!(conn->flags & CO_FL_ERROR) && !(h2c->wait_event.events & SUB_RETRY_SEND)) { + TRACE_STATE("more data to send, subscribing", H2_EV_H2C_SEND, h2c->conn); + conn->xprt->subscribe(conn, conn->xprt_ctx, SUB_RETRY_SEND, &h2c->wait_event); + } + TRACE_DEVEL("leaving with some data left to send", H2_EV_H2C_SEND, h2c->conn); +end: + return sent || (h2c->flags & (H2_CF_ERR_PENDING|H2_CF_ERROR)); +} + +/* this is the tasklet referenced in h2c->wait_event.tasklet */ +struct task *h2_io_cb(struct task *t, void *ctx, unsigned int state) +{ + struct connection *conn; + struct tasklet *tl = (struct tasklet *)t; + int conn_in_list; + struct h2c *h2c = ctx; + int ret = 0; + + if (state & TASK_F_USR1) { + /* the tasklet was idling on an idle connection, it might have + * been stolen, let's be careful! + */ + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + if (t->context == NULL) { + /* The connection has been taken over by another thread, + * we're no longer responsible for it, so just free the + * tasklet, and do nothing.
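+ * (the thread stealing an idle connection sets the old tasklet's + * context to NULL under the idle conns lock, which is precisely the + * condition tested above)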
+ */ + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + tasklet_free(tl); + t = NULL; + goto leave; + } + conn = h2c->conn; + TRACE_ENTER(H2_EV_H2C_WAKE, conn); + + /* Remove the connection from the list, to be sure nobody attempts + * to use it while we handle the I/O events + */ + conn_in_list = conn->flags & CO_FL_LIST_MASK; + if (conn_in_list) + conn_delete_from_tree(conn); + + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + } else { + /* we're certain the connection was not in an idle list */ + conn = h2c->conn; + TRACE_ENTER(H2_EV_H2C_WAKE, conn); + conn_in_list = 0; + } + + if (!(h2c->wait_event.events & SUB_RETRY_SEND)) + ret = h2_send(h2c); + if (!(h2c->wait_event.events & SUB_RETRY_RECV)) + ret |= h2_recv(h2c); + if (ret || b_data(&h2c->dbuf)) + ret = h2_process(h2c); + + /* If we were in an idle list, we want to add it back into it, + * unless h2_process() returned -1, which means it has destroyed + * the connection (testing !ret is enough, if h2_process() wasn't + * called then ret will be 0 anyway). + */ + if (ret < 0) + t = NULL; + + if (!ret && conn_in_list) { + struct server *srv = objt_server(conn->target); + + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + _srv_add_idle(srv, conn, conn_in_list == CO_FL_SAFE_LIST); + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + } + +leave: + TRACE_LEAVE(H2_EV_H2C_WAKE); + return t; +} + +/* callback called on any event by the connection handler. + * It applies changes and returns zero, or < 0 if it wants immediate + * destruction of the connection (which normally does not happen in h2). + */ +static int h2_process(struct h2c *h2c) +{ + struct connection *conn = h2c->conn; + + TRACE_ENTER(H2_EV_H2C_WAKE, conn); + + if (!(h2c->flags & H2_CF_DEM_BLOCK_ANY) && + (b_data(&h2c->dbuf) || (h2c->flags & H2_CF_RCVD_SHUT))) { + h2_process_demux(h2c); + + if (h2c->st0 >= H2_CS_ERROR || (h2c->flags & H2_CF_ERROR)) + b_reset(&h2c->dbuf); + + if (!b_full(&h2c->dbuf)) + h2c->flags &= ~H2_CF_DEM_DFULL; + } + h2_send(h2c); + + if (unlikely(h2c->proxy->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) && !(h2c->flags & H2_CF_IS_BACK)) { + int send_goaway = 1; + /* If a close-spread-time option is set, we want to avoid + * closing all the active HTTP2 connections at once so we add a + * random factor that will spread the closing. + */ + if (tick_isset(global.close_spread_end)) { + int remaining_window = tick_remain(now_ms, global.close_spread_end); + if (remaining_window) { + /* This should increase the closing rate the + * further along the window we are. */ + send_goaway = (remaining_window <= statistical_prng_range(global.close_spread_time)); + } + } + else if (global.tune.options & GTUNE_DISABLE_ACTIVE_CLOSE) + send_goaway = 0; /* let the client close his connection himself */ + /* frontend is stopping, reload likely in progress, let's try + * to announce a graceful shutdown if not yet done. We don't + * care if it fails, it will be tried again later. + */ + if (send_goaway) { + TRACE_STATE("proxy stopped, sending GOAWAY", H2_EV_H2C_WAKE|H2_EV_TX_FRAME, conn); + if (!(h2c->flags & (H2_CF_GOAWAY_SENT|H2_CF_GOAWAY_FAILED))) { + if (h2c->last_sid < 0) + h2c->last_sid = (1U << 31) - 1; + h2c_send_goaway_error(h2c, NULL); + } + } + } + + /* + * If we received early data, and the handshake is done, wake + * any stream that was waiting for it.
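+ * (this is the TLS 0-RTT case: requests may arrive before handshake + * completion, and streams flagged SE_FL_WAIT_FOR_HS were paused until + * the handshake confirms the client)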
+ */ + if (!(h2c->flags & H2_CF_WAIT_FOR_HS) && + (conn->flags & (CO_FL_EARLY_SSL_HS | CO_FL_WAIT_XPRT | CO_FL_EARLY_DATA)) == CO_FL_EARLY_DATA) { + struct eb32_node *node; + struct h2s *h2s; + + h2c->flags |= H2_CF_WAIT_FOR_HS; + node = eb32_lookup_ge(&h2c->streams_by_id, 1); + + while (node) { + h2s = container_of(node, struct h2s, by_id); + if (se_fl_test(h2s->sd, SE_FL_WAIT_FOR_HS)) + h2s_notify_recv(h2s); + node = eb32_next(node); + } + } + + if ((h2c->flags & H2_CF_ERROR) || h2c_read0_pending(h2c) || + h2c->st0 == H2_CS_ERROR2 || h2c->flags & H2_CF_GOAWAY_FAILED || + (eb_is_empty(&h2c->streams_by_id) && h2c->last_sid >= 0 && + h2c->max_id >= h2c->last_sid)) { + h2_wake_some_streams(h2c, 0); + + if (eb_is_empty(&h2c->streams_by_id)) { + /* no more stream, kill the connection now */ + h2_release(h2c); + TRACE_DEVEL("leaving after releasing the connection", H2_EV_H2C_WAKE); + return -1; + } + + /* connections in error must be removed from the idle lists */ + if (conn->flags & CO_FL_LIST_MASK) { + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + conn_delete_from_tree(conn); + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + } + } + else if (h2c->st0 == H2_CS_ERROR) { + /* connections in error must be removed from the idle lists */ + if (conn->flags & CO_FL_LIST_MASK) { + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + conn_delete_from_tree(conn); + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + } + } + + if (!b_data(&h2c->dbuf)) + h2_release_buf(h2c, &h2c->dbuf); + + if (h2c->st0 == H2_CS_ERROR2 || (h2c->flags & H2_CF_GOAWAY_FAILED) || + (h2c->st0 != H2_CS_ERROR && + !br_data(h2c->mbuf) && + (h2c->mws <= 0 || LIST_ISEMPTY(&h2c->fctl_list)) && + ((h2c->flags & H2_CF_MUX_BLOCK_ANY) || LIST_ISEMPTY(&h2c->send_list)))) + h2_release_mbuf(h2c); + + h2c_update_timeout(h2c); + h2_send(h2c); + TRACE_LEAVE(H2_EV_H2C_WAKE, conn); + return 0; +} + +/* wake-up function called by the connection layer (mux_ops.wake) */ +static int h2_wake(struct connection *conn) +{ + struct h2c *h2c = conn->ctx; + int ret; + + TRACE_ENTER(H2_EV_H2C_WAKE, conn); + ret = h2_process(h2c); + if (ret >= 0) { + h2_wake_some_streams(h2c, 0); + + /* For active reverse connection, an explicit check is required if an + * error is pending to propagate the error as demux process is blocked + * until reversal. This allows to quickly close the connection and + * prepare a new one. + */ + if (unlikely(conn_reverse_in_preconnect(conn)) && h2c_is_dead(h2c)) { + TRACE_DEVEL("leaving and killing dead connection", H2_EV_STRM_END, h2c->conn); + h2_release(h2c); + } + } + + TRACE_LEAVE(H2_EV_H2C_WAKE); + return ret; +} + +/* Connection timeout management. The principle is that if there's no receipt + * nor sending for a certain amount of time, the connection is closed. If the + * MUX buffer still has lying data or is not allocatable, the connection is + * immediately killed. If it's allocatable and empty, we attempt to send a + * GOAWAY frame. + */ +struct task *h2_timeout_task(struct task *t, void *context, unsigned int state) +{ + struct h2c *h2c = context; + int expired = tick_is_expired(t->expire, now_ms); + + TRACE_ENTER(H2_EV_H2C_WAKE, h2c ? h2c->conn : NULL); + + if (h2c) { + /* Make sure nobody stole the connection from us */ + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + + /* Somebody already stole the connection from us, so we should not + * free it, we just have to free the task. 
+ */ + if (!t->context) { + h2c = NULL; + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + goto do_leave; + } + + if (!expired) { + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + TRACE_DEVEL("leaving (not expired)", H2_EV_H2C_WAKE, h2c->conn); + return t; + } + + if (!h2c_may_expire(h2c)) { + /* we do still have streams but all of them are idle, waiting + * for the data layer, so we must not enforce the timeout here. + */ + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + t->expire = TICK_ETERNITY; + return t; + } + + /* We're about to destroy the connection, so make sure nobody attempts + * to steal it from us. + */ + if (h2c->conn->flags & CO_FL_LIST_MASK) + conn_delete_from_tree(h2c->conn); + + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + } + +do_leave: + task_destroy(t); + + if (!h2c) { + /* resources were already deleted */ + TRACE_DEVEL("leaving (no more h2c)", H2_EV_H2C_WAKE); + return NULL; + } + + h2c->task = NULL; + h2c_error(h2c, H2_ERR_NO_ERROR); + h2_wake_some_streams(h2c, 0); + + if (br_data(h2c->mbuf)) { + /* don't even try to send a GOAWAY, the buffer is stuck */ + h2c->flags |= H2_CF_GOAWAY_FAILED; + } + + /* try to send but no need to insist */ + h2c->last_sid = h2c->max_id; + if (h2c_send_goaway_error(h2c, NULL) <= 0) + h2c->flags |= H2_CF_GOAWAY_FAILED; + + if (br_data(h2c->mbuf) && !(h2c->flags & H2_CF_GOAWAY_FAILED) && conn_xprt_ready(h2c->conn)) { + unsigned int released = 0; + struct buffer *buf; + + for (buf = br_head(h2c->mbuf); b_size(buf); buf = br_del_head(h2c->mbuf)) { + if (b_data(buf)) { + int ret = h2c->conn->xprt->snd_buf(h2c->conn, h2c->conn->xprt_ctx, buf, b_data(buf), 0); + if (!ret) + break; + b_del(buf, ret); + if (b_data(buf)) + break; + b_free(buf); + released++; + } + } + + if (released) + offer_buffers(NULL, released); + } + + /* in any case this connection must not be considered idle anymore */ + if (h2c->conn->flags & CO_FL_LIST_MASK) { + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + conn_delete_from_tree(h2c->conn); + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + } + + /* either we can release everything now or it will be done later once + * the last stream closes. + */ + if (eb_is_empty(&h2c->streams_by_id)) + h2_release(h2c); + + TRACE_LEAVE(H2_EV_H2C_WAKE); + return NULL; +} + + +/*******************************************/ +/* functions below are used by the streams */ +/*******************************************/ + +/* + * Attach a new stream to a connection + * (Used for outgoing connections) + */ +static int h2_attach(struct connection *conn, struct sedesc *sd, struct session *sess) +{ + struct h2s *h2s; + struct h2c *h2c = conn->ctx; + + TRACE_ENTER(H2_EV_H2S_NEW, conn); + h2s = h2c_bck_stream_new(h2c, sd->sc, sess); + if (!h2s) { + TRACE_DEVEL("leaving on stream creation failure", H2_EV_H2S_NEW|H2_EV_H2S_ERR, conn); + return -1; + } + + /* the connection is not idle anymore, let's mark this */ + HA_ATOMIC_AND(&h2c->wait_event.tasklet->state, ~TASK_F_USR1); + xprt_set_used(h2c->conn, h2c->conn->xprt, h2c->conn->xprt_ctx); + + TRACE_LEAVE(H2_EV_H2S_NEW, conn, h2s); + return 0; +} + +/* Retrieves the first valid stream connector from this connection, or returns + * NULL. We have to scan because we may have some orphan streams. It might be + * beneficial to scan backwards from the end to reduce the likelihood of + * finding orphans.
+ */ +static struct stconn *h2_get_first_sc(const struct connection *conn) +{ + struct h2c *h2c = conn->ctx; + struct h2s *h2s; + struct eb32_node *node; + + node = eb32_first(&h2c->streams_by_id); + while (node) { + h2s = container_of(node, struct h2s, by_id); + if (h2s_sc(h2s)) + return h2s_sc(h2s); + node = eb32_next(node); + } + return NULL; +} + +static int h2_ctl(struct connection *conn, enum mux_ctl_type mux_ctl, void *output) +{ + int ret = 0; + struct h2c *h2c = conn->ctx; + + switch (mux_ctl) { + case MUX_CTL_STATUS: + /* Only consider the mux to be ready if we're done with + * the preface and settings, and we had no error. + */ + if (h2c->st0 >= H2_CS_FRAME_H && h2c->st0 < H2_CS_ERROR) + ret |= MUX_STATUS_READY; + return ret; + case MUX_CTL_EXIT_STATUS: + return MUX_ES_UNKNOWN; + + case MUX_CTL_REVERSE_CONN: + BUG_ON(h2c->flags & H2_CF_IS_BACK); + + TRACE_DEVEL("connection reverse done, restart demux", H2_EV_H2C_WAKE, h2c->conn); + h2c->flags &= ~H2_CF_DEM_TOOMANY; + tasklet_wakeup(h2c->wait_event.tasklet); + return 0; + + default: + return -1; + } +} + +static int h2_sctl(struct stconn *sc, enum mux_sctl_type mux_sctl, void *output) +{ + int ret = 0; + struct h2s *h2s = __sc_mux_strm(sc); + + switch (mux_sctl) { + case MUX_SCTL_SID: + if (output) + *((int64_t *)output) = h2s->id; + return ret; + + default: + return -1; + } +} + +/* + * Destroy the mux and the associated connection, if it is no longer used + */ +static void h2_destroy(void *ctx) +{ + struct h2c *h2c = ctx; + + TRACE_ENTER(H2_EV_H2C_END, h2c->conn); + if (eb_is_empty(&h2c->streams_by_id)) { + BUG_ON(h2c->conn->ctx != h2c); + h2_release(h2c); + } + TRACE_LEAVE(H2_EV_H2C_END); +} + +/* + * Detach the stream from the connection and possibly release the connection. + */ +static void h2_detach(struct sedesc *sd) +{ + struct h2s *h2s = sd->se; + struct h2c *h2c; + struct session *sess; + + TRACE_ENTER(H2_EV_STRM_END, h2s ? h2s->h2c->conn : NULL, h2s); + + if (!h2s) { + TRACE_LEAVE(H2_EV_STRM_END); + return; + } + + /* there's no txbuf so we're certain not to be able to send anything */ + h2s->flags &= ~H2_SF_NOTIFIED; + + sess = h2s->sess; + h2c = h2s->h2c; + h2c->nb_sc--; + if (!h2c->nb_sc && !br_data(h2c->mbuf)) + h2c->idle_start = now_ms; + + if ((h2c->flags & (H2_CF_IS_BACK|H2_CF_DEM_TOOMANY)) == H2_CF_DEM_TOOMANY && + !h2_frt_has_too_many_sc(h2c)) { + /* frontend connection was blocking new streams creation */ + h2c->flags &= ~H2_CF_DEM_TOOMANY; + h2c_restart_reading(h2c, 1); + } + + /* this stream may be blocked waiting for some data to leave (possibly + * an ES or RST frame), so orphan it in this case. + */ + if (!(h2c->flags & (H2_CF_ERR_PENDING|H2_CF_ERROR)) && + (h2c->st0 < H2_CS_ERROR) && + (h2s->flags & (H2_SF_BLK_MBUSY | H2_SF_BLK_MROOM | H2_SF_BLK_MFCTL)) && + ((h2s->flags & (H2_SF_WANT_SHUTR | H2_SF_WANT_SHUTW)) || h2s->subs)) { + TRACE_DEVEL("leaving on stream blocked", H2_EV_STRM_END|H2_EV_H2S_BLK, h2c->conn, h2s); + /* refresh the timeout if none was active, so that the last + * leaving stream may arm it. + */ + if (h2c->task && !tick_isset(h2c->task->expire)) + h2c_update_timeout(h2c); + return; + } + + if ((h2c->flags & H2_CF_DEM_BLOCK_ANY && h2s->id == h2c->dsi)) { + /* unblock the connection if it was blocked on this + * stream. 
+ */ + h2c->flags &= ~H2_CF_DEM_BLOCK_ANY; + h2c->flags &= ~H2_CF_MUX_BLOCK_ANY; + h2c_restart_reading(h2c, 1); + } + + h2s_destroy(h2s); + + if (h2c->flags & H2_CF_IS_BACK) { + if (!(h2c->flags & (H2_CF_RCVD_SHUT|H2_CF_ERR_PENDING|H2_CF_ERROR))) { + if (h2c->conn->flags & CO_FL_PRIVATE) { + /* Add the connection in the session server list, if not already done */ + if (!session_add_conn(sess, h2c->conn, h2c->conn->target)) { + h2c->conn->owner = NULL; + if (eb_is_empty(&h2c->streams_by_id)) { + h2c->conn->mux->destroy(h2c); + TRACE_DEVEL("leaving on error after killing outgoing connection", H2_EV_STRM_END|H2_EV_H2C_ERR); + return; + } + } + if (eb_is_empty(&h2c->streams_by_id)) { + if (session_check_idle_conn(h2c->conn->owner, h2c->conn) != 0) { + /* At this point either the connection is destroyed, or it's been added to the server idle list, just stop */ + TRACE_DEVEL("leaving without reusable idle connection", H2_EV_STRM_END); + return; + } + } + } + else { + if (eb_is_empty(&h2c->streams_by_id)) { + /* If the connection is owned by the session, first remove it + * from its list + */ + if (h2c->conn->owner) { + session_unown_conn(h2c->conn->owner, h2c->conn); + h2c->conn->owner = NULL; + } + + /* mark that the tasklet may lose its context to another thread and + * that the handler needs to check it under the idle conns lock. + */ + HA_ATOMIC_OR(&h2c->wait_event.tasklet->state, TASK_F_USR1); + xprt_set_idle(h2c->conn, h2c->conn->xprt, h2c->conn->xprt_ctx); + + if (!srv_add_to_idle_list(objt_server(h2c->conn->target), h2c->conn, 1)) { + /* The server doesn't want it, let's kill the connection right away */ + h2c->conn->mux->destroy(h2c); + TRACE_DEVEL("leaving on error after killing outgoing connection", H2_EV_STRM_END|H2_EV_H2C_ERR); + return; + } + /* At this point, the connection has been added to the + * server idle list, so another thread may already have + * hijacked it, so we can't do anything with it. + */ + TRACE_DEVEL("reusable idle connection", H2_EV_STRM_END); + return; + + } + else if (!h2c->conn->hash_node->node.node.leaf_p && + h2_avail_streams(h2c->conn) > 0 && objt_server(h2c->conn->target) && + !LIST_INLIST(&h2c->conn->session_list)) { + srv_add_to_avail_list(__objt_server(h2c->conn->target), h2c->conn); + } + } + } + } + + /* We don't want to close right now unless we're removing the + * last stream, and either the connection is in error, or it + * reached the ID already specified in a GOAWAY frame received + * or sent (as seen by last_sid >= 0). + */ + if (h2c_is_dead(h2c)) { + /* no more stream will come, kill it now */ + TRACE_DEVEL("leaving and killing dead connection", H2_EV_STRM_END, h2c->conn); + h2_release(h2c); + } + else if (h2c->task) { + h2c_update_timeout(h2c); + TRACE_DEVEL("leaving, refreshing connection's timeout", H2_EV_STRM_END, h2c->conn); + } + else + TRACE_DEVEL("leaving", H2_EV_STRM_END, h2c->conn); +} + +/* Performs a synchronous or asynchronous shutr(). */ +static void h2_do_shutr(struct h2s *h2s) +{ + struct h2c *h2c = h2s->h2c; + + if (h2s->st == H2_SS_CLOSED) + goto done; + + TRACE_ENTER(H2_EV_STRM_SHUT, h2c->conn, h2s); + + if (h2s->flags & H2_SF_WANT_SHUTW) + goto add_to_list; + + /* a connstream may require us to immediately kill the whole connection + * for example because of a "tcp-request content reject" rule that is + * normally used to limit abuse. In this case we schedule a goaway to + * close the connection. 
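+ * An illustrative rule of this kind (any abuse-limiting condition + * works the same way): + * tcp-request content reject if { src -f /etc/haproxy/abusers.lst }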
+ */ + if (se_fl_test(h2s->sd, SE_FL_KILL_CONN) && + !(h2c->flags & (H2_CF_GOAWAY_SENT|H2_CF_GOAWAY_FAILED))) { + TRACE_STATE("stream wants to kill the connection", H2_EV_STRM_SHUT, h2c->conn, h2s); + h2c_error(h2c, H2_ERR_ENHANCE_YOUR_CALM); + h2s_error(h2s, H2_ERR_ENHANCE_YOUR_CALM); + } + else if (!(h2s->flags & H2_SF_HEADERS_SENT)) { + /* Nothing has been sent yet for this stream, so reset with + * REFUSED_STREAM error to let the client retry the + * request. + */ + TRACE_STATE("no headers sent yet, trying a retryable abort", H2_EV_STRM_SHUT, h2c->conn, h2s); + h2s_error(h2s, H2_ERR_REFUSED_STREAM); + } + else { + /* a final response was already provided, we don't want this + * stream anymore. This may happen when the server responds + * before the end of an upload and closes quickly (redirect, + * deny, ...) + */ + h2s_error(h2s, H2_ERR_CANCEL); + } + + if (!(h2s->flags & H2_SF_RST_SENT) && + h2s_send_rst_stream(h2c, h2s) <= 0) + goto add_to_list; + + if (!(h2c->wait_event.events & SUB_RETRY_SEND)) + tasklet_wakeup(h2c->wait_event.tasklet); + h2s_close(h2s); + done: + h2s->flags &= ~H2_SF_WANT_SHUTR; + TRACE_LEAVE(H2_EV_STRM_SHUT, h2c->conn, h2s); + return; +add_to_list: + /* Let the handler know we want to shutr, and add ourselves to the + * most relevant list if not yet done. h2_deferred_shut() will be + * automatically called via the shut_tl tasklet when there's room + * again. + */ + h2s->flags |= H2_SF_WANT_SHUTR; + if (!LIST_INLIST(&h2s->list)) { + if (h2s->flags & H2_SF_BLK_MFCTL) + LIST_APPEND(&h2c->fctl_list, &h2s->list); + else if (h2s->flags & (H2_SF_BLK_MBUSY|H2_SF_BLK_MROOM)) + LIST_APPEND(&h2c->send_list, &h2s->list); + } + TRACE_LEAVE(H2_EV_STRM_SHUT, h2c->conn, h2s); + return; +} + +/* Performs a synchronous or asynchronous shutw(). */ +static void h2_do_shutw(struct h2s *h2s) +{ + struct h2c *h2c = h2s->h2c; + + if (h2s->st == H2_SS_HLOC || h2s->st == H2_SS_CLOSED) + goto done; + + TRACE_ENTER(H2_EV_STRM_SHUT, h2c->conn, h2s); + + if (h2s->st != H2_SS_ERROR && + (h2s->flags & (H2_SF_HEADERS_SENT | H2_SF_MORE_HTX_DATA)) == H2_SF_HEADERS_SENT) { + /* we can cleanly close using an empty data frame only after headers + * and if no more data is expected to be sent. + */ + if (!(h2s->flags & (H2_SF_ES_SENT|H2_SF_RST_SENT)) && + h2_send_empty_data_es(h2s) <= 0) + goto add_to_list; + + if (h2s->st == H2_SS_HREM) + h2s_close(h2s); + else + h2s->st = H2_SS_HLOC; + } else { + /* a connstream may require us to immediately kill the whole connection + * for example because of a "tcp-request content reject" rule that is + * normally used to limit abuse. In this case we schedule a goaway to + * close the connection. + */ + if (se_fl_test(h2s->sd, SE_FL_KILL_CONN) && + !(h2c->flags & (H2_CF_GOAWAY_SENT|H2_CF_GOAWAY_FAILED))) { + TRACE_STATE("stream wants to kill the connection", H2_EV_STRM_SHUT, h2c->conn, h2s); + h2c_error(h2c, H2_ERR_ENHANCE_YOUR_CALM); + h2s_error(h2s, H2_ERR_ENHANCE_YOUR_CALM); + } + else if (h2s->flags & H2_SF_MORE_HTX_DATA) { + /* some unsent data were pending (e.g. abort during an upload), + * let's send a CANCEL. + */ + TRACE_STATE("shutw before end of data, sending CANCEL", H2_EV_STRM_SHUT, h2c->conn, h2s); + h2s_error(h2s, H2_ERR_CANCEL); + } + else { + /* Nothing has been sent yet for this stream, so reset with + * REFUSED_STREAM error to let the client retry the + * request.
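+ * (RFC7540#8.1.4 documents REFUSED_STREAM as meaning that no + * processing happened, so the peer may safely retry automatically)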
+ */ + TRACE_STATE("no headers sent yet, trying a retryable abort", H2_EV_STRM_SHUT, h2c->conn, h2s); + h2s_error(h2s, H2_ERR_REFUSED_STREAM); + } + + if (!(h2s->flags & H2_SF_RST_SENT) && + h2s_send_rst_stream(h2c, h2s) <= 0) + goto add_to_list; + + h2s_close(h2s); + } + + if (!(h2c->wait_event.events & SUB_RETRY_SEND)) + tasklet_wakeup(h2c->wait_event.tasklet); + + TRACE_LEAVE(H2_EV_STRM_SHUT, h2c->conn, h2s); + + done: + h2s->flags &= ~H2_SF_WANT_SHUTW; + return; + + add_to_list: + /* Let the handler know we want to shutw, and add ourselves to the + * most relevant list if not yet done. h2_deferred_shut() will be + * automatically called via the shut_tl tasklet when there's room + * again. + */ + h2s->flags |= H2_SF_WANT_SHUTW; + if (!LIST_INLIST(&h2s->list)) { + if (h2s->flags & H2_SF_BLK_MFCTL) + LIST_APPEND(&h2c->fctl_list, &h2s->list); + else if (h2s->flags & (H2_SF_BLK_MBUSY|H2_SF_BLK_MROOM)) + LIST_APPEND(&h2c->send_list, &h2s->list); + } + TRACE_LEAVE(H2_EV_STRM_SHUT, h2c->conn, h2s); + return; +} + +/* This is the tasklet referenced in h2s->shut_tl, it is used for + * deferred shutdowns when the h2_detach() was done but the mux buffer was full + * and prevented the last frame from being emitted. + */ +struct task *h2_deferred_shut(struct task *t, void *ctx, unsigned int state) +{ + struct h2s *h2s = ctx; + struct h2c *h2c = h2s->h2c; + + TRACE_ENTER(H2_EV_STRM_SHUT, h2c->conn, h2s); + + if (h2s->flags & H2_SF_NOTIFIED) { + /* some data processing remains to be done first */ + goto end; + } + + if (h2s->flags & H2_SF_WANT_SHUTW) + h2_do_shutw(h2s); + + if (h2s->flags & H2_SF_WANT_SHUTR) + h2_do_shutr(h2s); + + if (!(h2s->flags & (H2_SF_WANT_SHUTR|H2_SF_WANT_SHUTW))) { + /* We're done trying to send, remove ourself from the send_list */ + h2_remove_from_list(h2s); + + if (!h2s_sc(h2s)) { + h2s_destroy(h2s); + if (h2c_is_dead(h2c)) { + h2_release(h2c); + t = NULL; + } + } + } + end: + TRACE_LEAVE(H2_EV_STRM_SHUT); + return t; +} + +/* shutr() called by the stream connector (mux_ops.shutr) */ +static void h2_shutr(struct stconn *sc, enum co_shr_mode mode) +{ + struct h2s *h2s = __sc_mux_strm(sc); + + TRACE_ENTER(H2_EV_STRM_SHUT, h2s->h2c->conn, h2s); + if (mode) + h2_do_shutr(h2s); + TRACE_LEAVE(H2_EV_STRM_SHUT, h2s->h2c->conn, h2s); +} + +/* shutw() called by the stream connector (mux_ops.shutw) */ +static void h2_shutw(struct stconn *sc, enum co_shw_mode mode) +{ + struct h2s *h2s = __sc_mux_strm(sc); + + TRACE_ENTER(H2_EV_STRM_SHUT, h2s->h2c->conn, h2s); + h2_do_shutw(h2s); + TRACE_LEAVE(H2_EV_STRM_SHUT, h2s->h2c->conn, h2s); +} + +/* Decode the payload of a HEADERS frame and produce the HTX request or response + * depending on the connection's side. Returns a positive value on success, a + * negative value on failure, or 0 if it couldn't proceed. May report connection + * errors in h2c->errcode if the frame is non-decodable and the connection + * unrecoverable. In absence of connection error when a failure is reported, the + * caller must assume a stream error. + * + * The function may fold CONTINUATION frames into the initial HEADERS frame + * by removing padding and next frame header, then moving the CONTINUATION + * frame's payload and adjusting h2c->dfl to match the new aggregated frame, + * leaving a hole between the main frame and the beginning of the next one. + * The possibly remaining incomplete or next frame at the end may be moved + * if the aggregated frame is not deleted, in order to fill the hole. 
Wrapped
+ * HEADERS frames are unwrapped into a temporary buffer before decoding.
+ *
+ * A buffer at the beginning of processing may look like this :
+ *
+ * ,---.---------.-----.--------------.--------------.------.---.
+ * |///| HEADERS | PAD | CONTINUATION | CONTINUATION | DATA |///|
+ * `---^---------^-----^--------------^--------------^------^---'
+ * | | <-----> | |
+ * area | dpl | wrap
+ * |<--------------> |
+ * | dfl |
+ * |<-------------------------------------------------->|
+ * head data
+ *
+ * Padding is automatically overwritten when folding, contributing to the
+ * hole size after dfl :
+ *
+ * ,---.------------------------.-----.--------------.------.---.
+ * |///| HEADERS : CONTINUATION |/////| CONTINUATION | DATA |///|
+ * `---^------------------------^-----^--------------^------^---'
+ * | | <-----> | |
+ * area | hole | wrap
+ * |<-----------------------> |
+ * | dfl |
+ * |<-------------------------------------------------->|
+ * head data
+ *
+ * Please note that the HEADERS frame is always deprived of its PADLEN byte
+ * however it may start with the 5 stream-dep+weight bytes in case of PRIORITY
+ * bit.
+ *
+ * The <flags> field must point to either the stream's flags or to a copy of it
+ * so that the function can update the following flags :
+ * - H2_SF_DATA_CLEN when content-length is seen
+ * - H2_SF_HEADERS_RCVD once the frame is successfully decoded
+ *
+ * The H2_SF_HEADERS_RCVD flag is also looked at in the <flags> field prior to
+ * decoding, in order to detect if we're dealing with a headers or a trailers
+ * block (the trailers block appears after H2_SF_HEADERS_RCVD was seen).
+ */
+static int h2c_dec_hdrs(struct h2c *h2c, struct buffer *rxbuf, uint32_t *flags, unsigned long long *body_len, char *upgrade_protocol)
+{
+ const uint8_t *hdrs = (uint8_t *)b_head(&h2c->dbuf);
+ struct buffer *tmp = get_trash_chunk();
+ struct http_hdr list[global.tune.max_http_hdr * 2];
+ struct buffer *copy = NULL;
+ unsigned int msgf;
+ struct htx *htx = NULL;
+ int flen; // header frame len
+ int hole = 0;
+ int ret = 0;
+ int outlen;
+ int wrap;
+
+ TRACE_ENTER(H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn);
+
+next_frame:
+ if (b_data(&h2c->dbuf) - hole < h2c->dfl)
+ goto leave; // incomplete input frame
+
+ /* No END_HEADERS means there's one or more CONTINUATION frames. In
+ * this case, we'll try to paste it immediately after the initial
+ * HEADERS frame payload and kill any possible padding. The initial
+ * frame's length will be increased to represent the concatenation
+ * of the two frames. The next frame is read from position <tlen>
+ * and written at position <flen> (minus padding if some is present).
+ */
+ if (unlikely(!(h2c->dff & H2_F_HEADERS_END_HEADERS))) {
+ struct h2_fh hdr;
+ int clen; // CONTINUATION frame's payload length
+
+ TRACE_STATE("EH missing, expecting continuation frame", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_RX_HDR, h2c->conn);
+ if (!h2_peek_frame_hdr(&h2c->dbuf, h2c->dfl + hole, &hdr)) {
+ /* no more data, the buffer may be full, either due to
+ * too large a frame or because of too large a hole that
+ * we're going to compact at the end.
+ */ + goto leave; + } + + if (hdr.ft != H2_FT_CONTINUATION) { + /* RFC7540#6.10: frame of unexpected type */ + TRACE_STATE("not continuation!", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_RX_HDR|H2_EV_RX_CONT|H2_EV_H2C_ERR|H2_EV_PROTO_ERR, h2c->conn); + h2c_error(h2c, H2_ERR_PROTOCOL_ERROR); + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + goto fail; + } + + if (hdr.sid != h2c->dsi) { + /* RFC7540#6.10: frame of different stream */ + TRACE_STATE("different stream ID!", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_RX_HDR|H2_EV_RX_CONT|H2_EV_H2C_ERR|H2_EV_PROTO_ERR, h2c->conn); + h2c_error(h2c, H2_ERR_PROTOCOL_ERROR); + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + goto fail; + } + + if ((unsigned)hdr.len > (unsigned)global.tune.bufsize) { + /* RFC7540#4.2: invalid frame length */ + TRACE_STATE("too large frame!", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_RX_HDR|H2_EV_RX_CONT|H2_EV_H2C_ERR|H2_EV_PROTO_ERR, h2c->conn); + h2c_error(h2c, H2_ERR_FRAME_SIZE_ERROR); + goto fail; + } + + /* detect when we must stop aggregating frames */ + h2c->dff |= hdr.ff & H2_F_HEADERS_END_HEADERS; + + /* Take as much as we can of the CONTINUATION frame's payload */ + clen = b_data(&h2c->dbuf) - (h2c->dfl + hole + 9); + if (clen > hdr.len) + clen = hdr.len; + + /* Move the frame's payload over the padding, hole and frame + * header. At least one of hole or dpl is null (see diagrams + * above). The hole moves after the new aggregated frame. + */ + b_move(&h2c->dbuf, b_peek_ofs(&h2c->dbuf, h2c->dfl + hole + 9), clen, -(h2c->dpl + hole + 9)); + h2c->dfl += hdr.len - h2c->dpl; + hole += h2c->dpl + 9; + h2c->dpl = 0; + TRACE_STATE("waiting for next continuation frame", H2_EV_RX_FRAME|H2_EV_RX_FHDR|H2_EV_RX_CONT|H2_EV_RX_HDR, h2c->conn); + goto next_frame; + } + + flen = h2c->dfl - h2c->dpl; + + /* if the input buffer wraps, take a temporary copy of it (rare) */ + wrap = b_wrap(&h2c->dbuf) - b_head(&h2c->dbuf); + if (wrap < h2c->dfl) { + copy = alloc_trash_chunk(); + if (!copy) { + TRACE_DEVEL("failed to allocate temporary buffer", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_H2C_ERR, h2c->conn); + h2c_error(h2c, H2_ERR_INTERNAL_ERROR); + goto fail; + } + memcpy(copy->area, b_head(&h2c->dbuf), wrap); + memcpy(copy->area + wrap, b_orig(&h2c->dbuf), h2c->dfl - wrap); + hdrs = (uint8_t *) copy->area; + } + + /* Skip StreamDep and weight for now (we don't support PRIORITY) */ + if (h2c->dff & H2_F_HEADERS_PRIORITY) { + if (read_n32(hdrs) == h2c->dsi) { + /* RFC7540#5.3.1 : stream dep may not depend on itself */ + TRACE_STATE("invalid stream dependency!", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_H2C_ERR|H2_EV_PROTO_ERR, h2c->conn); + h2c_error(h2c, H2_ERR_PROTOCOL_ERROR); + HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err); + goto fail; + } + + if (flen < 5) { + TRACE_STATE("frame too short for priority!", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_H2C_ERR|H2_EV_PROTO_ERR, h2c->conn); + h2c_error(h2c, H2_ERR_FRAME_SIZE_ERROR); + goto fail; + } + + hdrs += 5; // stream dep = 4, weight = 1 + flen -= 5; + } + + if (!h2_get_buf(h2c, rxbuf)) { + TRACE_STATE("waiting for h2c rxbuf allocation", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_H2C_BLK, h2c->conn); + h2c->flags |= H2_CF_DEM_SALLOC; + goto leave; + } + + /* we can't retry a failed decompression operation so we must be very + * careful not to take any risks. In practice the output buffer is + * always empty except maybe for trailers, in which case we simply have + * to wait for the upper layer to finish consuming what is available. 
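+ * The reason a failed decompression cannot be retried is that HPACK
+ * decoding is stateful: each decoded block may have updated the
+ * connection's dynamic table (h2c->ddht), so feeding the same bytes a
+ * second time would desynchronize our table from the peer's. This is
+ * also why a decoding error below becomes a connection-level
+ * COMPRESSION_ERROR rather than a stream error (RFC7540#4.3).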
+ */
+ htx = htx_from_buf(rxbuf);
+ if (!htx_is_empty(htx)) {
+ TRACE_STATE("waiting for room in h2c rxbuf", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_H2C_BLK, h2c->conn);
+ h2c->flags |= H2_CF_DEM_SFULL;
+ goto leave;
+ }
+
+ /* past this point we cannot roll back in case of error */
+ outlen = hpack_decode_frame(h2c->ddht, hdrs, flen, list,
+ sizeof(list)/sizeof(list[0]), tmp);
+
+ if (outlen > 0 &&
+ (TRACE_SOURCE)->verbosity >= H2_VERB_ADVANCED &&
+ TRACE_ENABLED(TRACE_LEVEL_USER, H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, 0, 0, 0)) {
+ struct ist n;
+ int i;
+
+ for (i = 0; list[i].n.len; i++) {
+ n = list[i].n;
+
+ if (!isttest(n)) {
+ /* this is in fact a pseudo header whose number is in n.len */
+ n = h2_phdr_to_ist(n.len);
+ }
+
+ h2_trace_header(n, list[i].v, H2_EV_RX_FRAME|H2_EV_RX_HDR,
+ ist(TRC_LOC), __FUNCTION__, h2c, NULL);
+ }
+ }
+
+ if (outlen < 0) {
+ TRACE_STATE("failed to decompress HPACK", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_H2C_ERR|H2_EV_PROTO_ERR, h2c->conn);
+ h2c_error(h2c, H2_ERR_COMPRESSION_ERROR);
+ goto fail;
+ }
+
+ /* The HPACK decompressor was updated, let's update the input buffer and
+ * the parser's state to commit these changes and allow us to later
+ * fail solely on the stream if needed.
+ */
+ b_del(&h2c->dbuf, h2c->dfl + hole);
+ h2c->dfl = hole = 0;
+ h2c->st0 = H2_CS_FRAME_H;
+
+ /* OK now we have our header list in <list> */
+ msgf = (h2c->dff & H2_F_HEADERS_END_STREAM) ? 0 : H2_MSGF_BODY;
+ msgf |= (*flags & H2_SF_BODY_TUNNEL) ? H2_MSGF_BODY_TUNNEL: 0;
+ /* If an Extended CONNECT has been sent on this stream, set message flag
+ * to convert 200 response to 101 htx response */
+ msgf |= (*flags & H2_SF_EXT_CONNECT_SENT) ? H2_MSGF_EXT_CONNECT: 0;
+
+ if (*flags & H2_SF_HEADERS_RCVD)
+ goto trailers;
+
+ /* This is the first HEADERS frame so it's a headers block */
+ if (h2c->flags & H2_CF_IS_BACK)
+ outlen = h2_make_htx_response(list, htx, &msgf, body_len, upgrade_protocol);
+ else
+ outlen = h2_make_htx_request(list, htx, &msgf, body_len,
+ !!(((const struct session *)h2c->conn->owner)->fe->options2 & PR_O2_REQBUG_OK));
+
+ if (outlen < 0 || htx_free_space(htx) < global.tune.maxrewrite) {
+ /* too large headers? this is a stream error only */
+ TRACE_STATE("message headers too large or invalid", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_H2S_ERR|H2_EV_PROTO_ERR, h2c->conn);
+ htx->flags |= HTX_FL_PARSING_ERROR;
+ goto fail;
+ }
+
+ if (msgf & H2_MSGF_BODY) {
+ /* a payload is present */
+ if (msgf & H2_MSGF_BODY_CL) {
+ *flags |= H2_SF_DATA_CLEN;
+ htx->extra = *body_len;
+ }
+ }
+ if (msgf & H2_MSGF_BODYLESS_RSP)
+ *flags |= H2_SF_BODYLESS_RESP;
+
+ if (msgf & H2_MSGF_BODY_TUNNEL)
+ *flags |= H2_SF_BODY_TUNNEL;
+ else {
+ /* Abort the tunnel attempt, if any */
+ if (*flags & H2_SF_BODY_TUNNEL)
+ *flags |= H2_SF_TUNNEL_ABRT;
+ *flags &= ~H2_SF_BODY_TUNNEL;
+ }
+
+ done:
+ /* indicate that a HEADERS frame was received for this stream, except
+ * for 1xx responses. For 1xx responses, another HEADERS frame is
+ * expected.
+ */
+ if (!(msgf & H2_MSGF_RSP_1XX))
+ *flags |= H2_SF_HEADERS_RCVD;
+
+ if (h2c->dff & H2_F_HEADERS_END_STREAM) {
+ if (msgf & H2_MSGF_RSP_1XX) {
+ /* RFC9113#8.1 : HEADERS frame with the ES flag set that carries an informational status code is malformed */
+ TRACE_STATE("invalid interim response with ES flag!", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_H2C_ERR|H2_EV_PROTO_ERR, h2c->conn);
+ goto fail;
+ }
+ /* no more data are expected for this message */
+ htx->flags |= HTX_FL_EOM;
+ *flags |= H2_SF_ES_RCVD;
+ }
+
+ if (msgf & H2_MSGF_EXT_CONNECT)
+ *flags |= H2_SF_EXT_CONNECT_RCVD;
+
+ /* success */
+ ret = 1;
+
+ leave:
+ /* If there is a hole left and it's not at the end, we are forced to
+ * move the remaining data over it.
+ */
+ if (hole) {
+ if (b_data(&h2c->dbuf) > h2c->dfl + hole)
+ b_move(&h2c->dbuf, b_peek_ofs(&h2c->dbuf, h2c->dfl + hole),
+ b_data(&h2c->dbuf) - (h2c->dfl + hole), -hole);
+ b_sub(&h2c->dbuf, hole);
+ }
+
+ if (b_full(&h2c->dbuf) && h2c->dfl && (!htx || htx_is_empty(htx))) {
+ /* too large frames */
+ h2c_error(h2c, H2_ERR_INTERNAL_ERROR);
+ ret = -1;
+ }
+
+ if (htx)
+ htx_to_buf(htx, rxbuf);
+ free_trash_chunk(copy);
+ TRACE_LEAVE(H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn);
+ return ret;
+
+ fail:
+ ret = -1;
+ goto leave;
+
+ trailers:
+ /* This is the last HEADERS frame hence a trailer */
+ if (!(h2c->dff & H2_F_HEADERS_END_STREAM)) {
+ /* It's a trailer but it's missing the ES flag */
+ TRACE_STATE("missing ES on trailers frame", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_H2C_ERR|H2_EV_PROTO_ERR, h2c->conn);
+ h2c_error(h2c, H2_ERR_PROTOCOL_ERROR);
+ HA_ATOMIC_INC(&h2c->px_counters->conn_proto_err);
+ goto fail;
+ }
+
+ /* Trailers terminate a DATA sequence */
+ if (h2_make_htx_trailers(list, htx) <= 0) {
+ TRACE_STATE("failed to append HTX trailers into rxbuf", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_H2S_ERR, h2c->conn);
+ goto fail;
+ }
+ *flags |= H2_SF_ES_RCVD;
+ goto done;
+}
+
+/* Transfer the payload of a DATA frame to the HTTP/1 side. The HTTP/2 frame
+ * parser state is automatically updated. Returns > 0 if it could completely
+ * transfer the current frame, 0 if it couldn't complete, in which case
+ * SE_FL_RCV_MORE must be checked to know if some data remain pending (an empty
+ * DATA frame can return 0 as a valid result). Stream errors are reported in
+ * h2s->errcode and connection errors in h2c->errcode. The caller must already
+ * have checked the frame header and ensured that the frame was complete or the
+ * buffer full. It changes the frame state to FRAME_A once done.
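+ *
+ * As a reminder of the accounting used below: h2c->dfl counts the
+ * payload bytes remaining in the current frame and h2c->dpl its
+ * padding length, so at most dfl-dpl bytes of actual data are left to
+ * forward; the padding itself is never forwarded but is still credited
+ * to the connection and stream windows via rcvd_c/rcvd_s.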
+ */
+static int h2_frt_transfer_data(struct h2s *h2s)
+{
+ struct h2c *h2c = h2s->h2c;
+ int block;
+ unsigned int flen = 0;
+ struct htx *htx = NULL;
+ struct buffer *scbuf;
+ unsigned int sent;
+
+ TRACE_ENTER(H2_EV_RX_FRAME|H2_EV_RX_DATA, h2c->conn, h2s);
+
+ h2c->flags &= ~H2_CF_DEM_SFULL;
+
+ scbuf = h2_get_buf(h2c, &h2s->rxbuf);
+ if (!scbuf) {
+ h2c->flags |= H2_CF_DEM_SALLOC;
+ TRACE_STATE("waiting for an h2s rxbuf", H2_EV_RX_FRAME|H2_EV_RX_DATA|H2_EV_H2S_BLK, h2c->conn, h2s);
+ goto fail;
+ }
+ htx = htx_from_buf(scbuf);
+
+try_again:
+ flen = h2c->dfl - h2c->dpl;
+ if (!flen)
+ goto end_transfer;
+
+ if (flen > b_data(&h2c->dbuf)) {
+ flen = b_data(&h2c->dbuf);
+ if (!flen)
+ goto fail;
+ }
+
+ block = htx_free_data_space(htx);
+ if (!block) {
+ h2c->flags |= H2_CF_DEM_SFULL;
+ TRACE_STATE("h2s rxbuf is full", H2_EV_RX_FRAME|H2_EV_RX_DATA|H2_EV_H2S_BLK, h2c->conn, h2s);
+ goto fail;
+ }
+ if (flen > block)
+ flen = block;
+
+ /* here, flen is the max we can copy into the output buffer */
+ block = b_contig_data(&h2c->dbuf, 0);
+ if (flen > block)
+ flen = block;
+
+ sent = htx_add_data(htx, ist2(b_head(&h2c->dbuf), flen));
+ TRACE_DATA("move some data to h2s rxbuf", H2_EV_RX_FRAME|H2_EV_RX_DATA, h2c->conn, h2s, 0, (void *)(long)sent);
+
+ b_del(&h2c->dbuf, sent);
+ h2c->dfl -= sent;
+ h2c->rcvd_c += sent;
+ h2c->rcvd_s += sent; // warning, this can also affect the closed streams!
+
+ if (h2s->flags & H2_SF_DATA_CLEN) {
+ h2s->body_len -= sent;
+ htx->extra = h2s->body_len;
+ }
+
+ if (sent < flen) {
+ h2c->flags |= H2_CF_DEM_SFULL;
+ TRACE_STATE("h2s rxbuf is full", H2_EV_RX_FRAME|H2_EV_RX_DATA|H2_EV_H2S_BLK, h2c->conn, h2s);
+ goto fail;
+ }
+
+ goto try_again;
+
+ end_transfer:
+ /* here we're done with the frame, all the payload (except padding) was
+ * transferred.
+ */
+
+ if (!(h2s->flags & H2_SF_BODY_TUNNEL) && (h2c->dff & H2_F_DATA_END_STREAM)) {
+ /* no more data are expected for this message. This adds the EOM
+ * flag but only on the response path or if no tunnel attempt
+ * was aborted. Otherwise (request path + aborted tunnel), the
+ * EOM was already reported.
+ */
+ if ((h2c->flags & H2_CF_IS_BACK) || !(h2s->flags & H2_SF_TUNNEL_ABRT)) {
+ /* htx may be empty if receiving an empty DATA frame. */
+ if (!htx_set_eom(htx))
+ goto fail;
+ }
+ }
+
+ h2c->rcvd_c += h2c->dpl;
+ h2c->rcvd_s += h2c->dpl;
+ h2c->dpl = 0;
+ h2c->st0 = H2_CS_FRAME_A; // send the corresponding window update
+ htx_to_buf(htx, scbuf);
+ TRACE_LEAVE(H2_EV_RX_FRAME|H2_EV_RX_DATA, h2c->conn, h2s);
+ return 1;
+ fail:
+ if (htx)
+ htx_to_buf(htx, scbuf);
+ TRACE_LEAVE(H2_EV_RX_FRAME|H2_EV_RX_DATA, h2c->conn, h2s);
+ return 0;
+}
+
+/* Try to send a HEADERS frame matching HTX response present in HTX message
+ * <htx> for the H2 stream <h2s>. Returns the number of bytes sent. The caller
+ * must check the stream's status to detect any error which might have happened
+ * subsequently to a successful send. The htx blocks are automatically removed
+ * from the message. The htx message is assumed to be valid since produced from
+ * the internal code, hence it contains a start line, an optional series of
+ * header blocks and an end of header, otherwise an invalid frame could be
+ * emitted and the resulting htx message could be left in an inconsistent state.
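+ *
+ * For reference, the 9-byte frame header emitted below follows the
+ * RFC7540#4.1 layout (sizes in bytes) :
+ *
+ *   | length (3) | type (1) | flags (1) | R + stream id (4) |
+ *
+ * so the constant "\x00\x00\x00\x01\x04" stands for a zero length
+ * (fixed later by h2_set_frame_size()), type HEADERS (0x1) and flags
+ * END_HEADERS (0x4).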
+ */
+static size_t h2s_snd_fhdrs(struct h2s *h2s, struct htx *htx)
+{
+ struct http_hdr list[global.tune.max_http_hdr];
+ struct h2c *h2c = h2s->h2c;
+ struct htx_blk *blk;
+ struct buffer outbuf;
+ struct buffer *mbuf;
+ struct htx_sl *sl;
+ enum htx_blk_type type;
+ int es_now = 0;
+ int ret = 0;
+ int hdr;
+
+ TRACE_ENTER(H2_EV_TX_FRAME|H2_EV_TX_HDR, h2c->conn, h2s);
+
+ /* get the start line (we do have one) and the rest of the headers,
+ * that we dump starting at header 0 */
+ sl = NULL;
+ hdr = 0;
+ for (blk = htx_get_head_blk(htx); blk; blk = htx_get_next_blk(htx, blk)) {
+ type = htx_get_blk_type(blk);
+
+ if (type == HTX_BLK_UNUSED)
+ continue;
+
+ if (type == HTX_BLK_EOH)
+ break;
+
+ if (type == HTX_BLK_HDR) {
+ BUG_ON(!sl); /* The start-line must be defined before any headers */
+ if (unlikely(hdr >= sizeof(list)/sizeof(list[0]) - 1)) {
+ TRACE_ERROR("too many headers", H2_EV_TX_FRAME|H2_EV_TX_HDR|H2_EV_H2S_ERR, h2c->conn, h2s);
+ goto fail;
+ }
+
+ list[hdr].n = htx_get_blk_name(htx, blk);
+ list[hdr].v = htx_get_blk_value(htx, blk);
+ hdr++;
+ }
+ else if (type == HTX_BLK_RES_SL) {
+ BUG_ON(sl); /* Only one start-line expected */
+ sl = htx_get_blk_ptr(htx, blk);
+ h2s->status = sl->info.res.status;
+ if ((sl->flags & HTX_SL_F_BODYLESS_RESP) || h2s->status == 204 || h2s->status == 304)
+ h2s->flags |= H2_SF_BODYLESS_RESP;
+ if (h2s->status < 100 || h2s->status > 999) {
+ TRACE_ERROR("will not encode an invalid status code", H2_EV_TX_FRAME|H2_EV_TX_HDR|H2_EV_H2S_ERR, h2c->conn, h2s);
+ goto fail;
+ }
+ else if (h2s->status == 101) {
+ if (unlikely(h2s->flags & H2_SF_EXT_CONNECT_RCVD)) {
+ /* If an Extended CONNECT has been received, we need to convert 101 to 200 */
+ h2s->status = 200;
+ h2s->flags &= ~H2_SF_EXT_CONNECT_RCVD;
+ }
+ else {
+ /* Otherwise, 101 responses are not supported in H2, so return an error (RFC7540#8.1.1) */
+ TRACE_ERROR("will not encode an invalid status code", H2_EV_TX_FRAME|H2_EV_TX_HDR|H2_EV_H2S_ERR, h2c->conn, h2s);
+ goto fail;
+ }
+ }
+ else if ((h2s->flags & H2_SF_BODY_TUNNEL) && h2s->status >= 300) {
+ /* Abort the tunnel attempt */
+ h2s->flags &= ~H2_SF_BODY_TUNNEL;
+ h2s->flags |= H2_SF_TUNNEL_ABRT;
+ }
+ }
+ else {
+ TRACE_ERROR("will not encode unexpected htx block", H2_EV_TX_FRAME|H2_EV_TX_HDR|H2_EV_H2S_ERR, h2c->conn, h2s);
+ goto fail;
+ }
+ }
+
+ /* The start-line must be defined */
+ BUG_ON(!sl);
+
+ /* marker for end of headers */
+ list[hdr].n = ist("");
+
+ mbuf = br_tail(h2c->mbuf);
+ retry:
+ if (!h2_get_buf(h2c, mbuf)) {
+ h2c->flags |= H2_CF_MUX_MALLOC;
+ h2s->flags |= H2_SF_BLK_MROOM;
+ TRACE_STATE("waiting for room in output buffer", H2_EV_TX_FRAME|H2_EV_TX_HDR|H2_EV_H2S_BLK, h2c->conn, h2s);
+ return 0;
+ }
+
+ chunk_reset(&outbuf);
+
+ while (1) {
+ outbuf = b_make(b_tail(mbuf), b_contig_space(mbuf), 0, 0);
+ if (outbuf.size >= 9 || !b_space_wraps(mbuf))
+ break;
+ realign_again:
+ b_slow_realign(mbuf, trash.area, b_data(mbuf));
+ }
+
+ if (outbuf.size < 9)
+ goto full;
+
+ /* len: 0x000000 (fill later), type: 1(HEADERS), flags: ENDH=4 */
+ memcpy(outbuf.area, "\x00\x00\x00\x01\x04", 5);
+ write_n32(outbuf.area + 5, h2s->id); // 4 bytes
+ outbuf.data = 9;
+
+ if ((h2c->flags & (H2_CF_SHTS_UPDATED|H2_CF_DTSU_EMITTED)) == H2_CF_SHTS_UPDATED) {
+ /* SETTINGS_HEADER_TABLE_SIZE changed, we must send an HPACK
+ * dynamic table size update so that some clients are not
+ * confused.
In practice we only need to send the DTSU when the + * advertised size is lower than the current one, and since we + * don't use it and don't care about the default 4096 bytes, + * we only ack it with a zero size thus we at most have to deal + * with this once. See RFC7541#4.2 and #6.3 for the spec, and + * below for the whole context and interoperability risks: + * https://lists.w3.org/Archives/Public/ietf-http-wg/2021OctDec/0235.html + */ + if (b_room(&outbuf) < 1) + goto full; + outbuf.area[outbuf.data++] = 0x20; // HPACK DTSU 0 bytes + + /* let's not update the flags now but only once the buffer is + * really committed. + */ + } + + /* encode status, which necessarily is the first one */ + if (!hpack_encode_int_status(&outbuf, h2s->status)) { + if (b_space_wraps(mbuf)) + goto realign_again; + goto full; + } + + if ((TRACE_SOURCE)->verbosity >= H2_VERB_ADVANCED) { + char sts[4]; + + h2_trace_header(ist(":status"), ist(ultoa_r(h2s->status, sts, sizeof(sts))), + H2_EV_TX_FRAME|H2_EV_TX_HDR, ist(TRC_LOC), __FUNCTION__, + h2c, h2s); + } + + /* encode all headers, stop at empty name */ + for (hdr = 0; hdr < sizeof(list)/sizeof(list[0]); hdr++) { + /* these ones do not exist in H2 and must be dropped. */ + if (isteq(list[hdr].n, ist("connection")) || + isteq(list[hdr].n, ist("proxy-connection")) || + isteq(list[hdr].n, ist("keep-alive")) || + isteq(list[hdr].n, ist("upgrade")) || + isteq(list[hdr].n, ist("transfer-encoding"))) + continue; + + /* Skip all pseudo-headers */ + if (*(list[hdr].n.ptr) == ':') + continue; + + if (isteq(list[hdr].n, ist(""))) + break; // end + + if (!h2_encode_header(&outbuf, list[hdr].n, list[hdr].v, H2_EV_TX_FRAME|H2_EV_TX_HDR, + ist(TRC_LOC), __FUNCTION__, h2c, h2s)) { + /* output full */ + if (b_space_wraps(mbuf)) + goto realign_again; + goto full; + } + } + + /* update the frame's size */ + h2_set_frame_size(outbuf.area, outbuf.data - 9); + + if (outbuf.data > h2c->mfs + 9) { + if (!h2_fragment_headers(&outbuf, h2c->mfs)) { + /* output full */ + if (b_space_wraps(mbuf)) + goto realign_again; + goto full; + } + } + + TRACE_USER("sent H2 response ", H2_EV_TX_FRAME|H2_EV_TX_HDR, h2c->conn, h2s, htx); + + /* remove all header blocks including the EOH and compute the + * corresponding size. + */ + ret = 0; + blk = htx_get_head_blk(htx); + while (blk) { + type = htx_get_blk_type(blk); + ret += htx_get_blksz(blk); + blk = htx_remove_blk(htx, blk); + /* The removed block is the EOH */ + if (type == HTX_BLK_EOH) + break; + } + + if (!h2s_sc(h2s) || se_fl_test(h2s->sd, SE_FL_SHW)) { + /* Response already closed: add END_STREAM */ + es_now = 1; + } + else if ((htx->flags & HTX_FL_EOM) && htx_is_empty(htx) && h2s->status >= 200) { + /* EOM+empty: we may need to add END_STREAM except for 1xx + * responses and tunneled response. + */ + if (!(h2s->flags & H2_SF_BODY_TUNNEL) || h2s->status >= 300) + es_now = 1; + } + + if (es_now) + outbuf.area[4] |= H2_F_HEADERS_END_STREAM; + + /* commit the H2 response */ + b_add(mbuf, outbuf.data); + h2c->flags |= H2_CF_MBUF_HAS_DATA; + + /* indicates the HEADERS frame was sent, except for 1xx responses. For + * 1xx responses, another HEADERS frame is expected. 
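+ * (Interim responses never carry END_STREAM: the final response will
+ * follow as another HEADERS frame on the same stream, which is why
+ * H2_SF_HEADERS_SENT must not be set yet, see RFC9113#8.1.)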
+ */
+ if (h2s->status >= 200)
+ h2s->flags |= H2_SF_HEADERS_SENT;
+
+ if (h2c->flags & H2_CF_SHTS_UPDATED) {
+ /* was sent above */
+ h2c->flags |= H2_CF_DTSU_EMITTED;
+ h2c->flags &= ~H2_CF_SHTS_UPDATED;
+ }
+
+ if (es_now) {
+ h2s->flags |= H2_SF_ES_SENT;
+ TRACE_PROTO("setting ES on HEADERS frame", H2_EV_TX_FRAME|H2_EV_TX_HDR, h2c->conn, h2s, htx);
+ if (h2s->st == H2_SS_OPEN)
+ h2s->st = H2_SS_HLOC;
+ else
+ h2s_close(h2s);
+ }
+
+ /* OK we could properly deliver the response */
+ end:
+ TRACE_LEAVE(H2_EV_TX_FRAME|H2_EV_TX_HDR, h2c->conn, h2s);
+ return ret;
+ full:
+ if ((mbuf = br_tail_add(h2c->mbuf)) != NULL)
+ goto retry;
+ h2c->flags |= H2_CF_MUX_MFULL;
+ h2s->flags |= H2_SF_BLK_MROOM;
+ ret = 0;
+ TRACE_STATE("mux buffer full", H2_EV_TX_FRAME|H2_EV_TX_HDR|H2_EV_H2S_BLK, h2c->conn, h2s);
+ goto end;
+ fail:
+ /* unparsable HTX messages, too large ones to be produced in the local
+ * list etc go here (unrecoverable errors).
+ */
+ h2s_error(h2s, H2_ERR_INTERNAL_ERROR);
+ ret = 0;
+ goto end;
+}
+
+/* Try to send a HEADERS frame matching HTX request present in HTX message
+ * <htx> for the H2 stream <h2s>. Returns the number of bytes sent. The caller
+ * must check the stream's status to detect any error which might have happened
+ * subsequently to a successful send. The htx blocks are automatically removed
+ * from the message. The htx message is assumed to be valid since produced from
+ * the internal code, hence it contains a start line, an optional series of
+ * header blocks and an end of header, otherwise an invalid frame could be
+ * emitted and the resulting htx message could be left in an inconsistent state.
+ */
+static size_t h2s_snd_bhdrs(struct h2s *h2s, struct htx *htx)
+{
+ struct http_hdr list[global.tune.max_http_hdr];
+ struct h2c *h2c = h2s->h2c;
+ struct htx_blk *blk;
+ struct buffer outbuf;
+ struct buffer *mbuf;
+ struct htx_sl *sl;
+ struct ist meth, uri, auth, host = IST_NULL;
+ enum htx_blk_type type;
+ int es_now = 0;
+ int ret = 0;
+ int hdr;
+ int extended_connect = 0;
+
+ TRACE_ENTER(H2_EV_TX_FRAME|H2_EV_TX_HDR, h2c->conn, h2s);
+
+ /* get the start line (we do have one) and the rest of the headers,
+ * that we dump starting at header 0 */
+ sl = NULL;
+ hdr = 0;
+ for (blk = htx_get_head_blk(htx); blk; blk = htx_get_next_blk(htx, blk)) {
+ type = htx_get_blk_type(blk);
+
+ if (type == HTX_BLK_UNUSED)
+ continue;
+
+ if (type == HTX_BLK_EOH)
+ break;
+
+ if (type == HTX_BLK_HDR) {
+ BUG_ON(!sl); /* The start-line must be defined before any headers */
+ if (unlikely(hdr >= sizeof(list)/sizeof(list[0]) - 1)) {
+ TRACE_ERROR("too many headers", H2_EV_TX_FRAME|H2_EV_TX_HDR|H2_EV_H2S_ERR, h2c->conn, h2s);
+ goto fail;
+ }
+
+ list[hdr].n = htx_get_blk_name(htx, blk);
+ list[hdr].v = htx_get_blk_value(htx, blk);
+
+ /* Skip header if same name is used to add the server name */
+ if ((h2c->flags & H2_CF_IS_BACK) && isttest(h2c->proxy->server_id_hdr_name) &&
+ isteq(list[hdr].n, h2c->proxy->server_id_hdr_name))
+ continue;
+
+ /* Convert connection: upgrade to Extended connect from rfc 8441 */
+ if ((sl->flags & HTX_SL_F_CONN_UPG) && isteqi(list[hdr].n, ist("connection"))) {
+ /* rfc 7230 #6.1 Connection = list of tokens */
+ struct ist connection_ist = list[hdr].v;
+ do {
+ if (isteqi(iststop(connection_ist, ','),
+ ist("upgrade"))) {
+ if (!(h2c->flags & H2_CF_RCVD_RFC8441)) {
+ TRACE_STATE("reject upgrade because of no RFC8441 support", H2_EV_TX_FRAME|H2_EV_TX_HDR, h2c->conn, h2s);
+ goto fail;
+ }
+
+ TRACE_STATE("convert upgrade to extended connect method", H2_EV_TX_FRAME|H2_EV_TX_HDR, h2c->conn, h2s);
+ h2s->flags |= (H2_SF_BODY_TUNNEL|H2_SF_EXT_CONNECT_SENT);
+ sl->info.req.meth = HTTP_METH_CONNECT;
+ meth = ist("CONNECT");
+
+ extended_connect = 1;
+ break;
+ }
+
+ connection_ist = istadv(istfind(connection_ist, ','), 1);
+ } while (istlen(connection_ist));
+ }
+
+ if ((sl->flags & HTX_SL_F_CONN_UPG) && isteq(list[hdr].n, ist("upgrade"))) {
+ /* rfc 7230 #6.7 Upgrade = list of protocols
+ * rfc 8441 #4 Extended connect = :protocol is single-valued
+ *
+ * only first HTTP/1 protocol is preserved
+ */
+ const struct ist protocol = iststop(list[hdr].v, ',');
+ /* upgrade_protocol field is 16 bytes long in h2s */
+ istpad(h2s->upgrade_protocol, isttrim(protocol, 15));
+ }
+
+ if (isteq(list[hdr].n, ist("host")))
+ host = list[hdr].v;
+
+ hdr++;
+ }
+ else if (type == HTX_BLK_REQ_SL) {
+ BUG_ON(sl); /* Only one start-line expected */
+ sl = htx_get_blk_ptr(htx, blk);
+ meth = htx_sl_req_meth(sl);
+ uri = htx_sl_req_uri(sl);
+ if ((sl->flags & HTX_SL_F_BODYLESS_RESP) || sl->info.req.meth == HTTP_METH_HEAD)
+ h2s->flags |= H2_SF_BODYLESS_RESP;
+ if (unlikely(uri.len == 0)) {
+ TRACE_ERROR("no URI in HTX request", H2_EV_TX_FRAME|H2_EV_TX_HDR|H2_EV_H2S_ERR, h2c->conn, h2s);
+ goto fail;
+ }
+ }
+ else {
+ TRACE_ERROR("will not encode unexpected htx block", H2_EV_TX_FRAME|H2_EV_TX_HDR|H2_EV_H2S_ERR, h2c->conn, h2s);
+ goto fail;
+ }
+ }
+
+ /* The start-line must be defined */
+ BUG_ON(!sl);
+
+ /* Now add the server name to a header (if requested) */
+ if ((h2c->flags & H2_CF_IS_BACK) && isttest(h2c->proxy->server_id_hdr_name)) {
+ struct server *srv = objt_server(h2c->conn->target);
+
+ if (srv) {
+ list[hdr].n = h2c->proxy->server_id_hdr_name;
+ list[hdr].v = ist(srv->id);
+ hdr++;
+ }
+ }
+
+ /* marker for end of headers */
+ list[hdr].n = ist("");
+
+ mbuf = br_tail(h2c->mbuf);
+ retry:
+ if (!h2_get_buf(h2c, mbuf)) {
+ h2c->flags |= H2_CF_MUX_MALLOC;
+ h2s->flags |= H2_SF_BLK_MROOM;
+ TRACE_STATE("waiting for room in output buffer", H2_EV_TX_FRAME|H2_EV_TX_HDR|H2_EV_H2S_BLK, h2c->conn, h2s);
+ return 0;
+ }
+
+ chunk_reset(&outbuf);
+
+ while (1) {
+ outbuf = b_make(b_tail(mbuf), b_contig_space(mbuf), 0, 0);
+ if (outbuf.size >= 9 || !b_space_wraps(mbuf))
+ break;
+ realign_again:
+ b_slow_realign(mbuf, trash.area, b_data(mbuf));
+ }
+
+ if (outbuf.size < 9)
+ goto full;
+
+ /* len: 0x000000 (fill later), type: 1(HEADERS), flags: ENDH=4 */
+ memcpy(outbuf.area, "\x00\x00\x00\x01\x04", 5);
+ write_n32(outbuf.area + 5, h2s->id); // 4 bytes
+ outbuf.data = 9;
+
+ /* encode the method, which necessarily is the first one */
+ if (!hpack_encode_method(&outbuf, sl->info.req.meth, meth)) {
+ if (b_space_wraps(mbuf))
+ goto realign_again;
+ goto full;
+ }
+
+ h2_trace_header(ist(":method"), meth, H2_EV_TX_FRAME|H2_EV_TX_HDR, ist(TRC_LOC), __FUNCTION__, h2c, h2s);
+
+ auth = ist(NULL);
+
+ /* RFC7540 #8.3: the CONNECT method must have :
+ * - :authority set to the URI part (host:port)
+ * - :method set to CONNECT
+ * - :scheme and :path omitted
+ *
+ * Note that this is not applicable in case of the Extended CONNECT
+ * protocol from rfc 8441.
+ */
+ if (unlikely(sl->info.req.meth == HTTP_METH_CONNECT) && !extended_connect) {
+ auth = uri;
+
+ if (!h2_encode_header(&outbuf, ist(":authority"), auth, H2_EV_TX_FRAME|H2_EV_TX_HDR,
+ ist(TRC_LOC), __FUNCTION__, h2c, h2s)) {
+ /* output full */
+ if (b_space_wraps(mbuf))
+ goto realign_again;
+ goto full;
+ }
+
+ h2s->flags |= H2_SF_BODY_TUNNEL;
+ } else {
+ /* other methods need a :scheme.
If an authority is known from + * the request line, it must be sent, otherwise only host is + * sent. Host is never sent as the authority. + * + * This code is also applicable for Extended CONNECT protocol + * from rfc 8441. + */ + struct ist scheme = { }; + + if (uri.ptr[0] != '/' && uri.ptr[0] != '*') { + /* the URI seems to start with a scheme */ + int len = 1; + + while (len < uri.len && uri.ptr[len] != ':') + len++; + + if (len + 2 < uri.len && uri.ptr[len + 1] == '/' && uri.ptr[len + 2] == '/') { + /* make the uri start at the authority now */ + scheme = ist2(uri.ptr, len); + uri = istadv(uri, len + 3); + + /* find the auth part of the URI */ + auth = ist2(uri.ptr, 0); + while (auth.len < uri.len && auth.ptr[auth.len] != '/') + auth.len++; + + uri = istadv(uri, auth.len); + } + } + + /* For Extended CONNECT, the :authority must be present. + * Use host value for it. + */ + if (unlikely(extended_connect) && isttest(host)) + auth = host; + + if (!scheme.len) { + /* no explicit scheme, we're using an origin-form URI, + * probably from an H1 request transcoded to H2 via an + * external layer, then received as H2 without authority. + * So we have to look up the scheme from the HTX flags. + * In such a case only http and https are possible, and + * https is the default (sent by browsers). + */ + if ((sl->flags & (HTX_SL_F_HAS_SCHM|HTX_SL_F_SCHM_HTTP)) == (HTX_SL_F_HAS_SCHM|HTX_SL_F_SCHM_HTTP)) + scheme = ist("http"); + else + scheme = ist("https"); + } + + if (!hpack_encode_scheme(&outbuf, scheme)) { + /* output full */ + if (b_space_wraps(mbuf)) + goto realign_again; + goto full; + } + + if (auth.len && + !h2_encode_header(&outbuf, ist(":authority"), auth, H2_EV_TX_FRAME|H2_EV_TX_HDR, + ist(TRC_LOC), __FUNCTION__, h2c, h2s)) { + /* output full */ + if (b_space_wraps(mbuf)) + goto realign_again; + goto full; + } + + /* encode the path. RFC7540#8.1.2.3: if path is empty it must + * be sent as '/' or '*'. + */ + if (unlikely(!uri.len)) { + if (sl->info.req.meth == HTTP_METH_OPTIONS) + uri = ist("*"); + else + uri = ist("/"); + } + + if (!hpack_encode_path(&outbuf, uri)) { + /* output full */ + if (b_space_wraps(mbuf)) + goto realign_again; + goto full; + } + + h2_trace_header(ist(":path"), uri, H2_EV_TX_FRAME|H2_EV_TX_HDR, ist(TRC_LOC), __FUNCTION__, h2c, h2s); + + /* encode the pseudo-header protocol from rfc8441 if using + * Extended CONNECT method. + */ + if (unlikely(extended_connect)) { + const struct ist protocol = ist(h2s->upgrade_protocol); + if (isttest(protocol)) { + if (!h2_encode_header(&outbuf, ist(":protocol"), protocol, H2_EV_TX_FRAME|H2_EV_TX_HDR, + ist(TRC_LOC), __FUNCTION__, h2c, h2s)) { + /* output full */ + if (b_space_wraps(mbuf)) + goto realign_again; + goto full; + } + } + } + } + + /* encode all headers, stop at empty name. Host is only sent if we + * do not provide an authority. + */ + for (hdr = 0; hdr < sizeof(list)/sizeof(list[0]); hdr++) { + struct ist n = list[hdr].n; + struct ist v = list[hdr].v; + + /* these ones do not exist in H2 and must be dropped. */ + if (isteq(n, ist("connection")) || + (auth.len && isteq(n, ist("host"))) || + isteq(n, ist("proxy-connection")) || + isteq(n, ist("keep-alive")) || + isteq(n, ist("upgrade")) || + isteq(n, ist("transfer-encoding"))) + continue; + + if (isteq(n, ist("te"))) { + /* "te" may only be sent with "trailers" if this value + * is present, otherwise it must be deleted. 
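+ * For example "te: gzip, trailers" is reduced below to
+ * "te: trailers" while a plain "te: gzip" is dropped
+ * entirely, "trailers" being the only TE value permitted
+ * in H2 (RFC9113#8.2.2).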
+ */
+ v = istist(v, ist("trailers"));
+ if (!isttest(v) || (v.len > 8 && v.ptr[8] != ','))
+ continue;
+ v = ist("trailers");
+ }
+
+ /* Skip all pseudo-headers */
+ if (*(n.ptr) == ':')
+ continue;
+
+ if (isteq(n, ist("")))
+ break; // end
+
+ if (!h2_encode_header(&outbuf, n, v, H2_EV_TX_FRAME|H2_EV_TX_HDR, ist(TRC_LOC), __FUNCTION__, h2c, h2s)) {
+ /* output full */
+ if (b_space_wraps(mbuf))
+ goto realign_again;
+ goto full;
+ }
+ }
+
+ /* update the frame's size */
+ h2_set_frame_size(outbuf.area, outbuf.data - 9);
+
+ if (outbuf.data > h2c->mfs + 9) {
+ if (!h2_fragment_headers(&outbuf, h2c->mfs)) {
+ /* output full */
+ if (b_space_wraps(mbuf))
+ goto realign_again;
+ goto full;
+ }
+ }
+
+ TRACE_USER("sent H2 request ", H2_EV_TX_FRAME|H2_EV_TX_HDR, h2c->conn, h2s, htx);
+
+ /* remove all header blocks including the EOH and compute the
+ * corresponding size.
+ */
+ ret = 0;
+ blk = htx_get_head_blk(htx);
+ while (blk) {
+ type = htx_get_blk_type(blk);
+ ret += htx_get_blksz(blk);
+ blk = htx_remove_blk(htx, blk);
+ /* The removed block is the EOH */
+ if (type == HTX_BLK_EOH)
+ break;
+ }
+
+ if (!h2s_sc(h2s) || se_fl_test(h2s->sd, SE_FL_SHW)) {
+ /* Request already closed: add END_STREAM */
+ es_now = 1;
+ }
+ if ((htx->flags & HTX_FL_EOM) && htx_is_empty(htx)) {
+ /* EOM+empty: we may need to add END_STREAM (except for CONNECT
+ * request)
+ */
+ if (!(h2s->flags & H2_SF_BODY_TUNNEL))
+ es_now = 1;
+ }
+
+ if (es_now)
+ outbuf.area[4] |= H2_F_HEADERS_END_STREAM;
+
+ /* commit the H2 request */
+ b_add(mbuf, outbuf.data);
+ h2c->flags |= H2_CF_MBUF_HAS_DATA;
+ h2s->flags |= H2_SF_HEADERS_SENT;
+ h2s->st = H2_SS_OPEN;
+
+ if (es_now) {
+ TRACE_PROTO("setting ES on HEADERS frame", H2_EV_TX_FRAME|H2_EV_TX_HDR, h2c->conn, h2s, htx);
+ // trim any possibly pending data (eg: inconsistent content-length)
+ h2s->flags |= H2_SF_ES_SENT;
+ h2s->st = H2_SS_HLOC;
+ }
+
+ end:
+ return ret;
+ full:
+ if ((mbuf = br_tail_add(h2c->mbuf)) != NULL)
+ goto retry;
+ h2c->flags |= H2_CF_MUX_MFULL;
+ h2s->flags |= H2_SF_BLK_MROOM;
+ ret = 0;
+ TRACE_STATE("mux buffer full", H2_EV_TX_FRAME|H2_EV_TX_HDR|H2_EV_H2S_BLK, h2c->conn, h2s);
+ goto end;
+ fail:
+ /* unparsable HTX messages, too large ones to be produced in the local
+ * list etc go here (unrecoverable errors).
+ */
+ h2s_error(h2s, H2_ERR_INTERNAL_ERROR);
+ ret = 0;
+ goto end;
+}
+
+/* Try to send a DATA frame matching the HTTP response present in the HTX
+ * structure in <buf>, for stream <h2s>. The caller must check the stream's
+ * status to detect any error which might have happened subsequently to a
+ * successful send. Returns the number of data bytes consumed, or zero if
+ * nothing done.
+ */
+static size_t h2s_make_data(struct h2s *h2s, struct buffer *buf, size_t count)
+{
+ struct h2c *h2c = h2s->h2c;
+ struct htx *htx;
+ struct buffer outbuf;
+ struct buffer *mbuf;
+ size_t total = 0;
+ int es_now = 0;
+ int bsize; /* htx block size */
+ int fsize; /* h2 frame size */
+ struct htx_blk *blk;
+ enum htx_blk_type type;
+ int trunc_out; /* non-zero if truncated on out buf */
+
+ TRACE_ENTER(H2_EV_TX_FRAME|H2_EV_TX_DATA, h2c->conn, h2s);
+
+ htx = htx_from_buf(buf);
+
+ /* We only come here with HTX_BLK_DATA blocks */
+
+ new_frame:
+ if (!count || htx_is_empty(htx))
+ goto end;
+
+ if ((h2c->flags & H2_CF_IS_BACK) &&
+ (h2s->flags & (H2_SF_HEADERS_RCVD|H2_SF_BODY_TUNNEL)) == H2_SF_BODY_TUNNEL) {
+ /* The response HEADERS frame has not been received yet. Thus the
+ * tunnel is not fully established yet.
In this situation, we block
+ * data sending.
+ */
+ h2s->flags |= H2_SF_BLK_MBUSY;
+ TRACE_STATE("Request DATA frame blocked waiting for tunnel establishment", H2_EV_TX_FRAME|H2_EV_TX_DATA, h2c->conn, h2s);
+ goto end;
+ }
+ else if ((h2c->flags & H2_CF_IS_BACK) && (h2s->flags & H2_SF_TUNNEL_ABRT)) {
+ /* a tunnel attempt was aborted but there is pending raw data to xfer to the server.
+ * Thus the stream is closed with the CANCEL error. The error will be reported to
+ * the upper layer as a server abort. But at this stage there is nothing more we can
+ * do. We just wait for the end of the response to be sure to not truncate it.
+ */
+ if (!(h2s->flags & H2_SF_ES_RCVD)) {
+ TRACE_STATE("Request DATA frame blocked waiting end of aborted tunnel", H2_EV_TX_FRAME|H2_EV_TX_DATA, h2c->conn, h2s);
+ h2s->flags |= H2_SF_BLK_MBUSY;
+ }
+ else {
+ TRACE_ERROR("Request DATA frame for aborted tunnel", H2_EV_RX_FRAME|H2_EV_RX_DATA, h2c->conn, h2s);
+ h2s_error(h2s, H2_ERR_CANCEL);
+ }
+ goto end;
+ }
+
+ blk = htx_get_head_blk(htx);
+ type = htx_get_blk_type(blk);
+ bsize = htx_get_blksz(blk);
+ fsize = bsize;
+ trunc_out = 0;
+ if (type != HTX_BLK_DATA)
+ goto end;
+
+ mbuf = br_tail(h2c->mbuf);
+ retry:
+ if (br_count(h2c->mbuf) > h2c->nb_streams) {
+ /* more buffers than streams allocated, pointless
+ * to continue, we'd use more RAM for no reason.
+ */
+ h2s->flags |= H2_SF_BLK_MROOM;
+ TRACE_STATE("waiting for room in output buffer", H2_EV_TX_FRAME|H2_EV_TX_DATA|H2_EV_H2S_BLK, h2c->conn, h2s);
+ goto end;
+ }
+
+ if (!h2_get_buf(h2c, mbuf)) {
+ h2c->flags |= H2_CF_MUX_MALLOC;
+ h2s->flags |= H2_SF_BLK_MROOM;
+ TRACE_STATE("waiting for room in output buffer", H2_EV_TX_FRAME|H2_EV_TX_DATA|H2_EV_H2S_BLK, h2c->conn, h2s);
+ goto end;
+ }
+
+ /* Perform some optimizations to reduce the number of buffer copies.
+ * First, if the mux's buffer is empty and the htx area contains
+ * exactly one data block of the same size as the requested count, and
+ * this count fits within the frame size, the stream's window size, and
+ * the connection's window size, then it's possible to simply swap the
+ * caller's buffer with the mux's output buffer and adjust offsets and
+ * length to match the entire DATA HTX block in the middle. In this
+ * case we perform a true zero-copy operation from end-to-end. This is
+ * the situation that happens all the time with large files. Second, if
+ * this is not possible, but the mux's output buffer is empty, we still
+ * have an opportunity to avoid the copy to the intermediary buffer, by
+ * making the intermediary buffer's area point to the output buffer's
+ * area. In this case we want to skip the HTX header to make sure that
+ * copies remain aligned and that this operation remains possible all
+ * the time. This goes for headers, data blocks and any data extracted
+ * from the HTX blocks.
+ */
+ if (unlikely(fsize == count &&
+ htx_nbblks(htx) == 1 && type == HTX_BLK_DATA &&
+ fsize <= h2s_mws(h2s) && fsize <= h2c->mws && fsize <= h2c->mfs)) {
+ void *old_area = mbuf->area;
+
+ if (b_data(mbuf)) {
+ /* Too bad there are data left there. We're willing to memcpy/memmove
+ * up to 1/4 of the buffer, which means that it's OK to copy a large
+ * frame into a buffer containing few data if it needs to be realigned,
+ * and that it's also OK to copy few data without realigning. Otherwise
+ * we'll pretend the mbuf is full and wait for it to become empty.
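+ * As an illustration, with the default 16kB buffers this means
+ * appending a frame after up to 4kB of pending data, or copying
+ * a frame of up to 4kB itself, rather than waiting for the
+ * buffer to drain completely.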
+ */ + if (fsize + 9 <= b_room(mbuf) && + (b_data(mbuf) <= b_size(mbuf) / 4 || + (fsize <= b_size(mbuf) / 4 && fsize + 9 <= b_contig_space(mbuf)))) { + TRACE_STATE("small data present in output buffer, appending", H2_EV_TX_FRAME|H2_EV_TX_DATA, h2c->conn, h2s); + goto copy; + } + + if ((mbuf = br_tail_add(h2c->mbuf)) != NULL) + goto retry; + + h2c->flags |= H2_CF_MUX_MFULL; + h2s->flags |= H2_SF_BLK_MROOM; + TRACE_STATE("too large data present in output buffer, waiting for emptiness", H2_EV_TX_FRAME|H2_EV_TX_DATA, h2c->conn, h2s); + goto end; + } + + if (htx->flags & HTX_FL_EOM) { + /* EOM+empty: we may need to add END_STREAM (except for tunneled + * message) + */ + if (!(h2s->flags & H2_SF_BODY_TUNNEL)) + es_now = 1; + } + /* map an H2 frame to the HTX block so that we can put the + * frame header there. + */ + *mbuf = b_make(buf->area, buf->size, sizeof(struct htx) + blk->addr - 9, fsize + 9); + outbuf.area = b_head(mbuf); + + /* prepend an H2 DATA frame header just before the DATA block */ + memcpy(outbuf.area, "\x00\x00\x00\x00\x00", 5); + write_n32(outbuf.area + 5, h2s->id); // 4 bytes + if (es_now) + outbuf.area[4] |= H2_F_DATA_END_STREAM; + h2_set_frame_size(outbuf.area, fsize); + + /* update windows */ + h2s->sws -= fsize; + h2c->mws -= fsize; + + /* and exchange with our old area */ + buf->area = old_area; + buf->data = buf->head = 0; + total += fsize; + fsize = 0; + h2c->flags |= H2_CF_MBUF_HAS_DATA; + + TRACE_PROTO("sent H2 DATA frame (zero-copy)", H2_EV_TX_FRAME|H2_EV_TX_DATA, h2c->conn, h2s); + goto out; + } + + copy: + /* for DATA and EOM we'll have to emit a frame, even if empty */ + + while (1) { + outbuf = b_make(b_tail(mbuf), b_contig_space(mbuf), 0, 0); + if (outbuf.size >= 9 || !b_space_wraps(mbuf)) + break; + realign_again: + b_slow_realign(mbuf, trash.area, b_data(mbuf)); + } + + if (outbuf.size < 9) { + if ((mbuf = br_tail_add(h2c->mbuf)) != NULL) + goto retry; + h2c->flags |= H2_CF_MUX_MFULL; + h2s->flags |= H2_SF_BLK_MROOM; + TRACE_STATE("output buffer full", H2_EV_TX_FRAME|H2_EV_TX_DATA, h2c->conn, h2s); + goto end; + } + + /* len: 0x000000 (fill later), type: 0(DATA), flags: none=0 */ + memcpy(outbuf.area, "\x00\x00\x00\x00\x00", 5); + write_n32(outbuf.area + 5, h2s->id); // 4 bytes + outbuf.data = 9; + + /* we have in <fsize> the exact number of bytes we need to copy from + * the HTX buffer. We need to check this against the connection's and + * the stream's send windows, and to ensure that this fits in the max + * frame size and in the buffer's available space minus 9 bytes (for + * the frame header). The connection's flow control is applied last so + * that we can use a separate list of streams which are immediately + * unblocked on window opening. Note: we don't implement padding. + */ + + if (!fsize) + goto send_empty; + + if (h2s_mws(h2s) <= 0) { + h2s->flags |= H2_SF_BLK_SFCTL; + if (LIST_INLIST(&h2s->list)) + h2_remove_from_list(h2s); + LIST_APPEND(&h2c->blocked_list, &h2s->list); + TRACE_STATE("stream window <=0, flow-controlled", H2_EV_TX_FRAME|H2_EV_TX_DATA|H2_EV_H2S_FCTL, h2c->conn, h2s); + goto end; + } + + if (fsize > count) + fsize = count; + + if (fsize > h2s_mws(h2s)) + fsize = h2s_mws(h2s); // >0 + + if (h2c->mfs && fsize > h2c->mfs) + fsize = h2c->mfs; // >0 + + if (fsize + 9 > outbuf.size) { + /* It doesn't fit at once. If it at least fits once split and + * the amount of data to move is low, let's defragment the + * buffer now. 
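+ * (MAX_DATA_REALIGN bounds how much existing data we accept to
+ * memmove for this; past that, the frame is simply truncated to the
+ * contiguous room and the rest is sent in a following frame.)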
+ */
+ if (b_space_wraps(mbuf) &&
+ (fsize + 9 <= b_room(mbuf)) &&
+ b_data(mbuf) <= MAX_DATA_REALIGN)
+ goto realign_again;
+ fsize = outbuf.size - 9;
+ trunc_out = 1;
+
+ if (fsize <= 0) {
+ /* no need to send an empty frame here */
+ if ((mbuf = br_tail_add(h2c->mbuf)) != NULL)
+ goto retry;
+ h2c->flags |= H2_CF_MUX_MFULL;
+ h2s->flags |= H2_SF_BLK_MROOM;
+ TRACE_STATE("output buffer full", H2_EV_TX_FRAME|H2_EV_TX_DATA, h2c->conn, h2s);
+ goto end;
+ }
+ }
+
+ if (h2c->mws <= 0) {
+ h2s->flags |= H2_SF_BLK_MFCTL;
+ TRACE_STATE("connection window <=0, stream flow-controlled", H2_EV_TX_FRAME|H2_EV_TX_DATA|H2_EV_H2C_FCTL, h2c->conn, h2s);
+ goto end;
+ }
+
+ if (fsize > h2c->mws)
+ fsize = h2c->mws;
+
+ /* now let's copy this into the output buffer */
+ memcpy(outbuf.area + 9, htx_get_blk_ptr(htx, blk), fsize);
+ h2s->sws -= fsize;
+ h2c->mws -= fsize;
+ count -= fsize;
+
+ send_empty:
+ /* update the frame's size */
+ h2_set_frame_size(outbuf.area, fsize);
+
+ /* consume incoming HTX block */
+ total += fsize;
+ if (fsize == bsize) {
+ htx_remove_blk(htx, blk);
+ if ((htx->flags & HTX_FL_EOM) && htx_is_empty(htx)) {
+ /* EOM+empty: we may need to add END_STREAM (except for tunneled
+ * message)
+ */
+ if (!(h2s->flags & H2_SF_BODY_TUNNEL))
+ es_now = 1;
+ }
+ }
+ else {
+ /* we've truncated this block */
+ htx_cut_data_blk(htx, blk, fsize);
+ }
+
+ if (es_now)
+ outbuf.area[4] |= H2_F_DATA_END_STREAM;
+
+ /* commit the H2 response */
+ b_add(mbuf, fsize + 9);
+ h2c->flags |= H2_CF_MBUF_HAS_DATA;
+
+ out:
+ if (es_now) {
+ if (h2s->st == H2_SS_OPEN)
+ h2s->st = H2_SS_HLOC;
+ else
+ h2s_close(h2s);
+
+ h2s->flags |= H2_SF_ES_SENT;
+ TRACE_PROTO("ES flag set on outgoing frame", H2_EV_TX_FRAME|H2_EV_TX_DATA|H2_EV_TX_EOI, h2c->conn, h2s);
+ }
+ else if (fsize) {
+ if (fsize == bsize) {
+ TRACE_DEVEL("more data may be available, trying to send another frame", H2_EV_TX_FRAME|H2_EV_TX_DATA, h2c->conn, h2s);
+ goto new_frame;
+ }
+ else if (trunc_out) {
+ /* we've truncated this block */
+ goto new_frame;
+ }
+ }
+
+ end:
+ TRACE_LEAVE(H2_EV_TX_FRAME|H2_EV_TX_DATA, h2c->conn, h2s);
+ return total;
+}
+
+/* Skip the message payload (DATA blocks) and emit an empty DATA frame with the
+ * ES flag set for stream <h2s>. This function is called for responses known to
+ * have no payload. Only DATA blocks are skipped. This means the trailers are
+ * still emitted. The caller must check the stream's status to detect any error
+ * which might have happened subsequently to a successful send. Returns the
+ * number of data bytes consumed, or zero if nothing done.
+ */
+static size_t h2s_skip_data(struct h2s *h2s, struct buffer *buf, size_t count)
+{
+ struct h2c *h2c = h2s->h2c;
+ struct htx *htx;
+ int bsize; /* htx block size */
+ int fsize; /* h2 frame size */
+ struct htx_blk *blk;
+ enum htx_blk_type type;
+ size_t total = 0;
+
+ TRACE_ENTER(H2_EV_TX_FRAME|H2_EV_TX_DATA, h2c->conn, h2s);
+
+ htx = htx_from_buf(buf);
+
+ next_data:
+ if (!count || htx_is_empty(htx))
+ goto end;
+ blk = htx_get_head_blk(htx);
+ type = htx_get_blk_type(blk);
+ bsize = htx_get_blksz(blk);
+ fsize = bsize;
+ if (type != HTX_BLK_DATA)
+ goto end;
+
+ if (fsize > count)
+ fsize = count;
+
+ if (fsize != bsize)
+ goto skip_data;
+
+ if (!(htx->flags & HTX_FL_EOM) || !htx_is_unique_blk(htx, blk))
+ goto skip_data;
+
+ /* Here, it is the last block and it is also the end of the message.
So + * we can emit an empty DATA frame with the ES flag set + */ + if (h2_send_empty_data_es(h2s) <= 0) + goto end; + + if (h2s->st == H2_SS_OPEN) + h2s->st = H2_SS_HLOC; + else + h2s_close(h2s); + + skip_data: + /* consume incoming HTX block */ + total += fsize; + if (fsize == bsize) { + TRACE_DEVEL("more data may be available, trying to skip another frame", H2_EV_TX_FRAME|H2_EV_TX_DATA, h2c->conn, h2s); + htx_remove_blk(htx, blk); + goto next_data; + } + else { + /* we've truncated this block */ + htx_cut_data_blk(htx, blk, fsize); + } + + end: + TRACE_LEAVE(H2_EV_TX_FRAME|H2_EV_TX_DATA, h2c->conn, h2s); + return total; +} + +/* Try to send a HEADERS frame matching HTX_BLK_TLR series of blocks present in + * HTX message <htx> for the H2 stream <h2s>. Returns the number of bytes + * processed. The caller must check the stream's status to detect any error + * which might have happened subsequently to a successful send. The htx blocks + * are automatically removed from the message. The htx message is assumed to be + * valid since produced from the internal code. Processing stops when meeting + * the EOT, which *is* removed. All trailers are processed at once and sent as a + * single frame. The ES flag is always set. + */ +static size_t h2s_make_trailers(struct h2s *h2s, struct htx *htx) +{ + struct http_hdr list[global.tune.max_http_hdr]; + struct h2c *h2c = h2s->h2c; + struct htx_blk *blk; + struct buffer outbuf; + struct buffer *mbuf; + enum htx_blk_type type; + int ret = 0; + int hdr; + int idx; + + TRACE_ENTER(H2_EV_TX_FRAME|H2_EV_TX_HDR, h2c->conn, h2s); + + /* get trailers. */ + hdr = 0; + for (blk = htx_get_head_blk(htx); blk; blk = htx_get_next_blk(htx, blk)) { + type = htx_get_blk_type(blk); + + if (type == HTX_BLK_UNUSED) + continue; + + if (type == HTX_BLK_EOT) + break; + if (type == HTX_BLK_TLR) { + if (unlikely(hdr >= sizeof(list)/sizeof(list[0]) - 1)) { + TRACE_ERROR("too many headers", H2_EV_TX_FRAME|H2_EV_TX_HDR|H2_EV_H2S_ERR, h2c->conn, h2s); + goto fail; + } + + list[hdr].n = htx_get_blk_name(htx, blk); + list[hdr].v = htx_get_blk_value(htx, blk); + hdr++; + } + else { + TRACE_ERROR("will not encode unexpected htx block", H2_EV_TX_FRAME|H2_EV_TX_HDR|H2_EV_H2S_ERR, h2c->conn, h2s); + goto fail; + } + } + + /* marker for end of trailers */ + list[hdr].n = ist(""); + + mbuf = br_tail(h2c->mbuf); + retry: + if (!h2_get_buf(h2c, mbuf)) { + h2c->flags |= H2_CF_MUX_MALLOC; + h2s->flags |= H2_SF_BLK_MROOM; + TRACE_STATE("waiting for room in output buffer", H2_EV_TX_FRAME|H2_EV_TX_HDR|H2_EV_H2S_BLK, h2c->conn, h2s); + goto end; + } + + chunk_reset(&outbuf); + + while (1) { + outbuf = b_make(b_tail(mbuf), b_contig_space(mbuf), 0, 0); + if (outbuf.size >= 9 || !b_space_wraps(mbuf)) + break; + realign_again: + b_slow_realign(mbuf, trash.area, b_data(mbuf)); + } + + if (outbuf.size < 9) + goto full; + + /* len: 0x000000 (fill later), type: 1(HEADERS), flags: ENDH=4,ES=1 */ + memcpy(outbuf.area, "\x00\x00\x00\x01\x05", 5); + write_n32(outbuf.area + 5, h2s->id); // 4 bytes + outbuf.data = 9; + + /* encode all headers */ + for (idx = 0; idx < hdr; idx++) { + /* these ones do not exist in H2 or must not appear in + * trailers and must be dropped. 
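+ * (This follows RFC7230#4.1.2, which forbids fields needed for
+ * message framing, routing or connection management from
+ * appearing in trailers.)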
+ */
+ if (isteq(list[idx].n, ist("host")) ||
+ isteq(list[idx].n, ist("content-length")) ||
+ isteq(list[idx].n, ist("connection")) ||
+ isteq(list[idx].n, ist("proxy-connection")) ||
+ isteq(list[idx].n, ist("keep-alive")) ||
+ isteq(list[idx].n, ist("upgrade")) ||
+ isteq(list[idx].n, ist("te")) ||
+ isteq(list[idx].n, ist("transfer-encoding")))
+ continue;
+
+ /* Skip all pseudo-headers */
+ if (*(list[idx].n.ptr) == ':')
+ continue;
+
+ if (!h2_encode_header(&outbuf, list[idx].n, list[idx].v, H2_EV_TX_FRAME|H2_EV_TX_HDR,
+ ist(TRC_LOC), __FUNCTION__, h2c, h2s)) {
+ /* output full */
+ if (b_space_wraps(mbuf))
+ goto realign_again;
+ goto full;
+ }
+ }
+
+ if (outbuf.data == 9) {
+ /* here we have a problem, we have nothing to emit (either we
+ * received an empty trailers block or we removed its
+ * contents above). Because of this we can't send a HEADERS
+ * frame, so we have to cheat and instead send an empty DATA
+ * frame conveying the ES flag.
+ */
+ outbuf.area[3] = H2_FT_DATA;
+ outbuf.area[4] = H2_F_DATA_END_STREAM;
+ }
+
+ /* update the frame's size */
+ h2_set_frame_size(outbuf.area, outbuf.data - 9);
+
+ if (outbuf.data > h2c->mfs + 9) {
+ if (!h2_fragment_headers(&outbuf, h2c->mfs)) {
+ /* output full */
+ if (b_space_wraps(mbuf))
+ goto realign_again;
+ goto full;
+ }
+ }
+
+ /* commit the H2 response */
+ TRACE_PROTO("sent H2 trailers HEADERS frame", H2_EV_TX_FRAME|H2_EV_TX_HDR|H2_EV_TX_EOI, h2c->conn, h2s);
+ b_add(mbuf, outbuf.data);
+ h2c->flags |= H2_CF_MBUF_HAS_DATA;
+ h2s->flags |= H2_SF_ES_SENT;
+
+ if (h2s->st == H2_SS_OPEN)
+ h2s->st = H2_SS_HLOC;
+ else
+ h2s_close(h2s);
+
+ /* OK we could properly deliver the response */
+ done:
+ /* remove all header blocks till the end and compute the corresponding size. */
+ ret = 0;
+ blk = htx_get_head_blk(htx);
+ while (blk) {
+ type = htx_get_blk_type(blk);
+ ret += htx_get_blksz(blk);
+ blk = htx_remove_blk(htx, blk);
+ /* The removed block is the EOT */
+ if (type == HTX_BLK_EOT)
+ break;
+ }
+
+ end:
+ TRACE_LEAVE(H2_EV_TX_FRAME|H2_EV_TX_HDR, h2c->conn, h2s);
+ return ret;
+ full:
+ if ((mbuf = br_tail_add(h2c->mbuf)) != NULL)
+ goto retry;
+ h2c->flags |= H2_CF_MUX_MFULL;
+ h2s->flags |= H2_SF_BLK_MROOM;
+ ret = 0;
+ TRACE_STATE("mux buffer full", H2_EV_TX_FRAME|H2_EV_TX_HDR|H2_EV_H2S_BLK, h2c->conn, h2s);
+ goto end;
+ fail:
+ /* unparsable HTX messages, too large ones to be produced in the local
+ * list etc go here (unrecoverable errors).
+ */
+ h2s_error(h2s, H2_ERR_INTERNAL_ERROR);
+ ret = 0;
+ goto end;
+}
+
+/* Called from the upper layer, to subscribe <es> to events <event_type>. The
+ * event subscriber <es> is not allowed to change from a previous call as long
+ * as at least one event is still subscribed. The <event_type> must only be a
+ * combination of SUB_RETRY_RECV and SUB_RETRY_SEND. It always returns 0.
+ */ +static int h2_subscribe(struct stconn *sc, int event_type, struct wait_event *es) +{ + struct h2s *h2s = __sc_mux_strm(sc); + struct h2c *h2c = h2s->h2c; + + TRACE_ENTER(H2_EV_STRM_SEND|H2_EV_STRM_RECV, h2c->conn, h2s); + + BUG_ON(event_type & ~(SUB_RETRY_SEND|SUB_RETRY_RECV)); + BUG_ON(h2s->subs && h2s->subs != es); + + es->events |= event_type; + h2s->subs = es; + + if (event_type & SUB_RETRY_RECV) + TRACE_DEVEL("subscribe(recv)", H2_EV_STRM_RECV, h2c->conn, h2s); + + if (event_type & SUB_RETRY_SEND) { + TRACE_DEVEL("subscribe(send)", H2_EV_STRM_SEND, h2c->conn, h2s); + if (!(h2s->flags & H2_SF_BLK_SFCTL) && + !LIST_INLIST(&h2s->list)) { + if (h2s->flags & H2_SF_BLK_MFCTL) { + TRACE_DEVEL("Adding to fctl list", H2_EV_STRM_SEND, h2c->conn, h2s); + LIST_APPEND(&h2c->fctl_list, &h2s->list); + } + else { + TRACE_DEVEL("Adding to send list", H2_EV_STRM_SEND, h2c->conn, h2s); + LIST_APPEND(&h2c->send_list, &h2s->list); + } + } + } + TRACE_LEAVE(H2_EV_STRM_SEND|H2_EV_STRM_RECV, h2c->conn, h2s); + return 0; +} + +/* Called from the upper layer, to unsubscribe <es> from events <event_type>. + * The <es> pointer is not allowed to differ from the one passed to the + * subscribe() call. It always returns zero. + */ +static int h2_unsubscribe(struct stconn *sc, int event_type, struct wait_event *es) +{ + struct h2s *h2s = __sc_mux_strm(sc); + + TRACE_ENTER(H2_EV_STRM_SEND|H2_EV_STRM_RECV, h2s->h2c->conn, h2s); + + BUG_ON(event_type & ~(SUB_RETRY_SEND|SUB_RETRY_RECV)); + BUG_ON(h2s->subs && h2s->subs != es); + + es->events &= ~event_type; + if (!es->events) + h2s->subs = NULL; + + if (event_type & SUB_RETRY_RECV) + TRACE_DEVEL("unsubscribe(recv)", H2_EV_STRM_RECV, h2s->h2c->conn, h2s); + + if (event_type & SUB_RETRY_SEND) { + TRACE_DEVEL("unsubscribe(send)", H2_EV_STRM_SEND, h2s->h2c->conn, h2s); + h2s->flags &= ~H2_SF_NOTIFIED; + if (!(h2s->flags & (H2_SF_WANT_SHUTR | H2_SF_WANT_SHUTW))) + h2_remove_from_list(h2s); + } + + TRACE_LEAVE(H2_EV_STRM_SEND|H2_EV_STRM_RECV, h2s->h2c->conn, h2s); + return 0; +} + + +/* Called from the upper layer, to receive data + * + * The caller is responsible for defragmenting <buf> if necessary. But <flags> + * must be tested to know the calling context. If CO_RFL_BUF_FLUSH is set, it + * means the caller wants to flush input data (from the mux buffer and the + * channel buffer) to be able to use kernel splicing or any kind of mux-to-mux + * xfer. If CO_RFL_KEEP_RECV is set, the mux must always subscribe for read + * events before giving back. CO_RFL_BUF_WET is set if <buf> is congested with + * data scheduled for leaving soon. CO_RFL_BUF_NOT_STUCK is set to instruct the + * mux it may optimize the data copy to <buf> if necessary. Otherwise, it should + * copy as much data as possible. + */ +static size_t h2_rcv_buf(struct stconn *sc, struct buffer *buf, size_t count, int flags) +{ + struct h2s *h2s = __sc_mux_strm(sc); + struct h2c *h2c = h2s->h2c; + struct htx *h2s_htx = NULL; + struct htx *buf_htx = NULL; + size_t ret = 0; + + TRACE_ENTER(H2_EV_STRM_RECV, h2c->conn, h2s); + + /* transfer possibly pending data to the upper layer */ + h2s_htx = htx_from_buf(&h2s->rxbuf); + if (htx_is_empty(h2s_htx) && !(h2s_htx->flags & HTX_FL_PARSING_ERROR)) { + /* Here htx_to_buf() will set buffer data to 0 because + * the HTX is empty. + */ + htx_to_buf(h2s_htx, &h2s->rxbuf); + goto end; + } + ret = h2s_htx->data; + buf_htx = htx_from_buf(buf); + + /* <buf> is empty and the message is small enough, swap the + * buffers. 
*/ + if (htx_is_empty(buf_htx) && htx_used_space(h2s_htx) <= count) { + htx_to_buf(buf_htx, buf); + htx_to_buf(h2s_htx, &h2s->rxbuf); + b_xfer(buf, &h2s->rxbuf, b_data(&h2s->rxbuf)); + goto end; + } + + htx_xfer_blks(buf_htx, h2s_htx, count, HTX_BLK_UNUSED); + + if (h2s_htx->flags & HTX_FL_PARSING_ERROR) { + buf_htx->flags |= HTX_FL_PARSING_ERROR; + if (htx_is_empty(buf_htx)) + se_fl_set(h2s->sd, SE_FL_EOI); + } + else if (htx_is_empty(h2s_htx)) { + buf_htx->flags |= (h2s_htx->flags & HTX_FL_EOM); + } + + buf_htx->extra = (h2s_htx->extra ? (h2s_htx->data + h2s_htx->extra) : 0); + htx_to_buf(buf_htx, buf); + htx_to_buf(h2s_htx, &h2s->rxbuf); + ret -= h2s_htx->data; + + end: + if (b_data(&h2s->rxbuf)) + se_fl_set(h2s->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + else { + if (!(h2c->flags & H2_CF_IS_BACK) && (h2s->flags & (H2_SF_BODY_TUNNEL|H2_SF_ES_RCVD))) { + /* If request ES is reported to the upper layer, it means the + * H2S now expects data from the opposite side. + */ + se_expect_data(h2s->sd); + } + + se_fl_clr(h2s->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + h2s_propagate_term_flags(h2c, h2s); + if (b_size(&h2s->rxbuf)) { + b_free(&h2s->rxbuf); + offer_buffers(NULL, 1); + } + } + + if (ret && h2c->dsi == h2s->id) { + /* demux is blocking on this stream's buffer */ + h2c->flags &= ~H2_CF_DEM_SFULL; + h2c_restart_reading(h2c, 1); + } + + TRACE_LEAVE(H2_EV_STRM_RECV, h2c->conn, h2s); + return ret; +} + + +/* Called from the upper layer, to send data from buffer <buf> for no more than + * <count> bytes. Returns the number of bytes effectively sent. Some status + * flags may be updated on the stream connector. + */ +static size_t h2_snd_buf(struct stconn *sc, struct buffer *buf, size_t count, int flags) +{ + struct h2s *h2s = __sc_mux_strm(sc); + size_t total = 0; + size_t ret; + struct htx *htx; + struct htx_blk *blk; + enum htx_blk_type btype; + uint32_t bsize; + int32_t idx; + + TRACE_ENTER(H2_EV_H2S_SEND|H2_EV_STRM_SEND, h2s->h2c->conn, h2s); + + /* If we were not just woken because we wanted to send but couldn't, + * and there's somebody else that is waiting to send, do nothing, + * we will subscribe later and be put at the end of the list + */ + if (!(h2s->flags & H2_SF_NOTIFIED) && + (!LIST_ISEMPTY(&h2s->h2c->send_list) || !LIST_ISEMPTY(&h2s->h2c->fctl_list))) { + if (LIST_INLIST(&h2s->list)) + TRACE_DEVEL("stream already waiting, leaving", H2_EV_H2S_SEND|H2_EV_H2S_BLK, h2s->h2c->conn, h2s); + else { + TRACE_DEVEL("other streams already waiting, going to the queue and leaving", H2_EV_H2S_SEND|H2_EV_H2S_BLK, h2s->h2c->conn, h2s); + h2s->h2c->flags |= H2_CF_WAIT_INLIST; + } + return 0; + } + h2s->flags &= ~H2_SF_NOTIFIED; + + if (h2s->h2c->st0 < H2_CS_FRAME_H) { + TRACE_DEVEL("connection not ready, leaving", H2_EV_H2S_SEND|H2_EV_H2S_BLK, h2s->h2c->conn, h2s); + return 0; + } + + if (h2s->h2c->st0 >= H2_CS_ERROR) { + se_fl_set(h2s->sd, SE_FL_ERROR); + TRACE_DEVEL("connection is in error, leaving in error", H2_EV_H2S_SEND|H2_EV_H2S_BLK|H2_EV_H2S_ERR|H2_EV_STRM_ERR, h2s->h2c->conn, h2s); + return 0; + } + + htx = htx_from_buf(buf); + + if (!(h2s->flags & H2_SF_OUTGOING_DATA) && count) + h2s->flags |= H2_SF_OUTGOING_DATA; + + if (htx->extra && htx->extra != HTX_UNKOWN_PAYLOAD_LENGTH) + h2s->flags |= H2_SF_MORE_HTX_DATA; + else + h2s->flags &= ~H2_SF_MORE_HTX_DATA; + + if (h2s->id == 0) { + int32_t id = h2c_get_next_sid(h2s->h2c); + + if (id < 0) { + se_fl_set(h2s->sd, SE_FL_ERROR); + TRACE_DEVEL("couldn't get a stream ID, leaving in error", 
H2_EV_H2S_SEND|H2_EV_H2S_BLK|H2_EV_H2S_ERR|H2_EV_STRM_ERR, h2s->h2c->conn, h2s); + return 0; + } + + eb32_delete(&h2s->by_id); + h2s->by_id.key = h2s->id = id; + h2s->h2c->max_id = id; + h2s->h2c->nb_reserved--; + eb32_insert(&h2s->h2c->streams_by_id, &h2s->by_id); + } + + while (h2s->st < H2_SS_HLOC && !(h2s->flags & H2_SF_BLK_ANY) && + count && !htx_is_empty(htx)) { + idx = htx_get_head(htx); + blk = htx_get_blk(htx, idx); + btype = htx_get_blk_type(blk); + bsize = htx_get_blksz(blk); + + switch (btype) { + case HTX_BLK_REQ_SL: + /* start-line before headers */ + ret = h2s_snd_bhdrs(h2s, htx); + if (ret > 0) { + total += ret; + count -= ret; + if (ret < bsize) + goto done; + } + break; + + case HTX_BLK_RES_SL: + /* start-line before headers */ + ret = h2s_snd_fhdrs(h2s, htx); + if (ret > 0) { + total += ret; + count -= ret; + if (ret < bsize) + goto done; + } + break; + + case HTX_BLK_DATA: + /* all these cause the emission of a DATA frame (possibly empty) */ + if (!(h2s->h2c->flags & H2_CF_IS_BACK) && + (h2s->flags & (H2_SF_BODY_TUNNEL|H2_SF_BODYLESS_RESP)) == H2_SF_BODYLESS_RESP) + ret = h2s_skip_data(h2s, buf, count); + else + ret = h2s_make_data(h2s, buf, count); + if (ret > 0) { + htx = htx_from_buf(buf); + total += ret; + count -= ret; + if (ret < bsize) + goto done; + } + break; + + case HTX_BLK_TLR: + case HTX_BLK_EOT: + /* This is the first trailers block, all the subsequent ones */ + ret = h2s_make_trailers(h2s, htx); + if (ret > 0) { + total += ret; + count -= ret; + if (ret < bsize) + goto done; + } + break; + + default: + htx_remove_blk(htx, blk); + total += bsize; + count -= bsize; + break; + } + } + + done: + if (h2s->st >= H2_SS_HLOC) { + /* trim any possibly pending data after we close (extra CR-LF, + * unprocessed trailers, abnormal extra data, ...) + */ + total += count; + count = 0; + } + + /* RST are sent similarly to frame acks */ + if (h2s->st == H2_SS_ERROR || h2s->flags & H2_SF_RST_RCVD) { + TRACE_DEVEL("reporting RST/error to the app-layer stream", H2_EV_H2S_SEND|H2_EV_H2S_ERR|H2_EV_STRM_ERR, h2s->h2c->conn, h2s); + se_fl_set_error(h2s->sd); + if (h2s_send_rst_stream(h2s->h2c, h2s) > 0) + h2s_close(h2s); + } + + htx_to_buf(htx, buf); + + if (total > 0) { + if (!(h2s->h2c->wait_event.events & SUB_RETRY_SEND)) { + TRACE_DEVEL("data queued, waking up h2c sender", H2_EV_H2S_SEND|H2_EV_H2C_SEND, h2s->h2c->conn, h2s); + if (h2_send(h2s->h2c)) + tasklet_wakeup(h2s->h2c->wait_event.tasklet); + } + + } + /* If we're waiting for flow control, and we got a shutr on the + * connection, we will never be unlocked, so add an error on + * the stream connector. 
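+ * (a stream blocked on flow control can only be unblocked by a
+ * WINDOW_UPDATE frame from the peer; with the read side shut and the
+ * demux buffer empty, no such frame can arrive anymore, so the stream
+ * would otherwise remain blocked forever)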
+ */ + if ((h2s->h2c->flags & H2_CF_RCVD_SHUT) && + !b_data(&h2s->h2c->dbuf) && + (h2s->flags & (H2_SF_BLK_SFCTL | H2_SF_BLK_MFCTL))) { + TRACE_DEVEL("fctl with shutr, reporting error to app-layer", H2_EV_H2S_SEND|H2_EV_STRM_SEND|H2_EV_STRM_ERR, h2s->h2c->conn, h2s); + se_fl_set_error(h2s->sd); + } + + if (total > 0 && !(h2s->flags & H2_SF_BLK_SFCTL) && + !(h2s->flags & (H2_SF_WANT_SHUTR|H2_SF_WANT_SHUTW))) { + /* Ok we managed to send something, leave the send_list if we were still there */ + h2_remove_from_list(h2s); + TRACE_DEVEL("Removed from h2s list", H2_EV_H2S_SEND|H2_EV_H2C_SEND, h2s->h2c->conn, h2s); + } + + TRACE_LEAVE(H2_EV_H2S_SEND|H2_EV_STRM_SEND, h2s->h2c->conn, h2s); + return total; +} + +static size_t h2_nego_ff(struct stconn *sc, struct buffer *input, size_t count, unsigned int may_splice) +{ + struct h2s *h2s = __sc_mux_strm(sc); + struct h2c *h2c = h2s->h2c; + struct buffer *mbuf; + size_t sz, ret = 0; + + TRACE_ENTER(H2_EV_H2S_SEND|H2_EV_STRM_SEND, h2s->h2c->conn, h2s); + + /* If we were not just woken because we wanted to send but couldn't, + * and there's somebody else that is waiting to send, do nothing, + * we will subscribe later and be put at the end of the list + * + * WARNING: h2_done_ff() is responsible for removing the H2_SF_NOTIFIED + * flag depending on iobuf flags. + */ + if (!(h2s->flags & H2_SF_NOTIFIED) && + (!LIST_ISEMPTY(&h2c->send_list) || !LIST_ISEMPTY(&h2c->fctl_list))) { + if (LIST_INLIST(&h2s->list)) + TRACE_DEVEL("stream already waiting, leaving", H2_EV_H2S_SEND|H2_EV_H2S_BLK, h2s->h2c->conn, h2s); + else { + TRACE_DEVEL("other streams already waiting, going to the queue and leaving", H2_EV_H2S_SEND|H2_EV_H2S_BLK, h2s->h2c->conn, h2s); + h2s->h2c->flags |= H2_CF_WAIT_INLIST; + } + h2s->sd->iobuf.flags |= IOBUF_FL_FF_BLOCKED; + goto end; + } + + if (h2s_mws(h2s) <= 0) { + h2s->flags |= H2_SF_BLK_SFCTL; + if (LIST_INLIST(&h2s->list)) + LIST_DEL_INIT(&h2s->list); + LIST_APPEND(&h2c->blocked_list, &h2s->list); + h2s->sd->iobuf.flags |= IOBUF_FL_FF_BLOCKED; + TRACE_STATE("stream window <=0, flow-controlled", H2_EV_H2S_SEND|H2_EV_H2S_FCTL, h2c->conn, h2s); + goto end; + } + if (h2c->mws <= 0) { + h2s->flags |= H2_SF_BLK_MFCTL; + h2s->sd->iobuf.flags |= IOBUF_FL_FF_BLOCKED; + TRACE_STATE("connection window <=0, stream flow-controlled", H2_EV_H2S_SEND|H2_EV_H2C_FCTL, h2c->conn, h2s); + goto end; + } + + sz = count; + if (sz > h2s_mws(h2s)) + sz = h2s_mws(h2s); + if (h2c->mfs && sz > h2c->mfs) + sz = h2c->mfs; // >0 + if (sz > h2c->mws) + sz = h2c->mws; + + if (count > sz) + count = sz; + + mbuf = br_tail(h2c->mbuf); + retry: + if (br_count(h2c->mbuf) > h2c->nb_streams) { + /* more buffers than streams allocated, pointless + * to continue, we'd use more RAM for no reason. 
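+ * (one pending mux buffer per active stream is considered enough: past
+ * that point extra buffers would only grow memory usage without
+ * improving throughput, so the stream is marked blocked on room until
+ * some output is flushed)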
+ */ + h2s->flags |= H2_SF_BLK_MROOM; + h2s->sd->iobuf.flags |= IOBUF_FL_FF_BLOCKED; + TRACE_STATE("waiting for room in output buffer", H2_EV_TX_FRAME|H2_EV_TX_DATA|H2_EV_H2S_BLK, h2c->conn, h2s); + goto end; + } + + if (!h2_get_buf(h2c, mbuf)) { + h2c->flags |= H2_CF_MUX_MALLOC; + h2s->flags |= H2_SF_BLK_MROOM; + h2s->sd->iobuf.flags |= IOBUF_FL_FF_BLOCKED; + TRACE_STATE("waiting for room in output buffer", H2_EV_H2S_SEND|H2_EV_H2S_BLK, h2c->conn, h2s); + goto end; + } + + if (b_room(mbuf) < sz && b_room(mbuf) < b_size(mbuf) / 4) { + if ((mbuf = br_tail_add(h2c->mbuf)) != NULL) + goto retry; + h2c->flags |= H2_CF_MUX_MFULL; + h2s->flags |= H2_SF_BLK_MROOM; + h2s->sd->iobuf.flags |= IOBUF_FL_FF_BLOCKED; + TRACE_STATE("too large data present in output buffer, waiting for emptiness", H2_EV_H2S_SEND|H2_EV_H2S_BLK, h2c->conn, h2s); + goto end; + } + + while (1) { + if (b_contig_space(mbuf) >= 9 || !b_space_wraps(mbuf)) + break; + b_slow_realign(mbuf, trash.area, b_data(mbuf)); + } + + if (b_contig_space(mbuf) <= 9) { + if ((mbuf = br_tail_add(h2c->mbuf)) != NULL) + goto retry; + h2c->flags |= H2_CF_MUX_MFULL; + h2s->flags |= H2_SF_BLK_MROOM; + h2s->sd->iobuf.flags |= IOBUF_FL_FF_BLOCKED; + TRACE_STATE("output buffer full", H2_EV_H2S_SEND|H2_EV_H2S_BLK, h2c->conn, h2s); + goto end; + } + + /* Cannot forward more than available room in output buffer */ + sz = b_contig_space(mbuf) - 9; + if (count > sz) + count = sz; + + /* len: 0x000000 (fill later), type: 0(DATA), flags: none=0 */ + memcpy(b_tail(mbuf), "\x00\x00\x00\x00\x00", 5); + write_n32(b_tail(mbuf) + 5, h2s->id); // 4 bytes + + h2s->sd->iobuf.buf = mbuf; + h2s->sd->iobuf.offset = 9; + h2s->sd->iobuf.data = 0; + + /* forward remaining input data */ + if (b_data(input)) { + size_t xfer = count; + + if (xfer > b_data(input)) + xfer = b_data(input); + b_add(mbuf, 9); + h2s->sd->iobuf.data = b_xfer(mbuf, input, xfer); + b_sub(mbuf, 9); + + /* Cannot forward more data, wait for room */ + if (b_data(input)) + goto end; + } + + ret = count - h2s->sd->iobuf.data; + end: + if (h2s->sd->iobuf.flags & IOBUF_FL_FF_BLOCKED) + h2s->flags &= ~H2_SF_NOTIFIED; + TRACE_LEAVE(H2_EV_H2S_SEND|H2_EV_STRM_SEND, h2s->h2c->conn, h2s); + return ret; +} + +static size_t h2_done_ff(struct stconn *sc) +{ + struct h2s *h2s = __sc_mux_strm(sc); + struct h2c *h2c = h2s->h2c; + struct sedesc *sd = h2s->sd; + struct buffer *mbuf; + char *head; + size_t total = 0; + + TRACE_ENTER(H2_EV_H2S_SEND|H2_EV_STRM_SEND, h2s->h2c->conn, h2s); + + mbuf = sd->iobuf.buf; + if (!mbuf) + goto end; + head = b_peek(mbuf, b_data(mbuf) - sd->iobuf.data); + + if (sd->iobuf.flags & IOBUF_FL_EOI) + h2s->flags &= ~H2_SF_MORE_HTX_DATA; + + if (!(sd->iobuf.flags & IOBUF_FL_FF_BLOCKED) && + !(h2s->flags & H2_SF_BLK_SFCTL) && + !(h2s->flags & (H2_SF_WANT_SHUTR|H2_SF_WANT_SHUTW))) { + /* Ok we managed to send something, leave the send_list if we were still there */ + h2_remove_from_list(h2s); + } + + if (!sd->iobuf.data) + goto end; + + /* Perform a synchronous send but in all cases, consider + * everything was already sent from the SC point of view. 
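+ *
+ * This is also where the DATA frame reserved by h2_nego_ff() gets
+ * finalized: that function wrote a placeholder 9-byte frame header
+ * (24-bit length left at zero, type DATA, no flags, then the stream id
+ * on 32 bits). Now that the payload size is known, h2_set_frame_size()
+ * fills in the length field, b_add() commits the header bytes, and the
+ * stream and connection send windows are charged for the payload.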
+ */ + total = sd->iobuf.data; + h2_set_frame_size(head, total); + b_add(mbuf, 9); + h2s->sws -= total; + h2c->mws -= total; + if (h2_send(h2s->h2c)) + tasklet_wakeup(h2s->h2c->wait_event.tasklet); + + end: + sd->iobuf.buf = NULL; + sd->iobuf.offset = 0; + sd->iobuf.data = 0; + + if (!(sd->iobuf.flags & IOBUF_FL_INTERIM_FF)) + h2s->flags &= ~H2_SF_NOTIFIED; + + TRACE_LEAVE(H2_EV_H2S_SEND|H2_EV_STRM_SEND, h2s->h2c->conn, h2s); + return total; +} + +static int h2_resume_ff(struct stconn *sc, unsigned int flags) +{ + return 0; +} + +/* appends some info about stream <h2s> to buffer <msg>, or does nothing if + * <h2s> is NULL. Returns non-zero if the stream is considered suspicious. May + * emit multiple lines, each new one being prefixed with <pfx>, if <pfx> is not + * NULL, otherwise a single line is used. + */ +static int h2_dump_h2s_info(struct buffer *msg, const struct h2s *h2s, const char *pfx) +{ + int ret = 0; + + if (!h2s) + return ret; + + chunk_appendf(msg, " h2s.id=%d .st=%s .flg=0x%04x .rxbuf=%u@%p+%u/%u", + h2s->id, h2s_st_to_str(h2s->st), h2s->flags, + (unsigned int)b_data(&h2s->rxbuf), b_orig(&h2s->rxbuf), + (unsigned int)b_head_ofs(&h2s->rxbuf), (unsigned int)b_size(&h2s->rxbuf)); + + if (pfx) + chunk_appendf(msg, "\n%s", pfx); + + chunk_appendf(msg, " .sc=%p", h2s_sc(h2s)); + if (h2s_sc(h2s)) + chunk_appendf(msg, "(.flg=0x%08x .app=%p)", + h2s_sc(h2s)->flags, h2s_sc(h2s)->app); + + chunk_appendf(msg, " .sd=%p", h2s->sd); + chunk_appendf(msg, "(.flg=0x%08x)", se_fl_get(h2s->sd)); + + if (pfx) + chunk_appendf(msg, "\n%s", pfx); + + chunk_appendf(msg, " .subs=%p", h2s->subs); + if (h2s->subs) { + chunk_appendf(msg, "(ev=%d tl=%p", h2s->subs->events, h2s->subs->tasklet); + chunk_appendf(msg, " tl.calls=%d tl.ctx=%p tl.fct=", + h2s->subs->tasklet->calls, + h2s->subs->tasklet->context); + if (h2s->subs->tasklet->calls >= 1000000) + ret = 1; + resolve_sym_name(msg, NULL, h2s->subs->tasklet->process); + chunk_appendf(msg, ")"); + } + return ret; +} + +/* appends some info about connection <h2c> to buffer <msg>, or does nothing if + * <h2c> is NULL. Returns non-zero if the connection is considered suspicious. + * May emit multiple lines, each new one being prefixed with <pfx>, if <pfx> is + * not NULL, otherwise a single line is used. 
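+ *
+ * Illustrative first line of output (hypothetical values, following the
+ * chunk_appendf() format strings below):
+ *   h2c.st0=FRH .err=0 .maxid=3 .lastid=-1 .flg=0x0000 .nbst=1 .nbsc=1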
+ */ +static int h2_dump_h2c_info(struct buffer *msg, struct h2c *h2c, const char *pfx) +{ + const struct buffer *hmbuf, *tmbuf; + const struct h2s *h2s = NULL; + struct eb32_node *node; + int fctl_cnt = 0; + int send_cnt = 0; + int tree_cnt = 0; + int orph_cnt = 0; + int ret = 0; + + if (!h2c) + return ret; + + list_for_each_entry(h2s, &h2c->fctl_list, list) + fctl_cnt++; + + list_for_each_entry(h2s, &h2c->send_list, list) + send_cnt++; + + node = eb32_first(&h2c->streams_by_id); + while (node) { + h2s = container_of(node, struct h2s, by_id); + tree_cnt++; + if (!h2s_sc(h2s)) + orph_cnt++; + node = eb32_next(node); + } + + hmbuf = br_head(h2c->mbuf); + tmbuf = br_tail(h2c->mbuf); + chunk_appendf(msg, " h2c.st0=%s .err=%d .maxid=%d .lastid=%d .flg=0x%04x" + " .nbst=%u .nbsc=%u", + h2c_st_to_str(h2c->st0), h2c->errcode, h2c->max_id, h2c->last_sid, h2c->flags, + h2c->nb_streams, h2c->nb_sc); + + if (pfx) + chunk_appendf(msg, "\n%s", pfx); + + chunk_appendf(msg, " .fctl_cnt=%d .send_cnt=%d .tree_cnt=%d" + " .orph_cnt=%d .sub=%d .dsi=%d .dbuf=%u@%p+%u/%u", + fctl_cnt, send_cnt, tree_cnt, orph_cnt, + h2c->wait_event.events, h2c->dsi, + (unsigned int)b_data(&h2c->dbuf), b_orig(&h2c->dbuf), + (unsigned int)b_head_ofs(&h2c->dbuf), (unsigned int)b_size(&h2c->dbuf)); + + if (pfx) + chunk_appendf(msg, "\n%s", pfx); + + chunk_appendf(msg, " .mbuf=[%u..%u|%u],h=[%u@%p+%u/%u],t=[%u@%p+%u/%u]", + br_head_idx(h2c->mbuf), br_tail_idx(h2c->mbuf), br_size(h2c->mbuf), + (unsigned int)b_data(hmbuf), b_orig(hmbuf), + (unsigned int)b_head_ofs(hmbuf), (unsigned int)b_size(hmbuf), + (unsigned int)b_data(tmbuf), b_orig(tmbuf), + (unsigned int)b_head_ofs(tmbuf), (unsigned int)b_size(tmbuf)); + + chunk_appendf(msg, " .task=%p", h2c->task); + if (h2c->task) { + chunk_appendf(msg, " .exp=%s", + h2c->task->expire ? tick_is_expired(h2c->task->expire, now_ms) ? "<PAST>" : + human_time(TICKS_TO_MS(h2c->task->expire - now_ms), TICKS_TO_MS(1000)) : "<NEVER>"); + } + + return ret; +} + +/* for debugging with CLI's "show fd" command */ +static int h2_show_fd(struct buffer *msg, struct connection *conn) +{ + struct h2c *h2c = conn->ctx; + const struct h2s *h2s; + struct eb32_node *node; + int ret = 0; + + if (!h2c) + return ret; + + ret |= h2_dump_h2c_info(msg, h2c, NULL); + + node = eb32_last(&h2c->streams_by_id); + if (node) { + h2s = container_of(node, struct h2s, by_id); + chunk_appendf(msg, " last_h2s=%p", h2s); + ret |= h2_dump_h2s_info(msg, h2s, NULL); + } + + return ret; +} + +/* for debugging with CLI's "show sess" command. May emit multiple lines, each + * new one being prefixed with <pfx>, if <pfx> is not NULL, otherwise a single + * line is used. Each field starts with a space so it's safe to print it after + * existing fields. + */ +static int h2_show_sd(struct buffer *msg, struct sedesc *sd, const char *pfx) +{ + struct h2s *h2s = sd->se; + int ret = 0; + + if (!h2s) + return ret; + + chunk_appendf(msg, " h2s=%p", h2s); + ret |= h2_dump_h2s_info(msg, h2s, pfx); + if (pfx) + chunk_appendf(msg, "\n%s", pfx); + chunk_appendf(msg, " h2c=%p", h2s->h2c); + ret |= h2_dump_h2c_info(msg, h2s->h2c, pfx); + return ret; +} + +/* Migrate the connection to the current thread. + * Return 0 if successful, non-zero otherwise. + * Expected to be called with the old thread lock held. 
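+ *
+ * The sequence below first pre-allocates a replacement task and tasklet
+ * so that nothing has to be rolled back once the fd has changed hands,
+ * then takes over the fd and the transport layer, kills the old timeout
+ * task on its original thread, and finally installs the new task and
+ * tasklet before re-subscribing for reads.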
+ */ +static int h2_takeover(struct connection *conn, int orig_tid) +{ + struct h2c *h2c = conn->ctx; + struct task *task; + struct task *new_task; + struct tasklet *new_tasklet; + + /* Pre-allocate tasks so that we don't have to roll back after the xprt + * has been migrated. + */ + new_task = task_new_here(); + new_tasklet = tasklet_new(); + if (!new_task || !new_tasklet) + goto fail; + + if (fd_takeover(conn->handle.fd, conn) != 0) + goto fail; + + if (conn->xprt->takeover && conn->xprt->takeover(conn, conn->xprt_ctx, orig_tid) != 0) { + /* We failed to takeover the xprt, even if the connection may + * still be valid, flag it as error'd, as we have already + * taken over the fd, and wake the tasklet, so that it will + * destroy it. + */ + conn->flags |= CO_FL_ERROR; + tasklet_wakeup_on(h2c->wait_event.tasklet, orig_tid); + goto fail; + } + + if (h2c->wait_event.events) + h2c->conn->xprt->unsubscribe(h2c->conn, h2c->conn->xprt_ctx, + h2c->wait_event.events, &h2c->wait_event); + + task = h2c->task; + if (task) { + /* only assign a task if there was already one, otherwise + * the preallocated new task will be released. + */ + task->context = NULL; + h2c->task = NULL; + __ha_barrier_store(); + task_kill(task); + + h2c->task = new_task; + new_task = NULL; + h2c->task->process = h2_timeout_task; + h2c->task->context = h2c; + } + + /* To let the tasklet know it should free itself, and do nothing else, + * set its context to NULL. + */ + h2c->wait_event.tasklet->context = NULL; + tasklet_wakeup_on(h2c->wait_event.tasklet, orig_tid); + + h2c->wait_event.tasklet = new_tasklet; + h2c->wait_event.tasklet->process = h2_io_cb; + h2c->wait_event.tasklet->context = h2c; + h2c->conn->xprt->subscribe(h2c->conn, h2c->conn->xprt_ctx, + SUB_RETRY_RECV, &h2c->wait_event); + + if (new_task) + __task_free(new_task); + return 0; + fail: + if (new_task) + __task_free(new_task); + tasklet_free(new_tasklet); + return -1; +} + +/*******************************************************/ +/* functions below are dedicated to the config parsers */ +/*******************************************************/ + +/* config parser for global "tune.h2.header-table-size" */ +static int h2_parse_header_table_size(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + h2_settings_header_table_size = atoi(args[1]); + if (h2_settings_header_table_size < 4096 || h2_settings_header_table_size > 65536) { + memprintf(err, "'%s' expects a numeric value between 4096 and 65536.", args[0]); + return -1; + } + return 0; +} + +/* config parser for global "tune.h2.{be.,fe.,}initial-window-size" */ +static int h2_parse_initial_window_size(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + int *vptr; + + if (too_many_args(1, args, err, NULL)) + return -1; + + /* backend/frontend/default */ + vptr = (args[0][8] == 'b') ? &h2_be_settings_initial_window_size : + (args[0][8] == 'f') ? 
&h2_fe_settings_initial_window_size : + &h2_settings_initial_window_size; + + *vptr = atoi(args[1]); + if (*vptr < 0) { + memprintf(err, "'%s' expects a positive numeric value.", args[0]); + return -1; + } + return 0; +} + +/* config parser for global "tune.h2.{be.,fe.,}max-concurrent-streams" */ +static int h2_parse_max_concurrent_streams(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + uint *vptr; + + if (too_many_args(1, args, err, NULL)) + return -1; + + /* backend/frontend/default */ + vptr = (args[0][8] == 'b') ? &h2_be_settings_max_concurrent_streams : + (args[0][8] == 'f') ? &h2_fe_settings_max_concurrent_streams : + &h2_settings_max_concurrent_streams; + + *vptr = atoi(args[1]); + if ((int)*vptr < 0) { + memprintf(err, "'%s' expects a positive numeric value.", args[0]); + return -1; + } + return 0; +} + +/* config parser for global "tune.h2.fe.max-total-streams" */ +static int h2_parse_max_total_streams(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + uint *vptr; + + if (too_many_args(1, args, err, NULL)) + return -1; + + /* frontend only for now */ + vptr = &h2_fe_max_total_streams; + + *vptr = atoi(args[1]); + if ((int)*vptr < 0) { + memprintf(err, "'%s' expects a positive numeric value.", args[0]); + return -1; + } + return 0; +} + +/* config parser for global "tune.h2.max-frame-size" */ +static int h2_parse_max_frame_size(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + h2_settings_max_frame_size = atoi(args[1]); + if (h2_settings_max_frame_size < 16384 || h2_settings_max_frame_size > 16777215) { + memprintf(err, "'%s' expects a numeric value between 16384 and 16777215.", args[0]); + return -1; + } + return 0; +} + + +/* config parser for global "tune.h2.zero-copy-fwd-send" */ +static int h2_parse_zero_copy_fwd_snd(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (strcmp(args[1], "on") == 0) + global.tune.no_zero_copy_fwd &= ~NO_ZERO_COPY_FWD_H2_SND; + else if (strcmp(args[1], "off") == 0) + global.tune.no_zero_copy_fwd |= NO_ZERO_COPY_FWD_H2_SND; + else { + memprintf(err, "'%s' expects 'on' or 'off'.", args[0]); + return -1; + } + return 0; +} + +/****************************************/ +/* MUX initialization and instantiation */ +/***************************************/ + +/* The mux operations */ +static const struct mux_ops h2_ops = { + .init = h2_init, + .wake = h2_wake, + .snd_buf = h2_snd_buf, + .rcv_buf = h2_rcv_buf, + .nego_fastfwd = h2_nego_ff, + .done_fastfwd = h2_done_ff, + .resume_fastfwd = h2_resume_ff, + .subscribe = h2_subscribe, + .unsubscribe = h2_unsubscribe, + .attach = h2_attach, + .get_first_sc = h2_get_first_sc, + .detach = h2_detach, + .destroy = h2_destroy, + .avail_streams = h2_avail_streams, + .used_streams = h2_used_streams, + .shutr = h2_shutr, + .shutw = h2_shutw, + .ctl = h2_ctl, + .sctl = h2_sctl, + .show_fd = h2_show_fd, + .show_sd = h2_show_sd, + .takeover = h2_takeover, + .flags = MX_FL_HTX|MX_FL_HOL_RISK|MX_FL_NO_UPG|MX_FL_REVERSABLE, + .name = "H2", +}; + +static struct mux_proto_list mux_proto_h2 = + { .token = IST("h2"), .mode = PROTO_MODE_HTTP, .side = PROTO_SIDE_BOTH, .mux = &h2_ops }; + 
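+/* Illustrative haproxy.cfg excerpt (hypothetical values, all within the
+ * ranges checked by the parsers above) using the tune.h2.* keywords
+ * registered in the cfg_kws list further below:
+ *
+ *     global
+ *         tune.h2.header-table-size          4096
+ *         tune.h2.fe.initial-window-size     65535
+ *         tune.h2.fe.max-concurrent-streams  100
+ *         tune.h2.max-frame-size             16384
+ *         tune.h2.zero-copy-fwd-send         on
+ */
+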
+INITCALL1(STG_REGISTER, register_mux_proto, &mux_proto_h2); + +/* config keyword parsers */ +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_GLOBAL, "tune.h2.be.initial-window-size", h2_parse_initial_window_size }, + { CFG_GLOBAL, "tune.h2.be.max-concurrent-streams", h2_parse_max_concurrent_streams }, + { CFG_GLOBAL, "tune.h2.fe.initial-window-size", h2_parse_initial_window_size }, + { CFG_GLOBAL, "tune.h2.fe.max-concurrent-streams", h2_parse_max_concurrent_streams }, + { CFG_GLOBAL, "tune.h2.fe.max-total-streams", h2_parse_max_total_streams }, + { CFG_GLOBAL, "tune.h2.header-table-size", h2_parse_header_table_size }, + { CFG_GLOBAL, "tune.h2.initial-window-size", h2_parse_initial_window_size }, + { CFG_GLOBAL, "tune.h2.max-concurrent-streams", h2_parse_max_concurrent_streams }, + { CFG_GLOBAL, "tune.h2.max-frame-size", h2_parse_max_frame_size }, + { CFG_GLOBAL, "tune.h2.zero-copy-fwd-send", h2_parse_zero_copy_fwd_snd }, + { 0, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); + +/* initialize internal structs after the config is parsed. + * Returns zero on success, non-zero on error. + */ +static int init_h2() +{ + pool_head_hpack_tbl = create_pool("hpack_tbl", + h2_settings_header_table_size, + MEM_F_SHARED|MEM_F_EXACT); + if (!pool_head_hpack_tbl) { + ha_alert("failed to allocate hpack_tbl memory pool\n"); + return (ERR_ALERT | ERR_FATAL); + } + return ERR_NONE; +} + +REGISTER_POST_CHECK(init_h2); diff --git a/src/mux_pt.c b/src/mux_pt.c new file mode 100644 index 0000000..3cca6a1 --- /dev/null +++ b/src/mux_pt.c @@ -0,0 +1,904 @@ +/* + * Pass-through mux-demux for connections + * + * Copyright 2017 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <haproxy/api.h> +#include <haproxy/buf.h> +#include <haproxy/cfgparse.h> +#include <haproxy/connection.h> +#include <haproxy/pipe.h> +#include <haproxy/stconn.h> +#include <haproxy/stream.h> +#include <haproxy/task.h> +#include <haproxy/trace.h> +#include <haproxy/xref.h> + +struct mux_pt_ctx { + struct sedesc *sd; + struct connection *conn; + struct wait_event wait_event; +}; + +DECLARE_STATIC_POOL(pool_head_pt_ctx, "mux_pt", sizeof(struct mux_pt_ctx)); + +/* trace source and events */ +static void pt_trace(enum trace_level level, uint64_t mask, + const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4); + +/* The event representation is split like this : + * pt_ctx - internal PT context + * strm - application layer + */ +static const struct trace_event pt_trace_events[] = { +#define PT_EV_CONN_NEW (1ULL << 0) + { .mask = PT_EV_CONN_NEW, .name = "pt_conn_new", .desc = "new PT connection" }, +#define PT_EV_CONN_WAKE (1ULL << 1) + { .mask = PT_EV_CONN_WAKE, .name = "pt_conn_wake", .desc = "PT connection woken up" }, +#define PT_EV_CONN_END (1ULL << 2) + { .mask = PT_EV_CONN_END, .name = "pt_conn_end", .desc = "PT connection terminated" }, +#define PT_EV_CONN_ERR (1ULL << 3) + { .mask = PT_EV_CONN_ERR, .name = "pt_conn_err", .desc = "error on PT connection" }, +#define PT_EV_STRM_NEW (1ULL << 4) + { .mask = PT_EV_STRM_NEW, .name = "strm_new", .desc = "app-layer stream creation" }, +#define PT_EV_STRM_SHUT (1ULL << 5) + { .mask = PT_EV_STRM_SHUT, .name = "strm_shut", .desc = "stream shutdown" }, +#define PT_EV_STRM_END (1ULL << 6) + { .mask = PT_EV_STRM_END, .name = "strm_end", .desc = "detaching app-layer stream" }, +#define PT_EV_STRM_ERR (1ULL << 7) + { .mask = PT_EV_STRM_ERR, .name = "strm_err", .desc = "stream error" }, +#define PT_EV_RX_DATA (1ULL << 8) + { .mask = PT_EV_RX_DATA, .name = "pt_rx_data", .desc = "Rx on PT connection" }, +#define PT_EV_TX_DATA (1ULL << 9) + { .mask = PT_EV_TX_DATA, .name = "pt_tx_data", .desc = "Tx on PT connection" }, + + {} +}; + + +static const struct name_desc pt_trace_decoding[] = { +#define PT_VERB_CLEAN 1 + { .name="clean", .desc="only user-friendly stuff, generally suitable for level \"user\"" }, +#define PT_VERB_MINIMAL 2 + { .name="minimal", .desc="report only pt context state and flags, no real decoding" }, +#define PT_VERB_SIMPLE 3 + { .name="simple", .desc="add request/response status line or htx info when available" }, +#define PT_VERB_ADVANCED 4 + { .name="advanced", .desc="add header fields or frame decoding when available" }, +#define PT_VERB_COMPLETE 5 + { .name="complete", .desc="add full data dump when available" }, + { /* end */ } +}; + +static struct trace_source trace_pt __read_mostly = { + .name = IST("pt"), + .desc = "Passthrough multiplexer", + .arg_def = TRC_ARG1_CONN, // TRACE()'s first argument is always a connection + .default_cb = pt_trace, + .known_events = pt_trace_events, + .lockon_args = NULL, + .decoding = pt_trace_decoding, + .report_events = ~0, // report everything by default +}; + +#define TRACE_SOURCE &trace_pt +INITCALL1(STG_REGISTER, trace_register_source, TRACE_SOURCE); + +/* returns the stconn associated to the stream */ +static forceinline struct stconn *pt_sc(const struct mux_pt_ctx *pt) +{ + return pt->sd->sc; +} + +static inline void pt_trace_buf(const struct buffer *buf, size_t ofs, size_t len) +{ + size_t block1, block2; + int line, ptr, newptr; + + block1 = b_contig_data(buf, ofs); + block2 = 0; + if 
(block1 > len) + block1 = len; + block2 = len - block1; + + ofs = b_peek_ofs(buf, ofs); + + line = 0; + ptr = ofs; + while (ptr < ofs + block1) { + newptr = dump_text_line(&trace_buf, b_orig(buf), b_size(buf), ofs + block1, &line, ptr); + if (newptr == ptr) + break; + ptr = newptr; + } + + line = ptr = 0; + while (ptr < block2) { + newptr = dump_text_line(&trace_buf, b_orig(buf), b_size(buf), block2, &line, ptr); + if (newptr == ptr) + break; + ptr = newptr; + } +} + +/* the PT traces always expect that arg1, if non-null, is of type connection + * (from which we can derive the pt context), that arg2, if non-null, is a + * stream connector, and that arg3, if non-null, is a buffer. + */ +static void pt_trace(enum trace_level level, uint64_t mask, const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4) +{ + const struct connection *conn = a1; + const struct mux_pt_ctx *ctx = conn ? conn->ctx : NULL; + const struct stconn *sc = a2; + const struct buffer *buf = a3; + const size_t *val = a4; + + if (!ctx || src->verbosity < PT_VERB_CLEAN) + return; + + /* Display frontend/backend info by default */ + chunk_appendf(&trace_buf, " : [%c]", (conn_is_back(conn) ? 'B' : 'F')); + + if (src->verbosity == PT_VERB_CLEAN) + return; + + if (!sc) + sc = pt_sc(ctx); + + /* Display the value to the 4th argument (level > STATE) */ + if (src->level > TRACE_LEVEL_STATE && val) + chunk_appendf(&trace_buf, " - VAL=%lu", (long)*val); + + /* Display conn and sc info, if defined (pointer + flags) */ + chunk_appendf(&trace_buf, " - conn=%p(0x%08x)", conn, conn->flags); + chunk_appendf(&trace_buf, " sd=%p(0x%08x)", ctx->sd, se_fl_get(ctx->sd)); + if (sc) + chunk_appendf(&trace_buf, " sc=%p(0x%08x)", sc, sc->flags); + + if (src->verbosity == PT_VERB_MINIMAL) + return; + + /* Display buffer info, if defined (level > USER & verbosity > SIMPLE) */ + if (src->level > TRACE_LEVEL_USER && buf) { + int full = 0, max = 3000, chunk = 1024; + + /* Full info (level > STATE && verbosity > SIMPLE) */ + if (src->level > TRACE_LEVEL_STATE) { + if (src->verbosity == PT_VERB_COMPLETE) + full = 1; + else if (src->verbosity == PT_VERB_ADVANCED) { + full = 1; + max = 256; + chunk = 64; + } + } + + chunk_appendf(&trace_buf, " buf=%u@%p+%u/%u", + (unsigned int)b_data(buf), b_orig(buf), + (unsigned int)b_head_ofs(buf), (unsigned int)b_size(buf)); + + if (b_data(buf) && full) { + chunk_memcat(&trace_buf, "\n", 1); + if (b_data(buf) < max) + pt_trace_buf(buf, 0, b_data(buf)); + else { + pt_trace_buf(buf, 0, chunk); + chunk_memcat(&trace_buf, " ...\n", 6); + pt_trace_buf(buf, b_data(buf) - chunk, chunk); + } + } + } +} + +static void mux_pt_destroy(struct mux_pt_ctx *ctx) +{ + struct connection *conn = NULL; + + TRACE_POINT(PT_EV_CONN_END); + + /* The connection must be attached to this mux to be released */ + if (ctx->conn && ctx->conn->ctx == ctx) + conn = ctx->conn; + + tasklet_free(ctx->wait_event.tasklet); + + if (conn && ctx->wait_event.events != 0) + conn->xprt->unsubscribe(conn, conn->xprt_ctx, ctx->wait_event.events, + &ctx->wait_event); + BUG_ON(ctx->sd && !se_fl_test(ctx->sd, SE_FL_ORPHAN)); + sedesc_free(ctx->sd); + pool_free(pool_head_pt_ctx, ctx); + + if (conn) { + conn->mux = NULL; + conn->ctx = NULL; + TRACE_DEVEL("freeing conn", PT_EV_CONN_END, conn); + + conn_stop_tracking(conn); + conn_full_close(conn); + if (conn->destroy_cb) + conn->destroy_cb(conn); + conn_free(conn); + } +} + +/* Callback, used when we get I/Os while in idle mode. 
This one is exported so + * that "show fd" can resolve it. + */ +struct task *mux_pt_io_cb(struct task *t, void *tctx, unsigned int status) +{ + struct mux_pt_ctx *ctx = tctx; + + TRACE_ENTER(PT_EV_CONN_WAKE, ctx->conn); + if (!se_fl_test(ctx->sd, SE_FL_ORPHAN)) { + /* There's a small race condition. + * mux_pt_io_cb() is only supposed to be called if we have no + * stream attached. However, maybe the tasklet got woken up, + * and this connection was then attached to a new stream. + * If this happened, just wake the tasklet up if anybody + * subscribed to receive events, and otherwise call the wake + * method, to make sure the event is noticed. + */ + if (ctx->conn->subs) { + ctx->conn->subs->events = 0; + tasklet_wakeup(ctx->conn->subs->tasklet); + ctx->conn->subs = NULL; + } else if (pt_sc(ctx)->app_ops->wake) + pt_sc(ctx)->app_ops->wake(pt_sc(ctx)); + TRACE_DEVEL("leaving waking up SC", PT_EV_CONN_WAKE, ctx->conn); + return t; + } + conn_ctrl_drain(ctx->conn); + if (ctx->conn->flags & (CO_FL_ERROR | CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH)) { + TRACE_DEVEL("leaving destroying pt context", PT_EV_CONN_WAKE, ctx->conn); + mux_pt_destroy(ctx); + t = NULL; + } + else { + ctx->conn->xprt->subscribe(ctx->conn, ctx->conn->xprt_ctx, SUB_RETRY_RECV, + &ctx->wait_event); + TRACE_DEVEL("leaving subscribing for reads", PT_EV_CONN_WAKE, ctx->conn); + } + + return t; +} + +/* Initialize the mux once it's attached. It is expected that conn->ctx points + * to the existing stream connector (for outgoing connections) or NULL (for + * incoming ones, in which case one will be allocated and a new stream will be + * instantiated). Returns < 0 on error. + */ +static int mux_pt_init(struct connection *conn, struct proxy *prx, struct session *sess, + struct buffer *input) +{ + struct stconn *sc = conn->ctx; + struct mux_pt_ctx *ctx = pool_alloc(pool_head_pt_ctx); + + TRACE_ENTER(PT_EV_CONN_NEW); + + if (!ctx) { + TRACE_ERROR("PT context allocation failure", PT_EV_CONN_NEW|PT_EV_CONN_END|PT_EV_CONN_ERR); + goto fail; + } + + ctx->wait_event.tasklet = tasklet_new(); + if (!ctx->wait_event.tasklet) + goto fail_free_ctx; + ctx->wait_event.tasklet->context = ctx; + ctx->wait_event.tasklet->process = mux_pt_io_cb; + ctx->wait_event.events = 0; + ctx->conn = conn; + + if (!sc) { + ctx->sd = sedesc_new(); + if (!ctx->sd) { + TRACE_ERROR("SC allocation failure", PT_EV_STRM_NEW|PT_EV_STRM_END|PT_EV_STRM_ERR, conn); + goto fail_free_ctx; + } + ctx->sd->se = ctx; + ctx->sd->conn = conn; + se_fl_set(ctx->sd, SE_FL_T_MUX | SE_FL_ORPHAN); + + sc = sc_new_from_endp(ctx->sd, sess, input); + if (!sc) { + TRACE_ERROR("SC allocation failure", PT_EV_STRM_NEW|PT_EV_STRM_END|PT_EV_STRM_ERR, conn); + goto fail_free_sd; + } + TRACE_POINT(PT_EV_STRM_NEW, conn, sc); + } + else { + if (sc_attach_mux(sc, ctx, conn) < 0) + goto fail_free_ctx; + ctx->sd = sc->sedesc; + } + conn->ctx = ctx; + se_fl_set(ctx->sd, SE_FL_RCV_MORE); + if ((global.tune.options & GTUNE_USE_SPLICE) && !(global.tune.no_zero_copy_fwd & NO_ZERO_COPY_FWD_PT)) + se_fl_set(ctx->sd, SE_FL_MAY_FASTFWD_PROD|SE_FL_MAY_FASTFWD_CONS); + + TRACE_LEAVE(PT_EV_CONN_NEW, conn); + return 0; + + fail_free_sd: + sedesc_free(ctx->sd); + fail_free_ctx: + tasklet_free(ctx->wait_event.tasklet); + pool_free(pool_head_pt_ctx, ctx); + fail: + TRACE_DEVEL("leaving in error", PT_EV_CONN_NEW|PT_EV_CONN_END|PT_EV_CONN_ERR); + return -1; +} + +/* callback to be used by default for the pass-through mux. It calls the data + * layer wake() callback if it is set otherwise returns 0. 
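+ * For an orphaned connection (no stream attached), it instead drains
+ * pending input and destroys the mux context if the connection reported
+ * an error or a read shutdown.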
+ */ +static int mux_pt_wake(struct connection *conn) +{ + struct mux_pt_ctx *ctx = conn->ctx; + int ret = 0; + + TRACE_ENTER(PT_EV_CONN_WAKE, ctx->conn); + if (!se_fl_test(ctx->sd, SE_FL_ORPHAN)) { + ret = pt_sc(ctx)->app_ops->wake ? pt_sc(ctx)->app_ops->wake(pt_sc(ctx)) : 0; + + if (ret < 0) { + TRACE_DEVEL("leaving waking up SC", PT_EV_CONN_WAKE, ctx->conn); + return ret; + } + } else { + conn_ctrl_drain(conn); + if (conn->flags & (CO_FL_ERROR | CO_FL_SOCK_RD_SH)) { + TRACE_DEVEL("leaving destroying PT context", PT_EV_CONN_WAKE, ctx->conn); + mux_pt_destroy(ctx); + return -1; + } + } + + /* If we had early data, and we're done with the handshake + * then we know the data are safe, and we can remove the flag. + */ + if ((conn->flags & (CO_FL_EARLY_DATA | CO_FL_EARLY_SSL_HS | CO_FL_WAIT_XPRT)) == + CO_FL_EARLY_DATA) + conn->flags &= ~CO_FL_EARLY_DATA; + + TRACE_LEAVE(PT_EV_CONN_WAKE, ctx->conn); + return ret; +} + +/* + * Attach a new stream to a connection + * (Used for outgoing connections) + */ +static int mux_pt_attach(struct connection *conn, struct sedesc *sd, struct session *sess) +{ + struct mux_pt_ctx *ctx = conn->ctx; + + TRACE_ENTER(PT_EV_STRM_NEW, conn); + if (ctx->wait_event.events) + conn->xprt->unsubscribe(ctx->conn, conn->xprt_ctx, SUB_RETRY_RECV, &ctx->wait_event); + if (sc_attach_mux(sd->sc, ctx, conn) < 0) + return -1; + ctx->sd = sd; + se_fl_set(ctx->sd, SE_FL_RCV_MORE); + if ((global.tune.options & GTUNE_USE_SPLICE) && !(global.tune.no_zero_copy_fwd & NO_ZERO_COPY_FWD_PT)) + se_fl_set(ctx->sd, SE_FL_MAY_FASTFWD_PROD|SE_FL_MAY_FASTFWD_CONS); + + TRACE_LEAVE(PT_EV_STRM_NEW, conn, sd->sc); + return 0; +} + +/* Retrieves a valid stream connector from this connection, or returns NULL. + * For this mux, it's easy as we can only store a single stream connector. + */ +static struct stconn *mux_pt_get_first_sc(const struct connection *conn) +{ + struct mux_pt_ctx *ctx = conn->ctx; + + return pt_sc(ctx); +} + +/* Destroy the mux and the associated connection if still attached to this mux + * and no longer used */ +static void mux_pt_destroy_meth(void *ctx) +{ + struct mux_pt_ctx *pt = ctx; + + TRACE_POINT(PT_EV_CONN_END, pt->conn, pt_sc(pt)); + if (se_fl_test(pt->sd, SE_FL_ORPHAN) || pt->conn->ctx != pt) { + if (pt->conn->ctx != pt) { + pt->sd = NULL; + } + mux_pt_destroy(pt); + } +} + +/* + * Detach the stream from the connection and possibly release the connection. + */ +static void mux_pt_detach(struct sedesc *sd) +{ + struct connection *conn = sd->conn; + struct mux_pt_ctx *ctx; + + TRACE_ENTER(PT_EV_STRM_END, conn, sd->sc); + + ctx = conn->ctx; + + /* Subscribe, to know if we got disconnected */ + if (!conn_is_back(conn) && conn->owner != NULL && + !(conn->flags & (CO_FL_ERROR | CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH))) { + conn->xprt->subscribe(conn, conn->xprt_ctx, SUB_RETRY_RECV, &ctx->wait_event); + } else { + /* There's no session attached to that connection, destroy it */ + TRACE_DEVEL("killing dead connection", PT_EV_STRM_END, conn, sd->sc); + mux_pt_destroy(ctx); + } + + TRACE_LEAVE(PT_EV_STRM_END); +} + +/* returns the number of streams in use on a connection */ +static int mux_pt_used_streams(struct connection *conn) +{ + struct mux_pt_ctx *ctx = conn->ctx; + + return (!se_fl_test(ctx->sd, SE_FL_ORPHAN) ? 
1 : 0); +} + +/* returns the number of streams still available on a connection */ +static int mux_pt_avail_streams(struct connection *conn) +{ + return 1 - mux_pt_used_streams(conn); +} + +static void mux_pt_shutr(struct stconn *sc, enum co_shr_mode mode) +{ + struct connection *conn = __sc_conn(sc); + struct mux_pt_ctx *ctx = conn->ctx; + + TRACE_ENTER(PT_EV_STRM_SHUT, conn, sc); + + se_fl_clr(ctx->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + if (conn_xprt_ready(conn) && conn->xprt->shutr) + conn->xprt->shutr(conn, conn->xprt_ctx, + (mode == CO_SHR_DRAIN)); + else if (mode == CO_SHR_DRAIN) + conn_ctrl_drain(conn); + if (se_fl_test(ctx->sd, SE_FL_SHW)) + conn_full_close(conn); + + TRACE_LEAVE(PT_EV_STRM_SHUT, conn, sc); +} + +static void mux_pt_shutw(struct stconn *sc, enum co_shw_mode mode) +{ + struct connection *conn = __sc_conn(sc); + struct mux_pt_ctx *ctx = conn->ctx; + + TRACE_ENTER(PT_EV_STRM_SHUT, conn, sc); + + if (conn_xprt_ready(conn) && conn->xprt->shutw) + conn->xprt->shutw(conn, conn->xprt_ctx, + (mode == CO_SHW_NORMAL)); + if (!se_fl_test(ctx->sd, SE_FL_SHR)) + conn_sock_shutw(conn, (mode == CO_SHW_NORMAL)); + else + conn_full_close(conn); + + TRACE_LEAVE(PT_EV_STRM_SHUT, conn, sc); +} + +/* + * Called from the upper layer, to get more data + * + * The caller is responsible for defragmenting <buf> if necessary. But <flags> + * must be tested to know the calling context. If CO_RFL_BUF_FLUSH is set, it + * means the caller wants to flush input data (from the mux buffer and the + * channel buffer) to be able to use kernel splicing or any kind of mux-to-mux + * xfer. If CO_RFL_KEEP_RECV is set, the mux must always subscribe for read + * events before giving back. CO_RFL_BUF_WET is set if <buf> is congested with + * data scheduled for leaving soon. CO_RFL_BUF_NOT_STUCK is set to instruct the + * mux it may optimize the data copy to <buf> if necessary. Otherwise, it should + * copy as much data as possible. 
+ */ +static size_t mux_pt_rcv_buf(struct stconn *sc, struct buffer *buf, size_t count, int flags) +{ + struct connection *conn = __sc_conn(sc); + struct mux_pt_ctx *ctx = conn->ctx; + size_t ret = 0; + + TRACE_ENTER(PT_EV_RX_DATA, conn, sc, buf, (size_t[]){count}); + + if (!count) { + se_fl_set(ctx->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + goto end; + } + b_realign_if_empty(buf); + ret = conn->xprt->rcv_buf(conn, conn->xprt_ctx, buf, count, flags); + if (conn->flags & CO_FL_ERROR) { + se_fl_clr(ctx->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + if (conn_xprt_read0_pending(conn)) + se_fl_set(ctx->sd, SE_FL_EOS); + se_fl_set(ctx->sd, SE_FL_ERROR); + TRACE_DEVEL("error on connection", PT_EV_RX_DATA|PT_EV_CONN_ERR, conn, sc); + } + else if (conn_xprt_read0_pending(conn)) { + se_fl_clr(ctx->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + se_fl_set(ctx->sd, (SE_FL_EOI|SE_FL_EOS)); + TRACE_DEVEL("read0 on connection", PT_EV_RX_DATA, conn, sc); + } + end: + TRACE_LEAVE(PT_EV_RX_DATA, conn, sc, buf, (size_t[]){ret}); + return ret; +} + +/* Called from the upper layer, to send data */ +static size_t mux_pt_snd_buf(struct stconn *sc, struct buffer *buf, size_t count, int flags) +{ + struct connection *conn = __sc_conn(sc); + struct mux_pt_ctx *ctx = conn->ctx; + size_t ret; + + TRACE_ENTER(PT_EV_TX_DATA, conn, sc, buf, (size_t[]){count}); + + ret = conn->xprt->snd_buf(conn, conn->xprt_ctx, buf, count, flags); + + if (ret > 0) + b_del(buf, ret); + + if (conn->flags & CO_FL_ERROR) { + if (conn_xprt_read0_pending(conn)) + se_fl_set(ctx->sd, SE_FL_EOS); + se_fl_set_error(ctx->sd); + TRACE_DEVEL("error on connection", PT_EV_TX_DATA|PT_EV_CONN_ERR, conn, sc); + } + + TRACE_LEAVE(PT_EV_TX_DATA, conn, sc, buf, (size_t[]){ret}); + return ret; +} + +static inline struct sedesc *mux_pt_opposite_sd(struct mux_pt_ctx *ctx) +{ + struct xref *peer; + struct sedesc *sdo; + + peer = xref_get_peer_and_lock(&ctx->sd->xref); + if (!peer) + return NULL; + + sdo = container_of(peer, struct sedesc, xref); + xref_unlock(&ctx->sd->xref, peer); + return sdo; +} + +static size_t mux_pt_nego_ff(struct stconn *sc, struct buffer *input, size_t count, unsigned int may_splice) +{ + struct connection *conn = __sc_conn(sc); + struct mux_pt_ctx *ctx = conn->ctx; + size_t ret = 0; + + TRACE_ENTER(PT_EV_TX_DATA, conn, sc, 0, (size_t[]){count}); + + /* Use kernel splicing if it is supported by the sender and if there + * are no input data _AND_ no output data. + * + * TODO: It may be good to add a flag to send obuf data first if any, + * and then data in pipe, or the opposite. For now, it is not + * supported to mix data. 
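+ *
+ * With kernel splicing, the payload flows from the source socket into a
+ * kernel pipe and then to the destination socket without ever being
+ * copied through userspace buffers. Pipes are borrowed from a shared
+ * pool bounded by the global maxpipes setting, hence the pipes_used
+ * check below.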
+ */ + if (!b_data(input) && may_splice) { + if (conn->xprt->snd_pipe && (ctx->sd->iobuf.pipe || (pipes_used < global.maxpipes && (ctx->sd->iobuf.pipe = get_pipe())))) { + ctx->sd->iobuf.offset = 0; + ctx->sd->iobuf.data = 0; + ret = count; + goto out; + } + ctx->sd->iobuf.flags |= IOBUF_FL_NO_SPLICING; + TRACE_DEVEL("Unable to allocate pipe for splicing, fallback to buffer", PT_EV_TX_DATA, conn, sc); + } + + /* No buffer case */ + + out: + TRACE_LEAVE(PT_EV_TX_DATA, conn, sc, 0, (size_t[]){ret}); + return ret; +} + +static size_t mux_pt_done_ff(struct stconn *sc) +{ + struct connection *conn = __sc_conn(sc); + struct mux_pt_ctx *ctx = conn->ctx; + struct sedesc *sd = ctx->sd; + size_t total = 0; + + TRACE_ENTER(PT_EV_TX_DATA, conn, sc); + + if (sd->iobuf.pipe) { + total = conn->xprt->snd_pipe(conn, conn->xprt_ctx, sd->iobuf.pipe, sd->iobuf.pipe->data); + if (!sd->iobuf.pipe->data) { + put_pipe(sd->iobuf.pipe); + sd->iobuf.pipe = NULL; + } + } + else { + BUG_ON(sd->iobuf.buf); + } + + out: + if (conn->flags & CO_FL_ERROR) { + if (conn_xprt_read0_pending(conn)) + se_fl_set(ctx->sd, SE_FL_EOS); + se_fl_set_error(ctx->sd); + TRACE_DEVEL("error on connection", PT_EV_TX_DATA|PT_EV_CONN_ERR, conn, sc); + } + + TRACE_LEAVE(PT_EV_TX_DATA, conn, sc, 0, (size_t[]){total}); + return total; +} + +static int mux_pt_fastfwd(struct stconn *sc, unsigned int count, unsigned int flags) +{ + struct connection *conn = __sc_conn(sc); + struct mux_pt_ctx *ctx = conn->ctx; + struct sedesc *sdo = NULL; + size_t total = 0, try = 0; + int ret = 0; + + TRACE_ENTER(PT_EV_RX_DATA, conn, sc, 0, (size_t[]){count}); + + se_fl_clr(ctx->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + conn->flags &= ~CO_FL_WAIT_ROOM; + sdo = mux_pt_opposite_sd(ctx); + if (!sdo) { + TRACE_STATE("Opposite endpoint not available yet", PT_EV_RX_DATA, conn, sc); + goto out; + } + + try = se_nego_ff(sdo, &BUF_NULL, count, conn->xprt->rcv_pipe && !!(flags & CO_RFL_MAY_SPLICE) && !(sdo->iobuf.flags & IOBUF_FL_NO_SPLICING)); + if (sdo->iobuf.flags & IOBUF_FL_NO_FF) { + /* Fast forwarding is not supported by the consumer */ + se_fl_clr(ctx->sd, SE_FL_MAY_FASTFWD_PROD); + TRACE_DEVEL("Fast-forwarding not supported by opposite endpoint, disable it", PT_EV_RX_DATA, conn, sc); + goto end; + } + if (sdo->iobuf.flags & IOBUF_FL_FF_BLOCKED) { + se_fl_set(ctx->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + TRACE_STATE("waiting for more room", PT_EV_RX_DATA|PT_EV_STRM_ERR, conn, sc); + goto out; + } + + total += sdo->iobuf.data; + + if (sdo->iobuf.pipe) { + /* Here, no data was xferred yet */ + ret = conn->xprt->rcv_pipe(conn, conn->xprt_ctx, sdo->iobuf.pipe, try); + if (ret < 0) { + TRACE_ERROR("Error when trying to fast-forward data, disable it and abort", + PT_EV_RX_DATA|PT_EV_STRM_ERR|PT_EV_CONN_ERR, conn, sc); + se_fl_clr(ctx->sd, SE_FL_MAY_FASTFWD_PROD); + BUG_ON(sdo->iobuf.pipe->data); + put_pipe(sdo->iobuf.pipe); + sdo->iobuf.pipe = NULL; + goto end; + } + total += ret; + } + else { + BUG_ON(sdo->iobuf.buf); + ret = -1; /* abort splicing for now and fallback to buffer mode */ + goto end; + } + + ret = total; + se_done_ff(sdo); + + if (sdo->iobuf.pipe) { + se_fl_set(ctx->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + } + + TRACE_DEVEL("Data fast-forwarded", PT_EV_RX_DATA, conn, sc, 0, (size_t[]){ret}); + + + out: + if (conn->flags & CO_FL_ERROR) { + if (conn_xprt_read0_pending(conn)) + se_fl_set(ctx->sd, SE_FL_EOS); + se_fl_set(ctx->sd, SE_FL_ERROR); + TRACE_DEVEL("error on connection", PT_EV_RX_DATA|PT_EV_CONN_ERR, conn, sc); + } + else if (conn_xprt_read0_pending(conn)) { 
+ se_fl_set(ctx->sd, (SE_FL_EOS|SE_FL_EOI)); + TRACE_DEVEL("read0 on connection", PT_EV_RX_DATA, conn, sc); + } + end: + TRACE_LEAVE(PT_EV_RX_DATA, conn, sc, 0, (size_t[]){ret}); + return ret; +} + +static int mux_pt_resume_fastfwd(struct stconn *sc, unsigned int flags) +{ + struct connection *conn = __sc_conn(sc); + struct mux_pt_ctx *ctx = conn->ctx; + struct sedesc *sd = ctx->sd; + size_t total = 0; + + TRACE_ENTER(PT_EV_TX_DATA, conn, sc, 0, (size_t[]){flags}); + + if (sd->iobuf.pipe) { + total = conn->xprt->snd_pipe(conn, conn->xprt_ctx, sd->iobuf.pipe, sd->iobuf.pipe->data); + if (!sd->iobuf.pipe->data) { + put_pipe(sd->iobuf.pipe); + sd->iobuf.pipe = NULL; + } + } + else { + BUG_ON(sd->iobuf.buf); + } + + out: + if (conn->flags & CO_FL_ERROR) { + if (conn_xprt_read0_pending(conn)) + se_fl_set(ctx->sd, SE_FL_EOS); + se_fl_set_error(ctx->sd); + TRACE_DEVEL("error on connection", PT_EV_TX_DATA|PT_EV_CONN_ERR, conn, sc); + } + + TRACE_LEAVE(PT_EV_TX_DATA, conn, sc, 0, (size_t[]){total}); + return total; +} + +/* Called from the upper layer, to subscribe <es> to events <event_type>. The + * event subscriber <es> is not allowed to change from a previous call as long + * as at least one event is still subscribed. The <event_type> must only be a + * combination of SUB_RETRY_RECV and SUB_RETRY_SEND. It always returns 0. + */ +static int mux_pt_subscribe(struct stconn *sc, int event_type, struct wait_event *es) +{ + struct connection *conn = __sc_conn(sc); + + TRACE_POINT(PT_EV_RX_DATA|PT_EV_TX_DATA, conn, sc, 0, (size_t[]){event_type}); + return conn->xprt->subscribe(conn, conn->xprt_ctx, event_type, es); +} + +/* Called from the upper layer, to unsubscribe <es> from events <event_type>. + * The <es> pointer is not allowed to differ from the one passed to the + * subscribe() call. It always returns zero. 
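+ * Since the pass-through mux performs no buffering of its own, both
+ * subscribe() and unsubscribe() are delegated directly to the transport
+ * layer.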
+ */ +static int mux_pt_unsubscribe(struct stconn *sc, int event_type, struct wait_event *es) +{ + struct connection *conn = __sc_conn(sc); + + TRACE_POINT(PT_EV_RX_DATA|PT_EV_TX_DATA, conn, sc, 0, (size_t[]){event_type}); + return conn->xprt->unsubscribe(conn, conn->xprt_ctx, event_type, es); +} + +static int mux_pt_ctl(struct connection *conn, enum mux_ctl_type mux_ctl, void *output) +{ + int ret = 0; + switch (mux_ctl) { + case MUX_CTL_STATUS: + if (!(conn->flags & CO_FL_WAIT_XPRT)) + ret |= MUX_STATUS_READY; + return ret; + case MUX_CTL_EXIT_STATUS: + return MUX_ES_UNKNOWN; + default: + return -1; + } +} + +static int mux_pt_sctl(struct stconn *sc, enum mux_sctl_type mux_sctl, void *output) +{ + int ret = 0; + + switch (mux_sctl) { + case MUX_SCTL_SID: + if (output) + *((int64_t *)output) = 0; + return ret; + + default: + return -1; + } +} + +/* config parser for global "tune.pt.zero-copy-forwarding" */ +static int cfg_parse_pt_zero_copy_fwd(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (strcmp(args[1], "on") == 0) + global.tune.no_zero_copy_fwd &= ~NO_ZERO_COPY_FWD_PT; + else if (strcmp(args[1], "off") == 0) + global.tune.no_zero_copy_fwd |= NO_ZERO_COPY_FWD_PT; + else { + memprintf(err, "'%s' expects 'on' or 'off'.", args[0]); + return -1; + } + return 0; +} + + +/* config keyword parsers */ +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_GLOBAL, "tune.pt.zero-copy-forwarding", cfg_parse_pt_zero_copy_fwd }, + { 0, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); + + +/* The mux operations */ +const struct mux_ops mux_tcp_ops = { + .init = mux_pt_init, + .wake = mux_pt_wake, + .rcv_buf = mux_pt_rcv_buf, + .snd_buf = mux_pt_snd_buf, + .nego_fastfwd = mux_pt_nego_ff, + .done_fastfwd = mux_pt_done_ff, + .fastfwd = mux_pt_fastfwd, + .resume_fastfwd = mux_pt_resume_fastfwd, + .subscribe = mux_pt_subscribe, + .unsubscribe = mux_pt_unsubscribe, + .attach = mux_pt_attach, + .get_first_sc = mux_pt_get_first_sc, + .detach = mux_pt_detach, + .avail_streams = mux_pt_avail_streams, + .used_streams = mux_pt_used_streams, + .destroy = mux_pt_destroy_meth, + .ctl = mux_pt_ctl, + .sctl = mux_pt_sctl, + .shutr = mux_pt_shutr, + .shutw = mux_pt_shutw, + .flags = MX_FL_NONE, + .name = "PASS", +}; + + +const struct mux_ops mux_pt_ops = { + .init = mux_pt_init, + .wake = mux_pt_wake, + .rcv_buf = mux_pt_rcv_buf, + .snd_buf = mux_pt_snd_buf, + .nego_fastfwd = mux_pt_nego_ff, + .done_fastfwd = mux_pt_done_ff, + .fastfwd = mux_pt_fastfwd, + .resume_fastfwd = mux_pt_resume_fastfwd, + .subscribe = mux_pt_subscribe, + .unsubscribe = mux_pt_unsubscribe, + .attach = mux_pt_attach, + .get_first_sc = mux_pt_get_first_sc, + .detach = mux_pt_detach, + .avail_streams = mux_pt_avail_streams, + .used_streams = mux_pt_used_streams, + .destroy = mux_pt_destroy_meth, + .ctl = mux_pt_ctl, + .sctl = mux_pt_sctl, + .shutr = mux_pt_shutr, + .shutw = mux_pt_shutw, + .flags = MX_FL_NONE|MX_FL_NO_UPG, + .name = "PASS", +}; + +/* PROT selection : default mux has empty name */ +static struct mux_proto_list mux_proto_none = + { .token = IST("none"), .mode = PROTO_MODE_TCP, .side = PROTO_SIDE_BOTH, .mux = &mux_pt_ops }; +static struct mux_proto_list mux_proto_tcp = + { .token = IST(""), .mode = PROTO_MODE_TCP, .side = PROTO_SIDE_BOTH, .mux = &mux_tcp_ops }; + +INITCALL1(STG_REGISTER, register_mux_proto, &mux_proto_none); +INITCALL1(STG_REGISTER, 
register_mux_proto, &mux_proto_tcp); diff --git a/src/mux_quic.c b/src/mux_quic.c new file mode 100644 index 0000000..de87368 --- /dev/null +++ b/src/mux_quic.c @@ -0,0 +1,3067 @@ +#include <haproxy/mux_quic.h> + +#include <import/eb64tree.h> + +#include <haproxy/api.h> +#include <haproxy/connection.h> +#include <haproxy/dynbuf.h> +#include <haproxy/h3.h> +#include <haproxy/list.h> +#include <haproxy/ncbuf.h> +#include <haproxy/pool.h> +#include <haproxy/proxy.h> +#include <haproxy/qmux_http.h> +#include <haproxy/qmux_trace.h> +#include <haproxy/quic_conn.h> +#include <haproxy/quic_frame.h> +#include <haproxy/quic_sock.h> +#include <haproxy/quic_stream.h> +#include <haproxy/quic_tp-t.h> +#include <haproxy/ssl_sock-t.h> +#include <haproxy/stconn.h> +#include <haproxy/time.h> +#include <haproxy/trace.h> +#include <haproxy/xref.h> + +DECLARE_POOL(pool_head_qcc, "qcc", sizeof(struct qcc)); +DECLARE_POOL(pool_head_qcs, "qcs", sizeof(struct qcs)); + +static void qcs_free_ncbuf(struct qcs *qcs, struct ncbuf *ncbuf) +{ + struct buffer buf; + + if (ncb_is_null(ncbuf)) + return; + + buf = b_make(ncbuf->area, ncbuf->size, 0, 0); + b_free(&buf); + offer_buffers(NULL, 1); + + *ncbuf = NCBUF_NULL; + + /* Reset DEM_FULL as buffer is released. This ensures mux is not woken + * up from rcv_buf stream callback when demux was previously blocked. + */ + qcs->flags &= ~QC_SF_DEM_FULL; +} + +/* Free <qcs> instance. This function is reserved for internal usage: it must + * only be called on qcs alloc error or on connection shutdown. Else + * qcs_destroy must be preferred to handle QUIC flow-control increase. + */ +static void qcs_free(struct qcs *qcs) +{ + struct qcc *qcc = qcs->qcc; + + TRACE_ENTER(QMUX_EV_QCS_END, qcc->conn, qcs); + + /* Safe to use even if already removed from the list. */ + LIST_DEL_INIT(&qcs->el_opening); + LIST_DEL_INIT(&qcs->el_send); + + /* Release stream endpoint descriptor. */ + BUG_ON(qcs->sd && !se_fl_test(qcs->sd, SE_FL_ORPHAN)); + sedesc_free(qcs->sd); + + /* Release app-layer context. */ + if (qcs->ctx && qcc->app_ops->detach) + qcc->app_ops->detach(qcs); + + /* Release qc_stream_desc buffer from quic-conn layer. */ + qc_stream_desc_release(qcs->stream, qcs->tx.sent_offset); + + /* Free Rx/Tx buffers. */ + qcs_free_ncbuf(qcs, &qcs->rx.ncbuf); + b_free(&qcs->tx.buf); + + /* Remove qcs from qcc tree. */ + eb64_delete(&qcs->by_id); + + pool_free(pool_head_qcs, qcs); + + TRACE_LEAVE(QMUX_EV_QCS_END, qcc->conn); +} + +/* Allocate a new QUIC stream with id <id> and type <type>. */ +static struct qcs *qcs_new(struct qcc *qcc, uint64_t id, enum qcs_type type) +{ + struct qcs *qcs; + + TRACE_ENTER(QMUX_EV_QCS_NEW, qcc->conn); + + qcs = pool_alloc(pool_head_qcs); + if (!qcs) { + TRACE_ERROR("alloc failure", QMUX_EV_QCS_NEW, qcc->conn); + return NULL; + } + + qcs->stream = NULL; + qcs->qcc = qcc; + qcs->sd = NULL; + qcs->flags = QC_SF_NONE; + qcs->st = QC_SS_IDLE; + qcs->ctx = NULL; + + /* App callback attach may register the stream for http-request wait. + * These fields must be initialized beforehand. + */ + LIST_INIT(&qcs->el_opening); + LIST_INIT(&qcs->el_send); + qcs->start = TICK_ETERNITY; + + /* store transport layer stream descriptor in qcc tree */ + qcs->id = qcs->by_id.key = id; + eb64_insert(&qcc->streams_by_id, &qcs->by_id); + + /* If stream is local, use peer remote-limit, or else the opposite. */ + if (quic_stream_is_bidi(id)) { + qcs->tx.msd = quic_stream_is_local(qcc, id) ? 
qcc->rfctl.msd_bidi_r : + qcc->rfctl.msd_bidi_l; + } + else if (quic_stream_is_local(qcc, id)) { + qcs->tx.msd = qcc->rfctl.msd_uni_l; + } + + /* Properly set flow-control blocking if initial MSD is zero. */ + if (!qcs->tx.msd) + qcs->flags |= QC_SF_BLK_SFCTL; + + qcs->rx.ncbuf = NCBUF_NULL; + qcs->rx.app_buf = BUF_NULL; + qcs->rx.offset = qcs->rx.offset_max = 0; + + if (quic_stream_is_bidi(id)) { + qcs->rx.msd = quic_stream_is_local(qcc, id) ? qcc->lfctl.msd_bidi_l : + qcc->lfctl.msd_bidi_r; + } + else if (quic_stream_is_remote(qcc, id)) { + qcs->rx.msd = qcc->lfctl.msd_uni_r; + } + qcs->rx.msd_init = qcs->rx.msd; + + qcs->tx.buf = BUF_NULL; + qcs->tx.offset = 0; + qcs->tx.sent_offset = 0; + + qcs->wait_event.tasklet = NULL; + qcs->wait_event.events = 0; + qcs->subs = NULL; + + qcs->err = 0; + + /* Allocate transport layer stream descriptor. Only needed for TX. */ + if (!quic_stream_is_uni(id) || !quic_stream_is_remote(qcc, id)) { + struct quic_conn *qc = qcc->conn->handle.qc; + qcs->stream = qc_stream_desc_new(id, type, qcs, qc); + if (!qcs->stream) { + TRACE_ERROR("qc_stream_desc alloc failure", QMUX_EV_QCS_NEW, qcc->conn, qcs); + goto err; + } + } + + if (qcc->app_ops->attach && qcc->app_ops->attach(qcs, qcc->ctx)) { + TRACE_ERROR("app proto failure", QMUX_EV_QCS_NEW, qcc->conn, qcs); + goto err; + } + + out: + TRACE_LEAVE(QMUX_EV_QCS_NEW, qcc->conn, qcs); + return qcs; + + err: + qcs_free(qcs); + TRACE_LEAVE(QMUX_EV_QCS_NEW, qcc->conn); + return NULL; +} + +static forceinline struct stconn *qcs_sc(const struct qcs *qcs) +{ + return qcs->sd ? qcs->sd->sc : NULL; +} + +/* Reset the <qcc> inactivity timeout for http-keep-alive timeout. */ +static forceinline void qcc_reset_idle_start(struct qcc *qcc) +{ + qcc->idle_start = now_ms; +} + +/* Decrement <qcc> sc. */ +static forceinline void qcc_rm_sc(struct qcc *qcc) +{ + BUG_ON(!qcc->nb_sc); /* Ensure sc count is always valid (ie >=0). */ + --qcc->nb_sc; + + /* Reset qcc idle start for http-keep-alive timeout. Timeout will be + * refreshed after this on stream detach. + */ + if (!qcc->nb_sc && !qcc->nb_hreq) + qcc_reset_idle_start(qcc); +} + +/* Decrement <qcc> hreq. */ +static forceinline void qcc_rm_hreq(struct qcc *qcc) +{ + BUG_ON(!qcc->nb_hreq); /* Ensure http req count is always valid (ie >=0). */ + --qcc->nb_hreq; + + /* Reset qcc idle start for http-keep-alive timeout. Timeout will be + * refreshed after this on I/O handler. + */ + if (!qcc->nb_sc && !qcc->nb_hreq) + qcc_reset_idle_start(qcc); +} + +static inline int qcc_is_dead(const struct qcc *qcc) +{ + /* Maintain connection if stream endpoints are still active. */ + if (qcc->nb_sc) + return 0; + + /* Connection considered dead if either: + * - remote error detected at transport level + * - error detected locally + * - MUX timeout expired + */ + if (qcc->flags & (QC_CF_ERR_CONN|QC_CF_ERRL_DONE) || + !qcc->task) { + return 1; + } + + return 0; +} + +/* Return true if the mux timeout should be armed. */ +static inline int qcc_may_expire(struct qcc *qcc) +{ + return !qcc->nb_sc; +} + +/* Refresh the timeout on <qcc> if needed depending on its state. */ +static void qcc_refresh_timeout(struct qcc *qcc) +{ + const struct proxy *px = qcc->proxy; + + TRACE_ENTER(QMUX_EV_QCC_WAKE, qcc->conn); + + if (!qcc->task) { + TRACE_DEVEL("already expired", QMUX_EV_QCC_WAKE, qcc->conn); + goto leave; + } + + /* Check if upper layer is responsible for timeout management. 
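+ * As long as at least one stream connector remains attached (see
+ * qcc_may_expire() above), expiration is handled by the stream layer,
+ * so the mux task is parked with an eternity timeout.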
+	 */
+	if (!qcc_may_expire(qcc)) {
+		TRACE_DEVEL("not eligible for timeout", QMUX_EV_QCC_WAKE, qcc->conn);
+		qcc->task->expire = TICK_ETERNITY;
+		task_queue(qcc->task);
+		goto leave;
+	}
+
+	/* Frontend timeout management
+	 * - shutdown done -> timeout client-fin
+	 * - detached streams with data left to send -> default timeout
+	 * - stream waiting on incomplete request or no stream yet activated -> timeout http-request
+	 * - idle after stream processing -> timeout http-keep-alive
+	 *
+	 * If a proxy soft-stop is in progress, an immediate or spread close
+	 * will be performed if shutdown is already done or the connection is
+	 * idle.
+	 */
+	if (!conn_is_back(qcc->conn)) {
+		if (qcc->nb_hreq && !(qcc->flags & QC_CF_APP_SHUT)) {
+			TRACE_DEVEL("one or more requests still in progress", QMUX_EV_QCC_WAKE, qcc->conn);
+			qcc->task->expire = tick_add_ifset(now_ms, qcc->timeout);
+			task_queue(qcc->task);
+			goto leave;
+		}
+
+		if ((!LIST_ISEMPTY(&qcc->opening_list) || unlikely(!qcc->largest_bidi_r)) &&
+		    !(qcc->flags & QC_CF_APP_SHUT)) {
+			int timeout = px->timeout.httpreq;
+			struct qcs *qcs = NULL;
+			int base_time;
+
+			/* Use the start time of the first stream waiting on HTTP,
+			 * or the qcc idle start if no stream was used yet.
+			 */
+			if (likely(!LIST_ISEMPTY(&qcc->opening_list)))
+				qcs = LIST_ELEM(qcc->opening_list.n, struct qcs *, el_opening);
+			base_time = qcs ? qcs->start : qcc->idle_start;
+
+			TRACE_DEVEL("waiting on http request", QMUX_EV_QCC_WAKE, qcc->conn, qcs);
+			qcc->task->expire = tick_add_ifset(base_time, timeout);
+		}
+		else {
+			if (qcc->flags & QC_CF_APP_SHUT) {
+				TRACE_DEVEL("connection in closing", QMUX_EV_QCC_WAKE, qcc->conn);
+				qcc->task->expire = tick_add_ifset(now_ms,
+				                                   qcc->shut_timeout);
+			}
+			else {
+				/* Use http-request timeout if keep-alive timeout not set */
+				int timeout = tick_isset(px->timeout.httpka) ?
+				              px->timeout.httpka : px->timeout.httpreq;
+				TRACE_DEVEL("at least one request achieved but none currently in progress", QMUX_EV_QCC_WAKE, qcc->conn);
+				qcc->task->expire = tick_add_ifset(qcc->idle_start, timeout);
+			}
+
+			/* If a proxy soft-stop is in progress and the connection
+			 * is inactive, close the connection immediately. If a
+			 * close-spread-time is configured, randomly spread the
+			 * timer over a closing window.
+			 */
+			if ((qcc->proxy->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) &&
+			    !(global.tune.options & GTUNE_DISABLE_ACTIVE_CLOSE)) {
+
+				/* Wake timeout task immediately if window already expired. */
+				int remaining_window = tick_isset(global.close_spread_end) ?
+				                       tick_remain(now_ms, global.close_spread_end) : 0;
+
+				TRACE_DEVEL("proxy disabled, prepare connection soft-stop", QMUX_EV_QCC_WAKE, qcc->conn);
+				if (remaining_window) {
+					/* We don't need to reset the expire if it would
+					 * already happen before the close window end.
+					 */
+					if (!tick_isset(qcc->task->expire) ||
+					    tick_is_le(global.close_spread_end, qcc->task->expire)) {
+						/* Set an expire value shorter than the current value
+						 * because the close spread window end comes earlier.
+						 */
+						qcc->task->expire = tick_add(now_ms,
+						                             statistical_prng_range(remaining_window));
+					}
+				}
+				else {
+					/* We are past the soft close window end, wake the timeout
+					 * task up immediately.
+					 */
+					qcc->task->expire = now_ms;
+					task_wakeup(qcc->task, TASK_WOKEN_TIMER);
+				}
+			}
+		}
+	}
+
+	/* Fall back to the default timeout if no frontend-specific value is
+	 * defined, or for backend connections.
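+	 * Note that tick_add_ifset() yields TICK_ETERNITY when the timeout is
+	 * not configured, leaving the task without an expiry in that case.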
+ */ + if (!tick_isset(qcc->task->expire)) { + TRACE_DEVEL("fallback to default timeout", QMUX_EV_QCC_WAKE, qcc->conn); + qcc->task->expire = tick_add_ifset(now_ms, qcc->timeout); + } + + task_queue(qcc->task); + + leave: + TRACE_LEAVE(QMUX_EV_QCS_NEW, qcc->conn); +} + +/* Mark a stream as open if it was idle. This can be used on every + * successful emission/reception operation to update the stream state. + */ +static void qcs_idle_open(struct qcs *qcs) +{ + /* This operation must not be used if the stream is already closed. */ + BUG_ON_HOT(qcs->st == QC_SS_CLO); + + if (qcs->st == QC_SS_IDLE) { + TRACE_STATE("opening stream", QMUX_EV_QCS_NEW, qcs->qcc->conn, qcs); + qcs->st = QC_SS_OPEN; + } +} + +/* Close the local channel of <qcs> instance. */ +static void qcs_close_local(struct qcs *qcs) +{ + TRACE_STATE("closing stream locally", QMUX_EV_QCS_SEND, qcs->qcc->conn, qcs); + + /* The stream must have already been opened. */ + BUG_ON_HOT(qcs->st == QC_SS_IDLE); + + /* This operation cannot be used multiple times. */ + BUG_ON_HOT(qcs->st == QC_SS_HLOC || qcs->st == QC_SS_CLO); + + if (quic_stream_is_bidi(qcs->id)) { + qcs->st = (qcs->st == QC_SS_HREM) ? QC_SS_CLO : QC_SS_HLOC; + + if (qcs->flags & QC_SF_HREQ_RECV) + qcc_rm_hreq(qcs->qcc); + } + else { + /* Only local uni streams are valid for this operation. */ + BUG_ON_HOT(quic_stream_is_remote(qcs->qcc, qcs->id)); + qcs->st = QC_SS_CLO; + } +} + +/* Close the remote channel of <qcs> instance. */ +static void qcs_close_remote(struct qcs *qcs) +{ + TRACE_STATE("closing stream remotely", QMUX_EV_QCS_RECV, qcs->qcc->conn, qcs); + + /* The stream must have already been opened. */ + BUG_ON_HOT(qcs->st == QC_SS_IDLE); + + /* This operation cannot be used multiple times. */ + BUG_ON_HOT(qcs->st == QC_SS_HREM || qcs->st == QC_SS_CLO); + + if (quic_stream_is_bidi(qcs->id)) { + qcs->st = (qcs->st == QC_SS_HLOC) ? QC_SS_CLO : QC_SS_HREM; + } + else { + /* Only remote uni streams are valid for this operation. */ + BUG_ON_HOT(quic_stream_is_local(qcs->qcc, qcs->id)); + qcs->st = QC_SS_CLO; + } +} + +int qcs_is_close_local(struct qcs *qcs) +{ + return qcs->st == QC_SS_HLOC || qcs->st == QC_SS_CLO; +} + +int qcs_is_close_remote(struct qcs *qcs) +{ + return qcs->st == QC_SS_HREM || qcs->st == QC_SS_CLO; +} + +/* Allocate if needed buffer <bptr> for stream <qcs>. + * + * Returns the buffer instance or NULL on allocation failure. + */ +struct buffer *qcs_get_buf(struct qcs *qcs, struct buffer *bptr) +{ + return b_alloc(bptr); +} + +/* Allocate if needed buffer <ncbuf> for stream <qcs>. + * + * Returns the buffer instance or NULL on allocation failure. + */ +static struct ncbuf *qcs_get_ncbuf(struct qcs *qcs, struct ncbuf *ncbuf) +{ + struct buffer buf = BUF_NULL; + + if (ncb_is_null(ncbuf)) { + if (!b_alloc(&buf)) + return NULL; + + *ncbuf = ncb_make(buf.area, buf.size, 0); + ncb_init(ncbuf, 0); + } + + return ncbuf; +} + +/* Notify an eventual subscriber on <qcs> or else wakeup up the stconn layer if + * initialized. 
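+ * The subscriber path is preferred when present since it carries the exact
+ * recv/send events the stream layer is waiting for; the generic wake
+ * callback is only a fallback.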
+ */ +static void qcs_alert(struct qcs *qcs) +{ + if (qcs->subs) { + qcs_notify_recv(qcs); + qcs_notify_send(qcs); + } + else if (qcs_sc(qcs) && qcs->sd->sc->app_ops->wake) { + TRACE_POINT(QMUX_EV_STRM_WAKE, qcs->qcc->conn, qcs); + qcs->sd->sc->app_ops->wake(qcs->sd->sc); + } +} + +int qcs_subscribe(struct qcs *qcs, int event_type, struct wait_event *es) +{ + struct qcc *qcc = qcs->qcc; + + TRACE_ENTER(QMUX_EV_STRM_SEND|QMUX_EV_STRM_RECV, qcc->conn, qcs); + + BUG_ON(event_type & ~(SUB_RETRY_SEND|SUB_RETRY_RECV)); + BUG_ON(qcs->subs && qcs->subs != es); + + es->events |= event_type; + qcs->subs = es; + + if (event_type & SUB_RETRY_RECV) + TRACE_DEVEL("subscribe(recv)", QMUX_EV_STRM_RECV, qcc->conn, qcs); + + if (event_type & SUB_RETRY_SEND) + TRACE_DEVEL("subscribe(send)", QMUX_EV_STRM_SEND, qcc->conn, qcs); + + TRACE_LEAVE(QMUX_EV_STRM_SEND|QMUX_EV_STRM_RECV, qcc->conn, qcs); + + return 0; +} + +void qcs_notify_recv(struct qcs *qcs) +{ + if (qcs->subs && qcs->subs->events & SUB_RETRY_RECV) { + TRACE_POINT(QMUX_EV_STRM_WAKE, qcs->qcc->conn, qcs); + tasklet_wakeup(qcs->subs->tasklet); + qcs->subs->events &= ~SUB_RETRY_RECV; + if (!qcs->subs->events) + qcs->subs = NULL; + } +} + +void qcs_notify_send(struct qcs *qcs) +{ + if (qcs->subs && qcs->subs->events & SUB_RETRY_SEND) { + TRACE_POINT(QMUX_EV_STRM_WAKE, qcs->qcc->conn, qcs); + tasklet_wakeup(qcs->subs->tasklet); + qcs->subs->events &= ~SUB_RETRY_SEND; + if (!qcs->subs->events) + qcs->subs = NULL; + } +} + +/* A fatal error is detected locally for <qcc> connection. It should be closed + * with a CONNECTION_CLOSE using <err> code. Set <app> to true to indicate that + * the code must be considered as an application level error. This function + * must not be called more than once by connection. + */ +void qcc_set_error(struct qcc *qcc, int err, int app) +{ + /* This must not be called multiple times per connection. */ + BUG_ON(qcc->flags & QC_CF_ERRL); + + TRACE_STATE("connection on error", QMUX_EV_QCC_ERR, qcc->conn); + + qcc->flags |= QC_CF_ERRL; + qcc->err = app ? quic_err_app(err) : quic_err_transport(err); + + /* TODO + * Ensure qcc_io_send() will be conducted to convert QC_CF_ERRL in + * QC_CF_ERRL_DONE with CONNECTION_CLOSE frame emission. This may be + * unnecessary if we are currently in the MUX tasklet context, but it + * is too tedious too not forget a wakeup outside of this function for + * the moment. + */ + tasklet_wakeup(qcc->wait_event.tasklet); +} + +/* Open a locally initiated stream for the connection <qcc>. Set <bidi> for a + * bidirectional stream, else an unidirectional stream is opened. The next + * available ID on the connection will be used according to the stream type. + * + * Returns the allocated stream instance or NULL on error. + */ +struct qcs *qcc_init_stream_local(struct qcc *qcc, int bidi) +{ + struct qcs *qcs; + enum qcs_type type; + uint64_t *next; + + TRACE_ENTER(QMUX_EV_QCS_NEW, qcc->conn); + + if (bidi) { + next = &qcc->next_bidi_l; + type = conn_is_back(qcc->conn) ? QCS_CLT_BIDI : QCS_SRV_BIDI; + } + else { + next = &qcc->next_uni_l; + type = conn_is_back(qcc->conn) ? QCS_CLT_UNI : QCS_SRV_UNI; + } + + /* TODO ensure that we won't overflow remote peer flow control limit on + * streams. Else, we should emit a STREAMS_BLOCKED frame. 
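+	 *
+	 * An illustrative sketch of such a check (the <ms_bidi_r> field is
+	 * hypothetical, no such remote limit is tracked here yet) :
+	 *
+	 *   if (*next / 4 >= qcc->rfctl.ms_bidi_r)
+	 *       ... queue a STREAMS_BLOCKED frame instead of opening ...
+	 *
+	 * Stream IDs encode their type in the two low-order bits, hence the
+	 * division by 4 to recover the stream sequence number.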
+	 */
+
+	qcs = qcs_new(qcc, *next, type);
+	if (!qcs) {
+		TRACE_LEAVE(QMUX_EV_QCS_NEW, qcc->conn);
+		qcc_set_error(qcc, QC_ERR_INTERNAL_ERROR, 0);
+		return NULL;
+	}
+
+	TRACE_PROTO("opening local stream", QMUX_EV_QCS_NEW, qcc->conn, qcs);
+	*next += 4;
+
+	TRACE_LEAVE(QMUX_EV_QCS_NEW, qcc->conn, qcs);
+	return qcs;
+}
+
+/* Open a remote initiated stream for the connection <qcc> with ID <id>. The
+ * caller is responsible for ensuring that a stream with the same ID was not
+ * already opened. This function will also create all intermediary streams
+ * with a smaller ID which were not already opened.
+ *
+ * Returns the allocated stream instance or NULL on error.
+ */
+static struct qcs *qcc_init_stream_remote(struct qcc *qcc, uint64_t id)
+{
+	struct qcs *qcs = NULL;
+	enum qcs_type type;
+	uint64_t *largest, max_id;
+
+	TRACE_ENTER(QMUX_EV_QCS_NEW, qcc->conn);
+
+	/* Function reserved to remote stream IDs. */
+	BUG_ON(quic_stream_is_local(qcc, id));
+
+	if (quic_stream_is_bidi(id)) {
+		largest = &qcc->largest_bidi_r;
+		type = conn_is_back(qcc->conn) ? QCS_SRV_BIDI : QCS_CLT_BIDI;
+	}
+	else {
+		largest = &qcc->largest_uni_r;
+		type = conn_is_back(qcc->conn) ? QCS_SRV_UNI : QCS_CLT_UNI;
+	}
+
+	/* RFC 9000 4.6. Controlling Concurrency
+	 *
+	 * An endpoint that receives a frame with a stream ID exceeding the
+	 * limit it has sent MUST treat this as a connection error of type
+	 * STREAM_LIMIT_ERROR
+	 */
+	max_id = quic_stream_is_bidi(id) ? qcc->lfctl.ms_bidi * 4 :
+	                                   qcc->lfctl.ms_uni * 4;
+	if (id >= max_id) {
+		TRACE_ERROR("flow control error", QMUX_EV_QCS_NEW|QMUX_EV_PROTO_ERR, qcc->conn);
+		qcc_set_error(qcc, QC_ERR_STREAM_LIMIT_ERROR, 0);
+		goto err;
+	}
+
+	/* Only a stream ID not already opened can be used. */
+	BUG_ON(id < *largest);
+
+	while (id >= *largest) {
+		const char *str = *largest < id ? "initializing intermediary remote stream" :
+		                                  "initializing remote stream";
+
+		qcs = qcs_new(qcc, *largest, type);
+		if (!qcs) {
+			TRACE_ERROR("stream allocation failure", QMUX_EV_QCS_NEW, qcc->conn);
+			qcc_set_error(qcc, QC_ERR_INTERNAL_ERROR, 0);
+			goto err;
+		}
+
+		TRACE_PROTO(str, QMUX_EV_QCS_NEW, qcc->conn, qcs);
+		*largest += 4;
+	}
+
+ out:
+	TRACE_LEAVE(QMUX_EV_QCS_NEW, qcc->conn, qcs);
+	return qcs;
+
+ err:
+	TRACE_LEAVE(QMUX_EV_QCS_NEW, qcc->conn);
+	return NULL;
+}
+
+struct stconn *qcs_attach_sc(struct qcs *qcs, struct buffer *buf, char fin)
+{
+	struct qcc *qcc = qcs->qcc;
+	struct session *sess = qcc->conn->owner;
+
+	qcs->sd = sedesc_new();
+	if (!qcs->sd)
+		return NULL;
+
+	qcs->sd->se = qcs;
+	qcs->sd->conn = qcc->conn;
+	se_fl_set(qcs->sd, SE_FL_T_MUX | SE_FL_ORPHAN | SE_FL_NOT_FIRST);
+	se_expect_no_data(qcs->sd);
+
+	if (!(global.tune.no_zero_copy_fwd & NO_ZERO_COPY_FWD_QUIC_SND))
+		se_fl_set(qcs->sd, SE_FL_MAY_FASTFWD_CONS);
+
+	/* TODO duplicated from mux_h2 */
+	sess->t_idle = ns_to_ms(now_ns - sess->accept_ts) - sess->t_handshake;
+
+	if (!sc_new_from_endp(qcs->sd, sess, buf))
+		return NULL;
+
+	/* QC_SF_HREQ_RECV must be set once for a stream. Else, the nb_hreq
+	 * counter will be incorrect for the connection.
+	 */
+	BUG_ON_HOT(qcs->flags & QC_SF_HREQ_RECV);
+	qcs->flags |= QC_SF_HREQ_RECV;
+	++qcc->nb_sc;
+	++qcc->nb_hreq;
+
+	/* TODO duplicated from mux_h2 */
+	sess->accept_date = date;
+	sess->accept_ts = now_ns;
+	sess->t_handshake = 0;
+	sess->t_idle = 0;
+
+	/* A stream must have been registered for HTTP wait before attaching
+	 * it to sedesc. See <qcs_wait_http_req> for more info.
+	 */
+	BUG_ON_HOT(!LIST_INLIST(&qcs->el_opening));
+	LIST_DEL_INIT(&qcs->el_opening);
+
+	if (fin) {
+		TRACE_STATE("report end-of-input", QMUX_EV_STRM_RECV, qcc->conn, qcs);
+		se_fl_set(qcs->sd, SE_FL_EOI);
+	}
+
+	/* A QCS can already be locally closed before stream layer
+	 * instantiation. This notably happens if STOP_SENDING was the first
+	 * frame received for this instance. In this case, an error is
+	 * immediately reported to the stream layer to prevent transmission.
+	 *
+	 * TODO it could be better to not instantiate the stream layer at all.
+	 * However, extra care is required to ensure the QCS instance is
+	 * released.
+	 */
+	if (unlikely(qcs_is_close_local(qcs) || (qcs->flags & QC_SF_TO_RESET))) {
+		TRACE_STATE("report early error", QMUX_EV_STRM_RECV, qcc->conn, qcs);
+		se_fl_set_error(qcs->sd);
+	}
+
+	return qcs->sd->sc;
+}
+
+/* Use this function for a stream <id> which is not in the <qcc> stream tree.
+ * It returns true if the associated stream is closed.
+ */
+static int qcc_stream_id_is_closed(struct qcc *qcc, uint64_t id)
+{
+	uint64_t *largest;
+
+	/* This function must only be used for streams not present in the stream tree. */
+	BUG_ON_HOT(eb64_lookup(&qcc->streams_by_id, id));
+
+	if (quic_stream_is_local(qcc, id)) {
+		largest = quic_stream_is_uni(id) ? &qcc->next_uni_l :
+		                                   &qcc->next_bidi_l;
+	}
+	else {
+		largest = quic_stream_is_uni(id) ? &qcc->largest_uni_r :
+		                                   &qcc->largest_bidi_r;
+	}
+
+	return id < *largest;
+}
+
+/* Retrieve the stream instance from <id> ID. This can be used when receiving
+ * STREAM, STREAM_DATA_BLOCKED, RESET_STREAM, MAX_STREAM_DATA or STOP_SENDING
+ * frames. Set <receive_only> or <send_only> to false if these particular
+ * types of streams are not allowed. If the stream instance is found, it is
+ * stored in <out>.
+ *
+ * Returns 0 on success else non-zero. On error, a RESET_STREAM or a
+ * CONNECTION_CLOSE is automatically emitted. Beware that <out> may be NULL
+ * on success if the stream has already been closed.
+ */
+int qcc_get_qcs(struct qcc *qcc, uint64_t id, int receive_only, int send_only,
+                struct qcs **out)
+{
+	struct eb64_node *node;
+
+	TRACE_ENTER(QMUX_EV_QCC_RECV, qcc->conn);
+	*out = NULL;
+
+	if (!receive_only && quic_stream_is_uni(id) && quic_stream_is_remote(qcc, id)) {
+		TRACE_ERROR("receive-only stream not allowed", QMUX_EV_QCC_RECV|QMUX_EV_QCC_NQCS|QMUX_EV_PROTO_ERR, qcc->conn, NULL, &id);
+		qcc_set_error(qcc, QC_ERR_STREAM_STATE_ERROR, 0);
+		goto err;
+	}
+
+	if (!send_only && quic_stream_is_uni(id) && quic_stream_is_local(qcc, id)) {
+		TRACE_ERROR("send-only stream not allowed", QMUX_EV_QCC_RECV|QMUX_EV_QCC_NQCS|QMUX_EV_PROTO_ERR, qcc->conn, NULL, &id);
+		qcc_set_error(qcc, QC_ERR_STREAM_STATE_ERROR, 0);
+		goto err;
+	}
+
+	/* Search the stream in the connection tree. */
+	node = eb64_lookup(&qcc->streams_by_id, id);
+	if (node) {
+		*out = eb64_entry(node, struct qcs, by_id);
+		TRACE_DEVEL("using stream from connection tree", QMUX_EV_QCC_RECV, qcc->conn, *out);
+		goto out;
+	}
+
+	/* Check if stream is already closed. */
+	if (qcc_stream_id_is_closed(qcc, id)) {
+		TRACE_DATA("already closed stream", QMUX_EV_QCC_RECV|QMUX_EV_QCC_NQCS, qcc->conn, NULL, &id);
+		/* Consider this as a success even if <out> is left NULL. */
+		goto out;
+	}
+
+	/* Create the stream. This is valid only for remote initiated ones. A
+	 * local stream must have already been explicitly created by the
+	 * application protocol layer.
+	 */
+	if (quic_stream_is_local(qcc, id)) {
+		/* RFC 9000 19.8. STREAM Frames
+		 *
+		 * An endpoint MUST terminate the connection with error
+		 * STREAM_STATE_ERROR if it receives a STREAM frame for a locally
+		 * initiated stream that has not yet been created, or for a send-only
+		 * stream.
+		 */
+		TRACE_ERROR("locally initiated stream not yet created", QMUX_EV_QCC_RECV|QMUX_EV_QCC_NQCS|QMUX_EV_PROTO_ERR, qcc->conn, NULL, &id);
+		qcc_set_error(qcc, QC_ERR_STREAM_STATE_ERROR, 0);
+		goto err;
+	}
+	else {
+		/* Remote stream not found - try to open it. */
+		*out = qcc_init_stream_remote(qcc, id);
+		if (!*out) {
+			TRACE_ERROR("stream creation error", QMUX_EV_QCC_RECV|QMUX_EV_QCC_NQCS, qcc->conn, NULL, &id);
+			goto err;
+		}
+	}
+
+ out:
+	TRACE_LEAVE(QMUX_EV_QCC_RECV, qcc->conn, *out);
+	return 0;
+
+ err:
+	TRACE_LEAVE(QMUX_EV_QCC_RECV, qcc->conn);
+	return 1;
+}
+
+/* Simple function to duplicate a buffer */
+static inline struct buffer qcs_b_dup(const struct ncbuf *b)
+{
+	return b_make(ncb_orig(b), b->size, b->head, ncb_data(b, 0));
+}
+
+/* Remove <bytes> from <qcs> Rx buffer. Flow-control for received offsets may
+ * be allocated for the peer if needed.
+ */
+static void qcs_consume(struct qcs *qcs, uint64_t bytes)
+{
+	struct qcc *qcc = qcs->qcc;
+	struct quic_frame *frm;
+	struct ncbuf *buf = &qcs->rx.ncbuf;
+	enum ncb_ret ret;
+
+	TRACE_ENTER(QMUX_EV_QCS_RECV, qcc->conn, qcs);
+
+	ret = ncb_advance(buf, bytes);
+	if (ret) {
+		ABORT_NOW(); /* should not happen as we only advance within buffered data */
+	}
+
+	if (ncb_is_empty(buf))
+		qcs_free_ncbuf(qcs, buf);
+
+	qcs->rx.offset += bytes;
+	/* Not necessary to emit a MAX_STREAM_DATA if all data received. */
+	if (qcs->flags & QC_SF_SIZE_KNOWN)
+		goto conn_fctl;
+
+	if (qcs->rx.msd - qcs->rx.offset < qcs->rx.msd_init / 2) {
+		TRACE_DATA("increase stream credit via MAX_STREAM_DATA", QMUX_EV_QCS_RECV, qcc->conn, qcs);
+		frm = qc_frm_alloc(QUIC_FT_MAX_STREAM_DATA);
+		if (!frm) {
+			qcc_set_error(qcc, QC_ERR_INTERNAL_ERROR, 0);
+			return;
+		}
+
+		qcs->rx.msd = qcs->rx.offset + qcs->rx.msd_init;
+
+		frm->max_stream_data.id = qcs->id;
+		frm->max_stream_data.max_stream_data = qcs->rx.msd;
+
+		LIST_APPEND(&qcc->lfctl.frms, &frm->list);
+		tasklet_wakeup(qcc->wait_event.tasklet);
+	}
+
+ conn_fctl:
+	qcc->lfctl.offsets_consume += bytes;
+	if (qcc->lfctl.md - qcc->lfctl.offsets_consume < qcc->lfctl.md_init / 2) {
+		TRACE_DATA("increase conn credit via MAX_DATA", QMUX_EV_QCS_RECV, qcc->conn, qcs);
+		frm = qc_frm_alloc(QUIC_FT_MAX_DATA);
+		if (!frm) {
+			qcc_set_error(qcc, QC_ERR_INTERNAL_ERROR, 0);
+			return;
+		}
+
+		qcc->lfctl.md = qcc->lfctl.offsets_consume + qcc->lfctl.md_init;
+
+		frm->max_data.max_data = qcc->lfctl.md;
+
+		LIST_APPEND(&qcs->qcc->lfctl.frms, &frm->list);
+		tasklet_wakeup(qcs->qcc->wait_event.tasklet);
+	}
+
+	TRACE_LEAVE(QMUX_EV_QCS_RECV, qcc->conn, qcs);
+}
+
+/* Decode the content of STREAM frames already received on the stream instance
+ * <qcs>.
+ *
+ * Returns 0 on success else non-zero.
+ */
+static int qcc_decode_qcs(struct qcc *qcc, struct qcs *qcs)
+{
+	struct buffer b;
+	ssize_t ret;
+	int fin = 0;
+
+	TRACE_ENTER(QMUX_EV_QCS_RECV, qcc->conn, qcs);
+
+	b = qcs_b_dup(&qcs->rx.ncbuf);
+
+	/* Signal FIN to application if STREAM FIN received with all data.
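+	 * The remote side is only considered closed once the final size is
+	 * known and all data up to it was received, or after a RESET_STREAM,
+	 * so <fin> here marks the end of the message.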
*/ + if (qcs_is_close_remote(qcs)) + fin = 1; + + if (!(qcs->flags & QC_SF_READ_ABORTED)) { + ret = qcc->app_ops->decode_qcs(qcs, &b, fin); + if (ret < 0) { + TRACE_ERROR("decoding error", QMUX_EV_QCS_RECV, qcc->conn, qcs); + goto err; + } + + if (qcs->flags & QC_SF_TO_RESET) { + if (qcs_sc(qcs) && !se_fl_test(qcs->sd, SE_FL_ERROR|SE_FL_ERR_PENDING)) { + se_fl_set_error(qcs->sd); + qcs_alert(qcs); + } + } + } + else { + TRACE_DATA("ignore read on stream", QMUX_EV_QCS_RECV, qcc->conn, qcs); + ret = b_data(&b); + } + + if (ret) + qcs_consume(qcs, ret); + if (ret || (!b_data(&b) && fin)) + qcs_notify_recv(qcs); + + TRACE_LEAVE(QMUX_EV_QCS_RECV, qcc->conn, qcs); + return 0; + + err: + TRACE_LEAVE(QMUX_EV_QCS_RECV, qcc->conn, qcs); + return 1; +} + +/* Prepare for the emission of RESET_STREAM on <qcs> with error code <err>. */ +void qcc_reset_stream(struct qcs *qcs, int err) +{ + struct qcc *qcc = qcs->qcc; + + if ((qcs->flags & QC_SF_TO_RESET) || qcs_is_close_local(qcs)) + return; + + TRACE_STATE("reset stream", QMUX_EV_QCS_END, qcc->conn, qcs); + qcs->flags |= QC_SF_TO_RESET; + qcs->err = err; + + /* Remove prepared stream data from connection flow-control calcul. */ + if (qcs->tx.offset > qcs->tx.sent_offset) { + const uint64_t diff = qcs->tx.offset - qcs->tx.sent_offset; + BUG_ON(qcc->tx.offsets - diff < qcc->tx.sent_offsets); + qcc->tx.offsets -= diff; + /* Reset qcs offset to prevent BUG_ON() on qcs_destroy(). */ + qcs->tx.offset = qcs->tx.sent_offset; + } + + /* Report send error to stream-endpoint layer. */ + if (qcs_sc(qcs)) { + se_fl_set_error(qcs->sd); + qcs_alert(qcs); + } + + qcc_send_stream(qcs, 1); + tasklet_wakeup(qcc->wait_event.tasklet); +} + +/* Register <qcs> stream for emission of STREAM, STOP_SENDING or RESET_STREAM. + * Set <urg> to 1 if stream content should be treated in priority compared to + * other streams. + */ +void qcc_send_stream(struct qcs *qcs, int urg) +{ + struct qcc *qcc = qcs->qcc; + + TRACE_ENTER(QMUX_EV_QCS_SEND, qcc->conn, qcs); + + /* Cannot send if already closed. */ + BUG_ON(qcs_is_close_local(qcs)); + + if (urg) { + LIST_DEL_INIT(&qcs->el_send); + LIST_INSERT(&qcc->send_list, &qcs->el_send); + } + else { + if (!LIST_INLIST(&qcs->el_send)) + LIST_APPEND(&qcs->qcc->send_list, &qcs->el_send); + } + + TRACE_LEAVE(QMUX_EV_QCS_SEND, qcc->conn, qcs); +} + +/* Prepare for the emission of STOP_SENDING on <qcs>. */ +void qcc_abort_stream_read(struct qcs *qcs) +{ + struct qcc *qcc = qcs->qcc; + + TRACE_ENTER(QMUX_EV_QCC_NEW, qcc->conn, qcs); + + if ((qcs->flags & QC_SF_TO_STOP_SENDING) || qcs_is_close_remote(qcs)) + goto end; + + TRACE_STATE("abort stream read", QMUX_EV_QCS_END, qcc->conn, qcs); + qcs->flags |= (QC_SF_TO_STOP_SENDING|QC_SF_READ_ABORTED); + + qcc_send_stream(qcs, 1); + tasklet_wakeup(qcc->wait_event.tasklet); + + end: + TRACE_LEAVE(QMUX_EV_QCC_NEW, qcc->conn, qcs); +} + +/* Install the <app_ops> applicative layer of a QUIC connection on mux <qcc>. + * Returns 0 on success else non-zero. + */ +int qcc_install_app_ops(struct qcc *qcc, const struct qcc_app_ops *app_ops) +{ + TRACE_ENTER(QMUX_EV_QCC_NEW, qcc->conn); + + if (app_ops->init && !app_ops->init(qcc)) { + TRACE_ERROR("app ops init error", QMUX_EV_QCC_NEW, qcc->conn); + goto err; + } + + TRACE_PROTO("application layer initialized", QMUX_EV_QCC_NEW, qcc->conn); + qcc->app_ops = app_ops; + + /* RFC 9114 7.2.4.2. 
Initialization + * + * Endpoints MUST NOT require any data to be + * received from the peer prior to sending the SETTINGS frame; + * settings MUST be sent as soon as the transport is ready to + * send data. + */ + if (qcc->app_ops->finalize) { + if (qcc->app_ops->finalize(qcc->ctx)) { + TRACE_ERROR("app ops finalize error", QMUX_EV_QCC_NEW, qcc->conn); + goto err; + } + tasklet_wakeup(qcc->wait_event.tasklet); + } + + TRACE_LEAVE(QMUX_EV_QCC_NEW, qcc->conn); + return 0; + + err: + TRACE_LEAVE(QMUX_EV_QCC_NEW, qcc->conn); + return 1; +} + +/* Handle a new STREAM frame for stream with id <id>. Payload is pointed by + * <data> with length <len> and represents the offset <offset>. <fin> is set if + * the QUIC frame FIN bit is set. + * + * Returns 0 on success else non-zero. On error, the received frame should not + * be acknowledged. + */ +int qcc_recv(struct qcc *qcc, uint64_t id, uint64_t len, uint64_t offset, + char fin, char *data) +{ + struct qcs *qcs; + enum ncb_ret ret; + + TRACE_ENTER(QMUX_EV_QCC_RECV, qcc->conn); + + if (qcc->flags & QC_CF_ERRL) { + TRACE_DATA("connection on error", QMUX_EV_QCC_RECV, qcc->conn); + goto err; + } + + /* RFC 9000 19.8. STREAM Frames + * + * An endpoint MUST terminate the connection with error + * STREAM_STATE_ERROR if it receives a STREAM frame for a locally + * initiated stream that has not yet been created, or for a send-only + * stream. + */ + if (qcc_get_qcs(qcc, id, 1, 0, &qcs)) { + TRACE_DATA("qcs retrieval error", QMUX_EV_QCC_RECV, qcc->conn); + goto err; + } + + if (!qcs) { + TRACE_DATA("already closed stream", QMUX_EV_QCC_RECV, qcc->conn); + goto out; + } + + /* RFC 9000 4.5. Stream Final Size + * + * Once a final size for a stream is known, it cannot change. If a + * RESET_STREAM or STREAM frame is received indicating a change in the + * final size for the stream, an endpoint SHOULD respond with an error + * of type FINAL_SIZE_ERROR; see Section 11 for details on error + * handling. + */ + if (qcs->flags & QC_SF_SIZE_KNOWN && + (offset + len > qcs->rx.offset_max || (fin && offset + len < qcs->rx.offset_max))) { + TRACE_ERROR("final size error", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV|QMUX_EV_PROTO_ERR, qcc->conn, qcs); + qcc_set_error(qcc, QC_ERR_FINAL_SIZE_ERROR, 0); + goto err; + } + + if (qcs_is_close_remote(qcs)) { + TRACE_DATA("skipping STREAM for remotely closed", QMUX_EV_QCC_RECV, qcc->conn); + goto out; + } + + if (offset + len < qcs->rx.offset || + (offset + len == qcs->rx.offset && (!fin || (qcs->flags & QC_SF_SIZE_KNOWN)))) { + TRACE_DATA("already received offset", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV, qcc->conn, qcs); + goto out; + } + + TRACE_PROTO("receiving STREAM", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV, qcc->conn, qcs); + qcs_idle_open(qcs); + + if (offset + len > qcs->rx.offset_max) { + uint64_t diff = offset + len - qcs->rx.offset_max; + qcs->rx.offset_max = offset + len; + qcc->lfctl.offsets_recv += diff; + + if (offset + len > qcs->rx.msd || + qcc->lfctl.offsets_recv > qcc->lfctl.md) { + /* RFC 9000 4.1. 
Data Flow Control + * + * A receiver MUST close the connection with an error + * of type FLOW_CONTROL_ERROR if the sender violates + * the advertised connection or stream data limits + */ + TRACE_ERROR("flow control error", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV|QMUX_EV_PROTO_ERR, + qcc->conn, qcs); + qcc_set_error(qcc, QC_ERR_FLOW_CONTROL_ERROR, 0); + goto err; + } + } + + if (!qcs_get_ncbuf(qcs, &qcs->rx.ncbuf) || ncb_is_null(&qcs->rx.ncbuf)) { + TRACE_ERROR("receive ncbuf alloc failure", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV, qcc->conn, qcs); + qcc_set_error(qcc, QC_ERR_INTERNAL_ERROR, 0); + goto err; + } + + TRACE_DATA("newly received offset", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV, qcc->conn, qcs); + if (offset < qcs->rx.offset) { + size_t diff = qcs->rx.offset - offset; + + len -= diff; + data += diff; + offset = qcs->rx.offset; + } + + if (len) { + ret = ncb_add(&qcs->rx.ncbuf, offset - qcs->rx.offset, data, len, NCB_ADD_COMPARE); + switch (ret) { + case NCB_RET_OK: + break; + + case NCB_RET_DATA_REJ: + /* RFC 9000 2.2. Sending and Receiving Data + * + * An endpoint could receive data for a stream at the + * same stream offset multiple times. Data that has + * already been received can be discarded. The data at + * a given offset MUST NOT change if it is sent + * multiple times; an endpoint MAY treat receipt of + * different data at the same offset within a stream as + * a connection error of type PROTOCOL_VIOLATION. + */ + TRACE_ERROR("overlapping data rejected", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV|QMUX_EV_PROTO_ERR, + qcc->conn, qcs); + qcc_set_error(qcc, QC_ERR_PROTOCOL_VIOLATION, 0); + return 1; + + case NCB_RET_GAP_SIZE: + TRACE_DATA("cannot bufferize frame due to gap size limit", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV, + qcc->conn, qcs); + return 1; + } + } + + if (fin) + qcs->flags |= QC_SF_SIZE_KNOWN; + + if (qcs->flags & QC_SF_SIZE_KNOWN && + qcs->rx.offset_max == qcs->rx.offset + ncb_data(&qcs->rx.ncbuf, 0)) { + qcs_close_remote(qcs); + } + + if ((ncb_data(&qcs->rx.ncbuf, 0) && !(qcs->flags & QC_SF_DEM_FULL)) || fin) { + qcc_decode_qcs(qcc, qcs); + qcc_refresh_timeout(qcc); + } + + out: + TRACE_LEAVE(QMUX_EV_QCC_RECV, qcc->conn); + return 0; + + err: + TRACE_LEAVE(QMUX_EV_QCC_RECV, qcc->conn); + return 1; +} + +/* Handle a new MAX_DATA frame. <max> must contains the maximum data field of + * the frame. + * + * Returns 0 on success else non-zero. + */ +int qcc_recv_max_data(struct qcc *qcc, uint64_t max) +{ + TRACE_ENTER(QMUX_EV_QCC_RECV, qcc->conn); + + TRACE_PROTO("receiving MAX_DATA", QMUX_EV_QCC_RECV, qcc->conn); + if (qcc->rfctl.md < max) { + qcc->rfctl.md = max; + TRACE_DATA("increase remote max-data", QMUX_EV_QCC_RECV, qcc->conn); + + if (qcc->flags & QC_CF_BLK_MFCTL) { + qcc->flags &= ~QC_CF_BLK_MFCTL; + tasklet_wakeup(qcc->wait_event.tasklet); + } + } + + TRACE_LEAVE(QMUX_EV_QCC_RECV, qcc->conn); + return 0; +} + +/* Handle a new MAX_STREAM_DATA frame. <max> must contains the maximum data + * field of the frame and <id> is the identifier of the QUIC stream. + * + * Returns 0 on success else non-zero. On error, the received frame should not + * be acknowledged. + */ +int qcc_recv_max_stream_data(struct qcc *qcc, uint64_t id, uint64_t max) +{ + struct qcs *qcs; + + TRACE_ENTER(QMUX_EV_QCC_RECV, qcc->conn); + + if (qcc->flags & QC_CF_ERRL) { + TRACE_DATA("connection on error", QMUX_EV_QCC_RECV, qcc->conn); + goto err; + } + + /* RFC 9000 19.10. 
MAX_STREAM_DATA Frames + * + * Receiving a MAX_STREAM_DATA frame for a locally + * initiated stream that has not yet been created MUST be treated as a + * connection error of type STREAM_STATE_ERROR. An endpoint that + * receives a MAX_STREAM_DATA frame for a receive-only stream MUST + * terminate the connection with error STREAM_STATE_ERROR. + */ + if (qcc_get_qcs(qcc, id, 0, 1, &qcs)) + goto err; + + if (qcs) { + TRACE_PROTO("receiving MAX_STREAM_DATA", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV, qcc->conn, qcs); + if (max > qcs->tx.msd) { + qcs->tx.msd = max; + TRACE_DATA("increase remote max-stream-data", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV, qcc->conn, qcs); + + if (qcs->flags & QC_SF_BLK_SFCTL) { + qcs->flags &= ~QC_SF_BLK_SFCTL; + /* TODO optim: only wakeup IO-CB if stream has data to sent. */ + tasklet_wakeup(qcc->wait_event.tasklet); + } + } + } + + if (qcc_may_expire(qcc) && !qcc->nb_hreq) + qcc_refresh_timeout(qcc); + + TRACE_LEAVE(QMUX_EV_QCC_RECV, qcc->conn); + return 0; + + err: + TRACE_DEVEL("leaving on error", QMUX_EV_QCC_RECV, qcc->conn); + return 1; +} + +/* Handle a new RESET_STREAM frame from stream ID <id> with error code <err> + * and final stream size <final_size>. + * + * Returns 0 on success else non-zero. On error, the received frame should not + * be acknowledged. + */ +int qcc_recv_reset_stream(struct qcc *qcc, uint64_t id, uint64_t err, uint64_t final_size) +{ + struct qcs *qcs; + + TRACE_ENTER(QMUX_EV_QCC_RECV, qcc->conn); + + if (qcc->flags & QC_CF_ERRL) { + TRACE_DATA("connection on error", QMUX_EV_QCC_RECV, qcc->conn); + goto err; + } + + /* RFC 9000 19.4. RESET_STREAM Frames + * + * An endpoint that receives a RESET_STREAM frame for a send-only stream + * MUST terminate the connection with error STREAM_STATE_ERROR. + */ + if (qcc_get_qcs(qcc, id, 1, 0, &qcs)) { + TRACE_ERROR("RESET_STREAM for send-only stream received", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV, qcc->conn, qcs); + goto err; + } + + /* RFC 9000 3.2. Receiving Stream States + * + * A RESET_STREAM signal might be suppressed or withheld + * if stream data is completely received and is buffered to be read by + * the application. If the RESET_STREAM is suppressed, the receiving + * part of the stream remains in "Data Recvd". + */ + if (!qcs || qcs_is_close_remote(qcs)) + goto out; + + TRACE_PROTO("receiving RESET_STREAM", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV, qcc->conn, qcs); + qcs_idle_open(qcs); + + /* Ensure stream closure is not forbidden by application protocol. */ + if (qcc->app_ops->close) { + if (qcc->app_ops->close(qcs, QCC_APP_OPS_CLOSE_SIDE_RD)) { + TRACE_ERROR("closure rejected by app layer", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV, qcc->conn, qcs); + goto out; + } + } + + if (qcs->rx.offset_max > final_size || + ((qcs->flags & QC_SF_SIZE_KNOWN) && qcs->rx.offset_max != final_size)) { + TRACE_ERROR("final size error on RESET_STREAM", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV, qcc->conn, qcs); + qcc_set_error(qcc, QC_ERR_FINAL_SIZE_ERROR, 0); + goto err; + } + + /* RFC 9000 3.2. Receiving Stream States + * + * An + * implementation MAY interrupt delivery of stream data, discard any + * data that was not consumed, and signal the receipt of the + * RESET_STREAM. + */ + qcs->flags |= QC_SF_SIZE_KNOWN|QC_SF_RECV_RESET; + qcs_close_remote(qcs); + qcs_free_ncbuf(qcs, &qcs->rx.ncbuf); + + out: + TRACE_LEAVE(QMUX_EV_QCC_RECV, qcc->conn); + return 0; + + err: + TRACE_LEAVE(QMUX_EV_QCC_RECV, qcc->conn); + return 1; +} + +/* Handle a new STOP_SENDING frame for stream ID <id>. The error code should be + * specified in <err>. 
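+ * As described in RFC 9000 3.5, the expected reaction to a STOP_SENDING is
+ * the emission of a RESET_STREAM carrying the same error code, which is
+ * handled below via qcc_reset_stream().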
+ * + * Returns 0 on success else non-zero. On error, the received frame should not + * be acknowledged. + */ +int qcc_recv_stop_sending(struct qcc *qcc, uint64_t id, uint64_t err) +{ + struct qcs *qcs; + + TRACE_ENTER(QMUX_EV_QCC_RECV, qcc->conn); + + if (qcc->flags & QC_CF_ERRL) { + TRACE_DATA("connection on error", QMUX_EV_QCC_RECV, qcc->conn); + goto err; + } + + /* RFC 9000 19.5. STOP_SENDING Frames + * + * Receiving a STOP_SENDING frame for a + * locally initiated stream that has not yet been created MUST be + * treated as a connection error of type STREAM_STATE_ERROR. An + * endpoint that receives a STOP_SENDING frame for a receive-only stream + * MUST terminate the connection with error STREAM_STATE_ERROR. + */ + if (qcc_get_qcs(qcc, id, 0, 1, &qcs)) + goto err; + + if (!qcs) + goto out; + + TRACE_PROTO("receiving STOP_SENDING", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV, qcc->conn, qcs); + + /* RFC 9000 3.5. Solicited State Transitions + * + * An endpoint is expected to send another STOP_SENDING frame if a + * packet containing a previous STOP_SENDING is lost. However, once + * either all stream data or a RESET_STREAM frame has been received for + * the stream -- that is, the stream is in any state other than "Recv" + * or "Size Known" -- sending a STOP_SENDING frame is unnecessary. + */ + + /* TODO thanks to previous RFC clause, STOP_SENDING is ignored if current stream + * has already been closed locally. This is useful to not emit multiple + * RESET_STREAM for a single stream. This is functional if stream is + * locally closed due to all data transmitted, but in this case the RFC + * advices to use an explicit RESET_STREAM. + */ + if (qcs_is_close_local(qcs)) { + TRACE_STATE("ignoring STOP_SENDING", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV, qcc->conn, qcs); + goto out; + } + + qcs_idle_open(qcs); + + if (qcc->app_ops->close) { + if (qcc->app_ops->close(qcs, QCC_APP_OPS_CLOSE_SIDE_WR)) { + TRACE_ERROR("closure rejected by app layer", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV, qcc->conn, qcs); + goto out; + } + } + + /* If FIN already reached, future RESET_STREAMS will be ignored. + * Manually set EOS in this case. + */ + if (qcs_sc(qcs) && se_fl_test(qcs->sd, SE_FL_EOI)) { + se_fl_set(qcs->sd, SE_FL_EOS); + qcs_alert(qcs); + } + + /* RFC 9000 3.5. Solicited State Transitions + * + * An endpoint that receives a STOP_SENDING frame + * MUST send a RESET_STREAM frame if the stream is in the "Ready" or + * "Send" state. If the stream is in the "Data Sent" state, the + * endpoint MAY defer sending the RESET_STREAM frame until the packets + * containing outstanding data are acknowledged or declared lost. If + * any outstanding data is declared lost, the endpoint SHOULD send a + * RESET_STREAM frame instead of retransmitting the data. + * + * An endpoint SHOULD copy the error code from the STOP_SENDING frame to + * the RESET_STREAM frame it sends, but it can use any application error + * code. + */ + qcc_reset_stream(qcs, err); + + if (qcc_may_expire(qcc) && !qcc->nb_hreq) + qcc_refresh_timeout(qcc); + + out: + TRACE_LEAVE(QMUX_EV_QCC_RECV, qcc->conn); + return 0; + + err: + TRACE_DEVEL("leaving on error", QMUX_EV_QCC_RECV, qcc->conn); + return 1; +} + +/* Signal the closing of remote stream with id <id>. Flow-control for new + * streams may be allocated for the peer if needed. 
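+ *
+ * For example, with an initial limit of 100 bidirectional streams, once more
+ * than 50 closures have accumulated, a MAX_STREAMS_BIDI frame raising the
+ * limit by the closed count is scheduled. This mirrors the "half of initial
+ * credit" strategy used for MAX_DATA and MAX_STREAM_DATA.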
+ */
+static int qcc_release_remote_stream(struct qcc *qcc, uint64_t id)
+{
+	struct quic_frame *frm;
+
+	TRACE_ENTER(QMUX_EV_QCS_END, qcc->conn);
+
+	if (quic_stream_is_bidi(id)) {
+		++qcc->lfctl.cl_bidi_r;
+		if (qcc->lfctl.cl_bidi_r > qcc->lfctl.ms_bidi_init / 2) {
+			TRACE_DATA("increase max stream limit with MAX_STREAMS_BIDI", QMUX_EV_QCC_SEND, qcc->conn);
+			frm = qc_frm_alloc(QUIC_FT_MAX_STREAMS_BIDI);
+			if (!frm) {
+				qcc_set_error(qcc, QC_ERR_INTERNAL_ERROR, 0);
+				goto err;
+			}
+
+			frm->max_streams_bidi.max_streams = qcc->lfctl.ms_bidi +
+			                                    qcc->lfctl.cl_bidi_r;
+			LIST_APPEND(&qcc->lfctl.frms, &frm->list);
+			tasklet_wakeup(qcc->wait_event.tasklet);
+
+			qcc->lfctl.ms_bidi += qcc->lfctl.cl_bidi_r;
+			qcc->lfctl.cl_bidi_r = 0;
+		}
+	}
+	else {
+		/* TODO unidirectional stream flow control with MAX_STREAMS_UNI
+		 * emission not implemented. It should be unnecessary for
+		 * HTTP/3 but may be required if other application protocols
+		 * are supported.
+		 */
+	}
+
+	TRACE_LEAVE(QMUX_EV_QCS_END, qcc->conn);
+
+	return 0;
+
+ err:
+	TRACE_DEVEL("leaving on error", QMUX_EV_QCS_END, qcc->conn);
+	return 1;
+}
+
+/* Detaches the QUIC stream from its QCC and releases it to the QCS pool. */
+static void qcs_destroy(struct qcs *qcs)
+{
+	struct qcc *qcc = qcs->qcc;
+	struct connection *conn = qcc->conn;
+	const uint64_t id = qcs->id;
+
+	TRACE_ENTER(QMUX_EV_QCS_END, conn, qcs);
+
+	/* A stream with prepared data left to send MUST NOT be removed. This
+	 * is required to ensure consistency of the connection flow-control
+	 * calculation.
+	 */
+	BUG_ON(qcs->tx.offset < qcs->tx.sent_offset);
+
+	if (!(qcc->flags & QC_CF_ERRL)) {
+		if (quic_stream_is_remote(qcc, id))
+			qcc_release_remote_stream(qcc, id);
+	}
+
+	qcs_free(qcs);
+
+	TRACE_LEAVE(QMUX_EV_QCS_END, conn);
+}
+
+/* Transfer as much data as possible on <qcs> from <in> to <out>. This is done
+ * with respect to the available flow-control at stream and connection level.
+ *
+ * Returns the total bytes of transferred data or a negative error code.
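+ *
+ * The transferred amount is clamped twice: first against the stream-level
+ * limit <qcs->tx.msd>, then against the connection-level one <qcc->rfctl.md>.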
+ */ +static int qcs_xfer_data(struct qcs *qcs, struct buffer *out, struct buffer *in) +{ + struct qcc *qcc = qcs->qcc; + int left, to_xfer; + int total = 0; + + TRACE_ENTER(QMUX_EV_QCS_SEND, qcc->conn, qcs); + + if (!qcs_get_buf(qcs, out)) { + TRACE_ERROR("buffer alloc failure", QMUX_EV_QCS_SEND, qcc->conn, qcs); + goto err; + } + + /* + * QCS out buffer diagram + * head left to_xfer + * -------------> ----------> -----> + * -------------------------------------------------- + * |...............|xxxxxxxxxxx|<<<<< + * -------------------------------------------------- + * ^ ack-off ^ sent-off ^ off + * + * STREAM frame + * ^ ^ + * |xxxxxxxxxxxxxxxxx| + */ + + BUG_ON_HOT(qcs->tx.sent_offset < qcs->stream->ack_offset); + BUG_ON_HOT(qcs->tx.offset < qcs->tx.sent_offset); + BUG_ON_HOT(qcc->tx.offsets < qcc->tx.sent_offsets); + + left = qcs->tx.offset - qcs->tx.sent_offset; + to_xfer = QUIC_MIN(b_data(in), b_room(out)); + + BUG_ON_HOT(qcs->tx.offset > qcs->tx.msd); + /* do not exceed flow control limit */ + if (qcs->tx.offset + to_xfer > qcs->tx.msd) { + TRACE_DATA("do not exceed stream flow control", QMUX_EV_QCS_SEND, qcc->conn, qcs); + to_xfer = qcs->tx.msd - qcs->tx.offset; + } + + BUG_ON_HOT(qcc->tx.offsets > qcc->rfctl.md); + /* do not overcome flow control limit on connection */ + if (qcc->tx.offsets + to_xfer > qcc->rfctl.md) { + TRACE_DATA("do not exceed conn flow control", QMUX_EV_QCS_SEND, qcc->conn, qcs); + to_xfer = qcc->rfctl.md - qcc->tx.offsets; + } + + if (!left && !to_xfer) + goto out; + + total = b_force_xfer(out, in, to_xfer); + + out: + { + struct qcs_xfer_data_trace_arg arg = { + .prep = b_data(out), .xfer = total, + }; + TRACE_LEAVE(QMUX_EV_QCS_SEND|QMUX_EV_QCS_XFER_DATA, + qcc->conn, qcs, &arg); + } + + return total; + + err: + TRACE_DEVEL("leaving on error", QMUX_EV_QCS_SEND, qcc->conn, qcs); + return -1; +} + +/* Prepare a STREAM frame for <qcs> instance using <out> as payload. The frame + * is appended in <frm_list>. Set <fin> if this is supposed to be the last + * stream frame. If <out> is NULL an empty STREAM frame is built : this may be + * useful if FIN needs to be sent without any data left. + * + * Returns the payload length of the STREAM frame or a negative error code. + */ +static int qcs_build_stream_frm(struct qcs *qcs, struct buffer *out, char fin, + struct list *frm_list) +{ + struct qcc *qcc = qcs->qcc; + struct quic_frame *frm; + int head, total; + uint64_t base_off; + + TRACE_ENTER(QMUX_EV_QCS_SEND, qcc->conn, qcs); + + /* if ack_offset < buf_offset, it points to an older buffer. */ + base_off = MAX(qcs->stream->buf_offset, qcs->stream->ack_offset); + BUG_ON(qcs->tx.sent_offset < base_off); + + head = qcs->tx.sent_offset - base_off; + total = out ? b_data(out) - head : 0; + BUG_ON(total < 0); + + if (!total && !fin) { + /* No need to send anything if total is NULL and no FIN to signal. 
*/ + TRACE_LEAVE(QMUX_EV_QCS_SEND, qcc->conn, qcs); + return 0; + } + BUG_ON((!total && qcs->tx.sent_offset > qcs->tx.offset) || + (total && qcs->tx.sent_offset >= qcs->tx.offset)); + BUG_ON(qcs->tx.sent_offset + total > qcs->tx.offset); + BUG_ON(qcc->tx.sent_offsets + total > qcc->rfctl.md); + + TRACE_PROTO("sending STREAM frame", QMUX_EV_QCS_SEND, qcc->conn, qcs); + frm = qc_frm_alloc(QUIC_FT_STREAM_8); + if (!frm) { + TRACE_ERROR("frame alloc failure", QMUX_EV_QCS_SEND, qcc->conn, qcs); + goto err; + } + + frm->stream.stream = qcs->stream; + frm->stream.id = qcs->id; + frm->stream.offset.key = 0; + frm->stream.dup = 0; + + if (total) { + frm->stream.buf = out; + frm->stream.data = (unsigned char *)b_peek(out, head); + } + else { + /* Empty STREAM frame. */ + frm->stream.buf = NULL; + frm->stream.data = NULL; + } + + /* FIN is positioned only when the buffer has been totally emptied. */ + if (fin) + frm->type |= QUIC_STREAM_FRAME_TYPE_FIN_BIT; + + if (qcs->tx.sent_offset) { + frm->type |= QUIC_STREAM_FRAME_TYPE_OFF_BIT; + frm->stream.offset.key = qcs->tx.sent_offset; + } + + /* Always set length bit as we do not know if there is remaining frames + * in the final packet after this STREAM. + */ + frm->type |= QUIC_STREAM_FRAME_TYPE_LEN_BIT; + frm->stream.len = total; + + LIST_APPEND(frm_list, &frm->list); + + out: + { + struct qcs_build_stream_trace_arg arg = { + .len = frm->stream.len, .fin = fin, + .offset = frm->stream.offset.key, + }; + TRACE_LEAVE(QMUX_EV_QCS_SEND|QMUX_EV_QCS_BUILD_STRM, + qcc->conn, qcs, &arg); + } + + return total; + + err: + TRACE_LEAVE(QMUX_EV_QCS_SEND, qcc->conn, qcs); + return -1; +} + +/* Check after transferring data from qcs.tx.buf if FIN must be set on the next + * STREAM frame for <qcs>. + * + * Returns true if FIN must be set else false. + */ +static int qcs_stream_fin(struct qcs *qcs) +{ + return qcs->flags & QC_SF_FIN_STREAM && !b_data(&qcs->tx.buf); +} + +/* Return true if <qcs> has data to send in new STREAM frames. */ +static forceinline int qcs_need_sending(struct qcs *qcs) +{ + return b_data(&qcs->tx.buf) || qcs->tx.sent_offset < qcs->tx.offset || + qcs_stream_fin(qcs); +} + +/* This function must be called by the upper layer to inform about the sending + * of a STREAM frame for <qcs> instance. The frame is of <data> length and on + * <offset>. + */ +void qcc_streams_sent_done(struct qcs *qcs, uint64_t data, uint64_t offset) +{ + struct qcc *qcc = qcs->qcc; + uint64_t diff; + + TRACE_ENTER(QMUX_EV_QCS_SEND, qcc->conn, qcs); + + BUG_ON(offset > qcs->tx.sent_offset); + BUG_ON(offset + data > qcs->tx.offset); + + /* check if the STREAM frame has already been notified. It can happen + * for retransmission. 
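+	 * For example, with sent_offset=100, a retransmitted frame covering
+	 * offset=50 with data=30 ends at 80 < 100 and thus reports nothing
+	 * new to account for.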
+ */ + if (offset + data < qcs->tx.sent_offset) { + TRACE_DEVEL("offset already notified", QMUX_EV_QCS_SEND, qcc->conn, qcs); + goto out; + } + + qcs_idle_open(qcs); + + diff = offset + data - qcs->tx.sent_offset; + if (diff) { + /* increase offset sum on connection */ + qcc->tx.sent_offsets += diff; + BUG_ON_HOT(qcc->tx.sent_offsets > qcc->rfctl.md); + if (qcc->tx.sent_offsets == qcc->rfctl.md) { + qcc->flags |= QC_CF_BLK_MFCTL; + TRACE_STATE("connection flow-control reached", QMUX_EV_QCS_SEND, qcc->conn); + } + + /* increase offset on stream */ + qcs->tx.sent_offset += diff; + BUG_ON_HOT(qcs->tx.sent_offset > qcs->tx.msd); + BUG_ON_HOT(qcs->tx.sent_offset > qcs->tx.offset); + if (qcs->tx.sent_offset == qcs->tx.msd) { + qcs->flags |= QC_SF_BLK_SFCTL; + TRACE_STATE("stream flow-control reached", QMUX_EV_QCS_SEND, qcc->conn, qcs); + } + + /* If qcs.stream.buf is full, release it to the lower layer. */ + if (qcs->tx.offset == qcs->tx.sent_offset && + b_full(&qcs->stream->buf->buf)) { + qc_stream_buf_release(qcs->stream); + } + + /* Add measurement for send rate. This is done at the MUX layer + * to account only for STREAM frames without retransmission. + */ + increment_send_rate(diff, 0); + } + + if (qcs->tx.offset == qcs->tx.sent_offset && !b_data(&qcs->tx.buf)) { + /* Remove stream from send_list if all was sent. */ + LIST_DEL_INIT(&qcs->el_send); + TRACE_STATE("stream sent done", QMUX_EV_QCS_SEND, qcc->conn, qcs); + + if (qcs->flags & (QC_SF_FIN_STREAM|QC_SF_DETACH)) { + /* Close stream locally. */ + qcs_close_local(qcs); + /* Reset flag to not emit multiple FIN STREAM frames. */ + qcs->flags &= ~QC_SF_FIN_STREAM; + } + } + + out: + TRACE_LEAVE(QMUX_EV_QCS_SEND, qcc->conn, qcs); +} + +/* Returns true if subscribe set, false otherwise. */ +static int qcc_subscribe_send(struct qcc *qcc) +{ + struct connection *conn = qcc->conn; + + /* Do not subscribe if lower layer in error. */ + if (conn->flags & CO_FL_ERROR) + return 0; + + if (qcc->wait_event.events & SUB_RETRY_SEND) + return 1; + + TRACE_DEVEL("subscribe for send", QMUX_EV_QCC_SEND, qcc->conn); + conn->xprt->subscribe(conn, conn->xprt_ctx, SUB_RETRY_SEND, &qcc->wait_event); + return 1; +} + +/* Wrapper for send on transport layer. Send a list of frames <frms> for the + * connection <qcc>. + * + * Returns 0 if all data sent with success else non-zero. + */ +static int qcc_send_frames(struct qcc *qcc, struct list *frms) +{ + TRACE_ENTER(QMUX_EV_QCC_SEND, qcc->conn); + + if (LIST_ISEMPTY(frms)) { + TRACE_DEVEL("no frames to send", QMUX_EV_QCC_SEND, qcc->conn); + goto err; + } + + if (!qc_send_mux(qcc->conn->handle.qc, frms)) { + TRACE_DEVEL("error on sending", QMUX_EV_QCC_SEND, qcc->conn); + qcc_subscribe_send(qcc); + goto err; + } + + /* If there is frames left at this stage, transport layer is blocked. + * Subscribe on it to retry later. + */ + if (!LIST_ISEMPTY(frms)) { + TRACE_DEVEL("remaining frames to send", QMUX_EV_QCC_SEND, qcc->conn); + qcc_subscribe_send(qcc); + goto err; + } + + TRACE_LEAVE(QMUX_EV_QCC_SEND, qcc->conn); + return 0; + + err: + TRACE_DEVEL("leaving on error", QMUX_EV_QCC_SEND, qcc->conn); + return 1; +} + +/* Emit a RESET_STREAM on <qcs>. + * + * Returns 0 if the frame has been successfully sent else non-zero. 
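+ *
+ * The advertised final size is the last sent offset, which the peer needs
+ * for its connection flow-control accounting (RFC 9000 4.5).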
+ */ +static int qcs_send_reset(struct qcs *qcs) +{ + struct list frms = LIST_HEAD_INIT(frms); + struct quic_frame *frm; + + TRACE_ENTER(QMUX_EV_QCS_SEND, qcs->qcc->conn, qcs); + + frm = qc_frm_alloc(QUIC_FT_RESET_STREAM); + if (!frm) { + TRACE_LEAVE(QMUX_EV_QCS_SEND, qcs->qcc->conn, qcs); + return 1; + } + + frm->reset_stream.id = qcs->id; + frm->reset_stream.app_error_code = qcs->err; + frm->reset_stream.final_size = qcs->tx.sent_offset; + + LIST_APPEND(&frms, &frm->list); + if (qcc_send_frames(qcs->qcc, &frms)) { + if (!LIST_ISEMPTY(&frms)) + qc_frm_free(qcs->qcc->conn->handle.qc, &frm); + TRACE_DEVEL("cannot send RESET_STREAM", QMUX_EV_QCS_SEND, qcs->qcc->conn, qcs); + return 1; + } + + qcs_close_local(qcs); + qcs->flags &= ~QC_SF_TO_RESET; + + TRACE_LEAVE(QMUX_EV_QCS_SEND, qcs->qcc->conn, qcs); + return 0; +} + +/* Emit a STOP_SENDING on <qcs>. + * + * Returns 0 if the frame has been successfully sent else non-zero. + */ +static int qcs_send_stop_sending(struct qcs *qcs) +{ + struct list frms = LIST_HEAD_INIT(frms); + struct quic_frame *frm; + struct qcc *qcc = qcs->qcc; + + TRACE_ENTER(QMUX_EV_QCS_SEND, qcs->qcc->conn, qcs); + + /* RFC 9000 3.3. Permitted Frame Types + * + * A + * receiver MAY send a STOP_SENDING frame in any state where it has not + * received a RESET_STREAM frame -- that is, states other than "Reset + * Recvd" or "Reset Read". However, there is little value in sending a + * STOP_SENDING frame in the "Data Recvd" state, as all stream data has + * been received. A sender could receive either of these two types of + * frames in any state as a result of delayed delivery of packets.¶ + */ + if (qcs_is_close_remote(qcs)) { + TRACE_STATE("skip STOP_SENDING on remote already closed", QMUX_EV_QCS_SEND, qcc->conn, qcs); + goto done; + } + + frm = qc_frm_alloc(QUIC_FT_STOP_SENDING); + if (!frm) { + TRACE_LEAVE(QMUX_EV_QCS_SEND, qcs->qcc->conn, qcs); + return 1; + } + + frm->stop_sending.id = qcs->id; + frm->stop_sending.app_error_code = qcs->err; + + LIST_APPEND(&frms, &frm->list); + if (qcc_send_frames(qcs->qcc, &frms)) { + if (!LIST_ISEMPTY(&frms)) + qc_frm_free(qcc->conn->handle.qc, &frm); + TRACE_DEVEL("cannot send STOP_SENDING", QMUX_EV_QCS_SEND, qcs->qcc->conn, qcs); + return 1; + } + + done: + qcs->flags &= ~QC_SF_TO_STOP_SENDING; + + TRACE_LEAVE(QMUX_EV_QCS_SEND, qcs->qcc->conn, qcs); + return 0; +} + +/* Used internally by qcc_io_send function. Proceed to send for <qcs>. This will + * transfer data from qcs buffer to its quic_stream counterpart. A STREAM frame + * is then generated and inserted in <frms> list. + * + * Returns the total bytes transferred between qcs and quic_stream buffers. Can + * be null if out buffer cannot be allocated. On error a negative error code is + * used. + */ +static int qcs_send(struct qcs *qcs, struct list *frms) +{ + struct qcc *qcc = qcs->qcc; + struct buffer *buf = &qcs->tx.buf; + struct buffer *out = qc_stream_buf_get(qcs->stream); + int xfer = 0, buf_avail; + char fin = 0; + + TRACE_ENTER(QMUX_EV_QCS_SEND, qcc->conn, qcs); + + /* Cannot send STREAM on remote unidirectional streams. */ + BUG_ON(quic_stream_is_uni(qcs->id) && quic_stream_is_remote(qcc, qcs->id)); + + if (b_data(buf)) { + /* Allocate <out> buffer if not already done. 
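+		 * qc_stream_buf_alloc() may fail either on real memory
+		 * exhaustion or because the per-connection buffer budget is
+		 * spent; only the latter is recoverable, via QC_CF_CONN_FULL,
+		 * once in-flight buffers are acknowledged and released.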
*/ + if (!out) { + if (qcc->flags & QC_CF_CONN_FULL) + goto out; + + out = qc_stream_buf_alloc(qcs->stream, qcs->tx.offset, + &buf_avail); + if (!out) { + if (buf_avail) { + TRACE_ERROR("stream desc alloc failure", QMUX_EV_QCS_SEND, qcc->conn, qcs); + goto err; + } + + TRACE_STATE("hitting stream desc buffer limit", QMUX_EV_QCS_SEND, qcc->conn, qcs); + qcc->flags |= QC_CF_CONN_FULL; + goto out; + } + } + + /* Transfer data from <buf> to <out>. */ + xfer = qcs_xfer_data(qcs, out, buf); + if (xfer < 0) + goto err; + + if (xfer > 0) { + qcs_notify_send(qcs); + qcs->flags &= ~QC_SF_BLK_MROOM; + } + + qcs->tx.offset += xfer; + BUG_ON_HOT(qcs->tx.offset > qcs->tx.msd); + qcc->tx.offsets += xfer; + BUG_ON_HOT(qcc->tx.offsets > qcc->rfctl.md); + + /* out buffer cannot be emptied if qcs offsets differ. */ + BUG_ON(!b_data(out) && qcs->tx.sent_offset != qcs->tx.offset); + } + + /* FIN is set if all incoming data were transferred. */ + fin = qcs_stream_fin(qcs); + + /* Build a new STREAM frame with <out> buffer. */ + if (qcs->tx.sent_offset != qcs->tx.offset || fin) { + /* Skip STREAM frame allocation if already subscribed for send. + * Happens on sendto transient error or network congestion. + */ + if (qcc->wait_event.events & SUB_RETRY_SEND) { + TRACE_DEVEL("already subscribed for sending", + QMUX_EV_QCS_SEND, qcc->conn, qcs); + goto err; + } + + if (qcs_build_stream_frm(qcs, out, fin, frms) < 0) + goto err; + } + + out: + TRACE_LEAVE(QMUX_EV_QCS_SEND, qcc->conn, qcs); + return xfer; + + err: + TRACE_DEVEL("leaving on error", QMUX_EV_QCS_SEND, qcc->conn, qcs); + return -1; +} + +/* Proceed to sending. Loop through all available streams for the <qcc> + * instance and try to send as much as possible. + * + * Returns the total of bytes sent to the transport layer. + */ +static int qcc_io_send(struct qcc *qcc) +{ + struct list frms = LIST_HEAD_INIT(frms); + /* Temporary list for QCS on error. */ + struct list qcs_failed = LIST_HEAD_INIT(qcs_failed); + struct qcs *qcs, *qcs_tmp, *first_qcs = NULL; + int ret, total = 0; + + TRACE_ENTER(QMUX_EV_QCC_SEND, qcc->conn); + + /* TODO if socket in transient error, sending should be temporarily + * disabled for all frames. However, checking for send subscription is + * not valid as this may be caused by a congestion error which only + * apply for STREAM frames. + */ + + /* Check for transport error. */ + if (qcc->flags & QC_CF_ERR_CONN || qcc->conn->flags & CO_FL_ERROR) { + TRACE_DEVEL("connection on error", QMUX_EV_QCC_SEND, qcc->conn); + goto out; + } + + /* Check for locally detected connection error. */ + if (qcc->flags & QC_CF_ERRL) { + /* Prepare a CONNECTION_CLOSE if not already done. */ + if (!(qcc->flags & QC_CF_ERRL_DONE)) { + TRACE_DATA("report a connection error", QMUX_EV_QCC_SEND|QMUX_EV_QCC_ERR, qcc->conn); + quic_set_connection_close(qcc->conn->handle.qc, qcc->err); + qcc->flags |= QC_CF_ERRL_DONE; + } + goto out; + } + + if (qcc->conn->flags & CO_FL_SOCK_WR_SH) { + qcc->conn->flags |= CO_FL_ERROR; + TRACE_DEVEL("connection on error", QMUX_EV_QCC_SEND, qcc->conn); + goto out; + } + + if (!LIST_ISEMPTY(&qcc->lfctl.frms)) { + if (qcc_send_frames(qcc, &qcc->lfctl.frms)) { + TRACE_DEVEL("flow-control frames rejected by transport, aborting send", QMUX_EV_QCC_SEND, qcc->conn); + goto out; + } + } + + /* Send STREAM/STOP_SENDING/RESET_STREAM data for registered streams. */ + list_for_each_entry_safe(qcs, qcs_tmp, &qcc->send_list, el_send) { + /* Check if all QCS were processed. 
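+		 * Streams which made progress were moved back to the end of
+		 * the send-list, so meeting <first_qcs> again means a full
+		 * pass was completed and the loop must stop to avoid cycling.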
*/ + if (qcs == first_qcs) + break; + + /* Stream must not be present in send_list if it has nothing to send. */ + BUG_ON(!(qcs->flags & (QC_SF_TO_STOP_SENDING|QC_SF_TO_RESET)) && + !qcs_need_sending(qcs)); + + /* Each STOP_SENDING/RESET_STREAM frame is sent individually to + * guarantee its emission. + * + * TODO multiplex several frames in same datagram to optimize sending + */ + if (qcs->flags & QC_SF_TO_STOP_SENDING) { + if (qcs_send_stop_sending(qcs)) + goto sent_done; + + /* Remove stream from send_list if it had only STOP_SENDING + * to send. + */ + if (!(qcs->flags & QC_SF_TO_RESET) && !qcs_need_sending(qcs)) { + LIST_DEL_INIT(&qcs->el_send); + continue; + } + } + + if (qcs->flags & QC_SF_TO_RESET) { + if (qcs_send_reset(qcs)) + goto sent_done; + + /* RFC 9000 3.3. Permitted Frame Types + * + * A sender MUST NOT send + * a STREAM or STREAM_DATA_BLOCKED frame for a stream in the + * "Reset Sent" state or any terminal state -- that is, after + * sending a RESET_STREAM frame. + */ + LIST_DEL_INIT(&qcs->el_send); + continue; + } + + if (!(qcc->flags & QC_CF_BLK_MFCTL) && + !(qcs->flags & QC_SF_BLK_SFCTL)) { + if ((ret = qcs_send(qcs, &frms)) < 0) { + /* Temporarily remove QCS from send-list. */ + LIST_DEL_INIT(&qcs->el_send); + LIST_APPEND(&qcs_failed, &qcs->el_send); + continue; + } + + total += ret; + if (ret) { + /* Move QCS with some bytes transferred at the + * end of send-list for next iterations. + */ + LIST_DEL_INIT(&qcs->el_send); + LIST_APPEND(&qcc->send_list, &qcs->el_send); + /* Remember first moved QCS as checkpoint to interrupt loop */ + if (!first_qcs) + first_qcs = qcs; + } + } + } + + /* Retry sending until no frame to send, data rejected or connection + * flow-control limit reached. + */ + while (qcc_send_frames(qcc, &frms) == 0 && !(qcc->flags & QC_CF_BLK_MFCTL)) { + /* Reloop over <qcc.send_list>. Useful for streams which have + * fulfilled their qc_stream_desc buf and have now release it. + */ + list_for_each_entry_safe(qcs, qcs_tmp, &qcc->send_list, el_send) { + /* Only streams blocked on flow-control or waiting on a + * new qc_stream_desc should be present in send_list as + * long as transport layer can handle all data. + */ + BUG_ON(qcs->stream->buf && !(qcs->flags & QC_SF_BLK_SFCTL)); + + if (!(qcs->flags & QC_SF_BLK_SFCTL)) { + if ((ret = qcs_send(qcs, &frms)) < 0) { + LIST_DEL_INIT(&qcs->el_send); + LIST_APPEND(&qcs_failed, &qcs->el_send); + continue; + } + + total += ret; + } + } + } + + sent_done: + /* Deallocate frames that the transport layer has rejected. */ + if (!LIST_ISEMPTY(&frms)) { + struct quic_frame *frm, *frm2; + + list_for_each_entry_safe(frm, frm2, &frms, list) + qc_frm_free(qcc->conn->handle.qc, &frm); + } + + /* Re-insert on-error QCS at the end of the send-list. */ + if (!LIST_ISEMPTY(&qcs_failed)) { + list_for_each_entry_safe(qcs, qcs_tmp, &qcs_failed, el_send) { + LIST_DEL_INIT(&qcs->el_send); + LIST_APPEND(&qcc->send_list, &qcs->el_send); + } + + if (!(qcc->flags & QC_CF_BLK_MFCTL)) + tasklet_wakeup(qcc->wait_event.tasklet); + } + + out: + if (qcc->conn->flags & CO_FL_ERROR && !(qcc->flags & QC_CF_ERR_CONN)) { + TRACE_ERROR("error reported by transport layer", + QMUX_EV_QCC_SEND, qcc->conn); + qcc->flags |= QC_CF_ERR_CONN; + } + + TRACE_LEAVE(QMUX_EV_QCC_SEND, qcc->conn); + return total; +} + +/* Proceed on receiving. Loop through all streams from <qcc> and use decode_qcs + * operation. + * + * Returns 0 on success else non-zero. 
+ */ +static int qcc_io_recv(struct qcc *qcc) +{ + struct eb64_node *node; + struct qcs *qcs; + + TRACE_ENTER(QMUX_EV_QCC_RECV, qcc->conn); + + if (qcc->flags & QC_CF_ERRL) { + TRACE_DATA("connection on error", QMUX_EV_QCC_RECV, qcc->conn); + TRACE_LEAVE(QMUX_EV_QCC_RECV, qcc->conn); + return 0; + } + + node = eb64_first(&qcc->streams_by_id); + while (node) { + uint64_t id; + + qcs = eb64_entry(node, struct qcs, by_id); + id = qcs->id; + + if (!ncb_data(&qcs->rx.ncbuf, 0) || (qcs->flags & QC_SF_DEM_FULL)) { + node = eb64_next(node); + continue; + } + + if (quic_stream_is_uni(id) && quic_stream_is_local(qcc, id)) { + node = eb64_next(node); + continue; + } + + qcc_decode_qcs(qcc, qcs); + node = eb64_next(node); + } + + TRACE_LEAVE(QMUX_EV_QCC_RECV, qcc->conn); + return 0; +} + + +/* Release all streams which have their transfer operation achieved. + * + * Returns true if at least one stream is released. + */ +static int qcc_purge_streams(struct qcc *qcc) +{ + struct eb64_node *node; + int release = 0; + + TRACE_ENTER(QMUX_EV_QCC_WAKE, qcc->conn); + + node = eb64_first(&qcc->streams_by_id); + while (node) { + struct qcs *qcs = eb64_entry(node, struct qcs, by_id); + node = eb64_next(node); + + /* Release not attached closed streams. */ + if (qcs->st == QC_SS_CLO && !qcs_sc(qcs)) { + TRACE_STATE("purging closed stream", QMUX_EV_QCC_WAKE, qcs->qcc->conn, qcs); + qcs_destroy(qcs); + release = 1; + continue; + } + + /* Release detached streams with empty buffer. */ + if (qcs->flags & QC_SF_DETACH) { + if (qcs_is_close_local(qcs)) { + TRACE_STATE("purging detached stream", QMUX_EV_QCC_WAKE, qcs->qcc->conn, qcs); + qcs_destroy(qcs); + release = 1; + continue; + } + } + } + + TRACE_LEAVE(QMUX_EV_QCC_WAKE, qcc->conn); + return release; +} + +/* Execute application layer shutdown. If this operation is not defined, a + * CONNECTION_CLOSE will be prepared as a fallback. This function is protected + * against multiple invocation with the flag QC_CF_APP_SHUT. + */ +static void qcc_shutdown(struct qcc *qcc) +{ + TRACE_ENTER(QMUX_EV_QCC_END, qcc->conn); + + if (qcc->flags & (QC_CF_ERR_CONN|QC_CF_ERRL)) { + TRACE_DATA("connection on error", QMUX_EV_QCC_END, qcc->conn); + goto out; + } + + if (qcc->flags & QC_CF_APP_SHUT) + goto out; + + TRACE_STATE("perform graceful shutdown", QMUX_EV_QCC_END, qcc->conn); + if (qcc->app_ops && qcc->app_ops->shutdown) { + qcc->app_ops->shutdown(qcc->ctx); + qcc_io_send(qcc); + } + else { + qcc->err = quic_err_app(QC_ERR_NO_ERROR); + } + + /* Register "no error" code at transport layer. Do not use + * quic_set_connection_close() as retransmission may be performed to + * finalized transfers. Do not overwrite quic-conn existing code if + * already set. + * + * TODO implement a wrapper function for this in quic-conn module + */ + if (!(qcc->conn->handle.qc->flags & QUIC_FL_CONN_IMMEDIATE_CLOSE)) + qcc->conn->handle.qc->err = qcc->err; + + out: + qcc->flags |= QC_CF_APP_SHUT; + TRACE_LEAVE(QMUX_EV_QCC_END, qcc->conn); +} + +/* Loop through all qcs from <qcc>. Report error on stream endpoint if + * connection on error and wake them. 
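+ *
+ * Aside: the receive loop above skips locally-opened unidirectional
+ * streams. Following RFC 9000 section 2.1, the two low bits of a stream
+ * ID encode its type, so those tests boil down to simple bit checks
+ * (sketch for a frontend, where "local" means server-initiated):
+ *
+ *     int is_uni   = id & 0x2;    // bit 1: 1 = unidirectional
+ *     int is_local = id & 0x1;    // bit 0: 1 = server-initiated
+ *     // e.g. the local H3 control stream has (id & 0x3) == 0x3
+ *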
+ */ +static int qcc_wake_some_streams(struct qcc *qcc) +{ + struct qcs *qcs; + struct eb64_node *node; + + TRACE_POINT(QMUX_EV_QCC_WAKE, qcc->conn); + + for (node = eb64_first(&qcc->streams_by_id); node; + node = eb64_next(node)) { + qcs = eb64_entry(node, struct qcs, by_id); + + if (!qcs_sc(qcs)) + continue; + + if (qcc->flags & (QC_CF_ERR_CONN|QC_CF_ERRL)) { + TRACE_POINT(QMUX_EV_QCC_WAKE, qcc->conn, qcs); + se_fl_set_error(qcs->sd); + qcs_alert(qcs); + } + } + + return 0; +} + +/* Conduct operations which should be made for <qcc> connection after + * input/output. Most notably, closed streams are purged which may leave the + * connection has ready to be released. + * + * Returns 1 if <qcc> must be released else 0. + */ +static int qcc_io_process(struct qcc *qcc) +{ + qcc_purge_streams(qcc); + + /* Check if a soft-stop is in progress. + * + * TODO this is relevant for frontend connections only. + */ + if (unlikely(qcc->proxy->flags & (PR_FL_DISABLED|PR_FL_STOPPED))) { + int close = 1; + + /* If using listener socket, soft-stop is not supported. The + * connection must be closed immediately. + */ + if (!qc_test_fd(qcc->conn->handle.qc)) { + TRACE_DEVEL("proxy disabled with listener socket, closing connection", QMUX_EV_QCC_WAKE, qcc->conn); + qcc->conn->flags |= (CO_FL_SOCK_RD_SH|CO_FL_SOCK_WR_SH); + qcc_io_send(qcc); + goto out; + } + + TRACE_DEVEL("proxy disabled, prepare connection soft-stop", QMUX_EV_QCC_WAKE, qcc->conn); + + /* If a close-spread-time option is set, we want to avoid + * closing all the active HTTP3 connections at once so we add a + * random factor that will spread the closing. + */ + if (tick_isset(global.close_spread_end)) { + int remaining_window = tick_remain(now_ms, global.close_spread_end); + if (remaining_window) { + /* This should increase the closing rate the + * further along the window we are. */ + close = (remaining_window <= statistical_prng_range(global.close_spread_time)); + } + } + else if (global.tune.options & GTUNE_DISABLE_ACTIVE_CLOSE) { + close = 0; /* let the client close his connection himself */ + } + + if (close) + qcc_shutdown(qcc); + } + + /* Report error if set on stream endpoint layer. */ + if (qcc->flags & (QC_CF_ERR_CONN|QC_CF_ERRL)) + qcc_wake_some_streams(qcc); + + out: + if (qcc_is_dead(qcc)) + return 1; + + return 0; +} + +/* release function. This one should be called to free all resources allocated + * to the mux. 
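+ *
+ * Note on the close-spread logic above: statistical_prng_range(N) returns
+ * a pseudo-random value below N, so each wakeup closes the connection
+ * with probability about
+ *
+ *     P(close) = 1 - remaining_window / close_spread_time
+ *
+ * i.e. almost never right after the soft-stop starts and almost surely
+ * near the end of the window, which spreads the closures over the
+ * configured period.
+ *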
+ */ +static void qcc_release(struct qcc *qcc) +{ + struct connection *conn = qcc->conn; + struct eb64_node *node; + + TRACE_ENTER(QMUX_EV_QCC_END, conn); + + qcc_shutdown(qcc); + + if (qcc->task) { + task_destroy(qcc->task); + qcc->task = NULL; + } + + tasklet_free(qcc->wait_event.tasklet); + if (conn && qcc->wait_event.events) { + conn->xprt->unsubscribe(conn, conn->xprt_ctx, + qcc->wait_event.events, + &qcc->wait_event); + } + + /* free remaining qcs instances */ + node = eb64_first(&qcc->streams_by_id); + while (node) { + struct qcs *qcs = eb64_entry(node, struct qcs, by_id); + node = eb64_next(node); + qcs_free(qcs); + } + + while (!LIST_ISEMPTY(&qcc->lfctl.frms)) { + struct quic_frame *frm = LIST_ELEM(qcc->lfctl.frms.n, struct quic_frame *, list); + qc_frm_free(qcc->conn->handle.qc, &frm); + } + + if (qcc->app_ops && qcc->app_ops->release) + qcc->app_ops->release(qcc->ctx); + TRACE_PROTO("application layer released", QMUX_EV_QCC_END, conn); + + pool_free(pool_head_qcc, qcc); + + if (conn) { + LIST_DEL_INIT(&conn->stopping_list); + + conn->handle.qc->conn = NULL; + conn->mux = NULL; + conn->ctx = NULL; + + TRACE_DEVEL("freeing conn", QMUX_EV_QCC_END, conn); + + conn_stop_tracking(conn); + conn_full_close(conn); + if (conn->destroy_cb) + conn->destroy_cb(conn); + conn_free(conn); + } + + TRACE_LEAVE(QMUX_EV_QCC_END); +} + +struct task *qcc_io_cb(struct task *t, void *ctx, unsigned int status) +{ + struct qcc *qcc = ctx; + + TRACE_ENTER(QMUX_EV_QCC_WAKE, qcc->conn); + + qcc_io_send(qcc); + + qcc_io_recv(qcc); + + if (qcc_io_process(qcc)) { + TRACE_STATE("releasing dead connection", QMUX_EV_QCC_WAKE, qcc->conn); + goto release; + } + + qcc_refresh_timeout(qcc); + + end: + TRACE_LEAVE(QMUX_EV_QCC_WAKE, qcc->conn); + return NULL; + + release: + qcc_release(qcc); + TRACE_LEAVE(QMUX_EV_QCC_WAKE); + return NULL; +} + +static struct task *qcc_timeout_task(struct task *t, void *ctx, unsigned int state) +{ + struct qcc *qcc = ctx; + int expired = tick_is_expired(t->expire, now_ms); + + TRACE_ENTER(QMUX_EV_QCC_WAKE, qcc ? qcc->conn : NULL); + + if (qcc) { + if (!expired) { + TRACE_DEVEL("not expired", QMUX_EV_QCC_WAKE, qcc->conn); + goto requeue; + } + + if (!qcc_may_expire(qcc)) { + TRACE_DEVEL("cannot expire", QMUX_EV_QCC_WAKE, qcc->conn); + t->expire = TICK_ETERNITY; + goto requeue; + } + } + + task_destroy(t); + + if (!qcc) { + TRACE_DEVEL("no more qcc", QMUX_EV_QCC_WAKE); + goto out; + } + + /* Mark timeout as triggered by setting task to NULL. */ + qcc->task = NULL; + + /* TODO depending on the timeout condition, different shutdown mode + * should be used. For http keep-alive or disabled proxy, a graceful + * shutdown should occur. For all other cases, an immediate close + * seems legitimate. + */ + if (qcc_is_dead(qcc)) { + TRACE_STATE("releasing dead connection", QMUX_EV_QCC_WAKE, qcc->conn); + qcc_release(qcc); + } + + out: + TRACE_LEAVE(QMUX_EV_QCC_WAKE); + return NULL; + + requeue: + TRACE_LEAVE(QMUX_EV_QCC_WAKE); + return t; +} + +static int qmux_init(struct connection *conn, struct proxy *prx, + struct session *sess, struct buffer *input) +{ + struct qcc *qcc; + struct quic_transport_params *lparams, *rparams; + + TRACE_ENTER(QMUX_EV_QCC_NEW); + + qcc = pool_alloc(pool_head_qcc); + if (!qcc) { + TRACE_ERROR("alloc failure", QMUX_EV_QCC_NEW); + goto fail_no_qcc; + } + + qcc->conn = conn; + conn->ctx = qcc; + qcc->nb_hreq = qcc->nb_sc = 0; + qcc->flags = 0; + + qcc->app_ops = NULL; + + qcc->streams_by_id = EB_ROOT_UNIQUE; + + /* Server parameters, params used for RX flow control. 
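+ *
+ * Summary of the transport parameter mapping set up below (informal):
+ *
+ *     conn->handle.qc->rx.params -> qcc->lfctl.*
+ *         our advertised limits, enforced against the peer's data;
+ *     conn->handle.qc->tx.params -> qcc->rfctl.*
+ *         the peer's limits, which we must respect when emitting.
+ *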
*/ + lparams = &conn->handle.qc->rx.params; + + qcc->tx.sent_offsets = qcc->tx.offsets = 0; + + LIST_INIT(&qcc->lfctl.frms); + qcc->lfctl.ms_bidi = qcc->lfctl.ms_bidi_init = lparams->initial_max_streams_bidi; + qcc->lfctl.ms_uni = lparams->initial_max_streams_uni; + qcc->lfctl.msd_bidi_l = lparams->initial_max_stream_data_bidi_local; + qcc->lfctl.msd_bidi_r = lparams->initial_max_stream_data_bidi_remote; + qcc->lfctl.msd_uni_r = lparams->initial_max_stream_data_uni; + qcc->lfctl.cl_bidi_r = 0; + + qcc->lfctl.md = qcc->lfctl.md_init = lparams->initial_max_data; + qcc->lfctl.offsets_recv = qcc->lfctl.offsets_consume = 0; + + rparams = &conn->handle.qc->tx.params; + qcc->rfctl.md = rparams->initial_max_data; + qcc->rfctl.msd_bidi_l = rparams->initial_max_stream_data_bidi_local; + qcc->rfctl.msd_bidi_r = rparams->initial_max_stream_data_bidi_remote; + qcc->rfctl.msd_uni_l = rparams->initial_max_stream_data_uni; + + if (conn_is_back(conn)) { + qcc->next_bidi_l = 0x00; + qcc->largest_bidi_r = 0x01; + qcc->next_uni_l = 0x02; + qcc->largest_uni_r = 0x03; + } + else { + qcc->largest_bidi_r = 0x00; + qcc->next_bidi_l = 0x01; + qcc->largest_uni_r = 0x02; + qcc->next_uni_l = 0x03; + } + + qcc->wait_event.tasklet = tasklet_new(); + if (!qcc->wait_event.tasklet) { + TRACE_ERROR("tasklet alloc failure", QMUX_EV_QCC_NEW); + goto fail_no_tasklet; + } + + LIST_INIT(&qcc->send_list); + + qcc->wait_event.tasklet->process = qcc_io_cb; + qcc->wait_event.tasklet->context = qcc; + qcc->wait_event.events = 0; + + qcc->proxy = prx; + /* haproxy timeouts */ + if (conn_is_back(qcc->conn)) { + qcc->timeout = prx->timeout.server; + qcc->shut_timeout = tick_isset(prx->timeout.serverfin) ? + prx->timeout.serverfin : prx->timeout.server; + } + else { + qcc->timeout = prx->timeout.client; + qcc->shut_timeout = tick_isset(prx->timeout.clientfin) ? + prx->timeout.clientfin : prx->timeout.client; + } + + /* Always allocate task even if timeout is unset. In MUX code, if task + * is NULL, it indicates that a timeout has struck earlier. + */ + qcc->task = task_new_here(); + if (!qcc->task) { + TRACE_ERROR("timeout task alloc failure", QMUX_EV_QCC_NEW); + goto fail_no_timeout_task; + } + qcc->task->process = qcc_timeout_task; + qcc->task->context = qcc; + qcc->task->expire = tick_add_ifset(now_ms, qcc->timeout); + + qcc_reset_idle_start(qcc); + LIST_INIT(&qcc->opening_list); + + HA_ATOMIC_STORE(&conn->handle.qc->qcc, qcc); + + if (qcc_install_app_ops(qcc, conn->handle.qc->app_ops)) { + TRACE_PROTO("Cannot install app layer", QMUX_EV_QCC_NEW|QMUX_EV_QCC_ERR, qcc->conn); + /* prepare a CONNECTION_CLOSE frame */ + quic_set_connection_close(conn->handle.qc, quic_err_transport(QC_ERR_APPLICATION_ERROR)); + goto fail_install_app_ops; + } + + if (qcc->app_ops == &h3_ops) + proxy_inc_fe_cum_sess_ver_ctr(sess->listener, prx, 3); + + /* Register conn for idle front closing. This is done once everything is allocated. 
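+ *
+ * Aside on the stream ID initialisation above: IDs of a given type grow
+ * in steps of four (RFC 9000 section 2.1), so on a frontend the expected
+ * sequences are:
+ *
+ *     peer bidi  : 0x00, 0x04, 0x08, ...   (tracked via largest_bidi_r)
+ *     local bidi : 0x01, 0x05, 0x09, ...   (allocated from next_bidi_l)
+ *     peer uni   : 0x02, 0x06, 0x0a, ...
+ *     local uni  : 0x03, 0x07, 0x0b, ...
+ *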
*/ + if (!conn_is_back(conn)) + LIST_APPEND(&mux_stopping_data[tid].list, &conn->stopping_list); + + /* init read cycle */ + tasklet_wakeup(qcc->wait_event.tasklet); + + TRACE_LEAVE(QMUX_EV_QCC_NEW, qcc->conn); + return 0; + + fail_install_app_ops: + if (qcc->app_ops && qcc->app_ops->release) + qcc->app_ops->release(qcc->ctx); + task_destroy(qcc->task); + fail_no_timeout_task: + tasklet_free(qcc->wait_event.tasklet); + fail_no_tasklet: + pool_free(pool_head_qcc, qcc); + fail_no_qcc: + TRACE_LEAVE(QMUX_EV_QCC_NEW); + return -1; +} + +static void qmux_destroy(void *ctx) +{ + struct qcc *qcc = ctx; + + TRACE_ENTER(QMUX_EV_QCC_END, qcc->conn); + qcc_release(qcc); + TRACE_LEAVE(QMUX_EV_QCC_END); +} + +static void qmux_strm_detach(struct sedesc *sd) +{ + struct qcs *qcs = sd->se; + struct qcc *qcc = qcs->qcc; + + TRACE_ENTER(QMUX_EV_STRM_END, qcc->conn, qcs); + + /* TODO this BUG_ON_HOT() is not correct as the stconn layer may detach + * from the stream even if it is not closed remotely at the QUIC layer. + * This happens for example when a stream must be closed due to a + * rejected request. To better handle these cases, it will be required + * to implement shutr/shutw MUX operations. Once this is done, this + * BUG_ON_HOT() statement can be adjusted. + */ + //BUG_ON_HOT(!qcs_is_close_remote(qcs)); + + qcc_rm_sc(qcc); + + if (!qcs_is_close_local(qcs) && + !(qcc->flags & (QC_CF_ERR_CONN|QC_CF_ERRL))) { + TRACE_STATE("remaining data, detaching qcs", QMUX_EV_STRM_END, qcc->conn, qcs); + qcs->flags |= QC_SF_DETACH; + qcc_refresh_timeout(qcc); + + TRACE_LEAVE(QMUX_EV_STRM_END, qcc->conn, qcs); + return; + } + + qcs_destroy(qcs); + + if (qcc_is_dead(qcc)) { + TRACE_STATE("killing dead connection", QMUX_EV_STRM_END, qcc->conn); + goto release; + } + else { + TRACE_DEVEL("refreshing connection's timeout", QMUX_EV_STRM_END, qcc->conn); + qcc_refresh_timeout(qcc); + } + + TRACE_LEAVE(QMUX_EV_STRM_END, qcc->conn); + return; + + release: + qcc_release(qcc); + TRACE_LEAVE(QMUX_EV_STRM_END); + return; +} + +/* Called from the upper layer, to receive data */ +static size_t qmux_strm_rcv_buf(struct stconn *sc, struct buffer *buf, + size_t count, int flags) +{ + struct qcs *qcs = __sc_mux_strm(sc); + struct qcc *qcc = qcs->qcc; + size_t ret = 0; + char fin = 0; + + TRACE_ENTER(QMUX_EV_STRM_RECV, qcc->conn, qcs); + + ret = qcs_http_rcv_buf(qcs, buf, count, &fin); + + if (b_data(&qcs->rx.app_buf)) { + se_fl_set(qcs->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + } + else { + se_fl_clr(qcs->sd, SE_FL_RCV_MORE | SE_FL_WANT_ROOM); + + /* Set end-of-input when full message properly received. */ + if (fin) { + TRACE_STATE("report end-of-input", QMUX_EV_STRM_RECV, qcc->conn, qcs); + se_fl_set(qcs->sd, SE_FL_EOI); + + /* If request EOM is reported to the upper layer, it means the + * QCS now expects data from the opposite side. + */ + se_expect_data(qcs->sd); + } + + /* Set end-of-stream on read closed. */ + if (qcs->flags & QC_SF_RECV_RESET || + qcc->conn->flags & CO_FL_SOCK_RD_SH) { + TRACE_STATE("report end-of-stream", QMUX_EV_STRM_RECV, qcc->conn, qcs); + se_fl_set(qcs->sd, SE_FL_EOS); + + /* Set error if EOI not reached. This may happen on + * RESET_STREAM reception or connection error. 
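+ *
+ * Informal summary of the endpoint flags set by this function, derived
+ * from the conditions around:
+ *
+ *     FIN read and buffer drained     -> SE_FL_EOI
+ *     read closed (RESET or RD_SH)    -> SE_FL_EOS
+ *     EOS reached without EOI         -> SE_FL_ERROR (aborted stream)
+ *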
+ */ + if (!se_fl_test(qcs->sd, SE_FL_EOI)) { + TRACE_STATE("report error on stream aborted", QMUX_EV_STRM_RECV, qcc->conn, qcs); + se_fl_set(qcs->sd, SE_FL_ERROR); + } + } + + if (se_fl_test(qcs->sd, SE_FL_ERR_PENDING)) { + TRACE_STATE("report error", QMUX_EV_STRM_RECV, qcc->conn, qcs); + se_fl_set(qcs->sd, SE_FL_ERROR); + } + + if (b_size(&qcs->rx.app_buf)) { + b_free(&qcs->rx.app_buf); + offer_buffers(NULL, 1); + } + } + + /* Restart demux if it was interrupted on full buffer. */ + if (ret && qcs->flags & QC_SF_DEM_FULL) { + /* Ensure DEM_FULL is only set if there is available data to + * ensure we never do unnecessary wakeup here. + */ + BUG_ON(!ncb_data(&qcs->rx.ncbuf, 0)); + + qcs->flags &= ~QC_SF_DEM_FULL; + if (!(qcc->flags & QC_CF_ERRL)) + tasklet_wakeup(qcc->wait_event.tasklet); + } + + TRACE_LEAVE(QMUX_EV_STRM_RECV, qcc->conn, qcs); + + return ret; +} + +static size_t qmux_strm_snd_buf(struct stconn *sc, struct buffer *buf, + size_t count, int flags) +{ + struct qcs *qcs = __sc_mux_strm(sc); + size_t ret = 0; + char fin; + + TRACE_ENTER(QMUX_EV_STRM_SEND, qcs->qcc->conn, qcs); + + /* stream layer has been detached so no transfer must occur after. */ + BUG_ON_HOT(qcs->flags & QC_SF_DETACH); + + /* Report error if set on stream endpoint layer. */ + if (qcs->qcc->flags & (QC_CF_ERR_CONN|QC_CF_ERRL)) { + se_fl_set(qcs->sd, SE_FL_ERROR); + TRACE_DEVEL("connection in error", QMUX_EV_STRM_SEND, qcs->qcc->conn, qcs); + goto end; + } + + if (qcs_is_close_local(qcs) || (qcs->flags & QC_SF_TO_RESET)) { + ret = qcs_http_reset_buf(qcs, buf, count); + goto end; + } + + ret = qcs_http_snd_buf(qcs, buf, count, &fin); + if (fin) { + TRACE_STATE("reached stream fin", QMUX_EV_STRM_SEND, qcs->qcc->conn, qcs); + qcs->flags |= QC_SF_FIN_STREAM; + } + + if (ret || fin) { + qcc_send_stream(qcs, 0); + if (!(qcs->qcc->wait_event.events & SUB_RETRY_SEND)) + tasklet_wakeup(qcs->qcc->wait_event.tasklet); + } + + end: + TRACE_LEAVE(QMUX_EV_STRM_SEND, qcs->qcc->conn, qcs); + + return ret; +} + + +static size_t qmux_nego_ff(struct stconn *sc, struct buffer *input, size_t count, unsigned int may_splice) +{ + struct qcs *qcs = __sc_mux_strm(sc); + size_t ret = 0; + + TRACE_ENTER(QMUX_EV_STRM_SEND, qcs->qcc->conn, qcs); + + /* stream layer has been detached so no transfer must occur after. */ + BUG_ON_HOT(qcs->flags & QC_SF_DETACH); + + if (!qcs->qcc->app_ops->nego_ff || !qcs->qcc->app_ops->done_ff) { + /* Fast forwading is not supported by the QUIC application layer */ + qcs->sd->iobuf.flags |= IOBUF_FL_NO_FF; + goto end; + } + + if (qcs->qcc->flags & (QC_CF_ERR_CONN|QC_CF_ERRL)) { + /* Disable fast-forward if connection is on error. Eventually, + * error will be reported to stream-conn if snd_buf is invoked. 
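+ *
+ * Illustration of the caller-side contract of snd_buf() above (sketch
+ * only; real callers go through the mux_ops vtable): a short write means
+ * the MUX is blocked, so the stream layer subscribes to be woken up:
+ *
+ *     sent = snd_buf(sc, buf, b_data(buf), 0);
+ *     if (sent < b_data(buf))
+ *             subscribe(sc, SUB_RETRY_SEND, &wait_event);
+ *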
+ */ + TRACE_DEVEL("connection in error", QMUX_EV_STRM_SEND, qcs->qcc->conn, qcs); + qcs->sd->iobuf.flags |= IOBUF_FL_NO_FF; + goto end; + } + + /* Alawys disable splicing */ + qcs->sd->iobuf.flags |= IOBUF_FL_NO_SPLICING; + + ret = qcs->qcc->app_ops->nego_ff(qcs, count); + if (!ret) + goto end; + + /* forward remaining input data */ + if (b_data(input)) { + size_t xfer = ret; + + if (xfer > b_data(input)) + xfer = b_data(input); + b_add(qcs->sd->iobuf.buf, qcs->sd->iobuf.offset); + qcs->sd->iobuf.data = b_xfer(qcs->sd->iobuf.buf, input, xfer); + b_sub(qcs->sd->iobuf.buf, qcs->sd->iobuf.offset); + + /* Cannot forward more data, wait for room */ + if (b_data(input)) { + ret = 0; + goto end; + } + } + ret -= qcs->sd->iobuf.data; + + end: + TRACE_LEAVE(QMUX_EV_STRM_SEND, qcs->qcc->conn, qcs); + return ret; +} + +static size_t qmux_done_ff(struct stconn *sc) +{ + struct qcs *qcs = __sc_mux_strm(sc); + struct qcc *qcc = qcs->qcc; + struct sedesc *sd = qcs->sd; + size_t total = 0; + + TRACE_ENTER(QMUX_EV_STRM_SEND, qcs->qcc->conn, qcs); + + if (sd->iobuf.flags & IOBUF_FL_EOI) + qcs->flags |= QC_SF_FIN_STREAM; + + if (!(qcs->flags & QC_SF_FIN_STREAM) && !sd->iobuf.data) + goto end; + + total = qcs->qcc->app_ops->done_ff(qcs); + + qcc_send_stream(qcs, 0); + if (!(qcs->qcc->wait_event.events & SUB_RETRY_SEND)) + tasklet_wakeup(qcc->wait_event.tasklet); + + end: + if (!b_data(&qcs->tx.buf)) + b_free(&qcs->tx.buf); + + TRACE_LEAVE(QMUX_EV_STRM_SEND, qcs->qcc->conn, qcs); + return total; +} + +static int qmux_resume_ff(struct stconn *sc, unsigned int flags) +{ + return 0; +} + +/* Called from the upper layer, to subscribe <es> to events <event_type>. The + * event subscriber <es> is not allowed to change from a previous call as long + * as at least one event is still subscribed. The <event_type> must only be a + * combination of SUB_RETRY_RECV and SUB_RETRY_SEND. It always returns 0. + */ +static int qmux_strm_subscribe(struct stconn *sc, int event_type, + struct wait_event *es) +{ + return qcs_subscribe(__sc_mux_strm(sc), event_type, es); +} + +/* Called from the upper layer, to unsubscribe <es> from events <event_type>. + * The <es> pointer is not allowed to differ from the one passed to the + * subscribe() call. It always returns zero. + */ +static int qmux_strm_unsubscribe(struct stconn *sc, int event_type, struct wait_event *es) +{ + struct qcs *qcs = __sc_mux_strm(sc); + + BUG_ON(event_type & ~(SUB_RETRY_SEND|SUB_RETRY_RECV)); + BUG_ON(qcs->subs && qcs->subs != es); + + es->events &= ~event_type; + if (!es->events) + qcs->subs = NULL; + + return 0; +} + +static int qmux_wake(struct connection *conn) +{ + struct qcc *qcc = conn->ctx; + + TRACE_ENTER(QMUX_EV_QCC_WAKE, conn); + + if (qcc_io_process(qcc)) { + TRACE_STATE("releasing dead connection", QMUX_EV_QCC_WAKE, qcc->conn); + goto release; + } + + qcc_wake_some_streams(qcc); + + qcc_refresh_timeout(qcc); + + TRACE_LEAVE(QMUX_EV_QCC_WAKE, conn); + return 0; + + release: + qcc_release(qcc); + TRACE_LEAVE(QMUX_EV_QCC_WAKE); + return 1; +} + +static void qmux_strm_shutw(struct stconn *sc, enum co_shw_mode mode) +{ + struct qcs *qcs = __sc_mux_strm(sc); + struct qcc *qcc = qcs->qcc; + + TRACE_ENTER(QMUX_EV_STRM_SHUT, qcc->conn, qcs); + + /* Early closure reported if QC_SF_FIN_STREAM not yet set. */ + if (!qcs_is_close_local(qcs) && + !(qcs->flags & (QC_SF_FIN_STREAM|QC_SF_TO_RESET))) { + + if (qcs->flags & QC_SF_UNKNOWN_PL_LENGTH) { + /* Close stream with a FIN STREAM frame. 
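+ *
+ * Note: the fast-forward pair above works in two phases (sketch of the
+ * calling sequence through the mux_ops vtable):
+ *
+ *     room = nego_fastfwd(sc, input, count, 0);   // reserve iobuf room
+ *     // the producer then fills sd->iobuf.buf with at most <room> bytes
+ *     sent = done_fastfwd(sc);                    // hand off to QUIC
+ *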
*/ + if (!(qcc->flags & (QC_CF_ERR_CONN|QC_CF_ERRL))) { + TRACE_STATE("set FIN STREAM", + QMUX_EV_STRM_SHUT, qcc->conn, qcs); + qcs->flags |= QC_SF_FIN_STREAM; + qcc_send_stream(qcs, 0); + } + } + else { + /* RESET_STREAM necessary. */ + qcc_reset_stream(qcs, 0); + } + + tasklet_wakeup(qcc->wait_event.tasklet); + } + + out: + TRACE_LEAVE(QMUX_EV_STRM_SHUT, qcc->conn, qcs); +} + +static int qmux_sctl(struct stconn *sc, enum mux_sctl_type mux_sctl, void *output) +{ + int ret = 0; + struct qcs *qcs = __sc_mux_strm(sc); + + switch (mux_sctl) { + case MUX_SCTL_SID: + if (output) + *((int64_t *)output) = qcs->id; + return ret; + + default: + return -1; + } +} + +/* for debugging with CLI's "show sess" command. May emit multiple lines, each + * new one being prefixed with <pfx>, if <pfx> is not NULL, otherwise a single + * line is used. Each field starts with a space so it's safe to print it after + * existing fields. + */ +static int qmux_strm_show_sd(struct buffer *msg, struct sedesc *sd, const char *pfx) +{ + struct qcs *qcs = sd->se; + struct qcc *qcc; + int ret = 0; + + if (!qcs) + return ret; + + chunk_appendf(msg, " qcs=%p .flg=%#x .id=%llu .st=%s .ctx=%p, .err=%#llx", + qcs, qcs->flags, (ull)qcs->id, qcs_st_to_str(qcs->st), qcs->ctx, (ull)qcs->err); + + if (pfx) + chunk_appendf(msg, "\n%s", pfx); + + qcc = qcs->qcc; + chunk_appendf(msg, " qcc=%p .flg=%#x .nbsc=%llu .nbhreq=%llu, .task=%p", + qcc, qcc->flags, (ull)qcc->nb_sc, (ull)qcc->nb_hreq, qcc->task); + return ret; +} + + +static const struct mux_ops qmux_ops = { + .init = qmux_init, + .destroy = qmux_destroy, + .detach = qmux_strm_detach, + .rcv_buf = qmux_strm_rcv_buf, + .snd_buf = qmux_strm_snd_buf, + .nego_fastfwd = qmux_nego_ff, + .done_fastfwd = qmux_done_ff, + .resume_fastfwd = qmux_resume_ff, + .subscribe = qmux_strm_subscribe, + .unsubscribe = qmux_strm_unsubscribe, + .wake = qmux_wake, + .shutw = qmux_strm_shutw, + .sctl = qmux_sctl, + .show_sd = qmux_strm_show_sd, + .flags = MX_FL_HTX|MX_FL_NO_UPG|MX_FL_FRAMED, + .name = "QUIC", +}; + +static struct mux_proto_list mux_proto_quic = + { .token = IST("quic"), .mode = PROTO_MODE_HTTP, .side = PROTO_SIDE_FE, .mux = &qmux_ops }; + +INITCALL1(STG_REGISTER, register_mux_proto, &mux_proto_quic); diff --git a/src/mworker-prog.c b/src/mworker-prog.c new file mode 100644 index 0000000..2734d95 --- /dev/null +++ b/src/mworker-prog.c @@ -0,0 +1,359 @@ +/* + * Master Worker - program + * + * Copyright HAProxy Technologies - William Lallemand <wlallemand@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#define _GNU_SOURCE + +#include <sys/types.h> +#include <errno.h> +#include <grp.h> +#include <pwd.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include <haproxy/api.h> +#include <haproxy/cfgparse.h> +#include <haproxy/errors.h> +#include <haproxy/global.h> +#include <haproxy/mworker.h> +#include <haproxy/task.h> +#include <haproxy/time.h> +#include <haproxy/tools.h> + + +static int use_program = 0; /* do we use the program section ? */ + +/* + * Launch every programs + */ +int mworker_ext_launch_all() +{ + int ret; + struct mworker_proc *child; + struct mworker_proc *tmp; + int reexec = 0; + + if (!use_program) + return 0; + + reexec = getenv("HAPROXY_MWORKER_REEXEC") ? 
1 : 0; + + /* find the right mworker_proc */ + list_for_each_entry_safe(child, tmp, &proc_list, list) { + if (child->reloads == 0 && (child->options & PROC_O_TYPE_PROG)) { + + if (reexec && (!(child->options & PROC_O_START_RELOAD))) { + struct mworker_proc *old_child; + + /* + * This is a reload and we don't want to fork a + * new program so we have to remove the entry in + * the list. + * + * But before that, we need to mark the + * previous program as not leaving, if we find one. + */ + + list_for_each_entry(old_child, &proc_list, list) { + if (!(old_child->options & PROC_O_TYPE_PROG) || (!(old_child->options & PROC_O_LEAVING))) + continue; + + if (strcmp(old_child->id, child->id) == 0) + old_child->options &= ~PROC_O_LEAVING; + } + + + LIST_DELETE(&child->list); + mworker_free_child(child); + child = NULL; + + continue; + } + + child->timestamp = ns_to_sec(now_ns); + + ret = fork(); + if (ret < 0) { + ha_alert("Cannot fork program '%s'.\n", child->id); + exit(EXIT_FAILURE); /* there has been an error */ + } else if (ret > 0) { /* parent */ + child->pid = ret; + ha_notice("New program '%s' (%d) forked\n", child->id, ret); + continue; + } else if (ret == 0) { + /* In child */ + mworker_unblock_signals(); + mworker_cleanlisteners(); + mworker_cleantasks(); + + /* setgid / setuid */ + if (child->gid != -1) { + if (getgroups(0, NULL) > 0 && setgroups(0, NULL) == -1) + ha_warning("[%s.main()] Failed to drop supplementary groups. Using 'gid'/'group'" + " without 'uid'/'user' is generally useless.\n", child->command[0]); + + if (setgid(child->gid) == -1) { + ha_alert("[%s.main()] Cannot set gid %d.\n", child->command[0], child->gid); + exit(1); + } + } + + if (child->uid != -1 && setuid(child->uid) == -1) { + ha_alert("[%s.main()] Cannot set uid %d.\n", child->command[0], child->uid); + exit(1); + } + + /* This one must not be exported, it's internal! 
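+ *
+ * Note on the privilege drop above: the ordering is the standard Unix
+ * rule, not specific to this file:
+ *
+ *     setgroups(0, NULL);   // 1. supplementary groups, needs privileges
+ *     setgid(child->gid);   // 2. group, must precede setuid()
+ *     setuid(child->uid);   // 3. user last: afterwards steps 1-2 would
+ *                           //    fail for an unprivileged process
+ *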
*/ + unsetenv("HAPROXY_MWORKER_REEXEC"); + unsetenv("HAPROXY_STARTUPLOGS_FD"); + unsetenv("HAPROXY_MWORKER_WAIT_ONLY"); + unsetenv("HAPROXY_PROCESSES"); + execvp(child->command[0], child->command); + + ha_alert("Cannot execute %s: %s\n", child->command[0], strerror(errno)); + exit(EXIT_FAILURE); + } + } + } + + return 0; + +} + + +/* Configuration */ + +int cfg_parse_program(const char *file, int linenum, char **args, int kwm) +{ + static struct mworker_proc *ext_child = NULL; + struct mworker_proc *child; + int err_code = 0; + + if (strcmp(args[0], "program") == 0) { + if (alertif_too_many_args(1, file, linenum, args, &err_code)) { + err_code |= ERR_ABORT; + goto error; + } + + if (!*args[1]) { + ha_alert("parsing [%s:%d] : '%s' expects an <id> argument\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_ABORT; + goto error; + } + + ext_child = calloc(1, sizeof(*ext_child)); + if (!ext_child) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto error; + } + + ext_child->options |= PROC_O_TYPE_PROG; /* external process */ + ext_child->command = NULL; + ext_child->path = NULL; + ext_child->id = NULL; + ext_child->pid = -1; + ext_child->reloads = 0; + ext_child->timestamp = -1; + ext_child->ipc_fd[0] = -1; + ext_child->ipc_fd[1] = -1; + ext_child->options |= PROC_O_START_RELOAD; /* restart the programs by default */ + ext_child->uid = -1; + ext_child->gid = -1; + LIST_INIT(&ext_child->list); + + list_for_each_entry(child, &proc_list, list) { + if (child->reloads == 0 && (child->options & PROC_O_TYPE_PROG)) { + if (strcmp(args[1], child->id) == 0) { + ha_alert("parsing [%s:%d]: '%s' program section already exists in the configuration.\n", file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_ABORT; + goto error; + } + } + } + + ext_child->id = strdup(args[1]); + if (!ext_child->id) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto error; + } + + LIST_APPEND(&proc_list, &ext_child->list); + + } else if (strcmp(args[0], "command") == 0) { + int arg_nb = 0; + int i = 0; + + if (*(args[1]) == 0) { + ha_alert("parsing [%s:%d]: '%s' expects a command with optional arguments separated in words.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto error; + } + + while (*args[arg_nb+1]) + arg_nb++; + + ext_child->command = calloc(arg_nb+1, sizeof(*ext_child->command)); + + if (!ext_child->command) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto error; + } + + while (i < arg_nb) { + ext_child->command[i] = strdup(args[i+1]); + if (!ext_child->command[i]) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto error; + } + i++; + } + ext_child->command[i] = NULL; + + } else if (strcmp(args[0], "option") == 0) { + + if (*(args[1]) == '\0') { + ha_alert("parsing [%s:%d]: '%s' expects an option name.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto error; + } + + if (strcmp(args[1], "start-on-reload") == 0) { + if (alertif_too_many_args_idx(0, 1, file, linenum, args, &err_code)) + goto error; + if (kwm == KWM_STD) + ext_child->options |= PROC_O_START_RELOAD; + else if (kwm == KWM_NO) + ext_child->options &= ~PROC_O_START_RELOAD; + goto out; + + } else { + ha_alert("parsing [%s:%d] : unknown option '%s'.\n", file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto error; + } + } else if (strcmp(args[0], 
"user") == 0) { + struct passwd *ext_child_user; + if (*(args[1]) == '\0') { + ha_alert("parsing [%s:%d]: '%s' expects a user name.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto error; + } + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto error; + + if (ext_child->uid != -1) { + ha_alert("parsing [%s:%d] : user/uid already specified. Continuing.\n", file, linenum); + err_code |= ERR_ALERT; + goto out; + } + + ext_child_user = getpwnam(args[1]); + if (ext_child_user != NULL) { + ext_child->uid = (int)ext_child_user->pw_uid; + } else { + ha_alert("parsing [%s:%d] : cannot find user id for '%s' (%d:%s)\n", file, linenum, args[1], errno, strerror(errno)); + err_code |= ERR_ALERT | ERR_FATAL; + } + } else if (strcmp(args[0], "group") == 0) { + struct group *ext_child_group; + if (*(args[1]) == '\0') { + ha_alert("parsing [%s:%d]: '%s' expects a group name.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto error; + } + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto error; + + if (ext_child->gid != -1) { + ha_alert("parsing [%s:%d] : group/gid already specified. Continuing.\n", file, linenum); + err_code |= ERR_ALERT; + goto out; + } + + ext_child_group = getgrnam(args[1]); + if (ext_child_group != NULL) { + ext_child->gid = (int)ext_child_group->gr_gid; + } else { + ha_alert("parsing [%s:%d] : cannot find group id for '%s' (%d:%s)\n", file, linenum, args[1], errno, strerror(errno)); + err_code |= ERR_ALERT | ERR_FATAL; + } + } else { + ha_alert("parsing [%s:%d] : unknown keyword '%s' in '%s' section\n", file, linenum, args[0], "program"); + err_code |= ERR_ALERT | ERR_FATAL; + goto error; + } + + use_program = 1; + + return err_code; + +error: + if (ext_child) { + LIST_DELETE(&ext_child->list); + if (ext_child->command) { + int i; + + for (i = 0; ext_child->command[i]; i++) { + ha_free(&ext_child->command[i]); + } + ha_free(&ext_child->command); + } + ha_free(&ext_child->id); + } + + ha_free(&ext_child); + +out: + return err_code; + +} + +int cfg_program_postparser() +{ + int err_code = 0; + struct mworker_proc *child; + + /* we only need to check this during configuration parsing, + * wait mode doesn't have the complete description of a program */ + if (global.mode & MODE_MWORKER_WAIT) + return err_code; + + list_for_each_entry(child, &proc_list, list) { + if (child->reloads == 0 && (child->options & PROC_O_TYPE_PROG)) { + if (child->command == NULL) { + ha_alert("The program section '%s' lacks a command to launch.\n", child->id); + err_code |= ERR_ALERT | ERR_FATAL; + } + } + } + + if (use_program && !(global.mode & MODE_MWORKER)) { + ha_alert("Can't use a 'program' section without master worker mode.\n"); + err_code |= ERR_ALERT | ERR_FATAL; + } + + return err_code; +} + + +REGISTER_CONFIG_SECTION("program", cfg_parse_program, NULL); +REGISTER_CONFIG_POSTPARSER("program", cfg_program_postparser); diff --git a/src/mworker.c b/src/mworker.c new file mode 100644 index 0000000..c71446a --- /dev/null +++ b/src/mworker.c @@ -0,0 +1,821 @@ +/* + * Master Worker + * + * Copyright HAProxy Technologies 2019 - William Lallemand <wlallemand@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#define _GNU_SOURCE + +#include <errno.h> +#include <fcntl.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <sys/wait.h> +#include <unistd.h> + +#if defined(USE_SYSTEMD) +#include <systemd/sd-daemon.h> +#endif + +#include <haproxy/api.h> +#include <haproxy/cfgparse.h> +#include <haproxy/cli.h> +#include <haproxy/errors.h> +#include <haproxy/fd.h> +#include <haproxy/global.h> +#include <haproxy/list.h> +#include <haproxy/log.h> +#include <haproxy/listener.h> +#include <haproxy/mworker.h> +#include <haproxy/peers.h> +#include <haproxy/proto_sockpair.h> +#include <haproxy/proxy.h> +#include <haproxy/ring.h> +#include <haproxy/sc_strm.h> +#include <haproxy/signal.h> +#include <haproxy/stconn.h> +#include <haproxy/stream.h> +#include <haproxy/tools.h> +#include <haproxy/version.h> + + +static int exitcode = -1; +static int max_reloads = -1; /* maximum number of reloads a worker can have before it is killed */ +struct mworker_proc *proc_self = NULL; /* process structure of current process */ + +/* ----- children processes handling ----- */ + +/* + * Send a signal to every known child. + */ + +static void mworker_kill(int sig) +{ + struct mworker_proc *child; + + list_for_each_entry(child, &proc_list, list) { + /* careful there, we must be sure that the pid > 0, we don't want to emit a kill -1 */ + if ((child->options & (PROC_O_TYPE_WORKER|PROC_O_TYPE_PROG)) && (child->pid > 0)) + kill(child->pid, sig); + } +} + +void mworker_kill_max_reloads(int sig) +{ + struct mworker_proc *child; + + list_for_each_entry(child, &proc_list, list) { + if (max_reloads != -1 && (child->options & PROC_O_TYPE_WORKER) && + (child->pid > 0) && (child->reloads > max_reloads)) + kill(child->pid, sig); + } +} + +/* return 1 if a pid is a current child otherwise 0 */ +int mworker_current_child(int pid) +{ + struct mworker_proc *child; + + list_for_each_entry(child, &proc_list, list) { + if ((child->options & (PROC_O_TYPE_WORKER|PROC_O_TYPE_PROG)) && (!(child->options & PROC_O_LEAVING)) && (child->pid == pid)) + return 1; + } + return 0; +} + +/* + * Return the number of new and old children (including workers and external + * processes) + */ +int mworker_child_nb() +{ + struct mworker_proc *child; + int ret = 0; + + list_for_each_entry(child, &proc_list, list) { + if (child->options & (PROC_O_TYPE_WORKER|PROC_O_TYPE_PROG)) + ret++; + } + + return ret; +} + + +/* + * serialize the proc list and put it in the environment + */ +void mworker_proc_list_to_env() +{ + char *msg = NULL; + struct mworker_proc *child; + int minreloads = INT_MAX; /* minimum number of reloads to choose which processes are "current" ones */ + + list_for_each_entry(child, &proc_list, list) { + char type = '?'; + + if (child->options & PROC_O_TYPE_MASTER) + type = 'm'; + else if (child->options & PROC_O_TYPE_PROG) + type = 'e'; + else if (child->options & PROC_O_TYPE_WORKER) + type = 'w'; + + if (child->reloads < minreloads) + minreloads = child->reloads; + + if (child->pid > -1) + memprintf(&msg, "%s|type=%c;fd=%d;cfd=%d;pid=%d;reloads=%d;failedreloads=%d;timestamp=%d;id=%s;version=%s", msg ? msg : "", type, child->ipc_fd[0], child->ipc_fd[1], child->pid, child->reloads, child->failedreloads, child->timestamp, child->id ? 
child->id : "", child->version); + } + if (msg) + setenv("HAPROXY_PROCESSES", msg, 1); + + list_for_each_entry(child, &proc_list, list) { + if (child->reloads > minreloads && !(child->options & PROC_O_TYPE_MASTER)) { + child->options |= PROC_O_LEAVING; + } + } + + +} + +struct mworker_proc *mworker_proc_new() +{ + struct mworker_proc *child; + + child = calloc(1, sizeof(*child)); + if (!child) + return NULL; + + child->failedreloads = 0; + child->reloads = 0; + child->pid = -1; + child->ipc_fd[0] = -1; + child->ipc_fd[1] = -1; + child->timestamp = -1; + + return child; +} + + +/* + * unserialize the proc list from the environment + * Return < 0 upon error. + */ +int mworker_env_to_proc_list() +{ + char *env, *msg, *omsg = NULL, *token = NULL, *s1; + struct mworker_proc *child; + int minreloads = INT_MAX; /* minimum number of reloads to chose which processes are "current" ones */ + int err = 0; + + env = getenv("HAPROXY_PROCESSES"); + if (!env) + goto no_env; + + omsg = msg = strdup(env); + if (!msg) { + ha_alert("Out of memory while trying to allocate a worker process structure."); + err = -1; + goto out; + } + + while ((token = strtok_r(msg, "|", &s1))) { + char *subtoken = NULL; + char *s2; + + msg = NULL; + + child = mworker_proc_new(); + if (!child) { + ha_alert("out of memory while trying to allocate a worker process structure."); + err = -1; + goto out; + } + + while ((subtoken = strtok_r(token, ";", &s2))) { + + token = NULL; + + if (strncmp(subtoken, "type=", 5) == 0) { + char type; + + type = *(subtoken+5); + if (type == 'm') { /* we are in the master, assign it */ + proc_self = child; + child->options |= PROC_O_TYPE_MASTER; + } else if (type == 'e') { + child->options |= PROC_O_TYPE_PROG; + } else if (type == 'w') { + child->options |= PROC_O_TYPE_WORKER; + } + + } else if (strncmp(subtoken, "fd=", 3) == 0) { + child->ipc_fd[0] = atoi(subtoken+3); + if (child->ipc_fd[0] > -1) + global.maxsock++; + } else if (strncmp(subtoken, "cfd=", 4) == 0) { + child->ipc_fd[1] = atoi(subtoken+4); + if (child->ipc_fd[1] > -1) + global.maxsock++; + } else if (strncmp(subtoken, "pid=", 4) == 0) { + child->pid = atoi(subtoken+4); + } else if (strncmp(subtoken, "reloads=", 8) == 0) { + /* we only increment the number of asked reload */ + child->reloads = atoi(subtoken+8); + + if (child->reloads < minreloads) + minreloads = child->reloads; + } else if (strncmp(subtoken, "failedreloads=", 14) == 0) { + child->failedreloads = atoi(subtoken+14); + } else if (strncmp(subtoken, "timestamp=", 10) == 0) { + child->timestamp = atoi(subtoken+10); + } else if (strncmp(subtoken, "id=", 3) == 0) { + child->id = strdup(subtoken+3); + } else if (strncmp(subtoken, "version=", 8) == 0) { + child->version = strdup(subtoken+8); + } + } + if (child->pid) { + LIST_APPEND(&proc_list, &child->list); + } else { + mworker_free_child(child); + } + } + + /* set the leaving processes once we know which number of reloads are the current processes */ + + list_for_each_entry(child, &proc_list, list) { + if (child->reloads > minreloads) + child->options |= PROC_O_LEAVING; + } + + unsetenv("HAPROXY_PROCESSES"); + +no_env: + + if (!proc_self) { + + proc_self = mworker_proc_new(); + if (!proc_self) { + ha_alert("Cannot allocate process structures.\n"); + err = -1; + goto out; + } + proc_self->options |= PROC_O_TYPE_MASTER; + proc_self->pid = pid; + proc_self->timestamp = 0; /* we don't know the startime anymore */ + + LIST_APPEND(&proc_list, &proc_self->list); + ha_warning("The master internals are corrupted or it was started with a 
too old version (< 1.9). Please restart the master process.\n"); + } + +out: + free(omsg); + return err; +} + +/* Signal blocking and unblocking */ + +void mworker_block_signals() +{ + sigset_t set; + + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + sigaddset(&set, SIGUSR2); + sigaddset(&set, SIGTTIN); + sigaddset(&set, SIGTTOU); + sigaddset(&set, SIGHUP); + sigaddset(&set, SIGCHLD); + ha_sigmask(SIG_SETMASK, &set, NULL); +} + +void mworker_unblock_signals() +{ + haproxy_unblock_signals(); +} + +/* ----- mworker signal handlers ----- */ + +/* broadcast the configured signal to the workers */ +void mworker_broadcast_signal(struct sig_handler *sh) +{ + mworker_kill(sh->arg); +} + +/* + * When called, this function reexec haproxy with -sf followed by current + * children PIDs and possibly old children PIDs if they didn't leave yet. + */ +void mworker_catch_sighup(struct sig_handler *sh) +{ + mworker_reload(0); +} + +void mworker_catch_sigterm(struct sig_handler *sh) +{ + int sig = sh->arg; + +#if defined(USE_SYSTEMD) + if (global.tune.options & GTUNE_USE_SYSTEMD) { + sd_notify(0, "STOPPING=1"); + } +#endif + ha_warning("Exiting Master process...\n"); + mworker_kill(sig); +} + +/* + * Wait for every children to exit + */ + +void mworker_catch_sigchld(struct sig_handler *sh) +{ + int exitpid = -1; + int status = 0; + int childfound; + +restart_wait: + + childfound = 0; + + exitpid = waitpid(-1, &status, WNOHANG); + if (exitpid > 0) { + struct mworker_proc *child, *it; + + if (WIFEXITED(status)) + status = WEXITSTATUS(status); + else if (WIFSIGNALED(status)) + status = 128 + WTERMSIG(status); + else if (WIFSTOPPED(status)) + status = 128 + WSTOPSIG(status); + else + status = 255; + + /* delete the child from the process list */ + list_for_each_entry_safe(child, it, &proc_list, list) { + if (child->pid != exitpid) + continue; + + LIST_DELETE(&child->list); + close(child->ipc_fd[0]); + childfound = 1; + break; + } + + if (!childfound) { + /* We didn't find the PID in the list, that shouldn't happen but we can emit a warning */ + ha_warning("Process %d exited with code %d (%s)\n", exitpid, status, (status >= 128) ? strsignal(status - 128) : "Exit"); + } else { + /* check if exited child is a current child */ + if (!(child->options & PROC_O_LEAVING)) { + if (child->options & PROC_O_TYPE_WORKER) { + if (status < 128) + ha_warning("Current worker (%d) exited with code %d (%s)\n", exitpid, status, "Exit"); + else + ha_alert("Current worker (%d) exited with code %d (%s)\n", exitpid, status, strsignal(status - 128)); + } + else if (child->options & PROC_O_TYPE_PROG) + ha_alert("Current program '%s' (%d) exited with code %d (%s)\n", child->id, exitpid, status, (status >= 128) ? 
strsignal(status - 128) : "Exit"); + + if (status != 0 && status != 130 && status != 143) { + if (child->options & PROC_O_TYPE_WORKER) { + ha_warning("A worker process unexpectedly died and this can only be explained by a bug in haproxy or its dependencies.\nPlease check that you are running an up to date and maintained version of haproxy and open a bug report.\n"); + display_version(); + } + if (!(global.tune.options & GTUNE_NOEXIT_ONFAILURE)) { + ha_alert("exit-on-failure: killing every processes with SIGTERM\n"); + mworker_kill(SIGTERM); + } + } + /* 0 & SIGTERM (143) are normal, but we should report SIGINT (130) and other signals */ + if (exitcode < 0 && status != 0 && status != 143) + exitcode = status; + } else { + if (child->options & PROC_O_TYPE_WORKER) { + ha_warning("Former worker (%d) exited with code %d (%s)\n", exitpid, status, (status >= 128) ? strsignal(status - 128) : "Exit"); + delete_oldpid(exitpid); + } else if (child->options & PROC_O_TYPE_PROG) { + ha_warning("Former program '%s' (%d) exited with code %d (%s)\n", child->id, exitpid, status, (status >= 128) ? strsignal(status - 128) : "Exit"); + } + } + mworker_free_child(child); + child = NULL; + } + + /* do it again to check if it was the last worker */ + goto restart_wait; + } + /* Better rely on the system than on a list of process to check if it was the last one */ + else if (exitpid == -1 && errno == ECHILD) { + ha_warning("All workers exited. Exiting... (%d)\n", (exitcode > 0) ? exitcode : EXIT_SUCCESS); + atexit_flag = 0; + if (exitcode > 0) + exit(exitcode); /* parent must leave using the status code that provoked the exit */ + exit(EXIT_SUCCESS); + } + +} + +/* ----- IPC FD (sockpair) related ----- */ + +/* This wrapper is called from the workers. It is registered instead of the + * normal listener_accept() so the worker can exit() when it detects that the + * master closed the IPC FD. If it's not a close, we just call the regular + * listener_accept() function. + */ +void mworker_accept_wrapper(int fd) +{ + char c; + int ret; + + while (1) { + ret = recv(fd, &c, 1, MSG_PEEK); + if (ret == -1) { + if (errno == EINTR) + continue; + if (errno == EAGAIN || errno == EWOULDBLOCK) { + fd_cant_recv(fd); + return; + } + break; + } else if (ret > 0) { + struct listener *l = fdtab[fd].owner; + + if (l) + listener_accept(l); + return; + } else if (ret == 0) { + /* At this step the master is down before + * this worker perform a 'normal' exit. + * So we want to exit with an error but + * other threads could currently process + * some stuff so we can't perform a clean + * deinit(). + */ + exit(EXIT_FAILURE); + } + } + return; +} + +/* + * This function registers the accept wrapper for the sockpair of the master + * worker. It's only handled by worker thread #0. Other threads and master do + * nothing here. It always returns 1 (success). 
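+ *
+ * Informal summary of the MSG_PEEK trick used above:
+ *
+ *     ret = recv(fd, &c, 1, MSG_PEEK);
+ *     // ret  > 0 : real CLI data pending, run listener_accept()
+ *     // ret == 0 : the master closed its end, the worker must exit
+ *     // ret  < 0 : EINTR means retry, EAGAIN means wait for next event
+ *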
+ */ +static int mworker_sockpair_register_per_thread() +{ + if (!(global.mode & MODE_MWORKER) || master) + return 1; + + if (tid != 0) + return 1; + + if (proc_self->ipc_fd[1] < 0) /* proc_self was incomplete and we can't find the socketpair */ + return 1; + + fd_set_nonblock(proc_self->ipc_fd[1]); + /* register the wrapper to handle read 0 when the master exits */ + fdtab[proc_self->ipc_fd[1]].iocb = mworker_accept_wrapper; + fd_want_recv(proc_self->ipc_fd[1]); + return 1; +} + +REGISTER_PER_THREAD_INIT(mworker_sockpair_register_per_thread); + +/* ----- proxies ----- */ +/* + * Upon a reload, the master worker needs to close all listeners FDs but the mworker_pipe + * fd, and the FD provided by fd@ + */ +void mworker_cleanlisteners() +{ + struct listener *l, *l_next; + struct proxy *curproxy; + struct peers *curpeers; + + /* peers proxies cleanup */ + for (curpeers = cfg_peers; curpeers; curpeers = curpeers->next) { + if (!curpeers->peers_fe) + continue; + + stop_proxy(curpeers->peers_fe); + /* disable this peer section so that it kills itself */ + if (curpeers->sighandler) + signal_unregister_handler(curpeers->sighandler); + task_destroy(curpeers->sync_task); + curpeers->sync_task = NULL; + curpeers->peers_fe = NULL; + } + + /* main proxies cleanup */ + for (curproxy = proxies_list; curproxy; curproxy = curproxy->next) { + int listen_in_master = 0; + + list_for_each_entry_safe(l, l_next, &curproxy->conf.listeners, by_fe) { + /* remove the listener, but not those we need in the master... */ + if (!(l->rx.flags & RX_F_MWORKER)) { + unbind_listener(l); + delete_listener(l); + } else { + listen_in_master = 1; + } + } + /* if the proxy shouldn't be in the master, we stop it */ + if (!listen_in_master) + curproxy->flags |= PR_FL_DISABLED; + } +} + +/* Upon a configuration loading error some mworker_proc and FDs/server were + * assigned but the worker was never forked, we must close the FDs and + * remove the server + */ +void mworker_cleanup_proc() +{ + struct mworker_proc *child, *it; + + list_for_each_entry_safe(child, it, &proc_list, list) { + + if (child->pid == -1) { + /* Close the socketpairs. 
*/ + if (child->ipc_fd[0] > -1) + close(child->ipc_fd[0]); + if (child->ipc_fd[1] > -1) + close(child->ipc_fd[1]); + if (child->srv) { + /* only exists if we created a master CLI listener */ + srv_drop(child->srv); + } + LIST_DELETE(&child->list); + mworker_free_child(child); + } + } +} + + +/* Displays workers and processes */ +static int cli_io_handler_show_proc(struct appctx *appctx) +{ + struct stconn *sc = appctx_sc(appctx); + struct mworker_proc *child; + int old = 0; + int up = date.tv_sec - proc_self->timestamp; + char *uptime = NULL; + char *reloadtxt = NULL; + + /* FIXME: Don't watch the other side !*/ + if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE)) + return 1; + + if (up < 0) /* must never be negative because of clock drift */ + up = 0; + + chunk_reset(&trash); + + memprintf(&reloadtxt, "%d [failed: %d]", proc_self->reloads, proc_self->failedreloads); + chunk_printf(&trash, "#%-14s %-15s %-15s %-15s %-15s\n", "<PID>", "<type>", "<reloads>", "<uptime>", "<version>"); + memprintf(&uptime, "%dd%02dh%02dm%02ds", up / 86400, (up % 86400) / 3600, (up % 3600) / 60, (up % 60)); + chunk_appendf(&trash, "%-15u %-15s %-15s %-15s %-15s\n", (unsigned int)getpid(), "master", reloadtxt, uptime, haproxy_version); + ha_free(&reloadtxt); + ha_free(&uptime); + + /* displays current processes */ + + chunk_appendf(&trash, "# workers\n"); + list_for_each_entry(child, &proc_list, list) { + up = date.tv_sec - child->timestamp; + if (up < 0) /* must never be negative because of clock drift */ + up = 0; + + if (!(child->options & PROC_O_TYPE_WORKER)) + continue; + + if (child->options & PROC_O_LEAVING) { + old++; + continue; + } + memprintf(&uptime, "%dd%02dh%02dm%02ds", up / 86400, (up % 86400) / 3600, (up % 3600) / 60, (up % 60)); + chunk_appendf(&trash, "%-15u %-15s %-15d %-15s %-15s\n", child->pid, "worker", child->reloads, uptime, child->version); + ha_free(&uptime); + } + + /* displays old processes */ + + if (old) { + char *msg = NULL; + + chunk_appendf(&trash, "# old workers\n"); + list_for_each_entry(child, &proc_list, list) { + up = date.tv_sec - child->timestamp; + if (up <= 0) /* must never be negative because of clock drift */ + up = 0; + + if (!(child->options & PROC_O_TYPE_WORKER)) + continue; + + if (child->options & PROC_O_LEAVING) { + memprintf(&uptime, "%dd%02dh%02dm%02ds", up / 86400, (up % 86400) / 3600, (up % 3600) / 60, (up % 60)); + chunk_appendf(&trash, "%-15u %-15s %-15d %-15s %-15s\n", child->pid, "worker", child->reloads, uptime, child->version); + ha_free(&uptime); + } + } + free(msg); + } + + /* displays external process */ + chunk_appendf(&trash, "# programs\n"); + old = 0; + list_for_each_entry(child, &proc_list, list) { + up = date.tv_sec - child->timestamp; + if (up < 0) /* must never be negative because of clock drift */ + up = 0; + + if (!(child->options & PROC_O_TYPE_PROG)) + continue; + + if (child->options & PROC_O_LEAVING) { + old++; + continue; + } + memprintf(&uptime, "%dd%02dh%02dm%02ds", up / 86400, (up % 86400) / 3600, (up % 3600) / 60, (up % 60)); + chunk_appendf(&trash, "%-15u %-15s %-15d %-15s %-15s\n", child->pid, child->id, child->reloads, uptime, "-"); + ha_free(&uptime); + } + + if (old) { + chunk_appendf(&trash, "# old programs\n"); + list_for_each_entry(child, &proc_list, list) { + up = date.tv_sec - child->timestamp; + if (up < 0) /* must never be negative because of clock drift */ + up = 0; + + if (!(child->options & PROC_O_TYPE_PROG)) + continue; + + if (child->options & PROC_O_LEAVING) { + memprintf(&uptime, "%dd%02dh%02dm%02ds", up / 
86400, (up % 86400) / 3600, (up % 3600) / 60, (up % 60)); + chunk_appendf(&trash, "%-15u %-15s %-15d %-15s %-15s\n", child->pid, child->id, child->reloads, uptime, "-"); + ha_free(&uptime); + } + } + } + + + + if (applet_putchk(appctx, &trash) == -1) + return 0; + + /* dump complete */ + return 1; +} + +/* reload the master process */ +static int cli_parse_reload(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct stconn *scb = NULL; + struct stream *strm = NULL; + struct connection *conn = NULL; + int fd = -1; + int hardreload = 0; + + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return 1; + + /* hard reload requested */ + if (*args[0] == 'h') + hardreload = 1; + + /* This ask for a synchronous reload, which means we will keep this FD + instead of closing it. */ + + scb = appctx_sc(appctx); + if (scb) + strm = sc_strm(scb); + if (strm && strm->scf) + conn = sc_conn(strm->scf); + if (conn) + fd = conn_fd(conn); + + /* Send the FD of the current session to the "cli_reload" FD, which won't be polled */ + if (fd != -1 && send_fd_uxst(proc_self->ipc_fd[0], fd) == 0) { + fd_delete(fd); /* avoid the leak of the FD after sending it via the socketpair */ + } + mworker_reload(hardreload); + + return 1; +} + +/* Displays if the current reload failed or succeed. + * If the startup-logs is available, dump it. */ +static int cli_io_handler_show_loadstatus(struct appctx *appctx) +{ + char *env; + struct stconn *sc = appctx_sc(appctx); + + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return 1; + + /* FIXME: Don't watch the other side !*/ + if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE)) + return 1; + + env = getenv("HAPROXY_LOAD_SUCCESS"); + if (!env) + return 1; + + if (strcmp(env, "0") == 0) { + chunk_printf(&trash, "Success=0\n"); + } else if (strcmp(env, "1") == 0) { + chunk_printf(&trash, "Success=1\n"); + } +#ifdef USE_SHM_OPEN + if (startup_logs && b_data(&startup_logs->buf) > 1) + chunk_appendf(&trash, "--\n"); + + if (applet_putchk(appctx, &trash) == -1) + return 0; + + if (startup_logs) { + appctx->io_handler = NULL; + ring_attach_cli(startup_logs, appctx, 0); + return 0; + } +#else + if (applet_putchk(appctx, &trash) == -1) + return 0; +#endif + return 1; +} + +static int mworker_parse_global_max_reloads(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int linenum, char **err) +{ + + int err_code = 0; + + if (alertif_too_many_args(1, file, linenum, args, &err_code)) + goto out; + + if (*(args[1]) == 0) { + memprintf(err, "%sparsing [%s:%d] : '%s' expects an integer argument.\n", *err, file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + max_reloads = atol(args[1]); + if (max_reloads < 0) { + memprintf(err, "%sparsing [%s:%d] '%s' : invalid value %d, must be >= 0", *err, file, linenum, args[0], max_reloads); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + +out: + return err_code; +} + +void mworker_free_child(struct mworker_proc *child) +{ + int i; + + if (child == NULL) + return; + + for (i = 0; child->command && child->command[i]; i++) + ha_free(&child->command[i]); + + ha_free(&child->command); + ha_free(&child->id); + ha_free(&child->version); + free(child); +} + +static struct cfg_kw_list mworker_kws = {{ }, { + { CFG_GLOBAL, "mworker-max-reloads", mworker_parse_global_max_reloads }, + { 0, NULL, NULL }, +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &mworker_kws); + + +/* register cli keywords */ +static struct cli_kw_list cli_kws = {{ },{ + { { "@<relative 
pid>", NULL }, "@<relative pid> : send a command to the <relative pid> process", NULL, cli_io_handler_show_proc, NULL, NULL, ACCESS_MASTER_ONLY}, + { { "@!<pid>", NULL }, "@!<pid> : send a command to the <pid> process", cli_parse_default, NULL, NULL, NULL, ACCESS_MASTER_ONLY}, + { { "@master", NULL }, "@master : send a command to the master process", cli_parse_default, NULL, NULL, NULL, ACCESS_MASTER_ONLY}, + { { "show", "proc", NULL }, "show proc : show processes status", cli_parse_default, cli_io_handler_show_proc, NULL, NULL, ACCESS_MASTER_ONLY}, + { { "reload", NULL }, "reload : achieve a soft-reload (-sf) of haproxy", cli_parse_reload, NULL, NULL, NULL, ACCESS_MASTER_ONLY}, + { { "hard-reload", NULL }, "hard-reload : achieve a hard-reload (-st) of haproxy", cli_parse_reload, NULL, NULL, NULL, ACCESS_MASTER_ONLY}, + { { "_loadstatus", NULL }, NULL, cli_parse_default, cli_io_handler_show_loadstatus, NULL, NULL, ACCESS_MASTER_ONLY}, + {{},} +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); diff --git a/src/namespace.c b/src/namespace.c new file mode 100644 index 0000000..9cc85a3 --- /dev/null +++ b/src/namespace.c @@ -0,0 +1,132 @@ +#define _GNU_SOURCE + +#include <sched.h> +#include <stdio.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/types.h> +#include <unistd.h> +#include <sys/socket.h> + +#include <string.h> + +#include <haproxy/api.h> +#include <haproxy/chunk.h> +#include <haproxy/errors.h> +#include <haproxy/global.h> +#include <haproxy/hash.h> +#include <haproxy/namespace.h> +#include <haproxy/signal.h> + +/* Opens the namespace <ns_name> and returns the FD or -1 in case of error + * (check errno). + */ +static int open_named_namespace(const char *ns_name) +{ + if (chunk_printf(&trash, "/var/run/netns/%s", ns_name) < 0) + return -1; + return open(trash.area, O_RDONLY | O_CLOEXEC); +} + +static int default_namespace = -1; + +static int init_default_namespace() +{ + if (chunk_printf(&trash, "/proc/%d/ns/net", getpid()) < 0) + return -1; + default_namespace = open(trash.area, O_RDONLY | O_CLOEXEC); + return default_namespace; +} + +static struct eb_root namespace_tree_root = EB_ROOT; + +static void netns_sig_stop(struct sig_handler *sh) +{ + struct ebpt_node *node, *next; + struct netns_entry *entry; + + /* close namespace file descriptors and remove registered namespaces from the + * tree when stopping */ + node = ebpt_first(&namespace_tree_root); + while (node) { + next = ebpt_next(node); + ebpt_delete(node); + entry = container_of(node, struct netns_entry, node); + free(entry->node.key); + close(entry->fd); + free(entry); + node = next; + } +} + +int netns_init(void) +{ + int err_code = 0; + + /* if no namespaces have been defined in the config then + * there is no point in trying to initialize anything: + * my_socketat() will never be called with a valid namespace + * structure and thus switching back to the default namespace + * is not needed either */ + if (!eb_is_empty(&namespace_tree_root)) { + if (init_default_namespace() < 0) { + ha_alert("Failed to open the default namespace.\n"); + err_code |= ERR_ALERT | ERR_FATAL; + } + } + + signal_register_fct(0, netns_sig_stop, 0); + + return err_code; +} + +struct netns_entry* netns_store_insert(const char *ns_name) +{ + struct netns_entry *entry = NULL; + int fd = open_named_namespace(ns_name); + if (fd == -1) + goto out; + + entry = calloc(1, sizeof(*entry)); + if (!entry) + goto out; + entry->fd = fd; + entry->node.key = strdup(ns_name); + entry->name_len = strlen(ns_name); + 
ebis_insert(&namespace_tree_root, &entry->node); +out: + return entry; +} + +const struct netns_entry* netns_store_lookup(const char *ns_name, size_t ns_name_len) +{ + struct ebpt_node *node; + + node = ebis_lookup_len(&namespace_tree_root, ns_name, ns_name_len); + if (node) + return ebpt_entry(node, struct netns_entry, node); + else + return NULL; +} + +/* Opens a socket in the namespace described by <ns> with the parameters <domain>, + * <type> and <protocol> and returns the FD or -1 in case of error (check errno). + */ +int my_socketat(const struct netns_entry *ns, int domain, int type, int protocol) +{ + int sock; + + if (default_namespace >= 0 && ns && setns(ns->fd, CLONE_NEWNET) == -1) + return -1; + + sock = socket(domain, type, protocol); + + if (default_namespace >= 0 && ns && setns(default_namespace, CLONE_NEWNET) == -1) { + if (sock >= 0) + close(sock); + return -1; + } + return sock; +} + +REGISTER_BUILD_OPTS("Built with network namespace support."); diff --git a/src/ncbuf.c b/src/ncbuf.c new file mode 100644 index 0000000..e1452f1 --- /dev/null +++ b/src/ncbuf.c @@ -0,0 +1,986 @@ +#include <haproxy/ncbuf.h> + +#include <string.h> + +#ifndef MIN +#define MIN(a, b) (((a) < (b)) ? (a) : (b)) +#endif + +#ifdef STANDALONE +#include <stdarg.h> +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> + +#include <haproxy/list.h> +#endif /* STANDALONE */ + +#ifdef DEBUG_STRICT +# include <haproxy/bug.h> +#else +# include <stdio.h> +# include <stdlib.h> + +# undef BUG_ON +# define BUG_ON(x) if (x) { fprintf(stderr, "CRASH ON %s:%d\n", __func__, __LINE__); abort(); } + +# undef BUG_ON_HOT +# define BUG_ON_HOT(x) if (x) { fprintf(stderr, "CRASH ON %s:%d\n", __func__, __LINE__); abort(); } +#endif /* DEBUG_DEV */ + +#include <haproxy/compiler.h> + +/* ******** internal API ******** */ + +#define NCB_BLK_NULL ((struct ncb_blk){ .st = NULL }) + +#define NCB_BK_F_GAP 0x01 /* block represents a gap */ +#define NCB_BK_F_FIN 0x02 /* special reduced gap present at the end of the buffer */ +struct ncb_blk { + char *st; /* first byte of the block */ + char *end; /* first byte after this block */ + + char *sz_ptr; /* pointer to size element - NULL for reduced gap */ + ncb_sz_t sz; /* size of the block */ + ncb_sz_t sz_data; /* size of the data following the block - invalid for reduced GAP */ + ncb_sz_t off; /* offset of block in buffer */ + + char flag; +}; + +/* Return pointer to <off> relative to <buf> head. Support buffer wrapping. */ +static char *ncb_peek(const struct ncbuf *buf, ncb_sz_t off) +{ + char *ptr = ncb_head(buf) + off; + if (ptr >= buf->area + buf->size) + ptr -= buf->size; + return ptr; +} + +/* Returns the reserved space of <buf> which contains the size of the first + * data block. + */ +static char *ncb_reserved(const struct ncbuf *buf) +{ + return ncb_peek(buf, buf->size - NCB_RESERVED_SZ); +} + +/* Encode <off> at <st> position in <buf>. Support wrapping. */ +static forceinline void ncb_write_off(const struct ncbuf *buf, char *st, ncb_sz_t off) +{ + int i; + + BUG_ON_HOT(st >= buf->area + buf->size); + + for (i = 0; i < sizeof(ncb_sz_t); ++i) { + (*st) = off >> (8 * i) & 0xff; + + if ((++st) == ncb_wrap(buf)) + st = ncb_orig(buf); + } +} + +/* Decode offset stored at <st> position in <buf>. Support wrapping. 
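my_socketat() is what lets listeners and outgoing connections be created inside a configured network namespace. A minimal usage sketch, assuming the configuration referenced a namespace named "blue" so that netns_store_insert("blue") ran at parsing time and netns_init() opened the default namespace:

/* Sketch: create a TCP socket inside the "blue" namespace. On error,
 * -1 is returned and errno is set by open()/setns()/socket().
 */
static int socket_in_blue(void)
{
    const struct netns_entry *ns = netns_store_lookup("blue", 4);

    if (!ns)
        return -1;
    return my_socketat(ns, AF_INET, SOCK_STREAM, 0);
}

Note that my_socketat() moves the calling thread into the target namespace only for the duration of the socket() call, then switches back, which is why both setns() calls are checked.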
*/ +static forceinline ncb_sz_t ncb_read_off(const struct ncbuf *buf, char *st) +{ + int i; + ncb_sz_t off = 0; + + BUG_ON_HOT(st >= buf->area + buf->size); + + for (i = 0; i < sizeof(ncb_sz_t); ++i) { + off |= (unsigned char )(*st) << (8 * i); + + if ((++st) == ncb_wrap(buf)) + st = ncb_orig(buf); + } + + return off; +} + +/* Add <off> to the offset stored at <st> in <buf>. Support wrapping. */ +static forceinline void ncb_inc_off(const struct ncbuf *buf, char *st, ncb_sz_t off) +{ + const ncb_sz_t old = ncb_read_off(buf, st); + ncb_write_off(buf, st, old + off); +} + +/* Returns true if a gap cannot be inserted at <off> : a reduced gap must be used. */ +static forceinline int ncb_off_reduced(const struct ncbuf *b, ncb_sz_t off) +{ + return off + NCB_GAP_MIN_SZ > ncb_size(b); +} + +/* Returns true if <blk> is the special NULL block. */ +static forceinline int ncb_blk_is_null(const struct ncb_blk *blk) +{ + return !blk->st; +} + +/* Returns true if <blk> is the last block of <buf>. */ +static forceinline int ncb_blk_is_last(const struct ncbuf *buf, const struct ncb_blk *blk) +{ + BUG_ON_HOT(blk->off + blk->sz > ncb_size(buf)); + return blk->off + blk->sz == ncb_size(buf); +} + +/* Returns the first block of <buf> which is always a DATA. */ +static struct ncb_blk ncb_blk_first(const struct ncbuf *buf) +{ + struct ncb_blk blk; + + if (ncb_is_null(buf)) + return NCB_BLK_NULL; + + blk.st = ncb_head(buf); + + blk.sz_ptr = ncb_reserved(buf); + blk.sz = ncb_read_off(buf, ncb_reserved(buf)); + blk.sz_data = 0; + BUG_ON_HOT(blk.sz > ncb_size(buf)); + + blk.end = ncb_peek(buf, blk.sz); + blk.off = 0; + blk.flag = 0; + + return blk; +} + +/* Returns the block following <prev> in the buffer <buf>. */ +static struct ncb_blk ncb_blk_next(const struct ncbuf *buf, + const struct ncb_blk *prev) +{ + struct ncb_blk blk; + + BUG_ON_HOT(ncb_blk_is_null(prev)); + + if (ncb_blk_is_last(buf, prev)) + return NCB_BLK_NULL; + + blk.st = prev->end; + blk.off = prev->off + prev->sz; + blk.flag = ~prev->flag & NCB_BK_F_GAP; + + if (blk.flag & NCB_BK_F_GAP) { + if (ncb_off_reduced(buf, blk.off)) { + blk.flag |= NCB_BK_F_FIN; + blk.sz_ptr = NULL; + blk.sz = ncb_size(buf) - blk.off; + blk.sz_data = 0; + + /* A reduced gap can only be the last block. */ + BUG_ON_HOT(!ncb_blk_is_last(buf, &blk)); + } + else { + blk.sz_ptr = ncb_peek(buf, blk.off + NCB_GAP_SZ_OFF); + blk.sz = ncb_read_off(buf, blk.sz_ptr); + blk.sz_data = ncb_read_off(buf, ncb_peek(buf, blk.off + NCB_GAP_SZ_DATA_OFF)); + BUG_ON_HOT(blk.sz < NCB_GAP_MIN_SZ); + } + } + else { + blk.sz_ptr = ncb_peek(buf, prev->off + NCB_GAP_SZ_DATA_OFF); + blk.sz = prev->sz_data; + blk.sz_data = 0; + + /* only first DATA block can be empty. If this happens, a GAP + * merge should have been realized. + */ + BUG_ON_HOT(!blk.sz); + } + + BUG_ON_HOT(blk.off + blk.sz > ncb_size(buf)); + blk.end = ncb_peek(buf, blk.off + blk.sz); + + return blk; +} + +/* Returns the block containing offset <off>. Note that if <off> is at the + * frontier between two blocks, this function will return the preceding one. + * This is done to easily merge blocks on insertion/deletion. + */ +static struct ncb_blk ncb_blk_find(const struct ncbuf *buf, ncb_sz_t off) +{ + struct ncb_blk blk; + + if (ncb_is_null(buf)) + return NCB_BLK_NULL; + + BUG_ON_HOT(off >= ncb_size(buf)); + + for (blk = ncb_blk_first(buf); off > blk.off + blk.sz; + blk = ncb_blk_next(buf, &blk)) { + } + + return blk; +} + +/* Transform absolute offset <off> to a relative one from <blk> start. 
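The byte-by-byte encoding above is what allows block headers to sit anywhere in the circular storage, including across the wrap point. A self-contained round-trip sketch of the same little-endian scheme, with a plain unsigned int standing in for ncb_sz_t:

#include <assert.h>
#include <stddef.h>

typedef unsigned int sz_t; /* stand-in for ncb_sz_t */

/* Write <v> little-endian at <pos>, wrapping at <size>, then read it
 * back, mirroring ncb_write_off()/ncb_read_off().
 */
static void roundtrip(unsigned char *area, size_t size, size_t pos, sz_t v)
{
    sz_t r = 0;
    size_t i;

    for (i = 0; i < sizeof(sz_t); i++)
        area[(pos + i) % size] = (v >> (8 * i)) & 0xff;
    for (i = 0; i < sizeof(sz_t); i++)
        r |= (sz_t)area[(pos + i) % size] << (8 * i);
    assert(r == v);
}

The real functions compare pointers against ncb_wrap() instead of using a modulo, but the stored layout is the same.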
*/ +static forceinline ncb_sz_t ncb_blk_off(const struct ncb_blk *blk, ncb_sz_t off) +{ + BUG_ON_HOT(off < blk->off || off > blk->off + blk->sz); + BUG_ON_HOT(off - blk->off > blk->sz); + return off - blk->off; +} + +/* Simulate insertion in <buf> of <data> of length <len> at offset <off>. This + * ensures that minimal block size are respected for newly formed gaps. <blk> + * must be the block where the insert operation begins. If <mode> is + * NCB_ADD_COMPARE, old and new overlapped data are compared to validate the + * insertion. + * + * Returns NCB_RET_OK if insertion can proceed. + */ +static enum ncb_ret ncb_check_insert(const struct ncbuf *buf, + const struct ncb_blk *blk, ncb_sz_t off, + const char *data, ncb_sz_t len, + enum ncb_add_mode mode) +{ + struct ncb_blk next; + ncb_sz_t off_blk = ncb_blk_off(blk, off); + ncb_sz_t to_copy; + ncb_sz_t left = len; + + /* If insertion starts in a gap, it must leave enough space to keep the + * gap header. + */ + if (left && (blk->flag & NCB_BK_F_GAP)) { + if (off_blk < NCB_GAP_MIN_SZ) + return NCB_RET_GAP_SIZE; + } + + next = *blk; + while (left) { + off_blk = ncb_blk_off(&next, off); + to_copy = MIN(left, next.sz - off_blk); + + if (next.flag & NCB_BK_F_GAP && off_blk + to_copy < next.sz) { + /* Insertion must leave enough space for a new gap + * header if stopped in a middle of a gap. + */ + const ncb_sz_t gap_sz = next.sz - (off_blk + to_copy); + if (gap_sz < NCB_GAP_MIN_SZ && !ncb_blk_is_last(buf, &next)) + return NCB_RET_GAP_SIZE; + } + else if (!(next.flag & NCB_BK_F_GAP) && mode == NCB_ADD_COMPARE) { + /* Compare memory of data block in NCB_ADD_COMPARE mode. */ + const ncb_sz_t off_blk = ncb_blk_off(&next, off); + char *st = ncb_peek(buf, off); + + to_copy = MIN(left, next.sz - off_blk); + if (st + to_copy > ncb_wrap(buf)) { + const ncb_sz_t sz1 = ncb_wrap(buf) - st; + if (memcmp(st, data, sz1)) + return NCB_RET_DATA_REJ; + if (memcmp(ncb_orig(buf), data + sz1, to_copy - sz1)) + return NCB_RET_DATA_REJ; + } + else { + if (memcmp(st, data, to_copy)) + return NCB_RET_DATA_REJ; + } + } + + left -= to_copy; + data += to_copy; + off += to_copy; + + next = ncb_blk_next(buf, &next); + } + + return NCB_RET_OK; +} + +/* Fill new <data> of length <len> inside an already existing data <blk> at + * offset <off>. Offset is relative to <blk> so it cannot be greater than the + * block size. <mode> specifies if old data are preserved or overwritten. + */ +static ncb_sz_t ncb_fill_data_blk(const struct ncbuf *buf, + const struct ncb_blk *blk, ncb_sz_t off, + const char *data, ncb_sz_t len, + enum ncb_add_mode mode) +{ + const ncb_sz_t to_copy = MIN(len, blk->sz - off); + char *ptr = NULL; + + BUG_ON_HOT(off > blk->sz); + /* This can happens due to previous ncb_blk_find() usage. In this + * case the current fill is a noop. + */ + if (off == blk->sz) + return 0; + + if (mode == NCB_ADD_OVERWRT) { + ptr = ncb_peek(buf, blk->off + off); + + if (ptr + to_copy >= ncb_wrap(buf)) { + const ncb_sz_t sz1 = ncb_wrap(buf) - ptr; + memcpy(ptr, data, sz1); + memcpy(ncb_orig(buf), data + sz1, to_copy - sz1); + } + else { + memcpy(ptr, data, to_copy); + } + } + + return to_copy; +} + +/* Fill the gap <blk> starting at <off> with new <data> of length <len>. <off> + * is relative to <blk> so it cannot be greater than the block size. 
+ */ +static ncb_sz_t ncb_fill_gap_blk(const struct ncbuf *buf, + const struct ncb_blk *blk, ncb_sz_t off, + const char *data, ncb_sz_t len) +{ + const ncb_sz_t to_copy = MIN(len, blk->sz - off); + char *ptr; + + BUG_ON_HOT(off > blk->sz); + /* This can happens due to previous ncb_blk_find() usage. In this + * case the current fill is a noop. + */ + if (off == blk->sz) + return 0; + + /* A new gap must be created if insertion stopped before gap end. */ + if (off + to_copy < blk->sz) { + const ncb_sz_t gap_off = blk->off + off + to_copy; + const ncb_sz_t gap_sz = blk->sz - off - to_copy; + + BUG_ON_HOT(!ncb_off_reduced(buf, gap_off) && + blk->off + blk->sz - gap_off < NCB_GAP_MIN_SZ); + + /* write the new gap header unless this is a reduced gap. */ + if (!ncb_off_reduced(buf, gap_off)) { + char *gap_ptr = ncb_peek(buf, gap_off + NCB_GAP_SZ_OFF); + char *gap_data_ptr = ncb_peek(buf, gap_off + NCB_GAP_SZ_DATA_OFF); + + ncb_write_off(buf, gap_ptr, gap_sz); + ncb_write_off(buf, gap_data_ptr, blk->sz_data); + } + } + + /* fill the gap with new data */ + ptr = ncb_peek(buf, blk->off + off); + if (ptr + to_copy >= ncb_wrap(buf)) { + ncb_sz_t sz1 = ncb_wrap(buf) - ptr; + memcpy(ptr, data, sz1); + memcpy(ncb_orig(buf), data + sz1, to_copy - sz1); + } + else { + memcpy(ptr, data, to_copy); + } + + return to_copy; +} + +/* ******** public API ******** */ + +/* Initialize or reset <buf> by clearing all data. Its size is untouched. + * Buffer is positioned to <head> offset. Use 0 to realign it. <buf> must not + * be NCBUF_NULL. + */ +void ncb_init(struct ncbuf *buf, ncb_sz_t head) +{ + BUG_ON_HOT(ncb_is_null(buf)); + + BUG_ON_HOT(head >= buf->size); + buf->head = head; + + ncb_write_off(buf, ncb_reserved(buf), 0); + ncb_write_off(buf, ncb_head(buf), ncb_size(buf)); + ncb_write_off(buf, ncb_peek(buf, sizeof(ncb_sz_t)), 0); +} + +/* Construct a ncbuf with all its parameters. */ +struct ncbuf ncb_make(char *area, ncb_sz_t size, ncb_sz_t head) +{ + struct ncbuf buf; + + /* Ensure that there is enough space for the reserved space and data. + * This is the minimal value to not crash later. + */ + BUG_ON_HOT(size <= NCB_RESERVED_SZ); + + buf.area = area; + buf.size = size; + buf.head = head; + + return buf; +} + +/* Returns the total number of bytes stored in whole <buf>. */ +ncb_sz_t ncb_total_data(const struct ncbuf *buf) +{ + struct ncb_blk blk; + int total = 0; + + for (blk = ncb_blk_first(buf); !ncb_blk_is_null(&blk); blk = ncb_blk_next(buf, &blk)) { + if (!(blk.flag & NCB_BK_F_GAP)) + total += blk.sz; + } + + return total; +} + +/* Returns true if there is no data anywhere in <buf>. */ +int ncb_is_empty(const struct ncbuf *buf) +{ + int first_data, first_gap; + + if (ncb_is_null(buf)) + return 1; + + first_data = ncb_read_off(buf, ncb_reserved(buf)); + BUG_ON_HOT(first_data > ncb_size(buf)); + /* Buffer is not empty if first data block is not nul. */ + if (first_data) + return 0; + + /* Head contains the first gap size if first data block is empty. */ + first_gap = ncb_read_off(buf, ncb_head(buf)); + BUG_ON_HOT(first_gap > ncb_size(buf)); + return first_gap == ncb_size(buf); +} + +/* Returns true if no more data can be inserted in <buf>. */ +int ncb_is_full(const struct ncbuf *buf) +{ + int first_data; + + if (ncb_is_null(buf)) + return 0; + + /* First data block must cover whole buffer if full. */ + first_data = ncb_read_off(buf, ncb_reserved(buf)); + BUG_ON_HOT(first_data > ncb_size(buf)); + return first_data == ncb_size(buf); +} + +/* Returns true if <buf> contains data fragmented by gaps. 
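A minimal setup sketch for the public API above: wrap a plain array with ncb_make(), then clear it with ncb_init(). The 256-byte size is arbitrary but must exceed NCB_RESERVED_SZ, as enforced by the BUG_ON_HOT() in ncb_make().

#include <haproxy/ncbuf.h>

static char area[256];

static void setup(void)
{
    struct ncbuf buf = ncb_make(area, sizeof(area), 0);

    ncb_init(&buf, 0); /* no data, head at offset 0 */
    /* at this point ncb_is_empty(&buf) is true and
     * ncb_total_data(&buf) == 0
     */
}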
*/
+int ncb_is_fragmented(const struct ncbuf *buf)
+{
+ struct ncb_blk data, gap;
+
+ if (ncb_is_null(buf))
+ return 0;
+
+ /* check if buffer is empty or full */
+ if (ncb_is_empty(buf) || ncb_is_full(buf))
+ return 0;
+
+ /* check that following gap is the last block */
+ data = ncb_blk_first(buf);
+ gap = ncb_blk_next(buf, &data);
+ return !ncb_blk_is_last(buf, &gap);
+}
+
+/* Returns the number of bytes of data available in <buf> starting at offset
+ * <off> until the next gap or the buffer end. The counted data may wrap if
+ * the buffer storage is not aligned.
+ */
+ncb_sz_t ncb_data(const struct ncbuf *buf, ncb_sz_t off)
+{
+ struct ncb_blk blk;
+ ncb_sz_t off_blk;
+
+ if (ncb_is_null(buf))
+ return 0;
+
+ blk = ncb_blk_find(buf, off);
+ off_blk = ncb_blk_off(&blk, off);
+
+ /* if <off> is at the frontier between two blocks and <blk> is a gap,
+ * retrieve the next data block.
+ */
+ if (blk.flag & NCB_BK_F_GAP && off_blk == blk.sz &&
+ !ncb_blk_is_last(buf, &blk)) {
+ blk = ncb_blk_next(buf, &blk);
+ off_blk = ncb_blk_off(&blk, off);
+ }
+
+ if (blk.flag & NCB_BK_F_GAP)
+ return 0;
+
+ return blk.sz - off_blk;
+}
+
+/* Add a new block at <data> of size <len> in <buf> at offset <off>.
+ *
+ * Returns NCB_RET_OK on success. On error the following codes are returned :
+ * - NCB_RET_GAP_SIZE : cannot add data because the gap formed is too small
+ * - NCB_RET_DATA_REJ : old data would be overwritten by different ones in
+ * NCB_ADD_COMPARE mode.
+ */
+enum ncb_ret ncb_add(struct ncbuf *buf, ncb_sz_t off,
+ const char *data, ncb_sz_t len, enum ncb_add_mode mode)
+{
+ struct ncb_blk blk;
+ ncb_sz_t left = len;
+ enum ncb_ret ret;
+ char *new_sz;
+
+ if (!len)
+ return NCB_RET_OK;
+
+ BUG_ON_HOT(off + len > ncb_size(buf));
+
+ /* Get block where insertion begins. */
+ blk = ncb_blk_find(buf, off);
+
+ /* Check if insertion is possible. */
+ ret = ncb_check_insert(buf, &blk, off, data, len, mode);
+ if (ret != NCB_RET_OK)
+ return ret;
+
+ if (blk.flag & NCB_BK_F_GAP) {
+ /* Reduce gap size if insertion begins in a gap. Gap data size
+ * is reset and will be recalculated during insertion.
+ */
+ const ncb_sz_t gap_sz = off - blk.off;
+ BUG_ON_HOT(gap_sz < NCB_GAP_MIN_SZ);
+
+ /* pointer to data size to increase. */
+ new_sz = ncb_peek(buf, blk.off + NCB_GAP_SZ_DATA_OFF);
+
+ ncb_write_off(buf, blk.sz_ptr, gap_sz);
+ ncb_write_off(buf, new_sz, 0);
+ }
+ else {
+ /* pointer to data size to increase. */
+ new_sz = blk.sz_ptr;
+ }
+
+ /* insert data */
+ while (left) {
+ struct ncb_blk next;
+ const ncb_sz_t off_blk = ncb_blk_off(&blk, off);
+ ncb_sz_t done;
+
+ /* retrieve the next block. It is necessary to do this
+ * before overwriting a gap.
+ */
+ next = ncb_blk_next(buf, &blk);
+
+ if (blk.flag & NCB_BK_F_GAP) {
+ done = ncb_fill_gap_blk(buf, &blk, off_blk, data, left);
+
+ /* update the inserted data block size */
+ if (off + done == blk.off + blk.sz) {
+ /* merge next data block if insertion reached gap end */
+ ncb_inc_off(buf, new_sz, done + blk.sz_data);
+ }
+ else {
+ /* insertion stopped before gap end */
+ ncb_inc_off(buf, new_sz, done);
+ }
+ }
+ else {
+ done = ncb_fill_data_blk(buf, &blk, off_blk, data, left, mode);
+ }
+
+ BUG_ON_HOT(done > blk.sz || done > left);
+ left -= done;
+ data += done;
+ off += done;
+
+ blk = next;
+ }
+
+ return NCB_RET_OK;
+}
+
+/* Advance the head of <buf> to the offset <adv>. Data at the start of the
+ * buffer will be lost while some space will be formed at the end to be
+ * able to insert new data.
+ *
+ * Returns NCB_RET_OK on success.
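A usage sketch of ncb_add() showing its three outcomes, modelled on the unit tests further down. It assumes NCB_GAP_MIN_SZ is larger than the 2-byte hole left by the second call (it is 8 bytes with a 32-bit ncb_sz_t, but treat that as an assumption):

static void add_demo(struct ncbuf *buf)
{
    char zeros[16] = { 0 };

    /* plain insert into an empty buffer */
    ncb_add(buf, 0, zeros, 8, NCB_ADD_PRESERVE);   /* NCB_RET_OK */

    /* would leave a 2-byte gap over [8, 10[ : too small to hold
     * a gap header
     */
    ncb_add(buf, 10, zeros, 8, NCB_ADD_PRESERVE);  /* NCB_RET_GAP_SIZE */

    /* overlapping insert whose bytes differ from the stored zeros */
    ncb_add(buf, 4, "\xff\xff\xff\xff", 4, NCB_ADD_COMPARE); /* NCB_RET_DATA_REJ */
}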
It may return NCB_RET_GAP_SIZE if operation + * is rejected due to the formation of a too small gap in front. If advance is + * done only inside a data block it is guaranteed to succeed. + */ +enum ncb_ret ncb_advance(struct ncbuf *buf, ncb_sz_t adv) +{ + struct ncb_blk start, last; + ncb_sz_t off_blk; + ncb_sz_t first_data_sz; + + BUG_ON_HOT(adv > ncb_size(buf)); + if (!adv) + return NCB_RET_OK; + + /* Special case if adv is full size. This is equivalent to a reset. */ + if (adv == ncb_size(buf)) { + ncb_init(buf, buf->head); + return NCB_RET_OK; + } + + start = ncb_blk_find(buf, adv); + + /* Special case if advance until the last block which is a GAP. The + * buffer will be left empty and is thus equivalent to a reset. + */ + if (ncb_blk_is_last(buf, &start) && (start.flag & NCB_BK_F_GAP)) { + ncb_sz_t new_head = buf->head + adv; + if (new_head >= buf->size) + new_head -= buf->size; + + ncb_init(buf, new_head); + return NCB_RET_OK; + } + + last = start; + while (!ncb_blk_is_last(buf, &last)) + last = ncb_blk_next(buf, &last); + + off_blk = ncb_blk_off(&start, adv); + + if (start.flag & NCB_BK_F_GAP) { + /* If advance in a GAP, its new size must be big enough. */ + if (start.sz == off_blk) { + /* GAP removed. Buffer will start with following DATA block. */ + first_data_sz = start.sz_data; + } + else if (start.sz - off_blk < NCB_GAP_MIN_SZ) { + return NCB_RET_GAP_SIZE; + } + else { + /* Buffer will start with this GAP block. */ + first_data_sz = 0; + } + } + else { + /* If off_blk less than start.sz, the data block will becomes the + * first block. If equal, the data block is completely removed + * and thus the following GAP will be the first block. + */ + first_data_sz = start.sz - off_blk; + } + + if (last.flag & NCB_BK_F_GAP) { + /* Extend last GAP unless this is a reduced gap. */ + if (!(last.flag & NCB_BK_F_FIN) || last.sz + adv >= NCB_GAP_MIN_SZ) { + /* use .st instead of .sz_ptr which can be NULL if reduced gap */ + ncb_write_off(buf, last.st, last.sz + adv); + ncb_write_off(buf, ncb_peek(buf, last.off + NCB_GAP_SZ_DATA_OFF), 0); + } + } + else { + /* Insert a GAP after the last DATA block. */ + if (adv >= NCB_GAP_MIN_SZ) { + ncb_write_off(buf, ncb_peek(buf, last.off + last.sz + NCB_GAP_SZ_OFF), adv); + ncb_write_off(buf, ncb_peek(buf, last.off + last.sz + NCB_GAP_SZ_DATA_OFF), 0); + } + } + + /* Advance head and update reserved header with new first data size. */ + buf->head += adv; + if (buf->head >= buf->size) + buf->head -= buf->size; + ncb_write_off(buf, ncb_reserved(buf), first_data_sz); + + /* If advance in a GAP, reduce its size. */ + if (start.flag & NCB_BK_F_GAP && !first_data_sz) { + ncb_write_off(buf, ncb_head(buf), start.sz - off_blk); + /* Recopy the block sz_data at the new position. */ + ncb_write_off(buf, ncb_peek(buf, NCB_GAP_SZ_DATA_OFF), start.sz_data); + } + + return NCB_RET_OK; +} + +/* ******** testing API ******** */ +/* To build it : + * gcc -Wall -DSTANDALONE -lasan -I./include -o ncbuf src/ncbuf.c + */ +#ifdef STANDALONE + +int ncb_print = 0; + +static void ncbuf_printf(char *str, ...) 
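The semantics of ncb_advance() are easiest to see on a small example, again mirroring the advance test suite below: a leading gap may only shrink if what remains of it can still hold a gap header.

static void advance_demo(struct ncbuf *buf)
{
    char zeros[10] = { 0 };

    ncb_add(buf, 10, zeros, 10, NCB_ADD_PRESERVE); /* gap [0,10[, data [10,20[ */

    ncb_advance(buf, 2); /* NCB_RET_OK: front gap shrinks to 8 bytes */
    ncb_advance(buf, 1); /* NCB_RET_GAP_SIZE: a 7-byte gap would remain,
                          * assumed smaller than NCB_GAP_MIN_SZ
                          */
    ncb_advance(buf, 8); /* NCB_RET_OK: the gap is consumed entirely */
}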
+{ + va_list args; + + va_start(args, str); + if (ncb_print) + vfprintf(stderr, str, args); + va_end(args); +} + +struct rand_off { + struct list el; + ncb_sz_t off; + ncb_sz_t len; +}; + +static struct rand_off *ncb_generate_rand_off(const struct ncbuf *buf) +{ + struct rand_off *roff; + roff = calloc(1, sizeof(*roff)); + BUG_ON(!roff); + + roff->off = rand() % (ncb_size(buf)); + if (roff->off > 0 && roff->off < NCB_GAP_MIN_SZ) + roff->off = 0; + + roff->len = rand() % (ncb_size(buf) - roff->off + 1); + + return roff; +} + +static void ncb_print_blk(const struct ncb_blk *blk) +{ + if (ncb_print) { + fprintf(stderr, "%s(%s): %2u/%u.\n", + blk->flag & NCB_BK_F_GAP ? "GAP " : "DATA", + blk->flag & NCB_BK_F_FIN ? "F" : "-", blk->off, blk->sz); + } +} + +static int ncb_is_null_blk(const struct ncb_blk *blk) +{ + return !blk->st; +} + +static void ncb_loop(const struct ncbuf *buf) +{ + struct ncb_blk blk; + + blk = ncb_blk_first(buf); + do { + ncb_print_blk(&blk); + blk = ncb_blk_next(buf, &blk); + } while (!ncb_is_null_blk(&blk)); + + ncbuf_printf("\n"); +} + +static void ncbuf_print_buf(struct ncbuf *b, ncb_sz_t len, + unsigned char *area, int line) +{ + int i; + + ncbuf_printf("buffer status at line %d\n", line); + for (i = 0; i < len; ++i) { + ncbuf_printf("%02x.", area[i]); + if (i && i % 32 == 31) ncbuf_printf("\n"); + else if (i && i % 8 == 7) ncbuf_printf(" "); + } + ncbuf_printf("\n"); + + ncb_loop(b); + + if (ncb_print) + getchar(); +} + +static struct ncbuf b; +static unsigned char *bufarea = NULL; +static ncb_sz_t bufsize = 16384; +static ncb_sz_t bufhead = 15; + +#define NCB_INIT(buf) \ + if ((reset)) { memset(bufarea, 0xaa, bufsize); } \ + ncb_init(buf, bufhead); \ + ncbuf_print_buf(&b, bufsize, bufarea, __LINE__); + +#define NCB_ADD_EQ(buf, off, data, sz, mode, ret) \ + BUG_ON(ncb_add((buf), (off), (data), (sz), (mode)) != (ret)); \ + ncbuf_print_buf(buf, bufsize, bufarea, __LINE__); + +#define NCB_ADD_NEQ(buf, off, data, sz, mode, ret) \ + BUG_ON(ncb_add((buf), (off), (data), (sz), (mode)) == (ret)); \ + ncbuf_print_buf(buf, bufsize, bufarea, __LINE__); + +#define NCB_ADVANCE_EQ(buf, off, ret) \ + BUG_ON(ncb_advance((buf), (off)) != (ret)); \ + ncbuf_print_buf(buf, bufsize, bufarea, __LINE__); + +#define NCB_TOTAL_DATA_EQ(buf, data) \ + BUG_ON(ncb_total_data((buf)) != (data)); + +#define NCB_DATA_EQ(buf, off, data) \ + BUG_ON(ncb_data((buf), (off)) != (data)); + +static int ncbuf_test(ncb_sz_t head, int reset, int print_delay) +{ + char *data0, data1[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }; + struct list list = LIST_HEAD_INIT(list); + struct rand_off *roff, *roff_tmp; + enum ncb_ret ret; + + data0 = malloc(bufsize); + BUG_ON(!data0); + memset(data0, 0xff, bufsize); + + bufarea = malloc(bufsize); + BUG_ON(!bufarea); + + fprintf(stderr, "running unit tests\n"); + + b = NCBUF_NULL; + BUG_ON(!ncb_is_null(&b)); + NCB_DATA_EQ(&b, 0, 0); + NCB_TOTAL_DATA_EQ(&b, 0); + BUG_ON(ncb_size(&b) != 0); + BUG_ON(!ncb_is_empty(&b)); + BUG_ON(ncb_is_full(&b)); + BUG_ON(ncb_is_fragmented(&b)); + + b.area = (char *)bufarea; + b.size = bufsize; + b.head = head; + NCB_INIT(&b); + + /* insertion test suite */ + NCB_INIT(&b); + NCB_DATA_EQ(&b, 0, 0); NCB_DATA_EQ(&b, bufsize - NCB_RESERVED_SZ - 1, 0); /* first and last offset */ + NCB_ADD_EQ(&b, 24, data0, 9, NCB_ADD_PRESERVE, NCB_RET_OK); NCB_DATA_EQ(&b, 24, 9); + /* insert new data at the same offset as old */ + NCB_ADD_EQ(&b, 24, data0, 16, NCB_ADD_PRESERVE, NCB_RET_OK); NCB_DATA_EQ(&b, 
24, 16); + + NCB_INIT(&b); NCB_DATA_EQ(&b, 0, 0); + NCB_ADD_EQ(&b, 0, data0, 16, NCB_ADD_PRESERVE, NCB_RET_OK); NCB_DATA_EQ(&b, 0, 16); + BUG_ON(ncb_is_fragmented(&b)); + NCB_ADD_EQ(&b, 24, data0, 16, NCB_ADD_PRESERVE, NCB_RET_OK); NCB_DATA_EQ(&b, 0, 16); + BUG_ON(!ncb_is_fragmented(&b)); + /* insert data overlapping two data blocks and a gap */ + NCB_ADD_EQ(&b, 12, data0, 16, NCB_ADD_PRESERVE, NCB_RET_OK); NCB_DATA_EQ(&b, 0, 40); + BUG_ON(ncb_is_fragmented(&b)); + + NCB_INIT(&b); + NCB_ADD_EQ(&b, 32, data0, 16, NCB_ADD_PRESERVE, NCB_RET_OK); NCB_DATA_EQ(&b, 0, 0); NCB_DATA_EQ(&b, 16, 0); NCB_DATA_EQ(&b, 32, 16); + BUG_ON(!ncb_is_fragmented(&b)); + NCB_ADD_EQ(&b, 0, data0, 16, NCB_ADD_PRESERVE, NCB_RET_OK); NCB_DATA_EQ(&b, 0, 16); NCB_DATA_EQ(&b, 16, 0); NCB_DATA_EQ(&b, 32, 16); + BUG_ON(!ncb_is_fragmented(&b)); + /* insert data to exactly cover a gap between two data blocks */ + NCB_ADD_EQ(&b, 16, data0, 16, NCB_ADD_PRESERVE, NCB_RET_OK); NCB_DATA_EQ(&b, 0, 48); NCB_DATA_EQ(&b, 16, 32); NCB_DATA_EQ(&b, 32, 16); + BUG_ON(ncb_is_fragmented(&b)); + + NCB_INIT(&b); + NCB_ADD_EQ(&b, 0, data0, 8, NCB_ADD_PRESERVE, NCB_RET_OK); + /* this insertion must be rejected because of minimal gap size */ + NCB_ADD_EQ(&b, 10, data0, 8, NCB_ADD_PRESERVE, NCB_RET_GAP_SIZE); + + /* Test reduced gap support */ + NCB_INIT(&b); + /* this insertion will form a reduced gap */ + NCB_ADD_EQ(&b, 0, data0, bufsize - (NCB_GAP_MIN_SZ - 1), NCB_ADD_COMPARE, NCB_RET_OK); + + /* Test the various insertion mode */ + NCB_INIT(&b); + NCB_ADD_EQ(&b, 10, data1, 16, NCB_ADD_PRESERVE, NCB_RET_OK); + NCB_ADD_EQ(&b, 12, data1, 16, NCB_ADD_COMPARE, NCB_RET_DATA_REJ); + NCB_ADD_EQ(&b, 12, data1, 16, NCB_ADD_PRESERVE, NCB_RET_OK); BUG_ON(*ncb_peek(&b, 12) != data1[2]); + NCB_ADD_EQ(&b, 12, data1, 16, NCB_ADD_OVERWRT, NCB_RET_OK); BUG_ON(*ncb_peek(&b, 12) == data1[2]); + + /* advance test suite */ + NCB_INIT(&b); + NCB_ADVANCE_EQ(&b, 10, NCB_RET_OK); /* advance in an empty buffer; this ensures we do not leave an empty DATA in the middle of the buffer */ + NCB_ADVANCE_EQ(&b, ncb_size(&b) - 2, NCB_RET_OK); + + NCB_INIT(&b); + /* first fill the buffer */ + NCB_ADD_EQ(&b, 0, data0, bufsize - NCB_RESERVED_SZ, NCB_ADD_COMPARE, NCB_RET_OK); + /* delete 2 bytes : a reduced gap must be created */ + NCB_ADVANCE_EQ(&b, 2, NCB_RET_OK); NCB_TOTAL_DATA_EQ(&b, ncb_size(&b) - 2); + /* delete 1 byte : extend the reduced gap */ + NCB_ADVANCE_EQ(&b, 1, NCB_RET_OK); NCB_TOTAL_DATA_EQ(&b, ncb_size(&b) - 3); + /* delete 5 bytes : a full gap must be present */ + NCB_ADVANCE_EQ(&b, 5, NCB_RET_OK); NCB_TOTAL_DATA_EQ(&b, ncb_size(&b) - 8); + /* completely clear the buffer */ + NCB_ADVANCE_EQ(&b, bufsize - NCB_RESERVED_SZ, NCB_RET_OK); NCB_TOTAL_DATA_EQ(&b, 0); + + + NCB_INIT(&b); + NCB_ADD_EQ(&b, 10, data0, 10, NCB_ADD_PRESERVE, NCB_RET_OK); + NCB_ADVANCE_EQ(&b, 2, NCB_RET_OK); /* reduce a gap in front of the buffer */ + NCB_ADVANCE_EQ(&b, 1, NCB_RET_GAP_SIZE); /* reject */ + NCB_ADVANCE_EQ(&b, 8, NCB_RET_OK); /* remove completely the gap */ + NCB_ADVANCE_EQ(&b, 8, NCB_RET_OK); /* remove inside the data */ + NCB_ADVANCE_EQ(&b, 10, NCB_RET_OK); /* remove completely the data */ + + fprintf(stderr, "first random pass\n"); + NCB_INIT(&b); + + /* generate randon data offsets until the buffer is full */ + while (!ncb_is_full(&b)) { + roff = ncb_generate_rand_off(&b); + LIST_INSERT(&list, &roff->el); + + ret = ncb_add(&b, roff->off, data0, roff->len, NCB_ADD_COMPARE); + BUG_ON(ret == NCB_RET_DATA_REJ); + ncbuf_print_buf(&b, bufsize, bufarea, __LINE__); + 
usleep(print_delay); + } + + fprintf(stderr, "buf full, prepare for reverse random\n"); + ncbuf_print_buf(&b, bufsize, bufarea, __LINE__); + + /* insert the previously generated random offsets in the reverse order. + * At the end, the buffer should be full. + */ + NCB_INIT(&b); + list_for_each_entry_safe(roff, roff_tmp, &list, el) { + int full = ncb_is_full(&b); + if (!full) { + ret = ncb_add(&b, roff->off, data0, roff->len, NCB_ADD_COMPARE); + BUG_ON(ret == NCB_RET_DATA_REJ); + ncbuf_print_buf(&b, bufsize, bufarea, __LINE__); + usleep(print_delay); + } + + LIST_DELETE(&roff->el); + free(roff); + } + + if (!ncb_is_full(&b)) + abort(); + + fprintf(stderr, "done\n"); + + free(bufarea); + free(data0); + + return 1; +} + +int main(int argc, char **argv) +{ + int reset = 0; + int print_delay = 100000; + char c; + + opterr = 0; + while ((c = getopt(argc, argv, "h:s:rp::")) != -1) { + switch (c) { + case 'h': + bufhead = atoi(optarg); + break; + case 's': + bufsize = atoi(optarg); + if (bufsize < 64) { + fprintf(stderr, "bufsize should be at least 64 bytes for unit test suite\n"); + exit(127); + } + break; + case 'r': + reset = 1; + break; + case 'p': + if (optarg) + print_delay = atoi(optarg); + ncb_print = 1; + break; + case '?': + default: + fprintf(stderr, "usage: %s [-r] [-s bufsize] [-h bufhead] [-p <delay_msec>]\n", argv[0]); + exit(127); + } + } + + ncbuf_test(bufhead, reset, print_delay); + return EXIT_SUCCESS; +} + +#endif /* STANDALONE */ diff --git a/src/pattern.c b/src/pattern.c new file mode 100644 index 0000000..52dda5e --- /dev/null +++ b/src/pattern.c @@ -0,0 +1,2683 @@ +/* + * Pattern management functions. + * + * Copyright 2000-2013 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <ctype.h> +#include <stdio.h> +#include <errno.h> + +#include <import/ebistree.h> +#include <import/ebpttree.h> +#include <import/ebsttree.h> +#include <import/lru.h> + +#include <haproxy/api.h> +#include <haproxy/global.h> +#include <haproxy/log.h> +#include <haproxy/net_helper.h> +#include <haproxy/pattern.h> +#include <haproxy/regex.h> +#include <haproxy/sample.h> +#include <haproxy/tools.h> +#include <haproxy/xxhash.h> + + +const char *const pat_match_names[PAT_MATCH_NUM] = { + [PAT_MATCH_FOUND] = "found", + [PAT_MATCH_BOOL] = "bool", + [PAT_MATCH_INT] = "int", + [PAT_MATCH_IP] = "ip", + [PAT_MATCH_BIN] = "bin", + [PAT_MATCH_LEN] = "len", + [PAT_MATCH_STR] = "str", + [PAT_MATCH_BEG] = "beg", + [PAT_MATCH_SUB] = "sub", + [PAT_MATCH_DIR] = "dir", + [PAT_MATCH_DOM] = "dom", + [PAT_MATCH_END] = "end", + [PAT_MATCH_REG] = "reg", + [PAT_MATCH_REGM] = "regm", +}; + +int (*const pat_parse_fcts[PAT_MATCH_NUM])(const char *, struct pattern *, int, char **) = { + [PAT_MATCH_FOUND] = pat_parse_nothing, + [PAT_MATCH_BOOL] = pat_parse_nothing, + [PAT_MATCH_INT] = pat_parse_int, + [PAT_MATCH_IP] = pat_parse_ip, + [PAT_MATCH_BIN] = pat_parse_bin, + [PAT_MATCH_LEN] = pat_parse_int, + [PAT_MATCH_STR] = pat_parse_str, + [PAT_MATCH_BEG] = pat_parse_str, + [PAT_MATCH_SUB] = pat_parse_str, + [PAT_MATCH_DIR] = pat_parse_str, + [PAT_MATCH_DOM] = pat_parse_str, + [PAT_MATCH_END] = pat_parse_str, + [PAT_MATCH_REG] = pat_parse_reg, + [PAT_MATCH_REGM] = pat_parse_reg, +}; + +int (*const pat_index_fcts[PAT_MATCH_NUM])(struct pattern_expr *, struct pattern *, char **) = { + [PAT_MATCH_FOUND] = pat_idx_list_val, + [PAT_MATCH_BOOL] = pat_idx_list_val, + [PAT_MATCH_INT] = pat_idx_list_val, + [PAT_MATCH_IP] = pat_idx_tree_ip, + [PAT_MATCH_BIN] = pat_idx_list_ptr, + [PAT_MATCH_LEN] = pat_idx_list_val, + [PAT_MATCH_STR] = pat_idx_tree_str, + [PAT_MATCH_BEG] = pat_idx_tree_pfx, + [PAT_MATCH_SUB] = pat_idx_list_str, + [PAT_MATCH_DIR] = pat_idx_list_str, + [PAT_MATCH_DOM] = pat_idx_list_str, + [PAT_MATCH_END] = pat_idx_list_str, + [PAT_MATCH_REG] = pat_idx_list_reg, + [PAT_MATCH_REGM] = pat_idx_list_regm, +}; + +void (*const pat_prune_fcts[PAT_MATCH_NUM])(struct pattern_expr *) = { + [PAT_MATCH_FOUND] = pat_prune_gen, + [PAT_MATCH_BOOL] = pat_prune_gen, + [PAT_MATCH_INT] = pat_prune_gen, + [PAT_MATCH_IP] = pat_prune_gen, + [PAT_MATCH_BIN] = pat_prune_gen, + [PAT_MATCH_LEN] = pat_prune_gen, + [PAT_MATCH_STR] = pat_prune_gen, + [PAT_MATCH_BEG] = pat_prune_gen, + [PAT_MATCH_SUB] = pat_prune_gen, + [PAT_MATCH_DIR] = pat_prune_gen, + [PAT_MATCH_DOM] = pat_prune_gen, + [PAT_MATCH_END] = pat_prune_gen, + [PAT_MATCH_REG] = pat_prune_gen, + [PAT_MATCH_REGM] = pat_prune_gen, +}; + +struct pattern *(*const pat_match_fcts[PAT_MATCH_NUM])(struct sample *, struct pattern_expr *, int) = { + [PAT_MATCH_FOUND] = NULL, + [PAT_MATCH_BOOL] = pat_match_nothing, + [PAT_MATCH_INT] = pat_match_int, + [PAT_MATCH_IP] = pat_match_ip, + [PAT_MATCH_BIN] = pat_match_bin, + [PAT_MATCH_LEN] = pat_match_len, + [PAT_MATCH_STR] = pat_match_str, + [PAT_MATCH_BEG] = pat_match_beg, + [PAT_MATCH_SUB] = pat_match_sub, + [PAT_MATCH_DIR] = pat_match_dir, + [PAT_MATCH_DOM] = pat_match_dom, + [PAT_MATCH_END] = pat_match_end, + [PAT_MATCH_REG] = pat_match_reg, + [PAT_MATCH_REGM] = pat_match_regm, +}; + +/* Just used for checking configuration compatibility */ +int const pat_match_types[PAT_MATCH_NUM] = { + [PAT_MATCH_FOUND] = SMP_T_SINT, + [PAT_MATCH_BOOL] = SMP_T_SINT, + [PAT_MATCH_INT] = SMP_T_SINT, + [PAT_MATCH_IP] = SMP_T_ADDR, + 
[PAT_MATCH_BIN] = SMP_T_BIN,
+ [PAT_MATCH_LEN] = SMP_T_STR,
+ [PAT_MATCH_STR] = SMP_T_STR,
+ [PAT_MATCH_BEG] = SMP_T_STR,
+ [PAT_MATCH_SUB] = SMP_T_STR,
+ [PAT_MATCH_DIR] = SMP_T_STR,
+ [PAT_MATCH_DOM] = SMP_T_STR,
+ [PAT_MATCH_END] = SMP_T_STR,
+ [PAT_MATCH_REG] = SMP_T_STR,
+ [PAT_MATCH_REGM] = SMP_T_STR,
+};
+
+/* this struct is used to return information */
+static THREAD_LOCAL struct pattern static_pattern;
+static THREAD_LOCAL struct sample_data static_sample_data;
+
+/* This is the root of the list of all available pattern_refs. */
+struct list pattern_reference = LIST_HEAD_INIT(pattern_reference);
+
+static THREAD_LOCAL struct lru64_head *pat_lru_tree;
+static unsigned long long pat_lru_seed __read_mostly;
+
+/*
+ *
+ * The following functions are not exported and are used by the internal
+ * processing of pattern matching
+ *
+ */
+
+/* Background: Fast way to find a zero byte in a word
+ * http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
+ * hasZeroByte = (v - 0x01010101UL) & ~v & 0x80808080UL;
+ *
+ * To look for 4 different byte values, xor the word with those bytes and
+ * then check for zero bytes:
+ *
+ * v = (((unsigned char)c * 0x1010101U) ^ delimiter)
+ * where <delimiter> is the 4 byte values to look for (as an uint)
+ * and <c> is the character that is being tested
+ */
+static inline unsigned int is_delimiter(unsigned char c, unsigned int mask)
+{
+ mask ^= (c * 0x01010101); /* propagate the char to all 4 bytes */
+ return (mask - 0x01010101) & ~mask & 0x80808080U;
+}
+
+static inline unsigned int make_4delim(unsigned char d1, unsigned char d2, unsigned char d3, unsigned char d4)
+{
+ return d1 << 24 | d2 << 16 | d3 << 8 | d4;
+}
+
+
+/*
+ *
+ * These functions are exported and may be used by any other component.
+ *
+ * The following functions are used for parsing pattern matching input values.
+ * The <text> argument contains the string to be parsed. <pattern> must be a
+ * preallocated pattern. The pat_parse_* functions fill this structure with the
+ * parsed value. <err> is filled with an error message built with the
+ * memprintf() function. It is allowed to use a trash chunk as temporary
+ * storage for the returned pattern, as the next call after these functions
+ * will be pat_idx_*.
+ *
+ * On success, the pat_parse_* functions return 1. If a function
+ * fails, it returns 0 and <err> is filled.
+ */
+
+/* ignore the current line */
+int pat_parse_nothing(const char *text, struct pattern *pattern, int mflags, char **err)
+{
+ return 1;
+}
+
+/* Parse a string. It is allocated and duplicated. */
+int pat_parse_str(const char *text, struct pattern *pattern, int mflags, char **err)
+{
+ pattern->type = SMP_T_STR;
+ pattern->ptr.str = (char *)text;
+ pattern->len = strlen(text);
+ return 1;
+}
+
+/* Parse a binary value written in hex. It is allocated. */
+int pat_parse_bin(const char *text, struct pattern *pattern, int mflags, char **err)
+{
+ struct buffer *trash;
+
+ pattern->type = SMP_T_BIN;
+ trash = get_trash_chunk();
+ pattern->len = trash->size;
+ pattern->ptr.str = trash->area;
+ return !!parse_binary(text, &pattern->ptr.str, &pattern->len, err);
+}
+
+/* Parse a regex. It is allocated. */
+int pat_parse_reg(const char *text, struct pattern *pattern, int mflags, char **err)
+{
+ pattern->ptr.str = (char *)text;
+ return 1;
+}
+
+/* Parse a range of positive integers delimited by either ':' or '-'. If only
+ * one integer is read, it is set as both min and max.
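The delimiter word can be checked quickly against the bithack comment above. A tiny self-test, assuming it is compiled in a unit where the two helpers above are visible:

#include <assert.h>

int main(void)
{
    unsigned int delims = make_4delim('/', '?', '.', ':');

    assert(is_delimiter('?', delims));  /* the xor zeroes one byte */
    assert(!is_delimiter('a', delims)); /* no byte becomes zero */
    return 0;
}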
An operator may be + * specified as the prefix, among this list of 5 : + * + * 0:eq, 1:gt, 2:ge, 3:lt, 4:le + * + * The default operator is "eq". It supports range matching. Ranges are + * rejected for other operators. The operator may be changed at any time. + * The operator is stored in the 'opaque' argument. + * + * If err is non-NULL, an error message will be returned there on errors and + * the caller will have to free it. The function returns zero on error, and + * non-zero on success. + * + */ +int pat_parse_int(const char *text, struct pattern *pattern, int mflags, char **err) +{ + const char *ptr = text; + + pattern->type = SMP_T_SINT; + + /* Empty string is not valid */ + if (!*text) + goto not_valid_range; + + /* Search ':' or '-' separator. */ + while (*ptr != '\0' && *ptr != ':' && *ptr != '-') + ptr++; + + /* If separator not found. */ + if (!*ptr) { + if (strl2llrc(text, ptr - text, &pattern->val.range.min) != 0) { + memprintf(err, "'%s' is not a number", text); + return 0; + } + pattern->val.range.max = pattern->val.range.min; + pattern->val.range.min_set = 1; + pattern->val.range.max_set = 1; + return 1; + } + + /* If the separator is the first character. */ + if (ptr == text && *(ptr + 1) != '\0') { + if (strl2llrc(ptr + 1, strlen(ptr + 1), &pattern->val.range.max) != 0) + goto not_valid_range; + + pattern->val.range.min_set = 0; + pattern->val.range.max_set = 1; + return 1; + } + + /* If separator is the last character. */ + if (*(ptr + 1) == '\0') { + if (strl2llrc(text, ptr - text, &pattern->val.range.min) != 0) + goto not_valid_range; + + pattern->val.range.min_set = 1; + pattern->val.range.max_set = 0; + return 1; + } + + /* Else, parse two numbers. */ + if (strl2llrc(text, ptr - text, &pattern->val.range.min) != 0) + goto not_valid_range; + + if (strl2llrc(ptr + 1, strlen(ptr + 1), &pattern->val.range.max) != 0) + goto not_valid_range; + + if (pattern->val.range.min > pattern->val.range.max) + goto not_valid_range; + + pattern->val.range.min_set = 1; + pattern->val.range.max_set = 1; + return 1; + + not_valid_range: + memprintf(err, "'%s' is not a valid number range", text); + return 0; +} + +/* Parse a range of positive 2-component versions delimited by either ':' or + * '-'. The version consists in a major and a minor, both of which must be + * smaller than 65536, because internally they will be represented as a 32-bit + * integer. + * If only one version is read, it is set as both min and max. Just like for + * pure integers, an operator may be specified as the prefix, among this list + * of 5 : + * + * 0:eq, 1:gt, 2:ge, 3:lt, 4:le + * + * The default operator is "eq". It supports range matching. Ranges are + * rejected for other operators. The operator may be changed at any time. + * The operator is stored in the 'opaque' argument. This allows constructs + * such as the following one : + * + * acl obsolete_ssl ssl_req_proto lt 3 + * acl unsupported_ssl ssl_req_proto gt 3.1 + * acl valid_ssl ssl_req_proto 3.0-3.1 + * + */ +int pat_parse_dotted_ver(const char *text, struct pattern *pattern, int mflags, char **err) +{ + const char *ptr = text; + + pattern->type = SMP_T_SINT; + + /* Search ':' or '-' separator. */ + while (*ptr != '\0' && *ptr != ':' && *ptr != '-') + ptr++; + + /* If separator not found. 
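The accepted forms of pat_parse_int() and the resulting min/max settings, assuming a preallocated struct pattern p and char *err = NULL (error handling omitted):

pat_parse_int("100",     &p, 0, &err); /* min = max = 100, both set */
pat_parse_int("100:200", &p, 0, &err); /* min = 100, max = 200      */
pat_parse_int(":200",    &p, 0, &err); /* only max set              */
pat_parse_int("100-",    &p, 0, &err); /* only min set              */
pat_parse_int("200:100", &p, 0, &err); /* returns 0: min > max      */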
*/ + if (*ptr == '\0' && ptr > text) { + if (strl2llrc_dotted(text, ptr-text, &pattern->val.range.min) != 0) { + memprintf(err, "'%s' is not a dotted number", text); + return 0; + } + pattern->val.range.max = pattern->val.range.min; + pattern->val.range.min_set = 1; + pattern->val.range.max_set = 1; + return 1; + } + + /* If the separator is the first character. */ + if (ptr == text && *(ptr+1) != '\0') { + if (strl2llrc_dotted(ptr+1, strlen(ptr+1), &pattern->val.range.max) != 0) { + memprintf(err, "'%s' is not a valid dotted number range", text); + return 0; + } + pattern->val.range.min_set = 0; + pattern->val.range.max_set = 1; + return 1; + } + + /* If separator is the last character. */ + if (ptr == &text[strlen(text)-1]) { + if (strl2llrc_dotted(text, ptr-text, &pattern->val.range.min) != 0) { + memprintf(err, "'%s' is not a valid dotted number range", text); + return 0; + } + pattern->val.range.min_set = 1; + pattern->val.range.max_set = 0; + return 1; + } + + /* Else, parse two numbers. */ + if (strl2llrc_dotted(text, ptr-text, &pattern->val.range.min) != 0) { + memprintf(err, "'%s' is not a valid dotted number range", text); + return 0; + } + if (strl2llrc_dotted(ptr+1, strlen(ptr+1), &pattern->val.range.max) != 0) { + memprintf(err, "'%s' is not a valid dotted number range", text); + return 0; + } + if (pattern->val.range.min > pattern->val.range.max) { + memprintf(err, "'%s' is not a valid dotted number range", text); + return 0; + } + pattern->val.range.min_set = 1; + pattern->val.range.max_set = 1; + return 1; +} + +/* Parse an IP address and an optional mask in the form addr[/mask]. + * The addr may either be an IPv4 address or a hostname. The mask + * may either be a dotted mask or a number of bits. Returns 1 if OK, + * otherwise 0. NOTE: IP address patterns are typed (IPV4/IPV6). + */ +int pat_parse_ip(const char *text, struct pattern *pattern, int mflags, char **err) +{ + if (str2net(text, !(mflags & PAT_MF_NO_DNS) && (global.mode & MODE_STARTING), + &pattern->val.ipv4.addr, &pattern->val.ipv4.mask)) { + pattern->type = SMP_T_IPV4; + return 1; + } + else if (str62net(text, &pattern->val.ipv6.addr, &pattern->val.ipv6.mask)) { + pattern->type = SMP_T_IPV6; + return 1; + } + else { + memprintf(err, "'%s' is not a valid IPv4 or IPv6 address", text); + return 0; + } +} + +/* + * + * These functions are exported and may be used by any other component. + * + * This function just takes a sample <smp> and checks if this sample matches + * with the pattern <pattern>. This function returns only PAT_MATCH or + * PAT_NOMATCH. + * + */ + +/* always return false */ +struct pattern *pat_match_nothing(struct sample *smp, struct pattern_expr *expr, int fill) +{ + if (smp->data.u.sint) { + if (fill) { + static_pattern.data = NULL; + static_pattern.ref = NULL; + static_pattern.type = 0; + static_pattern.ptr.str = NULL; + } + return &static_pattern; + } + else + return NULL; +} + + +/* NB: For two strings to be identical, it is required that their length match */ +struct pattern *pat_match_str(struct sample *smp, struct pattern_expr *expr, int fill) +{ + int icase; + struct ebmb_node *node; + struct pattern_tree *elt; + struct pattern_list *lst; + struct pattern *pattern; + struct pattern *ret = NULL; + struct lru64 *lru = NULL; + + /* Lookup a string in the expression's pattern tree. 
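Since each component fits in 16 bits, a plausible packing of the 2-component version into the 32-bit integer mentioned above is major << 16 | minor; the exact layout lives in strl2llrc_dotted() and is an assumption here. This ordering is what makes "3.10" compare greater than "3.9", unlike a string comparison:

#include <assert.h>

static unsigned int ver(unsigned int maj, unsigned int min)
{
    return (maj << 16) | min; /* assumed packing, see strl2llrc_dotted() */
}

int main(void)
{
    assert(ver(3, 0) < ver(3, 1));
    assert(ver(3, 9) < ver(3, 10)); /* strcmp() would order these the other way */
    assert(ver(3, 10) < ver(4, 0));
    return 0;
}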
*/ + if (!eb_is_empty(&expr->pattern_tree)) { + char prev = 0; + + if (smp->data.u.str.data < smp->data.u.str.size) { + /* we may have to force a trailing zero on the test pattern and + * the buffer is large enough to accommodate it. If the flag + * CONST is set, duplicate the string + */ + prev = smp->data.u.str.area[smp->data.u.str.data]; + if (prev) { + if (smp->flags & SMP_F_CONST) { + if (!smp_dup(smp)) + return NULL; + } else { + smp->data.u.str.area[smp->data.u.str.data] = '\0'; + } + } + } + else { + /* Otherwise, the sample is duplicated. A trailing zero + * is automatically added to the string. + */ + if (!smp_dup(smp)) + return NULL; + } + + node = ebst_lookup(&expr->pattern_tree, smp->data.u.str.area); + if (prev) + smp->data.u.str.area[smp->data.u.str.data] = prev; + + while (node) { + elt = ebmb_entry(node, struct pattern_tree, node); + if (elt->ref->gen_id != expr->ref->curr_gen) { + node = ebmb_next_dup(node); + continue; + } + if (fill) { + static_pattern.data = elt->data; + static_pattern.ref = elt->ref; + static_pattern.sflags = PAT_SF_TREE; + static_pattern.type = SMP_T_STR; + static_pattern.ptr.str = (char *)elt->node.key; + } + return &static_pattern; + } + } + + /* look in the list */ + if (pat_lru_tree && !LIST_ISEMPTY(&expr->patterns)) { + unsigned long long seed = pat_lru_seed ^ (long)expr; + + lru = lru64_get(XXH3(smp->data.u.str.area, smp->data.u.str.data, seed), + pat_lru_tree, expr, expr->ref->revision); + if (lru && lru->domain) { + ret = lru->data; + return ret; + } + } + + + list_for_each_entry(lst, &expr->patterns, list) { + pattern = &lst->pat; + + if (pattern->ref->gen_id != expr->ref->curr_gen) + continue; + + if (pattern->len != smp->data.u.str.data) + continue; + + icase = expr->mflags & PAT_MF_IGNORE_CASE; + if ((icase && strncasecmp(pattern->ptr.str, smp->data.u.str.area, smp->data.u.str.data) == 0) || + (!icase && strncmp(pattern->ptr.str, smp->data.u.str.area, smp->data.u.str.data) == 0)) { + ret = pattern; + break; + } + } + + if (lru) + lru64_commit(lru, ret, expr, expr->ref->revision, NULL); + + return ret; +} + +/* NB: For two binaries buf to be identical, it is required that their lengths match */ +struct pattern *pat_match_bin(struct sample *smp, struct pattern_expr *expr, int fill) +{ + struct pattern_list *lst; + struct pattern *pattern; + struct pattern *ret = NULL; + struct lru64 *lru = NULL; + + if (pat_lru_tree && !LIST_ISEMPTY(&expr->patterns)) { + unsigned long long seed = pat_lru_seed ^ (long)expr; + + lru = lru64_get(XXH3(smp->data.u.str.area, smp->data.u.str.data, seed), + pat_lru_tree, expr, expr->ref->revision); + if (lru && lru->domain) { + ret = lru->data; + return ret; + } + } + + list_for_each_entry(lst, &expr->patterns, list) { + pattern = &lst->pat; + + if (pattern->ref->gen_id != expr->ref->curr_gen) + continue; + + if (pattern->len != smp->data.u.str.data) + continue; + + if (memcmp(pattern->ptr.str, smp->data.u.str.area, smp->data.u.str.data) == 0) { + ret = pattern; + break; + } + } + + if (lru) + lru64_commit(lru, ret, expr, expr->ref->revision, NULL); + + return ret; +} + +/* Executes a regex. It temporarily changes the data to add a trailing zero, + * and restores the previous character when leaving. This function fills + * a matching array. 
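The save/restore trick used by pat_match_str() above generalizes to any lookup that needs a NUL-terminated key over a buffer that is not terminated. A generic sketch (the real code additionally duplicates the sample in the SMP_F_CONST and full-buffer cases):

#include <stddef.h>

/* Temporarily NUL-terminate <area> at <len>, run <lookup>, then restore
 * the saved byte. Requires len < size so the extra byte exists.
 */
static int lookup_with_tmp_nul(char *area, size_t len,
                               int (*lookup)(const char *))
{
    char prev = area[len];
    int ret;

    area[len] = '\0';
    ret = lookup(area);
    area[len] = prev;
    return ret;
}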
+ */ +struct pattern *pat_match_regm(struct sample *smp, struct pattern_expr *expr, int fill) +{ + struct pattern_list *lst; + struct pattern *pattern; + struct pattern *ret = NULL; + + list_for_each_entry(lst, &expr->patterns, list) { + pattern = &lst->pat; + + if (pattern->ref->gen_id != expr->ref->curr_gen) + continue; + + if (regex_exec_match2(pattern->ptr.reg, smp->data.u.str.area, smp->data.u.str.data, + MAX_MATCH, pmatch, 0)) { + ret = pattern; + smp->ctx.a[0] = pmatch; + break; + } + } + + return ret; +} + +/* Executes a regex. It temporarily changes the data to add a trailing zero, + * and restores the previous character when leaving. + */ +struct pattern *pat_match_reg(struct sample *smp, struct pattern_expr *expr, int fill) +{ + struct pattern_list *lst; + struct pattern *pattern; + struct pattern *ret = NULL; + struct lru64 *lru = NULL; + + if (pat_lru_tree && !LIST_ISEMPTY(&expr->patterns)) { + unsigned long long seed = pat_lru_seed ^ (long)expr; + + lru = lru64_get(XXH3(smp->data.u.str.area, smp->data.u.str.data, seed), + pat_lru_tree, expr, expr->ref->revision); + if (lru && lru->domain) { + ret = lru->data; + return ret; + } + } + + list_for_each_entry(lst, &expr->patterns, list) { + pattern = &lst->pat; + + if (pattern->ref->gen_id != expr->ref->curr_gen) + continue; + + if (regex_exec2(pattern->ptr.reg, smp->data.u.str.area, smp->data.u.str.data)) { + ret = pattern; + break; + } + } + + if (lru) + lru64_commit(lru, ret, expr, expr->ref->revision, NULL); + + return ret; +} + +/* Checks that the pattern matches the beginning of the tested string. */ +struct pattern *pat_match_beg(struct sample *smp, struct pattern_expr *expr, int fill) +{ + int icase; + struct ebmb_node *node; + struct pattern_tree *elt; + struct pattern_list *lst; + struct pattern *pattern; + struct pattern *ret = NULL; + struct lru64 *lru = NULL; + + /* Lookup a string in the expression's pattern tree. */ + if (!eb_is_empty(&expr->pattern_tree)) { + char prev = 0; + + if (smp->data.u.str.data < smp->data.u.str.size) { + /* we may have to force a trailing zero on the test pattern and + * the buffer is large enough to accommodate it. + */ + prev = smp->data.u.str.area[smp->data.u.str.data]; + if (prev) + smp->data.u.str.area[smp->data.u.str.data] = '\0'; + } + else { + /* Otherwise, the sample is duplicated. A trailing zero + * is automatically added to the string. 
+ */ + if (!smp_dup(smp)) + return NULL; + } + + node = ebmb_lookup_longest(&expr->pattern_tree, + smp->data.u.str.area); + if (prev) + smp->data.u.str.area[smp->data.u.str.data] = prev; + + while (node) { + elt = ebmb_entry(node, struct pattern_tree, node); + if (elt->ref->gen_id != expr->ref->curr_gen) { + node = ebmb_lookup_shorter(node); + continue; + } + if (fill) { + static_pattern.data = elt->data; + static_pattern.ref = elt->ref; + static_pattern.sflags = PAT_SF_TREE; + static_pattern.type = SMP_T_STR; + static_pattern.ptr.str = (char *)elt->node.key; + } + return &static_pattern; + } + } + + /* look in the list */ + if (pat_lru_tree && !LIST_ISEMPTY(&expr->patterns)) { + unsigned long long seed = pat_lru_seed ^ (long)expr; + + lru = lru64_get(XXH3(smp->data.u.str.area, smp->data.u.str.data, seed), + pat_lru_tree, expr, expr->ref->revision); + if (lru && lru->domain) { + ret = lru->data; + return ret; + } + } + + list_for_each_entry(lst, &expr->patterns, list) { + pattern = &lst->pat; + + if (pattern->ref->gen_id != expr->ref->curr_gen) + continue; + + if (pattern->len > smp->data.u.str.data) + continue; + + icase = expr->mflags & PAT_MF_IGNORE_CASE; + if ((icase && strncasecmp(pattern->ptr.str, smp->data.u.str.area, pattern->len) != 0) || + (!icase && strncmp(pattern->ptr.str, smp->data.u.str.area, pattern->len) != 0)) + continue; + + ret = pattern; + break; + } + + if (lru) + lru64_commit(lru, ret, expr, expr->ref->revision, NULL); + + return ret; +} + +/* Checks that the pattern matches the end of the tested string. */ +struct pattern *pat_match_end(struct sample *smp, struct pattern_expr *expr, int fill) +{ + int icase; + struct pattern_list *lst; + struct pattern *pattern; + struct pattern *ret = NULL; + struct lru64 *lru = NULL; + + if (pat_lru_tree && !LIST_ISEMPTY(&expr->patterns)) { + unsigned long long seed = pat_lru_seed ^ (long)expr; + + lru = lru64_get(XXH3(smp->data.u.str.area, smp->data.u.str.data, seed), + pat_lru_tree, expr, expr->ref->revision); + if (lru && lru->domain) { + ret = lru->data; + return ret; + } + } + + list_for_each_entry(lst, &expr->patterns, list) { + pattern = &lst->pat; + + if (pattern->ref->gen_id != expr->ref->curr_gen) + continue; + + if (pattern->len > smp->data.u.str.data) + continue; + + icase = expr->mflags & PAT_MF_IGNORE_CASE; + if ((icase && strncasecmp(pattern->ptr.str, smp->data.u.str.area + smp->data.u.str.data - pattern->len, pattern->len) != 0) || + (!icase && strncmp(pattern->ptr.str, smp->data.u.str.area + smp->data.u.str.data - pattern->len, pattern->len) != 0)) + continue; + + ret = pattern; + break; + } + + if (lru) + lru64_commit(lru, ret, expr, expr->ref->revision, NULL); + + return ret; +} + +/* Checks that the pattern is included inside the tested string. + * NB: Suboptimal, should be rewritten using a Boyer-Moore method. 
+ */ +struct pattern *pat_match_sub(struct sample *smp, struct pattern_expr *expr, int fill) +{ + int icase; + char *end; + char *c; + struct pattern_list *lst; + struct pattern *pattern; + struct pattern *ret = NULL; + struct lru64 *lru = NULL; + + if (pat_lru_tree && !LIST_ISEMPTY(&expr->patterns)) { + unsigned long long seed = pat_lru_seed ^ (long)expr; + + lru = lru64_get(XXH3(smp->data.u.str.area, smp->data.u.str.data, seed), + pat_lru_tree, expr, expr->ref->revision); + if (lru && lru->domain) { + ret = lru->data; + return ret; + } + } + + list_for_each_entry(lst, &expr->patterns, list) { + pattern = &lst->pat; + + if (pattern->ref->gen_id != expr->ref->curr_gen) + continue; + + if (pattern->len > smp->data.u.str.data) + continue; + + end = smp->data.u.str.area + smp->data.u.str.data - pattern->len; + icase = expr->mflags & PAT_MF_IGNORE_CASE; + if (icase) { + for (c = smp->data.u.str.area; c <= end; c++) { + if (tolower((unsigned char)*c) != tolower((unsigned char)*pattern->ptr.str)) + continue; + if (strncasecmp(pattern->ptr.str, c, pattern->len) == 0) { + ret = pattern; + goto leave; + } + } + } else { + for (c = smp->data.u.str.area; c <= end; c++) { + if (*c != *pattern->ptr.str) + continue; + if (strncmp(pattern->ptr.str, c, pattern->len) == 0) { + ret = pattern; + goto leave; + } + } + } + } + leave: + if (lru) + lru64_commit(lru, ret, expr, expr->ref->revision, NULL); + + return ret; +} + +/* This one is used by other real functions. It checks that the pattern is + * included inside the tested string, but enclosed between the specified + * delimiters or at the beginning or end of the string. The delimiters are + * provided as an unsigned int made by make_4delim() and match up to 4 different + * delimiters. Delimiters are stripped at the beginning and end of the pattern. + */ +static int match_word(struct sample *smp, struct pattern *pattern, int mflags, unsigned int delimiters) +{ + int may_match, icase; + char *c, *end; + char *ps; + int pl; + + pl = pattern->len; + ps = pattern->ptr.str; + + while (pl > 0 && is_delimiter(*ps, delimiters)) { + pl--; + ps++; + } + + while (pl > 0 && is_delimiter(ps[pl - 1], delimiters)) + pl--; + + if (pl > smp->data.u.str.data) + return PAT_NOMATCH; + + may_match = 1; + icase = mflags & PAT_MF_IGNORE_CASE; + end = smp->data.u.str.area + smp->data.u.str.data - pl; + for (c = smp->data.u.str.area; c <= end; c++) { + if (is_delimiter(*c, delimiters)) { + may_match = 1; + continue; + } + + if (!may_match) + continue; + + if (icase) { + if ((tolower((unsigned char)*c) == tolower((unsigned char)*ps)) && + (strncasecmp(ps, c, pl) == 0) && + (c == end || is_delimiter(c[pl], delimiters))) + return PAT_MATCH; + } else { + if ((*c == *ps) && + (strncmp(ps, c, pl) == 0) && + (c == end || is_delimiter(c[pl], delimiters))) + return PAT_MATCH; + } + may_match = 0; + } + return PAT_NOMATCH; +} + +/* Checks that the pattern is included inside the tested string, but enclosed + * between the delimiters '?' or '/' or at the beginning or end of the string. + * Delimiters at the beginning or end of the pattern are ignored. 
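A simplified, case-sensitive sketch of the word matching implemented by match_word() above, with the delimiter set passed as a plain string instead of the packed 4-byte word, and without the delimiter stripping applied to the pattern:

#include <string.h>

static int word_match(const char *str, const char *word, const char *delims)
{
    size_t wl = strlen(word);
    const char *c;

    for (c = str; *c; c++) {
        if (c != str && !strchr(delims, c[-1]))
            continue;                    /* not at a word boundary */
        if (strncmp(c, word, wl) != 0)
            continue;
        if (c[wl] == '\0' || strchr(delims, c[wl]))
            return 1;                    /* bounded on the right too */
    }
    return 0;
}

/* With the "dir" delimiters: word_match("/img/logo.png", "img", "/?") == 1
 * but word_match("/images/x", "img", "/?") == 0.
 */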
+ */ +struct pattern *pat_match_dir(struct sample *smp, struct pattern_expr *expr, int fill) +{ + struct pattern_list *lst; + struct pattern *pattern; + + list_for_each_entry(lst, &expr->patterns, list) { + pattern = &lst->pat; + + if (pattern->ref->gen_id != expr->ref->curr_gen) + continue; + + if (match_word(smp, pattern, expr->mflags, make_4delim('/', '?', '?', '?'))) + return pattern; + } + return NULL; +} + +/* Checks that the pattern is included inside the tested string, but enclosed + * between the delmiters '/', '?', '.' or ":" or at the beginning or end of + * the string. Delimiters at the beginning or end of the pattern are ignored. + */ +struct pattern *pat_match_dom(struct sample *smp, struct pattern_expr *expr, int fill) +{ + struct pattern_list *lst; + struct pattern *pattern; + + list_for_each_entry(lst, &expr->patterns, list) { + pattern = &lst->pat; + + if (pattern->ref->gen_id != expr->ref->curr_gen) + continue; + + if (match_word(smp, pattern, expr->mflags, make_4delim('/', '?', '.', ':'))) + return pattern; + } + return NULL; +} + +/* Checks that the integer in <test> is included between min and max */ +struct pattern *pat_match_int(struct sample *smp, struct pattern_expr *expr, int fill) +{ + struct pattern_list *lst; + struct pattern *pattern; + + list_for_each_entry(lst, &expr->patterns, list) { + pattern = &lst->pat; + + if (pattern->ref->gen_id != expr->ref->curr_gen) + continue; + + if ((!pattern->val.range.min_set || pattern->val.range.min <= smp->data.u.sint) && + (!pattern->val.range.max_set || smp->data.u.sint <= pattern->val.range.max)) + return pattern; + } + return NULL; +} + +/* Checks that the length of the pattern in <test> is included between min and max */ +struct pattern *pat_match_len(struct sample *smp, struct pattern_expr *expr, int fill) +{ + struct pattern_list *lst; + struct pattern *pattern; + + list_for_each_entry(lst, &expr->patterns, list) { + pattern = &lst->pat; + + if (pattern->ref->gen_id != expr->ref->curr_gen) + continue; + + if ((!pattern->val.range.min_set || pattern->val.range.min <= smp->data.u.str.data) && + (!pattern->val.range.max_set || smp->data.u.str.data <= pattern->val.range.max)) + return pattern; + } + return NULL; +} + +/* Performs ipv4 key lookup in <expr> ipv4 tree + * Returns NULL on failure + */ +static struct pattern *_pat_match_tree_ipv4(struct in_addr *key, struct pattern_expr *expr, int fill) +{ + struct ebmb_node *node; + struct pattern_tree *elt; + + /* Lookup an IPv4 address in the expression's pattern tree using + * the longest match method. + */ + node = ebmb_lookup_longest(&expr->pattern_tree, key); + while (node) { + elt = ebmb_entry(node, struct pattern_tree, node); + if (elt->ref->gen_id != expr->ref->curr_gen) { + node = ebmb_lookup_shorter(node); + continue; + } + if (fill) { + static_pattern.data = elt->data; + static_pattern.ref = elt->ref; + static_pattern.sflags = PAT_SF_TREE; + static_pattern.type = SMP_T_IPV4; + static_pattern.val.ipv4.addr.s_addr = read_u32(elt->node.key); + if (!cidr2dotted(elt->node.node.pfx, &static_pattern.val.ipv4.mask)) + return NULL; + } + return &static_pattern; + } + return NULL; +} + +/* Performs ipv6 key lookup in <expr> ipv6 tree + * Returns NULL on failure + */ +static struct pattern *_pat_match_tree_ipv6(struct in6_addr *key, struct pattern_expr *expr, int fill) +{ + struct ebmb_node *node; + struct pattern_tree *elt; + + /* Lookup an IPv6 address in the expression's pattern tree using + * the longest match method. 
+ */ + node = ebmb_lookup_longest(&expr->pattern_tree_2, key); + while (node) { + elt = ebmb_entry(node, struct pattern_tree, node); + if (elt->ref->gen_id != expr->ref->curr_gen) { + node = ebmb_lookup_shorter(node); + continue; + } + if (fill) { + static_pattern.data = elt->data; + static_pattern.ref = elt->ref; + static_pattern.sflags = PAT_SF_TREE; + static_pattern.type = SMP_T_IPV6; + memcpy(&static_pattern.val.ipv6.addr, elt->node.key, 16); + static_pattern.val.ipv6.mask = elt->node.node.pfx; + } + return &static_pattern; + } + return NULL; +} + +struct pattern *pat_match_ip(struct sample *smp, struct pattern_expr *expr, int fill) +{ + struct in_addr v4; + struct in6_addr v6; + struct pattern_list *lst; + struct pattern *pattern; + + /* The input sample is IPv4. Try to match in the trees. */ + if (smp->data.type == SMP_T_IPV4) { + pattern = _pat_match_tree_ipv4(&smp->data.u.ipv4, expr, fill); + if (pattern) + return pattern; + /* The IPv4 sample don't match the IPv4 tree. Convert the IPv4 + * sample address to IPv6 and try to lookup in the IPv6 tree. + */ + v4tov6(&v6, &smp->data.u.ipv4); + pattern = _pat_match_tree_ipv6(&v6, expr, fill); + if (pattern) + return pattern; + /* eligible for list lookup using IPv4 address */ + v4 = smp->data.u.ipv4; + goto list_lookup; + } + + /* The input sample is IPv6. Try to match in the trees. */ + if (smp->data.type == SMP_T_IPV6) { + pattern = _pat_match_tree_ipv6(&smp->data.u.ipv6, expr, fill); + if (pattern) + return pattern; + /* No match in the IPv6 tree. Try to convert 6 to 4 to lookup in + * the IPv4 tree + */ + if (v6tov4(&v4, &smp->data.u.ipv6)) { + pattern = _pat_match_tree_ipv4(&v4, expr, fill); + if (pattern) + return pattern; + /* eligible for list lookup using IPv4 address */ + goto list_lookup; + } + } + + not_found: + return NULL; + + list_lookup: + /* No match in the trees, but we still have a valid IPv4 address: lookup + * in the IPv4 list (non-contiguous masks list). This is our last resort + */ + list_for_each_entry(lst, &expr->patterns, list) { + pattern = &lst->pat; + + if (pattern->ref->gen_id != expr->ref->curr_gen) + continue; + + /* Check if the input sample match the current pattern. */ + if (((v4.s_addr ^ pattern->val.ipv4.addr.s_addr) & pattern->val.ipv4.mask.s_addr) == 0) + return pattern; + } + goto not_found; +} + +/* finds the pattern holding <list> from list head <head> and deletes it. + * This is made for use for pattern removal within an expression. 
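+ *
+ * The chain is made of bare void** cells: each element's "from_ref"
+ * field stores the address of the next element's "from_ref" field
+ * (or NULL), and the head points to the most recently added cell.
+ * Sketch of a two-element chain (illustration only):
+ *
+ *   head -> &e2.from_ref -> &e1.from_ref -> NULL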
+ */ +static void pat_unlink_from_head(void **head, void **list) +{ + while (*head) { + if (*head == list) { + *head = *list; + return; + } + head = *head; + } +} + +void free_pattern_tree(struct eb_root *root) +{ + struct eb_node *node, *next; + struct pattern_tree *elt; + + node = eb_first(root); + while (node) { + next = eb_next(node); + eb_delete(node); + elt = container_of(node, struct pattern_tree, node); + pat_unlink_from_head(&elt->ref->tree_head, &elt->from_ref); + free(elt->data); + free(elt); + node = next; + } +} + +void pat_prune_gen(struct pattern_expr *expr) +{ + struct pattern_list *pat, *tmp; + + list_for_each_entry_safe(pat, tmp, &expr->patterns, list) { + LIST_DELETE(&pat->list); + pat_unlink_from_head(&pat->pat.ref->list_head, &pat->from_ref); + if (pat->pat.sflags & PAT_SF_REGFREE) + regex_free(pat->pat.ptr.ptr); + else + free(pat->pat.ptr.ptr); + free(pat->pat.data); + free(pat); + } + + free_pattern_tree(&expr->pattern_tree); + free_pattern_tree(&expr->pattern_tree_2); + LIST_INIT(&expr->patterns); + expr->ref->revision = rdtsc(); + expr->ref->entry_cnt = 0; +} + +/* + * + * The following functions are used for the pattern indexation + * + */ + +int pat_idx_list_val(struct pattern_expr *expr, struct pattern *pat, char **err) +{ + struct pattern_list *patl; + + /* allocate pattern */ + patl = calloc(1, sizeof(*patl)); + if (!patl) { + memprintf(err, "out of memory while indexing pattern"); + return 0; + } + + /* duplicate pattern */ + memcpy(&patl->pat, pat, sizeof(*pat)); + + /* chain pattern in the expression */ + LIST_APPEND(&expr->patterns, &patl->list); + patl->expr = expr; + /* and from the reference */ + patl->from_ref = pat->ref->list_head; + pat->ref->list_head = &patl->from_ref; + expr->ref->revision = rdtsc(); + expr->ref->entry_cnt++; + + /* that's ok */ + return 1; +} + +int pat_idx_list_ptr(struct pattern_expr *expr, struct pattern *pat, char **err) +{ + struct pattern_list *patl; + + /* allocate pattern */ + patl = calloc(1, sizeof(*patl)); + if (!patl) { + memprintf(err, "out of memory while indexing pattern"); + return 0; + } + + /* duplicate pattern */ + memcpy(&patl->pat, pat, sizeof(*pat)); + patl->pat.ptr.ptr = malloc(patl->pat.len); + if (!patl->pat.ptr.ptr) { + free(patl); + memprintf(err, "out of memory while indexing pattern"); + return 0; + } + memcpy(patl->pat.ptr.ptr, pat->ptr.ptr, pat->len); + + /* chain pattern in the expression */ + LIST_APPEND(&expr->patterns, &patl->list); + patl->expr = expr; + /* and from the reference */ + patl->from_ref = pat->ref->list_head; + pat->ref->list_head = &patl->from_ref; + expr->ref->revision = rdtsc(); + expr->ref->entry_cnt++; + + /* that's ok */ + return 1; +} + +int pat_idx_list_str(struct pattern_expr *expr, struct pattern *pat, char **err) +{ + struct pattern_list *patl; + + /* allocate pattern */ + patl = calloc(1, sizeof(*patl)); + if (!patl) { + memprintf(err, "out of memory while indexing pattern"); + return 0; + } + + /* duplicate pattern */ + memcpy(&patl->pat, pat, sizeof(*pat)); + patl->pat.ptr.str = malloc(patl->pat.len + 1); + if (!patl->pat.ptr.str) { + free(patl); + memprintf(err, "out of memory while indexing pattern"); + return 0; + } + memcpy(patl->pat.ptr.ptr, pat->ptr.ptr, pat->len); + patl->pat.ptr.str[patl->pat.len] = '\0'; + + /* chain pattern in the expression */ + LIST_APPEND(&expr->patterns, &patl->list); + patl->expr = expr; + /* and from the reference */ + patl->from_ref = pat->ref->list_head; + pat->ref->list_head = &patl->from_ref; + expr->ref->revision = rdtsc(); + 
expr->ref->entry_cnt++; + + /* that's ok */ + return 1; +} + +int pat_idx_list_reg_cap(struct pattern_expr *expr, struct pattern *pat, int cap, char **err) +{ + struct pattern_list *patl; + + /* allocate pattern */ + patl = calloc(1, sizeof(*patl)); + if (!patl) { + memprintf(err, "out of memory while indexing pattern"); + return 0; + } + + /* duplicate pattern */ + memcpy(&patl->pat, pat, sizeof(*pat)); + + /* compile regex */ + patl->pat.sflags |= PAT_SF_REGFREE; + if (!(patl->pat.ptr.reg = regex_comp(pat->ptr.str, !(expr->mflags & PAT_MF_IGNORE_CASE), + cap, err))) { + free(patl); + return 0; + } + + /* chain pattern in the expression */ + LIST_APPEND(&expr->patterns, &patl->list); + patl->expr = expr; + /* and from the reference */ + patl->from_ref = pat->ref->list_head; + pat->ref->list_head = &patl->from_ref; + expr->ref->revision = rdtsc(); + expr->ref->entry_cnt++; + + /* that's ok */ + return 1; +} + +int pat_idx_list_reg(struct pattern_expr *expr, struct pattern *pat, char **err) +{ + return pat_idx_list_reg_cap(expr, pat, 0, err); +} + +int pat_idx_list_regm(struct pattern_expr *expr, struct pattern *pat, char **err) +{ + return pat_idx_list_reg_cap(expr, pat, 1, err); +} + +int pat_idx_tree_ip(struct pattern_expr *expr, struct pattern *pat, char **err) +{ + unsigned int mask; + struct pattern_tree *node; + + /* Only IPv4 can be indexed */ + if (pat->type == SMP_T_IPV4) { + /* in IPv4 case, check if the mask is contiguous so that we can + * insert the network into the tree. A continuous mask has only + * ones on the left. This means that this mask + its lower bit + * added once again is null. + */ + mask = ntohl(pat->val.ipv4.mask.s_addr); + if (mask + (mask & -mask) == 0) { + mask = mask ? 33 - flsnz(mask & -mask) : 0; /* equals cidr value */ + + /* node memory allocation */ + node = calloc(1, sizeof(*node) + 4); + if (!node) { + memprintf(err, "out of memory while loading pattern"); + return 0; + } + + /* copy the pointer to sample associated to this node */ + node->data = pat->data; + node->ref = pat->ref; + + /* FIXME: insert <addr>/<mask> into the tree here */ + memcpy(node->node.key, &pat->val.ipv4.addr, 4); /* network byte order */ + node->node.node.pfx = mask; + + /* Insert the entry. */ + ebmb_insert_prefix(&expr->pattern_tree, &node->node, 4); + + node->expr = expr; + node->from_ref = pat->ref->tree_head; + pat->ref->tree_head = &node->from_ref; + expr->ref->revision = rdtsc(); + expr->ref->entry_cnt++; + + /* that's ok */ + return 1; + } + else { + /* If the mask is not contiguous, just add the pattern to the list */ + return pat_idx_list_val(expr, pat, err); + } + } + else if (pat->type == SMP_T_IPV6) { + /* IPv6 also can be indexed */ + node = calloc(1, sizeof(*node) + 16); + if (!node) { + memprintf(err, "out of memory while loading pattern"); + return 0; + } + + /* copy the pointer to sample associated to this node */ + node->data = pat->data; + node->ref = pat->ref; + + /* FIXME: insert <addr>/<mask> into the tree here */ + memcpy(node->node.key, &pat->val.ipv6.addr, 16); /* network byte order */ + node->node.node.pfx = pat->val.ipv6.mask; + + /* Insert the entry. 
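+ *
+ * (A note on the contiguity test used in the IPv4 case above:
+ * "mask + (mask & -mask)" relies on two's complement arithmetic,
+ * where mask & -mask isolates the lowest set bit. For a /24 mask,
+ * as an illustration:
+ *
+ *   mask         = 0xffffff00
+ *   mask & -mask = 0x00000100
+ *   sum          = 0x00000000 -> contiguous
+ *
+ * while a non-contiguous 0xff00ff00 + 0x00000100 = 0xff010000 != 0,
+ * so such patterns go to the list instead of the tree.)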
*/ + ebmb_insert_prefix(&expr->pattern_tree_2, &node->node, 16); + + node->expr = expr; + node->from_ref = pat->ref->tree_head; + pat->ref->tree_head = &node->from_ref; + expr->ref->revision = rdtsc(); + expr->ref->entry_cnt++; + + /* that's ok */ + return 1; + } + + return 0; +} + +int pat_idx_tree_str(struct pattern_expr *expr, struct pattern *pat, char **err) +{ + int len; + struct pattern_tree *node; + + /* Only string can be indexed */ + if (pat->type != SMP_T_STR) { + memprintf(err, "internal error: string expected, but the type is '%s'", + smp_to_type[pat->type]); + return 0; + } + + /* If the flag PAT_F_IGNORE_CASE is set, we cannot use trees */ + if (expr->mflags & PAT_MF_IGNORE_CASE) + return pat_idx_list_str(expr, pat, err); + + /* Process the key len */ + len = strlen(pat->ptr.str) + 1; + + /* node memory allocation */ + node = calloc(1, sizeof(*node) + len); + if (!node) { + memprintf(err, "out of memory while loading pattern"); + return 0; + } + + /* copy the pointer to sample associated to this node */ + node->data = pat->data; + node->ref = pat->ref; + + /* copy the string */ + memcpy(node->node.key, pat->ptr.str, len); + + /* index the new node */ + ebst_insert(&expr->pattern_tree, &node->node); + + node->expr = expr; + node->from_ref = pat->ref->tree_head; + pat->ref->tree_head = &node->from_ref; + expr->ref->revision = rdtsc(); + expr->ref->entry_cnt++; + + /* that's ok */ + return 1; +} + +int pat_idx_tree_pfx(struct pattern_expr *expr, struct pattern *pat, char **err) +{ + int len; + struct pattern_tree *node; + + /* Only string can be indexed */ + if (pat->type != SMP_T_STR) { + memprintf(err, "internal error: string expected, but the type is '%s'", + smp_to_type[pat->type]); + return 0; + } + + /* If the flag PAT_F_IGNORE_CASE is set, we cannot use trees */ + if (expr->mflags & PAT_MF_IGNORE_CASE) + return pat_idx_list_str(expr, pat, err); + + /* Process the key len */ + len = strlen(pat->ptr.str); + + /* node memory allocation */ + node = calloc(1, sizeof(*node) + len + 1); + if (!node) { + memprintf(err, "out of memory while loading pattern"); + return 0; + } + + /* copy the pointer to sample associated to this node */ + node->data = pat->data; + node->ref = pat->ref; + + /* copy the string and the trailing zero */ + memcpy(node->node.key, pat->ptr.str, len + 1); + node->node.node.pfx = len * 8; + + /* index the new node */ + ebmb_insert_prefix(&expr->pattern_tree, &node->node, len); + + node->expr = expr; + node->from_ref = pat->ref->tree_head; + pat->ref->tree_head = &node->from_ref; + expr->ref->revision = rdtsc(); + expr->ref->entry_cnt++; + + /* that's ok */ + return 1; +} + +/* Deletes all patterns from reference <elt>. Note that all of their + * expressions must be locked, and the pattern lock must be held as well. + */ +void pat_delete_gen(struct pat_ref *ref, struct pat_ref_elt *elt) +{ + struct pattern_tree *tree; + struct pattern_list *pat; + void **node; + + /* delete all known tree nodes. They are all allocated inline */ + for (node = elt->tree_head; node;) { + tree = container_of(node, struct pattern_tree, from_ref); + node = *node; + BUG_ON(tree->ref != elt); + + ebmb_delete(&tree->node); + free(tree->data); + free(tree); + } + + /* delete all list nodes and free their pattern entries (str/reg) */ + for (node = elt->list_head; node;) { + pat = container_of(node, struct pattern_list, from_ref); + node = *node; + BUG_ON(pat->pat.ref != elt); + + /* Delete and free entry. 
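+ *
+ * ("node = *node" above and below advances along the void** chain
+ * built by the pat_idx_* functions: each cell stores the address of
+ * the next element's cell, so one dereference yields the next link.)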
*/
+ LIST_DELETE(&pat->list);
+ if (pat->pat.sflags & PAT_SF_REGFREE)
+ regex_free(pat->pat.ptr.reg);
+ else
+ free(pat->pat.ptr.ptr);
+ free(pat->pat.data);
+ free(pat);
+ }
+
+ /* update revision number to refresh the cache */
+ ref->revision = rdtsc();
+ ref->entry_cnt--;
+ elt->tree_head = NULL;
+ elt->list_head = NULL;
+}
+
+void pattern_init_expr(struct pattern_expr *expr)
+{
+ LIST_INIT(&expr->patterns);
+ expr->pattern_tree = EB_ROOT;
+ expr->pattern_tree_2 = EB_ROOT;
+}
+
+void pattern_init_head(struct pattern_head *head)
+{
+ LIST_INIT(&head->head);
+}
+
+/* The following functions manage the reference lists. These lists are used
+ * to store the original pattern and its associated value in string form.
+ *
+ * This is used with modifiable ACLs and MAPs.
+ *
+ * Pattern references are stored with two identifiers: the unique_id and the
+ * reference.
+ *
+ * The reference identifies a file. Every use of the same file name points to
+ * the same reference, so a single file may be registered several times. If
+ * the file is modified, all its dependencies are modified as well. The
+ * reference can be used by maps or ACLs.
+ *
+ * The unique_id identifies an inline ACL. The unique id is unique for each
+ * ACL. The same id cannot be forced twice in the configuration file, as this
+ * is reported as an error.
+ *
+ * A particular case appears when the filename is a number. In this case, the
+ * unique_id is set to the number represented by the filename and the
+ * reference is also set. This method prevents duplicate unique_ids.
+ *
+ */
+
+/* This function looks up a reference by name. If the reference is found, a
+ * pointer to the struct pat_ref is returned, otherwise NULL is returned.
+ */
+struct pat_ref *pat_ref_lookup(const char *reference)
+{
+ struct pat_ref *ref;
+
+ list_for_each_entry(ref, &pattern_reference, list)
+ if (ref->reference && strcmp(reference, ref->reference) == 0)
+ return ref;
+ return NULL;
+}
+
+/* This function looks up a reference's unique id. If the reference is found, a
+ * pointer to the struct pat_ref is returned, otherwise NULL is returned.
+ */
+struct pat_ref *pat_ref_lookupid(int unique_id)
+{
+ struct pat_ref *ref;
+
+ list_for_each_entry(ref, &pattern_reference, list)
+ if (ref->unique_id == unique_id)
+ return ref;
+ return NULL;
+}
+
+/* This function removes from the pattern reference <ref> all the patterns
+ * attached to the reference element <elt>, and the element itself. The
+ * reference must be locked.
+ */
+void pat_ref_delete_by_ptr(struct pat_ref *ref, struct pat_ref_elt *elt)
+{
+ struct pattern_expr *expr;
+ struct bref *bref, *back;
+
+ /*
+ * We have to unlink all watchers from this reference pattern. We must
+ * not relink them if this elt was the last one in the list.
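+ *
+ * Sketch (illustration only): in the list [E1, E2, E3], watchers
+ * parked on E2 are appended to E3's back_refs when E2 is removed;
+ * when the last element is removed, bref->ref is simply left
+ * pointing at the list head so the watcher restarts from there.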
+ */ + list_for_each_entry_safe(bref, back, &elt->back_refs, users) { + LIST_DELETE(&bref->users); + LIST_INIT(&bref->users); + if (elt->list.n != &ref->head) + LIST_APPEND(&LIST_ELEM(elt->list.n, typeof(elt), list)->back_refs, &bref->users); + bref->ref = elt->list.n; + } + + /* delete all entries from all expressions for this pattern */ + list_for_each_entry(expr, &ref->pat, list) + HA_RWLOCK_WRLOCK(PATEXP_LOCK, &expr->lock); + + pat_delete_gen(ref, elt); + + list_for_each_entry(expr, &ref->pat, list) + HA_RWLOCK_WRUNLOCK(PATEXP_LOCK, &expr->lock); + + LIST_DELETE(&elt->list); + ebmb_delete(&elt->node); + free(elt->sample); + free(elt); +} + +/* This function removes the pattern matching the pointer <refelt> from + * the reference and from each expr member of this reference. This function + * returns 1 if the entry was found and deleted, otherwise zero. + * + * <refelt> is user input: it is provided as an ID and should never be + * dereferenced without making sure that it is valid. + */ +int pat_ref_delete_by_id(struct pat_ref *ref, struct pat_ref_elt *refelt) +{ + struct pat_ref_elt *elt, *safe; + + /* delete pattern from reference */ + list_for_each_entry_safe(elt, safe, &ref->head, list) { + if (elt == refelt) { + pat_ref_delete_by_ptr(ref, elt); + return 1; + } + } + return 0; +} + +/* This function removes all patterns matching <key> from the reference + * and from each expr member of the reference. This function returns 1 + * if the deletion is done and returns 0 is the entry is not found. + */ +int pat_ref_delete(struct pat_ref *ref, const char *key) +{ + struct ebmb_node *node; + int found = 0; + + /* delete pattern from reference */ + node = ebst_lookup(&ref->ebmb_root, key); + while (node) { + struct pat_ref_elt *elt; + + elt = ebmb_entry(node, struct pat_ref_elt, node); + node = ebmb_next_dup(node); + pat_ref_delete_by_ptr(ref, elt); + found = 1; + } + + return found; +} + +/* + * find and return an element <elt> matching <key> in a reference <ref> + * return NULL if not found + */ +struct pat_ref_elt *pat_ref_find_elt(struct pat_ref *ref, const char *key) +{ + struct ebmb_node *node; + + node = ebst_lookup(&ref->ebmb_root, key); + if (node) + return ebmb_entry(node, struct pat_ref_elt, node); + + return NULL; +} + + +/* This function modifies the sample of pat_ref_elt <elt> in all expressions + * found under <ref> to become <value>. It is assumed that the caller has + * already verified that <elt> belongs to <ref>. + */ +static inline int pat_ref_set_elt(struct pat_ref *ref, struct pat_ref_elt *elt, + const char *value, char **err) +{ + struct pattern_expr *expr; + struct sample_data **data; + char *sample; + struct sample_data test; + struct pattern_tree *tree; + struct pattern_list *pat; + void **node; + + + /* Try all needed converters. */ + list_for_each_entry(expr, &ref->pat, list) { + if (!expr->pat_head->parse_smp) + continue; + + if (!expr->pat_head->parse_smp(value, &test)) { + memprintf(err, "unable to parse '%s'", value); + return 0; + } + } + + /* Modify pattern from reference. */ + sample = strdup(value); + if (!sample) { + memprintf(err, "out of memory error"); + return 0; + } + /* Load sample in each reference. All the conversions are tested + * below, normally these calls don't fail. 
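+ *
+ * (Flow sketch, e.g. for a CLI "set map" operation, illustration
+ * only: the value is first parsed once per distinct parse_smp
+ * callback to validate it, then re-parsed into every tree node's
+ * and list node's sample_data under each expression's write lock.)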
+ */ + for (node = elt->tree_head; node;) { + tree = container_of(node, struct pattern_tree, from_ref); + node = *node; + BUG_ON(tree->ref != elt); + expr = tree->expr; + if (!expr->pat_head->parse_smp) + continue; + + data = &tree->data; + if (data && *data) { + HA_RWLOCK_WRLOCK(PATEXP_LOCK, &expr->lock); + if (!expr->pat_head->parse_smp(sample, *data)) + *data = NULL; + HA_RWLOCK_WRUNLOCK(PATEXP_LOCK, &expr->lock); + } + } + + for (node = elt->list_head; node;) { + pat = container_of(node, struct pattern_list, from_ref); + node = *node; + BUG_ON(pat->pat.ref != elt); + expr = pat->expr; + if (!expr->pat_head->parse_smp) + continue; + + data = &pat->pat.data; + if (data && *data) { + HA_RWLOCK_WRLOCK(PATEXP_LOCK, &expr->lock); + if (!expr->pat_head->parse_smp(sample, *data)) + *data = NULL; + HA_RWLOCK_WRUNLOCK(PATEXP_LOCK, &expr->lock); + } + } + + /* free old sample only when all exprs are updated */ + free(elt->sample); + elt->sample = sample; + + + return 1; +} + +/* This function modifies the sample of pat_ref_elt <refelt> in all expressions + * found under <ref> to become <value>, after checking that <refelt> really + * belongs to <ref>. + * + * <refelt> is user input: it is provided as an ID and should never be + * dereferenced without making sure that it is valid. + */ +int pat_ref_set_by_id(struct pat_ref *ref, struct pat_ref_elt *refelt, const char *value, char **err) +{ + struct pat_ref_elt *elt; + + /* Look for pattern in the reference. */ + list_for_each_entry(elt, &ref->head, list) { + if (elt == refelt) { + if (!pat_ref_set_elt(ref, elt, value, err)) + return 0; + return 1; + } + } + + memprintf(err, "key or pattern not found"); + return 0; +} + +/* This function modifies to <value> the sample of all patterns matching <key> + * under <ref>. + */ +int pat_ref_set(struct pat_ref *ref, const char *key, const char *value, char **err, struct pat_ref_elt *elt) +{ + int found = 0; + char *_merr; + char **merr; + struct ebmb_node *node; + + if (err) { + merr = &_merr; + *merr = NULL; + } + else + merr = NULL; + + if (elt) { + node = &elt->node; + } + else { + /* Look for pattern in the reference. */ + node = ebst_lookup(&ref->ebmb_root, key); + } + + while (node) { + elt = ebmb_entry(node, struct pat_ref_elt, node); + node = ebmb_next_dup(node); + if (!pat_ref_set_elt(ref, elt, value, merr)) { + if (err && merr) { + if (!found) { + *err = *merr; + } else { + memprintf(err, "%s, %s", *err, *merr); + ha_free(merr); + } + } + } + found = 1; + } + + if (!found) { + memprintf(err, "entry not found"); + return 0; + } + return 1; +} + +/* This function creates a new reference. <ref> is the reference name. + * <flags> are PAT_REF_*. /!\ The reference is not checked, and must + * be unique. The user must check the reference with "pat_ref_lookup()" + * before calling this function. If the function fails, it returns NULL, + * otherwise it returns the new struct pat_ref. 
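+ *
+ * Typical usage (sketch, error handling elided):
+ *
+ *   ref = pat_ref_lookup(name);
+ *   if (!ref)
+ *       ref = pat_ref_new(name, display, flags);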
+ */ +struct pat_ref *pat_ref_new(const char *reference, const char *display, unsigned int flags) +{ + struct pat_ref *ref; + + ref = calloc(1, sizeof(*ref)); + if (!ref) + return NULL; + + if (display) { + ref->display = strdup(display); + if (!ref->display) { + free(ref); + return NULL; + } + } + + ref->reference = strdup(reference); + if (!ref->reference) { + free(ref->display); + free(ref); + return NULL; + } + + ref->flags = flags; + ref->unique_id = -1; + ref->revision = 0; + ref->entry_cnt = 0; + + LIST_INIT(&ref->head); + ref->ebmb_root = EB_ROOT; + LIST_INIT(&ref->pat); + HA_RWLOCK_INIT(&ref->lock); + LIST_APPEND(&pattern_reference, &ref->list); + + return ref; +} + +/* This function creates a new reference. <unique_id> is the unique id. If + * the value of <unique_id> is -1, the unique id is calculated later. + * <flags> are PAT_REF_*. /!\ The reference is not checked, and must + * be unique. The user must check the reference with "pat_ref_lookup()" + * or pat_ref_lookupid before calling this function. If the function + * fails, it returns NULL, otherwise it returns the new struct pat_ref. + */ +struct pat_ref *pat_ref_newid(int unique_id, const char *display, unsigned int flags) +{ + struct pat_ref *ref; + + ref = calloc(1, sizeof(*ref)); + if (!ref) + return NULL; + + if (display) { + ref->display = strdup(display); + if (!ref->display) { + free(ref); + return NULL; + } + } + + ref->reference = NULL; + ref->flags = flags; + ref->curr_gen = 0; + ref->next_gen = 0; + ref->unique_id = unique_id; + LIST_INIT(&ref->head); + ref->ebmb_root = EB_ROOT; + LIST_INIT(&ref->pat); + HA_RWLOCK_INIT(&ref->lock); + LIST_APPEND(&pattern_reference, &ref->list); + + return ref; +} + +/* This function adds entry to <ref>. It can fail on memory error. It returns + * the newly added element on success, or NULL on failure. The PATREF_LOCK on + * <ref> must be held. It sets the newly created pattern's generation number + * to the same value as the reference's. + */ +struct pat_ref_elt *pat_ref_append(struct pat_ref *ref, const char *pattern, const char *sample, int line) +{ + struct pat_ref_elt *elt; + int len = strlen(pattern); + + elt = calloc(1, sizeof(*elt) + len + 1); + if (!elt) + goto fail; + + elt->gen_id = ref->curr_gen; + elt->line = line; + + memcpy((char*)elt->pattern, pattern, len + 1); + + if (sample) { + elt->sample = strdup(sample); + if (!elt->sample) + goto fail; + } + + LIST_INIT(&elt->back_refs); + elt->list_head = NULL; + elt->tree_head = NULL; + LIST_APPEND(&ref->head, &elt->list); + /* Even if calloc()'ed, ensure this node is not linked to a tree. */ + elt->node.node.leaf_p = NULL; + ebst_insert(&ref->ebmb_root, &elt->node); + return elt; + fail: + free(elt); + return NULL; +} + +/* This function creates sample found in <elt>, parses the pattern also + * found in <elt> and inserts it in <expr>. The function copies <patflags> + * into <expr>. If the function fails, it returns 0 and <err> is filled. + * In success case, the function returns 1. + */ +int pat_ref_push(struct pat_ref_elt *elt, struct pattern_expr *expr, + int patflags, char **err) +{ + struct sample_data *data; + struct pattern pattern; + + /* Create sample */ + if (elt->sample && expr->pat_head->parse_smp) { + /* New sample. */ + data = malloc(sizeof(*data)); + if (!data) + return 0; + + /* Parse value. 
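+ *
+ * (The value side goes through the head's parse_smp callback, e.g.
+ * the parser for a map's output type; the key side is handled
+ * further down by the head's parse() and index() callbacks.)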
*/ + if (!expr->pat_head->parse_smp(elt->sample, data)) { + memprintf(err, "unable to parse '%s'", elt->sample); + free(data); + return 0; + } + + } + else + data = NULL; + + /* initialise pattern */ + memset(&pattern, 0, sizeof(pattern)); + pattern.data = data; + pattern.ref = elt; + + /* parse pattern */ + if (!expr->pat_head->parse(elt->pattern, &pattern, expr->mflags, err)) { + free(data); + return 0; + } + + HA_RWLOCK_WRLOCK(PATEXP_LOCK, &expr->lock); + /* index pattern */ + if (!expr->pat_head->index(expr, &pattern, err)) { + HA_RWLOCK_WRUNLOCK(PATEXP_LOCK, &expr->lock); + free(data); + return 0; + } + HA_RWLOCK_WRUNLOCK(PATEXP_LOCK, &expr->lock); + + return 1; +} + +/* This function tries to commit entry <elt> into <ref>. The new entry must + * have already been inserted using pat_ref_append(), and its generation number + * may have been adjusted as it will not be changed. <err> must point to a NULL + * pointer. The PATREF lock on <ref> must be held. All the pattern_expr for + * this reference will be updated (parsing, indexing). On success, non-zero is + * returned. On failure, all the operation is rolled back (the element is + * deleted from all expressions and is freed), zero is returned and the error + * pointer <err> may have been updated (and the caller must free it). Failure + * causes include memory allocation, parsing error or indexing error. + */ +int pat_ref_commit_elt(struct pat_ref *ref, struct pat_ref_elt *elt, char **err) +{ + struct pattern_expr *expr; + + list_for_each_entry(expr, &ref->pat, list) { + if (!pat_ref_push(elt, expr, 0, err)) { + pat_ref_delete_by_ptr(ref, elt); + return 0; + } + } + return 1; +} + +/* Loads <pattern>:<sample> into <ref> for generation <gen>. <sample> may be + * NULL if none exists (e.g. ACL). If not needed, the generation number should + * be set to ref->curr_gen. The error pointer must initially point to NULL. The + * new entry will be propagated to all use places, involving allocation, parsing + * and indexing. On error (parsing, allocation), the operation will be rolled + * back, an error may be reported, and NULL will be reported. On success, the + * freshly allocated element will be returned. The PATREF lock on <ref> must be + * held during the operation. + */ +struct pat_ref_elt *pat_ref_load(struct pat_ref *ref, unsigned int gen, + const char *pattern, const char *sample, + int line, char **err) +{ + struct pat_ref_elt *elt; + + elt = pat_ref_append(ref, pattern, sample, line); + if (elt) { + elt->gen_id = gen; + if (!pat_ref_commit_elt(ref, elt, err)) + elt = NULL; + } else + memprintf(err, "out of memory error"); + + return elt; +} + +/* This function adds entry to <ref>. It can fail on memory error. The new + * entry is added at all the pattern_expr registered in this reference. The + * function stops on the first error encountered. It returns 0 and <err> is + * filled. If an error is encountered, the complete add operation is cancelled. + * If the insertion is a success the function returns 1. + */ +int pat_ref_add(struct pat_ref *ref, + const char *pattern, const char *sample, + char **err) +{ + return !!pat_ref_load(ref, ref->curr_gen, pattern, sample, -1, err); +} + +/* This function purges all elements from <ref> whose generation is included in + * the range of <from> to <to> (inclusive), taking wrapping into consideration. + * It will not purge more than <budget> entries at once, in order to remain + * responsive. If budget is negative, no limit is applied. + * The caller must already hold the PATREF_LOCK on <ref>. 
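+ *
+ * (A note on the generation test used below: "elt->gen_id - from >
+ * to - from" is a wrap-safe exclusion check in unsigned arithmetic.
+ * With illustrative values from=0xfffffffe and to=1, a gen_id of 0
+ * gives 0 - from = 2, which is <= to - from = 3, so the entry is
+ * correctly treated as inside the wrapped [from, to] range.)
+ *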
The function will + * take the PATEXP_LOCK on all expressions of the pattern as needed. It returns + * non-zero on completion, or zero if it had to stop before the end after + * <budget> was depleted. + */ +int pat_ref_purge_range(struct pat_ref *ref, uint from, uint to, int budget) +{ + struct pat_ref_elt *elt, *elt_bck; + struct bref *bref, *bref_bck; + struct pattern_expr *expr; + int done; + + list_for_each_entry(expr, &ref->pat, list) + HA_RWLOCK_WRLOCK(PATEXP_LOCK, &expr->lock); + + /* all expr are locked, we can safely remove all pat_ref */ + + /* assume completion for e.g. empty lists */ + done = 1; + list_for_each_entry_safe(elt, elt_bck, &ref->head, list) { + if (elt->gen_id - from > to - from) + continue; + + if (budget >= 0 && !budget--) { + done = 0; + break; + } + + /* + * we have to unlink all watchers from this reference pattern. We must + * not relink them if this elt was the last one in the list. + */ + list_for_each_entry_safe(bref, bref_bck, &elt->back_refs, users) { + LIST_DELETE(&bref->users); + LIST_INIT(&bref->users); + if (elt->list.n != &ref->head) + LIST_APPEND(&LIST_ELEM(elt->list.n, typeof(elt), list)->back_refs, &bref->users); + bref->ref = elt->list.n; + } + + /* delete the storage for all representations of this pattern. */ + pat_delete_gen(ref, elt); + + LIST_DELETE(&elt->list); + ebmb_delete(&elt->node); + free(elt->sample); + free(elt); + } + + list_for_each_entry(expr, &ref->pat, list) + HA_RWLOCK_WRUNLOCK(PATEXP_LOCK, &expr->lock); + + return done; +} + +/* This function prunes all entries of <ref> and all their associated + * pattern_expr. It may return before the end of the list is reached, + * returning 0, to yield, indicating to the caller that it must call it again. + * until it returns non-zero. All patterns are purged, both current ones and + * future or incomplete ones. This is used by "clear map" or "clear acl". + */ +int pat_ref_prune(struct pat_ref *ref) +{ + return pat_ref_purge_range(ref, 0, ~0, 100); +} + +/* This function looks up any existing reference <ref> in pattern_head <head>, and + * returns the associated pattern_expr pointer if found, otherwise NULL. + */ +struct pattern_expr *pattern_lookup_expr(struct pattern_head *head, struct pat_ref *ref) +{ + struct pattern_expr_list *expr; + + list_for_each_entry(expr, &head->head, list) + if (expr->expr->ref == ref) + return expr->expr; + return NULL; +} + +/* This function creates new pattern_expr associated to the reference <ref>. + * <ref> can be NULL. If an error occurs, the function returns NULL and + * <err> is filled. Otherwise, the function returns new pattern_expr linked + * with <head> and <ref>. + * + * The returned value can be an already filled pattern list, in this case the + * flag <reuse> is set. + */ +struct pattern_expr *pattern_new_expr(struct pattern_head *head, struct pat_ref *ref, + int patflags, char **err, int *reuse) +{ + struct pattern_expr *expr; + struct pattern_expr_list *list; + + if (reuse) + *reuse = 0; + + /* Memory and initialization of the chain element. */ + list = calloc(1, sizeof(*list)); + if (!list) { + memprintf(err, "out of memory"); + return NULL; + } + + /* Look for existing similar expr. No that only the index, parse and + * parse_smp function must be identical for having similar pattern. + * The other function depends of these first. 
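+ *
+ * For example (illustration only): two ACLs loading the same file
+ * with the same match method end up sharing a single pattern_expr;
+ * the second call finds the first expr under ref->pat with identical
+ * index/parse/parse_smp callbacks and flags, and only allocates a
+ * new pattern_expr_list cell pointing to it.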
+ */
+ if (ref) {
+ list_for_each_entry(expr, &ref->pat, list)
+ if (expr->pat_head->index == head->index &&
+ expr->pat_head->parse == head->parse &&
+ expr->pat_head->parse_smp == head->parse_smp &&
+ expr->mflags == patflags)
+ break;
+ if (&expr->list == &ref->pat)
+ expr = NULL;
+ }
+ else
+ expr = NULL;
+
+ /* If no similar expr was found, we create a new expr. */
+ if (!expr) {
+ /* Allocate memory for the expr struct. */
+ expr = calloc(1, sizeof(*expr));
+ if (!expr) {
+ free(list);
+ memprintf(err, "out of memory");
+ return NULL;
+ }
+
+ /* Initialize this new expr. */
+ pattern_init_expr(expr);
+
+ /* Copy the pattern matching and indexing flags. */
+ expr->mflags = patflags;
+
+ /* This new pattern expression references one of its heads. */
+ expr->pat_head = head;
+
+ /* Link with ref, or to self to facilitate LIST_DELETE() */
+ if (ref)
+ LIST_APPEND(&ref->pat, &expr->list);
+ else
+ LIST_INIT(&expr->list);
+
+ expr->ref = ref;
+
+ HA_RWLOCK_INIT(&expr->lock);
+
+ /* We must free this pattern once it is no longer used. */
+ list->do_free = 1;
+ }
+ else {
+ /* The pattern already exists; it is already linked with ref
+ * and must not be freed.
+ */
+ list->do_free = 0;
+ if (reuse)
+ *reuse = 1;
+ }
+
+ /* The new list element references the pattern_expr. */
+ list->expr = expr;
+
+ /* Link the list element with the pattern_head. */
+ LIST_APPEND(&head->head, &list->list);
+ return expr;
+}
+
+/* Reads patterns from a file. If <err_msg> is non-NULL, an error message will
+ * be returned there on errors and the caller will have to free it.
+ *
+ * The file contains one key + value per line. Lines which start with '#' are
+ * ignored, just like empty lines. Leading tabs/spaces are stripped. The key is
+ * then the first "word" (series of non-space/tab characters), and the value is
+ * what follows this series of spaces/tabs till the end of the line excluding
+ * trailing spaces/tabs.
+ *
+ * Example :
+ *
+ * # this is a comment and is ignored
+ * 62.212.114.60 1wt.eu \n
+ * <-><-----------><---><----><---->
+ * | | | | `--- trailing spaces ignored
+ * | | | `-------- value
+ * | | `--------------- middle spaces ignored
+ * | `------------------------ key
+ * `-------------------------------- leading spaces ignored
+ *
+ * Return non-zero in case of success, otherwise 0.
+ */
+int pat_ref_read_from_file_smp(struct pat_ref *ref, const char *filename, char **err)
+{
+ FILE *file;
+ char *c;
+ int ret = 0;
+ int line = 0;
+ char *key_beg;
+ char *key_end;
+ char *value_beg;
+ char *value_end;
+
+ file = fopen(filename, "r");
+ if (!file) {
+ memprintf(err, "failed to open pattern file <%s>", filename);
+ return 0;
+ }
+
+ /* now parse all patterns. The file may only contain one pattern
+ * followed by one value per line. Leading spaces, separator spaces
+ * and trailing spaces are stripped.
Each line may also contain a comment started by '#'.
+ */
+ while (fgets(trash.area, trash.size, file) != NULL) {
+ line++;
+ c = trash.area;
+
+ /* ignore lines beginning with a sharp ('#') */
+ if (*c == '#')
+ continue;
+
+ /* strip leading spaces and tabs */
+ while (*c == ' ' || *c == '\t')
+ c++;
+
+ /* empty lines are ignored too */
+ if (*c == '\0' || *c == '\r' || *c == '\n')
+ continue;
+
+ /* look for the end of the key */
+ key_beg = c;
+ while (*c && *c != ' ' && *c != '\t' && *c != '\n' && *c != '\r')
+ c++;
+
+ key_end = c;
+
+ /* strip middle spaces and tabs */
+ while (*c == ' ' || *c == '\t')
+ c++;
+
+ /* look for the end of the value, it is the end of the line */
+ value_beg = c;
+ while (*c && *c != '\n' && *c != '\r')
+ c++;
+ value_end = c;
+
+ /* trim possibly trailing spaces and tabs */
+ while (value_end > value_beg && (value_end[-1] == ' ' || value_end[-1] == '\t'))
+ value_end--;
+
+ /* set final \0 and check entries */
+ *key_end = '\0';
+ *value_end = '\0';
+
+ /* insert values */
+ if (!pat_ref_append(ref, key_beg, value_beg, line)) {
+ memprintf(err, "out of memory");
+ goto out_close;
+ }
+ }
+
+ if (ferror(file)) {
+ memprintf(err, "error encountered while reading <%s> : %s",
+ filename, strerror(errno));
+ goto out_close;
+ }
+ /* success */
+ ret = 1;
+
+ out_close:
+ fclose(file);
+ return ret;
+}
+
+/* Reads patterns from a file. If <err_msg> is non-NULL, an error message will
+ * be returned there on errors and the caller will have to free it.
+ */
+int pat_ref_read_from_file(struct pat_ref *ref, const char *filename, char **err)
+{
+ FILE *file;
+ char *c;
+ char *arg;
+ int ret = 0;
+ int line = 0;
+
+ file = fopen(filename, "r");
+ if (!file) {
+ memprintf(err, "failed to open pattern file <%s>", filename);
+ return 0;
+ }
+
+ /* now parse all patterns. The file may only contain one pattern per
+ * line. If the line contains spaces, they will be part of the pattern.
+ * The pattern stops at the first CR, LF or EOF encountered.
+ */
+ while (fgets(trash.area, trash.size, file) != NULL) {
+ line++;
+ c = trash.area;
+
+ /* ignore lines beginning with a sharp ('#') */
+ if (*c == '#')
+ continue;
+
+ /* strip leading spaces and tabs */
+ while (*c == ' ' || *c == '\t')
+ c++;
+
+
+ arg = c;
+ while (*c && *c != '\n' && *c != '\r')
+ c++;
+ *c = 0;
+
+ /* empty lines are ignored too */
+ if (c == arg)
+ continue;
+
+ if (!pat_ref_append(ref, arg, NULL, line)) {
+ memprintf(err, "out of memory when loading patterns from file <%s>", filename);
+ goto out_close;
+ }
+ }
+
+ if (ferror(file)) {
+ memprintf(err, "error encountered while reading <%s> : %s",
+ filename, strerror(errno));
+ goto out_close;
+ }
+ ret = 1; /* success */
+
+ out_close:
+ fclose(file);
+ return ret;
+}
+
+int pattern_read_from_file(struct pattern_head *head, unsigned int refflags,
+ const char *filename, int patflags, int load_smp,
+ char **err, const char *file, int line)
+{
+ struct pat_ref *ref;
+ struct pattern_expr *expr;
+ struct pat_ref_elt *elt;
+ int reuse = 0;
+
+ /* Look up the existing reference. */
+ ref = pat_ref_lookup(filename);
+
+ /* If the reference doesn't exist, create it and load the associated file. */
+ if (!ref) {
+ chunk_printf(&trash,
+ "pattern loaded from file '%s' used by %s at file '%s' line %d",
+ filename, refflags & PAT_REF_MAP ?
"map" : "acl", file, line); + + ref = pat_ref_new(filename, trash.area, refflags); + if (!ref) { + memprintf(err, "out of memory"); + return 0; + } + + if (load_smp) { + ref->flags |= PAT_REF_SMP; + if (!pat_ref_read_from_file_smp(ref, filename, err)) + return 0; + } + else { + if (!pat_ref_read_from_file(ref, filename, err)) + return 0; + } + } + else { + /* The reference already exists, check the map compatibility. */ + + /* If the load require samples and the flag PAT_REF_SMP is not set, + * the reference doesn't contain sample, and cannot be used. + */ + if (load_smp) { + if (!(ref->flags & PAT_REF_SMP)) { + memprintf(err, "The file \"%s\" is already used as one column file " + "and cannot be used by as two column file.", + filename); + return 0; + } + } + else { + /* The load doesn't require samples. If the flag PAT_REF_SMP is + * set, the reference contains a sample, and cannot be used. + */ + if (ref->flags & PAT_REF_SMP) { + memprintf(err, "The file \"%s\" is already used as two column file " + "and cannot be used by as one column file.", + filename); + return 0; + } + } + + /* Extends display */ + chunk_printf(&trash, "%s", ref->display); + chunk_appendf(&trash, ", by %s at file '%s' line %d", + refflags & PAT_REF_MAP ? "map" : "acl", file, line); + free(ref->display); + ref->display = strdup(trash.area); + if (!ref->display) { + memprintf(err, "out of memory"); + return 0; + } + + /* Merge flags. */ + ref->flags |= refflags; + } + + /* Now, we can loading patterns from the reference. */ + + /* Lookup for existing reference in the head. If the reference + * doesn't exists, create it. + */ + expr = pattern_lookup_expr(head, ref); + if (!expr || (expr->mflags != patflags)) { + expr = pattern_new_expr(head, ref, patflags, err, &reuse); + if (!expr) + return 0; + } + + /* The returned expression may be not empty, because the function + * "pattern_new_expr" lookup for similar pattern list and can + * reuse a already filled pattern list. In this case, we can not + * reload the patterns. + */ + if (reuse) + return 1; + + /* Load reference content in the pattern expression. + * We need to load elements in the same order they were seen in the + * file as list-based matching types may rely on it. + */ + list_for_each_entry(elt, &ref->head, list) { + if (!pat_ref_push(elt, expr, patflags, err)) { + if (elt->line > 0) + memprintf(err, "%s at line %d of file '%s'", + *err, elt->line, filename); + return 0; + } + } + + return 1; +} + +/* This function executes a pattern match on a sample. It applies pattern <expr> + * to sample <smp>. The function returns NULL if the sample don't match. It returns + * non-null if the sample match. If <fill> is true and the sample match, the + * function returns the matched pattern. In many cases, this pattern can be a + * static buffer. 
+ */ +struct pattern *pattern_exec_match(struct pattern_head *head, struct sample *smp, int fill) +{ + struct pattern_expr_list *list; + struct pattern *pat; + + if (!head->match) { + if (fill) { + static_pattern.data = NULL; + static_pattern.ref = NULL; + static_pattern.sflags = 0; + static_pattern.type = SMP_T_SINT; + static_pattern.val.i = 1; + } + return &static_pattern; + } + + /* convert input to string */ + if (!sample_convert(smp, head->expect_type)) + return NULL; + + list_for_each_entry(list, &head->head, list) { + HA_RWLOCK_RDLOCK(PATEXP_LOCK, &list->expr->lock); + pat = head->match(smp, list->expr, fill); + if (pat) { + /* We duplicate the pattern cause it could be modified + by another thread */ + if (pat != &static_pattern) { + memcpy(&static_pattern, pat, sizeof(struct pattern)); + pat = &static_pattern; + } + + /* We also duplicate the sample data for + same reason */ + if (pat->data && (pat->data != &static_sample_data)) { + switch(pat->data->type) { + case SMP_T_STR: + static_sample_data.type = SMP_T_STR; + static_sample_data.u.str = *get_trash_chunk(); + static_sample_data.u.str.data = pat->data->u.str.data; + if (static_sample_data.u.str.data >= static_sample_data.u.str.size) + static_sample_data.u.str.data = static_sample_data.u.str.size - 1; + memcpy(static_sample_data.u.str.area, + pat->data->u.str.area, static_sample_data.u.str.data); + static_sample_data.u.str.area[static_sample_data.u.str.data] = 0; + pat->data = &static_sample_data; + break; + + case SMP_T_IPV4: + case SMP_T_IPV6: + case SMP_T_SINT: + memcpy(&static_sample_data, pat->data, sizeof(struct sample_data)); + pat->data = &static_sample_data; + break; + default: + /* unimplemented pattern type */ + pat->data = NULL; + break; + } + } + HA_RWLOCK_RDUNLOCK(PATEXP_LOCK, &list->expr->lock); + return pat; + } + HA_RWLOCK_RDUNLOCK(PATEXP_LOCK, &list->expr->lock); + } + return NULL; +} + +/* This function prunes the pattern expressions starting at pattern_head <head>. */ +void pattern_prune(struct pattern_head *head) +{ + struct pattern_expr_list *list, *safe; + + list_for_each_entry_safe(list, safe, &head->head, list) { + LIST_DELETE(&list->list); + if (list->do_free) { + LIST_DELETE(&list->expr->list); + HA_RWLOCK_WRLOCK(PATEXP_LOCK, &list->expr->lock); + head->prune(list->expr); + HA_RWLOCK_WRUNLOCK(PATEXP_LOCK, &list->expr->lock); + free(list->expr); + } + free(list); + } +} + +/* This function compares two pat_ref** on their unique_id, and returns -1/0/1 + * depending on their order (suitable for sorting). + */ +static int cmp_pat_ref(const void *_a, const void *_b) +{ + struct pat_ref * const *a = _a; + struct pat_ref * const *b = _b; + + if ((*a)->unique_id < (*b)->unique_id) + return -1; + else if ((*a)->unique_id > (*b)->unique_id) + return 1; + return 0; +} + +/* This function finalizes the configuration parsing. It sets all the + * automatic ids. 
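+ *
+ * Assignment sketch (illustration only): with user-fixed ids {0, 2},
+ * the generator tries 0 (taken), assigns 1, tries 2 (taken), assigns
+ * 3, and so on; bsearch() over the sorted user-defined segment is
+ * what detects the collisions.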
+ */ +int pattern_finalize_config(void) +{ + size_t len = 0; + size_t unassigned_pos = 0; + int next_unique_id = 0; + size_t i, j; + struct pat_ref *ref, **arr; + struct list pr = LIST_HEAD_INIT(pr); + + pat_lru_seed = ha_random(); + + /* Count pat_refs with user defined unique_id and totalt count */ + list_for_each_entry(ref, &pattern_reference, list) { + len++; + if (ref->unique_id != -1) + unassigned_pos++; + } + + if (len == 0) { + return 0; + } + + arr = calloc(len, sizeof(*arr)); + if (arr == NULL) { + ha_alert("Out of memory error.\n"); + return ERR_ALERT | ERR_FATAL; + } + + i = 0; + j = unassigned_pos; + list_for_each_entry(ref, &pattern_reference, list) { + if (ref->unique_id != -1) + arr[i++] = ref; + else + arr[j++] = ref; + } + + /* Sort first segment of array with user-defined unique ids for + * fast lookup when generating unique ids + */ + qsort(arr, unassigned_pos, sizeof(*arr), cmp_pat_ref); + + /* Assign unique ids to the rest of the elements */ + for (i = unassigned_pos; i < len; i++) { + do { + arr[i]->unique_id = next_unique_id++; + } while (bsearch(&arr[i], arr, unassigned_pos, sizeof(*arr), cmp_pat_ref)); + } + + /* Sort complete array */ + qsort(arr, len, sizeof(*arr), cmp_pat_ref); + + /* Convert back to linked list */ + for (i = 0; i < len; i++) + LIST_APPEND(&pr, &arr[i]->list); + + /* swap root */ + LIST_INSERT(&pr, &pattern_reference); + LIST_DELETE(&pr); + + free(arr); + return 0; +} + +static int pattern_per_thread_lru_alloc() +{ + if (!global.tune.pattern_cache) + return 1; + pat_lru_tree = lru64_new(global.tune.pattern_cache); + return !!pat_lru_tree; +} + +static void pattern_per_thread_lru_free() +{ + lru64_destroy(pat_lru_tree); +} + +REGISTER_PER_THREAD_ALLOC(pattern_per_thread_lru_alloc); +REGISTER_PER_THREAD_FREE(pattern_per_thread_lru_free); diff --git a/src/payload.c b/src/payload.c new file mode 100644 index 0000000..6a536d7 --- /dev/null +++ b/src/payload.c @@ -0,0 +1,1448 @@ +/* + * General protocol-agnostic payload-based sample fetches and ACLs + * + * Copyright 2000-2013 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <stdlib.h> +#include <string.h> + +#include <haproxy/acl.h> +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/channel.h> +#include <haproxy/connection.h> +#include <haproxy/htx.h> +#include <haproxy/net_helper.h> +#include <haproxy/pattern.h> +#include <haproxy/payload.h> +#include <haproxy/sample.h> +#include <haproxy/stconn.h> +#include <haproxy/tools.h> + + +/************************************************************************/ +/* All supported sample fetch functions must be declared here */ +/************************************************************************/ + +/* wait for more data as long as possible, then return TRUE. This should be + * used with content inspection. 
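+ *
+ * (Behaviour sketch: while SMP_OPT_FINAL is not set, more data may
+ * still arrive, so the fetch sets SMP_F_MAY_CHANGE and returns 0;
+ * once the analysis is final, it returns a boolean "true".)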
+ */ +static int +smp_fetch_wait_end(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + if (!(smp->opt & SMP_OPT_FINAL)) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = 1; + return 1; +} + +/* return the number of bytes in the request buffer */ +static int +smp_fetch_len(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + if (smp->strm) { + struct channel *chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? &smp->strm->res : &smp->strm->req; + + /* Not accurate but kept for backward compatibility purpose */ + if (IS_HTX_STRM(smp->strm)) { + struct htx *htx = htxbuf(&chn->buf); + smp->data.u.sint = htx->data - co_data(chn); + } + else + smp->data.u.sint = ci_data(chn); + } + else if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) { + struct check *check = __objt_check(smp->sess->origin); + + /* Not accurate but kept for backward compatibility purpose */ + smp->data.u.sint = ((check->sc && IS_HTX_SC(check->sc)) ? (htxbuf(&check->bi))->data: b_data(&check->bi)); + } + else + return 0; + + smp->data.type = SMP_T_SINT; + smp->flags = SMP_F_VOLATILE | SMP_F_MAY_CHANGE; + return 1; +} + +/* Returns 0 if the client didn't send a SessionTicket Extension + * Returns 1 if the client sent SessionTicket Extension + * Returns 2 if the client also sent non-zero length SessionTicket + * Returns SMP_T_SINT data type + */ +static int +smp_fetch_req_ssl_st_ext(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int hs_len, ext_len, bleft; + struct channel *chn; + unsigned char *data; + + if (!smp->strm) + goto not_ssl_hello; + + /* meaningless for HTX buffers */ + if (IS_HTX_STRM(smp->strm)) + goto not_ssl_hello; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? 
&smp->strm->res : &smp->strm->req; + + + bleft = ci_data(chn); + data = (unsigned char *)ci_head(chn); + + /* Check for SSL/TLS Handshake */ + if (!bleft) + goto too_short; + if (*data != 0x16) + goto not_ssl_hello; + + /* Check for SSLv3 or later (SSL version >= 3.0) in the record layer*/ + if (bleft < 3) + goto too_short; + if (data[1] < 0x03) + goto not_ssl_hello; + + if (bleft < 5) + goto too_short; + hs_len = (data[3] << 8) + data[4]; + if (hs_len < 1 + 3 + 2 + 32 + 1 + 2 + 2 + 1 + 1 + 2 + 2) + goto not_ssl_hello; /* too short to have an extension */ + + data += 5; /* enter TLS handshake */ + bleft -= 5; + + /* Check for a complete client hello starting at <data> */ + if (bleft < 1) + goto too_short; + if (data[0] != 0x01) /* msg_type = Client Hello */ + goto not_ssl_hello; + + /* Check the Hello's length */ + if (bleft < 4) + goto too_short; + hs_len = (data[1] << 16) + (data[2] << 8) + data[3]; + if (hs_len < 2 + 32 + 1 + 2 + 2 + 1 + 1 + 2 + 2) + goto not_ssl_hello; /* too short to have an extension */ + + /* We want the full handshake here */ + if (bleft < hs_len) + goto too_short; + + data += 4; + /* Start of the ClientHello message */ + if (data[0] < 0x03 || data[1] < 0x01) /* TLSv1 minimum */ + goto not_ssl_hello; + + ext_len = data[34]; /* session_id_len */ + if (ext_len > 32 || ext_len > (hs_len - 35)) /* check for correct session_id len */ + goto not_ssl_hello; + + /* Jump to cipher suite */ + hs_len -= 35 + ext_len; + data += 35 + ext_len; + + if (hs_len < 4 || /* minimum one cipher */ + (ext_len = (data[0] << 8) + data[1]) < 2 || /* minimum 2 bytes for a cipher */ + ext_len > hs_len) + goto not_ssl_hello; + + /* Jump to the compression methods */ + hs_len -= 2 + ext_len; + data += 2 + ext_len; + + if (hs_len < 2 || /* minimum one compression method */ + data[0] < 1 || data[0] > hs_len) /* minimum 1 bytes for a method */ + goto not_ssl_hello; + + /* Jump to the extensions */ + hs_len -= 1 + data[0]; + data += 1 + data[0]; + + if (hs_len < 2 || /* minimum one extension list length */ + (ext_len = (data[0] << 8) + data[1]) > hs_len - 2) /* list too long */ + goto not_ssl_hello; + + hs_len = ext_len; /* limit ourselves to the extension length */ + data += 2; + + while (hs_len >= 4) { + int ext_type, ext_len; + + ext_type = (data[0] << 8) + data[1]; + ext_len = (data[2] << 8) + data[3]; + + if (ext_len > hs_len - 4) /* Extension too long */ + goto not_ssl_hello; + + /* SesstionTicket extension */ + if (ext_type == 35) { + smp->data.type = SMP_T_SINT; + /* SessionTicket also present */ + if (ext_len > 0) + smp->data.u.sint = 2; + /* SessionTicket absent */ + else + smp->data.u.sint = 1; + smp->flags = SMP_F_VOLATILE; + return 1; + } + + hs_len -= 4 + ext_len; + data += 4 + ext_len; + } + /* SessionTicket Extension not found */ + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + smp->flags = SMP_F_VOLATILE; + return 1; + + too_short: + smp->flags = SMP_F_MAY_CHANGE; + + not_ssl_hello: + return 0; +} + +/* Returns TRUE if the client sent Supported Elliptic Curves Extension (0x000a) + * Mainly used to detect if client supports ECC cipher suites. + */ +static int +smp_fetch_req_ssl_ec_ext(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int hs_len, ext_len, bleft; + struct channel *chn; + unsigned char *data; + + if (!smp->strm) + goto not_ssl_hello; + + /* meaningless for HTX buffers */ + if (IS_HTX_STRM(smp->strm)) + goto not_ssl_hello; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? 
&smp->strm->res : &smp->strm->req; + bleft = ci_data(chn); + data = (unsigned char *)ci_head(chn); + + /* Check for SSL/TLS Handshake */ + if (!bleft) + goto too_short; + if (*data != 0x16) + goto not_ssl_hello; + + /* Check for SSLv3 or later (SSL version >= 3.0) in the record layer*/ + if (bleft < 3) + goto too_short; + if (data[1] < 0x03) + goto not_ssl_hello; + + if (bleft < 5) + goto too_short; + hs_len = (data[3] << 8) + data[4]; + if (hs_len < 1 + 3 + 2 + 32 + 1 + 2 + 2 + 1 + 1 + 2 + 2) + goto not_ssl_hello; /* too short to have an extension */ + + data += 5; /* enter TLS handshake */ + bleft -= 5; + + /* Check for a complete client hello starting at <data> */ + if (bleft < 1) + goto too_short; + if (data[0] != 0x01) /* msg_type = Client Hello */ + goto not_ssl_hello; + + /* Check the Hello's length */ + if (bleft < 4) + goto too_short; + hs_len = (data[1] << 16) + (data[2] << 8) + data[3]; + if (hs_len < 2 + 32 + 1 + 2 + 2 + 1 + 1 + 2 + 2) + goto not_ssl_hello; /* too short to have an extension */ + + /* We want the full handshake here */ + if (bleft < hs_len) + goto too_short; + + data += 4; + /* Start of the ClientHello message */ + if (data[0] < 0x03 || data[1] < 0x01) /* TLSv1 minimum */ + goto not_ssl_hello; + + ext_len = data[34]; /* session_id_len */ + if (ext_len > 32 || ext_len > (hs_len - 35)) /* check for correct session_id len */ + goto not_ssl_hello; + + /* Jump to cipher suite */ + hs_len -= 35 + ext_len; + data += 35 + ext_len; + + if (hs_len < 4 || /* minimum one cipher */ + (ext_len = (data[0] << 8) + data[1]) < 2 || /* minimum 2 bytes for a cipher */ + ext_len > hs_len) + goto not_ssl_hello; + + /* Jump to the compression methods */ + hs_len -= 2 + ext_len; + data += 2 + ext_len; + + if (hs_len < 2 || /* minimum one compression method */ + data[0] < 1 || data[0] > hs_len) /* minimum 1 bytes for a method */ + goto not_ssl_hello; + + /* Jump to the extensions */ + hs_len -= 1 + data[0]; + data += 1 + data[0]; + + if (hs_len < 2 || /* minimum one extension list length */ + (ext_len = (data[0] << 8) + data[1]) > hs_len - 2) /* list too long */ + goto not_ssl_hello; + + hs_len = ext_len; /* limit ourselves to the extension length */ + data += 2; + + while (hs_len >= 4) { + int ext_type, ext_len; + + ext_type = (data[0] << 8) + data[1]; + ext_len = (data[2] << 8) + data[3]; + + if (ext_len > hs_len - 4) /* Extension too long */ + goto not_ssl_hello; + + /* Elliptic curves extension */ + if (ext_type == 10) { + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = 1; + smp->flags = SMP_F_VOLATILE; + return 1; + } + + hs_len -= 4 + ext_len; + data += 4 + ext_len; + } + /* server name not found */ + goto not_ssl_hello; + + too_short: + smp->flags = SMP_F_MAY_CHANGE; + + not_ssl_hello: + + return 0; +} +/* returns the type of SSL hello message (mainly used to detect an SSL hello) */ +static int +smp_fetch_ssl_hello_type(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int hs_len; + int hs_type, bleft; + struct channel *chn; + const unsigned char *data; + + if (!smp->strm) + goto not_ssl_hello; + + /* meaningless for HTX buffers */ + if (IS_HTX_STRM(smp->strm)) + goto not_ssl_hello; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? 
&smp->strm->res : &smp->strm->req; + bleft = ci_data(chn); + data = (const unsigned char *)ci_head(chn); + + if (!bleft) + goto too_short; + + if ((*data >= 0x14 && *data <= 0x17) || (*data == 0xFF)) { + /* SSLv3 header format */ + if (bleft < 9) + goto too_short; + + /* ssl version 3 */ + if ((data[1] << 16) + data[2] < 0x00030000) + goto not_ssl_hello; + + /* ssl message len must present handshake type and len */ + if ((data[3] << 8) + data[4] < 4) + goto not_ssl_hello; + + /* format introduced with SSLv3 */ + + hs_type = (int)data[5]; + hs_len = ( data[6] << 16 ) + ( data[7] << 8 ) + data[8]; + + /* not a full handshake */ + if (bleft < (9 + hs_len)) + goto too_short; + + } + else { + goto not_ssl_hello; + } + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = hs_type; + smp->flags = SMP_F_VOLATILE; + + return 1; + + too_short: + smp->flags = SMP_F_MAY_CHANGE; + + not_ssl_hello: + + return 0; +} + +/* Return the version of the SSL protocol in the request. It supports both + * SSLv3 (TLSv1) header format for any message, and SSLv2 header format for + * the hello message. The SSLv3 format is described in RFC 2246 p49, and the + * SSLv2 format is described here, and completed p67 of RFC 2246 : + * http://wp.netscape.com/eng/security/SSL_2.html + * + * Note: this decoder only works with non-wrapping data. + */ +static int +smp_fetch_req_ssl_ver(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int version, bleft, msg_len; + const unsigned char *data; + struct channel *req; + + if (!smp->strm) + goto not_ssl; + + /* meaningless for HTX buffers */ + if (IS_HTX_STRM(smp->strm)) + goto not_ssl; + + req = &smp->strm->req; + msg_len = 0; + bleft = ci_data(req); + if (!bleft) + goto too_short; + + data = (const unsigned char *)ci_head(req); + if ((*data >= 0x14 && *data <= 0x17) || (*data == 0xFF)) { + /* SSLv3 header format */ + if (bleft < 11) + goto too_short; + + version = (data[1] << 16) + data[2]; /* record layer version: major, minor */ + msg_len = (data[3] << 8) + data[4]; /* record length */ + + /* format introduced with SSLv3 */ + if (version < 0x00030000) + goto not_ssl; + + /* message length between 6 and 2^14 + 2048 */ + if (msg_len < 6 || msg_len > ((1<<14) + 2048)) + goto not_ssl; + + bleft -= 5; data += 5; + + /* return the client hello client version, not the record layer version */ + version = (data[4] << 16) + data[5]; /* client hello version: major, minor */ + } else { + /* SSLv2 header format, only supported for hello (msg type 1) */ + int rlen, plen, cilen, silen, chlen; + + if (*data & 0x80) { + if (bleft < 3) + goto too_short; + /* short header format : 15 bits for length */ + rlen = ((data[0] & 0x7F) << 8) | data[1]; + plen = 0; + bleft -= 2; data += 2; + } else { + if (bleft < 4) + goto too_short; + /* long header format : 14 bits for length + pad length */ + rlen = ((data[0] & 0x3F) << 8) | data[1]; + plen = data[2]; + bleft -= 3; data += 3; + } + + if (*data != 0x01) + goto not_ssl; + bleft--; data++; + + if (bleft < 8) + goto too_short; + version = (data[0] << 16) + data[1]; /* version: major, minor */ + cilen = (data[2] << 8) + data[3]; /* cipher len, multiple of 3 */ + silen = (data[4] << 8) + data[5]; /* session_id_len: 0 or 16 */ + chlen = (data[6] << 8) + data[7]; /* 16<=challenge length<=32 */ + + bleft -= 8; data += 8; + if (cilen % 3 != 0) + goto not_ssl; + if (silen && silen != 16) + goto not_ssl; + if (chlen < 16 || chlen > 32) + goto not_ssl; + if (rlen != 9 + cilen + silen + chlen) + goto not_ssl; + + /* focus on the remaining 
data length */ + msg_len = cilen + silen + chlen + plen; + } + /* We could recursively check that the buffer ends exactly on an SSL + * fragment boundary and that a possible next segment is still SSL, + * but that's a bit pointless. However, we could still check that + * all the part of the request which fits in a buffer is already + * there. + */ + if (msg_len > channel_recv_limit(req) + b_orig(&req->buf) - ci_head(req)) + msg_len = channel_recv_limit(req) + b_orig(&req->buf) - ci_head(req); + + if (bleft < msg_len) + goto too_short; + + /* OK that's enough. We have at least the whole message, and we have + * the protocol version. + */ + smp->data.type = SMP_T_SINT; + smp->data.u.sint = version; + smp->flags = SMP_F_VOLATILE; + return 1; + + too_short: + smp->flags = SMP_F_MAY_CHANGE; + not_ssl: + return 0; +} + +/* Try to extract the Server Name Indication that may be presented in a TLS + * client hello handshake message. The format of the message is the following + * (cf RFC5246 + RFC6066) : + * TLS frame : + * - uint8 type = 0x16 (Handshake) + * - uint16 version >= 0x0301 (TLSv1) + * - uint16 length (frame length) + * - TLS handshake : + * - uint8 msg_type = 0x01 (ClientHello) + * - uint24 length (handshake message length) + * - ClientHello : + * - uint16 client_version >= 0x0301 (TLSv1) + * - uint8 Random[32] (4 first ones are timestamp) + * - SessionID : + * - uint8 session_id_len (0..32) (SessionID len in bytes) + * - uint8 session_id[session_id_len] + * - CipherSuite : + * - uint16 cipher_len >= 2 (Cipher length in bytes) + * - uint16 ciphers[cipher_len/2] + * - CompressionMethod : + * - uint8 compression_len >= 1 (# of supported methods) + * - uint8 compression_methods[compression_len] + * - optional client_extension_len (in bytes) + * - optional sequence of ClientHelloExtensions (as many bytes as above): + * - uint16 extension_type = 0 for server_name + * - uint16 extension_len + * - opaque extension_data[extension_len] + * - uint16 server_name_list_len (# of bytes here) + * - opaque server_names[server_name_list_len bytes] + * - uint8 name_type = 0 for host_name + * - uint16 name_len + * - opaque hostname[name_len bytes] + */ +static int +smp_fetch_ssl_hello_sni(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int hs_len, ext_len, bleft; + struct channel *chn; + unsigned char *data; + + if (!smp->strm) + goto not_ssl_hello; + + /* meaningless for HTX buffers */ + if (IS_HTX_STRM(smp->strm)) + goto not_ssl_hello; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? 
&smp->strm->res : &smp->strm->req; + bleft = ci_data(chn); + data = (unsigned char *)ci_head(chn); + + /* Check for SSL/TLS Handshake */ + if (!bleft) + goto too_short; + if (*data != 0x16) + goto not_ssl_hello; + + /* Check for SSLv3 or later (SSL version >= 3.0) in the record layer*/ + if (bleft < 3) + goto too_short; + if (data[1] < 0x03) + goto not_ssl_hello; + + if (bleft < 5) + goto too_short; + hs_len = (data[3] << 8) + data[4]; + if (hs_len < 1 + 3 + 2 + 32 + 1 + 2 + 2 + 1 + 1 + 2 + 2) + goto not_ssl_hello; /* too short to have an extension */ + + data += 5; /* enter TLS handshake */ + bleft -= 5; + + /* Check for a complete client hello starting at <data> */ + if (bleft < 1) + goto too_short; + if (data[0] != 0x01) /* msg_type = Client Hello */ + goto not_ssl_hello; + + /* Check the Hello's length */ + if (bleft < 4) + goto too_short; + hs_len = (data[1] << 16) + (data[2] << 8) + data[3]; + if (hs_len < 2 + 32 + 1 + 2 + 2 + 1 + 1 + 2 + 2) + goto not_ssl_hello; /* too short to have an extension */ + + /* We want the full handshake here */ + if (bleft < hs_len) + goto too_short; + + data += 4; + /* Start of the ClientHello message */ + if (data[0] < 0x03 || data[1] < 0x01) /* TLSv1 minimum */ + goto not_ssl_hello; + + ext_len = data[34]; /* session_id_len */ + if (ext_len > 32 || ext_len > (hs_len - 35)) /* check for correct session_id len */ + goto not_ssl_hello; + + /* Jump to cipher suite */ + hs_len -= 35 + ext_len; + data += 35 + ext_len; + + if (hs_len < 4 || /* minimum one cipher */ + (ext_len = (data[0] << 8) + data[1]) < 2 || /* minimum 2 bytes for a cipher */ + ext_len > hs_len) + goto not_ssl_hello; + + /* Jump to the compression methods */ + hs_len -= 2 + ext_len; + data += 2 + ext_len; + + if (hs_len < 2 || /* minimum one compression method */ + data[0] < 1 || data[0] > hs_len) /* minimum 1 bytes for a method */ + goto not_ssl_hello; + + /* Jump to the extensions */ + hs_len -= 1 + data[0]; + data += 1 + data[0]; + + if (hs_len < 2 || /* minimum one extension list length */ + (ext_len = (data[0] << 8) + data[1]) > hs_len - 2) /* list too long */ + goto not_ssl_hello; + + hs_len = ext_len; /* limit ourselves to the extension length */ + data += 2; + + while (hs_len >= 4) { + int ext_type, name_type, srv_len, name_len; + + ext_type = (data[0] << 8) + data[1]; + ext_len = (data[2] << 8) + data[3]; + + if (ext_len > hs_len - 4) /* Extension too long */ + goto not_ssl_hello; + + if (ext_type == 0) { /* Server name */ + if (ext_len < 2) /* need one list length */ + goto not_ssl_hello; + + srv_len = (data[4] << 8) + data[5]; + if (srv_len < 4 || srv_len > hs_len - 6) + goto not_ssl_hello; /* at least 4 bytes per server name */ + + name_type = data[6]; + name_len = (data[7] << 8) + data[8]; + + if (name_type == 0) { /* hostname */ + smp->data.type = SMP_T_STR; + smp->data.u.str.area = (char *)data + 9; + smp->data.u.str.data = name_len; + smp->flags = SMP_F_VOLATILE | SMP_F_CONST; + return 1; + } + } + + hs_len -= 4 + ext_len; + data += 4 + ext_len; + } + /* server name not found */ + goto not_ssl_hello; + + too_short: + smp->flags = SMP_F_MAY_CHANGE; + + not_ssl_hello: + + return 0; +} + +/* Try to extract the Application-Layer Protocol Negotiation (ALPN) protocol + * names that may be presented in a TLS client hello handshake message. As the + * message presents a list of protocol names in descending order of preference, + * it may return iteratively. 
The format of the message is the following + * (cf RFC5246 + RFC7301) : + * TLS frame : + * - uint8 type = 0x16 (Handshake) + * - uint16 version >= 0x0301 (TLSv1) + * - uint16 length (frame length) + * - TLS handshake : + * - uint8 msg_type = 0x01 (ClientHello) + * - uint24 length (handshake message length) + * - ClientHello : + * - uint16 client_version >= 0x0301 (TLSv1) + * - uint8 Random[32] (4 first ones are timestamp) + * - SessionID : + * - uint8 session_id_len (0..32) (SessionID len in bytes) + * - uint8 session_id[session_id_len] + * - CipherSuite : + * - uint16 cipher_len >= 2 (Cipher length in bytes) + * - uint16 ciphers[cipher_len/2] + * - CompressionMethod : + * - uint8 compression_len >= 1 (# of supported methods) + * - uint8 compression_methods[compression_len] + * - optional client_extension_len (in bytes) + * - optional sequence of ClientHelloExtensions (as many bytes as above): + * - uint16 extension_type = 16 for application_layer_protocol_negotiation + * - uint16 extension_len + * - opaque extension_data[extension_len] + * - uint16 protocol_names_len (# of bytes here) + * - opaque protocol_names[protocol_names_len bytes] + * - uint8 name_len + * - opaque protocol_name[name_len bytes] + */ +static int +smp_fetch_ssl_hello_alpn(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int hs_len, ext_len, bleft; + struct channel *chn; + unsigned char *data; + + if (!smp->strm) + goto not_ssl_hello; + + /* meaningless for HTX buffers */ + if (IS_HTX_STRM(smp->strm)) + goto not_ssl_hello; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? &smp->strm->res : &smp->strm->req; + bleft = ci_data(chn); + data = (unsigned char *)ci_head(chn); + + /* Check for SSL/TLS Handshake */ + if (!bleft) + goto too_short; + if (*data != 0x16) + goto not_ssl_hello; + + /* Check for SSLv3 or later (SSL version >= 3.0) in the record layer*/ + if (bleft < 3) + goto too_short; + if (data[1] < 0x03) + goto not_ssl_hello; + + if (bleft < 5) + goto too_short; + hs_len = (data[3] << 8) + data[4]; + if (hs_len < 1 + 3 + 2 + 32 + 1 + 2 + 2 + 1 + 1 + 2 + 2) + goto not_ssl_hello; /* too short to have an extension */ + + data += 5; /* enter TLS handshake */ + bleft -= 5; + + /* Check for a complete client hello starting at <data> */ + if (bleft < 1) + goto too_short; + if (data[0] != 0x01) /* msg_type = Client Hello */ + goto not_ssl_hello; + + /* Check the Hello's length */ + if (bleft < 4) + goto too_short; + hs_len = (data[1] << 16) + (data[2] << 8) + data[3]; + if (hs_len < 2 + 32 + 1 + 2 + 2 + 1 + 1 + 2 + 2) + goto not_ssl_hello; /* too short to have an extension */ + + /* We want the full handshake here */ + if (bleft < hs_len) + goto too_short; + + data += 4; + /* Start of the ClientHello message */ + if (data[0] < 0x03 || data[1] < 0x01) /* TLSv1 minimum */ + goto not_ssl_hello; + + ext_len = data[34]; /* session_id_len */ + if (ext_len > 32 || ext_len > (hs_len - 35)) /* check for correct session_id len */ + goto not_ssl_hello; + + /* Jump to cipher suite */ + hs_len -= 35 + ext_len; + data += 35 + ext_len; + + if (hs_len < 4 || /* minimum one cipher */ + (ext_len = (data[0] << 8) + data[1]) < 2 || /* minimum 2 bytes for a cipher */ + ext_len > hs_len) + goto not_ssl_hello; + + /* Jump to the compression methods */ + hs_len -= 2 + ext_len; + data += 2 + ext_len; + + if (hs_len < 2 || /* minimum one compression method */ + data[0] < 1 || data[0] > hs_len) /* minimum 1 bytes for a method */ + goto not_ssl_hello; + + /* Jump to the extensions */ + hs_len -= 1 + data[0]; + 
data += 1 + data[0]; + + if (hs_len < 2 || /* minimum one extension list length */ + (ext_len = (data[0] << 8) + data[1]) > hs_len - 2) /* list too long */ + goto not_ssl_hello; + + hs_len = ext_len; /* limit ourselves to the extension length */ + data += 2; + + while (hs_len >= 4) { + int ext_type, name_len, name_offset; + + ext_type = (data[0] << 8) + data[1]; + ext_len = (data[2] << 8) + data[3]; + + if (ext_len > hs_len - 4) /* Extension too long */ + goto not_ssl_hello; + + if (ext_type == 16) { /* ALPN */ + if (ext_len < 3) /* one list length [uint16] + at least one name length [uint8] */ + goto not_ssl_hello; + + /* Name cursor in ctx, must begin after protocol_names_len */ + name_offset = smp->ctx.i < 6 ? 6 : smp->ctx.i; + name_len = data[name_offset]; + + if (name_len + name_offset - 3 > ext_len) + goto not_ssl_hello; + + smp->data.type = SMP_T_STR; + smp->data.u.str.area = (char *)data + name_offset + 1; /* +1 to skip name_len */ + smp->data.u.str.data = name_len; + smp->flags = SMP_F_VOLATILE | SMP_F_CONST; + + /* May have more protocol names remaining */ + if (name_len + name_offset - 3 < ext_len) { + smp->ctx.i = name_offset + name_len + 1; + smp->flags |= SMP_F_NOT_LAST; + } + + return 1; + } + + hs_len -= 4 + ext_len; + data += 4 + ext_len; + } + /* alpn not found */ + goto not_ssl_hello; + + too_short: + smp->flags = SMP_F_MAY_CHANGE; + + not_ssl_hello: + + return 0; +} + +/* Fetch the request RDP cookie identified in <cname>:<clen>, or any cookie if + * <clen> is empty (cname is then ignored). It returns the data into sample <smp> + * of type SMP_T_CSTR. Note: this decoder only works with non-wrapping data. + */ +int +fetch_rdp_cookie_name(struct stream *s, struct sample *smp, const char *cname, int clen) +{ + int bleft; + const unsigned char *data; + + smp->flags = SMP_F_CONST; + smp->data.type = SMP_T_STR; + + bleft = ci_data(&s->req); + if (bleft <= 11) + goto too_short; + + data = (const unsigned char *)ci_head(&s->req) + 11; + bleft -= 11; + + if (bleft <= 7) + goto too_short; + + if (strncasecmp((const char *)data, "Cookie:", 7) != 0) + goto not_cookie; + + data += 7; + bleft -= 7; + + while (bleft > 0 && *data == ' ') { + data++; + bleft--; + } + + if (clen) { + if (bleft <= clen) + goto too_short; + + if ((data[clen] != '=') || + strncasecmp(cname, (const char *)data, clen) != 0) + goto not_cookie; + + data += clen + 1; + bleft -= clen + 1; + } else { + while (bleft > 0 && *data != '=') { + if (*data == '\r' || *data == '\n') + goto not_cookie; + data++; + bleft--; + } + + if (bleft < 1) + goto too_short; + + if (*data != '=') + goto not_cookie; + + data++; + bleft--; + } + + /* data points to cookie value */ + smp->data.u.str.area = (char *)data; + smp->data.u.str.data = 0; + + while (bleft > 0 && *data != '\r') { + data++; + bleft--; + } + + if (bleft < 2) + goto too_short; + + if (data[0] != '\r' || data[1] != '\n') + goto not_cookie; + + smp->data.u.str.data = (char *)data - smp->data.u.str.area; + smp->flags = SMP_F_VOLATILE | SMP_F_CONST; + return 1; + + too_short: + smp->flags = SMP_F_MAY_CHANGE | SMP_F_CONST; + not_cookie: + return 0; +} + +/* Fetch the request RDP cookie identified in the args, or any cookie if no arg + * is passed. It is usable both for ACL and for samples. Note: this decoder + * only works with non-wrapping data. Accepts either 0 or 1 argument. Argument + * is a string (cookie name), other types will lead to undefined behaviour. The + * returned sample has type SMP_T_CSTR. 
+ */ +int +smp_fetch_rdp_cookie(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + if (!smp->strm) + return 0; + + /* meaningless for HTX buffers */ + if (IS_HTX_STRM(smp->strm)) + return 0; + + return fetch_rdp_cookie_name(smp->strm, smp, + args ? args->data.str.area : NULL, + args ? args->data.str.data : 0); +} + +/* returns either 1 or 0 depending on whether an RDP cookie is found or not */ +static int +smp_fetch_rdp_cookie_cnt(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int ret; + + ret = smp_fetch_rdp_cookie(args, smp, kw, private); + + if (smp->flags & SMP_F_MAY_CHANGE) + return 0; + + smp->flags = SMP_F_VOLATILE; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = ret; + return 1; +} + +/* extracts part of a payload with offset and length at a given position */ +static int +smp_fetch_payload_lv(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + unsigned int len_offset = arg_p[0].data.sint; + unsigned int len_size = arg_p[1].data.sint; + unsigned int buf_offset; + unsigned int buf_size = 0; + struct channel *chn = NULL; + char *head = NULL; + size_t max, data; + int i; + + /* Format is (len offset, len size, buf offset) or (len offset, len size) */ + /* by default buf offset == len offset + len size */ + /* buf offset could be absolute or relative to len offset + len size if prefixed by + or - */ + + if (smp->strm) { + /* meaningless for HTX buffers */ + if (IS_HTX_STRM(smp->strm)) + return 0; + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? &smp->strm->res : &smp->strm->req; + head = ci_head(chn); + data = ci_data(chn); + } + else if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) { + struct check *check = __objt_check(smp->sess->origin); + + /* meaningless for HTX buffers */ + if (check->sc && IS_HTX_SC(check->sc)) + return 0; + head = b_head(&check->bi); + data = b_data(&check->bi); + } + max = global.tune.bufsize; + if (!head) + goto too_short; + + if (len_offset + len_size > data) + goto too_short; + + for (i = 0; i < len_size; i++) { + buf_size = (buf_size << 8) + ((unsigned char *)head)[i + len_offset]; + } + + /* buf offset may be implicit, absolute or relative. If the LSB + * is set, then the offset is relative otherwise it is absolute. + */ + buf_offset = len_offset + len_size; + if (arg_p[2].type == ARGT_SINT) { + if (arg_p[2].data.sint & 1) + buf_offset += arg_p[2].data.sint >> 1; + else + buf_offset = arg_p[2].data.sint >> 1; + } + + if (!buf_size || buf_size > max || buf_offset + buf_size > max) { + /* will never match */ + smp->flags = 0; + return 0; + } + + if (buf_offset + buf_size > data) + goto too_short; + + /* init chunk as read only */ + smp->data.type = SMP_T_BIN; + smp->flags = SMP_F_VOLATILE | SMP_F_CONST; + chunk_initlen(&smp->data.u.str, head + buf_offset, 0, buf_size); + return 1; + + too_short: + smp->flags = SMP_F_MAY_CHANGE | SMP_F_CONST; + return 0; +} + +/* extracts some payload at a fixed position and length */ +static int +smp_fetch_payload(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + unsigned int buf_offset = arg_p[0].data.sint; + unsigned int buf_size = arg_p[1].data.sint; + struct channel *chn = NULL; + char *head = NULL; + size_t max, data; + + if (smp->strm) { + /* meaningless for HTX buffers */ + if (IS_HTX_STRM(smp->strm)) + return 0; + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? 
&smp->strm->res : &smp->strm->req; + head = ci_head(chn); + data = ci_data(chn); + } + else if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) { + struct check *check = __objt_check(smp->sess->origin); + + /* meaningless for HTX buffers */ + if (check->sc && IS_HTX_SC(check->sc)) + return 0; + head = b_head(&check->bi); + data = b_data(&check->bi); + } + max = global.tune.bufsize; + if (!head) + goto too_short; + + if (buf_size > max || buf_offset + buf_size > max) { + /* will never match */ + smp->flags = 0; + return 0; + } + if (buf_offset + buf_size > data) + goto too_short; + + /* init chunk as read only */ + smp->data.type = SMP_T_BIN; + smp->flags = SMP_F_VOLATILE | SMP_F_CONST; + chunk_initlen(&smp->data.u.str, head + buf_offset, 0, buf_size ? buf_size : (data - buf_offset)); + + if (!buf_size && chn && channel_may_recv(chn) && !channel_input_closed(chn)) + smp->flags |= SMP_F_MAY_CHANGE; + + return 1; + + too_short: + smp->flags = SMP_F_MAY_CHANGE | SMP_F_CONST; + return 0; +} + +/* This function is used to validate the arguments passed to a "payload_lv" fetch + * keyword. This keyword allows two positive integers and an optional signed one, + * with the second one being strictly positive and the third one being greater than + * the opposite of the two others if negative. It is assumed that the types are + * already the correct ones. Returns 0 on error, non-zero if OK. If <err_msg> is + * not NULL, it will be filled with a pointer to an error message in case of + * error, that the caller is responsible for freeing. The initial location must + * either be freeable or NULL. + * + * Note that offset2 is stored with SINT type, but its not directly usable as is. + * The value is contained in the 63 MSB and the LSB is used as a flag for marking + * the "relative" property of the value. + */ +int val_payload_lv(struct arg *arg, char **err_msg) +{ + int relative = 0; + const char *str; + + if (arg[0].data.sint < 0) { + memprintf(err_msg, "payload offset1 must be positive"); + return 0; + } + + if (!arg[1].data.sint) { + memprintf(err_msg, "payload length must be > 0"); + return 0; + } + + if (arg[2].type == ARGT_STR && arg[2].data.str.data > 0) { + long long int i; + + if (arg[2].data.str.area[0] == '+' || arg[2].data.str.area[0] == '-') + relative = 1; + str = arg[2].data.str.area; + i = read_int64(&str, str + arg[2].data.str.data); + if (*str != '\0') { + memprintf(err_msg, "payload offset2 is not a number"); + return 0; + } + chunk_destroy(&arg[2].data.str); + arg[2].type = ARGT_SINT; + arg[2].data.sint = i; + + if (arg[0].data.sint + arg[1].data.sint + arg[2].data.sint < 0) { + memprintf(err_msg, "payload offset2 too negative"); + return 0; + } + if (relative) + arg[2].data.sint = ( arg[2].data.sint << 1 ) + 1; + } + return 1; +} + +/* extracts the parameter value of a distcc token */ +static int +smp_fetch_distcc_param(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + unsigned int match_tok = arg_p[0].data.sint; + unsigned int match_occ = arg_p[1].data.sint; + unsigned int token; + unsigned int param; + unsigned int body; + unsigned int ofs; + unsigned int occ; + struct channel *chn; + int i; + + /* Format is (token[,occ]). occ starts at 1. */ + + if (!smp->strm) + return 0; + + /* meaningless for HTX buffers */ + if (IS_HTX_STRM(smp->strm)) + return 0; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? 
&smp->strm->res : &smp->strm->req; + + ofs = 0; occ = 0; + while (1) { + if (ofs + 12 > ci_data(chn)) { + /* not there yet but could it at least fit ? */ + if (!chn->buf.size) + goto too_short; + + if (ofs + 12 <= channel_recv_limit(chn) + b_orig(&chn->buf) - ci_head(chn)) + goto too_short; + + goto no_match; + } + + token = read_n32(ci_head(chn) + ofs); + ofs += 4; + + for (i = param = 0; i < 8; i++) { + int c = hex2i(ci_head(chn)[ofs + i]); + + if (c < 0) + goto no_match; + param = (param << 4) + c; + } + ofs += 8; + + /* these tokens don't have a body */ + if (token != 0x41524743 /* ARGC */ && token != 0x44495354 /* DIST */ && + token != 0x4E46494C /* NFIL */ && token != 0x53544154 /* STAT */ && + token != 0x444F4E45 /* DONE */) + body = param; + else + body = 0; + + if (token == match_tok) { + occ++; + if (!match_occ || match_occ == occ) { + /* found */ + smp->data.type = SMP_T_SINT; + smp->data.u.sint = param; + smp->flags = SMP_F_VOLATILE | SMP_F_CONST; + return 1; + } + } + ofs += body; + } + + too_short: + smp->flags = SMP_F_MAY_CHANGE | SMP_F_CONST; + return 0; + no_match: + /* will never match (end of buffer, or bad contents) */ + smp->flags = 0; + return 0; + +} + +/* extracts the (possibly truncated) body of a distcc token */ +static int +smp_fetch_distcc_body(const struct arg *arg_p, struct sample *smp, const char *kw, void *private) +{ + unsigned int match_tok = arg_p[0].data.sint; + unsigned int match_occ = arg_p[1].data.sint; + unsigned int token; + unsigned int param; + unsigned int ofs; + unsigned int occ; + unsigned int body; + struct channel *chn; + int i; + + /* Format is (token[,occ]). occ starts at 1. */ + + if (!smp->strm) + return 0; + + /* meaningless for HTX buffers */ + if (IS_HTX_STRM(smp->strm)) + return 0; + + chn = ((smp->opt & SMP_OPT_DIR) == SMP_OPT_DIR_RES) ? &smp->strm->res : &smp->strm->req; + + ofs = 0; occ = 0; + while (1) { + if (ofs + 12 > ci_data(chn)) { + if (!chn->buf.size) + goto too_short; + + if (ofs + 12 <= channel_recv_limit(chn) + b_orig(&chn->buf) - ci_head(chn)) + goto too_short; + + goto no_match; + } + + token = read_n32(ci_head(chn) + ofs); + ofs += 4; + + for (i = param = 0; i < 8; i++) { + int c = hex2i(ci_head(chn)[ofs + i]); + + if (c < 0) + goto no_match; + param = (param << 4) + c; + } + ofs += 8; + + /* these tokens don't have a body */ + if (token != 0x41524743 /* ARGC */ && token != 0x44495354 /* DIST */ && + token != 0x4E46494C /* NFIL */ && token != 0x53544154 /* STAT */ && + token != 0x444F4E45 /* DONE */) + body = param; + else + body = 0; + + if (token == match_tok) { + occ++; + if (!match_occ || match_occ == occ) { + /* found */ + + smp->data.type = SMP_T_BIN; + smp->flags = SMP_F_VOLATILE | SMP_F_CONST; + + if (ofs + body > ci_head(chn) - b_orig(&chn->buf) + ci_data(chn)) { + /* incomplete body */ + + if (ofs + body > channel_recv_limit(chn) + b_orig(&chn->buf) - ci_head(chn)) { + /* truncate it to whatever will fit */ + smp->flags |= SMP_F_MAY_CHANGE; + body = channel_recv_limit(chn) + b_orig(&chn->buf) - ci_head(chn) - ofs; + } + } + + chunk_initlen(&smp->data.u.str, ci_head(chn) + ofs, 0, body); + return 1; + } + } + ofs += body; + } + + too_short: + smp->flags = SMP_F_MAY_CHANGE | SMP_F_CONST; + return 0; + no_match: + /* will never match (end of buffer, or bad contents) */ + smp->flags = 0; + return 0; + +} + +/* This function is used to validate the arguments passed to a "distcc_param" or + * "distcc_body" sample fetch keyword. 
They take a mandatory token name of exactly + * 4 characters, followed by an optional occurrence number starting at 1. It is + * assumed that the types are already the correct ones. Returns 0 on error, non- + * zero if OK. If <err_msg> is not NULL, it will be filled with a pointer to an + * error message in case of error, that the caller is responsible for freeing. + * The initial location must either be freeable or NULL. + */ +int val_distcc(struct arg *arg, char **err_msg) +{ + unsigned int token; + + if (arg[0].data.str.data != 4) { + memprintf(err_msg, "token name must be exactly 4 characters"); + return 0; + } + + /* convert the token name to an unsigned int (one byte per character, + * big endian format). + */ + token = (arg[0].data.str.area[0] << 24) + (arg[0].data.str.area[1] << 16) + + (arg[0].data.str.area[2] << 8) + (arg[0].data.str.area[3] << 0); + + chunk_destroy(&arg[0].data.str); + arg[0].type = ARGT_SINT; + arg[0].data.sint = token; + + if (arg[1].type != ARGT_SINT) { + arg[1].type = ARGT_SINT; + arg[1].data.sint = 0; + } + return 1; +} + +/************************************************************************/ +/* All supported sample and ACL keywords must be declared here. */ +/************************************************************************/ + +/* Note: must not be declared <const> as its list will be overwritten. + * Note: fetches that may return multiple types should be declared using the + * appropriate pseudo-type. If not available it must be declared as the lowest + * common denominator, the type that can be casted into all other ones. + */ +static struct sample_fetch_kw_list smp_kws = {ILH, { + { "distcc_body", smp_fetch_distcc_body, ARG2(1,STR,SINT), val_distcc, SMP_T_BIN, SMP_USE_L6REQ|SMP_USE_L6RES }, + { "distcc_param", smp_fetch_distcc_param, ARG2(1,STR,SINT), val_distcc, SMP_T_SINT, SMP_USE_L6REQ|SMP_USE_L6RES }, + { "payload", smp_fetch_payload, ARG2(2,SINT,SINT), NULL, SMP_T_BIN, SMP_USE_L6REQ|SMP_USE_L6RES }, + { "payload_lv", smp_fetch_payload_lv, ARG3(2,SINT,SINT,STR), val_payload_lv, SMP_T_BIN, SMP_USE_L6REQ|SMP_USE_L6RES }, + { "rdp_cookie", smp_fetch_rdp_cookie, ARG1(0,STR), NULL, SMP_T_STR, SMP_USE_L6REQ }, + { "rdp_cookie_cnt", smp_fetch_rdp_cookie_cnt, ARG1(0,STR), NULL, SMP_T_SINT, SMP_USE_L6REQ }, + { "rep_ssl_hello_type", smp_fetch_ssl_hello_type, 0, NULL, SMP_T_SINT, SMP_USE_L6RES }, + { "req_len", smp_fetch_len, 0, NULL, SMP_T_SINT, SMP_USE_L6REQ }, + { "req_ssl_hello_type", smp_fetch_ssl_hello_type, 0, NULL, SMP_T_SINT, SMP_USE_L6REQ }, + { "req_ssl_sni", smp_fetch_ssl_hello_sni, 0, NULL, SMP_T_STR, SMP_USE_L6REQ }, + { "req_ssl_ver", smp_fetch_req_ssl_ver, 0, NULL, SMP_T_SINT, SMP_USE_L6REQ }, + + { "req.len", smp_fetch_len, 0, NULL, SMP_T_SINT, SMP_USE_L6REQ }, + { "req.payload", smp_fetch_payload, ARG2(2,SINT,SINT), NULL, SMP_T_BIN, SMP_USE_L6REQ }, + { "req.payload_lv", smp_fetch_payload_lv, ARG3(2,SINT,SINT,STR), val_payload_lv, SMP_T_BIN, SMP_USE_L6REQ }, + { "req.rdp_cookie", smp_fetch_rdp_cookie, ARG1(0,STR), NULL, SMP_T_STR, SMP_USE_L6REQ }, + { "req.rdp_cookie_cnt", smp_fetch_rdp_cookie_cnt, ARG1(0,STR), NULL, SMP_T_SINT, SMP_USE_L6REQ }, + { "req.ssl_ec_ext", smp_fetch_req_ssl_ec_ext, 0, NULL, SMP_T_BOOL, SMP_USE_L6REQ }, + { "req.ssl_st_ext", smp_fetch_req_ssl_st_ext, 0, NULL, SMP_T_SINT, SMP_USE_L6REQ }, + { "req.ssl_hello_type", smp_fetch_ssl_hello_type, 0, NULL, SMP_T_SINT, SMP_USE_L6REQ }, + { "req.ssl_sni", smp_fetch_ssl_hello_sni, 0, NULL, SMP_T_STR, SMP_USE_L6REQ }, + { "req.ssl_alpn", smp_fetch_ssl_hello_alpn, 0, 
NULL, SMP_T_STR, SMP_USE_L6REQ }, + { "req.ssl_ver", smp_fetch_req_ssl_ver, 0, NULL, SMP_T_SINT, SMP_USE_L6REQ }, + { "res.len", smp_fetch_len, 0, NULL, SMP_T_SINT, SMP_USE_L6RES }, + { "res.payload", smp_fetch_payload, ARG2(2,SINT,SINT), NULL, SMP_T_BIN, SMP_USE_L6RES }, + { "res.payload_lv", smp_fetch_payload_lv, ARG3(2,SINT,SINT,STR), val_payload_lv, SMP_T_BIN, SMP_USE_L6RES }, + { "res.ssl_hello_type", smp_fetch_ssl_hello_type, 0, NULL, SMP_T_SINT, SMP_USE_L6RES }, + { "wait_end", smp_fetch_wait_end, 0, NULL, SMP_T_BOOL, SMP_USE_INTRN }, + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &smp_kws); + +/* Note: must not be declared <const> as its list will be overwritten. + * Please take care of keeping this list alphabetically sorted. + */ +static struct acl_kw_list acl_kws = {ILH, { + { "payload", "req.payload", PAT_MATCH_BIN }, + { "payload_lv", "req.payload_lv", PAT_MATCH_BIN }, + { "req_rdp_cookie", "req.rdp_cookie", PAT_MATCH_STR }, + { "req_rdp_cookie_cnt", "req.rdp_cookie_cnt", PAT_MATCH_INT }, + { "req_ssl_sni", "req.ssl_sni", PAT_MATCH_STR }, + { "req_ssl_ver", "req.ssl_ver", PAT_MATCH_INT, pat_parse_dotted_ver }, + { "req.ssl_ver", "req.ssl_ver", PAT_MATCH_INT, pat_parse_dotted_ver }, + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, acl_register_keywords, &acl_kws); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/peers.c b/src/peers.c new file mode 100644 index 0000000..5eefd18 --- /dev/null +++ b/src/peers.c @@ -0,0 +1,4231 @@ +/* + * Peer synchro management. + * + * Copyright 2010 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ *
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <import/eb32tree.h>
+#include <import/ebmbtree.h>
+#include <import/ebpttree.h>
+
+#include <haproxy/api.h>
+#include <haproxy/applet.h>
+#include <haproxy/cfgparse.h>
+#include <haproxy/channel.h>
+#include <haproxy/cli.h>
+#include <haproxy/dict.h>
+#include <haproxy/errors.h>
+#include <haproxy/fd.h>
+#include <haproxy/frontend.h>
+#include <haproxy/net_helper.h>
+#include <haproxy/obj_type-t.h>
+#include <haproxy/peers.h>
+#include <haproxy/proxy.h>
+#include <haproxy/sc_strm.h>
+#include <haproxy/session-t.h>
+#include <haproxy/signal.h>
+#include <haproxy/stats-t.h>
+#include <haproxy/stconn.h>
+#include <haproxy/stick_table.h>
+#include <haproxy/stream.h>
+#include <haproxy/task.h>
+#include <haproxy/thread.h>
+#include <haproxy/time.h>
+#include <haproxy/tools.h>
+#include <haproxy/trace.h>
+
+
+/*******************************/
+/* Current peer learning state */
+/*******************************/
+
+/**************************************/
+/* Current peers section resync state */
+/**************************************/
+#define PEERS_F_RESYNC_LOCAL		0x00000001 /* Learning from the local peer finished or no longer needed */
+#define PEERS_F_RESYNC_REMOTE		0x00000002 /* Learning from a remote peer finished or no longer needed */
+#define PEERS_F_RESYNC_ASSIGN		0x00000004 /* A peer was assigned to learn our lesson */
+#define PEERS_F_RESYNC_PROCESS		0x00000008 /* The assigned peer was requested for resync */
+#define PEERS_F_RESYNC_LOCALTIMEOUT	0x00000010 /* Timeout waiting for a full resync from a local node */
+#define PEERS_F_RESYNC_REMOTETIMEOUT	0x00000020 /* Timeout waiting for a full resync from a remote node */
+#define PEERS_F_RESYNC_LOCALABORT	0x00000040 /* Session aborted learning from a local node */
+#define PEERS_F_RESYNC_REMOTEABORT	0x00000080 /* Session aborted learning from a remote node */
+#define PEERS_F_RESYNC_LOCALFINISHED	0x00000100 /* A local node taught us and was fully up to date */
+#define PEERS_F_RESYNC_REMOTEFINISHED	0x00000200 /* A remote node taught us and was fully up to date */
+#define PEERS_F_RESYNC_LOCALPARTIAL	0x00000400 /* A local node taught us but was only partially up to date */
+#define PEERS_F_RESYNC_REMOTEPARTIAL	0x00000800 /* A remote node taught us but was only partially up to date */
+#define PEERS_F_RESYNC_LOCALASSIGN	0x00001000 /* A local node was assigned for a full resync */
+#define PEERS_F_RESYNC_REMOTEASSIGN	0x00002000 /* A remote node was assigned for a full resync */
+#define PEERS_F_RESYNC_REQUESTED	0x00004000 /* A resync was explicitly requested */
+#define PEERS_F_DONOTSTOP		0x00010000 /* The main table sync task blocks the process during
+						      soft stop to push data to the new process */
+
+#define PEERS_RESYNC_STATEMASK		(PEERS_F_RESYNC_LOCAL|PEERS_F_RESYNC_REMOTE)
+#define PEERS_RESYNC_FROMLOCAL		0x00000000
+#define PEERS_RESYNC_FROMREMOTE		PEERS_F_RESYNC_LOCAL
+#define PEERS_RESYNC_FINISHED		(PEERS_F_RESYNC_LOCAL|PEERS_F_RESYNC_REMOTE)
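+/* Editor's illustrative sketch, not part of the upstream file: the two
+ * low-order resync bits above form a small state machine going from
+ * PEERS_RESYNC_FROMLOCAL (neither bit set) to PEERS_RESYNC_FROMREMOTE
+ * (local learning done) to PEERS_RESYNC_FINISHED (both done). A
+ * hypothetical predicate over those flags could be written as:
+ */
+static inline int peers_resync_finished_example(unsigned int peers_flags)
+{
+	/* true once learning from both local and remote peers is over */
+	return (peers_flags & PEERS_RESYNC_STATEMASK) == PEERS_RESYNC_FINISHED;
+}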
+
+/***********************************/
+/* Current shared table sync state */
+/***********************************/
+#define SHTABLE_F_TEACH_STAGE1		0x00000001 /* Teach state 1 complete */
+#define SHTABLE_F_TEACH_STAGE2		0x00000002 /* Teach state 2 complete */
+
+/******************************/
+/* Remote peer teaching state */
+/******************************/
+#define PEER_F_TEACH_PROCESS		0x00000001 /* Teach a lesson to the current peer */
+#define PEER_F_TEACH_FINISHED		0x00000008 /* Teaching finished (wait for confirm) */
+#define PEER_F_TEACH_COMPLETE		0x00000010 /* All that we know was already taught to the current peer, used only for a local peer */
+#define PEER_F_LEARN_ASSIGN		0x00000100 /* Current peer was assigned for a lesson */
+#define PEER_F_LEARN_NOTUP2DATE		0x00000200 /* Learning from this peer finished but the peer was not up to date */
+#define PEER_F_ALIVE			0x20000000 /* Used to flag a peer as alive. */
+#define PEER_F_HEARTBEAT		0x40000000 /* Heartbeat message to send. */
+#define PEER_F_DWNGRD			0x80000000 /* When this flag is enabled, we must downgrade the supported version announced during peer sessions. */
+
+#define PEER_TEACH_RESET		~(PEER_F_TEACH_PROCESS|PEER_F_TEACH_FINISHED) /* PEER_F_TEACH_COMPLETE should never be reset */
+#define PEER_LEARN_RESET		~(PEER_F_LEARN_ASSIGN|PEER_F_LEARN_NOTUP2DATE)
+
+#define PEER_RESYNC_TIMEOUT		5000 /* 5 seconds */
+#define PEER_RECONNECT_TIMEOUT		5000 /* 5 seconds */
+#define PEER_LOCAL_RECONNECT_TIMEOUT	500  /* 500ms */
+#define PEER_HEARTBEAT_TIMEOUT		3000 /* 3 seconds */
+
+/* default maximum of updates sent at once */
+#define PEER_DEF_MAX_UPDATES_AT_ONCE	200
+
+/* flags for "show peers" */
+#define PEERS_SHOW_F_DICT		0x00000001 /* also show the contents of the dictionary */
+
+/*****************************/
+/* Sync message class        */
+/*****************************/
+enum {
+	PEER_MSG_CLASS_CONTROL = 0,
+	PEER_MSG_CLASS_ERROR,
+	PEER_MSG_CLASS_STICKTABLE = 10,
+	PEER_MSG_CLASS_RESERVED = 255,
+};
+
+/*****************************/
+/* control message types     */
+/*****************************/
+enum {
+	PEER_MSG_CTRL_RESYNCREQ = 0,
+	PEER_MSG_CTRL_RESYNCFINISHED,
+	PEER_MSG_CTRL_RESYNCPARTIAL,
+	PEER_MSG_CTRL_RESYNCCONFIRM,
+	PEER_MSG_CTRL_HEARTBEAT,
+};
+
+/*****************************/
+/* error message types       */
+/*****************************/
+enum {
+	PEER_MSG_ERR_PROTOCOL = 0,
+	PEER_MSG_ERR_SIZELIMIT,
+};
+
+/* network key types;
+ * network types were directly and mistakenly
+ * mapped on sample types; to keep backward
+ * compatibility we keep those values, but
+ * we now use an internal/network mapping
+ * to avoid further mistakes when adding or
+ * modifying internal types
+ */
+enum {
+	PEER_KT_ANY = 0,  /* any type */
+	PEER_KT_RESV1,    /* UNUSED */
+	PEER_KT_SINT,     /* signed 64bits integer type */
+	PEER_KT_RESV3,    /* UNUSED */
+	PEER_KT_IPV4,     /* ipv4 type */
+	PEER_KT_IPV6,     /* ipv6 type */
+	PEER_KT_STR,      /* char string type */
+	PEER_KT_BIN,      /* buffer type */
+	PEER_KT_TYPES     /* number of types, must always be last */
+};
+
+/* Map used to retrieve the network type from the internal type
+ * Note: an undeclared mapping maps the entry to PEER_KT_ANY == 0
+ */
+static int peer_net_key_type[SMP_TYPES] = {
+	[SMP_T_SINT] = PEER_KT_SINT,
+	[SMP_T_IPV4] = PEER_KT_IPV4,
+	[SMP_T_IPV6] = PEER_KT_IPV6,
+	[SMP_T_STR]  = PEER_KT_STR,
+	[SMP_T_BIN]  = PEER_KT_BIN,
+};
+
+/* Map used to retrieve the internal type from the external type
+ * Note: an undeclared mapping maps the entry to SMP_T_ANY == 0
+ */
+static int peer_int_key_type[PEER_KT_TYPES] = {
+	[PEER_KT_SINT] = SMP_T_SINT,
+	[PEER_KT_IPV4] = SMP_T_IPV4,
+	[PEER_KT_IPV6] = SMP_T_IPV6,
+	[PEER_KT_STR]  = SMP_T_STR,
+	[PEER_KT_BIN]  = SMP_T_BIN,
+};
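+/* Editor's illustrative sketch, not upstream code: the two maps above are
+ * only populated for the five supported key types, so any other sample type
+ * silently degrades to PEER_KT_ANY (0) on the wire and back to SMP_T_ANY (0)
+ * when decoding. A hypothetical self-check of the round-trip property:
+ */
+static inline int peer_key_type_is_exchangeable(int smp_type)
+{
+	/* true only for types with a dedicated network encoding */
+	return smp_type > 0 && smp_type < SMP_TYPES &&
+	       peer_int_key_type[peer_net_key_type[smp_type]] == smp_type;
+}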
+/*
+ * Parameters used by functions to build peer protocol messages.
+ */
+struct peer_prep_params {
+	struct {
+		struct peer *peer;
+	} hello;
+	struct {
+		unsigned int st1;
+	} error_status;
+	struct {
+		struct stksess *stksess;
+		struct shared_table *shared_table;
+		unsigned int updateid;
+		int use_identifier;
+		int use_timed;
+		struct peer *peer;
+	} updt;
+	struct {
+		struct shared_table *shared_table;
+	} swtch;
+	struct {
+		struct shared_table *shared_table;
+	} ack;
+	struct {
+		unsigned char head[2];
+	} control;
+	struct {
+		unsigned char head[2];
+	} error;
+};
+
+/**********************************************/
+/* stick table sync message types             */
+/* Note: messages with an id >= 128 also      */
+/* carry data after the message identifier    */
+/**********************************************/
+#define PEER_MSG_STKT_UPDATE		0x80
+#define PEER_MSG_STKT_INCUPDATE		0x81
+#define PEER_MSG_STKT_DEFINE		0x82
+#define PEER_MSG_STKT_SWITCH		0x83
+#define PEER_MSG_STKT_ACK		0x84
+#define PEER_MSG_STKT_UPDATE_TIMED	0x85
+#define PEER_MSG_STKT_INCUPDATE_TIMED	0x86
+/* All the stick-table message identifiers above have the #7 bit set */
+#define PEER_MSG_STKT_BIT		7
+#define PEER_MSG_STKT_BIT_MASK		(1 << PEER_MSG_STKT_BIT)
+
+/* The maximum length of an encoded data length. */
+#define PEER_MSG_ENC_LENGTH_MAXLEN	5
+
+/* Minimum 64-bits value encoded with 2 bytes */
+#define PEER_ENC_2BYTES_MIN		0xf0 /* 0xf0 (or 240) */
+/* 3 bytes */
+#define PEER_ENC_3BYTES_MIN		((1ULL << 11) | PEER_ENC_2BYTES_MIN) /* 0x8f0 (or 2288) */
+/* 4 bytes */
+#define PEER_ENC_4BYTES_MIN		((1ULL << 18) | PEER_ENC_3BYTES_MIN) /* 0x408f0 (or 264432) */
+/* 5 bytes */
+#define PEER_ENC_5BYTES_MIN		((1ULL << 25) | PEER_ENC_4BYTES_MIN) /* 0x20408f0 (or 33818864) */
+/* 6 bytes */
+#define PEER_ENC_6BYTES_MIN		((1ULL << 32) | PEER_ENC_5BYTES_MIN) /* 0x1020408f0 (or 4328786160) */
+/* 7 bytes */
+#define PEER_ENC_7BYTES_MIN		((1ULL << 39) | PEER_ENC_6BYTES_MIN) /* 0x81020408f0 (or 554084600048) */
+/* 8 bytes */
+#define PEER_ENC_8BYTES_MIN		((1ULL << 46) | PEER_ENC_7BYTES_MIN) /* 0x4081020408f0 (or 70922828777712) */
+/* 9 bytes */
+#define PEER_ENC_9BYTES_MIN		((1ULL << 53) | PEER_ENC_8BYTES_MIN) /* 0x204081020408f0 (or 9078122083518704) */
+/* 10 bytes */
+#define PEER_ENC_10BYTES_MIN		((1ULL << 60) | PEER_ENC_9BYTES_MIN) /* 0x10204081020408f0 (or 1161999626690365680) */
+
+/* #7 bit used to detect the last byte to be encoded */
+#define PEER_ENC_STOP_BIT		7
+/* The byte minimum value with #7 bit set */
+#define PEER_ENC_STOP_BYTE		(1 << PEER_ENC_STOP_BIT)
+/* The left most number of bits set for PEER_ENC_2BYTES_MIN */
+#define PEER_ENC_2BYTES_MIN_BITS	4
+
+#define PEER_MSG_HEADER_LEN		2
+
+#define PEER_STKT_CACHE_MAX_ENTRIES	128
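+/* Editor's illustrative sketch, not upstream code: a worked example of the
+ * variable-length integer format defined by the constants above. Values
+ * below PEER_ENC_2BYTES_MIN (240) fit in a single byte; larger values start
+ * with a byte in the 0xf0..0xff range carrying the 4 low-order bits, then
+ * continue with 7-bit bytes flagged by PEER_ENC_STOP_BYTE. For instance,
+ * 1000 encodes as 0xf8 0x2f: 0xf8 = (1000 | 0xf0) & 0xff, then
+ * (1000 - 240) >> 4 = 47 = 0x2f, which is below 0x80 and thus final.
+ */
+int intencode(uint64_t i, char **str);      /* both defined further down */
+uint64_t intdecode(char **str, char *end);
+
+static inline int peer_varint_roundtrip_example(void)
+{
+	char buf[PEER_MSG_ENC_LENGTH_MAXLEN];
+	char *p = buf;
+
+	intencode(1000, &p);  /* writes 0xf8 0x2f and advances <p> by two */
+	p = buf;
+	return intdecode(&p, buf + sizeof(buf)) == 1000; /* yields true */
+}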
+
+/**********************************/
+/* Peer Session IO handler states */
+/**********************************/
+
+enum {
+	PEER_SESS_ST_ACCEPT = 0,   /* Initial state for a session created by an accept, must be zero! */
+	PEER_SESS_ST_GETVERSION,   /* Validate the supported protocol version */
+	PEER_SESS_ST_GETHOST,      /* Validate that the host ID corresponds to the local host id */
+	PEER_SESS_ST_GETPEER,      /* Validate that the peer ID corresponds to a known remote peer id */
+	/* after this point, data may have been exchanged */
+	PEER_SESS_ST_SENDSUCCESS,  /* Send ret code 200 (success) and wait for message */
+	PEER_SESS_ST_CONNECT,      /* Initial state for a session created on a connect, push presentation into buffer */
+	PEER_SESS_ST_GETSTATUS,    /* Wait for the welcome message */
+	PEER_SESS_ST_WAITMSG,      /* Wait for data messages */
+	PEER_SESS_ST_EXIT,         /* Exit with status code */
+	PEER_SESS_ST_ERRPROTO,     /* Send error proto message before exit */
+	PEER_SESS_ST_ERRSIZE,      /* Send error size message before exit */
+	PEER_SESS_ST_END,          /* Killed session */
+};
+
+/***************************************************/
+/* Peer Session status code - part of the protocol */
+/***************************************************/
+
+#define PEER_SESS_SC_CONNECTCODE	100 /* connect in progress */
+#define PEER_SESS_SC_CONNECTEDCODE	110 /* tcp connect success */
+
+#define PEER_SESS_SC_SUCCESSCODE	200 /* accept or connect successful */
+
+#define PEER_SESS_SC_TRYAGAIN		300 /* try again later */
+
+#define PEER_SESS_SC_ERRPROTO		501 /* error protocol */
+#define PEER_SESS_SC_ERRVERSION		502 /* unknown protocol version */
+#define PEER_SESS_SC_ERRHOST		503 /* bad host name */
+#define PEER_SESS_SC_ERRPEER		504 /* unknown peer */
+
+#define PEER_SESSION_PROTO_NAME		"HAProxyS"
+#define PEER_MAJOR_VER			2
+#define PEER_MINOR_VER			1
+#define PEER_DWNGRD_MINOR_VER		0
+
+static size_t proto_len = sizeof(PEER_SESSION_PROTO_NAME) - 1;
+struct peers *cfg_peers = NULL;
+static int peers_max_updates_at_once = PEER_DEF_MAX_UPDATES_AT_ONCE;
+static void peer_session_forceshutdown(struct peer *peer);
+
+static struct ebpt_node *dcache_tx_insert(struct dcache *dc,
+                                          struct dcache_tx_entry *i);
+static inline void flush_dcache(struct peer *peer);
+
+/* trace source and events */
+static void peers_trace(enum trace_level level, uint64_t mask,
+                        const struct trace_source *src,
+                        const struct ist where, const struct ist func,
+                        const void *a1, const void *a2, const void *a3, const void *a4);
+
+static const struct trace_event peers_trace_events[] = {
+#define PEERS_EV_UPDTMSG	(1 << 0)
+	{ .mask = PEERS_EV_UPDTMSG,  .name = "updtmsg",  .desc = "update message received" },
+#define PEERS_EV_ACKMSG		(1 << 1)
+	{ .mask = PEERS_EV_ACKMSG,   .name = "ackmsg",   .desc = "ack message received" },
+#define PEERS_EV_SWTCMSG	(1 << 2)
+	{ .mask = PEERS_EV_SWTCMSG,  .name = "swtcmsg",  .desc = "switch message received" },
+#define PEERS_EV_DEFMSG		(1 << 3)
+	{ .mask = PEERS_EV_DEFMSG,   .name = "defmsg",   .desc = "definition message received" },
+#define PEERS_EV_CTRLMSG	(1 << 4)
+	{ .mask = PEERS_EV_CTRLMSG,  .name = "ctrlmsg",  .desc = "control message sent/received" },
+#define PEERS_EV_SESSREL	(1 << 5)
+	{ .mask = PEERS_EV_SESSREL,  .name = "sessrl",   .desc = "peer session releasing" },
+#define PEERS_EV_PROTOERR	(1 << 6)
+	{ .mask = PEERS_EV_PROTOERR, .name = "protoerr", .desc = "protocol error" },
+};
+
+static const struct name_desc peers_trace_lockon_args[4] = {
+	/* arg1 */ { /* already used by the connection */ },
+	/* arg2 */ { .name="peers", .desc="Peers protocol" },
+	/* arg3 */ { },
+	/* arg4 */ { }
+};
+
+static const struct name_desc peers_trace_decoding[] = {
+#define PEERS_VERB_CLEAN    1
+	{ .name="clean",    .desc="only user-friendly stuff, generally suitable for level \"user\"" },
+	{ /* end */ }
+}; + + +struct trace_source trace_peers = { + .name = IST("peers"), + .desc = "Peers protocol", + .arg_def = TRC_ARG1_CONN, /* TRACE()'s first argument is always a connection */ + .default_cb = peers_trace, + .known_events = peers_trace_events, + .lockon_args = peers_trace_lockon_args, + .decoding = peers_trace_decoding, + .report_events = ~0, /* report everything by default */ +}; + +/* Return peer control message types as strings (only for debugging purpose). */ +static inline char *ctrl_msg_type_str(unsigned int type) +{ + switch (type) { + case PEER_MSG_CTRL_RESYNCREQ: + return "RESYNCREQ"; + case PEER_MSG_CTRL_RESYNCFINISHED: + return "RESYNCFINISHED"; + case PEER_MSG_CTRL_RESYNCPARTIAL: + return "RESYNCPARTIAL"; + case PEER_MSG_CTRL_RESYNCCONFIRM: + return "RESYNCCONFIRM"; + case PEER_MSG_CTRL_HEARTBEAT: + return "HEARTBEAT"; + default: + return "???"; + } +} + +#define TRACE_SOURCE &trace_peers +INITCALL1(STG_REGISTER, trace_register_source, TRACE_SOURCE); + +static void peers_trace(enum trace_level level, uint64_t mask, + const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4) +{ + if (mask & (PEERS_EV_UPDTMSG|PEERS_EV_ACKMSG|PEERS_EV_SWTCMSG)) { + if (a2) { + const struct peer *peer = a2; + + chunk_appendf(&trace_buf, " peer=%s", peer->id); + } + if (a3) { + const char *p = a3; + + chunk_appendf(&trace_buf, " @%p", p); + } + if (a4) { + const size_t *val = a4; + + chunk_appendf(&trace_buf, " %llu", (unsigned long long)*val); + } + } + + if (mask & PEERS_EV_DEFMSG) { + if (a2) { + const struct peer *peer = a2; + + chunk_appendf(&trace_buf, " peer=%s", peer->id); + } + if (a3) { + const char *p = a3; + + chunk_appendf(&trace_buf, " @%p", p); + } + if (a4) { + const int *val = a4; + + chunk_appendf(&trace_buf, " %d", *val); + } + } + + if (mask & PEERS_EV_CTRLMSG) { + if (a2) { + const unsigned char *ctrl_msg_type = a2; + + chunk_appendf(&trace_buf, " %s", ctrl_msg_type_str(*ctrl_msg_type)); + + } + if (a3) { + const char *local_peer = a3; + + chunk_appendf(&trace_buf, " %s", local_peer); + } + + if (a4) { + const char *remote_peer = a4; + + chunk_appendf(&trace_buf, " -> %s", remote_peer); + } + } + + if (mask & (PEERS_EV_SESSREL|PEERS_EV_PROTOERR)) { + if (a2) { + const struct peer *peer = a2; + struct peers *peers = NULL; + + if (peer->appctx) + peers = peer->peers; + + if (peers) + chunk_appendf(&trace_buf, " %s", peers->local->id); + chunk_appendf(&trace_buf, " -> %s", peer->id); + } + + if (a3) { + const int *prev_state = a3; + + chunk_appendf(&trace_buf, " prev_state=%d\n", *prev_state); + } + } +} + +static const char *statuscode_str(int statuscode) +{ + switch (statuscode) { + case PEER_SESS_SC_CONNECTCODE: + return "CONN"; + case PEER_SESS_SC_CONNECTEDCODE: + return "HSHK"; + case PEER_SESS_SC_SUCCESSCODE: + return "ESTA"; + case PEER_SESS_SC_TRYAGAIN: + return "RETR"; + case PEER_SESS_SC_ERRPROTO: + return "PROT"; + case PEER_SESS_SC_ERRVERSION: + return "VERS"; + case PEER_SESS_SC_ERRHOST: + return "NAME"; + case PEER_SESS_SC_ERRPEER: + return "UNKN"; + default: + return "NONE"; + } +} + +/* This function encode an uint64 to 'dynamic' length format. + The encoded value is written at address *str, and the + caller must assure that size after *str is large enough. + At return, the *str is set at the next Byte after then + encoded integer. 
The function returns the length of the
+   encoded integer in bytes */
+int intencode(uint64_t i, char **str) {
+	int idx = 0;
+	unsigned char *msg;
+
+	msg = (unsigned char *)*str;
+	if (i < PEER_ENC_2BYTES_MIN) {
+		msg[0] = (unsigned char)i;
+		*str = (char *)&msg[idx+1];
+		return (idx+1);
+	}
+
+	msg[idx] = (unsigned char)i | PEER_ENC_2BYTES_MIN;
+	i = (i - PEER_ENC_2BYTES_MIN) >> PEER_ENC_2BYTES_MIN_BITS;
+	while (i >= PEER_ENC_STOP_BYTE) {
+		msg[++idx] = (unsigned char)i | PEER_ENC_STOP_BYTE;
+		i = (i - PEER_ENC_STOP_BYTE) >> PEER_ENC_STOP_BIT;
+	}
+	msg[++idx] = (unsigned char)i;
+	*str = (char *)&msg[idx+1];
+	return (idx+1);
+}
+
+
+/* This function returns a decoded 64bits unsigned integer
+ * from a varint
+ *
+ * Calling:
+ * - *str must point to the first byte of the buffer to decode.
+ * - end must point to the next byte after the end of the buffer
+ *   we are authorized to parse (buf + buflen)
+ *
+ * At return:
+ *
+ * On success, *str will point to the byte following
+ * the fully decoded integer in the buffer, and
+ * the decoded value is returned.
+ *
+ * If end is reached before the integer was fully decoded,
+ * *str is set to NULL and the caller has to check this
+ * to know there is a decoding error. In this case
+ * the returned integer is also forced to 0
+ */
+uint64_t intdecode(char **str, char *end)
+{
+	unsigned char *msg;
+	uint64_t i;
+	int shift;
+
+	if (!*str)
+		return 0;
+
+	msg = (unsigned char *)*str;
+	if (msg >= (unsigned char *)end)
+		goto fail;
+
+	i = *(msg++);
+	if (i >= PEER_ENC_2BYTES_MIN) {
+		shift = PEER_ENC_2BYTES_MIN_BITS;
+		do {
+			if (msg >= (unsigned char *)end)
+				goto fail;
+			i += (uint64_t)*msg << shift;
+			shift += PEER_ENC_STOP_BIT;
+		} while (*(msg++) >= PEER_ENC_STOP_BYTE);
+	}
+	*str = (char *)msg;
+	return i;
+
+ fail:
+	*str = NULL;
+	return 0;
+}
+
+/*
+ * Build a "hello" peer protocol message.
+ * Return the number of bytes written to build this message if it succeeded,
+ * 0 otherwise.
+ */
+static int peer_prepare_hellomsg(char *msg, size_t size, struct peer_prep_params *p)
+{
+	int min_ver, ret;
+	struct peer *peer;
+
+	peer = p->hello.peer;
+	min_ver = (peer->flags & PEER_F_DWNGRD) ? PEER_DWNGRD_MINOR_VER : PEER_MINOR_VER;
+	/* Prepare headers */
+	ret = snprintf(msg, size, PEER_SESSION_PROTO_NAME " %d.%d\n%s\n%s %d %d\n",
+	               (int)PEER_MAJOR_VER, min_ver, peer->id, localpeer, (int)getpid(), (int)1);
+	if (ret >= size)
+		return 0;
+
+	return ret;
+}
+
+/*
+ * Build a "handshake succeeded" status message.
+ * Return the number of bytes written to build this message if it succeeded,
+ * 0 otherwise.
+ */
+static int peer_prepare_status_successmsg(char *msg, size_t size, struct peer_prep_params *p)
+{
+	int ret;
+
+	ret = snprintf(msg, size, "%d\n", (int)PEER_SESS_SC_SUCCESSCODE);
+	if (ret >= size)
+		return 0;
+
+	return ret;
+}
+
+/*
+ * Build an error status message.
+ * Return the number of bytes written to build this message if it succeeded,
+ * 0 otherwise.
+ */
+static int peer_prepare_status_errormsg(char *msg, size_t size, struct peer_prep_params *p)
+{
+	int ret;
+	unsigned int st1;
+
+	st1 = p->error_status.st1;
+	ret = snprintf(msg, size, "%u\n", st1);
+	if (ret >= size)
+		return 0;
+
+	return ret;
+}
+
+/* Set the stick-table UPDATE message type byte at <msg_type> address,
+ * depending on <use_identifier> and <use_timed> boolean parameters.
+ * Always successful.
+ */ +static inline void peer_set_update_msg_type(char *msg_type, int use_identifier, int use_timed) +{ + if (use_timed) { + if (use_identifier) + *msg_type = PEER_MSG_STKT_UPDATE_TIMED; + else + *msg_type = PEER_MSG_STKT_INCUPDATE_TIMED; + } + else { + if (use_identifier) + *msg_type = PEER_MSG_STKT_UPDATE; + else + *msg_type = PEER_MSG_STKT_INCUPDATE; + } +} +/* + * This prepare the data update message on the stick session <ts>, <st> is the considered + * stick table. + * <msg> is a buffer of <size> to receive data message content + * If function returns 0, the caller should consider we were unable to encode this message (TODO: + * check size) + */ +static int peer_prepare_updatemsg(char *msg, size_t size, struct peer_prep_params *p) +{ + uint32_t netinteger; + unsigned short datalen; + char *cursor, *datamsg; + unsigned int data_type; + void *data_ptr; + struct stksess *ts; + struct shared_table *st; + unsigned int updateid; + int use_identifier; + int use_timed; + struct peer *peer; + + ts = p->updt.stksess; + st = p->updt.shared_table; + updateid = p->updt.updateid; + use_identifier = p->updt.use_identifier; + use_timed = p->updt.use_timed; + peer = p->updt.peer; + + cursor = datamsg = msg + PEER_MSG_HEADER_LEN + PEER_MSG_ENC_LENGTH_MAXLEN; + + /* construct message */ + + /* check if we need to send the update identifier */ + if (!st->last_pushed || updateid < st->last_pushed || ((updateid - st->last_pushed) != 1)) { + use_identifier = 1; + } + + /* encode update identifier if needed */ + if (use_identifier) { + netinteger = htonl(updateid); + memcpy(cursor, &netinteger, sizeof(netinteger)); + cursor += sizeof(netinteger); + } + + if (use_timed) { + netinteger = htonl(tick_remain(now_ms, ts->expire)); + memcpy(cursor, &netinteger, sizeof(netinteger)); + cursor += sizeof(netinteger); + } + + /* encode the key */ + if (st->table->type == SMP_T_STR) { + int stlen = strlen((char *)ts->key.key); + + intencode(stlen, &cursor); + memcpy(cursor, ts->key.key, stlen); + cursor += stlen; + } + else if (st->table->type == SMP_T_SINT) { + netinteger = htonl(read_u32(ts->key.key)); + memcpy(cursor, &netinteger, sizeof(netinteger)); + cursor += sizeof(netinteger); + } + else { + memcpy(cursor, ts->key.key, st->table->key_size); + cursor += st->table->key_size; + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &ts->lock); + /* encode values */ + for (data_type = 0 ; data_type < STKTABLE_DATA_TYPES ; data_type++) { + + data_ptr = stktable_data_ptr(st->table, ts, data_type); + if (data_ptr) { + /* in case of array all elements use + * the same std_type and they are linearly + * encoded. 
+ */ + if (stktable_data_types[data_type].is_array) { + unsigned int idx = 0; + + switch (stktable_data_types[data_type].std_type) { + case STD_T_SINT: { + int data; + + do { + data = stktable_data_cast(data_ptr, std_t_sint); + intencode(data, &cursor); + + data_ptr = stktable_data_ptr_idx(st->table, ts, data_type, ++idx); + } while(data_ptr); + break; + } + case STD_T_UINT: { + unsigned int data; + + do { + data = stktable_data_cast(data_ptr, std_t_uint); + intencode(data, &cursor); + + data_ptr = stktable_data_ptr_idx(st->table, ts, data_type, ++idx); + } while(data_ptr); + break; + } + case STD_T_ULL: { + unsigned long long data; + + do { + data = stktable_data_cast(data_ptr, std_t_ull); + intencode(data, &cursor); + + data_ptr = stktable_data_ptr_idx(st->table, ts, data_type, ++idx); + } while(data_ptr); + break; + } + case STD_T_FRQP: { + struct freq_ctr *frqp; + + do { + frqp = &stktable_data_cast(data_ptr, std_t_frqp); + intencode((unsigned int)(now_ms - frqp->curr_tick), &cursor); + intencode(frqp->curr_ctr, &cursor); + intencode(frqp->prev_ctr, &cursor); + + data_ptr = stktable_data_ptr_idx(st->table, ts, data_type, ++idx); + } while(data_ptr); + break; + } + } + + /* array elements fully encoded + * proceed next data_type. + */ + continue; + } + switch (stktable_data_types[data_type].std_type) { + case STD_T_SINT: { + int data; + + data = stktable_data_cast(data_ptr, std_t_sint); + intencode(data, &cursor); + break; + } + case STD_T_UINT: { + unsigned int data; + + data = stktable_data_cast(data_ptr, std_t_uint); + intencode(data, &cursor); + break; + } + case STD_T_ULL: { + unsigned long long data; + + data = stktable_data_cast(data_ptr, std_t_ull); + intencode(data, &cursor); + break; + } + case STD_T_FRQP: { + struct freq_ctr *frqp; + + frqp = &stktable_data_cast(data_ptr, std_t_frqp); + intencode((unsigned int)(now_ms - frqp->curr_tick), &cursor); + intencode(frqp->curr_ctr, &cursor); + intencode(frqp->prev_ctr, &cursor); + break; + } + case STD_T_DICT: { + struct dict_entry *de; + struct ebpt_node *cached_de; + struct dcache_tx_entry cde = { }; + char *beg, *end; + size_t value_len, data_len; + struct dcache *dc; + + de = stktable_data_cast(data_ptr, std_t_dict); + if (!de) { + /* No entry */ + intencode(0, &cursor); + break; + } + + dc = peer->dcache; + cde.entry.key = de; + cached_de = dcache_tx_insert(dc, &cde); + if (cached_de == &cde.entry) { + if (cde.id + 1 >= PEER_ENC_2BYTES_MIN) + break; + /* Encode the length of the remaining data -> 1 */ + intencode(1, &cursor); + /* Encode the cache entry ID */ + intencode(cde.id + 1, &cursor); + } + else { + /* Leave enough room to encode the remaining data length. 
*/
+				end = beg = cursor + PEER_MSG_ENC_LENGTH_MAXLEN;
+				/* Encode the dictionary entry key */
+				intencode(cde.id + 1, &end);
+				/* Encode the length of the dictionary entry data */
+				value_len = de->len;
+				intencode(value_len, &end);
+				/* Copy the data */
+				memcpy(end, de->value.key, value_len);
+				end += value_len;
+				/* Encode the length of the data */
+				data_len = end - beg;
+				intencode(data_len, &cursor);
+				memmove(cursor, beg, data_len);
+				cursor += data_len;
+			}
+			break;
+		}
+		}
+		}
+	}
+	HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &ts->lock);
+
+	/* Compute datalen */
+	datalen = (cursor - datamsg);
+
+	/* prepare message header */
+	msg[0] = PEER_MSG_CLASS_STICKTABLE;
+	peer_set_update_msg_type(&msg[1], use_identifier, use_timed);
+	cursor = &msg[2];
+	intencode(datalen, &cursor);
+
+	/* move data after header */
+	memmove(cursor, datamsg, datalen);
+
+	/* return header size + data_len */
+	return (cursor - msg) + datalen;
+}
+
+/*
+ * This prepares the switch table message for the targeted shared table <st>.
+ * <msg> is a buffer of <size> to receive the data message content.
+ * If the function returns 0, the caller should consider we were unable to
+ * encode this message (TODO: check size).
+ */
+static int peer_prepare_switchmsg(char *msg, size_t size, struct peer_prep_params *params)
+{
+	int len;
+	unsigned short datalen;
+	struct buffer *chunk;
+	char *cursor, *datamsg, *chunkp, *chunkq;
+	uint64_t data = 0;
+	unsigned int data_type;
+	struct shared_table *st;
+
+	st = params->swtch.shared_table;
+	cursor = datamsg = msg + PEER_MSG_HEADER_LEN + PEER_MSG_ENC_LENGTH_MAXLEN;
+
+	/* Encode data */
+
+	/* encode local id */
+	intencode(st->local_id, &cursor);
+
+	/* encode table name */
+	len = strlen(st->table->nid);
+	intencode(len, &cursor);
+	memcpy(cursor, st->table->nid, len);
+	cursor += len;
+
+	/* encode table type */
+
+	intencode(peer_net_key_type[st->table->type], &cursor);
+
+	/* encode table key size */
+	intencode(st->table->key_size, &cursor);
+
+	chunk = get_trash_chunk();
+	chunkp = chunkq = chunk->area;
+	/* encode available known data types in table */
+	for (data_type = 0 ; data_type < STKTABLE_DATA_TYPES ; data_type++) {
+		if (st->table->data_ofs[data_type]) {
+			/* stored data type parameters are all linearly encoded
+			 * at the end of the 'table definition' message.
+			 *
+			 * Currently only array data_types and data_types using
+			 * the freq_counter base type have parameters:
+			 *
+			 * - an array always has at least one parameter, set to
+			 * the number of elements.
+			 *
+			 * - an array of base-type freq_counters has an additional
+			 * parameter set to the period used to compute those
+			 * freq_counters.
+			 *
+			 * - a simple freq counter has a parameter set to the
+			 * period used to compute it.
+			 *
+			 * A set of parameters for a datatype MUST be prefixed
+			 * by the data-type id itself:
+			 * This is redundant because the data_types are ordered and
+			 * the data_type bitfield already gives the information of
+			 * stored types, but it was designed this way when the
+			 * push of the period parameter was added for freq counters
+			 * and we don't want to break the compatibility.
+			 */
+			if (stktable_data_types[data_type].is_array) {
+				/* This is an array type so we first encode
+				 * the data_type itself to prefix parameters
+				 */
+				intencode(data_type, &chunkq);
+
+				/* We encode the first parameter which is
+				 * the number of elements of this array
+				 */
+				intencode(st->table->data_nbelem[data_type], &chunkq);
+
+				/* for an array of freq counters, there is an
+				 * additional period parameter to encode
+				 */
+				if (stktable_data_types[data_type].std_type == STD_T_FRQP)
+					intencode(st->table->data_arg[data_type].u, &chunkq);
+			}
+			else if (stktable_data_types[data_type].std_type == STD_T_FRQP) {
+				/* this datatype is a simple freq counter not part
+				 * of an array. We encode the data_type itself
+				 * to prefix the 'period' parameter
+				 */
+				intencode(data_type, &chunkq);
+				intencode(st->table->data_arg[data_type].u, &chunkq);
+			}
+			/* set the bit corresponding to stored data type */
+			data |= 1ULL << data_type;
+		}
+	}
+	intencode(data, &cursor);
+
+	/* Encode stick-table entries duration. */
+	intencode(st->table->expire, &cursor);
+
+	if (chunkq > chunkp) {
+		chunk->data = chunkq - chunkp;
+		memcpy(cursor, chunk->area, chunk->data);
+		cursor += chunk->data;
+	}
+
+	/* Compute datalen */
+	datalen = (cursor - datamsg);
+
+	/* prepare message header */
+	msg[0] = PEER_MSG_CLASS_STICKTABLE;
+	msg[1] = PEER_MSG_STKT_DEFINE;
+	cursor = &msg[2];
+	intencode(datalen, &cursor);
+
+	/* move data after header */
+	memmove(cursor, datamsg, datalen);
+
+	/* return header size + data_len */
+	return (cursor - msg) + datalen;
+}
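+/* Editor's illustrative sketch, not upstream code: every stick-table message
+ * built in this file shares the same framing: one class byte, one type byte,
+ * then the payload length as a variable-length integer, then the payload. A
+ * hypothetical receiver-side completeness check of that framing could be:
+ */
+static inline int peer_msg_frame_complete(char *msg, char *end)
+{
+	char *p = msg + PEER_MSG_HEADER_LEN;   /* skip class + message type */
+	uint64_t datalen = intdecode(&p, end); /* varint payload length */
+
+	/* intdecode() sets <p> to NULL on a truncated length field */
+	return p && (uint64_t)(end - p) >= datalen; /* payload fully present */
+}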
+/*
+ * This prepares the switch table message for the targeted shared table <st>.
+ * <msg> is a buffer of <size> bytes to receive the data message content.
+ * If the function returns 0, the caller should consider we were unable to encode this message (TODO:
+ * check size)
+ */
+static int peer_prepare_switchmsg(char *msg, size_t size, struct peer_prep_params *params)
+{
+    int len;
+    unsigned short datalen;
+    struct buffer *chunk;
+    char *cursor, *datamsg, *chunkp, *chunkq;
+    uint64_t data = 0;
+    unsigned int data_type;
+    struct shared_table *st;
+
+    st = params->swtch.shared_table;
+    cursor = datamsg = msg + PEER_MSG_HEADER_LEN + PEER_MSG_ENC_LENGTH_MAXLEN;
+
+    /* Encode data */
+
+    /* encode local id */
+    intencode(st->local_id, &cursor);
+
+    /* encode table name */
+    len = strlen(st->table->nid);
+    intencode(len, &cursor);
+    memcpy(cursor, st->table->nid, len);
+    cursor += len;
+
+    /* encode table type */
+
+    intencode(peer_net_key_type[st->table->type], &cursor);
+
+    /* encode table key size */
+    intencode(st->table->key_size, &cursor);
+
+    chunk = get_trash_chunk();
+    chunkp = chunkq = chunk->area;
+    /* encode available known data types in table */
+    for (data_type = 0 ; data_type < STKTABLE_DATA_TYPES ; data_type++) {
+        if (st->table->data_ofs[data_type]) {
+            /* stored data types parameters are all linearly encoded
+             * at the end of the 'table definition' message.
+             *
+             * Currently only array data_types and data_types
+             * using the freq_counter base type have parameters:
+             *
+             * - an array always has at least one parameter set to the
+             *   number of elements.
+             *
+             * - an array of base-type freq_counters has an additional
+             *   parameter set to the period used to compute those
+             *   freq_counters.
+             *
+             * - a simple freq counter has a parameter set to the period
+             *   used to compute it.
+             *
+             * A set of parameters for a data type MUST BE prefixed
+             * by the data-type id itself:
+             * This is useless because the data_types are ordered and
+             * the data_type bitfield already gives the information of
+             * stored types, but it was designed this way when the
+             * push of the period parameter was added for freq counters
+             * and we don't want to break the compatibility.
+             *
+             */
+            if (stktable_data_types[data_type].is_array) {
+                /* This is an array type so we first encode
+                 * the data_type itself to prefix parameters
+                 */
+                intencode(data_type, &chunkq);
+
+                /* We encode the first parameter which is
+                 * the number of elements of this array
+                 */
+                intencode(st->table->data_nbelem[data_type], &chunkq);
+
+                /* for an array of freq counters, there is an additional
+                 * period parameter to encode
+                 */
+                if (stktable_data_types[data_type].std_type == STD_T_FRQP)
+                    intencode(st->table->data_arg[data_type].u, &chunkq);
+            }
+            else if (stktable_data_types[data_type].std_type == STD_T_FRQP) {
+                /* this datatype is a simple freq counter not part
+                 * of an array. We encode the data_type itself
+                 * to prefix the 'period' parameter
+                 */
+                intencode(data_type, &chunkq);
+                intencode(st->table->data_arg[data_type].u, &chunkq);
+            }
+            /* set the bit corresponding to stored data type */
+            data |= 1ULL << data_type;
+        }
+    }
+    intencode(data, &cursor);
+
+    /* Encode stick-table entries duration. */
+    intencode(st->table->expire, &cursor);
+
+    if (chunkq > chunkp) {
+        chunk->data = chunkq - chunkp;
+        memcpy(cursor, chunk->area, chunk->data);
+        cursor += chunk->data;
+    }
+
+    /* Compute datalen */
+    datalen = (cursor - datamsg);
+
+    /* prepare message header */
+    msg[0] = PEER_MSG_CLASS_STICKTABLE;
+    msg[1] = PEER_MSG_STKT_DEFINE;
+    cursor = &msg[2];
+    intencode(datalen, &cursor);
+
+    /* move data after header */
+    memmove(cursor, datamsg, datalen);
+
+    /* return header size + data_len */
+    return (cursor - msg) + datalen;
+}
+
+/*
+ * This prepares the acknowledgement message on the stick session <ts>, <st> being
+ * the considered stick table.
+ * <msg> is a buffer of <size> bytes to receive the data message content.
+ * If the function returns 0, the caller should consider we were unable to encode this message (TODO:
+ * check size)
+ */
+static int peer_prepare_ackmsg(char *msg, size_t size, struct peer_prep_params *p)
+{
+    unsigned short datalen;
+    char *cursor, *datamsg;
+    uint32_t netinteger;
+    struct shared_table *st;
+
+    cursor = datamsg = msg + PEER_MSG_HEADER_LEN + PEER_MSG_ENC_LENGTH_MAXLEN;
+
+    st = p->ack.shared_table;
+    intencode(st->remote_id, &cursor);
+    netinteger = htonl(st->last_get);
+    memcpy(cursor, &netinteger, sizeof(netinteger));
+    cursor += sizeof(netinteger);
+
+    /* Compute datalen */
+    datalen = (cursor - datamsg);
+
+    /* prepare message header */
+    msg[0] = PEER_MSG_CLASS_STICKTABLE;
+    msg[1] = PEER_MSG_STKT_ACK;
+    cursor = &msg[2];
+    intencode(datalen, &cursor);
+
+    /* move data after header */
+    memmove(cursor, datamsg, datalen);
+
+    /* return header size + data_len */
+    return (cursor - msg) + datalen;
+}
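peer_prepare_ackmsg() above emits a varint table id followed by a fixed 32-bit update id in network byte order. Below is a hedged sketch of the matching decode, reusing the illustrative demo_* varint helpers from the earlier sketch (again, not HAProxy's real intdecode()):

#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

/* Illustrative counterpart of demo_intencode() above. Returns bytes
 * consumed, 0 on truncated input. Not HAProxy's real intdecode().
 */
static size_t demo_intdecode(const unsigned char *in, size_t len, uint64_t *v)
{
    uint64_t out = 0;
    unsigned shift = 0;
    size_t n = 0;

    while (n < len) {
        out |= (uint64_t)(in[n] & 0x7f) << shift;
        if (!(in[n++] & 0x80)) {
            *v = out;
            return n;
        }
        shift += 7;
    }
    return 0;
}

/* Parse an ack payload: [varint table id][uint32 update id, big endian] */
static int demo_parse_ack(const unsigned char *buf, size_t len,
                          uint64_t *table_id, uint32_t *update)
{
    size_t n = demo_intdecode(buf, len, table_id);

    if (!n || len - n < sizeof(*update))
        return 0;
    memcpy(update, buf + n, sizeof(*update));
    *update = ntohl(*update);
    return 1;
}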
+/*
+ * Function to deinit a connected peer
+ */
+void __peer_session_deinit(struct peer *peer)
+{
+    struct peers *peers = peer->peers;
+    int thr;
+
+    if (!peers || !peer->appctx)
+        return;
+
+    thr = peer->appctx->t->tid;
+    HA_ATOMIC_DEC(&peers->applet_count[thr]);
+
+    if (peer->appctx->st0 == PEER_SESS_ST_WAITMSG)
+        HA_ATOMIC_DEC(&connected_peers);
+
+    HA_ATOMIC_DEC(&active_peers);
+
+    flush_dcache(peer);
+
+    /* Re-init current table pointers to force announcement on re-connect */
+    peer->remote_table = peer->last_local_table = peer->stop_local_table = NULL;
+    peer->appctx = NULL;
+    if (peer->flags & PEER_F_LEARN_ASSIGN) {
+        /* unassign current peer for learning */
+        peer->flags &= ~(PEER_F_LEARN_ASSIGN);
+        peers->flags &= ~(PEERS_F_RESYNC_ASSIGN|PEERS_F_RESYNC_PROCESS);
+
+        if (peer->local)
+            peers->flags |= PEERS_F_RESYNC_LOCALABORT;
+        else
+            peers->flags |= PEERS_F_RESYNC_REMOTEABORT;
+        /* reschedule a resync */
+        peers->resync_timeout = tick_add(now_ms, MS_TO_TICKS(5000));
+    }
+    /* reset teaching and learning flags to 0 */
+    peer->flags &= PEER_TEACH_RESET;
+    peer->flags &= PEER_LEARN_RESET;
+    task_wakeup(peers->sync_task, TASK_WOKEN_MSG);
+}
+
+static int peer_session_init(struct appctx *appctx)
+{
+    struct peer *peer = appctx->svcctx;
+    struct stream *s;
+    struct sockaddr_storage *addr = NULL;
+
+    if (!sockaddr_alloc(&addr, &peer->addr, sizeof(peer->addr)))
+        goto out_error;
+
+    if (appctx_finalize_startup(appctx, peer->peers->peers_fe, &BUF_NULL) == -1)
+        goto out_free_addr;
+
+    s = appctx_strm(appctx);
+    /* applet is waiting for data */
+    applet_need_more_data(appctx);
+    appctx_wakeup(appctx);
+
+    /* initiate an outgoing connection */
+    s->scb->dst = addr;
+    s->scb->flags |= (SC_FL_RCV_ONCE|SC_FL_NOLINGER);
+    s->flags = SF_ASSIGNED;
+    s->target = peer_session_target(peer, s);
+
+    s->do_log = NULL;
+    s->uniq_id = 0;
+
+    _HA_ATOMIC_INC(&active_peers);
+    return 0;
+
+ out_free_addr:
+    sockaddr_free(&addr);
+ out_error:
+    return -1;
+}
+
+/*
+ * Callback to release a session with a peer
+ */
+static void peer_session_release(struct appctx *appctx)
+{
+    struct peer *peer = appctx->svcctx;
+
+    TRACE_PROTO("releasing peer session", PEERS_EV_SESSREL, NULL, peer);
+    /* appctx->svcctx is not a peer session */
+    if (appctx->st0 < PEER_SESS_ST_SENDSUCCESS)
+        return;
+
+    /* peer session identified */
+    if (peer) {
+        HA_SPIN_LOCK(PEER_LOCK, &peer->lock);
+        if (peer->appctx == appctx)
+            __peer_session_deinit(peer);
+        peer->flags &= ~PEER_F_ALIVE;
+        HA_SPIN_UNLOCK(PEER_LOCK, &peer->lock);
+    }
+}
+
+/* Retrieve the major and minor versions of the peers protocol
+ * announced by a remote peer. <str> is a null-terminated
+ * string with the following format: "<maj_ver>.<min_ver>".
+ */
+static int peer_get_version(const char *str,
+                            unsigned int *maj_ver, unsigned int *min_ver)
+{
+    unsigned int majv, minv;
+    const char *pos, *saved;
+    const char *end;
+
+    saved = pos = str;
+    end = str + strlen(str);
+
+    majv = read_uint(&pos, end);
+    if (saved == pos || *pos++ != '.')
+        return -1;
+
+    saved = pos;
+    minv = read_uint(&pos, end);
+    if (saved == pos || pos != end)
+        return -1;
+
+    *maj_ver = majv;
+    *min_ver = minv;
+
+    return 0;
+}
+
+/*
+ * Parse a line terminated by an optional '\r' character, followed by a mandatory
+ * '\n' character.
+ * Returns the number of read bytes if it succeeded, 0 if a '\n' character could
+ * not be found, and -1 if a line could not be read because the communication
+ * channel is closed.
+ */
+static inline int peer_getline(struct appctx *appctx)
+{
+    struct stconn *sc = appctx_sc(appctx);
+    int n;
+
+    n = co_getline(sc_oc(sc), trash.area, trash.size);
+    if (!n)
+        return 0;
+
+    if (n < 0 || trash.area[n - 1] != '\n') {
+        appctx->st0 = PEER_SESS_ST_END;
+        return -1;
+    }
+
+    if (n > 1 && (trash.area[n - 2] == '\r'))
+        trash.area[n - 2] = 0;
+    else
+        trash.area[n - 1] = 0;
+
+    co_skip(sc_oc(sc), n);
+
+    return n;
+}
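peer_get_version() above insists on exactly "<maj_ver>.<min_ver>" with nothing trailing. The same contract can be illustrated with strtoul() in a stand-alone sketch; this is not the code HAProxy uses, and strtoul() is slightly laxer than read_uint() since it tolerates leading spaces and signs:

#include <stdlib.h>

/* Parse "maj.min" strictly: digits, one dot, digits, end of string.
 * Returns 0 on success, -1 on malformed input (same contract as
 * peer_get_version() above).
 */
static int demo_get_version(const char *str, unsigned *maj, unsigned *min)
{
    char *end;
    unsigned long v;

    v = strtoul(str, &end, 10);
    if (end == str || *end != '.')
        return -1;
    *maj = v;

    str = end + 1;
    v = strtoul(str, &end, 10);
    if (end == str || *end != '\0')
        return -1;
    *min = v;
    return 0;
}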
+/*
+ * Send a message after having called <peer_prepare_msg> to build it.
+ * Returns 0 if the message could not be built; in this case the appctx st0
+ * is set to the PEER_SESS_ST_END value.
+ * Returns -1 if there was not enough room left to send the message;
+ * any other negative returned value must be considered as an error, with the
+ * appctx st0 returned value equal to PEER_SESS_ST_END.
+ */
+static inline int peer_send_msg(struct appctx *appctx,
+                                int (*peer_prepare_msg)(char *, size_t, struct peer_prep_params *),
+                                struct peer_prep_params *params)
+{
+    int ret, msglen;
+
+    msglen = peer_prepare_msg(trash.area, trash.size, params);
+    if (!msglen) {
+        /* internal error: message does not fit in trash */
+        appctx->st0 = PEER_SESS_ST_END;
+        return 0;
+    }
+
+    /* message to buffer */
+    ret = applet_putblk(appctx, trash.area, msglen);
+    if (ret <= 0) {
+        if (ret != -1)
+            appctx->st0 = PEER_SESS_ST_END;
+    }
+
+    return ret;
+}
+
+/*
+ * Send a hello message.
+ * Returns 0 if the message could not be built; in this case the appctx st0
+ * is set to the PEER_SESS_ST_END value.
+ * Returns -1 if there was not enough room left to send the message;
+ * any other negative returned value must be considered as an error, with the
+ * appctx st0 returned value equal to PEER_SESS_ST_END.
+ */
+static inline int peer_send_hellomsg(struct appctx *appctx, struct peer *peer)
+{
+    struct peer_prep_params p = {
+        .hello.peer = peer,
+    };
+
+    return peer_send_msg(appctx, peer_prepare_hellomsg, &p);
+}
+
+/*
+ * Send a success peer handshake status message.
+ * Returns 0 if the message could not be built; in this case the appctx st0
+ * is set to the PEER_SESS_ST_END value.
+ * Returns -1 if there was not enough room left to send the message;
+ * any other negative returned value must be considered as an error, with the
+ * appctx st0 returned value equal to PEER_SESS_ST_END.
+ */
+static inline int peer_send_status_successmsg(struct appctx *appctx)
+{
+    return peer_send_msg(appctx, peer_prepare_status_successmsg, NULL);
+}
+
+/*
+ * Send a peer handshake status error message.
+ * Returns 0 if the message could not be built; in this case the appctx st0
+ * is set to the PEER_SESS_ST_END value.
+ * Returns -1 if there was not enough room left to send the message;
+ * any other negative returned value must be considered as an error, with the
+ * appctx st0 returned value equal to PEER_SESS_ST_END.
+ */
+static inline int peer_send_status_errormsg(struct appctx *appctx)
+{
+    struct peer_prep_params p = {
+        .error_status.st1 = appctx->st1,
+    };
+
+    return peer_send_msg(appctx, peer_prepare_status_errormsg, &p);
+}
+
+/*
+ * Send a stick-table switch message.
+ * Returns 0 if the message could not be built; in this case the appctx st0
+ * is set to the PEER_SESS_ST_END value.
+ * Returns -1 if there was not enough room left to send the message;
+ * any other negative returned value must be considered as an error, with the
+ * appctx st0 returned value equal to PEER_SESS_ST_END.
+ */
+static inline int peer_send_switchmsg(struct shared_table *st, struct appctx *appctx)
+{
+    struct peer_prep_params p = {
+        .swtch.shared_table = st,
+    };
+
+    return peer_send_msg(appctx, peer_prepare_switchmsg, &p);
+}
+
+/*
+ * Send a stick-table update acknowledgement message.
+ * Returns 0 if the message could not be built; in this case the appctx st0
+ * is set to the PEER_SESS_ST_END value.
+ * Returns -1 if there was not enough room left to send the message;
+ * any other negative returned value must be considered as an error, with the
+ * appctx st0 returned value equal to PEER_SESS_ST_END.
+ */
+static inline int peer_send_ackmsg(struct shared_table *st, struct appctx *appctx)
+{
+    struct peer_prep_params p = {
+        .ack.shared_table = st,
+    };
+
+    return peer_send_msg(appctx, peer_prepare_ackmsg, &p);
+}
+
+/*
+ * Send a stick-table update message.
+ * Returns 0 if the message could not be built; in this case the appctx st0
+ * is set to the PEER_SESS_ST_END value.
+ * Returns -1 if there was not enough room left to send the message;
+ * any other negative returned value must be considered as an error, with the
+ * appctx st0 returned value equal to PEER_SESS_ST_END.
+ */
+static inline int peer_send_updatemsg(struct shared_table *st, struct appctx *appctx, struct stksess *ts,
+                                      unsigned int updateid, int use_identifier, int use_timed)
+{
+    struct peer_prep_params p = {
+        .updt = {
+            .stksess = ts,
+            .shared_table = st,
+            .updateid = updateid,
+            .use_identifier = use_identifier,
+            .use_timed = use_timed,
+            .peer = appctx->svcctx,
+        },
+    };
+
+    return peer_send_msg(appctx, peer_prepare_updatemsg, &p);
+}
+
+/*
+ * Build a peer protocol control class message.
+ * Returns the number of bytes written to build the message if it succeeded,
+ * 0 if not.
+ */
+static int peer_prepare_control_msg(char *msg, size_t size, struct peer_prep_params *p)
+{
+    if (size < sizeof p->control.head)
+        return 0;
+
+    msg[0] = p->control.head[0];
+    msg[1] = p->control.head[1];
+
+    return 2;
+}
+
+/*
+ * Send a stick-table synchronization request message.
+ * Returns 0 if the message could not be built; in this case the appctx st0
+ * is set to the PEER_SESS_ST_END value.
+ * Returns -1 if there was not enough room left to send the message;
+ * any other negative returned value must be considered as an error, with the
+ * appctx st0 returned value equal to PEER_SESS_ST_END.
+ */
+static inline int peer_send_resync_reqmsg(struct appctx *appctx,
+                                          struct peer *peer, struct peers *peers)
+{
+    struct peer_prep_params p = {
+        .control.head = { PEER_MSG_CLASS_CONTROL, PEER_MSG_CTRL_RESYNCREQ, },
+    };
+
+    TRACE_PROTO("send control message", PEERS_EV_CTRLMSG,
+                NULL, &p.control.head[1], peers->local->id, peer->id);
+
+    return peer_send_msg(appctx, peer_prepare_control_msg, &p);
+}
+
+/*
+ * Send a stick-table synchronization confirmation message.
+ * Returns 0 if the message could not be built; in this case the appctx st0
+ * is set to the PEER_SESS_ST_END value.
+ * Returns -1 if there was not enough room left to send the message;
+ * any other negative returned value must be considered as an error, with the
+ * appctx st0 returned value equal to PEER_SESS_ST_END.
+ */
+static inline int peer_send_resync_confirmsg(struct appctx *appctx,
+                                             struct peer *peer, struct peers *peers)
+{
+    struct peer_prep_params p = {
+        .control.head = { PEER_MSG_CLASS_CONTROL, PEER_MSG_CTRL_RESYNCCONFIRM, },
+    };
+
+    TRACE_PROTO("send control message", PEERS_EV_CTRLMSG,
+                NULL, &p.control.head[1], peers->local->id, peer->id);
+
+    return peer_send_msg(appctx, peer_prepare_control_msg, &p);
+}
+
+/*
+ * Send a stick-table synchronization finished message.
+ * Returns 0 if the message could not be built; in this case the appctx st0
+ * is set to the PEER_SESS_ST_END value.
+ * Returns -1 if there was not enough room left to send the message;
+ * any other negative returned value must be considered as an error, with the
+ * appctx st0 returned value equal to PEER_SESS_ST_END.
+ */
+static inline int peer_send_resync_finishedmsg(struct appctx *appctx,
+                                               struct peer *peer, struct peers *peers)
+{
+    struct peer_prep_params p = {
+        .control.head = { PEER_MSG_CLASS_CONTROL, },
+    };
+
+    p.control.head[1] = (peers->flags & PEERS_RESYNC_STATEMASK) == PEERS_RESYNC_FINISHED ?
+        PEER_MSG_CTRL_RESYNCFINISHED : PEER_MSG_CTRL_RESYNCPARTIAL;
+
+    TRACE_PROTO("send control message", PEERS_EV_CTRLMSG,
+                NULL, &p.control.head[1], peers->local->id, peer->id);
+
+    return peer_send_msg(appctx, peer_prepare_control_msg, &p);
+}
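Control class and error class messages carry no payload at all: as peer_prepare_control_msg() and peer_prepare_error_msg() show, they are exactly two bytes, class then type. Here is a tiny illustration of how such a frame can be classified on receipt; the enum values are invented for the demo, the real ones being defined by the peers protocol headers:

/* Illustrative 2-byte control frame: [class][type]. The numeric
 * values below are made up for the demo.
 */
enum demo_msg_class { DEMO_CLASS_CONTROL = 0, DEMO_CLASS_ERROR = 1 };
enum demo_ctrl_type {
    DEMO_CTRL_RESYNCREQ,
    DEMO_CTRL_RESYNCFINISHED,
    DEMO_CTRL_RESYNCPARTIAL,
    DEMO_CTRL_RESYNCCONFIRM,
    DEMO_CTRL_HEARTBEAT,
};

static const char *demo_ctrl_name(const unsigned char head[2])
{
    if (head[0] != DEMO_CLASS_CONTROL)
        return "not-a-control-message";
    switch (head[1]) {
    case DEMO_CTRL_RESYNCREQ:      return "resync request";
    case DEMO_CTRL_RESYNCFINISHED: return "resync finished";
    case DEMO_CTRL_RESYNCPARTIAL:  return "resync partial";
    case DEMO_CTRL_RESYNCCONFIRM:  return "resync confirm";
    case DEMO_CTRL_HEARTBEAT:      return "heartbeat";
    default:                       return "unknown";
    }
}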
+/*
+ * Send a heartbeat message.
+ * Returns 0 if the message could not be built; in this case the appctx st0
+ * is set to the PEER_SESS_ST_END value.
+ * Returns -1 if there was not enough room left to send the message;
+ * any other negative returned value must be considered as an error, with the
+ * appctx st0 returned value equal to PEER_SESS_ST_END.
+ */
+static inline int peer_send_heartbeatmsg(struct appctx *appctx,
+                                         struct peer *peer, struct peers *peers)
+{
+    struct peer_prep_params p = {
+        .control.head = { PEER_MSG_CLASS_CONTROL, PEER_MSG_CTRL_HEARTBEAT, },
+    };
+
+    TRACE_PROTO("send control message", PEERS_EV_CTRLMSG,
+                NULL, &p.control.head[1], peers->local->id, peer->id);
+
+    return peer_send_msg(appctx, peer_prepare_control_msg, &p);
+}
+
+/*
+ * Build a peer protocol error class message.
+ * Returns the number of bytes written to build the message if it succeeded,
+ * 0 if not.
+ */
+static int peer_prepare_error_msg(char *msg, size_t size, struct peer_prep_params *p)
+{
+    if (size < sizeof p->error.head)
+        return 0;
+
+    msg[0] = p->error.head[0];
+    msg[1] = p->error.head[1];
+
+    return 2;
+}
+
+/*
+ * Send a "size limit reached" error message.
+ * Returns 0 if the message could not be built; in this case the appctx st0
+ * is set to the PEER_SESS_ST_END value.
+ * Returns -1 if there was not enough room left to send the message;
+ * any other negative returned value must be considered as an error, with the
+ * appctx st0 returned value equal to PEER_SESS_ST_END.
+ */
+static inline int peer_send_error_size_limitmsg(struct appctx *appctx)
+{
+    struct peer_prep_params p = {
+        .error.head = { PEER_MSG_CLASS_ERROR, PEER_MSG_ERR_SIZELIMIT, },
+    };
+
+    return peer_send_msg(appctx, peer_prepare_error_msg, &p);
+}
+
+/*
+ * Send a "peer protocol" error message.
+ * Returns 0 if the message could not be built; in this case the appctx st0
+ * is set to the PEER_SESS_ST_END value.
+ * Returns -1 if there was not enough room left to send the message;
+ * any other negative returned value must be considered as an error, with the
+ * appctx st0 returned value equal to PEER_SESS_ST_END.
+ */
+static inline int peer_send_error_protomsg(struct appctx *appctx)
+{
+    struct peer_prep_params p = {
+        .error.head = { PEER_MSG_CLASS_ERROR, PEER_MSG_ERR_PROTOCOL, },
+    };
+
+    return peer_send_msg(appctx, peer_prepare_error_msg, &p);
+}
+
+/*
+ * Function used to look up recent stick-table updates associated with the
+ * <st> shared stick-table when a lesson must be taught to a peer
+ * (PEER_F_LEARN_ASSIGN flag set).
+ */
+static inline struct stksess *peer_teach_process_stksess_lookup(struct shared_table *st)
+{
+    struct eb32_node *eb;
+
+    eb = eb32_lookup_ge(&st->table->updates, st->last_pushed+1);
+    if (!eb) {
+        eb = eb32_first(&st->table->updates);
+        if (!eb || (eb->key == st->last_pushed)) {
+            st->table->commitupdate = st->last_pushed = st->table->localupdate;
+            return NULL;
+        }
+    }
+
+    /* if the distance between the last pushed key and the retrieved key
+     * is greater than the distance between last_pushed and localupdate,
+     * this means we are beyond localupdate.
+     */
+    if ((eb->key - st->last_pushed) > (st->table->localupdate - st->last_pushed)) {
+        st->table->commitupdate = st->last_pushed = st->table->localupdate;
+        return NULL;
+    }
+
+    return eb32_entry(eb, struct stksess, upd);
+}
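The lookup above, like the commitupdate CAS loops elsewhere in this patch, compares 32-bit update ids with expressions such as (int)(a - b) > 0, so ordering stays correct across counter wrap-around. A minimal self-contained demonstration of that serial-number arithmetic, not part of the patch:

#include <stdint.h>
#include <assert.h>

/* "a is newer than b" under 32-bit wrap-around (RFC 1982 style):
 * the signed difference keeps the ordering as long as the two ids
 * are less than 2^31 apart.
 */
static inline int demo_update_newer(uint32_t a, uint32_t b)
{
    return (int32_t)(a - b) > 0;
}

int main(void)
{
    assert(demo_update_newer(2, 1));
    /* 0 is "newer" than 0xffffffff: the counter just wrapped */
    assert(demo_update_newer(0, 0xffffffffu));
    assert(!demo_update_newer(1, 2));
    return 0;
}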
+/*
+ * Function used to look up recent stick-table updates associated with the
+ * <st> shared stick-table during the teach state 1 step.
+ */
+static inline struct stksess *peer_teach_stage1_stksess_lookup(struct shared_table *st)
+{
+    struct eb32_node *eb;
+
+    eb = eb32_lookup_ge(&st->table->updates, st->last_pushed+1);
+    if (!eb) {
+        st->flags |= SHTABLE_F_TEACH_STAGE1;
+        eb = eb32_first(&st->table->updates);
+        if (eb)
+            st->last_pushed = eb->key - 1;
+        return NULL;
+    }
+
+    return eb32_entry(eb, struct stksess, upd);
+}
+
+/*
+ * Function used to look up recent stick-table updates associated with the
+ * <st> shared stick-table during the teach state 2 step.
+ */
+static inline struct stksess *peer_teach_stage2_stksess_lookup(struct shared_table *st)
+{
+    struct eb32_node *eb;
+
+    eb = eb32_lookup_ge(&st->table->updates, st->last_pushed+1);
+    if (!eb || eb->key > st->teaching_origin) {
+        st->flags |= SHTABLE_F_TEACH_STAGE2;
+        return NULL;
+    }
+
+    return eb32_entry(eb, struct stksess, upd);
+}
+
+/*
+ * Generic function to emit update messages for the <st> stick-table when a
+ * lesson must be taught to the peer <p>.
+ *
+ * This function temporarily unlocks/locks <st> when it sends stick-table
+ * updates, or when decrementing its refcount in case of any error while
+ * sending these updates.
+ * It must be called with the stick-table lock released.
+ *
+ * Returns 0 if any message could not be built; in this case the appctx st0
+ * is set to the PEER_SESS_ST_END value.
+ * Returns -1 if there was not enough room left to send the message;
+ * any other negative returned value must be considered as an error, with the
+ * appctx st0 returned value equal to PEER_SESS_ST_END.
+ * If it returns 0 or -1, this function leaves <st> locked if it was already
+ * locked when entering this function, unlocked if it was not.
+ */
+static inline int peer_send_teachmsgs(struct appctx *appctx, struct peer *p,
+                                      struct stksess *(*peer_stksess_lookup)(struct shared_table *),
+                                      struct shared_table *st)
+{
+    int ret, new_pushed, use_timed;
+    int updates_sent = 0;
+
+    ret = 1;
+    use_timed = 0;
+    if (st != p->last_local_table) {
+        ret = peer_send_switchmsg(st, appctx);
+        if (ret <= 0)
+            return ret;
+
+        p->last_local_table = st;
+    }
+
+    if (peer_stksess_lookup != peer_teach_process_stksess_lookup)
+        use_timed = !(p->flags & PEER_F_DWNGRD);
+
+    /* We force new pushed to 1 to force identifier in update message */
+    new_pushed = 1;
+
+    HA_RWLOCK_RDLOCK(STK_TABLE_LOCK, &st->table->updt_lock);
+
+    while (1) {
+        struct stksess *ts;
+        unsigned updateid;
+
+        /* push local updates */
+        ts = peer_stksess_lookup(st);
+        if (!ts) {
+            ret = 1; // done
+            break;
+        }
+
+        updateid = ts->upd.key;
+        if (p->srv->shard && ts->shard != p->srv->shard) {
+            /* Skip this entry */
+            st->last_pushed = updateid;
+            new_pushed = 1;
+            continue;
+        }
+
+        HA_ATOMIC_INC(&ts->ref_cnt);
+        HA_RWLOCK_RDUNLOCK(STK_TABLE_LOCK, &st->table->updt_lock);
+
+        ret = peer_send_updatemsg(st, appctx, ts, updateid, new_pushed, use_timed);
+        HA_RWLOCK_RDLOCK(STK_TABLE_LOCK, &st->table->updt_lock);
+        HA_ATOMIC_DEC(&ts->ref_cnt);
+        if (ret <= 0)
+            break;
+
+        st->last_pushed = updateid;
+
+        if (peer_stksess_lookup == peer_teach_process_stksess_lookup) {
+            uint commitid = _HA_ATOMIC_LOAD(&st->table->commitupdate);
+
+            while ((int)(updateid - commitid) > 0) {
+                if (_HA_ATOMIC_CAS(&st->table->commitupdate, &commitid, updateid))
+                    break;
+                __ha_cpu_relax();
+            }
+        }
+
+        /* identifier may not be needed in the next update message */
+        new_pushed = 0;
+
+        updates_sent++;
+        if (updates_sent >= peers_max_updates_at_once) {
+            /* pretend we're full so that we get back ASAP */
+            struct stconn *sc = appctx_sc(appctx);
+
+            sc_need_room(sc, 0);
+            ret = -1;
+            break;
+        }
+    }
+
+ out:
+    HA_RWLOCK_RDUNLOCK(STK_TABLE_LOCK, &st->table->updt_lock);
+    return ret;
+}
+
+/*
+ * Function to emit update messages for the <st> stick-table when a lesson must
+ * be taught to the peer <p> (PEER_F_LEARN_ASSIGN flag set).
+ *
+ * Note that the <st> shared stick-table is locked when calling this function,
+ * and the lock is dropped then re-acquired.
+ *
+ * Returns 0 if any message could not be built; in this case the appctx st0
+ * is set to the PEER_SESS_ST_END value.
+ * Returns -1 if there was not enough room left to send the message;
+ * any other negative returned value must be considered as an error, with the
+ * appctx st0 returned value equal to PEER_SESS_ST_END.
+ */
+static inline int peer_send_teach_process_msgs(struct appctx *appctx, struct peer *p,
+                                               struct shared_table *st)
+{
+    int ret;
+
+    HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &st->table->lock);
+    ret = peer_send_teachmsgs(appctx, p, peer_teach_process_stksess_lookup, st);
+    HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &st->table->lock);
+
+    return ret;
+}
+
+/*
+ * Function to emit update messages for the <st> stick-table when a lesson must
+ * be taught to the peer <p> during the teach state 1 step. It must be called
+ * with the stick-table lock released.
+ *
+ * Returns 0 if any message could not be built; in this case the appctx st0
+ * is set to the PEER_SESS_ST_END value.
+ * Returns -1 if there was not enough room left to send the message;
+ * any other negative returned value must be considered as an error, with the
+ * appctx st0 returned value equal to PEER_SESS_ST_END.
+ */
+static inline int peer_send_teach_stage1_msgs(struct appctx *appctx, struct peer *p,
+                                              struct shared_table *st)
+{
+    return peer_send_teachmsgs(appctx, p, peer_teach_stage1_stksess_lookup, st);
+}
+
+/*
+ * Function to emit update messages for the <st> stick-table when a lesson must
+ * be taught to the peer <p> during the teach state 2 step. It must be called
+ * with the stick-table lock released.
+ *
+ * Returns 0 if any message could not be built; in this case the appctx st0
+ * is set to the PEER_SESS_ST_END value.
+ * Returns -1 if there was not enough room left to send the message;
+ * any other negative returned value must be considered as an error, with the
+ * appctx st0 returned value equal to PEER_SESS_ST_END.
+ */
+static inline int peer_send_teach_stage2_msgs(struct appctx *appctx, struct peer *p,
+                                              struct shared_table *st)
+{
+    return peer_send_teachmsgs(appctx, p, peer_teach_stage2_stksess_lookup, st);
+}
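The update messages parsed below serialize each freq_ctr as three varints: the age of the current tick (now_ms - curr_tick), then curr_ctr and prev_ctr; the receiver rebuilds curr_tick against its own clock and clears bit 0, which is reserved for the freq_ctr lock. Here is a round-trip sketch using the illustrative demo_* varints from the earlier sketches, with tick handling reduced to plain arithmetic; it is not part of the patch:

#include <stdint.h>

struct demo_freq_ctr {
    uint32_t curr_tick; /* bit 0 reserved for a lock in HAProxy */
    uint32_t curr_ctr;
    uint32_t prev_ctr;
};

/* Sender side: emit (age, curr, prev) as three varints. */
static size_t demo_frqp_encode(const struct demo_freq_ctr *f,
                               uint32_t now_ms, unsigned char *out)
{
    size_t n = 0;

    n += demo_intencode((uint32_t)(now_ms - f->curr_tick), out + n);
    n += demo_intencode(f->curr_ctr, out + n);
    n += demo_intencode(f->prev_ctr, out + n);
    return n;
}

/* Receiver side: rebuild the tick from the age against the local
 * clock, masking bit 0 as peer_treat_updatemsg() does.
 * Returns bytes consumed, 0 on truncated input.
 */
static size_t demo_frqp_decode(const unsigned char *in, size_t len,
                               uint32_t now_ms, struct demo_freq_ctr *f)
{
    uint64_t age, curr, prev;
    size_t n = 0, r;

    if (!(r = demo_intdecode(in + n, len - n, &age)))
        return 0;
    n += r;
    if (!(r = demo_intdecode(in + n, len - n, &curr)))
        return 0;
    n += r;
    if (!(r = demo_intdecode(in + n, len - n, &prev)))
        return 0;
    n += r;

    f->curr_tick = (uint32_t)(now_ms - age) & ~1u;
    f->curr_ctr = curr;
    f->prev_ctr = prev;
    return n;
}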
+/*
+ * Function used to parse a stick-table update message after it has been received
+ * by the <p> peer, with <msg_cur> as the address of the pointer to the position in the
+ * receive buffer and <msg_end> being the position of the end of the stick-table message.
+ * Update <msg_cur> according to the peer protocol specs if no peer protocol error
+ * was encountered.
+ * <exp> must be set if the stick-table entry expires.
+ * <updt> must be set for PEER_MSG_STKT_UPDATE or PEER_MSG_STKT_UPDATE_TIMED stick-table
+ * messages; in this case the stick-table update message is received with a stick-table
+ * update ID.
+ * <totl> is the length of the stick-table update message computed upon receipt.
+ */
+static int peer_treat_updatemsg(struct appctx *appctx, struct peer *p, int updt, int exp,
+                                char **msg_cur, char *msg_end, int msg_len, int totl)
+{
+    struct shared_table *st = p->remote_table;
+    struct stktable *table;
+    struct stksess *ts, *newts;
+    struct stksess *wts = NULL; /* write_to stksess */
+    uint32_t update;
+    int expire;
+    unsigned int data_type;
+    size_t keylen;
+    void *data_ptr;
+    char *msg_save;
+
+    TRACE_ENTER(PEERS_EV_UPDTMSG, NULL, p);
+    /* Here we have a data message */
+    if (!st)
+        goto ignore_msg;
+
+    table = st->table;
+
+    expire = MS_TO_TICKS(table->expire);
+
+    if (updt) {
+        if (msg_len < sizeof(update)) {
+            TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG, NULL, p);
+            goto malformed_exit;
+        }
+
+        memcpy(&update, *msg_cur, sizeof(update));
+        *msg_cur += sizeof(update);
+        st->last_get = htonl(update);
+    }
+    else {
+        st->last_get++;
+    }
+
+    if (exp) {
+        size_t expire_sz = sizeof expire;
+
+        if (*msg_cur + expire_sz > msg_end) {
+            TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG,
+                        NULL, p, *msg_cur);
+            TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG,
+                        NULL, p, msg_end, &expire_sz);
+            goto malformed_exit;
+        }
+
+        memcpy(&expire, *msg_cur, expire_sz);
+        *msg_cur += expire_sz;
+        expire = ntohl(expire);
+    }
+
+    newts = stksess_new(table, NULL);
+    if (!newts)
+        goto ignore_msg;
+
+    if (table->type == SMP_T_STR) {
+        unsigned int to_read, to_store;
+
+        to_read = intdecode(msg_cur, msg_end);
+        if (!*msg_cur) {
+            TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG, NULL, p);
+            goto malformed_free_newts;
+        }
+
+        to_store = MIN(to_read, table->key_size - 1);
+        if (*msg_cur + to_store > msg_end) {
+            TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG,
+                        NULL, p, *msg_cur);
+            TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG,
+                        NULL, p, msg_end, &to_store);
+            goto malformed_free_newts;
+        }
+
+        keylen = to_store;
+        memcpy(newts->key.key, *msg_cur, keylen);
+        newts->key.key[keylen] = 0;
+        *msg_cur += to_read;
+    }
+    else if (table->type == SMP_T_SINT) {
+        unsigned int netinteger;
+
+        if (*msg_cur + sizeof(netinteger) > msg_end) {
+            TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG,
+                        NULL, p, *msg_cur);
+            TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG,
+                        NULL, p, msg_end);
+            goto malformed_free_newts;
+        }
+
+        keylen = sizeof(netinteger);
+        memcpy(&netinteger, *msg_cur, keylen);
+        netinteger = ntohl(netinteger);
+        memcpy(newts->key.key, &netinteger, keylen);
+        *msg_cur += keylen;
+    }
+    else {
+        if (*msg_cur + table->key_size > msg_end) {
+            TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG,
+                        NULL, p, *msg_cur);
+            TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG,
+                        NULL, p, msg_end, &table->key_size);
+            goto malformed_free_newts;
+        }
+
+        keylen = table->key_size;
+        memcpy(newts->key.key, *msg_cur, keylen);
+        *msg_cur += keylen;
+    }
+
+    newts->shard = stktable_get_key_shard(table, newts->key.key, keylen);
+
+    /* look up the existing entry */
+    ts = stktable_set_entry(table, newts);
+    if (ts != newts) {
+        stksess_free(table, newts);
+        newts = NULL;
+    }
+
+    msg_save = *msg_cur;
+
+ update_wts:
+
+    HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock);
+
+    for (data_type = 0 ; data_type < STKTABLE_DATA_TYPES ; data_type++) {
+        uint64_t decoded_int;
+        unsigned int idx;
+        int ignore = 0;
+
+        if (!((1ULL << data_type) & st->remote_data))
+            continue;
+
+        /* We shouldn't learn local-only values. Also, when handling the
+         * write_to table we must ignore types that can be processed
+         * so we don't interfere with any potential arithmetic logic
+         * performed on them (ie: cumulative counters).
+         */
+        if (stktable_data_types[data_type].is_local ||
+            (table != st->table && !stktable_data_types[data_type].as_is))
+            ignore = 1;
+
+        if (stktable_data_types[data_type].is_array) {
+            /* In the case of an array, all elements
+             * use the same std_type and they
+             * are linearly encoded.
+             * The number of elements was provided
+             * by the table definition message.
+             */
+            switch (stktable_data_types[data_type].std_type) {
+            case STD_T_SINT:
+                for (idx = 0; idx < st->remote_data_nbelem[data_type]; idx++) {
+                    decoded_int = intdecode(msg_cur, msg_end);
+                    if (!*msg_cur) {
+                        TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG, NULL, p);
+                        goto malformed_unlock;
+                    }
+
+                    data_ptr = stktable_data_ptr_idx(table, ts, data_type, idx);
+                    if (data_ptr && !ignore)
+                        stktable_data_cast(data_ptr, std_t_sint) = decoded_int;
+                }
+                break;
+            case STD_T_UINT:
+                for (idx = 0; idx < st->remote_data_nbelem[data_type]; idx++) {
+                    decoded_int = intdecode(msg_cur, msg_end);
+                    if (!*msg_cur) {
+                        TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG, NULL, p);
+                        goto malformed_unlock;
+                    }
+
+                    data_ptr = stktable_data_ptr_idx(table, ts, data_type, idx);
+                    if (data_ptr && !ignore)
+                        stktable_data_cast(data_ptr, std_t_uint) = decoded_int;
+                }
+                break;
+            case STD_T_ULL:
+                for (idx = 0; idx < st->remote_data_nbelem[data_type]; idx++) {
+                    decoded_int = intdecode(msg_cur, msg_end);
+                    if (!*msg_cur) {
+                        TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG, NULL, p);
+                        goto malformed_unlock;
+                    }
+
+                    data_ptr = stktable_data_ptr_idx(table, ts, data_type, idx);
+                    if (data_ptr && !ignore)
+                        stktable_data_cast(data_ptr, std_t_ull) = decoded_int;
+                }
+                break;
+            case STD_T_FRQP:
+                for (idx = 0; idx < st->remote_data_nbelem[data_type]; idx++) {
+                    struct freq_ctr data;
+
+                    /* First bit is reserved for the freq_ctr lock.
+                     * Note: here we're still protected by the stksess lock
+                     * so we don't need to update the freq_ctr
+                     * using its internal lock.
+                     */
+
+                    decoded_int = intdecode(msg_cur, msg_end);
+                    if (!*msg_cur) {
+                        TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG, NULL, p);
+                        goto malformed_unlock;
+                    }
+
+                    data.curr_tick = tick_add(now_ms, -decoded_int) & ~0x1;
+                    data.curr_ctr = intdecode(msg_cur, msg_end);
+                    if (!*msg_cur) {
+                        TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG, NULL, p);
+                        goto malformed_unlock;
+                    }
+
+                    data.prev_ctr = intdecode(msg_cur, msg_end);
+                    if (!*msg_cur) {
+                        TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG, NULL, p);
+                        goto malformed_unlock;
+                    }
+
+                    data_ptr = stktable_data_ptr_idx(table, ts, data_type, idx);
+                    if (data_ptr && !ignore)
+                        stktable_data_cast(data_ptr, std_t_frqp) = data;
+                }
+                break;
+            }
+
+            /* array is fully decoded,
+             * proceed to the next data_type.
+             */
+            continue;
+        }
+        decoded_int = intdecode(msg_cur, msg_end);
+        if (!*msg_cur) {
+            TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG, NULL, p);
+            goto malformed_unlock;
+        }
+
+        switch (stktable_data_types[data_type].std_type) {
+        case STD_T_SINT:
+            data_ptr = stktable_data_ptr(table, ts, data_type);
+            if (data_ptr && !ignore)
+                stktable_data_cast(data_ptr, std_t_sint) = decoded_int;
+            break;
+
+        case STD_T_UINT:
+            data_ptr = stktable_data_ptr(table, ts, data_type);
+            if (data_ptr && !ignore)
+                stktable_data_cast(data_ptr, std_t_uint) = decoded_int;
+            break;
+
+        case STD_T_ULL:
+            data_ptr = stktable_data_ptr(table, ts, data_type);
+            if (data_ptr && !ignore)
+                stktable_data_cast(data_ptr, std_t_ull) = decoded_int;
+            break;
+
+        case STD_T_FRQP: {
+            struct freq_ctr data;
+
+            /* First bit is reserved for the freq_ctr lock.
+             * Note: here we're still protected by the stksess lock
+             * so we don't need to update the freq_ctr
+             * using its internal lock.
+             */
+
+            data.curr_tick = tick_add(now_ms, -decoded_int) & ~0x1;
+            data.curr_ctr = intdecode(msg_cur, msg_end);
+            if (!*msg_cur) {
+                TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG, NULL, p);
+                goto malformed_unlock;
+            }
+
+            data.prev_ctr = intdecode(msg_cur, msg_end);
+            if (!*msg_cur) {
+                TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG, NULL, p);
+                goto malformed_unlock;
+            }
+
+            data_ptr = stktable_data_ptr(table, ts, data_type);
+            if (data_ptr && !ignore)
+                stktable_data_cast(data_ptr, std_t_frqp) = data;
+            break;
+        }
+        case STD_T_DICT: {
+            struct buffer *chunk;
+            size_t data_len, value_len;
+            unsigned int id;
+            struct dict_entry *de;
+            struct dcache *dc;
+            char *end;
+
+            if (!decoded_int) {
+                /* No entry. */
+                break;
+            }
+            data_len = decoded_int;
+            if (*msg_cur + data_len > msg_end) {
+                TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG,
+                            NULL, p, *msg_cur);
+                TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG,
+                            NULL, p, msg_end, &data_len);
+                goto malformed_unlock;
+            }
+
+            /* Compute the end of the current data, <msg_end> being at the end of
+             * the entire message.
+             */
+            end = *msg_cur + data_len;
+            id = intdecode(msg_cur, end);
+            if (!*msg_cur || !id) {
+                TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG,
+                            NULL, p, *msg_cur, &id);
+                goto malformed_unlock;
+            }
+
+            dc = p->dcache;
+            if (*msg_cur == end) {
+                /* Dictionary entry key without value. */
+                if (id > dc->max_entries) {
+                    TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG,
+                                NULL, p, NULL, &id);
+                    goto malformed_unlock;
+                }
+                /* IDs sent over the network are numbered from 1. */
+                de = dc->rx[id - 1].de;
+            }
+            else {
+                chunk = get_trash_chunk();
+                value_len = intdecode(msg_cur, end);
+                if (!*msg_cur || *msg_cur + value_len > end ||
+                    unlikely(value_len + 1 >= chunk->size)) {
+                    TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG,
+                                NULL, p, *msg_cur, &value_len);
+                    TRACE_PROTO("malformed message", PEERS_EV_UPDTMSG,
+                                NULL, p, end, &chunk->size);
+                    goto malformed_unlock;
+                }
+
+                chunk_memcpy(chunk, *msg_cur, value_len);
+                chunk->area[chunk->data] = '\0';
+                *msg_cur += value_len;
+
+                de = dict_insert(&server_key_dict, chunk->area);
+                dict_entry_unref(&server_key_dict, dc->rx[id - 1].de);
+                dc->rx[id - 1].de = de;
+            }
+            if (de) {
+                data_ptr = stktable_data_ptr(table, ts, data_type);
+                if (data_ptr && !ignore) {
+                    HA_ATOMIC_INC(&de->refcount);
+                    stktable_data_cast(data_ptr, std_t_dict) = de;
+                }
+            }
+            break;
+        }
+        }
+    }
+
+    if (st->table->write_to.t && table != st->table->write_to.t) {
+        struct stktable_key stkey = { .key = ts->key.key, .key_len = keylen };
+
+        /* While we're still under the main ts lock, try to get the related
+         * write_to stksess with the main ts key
+         */
+        wts = stktable_get_entry(st->table->write_to.t, &stkey);
+    }
+
+    /* Force new expiration */
+    ts->expire = tick_add(now_ms, expire);
+
+    HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock);
+    stktable_touch_remote(table, ts, 1);
+
+    if (wts) {
+        /* Start over the message decoding for wts as we got a valid stksess
+         * for the write_to table, so we need to refresh the entry with supported
+         * values.
+         *
+         * We prefer to do the decoding a second time even though it might
+         * cost a bit more than copying from the main ts to wts, but doing so
+         * enables us to get rid of the main ts lock: we only need the wts lock
+         * since the upstream data is still available in msg_cur
+         */
+        ts = wts;
+        table = st->table->write_to.t;
+        wts = NULL; /* so we don't get back here */
+        *msg_cur = msg_save;
+        goto update_wts;
+    }
+
+ ignore_msg:
+    TRACE_LEAVE(PEERS_EV_UPDTMSG, NULL, p);
+    return 1;
+
+ malformed_unlock:
+    /* malformed message */
+    HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock);
+    stktable_touch_remote(st->table, ts, 1);
+    appctx->st0 = PEER_SESS_ST_ERRPROTO;
+    TRACE_DEVEL("leaving in error", PEERS_EV_UPDTMSG);
+    return 0;
+
+ malformed_free_newts:
+    /* malformed message */
+    stksess_free(st->table, newts);
+ malformed_exit:
+    appctx->st0 = PEER_SESS_ST_ERRPROTO;
+    TRACE_DEVEL("leaving in error", PEERS_EV_UPDTMSG);
+    return 0;
+}
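For STD_T_DICT data, the sender pushes the full dictionary key only the first time it enters its tx cache and afterwards only the 1-based cache id, while the receiver mirrors ids in an rx array, as seen in peer_prepare_updatemsg() and just above. Here is a toy model of that exchange, not part of the patch: fixed-size cache, no real eviction policy, refcounting or locking, all of which the real dcache handles:

#include <string.h>
#include <stdlib.h>

#define DEMO_DCACHE_SZ 8

struct demo_dcache {
    char *tx[DEMO_DCACHE_SZ]; /* sender: keys already sent */
    char *rx[DEMO_DCACHE_SZ]; /* receiver: id -> key */
};

/* Sender side: returns the 1-based id; *need_full is set when the
 * key was not cached yet and must be sent in full with this id.
 */
static int demo_dcache_tx(struct demo_dcache *dc, const char *key, int *need_full)
{
    int i, slot = 0;

    for (i = 0; i < DEMO_DCACHE_SZ; i++) {
        if (dc->tx[i] && strcmp(dc->tx[i], key) == 0) {
            *need_full = 0;
            return i + 1;
        }
    }
    /* naive "eviction": recycle slot 0 when the cache is full */
    for (i = 0; i < DEMO_DCACHE_SZ; i++)
        if (!dc->tx[i]) { slot = i; break; }
    free(dc->tx[slot]);
    dc->tx[slot] = strdup(key);
    *need_full = 1;
    return slot + 1;
}

/* Receiver side: a full entry installs the key; an id-only entry
 * must already be known, otherwise the message is malformed (NULL).
 */
static const char *demo_dcache_rx(struct demo_dcache *dc, int id, const char *full_key)
{
    if (id < 1 || id > DEMO_DCACHE_SZ)
        return NULL; /* malformed */
    if (full_key) {
        free(dc->rx[id - 1]);
        dc->rx[id - 1] = strdup(full_key);
    }
    return dc->rx[id - 1];
}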
+/*
+ * Function used to parse a stick-table update acknowledgement message after it
+ * has been received by the <p> peer, with <msg_cur> as the address of the pointer
+ * to the position in the receive buffer and <msg_end> being the position of the
+ * end of the stick-table message.
+ * Update <msg_cur> according to the peer protocol specs if no peer protocol error
+ * was encountered.
+ * Returns 1 if it succeeded, 0 if not, with the appctx state st0 set to PEER_SESS_ST_ERRPROTO.
+ */
+static inline int peer_treat_ackmsg(struct appctx *appctx, struct peer *p,
+                                    char **msg_cur, char *msg_end)
+{
+    /* ack message */
+    uint32_t table_id;
+    uint32_t update;
+    struct shared_table *st;
+
+    /* ignore ack during teaching process */
+    if (p->flags & PEER_F_TEACH_PROCESS)
+        return 1;
+
+    table_id = intdecode(msg_cur, msg_end);
+    if (!*msg_cur || (*msg_cur + sizeof(update) > msg_end)) {
+        /* malformed message */
+
+        TRACE_PROTO("malformed message", PEERS_EV_ACKMSG,
+                    NULL, p, *msg_cur);
+        appctx->st0 = PEER_SESS_ST_ERRPROTO;
+        return 0;
+    }
+
+    memcpy(&update, *msg_cur, sizeof(update));
+    update = ntohl(update);
+
+    for (st = p->tables; st; st = st->next) {
+        if (st->local_id == table_id) {
+            st->update = update;
+            break;
+        }
+    }
+
+    return 1;
+}
+
+/*
+ * Function used to parse a stick-table switch message after it has been received
+ * by the <p> peer, with <msg_cur> as the address of the pointer to the position in the
+ * receive buffer and <msg_end> being the position of the end of the stick-table message.
+ * Update <msg_cur> according to the peer protocol specs if no peer protocol error
+ * was encountered.
+ * Returns 1 if it succeeded, 0 if not, with the appctx state st0 set to PEER_SESS_ST_ERRPROTO.
+ */
+static inline int peer_treat_switchmsg(struct appctx *appctx, struct peer *p,
+                                       char **msg_cur, char *msg_end)
+{
+    struct shared_table *st;
+    int table_id;
+
+    table_id = intdecode(msg_cur, msg_end);
+    if (!*msg_cur) {
+        TRACE_PROTO("malformed message", PEERS_EV_SWTCMSG, NULL, p);
+        /* malformed message */
+        appctx->st0 = PEER_SESS_ST_ERRPROTO;
+        return 0;
+    }
+
+    p->remote_table = NULL;
+    for (st = p->tables; st; st = st->next) {
+        if (st->remote_id == table_id) {
+            p->remote_table = st;
+            break;
+        }
+    }
+
+    return 1;
+}
+
+/*
+ * Function used to parse a stick-table definition message after it has been received
+ * by the <p> peer, with <msg_cur> as the address of the pointer to the position in the
+ * receive buffer and <msg_end> being the position of the end of the stick-table message.
+ * Update <msg_cur> according to the peer protocol specs if no peer protocol error
+ * was encountered.
+ * <totl> is the length of the stick-table update message computed upon receipt.
+ * Returns 1 if it succeeded, 0 if not, with the appctx state st0 set to PEER_SESS_ST_ERRPROTO.
+ */
+static inline int peer_treat_definemsg(struct appctx *appctx, struct peer *p,
+                                       char **msg_cur, char *msg_end, int totl)
+{
+    int table_id_len;
+    struct shared_table *st;
+    int table_type;
+    int table_keylen;
+    int table_id;
+    uint64_t table_data;
+
+    table_id = intdecode(msg_cur, msg_end);
+    if (!*msg_cur) {
+        TRACE_PROTO("malformed message", PEERS_EV_DEFMSG, NULL, p);
+        goto malformed_exit;
+    }
+
+    table_id_len = intdecode(msg_cur, msg_end);
+    if (!*msg_cur) {
+        TRACE_PROTO("malformed message", PEERS_EV_DEFMSG, NULL, p, *msg_cur);
+        goto malformed_exit;
+    }
+
+    p->remote_table = NULL;
+    if (!table_id_len || (*msg_cur + table_id_len) >= msg_end) {
+        TRACE_PROTO("malformed message", PEERS_EV_DEFMSG, NULL, p, *msg_cur, &table_id_len);
+        goto malformed_exit;
+    }
+
+    for (st = p->tables; st; st = st->next) {
+        /* Reset IDs */
+        if (st->remote_id == table_id)
+            st->remote_id = 0;
+
+        if (!p->remote_table && (table_id_len == strlen(st->table->nid)) &&
+            (memcmp(st->table->nid, *msg_cur, table_id_len) == 0))
+            p->remote_table = st;
+    }
+
+    if (!p->remote_table) {
+        TRACE_PROTO("ignored message", PEERS_EV_DEFMSG, NULL, p);
+        goto ignore_msg;
+    }
+
+    *msg_cur += table_id_len;
+    if (*msg_cur >= msg_end) {
+        TRACE_PROTO("malformed message", PEERS_EV_DEFMSG, NULL, p);
+        goto malformed_exit;
+    }
+
+    table_type = intdecode(msg_cur, msg_end);
+    if (!*msg_cur) {
+        TRACE_PROTO("malformed message", PEERS_EV_DEFMSG, NULL, p);
+        goto malformed_exit;
+    }
+
+    table_keylen = intdecode(msg_cur, msg_end);
+    if (!*msg_cur) {
+        TRACE_PROTO("malformed message", PEERS_EV_DEFMSG, NULL, p);
+        goto malformed_exit;
+    }
+
+    table_data = intdecode(msg_cur, msg_end);
+    if (!*msg_cur) {
+        TRACE_PROTO("malformed message", PEERS_EV_DEFMSG, NULL, p);
+        goto malformed_exit;
+    }
+
+    if (p->remote_table->table->type != peer_int_key_type[table_type]
+        || p->remote_table->table->key_size != table_keylen) {
+        p->remote_table = NULL;
+        TRACE_PROTO("ignored message", PEERS_EV_DEFMSG, NULL, p);
+        goto ignore_msg;
+    }
+
+    /* Check if there is the additional expire data */
+    intdecode(msg_cur, msg_end);
+    if (*msg_cur) {
+        uint64_t data_type;
+        uint64_t type;
+
+        /* This define contains the expire data, so we consider
+         * it also contains all data_types parameters.
+         */
+        for (data_type = 0; data_type < STKTABLE_DATA_TYPES; data_type++) {
+            if (table_data & (1ULL << data_type)) {
+                if (stktable_data_types[data_type].is_array) {
+                    /* This should be an array,
+                     * so we parse the data_type prefix
+                     * because we must have parameters.
+                     */
+                    type = intdecode(msg_cur, msg_end);
+                    if (!*msg_cur) {
+                        p->remote_table = NULL;
+                        TRACE_PROTO("missing meta data for array", PEERS_EV_DEFMSG, NULL, p);
+                        goto ignore_msg;
+                    }
+
+                    /* check if the data_type matches the current one from the bitfield */
+                    if (type != data_type) {
+                        p->remote_table = NULL;
+                        TRACE_PROTO("meta data mismatch type", PEERS_EV_DEFMSG, NULL, p);
+                        goto ignore_msg;
+                    }
+
+                    /* decode the nbelem of the array */
+                    p->remote_table->remote_data_nbelem[type] = intdecode(msg_cur, msg_end);
+                    if (!*msg_cur) {
+                        p->remote_table = NULL;
+                        TRACE_PROTO("missing array size meta data for array", PEERS_EV_DEFMSG, NULL, p);
+                        goto ignore_msg;
+                    }
+
+                    /* if it is an array of frqp, we must also have the period to decode */
+                    if (stktable_data_types[data_type].std_type == STD_T_FRQP) {
+                        intdecode(msg_cur, msg_end);
+                        if (!*msg_cur) {
+                            p->remote_table = NULL;
+                            TRACE_PROTO("missing period for frqp", PEERS_EV_DEFMSG, NULL, p);
+                            goto ignore_msg;
+                        }
+                    }
+                }
+                else if (stktable_data_types[data_type].std_type == STD_T_FRQP) {
+                    /* This should be a std freq counter data_type,
+                     * so we parse the data_type prefix
+                     * because we must have parameters.
+                     */
+                    type = intdecode(msg_cur, msg_end);
+                    if (!*msg_cur) {
+                        p->remote_table = NULL;
+                        TRACE_PROTO("missing meta data for frqp", PEERS_EV_DEFMSG, NULL, p);
+                        goto ignore_msg;
+                    }
+
+                    /* check if the data_type matches the current one from the bitfield */
+                    if (type != data_type) {
+                        p->remote_table = NULL;
+                        TRACE_PROTO("meta data mismatch type", PEERS_EV_DEFMSG, NULL, p);
+                        goto ignore_msg;
+                    }
+
+                    /* decode the period */
+                    intdecode(msg_cur, msg_end);
+                    if (!*msg_cur) {
+                        p->remote_table = NULL;
+                        TRACE_PROTO("missing period for frqp", PEERS_EV_DEFMSG, NULL, p);
+                        goto ignore_msg;
+                    }
+                }
+            }
+        }
+    }
+    else {
+        uint64_t data_type;
+
+        /* There is no additional data, but the array size parameter
+         * is mandatory to parse an array, so we consider it an error
+         * if an array data_type is defined but there is no additional
+         * data.
+         */
+        for (data_type = 0; data_type < STKTABLE_DATA_TYPES; data_type++) {
+            if (table_data & (1ULL << data_type)) {
+                if (stktable_data_types[data_type].is_array) {
+                    p->remote_table = NULL;
+                    TRACE_PROTO("missing array size meta data for array", PEERS_EV_DEFMSG, NULL, p);
+                    goto ignore_msg;
+                }
+            }
+        }
+    }
+
+    p->remote_table->remote_data = table_data;
+    p->remote_table->remote_id = table_id;
+
+ ignore_msg:
+    return 1;
+
+ malformed_exit:
+    /* malformed message */
+    appctx->st0 = PEER_SESS_ST_ERRPROTO;
+    return 0;
+}
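peer_recv_msg() below is the receive-side counterpart of the framing shown earlier: it always reads the two header bytes, and only pulls a varint length plus payload when the type byte indicates a stick-table message. The following stand-alone sketch mirrors that logic over a flat buffer; it is not part of the patch, it reuses the illustrative demo_intdecode() from earlier, and DEMO_HAS_PAYLOAD with its 0x80 mask merely stands in for the real PEER_MSG_STKT_BIT_MASK test:

#include <stdint.h>

/* Parse one framed message out of <in,len>.
 * Returns bytes consumed, 0 if more data is needed, -1 never here
 * (the real function also distinguishes oversized/malformed lengths).
 */
#define DEMO_HAS_PAYLOAD(type) ((type) & 0x80) /* illustrative mask */

static long demo_recv_msg(const unsigned char *in, size_t len,
                          unsigned char head[2],
                          const unsigned char **payload, uint64_t *plen)
{
    size_t n;

    if (len < 2)
        return 0;
    head[0] = in[0];
    head[1] = in[1];
    *payload = NULL;
    *plen = 0;
    if (!DEMO_HAS_PAYLOAD(head[1]))
        return 2;              /* control/error: header only */

    n = demo_intdecode(in + 2, len - 2, plen);
    if (!n)
        return 0;              /* length varint still incomplete */
    if (len - 2 - n < *plen)
        return 0;              /* payload not fully received yet */
    *payload = in + 2 + n;
    return (long)(2 + n + *plen);
}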
+/*
+ * Receive a stick-table message or pre-parse any other message.
+ * The message's header will be stored into <msg_head> which must be at least
+ * <msg_head_sz> bytes long (at least 7 to store 32-bit variable lengths).
+ * The first two bytes are always read, and the rest is only read if the
+ * first bytes indicate a stick-table message. If the message is a stick-table
+ * message, the varint is decoded and the equivalent number of bytes will be
+ * copied into the trash at trash.area. <totl> is incremented by the number of
+ * bytes read EVEN IN CASE OF INCOMPLETE MESSAGES.
+ * Returns 1 if there was no error; otherwise returns 0 if not enough data was
+ * available, or -1 if there was an error, updating the appctx state st0
+ * accordingly.
+ */
+static inline int peer_recv_msg(struct appctx *appctx, char *msg_head, size_t msg_head_sz,
+                                uint32_t *msg_len, int *totl)
+{
+    int reql;
+    struct stconn *sc = appctx_sc(appctx);
+    char *cur;
+
+    reql = co_getblk(sc_oc(sc), msg_head, 2 * sizeof(char), *totl);
+    if (reql <= 0) /* closed or EOL not found */
+        goto incomplete;
+
+    *totl += reql;
+
+    if (!(msg_head[1] & PEER_MSG_STKT_BIT_MASK))
+        return 1;
+
+    /* This is a stick-table message, let's go on */
+
+    /* Read and Decode message length */
+    msg_head += *totl;
+    msg_head_sz -= *totl;
+    reql = co_data(sc_oc(sc)) - *totl;
+    if (reql > msg_head_sz)
+        reql = msg_head_sz;
+
+    reql = co_getblk(sc_oc(sc), msg_head, reql, *totl);
+    if (reql <= 0) /* closed */
+        goto incomplete;
+
+    cur = msg_head;
+    *msg_len = intdecode(&cur, cur + reql);
+    if (!cur) {
+        /* the number is truncated, did we read enough ? */
+        if (reql < msg_head_sz)
+            goto incomplete;
+
+        /* malformed message */
+        TRACE_PROTO("malformed message: too large length encoding", PEERS_EV_UPDTMSG);
+        appctx->st0 = PEER_SESS_ST_ERRPROTO;
+        return -1;
+    }
+    *totl += cur - msg_head;
+
+    /* Read message content */
+    if (*msg_len) {
+        if (*msg_len > trash.size) {
+            /* Status code is not success, abort */
+            appctx->st0 = PEER_SESS_ST_ERRSIZE;
+            return -1;
+        }
+
+        reql = co_getblk(sc_oc(sc), trash.area, *msg_len, *totl);
+        if (reql <= 0) /* closed */
+            goto incomplete;
+        *totl += reql;
+    }
+
+    return 1;
+
+ incomplete:
+    if (reql < 0 || (sc->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED))) {
+        /* there was an error or the message was truncated */
+        appctx->st0 = PEER_SESS_ST_END;
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Treat the awaited message with <msg_head> as header.
+ * Returns 1 if it succeeded, 0 if not.
+ */
+static inline int peer_treat_awaited_msg(struct appctx *appctx, struct peer *peer, unsigned char *msg_head,
+                                         char **msg_cur, char *msg_end, int msg_len, int totl)
+{
+    struct peers *peers = peer->peers;
+
+    if (msg_head[0] == PEER_MSG_CLASS_CONTROL) {
+        if (msg_head[1] == PEER_MSG_CTRL_RESYNCREQ) {
+            struct shared_table *st;
+            /* Reset message: remote needs resync */
+
+            TRACE_PROTO("received control message", PEERS_EV_CTRLMSG,
+                        NULL, &msg_head[1], peers->local->id, peer->id);
+            /* prepare tables for a global push */
+            for (st = peer->tables; st; st = st->next) {
+                st->teaching_origin = st->last_pushed = st->update;
+                st->flags = 0;
+            }
+
+            /* reset teaching flags to 0 */
+            peer->flags &= PEER_TEACH_RESET;
+
+            /* flag to start to teach lesson */
+            peer->flags |= PEER_F_TEACH_PROCESS;
+            peers->flags |= PEERS_F_RESYNC_REQUESTED;
+        }
+        else if (msg_head[1] == PEER_MSG_CTRL_RESYNCFINISHED) {
+            TRACE_PROTO("received control message", PEERS_EV_CTRLMSG,
+                        NULL, &msg_head[1], peers->local->id, peer->id);
+            if (peer->flags & PEER_F_LEARN_ASSIGN) {
+                int commit_a_finish = 1;
+
+                peer->flags &= ~PEER_F_LEARN_ASSIGN;
+                peers->flags &= ~(PEERS_F_RESYNC_ASSIGN|PEERS_F_RESYNC_PROCESS);
+                if (peer->srv->shard) {
+                    struct peer *ps;
+
+                    peers->flags |= PEERS_F_RESYNC_REMOTEPARTIAL;
+                    peer->flags |= PEER_F_LEARN_NOTUP2DATE;
+                    for (ps = peers->remote; ps; ps = ps->next) {
+                        if (ps->srv->shard == peer->srv->shard) {
+                            /* flag all peers from the same shard
+                             * notup2date to disable the request
+                             * of a resync from them
+                             */
+                            ps->flags |= PEER_F_LEARN_NOTUP2DATE;
+                        }
+                        else if (ps->srv->shard && !(ps->flags & PEER_F_LEARN_NOTUP2DATE)) {
+                            /* some other shards remain to be requested,
+                             * so we don't commit a resync finish in order
+                             * to request the other shards
+                             */
+                            commit_a_finish = 0;
+                        }
+                    }
+
+                    if (!commit_a_finish) {
+                        /* some shards remain to be requested, we schedule a new request
+                         */
+                        peers->resync_timeout = tick_add(now_ms, MS_TO_TICKS(PEER_RESYNC_TIMEOUT));
+                        task_wakeup(peers->sync_task, TASK_WOKEN_MSG);
+                    }
+                }
+
+                if (commit_a_finish) {
+                    peers->flags |= (PEERS_F_RESYNC_LOCAL|PEERS_F_RESYNC_REMOTE);
+                    if (peer->local)
+                        peers->flags |= PEERS_F_RESYNC_LOCALFINISHED;
+                    else
+                        peers->flags |= PEERS_F_RESYNC_REMOTEFINISHED;
+                }
+            }
+            peer->confirm++;
+        }
+        else if (msg_head[1] == PEER_MSG_CTRL_RESYNCPARTIAL) {
+            TRACE_PROTO("received control message", PEERS_EV_CTRLMSG,
+                        NULL, &msg_head[1], peers->local->id, peer->id);
+            if (peer->flags & PEER_F_LEARN_ASSIGN) {
+                peer->flags &= ~PEER_F_LEARN_ASSIGN;
+                peers->flags &= ~(PEERS_F_RESYNC_ASSIGN|PEERS_F_RESYNC_PROCESS);
+
+                if (peer->local)
+                    peers->flags |= PEERS_F_RESYNC_LOCALPARTIAL;
+                else
+                    peers->flags |= PEERS_F_RESYNC_REMOTEPARTIAL;
+                peer->flags |= PEER_F_LEARN_NOTUP2DATE;
+                peers->resync_timeout = tick_add(now_ms, MS_TO_TICKS(PEER_RESYNC_TIMEOUT));
+                task_wakeup(peers->sync_task, TASK_WOKEN_MSG);
+            }
+            peer->confirm++;
+        }
+        else if (msg_head[1] == PEER_MSG_CTRL_RESYNCCONFIRM) {
+            struct shared_table *st;
+
+            TRACE_PROTO("received control message", PEERS_EV_CTRLMSG,
+                        NULL, &msg_head[1], peers->local->id, peer->id);
+            /* If stopping state */
+            if (stopping) {
+                /* Close session, push resync no more needed */
+                peer->flags |= PEER_F_TEACH_COMPLETE;
+                appctx->st0 = PEER_SESS_ST_END;
+                return 0;
+            }
+            for (st = peer->tables; st; st = st->next) {
+                st->update = st->last_pushed = st->teaching_origin;
+                st->flags = 0;
+            }
+
+            /* reset teaching flags to 0 */
+            peer->flags &= PEER_TEACH_RESET;
+        }
+        else if (msg_head[1] == PEER_MSG_CTRL_HEARTBEAT) {
+            TRACE_PROTO("received control message", PEERS_EV_CTRLMSG,
+                        NULL, &msg_head[1], peers->local->id, peer->id);
+            peer->reconnect = tick_add(now_ms, MS_TO_TICKS(PEER_RECONNECT_TIMEOUT));
+            peer->rx_hbt++;
+        }
+    }
+    else if (msg_head[0] == PEER_MSG_CLASS_STICKTABLE) {
+        if (msg_head[1] == PEER_MSG_STKT_DEFINE) {
+            if (!peer_treat_definemsg(appctx, peer, msg_cur, msg_end, totl))
+                return 0;
+        }
+        else if (msg_head[1] == PEER_MSG_STKT_SWITCH) {
+            if (!peer_treat_switchmsg(appctx, peer, msg_cur, msg_end))
+                return 0;
+        }
+        else if (msg_head[1] == PEER_MSG_STKT_UPDATE ||
+                 msg_head[1] == PEER_MSG_STKT_INCUPDATE ||
+                 msg_head[1] == PEER_MSG_STKT_UPDATE_TIMED ||
+                 msg_head[1] == PEER_MSG_STKT_INCUPDATE_TIMED) {
+            int update, expire;
+
+            update = msg_head[1] == PEER_MSG_STKT_UPDATE || msg_head[1] == PEER_MSG_STKT_UPDATE_TIMED;
+            expire = msg_head[1] == PEER_MSG_STKT_UPDATE_TIMED || msg_head[1] == PEER_MSG_STKT_INCUPDATE_TIMED;
+            if (!peer_treat_updatemsg(appctx, peer, update, expire,
+                                      msg_cur, msg_end, msg_len, totl))
+                return 0;
+
+        }
+        else if (msg_head[1] == PEER_MSG_STKT_ACK) {
+            if (!peer_treat_ackmsg(appctx, peer, msg_cur, msg_end))
+                return 0;
+        }
+    }
+    else if (msg_head[0] == PEER_MSG_CLASS_RESERVED) {
+        appctx->st0 = PEER_SESS_ST_ERRPROTO;
+        return 0;
+    }
+
+    return 1;
+}
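peer_send_msgs() below walks the peer's shared tables as a resumable ring: stop_local_table records where to restart after an interruption, and the saved last_local_table marks where a full turn completes. The core of that pattern in isolation, as a sketch that is not part of the patch (the real code records a slightly different restart point depending on which step was interrupted):

/* Minimal model of the resumable ring walk used below: process the
 * singly-linked list <head> as a ring, restarting after <*stop> and
 * finishing once <last> has been processed. process() returns <= 0
 * to interrupt; we then record where to resume next time.
 */
struct demo_table { struct demo_table *next; };

static int demo_ring_walk(struct demo_table *head,
                          struct demo_table *last,
                          struct demo_table **stop,
                          int (*process)(struct demo_table *))
{
    struct demo_table *st;

    if (!last)
        last = head;
    if (!*stop)
        *stop = last;
    st = (*stop)->next;

    while (1) {
        int ret;

        if (!st)
            st = head;         /* wrap around */
        ret = process(st);
        if (ret <= 0) {
            *stop = st;        /* resume from here on the next call */
            return ret;
        }
        if (st == last) {
            *stop = NULL;      /* full turn completed */
            return 1;
        }
        st = st->next;
    }
}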
+/*
+ * Send any message to the <peer> peer.
+ * Returns 1 if it succeeded, or -1 or 0 if it failed.
+ * -1 means an internal error occurred, 0 is for a peer protocol error leading
+ * to a peer state change (from the peer I/O handler point of view).
+ *
+ * - peer->last_local_table is the last table for which we sent update
+ *   messages.
+ *
+ * - peer->stop_local_table is the last evaluated table. It is unset when the
+ *   teaching process starts, but we use it as a restart point when the loop
+ *   is interrupted. It is especially useful when the number of tables exceeds
+ *   the peers_max_updates_at_once value.
+ *
+ * When a teaching loop is started, the peer's last_local_table is saved in a
+ * local variable. This variable is used as a finish point. When the current
+ * table is equal to it, it means all tables were evaluated, all updates were
+ * sent and the teaching process is finished.
+ *
+ * peer->stop_local_table is always NULL when the teaching process begins. It is
+ * only reset at the end. In the meantime, it always points to a table.
+ */
+
+static inline int peer_send_msgs(struct appctx *appctx,
+                                 struct peer *peer, struct peers *peers)
+{
+    int repl;
+
+    /* Need to request a resync */
+    if ((peer->flags & PEER_F_LEARN_ASSIGN) &&
+        (peers->flags & PEERS_F_RESYNC_ASSIGN) &&
+        !(peers->flags & PEERS_F_RESYNC_PROCESS)) {
+
+        repl = peer_send_resync_reqmsg(appctx, peer, peers);
+        if (repl <= 0)
+            return repl;
+
+        peers->flags |= PEERS_F_RESYNC_PROCESS;
+    }
+
+    /* Nothing to read, now we start to write */
+    if (peer->tables) {
+        struct shared_table *st;
+        struct shared_table *last_local_table;
+        int updates = 0;
+
+        last_local_table = peer->last_local_table;
+        if (!last_local_table)
+            last_local_table = peer->tables;
+        if (!peer->stop_local_table)
+            peer->stop_local_table = last_local_table;
+        st = peer->stop_local_table->next;
+
+        while (1) {
+            if (!st)
+                st = peer->tables;
+            /* Some updates remain to be acked */
+            if (st->last_get != st->last_acked) {
+                repl = peer_send_ackmsg(st, appctx);
+                if (repl <= 0)
+                    return repl;
+
+                st->last_acked = st->last_get;
+            }
+
+            if (!(peer->flags & PEER_F_TEACH_PROCESS)) {
+                HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &st->table->lock);
+                if (!(peer->flags & PEER_F_LEARN_ASSIGN) &&
+                    (st->last_pushed != st->table->localupdate)) {
+
+                    repl = peer_send_teach_process_msgs(appctx, peer, st);
+                    if (repl <= 0) {
+                        HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &st->table->lock);
+                        peer->stop_local_table = peer->last_local_table;
+                        return repl;
+                    }
+                }
+                HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &st->table->lock);
+            }
+            else if (!(peer->flags & PEER_F_TEACH_FINISHED)) {
+                if (!(st->flags & SHTABLE_F_TEACH_STAGE1)) {
+                    repl = peer_send_teach_stage1_msgs(appctx, peer, st);
+                    if (repl <= 0) {
+                        peer->stop_local_table = peer->last_local_table;
+                        return repl;
+                    }
+                }
+
+                if (!(st->flags & SHTABLE_F_TEACH_STAGE2)) {
+                    repl = peer_send_teach_stage2_msgs(appctx, peer, st);
+                    if (repl <= 0) {
+                        peer->stop_local_table = peer->last_local_table;
+                        return repl;
+                    }
+                }
+            }
+
+            if (st == last_local_table) {
+                peer->stop_local_table = NULL;
+                break;
+            }
+
+            /* This one is to be sure to restart from <st->next> if we are interrupted
+             * because of peer_send_teach_stage2_msgs or because the buffer is full
+             * when sending an ackmsg. In both cases the current <st> was evaluated
+             * and we must restart from <st->next>
+             */
+            peer->stop_local_table = st;
+
+            updates++;
+            if (updates >= peers_max_updates_at_once) {
+                /* pretend we're full so that we get back ASAP */
+                struct stconn *sc = appctx_sc(appctx);
+
+                sc_need_room(sc, 0);
+                return -1;
+            }
+
+            st = st->next;
+        }
+    }
+
+    if ((peer->flags & PEER_F_TEACH_PROCESS) && !(peer->flags & PEER_F_TEACH_FINISHED)) {
+        repl = peer_send_resync_finishedmsg(appctx, peer, peers);
+        if (repl <= 0)
+            return repl;
+
+        /* flag finished message sent */
+        peer->flags |= PEER_F_TEACH_FINISHED;
+    }
+
+    /* Confirm finished or partial messages */
+    while (peer->confirm) {
+        repl = peer_send_resync_confirmsg(appctx, peer, peers);
+        if (repl <= 0)
+            return repl;
+
+        peer->confirm--;
+    }
+
+    return 1;
+}
+
+/*
+ * Read and parse the first line of a "hello" peer protocol message.
+ * Returns 0 if a line could not be read, -1 if there was a read error or
+ * the line is malformed, 1 if it succeeded.
+ */
+static inline int peer_getline_version(struct appctx *appctx,
+                                       unsigned int *maj_ver, unsigned int *min_ver)
+{
+    int reql;
+
+    reql = peer_getline(appctx);
+    if (!reql)
+        return 0;
+
+    if (reql < 0)
+        return -1;
+
+    /* test protocol */
+    if (strncmp(PEER_SESSION_PROTO_NAME " ", trash.area, proto_len + 1) != 0) {
+        appctx->st0 = PEER_SESS_ST_EXIT;
+        appctx->st1 = PEER_SESS_SC_ERRPROTO;
+        return -1;
+    }
+    if (peer_get_version(trash.area + proto_len + 1, maj_ver, min_ver) == -1 ||
+        *maj_ver != PEER_MAJOR_VER || *min_ver > PEER_MINOR_VER) {
+        appctx->st0 = PEER_SESS_ST_EXIT;
+        appctx->st1 = PEER_SESS_SC_ERRVERSION;
+        return -1;
+    }
+
+    return 1;
+}
+
+/*
+ * Read and parse the second line of a "hello" peer protocol message.
+ * Returns 0 if a line could not be read, -1 if there was a read error or
+ * the line is malformed, 1 if it succeeded.
+ */
+static inline int peer_getline_host(struct appctx *appctx)
+{
+    int reql;
+
+    reql = peer_getline(appctx);
+    if (!reql)
+        return 0;
+
+    if (reql < 0)
+        return -1;
+
+    /* test hostname match */
+    if (strcmp(localpeer, trash.area) != 0) {
+        appctx->st0 = PEER_SESS_ST_EXIT;
+        appctx->st1 = PEER_SESS_SC_ERRHOST;
+        return -1;
+    }
+
+    return 1;
+}
+
+/*
+ * Read and parse the last line of a "hello" peer protocol message.
+ * Returns 0 if a line could not be read, -1 if there was a read error or
+ * the line is malformed, 1 if it succeeded.
+ * Set <curpeer> accordingly (the remote peer sending the "hello" message).
+ */
+static inline int peer_getline_last(struct appctx *appctx, struct peer **curpeer)
+{
+    char *p;
+    int reql;
+    struct peer *peer;
+    struct stream *s = appctx_strm(appctx);
+    struct peers *peers = strm_fe(s)->parent;
+
+    reql = peer_getline(appctx);
+    if (!reql)
+        return 0;
+
+    if (reql < 0)
+        return -1;
+
+    /* parse line "<peer name> <pid> <relative_pid>" */
+    p = strchr(trash.area, ' ');
+    if (!p) {
+        appctx->st0 = PEER_SESS_ST_EXIT;
+        appctx->st1 = PEER_SESS_SC_ERRPROTO;
+        return -1;
+    }
+    *p = 0;
+
+    /* lookup known peer */
+    for (peer = peers->remote; peer; peer = peer->next) {
+        if (strcmp(peer->id, trash.area) == 0)
+            break;
+    }
+
+    /* if unknown peer */
+    if (!peer) {
+        appctx->st0 = PEER_SESS_ST_EXIT;
+        appctx->st1 = PEER_SESS_SC_ERRPEER;
+        return -1;
+    }
+    *curpeer = peer;
+
+    return 1;
+}
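The init functions below guard against an ancient st->update that would compare as "in the future" after the 32-bit update counter wrapped: when that happens, the last acked value is too old to be trusted and is clamped to the farthest past value relative to localupdate, namely localupdate + 2^31. A tiny self-contained demonstration of that clamp, not part of the patch:

#include <stdint.h>

/* If <update> looks "in the future" relative to <localupdate> under
 * serial arithmetic ((int)(localupdate - update) < 0), clamp it to
 * localupdate + 2^31, the boundary treated as the maximal past value,
 * exactly as init_accepted_peer()/init_connected_peer() do below.
 */
static uint32_t demo_clamp_update(uint32_t localupdate, uint32_t update)
{
    if ((int32_t)(localupdate - update) < 0)
        update = localupdate + 2147483648U;
    return update;
}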
+ */
+static inline void init_accepted_peer(struct peer *peer, struct peers *peers)
+{
+	struct shared_table *st;
+
+	peer->heartbeat = tick_add(now_ms, MS_TO_TICKS(PEER_HEARTBEAT_TIMEOUT));
+	/* Register status code */
+	peer->statuscode = PEER_SESS_SC_SUCCESSCODE;
+	peer->last_hdshk = now_ms;
+
+	/* Awake main task */
+	task_wakeup(peers->sync_task, TASK_WOKEN_MSG);
+
+	/* Init confirm counter */
+	peer->confirm = 0;
+
+	/* Init cursors */
+	for (st = peer->tables; st ; st = st->next) {
+		uint commitid, updateid;
+
+		st->last_get = st->last_acked = 0;
+		HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &st->table->lock);
+		/* if st->update appears to be in the future it means
+		 * that the last acked value is very old and we
+		 * remained disconnected for too long to use this
+		 * acknowledgement as a reset.
+		 * We should update the protocol to be able to
+		 * signal the remote peer that it needs a full resync.
+		 * Here a partial fix consists in setting st->update to
+		 * the farthest past value.
+		 */
+		if ((int)(st->table->localupdate - st->update) < 0)
+			st->update = st->table->localupdate + (2147483648U);
+		st->teaching_origin = st->last_pushed = st->update;
+		st->flags = 0;
+
+		updateid = st->last_pushed;
+		commitid = _HA_ATOMIC_LOAD(&st->table->commitupdate);
+
+		while ((int)(updateid - commitid) > 0) {
+			if (_HA_ATOMIC_CAS(&st->table->commitupdate, &commitid, updateid))
+				break;
+			__ha_cpu_relax();
+		}
+
+		HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &st->table->lock);
+	}
+
+	/* reset teaching and learning flags to 0 */
+	peer->flags &= PEER_TEACH_RESET;
+	peer->flags &= PEER_LEARN_RESET;
+
+	/* if current peer is local */
+	if (peer->local) {
+		/* if the current host needs a resync from local and no process is assigned */
+		if ((peers->flags & PEERS_RESYNC_STATEMASK) == PEERS_RESYNC_FROMLOCAL &&
+		    !(peers->flags & PEERS_F_RESYNC_ASSIGN)) {
+			/* assign local peer for a lesson, consider lesson already requested */
+			peer->flags |= PEER_F_LEARN_ASSIGN;
+			peers->flags |= (PEERS_F_RESYNC_ASSIGN|PEERS_F_RESYNC_PROCESS);
+			peers->flags |= PEERS_F_RESYNC_LOCALASSIGN;
+		}
+
+	}
+	else if ((peers->flags & PEERS_RESYNC_STATEMASK) == PEERS_RESYNC_FROMREMOTE &&
+	         !(peers->flags & PEERS_F_RESYNC_ASSIGN)) {
+		/* assign peer for a lesson */
+		peer->flags |= PEER_F_LEARN_ASSIGN;
+		peers->flags |= PEERS_F_RESYNC_ASSIGN;
+		peers->flags |= PEERS_F_RESYNC_REMOTEASSIGN;
+	}
+}
+
+/*
+ * Init <peer> peer after having connected it at peer protocol level.
+ */
+static inline void init_connected_peer(struct peer *peer, struct peers *peers)
+{
+	struct shared_table *st;
+
+	peer->heartbeat = tick_add(now_ms, MS_TO_TICKS(PEER_HEARTBEAT_TIMEOUT));
+	/* Init cursors */
+	for (st = peer->tables; st ; st = st->next) {
+		uint updateid, commitid;
+
+		st->last_get = st->last_acked = 0;
+		HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &st->table->lock);
+		/* if st->update appears to be in the future it means
+		 * that the last acked value is very old and we
+		 * remained disconnected for too long to use this
+		 * acknowledgement as a reset.
+		 * We should update the protocol to be able to
+		 * signal the remote peer that it needs a full resync.
+		 * Here a partial fix consists in setting st->update to
+		 * the farthest past value. 
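+		 *
+		 * Illustration of this serial-number test (hypothetical values):
+		 * with localupdate = 10 and update = 20, (int)(10 - 20) == -10 < 0,
+		 * so update is rewritten as 10 + 2^31, i.e. the farthest possible
+		 * point in the past for a 32-bit wrapping counter.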
+ */ + if ((int)(st->table->localupdate - st->update) < 0) + st->update = st->table->localupdate + (2147483648U); + st->teaching_origin = st->last_pushed = st->update; + st->flags = 0; + + updateid = st->last_pushed; + commitid = _HA_ATOMIC_LOAD(&st->table->commitupdate); + + while ((int)(updateid - commitid) > 0) { + if (_HA_ATOMIC_CAS(&st->table->commitupdate, &commitid, updateid)) + break; + __ha_cpu_relax(); + } + + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &st->table->lock); + } + + /* Init confirm counter */ + peer->confirm = 0; + + /* reset teaching and learning flags to 0 */ + peer->flags &= PEER_TEACH_RESET; + peer->flags &= PEER_LEARN_RESET; + + /* If current peer is local */ + if (peer->local) { + /* flag to start to teach lesson */ + peer->flags |= PEER_F_TEACH_PROCESS; + } + else if ((peers->flags & PEERS_RESYNC_STATEMASK) == PEERS_RESYNC_FROMREMOTE && + !(peers->flags & PEERS_F_RESYNC_ASSIGN)) { + /* If peer is remote and resync from remote is needed, + and no peer currently assigned */ + + /* assign peer for a lesson */ + peer->flags |= PEER_F_LEARN_ASSIGN; + peers->flags |= PEERS_F_RESYNC_ASSIGN; + peers->flags |= PEERS_F_RESYNC_REMOTEASSIGN; + } +} + +/* + * IO Handler to handle message exchange with a peer + */ +static void peer_io_handler(struct appctx *appctx) +{ + struct stconn *sc = appctx_sc(appctx); + struct stream *s = __sc_strm(sc); + struct peers *curpeers = strm_fe(s)->parent; + struct peer *curpeer = NULL; + int reql = 0; + int repl = 0; + unsigned int maj_ver, min_ver; + int prev_state; + + if (unlikely(se_fl_test(appctx->sedesc, (SE_FL_EOS|SE_FL_ERROR|SE_FL_SHR|SE_FL_SHW)))) { + co_skip(sc_oc(sc), co_data(sc_oc(sc))); + goto out; + } + + /* Check if the input buffer is available. */ + if (sc_ib(sc)->size == 0) { + sc_need_room(sc, 0); + goto out; + } + + while (1) { + prev_state = appctx->st0; +switchstate: + maj_ver = min_ver = (unsigned int)-1; + switch(appctx->st0) { + case PEER_SESS_ST_ACCEPT: + prev_state = appctx->st0; + appctx->svcctx = NULL; + appctx->st0 = PEER_SESS_ST_GETVERSION; + __fallthrough; + case PEER_SESS_ST_GETVERSION: + prev_state = appctx->st0; + reql = peer_getline_version(appctx, &maj_ver, &min_ver); + if (reql <= 0) { + if (!reql) + goto out; + goto switchstate; + } + + appctx->st0 = PEER_SESS_ST_GETHOST; + __fallthrough; + case PEER_SESS_ST_GETHOST: + prev_state = appctx->st0; + reql = peer_getline_host(appctx); + if (reql <= 0) { + if (!reql) + goto out; + goto switchstate; + } + + appctx->st0 = PEER_SESS_ST_GETPEER; + __fallthrough; + case PEER_SESS_ST_GETPEER: { + prev_state = appctx->st0; + reql = peer_getline_last(appctx, &curpeer); + if (reql <= 0) { + if (!reql) + goto out; + goto switchstate; + } + + HA_SPIN_LOCK(PEER_LOCK, &curpeer->lock); + if (curpeer->appctx && curpeer->appctx != appctx) { + if (curpeer->local) { + /* Local connection, reply a retry */ + appctx->st0 = PEER_SESS_ST_EXIT; + appctx->st1 = PEER_SESS_SC_TRYAGAIN; + goto switchstate; + } + + /* we're killing a connection, we must apply a random delay before + * retrying otherwise the other end will do the same and we can loop + * for a while. 
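+				 *
+				 * The delay below is drawn from the [50ms, 2049ms]
+				 * range; since both ends pick independent random
+				 * values, the odds that they keep killing each
+				 * other's connection at the same instant quickly
+				 * become negligible.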
+ */ + curpeer->reconnect = tick_add(now_ms, MS_TO_TICKS(50 + ha_random() % 2000)); + peer_session_forceshutdown(curpeer); + curpeer->heartbeat = TICK_ETERNITY; + curpeer->coll++; + } + if (maj_ver != (unsigned int)-1 && min_ver != (unsigned int)-1) { + if (min_ver == PEER_DWNGRD_MINOR_VER) { + curpeer->flags |= PEER_F_DWNGRD; + } + else { + curpeer->flags &= ~PEER_F_DWNGRD; + } + } + curpeer->appctx = appctx; + curpeer->flags |= PEER_F_ALIVE; + appctx->svcctx = curpeer; + appctx->st0 = PEER_SESS_ST_SENDSUCCESS; + _HA_ATOMIC_INC(&active_peers); + } + __fallthrough; + case PEER_SESS_ST_SENDSUCCESS: { + prev_state = appctx->st0; + if (!curpeer) { + curpeer = appctx->svcctx; + HA_SPIN_LOCK(PEER_LOCK, &curpeer->lock); + if (curpeer->appctx != appctx) { + appctx->st0 = PEER_SESS_ST_END; + goto switchstate; + } + } + + repl = peer_send_status_successmsg(appctx); + if (repl <= 0) { + if (repl == -1) + goto out; + goto switchstate; + } + + init_accepted_peer(curpeer, curpeers); + + /* switch to waiting message state */ + _HA_ATOMIC_INC(&connected_peers); + appctx->st0 = PEER_SESS_ST_WAITMSG; + goto switchstate; + } + case PEER_SESS_ST_CONNECT: { + prev_state = appctx->st0; + if (!curpeer) { + curpeer = appctx->svcctx; + HA_SPIN_LOCK(PEER_LOCK, &curpeer->lock); + if (curpeer->appctx != appctx) { + appctx->st0 = PEER_SESS_ST_END; + goto switchstate; + } + } + + repl = peer_send_hellomsg(appctx, curpeer); + if (repl <= 0) { + if (repl == -1) + goto out; + goto switchstate; + } + + /* switch to the waiting statuscode state */ + appctx->st0 = PEER_SESS_ST_GETSTATUS; + } + __fallthrough; + case PEER_SESS_ST_GETSTATUS: { + prev_state = appctx->st0; + if (!curpeer) { + curpeer = appctx->svcctx; + HA_SPIN_LOCK(PEER_LOCK, &curpeer->lock); + if (curpeer->appctx != appctx) { + appctx->st0 = PEER_SESS_ST_END; + goto switchstate; + } + } + + if (sc_ic(sc)->flags & CF_WROTE_DATA) + curpeer->statuscode = PEER_SESS_SC_CONNECTEDCODE; + + reql = peer_getline(appctx); + if (!reql) + goto out; + + if (reql < 0) + goto switchstate; + + /* Register status code */ + curpeer->statuscode = atoi(trash.area); + curpeer->last_hdshk = now_ms; + + /* Awake main task */ + task_wakeup(curpeers->sync_task, TASK_WOKEN_MSG); + + /* If status code is success */ + if (curpeer->statuscode == PEER_SESS_SC_SUCCESSCODE) { + init_connected_peer(curpeer, curpeers); + } + else { + if (curpeer->statuscode == PEER_SESS_SC_ERRVERSION) + curpeer->flags |= PEER_F_DWNGRD; + /* Status code is not success, abort */ + appctx->st0 = PEER_SESS_ST_END; + goto switchstate; + } + _HA_ATOMIC_INC(&connected_peers); + appctx->st0 = PEER_SESS_ST_WAITMSG; + } + __fallthrough; + case PEER_SESS_ST_WAITMSG: { + uint32_t msg_len = 0; + char *msg_cur = trash.area; + char *msg_end = trash.area; + unsigned char msg_head[7]; // 2 + 5 for varint32 + int totl = 0; + + prev_state = appctx->st0; + if (!curpeer) { + curpeer = appctx->svcctx; + HA_SPIN_LOCK(PEER_LOCK, &curpeer->lock); + if (curpeer->appctx != appctx) { + appctx->st0 = PEER_SESS_ST_END; + goto switchstate; + } + } + + reql = peer_recv_msg(appctx, (char *)msg_head, sizeof msg_head, &msg_len, &totl); + if (reql <= 0) { + if (reql == -1) + goto switchstate; + goto send_msgs; + } + + msg_end += msg_len; + if (!peer_treat_awaited_msg(appctx, curpeer, msg_head, &msg_cur, msg_end, msg_len, totl)) + goto switchstate; + + curpeer->flags |= PEER_F_ALIVE; + + /* skip consumed message */ + co_skip(sc_oc(sc), totl); + /* loop on that state to peek next message */ + goto switchstate; + +send_msgs: + if (curpeer->flags & 
PEER_F_HEARTBEAT) { + curpeer->flags &= ~PEER_F_HEARTBEAT; + repl = peer_send_heartbeatmsg(appctx, curpeer, curpeers); + if (repl <= 0) { + if (repl == -1) + goto out; + goto switchstate; + } + curpeer->tx_hbt++; + } + /* we get here when a peer_recv_msg() returns 0 in reql */ + repl = peer_send_msgs(appctx, curpeer, curpeers); + if (repl <= 0) { + if (repl == -1) + goto out; + goto switchstate; + } + + /* noting more to do */ + goto out; + } + case PEER_SESS_ST_EXIT: + if (prev_state == PEER_SESS_ST_WAITMSG) + _HA_ATOMIC_DEC(&connected_peers); + prev_state = appctx->st0; + if (peer_send_status_errormsg(appctx) == -1) + goto out; + appctx->st0 = PEER_SESS_ST_END; + goto switchstate; + case PEER_SESS_ST_ERRSIZE: { + if (prev_state == PEER_SESS_ST_WAITMSG) + _HA_ATOMIC_DEC(&connected_peers); + prev_state = appctx->st0; + if (peer_send_error_size_limitmsg(appctx) == -1) + goto out; + appctx->st0 = PEER_SESS_ST_END; + goto switchstate; + } + case PEER_SESS_ST_ERRPROTO: { + TRACE_PROTO("protocol error", PEERS_EV_PROTOERR, + NULL, curpeer, &prev_state); + if (curpeer) + curpeer->proto_err++; + if (prev_state == PEER_SESS_ST_WAITMSG) + _HA_ATOMIC_DEC(&connected_peers); + prev_state = appctx->st0; + if (peer_send_error_protomsg(appctx) == -1) { + TRACE_PROTO("could not send error message", PEERS_EV_PROTOERR); + goto out; + } + appctx->st0 = PEER_SESS_ST_END; + prev_state = appctx->st0; + } + __fallthrough; + case PEER_SESS_ST_END: { + if (prev_state == PEER_SESS_ST_WAITMSG) + _HA_ATOMIC_DEC(&connected_peers); + prev_state = appctx->st0; + if (curpeer) { + HA_SPIN_UNLOCK(PEER_LOCK, &curpeer->lock); + curpeer = NULL; + } + se_fl_set(appctx->sedesc, SE_FL_EOS|SE_FL_EOI); + co_skip(sc_oc(sc), co_data(sc_oc(sc))); + goto out; + } + } + } +out: + sc_opposite(sc)->flags |= SC_FL_RCV_ONCE; + + if (curpeer) + HA_SPIN_UNLOCK(PEER_LOCK, &curpeer->lock); + return; +} + +static struct applet peer_applet = { + .obj_type = OBJ_TYPE_APPLET, + .name = "<PEER>", /* used for logging */ + .fct = peer_io_handler, + .init = peer_session_init, + .release = peer_session_release, +}; + + +/* + * Use this function to force a close of a peer session + */ +static void peer_session_forceshutdown(struct peer *peer) +{ + struct appctx *appctx = peer->appctx; + + /* Note that the peer sessions which have just been created + * (->st0 == PEER_SESS_ST_CONNECT) must not + * be shutdown, if not, the TCP session will never be closed + * and stay in CLOSE_WAIT state after having been closed by + * the remote side. 
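+ *
+ * Otherwise the shutdown path below simply detaches the peer from its
+ * applet via __peer_session_deinit(), switches the applet to
+ * PEER_SESS_ST_END and wakes it up so that it terminates by itself.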
+ */ + if (!appctx || appctx->st0 == PEER_SESS_ST_CONNECT) + return; + + if (appctx->applet != &peer_applet) + return; + + __peer_session_deinit(peer); + + appctx->st0 = PEER_SESS_ST_END; + appctx_wakeup(appctx); +} + +/* Pre-configures a peers frontend to accept incoming connections */ +void peers_setup_frontend(struct proxy *fe) +{ + fe->last_change = ns_to_sec(now_ns); + fe->cap = PR_CAP_FE | PR_CAP_BE; + fe->mode = PR_MODE_PEERS; + fe->maxconn = 0; + fe->conn_retries = CONN_RETRIES; + fe->timeout.connect = MS_TO_TICKS(1000); + fe->timeout.client = MS_TO_TICKS(5000); + fe->timeout.server = MS_TO_TICKS(5000); + fe->accept = frontend_accept; + fe->default_target = &peer_applet.obj_type; + fe->options2 |= PR_O2_INDEPSTR | PR_O2_SMARTCON | PR_O2_SMARTACC; +} + +/* + * Create a new peer session in assigned state (connect will start automatically) + */ +static struct appctx *peer_session_create(struct peers *peers, struct peer *peer) +{ + struct appctx *appctx; + unsigned int thr = 0; + int idx; + + peer->new_conn++; + peer->reconnect = tick_add(now_ms, (stopping ? MS_TO_TICKS(PEER_LOCAL_RECONNECT_TIMEOUT) : MS_TO_TICKS(PEER_RECONNECT_TIMEOUT))); + peer->heartbeat = TICK_ETERNITY; + peer->statuscode = PEER_SESS_SC_CONNECTCODE; + peer->last_hdshk = now_ms; + + for (idx = 0; idx < global.nbthread; idx++) + thr = peers->applet_count[idx] < peers->applet_count[thr] ? idx : thr; + appctx = appctx_new_on(&peer_applet, NULL, thr); + if (!appctx) + goto out_close; + appctx->svcctx = (void *)peer; + + appctx->st0 = PEER_SESS_ST_CONNECT; + peer->appctx = appctx; + + HA_ATOMIC_INC(&peers->applet_count[thr]); + appctx_wakeup(appctx); + return appctx; + + out_close: + return NULL; +} + +/* + * Task processing function to manage re-connect, peer session + * tasks wakeup on local update and heartbeat. Let's keep it exported so that it + * resolves in stack traces and "show tasks". + */ +struct task *process_peer_sync(struct task * task, void *context, unsigned int state) +{ + struct peers *peers = context; + struct peer *ps; + struct shared_table *st; + + task->expire = TICK_ETERNITY; + + /* Acquire lock for all peers of the section */ + for (ps = peers->remote; ps; ps = ps->next) + HA_SPIN_LOCK(PEER_LOCK, &ps->lock); + + if (!stopping) { + /* Normal case (not soft stop)*/ + + /* resync timeout set to TICK_ETERNITY means we just start + * a new process and timer was not initialized. + * We must arm this timer to switch to a request to a remote + * node if incoming connection from old local process never + * comes. 
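+		 *
+		 * In practice this timer is (re)armed with PEER_RESYNC_TIMEOUT
+		 * below, and again each time the local resync attempt is
+		 * abandoned, so the section periodically falls back to
+		 * requesting the resync from a remote node.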
+ */ + if (peers->resync_timeout == TICK_ETERNITY) + peers->resync_timeout = tick_add(now_ms, MS_TO_TICKS(PEER_RESYNC_TIMEOUT)); + + if (((peers->flags & PEERS_RESYNC_STATEMASK) == PEERS_RESYNC_FROMLOCAL) && + (!nb_oldpids || tick_is_expired(peers->resync_timeout, now_ms)) && + !(peers->flags & PEERS_F_RESYNC_ASSIGN)) { + /* Resync from local peer needed + no peer was assigned for the lesson + and no old local peer found + or resync timeout expire */ + + /* flag no more resync from local, to try resync from remotes */ + peers->flags |= PEERS_F_RESYNC_LOCAL; + peers->flags |= PEERS_F_RESYNC_LOCALTIMEOUT; + + /* reschedule a resync */ + peers->resync_timeout = tick_add(now_ms, MS_TO_TICKS(PEER_RESYNC_TIMEOUT)); + } + + /* For each session */ + for (ps = peers->remote; ps; ps = ps->next) { + /* For each remote peers */ + if (!ps->local) { + if (!ps->appctx) { + /* no active peer connection */ + if (ps->statuscode == 0 || + ((ps->statuscode == PEER_SESS_SC_CONNECTCODE || + ps->statuscode == PEER_SESS_SC_SUCCESSCODE || + ps->statuscode == PEER_SESS_SC_CONNECTEDCODE) && + tick_is_expired(ps->reconnect, now_ms))) { + /* connection never tried + * or previous peer connection established with success + * or previous peer connection failed while connecting + * and reconnection timer is expired */ + + /* retry a connect */ + ps->appctx = peer_session_create(peers, ps); + } + else if (!tick_is_expired(ps->reconnect, now_ms)) { + /* If previous session failed during connection + * but reconnection timer is not expired */ + + /* reschedule task for reconnect */ + task->expire = tick_first(task->expire, ps->reconnect); + } + /* else do nothing */ + } /* !ps->appctx */ + else if (ps->statuscode == PEER_SESS_SC_SUCCESSCODE) { + /* current peer connection is active and established */ + if (((peers->flags & PEERS_RESYNC_STATEMASK) == PEERS_RESYNC_FROMREMOTE) && + !(peers->flags & PEERS_F_RESYNC_ASSIGN) && + !(ps->flags & PEER_F_LEARN_NOTUP2DATE)) { + /* Resync from a remote is needed + * and no peer was assigned for lesson + * and current peer may be up2date */ + + /* assign peer for the lesson */ + ps->flags |= PEER_F_LEARN_ASSIGN; + peers->flags |= PEERS_F_RESYNC_ASSIGN; + peers->flags |= PEERS_F_RESYNC_REMOTEASSIGN; + + /* wake up peer handler to handle a request of resync */ + appctx_wakeup(ps->appctx); + } + else { + int update_to_push = 0; + + /* Awake session if there is data to push */ + for (st = ps->tables; st ; st = st->next) { + if (st->last_pushed != st->table->localupdate) { + /* wake up the peer handler to push local updates */ + update_to_push = 1; + /* There is no need to send a heartbeat message + * when some updates must be pushed. The remote + * peer will consider <ps> peer as alive when it will + * receive these updates. + */ + ps->flags &= ~PEER_F_HEARTBEAT; + /* Re-schedule another one later. */ + ps->heartbeat = tick_add(now_ms, MS_TO_TICKS(PEER_HEARTBEAT_TIMEOUT)); + /* Refresh reconnect if necessary */ + if (tick_is_expired(ps->reconnect, now_ms)) + ps->reconnect = tick_add(now_ms, MS_TO_TICKS(PEER_RECONNECT_TIMEOUT)); + /* We are going to send updates, let's ensure we will + * come back to send heartbeat messages or to reconnect. + */ + task->expire = tick_first(ps->reconnect, ps->heartbeat); + appctx_wakeup(ps->appctx); + break; + } + } + /* When there are updates to send we do not reconnect + * and do not send heartbeat message either. 
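+						 *
+						 * Said differently, the heartbeat is purely an
+						 * idle-link mechanism: a pushed update already
+						 * proves liveness, so PEER_F_HEARTBEAT is
+						 * cleared and the next wakeup is simply
+						 * tick_first(reconnect, heartbeat).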
+						 */
+					if (!update_to_push) {
+						if (tick_is_expired(ps->reconnect, now_ms)) {
+							if (ps->flags & PEER_F_ALIVE) {
+								/* This peer was alive during a 'reconnect' period.
+								 * Flag it as not alive again for the next period.
+								 */
+								ps->flags &= ~PEER_F_ALIVE;
+								ps->reconnect = tick_add(now_ms, MS_TO_TICKS(PEER_RECONNECT_TIMEOUT));
+							}
+							else {
+								ps->reconnect = tick_add(now_ms, MS_TO_TICKS(50 + ha_random() % 2000));
+								ps->heartbeat = TICK_ETERNITY;
+								peer_session_forceshutdown(ps);
+								ps->no_hbt++;
+							}
+						}
+						else if (tick_is_expired(ps->heartbeat, now_ms)) {
+							ps->heartbeat = tick_add(now_ms, MS_TO_TICKS(PEER_HEARTBEAT_TIMEOUT));
+							ps->flags |= PEER_F_HEARTBEAT;
+							appctx_wakeup(ps->appctx);
+						}
+						task->expire = tick_first(ps->reconnect, ps->heartbeat);
+					}
+				}
+				/* else do nothing */
+			} /* SUCCESSCODE */
+		} /* !ps->peer->local */
+	} /* for */
+
+		/* Resync from remotes expired: consider resync is finished */
+		if (((peers->flags & PEERS_RESYNC_STATEMASK) == PEERS_RESYNC_FROMREMOTE) &&
+		    !(peers->flags & PEERS_F_RESYNC_ASSIGN) &&
+		    tick_is_expired(peers->resync_timeout, now_ms)) {
+			/* Resync from a remote peer was needed,
+			 * no peer was assigned for the lesson
+			 * and the resync timeout expired */
+
+			/* flag no more resync from remote, consider resync is finished */
+			peers->flags |= PEERS_F_RESYNC_REMOTE;
+			peers->flags |= PEERS_F_RESYNC_REMOTETIMEOUT;
+		}
+
+		if ((peers->flags & PEERS_RESYNC_STATEMASK) != PEERS_RESYNC_FINISHED) {
+			/* Resync not finished */
+			/* reschedule the task for the resync timeout if it is not expired,
+			 * so the resync can be ended if needed */
+			if (!tick_is_expired(peers->resync_timeout, now_ms))
+				task->expire = tick_first(task->expire, peers->resync_timeout);
+		}
+	} /* !stopping */
+	else {
+		/* soft stop case */
+		if (state & TASK_WOKEN_SIGNAL) {
+			/* We've just received the signal */
+			if (!(peers->flags & PEERS_F_DONOTSTOP)) {
+				/* add DO NOT STOP flag if not present */
+				_HA_ATOMIC_INC(&jobs);
+				peers->flags |= PEERS_F_DONOTSTOP;
+
+				/* disconnect all connected peers to process a local sync;
+				 * this must be done only the first time we are switching
+				 * into stopping state
+				 */
+				for (ps = peers->remote; ps; ps = ps->next) {
+					/* we're killing a connection, we must apply a random delay before
+					 * retrying otherwise the other end will do the same and we can loop
+					 * for a while. 
+					 */
+					ps->reconnect = tick_add(now_ms, MS_TO_TICKS(50 + ha_random() % 2000));
+					if (ps->appctx) {
+						peer_session_forceshutdown(ps);
+					}
+				}
+
+				/* Set the resync timeout for the local peer and request an immediate reconnect */
+				peers->resync_timeout = tick_add(now_ms, MS_TO_TICKS(PEER_RESYNC_TIMEOUT));
+				peers->local->reconnect = now_ms;
+			}
+		}
+
+		ps = peers->local;
+		if (ps->flags & PEER_F_TEACH_COMPLETE) {
+			if (peers->flags & PEERS_F_DONOTSTOP) {
+				/* resync of the new process was complete, current process can die now */
+				_HA_ATOMIC_DEC(&jobs);
+				peers->flags &= ~PEERS_F_DONOTSTOP;
+				for (st = ps->tables; st ; st = st->next)
+					HA_ATOMIC_DEC(&st->table->refcnt);
+			}
+		}
+		else if (!ps->appctx) {
+			/* Re-arm the resync timeout if necessary */
+			if (!tick_isset(peers->resync_timeout))
+				peers->resync_timeout = tick_add(now_ms, MS_TO_TICKS(PEER_RESYNC_TIMEOUT));
+
+			/* If there's no active peer connection */
+			if ((peers->flags & PEERS_RESYNC_STATEMASK) == PEERS_RESYNC_FINISHED &&
+			    !tick_is_expired(peers->resync_timeout, now_ms) &&
+			    (ps->statuscode == 0 ||
+			     ps->statuscode == PEER_SESS_SC_SUCCESSCODE ||
+			     ps->statuscode == PEER_SESS_SC_CONNECTEDCODE ||
+			     ps->statuscode == PEER_SESS_SC_TRYAGAIN)) {
+				/* The resync is finished for the local peer and
+				 * the resync timeout is not expired, and
+				 * either the connection was never tried,
+				 * or the previous peer connection was successfully established,
+				 * or the previous tcp connect succeeded but the init state is incomplete,
+				 * or during the previous connect the peer replied with a try-again statuscode */
+
+				if (!tick_is_expired(ps->reconnect, now_ms)) {
+					/* reconnection timer is not expired, reschedule the task for reconnect */
+					task->expire = tick_first(task->expire, ps->reconnect);
+				}
+				else {
+					/* connect to the local peer if we must push a local sync */
+					if (peers->flags & PEERS_F_DONOTSTOP) {
+						peer_session_create(peers, ps);
+					}
+				}
+			}
+			else {
+				/* Other error cases */
+				if (peers->flags & PEERS_F_DONOTSTOP) {
+					/* unable to resync the new process, current process can die now */
+					_HA_ATOMIC_DEC(&jobs);
+					peers->flags &= ~PEERS_F_DONOTSTOP;
+					for (st = ps->tables; st ; st = st->next)
+						HA_ATOMIC_DEC(&st->table->refcnt);
+				}
+			}
+		}
+		else if (ps->statuscode == PEER_SESS_SC_SUCCESSCODE) {
+			/* Reset the resync timeout during a resync */
+			peers->resync_timeout = TICK_ETERNITY;
+
+			/* current peer connection is active and established;
+			 * wake up all peer handlers to push remaining local updates */
+			for (st = ps->tables; st ; st = st->next) {
+				if (st->last_pushed != st->table->localupdate) {
+					appctx_wakeup(ps->appctx);
+					break;
+				}
+			}
+		}
+	} /* stopping */
+
+	/* Release the lock for all peers of the section */
+	for (ps = peers->remote; ps; ps = ps->next)
+		HA_SPIN_UNLOCK(PEER_LOCK, &ps->lock);
+
+	/* Wakeup for re-connect */
+	return task;
+}
+
+
+/*
+ * Returns 0 in case of error.
+ */
+int peers_init_sync(struct peers *peers)
+{
+	struct peer *curpeer;
+
+	for (curpeer = peers->remote; curpeer; curpeer = curpeer->next) {
+		peers->peers_fe->maxconn += 3;
+	}
+
+	peers->sync_task = task_new_anywhere();
+	if (!peers->sync_task)
+		return 0;
+
+	memset(peers->applet_count, 0, sizeof(peers->applet_count));
+	peers->sync_task->process = process_peer_sync;
+	peers->sync_task->context = (void *)peers;
+	peers->sighandler = signal_register_task(0, peers->sync_task, 0);
+	task_wakeup(peers->sync_task, TASK_WOKEN_INIT);
+	return 1;
+}
+
+/*
+ * Allocate a cache of dictionary entries used upon transmission. 
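+ *
+ * Note: <max_entries> is expected to be a power of two since the TX cache
+ * recycles its slots in LRU order with a simple mask, i.e.
+ *
+ *   lru_key = (lru_key + 1) & (max_entries - 1);
+ *
+ * which only wraps correctly for power-of-two sizes (see dcache_tx_insert()
+ * below).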
+ */ +static struct dcache_tx *new_dcache_tx(size_t max_entries) +{ + struct dcache_tx *d; + struct ebpt_node *entries; + + d = malloc(sizeof *d); + entries = calloc(max_entries, sizeof *entries); + if (!d || !entries) + goto err; + + d->lru_key = 0; + d->prev_lookup = NULL; + d->cached_entries = EB_ROOT_UNIQUE; + d->entries = entries; + + return d; + + err: + free(d); + free(entries); + return NULL; +} + +/* + * Allocate a cache of dictionary entries with <name> as name and <max_entries> + * as maximum of entries. + * Return the dictionary cache if succeeded, NULL if not. + * Must be deallocated calling free_dcache(). + */ +static struct dcache *new_dcache(size_t max_entries) +{ + struct dcache_tx *dc_tx; + struct dcache *dc; + struct dcache_rx *dc_rx; + + dc = calloc(1, sizeof *dc); + dc_tx = new_dcache_tx(max_entries); + dc_rx = calloc(max_entries, sizeof *dc_rx); + if (!dc || !dc_tx || !dc_rx) + goto err; + + dc->tx = dc_tx; + dc->rx = dc_rx; + dc->max_entries = max_entries; + + return dc; + + err: + free(dc); + free(dc_tx); + free(dc_rx); + return NULL; +} + +/* + * Look for the dictionary entry with the value of <i> in <d> cache of dictionary + * entries used upon transmission. + * Return the entry if found, NULL if not. + */ +static struct ebpt_node *dcache_tx_lookup_value(struct dcache_tx *d, + struct dcache_tx_entry *i) +{ + return ebpt_lookup(&d->cached_entries, i->entry.key); +} + +/* + * Flush <dc> cache. + * Always succeeds. + */ +static inline void flush_dcache(struct peer *peer) +{ + int i; + struct dcache *dc = peer->dcache; + + for (i = 0; i < dc->max_entries; i++) { + ebpt_delete(&dc->tx->entries[i]); + dc->tx->entries[i].key = NULL; + dict_entry_unref(&server_key_dict, dc->rx[i].de); + dc->rx[i].de = NULL; + } + dc->tx->prev_lookup = NULL; + dc->tx->lru_key = 0; + + memset(dc->rx, 0, dc->max_entries * sizeof *dc->rx); +} + +/* + * Insert a dictionary entry in <dc> cache part used upon transmission (->tx) + * with information provided by <i> dictionary cache entry (especially the value + * to be inserted if not already). Return <i> if already present in the cache + * or something different of <i> if not. + */ +static struct ebpt_node *dcache_tx_insert(struct dcache *dc, struct dcache_tx_entry *i) +{ + struct dcache_tx *dc_tx; + struct ebpt_node *o; + + dc_tx = dc->tx; + + if (dc_tx->prev_lookup && dc_tx->prev_lookup->key == i->entry.key) { + o = dc_tx->prev_lookup; + } else { + o = dcache_tx_lookup_value(dc_tx, i); + if (o) { + /* Save it */ + dc_tx->prev_lookup = o; + } + } + + if (o) { + /* Copy the ID. */ + i->id = o - dc->tx->entries; + return &i->entry; + } + + /* The new entry to put in cache */ + dc_tx->prev_lookup = o = &dc_tx->entries[dc_tx->lru_key]; + + ebpt_delete(o); + o->key = i->entry.key; + ebpt_insert(&dc_tx->cached_entries, o); + i->id = dc_tx->lru_key; + + /* Update the index for the next entry to put in cache */ + dc_tx->lru_key = (dc_tx->lru_key + 1) & (dc->max_entries - 1); + + return o; +} + +/* + * Allocate a dictionary cache for each peer of <peers> section. + * Return 1 if succeeded, 0 if not. + */ +int peers_alloc_dcache(struct peers *peers) +{ + struct peer *p; + + for (p = peers->remote; p; p = p->next) { + p->dcache = new_dcache(PEER_STKT_CACHE_MAX_ENTRIES); + if (!p->dcache) + return 0; + } + + return 1; +} + +/* + * Function used to register a table for sync on a group of peers + * Returns 0 in case of success. 
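+ *
+ * Each registered table is prepended to every peer's list and given an
+ * incremental local_id starting at 1, a small integer which subsequent
+ * protocol messages can use to designate the table instead of its name.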
+ */ +int peers_register_table(struct peers *peers, struct stktable *table) +{ + struct shared_table *st; + struct peer * curpeer; + int id = 0; + int retval = 0; + + for (curpeer = peers->remote; curpeer; curpeer = curpeer->next) { + st = calloc(1,sizeof(*st)); + if (!st) { + retval = 1; + break; + } + st->table = table; + st->next = curpeer->tables; + if (curpeer->tables) + id = curpeer->tables->local_id; + st->local_id = id + 1; + + /* If peer is local we inc table + * refcnt to protect against flush + * until this process pushed all + * table content to the new one + */ + if (curpeer->local) + HA_ATOMIC_INC(&st->table->refcnt); + curpeer->tables = st; + } + + table->sync_task = peers->sync_task; + + return retval; +} + +/* context used by a "show peers" command */ +struct show_peers_ctx { + void *target; /* if non-null, dump only this section and stop */ + struct peers *peers; /* "peers" section being currently dumped. */ + struct peer *peer; /* "peer" being currently dumped. */ + int flags; /* non-zero if "dict" dump requested */ + enum { + STATE_HEAD = 0, /* dump the section's header */ + STATE_PEER, /* dump the whole peer */ + STATE_DONE, /* finished */ + } state; /* parser's state */ +}; + +/* + * Parse the "show peers" command arguments. + * Returns 0 if succeeded, 1 if not with the ->msg of the appctx set as + * error message. + */ +static int cli_parse_show_peers(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_peers_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + if (strcmp(args[2], "dict") == 0) { + /* show the dictionaries (large dump) */ + ctx->flags |= PEERS_SHOW_F_DICT; + args++; + } else if (strcmp(args[2], "-") == 0) + args++; // allows to show a section called "dict" + + if (*args[2]) { + struct peers *p; + + for (p = cfg_peers; p; p = p->next) { + if (strcmp(p->id, args[2]) == 0) { + ctx->target = p; + break; + } + } + + if (!p) + return cli_err(appctx, "No such peers\n"); + } + + /* where to start from */ + ctx->peers = ctx->target ? ctx->target : cfg_peers; + return 0; +} + +/* + * This function dumps the peer state information of <peers> "peers" section. + * Returns 0 if the output buffer is full and needs to be called again, non-zero if not. + * Dedicated to be called by cli_io_handler_show_peers() cli I/O handler. + */ +static int peers_dump_head(struct buffer *msg, struct appctx *appctx, struct peers *peers) +{ + struct tm tm; + + get_localtime(peers->last_change, &tm); + chunk_appendf(msg, "%p: [%02d/%s/%04d:%02d:%02d:%02d] id=%s disabled=%d flags=0x%x resync_timeout=%s task_calls=%u\n", + peers, + tm.tm_mday, monthname[tm.tm_mon], tm.tm_year+1900, + tm.tm_hour, tm.tm_min, tm.tm_sec, + peers->id, peers->disabled, peers->flags, + peers->resync_timeout ? + tick_is_expired(peers->resync_timeout, now_ms) ? "<PAST>" : + human_time(TICKS_TO_MS(peers->resync_timeout - now_ms), + TICKS_TO_MS(1000)) : "<NEVER>", + peers->sync_task ? peers->sync_task->calls : 0); + + if (applet_putchk(appctx, msg) == -1) + return 0; + + return 1; +} + +/* + * This function dumps <peer> state information. + * Returns 0 if the output buffer is full and needs to be called again, non-zero + * if not. Dedicated to be called by cli_io_handler_show_peers() cli I/O handler. 
+ */ +static int peers_dump_peer(struct buffer *msg, struct appctx *appctx, struct peer *peer, int flags) +{ + struct connection *conn; + char pn[INET6_ADDRSTRLEN]; + struct stconn *peer_cs; + struct stream *peer_s; + struct shared_table *st; + + addr_to_str(&peer->addr, pn, sizeof pn); + chunk_appendf(msg, " %p: id=%s(%s,%s) addr=%s:%d last_status=%s", + peer, peer->id, + peer->local ? "local" : "remote", + peer->appctx ? "active" : "inactive", + pn, get_host_port(&peer->addr), + statuscode_str(peer->statuscode)); + + chunk_appendf(msg, " last_hdshk=%s\n", + peer->last_hdshk ? human_time(TICKS_TO_MS(now_ms - peer->last_hdshk), + TICKS_TO_MS(1000)) : "<NEVER>"); + + chunk_appendf(msg, " reconnect=%s", + peer->reconnect ? + tick_is_expired(peer->reconnect, now_ms) ? "<PAST>" : + human_time(TICKS_TO_MS(peer->reconnect - now_ms), + TICKS_TO_MS(1000)) : "<NEVER>"); + + chunk_appendf(msg, " heartbeat=%s", + peer->heartbeat ? + tick_is_expired(peer->heartbeat, now_ms) ? "<PAST>" : + human_time(TICKS_TO_MS(peer->heartbeat - now_ms), + TICKS_TO_MS(1000)) : "<NEVER>"); + + chunk_appendf(msg, " confirm=%u tx_hbt=%u rx_hbt=%u no_hbt=%u new_conn=%u proto_err=%u coll=%u\n", + peer->confirm, peer->tx_hbt, peer->rx_hbt, + peer->no_hbt, peer->new_conn, peer->proto_err, peer->coll); + + chunk_appendf(&trash, " flags=0x%x", peer->flags); + + if (!peer->appctx) + goto table_info; + + chunk_appendf(&trash, " appctx:%p st0=%d st1=%d task_calls=%u", + peer->appctx, peer->appctx->st0, peer->appctx->st1, + peer->appctx->t ? peer->appctx->t->calls : 0); + + peer_cs = appctx_sc(peer->appctx); + if (!peer_cs) { + /* the appctx might exist but not yet be initialized due to + * deferred initialization used to balance applets across + * threads. + */ + goto table_info; + } + + peer_s = __sc_strm(peer_cs); + + chunk_appendf(&trash, " state=%s", sc_state_str(sc_opposite(peer_cs)->state)); + + conn = objt_conn(strm_orig(peer_s)); + if (conn) + chunk_appendf(&trash, "\n xprt=%s", conn_get_xprt_name(conn)); + + switch (conn && conn_get_src(conn) ? addr_to_str(conn->src, pn, sizeof(pn)) : AF_UNSPEC) { + case AF_INET: + case AF_INET6: + chunk_appendf(&trash, " src=%s:%d", pn, get_host_port(conn->src)); + break; + case AF_UNIX: + chunk_appendf(&trash, " src=unix:%d", strm_li(peer_s)->luid); + break; + } + + switch (conn && conn_get_dst(conn) ? 
addr_to_str(conn->dst, pn, sizeof(pn)) : AF_UNSPEC) { + case AF_INET: + case AF_INET6: + chunk_appendf(&trash, " addr=%s:%d", pn, get_host_port(conn->dst)); + break; + case AF_UNIX: + chunk_appendf(&trash, " addr=unix:%d", strm_li(peer_s)->luid); + break; + } + + table_info: + if (peer->remote_table) + chunk_appendf(&trash, "\n remote_table:%p id=%s local_id=%d remote_id=%d", + peer->remote_table, + peer->remote_table->table->id, + peer->remote_table->local_id, + peer->remote_table->remote_id); + + if (peer->last_local_table) + chunk_appendf(&trash, "\n last_local_table:%p id=%s local_id=%d remote_id=%d", + peer->last_local_table, + peer->last_local_table->table->id, + peer->last_local_table->local_id, + peer->last_local_table->remote_id); + + if (peer->tables) { + chunk_appendf(&trash, "\n shared tables:"); + for (st = peer->tables; st; st = st->next) { + int i, count; + struct stktable *t; + struct dcache *dcache; + + t = st->table; + dcache = peer->dcache; + + chunk_appendf(&trash, "\n %p local_id=%d remote_id=%d " + "flags=0x%x remote_data=0x%llx", + st, st->local_id, st->remote_id, + st->flags, (unsigned long long)st->remote_data); + chunk_appendf(&trash, "\n last_acked=%u last_pushed=%u last_get=%u" + " teaching_origin=%u update=%u", + st->last_acked, st->last_pushed, st->last_get, + st->teaching_origin, st->update); + chunk_appendf(&trash, "\n table:%p id=%s update=%u localupdate=%u" + " commitupdate=%u refcnt=%u", + t, t->id, t->update, t->localupdate, _HA_ATOMIC_LOAD(&t->commitupdate), t->refcnt); + if (flags & PEERS_SHOW_F_DICT) { + chunk_appendf(&trash, "\n TX dictionary cache:"); + count = 0; + for (i = 0; i < dcache->max_entries; i++) { + struct ebpt_node *node; + struct dict_entry *de; + + node = &dcache->tx->entries[i]; + if (!node->key) + break; + + if (!count++) + chunk_appendf(&trash, "\n "); + de = node->key; + chunk_appendf(&trash, " %3u -> %s", i, (char *)de->value.key); + count &= 0x3; + } + chunk_appendf(&trash, "\n RX dictionary cache:"); + count = 0; + for (i = 0; i < dcache->max_entries; i++) { + if (!count++) + chunk_appendf(&trash, "\n "); + chunk_appendf(&trash, " %3u -> %s", i, + dcache->rx[i].de ? + (char *)dcache->rx[i].de->value.key : "-"); + count &= 0x3; + } + } else { + chunk_appendf(&trash, "\n Dictionary cache not dumped (use \"show peers dict\")"); + } + } + } + + end: + chunk_appendf(&trash, "\n"); + if (applet_putchk(appctx, msg) == -1) + return 0; + + return 1; +} + +/* + * This function dumps all the peers of "peers" section. + * Returns 0 if the output buffer is full and needs to be called + * again, non-zero if not. It proceeds in an isolated thread, so + * there is no thread safety issue here. + */ +static int cli_io_handler_show_peers(struct appctx *appctx) +{ + struct show_peers_ctx *ctx = appctx->svcctx; + int ret = 0, first_peers = 1; + + thread_isolate(); + + chunk_reset(&trash); + + while (ctx->state != STATE_DONE) { + switch (ctx->state) { + case STATE_HEAD: + if (!ctx->peers) { + /* No more peers list. 
*/ + ctx->state = STATE_DONE; + } + else { + if (!first_peers) + chunk_appendf(&trash, "\n"); + else + first_peers = 0; + if (!peers_dump_head(&trash, appctx, ctx->peers)) + goto out; + + ctx->peer = ctx->peers->remote; + ctx->peers = ctx->peers->next; + ctx->state = STATE_PEER; + } + break; + + case STATE_PEER: + if (!ctx->peer) { + /* End of peer list */ + if (!ctx->target) + ctx->state = STATE_HEAD; // next one + else + ctx->state = STATE_DONE; + } + else { + if (!peers_dump_peer(&trash, appctx, ctx->peer, ctx->flags)) + goto out; + + ctx->peer = ctx->peer->next; + } + break; + + default: + break; + } + } + ret = 1; + out: + thread_release(); + return ret; +} + + +struct peers_kw_list peers_keywords = { + .list = LIST_HEAD_INIT(peers_keywords.list) +}; + +void peers_register_keywords(struct peers_kw_list *pkwl) +{ + LIST_APPEND(&peers_keywords.list, &pkwl->list); +} + +/* config parser for global "tune.peers.max-updates-at-once" */ +static int cfg_parse_max_updt_at_once(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + int arg = -1; + + if (too_many_args(1, args, err, NULL)) + return -1; + + if (*(args[1]) != 0) + arg = atoi(args[1]); + + if (arg < 1) { + memprintf(err, "'%s' expects an integer argument greater than 0.", args[0]); + return -1; + } + + peers_max_updates_at_once = arg; + return 0; +} + +/* config keyword parsers */ +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_GLOBAL, "tune.peers.max-updates-at-once", cfg_parse_max_updt_at_once }, + { 0, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); + +/* + * CLI keywords. + */ +static struct cli_kw_list cli_kws = {{ }, { + { { "show", "peers", NULL }, "show peers [dict|-] [section] : dump some information about all the peers or this peers section", cli_parse_show_peers, cli_io_handler_show_peers, }, + {}, +}}; + +/* Register cli keywords */ +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); diff --git a/src/pipe.c b/src/pipe.c new file mode 100644 index 0000000..5599fe0 --- /dev/null +++ b/src/pipe.c @@ -0,0 +1,136 @@ +/* + * Pipe management + * + * Copyright 2000-2009 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <unistd.h> +#include <fcntl.h> + +#include <haproxy/api.h> +#include <haproxy/global.h> +#include <haproxy/pipe-t.h> +#include <haproxy/pool.h> +#include <haproxy/thread.h> + + +DECLARE_STATIC_POOL(pool_head_pipe, "pipe", sizeof(struct pipe)); + +struct pipe *pipes_live = NULL; /* pipes which are still ready to use */ + +__decl_spinlock(pipes_lock); /* lock used to protect pipes list */ + +static THREAD_LOCAL int local_pipes_free = 0; /* #cache objects */ +static THREAD_LOCAL struct pipe *local_pipes = NULL; + +int pipes_used = 0; /* # of pipes in use (2 fds each) */ +int pipes_free = 0; /* # of pipes unused */ + +/* return a pre-allocated empty pipe. Try to allocate one if there isn't any + * left. NULL is returned if a pipe could not be allocated. 
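+ *
+ * Lookup order: the lock-free thread-local list first, then the shared
+ * <pipes_live> list under its spinlock, and only then a fresh pipe() call,
+ * optionally resized through F_SETPIPE_SZ when global.tune.pipesize is set.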
+ */ +struct pipe *get_pipe() +{ + struct pipe *ret = NULL; + int pipefd[2]; + + ret = local_pipes; + if (likely(ret)) { + local_pipes = ret->next; + local_pipes_free--; + HA_ATOMIC_DEC(&pipes_free); + HA_ATOMIC_INC(&pipes_used); + goto out; + } + + if (likely(pipes_live)) { + HA_SPIN_LOCK(PIPES_LOCK, &pipes_lock); + ret = pipes_live; + if (likely(ret)) + pipes_live = ret->next; + HA_SPIN_UNLOCK(PIPES_LOCK, &pipes_lock); + if (ret) { + HA_ATOMIC_DEC(&pipes_free); + HA_ATOMIC_INC(&pipes_used); + goto out; + } + } + + HA_ATOMIC_INC(&pipes_used); + if (pipes_used + pipes_free >= global.maxpipes) + goto fail; + + ret = pool_alloc(pool_head_pipe); + if (!ret) + goto fail; + + if (pipe(pipefd) < 0) + goto fail; + +#ifdef F_SETPIPE_SZ + if (global.tune.pipesize) + fcntl(pipefd[0], F_SETPIPE_SZ, global.tune.pipesize); +#endif + ret->data = 0; + ret->prod = pipefd[1]; + ret->cons = pipefd[0]; + ret->next = NULL; + out: + return ret; + fail: + pool_free(pool_head_pipe, ret); + HA_ATOMIC_DEC(&pipes_used); + return NULL; + +} + +/* destroy a pipe, possibly because an error was encountered on it. Its FDs + * will be closed and it will not be reinjected into the live pool. + */ +void kill_pipe(struct pipe *p) +{ + close(p->prod); + close(p->cons); + pool_free(pool_head_pipe, p); + HA_ATOMIC_DEC(&pipes_used); +} + +/* put back a unused pipe into the live pool. If it still has data in it, it is + * closed and not reinjected into the live pool. The caller is not allowed to + * use it once released. + */ +void put_pipe(struct pipe *p) +{ + if (unlikely(p->data)) { + kill_pipe(p); + return; + } + + if (likely(local_pipes_free * global.nbthread < global.maxpipes - pipes_used)) { + p->next = local_pipes; + local_pipes = p; + local_pipes_free++; + goto out; + } + + HA_SPIN_LOCK(PIPES_LOCK, &pipes_lock); + p->next = pipes_live; + pipes_live = p; + HA_SPIN_UNLOCK(PIPES_LOCK, &pipes_lock); + out: + HA_ATOMIC_INC(&pipes_free); + HA_ATOMIC_DEC(&pipes_used); +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/pool.c b/src/pool.c new file mode 100644 index 0000000..376b311 --- /dev/null +++ b/src/pool.c @@ -0,0 +1,1539 @@ +/* + * Memory management functions. + * + * Copyright 2000-2007 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <errno.h> + +#include <import/plock.h> + +#include <haproxy/activity.h> +#include <haproxy/api.h> +#include <haproxy/applet-t.h> +#include <haproxy/cfgparse.h> +#include <haproxy/channel.h> +#include <haproxy/cli.h> +#include <haproxy/errors.h> +#include <haproxy/global.h> +#include <haproxy/list.h> +#include <haproxy/pool.h> +#include <haproxy/pool-os.h> +#include <haproxy/sc_strm.h> +#include <haproxy/stats-t.h> +#include <haproxy/stconn.h> +#include <haproxy/thread.h> +#include <haproxy/tools.h> + + +/* These ones are initialized per-thread on startup by init_pools() */ +THREAD_LOCAL size_t pool_cache_bytes = 0; /* total cache size */ +THREAD_LOCAL size_t pool_cache_count = 0; /* #cache objects */ + +static struct list pools __read_mostly = LIST_HEAD_INIT(pools); +int mem_poison_byte __read_mostly = 'P'; +int pool_trim_in_progress = 0; +uint pool_debugging __read_mostly = /* set of POOL_DBG_* flags */ +#ifdef DEBUG_FAIL_ALLOC + POOL_DBG_FAIL_ALLOC | +#endif +#ifdef DEBUG_DONT_SHARE_POOLS + POOL_DBG_DONT_MERGE | +#endif +#ifdef DEBUG_POOL_INTEGRITY + POOL_DBG_COLD_FIRST | +#endif +#ifdef DEBUG_POOL_INTEGRITY + POOL_DBG_INTEGRITY | +#endif +#ifdef CONFIG_HAP_NO_GLOBAL_POOLS + POOL_DBG_NO_GLOBAL | +#endif +#if defined(DEBUG_NO_POOLS) || defined(DEBUG_UAF) + POOL_DBG_NO_CACHE | +#endif +#if defined(DEBUG_POOL_TRACING) + POOL_DBG_CALLER | +#endif +#if defined(DEBUG_MEMORY_POOLS) + POOL_DBG_TAG | +#endif +#if defined(DEBUG_UAF) + POOL_DBG_UAF | +#endif + 0; + +static const struct { + uint flg; + const char *set; + const char *clr; + const char *hlp; +} dbg_options[] = { + /* flg, set, clr, hlp */ + { POOL_DBG_FAIL_ALLOC, "fail", "no-fail", "randomly fail allocations" }, + { POOL_DBG_DONT_MERGE, "no-merge", "merge", "disable merging of similar pools" }, + { POOL_DBG_COLD_FIRST, "cold-first", "hot-first", "pick cold objects first" }, + { POOL_DBG_INTEGRITY, "integrity", "no-integrity", "enable cache integrity checks" }, + { POOL_DBG_NO_GLOBAL, "no-global", "global", "disable global shared cache" }, + { POOL_DBG_NO_CACHE, "no-cache", "cache", "disable thread-local cache" }, + { POOL_DBG_CALLER, "caller", "no-caller", "save caller information in cache" }, + { POOL_DBG_TAG, "tag", "no-tag", "add tag at end of allocated objects" }, + { POOL_DBG_POISON, "poison", "no-poison", "poison newly allocated objects" }, + { POOL_DBG_UAF, "uaf", "no-uaf", "enable use-after-free checks (slow)" }, + { 0 /* end */ } +}; + +/* describes a snapshot of a pool line about to be dumped by "show pools" */ +struct pool_dump_info { + const struct pool_head *entry; + ulong alloc_items; + ulong alloc_bytes; + ulong used_items; + ulong cached_items; + ulong need_avg; + ulong failed_items; +}; + +/* context used by "show pools" */ +struct show_pools_ctx { + char *prefix; /* if non-null, match this prefix name for the pool */ + int by_what; /* 0=no sort, 1=by name, 2=by item size, 3=by total alloc */ + int maxcnt; /* 0=no limit, other=max number of output entries */ +}; + +static int mem_fail_rate __read_mostly = 0; +static int using_default_allocator __read_mostly = 1; // linked-in allocator or LD_PRELOADed one ? +static int disable_trim __read_mostly = 0; +static int(*my_mallctl)(const char *, void *, size_t *, void *, size_t) = NULL; +static int(*_malloc_trim)(size_t) = NULL; + +/* returns the pool hash bucket an object should use based on its pointer. + * Objects will needed consistent bucket assignment so that they may be + * allocated on one thread and released on another one. 
Thus only the + * pointer is usable. + */ +static forceinline unsigned int pool_pbucket(const void *ptr) +{ + return ptr_hash(ptr, CONFIG_HAP_POOL_BUCKETS_BITS); +} + +/* returns the pool hash bucket to use for the current thread. This should only + * be used when no pointer is available (e.g. count alloc failures). + */ +static forceinline unsigned int pool_tbucket(void) +{ + return tid % CONFIG_HAP_POOL_BUCKETS; +} + +/* ask the allocator to trim memory pools. + * This must run under thread isolation so that competing threads trying to + * allocate or release memory do not prevent the allocator from completing + * its job. We just have to be careful as callers might already be isolated + * themselves. + */ +void trim_all_pools(void) +{ + int isolated = thread_isolated(); + + if (!isolated) + thread_isolate(); + + malloc_trim(0); + + if (!isolated) + thread_release(); +} + +/* check if we're using the same allocator as the one that provides + * malloc_trim() and mallinfo(). The principle is that on glibc, both + * malloc_trim() and mallinfo() are provided, and using mallinfo() we + * can check if malloc() is performed through glibc or any other one + * the executable was linked against (e.g. jemalloc). Prior to this we + * have to check whether we're running on jemalloc by verifying if the + * mallctl() function is provided. Its pointer will be used later. + */ +static void detect_allocator(void) +{ +#if defined(__ELF__) + extern int mallctl(const char *, void *, size_t *, void *, size_t) __attribute__((weak)); + + my_mallctl = mallctl; +#endif + if (!my_mallctl) { + /* trick: we won't enter here if mallctl() is known at link + * time. This allows to detect if the symbol was changed since + * the program was linked, indicating it's not running on the + * expected allocator (due to an LD_PRELOAD) and that we must + * be extra cautious and avoid some optimizations that are + * known to break such as malloc_trim(). + */ + my_mallctl = get_sym_curr_addr("mallctl"); + using_default_allocator = (my_mallctl == NULL); + } + + if (!my_mallctl) { +#if defined(HA_HAVE_MALLOC_TRIM) +#ifdef HA_HAVE_MALLINFO2 + struct mallinfo2 mi1, mi2; +#else + struct mallinfo mi1, mi2; +#endif + void *ptr; + +#ifdef HA_HAVE_MALLINFO2 + mi1 = mallinfo2(); +#else + mi1 = mallinfo(); +#endif + ptr = DISGUISE(malloc(1)); +#ifdef HA_HAVE_MALLINFO2 + mi2 = mallinfo2(); +#else + mi2 = mallinfo(); +#endif + free(DISGUISE(ptr)); + + using_default_allocator = !!memcmp(&mi1, &mi2, sizeof(mi1)); +#elif defined(HA_HAVE_MALLOC_ZONE) + using_default_allocator = (malloc_default_zone() != NULL); +#endif + } + + /* detect presence of malloc_trim() */ + _malloc_trim = get_sym_next_addr("malloc_trim"); +} + +/* replace the libc's malloc_trim() so that we can also intercept the calls + * from child libraries when the allocator is not the default one. + */ +int malloc_trim(size_t pad) +{ + int ret = 0; + + if (disable_trim) + return ret; + + HA_ATOMIC_INC(&pool_trim_in_progress); + + if (my_mallctl) { + /* here we're on jemalloc and malloc_trim() is called either + * by haproxy or another dependency (the worst case that + * normally crashes). Instead of just failing, we can actually + * emulate it so let's do it now. 
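+		 *
+		 * The emulation below asks jemalloc to purge each arena in
+		 * turn through its mallctl() namespace: "arenas.narenas"
+		 * first to learn how many arenas exist, then "arena.<i>.purge"
+		 * for each of them.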
+ */ + unsigned int i, narenas = 0; + size_t len = sizeof(narenas); + + if (my_mallctl("arenas.narenas", &narenas, &len, NULL, 0) == 0) { + for (i = 0; i < narenas; i ++) { + char mib[32] = {0}; + snprintf(mib, sizeof(mib), "arena.%u.purge", i); + (void)my_mallctl(mib, NULL, NULL, NULL, 0); + ret = 1; // success + } + } + } + else if (!using_default_allocator) { + /* special allocators that can be LD_PRELOADed end here */ + ret = 0; // did nothing + } + else if (_malloc_trim) { + /* we're typically on glibc and not overridden */ + ret = _malloc_trim(pad); + } +#if defined(HA_HAVE_MALLOC_ZONE) + else { + /* we're on MacOS, there's an equivalent mechanism */ + vm_address_t *zones; + unsigned int i, nzones; + + if (malloc_get_all_zones(0, NULL, &zones, &nzones) == KERN_SUCCESS) { + for (i = 0; i < nzones; i ++) { + malloc_zone_t *zone = (malloc_zone_t *)zones[i]; + + /* we cannot purge anonymous zones */ + if (zone->zone_name) { + malloc_zone_pressure_relief(zone, 0); + ret = 1; // success + } + } + } + } +#endif + HA_ATOMIC_DEC(&pool_trim_in_progress); + + /* here we have ret=0 if nothing was release, or 1 if some were */ + return ret; +} + +static int mem_should_fail(const struct pool_head *pool) +{ + int ret = 0; + + if (mem_fail_rate > 0 && !(global.mode & MODE_STARTING)) { + if (mem_fail_rate > statistical_prng_range(100)) + ret = 1; + else + ret = 0; + } + return ret; +} + +/* Try to find an existing shared pool with the same characteristics and + * returns it, otherwise creates this one. NULL is returned if no memory + * is available for a new creation. Two flags are supported : + * - MEM_F_SHARED to indicate that the pool may be shared with other users + * - MEM_F_EXACT to indicate that the size must not be rounded up + */ +struct pool_head *create_pool(char *name, unsigned int size, unsigned int flags) +{ + unsigned int extra_mark, extra_caller, extra; + struct pool_head *pool; + struct pool_head *entry; + struct list *start; + unsigned int align; + int thr __maybe_unused; + + extra_mark = (pool_debugging & POOL_DBG_TAG) ? POOL_EXTRA_MARK : 0; + extra_caller = (pool_debugging & POOL_DBG_CALLER) ? POOL_EXTRA_CALLER : 0; + extra = extra_mark + extra_caller; + + if (!(pool_debugging & POOL_DBG_NO_CACHE)) { + /* we'll store two lists there, we need the room for this. Let's + * make sure it's always OK even when including the extra word + * that is stored after the pci struct. + */ + if (size + extra - extra_caller < sizeof(struct pool_cache_item)) + size = sizeof(struct pool_cache_item) + extra_caller - extra; + } + + /* Now we know our size is set to the strict minimum possible. It may + * be OK for elements allocated with an exact size (e.g. buffers), but + * we're going to round the size up 16 bytes to merge almost identical + * pools together. We only round up however when we add the debugging + * tag since it's used to detect overflows. Otherwise we only round up + * to the size of a word to preserve alignment. + */ + if (!(flags & MEM_F_EXACT)) { + align = (pool_debugging & POOL_DBG_TAG) ? sizeof(void *) : 16; + size = ((size + align - 1) & -align); + } + + /* TODO: thread: we do not lock pool list for now because all pools are + * created during HAProxy startup (so before threads creation) */ + start = &pools; + pool = NULL; + + list_for_each_entry(entry, &pools, list) { + if (entry->size == size) { + /* either we can share this place and we take it, or + * we look for a shareable one or for the next position + * before which we will insert a new one. 
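+			 *
+			 * Note: the <pools> list is kept sorted by object size, which
+			 * is what makes this early-stop lookup valid. Sizes were
+			 * rounded up just above with ((size + align - 1) & -align);
+			 * e.g. with align = 16 a 45-byte object becomes 48 bytes, so
+			 * that nearly identical pools can be merged.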
+ */ + if ((flags & entry->flags & MEM_F_SHARED) && + (!(pool_debugging & POOL_DBG_DONT_MERGE) || + strcmp(name, entry->name) == 0)) { + /* we can share this one */ + pool = entry; + DPRINTF(stderr, "Sharing %s with %s\n", name, pool->name); + break; + } + } + else if (entry->size > size) { + /* insert before this one */ + start = &entry->list; + break; + } + } + + if (!pool) { + void *pool_addr; + + pool_addr = calloc(1, sizeof(*pool) + __alignof__(*pool)); + if (!pool_addr) + return NULL; + + /* always provide an aligned pool */ + pool = (struct pool_head*)((((size_t)pool_addr) + __alignof__(*pool)) & -(size_t)__alignof__(*pool)); + pool->base_addr = pool_addr; // keep it, it's the address to free later + + if (name) + strlcpy2(pool->name, name, sizeof(pool->name)); + pool->alloc_sz = size + extra; + pool->size = size; + pool->flags = flags; + LIST_APPEND(start, &pool->list); + + if (!(pool_debugging & POOL_DBG_NO_CACHE)) { + /* update per-thread pool cache if necessary */ + for (thr = 0; thr < MAX_THREADS; thr++) { + LIST_INIT(&pool->cache[thr].list); + pool->cache[thr].tid = thr; + pool->cache[thr].pool = pool; + } + } + } + pool->users++; + return pool; +} + +/* Tries to allocate an object for the pool <pool> using the system's allocator + * and directly returns it. The pool's allocated counter is checked but NOT + * updated, this is left to the caller, and but no other checks are performed. + */ +void *pool_get_from_os_noinc(struct pool_head *pool) +{ + if (!pool->limit || pool_allocated(pool) < pool->limit) { + void *ptr; + + if (pool_debugging & POOL_DBG_UAF) + ptr = pool_alloc_area_uaf(pool->alloc_sz); + else + ptr = pool_alloc_area(pool->alloc_sz); + if (ptr) + return ptr; + _HA_ATOMIC_INC(&pool->buckets[pool_tbucket()].failed); + } + activity[tid].pool_fail++; + return NULL; + +} + +/* Releases a pool item back to the operating system but DOES NOT update + * the allocation counter, it's left to the caller to do it. It may be + * done before or after, it doesn't matter, the function does not use it. + */ +void pool_put_to_os_nodec(struct pool_head *pool, void *ptr) +{ + if (pool_debugging & POOL_DBG_UAF) + pool_free_area_uaf(ptr, pool->alloc_sz); + else + pool_free_area(ptr, pool->alloc_sz); +} + +/* Tries to allocate an object for the pool <pool> using the system's allocator + * and directly returns it. The pool's counters are updated but the object is + * never cached, so this is usable with and without local or shared caches. + */ +void *pool_alloc_nocache(struct pool_head *pool, const void *caller) +{ + void *ptr = NULL; + uint bucket; + + ptr = pool_get_from_os_noinc(pool); + if (!ptr) + return NULL; + + bucket = pool_pbucket(ptr); + swrate_add_scaled_opportunistic(&pool->buckets[bucket].needed_avg, POOL_AVG_SAMPLES, pool->buckets[bucket].used, POOL_AVG_SAMPLES/4); + _HA_ATOMIC_INC(&pool->buckets[bucket].allocated); + _HA_ATOMIC_INC(&pool->buckets[bucket].used); + + /* keep track of where the element was allocated from */ + POOL_DEBUG_SET_MARK(pool, ptr); + POOL_DEBUG_TRACE_CALLER(pool, (struct pool_cache_item *)ptr, caller); + return ptr; +} + +/* Release a pool item back to the OS and keeps the pool's counters up to date. + * This is always defined even when pools are not enabled (their usage stats + * are maintained). 
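+ *
+ * The per-pool counters are sharded into buckets indexed by a hash of the
+ * object's pointer (pool_pbucket()), so an object allocated on one thread
+ * and freed on another consistently updates the same bucket, while
+ * unrelated operations rarely contend on the same atomic counters.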
+ */
+void pool_free_nocache(struct pool_head *pool, void *ptr)
+{
+	uint bucket = pool_pbucket(ptr);
+
+	_HA_ATOMIC_DEC(&pool->buckets[bucket].used);
+	_HA_ATOMIC_DEC(&pool->buckets[bucket].allocated);
+	swrate_add_opportunistic(&pool->buckets[bucket].needed_avg, POOL_AVG_SAMPLES, pool->buckets[bucket].used);
+
+	pool_put_to_os_nodec(pool, ptr);
+}
+
+
+/* Updates <pch>'s fill_pattern and fills the free area after <item> with it,
+ * up to <size> bytes. The item part is left untouched.
+ */
+void pool_fill_pattern(struct pool_cache_head *pch, struct pool_cache_item *item, uint size)
+{
+	ulong *ptr = (ulong *)item;
+	uint ofs;
+	ulong u;
+
+	if (size <= sizeof(*item))
+		return;
+
+	/* Upgrade the fill_pattern to change about half of the bits
+	 * (to be sure to catch static flag corruption), and apply it.
+	 */
+	u = pch->fill_pattern += ~0UL / 3; // 0x55...55
+	ofs = sizeof(*item) / sizeof(*ptr);
+	while (ofs < size / sizeof(*ptr))
+		ptr[ofs++] = u;
+}
+
+/* Checks a pool_cache_item's integrity after extracting it from the cache. It
+ * must have been previously initialized using pool_fill_pattern(). If any
+ * corruption is detected, the function provokes an immediate crash.
+ */
+void pool_check_pattern(struct pool_cache_head *pch, struct pool_head *pool, struct pool_cache_item *item, const void *caller)
+{
+	const ulong *ptr = (const ulong *)item;
+	uint size = pool->size;
+	uint ofs;
+	ulong u;
+
+	if (size <= sizeof(*item))
+		return;
+
+	/* let's check that all words past *item are equal */
+	ofs = sizeof(*item) / sizeof(*ptr);
+	u = ptr[ofs++];
+	while (ofs < size / sizeof(*ptr)) {
+		if (unlikely(ptr[ofs] != u)) {
+			pool_inspect_item("cache corruption detected", pool, item, caller);
+			ABORT_NOW();
+		}
+		ofs++;
+	}
+}
+
+/* Removes up to <count> items from the end of the local pool cache <ph> for
+ * pool <pool>. The shared pool is refilled with these objects within the limit
+ * of the number of acceptable objects, and the rest will be released to the
+ * OS. It is not a problem if <count> is larger than the number of objects in
+ * the local cache. The counters are automatically updated. Must not be used
+ * with pools disabled. 
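+ *
+ * Evicted objects are batched into clusters of up to
+ * CONFIG_HAP_POOL_CLUSTER_SIZE items chained through their embedded
+ * pool_item links and handed to the shared cache in a single operation;
+ * anything beyond pool_releasable() is returned to the OS instead.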
+ */
+static void pool_evict_last_items(struct pool_head *pool, struct pool_cache_head *ph, uint count)
+{
+	struct pool_cache_item *item;
+	struct pool_item *pi, *head = NULL;
+	void *caller = __builtin_return_address(0);
+	uint released = 0;
+	uint cluster = 0;
+	uint to_free_max;
+	uint bucket;
+
+	BUG_ON(pool_debugging & POOL_DBG_NO_CACHE);
+
+	/* Note: this will be zero when global pools are disabled */
+	to_free_max = pool_releasable(pool);
+
+	while (released < count && !LIST_ISEMPTY(&ph->list)) {
+		item = LIST_PREV(&ph->list, typeof(item), by_pool);
+		BUG_ON(&item->by_pool == &ph->list);
+		if (unlikely(pool_debugging & POOL_DBG_INTEGRITY))
+			pool_check_pattern(ph, pool, item, caller);
+		LIST_DELETE(&item->by_pool);
+		LIST_DELETE(&item->by_lru);
+
+		bucket = pool_pbucket(item);
+		_HA_ATOMIC_DEC(&pool->buckets[bucket].used);
+		swrate_add_opportunistic(&pool->buckets[bucket].needed_avg, POOL_AVG_SAMPLES, pool->buckets[bucket].used);
+
+		if (to_free_max > released || cluster) {
+			/* will never match when global pools are disabled */
+			pi = (struct pool_item *)item;
+			pi->next = NULL;
+			pi->down = head;
+			head = pi;
+			cluster++;
+			if (cluster >= CONFIG_HAP_POOL_CLUSTER_SIZE) {
+				/* enough to make a cluster */
+				pool_put_to_shared_cache(pool, head);
+				cluster = 0;
+				head = NULL;
+			}
+		} else {
+			/* does pool_free_nocache() with a known bucket */
+			_HA_ATOMIC_DEC(&pool->buckets[bucket].allocated);
+			pool_put_to_os_nodec(pool, item);
+		}
+
+		released++;
+	}
+
+	/* incomplete cluster left */
+	if (cluster)
+		pool_put_to_shared_cache(pool, head);
+
+	ph->count -= released;
+	pool_cache_count -= released;
+	pool_cache_bytes -= released * pool->size;
+}
+
+/* Evicts some of the oldest objects from one local cache, until its number of
+ * objects is no more than 16 plus 1/8 of the total number of locally cached
+ * objects, or until the total size of the local cache is no more than 75% of
+ * its maximum (i.e. we don't want a single cache to use all the cache for
+ * itself). For this, the list is scanned in reverse. If <full> is non-zero,
+ * all objects are evicted. Must not be used when pools are disabled.
+ */
+void pool_evict_from_local_cache(struct pool_head *pool, int full)
+{
+	struct pool_cache_head *ph = &pool->cache[tid];
+
+	BUG_ON(pool_debugging & POOL_DBG_NO_CACHE);
+
+	while ((ph->count && full) ||
+	       (ph->count >= CONFIG_HAP_POOL_CLUSTER_SIZE &&
+	        ph->count >= 16 + pool_cache_count / 8 &&
+	        pool_cache_bytes > global.tune.pool_cache_size * 3 / 4)) {
+		pool_evict_last_items(pool, ph, CONFIG_HAP_POOL_CLUSTER_SIZE);
+	}
+}
+
+/* Evicts some of the oldest objects from the local cache, pushing them to the
+ * global pool. Must not be used when pools are disabled.
+ */
+void pool_evict_from_local_caches()
+{
+	struct pool_cache_item *item;
+	struct pool_cache_head *ph;
+	struct pool_head *pool;
+
+	BUG_ON(pool_debugging & POOL_DBG_NO_CACHE);
+
+	do {
+		item = LIST_PREV(&th_ctx->pool_lru_head, struct pool_cache_item *, by_lru);
+		BUG_ON(&item->by_lru == &th_ctx->pool_lru_head);
+		/* note: by definition we remove oldest objects so they also are the
+		 * oldest in their own pools, thus their next is the pool's head.
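+		 *
+		 * Editor's note (explanation added): since ph points to
+		 * &pool->cache[tid], (ph - tid) equals &pool->cache[0], which
+		 * is why the container_of() call below recovers the owning
+		 * pool from the cache head alone.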
+		 */
+		ph = LIST_NEXT(&item->by_pool, struct pool_cache_head *, list);
+		BUG_ON(ph->tid != tid);
+
+		pool = container_of(ph - tid, struct pool_head, cache);
+		BUG_ON(pool != ph->pool);
+
+		pool_evict_last_items(pool, ph, CONFIG_HAP_POOL_CLUSTER_SIZE);
+	} while (pool_cache_bytes > global.tune.pool_cache_size * 7 / 8);
+}
+
+/* Frees an object to the local cache, possibly pushing oldest objects to the
+ * shared cache, which itself may decide to release some of them to the OS.
+ * While it is unspecified what the object becomes past this point, it is
+ * guaranteed to be released from the users' perspective. A caller address may
+ * be passed and stored into the area when DEBUG_POOL_TRACING is set. Must not
+ * be used with pools disabled.
+ */
+void pool_put_to_cache(struct pool_head *pool, void *ptr, const void *caller)
+{
+	struct pool_cache_item *item = (struct pool_cache_item *)ptr;
+	struct pool_cache_head *ph = &pool->cache[tid];
+
+	BUG_ON(pool_debugging & POOL_DBG_NO_CACHE);
+
+	LIST_INSERT(&ph->list, &item->by_pool);
+	LIST_INSERT(&th_ctx->pool_lru_head, &item->by_lru);
+	POOL_DEBUG_TRACE_CALLER(pool, item, caller);
+	ph->count++;
+	if (unlikely(pool_debugging & POOL_DBG_INTEGRITY))
+		pool_fill_pattern(ph, item, pool->size);
+	pool_cache_count++;
+	pool_cache_bytes += pool->size;
+
+	if (unlikely(pool_cache_bytes > global.tune.pool_cache_size * 3 / 4)) {
+		if (ph->count >= 16 + pool_cache_count / 8 + CONFIG_HAP_POOL_CLUSTER_SIZE)
+			pool_evict_from_local_cache(pool, 0);
+		if (pool_cache_bytes > global.tune.pool_cache_size)
+			pool_evict_from_local_caches();
+	}
+}
+
+/* Tries to refill the local cache <pch> from the shared one for pool <pool>.
+ * This is only used when pools are in use and shared pools are enabled. No
+ * malloc() is attempted, and poisoning is never performed. The purpose is to
+ * get the fastest possible refilling so that the caller can easily check if
+ * the cache has enough objects for its use. Must not be used when pools are
+ * disabled.
+ */
+void pool_refill_local_from_shared(struct pool_head *pool, struct pool_cache_head *pch)
+{
+	struct pool_cache_item *item;
+	struct pool_item *ret, *down;
+	uint bucket;
+	uint count;
+
+	BUG_ON(pool_debugging & POOL_DBG_NO_CACHE);
+
+	/* we'll need to reference the first element to figure out the next
+	 * one. We must temporarily lock it so that nobody allocates then
+	 * releases it, or the dereference could fail. In order to limit the
+	 * locking, threads start from a bucket that depends on their ID.
+	 */
+
+	bucket = pool_tbucket();
+	ret = _HA_ATOMIC_LOAD(&pool->buckets[bucket].free_list);
+	count = 0;
+	do {
+		/* look for an apparently non-busy entry. If we hit a busy pool
+		 * we retry with another random bucket. And if we encounter a
+		 * NULL, we retry once with another random bucket. This is in
+		 * order to prevent object accumulation in other buckets.
+		 */
+		while (unlikely(ret == POOL_BUSY || (ret == NULL && count++ < 1))) {
+			bucket = statistical_prng() % CONFIG_HAP_POOL_BUCKETS;
+			ret = _HA_ATOMIC_LOAD(&pool->buckets[bucket].free_list);
+		}
+		if (ret == NULL)
+			return;
+	} while (unlikely((ret = _HA_ATOMIC_XCHG(&pool->buckets[bucket].free_list, POOL_BUSY)) == POOL_BUSY));
+
+	if (unlikely(ret == NULL)) {
+		HA_ATOMIC_STORE(&pool->buckets[bucket].free_list, NULL);
+		return;
+	}
+
+	/* this releases the lock */
+	HA_ATOMIC_STORE(&pool->buckets[bucket].free_list, ret->next);
+
+	/* now store the retrieved object(s) into the local cache. Note that
+	 * they don't all have the same hash and that it doesn't necessarily
+	 * match the one from the pool.
+	 */
+	count = 0;
+	for (; ret; ret = down) {
+		down = ret->down;
+		item = (struct pool_cache_item *)ret;
+		POOL_DEBUG_TRACE_CALLER(pool, item, NULL);
+		LIST_INSERT(&pch->list, &item->by_pool);
+		LIST_INSERT(&th_ctx->pool_lru_head, &item->by_lru);
+		_HA_ATOMIC_INC(&pool->buckets[pool_pbucket(item)].used);
+		count++;
+		if (unlikely(pool_debugging & POOL_DBG_INTEGRITY))
+			pool_fill_pattern(pch, item, pool->size);
+
+	}
+	pch->count += count;
+	pool_cache_count += count;
+	pool_cache_bytes += count * pool->size;
+}
+
+/* Adds the pool item cluster <item> to the shared cache. The caller is
+ * advised to first check using pool_releasable() if it's wise to add this
+ * series of objects there. Both the pool and the item's head must be valid.
+ */
+void pool_put_to_shared_cache(struct pool_head *pool, struct pool_item *item)
+{
+	struct pool_item *free_list;
+	uint bucket = pool_pbucket(item);
+
+	/* we prefer to put the item into the entry that corresponds to its own
+	 * hash so that on return it remains in the right place, but that's not
+	 * mandatory.
+	 */
+	free_list = _HA_ATOMIC_LOAD(&pool->buckets[bucket].free_list);
+	do {
+		/* look for an apparently non-busy entry */
+		while (unlikely(free_list == POOL_BUSY)) {
+			bucket = (bucket + 1) % CONFIG_HAP_POOL_BUCKETS;
+			free_list = _HA_ATOMIC_LOAD(&pool->buckets[bucket].free_list);
+		}
+		_HA_ATOMIC_STORE(&item->next, free_list);
+		__ha_barrier_atomic_store();
+	} while (!_HA_ATOMIC_CAS(&pool->buckets[bucket].free_list, &free_list, item));
+	__ha_barrier_atomic_store();
+}
+
+/*
+ * This function frees whatever can be freed in pool <pool>.
+ */
+void pool_flush(struct pool_head *pool)
+{
+	struct pool_item *next, *temp, *down;
+	uint bucket;
+
+	if (!pool || (pool_debugging & (POOL_DBG_NO_CACHE|POOL_DBG_NO_GLOBAL)))
+		return;
+
+	/* The loop below atomically detaches the head of the free list and
+	 * replaces it with a NULL. Then the list can be released.
+	 */
+	for (bucket = 0; bucket < CONFIG_HAP_POOL_BUCKETS; bucket++) {
+		next = pool->buckets[bucket].free_list;
+		while (1) {
+			while (unlikely(next == POOL_BUSY))
+				next = (void*)pl_wait_new_long((ulong*)&pool->buckets[bucket].free_list, (ulong)next);
+
+			if (next == NULL)
+				break;
+
+			next = _HA_ATOMIC_XCHG(&pool->buckets[bucket].free_list, POOL_BUSY);
+			if (next != POOL_BUSY) {
+				HA_ATOMIC_STORE(&pool->buckets[bucket].free_list, NULL);
+				break;
+			}
+		}
+
+		while (next) {
+			temp = next;
+			next = temp->next;
+			for (; temp; temp = down) {
+				down = temp->down;
+				_HA_ATOMIC_DEC(&pool->buckets[pool_pbucket(temp)].allocated);
+				pool_put_to_os_nodec(pool, temp);
+			}
+		}
+	}
+	/* here, we should have pool->allocated == pool->used */
+}
+
+/*
+ * This function frees whatever can be freed in all pools, but respecting
+ * the minimum thresholds imposed by owners. It makes sure to be alone to
+ * run by using thread_isolate(). <pool_ctx> is unused.
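+ *
+ * Editor's note (summary added for clarity): unlike pool_flush() above, which
+ * relies on the lock-free POOL_BUSY exchange protocol on each bucket's
+ * free_list, this function runs under thread_isolate() and can therefore walk
+ * the free lists directly. Since <pool_ctx> is unused, it is typically
+ * invoked as pool_gc(NULL) (an assumption consistent with the remark above).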
+ */
+void pool_gc(struct pool_head *pool_ctx)
+{
+	struct pool_head *entry;
+	int isolated = thread_isolated();
+
+	if (!isolated)
+		thread_isolate();
+
+	list_for_each_entry(entry, &pools, list) {
+		struct pool_item *temp, *down;
+		uint allocated = pool_allocated(entry);
+		uint used = pool_used(entry);
+		int bucket = 0;
+
+		while ((int)(allocated - used) > (int)entry->minavail) {
+			/* ok let's find next entry to evict */
+			while (!entry->buckets[bucket].free_list && bucket < CONFIG_HAP_POOL_BUCKETS)
+				bucket++;
+
+			if (bucket >= CONFIG_HAP_POOL_BUCKETS)
+				break;
+
+			temp = entry->buckets[bucket].free_list;
+			entry->buckets[bucket].free_list = temp->next;
+			for (; temp; temp = down) {
+				down = temp->down;
+				allocated--;
+				_HA_ATOMIC_DEC(&entry->buckets[pool_pbucket(temp)].allocated);
+				pool_put_to_os_nodec(entry, temp);
+			}
+		}
+	}
+
+	trim_all_pools();
+
+	if (!isolated)
+		thread_release();
+}
+
+/*
+ * Returns a pointer to an object taken from pool <pool>, or dynamically
+ * allocated when the pool has none available. <flags> is a binary-OR of
+ * POOL_F_* flags. Prefer using pool_alloc() which does the right thing
+ * without flags.
+ */
+void *__pool_alloc(struct pool_head *pool, unsigned int flags)
+{
+	void *p = NULL;
+	void *caller = __builtin_return_address(0);
+
+	if (unlikely(pool_debugging & POOL_DBG_FAIL_ALLOC))
+		if (!(flags & POOL_F_NO_FAIL) && mem_should_fail(pool))
+			return NULL;
+
+	if (likely(!(pool_debugging & POOL_DBG_NO_CACHE)) && !p)
+		p = pool_get_from_cache(pool, caller);
+
+	if (unlikely(!p))
+		p = pool_alloc_nocache(pool, caller);
+
+	if (likely(p)) {
+#ifdef USE_MEMORY_PROFILING
+		if (unlikely(profiling & HA_PROF_MEMORY)) {
+			extern struct memprof_stats memprof_stats[MEMPROF_HASH_BUCKETS + 1];
+			struct memprof_stats *bin;
+
+			bin = memprof_get_bin(__builtin_return_address(0), MEMPROF_METH_P_ALLOC);
+			_HA_ATOMIC_ADD(&bin->alloc_calls, 1);
+			_HA_ATOMIC_ADD(&bin->alloc_tot, pool->size);
+			_HA_ATOMIC_STORE(&bin->info, pool);
+			/* replace the caller with the allocated bin: this way
+			 * the pool_free() call will be able to update our
+			 * entry. We only do it for non-colliding entries though,
+			 * since these ones store the true caller location.
+			 */
+			if (bin >= &memprof_stats[0] && bin < &memprof_stats[MEMPROF_HASH_BUCKETS])
+				POOL_DEBUG_TRACE_CALLER(pool, (struct pool_cache_item *)p, bin);
+		}
+#endif
+		if (unlikely(flags & POOL_F_MUST_ZERO))
+			memset(p, 0, pool->size);
+		else if (unlikely(!(flags & POOL_F_NO_POISON) && (pool_debugging & POOL_DBG_POISON)))
+			memset(p, mem_poison_byte, pool->size);
+	}
+	return p;
+}
+
+/*
+ * Puts a memory area back to the corresponding pool. <ptr> must be valid.
+ * Using pool_free() is preferred.
+ */
+void __pool_free(struct pool_head *pool, void *ptr)
+{
+	const void *caller = __builtin_return_address(0);
+
+	/* we'll get late corruption if we refill to the wrong pool or double-free */
+	POOL_DEBUG_CHECK_MARK(pool, ptr, caller);
+	POOL_DEBUG_RESET_MARK(pool, ptr);
+
+#ifdef USE_MEMORY_PROFILING
+	if (unlikely(profiling & HA_PROF_MEMORY) && ptr) {
+		extern struct memprof_stats memprof_stats[MEMPROF_HASH_BUCKETS + 1];
+		struct memprof_stats *bin;
+
+		bin = memprof_get_bin(__builtin_return_address(0), MEMPROF_METH_P_FREE);
+		_HA_ATOMIC_ADD(&bin->free_calls, 1);
+		_HA_ATOMIC_ADD(&bin->free_tot, pool->size);
+		_HA_ATOMIC_STORE(&bin->info, pool);
+
+		/* check if the caller is an allocator, and if so, let's update
+		 * its free() count.
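+		 *
+		 * Editor's note (layout illustration, not in the original):
+		 * with caller tracing, the last word of the allocated area
+		 * holds this recorded pointer:
+		 *
+		 *	[ object (pool->size bytes) | ... | tag (one void*) ]
+		 *	^ptr              ^ptr + pool->alloc_sz - sizeof(void*)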
+		 */
+		bin = *(struct memprof_stats**)(((char *)ptr) + pool->alloc_sz - sizeof(void*));
+		if (bin >= &memprof_stats[0] && bin < &memprof_stats[MEMPROF_HASH_BUCKETS]) {
+			_HA_ATOMIC_ADD(&bin->free_calls, 1);
+			_HA_ATOMIC_ADD(&bin->free_tot, pool->size);
+		}
+	}
+#endif
+
+	if (unlikely((pool_debugging & POOL_DBG_NO_CACHE) ||
+		     global.tune.pool_cache_size < pool->size)) {
+		pool_free_nocache(pool, ptr);
+		return;
+	}
+
+	pool_put_to_cache(pool, ptr, caller);
+}
+
+/*
+ * This function destroys a pool by freeing it completely, unless it's still
+ * in use. This should be called only under extreme circumstances. It always
+ * returns NULL if the resulting pool is empty, easing the clearing of the old
+ * pointer, otherwise it returns the pool.
+ */
+void *pool_destroy(struct pool_head *pool)
+{
+	if (pool) {
+		if (!(pool_debugging & POOL_DBG_NO_CACHE))
+			pool_evict_from_local_cache(pool, 1);
+
+		pool_flush(pool);
+		if (pool_used(pool))
+			return pool;
+		pool->users--;
+		if (!pool->users) {
+			LIST_DELETE(&pool->list);
+			/* note that if used == 0, the cache is empty */
+			free(pool->base_addr);
+		}
+	}
+	return NULL;
+}
+
+/* This destroys all pools on exit. It is *not* thread safe. */
+void pool_destroy_all()
+{
+	struct pool_head *entry, *back;
+
+	list_for_each_entry_safe(entry, back, &pools, list) {
+		/* there's only one occurrence of each pool in the list, and
+		 * since we're exiting instead of looping on the whole list
+		 * just to decrement users, force it to 1 here.
+		 */
+		entry->users = 1;
+		pool_destroy(entry);
+	}
+}
+
+/* Carefully inspects an item upon fatal error and emits diagnostics */
+void pool_inspect_item(const char *msg, struct pool_head *pool, const void *item, const void *caller)
+{
+	const struct pool_head *the_pool = NULL;
+
+	chunk_printf(&trash,
+	             "FATAL: pool inconsistency detected in thread %d: %s.\n"
+	             " caller: %p (",
+	             tid + 1, msg, caller);
+
+	resolve_sym_name(&trash, NULL, caller);
+
+	chunk_appendf(&trash,
+	              ")\n"
+	              " item: %p\n"
+	              " pool: %p ('%s', size %u, real %u, users %u)\n",
+	              item, pool, pool->name, pool->size, pool->alloc_sz, pool->users);
+
+	if (pool_debugging & POOL_DBG_TAG) {
+		const void **pool_mark;
+		struct pool_head *ph;
+		const void *tag;
+
+		pool_mark = (const void **)(((char *)item) + pool->size);
+		tag = may_access(pool_mark) ? *pool_mark : NULL;
+		if (tag == pool) {
+			chunk_appendf(&trash, " tag: @%p = %p (%s)\n", pool_mark, tag, pool->name);
+			the_pool = pool;
+		}
+		else {
+			if (!may_access(pool_mark))
+				chunk_appendf(&trash, "Tag not accessible. ");
+			else
+				chunk_appendf(&trash, "Tag does not match (%p). 
", tag); + + list_for_each_entry(ph, &pools, list) { + pool_mark = (const void **)(((char *)item) + ph->size); + if (!may_access(pool_mark)) + continue; + tag = *pool_mark; + + if (tag == ph) { + if (!the_pool) + chunk_appendf(&trash, "Possible origin pool(s):\n"); + + chunk_appendf(&trash, " tag: @%p = %p (%s, size %u, real %u, users %u)\n", + pool_mark, tag, ph->name, ph->size, ph->alloc_sz, ph->users); + if (!the_pool || the_pool->size < ph->size) + the_pool = ph; + } + } + + if (!the_pool) { + const char *start, *end, *p; + + pool_mark = (const void **)(((char *)item) + pool->size); + chunk_appendf(&trash, + "Tag does not match any other pool.\n" + "Contents around address %p+%lu=%p:\n", + item, (ulong)((const void*)pool_mark - (const void*)item), + pool_mark); + + /* dump in word-sized blocks */ + start = (const void *)(((uintptr_t)pool_mark - 32) & -sizeof(void*)); + end = (const void *)(((uintptr_t)pool_mark + 32 + sizeof(void*) - 1) & -sizeof(void*)); + + while (start < end) { + dump_addr_and_bytes(&trash, " ", start, sizeof(void*)); + chunk_strcat(&trash, " ["); + for (p = start; p < start + sizeof(void*); p++) { + if (!may_access(p)) + chunk_strcat(&trash, "*"); + else if (isprint((unsigned char)*p)) + chunk_appendf(&trash, "%c", *p); + else + chunk_strcat(&trash, "."); + } + + if (may_access(start)) + tag = *(const void **)start; + else + tag = NULL; + + if (tag == pool) { + /* the pool can often be there so let's detect it */ + chunk_appendf(&trash, "] [pool:%s", pool->name); + } + else if (tag) { + /* print pointers that resolve to a symbol */ + size_t back_data = trash.data; + chunk_strcat(&trash, "] ["); + if (!resolve_sym_name(&trash, NULL, tag)) + trash.data = back_data; + } + + chunk_strcat(&trash, "]\n"); + start = p; + } + } + } + } + + if (pool_debugging & POOL_DBG_CALLER) { + struct buffer *trash2 = get_trash_chunk(); + const struct pool_head *ph; + const void **pool_mark; + const void *tag, *rec_tag; + + ph = the_pool ? the_pool : pool; + pool_mark = (const void **)(((char *)item) + ph->alloc_sz - sizeof(void*)); + rec_tag = may_access(pool_mark) ? 
*pool_mark : NULL;
+
+		if (rec_tag && resolve_sym_name(trash2, NULL, rec_tag))
+			chunk_appendf(&trash,
+			              "Recorded caller of pool '%s':\n  @%p (+%04u) = %p (%s)\n",
+			              ph->name, pool_mark, (uint)(ph->alloc_sz - sizeof(void*)),
+			              rec_tag, trash2->area);
+
+		if (!the_pool) {
+			/* the pool couldn't be formally verified */
+			chunk_appendf(&trash, "Other possible callers:\n");
+			list_for_each_entry(ph, &pools, list) {
+				if (ph == pool)
+					continue;
+				pool_mark = (const void **)(((char *)item) + ph->alloc_sz - sizeof(void*));
+				if (!may_access(pool_mark))
+					continue;
+				tag = *pool_mark;
+				if (tag == rec_tag)
+					continue;
+
+				/* see if we can resolve something */
+				chunk_printf(trash2, "@%p (+%04u) = %p (", pool_mark, (uint)(ph->alloc_sz - sizeof(void*)), tag);
+				if (resolve_sym_name(trash2, NULL, tag)) {
+					chunk_appendf(trash2, ")");
+					chunk_appendf(&trash,
+					              " %s [as pool %s, size %u, real %u, users %u]\n",
+					              trash2->area, ph->name, ph->size, ph->alloc_sz, ph->users);
+				}
+			}
+		}
+	}
+
+	chunk_appendf(&trash, "\n");
+	DISGUISE(write(2, trash.area, trash.data));
+}
+
+/* used by qsort in "show pools" to sort by name */
+static int cmp_dump_pools_name(const void *a, const void *b)
+{
+	const struct pool_dump_info *l = (const struct pool_dump_info *)a;
+	const struct pool_dump_info *r = (const struct pool_dump_info *)b;
+
+	return strcmp(l->entry->name, r->entry->name);
+}
+
+/* used by qsort in "show pools" to sort by item size */
+static int cmp_dump_pools_size(const void *a, const void *b)
+{
+	const struct pool_dump_info *l = (const struct pool_dump_info *)a;
+	const struct pool_dump_info *r = (const struct pool_dump_info *)b;
+
+	if (l->entry->size > r->entry->size)
+		return -1;
+	else if (l->entry->size < r->entry->size)
+		return 1;
+	else
+		return 0;
+}
+
+/* used by qsort in "show pools" to sort by usage */
+static int cmp_dump_pools_usage(const void *a, const void *b)
+{
+	const struct pool_dump_info *l = (const struct pool_dump_info *)a;
+	const struct pool_dump_info *r = (const struct pool_dump_info *)b;
+
+	if (l->alloc_bytes > r->alloc_bytes)
+		return -1;
+	else if (l->alloc_bytes < r->alloc_bytes)
+		return 1;
+	else
+		return 0;
+}
+
+/* will not dump more than this number of entries. Anything beyond this will
+ * likely not fit into a regular output buffer anyway.
+ */
+#define POOLS_MAX_DUMPED_ENTRIES 1024
+
+/* This function dumps memory usage information into the trash buffer.
+ * It may sort by a criterion if <by_what> is non-zero, and limit the
+ * number of output lines if <max> is non-zero. It may limit only to
+ * pools whose names start with <pfx> if <pfx> is non-null.
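+ *
+ * Editor's note (cross-reference added): <by_what> matches the CLI keywords
+ * parsed by cli_parse_show_pools() below: 1 = "byname", 2 = "bysize",
+ * 3 = "byusage". An illustrative CLI invocation combining all three options:
+ *
+ *	show pools byusage 10 match quic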
+ */ +void dump_pools_to_trash(int by_what, int max, const char *pfx) +{ + struct pool_dump_info pool_info[POOLS_MAX_DUMPED_ENTRIES]; + struct pool_head *entry; + unsigned long long allocated, used; + int nbpools, i; + unsigned long long cached_bytes = 0; + uint cached = 0; + uint alloc_items; + + allocated = used = nbpools = 0; + + list_for_each_entry(entry, &pools, list) { + if (nbpools >= POOLS_MAX_DUMPED_ENTRIES) + break; + + alloc_items = pool_allocated(entry); + /* do not dump unused entries when sorting by usage */ + if (by_what == 3 && !alloc_items) + continue; + + /* verify the pool name if a prefix is requested */ + if (pfx && strncmp(entry->name, pfx, strlen(pfx)) != 0) + continue; + + if (!(pool_debugging & POOL_DBG_NO_CACHE)) { + for (cached = i = 0; i < global.nbthread; i++) + cached += entry->cache[i].count; + } + pool_info[nbpools].entry = entry; + pool_info[nbpools].alloc_items = alloc_items; + pool_info[nbpools].alloc_bytes = (ulong)entry->size * alloc_items; + pool_info[nbpools].used_items = pool_used(entry); + pool_info[nbpools].cached_items = cached; + pool_info[nbpools].need_avg = swrate_avg(pool_needed_avg(entry), POOL_AVG_SAMPLES); + pool_info[nbpools].failed_items = pool_failed(entry); + nbpools++; + } + + if (by_what == 1) /* sort by name */ + qsort(pool_info, nbpools, sizeof(pool_info[0]), cmp_dump_pools_name); + else if (by_what == 2) /* sort by item size */ + qsort(pool_info, nbpools, sizeof(pool_info[0]), cmp_dump_pools_size); + else if (by_what == 3) /* sort by total usage */ + qsort(pool_info, nbpools, sizeof(pool_info[0]), cmp_dump_pools_usage); + + chunk_printf(&trash, "Dumping pools usage"); + if (!max || max >= POOLS_MAX_DUMPED_ENTRIES) + max = POOLS_MAX_DUMPED_ENTRIES; + if (nbpools >= max) + chunk_appendf(&trash, " (limited to the first %u entries)", max); + chunk_appendf(&trash, ". Use SIGQUIT to flush them.\n"); + + for (i = 0; i < nbpools && i < max; i++) { + chunk_appendf(&trash, " - Pool %s (%lu bytes) : %lu allocated (%lu bytes), %lu used" + " (~%lu by thread caches)" + ", needed_avg %lu, %lu failures, %u users, @%p%s\n", + pool_info[i].entry->name, (ulong)pool_info[i].entry->size, + pool_info[i].alloc_items, pool_info[i].alloc_bytes, + pool_info[i].used_items, pool_info[i].cached_items, + pool_info[i].need_avg, pool_info[i].failed_items, + pool_info[i].entry->users, pool_info[i].entry, + (pool_info[i].entry->flags & MEM_F_SHARED) ? " [SHARED]" : ""); + + cached_bytes += pool_info[i].cached_items * (ulong)pool_info[i].entry->size; + allocated += pool_info[i].alloc_items * (ulong)pool_info[i].entry->size; + used += pool_info[i].used_items * (ulong)pool_info[i].entry->size; + } + + chunk_appendf(&trash, "Total: %d pools, %llu bytes allocated, %llu used" + " (~%llu by thread caches)" + ".\n", + nbpools, allocated, used, cached_bytes + ); +} + +/* Dump statistics on pools usage. 
*/
+void dump_pools(void)
+{
+	dump_pools_to_trash(0, 0, NULL);
+	qfprintf(stderr, "%s", trash.area);
+}
+
+/* This function returns the total number of failed pool allocations */
+int pool_total_failures()
+{
+	struct pool_head *entry;
+	int failed = 0;
+
+	list_for_each_entry(entry, &pools, list)
+		failed += pool_failed(entry);
+	return failed;
+}
+
+/* This function returns the total amount of memory allocated in pools (in bytes) */
+unsigned long long pool_total_allocated()
+{
+	struct pool_head *entry;
+	unsigned long long allocated = 0;
+
+	list_for_each_entry(entry, &pools, list)
+		allocated += pool_allocated(entry) * (ullong)entry->size;
+	return allocated;
+}
+
+/* This function returns the total amount of memory used in pools (in bytes) */
+unsigned long long pool_total_used()
+{
+	struct pool_head *entry;
+	unsigned long long used = 0;
+
+	list_for_each_entry(entry, &pools, list)
+		used += pool_used(entry) * (ullong)entry->size;
+	return used;
+}
+
+/* This function parses a string made of a set of debugging features as
+ * specified after -dM on the command line, and will set pool_debugging
+ * accordingly. On success it returns a strictly positive value. It may return
+ * zero with the first warning in <err>, -1 with a help message in <err>, or
+ * -2 with the first error in <err>. <err> is undefined on success, and will
+ * be non-null and locally allocated on help/error/warning. The caller must
+ * free it. Warnings are used to report features that were not enabled at
+ * build time, and errors are used to report unknown features.
+ */
+int pool_parse_debugging(const char *str, char **err)
+{
+	struct ist args;
+	char *end;
+	uint new_dbg;
+	int v;
+
+
+	/* if it's empty or starts with a number, it's the mem poisoning byte */
+	v = strtol(str, &end, 0);
+	if (!*end || *end == ',') {
+		mem_poison_byte = *str ? v : 'P';
+		if (mem_poison_byte >= 0)
+			pool_debugging |= POOL_DBG_POISON;
+		else
+			pool_debugging &= ~POOL_DBG_POISON;
+		str = end;
+	}
+
+	new_dbg = pool_debugging;
+
+	for (args = ist(str); istlen(args); args = istadv(istfind(args, ','), 1)) {
+		struct ist feat = iststop(args, ',');
+
+		if (!istlen(feat))
+			continue;
+
+		if (isteq(feat, ist("help"))) {
+			ha_free(err);
+			memprintf(err,
+				  "-dM alone enables memory poisoning with byte 0x50 on allocation. A numeric\n"
+				  "value may be appended immediately after -dM to use another value (0 supported).\n"
+				  "Then an optional list of comma-delimited keywords may be appended to set or\n"
+				  "clear some debugging options ('*' marks the current setting):\n\n"
+				  "   set               clear            description\n"
+				  " -----------------+-----------------+-----------------------------------------\n");
+
+			for (v = 0; dbg_options[v].flg; v++) {
+				memprintf(err, "%s %c %-15s|%c %-15s| %s\n",
+					  *err,
+					  (pool_debugging & dbg_options[v].flg) ? '*' : ' ',
+					  dbg_options[v].set,
+					  (pool_debugging & dbg_options[v].flg) ?
+					  ' ' : '*',
+					  dbg_options[v].clr,
+					  dbg_options[v].hlp);
+			}
+
+			memprintf(err,
+				  "%s -----------------+-----------------+-----------------------------------------\n"
+				  "Examples:\n"
+				  "  Disable merging and enable poisoning with byte 'P': -dM0x50,no-merge\n"
+				  "  Randomly fail allocations: -dMfail\n"
+				  "  Detect out-of-bound corruptions: -dMno-merge,tag\n"
+				  "  Detect post-free cache corruptions: -dMno-merge,cold-first,integrity,caller\n"
+				  "  Detect all cache corruptions: -dMno-merge,cold-first,integrity,tag,caller\n"
+				  "  Detect UAF (disables cache, very slow): -dMuaf\n"
+				  "  Detect post-cache UAF: -dMuaf,cache,no-merge,cold-first,integrity,tag,caller\n",
+				  *err);
+			return -1;
+		}
+
+		for (v = 0; dbg_options[v].flg; v++) {
+			if (isteq(feat, ist(dbg_options[v].set))) {
+				new_dbg |= dbg_options[v].flg;
+				/* UAF implicitly disables caching, but it's
+				 * still possible to forcefully re-enable it.
+				 */
+				if (dbg_options[v].flg == POOL_DBG_UAF)
+					new_dbg |= POOL_DBG_NO_CACHE;
+				/* fail should preset the tune.fail-alloc ratio to 1% */
+				if (dbg_options[v].flg == POOL_DBG_FAIL_ALLOC)
+					mem_fail_rate = 1;
+				break;
+			}
+			else if (isteq(feat, ist(dbg_options[v].clr))) {
+				new_dbg &= ~dbg_options[v].flg;
+				/* no-fail should reset the tune.fail-alloc ratio */
+				if (dbg_options[v].flg == POOL_DBG_FAIL_ALLOC)
+					mem_fail_rate = 0;
+				break;
+			}
+		}
+
+		if (!dbg_options[v].flg) {
+			memprintf(err, "unknown pool debugging feature <%.*s>", (int)istlen(feat), istptr(feat));
+			return -2;
+		}
+	}
+
+	pool_debugging = new_dbg;
+	return 1;
+}
+
+/* parse a "show pools" command. It returns 1 on failure, 0 if it starts to dump. */
+static int cli_parse_show_pools(char **args, char *payload, struct appctx *appctx, void *private)
+{
+	struct show_pools_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx));
+	int arg;
+
+	for (arg = 2; *args[arg]; arg++) {
+		if (strcmp(args[arg], "byname") == 0) {
+			ctx->by_what = 1; // sort output by name
+		}
+		else if (strcmp(args[arg], "bysize") == 0) {
+			ctx->by_what = 2; // sort output by item size
+		}
+		else if (strcmp(args[arg], "byusage") == 0) {
+			ctx->by_what = 3; // sort output by total allocated size
+		}
+		else if (strcmp(args[arg], "match") == 0 && *args[arg+1]) {
+			ctx->prefix = strdup(args[arg+1]); // only pools starting with this
+			arg++;
+		}
+		else if (isdigit((unsigned char)*args[arg])) {
+			ctx->maxcnt = atoi(args[arg]); // number of entries to dump
+		}
+		else
+			return cli_err(appctx, "Expects either 'byname', 'bysize', 'byusage', 'match <pfx>', or a max number of output lines.\n");
+	}
+	return 0;
+}
+
+/* release the "show pools" context */
+static void cli_release_show_pools(struct appctx *appctx)
+{
+	struct show_pools_ctx *ctx = appctx->svcctx;
+
+	ha_free(&ctx->prefix);
+}
+
+/* This function dumps memory usage information onto the stream connector's
+ * read buffer. It returns 0 as long as it does not complete, non-zero upon
+ * completion. No state is used.
+ */
+static int cli_io_handler_dump_pools(struct appctx *appctx)
+{
+	struct show_pools_ctx *ctx = appctx->svcctx;
+
+	dump_pools_to_trash(ctx->by_what, ctx->maxcnt, ctx->prefix);
+	if (applet_putchk(appctx, &trash) == -1)
+		return 0;
+	return 1;
+}
+
+/* callback used to create early pool <name> of size <size> and store the
+ * resulting pointer into <ptr>. If the allocation fails, it quits after
+ * emitting an error message.
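+ *
+ * Editor's note (cross-reference added): callers normally reach this through
+ * the pool declaration macros rather than calling it directly; for instance
+ * this same patch declares, in src/proto_quic.c:
+ *
+ *	DECLARE_STATIC_POOL(pool_head_quic_rxbuf, "quic_rxbuf", QUIC_RX_BUFSZ);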
+ */ +void create_pool_callback(struct pool_head **ptr, char *name, unsigned int size) +{ + *ptr = create_pool(name, size, MEM_F_SHARED); + if (!*ptr) { + ha_alert("Failed to allocate pool '%s' of size %u : %s. Aborting.\n", + name, size, strerror(errno)); + exit(1); + } +} + +/* Initializes all per-thread arrays on startup */ +static void init_pools() +{ + int thr; + + for (thr = 0; thr < MAX_THREADS; thr++) { + LIST_INIT(&ha_thread_ctx[thr].pool_lru_head); + } + + detect_allocator(); +} + +INITCALL0(STG_PREPARE, init_pools); + +/* Report in build options if trim is supported */ +static void pools_register_build_options(void) +{ + if (!using_default_allocator) { + char *ptr = NULL; + memprintf(&ptr, "Running with a replaced memory allocator (e.g. via LD_PRELOAD)."); + hap_register_build_opts(ptr, 1); + mark_tainted(TAINTED_REPLACED_MEM_ALLOCATOR); + } +} +INITCALL0(STG_REGISTER, pools_register_build_options); + +/* register cli keywords */ +static struct cli_kw_list cli_kws = {{ },{ + { { "show", "pools", NULL }, "show pools [by*] [match <pfx>] [nb] : report information about the memory pools usage", cli_parse_show_pools, cli_io_handler_dump_pools, cli_release_show_pools }, + {{},} +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + + +/* config parser for global "tune.fail-alloc" */ +static int mem_parse_global_fail_alloc(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + mem_fail_rate = atoi(args[1]); + if (mem_fail_rate < 0 || mem_fail_rate > 100) { + memprintf(err, "'%s' expects a numeric value between 0 and 100.", args[0]); + return -1; + } + return 0; +} + +/* config parser for global "tune.memory.hot-size" */ +static int mem_parse_global_hot_size(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + long size; + + if (too_many_args(1, args, err, NULL)) + return -1; + + size = atol(args[1]); + if (size <= 0) { + memprintf(err, "'%s' expects a strictly positive value.", args[0]); + return -1; + } + + global.tune.pool_cache_size = size; + return 0; +} + +/* config parser for global "no-memory-trimming" */ +static int mem_parse_global_no_mem_trim(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(0, args, err, NULL)) + return -1; + disable_trim = 1; + return 0; +} + +/* register global config keywords */ +static struct cfg_kw_list mem_cfg_kws = {ILH, { + { CFG_GLOBAL, "tune.fail-alloc", mem_parse_global_fail_alloc }, + { CFG_GLOBAL, "tune.memory.hot-size", mem_parse_global_hot_size }, + { CFG_GLOBAL, "no-memory-trimming", mem_parse_global_no_mem_trim }, + { 0, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &mem_cfg_kws); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/proto_quic.c b/src/proto_quic.c new file mode 100644 index 0000000..899cffe --- /dev/null +++ b/src/proto_quic.c @@ -0,0 +1,799 @@ +/* + * AF_INET/AF_INET6 QUIC protocol layer. + * + * Copyright 2020 Frederic Lecaille <flecaille@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <ctype.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include <netinet/udp.h> +#include <netinet/in.h> + +#include <import/ebtree-t.h> + +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/cbuf.h> +#include <haproxy/connection.h> +#include <haproxy/errors.h> +#include <haproxy/fd.h> +#include <haproxy/global.h> +#include <haproxy/list.h> +#include <haproxy/listener.h> +#include <haproxy/log.h> +#include <haproxy/namespace.h> +#include <haproxy/port_range.h> +#include <haproxy/protocol.h> +#include <haproxy/proto_quic.h> +#include <haproxy/proto_udp.h> +#include <haproxy/proxy-t.h> +#include <haproxy/quic_conn.h> +#include <haproxy/quic_sock.h> +#include <haproxy/sock.h> +#include <haproxy/sock_inet.h> +#include <haproxy/task.h> +#include <haproxy/tools.h> + +/* per-thread quic datagram handlers */ +struct quic_dghdlr *quic_dghdlrs; +struct eb_root *quic_cid_tree; + +/* global CID trees */ +#define QUIC_CID_TREES_CNT 256 +struct quic_cid_tree *quic_cid_trees; + +/* Size of the internal buffer of QUIC RX buffer at the fd level */ +#define QUIC_RX_BUFSZ (1UL << 18) + +DECLARE_STATIC_POOL(pool_head_quic_rxbuf, "quic_rxbuf", QUIC_RX_BUFSZ); + +static int quic_bind_listener(struct listener *listener, char *errmsg, int errlen); +static int quic_connect_server(struct connection *conn, int flags); +static void quic_enable_listener(struct listener *listener); +static void quic_disable_listener(struct listener *listener); +static int quic_set_affinity(struct connection *conn, int new_tid); + +/* Note: must not be declared <const> as its list will be overwritten */ +struct protocol proto_quic4 = { + .name = "quic4", + + /* connection layer */ + .xprt_type = PROTO_TYPE_STREAM, + .listen = quic_bind_listener, + .enable = quic_enable_listener, + .disable = quic_disable_listener, + .add = default_add_listener, + .unbind = default_unbind_listener, + .suspend = default_suspend_listener, + .resume = default_resume_listener, + .accept_conn = quic_sock_accept_conn, + .get_src = quic_sock_get_src, + .get_dst = quic_sock_get_dst, + .connect = quic_connect_server, + .set_affinity = quic_set_affinity, + + /* binding layer */ + .rx_suspend = udp_suspend_receiver, + .rx_resume = udp_resume_receiver, + + /* address family */ + .fam = &proto_fam_inet4, + + /* socket layer */ + .proto_type = PROTO_TYPE_DGRAM, + .sock_type = SOCK_DGRAM, + .sock_prot = IPPROTO_UDP, + .rx_enable = sock_enable, + .rx_disable = sock_disable, + .rx_unbind = sock_unbind, + .rx_listening = quic_sock_accepting_conn, + .default_iocb = quic_lstnr_sock_fd_iocb, + .receivers = LIST_HEAD_INIT(proto_quic4.receivers), + .nb_receivers = 0, +#ifdef SO_REUSEPORT + .flags = PROTO_F_REUSEPORT_SUPPORTED, +#endif +}; + +INITCALL1(STG_REGISTER, protocol_register, &proto_quic4); + +/* Note: must not be declared <const> as its list will be overwritten */ +struct protocol proto_quic6 = { + .name = "quic6", + + /* connection layer */ + .xprt_type = PROTO_TYPE_STREAM, + .listen = quic_bind_listener, + .enable = quic_enable_listener, + .disable = quic_disable_listener, + .add = default_add_listener, + .unbind = default_unbind_listener, + .suspend = default_suspend_listener, + .resume = default_resume_listener, + .accept_conn = quic_sock_accept_conn, + .get_src = quic_sock_get_src, + .get_dst = quic_sock_get_dst, + .connect = quic_connect_server, + .set_affinity = quic_set_affinity, + + /* 
binding layer */
+	.rx_suspend     = udp_suspend_receiver,
+	.rx_resume      = udp_resume_receiver,
+
+	/* address family */
+	.fam            = &proto_fam_inet6,
+
+	/* socket layer */
+	.proto_type     = PROTO_TYPE_DGRAM,
+	.sock_type      = SOCK_DGRAM,
+	.sock_prot      = IPPROTO_UDP,
+	.rx_enable      = sock_enable,
+	.rx_disable     = sock_disable,
+	.rx_unbind      = sock_unbind,
+	.rx_listening   = quic_sock_accepting_conn,
+	.default_iocb   = quic_lstnr_sock_fd_iocb,
+	.receivers      = LIST_HEAD_INIT(proto_quic6.receivers),
+	.nb_receivers   = 0,
+#ifdef SO_REUSEPORT
+	.flags          = PROTO_F_REUSEPORT_SUPPORTED,
+#endif
+};
+
+INITCALL1(STG_REGISTER, protocol_register, &proto_quic6);
+
+/* Binds ipv4/ipv6 address <local> to socket <fd>, unless <flags> is set, in which
+ * case we try to bind <remote>. <flags> is a 2-bit field consisting of :
+ *  - 0 : ignore remote address (may even be a NULL pointer)
+ *  - 1 : use provided address
+ *  - 2 : use provided port
+ *  - 3 : use both
+ *
+ * The function supports a single foreign binding method :
+ *  - linux_tproxy: we directly bind to the foreign address
+ * This function returns 0 when everything's OK, 1 if it could not bind to the
+ * local address, 2 if it could not bind to the foreign address.
+ */
+int quic_bind_socket(int fd, int flags, struct sockaddr_storage *local, struct sockaddr_storage *remote)
+{
+	struct sockaddr_storage bind_addr;
+	int foreign_ok = 0;
+	int ret;
+	static THREAD_LOCAL int ip_transp_working = 1;
+	static THREAD_LOCAL int ip6_transp_working = 1;
+
+	switch (local->ss_family) {
+	case AF_INET:
+		if (flags && ip_transp_working) {
+			/* This deserves some explanation. Some platforms will support
+			 * multiple combinations of certain methods, so we try the
+			 * supported ones until one succeeds.
+			 */
+			if (sock_inet4_make_foreign(fd))
+				foreign_ok = 1;
+			else
+				ip_transp_working = 0;
+		}
+		break;
+	case AF_INET6:
+		if (flags && ip6_transp_working) {
+			if (sock_inet6_make_foreign(fd))
+				foreign_ok = 1;
+			else
+				ip6_transp_working = 0;
+		}
+		break;
+	}
+
+	if (flags) {
+		memset(&bind_addr, 0, sizeof(bind_addr));
+		bind_addr.ss_family = remote->ss_family;
+		switch (remote->ss_family) {
+		case AF_INET:
+			if (flags & 1)
+				((struct sockaddr_in *)&bind_addr)->sin_addr = ((struct sockaddr_in *)remote)->sin_addr;
+			if (flags & 2)
+				((struct sockaddr_in *)&bind_addr)->sin_port = ((struct sockaddr_in *)remote)->sin_port;
+			break;
+		case AF_INET6:
+			if (flags & 1)
+				((struct sockaddr_in6 *)&bind_addr)->sin6_addr = ((struct sockaddr_in6 *)remote)->sin6_addr;
+			if (flags & 2)
+				((struct sockaddr_in6 *)&bind_addr)->sin6_port = ((struct sockaddr_in6 *)remote)->sin6_port;
+			break;
+		default:
+			/* we don't want to try to bind to an unknown address family */
+			foreign_ok = 0;
+		}
+	}
+
+	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
+	if (foreign_ok) {
+		if (is_inet_addr(&bind_addr)) {
+			ret = bind(fd, (struct sockaddr *)&bind_addr, get_addr_len(&bind_addr));
+			if (ret < 0)
+				return 2;
+		}
+	}
+	else {
+		if (is_inet_addr(local)) {
+			ret = bind(fd, (struct sockaddr *)local, get_addr_len(local));
+			if (ret < 0)
+				return 1;
+		}
+	}
+
+	if (!flags)
+		return 0;
+
+	if (!foreign_ok)
+		/* we could not bind to a foreign address */
+		return 2;
+
+	return 0;
+}
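+
+/* Editor's note (illustrative calls added for clarity, not in the original
+ * patch): the <flags> bits of quic_bind_socket() combine as follows:
+ *
+ *	quic_bind_socket(fd, 0, local, NULL);    // plain bind to <local>
+ *	quic_bind_socket(fd, 1, local, remote);  // bind to remote's address
+ *	quic_bind_socket(fd, 2, local, remote);  // bind to remote's port
+ *	quic_bind_socket(fd, 3, local, remote);  // both (full foreign bind)
+ */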
+
+/*
+ * This function initiates a QUIC connection establishment to the target assigned
+ * to connection <conn> using conn->{target,dst}. A source address may be
+ * pointed to by conn->src in case of transparent proxying. Normal source
+ * bind addresses are still determined locally (due to the possible need of a
+ * source port). conn->target may point either to a valid server or to a
+ * backend; only OBJ_TYPE_PROXY and OBJ_TYPE_SERVER are supported. Whether data
+ * are already waiting to be sent is used to adjust data write polling and, on
+ * some platforms, to avoid an empty initial ACK. The <flags> argument is not
+ * used.
+ *
+ * Note that a pending send_proxy message accounts for data.
+ *
+ * It can return one of :
+ *  - SF_ERR_NONE if everything's OK
+ *  - SF_ERR_SRVTO if there are no more servers
+ *  - SF_ERR_SRVCL if the connection was refused by the server
+ *  - SF_ERR_PRXCOND if the connection has been limited by the proxy (maxconn)
+ *  - SF_ERR_RESOURCE if a system resource is lacking (eg: fd limits, ports, ...)
+ *  - SF_ERR_INTERNAL for any other purely internal errors
+ * Additionally, in the case of SF_ERR_RESOURCE, an emergency log will be emitted.
+ *
+ * The connection's fd is inserted only when SF_ERR_NONE is returned, otherwise
+ * it's invalid and the caller has nothing to do.
+ */
+
+int quic_connect_server(struct connection *conn, int flags)
+{
+	int fd;
+	struct server *srv;
+	struct proxy *be;
+	struct conn_src *src;
+	struct sockaddr_storage *addr;
+
+	BUG_ON(!conn->dst);
+
+	conn->flags |= CO_FL_WAIT_L4_CONN; /* connection in progress */
+
+	switch (obj_type(conn->target)) {
+	case OBJ_TYPE_PROXY:
+		be = __objt_proxy(conn->target);
+		srv = NULL;
+		break;
+	case OBJ_TYPE_SERVER:
+		srv = __objt_server(conn->target);
+		be = srv->proxy;
+		break;
+	default:
+		conn->flags |= CO_FL_ERROR;
+		return SF_ERR_INTERNAL;
+	}
+
+	fd = conn->handle.fd = sock_create_server_socket(conn);
+
+	if (fd == -1) {
+		qfprintf(stderr, "Cannot get a server socket.\n");
+
+		if (errno == ENFILE) {
+			conn->err_code = CO_ER_SYS_FDLIM;
+			send_log(be, LOG_EMERG,
+				 "Proxy %s reached system FD limit (maxsock=%d). Please check system tunables.\n",
+				 be->id, global.maxsock);
+		}
+		else if (errno == EMFILE) {
+			conn->err_code = CO_ER_PROC_FDLIM;
+			send_log(be, LOG_EMERG,
+				 "Proxy %s reached process FD limit (maxsock=%d). Please check 'ulimit-n' and restart.\n",
+				 be->id, global.maxsock);
+		}
+		else if (errno == ENOBUFS || errno == ENOMEM) {
+			conn->err_code = CO_ER_SYS_MEMLIM;
+			send_log(be, LOG_EMERG,
+				 "Proxy %s reached system memory limit (maxsock=%d). Please check system tunables.\n",
+				 be->id, global.maxsock);
+		}
+		else if (errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
+			conn->err_code = CO_ER_NOPROTO;
+		}
+		else
+			conn->err_code = CO_ER_SOCK_ERR;
+
+		/* this is a resource error */
+		conn->flags |= CO_FL_ERROR;
+		return SF_ERR_RESOURCE;
+	}
+
+	if (fd >= global.maxsock) {
+		/* do not log anything there, it's a normal condition when this option
+		 * is used to serialize connections to a server !
+		 */
+		ha_alert("socket(): not enough free sockets. Raise -n argument. 
Giving up.\n"); + close(fd); + conn->err_code = CO_ER_CONF_FDLIM; + conn->flags |= CO_FL_ERROR; + return SF_ERR_PRXCOND; /* it is a configuration limit */ + } + + if (fd_set_nonblock(fd) == -1) { + qfprintf(stderr,"Cannot set client socket to non blocking mode.\n"); + close(fd); + conn->err_code = CO_ER_SOCK_ERR; + conn->flags |= CO_FL_ERROR; + return SF_ERR_INTERNAL; + } + + if (master == 1 && fd_set_cloexec(fd) == -1) { + ha_alert("Cannot set CLOEXEC on client socket.\n"); + close(fd); + conn->err_code = CO_ER_SOCK_ERR; + conn->flags |= CO_FL_ERROR; + return SF_ERR_INTERNAL; + } + + /* allow specific binding : + * - server-specific at first + * - proxy-specific next + */ + if (srv && srv->conn_src.opts & CO_SRC_BIND) + src = &srv->conn_src; + else if (be->conn_src.opts & CO_SRC_BIND) + src = &be->conn_src; + else + src = NULL; + + if (src) { + int ret, flags = 0; + + if (conn->src && is_inet_addr(conn->src)) { + switch (src->opts & CO_SRC_TPROXY_MASK) { + case CO_SRC_TPROXY_CLI: + conn_set_private(conn); + __fallthrough; + case CO_SRC_TPROXY_ADDR: + flags = 3; + break; + case CO_SRC_TPROXY_CIP: + case CO_SRC_TPROXY_DYN: + conn_set_private(conn); + flags = 1; + break; + } + } + +#ifdef SO_BINDTODEVICE + /* Note: this might fail if not CAP_NET_RAW */ + if (src->iface_name) + setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, src->iface_name, src->iface_len + 1); +#endif + + if (src->sport_range) { + int attempts = 10; /* should be more than enough to find a spare port */ + struct sockaddr_storage sa; + + ret = 1; + memcpy(&sa, &src->source_addr, sizeof(sa)); + + do { + /* note: in case of retry, we may have to release a previously + * allocated port, hence this loop's construct. + */ + port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port); + fdinfo[fd].port_range = NULL; + + if (!attempts) + break; + attempts--; + + fdinfo[fd].local_port = port_range_alloc_port(src->sport_range); + if (!fdinfo[fd].local_port) { + conn->err_code = CO_ER_PORT_RANGE; + break; + } + + fdinfo[fd].port_range = src->sport_range; + set_host_port(&sa, fdinfo[fd].local_port); + + ret = quic_bind_socket(fd, flags, &sa, conn->src); + if (ret != 0) + conn->err_code = CO_ER_CANT_BIND; + } while (ret != 0); /* binding NOK */ + } + else { +#ifdef IP_BIND_ADDRESS_NO_PORT + static THREAD_LOCAL int bind_address_no_port = 1; + setsockopt(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT, (const void *) &bind_address_no_port, sizeof(int)); +#endif + ret = quic_bind_socket(fd, flags, &src->source_addr, conn->src); + if (ret != 0) + conn->err_code = CO_ER_CANT_BIND; + } + + if (unlikely(ret != 0)) { + port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port); + fdinfo[fd].port_range = NULL; + close(fd); + + if (ret == 1) { + ha_alert("Cannot bind to source address before connect() for backend %s. Aborting.\n", + be->id); + send_log(be, LOG_EMERG, + "Cannot bind to source address before connect() for backend %s.\n", + be->id); + } else { + ha_alert("Cannot bind to tproxy source address before connect() for backend %s. 
Aborting.\n", + be->id); + send_log(be, LOG_EMERG, + "Cannot bind to tproxy source address before connect() for backend %s.\n", + be->id); + } + conn->flags |= CO_FL_ERROR; + return SF_ERR_RESOURCE; + } + } + + if (global.tune.server_sndbuf) + setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &global.tune.server_sndbuf, sizeof(global.tune.server_sndbuf)); + + if (global.tune.server_rcvbuf) + setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &global.tune.server_rcvbuf, sizeof(global.tune.server_rcvbuf)); + + addr = (conn->flags & CO_FL_SOCKS4) ? &srv->socks4_addr : conn->dst; + if (connect(fd, (const struct sockaddr *)addr, get_addr_len(addr)) == -1) { + if (errno == EINPROGRESS || errno == EALREADY) { + /* common case, let's wait for connect status */ + conn->flags |= CO_FL_WAIT_L4_CONN; + } + else if (errno == EISCONN) { + /* should normally not happen but if so, indicates that it's OK */ + conn->flags &= ~CO_FL_WAIT_L4_CONN; + } + else if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EADDRINUSE || errno == EADDRNOTAVAIL) { + char *msg; + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EADDRNOTAVAIL) { + msg = "no free ports"; + conn->err_code = CO_ER_FREE_PORTS; + } + else { + msg = "local address already in use"; + conn->err_code = CO_ER_ADDR_INUSE; + } + + qfprintf(stderr,"Connect() failed for backend %s: %s.\n", be->id, msg); + port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port); + fdinfo[fd].port_range = NULL; + close(fd); + send_log(be, LOG_ERR, "Connect() failed for backend %s: %s.\n", be->id, msg); + conn->flags |= CO_FL_ERROR; + return SF_ERR_RESOURCE; + } else if (errno == ETIMEDOUT) { + //qfprintf(stderr,"Connect(): ETIMEDOUT"); + port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port); + fdinfo[fd].port_range = NULL; + close(fd); + conn->err_code = CO_ER_SOCK_ERR; + conn->flags |= CO_FL_ERROR; + return SF_ERR_SRVTO; + } else { + // (errno == ECONNREFUSED || errno == ENETUNREACH || errno == EACCES || errno == EPERM) + //qfprintf(stderr,"Connect(): %d", errno); + port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port); + fdinfo[fd].port_range = NULL; + close(fd); + conn->err_code = CO_ER_SOCK_ERR; + conn->flags |= CO_FL_ERROR; + return SF_ERR_SRVCL; + } + } + else { + /* connect() == 0, this is great! */ + conn->flags &= ~CO_FL_WAIT_L4_CONN; + } + + conn_ctrl_init(conn); /* registers the FD */ + HA_ATOMIC_OR(&fdtab[fd].state, FD_LINGER_RISK); /* close hard if needed */ + + if (conn->flags & CO_FL_WAIT_L4_CONN) { + fd_want_send(fd); + fd_cant_send(fd); + fd_cant_recv(fd); + } + + return SF_ERR_NONE; /* connection is OK */ +} + +/* Allocate the RX buffers for <l> listener. + * Return 1 if succeeded, 0 if not. + */ +static int quic_alloc_rxbufs_listener(struct listener *l) +{ + int i; + struct quic_receiver_buf *tmp; + + MT_LIST_INIT(&l->rx.rxbuf_list); + for (i = 0; i < my_popcountl(l->rx.bind_thread); i++) { + struct quic_receiver_buf *rxbuf; + char *buf; + + rxbuf = calloc(1, sizeof(*rxbuf)); + if (!rxbuf) + goto err; + + buf = pool_alloc(pool_head_quic_rxbuf); + if (!buf) { + free(rxbuf); + goto err; + } + + rxbuf->buf = b_make(buf, QUIC_RX_BUFSZ, 0, 0); + LIST_INIT(&rxbuf->dgram_list); + MT_LIST_APPEND(&l->rx.rxbuf_list, &rxbuf->rxbuf_el); + } + + return 1; + + err: + while ((tmp = MT_LIST_POP(&l->rx.rxbuf_list, typeof(tmp), rxbuf_el))) { + pool_free(pool_head_quic_rxbuf, tmp->buf.area); + free(tmp); + } + return 0; +} + +/* Check if platform supports the required feature set for quic-conn owned + * socket. 
<l> listener must already be bound; a dummy socket will be opened
+ * on the same address as one of the support tests.
+ *
+ * Returns true if platform is deemed compatible else false.
+ */
+static int quic_test_sock_per_conn_support(struct listener *l)
+{
+	const struct receiver *rx = &l->rx;
+	int ret = 1, fdtest;
+
+	/* Check if IP destination address can be retrieved on recvfrom()
+	 * operation.
+	 */
+#if !defined(IP_PKTINFO) && !defined(IP_RECVDSTADDR)
+	ha_alert("Your platform does not seem to support UDP source address retrieval through IP_PKTINFO or an alternative flag. "
+		 "QUIC connections will use listener socket.\n");
+	ret = 0;
+#endif
+
+	/* Check if the platform supports multiple UDP sockets bound on the same
+	 * local address. Create a dummy socket and bind it on the same address
+	 * as <l> listener. If the bind system call fails, deactivate socket per
+	 * connection. All other errors are not taken into account.
+	 */
+	if (ret) {
+		fdtest = socket(rx->proto->fam->sock_domain,
+		                rx->proto->sock_type, rx->proto->sock_prot);
+		if (fdtest >= 0) {
+			if (setsockopt(fdtest, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) &&
+			    bind(fdtest, (struct sockaddr *)&rx->addr, rx->proto->fam->sock_addrlen) < 0) {
+				ha_alert("Your platform does not seem to support multiple UDP sockets bound on the same address. "
+					 "QUIC connections will use listener socket.\n");
+				ret = 0;
+			}
+
+			close(fdtest);
+		}
+	}
+
+	return ret;
+}
+
+/* This function tries to bind a QUIC4/6 listener. It may return a warning or
+ * an error message in <errmsg> if the message is at most <errlen> bytes long
+ * (including '\0'). Note that <errmsg> may be NULL if <errlen> is also zero.
+ * The return value is composed from ERR_ABORT, ERR_WARN,
+ * ERR_ALERT, ERR_RETRYABLE and ERR_FATAL. ERR_NONE indicates that everything
+ * was alright and that no message was returned. ERR_RETRYABLE means that an
+ * error occurred but that it may vanish after a retry (eg: port in use), and
+ * ERR_FATAL indicates a non-fixable error. ERR_WARN and ERR_ALERT do not alter
+ * the meaning of the error, but just indicate that a message is present which
+ * should be displayed with the respective level. Last, ERR_ABORT indicates
+ * that it's pointless to try to start other listeners. No error message is
+ * returned if <errlen> is zero.
+ */
+static int quic_bind_listener(struct listener *listener, char *errmsg, int errlen)
+{
+	const struct sockaddr_storage addr = listener->rx.addr;
+	int fd, err = ERR_NONE;
+	char *msg = NULL;
+
+	/* ensure we never return garbage */
+	if (errlen)
+		*errmsg = 0;
+
+	if (listener->state != LI_ASSIGNED)
+		return ERR_NONE; /* already bound */
+
+	if (!(listener->rx.flags & RX_F_BOUND)) {
+		msg = "receiving socket not bound";
+		goto udp_return;
+	}
+
+	/* Duplicate quic_mode setting from bind_conf. Useful to overwrite it
+	 * at runtime per receiver instance.
+	 */
+	listener->rx.quic_mode = listener->bind_conf->quic_mode;
+
+	/* Set IP_PKTINFO to retrieve destination address on recv.
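+	 * Editor's note (portability detail added): the #if ladder below
+	 * mirrors the capability check in quic_test_sock_per_conn_support()
+	 * above: IP_PKTINFO or IP_RECVDSTADDR for IPv4, and IPV6_RECVPKTINFO
+	 * for IPv6.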
*/ + fd = listener->rx.fd; + switch (addr.ss_family) { + case AF_INET: +#if defined(IP_PKTINFO) + setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &one, sizeof(one)); +#elif defined(IP_RECVDSTADDR) + setsockopt(fd, IPPROTO_IP, IP_RECVDSTADDR, &one, sizeof(one)); +#endif /* IP_PKTINFO || IP_RECVDSTADDR */ + break; + case AF_INET6: +#ifdef IPV6_RECVPKTINFO + setsockopt(fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, &one, sizeof(one)); +#endif + break; + default: + break; + } + + if (!quic_alloc_rxbufs_listener(listener)) { + msg = "could not initialize tx/rx rings"; + err |= ERR_WARN; + goto udp_return; + } + + if (global.tune.options & GTUNE_QUIC_SOCK_PER_CONN) { + if (!quic_test_sock_per_conn_support(listener)) + global.tune.options &= ~GTUNE_QUIC_SOCK_PER_CONN; + } + + if (global.tune.frontend_rcvbuf) + setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &global.tune.frontend_rcvbuf, sizeof(global.tune.frontend_rcvbuf)); + + if (global.tune.frontend_sndbuf) + setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &global.tune.frontend_sndbuf, sizeof(global.tune.frontend_sndbuf)); + + listener_set_state(listener, LI_LISTEN); + + udp_return: + if (msg && errlen) { + char pn[INET6_ADDRSTRLEN]; + + addr_to_str(&listener->rx.addr, pn, sizeof(pn)); + snprintf(errmsg, errlen, "%s for [%s:%d]", msg, pn, get_host_port(&listener->rx.addr)); + } + return err; +} + +/* Enable receipt of incoming connections for listener <l>. The receiver must + * still be valid. Does nothing in early boot (needs fd_updt). + */ +static void quic_enable_listener(struct listener *l) +{ + /* FIXME: The following statements are incorrect. This + * is the responsibility of the QUIC xprt to stop accepting new + * connections. + */ + if (fd_updt) + fd_want_recv(l->rx.fd); +} + +/* Disable receipt of incoming connections for listener <l>. The receiver must + * still be valid. Does nothing in early boot (needs fd_updt). + */ +static void quic_disable_listener(struct listener *l) +{ + /* FIXME: The following statements are incorrect. This + * is the responsibility of the QUIC xprt to start accepting new + * connections again. + */ + if (fd_updt) + fd_stop_recv(l->rx.fd); +} + +/* change the connection's thread to <new_tid>. For frontend connections, the + * target is a listener, and the caller is responsible for guaranteeing that + * the listener assigned to the connection is bound to the requested thread. 
+ */ +static int quic_set_affinity(struct connection *conn, int new_tid) +{ + struct quic_conn *qc = conn->handle.qc; + return qc_set_tid_affinity(qc, new_tid, objt_listener(conn->target)); +} + +static int quic_alloc_dghdlrs(void) +{ + int i; + + quic_dghdlrs = calloc(global.nbthread, sizeof(*quic_dghdlrs)); + if (!quic_dghdlrs) { + ha_alert("Failed to allocate the quic datagram handlers.\n"); + return 0; + } + + for (i = 0; i < global.nbthread; i++) { + struct quic_dghdlr *dghdlr = &quic_dghdlrs[i]; + + dghdlr->task = tasklet_new(); + if (!dghdlr->task) { + ha_alert("Failed to allocate the quic datagram handler on thread %d.\n", i); + return 0; + } + + tasklet_set_tid(dghdlr->task, i); + dghdlr->task->context = dghdlr; + dghdlr->task->process = quic_lstnr_dghdlr; + + MT_LIST_INIT(&dghdlr->dgrams); + } + + quic_cid_trees = calloc(QUIC_CID_TREES_CNT, sizeof(*quic_cid_trees)); + if (!quic_cid_trees) { + ha_alert("Failed to allocate global CIDs trees.\n"); + return 0; + } + + for (i = 0; i < QUIC_CID_TREES_CNT; ++i) { + HA_RWLOCK_INIT(&quic_cid_trees[i].lock); + quic_cid_trees[i].root = EB_ROOT_UNIQUE; + } + + return 1; +} +REGISTER_POST_CHECK(quic_alloc_dghdlrs); + +static int quic_deallocate_dghdlrs(void) +{ + int i; + + if (quic_dghdlrs) { + for (i = 0; i < global.nbthread; ++i) + tasklet_free(quic_dghdlrs[i].task); + free(quic_dghdlrs); + } + + ha_free(&quic_cid_trees); + + return 1; +} +REGISTER_POST_DEINIT(quic_deallocate_dghdlrs); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/proto_rhttp.c b/src/proto_rhttp.c new file mode 100644 index 0000000..452ee32 --- /dev/null +++ b/src/proto_rhttp.c @@ -0,0 +1,464 @@ +#include <stdio.h> +#include <string.h> + +#include <haproxy/api.h> +#include <haproxy/connection.h> +#include <haproxy/errors.h> +#include <haproxy/intops.h> +#include <haproxy/list.h> +#include <haproxy/listener.h> +#include <haproxy/log.h> +#include <haproxy/proto_tcp.h> +#include <haproxy/protocol.h> +#include <haproxy/proxy.h> +#include <haproxy/sample.h> +#include <haproxy/server.h> +#include <haproxy/sock.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/task.h> + +#include <haproxy/proto_rhttp.h> + +struct proto_fam proto_fam_rhttp = { + .name = "rhttp", + .sock_domain = AF_CUST_RHTTP_SRV, + .sock_family = AF_INET, + .bind = rhttp_bind_receiver, +}; + +struct protocol proto_rhttp = { + .name = "rev", + + /* connection layer (no outgoing connection) */ + .listen = rhttp_bind_listener, + .enable = rhttp_enable_listener, + .disable = rhttp_disable_listener, + .add = default_add_listener, + .unbind = rhttp_unbind_receiver, + .resume = default_resume_listener, + .accept_conn = rhttp_accept_conn, + .set_affinity = rhttp_set_affinity, + + /* address family */ + .fam = &proto_fam_rhttp, + + /* socket layer */ + .proto_type = PROTO_TYPE_STREAM, + .sock_type = SOCK_STREAM, + .sock_prot = IPPROTO_TCP, + .rx_listening = rhttp_accepting_conn, + .receivers = LIST_HEAD_INIT(proto_rhttp.receivers), +}; + +static struct connection *new_reverse_conn(struct listener *l, struct server *srv) +{ + struct connection *conn = conn_new(srv); + struct sockaddr_storage *bind_addr = NULL; + if (!conn) + goto err; + + HA_ATOMIC_INC(&th_ctx->nb_rhttp_conns); + + conn_set_reverse(conn, &l->obj_type); + + if (alloc_bind_address(&bind_addr, srv, srv->proxy, NULL) != SRV_STATUS_OK) + goto err; + conn->src = bind_addr; + + sockaddr_alloc(&conn->dst, 0, 0); + if (!conn->dst) + goto err; + *conn->dst = srv->addr; + set_host_port(conn->dst, 
srv->svc_port);
+
+	if (conn_prepare(conn, protocol_lookup(conn->dst->ss_family, PROTO_TYPE_STREAM, 0), srv->xprt))
+		goto err;
+
+	if (conn->ctrl->connect(conn, 0) != SF_ERR_NONE)
+		goto err;
+
+#ifdef USE_OPENSSL
+	if (srv->ssl_ctx.sni) {
+		struct sample *sni_smp = NULL;
+		/* TODO remove NULL session which can cause crash depending on the SNI sample expr used. */
+		sni_smp = sample_fetch_as_type(srv->proxy, NULL, NULL,
+		                               SMP_OPT_DIR_REQ | SMP_OPT_FINAL,
+		                               srv->ssl_ctx.sni, SMP_T_STR);
+		if (smp_make_safe(sni_smp))
+			ssl_sock_set_servername(conn, sni_smp->data.u.str.area);
+	}
+#endif /* USE_OPENSSL */
+
+	if (conn_xprt_start(conn) < 0)
+		goto err;
+
+	if (!srv->use_ssl ||
+	    (!srv->ssl_ctx.alpn_str && !srv->ssl_ctx.npn_str) ||
+	    srv->mux_proto) {
+		if (conn_install_mux_be(conn, NULL, NULL, NULL) < 0)
+			goto err;
+	}
+
+	/* Not expected here. */
+	BUG_ON((conn->flags & CO_FL_HANDSHAKE));
+	return conn;
+
+ err:
+	if (conn) {
+		conn_stop_tracking(conn);
+		conn_xprt_shutw(conn);
+		conn_xprt_close(conn);
+		conn_sock_shutw(conn, 0);
+		conn_ctrl_close(conn);
+
+		if (conn->destroy_cb)
+			conn->destroy_cb(conn);
+
+		/* Mark connection as non-reversible. This prevents conn_free()
+		 * from rescheduling the rhttp task when freeing a preconnect
+		 * connection.
+		 */
+		conn->reverse.target = NULL;
+		conn_free(conn);
+	}
+
+	return NULL;
+}
+
+/* Report that a connection used for preconnect on listener <l> is freed before
+ * reversal is completed. This is used to clean up any reference to the
+ * connection and rearm a new preconnect attempt.
+ */
+void rhttp_notify_preconn_err(struct listener *l)
+{
+	/* Receiver must reference a reverse connection as pending. */
+	BUG_ON(!l->rx.rhttp.pend_conn);
+
+	/* Remove reference to the freed connection. */
+	l->rx.rhttp.pend_conn = NULL;
+
+	if (l->rx.rhttp.state != LI_PRECONN_ST_ERR) {
+		send_log(l->bind_conf->frontend, LOG_ERR,
+		         "preconnect %s::%s: Error encountered.\n",
+		         l->bind_conf->frontend->id, l->bind_conf->rhttp_srvname);
+		l->rx.rhttp.state = LI_PRECONN_ST_ERR;
+	}
+
+	/* Rearm a new preconnect attempt. */
+	l->rx.rhttp.task->expire = MS_TO_TICKS(now_ms + 1000);
+	task_queue(l->rx.rhttp.task);
+}
+
+/* Scan the threads of listener <l> for their current count of active reverse
+ * HTTP connections. Returns the least loaded thread ID.
+ */
+static unsigned int select_thread(struct listener *l)
+{
+	unsigned long mask = l->rx.bind_thread & _HA_ATOMIC_LOAD(&tg->threads_enabled);
+	unsigned int load_min = HA_ATOMIC_LOAD(&th_ctx->nb_rhttp_conns);
+	unsigned int load_thr;
+	unsigned int ret = tid;
+	int i;
+
+	/* Returns current tid if listener runs on one thread only. */
+	if (!atleast2(mask))
+		goto end;
+
+	/* Loop over all threads and return the least loaded one. This needs to
+	 * be just an approximation so it's not important if the selected
+	 * thread load has varied since its selection.
+	 */
+	for (i = tg->base; mask; mask >>= 1, i++) {
+		if (!(mask & 0x1))
+			continue;
+
+		load_thr = HA_ATOMIC_LOAD(&ha_thread_ctx[i].nb_rhttp_conns);
+		if (load_min > load_thr) {
+			ret = i;
+			load_min = load_thr;
+		}
+	}
+
+ end:
+	return ret;
+}
+
+/* Detach <task> from its thread and assign it to the <new_tid> thread. The
+ * task is queued to be woken up on the new thread.
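+ * (An editor's aside follows before the function.)
+ */
+
+/* Editor's sketch, not part of the upstream source: select_thread() above
+ * is an approximate "pick the least loaded" scan over a thread mask. The
+ * same pattern, distilled into a self-contained form (all names here are
+ * hypothetical):
+ */
+static inline unsigned int demo_pick_least_loaded(unsigned long mask,
+                                                  const unsigned int *loads,
+                                                  unsigned int fallback)
+{
+	unsigned int best = fallback, best_load = loads[fallback];
+	unsigned int i;
+
+	for (i = 0; mask; mask >>= 1, i++) {
+		if (!(mask & 0x1))
+			continue;
+		/* a slightly stale reading is fine here: the result only
+		 * needs to be approximately right, exactly as in
+		 * select_thread().
+		 */
+		if (loads[i] < best_load) {
+			best = i;
+			best_load = loads[i];
+		}
+	}
+	return best;
+}
+
+/* (end of editor's aside; task_migrate, documented above, follows)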
+ */
+static void task_migrate(struct task *task, uint new_tid)
+{
+	task_unlink_wq(task);
+	task->expire = TICK_ETERNITY;
+	task_set_thread(task, new_tid);
+	task_wakeup(task, TASK_WOKEN_MSG);
+}
+
+struct task *rhttp_process(struct task *task, void *ctx, unsigned int state)
+{
+	struct listener *l = ctx;
+	struct connection *conn = l->rx.rhttp.pend_conn;
+
+	if (conn) {
+		/* Either the connection is on error or the connect timeout fired. */
+		if (conn->flags & CO_FL_ERROR || tick_is_expired(task->expire, now_ms)) {
+			/* If the mux is already instantiated, let it release the
+			 * connection along with its context. Else do the cleanup
+			 * directly.
+			 */
+			if (conn->mux && conn->mux->destroy) {
+				conn->mux->destroy(conn->ctx);
+			}
+			else {
+				conn_stop_tracking(conn);
+				conn_xprt_shutw(conn);
+				conn_xprt_close(conn);
+				conn_sock_shutw(conn, 0);
+				conn_ctrl_close(conn);
+
+				if (conn->destroy_cb)
+					conn->destroy_cb(conn);
+				conn_free(conn);
+			}
+
+			/* conn_free() must report preconnect failure using rhttp_notify_preconn_err(). */
+			BUG_ON(l->rx.rhttp.pend_conn);
+
+			l->rx.rhttp.task->expire = TICKS_TO_MS(now_ms);
+		}
+		else {
+			/* Spurious wakeup: the receiver task must not run while
+			 * pend_conn is neither ready nor on error.
+			 */
+			BUG_ON(!(conn->flags & CO_FL_ACT_REVERSING));
+
+			/* A connection is ready to be accepted. */
+			listener_accept(l);
+			l->rx.rhttp.task->expire = TICK_ETERNITY;
+		}
+	}
+	else {
+		struct server *srv = l->rx.rhttp.srv;
+
+		if ((state & TASK_WOKEN_ANY) != TASK_WOKEN_MSG) {
+			unsigned int new_tid = select_thread(l);
+			if (new_tid != tid) {
+				task_migrate(l->rx.rhttp.task, new_tid);
+				return task;
+			}
+		}
+
+		/* No pending reverse connection, prepare a new one. Store it in the
+		 * listener and return NULL. The connection will be returned later,
+		 * after reversal is completed.
+		 */
+		conn = new_reverse_conn(l, srv);
+		l->rx.rhttp.pend_conn = conn;
+
+		/* On success, the task will be woken up by the H2 mux after reversal. */
+		l->rx.rhttp.task->expire = conn ?
+		                           tick_add_ifset(now_ms, srv->proxy->timeout.connect) :
+		                           MS_TO_TICKS(now_ms + 1000);
+	}
+
+	return task;
+}
+
+int rhttp_bind_receiver(struct receiver *rx, char **errmsg)
+{
+	rx->flags |= RX_F_BOUND;
+	return ERR_NONE;
+}
+
+int rhttp_bind_listener(struct listener *listener, char *errmsg, int errlen)
+{
+	struct task *task;
+	struct proxy *be;
+	struct server *srv;
+	struct ist be_name, sv_name;
+	char *name = NULL;
+
+	unsigned long mask;
+	uint task_tid;
+
+	if (listener->state != LI_ASSIGNED)
+		return ERR_NONE; /* already bound */
+
+	/* Retrieve the first thread usable for this listener. */
+	mask = listener->rx.bind_thread & _HA_ATOMIC_LOAD(&tg->threads_enabled);
+	task_tid = my_ffsl(mask) + ha_tgroup_info[listener->rx.bind_tgroup].base;
+	if (!(task = task_new_on(task_tid))) {
+		snprintf(errmsg, errlen, "Out of memory.");
+		goto err;
+	}
+	task->process = rhttp_process;
+	task->context = listener;
+	listener->rx.rhttp.task = task;
+	listener->rx.rhttp.state = LI_PRECONN_ST_STOP;
+
+	/* Set maxconn, which is defined via the special kw nbconn for reverse
+	 * connect. Use a default value of 1 if not set. This guarantees that
+	 * the listener will automatically be re-enabled each time it falls
+	 * back below it due to a connection error.
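+	 *
+	 * Editor's note: with the default of 1, a single reversed connection
+	 * is enough to mark the listener full; once that connection dies,
+	 * nbconn falls back below maxconn and the generic listener machinery
+	 * re-enables it, triggering a fresh preconnect attempt.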
+ */ + listener->bind_conf->maxconn = listener->bind_conf->rhttp_nbconn; + if (!listener->bind_conf->maxconn) + listener->bind_conf->maxconn = 1; + + name = strdup(listener->bind_conf->rhttp_srvname); + if (!name) { + snprintf(errmsg, errlen, "Out of memory."); + goto err; + } + + sv_name = ist(name); + be_name = istsplit(&sv_name, '/'); + if (!istlen(sv_name)) { + snprintf(errmsg, errlen, "Invalid server name: '%s'.", name); + goto err; + } + + if (!(be = proxy_be_by_name(ist0(be_name)))) { + snprintf(errmsg, errlen, "No such backend: '%s'.", name); + goto err; + } + if (!(srv = server_find_by_name(be, ist0(sv_name)))) { + snprintf(errmsg, errlen, "No such server: '%s/%s'.", ist0(be_name), ist0(sv_name)); + goto err; + } + + if (srv->flags & SRV_F_RHTTP) { + snprintf(errmsg, errlen, "Cannot use reverse HTTP server '%s/%s' as target to a reverse bind.", ist0(be_name), ist0(sv_name)); + goto err; + } + + if (srv_is_transparent(srv)) { + snprintf(errmsg, errlen, "Cannot use transparent server '%s/%s' as target to a reverse bind.", ist0(be_name), ist0(sv_name)); + goto err; + } + + /* Check that server uses HTTP/2 either with proto or ALPN. */ + if ((!srv->mux_proto || !isteqi(srv->mux_proto->token, ist("h2"))) && + (!srv->use_ssl || !isteqi(ist(srv->ssl_ctx.alpn_str), ist("\x02h2")))) { + snprintf(errmsg, errlen, "Cannot reverse connect with server '%s/%s' unless HTTP/2 is activated on it with either proto or alpn keyword.", name, ist0(sv_name)); + goto err; + } + + /* Prevent dynamic source address settings. */ + if (((srv->conn_src.opts & CO_SRC_TPROXY_MASK) && + (srv->conn_src.opts & CO_SRC_TPROXY_MASK) != CO_SRC_TPROXY_ADDR) || + ((srv->proxy->conn_src.opts & CO_SRC_TPROXY_MASK) && + (srv->proxy->conn_src.opts & CO_SRC_TPROXY_MASK) != CO_SRC_TPROXY_ADDR)) { + snprintf(errmsg, errlen, "Cannot reverse connect with server '%s/%s' which uses dynamic source address setting.", name, ist0(sv_name)); + goto err; + } + + ha_free(&name); + + listener->rx.rhttp.srv = srv; + listener_set_state(listener, LI_LISTEN); + + return ERR_NONE; + + err: + ha_free(&name); + return ERR_ALERT | ERR_FATAL; +} + +void rhttp_enable_listener(struct listener *l) +{ + if (l->rx.rhttp.state < LI_PRECONN_ST_INIT) { + send_log(l->bind_conf->frontend, LOG_INFO, + "preconnect %s::%s: Initiating.\n", + l->bind_conf->frontend->id, l->bind_conf->rhttp_srvname); + l->rx.rhttp.state = LI_PRECONN_ST_INIT; + } + + task_wakeup(l->rx.rhttp.task, TASK_WOKEN_ANY); +} + +void rhttp_disable_listener(struct listener *l) +{ + if (l->rx.rhttp.state < LI_PRECONN_ST_FULL) { + send_log(l->bind_conf->frontend, LOG_INFO, + "preconnect %s::%s: Running with nbconn %d reached.\n", + l->bind_conf->frontend->id, l->bind_conf->rhttp_srvname, + l->bind_conf->maxconn); + l->rx.rhttp.state = LI_PRECONN_ST_FULL; + } +} + +struct connection *rhttp_accept_conn(struct listener *l, int *status) +{ + struct connection *conn = l->rx.rhttp.pend_conn; + + if (!conn) { + /* Reverse connect listener must have an explicit maxconn set + * to ensure it is re-enabled on connection error. + */ + BUG_ON(!l->bind_conf->maxconn); + + /* Instantiate a new conn if maxconn not yet exceeded. */ + if (l->nbconn <= l->bind_conf->maxconn) { + /* Try first if a new thread should be used for the new connection. */ + unsigned int new_tid = select_thread(l); + if (new_tid != tid) { + task_migrate(l->rx.rhttp.task, new_tid); + *status = CO_AC_DONE; + return NULL; + } + + /* No need to use a new thread, use the opportunity to alloc the connection right now. 
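+			 * Editor's note: select_thread() above just confirmed
+			 * that the current thread is (approximately) the least
+			 * loaded one, so the connection can be allocated here
+			 * without another migration round-trip.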
*/
+			l->rx.rhttp.pend_conn = new_reverse_conn(l, l->rx.rhttp.srv);
+			if (!l->rx.rhttp.pend_conn) {
+				*status = CO_AC_PAUSE;
+				return NULL;
+			}
+		}
+
+		*status = CO_AC_DONE;
+		return NULL;
+	}
+
+	/* listener_accept() must not be called while the pending connection
+	 * has not completed its reversal yet.
+	 */
+	BUG_ON(!(conn->flags & CO_FL_ACT_REVERSING));
+	conn->flags &= ~CO_FL_ACT_REVERSING;
+	conn->flags |= CO_FL_REVERSED;
+	conn->mux->ctl(conn, MUX_CTL_REVERSE_CONN, NULL);
+
+	l->rx.rhttp.pend_conn = NULL;
+	*status = CO_AC_NONE;
+
+	return conn;
+}
+
+void rhttp_unbind_receiver(struct listener *l)
+{
+	l->rx.flags &= ~RX_F_BOUND;
+}
+
+int rhttp_set_affinity(struct connection *conn, int new_tid)
+{
+	/* Explicitly disable connection thread migration on accept. Indeed,
+	 * it's unsafe to move a connection with its FD to another thread. Note
+	 * that active reverse task thread migration should be sufficient to
+	 * ensure the distribution of reversed connections across listener
+	 * threads.
+	 */
+	return -1;
+}
+
+int rhttp_accepting_conn(const struct receiver *rx)
+{
+	return 1;
+}
+
+INITCALL1(STG_REGISTER, protocol_register, &proto_rhttp);
+
+/* perform minimal initializations */
+static void init_rhttp()
+{
+	int i;
+
+	for (i = 0; i < MAX_THREADS; i++)
+		ha_thread_ctx[i].nb_rhttp_conns = 0;
+}
+
+INITCALL0(STG_PREPARE, init_rhttp);
diff --git a/src/proto_sockpair.c b/src/proto_sockpair.c
new file mode 100644
index 0000000..a719063
--- /dev/null
+++ b/src/proto_sockpair.c
@@ -0,0 +1,589 @@
+/*
+ * Socket Pair protocol layer (sockpair)
+ *
+ * Copyright HAProxy Technologies - William Lallemand <wlallemand@haproxy.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ * + */ + +#include <ctype.h> +#include <errno.h> +#include <pwd.h> +#include <grp.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syslog.h> +#include <time.h> + +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/un.h> + +#include <haproxy/api.h> +#include <haproxy/connection.h> +#include <haproxy/errors.h> +#include <haproxy/fd.h> +#include <haproxy/freq_ctr.h> +#include <haproxy/global.h> +#include <haproxy/list.h> +#include <haproxy/listener.h> +#include <haproxy/log.h> +#include <haproxy/protocol.h> +#include <haproxy/proto_sockpair.h> +#include <haproxy/sock.h> +#include <haproxy/tools.h> +#include <haproxy/version.h> + + +static int sockpair_bind_listener(struct listener *listener, char *errmsg, int errlen); +static void sockpair_enable_listener(struct listener *listener); +static void sockpair_disable_listener(struct listener *listener); +static int sockpair_connect_server(struct connection *conn, int flags); +static int sockpair_accepting_conn(const struct receiver *rx); +struct connection *sockpair_accept_conn(struct listener *l, int *status); + +struct proto_fam proto_fam_sockpair = { + .name = "sockpair", + .sock_domain = AF_CUST_SOCKPAIR, + .sock_family = AF_UNIX, + .sock_addrlen = sizeof(struct sockaddr_un), + .l3_addrlen = sizeof(((struct sockaddr_un*)0)->sun_path), + .addrcmp = NULL, + .bind = sockpair_bind_receiver, + .get_src = NULL, + .get_dst = NULL, +}; + +/* Note: must not be declared <const> as its list will be overwritten */ +struct protocol proto_sockpair = { + .name = "sockpair", + + /* connection layer */ + .xprt_type = PROTO_TYPE_STREAM, + .listen = sockpair_bind_listener, + .enable = sockpair_enable_listener, + .disable = sockpair_disable_listener, + .add = default_add_listener, + .unbind = default_unbind_listener, + .accept_conn = sockpair_accept_conn, + .ctrl_init = sock_conn_ctrl_init, + .ctrl_close = sock_conn_ctrl_close, + .connect = sockpair_connect_server, + .drain = sock_drain, + .check_events = sock_check_events, + .ignore_events = sock_ignore_events, + + /* binding layer */ + /* Note: suspend/resume not supported */ + + /* address family */ + .fam = &proto_fam_sockpair, + + /* socket layer */ + .proto_type = PROTO_TYPE_STREAM, + .sock_type = SOCK_STREAM, + .sock_prot = 0, + .rx_enable = sock_enable, + .rx_disable = sock_disable, + .rx_unbind = sock_unbind, + .rx_listening = sockpair_accepting_conn, + .default_iocb = sock_accept_iocb, + .receivers = LIST_HEAD_INIT(proto_sockpair.receivers), + .nb_receivers = 0, +}; + +INITCALL1(STG_REGISTER, protocol_register, &proto_sockpair); + +/* Enable receipt of incoming connections for listener <l>. The receiver must + * still be valid. + */ +static void sockpair_enable_listener(struct listener *l) +{ + fd_want_recv_safe(l->rx.fd); +} + +/* Disable receipt of incoming connections for listener <l>. The receiver must + * still be valid. + */ +static void sockpair_disable_listener(struct listener *l) +{ + fd_stop_recv(l->rx.fd); +} + +/* Binds receiver <rx>, and assigns rx->iocb and rx->owner as the callback + * and context, respectively, with ->bind_thread as the thread mask. Returns an + * error code made of ERR_* bits on failure or ERR_NONE on success. On failure, + * an error message may be passed into <errmsg>. Note that the binding address + * is only an FD to receive the incoming FDs on. Thus by definition there is no + * real "bind" operation, this only completes the receiver. Such FDs are not + * inherited upon reload. 
+ */
+int sockpair_bind_receiver(struct receiver *rx, char **errmsg)
+{
+	int err;
+
+	/* ensure we never return garbage */
+	if (errmsg)
+		*errmsg = 0;
+
+	err = ERR_NONE;
+
+	if (rx->flags & RX_F_BOUND)
+		return ERR_NONE;
+
+	if (rx->flags & RX_F_MUST_DUP) {
+		/* this is a secondary receiver that is an exact copy of a
+		 * reference which must already be bound (or has failed).
+		 * We'll try to dup() the other one's FD and take it. We
+		 * try hard not to reconfigure the socket since it's shared.
+		 */
+		BUG_ON(!rx->shard_info);
+		if (!(rx->shard_info->ref->flags & RX_F_BOUND)) {
+			/* it's assumed that the first one has already reported
+			 * the error, let's not spam with another one, and do
+			 * not set ERR_ALERT.
+			 */
+			err |= ERR_RETRYABLE;
+			goto bind_ret_err;
+		}
+		/* taking the other one's FD will result in it being marked
+		 * extern and being dup()ed. Let's mark the receiver as
+		 * inherited so that it properly bypasses all second-stage
+		 * setup and avoids being passed to new processes.
+		 */
+		rx->flags |= RX_F_INHERITED;
+		rx->fd = rx->shard_info->ref->fd;
+	}
+
+	if (rx->fd == -1) {
+		err |= ERR_FATAL | ERR_ALERT;
+		memprintf(errmsg, "sockpair may only be used with inherited FDs");
+		goto bind_return;
+	}
+
+	if (rx->fd >= global.maxsock) {
+		err |= ERR_FATAL | ERR_ABORT | ERR_ALERT;
+		memprintf(errmsg, "not enough free sockets (raise '-n' parameter)");
+		goto bind_close_return;
+	}
+
+	if (fd_set_nonblock(rx->fd) == -1) {
+		err |= ERR_FATAL | ERR_ALERT;
+		memprintf(errmsg, "cannot make socket non-blocking");
+		goto bind_close_return;
+	}
+
+	rx->flags |= RX_F_BOUND;
+
+	fd_insert(rx->fd, rx->owner, rx->iocb, rx->bind_tgroup, rx->bind_thread);
+	return err;
+
+ bind_return:
+	if (errmsg && *errmsg)
+		memprintf(errmsg, "%s for [fd %d]", *errmsg, rx->fd);
+
+ bind_ret_err:
+	return err;
+
+ bind_close_return:
+	close(rx->fd);
+	goto bind_return;
+}
+
+/* This function changes the state from ASSIGNED to LISTEN. The socket is NOT
+ * enabled for polling. The return value is composed from ERR_NONE,
+ * ERR_RETRYABLE and ERR_FATAL. It may return a warning or an error message in
+ * <errmsg> if the message is at most <errlen> bytes long (including '\0').
+ * Note that <errmsg> may be NULL if <errlen> is also zero.
+ */
+static int sockpair_bind_listener(struct listener *listener, char *errmsg, int errlen)
+{
+	int err;
+	char *msg = NULL;
+
+	err = ERR_NONE;
+
+	/* ensure we never return garbage */
+	if (errlen)
+		*errmsg = 0;
+
+	if (listener->state != LI_ASSIGNED)
+		return ERR_NONE; /* already bound */
+
+	if (!(listener->rx.flags & RX_F_BOUND)) {
+		msg = "receiving socket not bound";
+		goto err_return;
+	}
+
+	listener_set_state(listener, LI_LISTEN);
+	return err;
+
+ err_return:
+	if (msg && errlen)
+		snprintf(errmsg, errlen, "%s [fd %d]", msg, listener->rx.fd);
+	return err;
+}
+
+/*
+ * Send FD over a unix socket
+ *
+ * <send_fd> is the FD to send
+ * <fd> is the fd of the unix socket to use for the transfer
+ *
+ * The iobuf variable could be used in the future to enhance the protocol.
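+ *
+ * Editor's note: at least one byte of regular data (iobuf here) has to
+ * accompany the SCM_RIGHTS ancillary payload; on most platforms a
+ * sendmsg() carrying only control data would not be transmitted.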
+ */
+int send_fd_uxst(int fd, int send_fd)
+{
+	char iobuf[2];
+	struct iovec iov;
+	struct msghdr msghdr;
+	char cmsgbuf[CMSG_SPACE(sizeof(int))];
+	struct cmsghdr *cmsg;
+	int *fdptr;
+
+	iov.iov_base = iobuf;
+	iov.iov_len = sizeof(iobuf);
+
+	memset(&msghdr, 0, sizeof(msghdr));
+	msghdr.msg_iov = &iov;
+	msghdr.msg_iovlen = 1;
+
+	/* Now send the fd */
+	msghdr.msg_control = cmsgbuf;
+	msghdr.msg_controllen = CMSG_SPACE(sizeof(int));
+
+	cmsg = CMSG_FIRSTHDR(&msghdr);
+	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+
+	fdptr = (int *)CMSG_DATA(cmsg);
+	memcpy(fdptr, &send_fd, sizeof(send_fd));
+
+	if (sendmsg(fd, &msghdr, 0) != sizeof(iobuf)) {
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * This function works like uxst_connect_server() but instead of creating a
+ * socket and establishing a connection, it creates a pair of connected
+ * sockets, and sends one of them through the destination FD. The destination
+ * FD is stored in conn->dst->sin_addr.s_addr during configuration parsing.
+ *
+ * conn->target may point either to a valid server or to a backend. Only
+ * OBJ_TYPE_PROXY and OBJ_TYPE_SERVER are supported. The <data> parameter is a
+ * boolean indicating whether there are data waiting to be sent or not, in
+ * order to adjust data write polling on some platforms. The <delack> argument
+ * is ignored.
+ *
+ * Note that a pending send_proxy message accounts for data.
+ *
+ * It can return one of :
+ *  - SF_ERR_NONE if everything's OK
+ *  - SF_ERR_SRVTO if there are no more servers
+ *  - SF_ERR_SRVCL if the connection was refused by the server
+ *  - SF_ERR_PRXCOND if the connection has been limited by the proxy (maxconn)
+ *  - SF_ERR_RESOURCE if a system resource is lacking (eg: fd limits, ports, ...)
+ *  - SF_ERR_INTERNAL for any other purely internal errors
+ * Additionally, in the case of SF_ERR_RESOURCE, an emergency log will be emitted.
+ *
+ * The connection's fd is inserted only when SF_ERR_NONE is returned, otherwise
+ * it's invalid and the caller has nothing to do.
+ */
+static int sockpair_connect_server(struct connection *conn, int flags)
+{
+	int sv[2], fd, dst_fd = -1;
+
+	BUG_ON(!conn->dst);
+
+	/* the FD is stored in the sockaddr struct */
+	dst_fd = ((struct sockaddr_in *)conn->dst)->sin_addr.s_addr;
+
+	if (obj_type(conn->target) != OBJ_TYPE_PROXY &&
+	    obj_type(conn->target) != OBJ_TYPE_SERVER) {
+		conn->flags |= CO_FL_ERROR;
+		return SF_ERR_INTERNAL;
+	}
+
+	if (socketpair(PF_UNIX, SOCK_STREAM, 0, sv) == -1) {
+		ha_alert("socketpair(): Cannot create socketpair. Giving up.\n");
+		conn->flags |= CO_FL_ERROR;
+		return SF_ERR_RESOURCE;
+	}
+
+	fd = conn->handle.fd = sv[1];
+
+	if (fd >= global.maxsock) {
+		/* do not log anything there, it's a normal condition when this option
+		 * is used to serialize connections to a server !
+		 */
+		ha_alert("socket(): not enough free sockets. Raise -n argument. Giving up.\n");
+		close(sv[0]);
+		close(sv[1]);
+		conn->err_code = CO_ER_CONF_FDLIM;
+		conn->flags |= CO_FL_ERROR;
+		return SF_ERR_PRXCOND; /* it is a configuration limit */
+	}
+
+	if (fd_set_nonblock(fd) == -1) {
+		qfprintf(stderr, "Cannot set client socket to non-blocking mode.\n");
+		close(sv[0]);
+		close(sv[1]);
+		conn->err_code = CO_ER_SOCK_ERR;
+		conn->flags |= CO_FL_ERROR;
+		return SF_ERR_INTERNAL;
+	}
+
+	if (master == 1 && fd_set_cloexec(fd) == -1) {
+		ha_alert("Cannot set CLOEXEC on client socket.\n");
+		close(sv[0]);
+		close(sv[1]);
+		conn->err_code = CO_ER_SOCK_ERR;
+		conn->flags |= CO_FL_ERROR;
+		return SF_ERR_INTERNAL;
+	}
+
+	if (global.tune.server_sndbuf)
+		setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &global.tune.server_sndbuf, sizeof(global.tune.server_sndbuf));
+
+	if (global.tune.server_rcvbuf)
+		setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &global.tune.server_rcvbuf, sizeof(global.tune.server_rcvbuf));
+
+	/* The new socket is sent to the other side; it should be retrieved and
+	 * considered as an 'accept' socket on the server side */
+	if (send_fd_uxst(dst_fd, sv[0]) == -1) {
+		ha_alert("socketpair: Cannot transfer the fd %d over sockpair@%d. Giving up.\n", sv[0], dst_fd);
+		close(sv[0]);
+		close(sv[1]);
+		conn->err_code = CO_ER_SOCK_ERR;
+		conn->flags |= CO_FL_ERROR;
+		return SF_ERR_INTERNAL;
+	}
+
+	close(sv[0]); /* we don't need this side anymore */
+
+	conn->flags &= ~CO_FL_WAIT_L4_CONN;
+
+	/* Prepare to send a few handshakes related to the on-wire protocol. */
+	if (conn->send_proxy_ofs)
+		conn->flags |= CO_FL_SEND_PROXY;
+
+	conn_ctrl_init(conn); /* registers the FD */
+	HA_ATOMIC_AND(&fdtab[fd].state, ~FD_LINGER_RISK); /* no need to disable lingering */
+
+	return SF_ERR_NONE; /* connection is OK */
+}
+
+/*
+ * Receives a file descriptor transferred from a unix socket.
+ *
+ * Returns -1 on failure, or a socket fd on success.
+ *
+ * The iobuf variable could be used in the future to enhance the protocol.
+ */
+int recv_fd_uxst(int sock)
+{
+	struct msghdr msghdr;
+	struct iovec iov;
+	char iobuf[2];
+	char cmsgbuf[CMSG_SPACE(sizeof(int))];
+	struct cmsghdr *cmsg;
+	int recv_fd = -1;
+	int ret = -1;
+
+	memset(&msghdr, 0, sizeof(msghdr));
+
+	iov.iov_base = iobuf;
+	iov.iov_len = sizeof(iobuf);
+
+	msghdr.msg_iov = &iov;
+	msghdr.msg_iovlen = 1;
+
+	msghdr.msg_control = cmsgbuf;
+	msghdr.msg_controllen = CMSG_SPACE(sizeof(int));
+
+	while (1) {
+		ret = recvmsg(sock, &msghdr, 0);
+		if (ret == -1 && errno == EINTR)
+			continue;
+		else
+			break;
+	}
+
+	if (ret == -1)
+		return ret;
+
+	cmsg = CMSG_FIRSTHDR(&msghdr);
+	if (cmsg && cmsg->cmsg_level == SOL_SOCKET &&
+	    cmsg->cmsg_type == SCM_RIGHTS) {
+		size_t totlen = cmsg->cmsg_len - CMSG_LEN(0);
+		memcpy(&recv_fd, CMSG_DATA(cmsg), totlen);
+	}
+	return recv_fd;
+}
+
+/* Tests if the receiver supports accepting connections. Returns positive on
+ * success, 0 if not possible, negative if the socket is non-recoverable. In
+ * practice zero is never returned since we don't support suspending sockets.
+ * The real test consists in verifying we have a connected SOCK_STREAM of
+ * family AF_UNIX.
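+ * (An editor's aside precedes the function below.)
+ */
+
+/* Editor's sketch, not part of the upstream source: a self-contained round
+ * trip of the two helpers above. One end of a socketpair ships an fd with
+ * send_fd_uxst(), the other end recovers a dup of it with recv_fd_uxst();
+ * both ends then refer to the same open file. Assumes <unistd.h>; the
+ * function name is hypothetical.
+ */
+static int demo_fd_passing(void)
+{
+	int pair[2], pipefd[2], duped;
+	char c = 'x';
+
+	if (socketpair(AF_UNIX, SOCK_STREAM, 0, pair) == -1 || pipe(pipefd) == -1)
+		return -1;
+
+	/* ship the read end of the pipe across the socketpair */
+	if (send_fd_uxst(pair[0], pipefd[0]) == -1)
+		return -1;
+	duped = recv_fd_uxst(pair[1]);
+	if (duped == -1)
+		return -1;
+
+	/* writing into the pipe is now visible through the received fd */
+	if (write(pipefd[1], &c, 1) != 1 || read(duped, &c, 1) != 1)
+		return -1;
+
+	close(duped); close(pair[0]); close(pair[1]);
+	close(pipefd[0]); close(pipefd[1]);
+	return 0;
+}
+
+/* (end of editor's aside; sockpair_accepting_conn, documented above, follows)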
+ */ +static int sockpair_accepting_conn(const struct receiver *rx) +{ + struct sockaddr sa; + socklen_t len; + int val; + + len = sizeof(val); + if (getsockopt(rx->fd, SOL_SOCKET, SO_TYPE, &val, &len) == -1) + return -1; + + if (val != SOCK_STREAM) + return -1; + + len = sizeof(sa); + if (getsockname(rx->fd, &sa, &len) != 0) + return -1; + + if (sa.sa_family != AF_UNIX) + return -1; + + len = sizeof(val); + if (getsockopt(rx->fd, SOL_SOCKET, SO_ACCEPTCONN, &val, &len) == -1) + return -1; + + /* Note: cannot be a listening socket, must be established */ + if (val) + return -1; + + return 1; +} + +/* Accept an incoming connection from listener <l>, and return it, as well as + * a CO_AC_* status code into <status> if not null. Null is returned on error. + * <l> must be a valid listener with a valid frontend. + */ +struct connection *sockpair_accept_conn(struct listener *l, int *status) +{ + struct proxy *p = l->bind_conf->frontend; + struct connection *conn = NULL; + int ret; + int cfd; + + if ((cfd = recv_fd_uxst(l->rx.fd)) != -1) + fd_set_nonblock(cfd); + + if (likely(cfd != -1)) { + /* Perfect, the connection was accepted */ + conn = conn_new(&l->obj_type); + if (!conn) + goto fail_conn; + + if (!sockaddr_alloc(&conn->src, NULL, 0)) + goto fail_addr; + + /* just like with UNIX sockets, only the family is filled */ + conn->src->ss_family = AF_UNIX; + conn->handle.fd = cfd; + ret = CO_AC_DONE; + goto done; + } + + switch (errno) { +#if defined(EWOULDBLOCK) && defined(EAGAIN) && EWOULDBLOCK != EAGAIN + case EWOULDBLOCK: +#endif + case EAGAIN: + ret = CO_AC_DONE; /* nothing more to accept */ + if (fdtab[l->rx.fd].state & (FD_POLL_HUP|FD_POLL_ERR)) { + /* the listening socket might have been disabled in a shared + * process and we're a collateral victim. We'll just pause for + * a while in case it comes back. In the mean time, we need to + * clear this sticky flag. + */ + _HA_ATOMIC_AND(&fdtab[l->rx.fd].state, ~(FD_POLL_HUP|FD_POLL_ERR)); + ret = CO_AC_PAUSE; + } + fd_cant_recv(l->rx.fd); + break; + + case EINVAL: + /* might be trying to accept on a shut fd (eg: soft stop) */ + ret = CO_AC_PAUSE; + break; + + case EINTR: + case ECONNABORTED: + ret = CO_AC_RETRY; + break; + + case ENFILE: + if (p) + send_log(p, LOG_EMERG, + "Proxy %s reached system FD limit (maxsock=%d). Please check system tunables.\n", + p->id, global.maxsock); + ret = CO_AC_PAUSE; + break; + + case EMFILE: + if (p) + send_log(p, LOG_EMERG, + "Proxy %s reached process FD limit (maxsock=%d). Please check 'ulimit-n' and restart.\n", + p->id, global.maxsock); + ret = CO_AC_PAUSE; + break; + + case ENOBUFS: + case ENOMEM: + if (p) + send_log(p, LOG_EMERG, + "Proxy %s reached system memory limit (maxsock=%d). 
Please check system tunables.\n", + p->id, global.maxsock); + ret = CO_AC_PAUSE; + break; + + default: + /* unexpected result, let's give up and let other tasks run */ + ret = CO_AC_YIELD; + } + done: + if (status) + *status = ret; + return conn; + + fail_addr: + conn_free(conn); + conn = NULL; + fail_conn: + ret = CO_AC_PAUSE; + goto done; +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/proto_tcp.c b/src/proto_tcp.c new file mode 100644 index 0000000..45ce27f --- /dev/null +++ b/src/proto_tcp.c @@ -0,0 +1,834 @@ +/* + * AF_INET/AF_INET6 SOCK_STREAM protocol layer (tcp) + * + * Copyright 2000-2013 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <ctype.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include <netinet/tcp.h> +#include <netinet/in.h> + +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/connection.h> +#include <haproxy/errors.h> +#include <haproxy/fd.h> +#include <haproxy/global.h> +#include <haproxy/list.h> +#include <haproxy/listener.h> +#include <haproxy/log.h> +#include <haproxy/namespace.h> +#include <haproxy/port_range.h> +#include <haproxy/proto_tcp.h> +#include <haproxy/protocol.h> +#include <haproxy/proxy-t.h> +#include <haproxy/sock.h> +#include <haproxy/sock_inet.h> +#include <haproxy/tools.h> + + +static int tcp_bind_listener(struct listener *listener, char *errmsg, int errlen); +static int tcp_suspend_receiver(struct receiver *rx); +static int tcp_resume_receiver(struct receiver *rx); +static void tcp_enable_listener(struct listener *listener); +static void tcp_disable_listener(struct listener *listener); + +/* Note: must not be declared <const> as its list will be overwritten */ +struct protocol proto_tcpv4 = { + .name = "tcpv4", + + /* connection layer */ + .xprt_type = PROTO_TYPE_STREAM, + .listen = tcp_bind_listener, + .enable = tcp_enable_listener, + .disable = tcp_disable_listener, + .add = default_add_listener, + .unbind = default_unbind_listener, + .suspend = default_suspend_listener, + .resume = default_resume_listener, + .accept_conn = sock_accept_conn, + .ctrl_init = sock_conn_ctrl_init, + .ctrl_close = sock_conn_ctrl_close, + .connect = tcp_connect_server, + .drain = sock_drain, + .check_events = sock_check_events, + .ignore_events = sock_ignore_events, + + /* binding layer */ + .rx_suspend = tcp_suspend_receiver, + .rx_resume = tcp_resume_receiver, + + /* address family */ + .fam = &proto_fam_inet4, + + /* socket layer */ + .proto_type = PROTO_TYPE_STREAM, + .sock_type = SOCK_STREAM, + .sock_prot = IPPROTO_TCP, + .rx_enable = sock_enable, + .rx_disable = sock_disable, + .rx_unbind = sock_unbind, + .rx_listening = sock_accepting_conn, + .default_iocb = sock_accept_iocb, + .receivers = LIST_HEAD_INIT(proto_tcpv4.receivers), + .nb_receivers = 0, +#ifdef SO_REUSEPORT + .flags = PROTO_F_REUSEPORT_SUPPORTED, +#endif +}; + +INITCALL1(STG_REGISTER, protocol_register, &proto_tcpv4); + +/* Note: must not be declared <const> as its list will be overwritten */ +struct protocol proto_tcpv6 = { + .name = "tcpv6", + + /* connection layer */ + .xprt_type = PROTO_TYPE_STREAM, + .listen = tcp_bind_listener, + 
.enable = tcp_enable_listener,
+	.disable = tcp_disable_listener,
+	.add = default_add_listener,
+	.unbind = default_unbind_listener,
+	.suspend = default_suspend_listener,
+	.resume = default_resume_listener,
+	.accept_conn = sock_accept_conn,
+	.ctrl_init = sock_conn_ctrl_init,
+	.ctrl_close = sock_conn_ctrl_close,
+	.connect = tcp_connect_server,
+	.drain = sock_drain,
+	.check_events = sock_check_events,
+	.ignore_events = sock_ignore_events,
+
+	/* binding layer */
+	.rx_suspend = tcp_suspend_receiver,
+	.rx_resume = tcp_resume_receiver,
+
+	/* address family */
+	.fam = &proto_fam_inet6,
+
+	/* socket layer */
+	.proto_type = PROTO_TYPE_STREAM,
+	.sock_type = SOCK_STREAM,
+	.sock_prot = IPPROTO_TCP,
+	.rx_enable = sock_enable,
+	.rx_disable = sock_disable,
+	.rx_unbind = sock_unbind,
+	.rx_listening = sock_accepting_conn,
+	.default_iocb = sock_accept_iocb,
+	.receivers = LIST_HEAD_INIT(proto_tcpv6.receivers),
+	.nb_receivers = 0,
+#ifdef SO_REUSEPORT
+	.flags = PROTO_F_REUSEPORT_SUPPORTED,
+#endif
+};
+
+INITCALL1(STG_REGISTER, protocol_register, &proto_tcpv6);
+
+/* Binds ipv4/ipv6 address <local> to socket <fd>, unless <flags> is set, in
+ * which case we try to bind <remote>. <flags> is a 2-bit field consisting of :
+ *  - 0 : ignore remote address (may even be a NULL pointer)
+ *  - 1 : use provided address
+ *  - 2 : use provided port
+ *  - 3 : use both
+ *
+ * The only foreign binding method still supported is :
+ *  - linux_tproxy: we directly bind to the foreign address
+ * This function returns 0 when everything's OK, 1 if it could not bind to the
+ * local address, 2 if it could not bind to the foreign address.
+ */
+int tcp_bind_socket(int fd, int flags, struct sockaddr_storage *local, struct sockaddr_storage *remote)
+{
+	struct sockaddr_storage bind_addr;
+	int foreign_ok = 0;
+	int ret;
+	static THREAD_LOCAL int ip_transp_working = 1;
+	static THREAD_LOCAL int ip6_transp_working = 1;
+
+	switch (local->ss_family) {
+	case AF_INET:
+		if (flags && ip_transp_working) {
+			/* This deserves some explanation. Some platforms will support
+			 * multiple combinations of certain methods, so we try the
+			 * supported ones until one succeeds.
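+			 *
+			 * Editor's note: on Linux, sock_inet4_make_foreign()
+			 * typically relies on IP_TRANSPARENT, which requires
+			 * CAP_NET_ADMIN; the THREAD_LOCAL flags above cache a
+			 * failure so the probe is not repeated on every
+			 * connection.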
+			 */
+			if (sock_inet4_make_foreign(fd))
+				foreign_ok = 1;
+			else
+				ip_transp_working = 0;
+		}
+		break;
+	case AF_INET6:
+		if (flags && ip6_transp_working) {
+			if (sock_inet6_make_foreign(fd))
+				foreign_ok = 1;
+			else
+				ip6_transp_working = 0;
+		}
+		break;
+	}
+
+	if (flags) {
+		memset(&bind_addr, 0, sizeof(bind_addr));
+		bind_addr.ss_family = remote->ss_family;
+		switch (remote->ss_family) {
+		case AF_INET:
+			if (flags & 1)
+				((struct sockaddr_in *)&bind_addr)->sin_addr = ((struct sockaddr_in *)remote)->sin_addr;
+			if (flags & 2)
+				((struct sockaddr_in *)&bind_addr)->sin_port = ((struct sockaddr_in *)remote)->sin_port;
+			break;
+		case AF_INET6:
+			if (flags & 1)
+				((struct sockaddr_in6 *)&bind_addr)->sin6_addr = ((struct sockaddr_in6 *)remote)->sin6_addr;
+			if (flags & 2)
+				((struct sockaddr_in6 *)&bind_addr)->sin6_port = ((struct sockaddr_in6 *)remote)->sin6_port;
+			break;
+		default:
+			/* we don't want to try to bind to an unknown address family */
+			foreign_ok = 0;
+		}
+	}
+
+	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
+	if (foreign_ok) {
+		if (is_inet_addr(&bind_addr)) {
+			ret = bind(fd, (struct sockaddr *)&bind_addr, get_addr_len(&bind_addr));
+			if (ret < 0)
+				return 2;
+		}
+	}
+	else {
+		if (is_inet_addr(local)) {
+			ret = bind(fd, (struct sockaddr *)local, get_addr_len(local));
+			if (ret < 0)
+				return 1;
+		}
+	}
+
+	if (!flags)
+		return 0;
+
+	if (!foreign_ok)
+		/* we could not bind to a foreign address */
+		return 2;
+
+	return 0;
+}
+
+/*
+ * This function initiates a TCP connection establishment to the target assigned
+ * to connection <conn> using (si->{target,dst}). A source address may be
+ * pointed to by conn->src in case of transparent proxying. Normal source
+ * bind addresses are still determined locally (due to the possible need of a
+ * source port). conn->target may point either to a valid server or to a
+ * backend. Only OBJ_TYPE_PROXY and OBJ_TYPE_SERVER are supported. The <data>
+ * parameter is a boolean indicating whether there are data waiting to be sent
+ * or not, in order to adjust data write polling and, on some platforms, the
+ * ability to avoid an empty initial ACK. The <flags> argument allows the
+ * caller to force using a delayed ACK when establishing the connection :
+ *  - 0 = no delayed ACK unless data are advertised and backend has tcp-smart-connect
+ *  - CONNECT_DELACK_SMART_CONNECT = delayed ACK if backend has tcp-smart-connect, regardless of data
+ *  - CONNECT_DELACK_ALWAYS = delayed ACK regardless of backend options
+ *
+ * Note that a pending send_proxy message accounts for data.
+ *
+ * It can return one of :
+ *  - SF_ERR_NONE if everything's OK
+ *  - SF_ERR_SRVTO if there are no more servers
+ *  - SF_ERR_SRVCL if the connection was refused by the server
+ *  - SF_ERR_PRXCOND if the connection has been limited by the proxy (maxconn)
+ *  - SF_ERR_RESOURCE if a system resource is lacking (eg: fd limits, ports, ...)
+ *  - SF_ERR_INTERNAL for any other purely internal errors
+ * Additionally, in the case of SF_ERR_RESOURCE, an emergency log will be emitted.
+ *
+ * The connection's fd is inserted only when SF_ERR_NONE is returned, otherwise
+ * it's invalid and the caller has nothing to do.
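+ *
+ * (An editor's aside precedes the function.)
+ */
+
+/* Editor's sketch, not part of the upstream source: the essence of the
+ * non-blocking connect dance that tcp_connect_server() implements through
+ * haproxy's fd layer, reduced to plain poll(). EINPROGRESS means "in
+ * flight"; the final verdict is read later from SO_ERROR once the fd
+ * reports writability. Assumes <poll.h>; the function name and the 5s
+ * timeout are illustrative.
+ */
+static int demo_nonblocking_connect(int fd, const struct sockaddr *sa, socklen_t len)
+{
+	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
+	int err = 0;
+	socklen_t errlen = sizeof(err);
+
+	if (connect(fd, sa, len) == 0)
+		return 0;                      /* immediate success (e.g. loopback) */
+	if (errno != EINPROGRESS && errno != EALREADY)
+		return -1;                     /* hard failure */
+
+	if (poll(&pfd, 1, 5000) <= 0)
+		return -1;                     /* timeout or poll error */
+	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &errlen) == -1 || err)
+		return -1;                     /* refused, unreachable, ... */
+	return 0;
+}
+
+/* (end of editor's aside; tcp_connect_server, documented above, follows)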
+ */ + +int tcp_connect_server(struct connection *conn, int flags) +{ + int fd; + struct server *srv; + struct proxy *be; + struct conn_src *src; + int use_fastopen = 0; + struct sockaddr_storage *addr; + + BUG_ON(!conn->dst); + + conn->flags |= CO_FL_WAIT_L4_CONN; /* connection in progress */ + + switch (obj_type(conn->target)) { + case OBJ_TYPE_PROXY: + be = __objt_proxy(conn->target); + srv = NULL; + break; + case OBJ_TYPE_SERVER: + srv = __objt_server(conn->target); + be = srv->proxy; + /* Make sure we check that we have data before activating + * TFO, or we could trigger a kernel issue whereby after + * a successful connect() == 0, any subsequent connect() + * will return EINPROGRESS instead of EISCONN. + */ + use_fastopen = (srv->flags & SRV_F_FASTOPEN) && + ((flags & (CONNECT_CAN_USE_TFO | CONNECT_HAS_DATA)) == + (CONNECT_CAN_USE_TFO | CONNECT_HAS_DATA)); + break; + default: + conn->flags |= CO_FL_ERROR; + return SF_ERR_INTERNAL; + } + + fd = conn->handle.fd = sock_create_server_socket(conn); + + if (fd == -1) { + qfprintf(stderr, "Cannot get a server socket.\n"); + + if (errno == ENFILE) { + conn->err_code = CO_ER_SYS_FDLIM; + send_log(be, LOG_EMERG, + "Proxy %s reached system FD limit (maxsock=%d). Please check system tunables.\n", + be->id, global.maxsock); + } + else if (errno == EMFILE) { + conn->err_code = CO_ER_PROC_FDLIM; + send_log(be, LOG_EMERG, + "Proxy %s reached process FD limit (maxsock=%d). Please check 'ulimit-n' and restart.\n", + be->id, global.maxsock); + } + else if (errno == ENOBUFS || errno == ENOMEM) { + conn->err_code = CO_ER_SYS_MEMLIM; + send_log(be, LOG_EMERG, + "Proxy %s reached system memory limit (maxsock=%d). Please check system tunables.\n", + be->id, global.maxsock); + } + else if (errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) { + conn->err_code = CO_ER_NOPROTO; + } + else + conn->err_code = CO_ER_SOCK_ERR; + + /* this is a resource error */ + conn->flags |= CO_FL_ERROR; + return SF_ERR_RESOURCE; + } + + if (fd >= global.maxsock) { + /* do not log anything there, it's a normal condition when this option + * is used to serialize connections to a server ! + */ + ha_alert("socket(): not enough free sockets. Raise -n argument. 
Giving up.\n"); + close(fd); + conn->err_code = CO_ER_CONF_FDLIM; + conn->flags |= CO_FL_ERROR; + return SF_ERR_PRXCOND; /* it is a configuration limit */ + } + + if (fd_set_nonblock(fd) == -1 || + (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)) == -1)) { + qfprintf(stderr,"Cannot set client socket to non blocking mode.\n"); + close(fd); + conn->err_code = CO_ER_SOCK_ERR; + conn->flags |= CO_FL_ERROR; + return SF_ERR_INTERNAL; + } + + if (master == 1 && fd_set_cloexec(fd) == -1) { + ha_alert("Cannot set CLOEXEC on client socket.\n"); + close(fd); + conn->err_code = CO_ER_SOCK_ERR; + conn->flags |= CO_FL_ERROR; + return SF_ERR_INTERNAL; + } + + if (be->options & PR_O_TCP_SRV_KA) { + setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one)); + +#ifdef TCP_KEEPCNT + if (be->srvtcpka_cnt) + setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &be->srvtcpka_cnt, sizeof(be->srvtcpka_cnt)); +#endif + +#ifdef TCP_KEEPIDLE + if (be->srvtcpka_idle) + setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &be->srvtcpka_idle, sizeof(be->srvtcpka_idle)); +#endif + +#ifdef TCP_KEEPINTVL + if (be->srvtcpka_intvl) + setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &be->srvtcpka_intvl, sizeof(be->srvtcpka_intvl)); +#endif + } + + /* allow specific binding : + * - server-specific at first + * - proxy-specific next + */ + if (srv && srv->conn_src.opts & CO_SRC_BIND) + src = &srv->conn_src; + else if (be->conn_src.opts & CO_SRC_BIND) + src = &be->conn_src; + else + src = NULL; + + if (src) { + int ret, flags = 0; + + if (conn->src && is_inet_addr(conn->src)) { + switch (src->opts & CO_SRC_TPROXY_MASK) { + case CO_SRC_TPROXY_CLI: + case CO_SRC_TPROXY_ADDR: + flags = 3; + break; + case CO_SRC_TPROXY_CIP: + case CO_SRC_TPROXY_DYN: + flags = 1; + break; + } + } + +#ifdef SO_BINDTODEVICE + /* Note: this might fail if not CAP_NET_RAW */ + if (src->iface_name) + setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, src->iface_name, src->iface_len + 1); +#endif + + if (src->sport_range) { + int attempts = 10; /* should be more than enough to find a spare port */ + struct sockaddr_storage sa; + + ret = 1; + memcpy(&sa, &src->source_addr, sizeof(sa)); + + do { + /* note: in case of retry, we may have to release a previously + * allocated port, hence this loop's construct. + */ + port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port); + fdinfo[fd].port_range = NULL; + + if (!attempts) + break; + attempts--; + + fdinfo[fd].local_port = port_range_alloc_port(src->sport_range); + if (!fdinfo[fd].local_port) { + conn->err_code = CO_ER_PORT_RANGE; + break; + } + + fdinfo[fd].port_range = src->sport_range; + set_host_port(&sa, fdinfo[fd].local_port); + + ret = tcp_bind_socket(fd, flags, &sa, conn->src); + if (ret != 0) + conn->err_code = CO_ER_CANT_BIND; + } while (ret != 0); /* binding NOK */ + } + else { +#ifdef IP_BIND_ADDRESS_NO_PORT + static THREAD_LOCAL int bind_address_no_port = 1; + setsockopt(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT, (const void *) &bind_address_no_port, sizeof(int)); +#endif + ret = tcp_bind_socket(fd, flags, &src->source_addr, conn->src); + if (ret != 0) + conn->err_code = CO_ER_CANT_BIND; + } + + if (unlikely(ret != 0)) { + port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port); + fdinfo[fd].port_range = NULL; + close(fd); + + if (ret == 1) { + ha_alert("Cannot bind to source address before connect() for backend %s. 
Aborting.\n", + be->id); + send_log(be, LOG_EMERG, + "Cannot bind to source address before connect() for backend %s.\n", + be->id); + } else { + ha_alert("Cannot bind to tproxy source address before connect() for backend %s. Aborting.\n", + be->id); + send_log(be, LOG_EMERG, + "Cannot bind to tproxy source address before connect() for backend %s.\n", + be->id); + } + conn->flags |= CO_FL_ERROR; + return SF_ERR_RESOURCE; + } + } + +#if defined(TCP_QUICKACK) + /* disabling tcp quick ack now allows the first request to leave the + * machine with the first ACK. We only do this if there are pending + * data in the buffer. + */ + if (flags & (CONNECT_DELACK_ALWAYS) || + ((flags & CONNECT_DELACK_SMART_CONNECT || + (flags & CONNECT_HAS_DATA) || conn->send_proxy_ofs) && + (be->options2 & PR_O2_SMARTCON))) + setsockopt(fd, IPPROTO_TCP, TCP_QUICKACK, &zero, sizeof(zero)); +#endif + +#ifdef TCP_USER_TIMEOUT + /* there is not much more we can do here when it fails, it's still minor */ + if (srv && srv->tcp_ut) + setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &srv->tcp_ut, sizeof(srv->tcp_ut)); +#endif + + if (use_fastopen) { +#if defined(TCP_FASTOPEN_CONNECT) + setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN_CONNECT, &one, sizeof(one)); +#endif + } + if (global.tune.server_sndbuf) + setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &global.tune.server_sndbuf, sizeof(global.tune.server_sndbuf)); + + if (global.tune.server_rcvbuf) + setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &global.tune.server_rcvbuf, sizeof(global.tune.server_rcvbuf)); + + addr = (conn->flags & CO_FL_SOCKS4) ? &srv->socks4_addr : conn->dst; + if (connect(fd, (const struct sockaddr *)addr, get_addr_len(addr)) == -1) { + if (errno == EINPROGRESS || errno == EALREADY) { + /* common case, let's wait for connect status */ + conn->flags |= CO_FL_WAIT_L4_CONN; + } + else if (errno == EISCONN) { + /* should normally not happen but if so, indicates that it's OK */ + conn->flags &= ~CO_FL_WAIT_L4_CONN; + } + else if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EADDRINUSE || errno == EADDRNOTAVAIL) { + char *msg; + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EADDRNOTAVAIL) { + msg = "no free ports"; + conn->err_code = CO_ER_FREE_PORTS; + } + else { + msg = "local address already in use"; + conn->err_code = CO_ER_ADDR_INUSE; + } + + qfprintf(stderr,"Connect() failed for backend %s: %s.\n", be->id, msg); + port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port); + fdinfo[fd].port_range = NULL; + close(fd); + send_log(be, LOG_ERR, "Connect() failed for backend %s: %s.\n", be->id, msg); + conn->flags |= CO_FL_ERROR; + return SF_ERR_RESOURCE; + } else if (errno == ETIMEDOUT) { + //qfprintf(stderr,"Connect(): ETIMEDOUT"); + port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port); + fdinfo[fd].port_range = NULL; + close(fd); + conn->err_code = CO_ER_SOCK_ERR; + conn->flags |= CO_FL_ERROR; + return SF_ERR_SRVTO; + } else { + // (errno == ECONNREFUSED || errno == ENETUNREACH || errno == EACCES || errno == EPERM) + //qfprintf(stderr,"Connect(): %d", errno); + port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port); + fdinfo[fd].port_range = NULL; + close(fd); + conn->err_code = CO_ER_SOCK_ERR; + conn->flags |= CO_FL_ERROR; + return SF_ERR_SRVCL; + } + } + else { + /* connect() == 0, this is great! 
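+		 * Editor's note: a synchronous success typically happens on
+		 * the loopback, or with TCP_FASTOPEN_CONNECT, where the
+		 * kernel returns 0 immediately and defers the actual SYN
+		 * until the first send.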
*/ + conn->flags &= ~CO_FL_WAIT_L4_CONN; + } + + conn_ctrl_init(conn); /* registers the FD */ + HA_ATOMIC_OR(&fdtab[fd].state, FD_LINGER_RISK); /* close hard if needed */ + + if (conn->flags & CO_FL_WAIT_L4_CONN) { + fd_want_send(fd); + fd_cant_send(fd); + fd_cant_recv(fd); + } + + return SF_ERR_NONE; /* connection is OK */ +} + +/* This function tries to bind a TCPv4/v6 listener. It may return a warning or + * an error message in <errmsg> if the message is at most <errlen> bytes long + * (including '\0'). Note that <errmsg> may be NULL if <errlen> is also zero. + * The return value is composed from ERR_ABORT, ERR_WARN, + * ERR_ALERT, ERR_RETRYABLE and ERR_FATAL. ERR_NONE indicates that everything + * was alright and that no message was returned. ERR_RETRYABLE means that an + * error occurred but that it may vanish after a retry (eg: port in use), and + * ERR_FATAL indicates a non-fixable error. ERR_WARN and ERR_ALERT do not alter + * the meaning of the error, but just indicate that a message is present which + * should be displayed with the respective level. Last, ERR_ABORT indicates + * that it's pointless to try to start other listeners. No error message is + * returned if errlen is NULL. + */ +int tcp_bind_listener(struct listener *listener, char *errmsg, int errlen) +{ + int fd, err; + int ready; + struct buffer *msg = alloc_trash_chunk(); + + err = ERR_NONE; + + if (!msg) { + if (errlen) + snprintf(errmsg, errlen, "out of memory"); + return ERR_ALERT | ERR_FATAL; + } + + /* ensure we never return garbage */ + if (errlen) + *errmsg = 0; + + if (listener->state != LI_ASSIGNED) + return ERR_NONE; /* already bound */ + + if (!(listener->rx.flags & RX_F_BOUND)) { + chunk_appendf(msg, "%sreceiving socket not bound", msg->data ? ", " : ""); + goto tcp_return; + } + + if (listener->rx.flags & RX_F_MUST_DUP) + goto done; + + fd = listener->rx.fd; + + if (listener->bind_conf->options & BC_O_NOLINGER) + setsockopt(fd, SOL_SOCKET, SO_LINGER, &nolinger, sizeof(struct linger)); + else { + struct linger tmplinger; + socklen_t len = sizeof(tmplinger); + if (getsockopt(fd, SOL_SOCKET, SO_LINGER, &tmplinger, &len) == 0 && + (tmplinger.l_onoff == 1 || tmplinger.l_linger == 0)) { + tmplinger.l_onoff = 0; + tmplinger.l_linger = 0; + setsockopt(fd, SOL_SOCKET, SO_LINGER, &tmplinger, + sizeof(tmplinger)); + } + } + +#if defined(TCP_MAXSEG) + if (listener->bind_conf->maxseg > 0) { + if (setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, + &listener->bind_conf->maxseg, sizeof(listener->bind_conf->maxseg)) == -1) { + chunk_appendf(msg, "%scannot set MSS to %d", msg->data ? ", " : "", listener->bind_conf->maxseg); + err |= ERR_WARN; + } + } else { + /* we may want to try to restore the default MSS if the socket was inherited */ + int tmpmaxseg = -1; + int defaultmss; + socklen_t len = sizeof(tmpmaxseg); + + if (listener->rx.addr.ss_family == AF_INET) + defaultmss = sock_inet_tcp_maxseg_default; + else + defaultmss = sock_inet6_tcp_maxseg_default; + + getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &tmpmaxseg, &len); + if (defaultmss > 0 && + tmpmaxseg != defaultmss && + setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &defaultmss, sizeof(defaultmss)) == -1) { + chunk_appendf(msg, "%scannot set MSS to %d", msg->data ? ", " : "", defaultmss); + err |= ERR_WARN; + } + } +#endif +#if defined(TCP_USER_TIMEOUT) + if (listener->bind_conf->tcp_ut) { + if (setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, + &listener->bind_conf->tcp_ut, sizeof(listener->bind_conf->tcp_ut)) == -1) { + chunk_appendf(msg, "%scannot set TCP User Timeout", msg->data ? 
", " : ""); + err |= ERR_WARN; + } + } else + setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &zero, + sizeof(zero)); +#endif +#if defined(TCP_DEFER_ACCEPT) + if (listener->bind_conf->options & BC_O_DEF_ACCEPT) { + /* defer accept by up to one second */ + int accept_delay = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_DEFER_ACCEPT, &accept_delay, sizeof(accept_delay)) == -1) { + chunk_appendf(msg, "%scannot enable DEFER_ACCEPT", msg->data ? ", " : ""); + err |= ERR_WARN; + } + } else + setsockopt(fd, IPPROTO_TCP, TCP_DEFER_ACCEPT, &zero, + sizeof(zero)); +#endif +#if defined(TCP_FASTOPEN) + if (listener->bind_conf->options & BC_O_TCP_FO) { + /* TFO needs a queue length, let's use the configured backlog */ + int qlen = listener_backlog(listener); + if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen)) == -1) { + chunk_appendf(msg, "%scannot enable TCP_FASTOPEN", msg->data ? ", " : ""); + err |= ERR_WARN; + } + } else { + socklen_t len; + int qlen; + len = sizeof(qlen); + /* Only disable fast open if it was enabled, we don't want + * the kernel to create a fast open queue if there's none. + */ + if (getsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, &len) == 0 && + qlen != 0) { + if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &zero, + sizeof(zero)) == -1) { + chunk_appendf(msg, "%scannot disable TCP_FASTOPEN", msg->data ? ", " : ""); + err |= ERR_WARN; + } + } + } +#endif + + ready = sock_accepting_conn(&listener->rx) > 0; + + if (!ready && /* only listen if not already done by external process */ + listen(fd, listener_backlog(listener)) == -1) { + err |= ERR_RETRYABLE | ERR_ALERT; + chunk_appendf(msg, "%scannot listen to socket", msg->data ? ", " : ""); + goto tcp_close_return; + } + +#if !defined(TCP_DEFER_ACCEPT) && defined(SO_ACCEPTFILTER) + /* the socket needs to listen first */ + if (listener->bind_conf->options & BC_O_DEF_ACCEPT) { + struct accept_filter_arg accept; + memset(&accept, 0, sizeof(accept)); + strlcpy2(accept.af_name, "dataready", sizeof(accept.af_name)); + if (setsockopt(fd, SOL_SOCKET, SO_ACCEPTFILTER, &accept, sizeof(accept)) == -1) { + chunk_appendf(msg, "%scannot enable ACCEPT_FILTER", msg->data ? ", " : ""); + err |= ERR_WARN; + } + } +#endif +#if defined(TCP_QUICKACK) + if (listener->bind_conf->options & BC_O_NOQUICKACK) + setsockopt(fd, IPPROTO_TCP, TCP_QUICKACK, &zero, sizeof(zero)); + else + setsockopt(fd, IPPROTO_TCP, TCP_QUICKACK, &one, sizeof(one)); +#endif + + done: + /* the socket is ready */ + listener_set_state(listener, LI_LISTEN); + goto tcp_return; + + tcp_close_return: + free_trash_chunk(msg); + msg = NULL; + close(fd); + tcp_return: + if (msg && errlen && msg->data) { + char pn[INET6_ADDRSTRLEN]; + + addr_to_str(&listener->rx.addr, pn, sizeof(pn)); + snprintf(errmsg, errlen, "%s for [%s:%d]", msg->area, pn, get_host_port(&listener->rx.addr)); + } + free_trash_chunk(msg); + msg = NULL; + return err; +} + +/* Enable receipt of incoming connections for listener <l>. The receiver must + * still be valid. + */ +static void tcp_enable_listener(struct listener *l) +{ + fd_want_recv_safe(l->rx.fd); +} + +/* Disable receipt of incoming connections for listener <l>. The receiver must + * still be valid. + */ +static void tcp_disable_listener(struct listener *l) +{ + fd_stop_recv(l->rx.fd); +} + +/* Suspend a receiver. Returns < 0 in case of failure, 0 if the receiver + * was totally stopped, or > 0 if correctly suspended. Note that inherited FDs + * are neither suspended nor resumed, we only enable/disable polling on them. 
+ */
+static int tcp_suspend_receiver(struct receiver *rx)
+{
+	const struct sockaddr sa = { .sa_family = AF_UNSPEC };
+	int ret;
+
+	/* We never disconnect a shared FD otherwise we'd break it in the
+	 * parent process and any possible subsequent worker inheriting it.
+	 * Thus we just stop receiving from it.
+	 */
+	if (rx->flags & RX_F_INHERITED)
+		goto done;
+
+	if (connect(rx->fd, &sa, sizeof(sa)) < 0)
+		goto check_already_done;
+ done:
+	fd_stop_recv(rx->fd);
+	return 1;
+
+ check_already_done:
+	/* in case the connect() above fails, it might be because we're
+	 * dealing with a socket that is shared with other processes doing the
+	 * same. Let's check if it's still accepting connections.
+	 */
+	ret = sock_accepting_conn(rx);
+	if (ret <= 0) {
+		/* unrecoverable or paused by another process */
+		fd_stop_recv(rx->fd);
+		return ret == 0;
+	}
+
+	/* still listening, that's not good */
+	return -1;
+}
+
+/* Resume a receiver. Returns < 0 in case of failure, 0 if the receiver
+ * was totally stopped, or > 0 if correctly resumed. Note that inherited FDs
+ * are neither suspended nor resumed, we only enable/disable polling on them.
+ */
+static int tcp_resume_receiver(struct receiver *rx)
+{
+	struct listener *l = LIST_ELEM(rx, struct listener *, rx);
+
+	if (rx->fd < 0)
+		return 0;
+
+	if ((rx->flags & RX_F_INHERITED) || listen(rx->fd, listener_backlog(l)) == 0) {
+		fd_want_recv(l->rx.fd);
+		return 1;
+	}
+	return -1;
+}
+
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/src/proto_udp.c b/src/proto_udp.c
new file mode 100644
index 0000000..9855974
--- /dev/null
+++ b/src/proto_udp.c
@@ -0,0 +1,247 @@
+/*
+ * UDP protocol layer on top of AF_INET/AF_INET6
+ *
+ * Copyright 2019 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com>
+ *
+ * Partial merge by Emeric Brun <ebrun@haproxy.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ * + */ + +#include <ctype.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include <netinet/udp.h> +#include <netinet/in.h> + +#include <haproxy/fd.h> +#include <haproxy/listener.h> +#include <haproxy/log.h> +#include <haproxy/namespace.h> +#include <haproxy/port_range.h> +#include <haproxy/protocol.h> +#include <haproxy/proto_udp.h> +#include <haproxy/proxy.h> +#include <haproxy/server.h> +#include <haproxy/sock.h> +#include <haproxy/sock_inet.h> +#include <haproxy/task.h> +#include <haproxy/tools.h> + +static int udp_bind_listener(struct listener *listener, char *errmsg, int errlen); +static void udp_enable_listener(struct listener *listener); +static void udp_disable_listener(struct listener *listener); + +/* Note: must not be declared <const> as its list will be overwritten */ +struct protocol proto_udp4 = { + .name = "udp4", + + /* connection layer */ + .xprt_type = PROTO_TYPE_DGRAM, + .listen = udp_bind_listener, + .enable = udp_enable_listener, + .disable = udp_disable_listener, + .add = default_add_listener, + .unbind = default_unbind_listener, + .suspend = default_suspend_listener, + .resume = default_resume_listener, + + /* binding layer */ + .rx_suspend = udp_suspend_receiver, + .rx_resume = udp_resume_receiver, + + /* address family */ + .fam = &proto_fam_inet4, + + /* socket layer */ + .proto_type = PROTO_TYPE_DGRAM, + .sock_type = SOCK_DGRAM, + .sock_prot = IPPROTO_UDP, + .rx_enable = sock_enable, + .rx_disable = sock_disable, + .rx_unbind = sock_unbind, + .receivers = LIST_HEAD_INIT(proto_udp4.receivers), + .nb_receivers = 0, +#ifdef SO_REUSEPORT + .flags = PROTO_F_REUSEPORT_SUPPORTED, +#endif +}; + +INITCALL1(STG_REGISTER, protocol_register, &proto_udp4); + +/* Note: must not be declared <const> as its list will be overwritten */ +struct protocol proto_udp6 = { + .name = "udp6", + + /* connection layer */ + .xprt_type = PROTO_TYPE_DGRAM, + .listen = udp_bind_listener, + .enable = udp_enable_listener, + .disable = udp_disable_listener, + .add = default_add_listener, + .unbind = default_unbind_listener, + .suspend = default_suspend_listener, + .resume = default_resume_listener, + + /* binding layer */ + .rx_suspend = udp_suspend_receiver, + .rx_resume = udp_resume_receiver, + + /* address family */ + .fam = &proto_fam_inet6, + + /* socket layer */ + .proto_type = PROTO_TYPE_DGRAM, + .sock_type = SOCK_DGRAM, + .sock_prot = IPPROTO_UDP, + .rx_enable = sock_enable, + .rx_disable = sock_disable, + .rx_unbind = sock_unbind, + .receivers = LIST_HEAD_INIT(proto_udp6.receivers), + .nb_receivers = 0, +#ifdef SO_REUSEPORT + .flags = PROTO_F_REUSEPORT_SUPPORTED, +#endif +}; + +INITCALL1(STG_REGISTER, protocol_register, &proto_udp6); + +/* This function tries to bind a UDPv4/v6 listener. It may return a warning or + * an error message in <errmsg> if the message is at most <errlen> bytes long + * (including '\0'). Note that <errmsg> may be NULL if <errlen> is also zero. + * The return value is composed from ERR_ABORT, ERR_WARN, + * ERR_ALERT, ERR_RETRYABLE and ERR_FATAL. ERR_NONE indicates that everything + * was alright and that no message was returned. ERR_RETRYABLE means that an + * error occurred but that it may vanish after a retry (eg: port in use), and + * ERR_FATAL indicates a non-fixable error. 
+ * ERR_WARN and ERR_ALERT do not alter
+ * the meaning of the error, but just indicate that a message is present which
+ * should be displayed with the respective level. Last, ERR_ABORT indicates
+ * that it's pointless to try to start other listeners. No error message is
+ * returned if <errlen> is zero.
+ */
+int udp_bind_listener(struct listener *listener, char *errmsg, int errlen)
+{
+	int err = ERR_NONE;
+	char *msg = NULL;
+
+	/* ensure we never return garbage */
+	if (errlen)
+		*errmsg = 0;
+
+	if (listener->state != LI_ASSIGNED)
+		return ERR_NONE; /* already bound */
+
+	if (!(listener->rx.flags & RX_F_BOUND)) {
+		msg = "receiving socket not bound";
+		err |= ERR_FATAL | ERR_ALERT;
+		goto udp_return;
+	}
+
+	/* apply the frontend's receive/send buffer tunables if they are set */
+	if (global.tune.frontend_rcvbuf)
+		setsockopt(listener->rx.fd, SOL_SOCKET, SO_RCVBUF, &global.tune.frontend_rcvbuf, sizeof(global.tune.frontend_rcvbuf));
+
+	if (global.tune.frontend_sndbuf)
+		setsockopt(listener->rx.fd, SOL_SOCKET, SO_SNDBUF, &global.tune.frontend_sndbuf, sizeof(global.tune.frontend_sndbuf));
+
+	listener_set_state(listener, LI_LISTEN);
+
+ udp_return:
+	if (msg && errlen) {
+		char pn[INET6_ADDRSTRLEN];
+
+		addr_to_str(&listener->rx.addr, pn, sizeof(pn));
+		snprintf(errmsg, errlen, "%s for [%s:%d]", msg, pn, get_host_port(&listener->rx.addr));
+	}
+	return err;
+}
+
+/* Enable receipt of incoming connections for listener <l>. The receiver must
+ * still be valid.
+ */
+static void udp_enable_listener(struct listener *l)
+{
+	fd_want_recv_safe(l->rx.fd);
+}
+
+/* Disable receipt of incoming connections for listener <l>. The receiver must
+ * still be valid.
+ */
+static void udp_disable_listener(struct listener *l)
+{
+	fd_stop_recv(l->rx.fd);
+}
+
+/* Suspend a receiver. Returns < 0 in case of failure, 0 if the receiver
+ * was totally stopped, or > 0 if correctly suspended.
+ * The principle is a bit ugly but works well, at least on Linux: in order to
+ * suspend the receiver, we want it to stop receiving traffic, which means that
+ * the socket must be unhashed from the kernel's socket table. The simple way
+ * to do this is to connect to any address that is reachable and will not be
+ * used by regular traffic, and a great one is reconnecting to self. Note that
+ * inherited FDs are neither suspended nor resumed, we only enable/disable
+ * polling on them.
+ */
+int udp_suspend_receiver(struct receiver *rx)
+{
+	struct sockaddr_storage ss;
+	socklen_t len = sizeof(ss);
+
+	if (rx->fd < 0)
+		return 0;
+
+	/* we never do that with a shared FD otherwise we'd break it in the
+	 * parent process and any possible subsequent worker inheriting it.
+	 */
+	if (rx->flags & RX_F_INHERITED)
+		goto done;
+
+	if (getsockname(rx->fd, (struct sockaddr *)&ss, &len) < 0)
+		return -1;
+
+	if (connect(rx->fd, (struct sockaddr *)&ss, len) < 0)
+		return -1;
+ done:
+	/* not necessary but may make debugging clearer */
+	fd_stop_recv(rx->fd);
+	return 1;
+}
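To make the connect-to-self trick above concrete, here is a minimal self-contained sketch (illustrative only, not part of this patch): once a bound UDP socket is connected to its own address it only accepts datagrams coming from itself, which effectively mutes it, and connecting to AF_UNSPEC afterwards dissolves the association again, exactly as the resume function below does:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_storage ss;
	socklen_t len = sizeof(ss);
	const struct sockaddr unspec = { .sa_family = AF_UNSPEC };
	struct sockaddr_in addr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	addr.sin_port = 0;
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));

	/* suspend: learn our own address and connect to it; from now on the
	 * socket is filtered and only sees traffic it sends to itself
	 */
	getsockname(fd, (struct sockaddr *)&ss, &len);
	if (connect(fd, (struct sockaddr *)&ss, len) < 0)
		perror("connect(self)");

	/* resume: break the association, receive from everywhere again */
	if (connect(fd, &unspec, sizeof(unspec)) < 0)
		perror("connect(AF_UNSPEC)");

	close(fd);
	return 0;
}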
+
+/* Resume a receiver. Returns < 0 in case of failure, 0 if the receiver
+ * was totally stopped, or > 0 if correctly resumed.
+ * The principle is to reverse the change above, we'll break the connection by
+ * connecting to AF_UNSPEC. The association breaks and the socket starts to
+ * receive from everywhere again. Note that inherited FDs are neither suspended
+ * nor resumed, we only enable/disable polling on them.
+ */
+int udp_resume_receiver(struct receiver *rx)
+{
+	const struct sockaddr sa = { .sa_family = AF_UNSPEC };
+
+	if (rx->fd < 0)
+		return 0;
+
+	if (!(rx->flags & RX_F_INHERITED) && connect(rx->fd, &sa, sizeof(sa)) < 0)
+		return -1;
+
+	fd_want_recv(rx->fd);
+	return 1;
+}
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/src/proto_uxdg.c b/src/proto_uxdg.c
new file mode 100644
index 0000000..43cbe5a
--- /dev/null
+++ b/src/proto_uxdg.c
@@ -0,0 +1,159 @@
+/*
+ * DGRAM protocol layer on top of AF_UNIX
+ *
+ * Copyright 2020 HAProxy Technologies, Emeric Brun <ebrun@haproxy.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <ctype.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <haproxy/fd.h>
+#include <haproxy/listener.h>
+#include <haproxy/log.h>
+#include <haproxy/namespace.h>
+#include <haproxy/protocol.h>
+#include <haproxy/sock.h>
+#include <haproxy/sock_unix.h>
+#include <haproxy/tools.h>
+
+static int uxdg_bind_listener(struct listener *listener, char *errmsg, int errlen);
+static void uxdg_enable_listener(struct listener *listener);
+static void uxdg_disable_listener(struct listener *listener);
+static int uxdg_suspend_receiver(struct receiver *rx);
+
+/* Note: must not be declared <const> as its list will be overwritten */
+struct protocol proto_uxdg = {
+	.name           = "uxdg",
+
+	/* connection layer */
+	.xprt_type      = PROTO_TYPE_DGRAM,
+	.listen         = uxdg_bind_listener,
+	.enable         = uxdg_enable_listener,
+	.disable        = uxdg_disable_listener,
+	.add            = default_add_listener,
+	.unbind         = default_unbind_listener,
+	.suspend        = default_suspend_listener,
+	.resume         = default_resume_listener,
+
+	/* binding layer */
+	.rx_suspend     = uxdg_suspend_receiver,
+
+	/* address family */
+	.fam            = &proto_fam_unix,
+
+	/* socket layer */
+	.proto_type     = PROTO_TYPE_DGRAM,
+	.sock_type      = SOCK_DGRAM,
+	.sock_prot      = 0,
+	.rx_enable      = sock_enable,
+	.rx_disable     = sock_disable,
+	.rx_unbind      = sock_unbind,
+	.receivers      = LIST_HEAD_INIT(proto_uxdg.receivers),
+	.nb_receivers   = 0,
+};
+
+INITCALL1(STG_REGISTER, protocol_register, &proto_uxdg);
+
+/* This function tries to bind a dgram unix socket listener. It may return a
+ * warning or an error message in <errmsg> if the message is at most <errlen>
+ * bytes long (including '\0'). Note that <errmsg> may be NULL if <errlen> is
+ * also zero. The return value is composed from ERR_ABORT, ERR_WARN,
+ * ERR_ALERT, ERR_RETRYABLE and ERR_FATAL. ERR_NONE indicates that everything
+ * was alright and that no message was returned. ERR_RETRYABLE means that an
+ * error occurred but that it may vanish after a retry (eg: port in use), and
+ * ERR_FATAL indicates a non-fixable error. ERR_WARN and ERR_ALERT do not alter
+ * the meaning of the error, but just indicate that a message is present which
+ * should be displayed with the respective level. Last, ERR_ABORT indicates
+ * that it's pointless to try to start other listeners. No error message is
+ * returned if <errlen> is zero.
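+ *
+ * Note that unlike stream protocols there is no listen() step here: for a
+ * datagram socket, being bound is enough to receive traffic, so on success
+ * this function merely switches the listener to the LI_LISTEN state.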
+ */ +int uxdg_bind_listener(struct listener *listener, char *errmsg, int errlen) +{ + int err = ERR_NONE; + char *msg = NULL; + + /* ensure we never return garbage */ + if (errlen) + *errmsg = 0; + + if (listener->state != LI_ASSIGNED) + return ERR_NONE; /* already bound */ + + if (!(listener->rx.flags & RX_F_BOUND)) { + msg = "receiving socket not bound"; + err |= ERR_FATAL | ERR_ALERT; + goto uxdg_return; + } + + listener_set_state(listener, LI_LISTEN); + + uxdg_return: + if (msg && errlen) { + char *path_str; + + path_str = sa2str((struct sockaddr_storage *)&listener->rx.addr, 0, 0); + snprintf(errmsg, errlen, "%s for [%s]", msg, ((path_str) ? path_str : "")); + ha_free(&path_str); + } + return err; +} + +/* Enable receipt of incoming connections for listener <l>. The receiver must + * still be valid. + */ +static void uxdg_enable_listener(struct listener *l) +{ + fd_want_recv_safe(l->rx.fd); +} + +/* Disable receipt of incoming connections for listener <l>. The receiver must + * still be valid. + */ +static void uxdg_disable_listener(struct listener *l) +{ + fd_stop_recv(l->rx.fd); +} + +/* Suspend a receiver. Returns < 0 in case of failure, 0 if the receiver + * was totally stopped, or > 0 if correctly suspended. For plain unix sockets + * we only disable the listener to prevent data from being handled but nothing + * more is done since currently it's the new process which handles the renaming. + * Abstract sockets are completely unbound and closed so there's no need to stop + * the poller. + */ +static int uxdg_suspend_receiver(struct receiver *rx) +{ + struct listener *l = LIST_ELEM(rx, struct listener *, rx); + + if (((struct sockaddr_un *)&rx->addr)->sun_path[0]) { + uxdg_disable_listener(l); + return 1; + } + + /* Listener's lock already held. Call lockless version of + * unbind_listener. */ + do_unbind_listener(l); + return 0; +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/proto_uxst.c b/src/proto_uxst.c new file mode 100644 index 0000000..7988e00 --- /dev/null +++ b/src/proto_uxst.c @@ -0,0 +1,372 @@ +/* + * UNIX SOCK_STREAM protocol layer (uxst) + * + * Copyright 2000-2010 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <ctype.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syslog.h> +#include <time.h> + +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/un.h> + +#include <haproxy/api.h> +#include <haproxy/connection.h> +#include <haproxy/errors.h> +#include <haproxy/fd.h> +#include <haproxy/global.h> +#include <haproxy/list.h> +#include <haproxy/listener.h> +#include <haproxy/log.h> +#include <haproxy/protocol.h> +#include <haproxy/proto_uxst.h> +#include <haproxy/sock.h> +#include <haproxy/sock_unix.h> +#include <haproxy/tools.h> +#include <haproxy/version.h> + + +static int uxst_bind_listener(struct listener *listener, char *errmsg, int errlen); +static int uxst_connect_server(struct connection *conn, int flags); +static void uxst_enable_listener(struct listener *listener); +static void uxst_disable_listener(struct listener *listener); +static int uxst_suspend_receiver(struct receiver *rx); + +/* Note: must not be declared <const> as its list will be overwritten */ +struct protocol proto_uxst = { + .name = "unix_stream", + + /* connection layer */ + .xprt_type = PROTO_TYPE_STREAM, + .listen = uxst_bind_listener, + .enable = uxst_enable_listener, + .disable = uxst_disable_listener, + .add = default_add_listener, + .unbind = default_unbind_listener, + .suspend = default_suspend_listener, + .resume = default_resume_listener, + .accept_conn = sock_accept_conn, + .ctrl_init = sock_conn_ctrl_init, + .ctrl_close = sock_conn_ctrl_close, + .connect = uxst_connect_server, + .drain = sock_drain, + .check_events = sock_check_events, + .ignore_events = sock_ignore_events, + + /* binding layer */ + .rx_suspend = uxst_suspend_receiver, + + /* address family */ + .fam = &proto_fam_unix, + + /* socket layer */ + .proto_type = PROTO_TYPE_STREAM, + .sock_type = SOCK_STREAM, + .sock_prot = 0, + .rx_enable = sock_enable, + .rx_disable = sock_disable, + .rx_unbind = sock_unbind, + .rx_listening = sock_accepting_conn, + .default_iocb = sock_accept_iocb, + .receivers = LIST_HEAD_INIT(proto_uxst.receivers), + .nb_receivers = 0, +}; + +INITCALL1(STG_REGISTER, protocol_register, &proto_uxst); + +/******************************** + * 1) low-level socket functions + ********************************/ + + +/******************************** + * 2) listener-oriented functions + ********************************/ + +/* This function creates a UNIX socket associated to the listener. It changes + * the state from ASSIGNED to LISTEN. The socket is NOT enabled for polling. + * The return value is composed from ERR_NONE, ERR_RETRYABLE and ERR_FATAL. It + * may return a warning or an error message in <errmsg> if the message is at + * most <errlen> bytes long (including '\0'). Note that <errmsg> may be NULL if + * <errlen> is also zero. 
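+ * If an external process (e.g. a master or a previous worker) already put the
+ * inherited socket in the listening state, the listen() call is skipped so
+ * that the shared socket is not disturbed.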
+ */
+static int uxst_bind_listener(struct listener *listener, char *errmsg, int errlen)
+{
+	int fd, err;
+	int ready;
+	char *msg = NULL;
+
+	err = ERR_NONE;
+
+	/* ensure we never return garbage */
+	if (errlen)
+		*errmsg = 0;
+
+	if (listener->state != LI_ASSIGNED)
+		return ERR_NONE; /* already bound */
+
+	if (!(listener->rx.flags & RX_F_BOUND)) {
+		msg = "receiving socket not bound";
+		err |= ERR_FATAL | ERR_ALERT;
+		goto uxst_return;
+	}
+
+	if (listener->rx.flags & RX_F_MUST_DUP)
+		goto done;
+
+	fd = listener->rx.fd;
+	ready = sock_accepting_conn(&listener->rx) > 0;
+
+	if (!ready && /* only listen if not already done by external process */
+	    listen(fd, listener_backlog(listener)) < 0) {
+		err |= ERR_FATAL | ERR_ALERT;
+		msg = "cannot listen to UNIX socket";
+		goto uxst_close_return;
+	}
+
+ done:
+	/* the socket is now listening */
+	listener_set_state(listener, LI_LISTEN);
+	return err;
+
+ uxst_close_return:
+	close(fd);
+ uxst_return:
+	if (msg && errlen) {
+		char *path_str;
+
+		path_str = sa2str((struct sockaddr_storage *)&listener->rx.addr, 0, 0);
+		snprintf(errmsg, errlen, "%s for [%s]", msg, ((path_str) ? path_str : ""));
+		ha_free(&path_str);
+	}
+	return err;
+}
+
+/* Enable receipt of incoming connections for listener <l>. The receiver must
+ * still be valid.
+ */
+static void uxst_enable_listener(struct listener *l)
+{
+	fd_want_recv_safe(l->rx.fd);
+}
+
+/* Disable receipt of incoming connections for listener <l>. The receiver must
+ * still be valid.
+ */
+static void uxst_disable_listener(struct listener *l)
+{
+	fd_stop_recv(l->rx.fd);
+}
+
+/* Suspend a receiver. Returns < 0 in case of failure, 0 if the receiver
+ * was totally stopped, or > 0 if correctly suspended. For plain unix sockets
+ * we only disable the listener to prevent data from being handled but nothing
+ * more is done since currently it's the new process which handles the renaming.
+ * Abstract sockets are completely unbound and closed so there's no need to stop
+ * the poller.
+ */
+static int uxst_suspend_receiver(struct receiver *rx)
+{
+	struct listener *l = LIST_ELEM(rx, struct listener *, rx);
+
+	if (((struct sockaddr_un *)&rx->addr)->sun_path[0]) {
+		uxst_disable_listener(l);
+		return 1;
+	}
+
+	/* Listener's lock already held. Call lockless version of
+	 * unbind_listener. */
+	do_unbind_listener(l);
+	return 0;
+}
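For context on the connect function that follows: the socket is made non-blocking before connect(), so EINPROGRESS is an expected outcome whose completion is later detected through write readiness, while EAGAIN on a UNIX socket typically means the server's accept backlog is full. A minimal self-contained sketch of that sequence (illustrative only, not part of this patch; the "/tmp/demo.sock" path is made up):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_un sun;
	int fd = socket(AF_UNIX, SOCK_STREAM, 0);

	/* switch to non-blocking mode before connecting */
	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK);

	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_UNIX;
	strncpy(sun.sun_path, "/tmp/demo.sock", sizeof(sun.sun_path) - 1);

	if (connect(fd, (struct sockaddr *)&sun, sizeof(sun)) < 0) {
		if (errno == EINPROGRESS || errno == EALREADY)
			puts("in progress: wait for POLLOUT, then check SO_ERROR");
		else
			perror("connect"); /* e.g. EAGAIN when the backlog is full */
	}
	else {
		/* unix sockets usually connect immediately */
		puts("connected");
	}
	close(fd);
	return 0;
}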
+/*
+ * This function initiates a UNIX connection establishment to the target assigned
+ * to connection <conn> using conn->{target,dst}. The source address is ignored
+ * and will be selected by the system. conn->target may point either to a valid
+ * server or to a backend, depending on conn->target. Only OBJ_TYPE_PROXY and
+ * OBJ_TYPE_SERVER are supported. The <flags> argument is currently unused and
+ * is kept for consistency with the other connect functions.
+ *
+ * Note that a pending send_proxy message accounts for data.
+ *
+ * It can return one of :
+ *  - SF_ERR_NONE if everything's OK
+ *  - SF_ERR_SRVTO if there are no more servers
+ *  - SF_ERR_SRVCL if the connection was refused by the server
+ *  - SF_ERR_PRXCOND if the connection has been limited by the proxy (maxconn)
+ *  - SF_ERR_RESOURCE if a system resource is lacking (eg: fd limits, ports, ...)
+ *  - SF_ERR_INTERNAL for any other purely internal errors
+ * Additionally, in the case of SF_ERR_RESOURCE, an emergency log will be emitted.
+ *
+ * The connection's fd is inserted only when SF_ERR_NONE is returned, otherwise
+ * it's invalid and the caller has nothing to do.
+ */
+static int uxst_connect_server(struct connection *conn, int flags)
+{
+	int fd;
+	struct server *srv;
+	struct proxy *be;
+
+	BUG_ON(!conn->dst);
+
+	switch (obj_type(conn->target)) {
+	case OBJ_TYPE_PROXY:
+		be = __objt_proxy(conn->target);
+		srv = NULL;
+		break;
+	case OBJ_TYPE_SERVER:
+		srv = __objt_server(conn->target);
+		be = srv->proxy;
+		break;
+	default:
+		conn->flags |= CO_FL_ERROR;
+		return SF_ERR_INTERNAL;
+	}
+
+	if ((fd = conn->handle.fd = socket(PF_UNIX, SOCK_STREAM, 0)) == -1) {
+		qfprintf(stderr, "Cannot get a server socket.\n");
+
+		if (errno == ENFILE) {
+			conn->err_code = CO_ER_SYS_FDLIM;
+			send_log(be, LOG_EMERG,
+			         "Proxy %s reached system FD limit (maxsock=%d). Please check system tunables.\n",
+			         be->id, global.maxsock);
+		}
+		else if (errno == EMFILE) {
+			conn->err_code = CO_ER_PROC_FDLIM;
+			send_log(be, LOG_EMERG,
+			         "Proxy %s reached process FD limit (maxsock=%d). Please check 'ulimit-n' and restart.\n",
+			         be->id, global.maxsock);
+		}
+		else if (errno == ENOBUFS || errno == ENOMEM) {
+			conn->err_code = CO_ER_SYS_MEMLIM;
+			send_log(be, LOG_EMERG,
+			         "Proxy %s reached system memory limit (maxsock=%d). Please check system tunables.\n",
+			         be->id, global.maxsock);
+		}
+		else if (errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
+			conn->err_code = CO_ER_NOPROTO;
+		}
+		else
+			conn->err_code = CO_ER_SOCK_ERR;
+
+		/* this is a resource error */
+		conn->flags |= CO_FL_ERROR;
+		return SF_ERR_RESOURCE;
+	}
+
+	if (fd >= global.maxsock) {
+		/* do not log anything there, it's a normal condition when this option
+		 * is used to serialize connections to a server !
+		 */
+		ha_alert("socket(): not enough free sockets. Raise -n argument.
Giving up.\n"); + close(fd); + conn->err_code = CO_ER_CONF_FDLIM; + conn->flags |= CO_FL_ERROR; + return SF_ERR_PRXCOND; /* it is a configuration limit */ + } + + if (fd_set_nonblock(fd) == -1) { + qfprintf(stderr,"Cannot set client socket to non blocking mode.\n"); + close(fd); + conn->err_code = CO_ER_SOCK_ERR; + conn->flags |= CO_FL_ERROR; + return SF_ERR_INTERNAL; + } + + if (master == 1 && fd_set_cloexec(fd) == -1) { + ha_alert("Cannot set CLOEXEC on client socket.\n"); + close(fd); + conn->err_code = CO_ER_SOCK_ERR; + conn->flags |= CO_FL_ERROR; + return SF_ERR_INTERNAL; + } + + if (global.tune.server_sndbuf) + setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &global.tune.server_sndbuf, sizeof(global.tune.server_sndbuf)); + + if (global.tune.server_rcvbuf) + setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &global.tune.server_rcvbuf, sizeof(global.tune.server_rcvbuf)); + + if (connect(fd, (struct sockaddr *)conn->dst, get_addr_len(conn->dst)) == -1) { + if (errno == EINPROGRESS || errno == EALREADY) { + conn->flags |= CO_FL_WAIT_L4_CONN; + } + else if (errno == EISCONN) { + conn->flags &= ~CO_FL_WAIT_L4_CONN; + } + else if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EADDRINUSE || errno == EADDRNOTAVAIL) { + char *msg; + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EADDRNOTAVAIL) { + msg = "can't connect to destination unix socket, check backlog size on the server"; + conn->err_code = CO_ER_FREE_PORTS; + } + else { + msg = "local address already in use"; + conn->err_code = CO_ER_ADDR_INUSE; + } + + qfprintf(stderr,"Connect() failed for backend %s: %s.\n", be->id, msg); + close(fd); + send_log(be, LOG_ERR, "Connect() failed for backend %s: %s.\n", be->id, msg); + conn->flags |= CO_FL_ERROR; + return SF_ERR_RESOURCE; + } + else if (errno == ETIMEDOUT) { + close(fd); + conn->err_code = CO_ER_SOCK_ERR; + conn->flags |= CO_FL_ERROR; + return SF_ERR_SRVTO; + } + else { // (errno == ECONNREFUSED || errno == ENETUNREACH || errno == EACCES || errno == EPERM) + close(fd); + conn->err_code = CO_ER_SOCK_ERR; + conn->flags |= CO_FL_ERROR; + return SF_ERR_SRVCL; + } + } + else { + /* connect() already succeeded, which is quite usual for unix + * sockets. Let's avoid a second connect() probe to complete it. + */ + conn->flags &= ~CO_FL_WAIT_L4_CONN; + } + + /* Prepare to send a few handshakes related to the on-wire protocol. */ + if (conn->send_proxy_ofs) + conn->flags |= CO_FL_SEND_PROXY; + + conn_ctrl_init(conn); /* registers the FD */ + HA_ATOMIC_AND(&fdtab[fd].state, ~FD_LINGER_RISK); /* no need to disable lingering */ + + if (conn->flags & CO_FL_WAIT_L4_CONN) { + fd_want_send(fd); + fd_cant_send(fd); + fd_cant_recv(fd); + } + + return SF_ERR_NONE; /* connection is OK */ +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/protocol.c b/src/protocol.c new file mode 100644 index 0000000..25ed6b7 --- /dev/null +++ b/src/protocol.c @@ -0,0 +1,309 @@ +/* + * Protocol registration functions. + * + * Copyright 2000-2012 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <sys/types.h> +#include <sys/socket.h> + +#include <haproxy/api.h> +#include <haproxy/errors.h> +#include <haproxy/global.h> +#include <haproxy/list.h> +#include <haproxy/listener.h> +#include <haproxy/proto_quic.h> +#include <haproxy/protocol.h> +#include <haproxy/proxy.h> +#include <haproxy/sock.h> +#include <haproxy/tools.h> + + +/* List head of all registered protocols */ +static struct list protocols = LIST_HEAD_INIT(protocols); +struct protocol *__protocol_by_family[AF_CUST_MAX][PROTO_NUM_TYPES][2] __read_mostly = { }; + +/* This is the global spinlock we may need to register/unregister listeners or + * protocols. Its main purpose is in fact to serialize the rare stop/deinit() + * phases. + */ +__decl_spinlock(proto_lock); + +/* Registers the protocol <proto> */ +void protocol_register(struct protocol *proto) +{ + int sock_domain = proto->fam->sock_domain; + + BUG_ON(sock_domain < 0 || sock_domain >= AF_CUST_MAX); + BUG_ON(proto->proto_type >= PROTO_NUM_TYPES); + + HA_SPIN_LOCK(PROTO_LOCK, &proto_lock); + LIST_APPEND(&protocols, &proto->list); + __protocol_by_family[sock_domain] + [proto->proto_type] + [proto->xprt_type == PROTO_TYPE_DGRAM] = proto; + HA_SPIN_UNLOCK(PROTO_LOCK, &proto_lock); +} + +/* Unregisters the protocol <proto>. Note that all listeners must have + * previously been unbound. + */ +void protocol_unregister(struct protocol *proto) +{ + HA_SPIN_LOCK(PROTO_LOCK, &proto_lock); + LIST_DELETE(&proto->list); + LIST_INIT(&proto->list); + HA_SPIN_UNLOCK(PROTO_LOCK, &proto_lock); +} + +/* clears flag <flag> on all protocols. */ +void protocol_clrf_all(uint flag) +{ + struct protocol *proto; + + HA_SPIN_LOCK(PROTO_LOCK, &proto_lock); + list_for_each_entry(proto, &protocols, list) + proto->flags &= ~flag; + HA_SPIN_UNLOCK(PROTO_LOCK, &proto_lock); +} + +/* sets flag <flag> on all protocols. */ +void protocol_setf_all(uint flag) +{ + struct protocol *proto; + + HA_SPIN_LOCK(PROTO_LOCK, &proto_lock); + list_for_each_entry(proto, &protocols, list) + proto->flags |= flag; + HA_SPIN_UNLOCK(PROTO_LOCK, &proto_lock); +} + +/* Checks if protocol <proto> supports PROTO_F flag <flag>. Returns zero if not, + * non-zero if supported. It may return a cached value from a previous test, + * and may run live tests then update the proto's flags to cache a result. It's + * better to call it only if needed so that it doesn't result in modules being + * loaded in case of a live test. It is only supposed to be used during boot. + */ +int protocol_supports_flag(struct protocol *proto, uint flag) +{ + if (flag == PROTO_F_REUSEPORT_SUPPORTED) { + int ret = 0; + + /* check if the protocol supports SO_REUSEPORT */ + if (!(_HA_ATOMIC_LOAD(&proto->flags) & PROTO_F_REUSEPORT_SUPPORTED)) + return 0; + + /* at least nobody said it was not supported */ + if (_HA_ATOMIC_LOAD(&proto->flags) & PROTO_F_REUSEPORT_TESTED) + return 1; + + /* run a live check */ + ret = _sock_supports_reuseport(proto->fam, proto->sock_type, proto->sock_prot); + if (!ret) + _HA_ATOMIC_AND(&proto->flags, ~PROTO_F_REUSEPORT_SUPPORTED); + + _HA_ATOMIC_OR(&proto->flags, PROTO_F_REUSEPORT_TESTED); + return ret; + } + return 0; +} + +#ifdef USE_QUIC +/* Return 1 if QUIC protocol may be bound, 0 if no, depending on the tuning + * parameters. + */ +static inline int protocol_may_bind_quic(struct listener *l) +{ + if (global.tune.options & GTUNE_NO_QUIC) + return 0; + return 1; +} +#endif + +/* binds all listeners of all registered protocols. Returns a composition + * of ERR_NONE, ERR_RETRYABLE, ERR_FATAL. 
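+ * ERR_ALERT and ERR_WARN may additionally qualify the reported messages, and
+ * ERR_ABORT interrupts the loop since it's pointless to start more listeners.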
+ */ +int protocol_bind_all(int verbose) +{ + struct protocol *proto; + struct listener *listener; + struct receiver *receiver; + char msg[1000]; + char *errmsg; + int err, lerr; + + err = 0; + HA_SPIN_LOCK(PROTO_LOCK, &proto_lock); + list_for_each_entry(proto, &protocols, list) { + list_for_each_entry(receiver, &proto->receivers, proto_list) { + listener = LIST_ELEM(receiver, struct listener *, rx); +#ifdef USE_QUIC + if ((proto == &proto_quic4 || proto == &proto_quic6) && + !protocol_may_bind_quic(listener)) + continue; +#endif + + lerr = proto->fam->bind(receiver, &errmsg); + err |= lerr; + + /* errors are reported if <verbose> is set or if they are fatal */ + if (verbose || (lerr & (ERR_FATAL | ERR_ABORT))) { + struct proxy *px = listener->bind_conf->frontend; + + if (lerr & ERR_ALERT) + ha_alert("Binding [%s:%d] for %s %s: %s\n", + listener->bind_conf->file, listener->bind_conf->line, + proxy_type_str(px), px->id, errmsg); + else if (lerr & ERR_WARN) + ha_warning("Binding [%s:%d] for %s %s: %s\n", + listener->bind_conf->file, listener->bind_conf->line, + proxy_type_str(px), px->id, errmsg); + } + if (lerr != ERR_NONE) + ha_free(&errmsg); + + if (lerr & ERR_ABORT) + break; + + if (lerr & ~ERR_WARN) + continue; + + /* for now there's still always a listening function */ + BUG_ON(!proto->listen); + lerr = proto->listen(listener, msg, sizeof(msg)); + err |= lerr; + + if (verbose || (lerr & (ERR_FATAL | ERR_ABORT))) { + struct proxy *px = listener->bind_conf->frontend; + + if (lerr & ERR_ALERT) + ha_alert("Starting [%s:%d] for %s %s: %s\n", + listener->bind_conf->file, listener->bind_conf->line, + proxy_type_str(px), px->id, msg); + else if (lerr & ERR_WARN) + ha_warning("Starting [%s:%d] for %s %s: %s\n", + listener->bind_conf->file, listener->bind_conf->line, + proxy_type_str(px), px->id, msg); + } + if (lerr & ERR_ABORT) + break; + } + if (err & ERR_ABORT) + break; + } + HA_SPIN_UNLOCK(PROTO_LOCK, &proto_lock); + return err; +} + +/* unbinds all listeners of all registered protocols. They are also closed. + * This must be performed before calling exit() in order to get a chance to + * remove file-system based sockets and pipes. + * Returns a composition of ERR_NONE, ERR_RETRYABLE, ERR_FATAL, ERR_ABORT. + */ +int protocol_unbind_all(void) +{ + struct protocol *proto; + struct listener *listener; + int err; + + err = 0; + HA_SPIN_LOCK(PROTO_LOCK, &proto_lock); + list_for_each_entry(proto, &protocols, list) { + list_for_each_entry(listener, &proto->receivers, rx.proto_list) + unbind_listener(listener); + } + HA_SPIN_UNLOCK(PROTO_LOCK, &proto_lock); + return err; +} + +/* stops all listeners of all registered protocols. This will normally catch + * every single listener, all protocols included. This is to be used during + * soft_stop() only. It does not return any error. + */ +void protocol_stop_now(void) +{ + struct protocol *proto; + struct listener *listener, *lback; + + HA_SPIN_LOCK(PROTO_LOCK, &proto_lock); + list_for_each_entry(proto, &protocols, list) { + list_for_each_entry_safe(listener, lback, &proto->receivers, rx.proto_list) + stop_listener(listener, 0, 1, 0); + } + HA_SPIN_UNLOCK(PROTO_LOCK, &proto_lock); +} + +/* suspends all listeners of all registered protocols. This is typically + * used on SIG_TTOU to release all listening sockets for the time needed to + * try to bind a new process. The listeners enter LI_PAUSED or LI_ASSIGNED. + * It returns ERR_NONE, with ERR_FATAL on failure. 
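+ * A failure on one listener does not stop the loop: every listener is
+ * attempted, and the individual results are OR'ed into the composed error.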
+ */ +int protocol_pause_all(void) +{ + struct protocol *proto; + struct listener *listener; + int err; + + err = 0; + HA_SPIN_LOCK(PROTO_LOCK, &proto_lock); + list_for_each_entry(proto, &protocols, list) { + list_for_each_entry(listener, &proto->receivers, rx.proto_list) + if (!suspend_listener(listener, 0, 0)) + err |= ERR_FATAL; + } + HA_SPIN_UNLOCK(PROTO_LOCK, &proto_lock); + return err; +} + +/* resumes all listeners of all registered protocols. This is typically used on + * SIG_TTIN to re-enable listening sockets after a new process failed to bind. + * The listeners switch to LI_READY/LI_FULL. It returns ERR_NONE, with ERR_FATAL + * on failure. + */ +int protocol_resume_all(void) +{ + struct protocol *proto; + struct listener *listener; + int err; + + err = 0; + HA_SPIN_LOCK(PROTO_LOCK, &proto_lock); + list_for_each_entry(proto, &protocols, list) { + list_for_each_entry(listener, &proto->receivers, rx.proto_list) + if (!resume_listener(listener, 0, 0)) + err |= ERR_FATAL; + } + HA_SPIN_UNLOCK(PROTO_LOCK, &proto_lock); + return err; +} + +/* enables all listeners of all registered protocols. This is intended to be + * used after a fork() to enable reading on all file descriptors. Returns + * composition of ERR_NONE. + */ +int protocol_enable_all(void) +{ + struct protocol *proto; + struct listener *listener; + + HA_SPIN_LOCK(PROTO_LOCK, &proto_lock); + list_for_each_entry(proto, &protocols, list) { + list_for_each_entry(listener, &proto->receivers, rx.proto_list) + enable_listener(listener); + } + HA_SPIN_UNLOCK(PROTO_LOCK, &proto_lock); + return ERR_NONE; +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/proxy.c b/src/proxy.c new file mode 100644 index 0000000..ef95340 --- /dev/null +++ b/src/proxy.c @@ -0,0 +1,3451 @@ +/* + * Proxy variables and functions. + * + * Copyright 2000-2009 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <unistd.h> +#include <string.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/stat.h> + +#include <import/eb32tree.h> +#include <import/ebistree.h> + +#include <haproxy/acl.h> +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/capture-t.h> +#include <haproxy/cfgparse.h> +#include <haproxy/cli.h> +#include <haproxy/errors.h> +#include <haproxy/fd.h> +#include <haproxy/filters.h> +#include <haproxy/global.h> +#include <haproxy/http_ana.h> +#include <haproxy/http_htx.h> +#include <haproxy/http_ext.h> +#include <haproxy/http_rules.h> +#include <haproxy/listener.h> +#include <haproxy/log.h> +#include <haproxy/obj_type-t.h> +#include <haproxy/peers.h> +#include <haproxy/pool.h> +#include <haproxy/protocol.h> +#include <haproxy/proto_tcp.h> +#include <haproxy/proxy.h> +#include <haproxy/sc_strm.h> +#include <haproxy/quic_tp.h> +#include <haproxy/server-t.h> +#include <haproxy/signal.h> +#include <haproxy/stats-t.h> +#include <haproxy/stconn.h> +#include <haproxy/stream.h> +#include <haproxy/task.h> +#include <haproxy/tcpcheck.h> +#include <haproxy/time.h> +#include <haproxy/tools.h> + + +int listeners; /* # of proxy listeners, set by cfgparse */ +struct proxy *proxies_list = NULL; /* list of all existing proxies */ +struct eb_root used_proxy_id = EB_ROOT; /* list of proxy IDs in use */ +struct eb_root proxy_by_name = EB_ROOT; /* tree of proxies sorted by name */ +struct eb_root defproxy_by_name = EB_ROOT; /* tree of default proxies sorted by name (dups possible) */ +unsigned int error_snapshot_id = 0; /* global ID assigned to each error then incremented */ + +/* CLI context used during "show servers {state|conn}" */ +struct show_srv_ctx { + struct proxy *px; /* current proxy to dump or NULL */ + struct server *sv; /* current server to dump or NULL */ + uint only_pxid; /* dump only this proxy ID when explicit */ + int show_conn; /* non-zero = "conn" otherwise "state" */ + enum { + SHOW_SRV_HEAD = 0, + SHOW_SRV_LIST, + } state; +}; + +/* proxy->options */ +const struct cfg_opt cfg_opts[] = +{ + { "abortonclose", PR_O_ABRT_CLOSE, PR_CAP_BE, 0, 0 }, + { "allbackups", PR_O_USE_ALL_BK, PR_CAP_BE, 0, 0 }, + { "checkcache", PR_O_CHK_CACHE, PR_CAP_BE, 0, PR_MODE_HTTP }, + { "clitcpka", PR_O_TCP_CLI_KA, PR_CAP_FE, 0, 0 }, + { "contstats", PR_O_CONTSTATS, PR_CAP_FE, 0, 0 }, + { "dontlognull", PR_O_NULLNOLOG, PR_CAP_FE, 0, 0 }, + { "http-buffer-request", PR_O_WREQ_BODY, PR_CAP_FE | PR_CAP_BE, 0, PR_MODE_HTTP }, + { "http-ignore-probes", PR_O_IGNORE_PRB, PR_CAP_FE, 0, PR_MODE_HTTP }, + { "idle-close-on-response", PR_O_IDLE_CLOSE_RESP, PR_CAP_FE, 0, PR_MODE_HTTP }, + { "prefer-last-server", PR_O_PREF_LAST, PR_CAP_BE, 0, PR_MODE_HTTP }, + { "logasap", PR_O_LOGASAP, PR_CAP_FE, 0, 0 }, + { "nolinger", PR_O_TCP_NOLING, PR_CAP_FE | PR_CAP_BE, 0, 0 }, + { "persist", PR_O_PERSIST, PR_CAP_BE, 0, 0 }, + { "srvtcpka", PR_O_TCP_SRV_KA, PR_CAP_BE, 0, 0 }, +#ifdef USE_TPROXY + { "transparent", PR_O_TRANSP, PR_CAP_BE, 0, 0 }, +#else + { "transparent", 0, 0, 0, 0 }, +#endif + + { NULL, 0, 0, 0, 0 } +}; + +/* proxy->options2 */ +const struct cfg_opt cfg_opts2[] = +{ +#ifdef USE_LINUX_SPLICE + { "splice-request", PR_O2_SPLIC_REQ, PR_CAP_FE|PR_CAP_BE, 0, 0 }, + { "splice-response", PR_O2_SPLIC_RTR, PR_CAP_FE|PR_CAP_BE, 0, 0 }, + { "splice-auto", PR_O2_SPLIC_AUT, PR_CAP_FE|PR_CAP_BE, 0, 0 }, +#else + { "splice-request", 0, 0, 0, 0 }, + { "splice-response", 0, 0, 0, 0 }, + { "splice-auto", 0, 0, 0, 0 }, +#endif + { "accept-invalid-http-request", PR_O2_REQBUG_OK, 
PR_CAP_FE, 0, PR_MODE_HTTP }, + { "accept-invalid-http-response", PR_O2_RSPBUG_OK, PR_CAP_BE, 0, PR_MODE_HTTP }, + { "dontlog-normal", PR_O2_NOLOGNORM, PR_CAP_FE, 0, 0 }, + { "log-separate-errors", PR_O2_LOGERRORS, PR_CAP_FE, 0, 0 }, + { "log-health-checks", PR_O2_LOGHCHKS, PR_CAP_BE, 0, 0 }, + { "socket-stats", PR_O2_SOCKSTAT, PR_CAP_FE, 0, 0 }, + { "tcp-smart-accept", PR_O2_SMARTACC, PR_CAP_FE, 0, 0 }, + { "tcp-smart-connect", PR_O2_SMARTCON, PR_CAP_BE, 0, 0 }, + { "independent-streams", PR_O2_INDEPSTR, PR_CAP_FE|PR_CAP_BE, 0, 0 }, + { "http-use-proxy-header", PR_O2_USE_PXHDR, PR_CAP_FE, 0, PR_MODE_HTTP }, + { "http-pretend-keepalive", PR_O2_FAKE_KA, PR_CAP_BE, 0, PR_MODE_HTTP }, + { "http-no-delay", PR_O2_NODELAY, PR_CAP_FE|PR_CAP_BE, 0, PR_MODE_HTTP }, + + {"h1-case-adjust-bogus-client", PR_O2_H1_ADJ_BUGCLI, PR_CAP_FE, 0, 0 }, + {"h1-case-adjust-bogus-server", PR_O2_H1_ADJ_BUGSRV, PR_CAP_BE, 0, 0 }, + {"disable-h2-upgrade", PR_O2_NO_H2_UPGRADE, PR_CAP_FE, 0, PR_MODE_HTTP }, + { NULL, 0, 0, 0 } +}; + +/* Helper function to resolve a single sticking rule after config parsing. + * Returns 1 for success and 0 for failure + */ +int resolve_stick_rule(struct proxy *curproxy, struct sticking_rule *mrule) +{ + struct stktable *target; + + if (mrule->table.name) + target = stktable_find_by_name(mrule->table.name); + else + target = curproxy->table; + + if (!target) { + ha_alert("Proxy '%s': unable to find stick-table '%s'.\n", + curproxy->id, mrule->table.name ? mrule->table.name : curproxy->id); + return 0; + } + else if (!stktable_compatible_sample(mrule->expr, target->type)) { + ha_alert("Proxy '%s': type of fetch not usable with type of stick-table '%s'.\n", + curproxy->id, mrule->table.name ? mrule->table.name : curproxy->id); + return 0; + } + + /* success */ + ha_free(&mrule->table.name); + mrule->table.t = target; + stktable_alloc_data_type(target, STKTABLE_DT_SERVER_ID, NULL, NULL); + stktable_alloc_data_type(target, STKTABLE_DT_SERVER_KEY, NULL, NULL); + if (!in_proxies_list(target->proxies_list, curproxy)) { + curproxy->next_stkt_ref = target->proxies_list; + target->proxies_list = curproxy; + } + return 1; +} + +void free_stick_rules(struct list *rules) +{ + struct sticking_rule *rule, *ruleb; + + list_for_each_entry_safe(rule, ruleb, rules, list) { + LIST_DELETE(&rule->list); + free_acl_cond(rule->cond); + release_sample_expr(rule->expr); + free(rule); + } +} + +static void free_logformat_list(struct list *lfs) +{ + struct logformat_node *lf, *lfb; + + list_for_each_entry_safe(lf, lfb, lfs, list) { + LIST_DELETE(&lf->list); + release_sample_expr(lf->expr); + free(lf->arg); + free(lf); + } +} + +void free_server_rules(struct list *srules) +{ + struct server_rule *srule, *sruleb; + + list_for_each_entry_safe(srule, sruleb, srules, list) { + LIST_DELETE(&srule->list); + free_acl_cond(srule->cond); + free_logformat_list(&srule->expr); + free(srule->file); + free(srule); + } +} + +void free_proxy(struct proxy *p) +{ + struct server *s; + struct cap_hdr *h,*h_next; + struct listener *l,*l_next; + struct bind_conf *bind_conf, *bind_back; + struct acl_cond *cond, *condb; + struct acl *acl, *aclb; + struct switching_rule *rule, *ruleb; + struct redirect_rule *rdr, *rdrb; + struct logger *log, *logb; + struct proxy_deinit_fct *pxdf; + struct server_deinit_fct *srvdf; + + if (!p) + return; + + free(p->conf.file); + free(p->id); + free(p->cookie_name); + free(p->cookie_domain); + free(p->cookie_attrs); + free(p->lbprm.arg_str); + release_sample_expr(p->lbprm.expr); + 
free(p->server_state_file_name); + free(p->capture_name); + istfree(&p->monitor_uri); + free(p->rdp_cookie_name); + free(p->invalid_rep); + free(p->invalid_req); +#if defined(CONFIG_HAP_TRANSPARENT) + free(p->conn_src.bind_hdr_name); +#endif + if (p->conf.logformat_string != default_http_log_format && + p->conf.logformat_string != default_tcp_log_format && + p->conf.logformat_string != clf_http_log_format && + p->conf.logformat_string != default_https_log_format && + p->conf.logformat_string != httpclient_log_format) + free(p->conf.logformat_string); + + free(p->conf.lfs_file); + free(p->conf.uniqueid_format_string); + istfree(&p->header_unique_id); + free(p->conf.uif_file); + if ((p->lbprm.algo & BE_LB_LKUP) == BE_LB_LKUP_MAP) + free(p->lbprm.map.srv); + if (p->mode == PR_MODE_SYSLOG) + free(p->lbprm.log.srv); + + if (p->conf.logformat_sd_string != default_rfc5424_sd_log_format) + free(p->conf.logformat_sd_string); + free(p->conf.lfsd_file); + + free(p->conf.error_logformat_string); + free(p->conf.elfs_file); + + list_for_each_entry_safe(cond, condb, &p->mon_fail_cond, list) { + LIST_DELETE(&cond->list); + free_acl_cond(cond); + } + + EXTRA_COUNTERS_FREE(p->extra_counters_fe); + EXTRA_COUNTERS_FREE(p->extra_counters_be); + + list_for_each_entry_safe(acl, aclb, &p->acl, list) { + LIST_DELETE(&acl->list); + prune_acl(acl); + free(acl); + } + + free_server_rules(&p->server_rules); + + list_for_each_entry_safe(rule, ruleb, &p->switching_rules, list) { + LIST_DELETE(&rule->list); + free_acl_cond(rule->cond); + free(rule->file); + free(rule); + } + + list_for_each_entry_safe(rdr, rdrb, &p->redirect_rules, list) { + LIST_DELETE(&rdr->list); + http_free_redirect_rule(rdr); + } + + list_for_each_entry_safe(log, logb, &p->loggers, list) { + LIST_DEL_INIT(&log->list); + free_logger(log); + } + + free_logformat_list(&p->logformat); + free_logformat_list(&p->logformat_sd); + free_logformat_list(&p->format_unique_id); + free_logformat_list(&p->logformat_error); + + free_act_rules(&p->tcp_req.inspect_rules); + free_act_rules(&p->tcp_rep.inspect_rules); + free_act_rules(&p->tcp_req.l4_rules); + free_act_rules(&p->tcp_req.l5_rules); + free_act_rules(&p->http_req_rules); + free_act_rules(&p->http_res_rules); + free_act_rules(&p->http_after_res_rules); + + free_stick_rules(&p->storersp_rules); + free_stick_rules(&p->sticking_rules); + + h = p->req_cap; + while (h) { + if (p->defpx && h == p->defpx->req_cap) + break; + h_next = h->next; + free(h->name); + pool_destroy(h->pool); + free(h); + h = h_next; + }/* end while(h) */ + + h = p->rsp_cap; + while (h) { + if (p->defpx && h == p->defpx->rsp_cap) + break; + h_next = h->next; + free(h->name); + pool_destroy(h->pool); + free(h); + h = h_next; + }/* end while(h) */ + + s = p->srv; + while (s) { + list_for_each_entry(srvdf, &server_deinit_list, list) + srvdf->fct(s); + s = srv_drop(s); + }/* end while(s) */ + + /* also free default-server parameters since some of them might have + * been dynamically allocated (e.g.: config hints, cookies, ssl..) + */ + srv_free_params(&p->defsrv); + + list_for_each_entry_safe(l, l_next, &p->conf.listeners, by_fe) { + LIST_DELETE(&l->by_fe); + LIST_DELETE(&l->by_bind); + free(l->name); + free(l->per_thr); + free(l->counters); + task_destroy(l->rx.rhttp.task); + + EXTRA_COUNTERS_FREE(l->extra_counters); + free(l); + } + + /* Release unused SSL configs. 
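+	 * This walks the frontend's bind_conf list: each transport layer first
+	 * gets a chance to destroy its private (e.g. SSL) configuration, then
+	 * the bind_conf itself and its strings are released.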
*/ + list_for_each_entry_safe(bind_conf, bind_back, &p->conf.bind, by_fe) { + if (bind_conf->xprt->destroy_bind_conf) + bind_conf->xprt->destroy_bind_conf(bind_conf); + free(bind_conf->file); + free(bind_conf->arg); + free(bind_conf->settings.interface); + LIST_DELETE(&bind_conf->by_fe); + free(bind_conf->rhttp_srvname); + free(bind_conf); + } + + flt_deinit(p); + + list_for_each_entry(pxdf, &proxy_deinit_list, list) + pxdf->fct(p); + + free(p->desc); + + http_ext_clean(p); + + task_destroy(p->task); + + pool_destroy(p->req_cap_pool); + pool_destroy(p->rsp_cap_pool); + + stktable_deinit(p->table); + ha_free(&p->table); + + HA_RWLOCK_DESTROY(&p->lbprm.lock); + HA_RWLOCK_DESTROY(&p->lock); + + proxy_unref_defaults(p); + ha_free(&p); +} + +/* + * This function returns a string containing a name describing capabilities to + * report comprehensible error messages. Specifically, it will return the words + * "frontend", "backend" when appropriate, "defaults" if it corresponds to a + * defaults section, or "proxy" for all other cases including the proxies + * declared in "listen" mode. + */ +const char *proxy_cap_str(int cap) +{ + if (cap & PR_CAP_DEF) + return "defaults"; + + if ((cap & PR_CAP_LISTEN) != PR_CAP_LISTEN) { + if (cap & PR_CAP_FE) + return "frontend"; + else if (cap & PR_CAP_BE) + return "backend"; + } + return "proxy"; +} + +/* + * This function returns a string containing the mode of the proxy in a format + * suitable for error messages. + */ +const char *proxy_mode_str(int mode) { + + if (mode == PR_MODE_TCP) + return "tcp"; + else if (mode == PR_MODE_HTTP) + return "http"; + else if (mode == PR_MODE_CLI) + return "cli"; + else if (mode == PR_MODE_SYSLOG) + return "syslog"; + else if (mode == PR_MODE_PEERS) + return "peers"; + else + return "unknown"; +} + +/* try to find among known options the one that looks closest to <word> by + * counting transitions between letters, digits and other characters. Will + * return the best matching word if found, otherwise NULL. An optional array + * of extra words to compare may be passed in <extra>, but it must then be + * terminated by a NULL entry. If unused it may be NULL. + */ +const char *proxy_find_best_option(const char *word, const char **extra) +{ + uint8_t word_sig[1024]; + uint8_t list_sig[1024]; + const char *best_ptr = NULL; + int dist, best_dist = INT_MAX; + int index; + + make_word_fingerprint(word_sig, word); + + for (index = 0; cfg_opts[index].name; index++) { + make_word_fingerprint(list_sig, cfg_opts[index].name); + dist = word_fingerprint_distance(word_sig, list_sig); + if (dist < best_dist) { + best_dist = dist; + best_ptr = cfg_opts[index].name; + } + } + + for (index = 0; cfg_opts2[index].name; index++) { + make_word_fingerprint(list_sig, cfg_opts2[index].name); + dist = word_fingerprint_distance(word_sig, list_sig); + if (dist < best_dist) { + best_dist = dist; + best_ptr = cfg_opts2[index].name; + } + } + + while (extra && *extra) { + make_word_fingerprint(list_sig, *extra); + dist = word_fingerprint_distance(word_sig, list_sig); + if (dist < best_dist) { + best_dist = dist; + best_ptr = *extra; + } + extra++; + } + + if (best_dist > 2 * strlen(word) || (best_ptr && best_dist > 2 * strlen(best_ptr))) + best_ptr = NULL; + return best_ptr; +} + +/* This function parses a "timeout" statement in a proxy section. It returns + * -1 if there is any error, 1 for a warning, otherwise zero. If it does not + * return zero, it will write an error or warning message into a preallocated + * buffer returned at <err>. 
The trailing newline is not written. The function must
+ * be called with <args> pointing to the first command line word, with <proxy>
+ * pointing to the proxy being parsed, and <defpx> to the default proxy or NULL.
+ * As a special case for compatibility with older configs, it also recognizes
+ * the obsolete "{cli|srv|con}timeout" forms in args[0] and rejects them with
+ * an explicit message.
+ */
+static int proxy_parse_timeout(char **args, int section, struct proxy *proxy,
+                               const struct proxy *defpx, const char *file, int line,
+                               char **err)
+{
+	unsigned timeout;
+	int retval, cap;
+	const char *res, *name;
+	int *tv = NULL;
+	const int *td = NULL;
+
+	retval = 0;
+
+	/* simply skip "timeout" but remain compatible with old form */
+	if (strcmp(args[0], "timeout") == 0)
+		args++;
+
+	name = args[0];
+	if (strcmp(args[0], "client") == 0) {
+		name = "client";
+		tv = &proxy->timeout.client;
+		td = &defpx->timeout.client;
+		cap = PR_CAP_FE;
+	} else if (strcmp(args[0], "tarpit") == 0) {
+		tv = &proxy->timeout.tarpit;
+		td = &defpx->timeout.tarpit;
+		cap = PR_CAP_FE | PR_CAP_BE;
+	} else if (strcmp(args[0], "client-hs") == 0) {
+		tv = &proxy->timeout.client_hs;
+		td = &defpx->timeout.client_hs;
+		cap = PR_CAP_FE;
+	} else if (strcmp(args[0], "http-keep-alive") == 0) {
+		tv = &proxy->timeout.httpka;
+		td = &defpx->timeout.httpka;
+		cap = PR_CAP_FE | PR_CAP_BE;
+	} else if (strcmp(args[0], "http-request") == 0) {
+		tv = &proxy->timeout.httpreq;
+		td = &defpx->timeout.httpreq;
+		cap = PR_CAP_FE | PR_CAP_BE;
+	} else if (strcmp(args[0], "server") == 0) {
+		name = "server";
+		tv = &proxy->timeout.server;
+		td = &defpx->timeout.server;
+		cap = PR_CAP_BE;
+	} else if (strcmp(args[0], "connect") == 0) {
+		name = "connect";
+		tv = &proxy->timeout.connect;
+		td = &defpx->timeout.connect;
+		cap = PR_CAP_BE;
+	} else if (strcmp(args[0], "check") == 0) {
+		tv = &proxy->timeout.check;
+		td = &defpx->timeout.check;
+		cap = PR_CAP_BE;
+	} else if (strcmp(args[0], "queue") == 0) {
+		tv = &proxy->timeout.queue;
+		td = &defpx->timeout.queue;
+		cap = PR_CAP_BE;
+	} else if (strcmp(args[0], "tunnel") == 0) {
+		tv = &proxy->timeout.tunnel;
+		td = &defpx->timeout.tunnel;
+		cap = PR_CAP_BE;
+	} else if (strcmp(args[0], "client-fin") == 0) {
+		tv = &proxy->timeout.clientfin;
+		td = &defpx->timeout.clientfin;
+		cap = PR_CAP_FE;
+	} else if (strcmp(args[0], "server-fin") == 0) {
+		tv = &proxy->timeout.serverfin;
+		td = &defpx->timeout.serverfin;
+		cap = PR_CAP_BE;
+	} else if (strcmp(args[0], "clitimeout") == 0) {
+		memprintf(err, "the '%s' directive is not supported anymore since HAProxy 2.1. Use 'timeout client'.", args[0]);
+		return -1;
+	} else if (strcmp(args[0], "srvtimeout") == 0) {
+		memprintf(err, "the '%s' directive is not supported anymore since HAProxy 2.1. Use 'timeout server'.", args[0]);
+		return -1;
+	} else if (strcmp(args[0], "contimeout") == 0) {
+		memprintf(err, "the '%s' directive is not supported anymore since HAProxy 2.1.
Use 'timeout connect'.", args[0]);
+		return -1;
+	} else {
+		memprintf(err,
+		          "'timeout' supports 'client', 'server', 'connect', 'check', "
+		          "'queue', 'client-hs', 'http-keep-alive', 'http-request', 'tunnel', 'tarpit', "
+		          "'client-fin' and 'server-fin' (got '%s')",
+		          args[0]);
+		return -1;
+	}
+
+	if (*args[1] == 0) {
+		memprintf(err, "'timeout %s' expects an integer value (in milliseconds)", name);
+		return -1;
+	}
+
+	res = parse_time_err(args[1], &timeout, TIME_UNIT_MS);
+	if (res == PARSE_TIME_OVER) {
+		memprintf(err, "timer overflow in argument '%s' to 'timeout %s' (maximum value is 2147483647 ms or ~24.8 days)",
+		          args[1], name);
+		return -1;
+	}
+	else if (res == PARSE_TIME_UNDER) {
+		memprintf(err, "timer underflow in argument '%s' to 'timeout %s' (minimum non-null value is 1 ms)",
+		          args[1], name);
+		return -1;
+	}
+	else if (res) {
+		memprintf(err, "unexpected character '%c' in 'timeout %s'", *res, name);
+		return -1;
+	}
+
+	if (!(proxy->cap & cap)) {
+		memprintf(err, "'timeout %s' will be ignored because %s '%s' has no %s capability",
+		          name, proxy_type_str(proxy), proxy->id,
+		          (cap & PR_CAP_BE) ? "backend" : "frontend");
+		retval = 1;
+	}
+	else if (defpx && *tv != *td) {
+		memprintf(err, "overwriting 'timeout %s' which was already specified", name);
+		retval = 1;
+	}
+
+	if (*args[2] != 0) {
+		memprintf(err, "'timeout %s' : unexpected extra argument '%s' after value '%s'.", name, args[2], args[1]);
+		retval = -1;
+	}
+
+	*tv = MS_TO_TICKS(timeout);
+	return retval;
+}
+
+/* This function parses a "rate-limit" statement in a proxy section. It returns
+ * -1 if there is any error, 1 for a warning, otherwise zero. If it does not
+ * return zero, it will write an error or warning message into a preallocated
+ * buffer returned at <err>. The function must be called with <args> pointing
+ * to the first command line word, with <proxy> pointing to the proxy being
+ * parsed, and <defpx> to the default proxy or NULL.
+ */
+static int proxy_parse_rate_limit(char **args, int section, struct proxy *proxy,
+                                  const struct proxy *defpx, const char *file, int line,
+                                  char **err)
+{
+	int retval;
+	char *res;
+	unsigned int *tv = NULL;
+	const unsigned int *td = NULL;
+	unsigned int val;
+
+	retval = 0;
+
+	if (strcmp(args[1], "sessions") == 0) {
+		tv = &proxy->fe_sps_lim;
+		td = &defpx->fe_sps_lim;
+	}
+	else {
+		memprintf(err, "'%s' only supports 'sessions' (got '%s')", args[0], args[1]);
+		return -1;
+	}
+
+	if (*args[2] == 0) {
+		memprintf(err, "'%s %s' expects an integer value (in sessions/second)", args[0], args[1]);
+		return -1;
+	}
+
+	val = strtoul(args[2], &res, 0);
+	if (*res) {
+		memprintf(err, "'%s %s' : unexpected character '%c' in integer value '%s'", args[0], args[1], *res, args[2]);
+		return -1;
+	}
+
+	if (!(proxy->cap & PR_CAP_FE)) {
+		memprintf(err, "%s %s will be ignored because %s '%s' has no frontend capability",
+		          args[0], args[1], proxy_type_str(proxy), proxy->id);
+		retval = 1;
+	}
+	else if (defpx && *tv != *td) {
+		memprintf(err, "overwriting %s %s which was already specified", args[0], args[1]);
+		retval = 1;
+	}
+
+	*tv = val;
+	return retval;
+}
+
+/* This function parses a "max-keep-alive-queue" statement in a proxy section.
+ * It returns -1 if there is any error, 1 for a warning, otherwise zero. If it
+ * does not return zero, it will write an error or warning message into a
+ * preallocated buffer returned at <err>.
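+ * Passing -1 disables the limit check: the value is stored internally as
+ * <val+1>, so -1 collapses to zero which means "keep the default behaviour".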
+ * The function must be called with <args> pointing to the first command line
+ * word, with <proxy> pointing to the proxy being parsed, and <defpx> to the
+ * default proxy or NULL.
+ */
+static int proxy_parse_max_ka_queue(char **args, int section, struct proxy *proxy,
+                                    const struct proxy *defpx, const char *file, int line,
+                                    char **err)
+{
+	int retval;
+	char *res;
+	unsigned int val;
+
+	retval = 0;
+
+	if (*args[1] == 0) {
+		memprintf(err, "'%s' expects an integer value (or -1 to disable)", args[0]);
+		return -1;
+	}
+
+	val = strtol(args[1], &res, 0);
+	if (*res) {
+		memprintf(err, "'%s' : unexpected character '%c' in integer value '%s'", args[0], *res, args[1]);
+		return -1;
+	}
+
+	if (!(proxy->cap & PR_CAP_BE)) {
+		memprintf(err, "%s will be ignored because %s '%s' has no backend capability",
+		          args[0], proxy_type_str(proxy), proxy->id);
+		retval = 1;
+	}
+
+	/* we store <val+1> so that a user-facing value of -1 is stored as zero (default) */
+	proxy->max_ka_queue = val + 1;
+	return retval;
+}
+
+/* This function parses a "declare" statement in a proxy section. It returns -1
+ * if there is any error, 1 for a warning, otherwise 0. If it does not return
+ * zero, it will write an error or warning message into a preallocated buffer
+ * returned at <err>. The function must be called with <args> pointing to the
+ * first command line word, with <proxy> pointing to the proxy being parsed,
+ * and <defpx> to the default proxy or NULL.
+ */
+static int proxy_parse_declare(char **args, int section, struct proxy *curpx,
+                               const struct proxy *defpx, const char *file, int line,
+                               char **err)
+{
+	/* The capture keyword cannot be declared in a defaults section. */
+	if (curpx == defpx) {
+		memprintf(err, "'%s' not available in default section", args[0]);
+		return -1;
+	}
+
+	/* The capture keyword is only available in frontends. */
+	if (!(curpx->cap & PR_CAP_FE)) {
+		memprintf(err, "'%s' only available in frontend or listen section", args[0]);
+		return -1;
+	}
+
+	/* Check the mandatory second keyword. */
+	if (!args[1] || !*args[1]) {
+		memprintf(err, "'%s' needs a second keyword that specifies the type of declaration ('capture')", args[0]);
+		return -1;
+	}
+
+	/* For now, "declare" is only used to declare capture slots, but in the
+	 * future it may declare maps or variables as well, so check and switch
+	 * according to the second keyword.
+	 */
+	if (strcmp(args[1], "capture") == 0) {
+		char *error = NULL;
+		long len;
+		struct cap_hdr *hdr;
+
+		/* Check the next keyword. */
+		if (!args[2] || !*args[2] ||
+		    (strcmp(args[2], "response") != 0 &&
+		     strcmp(args[2], "request") != 0)) {
+			memprintf(err, "'%s %s' requires a direction ('request' or 'response')", args[0], args[1]);
+			return -1;
+		}
+
+		/* Check the 'len' keyword. */
+		if (!args[3] || !*args[3] || strcmp(args[3], "len") != 0) {
+			memprintf(err, "'%s %s' requires a capture length ('len')", args[0], args[1]);
+			return -1;
+		}
+
+		/* Check the length value. */
+		if (!args[4] || !*args[4]) {
+			memprintf(err, "'%s %s': 'len' requires a numeric value that represents the "
+			               "capture length",
+			          args[0], args[1]);
+			return -1;
+		}
+
+		/* convert the length value. */
+		len = strtol(args[4], &error, 10);
+		if (*error != '\0') {
+			memprintf(err, "'%s %s': cannot parse the length '%s'.",
+			          args[0], args[1], args[4]);
+			return -1;
+		}
+
+		/* check length. */
+		if (len <= 0) {
+			memprintf(err, "length must be > 0");
+			return -1;
+		}
+
+		/* register the capture.
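+		 * The new slot is prepended to the proxy's req_cap or rsp_cap
+		 * list and takes the current capture count as its index, which
+		 * is then incremented; the pool is sized len+1 to leave room
+		 * for the trailing zero.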
+		 */
+		hdr = calloc(1, sizeof(*hdr));
+		if (!hdr) {
+			memprintf(err, "proxy '%s': out of memory while registering a capture", curpx->id);
+			return -1;
+		}
+		hdr->name = NULL; /* not a header capture */
+		hdr->namelen = 0;
+		hdr->len = len;
+		hdr->pool = create_pool("caphdr", hdr->len + 1, MEM_F_SHARED);
+
+		if (strcmp(args[2], "request") == 0) {
+			hdr->next = curpx->req_cap;
+			hdr->index = curpx->nb_req_cap++;
+			curpx->req_cap = hdr;
+		}
+		if (strcmp(args[2], "response") == 0) {
+			hdr->next = curpx->rsp_cap;
+			hdr->index = curpx->nb_rsp_cap++;
+			curpx->rsp_cap = hdr;
+		}
+		return 0;
+	}
+	else {
+		memprintf(err, "unknown declaration type '%s' (supports 'capture')", args[1]);
+		return -1;
+	}
+}
+
+/* This function parses a "retry-on" statement */
+static int
+proxy_parse_retry_on(char **args, int section, struct proxy *curpx,
+                     const struct proxy *defpx, const char *file, int line,
+                     char **err)
+{
+	int i;
+
+	if (!(*args[1])) {
+		memprintf(err, "'%s' needs at least one keyword to specify when to retry", args[0]);
+		return -1;
+	}
+	if (!(curpx->cap & PR_CAP_BE)) {
+		memprintf(err, "'%s' only available in backend or listen section", args[0]);
+		return -1;
+	}
+	curpx->retry_type = 0;
+	for (i = 1; *(args[i]); i++) {
+		if (strcmp(args[i], "conn-failure") == 0)
+			curpx->retry_type |= PR_RE_CONN_FAILED;
+		else if (strcmp(args[i], "empty-response") == 0)
+			curpx->retry_type |= PR_RE_DISCONNECTED;
+		else if (strcmp(args[i], "response-timeout") == 0)
+			curpx->retry_type |= PR_RE_TIMEOUT;
+		else if (strcmp(args[i], "401") == 0)
+			curpx->retry_type |= PR_RE_401;
+		else if (strcmp(args[i], "403") == 0)
+			curpx->retry_type |= PR_RE_403;
+		else if (strcmp(args[i], "404") == 0)
+			curpx->retry_type |= PR_RE_404;
+		else if (strcmp(args[i], "408") == 0)
+			curpx->retry_type |= PR_RE_408;
+		else if (strcmp(args[i], "425") == 0)
+			curpx->retry_type |= PR_RE_425;
+		else if (strcmp(args[i], "500") == 0)
+			curpx->retry_type |= PR_RE_500;
+		else if (strcmp(args[i], "501") == 0)
+			curpx->retry_type |= PR_RE_501;
+		else if (strcmp(args[i], "502") == 0)
+			curpx->retry_type |= PR_RE_502;
+		else if (strcmp(args[i], "503") == 0)
+			curpx->retry_type |= PR_RE_503;
+		else if (strcmp(args[i], "504") == 0)
+			curpx->retry_type |= PR_RE_504;
+		else if (strcmp(args[i], "0rtt-rejected") == 0)
+			curpx->retry_type |= PR_RE_EARLY_ERROR;
+		else if (strcmp(args[i], "junk-response") == 0)
+			curpx->retry_type |= PR_RE_JUNK_REQUEST;
+		else if (strcmp(args[i], "all-retryable-errors") == 0)
+			curpx->retry_type |= PR_RE_CONN_FAILED | PR_RE_DISCONNECTED |
+			                     PR_RE_TIMEOUT | PR_RE_500 | PR_RE_502 |
+			                     PR_RE_503 | PR_RE_504 | PR_RE_EARLY_ERROR |
+			                     PR_RE_JUNK_REQUEST;
+		else if (strcmp(args[i], "none") == 0) {
+			if (i != 1 || *args[i + 1]) {
+				memprintf(err, "'%s': the 'none' keyword is only usable alone", args[0]);
+				return -1;
+			}
+		} else {
+			memprintf(err, "'%s': unknown keyword '%s'", args[0], args[i]);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+#ifdef TCP_KEEPCNT
+/* This function parses "{cli|srv}tcpka-cnt" statements */
+static int proxy_parse_tcpka_cnt(char **args, int section, struct proxy *proxy,
+                                 const struct proxy *defpx, const char *file, int line,
+                                 char **err)
+{
+	int retval;
+	char *res;
+	unsigned int tcpka_cnt;
+
+	retval = 0;
+
+	if (*args[1] == 0) {
+		memprintf(err, "'%s' expects an integer value", args[0]);
+		return -1;
+	}
+
+	tcpka_cnt = strtol(args[1], &res, 0);
+	if (*res) {
+		memprintf(err, "'%s' : unexpected character '%c' in integer value '%s'", args[0], *res, args[1]);
+		return -1;
+	}
+
+	if (strcmp(args[0],
"clitcpka-cnt") == 0) { + if (!(proxy->cap & PR_CAP_FE)) { + memprintf(err, "%s will be ignored because %s '%s' has no frontend capability", + args[0], proxy_type_str(proxy), proxy->id); + retval = 1; + } + proxy->clitcpka_cnt = tcpka_cnt; + } else if (strcmp(args[0], "srvtcpka-cnt") == 0) { + if (!(proxy->cap & PR_CAP_BE)) { + memprintf(err, "%s will be ignored because %s '%s' has no backend capability", + args[0], proxy_type_str(proxy), proxy->id); + retval = 1; + } + proxy->srvtcpka_cnt = tcpka_cnt; + } else { + /* unreachable */ + memprintf(err, "'%s': unknown keyword", args[0]); + return -1; + } + + return retval; +} +#endif + +#ifdef TCP_KEEPIDLE +/* This function parses "{cli|srv}tcpka-idle" statements */ +static int proxy_parse_tcpka_idle(char **args, int section, struct proxy *proxy, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + int retval; + const char *res; + unsigned int tcpka_idle; + + retval = 0; + + if (*args[1] == 0) { + memprintf(err, "'%s' expects an integer value", args[0]); + return -1; + } + res = parse_time_err(args[1], &tcpka_idle, TIME_UNIT_S); + if (res == PARSE_TIME_OVER) { + memprintf(err, "timer overflow in argument '%s' to '%s' (maximum value is 2147483647 ms or ~24.8 days)", + args[1], args[0]); + return -1; + } + else if (res == PARSE_TIME_UNDER) { + memprintf(err, "timer underflow in argument '%s' to '%s' (minimum non-null value is 1 ms)", + args[1], args[0]); + return -1; + } + else if (res) { + memprintf(err, "unexpected character '%c' in argument to <%s>.\n", *res, args[0]); + return -1; + } + + if (strcmp(args[0], "clitcpka-idle") == 0) { + if (!(proxy->cap & PR_CAP_FE)) { + memprintf(err, "%s will be ignored because %s '%s' has no frontend capability", + args[0], proxy_type_str(proxy), proxy->id); + retval = 1; + } + proxy->clitcpka_idle = tcpka_idle; + } else if (strcmp(args[0], "srvtcpka-idle") == 0) { + if (!(proxy->cap & PR_CAP_BE)) { + memprintf(err, "%s will be ignored because %s '%s' has no backend capability", + args[0], proxy_type_str(proxy), proxy->id); + retval = 1; + } + proxy->srvtcpka_idle = tcpka_idle; + } else { + /* unreachable */ + memprintf(err, "'%s': unknown keyword", args[0]); + return -1; + } + + return retval; +} +#endif + +#ifdef TCP_KEEPINTVL +/* This function parses "{cli|srv}tcpka-intvl" statements */ +static int proxy_parse_tcpka_intvl(char **args, int section, struct proxy *proxy, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + int retval; + const char *res; + unsigned int tcpka_intvl; + + retval = 0; + + if (*args[1] == 0) { + memprintf(err, "'%s' expects an integer value", args[0]); + return -1; + } + res = parse_time_err(args[1], &tcpka_intvl, TIME_UNIT_S); + if (res == PARSE_TIME_OVER) { + memprintf(err, "timer overflow in argument '%s' to '%s' (maximum value is 2147483647 ms or ~24.8 days)", + args[1], args[0]); + return -1; + } + else if (res == PARSE_TIME_UNDER) { + memprintf(err, "timer underflow in argument '%s' to '%s' (minimum non-null value is 1 ms)", + args[1], args[0]); + return -1; + } + else if (res) { + memprintf(err, "unexpected character '%c' in argument to <%s>.\n", *res, args[0]); + return -1; + } + + if (strcmp(args[0], "clitcpka-intvl") == 0) { + if (!(proxy->cap & PR_CAP_FE)) { + memprintf(err, "%s will be ignored because %s '%s' has no frontend capability", + args[0], proxy_type_str(proxy), proxy->id); + retval = 1; + } + proxy->clitcpka_intvl = tcpka_intvl; + } else if (strcmp(args[0], "srvtcpka-intvl") == 0) { + if (!(proxy->cap & 
PR_CAP_BE)) { + memprintf(err, "%s will be ignored because %s '%s' has no backend capability", + args[0], proxy_type_str(proxy), proxy->id); + retval = 1; + } + proxy->srvtcpka_intvl = tcpka_intvl; + } else { + /* unreachable */ + memprintf(err, "'%s': unknown keyword", args[0]); + return -1; + } + + return retval; +} +#endif + +/* This function inserts proxy <px> into the tree of known proxies (regular + * ones or defaults depending on px->cap & PR_CAP_DEF). The proxy's name is + * used as the storing key so it must already have been initialized. + */ +void proxy_store_name(struct proxy *px) +{ + struct eb_root *root = (px->cap & PR_CAP_DEF) ? &defproxy_by_name : &proxy_by_name; + + px->conf.by_name.key = px->id; + ebis_insert(root, &px->conf.by_name); +} + +/* Returns a pointer to the first proxy matching capabilities <cap> and id + * <id>. NULL is returned if no match is found. If <table> is non-zero, it + * only considers proxies having a table. + */ +struct proxy *proxy_find_by_id(int id, int cap, int table) +{ + struct eb32_node *n; + + for (n = eb32_lookup(&used_proxy_id, id); n; n = eb32_next(n)) { + struct proxy *px = container_of(n, struct proxy, conf.id); + + if (px->uuid != id) + break; + + if ((px->cap & cap) != cap) + continue; + + if (table && (!px->table || !px->table->size)) + continue; + + return px; + } + return NULL; +} + +/* Returns a pointer to the first proxy matching either name <name>, or id + * <name> if <name> begins with a '#'. NULL is returned if no match is found. + * If <table> is non-zero, it only considers proxies having a table. The search + * is made into the regular proxies, unless <cap> has PR_CAP_DEF set in which + * case it's searched into the defproxy tree. + */ +struct proxy *proxy_find_by_name(const char *name, int cap, int table) +{ + struct proxy *curproxy; + + if (*name == '#' && !(cap & PR_CAP_DEF)) { + curproxy = proxy_find_by_id(atoi(name + 1), cap, table); + if (curproxy) + return curproxy; + } + else { + struct eb_root *root; + struct ebpt_node *node; + + root = (cap & PR_CAP_DEF) ? &defproxy_by_name : &proxy_by_name; + for (node = ebis_lookup(root, name); node; node = ebpt_next(node)) { + curproxy = container_of(node, struct proxy, conf.by_name); + + if (strcmp(curproxy->id, name) != 0) + break; + + if ((curproxy->cap & cap) != cap) + continue; + + if (table && (!curproxy->table || !curproxy->table->size)) + continue; + + return curproxy; + } + } + return NULL; +} + +/* Finds the best match for a proxy with capabilities <cap>, name <name> and id + * <id>. At most one of <id> or <name> may be different provided that <cap> is + * valid. Either <id> or <name> may be left unspecified (0). The purpose is to + * find a proxy based on some information from a previous configuration, across + * reloads or during information exchange between peers. + * + * Names are looked up first if present, then IDs are compared if present. 
In + * case of an inexact match whatever is forced in the configuration has + * precedence in the following order : + * - 1) forced ID (proves a renaming / change of proxy type) + * - 2) proxy name+type (may indicate a move if ID differs) + * - 3) automatic ID+type (may indicate a renaming) + * + * Depending on what is found, we can end up in the following situations : + * + * name id cap | possible causes + * -------------+----------------- + * -- -- -- | nothing found + * -- -- ok | nothing found + * -- ok -- | proxy deleted, ID points to next one + * -- ok ok | proxy renamed, or deleted with ID pointing to next one + * ok -- -- | proxy deleted, but other half with same name still here (before) + * ok -- ok | proxy's ID changed (proxy moved in the config file) + * ok ok -- | proxy deleted, but other half with same name still here (after) + * ok ok ok | perfect match + * + * Upon return if <diff> is not NULL, it is zeroed then filled with up to 3 bits : + * - PR_FBM_MISMATCH_ID : proxy was found but ID differs + * (and ID was not zero) + * - PR_FBM_MISMATCH_NAME : proxy was found by ID but name differs + * (and name was not NULL) + * - PR_FBM_MISMATCH_PROXYTYPE : a proxy of different type was found with + * the same name and/or id + * + * Only a valid proxy is returned. If capabilities do not match, NULL is + * returned. The caller can check <diff> to report detailed warnings / errors, + * and decide whether or not to use what was found. + */ +struct proxy *proxy_find_best_match(int cap, const char *name, int id, int *diff) +{ + struct proxy *byname; + struct proxy *byid; + + if (!name && !id) + return NULL; + + if (diff) + *diff = 0; + + byname = byid = NULL; + + if (name) { + byname = proxy_find_by_name(name, cap, 0); + if (byname && (!id || byname->uuid == id)) + return byname; + } + + /* remaining possibilities : + * - name not set + * - name set but not found + * - name found, but ID doesn't match. + */ + if (id) { + byid = proxy_find_by_id(id, cap, 0); + if (byid) { + if (byname) { + /* id+type found, name+type found, but not all 3. + * ID wins only if forced, otherwise name wins. + */ + if (byid->options & PR_O_FORCED_ID) { + if (diff) + *diff |= PR_FBM_MISMATCH_NAME; + return byid; + } + else { + if (diff) + *diff |= PR_FBM_MISMATCH_ID; + return byname; + } + } + + /* remaining possibilities : + * - name not set + * - name set but not found + */ + if (name && diff) + *diff |= PR_FBM_MISMATCH_NAME; + return byid; + } + + /* ID not found */ + if (byname) { + if (diff) + *diff |= PR_FBM_MISMATCH_ID; + return byname; + } + } + + /* All remaining possibilities will lead to NULL. If we can report more + * detailed information to the caller about changed types and/or name, + * we'll do it. For example, we could detect that "listen foo" was + * split into "frontend foo_ft" and "backend foo_bk" if IDs are forced. 
+ * - name not set, ID not found
+ * - name not found, ID not set
+ * - name not found, ID not found
+ */
+	if (!diff)
+		return NULL;
+
+	if (name) {
+		byname = proxy_find_by_name(name, 0, 0);
+		if (byname && (!id || byname->uuid == id))
+			*diff |= PR_FBM_MISMATCH_PROXYTYPE;
+	}
+
+	if (id) {
+		byid = proxy_find_by_id(id, 0, 0);
+		if (byid) {
+			if (!name)
+				*diff |= PR_FBM_MISMATCH_PROXYTYPE; /* only type changed */
+			else if (byid->options & PR_O_FORCED_ID)
+				*diff |= PR_FBM_MISMATCH_NAME | PR_FBM_MISMATCH_PROXYTYPE; /* name and type changed */
+			/* otherwise it's a different proxy that was returned */
+		}
+	}
+	return NULL;
+}
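+
+/* Illustrative sketch (hypothetical caller): code recovering a proxy across a
+ * reload can inspect the mismatch bits reported in <diff>, e.g.:
+ *
+ *	int diff;
+ *	struct proxy *px = proxy_find_best_match(PR_CAP_BE, name, id, &diff);
+ *
+ *	if (px && (diff & PR_FBM_MISMATCH_ID))
+ *		ha_warning("proxy '%s' was renumbered\n", name);
+ */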
+
+/*
+ * This function finds a server with matching name within selected proxy.
+ * It also checks if there are more servers matching the requested name, as
+ * this often leads to unexpected situations.
+ */
+struct server *findserver(const struct proxy *px, const char *name) {
+
+	struct server *cursrv, *target = NULL;
+
+	if (!px)
+		return NULL;
+
+	for (cursrv = px->srv; cursrv; cursrv = cursrv->next) {
+		if (strcmp(cursrv->id, name) != 0)
+			continue;
+
+		if (!target) {
+			target = cursrv;
+			continue;
+		}
+
+		ha_alert("Refusing to use duplicated server '%s' found in proxy: %s!\n",
+		         name, px->id);
+
+		return NULL;
+	}
+
+	return target;
+}
+
+/*
+ * This function finds a server with matching "<puid> x <rid>" within
+ * selected proxy <px>.
+ * Using the combination of proxy-uid + revision id ensures that the function
+ * will either return the server we're expecting or NULL if it has been removed
+ * from the proxy.
+ */
+struct server *findserver_unique_id(const struct proxy *px, int puid, uint32_t rid) {
+
+	struct server *cursrv;
+
+	if (!px)
+		return NULL;
+
+	for (cursrv = px->srv; cursrv; cursrv = cursrv->next) {
+		if (cursrv->puid == puid && cursrv->rid == rid)
+			return cursrv;
+	}
+
+	return NULL;
+}
+
+/*
+ * This function finds a server with matching "<name> x <rid>" within
+ * selected proxy <px>.
+ * Using the combination of name + revision id ensures that the function will
+ * either return the server we're expecting or NULL if it has been removed
+ * from the proxy.
+ */
+struct server *findserver_unique_name(const struct proxy *px, const char *name, uint32_t rid) {
+
+	struct server *cursrv;
+
+	if (!px)
+		return NULL;
+
+	for (cursrv = px->srv; cursrv; cursrv = cursrv->next) {
+		if (!strcmp(cursrv->id, name) && cursrv->rid == rid)
+			return cursrv;
+	}
+
+	return NULL;
+}
+
+/* This function checks that the designated proxy has no http directives
+ * enabled. It will output a warning if there are, and will fix some of them.
+ * It returns the number of fatal errors encountered. This should be called
+ * at the end of the configuration parsing if the proxy is not in http mode.
+ */
+int proxy_cfg_ensure_no_http(struct proxy *curproxy)
+{
+	if (curproxy->cookie_name != NULL) {
+		ha_warning("cookie will be ignored for %s '%s' (needs 'mode http').\n",
+		           proxy_type_str(curproxy), curproxy->id);
+	}
+	if (isttest(curproxy->monitor_uri)) {
+		ha_warning("monitor-uri will be ignored for %s '%s' (needs 'mode http').\n",
+		           proxy_type_str(curproxy), curproxy->id);
+	}
+	if (curproxy->lbprm.algo & BE_LB_NEED_HTTP) {
+		curproxy->lbprm.algo &= ~BE_LB_ALGO;
+		curproxy->lbprm.algo |= BE_LB_ALGO_RR;
+		ha_warning("Layer 7 hash not possible for %s '%s' (needs 'mode http'). Falling back to round robin.\n",
+		           proxy_type_str(curproxy), curproxy->id);
+	}
+	if (curproxy->to_log & (LW_REQ | LW_RESP)) {
+		curproxy->to_log &= ~(LW_REQ | LW_RESP);
+		ha_warning("parsing [%s:%d] : HTTP log/header format not usable with %s '%s' (needs 'mode http').\n",
+		           curproxy->conf.lfs_file, curproxy->conf.lfs_line,
+		           proxy_type_str(curproxy), curproxy->id);
+	}
+	if (curproxy->conf.logformat_string == default_http_log_format ||
+	    curproxy->conf.logformat_string == clf_http_log_format) {
+		/* Note: we don't change the directive's file:line number */
+		curproxy->conf.logformat_string = default_tcp_log_format;
+		ha_warning("parsing [%s:%d] : 'option httplog' not usable with %s '%s' (needs 'mode http'). Falling back to 'option tcplog'.\n",
+		           curproxy->conf.lfs_file, curproxy->conf.lfs_line,
+		           proxy_type_str(curproxy), curproxy->id);
+	}
+	else if (curproxy->conf.logformat_string == default_https_log_format) {
+		/* Note: we don't change the directive's file:line number */
+		curproxy->conf.logformat_string = default_tcp_log_format;
+		ha_warning("parsing [%s:%d] : 'option httpslog' not usable with %s '%s' (needs 'mode http'). Falling back to 'option tcplog'.\n",
+		           curproxy->conf.lfs_file, curproxy->conf.lfs_line,
+		           proxy_type_str(curproxy), curproxy->id);
+	}
+
+	return 0;
+}
+
+/* This function checks that the designated proxy has no log directives
+ * enabled. It will output a warning if there are, and will fix some of them.
+ * It returns the number of fatal errors encountered. This should be called
+ * at the end of the configuration parsing if the proxy is not in log mode.
+ */
+int proxy_cfg_ensure_no_log(struct proxy *curproxy)
+{
+	if (curproxy->lbprm.algo & BE_LB_NEED_LOG) {
+		curproxy->lbprm.algo &= ~BE_LB_ALGO;
+		curproxy->lbprm.algo |= BE_LB_ALGO_RR;
+		ha_warning("Unusable balance algorithm for %s '%s' (needs 'mode log'). Falling back to round robin.\n",
+		           proxy_type_str(curproxy), curproxy->id);
+	}
+
+	return 0;
+}
+
+/* Perform the most basic initialization of a proxy: memset(), list inits and
+ * timeout resets. Any new proxy or peer should be initialized via this
+ * function.
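+ *
+ * Minimal allocation sketch (alloc_new_proxy() below does exactly this):
+ *
+ *	struct proxy *p = calloc(1, sizeof(*p));
+ *
+ *	if (p)
+ *		init_new_proxy(p);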
+ */ +void init_new_proxy(struct proxy *p) +{ + memset(p, 0, sizeof(struct proxy)); + p->obj_type = OBJ_TYPE_PROXY; + queue_init(&p->queue, p, NULL); + LIST_INIT(&p->acl); + LIST_INIT(&p->http_req_rules); + LIST_INIT(&p->http_res_rules); + LIST_INIT(&p->http_after_res_rules); + LIST_INIT(&p->redirect_rules); + LIST_INIT(&p->mon_fail_cond); + LIST_INIT(&p->switching_rules); + LIST_INIT(&p->server_rules); + LIST_INIT(&p->persist_rules); + LIST_INIT(&p->sticking_rules); + LIST_INIT(&p->storersp_rules); + LIST_INIT(&p->tcp_req.inspect_rules); + LIST_INIT(&p->tcp_rep.inspect_rules); + LIST_INIT(&p->tcp_req.l4_rules); + LIST_INIT(&p->tcp_req.l5_rules); + MT_LIST_INIT(&p->listener_queue); + LIST_INIT(&p->loggers); + LIST_INIT(&p->logformat); + LIST_INIT(&p->logformat_sd); + LIST_INIT(&p->format_unique_id); + LIST_INIT(&p->logformat_error); + LIST_INIT(&p->conf.bind); + LIST_INIT(&p->conf.listeners); + LIST_INIT(&p->conf.errors); + LIST_INIT(&p->conf.args.list); + LIST_INIT(&p->filter_configs); + LIST_INIT(&p->tcpcheck_rules.preset_vars); + + p->defsrv.id = "default-server"; + p->conf.used_listener_id = EB_ROOT; + p->conf.used_server_id = EB_ROOT; + p->used_server_addr = EB_ROOT_UNIQUE; + + /* Timeouts are defined as -1 */ + proxy_reset_timeouts(p); + p->tcp_rep.inspect_delay = TICK_ETERNITY; + + /* initial uuid is unassigned (-1) */ + p->uuid = -1; + + /* Default to only allow L4 retries */ + p->retry_type = PR_RE_CONN_FAILED; + + p->extra_counters_fe = NULL; + p->extra_counters_be = NULL; + + HA_RWLOCK_INIT(&p->lock); + + /* initialize the default settings */ + proxy_preset_defaults(p); +} + +/* Preset default settings onto proxy <defproxy>. */ +void proxy_preset_defaults(struct proxy *defproxy) +{ + defproxy->mode = PR_MODE_TCP; + defproxy->flags = 0; + if (!(defproxy->cap & PR_CAP_INT)) { + defproxy->maxconn = cfg_maxpconn; + defproxy->conn_retries = CONN_RETRIES; + } + defproxy->redispatch_after = 0; + defproxy->options = PR_O_REUSE_SAFE; + if (defproxy->cap & PR_CAP_INT) + defproxy->options2 |= PR_O2_INDEPSTR; + defproxy->max_out_conns = MAX_SRV_LIST; + + defproxy->defsrv.check.inter = DEF_CHKINTR; + defproxy->defsrv.check.fastinter = 0; + defproxy->defsrv.check.downinter = 0; + defproxy->defsrv.agent.inter = DEF_CHKINTR; + defproxy->defsrv.agent.fastinter = 0; + defproxy->defsrv.agent.downinter = 0; + defproxy->defsrv.check.rise = DEF_RISETIME; + defproxy->defsrv.check.fall = DEF_FALLTIME; + defproxy->defsrv.agent.rise = DEF_AGENT_RISETIME; + defproxy->defsrv.agent.fall = DEF_AGENT_FALLTIME; + defproxy->defsrv.check.port = 0; + defproxy->defsrv.agent.port = 0; + defproxy->defsrv.maxqueue = 0; + defproxy->defsrv.minconn = 0; + defproxy->defsrv.maxconn = 0; + defproxy->defsrv.max_reuse = -1; + defproxy->defsrv.max_idle_conns = -1; + defproxy->defsrv.pool_purge_delay = 5000; + defproxy->defsrv.slowstart = 0; + defproxy->defsrv.onerror = DEF_HANA_ONERR; + defproxy->defsrv.consecutive_errors_limit = DEF_HANA_ERRLIMIT; + defproxy->defsrv.uweight = defproxy->defsrv.iweight = 1; + LIST_INIT(&defproxy->defsrv.pp_tlvs); + + defproxy->email_alert.level = LOG_ALERT; + defproxy->load_server_state_from_file = PR_SRV_STATE_FILE_UNSPEC; + + if (defproxy->cap & PR_CAP_INT) + defproxy->timeout.connect = 5000; +} + +/* Frees all dynamic settings allocated on a default proxy that's about to be + * destroyed. This is a subset of the complete proxy deinit code, but these + * should probably be merged ultimately. 
Note that most of the fields are not + * even reset, so extreme care is required here, and calling + * proxy_preset_defaults() afterwards would be safer. + */ +void proxy_free_defaults(struct proxy *defproxy) +{ + struct acl *acl, *aclb; + struct logger *log, *logb; + struct cap_hdr *h,*h_next; + + ha_free(&defproxy->id); + ha_free(&defproxy->conf.file); + ha_free((char **)&defproxy->defsrv.conf.file); + ha_free(&defproxy->check_command); + ha_free(&defproxy->check_path); + ha_free(&defproxy->cookie_name); + ha_free(&defproxy->rdp_cookie_name); + ha_free(&defproxy->dyncookie_key); + ha_free(&defproxy->cookie_domain); + ha_free(&defproxy->cookie_attrs); + ha_free(&defproxy->lbprm.arg_str); + ha_free(&defproxy->capture_name); + istfree(&defproxy->monitor_uri); + ha_free(&defproxy->defbe.name); + ha_free(&defproxy->conn_src.iface_name); + istfree(&defproxy->server_id_hdr_name); + + http_ext_clean(defproxy); + + list_for_each_entry_safe(acl, aclb, &defproxy->acl, list) { + LIST_DELETE(&acl->list); + prune_acl(acl); + free(acl); + } + + free_act_rules(&defproxy->tcp_req.inspect_rules); + free_act_rules(&defproxy->tcp_rep.inspect_rules); + free_act_rules(&defproxy->tcp_req.l4_rules); + free_act_rules(&defproxy->tcp_req.l5_rules); + free_act_rules(&defproxy->http_req_rules); + free_act_rules(&defproxy->http_res_rules); + free_act_rules(&defproxy->http_after_res_rules); + + h = defproxy->req_cap; + while (h) { + h_next = h->next; + free(h->name); + pool_destroy(h->pool); + free(h); + h = h_next; + } + + h = defproxy->rsp_cap; + while (h) { + h_next = h->next; + free(h->name); + pool_destroy(h->pool); + free(h); + h = h_next; + } + + if (defproxy->conf.logformat_string != default_http_log_format && + defproxy->conf.logformat_string != default_tcp_log_format && + defproxy->conf.logformat_string != clf_http_log_format && + defproxy->conf.logformat_string != default_https_log_format) { + ha_free(&defproxy->conf.logformat_string); + } + + if (defproxy->conf.logformat_sd_string != default_rfc5424_sd_log_format) + ha_free(&defproxy->conf.logformat_sd_string); + + list_for_each_entry_safe(log, logb, &defproxy->loggers, list) { + LIST_DEL_INIT(&log->list); + free_logger(log); + } + + ha_free(&defproxy->conf.uniqueid_format_string); + ha_free(&defproxy->conf.error_logformat_string); + ha_free(&defproxy->conf.lfs_file); + ha_free(&defproxy->conf.lfsd_file); + ha_free(&defproxy->conf.uif_file); + ha_free(&defproxy->conf.elfs_file); + chunk_destroy(&defproxy->log_tag); + + free_email_alert(defproxy); + proxy_release_conf_errors(defproxy); + deinit_proxy_tcpcheck(defproxy); + + /* FIXME: we cannot free uri_auth because it might already be used by + * another proxy (legacy code for stats URI ...). Refcount anyone ? + */ +} + +/* delete a defproxy from the tree if still in it, frees its content and its + * storage. Nothing is done if <px> is NULL or if it doesn't have PR_CAP_DEF + * set, allowing to pass it the direct result of a lookup function. + */ +void proxy_destroy_defaults(struct proxy *px) +{ + if (!px) + return; + if (!(px->cap & PR_CAP_DEF)) + return; + BUG_ON(px->conf.refcount != 0); + ebpt_delete(&px->conf.by_name); + proxy_free_defaults(px); + free(px); +} + +/* delete all unreferenced default proxies. A default proxy is unreferenced if + * its refcount is equal to zero. 
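+ *
+ * The refcount itself is driven by the pair of helpers below (sketch):
+ *
+ *	proxy_ref_defaults(px, defpx);	// defpx->conf.refcount++
+ *	proxy_unref_defaults(px);	// --refcount; destroyed when it hits 0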
+ */
+void proxy_destroy_all_unref_defaults()
+{
+	struct ebpt_node *n;
+
+	n = ebpt_first(&defproxy_by_name);
+	while (n) {
+		struct proxy *px = container_of(n, struct proxy, conf.by_name);
+		BUG_ON(!(px->cap & PR_CAP_DEF));
+		n = ebpt_next(n);
+		if (!px->conf.refcount)
+			proxy_destroy_defaults(px);
+	}
+}
+
+/* Add a reference on the default proxy <defpx> for the proxy <px>. Nothing is
+ * done if <px> already references <defpx>. Otherwise, the default proxy
+ * refcount is incremented by one. For now, this operation is not thread safe
+ * and is performed during the init stage only.
+ */
+void proxy_ref_defaults(struct proxy *px, struct proxy *defpx)
+{
+	if (px->defpx == defpx)
+		return;
+	BUG_ON(px->defpx != NULL);
+	px->defpx = defpx;
+	defpx->conf.refcount++;
+}
+
+/* proxy <px> removes its reference on its default proxy. The default proxy
+ * refcount is decremented by one. If it was the last reference, the
+ * corresponding default proxy is destroyed. For now this operation is not
+ * thread safe and is performed during the deinit stage only.
+ */
+void proxy_unref_defaults(struct proxy *px)
+{
+	if (px->defpx == NULL)
+		return;
+	if (!--px->defpx->conf.refcount)
+		proxy_destroy_defaults(px->defpx);
+	px->defpx = NULL;
+}
+
+/* Allocates a new proxy <name> of type <cap>.
+ * Returns the proxy instance on success. On error, NULL is returned.
+ */
+struct proxy *alloc_new_proxy(const char *name, unsigned int cap, char **errmsg)
+{
+	struct proxy *curproxy;
+
+	if ((curproxy = calloc(1, sizeof(*curproxy))) == NULL) {
+		memprintf(errmsg, "proxy '%s': out of memory", name);
+		goto fail;
+	}
+
+	init_new_proxy(curproxy);
+	curproxy->last_change = ns_to_sec(now_ns);
+	curproxy->id = strdup(name);
+	curproxy->cap = cap;
+
+	if (!(cap & PR_CAP_INT))
+		proxy_store_name(curproxy);
+
+ done:
+	return curproxy;
+
+ fail:
+	/* Note: in case of fatal error here, we WILL make valgrind unhappy,
+	 * but it's not worth trying to unroll everything here just before
+	 * quitting.
+	 */
+	free(curproxy);
+	return NULL;
+}
+
+/* Copy the proxy settings from <defproxy> to <curproxy>.
+ * Returns 0 on success.
+ * Returns 1 on error. <errmsg> will be allocated with an error description.
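+ *
+ * Hypothetical caller sketch; this mirrors what parse_new_proxy() does
+ * further down:
+ *
+ *	char *errmsg = NULL;
+ *
+ *	if (proxy_defproxy_cpy(curproxy, defproxy, &errmsg)) {
+ *		ha_alert("%s\n", errmsg);
+ *		free(errmsg);
+ *	}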
+ */ +static int proxy_defproxy_cpy(struct proxy *curproxy, const struct proxy *defproxy, + char **errmsg) +{ + struct logger *tmplogger; + char *tmpmsg = NULL; + + /* set default values from the specified default proxy */ + srv_settings_cpy(&curproxy->defsrv, &defproxy->defsrv, 0); + + curproxy->flags = (defproxy->flags & PR_FL_DISABLED); /* Only inherit from disabled flag */ + curproxy->options = defproxy->options; + curproxy->options2 = defproxy->options2; + curproxy->no_options = defproxy->no_options; + curproxy->no_options2 = defproxy->no_options2; + curproxy->retry_type = defproxy->retry_type; + curproxy->tcp_req.inspect_delay = defproxy->tcp_req.inspect_delay; + curproxy->tcp_rep.inspect_delay = defproxy->tcp_rep.inspect_delay; + + http_ext_clean(curproxy); + http_ext_dup(defproxy, curproxy); + + if (isttest(defproxy->server_id_hdr_name)) + curproxy->server_id_hdr_name = istdup(defproxy->server_id_hdr_name); + + /* initialize error relocations */ + if (!proxy_dup_default_conf_errors(curproxy, defproxy, &tmpmsg)) { + memprintf(errmsg, "proxy '%s' : %s", curproxy->id, tmpmsg); + free(tmpmsg); + return 1; + } + + if (curproxy->cap & PR_CAP_FE) { + curproxy->maxconn = defproxy->maxconn; + curproxy->backlog = defproxy->backlog; + curproxy->fe_sps_lim = defproxy->fe_sps_lim; + + curproxy->to_log = defproxy->to_log & ~LW_COOKIE & ~LW_REQHDR & ~ LW_RSPHDR; + curproxy->max_out_conns = defproxy->max_out_conns; + + curproxy->clitcpka_cnt = defproxy->clitcpka_cnt; + curproxy->clitcpka_idle = defproxy->clitcpka_idle; + curproxy->clitcpka_intvl = defproxy->clitcpka_intvl; + } + + if (curproxy->cap & PR_CAP_BE) { + curproxy->lbprm.algo = defproxy->lbprm.algo; + curproxy->lbprm.hash_balance_factor = defproxy->lbprm.hash_balance_factor; + curproxy->fullconn = defproxy->fullconn; + curproxy->conn_retries = defproxy->conn_retries; + curproxy->redispatch_after = defproxy->redispatch_after; + curproxy->max_ka_queue = defproxy->max_ka_queue; + + curproxy->tcpcheck_rules.flags = (defproxy->tcpcheck_rules.flags & ~TCPCHK_RULES_UNUSED_RS); + curproxy->tcpcheck_rules.list = defproxy->tcpcheck_rules.list; + if (!LIST_ISEMPTY(&defproxy->tcpcheck_rules.preset_vars)) { + if (!dup_tcpcheck_vars(&curproxy->tcpcheck_rules.preset_vars, + &defproxy->tcpcheck_rules.preset_vars)) { + memprintf(errmsg, "proxy '%s': failed to duplicate tcpcheck preset-vars", curproxy->id); + return 1; + } + } + + curproxy->ck_opts = defproxy->ck_opts; + + if (defproxy->cookie_name) + curproxy->cookie_name = strdup(defproxy->cookie_name); + curproxy->cookie_len = defproxy->cookie_len; + + if (defproxy->dyncookie_key) + curproxy->dyncookie_key = strdup(defproxy->dyncookie_key); + if (defproxy->cookie_domain) + curproxy->cookie_domain = strdup(defproxy->cookie_domain); + + if (defproxy->cookie_maxidle) + curproxy->cookie_maxidle = defproxy->cookie_maxidle; + + if (defproxy->cookie_maxlife) + curproxy->cookie_maxlife = defproxy->cookie_maxlife; + + if (defproxy->rdp_cookie_name) + curproxy->rdp_cookie_name = strdup(defproxy->rdp_cookie_name); + curproxy->rdp_cookie_len = defproxy->rdp_cookie_len; + + if (defproxy->cookie_attrs) + curproxy->cookie_attrs = strdup(defproxy->cookie_attrs); + + if (defproxy->lbprm.arg_str) + curproxy->lbprm.arg_str = strdup(defproxy->lbprm.arg_str); + curproxy->lbprm.arg_len = defproxy->lbprm.arg_len; + curproxy->lbprm.arg_opt1 = defproxy->lbprm.arg_opt1; + curproxy->lbprm.arg_opt2 = defproxy->lbprm.arg_opt2; + curproxy->lbprm.arg_opt3 = defproxy->lbprm.arg_opt3; + + if (defproxy->conn_src.iface_name) + 
curproxy->conn_src.iface_name = strdup(defproxy->conn_src.iface_name); + curproxy->conn_src.iface_len = defproxy->conn_src.iface_len; + curproxy->conn_src.opts = defproxy->conn_src.opts; +#if defined(CONFIG_HAP_TRANSPARENT) + curproxy->conn_src.tproxy_addr = defproxy->conn_src.tproxy_addr; +#endif + curproxy->load_server_state_from_file = defproxy->load_server_state_from_file; + + curproxy->srvtcpka_cnt = defproxy->srvtcpka_cnt; + curproxy->srvtcpka_idle = defproxy->srvtcpka_idle; + curproxy->srvtcpka_intvl = defproxy->srvtcpka_intvl; + } + + if (curproxy->cap & PR_CAP_FE) { + if (defproxy->capture_name) + curproxy->capture_name = strdup(defproxy->capture_name); + curproxy->capture_namelen = defproxy->capture_namelen; + curproxy->capture_len = defproxy->capture_len; + + curproxy->nb_req_cap = defproxy->nb_req_cap; + curproxy->req_cap = defproxy->req_cap; + + curproxy->nb_rsp_cap = defproxy->nb_rsp_cap; + curproxy->rsp_cap = defproxy->rsp_cap; + } + + if (curproxy->cap & PR_CAP_FE) { + curproxy->timeout.client = defproxy->timeout.client; + curproxy->timeout.client_hs = defproxy->timeout.client_hs; + curproxy->timeout.clientfin = defproxy->timeout.clientfin; + curproxy->timeout.tarpit = defproxy->timeout.tarpit; + curproxy->timeout.httpreq = defproxy->timeout.httpreq; + curproxy->timeout.httpka = defproxy->timeout.httpka; + if (isttest(defproxy->monitor_uri)) + curproxy->monitor_uri = istdup(defproxy->monitor_uri); + if (defproxy->defbe.name) + curproxy->defbe.name = strdup(defproxy->defbe.name); + + /* get either a pointer to the logformat string or a copy of it */ + curproxy->conf.logformat_string = defproxy->conf.logformat_string; + if (curproxy->conf.logformat_string && + curproxy->conf.logformat_string != default_http_log_format && + curproxy->conf.logformat_string != default_tcp_log_format && + curproxy->conf.logformat_string != clf_http_log_format && + curproxy->conf.logformat_string != default_https_log_format) + curproxy->conf.logformat_string = strdup(curproxy->conf.logformat_string); + + if (defproxy->conf.lfs_file) { + curproxy->conf.lfs_file = strdup(defproxy->conf.lfs_file); + curproxy->conf.lfs_line = defproxy->conf.lfs_line; + } + + /* get either a pointer to the logformat string for RFC5424 structured-data or a copy of it */ + curproxy->conf.logformat_sd_string = defproxy->conf.logformat_sd_string; + if (curproxy->conf.logformat_sd_string && + curproxy->conf.logformat_sd_string != default_rfc5424_sd_log_format) + curproxy->conf.logformat_sd_string = strdup(curproxy->conf.logformat_sd_string); + + if (defproxy->conf.lfsd_file) { + curproxy->conf.lfsd_file = strdup(defproxy->conf.lfsd_file); + curproxy->conf.lfsd_line = defproxy->conf.lfsd_line; + } + + curproxy->conf.error_logformat_string = defproxy->conf.error_logformat_string; + if (curproxy->conf.error_logformat_string) + curproxy->conf.error_logformat_string = strdup(curproxy->conf.error_logformat_string); + + if (defproxy->conf.elfs_file) { + curproxy->conf.elfs_file = strdup(defproxy->conf.elfs_file); + curproxy->conf.elfs_line = defproxy->conf.elfs_line; + } + } + + if (curproxy->cap & PR_CAP_BE) { + curproxy->timeout.connect = defproxy->timeout.connect; + curproxy->timeout.server = defproxy->timeout.server; + curproxy->timeout.serverfin = defproxy->timeout.serverfin; + curproxy->timeout.check = defproxy->timeout.check; + curproxy->timeout.queue = defproxy->timeout.queue; + curproxy->timeout.tarpit = defproxy->timeout.tarpit; + curproxy->timeout.httpreq = defproxy->timeout.httpreq; + curproxy->timeout.httpka = 
defproxy->timeout.httpka; + curproxy->timeout.tunnel = defproxy->timeout.tunnel; + curproxy->conn_src.source_addr = defproxy->conn_src.source_addr; + } + + curproxy->mode = defproxy->mode; + curproxy->uri_auth = defproxy->uri_auth; /* for stats */ + + /* copy default loggers to curproxy */ + list_for_each_entry(tmplogger, &defproxy->loggers, list) { + struct logger *node = dup_logger(tmplogger); + + if (!node) { + memprintf(errmsg, "proxy '%s': out of memory", curproxy->id); + return 1; + } + LIST_APPEND(&curproxy->loggers, &node->list); + } + + curproxy->conf.uniqueid_format_string = defproxy->conf.uniqueid_format_string; + if (curproxy->conf.uniqueid_format_string) + curproxy->conf.uniqueid_format_string = strdup(curproxy->conf.uniqueid_format_string); + + chunk_dup(&curproxy->log_tag, &defproxy->log_tag); + + if (defproxy->conf.uif_file) { + curproxy->conf.uif_file = strdup(defproxy->conf.uif_file); + curproxy->conf.uif_line = defproxy->conf.uif_line; + } + + /* copy default header unique id */ + if (isttest(defproxy->header_unique_id)) { + const struct ist copy = istdup(defproxy->header_unique_id); + + if (!isttest(copy)) { + memprintf(errmsg, "proxy '%s': out of memory for unique-id-header", curproxy->id); + return 1; + } + curproxy->header_unique_id = copy; + } + + /* default compression options */ + if (defproxy->comp != NULL) { + curproxy->comp = calloc(1, sizeof(*curproxy->comp)); + if (!curproxy->comp) { + memprintf(errmsg, "proxy '%s': out of memory for default compression options", curproxy->id); + return 1; + } + curproxy->comp->algos_res = defproxy->comp->algos_res; + curproxy->comp->algo_req = defproxy->comp->algo_req; + curproxy->comp->types_res = defproxy->comp->types_res; + curproxy->comp->types_req = defproxy->comp->types_req; + curproxy->comp->flags = defproxy->comp->flags; + } + + if (defproxy->check_path) + curproxy->check_path = strdup(defproxy->check_path); + if (defproxy->check_command) + curproxy->check_command = strdup(defproxy->check_command); + + if (defproxy->email_alert.mailers.name) + curproxy->email_alert.mailers.name = strdup(defproxy->email_alert.mailers.name); + if (defproxy->email_alert.from) + curproxy->email_alert.from = strdup(defproxy->email_alert.from); + if (defproxy->email_alert.to) + curproxy->email_alert.to = strdup(defproxy->email_alert.to); + if (defproxy->email_alert.myhostname) + curproxy->email_alert.myhostname = strdup(defproxy->email_alert.myhostname); + curproxy->email_alert.level = defproxy->email_alert.level; + curproxy->email_alert.set = defproxy->email_alert.set; + + return 0; +} + +/* Allocates a new proxy <name> of type <cap> found at position <file:linenum>, + * preset it from the defaults of <defproxy> and returns it. In case of error, + * an alert is printed and NULL is returned. 
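+ *
+ * Typical invocation from the section parser (a sketch, names assumed):
+ *
+ *	curproxy = parse_new_proxy("app", PR_CAP_LISTEN, file, linenum, defproxy);
+ *	if (!curproxy)
+ *		return -1;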
+ */ +struct proxy *parse_new_proxy(const char *name, unsigned int cap, + const char *file, int linenum, + const struct proxy *defproxy) +{ + struct proxy *curproxy = NULL; + char *errmsg = NULL; + + if (!(curproxy = alloc_new_proxy(name, cap, &errmsg))) { + ha_alert("parsing [%s:%d] : %s\n", file, linenum, errmsg); + free(errmsg); + return NULL; + } + + if (defproxy) { + if (proxy_defproxy_cpy(curproxy, defproxy, &errmsg)) { + ha_alert("parsing [%s:%d] : %s\n", file, linenum, errmsg); + free(errmsg); + + ha_free(&curproxy); + return NULL; + } + } + + curproxy->conf.args.file = curproxy->conf.file = strdup(file); + curproxy->conf.args.line = curproxy->conf.line = linenum; + + return curproxy; +} + +/* to be called under the proxy lock after pausing some listeners. This will + * automatically update the p->flags flag + */ +void proxy_cond_pause(struct proxy *p) +{ + if (p->li_ready) + return; + p->flags |= PR_FL_PAUSED; +} + +/* to be called under the proxy lock after resuming some listeners. This will + * automatically update the p->flags flag + */ +void proxy_cond_resume(struct proxy *p) +{ + if (!p->li_ready) + return; + p->flags &= ~PR_FL_PAUSED; +} + +/* to be called under the proxy lock after stopping some listeners. This will + * automatically update the p->flags flag after stopping the last one, and + * will emit a log indicating the proxy's condition. The function is idempotent + * so that it will not emit multiple logs; a proxy will be disabled only once. + */ +void proxy_cond_disable(struct proxy *p) +{ + if (p->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) + return; + + if (p->li_ready + p->li_paused > 0) + return; + + p->flags |= PR_FL_STOPPED; + + /* Note: syslog proxies use their own loggers so while it's somewhat OK + * to report them being stopped as a warning, we must not spam their log + * servers which are in fact production servers. For other types (CLI, + * peers, etc) we must not report them at all as they're not really on + * the data plane but on the control plane. + */ + if ((p->mode == PR_MODE_TCP || p->mode == PR_MODE_HTTP || p->mode == PR_MODE_SYSLOG) && !(p->cap & PR_CAP_INT)) + ha_warning("Proxy %s stopped (cumulated conns: FE: %lld, BE: %lld).\n", + p->id, p->fe_counters.cum_conn, p->be_counters.cum_conn); + + if ((p->mode == PR_MODE_TCP || p->mode == PR_MODE_HTTP) && !(p->cap & PR_CAP_INT)) + send_log(p, LOG_WARNING, "Proxy %s stopped (cumulated conns: FE: %lld, BE: %lld).\n", + p->id, p->fe_counters.cum_conn, p->be_counters.cum_conn); + + if (p->table && p->table->size && p->table->sync_task) + task_wakeup(p->table->sync_task, TASK_WOKEN_MSG); + + if (p->task) + task_wakeup(p->task, TASK_WOKEN_MSG); +} + +/* + * This is the proxy management task. It enables proxies when there are enough + * free streams, or stops them when the table is full. It is designed to be + * called as a task which is woken up upon stopping or when rate limiting must + * be enforced. + */ +struct task *manage_proxy(struct task *t, void *context, unsigned int state) +{ + struct proxy *p = context; + int next = TICK_ETERNITY; + unsigned int wait; + + /* We should periodically try to enable listeners waiting for a + * global resource here. + */ + + /* If the proxy holds a stick table, we need to purge all unused + * entries. These are all the ones in the table with ref_cnt == 0 + * and all the ones in the pool used to allocate new entries. Any + * entry attached to an existing stream waiting for a store will + * be in neither list. Any entry being dumped will have ref_cnt > 0. 
+ * However we protect tables that are being synced to peers.
+ */
+	if (unlikely(stopping && (p->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) && p->table && p->table->current)) {
+
+		if (!p->table->refcnt) {
+			/* !table->refcnt means there
+			 * is no more pending full resync
+			 * to push to a new process and
+			 * we are free to flush the table.
+			 */
+			int budget;
+			int cleaned_up;
+
+			/* We purposely enforce a budget limitation since we don't
+			 * want to spend too much time purging old entries.
+			 *
+			 * This is known to cause the watchdog to occasionally
+			 * trigger if the table is huge and all entries become
+			 * available for purge at the same time.
+			 *
+			 * Moreover, we must also anticipate the pool_gc() call,
+			 * which will also be much slower if there is too much work
+			 * at once.
+			 */
+			budget = MIN(p->table->current, (1 << 15)); /* max: 32K */
+			cleaned_up = stktable_trash_oldest(p->table, budget);
+			if (cleaned_up) {
+				/* immediately release freed memory since we are stopping */
+				pool_gc(NULL);
+				if (cleaned_up > (budget / 2)) {
+					/* most of the budget was used to purge entries,
+					 * it is very likely that there are still trashable
+					 * entries in the table, reschedule a new cleanup
+					 * attempt ASAP
+					 */
+					t->expire = TICK_ETERNITY;
+					task_wakeup(t, TASK_WOKEN_RES);
+					return t;
+				}
+			}
+		}
+		if (p->table->current) {
+			/* some entries still remain but are not yet available
+			 * for cleanup, let's recheck in one second
+			 */
+			next = tick_first(next, tick_add(now_ms, 1000));
+		}
+	}
+
+	/* the rest below is just for frontends */
+	if (!(p->cap & PR_CAP_FE))
+		goto out;
+
+	/* check the various reasons we may find to block the frontend */
+	if (unlikely(p->feconn >= p->maxconn))
+		goto out;
+
+	if (p->fe_sps_lim &&
+	    (wait = next_event_delay(&p->fe_sess_per_sec, p->fe_sps_lim, 0))) {
+		/* we're blocking because a limit was reached on the number of
+		 * requests/s on the frontend. We want to re-check ASAP, which
+		 * means in 1 ms before estimated expiration date, because the
+		 * timer will have settled down.
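+		 *
+		 * Rough illustration (numbers assumed): with fe_sps_lim set to
+		 * 100 sessions/s and the measured rate just above the limit,
+		 * next_event_delay() could return e.g. 12, so the task re-arms
+		 * itself roughly 12 ms from now via tick_add(now_ms, wait).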
+ */ + next = tick_first(next, tick_add(now_ms, wait)); + goto out; + } + + /* The proxy is not limited so we can re-enable any waiting listener */ + dequeue_proxy_listeners(p); + out: + t->expire = next; + task_queue(t); + return t; +} + + +static int proxy_parse_grace(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + const char *res; + + if (!*args[1]) { + memprintf(err, "'%s' expects <time> as argument.\n", args[0]); + return -1; + } + res = parse_time_err(args[1], &global.grace_delay, TIME_UNIT_MS); + if (res == PARSE_TIME_OVER) { + memprintf(err, "timer overflow in argument '%s' to '%s' (maximum value is 2147483647 ms or ~24.8 days)", + args[1], args[0]); + return -1; + } + else if (res == PARSE_TIME_UNDER) { + memprintf(err, "timer underflow in argument '%s' to '%s' (minimum non-null value is 1 ms)", + args[1], args[0]); + return -1; + } + else if (res) { + memprintf(err, "unexpected character '%c' in argument to <%s>.\n", *res, args[0]); + return -1; + } + return 0; +} + +static int proxy_parse_hard_stop_after(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + const char *res; + + if (!*args[1]) { + memprintf(err, "'%s' expects <time> as argument.\n", args[0]); + return -1; + } + res = parse_time_err(args[1], &global.hard_stop_after, TIME_UNIT_MS); + if (res == PARSE_TIME_OVER) { + memprintf(err, "timer overflow in argument '%s' to '%s' (maximum value is 2147483647 ms or ~24.8 days)", + args[1], args[0]); + return -1; + } + else if (res == PARSE_TIME_UNDER) { + memprintf(err, "timer underflow in argument '%s' to '%s' (minimum non-null value is 1 ms)", + args[1], args[0]); + return -1; + } + else if (res) { + memprintf(err, "unexpected character '%c' in argument to <%s>.\n", *res, args[0]); + return -1; + } + return 0; +} + +static int proxy_parse_close_spread_time(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + const char *res; + + if (!*args[1]) { + memprintf(err, "'%s' expects <time> as argument.\n", args[0]); + return -1; + } + + /* If close-spread-time is set to "infinite", disable the active connection + * closing during soft-stop. 
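+	 *
+	 * A hypothetical global-section line making use of this:
+	 *
+	 *	close-spread-time 30s     (or "close-spread-time infinite")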
+ */ + if (strcmp(args[1], "infinite") == 0) { + global.tune.options |= GTUNE_DISABLE_ACTIVE_CLOSE; + global.close_spread_time = TICK_ETERNITY; + return 0; + } + + res = parse_time_err(args[1], &global.close_spread_time, TIME_UNIT_MS); + if (res == PARSE_TIME_OVER) { + memprintf(err, "timer overflow in argument '%s' to '%s' (maximum value is 2147483647 ms or ~24.8 days)", + args[1], args[0]); + return -1; + } + else if (res == PARSE_TIME_UNDER) { + memprintf(err, "timer underflow in argument '%s' to '%s' (minimum non-null value is 1 ms)", + args[1], args[0]); + return -1; + } + else if (res) { + memprintf(err, "unexpected character '%c' in argument to <%s>.\n", *res, args[0]); + return -1; + } + global.tune.options &= ~GTUNE_DISABLE_ACTIVE_CLOSE; + + return 0; +} + +struct task *hard_stop(struct task *t, void *context, unsigned int state) +{ + struct proxy *p; + struct stream *s; + int thr; + + if (killed) { + ha_warning("Some tasks resisted to hard-stop, exiting now.\n"); + send_log(NULL, LOG_WARNING, "Some tasks resisted to hard-stop, exiting now.\n"); + killed = 2; + for (thr = 0; thr < global.nbthread; thr++) + if (_HA_ATOMIC_LOAD(&ha_thread_info[thr].tg->threads_enabled) & ha_thread_info[thr].ltid_bit) + wake_thread(thr); + t->expire = TICK_ETERNITY; + return t; + } + + ha_warning("soft-stop running for too long, performing a hard-stop.\n"); + send_log(NULL, LOG_WARNING, "soft-stop running for too long, performing a hard-stop.\n"); + p = proxies_list; + while (p) { + if ((p->cap & PR_CAP_FE) && (p->feconn > 0)) { + ha_warning("Proxy %s hard-stopped (%d remaining conns will be closed).\n", + p->id, p->feconn); + send_log(p, LOG_WARNING, "Proxy %s hard-stopped (%d remaining conns will be closed).\n", + p->id, p->feconn); + } + p = p->next; + } + + thread_isolate(); + + for (thr = 0; thr < global.nbthread; thr++) { + list_for_each_entry(s, &ha_thread_ctx[thr].streams, list) { + stream_shutdown(s, SF_ERR_KILLED); + } + } + + thread_release(); + + killed = 1; + t->expire = tick_add(now_ms, MS_TO_TICKS(1000)); + return t; +} + +/* perform the soft-stop right now (i.e. 
unbind listeners) */
+static void do_soft_stop_now()
+{
+	struct proxy *p;
+	struct task *task;
+
+	/* disable busy polling to avoid cpu eating for the new process */
+	global.tune.options &= ~GTUNE_BUSY_POLLING;
+
+	if (tick_isset(global.close_spread_time)) {
+		global.close_spread_end = tick_add(now_ms, global.close_spread_time);
+	}
+
+	/* schedule a hard-stop after a delay if needed */
+	if (tick_isset(global.hard_stop_after)) {
+		task = task_new_anywhere();
+		if (task) {
+			task->process = hard_stop;
+			task_schedule(task, tick_add(now_ms, global.hard_stop_after));
+		}
+		else {
+			ha_alert("out of memory trying to allocate the hard-stop task.\n");
+		}
+	}
+
+	/* we isolate so that we have a chance of stopping listeners in other groups */
+	thread_isolate();
+
+	/* stop all stoppable listeners */
+	protocol_stop_now();
+
+	thread_release();
+
+	/* Loop on proxies to stop backends */
+	p = proxies_list;
+	while (p) {
+		HA_RWLOCK_WRLOCK(PROXY_LOCK, &p->lock);
+		proxy_cond_disable(p);
+		HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &p->lock);
+		p = p->next;
+	}
+
+	/* signal zero is used to broadcast the "stopping" event */
+	signal_handler(0);
+}
+
+/* triggered by a soft-stop delayed with `grace` */
+static struct task *grace_expired(struct task *t, void *context, unsigned int state)
+{
+	ha_notice("Grace period expired, proceeding with soft-stop now.\n");
+	send_log(NULL, LOG_NOTICE, "Grace period expired, proceeding with soft-stop now.\n");
+	do_soft_stop_now();
+	task_destroy(t);
+	return NULL;
+}
+
+/*
+ * this function initiates the soft-stop of the process, so that it will
+ * quickly be ignored by load balancers.
+ */
+void soft_stop(void)
+{
+	struct task *task;
+
+	stopping = 1;
+
+	if (tick_isset(global.grace_delay)) {
+		task = task_new_anywhere();
+		if (task) {
+			ha_notice("Scheduling a soft-stop in %u ms.\n", global.grace_delay);
+			send_log(NULL, LOG_WARNING, "Scheduling a soft-stop in %u ms.\n", global.grace_delay);
+			task->process = grace_expired;
+			task_schedule(task, tick_add(now_ms, global.grace_delay));
+			return;
+		}
+		else {
+			ha_alert("out of memory trying to allocate the soft-stop task, stopping now.\n");
+		}
+	}
+
+	/* no grace (or failure to enforce it): stop now */
+	do_soft_stop_now();
+}
+
+
+/* Temporarily disables listening on all of the proxy's listeners. Upon
+ * success, the proxy enters the paused state (PR_FL_PAUSED). The function
+ * returns 0 if it fails, or non-zero on success.
+ * The function takes the proxy's lock so it's safe to
+ * call from multiple places.
+ */
+int pause_proxy(struct proxy *p)
+{
+	struct listener *l;
+
+	HA_RWLOCK_WRLOCK(PROXY_LOCK, &p->lock);
+
+	if (!(p->cap & PR_CAP_FE) || (p->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) || !p->li_ready)
+		goto end;
+
+	list_for_each_entry(l, &p->conf.listeners, by_fe)
+		suspend_listener(l, 1, 0);
+
+	if (p->li_ready) {
+		ha_warning("%s %s failed to enter pause mode.\n", proxy_cap_str(p->cap), p->id);
+		send_log(p, LOG_WARNING, "%s %s failed to enter pause mode.\n", proxy_cap_str(p->cap), p->id);
+		HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &p->lock);
+		return 0;
+	}
+end:
+	HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &p->lock);
+	return 1;
+}
+
+/*
+ * This function completely stops a proxy and releases its listeners. It has
+ * to be called when going down in order to release the ports so that another
+ * process may bind to them. It must also be called on disabled proxies at the
+ * end of start-up. If all listeners are closed, the proxy is set to the
+ * PR_FL_STOPPED state.
+ * The function takes the proxy's lock so it's safe to + * call from multiple places. + */ +void stop_proxy(struct proxy *p) +{ + struct listener *l; + + HA_RWLOCK_WRLOCK(PROXY_LOCK, &p->lock); + + list_for_each_entry(l, &p->conf.listeners, by_fe) + stop_listener(l, 1, 0, 0); + + if (!(p->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) && !p->li_ready) { + /* might be just a backend */ + p->flags |= PR_FL_STOPPED; + } + + HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &p->lock); +} + +/* This function resumes listening on the specified proxy. It scans all of its + * listeners and tries to enable them all. If any of them fails, the proxy is + * put back to the paused state. It returns 1 upon success, or zero if an error + * is encountered. + * The function takes the proxy's lock so it's safe to + * call from multiple places. + */ +int resume_proxy(struct proxy *p) +{ + struct listener *l; + int fail; + + HA_RWLOCK_WRLOCK(PROXY_LOCK, &p->lock); + + if ((p->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) || !p->li_paused) + goto end; + + fail = 0; + list_for_each_entry(l, &p->conf.listeners, by_fe) { + if (!resume_listener(l, 1, 0)) { + int port; + + port = get_host_port(&l->rx.addr); + if (port) { + ha_warning("Port %d busy while trying to enable %s %s.\n", + port, proxy_cap_str(p->cap), p->id); + send_log(p, LOG_WARNING, "Port %d busy while trying to enable %s %s.\n", + port, proxy_cap_str(p->cap), p->id); + } + else { + ha_warning("Bind on socket %d busy while trying to enable %s %s.\n", + l->luid, proxy_cap_str(p->cap), p->id); + send_log(p, LOG_WARNING, "Bind on socket %d busy while trying to enable %s %s.\n", + l->luid, proxy_cap_str(p->cap), p->id); + } + + /* Another port might have been enabled. Let's stop everything. */ + fail = 1; + break; + } + } + + if (fail) { + HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &p->lock); + /* pause_proxy will take PROXY_LOCK */ + pause_proxy(p); + return 0; + } +end: + HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &p->lock); + return 1; +} + +/* Set current stream's backend to <be>. Nothing is done if the + * stream already had a backend assigned, which is indicated by + * s->flags & SF_BE_ASSIGNED. + * All flags, stats and counters which need be updated are updated. + * Returns 1 if done, 0 in case of internal error, eg: lack of resource. + */ +int stream_set_backend(struct stream *s, struct proxy *be) +{ + unsigned int req_ana; + + if (s->flags & SF_BE_ASSIGNED) + return 1; + + if (flt_set_stream_backend(s, be) < 0) + return 0; + + s->be = be; + HA_ATOMIC_UPDATE_MAX(&be->be_counters.conn_max, + HA_ATOMIC_ADD_FETCH(&be->beconn, 1)); + proxy_inc_be_ctr(be); + + /* assign new parameters to the stream from the new backend */ + s->scb->flags &= ~SC_FL_INDEP_STR; + if (be->options2 & PR_O2_INDEPSTR) + s->scb->flags |= SC_FL_INDEP_STR; + + /* We want to enable the backend-specific analysers except those which + * were already run as part of the frontend/listener. Note that it would + * be more reliable to store the list of analysers that have been run, + * but what we do here is OK for now. 
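+	 *
+	 * Masking sketch (hypothetical values): if be_req_ana requests
+	 * AN_REQ_HTTP_BODY but the listener's bind_conf->analysers already
+	 * ran it, the "& ~..." below filters it out so it is not run twice.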
+	 */
+	req_ana = be->be_req_ana;
+	if (!(strm_fe(s)->options & PR_O_WREQ_BODY) && be->options & PR_O_WREQ_BODY) {
+		/* The backend requests parsing of the request body while it was
+		 * not performed on the frontend, so add the corresponding analyser
+		 */
+		req_ana |= AN_REQ_HTTP_BODY;
+	}
+	if (IS_HTX_STRM(s) && strm_fe(s)->mode != PR_MODE_HTTP) {
+		/* The stream was already upgraded to HTTP, so remove analysers
+		 * set during the upgrade
+		 */
+		req_ana &= ~(AN_REQ_WAIT_HTTP|AN_REQ_HTTP_PROCESS_FE);
+	}
+	s->req.analysers |= req_ana & ~(strm_li(s) ? strm_li(s)->bind_conf->analysers : 0);
+
+	if (!IS_HTX_STRM(s) && be->mode == PR_MODE_HTTP) {
+		/* If we chain a TCP frontend to an HTX backend, we must upgrade
+		 * the client mux */
+		if (!stream_set_http_mode(s, NULL))
+			return 0;
+	}
+	else if (IS_HTX_STRM(s) && be->mode != PR_MODE_HTTP) {
+		/* If a TCP backend is assigned to an HTX stream, return an
+		 * error. It may happen for a new stream on a previously
+		 * upgraded connection. */
+		if (!(s->flags & SF_ERR_MASK))
+			s->flags |= SF_ERR_INTERNAL;
+		return 0;
+	}
+	else {
+		/* If the target backend requires HTTP processing, we have to allocate
+		 * the HTTP transaction if we did not have one.
+		 */
+		if (unlikely(!s->txn && be->http_needed && !http_create_txn(s)))
+			return 0;
+	}
+
+	s->flags |= SF_BE_ASSIGNED;
+	if (be->options2 & PR_O2_NODELAY) {
+		s->scf->flags |= SC_FL_SND_NEVERWAIT;
+		s->scb->flags |= SC_FL_SND_NEVERWAIT;
+	}
+
+	return 1;
+}
+
+/* Capture a bad request or response and archive it in the proxy's structure.
+ * It is relatively protocol-agnostic so it requires that a number of elements
+ * are passed :
+ *   - <proxy> is the proxy where the error was detected and where the snapshot
+ *     needs to be stored
+ *   - <is_back> indicates that the error happened when receiving the response
+ *   - <other_end> is a pointer to the proxy on the other side when known
+ *   - <target> is the target of the connection, usually a server or a proxy
+ *   - <sess> is the session which experienced the error
+ *   - <ctx> may be NULL or should contain any info relevant to the protocol
+ *   - <buf> is the buffer containing the offending data
+ *   - <buf_ofs> is the position of this buffer's input data in the input
+ *     stream, starting at zero. It may be passed as zero if unknown.
+ *   - <buf_out> is the portion of <buf->data> which was already forwarded and
+ *     which precedes the buffer's input. The buffer's input starts at
+ *     buf->head + buf_out.
+ *   - <err_pos> is the pointer to the faulty byte in the buffer's input.
+ *   - <show> is the callback to use to display <ctx>. It may be NULL.
+ */ +void proxy_capture_error(struct proxy *proxy, int is_back, + struct proxy *other_end, enum obj_type *target, + const struct session *sess, + const struct buffer *buf, long buf_ofs, + unsigned int buf_out, unsigned int err_pos, + const union error_snapshot_ctx *ctx, + void (*show)(struct buffer *, const struct error_snapshot *)) +{ + struct error_snapshot *es; + unsigned int buf_len; + int len1, len2; + unsigned int ev_id; + + ev_id = HA_ATOMIC_FETCH_ADD(&error_snapshot_id, 1); + + buf_len = b_data(buf) - buf_out; + + es = malloc(sizeof(*es) + buf_len); + if (!es) + return; + + es->buf_len = buf_len; + es->ev_id = ev_id; + + len1 = b_size(buf) - b_peek_ofs(buf, buf_out); + if (len1 > buf_len) + len1 = buf_len; + + if (len1) { + memcpy(es->buf, b_peek(buf, buf_out), len1); + len2 = buf_len - len1; + if (len2) + memcpy(es->buf + len1, b_orig(buf), len2); + } + + es->buf_err = err_pos; + es->when = date; // user-visible date + es->srv = objt_server(target); + es->oe = other_end; + if (sess && objt_conn(sess->origin) && conn_get_src(__objt_conn(sess->origin))) + es->src = *__objt_conn(sess->origin)->src; + else + memset(&es->src, 0, sizeof(es->src)); + + es->buf_wrap = b_wrap(buf) - b_peek(buf, buf_out); + es->buf_out = buf_out; + es->buf_ofs = buf_ofs; + + /* be sure to indicate the offset of the first IN byte */ + if (es->buf_ofs >= es->buf_len) + es->buf_ofs -= es->buf_len; + else + es->buf_ofs = 0; + + /* protocol-specific part now */ + if (ctx) + es->ctx = *ctx; + else + memset(&es->ctx, 0, sizeof(es->ctx)); + es->show = show; + + /* note: we still lock since we have to be certain that nobody is + * dumping the output while we free. + */ + HA_RWLOCK_WRLOCK(PROXY_LOCK, &proxy->lock); + if (is_back) { + es = HA_ATOMIC_XCHG(&proxy->invalid_rep, es); + } else { + es = HA_ATOMIC_XCHG(&proxy->invalid_req, es); + } + HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &proxy->lock); + ha_free(&es); +} + +/* Configure all proxies which lack a maxconn setting to use the global one by + * default. This avoids the common mistake consisting in setting maxconn only + * in the global section and discovering the hard way that it doesn't propagate + * through the frontends. These values are also propagated through the various + * targeted backends, whose fullconn is finally calculated if not yet set. + */ +void proxy_adjust_all_maxconn() +{ + struct proxy *curproxy; + struct switching_rule *swrule1, *swrule2; + + for (curproxy = proxies_list; curproxy; curproxy = curproxy->next) { + if (curproxy->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) + continue; + + if (!(curproxy->cap & PR_CAP_FE)) + continue; + + if (!curproxy->maxconn) + curproxy->maxconn = global.maxconn; + + /* update the target backend's fullconn count : default_backend */ + if (curproxy->defbe.be) + curproxy->defbe.be->tot_fe_maxconn += curproxy->maxconn; + else if ((curproxy->cap & PR_CAP_LISTEN) == PR_CAP_LISTEN) + curproxy->tot_fe_maxconn += curproxy->maxconn; + + list_for_each_entry(swrule1, &curproxy->switching_rules, list) { + /* For each target of switching rules, we update their + * tot_fe_maxconn, except if a previous rule points to + * the same backend or to the default backend. + */ + if (swrule1->be.backend != curproxy->defbe.be) { + /* note: swrule1->be.backend isn't a backend if the rule + * is dynamic, it's an expression instead, so it must not + * be dereferenced as a backend before being certain it is. 
+				 */
+				list_for_each_entry(swrule2, &curproxy->switching_rules, list) {
+					if (swrule2 == swrule1) {
+						if (!swrule1->dynamic)
+							swrule1->be.backend->tot_fe_maxconn += curproxy->maxconn;
+						break;
+					}
+					else if (!swrule2->dynamic && swrule2->be.backend == swrule1->be.backend) {
+						/* there are multiple refs of this backend */
+						break;
+					}
+				}
+			}
+		}
+	}
+
+	/* automatically compute fullconn if not set. We must not do it in the
+	 * loop above because cross-references are not yet fully resolved.
+	 */
+	for (curproxy = proxies_list; curproxy; curproxy = curproxy->next) {
+		if (curproxy->flags & (PR_FL_DISABLED|PR_FL_STOPPED))
+			continue;
+
+		/* If <fullconn> is not set, let's set it to 10% of the sum of
+		 * the possible incoming frontend's maxconns.
+		 */
+		if (!curproxy->fullconn && (curproxy->cap & PR_CAP_BE)) {
+			/* we have the sum of the maxconns in <tot_fe_maxconn>. We
+			 * only keep 10% of that sum to set the default fullconn,
+			 * with a hard minimum of 1 (to avoid a divide by zero).
+			 */
+			curproxy->fullconn = (curproxy->tot_fe_maxconn + 9) / 10;
+			if (!curproxy->fullconn)
+				curproxy->fullconn = 1;
+		}
+	}
+}
+
+/* Config keywords below */
+
+static struct cfg_kw_list cfg_kws = {ILH, {
+	{ CFG_GLOBAL, "grace", proxy_parse_grace },
+	{ CFG_GLOBAL, "hard-stop-after", proxy_parse_hard_stop_after },
+	{ CFG_GLOBAL, "close-spread-time", proxy_parse_close_spread_time },
+	{ CFG_LISTEN, "timeout", proxy_parse_timeout },
+	{ CFG_LISTEN, "clitimeout", proxy_parse_timeout }, /* This keyword actually fails to parse, this line remains for better error messages. */
+	{ CFG_LISTEN, "contimeout", proxy_parse_timeout }, /* This keyword actually fails to parse, this line remains for better error messages. */
+	{ CFG_LISTEN, "srvtimeout", proxy_parse_timeout }, /* This keyword actually fails to parse, this line remains for better error messages. */
+	{ CFG_LISTEN, "rate-limit", proxy_parse_rate_limit },
+	{ CFG_LISTEN, "max-keep-alive-queue", proxy_parse_max_ka_queue },
+	{ CFG_LISTEN, "declare", proxy_parse_declare },
+	{ CFG_LISTEN, "retry-on", proxy_parse_retry_on },
+#ifdef TCP_KEEPCNT
+	{ CFG_LISTEN, "clitcpka-cnt", proxy_parse_tcpka_cnt },
+	{ CFG_LISTEN, "srvtcpka-cnt", proxy_parse_tcpka_cnt },
+#endif
+#ifdef TCP_KEEPIDLE
+	{ CFG_LISTEN, "clitcpka-idle", proxy_parse_tcpka_idle },
+	{ CFG_LISTEN, "srvtcpka-idle", proxy_parse_tcpka_idle },
+#endif
+#ifdef TCP_KEEPINTVL
+	{ CFG_LISTEN, "clitcpka-intvl", proxy_parse_tcpka_intvl },
+	{ CFG_LISTEN, "srvtcpka-intvl", proxy_parse_tcpka_intvl },
+#endif
+	{ 0, NULL, NULL },
+}};
+
+INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws);
+
+/* Expects to find a frontend named <arg> and returns it, otherwise displays various
+ * adequate error messages and returns NULL. This function is designed to be used by
+ * functions requiring a frontend on the CLI.
+ */
+struct proxy *cli_find_frontend(struct appctx *appctx, const char *arg)
+{
+	struct proxy *px;
+
+	if (!*arg) {
+		cli_err(appctx, "A frontend name is expected.\n");
+		return NULL;
+	}
+
+	px = proxy_fe_by_name(arg);
+	if (!px) {
+		cli_err(appctx, "No such frontend.\n");
+		return NULL;
+	}
+	return px;
+}
+
+/* Expects to find a backend named <arg> and returns it, otherwise displays various
+ * adequate error messages and returns NULL. This function is designed to be used by
+ * functions requiring a backend on the CLI.
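+ *
+ * Sketch of a hypothetical CLI keyword parser using it:
+ *
+ *	px = cli_find_backend(appctx, args[2]);
+ *	if (!px)
+ *		return 1; // the error message was already sent to the CLI client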
+ */
+struct proxy *cli_find_backend(struct appctx *appctx, const char *arg)
+{
+	struct proxy *px;
+
+	if (!*arg) {
+		cli_err(appctx, "A backend name is expected.\n");
+		return NULL;
+	}
+
+	px = proxy_be_by_name(arg);
+	if (!px) {
+		cli_err(appctx, "No such backend.\n");
+		return NULL;
+	}
+	return px;
+}
+
+
+/* parses a "show servers [state|conn]" CLI line, returns 0 if it wants to
+ * start the dump or 1 if it stops immediately. If an argument is specified,
+ * it will reserve a show_srv_ctx context and set the proxy pointer into ->px,
+ * its ID into ->only_pxid, and ->show_conn to 0 for "state", or 1 for "conn".
+ */
+static int cli_parse_show_servers(char **args, char *payload, struct appctx *appctx, void *private)
+{
+	struct show_srv_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx));
+	struct proxy *px;
+
+	ctx->show_conn = *args[2] == 'c'; // "conn" vs "state"
+
+	/* check if a backend name has been provided */
+	if (*args[3]) {
+		/* look up the requested backend by name */
+		px = proxy_be_by_name(args[3]);
+
+		if (!px)
+			return cli_err(appctx, "Can't find backend.\n");
+
+		ctx->px = px;
+		ctx->only_pxid = px->uuid;
+	}
+	return 0;
+}
+
+/* helper to dump server addr */
+static void dump_server_addr(const struct sockaddr_storage *addr, char *addr_str)
+{
+	addr_str[0] = '\0';
+	switch (addr->ss_family) {
+	case AF_INET:
+	case AF_INET6:
+		addr_to_str(addr, addr_str, INET6_ADDRSTRLEN + 1);
+		break;
+	default:
+		memcpy(addr_str, "-\0", 2);
+		break;
+	}
+}
+
+/* dumps server state information for all the servers found in the backend
+ * designated by the context. This information covers all the parameters which
+ * may change during HAProxy's runtime. By default, we only export to the last
+ * known server state file format. This information can be used at next
+ * startup to recover the same level of server state. It takes its context
+ * from show_srv_ctx, with the proxy pointer from ->px, the proxy's id from
+ * ->only_pxid, the server's pointer from ->sv, and the choice of what to
+ * dump from ->show_conn.
+ */
+static int dump_servers_state(struct stconn *sc)
+{
+	struct appctx *appctx = __sc_appctx(sc);
+	struct show_srv_ctx *ctx = appctx->svcctx;
+	struct proxy *px = ctx->px;
+	struct server *srv;
+	char srv_addr[INET6_ADDRSTRLEN + 1];
+	char srv_agent_addr[INET6_ADDRSTRLEN + 1];
+	char srv_check_addr[INET6_ADDRSTRLEN + 1];
+	time_t srv_time_since_last_change;
+	int bk_f_forced_id, srv_f_forced_id;
+	char *srvrecord;
+
+	if (!ctx->sv)
+		ctx->sv = px->srv;
+
+	for (; ctx->sv != NULL; ctx->sv = srv->next) {
+		srv = ctx->sv;
+
+		dump_server_addr(&srv->addr, srv_addr);
+		dump_server_addr(&srv->check.addr, srv_check_addr);
+		dump_server_addr(&srv->agent.addr, srv_agent_addr);
+
+		srv_time_since_last_change = ns_to_sec(now_ns) - srv->last_change;
+		bk_f_forced_id = px->options & PR_O_FORCED_ID ? 1 : 0;
+		srv_f_forced_id = srv->flags & SRV_F_FORCED_ID ? 1 : 0;
+
+		srvrecord = NULL;
+		if (srv->srvrq && srv->srvrq->name)
+			srvrecord = srv->srvrq->name;
+
+		if (ctx->show_conn == 0) {
+			/* show servers state */
+			chunk_printf(&trash,
+			             "%d %s "
+			             "%d %s %s "
+			             "%d %d %d %d %ld "
+			             "%d %d %d %d %d "
+			             "%d %d %s %u "
+			             "%s %d %d "
+			             "%s %s %d"
+			             "\n",
+			             px->uuid, HA_ANON_CLI(px->id),
+			             srv->puid, HA_ANON_CLI(srv->id),
+			             hash_ipanon(appctx->cli_anon_key, srv_addr, 0),
+			             srv->cur_state, srv->cur_admin, srv->uweight, srv->iweight,
+			             (long int)srv_time_since_last_change,
+			             srv->check.status, srv->check.result, srv->check.health,
+			             srv->check.state & 0x0F, srv->agent.state & 0x1F,
+			             bk_f_forced_id, srv_f_forced_id,
+			             srv->hostname ? HA_ANON_CLI(srv->hostname) : "-", srv->svc_port,
+			             srvrecord ? srvrecord : "-", srv->use_ssl, srv->check.port,
+			             srv_check_addr, srv_agent_addr, srv->agent.port);
+		} else {
+			/* show servers conn */
+			int thr;
+
+			chunk_printf(&trash,
+			             "%s/%s %d/%d %s %u - %u %u %u %u %u %u %d %u",
+			             HA_ANON_CLI(px->id), HA_ANON_CLI(srv->id),
+			             px->uuid, srv->puid, hash_ipanon(appctx->cli_anon_key, srv_addr, 0),
+			             srv->svc_port, srv->pool_purge_delay,
+			             srv->curr_used_conns, srv->max_used_conns, srv->est_need_conns,
+			             srv->curr_idle_nb, srv->curr_safe_nb, (int)srv->max_idle_conns, srv->curr_idle_conns);
+
+			for (thr = 0; thr < global.nbthread && srv->curr_idle_thr; thr++)
+				chunk_appendf(&trash, " %u", srv->curr_idle_thr[thr]);
+
+			chunk_appendf(&trash, "\n");
+		}
+
+		if (applet_putchk(appctx, &trash) == -1) {
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/* Parses the backend list, or simply uses the backend name provided by the
+ * user, and dumps the servers' states. It takes its context from
+ * show_srv_ctx, dumps the proxy pointed to by ->px, and stops there if
+ * ->only_pxid is non-null.
+ */
+static int cli_io_handler_servers_state(struct appctx *appctx)
+{
+	struct show_srv_ctx *ctx = appctx->svcctx;
+	struct stconn *sc = appctx_sc(appctx);
+	struct proxy *curproxy;
+
+	if (ctx->state == SHOW_SRV_HEAD) {
+		if (ctx->show_conn == 0)
+			chunk_printf(&trash, "%d\n# %s\n", SRV_STATE_FILE_VERSION, SRV_STATE_FILE_FIELD_NAMES);
+		else
+			chunk_printf(&trash,
+			             "# bkname/svname bkid/svid addr port - purge_delay used_cur used_max need_est unsafe_nb safe_nb idle_lim idle_cur idle_per_thr[%d]\n",
+			             global.nbthread);
+
+		if (applet_putchk(appctx, &trash) == -1)
+			return 0;
+
+		ctx->state = SHOW_SRV_LIST;
+
+		if (!ctx->px)
+			ctx->px = proxies_list;
+	}
+
+	for (; ctx->px != NULL; ctx->px = curproxy->next) {
+		curproxy = ctx->px;
+		/* servers are only in backends */
+		if ((curproxy->cap & PR_CAP_BE) && !(curproxy->cap & PR_CAP_INT)) {
+			if (!dump_servers_state(sc))
+				return 0;
+		}
+		/* only the selected proxy is dumped */
+		if (ctx->only_pxid)
+			break;
+	}
+
+	return 1;
+}
+
+/* Parses the backend list and simply reports backend names. It keeps the
+ * proxy pointer in svcctx since there's nothing else to store there.
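+ *
+ * The resumable-dump pattern used here, summarized (this mirrors the code
+ * below, it is not additional logic):
+ *
+ *   if (!appctx->svcctx)                    // first call: emit the header,
+ *       appctx->svcctx = proxies_list;     //  then park the cursor
+ *   for (; appctx->svcctx; appctx->svcctx = curproxy->next) {
+ *       curproxy = appctx->svcctx;         // build one line into &trash
+ *       if (applet_putchk(appctx, &trash) == -1)
+ *           return 0;                      // output full: resume here later
+ *   }
+ *   return 1;                              // dump complete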
+ */ +static int cli_io_handler_show_backend(struct appctx *appctx) +{ + struct proxy *curproxy; + + chunk_reset(&trash); + + if (!appctx->svcctx) { + chunk_printf(&trash, "# name\n"); + if (applet_putchk(appctx, &trash) == -1) + return 0; + + appctx->svcctx = proxies_list; + } + + for (; appctx->svcctx != NULL; appctx->svcctx = curproxy->next) { + curproxy = appctx->svcctx; + + /* looking for non-internal backends only */ + if ((curproxy->cap & (PR_CAP_BE|PR_CAP_INT)) != PR_CAP_BE) + continue; + + chunk_appendf(&trash, "%s\n", curproxy->id); + if (applet_putchk(appctx, &trash) == -1) + return 0; + } + + return 1; +} + +/* Parses the "enable dynamic-cookies backend" directive, it always returns 1. + * + * Grabs the proxy lock and each server's lock. + */ +static int cli_parse_enable_dyncookie_backend(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct proxy *px; + struct server *s; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + px = cli_find_backend(appctx, args[3]); + if (!px) + return 1; + + if (px->mode != PR_MODE_TCP && px->mode != PR_MODE_HTTP) + return cli_err(appctx, "Not available.\n"); + + /* Note: this lock is to make sure this doesn't change while another + * thread is in srv_set_dyncookie(). + */ + HA_RWLOCK_WRLOCK(PROXY_LOCK, &px->lock); + px->ck_opts |= PR_CK_DYNAMIC; + HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &px->lock); + + for (s = px->srv; s != NULL; s = s->next) { + HA_SPIN_LOCK(SERVER_LOCK, &s->lock); + srv_set_dyncookie(s); + HA_SPIN_UNLOCK(SERVER_LOCK, &s->lock); + } + + return 1; +} + +/* Parses the "disable dynamic-cookies backend" directive, it always returns 1. + * + * Grabs the proxy lock and each server's lock. + */ +static int cli_parse_disable_dyncookie_backend(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct proxy *px; + struct server *s; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + px = cli_find_backend(appctx, args[3]); + if (!px) + return 1; + + if (px->mode != PR_MODE_TCP && px->mode != PR_MODE_HTTP) + return cli_err(appctx, "Not available.\n"); + + /* Note: this lock is to make sure this doesn't change while another + * thread is in srv_set_dyncookie(). + */ + HA_RWLOCK_WRLOCK(PROXY_LOCK, &px->lock); + px->ck_opts &= ~PR_CK_DYNAMIC; + HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &px->lock); + + for (s = px->srv; s != NULL; s = s->next) { + HA_SPIN_LOCK(SERVER_LOCK, &s->lock); + if (!(s->flags & SRV_F_COOKIESET)) + ha_free(&s->cookie); + HA_SPIN_UNLOCK(SERVER_LOCK, &s->lock); + } + + return 1; +} + +/* Parses the "set dynamic-cookie-key backend" directive, it always returns 1. + * + * Grabs the proxy lock and each server's lock. + */ +static int cli_parse_set_dyncookie_key_backend(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct proxy *px; + struct server *s; + char *newkey; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + px = cli_find_backend(appctx, args[3]); + if (!px) + return 1; + + if (px->mode != PR_MODE_TCP && px->mode != PR_MODE_HTTP) + return cli_err(appctx, "Not available.\n"); + + if (!*args[4]) + return cli_err(appctx, "String value expected.\n"); + + newkey = strdup(args[4]); + if (!newkey) + return cli_err(appctx, "Failed to allocate memory.\n"); + + /* Note: this lock is to make sure this doesn't change while another + * thread is in srv_set_dyncookie(). 
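+	 *
+	 * For context, these directives are driven over the CLI socket, e.g.
+	 * (socket path and backend name are examples only):
+	 *
+	 *   $ echo "set dynamic-cookie-key backend app mysecret" | \
+	 *         socat stdio /var/run/haproxy.sock
+	 *   $ echo "enable dynamic-cookie backend app" | \
+	 *         socat stdio /var/run/haproxy.sock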
+ */ + HA_RWLOCK_WRLOCK(PROXY_LOCK, &px->lock); + free(px->dyncookie_key); + px->dyncookie_key = newkey; + HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &px->lock); + + for (s = px->srv; s != NULL; s = s->next) { + HA_SPIN_LOCK(SERVER_LOCK, &s->lock); + srv_set_dyncookie(s); + HA_SPIN_UNLOCK(SERVER_LOCK, &s->lock); + } + + return 1; +} + +/* Parses the "set maxconn frontend" directive, it always returns 1. + * + * Grabs the proxy lock. + */ +static int cli_parse_set_maxconn_frontend(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct proxy *px; + struct listener *l; + int v; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + px = cli_find_frontend(appctx, args[3]); + if (!px) + return 1; + + if (!*args[4]) + return cli_err(appctx, "Integer value expected.\n"); + + v = atoi(args[4]); + if (v < 0) + return cli_err(appctx, "Value out of range.\n"); + + /* OK, the value is fine, so we assign it to the proxy and to all of + * its listeners. The blocked ones will be dequeued. + */ + HA_RWLOCK_WRLOCK(PROXY_LOCK, &px->lock); + + px->maxconn = v; + list_for_each_entry(l, &px->conf.listeners, by_fe) { + if (l->state == LI_FULL) + relax_listener(l, 1, 0); + } + + if (px->maxconn > px->feconn) + dequeue_proxy_listeners(px); + + HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &px->lock); + + return 1; +} + +/* Parses the "shutdown frontend" directive, it always returns 1. + * + * Grabs the proxy lock. + */ +static int cli_parse_shutdown_frontend(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct proxy *px; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + px = cli_find_frontend(appctx, args[2]); + if (!px) + return 1; + + if (px->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) + return cli_msg(appctx, LOG_NOTICE, "Frontend was already shut down.\n"); + + stop_proxy(px); + return 1; +} + +/* Parses the "disable frontend" directive, it always returns 1. + * + * Grabs the proxy lock. + */ +static int cli_parse_disable_frontend(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct proxy *px; + int ret; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + px = cli_find_frontend(appctx, args[2]); + if (!px) + return 1; + + if (px->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) + return cli_msg(appctx, LOG_NOTICE, "Frontend was previously shut down, cannot disable.\n"); + + if (!px->li_ready) + return cli_msg(appctx, LOG_NOTICE, "All sockets are already disabled.\n"); + + /* pause_proxy will take PROXY_LOCK */ + ret = pause_proxy(px); + + if (!ret) + return cli_err(appctx, "Failed to pause frontend, check logs for precise cause.\n"); + + return 1; +} + +/* Parses the "enable frontend" directive, it always returns 1. + * + * Grabs the proxy lock. 
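+ *
+ * Typical interactive sequence, for illustration (frontend name and socket
+ * path are examples only):
+ *
+ *   $ echo "disable frontend www" | socat stdio /var/run/haproxy.sock
+ *   $ echo "set maxconn frontend www 2000" | socat stdio /var/run/haproxy.sock
+ *   $ echo "enable frontend www" | socat stdio /var/run/haproxy.sock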
+ */ +static int cli_parse_enable_frontend(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct proxy *px; + int ret; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + px = cli_find_frontend(appctx, args[2]); + if (!px) + return 1; + + if (px->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) + return cli_err(appctx, "Frontend was previously shut down, cannot enable.\n"); + + if (px->li_ready == px->li_all) + return cli_msg(appctx, LOG_NOTICE, "All sockets are already enabled.\n"); + + /* resume_proxy will take PROXY_LOCK */ + ret = resume_proxy(px); + + if (!ret) + return cli_err(appctx, "Failed to resume frontend, check logs for precise cause (port conflict?).\n"); + return 1; +} + +/* appctx context used during "show errors" */ +struct show_errors_ctx { + struct proxy *px; /* current proxy being dumped, NULL = not started yet. */ + unsigned int flag; /* bit0: buffer being dumped, 0 = req, 1 = resp ; bit1=skip req ; bit2=skip resp. */ + unsigned int ev_id; /* event ID of error being dumped */ + int iid; /* if >= 0, ID of the proxy to filter on */ + int ptr; /* <0: headers, >=0 : text pointer to restart from */ + int bol; /* pointer to beginning of current line */ +}; + +/* "show errors" handler for the CLI. Returns 0 if wants to continue, 1 to stop + * now. + */ +static int cli_parse_show_errors(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_errors_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return 1; + + if (*args[2]) { + struct proxy *px; + + px = proxy_find_by_name(args[2], 0, 0); + if (px) + ctx->iid = px->uuid; + else + ctx->iid = atoi(args[2]); + + if (!ctx->iid) + return cli_err(appctx, "No such proxy.\n"); + } + else + ctx->iid = -1; // dump all proxies + + ctx->flag = 0; + if (strcmp(args[3], "request") == 0) + ctx->flag |= 4; // ignore response + else if (strcmp(args[3], "response") == 0) + ctx->flag |= 2; // ignore request + ctx->px = NULL; + return 0; +} + +/* This function dumps all captured errors onto the stream connector's + * read buffer. It returns 0 if the output buffer is full and it needs + * to be called again, otherwise non-zero. + */ +static int cli_io_handler_show_errors(struct appctx *appctx) +{ + struct show_errors_ctx *ctx = appctx->svcctx; + struct stconn *sc = appctx_sc(appctx); + extern const char *monthname[12]; + + /* FIXME: Don't watch the other side !*/ + if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE)) + return 1; + + chunk_reset(&trash); + + if (!ctx->px) { + /* the function had not been called yet, let's prepare the + * buffer for a response. + */ + struct tm tm; + + get_localtime(date.tv_sec, &tm); + chunk_appendf(&trash, "Total events captured on [%02d/%s/%04d:%02d:%02d:%02d.%03d] : %u\n", + tm.tm_mday, monthname[tm.tm_mon], tm.tm_year+1900, + tm.tm_hour, tm.tm_min, tm.tm_sec, (int)(date.tv_usec/1000), + error_snapshot_id); + + if (applet_putchk(appctx, &trash) == -1) + goto cant_send; + + ctx->px = proxies_list; + ctx->bol = 0; + ctx->ptr = -1; + } + + /* we have two inner loops here, one for the proxy, the other one for + * the buffer. 
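+	 *
+	 * Iteration order, summarized: for each proxy the request snapshot is
+	 * dumped first (flag bit0 == 0), then bit0 is toggled to dump the
+	 * response snapshot, and only then does ctx->px advance. A negative
+	 * ctx->ptr means the snapshot headers still have to be printed,
+	 * otherwise it is the text offset to resume the dump from once the
+	 * output buffer has room again.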
+ */ + while (ctx->px) { + struct error_snapshot *es; + + HA_RWLOCK_RDLOCK(PROXY_LOCK, &ctx->px->lock); + + if ((ctx->flag & 1) == 0) { + es = ctx->px->invalid_req; + if (ctx->flag & 2) // skip req + goto next; + } + else { + es = ctx->px->invalid_rep; + if (ctx->flag & 4) // skip resp + goto next; + } + + if (!es) + goto next; + + if (ctx->iid >= 0 && + ctx->px->uuid != ctx->iid && + (!es->oe || es->oe->uuid != ctx->iid)) + goto next; + + if (ctx->ptr < 0) { + /* just print headers now */ + + char pn[INET6_ADDRSTRLEN]; + struct tm tm; + int port; + + get_localtime(es->when.tv_sec, &tm); + chunk_appendf(&trash, " \n[%02d/%s/%04d:%02d:%02d:%02d.%03d]", + tm.tm_mday, monthname[tm.tm_mon], tm.tm_year+1900, + tm.tm_hour, tm.tm_min, tm.tm_sec, (int)(es->when.tv_usec/1000)); + + switch (addr_to_str(&es->src, pn, sizeof(pn))) { + case AF_INET: + case AF_INET6: + port = get_host_port(&es->src); + break; + default: + port = 0; + } + + switch (ctx->flag & 1) { + case 0: + chunk_appendf(&trash, + " frontend %s (#%d): invalid request\n" + " backend %s (#%d)", + ctx->px->id, ctx->px->uuid, + (es->oe && es->oe->cap & PR_CAP_BE) ? es->oe->id : "<NONE>", + (es->oe && es->oe->cap & PR_CAP_BE) ? es->oe->uuid : -1); + break; + case 1: + chunk_appendf(&trash, + " backend %s (#%d): invalid response\n" + " frontend %s (#%d)", + ctx->px->id, ctx->px->uuid, + es->oe ? es->oe->id : "<NONE>" , es->oe ? es->oe->uuid : -1); + break; + } + + chunk_appendf(&trash, + ", server %s (#%d), event #%u, src %s:%d\n" + " buffer starts at %llu (including %u out), %u free,\n" + " len %u, wraps at %u, error at position %u\n", + es->srv ? es->srv->id : "<NONE>", + es->srv ? es->srv->puid : -1, + es->ev_id, pn, port, + es->buf_ofs, es->buf_out, + global.tune.bufsize - es->buf_out - es->buf_len, + es->buf_len, es->buf_wrap, es->buf_err); + + if (es->show) + es->show(&trash, es); + + chunk_appendf(&trash, " \n"); + + if (applet_putchk(appctx, &trash) == -1) + goto cant_send_unlock; + + ctx->ptr = 0; + ctx->ev_id = es->ev_id; + } + + if (ctx->ev_id != es->ev_id) { + /* the snapshot changed while we were dumping it */ + chunk_appendf(&trash, + " WARNING! update detected on this snapshot, dump interrupted. 
Please re-check!\n"); + if (applet_putchk(appctx, &trash) == -1) + goto cant_send_unlock; + + goto next; + } + + /* OK, ptr >= 0, so we have to dump the current line */ + while (ctx->ptr < es->buf_len && ctx->ptr < global.tune.bufsize) { + int newptr; + int newline; + + newline = ctx->bol; + newptr = dump_text_line(&trash, es->buf, global.tune.bufsize, es->buf_len, &newline, ctx->ptr); + if (newptr == ctx->ptr) { + sc_need_room(sc, 0); + goto cant_send_unlock; + } + + if (applet_putchk(appctx, &trash) == -1) + goto cant_send_unlock; + + ctx->ptr = newptr; + ctx->bol = newline; + }; + next: + HA_RWLOCK_RDUNLOCK(PROXY_LOCK, &ctx->px->lock); + ctx->bol = 0; + ctx->ptr = -1; + ctx->flag ^= 1; + if (!(ctx->flag & 1)) + ctx->px = ctx->px->next; + } + + /* dump complete */ + return 1; + + cant_send_unlock: + HA_RWLOCK_RDUNLOCK(PROXY_LOCK, &ctx->px->lock); + cant_send: + return 0; +} + +/* register cli keywords */ +static struct cli_kw_list cli_kws = {{ },{ + { { "disable", "frontend", NULL }, "disable frontend <frontend> : temporarily disable specific frontend", cli_parse_disable_frontend, NULL, NULL }, + { { "enable", "frontend", NULL }, "enable frontend <frontend> : re-enable specific frontend", cli_parse_enable_frontend, NULL, NULL }, + { { "set", "maxconn", "frontend", NULL }, "set maxconn frontend <frontend> <value> : change a frontend's maxconn setting", cli_parse_set_maxconn_frontend, NULL }, + { { "show","servers", "conn", NULL }, "show servers conn [<backend>] : dump server connections status (all or for a single backend)", cli_parse_show_servers, cli_io_handler_servers_state }, + { { "show","servers", "state", NULL }, "show servers state [<backend>] : dump volatile server information (all or for a single backend)", cli_parse_show_servers, cli_io_handler_servers_state }, + { { "show", "backend", NULL }, "show backend : list backends in the current running config", NULL, cli_io_handler_show_backend }, + { { "shutdown", "frontend", NULL }, "shutdown frontend <frontend> : stop a specific frontend", cli_parse_shutdown_frontend, NULL, NULL }, + { { "set", "dynamic-cookie-key", "backend", NULL }, "set dynamic-cookie-key backend <bk> <k> : change a backend secret key for dynamic cookies", cli_parse_set_dyncookie_key_backend, NULL }, + { { "enable", "dynamic-cookie", "backend", NULL }, "enable dynamic-cookie backend <bk> : enable dynamic cookies on a specific backend", cli_parse_enable_dyncookie_backend, NULL }, + { { "disable", "dynamic-cookie", "backend", NULL }, "disable dynamic-cookie backend <bk> : disable dynamic cookies on a specific backend", cli_parse_disable_dyncookie_backend, NULL }, + { { "show", "errors", NULL }, "show errors [<px>] [request|response] : report last request and/or response errors for each proxy", cli_parse_show_errors, cli_io_handler_show_errors, NULL }, + {{},} +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/qmux_http.c b/src/qmux_http.c new file mode 100644 index 0000000..edf26b1 --- /dev/null +++ b/src/qmux_http.c @@ -0,0 +1,108 @@ +#include <haproxy/qmux_http.h> + +#include <haproxy/api-t.h> +#include <haproxy/htx.h> +#include <haproxy/qmux_trace.h> + +/* QUIC MUX rcv_buf operation using HTX data. Received data from stream <qcs> + * will be transferred as HTX in <buf>. Output buffer is expected to be of + * length <count>. <fin> will be set to signal the last data to receive on this + * stream. + * + * Return the size in bytes of transferred data. 
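+ *
+ * Note on the htx_from_buf()/htx_to_buf() pairing visible below: the former
+ * maps a struct buffer onto its HTX representation, the latter commits the
+ * HTX state back so that b_data() on the buffer stays consistent. Every
+ * htx_from_buf() in this function is therefore matched by an htx_to_buf()
+ * before the buffer is transferred or handed back.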
+ */ +size_t qcs_http_rcv_buf(struct qcs *qcs, struct buffer *buf, size_t count, + char *fin) +{ + struct htx *qcs_htx = NULL; + struct htx *cs_htx = NULL; + size_t ret = 0; + + TRACE_ENTER(QMUX_EV_STRM_RECV, qcs->qcc->conn, qcs); + + *fin = 0; + qcs_htx = htx_from_buf(&qcs->rx.app_buf); + if (htx_is_empty(qcs_htx)) { + /* Set buffer data to 0 as HTX is empty. */ + htx_to_buf(qcs_htx, &qcs->rx.app_buf); + goto end; + } + + ret = qcs_htx->data; + + cs_htx = htx_from_buf(buf); + if (htx_is_empty(cs_htx) && htx_used_space(qcs_htx) <= count) { + /* EOM will be copied to cs_htx via b_xfer(). */ + if (qcs_htx->flags & HTX_FL_EOM) + *fin = 1; + + htx_to_buf(cs_htx, buf); + htx_to_buf(qcs_htx, &qcs->rx.app_buf); + b_xfer(buf, &qcs->rx.app_buf, b_data(&qcs->rx.app_buf)); + goto end; + } + + htx_xfer_blks(cs_htx, qcs_htx, count, HTX_BLK_UNUSED); + BUG_ON(qcs_htx->flags & HTX_FL_PARSING_ERROR); + + /* Copy EOM from src to dst buffer if all data copied. */ + if (htx_is_empty(qcs_htx) && (qcs_htx->flags & HTX_FL_EOM)) { + cs_htx->flags |= HTX_FL_EOM; + *fin = 1; + } + + cs_htx->extra = qcs_htx->extra ? (qcs_htx->data + qcs_htx->extra) : 0; + htx_to_buf(cs_htx, buf); + htx_to_buf(qcs_htx, &qcs->rx.app_buf); + ret -= qcs_htx->data; + + end: + TRACE_LEAVE(QMUX_EV_STRM_RECV, qcs->qcc->conn, qcs); + + return ret; +} + +/* QUIC MUX snd_buf operation using HTX data. HTX data will be transferred from + * <buf> to <qcs> stream buffer. Input buffer is expected to be of length + * <count>. <fin> will be set to signal the last data to send for this stream. + * + * Return the size in bytes of transferred data. + */ +size_t qcs_http_snd_buf(struct qcs *qcs, struct buffer *buf, size_t count, + char *fin) +{ + struct htx *htx; + size_t ret; + int eom = 0; + + TRACE_ENTER(QMUX_EV_STRM_SEND, qcs->qcc->conn, qcs); + + htx = htxbuf(buf); + eom = (htx->flags & HTX_FL_EOM); + ret = qcs->qcc->app_ops->snd_buf(qcs, buf, count); + *fin = (eom && !b_data(buf)); + + TRACE_LEAVE(QMUX_EV_STRM_SEND, qcs->qcc->conn, qcs); + + return ret; +} + +/* QUIC MUX snd_buf reset. HTX data stored in <buf> of length <count> will be + * cleared. This can be used when data should not be transmitted any longer. + * + * Return the size in bytes of cleared data. 
+ */ +size_t qcs_http_reset_buf(struct qcs *qcs, struct buffer *buf, size_t count) +{ + struct htx *htx; + + TRACE_ENTER(QMUX_EV_STRM_SEND, qcs->qcc->conn, qcs); + + htx = htx_from_buf(buf); + htx_reset(htx); + htx_to_buf(htx, buf); + + TRACE_LEAVE(QMUX_EV_STRM_SEND, qcs->qcc->conn, qcs); + + return count; +} diff --git a/src/qmux_trace.c b/src/qmux_trace.c new file mode 100644 index 0000000..b213ed4 --- /dev/null +++ b/src/qmux_trace.c @@ -0,0 +1,114 @@ +#include <haproxy/qmux_trace.h> + +#include <import/ist.h> +#include <haproxy/api.h> +#include <haproxy/connection.h> +#include <haproxy/chunk.h> +#include <haproxy/mux_quic.h> +#include <haproxy/quic_frame-t.h> + +/* trace source and events */ +static void qmux_trace(enum trace_level level, uint64_t mask, + const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4); + +static const struct name_desc qmux_trace_lockon_args[4] = { + /* arg1 */ { /* already used by the connection */ }, + /* arg2 */ { .name="qcs", .desc="QUIC stream" }, + /* arg3 */ { }, + /* arg4 */ { } +}; + +static const struct name_desc qmux_trace_decoding[] = { +#define QMUX_VERB_CLEAN 1 + { .name="clean", .desc="only user-friendly stuff, generally suitable for level \"user\"" }, +#define QMUX_VERB_MINIMAL 2 + { .name="minimal", .desc="report only qcc/qcs state and flags, no real decoding" }, + { /* end */ } +}; + +struct trace_source trace_qmux = { + .name = IST("qmux"), + .desc = "QUIC multiplexer", + .arg_def = TRC_ARG1_CONN, /* TRACE()'s first argument is always a connection */ + .default_cb = qmux_trace, + .known_events = qmux_trace_events, + .lockon_args = qmux_trace_lockon_args, + .decoding = qmux_trace_decoding, + .report_events = ~0, /* report everything by default */ +}; + + +static void qmux_trace_frm(const struct quic_frame *frm) +{ + switch (frm->type) { + case QUIC_FT_MAX_STREAMS_BIDI: + chunk_appendf(&trace_buf, " max_streams=%llu", + (ullong)frm->max_streams_bidi.max_streams); + break; + + case QUIC_FT_MAX_STREAMS_UNI: + chunk_appendf(&trace_buf, " max_streams=%llu", + (ullong)frm->max_streams_uni.max_streams); + break; + + default: + break; + } +} + +/* quic-mux trace handler */ +static void qmux_trace(enum trace_level level, uint64_t mask, + const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4) +{ + const struct connection *conn = a1; + const struct qcc *qcc = conn ? 
conn->ctx : NULL; + const struct qcs *qcs = a2; + + if (!qcc) + return; + + if (src->verbosity > QMUX_VERB_CLEAN) { + chunk_appendf(&trace_buf, " : qcc=%p(F)", qcc); + if (qcc->conn->handle.qc) + chunk_appendf(&trace_buf, " qc=%p", qcc->conn->handle.qc); + + chunk_appendf(&trace_buf, " md=%llu/%llu/%llu", + (ullong)qcc->rfctl.md, (ullong)qcc->tx.offsets, (ullong)qcc->tx.sent_offsets); + + if (qcs) { + chunk_appendf(&trace_buf, " qcs=%p .id=%llu .st=%s", + qcs, (ullong)qcs->id, + qcs_st_to_str(qcs->st)); + chunk_appendf(&trace_buf, " msd=%llu/%llu/%llu", + (ullong)qcs->tx.msd, (ullong)qcs->tx.offset, (ullong)qcs->tx.sent_offset); + } + + if (mask & QMUX_EV_QCC_NQCS) { + const uint64_t *id = a3; + chunk_appendf(&trace_buf, " id=%llu", (ullong)*id); + } + + if (mask & QMUX_EV_SEND_FRM) + qmux_trace_frm(a3); + + if (mask & QMUX_EV_QCS_XFER_DATA) { + const struct qcs_xfer_data_trace_arg *arg = a3; + chunk_appendf(&trace_buf, " prep=%lu xfer=%d", + (ulong)arg->prep, arg->xfer); + } + + if (mask & QMUX_EV_QCS_BUILD_STRM) { + const struct qcs_build_stream_trace_arg *arg = a3; + chunk_appendf(&trace_buf, " len=%lu fin=%d offset=%llu", + (ulong)arg->len, arg->fin, (ullong)arg->offset); + } + } +} + + +/* register qmux traces */ +INITCALL1(STG_REGISTER, trace_register_source, TRACE_SOURCE); diff --git a/src/qpack-dec.c b/src/qpack-dec.c new file mode 100644 index 0000000..97392bb --- /dev/null +++ b/src/qpack-dec.c @@ -0,0 +1,563 @@ +/* + * QPACK decompressor + * + * Copyright 2021 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <inttypes.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <import/ist.h> +#include <haproxy/buf.h> +#include <haproxy/chunk.h> +#include <haproxy/h3.h> +#include <haproxy/mux_quic.h> +#include <haproxy/qpack-t.h> +#include <haproxy/qpack-dec.h> +#include <haproxy/qpack-tbl.h> +#include <haproxy/hpack-huff.h> +#include <haproxy/hpack-tbl.h> +#include <haproxy/http-hdr.h> +#include <haproxy/tools.h> + +#if defined(DEBUG_QPACK) +#define qpack_debug_printf fprintf +#define qpack_debug_hexdump debug_hexdump +#else +#define qpack_debug_printf(...) do { } while (0) +#define qpack_debug_hexdump(...) do { } while (0) +#endif + +/* Encoded field line bitmask */ +#define QPACK_EFL_BITMASK 0xf0 +#define QPACK_LFL_WPBNM 0x00 // Literal field line with post-base name reference +#define QPACK_IFL_WPBI 0x10 // Indexed field line with post-based index +#define QPACK_LFL_WLN_BIT 0x20 // Literal field line with literal name +#define QPACK_LFL_WNR_BIT 0x40 // Literal field line with name reference +#define QPACK_IFL_BIT 0x80 // Indexed field line + +/* reads a varint from <raw>'s lowest <b> bits and <len> bytes max (raw included). + * returns the 64-bit value on success after updating buf and len_in. 
Forces
+ * len_in to (uint64_t)-1 on truncated input.
+ * Note that this function is similar to the one used for HPACK (except that
+ * it supports up to 62-bit integers).
+ */
+static uint64_t qpack_get_varint(const unsigned char **buf, uint64_t *len_in, int b)
+{
+	uint64_t ret = 0;
+	int len = *len_in;
+	const uint8_t *raw = *buf;
+	uint8_t shift = 0;
+
+	len--;
+	ret = *raw++ & ((1ULL << b) - 1);
+	if (ret != (uint64_t)((1ULL << b) - 1))
+		goto end;
+
+	while (len && (*raw & 128)) {
+		ret += ((uint64_t)*raw++ & 127) << shift;
+		shift += 7;
+		len--;
+	}
+
+	/* last 7 bits */
+	if (!len)
+		goto too_short;
+
+	len--;
+	ret += ((uint64_t)*raw++ & 127) << shift;
+
+ end:
+	*buf = raw;
+	*len_in = len;
+	return ret;
+
+ too_short:
+	*len_in = (uint64_t)-1;
+	return 0;
+}
+
+/* Decode an encoder stream.
+ *
+ * Returns 0 on success else non-zero.
+ */
+int qpack_decode_enc(struct buffer *buf, int fin, void *ctx)
+{
+	struct qcs *qcs = ctx;
+	size_t len;
+	unsigned char inst;
+
+	/* RFC 9204 4.2. Encoder and Decoder Streams
+	 *
+	 * The sender MUST NOT close either of these streams, and the receiver
+	 * MUST NOT request that the sender close either of these streams.
+	 * Closure of either unidirectional stream type MUST be treated as a
+	 * connection error of type H3_CLOSED_CRITICAL_STREAM.
+	 */
+	if (fin) {
+		qcc_set_error(qcs->qcc, H3_CLOSED_CRITICAL_STREAM, 1);
+		return -1;
+	}
+
+	len = b_data(buf);
+	qpack_debug_hexdump(stderr, "[QPACK-DEC-ENC] ", b_head(buf), 0, len);
+
+	if (!len) {
+		qpack_debug_printf(stderr, "[QPACK-DEC-ENC] empty stream\n");
+		return 0;
+	}
+
+	inst = (unsigned char)*b_head(buf) & QPACK_ENC_INST_BITMASK;
+	if (inst == QPACK_ENC_INST_DUP) {
+		/* Duplicate */
+	}
+	else if (inst & QPACK_ENC_INST_IWNR_BIT) {
+		/* Insert With Name Reference */
+	}
+	else if (inst & QPACK_ENC_INST_IWLN_BIT) {
+		/* Insert with literal name */
+	}
+	else if (inst & QPACK_ENC_INST_SDTC_BIT) {
+		/* Set dynamic table capacity */
+	}
+
+	return 0;
+}
+
+/* Decode a decoder stream.
+ *
+ * Returns 0 on success else non-zero.
+ */
+int qpack_decode_dec(struct buffer *buf, int fin, void *ctx)
+{
+	struct qcs *qcs = ctx;
+	size_t len;
+	unsigned char inst;
+
+	/* RFC 9204 4.2. Encoder and Decoder Streams
+	 *
+	 * The sender MUST NOT close either of these streams, and the receiver
+	 * MUST NOT request that the sender close either of these streams.
+	 * Closure of either unidirectional stream type MUST be treated as a
+	 * connection error of type H3_CLOSED_CRITICAL_STREAM.
+	 */
+	if (fin) {
+		qcc_set_error(qcs->qcc, H3_CLOSED_CRITICAL_STREAM, 1);
+		return -1;
+	}
+
+	len = b_data(buf);
+	qpack_debug_hexdump(stderr, "[QPACK-DEC-DEC] ", b_head(buf), 0, len);
+
+	if (!len) {
+		qpack_debug_printf(stderr, "[QPACK-DEC-DEC] empty stream\n");
+		return 0;
+	}
+
+	inst = (unsigned char)*b_head(buf) & QPACK_DEC_INST_BITMASK;
+	if (inst == QPACK_DEC_INST_ICINC) {
+		/* Insert count increment */
+	}
+	else if (inst & QPACK_DEC_INST_SACK) {
+		/* Section Acknowledgment */
+	}
+	else if (inst & QPACK_DEC_INST_SCCL) {
+		/* Stream cancellation */
+	}
+
+	return 0;
+}
+
+/* Decode a field section prefix made of the two varints <enc_ric> and <db>.
+ * Also sets the 'S' sign bit for <db>.
+ * Returns a negative error code on failure, 0 otherwise.
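+ *
+ * Worked example of the underlying prefix-integer coding (the same scheme
+ * as HPACK, RFC 7541 section 5.1): with a 5-bit prefix, the value 1337 is
+ * carried as the bytes 0x1f 0x9a 0x0a. qpack_get_varint() reads the
+ * saturated prefix 31, then the 7-bit groups 26 and 10, and reassembles
+ * 31 + 26 + (10 << 7) = 1337.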
+ */ +static int qpack_decode_fs_pfx(uint64_t *enc_ric, uint64_t *db, int *sign_bit, + const unsigned char **raw, uint64_t *len) +{ + *enc_ric = qpack_get_varint(raw, len, 8); + if (*len == (uint64_t)-1) + return -QPACK_ERR_RIC; + + *sign_bit = **raw & 0x8; + *db = qpack_get_varint(raw, len, 7); + if (*len == (uint64_t)-1) + return -QPACK_ERR_DB; + + return 0; +} + +/* Decode a field section from the <raw> buffer of <len> bytes. Each parsed + * header is inserted into <list> of <list_size> entries max and uses <tmp> as + * a storage for some elements pointing into it. An end marker is inserted at + * the end of the list with empty strings as name/value. + * + * Returns the number of headers inserted into list excluding the end marker. + * In case of error, a negative code QPACK_ERR_* is returned. + */ +int qpack_decode_fs(const unsigned char *raw, uint64_t len, struct buffer *tmp, + struct http_hdr *list, int list_size) +{ + struct ist name, value; + uint64_t enc_ric, db; + int s; + unsigned int efl_type; + int ret; + int hdr_idx = 0; + + qpack_debug_hexdump(stderr, "[QPACK-DEC-FS] ", (const char *)raw, 0, len); + + /* parse field section prefix */ + ret = qpack_decode_fs_pfx(&enc_ric, &db, &s, &raw, &len); + if (ret < 0) { + qpack_debug_printf(stderr, "##ERR@%d(%d)\n", __LINE__, ret); + goto out; + } + + chunk_reset(tmp); + qpack_debug_printf(stderr, "enc_ric: %llu db: %llu s=%d\n", + (unsigned long long)enc_ric, (unsigned long long)db, !!s); + /* Decode field lines */ + while (len) { + if (hdr_idx >= list_size) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_ERR_TOO_LARGE; + goto out; + } + + /* parse field line representation */ + efl_type = *raw & QPACK_EFL_BITMASK; + qpack_debug_printf(stderr, "efl_type=0x%02x\n", efl_type); + + if (efl_type == QPACK_LFL_WPBNM) { + /* Literal field line with post-base name reference + * TODO adjust this when dynamic table support is implemented. + */ +#if 0 + uint64_t index __maybe_unused, length; + unsigned int n __maybe_unused, h __maybe_unused; + + qpack_debug_printf(stderr, "literal field line with post-base name reference:"); + n = *raw & 0x08; + index = qpack_get_varint(&raw, &len, 3); + if (len == (uint64_t)-1) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_ERR_TRUNCATED; + goto out; + } + + qpack_debug_printf(stderr, " n=%d index=%llu", !!n, (unsigned long long)index); + h = *raw & 0x80; + length = qpack_get_varint(&raw, &len, 7); + if (len == (uint64_t)-1) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_ERR_TRUNCATED; + goto out; + } + + qpack_debug_printf(stderr, " h=%d length=%llu", !!h, (unsigned long long)length); + + if (len < length) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_ERR_TRUNCATED; + goto out; + } + + raw += length; + len -= length; +#endif + + /* RFC9204 2.2.3 Invalid References + * + * If the decoder encounters a reference in a field line representation + * to a dynamic table entry that has already been evicted or that has an + * absolute index greater than or equal to the declared Required Insert + * Count (Section 4.5.1), it MUST treat this as a connection error of + * type QPACK_DECOMPRESSION_FAILED. + */ + return -QPACK_DECOMPRESSION_FAILED; + } + else if (efl_type == QPACK_IFL_WPBI) { + /* Indexed field line with post-base index + * TODO adjust this when dynamic table support is implemented. 
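+			 *
+			 * For comparison, the plain indexed case handled further
+			 * below fits in one byte: 0xd1 = 0x80 | 0x40 | 17 selects
+			 * static table entry 17, i.e. ":method: GET".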
+ */ +#if 0 + uint64_t index __maybe_unused; + + qpack_debug_printf(stderr, "indexed field line with post-base index:"); + index = qpack_get_varint(&raw, &len, 4); + if (len == (uint64_t)-1) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_ERR_TRUNCATED; + goto out; + } + + qpack_debug_printf(stderr, " index=%llu", (unsigned long long)index); +#endif + + /* RFC9204 2.2.3 Invalid References + * + * If the decoder encounters a reference in a field line representation + * to a dynamic table entry that has already been evicted or that has an + * absolute index greater than or equal to the declared Required Insert + * Count (Section 4.5.1), it MUST treat this as a connection error of + * type QPACK_DECOMPRESSION_FAILED. + */ + return -QPACK_DECOMPRESSION_FAILED; + } + else if (efl_type & QPACK_IFL_BIT) { + /* Indexed field line */ + uint64_t index; + unsigned int static_tbl; + + qpack_debug_printf(stderr, "indexed field line:"); + static_tbl = efl_type & 0x40; + index = qpack_get_varint(&raw, &len, 6); + if (len == (uint64_t)-1) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_ERR_TRUNCATED; + goto out; + } + + if (static_tbl && index < QPACK_SHT_SIZE) { + name = qpack_sht[index].n; + value = qpack_sht[index].v; + } + else { + /* RFC9204 2.2.3 Invalid References + * + * If the decoder encounters a reference in a field line representation + * to a dynamic table entry that has already been evicted or that has an + * absolute index greater than or equal to the declared Required Insert + * Count (Section 4.5.1), it MUST treat this as a connection error of + * type QPACK_DECOMPRESSION_FAILED. + * + * TODO adjust this when dynamic table support is implemented. + */ + return -QPACK_DECOMPRESSION_FAILED; + } + + qpack_debug_printf(stderr, " t=%d index=%llu", !!static_tbl, (unsigned long long)index); + } + else if (efl_type & QPACK_LFL_WNR_BIT) { + /* Literal field line with name reference */ + uint64_t index, length; + unsigned int static_tbl, n __maybe_unused, h; + + qpack_debug_printf(stderr, "Literal field line with name reference:"); + n = efl_type & 0x20; + static_tbl = efl_type & 0x10; + index = qpack_get_varint(&raw, &len, 4); + if (len == (uint64_t)-1) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_ERR_TRUNCATED; + goto out; + } + + if (static_tbl && index < QPACK_SHT_SIZE) { + name = qpack_sht[index].n; + } + else { + /* RFC9204 2.2.3 Invalid References + * + * If the decoder encounters a reference in a field line representation + * to a dynamic table entry that has already been evicted or that has an + * absolute index greater than or equal to the declared Required Insert + * Count (Section 4.5.1), it MUST treat this as a connection error of + * type QPACK_DECOMPRESSION_FAILED. + * + * TODO adjust this when dynamic table support is implemented. 
+ */ + return -QPACK_DECOMPRESSION_FAILED; + } + + qpack_debug_printf(stderr, " n=%d t=%d index=%llu", !!n, !!static_tbl, (unsigned long long)index); + h = *raw & 0x80; + length = qpack_get_varint(&raw, &len, 7); + if (len == (uint64_t)-1) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_ERR_TRUNCATED; + goto out; + } + + qpack_debug_printf(stderr, " h=%d length=%llu", !!h, (unsigned long long)length); + if (h) { + char *trash; + int nlen; + + trash = chunk_newstr(tmp); + if (!trash) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_DECOMPRESSION_FAILED; + goto out; + } + nlen = huff_dec(raw, length, trash, tmp->size - tmp->data); + if (nlen == (uint32_t)-1) { + qpack_debug_printf(stderr, " can't decode huffman.\n"); + ret = -QPACK_ERR_HUFFMAN; + goto out; + } + + qpack_debug_printf(stderr, " [name huff %d->%d '%s']", (int)length, (int)nlen, trash); + /* makes an ist from tmp storage */ + b_add(tmp, nlen); + value = ist2(trash, nlen); + } + else { + value = ist2(raw, length); + } + + if (len < length) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_ERR_TRUNCATED; + goto out; + } + + raw += length; + len -= length; + } + else if (efl_type & QPACK_LFL_WLN_BIT) { + /* Literal field line with literal name */ + unsigned int n __maybe_unused, hname, hvalue; + uint64_t name_len, value_len; + + qpack_debug_printf(stderr, "Literal field line with literal name:"); + n = *raw & 0x10; + hname = *raw & 0x08; + name_len = qpack_get_varint(&raw, &len, 3); + if (len == (uint64_t)-1) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_ERR_TRUNCATED; + goto out; + } + + qpack_debug_printf(stderr, " n=%d hname=%d name_len=%llu", !!n, !!hname, (unsigned long long)name_len); + /* Name string */ + + if (len < name_len) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_ERR_TRUNCATED; + goto out; + } + + if (hname) { + char *trash; + int nlen; + + trash = chunk_newstr(tmp); + if (!trash) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_DECOMPRESSION_FAILED; + goto out; + } + nlen = huff_dec(raw, name_len, trash, tmp->size - tmp->data); + if (nlen == (uint32_t)-1) { + qpack_debug_printf(stderr, " can't decode huffman.\n"); + ret = -QPACK_ERR_HUFFMAN; + goto out; + } + + qpack_debug_printf(stderr, " [name huff %d->%d '%s']", (int)name_len, (int)nlen, trash); + /* makes an ist from tmp storage */ + b_add(tmp, nlen); + name = ist2(trash, nlen); + } + else { + name = ist2(raw, name_len); + } + + raw += name_len; + len -= name_len; + + hvalue = *raw & 0x80; + value_len = qpack_get_varint(&raw, &len, 7); + if (len == (uint64_t)-1) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_ERR_TRUNCATED; + goto out; + } + + qpack_debug_printf(stderr, " hvalue=%d value_len=%llu", !!hvalue, (unsigned long long)value_len); + + if (len < value_len) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_ERR_TRUNCATED; + goto out; + } + + if (hvalue) { + char *trash; + int nlen; + + trash = chunk_newstr(tmp); + if (!trash) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_DECOMPRESSION_FAILED; + goto out; + } + nlen = huff_dec(raw, value_len, trash, tmp->size - tmp->data); + if (nlen == (uint32_t)-1) { + qpack_debug_printf(stderr, " can't decode huffman.\n"); + ret = -QPACK_ERR_HUFFMAN; + goto out; + } + + qpack_debug_printf(stderr, " [name huff %d->%d '%s']", (int)value_len, (int)nlen, trash); + /* makes an ist from tmp storage */ + b_add(tmp, nlen); + 
value = ist2(trash, nlen); + } + else { + value = ist2(raw, value_len); + } + + raw += value_len; + len -= value_len; + } + + /* We must not accept empty header names (forbidden by the spec and used + * as a list termination). + */ + if (!name.len) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_DECOMPRESSION_FAILED; + goto out; + } + + list[hdr_idx].n = name; + list[hdr_idx].v = value; + ++hdr_idx; + + qpack_debug_printf(stderr, "\n"); + } + + if (hdr_idx >= list_size) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_ERR_TOO_LARGE; + goto out; + } + + /* put an end marker */ + list[hdr_idx].n = list[hdr_idx].v = IST_NULL; + ret = hdr_idx; + + out: + qpack_debug_printf(stderr, "-- done: ret=%d\n", ret); + return ret; +} diff --git a/src/qpack-enc.c b/src/qpack-enc.c new file mode 100644 index 0000000..006f1f1 --- /dev/null +++ b/src/qpack-enc.c @@ -0,0 +1,185 @@ +#include <haproxy/qpack-enc.h> + +#include <haproxy/buf.h> +#include <haproxy/intops.h> + +/* Returns the byte size required to encode <i> as a <prefix_size>-prefix + * integer. + */ +static size_t qpack_get_prefix_int_size(int i, int prefix_size) +{ + int n = (1 << prefix_size) - 1; + if (i < n) { + return 1; + } + else { + size_t result = 0; + while (i) { + ++result; + i >>= 7; + } + return 1 + result; + } +} + +/* Encode the integer <i> in the buffer <out> in a <prefix_size>-bit prefix + * integer. The caller must ensure there is enough size in the buffer. The + * prefix is OR-ed with <before_prefix> byte. + * + * Returns 0 if success else non-zero. + */ +static int qpack_encode_prefix_integer(struct buffer *out, int i, + int prefix_size, + unsigned char before_prefix) +{ + const int mod = (1 << prefix_size) - 1; + BUG_ON_HOT(!prefix_size); + + if (i < mod) { + if (b_room(out) < 1) + return 1; + + b_putchr(out, before_prefix | i); + } + else { + int to_encode = i - mod; + const size_t sz = to_encode / mod; + + if (b_room(out) < sz) + return 1; + + b_putchr(out, before_prefix | mod); + while (1) { + if (to_encode > 0x7f) { + b_putchr(out, 0x80 | (to_encode & 0x7f)); + to_encode >>= 7; + } + else { + b_putchr(out, to_encode & 0x7f); + break; + } + } + } + + return 0; +} + +/* Returns 0 on success else non-zero. */ +int qpack_encode_int_status(struct buffer *out, unsigned int status) +{ + int status_size, idx = 0; + + if (status < 100 || status > 999) + return 1; + + switch (status) { + case 103: idx = 24; break; + case 200: idx = 25; break; + case 304: idx = 26; break; + case 404: idx = 27; break; + case 503: idx = 28; break; + case 100: idx = 63; break; + case 204: idx = 64; break; + case 206: idx = 65; break; + case 302: idx = 66; break; + case 400: idx = 67; break; + case 403: idx = 68; break; + case 421: idx = 69; break; + case 425: idx = 70; break; + case 500: idx = 71; break; + + /* status code not in QPACK static table, idx is null. 
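+	 *
+	 * Illustration: 200 hits static entry 25, so the whole field line
+	 * collapses to the single byte 0xc0 | 25 = 0xd9, while a status such
+	 * as 222 takes the literal path below and emits
+	 * 0x5f 0x09 0x03 '2' '2' '2': name reference 24 (":status") on a
+	 * 4-bit prefix, then the length, then the 3-byte literal value.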
*/ + default: break; + } + + if (idx) { + /* status code present in QPACK static table + * -> indexed field line + */ + status_size = qpack_get_prefix_int_size(idx, 6); + if (b_room(out) < status_size) + return 1; + + qpack_encode_prefix_integer(out, idx, 6, 0xc0); + } + else { + /* status code not present in QPACK static table + * -> literal field line with name reference + */ + char a, b, c; + a = '0' + status / 100; + status -= (status / 100 * 100); + b = '0' + status / 10; + status -= (status / 10 * 10); + c = '0' + status; + + /* field name */ + if (qpack_encode_prefix_integer(out, 24, 4, 0x50)) + return 1; + + /* field value length */ + if (qpack_encode_prefix_integer(out, 3, 7, 0x00)) + return 1; + + if (b_room(out) < 3) + return 1; + + b_putchr(out, a); + b_putchr(out, b); + b_putchr(out, c); + } + + return 0; +} + +/* Returns 0 on success else non-zero. */ +int qpack_encode_field_section_line(struct buffer *out) +{ + char qpack_field_section[] = { + '\x00', /* required insert count */ + '\x00', /* S + delta base */ + }; + + if (b_room(out) < 2) + return 1; + + b_putblk(out, qpack_field_section, 2); + + return 0; +} + +#define QPACK_LFL_WLN_BIT 0x20 // Literal field line with literal name + +/* Encode a header in literal field line with literal name. + * Returns 0 on success else non-zero. + */ +int qpack_encode_header(struct buffer *out, const struct ist n, const struct ist v) +{ + int i; + size_t sz = qpack_get_prefix_int_size(n.len, 3) + n.len + + qpack_get_prefix_int_size(v.len, 7) + v.len; + + if (sz > b_room(out)) + return 1; + + /* literal field line with literal name + * | 0 | 0 | 1 | N | H | . | . | . | + * N :(allow an intermediary to add the header in a dynamic table) + * H: huffman encoded + * name len + */ + qpack_encode_prefix_integer(out, n.len, 3, QPACK_LFL_WLN_BIT); + /* name */ + for (i = 0; i < n.len; ++i) + b_putchr(out, n.ptr[i]); + + /* | 0 | . | . | . | . | . | . | . | + * value len + */ + qpack_encode_prefix_integer(out, v.len, 7, 0x00); + /* value */ + for (i = 0; i < v.len; ++i) + b_putchr(out, v.ptr[i]); + + return 0; +} diff --git a/src/qpack-tbl.c b/src/qpack-tbl.c new file mode 100644 index 0000000..7c59fd2 --- /dev/null +++ b/src/qpack-tbl.c @@ -0,0 +1,415 @@ +/* + * QPACK header table management (draft-ietf-quic-qpack-20) + * + * Copyright 2020 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <inttypes.h> +#include <stdio.h> + +#include <import/ist.h> +#include <haproxy/http-hdr-t.h> +#include <haproxy/qpack-tbl.h> + +/* static header table as in draft-ietf-quic-qpack-20 Appendix A. */ +const struct http_hdr qpack_sht[QPACK_SHT_SIZE] = { + [ 0] = { .n = IST(":authority"), .v = IST("") }, + [ 1] = { .n = IST(":path"), .v = IST("/") }, + [ 2] = { .n = IST("age"), .v = IST("0") }, + [ 3] = { .n = IST("content-disposition"), .v = IST("") }, + [ 4] = { .n = IST("content-length"), .v = IST("0") }, + [ 5] = { .n = IST("cookie"), .v = IST("") }, + [ 6] = { .n = IST("date"), .v = IST("") }, + [ 7] = { .n = IST("etag"), .v = IST("") }, + [ 8] = { .n = IST("if-modified-since"), .v = IST("") }, + [ 9] = { .n = IST("if-none-match"), .v = IST("") }, + [10] = { .n = IST("last-modified"), .v = IST("") }, + [11] = { .n = IST("link"), .v = IST("") }, + [12] = { .n = IST("location"), .v = IST("") }, + [13] = { .n = IST("referer"), .v = IST("") }, + [14] = { .n = IST("set-cookie"), .v = IST("") }, + [15] = { .n = IST(":method"), .v = IST("CONNECT") }, + [16] = { .n = IST(":method"), .v = IST("DELETE") }, + [17] = { .n = IST(":method"), .v = IST("GET") }, + [18] = { .n = IST(":method"), .v = IST("HEAD") }, + [19] = { .n = IST(":method"), .v = IST("OPTIONS") }, + [20] = { .n = IST(":method"), .v = IST("POST") }, + [21] = { .n = IST(":method"), .v = IST("PUT") }, + [22] = { .n = IST(":scheme"), .v = IST("http") }, + [23] = { .n = IST(":scheme"), .v = IST("https") }, + [24] = { .n = IST(":status"), .v = IST("103") }, + [25] = { .n = IST(":status"), .v = IST("200") }, + [26] = { .n = IST(":status"), .v = IST("304") }, + [27] = { .n = IST(":status"), .v = IST("404") }, + [28] = { .n = IST(":status"), .v = IST("503") }, + [29] = { .n = IST("accept"), .v = IST("*/*") }, + [30] = { .n = IST("accept"), .v = IST("application/dns-message") }, + [31] = { .n = IST("accept-encoding"), .v = IST("gzip, deflate, br") }, + [32] = { .n = IST("accept-ranges"), .v = IST("bytes") }, + [33] = { .n = IST("access-control-allow-headers"), .v = IST("cache-control") }, + [34] = { .n = IST("access-control-allow-headers"), .v = IST("content-type") }, + [35] = { .n = IST("access-control-allow-origin"), .v = IST("*") }, + [36] = { .n = IST("cache-control"), .v = IST("max-age=0") }, + [37] = { .n = IST("cache-control"), .v = IST("max-age=2592000") }, + [38] = { .n = IST("cache-control"), .v = IST("max-age=604800") }, + [39] = { .n = IST("cache-control"), .v = IST("no-cache") }, + [40] = { .n = IST("cache-control"), .v = IST("no-store") }, + [41] = { .n = IST("cache-control"), .v = IST("public, max-age=31536000") }, + [42] = { .n = IST("content-encoding"), .v = IST("br") }, + [43] = { .n = IST("content-encoding"), .v = IST("gzip") }, + [44] = { .n = IST("content-type"), .v = IST("application/dns-message") }, + [45] = { .n = IST("content-type"), .v = IST("application/javascript") }, + [46] = { .n = IST("content-type"), .v = IST("application/json") }, + [47] = { .n = IST("content-type"), .v = IST("application/" + "x-www-form-urlencoded") }, + [48] = { .n = IST("content-type"), .v = IST("image/gif") }, + [49] = { .n = IST("content-type"), .v = IST("image/jpeg") }, + [50] = { .n = IST("content-type"), .v = IST("image/png") }, + [51] = { .n = IST("content-type"), .v = IST("text/css") }, + [52] = { .n = IST("content-type"), .v = IST("text/html;" + " charset=utf-8") }, + [53] = { .n = IST("content-type"), .v = IST("text/plain") }, + [54] = { .n = IST("content-type"), .v = IST("text/plain;" + "charset=utf-8") }, + [55] = { 
.n = IST("range"), .v = IST("bytes=0-") }, + [56] = { .n = IST("strict-transport-security"), .v = IST("max-age=31536000") }, + [57] = { .n = IST("strict-transport-security"), .v = IST("max-age=31536000;" + " includesubdomains") }, + [58] = { .n = IST("strict-transport-security"), .v = IST("max-age=31536000;" + " includesubdomains;" + " preload") }, + [59] = { .n = IST("vary"), .v = IST("accept-encoding") }, + [60] = { .n = IST("vary"), .v = IST("origin") }, + [61] = { .n = IST("x-content-type-options"), .v = IST("nosniff") }, + [62] = { .n = IST("x-xss-protection"), .v = IST("1; mode=block") }, + [63] = { .n = IST(":status"), .v = IST("100") }, + [64] = { .n = IST(":status"), .v = IST("204") }, + [65] = { .n = IST(":status"), .v = IST("206") }, + [66] = { .n = IST(":status"), .v = IST("302") }, + [67] = { .n = IST(":status"), .v = IST("400") }, + [68] = { .n = IST(":status"), .v = IST("403") }, + [69] = { .n = IST(":status"), .v = IST("421") }, + [70] = { .n = IST(":status"), .v = IST("425") }, + [71] = { .n = IST(":status"), .v = IST("500") }, + [72] = { .n = IST("accept-language"), .v = IST("") }, + [73] = { .n = IST("access-control-allow-credentials"), .v = IST("FALSE") }, + [74] = { .n = IST("access-control-allow-credentials"), .v = IST("TRUE") }, + [75] = { .n = IST("access-control-allow-headers"), .v = IST("*") }, + [76] = { .n = IST("access-control-allow-methods"), .v = IST("get") }, + [77] = { .n = IST("access-control-allow-methods"), .v = IST("get, post, options") }, + [78] = { .n = IST("access-control-allow-methods"), .v = IST("options") }, + [79] = { .n = IST("access-control-expose-headers"), .v = IST("content-length") }, + [80] = { .n = IST("access-control-request-headers"), .v = IST("content-type") }, + [81] = { .n = IST("access-control-request-method"), .v = IST("get") }, + [82] = { .n = IST("access-control-request-method"), .v = IST("post") }, + [83] = { .n = IST("alt-svc"), .v = IST("clear") }, + [84] = { .n = IST("authorization"), .v = IST("") }, + [85] = { .n = IST("content-security-policy"), .v = IST("script-src 'none';" + " object-src 'none';" + " base-uri 'none'") }, + [86] = { .n = IST("early-data"), .v = IST("1") }, + [87] = { .n = IST("expect-ct"), .v = IST("") }, + [88] = { .n = IST("forwarded"), .v = IST("") }, + [89] = { .n = IST("if-range"), .v = IST("") }, + [90] = { .n = IST("origin"), .v = IST("") }, + [91] = { .n = IST("purpose"), .v = IST("prefetch") }, + [92] = { .n = IST("server"), .v = IST("") }, + [93] = { .n = IST("timing-allow-origin"), .v = IST("*") }, + [94] = { .n = IST("upgrade-insecure-requests"), .v = IST("1") }, + [95] = { .n = IST("user-agent"), .v = IST("") }, + [96] = { .n = IST("x-forwarded-for"), .v = IST("") }, + [97] = { .n = IST("x-frame-options"), .v = IST("deny") }, + [98] = { .n = IST("x-frame-options"), .v = IST("sameorigin") }, +}; + +struct pool_head *pool_head_qpack_tbl = NULL; + +#ifdef DEBUG_QPACK +/* dump the whole dynamic header table */ +void qpack_dht_dump(FILE *out, const struct qpack_dht *dht) +{ + unsigned int i; + unsigned int slot; + char name[4096], value[4096]; + + for (i = QPACK_SHT_SIZE; i < QPACK_SHT_SIZE + dht->used; i++) { + slot = (qpack_get_dte(dht, i - QPACK_SHT_SIZE + 1) - dht->dte); + fprintf(out, "idx=%u slot=%u name=<%s> value=<%s> addr=%u-%u\n", + i, slot, + istpad(name, qpack_idx_to_name(dht, i)).ptr, + istpad(value, qpack_idx_to_value(dht, i)).ptr, + dht->dte[slot].addr, dht->dte[slot].addr+dht->dte[slot].nlen+dht->dte[slot].vlen-1); + } +} + +/* check for the whole dynamic header table consistency, 
abort on failures */ +void qpack_dht_check_consistency(const struct qpack_dht *dht) +{ + unsigned slot = qpack_dht_get_tail(dht); + unsigned used2 = dht->used; + unsigned total = 0; + + if (!dht->used) + return; + + if (dht->front >= dht->wrap) + abort(); + + if (dht->used > dht->wrap) + abort(); + + if (dht->head >= dht->wrap) + abort(); + + while (used2--) { + total += dht->dte[slot].nlen + dht->dte[slot].vlen; + slot++; + if (slot >= dht->wrap) + slot = 0; + } + + if (total != dht->total) { + fprintf(stderr, "%d: total=%u dht=%u\n", __LINE__, total, dht->total); + abort(); + } +} +#endif // DEBUG_QPACK + +/* rebuild a new dynamic header table from <dht> with an unwrapped index and + * contents at the end. The new table is returned, the caller must not use the + * previous one anymore. NULL may be returned if no table could be allocated. + */ +static struct qpack_dht *qpack_dht_defrag(struct qpack_dht *dht) +{ + struct qpack_dht *alt_dht; + uint16_t old, new; + uint32_t addr; + + /* Note: for small tables we could use alloca() instead but + * portability especially for large tables can be problematic. + */ + alt_dht = qpack_dht_alloc(); + if (!alt_dht) + return NULL; + + alt_dht->total = dht->total; + alt_dht->used = dht->used; + alt_dht->wrap = dht->used; + + new = 0; + addr = alt_dht->size; + + if (dht->used) { + /* start from the tail */ + old = qpack_dht_get_tail(dht); + do { + alt_dht->dte[new].nlen = dht->dte[old].nlen; + alt_dht->dte[new].vlen = dht->dte[old].vlen; + addr -= dht->dte[old].nlen + dht->dte[old].vlen; + alt_dht->dte[new].addr = addr; + + memcpy((void *)alt_dht + alt_dht->dte[new].addr, + (void *)dht + dht->dte[old].addr, + dht->dte[old].nlen + dht->dte[old].vlen); + + old++; + if (old >= dht->wrap) + old = 0; + new++; + } while (new < dht->used); + } + + alt_dht->front = alt_dht->head = new - 1; + + memcpy(dht, alt_dht, dht->size); + qpack_dht_free(alt_dht); + + return dht; +} + +/* Purges table dht until a header field of <needed> bytes fits according to + * the protocol (adding 32 bytes overhead). Returns non-zero on success, zero + * on failure (ie: table empty but still not sufficient). It must only be + * called when the table is not large enough to suit the new entry and there + * are some entries left. In case of doubt, use dht_make_room() instead. + */ +int __qpack_dht_make_room(struct qpack_dht *dht, unsigned int needed) +{ + unsigned int used = dht->used; + unsigned int wrap = dht->wrap; + unsigned int tail; + + do { + tail = ((dht->head + 1U < used) ? wrap : 0) + dht->head + 1U - used; + dht->total -= dht->dte[tail].nlen + dht->dte[tail].vlen; + if (tail == dht->front) + dht->front = dht->head; + used--; + } while (used && used * 32 + dht->total + needed + 32 > dht->size); + + dht->used = used; + + /* realign if empty */ + if (!used) + dht->front = dht->head = 0; + + /* pack the table if it doesn't wrap anymore */ + if (dht->head + 1U >= used) + dht->wrap = dht->head + 1; + + /* no need to check for 'used' here as if it doesn't fit, used==0 */ + return needed + 32 <= dht->size; +} + +/* tries to insert a new header <name>:<value> in front of the current head. A + * negative value is returned on error. 
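+ *
+ * Rough memory layout maintained below (addresses grow to the right; the
+ * diagram is a simplification):
+ *
+ *   [ header | dte[] index | ....free room.... | entry data .......... ]
+ *
+ * Entry payloads are stacked from the end of the area towards the index,
+ * so free room lives either between the index and the front entry's data
+ * (headroom) or around the tail entry's data (tailroom), as computed in
+ * the function body.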
+ */ +int qpack_dht_insert(struct qpack_dht *dht, struct ist name, struct ist value) +{ + unsigned int used; + unsigned int head; + unsigned int prev; + unsigned int wrap; + unsigned int tail; + uint32_t headroom, tailroom; + + if (!qpack_dht_make_room(dht, name.len + value.len)) + return 0; + + /* Now there is enough room in the table, that's guaranteed by the + * protocol, but not necessarily where we need it. + */ + + used = dht->used; + if (!used) { + /* easy, the table was empty */ + dht->front = dht->head = 0; + dht->wrap = dht->used = 1; + dht->total = 0; + head = 0; + dht->dte[head].addr = dht->size - (name.len + value.len); + goto copy; + } + + /* compute the new head, used and wrap position */ + prev = head = dht->head; + wrap = dht->wrap; + tail = qpack_dht_get_tail(dht); + + used++; + head++; + + if (head >= wrap) { + /* head is leading the entries, we either need to push the + * table further or to loop back to released entries. We could + * force to loop back when at least half of the allocatable + * entries are free but in practice it never happens. + */ + if ((sizeof(*dht) + (wrap + 1) * sizeof(dht->dte[0]) <= dht->dte[dht->front].addr)) + wrap++; + else if (head >= used) /* there's a hole at the beginning */ + head = 0; + else { + /* no more room, head hits tail and the index cannot be + * extended, we have to realign the whole table. + */ + if (!qpack_dht_defrag(dht)) + return -1; + + wrap = dht->wrap + 1; + head = dht->head + 1; + prev = head - 1; + tail = 0; + } + } + else if (used >= wrap) { + /* we've hit the tail, we need to reorganize the index so that + * the head is at the end (but not necessarily move the data). + */ + if (!qpack_dht_defrag(dht)) + return -1; + + wrap = dht->wrap + 1; + head = dht->head + 1; + prev = head - 1; + tail = 0; + } + + /* Now we have updated head, used and wrap, we know that there is some + * available room at least from the protocol's perspective. This space + * is split in two areas : + * + * 1: if the previous head was the front cell, the space between the + * end of the index table and the front cell's address. + * 2: if the previous head was the front cell, the space between the + * end of the tail and the end of the table ; or if the previous + * head was not the front cell, the space between the end of the + * tail and the head's address. + */ + if (prev == dht->front) { + /* the area was contiguous */ + headroom = dht->dte[dht->front].addr - (sizeof(*dht) + wrap * sizeof(dht->dte[0])); + tailroom = dht->size - dht->dte[tail].addr - dht->dte[tail].nlen - dht->dte[tail].vlen; + } + else { + /* it's already wrapped so we can't store anything in the headroom */ + headroom = 0; + tailroom = dht->dte[prev].addr - dht->dte[tail].addr - dht->dte[tail].nlen - dht->dte[tail].vlen; + } + + /* We can decide to stop filling the headroom as soon as there's enough + * room left in the tail to suit the protocol, but tests show that in + * practice it almost never happens in other situations so the extra + * test is useless and we simply fill the headroom as long as it's + * available and we don't wrap. 
+ */
+	if (prev == dht->front && headroom >= name.len + value.len) {
+		/* install upfront and update ->front */
+		dht->dte[head].addr = dht->dte[dht->front].addr - (name.len + value.len);
+		dht->front = head;
+	}
+	else if (tailroom >= name.len + value.len) {
+		dht->dte[head].addr = dht->dte[tail].addr + dht->dte[tail].nlen + dht->dte[tail].vlen + tailroom - (name.len + value.len);
+	}
+	else {
+		/* need to defragment the table before inserting upfront */
+		dht = qpack_dht_defrag(dht);
+		wrap = dht->wrap + 1;
+		head = dht->head + 1;
+		dht->dte[head].addr = dht->dte[dht->front].addr - (name.len + value.len);
+		dht->front = head;
+	}
+
+	dht->wrap = wrap;
+	dht->head = head;
+	dht->used = used;
+
+ copy:
+	dht->total += name.len + value.len;
+	dht->dte[head].nlen = name.len;
+	dht->dte[head].vlen = value.len;
+
+	memcpy((void *)dht + dht->dte[head].addr, name.ptr, name.len);
+	memcpy((void *)dht + dht->dte[head].addr + name.len, value.ptr, value.len);
+	return 0;
+}
diff --git a/src/queue.c b/src/queue.c
new file mode 100644
index 0000000..f20285b
--- /dev/null
+++ b/src/queue.c
@@ -0,0 +1,761 @@
+/*
+ * Queue management functions.
+ *
+ * Copyright 2000-2009 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+/* Short explanation on the locking, which is far from being trivial : a
+ * pendconn is a list element which necessarily is associated with an existing
+ * stream. It has pendconn->strm always valid. A pendconn may only be in one of
+ * these three states :
+ *   - unlinked : in this case it is an empty list head ;
+ *   - linked into the server's queue ;
+ *   - linked into the proxy's queue.
+ *
+ * A stream does not necessarily have such a pendconn. Thus the pendconn is
+ * designated by the stream->pend_pos pointer. This results in some properties :
+ *   - pendconn->strm->pend_pos is never NULL for any valid pendconn
+ *   - if p->node.node.leaf_p is NULL, the element is unlinked,
+ *     otherwise it necessarily belongs to one of the other lists ; this may
+ *     not be atomically checked under threads though ;
+ *   - pendconn->px is never NULL if pendconn->list is not empty
+ *   - pendconn->srv is never NULL if pendconn->list is in the server's queue,
+ *     and is always NULL if pendconn->list is in the backend's queue or empty.
+ *   - pendconn->target is NULL while the element is queued, and points to the
+ *     assigned server when the pendconn is picked.
+ *
+ * Threads complicate the design a little bit but rules remain simple :
+ *   - the server's queue lock must be held at least when manipulating the
+ *     server's queue, which is when adding a pendconn to the queue and when
+ *     removing a pendconn from the queue. It protects the queue's integrity.
+ *
+ *   - the proxy's queue lock must be held at least when manipulating the
+ *     proxy's queue, which is when adding a pendconn to the queue and when
+ *     removing a pendconn from the queue. It protects the queue's integrity.
+ *
+ *   - both locks are compatible and may be held at the same time.
+ *
+ *   - a pendconn_add() is only performed by the stream which will own the
+ *     pendconn ; the pendconn is allocated at this moment and returned ; it is
+ *     added to either the server or the proxy's queue while holding this
+ *     queue's lock.
+ *
+ *   - the pendconn is then met by a thread walking over the proxy or server's
+ *     queue with the respective lock held. This lock is exclusive and the
+ *     pendconn can only appear in one queue so by definition a single thread
+ *     may find this pendconn at a time.
+ *
+ *   - the pendconn is unlinked either by its own stream upon success/abort/
+ *     free, or by another one offering it its server slot. This is achieved by
+ *     pendconn_process_next_strm() under either the server or proxy's lock,
+ *     pendconn_redistribute() under the server's lock, pendconn_grab_from_px()
+ *     under the proxy's lock, or pendconn_unlink() under either the proxy's or
+ *     the server's lock depending on the queue the pendconn is attached to.
+ *
+ *   - no single operation except the pendconn initialisation prior to the
+ *     insertion is performed without either a queue lock held or the element
+ *     being unlinked and visible exclusively to its stream.
+ *
+ *   - pendconn_grab_from_px() and pendconn_process_next_strm() assign ->target
+ *     so that the stream knows what server to work with (via
+ *     pendconn_dequeue() which sets it on strm->target).
+ *
+ *   - a pendconn doesn't switch between queues, it stays where it is.
+ */
+
+#include <import/eb32tree.h>
+#include <haproxy/api.h>
+#include <haproxy/backend.h>
+#include <haproxy/http_rules.h>
+#include <haproxy/pool.h>
+#include <haproxy/queue.h>
+#include <haproxy/sample.h>
+#include <haproxy/server-t.h>
+#include <haproxy/stream.h>
+#include <haproxy/task.h>
+#include <haproxy/tcp_rules.h>
+#include <haproxy/thread.h>
+#include <haproxy/time.h>
+#include <haproxy/tools.h>
+
+
+#define NOW_OFFSET_BOUNDARY() ((now_ms - (TIMER_LOOK_BACK >> 12)) & 0xfffff)
+#define KEY_CLASS(key) ((u32)key & 0xfff00000)
+#define KEY_OFFSET(key) ((u32)key & 0x000fffff)
+#define KEY_CLASS_OFFSET_BOUNDARY(key) (KEY_CLASS(key) | NOW_OFFSET_BOUNDARY())
+#define MAKE_KEY(class, offset) (((u32)(class + 0x7ff) << 20) | ((u32)(now_ms + offset) & 0xfffff))
+
+DECLARE_POOL(pool_head_pendconn, "pendconn", sizeof(struct pendconn));
+
+/* returns the effective dynamic maxconn for a server, considering the minconn
+ * and the proxy's usage relative to its dynamic connections limit. It is
+ * expected that 0 < s->minconn <= s->maxconn when this is called. If the
+ * server is currently warming up, the slowstart is also applied to the
+ * resulting value, which can be lower than minconn in this case, but never
+ * less than 1.
+ */
+unsigned int srv_dynamic_maxconn(const struct server *s)
+{
+	unsigned int max;
+
+	if (s->proxy->beconn >= s->proxy->fullconn)
+		/* no fullconn or proxy is full */
+		max = s->maxconn;
+	else if (s->minconn == s->maxconn)
+		/* static limit */
+		max = s->maxconn;
+	else max = MAX(s->minconn,
+		       s->proxy->beconn * s->maxconn / s->proxy->fullconn);
+
+	if ((s->cur_state == SRV_ST_STARTING) &&
+	    ns_to_sec(now_ns) < s->last_change + s->slowstart &&
+	    ns_to_sec(now_ns) >= s->last_change) {
+		unsigned int ratio;
+		ratio = 100 * (ns_to_sec(now_ns) - s->last_change) / s->slowstart;
+		max = MAX(1, max * ratio / 100);
+	}
+	return max;
+}
+
+/* Remove the pendconn from the server's queue. At this stage, the connection
+ * is not really dequeued. It will be done during the process_stream. It is
+ * up to the caller to atomically decrement the pending counts.
+ *
+ * The caller must own the lock on the server queue. The pendconn must still be
+ * queued (p->node.leaf_p != NULL) and must be in a server (p->srv != NULL).
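+ *
+ * For intuition, a worked example of the accounting below (illustrative
+ * numbers, not from the original comment): the logged queue position is
+ * derived from the per-queue dequeue counter <idx>. Assuming the pendconn
+ * was added when idx was 41 (so p->queue_idx = 40, i.e. idx - 1) and 7
+ * other entries have been dequeued since (idx = 48), then:
+ *
+ *     srv_queue_pos += 48 - 40;   // logged position 8; an entry dequeued
+ *                                 // immediately would report position 1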
+ */ +static void __pendconn_unlink_srv(struct pendconn *p) +{ + p->strm->logs.srv_queue_pos += _HA_ATOMIC_LOAD(&p->queue->idx) - p->queue_idx; + eb32_delete(&p->node); +} + +/* Remove the pendconn from the proxy's queue. At this stage, the connection + * is not really dequeued. It will be done during the process_stream. It is + * up to the caller to atomically decrement the pending counts. + * + * The caller must own the lock on the proxy queue. The pendconn must still be + * queued (p->node.leaf_p != NULL) and must be in the proxy (p->srv == NULL). + */ +static void __pendconn_unlink_prx(struct pendconn *p) +{ + p->strm->logs.prx_queue_pos += _HA_ATOMIC_LOAD(&p->queue->idx) - p->queue_idx; + eb32_delete(&p->node); +} + +/* Locks the queue the pendconn element belongs to. This relies on both p->px + * and p->srv to be properly initialized (which is always the case once the + * element has been added). + */ +static inline void pendconn_queue_lock(struct pendconn *p) +{ + HA_SPIN_LOCK(QUEUE_LOCK, &p->queue->lock); +} + +/* Unlocks the queue the pendconn element belongs to. This relies on both p->px + * and p->srv to be properly initialized (which is always the case once the + * element has been added). + */ +static inline void pendconn_queue_unlock(struct pendconn *p) +{ + HA_SPIN_UNLOCK(QUEUE_LOCK, &p->queue->lock); +} + +/* Removes the pendconn from the server/proxy queue. At this stage, the + * connection is not really dequeued. It will be done during process_stream(). + * This function takes all the required locks for the operation. The pendconn + * must be valid, though it doesn't matter if it was already unlinked. Prefer + * pendconn_cond_unlink() to first check <p>. It also forces a serialization + * on p->del_lock to make sure another thread currently waking it up finishes + * first. + */ +void pendconn_unlink(struct pendconn *p) +{ + struct queue *q = p->queue; + struct proxy *px = q->px; + struct server *sv = q->sv; + uint oldidx; + int done = 0; + + oldidx = _HA_ATOMIC_LOAD(&p->queue->idx); + HA_SPIN_LOCK(QUEUE_LOCK, &q->lock); + HA_SPIN_LOCK(QUEUE_LOCK, &p->del_lock); + + if (p->node.node.leaf_p) { + eb32_delete(&p->node); + done = 1; + } + + HA_SPIN_UNLOCK(QUEUE_LOCK, &p->del_lock); + HA_SPIN_UNLOCK(QUEUE_LOCK, &q->lock); + + if (done) { + oldidx -= p->queue_idx; + if (sv) + p->strm->logs.srv_queue_pos += oldidx; + else + p->strm->logs.prx_queue_pos += oldidx; + + _HA_ATOMIC_DEC(&q->length); + _HA_ATOMIC_DEC(&px->totpend); + } +} + +/* Retrieve the first pendconn from tree <pendconns>. Classes are always + * considered first, then the time offset. The time does wrap, so the + * lookup is performed twice, one to retrieve the first class and a second + * time to retrieve the earliest time in this class. + */ +static struct pendconn *pendconn_first(struct eb_root *pendconns) +{ + struct eb32_node *node, *node2 = NULL; + u32 key; + + node = eb32_first(pendconns); + if (!node) + return NULL; + + key = KEY_CLASS_OFFSET_BOUNDARY(node->key); + node2 = eb32_lookup_ge(pendconns, key); + + if (!node2 || + KEY_CLASS(node2->key) != KEY_CLASS(node->key)) { + /* no other key in the tree, or in this class */ + return eb32_entry(node, struct pendconn, node); + } + + /* found a better key */ + return eb32_entry(node2, struct pendconn, node); +} + +/* Process the next pending connection from either a server or a proxy, and + * returns a strictly positive value on success (see below). If no pending + * connection is found, 0 is returned. Note that neither <srv> nor <px> may be + * NULL. 
Priority is given to the oldest request in the queue if both <srv> and
+ * <px> have pending requests. This ensures that no request will be left
+ * unserved. The <px> queue is not considered if the server (or a tracked
+ * server) is not RUNNING, is disabled, or has a null weight (server going
+ * down). The <srv> queue is still considered in this case, because if some
+ * connections remain there, it means that some requests have been forced there
+ * after it was seen down (eg: due to option persist). The stream is
+ * immediately marked as "assigned", and both its <srv> and <srv_conn> are set
+ * to <srv>.
+ *
+ * The proxy's queue will be consulted only if px_ok is non-zero.
+ *
+ * This function must only be called if the server queue is locked _AND_ the
+ * proxy queue is not. Today it is only called by process_srv_queue.
+ * It returns 1 if a pendconn was dequeued, otherwise 0.
+ */
+static int pendconn_process_next_strm(struct server *srv, struct proxy *px, int px_ok)
+{
+	struct pendconn *p = NULL;
+	struct pendconn *pp = NULL;
+	u32 pkey, ppkey;
+
+	p = NULL;
+	if (srv->queue.length)
+		p = pendconn_first(&srv->queue.head);
+
+	pp = NULL;
+	if (px_ok && px->queue.length) {
+		/* the lock only remains held as long as the pp is
+		 * in the proxy's queue.
+		 */
+		HA_SPIN_LOCK(QUEUE_LOCK, &px->queue.lock);
+		pp = pendconn_first(&px->queue.head);
+		if (!pp)
+			HA_SPIN_UNLOCK(QUEUE_LOCK, &px->queue.lock);
+	}
+
+	if (!p && !pp)
+		return 0;
+	else if (!pp)
+		goto use_p; /*  p != NULL */
+	else if (!p)
+		goto use_pp; /* pp != NULL */
+
+	/* p != NULL && pp != NULL */
+
+	if (KEY_CLASS(p->node.key) < KEY_CLASS(pp->node.key))
+		goto use_p;
+
+	if (KEY_CLASS(pp->node.key) < KEY_CLASS(p->node.key))
+		goto use_pp;
+
+	pkey  = KEY_OFFSET(p->node.key);
+	ppkey = KEY_OFFSET(pp->node.key);
+
+	if (pkey < NOW_OFFSET_BOUNDARY())
+		pkey += 0x100000; // key in the future
+
+	if (ppkey < NOW_OFFSET_BOUNDARY())
+		ppkey += 0x100000; // key in the future
+
+	if (pkey <= ppkey)
+		goto use_p;
+
+ use_pp:
+	/* we'd like to release the proxy lock ASAP to let other threads
+	 * work with other servers. But for this we must first hold the
+	 * pendconn alive to prevent a removal from its owning stream.
+	 */
+	HA_SPIN_LOCK(QUEUE_LOCK, &pp->del_lock);
+
+	/* now the element won't go, we can release the proxy */
+	__pendconn_unlink_prx(pp);
+	HA_SPIN_UNLOCK(QUEUE_LOCK, &px->queue.lock);
+
+	pp->strm_flags |= SF_ASSIGNED;
+	pp->target = srv;
+	stream_add_srv_conn(pp->strm, srv);
+
+	/* we must wake the task up before releasing the lock as it's the only
+	 * way to make sure the task still exists. The pendconn cannot vanish
+	 * under us since the task will need to take the lock anyway and to wait
+	 * if it wakes up on a different thread.
+	 */
+	task_wakeup(pp->strm->task, TASK_WOKEN_RES);
+	HA_SPIN_UNLOCK(QUEUE_LOCK, &pp->del_lock);
+
+	_HA_ATOMIC_DEC(&px->queue.length);
+	_HA_ATOMIC_INC(&px->queue.idx);
+	return 1;
+
+ use_p:
+	/* we don't need the px queue lock anymore, we have the server's lock */
+	if (pp)
+		HA_SPIN_UNLOCK(QUEUE_LOCK, &px->queue.lock);
+
+	p->strm_flags |= SF_ASSIGNED;
+	p->target = srv;
+	stream_add_srv_conn(p->strm, srv);
+
+	/* we must wake the task up before releasing the lock as it's the only
+	 * way to make sure the task still exists. The pendconn cannot vanish
+	 * under us since the task will need to take the lock anyway and to wait
+	 * if it wakes up on a different thread.
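+	 * (For intuition: the wakeup targets the owning stream, which will
+	 * call pendconn_dequeue(); that function re-checks the queue lock and
+	 * del_lock before freeing the pendconn, so even if it runs on another
+	 * thread it cannot free the element before we are done with it here.)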
+	 */
+	task_wakeup(p->strm->task, TASK_WOKEN_RES);
+	__pendconn_unlink_srv(p);
+
+	_HA_ATOMIC_DEC(&srv->queue.length);
+	_HA_ATOMIC_INC(&srv->queue.idx);
+	return 1;
+}
+
+/* Manages a server's connection queue. This function will try to dequeue as
+ * many pending streams as possible, and wake them up.
+ */
+void process_srv_queue(struct server *s)
+{
+	struct server *ref = s->track ? s->track : s;
+	struct proxy  *p = s->proxy;
+	int maxconn;
+	int stop = 0;
+	int done = 0;
+	int px_ok;
+
+	/* a server which is not usable, or which is a backup server that must
+	 * not be used at the moment, must not dequeue backend requests.
+	 */
+	px_ok = srv_currently_usable(ref) &&
+		(!(s->flags & SRV_F_BACKUP) ||
+		 (!p->srv_act &&
+		  (s == p->lbprm.fbck || (p->options & PR_O_USE_ALL_BK))));
+
+	/* let's repeat that under the lock on each round. Threads competing
+	 * for the same server will give up, knowing that at least one of
+	 * them will check the conditions again before quitting. In order
+	 * to avoid the deadly situation where one thread spends its time
+	 * dequeueing for others, we limit the number of rounds it does.
+	 * However we still re-enter the loop for one pass if there's no
+	 * more served, otherwise we could end up with no other thread
+	 * trying to dequeue them.
+	 */
+	while (!stop && (done < global.tune.maxpollevents || !s->served) &&
+	       s->served < (maxconn = srv_dynamic_maxconn(s))) {
+		if (HA_SPIN_TRYLOCK(QUEUE_LOCK, &s->queue.lock) != 0)
+			break;
+
+		while (s->served < maxconn) {
+			stop = !pendconn_process_next_strm(s, p, px_ok);
+			if (stop)
+				break;
+			_HA_ATOMIC_INC(&s->served);
+			done++;
+			if (done >= global.tune.maxpollevents)
+				break;
+		}
+		HA_SPIN_UNLOCK(QUEUE_LOCK, &s->queue.lock);
+	}
+
+	if (done) {
+		_HA_ATOMIC_SUB(&p->totpend, done);
+		_HA_ATOMIC_ADD(&p->served, done);
+		__ha_barrier_atomic_store();
+		if (p->lbprm.server_take_conn)
+			p->lbprm.server_take_conn(s);
+	}
+}
+
+/* Adds the stream <strm> to the pending connection queue of server <strm>->srv
+ * or to the one of <strm>->proxy if srv is NULL. All counters and back pointers
+ * are updated accordingly. Returns NULL if no memory is available, otherwise the
+ * pendconn itself. If the stream was already marked as served, its flag is
+ * cleared. It is illegal to call this function with a non-NULL strm->srv_conn.
+ * The stream's queue position is counted with an offset of -1 because we want
+ * to make sure that being at the first position in the queue reports 1.
+ *
+ * The queue is sorted by the composition of the priority_class, and the current
+ * timestamp offset by strm->priority_offset. The timestamp is in milliseconds
+ * and truncated to 20 bits, so will wrap every 17m28s575ms.
+ * The offset can be positive or negative, and an offset of 0 puts it in the
+ * middle of this range (~ 8 min). Note that this also means if the adjusted
+ * timestamp wraps around, the request will be misinterpreted as being of
+ * the highest priority for that priority class.
+ *
+ * This function must be called by the stream itself, so in the context of
+ * process_stream.
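+ *
+ * For intuition, a worked example of the key layout (illustrative numbers):
+ * assuming now_ms = 0x12345, priority_class = 0 and priority_offset = 0,
+ *
+ *     MAKE_KEY(0, 0) = ((0 + 0x7ff) << 20) | ((0x12345 + 0) & 0xfffff)
+ *                    = 0x7ff12345
+ *
+ * i.e. the upper 12 bits carry the class (biased by 0x7ff so that negative
+ * classes sort first) and the lower 20 bits carry the wrapping timestamp
+ * plus offset, which is exactly what KEY_CLASS() and KEY_OFFSET() extract.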
+ */ +struct pendconn *pendconn_add(struct stream *strm) +{ + struct pendconn *p; + struct proxy *px; + struct server *srv; + struct queue *q; + unsigned int *max_ptr; + unsigned int old_max, new_max; + + p = pool_alloc(pool_head_pendconn); + if (!p) + return NULL; + + p->target = NULL; + p->node.key = MAKE_KEY(strm->priority_class, strm->priority_offset); + p->strm = strm; + p->strm_flags = strm->flags; + HA_SPIN_INIT(&p->del_lock); + strm->pend_pos = p; + + px = strm->be; + if (strm->flags & SF_ASSIGNED) + srv = objt_server(strm->target); + else + srv = NULL; + + if (srv) { + q = &srv->queue; + max_ptr = &srv->counters.nbpend_max; + } + else { + q = &px->queue; + max_ptr = &px->be_counters.nbpend_max; + } + + p->queue = q; + p->queue_idx = _HA_ATOMIC_LOAD(&q->idx) - 1; // for logging only + new_max = _HA_ATOMIC_ADD_FETCH(&q->length, 1); + old_max = _HA_ATOMIC_LOAD(max_ptr); + while (new_max > old_max) { + if (likely(_HA_ATOMIC_CAS(max_ptr, &old_max, new_max))) + break; + } + __ha_barrier_atomic_store(); + + HA_SPIN_LOCK(QUEUE_LOCK, &q->lock); + eb32_insert(&q->head, &p->node); + HA_SPIN_UNLOCK(QUEUE_LOCK, &q->lock); + + _HA_ATOMIC_INC(&px->totpend); + return p; +} + +/* Redistribute pending connections when a server goes down. The number of + * connections redistributed is returned. It will take the server queue lock + * and does not use nor depend on other locks. + */ +int pendconn_redistribute(struct server *s) +{ + struct pendconn *p; + struct eb32_node *node, *nodeb; + int xferred = 0; + + /* The REDISP option was specified. We will ignore cookie and force to + * balance or use the dispatcher. */ + if ((s->proxy->options & (PR_O_REDISP|PR_O_PERSIST)) != PR_O_REDISP) + return 0; + + HA_SPIN_LOCK(QUEUE_LOCK, &s->queue.lock); + for (node = eb32_first(&s->queue.head); node; node = nodeb) { + nodeb = eb32_next(node); + + p = eb32_entry(node, struct pendconn, node); + if (p->strm_flags & SF_FORCE_PRST) + continue; + + /* it's left to the dispatcher to choose a server */ + __pendconn_unlink_srv(p); + p->strm_flags &= ~(SF_DIRECT | SF_ASSIGNED); + + task_wakeup(p->strm->task, TASK_WOKEN_RES); + xferred++; + } + HA_SPIN_UNLOCK(QUEUE_LOCK, &s->queue.lock); + + if (xferred) { + _HA_ATOMIC_SUB(&s->queue.length, xferred); + _HA_ATOMIC_SUB(&s->proxy->totpend, xferred); + } + return xferred; +} + +/* Check for pending connections at the backend, and assign some of them to + * the server coming up. The server's weight is checked before being assigned + * connections it may not be able to handle. The total number of transferred + * connections is returned. It will take the proxy's queue lock and will not + * use nor depend on other locks. + */ +int pendconn_grab_from_px(struct server *s) +{ + struct pendconn *p; + int maxconn, xferred = 0; + + if (!srv_currently_usable(s)) + return 0; + + /* if this is a backup server and there are active servers or at + * least another backup server was elected, then this one must + * not dequeue requests from the proxy. 
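+	 * (Equivalent condition, as an illustrative rewrite: a server may
+	 * dequeue from the proxy when
+	 *     !backup || (!srv_act && (elected_backup || allbackups))
+	 * which is the negation of the test below.)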
+	 */
+	if ((s->flags & SRV_F_BACKUP) &&
+	    (s->proxy->srv_act ||
+	     ((s != s->proxy->lbprm.fbck) && !(s->proxy->options & PR_O_USE_ALL_BK))))
+		return 0;
+
+	HA_SPIN_LOCK(QUEUE_LOCK, &s->proxy->queue.lock);
+	maxconn = srv_dynamic_maxconn(s);
+	while ((p = pendconn_first(&s->proxy->queue.head))) {
+		if (s->maxconn && s->served + xferred >= maxconn)
+			break;
+
+		__pendconn_unlink_prx(p);
+		p->target = s;
+
+		task_wakeup(p->strm->task, TASK_WOKEN_RES);
+		xferred++;
+	}
+	HA_SPIN_UNLOCK(QUEUE_LOCK, &s->proxy->queue.lock);
+	if (xferred) {
+		_HA_ATOMIC_SUB(&s->proxy->queue.length, xferred);
+		_HA_ATOMIC_SUB(&s->proxy->totpend, xferred);
+	}
+	return xferred;
+}
+
+/* Try to dequeue pending connection attached to the stream <strm>. It must
+ * always exist here. If the pendconn is still linked to the server or the
+ * proxy queue, nothing is done and the function returns 1. Otherwise,
+ * <strm>->flags and <strm>->target are updated, the pendconn is released and 0
+ * is returned.
+ *
+ * This function must be called by the stream itself, so in the context of
+ * process_stream.
+ */
+int pendconn_dequeue(struct stream *strm)
+{
+	struct pendconn *p;
+	int is_unlinked;
+
+	/* unexpected case because it is called by the stream itself and
+	 * only the stream can release a pendconn. So it is only
+	 * possible if a pendconn is released by someone else or if the
+	 * stream is supposed to be queued but without its associated
+	 * pendconn. In both cases it is a bug! */
+	BUG_ON(!strm->pend_pos);
+
+	p = strm->pend_pos;
+
+	/* note below : we need to grab the queue's lock to check for emptiness
+	 * because we don't want a partial _grab_from_px() or _redistribute()
+	 * to be called in parallel and show an empty list without having the
+	 * time to finish. With this we know that if we see the element
+	 * unlinked, these functions were completely done.
+	 */
+	pendconn_queue_lock(p);
+	is_unlinked = !p->node.node.leaf_p;
+	pendconn_queue_unlock(p);
+
+	/* serialize to make sure the element was finished processing */
+	HA_SPIN_LOCK(QUEUE_LOCK, &p->del_lock);
+	HA_SPIN_UNLOCK(QUEUE_LOCK, &p->del_lock);
+
+	if (!is_unlinked)
+		return 1;
+
+	/* the pendconn is not queued anymore and will not be so we're safe
+	 * to proceed.
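+	 * Three outcomes are then possible below (illustrative summary):
+	 * either p->target was set by a dequeuing thread, in which case the
+	 * stream skips load balancing (SF_ASSIGNED), or SF_ASSIGNED was
+	 * inherited from strm_flags and the prepared destination is kept, or
+	 * neither holds and the prepared address is dropped so that the LB
+	 * algorithm picks a new server.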
+ */ + strm->flags &= ~(SF_DIRECT | SF_ASSIGNED); + strm->flags |= p->strm_flags & (SF_DIRECT | SF_ASSIGNED); + + /* the entry might have been redistributed to another server */ + if (!(strm->flags & SF_ASSIGNED)) + sockaddr_free(&strm->scb->dst); + + if (p->target) { + /* a server picked this pendconn, it must skip LB */ + strm->target = &p->target->obj_type; + strm->flags |= SF_ASSIGNED; + } + + strm->pend_pos = NULL; + pool_free(pool_head_pendconn, p); + return 0; +} + +static enum act_return action_set_priority_class(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct sample *smp; + + smp = sample_fetch_as_type(px, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL, rule->arg.expr, SMP_T_SINT); + if (!smp) + return ACT_RET_CONT; + + s->priority_class = queue_limit_class(smp->data.u.sint); + return ACT_RET_CONT; +} + +static enum act_return action_set_priority_offset(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct sample *smp; + + smp = sample_fetch_as_type(px, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL, rule->arg.expr, SMP_T_SINT); + if (!smp) + return ACT_RET_CONT; + + s->priority_offset = queue_limit_offset(smp->data.u.sint); + + return ACT_RET_CONT; +} + +static enum act_parse_ret parse_set_priority_class(const char **args, int *arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + unsigned int where = 0; + + rule->arg.expr = sample_parse_expr((char **)args, arg, px->conf.args.file, + px->conf.args.line, err, &px->conf.args, NULL); + if (!rule->arg.expr) + return ACT_RET_PRS_ERR; + + if (px->cap & PR_CAP_FE) + where |= SMP_VAL_FE_HRQ_HDR; + if (px->cap & PR_CAP_BE) + where |= SMP_VAL_BE_HRQ_HDR; + + if (!(rule->arg.expr->fetch->val & where)) { + memprintf(err, + "fetch method '%s' extracts information from '%s', none of which is available here", + args[0], sample_src_names(rule->arg.expr->fetch->use)); + free(rule->arg.expr); + return ACT_RET_PRS_ERR; + } + + rule->action = ACT_CUSTOM; + rule->action_ptr = action_set_priority_class; + return ACT_RET_PRS_OK; +} + +static enum act_parse_ret parse_set_priority_offset(const char **args, int *arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + unsigned int where = 0; + + rule->arg.expr = sample_parse_expr((char **)args, arg, px->conf.args.file, + px->conf.args.line, err, &px->conf.args, NULL); + if (!rule->arg.expr) + return ACT_RET_PRS_ERR; + + if (px->cap & PR_CAP_FE) + where |= SMP_VAL_FE_HRQ_HDR; + if (px->cap & PR_CAP_BE) + where |= SMP_VAL_BE_HRQ_HDR; + + if (!(rule->arg.expr->fetch->val & where)) { + memprintf(err, + "fetch method '%s' extracts information from '%s', none of which is available here", + args[0], sample_src_names(rule->arg.expr->fetch->use)); + free(rule->arg.expr); + return ACT_RET_PRS_ERR; + } + + rule->action = ACT_CUSTOM; + rule->action_ptr = action_set_priority_offset; + return ACT_RET_PRS_OK; +} + +static struct action_kw_list tcp_cont_kws = {ILH, { + { "set-priority-class", parse_set_priority_class }, + { "set-priority-offset", parse_set_priority_offset }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, tcp_req_cont_keywords_register, &tcp_cont_kws); + +static struct action_kw_list http_req_kws = {ILH, { + { "set-priority-class", parse_set_priority_class }, + { "set-priority-offset", parse_set_priority_offset }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, http_req_keywords_register, &http_req_kws); + +static int +smp_fetch_priority_class(const struct arg *args, struct sample *smp, const 
char *kw, void *private)
+{
+	if (!smp->strm)
+		return 0;
+
+	smp->data.type = SMP_T_SINT;
+	smp->data.u.sint = smp->strm->priority_class;
+
+	return 1;
+}
+
+static int
+smp_fetch_priority_offset(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	if (!smp->strm)
+		return 0;
+
+	smp->data.type = SMP_T_SINT;
+	smp->data.u.sint = smp->strm->priority_offset;
+
+	return 1;
+}
+
+
+static struct sample_fetch_kw_list smp_kws = {ILH, {
+	{ "prio_class", smp_fetch_priority_class, 0, NULL, SMP_T_SINT, SMP_USE_INTRN, },
+	{ "prio_offset", smp_fetch_priority_offset, 0, NULL, SMP_T_SINT, SMP_USE_INTRN, },
+	{ /* END */},
+}};
+
+INITCALL1(STG_REGISTER, sample_register_fetches, &smp_kws);
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/src/quic_ack.c b/src/quic_ack.c
new file mode 100644
index 0000000..d28a698
--- /dev/null
+++ b/src/quic_ack.c
@@ -0,0 +1,258 @@
+#include <inttypes.h>
+
+#include <import/eb64tree.h>
+
+#include <haproxy/quic_conn-t.h>
+#include <haproxy/quic_enc.h>
+#include <haproxy/quic_trace.h>
+#include <haproxy/trace.h>
+
+DECLARE_STATIC_POOL(pool_head_quic_arng, "quic_arng", sizeof(struct quic_arng_node));
+
+/* Deallocate the <arngs> tree of ACK ranges. */
+void quic_free_arngs(struct quic_conn *qc, struct quic_arngs *arngs)
+{
+	struct eb64_node *n;
+	struct quic_arng_node *ar;
+
+	TRACE_ENTER(QUIC_EV_CONN_CLOSE, qc);
+
+	n = eb64_first(&arngs->root);
+	while (n) {
+		struct eb64_node *next;
+
+		ar = eb64_entry(n, struct quic_arng_node, first);
+		next = eb64_next(n);
+		eb64_delete(n);
+		pool_free(pool_head_quic_arng, ar);
+		n = next;
+	}
+
+	TRACE_LEAVE(QUIC_EV_CONN_CLOSE, qc);
+}
+
+/* Return the gap value between <p> and <q> ACK ranges where <q> follows <p> in
+ * descending order.
+ */
+static inline size_t sack_gap(struct quic_arng_node *p,
+                              struct quic_arng_node *q)
+{
+	return p->first.key - q->last - 2;
+}
+
+/* Set the encoded size of <arngs> QUIC ack ranges. */
+static void quic_arngs_set_enc_sz(struct quic_conn *qc, struct quic_arngs *arngs)
+{
+	struct eb64_node *node, *next;
+	struct quic_arng_node *ar, *ar_next;
+
+	TRACE_ENTER(QUIC_EV_CONN_TXPKT, qc);
+
+	node = eb64_last(&arngs->root);
+	if (!node)
+		goto leave;
+
+	ar = eb64_entry(node, struct quic_arng_node, first);
+	arngs->enc_sz = quic_int_getsize(ar->last) +
+		quic_int_getsize(ar->last - ar->first.key) + quic_int_getsize(arngs->sz - 1);
+
+	while ((next = eb64_prev(node))) {
+		ar_next = eb64_entry(next, struct quic_arng_node, first);
+		arngs->enc_sz += quic_int_getsize(sack_gap(ar, ar_next)) +
+			quic_int_getsize(ar_next->last - ar_next->first.key);
+		node = next;
+		ar = eb64_entry(node, struct quic_arng_node, first);
+	}
+
+ leave:
+	TRACE_LEAVE(QUIC_EV_CONN_TXPKT, qc);
+}
+
+/* Insert <ar> ack range into <arngs> tree of ack ranges.
+ * Returns the ack range node which has been inserted if succeeded, NULL if not.
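+ * When the tree already holds QUIC_MAX_ACK_RANGES ranges, the range with
+ * the lowest packet numbers is evicted first, eb64_first() returning the
+ * smallest key. (Illustrative example: with a full tree holding [0..3] and
+ * [10..12] among others, inserting [20..21] would drop [0..3].)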
+ */
+static inline
+struct quic_arng_node *quic_insert_new_range(struct quic_conn *qc,
+                                             struct quic_arngs *arngs,
+                                             struct quic_arng *ar)
+{
+	struct quic_arng_node *new_ar;
+
+	TRACE_ENTER(QUIC_EV_CONN_RXPKT, qc);
+
+	if (arngs->sz >= QUIC_MAX_ACK_RANGES) {
+		struct eb64_node *first;
+
+		first = eb64_first(&arngs->root);
+		BUG_ON(first == NULL);
+		eb64_delete(first);
+		pool_free(pool_head_quic_arng, first);
+		arngs->sz--;
+	}
+
+	new_ar = pool_alloc(pool_head_quic_arng);
+	if (!new_ar) {
+		TRACE_ERROR("ack range allocation failed", QUIC_EV_CONN_RXPKT, qc);
+		goto leave;
+	}
+
+	new_ar->first.key = ar->first;
+	new_ar->last = ar->last;
+	eb64_insert(&arngs->root, &new_ar->first);
+	arngs->sz++;
+
+ leave:
+	TRACE_LEAVE(QUIC_EV_CONN_RXPKT, qc);
+	return new_ar;
+}
+
+/* Update <arngs> tree of ACK ranges with <ar> as new ACK range value.
+ * Note that this function computes the number of bytes required to encode
+ * this tree of ACK ranges in descending order.
+ *
+ *    Descending order
+ *    ------------->
+ *                range1                  range2
+ *    ..........|--------|..............|--------|
+ *               ^        ^              ^        ^
+ *               |        |              |        |
+ *             last1     first1        last2    first2
+ *    ..........+--------+--------------+--------+......
+ *                 diff1       gap12       diff2
+ *
+ * To encode the previous list of ranges we must encode integers as follows in
+ * descending order:
+ *          enc(last2),enc(diff2),enc(gap12),enc(diff1)
+ *  with diff1 = last1 - first1
+ *       diff2 = last2 - first2
+ *       gap12 = first1 - last2 - 2 (>= 0)
+ *
+ * Returns 1 on success, 0 on error.
+ */
+int quic_update_ack_ranges_list(struct quic_conn *qc,
+                                struct quic_arngs *arngs,
+                                struct quic_arng *ar)
+{
+	int ret = 0;
+	struct eb64_node *le;
+	struct quic_arng_node *new_node;
+	struct eb64_node *new;
+
+	TRACE_ENTER(QUIC_EV_CONN_RXPKT, qc);
+
+	new = NULL;
+	if (eb_is_empty(&arngs->root)) {
+		new_node = quic_insert_new_range(qc, arngs, ar);
+		if (new_node)
+			ret = 1;
+
+		goto leave;
+	}
+
+	le = eb64_lookup_le(&arngs->root, ar->first);
+	if (!le) {
+		new_node = quic_insert_new_range(qc, arngs, ar);
+		if (!new_node)
+			goto leave;
+
+		new = &new_node->first;
+	}
+	else {
+		struct quic_arng_node *le_ar =
+			eb64_entry(le, struct quic_arng_node, first);
+
+		/* Already existing range */
+		if (le_ar->last >= ar->last) {
+			ret = 1;
+		}
+		else if (le_ar->last + 1 >= ar->first) {
+			le_ar->last = ar->last;
+			new = le;
+			new_node = le_ar;
+		}
+		else {
+			new_node = quic_insert_new_range(qc, arngs, ar);
+			if (!new_node)
+				goto leave;
+
+			new = &new_node->first;
+		}
+	}
+
+	/* Verify that the new inserted node does not overlap the nodes
+	 * which follow it.
+	 */
+	if (new) {
+		struct eb64_node *next;
+		struct quic_arng_node *next_node;
+
+		while ((next = eb64_next(new))) {
+			next_node =
+				eb64_entry(next, struct quic_arng_node, first);
+			if (new_node->last + 1 < next_node->first.key)
+				break;
+
+			if (next_node->last > new_node->last)
+				new_node->last = next_node->last;
+			eb64_delete(next);
+			pool_free(pool_head_quic_arng, next_node);
+			/* Decrement the size of these ranges. */
+			arngs->sz--;
+		}
+	}
+
+	ret = 1;
+ leave:
+	quic_arngs_set_enc_sz(qc, arngs);
+	TRACE_LEAVE(QUIC_EV_CONN_RXPKT, qc);
+	return ret;
+}
+
+/* Remove already sent ranges of acknowledged packet numbers from the
+ * <arngs> tree below <largest_acked_pn>, possibly
+ * updating the range which contains <largest_acked_pn>.
+ * Never fails.
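+ *
+ * Illustrative example (with assumed numbers): given ranges [0..3] [5..7]
+ * [10..12] and largest_acked_pn = 6, [0..3] is removed, [5..7] is re-keyed
+ * to [7..7] (its first becomes largest_acked_pn + 1), and [10..12] is left
+ * untouched.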
+ */
+void qc_treat_ack_of_ack(struct quic_conn *qc, struct quic_arngs *arngs,
+                         int64_t largest_acked_pn)
+{
+	struct eb64_node *ar, *next_ar;
+
+	TRACE_ENTER(QUIC_EV_CONN_PRSAFRM, qc);
+
+	ar = eb64_first(&arngs->root);
+	while (ar) {
+		struct quic_arng_node *ar_node;
+
+		next_ar = eb64_next(ar);
+		ar_node = eb64_entry(ar, struct quic_arng_node, first);
+
+		if ((int64_t)ar_node->first.key > largest_acked_pn) {
+			TRACE_DEVEL("first.key > largest", QUIC_EV_CONN_PRSAFRM, qc);
+			break;
+		}
+
+		if (largest_acked_pn < ar_node->last) {
+			eb64_delete(ar);
+			ar_node->first.key = largest_acked_pn + 1;
+			eb64_insert(&arngs->root, ar);
+			break;
+		}
+
+		/* Do not empty the tree: the first ACK range contains the
+		 * largest acknowledged packet number.
+		 */
+		if (arngs->sz == 1)
+			break;
+
+		eb64_delete(ar);
+		pool_free(pool_head_quic_arng, ar_node);
+		arngs->sz--;
+		ar = next_ar;
+	}
+
+	TRACE_LEAVE(QUIC_EV_CONN_PRSAFRM, qc);
+}
+
diff --git a/src/quic_cc.c b/src/quic_cc.c
new file mode 100644
index 0000000..8fd99d3
--- /dev/null
+++ b/src/quic_cc.c
@@ -0,0 +1,49 @@
+/*
+ * Congestion controller handling.
+ *
+ * This file contains definitions for QUIC congestion control.
+ *
+ * Copyright 2019 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <haproxy/quic_cc.h>
+
+struct quic_cc_algo *default_quic_cc_algo = &quic_cc_algo_cubic;
+
+/*
+ * Initialize <cc> congestion control with <algo> as algorithm, for the
+ * <qc> connection.
+ */
+void quic_cc_init(struct quic_cc *cc,
+                  struct quic_cc_algo *algo, struct quic_conn *qc)
+{
+	cc->qc = qc;
+	cc->algo = algo;
+	if (cc->algo->init)
+		(cc->algo->init(cc));
+}
+
+/* Send <ev> event to <cc> congestion controller. */
+void quic_cc_event(struct quic_cc *cc, struct quic_cc_event *ev)
+{
+	cc->algo->event(cc, ev);
+}
+
+void quic_cc_state_trace(struct buffer *buf, const struct quic_cc *cc)
+{
+	cc->algo->state_trace(buf, cc);
+}
diff --git a/src/quic_cc_cubic.c b/src/quic_cc_cubic.c
new file mode 100644
index 0000000..76a62ac
--- /dev/null
+++ b/src/quic_cc_cubic.c
@@ -0,0 +1,542 @@
+#include <haproxy/quic_cc.h>
+#include <haproxy/quic_trace.h>
+#include <haproxy/ticks.h>
+#include <haproxy/trace.h>
+
+/* IMPORTANT NOTE about the units defined in RFC 9438
+ * (CUBIC for Fast and Long-Distance Networks):
+ *
+ * RFC 9438 4.1. Definitions:
+ * The unit of all window sizes in this document is segments of the SMSS, and
+ * the unit of all times is seconds. Implementations can use bytes to express
+ * window sizes, which would require factoring in the SMSS wherever necessary
+ * and replacing segments_acked (Figure 4) with the number of acknowledged
+ * bytes.
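+ *
+ * (Illustrative conversion with assumed numbers: with an MTU of 1252
+ * bytes, a window of 10 segments becomes 12520 bytes, and a per-ACK
+ * growth of "segments_acked / cwnd" segments becomes
+ * "acked_bytes * mtu / cwnd_in_bytes" bytes, which is the scaling
+ * applied throughout this file.)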
+ */
+
+/* So, this is the reason why here in this implementation each time a number
+ * of segments is used (typically a congestion window value), its value is
+ * multiplied by the MTU value.
+ */
+
+/* This source file is highly inspired from Linux kernel source file
+ * implementation for TCP Cubic. In fact, we have no choice if we do
+ * not want to use any floating point operations to be fast!
+ * (See net/ipv4/tcp_cubic.c)
+ */
+
+/* Constants definitions:
+ * CUBIC_BETA_SCALED refers to the scaled value of RFC 9438 beta_cubic variable.
+ * CUBIC_C_SCALED refers to the scaled value of RFC 9438 C variable.
+ */
+
+/* The right shifting value to apply to scaled values to get its real value. */
+#define CUBIC_SCALE_FACTOR_SHIFT 10
+
+/* CUBIC multiplicative decrease factor as described in RFC 9438 section 4.6 */
+#define CUBIC_BETA_SCALED 717  /* beta_cubic = 0.7 (constant) */
+
+/* CUBIC C constant that determines the aggressiveness of CUBIC in competing
+ * with other congestion control algorithms in high-BDP networks.
+ */
+#define CUBIC_C_SCALED 410 /* RFC 9438 C = 0.4 segment/seconds^3
+                            * or 410 mB/s^3 in this implementation.
+                            */
+
+/* The scaled value of 1 */
+#define CUBIC_ONE_SCALED (1 << CUBIC_SCALE_FACTOR_SHIFT)
+
+/* The maximum time value which may be cubed and multiplied by CUBIC_C_SCALED */
+#define CUBIC_TIME_LIMIT 355535ULL /* ms */
+
+/* By connection CUBIC algorithm state. Note that the current congestion window
+ * value is not stored in this structure.
+ */
+struct cubic {
+	/* QUIC_CC_ST_* state values. */
+	uint32_t state;
+	/* Slow start threshold (in bytes) */
+	uint32_t ssthresh;
+	/* Remaining number of acknowledged bytes between two ACKs for CUBIC congestion
+	 * control window (in bytes).
+	 */
+	uint32_t remaining_inc;
+	/* Time at which the current avoidance stage started (in ms). */
+	uint32_t t_epoch;
+	/* The window to reach for each recovery period during a concave region (in bytes). */
+	uint32_t W_target;
+	/* The time period to reach W_target during a concave region (in ms). */
+	uint32_t K;
+	/* The last window maximum reached (in bytes). */
+	uint32_t last_w_max;
+	/* Estimated value of the Reno congestion window in the TCP-friendly region (in bytes). */
+	uint32_t W_est;
+	/* Remaining number of acknowledged bytes between two ACKs for estimated
+	 * TCP-Reno congestion control window (in bytes).
+	 */
+	uint32_t remaining_W_est_inc;
+	/* Start time of recovery period (used to avoid re-entering this state, if already
+	 * in recovery period) (in ms).
+	 */
+	uint32_t recovery_start_time;
+};
+
+static void quic_cc_cubic_reset(struct quic_cc *cc)
+{
+	struct cubic *c = quic_cc_priv(cc);
+
+	TRACE_ENTER(QUIC_EV_CONN_CC, cc->qc);
+	c->state = QUIC_CC_ST_SS;
+	c->ssthresh = QUIC_CC_INFINITE_SSTHESH;
+	c->remaining_inc = 0;
+	c->remaining_W_est_inc = 0;
+	c->t_epoch = 0;
+	c->W_target = 0;
+	c->K = 0;
+	c->last_w_max = 0;
+	c->W_est = 0;
+	c->recovery_start_time = 0;
+	TRACE_LEAVE(QUIC_EV_CONN_CC, cc->qc);
+}
+
+static int quic_cc_cubic_init(struct quic_cc *cc)
+{
+	quic_cc_cubic_reset(cc);
+	return 1;
+}
+
+/* Cubic root.
+ * Highly inspired from Linux kernel sources.
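+ * For intuition (illustrative values): cubic_root(0) = 0, cubic_root(8)
+ * yields 2 and cubic_root(1000000) yields approximately 100; the lookup
+ * table seeds an initial estimate and a Newton-Raphson step refines it,
+ * so results are close integer approximations rather than exact roots.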
+ * See net/ipv4/tcp_cubic.c + */ +static uint32_t cubic_root(uint64_t val) +{ + uint32_t x, b, shift; + + static const uint8_t v[] = { + 0, 54, 54, 54, 118, 118, 118, 118, + 123, 129, 134, 138, 143, 147, 151, 156, + 157, 161, 164, 168, 170, 173, 176, 179, + 181, 185, 187, 190, 192, 194, 197, 199, + 200, 202, 204, 206, 209, 211, 213, 215, + 217, 219, 221, 222, 224, 225, 227, 229, + 231, 232, 234, 236, 237, 239, 240, 242, + 244, 245, 246, 248, 250, 251, 252, 254, + }; + + if (!val || (b = my_flsl(val)) < 7) { + /* val in [0..63] */ + return ((uint32_t)v[(uint32_t)val] + 35) >> 6; + } + + b = ((b * 84) >> 8) - 1; + shift = (val >> (b * 3)); + + x = ((uint32_t)(((uint32_t)v[shift] + 10) << b)) >> 6; + + x = 2 * x + (uint32_t)(val / ((uint64_t)x * (uint64_t)(x - 1))); + x = ((x * 341) >> 10); + + return x; +} + +/* + * RFC 9438 3.1. Principle 1 for the CUBIC Increase Function + * + * For better network utilization and stability, CUBIC [HRX08] uses a cubic + * window increase function in terms of the elapsed time from the last + * congestion event. While most congestion control algorithms that provide + * alternatives to Reno increase the congestion window using convex functions, + * CUBIC uses both the concave and convex profiles of a cubic function for + * window growth. + * + * After a window reduction in response to a congestion event detected by + * duplicate acknowledgments (ACKs), Explicit Congestion Notification-Echo + * (ECN-Echo (ECE)) ACKs [RFC3168], RACK-TLP for TCP [RFC8985], or QUIC loss + * detection [RFC9002], CUBIC remembers the congestion window size at which it + * received the congestion event and performs a multiplicative decrease of the + * congestion window. When CUBIC enters into congestion avoidance, it starts to + * increase the congestion window using the concave profile of the cubic + * function. The cubic function is set to have its plateau at the remembered + * congestion window size, so that the concave window increase continues until + * then. After that, the cubic function turns into a convex profile and the + * convex window increase begins. + * + * W_cubic(time) (bytes) + * ^ convex region + * | <-------------------------> + * | . + + * | . + + * | . + + * | . + + * | . + ^ + * | . + | W_cubic_t + * | . + | + * | . + | + * W_target |-----------+--------------------------+------------------------+ + * (W_max) | +. + . t + * | + . + . + * | + . + . + * | + . + . + * | + . + . + * | .+ . + * | + . + * | + . + * | + . + * | . . + * | . . + * | . . + * +-----------+--------------------------+-+------------------------> time (s) + * 0 t_epoch (t_epoch + K) + * <--------------------------> + * . concave region + * . + * congestion + * event + * + * RFC 9438 4.2. 
Window Increase Function: + * + * W_cubic(t) = C*(t-K)^3 + W_max (Figure 1) + * K = cubic_root((W_max - cwnd_epoch)/C) (Figure 2) + * + * +--------------------------------------------------------------------+ + * | RFC 9438 definitions | Code variables | + * +--------------------------------------------------------------------+ + * | C (segments/s^3) | CUBIC_C_SCALED (mB/s^3) | + * +--------------------------------------------------------------------+ + * | W_max (segments) | c->last_w_max - path->cwnd (bytes) | + * +--------------------------------------------------------------------+ + * | K (s) | c->K (ms) | + * +--------------------------------------------------------------------+ + * | beta_cubic (constant) | CUBIC_BETA_SCALED (constant) | + * +--------------------------------------------------------------------+ + */ +static inline void quic_cubic_update(struct quic_cc *cc, uint32_t acked) +{ + struct cubic *c = quic_cc_priv(cc); + struct quic_cc_path *path = container_of(cc, struct quic_cc_path, cc); + /* The elapsed time since the start of the congestion event. */ + uint32_t elapsed_time; + /* Target value of the congestion window. */ + uint32_t target; + /* The time at which the congestion window will be computed based + * on the cubic increase function. + */ + uint64_t t; + /* The computed value of the congestion window at time t based on the cubic + * increase function. + */ + uint64_t W_cubic_t; + uint32_t inc, inc_diff; + + TRACE_ENTER(QUIC_EV_CONN_CC, cc->qc); + if (!c->t_epoch) { + c->t_epoch = now_ms; + if (c->last_w_max <= path->cwnd) { + c->K = 0; + c->W_target = path->cwnd; + } + else { + /* K value computing (in seconds): + * K = cubic_root((W_max - cwnd_epoch)/C) (Figure 2) + * Note that K is stored in milliseconds. + */ + c->K = cubic_root(((c->last_w_max - path->cwnd) << CUBIC_SCALE_FACTOR_SHIFT) / (CUBIC_C_SCALED * path->mtu)); + /* Convert to miliseconds. */ + c->K *= 1000; + c->W_target = c->last_w_max; + } + + c->W_est = path->cwnd; + c->remaining_inc = 0; + c->remaining_W_est_inc = 0; + } + + elapsed_time = now_ms + path->loss.rtt_min - c->t_epoch; + if (elapsed_time < c->K) { + t = c->K - elapsed_time; + } + else { + t = elapsed_time - c->K; + } + + if (t > CUBIC_TIME_LIMIT) { + /* TODO : should not happen if we handle the case + * of very late acks receipt. This must be handled as a congestion + * control event: a very late ack should trigger a congestion + * control algorithm reset. + */ + quic_cc_cubic_reset(cc); + goto leave; + } + + /* Compute W_cubic_t at t time. */ + W_cubic_t = CUBIC_C_SCALED * path->mtu; + W_cubic_t = (W_cubic_t * t) / 1000; + W_cubic_t = (W_cubic_t * t) / 1000; + W_cubic_t = (W_cubic_t * t) / 1000; + W_cubic_t >>= CUBIC_SCALE_FACTOR_SHIFT; + if (elapsed_time < c->K) + target = c->W_target - W_cubic_t; + else + target = c->W_target + W_cubic_t; + + if (target > path->cwnd) { + /* Concave region */ + + /* RFC 9438 4.4. Concave Region + * + * When receiving a new ACK in congestion avoidance, if CUBIC is not in + * the Reno-friendly region and cwnd is less than Wmax, then CUBIC is + * in the concave region. In this region, cwnd MUST be incremented by + * (target - cwnd) / cwnd. + */ + inc_diff = c->remaining_inc + path->mtu * (target - path->cwnd); + c->remaining_inc = inc_diff % path->cwnd; + inc = inc_diff / path->cwnd; + } + else { + /* Convex region: very small increment */ + + /* RFC 9438 4.5. 
Convex Region
+		 *
+		 * When receiving a new ACK in congestion avoidance, if CUBIC is not in
+		 * the Reno-friendly region and cwnd is larger than or equal to Wmax,
+		 * then CUBIC is in the convex region. The convex region indicates that
+		 * the network conditions might have changed since the last congestion
+		 * event, possibly implying more available bandwidth after some flow
+		 * departures. Since the Internet is highly asynchronous, some amount
+		 * of perturbation is always possible without causing a major change in
+		 * available bandwidth. Unless the cwnd is overridden by the AIMD window
+		 * increase, CUBIC will behave cautiously when operating in this region.
+		 * The convex profile aims to increase the window very slowly at the
+		 * beginning when cwnd is around Wmax and then gradually increases its
+		 * rate of increase. This region is also called the "maximum probing
+		 * phase", since CUBIC is searching for a new Wmax. In this region,
+		 * cwnd MUST be incremented by (target - cwnd) / cwnd for each received
+		 * new ACK, where target is calculated as described in Section 4.2.
+		 */
+		inc_diff = c->remaining_inc + path->mtu;
+		c->remaining_inc = inc_diff % (100 * path->cwnd);
+		inc = inc_diff / (100 * path->cwnd);
+	}
+
+	inc_diff = c->remaining_W_est_inc + path->mtu * acked;
+	c->W_est += inc_diff / path->cwnd;
+	c->remaining_W_est_inc = inc_diff % path->cwnd;
+
+	/* TCP friendliness :
+	 * RFC 9438 4.3. Reno-Friendly Region
+	 *
+	 * Reno performs well in certain types of networks -- for example, under
+	 * short RTTs and small bandwidths (or small BDPs). In these networks,
+	 * CUBIC remains in the Reno-friendly region to achieve at least the same
+	 * throughput as Reno.
+	 *
+	 * When receiving a new ACK in congestion avoidance (where cwnd could be
+	 * greater than or less than Wmax), CUBIC checks whether Wcubic(t) is less
+	 * than West. If so, CUBIC is in the Reno-friendly region and cwnd SHOULD
+	 * be set to West at each reception of a new ACK.
+	 *
+	 * West is set equal to cwnd_epoch at the start of the congestion avoidance
+	 * stage. After that, on every new ACK, West is updated using Figure 4.
+	 * Note that this equation uses segments_acked and cwnd is measured in
+	 * segments. An implementation that measures cwnd in bytes should adjust the
+	 * equation accordingly using the number of acknowledged bytes and the SMSS.
+	 * Also note that this equation works for connections with enabled or
+	 * disabled delayed ACKs [RFC5681], as segments_acked will be different based
+	 * on the segments actually acknowledged by a new ACK.
+	 *
+	 * Figure 4 : West = West + alpha_cubic * (segments_acked / cwnd)
+	 *
+	 * Once West has grown to reach the cwnd at the time of most recently
+	 * setting ssthresh -- that is, West >= cwndprior -- the sender SHOULD set
+	 * alpha_cubic to 1 to ensure that it can achieve the same congestion window
+	 * increment rate as Reno, which uses AIMD(1, 0.5).
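+	 *
+	 * Worked example with assumed numbers: with mtu = 1252 bytes,
+	 * cwnd = 25040 (20 segments) and acked = 2504 bytes (2 segments),
+	 * the byte-based form of Figure 4 applied above grows W_est by
+	 * 1252 * 2504 / 25040 = 125 bytes on this ACK, the division
+	 * remainder being carried over in remaining_W_est_inc so that no
+	 * acknowledged byte is lost across ACKs.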
+	 */
+	if (c->W_est > path->cwnd) {
+		uint32_t W_est_inc = path->mtu * (c->W_est - path->cwnd) / path->cwnd;
+		if (W_est_inc > inc)
+			inc = W_est_inc;
+	}
+
+	path->cwnd += inc;
+	path->cwnd = QUIC_MIN(path->max_cwnd, path->cwnd);
+	path->mcwnd = QUIC_MAX(path->cwnd, path->mcwnd);
+ leave:
+	TRACE_LEAVE(QUIC_EV_CONN_CC, cc->qc);
+}
+
+static void quic_cc_cubic_slow_start(struct quic_cc *cc)
+{
+	TRACE_ENTER(QUIC_EV_CONN_CC, cc->qc);
+	quic_cc_cubic_reset(cc);
+	TRACE_LEAVE(QUIC_EV_CONN_CC, cc->qc);
+}
+
+static void quic_enter_recovery(struct quic_cc *cc)
+{
+	struct quic_cc_path *path = container_of(cc, struct quic_cc_path, cc);
+	struct cubic *c = quic_cc_priv(cc);
+	/* Current cwnd as number of packets */
+
+	TRACE_ENTER(QUIC_EV_CONN_CC, cc->qc);
+	c->t_epoch = 0;
+	c->recovery_start_time = now_ms;
+
+	/* RFC 9438 4.7. Fast Convergence
+	 *
+	 * To improve convergence speed, CUBIC uses a heuristic. When a new flow
+	 * joins the network, existing flows need to give up some of their bandwidth
+	 * to allow the new flow some room for growth if the existing flows have
+	 * been using all the network bandwidth. To speed up this bandwidth release
+	 * by existing flows, the following fast convergence mechanism SHOULD be
+	 * implemented. With fast convergence, when a congestion event occurs, Wmax
+	 * is updated as follows, before the window reduction described in Section
+	 * 4.6.
+	 *
+	 *   if cwnd < Wmax and fast convergence enabled, further reduce Wmax:
+	 *       Wmax = cwnd * (1 + beta_cubic) / 2
+	 *   otherwise, remember cwnd before reduction:
+	 *       Wmax = cwnd
+	 */
+	if (path->cwnd < c->last_w_max) {
+		/* (1 + beta_cubic) * path->cwnd / 2 */
+		c->last_w_max = (path->cwnd * (CUBIC_ONE_SCALED + CUBIC_BETA_SCALED) / 2) >> CUBIC_SCALE_FACTOR_SHIFT;
+	}
+	else {
+		c->last_w_max = path->cwnd;
+	}
+
+	c->ssthresh = (CUBIC_BETA_SCALED * path->cwnd) >> CUBIC_SCALE_FACTOR_SHIFT;
+	path->cwnd = QUIC_MAX(c->ssthresh, (uint32_t)path->min_cwnd);
+	c->state = QUIC_CC_ST_RP;
+	TRACE_LEAVE(QUIC_EV_CONN_CC, cc->qc, NULL, cc);
+}
+
+/* Congestion slow-start callback. */
+static void quic_cc_cubic_ss_cb(struct quic_cc *cc, struct quic_cc_event *ev)
+{
+	struct quic_cc_path *path = container_of(cc, struct quic_cc_path, cc);
+	struct cubic *c = quic_cc_priv(cc);
+
+	TRACE_ENTER(QUIC_EV_CONN_CC, cc->qc);
+	TRACE_PROTO("CC cubic", QUIC_EV_CONN_CC, cc->qc, ev);
+	switch (ev->type) {
+	case QUIC_CC_EVT_ACK:
+		if (path->cwnd < QUIC_CC_INFINITE_SSTHESH - ev->ack.acked) {
+			path->cwnd += ev->ack.acked;
+			path->cwnd = QUIC_MIN(path->max_cwnd, path->cwnd);
+		}
+		/* Exit to congestion avoidance if slow start threshold is reached. */
+		if (path->cwnd >= c->ssthresh)
+			c->state = QUIC_CC_ST_CA;
+		path->mcwnd = QUIC_MAX(path->cwnd, path->mcwnd);
+		break;
+
+	case QUIC_CC_EVT_LOSS:
+		quic_enter_recovery(cc);
+		break;
+
+	case QUIC_CC_EVT_ECN_CE:
+		/* TODO */
+		break;
+	}
+
+ out:
+	TRACE_PROTO("CC cubic", QUIC_EV_CONN_CC, cc->qc, NULL, cc);
+	TRACE_LEAVE(QUIC_EV_CONN_CC, cc->qc);
+}
+
+/* Congestion avoidance callback.
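+ * On each newly acknowledged byte count it lets quic_cubic_update() pull
+ * cwnd towards the cubic target. (Illustrative numbers: with mtu = 1252,
+ * cwnd = 50080 and a concave-region target of 52584, the rule
+ * "(target - cwnd) / cwnd" adds 1252 * (52584 - 50080) / 50080 = 62
+ * bytes for this ACK, the remainder accumulating in remaining_inc.)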
*/ +static void quic_cc_cubic_ca_cb(struct quic_cc *cc, struct quic_cc_event *ev) +{ + TRACE_ENTER(QUIC_EV_CONN_CC, cc->qc); + TRACE_PROTO("CC cubic", QUIC_EV_CONN_CC, cc->qc, ev); + switch (ev->type) { + case QUIC_CC_EVT_ACK: + quic_cubic_update(cc, ev->ack.acked); + break; + case QUIC_CC_EVT_LOSS: + quic_enter_recovery(cc); + break; + case QUIC_CC_EVT_ECN_CE: + /* TODO */ + break; + } + + out: + TRACE_PROTO("CC cubic", QUIC_EV_CONN_CC, cc->qc, NULL, cc); + TRACE_LEAVE(QUIC_EV_CONN_CC, cc->qc); +} + +/* Recovery period callback */ +static void quic_cc_cubic_rp_cb(struct quic_cc *cc, struct quic_cc_event *ev) +{ + struct cubic *c = quic_cc_priv(cc); + + TRACE_ENTER(QUIC_EV_CONN_CC, cc->qc, ev); + TRACE_PROTO("CC cubic", QUIC_EV_CONN_CC, cc->qc, ev, cc); + + switch (ev->type) { + case QUIC_CC_EVT_ACK: + /* RFC 9002 7.3.2. Recovery + * A recovery period ends and the sender enters congestion avoidance when a + * packet sent during the recovery period is acknowledged. + */ + if (tick_is_le(ev->ack.time_sent, c->recovery_start_time)) { + TRACE_PROTO("CC cubic (still in recov. period)", QUIC_EV_CONN_CC, cc->qc); + goto leave; + } + + c->state = QUIC_CC_ST_CA; + c->recovery_start_time = TICK_ETERNITY; + break; + case QUIC_CC_EVT_LOSS: + break; + case QUIC_CC_EVT_ECN_CE: + /* TODO */ + break; + } + + leave: + TRACE_PROTO("CC cubic", QUIC_EV_CONN_CC, cc->qc, NULL, cc); + TRACE_LEAVE(QUIC_EV_CONN_CC, cc->qc, NULL, cc); +} + +static void (*quic_cc_cubic_state_cbs[])(struct quic_cc *cc, + struct quic_cc_event *ev) = { + [QUIC_CC_ST_SS] = quic_cc_cubic_ss_cb, + [QUIC_CC_ST_CA] = quic_cc_cubic_ca_cb, + [QUIC_CC_ST_RP] = quic_cc_cubic_rp_cb, +}; + +static void quic_cc_cubic_event(struct quic_cc *cc, struct quic_cc_event *ev) +{ + struct cubic *c = quic_cc_priv(cc); + + return quic_cc_cubic_state_cbs[c->state](cc, ev); +} + +static void quic_cc_cubic_state_trace(struct buffer *buf, const struct quic_cc *cc) +{ + struct quic_cc_path *path; + struct cubic *c = quic_cc_priv(cc); + + path = container_of(cc, struct quic_cc_path, cc); + chunk_appendf(buf, " state=%s cwnd=%llu mcwnd=%llu ssthresh=%d rpst=%dms", + quic_cc_state_str(c->state), + (unsigned long long)path->cwnd, + (unsigned long long)path->mcwnd, + (int)c->ssthresh, + !tick_isset(c->recovery_start_time) ? -1 : + TICKS_TO_MS(tick_remain(c->recovery_start_time, now_ms))); +} + +struct quic_cc_algo quic_cc_algo_cubic = { + .type = QUIC_CC_ALGO_TP_CUBIC, + .init = quic_cc_cubic_init, + .event = quic_cc_cubic_event, + .slow_start = quic_cc_cubic_slow_start, + .state_trace = quic_cc_cubic_state_trace, +}; diff --git a/src/quic_cc_newreno.c b/src/quic_cc_newreno.c new file mode 100644 index 0000000..405b0ba --- /dev/null +++ b/src/quic_cc_newreno.c @@ -0,0 +1,220 @@ +/* + * NewReno congestion control algorithm. + * + * This file contains definitions for QUIC congestion control. + * + * Copyright 2019 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <haproxy/api-t.h> +#include <haproxy/buf.h> +#include <haproxy/chunk.h> +#include <haproxy/quic_cc.h> +#include <haproxy/quic_conn-t.h> +#include <haproxy/quic_trace.h> +#include <haproxy/trace.h> + +/* NewReno state */ +struct nr { + uint32_t state; + uint32_t ssthresh; + uint32_t recovery_start_time; + uint32_t remain_acked; +}; + +static int quic_cc_nr_init(struct quic_cc *cc) +{ + struct nr *nr = quic_cc_priv(cc); + + nr->state = QUIC_CC_ST_SS; + nr->ssthresh = QUIC_CC_INFINITE_SSTHESH; + nr->recovery_start_time = 0; + nr->remain_acked = 0; + + return 1; +} + +/* Re-enter slow start state. */ +static void quic_cc_nr_slow_start(struct quic_cc *cc) +{ + struct quic_cc_path *path; + struct nr *nr = quic_cc_priv(cc); + + path = container_of(cc, struct quic_cc_path, cc); + path->cwnd = path->min_cwnd; + /* Re-entering slow start state. */ + nr->state = QUIC_CC_ST_SS; + /* Recovery start time reset */ + nr->recovery_start_time = 0; +} + +/* Enter a recovery period. */ +static void quic_cc_nr_enter_recovery(struct quic_cc *cc) +{ + struct quic_cc_path *path; + struct nr *nr = quic_cc_priv(cc); + + path = container_of(cc, struct quic_cc_path, cc); + nr->recovery_start_time = now_ms; + nr->ssthresh = path->cwnd >> 1; + path->cwnd = QUIC_MAX(nr->ssthresh, (uint32_t)path->min_cwnd); + nr->state = QUIC_CC_ST_RP; +} + +/* Slow start callback. */ +static void quic_cc_nr_ss_cb(struct quic_cc *cc, struct quic_cc_event *ev) +{ + struct quic_cc_path *path; + struct nr *nr = quic_cc_priv(cc); + + TRACE_ENTER(QUIC_EV_CONN_CC, cc->qc); + TRACE_PROTO("CC reno", QUIC_EV_CONN_CC, cc->qc, ev); + path = container_of(cc, struct quic_cc_path, cc); + switch (ev->type) { + case QUIC_CC_EVT_ACK: + path->cwnd += ev->ack.acked; + path->cwnd = QUIC_MIN(path->max_cwnd, path->cwnd); + path->mcwnd = QUIC_MAX(path->cwnd, path->mcwnd); + /* Exit to congestion avoidance if slow start threshold is reached. */ + if (path->cwnd > nr->ssthresh) + nr->state = QUIC_CC_ST_CA; + break; + + case QUIC_CC_EVT_LOSS: + quic_cc_nr_enter_recovery(cc); + break; + + case QUIC_CC_EVT_ECN_CE: + /* XXX TO DO XXX */ + break; + } + TRACE_PROTO("CC reno", QUIC_EV_CONN_CC, cc->qc, NULL, cc); + TRACE_LEAVE(QUIC_EV_CONN_CC, cc->qc); +} + +/* Congestion avoidance callback. */ +static void quic_cc_nr_ca_cb(struct quic_cc *cc, struct quic_cc_event *ev) +{ + struct quic_cc_path *path; + struct nr *nr = quic_cc_priv(cc); + + TRACE_ENTER(QUIC_EV_CONN_CC, cc->qc); + TRACE_PROTO("CC reno", QUIC_EV_CONN_CC, cc->qc, ev); + path = container_of(cc, struct quic_cc_path, cc); + switch (ev->type) { + case QUIC_CC_EVT_ACK: + { + uint64_t acked; + + /* Increase the congestion window by (acked * mtu / cwnd) bytes, that is + * by roughly one MTU per fully acknowledged window (so once per RTT), + * the integer division remainder being carried over in remain_acked. + */ + acked = ev->ack.acked * path->mtu + nr->remain_acked; + nr->remain_acked = acked % path->cwnd; + path->cwnd += acked / path->cwnd; + path->cwnd = QUIC_MIN(path->max_cwnd, path->cwnd); + path->mcwnd = QUIC_MAX(path->cwnd, path->mcwnd); + break; + } + + case QUIC_CC_EVT_LOSS: + quic_cc_nr_enter_recovery(cc); + break; + + case QUIC_CC_EVT_ECN_CE: + /* XXX TO DO XXX */ + break; + } + + out: + TRACE_PROTO("CC reno", QUIC_EV_CONN_CC, cc->qc, NULL, cc); + TRACE_LEAVE(QUIC_EV_CONN_CC, cc->qc); +} + +/* Recovery period callback. 
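A recovery period (RFC 9002 7.3.2) begins when a loss is detected and ends + * once a packet sent after it began gets acknowledged, so that the window + * is reduced at most once per round trip however many losses occur in it. 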
*/ +static void quic_cc_nr_rp_cb(struct quic_cc *cc, struct quic_cc_event *ev) +{ + struct quic_cc_path *path; + struct nr *nr = quic_cc_priv(cc); + + TRACE_ENTER(QUIC_EV_CONN_CC, cc->qc); + TRACE_PROTO("CC reno", QUIC_EV_CONN_CC, cc->qc, ev); + path = container_of(cc, struct quic_cc_path, cc); + switch (ev->type) { + case QUIC_CC_EVT_ACK: + /* RFC 9002 7.3.2. Recovery + * A recovery period ends and the sender enters congestion avoidance when a + * packet sent during the recovery period is acknowledged. + */ + if (tick_is_le(ev->ack.time_sent, nr->recovery_start_time)) { + TRACE_PROTO("CC reno (still in recovery period)", QUIC_EV_CONN_CC, cc->qc, ev); + goto leave; + } + + nr->state = QUIC_CC_ST_CA; + nr->recovery_start_time = TICK_ETERNITY; + path->cwnd = nr->ssthresh; + break; + case QUIC_CC_EVT_LOSS: + /* Do nothing */ + break; + case QUIC_CC_EVT_ECN_CE: + /* XXX TO DO XXX */ + break; + } + + leave: + TRACE_PROTO("CC reno", QUIC_EV_CONN_CC, cc->qc, ev); + TRACE_LEAVE(QUIC_EV_CONN_CC, cc->qc, ev); +} + +static void quic_cc_nr_state_trace(struct buffer *buf, const struct quic_cc *cc) +{ + struct quic_cc_path *path; + struct nr *nr = quic_cc_priv(cc); + + path = container_of(cc, struct quic_cc_path, cc); + chunk_appendf(buf, " state=%s cwnd=%llu mcwnd=%llu ssthresh=%ld rpst=%dms pktloss=%llu", + quic_cc_state_str(nr->state), + (unsigned long long)path->cwnd, + (unsigned long long)path->mcwnd, + (long)nr->ssthresh, + !tick_isset(nr->recovery_start_time) ? -1 : + TICKS_TO_MS(tick_remain(nr->recovery_start_time, now_ms)), + (unsigned long long)path->loss.nb_lost_pkt); +} + +static void (*quic_cc_nr_state_cbs[])(struct quic_cc *cc, + struct quic_cc_event *ev) = { + [QUIC_CC_ST_SS] = quic_cc_nr_ss_cb, + [QUIC_CC_ST_CA] = quic_cc_nr_ca_cb, + [QUIC_CC_ST_RP] = quic_cc_nr_rp_cb, +}; + +static void quic_cc_nr_event(struct quic_cc *cc, struct quic_cc_event *ev) +{ + struct nr *nr = quic_cc_priv(cc); + + return quic_cc_nr_state_cbs[nr->state](cc, ev); +} + +struct quic_cc_algo quic_cc_algo_nr = { + .type = QUIC_CC_ALGO_TP_NEWRENO, + .init = quic_cc_nr_init, + .event = quic_cc_nr_event, + .slow_start = quic_cc_nr_slow_start, + .state_trace = quic_cc_nr_state_trace, +}; + diff --git a/src/quic_cc_nocc.c b/src/quic_cc_nocc.c new file mode 100644 index 0000000..6e5cff9 --- /dev/null +++ b/src/quic_cc_nocc.c @@ -0,0 +1,76 @@ +/* + * Fake congestion control algorithm which does nothing except initializing + * the congestion control window to a fixed value. + * + */ + +#include <haproxy/api-t.h> +#include <haproxy/quic_conn-t.h> +#include <haproxy/quic_trace.h> +#include <haproxy/trace.h> + +static int quic_cc_nocc_init(struct quic_cc *cc) +{ + struct quic_cc_path *path; + + path = container_of(cc, struct quic_cc_path, cc); + path->cwnd = path->max_cwnd; + return 1; +} + +static void quic_cc_nocc_slow_start(struct quic_cc *cc) +{ +} + +/* Slow start callback. */ +static void quic_cc_nocc_ss_cb(struct quic_cc *cc, struct quic_cc_event *ev) +{ + TRACE_ENTER(QUIC_EV_CONN_CC, cc->qc); + TRACE_PROTO("CC nocc", QUIC_EV_CONN_CC, cc->qc, ev, cc); + TRACE_LEAVE(QUIC_EV_CONN_CC, cc->qc); +} + +/* Congestion avoidance callback. */ +static void quic_cc_nocc_ca_cb(struct quic_cc *cc, struct quic_cc_event *ev) +{ + TRACE_ENTER(QUIC_EV_CONN_CC, cc->qc); + TRACE_PROTO("CC nocc", QUIC_EV_CONN_CC, cc->qc, ev, cc); + TRACE_LEAVE(QUIC_EV_CONN_CC, cc->qc); +} + +/* Recovery period callback. 
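Like the two callbacks above, deliberately a no-op: this fake algorithm + * keeps the window pinned at max_cwnd, so loss events are simply traced + * and otherwise ignored. 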
*/ +static void quic_cc_nocc_rp_cb(struct quic_cc *cc, struct quic_cc_event *ev) +{ + TRACE_ENTER(QUIC_EV_CONN_CC, cc->qc); + TRACE_PROTO("CC nocc", QUIC_EV_CONN_CC, cc->qc, ev, cc); + TRACE_LEAVE(QUIC_EV_CONN_CC, cc->qc); +} + +static void quic_cc_nocc_state_trace(struct buffer *buf, const struct quic_cc *cc) +{ + struct quic_cc_path *path; + + path = container_of(cc, struct quic_cc_path, cc); + chunk_appendf(buf, " cwnd=%llu", (unsigned long long)path->cwnd); +} + +static void (*quic_cc_nocc_state_cbs[])(struct quic_cc *cc, + struct quic_cc_event *ev) = { + [QUIC_CC_ST_SS] = quic_cc_nocc_ss_cb, + [QUIC_CC_ST_CA] = quic_cc_nocc_ca_cb, + [QUIC_CC_ST_RP] = quic_cc_nocc_rp_cb, +}; + +static void quic_cc_nocc_event(struct quic_cc *cc, struct quic_cc_event *ev) +{ + return quic_cc_nocc_state_cbs[QUIC_CC_ST_SS](cc, ev); +} + +struct quic_cc_algo quic_cc_algo_nocc = { + .type = QUIC_CC_ALGO_TP_NOCC, + .init = quic_cc_nocc_init, + .event = quic_cc_nocc_event, + .slow_start = quic_cc_nocc_slow_start, + .state_trace = quic_cc_nocc_state_trace, +}; + diff --git a/src/quic_cid.c b/src/quic_cid.c new file mode 100644 index 0000000..19c1f07 --- /dev/null +++ b/src/quic_cid.c @@ -0,0 +1,286 @@ +#include <import/eb64tree.h> +#include <import/ebmbtree.h> + +#include <haproxy/pool.h> +#include <haproxy/quic_cid.h> +#include <haproxy/quic_conn.h> +#include <haproxy/quic_rx-t.h> +#include <haproxy/quic_trace.h> +#include <haproxy/trace.h> +#include <haproxy/xxhash.h> + +/* Initialize the stateless reset token attached to <conn_id> connection ID. + * Returns 1 if succeeded, 0 if not. + */ +static int quic_stateless_reset_token_init(struct quic_connection_id *conn_id) +{ + /* Output secret */ + unsigned char *token = conn_id->stateless_reset_token; + size_t tokenlen = sizeof conn_id->stateless_reset_token; + /* Salt */ + const unsigned char *cid = conn_id->cid.data; + size_t cidlen = conn_id->cid.len; + + return quic_stateless_reset_token_cpy(token, tokenlen, cid, cidlen); +} + +/* Generate a CID directly derived from <orig> CID and <addr> address. + * + * Returns the derived CID. + */ +struct quic_cid quic_derive_cid(const struct quic_cid *orig, + const struct sockaddr_storage *addr) +{ + struct quic_cid cid; + const struct sockaddr_in *in; + const struct sockaddr_in6 *in6; + char *pos = trash.area; + size_t idx = 0; + uint64_t hash; + int i; + + /* Prepare buffer for hash using original CID first. */ + memcpy(pos, orig->data, orig->len); + idx += orig->len; + + /* Concatenate client address. */ + switch (addr->ss_family) { + case AF_INET: + in = (struct sockaddr_in *)addr; + + memcpy(&pos[idx], &in->sin_addr, sizeof(in->sin_addr)); + idx += sizeof(in->sin_addr); + memcpy(&pos[idx], &in->sin_port, sizeof(in->sin_port)); + idx += sizeof(in->sin_port); + break; + + case AF_INET6: + in6 = (struct sockaddr_in6 *)addr; + + memcpy(&pos[idx], &in6->sin6_addr, sizeof(in6->sin6_addr)); + idx += sizeof(in6->sin6_addr); + memcpy(&pos[idx], &in6->sin6_port, sizeof(in6->sin6_port)); + idx += sizeof(in6->sin6_port); + break; + + default: + /* TODO to implement */ + ABORT_NOW(); + } + + /* Avoid similar values between multiple haproxy process. */ + memcpy(&pos[idx], boot_seed, sizeof(boot_seed)); + idx += sizeof(boot_seed); + + /* Hash the final buffer content. */ + hash = XXH64(pos, idx, 0); + + for (i = 0; i < sizeof(hash); ++i) + cid.data[i] = hash >> ((sizeof(hash) * 7) - (8 * i)); + cid.len = sizeof(hash); + + return cid; +} + +/* Allocate a new CID and attach it to <root> ebtree. 
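The derivation done by quic_derive_cid() above is deterministic: a given + * original CID and client address always hash (XXH64, seeded with boot_seed) + * to the same 8-byte CID, so any thread can map a reused ODCID back to its + * connection. 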
+ * + * If <orig> and <addr> params are non null, the new CID value is directly + * derived from them. Else a random value is generated. The CID is then marked + * with the current thread ID. + * + * Returns the new CID if succeeded, NULL if not. + */ +struct quic_connection_id *new_quic_cid(struct eb_root *root, + struct quic_conn *qc, + const struct quic_cid *orig, + const struct sockaddr_storage *addr) +{ + struct quic_connection_id *conn_id; + + TRACE_ENTER(QUIC_EV_CONN_TXPKT, qc); + + /* Caller must set either none or both values. */ + BUG_ON(!!orig != !!addr); + + conn_id = pool_alloc(pool_head_quic_connection_id); + if (!conn_id) { + TRACE_ERROR("cid allocation failed", QUIC_EV_CONN_TXPKT, qc); + goto err; + } + + conn_id->cid.len = QUIC_HAP_CID_LEN; + + if (!orig) { + if (quic_newcid_from_hash64) + quic_newcid_from_hash64(conn_id->cid.data, conn_id->cid.len, qc->hash64, + global.cluster_secret, sizeof(global.cluster_secret)); + else if (RAND_bytes(conn_id->cid.data, conn_id->cid.len) != 1) { + /* TODO: RAND_bytes() should be replaced */ + TRACE_ERROR("RAND_bytes() failed", QUIC_EV_CONN_TXPKT, qc); + goto err; + } + } + else { + /* Derive the new CID value from original CID. */ + conn_id->cid = quic_derive_cid(orig, addr); + } + + if (quic_stateless_reset_token_init(conn_id) != 1) { + TRACE_ERROR("quic_stateless_reset_token_init() failed", QUIC_EV_CONN_TXPKT, qc); + goto err; + } + + conn_id->qc = qc; + HA_ATOMIC_STORE(&conn_id->tid, tid); + + conn_id->seq_num.key = qc ? qc->next_cid_seq_num++ : 0; + conn_id->retire_prior_to = 0; + /* insert the allocated CID in the quic_conn tree */ + if (root) + eb64_insert(root, &conn_id->seq_num); + + TRACE_LEAVE(QUIC_EV_CONN_TXPKT, qc); + return conn_id; + + err: + pool_free(pool_head_quic_connection_id, conn_id); + TRACE_LEAVE(QUIC_EV_CONN_TXPKT, qc); + return NULL; +} + +/* Retrieve the thread ID associated with the QUIC connection ID <cid> of length + * <cid_len>. The CID may not be found in the CID tree because it is an ODCID. In + * this case, it will be derived using the client address <cli_addr> as hash + * parameter. However, this is done only if <pos> points to an INITIAL or 0RTT + * packet of length <len>. + * + * Returns the thread ID or a negative error code. + */ +int quic_get_cid_tid(const unsigned char *cid, size_t cid_len, + const struct sockaddr_storage *cli_addr, + unsigned char *pos, size_t len) +{ + struct quic_cid_tree *tree; + struct quic_connection_id *conn_id; + struct ebmb_node *node; + + tree = &quic_cid_trees[_quic_cid_tree_idx(cid)]; + HA_RWLOCK_RDLOCK(QC_CID_LOCK, &tree->lock); + node = ebmb_lookup(&tree->root, cid, cid_len); + HA_RWLOCK_RDUNLOCK(QC_CID_LOCK, &tree->lock); + + if (!node) { + struct quic_cid orig, derive_cid; + struct quic_rx_packet pkt; + + if (!qc_parse_hd_form(&pkt, &pos, pos + len)) + goto not_found; + + if (pkt.type != QUIC_PACKET_TYPE_INITIAL && + pkt.type != QUIC_PACKET_TYPE_0RTT) { + goto not_found; + } + + memcpy(orig.data, cid, cid_len); + orig.len = cid_len; + derive_cid = quic_derive_cid(&orig, cli_addr); + + tree = &quic_cid_trees[quic_cid_tree_idx(&derive_cid)]; + HA_RWLOCK_RDLOCK(QC_CID_LOCK, &tree->lock); + node = ebmb_lookup(&tree->root, derive_cid.data, derive_cid.len); + HA_RWLOCK_RDUNLOCK(QC_CID_LOCK, &tree->lock); + } + + if (!node) + goto not_found; + + conn_id = ebmb_entry(node, struct quic_connection_id, node); + return HA_ATOMIC_LOAD(&conn_id->tid); + + not_found: + return -1; +} + +/* Retrieve a quic_conn instance from the <pkt> DCID field. 
If the packet is an + * INITIAL or 0RTT type, we may have to use client address <saddr> if an ODCID + * is used. + * + * Returns the instance or NULL if not found. + */ +struct quic_conn *retrieve_qc_conn_from_cid(struct quic_rx_packet *pkt, + struct sockaddr_storage *saddr, + int *new_tid) +{ + struct quic_conn *qc = NULL; + struct ebmb_node *node; + struct quic_connection_id *conn_id; + struct quic_cid_tree *tree; + uint conn_id_tid; + + TRACE_ENTER(QUIC_EV_CONN_RXPKT); + *new_tid = -1; + + /* First look into DCID tree. */ + tree = &quic_cid_trees[_quic_cid_tree_idx(pkt->dcid.data)]; + HA_RWLOCK_RDLOCK(QC_CID_LOCK, &tree->lock); + node = ebmb_lookup(&tree->root, pkt->dcid.data, pkt->dcid.len); + + /* If not found on an Initial/0-RTT packet, it could be because an + * ODCID is reused by the client. Calculate the derived CID value to + * retrieve it from the DCID tree. + */ + if (!node && (pkt->type == QUIC_PACKET_TYPE_INITIAL || + pkt->type == QUIC_PACKET_TYPE_0RTT)) { + const struct quic_cid derive_cid = quic_derive_cid(&pkt->dcid, saddr); + + HA_RWLOCK_RDUNLOCK(QC_CID_LOCK, &tree->lock); + + tree = &quic_cid_trees[quic_cid_tree_idx(&derive_cid)]; + HA_RWLOCK_RDLOCK(QC_CID_LOCK, &tree->lock); + node = ebmb_lookup(&tree->root, derive_cid.data, derive_cid.len); + } + + if (!node) + goto end; + + conn_id = ebmb_entry(node, struct quic_connection_id, node); + conn_id_tid = HA_ATOMIC_LOAD(&conn_id->tid); + if (conn_id_tid != tid) { + *new_tid = conn_id_tid; + goto end; + } + qc = conn_id->qc; + + end: + HA_RWLOCK_RDUNLOCK(QC_CID_LOCK, &tree->lock); + TRACE_LEAVE(QUIC_EV_CONN_RXPKT, qc); + return qc; +} + +/* Build a NEW_CONNECTION_ID frame for <conn_id> CID of <qc> connection. + * + * Returns 1 on success else 0. + */ +int qc_build_new_connection_id_frm(struct quic_conn *qc, + struct quic_connection_id *conn_id) +{ + int ret = 0; + struct quic_frame *frm; + struct quic_enc_level *qel; + + TRACE_ENTER(QUIC_EV_CONN_PRSHPKT, qc); + + qel = qc->ael; + frm = qc_frm_alloc(QUIC_FT_NEW_CONNECTION_ID); + if (!frm) { + TRACE_ERROR("frame allocation error", QUIC_EV_CONN_IO_CB, qc); + goto leave; + } + + quic_connection_id_to_frm_cpy(frm, conn_id); + LIST_APPEND(&qel->pktns->tx.frms, &frm->list); + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_PRSHPKT, qc); + return ret; +} diff --git a/src/quic_cli.c b/src/quic_cli.c new file mode 100644 index 0000000..56301fa --- /dev/null +++ b/src/quic_cli.c @@ -0,0 +1,413 @@ +#include <import/eb64tree.h> + +#include <haproxy/applet-t.h> +#include <haproxy/cli.h> +#include <haproxy/list.h> +#include <haproxy/tools.h> +#include <haproxy/quic_conn-t.h> +#include <haproxy/quic_tp.h> + +/* incremented by each "show quic". 
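Each dump records the epoch it started with and, on every thread's list, + * stops at the first connection created after the dump began, so that a + * dump always terminates even while new connections keep being accepted + * (see cli_io_handler_dump_quic() below). 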
*/ +unsigned int qc_epoch = 0; + +enum quic_dump_format { + QUIC_DUMP_FMT_ONELINE, + QUIC_DUMP_FMT_FULL, +}; + +/* appctx context used by "show quic" command */ +struct show_quic_ctx { + unsigned int epoch; + struct bref bref; /* back-reference to the quic-conn being dumped */ + unsigned int thr; + int flags; + enum quic_dump_format format; +}; + +#define QC_CLI_FL_SHOW_ALL 0x1 /* show closing/draining connections */ + +static int cli_parse_show_quic(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_quic_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + int argc = 2; + + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return 1; + + ctx->epoch = _HA_ATOMIC_FETCH_ADD(&qc_epoch, 1); + ctx->thr = 0; + ctx->flags = 0; + ctx->format = QUIC_DUMP_FMT_ONELINE; + + if (strcmp(args[argc], "oneline") == 0) { + /* format already used as default value */ + ++argc; + } + else if (strcmp(args[argc], "full") == 0) { + ctx->format = QUIC_DUMP_FMT_FULL; + ++argc; + } + + while (*args[argc]) { + if (strcmp(args[argc], "all") == 0) + ctx->flags |= QC_CLI_FL_SHOW_ALL; + + ++argc; + } + + LIST_INIT(&ctx->bref.users); + + return 0; +} + +/* Dump for "show quic" with "oneline" format. */ +static void dump_quic_oneline(struct show_quic_ctx *ctx, struct quic_conn *qc) +{ + char bufaddr[INET6_ADDRSTRLEN], bufport[6]; + int ret; + unsigned char cid_len; + + ret = chunk_appendf(&trash, "%p[%02u]/%-.12s ", qc, ctx->thr, + qc->li->bind_conf->frontend->id); + chunk_appendf(&trash, "%*s", 36 - ret, " "); /* align output */ + + /* State */ + if (qc->flags & QUIC_FL_CONN_CLOSING) + chunk_appendf(&trash, "CLOSE "); + else if (qc->flags & QUIC_FL_CONN_DRAINING) + chunk_appendf(&trash, "DRAIN "); + else if (qc->state < QUIC_HS_ST_COMPLETE) + chunk_appendf(&trash, "HDSHK "); + else + chunk_appendf(&trash, "ESTAB "); + + /* Bytes in flight / Lost packets */ + chunk_appendf(&trash, "%9llu %6llu %6llu ", + (ullong)qc->path->in_flight, + (ullong)qc->path->ifae_pkts, + (ullong)qc->path->loss.nb_lost_pkt); + + /* Socket */ + if (qc->local_addr.ss_family == AF_INET || + qc->local_addr.ss_family == AF_INET6) { + addr_to_str(&qc->local_addr, bufaddr, sizeof(bufaddr)); + port_to_str(&qc->local_addr, bufport, sizeof(bufport)); + chunk_appendf(&trash, "%15s:%-5s ", bufaddr, bufport); + + addr_to_str(&qc->peer_addr, bufaddr, sizeof(bufaddr)); + port_to_str(&qc->peer_addr, bufport, sizeof(bufport)); + chunk_appendf(&trash, "%15s:%-5s ", bufaddr, bufport); + + } + + /* CIDs */ + for (cid_len = 0; cid_len < qc->scid.len; ++cid_len) + chunk_appendf(&trash, "%02x", qc->scid.data[cid_len]); + + chunk_appendf(&trash, " "); + for (cid_len = 0; cid_len < qc->dcid.len; ++cid_len) + chunk_appendf(&trash, "%02x", qc->dcid.data[cid_len]); + + chunk_appendf(&trash, "\n"); +} + +/* Dump for "show quic" with "full" format. 
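The dump is requested on the CLI; for example, assuming the stats socket + * is bound to /var/run/haproxy.sock (the path depends on the local + * configuration): + * + * $ echo "show quic full all" | socat stdio /var/run/haproxy.sock 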
*/ +static void dump_quic_full(struct show_quic_ctx *ctx, struct quic_conn *qc) +{ + struct quic_pktns *pktns; + struct eb64_node *node; + struct qc_stream_desc *stream; + char bufaddr[INET6_ADDRSTRLEN], bufport[6]; + int expire, i, addnl; + unsigned char cid_len; + + addnl = 0; + /* CIDs */ + chunk_appendf(&trash, "* %p[%02u]: scid=", qc, ctx->thr); + for (cid_len = 0; cid_len < qc->scid.len; ++cid_len) + chunk_appendf(&trash, "%02x", qc->scid.data[cid_len]); + while (cid_len++ < 20) + chunk_appendf(&trash, ".."); + + chunk_appendf(&trash, " dcid="); + for (cid_len = 0; cid_len < qc->dcid.len; ++cid_len) + chunk_appendf(&trash, "%02x", qc->dcid.data[cid_len]); + while (cid_len++ < 20) + chunk_appendf(&trash, ".."); + + chunk_appendf(&trash, "\n"); + + chunk_appendf(&trash, " loc. TPs:"); + quic_transport_params_dump(&trash, qc, &qc->rx.params); + chunk_appendf(&trash, "\n"); + chunk_appendf(&trash, " rem. TPs:"); + quic_transport_params_dump(&trash, qc, &qc->tx.params); + chunk_appendf(&trash, "\n"); + + /* Connection state */ + if (qc->flags & QUIC_FL_CONN_CLOSING) + chunk_appendf(&trash, " st=closing "); + else if (qc->flags & QUIC_FL_CONN_DRAINING) + chunk_appendf(&trash, " st=draining "); + else if (qc->state < QUIC_HS_ST_CONFIRMED) + chunk_appendf(&trash, " st=handshake "); + else + chunk_appendf(&trash, " st=opened "); + + if (qc->mux_state == QC_MUX_NULL) + chunk_appendf(&trash, "mux=null "); + else if (qc->mux_state == QC_MUX_READY) + chunk_appendf(&trash, "mux=ready "); + else + chunk_appendf(&trash, "mux=released "); + + if (qc->idle_timer_task) { + expire = qc->idle_timer_task->expire; + chunk_appendf(&trash, "expire=%02ds ", + TICKS_TO_MS(tick_remain(now_ms, expire)) / 1000); + } + + chunk_appendf(&trash, "\n"); + + /* Socket */ + chunk_appendf(&trash, " fd=%d", qc->fd); + if (qc->local_addr.ss_family == AF_INET || + qc->local_addr.ss_family == AF_INET6) { + addr_to_str(&qc->local_addr, bufaddr, sizeof(bufaddr)); + port_to_str(&qc->local_addr, bufport, sizeof(bufport)); + chunk_appendf(&trash, " local_addr=%s:%s", bufaddr, bufport); + + addr_to_str(&qc->peer_addr, bufaddr, sizeof(bufaddr)); + port_to_str(&qc->peer_addr, bufport, sizeof(bufport)); + chunk_appendf(&trash, " foreign_addr=%s:%s", bufaddr, bufport); + } + + chunk_appendf(&trash, "\n"); + + /* Packet number spaces information */ + pktns = qc->ipktns; + if (pktns) { + chunk_appendf(&trash, " [initl] rx.ackrng=%-6zu tx.inflight=%-6zu\n", + pktns->rx.arngs.sz, pktns->tx.in_flight); + } + + pktns = qc->hpktns; + if (pktns) { + chunk_appendf(&trash, " [hndshk] rx.ackrng=%-6zu tx.inflight=%-6zu\n", + pktns->rx.arngs.sz, pktns->tx.in_flight); + } + + pktns = qc->apktns; + if (pktns) { + chunk_appendf(&trash, " [01rtt] rx.ackrng=%-6zu tx.inflight=%-6zu\n", + pktns->rx.arngs.sz, pktns->tx.in_flight); + } + + chunk_appendf(&trash, " srtt=%-4u rttvar=%-4u rttmin=%-4u ptoc=%-4u cwnd=%-6llu" + " mcwnd=%-6llu sentpkts=%-6llu lostpkts=%-6llu reorderedpkts=%-6llu\n", + qc->path->loss.srtt, qc->path->loss.rtt_var, + qc->path->loss.rtt_min, qc->path->loss.pto_count, (ullong)qc->path->cwnd, + (ullong)qc->path->mcwnd, (ullong)qc->cntrs.sent_pkt, (ullong)qc->path->loss.nb_lost_pkt, (ullong)qc->path->loss.nb_reordered_pkt); + + if (qc->cntrs.dropped_pkt) { + chunk_appendf(&trash, " droppkts=%-6llu", qc->cntrs.dropped_pkt); + addnl = 1; + } + if (qc->cntrs.dropped_pkt_bufoverrun) { + chunk_appendf(&trash, " dropbuff=%-6llu", qc->cntrs.dropped_pkt_bufoverrun); + addnl = 1; + } + if (qc->cntrs.dropped_parsing) { + chunk_appendf(&trash, " 
droppars=%-6llu", qc->cntrs.dropped_parsing); + addnl = 1; + } + if (qc->cntrs.socket_full) { + chunk_appendf(&trash, " sockfull=%-6llu", qc->cntrs.socket_full); + addnl = 1; + } + if (qc->cntrs.sendto_err) { + chunk_appendf(&trash, " sendtoerr=%-6llu", qc->cntrs.sendto_err); + addnl = 1; + } + if (qc->cntrs.sendto_err_unknown) { + chunk_appendf(&trash, " sendtounknerr=%-6llu", qc->cntrs.sendto_err); + addnl = 1; + } + if (qc->cntrs.conn_migration_done) { + chunk_appendf(&trash, " migrdone=%-6llu", qc->cntrs.conn_migration_done); + addnl = 1; + } + if (qc->cntrs.data_blocked) { + chunk_appendf(&trash, " datablocked=%-6llu", qc->cntrs.data_blocked); + addnl = 1; + } + if (qc->cntrs.stream_data_blocked) { + chunk_appendf(&trash, " sdatablocked=%-6llu", qc->cntrs.stream_data_blocked); + addnl = 1; + } + if (qc->cntrs.streams_blocked_bidi) { + chunk_appendf(&trash, " sblockebidi=%-6llu", qc->cntrs.streams_blocked_bidi); + addnl = 1; + } + if (qc->cntrs.streams_blocked_uni) { + chunk_appendf(&trash, " sblockeduni=%-6llu", qc->cntrs.streams_blocked_uni); + addnl = 1; + } + if (addnl) + chunk_appendf(&trash, "\n"); + + /* Streams */ + node = eb64_first(&qc->streams_by_id); + i = 0; + while (node) { + stream = eb64_entry(node, struct qc_stream_desc, by_id); + node = eb64_next(node); + + chunk_appendf(&trash, " | stream=%-8llu", (unsigned long long)stream->by_id.key); + chunk_appendf(&trash, " off=%-8llu ack=%-8llu", + (unsigned long long)stream->buf_offset, + (unsigned long long)stream->ack_offset); + + if (!(++i % 3)) { + chunk_appendf(&trash, "\n"); + i = 0; + } + } + + chunk_appendf(&trash, "\n"); +} + +static int cli_io_handler_dump_quic(struct appctx *appctx) +{ + struct show_quic_ctx *ctx = appctx->svcctx; + struct stconn *sc = appctx_sc(appctx); + struct quic_conn *qc; + + thread_isolate(); + + if (ctx->thr >= global.nbthread) + goto done; + + /* FIXME: Don't watch the other side !*/ + if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE)) { + /* If we're forced to shut down, we might have to remove our + * reference to the last stream being dumped. + */ + if (!LIST_ISEMPTY(&ctx->bref.users)) + LIST_DEL_INIT(&ctx->bref.users); + goto done; + } + + chunk_reset(&trash); + + if (!LIST_ISEMPTY(&ctx->bref.users)) { + /* Remove show_quic_ctx from previous quic_conn instance. */ + LIST_DEL_INIT(&ctx->bref.users); + } + else if (!ctx->bref.ref) { + /* First invocation. */ + ctx->bref.ref = ha_thread_ctx[ctx->thr].quic_conns.n; + + /* Print legend for oneline format. */ + if (ctx->format == QUIC_DUMP_FMT_ONELINE) { + chunk_appendf(&trash, "# conn/frontend state " + "in_flight infl_p lost_p " + "Local Address Foreign Address " + "local & remote CIDs\n"); + applet_putchk(appctx, &trash); + } + } + + while (1) { + int done = 0; + + if (ctx->bref.ref == &ha_thread_ctx[ctx->thr].quic_conns) { + /* If closing connections requested through "all", move + * to quic_conns_clo list after browsing quic_conns. + * Else move directly to the next quic_conns thread. + */ + if (ctx->flags & QC_CLI_FL_SHOW_ALL) { + ctx->bref.ref = ha_thread_ctx[ctx->thr].quic_conns_clo.n; + continue; + } + + done = 1; + } + else if (ctx->bref.ref == &ha_thread_ctx[ctx->thr].quic_conns_clo) { + /* Closing list entirely browsed, go to next quic_conns + * thread. + */ + done = 1; + } + else { + /* Retrieve next element of the current list. 
*/ + qc = LIST_ELEM(ctx->bref.ref, struct quic_conn *, el_th_ctx); + if ((int)(qc->qc_epoch - ctx->epoch) > 0) + done = 1; + } + + if (done) { + ++ctx->thr; + if (ctx->thr >= global.nbthread) + break; + /* Switch to next thread quic_conns list. */ + ctx->bref.ref = ha_thread_ctx[ctx->thr].quic_conns.n; + continue; + } + + switch (ctx->format) { + case QUIC_DUMP_FMT_FULL: + dump_quic_full(ctx, qc); + break; + case QUIC_DUMP_FMT_ONELINE: + dump_quic_oneline(ctx, qc); + break; + } + + if (applet_putchk(appctx, &trash) == -1) { + /* Register show_quic_ctx to quic_conn instance. */ + LIST_APPEND(&qc->back_refs, &ctx->bref.users); + goto full; + } + + ctx->bref.ref = qc->el_th_ctx.n; + } + + done: + thread_release(); + return 1; + + full: + thread_release(); + return 0; +} + +static void cli_release_show_quic(struct appctx *appctx) +{ + struct show_quic_ctx *ctx = appctx->svcctx; + + if (ctx->thr < global.nbthread) { + thread_isolate(); + if (!LIST_ISEMPTY(&ctx->bref.users)) + LIST_DEL_INIT(&ctx->bref.users); + thread_release(); + } +} + +static struct cli_kw_list cli_kws = {{ }, { + { { "show", "quic", NULL }, "show quic [oneline|full] [all] : display quic connections status", cli_parse_show_quic, cli_io_handler_dump_quic, cli_release_show_quic }, + {{},} +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + +static void cli_quic_init() +{ + int thr; + + for (thr = 0; thr < MAX_THREADS; ++thr) { + LIST_INIT(&ha_thread_ctx[thr].quic_conns); + LIST_INIT(&ha_thread_ctx[thr].quic_conns_clo); + } +} +INITCALL0(STG_INIT, cli_quic_init); diff --git a/src/quic_conn.c b/src/quic_conn.c new file mode 100644 index 0000000..5233496 --- /dev/null +++ b/src/quic_conn.c @@ -0,0 +1,1893 @@ +/* + * QUIC protocol implementation. Lower layer with internal features implemented + * here such as QUIC encryption, idle timeout, acknowledgement and + * retransmission. + * + * Copyright 2020 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <haproxy/quic_conn.h> + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> + +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> + +#include <netinet/tcp.h> + +#include <import/ebmbtree.h> + +#include <haproxy/buf-t.h> +#include <haproxy/compat.h> +#include <haproxy/api.h> +#include <haproxy/debug.h> +#include <haproxy/tools.h> +#include <haproxy/ticks.h> + +#include <haproxy/connection.h> +#include <haproxy/fd.h> +#include <haproxy/freq_ctr.h> +#include <haproxy/frontend.h> +#include <haproxy/global.h> +#include <haproxy/h3.h> +#include <haproxy/hq_interop.h> +#include <haproxy/log.h> +#include <haproxy/mux_quic.h> +#include <haproxy/ncbuf.h> +#include <haproxy/pipe.h> +#include <haproxy/proxy.h> +#include <haproxy/quic_ack.h> +#include <haproxy/quic_cc.h> +#include <haproxy/quic_cli-t.h> +#include <haproxy/quic_frame.h> +#include <haproxy/quic_enc.h> +#include <haproxy/quic_loss.h> +#include <haproxy/quic_rx.h> +#include <haproxy/quic_ssl.h> +#include <haproxy/quic_sock.h> +#include <haproxy/quic_stats.h> +#include <haproxy/quic_stream.h> +#include <haproxy/quic_tp.h> +#include <haproxy/quic_trace.h> +#include <haproxy/quic_tx.h> +#include <haproxy/cbuf.h> +#include <haproxy/proto_quic.h> +#include <haproxy/quic_tls.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/task.h> +#include <haproxy/thread.h> +#include <haproxy/trace.h> + +/* list of supported QUIC versions by this implementation */ +const struct quic_version quic_versions[] = { + { + .num = QUIC_PROTOCOL_VERSION_DRAFT_29, + .initial_salt = initial_salt_draft_29, + .initial_salt_len = sizeof initial_salt_draft_29, + .key_label = (const unsigned char *)QUIC_HKDF_KEY_LABEL_V1, + .key_label_len = sizeof(QUIC_HKDF_KEY_LABEL_V1) - 1, + .iv_label = (const unsigned char *)QUIC_HKDF_IV_LABEL_V1, + .iv_label_len = sizeof(QUIC_HKDF_IV_LABEL_V1) - 1, + .hp_label = (const unsigned char *)QUIC_HKDF_HP_LABEL_V1, + .hp_label_len = sizeof(QUIC_HKDF_HP_LABEL_V1) - 1, + .ku_label = (const unsigned char *)QUIC_HKDF_KU_LABEL_V1, + .ku_label_len = sizeof(QUIC_HKDF_KU_LABEL_V1) - 1, + .retry_tag_key = (const unsigned char *)QUIC_TLS_RETRY_KEY_DRAFT, + .retry_tag_nonce = (const unsigned char *)QUIC_TLS_RETRY_NONCE_DRAFT, + }, + { + .num = QUIC_PROTOCOL_VERSION_1, + .initial_salt = initial_salt_v1, + .initial_salt_len = sizeof initial_salt_v1, + .key_label = (const unsigned char *)QUIC_HKDF_KEY_LABEL_V1, + .key_label_len = sizeof(QUIC_HKDF_KEY_LABEL_V1) - 1, + .iv_label = (const unsigned char *)QUIC_HKDF_IV_LABEL_V1, + .iv_label_len = sizeof(QUIC_HKDF_IV_LABEL_V1) - 1, + .hp_label = (const unsigned char *)QUIC_HKDF_HP_LABEL_V1, + .hp_label_len = sizeof(QUIC_HKDF_HP_LABEL_V1) - 1, + .ku_label = (const unsigned char *)QUIC_HKDF_KU_LABEL_V1, + .ku_label_len = sizeof(QUIC_HKDF_KU_LABEL_V1) - 1, + .retry_tag_key = (const unsigned char *)QUIC_TLS_RETRY_KEY_V1, + .retry_tag_nonce = (const unsigned char *)QUIC_TLS_RETRY_NONCE_V1, + }, + { + .num = QUIC_PROTOCOL_VERSION_2, + .initial_salt = initial_salt_v2, + .initial_salt_len = sizeof initial_salt_v2, + .key_label = (const unsigned char *)QUIC_HKDF_KEY_LABEL_V2, + .key_label_len = sizeof(QUIC_HKDF_KEY_LABEL_V2) - 1, + .iv_label = (const unsigned char *)QUIC_HKDF_IV_LABEL_V2, + .iv_label_len = sizeof(QUIC_HKDF_IV_LABEL_V2) - 1, + .hp_label = (const unsigned char *)QUIC_HKDF_HP_LABEL_V2, + .hp_label_len = sizeof(QUIC_HKDF_HP_LABEL_V2) - 1, + .ku_label = (const unsigned char *)QUIC_HKDF_KU_LABEL_V2, + .ku_label_len = sizeof(QUIC_HKDF_KU_LABEL_V2) - 1, + 
.retry_tag_key = (const unsigned char *)QUIC_TLS_RETRY_KEY_V2, + .retry_tag_nonce = (const unsigned char *)QUIC_TLS_RETRY_NONCE_V2, + }, +}; + +/* Function pointers which can be used to compute a hash from the first generated CID and to derive new CIDs */ +uint64_t (*quic_hash64_from_cid)(const unsigned char *cid, int size, const unsigned char *secret, size_t secretlen) = NULL; +void (*quic_newcid_from_hash64)(unsigned char *cid, int size, uint64_t hash, const unsigned char *secret, size_t secretlen) = NULL; + +/* The total number of supported versions */ +const size_t quic_versions_nb = sizeof quic_versions / sizeof *quic_versions; +/* Listener only preferred version */ +const struct quic_version *preferred_version; +/* RFC 8999 5.4. Version + * A Version field with a + * value of 0x00000000 is reserved for version negotiation + */ +const struct quic_version quic_version_VN_reserved = { .num = 0, }; + +DECLARE_STATIC_POOL(pool_head_quic_conn, "quic_conn", sizeof(struct quic_conn)); +DECLARE_STATIC_POOL(pool_head_quic_conn_closed, "quic_conn_closed", sizeof(struct quic_conn_closed)); +DECLARE_STATIC_POOL(pool_head_quic_cids, "quic_cids", sizeof(struct eb_root)); +DECLARE_POOL(pool_head_quic_connection_id, + "quic_connection_id", sizeof(struct quic_connection_id)); + +struct task *quic_conn_app_io_cb(struct task *t, void *context, unsigned int state); +static int quic_conn_init_timer(struct quic_conn *qc); +static int quic_conn_init_idle_timer_task(struct quic_conn *qc, struct proxy *px); + +/* Returns 1 if the peer has validated <qc> QUIC connection address, 0 if not. */ +int quic_peer_validated_addr(struct quic_conn *qc) +{ + if (!qc_is_listener(qc)) + return 1; + + if (qc->flags & QUIC_FL_CONN_PEER_VALIDATED_ADDR) + return 1; + + BUG_ON(qc->bytes.prep > 3 * qc->bytes.rx); + + return 0; +} + +/* To be called to kill a connection as soon as possible (without sending any packet). */ +void qc_kill_conn(struct quic_conn *qc) +{ + TRACE_ENTER(QUIC_EV_CONN_KILL, qc); + TRACE_PROTO("killing the connection", QUIC_EV_CONN_KILL, qc); + qc->flags |= QUIC_FL_CONN_TO_KILL; + qc->flags &= ~QUIC_FL_CONN_RETRANS_NEEDED; + task_wakeup(qc->idle_timer_task, TASK_WOKEN_OTHER); + + qc_notify_err(qc); + + TRACE_LEAVE(QUIC_EV_CONN_KILL, qc); +} + +/* Set the timer attached to the <qc> QUIC connection, used for both loss + * detection and PTO, and schedule the task associated with this timer if needed. + */ +void qc_set_timer(struct quic_conn *qc) +{ + struct quic_pktns *pktns; + unsigned int pto; + int handshake_confirmed; + + TRACE_ENTER(QUIC_EV_CONN_STIMER, qc); + TRACE_PROTO("set timer", QUIC_EV_CONN_STIMER, qc, NULL, NULL, &qc->path->ifae_pkts); + + pktns = NULL; + if (!qc->timer_task) { + TRACE_PROTO("already released timer task", QUIC_EV_CONN_STIMER, qc); + goto leave; + } + + pktns = quic_loss_pktns(qc); + if (tick_isset(pktns->tx.loss_time)) { + qc->timer = pktns->tx.loss_time; + goto out; + } + + /* anti-amplification: the timer must be + * cancelled for a server which reached the anti-amplification limit. + */ + if (!quic_peer_validated_addr(qc) && + (qc->flags & QUIC_FL_CONN_ANTI_AMPLIFICATION_REACHED)) { + TRACE_PROTO("anti-amplification reached", QUIC_EV_CONN_STIMER, qc); + qc->timer = TICK_ETERNITY; + goto out; + } + + if (!qc->path->ifae_pkts && quic_peer_validated_addr(qc)) { + TRACE_PROTO("timer cancellation", QUIC_EV_CONN_STIMER, qc); + /* Timer cancellation. 
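Nothing is in flight and the peer address has been validated: there is + * nothing left to retransmit and no reason to arm the PTO. 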
*/ + qc->timer = TICK_ETERNITY; + goto out; + } + + handshake_confirmed = qc->state >= QUIC_HS_ST_CONFIRMED; + pktns = quic_pto_pktns(qc, handshake_confirmed, &pto); + if (tick_isset(pto)) + qc->timer = pto; + out: + if (qc->timer == TICK_ETERNITY) { + qc->timer_task->expire = TICK_ETERNITY; + } + else if (tick_is_expired(qc->timer, now_ms)) { + TRACE_DEVEL("wakeup asap timer task", QUIC_EV_CONN_STIMER, qc); + task_wakeup(qc->timer_task, TASK_WOKEN_MSG); + } + else { + TRACE_DEVEL("timer task scheduling", QUIC_EV_CONN_STIMER, qc); + task_schedule(qc->timer_task, qc->timer); + } + leave: + TRACE_PROTO("set timer", QUIC_EV_CONN_STIMER, qc, pktns); + TRACE_LEAVE(QUIC_EV_CONN_STIMER, qc); +} + +/* Prepare the emission of CONNECTION_CLOSE with error <err>. All send/receive + * activity for <qc> will be interrupted. + */ +void quic_set_connection_close(struct quic_conn *qc, const struct quic_err err) +{ + TRACE_ENTER(QUIC_EV_CONN_CLOSE, qc); + if (qc->flags & QUIC_FL_CONN_IMMEDIATE_CLOSE) + goto leave; + + TRACE_STATE("setting immediate close", QUIC_EV_CONN_CLOSE, qc); + qc->flags |= QUIC_FL_CONN_IMMEDIATE_CLOSE; + qc->err.code = err.code; + qc->err.app = err.app; + + leave: + TRACE_LEAVE(QUIC_EV_CONN_CLOSE, qc); +} + +/* Set <alert> TLS alert as QUIC CRYPTO_ERROR error */ +void quic_set_tls_alert(struct quic_conn *qc, int alert) +{ + TRACE_ENTER(QUIC_EV_CONN_SSLALERT, qc); + + quic_set_connection_close(qc, quic_err_tls(alert)); + qc->flags |= QUIC_FL_CONN_TLS_ALERT; + TRACE_STATE("Alert set", QUIC_EV_CONN_SSLALERT, qc); + + TRACE_LEAVE(QUIC_EV_CONN_SSLALERT, qc); +} + +/* Set the application for <qc> QUIC connection. + * Return 1 if succeeded, 0 if not. + */ +int quic_set_app_ops(struct quic_conn *qc, const unsigned char *alpn, size_t alpn_len) +{ + if (alpn_len >= 2 && memcmp(alpn, "h3", 2) == 0) + qc->app_ops = &h3_ops; + else if (alpn_len >= 10 && memcmp(alpn, "hq-interop", 10) == 0) + qc->app_ops = &hq_interop_ops; + else + return 0; + + return 1; +} + +/* Schedule a CONNECTION_CLOSE emission on <qc> if the MUX has been released + * and all STREAM data are acknowledged. The MUX is responsible to have set + * <qc.err> before as it is reused for the CONNECTION_CLOSE frame. + * + * TODO this should also be called on lost packet detection + */ +void qc_check_close_on_released_mux(struct quic_conn *qc) +{ + TRACE_ENTER(QUIC_EV_CONN_CLOSE, qc); + + if (qc->mux_state == QC_MUX_RELEASED && eb_is_empty(&qc->streams_by_id)) { + /* Reuse errcode which should have been previously set by the MUX on release. */ + quic_set_connection_close(qc, qc->err); + tasklet_wakeup(qc->wait_event.tasklet); + } + + TRACE_LEAVE(QUIC_EV_CONN_CLOSE, qc); +} + +/* Finalize <qc> QUIC connection: + + * MUST be called after having received the remote transport parameters which + * are parsed when the TLS callback for the ClientHello message is called upon + * SSL_do_handshake() calls, not necessarily at the first time as this TLS + * message may be split between packets + * Return 1 if succeeded, 0 if not. 
+ */ +int qc_conn_finalize(struct quic_conn *qc, int server) +{ + int ret = 0; + + TRACE_ENTER(QUIC_EV_CONN_NEW, qc); + + if (qc->flags & QUIC_FL_CONN_FINALIZED) + goto finalized; + + if (!quic_tls_finalize(qc, server)) + goto out; + + /* This connection is functional (ready to send/receive) */ + qc->flags |= QUIC_FL_CONN_FINALIZED; + + finalized: + ret = 1; + out: + TRACE_LEAVE(QUIC_EV_CONN_NEW, qc); + return ret; +} + +void quic_conn_closed_err_count_inc(struct quic_conn *qc, struct quic_frame *frm) +{ + TRACE_ENTER(QUIC_EV_CONN_CLOSE, qc); + + if (frm->type == QUIC_FT_CONNECTION_CLOSE) + quic_stats_transp_err_count_inc(qc->prx_counters, frm->connection_close.error_code); + else if (frm->type == QUIC_FT_CONNECTION_CLOSE_APP) { + if (qc->mux_state != QC_MUX_READY || !qc->qcc->app_ops->inc_err_cnt) + goto out; + + qc->qcc->app_ops->inc_err_cnt(qc->qcc->ctx, frm->connection_close_app.error_code); + } + + out: + TRACE_LEAVE(QUIC_EV_CONN_CLOSE, qc); +} + +/* Cancel a request on connection <qc> for stream id <id>. This is useful when + * the client opens a new stream but the MUX has already been released. + * STOP_SENDING and RESET_STREAM frames are prepared for emission. + * + * TODO this function is closely related to H3. Its place should be in H3 layer + * instead of quic-conn but this requires an architecture adjustment. + * + * Returns 1 on success else 0. + */ +int qc_h3_request_reject(struct quic_conn *qc, uint64_t id) +{ + int ret = 0; + struct quic_frame *ss, *rs; + struct quic_enc_level *qel = qc->ael; + const uint64_t app_error_code = H3_REQUEST_REJECTED; + + TRACE_ENTER(QUIC_EV_CONN_PRSHPKT, qc); + + /* Do not emit rejection for unknown unidirectional stream as it is + * forbidden to close some of them (H3 control stream and QPACK + * encoder/decoder streams). + */ + if (quic_stream_is_uni(id)) { + ret = 1; + goto out; + } + + ss = qc_frm_alloc(QUIC_FT_STOP_SENDING); + if (!ss) { + TRACE_ERROR("failed to allocate quic_frame", QUIC_EV_CONN_PRSHPKT, qc); + goto out; + } + + ss->stop_sending.id = id; + ss->stop_sending.app_error_code = app_error_code; + + rs = qc_frm_alloc(QUIC_FT_RESET_STREAM); + if (!rs) { + TRACE_ERROR("failed to allocate quic_frame", QUIC_EV_CONN_PRSHPKT, qc); + qc_frm_free(qc, &ss); + goto out; + } + + rs->reset_stream.id = id; + rs->reset_stream.app_error_code = app_error_code; + rs->reset_stream.final_size = 0; + + LIST_APPEND(&qel->pktns->tx.frms, &ss->list); + LIST_APPEND(&qel->pktns->tx.frms, &rs->list); + ret = 1; + out: + TRACE_LEAVE(QUIC_EV_CONN_PRSHPKT, qc); + return ret; +} + +/* Remove a <qc> quic-conn from its ha_thread_ctx list. If <closing> is true, + * it will immediately be reinserted in the ha_thread_ctx quic_conns_clo list. + */ +void qc_detach_th_ctx_list(struct quic_conn *qc, int closing) +{ + struct bref *bref, *back; + + /* Detach CLI context watchers currently dumping this connection. + * Reattach them to the next quic_conn instance. + */ + list_for_each_entry_safe(bref, back, &qc->back_refs, users) { + /* Remove watcher from this quic_conn instance. */ + LIST_DEL_INIT(&bref->users); + + /* Attach it to next instance unless it was the last list element. */ + if (qc->el_th_ctx.n != &th_ctx->quic_conns && + qc->el_th_ctx.n != &th_ctx->quic_conns_clo) { + struct quic_conn *next = LIST_NEXT(&qc->el_th_ctx, + struct quic_conn *, + el_th_ctx); + LIST_APPEND(&next->back_refs, &bref->users); + } + bref->ref = qc->el_th_ctx.n; + __ha_barrier_store(); + } + + /* Remove quic_conn from global ha_thread_ctx list. 
*/ + LIST_DEL_INIT(&qc->el_th_ctx); + + if (closing) + LIST_APPEND(&th_ctx->quic_conns_clo, &qc->el_th_ctx); +} + + +/* Copy at <pos> position a stateless reset token depending on the + * <salt> salt input. The cluster secret is used as the HKDF input + * secret, with <salt> as salt, to derive this token. + * Return 1 if succeeded, 0 if not. + */ +int quic_stateless_reset_token_cpy(unsigned char *pos, size_t len, + const unsigned char *salt, size_t saltlen) +{ + /* Input secret */ + const unsigned char *key = global.cluster_secret; + size_t keylen = sizeof global.cluster_secret; + /* Info */ + const unsigned char label[] = "stateless token"; + size_t labellen = sizeof label - 1; + int ret; + + ret = quic_hkdf_extract_and_expand(EVP_sha256(), pos, len, + key, keylen, salt, saltlen, label, labellen); + return ret; +} + +/* Build all the frames which must be sent just after the handshake has succeeded. + * This is essentially NEW_CONNECTION_ID frames. A QUIC server must also send + * a HANDSHAKE_DONE frame. + * Return 1 if succeeded, 0 if not. + */ +int quic_build_post_handshake_frames(struct quic_conn *qc) +{ + int ret = 0, max; + struct quic_enc_level *qel; + struct quic_frame *frm, *frmbak; + struct list frm_list = LIST_HEAD_INIT(frm_list); + struct eb64_node *node; + + TRACE_ENTER(QUIC_EV_CONN_IO_CB, qc); + + qel = qc->ael; + /* Only servers must send a HANDSHAKE_DONE frame. */ + if (qc_is_listener(qc)) { + frm = qc_frm_alloc(QUIC_FT_HANDSHAKE_DONE); + if (!frm) { + TRACE_ERROR("frame allocation error", QUIC_EV_CONN_IO_CB, qc); + goto leave; + } + + LIST_APPEND(&frm_list, &frm->list); + } + + /* Initialize <max> connection IDs minus one: there is + * already one connection ID used for the current connection. Also limit + * the number of connection IDs sent to the peer to 4 (3 from this function + * plus 1 for the current connection). + * Note that active_connection_id_limit >= 2: this has been already checked + * when receiving this parameter. + */ + max = QUIC_MIN(qc->tx.params.active_connection_id_limit - 1, (uint64_t)3); + while (max--) { + struct quic_connection_id *conn_id; + + frm = qc_frm_alloc(QUIC_FT_NEW_CONNECTION_ID); + if (!frm) { + TRACE_ERROR("frame allocation error", QUIC_EV_CONN_IO_CB, qc); + goto err; + } + + conn_id = new_quic_cid(qc->cids, qc, NULL, NULL); + if (!conn_id) { + qc_frm_free(qc, &frm); + TRACE_ERROR("CID allocation error", QUIC_EV_CONN_IO_CB, qc); + goto err; + } + + /* TODO To prevent CID tree locking, all CIDs created here + * could be allocated at the same time as the first one. + */ + quic_cid_insert(conn_id); + + quic_connection_id_to_frm_cpy(frm, conn_id); + LIST_APPEND(&frm_list, &frm->list); + } + + LIST_SPLICE(&qel->pktns->tx.frms, &frm_list); + qc->flags &= ~QUIC_FL_CONN_NEED_POST_HANDSHAKE_FRMS; + + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_IO_CB, qc); + return ret; + + err: + /* free the frames */ + list_for_each_entry_safe(frm, frmbak, &frm_list, list) + qc_frm_free(qc, &frm); + + /* The first CID sequence number value used to allocate CIDs by this function is 1, + * 0 being the sequence number of the CID for this connection. 
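+ * Hence the cleanup below walks the tree from sequence number 1 upwards and + * releases only the CIDs which were allocated by this very call. 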
+ */ + node = eb64_lookup_ge(qc->cids, 1); + while (node) { + struct quic_connection_id *conn_id; + + conn_id = eb64_entry(node, struct quic_connection_id, seq_num); + if (conn_id->seq_num.key >= max) + break; + + node = eb64_next(node); + quic_cid_delete(conn_id); + + eb64_delete(&conn_id->seq_num); + pool_free(pool_head_quic_connection_id, conn_id); + } + goto leave; +} + + +/* QUIC connection packet handler task (post handshake) */ +struct task *quic_conn_app_io_cb(struct task *t, void *context, unsigned int state) +{ + struct quic_conn *qc = context; + struct quic_enc_level *qel; + + TRACE_ENTER(QUIC_EV_CONN_IO_CB, qc); + + qel = qc->ael; + TRACE_STATE("connection handshake state", QUIC_EV_CONN_IO_CB, qc, &qc->state); + + if (qc_test_fd(qc)) + qc_rcv_buf(qc); + + /* Prepare post-handshake frames + * - after connection is instantiated (accept is done) + * - handshake state is completed (may not be the case here in 0-RTT) + */ + if ((qc->flags & QUIC_FL_CONN_NEED_POST_HANDSHAKE_FRMS) && qc->conn && + qc->state >= QUIC_HS_ST_COMPLETE) { + quic_build_post_handshake_frames(qc); + } + + /* Retransmissions */ + if (qc->flags & QUIC_FL_CONN_RETRANS_NEEDED) { + TRACE_STATE("retransmission needed", QUIC_EV_CONN_IO_CB, qc); + qc->flags &= ~QUIC_FL_CONN_RETRANS_NEEDED; + if (!qc_dgrams_retransmit(qc)) + goto out; + } + + if (!qc_treat_rx_pkts(qc)) { + TRACE_DEVEL("qc_treat_rx_pkts() failed", QUIC_EV_CONN_IO_CB, qc); + goto out; + } + + if (qc->flags & QUIC_FL_CONN_TO_KILL) { + TRACE_DEVEL("connection to be killed", QUIC_EV_CONN_IO_CB, qc); + goto out; + } + + if ((qc->flags & QUIC_FL_CONN_DRAINING) && + !(qc->flags & QUIC_FL_CONN_IMMEDIATE_CLOSE)) { + TRACE_STATE("draining connection (must not send packets)", QUIC_EV_CONN_IO_CB, qc); + goto out; + } + + /* XXX TODO: how to limit the list frames to send */ + if (!qc_send_app_pkts(qc, &qel->pktns->tx.frms)) { + TRACE_DEVEL("qc_send_app_pkts() failed", QUIC_EV_CONN_IO_CB, qc); + goto out; + } + + out: + if ((qc->flags & QUIC_FL_CONN_CLOSING) && qc->mux_state != QC_MUX_READY) { + quic_conn_release(qc); + qc = NULL; + } + + TRACE_LEAVE(QUIC_EV_CONN_IO_CB, qc); + return t; +} + +static void quic_release_cc_conn(struct quic_conn_closed *cc_qc) +{ + struct quic_conn *qc = (struct quic_conn *)cc_qc; + + TRACE_ENTER(QUIC_EV_CONN_IO_CB, cc_qc); + + task_destroy(cc_qc->idle_timer_task); + cc_qc->idle_timer_task = NULL; + tasklet_free(qc->wait_event.tasklet); + free_quic_conn_cids(qc); + pool_free(pool_head_quic_cids, cc_qc->cids); + cc_qc->cids = NULL; + pool_free(pool_head_quic_cc_buf, cc_qc->cc_buf_area); + cc_qc->cc_buf_area = NULL; + /* free the SSL sock context */ + pool_free(pool_head_quic_conn_closed, cc_qc); + + TRACE_LEAVE(QUIC_EV_CONN_IO_CB); +} + +/* QUIC connection packet handler task used when in "closing connection" state. */ +static struct task *quic_conn_closed_io_cb(struct task *t, void *context, unsigned int state) +{ + struct quic_conn_closed *cc_qc = context; + struct quic_conn *qc = (struct quic_conn *)cc_qc; + struct buffer buf; + uint16_t dglen; + struct quic_tx_packet *first_pkt; + size_t headlen = sizeof dglen + sizeof first_pkt; + + TRACE_ENTER(QUIC_EV_CONN_IO_CB, qc); + + if (qc_test_fd(qc)) + qc_rcv_buf(qc); + + /* Do not send too much data if the peer address was not validated. 
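Per RFC 9000 8.1, until its address is validated a server may send at most + * three times the bytes it received from the peer; quic_may_send_bytes() + * returns what remains of that budget. 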
*/ + if ((qc->flags & QUIC_FL_CONN_IMMEDIATE_CLOSE) && + !(qc->flags & QUIC_FL_CONN_PEER_VALIDATED_ADDR) && + quic_may_send_bytes(qc) < cc_qc->cc_dgram_len) + goto leave; + + buf = b_make(cc_qc->cc_buf_area + headlen, + QUIC_MAX_CC_BUFSIZE - headlen, 0, cc_qc->cc_dgram_len); + if (qc_snd_buf(qc, &buf, buf.data, 0) < 0) { + TRACE_ERROR("sendto fatal error", QUIC_EV_CONN_IO_CB, qc); + quic_release_cc_conn(cc_qc); + cc_qc = NULL; + qc = NULL; + t = NULL; + goto leave; + } + + qc->flags &= ~QUIC_FL_CONN_IMMEDIATE_CLOSE; + + leave: + TRACE_LEAVE(QUIC_EV_CONN_IO_CB, qc); + + return t; +} + +/* The task handling the idle timeout of a connection in "connection close" state */ +static struct task *quic_conn_closed_idle_timer_task(struct task *t, void *ctx, unsigned int state) +{ + struct quic_conn_closed *cc_qc = ctx; + + quic_release_cc_conn(cc_qc); + + return NULL; +} + +/* Allocate a new connection in "connection close" state and return it + * if succeeded, NULL if not. This function is also responsible for + * copying just enough information from the <qc> original connection + * to the newly allocated connection so as to keep it functional until + * its idle timer expires. + */ +static struct quic_conn_closed *qc_new_cc_conn(struct quic_conn *qc) +{ + struct quic_conn_closed *cc_qc; + + cc_qc = pool_alloc(pool_head_quic_conn_closed); + if (!cc_qc) + return NULL; + + quic_conn_mv_cids_to_cc_conn(cc_qc, qc); + + qc_init_fd((struct quic_conn *)cc_qc); + + cc_qc->flags = qc->flags; + cc_qc->err = qc->err; + + cc_qc->nb_pkt_for_cc = qc->nb_pkt_for_cc; + cc_qc->nb_pkt_since_cc = qc->nb_pkt_since_cc; + + cc_qc->local_addr = qc->local_addr; + cc_qc->peer_addr = qc->peer_addr; + + cc_qc->wait_event.tasklet = qc->wait_event.tasklet; + cc_qc->wait_event.tasklet->process = quic_conn_closed_io_cb; + cc_qc->wait_event.tasklet->context = cc_qc; + cc_qc->wait_event.events = 0; + cc_qc->subs = NULL; + + cc_qc->bytes.prep = qc->bytes.prep; + cc_qc->bytes.tx = qc->bytes.tx; + cc_qc->bytes.rx = qc->bytes.rx; + + cc_qc->odcid = qc->odcid; + cc_qc->dcid = qc->dcid; + cc_qc->scid = qc->scid; + + cc_qc->li = qc->li; + cc_qc->cids = qc->cids; + + cc_qc->idle_timer_task = qc->idle_timer_task; + cc_qc->idle_timer_task->process = quic_conn_closed_idle_timer_task; + cc_qc->idle_timer_task->context = cc_qc; + cc_qc->idle_expire = qc->idle_expire; + + cc_qc->conn = qc->conn; + qc->conn = NULL; + + cc_qc->cc_buf_area = qc->tx.cc_buf_area; + cc_qc->cc_dgram_len = qc->tx.cc_dgram_len; + TRACE_PRINTF(TRACE_LEVEL_PROTO, QUIC_EV_CONN_IO_CB, qc, 0, 0, 0, + "switch qc@%p to cc_qc@%p", qc, cc_qc); + + return cc_qc; +} + +/* QUIC connection packet handler task. 
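This is the main I/O cycle: feed buffered CRYPTO data to the TLS stack, + * handle pending retransmissions, process received packets, then build and + * send new datagrams, releasing handshake resources once they become + * useless. 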
*/ +struct task *quic_conn_io_cb(struct task *t, void *context, unsigned int state) +{ + int ret; + struct quic_conn *qc = context; + struct buffer *buf = NULL; + int st; + struct tasklet *tl = (struct tasklet *)t; + + TRACE_ENTER(QUIC_EV_CONN_IO_CB, qc); + + st = qc->state; + TRACE_PROTO("connection state", QUIC_EV_CONN_IO_CB, qc, &st); + + if (HA_ATOMIC_LOAD(&tl->state) & TASK_HEAVY) { + HA_ATOMIC_AND(&tl->state, ~TASK_HEAVY); + qc_ssl_provide_all_quic_data(qc, qc->xprt_ctx); + } + + /* Retransmissions */ + if (qc->flags & QUIC_FL_CONN_RETRANS_NEEDED) { + TRACE_DEVEL("retransmission needed", QUIC_EV_CONN_PHPKTS, qc); + qc->flags &= ~QUIC_FL_CONN_RETRANS_NEEDED; + if (!qc_dgrams_retransmit(qc)) + goto out; + } + + if (qc_test_fd(qc)) + qc_rcv_buf(qc); + + if (!qc_treat_rx_pkts(qc)) + goto out; + + if (HA_ATOMIC_LOAD(&tl->state) & TASK_HEAVY) { + tasklet_wakeup(tl); + goto out; + } + + if (qc->flags & QUIC_FL_CONN_TO_KILL) { + TRACE_DEVEL("connection to be killed", QUIC_EV_CONN_PHPKTS, qc); + goto out; + } + + if ((qc->flags & QUIC_FL_CONN_DRAINING) && + !(qc->flags & QUIC_FL_CONN_IMMEDIATE_CLOSE)) + goto out; + + st = qc->state; + if (st >= QUIC_HS_ST_COMPLETE) { + if (!(qc->flags & QUIC_FL_CONN_HPKTNS_DCD)) { + /* Discard the Handshake packet number space. */ + TRACE_PROTO("discarding Handshake pktns", QUIC_EV_CONN_PHPKTS, qc); + quic_pktns_discard(qc->hel->pktns, qc); + qc_set_timer(qc); + qc_el_rx_pkts_del(qc->hel); + qc_release_pktns_frms(qc, qc->hel->pktns); + } + } + + buf = qc_get_txb(qc); + if (!buf) + goto out; + + if (b_data(buf) && !qc_purge_txbuf(qc, buf)) + goto out; + + /* Currently buf cannot be non-empty at this stage. Even if a previous + * sendto() has failed it is emptied to simulate packet emission and + * rely on QUIC loss detection to try to emit it. + */ + BUG_ON_HOT(b_data(buf)); + b_reset(buf); + + ret = qc_prep_hpkts(qc, buf, NULL); + if (ret == -1) { + qc_txb_release(qc); + goto out; + } + + if (ret && !qc_send_ppkts(buf, qc->xprt_ctx)) { + if (qc->flags & QUIC_FL_CONN_TO_KILL) + qc_txb_release(qc); + goto out; + } + + qc_txb_release(qc); + + out: + /* Release the Handshake encryption level and packet number space if + * the Handshake is confirmed and if there is no need to send + * anymore Handshake packets. + */ + if (quic_tls_pktns_is_dcd(qc, qc->hpktns) && + !qc_need_sending(qc, qc->hel)) { + /* Ensure Initial packet encryption level and packet number space have + * been released. + */ + qc_enc_level_free(qc, &qc->iel); + quic_pktns_release(qc, &qc->ipktns); + qc_enc_level_free(qc, &qc->hel); + quic_pktns_release(qc, &qc->hpktns); + /* Also release the negotiated Initial TLS context. */ + quic_nictx_free(qc); + } + + if ((qc->flags & QUIC_FL_CONN_CLOSING) && qc->mux_state != QC_MUX_READY) { + quic_conn_release(qc); + qc = NULL; + } + + TRACE_PROTO("ssl error", QUIC_EV_CONN_IO_CB, qc, &st); + TRACE_LEAVE(QUIC_EV_CONN_IO_CB, qc); + return t; +} + +/* Callback called upon loss detection and PTO timer expirations. 
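As a reminder, the PTO period (RFC 9002 6.2.1) computed by quic_pto_pktns() + * is: + * + * PTO = smoothed_rtt + max(4*rttvar, kGranularity) + max_ack_delay + * + * and each consecutive expiration without progress doubles it through the + * pto_count backoff incremented at the end of this handler. 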
*/ +struct task *qc_process_timer(struct task *task, void *ctx, unsigned int state) +{ + struct quic_conn *qc = ctx; + struct quic_pktns *pktns; + + TRACE_ENTER(QUIC_EV_CONN_PTIMER, qc); + TRACE_PROTO("process timer", QUIC_EV_CONN_PTIMER, qc, + NULL, NULL, &qc->path->ifae_pkts); + + task->expire = TICK_ETERNITY; + pktns = quic_loss_pktns(qc); + + if (qc->flags & (QUIC_FL_CONN_DRAINING|QUIC_FL_CONN_TO_KILL)) { + TRACE_PROTO("cancelled action (draining state)", QUIC_EV_CONN_PTIMER, qc); + goto out; + } + + if (tick_isset(pktns->tx.loss_time)) { + struct list lost_pkts = LIST_HEAD_INIT(lost_pkts); + + qc_packet_loss_lookup(pktns, qc, &lost_pkts); + if (!LIST_ISEMPTY(&lost_pkts)) + tasklet_wakeup(qc->wait_event.tasklet); + if (qc_release_lost_pkts(qc, pktns, &lost_pkts, now_ms)) + qc_set_timer(qc); + goto out; + } + + if (qc->path->in_flight) { + pktns = quic_pto_pktns(qc, qc->state >= QUIC_HS_ST_CONFIRMED, NULL); + if (!pktns->tx.in_flight) { + TRACE_PROTO("No in flight packets to probe with", QUIC_EV_CONN_TXPKT, qc); + goto out; + } + + if (pktns == qc->ipktns) { + if (qc_may_probe_ipktns(qc)) { + qc->flags |= QUIC_FL_CONN_RETRANS_NEEDED; + pktns->flags |= QUIC_FL_PKTNS_PROBE_NEEDED; + TRACE_STATE("needs to probe Initial packet number space", QUIC_EV_CONN_TXPKT, qc); + } + else { + TRACE_STATE("Cannot probe Initial packet number space", QUIC_EV_CONN_TXPKT, qc); + } + if (qc->hpktns && qc->hpktns->tx.in_flight) { + qc->flags |= QUIC_FL_CONN_RETRANS_NEEDED; + qc->hpktns->flags |= QUIC_FL_PKTNS_PROBE_NEEDED; + TRACE_STATE("needs to probe Handshake packet number space", QUIC_EV_CONN_TXPKT, qc); + } + } + else if (pktns == qc->hpktns) { + TRACE_STATE("needs to probe Handshake packet number space", QUIC_EV_CONN_TXPKT, qc); + qc->flags |= QUIC_FL_CONN_RETRANS_NEEDED; + pktns->flags |= QUIC_FL_PKTNS_PROBE_NEEDED; + if (qc->ipktns && qc->ipktns->tx.in_flight) { + if (qc_may_probe_ipktns(qc)) { + qc->ipktns->flags |= QUIC_FL_PKTNS_PROBE_NEEDED; + TRACE_STATE("needs to probe Initial packet number space", QUIC_EV_CONN_TXPKT, qc); + } + else { + TRACE_STATE("Cannot probe Initial packet number space", QUIC_EV_CONN_TXPKT, qc); + } + } + } + else if (pktns == qc->apktns) { + pktns->tx.pto_probe = QUIC_MAX_NB_PTO_DGRAMS; + /* Wake up upper layer if waiting to send new data. */ + if (!qc_notify_send(qc)) { + TRACE_STATE("needs to probe 01RTT packet number space", QUIC_EV_CONN_TXPKT, qc); + qc->flags |= QUIC_FL_CONN_RETRANS_NEEDED; + pktns->flags |= QUIC_FL_PKTNS_PROBE_NEEDED; + } + } + } + else if (!qc_is_listener(qc) && qc->state <= QUIC_HS_ST_COMPLETE) { + if (quic_tls_has_tx_sec(qc->hel)) + qc->hel->pktns->tx.pto_probe = 1; + if (quic_tls_has_tx_sec(qc->iel)) + qc->iel->pktns->tx.pto_probe = 1; + } + + tasklet_wakeup(qc->wait_event.tasklet); + qc->path->loss.pto_count++; + + out: + TRACE_PROTO("process timer", QUIC_EV_CONN_PTIMER, qc, pktns); + TRACE_LEAVE(QUIC_EV_CONN_PTIMER, qc); + + return task; +} + +/* Allocate a new QUIC connection with <qv> as QUIC version. <ipv4> + * boolean is set to 1 for IPv4 connection, 0 for IPv6. <server> is set to 1 + * for QUIC servers (or haproxy listeners). + * <dcid> is the destination connection ID, <scid> is the source connection ID. + * This latter <scid> CID has the same value on the wire as the one for <conn_id>, + * which is the first CID of this connection, but a different internal representation used to build + * NEW_CONNECTION_ID frames. It is the responsibility of the caller to insert + * <conn_id> in the CIDs tree for this connection (qc->cids). 
+/* Allocate a new QUIC connection with <version> as QUIC version. <ipv4>
+ * boolean is set to 1 for IPv4 connections, 0 for IPv6. <server> is set to 1
+ * for QUIC servers (or haproxy listeners).
+ * <dcid> is the destination connection ID, <scid> is the source connection ID.
+ * This latter <scid> CID has the same value on the wire as <conn_id>, which is
+ * the first CID of this connection but uses a different internal
+ * representation to build NEW_CONNECTION_ID frames. It is the responsibility
+ * of the caller to insert <conn_id> into the CIDs tree of this connection
+ * (qc->cids).
+ * <token> is non-zero if an address validation token was used for this
+ * connection. Endpoint addresses are specified via <local_addr> and
+ * <peer_addr>.
+ * Returns the connection if it succeeded, NULL if not.
+ */
+struct quic_conn *qc_new_conn(const struct quic_version *qv, int ipv4,
+                              struct quic_cid *dcid, struct quic_cid *scid,
+                              const struct quic_cid *token_odcid,
+                              struct quic_connection_id *conn_id,
+                              struct sockaddr_storage *local_addr,
+                              struct sockaddr_storage *peer_addr,
+                              int server, int token, void *owner)
+{
+	int i;
+	struct quic_conn *qc = NULL;
+	struct listener *l = server ? owner : NULL;
+	struct proxy *prx = l ? l->bind_conf->frontend : NULL;
+	struct quic_cc_algo *cc_algo = NULL;
+	unsigned int next_actconn = 0, next_sslconn = 0, next_handshake = 0;
+
+	TRACE_ENTER(QUIC_EV_CONN_INIT);
+
+	next_actconn = increment_actconn();
+	if (!next_actconn) {
+		_HA_ATOMIC_INC(&maxconn_reached);
+		TRACE_STATE("maxconn reached", QUIC_EV_CONN_INIT);
+		goto err;
+	}
+
+	next_sslconn = increment_sslconn();
+	if (!next_sslconn) {
+		TRACE_STATE("sslconn reached", QUIC_EV_CONN_INIT);
+		goto err;
+	}
+
+	if (server) {
+		next_handshake = quic_increment_curr_handshake(l);
+		if (!next_handshake) {
+			TRACE_STATE("max handshake reached", QUIC_EV_CONN_INIT);
+			goto err;
+		}
+	}
+
+	qc = pool_alloc(pool_head_quic_conn);
+	if (!qc) {
+		TRACE_ERROR("Could not allocate a new connection", QUIC_EV_CONN_INIT);
+		goto err;
+	}
+
+	/* Now that the quic_conn instance is allocated, quic_conn_release() will
+	 * ensure global accounting is decremented.
+	 */
+	next_handshake = next_sslconn = next_actconn = 0;
+
+	/* Initialize in priority qc members required for a safe dealloc. */
+	qc->nictx = NULL;
+	/* Prevent these CIDs from being dumped by TRACE() calls */
+	qc->scid.len = qc->odcid.len = qc->dcid.len = 0;
+	/* required to use MTLIST_IN_LIST */
+	MT_LIST_INIT(&qc->accept_list);
+
+	LIST_INIT(&qc->rx.pkt_list);
+
+	qc->streams_by_id = EB_ROOT_UNIQUE;
+
+	/* Required to call free_quic_conn_cids() from quic_conn_release() */
+	qc->cids = NULL;
+	qc->tx.cc_buf_area = NULL;
+	qc_init_fd(qc);
+
+	LIST_INIT(&qc->back_refs);
+	LIST_INIT(&qc->el_th_ctx);
+
+	qc->wait_event.tasklet = NULL;
+
+	/* Required to destroy <qc> tasks from quic_conn_release() */
+	qc->timer_task = NULL;
+	qc->idle_timer_task = NULL;
+
+	qc->xprt_ctx = NULL;
+	qc->conn = NULL;
+	qc->qcc = NULL;
+	qc->app_ops = NULL;
+	qc->path = NULL;
+
+	/* Keyupdate: required to safely call quic_tls_ku_free() from
+	 * quic_conn_release().
+	 */
+	quic_tls_ku_reset(&qc->ku.prv_rx);
+	quic_tls_ku_reset(&qc->ku.nxt_rx);
+	quic_tls_ku_reset(&qc->ku.nxt_tx);
+
+	/* Encryption levels */
+	qc->iel = qc->eel = qc->hel = qc->ael = NULL;
+	LIST_INIT(&qc->qel_list);
+	/* Packet number spaces */
+	qc->ipktns = qc->hpktns = qc->apktns = NULL;
+	LIST_INIT(&qc->pktns_list);
+
+	/* Required to safely call quic_conn_prx_cntrs_update() from quic_conn_release(). */
+	qc->prx_counters = NULL;
+
+	/* QUIC Server (or listener). */
+	if (server) {
+		cc_algo = l->bind_conf->quic_cc_algo;
+
+		qc->prx_counters = EXTRA_COUNTERS_GET(prx->extra_counters_fe,
+		                                      &quic_stats_module);
+		qc->flags = QUIC_FL_CONN_LISTENER;
+		qc->state = QUIC_HS_ST_SERVER_INITIAL;
+		/* Copy the client's original DCID. */
+		qc->odcid = *dcid;
+		/* Copy the packet SCID to reuse it as DCID for sending */
+		qc->dcid = *scid;
+		qc->tx.buf = BUF_NULL;
+		qc->li = l;
+	}
+	/* QUIC Client (outgoing connection to servers) */
+	else {
+		qc->state = QUIC_HS_ST_CLIENT_INITIAL;
+		if (dcid->len)
+			memcpy(qc->dcid.data, dcid->data, dcid->len);
+		qc->dcid.len = dcid->len;
+		qc->li = NULL;
+	}
+	qc->mux_state = QC_MUX_NULL;
+	qc->err = quic_err_transport(QC_ERR_NO_ERROR);
+
+	/* If the connection is instantiated due to an INITIAL packet with an
+	 * already checked token, consider the peer address as validated.
+	 */
+	if (token_odcid->len) {
+		TRACE_STATE("validate peer address due to initial token",
+		            QUIC_EV_CONN_INIT, qc);
+		qc->flags |= QUIC_FL_CONN_PEER_VALIDATED_ADDR;
+	}
+	else {
+		HA_ATOMIC_INC(&qc->prx_counters->half_open_conn);
+	}
+
+	/* Now proceed to the allocation of qc members. */
+	qc->rx.buf.area = pool_alloc(pool_head_quic_conn_rxbuf);
+	if (!qc->rx.buf.area) {
+		TRACE_ERROR("Could not allocate a new RX buffer", QUIC_EV_CONN_INIT, qc);
+		goto err;
+	}
+
+	qc->cids = pool_alloc(pool_head_quic_cids);
+	if (!qc->cids) {
+		TRACE_ERROR("Could not allocate a new CID tree", QUIC_EV_CONN_INIT, qc);
+		goto err;
+	}
+	*qc->cids = EB_ROOT;
+
+	conn_id->qc = qc;
+
+	if (HA_ATOMIC_LOAD(&l->rx.quic_mode) == QUIC_SOCK_MODE_CONN &&
+	    (global.tune.options & GTUNE_QUIC_SOCK_PER_CONN) &&
+	    is_addr(local_addr)) {
+		TRACE_USER("Allocate a socket for QUIC connection", QUIC_EV_CONN_INIT, qc);
+		qc_alloc_fd(qc, local_addr, peer_addr);
+
+		/* haproxy soft-stop is supported only for QUIC connections
+		 * with their own socket.
+		 */
+		if (qc_test_fd(qc))
+			_HA_ATOMIC_INC(&jobs);
+	}
+
+	/* Select our SCID which is the first CID with 0 as sequence number. */
+	qc->scid = conn_id->cid;
+
+	if (!qc_enc_level_alloc(qc, &qc->ipktns, &qc->iel, ssl_encryption_initial)) {
+		TRACE_ERROR("Could not initialize an encryption level", QUIC_EV_CONN_INIT, qc);
+		goto err;
+	}
+
+	qc->original_version = qv;
+	qc->negotiated_version = NULL;
+	qc->tps_tls_ext = (qc->original_version->num & 0xff000000) == 0xff000000 ?
+		TLS_EXTENSION_QUIC_TRANSPORT_PARAMETERS_DRAFT:
+		TLS_EXTENSION_QUIC_TRANSPORT_PARAMETERS;
+	/* TX part. */
+	qc->bytes.tx = qc->bytes.prep = 0;
+	memset(&qc->tx.params, 0, sizeof(qc->tx.params));
+	qc->tx.buf = BUF_NULL;
+	qc->tx.cc_buf = BUF_NULL;
+	qc->tx.cc_buf_area = NULL;
+	qc->tx.cc_dgram_len = 0;
+	/* RX part. */
+	qc->bytes.rx = 0;
+	memset(&qc->rx.params, 0, sizeof(qc->rx.params));
+	qc->rx.buf = b_make(qc->rx.buf.area, QUIC_CONN_RX_BUFSZ, 0, 0);
+	for (i = 0; i < QCS_MAX_TYPES; i++)
+		qc->rx.strms[i].nb_streams = 0;
+
+	qc->nb_pkt_for_cc = 1;
+	qc->nb_pkt_since_cc = 0;
+
+	if (!quic_tls_ku_init(qc)) {
+		TRACE_ERROR("Key update initialization failed", QUIC_EV_CONN_INIT, qc);
+		goto err;
+	}
+
+	qc->max_ack_delay = 0;
+	/* Only one path at this time (multipath not supported) */
+	qc->path = &qc->paths[0];
+	quic_cc_path_init(qc->path, ipv4, server ? l->bind_conf->max_cwnd : 0,
+	                  cc_algo ? cc_algo : default_quic_cc_algo, qc);
+
+	qc->stream_buf_count = 0;
+	memcpy(&qc->local_addr, local_addr, sizeof(qc->local_addr));
+	memcpy(&qc->peer_addr, peer_addr, sizeof qc->peer_addr);
+
+	if (server && !qc_lstnr_params_init(qc, &l->bind_conf->quic_params,
+	                                    conn_id->stateless_reset_token,
+	                                    dcid->data, dcid->len,
+	                                    qc->scid.data, qc->scid.len, token_odcid))
+		goto err;
+
+	/* Initialize the idle timeout of the connection at the "max_idle_timeout"
+	 * value from local transport parameters.
+	 */
+	qc->max_idle_timeout = qc->rx.params.max_idle_timeout;
+	qc->wait_event.tasklet = tasklet_new();
+	if (!qc->wait_event.tasklet) {
+		TRACE_ERROR("tasklet_new() failed", QUIC_EV_CONN_TXPKT);
+		goto err;
+	}
+	qc->wait_event.tasklet->process = quic_conn_io_cb;
+	qc->wait_event.tasklet->context = qc;
+	qc->wait_event.events = 0;
+	qc->subs = NULL;
+
+	if (qc_alloc_ssl_sock_ctx(qc) ||
+	    !quic_conn_init_timer(qc) ||
+	    !quic_conn_init_idle_timer_task(qc, prx))
+		goto err;
+
+	if (!qc_new_isecs(qc, &qc->iel->tls_ctx, qc->original_version, dcid->data, dcid->len, 1))
+		goto err;
+
+	/* Counters initialization */
+	memset(&qc->cntrs, 0, sizeof qc->cntrs);
+
+	LIST_APPEND(&th_ctx->quic_conns, &qc->el_th_ctx);
+	qc->qc_epoch = HA_ATOMIC_LOAD(&qc_epoch);
+
+	TRACE_LEAVE(QUIC_EV_CONN_INIT, qc);
+
+	return qc;
+
+ err:
+	quic_conn_release(qc);
+
+	/* Decrement global counters. Done only for errors happening before or
+	 * on pool_head_quic_conn alloc. All other cases are covered by
+	 * quic_conn_release().
+	 */
+	if (next_actconn)
+		_HA_ATOMIC_DEC(&actconn);
+	if (next_sslconn)
+		_HA_ATOMIC_DEC(&global.sslconns);
+	if (next_handshake)
+		_HA_ATOMIC_DEC(&l->rx.quic_curr_handshake);
+
+	TRACE_LEAVE(QUIC_EV_CONN_INIT);
+	return NULL;
+}
+
+/* React to a connection migration initiated on <qc> by a client with the new
+ * path addresses <peer_addr>/<local_addr>.
+ *
+ * Returns 0 on success else non-zero.
+ */
+int qc_handle_conn_migration(struct quic_conn *qc,
+                             const struct sockaddr_storage *peer_addr,
+                             const struct sockaddr_storage *local_addr)
+{
+	TRACE_ENTER(QUIC_EV_CONN_LPKT, qc);
+
+	/* RFC 9000. Connection Migration
+	 *
+	 * If the peer sent the disable_active_migration transport parameter,
+	 * an endpoint also MUST NOT send packets (including probing packets;
+	 * see Section 9.1) from a different local address to the address the peer
+	 * used during the handshake, unless the endpoint has acted on a
+	 * preferred_address transport parameter from the peer.
+	 */
+	if (qc->li->bind_conf->quic_params.disable_active_migration) {
+		TRACE_ERROR("Active migration was disabled, datagram dropped", QUIC_EV_CONN_LPKT, qc);
+		goto err;
+	}
+
+	/* RFC 9000 9. Connection Migration
+	 *
+	 * The design of QUIC relies on endpoints retaining a stable address for
+	 * the duration of the handshake. An endpoint MUST NOT initiate
+	 * connection migration before the handshake is confirmed, as defined in
+	 * Section 4.1.2 of [QUIC-TLS].
+	 */
+	if (qc->state < QUIC_HS_ST_COMPLETE) {
+		TRACE_STATE("Connection migration during handshake rejected", QUIC_EV_CONN_LPKT, qc);
+		goto err;
+	}
+
+	/* RFC 9000 9. Connection Migration
+	 *
+	 * TODO
+	 * An endpoint MUST
+	 * perform path validation (Section 8.2) if it detects any change to a
+	 * peer's address, unless it has previously validated that address.
+	 */
+
+	/* Update the quic-conn owned socket if in use.
+	 * TODO try to reuse it instead of closing and opening a new one.
+	 */
+	if (qc_test_fd(qc)) {
+		TRACE_STATE("Connection migration detected, allocate a new connection socket", QUIC_EV_CONN_LPKT, qc);
+		qc_release_fd(qc, 1);
+		/* TODO need to adjust <jobs> on socket allocation failure. */
+		qc_alloc_fd(qc, local_addr, peer_addr);
+	}
+
+	qc->local_addr = *local_addr;
+	qc->peer_addr = *peer_addr;
+	qc->cntrs.conn_migration_done++;
+
+	TRACE_LEAVE(QUIC_EV_CONN_LPKT, qc);
+	return 0;
+
+ err:
+	TRACE_LEAVE(QUIC_EV_CONN_LPKT, qc);
+	return 1;
+}
+
+
+/* Update the proxy counters of <qc> QUIC connection from its counters */
+static inline void quic_conn_prx_cntrs_update(struct quic_conn *qc)
+{
+	if (!qc->prx_counters)
+		return;
+
+	HA_ATOMIC_ADD(&qc->prx_counters->dropped_pkt, qc->cntrs.dropped_pkt);
+	HA_ATOMIC_ADD(&qc->prx_counters->dropped_pkt_bufoverrun, qc->cntrs.dropped_pkt_bufoverrun);
+	HA_ATOMIC_ADD(&qc->prx_counters->dropped_parsing, qc->cntrs.dropped_parsing);
+	HA_ATOMIC_ADD(&qc->prx_counters->socket_full, qc->cntrs.socket_full);
+	HA_ATOMIC_ADD(&qc->prx_counters->sendto_err, qc->cntrs.sendto_err);
+	HA_ATOMIC_ADD(&qc->prx_counters->sendto_err_unknown, qc->cntrs.sendto_err_unknown);
+	HA_ATOMIC_ADD(&qc->prx_counters->sent_pkt, qc->cntrs.sent_pkt);
+	/* It is possible that ->path was not initialized, for instance if the
+	 * QUIC connection allocation failed.
+	 */
+	if (qc->path)
+		HA_ATOMIC_ADD(&qc->prx_counters->lost_pkt, qc->path->loss.nb_lost_pkt);
+	HA_ATOMIC_ADD(&qc->prx_counters->conn_migration_done, qc->cntrs.conn_migration_done);
+	/* Stream related counters */
+	HA_ATOMIC_ADD(&qc->prx_counters->data_blocked, qc->cntrs.data_blocked);
+	HA_ATOMIC_ADD(&qc->prx_counters->stream_data_blocked, qc->cntrs.stream_data_blocked);
+	HA_ATOMIC_ADD(&qc->prx_counters->streams_blocked_bidi, qc->cntrs.streams_blocked_bidi);
+	HA_ATOMIC_ADD(&qc->prx_counters->streams_blocked_uni, qc->cntrs.streams_blocked_uni);
+}
+
+/* Release the quic_conn <qc>. The connection is removed from the CIDs tree.
+ * The connection tasklet is killed.
+ *
+ * This function must only be called by the thread responsible for the
+ * quic_conn tasklet.
+ */
+void quic_conn_release(struct quic_conn *qc)
+{
+	struct eb64_node *node;
+	struct quic_rx_packet *pkt, *pktback;
+	struct quic_conn_closed *cc_qc;
+
+	TRACE_ENTER(QUIC_EV_CONN_CLOSE, qc);
+
+	if (!qc)
+		goto leave;
+
+	/* We must not free the quic-conn if the MUX is still allocated. */
+	BUG_ON(qc->mux_state == QC_MUX_READY);
+
+	cc_qc = NULL;
+	if ((qc->flags & QUIC_FL_CONN_CLOSING) && !(qc->flags & QUIC_FL_CONN_EXP_TIMER) &&
+	    qc->tx.cc_buf_area)
+		cc_qc = qc_new_cc_conn(qc);
+
+	if (!cc_qc) {
+		task_destroy(qc->idle_timer_task);
+		qc->idle_timer_task = NULL;
+		tasklet_free(qc->wait_event.tasklet);
+		/* remove the connection from receiver cids trees */
+		free_quic_conn_cids(qc);
+		pool_free(pool_head_quic_cids, qc->cids);
+		qc->cids = NULL;
+		pool_free(pool_head_quic_cc_buf, qc->tx.cc_buf_area);
+		qc->tx.cc_buf_area = NULL;
+	}
+
+	if (qc_test_fd(qc))
+		_HA_ATOMIC_DEC(&jobs);
+
+	/* Close quic-conn socket fd. */
+	qc_release_fd(qc, 0);
+
+	/* in the unlikely (but possible) case the connection was just added to
+	 * the accept_list we must delete it from there.
+	 */
+	if (MT_LIST_INLIST(&qc->accept_list)) {
+		MT_LIST_DELETE(&qc->accept_list);
+		BUG_ON(qc->li->rx.quic_curr_accept == 0);
+		HA_ATOMIC_DEC(&qc->li->rx.quic_curr_accept);
+	}
+
+	/* free remaining stream descriptors */
+	node = eb64_first(&qc->streams_by_id);
+	while (node) {
+		struct qc_stream_desc *stream;
+
+		stream = eb64_entry(node, struct qc_stream_desc, by_id);
+		node = eb64_next(node);
+
+		/* all streams attached to the quic-conn are released, so
+		 * qc_stream_desc_free() will free the stream instance.
+		 */
+		BUG_ON(!stream->release);
+		qc_stream_desc_free(stream, 1);
+	}
+
+	/* free the SSL sock context */
+	qc_free_ssl_sock_ctx(&qc->xprt_ctx);
+	/* Purge Rx packet list. */
+	list_for_each_entry_safe(pkt, pktback, &qc->rx.pkt_list, qc_rx_pkt_list) {
+		LIST_DELETE(&pkt->qc_rx_pkt_list);
+		pool_free(pool_head_quic_rx_packet, pkt);
+	}
+
+	task_destroy(qc->timer_task);
+	qc->timer_task = NULL;
+
+	quic_tls_ku_free(qc);
+	if (qc->ael) {
+		struct quic_tls_ctx *actx = &qc->ael->tls_ctx;
+
+		/* Secrets used by keyupdate */
+		pool_free(pool_head_quic_tls_secret, actx->rx.secret);
+		pool_free(pool_head_quic_tls_secret, actx->tx.secret);
+	}
+
+	qc_enc_level_free(qc, &qc->iel);
+	qc_enc_level_free(qc, &qc->eel);
+	qc_enc_level_free(qc, &qc->hel);
+	qc_enc_level_free(qc, &qc->ael);
+
+	quic_tls_ctx_free(&qc->nictx);
+
+	quic_pktns_release(qc, &qc->ipktns);
+	quic_pktns_release(qc, &qc->hpktns);
+	quic_pktns_release(qc, &qc->apktns);
+
+	qc_detach_th_ctx_list(qc, 0);
+
+	quic_conn_prx_cntrs_update(qc);
+	pool_free(pool_head_quic_conn_rxbuf, qc->rx.buf.area);
+	qc->rx.buf.area = NULL;
+
+	/* Connection released before peer address validated. */
+	if (unlikely(!(qc->flags & QUIC_FL_CONN_PEER_VALIDATED_ADDR))) {
+		BUG_ON(!qc->prx_counters->half_open_conn);
+		HA_ATOMIC_DEC(&qc->prx_counters->half_open_conn);
+	}
+
+	/* Connection released before handshake completion. */
+	if (unlikely(qc->state < QUIC_HS_ST_COMPLETE)) {
+		if (qc_is_listener(qc)) {
+			BUG_ON(qc->li->rx.quic_curr_handshake == 0);
+			HA_ATOMIC_DEC(&qc->li->rx.quic_curr_handshake);
+		}
+	}
+
+	pool_free(pool_head_quic_conn, qc);
+	qc = NULL;
+
+	/* Decrement global counters when the quic_conn is deallocated.
+	 * quic_conn_closed instances are not accounted as they run for a short
+	 * time with limited resources.
+	 */
+	_HA_ATOMIC_DEC(&actconn);
+	_HA_ATOMIC_DEC(&global.sslconns);
+
+	TRACE_PROTO("QUIC conn. freed", QUIC_EV_CONN_FREED, qc);
+ leave:
+	TRACE_LEAVE(QUIC_EV_CONN_CLOSE, qc);
+}
+
+/* Initialize the timer task of <qc> QUIC connection.
+ * Returns 1 if succeeded, 0 if not.
+ */
+static int quic_conn_init_timer(struct quic_conn *qc)
+{
+	int ret = 0;
+	/* Attach this task to the same thread ID used for the connection */
+	TRACE_ENTER(QUIC_EV_CONN_NEW, qc);
+
+	qc->timer_task = task_new_here();
+	if (!qc->timer_task) {
+		TRACE_ERROR("timer task allocation failed", QUIC_EV_CONN_NEW, qc);
+		goto leave;
+	}
+
+	qc->timer = TICK_ETERNITY;
+	qc->timer_task->process = qc_process_timer;
+	qc->timer_task->context = qc;
+
+	ret = 1;
+ leave:
+	TRACE_LEAVE(QUIC_EV_CONN_NEW, qc);
+	return ret;
+}
+
+/* Rearm the idle timer or the ack timer (if not already armed) for <qc> QUIC
+ * connection. */
+void qc_idle_timer_do_rearm(struct quic_conn *qc, int arm_ack)
+{
+	unsigned int expire;
+
+	/* It is possible the idle timer task has already been released. */
+	if (!qc->idle_timer_task)
+		return;
+
+	if (qc->flags & (QUIC_FL_CONN_CLOSING|QUIC_FL_CONN_DRAINING)) {
+		/* RFC 9000 10.2. Immediate Close
+		 *
+		 * The closing and draining connection states exist to ensure that
+		 * connections close cleanly and that delayed or reordered packets are
+		 * properly discarded. These states SHOULD persist for at least three
+		 * times the current PTO interval as defined in [QUIC-RECOVERY].
+		 */
+
+		/* The delay is limited to 1s, which should cover most network
+		 * conditions. The process should not be impacted by a
+		 * connection with a high RTT.
+		 */
+		expire = MIN(3 * quic_pto(qc), 1000);
+	}
+	else {
+		/* RFC 9000 10.1. Idle Timeout
+		 *
+		 * To avoid excessively small idle timeout periods, endpoints MUST
+		 * increase the idle timeout period to be at least three times the
+		 * current Probe Timeout (PTO). This allows for multiple PTOs to expire,
+		 * and therefore multiple probes to be sent and lost, prior to idle
+		 * timeout.
+		 */
+		expire = QUIC_MAX(3 * quic_pto(qc), qc->max_idle_timeout);
+	}
+
+	qc->idle_expire = tick_add(now_ms, MS_TO_TICKS(expire));
+	/* Note that the ACK timer is not armed during the handshake. So,
+	 * the handshake expiration date is taken into account only
+	 * when <arm_ack> is false.
+	 */
+	if (arm_ack) {
+		/* Arm the ack timer only if not already armed. */
+		if (!tick_isset(qc->ack_expire)) {
+			qc->ack_expire = tick_add(now_ms, MS_TO_TICKS(QUIC_ACK_DELAY));
+			qc->idle_timer_task->expire = qc->ack_expire;
+			task_queue(qc->idle_timer_task);
+			TRACE_PROTO("ack timer armed", QUIC_EV_CONN_IDLE_TIMER, qc);
+		}
+	}
+	else {
+		qc->idle_timer_task->expire = tick_first(qc->ack_expire, qc->idle_expire);
+		if (qc->state < QUIC_HS_ST_COMPLETE)
+			qc->idle_timer_task->expire = tick_first(qc->hs_expire, qc->idle_expire);
+		task_queue(qc->idle_timer_task);
+		TRACE_PROTO("idle timer armed", QUIC_EV_CONN_IDLE_TIMER, qc);
+	}
+}
+
+/* Rearm the idle timer or the ack timer for <qc> QUIC connection depending on
+ * the <read> and <arm_ack> booleans. The former is set to 1 when receiving a
+ * packet, and 0 when sending one. <arm_ack> is set to 1 if the ack timer is
+ * the one which must be rearmed.
+ */
+void qc_idle_timer_rearm(struct quic_conn *qc, int read, int arm_ack)
+{
+	TRACE_ENTER(QUIC_EV_CONN_IDLE_TIMER, qc);
+
+	if (read) {
+		qc->flags |= QUIC_FL_CONN_IDLE_TIMER_RESTARTED_AFTER_READ;
+	}
+	else {
+		qc->flags &= ~QUIC_FL_CONN_IDLE_TIMER_RESTARTED_AFTER_READ;
+	}
+	qc_idle_timer_do_rearm(qc, arm_ack);
+
+	TRACE_LEAVE(QUIC_EV_CONN_IDLE_TIMER, qc);
+}
+
+/* The task handling the idle timeout */
+struct task *qc_idle_timer_task(struct task *t, void *ctx, unsigned int state)
+{
+	struct quic_conn *qc = ctx;
+
+	TRACE_ENTER(QUIC_EV_CONN_IDLE_TIMER, qc);
+
+	if ((state & TASK_WOKEN_ANY) == TASK_WOKEN_TIMER && !tick_is_expired(t->expire, now_ms))
+		goto requeue;
+
+	if (tick_is_expired(qc->ack_expire, now_ms)) {
+		TRACE_PROTO("ack timer expired", QUIC_EV_CONN_IDLE_TIMER, qc);
+		qc->ack_expire = TICK_ETERNITY;
+		/* Note that ->idle_expire is always set. */
+		t->expire = qc->idle_expire;
+		/* Do not wake up the I/O handler in DRAINING state or if the
+		 * connection must be killed as soon as possible.
+		 */
+		if (!(qc->flags & (QUIC_FL_CONN_DRAINING|QUIC_FL_CONN_TO_KILL))) {
+			qc->flags |= QUIC_FL_CONN_ACK_TIMER_FIRED;
+			tasklet_wakeup(qc->wait_event.tasklet);
+		}
+
+		goto requeue;
+	}
+
+	TRACE_PROTO("idle timer task running", QUIC_EV_CONN_IDLE_TIMER, qc);
+	/* Notify the MUX before setting QUIC_FL_CONN_EXP_TIMER or the MUX
+	 * might free the quic-conn too early via quic_close().
+	 */
+	qc_notify_err(qc);
+
+	/* If the MUX is still alive, keep the quic-conn. The MUX is
+	 * responsible for calling quic_close() to release it.
+	 */
+	qc->flags |= QUIC_FL_CONN_EXP_TIMER;
+	if (qc->mux_state != QC_MUX_READY) {
+		quic_conn_release(qc);
+		qc = NULL;
+	}
+	else {
+		task_destroy(t);
+		qc->idle_timer_task = NULL;
+	}
+
+	t = NULL;
+
+	/* TODO if the quic-conn cannot be freed because of the MUX, we may at
+	 * least clean some parts of it such as the tasklet.
+	 */
+
+ requeue:
+	TRACE_LEAVE(QUIC_EV_CONN_IDLE_TIMER, qc);
+	return t;
+}
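/* Illustrative sketch (not part of the patch): the two expiry rules applied
 * by qc_idle_timer_do_rearm() above, condensed into one helper. In the
 * closing/draining states the RFC 9000 section 10.2 "3 * PTO" persistence is
 * capped at 1s; otherwise the section 10.1 idle timeout is floored at
 * 3 * PTO. Names are illustrative.
 */
static inline unsigned int idle_expiry_sketch_ms(int closing, unsigned int pto_ms,
                                                 unsigned int max_idle_timeout_ms)
{
	if (closing)
		return 3 * pto_ms < 1000 ? 3 * pto_ms : 1000;  /* MIN(3 * PTO, 1s) */

	/* MAX(3 * PTO, max_idle_timeout) */
	return 3 * pto_ms > max_idle_timeout_ms ? 3 * pto_ms : max_idle_timeout_ms;
}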
+/* Initialize the idle timeout task for <qc>.
+ * Returns 1 if succeeded, 0 if not.
+ */
+static int quic_conn_init_idle_timer_task(struct quic_conn *qc,
+                                          struct proxy *px)
+{
+	int ret = 0;
+	int timeout;
+
+	TRACE_ENTER(QUIC_EV_CONN_NEW, qc);
+
+	timeout = px->timeout.client_hs ? px->timeout.client_hs : px->timeout.client;
+	qc->idle_timer_task = task_new_here();
+	if (!qc->idle_timer_task) {
+		TRACE_ERROR("Idle timer task allocation failed", QUIC_EV_CONN_NEW, qc);
+		goto leave;
+	}
+
+	qc->idle_timer_task->process = qc_idle_timer_task;
+	qc->idle_timer_task->context = qc;
+	qc->ack_expire = TICK_ETERNITY;
+	qc->hs_expire = tick_add_ifset(now_ms, MS_TO_TICKS(timeout));
+	qc_idle_timer_rearm(qc, 1, 0);
+	task_queue(qc->idle_timer_task);
+
+	ret = 1;
+ leave:
+	TRACE_LEAVE(QUIC_EV_CONN_NEW, qc);
+	return ret;
+}
+
+/* Return the QUIC version (quic_version struct) with <version> as version
+ * number if supported, or NULL if not.
+ */
+const struct quic_version *qc_supported_version(uint32_t version)
+{
+	int i;
+
+	if (unlikely(!version))
+		return &quic_version_VN_reserved;
+
+	for (i = 0; i < quic_versions_nb; i++)
+		if (quic_versions[i].num == version)
+			return &quic_versions[i];
+
+	return NULL;
+}
+
+/* Check if connection ID <dcid> of length <dcid_len> belongs to <qc> local
+ * CIDs. This can be used to determine if a datagram is addressed to the right
+ * connection instance.
+ *
+ * Returns a boolean value.
+ */
+int qc_check_dcid(struct quic_conn *qc, unsigned char *dcid, size_t dcid_len)
+{
+	const uchar idx = _quic_cid_tree_idx(dcid);
+	struct quic_connection_id *conn_id;
+	struct ebmb_node *node = NULL;
+	struct quic_cid_tree *tree = &quic_cid_trees[idx];
+
+	/* Test against our default CID or client ODCID. */
+	if ((qc->scid.len == dcid_len &&
+	     memcmp(qc->scid.data, dcid, dcid_len) == 0) ||
+	    (qc->odcid.len == dcid_len &&
+	     memcmp(qc->odcid.data, dcid, dcid_len) == 0)) {
+		return 1;
+	}
+
+	/* Test against our other CIDs. This can happen if the client has
+	 * decided to switch to a new one.
+	 *
+	 * TODO to avoid locking, loop through qc.cids as an alternative.
+	 *
+	 * TODO set it to our default CID to avoid this operation next time.
+	 */
+	HA_RWLOCK_RDLOCK(QC_CID_LOCK, &tree->lock);
+	node = ebmb_lookup(&tree->root, dcid, dcid_len);
+	HA_RWLOCK_RDUNLOCK(QC_CID_LOCK, &tree->lock);
+
+	if (node) {
+		conn_id = ebmb_entry(node, struct quic_connection_id, node);
+		if (qc == conn_id->qc)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Wake up the upper layer for sending if all conditions are met:
+ * - room in the congestion window or probe packets to send
+ * - socket FD ready to send or listener socket used
+ *
+ * Returns 1 if the upper layer has been woken up, else 0.
+ */
+int qc_notify_send(struct quic_conn *qc)
+{
+	const struct quic_pktns *pktns = qc->apktns;
+
+	if (qc->subs && qc->subs->events & SUB_RETRY_SEND) {
+		/* RFC 9002 7.5. Probe Timeout
+		 *
+		 * Probe packets MUST NOT be blocked by the congestion controller.
+		 */
+		if ((quic_cc_path_prep_data(qc->path) || pktns->tx.pto_probe) &&
+		    (!qc_test_fd(qc) || !fd_send_active(qc->fd))) {
+			tasklet_wakeup(qc->subs->tasklet);
+			qc->subs->events &= ~SUB_RETRY_SEND;
+			if (!qc->subs->events)
+				qc->subs = NULL;
+
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/* Notify the upper layer of a fatal error which forces to close the connection. */
+void qc_notify_err(struct quic_conn *qc)
+{
+	TRACE_ENTER(QUIC_EV_CONN_CLOSE, qc);
+
+	if (qc->mux_state == QC_MUX_READY) {
+		TRACE_STATE("error notified to mux", QUIC_EV_CONN_CLOSE, qc);
+
+		/* Mark socket as closed. */
+		qc->conn->flags |= CO_FL_ERROR | CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH;
+
+		/* TODO quic-conn layer must stay active until MUX is released.
+		 * Thus, we have to wake up directly to ensure upper stream
+		 * layer will be notified of the error. If a proper separation
+		 * is made between MUX and quic-conn layer, wake up could be
+		 * conducted only with qc.subs.
+		 */
+		tasklet_wakeup(qc->qcc->wait_event.tasklet);
+	}
+
+	TRACE_LEAVE(QUIC_EV_CONN_CLOSE, qc);
+}
+
+/* Move a <qc> QUIC connection and its resources from the current thread to the
+ * new one <new_tid> optionally in association with <new_li> (since it may need
+ * to change when migrating to a thread from a different group, otherwise leave
+ * it NULL). After this call, the connection cannot be dereferenced anymore on
+ * the current thread.
+ *
+ * Returns 0 on success else non-zero.
+ */
+int qc_set_tid_affinity(struct quic_conn *qc, uint new_tid, struct listener *new_li)
+{
+	struct task *t1 = NULL, *t2 = NULL;
+	struct tasklet *t3 = NULL;
+
+	struct quic_connection_id *conn_id;
+	struct eb64_node *node;
+
+	TRACE_ENTER(QUIC_EV_CONN_SET_AFFINITY, qc);
+
+	/* Pre-allocate all required resources. This ensures we do not leave a
+	 * connection with only some of its fields rebound.
+	 */
+	if (((t1 = task_new_on(new_tid)) == NULL) ||
+	    (qc->timer_task && (t2 = task_new_on(new_tid)) == NULL) ||
+	    (t3 = tasklet_new()) == NULL) {
+		goto err;
+	}
+
+	/* Reinit idle timer task. */
+	task_kill(qc->idle_timer_task);
+	t1->expire = qc->idle_timer_task->expire;
+	qc->idle_timer_task = t1;
+	qc->idle_timer_task->process = qc_idle_timer_task;
+	qc->idle_timer_task->context = qc;
+
+	/* Reinit timer task if allocated. */
+	if (qc->timer_task) {
+		task_kill(qc->timer_task);
+		qc->timer_task = t2;
+		qc->timer_task->process = qc_process_timer;
+		qc->timer_task->context = qc;
+	}
+
+	/* Reinit IO tasklet. */
+	if (qc->wait_event.tasklet->state & TASK_IN_LIST)
+		qc->flags |= QUIC_FL_CONN_IO_TO_REQUEUE;
+	tasklet_kill(qc->wait_event.tasklet);
+	/* In most cases quic_conn_app_io_cb is used, but for 0-RTT
+	 * quic_conn_io_cb can still be activated. */
+	t3->process = qc->wait_event.tasklet->process;
+	qc->wait_event.tasklet = t3;
+	qc->wait_event.tasklet->tid = new_tid;
+	qc->wait_event.tasklet->context = qc;
+	qc->wait_event.events = 0;
+
+	/* Rebind the connection FD. */
+	if (qc_test_fd(qc)) {
+		/* Reading is reactivated by the new thread. */
+		fd_migrate_on(qc->fd, new_tid);
+	}
+
+	/* Remove conn from the per-thread list instance. It will be hidden from
+	 * "show quic" until rebinding is completed.
+	 */
+	qc_detach_th_ctx_list(qc, 0);
+
+	node = eb64_first(qc->cids);
+	BUG_ON(!node || eb64_next(node)); /* One and only one CID must be present before affinity rebind. */
+	conn_id = eb64_entry(node, struct quic_connection_id, seq_num);
+
+	/* At this point no connection was accounted for yet on this
+	 * listener so it's OK to just swap the pointer.
+	 */
+	if (new_li && new_li != qc->li)
+		qc->li = new_li;
+
+	/* Rebinding is considered done when the CID points to the new thread. No
+	 * access should be done to the quic-conn instance after it.
+	 */
+	qc->flags |= QUIC_FL_CONN_AFFINITY_CHANGED;
+	HA_ATOMIC_STORE(&conn_id->tid, new_tid);
+	qc = NULL;
+
+	TRACE_LEAVE(QUIC_EV_CONN_SET_AFFINITY, NULL);
+	return 0;
+
+ err:
+	task_destroy(t1);
+	task_destroy(t2);
+	tasklet_free(t3);
+
+	TRACE_DEVEL("leaving on error", QUIC_EV_CONN_SET_AFFINITY, qc);
+	return 1;
+}
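/* Illustrative sketch (not part of the patch): qc_set_tid_affinity() above
 * follows an allocate-all-then-commit pattern -- every replacement resource
 * is obtained before anything is swapped in, so a partial failure never
 * leaves the connection half rebound. The bare shape of that idiom, with
 * hypothetical resources (assuming <stdlib.h>):
 */
#include <stdlib.h>

static int commit_or_rollback_sketch(void)
{
	void *new_timer = NULL, *new_tasklet = NULL;

	if (!(new_timer = malloc(64)) || !(new_tasklet = malloc(64)))
		goto err;         /* nothing swapped in yet, state untouched */

	/* ... commit phase: kill the old instances, install the new ones ... */
	return 0;

 err:
	free(new_timer);          /* free(NULL) is a no-op */
	free(new_tasklet);
	return 1;
}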
+/* Must be called after qc_set_tid_affinity() on the new thread. */
+void qc_finalize_affinity_rebind(struct quic_conn *qc)
+{
+	TRACE_ENTER(QUIC_EV_CONN_SET_AFFINITY, qc);
+
+	/* This function must not be called twice after an affinity rebind. */
+	BUG_ON(!(qc->flags & QUIC_FL_CONN_AFFINITY_CHANGED));
+	qc->flags &= ~QUIC_FL_CONN_AFFINITY_CHANGED;
+
+	/* If the quic_conn is closing it is unnecessary to migrate it as it
+	 * will soon be released. Besides, special care must be taken for
+	 * CLOSING connections (using quic_conn_closed and the
+	 * th_ctx.quic_conns_clo list for instance). This should never occur as
+	 * CLOSING connections are skipped by quic_sock_accept_conn().
+	 */
+	BUG_ON(qc->flags & (QUIC_FL_CONN_CLOSING|QUIC_FL_CONN_DRAINING));
+
+	/* Reinsert connection in ha_thread_ctx global list. */
+	LIST_APPEND(&th_ctx->quic_conns, &qc->el_th_ctx);
+	qc->qc_epoch = HA_ATOMIC_LOAD(&qc_epoch);
+
+	/* Reactivate FD polling if connection socket is active. */
+	qc_want_recv(qc);
+
+	/* Reactivate timer task if needed. */
+	qc_set_timer(qc);
+
+	/* Idle timer task is always active. */
+	task_queue(qc->idle_timer_task);
+
+	/* Reactivate IO tasklet if needed. */
+	if (qc->flags & QUIC_FL_CONN_IO_TO_REQUEUE) {
+		tasklet_wakeup(qc->wait_event.tasklet);
+		qc->flags &= ~QUIC_FL_CONN_IO_TO_REQUEUE;
+	}
+
+	TRACE_LEAVE(QUIC_EV_CONN_SET_AFFINITY, qc);
+}
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/src/quic_frame.c b/src/quic_frame.c
new file mode 100644
index 0000000..61d2c93
--- /dev/null
+++ b/src/quic_frame.c
@@ -0,0 +1,1273 @@
+/*
+ * Copyright 2019 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <string.h>
+
+#include <import/eb64tree.h>
+#include <haproxy/buf-t.h>
+#include <haproxy/chunk.h>
+#include <haproxy/pool.h>
+#include <haproxy/quic_conn-t.h>
+#include <haproxy/quic_enc.h>
+#include <haproxy/quic_frame.h>
+#include <haproxy/quic_rx-t.h>
+#include <haproxy/quic_tp-t.h>
+#include <haproxy/quic_trace.h>
+#include <haproxy/quic_tx.h>
+#include <haproxy/trace.h>
+
+DECLARE_POOL(pool_head_quic_frame, "quic_frame", sizeof(struct quic_frame));
+DECLARE_POOL(pool_head_qf_crypto, "qf_crypto", sizeof(struct qf_crypto));
+
+const char *quic_frame_type_string(enum quic_frame_type ft)
+{
+	switch (ft) {
+	case QUIC_FT_PADDING:
+		return "PADDING";
+	case QUIC_FT_PING:
+		return "PING";
+	case QUIC_FT_ACK:
+		return "ACK";
+	case QUIC_FT_ACK_ECN:
+		return "ACK_ECN";
+	case QUIC_FT_RESET_STREAM:
+		return "RESET_STREAM";
+	case QUIC_FT_STOP_SENDING:
+		return "STOP_SENDING";
+	case QUIC_FT_CRYPTO:
+		return "CRYPTO";
+	case QUIC_FT_NEW_TOKEN:
+		return "NEW_TOKEN";
+
+	case QUIC_FT_STREAM_8:
+		return "STREAM_8";
+	case QUIC_FT_STREAM_9:
+		return "STREAM_9";
+	case QUIC_FT_STREAM_A:
+		return "STREAM_A";
+	case QUIC_FT_STREAM_B:
+		return "STREAM_B";
+	case QUIC_FT_STREAM_C:
+		return "STREAM_C";
+	case QUIC_FT_STREAM_D:
+		return "STREAM_D";
+	case QUIC_FT_STREAM_E:
+		return "STREAM_E";
+	case QUIC_FT_STREAM_F:
+		return "STREAM_F";
+
+	case QUIC_FT_MAX_DATA:
+		return "MAX_DATA";
+	case QUIC_FT_MAX_STREAM_DATA:
+		return "MAX_STREAM_DATA";
+	case QUIC_FT_MAX_STREAMS_BIDI:
+		return "MAX_STREAMS_BIDI";
+	case QUIC_FT_MAX_STREAMS_UNI:
+		return "MAX_STREAMS_UNI";
+	case QUIC_FT_DATA_BLOCKED:
+		return "DATA_BLOCKED";
+	case QUIC_FT_STREAM_DATA_BLOCKED:
+		return "STREAM_DATA_BLOCKED";
+	case QUIC_FT_STREAMS_BLOCKED_BIDI:
+		return "STREAMS_BLOCKED_BIDI";
+	case QUIC_FT_STREAMS_BLOCKED_UNI:
+		return "STREAMS_BLOCKED_UNI";
+	case QUIC_FT_NEW_CONNECTION_ID:
+		return "NEW_CONNECTION_ID";
+	case QUIC_FT_RETIRE_CONNECTION_ID:
+		return "RETIRE_CONNECTION_ID";
+	case QUIC_FT_PATH_CHALLENGE:
+		return "PATH_CHALLENGE";
+	case QUIC_FT_PATH_RESPONSE:
+		return "PATH_RESPONSE";
+	case QUIC_FT_CONNECTION_CLOSE:
+		return "CONNECTION_CLOSE";
+	case QUIC_FT_CONNECTION_CLOSE_APP:
+		return "CONNECTION_CLOSE_APP";
+	case QUIC_FT_HANDSHAKE_DONE:
+		return "HANDSHAKE_DONE";
+	default:
+		return "UNKNOWN";
+	}
+}
+
+static void chunk_cc_phrase_appendf(struct buffer *buf,
+                                    const unsigned char *phr, size_t phrlen)
+{
+	chunk_appendf(buf, " reason_phrase: '");
+	while (phrlen--)
+		chunk_appendf(buf, "%c", *phr++);
+	chunk_appendf(buf, "'");
+}
+
+/* Add traces to <buf> depending on <frm> frame type. */
+void chunk_frm_appendf(struct buffer *buf, const struct quic_frame *frm)
+{
+	chunk_appendf(buf, " %s", quic_frame_type_string(frm->type));
+	switch (frm->type) {
+	case QUIC_FT_CRYPTO:
+	{
+		const struct qf_crypto *crypto_frm = &frm->crypto;
+		chunk_appendf(buf, " cfoff=%llu cflen=%llu",
+		              (ull)crypto_frm->offset, (ull)crypto_frm->len);
+		break;
+	}
+	case QUIC_FT_RESET_STREAM:
+	{
+		const struct qf_reset_stream *rs_frm = &frm->reset_stream;
+		chunk_appendf(buf, " id=%llu app_error_code=%llu final_size=%llu",
+		              (ull)rs_frm->id, (ull)rs_frm->app_error_code, (ull)rs_frm->final_size);
+		break;
+	}
+	case QUIC_FT_STOP_SENDING:
+	{
+		const struct qf_stop_sending *ss_frm = &frm->stop_sending;
+		chunk_appendf(buf, " id=%llu app_error_code=%llu",
+		              (ull)ss_frm->id, (ull)ss_frm->app_error_code);
+		break;
+	}
+	case QUIC_FT_STREAM_8 ... QUIC_FT_STREAM_F:
+	{
+		const struct qf_stream *strm_frm = &frm->stream;
+		chunk_appendf(buf, " uni=%d fin=%d id=%llu off=%llu len=%llu",
+		              !!(strm_frm->id & QUIC_STREAM_FRAME_ID_DIR_BIT),
+		              !!(frm->type & QUIC_STREAM_FRAME_TYPE_FIN_BIT),
+		              (ull)strm_frm->id, (ull)strm_frm->offset.key, (ull)strm_frm->len);
+		break;
+	}
+	case QUIC_FT_MAX_DATA:
+	{
+		const struct qf_max_data *md_frm = &frm->max_data;
+		chunk_appendf(buf, " max_data=%llu", (ull)md_frm->max_data);
+		break;
+	}
+	case QUIC_FT_MAX_STREAM_DATA:
+	{
+		const struct qf_max_stream_data *msd_frm = &frm->max_stream_data;
+		chunk_appendf(buf, " id=%llu max_stream_data=%llu",
+		              (ull)msd_frm->id, (ull)msd_frm->max_stream_data);
+		break;
+	}
+	case QUIC_FT_MAX_STREAMS_BIDI:
+	{
+		const struct qf_max_streams *ms_frm = &frm->max_streams_bidi;
+		chunk_appendf(buf, " max_streams=%llu", (ull)ms_frm->max_streams);
+		break;
+	}
+	case QUIC_FT_MAX_STREAMS_UNI:
+	{
+		const struct qf_max_streams *ms_frm = &frm->max_streams_uni;
+		chunk_appendf(buf, " max_streams=%llu", (ull)ms_frm->max_streams);
+		break;
+	}
+	case QUIC_FT_DATA_BLOCKED:
+	{
+		const struct qf_data_blocked *db_frm = &frm->data_blocked;
+		chunk_appendf(buf, " limit=%llu", (ull)db_frm->limit);
+		break;
+	}
+	case QUIC_FT_STREAM_DATA_BLOCKED:
+	{
+		const struct qf_stream_data_blocked *sdb_frm = &frm->stream_data_blocked;
+		chunk_appendf(buf, " id=%llu limit=%llu",
+		              (ull)sdb_frm->id, (ull)sdb_frm->limit);
+		break;
+	}
+	case QUIC_FT_STREAMS_BLOCKED_BIDI:
+	{
+		const struct qf_streams_blocked *sb_frm = &frm->streams_blocked_bidi;
+		chunk_appendf(buf, " limit=%llu", (ull)sb_frm->limit);
+		break;
+	}
+	case QUIC_FT_STREAMS_BLOCKED_UNI:
+	{
+		const struct qf_streams_blocked *sb_frm = &frm->streams_blocked_uni;
+		chunk_appendf(buf, " limit=%llu", (ull)sb_frm->limit);
+		break;
+	}
+	case QUIC_FT_RETIRE_CONNECTION_ID:
+	{
+		const struct qf_retire_connection_id *rcid_frm = &frm->retire_connection_id;
+		chunk_appendf(buf, " seq_num=%llu", (ull)rcid_frm->seq_num);
+		break;
+	}
+	case QUIC_FT_CONNECTION_CLOSE:
+	{
+		const struct qf_connection_close *cc_frm = &frm->connection_close;
+		size_t plen = QUIC_MIN((size_t)cc_frm->reason_phrase_len, sizeof cc_frm->reason_phrase);
+		chunk_appendf(buf,
+		              " error_code=%llu frame_type=%llu reason_phrase_len=%llu",
+		              (ull)cc_frm->error_code, (ull)cc_frm->frame_type,
+		              (ull)cc_frm->reason_phrase_len);
+		if (plen)
+			chunk_cc_phrase_appendf(buf, cc_frm->reason_phrase, plen);
+		break;
+	}
+	case QUIC_FT_CONNECTION_CLOSE_APP:
+	{
+		const struct qf_connection_close_app *cc_frm = &frm->connection_close_app;
+		size_t plen = QUIC_MIN((size_t)cc_frm->reason_phrase_len, sizeof cc_frm->reason_phrase);
+		chunk_appendf(buf,
+		              " error_code=%llu reason_phrase_len=%llu",
+		              (ull)cc_frm->error_code, (ull)cc_frm->reason_phrase_len);
+		if (plen)
+			chunk_cc_phrase_appendf(buf, cc_frm->reason_phrase, plen);
+		break;
+	}
+	}
+}
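/* Illustrative sketch (not part of the patch): every builder and parser below
 * leans on quic_enc_int()/quic_dec_int() from quic_enc.h, which implement the
 * RFC 9000 section 16 variable-length integer: the two most significant bits
 * of the first byte give the field length (1, 2, 4 or 8 bytes), the remaining
 * bits hold the value in network byte order. A minimal encoder in the same
 * style, for reference only (the real implementation may differ):
 */
static int varint_enc_sketch(unsigned char **pos, const unsigned char *end,
                             unsigned long long val)
{
	size_t i, len;

	if (val < (1ULL << 6))       len = 1; /* prefix 0b00 */
	else if (val < (1ULL << 14)) len = 2; /* prefix 0b01 */
	else if (val < (1ULL << 30)) len = 4; /* prefix 0b10 */
	else if (val < (1ULL << 62)) len = 8; /* prefix 0b11 */
	else return 0;                        /* out of varint range */

	if ((size_t)(end - *pos) < len)       /* assuming *pos <= end */
		return 0;                     /* not enough room */

	/* Big-endian payload; the two MSBs of the first byte encode the length. */
	for (i = 0; i < len; i++)
		(*pos)[i] = val >> (8 * (len - 1 - i));
	(*pos)[0] |= (len == 2 ? 0x40 : len == 4 ? 0x80 : len == 8 ? 0xc0 : 0);
	*pos += len;
	return 1;
}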
+/* Encode <frm> PADDING frame at <pos> buffer position, <end> being one byte
+ * past the end of this buffer.
+ * Returns 1 if succeeded (enough room in the buffer to encode the frame), 0 if not.
+ */
+static int quic_build_padding_frame(unsigned char **pos, const unsigned char *end,
+                                    struct quic_frame *frm, struct quic_conn *conn)
+{
+	struct qf_padding *padding_frm = &frm->padding;
+
+	if (end - *pos < padding_frm->len - 1)
+		return 0;
+
+	memset(*pos, 0, padding_frm->len - 1);
+	*pos += padding_frm->len - 1;
+
+	return 1;
+}
+
+/* Parse a PADDING frame at <pos> buffer position with <end> as end into <frm> frame.
+ * Return 1 if succeeded (enough room to parse this frame), 0 if not.
+ */
+static int quic_parse_padding_frame(struct quic_frame *frm, struct quic_conn *qc,
+                                    const unsigned char **pos, const unsigned char *end)
+{
+	const unsigned char *beg;
+	struct qf_padding *padding_frm = &frm->padding;
+
+	beg = *pos;
+	padding_frm->len = 1;
+	while (*pos < end && !**pos)
+		(*pos)++;
+	padding_frm->len += *pos - beg;
+
+	return 1;
+}
+
+/* Encode a PING frame at <pos> buffer position.
+ * Always succeeds.
+ */
+static int quic_build_ping_frame(unsigned char **pos, const unsigned char *end,
+                                 struct quic_frame *frm, struct quic_conn *conn)
+{
+	/* No field */
+	return 1;
+}
+
+/* Parse a PING frame from <pos> buffer position with <end> as end into <frm> frame.
+ * Always succeeds.
+ */
+static int quic_parse_ping_frame(struct quic_frame *frm, struct quic_conn *qc,
+                                 const unsigned char **pos, const unsigned char *end)
+{
+	/* No field */
+	return 1;
+}
+
+/* Encode an ACK frame.
+ * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not.
+ */
+static int quic_build_ack_frame(unsigned char **pos, const unsigned char *end,
+                                struct quic_frame *frm, struct quic_conn *qc)
+{
+	struct qf_tx_ack *ack_frm = &frm->tx_ack;
+	struct eb64_node *ar, *prev_ar;
+	struct quic_arng_node *ar_node, *prev_ar_node;
+
+	ar = eb64_last(&ack_frm->arngs->root);
+	ar_node = eb64_entry(ar, struct quic_arng_node, first);
+	TRACE_PROTO("TX ack range", QUIC_EV_CONN_PRSAFRM,
+	            qc,, &ar_node->last, &ar_node->first.key);
+	if (!quic_enc_int(pos, end, ar_node->last) ||
+	    !quic_enc_int(pos, end, ack_frm->ack_delay) ||
+	    !quic_enc_int(pos, end, ack_frm->arngs->sz - 1) ||
+	    !quic_enc_int(pos, end, ar_node->last - ar_node->first.key))
+		return 0;
+
+	while ((prev_ar = eb64_prev(ar))) {
+		prev_ar_node = eb64_entry(prev_ar, struct quic_arng_node, first);
+		TRACE_PROTO("TX ack range", QUIC_EV_CONN_PRSAFRM, qc,,
+		            &prev_ar_node->last, &prev_ar_node->first.key);
+		if (!quic_enc_int(pos, end, ar_node->first.key - prev_ar_node->last - 2) ||
+		    !quic_enc_int(pos, end, prev_ar_node->last - prev_ar_node->first.key))
+			return 0;
+
+		ar = prev_ar;
+		ar_node = eb64_entry(ar, struct quic_arng_node, first);
+	}
+
+	return 1;
+}
+
+/* Parse an ACK frame header at <pos> buffer position with <end> as end into <frm> frame.
+ * Return 1 if succeeded (enough room at <pos> buffer position to parse this frame), 0 if not.
+ */
+static int quic_parse_ack_frame_header(struct quic_frame *frm, struct quic_conn *qc,
+                                       const unsigned char **pos, const unsigned char *end)
+{
+	int ret;
+	struct qf_ack *ack_frm = &frm->ack;
+
+	ret = quic_dec_int(&ack_frm->largest_ack, pos, end);
+	if (!ret)
+		return 0;
+
+	ret = quic_dec_int(&ack_frm->ack_delay, pos, end);
+	if (!ret)
+		return 0;
+
+	ret = quic_dec_int(&ack_frm->ack_range_num, pos, end);
+	if (!ret)
+		return 0;
+
+	ret = quic_dec_int(&ack_frm->first_ack_range, pos, end);
+	if (!ret)
+		return 0;
+
+	return 1;
+}
+/* Encode an ACK_ECN frame.
+ * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not.
+ */
+static int quic_build_ack_ecn_frame(unsigned char **pos, const unsigned char *end,
+                                    struct quic_frame *frm, struct quic_conn *conn)
+{
+	struct qf_ack *ack_frm = &frm->ack;
+
+	return quic_enc_int(pos, end, ack_frm->largest_ack) &&
+		quic_enc_int(pos, end, ack_frm->ack_delay) &&
+		quic_enc_int(pos, end, ack_frm->first_ack_range) &&
+		quic_enc_int(pos, end, ack_frm->ack_range_num);
+}
+
+/* Parse an ACK_ECN frame at <pos> buffer position with <end> as end into <frm> frame.
+ * Return 1 if succeeded (enough room at <pos> buffer position to parse this frame), 0 if not.
+ */
+static int quic_parse_ack_ecn_frame(struct quic_frame *frm, struct quic_conn *qc,
+                                    const unsigned char **pos, const unsigned char *end)
+{
+	struct qf_ack *ack_frm = &frm->ack;
+
+	return quic_dec_int(&ack_frm->largest_ack, pos, end) &&
+		quic_dec_int(&ack_frm->ack_delay, pos, end) &&
+		quic_dec_int(&ack_frm->first_ack_range, pos, end) &&
+		quic_dec_int(&ack_frm->ack_range_num, pos, end);
+}
+
+/* Encode a RESET_STREAM frame at <pos> buffer position.
+ * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not.
+ */
+static int quic_build_reset_stream_frame(unsigned char **pos, const unsigned char *end,
+                                         struct quic_frame *frm, struct quic_conn *conn)
+{
+	struct qf_reset_stream *rs_frm = &frm->reset_stream;
+
+	return quic_enc_int(pos, end, rs_frm->id) &&
+		quic_enc_int(pos, end, rs_frm->app_error_code) &&
+		quic_enc_int(pos, end, rs_frm->final_size);
+}
+
+/* Parse a RESET_STREAM frame at <pos> buffer position with <end> as end into <frm> frame.
+ * Return 1 if succeeded (enough room to parse this frame), 0 if not.
+ */
+static int quic_parse_reset_stream_frame(struct quic_frame *frm, struct quic_conn *qc,
+                                         const unsigned char **pos, const unsigned char *end)
+{
+	struct qf_reset_stream *rs_frm = &frm->reset_stream;
+
+	return quic_dec_int(&rs_frm->id, pos, end) &&
+		quic_dec_int(&rs_frm->app_error_code, pos, end) &&
+		quic_dec_int(&rs_frm->final_size, pos, end);
+}
+
+/* Encode a STOP_SENDING frame.
+ * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not.
+ */
+static int quic_build_stop_sending_frame(unsigned char **pos, const unsigned char *end,
+                                         struct quic_frame *frm, struct quic_conn *conn)
+{
+	struct qf_stop_sending *ss_frm = &frm->stop_sending;
+
+	return quic_enc_int(pos, end, ss_frm->id) &&
+		quic_enc_int(pos, end, ss_frm->app_error_code);
+}
+
+/* Parse a STOP_SENDING frame at <pos> buffer position with <end> as end into <frm> frame.
+ * Return 1 if succeeded (enough room at <pos> buffer position to parse this frame), 0 if not.
+ */
+static int quic_parse_stop_sending_frame(struct quic_frame *frm, struct quic_conn *qc,
+                                         const unsigned char **pos, const unsigned char *end)
+{
+	struct qf_stop_sending *ss_frm = &frm->stop_sending;
+
+	return quic_dec_int(&ss_frm->id, pos, end) &&
+		quic_dec_int(&ss_frm->app_error_code, pos, end);
+}
+/* Encode a CRYPTO frame at <pos> buffer position.
+ * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not.
+ */
+static int quic_build_crypto_frame(unsigned char **pos, const unsigned char *end,
+                                   struct quic_frame *frm, struct quic_conn *conn)
+{
+	struct qf_crypto *crypto_frm = &frm->crypto;
+	const struct quic_enc_level *qel = crypto_frm->qel;
+	size_t offset, len;
+
+	if (!quic_enc_int(pos, end, crypto_frm->offset) ||
+	    !quic_enc_int(pos, end, crypto_frm->len) || end - *pos < crypto_frm->len)
+		return 0;
+
+	len = crypto_frm->len;
+	offset = crypto_frm->offset;
+	while (len) {
+		int idx;
+		size_t to_copy;
+		const unsigned char *data;
+
+		idx = offset >> QUIC_CRYPTO_BUF_SHIFT;
+		to_copy = qel->tx.crypto.bufs[idx]->sz - (offset & QUIC_CRYPTO_BUF_MASK);
+		if (to_copy > len)
+			to_copy = len;
+		data = qel->tx.crypto.bufs[idx]->data + (offset & QUIC_CRYPTO_BUF_MASK);
+		memcpy(*pos, data, to_copy);
+		*pos += to_copy;
+		offset += to_copy;
+		len -= to_copy;
+	}
+
+	return 1;
+}
+
+/* Parse a CRYPTO frame from <pos> buffer position with <end> as end into <frm> frame.
+ * Return 1 if succeeded (enough room to parse this frame), 0 if not.
+ */
+static int quic_parse_crypto_frame(struct quic_frame *frm, struct quic_conn *qc,
+                                   const unsigned char **pos, const unsigned char *end)
+{
+	struct qf_crypto *crypto_frm = &frm->crypto;
+
+	if (!quic_dec_int(&crypto_frm->offset, pos, end) ||
+	    !quic_dec_int(&crypto_frm->len, pos, end) || end - *pos < crypto_frm->len)
+		return 0;
+
+	crypto_frm->data = *pos;
+	*pos += crypto_frm->len;
+
+	return 1;
+}
+
+/* Encode a NEW_TOKEN frame at <pos> buffer position.
+ * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not.
+ */
+static int quic_build_new_token_frame(unsigned char **pos, const unsigned char *end,
+                                      struct quic_frame *frm, struct quic_conn *conn)
+{
+	struct qf_new_token *new_token_frm = &frm->new_token;
+
+	if (!quic_enc_int(pos, end, new_token_frm->len) || end - *pos < new_token_frm->len)
+		return 0;
+
+	memcpy(*pos, new_token_frm->data, new_token_frm->len);
+	*pos += new_token_frm->len;
+
+	return 1;
+}
+
+/* Parse a NEW_TOKEN frame at <pos> buffer position with <end> as end into <frm> frame.
+ * Return 1 if succeeded (enough room at <pos> buffer position to parse this frame), 0 if not.
+ */
+static int quic_parse_new_token_frame(struct quic_frame *frm, struct quic_conn *qc,
+                                      const unsigned char **pos, const unsigned char *end)
+{
+	struct qf_new_token *new_token_frm = &frm->new_token;
+
+	if (!quic_dec_int(&new_token_frm->len, pos, end) || end - *pos < new_token_frm->len)
+		return 0;
+
+	new_token_frm->data = *pos;
+	*pos += new_token_frm->len;
+
+	return 1;
+}
+
+/* Encode a STREAM frame at <pos> buffer position.
+ * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not.
+ */
+static int quic_build_stream_frame(unsigned char **pos, const unsigned char *end,
+                                   struct quic_frame *frm, struct quic_conn *conn)
+{
+	struct qf_stream *strm_frm = &frm->stream;
+	const unsigned char *wrap;
+
+	/* Caller must set OFF bit if and only if a non-null offset is used. */
+	BUG_ON(!!(frm->type & QUIC_STREAM_FRAME_TYPE_OFF_BIT) !=
+	       !!strm_frm->offset.key);
+
+	if (!quic_enc_int(pos, end, strm_frm->id) ||
+	    ((frm->type & QUIC_STREAM_FRAME_TYPE_OFF_BIT) && !quic_enc_int(pos, end, strm_frm->offset.key)) ||
+	    ((frm->type & QUIC_STREAM_FRAME_TYPE_LEN_BIT) &&
+	     (!quic_enc_int(pos, end, strm_frm->len) || end - *pos < strm_frm->len)))
+		return 0;
+
+	/* No need for data memcpy if no payload. */
+	if (!strm_frm->len)
+		return 1;
+
+	wrap = (const unsigned char *)b_wrap(strm_frm->buf);
+	if (strm_frm->data + strm_frm->len > wrap) {
+		size_t to_copy = wrap - strm_frm->data;
+		memcpy(*pos, strm_frm->data, to_copy);
+		*pos += to_copy;
+
+		to_copy = strm_frm->len - to_copy;
+		memcpy(*pos, b_orig(strm_frm->buf), to_copy);
+		*pos += to_copy;
+	}
+	else {
+		memcpy(*pos, strm_frm->data, strm_frm->len);
+		*pos += strm_frm->len;
+	}
+
+	return 1;
+}
+
+/* Parse a STREAM frame at <pos> buffer position with <end> as end into <frm> frame.
+ * Return 1 if succeeded (enough room at <pos> buffer position to parse this frame), 0 if not.
+ */
+static int quic_parse_stream_frame(struct quic_frame *frm, struct quic_conn *qc,
+                                   const unsigned char **pos, const unsigned char *end)
+{
+	struct qf_stream *strm_frm = &frm->stream;
+
+	if (!quic_dec_int(&strm_frm->id, pos, end))
+		return 0;
+
+	/* Offset parsing */
+	if (!(frm->type & QUIC_STREAM_FRAME_TYPE_OFF_BIT)) {
+		strm_frm->offset.key = 0;
+	}
+	else if (!quic_dec_int((uint64_t *)&strm_frm->offset.key, pos, end))
+		return 0;
+
+	/* Length parsing */
+	if (!(frm->type & QUIC_STREAM_FRAME_TYPE_LEN_BIT)) {
+		strm_frm->len = end - *pos;
+	}
+	else if (!quic_dec_int(&strm_frm->len, pos, end) || end - *pos < strm_frm->len)
+		return 0;
+
+	strm_frm->data = *pos;
+	*pos += strm_frm->len;
+
+	return 1;
+}
+
+/* Encode a MAX_DATA frame at <pos> buffer position.
+ * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not.
+ */
+static int quic_build_max_data_frame(unsigned char **pos, const unsigned char *end,
+                                     struct quic_frame *frm, struct quic_conn *conn)
+{
+	struct qf_max_data *md_frm = &frm->max_data;
+
+	return quic_enc_int(pos, end, md_frm->max_data);
+}
+
+/* Parse a MAX_DATA frame at <pos> buffer position with <end> as end into <frm> frame.
+ * Return 1 if succeeded (enough room to parse this frame), 0 if not.
+ */
+static int quic_parse_max_data_frame(struct quic_frame *frm, struct quic_conn *qc,
+                                     const unsigned char **pos, const unsigned char *end)
+{
+	struct qf_max_data *md_frm = &frm->max_data;
+
+	return quic_dec_int(&md_frm->max_data, pos, end);
+}
+
+/* Encode a MAX_STREAM_DATA frame at <pos> buffer position.
+ * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not.
+ */
+static int quic_build_max_stream_data_frame(unsigned char **pos, const unsigned char *end,
+                                            struct quic_frame *frm, struct quic_conn *conn)
+{
+	struct qf_max_stream_data *msd_frm = &frm->max_stream_data;
+
+	return quic_enc_int(pos, end, msd_frm->id) &&
+		quic_enc_int(pos, end, msd_frm->max_stream_data);
+}
+
+/* Parse a MAX_STREAM_DATA frame at <pos> buffer position with <end> as end into <frm> frame.
+ * Return 1 if succeeded (enough room to parse this frame), 0 if not.
+ */
+static int quic_parse_max_stream_data_frame(struct quic_frame *frm, struct quic_conn *qc,
+                                            const unsigned char **pos, const unsigned char *end)
+{
+	struct qf_max_stream_data *msd_frm = &frm->max_stream_data;
+
+	return quic_dec_int(&msd_frm->id, pos, end) &&
+		quic_dec_int(&msd_frm->max_stream_data, pos, end);
+}
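/* Illustrative sketch (not part of the patch): the wrapped copy performed by
 * quic_build_stream_frame() above. When the payload crosses the end of the
 * circular buffer it is emitted as two contiguous memcpy() calls, one up to
 * the wrap point and one from the start of the storage. A generic,
 * self-contained version of the same idea (names are illustrative):
 */
static void ring_copy_out_sketch(unsigned char *dst, const unsigned char *area,
                                 size_t size, size_t head, size_t len)
{
	/* bytes available before the wrap point */
	size_t first = size - head < len ? size - head : len;

	memcpy(dst, area + head, first);         /* up to the wrap point */
	memcpy(dst + first, area, len - first);  /* remainder from the start */
}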
+/* Encode a MAX_STREAMS frame for bidirectional streams at <pos> buffer position.
+ * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not.
+ */
+static int quic_build_max_streams_bidi_frame(unsigned char **pos, const unsigned char *end,
+                                             struct quic_frame *frm, struct quic_conn *conn)
+{
+	struct qf_max_streams *ms_frm = &frm->max_streams_bidi;
+
+	return quic_enc_int(pos, end, ms_frm->max_streams);
+}
+
+/* Parse a MAX_STREAMS frame for bidirectional streams at <pos> buffer position with <end>
+ * as end into <frm> frame.
+ * Return 1 if succeeded (enough room to parse this frame), 0 if not.
+ */
+static int quic_parse_max_streams_bidi_frame(struct quic_frame *frm, struct quic_conn *qc,
+                                             const unsigned char **pos, const unsigned char *end)
+{
+	struct qf_max_streams *ms_frm = &frm->max_streams_bidi;
+
+	return quic_dec_int(&ms_frm->max_streams, pos, end);
+}
+
+/* Encode a MAX_STREAMS frame for unidirectional streams at <pos> buffer position.
+ * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not.
+ */
+static int quic_build_max_streams_uni_frame(unsigned char **pos, const unsigned char *end,
+                                            struct quic_frame *frm, struct quic_conn *conn)
+{
+	struct qf_max_streams *ms_frm = &frm->max_streams_uni;
+
+	return quic_enc_int(pos, end, ms_frm->max_streams);
+}
+
+/* Parse a MAX_STREAMS frame for unidirectional streams at <pos> buffer position with <end>
+ * as end into <frm> frame.
+ * Return 1 if succeeded (enough room to parse this frame), 0 if not.
+ */
+static int quic_parse_max_streams_uni_frame(struct quic_frame *frm, struct quic_conn *qc,
+                                            const unsigned char **pos, const unsigned char *end)
+{
+	struct qf_max_streams *ms_frm = &frm->max_streams_uni;
+
+	return quic_dec_int(&ms_frm->max_streams, pos, end);
+}
+
+/* Encode a DATA_BLOCKED frame at <pos> buffer position.
+ * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not.
+ */
+static int quic_build_data_blocked_frame(unsigned char **pos, const unsigned char *end,
+                                         struct quic_frame *frm, struct quic_conn *conn)
+{
+	struct qf_data_blocked *db_frm = &frm->data_blocked;
+
+	return quic_enc_int(pos, end, db_frm->limit);
+}
+
+/* Parse a DATA_BLOCKED frame at <pos> buffer position with <end> as end into <frm> frame.
+ * Return 1 if succeeded (enough room to parse this frame), 0 if not.
+ */
+static int quic_parse_data_blocked_frame(struct quic_frame *frm, struct quic_conn *qc,
+                                         const unsigned char **pos, const unsigned char *end)
+{
+	struct qf_data_blocked *db_frm = &frm->data_blocked;
+
+	return quic_dec_int(&db_frm->limit, pos, end);
+}
+
+/* Encode a STREAM_DATA_BLOCKED frame at <pos> buffer position.
+ * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not.
+ */
+static int quic_build_stream_data_blocked_frame(unsigned char **pos, const unsigned char *end,
+                                                struct quic_frame *frm, struct quic_conn *conn)
+{
+	struct qf_stream_data_blocked *sdb_frm = &frm->stream_data_blocked;
+
+	return quic_enc_int(pos, end, sdb_frm->id) &&
+		quic_enc_int(pos, end, sdb_frm->limit);
+}
+
+/* Parse a STREAM_DATA_BLOCKED frame at <pos> buffer position with <end> as end into <frm> frame.
+ * Return 1 if succeeded (enough room to parse this frame), 0 if not.
+ */
+static int quic_parse_stream_data_blocked_frame(struct quic_frame *frm, struct quic_conn *qc,
+                                                const unsigned char **pos, const unsigned char *end)
+{
+	struct qf_stream_data_blocked *sdb_frm = &frm->stream_data_blocked;
+
+	return quic_dec_int(&sdb_frm->id, pos, end) &&
+		quic_dec_int(&sdb_frm->limit, pos, end);
+}
+
+/* Encode a STREAMS_BLOCKED frame for bidirectional streams at <pos> buffer position.
+ * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not. + */ +static int quic_build_streams_blocked_bidi_frame(unsigned char **pos, const unsigned char *end, + struct quic_frame *frm, struct quic_conn *conn) +{ + struct qf_streams_blocked *sb_frm = &frm->streams_blocked_bidi; + + return quic_enc_int(pos, end, sb_frm->limit); +} + +/* Parse a STREAMS_BLOCKED frame for bidirectional streams at <pos> buffer position with <end> + * as end into <frm> frame. + * Return 1 if succeeded (enough room at <pos> buffer position to parse this frame), 0 if not. + */ +static int quic_parse_streams_blocked_bidi_frame(struct quic_frame *frm, struct quic_conn *qc, + const unsigned char **pos, const unsigned char *end) +{ + struct qf_streams_blocked *sb_frm = &frm->streams_blocked_bidi; + + return quic_dec_int(&sb_frm->limit, pos, end); +} + +/* Encode a STREAMS_BLOCKED frame for unidirectional streams at <pos> buffer position. + * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not. + */ +static int quic_build_streams_blocked_uni_frame(unsigned char **pos, const unsigned char *end, + struct quic_frame *frm, struct quic_conn *conn) +{ + struct qf_streams_blocked *sb_frm = &frm->streams_blocked_uni; + + return quic_enc_int(pos, end, sb_frm->limit); +} + +/* Parse a STREAMS_BLOCKED frame for unidirectional streams at <pos> buffer position with <end> + * as end into <frm> frame. + * Return 1 if succeeded (enough room at <pos> buffer position to parse this frame), 0 if not. + */ +static int quic_parse_streams_blocked_uni_frame(struct quic_frame *frm, struct quic_conn *qc, + const unsigned char **pos, const unsigned char *end) +{ + struct qf_streams_blocked *sb_frm = &frm->streams_blocked_uni; + + return quic_dec_int(&sb_frm->limit, pos, end); +} + +/* Encode a NEW_CONNECTION_ID frame at <pos> buffer position. + * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not. + */ +static int quic_build_new_connection_id_frame(unsigned char **pos, const unsigned char *end, + struct quic_frame *frm, struct quic_conn *conn) +{ + struct qf_new_connection_id *ncid_frm = &frm->new_connection_id; + + if (!quic_enc_int(pos, end, ncid_frm->seq_num) || + !quic_enc_int(pos, end, ncid_frm->retire_prior_to) || + end - *pos < sizeof ncid_frm->cid.len + ncid_frm->cid.len + QUIC_STATELESS_RESET_TOKEN_LEN) + return 0; + + *(*pos)++ = ncid_frm->cid.len; + + if (ncid_frm->cid.len) { + memcpy(*pos, ncid_frm->cid.data, ncid_frm->cid.len); + *pos += ncid_frm->cid.len; + } + memcpy(*pos, ncid_frm->stateless_reset_token, QUIC_STATELESS_RESET_TOKEN_LEN); + *pos += QUIC_STATELESS_RESET_TOKEN_LEN; + + return 1; +} + +/* Parse a NEW_CONNECTION_ID frame at <pos> buffer position with <end> as end into <frm> frame. + * Return 1 if succeeded (enough room to parse this frame), 0 if not. 
+ */ +static int quic_parse_new_connection_id_frame(struct quic_frame *frm, struct quic_conn *qc, + const unsigned char **pos, const unsigned char *end) +{ + struct qf_new_connection_id *ncid_frm = &frm->new_connection_id; + + if (!quic_dec_int(&ncid_frm->seq_num, pos, end) || + !quic_dec_int(&ncid_frm->retire_prior_to, pos, end) || end <= *pos) + return 0; + + ncid_frm->cid.len = *(*pos)++; + if (end - *pos < ncid_frm->cid.len + QUIC_STATELESS_RESET_TOKEN_LEN) + return 0; + + if (ncid_frm->cid.len) { + ncid_frm->cid.data = *pos; + *pos += ncid_frm->cid.len; + } + ncid_frm->stateless_reset_token = *pos; + *pos += QUIC_STATELESS_RESET_TOKEN_LEN; + + return 1; +} + +/* Encode a RETIRE_CONNECTION_ID frame at <pos> buffer position. + * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not. + */ +static int quic_build_retire_connection_id_frame(unsigned char **pos, const unsigned char *end, + struct quic_frame *frm, struct quic_conn *conn) +{ + struct qf_retire_connection_id *rcid_frm = &frm->retire_connection_id; + + return quic_enc_int(pos, end, rcid_frm->seq_num); +} + +/* Parse a RETIRE_CONNECTION_ID frame at <pos> buffer position with <end> as end into <frm> frame. + * Return 1 if succeeded (enough room to parse this frame), 0 if not. + */ +static int quic_parse_retire_connection_id_frame(struct quic_frame *frm, struct quic_conn *qc, + const unsigned char **pos, const unsigned char *end) +{ + struct qf_retire_connection_id *rcid_frm = &frm->retire_connection_id; + + return quic_dec_int(&rcid_frm->seq_num, pos, end); +} + +/* Encode a PATH_CHALLENGE frame at <pos> buffer position. + * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not. + */ +static int quic_build_path_challenge_frame(unsigned char **pos, const unsigned char *end, + struct quic_frame *frm, struct quic_conn *conn) +{ + struct qf_path_challenge *pc_frm = &frm->path_challenge; + + if (end - *pos < sizeof pc_frm->data) + return 0; + + memcpy(*pos, pc_frm->data, sizeof pc_frm->data); + *pos += sizeof pc_frm->data; + + return 1; +} + +/* Parse a PATH_CHALLENGE frame at <pos> buffer position with <end> as end into <frm> frame. + * Return 1 if succeeded (enough room at <pos> buffer position to parse this frame), 0 if not. + */ +static int quic_parse_path_challenge_frame(struct quic_frame *frm, struct quic_conn *qc, + const unsigned char **pos, const unsigned char *end) +{ + struct qf_path_challenge *pc_frm = &frm->path_challenge; + + if (end - *pos < sizeof pc_frm->data) + return 0; + + memcpy(pc_frm->data, *pos, sizeof pc_frm->data); + *pos += sizeof pc_frm->data; + + return 1; +} + + +/* Encode a PATH_RESPONSE frame at <pos> buffer position. + * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not. + */ +static int quic_build_path_response_frame(unsigned char **pos, const unsigned char *end, + struct quic_frame *frm, struct quic_conn *conn) +{ + struct qf_path_challenge_response *pcr_frm = &frm->path_challenge_response; + + if (end - *pos < sizeof pcr_frm->data) + return 0; + + memcpy(*pos, pcr_frm->data, sizeof pcr_frm->data); + *pos += sizeof pcr_frm->data; + + return 1; +} + +/* Parse a PATH_RESPONSE frame at <pos> buffer position with <end> as end into <frm> frame. + * Return 1 if succeeded (enough room at <pos> buffer position to parse this frame), 0 if not. 
+ */ +static int quic_parse_path_response_frame(struct quic_frame *frm, struct quic_conn *qc, + const unsigned char **pos, const unsigned char *end) +{ + struct qf_path_challenge_response *pcr_frm = &frm->path_challenge_response; + + if (end - *pos < sizeof pcr_frm->data) + return 0; + + memcpy(pcr_frm->data, *pos, sizeof pcr_frm->data); + *pos += sizeof pcr_frm->data; + + return 1; +} + +/* Encode a CONNECTION_CLOSE frame at QUIC layer at <pos> buffer position. + * Note there exist two types of CONNECTION_CLOSE frame, one for the application layer + * and another at QUIC layer. + * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not. + */ +static int quic_build_connection_close_frame(unsigned char **pos, const unsigned char *end, + struct quic_frame *frm, struct quic_conn *conn) +{ + struct qf_connection_close *cc_frm = &frm->connection_close; + + if (!quic_enc_int(pos, end, cc_frm->error_code) || + !quic_enc_int(pos, end, cc_frm->frame_type) || + !quic_enc_int(pos, end, cc_frm->reason_phrase_len) || + end - *pos < cc_frm->reason_phrase_len) + return 0; + + memcpy(*pos, cc_frm->reason_phrase, cc_frm->reason_phrase_len); + *pos += cc_frm->reason_phrase_len; + + return 1; +} + +/* Parse a CONNECTION_CLOSE frame at QUIC layer at <pos> buffer position with <end> as end into <frm> frame. + * Note there exist two types of CONNECTION_CLOSE frame, one for the application layer + * and another at QUIC layer. + * Return 1 if succeeded (enough room at <pos> buffer position to parse this frame), 0 if not. + */ +static int quic_parse_connection_close_frame(struct quic_frame *frm, struct quic_conn *qc, + const unsigned char **pos, const unsigned char *end) +{ + size_t plen; + struct qf_connection_close *cc_frm = &frm->connection_close; + + if (!quic_dec_int(&cc_frm->error_code, pos, end) || + !quic_dec_int(&cc_frm->frame_type, pos, end) || + !quic_dec_int(&cc_frm->reason_phrase_len, pos, end) || + end - *pos < cc_frm->reason_phrase_len) + return 0; + + plen = QUIC_MIN((size_t)cc_frm->reason_phrase_len, sizeof cc_frm->reason_phrase); + memcpy(cc_frm->reason_phrase, *pos, plen); + *pos += cc_frm->reason_phrase_len; + + return 1; +} + +/* Encode a CONNECTION_CLOSE frame at application layer at <pos> buffer position. + * Note there exist two types of CONNECTION_CLOSE frame, one for application layer + * and another at QUIC layer. + * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not. + */ +static int quic_build_connection_close_app_frame(unsigned char **pos, const unsigned char *end, + struct quic_frame *frm, struct quic_conn *conn) +{ + struct qf_connection_close_app *cc_frm = &frm->connection_close_app; + + if (!quic_enc_int(pos, end, cc_frm->error_code) || + !quic_enc_int(pos, end, cc_frm->reason_phrase_len) || + end - *pos < cc_frm->reason_phrase_len) + return 0; + + memcpy(*pos, cc_frm->reason_phrase, cc_frm->reason_phrase_len); + *pos += cc_frm->reason_phrase_len; + + return 1; +} + +/* Parse a CONNECTION_CLOSE frame at application layer at <pos> buffer position with <end> as end into <frm> frame. + * Note there exist two types of CONNECTION_CLOSE frame, one for the application layer + * and another at QUIC layer. + * Return 1 if succeeded (enough room at <pos> buffer position to parse this frame), 0 if not.
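/* Editorial sketch, not part of the patch: both CONNECTION_CLOSE parsers above
 * clamp the stored reason phrase with QUIC_MIN() while still advancing <pos>
 * by the full on-wire length, so an oversized phrase is truncated without
 * desynchronizing the parser. The pattern in isolation, with hypothetical
 * names:
 */
#include <stddef.h>
#include <string.h>

#define MY_MIN(a, b) ((a) < (b) ? (a) : (b))

static const unsigned char *copy_truncated(unsigned char *dst, size_t dstlen,
                                           const unsigned char *src, size_t srclen)
{
	memcpy(dst, src, MY_MIN(srclen, dstlen)); /* keep at most <dstlen> bytes */
	return src + srclen;                      /* but always consume <srclen> */
}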
+ */ +static int quic_parse_connection_close_app_frame(struct quic_frame *frm, struct quic_conn *qc, + const unsigned char **pos, const unsigned char *end) +{ + size_t plen; + struct qf_connection_close_app *cc_frm = &frm->connection_close_app; + + if (!quic_dec_int(&cc_frm->error_code, pos, end) || + !quic_dec_int(&cc_frm->reason_phrase_len, pos, end) || + end - *pos < cc_frm->reason_phrase_len) + return 0; + + plen = QUIC_MIN((size_t)cc_frm->reason_phrase_len, sizeof cc_frm->reason_phrase); + memcpy(cc_frm->reason_phrase, *pos, plen); + *pos += cc_frm->reason_phrase_len; + + return 1; +} + +/* Encode a HANDSHAKE_DONE frame at <pos> buffer position. + * Always succeeds. + */ +static int quic_build_handshake_done_frame(unsigned char **pos, const unsigned char *end, + struct quic_frame *frm, struct quic_conn *conn) +{ + /* No field */ + return 1; +} + +/* Parse a HANDSHAKE_DONE frame at QUIC layer at <pos> buffer position with <end> as end into <frm> frame. + * Always succeed. + */ +static int quic_parse_handshake_done_frame(struct quic_frame *frm, struct quic_conn *qc, + const unsigned char **pos, const unsigned char *end) +{ + /* No field */ + return 1; +} + +struct quic_frame_builder { + int (*func)(unsigned char **pos, const unsigned char *end, + struct quic_frame *frm, struct quic_conn *conn); + uint32_t mask; + unsigned char flags; +}; + +const struct quic_frame_builder quic_frame_builders[] = { + [QUIC_FT_PADDING] = { .func = quic_build_padding_frame, .flags = QUIC_FL_TX_PACKET_PADDING, .mask = QUIC_FT_PKT_TYPE_IH01_BITMASK, }, + [QUIC_FT_PING] = { .func = quic_build_ping_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE_IH01_BITMASK, }, + [QUIC_FT_ACK] = { .func = quic_build_ack_frame, .flags = 0, .mask = QUIC_FT_PKT_TYPE_IH_1_BITMASK, }, + [QUIC_FT_ACK_ECN] = { .func = quic_build_ack_ecn_frame, .flags = 0, .mask = QUIC_FT_PKT_TYPE_IH_1_BITMASK, }, + [QUIC_FT_RESET_STREAM] = { .func = quic_build_reset_stream_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STOP_SENDING] = { .func = quic_build_stop_sending_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_CRYPTO] = { .func = quic_build_crypto_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE_IH_1_BITMASK, }, + [QUIC_FT_NEW_TOKEN] = { .func = quic_build_new_token_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE____1_BITMASK, }, + [QUIC_FT_STREAM_8] = { .func = quic_build_stream_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAM_9] = { .func = quic_build_stream_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAM_A] = { .func = quic_build_stream_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAM_B] = { .func = quic_build_stream_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAM_C] = { .func = quic_build_stream_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAM_D] = { .func = quic_build_stream_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAM_E] = { .func = quic_build_stream_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAM_F] = { .func = quic_build_stream_frame, 
.flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_MAX_DATA] = { .func = quic_build_max_data_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_MAX_STREAM_DATA] = { .func = quic_build_max_stream_data_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_MAX_STREAMS_BIDI] = { .func = quic_build_max_streams_bidi_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_MAX_STREAMS_UNI] = { .func = quic_build_max_streams_uni_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_DATA_BLOCKED] = { .func = quic_build_data_blocked_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAM_DATA_BLOCKED] = { .func = quic_build_stream_data_blocked_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAMS_BLOCKED_BIDI] = { .func = quic_build_streams_blocked_bidi_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAMS_BLOCKED_UNI] = { .func = quic_build_streams_blocked_uni_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_NEW_CONNECTION_ID] = { .func = quic_build_new_connection_id_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_RETIRE_CONNECTION_ID] = { .func = quic_build_retire_connection_id_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_PATH_CHALLENGE] = { .func = quic_build_path_challenge_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_PATH_RESPONSE] = { .func = quic_build_path_response_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_CONNECTION_CLOSE] = { .func = quic_build_connection_close_frame, .flags = 0, .mask = QUIC_FT_PKT_TYPE_IH01_BITMASK, }, + [QUIC_FT_CONNECTION_CLOSE_APP] = { .func = quic_build_connection_close_app_frame, .flags = 0, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_HANDSHAKE_DONE] = { .func = quic_build_handshake_done_frame, .flags = QUIC_FL_TX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE____1_BITMASK, }, +}; + +struct quic_frame_parser { + int (*func)(struct quic_frame *frm, struct quic_conn *qc, + const unsigned char **pos, const unsigned char *end); + uint32_t mask; + unsigned char flags; +}; + +const struct quic_frame_parser quic_frame_parsers[] = { + [QUIC_FT_PADDING] = { .func = quic_parse_padding_frame, .flags = 0, .mask = QUIC_FT_PKT_TYPE_IH01_BITMASK, }, + [QUIC_FT_PING] = { .func = quic_parse_ping_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE_IH01_BITMASK, }, + [QUIC_FT_ACK] = { .func = quic_parse_ack_frame_header, .flags = 0, .mask = QUIC_FT_PKT_TYPE_IH_1_BITMASK, }, + [QUIC_FT_ACK_ECN] = { .func = quic_parse_ack_ecn_frame, .flags = 0, .mask = QUIC_FT_PKT_TYPE_IH_1_BITMASK, }, + [QUIC_FT_RESET_STREAM] = { .func = quic_parse_reset_stream_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STOP_SENDING] = { .func = quic_parse_stop_sending_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_CRYPTO] = { .func = quic_parse_crypto_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = 
QUIC_FT_PKT_TYPE_IH_1_BITMASK, }, + [QUIC_FT_NEW_TOKEN] = { .func = quic_parse_new_token_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE____1_BITMASK, }, + [QUIC_FT_STREAM_8] = { .func = quic_parse_stream_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAM_9] = { .func = quic_parse_stream_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAM_A] = { .func = quic_parse_stream_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAM_B] = { .func = quic_parse_stream_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAM_C] = { .func = quic_parse_stream_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAM_D] = { .func = quic_parse_stream_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAM_E] = { .func = quic_parse_stream_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAM_F] = { .func = quic_parse_stream_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_MAX_DATA] = { .func = quic_parse_max_data_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_MAX_STREAM_DATA] = { .func = quic_parse_max_stream_data_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_MAX_STREAMS_BIDI] = { .func = quic_parse_max_streams_bidi_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_MAX_STREAMS_UNI] = { .func = quic_parse_max_streams_uni_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_DATA_BLOCKED] = { .func = quic_parse_data_blocked_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAM_DATA_BLOCKED] = { .func = quic_parse_stream_data_blocked_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAMS_BLOCKED_BIDI] = { .func = quic_parse_streams_blocked_bidi_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_STREAMS_BLOCKED_UNI] = { .func = quic_parse_streams_blocked_uni_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_NEW_CONNECTION_ID] = { .func = quic_parse_new_connection_id_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_RETIRE_CONNECTION_ID] = { .func = quic_parse_retire_connection_id_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_PATH_CHALLENGE] = { .func = quic_parse_path_challenge_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_PATH_RESPONSE] = { .func = quic_parse_path_response_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_CONNECTION_CLOSE] = { .func = quic_parse_connection_close_frame, .flags = 0, .mask = QUIC_FT_PKT_TYPE_IH01_BITMASK, }, + [QUIC_FT_CONNECTION_CLOSE_APP] = { .func = quic_parse_connection_close_app_frame, .flags = 0, .mask = QUIC_FT_PKT_TYPE___01_BITMASK, }, + [QUIC_FT_HANDSHAKE_DONE] = { .func = 
quic_parse_handshake_done_frame, .flags = QUIC_FL_RX_PACKET_ACK_ELICITING, .mask = QUIC_FT_PKT_TYPE____1_BITMASK, }, +}; + +/* Decode a QUIC frame at <pos> buffer position into <frm> frame. + * Returns 1 if succeeded (enough data at <pos> buffer position to parse the frame), 0 if not. + */ +int qc_parse_frm(struct quic_frame *frm, struct quic_rx_packet *pkt, + const unsigned char **pos, const unsigned char *end, + struct quic_conn *qc) +{ + int ret = 0; + const struct quic_frame_parser *parser; + + TRACE_ENTER(QUIC_EV_CONN_PRSFRM, qc); + if (end <= *pos) { + TRACE_DEVEL("wrong frame", QUIC_EV_CONN_PRSFRM, qc); + goto leave; + } + + frm->type = *(*pos)++; + if (frm->type >= QUIC_FT_MAX) { + TRACE_DEVEL("wrong frame type", QUIC_EV_CONN_PRSFRM, qc, frm); + goto leave; + } + + parser = &quic_frame_parsers[frm->type]; + if (!(parser->mask & (1U << pkt->type))) { + TRACE_DEVEL("unauthorized frame", QUIC_EV_CONN_PRSFRM, qc, frm); + goto leave; + } + + if (!parser->func(frm, qc, pos, end)) { + TRACE_DEVEL("parsing error", QUIC_EV_CONN_PRSFRM, qc, frm); + goto leave; + } + + TRACE_PROTO("RX frm", QUIC_EV_CONN_PSTRM, qc, frm); + + pkt->flags |= parser->flags; + + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_PRSFRM, qc); + return ret; +} + +/* Encode <frm> QUIC frame at <pos> buffer position. + * Returns 1 if succeeded (enough room at <pos> buffer position to encode the frame), 0 if not. + * The buffer is updated to point to one byte past the end of the built frame + * only if succeeded. + */ +int qc_build_frm(unsigned char **pos, const unsigned char *end, + struct quic_frame *frm, struct quic_tx_packet *pkt, + struct quic_conn *qc) +{ + int ret = 0; + const struct quic_frame_builder *builder; + unsigned char *p = *pos; + + TRACE_ENTER(QUIC_EV_CONN_BFRM, qc); + builder = &quic_frame_builders[frm->type]; + if (!(builder->mask & (1U << pkt->type))) { + /* XXX It is a bug to send an unauthorized frame with such a packet type XXX */ + TRACE_ERROR("unauthorized frame", QUIC_EV_CONN_BFRM, qc, frm); + BUG_ON(!(builder->mask & (1U << pkt->type))); + } + + if (end <= p) { + TRACE_DEVEL("not enough room", QUIC_EV_CONN_BFRM, qc, frm); + goto leave; + } + + TRACE_PROTO("TX frm", QUIC_EV_CONN_BFRM, qc, frm); + *p++ = frm->type; + if (!quic_frame_builders[frm->type].func(&p, end, frm, qc)) { + TRACE_ERROR("frame building error", QUIC_EV_CONN_BFRM, qc, frm); + goto leave; + } + + pkt->flags |= builder->flags; + *pos = p; + + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_BFRM, qc); + return ret; +} + +/* Detach all duplicated frames from <frm> reflist. */ +void qc_frm_unref(struct quic_frame *frm, struct quic_conn *qc) +{ + struct quic_frame *f, *tmp; + + TRACE_ENTER(QUIC_EV_CONN_PRSAFRM, qc, frm); + + list_for_each_entry_safe(f, tmp, &frm->reflist, ref) { + f->origin = NULL; + LIST_DEL_INIT(&f->ref); + if (f->pkt) { + TRACE_DEVEL("remove frame reference", + QUIC_EV_CONN_PRSAFRM, qc, f, &f->pkt->pn_node.key); + } + else { + TRACE_DEVEL("remove frame reference for unsent frame", + QUIC_EV_CONN_PRSAFRM, qc, f); + } + } + + TRACE_LEAVE(QUIC_EV_CONN_PRSAFRM, qc); +} + +/* Free a <frm> quic_frame. Remove it from its parent element if still attached. */ +void qc_frm_free(struct quic_conn *qc, struct quic_frame **frm) +{ + + TRACE_ENTER(QUIC_EV_CONN_PRSAFRM, qc, *frm); + /* Caller must ensure that no other frame points to <frm>. Use + * qc_frm_unref() to handle this properly. + */ + BUG_ON(!LIST_ISEMPTY(&((*frm)->reflist))); + BUG_ON(LIST_INLIST(&((*frm)->ref))); + + /* TODO simplify frame deallocation.
In some code paths, we must + * manually call this LIST_DEL_INIT before using + * quic_tx_packet_refdec() and freeing the frame. + */ + LIST_DEL_INIT(&((*frm)->list)); + + pool_free(pool_head_quic_frame, *frm); + *frm = NULL; + TRACE_LEAVE(QUIC_EV_CONN_PRSAFRM, qc); +} + +/* Release <frm> frame and mark its copies as acknowledged */ +void qc_release_frm(struct quic_conn *qc, struct quic_frame *frm) +{ + uint64_t pn; + struct quic_frame *origin, *f, *tmp; + + TRACE_ENTER(QUIC_EV_CONN_PRSAFRM, qc, frm); + + /* Identify this frame: it is either an original frame or one of its copies */ + origin = frm->origin ? frm->origin : frm; + /* Ensure the source of the copies is flagged as acked, <frm> being + * possibly a copy of <origin> + */ + origin->flags |= QUIC_FL_TX_FRAME_ACKED; + /* Mark all the copies of <origin> as acknowledged. We must + * not release the packets (releasing the frames) at this time as + * they are possibly also to be acknowledged alongside + * the current one. + */ + list_for_each_entry_safe(f, tmp, &origin->reflist, ref) { + if (f->pkt) { + f->flags |= QUIC_FL_TX_FRAME_ACKED; + f->origin = NULL; + LIST_DEL_INIT(&f->ref); + pn = f->pkt->pn_node.key; + TRACE_DEVEL("mark frame as acked from packet", + QUIC_EV_CONN_PRSAFRM, qc, f, &pn); + } + else { + TRACE_DEVEL("freeing unsent frame", + QUIC_EV_CONN_PRSAFRM, qc, f); + LIST_DEL_INIT(&f->ref); + qc_frm_free(qc, &f); + } + } + LIST_DEL_INIT(&frm->list); + pn = frm->pkt->pn_node.key; + quic_tx_packet_refdec(frm->pkt); + TRACE_DEVEL("freeing frame from packet", + QUIC_EV_CONN_PRSAFRM, qc, frm, &pn); + qc_frm_free(qc, &frm); + + TRACE_LEAVE(QUIC_EV_CONN_PRSAFRM, qc); +} + diff --git a/src/quic_loss.c b/src/quic_loss.c new file mode 100644 index 0000000..fd9568a --- /dev/null +++ b/src/quic_loss.c @@ -0,0 +1,312 @@ +#include <import/eb64tree.h> + +#include <haproxy/quic_conn-t.h> +#include <haproxy/quic_loss.h> +#include <haproxy/quic_tls.h> +#include <haproxy/quic_trace.h> + +#include <haproxy/atomic.h> +#include <haproxy/list.h> +#include <haproxy/ticks.h> +#include <haproxy/trace.h> + +/* Update <ql> QUIC loss information with a new <rtt> measurement and <ack_delay>, + * which MUST be min(ack->ack_delay, max_ack_delay) before the handshake is + * confirmed, upon ACK frame receipt. + */ +void quic_loss_srtt_update(struct quic_loss *ql, + unsigned int rtt, unsigned int ack_delay, + struct quic_conn *qc) +{ + TRACE_ENTER(QUIC_EV_CONN_RTTUPDT, qc); + TRACE_PROTO("TX loss srtt update", QUIC_EV_CONN_RTTUPDT, qc, &rtt, &ack_delay, ql); + + ql->latest_rtt = rtt; + if (!ql->rtt_min) { + /* No previous measurement. */ + ql->srtt = rtt; + ql->rtt_var = rtt / 2; + ql->rtt_min = rtt; + } + else { + int diff; + + ql->rtt_min = QUIC_MIN(rtt, ql->rtt_min); + /* Specific to QUIC (RTT adjustment). */ + if (ack_delay && rtt >= ql->rtt_min + ack_delay) + rtt -= ack_delay; + diff = ql->srtt - rtt; + if (diff < 0) + diff = -diff; + ql->rtt_var = (3 * ql->rtt_var + diff) / 4; + ql->srtt = (7 * ql->srtt + rtt) / 8; + } + + TRACE_PROTO("TX loss srtt update", QUIC_EV_CONN_RTTUPDT, qc,,, ql); + TRACE_LEAVE(QUIC_EV_CONN_RTTUPDT, qc); +} + +/* Returns for <qc> QUIC connection the first packet number space which + * experienced packet loss, if any, or a packet number space with + * TICK_ETERNITY as packet loss time if none.
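/* Editorial note, not part of the patch: quic_loss_srtt_update() above is the
 * RFC 9002 section 5.3 estimator in integer arithmetic:
 *
 *   rttvar = (3 * rttvar + |srtt - adjusted_rtt|) / 4
 *   srtt   = (7 * srtt + adjusted_rtt) / 8
 *
 * Worked example with hypothetical values srtt=40ms, rttvar=10ms and a new
 * sample of 56ms after the ack_delay adjustment:
 *
 *   rttvar = (3 * 10 + |40 - 56|) / 4 = 46 / 4 = 11 ms
 *   srtt   = (7 * 40 + 56) / 8 = 336 / 8 = 42 ms
 *
 * Note that <diff> is computed against the old <srtt> before the latter is
 * updated, exactly as the RFC requires.
 */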
+ */ +struct quic_pktns *quic_loss_pktns(struct quic_conn *qc) +{ + struct quic_pktns *pktns, *p; + + TRACE_ENTER(QUIC_EV_CONN_SPTO, qc); + + BUG_ON(LIST_ISEMPTY(&qc->pktns_list)); + pktns = p = LIST_NEXT(&qc->pktns_list, struct quic_pktns *, list); + + do { + TRACE_PROTO("TX loss pktns", QUIC_EV_CONN_SPTO, qc, p); + if (!tick_isset(pktns->tx.loss_time) || + tick_is_lt(p->tx.loss_time, pktns->tx.loss_time)) { + pktns = p; + } + p = LIST_NEXT(&p->list, struct quic_pktns *, list); + } while (&p->list != &qc->pktns_list); + + TRACE_LEAVE(QUIC_EV_CONN_SPTO, qc); + + return pktns; +} + +/* Returns for <qc> QUIC connection the first packet number space to + * arm the PTO for, if any, or a packet number space with TICK_ETERNITY + * as PTO value if none. + */ +struct quic_pktns *quic_pto_pktns(struct quic_conn *qc, + int handshake_confirmed, + unsigned int *pto) +{ + unsigned int duration, lpto; + struct quic_loss *ql = &qc->path->loss; + struct quic_pktns *pktns, *p; + + TRACE_ENTER(QUIC_EV_CONN_SPTO, qc); + + BUG_ON(LIST_ISEMPTY(&qc->pktns_list)); + duration = + ql->srtt + + (QUIC_MAX(4 * ql->rtt_var, QUIC_TIMER_GRANULARITY) << ql->pto_count); + + /* RFC 9002 6.2.2.1. Before Address Validation + * + * the client MUST set the PTO timer if the client has not received an + * acknowledgment for any of its Handshake packets and the handshake is + * not confirmed (see Section 4.1.2 of [QUIC-TLS]), even if there are no + * packets in flight. + * + * TODO implement the above paragraph for QUIC on backend side. Note + * that if now_ms is used this function is not reentrant anymore and + * cannot be used anytime without side-effect (for example after QUIC + * connection migration). + */ + + lpto = TICK_ETERNITY; + pktns = p = LIST_NEXT(&qc->pktns_list, struct quic_pktns *, list); + + do { + unsigned int tmp_pto; + + if (p->tx.in_flight) { + if (p == qc->apktns) { + if (!handshake_confirmed) { + TRACE_STATE("TX PTO handshake not already confirmed", QUIC_EV_CONN_SPTO, qc); + goto out; + } + + duration += qc->max_ack_delay << ql->pto_count; + } + + tmp_pto = tick_add(p->tx.time_of_last_eliciting, duration); + if (!tick_isset(lpto) || tick_is_lt(tmp_pto, lpto)) { + lpto = tmp_pto; + pktns = p; + } + + TRACE_PROTO("TX PTO", QUIC_EV_CONN_SPTO, qc, p); + } + + p = LIST_NEXT(&p->list, struct quic_pktns *, list); + } while (&p->list != &qc->pktns_list); + + out: + if (pto) + *pto = lpto; + TRACE_PROTO("TX PTO", QUIC_EV_CONN_SPTO, qc, pktns, &duration); + TRACE_LEAVE(QUIC_EV_CONN_SPTO, qc); + + return pktns; +} + +/* Look for packet loss among the packets already sent from <pktns> packet + * number space of <qc> connection. Packets deemed lost are removed from their + * tree and appended to <lost_pkts> list; for the remaining ones, the + * <loss_time> of the packet number space is set to the time at which the + * oldest of them will be deemed lost. + * Should be called after having received an ACK frame with newly acknowledged + * packets or when the loss detection timer has expired. + * Always succeeds.
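/* Editorial note, not part of the patch: the PTO duration computed by
 * quic_pto_pktns() above follows RFC 9002 section 6.2.1:
 *
 *   PTO = srtt + (max(4 * rttvar, kGranularity) << pto_count)
 *         [+ (max_ack_delay << pto_count) for the application space only]
 *
 * With hypothetical values srtt=42ms, rttvar=11ms, granularity=1ms,
 * max_ack_delay=25ms and pto_count=1 (one probe already sent), the
 * application-space PTO would be 42 + (44 << 1) + (25 << 1) = 180 ms,
 * armed relative to <time_of_last_eliciting>.
 */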
+ */ +void qc_packet_loss_lookup(struct quic_pktns *pktns, struct quic_conn *qc, + struct list *lost_pkts) +{ + struct eb_root *pkts; + struct eb64_node *node; + struct quic_loss *ql; + unsigned int loss_delay; + uint64_t pktthresh; + + TRACE_ENTER(QUIC_EV_CONN_PKTLOSS, qc); + TRACE_PROTO("TX loss", QUIC_EV_CONN_PKTLOSS, qc, pktns); + pkts = &pktns->tx.pkts; + pktns->tx.loss_time = TICK_ETERNITY; + if (eb_is_empty(pkts)) + goto out; + + ql = &qc->path->loss; + loss_delay = QUIC_MAX(ql->latest_rtt, ql->srtt); + loss_delay = QUIC_MAX(loss_delay, MS_TO_TICKS(QUIC_TIMER_GRANULARITY)) * + QUIC_LOSS_TIME_THRESHOLD_MULTIPLICAND / QUIC_LOSS_TIME_THRESHOLD_DIVISOR; + + node = eb64_first(pkts); + + /* RFC 9002 6.1.1. Packet Threshold + * The RECOMMENDED initial value for the packet reordering threshold + * (kPacketThreshold) is 3, based on best practices for TCP loss detection + * [RFC5681] [RFC6675]. In order to remain similar to TCP, implementations + * SHOULD NOT use a packet threshold less than 3; see [RFC5681]. + * + * Some networks may exhibit higher degrees of packet reordering, causing a + * sender to detect spurious losses. Additionally, packet reordering could be + * more common with QUIC than TCP because network elements that could observe + * and reorder TCP packets cannot do that for QUIC and also because QUIC + * packet numbers are encrypted. + */ + + /* Dynamic packet reordering threshold calculation depending on the distance + * (in packets) between the last transmitted packet and the oldest still in + * flight before loss detection. + */ + pktthresh = pktns->tx.next_pn - 1 - eb64_entry(node, struct quic_tx_packet, pn_node)->pn_node.key; + /* Apply a ratio to this threshold and add it to QUIC_LOSS_PACKET_THRESHOLD. */ + pktthresh = pktthresh * global.tune.quic_reorder_ratio / 100 + QUIC_LOSS_PACKET_THRESHOLD; + while (node) { + struct quic_tx_packet *pkt; + int64_t largest_acked_pn; + unsigned int loss_time_limit, time_sent; + int reordered; + + pkt = eb64_entry(&node->node, struct quic_tx_packet, pn_node); + largest_acked_pn = pktns->rx.largest_acked_pn; + node = eb64_next(node); + if ((int64_t)pkt->pn_node.key > largest_acked_pn) + break; + + time_sent = pkt->time_sent; + loss_time_limit = tick_add(time_sent, loss_delay); + + reordered = (int64_t)largest_acked_pn >= pkt->pn_node.key + pktthresh; + if (reordered) + ql->nb_reordered_pkt++; + + if (tick_is_le(loss_time_limit, now_ms) || reordered) { + eb64_delete(&pkt->pn_node); + LIST_APPEND(lost_pkts, &pkt->list); + ql->nb_lost_pkt++; + } + else { + if (tick_isset(pktns->tx.loss_time)) + pktns->tx.loss_time = tick_first(pktns->tx.loss_time, loss_time_limit); + else + pktns->tx.loss_time = loss_time_limit; + break; + } + } + + out: + TRACE_PROTO("TX loss", QUIC_EV_CONN_PKTLOSS, qc, pktns, lost_pkts); + TRACE_LEAVE(QUIC_EV_CONN_PKTLOSS, qc); +} + +/* Handle <pkts> list of lost packets detected at <now_us>, handling their TX + * frames. Send a packet loss event to the congestion controller if in-flight + * packets have been lost. Also frees the packets in <pkts> list. + * + * Returns 1 on success, else 0 if the loss limit has been exceeded, in which + * case a CONNECTION_CLOSE frame has been prepared to close the connection ASAP.
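/* Editorial note, not part of the patch: qc_packet_loss_lookup() above
 * combines the two RFC 9002 section 6.1 loss conditions. Time threshold: a
 * packet is lost once it was sent more than
 * max(latest_rtt, srtt) * MULTIPLICAND / DIVISOR ago (presumably the
 * recommended 9/8 ratio). Packet threshold: at least
 * QUIC_LOSS_PACKET_THRESHOLD (kPacketThreshold, 3) packets, plus a
 * quic_reorder_ratio share of the current in-flight window, must have been
 * acknowledged after it. E.g. with next_pn=101, oldest in-flight pn=1 and
 * quic_reorder_ratio=50%, pktthresh = (101 - 1 - 1) * 50 / 100 + 3 = 52, so
 * packet 1 is declared lost by reordering once packet 53 or above has been
 * acknowledged.
 */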
+ */ +int qc_release_lost_pkts(struct quic_conn *qc, struct quic_pktns *pktns, + struct list *pkts, uint64_t now_us) +{ + struct quic_tx_packet *pkt, *tmp, *oldest_lost, *newest_lost; + int close = 0; + + TRACE_ENTER(QUIC_EV_CONN_PRSAFRM, qc); + + if (LIST_ISEMPTY(pkts)) + goto leave; + + oldest_lost = newest_lost = NULL; + list_for_each_entry_safe(pkt, tmp, pkts, list) { + struct list tmp = LIST_HEAD_INIT(tmp); + + pkt->pktns->tx.in_flight -= pkt->in_flight_len; + qc->path->prep_in_flight -= pkt->in_flight_len; + qc->path->in_flight -= pkt->in_flight_len; + if (pkt->flags & QUIC_FL_TX_PACKET_ACK_ELICITING) + qc->path->ifae_pkts--; + /* Treat the frames of this lost packet. */ + if (!qc_handle_frms_of_lost_pkt(qc, pkt, &pktns->tx.frms)) + close = 1; + LIST_DELETE(&pkt->list); + if (!oldest_lost) { + oldest_lost = newest_lost = pkt; + } + else { + if (newest_lost != oldest_lost) + quic_tx_packet_refdec(newest_lost); + newest_lost = pkt; + } + } + + if (!close) { + if (newest_lost) { + /* Send a congestion event to the controller */ + struct quic_cc_event ev = { }; + + ev.type = QUIC_CC_EVT_LOSS; + ev.loss.time_sent = newest_lost->time_sent; + + quic_cc_event(&qc->path->cc, &ev); + } + + /* If an RTT has already been sampled, <rtt_min> has been set. + * We must check whether we are experiencing persistent congestion. + * If this is the case, the congestion controller must re-enter the + * slow start state. + */ + if (qc->path->loss.rtt_min && newest_lost != oldest_lost) { + unsigned int period = newest_lost->time_sent - oldest_lost->time_sent; + + if (quic_loss_persistent_congestion(&qc->path->loss, period, + now_ms, qc->max_ack_delay)) + qc->path->cc.algo->slow_start(&qc->path->cc); + } + } + + /* <oldest_lost> cannot be NULL at this stage because we have ensured + * that <pkts> list is not empty. Without this, GCC 12.2.0 reports a + * possible overflow on a 0 byte region with O2 optimization. + */ + ALREADY_CHECKED(oldest_lost); + quic_tx_packet_refdec(oldest_lost); + if (newest_lost != oldest_lost) + quic_tx_packet_refdec(newest_lost); + + leave: + TRACE_LEAVE(QUIC_EV_CONN_PRSAFRM, qc); + return !close; +} diff --git a/src/quic_openssl_compat.c b/src/quic_openssl_compat.c new file mode 100644 index 0000000..d914ac4 --- /dev/null +++ b/src/quic_openssl_compat.c @@ -0,0 +1,531 @@ +#ifndef USE_QUIC +#error "Must define USE_QUIC" +#endif + +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +#include <haproxy/openssl-compat.h> +/* Heavily inspired by the nginx QUIC TLS compatibility code */ +#include <openssl/kdf.h> + +#include <haproxy/quic_conn.h> +#include <haproxy/quic_tls.h> +#include <haproxy/quic_trace.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/trace.h> + +#ifndef HAVE_SSL_KEYLOG +#error "HAVE_SSL_KEYLOG is not defined" +#endif + +#define QUIC_OPENSSL_COMPAT_RECORD_SIZE 1024 + +#define QUIC_TLS_KEY_LABEL "key" +#define QUIC_TLS_IV_LABEL "iv" + +struct quic_tls_compat_record { + unsigned char type; + const unsigned char *payload; + size_t payload_len; + uint64_t number; + struct quic_tls_compat_keys *keys; +}; + +/* Callback used to set the local transport parameters into the TLS stack. + * Must be called after they have been set at the QUIC connection level.
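/* Editorial sketch, not part of the patch: quic_loss_persistent_congestion()
 * is called above with the time span between the oldest and newest lost
 * packets. Per RFC 9002 section 7.6 the intent is that congestion is
 * persistent when that span exceeds kPersistentCongestionThreshold (3) times
 * the PTO; a minimal predicate under those assumptions (the real helper may
 * differ in details):
 */
static inline int persistent_congestion(unsigned int period, unsigned int srtt,
                                        unsigned int rtt_var,
                                        unsigned int max_ack_delay)
{
	/* PTO without backoff; timer granularity assumed to be 1ms */
	unsigned int pto = srtt + (4 * rtt_var > 1 ? 4 * rtt_var : 1) + max_ack_delay;

	return period >= 3 * pto;
}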
+ */ +static int qc_ssl_compat_add_tps_cb(SSL *ssl, unsigned int ext_type, unsigned int context, + const unsigned char **out, size_t *outlen, + X509 *x, size_t chainidx, int *al, void *add_arg) +{ + struct quic_conn *qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index); + + TRACE_ENTER(QUIC_EV_CONN_SSL_COMPAT, qc); + + *out = qc->enc_params; + *outlen = qc->enc_params_len; + + TRACE_LEAVE(QUIC_EV_CONN_SSL_COMPAT, qc); + return 1; +} + +/* Set the keylog callback used to derive TLS secrets and the callback + * used to pass local transport parameters to the TLS stack. + * Return 1 if succeeded, 0 if not. + */ +int quic_tls_compat_init(struct bind_conf *bind_conf, SSL_CTX *ctx) +{ + /* Ignore non-QUIC connections */ + if (bind_conf->xprt != xprt_get(XPRT_QUIC)) + return 1; + + /* This callback is already registered if the TLS keylog is activated for + * traffic decryption analysis. + */ + if (!global_ssl.keylog) + SSL_CTX_set_keylog_callback(ctx, quic_tls_compat_keylog_callback); + + if (SSL_CTX_has_client_custom_ext(ctx, QUIC_OPENSSL_COMPAT_SSL_TP_EXT)) + return 1; + + if (!SSL_CTX_add_custom_ext(ctx, QUIC_OPENSSL_COMPAT_SSL_TP_EXT, + SSL_EXT_CLIENT_HELLO | SSL_EXT_TLS1_3_ENCRYPTED_EXTENSIONS, + qc_ssl_compat_add_tps_cb, NULL, NULL, + NULL, NULL)) + return 0; + + return 1; +} + +static int quic_tls_compat_set_encryption_secret(struct quic_conn *qc, + struct quic_tls_compat_keys *keys, + enum ssl_encryption_level_t level, + const SSL_CIPHER *cipher, + const uint8_t *secret, size_t secret_len) +{ + int ret = 0, key_len; + struct quic_tls_secret *peer_secret; + + TRACE_ENTER(QUIC_EV_CONN_SSL_COMPAT, qc); + + peer_secret = &keys->secret; + if (sizeof(peer_secret->secret.data) < secret_len) + goto leave; + + keys->cipher = tls_aead(cipher); + if (!keys->cipher) + goto leave; + + key_len = EVP_CIPHER_key_length(keys->cipher); + + peer_secret->secret.len = secret_len; + memcpy(peer_secret->secret.data, secret, secret_len); + + peer_secret->key.len = key_len; + peer_secret->iv.len = QUIC_OPENSSL_COMPAT_TLS_IV_LEN; + if (!quic_hkdf_expand_label(tls_md(cipher), + peer_secret->key.data, peer_secret->key.len, + secret, secret_len, + (const unsigned char *)QUIC_TLS_KEY_LABEL, + sizeof(QUIC_TLS_KEY_LABEL) - 1) || + !quic_hkdf_expand_label(tls_md(cipher), + peer_secret->iv.data, peer_secret->iv.len, + secret, secret_len, + (const unsigned char *)QUIC_TLS_IV_LABEL, + sizeof(QUIC_TLS_IV_LABEL) - 1)) + goto leave; + + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_SSL_COMPAT, qc); + return ret; +} + +/* Callback used to get the Handshake and Application level secrets from + * the TLS stack. 
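/* Editorial note, not part of the patch: the keylog callback that follows
 * consumes lines in the NSS key log format emitted by OpenSSL, i.e.
 * "<LABEL> <client_random in hex> <secret in hex>", for instance the
 * hypothetical line "SERVER_HANDSHAKE_TRAFFIC_SECRET 1a2b... 9f8e...".
 * The QUIC_OPENSSL_COMPAT_{CLIENT,SERVER}_{HANDSHAKE,APPLICATION} macros
 * presumably expand to these standard labels; the parser skips the label and
 * the client random, then decodes the secret nibble by nibble into <secret>.
 */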
+ */ +void quic_tls_compat_keylog_callback(const SSL *ssl, const char *line) +{ + unsigned char ch, value; + const char *start, *p; + size_t n; + unsigned int write; + struct quic_openssl_compat *compat; + enum ssl_encryption_level_t level; + unsigned char secret[EVP_MAX_MD_SIZE]; + struct quic_conn *qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index); + + /* Ignore non-QUIC connections */ + if (!qc) + return; + + TRACE_ENTER(QUIC_EV_CONN_SSL_COMPAT, qc); + + p = line; + for (start = p; *p && *p != ' '; p++); + n = p - start; + + if (sizeof(QUIC_OPENSSL_COMPAT_CLIENT_HANDSHAKE) - 1 == n && + !strncmp(start, QUIC_OPENSSL_COMPAT_CLIENT_HANDSHAKE, n)) { + level = ssl_encryption_handshake; + write = 0; + } + else if (sizeof(QUIC_OPENSSL_COMPAT_SERVER_HANDSHAKE) - 1 == n && + !strncmp(start, QUIC_OPENSSL_COMPAT_SERVER_HANDSHAKE, n)) { + level = ssl_encryption_handshake; + write = 1; + } + else if (sizeof(QUIC_OPENSSL_COMPAT_CLIENT_APPLICATION) - 1 == n && + !strncmp(start, QUIC_OPENSSL_COMPAT_CLIENT_APPLICATION, n)) { + level = ssl_encryption_application; + write = 0; + } + else if (sizeof(QUIC_OPENSSL_COMPAT_SERVER_APPLICATION) - 1 == n && + !strncmp(start, QUIC_OPENSSL_COMPAT_SERVER_APPLICATION, n)) { + level = ssl_encryption_application; + write = 1; + } + else + goto leave; + + if (*p++ == '\0') + goto leave; + + while (*p && *p != ' ') + p++; + + if (*p++ == '\0') + goto leave; + + for (n = 0, start = p; *p; p++) { + ch = *p; + if (ch >= '0' && ch <= '9') { + value = ch - '0'; + goto next; + } + + ch = (unsigned char) (ch | 0x20); + if (ch >= 'a' && ch <= 'f') { + value = ch - 'a' + 10; + goto next; + } + + goto leave; + +next: + if ((p - start) % 2) { + secret[n++] += value; + } + else { + if (n >= EVP_MAX_MD_SIZE) + goto leave; + + secret[n] = (value << 4); + } + } + + /* Secret successfully parsed */ + compat = &qc->openssl_compat; + if (write) { + compat->method->set_encryption_secrets((SSL *) ssl, level, NULL, secret, n); + compat->write_level = level; + + } else { + const SSL_CIPHER *cipher; + + cipher = SSL_get_current_cipher(ssl); + /* AES_128_CCM_SHA256 not supported at this time. Furthermore, this + * algorithm is silently disabled by the TLS stack. But it can be + * enabled with "ssl-default-bind-ciphersuites" setting. 
+ */ + if (SSL_CIPHER_get_id(cipher) == TLS1_3_CK_AES_128_CCM_SHA256) { + quic_set_tls_alert(qc, SSL_AD_HANDSHAKE_FAILURE); + goto leave; + } + + compat->method->set_encryption_secrets((SSL *) ssl, level, secret, NULL, n); + compat->read_level = level; + compat->read_record = 0; + quic_tls_compat_set_encryption_secret(qc, &compat->keys, level, + cipher, secret, n); + } + + leave: + TRACE_LEAVE(QUIC_EV_CONN_SSL_COMPAT, qc); +} + +static size_t quic_tls_compat_create_header(struct quic_conn *qc, + struct quic_tls_compat_record *rec, + unsigned char *out, int plain) +{ + unsigned char type; + size_t len; + + TRACE_ENTER(QUIC_EV_CONN_SSL_COMPAT, qc); + + len = rec->payload_len; + if (plain) { + type = rec->type; + } + else { + type = SSL3_RT_APPLICATION_DATA; + len += EVP_GCM_TLS_TAG_LEN; + } + + out[0] = type; + out[1] = 0x03; + out[2] = 0x03; + out[3] = (len >> 8); + out[4] = len; + + TRACE_LEAVE(QUIC_EV_CONN_SSL_COMPAT, qc); + return 5; +} + +static void quic_tls_compute_nonce(unsigned char *nonce, size_t len, uint64_t pn) +{ + nonce[len - 8] ^= (pn >> 56) & 0x3f; + nonce[len - 7] ^= (pn >> 48) & 0xff; + nonce[len - 6] ^= (pn >> 40) & 0xff; + nonce[len - 5] ^= (pn >> 32) & 0xff; + nonce[len - 4] ^= (pn >> 24) & 0xff; + nonce[len - 3] ^= (pn >> 16) & 0xff; + nonce[len - 2] ^= (pn >> 8) & 0xff; + nonce[len - 1] ^= pn & 0xff; +} + +/* Encrypt <in> buffer data into <out> with <cipher> as AEAD cipher and <s> as secret. + * <ad> is the buffer for the additional data. + */ +static int quic_tls_tls_seal(struct quic_conn *qc, + const EVP_CIPHER *cipher, struct quic_tls_secret *s, + unsigned char *out, size_t *outlen, unsigned char *nonce, + const unsigned char *in, size_t inlen, + const unsigned char *ad, size_t adlen) +{ + int ret = 0, wlen; + EVP_CIPHER_CTX *ctx; + int aead_nid = EVP_CIPHER_nid(cipher); + + TRACE_ENTER(QUIC_EV_CONN_SSL_COMPAT, qc); + ctx = EVP_CIPHER_CTX_new(); + if (ctx == NULL) + goto leave; + + /* Note that the following encryption code works with NID_aes_128_ccm, but leads + * to a handshake failure with "bad record mac" (20) TLS alert received from + * the peer.
+ */ + if (!EVP_EncryptInit_ex(ctx, cipher, NULL, NULL, NULL) || + !EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, s->iv.len, NULL) || + (aead_nid == NID_aes_128_ccm && + !EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, EVP_GCM_TLS_TAG_LEN, NULL)) || + !EVP_EncryptInit_ex(ctx, NULL, NULL, s->key.data, nonce) || + (aead_nid == NID_aes_128_ccm && + !EVP_EncryptUpdate(ctx, NULL, &wlen, NULL, inlen)) || + !EVP_EncryptUpdate(ctx, NULL, &wlen, ad, adlen) || + !EVP_EncryptUpdate(ctx, out, &wlen, in, inlen) || + !EVP_EncryptFinal_ex(ctx, out + wlen, &wlen) || + (aead_nid != NID_aes_128_ccm && + !EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, EVP_GCM_TLS_TAG_LEN, out + inlen))) { + goto leave; + } + + *outlen = inlen + adlen + EVP_GCM_TLS_TAG_LEN; + ret = 1; + leave: + /* Safe to call EVP_CIPHER_CTX_free() with null ctx */ + EVP_CIPHER_CTX_free(ctx); + TRACE_LEAVE(QUIC_EV_CONN_SSL_COMPAT, qc); + return ret; +} + +static int quic_tls_compat_create_record(struct quic_conn *qc, + enum ssl_encryption_level_t level, + struct quic_tls_compat_record *rec, + unsigned char *res) +{ + int ret = 0; + unsigned char *ad; + size_t adlen; + unsigned char *out; + size_t outlen; + struct quic_tls_secret *secret; + unsigned char nonce[QUIC_OPENSSL_COMPAT_TLS_IV_LEN]; + + TRACE_ENTER(QUIC_EV_CONN_SSL_COMPAT, qc); + + ad = res; + adlen = quic_tls_compat_create_header(qc, rec, ad, 0); + + out = res + adlen; + outlen = rec->payload_len + EVP_GCM_TLS_TAG_LEN; + + secret = &rec->keys->secret; + + memcpy(nonce, secret->iv.data, secret->iv.len); + quic_tls_compute_nonce(nonce, sizeof(nonce), rec->number); + + if (!quic_tls_tls_seal(qc, rec->keys->cipher, secret, out, &outlen, + nonce, rec->payload, rec->payload_len, ad, adlen)) + goto leave; + + ret = outlen; +leave: + TRACE_LEAVE(QUIC_EV_CONN_SSL_COMPAT, qc); + return ret; +} + +/* Callback used to parse TLS messages for <ssl> TLS session.
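/* Editorial note, not part of the patch: quic_tls_compat_create_record()
 * above derives the per-record nonce the TLS 1.3 way (RFC 8446 section 5.3):
 * the record sequence number (rec->number) is XORed into the low-order eight
 * bytes of the static IV. With a hypothetical 12-byte IV ending in
 * ... 00 00 00 00 and rec->number = 3, only the last byte changes:
 * nonce[11] ^= 0x03. The additional "& 0x3f" mask on the most significant
 * byte in quic_tls_compute_nonce() appears specific to this compatibility
 * layer.
 */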
*/ +void quic_tls_compat_msg_callback(struct connection *conn, + int write_p, int version, int content_type, + const void *buf, size_t len, SSL *ssl) +{ + unsigned int alert; + enum ssl_encryption_level_t level; + struct quic_conn *qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index); + struct quic_openssl_compat *com; + + if (!write_p || !qc) + goto leave; + + TRACE_ENTER(QUIC_EV_CONN_SSL_COMPAT, qc); + + com = &qc->openssl_compat; + level = com->write_level; + switch (content_type) { + case SSL3_RT_HANDSHAKE: + com->method->add_handshake_data(ssl, level, buf, len); + break; + case SSL3_RT_ALERT: + if (len >= 2) { + alert = ((unsigned char *) buf)[1]; + com->method->send_alert(ssl, level, alert); + } + break; + } + + leave: + TRACE_LEAVE(QUIC_EV_CONN_SSL_COMPAT, qc); +} + +int SSL_set_quic_method(SSL *ssl, const SSL_QUIC_METHOD *quic_method) +{ + int ret = 0; + BIO *rbio, *wbio = NULL; + struct quic_conn *qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index); + + TRACE_ENTER(QUIC_EV_CONN_SSL_COMPAT, qc); + + rbio = BIO_new(BIO_s_mem()); + if (!rbio) + goto err; + + wbio = BIO_new(BIO_s_null()); + if (!wbio) + goto err; + + SSL_set_bio(ssl, rbio, wbio); + /* No early data support */ + SSL_set_max_early_data(ssl, 0); + + qc->openssl_compat.rbio = rbio; + qc->openssl_compat.wbio = wbio; + qc->openssl_compat.method = quic_method; + qc->openssl_compat.read_level = ssl_encryption_initial; + qc->openssl_compat.write_level = ssl_encryption_initial; + ret = 1; + + leave: + TRACE_LEAVE(QUIC_EV_CONN_SSL_COMPAT, qc); + return ret; + err: + BIO_free(rbio); + BIO_free(wbio); + goto leave; +} + +enum ssl_encryption_level_t SSL_quic_read_level(const SSL *ssl) +{ + struct quic_conn *qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index); + + TRACE_ENTER(QUIC_EV_CONN_SSL_COMPAT, qc); + TRACE_LEAVE(QUIC_EV_CONN_SSL_COMPAT, qc); + return qc->openssl_compat.read_level; +} + + +enum ssl_encryption_level_t SSL_quic_write_level(const SSL *ssl) +{ + struct quic_conn *qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index); + + TRACE_ENTER(QUIC_EV_CONN_SSL_COMPAT, qc); + TRACE_LEAVE(QUIC_EV_CONN_SSL_COMPAT, qc); + return qc->openssl_compat.write_level; +} + +int SSL_provide_quic_data(SSL *ssl, enum ssl_encryption_level_t level, + const uint8_t *data, size_t len) +{ + int ret = 0; + BIO *rbio; + struct quic_tls_compat_record rec; + unsigned char in[QUIC_OPENSSL_COMPAT_RECORD_SIZE + 1]; + unsigned char out[QUIC_OPENSSL_COMPAT_RECORD_SIZE + 1 + + SSL3_RT_HEADER_LENGTH + EVP_GCM_TLS_TAG_LEN]; + struct quic_conn *qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index); + size_t n; + + TRACE_ENTER(QUIC_EV_CONN_SSL_COMPAT, qc); + + rbio = SSL_get_rbio(ssl); + + while (len) { + memset(&rec, 0, sizeof rec); + rec.type = SSL3_RT_HANDSHAKE; + rec.number = qc->openssl_compat.read_record++; + rec.keys = &qc->openssl_compat.keys; + if (level == ssl_encryption_initial) { + n = QUIC_MIN(len, (size_t)65535); + rec.payload = (unsigned char *)data; + rec.payload_len = n; + quic_tls_compat_create_header(qc, &rec, out, 1); + BIO_write(rbio, out, SSL3_RT_HEADER_LENGTH); + BIO_write(rbio, data, n); + } + else { + size_t outlen; + unsigned char *p = in; + + n = QUIC_MIN(len, (size_t)QUIC_OPENSSL_COMPAT_RECORD_SIZE); + memcpy(in, data, n); + p += n; + *p++ = SSL3_RT_HANDSHAKE; + + rec.payload = in; + rec.payload_len = p - in; + + if (!rec.keys->cipher) + goto leave; + + outlen = quic_tls_compat_create_record(qc, level, &rec, out); + if (!outlen) + goto leave; + + BIO_write(rbio, out, outlen); + } + + data += n; + len -= n; + } + + ret = 1; + leave: +
TRACE_LEAVE(QUIC_EV_CONN_SSL_COMPAT, qc); + return ret; +} + +int SSL_process_quic_post_handshake(SSL *ssl) +{ + struct quic_conn *qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index); + + /* Do nothing: rely on the TLS message callback to parse alert messages. */ + TRACE_ENTER(QUIC_EV_CONN_SSL_COMPAT, qc); + TRACE_LEAVE(QUIC_EV_CONN_SSL_COMPAT, qc); + return 1; +} + +int SSL_set_quic_transport_params(SSL *ssl, const uint8_t *params, size_t params_len) +{ + struct quic_conn *qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index); + /* The local transport parameters are stored into the quic_conn object. + * There is no need to add an intermediary to store pointers to these + * transport parameters. + */ + TRACE_ENTER(QUIC_EV_CONN_SSL_COMPAT, qc); + TRACE_LEAVE(QUIC_EV_CONN_SSL_COMPAT, qc); + return 1; +} + diff --git a/src/quic_retransmit.c b/src/quic_retransmit.c new file mode 100644 index 0000000..d06293f --- /dev/null +++ b/src/quic_retransmit.c @@ -0,0 +1,252 @@ +#include <import/eb64tree.h> + +#include <haproxy/quic_conn.h> +#include <haproxy/quic_frame.h> +#include <haproxy/quic_retransmit.h> +#include <haproxy/quic_trace.h> +#include <haproxy/quic_tx.h> +#include <haproxy/trace.h> + +#define TRACE_SOURCE &trace_quic + +/* Duplicate all frames from <pkt_frm_list> list into <out_frm_list> list + * for <qc> QUIC connection. + * This is a best effort function which never fails even if no memory could be + * allocated to duplicate these frames. + */ +static void qc_dup_pkt_frms(struct quic_conn *qc, + struct list *pkt_frm_list, struct list *out_frm_list) +{ + struct quic_frame *frm, *frmbak; + struct list tmp = LIST_HEAD_INIT(tmp); + + TRACE_ENTER(QUIC_EV_CONN_PRSAFRM, qc); + + list_for_each_entry_safe(frm, frmbak, pkt_frm_list, list) { + struct quic_frame *dup_frm, *origin; + + if (frm->flags & QUIC_FL_TX_FRAME_ACKED) { + TRACE_DEVEL("already acknowledged frame", QUIC_EV_CONN_PRSAFRM, qc, frm); + continue; + } + + switch (frm->type) { + case QUIC_FT_STREAM_8 ... QUIC_FT_STREAM_F: + { + struct qf_stream *strm_frm = &frm->stream; + struct eb64_node *node = NULL; + struct qc_stream_desc *stream_desc; + + node = eb64_lookup(&qc->streams_by_id, strm_frm->id); + if (!node) { + TRACE_DEVEL("ignored frame for a released stream", QUIC_EV_CONN_PRSAFRM, qc, frm); + continue; + } + + stream_desc = eb64_entry(node, struct qc_stream_desc, by_id); + /* Do not resend this frame if in the "already acked range" */ + if (strm_frm->offset.key + strm_frm->len <= stream_desc->ack_offset) { + TRACE_DEVEL("ignored frame in already acked range", + QUIC_EV_CONN_PRSAFRM, qc, frm); + continue; + } + else if (strm_frm->offset.key < stream_desc->ack_offset) { + uint64_t diff = stream_desc->ack_offset - strm_frm->offset.key; + + qc_stream_frm_mv_fwd(frm, diff); + TRACE_DEVEL("updated partially acked frame", + QUIC_EV_CONN_PRSAFRM, qc, frm); + } + + strm_frm->dup = 1; + break; + } + + default: + break; + } + + /* If <frm> is already a copy of another frame, we must take + * its original frame as source for the copy. + */ + origin = frm->origin ? frm->origin : frm; + dup_frm = qc_frm_dup(origin); + if (!dup_frm) { + TRACE_ERROR("could not duplicate frame", QUIC_EV_CONN_PRSAFRM, qc, frm); + break; + } + + TRACE_DEVEL("built probing frame", QUIC_EV_CONN_PRSAFRM, qc, origin); + if (origin->pkt) { + TRACE_DEVEL("duplicated from packet", QUIC_EV_CONN_PRSAFRM, + qc, dup_frm, &origin->pkt->pn_node.key); + } + else { + /* <origin> is a frame which was sent from a packet detected as lost.
*/ + TRACE_DEVEL("duplicated from lost packet", QUIC_EV_CONN_PRSAFRM, qc); + } + + LIST_APPEND(&tmp, &dup_frm->list); + } + + LIST_SPLICE(out_frm_list, &tmp); + + TRACE_LEAVE(QUIC_EV_CONN_PRSAFRM, qc); +} + +/* Boolean function which return 1 if <pkt> TX packet is only made of + * already acknowledged frame. + */ +static inline int qc_pkt_with_only_acked_frms(struct quic_tx_packet *pkt) +{ + struct quic_frame *frm; + + list_for_each_entry(frm, &pkt->frms, list) + if (!(frm->flags & QUIC_FL_TX_FRAME_ACKED)) + return 0; + + return 1; +} + +/* Prepare a fast retransmission from <qel> encryption level */ +void qc_prep_fast_retrans(struct quic_conn *qc, + struct quic_pktns *pktns, + struct list *frms1, struct list *frms2) +{ + struct eb_root *pkts = &pktns->tx.pkts; + struct list *frms = frms1; + struct eb64_node *node; + struct quic_tx_packet *pkt; + + TRACE_ENTER(QUIC_EV_CONN_SPPKTS, qc); + + BUG_ON(frms1 == frms2); + + pkt = NULL; + node = eb64_first(pkts); + start: + while (node) { + struct quic_tx_packet *p; + + p = eb64_entry(node, struct quic_tx_packet, pn_node); + node = eb64_next(node); + /* Skip the empty and coalesced packets */ + TRACE_PRINTF(TRACE_LEVEL_PROTO, QUIC_EV_CONN_SPPKTS, qc, 0, 0, 0, + "--> pn=%llu (%d %d %d)", (ull)p->pn_node.key, + LIST_ISEMPTY(&p->frms), !!(p->flags & QUIC_FL_TX_PACKET_COALESCED), + qc_pkt_with_only_acked_frms(p)); + if (!LIST_ISEMPTY(&p->frms) && !qc_pkt_with_only_acked_frms(p)) { + pkt = p; + break; + } + } + + if (!pkt) + goto leave; + + /* When building a packet from another one, the field which may increase the + * packet size is the packet number. And the maximum increase is 4 bytes. + */ + if (!quic_peer_validated_addr(qc) && qc_is_listener(qc) && + pkt->len + 4 > quic_may_send_bytes(qc)) { + qc->flags |= QUIC_FL_CONN_ANTI_AMPLIFICATION_REACHED; + TRACE_PROTO("anti-amplification limit would be reached", QUIC_EV_CONN_SPPKTS, qc, pkt); + goto leave; + } + + TRACE_PROTO("duplicating packet", QUIC_EV_CONN_SPPKTS, qc, pkt); + qc_dup_pkt_frms(qc, &pkt->frms, frms); + if (frms == frms1 && frms2) { + frms = frms2; + goto start; + } + leave: + TRACE_LEAVE(QUIC_EV_CONN_SPPKTS, qc); +} + +/* Prepare a fast retransmission during a handshake after a client + * has resent Initial packets. According to the RFC a server may retransmit + * Initial packets send them coalescing with others (Handshake here). + * (Listener only function). 
+ */ +void qc_prep_hdshk_fast_retrans(struct quic_conn *qc, + struct list *ifrms, struct list *hfrms) +{ + struct list itmp = LIST_HEAD_INIT(itmp); + struct list htmp = LIST_HEAD_INIT(htmp); + + struct quic_enc_level *iqel = qc->iel; + struct quic_enc_level *hqel = qc->hel; + struct quic_enc_level *qel = iqel; + struct eb_root *pkts; + struct eb64_node *node; + struct quic_tx_packet *pkt; + struct list *tmp = &itmp; + + TRACE_ENTER(QUIC_EV_CONN_SPPKTS, qc); + start: + pkt = NULL; + pkts = &qel->pktns->tx.pkts; + node = eb64_first(pkts); + /* Skip empty packets (they have already been retransmitted) */ + while (node) { + struct quic_tx_packet *p; + + p = eb64_entry(node, struct quic_tx_packet, pn_node); + TRACE_PRINTF(TRACE_LEVEL_PROTO, QUIC_EV_CONN_SPPKTS, qc, 0, 0, 0, + "--> pn=%llu (%d %d)", (ull)p->pn_node.key, + LIST_ISEMPTY(&p->frms), !!(p->flags & QUIC_FL_TX_PACKET_COALESCED)); + if (!LIST_ISEMPTY(&p->frms) && !(p->flags & QUIC_FL_TX_PACKET_COALESCED) && + !qc_pkt_with_only_acked_frms(p)) { + pkt = p; + break; + } + + node = eb64_next(node); + } + + if (!pkt) + goto end; + + /* When building a packet from another one, the field which may increase the + * packet size is the packet number. And the maximum increase is 4 bytes. + */ + if (!quic_peer_validated_addr(qc) && qc_is_listener(qc)) { + size_t dglen = pkt->len + 4; + size_t may_send; + + may_send = quic_may_send_bytes(qc); + dglen += pkt->next ? pkt->next->len + 4 : 0; + if (dglen > may_send) { + qc->flags |= QUIC_FL_CONN_ANTI_AMPLIFICATION_REACHED; + TRACE_PROTO("anti-amplification limit would be reached", QUIC_EV_CONN_SPPKTS, qc, pkt); + if (pkt->next) + TRACE_PROTO("anti-amplification limit would be reached", QUIC_EV_CONN_SPPKTS, qc, pkt->next); + if (qel == iqel && may_send >= QUIC_INITIAL_PACKET_MINLEN) + TRACE_PROTO("will probe Initial packet number space", QUIC_EV_CONN_SPPKTS, qc); + goto end; + } + } + + qel->pktns->tx.pto_probe += 1; + + /* No risk to loop here, #packet per datagram is bounded */ + requeue: + TRACE_PROTO("duplicating packet", QUIC_EV_CONN_PRSAFRM, qc, NULL, &pkt->pn_node.key); + qc_dup_pkt_frms(qc, &pkt->frms, tmp); + if (qel == iqel) { + if (pkt->next && pkt->next->type == QUIC_PACKET_TYPE_HANDSHAKE) { + pkt = pkt->next; + tmp = &htmp; + hqel->pktns->tx.pto_probe += 1; + TRACE_DEVEL("looping for next packet", QUIC_EV_CONN_SPPKTS, qc); + goto requeue; + } + } + + end: + LIST_SPLICE(ifrms, &itmp); + LIST_SPLICE(hfrms, &htmp); + + TRACE_LEAVE(QUIC_EV_CONN_SPPKTS, qc); +} diff --git a/src/quic_retry.c b/src/quic_retry.c new file mode 100644 index 0000000..1c58e5e --- /dev/null +++ b/src/quic_retry.c @@ -0,0 +1,320 @@ +#include <string.h> + +#include <haproxy/clock.h> +#include <haproxy/global.h> +#include <haproxy/quic_retry.h> +#include <haproxy/quic_tls.h> +#include <haproxy/quic_trace-t.h> +#include <haproxy/trace.h> + +#define TRACE_SOURCE &trace_quic + +/* Salt length used to derive retry token secret */ +#define QUIC_RETRY_TOKEN_SALTLEN 16 /* bytes */ + +/* Copy <saddr> socket address data into <buf> buffer. + * It is the responsibility of the caller to check that the output buffer is big + * enough to contain these socket address data. + * Return the number of bytes copied.
+ */ +static inline size_t quic_saddr_cpy(unsigned char *buf, + const struct sockaddr_storage *saddr) +{ + void *port, *addr; + unsigned char *p; + size_t port_len, addr_len; + + p = buf; + if (saddr->ss_family == AF_INET6) { + port = &((struct sockaddr_in6 *)saddr)->sin6_port; + addr = &((struct sockaddr_in6 *)saddr)->sin6_addr; + port_len = sizeof ((struct sockaddr_in6 *)saddr)->sin6_port; + addr_len = sizeof ((struct sockaddr_in6 *)saddr)->sin6_addr; + } + else { + port = &((struct sockaddr_in *)saddr)->sin_port; + addr = &((struct sockaddr_in *)saddr)->sin_addr; + port_len = sizeof ((struct sockaddr_in *)saddr)->sin_port; + addr_len = sizeof ((struct sockaddr_in *)saddr)->sin_addr; + } + memcpy(p, port, port_len); + p += port_len; + memcpy(p, addr, addr_len); + p += addr_len; + + return p - buf; +} + + +/* QUIC server only function. + * Add AAD to <aad> buffer from <cid> connection ID and <addr> socket address. + * It is the responsibility of the caller to check that <aad> is big enough + * to contain these data. + * Return the number of bytes copied to <aad>. + */ +static int quic_generate_retry_token_aad(unsigned char *aad, + uint32_t version, + const struct quic_cid *cid, + const struct sockaddr_storage *addr) +{ + unsigned char *p; + + p = aad; + *(uint32_t *)p = htonl(version); + p += sizeof version; + p += quic_saddr_cpy(p, addr); + memcpy(p, cid->data, cid->len); + p += cid->len; + + return p - aad; +} + +/* QUIC server only function. + * Generate the token to be used in Retry packets. The token is written to + * <token> with <len> as length. <odcid> is the original destination connection + * ID and <dcid> is our side destination connection ID (or client source + * connection ID). + * Returns the length of the encoded token or 0 on error. + */ +int quic_generate_retry_token(unsigned char *token, size_t len, + const uint32_t version, + const struct quic_cid *odcid, + const struct quic_cid *dcid, + struct sockaddr_storage *addr) +{ + int ret = 0; + unsigned char *p; + unsigned char aad[sizeof(uint32_t) + sizeof(in_port_t) + + sizeof(struct in6_addr) + QUIC_CID_MAXLEN]; + size_t aadlen; + unsigned char salt[QUIC_RETRY_TOKEN_SALTLEN]; + unsigned char key[QUIC_TLS_KEY_LEN]; + unsigned char iv[QUIC_TLS_IV_LEN]; + const unsigned char *sec = global.cluster_secret; + size_t seclen = sizeof global.cluster_secret; + EVP_CIPHER_CTX *ctx = NULL; + const EVP_CIPHER *aead = EVP_aes_128_gcm(); + uint32_t timestamp = (uint32_t)date.tv_sec; + + TRACE_ENTER(QUIC_EV_CONN_TXPKT); + + /* The token is made of the token format byte, the ODCID prefixed by its one byte + * length, the creation timestamp, an AEAD TAG, and finally + * the random bytes used to derive the secret to encrypt the token.
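/* Editorial note, not part of the patch: with the sizes used here
 * (QUIC_RETRY_TOKEN_SALTLEN defined above as 16, QUIC_TLS_TAG_LEN presumably
 * 16 as well), a token carrying a hypothetical 8-byte ODCID occupies:
 *
 *    1  format byte (QUIC_TOKEN_FMT_RETRY, left in clear)
 *    1  ODCID length
 *    8  ODCID
 *    4  32-bit creation timestamp
 *   16  AEAD tag
 *   16  salt (in clear, needed to re-derive the decryption key)
 *   --
 *   46  bytes total, matching the room check at the top of
 *       quic_generate_retry_token().
 */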
+ */ + if (1 + odcid->len + 1 + sizeof(timestamp) + QUIC_TLS_TAG_LEN + QUIC_RETRY_TOKEN_SALTLEN > len) + goto err; + + aadlen = quic_generate_retry_token_aad(aad, version, dcid, addr); + /* TODO: RAND_bytes() should be replaced */ + if (RAND_bytes(salt, sizeof salt) != 1) { + TRACE_ERROR("RAND_bytes()", QUIC_EV_CONN_TXPKT); + goto err; + } + + if (!quic_tls_derive_retry_token_secret(EVP_sha256(), key, sizeof key, iv, sizeof iv, + salt, sizeof salt, sec, seclen)) { + TRACE_ERROR("quic_tls_derive_retry_token_secret() failed", QUIC_EV_CONN_TXPKT); + goto err; + } + + if (!quic_tls_tx_ctx_init(&ctx, aead, key)) { + TRACE_ERROR("quic_tls_tx_ctx_init() failed", QUIC_EV_CONN_TXPKT); + goto err; + } + + /* Token build */ + p = token; + *p++ = QUIC_TOKEN_FMT_RETRY; + *p++ = odcid->len; + memcpy(p, odcid->data, odcid->len); + p += odcid->len; + write_u32(p, htonl(timestamp)); + p += sizeof timestamp; + + /* Do not encrypt the QUIC_TOKEN_FMT_RETRY byte */ + if (!quic_tls_encrypt(token + 1, p - token - 1, aad, aadlen, ctx, aead, iv)) { + TRACE_ERROR("quic_tls_encrypt() failed", QUIC_EV_CONN_TXPKT); + goto err; + } + + p += QUIC_TLS_TAG_LEN; + memcpy(p, salt, sizeof salt); + p += sizeof salt; + EVP_CIPHER_CTX_free(ctx); + + ret = p - token; + leave: + TRACE_LEAVE(QUIC_EV_CONN_TXPKT); + return ret; + + err: + if (ctx) + EVP_CIPHER_CTX_free(ctx); + goto leave; +} + +/* Parse the Retry token from buffer <token> with <end> a pointer to + * one byte past the end of this buffer. This will extract the ODCID + * which will be stored into <odcid>. + * + * Returns 0 on success else non-zero. + */ +int parse_retry_token(struct quic_conn *qc, + const unsigned char *token, const unsigned char *end, + struct quic_cid *odcid) +{ + int ret = 0; + uint64_t odcid_len; + uint32_t timestamp; + uint32_t now_sec = (uint32_t)date.tv_sec; + + TRACE_ENTER(QUIC_EV_CONN_LPKT, qc); + + if (!quic_dec_int(&odcid_len, &token, end)) { + TRACE_ERROR("quic_dec_int() error", QUIC_EV_CONN_LPKT, qc); + goto leave; + } + + /* RFC 9000 7.2. Negotiating Connection IDs: + * When an Initial packet is sent by a client that has not previously + * received an Initial or Retry packet from the server, the client + * populates the Destination Connection ID field with an unpredictable + * value. This Destination Connection ID MUST be at least 8 bytes in length. + */ + if (odcid_len < QUIC_ODCID_MINLEN || odcid_len > QUIC_CID_MAXLEN) { + TRACE_ERROR("wrong ODCID length", QUIC_EV_CONN_LPKT, qc); + goto leave; + } + + if (end - token < odcid_len + sizeof timestamp) { + TRACE_ERROR("too long ODCID length", QUIC_EV_CONN_LPKT, qc); + goto leave; + } + + timestamp = ntohl(read_u32(token + odcid_len)); + /* Check that the elapsed time is within +/- QUIC_RETRY_DURATION_SEC + * to tolerate a token generator that is not perfectly time-synced. + */ + if ((uint32_t)(now_sec - timestamp) > QUIC_RETRY_DURATION_SEC && + (uint32_t)(timestamp - now_sec) > QUIC_RETRY_DURATION_SEC) { + TRACE_ERROR("token has expired", QUIC_EV_CONN_LPKT, qc); + goto leave; + } + + ret = 1; + memcpy(odcid->data, token, odcid_len); + odcid->len = odcid_len; + leave: + TRACE_LEAVE(QUIC_EV_CONN_LPKT, qc); + return !ret; +} + +/* QUIC server only function. + * + * Check the validity of the Retry token from Initial packet <pkt>. <dgram> is + * the UDP datagram containing <pkt> and <l> is the listener instance on which + * it was received. If the token is valid, the ODCID of <qc> QUIC connection + * will be put into <odcid>.
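/* Editorial sketch, not part of the patch: the double unsigned comparison in
 * parse_retry_token() above is a wraparound-safe "within +/- N seconds" test:
 * whichever of (now - ts) or (ts - now) measures the true distance is small,
 * while the other wraps around to a huge value, so requiring both to exceed
 * the window rejects only genuinely stale (or far-future) tokens. The same
 * test in isolation:
 */
#include <stdint.h>

static inline int within_window(uint32_t now_sec, uint32_t ts, uint32_t win)
{
	return (uint32_t)(now_sec - ts) <= win || (uint32_t)(ts - now_sec) <= win;
}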
<qc> is used to retrieve the QUIC version needed + * to validate the token but it can be NULL : in this case the version will be + * retrieved from the packet. + * + * Return 1 if succeeded, 0 if not. + */ + +int quic_retry_token_check(struct quic_rx_packet *pkt, + struct quic_dgram *dgram, + struct listener *l, + struct quic_conn *qc, + struct quic_cid *odcid) +{ + struct proxy *prx; + struct quic_counters *prx_counters; + int ret = 0; + unsigned char *token = pkt->token; + const uint64_t tokenlen = pkt->token_len; + unsigned char buf[128]; + unsigned char aad[sizeof(uint32_t) + QUIC_CID_MAXLEN + + sizeof(in_port_t) + sizeof(struct in6_addr)]; + size_t aadlen; + const unsigned char *salt; + unsigned char key[QUIC_TLS_KEY_LEN]; + unsigned char iv[QUIC_TLS_IV_LEN]; + const unsigned char *sec = global.cluster_secret; + size_t seclen = sizeof global.cluster_secret; + EVP_CIPHER_CTX *ctx = NULL; + const EVP_CIPHER *aead = EVP_aes_128_gcm(); + const struct quic_version *qv = qc ? qc->original_version : + pkt->version; + + TRACE_ENTER(QUIC_EV_CONN_LPKT, qc); + + /* The caller must ensure this. */ + BUG_ON(!pkt->token_len); + + prx = l->bind_conf->frontend; + prx_counters = EXTRA_COUNTERS_GET(prx->extra_counters_fe, &quic_stats_module); + + if (*pkt->token != QUIC_TOKEN_FMT_RETRY) { + /* TODO: New token check */ + TRACE_PROTO("Packet dropped", QUIC_EV_CONN_LPKT, qc, NULL, NULL, pkt->version); + goto leave; + } + + if (sizeof buf < tokenlen) { + TRACE_ERROR("too short buffer", QUIC_EV_CONN_LPKT, qc); + goto err; + } + + /* The token is made of the token format byte, the ODCID prefixed by its one byte + * length, the creation timestamp, an AEAD TAG, and finally + * the random bytes used to derive the secret to encrypt the token. + */ + if (tokenlen < 2 + QUIC_ODCID_MINLEN + sizeof(uint32_t) + QUIC_TLS_TAG_LEN + QUIC_RETRY_TOKEN_SALTLEN || + tokenlen > 2 + QUIC_CID_MAXLEN + sizeof(uint32_t) + QUIC_TLS_TAG_LEN + QUIC_RETRY_TOKEN_SALTLEN) { + TRACE_ERROR("invalid token length", QUIC_EV_CONN_LPKT, qc); + goto err; + } + + aadlen = quic_generate_retry_token_aad(aad, qv->num, &pkt->scid, &dgram->saddr); + salt = token + tokenlen - QUIC_RETRY_TOKEN_SALTLEN; + if (!quic_tls_derive_retry_token_secret(EVP_sha256(), key, sizeof key, iv, sizeof iv, + salt, QUIC_RETRY_TOKEN_SALTLEN, sec, seclen)) { + TRACE_ERROR("Could not derive retry secret", QUIC_EV_CONN_LPKT, qc); + goto err; + } + + if (!quic_tls_rx_ctx_init(&ctx, aead, key)) { + TRACE_ERROR("quic_tls_rx_ctx_init() failed", QUIC_EV_CONN_LPKT, qc); + goto err; + } + + /* The token is prefixed by a one-byte length format which is not ciphered. */ + if (!quic_tls_decrypt2(buf, token + 1, tokenlen - QUIC_RETRY_TOKEN_SALTLEN - 1, aad, aadlen, + ctx, aead, key, iv)) { + TRACE_ERROR("Could not decrypt retry token", QUIC_EV_CONN_LPKT, qc); + goto err; + } + + if (parse_retry_token(qc, buf, buf + tokenlen - QUIC_RETRY_TOKEN_SALTLEN - 1, odcid)) { + TRACE_ERROR("Error during Initial token parsing", QUIC_EV_CONN_LPKT, qc); + goto err; + } + + EVP_CIPHER_CTX_free(ctx); + + ret = 1; + HA_ATOMIC_INC(&prx_counters->retry_validated); + + leave: + TRACE_LEAVE(QUIC_EV_CONN_LPKT, qc); + return ret; + + err: + HA_ATOMIC_INC(&prx_counters->retry_error); + if (ctx) + EVP_CIPHER_CTX_free(ctx); + goto leave; +} + + diff --git a/src/quic_rx.c b/src/quic_rx.c new file mode 100644 index 0000000..9e55aa3 --- /dev/null +++ b/src/quic_rx.c @@ -0,0 +1,2290 @@ +/* + * QUIC protocol implementation. 
Lower layer with internal features implemented + * here such as QUIC encryption, idle timeout, acknowledgement and + * retransmission. + * + * Copyright 2020 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <haproxy/quic_rx.h> + +#include <haproxy/h3.h> +#include <haproxy/list.h> +#include <haproxy/ncbuf.h> +#include <haproxy/proto_quic.h> +#include <haproxy/quic_ack.h> +#include <haproxy/quic_cid.h> +#include <haproxy/quic_retransmit.h> +#include <haproxy/quic_retry.h> +#include <haproxy/quic_sock.h> +#include <haproxy/quic_stream.h> +#include <haproxy/quic_ssl.h> +#include <haproxy/quic_tls.h> +#include <haproxy/quic_trace.h> +#include <haproxy/quic_tx.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/trace.h> + +DECLARE_POOL(pool_head_quic_conn_rxbuf, "quic_conn_rxbuf", QUIC_CONN_RX_BUFSZ); +DECLARE_POOL(pool_head_quic_dgram, "quic_dgram", sizeof(struct quic_dgram)); +DECLARE_POOL(pool_head_quic_rx_packet, "quic_rx_packet", sizeof(struct quic_rx_packet)); + +/* Decode an expected packet number from <truncated_on> its truncated value, + * depending on <largest_pn> the largest received packet number, and <pn_nbits> + * the number of bits used to encode this packet number (its length in bytes * 8). + * See https://quicwg.org/base-drafts/draft-ietf-quic-transport.html#packet-encoding + */ +static uint64_t decode_packet_number(uint64_t largest_pn, + uint32_t truncated_pn, unsigned int pn_nbits) +{ + uint64_t expected_pn = largest_pn + 1; + uint64_t pn_win = (uint64_t)1 << pn_nbits; + uint64_t pn_hwin = pn_win / 2; + uint64_t pn_mask = pn_win - 1; + uint64_t candidate_pn; + + + candidate_pn = (expected_pn & ~pn_mask) | truncated_pn; + /* Note that <pn_win> > <pn_hwin>. */ + if (candidate_pn < QUIC_MAX_PACKET_NUM - pn_win && + candidate_pn + pn_hwin <= expected_pn) + return candidate_pn + pn_win; + + if (candidate_pn > expected_pn + pn_hwin && candidate_pn >= pn_win) + return candidate_pn - pn_win; + + return candidate_pn; +} + +/* Remove the header protection of <pkt> QUIC packet using <tls_ctx> as QUIC TLS + * cryptographic context. + * <largest_pn> is the largest received packet number and <pn> the address of + * the packet number field for this packet with <byte0> address of its first byte. + * <end> points to one byte past the end of this packet. + * Returns 1 if succeeded, 0 if not. + */ +static int qc_do_rm_hp(struct quic_conn *qc, + struct quic_rx_packet *pkt, struct quic_tls_ctx *tls_ctx, + int64_t largest_pn, unsigned char *pn, unsigned char *byte0) +{ + int ret, i, pnlen; + uint64_t packet_number; + uint32_t truncated_pn = 0; + unsigned char mask[5] = {0}; + unsigned char *sample; + + TRACE_ENTER(QUIC_EV_CONN_RMHP, qc); + + ret = 0; + + /* Check there is enough data in this packet. */ + if (pkt->len - (pn - byte0) < QUIC_PACKET_PN_MAXLEN + sizeof mask) { + TRACE_PROTO("too short packet", QUIC_EV_CONN_RMHP, qc, pkt); + goto leave; + } + + sample = pn + QUIC_PACKET_PN_MAXLEN; + + if (!quic_tls_aes_decrypt(mask, sample, sizeof mask, tls_ctx->rx.hp_ctx)) { + TRACE_ERROR("HP removing failed", QUIC_EV_CONN_RMHP, qc, pkt); + goto leave; + } + + *byte0 ^= mask[0] & (*byte0 & QUIC_PACKET_LONG_HEADER_BIT ? 
0xf : 0x1f); + pnlen = (*byte0 & QUIC_PACKET_PNL_BITMASK) + 1; + for (i = 0; i < pnlen; i++) { + pn[i] ^= mask[i + 1]; + truncated_pn = (truncated_pn << 8) | pn[i]; + } + + packet_number = decode_packet_number(largest_pn, truncated_pn, pnlen * 8); + /* Store remaining information for this unprotected header */ + pkt->pn = packet_number; + pkt->pnl = pnlen; + + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_RMHP, qc); + return ret; +} + +/* Decrypt <pkt> packet using encryption level <qel> for <qc> connection. + * Decryption is done in place in packet buffer. + * + * Returns 1 on success else 0. + */ +static int qc_pkt_decrypt(struct quic_conn *qc, struct quic_enc_level *qel, + struct quic_rx_packet *pkt) +{ + int ret, kp_changed; + unsigned char iv[QUIC_TLS_IV_LEN]; + struct quic_tls_ctx *tls_ctx = + qc_select_tls_ctx(qc, qel, pkt->type, pkt->version); + EVP_CIPHER_CTX *rx_ctx = tls_ctx->rx.ctx; + unsigned char *rx_iv = tls_ctx->rx.iv; + size_t rx_iv_sz = tls_ctx->rx.ivlen; + unsigned char *rx_key = tls_ctx->rx.key; + + TRACE_ENTER(QUIC_EV_CONN_RXPKT, qc); + + ret = 0; + kp_changed = 0; + + if (pkt->type == QUIC_PACKET_TYPE_SHORT) { + /* The two tested bits are not at the same position, + * this is why they are first both inversed. + */ + if (!(*pkt->data & QUIC_PACKET_KEY_PHASE_BIT) ^ !(tls_ctx->flags & QUIC_FL_TLS_KP_BIT_SET)) { + if (pkt->pn < tls_ctx->rx.pn) { + /* The lowest packet number of a previous key phase + * cannot be null if it really stores previous key phase + * secrets. + */ + // TODO: check if BUG_ON() more suitable + if (!qc->ku.prv_rx.pn) { + TRACE_ERROR("null previous packet number", QUIC_EV_CONN_RXPKT, qc); + goto leave; + } + + rx_ctx = qc->ku.prv_rx.ctx; + rx_iv = qc->ku.prv_rx.iv; + rx_key = qc->ku.prv_rx.key; + } + else if (pkt->pn > qel->pktns->rx.largest_pn) { + /* Next key phase */ + TRACE_PROTO("Key phase changed", QUIC_EV_CONN_RXPKT, qc); + kp_changed = 1; + rx_ctx = qc->ku.nxt_rx.ctx; + rx_iv = qc->ku.nxt_rx.iv; + rx_key = qc->ku.nxt_rx.key; + } + } + } + + quic_aead_iv_build(iv, sizeof iv, rx_iv, rx_iv_sz, pkt->pn); + + ret = quic_tls_decrypt(pkt->data + pkt->aad_len, pkt->len - pkt->aad_len, + pkt->data, pkt->aad_len, + rx_ctx, tls_ctx->rx.aead, rx_key, iv); + if (!ret) { + TRACE_ERROR("quic_tls_decrypt() failed", QUIC_EV_CONN_RXPKT, qc); + goto leave; + } + + /* Update the keys only if the packet decryption succeeded. */ + if (kp_changed) { + quic_tls_rotate_keys(qc); + /* Toggle the Key Phase bit */ + tls_ctx->flags ^= QUIC_FL_TLS_KP_BIT_SET; + /* Store the lowest packet number received for the current key phase */ + tls_ctx->rx.pn = pkt->pn; + /* Prepare the next key update */ + if (!quic_tls_key_update(qc)) { + TRACE_ERROR("quic_tls_key_update() failed", QUIC_EV_CONN_RXPKT, qc); + goto leave; + } + } + + /* Update the packet length (required to parse the frames). */ + pkt->len -= QUIC_TLS_TAG_LEN; + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_RXPKT, qc); + return ret; +} + +/* Remove from <stream> the acknowledged frames. + * + * Returns 1 if at least one frame was removed else 0. 
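+ *
+ * Worked example (offsets are made up): with <stream> ack_offset at 100 and
+ * <acked_frms> holding STREAM frames at offsets 100 (len 50) and 200 (len 10),
+ * the first frame is consumed and ack_offset moves to 150, while the frame at
+ * offset 200 is kept since it lies beyond the contiguous acknowledged area.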
+ */ +static int quic_stream_try_to_consume(struct quic_conn *qc, + struct qc_stream_desc *stream) +{ + int ret; + struct eb64_node *frm_node; + + TRACE_ENTER(QUIC_EV_CONN_ACKSTRM, qc); + + ret = 0; + frm_node = eb64_first(&stream->acked_frms); + while (frm_node) { + struct qf_stream *strm_frm; + struct quic_frame *frm; + size_t offset, len; + + strm_frm = eb64_entry(frm_node, struct qf_stream, offset); + offset = strm_frm->offset.key; + len = strm_frm->len; + + if (offset > stream->ack_offset) + break; + + if (qc_stream_desc_ack(&stream, offset, len)) { + /* cf. next comment : frame may be freed at this stage. */ + TRACE_DEVEL("stream consumed", QUIC_EV_CONN_ACKSTRM, + qc, stream ? strm_frm : NULL, stream); + ret = 1; + } + + /* If stream is NULL after qc_stream_desc_ack(), it means frame + * has been freed. with the stream frames tree. Nothing to do + * anymore in here. + */ + if (!stream) { + qc_check_close_on_released_mux(qc); + ret = 1; + goto leave; + } + + frm_node = eb64_next(frm_node); + eb64_delete(&strm_frm->offset); + + frm = container_of(strm_frm, struct quic_frame, stream); + qc_release_frm(qc, frm); + } + + leave: + TRACE_LEAVE(QUIC_EV_CONN_ACKSTRM, qc); + return ret; +} + +/* Handle <frm> frame whose packet it is attached to has just been acknowledged. The memory allocated + * for this frame will be at least released in every cases. + * Never fail. + */ +static void qc_handle_newly_acked_frm(struct quic_conn *qc, struct quic_frame *frm) +{ + TRACE_ENTER(QUIC_EV_CONN_PRSAFRM, qc); + TRACE_PROTO("RX ack TX frm", QUIC_EV_CONN_PRSAFRM, qc, frm); + + switch (frm->type) { + case QUIC_FT_STREAM_8 ... QUIC_FT_STREAM_F: + { + struct qf_stream *strm_frm = &frm->stream; + struct eb64_node *node = NULL; + struct qc_stream_desc *stream = NULL; + const size_t offset = strm_frm->offset.key; + const size_t len = strm_frm->len; + + /* do not use strm_frm->stream as the qc_stream_desc instance + * might be freed at this stage. Use the id to do a proper + * lookup. + * + * TODO if lookup operation impact on the perf is noticeable, + * implement a refcount on qc_stream_desc instances. + */ + node = eb64_lookup(&qc->streams_by_id, strm_frm->id); + if (!node) { + TRACE_DEVEL("acked stream for released stream", QUIC_EV_CONN_ACKSTRM, qc, strm_frm); + qc_release_frm(qc, frm); + /* early return */ + goto leave; + } + stream = eb64_entry(node, struct qc_stream_desc, by_id); + + TRACE_DEVEL("acked stream", QUIC_EV_CONN_ACKSTRM, qc, strm_frm, stream); + if (offset <= stream->ack_offset) { + if (qc_stream_desc_ack(&stream, offset, len)) { + TRACE_DEVEL("stream consumed", QUIC_EV_CONN_ACKSTRM, + qc, strm_frm, stream); + } + + if (!stream) { + /* no need to continue if stream freed. */ + TRACE_DEVEL("stream released and freed", QUIC_EV_CONN_ACKSTRM, qc); + qc_release_frm(qc, frm); + qc_check_close_on_released_mux(qc); + break; + } + + TRACE_DEVEL("stream consumed", QUIC_EV_CONN_ACKSTRM, + qc, strm_frm, stream); + qc_release_frm(qc, frm); + } + else { + eb64_insert(&stream->acked_frms, &strm_frm->offset); + } + + quic_stream_try_to_consume(qc, stream); + } + break; + default: + qc_release_frm(qc, frm); + } + + leave: + TRACE_LEAVE(QUIC_EV_CONN_PRSAFRM, qc); +} + +/* Collect newly acknowledged TX packets from <pkts> ebtree into <newly_acked_pkts> + * list depending on <largest> and <smallest> packet number of a range of acknowledged + * packets announced in an ACK frame. <largest_node> may be provided to start + * looking from this packet node. 
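+ *
+ * For instance (packet numbers are made up), with TX packets 5, 6, 8 and 9 in
+ * <pkts> and an acknowledged range [5, 8], packets 5, 6 and 8 are appended to
+ * <newly_acked_pkts> and removed from the tree while packet 9 is left in place.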
+ */ +static void qc_newly_acked_pkts(struct quic_conn *qc, struct eb_root *pkts, + struct list *newly_acked_pkts, + struct eb64_node *largest_node, + uint64_t largest, uint64_t smallest) +{ + struct eb64_node *node; + struct quic_tx_packet *pkt; + + TRACE_ENTER(QUIC_EV_CONN_PRSAFRM, qc); + + node = eb64_lookup_ge(pkts, smallest); + if (!node) + goto leave; + + largest_node = largest_node ? largest_node : eb64_lookup_le(pkts, largest); + if (!largest_node) + goto leave; + + while (node && node->key <= largest_node->key) { + pkt = eb64_entry(node, struct quic_tx_packet, pn_node); + LIST_APPEND(newly_acked_pkts, &pkt->list); + node = eb64_next(node); + eb64_delete(&pkt->pn_node); + } + + leave: + TRACE_LEAVE(QUIC_EV_CONN_PRSAFRM, qc); +} + +/* Handle <newly_acked_pkts> list of newly acknowledged TX packets */ +static void qc_handle_newly_acked_pkts(struct quic_conn *qc, + unsigned int *pkt_flags, struct list *newly_acked_pkts) +{ + struct quic_tx_packet *pkt, *tmp; + + TRACE_ENTER(QUIC_EV_CONN_PRSAFRM, qc); + + list_for_each_entry_safe(pkt, tmp, newly_acked_pkts, list) { + struct quic_frame *frm, *frmbak; + + *pkt_flags |= pkt->flags; + TRACE_DEVEL("Removing packet #", QUIC_EV_CONN_PRSAFRM, qc, NULL, &pkt->pn_node.key); + list_for_each_entry_safe(frm, frmbak, &pkt->frms, list) + qc_handle_newly_acked_frm(qc, frm); + /* If there are others packet in the same datagram <pkt> is attached to, + * detach the previous one and the next one from <pkt>. + */ + quic_tx_packet_dgram_detach(pkt); + eb64_delete(&pkt->pn_node); + } + + leave: + TRACE_LEAVE(QUIC_EV_CONN_PRSAFRM, qc); +} + +/* Handle all frames sent from <pkt> packet and reinsert them in the same order + * they have been sent into <pktns_frm_list>. The loss counter of each frame is + * incremented and checked if it does not exceed retransmission limit. + * + * Returns 1 on success, 0 if a frame loss limit is exceeded. A + * CONNECTION_CLOSE is scheduled in this case. + */ +int qc_handle_frms_of_lost_pkt(struct quic_conn *qc, + struct quic_tx_packet *pkt, + struct list *pktns_frm_list) +{ + struct quic_frame *frm, *frmbak; + struct list *pkt_frm_list = &pkt->frms; + uint64_t pn = pkt->pn_node.key; + int close = 0; + + TRACE_ENTER(QUIC_EV_CONN_PRSAFRM, qc); + + list_for_each_entry_safe(frm, frmbak, pkt_frm_list, list) { + /* First remove this frame from the packet it was attached to */ + LIST_DEL_INIT(&frm->list); + quic_tx_packet_refdec(pkt); + /* At this time, this frame is not freed but removed from its packet */ + frm->pkt = NULL; + /* Remove any reference to this frame */ + qc_frm_unref(frm, qc); + switch (frm->type) { + case QUIC_FT_STREAM_8 ... 
QUIC_FT_STREAM_F: + { + struct qf_stream *strm_frm = &frm->stream; + struct eb64_node *node = NULL; + struct qc_stream_desc *stream_desc; + + node = eb64_lookup(&qc->streams_by_id, strm_frm->id); + if (!node) { + TRACE_DEVEL("released stream", QUIC_EV_CONN_PRSAFRM, qc, frm); + TRACE_DEVEL("freeing frame from packet", QUIC_EV_CONN_PRSAFRM, + qc, frm, &pn); + qc_frm_free(qc, &frm); + continue; + } + + stream_desc = eb64_entry(node, struct qc_stream_desc, by_id); + /* Do not resend this frame if in the "already acked range" */ + if (strm_frm->offset.key + strm_frm->len <= stream_desc->ack_offset) { + TRACE_DEVEL("ignored frame in already acked range", + QUIC_EV_CONN_PRSAFRM, qc, frm); + qc_frm_free(qc, &frm); + continue; + } + else if (strm_frm->offset.key < stream_desc->ack_offset) { + uint64_t diff = stream_desc->ack_offset - strm_frm->offset.key; + + qc_stream_frm_mv_fwd(frm, diff); + TRACE_DEVEL("updated partially acked frame", + QUIC_EV_CONN_PRSAFRM, qc, frm); + } + break; + } + + default: + break; + } + + /* Do not resend probing packet with old data */ + if (pkt->flags & QUIC_FL_TX_PACKET_PROBE_WITH_OLD_DATA) { + TRACE_DEVEL("ignored frame with old data from packet", QUIC_EV_CONN_PRSAFRM, + qc, frm, &pn); + if (frm->origin) + LIST_DEL_INIT(&frm->ref); + qc_frm_free(qc, &frm); + continue; + } + + if (frm->flags & QUIC_FL_TX_FRAME_ACKED) { + TRACE_DEVEL("already acked frame", QUIC_EV_CONN_PRSAFRM, qc, frm); + TRACE_DEVEL("freeing frame from packet", QUIC_EV_CONN_PRSAFRM, + qc, frm, &pn); + qc_frm_free(qc, &frm); + } + else { + if (++frm->loss_count >= global.tune.quic_max_frame_loss) { + TRACE_ERROR("retransmission limit reached, closing the connection", QUIC_EV_CONN_PRSAFRM, qc); + quic_set_connection_close(qc, quic_err_transport(QC_ERR_INTERNAL_ERROR)); + qc_notify_err(qc); + close = 1; + } + + LIST_APPEND(pktns_frm_list, &frm->list); + TRACE_DEVEL("frame requeued", QUIC_EV_CONN_PRSAFRM, qc, frm); + } + } + + end: + TRACE_LEAVE(QUIC_EV_CONN_PRSAFRM, qc); + return !close; +} + +/* Send a packet ack event nofication for each newly acked packet of + * <newly_acked_pkts> list and free them. + * Always succeeds. + */ +static void qc_notify_cc_of_newly_acked_pkts(struct quic_conn *qc, + struct list *newly_acked_pkts) +{ + struct quic_tx_packet *pkt, *tmp; + struct quic_cc_event ev = { .type = QUIC_CC_EVT_ACK, }; + + TRACE_ENTER(QUIC_EV_CONN_PRSAFRM, qc); + + list_for_each_entry_safe(pkt, tmp, newly_acked_pkts, list) { + pkt->pktns->tx.in_flight -= pkt->in_flight_len; + qc->path->prep_in_flight -= pkt->in_flight_len; + qc->path->in_flight -= pkt->in_flight_len; + if (pkt->flags & QUIC_FL_TX_PACKET_ACK_ELICITING) + qc->path->ifae_pkts--; + /* If this packet contained an ACK frame, proceed to the + * acknowledging of range of acks from the largest acknowledged + * packet number which was sent in an ACK frame by this packet. + */ + if (pkt->largest_acked_pn != -1) + qc_treat_ack_of_ack(qc, &pkt->pktns->rx.arngs, pkt->largest_acked_pn); + ev.ack.acked = pkt->in_flight_len; + ev.ack.time_sent = pkt->time_sent; + quic_cc_event(&qc->path->cc, &ev); + LIST_DEL_INIT(&pkt->list); + quic_tx_packet_refdec(pkt); + } + + TRACE_LEAVE(QUIC_EV_CONN_PRSAFRM, qc); + +} + +/* Parse ACK frame into <frm> from a buffer at <buf> address with <end> being at + * one byte past the end of this buffer. Also update <rtt_sample> if needed, i.e. + * if the largest acked packet was newly acked and if there was at least one newly + * acked ack-eliciting packet. + * Return 1, if succeeded, 0 if not. 
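+ *
+ * As a reminder of the RFC 9000 ACK Range encoding walked through below
+ * (values are made up): with Largest Acknowledged 100 and First ACK Range 10,
+ * the first range is [90, 100]; a following Gap of 2 with an ACK Range of 5
+ * then yields largest = 90 - 2 - 2 = 86 and smallest = 86 - 5 = 81, i.e. the
+ * range [81, 86], exactly as computed by the loop below.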
+ */ +static int qc_parse_ack_frm(struct quic_conn *qc, + struct quic_frame *frm, + struct quic_enc_level *qel, + unsigned int *rtt_sample, + const unsigned char **pos, const unsigned char *end) +{ + struct qf_ack *ack_frm = &frm->ack; + uint64_t smallest, largest; + struct eb_root *pkts; + struct eb64_node *largest_node; + unsigned int time_sent, pkt_flags; + struct list newly_acked_pkts = LIST_HEAD_INIT(newly_acked_pkts); + struct list lost_pkts = LIST_HEAD_INIT(lost_pkts); + int ret = 0, new_largest_acked_pn = 0; + struct quic_tx_packet *pkt, *tmp; + + TRACE_ENTER(QUIC_EV_CONN_PRSAFRM, qc); + + pkts = &qel->pktns->tx.pkts; + if (ack_frm->largest_ack > qel->pktns->tx.next_pn) { + TRACE_DEVEL("ACK for not sent packet", QUIC_EV_CONN_PRSAFRM, + qc, NULL, &ack_frm->largest_ack); + goto err; + } + + if (ack_frm->first_ack_range > ack_frm->largest_ack) { + TRACE_DEVEL("too big first ACK range", QUIC_EV_CONN_PRSAFRM, + qc, NULL, &ack_frm->first_ack_range); + goto err; + } + + largest = ack_frm->largest_ack; + smallest = largest - ack_frm->first_ack_range; + pkt_flags = 0; + largest_node = NULL; + time_sent = 0; + + if ((int64_t)ack_frm->largest_ack > qel->pktns->rx.largest_acked_pn) { + largest_node = eb64_lookup(pkts, largest); + if (!largest_node) { + TRACE_DEVEL("Largest acked packet not found", + QUIC_EV_CONN_PRSAFRM, qc); + } + else { + time_sent = eb64_entry(largest_node, + struct quic_tx_packet, pn_node)->time_sent; + new_largest_acked_pn = 1; + } + } + + TRACE_PROTO("RX ack range", QUIC_EV_CONN_PRSAFRM, + qc, NULL, &largest, &smallest); + do { + uint64_t gap, ack_range; + + qc_newly_acked_pkts(qc, pkts, &newly_acked_pkts, + largest_node, largest, smallest); + if (!ack_frm->ack_range_num--) + break; + + if (!quic_dec_int(&gap, pos, end)) { + TRACE_ERROR("quic_dec_int(gap) failed", QUIC_EV_CONN_PRSAFRM, qc); + goto err; + } + + if (smallest < gap + 2) { + TRACE_DEVEL("wrong gap value", QUIC_EV_CONN_PRSAFRM, + qc, NULL, &gap, &smallest); + goto err; + } + + largest = smallest - gap - 2; + if (!quic_dec_int(&ack_range, pos, end)) { + TRACE_ERROR("quic_dec_int(ack_range) failed", QUIC_EV_CONN_PRSAFRM, qc); + goto err; + } + + if (largest < ack_range) { + TRACE_DEVEL("wrong ack range value", QUIC_EV_CONN_PRSAFRM, + qc, NULL, &largest, &ack_range); + goto err; + } + + /* Do not use this node anymore. */ + largest_node = NULL; + /* Next range */ + smallest = largest - ack_range; + + TRACE_PROTO("RX next ack range", QUIC_EV_CONN_PRSAFRM, + qc, NULL, &largest, &smallest); + } while (1); + + if (!LIST_ISEMPTY(&newly_acked_pkts)) { + qc_handle_newly_acked_pkts(qc, &pkt_flags, &newly_acked_pkts); + if (new_largest_acked_pn && (pkt_flags & QUIC_FL_TX_PACKET_ACK_ELICITING)) { + *rtt_sample = tick_remain(time_sent, now_ms); + qel->pktns->rx.largest_acked_pn = ack_frm->largest_ack; + } + + if (!eb_is_empty(&qel->pktns->tx.pkts)) { + qc_packet_loss_lookup(qel->pktns, qc, &lost_pkts); + if (!qc_release_lost_pkts(qc, qel->pktns, &lost_pkts, now_ms)) + goto leave; + } + qc_notify_cc_of_newly_acked_pkts(qc, &newly_acked_pkts); + if (quic_peer_validated_addr(qc)) + qc->path->loss.pto_count = 0; + qc_set_timer(qc); + qc_notify_send(qc); + } + + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_PRSAFRM, qc); + return ret; + + err: + /* Move back these packets into their tree. */ + list_for_each_entry_safe(pkt, tmp, &newly_acked_pkts, list) { + LIST_DEL_INIT(&pkt->list); + eb64_insert(pkts, &pkt->pn_node); + } + goto leave; +} + +/* Parse a STREAM frame <strm_frm> received in <pkt> packet for <qc> + * connection. 
<fin> is true if FIN bit is set on frame type. + * + * Return 1 on success. On error, 0 is returned. In this case, the packet + * containing the frame must not be acknowledged. + */ +static int qc_handle_strm_frm(struct quic_rx_packet *pkt, + struct qf_stream *strm_frm, + struct quic_conn *qc, char fin) +{ + int ret; + + /* RFC9000 13.1. Packet Processing + * + * A packet MUST NOT be acknowledged until packet protection has been + * successfully removed and all frames contained in the packet have + * been processed. For STREAM frames, this means the data has been + * enqueued in preparation to be received by the application protocol, + * but it does not require that data be delivered and consumed. + */ + TRACE_ENTER(QUIC_EV_CONN_PRSFRM, qc); + + ret = qcc_recv(qc->qcc, strm_frm->id, strm_frm->len, + strm_frm->offset.key, fin, (char *)strm_frm->data); + + /* frame rejected - packet must not be acknowledeged */ + TRACE_LEAVE(QUIC_EV_CONN_PRSFRM, qc); + return !ret; +} + +/* Parse <frm> CRYPTO frame coming with <pkt> packet at <qel> <qc> connectionn. + * Returns 1 if succeeded, 0 if not. Also set <*fast_retrans> to 1 if the + * speed up handshake completion may be run after having received duplicated + * CRYPTO data. + */ +static int qc_handle_crypto_frm(struct quic_conn *qc, + struct qf_crypto *crypto_frm, struct quic_rx_packet *pkt, + struct quic_enc_level *qel, int *fast_retrans) +{ + int ret = 0; + enum ncb_ret ncb_ret; + /* XXX TO DO: <cfdebug> is used only for the traces. */ + struct quic_rx_crypto_frm cfdebug = { + .offset_node.key = crypto_frm->offset, + .len = crypto_frm->len, + }; + struct quic_cstream *cstream = qel->cstream; + struct ncbuf *ncbuf = &qel->cstream->rx.ncbuf; + + TRACE_ENTER(QUIC_EV_CONN_PRSHPKT, qc); + + if (unlikely(crypto_frm->offset < cstream->rx.offset)) { + size_t diff; + + if (crypto_frm->offset + crypto_frm->len <= cstream->rx.offset) { + /* Nothing to do */ + TRACE_PROTO("Already received CRYPTO data", + QUIC_EV_CONN_RXPKT, qc, pkt, &cfdebug); + if (qc_is_listener(qc) && qel == qc->iel && + !(qc->flags & QUIC_FL_CONN_HANDSHAKE_SPEED_UP)) + *fast_retrans = 1; + goto done; + } + + TRACE_PROTO("Partially already received CRYPTO data", + QUIC_EV_CONN_RXPKT, qc, pkt, &cfdebug); + + diff = cstream->rx.offset - crypto_frm->offset; + crypto_frm->len -= diff; + crypto_frm->data += diff; + crypto_frm->offset = cstream->rx.offset; + } + + if (crypto_frm->offset == cstream->rx.offset && ncb_is_empty(ncbuf)) { + struct qf_crypto *qf_crypto; + + qf_crypto = pool_alloc(pool_head_qf_crypto); + if (!qf_crypto) { + TRACE_ERROR("CRYPTO frame allocation failed", QUIC_EV_CONN_PRSHPKT, qc); + goto leave; + } + + qf_crypto->offset = crypto_frm->offset; + qf_crypto->len = crypto_frm->len; + qf_crypto->data = crypto_frm->data; + qf_crypto->qel = qel; + LIST_APPEND(&qel->rx.crypto_frms, &qf_crypto->list); + + cstream->rx.offset += crypto_frm->len; + HA_ATOMIC_OR(&qc->wait_event.tasklet->state, TASK_HEAVY); + TRACE_DEVEL("increment crypto level offset", QUIC_EV_CONN_PHPKTS, qc, qel); + goto done; + } + + if (!quic_get_ncbuf(ncbuf) || + ncb_is_null(ncbuf)) { + TRACE_ERROR("CRYPTO ncbuf allocation failed", QUIC_EV_CONN_PRSHPKT, qc); + goto leave; + } + + /* crypto_frm->offset > cstream-trx.offset */ + ncb_ret = ncb_add(ncbuf, crypto_frm->offset - cstream->rx.offset, + (const char *)crypto_frm->data, crypto_frm->len, NCB_ADD_COMPARE); + if (ncb_ret != NCB_RET_OK) { + if (ncb_ret == NCB_RET_DATA_REJ) { + TRACE_ERROR("overlapping data rejected", QUIC_EV_CONN_PRSHPKT, qc); + 
quic_set_connection_close(qc, quic_err_transport(QC_ERR_PROTOCOL_VIOLATION)); + qc_notify_err(qc); + } + else if (ncb_ret == NCB_RET_GAP_SIZE) { + TRACE_ERROR("cannot bufferize frame due to gap size limit", + QUIC_EV_CONN_PRSHPKT, qc); + } + goto leave; + } + + if (ncb_data(ncbuf, 0)) + HA_ATOMIC_OR(&qc->wait_event.tasklet->state, TASK_HEAVY); + + done: + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_PRSHPKT, qc); + return ret; +} + +/* Handle RETIRE_CONNECTION_ID frame from <frm> frame. + * Return 1 if succeeded, 0 if not. If succeeded, also set <to_retire> + * to the CID to be retired if not already retired. + */ +static int qc_handle_retire_connection_id_frm(struct quic_conn *qc, + struct quic_frame *frm, + struct quic_cid *dcid, + struct quic_connection_id **to_retire) +{ + int ret = 0; + struct qf_retire_connection_id *rcid_frm = &frm->retire_connection_id; + struct eb64_node *node; + struct quic_connection_id *conn_id; + + TRACE_ENTER(QUIC_EV_CONN_PRSHPKT, qc); + + /* RFC 9000 19.16. RETIRE_CONNECTION_ID Frames: + * Receipt of a RETIRE_CONNECTION_ID frame containing a sequence number greater + * than any previously sent to the peer MUST be treated as a connection error + * of type PROTOCOL_VIOLATION. + */ + if (rcid_frm->seq_num >= qc->next_cid_seq_num) { + TRACE_PROTO("CID seq. number too big", QUIC_EV_CONN_PSTRM, qc, frm); + goto protocol_violation; + } + + /* RFC 9000 19.16. RETIRE_CONNECTION_ID Frames: + * The sequence number specified in a RETIRE_CONNECTION_ID frame MUST NOT refer to + * the Destination Connection ID field of the packet in which the frame is contained. + * The peer MAY treat this as a connection error of type PROTOCOL_VIOLATION. + */ + node = eb64_lookup(qc->cids, rcid_frm->seq_num); + if (!node) { + TRACE_PROTO("CID already retired", QUIC_EV_CONN_PSTRM, qc, frm); + goto out; + } + + conn_id = eb64_entry(node, struct quic_connection_id, seq_num); + /* Note that the length of <dcid> has already been checked. It must match the + * length of the CIDs which have been provided to the peer. + */ + if (!memcmp(dcid->data, conn_id->cid.data, QUIC_HAP_CID_LEN)) { + TRACE_PROTO("cannot retire the current CID", QUIC_EV_CONN_PSTRM, qc, frm); + goto protocol_violation; + } + + *to_retire = conn_id; + out: + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_PRSHPKT, qc); + return ret; + protocol_violation: + quic_set_connection_close(qc, quic_err_transport(QC_ERR_PROTOCOL_VIOLATION)); + qc_notify_err(qc); + goto leave; +} + +/* Returns the <ack_delay> field value in milliseconds from <ack_frm> ACK frame for + * <conn> QUIC connection. Note that the value of <ack_delay> coming from + * ACK frame is in microseconds. + */ +static inline unsigned int quic_ack_delay_ms(struct qf_ack *ack_frm, + struct quic_conn *conn) +{ + return (ack_frm->ack_delay << conn->tx.params.ack_delay_exponent) / 1000; +} + +/* Parse all the frames of <pkt> QUIC packet for QUIC connection <qc> and <qel> + * as encryption level. + * Returns 1 if succeeded, 0 if failed. + */ +static int qc_parse_pkt_frms(struct quic_conn *qc, struct quic_rx_packet *pkt, + struct quic_enc_level *qel) +{ + struct quic_frame frm; + const unsigned char *pos, *end; + int fast_retrans = 0, ret = 0; + + TRACE_ENTER(QUIC_EV_CONN_PRSHPKT, qc); + /* Skip the AAD */ + pos = pkt->data + pkt->aad_len; + end = pkt->data + pkt->len; + + /* Packet with no frame. */ + if (pos == end) { + /* RFC9000 12.4. 
Frames and Frame Types + * + * The payload of a packet that contains frames MUST contain at least + * one frame, and MAY contain multiple frames and multiple frame types. + * An endpoint MUST treat receipt of a packet containing no frames as a + * connection error of type PROTOCOL_VIOLATION. Frames always fit within + * a single QUIC packet and cannot span multiple packets. + */ + quic_set_connection_close(qc, quic_err_transport(QC_ERR_PROTOCOL_VIOLATION)); + goto leave; + } + + while (pos < end) { + if (!qc_parse_frm(&frm, pkt, &pos, end, qc)) { + // trace already emitted by function above + goto leave; + } + + switch (frm.type) { + case QUIC_FT_PADDING: + break; + case QUIC_FT_PING: + break; + case QUIC_FT_ACK: + { + unsigned int rtt_sample; + rtt_sample = UINT_MAX; + + if (!qc_parse_ack_frm(qc, &frm, qel, &rtt_sample, &pos, end)) { + // trace already emitted by function above + goto leave; + } + + if (rtt_sample != UINT_MAX) { + unsigned int ack_delay; + + ack_delay = !quic_application_pktns(qel->pktns, qc) ? 0 : + qc->state >= QUIC_HS_ST_CONFIRMED ? + MS_TO_TICKS(QUIC_MIN(quic_ack_delay_ms(&frm.ack, qc), qc->max_ack_delay)) : + MS_TO_TICKS(quic_ack_delay_ms(&frm.ack, qc)); + quic_loss_srtt_update(&qc->path->loss, rtt_sample, ack_delay, qc); + } + break; + } + case QUIC_FT_RESET_STREAM: + if (qc->mux_state == QC_MUX_READY) { + struct qf_reset_stream *rs_frm = &frm.reset_stream; + qcc_recv_reset_stream(qc->qcc, rs_frm->id, rs_frm->app_error_code, rs_frm->final_size); + } + break; + case QUIC_FT_STOP_SENDING: + { + struct qf_stop_sending *ss_frm = &frm.stop_sending; + if (qc->mux_state == QC_MUX_READY) { + if (qcc_recv_stop_sending(qc->qcc, ss_frm->id, + ss_frm->app_error_code)) { + TRACE_ERROR("qcc_recv_stop_sending() failed", QUIC_EV_CONN_PRSHPKT, qc); + goto leave; + } + } + break; + } + case QUIC_FT_CRYPTO: + if (!qc_handle_crypto_frm(qc, &frm.crypto, pkt, qel, &fast_retrans)) + goto leave; + break; + case QUIC_FT_STREAM_8 ... QUIC_FT_STREAM_F: + { + struct qf_stream *strm_frm = &frm.stream; + unsigned nb_streams = qc->rx.strms[qcs_id_type(strm_frm->id)].nb_streams; + const char fin = frm.type & QUIC_STREAM_FRAME_TYPE_FIN_BIT; + + /* The upper layer may not be allocated. 
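+			 * This typically happens for STREAM frames received before the mux
+			 * is started or after the upper layer was released: frames on
+			 * already closed streams are simply ignored while, for HTTP/3, a
+			 * new stream is rejected below so that the client may retry it.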
*/ + if (qc->mux_state != QC_MUX_READY) { + if ((strm_frm->id >> QCS_ID_TYPE_SHIFT) < nb_streams) { + TRACE_DATA("Already closed stream", QUIC_EV_CONN_PRSHPKT, qc); + } + else { + TRACE_DEVEL("No mux for new stream", QUIC_EV_CONN_PRSHPKT, qc); + if (qc->app_ops == &h3_ops) { + if (!qc_h3_request_reject(qc, strm_frm->id)) { + TRACE_ERROR("error on request rejection", QUIC_EV_CONN_PRSHPKT, qc); + /* This packet will not be acknowledged */ + goto leave; + } + } + else { + /* This packet will not be acknowledged */ + goto leave; + } + } + + break; + } + + if (!qc_handle_strm_frm(pkt, strm_frm, qc, fin)) { + TRACE_ERROR("qc_handle_strm_frm() failed", QUIC_EV_CONN_PRSHPKT, qc); + goto leave; + } + + break; + } + case QUIC_FT_MAX_DATA: + if (qc->mux_state == QC_MUX_READY) { + struct qf_max_data *md_frm = &frm.max_data; + qcc_recv_max_data(qc->qcc, md_frm->max_data); + } + break; + case QUIC_FT_MAX_STREAM_DATA: + if (qc->mux_state == QC_MUX_READY) { + struct qf_max_stream_data *msd_frm = &frm.max_stream_data; + if (qcc_recv_max_stream_data(qc->qcc, msd_frm->id, + msd_frm->max_stream_data)) { + TRACE_ERROR("qcc_recv_max_stream_data() failed", QUIC_EV_CONN_PRSHPKT, qc); + goto leave; + } + } + break; + case QUIC_FT_MAX_STREAMS_BIDI: + case QUIC_FT_MAX_STREAMS_UNI: + break; + case QUIC_FT_DATA_BLOCKED: + qc->cntrs.data_blocked++; + break; + case QUIC_FT_STREAM_DATA_BLOCKED: + qc->cntrs.stream_data_blocked++; + break; + case QUIC_FT_STREAMS_BLOCKED_BIDI: + qc->cntrs.streams_blocked_bidi++; + break; + case QUIC_FT_STREAMS_BLOCKED_UNI: + qc->cntrs.streams_blocked_uni++; + break; + case QUIC_FT_NEW_CONNECTION_ID: + /* XXX TO DO XXX */ + break; + case QUIC_FT_RETIRE_CONNECTION_ID: + { + struct quic_cid_tree *tree; + struct quic_connection_id *conn_id = NULL; + + if (!qc_handle_retire_connection_id_frm(qc, &frm, &pkt->dcid, &conn_id)) + goto leave; + + if (!conn_id) + break; + + tree = &quic_cid_trees[quic_cid_tree_idx(&conn_id->cid)]; + HA_RWLOCK_WRLOCK(QC_CID_LOCK, &tree->lock); + ebmb_delete(&conn_id->node); + HA_RWLOCK_WRUNLOCK(QC_CID_LOCK, &tree->lock); + eb64_delete(&conn_id->seq_num); + pool_free(pool_head_quic_connection_id, conn_id); + TRACE_PROTO("CID retired", QUIC_EV_CONN_PSTRM, qc); + + conn_id = new_quic_cid(qc->cids, qc, NULL, NULL); + if (!conn_id) { + TRACE_ERROR("CID allocation error", QUIC_EV_CONN_IO_CB, qc); + } + else { + quic_cid_insert(conn_id); + qc_build_new_connection_id_frm(qc, conn_id); + } + break; + } + case QUIC_FT_CONNECTION_CLOSE: + case QUIC_FT_CONNECTION_CLOSE_APP: + /* Increment the error counters */ + quic_conn_closed_err_count_inc(qc, &frm); + if (!(qc->flags & QUIC_FL_CONN_DRAINING)) { + TRACE_STATE("Entering draining state", QUIC_EV_CONN_PRSHPKT, qc); + /* RFC 9000 10.2. Immediate Close: + * The closing and draining connection states exist to ensure + * that connections close cleanly and that delayed or reordered + * packets are properly discarded. These states SHOULD persist + * for at least three times the current PTO interval... + * + * Rearm the idle timeout only one time when entering draining + * state. 
+				 */
+				qc->flags |= QUIC_FL_CONN_DRAINING|QUIC_FL_CONN_IMMEDIATE_CLOSE;
+				qc_detach_th_ctx_list(qc, 1);
+				qc_idle_timer_do_rearm(qc, 0);
+				qc_notify_err(qc);
+			}
+			break;
+		case QUIC_FT_HANDSHAKE_DONE:
+			if (qc_is_listener(qc)) {
+				TRACE_ERROR("non accepted QUIC_FT_HANDSHAKE_DONE frame",
+				            QUIC_EV_CONN_PRSHPKT, qc);
+				goto leave;
+			}
+
+			qc->state = QUIC_HS_ST_CONFIRMED;
+			break;
+		default:
+			TRACE_ERROR("unknown frame type", QUIC_EV_CONN_PRSHPKT, qc);
+			goto leave;
+		}
+	}
+
+	if (fast_retrans && qc->iel && qc->hel) {
+		struct quic_enc_level *iqel = qc->iel;
+		struct quic_enc_level *hqel = qc->hel;
+
+		TRACE_PROTO("speeding up handshake completion", QUIC_EV_CONN_PRSHPKT, qc);
+		qc_prep_hdshk_fast_retrans(qc, &iqel->pktns->tx.frms, &hqel->pktns->tx.frms);
+		qc->flags |= QUIC_FL_CONN_HANDSHAKE_SPEED_UP;
+	}
+
+	/* The server must switch from INITIAL to HANDSHAKE handshake state when it
+	 * has successfully parsed a Handshake packet. The Initial encryption keys
+	 * must also be discarded.
+	 */
+	if (pkt->type == QUIC_PACKET_TYPE_HANDSHAKE && qc_is_listener(qc)) {
+		if (qc->state >= QUIC_HS_ST_SERVER_INITIAL) {
+			if (qc->ipktns && !quic_tls_pktns_is_dcd(qc, qc->ipktns)) {
+				/* Discard the Initial packet number space. */
+				TRACE_PROTO("discarding Initial pktns", QUIC_EV_CONN_PRSHPKT, qc);
+				quic_pktns_discard(qc->ipktns, qc);
+				qc_set_timer(qc);
+				qc_el_rx_pkts_del(qc->iel);
+				qc_release_pktns_frms(qc, qc->ipktns);
+			}
+			if (qc->state < QUIC_HS_ST_SERVER_HANDSHAKE)
+				qc->state = QUIC_HS_ST_SERVER_HANDSHAKE;
+		}
+	}
+
+	ret = 1;
+ leave:
+	TRACE_LEAVE(QUIC_EV_CONN_PRSHPKT, qc);
+	return ret;
+}
+
+/* Detect the value of the spin bit to be used. */
+static inline void qc_handle_spin_bit(struct quic_conn *qc, struct quic_rx_packet *pkt,
+                                      struct quic_enc_level *qel)
+{
+	uint64_t largest_pn = qel->pktns->rx.largest_pn;
+
+	if (qel != qc->ael || largest_pn == -1 ||
+	    pkt->pn <= largest_pn)
+		return;
+
+	if (qc_is_listener(qc)) {
+		if (pkt->flags & QUIC_FL_RX_PACKET_SPIN_BIT)
+			qc->flags |= QUIC_FL_CONN_SPIN_BIT;
+		else
+			qc->flags &= ~QUIC_FL_CONN_SPIN_BIT;
+	}
+	else {
+		if (pkt->flags & QUIC_FL_RX_PACKET_SPIN_BIT)
+			qc->flags &= ~QUIC_FL_CONN_SPIN_BIT;
+		else
+			qc->flags |= QUIC_FL_CONN_SPIN_BIT;
+	}
+}
+
+/* Remove the header protection of packets at <el> encryption level.
+ * Always succeeds.
+ */
+static void qc_rm_hp_pkts(struct quic_conn *qc, struct quic_enc_level *el)
+{
+	struct quic_rx_packet *pqpkt, *pkttmp;
+
+	TRACE_ENTER(QUIC_EV_CONN_ELRMHP, qc);
+	/* A server must not process incoming 1-RTT packets before the handshake is complete. */
+	if (el == qc->ael && qc_is_listener(qc) && qc->state < QUIC_HS_ST_COMPLETE) {
+		TRACE_PROTO("RX hp not removed (handshake not completed)",
+		            QUIC_EV_CONN_ELRMHP, qc);
+		goto out;
+	}
+
+	list_for_each_entry_safe(pqpkt, pkttmp, &el->rx.pqpkts, list) {
+		struct quic_tls_ctx *tls_ctx;
+
+		tls_ctx = qc_select_tls_ctx(qc, el, pqpkt->type, pqpkt->version);
+		if (!qc_do_rm_hp(qc, pqpkt, tls_ctx, el->pktns->rx.largest_pn,
+		                 pqpkt->data + pqpkt->pn_offset, pqpkt->data)) {
+			TRACE_ERROR("RX hp removing error", QUIC_EV_CONN_ELRMHP, qc);
+		}
+		else {
+			qc_handle_spin_bit(qc, pqpkt, el);
+			/* The AAD includes the packet number field */
+			pqpkt->aad_len = pqpkt->pn_offset + pqpkt->pnl;
+			/* Store the packet into the tree of packets to decrypt.
*/ + pqpkt->pn_node.key = pqpkt->pn; + eb64_insert(&el->rx.pkts, &pqpkt->pn_node); + quic_rx_packet_refinc(pqpkt); + TRACE_PROTO("RX hp removed", QUIC_EV_CONN_ELRMHP, qc, pqpkt); + } + LIST_DELETE(&pqpkt->list); + quic_rx_packet_refdec(pqpkt); + } + + out: + TRACE_LEAVE(QUIC_EV_CONN_ELRMHP, qc); +} + +/* Process all the CRYPTO frame at <el> encryption level. This is the + * responsibility of the called to ensure there exists a CRYPTO data + * stream for this level. + * Return 1 if succeeded, 0 if not. + */ +int qc_treat_rx_crypto_frms(struct quic_conn *qc, struct quic_enc_level *el, + struct ssl_sock_ctx *ctx) +{ + int ret = 0; + struct ncbuf *ncbuf; + struct quic_cstream *cstream = el->cstream; + ncb_sz_t data; + + TRACE_ENTER(QUIC_EV_CONN_PHPKTS, qc); + + BUG_ON(!cstream); + ncbuf = &cstream->rx.ncbuf; + if (ncb_is_null(ncbuf)) + goto done; + + /* TODO not working if buffer is wrapping */ + while ((data = ncb_data(ncbuf, 0))) { + const unsigned char *cdata = (const unsigned char *)ncb_head(ncbuf); + + if (!qc_ssl_provide_quic_data(&el->cstream->rx.ncbuf, el->level, + ctx, cdata, data)) + goto leave; + + cstream->rx.offset += data; + TRACE_DEVEL("buffered crypto data were provided to TLS stack", + QUIC_EV_CONN_PHPKTS, qc, el); + } + + done: + ret = 1; + leave: + if (!ncb_is_null(ncbuf) && ncb_is_empty(ncbuf)) { + TRACE_DEVEL("freeing crypto buf", QUIC_EV_CONN_PHPKTS, qc, el); + quic_free_ncbuf(ncbuf); + } + TRACE_LEAVE(QUIC_EV_CONN_PHPKTS, qc); + return ret; +} + +/* Check if it's possible to remove header protection for packets related to + * encryption level <qel>. If <qel> is NULL, assume it's false. + * + * Return true if the operation is possible else false. + */ +static int qc_qel_may_rm_hp(struct quic_conn *qc, struct quic_enc_level *qel) +{ + int ret = 0; + + TRACE_ENTER(QUIC_EV_CONN_TRMHP, qc); + + if (!qel) + goto cant_rm_hp; + + if (!quic_tls_has_rx_sec(qel)) { + TRACE_PROTO("non available secrets", QUIC_EV_CONN_TRMHP, qc); + goto cant_rm_hp; + } + + if (qel == qc->ael && qc->state < QUIC_HS_ST_COMPLETE) { + TRACE_PROTO("handshake not complete", QUIC_EV_CONN_TRMHP, qc); + goto cant_rm_hp; + } + + /* check if the connection layer is ready before using app level */ + if ((qel == qc->ael || qel == qc->eel) && + qc->mux_state == QC_MUX_NULL) { + TRACE_PROTO("connection layer not ready", QUIC_EV_CONN_TRMHP, qc); + goto cant_rm_hp; + } + + ret = 1; + cant_rm_hp: + TRACE_LEAVE(QUIC_EV_CONN_TRMHP, qc); + return ret; +} + +/* Process all the packets for all the encryption levels listed in <qc> QUIC connection. + * Return 1 if succeeded, 0 if not. 
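+ *
+ * In outline, for each encryption level this means: remove the header
+ * protection of the packets which could not be unprotected yet, decrypt and
+ * parse the decryptable ones in packet number order, update the ranges to
+ * acknowledge and the largest received packet number, and finally release
+ * the Initial encryption level once its packet number space was discarded.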
+ */ +int qc_treat_rx_pkts(struct quic_conn *qc) +{ + int ret = 0; + struct eb64_node *node; + int64_t largest_pn = -1; + unsigned int largest_pn_time_received = 0; + struct quic_enc_level *qel, *qelbak; + + TRACE_ENTER(QUIC_EV_CONN_RXPKT, qc); + + list_for_each_entry_safe(qel, qelbak, &qc->qel_list, list) { + /* Treat packets waiting for header packet protection decryption */ + if (!LIST_ISEMPTY(&qel->rx.pqpkts) && qc_qel_may_rm_hp(qc, qel)) + qc_rm_hp_pkts(qc, qel); + + node = eb64_first(&qel->rx.pkts); + while (node) { + struct quic_rx_packet *pkt; + + pkt = eb64_entry(node, struct quic_rx_packet, pn_node); + TRACE_DATA("new packet", QUIC_EV_CONN_RXPKT, + qc, pkt, NULL, qc->xprt_ctx->ssl); + if (!qc_pkt_decrypt(qc, qel, pkt)) { + /* Drop the packet */ + TRACE_ERROR("packet decryption failed -> dropped", + QUIC_EV_CONN_RXPKT, qc, pkt); + } + else { + if (!qc_parse_pkt_frms(qc, pkt, qel)) { + /* Drop the packet */ + TRACE_ERROR("packet parsing failed -> dropped", + QUIC_EV_CONN_RXPKT, qc, pkt); + qc->cntrs.dropped_parsing++; + } + else { + struct quic_arng ar = { .first = pkt->pn, .last = pkt->pn }; + + /* RFC 9000 8.1. Address Validation during Connection Establishment + * + * Connection establishment implicitly provides address validation for + * both endpoints. In particular, receipt of a packet protected with + * Handshake keys confirms that the peer successfully processed an + * Initial packet. + */ + if (qel == qc->hel && + !(qc->flags & QUIC_FL_CONN_PEER_VALIDATED_ADDR)) { + TRACE_STATE("validate peer address on handshake packet", + QUIC_EV_CONN_RXPKT, qc, pkt); + qc->flags |= QUIC_FL_CONN_PEER_VALIDATED_ADDR; + BUG_ON(!qc->prx_counters->half_open_conn); + HA_ATOMIC_DEC(&qc->prx_counters->half_open_conn); + } + + /* Update the list of ranges to acknowledge. */ + if (quic_update_ack_ranges_list(qc, &qel->pktns->rx.arngs, &ar)) { + if (pkt->flags & QUIC_FL_RX_PACKET_ACK_ELICITING) { + int arm_ack_timer = + qc->state >= QUIC_HS_ST_COMPLETE && + qel->pktns == qc->apktns; + + qel->pktns->flags |= QUIC_FL_PKTNS_ACK_REQUIRED; + qel->pktns->rx.nb_aepkts_since_last_ack++; + qc_idle_timer_rearm(qc, 1, arm_ack_timer); + } + + if (pkt->pn > largest_pn) { + largest_pn = pkt->pn; + largest_pn_time_received = pkt->time_received; + } + } + else { + TRACE_ERROR("Could not update ack range list", + QUIC_EV_CONN_RXPKT, qc); + } + } + } + node = eb64_next(node); + eb64_delete(&pkt->pn_node); + quic_rx_packet_refdec(pkt); + } + + if (largest_pn != -1 && largest_pn > qel->pktns->rx.largest_pn) { + /* Update the largest packet number. */ + qel->pktns->rx.largest_pn = largest_pn; + /* Update the largest acknowledged packet timestamps */ + qel->pktns->rx.largest_time_received = largest_pn_time_received; + qel->pktns->flags |= QUIC_FL_PKTNS_NEW_LARGEST_PN; + } + + if (qel->cstream) { + struct ncbuf *ncbuf = &qel->cstream->rx.ncbuf; + + if (!ncb_is_null(ncbuf) && ncb_data(ncbuf, 0)) { + /* Some in order CRYPTO data were bufferized. */ + HA_ATOMIC_OR(&qc->wait_event.tasklet->state, TASK_HEAVY); + } + } + + /* Release the Initial encryption level and packet number space. */ + if ((qc->flags & QUIC_FL_CONN_IPKTNS_DCD) && qel == qc->iel) { + qc_enc_level_free(qc, &qc->iel); + quic_pktns_release(qc, &qc->ipktns); + } + + largest_pn = -1; + } + + out: + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_RXPKT, qc); + return ret; +} + +/* Parse into <pkt> a long header located at <*pos> position, <end> begin a pointer to the end + * past one byte of this buffer. 
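+ * Only the connection ID section of the long header is parsed here, the
+ * first byte and the 32-bit version field having already been consumed by
+ * the caller:
+ *
+ *   DCID Length (1 byte) | DCID (0..QUIC_CID_MAXLEN bytes)
+ *   SCID Length (1 byte) | SCID (0..QUIC_CID_MAXLEN bytes)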
+ */ +static inline int quic_packet_read_long_header(unsigned char **pos, const unsigned char *end, + struct quic_rx_packet *pkt) +{ + int ret = 0; + unsigned char dcid_len, scid_len; + + TRACE_ENTER(QUIC_EV_CONN_RXPKT); + + if (end == *pos) { + TRACE_ERROR("buffer data consumed", QUIC_EV_CONN_RXPKT); + goto leave; + } + + /* Destination Connection ID Length */ + dcid_len = *(*pos)++; + /* We want to be sure we can read <dcid_len> bytes and one more for <scid_len> value */ + if (dcid_len > QUIC_CID_MAXLEN || end - *pos < dcid_len + 1) { + TRACE_ERROR("too long DCID", QUIC_EV_CONN_RXPKT); + goto leave; + } + + if (dcid_len) { + /* Check that the length of this received DCID matches the CID lengths + * of our implementation for non Initials packets only. + */ + if (pkt->version && pkt->version->num && + pkt->type != QUIC_PACKET_TYPE_INITIAL && + pkt->type != QUIC_PACKET_TYPE_0RTT && + dcid_len != QUIC_HAP_CID_LEN) { + TRACE_ERROR("wrong DCID length", QUIC_EV_CONN_RXPKT); + goto leave; + } + + memcpy(pkt->dcid.data, *pos, dcid_len); + } + + pkt->dcid.len = dcid_len; + *pos += dcid_len; + + /* Source Connection ID Length */ + scid_len = *(*pos)++; + if (scid_len > QUIC_CID_MAXLEN || end - *pos < scid_len) { + TRACE_ERROR("too long SCID", QUIC_EV_CONN_RXPKT); + goto leave; + } + + if (scid_len) + memcpy(pkt->scid.data, *pos, scid_len); + pkt->scid.len = scid_len; + *pos += scid_len; + + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_RXPKT); + return ret; +} + +/* Try to remove the header protection of <pkt> QUIC packet with <beg> the + * address of the packet first byte, using the keys from encryption level <el>. + * + * If header protection has been successfully removed, packet data are copied + * into <qc> Rx buffer. If <el> secrets are not yet available, the copy is also + * proceeded, and the packet is inserted into <qc> protected packets tree. In + * both cases, packet can now be considered handled by the <qc> connection. + * + * If header protection cannot be removed due to <el> secrets already + * discarded, no operation is conducted. + * + * Returns 1 on success : packet data is now handled by the connection. On + * error 0 is returned : packet should be dropped by the caller. + */ +static int qc_try_rm_hp(struct quic_conn *qc, struct quic_rx_packet *pkt, + unsigned char *beg, struct quic_enc_level **el) +{ + int ret = 0; + unsigned char *pn = NULL; /* Packet number field */ + enum quic_tls_enc_level tel; + struct quic_enc_level *qel; + /* Only for traces. */ + + TRACE_ENTER(QUIC_EV_CONN_TRMHP, qc); + BUG_ON(!pkt->pn_offset); + + /* The packet number is here. This is also the start minus + * QUIC_PACKET_PN_MAXLEN of the sample used to add/remove the header + * protection. + */ + pn = beg + pkt->pn_offset; + + tel = quic_packet_type_enc_level(pkt->type); + qel = qc_quic_enc_level(qc, tel); + if (!qel) { + struct quic_enc_level **qc_qel = qel_to_qel_addr(qc, tel); + struct quic_pktns **qc_pktns = qel_to_quic_pktns(qc, tel); + + if (!qc_enc_level_alloc(qc, qc_pktns, qc_qel, quic_to_ssl_enc_level(tel))) { + TRACE_PROTO("Could not allocated an encryption level", QUIC_EV_CONN_ADDDATA, qc); + goto out; + } + + qel = *qc_qel; + } + + if (qc_qel_may_rm_hp(qc, qel)) { + struct quic_tls_ctx *tls_ctx = + qc_select_tls_ctx(qc, qel, pkt->type, pkt->version); + + /* Note that the following function enables us to unprotect the packet + * number and its length subsequently used to decrypt the entire + * packets. 
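+	 * As per RFC 9001 5.4.2, the ciphertext sample used to compute the
+	 * header protection mask is assumed to start 4 bytes
+	 * (QUIC_PACKET_PN_MAXLEN) past the first byte of the Packet Number
+	 * field, which is why the sample can be located before the real
+	 * packet number length is known.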
+ */ + if (!qc_do_rm_hp(qc, pkt, tls_ctx, + qel->pktns->rx.largest_pn, pn, beg)) { + TRACE_PROTO("hp error", QUIC_EV_CONN_TRMHP, qc); + goto out; + } + + qc_handle_spin_bit(qc, pkt, qel); + /* The AAD includes the packet number field. */ + pkt->aad_len = pkt->pn_offset + pkt->pnl; + if (pkt->len - pkt->aad_len < QUIC_TLS_TAG_LEN) { + TRACE_PROTO("Too short packet", QUIC_EV_CONN_TRMHP, qc); + goto out; + } + + TRACE_PROTO("RX hp removed", QUIC_EV_CONN_TRMHP, qc, pkt); + } + else { + TRACE_PROTO("RX hp not removed", QUIC_EV_CONN_TRMHP, qc, pkt); + LIST_APPEND(&qel->rx.pqpkts, &pkt->list); + quic_rx_packet_refinc(pkt); + } + + *el = qel; + /* No reference counter incrementation here!!! */ + LIST_APPEND(&qc->rx.pkt_list, &pkt->qc_rx_pkt_list); + memcpy(b_tail(&qc->rx.buf), beg, pkt->len); + pkt->data = (unsigned char *)b_tail(&qc->rx.buf); + b_add(&qc->rx.buf, pkt->len); + + ret = 1; + out: + TRACE_LEAVE(QUIC_EV_CONN_TRMHP, qc); + return ret; +} + +/* Return a 32-bits integer in <val> from QUIC packet with <buf> as address. + * Makes <buf> point to the data after this 32-bits value if succeeded. + * Note that these 32-bits integers are network bytes ordered. + * Returns 0 if failed (not enough data in the buffer), 1 if succeeded. + */ +static inline int quic_read_uint32(uint32_t *val, + const unsigned char **buf, + const unsigned char *end) +{ + if (end - *buf < sizeof *val) + return 0; + + *val = ntohl(*(uint32_t *)*buf); + *buf += sizeof *val; + + return 1; +} + +/* Parse a QUIC packet header starting at <pos> position without exceeding <end>. + * Version and type are stored in <pkt> packet instance. Type is set to unknown + * on two occasions : for unsupported version, in this case version field is + * set to NULL; for Version Negotiation packet with version number set to 0. + * + * Returns 1 on success else 0. + */ +int qc_parse_hd_form(struct quic_rx_packet *pkt, + unsigned char **pos, const unsigned char *end) +{ + uint32_t version; + int ret = 0; + const unsigned char byte0 = **pos; + + TRACE_ENTER(QUIC_EV_CONN_RXPKT); + pkt->version = NULL; + pkt->type = QUIC_PACKET_TYPE_UNKNOWN; + + (*pos)++; + if (byte0 & QUIC_PACKET_LONG_HEADER_BIT) { + unsigned char type = + (byte0 >> QUIC_PACKET_TYPE_SHIFT) & QUIC_PACKET_TYPE_BITMASK; + + /* Version */ + if (!quic_read_uint32(&version, (const unsigned char **)pos, end)) { + TRACE_ERROR("could not read the packet version", QUIC_EV_CONN_RXPKT); + goto out; + } + + pkt->version = qc_supported_version(version); + if (version && pkt->version) { + if (version != QUIC_PROTOCOL_VERSION_2) { + pkt->type = type; + } + else { + switch (type) { + case 0: + pkt->type = QUIC_PACKET_TYPE_RETRY; + break; + case 1: + pkt->type = QUIC_PACKET_TYPE_INITIAL; + break; + case 2: + pkt->type = QUIC_PACKET_TYPE_0RTT; + break; + case 3: + pkt->type = QUIC_PACKET_TYPE_HANDSHAKE; + break; + } + } + } + } + else { + if (byte0 & QUIC_PACKET_SPIN_BIT) + pkt->flags |= QUIC_FL_RX_PACKET_SPIN_BIT; + pkt->type = QUIC_PACKET_TYPE_SHORT; + } + + ret = 1; + out: + TRACE_LEAVE(QUIC_EV_CONN_RXPKT); + return ret; +} + +/* Check that all the bytes between <pos> included and <end> address + * excluded are null. This is the responsibility of the caller to + * check that there is at least one byte between <pos> end <end>. + * Return 1 if this all the bytes are null, 0 if not. 
+ */
+static inline int quic_padding_check(const unsigned char *pos,
+                                     const unsigned char *end)
+{
+	while (pos < end && !*pos)
+		pos++;
+
+	return pos == end;
+}
+
+/* Find the connection associated with the packet <pkt> or create a new one if
+ * this is an Initial packet. <dgram> is the datagram containing the packet and
+ * <l> is the listener instance on which it was received.
+ *
+ * By default, <new_tid> is set to -1. However, if thread affinity has been
+ * changed, it will be set to its new thread ID.
+ *
+ * Returns the quic-conn instance or NULL if not found or thread affinity
+ * changed.
+ */
+static struct quic_conn *quic_rx_pkt_retrieve_conn(struct quic_rx_packet *pkt,
+                                                   struct quic_dgram *dgram,
+                                                   struct listener *l,
+                                                   int *new_tid)
+{
+	struct quic_cid token_odcid = { .len = 0 };
+	struct quic_conn *qc = NULL;
+	struct proxy *prx;
+	struct quic_counters *prx_counters;
+
+	TRACE_ENTER(QUIC_EV_CONN_LPKT);
+
+	*new_tid = -1;
+
+	prx = l->bind_conf->frontend;
+	prx_counters = EXTRA_COUNTERS_GET(prx->extra_counters_fe, &quic_stats_module);
+
+	qc = retrieve_qc_conn_from_cid(pkt, &dgram->saddr, new_tid);
+
+	/* The connection may already have been created or rebound on another thread. */
+	if (!qc && *new_tid != -1 && tid != *new_tid)
+		goto out;
+
+	if (pkt->type == QUIC_PACKET_TYPE_INITIAL) {
+		BUG_ON(!pkt->version); /* This must not happen. */
+
+		if (!qc) {
+			struct quic_cid_tree *tree;
+			struct ebmb_node *node;
+			struct quic_connection_id *conn_id;
+			int ipv4;
+
+			/* Reject INITIAL early if listener limits reached. */
+			if (unlikely(HA_ATOMIC_LOAD(&l->rx.quic_curr_handshake) >=
+			             quic_listener_max_handshake(l))) {
+				TRACE_DATA("Drop INITIAL on max handshake",
+				           QUIC_EV_CONN_LPKT, NULL, NULL, NULL, pkt->version);
+				goto out;
+			}
+
+			if (unlikely(HA_ATOMIC_LOAD(&l->rx.quic_curr_accept) >=
+			             quic_listener_max_accept(l))) {
+				TRACE_DATA("Drop INITIAL on max accept",
+				           QUIC_EV_CONN_LPKT, NULL, NULL, NULL, pkt->version);
+				goto out;
+			}
+
+			if (pkt->token_len) {
+				/* Validate the token only when connection is unknown. */
+				if (!quic_retry_token_check(pkt, dgram, l, qc, &token_odcid))
+					goto err;
+			}
+			else if (!(l->bind_conf->options & BC_O_QUIC_FORCE_RETRY) &&
+			         HA_ATOMIC_LOAD(&prx_counters->half_open_conn) >= global.tune.quic_retry_threshold) {
+				TRACE_PROTO("Initial without token, sending retry",
+				            QUIC_EV_CONN_LPKT, NULL, NULL, NULL, pkt->version);
+				if (send_retry(l->rx.fd, &dgram->saddr, pkt, pkt->version)) {
+					TRACE_ERROR("Error during Retry generation",
+					            QUIC_EV_CONN_LPKT, NULL, NULL, NULL, pkt->version);
+					goto out;
+				}
+
+				HA_ATOMIC_INC(&prx_counters->retry_sent);
+				goto out;
+			}
+
+			/* RFC 9000 7.2. Negotiating Connection IDs:
+			 * When an Initial packet is sent by a client that has not previously
+			 * received an Initial or Retry packet from the server, the client
+			 * populates the Destination Connection ID field with an unpredictable
+			 * value. This Destination Connection ID MUST be at least 8 bytes in length.
+			 */
+			if (pkt->dcid.len < QUIC_ODCID_MINLEN) {
+				TRACE_PROTO("dropped packet",
+				            QUIC_EV_CONN_LPKT, NULL, NULL, NULL, pkt->version);
+				goto err;
+			}
+
+			pkt->saddr = dgram->saddr;
+			ipv4 = dgram->saddr.ss_family == AF_INET;
+
+			/* Generate the first connection CID. This is derived from the client
+			 * ODCID and address. This allows the connection to be retrieved from
+			 * the ODCID without storing it in the CID tree. This is an interesting
+			 * optimization as the client is expected to stop using its ODCID in
+			 * favor of our generated value.
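+			 *
+			 * (The derivation is performed by new_quic_cid() below from the
+			 * client ODCID and address, so a retransmitted Initial carrying
+			 * the same ODCID maps back to this connection without any
+			 * additional CID tree entry.)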
+ */ + conn_id = new_quic_cid(NULL, NULL, &pkt->dcid, &pkt->saddr); + if (!conn_id) + goto err; + + qc = qc_new_conn(pkt->version, ipv4, &pkt->dcid, &pkt->scid, &token_odcid, + conn_id, &dgram->daddr, &pkt->saddr, 1, + !!pkt->token_len, l); + if (qc == NULL) { + pool_free(pool_head_quic_connection_id, conn_id); + goto err; + } + + /* Compute and store into the quic_conn the hash used to compute extra CIDs */ + if (quic_hash64_from_cid) + qc->hash64 = quic_hash64_from_cid(conn_id->cid.data, conn_id->cid.len, + global.cluster_secret, sizeof(global.cluster_secret)); + + tree = &quic_cid_trees[quic_cid_tree_idx(&conn_id->cid)]; + HA_RWLOCK_WRLOCK(QC_CID_LOCK, &tree->lock); + node = ebmb_insert(&tree->root, &conn_id->node, conn_id->cid.len); + if (node != &conn_id->node) { + pool_free(pool_head_quic_connection_id, conn_id); + + conn_id = ebmb_entry(node, struct quic_connection_id, node); + *new_tid = HA_ATOMIC_LOAD(&conn_id->tid); + quic_conn_release(qc); + qc = NULL; + } + else { + /* From here, <qc> is the correct connection for this <pkt> Initial + * packet. <conn_id> must be inserted in the CIDs tree for this + * connection. + */ + eb64_insert(qc->cids, &conn_id->seq_num); + /* Initialize the next CID sequence number to be used for this connection. */ + qc->next_cid_seq_num = 1; + } + HA_RWLOCK_WRUNLOCK(QC_CID_LOCK, &tree->lock); + + if (*new_tid != -1) + goto out; + } + } + else if (!qc) { + TRACE_PROTO("RX non Initial pkt without connection", QUIC_EV_CONN_LPKT, NULL, NULL, NULL, pkt->version); + if (!send_stateless_reset(l, &dgram->saddr, pkt)) + TRACE_ERROR("stateless reset not sent", QUIC_EV_CONN_LPKT, qc); + goto err; + } + + out: + TRACE_LEAVE(QUIC_EV_CONN_LPKT, qc); + return qc; + + err: + HA_ATOMIC_INC(&prx_counters->dropped_pkt); + + TRACE_LEAVE(QUIC_EV_CONN_LPKT); + return NULL; +} + +/* Parse a QUIC packet starting at <pos>. Data won't be read after <end> even + * if the packet is incomplete. This function will populate fields of <pkt> + * instance, most notably its length. <dgram> is the UDP datagram which + * contains the parsed packet. <l> is the listener instance on which it was + * received. + * + * Returns 0 on success else non-zero. Packet length is guaranteed to be set to + * the real packet value or to cover all data between <pos> and <end> : this is + * useful to reject a whole datagram. + */ +static int quic_rx_pkt_parse(struct quic_rx_packet *pkt, + unsigned char *pos, const unsigned char *end, + struct quic_dgram *dgram, struct listener *l) +{ + const unsigned char *beg = pos; + struct proxy *prx; + struct quic_counters *prx_counters; + + TRACE_ENTER(QUIC_EV_CONN_LPKT); + + prx = l->bind_conf->frontend; + prx_counters = EXTRA_COUNTERS_GET(prx->extra_counters_fe, &quic_stats_module); + + if (end <= pos) { + TRACE_PROTO("Packet dropped", QUIC_EV_CONN_LPKT); + goto drop; + } + + /* Fixed bit */ + if (!(*pos & QUIC_PACKET_FIXED_BIT)) { + if (!(pkt->flags & QUIC_FL_RX_PACKET_DGRAM_FIRST) && + quic_padding_check(pos, end)) { + /* Some browsers may pad the remaining datagram space with null bytes. + * That is what we called add padding out of QUIC packets. Such + * datagrams must be considered as valid. But we can only consume + * the remaining space. 
+ */ + pkt->len = end - pos; + goto drop_silent; + } + + TRACE_PROTO("Packet dropped", QUIC_EV_CONN_LPKT); + goto drop; + } + + /* Header form */ + if (!qc_parse_hd_form(pkt, &pos, end)) { + TRACE_PROTO("Packet dropped", QUIC_EV_CONN_LPKT); + goto drop; + } + + if (pkt->type != QUIC_PACKET_TYPE_SHORT) { + uint64_t len; + TRACE_PROTO("long header packet received", QUIC_EV_CONN_LPKT); + + if (!quic_packet_read_long_header(&pos, end, pkt)) { + TRACE_PROTO("Packet dropped", QUIC_EV_CONN_LPKT); + goto drop; + } + + /* When multiple QUIC packets are coalesced on the same UDP datagram, + * they must have the same DCID. + */ + if (!(pkt->flags & QUIC_FL_RX_PACKET_DGRAM_FIRST) && + (pkt->dcid.len != dgram->dcid_len || + memcmp(dgram->dcid, pkt->dcid.data, pkt->dcid.len))) { + TRACE_PROTO("Packet dropped", QUIC_EV_CONN_LPKT); + goto drop; + } + + /* Retry of Version Negotiation packets are only sent by servers */ + if (pkt->type == QUIC_PACKET_TYPE_RETRY || + (pkt->version && !pkt->version->num)) { + TRACE_PROTO("Packet dropped", QUIC_EV_CONN_LPKT); + goto drop; + } + + /* RFC9000 6. Version Negotiation */ + if (!pkt->version) { + /* unsupported version, send Negotiation packet */ + if (send_version_negotiation(l->rx.fd, &dgram->saddr, pkt)) { + TRACE_ERROR("VN packet not sent", QUIC_EV_CONN_LPKT); + goto drop_silent; + } + + TRACE_PROTO("VN packet sent", QUIC_EV_CONN_LPKT); + goto drop_silent; + } + + /* For Initial packets, and for servers (QUIC clients connections), + * there is no Initial connection IDs storage. + */ + if (pkt->type == QUIC_PACKET_TYPE_INITIAL) { + uint64_t token_len; + + if (!quic_dec_int(&token_len, (const unsigned char **)&pos, end) || + end - pos < token_len) { + TRACE_PROTO("Packet dropped", + QUIC_EV_CONN_LPKT, NULL, NULL, NULL, pkt->version); + goto drop; + } + + /* TODO Retry should be automatically activated if + * suspect network usage is detected. + */ + if (!token_len) { + if (l->bind_conf->options & BC_O_QUIC_FORCE_RETRY) { + TRACE_PROTO("Initial without token, sending retry", + QUIC_EV_CONN_LPKT, NULL, NULL, NULL, pkt->version); + if (send_retry(l->rx.fd, &dgram->saddr, pkt, pkt->version)) { + TRACE_PROTO("Error during Retry generation", + QUIC_EV_CONN_LPKT, NULL, NULL, NULL, pkt->version); + goto drop_silent; + } + + HA_ATOMIC_INC(&prx_counters->retry_sent); + goto drop_silent; + } + } + + pkt->token = pos; + pkt->token_len = token_len; + pos += pkt->token_len; + } + else if (pkt->type != QUIC_PACKET_TYPE_0RTT) { + if (pkt->dcid.len != QUIC_HAP_CID_LEN) { + TRACE_PROTO("Packet dropped", + QUIC_EV_CONN_LPKT, NULL, NULL, NULL, pkt->version); + goto drop; + } + } + + if (!quic_dec_int(&len, (const unsigned char **)&pos, end) || + end - pos < len) { + TRACE_PROTO("Packet dropped", + QUIC_EV_CONN_LPKT, NULL, NULL, NULL, pkt->version); + goto drop; + } + + /* Packet Number is stored here. Packet Length totalizes the + * rest of the content. + */ + pkt->pn_offset = pos - beg; + pkt->len = pkt->pn_offset + len; + + /* RFC 9000. Initial Datagram Size + * + * A server MUST discard an Initial packet that is carried in a UDP datagram + * with a payload that is smaller than the smallest allowed maximum datagram + * size of 1200 bytes. 
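+		 *
+		 * (Editorial aside, not from the HAProxy sources.) The token and
+		 * payload lengths parsed above with quic_dec_int() are QUIC
+		 * variable-length integers (RFC 9000 §16): the two top bits of the
+		 * first byte encode the field width. A compact standalone decoder
+		 * might look like this (hypothetical name):
+		 *
+		 *     static int varint_dec(const unsigned char **pos,
+		 *                           const unsigned char *end,
+		 *                           unsigned long long *val)
+		 *     {
+		 *         size_t len;
+		 *
+		 *         if (*pos >= end)
+		 *             return 0;
+		 *         len = (size_t)1 << (**pos >> 6); // 1, 2, 4 or 8 bytes
+		 *         if ((size_t)(end - *pos) < len)
+		 *             return 0;
+		 *         *val = **pos & 0x3f;             // 6 low bits of 1st byte
+		 *         while (--len) {
+		 *             (*pos)++;
+		 *             *val = (*val << 8) | **pos;  // big-endian remainder
+		 *         }
+		 *         (*pos)++;
+		 *         return 1;
+		 *     }
+		 *
+		 * e.g. the byte 0x25 decodes to 37, and so does the pair 0x40 0x25.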
+		 */
+		if (pkt->type == QUIC_PACKET_TYPE_INITIAL &&
+		    dgram->len < QUIC_INITIAL_PACKET_MINLEN) {
+			TRACE_PROTO("RX too short datagram with an Initial packet", QUIC_EV_CONN_LPKT);
+			HA_ATOMIC_INC(&prx_counters->too_short_initial_dgram);
+			goto drop;
+		}
+
+		/* Interrupt parsing after packet length retrieval: this
+		 * ensures that only the packet is dropped but not the whole
+		 * datagram.
+		 */
+		if (pkt->type == QUIC_PACKET_TYPE_0RTT && !l->bind_conf->ssl_conf.early_data) {
+			TRACE_PROTO("RX 0-RTT packet not supported", QUIC_EV_CONN_LPKT);
+			goto drop;
+		}
+	}
+	else {
+		TRACE_PROTO("RX short header packet", QUIC_EV_CONN_LPKT);
+		if (end - pos < QUIC_HAP_CID_LEN) {
+			TRACE_PROTO("RX pkt dropped", QUIC_EV_CONN_LPKT);
+			goto drop;
+		}
+
+		memcpy(pkt->dcid.data, pos, QUIC_HAP_CID_LEN);
+		pkt->dcid.len = QUIC_HAP_CID_LEN;
+
+		/* When multiple QUIC packets are coalesced in the same UDP datagram,
+		 * they must have the same DCID.
+		 */
+		if (!(pkt->flags & QUIC_FL_RX_PACKET_DGRAM_FIRST) &&
+		    (pkt->dcid.len != dgram->dcid_len ||
+		     memcmp(dgram->dcid, pkt->dcid.data, pkt->dcid.len))) {
+			TRACE_PROTO("RX pkt dropped", QUIC_EV_CONN_LPKT);
+			goto drop;
+		}
+
+		pos += QUIC_HAP_CID_LEN;
+
+		pkt->pn_offset = pos - beg;
+		/* A short packet is the last one of a UDP datagram. */
+		pkt->len = end - beg;
+	}
+
+	TRACE_PROTO("RX pkt parsed", QUIC_EV_CONN_LPKT, NULL, pkt, NULL, pkt->version);
+	TRACE_LEAVE(QUIC_EV_CONN_LPKT);
+	return 0;
+
+ drop:
+	HA_ATOMIC_INC(&prx_counters->dropped_pkt);
+ drop_silent:
+	if (!pkt->len)
+		pkt->len = end - beg;
+	TRACE_PROTO("RX pkt parsing failed", QUIC_EV_CONN_LPKT, NULL, pkt, NULL, pkt->version);
+	TRACE_LEAVE(QUIC_EV_CONN_LPKT);
+	return -1;
+}
+
+/* Check if the received packet <pkt> should be dropped because <qc> is
+ * already in the closing state. This can be true if a CONNECTION_CLOSE has
+ * already been emitted for this connection.
+ *
+ * Returns false if the connection is not in the closing state, else true. In
+ * the latter case the caller should drop the whole datagram so as not to mess
+ * up the <qc> CONNECTION_CLOSE rate limit counter.
+ */
+static int qc_rx_check_closing(struct quic_conn *qc,
+                               struct quic_rx_packet *pkt)
+{
+	if (!(qc->flags & QUIC_FL_CONN_CLOSING))
+		return 0;
+
+	TRACE_STATE("Closing state connection", QUIC_EV_CONN_LPKT, qc, NULL, NULL, pkt->version);
+
+	/* Check if the CONNECTION_CLOSE re-emission threshold is reached. */
+	if (++qc->nb_pkt_since_cc >= qc->nb_pkt_for_cc) {
+		qc->flags |= QUIC_FL_CONN_IMMEDIATE_CLOSE;
+		qc->nb_pkt_for_cc++;
+		qc->nb_pkt_since_cc = 0;
+	}
+
+	return 1;
+}
+
+/* Release the memory for the RX packets which are no longer referenced
+ * and consume their payloads which have been copied to the RX buffer
+ * for the connection.
+ * Always succeeds.
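+ *
+ * (Editorial aside, not from the HAProxy sources.) The closing-state rate
+ * limiter of qc_rx_check_closing() above boils down to a pair of counters;
+ * a minimal standalone rendition under hypothetical names:
+ *
+ *     struct cc_limit { unsigned since; unsigned threshold; };
+ *
+ *     // returns 1 when one more CONNECTION_CLOSE should be re-emitted
+ *     static int cc_limit_hit(struct cc_limit *l)
+ *     {
+ *         if (++l->since < l->threshold)
+ *             return 0;
+ *         l->threshold++;   // require one more RX packet each round
+ *         l->since = 0;
+ *         return 1;
+ *     }
+ *
+ * so the response rate decreases as the peer keeps sending.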
+ */
+static void quic_rx_pkts_del(struct quic_conn *qc)
+{
+	struct quic_rx_packet *pkt, *pktback;
+
+	list_for_each_entry_safe(pkt, pktback, &qc->rx.pkt_list, qc_rx_pkt_list) {
+		TRACE_PRINTF(TRACE_LEVEL_DEVELOPER, QUIC_EV_CONN_LPKT, qc, 0, 0, 0,
+		             "pkt #%lld(type=%d,len=%llu,rawlen=%llu,refcnt=%u) (diff: %zd)",
+		             (long long)pkt->pn_node.key,
+		             pkt->type, (ull)pkt->len, (ull)pkt->raw_len, pkt->refcnt,
+		             (unsigned char *)b_head(&qc->rx.buf) - pkt->data);
+		if (pkt->data != (unsigned char *)b_head(&qc->rx.buf)) {
+			size_t cdata;
+
+			cdata = b_contig_data(&qc->rx.buf, 0);
+			TRACE_PRINTF(TRACE_LEVEL_DEVELOPER, QUIC_EV_CONN_LPKT, qc, 0, 0, 0,
+			             "cdata=%llu *b_head()=0x%x", (ull)cdata, *b_head(&qc->rx.buf));
+			if (cdata && !*b_head(&qc->rx.buf)) {
+				/* Consume the remaining data */
+				b_del(&qc->rx.buf, cdata);
+			}
+			break;
+		}
+
+		if (pkt->refcnt)
+			break;
+
+		b_del(&qc->rx.buf, pkt->raw_len);
+		LIST_DELETE(&pkt->qc_rx_pkt_list);
+		pool_free(pool_head_quic_rx_packet, pkt);
+	}
+
+	/* Most of the time the buffer will be emptied at this stage. */
+	b_realign_if_empty(&qc->rx.buf);
+}
+
+/* Handle the parsed packet <pkt> for connection <qc>. Data will be copied
+ * into the <qc> receive buffer after the header protection removal procedure.
+ *
+ * <dgram> must be set to the datagram which contains the QUIC packet. <beg>
+ * must point to the first byte of the packet buffer.
+ *
+ * <tasklist_head> may be non-NULL when the caller treats several datagrams for
+ * different quic-conns. In this case, each quic-conn tasklet will be appended
+ * to it in order to be woken up after the current task.
+ *
+ * The caller can then safely release the packet data. If the packet refcount
+ * was not incremented by this function, it means that the connection did not
+ * handle it and it should be freed by the caller.
+ */
+static void qc_rx_pkt_handle(struct quic_conn *qc, struct quic_rx_packet *pkt,
+                             struct quic_dgram *dgram, unsigned char *beg,
+                             struct list **tasklist_head)
+{
+	const struct quic_version *qv = pkt->version;
+	struct quic_enc_level *qel = NULL;
+	size_t b_cspace;
+
+	TRACE_ENTER(QUIC_EV_CONN_LPKT, qc);
+	TRACE_PROTO("RX pkt", QUIC_EV_CONN_LPKT, qc, pkt, NULL, qv);
+
+	if (pkt->flags & QUIC_FL_RX_PACKET_DGRAM_FIRST &&
+	    qc->flags & QUIC_FL_CONN_ANTI_AMPLIFICATION_REACHED) {
+		TRACE_PROTO("PTO timer must be armed after anti-amplification was reached",
+		            QUIC_EV_CONN_LPKT, qc, NULL, NULL, qv);
+		TRACE_DEVEL("needs to wake up the timer task after the amplification limit was reached",
+		            QUIC_EV_CONN_LPKT, qc);
+		/* Reset the anti-amplification bit. It will be set again
+		 * when sending the next packet if reached again.
+		 */
+		qc->flags &= ~QUIC_FL_CONN_ANTI_AMPLIFICATION_REACHED;
+		qc_set_timer(qc);
+		if (qc->timer_task && tick_isset(qc->timer) && tick_is_lt(qc->timer, now_ms))
+			task_wakeup(qc->timer_task, TASK_WOKEN_MSG);
+	}
+
+	/* Drop ASAP any packet whose packet number space has been discarded. */
+	if (quic_tls_pkt_type_pktns_dcd(qc, pkt->type)) {
+		TRACE_PROTO("Discarded packet number space", QUIC_EV_CONN_TRMHP, qc);
+		goto drop_silent;
+	}
+
+	if (qc->flags & QUIC_FL_CONN_IMMEDIATE_CLOSE) {
+		TRACE_PROTO("Connection error",
+		            QUIC_EV_CONN_LPKT, qc, NULL, NULL, qv);
+		goto out;
+	}
+
+	pkt->raw_len = pkt->len;
+	quic_rx_pkts_del(qc);
+	b_cspace = b_contig_space(&qc->rx.buf);
+	if (b_cspace < pkt->len) {
+		TRACE_PRINTF(TRACE_LEVEL_DEVELOPER, QUIC_EV_CONN_LPKT, qc, 0, 0, 0,
+		             "bspace=%llu pkt->len=%llu", (ull)b_cspace, (ull)pkt->len);
+		/* Do not consume the buffer if the free space is not at its end.
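+		 *
+		 * (Editorial aside, not from the HAProxy sources.) "Space at the
+		 * end" can be pictured with a minimal ring layout where data lives
+		 * in [head, head+len) modulo size (hypothetical types; the real
+		 * struct buffer API differs):
+		 *
+		 *     struct sring { size_t size, head, len; };
+		 *
+		 *     static size_t sring_contig_space(const struct sring *b)
+		 *     {
+		 *         size_t tail = (b->head + b->len) % b->size;
+		 *
+		 *         if (b->len == b->size)
+		 *             return 0;               // full
+		 *         if (tail >= b->head)
+		 *             return b->size - tail;  // free space runs to the end
+		 *         return b->head - tail;      // free space bounded by head
+		 *     }
+		 *
+		 * Only when the free space runs to the end can it be zero-filled
+		 * and wrapped as done below; otherwise the packet must be dropped.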
*/
+		if (b_tail(&qc->rx.buf) + b_cspace < b_wrap(&qc->rx.buf)) {
+			TRACE_PROTO("Packet dropped",
+			            QUIC_EV_CONN_LPKT, qc, NULL, NULL, qv);
+			qc->cntrs.dropped_pkt_bufoverrun++;
+			goto drop_silent;
+		}
+
+		/* Let us consume the remaining contiguous space. */
+		if (b_cspace) {
+			b_putchr(&qc->rx.buf, 0x00);
+			b_cspace--;
+		}
+		b_add(&qc->rx.buf, b_cspace);
+		if (b_contig_space(&qc->rx.buf) < pkt->len) {
+			TRACE_PROTO("Too big packet",
+			            QUIC_EV_CONN_LPKT, qc, pkt, &pkt->len, qv);
+			qc->cntrs.dropped_pkt_bufoverrun++;
+			goto drop_silent;
+		}
+	}
+
+	if (!qc_try_rm_hp(qc, pkt, beg, &qel)) {
+		TRACE_PROTO("Packet dropped", QUIC_EV_CONN_LPKT, qc, NULL, NULL, qv);
+		goto drop;
+	}
+
+	TRACE_DATA("New packet", QUIC_EV_CONN_LPKT, qc, pkt, NULL, qv);
+	if (pkt->aad_len) {
+		/* Insert this RX packet into its encryption level tree */
+		pkt->pn_node.key = pkt->pn;
+		quic_rx_packet_refinc(pkt);
+		eb64_insert(&qel->rx.pkts, &pkt->pn_node);
+	}
+ out:
+	*tasklist_head = tasklet_wakeup_after(*tasklist_head,
+	                                      qc->wait_event.tasklet);
+
+ drop_silent:
+	TRACE_PROTO("RX pkt", QUIC_EV_CONN_LPKT, qc ? qc : NULL, pkt, NULL, qv);
+	TRACE_LEAVE(QUIC_EV_CONN_LPKT, qc ? qc : NULL);
+	return;
+
+ drop:
+	qc->cntrs.dropped_pkt++;
+	TRACE_PROTO("packet drop", QUIC_EV_CONN_LPKT, qc, pkt, NULL, qv);
+	TRACE_LEAVE(QUIC_EV_CONN_LPKT, qc);
+}
+
+/* Handle a newly received <dgram>. Parse each QUIC packet it contains and
+ * copy its content to a quic-conn instance. The datagram content can be
+ * released after this function returns.
+ *
+ * If the datagram was received on a quic-conn owned FD, <from_qc> must be set
+ * to the connection instance. <li> is the attached listener. The caller is
+ * responsible for ensuring that the first packet is destined to this
+ * connection by comparing CIDs.
+ *
+ * If the datagram was received on a receiver FD, <from_qc> will be NULL. This
+ * function will thus retrieve the connection from the CID tree or allocate a
+ * new one if possible. <li> is the listener attached to the receiver.
+ *
+ * Returns 0 on success else non-zero. If an error happens, some packets from
+ * the datagram may not have been parsed.
+ */
+int quic_dgram_parse(struct quic_dgram *dgram, struct quic_conn *from_qc,
+                     struct listener *li)
+{
+	struct quic_rx_packet *pkt;
+	struct quic_conn *qc = NULL;
+	unsigned char *pos, *end;
+	struct list *tasklist_head = NULL;
+
+	TRACE_ENTER(QUIC_EV_CONN_LPKT);
+
+	pos = dgram->buf;
+	end = pos + dgram->len;
+	do {
+		pkt = pool_alloc(pool_head_quic_rx_packet);
+		if (!pkt) {
+			TRACE_ERROR("RX packet allocation failed", QUIC_EV_CONN_LPKT);
+			goto err;
+		}
+
+		LIST_INIT(&pkt->qc_rx_pkt_list);
+		pkt->version = NULL;
+		pkt->type = QUIC_PACKET_TYPE_UNKNOWN;
+		pkt->pn_offset = 0;
+		pkt->len = 0;
+		pkt->raw_len = 0;
+		pkt->token = NULL;
+		pkt->token_len = 0;
+		pkt->aad_len = 0;
+		pkt->data = NULL;
+		pkt->pn_node.key = (uint64_t)-1;
+		pkt->refcnt = 0;
+		pkt->flags = 0;
+		pkt->time_received = now_ms;
+
+		/* Set the flag if <pkt> is the first one in the datagram. */
+		if (pos == dgram->buf)
+			pkt->flags |= QUIC_FL_RX_PACKET_DGRAM_FIRST;
+
+		quic_rx_packet_refinc(pkt);
+		if (quic_rx_pkt_parse(pkt, pos, end, dgram, li))
+			goto next;
+
+		/* Search the quic-conn instance for the first packet of the datagram.
+		 * quic_rx_pkt_parse() is responsible for discarding packets
+		 * with a different DCID than the first one in the same datagram.
+		 */
+		if (!qc) {
+			int new_tid = -1;
+
+			qc = from_qc ?
from_qc : quic_rx_pkt_retrieve_conn(pkt, dgram, li, &new_tid); + /* qc is NULL if receiving a non Initial packet for an + * unknown connection or on connection affinity rebind. + */ + if (!qc) { + if (new_tid >= 0) { + MT_LIST_APPEND(&quic_dghdlrs[new_tid].dgrams, + &dgram->handler_list); + tasklet_wakeup(quic_dghdlrs[new_tid].task); + pool_free(pool_head_quic_rx_packet, pkt); + goto out; + } + + /* Skip the entire datagram. */ + pkt->len = end - pos; + goto next; + } + + dgram->qc = qc; + } + + /* Ensure thread connection migration is finalized ASAP. */ + if (qc->flags & QUIC_FL_CONN_AFFINITY_CHANGED) + qc_finalize_affinity_rebind(qc); + + if (qc_rx_check_closing(qc, pkt)) { + /* Skip the entire datagram. */ + pkt->len = end - pos; + goto next; + } + + /* Detect QUIC connection migration. */ + if (ipcmp(&qc->peer_addr, &dgram->saddr, 1)) { + if (qc_handle_conn_migration(qc, &dgram->saddr, &dgram->daddr)) { + /* Skip the entire datagram. */ + TRACE_ERROR("error during connection migration, datagram dropped", QUIC_EV_CONN_LPKT, qc); + pkt->len = end - pos; + goto next; + } + } + + qc_rx_pkt_handle(qc, pkt, dgram, pos, &tasklist_head); + + next: + pos += pkt->len; + quic_rx_packet_refdec(pkt); + + /* Free rejected packets */ + if (!pkt->refcnt) { + BUG_ON(LIST_INLIST(&pkt->qc_rx_pkt_list)); + pool_free(pool_head_quic_rx_packet, pkt); + } + } while (pos < end); + + /* Increasing the received bytes counter by the UDP datagram length + * if this datagram could be associated to a connection. + */ + if (dgram->qc) + dgram->qc->bytes.rx += dgram->len; + + /* This must never happen. */ + BUG_ON(pos > end); + BUG_ON(pos < end || pos > dgram->buf + dgram->len); + /* Mark this datagram as consumed */ + HA_ATOMIC_STORE(&dgram->buf, NULL); + + out: + TRACE_LEAVE(QUIC_EV_CONN_LPKT); + return 0; + + err: + /* Mark this datagram as consumed as maybe at least some packets were parsed. */ + HA_ATOMIC_STORE(&dgram->buf, NULL); + TRACE_LEAVE(QUIC_EV_CONN_LPKT); + return -1; +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/quic_sock.c b/src/quic_sock.c new file mode 100644 index 0000000..c479249 --- /dev/null +++ b/src/quic_sock.c @@ -0,0 +1,1080 @@ +/* + * QUIC socket management. + * + * Copyright 2020 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#define _GNU_SOURCE /* required for struct in6_pktinfo */ +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +#include <netinet/in.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include <haproxy/api.h> +#include <haproxy/buf.h> +#include <haproxy/connection.h> +#include <haproxy/dynbuf.h> +#include <haproxy/fd.h> +#include <haproxy/global-t.h> +#include <haproxy/list.h> +#include <haproxy/listener.h> +#include <haproxy/log.h> +#include <haproxy/pool.h> +#include <haproxy/proto_quic.h> +#include <haproxy/proxy-t.h> +#include <haproxy/quic_cid.h> +#include <haproxy/quic_conn.h> +#include <haproxy/quic_rx.h> +#include <haproxy/quic_sock.h> +#include <haproxy/quic_tp-t.h> +#include <haproxy/quic_trace.h> +#include <haproxy/session.h> +#include <haproxy/stats-t.h> +#include <haproxy/task.h> +#include <haproxy/trace.h> +#include <haproxy/tools.h> +#include <haproxy/trace.h> + +/* Log only first EACCES bind() error runtime occurrence. */ +static volatile char quic_bind_eacces_warn = 0; + +/* Retrieve a connection's source address. Returns -1 on failure. */ +int quic_sock_get_src(struct connection *conn, struct sockaddr *addr, socklen_t len) +{ + struct quic_conn *qc; + + if (!conn || !conn->handle.qc) + return -1; + + qc = conn->handle.qc; + if (conn_is_back(conn)) { + /* no source address defined for outgoing connections for now */ + return -1; + } else { + /* front connection, return the peer's address */ + if (len > sizeof(qc->peer_addr)) + len = sizeof(qc->peer_addr); + memcpy(addr, &qc->peer_addr, len); + return 0; + } +} + +/* Retrieve a connection's destination address. Returns -1 on failure. */ +int quic_sock_get_dst(struct connection *conn, struct sockaddr *addr, socklen_t len) +{ + struct quic_conn *qc; + + if (!conn || !conn->handle.qc) + return -1; + + qc = conn->handle.qc; + if (conn_is_back(conn)) { + /* back connection, return the peer's address */ + if (len > sizeof(qc->peer_addr)) + len = sizeof(qc->peer_addr); + memcpy(addr, &qc->peer_addr, len); + } else { + struct sockaddr_storage *from; + + /* Return listener address if IP_PKTINFO or friends are not + * supported by the socket. + */ + BUG_ON(!qc->li); + from = is_addr(&qc->local_addr) ? &qc->local_addr : + &qc->li->rx.addr; + if (len > sizeof(*from)) + len = sizeof(*from); + memcpy(addr, from, len); + } + return 0; +} + +/* + * Inspired from session_accept_fd(). + * Instantiate a new connection (connection struct) to be attached to <qc> + * QUIC connection of <l> listener. + * Returns 1 if succeeded, 0 if not. + */ +static int new_quic_cli_conn(struct quic_conn *qc, struct listener *l, + struct sockaddr_storage *saddr) +{ + struct connection *cli_conn; + + if (unlikely((cli_conn = conn_new(&l->obj_type)) == NULL)) + goto out; + + if (!sockaddr_alloc(&cli_conn->src, saddr, sizeof *saddr)) + goto out_free_conn; + + cli_conn->flags |= CO_FL_FDLESS; + qc->conn = cli_conn; + cli_conn->handle.qc = qc; + + cli_conn->target = &l->obj_type; + + return 1; + + out_free_conn: + qc->conn = NULL; + conn_stop_tracking(cli_conn); + conn_xprt_close(cli_conn); + conn_free(cli_conn); + out: + + return 0; +} + +/* Tests if the receiver supports accepting connections. Returns positive on + * success, 0 if not possible + */ +int quic_sock_accepting_conn(const struct receiver *rx) +{ + return 1; +} + +/* Accept an incoming connection from listener <l>, and return it, as well as + * a CO_AC_* status code into <status> if not null. Null is returned on error. + * <l> must be a valid listener with a valid frontend. 
+ */
+struct connection *quic_sock_accept_conn(struct listener *l, int *status)
+{
+	struct quic_conn *qc;
+	struct li_per_thread *lthr = &l->per_thr[ti->ltid];
+
+	qc = MT_LIST_POP(&lthr->quic_accept.conns, struct quic_conn *, accept_list);
+	if (!qc || qc->flags & (QUIC_FL_CONN_CLOSING|QUIC_FL_CONN_DRAINING))
+		goto done;
+
+	if (!new_quic_cli_conn(qc, l, &qc->peer_addr))
+		goto err;
+
+ done:
+	*status = CO_AC_DONE;
+
+	if (qc) {
+		BUG_ON(l->rx.quic_curr_accept <= 0);
+		HA_ATOMIC_DEC(&l->rx.quic_curr_accept);
+		return qc->conn;
+	}
+	else {
+		return NULL;
+	}
+
+ err:
+	/* In case of error, reinsert the element to process it later. */
+	MT_LIST_INSERT(&lthr->quic_accept.conns, &qc->accept_list);
+
+	*status = CO_AC_PAUSE;
+	return NULL;
+}
+
+/* QUIC datagrams handler task. */
+struct task *quic_lstnr_dghdlr(struct task *t, void *ctx, unsigned int state)
+{
+	struct quic_dghdlr *dghdlr = ctx;
+	struct quic_dgram *dgram;
+	int max_dgrams = global.tune.maxpollevents;
+
+	TRACE_ENTER(QUIC_EV_CONN_LPKT);
+
+	while ((dgram = MT_LIST_POP(&dghdlr->dgrams, typeof(dgram), handler_list))) {
+		if (quic_dgram_parse(dgram, NULL, dgram->owner)) {
+			/* TODO should we requeue the datagram ? */
+			break;
+		}
+
+		if (--max_dgrams <= 0)
+			goto stop_here;
+	}
+
+	TRACE_LEAVE(QUIC_EV_CONN_LPKT);
+	return t;
+
+ stop_here:
+	/* too much work done at once, come back here later */
+	if (!MT_LIST_ISEMPTY(&dghdlr->dgrams))
+		tasklet_wakeup((struct tasklet *)t);
+
+	TRACE_LEAVE(QUIC_EV_CONN_LPKT);
+	return t;
+}
+
+/* Retrieve the DCID from a QUIC datagram or packet at <pos> position,
+ * <end> being one byte past the end of this datagram.
+ * Returns 1 if succeeded, 0 if not.
+ */
+static int quic_get_dgram_dcid(unsigned char *pos, const unsigned char *end,
+                               unsigned char **dcid, size_t *dcid_len)
+{
+	int ret = 0, long_header;
+	size_t minlen, skip;
+
+	TRACE_ENTER(QUIC_EV_CONN_RXPKT);
+
+	if (!(*pos & QUIC_PACKET_FIXED_BIT)) {
+		TRACE_PROTO("fixed bit not set", QUIC_EV_CONN_RXPKT);
+		goto err;
+	}
+
+	long_header = *pos & QUIC_PACKET_LONG_HEADER_BIT;
+	minlen = long_header ? QUIC_LONG_PACKET_MINLEN :
+		QUIC_SHORT_PACKET_MINLEN + QUIC_HAP_CID_LEN + QUIC_TLS_TAG_LEN;
+	skip = long_header ? QUIC_LONG_PACKET_DCID_OFF : QUIC_SHORT_PACKET_DCID_OFF;
+	if (end - pos < minlen)
+		goto err;
+
+	pos += skip;
+	*dcid_len = long_header ? *pos++ : QUIC_HAP_CID_LEN;
+	if (*dcid_len > QUIC_CID_MAXLEN || end - pos <= *dcid_len)
+		goto err;
+
+	*dcid = pos;
+
+	ret = 1;
+ leave:
+	TRACE_LEAVE(QUIC_EV_CONN_RXPKT);
+	return ret;
+
+ err:
+	TRACE_PROTO("wrong datagram", QUIC_EV_CONN_RXPKT);
+	goto leave;
+}
+
+
+/* Retrieve the DCID from the datagram found at <pos> position and deliver it
+ * to the correct datagram handler.
+ * Returns 1 if a correct datagram could be found, 0 if not.
+ */
+static int quic_lstnr_dgram_dispatch(unsigned char *pos, size_t len, void *owner,
+                                     struct sockaddr_storage *saddr,
+                                     struct sockaddr_storage *daddr,
+                                     struct quic_dgram *new_dgram, struct list *dgrams)
+{
+	struct quic_dgram *dgram;
+	unsigned char *dcid;
+	size_t dcid_len;
+	int cid_tid;
+
+	if (!len || !quic_get_dgram_dcid(pos, pos + len, &dcid, &dcid_len))
+		goto err;
+
+	dgram = new_dgram ? new_dgram : pool_alloc(pool_head_quic_dgram);
+	if (!dgram)
+		goto err;
+
+	if ((cid_tid = quic_get_cid_tid(dcid, dcid_len, saddr, pos, len)) < 0) {
+		/* Use the current thread if the CID was not found. If a client
+		 * opens a connection with multiple packets, it is possible that
+		 * several threads will deal with datagrams sharing the same
+		 * CID. For this reason, the CID tree insertion will be
+		 * conducted as an atomic operation and the datagram will
+		 * ultimately be redispatched by the late thread.
+		 */
+		cid_tid = tid;
+	}
+
+	/* All the members must be initialized! */
+	dgram->owner = owner;
+	dgram->buf = pos;
+	dgram->len = len;
+	dgram->dcid = dcid;
+	dgram->dcid_len = dcid_len;
+	dgram->saddr = *saddr;
+	dgram->daddr = *daddr;
+	dgram->qc = NULL;
+
+	/* Attach the datagram to its quic_receiver_buf and quic_dghdlrs. */
+	LIST_APPEND(dgrams, &dgram->recv_list);
+	MT_LIST_APPEND(&quic_dghdlrs[cid_tid].dgrams, &dgram->handler_list);
+
+	/* typically quic_lstnr_dghdlr() */
+	tasklet_wakeup(quic_dghdlrs[cid_tid].task);
+
+	return 1;
+
+ err:
+	pool_free(pool_head_quic_dgram, new_dgram);
+	return 0;
+}
+
+/* This function is responsible for removing unused datagrams attached at the
+ * front of <buf>. Each instance is freed until a not-yet-consumed datagram is
+ * found or the end of the list is hit. The last unused datagram found is not
+ * freed and is instead returned so that the caller can reuse it if needed.
+ *
+ * Returns the last unused datagram or NULL if none was found.
+ */
+static struct quic_dgram *quic_rxbuf_purge_dgrams(struct quic_receiver_buf *rbuf)
+{
+	struct quic_dgram *cur, *prev = NULL;
+
+	while (!LIST_ISEMPTY(&rbuf->dgram_list)) {
+		cur = LIST_ELEM(rbuf->dgram_list.n, struct quic_dgram *, recv_list);
+
+		/* Loop until a not yet consumed datagram is found. */
+		if (HA_ATOMIC_LOAD(&cur->buf))
+			break;
+
+		/* Clear the buffer of the current unused datagram. */
+		LIST_DELETE(&cur->recv_list);
+		b_del(&rbuf->buf, cur->len);
+
+		/* Free the previously found unused datagram. */
+		pool_free(pool_head_quic_dgram, prev);
+		prev = cur;
+	}
+
+	/* Return the last unused datagram found. */
+	return prev;
+}
+
+/* Receive data from the datagram socket <fd>. Data is placed in the <out>
+ * buffer of length <len>.
+ *
+ * The datagram addresses will be returned via the next arguments. <from> will
+ * be the peer address and <to> the reception one. Note that <to> can only be
+ * retrieved if the socket supports IP_PKTINFO or affiliated options. If not,
+ * <to> will be set as AF_UNSPEC. The caller must specify <dst_port> to ensure
+ * that the <to> address is completely filled.
+ *
+ * Returns the value from the recvmsg syscall.
+ */
+static ssize_t quic_recv(int fd, void *out, size_t len,
+                         struct sockaddr *from, socklen_t from_len,
+                         struct sockaddr *to, socklen_t to_len,
+                         uint16_t dst_port)
+{
+	union pktinfo {
+#ifdef IP_PKTINFO
+		struct in_pktinfo in;
+#else /* !IP_PKTINFO */
+		struct in_addr addr;
+#endif
+#ifdef IPV6_RECVPKTINFO
+		struct in6_pktinfo in6;
+#endif
+	};
+	char cdata[CMSG_SPACE(sizeof(union pktinfo))];
+	struct msghdr msg;
+	struct iovec vec;
+	struct cmsghdr *cmsg;
+	ssize_t ret;
+
+	vec.iov_base = out;
+	vec.iov_len = len;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_name = from;
+	msg.msg_namelen = from_len;
+	msg.msg_iov = &vec;
+	msg.msg_iovlen = 1;
+	msg.msg_control = &cdata;
+	msg.msg_controllen = sizeof(cdata);
+
+	clear_addr((struct sockaddr_storage *)to);
+
+	do {
+		ret = recvmsg(fd, &msg, 0);
+	} while (ret < 0 && errno == EINTR);
+
+	/* TODO handle errno. On EAGAIN/EWOULDBLOCK use fd_cant_recv() if
+	 * using a dedicated connection socket.
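+	 *
+	 * (Editorial aside, not from the HAProxy sources.) For the cmsg
+	 * parsing below to ever report a destination address, the matching
+	 * option must have been enabled at socket setup time, e.g.:
+	 *
+	 *     int one = 1;
+	 *
+	 *     // IPv4: IP_PKTINFO on Linux, IP_RECVDSTADDR on some BSDs
+	 *     setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &one, sizeof(one));
+	 *     // IPv6
+	 *     setsockopt(fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, &one, sizeof(one));
+	 *
+	 * qc_alloc_fd() below performs this setup for connection-owned sockets.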
+ */ + + if (ret < 0) + goto end; + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + switch (cmsg->cmsg_level) { + case IPPROTO_IP: +#if defined(IP_PKTINFO) + if (cmsg->cmsg_type == IP_PKTINFO) { + struct sockaddr_in *in = (struct sockaddr_in *)to; + struct in_pktinfo *info = (struct in_pktinfo *)CMSG_DATA(cmsg); + + if (to_len >= sizeof(struct sockaddr_in)) { + in->sin_family = AF_INET; + in->sin_addr = info->ipi_addr; + in->sin_port = dst_port; + } + } +#elif defined(IP_RECVDSTADDR) + if (cmsg->cmsg_type == IP_RECVDSTADDR) { + struct sockaddr_in *in = (struct sockaddr_in *)to; + struct in_addr *info = (struct in_addr *)CMSG_DATA(cmsg); + + if (to_len >= sizeof(struct sockaddr_in)) { + in->sin_family = AF_INET; + in->sin_addr.s_addr = info->s_addr; + in->sin_port = dst_port; + } + } +#endif /* IP_PKTINFO || IP_RECVDSTADDR */ + break; + + case IPPROTO_IPV6: +#ifdef IPV6_RECVPKTINFO + if (cmsg->cmsg_type == IPV6_PKTINFO) { + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)to; + struct in6_pktinfo *info6 = (struct in6_pktinfo *)CMSG_DATA(cmsg); + + if (to_len >= sizeof(struct sockaddr_in6)) { + in6->sin6_family = AF_INET6; + memcpy(&in6->sin6_addr, &info6->ipi6_addr, sizeof(in6->sin6_addr)); + in6->sin6_port = dst_port; + } + } +#endif + break; + } + } + + end: + return ret; +} + +/* Function called on a read event from a listening socket. It tries + * to handle as many connections as possible. + */ +void quic_lstnr_sock_fd_iocb(int fd) +{ + ssize_t ret; + struct quic_receiver_buf *rxbuf; + struct buffer *buf; + struct listener *l = objt_listener(fdtab[fd].owner); + struct quic_transport_params *params; + /* Source address */ + struct sockaddr_storage saddr = {0}, daddr = {0}; + size_t max_sz, cspace; + struct quic_dgram *new_dgram; + unsigned char *dgram_buf; + int max_dgrams; + + BUG_ON(!l); + + new_dgram = NULL; + if (!l) + return; + + if (!(fdtab[fd].state & FD_POLL_IN) || !fd_recv_ready(fd)) + return; + + rxbuf = MT_LIST_POP(&l->rx.rxbuf_list, typeof(rxbuf), rxbuf_el); + if (!rxbuf) + goto out; + + buf = &rxbuf->buf; + + max_dgrams = global.tune.maxpollevents; + start: + /* Try to reuse an existing dgram. Note that there is always at + * least one datagram to pick, except the first time we enter + * this function for this <rxbuf> buffer. + */ + new_dgram = quic_rxbuf_purge_dgrams(rxbuf); + + params = &l->bind_conf->quic_params; + max_sz = params->max_udp_payload_size; + cspace = b_contig_space(buf); + if (cspace < max_sz) { + struct proxy *px = l->bind_conf->frontend; + struct quic_counters *prx_counters = EXTRA_COUNTERS_GET(px->extra_counters_fe, &quic_stats_module); + struct quic_dgram *dgram; + + /* Do no mark <buf> as full, and do not try to consume it + * if the contiguous remaining space is not at the end + */ + if (b_tail(buf) + cspace < b_wrap(buf)) { + HA_ATOMIC_INC(&prx_counters->rxbuf_full); + goto out; + } + + /* Allocate a fake datagram, without data to locate + * the end of the RX buffer (required during purging). + */ + dgram = pool_alloc(pool_head_quic_dgram); + if (!dgram) + goto out; + + /* Initialize only the useful members of this fake datagram. */ + dgram->buf = NULL; + dgram->len = cspace; + /* Append this datagram only to the RX buffer list. It will + * not be treated by any datagram handler. 
+ */ + LIST_APPEND(&rxbuf->dgram_list, &dgram->recv_list); + + /* Consume the remaining space */ + b_add(buf, cspace); + if (b_contig_space(buf) < max_sz) { + HA_ATOMIC_INC(&prx_counters->rxbuf_full); + goto out; + } + } + + dgram_buf = (unsigned char *)b_tail(buf); + ret = quic_recv(fd, dgram_buf, max_sz, + (struct sockaddr *)&saddr, sizeof(saddr), + (struct sockaddr *)&daddr, sizeof(daddr), + get_net_port(&l->rx.addr)); + if (ret <= 0) + goto out; + + b_add(buf, ret); + if (!quic_lstnr_dgram_dispatch(dgram_buf, ret, l, &saddr, &daddr, + new_dgram, &rxbuf->dgram_list)) { + /* If wrong, consume this datagram */ + b_sub(buf, ret); + } + new_dgram = NULL; + if (--max_dgrams > 0) + goto start; + out: + pool_free(pool_head_quic_dgram, new_dgram); + MT_LIST_APPEND(&l->rx.rxbuf_list, &rxbuf->rxbuf_el); +} + +/* FD-owned quic-conn socket callback. */ +void quic_conn_sock_fd_iocb(int fd) +{ + struct quic_conn *qc = fdtab[fd].owner; + + TRACE_ENTER(QUIC_EV_CONN_RCV, qc); + + if (fd_send_active(fd) && fd_send_ready(fd)) { + TRACE_DEVEL("send ready", QUIC_EV_CONN_RCV, qc); + fd_stop_send(fd); + tasklet_wakeup_after(NULL, qc->wait_event.tasklet); + qc_notify_send(qc); + } + + if (fd_recv_ready(fd)) { + TRACE_DEVEL("recv ready", QUIC_EV_CONN_RCV, qc); + tasklet_wakeup_after(NULL, qc->wait_event.tasklet); + fd_stop_recv(fd); + } + + TRACE_LEAVE(QUIC_EV_CONN_RCV, qc); +} + +/* Send a datagram stored into <buf> buffer with <sz> as size. + * The caller must ensure there is at least <sz> bytes in this buffer. + * + * Returns the total bytes sent over the socket. 0 is returned if a transient + * error is encountered which allows send to be retry later. A negative value + * is used for a fatal error which guarantee that all future send operation for + * this connection will fail. + * + * TODO standardize this function for a generic UDP sendto wrapper. This can be + * done by removing the <qc> arg and replace it with address/port. 
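+ *
+ * (Editorial aside, not from the HAProxy sources.) The transient-vs-fatal
+ * convention documented above can be condensed into a standalone wrapper;
+ * a sketch with a hypothetical name:
+ *
+ *     #include <errno.h>
+ *     #include <sys/socket.h>
+ *
+ *     // >0: bytes sent; 0: transient failure, retry later; -1: fatal
+ *     static int udp_send_classified(int fd, const void *buf, size_t len,
+ *                                    const struct sockaddr *to, socklen_t tl)
+ *     {
+ *         ssize_t ret;
+ *
+ *         do {
+ *             ret = sendto(fd, buf, len, MSG_DONTWAIT | MSG_NOSIGNAL, to, tl);
+ *         } while (ret < 0 && errno == EINTR);
+ *
+ *         if (ret < 0)
+ *             return (errno == EAGAIN || errno == EWOULDBLOCK ||
+ *                     errno == ENOTCONN || errno == EINPROGRESS) ? 0 : -1;
+ *         return (size_t)ret == len ? (int)ret : 0;
+ *     }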
+ */ +int qc_snd_buf(struct quic_conn *qc, const struct buffer *buf, size_t sz, + int flags) +{ + ssize_t ret; + + do { + if (qc_test_fd(qc)) { + if (!fd_send_ready(qc->fd)) + return 0; + + ret = send(qc->fd, b_peek(buf, b_head_ofs(buf)), sz, + MSG_DONTWAIT | MSG_NOSIGNAL); + } +#if defined(IP_PKTINFO) || defined(IP_RECVDSTADDR) || defined(IPV6_RECVPKTINFO) + else if (is_addr(&qc->local_addr)) { + struct msghdr msg = { 0 }; + struct iovec vec; + struct cmsghdr *cmsg; +#ifdef IP_PKTINFO + struct in_pktinfo in; +#endif /* IP_PKTINFO */ +#ifdef IPV6_RECVPKTINFO + struct in6_pktinfo in6; +#endif /* IPV6_RECVPKTINFO */ + union { +#ifdef IP_PKTINFO + char buf[CMSG_SPACE(sizeof(in))]; +#endif /* IP_PKTINFO */ +#ifdef IPV6_RECVPKTINFO + char buf6[CMSG_SPACE(sizeof(in6))]; +#endif /* IPV6_RECVPKTINFO */ + char bufaddr[CMSG_SPACE(sizeof(struct in_addr))]; + struct cmsghdr align; + } u; + + vec.iov_base = b_peek(buf, b_head_ofs(buf)); + vec.iov_len = sz; + msg.msg_name = &qc->peer_addr; + msg.msg_namelen = get_addr_len(&qc->peer_addr); + msg.msg_iov = &vec; + msg.msg_iovlen = 1; + + switch (qc->local_addr.ss_family) { + case AF_INET: +#if defined(IP_PKTINFO) + memset(&in, 0, sizeof(in)); + memcpy(&in.ipi_spec_dst, + &((struct sockaddr_in *)&qc->local_addr)->sin_addr, + sizeof(struct in_addr)); + + msg.msg_control = u.buf; + msg.msg_controllen = sizeof(u.buf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = IPPROTO_IP; + cmsg->cmsg_type = IP_PKTINFO; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo)); + memcpy(CMSG_DATA(cmsg), &in, sizeof(in)); +#elif defined(IP_RECVDSTADDR) + msg.msg_control = u.bufaddr; + msg.msg_controllen = sizeof(u.bufaddr); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = IPPROTO_IP; + cmsg->cmsg_type = IP_SENDSRCADDR; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_addr)); + memcpy(CMSG_DATA(cmsg), + &((struct sockaddr_in *)&qc->local_addr)->sin_addr, + sizeof(struct in_addr)); +#endif /* IP_PKTINFO || IP_RECVDSTADDR */ + break; + + case AF_INET6: +#ifdef IPV6_RECVPKTINFO + memset(&in6, 0, sizeof(in6)); + memcpy(&in6.ipi6_addr, + &((struct sockaddr_in6 *)&qc->local_addr)->sin6_addr, + sizeof(struct in6_addr)); + + msg.msg_control = u.buf6; + msg.msg_controllen = sizeof(u.buf6); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = IPPROTO_IPV6; + cmsg->cmsg_type = IPV6_PKTINFO; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo)); + memcpy(CMSG_DATA(cmsg), &in6, sizeof(in6)); +#endif /* IPV6_RECVPKTINFO */ + break; + + default: + break; + } + + ret = sendmsg(qc->li->rx.fd, &msg, + MSG_DONTWAIT|MSG_NOSIGNAL); + } +#endif /* IP_PKTINFO || IP_RECVDSTADDR || IPV6_RECVPKTINFO */ + else { + ret = sendto(qc->li->rx.fd, b_peek(buf, b_head_ofs(buf)), sz, + MSG_DONTWAIT|MSG_NOSIGNAL, + (struct sockaddr *)&qc->peer_addr, + get_addr_len(&qc->peer_addr)); + } + } while (ret < 0 && errno == EINTR); + + if (ret < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK || + errno == ENOTCONN || errno == EINPROGRESS) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + qc->cntrs.socket_full++; + else + qc->cntrs.sendto_err++; + + /* transient error */ + fd_want_send(qc->fd); + fd_cant_send(qc->fd); + TRACE_PRINTF(TRACE_LEVEL_USER, QUIC_EV_CONN_SPPKTS, qc, 0, 0, 0, + "UDP send failure errno=%d (%s)", errno, strerror(errno)); + return 0; + } + else { + /* unrecoverable error */ + qc->cntrs.sendto_err_unknown++; + TRACE_PRINTF(TRACE_LEVEL_USER, QUIC_EV_CONN_SPPKTS, qc, 0, 0, 0, + "UDP send failure errno=%d (%s)", errno, strerror(errno)); + return -1; + } + } + + if (ret != sz) + return 0; + + 
return ret; +} + +/* Receive datagram on <qc> FD-owned socket. + * + * Returns the total number of bytes read or a negative value on error. + */ +int qc_rcv_buf(struct quic_conn *qc) +{ + struct sockaddr_storage saddr = {0}, daddr = {0}; + struct quic_transport_params *params; + struct quic_dgram *new_dgram = NULL; + struct buffer buf = BUF_NULL; + size_t max_sz; + unsigned char *dgram_buf; + struct listener *l; + ssize_t ret = 0; + + /* Do not call this if quic-conn FD is uninitialized. */ + BUG_ON(qc->fd < 0); + + TRACE_ENTER(QUIC_EV_CONN_RCV, qc); + l = qc->li; + + params = &l->bind_conf->quic_params; + max_sz = params->max_udp_payload_size; + + do { + if (!b_alloc(&buf)) + break; /* TODO subscribe for memory again available. */ + + b_reset(&buf); + BUG_ON(b_contig_space(&buf) < max_sz); + + /* Allocate datagram on first loop or after requeuing. */ + if (!new_dgram && !(new_dgram = pool_alloc(pool_head_quic_dgram))) + break; /* TODO subscribe for memory again available. */ + + dgram_buf = (unsigned char *)b_tail(&buf); + ret = quic_recv(qc->fd, dgram_buf, max_sz, + (struct sockaddr *)&saddr, sizeof(saddr), + (struct sockaddr *)&daddr, sizeof(daddr), + get_net_port(&qc->local_addr)); + if (ret <= 0) { + /* Subscribe FD for future reception. */ + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == ENOTCONN) + fd_want_recv(qc->fd); + /* TODO handle other error codes as fatal on the connection. */ + break; + } + + b_add(&buf, ret); + + new_dgram->buf = dgram_buf; + new_dgram->len = ret; + new_dgram->dcid_len = 0; + new_dgram->dcid = NULL; + new_dgram->saddr = saddr; + new_dgram->daddr = daddr; + new_dgram->qc = NULL; /* set later via quic_dgram_parse() */ + + TRACE_DEVEL("read datagram", QUIC_EV_CONN_RCV, qc, new_dgram); + + if (!quic_get_dgram_dcid(new_dgram->buf, + new_dgram->buf + new_dgram->len, + &new_dgram->dcid, &new_dgram->dcid_len)) { + continue; + } + + if (!qc_check_dcid(qc, new_dgram->dcid, new_dgram->dcid_len)) { + /* Datagram received by error on the connection FD, dispatch it + * to its associated quic-conn. + * + * TODO count redispatch datagrams. + */ + struct quic_receiver_buf *rxbuf; + struct quic_dgram *tmp_dgram; + unsigned char *rxbuf_tail; + size_t cspace; + + TRACE_STATE("datagram for other connection on quic-conn socket, requeue it", QUIC_EV_CONN_RCV, qc); + + rxbuf = MT_LIST_POP(&l->rx.rxbuf_list, typeof(rxbuf), rxbuf_el); + ALREADY_CHECKED(rxbuf); + cspace = b_contig_space(&rxbuf->buf); + + tmp_dgram = quic_rxbuf_purge_dgrams(rxbuf); + pool_free(pool_head_quic_dgram, tmp_dgram); + + /* Insert a fake datagram if space wraps to consume it. */ + if (cspace < new_dgram->len && b_space_wraps(&rxbuf->buf)) { + struct quic_dgram *fake_dgram = pool_alloc(pool_head_quic_dgram); + if (!fake_dgram) { + /* TODO count lost datagrams */ + MT_LIST_APPEND(&l->rx.rxbuf_list, &rxbuf->rxbuf_el); + continue; + } + + fake_dgram->buf = NULL; + fake_dgram->len = cspace; + LIST_APPEND(&rxbuf->dgram_list, &fake_dgram->recv_list); + b_add(&rxbuf->buf, cspace); + } + + /* Recheck contig space after fake datagram insert. */ + if (b_contig_space(&rxbuf->buf) < new_dgram->len) { + /* TODO count lost datagrams */ + MT_LIST_APPEND(&l->rx.rxbuf_list, &rxbuf->rxbuf_el); + continue; + } + + rxbuf_tail = (unsigned char *)b_tail(&rxbuf->buf); + __b_putblk(&rxbuf->buf, (char *)dgram_buf, new_dgram->len); + if (!quic_lstnr_dgram_dispatch(rxbuf_tail, ret, l, &saddr, &daddr, + new_dgram, &rxbuf->dgram_list)) { + /* TODO count lost datagrams. 
*/ + b_sub(&buf, ret); + } + else { + /* datagram must not be freed as it was requeued. */ + new_dgram = NULL; + } + + MT_LIST_APPEND(&l->rx.rxbuf_list, &rxbuf->rxbuf_el); + continue; + } + + quic_dgram_parse(new_dgram, qc, qc->li); + /* A datagram must always be consumed after quic_parse_dgram(). */ + BUG_ON(new_dgram->buf); + } while (ret > 0); + + pool_free(pool_head_quic_dgram, new_dgram); + + if (b_size(&buf)) { + b_free(&buf); + offer_buffers(NULL, 1); + } + + TRACE_LEAVE(QUIC_EV_CONN_RCV, qc); + return ret; +} + +/* Allocate a socket file-descriptor specific for QUIC connection <qc>. + * Endpoint addresses are specified by the two following arguments : <src> is + * the local address and <dst> is the remote one. + * + * Return the socket FD or a negative error code. On error, socket is marked as + * uninitialized. + */ +void qc_alloc_fd(struct quic_conn *qc, const struct sockaddr_storage *src, + const struct sockaddr_storage *dst) +{ + struct bind_conf *bc = qc->li->bind_conf; + struct proxy *p = bc->frontend; + int fd = -1; + int ret; + + /* Must not happen. */ + BUG_ON(src->ss_family != dst->ss_family); + + qc_init_fd(qc); + + fd = socket(src->ss_family, SOCK_DGRAM, 0); + if (fd < 0) + goto err; + + if (fd >= global.maxsock) { + send_log(p, LOG_EMERG, + "Proxy %s reached the configured maximum connection limit. Please check the global 'maxconn' value.\n", + p->id); + goto err; + } + + ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)); + if (ret < 0) + goto err; + + switch (src->ss_family) { + case AF_INET: +#if defined(IP_PKTINFO) + ret = setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &one, sizeof(one)); +#elif defined(IP_RECVDSTADDR) + ret = setsockopt(fd, IPPROTO_IP, IP_RECVDSTADDR, &one, sizeof(one)); +#endif /* IP_PKTINFO || IP_RECVDSTADDR */ + break; + case AF_INET6: +#ifdef IPV6_RECVPKTINFO + ret = setsockopt(fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, &one, sizeof(one)); +#endif + break; + } + if (ret < 0) + goto err; + + ret = bind(fd, (struct sockaddr *)src, get_addr_len(src)); + if (ret < 0) { + if (errno == EACCES) { + if (!quic_bind_eacces_warn) { + send_log(p, LOG_WARNING, + "Permission error on QUIC socket binding for proxy %s. Consider using setcap cap_net_bind_service (Linux only) or running as root.\n", + p->id); + quic_bind_eacces_warn = 1; + } + + /* Fallback to listener socket for this receiver instance. */ + HA_ATOMIC_STORE(&qc->li->rx.quic_mode, QUIC_SOCK_MODE_LSTNR); + } + goto err; + } + + ret = connect(fd, (struct sockaddr *)dst, get_addr_len(dst)); + if (ret < 0) + goto err; + + qc->fd = fd; + fd_set_nonblock(fd); + fd_insert(fd, qc, quic_conn_sock_fd_iocb, tgid, ti->ltid_bit); + fd_want_recv(fd); + + return; + + err: + if (fd >= 0) + close(fd); +} + +/* Release socket file-descriptor specific for QUIC connection <qc>. Set + * <reinit> if socket should be reinitialized after address migration. + */ +void qc_release_fd(struct quic_conn *qc, int reinit) +{ + if (qc_test_fd(qc)) { + fd_delete(qc->fd); + qc->fd = DEAD_FD_MAGIC; + + if (reinit) + qc_init_fd(qc); + } +} + +/* Wrapper for fd_want_recv(). Safe even if connection does not used its owned + * socket. + */ +void qc_want_recv(struct quic_conn *qc) +{ + if (qc_test_fd(qc)) + fd_want_recv(qc->fd); +} + +/*********************** QUIC accept queue management ***********************/ +/* per-thread accept queues */ +struct quic_accept_queue *quic_accept_queues; + +/* Install <qc> on the queue ready to be accepted. The queue task is then woken + * up. 
If <qc> accept is already scheduled or done, nothing is done.
+ */
+void quic_accept_push_qc(struct quic_conn *qc)
+{
+	struct quic_accept_queue *queue = &quic_accept_queues[tid];
+	struct li_per_thread *lthr = &qc->li->per_thr[ti->ltid];
+
+	/* early return if accept is already in progress/done for this
+	 * connection
+	 */
+	if (qc->flags & QUIC_FL_CONN_ACCEPT_REGISTERED)
+		return;
+
+	BUG_ON(MT_LIST_INLIST(&qc->accept_list));
+	HA_ATOMIC_INC(&qc->li->rx.quic_curr_accept);
+
+	qc->flags |= QUIC_FL_CONN_ACCEPT_REGISTERED;
+	/* 1. insert the listener into the accept queue
+	 *
+	 * Use TRY_APPEND as there is a possible race even with INLIST if
+	 * multiple threads try to add the same listener instance from several
+	 * quic_conns.
+	 */
+	if (!MT_LIST_INLIST(&(lthr->quic_accept.list)))
+		MT_LIST_TRY_APPEND(&queue->listeners, &(lthr->quic_accept.list));
+
+	/* 2. insert the quic_conn into the listener per-thread queue. */
+	MT_LIST_APPEND(&lthr->quic_accept.conns, &qc->accept_list);
+
+	/* 3. wake up the queue tasklet */
+	tasklet_wakeup(quic_accept_queues[tid].tasklet);
+}
+
+/* Tasklet handler to accept QUIC connections. Calls listener_accept on every
+ * listener instance registered in the accept queue.
+ */
+struct task *quic_accept_run(struct task *t, void *ctx, unsigned int i)
+{
+	struct li_per_thread *lthr;
+	struct mt_list *elt1, elt2;
+	struct quic_accept_queue *queue = &quic_accept_queues[tid];
+
+	mt_list_for_each_entry_safe(lthr, &queue->listeners, quic_accept.list, elt1, elt2) {
+		listener_accept(lthr->li);
+		if (!MT_LIST_ISEMPTY(&lthr->quic_accept.conns))
+			tasklet_wakeup((struct tasklet*)t);
+		else
+			MT_LIST_DELETE_SAFE(elt1);
+	}
+
+	return NULL;
+}
+
+/* Returns the maximum number of QUIC connections waiting for the handshake to
+ * complete in parallel on listener <l> instance. This is directly based on
+ * the listener backlog value.
+ */
+int quic_listener_max_handshake(const struct listener *l)
+{
+	return listener_backlog(l) / 2;
+}
+
+/* Returns the maximum number of QUIC connections waiting to be accepted for
+ * listener <l> instance. This is directly based on the listener backlog
+ * value.
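+ *
+ * (Editorial note.) For example, with a backlog of 1024, up to 512
+ * connections may sit in handshake (quic_listener_max_handshake() above)
+ * while up to 512 more wait to be accepted: each limit takes half of the
+ * backlog.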
+ */ +int quic_listener_max_accept(const struct listener *l) +{ + return listener_backlog(l) / 2; +} + +static int quic_alloc_accept_queues(void) +{ + int i; + + quic_accept_queues = calloc(global.nbthread, + sizeof(*quic_accept_queues)); + if (!quic_accept_queues) { + ha_alert("Failed to allocate the quic accept queues.\n"); + return 0; + } + + for (i = 0; i < global.nbthread; ++i) { + struct tasklet *task; + if (!(task = tasklet_new())) { + ha_alert("Failed to allocate the quic accept queue on thread %d.\n", i); + return 0; + } + + tasklet_set_tid(task, i); + task->process = quic_accept_run; + quic_accept_queues[i].tasklet = task; + + MT_LIST_INIT(&quic_accept_queues[i].listeners); + } + + return 1; +} +REGISTER_POST_CHECK(quic_alloc_accept_queues); + +static int quic_deallocate_accept_queues(void) +{ + int i; + + if (quic_accept_queues) { + for (i = 0; i < global.nbthread; ++i) + tasklet_free(quic_accept_queues[i].tasklet); + free(quic_accept_queues); + } + + return 1; +} +REGISTER_POST_DEINIT(quic_deallocate_accept_queues); diff --git a/src/quic_ssl.c b/src/quic_ssl.c new file mode 100644 index 0000000..314f587 --- /dev/null +++ b/src/quic_ssl.c @@ -0,0 +1,790 @@ +#include <haproxy/errors.h> +#include <haproxy/ncbuf.h> +#include <haproxy/proxy.h> +#include <haproxy/quic_conn.h> +#include <haproxy/quic_rx.h> +#include <haproxy/quic_sock.h> +#include <haproxy/quic_ssl.h> +#include <haproxy/quic_tls.h> +#include <haproxy/quic_tp.h> +#include <haproxy/quic_trace.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/trace.h> + +static BIO_METHOD *ha_quic_meth; + +DECLARE_POOL(pool_head_quic_ssl_sock_ctx, "quic_ssl_sock_ctx", sizeof(struct ssl_sock_ctx)); + +/* Set the encoded version of the transport parameter into the TLS + * stack depending on <ver> QUIC version and <server> boolean which must + * be set to 1 for a QUIC server, 0 for a client. + * Return 1 if succeeded, 0 if not. + */ +static int qc_ssl_set_quic_transport_params(struct quic_conn *qc, + const struct quic_version *ver, int server) +{ + int ret = 0; +#ifdef USE_QUIC_OPENSSL_COMPAT + unsigned char *in = qc->enc_params; + size_t insz = sizeof qc->enc_params; + size_t *enclen = &qc->enc_params_len; +#else + unsigned char tps[QUIC_TP_MAX_ENCLEN]; + size_t tpslen; + unsigned char *in = tps; + size_t insz = sizeof tps; + size_t *enclen = &tpslen; +#endif + + TRACE_ENTER(QUIC_EV_CONN_RWSEC, qc); + *enclen = quic_transport_params_encode(in, in + insz, &qc->rx.params, ver, server); + if (!*enclen) { + TRACE_ERROR("quic_transport_params_encode() failed", QUIC_EV_CONN_RWSEC); + goto leave; + } + + if (!SSL_set_quic_transport_params(qc->xprt_ctx->ssl, in, *enclen)) { + TRACE_ERROR("SSL_set_quic_transport_params() failed", QUIC_EV_CONN_RWSEC); + goto leave; + } + + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_RWSEC, qc); + return ret; +} + +/* This function copies the CRYPTO data provided by the TLS stack found at <data> + * with <len> as size in CRYPTO buffers dedicated to store the information about + * outgoing CRYPTO frames so that to be able to replay the CRYPTO data streams. + * It fails (returns 0) only if it could not managed to allocate enough CRYPTO + * buffers to store all the data. + * Note that CRYPTO data may exist at any encryption level except at 0-RTT. + */ +static int qc_ssl_crypto_data_cpy(struct quic_conn *qc, struct quic_enc_level *qel, + const unsigned char *data, size_t len) +{ + struct quic_crypto_buf **qcb; + /* The remaining byte to store in CRYPTO buffers. 
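+	 *
+	 * (Editorial note.) The stream offset of the next byte to buffer is
+	 * reconstructed below as
+	 *
+	 *     cf_offset = (nb_buf - 1) * QUIC_CRYPTO_BUF_SZ + last_buf->sz
+	 *
+	 * e.g. assuming 4 KiB buffers, with 3 buffers of which the last holds
+	 * 100 bytes so far: 2 * 4096 + 100 = 8292.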
*/ + size_t cf_offset, cf_len, *nb_buf; + unsigned char *pos; + int ret = 0; + + nb_buf = &qel->tx.crypto.nb_buf; + qcb = &qel->tx.crypto.bufs[*nb_buf - 1]; + cf_offset = (*nb_buf - 1) * QUIC_CRYPTO_BUF_SZ + (*qcb)->sz; + cf_len = len; + + TRACE_ENTER(QUIC_EV_CONN_ADDDATA, qc); + + while (len) { + size_t to_copy, room; + + pos = (*qcb)->data + (*qcb)->sz; + room = QUIC_CRYPTO_BUF_SZ - (*qcb)->sz; + to_copy = len > room ? room : len; + if (to_copy) { + memcpy(pos, data, to_copy); + /* Increment the total size of this CRYPTO buffers by <to_copy>. */ + qel->tx.crypto.sz += to_copy; + (*qcb)->sz += to_copy; + len -= to_copy; + data += to_copy; + } + else { + struct quic_crypto_buf **tmp; + + // FIXME: realloc! + tmp = realloc(qel->tx.crypto.bufs, + (*nb_buf + 1) * sizeof *qel->tx.crypto.bufs); + if (tmp) { + qel->tx.crypto.bufs = tmp; + qcb = &qel->tx.crypto.bufs[*nb_buf]; + *qcb = pool_alloc(pool_head_quic_crypto_buf); + if (!*qcb) { + TRACE_ERROR("Could not allocate crypto buf", QUIC_EV_CONN_ADDDATA, qc); + goto leave; + } + + (*qcb)->sz = 0; + ++*nb_buf; + } + else { + break; + } + } + } + + /* Allocate a TX CRYPTO frame only if all the CRYPTO data + * have been buffered. + */ + if (!len) { + struct quic_frame *frm; + struct quic_frame *found = NULL; + + /* There is at most one CRYPTO frame in this packet number + * space. Let's look for it. + */ + list_for_each_entry(frm, &qel->pktns->tx.frms, list) { + if (frm->type != QUIC_FT_CRYPTO) + continue; + + /* Found */ + found = frm; + break; + } + + if (found) { + found->crypto.len += cf_len; + } + else { + frm = qc_frm_alloc(QUIC_FT_CRYPTO); + if (!frm) { + TRACE_ERROR("Could not allocate quic frame", QUIC_EV_CONN_ADDDATA, qc); + goto leave; + } + + frm->crypto.offset = cf_offset; + frm->crypto.len = cf_len; + frm->crypto.qel = qel; + LIST_APPEND(&qel->pktns->tx.frms, &frm->list); + } + } + ret = len == 0; + leave: + TRACE_LEAVE(QUIC_EV_CONN_ADDDATA, qc); + return ret; +} + +/* returns 0 on error, 1 on success */ +static int ha_quic_set_encryption_secrets(SSL *ssl, enum ssl_encryption_level_t level, + const uint8_t *read_secret, + const uint8_t *write_secret, size_t secret_len) +{ + int ret = 0; + struct quic_conn *qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index); + struct quic_enc_level **qel = ssl_to_qel_addr(qc, level); + struct quic_pktns **pktns = ssl_to_quic_pktns(qc, level); + struct quic_tls_ctx *tls_ctx; + const SSL_CIPHER *cipher = SSL_get_current_cipher(ssl); + struct quic_tls_secrets *rx = NULL, *tx = NULL; + const struct quic_version *ver = + qc->negotiated_version ? 
qc->negotiated_version : qc->original_version;
+
+	TRACE_ENTER(QUIC_EV_CONN_RWSEC, qc);
+	BUG_ON(secret_len > QUIC_TLS_SECRET_LEN);
+
+	if (!*qel && !qc_enc_level_alloc(qc, pktns, qel, level)) {
+		TRACE_PROTO("Could not allocate an encryption level", QUIC_EV_CONN_ADDDATA, qc);
+		goto leave;
+	}
+
+	tls_ctx = &(*qel)->tls_ctx;
+
+	if (qc->flags & QUIC_FL_CONN_TO_KILL) {
+		TRACE_PROTO("connection to be killed", QUIC_EV_CONN_ADDDATA, qc);
+		goto out;
+	}
+
+	if (qc->flags & QUIC_FL_CONN_IMMEDIATE_CLOSE) {
+		TRACE_PROTO("CC required", QUIC_EV_CONN_RWSEC, qc);
+		goto out;
+	}
+
+	if (!read_secret)
+		goto write;
+
+	rx = &tls_ctx->rx;
+	rx->aead = tls_aead(cipher);
+	rx->md = tls_md(cipher);
+	rx->hp = tls_hp(cipher);
+	if (!rx->aead || !rx->md || !rx->hp)
+		goto leave;
+
+	if (!quic_tls_secrets_keys_alloc(rx)) {
+		TRACE_ERROR("RX keys allocation failed", QUIC_EV_CONN_RWSEC, qc);
+		goto leave;
+	}
+
+	if (!quic_tls_derive_keys(rx->aead, rx->hp, rx->md, ver, rx->key, rx->keylen,
+	                          rx->iv, rx->ivlen, rx->hp_key, sizeof rx->hp_key,
+	                          read_secret, secret_len)) {
+		TRACE_ERROR("RX key derivation failed", QUIC_EV_CONN_RWSEC, qc);
+		goto leave;
+	}
+
+	if (!quic_tls_rx_ctx_init(&rx->ctx, rx->aead, rx->key)) {
+		TRACE_ERROR("could not initialize RX TLS cipher context", QUIC_EV_CONN_RWSEC, qc);
+		goto leave;
+	}
+
+	if (!quic_tls_dec_aes_ctx_init(&rx->hp_ctx, rx->hp, rx->hp_key)) {
+		TRACE_ERROR("could not initialize RX TLS cipher context for HP", QUIC_EV_CONN_RWSEC, qc);
+		goto leave;
+	}
+
+	/* Enqueue this connection ASAP if we could derive 0-RTT secrets as
+	 * a listener. Note that a listener derives only RX secrets for this
+	 * level.
+	 */
+	if (qc_is_listener(qc) && level == ssl_encryption_early_data) {
+		TRACE_DEVEL("pushing connection into accept queue", QUIC_EV_CONN_RWSEC, qc);
+		quic_accept_push_qc(qc);
+	}
+
+write:
+
+	if (!write_secret)
+		goto keyupdate_init;
+
+	tx = &tls_ctx->tx;
+	tx->aead = tls_aead(cipher);
+	tx->md = tls_md(cipher);
+	tx->hp = tls_hp(cipher);
+	if (!tx->aead || !tx->md || !tx->hp)
+		goto leave;
+
+	if (!quic_tls_secrets_keys_alloc(tx)) {
+		TRACE_ERROR("TX keys allocation failed", QUIC_EV_CONN_RWSEC, qc);
+		goto leave;
+	}
+
+	if (!quic_tls_derive_keys(tx->aead, tx->hp, tx->md, ver, tx->key, tx->keylen,
+	                          tx->iv, tx->ivlen, tx->hp_key, sizeof tx->hp_key,
+	                          write_secret, secret_len)) {
+		TRACE_ERROR("TX key derivation failed", QUIC_EV_CONN_RWSEC, qc);
+		goto leave;
+	}
+
+	if (!quic_tls_tx_ctx_init(&tx->ctx, tx->aead, tx->key)) {
+		TRACE_ERROR("could not initialize TX TLS cipher context", QUIC_EV_CONN_RWSEC, qc);
+		goto leave;
+	}
+
+	if (!quic_tls_enc_aes_ctx_init(&tx->hp_ctx, tx->hp, tx->hp_key)) {
+		TRACE_ERROR("could not initialize TX TLS cipher context for HP", QUIC_EV_CONN_RWSEC, qc);
+		goto leave;
+	}
+
+	/* Set the transport parameters in the TLS stack. */
+	if (level == ssl_encryption_handshake && qc_is_listener(qc) &&
+	    !qc_ssl_set_quic_transport_params(qc, ver, 1))
+		goto leave;
+
+ keyupdate_init:
+	/* Store the secret provided by the TLS stack, required for key update.
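+	 *
+	 * (Editorial note.) These application-level secrets must be kept
+	 * because key update (RFC 9001 §6) derives each next-generation secret
+	 * from the current one, roughly:
+	 *
+	 *     secret(n + 1) = HKDF-Expand-Label(secret(n), "quic ku", "", hash_len)
+	 *
+	 * so the TLS stack is no longer involved once the handshake-time
+	 * secrets have been handed over.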
*/
+	if (level == ssl_encryption_application) {
+		struct quic_tls_kp *prv_rx = &qc->ku.prv_rx;
+		struct quic_tls_kp *nxt_rx = &qc->ku.nxt_rx;
+		struct quic_tls_kp *nxt_tx = &qc->ku.nxt_tx;
+
+		if (rx) {
+			if (!(rx->secret = pool_alloc(pool_head_quic_tls_secret))) {
+				TRACE_ERROR("Could not allocate RX application secret keys", QUIC_EV_CONN_RWSEC, qc);
+				goto leave;
+			}
+
+			memcpy(rx->secret, read_secret, secret_len);
+			rx->secretlen = secret_len;
+		}
+
+		if (tx) {
+			if (!(tx->secret = pool_alloc(pool_head_quic_tls_secret))) {
+				TRACE_ERROR("Could not allocate TX application secret keys", QUIC_EV_CONN_RWSEC, qc);
+				goto leave;
+			}
+
+			memcpy(tx->secret, write_secret, secret_len);
+			tx->secretlen = secret_len;
+		}
+
+		/* Initialize all the secret key lengths */
+		prv_rx->secretlen = nxt_rx->secretlen = nxt_tx->secretlen = secret_len;
+	}
+
+ out:
+	ret = 1;
+ leave:
+	if (!ret) {
+		/* Release the CRYPTO frames which have been provided by the TLS stack
+		 * to prevent the transmission of ack-eliciting packets.
+		 */
+		qc_release_pktns_frms(qc, qc->ipktns);
+		qc_release_pktns_frms(qc, qc->hpktns);
+		qc_release_pktns_frms(qc, qc->apktns);
+		quic_set_tls_alert(qc, SSL_AD_HANDSHAKE_FAILURE);
+	}
+
+	TRACE_LEAVE(QUIC_EV_CONN_RWSEC, qc, &level);
+	return ret;
+}
+
+#if defined(OPENSSL_IS_AWSLC)
+/* Compatibility functions for the split read/write encryption secrets API
+ * which uses two callbacks. */
+static inline int ha_quic_set_read_secret(SSL *ssl, enum ssl_encryption_level_t level,
+                                          const SSL_CIPHER *cipher, const uint8_t *secret,
+                                          size_t secret_len)
+{
+	return ha_quic_set_encryption_secrets(ssl, level, secret, NULL, secret_len);
+
+}
+
+static inline int ha_quic_set_write_secret(SSL *ssl, enum ssl_encryption_level_t level,
+                                           const SSL_CIPHER *cipher, const uint8_t *secret,
+                                           size_t secret_len)
+{
+
+	return ha_quic_set_encryption_secrets(ssl, level, NULL, secret, secret_len);
+
+}
+#endif
+
+/* ->add_handshake_data QUIC TLS callback used by the TLS stack when it
+ * wants to provide the QUIC layer with CRYPTO data.
+ * Returns 1 if succeeded, 0 if not.
+ */ +static int ha_quic_add_handshake_data(SSL *ssl, enum ssl_encryption_level_t level, + const uint8_t *data, size_t len) +{ + int ret = 0; + struct quic_conn *qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index); + struct quic_enc_level **qel = ssl_to_qel_addr(qc, level); + struct quic_pktns **pktns = ssl_to_quic_pktns(qc, level); + + TRACE_ENTER(QUIC_EV_CONN_ADDDATA, qc); + + if (qc->flags & QUIC_FL_CONN_TO_KILL) { + TRACE_PROTO("connection to be killed", QUIC_EV_CONN_ADDDATA, qc); + goto out; + } + + if (qc->flags & QUIC_FL_CONN_IMMEDIATE_CLOSE) { + TRACE_PROTO("CC required", QUIC_EV_CONN_ADDDATA, qc); + goto out; + } + + if (!*qel && !qc_enc_level_alloc(qc, pktns, qel, level)) + goto leave; + + if (!qc_ssl_crypto_data_cpy(qc, *qel, data, len)) { + TRACE_ERROR("Could not bufferize", QUIC_EV_CONN_ADDDATA, qc); + goto leave; + } + + TRACE_DEVEL("CRYPTO data buffered", QUIC_EV_CONN_ADDDATA, + qc, &level, &len); + out: + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_ADDDATA, qc); + return ret; +} + +static int ha_quic_flush_flight(SSL *ssl) +{ + struct quic_conn *qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index); + + TRACE_ENTER(QUIC_EV_CONN_FFLIGHT, qc); + TRACE_LEAVE(QUIC_EV_CONN_FFLIGHT, qc); + + return 1; +} + +static int ha_quic_send_alert(SSL *ssl, enum ssl_encryption_level_t level, uint8_t alert) +{ + struct quic_conn *qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index); + + TRACE_ENTER(QUIC_EV_CONN_SSLALERT, qc); + + TRACE_PROTO("Received TLS alert", QUIC_EV_CONN_SSLALERT, qc, &alert, &level); + + quic_set_tls_alert(qc, alert); + TRACE_LEAVE(QUIC_EV_CONN_SSLALERT, qc); + return 1; +} + +/* QUIC TLS methods */ +#if defined(OPENSSL_IS_AWSLC) +/* write/read set secret split */ +static SSL_QUIC_METHOD ha_quic_method = { + .set_read_secret = ha_quic_set_read_secret, + .set_write_secret = ha_quic_set_write_secret, + .add_handshake_data = ha_quic_add_handshake_data, + .flush_flight = ha_quic_flush_flight, + .send_alert = ha_quic_send_alert, +}; + +#else + +static SSL_QUIC_METHOD ha_quic_method = { + .set_encryption_secrets = ha_quic_set_encryption_secrets, + .add_handshake_data = ha_quic_add_handshake_data, + .flush_flight = ha_quic_flush_flight, + .send_alert = ha_quic_send_alert, +}; +#endif + +/* Initialize the TLS context of a listener with <bind_conf> as configuration. + * Returns an error count. + */ +int ssl_quic_initial_ctx(struct bind_conf *bind_conf) +{ + struct ssl_bind_conf __maybe_unused *ssl_conf_cur; + int cfgerr = 0; + + long options = + (SSL_OP_ALL & ~SSL_OP_DONT_INSERT_EMPTY_FRAGMENTS) | + SSL_OP_SINGLE_ECDH_USE | + SSL_OP_CIPHER_SERVER_PREFERENCE; + SSL_CTX *ctx; + + ctx = SSL_CTX_new(TLS_server_method()); + bind_conf->initial_ctx = ctx; + + SSL_CTX_set_options(ctx, options); + SSL_CTX_set_mode(ctx, SSL_MODE_RELEASE_BUFFERS); + SSL_CTX_set_min_proto_version(ctx, TLS1_3_VERSION); + SSL_CTX_set_max_proto_version(ctx, TLS1_3_VERSION); + +#ifdef SSL_CTRL_SET_TLSEXT_HOSTNAME +# if defined(HAVE_SSL_CLIENT_HELLO_CB) +# if defined(SSL_OP_NO_ANTI_REPLAY) + if (bind_conf->ssl_conf.early_data) { + SSL_CTX_set_options(ctx, SSL_OP_NO_ANTI_REPLAY); +# if defined(USE_QUIC_OPENSSL_COMPAT) || defined(OPENSSL_IS_AWSLC) + ha_warning("Binding [%s:%d] for %s %s: 0-RTT is not supported in limited QUIC compatibility mode, ignored.\n", + bind_conf->file, bind_conf->line, proxy_type_str(bind_conf->frontend), bind_conf->frontend->id); +# else + SSL_CTX_set_max_early_data(ctx, 0xffffffff); +# endif /* ! 
USE_QUIC_OPENSSL_COMPAT */ + } +# endif /* !SSL_OP_NO_ANTI_REPLAY */ + SSL_CTX_set_client_hello_cb(ctx, ssl_sock_switchctx_cbk, NULL); + SSL_CTX_set_tlsext_servername_callback(ctx, ssl_sock_switchctx_err_cbk); +# else /* ! HAVE_SSL_CLIENT_HELLO_CB */ + SSL_CTX_set_tlsext_servername_callback(ctx, ssl_sock_switchctx_cbk); +# endif + SSL_CTX_set_tlsext_servername_arg(ctx, bind_conf); +#endif +#ifdef USE_QUIC_OPENSSL_COMPAT + if (!quic_tls_compat_init(bind_conf, ctx)) + cfgerr++; +#endif + + return cfgerr; +} + +/* This function gives the details of SSL errors. It is used only + * if the debug mode and the verbose mode are activated. It dumps all + * the SSL errors until the stack is empty. + */ +static forceinline void qc_ssl_dump_errors(struct connection *conn) +{ + if (unlikely(global.mode & MODE_DEBUG)) { + while (1) { + const char *func = NULL; + unsigned long ret; + + ERR_peek_error_func(&func); + ret = ERR_get_error(); + if (!ret) + return; + + fprintf(stderr, "conn. @%p OpenSSL error[0x%lx] %s: %s\n", conn, ret, + func, ERR_reason_error_string(ret)); + } + } +} + +/* Provide CRYPTO data to the TLS stack found at <data> with <len> as length + * from <qel> encryption level with <ctx> as QUIC connection context. + * The remaining parameters are there for debugging purposes. + * Return 1 if succeeded, 0 if not. + */ +int qc_ssl_provide_quic_data(struct ncbuf *ncbuf, + enum ssl_encryption_level_t level, + struct ssl_sock_ctx *ctx, + const unsigned char *data, size_t len) +{ +#ifdef DEBUG_STRICT + enum ncb_ret ncb_ret; +#endif + int ssl_err, state; + struct quic_conn *qc; + int ret = 0; + + ssl_err = SSL_ERROR_NONE; + qc = ctx->qc; + + TRACE_ENTER(QUIC_EV_CONN_SSLDATA, qc); + + if (SSL_provide_quic_data(ctx->ssl, level, data, len) != 1) { + TRACE_ERROR("SSL_provide_quic_data() error", + QUIC_EV_CONN_SSLDATA, qc, NULL, NULL, ctx->ssl); + goto leave; + } + + state = qc->state; + if (state < QUIC_HS_ST_COMPLETE) { + ssl_err = SSL_do_handshake(ctx->ssl); + + if (qc->flags & QUIC_FL_CONN_TO_KILL) { + TRACE_DEVEL("connection to be killed", QUIC_EV_CONN_IO_CB, qc); + goto leave; + } + + /* Finalize the connection as soon as possible if the peer transport parameters + * have been received. This may be useful to send packets even if this + * handshake fails. + */ + if ((qc->flags & QUIC_FL_CONN_TX_TP_RECEIVED) && !qc_conn_finalize(qc, 1)) { + TRACE_ERROR("connection finalization failed", QUIC_EV_CONN_IO_CB, qc, &state); + goto leave; + } + + if (ssl_err != 1) { + ssl_err = SSL_get_error(ctx->ssl, ssl_err); + if (ssl_err == SSL_ERROR_WANT_READ || ssl_err == SSL_ERROR_WANT_WRITE) { + TRACE_PROTO("SSL handshake in progress", + QUIC_EV_CONN_IO_CB, qc, &state, &ssl_err); + goto out; + } + + TRACE_ERROR("SSL handshake error", QUIC_EV_CONN_IO_CB, qc, &state, &ssl_err); + HA_ATOMIC_INC(&qc->prx_counters->hdshk_fail); + qc_ssl_dump_errors(ctx->conn); + ERR_clear_error(); + goto leave; + } + + TRACE_PROTO("SSL handshake OK", QUIC_EV_CONN_IO_CB, qc, &state); + + /* Check that an ALPN could be negotiated */ + if (!qc->app_ops) { + TRACE_ERROR("No negotiated ALPN", QUIC_EV_CONN_IO_CB, qc, &state); + quic_set_tls_alert(qc, SSL_AD_NO_APPLICATION_PROTOCOL); + goto leave; + } + + /* I/O callback switch */ + qc->wait_event.tasklet->process = quic_conn_app_io_cb; + if (qc_is_listener(ctx->qc)) { + qc->flags |= QUIC_FL_CONN_NEED_POST_HANDSHAKE_FRMS; + qc->state = QUIC_HS_ST_CONFIRMED; + /* The connection is ready to be accepted. 
*/ + quic_accept_push_qc(qc); + + BUG_ON(qc->li->rx.quic_curr_handshake == 0); + HA_ATOMIC_DEC(&qc->li->rx.quic_curr_handshake); + } + else { + qc->state = QUIC_HS_ST_COMPLETE; + } + + /* Prepare the next key update */ + if (!quic_tls_key_update(qc)) { + TRACE_ERROR("quic_tls_key_update() failed", QUIC_EV_CONN_IO_CB, qc); + goto leave; + } + } else { + ssl_err = SSL_process_quic_post_handshake(ctx->ssl); + if (ssl_err != 1) { + ssl_err = SSL_get_error(ctx->ssl, ssl_err); + if (ssl_err == SSL_ERROR_WANT_READ || ssl_err == SSL_ERROR_WANT_WRITE) { + TRACE_PROTO("SSL post handshake in progress", + QUIC_EV_CONN_IO_CB, qc, &state, &ssl_err); + goto out; + } + + TRACE_ERROR("SSL post handshake error", + QUIC_EV_CONN_IO_CB, qc, &state, &ssl_err); + goto leave; + } + + TRACE_STATE("SSL post handshake succeeded", QUIC_EV_CONN_IO_CB, qc, &state); + } + + out: + ret = 1; + leave: + /* The CRYPTO data are consumed even in case of an error to release + * the memory asap. + */ + if (!ncb_is_null(ncbuf)) { +#ifdef DEBUG_STRICT + ncb_ret = ncb_advance(ncbuf, len); + /* ncb_advance() must always succeed. This is guaranteed as + * this is only done inside a data block. If false, this will + * lead to handshake failure with quic_enc_level offset shifted + * from buffer data. + */ + BUG_ON(ncb_ret != NCB_RET_OK); +#else + ncb_advance(ncbuf, len); +#endif + } + + TRACE_LEAVE(QUIC_EV_CONN_SSLDATA, qc); + return ret; +} + +/* Provide all the stored in order CRYPTO data received from the peer to the TLS. + * Return 1 if succeeded, 0 if not. + */ +int qc_ssl_provide_all_quic_data(struct quic_conn *qc, struct ssl_sock_ctx *ctx) +{ + int ret = 0; + struct quic_enc_level *qel; + struct ncbuf ncbuf = NCBUF_NULL; + + TRACE_ENTER(QUIC_EV_CONN_PHPKTS, qc); + list_for_each_entry(qel, &qc->qel_list, list) { + struct qf_crypto *qf_crypto, *qf_back; + + list_for_each_entry_safe(qf_crypto, qf_back, &qel->rx.crypto_frms, list) { + const unsigned char *crypto_data = qf_crypto->data; + size_t crypto_len = qf_crypto->len; + + /* Free this frame asap */ + LIST_DELETE(&qf_crypto->list); + pool_free(pool_head_qf_crypto, qf_crypto); + + if (!qc_ssl_provide_quic_data(&ncbuf, qel->level, ctx, + crypto_data, crypto_len)) + goto leave; + + TRACE_DEVEL("buffered crypto data were provided to TLS stack", + QUIC_EV_CONN_PHPKTS, qc, qel); + } + + if (!qel->cstream) + continue; + + if (!qc_treat_rx_crypto_frms(qc, qel, ctx)) + goto leave; + } + + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_PHPKTS, qc); + return ret; +} + +/* Try to allocate the <*ssl> SSL session object for <qc> QUIC connection + * with <ssl_ctx> as SSL context inherited settings. Also set the transport + * parameters of this session. + * This is the responsibility of the caller to check the validity of all the + * pointers passed as parameter to this function. + * Return 0 if succeeded, -1 if not. If failed, sets the ->err_code member of <qc->conn> to + * CO_ER_SSL_NO_MEM. 
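+ *
+ * Note on the allocation strategy below: each failing allocation is
+ * retried exactly once after reclaiming memory from the pools, i.e.
+ * the pattern is (sketch of the code that follows):
+ *
+ *     obj = alloc();
+ *     if (!obj) {
+ *         if (!retry--)
+ *             goto leave;
+ *         pool_gc(NULL);    // release unused pool memory
+ *         goto retry;       // second and last attempt
+ *     }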
+ */ +static int qc_ssl_sess_init(struct quic_conn *qc, SSL_CTX *ssl_ctx, SSL **ssl) +{ + int retry, ret = -1; + + TRACE_ENTER(QUIC_EV_CONN_NEW, qc); + + retry = 1; + retry: + *ssl = SSL_new(ssl_ctx); + if (!*ssl) { + if (!retry--) + goto leave; + + pool_gc(NULL); + goto retry; + } + + if (!SSL_set_ex_data(*ssl, ssl_qc_app_data_index, qc) || + !SSL_set_quic_method(*ssl, &ha_quic_method)) { + SSL_free(*ssl); + *ssl = NULL; + if (!retry--) + goto leave; + + pool_gc(NULL); + goto retry; + } + + ret = 0; + leave: + TRACE_LEAVE(QUIC_EV_CONN_NEW, qc); + return ret; +} + +/* Allocate the ssl_sock_ctx from connection <qc>. This creates the tasklet + * used to process <qc> received packets. The allocated context is stored in + * <qc.xprt_ctx>. + * + * Returns 0 on success else non-zero. + */ +int qc_alloc_ssl_sock_ctx(struct quic_conn *qc) +{ + int ret = 0; + struct bind_conf *bc = qc->li->bind_conf; + struct ssl_sock_ctx *ctx = NULL; + + TRACE_ENTER(QUIC_EV_CONN_NEW, qc); + + ctx = pool_alloc(pool_head_quic_ssl_sock_ctx); + if (!ctx) { + TRACE_ERROR("SSL context allocation failed", QUIC_EV_CONN_TXPKT); + goto err; + } + + ctx->conn = NULL; + ctx->bio = NULL; + ctx->xprt = NULL; + ctx->xprt_ctx = NULL; + memset(&ctx->wait_event, 0, sizeof(ctx->wait_event)); + ctx->subs = NULL; + ctx->xprt_st = 0; + ctx->error_code = 0; + ctx->early_buf = BUF_NULL; + ctx->sent_early_data = 0; + ctx->qc = qc; + + if (qc_is_listener(qc)) { + if (qc_ssl_sess_init(qc, bc->initial_ctx, &ctx->ssl) == -1) + goto err; +#if (HA_OPENSSL_VERSION_NUMBER >= 0x10101000L) && !defined(OPENSSL_IS_AWSLC) +#ifndef USE_QUIC_OPENSSL_COMPAT + /* Enabling 0-RTT */ + if (bc->ssl_conf.early_data) + SSL_set_quic_early_data_enabled(ctx->ssl, 1); +#endif +#endif + + SSL_set_accept_state(ctx->ssl); + } + + ctx->xprt = xprt_get(XPRT_QUIC); + + /* Store the allocated context in <qc>. */ + qc->xprt_ctx = ctx; + + /* global.sslconns is already incremented on INITIAL packet parsing. 
*/ + _HA_ATOMIC_INC(&global.totalsslconns); + + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_NEW, qc); + return !ret; + + err: + pool_free(pool_head_quic_ssl_sock_ctx, ctx); + goto leave; +} + +static void __quic_conn_init(void) +{ + ha_quic_meth = BIO_meth_new(0x666, "ha QUIC methods"); +} +INITCALL0(STG_REGISTER, __quic_conn_init); + +static void __quic_conn_deinit(void) +{ + BIO_meth_free(ha_quic_meth); +} +REGISTER_POST_DEINIT(__quic_conn_deinit); diff --git a/src/quic_stats.c b/src/quic_stats.c new file mode 100644 index 0000000..3657f30 --- /dev/null +++ b/src/quic_stats.c @@ -0,0 +1,215 @@ +#include <haproxy/quic_frame-t.h> +#include <haproxy/quic_stats-t.h> +#include <haproxy/stats.h> + +static struct name_desc quic_stats[] = { + [QUIC_ST_RXBUF_FULL] = { .name = "quic_rxbuf_full", + .desc = "Total number of cancelled reception due to full receiver buffer" }, + [QUIC_ST_DROPPED_PACKET] = { .name = "quic_dropped_pkt", + .desc = "Total number of dropped packets" }, + [QUIC_ST_DROPPED_PACKET_BUFOVERRUN] = { .name = "quic_dropped_pkt_bufoverrun", + .desc = "Total number of dropped packets because of buffer overrun" }, + [QUIC_ST_DROPPED_PARSING] = { .name = "quic_dropped_parsing_pkt", + .desc = "Total number of dropped packets upon parsing error" }, + [QUIC_ST_SOCKET_FULL] = { .name = "quic_socket_full", + .desc = "Total number of EAGAIN error on sendto() calls" }, + [QUIC_ST_SENDTO_ERR] = { .name = "quic_sendto_err", + .desc = "Total number of error on sendto() calls, EAGAIN excepted" }, + [QUIC_ST_SENDTO_ERR_UNKNWN] = { .name = "quic_sendto_err_unknwn", + .desc = "Total number of error on sendto() calls not explicitly listed" }, + [QUIC_ST_SENT_PACKET] = { .name = "quic_sent_pkt", + .desc = "Total number of sent packets" }, + [QUIC_ST_LOST_PACKET] = { .name = "quic_lost_pkt", + .desc = "Total number of lost sent packets" }, + [QUIC_ST_TOO_SHORT_INITIAL_DGRAM] = { .name = "quic_too_short_dgram", + .desc = "Total number of too short dgrams with Initial packets" }, + [QUIC_ST_RETRY_SENT] = { .name = "quic_retry_sent", + .desc = "Total number of Retry sent" }, + [QUIC_ST_RETRY_VALIDATED] = { .name = "quic_retry_validated", + .desc = "Total number of validated Retry tokens" }, + [QUIC_ST_RETRY_ERRORS] = { .name = "quic_retry_error", + .desc = "Total number of Retry tokens errors" }, + [QUIC_ST_HALF_OPEN_CONN] = { .name = "quic_half_open_conn", + .desc = "Total number of half open connections" }, + [QUIC_ST_HDSHK_FAIL] = { .name = "quic_hdshk_fail", + .desc = "Total number of handshake failures" }, + [QUIC_ST_STATELESS_RESET_SENT] = { .name = "quic_stless_rst_sent", + .desc = "Total number of stateless reset packet sent" }, + /* Special events of interest */ + [QUIC_ST_CONN_MIGRATION_DONE] = { .name = "quic_conn_migration_done", + .desc = "Total number of connection migration proceeded" }, + /* Transport errors */ + [QUIC_ST_TRANSP_ERR_NO_ERROR] = { .name = "quic_transp_err_no_error", + .desc = "Total number of NO_ERROR errors received" }, + [QUIC_ST_TRANSP_ERR_INTERNAL_ERROR] = { .name = "quic_transp_err_internal_error", + .desc = "Total number of INTERNAL_ERROR errors received" }, + [QUIC_ST_TRANSP_ERR_CONNECTION_REFUSED] = { .name = "quic_transp_err_connection_refused", + .desc = "Total number of CONNECTION_REFUSED errors received" }, + [QUIC_ST_TRANSP_ERR_FLOW_CONTROL_ERROR] = { .name = "quic_transp_err_flow_control_error", + .desc = "Total number of FLOW_CONTROL_ERROR errors received" }, + [QUIC_ST_TRANSP_ERR_STREAM_LIMIT_ERROR] = { .name = "quic_transp_err_stream_limit_error", + .desc 
= "Total number of STREAM_LIMIT_ERROR errors received" }, + [QUIC_ST_TRANSP_ERR_STREAM_STATE_ERROR] = { .name = "quic_transp_err_stream_state_error", + .desc = "Total number of STREAM_STATE_ERROR errors received" }, + [QUIC_ST_TRANSP_ERR_FINAL_SIZE_ERROR] = { .name = "quic_transp_err_final_size_error", + .desc = "Total number of FINAL_SIZE_ERROR errors received" }, + [QUIC_ST_TRANSP_ERR_FRAME_ENCODING_ERROR] = { .name = "quic_transp_err_frame_encoding_error", + .desc = "Total number of FRAME_ENCODING_ERROR errors received" }, + [QUIC_ST_TRANSP_ERR_TRANSPORT_PARAMETER_ERROR] = { .name = "quic_transp_err_transport_parameter_error", + .desc = "Total number of TRANSPORT_PARAMETER_ERROR errors received" }, + [QUIC_ST_TRANSP_ERR_CONNECTION_ID_LIMIT_ERROR] = { .name = "quic_transp_err_connection_id_limit", + .desc = "Total number of CONNECTION_ID_LIMIT_ERROR errors received" }, + [QUIC_ST_TRANSP_ERR_PROTOCOL_VIOLATION] = { .name = "quic_transp_err_protocol_violation_error", + .desc = "Total number of PROTOCOL_VIOLATION errors received" }, + [QUIC_ST_TRANSP_ERR_INVALID_TOKEN] = { .name = "quic_transp_err_invalid_token", + .desc = "Total number of INVALID_TOKEN errors received" }, + [QUIC_ST_TRANSP_ERR_APPLICATION_ERROR] = { .name = "quic_transp_err_application_error", + .desc = "Total number of APPLICATION_ERROR errors received" }, + [QUIC_ST_TRANSP_ERR_CRYPTO_BUFFER_EXCEEDED] = { .name = "quic_transp_err_crypto_buffer_exceeded", + .desc = "Total number of CRYPTO_BUFFER_EXCEEDED errors received" }, + [QUIC_ST_TRANSP_ERR_KEY_UPDATE_ERROR] = { .name = "quic_transp_err_key_update_error", + .desc = "Total number of KEY_UPDATE_ERROR errors received" }, + [QUIC_ST_TRANSP_ERR_AEAD_LIMIT_REACHED] = { .name = "quic_transp_err_aead_limit_reached", + .desc = "Total number of AEAD_LIMIT_REACHED errors received" }, + [QUIC_ST_TRANSP_ERR_NO_VIABLE_PATH] = { .name = "quic_transp_err_no_viable_path", + .desc = "Total number of NO_VIABLE_PATH errors received" }, + [QUIC_ST_TRANSP_ERR_CRYPTO_ERROR] = { .name = "quic_transp_err_crypto_error", + .desc = "Total number of CRYPTO_ERROR errors received" }, + [QUIC_ST_TRANSP_ERR_UNKNOWN_ERROR] = { .name = "quic_transp_err_unknown_error", + .desc = "Total number of UNKNOWN_ERROR errors received" }, + /* Streams related counters */ + [QUIC_ST_DATA_BLOCKED] = { .name = "quic_data_blocked", + .desc = "Total number of received DATA_BLOCKED frames" }, + [QUIC_ST_STREAM_DATA_BLOCKED] = { .name = "quic_stream_data_blocked", + .desc = "Total number of received STREAM_DATA_BLOCKED frames" }, + [QUIC_ST_STREAMS_BLOCKED_BIDI] = { .name = "quic_streams_blocked_bidi", + .desc = "Total number of received STREAMS_BLOCKED_BIDI frames" }, + [QUIC_ST_STREAMS_BLOCKED_UNI] = { .name = "quic_streams_blocked_uni", + .desc = "Total number of received STREAMS_BLOCKED_UNI frames" }, +}; + +struct quic_counters quic_counters; + +static void quic_fill_stats(void *data, struct field *stats) +{ + struct quic_counters *counters = data; + + stats[QUIC_ST_RXBUF_FULL] = mkf_u64(FN_COUNTER, counters->rxbuf_full); + stats[QUIC_ST_DROPPED_PACKET] = mkf_u64(FN_COUNTER, counters->dropped_pkt); + stats[QUIC_ST_DROPPED_PACKET_BUFOVERRUN] = mkf_u64(FN_COUNTER, counters->dropped_pkt_bufoverrun); + stats[QUIC_ST_DROPPED_PARSING] = mkf_u64(FN_COUNTER, counters->dropped_parsing); + stats[QUIC_ST_SOCKET_FULL] = mkf_u64(FN_COUNTER, counters->socket_full); + stats[QUIC_ST_SENDTO_ERR] = mkf_u64(FN_COUNTER, counters->sendto_err); + stats[QUIC_ST_SENDTO_ERR_UNKNWN] = mkf_u64(FN_COUNTER, counters->sendto_err_unknown); + 
stats[QUIC_ST_SENT_PACKET] = mkf_u64(FN_COUNTER, counters->sent_pkt); + stats[QUIC_ST_LOST_PACKET] = mkf_u64(FN_COUNTER, counters->lost_pkt); + stats[QUIC_ST_TOO_SHORT_INITIAL_DGRAM] = mkf_u64(FN_COUNTER, counters->too_short_initial_dgram); + stats[QUIC_ST_RETRY_SENT] = mkf_u64(FN_COUNTER, counters->retry_sent); + stats[QUIC_ST_RETRY_VALIDATED] = mkf_u64(FN_COUNTER, counters->retry_validated); + stats[QUIC_ST_RETRY_ERRORS] = mkf_u64(FN_COUNTER, counters->retry_error); + stats[QUIC_ST_HALF_OPEN_CONN] = mkf_u64(FN_GAUGE, counters->half_open_conn); + stats[QUIC_ST_HDSHK_FAIL] = mkf_u64(FN_COUNTER, counters->hdshk_fail); + stats[QUIC_ST_STATELESS_RESET_SENT] = mkf_u64(FN_COUNTER, counters->stateless_reset_sent); + /* Special events of interest */ + stats[QUIC_ST_CONN_MIGRATION_DONE] = mkf_u64(FN_COUNTER, counters->conn_migration_done); + /* Transport errors */ + stats[QUIC_ST_TRANSP_ERR_NO_ERROR] = mkf_u64(FN_COUNTER, counters->quic_transp_err_no_error); + stats[QUIC_ST_TRANSP_ERR_INTERNAL_ERROR] = mkf_u64(FN_COUNTER, counters->quic_transp_err_internal_error); + stats[QUIC_ST_TRANSP_ERR_CONNECTION_REFUSED] = mkf_u64(FN_COUNTER, counters->quic_transp_err_connection_refused); + stats[QUIC_ST_TRANSP_ERR_FLOW_CONTROL_ERROR] = mkf_u64(FN_COUNTER, counters->quic_transp_err_flow_control_error); + stats[QUIC_ST_TRANSP_ERR_STREAM_LIMIT_ERROR] = mkf_u64(FN_COUNTER, counters->quic_transp_err_stream_limit_error); + stats[QUIC_ST_TRANSP_ERR_STREAM_STATE_ERROR] = mkf_u64(FN_COUNTER, counters->quic_transp_err_stream_state_error); + stats[QUIC_ST_TRANSP_ERR_FINAL_SIZE_ERROR] = mkf_u64(FN_COUNTER, counters->quic_transp_err_final_size_error); + stats[QUIC_ST_TRANSP_ERR_FRAME_ENCODING_ERROR] = mkf_u64(FN_COUNTER, counters->quic_transp_err_frame_encoding_error); + stats[QUIC_ST_TRANSP_ERR_TRANSPORT_PARAMETER_ERROR] = mkf_u64(FN_COUNTER, counters->quic_transp_err_transport_parameter_error); + stats[QUIC_ST_TRANSP_ERR_CONNECTION_ID_LIMIT_ERROR] = mkf_u64(FN_COUNTER, counters->quic_transp_err_connection_id_limit); + stats[QUIC_ST_TRANSP_ERR_PROTOCOL_VIOLATION] = mkf_u64(FN_COUNTER, counters->quic_transp_err_protocol_violation); + stats[QUIC_ST_TRANSP_ERR_INVALID_TOKEN] = mkf_u64(FN_COUNTER, counters->quic_transp_err_invalid_token); + stats[QUIC_ST_TRANSP_ERR_APPLICATION_ERROR] = mkf_u64(FN_COUNTER, counters->quic_transp_err_application_error); + stats[QUIC_ST_TRANSP_ERR_CRYPTO_BUFFER_EXCEEDED] = mkf_u64(FN_COUNTER, counters->quic_transp_err_crypto_buffer_exceeded); + stats[QUIC_ST_TRANSP_ERR_KEY_UPDATE_ERROR] = mkf_u64(FN_COUNTER, counters->quic_transp_err_key_update_error); + stats[QUIC_ST_TRANSP_ERR_AEAD_LIMIT_REACHED] = mkf_u64(FN_COUNTER, counters->quic_transp_err_aead_limit_reached); + stats[QUIC_ST_TRANSP_ERR_NO_VIABLE_PATH] = mkf_u64(FN_COUNTER, counters->quic_transp_err_no_viable_path); + stats[QUIC_ST_TRANSP_ERR_CRYPTO_ERROR] = mkf_u64(FN_COUNTER, counters->quic_transp_err_crypto_error); + stats[QUIC_ST_TRANSP_ERR_UNKNOWN_ERROR] = mkf_u64(FN_COUNTER, counters->quic_transp_err_unknown_error); + /* Streams related counters */ + stats[QUIC_ST_DATA_BLOCKED] = mkf_u64(FN_COUNTER, counters->data_blocked); + stats[QUIC_ST_STREAM_DATA_BLOCKED] = mkf_u64(FN_COUNTER, counters->stream_data_blocked); + stats[QUIC_ST_STREAMS_BLOCKED_BIDI] = mkf_u64(FN_COUNTER, counters->streams_blocked_bidi); + stats[QUIC_ST_STREAMS_BLOCKED_UNI] = mkf_u64(FN_COUNTER, counters->streams_blocked_uni); +} + +struct stats_module quic_stats_module = { + .name = "quic", + .fill_stats = quic_fill_stats, + .stats = quic_stats, + .stats_count = 
QUIC_STATS_COUNT, + .counters = &quic_counters, + .counters_size = sizeof(quic_counters), + .domain_flags = MK_STATS_PROXY_DOMAIN(STATS_PX_CAP_FE), + .clearable = 1, +}; + +INITCALL1(STG_REGISTER, stats_register_module, &quic_stats_module); + +void quic_stats_transp_err_count_inc(struct quic_counters *ctrs, int error_code) +{ + switch (error_code) { + case QC_ERR_NO_ERROR: + HA_ATOMIC_INC(&ctrs->quic_transp_err_no_error); + break; + case QC_ERR_INTERNAL_ERROR: + HA_ATOMIC_INC(&ctrs->quic_transp_err_internal_error); + break; + case QC_ERR_CONNECTION_REFUSED: + HA_ATOMIC_INC(&ctrs->quic_transp_err_connection_refused); + break; + case QC_ERR_FLOW_CONTROL_ERROR: + HA_ATOMIC_INC(&ctrs->quic_transp_err_flow_control_error); + break; + case QC_ERR_STREAM_LIMIT_ERROR: + HA_ATOMIC_INC(&ctrs->quic_transp_err_stream_limit_error); + break; + case QC_ERR_STREAM_STATE_ERROR: + HA_ATOMIC_INC(&ctrs->quic_transp_err_stream_state_error); + break; + case QC_ERR_FINAL_SIZE_ERROR: + HA_ATOMIC_INC(&ctrs->quic_transp_err_final_size_error); + break; + case QC_ERR_FRAME_ENCODING_ERROR: + HA_ATOMIC_INC(&ctrs->quic_transp_err_frame_encoding_error); + break; + case QC_ERR_TRANSPORT_PARAMETER_ERROR: + HA_ATOMIC_INC(&ctrs->quic_transp_err_transport_parameter_error); + break; + case QC_ERR_CONNECTION_ID_LIMIT_ERROR: + HA_ATOMIC_INC(&ctrs->quic_transp_err_connection_id_limit); + break; + case QC_ERR_PROTOCOL_VIOLATION: + HA_ATOMIC_INC(&ctrs->quic_transp_err_protocol_violation); + break; + case QC_ERR_INVALID_TOKEN: + HA_ATOMIC_INC(&ctrs->quic_transp_err_invalid_token); + break; + case QC_ERR_APPLICATION_ERROR: + HA_ATOMIC_INC(&ctrs->quic_transp_err_application_error); + break; + case QC_ERR_CRYPTO_BUFFER_EXCEEDED: + HA_ATOMIC_INC(&ctrs->quic_transp_err_crypto_buffer_exceeded); + break; + case QC_ERR_KEY_UPDATE_ERROR: + HA_ATOMIC_INC(&ctrs->quic_transp_err_key_update_error); + break; + case QC_ERR_AEAD_LIMIT_REACHED: + HA_ATOMIC_INC(&ctrs->quic_transp_err_aead_limit_reached); + break; + case QC_ERR_NO_VIABLE_PATH: + HA_ATOMIC_INC(&ctrs->quic_transp_err_no_viable_path); + break; + default: + if (error_code >= 0x100 && error_code <= 0x1ff) + HA_ATOMIC_INC(&ctrs->quic_transp_err_crypto_error); + else + HA_ATOMIC_INC(&ctrs->quic_transp_err_unknown_error); + } +} diff --git a/src/quic_stream.c b/src/quic_stream.c new file mode 100644 index 0000000..a4b984d --- /dev/null +++ b/src/quic_stream.c @@ -0,0 +1,294 @@ +#include <haproxy/quic_stream.h> + +#include <import/eb64tree.h> + +#include <haproxy/api.h> +#include <haproxy/buf.h> +#include <haproxy/dynbuf.h> +#include <haproxy/list.h> +#include <haproxy/mux_quic-t.h> +#include <haproxy/pool.h> +#include <haproxy/quic_conn.h> +#include <haproxy/task.h> + +DECLARE_STATIC_POOL(pool_head_quic_stream_desc, "qc_stream_desc", + sizeof(struct qc_stream_desc)); +DECLARE_STATIC_POOL(pool_head_quic_stream_buf, "qc_stream_buf", + sizeof(struct qc_stream_buf)); + + +static void qc_stream_buf_free(struct qc_stream_desc *stream, + struct qc_stream_buf **stream_buf) +{ + struct quic_conn *qc = stream->qc; + struct buffer *buf = &(*stream_buf)->buf; + + LIST_DEL_INIT(&(*stream_buf)->list); + + /* Reset current buf ptr if deleted instance is the same one. */ + if (*stream_buf == stream->buf) + stream->buf = NULL; + + b_free(buf); + offer_buffers(NULL, 1); + pool_free(pool_head_quic_stream_buf, *stream_buf); + *stream_buf = NULL; + + /* notify MUX about available buffers. 
*/ + --qc->stream_buf_count; + if (qc->mux_state == QC_MUX_READY) { + if (qc->qcc->flags & QC_CF_CONN_FULL) { + qc->qcc->flags &= ~QC_CF_CONN_FULL; + tasklet_wakeup(qc->qcc->wait_event.tasklet); + } + } +} + +/* Allocate a new stream descriptor with id <id>. The caller is responsible + * for storing the stream in the appropriate tree. The special value -1 must + * be used for a CRYPTO data stream, the type being ignored. + * + * Returns the newly allocated instance on success or else NULL. + */ +struct qc_stream_desc *qc_stream_desc_new(uint64_t id, enum qcs_type type, void *ctx, + struct quic_conn *qc) +{ + struct qc_stream_desc *stream; + + stream = pool_alloc(pool_head_quic_stream_desc); + if (!stream) + return NULL; + + if (id == (uint64_t)-1) { + stream->by_id.key = (uint64_t)-1; + } + else { + stream->by_id.key = id; + eb64_insert(&qc->streams_by_id, &stream->by_id); + qc->rx.strms[type].nb_streams++; + } + stream->qc = qc; + + stream->buf = NULL; + LIST_INIT(&stream->buf_list); + stream->buf_offset = 0; + + stream->acked_frms = EB_ROOT; + stream->ack_offset = 0; + stream->release = 0; + stream->ctx = ctx; + + return stream; +} + +/* Mark the stream descriptor <stream> as released. It will be freed as soon as + * all its buffered data are acknowledged. Does nothing if <stream> is already + * NULL. + * + * <final_size> corresponds to the last offset sent for this stream. If there + * is unsent data, it will be removed first to guarantee that the buffer is + * freed once all acknowledgements have been received. + */ +void qc_stream_desc_release(struct qc_stream_desc *stream, + uint64_t final_size) +{ + if (!stream) + return; + + /* A stream can be released only once. */ + BUG_ON(stream->release); + + stream->release = 1; + stream->ctx = NULL; + + if (stream->buf) { + struct qc_stream_buf *stream_buf = stream->buf; + struct buffer *buf = &stream_buf->buf; + const uint64_t tail_offset = + MAX(stream->buf_offset, stream->ack_offset) + b_data(buf); + + /* final_size cannot be greater than all currently stored data. */ + BUG_ON(final_size > tail_offset); + + /* Remove unsent data from current buffer. */ + if (final_size < tail_offset) { + b_sub(buf, tail_offset - final_size); + /* Remove the buffer if all ACKs were already received. */ + if (!b_data(buf)) + qc_stream_buf_free(stream, &stream_buf); + } + + /* A released stream does not use <stream.buf>. */ + stream->buf = NULL; + } + + if (LIST_ISEMPTY(&stream->buf_list)) { + /* if no buffer left we can free the stream. */ + qc_stream_desc_free(stream, 0); + } +} + +/* Acknowledge data at <offset> of length <len> for <stream>. It is handled + * only if it covers a range corresponding to stream.ack_offset. After data + * removal, if the stream does not contain data any more and is already + * released, the stream instance is freed. <stream> is set to NULL to indicate + * this. + * + * Returns the count of bytes removed from the stream. Do not forget to check + * if <stream> is NULL after invocation. + */ +int qc_stream_desc_ack(struct qc_stream_desc **stream, size_t offset, size_t len) +{ + struct qc_stream_desc *s = *stream; + struct qc_stream_buf *stream_buf; + struct buffer *buf; + size_t diff; + + if (offset + len <= s->ack_offset || offset > s->ack_offset) + return 0; + + /* There must be at least a buffer or we must not report an ACK. 
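+ * For example (hypothetical offsets): with ack_offset == 1000, an ACK at
+ * offset == 800 with len == 400 overlaps the unacknowledged head, so
+ * diff == 800 + 400 - 1000 == 200 bytes are released: ack_offset moves to
+ * 1200 and b_del() drops 200 bytes from the oldest buffer. An ACK fully
+ * below ack_offset (e.g. offset == 100, len == 50) or starting beyond it
+ * (offset > ack_offset) is ignored and 0 is returned.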
*/ + BUG_ON(LIST_ISEMPTY(&s->buf_list)); + + /* get oldest buffer from buf_list */ + stream_buf = LIST_NEXT(&s->buf_list, struct qc_stream_buf *, list); + buf = &stream_buf->buf; + + diff = offset + len - s->ack_offset; + s->ack_offset += diff; + b_del(buf, diff); + + /* Free oldest buffer if all data acknowledged. */ + if (!b_data(buf)) { + qc_stream_buf_free(s, &stream_buf); + + /* Free stream instance if already released and no buffers left. */ + if (s->release && LIST_ISEMPTY(&s->buf_list)) { + qc_stream_desc_free(s, 0); + *stream = NULL; + } + } + + return diff; +} + +/* Free the stream descriptor <stream> content. This function should be used + * when all its data have been acknowledged or on full connection closing if the + * <closing> boolean is set to 1. It must only be called after the stream is released. + */ +void qc_stream_desc_free(struct qc_stream_desc *stream, int closing) +{ + struct qc_stream_buf *buf, *buf_back; + struct quic_conn *qc = stream->qc; + struct eb64_node *frm_node; + unsigned int free_count = 0; + + /* This function only deals with released streams. */ + BUG_ON(!stream->release); + + /* free remaining stream buffers */ + list_for_each_entry_safe(buf, buf_back, &stream->buf_list, list) { + if (!(b_data(&buf->buf)) || closing) { + b_free(&buf->buf); + LIST_DELETE(&buf->list); + pool_free(pool_head_quic_stream_buf, buf); + + ++free_count; + } + } + + if (free_count) { + offer_buffers(NULL, free_count); + + qc->stream_buf_count -= free_count; + if (qc->mux_state == QC_MUX_READY) { + /* notify MUX about available buffers. */ + if (qc->qcc->flags & QC_CF_CONN_FULL) { + qc->qcc->flags &= ~QC_CF_CONN_FULL; + tasklet_wakeup(qc->qcc->wait_event.tasklet); + } + } + } + + /* qc_stream_desc might be freed before having received all its ACKs. + * This is the case if some frames were retransmitted. + */ + frm_node = eb64_first(&stream->acked_frms); + while (frm_node) { + struct qf_stream *strm_frm; + struct quic_frame *frm; + + strm_frm = eb64_entry(frm_node, struct qf_stream, offset); + + frm_node = eb64_next(frm_node); + eb64_delete(&strm_frm->offset); + + frm = container_of(strm_frm, struct quic_frame, stream); + qc_release_frm(qc, frm); + } + + if (stream->by_id.key != (uint64_t)-1) + eb64_delete(&stream->by_id); + pool_free(pool_head_quic_stream_desc, stream); +} + +/* Return the current buffer of <stream>. May be NULL if not allocated. */ +struct buffer *qc_stream_buf_get(struct qc_stream_desc *stream) +{ + if (!stream->buf) + return NULL; + + return &stream->buf->buf; +} + +/* Returns the count of available buffers left for <qc>. */ +static int qc_stream_buf_avail(struct quic_conn *qc) +{ + BUG_ON(qc->stream_buf_count > global.tune.quic_streams_buf); + return global.tune.quic_streams_buf - qc->stream_buf_count; +} + +/* Allocate a new current buffer for <stream>. The buffer limit count for the + * connection is checked first. This function must not be called if the current + * buffer is not NULL prior to this call. The new buffer represents the stream + * payload at offset <offset>. + * + * Returns the buffer or NULL on error. The caller may check <avail> to + * determine whether the connection buffer limit was reached or a fatal error + * was encountered. + */ +struct buffer *qc_stream_buf_alloc(struct qc_stream_desc *stream, + uint64_t offset, int *avail) +{ + struct quic_conn *qc = stream->qc; + + /* the current buffer must be released before allocating a new one. 
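+ * I.e. a caller is expected to pair each qc_stream_buf_alloc() call with
+ * a previous qc_stream_buf_release(stream) on the same stream.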
*/ + BUG_ON(stream->buf); + + *avail = qc_stream_buf_avail(qc); + if (!*avail) + return NULL; + + stream->buf_offset = offset; + stream->buf = pool_alloc(pool_head_quic_stream_buf); + if (!stream->buf) + return NULL; + + ++qc->stream_buf_count; + + stream->buf->buf = BUF_NULL; + LIST_APPEND(&stream->buf_list, &stream->buf->list); + + return &stream->buf->buf; +} + +/* Release the current buffer of <stream>. It will be kept internally by + * the <stream>. The current buffer cannot be NULL. + */ +void qc_stream_buf_release(struct qc_stream_desc *stream) +{ + /* current buffer already released */ + BUG_ON(!stream->buf); + + stream->buf = NULL; + stream->buf_offset = 0; +} diff --git a/src/quic_tls.c b/src/quic_tls.c new file mode 100644 index 0000000..581d615 --- /dev/null +++ b/src/quic_tls.c @@ -0,0 +1,1095 @@ +#include <haproxy/quic_tls.h> + +#include <string.h> + +#include <openssl/evp.h> +#include <openssl/kdf.h> +#include <openssl/ssl.h> + +#include <haproxy/buf.h> +#include <haproxy/chunk.h> +#include <haproxy/pool.h> +#include <haproxy/quic_ack.h> +#include <haproxy/quic_conn.h> +#include <haproxy/quic_rx.h> +#include <haproxy/quic_stream.h> + + +DECLARE_POOL(pool_head_quic_enc_level, "quic_enc_level", sizeof(struct quic_enc_level)); +DECLARE_POOL(pool_head_quic_pktns, "quic_pktns", sizeof(struct quic_pktns)); +DECLARE_POOL(pool_head_quic_tls_ctx, "quic_tls_ctx", sizeof(struct quic_tls_ctx)); +DECLARE_POOL(pool_head_quic_tls_secret, "quic_tls_secret", QUIC_TLS_SECRET_LEN); +DECLARE_POOL(pool_head_quic_tls_iv, "quic_tls_iv", QUIC_TLS_IV_LEN); +DECLARE_POOL(pool_head_quic_tls_key, "quic_tls_key", QUIC_TLS_KEY_LEN); + +DECLARE_POOL(pool_head_quic_crypto_buf, "quic_crypto_buf", sizeof(struct quic_crypto_buf)); +DECLARE_STATIC_POOL(pool_head_quic_cstream, "quic_cstream", sizeof(struct quic_cstream)); + +/* Initial salt depending on QUIC version to derive client/server initial secrets. + * This one is for draft-29 QUIC version. + */ +const unsigned char initial_salt_draft_29[20] = { + 0xaf, 0xbf, 0xec, 0x28, 0x99, 0x93, 0xd2, 0x4c, + 0x9e, 0x97, 0x86, 0xf1, 0x9c, 0x61, 0x11, 0xe0, + 0x43, 0x90, 0xa8, 0x99 +}; + +const unsigned char initial_salt_v1[20] = { + 0x38, 0x76, 0x2c, 0xf7, 0xf5, 0x59, 0x34, 0xb3, + 0x4d, 0x17, 0x9a, 0xe6, 0xa4, 0xc8, 0x0c, 0xad, + 0xcc, 0xbb, 0x7f, 0x0a +}; + +const unsigned char initial_salt_v2[20] = { + 0x0d, 0xed, 0xe3, 0xde, 0xf7, 0x00, 0xa6, 0xdb, + 0x81, 0x93, 0x81, 0xbe, 0x6e, 0x26, 0x9d, 0xcb, + 0xf9, 0xbd, 0x2e, 0xd9 +}; + +/* Dump the RX/TX secrets of <secs> QUIC TLS secrets. 
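+ * The dump is appended to <buf> and looks like this (shortened,
+ * hypothetical values):
+ *
+ *     key=193f1f0c6ebd...
+ *     iv=5e5f2f3a4b5c...
+ *     hp=6d33b4e65022...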
*/ +void quic_tls_keys_hexdump(struct buffer *buf, + const struct quic_tls_secrets *secs) +{ + int i; + size_t aead_keylen; + size_t aead_ivlen; + size_t hp_len; + + if (!secs->aead || !secs->hp) + return; + + aead_keylen = (size_t)EVP_CIPHER_key_length(secs->aead); + aead_ivlen = (size_t)EVP_CIPHER_iv_length(secs->aead); + hp_len = (size_t)EVP_CIPHER_key_length(secs->hp); + + chunk_appendf(buf, "\n key="); + for (i = 0; i < aead_keylen; i++) + chunk_appendf(buf, "%02x", secs->key[i]); + chunk_appendf(buf, "\n iv="); + for (i = 0; i < aead_ivlen; i++) + chunk_appendf(buf, "%02x", secs->iv[i]); + chunk_appendf(buf, "\n hp="); + for (i = 0; i < hp_len; i++) + chunk_appendf(buf, "%02x", secs->hp_key[i]); +} + +/* Dump the RX/TX secrets of <kp> QUIC TLS key phase */ +void quic_tls_kp_keys_hexdump(struct buffer *buf, + const struct quic_tls_kp *kp) +{ + int i; + + chunk_appendf(buf, "\n secret="); + for (i = 0; i < kp->secretlen; i++) + chunk_appendf(buf, "%02x", kp->secret[i]); + chunk_appendf(buf, "\n key="); + for (i = 0; i < kp->keylen; i++) + chunk_appendf(buf, "%02x", kp->key[i]); + chunk_appendf(buf, "\n iv="); + for (i = 0; i < kp->ivlen; i++) + chunk_appendf(buf, "%02x", kp->iv[i]); +} + +/* Release the memory of <pktns> packet number space attached to <qc> QUIC connection. */ +void quic_pktns_release(struct quic_conn *qc, struct quic_pktns **pktns) +{ + if (!*pktns) + return; + + quic_pktns_tx_pkts_release(*pktns, qc); + qc_release_pktns_frms(qc, *pktns); + quic_free_arngs(qc, &(*pktns)->rx.arngs); + LIST_DEL_INIT(&(*pktns)->list); + pool_free(pool_head_quic_pktns, *pktns); + *pktns = NULL; +} + +/* Dump <secret> TLS secret. */ +void quic_tls_secret_hexdump(struct buffer *buf, + const unsigned char *secret, size_t secret_len) +{ + int i; + + chunk_appendf(buf, " secret="); + for (i = 0; i < secret_len; i++) + chunk_appendf(buf, "%02x", secret[i]); +} + +/* Release the memory allocated for <cs> CRYPTO stream */ +void quic_cstream_free(struct quic_cstream *cs) +{ + if (!cs) { + /* This is the case for the 0-RTT encryption level */ + return; + } + + quic_free_ncbuf(&cs->rx.ncbuf); + + qc_stream_desc_release(cs->desc, 0); + pool_free(pool_head_quic_cstream, cs); +} + +/* Allocate a new QUIC CRYPTO stream for <qc>. + * Return it if succeeded, NULL if not. + */ +struct quic_cstream *quic_cstream_new(struct quic_conn *qc) +{ + struct quic_cstream *cs, *ret_cs = NULL; + + TRACE_ENTER(QUIC_EV_CONN_LPKT, qc); + cs = pool_alloc(pool_head_quic_cstream); + if (!cs) { + TRACE_ERROR("crypto stream allocation failed", QUIC_EV_CONN_INIT, qc); + goto leave; + } + + cs->rx.offset = 0; + cs->rx.ncbuf = NCBUF_NULL; + + cs->tx.offset = 0; + cs->tx.sent_offset = 0; + cs->tx.buf = BUF_NULL; + cs->desc = qc_stream_desc_new((uint64_t)-1, -1, cs, qc); + if (!cs->desc) { + TRACE_ERROR("crypto stream allocation failed", QUIC_EV_CONN_INIT, qc); + goto err; + } + + ret_cs = cs; + leave: + TRACE_LEAVE(QUIC_EV_CONN_LPKT, qc); + return ret_cs; + + err: + pool_free(pool_head_quic_cstream, cs); + goto leave; +} + +/* Uninitialize <qel> QUIC encryption level. Never fails. 
*/ +void quic_conn_enc_level_uninit(struct quic_conn *qc, struct quic_enc_level *qel) +{ + int i; + struct qf_crypto *qf_crypto, *qfback; + + TRACE_ENTER(QUIC_EV_CONN_CLOSE, qc); + + for (i = 0; i < qel->tx.crypto.nb_buf; i++) { + if (qel->tx.crypto.bufs[i]) { + pool_free(pool_head_quic_crypto_buf, qel->tx.crypto.bufs[i]); + qel->tx.crypto.bufs[i] = NULL; + } + } + + list_for_each_entry_safe(qf_crypto, qfback, &qel->rx.crypto_frms, list) { + LIST_DELETE(&qf_crypto->list); + pool_free(pool_head_qf_crypto, qf_crypto); + } + + ha_free(&qel->tx.crypto.bufs); + quic_cstream_free(qel->cstream); + + TRACE_LEAVE(QUIC_EV_CONN_CLOSE, qc); +} + +/* Initialize the QUIC TLS encryption level <*el> with <level> as level for + * <qc> QUIC connection, allocating everything needed. + * + * Returns 1 if succeeded, 0 if not. On error the caller is responsible for + * using quic_conn_enc_level_uninit() to clean up partially allocated content. + */ +static int quic_conn_enc_level_init(struct quic_conn *qc, + struct quic_enc_level **el, + struct quic_pktns *pktns, + enum ssl_encryption_level_t level) +{ + int ret = 0; + struct quic_enc_level *qel; + + TRACE_ENTER(QUIC_EV_CONN_CLOSE, qc); + + qel = pool_alloc(pool_head_quic_enc_level); + if (!qel) + goto leave; + + LIST_INIT(&qel->retrans); + qel->retrans_frms = NULL; + qel->tx.crypto.bufs = NULL; + qel->tx.crypto.nb_buf = 0; + qel->cstream = NULL; + qel->pktns = pktns; + qel->level = level; + quic_tls_ctx_reset(&qel->tls_ctx); + + qel->rx.pkts = EB_ROOT; + LIST_INIT(&qel->rx.pqpkts); + LIST_INIT(&qel->rx.crypto_frms); + + /* Allocate only one buffer. */ + /* TODO: use a pool */ + qel->tx.crypto.bufs = malloc(sizeof *qel->tx.crypto.bufs); + if (!qel->tx.crypto.bufs) + goto err; + + qel->tx.crypto.bufs[0] = pool_alloc(pool_head_quic_crypto_buf); + if (!qel->tx.crypto.bufs[0]) + goto err; + + qel->tx.crypto.bufs[0]->sz = 0; + qel->tx.crypto.nb_buf = 1; + + qel->tx.crypto.sz = 0; + qel->tx.crypto.offset = 0; + /* No CRYPTO data for early data TLS encryption level */ + if (level == ssl_encryption_early_data) + qel->cstream = NULL; + else { + qel->cstream = quic_cstream_new(qc); + if (!qel->cstream) + goto err; + } + + LIST_APPEND(&qc->qel_list, &qel->list); + *el = qel; + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_CLOSE, qc); + return ret; + + err: + quic_conn_enc_level_uninit(qc, qel); + pool_free(pool_head_quic_enc_level, qel); + goto leave; +} + +/* Allocate a QUIC TLS encryption level with <level> as TLS stack encryption + * level, to be attached to <qc> QUIC connection. Also allocate the associated + * packet number space object with <pktns> as address to be attached to <qc> + * if not already allocated. + * Return 1 if succeeded, 0 if not. + */ +int qc_enc_level_alloc(struct quic_conn *qc, struct quic_pktns **pktns, + struct quic_enc_level **qel, enum ssl_encryption_level_t level) +{ + int ret = 0; + + BUG_ON(!qel || !pktns); + BUG_ON(*qel && !*pktns); + + if (!*pktns && !quic_pktns_init(qc, pktns)) + goto leave; + + if (!*qel && !quic_conn_enc_level_init(qc, qel, *pktns, level)) + goto leave; + + ret = 1; + leave: + return ret; +} + +/* Free the memory allocated to the encryption level attached to <qc> connection + * with <qel> as pointer address. Also remove it from the list of the encryption + * levels attached to this connection and reset its value to NULL. + * Never fails. 
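+ *
+ * A minimal usage sketch, assuming the <hel>/<hpktns> connection members
+ * used for the Handshake encryption level elsewhere in this patch:
+ *
+ *     if (!qc_enc_level_alloc(qc, &qc->hpktns, &qc->hel,
+ *                             ssl_encryption_handshake))
+ *         goto err;
+ *     ...
+ *     qc_enc_level_free(qc, &qc->hel);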
+ */ +void qc_enc_level_free(struct quic_conn *qc, struct quic_enc_level **qel) +{ + if (!*qel) + return; + + quic_tls_ctx_secs_free(&(*qel)->tls_ctx); + quic_conn_enc_level_uninit(qc, *qel); + LIST_DEL_INIT(&(*qel)->list); + pool_free(pool_head_quic_enc_level, *qel); + *qel = NULL; +} + +int quic_hkdf_extract(const EVP_MD *md, + unsigned char *buf, size_t buflen, + const unsigned char *key, size_t keylen, + const unsigned char *salt, size_t saltlen) +{ + EVP_PKEY_CTX *ctx; + + ctx = EVP_PKEY_CTX_new_id(EVP_PKEY_HKDF, NULL); + if (!ctx) + return 0; + + if (EVP_PKEY_derive_init(ctx) <= 0 || + EVP_PKEY_CTX_hkdf_mode(ctx, EVP_PKEY_HKDEF_MODE_EXTRACT_ONLY) <= 0 || + EVP_PKEY_CTX_set_hkdf_md(ctx, md) <= 0 || + EVP_PKEY_CTX_set1_hkdf_salt(ctx, salt, saltlen) <= 0 || + EVP_PKEY_CTX_set1_hkdf_key(ctx, key, keylen) <= 0 || + EVP_PKEY_derive(ctx, buf, &buflen) <= 0) + goto err; + + EVP_PKEY_CTX_free(ctx); + return 1; + + err: + EVP_PKEY_CTX_free(ctx); + return 0; +} + +int quic_hkdf_expand(const EVP_MD *md, + unsigned char *buf, size_t buflen, + const unsigned char *key, size_t keylen, + const unsigned char *label, size_t labellen) +{ + EVP_PKEY_CTX *ctx; + + ctx = EVP_PKEY_CTX_new_id(EVP_PKEY_HKDF, NULL); + if (!ctx) + return 0; + + if (EVP_PKEY_derive_init(ctx) <= 0 || + EVP_PKEY_CTX_hkdf_mode(ctx, EVP_PKEY_HKDEF_MODE_EXPAND_ONLY) <= 0 || + EVP_PKEY_CTX_set_hkdf_md(ctx, md) <= 0 || + EVP_PKEY_CTX_set1_hkdf_key(ctx, key, keylen) <= 0 || + EVP_PKEY_CTX_add1_hkdf_info(ctx, label, labellen) <= 0 || + EVP_PKEY_derive(ctx, buf, &buflen) <= 0) + goto err; + + EVP_PKEY_CTX_free(ctx); + return 1; + + err: + EVP_PKEY_CTX_free(ctx); + return 0; +} + +/* Extracts a pseudo-random secret from <key>, which may not be pseudo-random, + * and expands it into a new pseudo-random key stored into + * <buf> with <buflen> as key length according to the HKDF specification + * (https://datatracker.ietf.org/doc/html/rfc5869). + * According to this specification, it is highly recommended to use + * a salt, even though it is optional (NULL value). + * Return 1 if succeeded, 0 if not. + */ +int quic_hkdf_extract_and_expand(const EVP_MD *md, + unsigned char *buf, size_t buflen, + const unsigned char *key, size_t keylen, + const unsigned char *salt, size_t saltlen, + const unsigned char *label, size_t labellen) +{ + EVP_PKEY_CTX *ctx; + + ctx = EVP_PKEY_CTX_new_id(EVP_PKEY_HKDF, NULL); + if (!ctx) + return 0; + + if (EVP_PKEY_derive_init(ctx) <= 0 || + EVP_PKEY_CTX_hkdf_mode(ctx, EVP_PKEY_HKDEF_MODE_EXTRACT_AND_EXPAND) <= 0 || + EVP_PKEY_CTX_set_hkdf_md(ctx, md) <= 0 || + EVP_PKEY_CTX_set1_hkdf_salt(ctx, salt, saltlen) <= 0 || + EVP_PKEY_CTX_set1_hkdf_key(ctx, key, keylen) <= 0 || + EVP_PKEY_CTX_add1_hkdf_info(ctx, label, labellen) <= 0 || + EVP_PKEY_derive(ctx, buf, &buflen) <= 0) + goto err; + + EVP_PKEY_CTX_free(ctx); + return 1; + + err: + EVP_PKEY_CTX_free(ctx); + return 0; +} + +/* https://quicwg.org/base-drafts/draft-ietf-quic-tls.html#protection-keys + * refers to: + * + * https://tools.ietf.org/html/rfc8446#section-7.1: + * 7.1. 
Key Schedule + * + * The key derivation process makes use of the HKDF-Extract and + * HKDF-Expand functions as defined for HKDF [RFC5869], as well as the + * functions defined below: + * + * HKDF-Expand-Label(Secret, Label, Context, Length) = + * HKDF-Expand(Secret, HkdfLabel, Length) + * + * Where HkdfLabel is specified as: + * + * struct { + * uint16 length = Length; + * opaque label<7..255> = "tls13 " + Label; + * opaque context<0..255> = Context; + * } HkdfLabel; + * + * Derive-Secret(Secret, Label, Messages) = + * HKDF-Expand-Label(Secret, Label, + * Transcript-Hash(Messages), Hash.length) + * + */ +int quic_hkdf_expand_label(const EVP_MD *md, + unsigned char *buf, size_t buflen, + const unsigned char *key, size_t keylen, + const unsigned char *label, size_t labellen) +{ + unsigned char hdkf_label[256], *pos; + const unsigned char hdkf_label_label[] = "tls13 "; + size_t hdkf_label_label_sz = sizeof hdkf_label_label - 1; + + pos = hdkf_label; + *pos++ = buflen >> 8; + *pos++ = buflen & 0xff; + *pos++ = hdkf_label_label_sz + labellen; + memcpy(pos, hdkf_label_label, hdkf_label_label_sz); + pos += hdkf_label_label_sz; + memcpy(pos, label, labellen); + pos += labellen; + *pos++ = '\0'; + + return quic_hkdf_expand(md, buf, buflen, + key, keylen, hdkf_label, pos - hdkf_label); +} + +/* + * This function derives, from <secret>, the key material of a TLS cryptographic context. + * ->key is the TLS key to be derived to encrypt/decrypt data at TLS level. + * ->iv is the initialization vector to be used with ->key. + * ->hp_key is the key to be derived for header protection. + * Obviously these keys have the same size since they are derived within the same TLS cryptographic context. + */ +int quic_tls_derive_keys(const EVP_CIPHER *aead, const EVP_CIPHER *hp, + const EVP_MD *md, const struct quic_version *qv, + unsigned char *key, size_t keylen, + unsigned char *iv, size_t ivlen, + unsigned char *hp_key, size_t hp_keylen, + const unsigned char *secret, size_t secretlen) +{ + size_t aead_keylen = (size_t)EVP_CIPHER_key_length(aead); + size_t aead_ivlen = (size_t)EVP_CIPHER_iv_length(aead); + size_t hp_len = hp ? (size_t)EVP_CIPHER_key_length(hp) : 0; + + if (aead_keylen > keylen || aead_ivlen > ivlen || hp_len > hp_keylen) + return 0; + + if (!quic_hkdf_expand_label(md, key, aead_keylen, secret, secretlen, + qv->key_label, qv->key_label_len) || + !quic_hkdf_expand_label(md, iv, aead_ivlen, secret, secretlen, + qv->iv_label, qv->iv_label_len) || + (hp_key && !quic_hkdf_expand_label(md, hp_key, hp_len, secret, secretlen, + qv->hp_label, qv->hp_label_len))) + return 0; + + return 1; +} + +/* + * Derive the initial secret from <secret> and the QUIC version dependent salt. + * Return 1 if succeeded, 0 if not. + */ +int quic_derive_initial_secret(const EVP_MD *md, + const unsigned char *initial_salt, size_t initial_salt_sz, + unsigned char *initial_secret, size_t initial_secret_sz, + const unsigned char *secret, size_t secret_sz) +{ + if (!quic_hkdf_extract(md, initial_secret, initial_secret_sz, secret, secret_sz, + initial_salt, initial_salt_sz)) + return 0; + + return 1; +} + +/* + * Derive the client and server initial secrets from the initial secret. + * Return 1 if succeeded, 0 if not. 
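 *
 * As an illustration of the HkdfLabel layout built by
 * quic_hkdf_expand_label() for the "client in" label used here,
 * a 32-byte expansion encodes the following 19 bytes:
 *
 *     00 20                 length = 32
 *     0f                    label length = 6 + 9 = 15
 *     "tls13 client in"     "tls13 " prefix followed by the label
 *     00                    empty context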
+ */ +int quic_tls_derive_initial_secrets(const EVP_MD *md, + unsigned char *rx, size_t rx_sz, + unsigned char *tx, size_t tx_sz, + const unsigned char *secret, size_t secret_sz, + int server) +{ + const unsigned char client_label[] = "client in"; + const unsigned char server_label[] = "server in"; + const unsigned char *tx_label, *rx_label; + size_t rx_label_sz, tx_label_sz; + + if (server) { + rx_label = client_label; + rx_label_sz = sizeof client_label; + tx_label = server_label; + tx_label_sz = sizeof server_label; + } + else { + rx_label = server_label; + rx_label_sz = sizeof server_label; + tx_label = client_label; + tx_label_sz = sizeof client_label; + } + + if (!quic_hkdf_expand_label(md, rx, rx_sz, secret, secret_sz, + rx_label, rx_label_sz - 1) || + !quic_hkdf_expand_label(md, tx, tx_sz, secret, secret_sz, + tx_label, tx_label_sz - 1)) + return 0; + + return 1; +} + +/* Update <sec> secret key into <new_sec> according to RFC 9001 6.1. + * Always succeeds. + */ +int quic_tls_sec_update(const EVP_MD *md, const struct quic_version *qv, + unsigned char *new_sec, size_t new_seclen, + const unsigned char *sec, size_t seclen) +{ + return quic_hkdf_expand_label(md, new_sec, new_seclen, sec, seclen, + qv->ku_label, qv->ku_label_len); +} + +/* + * Build an IV into <iv> buffer with <ivlen> as size from <aead_iv> with + * <aead_ivlen> as size depending on <pn> packet number. + * This is the function which must be called to build an AEAD IV for the AEAD cryptographic algorithm + * used to encrypt/decrypt the QUIC packet payloads depending on the packet number <pn>. + */ +void quic_aead_iv_build(unsigned char *iv, size_t ivlen, + unsigned char *aead_iv, size_t aead_ivlen, uint64_t pn) +{ + int i; + unsigned int shift; + unsigned char *pos = iv; + + /* Input buffers must have the same size. */ + BUG_ON(ivlen != aead_ivlen); + + for (i = 0; i < ivlen - sizeof pn; i++) + *pos++ = *aead_iv++; + + /* Only the remaining (sizeof pn) bytes are XOR'ed. */ + shift = 56; + for (i = aead_ivlen - sizeof pn; i < aead_ivlen ; i++, shift -= 8) + *pos++ = *aead_iv++ ^ (pn >> shift); +} + +/* Initialize the cipher context for RX part of <tls_ctx> QUIC TLS context. + * Return 1 if succeeded, 0 if not. + */ +int quic_tls_rx_ctx_init(EVP_CIPHER_CTX **rx_ctx, + const EVP_CIPHER *aead, unsigned char *key) +{ + EVP_CIPHER_CTX *ctx; + int aead_nid = EVP_CIPHER_nid(aead); + + ctx = EVP_CIPHER_CTX_new(); + if (!ctx) + return 0; + + if (!EVP_DecryptInit_ex(ctx, aead, NULL, NULL, NULL) || + !EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_SET_IVLEN, QUIC_TLS_IV_LEN, NULL) || + (aead_nid == NID_aes_128_ccm && + !EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_SET_TAG, QUIC_TLS_TAG_LEN, NULL)) || + !EVP_DecryptInit_ex(ctx, NULL, NULL, key, NULL)) + goto err; + + *rx_ctx = ctx; + + return 1; + + err: + EVP_CIPHER_CTX_free(ctx); + return 0; +} + +/* Initialize <*aes_ctx> AES cipher context with <key> as key for encryption */ +int quic_tls_enc_aes_ctx_init(EVP_CIPHER_CTX **aes_ctx, + const EVP_CIPHER *aes, unsigned char *key) +{ + EVP_CIPHER_CTX *ctx; + + ctx = EVP_CIPHER_CTX_new(); + if (!ctx) + return 0; + + if (!EVP_EncryptInit_ex(ctx, aes, NULL, key, NULL)) + goto err; + + *aes_ctx = ctx; + return 1; + + err: + EVP_CIPHER_CTX_free(ctx); + return 0; +} + +/* Encrypt <inlen> bytes from <in> buffer into <out> with <ctx> as AES + * cipher context. This is the responsibility of the caller to check there + * is at least <inlen> bytes of available space in <out> buffer. + * Return 1 if succeeded, 0 if not. 
+ */ +int quic_tls_aes_encrypt(unsigned char *out, + const unsigned char *in, size_t inlen, + EVP_CIPHER_CTX *ctx) +{ + int ret = 0; + + if (!EVP_EncryptInit_ex(ctx, NULL, NULL, NULL, in) || + !EVP_EncryptUpdate(ctx, out, &ret, out, inlen) || + !EVP_EncryptFinal_ex(ctx, out, &ret)) + return 0; + + return 1; +} + +/* Initialize <*aes_ctx> AES cipher context with <key> as key for decryption */ +int quic_tls_dec_aes_ctx_init(EVP_CIPHER_CTX **aes_ctx, + const EVP_CIPHER *aes, unsigned char *key) +{ + EVP_CIPHER_CTX *ctx; + + ctx = EVP_CIPHER_CTX_new(); + if (!ctx) + return 0; + + if (!EVP_DecryptInit_ex(ctx, aes, NULL, key, NULL)) + goto err; + + *aes_ctx = ctx; + return 1; + + err: + EVP_CIPHER_CTX_free(ctx); + return 0; +} + +/* Decrypt <in> data into <out> with <ctx> as AES cipher context. + * This is the responsibility of the caller to check there is at least + * <outlen> bytes into <in> buffer. + * Return 1 if succeeded, 0 if not. + */ +int quic_tls_aes_decrypt(unsigned char *out, + const unsigned char *in, size_t inlen, + EVP_CIPHER_CTX *ctx) +{ + int ret = 0; + + if (!EVP_DecryptInit_ex(ctx, NULL, NULL, NULL, in) || + !EVP_DecryptUpdate(ctx, out, &ret, out, inlen) || + !EVP_DecryptFinal_ex(ctx, out, &ret)) + return 0; + + return 1; +} + +/* Initialize the cipher context for TX part of <tls_ctx> QUIC TLS context. + * Return 1 if succeeded, 0 if not. + */ +int quic_tls_tx_ctx_init(EVP_CIPHER_CTX **tx_ctx, + const EVP_CIPHER *aead, unsigned char *key) +{ + EVP_CIPHER_CTX *ctx; + int aead_nid = EVP_CIPHER_nid(aead); + + ctx = EVP_CIPHER_CTX_new(); + if (!ctx) + return 0; + + if (!EVP_EncryptInit_ex(ctx, aead, NULL, NULL, NULL) || + !EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_SET_IVLEN, QUIC_TLS_IV_LEN, NULL) || + (aead_nid == NID_aes_128_ccm && + !EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_SET_TAG, QUIC_TLS_TAG_LEN, NULL)) || + !EVP_EncryptInit_ex(ctx, NULL, NULL, key, NULL)) + goto err; + + *tx_ctx = ctx; + + return 1; + + err: + EVP_CIPHER_CTX_free(ctx); + return 0; +} + +/* + * https://quicwg.org/base-drafts/draft-ietf-quic-tls.html#aead + * + * 5.3. AEAD Usage + * + * Packets are protected prior to applying header protection (Section 5.4). + * The unprotected packet header is part of the associated data (A). When removing + * packet protection, an endpoint first removes the header protection. + * (...) + * These ciphersuites have a 16-byte authentication tag and produce an output 16 + * bytes larger than their input. + * The key and IV for the packet are computed as described in Section 5.1. The nonce, + * N, is formed by combining the packet protection IV with the packet number. The 62 + * bits of the reconstructed QUIC packet number in network byte order are left-padded + * with zeros to the size of the IV. The exclusive OR of the padded packet number and + * the IV forms the AEAD nonce. + * + * The associated data, A, for the AEAD is the contents of the QUIC header, starting + * from the flags byte in either the short or long header, up to and including the + * unprotected packet number. + * + * The input plaintext, P, for the AEAD is the payload of the QUIC packet, as described + * in [QUIC-TRANSPORT]. + * + * The output ciphertext, C, of the AEAD is transmitted in place of P. + * + * Some AEAD functions have limits for how many packets can be encrypted under the same + * key and IV (see for example [AEBounds]). This might be lower than the packet number limit. + * An endpoint MUST initiate a key update (Section 6) prior to exceeding any limit set for + * the AEAD that is in use. 
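+ *
+ * Nonce construction example with a hypothetical 12-byte IV and
+ * packet number 0xc0ffee (see quic_aead_iv_build() above):
+ *
+ *     IV : 6b 26 11 4b 9c 0b 19 2a 1e 2f 41 30
+ *     PN : 00 00 00 00 00 00 00 00 00 c0 ff ee   (left-padded)
+ *     N  : 6b 26 11 4b 9c 0b 19 2a 1e ef be de   (IV XOR padded PN)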
+ */ + +/* Encrypt in place <buf> plaintext with <len> as length with QUIC_TLS_TAG_LEN + * trailing bytes included for the tag. + * Note that for CCM mode, we must set the plaintext length if AAD data + * are provided from <aad> buffer with <aad_len> as length. This is always the + * case here. So the caller of this function must provide <aad>. + * + * https://wiki.openssl.org/index.php/EVP_Authenticated_Encryption_and_Decryption + */ +int quic_tls_encrypt(unsigned char *buf, size_t len, + const unsigned char *aad, size_t aad_len, + EVP_CIPHER_CTX *ctx, const EVP_CIPHER *aead, + const unsigned char *iv) +{ + int outlen; + int aead_nid = EVP_CIPHER_nid(aead); + + if (!EVP_EncryptInit_ex(ctx, NULL, NULL, NULL, iv) || + (aead_nid == NID_aes_128_ccm && + !EVP_EncryptUpdate(ctx, NULL, &outlen, NULL, len)) || + !EVP_EncryptUpdate(ctx, NULL, &outlen, aad, aad_len) || + !EVP_EncryptUpdate(ctx, buf, &outlen, buf, len) || + !EVP_EncryptFinal_ex(ctx, buf + outlen, &outlen) || + !EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_GET_TAG, QUIC_TLS_TAG_LEN, buf + len)) + return 0; + + return 1; +} + +/* Decrypt in place <buf> ciphertext with <len> as length with QUIC_TLS_TAG_LEN + * trailing bytes included for the tag. + * Note that for CCM mode, we must set the ciphertext length if AAD data + * are provided from <aad> buffer with <aad_len> as length. This is always the + * case here. So the caller of this function must provide <aad>. Also note that + * there is no need to call EVP_DecryptFinal_ex() for CCM mode. + * + * https://wiki.openssl.org/index.php/EVP_Authenticated_Encryption_and_Decryption + */ +int quic_tls_decrypt(unsigned char *buf, size_t len, + unsigned char *aad, size_t aad_len, + EVP_CIPHER_CTX *ctx, const EVP_CIPHER *aead, + const unsigned char *key, const unsigned char *iv) +{ + int outlen; + int aead_nid = EVP_CIPHER_nid(aead); + + if (!EVP_DecryptInit_ex(ctx, NULL, NULL, NULL, iv) || + !EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_SET_TAG, QUIC_TLS_TAG_LEN, + buf + len - QUIC_TLS_TAG_LEN) || + (aead_nid == NID_aes_128_ccm && + !EVP_DecryptUpdate(ctx, NULL, &outlen, NULL, len - QUIC_TLS_TAG_LEN)) || + !EVP_DecryptUpdate(ctx, NULL, &outlen, aad, aad_len) || + !EVP_DecryptUpdate(ctx, buf, &outlen, buf, len - QUIC_TLS_TAG_LEN) || + (aead_nid != NID_aes_128_ccm && + !EVP_DecryptFinal_ex(ctx, buf + outlen, &outlen))) + return 0; + + return 1; +} + +/* Similar to quic_tls_decrypt(), except that this function does not decrypt + * the ciphertext in place: the <len> bytes of ciphertext from the <in> input + * buffer are decrypted into the distinct <out> output buffer. It is the + * responsibility of the caller to check that the output buffer is at least + * as large as the input buffer. + * Note that for CCM mode, we must set the ciphertext length if AAD data + * are provided from <aad> buffer with <aad_len> as length. This is always the + * case here. So the caller of this function must provide <aad>. Also note that + * there is no need to call EVP_DecryptFinal_ex() for CCM mode. + * + * https://wiki.openssl.org/index.php/EVP_Authenticated_Encryption_and_Decryption + * + * Return 1 if succeeded, 0 if not. 
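+ *
+ * Length bookkeeping example: for <len> == 1200 bytes of protected
+ * payload, the 16 trailing bytes (QUIC_TLS_TAG_LEN) are the tag, so
+ * 1184 bytes of ciphertext are decrypted into <out> and the tag is
+ * read from in + 1184 for verification.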
+ */ +int quic_tls_decrypt2(unsigned char *out, + unsigned char *in, size_t len, + unsigned char *aad, size_t aad_len, + EVP_CIPHER_CTX *ctx, const EVP_CIPHER *aead, + const unsigned char *key, const unsigned char *iv) +{ + int outlen; + int aead_nid = EVP_CIPHER_nid(aead); + + len -= QUIC_TLS_TAG_LEN; + if (!EVP_DecryptInit_ex(ctx, NULL, NULL, NULL, iv) || + !EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_SET_TAG, QUIC_TLS_TAG_LEN, in + len) || + (aead_nid == NID_aes_128_ccm && + !EVP_DecryptUpdate(ctx, NULL, &outlen, NULL, len)) || + !EVP_DecryptUpdate(ctx, NULL, &outlen, aad, aad_len) || + !EVP_DecryptUpdate(ctx, out, &outlen, in, len) || + (aead_nid != NID_aes_128_ccm && + !EVP_DecryptFinal_ex(ctx, out + outlen, &outlen))) + return 0; + + return 1; +} + +/* Derive <key> and <iv>, the key and IV to be used to encrypt a retry token, + * from <secret> which is not pseudo-random. + * Return 1 if succeeded, 0 if not. + */ +int quic_tls_derive_retry_token_secret(const EVP_MD *md, + unsigned char *key, size_t keylen, + unsigned char *iv, size_t ivlen, + const unsigned char *salt, size_t saltlen, + const unsigned char *secret, size_t secretlen) +{ + unsigned char tmpkey[QUIC_TLS_KEY_LEN]; + const unsigned char key_label[] = "retry token key"; + const unsigned char iv_label[] = "retry token iv"; + + if (!quic_hkdf_extract(md, tmpkey, sizeof tmpkey, + secret, secretlen, salt, saltlen) || + !quic_hkdf_expand(md, key, keylen, tmpkey, sizeof tmpkey, + key_label, sizeof key_label - 1) || + !quic_hkdf_expand(md, iv, ivlen, tmpkey, sizeof tmpkey, + iv_label, sizeof iv_label - 1)) + return 0; + + return 1; +} + +/* Generate the AEAD tag for the Retry packet <pkt> of <pkt_len> bytes and + * write it to <tag>. The tag is written just after the <pkt> area. It should + * be at least 16 bytes long. <odcid> is the CID of the Initial packet + * received which triggers the Retry. + * + * Returns non-zero on success else zero. + */ +int quic_tls_generate_retry_integrity_tag(unsigned char *odcid, unsigned char odcid_len, + unsigned char *pkt, size_t pkt_len, + const struct quic_version *qv) +{ + const EVP_CIPHER *evp = EVP_aes_128_gcm(); + EVP_CIPHER_CTX *ctx; + + /* encryption buffer - not used since only AEAD tag generation is performed */ + unsigned char *out = NULL; + /* address to store the AEAD tag */ + unsigned char *tag = pkt + pkt_len; + int outlen, ret = 0; + + ctx = EVP_CIPHER_CTX_new(); + if (!ctx) + return 0; + + /* rfc9001 5.8. Retry Packet Integrity + * + * AEAD is performed over a pseudo-Retry packet used as AAD. It contains + * the ODCID len + data and the Retry packet itself. + */ + if (!EVP_EncryptInit_ex(ctx, evp, NULL, qv->retry_tag_key, qv->retry_tag_nonce) || + /* specify pseudo-Retry as AAD */ + !EVP_EncryptUpdate(ctx, NULL, &outlen, &odcid_len, sizeof(odcid_len)) || + !EVP_EncryptUpdate(ctx, NULL, &outlen, odcid, odcid_len) || + !EVP_EncryptUpdate(ctx, NULL, &outlen, pkt, pkt_len) || + /* finalize */ + !EVP_EncryptFinal_ex(ctx, out, &outlen) || + /* store the tag */ + !EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_GET_TAG, QUIC_TLS_TAG_LEN, tag)) { + goto out; + } + ret = 1; + + out: + EVP_CIPHER_CTX_free(ctx); + return ret; +} + +/* Derive the new keys and IVs required for the Key Update feature for <qc> QUIC + * connection. + * Return 1 if succeeded, 0 if not. 
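+ *
+ * As per RFC 9001 6.1, each updated secret is expanded from the current
+ * one (with the "quic ku" label for QUIC v1, see <qv->ku_label>):
+ *
+ *     secret_<n+1> = HKDF-Expand-Label(secret_<n>, "quic ku", "",
+ *                                      Hash.length)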
+ */ +int quic_tls_key_update(struct quic_conn *qc) +{ + struct quic_tls_ctx *tls_ctx = &qc->ael->tls_ctx; + struct quic_tls_secrets *rx = &tls_ctx->rx; + struct quic_tls_secrets *tx = &tls_ctx->tx; + /* Used only for the traces */ + struct quic_kp_trace kp_trace = { + .rx_sec = rx->secret, + .rx_seclen = rx->secretlen, + .tx_sec = tx->secret, + .tx_seclen = tx->secretlen, + }; + /* The next key phase secrets to be derived */ + struct quic_tls_kp *nxt_rx = &qc->ku.nxt_rx; + struct quic_tls_kp *nxt_tx = &qc->ku.nxt_tx; + const struct quic_version *ver = + qc->negotiated_version ? qc->negotiated_version : qc->original_version; + int ret = 0; + + TRACE_ENTER(QUIC_EV_CONN_KP, qc); + + nxt_rx = &qc->ku.nxt_rx; + nxt_tx = &qc->ku.nxt_tx; + + TRACE_PRINTF(TRACE_LEVEL_DEVELOPER, QUIC_EV_CONN_SPPKTS, qc, 0, 0, 0, + "nxt_rx->secretlen=%llu rx->secretlen=%llu", + (ull)nxt_rx->secretlen, (ull)rx->secretlen); + /* Prepare new RX secrets */ + if (!quic_tls_sec_update(rx->md, ver, nxt_rx->secret, nxt_rx->secretlen, + rx->secret, rx->secretlen)) { + TRACE_ERROR("New RX secret update failed", QUIC_EV_CONN_KP, qc); + goto leave; + } + + if (!quic_tls_derive_keys(rx->aead, NULL, rx->md, ver, + nxt_rx->key, nxt_rx->keylen, + nxt_rx->iv, nxt_rx->ivlen, NULL, 0, + nxt_rx->secret, nxt_rx->secretlen)) { + TRACE_ERROR("New RX key derivation failed", QUIC_EV_CONN_KP, qc); + goto leave; + } + + kp_trace.rx = nxt_rx; + /* Prepare new TX secrets */ + if (!quic_tls_sec_update(tx->md, ver, nxt_tx->secret, nxt_tx->secretlen, + tx->secret, tx->secretlen)) { + TRACE_ERROR("New TX secret update failed", QUIC_EV_CONN_KP, qc); + goto leave; + } + + if (!quic_tls_derive_keys(tx->aead, NULL, tx->md, ver, + nxt_tx->key, nxt_tx->keylen, + nxt_tx->iv, nxt_tx->ivlen, NULL, 0, + nxt_tx->secret, nxt_tx->secretlen)) { + TRACE_ERROR("New TX key derivation failed", QUIC_EV_CONN_KP, qc); + goto leave; + } + + kp_trace.tx = nxt_tx; + if (nxt_rx->ctx) { + EVP_CIPHER_CTX_free(nxt_rx->ctx); + nxt_rx->ctx = NULL; + } + + if (!quic_tls_rx_ctx_init(&nxt_rx->ctx, tls_ctx->rx.aead, nxt_rx->key)) { + TRACE_ERROR("could not initialize RX TLS cipher context", QUIC_EV_CONN_KP, qc); + goto leave; + } + + if (nxt_tx->ctx) { + EVP_CIPHER_CTX_free(nxt_tx->ctx); + nxt_tx->ctx = NULL; + } + + if (!quic_tls_tx_ctx_init(&nxt_tx->ctx, tls_ctx->tx.aead, nxt_tx->key)) { + TRACE_ERROR("could not initialize TX TLS cipher context", QUIC_EV_CONN_KP, qc); + goto leave; + } + + ret = 1; + leave: + TRACE_PROTO("key update", QUIC_EV_CONN_KP, qc, &kp_trace); + TRACE_LEAVE(QUIC_EV_CONN_KP, qc); + return ret; +} + +/* Rotate the Key Update information for <qc> QUIC connection. + * Must be used after having updated them. + * Always succeeds. 
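+ *
+ * RX side (sketch of the moves below): prv <- cur <- nxt, the old prv
+ * buffers being recycled as the new nxt storage; the TX side simply swaps
+ * cur and nxt.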
+ */ +void quic_tls_rotate_keys(struct quic_conn *qc) +{ + struct quic_tls_ctx *tls_ctx = &qc->ael->tls_ctx; + unsigned char *curr_secret, *curr_iv, *curr_key; + EVP_CIPHER_CTX *curr_ctx; + + TRACE_ENTER(QUIC_EV_CONN_RXPKT, qc); + + /* Rotate the RX secrets */ + curr_ctx = tls_ctx->rx.ctx; + curr_secret = tls_ctx->rx.secret; + curr_iv = tls_ctx->rx.iv; + curr_key = tls_ctx->rx.key; + + tls_ctx->rx.ctx = qc->ku.nxt_rx.ctx; + tls_ctx->rx.secret = qc->ku.nxt_rx.secret; + tls_ctx->rx.iv = qc->ku.nxt_rx.iv; + tls_ctx->rx.key = qc->ku.nxt_rx.key; + + qc->ku.nxt_rx.ctx = qc->ku.prv_rx.ctx; + qc->ku.nxt_rx.secret = qc->ku.prv_rx.secret; + qc->ku.nxt_rx.iv = qc->ku.prv_rx.iv; + qc->ku.nxt_rx.key = qc->ku.prv_rx.key; + + qc->ku.prv_rx.ctx = curr_ctx; + qc->ku.prv_rx.secret = curr_secret; + qc->ku.prv_rx.iv = curr_iv; + qc->ku.prv_rx.key = curr_key; + qc->ku.prv_rx.pn = tls_ctx->rx.pn; + + /* Update the TX secrets */ + curr_ctx = tls_ctx->tx.ctx; + curr_secret = tls_ctx->tx.secret; + curr_iv = tls_ctx->tx.iv; + curr_key = tls_ctx->tx.key; + + tls_ctx->tx.ctx = qc->ku.nxt_tx.ctx; + tls_ctx->tx.secret = qc->ku.nxt_tx.secret; + tls_ctx->tx.iv = qc->ku.nxt_tx.iv; + tls_ctx->tx.key = qc->ku.nxt_tx.key; + + qc->ku.nxt_tx.ctx = curr_ctx; + qc->ku.nxt_tx.secret = curr_secret; + qc->ku.nxt_tx.iv = curr_iv; + qc->ku.nxt_tx.key = curr_key; + + TRACE_LEAVE(QUIC_EV_CONN_RXPKT, qc); +} + +/* Release the memory allocated for the QUIC TLS context with <ctx> as address. */ +void quic_tls_ctx_free(struct quic_tls_ctx **ctx) +{ + if (!*ctx) + return; + + quic_tls_ctx_secs_free(*ctx); + pool_free(pool_head_quic_tls_ctx, *ctx); + *ctx = NULL; +} + +/* Finalize <qc> QUIC connection: + * - allocated and initialize the Initial QUIC TLS context for negotiated + * version if needed, + * - derive the secrets for this context, + * - set them into the TLS stack, + * + * Return 1 if succeeded, 0 if not. + */ +int quic_tls_finalize(struct quic_conn *qc, int server) +{ + int ret = 0; + + TRACE_ENTER(QUIC_EV_CONN_NEW, qc); + + if (!qc->negotiated_version) + goto done; + + qc->nictx = pool_alloc(pool_head_quic_tls_ctx); + if (!qc->nictx) + goto err; + + quic_tls_ctx_reset(qc->nictx); + if (!qc_new_isecs(qc, qc->nictx, qc->negotiated_version, + qc->odcid.data, qc->odcid.len, server)) + goto err; + + done: + ret = 1; + out: + TRACE_LEAVE(QUIC_EV_CONN_NEW, qc); + return ret; + + err: + quic_tls_ctx_free(&qc->nictx); + goto out; +} diff --git a/src/quic_tp.c b/src/quic_tp.c new file mode 100644 index 0000000..caf48ce --- /dev/null +++ b/src/quic_tp.c @@ -0,0 +1,714 @@ +#include <arpa/inet.h> +#include <string.h> + +#include <haproxy/global.h> +#include <haproxy/ncbuf-t.h> +#include <haproxy/net_helper.h> +#include <haproxy/quic_conn.h> +#include <haproxy/quic_enc.h> +#include <haproxy/quic_tp.h> +#include <haproxy/quic_trace.h> +#include <haproxy/trace.h> + +#define QUIC_MAX_UDP_PAYLOAD_SIZE 2048 + +/* This is the values of some QUIC transport parameters when absent. + * Should be used to initialize any transport parameters (local or remote) + * before updating them with customized values. + */ +struct quic_transport_params quic_dflt_transport_params = { + .max_udp_payload_size = QUIC_TP_DFLT_MAX_UDP_PAYLOAD_SIZE, + .ack_delay_exponent = QUIC_TP_DFLT_ACK_DELAY_COMPONENT, + .max_ack_delay = QUIC_TP_DFLT_MAX_ACK_DELAY, + .active_connection_id_limit = QUIC_TP_DFLT_ACTIVE_CONNECTION_ID_LIMIT, +}; + +/* Initialize <dst> transport parameters with default values (when absent) + * from <quic_dflt_transport_params>. + * Never fails. 
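+ *
+ * For reference, the RFC 9000 (18.2) defaults these macros are expected to
+ * carry: max_udp_payload_size = 65527, ack_delay_exponent = 3,
+ * max_ack_delay = 25 ms, active_connection_id_limit = 2.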
+ */
+static void quic_dflt_transport_params_cpy(struct quic_transport_params *dst)
+{
+	dst->max_udp_payload_size = quic_dflt_transport_params.max_udp_payload_size;
+	dst->ack_delay_exponent = quic_dflt_transport_params.ack_delay_exponent;
+	dst->max_ack_delay = quic_dflt_transport_params.max_ack_delay;
+	dst->active_connection_id_limit = quic_dflt_transport_params.active_connection_id_limit;
+}
+
+/* Initialize <p> transport parameters. <server> is a boolean, set if TPs are
+ * used by a server (haproxy frontend) else this is for a client (haproxy
+ * backend).
+ *
+ * This must only be used for haproxy local parameters. To initialize peer
+ * parameters, see quic_dflt_transport_params_cpy().
+ *
+ * Never fails.
+ */
+void quic_transport_params_init(struct quic_transport_params *p, int server)
+{
+	const uint64_t ncb_size = global.tune.bufsize - NCB_RESERVED_SZ;
+	const int max_streams_bidi = global.tune.quic_frontend_max_streams_bidi;
+	const int max_streams_uni = 3;
+
+	/* Set RFC default values for unspecified parameters. */
+	quic_dflt_transport_params_cpy(p);
+
+	/* Set the max_udp_payload_size value. If left unset, it would default
+	 * to QUIC_TP_DFLT_MAX_UDP_PAYLOAD_SIZE.
+	 */
+	p->max_udp_payload_size = QUIC_MAX_UDP_PAYLOAD_SIZE;
+	if (server)
+		p->max_idle_timeout = global.tune.quic_frontend_max_idle_timeout;
+	else
+		p->max_idle_timeout = global.tune.quic_backend_max_idle_timeout;
+
+	p->initial_max_streams_bidi = max_streams_bidi;
+	p->initial_max_streams_uni = max_streams_uni;
+	p->initial_max_stream_data_bidi_local = ncb_size;
+	p->initial_max_stream_data_bidi_remote = ncb_size;
+	p->initial_max_stream_data_uni = ncb_size;
+	p->initial_max_data = (max_streams_bidi + max_streams_uni) * ncb_size;
+
+	if (server) {
+		p->with_stateless_reset_token = 1;
+		p->disable_active_migration = 1;
+	}
+
+	p->active_connection_id_limit = 8;
+
+	p->retry_source_connection_id.len = 0;
+}
+
+/* Encode <addr> preferred address transport parameter in <buf> without its
+ * "type+len" prefix.
+ * It is the responsibility of the caller to check that there is enough room
+ * in <buf> to encode this address.
+ * Never fails.
+ */
+static void quic_transport_param_enc_pref_addr_val(unsigned char **buf,
+                                                   const unsigned char *end,
+                                                   struct tp_preferred_address *addr)
+{
+	write_n16(*buf, addr->ipv4_port);
+	*buf += sizeof addr->ipv4_port;
+
+	memcpy(*buf, (uint8_t *)&addr->ipv4_addr.s_addr, sizeof(addr->ipv4_addr.s_addr));
+	*buf += sizeof(addr->ipv4_addr.s_addr);
+
+	write_n16(*buf, addr->ipv6_port);
+	*buf += sizeof addr->ipv6_port;
+
+	memcpy(*buf, addr->ipv6_addr.s6_addr, sizeof(addr->ipv6_addr.s6_addr));
+	*buf += sizeof(addr->ipv6_addr.s6_addr);
+
+	*(*buf)++ = addr->cid.len;
+	if (addr->cid.len) {
+		memcpy(*buf, addr->cid.data, addr->cid.len);
+		*buf += addr->cid.len;
+	}
+
+	memcpy(*buf, addr->stateless_reset_token, sizeof addr->stateless_reset_token);
+	*buf += sizeof addr->stateless_reset_token;
+}
+
+/* Decode into <addr> preferred address transport parameter found in <*buf> buffer.
+ * Returns 1 if succeeded, 0 if not.
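+ *
+ * Expected wire layout (RFC 9000, 18.2, preferred_address):
+ *
+ *     IPv4 Address (32) / IPv4 Port (16) /
+ *     IPv6 Address (128) / IPv6 Port (16) /
+ *     CID Length (8) / CID (..) / Stateless Reset Token (128)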
+ */ +static int quic_transport_param_dec_pref_addr(struct tp_preferred_address *addr, + const unsigned char **buf, + const unsigned char *end) +{ + ssize_t addr_len; + + addr_len = sizeof(addr->ipv4_port) + sizeof(addr->ipv4_addr.s_addr); + addr_len += sizeof(addr->ipv6_port) + sizeof(addr->ipv6_addr.s6_addr); + addr_len += sizeof(addr->cid.len); + + if (end - *buf < addr_len) + return 0; + + memcpy((uint8_t *)&addr->ipv4_addr.s_addr, *buf, sizeof(addr->ipv4_addr.s_addr)); + *buf += sizeof(addr->ipv4_addr.s_addr); + + addr->ipv4_port = read_n16(*buf); + *buf += sizeof addr->ipv4_port; + + memcpy(addr->ipv6_addr.s6_addr, *buf, sizeof(addr->ipv6_addr.s6_addr)); + *buf += sizeof(addr->ipv6_addr.s6_addr); + + addr->ipv6_port = read_n16(*buf); + *buf += sizeof addr->ipv6_port; + + addr->cid.len = *(*buf)++; + if (addr->cid.len) { + if (end - sizeof(addr->stateless_reset_token) - *buf > addr->cid.len || + addr->cid.len > sizeof(addr->cid.data)) { + return 0; + } + + memcpy(addr->cid.data, *buf, addr->cid.len); + *buf += addr->cid.len; + } + + if (end - *buf != sizeof(addr->stateless_reset_token)) + return 0; + + memcpy(addr->stateless_reset_token, *buf, end - *buf); + *buf += sizeof addr->stateless_reset_token; + + return *buf == end; +} + +/* Decode into <v> version information received transport parameters from <*buf> + * buffer. <server> must be set to 1 for QUIC clients which receive server + * transport parameters, and 0 for QUIC servers which receive client transport + * parameters. + * Also set the QUIC negotiated version into <tp>. + * Return 1 if succeeded, 0 if not. + */ +static int quic_transport_param_dec_version_info(struct tp_version_information *tp, + const unsigned char **buf, + const unsigned char *end, int server) +{ + size_t tp_len = end - *buf; + const uint32_t *ver, *others; + + /* <tp_len> must be a multiple of sizeof(uint32_t) */ + if (tp_len < sizeof tp->chosen || (tp_len & 0x3)) + return 0; + + tp->chosen = ntohl(*(uint32_t *)*buf); + /* Must not be null */ + if (!tp->chosen) + return 0; + + *buf += sizeof tp->chosen; + others = (const uint32_t *)*buf; + + /* Others versions must not be null */ + for (ver = others; ver < (const uint32_t *)end; ver++) { + if (!*ver) + return 0; + } + + if (server) + /* TODO: not supported */ + return 0; + + for (ver = others; ver < (const uint32_t *)end; ver++) { + if (!tp->negotiated_version) { + int i; + + for (i = 0; i < quic_versions_nb; i++) { + if (ntohl(*ver) == quic_versions[i].num) { + tp->negotiated_version = &quic_versions[i]; + break; + } + } + } + + if (preferred_version && ntohl(*ver) == preferred_version->num) { + tp->negotiated_version = preferred_version; + goto out; + } + } + + out: + *buf = end; + + return 1; +} + +/* Decode into <p> struct a transport parameter found in <*buf> buffer with + * <type> as type and <len> as length, depending on <server> boolean value which + * must be set to 1 for a server (haproxy listener) or 0 for a client (connection + * to an haproxy server). 
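+ *
+ * Each parameter arrives as a Type (varint) / Length (varint) / Value
+ * triplet (RFC 9000, 18.1); <type> and <len> have already been parsed by
+ * the caller, so <*buf> points to the value field here.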
+ */ +static int quic_transport_param_decode(struct quic_transport_params *p, + int server, uint64_t type, + const unsigned char **buf, size_t len) +{ + const unsigned char *end = *buf + len; + + switch (type) { + case QUIC_TP_ORIGINAL_DESTINATION_CONNECTION_ID: + if (!server || len > sizeof p->original_destination_connection_id.data) + return 0; + + if (len) + memcpy(p->original_destination_connection_id.data, *buf, len); + p->original_destination_connection_id.len = len; + *buf += len; + p->original_destination_connection_id_present = 1; + break; + case QUIC_TP_INITIAL_SOURCE_CONNECTION_ID: + if (len > sizeof p->initial_source_connection_id.data) + return 0; + + if (len) + memcpy(p->initial_source_connection_id.data, *buf, len); + p->initial_source_connection_id.len = len; + *buf += len; + p->initial_source_connection_id_present = 1; + break; + case QUIC_TP_STATELESS_RESET_TOKEN: + if (!server || len != sizeof p->stateless_reset_token) + return 0; + memcpy(p->stateless_reset_token, *buf, len); + *buf += len; + p->with_stateless_reset_token = 1; + break; + case QUIC_TP_PREFERRED_ADDRESS: + if (!server) + return 0; + if (!quic_transport_param_dec_pref_addr(&p->preferred_address, buf, *buf + len)) + return 0; + p->with_preferred_address = 1; + break; + case QUIC_TP_MAX_IDLE_TIMEOUT: + if (!quic_dec_int(&p->max_idle_timeout, buf, end)) + return 0; + break; + case QUIC_TP_MAX_UDP_PAYLOAD_SIZE: + if (!quic_dec_int(&p->max_udp_payload_size, buf, end)) + return 0; + break; + case QUIC_TP_INITIAL_MAX_DATA: + if (!quic_dec_int(&p->initial_max_data, buf, end)) + return 0; + break; + case QUIC_TP_INITIAL_MAX_STREAM_DATA_BIDI_LOCAL: + if (!quic_dec_int(&p->initial_max_stream_data_bidi_local, buf, end)) + return 0; + break; + case QUIC_TP_INITIAL_MAX_STREAM_DATA_BIDI_REMOTE: + if (!quic_dec_int(&p->initial_max_stream_data_bidi_remote, buf, end)) + return 0; + break; + case QUIC_TP_INITIAL_MAX_STREAM_DATA_UNI: + if (!quic_dec_int(&p->initial_max_stream_data_uni, buf, end)) + return 0; + break; + case QUIC_TP_INITIAL_MAX_STREAMS_BIDI: + if (!quic_dec_int(&p->initial_max_streams_bidi, buf, end)) + return 0; + break; + case QUIC_TP_INITIAL_MAX_STREAMS_UNI: + if (!quic_dec_int(&p->initial_max_streams_uni, buf, end)) + return 0; + break; + case QUIC_TP_ACK_DELAY_EXPONENT: + if (!quic_dec_int(&p->ack_delay_exponent, buf, end) || + p->ack_delay_exponent > QUIC_TP_ACK_DELAY_EXPONENT_LIMIT) + return 0; + break; + case QUIC_TP_MAX_ACK_DELAY: + if (!quic_dec_int(&p->max_ack_delay, buf, end) || + p->max_ack_delay > QUIC_TP_MAX_ACK_DELAY_LIMIT) + return 0; + break; + case QUIC_TP_DISABLE_ACTIVE_MIGRATION: + /* Zero-length parameter type. */ + if (len != 0) + return 0; + p->disable_active_migration = 1; + break; + case QUIC_TP_ACTIVE_CONNECTION_ID_LIMIT: + if (!quic_dec_int(&p->active_connection_id_limit, buf, end)) + return 0; + break; + case QUIC_TP_VERSION_INFORMATION: + if (!quic_transport_param_dec_version_info(&p->version_information, + buf, *buf + len, server)) + return 0; + break; + default: + *buf += len; + }; + + return *buf == end; +} + +/* Encode <type> and <len> variable length values in <buf>. + * Returns 1 if succeeded, 0 if not. + */ +static int quic_transport_param_encode_type_len(unsigned char **buf, + const unsigned char *end, + uint64_t type, uint64_t len) +{ + return quic_enc_int(buf, end, type) && quic_enc_int(buf, end, len); +} + +/* Decode variable length type and length values of a QUIC transport parameter + * into <type> and <len> found in <*buf> buffer. 
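+ * Both fields use the QUIC variable-length integer encoding (RFC 9000, 16):
+ * the two most significant bits of the first byte give the field size
+ * (00 -> 1 byte, 01 -> 2, 10 -> 4, 11 -> 8) and the remaining bits the value.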
+ * Returns 1 if succeeded, 0 if not. + */ +static int quic_transport_param_decode_type_len(uint64_t *type, uint64_t *len, + const unsigned char **buf, + const unsigned char *end) +{ + return quic_dec_int(type, buf, end) && quic_dec_int(len, buf, end); +} + +/* Encode <param> bytes stream with <type> as type and <length> as length into buf. + * Returns 1 if succeeded, 0 if not. + */ +static int quic_transport_param_enc_mem(unsigned char **buf, const unsigned char *end, + uint64_t type, void *param, uint64_t length) +{ + if (!quic_transport_param_encode_type_len(buf, end, type, length)) + return 0; + + if (end - *buf < length) + return 0; + + if (length) + memcpy(*buf, param, length); + *buf += length; + + return 1; +} + +/* Encode <val> 64-bits value as variable length integer into <buf>. + * Returns 1 if succeeded, 0 if not. + */ +static int quic_transport_param_enc_int(unsigned char **buf, + const unsigned char *end, + uint64_t type, uint64_t val) +{ + size_t len; + + len = quic_int_getsize(val); + + return len && quic_transport_param_encode_type_len(buf, end, type, len) && + quic_enc_int(buf, end, val); +} + +/* Returns the required length in bytes to encode <cid> QUIC connection ID. */ +static inline size_t sizeof_quic_cid(const struct tp_cid *cid) +{ + return sizeof cid->len + cid->len; +} + +/* Encode <addr> preferred address into <buf>. + * Returns 1 if succeeded, 0 if not. + */ +static int quic_transport_param_enc_pref_addr(unsigned char **buf, + const unsigned char *end, + struct tp_preferred_address *addr) +{ + uint64_t addr_len = 0; + + addr_len += sizeof(addr->ipv4_port) + sizeof(addr->ipv4_addr.s_addr); + addr_len += sizeof(addr->ipv6_port) + sizeof(addr->ipv6_addr.s6_addr); + addr_len += sizeof_quic_cid(&addr->cid); + addr_len += sizeof(addr->stateless_reset_token); + + if (!quic_transport_param_encode_type_len(buf, end, QUIC_TP_PREFERRED_ADDRESS, addr_len)) + return 0; + + if (end - *buf < addr_len) + return 0; + + quic_transport_param_enc_pref_addr_val(buf, end, addr); + + return 1; +} + +/* Encode version information transport parameters with <chosen_version> as chosen + * version. + * Return 1 if succeeded, 0 if not. + */ +static int quic_transport_param_enc_version_info(unsigned char **buf, + const unsigned char *end, + const struct quic_version *chosen_version, + int server) +{ + int i; + uint64_t tp_len; + uint32_t ver; + + tp_len = sizeof chosen_version->num + quic_versions_nb * sizeof(uint32_t); + if (!quic_transport_param_encode_type_len(buf, end, + QUIC_TP_VERSION_INFORMATION, + tp_len)) + return 0; + + if (end - *buf < tp_len) + return 0; + + /* First: chosen version */ + ver = htonl(chosen_version->num); + memcpy(*buf, &ver, sizeof ver); + *buf += sizeof ver; + /* For servers: all supported version, chosen included */ + for (i = 0; i < quic_versions_nb; i++) { + ver = htonl(quic_versions[i].num); + memcpy(*buf, &ver, sizeof ver); + *buf += sizeof ver; + } + + return 1; +} + +/* Encode <p> transport parameter into <buf> depending on <server> value which + * must be set to 1 for a server (haproxy listener) or 0 for a client + * (connection to a haproxy server). + * Return the number of bytes consumed if succeeded, 0 if not. 
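+ *
+ * A hedged usage sketch (the buffer size, <ssl> and <chosen_ver> are
+ * illustrative; the SSL call is the quictls/BoringSSL QUIC API, which may
+ * differ with other stacks):
+ *
+ *     unsigned char tps[128];
+ *     size_t tps_len;
+ *
+ *     tps_len = quic_transport_params_encode(tps, tps + sizeof tps,
+ *                                            &qc->rx.params, chosen_ver, 1);
+ *     if (!tps_len || !SSL_set_quic_transport_params(ssl, tps, tps_len))
+ *         goto err;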
+ */ +int quic_transport_params_encode(unsigned char *buf, + const unsigned char *end, + struct quic_transport_params *p, + const struct quic_version *chosen_version, + int server) +{ + unsigned char *head; + unsigned char *pos; + + head = pos = buf; + if (server) { + if (!quic_transport_param_enc_mem(&pos, end, + QUIC_TP_ORIGINAL_DESTINATION_CONNECTION_ID, + p->original_destination_connection_id.data, + p->original_destination_connection_id.len)) + return 0; + + if (p->retry_source_connection_id.len) { + if (!quic_transport_param_enc_mem(&pos, end, + QUIC_TP_RETRY_SOURCE_CONNECTION_ID, + p->retry_source_connection_id.data, + p->retry_source_connection_id.len)) + return 0; + } + + if (p->with_stateless_reset_token && + !quic_transport_param_enc_mem(&pos, end, QUIC_TP_STATELESS_RESET_TOKEN, + p->stateless_reset_token, + sizeof p->stateless_reset_token)) + return 0; + if (p->with_preferred_address && + !quic_transport_param_enc_pref_addr(&pos, end, &p->preferred_address)) + return 0; + } + + if (!quic_transport_param_enc_mem(&pos, end, + QUIC_TP_INITIAL_SOURCE_CONNECTION_ID, + p->initial_source_connection_id.data, + p->initial_source_connection_id.len)) + return 0; + + if (p->max_idle_timeout && + !quic_transport_param_enc_int(&pos, end, QUIC_TP_MAX_IDLE_TIMEOUT, p->max_idle_timeout)) + return 0; + + /* + * "max_packet_size" transport parameter must be transmitted only if different + * of the default value. + */ + if (p->max_udp_payload_size != QUIC_TP_DFLT_MAX_UDP_PAYLOAD_SIZE && + !quic_transport_param_enc_int(&pos, end, QUIC_TP_MAX_UDP_PAYLOAD_SIZE, p->max_udp_payload_size)) + return 0; + + if (p->initial_max_data && + !quic_transport_param_enc_int(&pos, end, QUIC_TP_INITIAL_MAX_DATA, p->initial_max_data)) + return 0; + + if (p->initial_max_stream_data_bidi_local && + !quic_transport_param_enc_int(&pos, end, QUIC_TP_INITIAL_MAX_STREAM_DATA_BIDI_LOCAL, + p->initial_max_stream_data_bidi_local)) + return 0; + + if (p->initial_max_stream_data_bidi_remote && + !quic_transport_param_enc_int(&pos, end, QUIC_TP_INITIAL_MAX_STREAM_DATA_BIDI_REMOTE, + p->initial_max_stream_data_bidi_remote)) + return 0; + + if (p->initial_max_stream_data_uni && + !quic_transport_param_enc_int(&pos, end, QUIC_TP_INITIAL_MAX_STREAM_DATA_UNI, + p->initial_max_stream_data_uni)) + return 0; + + if (p->initial_max_streams_bidi && + !quic_transport_param_enc_int(&pos, end, QUIC_TP_INITIAL_MAX_STREAMS_BIDI, + p->initial_max_streams_bidi)) + return 0; + + if (p->initial_max_streams_uni && + !quic_transport_param_enc_int(&pos, end, QUIC_TP_INITIAL_MAX_STREAMS_UNI, + p->initial_max_streams_uni)) + return 0; + + /* + * "ack_delay_exponent" transport parameter must be transmitted only if different + * of the default value. + */ + if (p->ack_delay_exponent != QUIC_TP_DFLT_ACK_DELAY_COMPONENT && + !quic_transport_param_enc_int(&pos, end, QUIC_TP_ACK_DELAY_EXPONENT, p->ack_delay_exponent)) + return 0; + + /* + * "max_ack_delay" transport parameter must be transmitted only if different + * of the default value. 
+ */ + if (p->max_ack_delay != QUIC_TP_DFLT_MAX_ACK_DELAY && + !quic_transport_param_enc_int(&pos, end, QUIC_TP_MAX_ACK_DELAY, p->max_ack_delay)) + return 0; + + /* 0-length value */ + if (p->disable_active_migration && + !quic_transport_param_encode_type_len(&pos, end, QUIC_TP_DISABLE_ACTIVE_MIGRATION, 0)) + return 0; + + if (p->active_connection_id_limit && + p->active_connection_id_limit != QUIC_TP_DFLT_ACTIVE_CONNECTION_ID_LIMIT && + !quic_transport_param_enc_int(&pos, end, QUIC_TP_ACTIVE_CONNECTION_ID_LIMIT, + p->active_connection_id_limit)) + return 0; + + if (!quic_transport_param_enc_version_info(&pos, end, chosen_version, server)) + return 0; + + return pos - head; +} + +/* Decode transport parameters found in <buf> buffer into <p>, depending on + * <server> boolean value which must be set to 1 for a server (haproxy listener) + * or 0 for a client (connection to a haproxy server). + * Returns 1 if succeeded, 0 if not. + */ +static int quic_transport_params_decode(struct quic_transport_params *p, int server, + const unsigned char *buf, + const unsigned char *end) +{ + const unsigned char *pos; + uint64_t type, len = 0; + + pos = buf; + + while (pos != end) { + if (!quic_transport_param_decode_type_len(&type, &len, &pos, end)) + return 0; + + if (end - pos < len) + return 0; + + if (!quic_transport_param_decode(p, server, type, &pos, len)) + return 0; + } + + /* + * A server MUST send original_destination_connection_id transport parameter. + * initial_source_connection_id must be present both for server and client. + */ + if ((server && !p->original_destination_connection_id_present) || + !p->initial_source_connection_id_present) + return 0; + + /* Note that if not received by the peer, active_connection_id_limit will + * have QUIC_TP_DFLT_ACTIVE_CONNECTION_ID_LIMIT as default value. This + * is also the minimum value for this transport parameter. + */ + if (p->active_connection_id_limit < QUIC_TP_DFLT_ACTIVE_CONNECTION_ID_LIMIT) + return 0; + + return 1; +} + +/* Store transport parameters found in <buf> buffer into <qc> QUIC connection + * depending on <server> value which must be 1 for a server (haproxy listener) + * or 0 for a client (connection to a haproxy server). + * Note that peer transport parameters are stored in the TX part of the connection: + * they are used to send packets to the peer with its transport parameters as + * limitations. + * Returns 1 if succeeded, 0 if not. 
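+ *
+ * Sketch of a typical caller (illustrative; with the quictls/BoringSSL QUIC
+ * API the raw peer parameters would come from
+ * SSL_get_peer_quic_transport_params()):
+ *
+ *     const unsigned char *tps;
+ *     size_t tps_len;
+ *
+ *     SSL_get_peer_quic_transport_params(ssl, &tps, &tps_len);
+ *     if (!tps_len ||
+ *         !quic_transport_params_store(qc, server, tps, tps + tps_len))
+ *         goto err;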
+ */ +int quic_transport_params_store(struct quic_conn *qc, int server, + const unsigned char *buf, + const unsigned char *end) +{ + struct quic_transport_params *tx_params = &qc->tx.params; + struct quic_transport_params *rx_params = &qc->rx.params; + /* Initial source connection ID */ + struct tp_cid *iscid; + + /* initialize peer TPs to RFC default value */ + quic_dflt_transport_params_cpy(tx_params); + + if (!quic_transport_params_decode(tx_params, server, buf, end)) + return 0; + + /* Update the connection from transport parameters received */ + if (tx_params->version_information.negotiated_version && + tx_params->version_information.negotiated_version != qc->original_version) + qc->negotiated_version = + qc->tx.params.version_information.negotiated_version; + + if (tx_params->max_ack_delay) + qc->max_ack_delay = tx_params->max_ack_delay; + + if (tx_params->max_idle_timeout && rx_params->max_idle_timeout) + qc->max_idle_timeout = + QUIC_MIN(tx_params->max_idle_timeout, rx_params->max_idle_timeout); + else + qc->max_idle_timeout = + QUIC_MAX(tx_params->max_idle_timeout, rx_params->max_idle_timeout); + TRACE_PROTO("\nTX(remote) transp. params.", QUIC_EV_TRANSP_PARAMS, qc, tx_params); + + /* Check that the "initial_source_connection_id" transport parameter matches + * the SCID received which is also the DCID of the connection. + */ + iscid = &tx_params->initial_source_connection_id; + if (qc->dcid.len != iscid->len || + (qc->dcid.len && memcmp(qc->dcid.data, iscid->data, qc->dcid.len))) { + TRACE_PROTO("initial_source_connection_id transport parameter mismatch", + QUIC_EV_TRANSP_PARAMS, qc); + /* Kill the connection as soon as possible */ + qc_kill_conn(qc); + } + + return 1; +} + +/* QUIC server (or haproxy listener) only function. + * Initialize the local transport parameters <rx_params> from <listener_params> + * coming from configuration and Initial packet information (destination + * connection ID, source connection ID, original destination connection ID) from + * client token. + * Returns 1 if succeeded, 0 if not. + */ +int qc_lstnr_params_init(struct quic_conn *qc, + const struct quic_transport_params *listener_params, + const unsigned char *stateless_reset_token, + const unsigned char *dcid, size_t dcidlen, + const unsigned char *scid, size_t scidlen, + const struct quic_cid *token_odcid) +{ + struct quic_transport_params *rx_params = &qc->rx.params; + struct tp_cid *odcid_param = &rx_params->original_destination_connection_id; + + /* Copy the transport parameters. */ + *rx_params = *listener_params; + /* Copy the stateless reset token */ + memcpy(rx_params->stateless_reset_token, stateless_reset_token, + sizeof rx_params->stateless_reset_token); + /* Copy original_destination_connection_id transport parameter. */ + if (token_odcid->len) { + memcpy(odcid_param->data, token_odcid->data, token_odcid->len); + odcid_param->len = token_odcid->len; + /* Copy retry_source_connection_id transport parameter. */ + memcpy(rx_params->retry_source_connection_id.data, dcid, dcidlen); + rx_params->retry_source_connection_id.len = dcidlen; + } + else { + memcpy(odcid_param->data, dcid, dcidlen); + odcid_param->len = dcidlen; + } + + /* Copy the initial source connection ID. */ + memcpy(rx_params->initial_source_connection_id.data, scid, scidlen); + rx_params->initial_source_connection_id.len = scidlen; + TRACE_PROTO("\nRX(local) transp. 
params.", QUIC_EV_TRANSP_PARAMS, qc, rx_params); + + return 1; +} + diff --git a/src/quic_trace.c b/src/quic_trace.c new file mode 100644 index 0000000..9ab9626 --- /dev/null +++ b/src/quic_trace.c @@ -0,0 +1,633 @@ +/* + * QUIC traces + * + * Copyright 2000-2020 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <inttypes.h> + +#include <haproxy/quic_conn.h> +#include <haproxy/quic_tls.h> +#include <haproxy/quic_trace.h> +#include <haproxy/quic_tp.h> +#include <haproxy/trace.h> + +static void quic_trace(enum trace_level level, uint64_t mask, const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4); + +static const struct trace_event quic_trace_events[] = { + { .mask = QUIC_EV_CONN_NEW, .name = "new_conn", .desc = "new QUIC connection" }, + { .mask = QUIC_EV_CONN_INIT, .name = "new_conn_init", .desc = "new QUIC connection initialization" }, + { .mask = QUIC_EV_CONN_ISEC, .name = "init_secs", .desc = "initial secrets derivation" }, + { .mask = QUIC_EV_CONN_RSEC, .name = "read_secs", .desc = "read secrets derivation" }, + { .mask = QUIC_EV_CONN_WSEC, .name = "write_secs", .desc = "write secrets derivation" }, + { .mask = QUIC_EV_CONN_LPKT, .name = "lstnr_packet", .desc = "new listener received packet" }, + { .mask = QUIC_EV_CONN_SPKT, .name = "srv_packet", .desc = "new server received packet" }, + { .mask = QUIC_EV_CONN_ENCPKT, .name = "enc_hdshk_pkt", .desc = "handhshake packet encryption" }, + { .mask = QUIC_EV_CONN_TXPKT, .name = "tx_pkt", .desc = "TX packet" }, + { .mask = QUIC_EV_CONN_PAPKT, .name = "phdshk_apkt", .desc = "post handhshake application packet preparation" }, + { .mask = QUIC_EV_CONN_PAPKTS, .name = "phdshk_apkts", .desc = "post handhshake application packets preparation" }, + { .mask = QUIC_EV_CONN_IO_CB, .name = "qc_io_cb", .desc = "QUIC conn. I/O processing" }, + { .mask = QUIC_EV_CONN_RMHP, .name = "rm_hp", .desc = "Remove header protection" }, + { .mask = QUIC_EV_CONN_PRSHPKT, .name = "parse_hpkt", .desc = "parse handshake packet" }, + { .mask = QUIC_EV_CONN_PRSAPKT, .name = "parse_apkt", .desc = "parse application packet" }, + { .mask = QUIC_EV_CONN_PRSFRM, .name = "parse_frm", .desc = "parse frame" }, + { .mask = QUIC_EV_CONN_PRSAFRM, .name = "parse_ack_frm", .desc = "parse ACK frame" }, + { .mask = QUIC_EV_CONN_BFRM, .name = "build_frm", .desc = "build frame" }, + { .mask = QUIC_EV_CONN_PHPKTS, .name = "phdshk_pkts", .desc = "handhshake packets preparation" }, + { .mask = QUIC_EV_CONN_TRMHP, .name = "rm_hp_try", .desc = "header protection removing try" }, + { .mask = QUIC_EV_CONN_ELRMHP, .name = "el_rm_hp", .desc = "handshake enc. level header protection removing" }, + { .mask = QUIC_EV_CONN_RXPKT, .name = "rx_pkt", .desc = "RX packet" }, + { .mask = QUIC_EV_CONN_SSLDATA, .name = "ssl_provide_data", .desc = "CRYPTO data provision to TLS stack" }, + { .mask = QUIC_EV_CONN_RXCDATA, .name = "el_treat_rx_cfrms",.desc = "enc. 
+	{ .mask = QUIC_EV_CONN_ADDDATA,  .name = "add_hdshk_data",   .desc = "TLS stack ->add_handshake_data() call"},
+	{ .mask = QUIC_EV_CONN_FFLIGHT,  .name = "flush_flight",     .desc = "TLS stack ->flush_flight() call"},
+	{ .mask = QUIC_EV_CONN_SSLALERT, .name = "send_alert",       .desc = "TLS stack ->send_alert() call"},
+	{ .mask = QUIC_EV_CONN_RTTUPDT,  .name = "rtt_updt",         .desc = "RTT sampling" },
+	{ .mask = QUIC_EV_CONN_SPPKTS,   .name = "sppkts",           .desc = "send prepared packets" },
+	{ .mask = QUIC_EV_CONN_PKTLOSS,  .name = "pktloss",          .desc = "detect packet loss" },
+	{ .mask = QUIC_EV_CONN_STIMER,   .name = "stimer",           .desc = "set timer" },
+	{ .mask = QUIC_EV_CONN_PTIMER,   .name = "ptimer",           .desc = "process timer" },
+	{ .mask = QUIC_EV_CONN_SPTO,     .name = "spto",             .desc = "set PTO" },
+	{ .mask = QUIC_EV_CONN_BCFRMS,   .name = "bcfrms",           .desc = "build CRYPTO data frames" },
+	{ .mask = QUIC_EV_CONN_XPRTSEND, .name = "xprt_send",        .desc = "sending XPRT subscription" },
+	{ .mask = QUIC_EV_CONN_XPRTRECV, .name = "xprt_recv",        .desc = "receiving XPRT subscription" },
+	{ .mask = QUIC_EV_CONN_FREED,    .name = "conn_freed",       .desc = "releasing conn. memory" },
+	{ .mask = QUIC_EV_CONN_CLOSE,    .name = "conn_close",       .desc = "closing conn." },
+	{ .mask = QUIC_EV_CONN_ACKSTRM,  .name = "ack_strm",         .desc = "STREAM ack."},
+	{ .mask = QUIC_EV_CONN_FRMLIST,  .name = "frm_list",         .desc = "frame list"},
+	{ .mask = QUIC_EV_STATELESS_RST, .name = "stateless_reset",  .desc = "stateless reset sent"},
+	{ .mask = QUIC_EV_TRANSP_PARAMS, .name = "transport_params", .desc = "transport parameters"},
+	{ .mask = QUIC_EV_CONN_IDLE_TIMER, .name = "idle_timer",     .desc = "idle timer task"},
+	{ .mask = QUIC_EV_CONN_SUB,      .name = "xprt_sub",         .desc = "RX/TX subscription or unsubscription to QUIC xprt"},
+	{ .mask = QUIC_EV_CONN_RCV,      .name = "conn_recv",        .desc = "RX on connection" },
+	{ .mask = QUIC_EV_CONN_SET_AFFINITY, .name = "conn_set_affinity", .desc = "set connection thread affinity" },
+	{ /* end */ }
+};
+
+static const struct name_desc quic_trace_lockon_args[4] = {
+	/* arg1 */ { /* already used by the connection */ },
+	/* arg2 */ { .name="quic", .desc="QUIC transport" },
+	/* arg3 */ { },
+	/* arg4 */ { }
+};
+
+static const struct name_desc quic_trace_decoding[] = {
+#define QUIC_VERB_CLEAN    1
+	{ .name="clean",    .desc="only user-friendly stuff, generally suitable for level \"user\"" },
+	{ /* end */ }
+};
+
+
+struct trace_source trace_quic = {
+	.name = IST("quic"),
+	.desc = "QUIC xprt",
+	.arg_def = TRC_ARG1_QCON,  /* TRACE()'s first argument is always a quic_conn */
+	.default_cb = quic_trace,
+	.known_events = quic_trace_events,
+	.lockon_args = quic_trace_lockon_args,
+	.decoding = quic_trace_decoding,
+	.report_events = ~0,  /* report everything by default */
+};
+
+INITCALL1(STG_REGISTER, trace_register_source, TRACE_SOURCE);
+
+/* Trace callback for QUIC.
+ * These traces always expect that arg1, if non-null, is of type quic_conn.
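+ *
+ * As an illustration (hedged, runtime CLI syntax), these events can
+ * typically be inspected with something like:
+ *
+ *     trace quic sink stderr
+ *     trace quic level developer
+ *     trace quic start now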
+ */ +static void quic_trace(enum trace_level level, uint64_t mask, const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4) +{ + const struct quic_conn *qc = a1; + + if (qc) { + const struct quic_tls_ctx *tls_ctx; + + chunk_appendf(&trace_buf, " : qc@%p idle_timer_task@%p flags=0x%x", + qc, qc->idle_timer_task, qc->flags); + if (mask & QUIC_EV_CONN_INIT) { + chunk_appendf(&trace_buf, "\n odcid"); + quic_cid_dump(&trace_buf, &qc->odcid); + chunk_appendf(&trace_buf, "\n dcid"); + quic_cid_dump(&trace_buf, &qc->dcid); + chunk_appendf(&trace_buf, "\n scid"); + quic_cid_dump(&trace_buf, &qc->scid); + } + + if (mask & QUIC_EV_TRANSP_PARAMS) { + const struct quic_transport_params *p = a2; + + if (p) + quic_transport_params_dump(&trace_buf, qc, p); + } + + if (mask & QUIC_EV_CONN_ADDDATA) { + const enum ssl_encryption_level_t *level = a2; + const size_t *len = a3; + + if (level) { + enum quic_tls_enc_level lvl = ssl_to_quic_enc_level(*level); + + chunk_appendf(&trace_buf, " el=%c(%d)", quic_enc_level_char(lvl), lvl); + } + if (len) + chunk_appendf(&trace_buf, " len=%llu", (unsigned long long)*len); + } + if ((mask & QUIC_EV_CONN_ISEC) && qc) { + /* Initial read & write secrets. */ + const unsigned char *rx_sec = a2; + const unsigned char *tx_sec = a3; + + tls_ctx = &qc->iel->tls_ctx; + chunk_appendf(&trace_buf, "\n RX el=I"); + if (rx_sec) + quic_tls_secret_hexdump(&trace_buf, rx_sec, 32); + quic_tls_keys_hexdump(&trace_buf, &tls_ctx->rx); + chunk_appendf(&trace_buf, "\n TX el=I"); + if (tx_sec) + quic_tls_secret_hexdump(&trace_buf, tx_sec, 32); + quic_tls_keys_hexdump(&trace_buf, &tls_ctx->tx); + } + + if ((mask & QUIC_EV_CONN_KP) && qc) { + /* Initial read & write secrets. 
*/ + const struct quic_kp_trace *kp = a2; + + if (kp) { + if (kp->rx) { + chunk_appendf(&trace_buf, "\n RX kp"); + if (kp->rx_sec) + quic_tls_secret_hexdump(&trace_buf, kp->rx_sec, kp->rx_seclen); + quic_tls_kp_keys_hexdump(&trace_buf, kp->rx); + } + if (kp->tx) { + chunk_appendf(&trace_buf, "\n TX kp"); + if (kp->tx_sec) + quic_tls_secret_hexdump(&trace_buf, kp->tx_sec, kp->tx_seclen); + quic_tls_kp_keys_hexdump(&trace_buf, kp->tx); + } + } + } + + if (mask & (QUIC_EV_CONN_RSEC|QUIC_EV_CONN_RWSEC)) { + const enum ssl_encryption_level_t *level = a2; + + if (level) { + enum quic_tls_enc_level lvl = ssl_to_quic_enc_level(*level); + struct quic_enc_level *qel = qc_quic_enc_level(qc, lvl); + + chunk_appendf(&trace_buf, "\n RX el=%c", quic_enc_level_char(lvl)); + if (quic_tls_has_rx_sec(qel)) + quic_tls_keys_hexdump(&trace_buf, &qel->tls_ctx.rx); + else + chunk_appendf(&trace_buf, " (none)"); + } + } + + if (mask & (QUIC_EV_CONN_WSEC|QUIC_EV_CONN_RWSEC)) { + const enum ssl_encryption_level_t *level = a2; + + if (level) { + enum quic_tls_enc_level lvl = ssl_to_quic_enc_level(*level); + struct quic_enc_level *qel = qc_quic_enc_level(qc, lvl); + + chunk_appendf(&trace_buf, "\n TX el=%c", quic_enc_level_char(lvl)); + if (quic_tls_has_tx_sec(qel)) { + quic_tls_keys_hexdump(&trace_buf, &qel->tls_ctx.tx); + } + else + chunk_appendf(&trace_buf, " (none)"); + } + + } + + if (mask & QUIC_EV_CONN_FRMLIST) { + const struct list *l = a2; + + if (l) { + const struct quic_frame *frm; + list_for_each_entry(frm, l, list) { + chunk_appendf(&trace_buf, " frm@%p", frm); + chunk_frm_appendf(&trace_buf, frm); + } + } + } + + if (mask & (QUIC_EV_CONN_TXPKT|QUIC_EV_CONN_PAPKT)) { + const struct quic_tx_packet *pkt = a2; + const struct quic_enc_level *qel = a3; + const ssize_t *room = a4; + + if (qel) { + const struct quic_pktns *pktns = qel->pktns; + chunk_appendf(&trace_buf, " qel=%c flags=0x%x pto_count=%d cwnd=%llu ppif=%lld pif=%llu " + "if=%llu pp=%u", + quic_enc_level_char_from_qel(qel, qc), + qel->pktns->flags, + qc->path->loss.pto_count, + (unsigned long long)qc->path->cwnd, + (unsigned long long)qc->path->prep_in_flight, + (unsigned long long)qc->path->in_flight, + (unsigned long long)pktns->tx.in_flight, + pktns->tx.pto_probe); + } + if (pkt) { + const struct quic_frame *frm; + if (pkt->pn_node.key != (uint64_t)-1) + chunk_appendf(&trace_buf, " pn=%llu",(ull)pkt->pn_node.key); + list_for_each_entry(frm, &pkt->frms, list) { + chunk_appendf(&trace_buf, " frm@%p", frm); + chunk_frm_appendf(&trace_buf, frm); + } + } + + if (room) { + chunk_appendf(&trace_buf, " room=%lld", (long long)*room); + chunk_appendf(&trace_buf, " dcid.len=%llu scid.len=%llu", + (unsigned long long)qc->dcid.len, (unsigned long long)qc->scid.len); + } + } + + if (mask & QUIC_EV_CONN_IO_CB) { + const enum quic_handshake_state *state = a2; + + if (state) + chunk_appendf(&trace_buf, " state=%s", quic_hdshk_state_str(*state)); + } + + if (mask & (QUIC_EV_CONN_TRMHP|QUIC_EV_CONN_ELRMHP|QUIC_EV_CONN_SPKT)) { + const struct quic_rx_packet *pkt = a2; + const unsigned long *pktlen = a3; + const SSL *ssl = a4; + + if (pkt) { + chunk_appendf(&trace_buf, " pkt@%p", pkt); + if (pkt->type == QUIC_PACKET_TYPE_SHORT && pkt->data) + chunk_appendf(&trace_buf, " kp=%d", + !!(*pkt->data & QUIC_PACKET_KEY_PHASE_BIT)); + chunk_appendf(&trace_buf, " el=%c", + quic_packet_type_enc_level_char(pkt->type)); + if (pkt->pnl) + chunk_appendf(&trace_buf, " pnl=%u pn=%llu", pkt->pnl, + (unsigned long long)pkt->pn); + if (pkt->token_len) + chunk_appendf(&trace_buf, " 
toklen=%llu", + (unsigned long long)pkt->token_len); + if (pkt->aad_len) + chunk_appendf(&trace_buf, " aadlen=%llu", + (unsigned long long)pkt->aad_len); + chunk_appendf(&trace_buf, " flags=0x%x len=%llu", + pkt->flags, (unsigned long long)pkt->len); + } + if (pktlen) + chunk_appendf(&trace_buf, " (%ld)", *pktlen); + if (ssl) { + enum ssl_encryption_level_t level = SSL_quic_read_level(ssl); + chunk_appendf(&trace_buf, " el=%c", + quic_enc_level_char(ssl_to_quic_enc_level(level))); + } + } + + if (mask & (QUIC_EV_CONN_RXPKT|QUIC_EV_CONN_PRSHPKT|QUIC_EV_CONN_SSLDATA)) { + const struct quic_rx_packet *pkt = a2; + const struct quic_rx_crypto_frm *cf = a3; + const SSL *ssl = a4; + + if (pkt) + chunk_appendf(&trace_buf, " pkt@%p el=%c pn=%llu", pkt, + quic_packet_type_enc_level_char(pkt->type), + (unsigned long long)pkt->pn); + if (cf) + chunk_appendf(&trace_buf, " cfoff=%llu cflen=%llu", + (unsigned long long)cf->offset_node.key, + (unsigned long long)cf->len); + if (ssl) { + enum ssl_encryption_level_t level = SSL_quic_read_level(ssl); + chunk_appendf(&trace_buf, " rel=%c", + quic_enc_level_char(ssl_to_quic_enc_level(level))); + } + + if (qc->err.code) + chunk_appendf(&trace_buf, " err_code=0x%llx", (ull)qc->err.code); + } + + if (mask & (QUIC_EV_CONN_PRSFRM|QUIC_EV_CONN_BFRM)) { + const struct quic_frame *frm = a2; + + if (frm) + chunk_appendf(&trace_buf, " %s", quic_frame_type_string(frm->type)); + } + + if (mask & QUIC_EV_CONN_PHPKTS) { + const struct quic_enc_level *qel = a2; + const struct list *l = a3; + + if (qel) { + const struct quic_pktns *pktns = qel->pktns; + chunk_appendf(&trace_buf, + " qel=%c flags=0x%x state=%s ack?%d pto_count=%d cwnd=%llu " + "ppif=%lld pif=%llu if=%llu pp=%u off=%llu", + quic_enc_level_char_from_qel(qel, qc), + qel->pktns->flags, + quic_hdshk_state_str(qc->state), + !!(qel->pktns->flags & QUIC_FL_PKTNS_ACK_REQUIRED), + qc->path->loss.pto_count, + (unsigned long long)qc->path->cwnd, + (unsigned long long)qc->path->prep_in_flight, + (unsigned long long)qc->path->in_flight, + (unsigned long long)pktns->tx.in_flight, + pktns->tx.pto_probe, + qel->cstream ? 
(unsigned long long)qel->cstream->rx.offset : 0); + } + + if (l) { + const struct quic_frame *frm; + list_for_each_entry(frm, l, list) { + chunk_appendf(&trace_buf, " frm@%p", frm); + chunk_frm_appendf(&trace_buf, frm); + } + } + } + + if (mask & QUIC_EV_CONN_ENCPKT) { + const struct enc_debug_info *edi = a2; + + if (edi) + chunk_appendf(&trace_buf, + " payload=@%p payload_len=%llu" + " aad=@%p aad_len=%llu pn=%llu", + edi->payload, (unsigned long long)edi->payload_len, + edi->aad, (unsigned long long)edi->aad_len, + (unsigned long long)edi->pn); + } + + if (mask & QUIC_EV_CONN_RMHP) { + const struct quic_rx_packet *pkt = a2; + + if (pkt) { + const int *ret = a3; + + chunk_appendf(&trace_buf, " pkt@%p", pkt); + if (ret && *ret) + chunk_appendf(&trace_buf, " pnl=%u pn=%llu", + pkt->pnl, (unsigned long long)pkt->pn); + } + } + + if (mask & QUIC_EV_CONN_PRSAFRM) { + const struct quic_frame *frm = a2; + const unsigned long *val1 = a3; + const unsigned long *val2 = a4; + + if (frm) { + chunk_appendf(&trace_buf, " frm@%p", frm); + chunk_frm_appendf(&trace_buf, frm); + } + if (val1) + chunk_appendf(&trace_buf, " %lu", *val1); + if (val2) + chunk_appendf(&trace_buf, "..%lu", *val2); + } + + if (mask & QUIC_EV_CONN_ACKSTRM) { + const struct qf_stream *strm_frm = a2; + const struct qc_stream_desc *stream = a3; + + if (strm_frm) + chunk_appendf(&trace_buf, " off=%llu len=%llu", (ull)strm_frm->offset.key, (ull)strm_frm->len); + if (stream) + chunk_appendf(&trace_buf, " ack_offset=%llu", (ull)stream->ack_offset); + } + + if (mask & QUIC_EV_CONN_RTTUPDT) { + const unsigned int *rtt_sample = a2; + const unsigned int *ack_delay = a3; + const struct quic_loss *ql = a4; + + if (rtt_sample) + chunk_appendf(&trace_buf, " rtt_sample=%ums", *rtt_sample); + if (ack_delay) + chunk_appendf(&trace_buf, " ack_delay=%ums", *ack_delay); + if (ql) + chunk_appendf(&trace_buf, + " srtt=%ums rttvar=%ums min_rtt=%ums", + ql->srtt, ql->rtt_var, ql->rtt_min); + } + if (mask & QUIC_EV_CONN_CC) { + const struct quic_cc_event *ev = a2; + const struct quic_cc *cc = a3; + + if (a2) + quic_cc_event_trace(&trace_buf, ev); + if (a3) + quic_cc_state_trace(&trace_buf, cc); + } + + if (mask & QUIC_EV_CONN_PKTLOSS) { + const struct quic_pktns *pktns = a2; + const struct list *lost_pkts = a3; + + if (pktns) { + chunk_appendf(&trace_buf, " pktns=%c", quic_pktns_char(qc, pktns)); + if (pktns->tx.loss_time) + chunk_appendf(&trace_buf, " loss_time=%dms", + TICKS_TO_MS(tick_remain(now_ms, pktns->tx.loss_time))); + } + if (lost_pkts && !LIST_ISEMPTY(lost_pkts)) { + struct quic_tx_packet *pkt; + + chunk_appendf(&trace_buf, " lost_pkts:"); + list_for_each_entry(pkt, lost_pkts, list) + chunk_appendf(&trace_buf, " %lu", (unsigned long)pkt->pn_node.key); + } + } + + if (mask & (QUIC_EV_CONN_STIMER|QUIC_EV_CONN_PTIMER|QUIC_EV_CONN_SPTO)) { + const struct quic_pktns *pktns = a2; + const int *duration = a3; + const uint64_t *ifae_pkts = a4; + + if (ifae_pkts) + chunk_appendf(&trace_buf, " ifae_pkts=%llu", + (unsigned long long)*ifae_pkts); + if (pktns) { + chunk_appendf(&trace_buf, " pktns=%c pp=%d", + quic_pktns_char(qc, pktns), + pktns->tx.pto_probe); + if (mask & (QUIC_EV_CONN_STIMER|QUIC_EV_CONN_SPTO)) { + if (pktns->tx.in_flight) + chunk_appendf(&trace_buf, " if=%llu", (ull)pktns->tx.in_flight); + if (pktns->tx.loss_time) + chunk_appendf(&trace_buf, " loss_time=%dms", + TICKS_TO_MS(pktns->tx.loss_time - now_ms)); + } + if (mask & QUIC_EV_CONN_SPTO) { + if (pktns->tx.time_of_last_eliciting) + chunk_appendf(&trace_buf, " tole=%dms", + 
TICKS_TO_MS(pktns->tx.time_of_last_eliciting - now_ms)); + if (duration) + chunk_appendf(&trace_buf, " dur=%dms", TICKS_TO_MS(*duration)); + } + } + + if (!(mask & (QUIC_EV_CONN_SPTO|QUIC_EV_CONN_PTIMER)) && qc->timer_task) { + chunk_appendf(&trace_buf, + " expire=%dms", TICKS_TO_MS(qc->timer - now_ms)); + } + } + + if (mask & QUIC_EV_CONN_SPPKTS) { + const struct quic_tx_packet *pkt = a2; + + chunk_appendf(&trace_buf, " pto_count=%d cwnd=%llu ppif=%llu pif=%llu", + qc->path->loss.pto_count, + (unsigned long long)qc->path->cwnd, + (unsigned long long)qc->path->prep_in_flight, + (unsigned long long)qc->path->in_flight); + if (pkt) { + const struct quic_frame *frm; + if (pkt->flags & QUIC_FL_TX_PACKET_ACK) + chunk_appendf(&trace_buf, " ack"); + chunk_appendf(&trace_buf, " pn=%lu(%c) iflen=%llu", + (unsigned long)pkt->pn_node.key, + quic_pktns_char(qc, pkt->pktns), + (unsigned long long)pkt->in_flight_len); + chunk_appendf(&trace_buf, " bytes.rx=%llu bytes.tx=%llu", + (unsigned long long)qc->bytes.rx, + (unsigned long long)qc->bytes.tx); + list_for_each_entry(frm, &pkt->frms, list) { + chunk_appendf(&trace_buf, " frm@%p", frm); + chunk_frm_appendf(&trace_buf, frm); + } + + if (pkt->type == QUIC_PACKET_TYPE_INITIAL) { + chunk_appendf(&trace_buf, " with scid"); + quic_cid_dump(&trace_buf, &qc->scid); + } + } + } + + if (mask & QUIC_EV_CONN_SSLALERT) { + const uint8_t *alert = a2; + const enum ssl_encryption_level_t *level = a3; + + if (alert) + chunk_appendf(&trace_buf, " alert=0x%02x", *alert); + if (level) + chunk_appendf(&trace_buf, " el=%c", + quic_enc_level_char(ssl_to_quic_enc_level(*level))); + } + + if (mask & QUIC_EV_CONN_BCFRMS) { + const size_t *sz1 = a2; + const size_t *sz2 = a3; + const size_t *sz3 = a4; + + if (sz1) + chunk_appendf(&trace_buf, " %llu", (unsigned long long)*sz1); + if (sz2) + chunk_appendf(&trace_buf, " %llu", (unsigned long long)*sz2); + if (sz3) + chunk_appendf(&trace_buf, " %llu", (unsigned long long)*sz3); + } + + if (mask & QUIC_EV_CONN_PSTRM) { + const struct quic_frame *frm = a2; + + if (frm) + chunk_frm_appendf(&trace_buf, frm); + } + + if (mask & QUIC_EV_CONN_ELEVELSEL) { + const enum quic_handshake_state *state = a2; + const enum quic_tls_enc_level *level = a3; + const enum quic_tls_enc_level *next_level = a4; + + if (state) + chunk_appendf(&trace_buf, " state=%s", quic_hdshk_state_str(qc->state)); + if (level) + chunk_appendf(&trace_buf, " level=%c", quic_enc_level_char(*level)); + if (next_level) + chunk_appendf(&trace_buf, " next_level=%c", quic_enc_level_char(*next_level)); + + } + + if (mask & QUIC_EV_CONN_IDLE_TIMER) { + if (tick_isset(qc->ack_expire)) + chunk_appendf(&trace_buf, " ack_expire=%ums", + TICKS_TO_MS(tick_remain(now_ms, qc->ack_expire))); + if (tick_isset(qc->idle_expire)) + chunk_appendf(&trace_buf, " idle_expire=%ums", + TICKS_TO_MS(tick_remain(now_ms, qc->idle_expire))); + if (qc->idle_timer_task && tick_isset(qc->idle_timer_task->expire)) + chunk_appendf(&trace_buf, " expire=%ums", + TICKS_TO_MS(tick_remain(now_ms, qc->idle_timer_task->expire))); + } + } + + if (mask & QUIC_EV_CONN_RCV) { + int i; + const struct quic_dgram *dgram = a2; + char bufaddr[INET6_ADDRSTRLEN], bufport[6]; + + if (qc) { + addr_to_str(&qc->peer_addr, bufaddr, sizeof(bufaddr)); + port_to_str(&qc->peer_addr, bufport, sizeof(bufport)); + chunk_appendf(&trace_buf, " peer_addr=%s:%s ", bufaddr, bufport); + } + + if (dgram) { + chunk_appendf(&trace_buf, " dgram.len=%zu", dgram->len); + /* Socket */ + if (dgram->saddr.ss_family == AF_INET || + dgram->saddr.ss_family 
== AF_INET6) { + addr_to_str(&dgram->saddr, bufaddr, sizeof(bufaddr)); + port_to_str(&dgram->saddr, bufport, sizeof(bufport)); + chunk_appendf(&trace_buf, "saddr=%s:%s ", bufaddr, bufport); + + addr_to_str(&dgram->daddr, bufaddr, sizeof(bufaddr)); + port_to_str(&dgram->daddr, bufport, sizeof(bufport)); + chunk_appendf(&trace_buf, "daddr=%s:%s ", bufaddr, bufport); + } + /* DCID */ + for (i = 0; i < dgram->dcid_len; ++i) + chunk_appendf(&trace_buf, "%02x", dgram->dcid[i]); + + } + } + + if (mask & QUIC_EV_CONN_LPKT) { + const struct quic_rx_packet *pkt = a2; + const uint64_t *len = a3; + const struct quic_version *ver = a4; + + if (pkt) { + chunk_appendf(&trace_buf, " pkt@%p type=0x%02x %s", + pkt, pkt->type, qc_pkt_long(pkt) ? "long" : "short"); + if (pkt->pn_node.key != (uint64_t)-1) + chunk_appendf(&trace_buf, " pn=%llu", pkt->pn_node.key); + } + + if (len) + chunk_appendf(&trace_buf, " len=%llu", (ull)*len); + + if (ver) + chunk_appendf(&trace_buf, " ver=0x%08x", ver->num); + } + + if (mask & QUIC_EV_STATELESS_RST) { + const struct quic_cid *cid = a2; + + if (cid) + quic_cid_dump(&trace_buf, cid); + } + +} diff --git a/src/quic_tx.c b/src/quic_tx.c new file mode 100644 index 0000000..306b4c2 --- /dev/null +++ b/src/quic_tx.c @@ -0,0 +1,2348 @@ +/* + * QUIC protocol implementation. Lower layer with internal features implemented + * here such as QUIC encryption, idle timeout, acknowledgement and + * retransmission. + * + * Copyright 2020 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <haproxy/quic_tx.h> + +#include <haproxy/pool.h> +#include <haproxy/trace.h> +#include <haproxy/quic_cid.h> +#include <haproxy/quic_conn.h> +#include <haproxy/quic_retransmit.h> +#include <haproxy/quic_retry.h> +#include <haproxy/quic_sock.h> +#include <haproxy/quic_tls.h> +#include <haproxy/quic_trace.h> +#include <haproxy/ssl_sock-t.h> + +DECLARE_POOL(pool_head_quic_tx_packet, "quic_tx_packet", sizeof(struct quic_tx_packet)); +DECLARE_POOL(pool_head_quic_cc_buf, "quic_cc_buf", QUIC_MAX_CC_BUFSIZE); + +static struct quic_tx_packet *qc_build_pkt(unsigned char **pos, const unsigned char *buf_end, + struct quic_enc_level *qel, struct quic_tls_ctx *ctx, + struct list *frms, struct quic_conn *qc, + const struct quic_version *ver, size_t dglen, int pkt_type, + int must_ack, int padding, int probe, int cc, int *err); + +static void quic_packet_encrypt(unsigned char *payload, size_t payload_len, + unsigned char *aad, size_t aad_len, uint64_t pn, + struct quic_tls_ctx *tls_ctx, struct quic_conn *qc, + int *fail) +{ + unsigned char iv[QUIC_TLS_IV_LEN]; + unsigned char *tx_iv = tls_ctx->tx.iv; + size_t tx_iv_sz = tls_ctx->tx.ivlen; + struct enc_debug_info edi; + + TRACE_ENTER(QUIC_EV_CONN_ENCPKT, qc); + *fail = 0; + + quic_aead_iv_build(iv, sizeof iv, tx_iv, tx_iv_sz, pn); + + if (!quic_tls_encrypt(payload, payload_len, aad, aad_len, + tls_ctx->tx.ctx, tls_ctx->tx.aead, iv)) { + TRACE_ERROR("QUIC packet encryption failed", QUIC_EV_CONN_ENCPKT, qc); + *fail = 1; + enc_debug_info_init(&edi, payload, payload_len, aad, aad_len, pn); + } + + TRACE_LEAVE(QUIC_EV_CONN_ENCPKT, qc); +} + +/* Free <pkt> TX packet and its attached frames. 
+ * It is the responsibility of the caller to remove this packet from any
+ * data structure it was possibly attached to.
+ */
+static inline void free_quic_tx_packet(struct quic_conn *qc,
+                                       struct quic_tx_packet *pkt)
+{
+	struct quic_frame *frm, *frmbak;
+
+	TRACE_ENTER(QUIC_EV_CONN_TXPKT, qc);
+
+	if (!pkt)
+		goto leave;
+
+	list_for_each_entry_safe(frm, frmbak, &pkt->frms, list)
+		qc_frm_free(qc, &frm);
+	pool_free(pool_head_quic_tx_packet, pkt);
+
+ leave:
+	TRACE_LEAVE(QUIC_EV_CONN_TXPKT, qc);
+}
+
+/* Allocate Tx buffer from <qc> quic-conn if needed.
+ *
+ * Returns allocated buffer or NULL on error.
+ */
+struct buffer *qc_txb_alloc(struct quic_conn *qc)
+{
+	struct buffer *buf = &qc->tx.buf;
+	if (!b_alloc(buf))
+		return NULL;
+
+	return buf;
+}
+
+/* Free Tx buffer from <qc> if it is empty. */
+void qc_txb_release(struct quic_conn *qc)
+{
+	struct buffer *buf = &qc->tx.buf;
+
+	/* For the moment, the sending function is responsible for purging the
+	 * buffer entirely. This may change in the future, but it requires
+	 * being able to reuse old data.
+	 * For the moment we do not care to leave data in the buffer for
+	 * a connection which is supposed to be killed asap.
+	 */
+	BUG_ON_HOT(buf && b_data(buf));
+
+	if (!b_data(buf)) {
+		b_free(buf);
+		offer_buffers(NULL, 1);
+	}
+}
+
+/* Return the TX buffer dedicated to the "connection close" datagram if an
+ * immediate close is required, allocating it first if needed; otherwise,
+ * directly allocate a regular TX buffer.
+ */
+struct buffer *qc_get_txb(struct quic_conn *qc)
+{
+	struct buffer *buf;
+
+	if (qc->flags & QUIC_FL_CONN_IMMEDIATE_CLOSE) {
+		TRACE_PROTO("Immediate close required", QUIC_EV_CONN_PHPKTS, qc);
+		buf = &qc->tx.cc_buf;
+		if (b_is_null(buf)) {
+			qc->tx.cc_buf_area = pool_alloc(pool_head_quic_cc_buf);
+			if (!qc->tx.cc_buf_area)
+				goto err;
+		}
+
+		/* In every case, initialize ->tx.cc_buf */
+		qc->tx.cc_buf = b_make(qc->tx.cc_buf_area, QUIC_MAX_CC_BUFSIZE, 0, 0);
+	}
+	else {
+		buf = qc_txb_alloc(qc);
+		if (!buf)
+			goto err;
+	}
+
+	return buf;
+ err:
+	return NULL;
+}
+
+/* Commit a datagram payload written into <buf> of length <length>. <first_pkt>
+ * must contain the address of the first packet stored in the payload.
+ *
+ * The caller is responsible for ensuring there is enough space in the buffer.
+ */
+static void qc_txb_store(struct buffer *buf, uint16_t length,
+                         struct quic_tx_packet *first_pkt)
+{
+	const size_t hdlen = sizeof(uint16_t) + sizeof(void *);
+	BUG_ON_HOT(b_contig_space(buf) < hdlen); /* this must not happen */
+
+	write_u16(b_tail(buf), length);
+	write_ptr(b_tail(buf) + sizeof(length), first_pkt);
+	b_add(buf, hdlen + length);
+}
+
+/* Returns 1 if a packet may be built for <qc> from <qel> encryption level
+ * with <frms> as ack-eliciting frame list to send, 0 if not.
+ * <cc> must be 1 if an immediate close was asked, 0 if not.
+ * <probe> must be 1 if a probing packet is required, 0 if not.
+ * Also set <*must_ack> to inform the caller if an acknowledgement should be sent.
+ */
+static int qc_may_build_pkt(struct quic_conn *qc, struct list *frms,
+                            struct quic_enc_level *qel, int cc, int probe,
+                            int *must_ack)
+{
+	int force_ack = qel == qc->iel || qel == qc->hel;
+	int nb_aepkts_since_last_ack = qel->pktns->rx.nb_aepkts_since_last_ack;
+
+	/* An acknowledgement must be sent if this has been forced by the caller,
+	 * typically during the handshake when the packets must be acknowledged as
+	 * soon as possible.
This is also the case when the ack delay timer has been + * triggered, or at least every QUIC_MAX_RX_AEPKTS_SINCE_LAST_ACK packets. + */ + *must_ack = (qc->flags & QUIC_FL_CONN_ACK_TIMER_FIRED) || + ((qel->pktns->flags & QUIC_FL_PKTNS_ACK_REQUIRED) && + (force_ack || nb_aepkts_since_last_ack >= QUIC_MAX_RX_AEPKTS_SINCE_LAST_ACK)); + + TRACE_PRINTF(TRACE_LEVEL_DEVELOPER, QUIC_EV_CONN_PHPKTS, qc, 0, 0, 0, + "has_sec=%d cc=%d probe=%d must_ack=%d frms=%d prep_in_fligh=%llu cwnd=%llu", + quic_tls_has_tx_sec(qel), cc, probe, *must_ack, LIST_ISEMPTY(frms), + (ullong)qc->path->prep_in_flight, (ullong)qc->path->cwnd); + + /* Do not build any more packet if the TX secrets are not available or + * if there is nothing to send, i.e. if no CONNECTION_CLOSE or ACK are required + * and if there is no more packets to send upon PTO expiration + * and if there is no more ack-eliciting frames to send or in flight + * congestion control limit is reached for prepared data + */ + if (!quic_tls_has_tx_sec(qel) || + (!cc && !probe && !*must_ack && + (LIST_ISEMPTY(frms) || qc->path->prep_in_flight >= qc->path->cwnd))) { + return 0; + } + + return 1; +} + +/* Prepare as much as possible QUIC packets for sending from prebuilt frames + * <frms>. Each packet is stored in a distinct datagram written to <buf>. + * + * Each datagram is prepended by a two fields header : the datagram length and + * the address of the packet contained in the datagram. + * + * Returns the number of bytes prepared in packets if succeeded (may be 0), or + * -1 if something wrong happened. + */ +static int qc_prep_app_pkts(struct quic_conn *qc, struct buffer *buf, + struct list *frms) +{ + int ret = -1, cc; + struct quic_enc_level *qel; + unsigned char *end, *pos; + struct quic_tx_packet *pkt; + size_t total; + + TRACE_ENTER(QUIC_EV_CONN_PHPKTS, qc); + + qel = qc->ael; + total = 0; + pos = (unsigned char *)b_tail(buf); + cc = qc->flags & QUIC_FL_CONN_IMMEDIATE_CLOSE; + /* Each datagram is prepended with its length followed by the address + * of the first packet in the datagram (QUIC_DGRAM_HEADLEN). + */ + while ((!cc && b_contig_space(buf) >= (int)qc->path->mtu + QUIC_DGRAM_HEADLEN) || + (cc && b_contig_space(buf) >= QUIC_MIN_CC_PKTSIZE + QUIC_DGRAM_HEADLEN)) { + int err, probe, must_ack; + + TRACE_PROTO("TX prep app pkts", QUIC_EV_CONN_PHPKTS, qc, qel, frms); + probe = 0; + /* We do not probe if an immediate close was asked */ + if (!cc) + probe = qel->pktns->tx.pto_probe; + + if (!qc_may_build_pkt(qc, frms, qel, cc, probe, &must_ack)) + break; + + /* Leave room for the datagram header */ + pos += QUIC_DGRAM_HEADLEN; + if (cc) { + end = pos + QUIC_MIN_CC_PKTSIZE; + } + else if (!quic_peer_validated_addr(qc) && qc_is_listener(qc)) { + end = pos + QUIC_MIN(qc->path->mtu, quic_may_send_bytes(qc)); + } + else { + end = pos + qc->path->mtu; + } + + pkt = qc_build_pkt(&pos, end, qel, &qel->tls_ctx, frms, qc, NULL, 0, + QUIC_PACKET_TYPE_SHORT, must_ack, 0, probe, cc, &err); + switch (err) { + case -3: + qc_purge_txbuf(qc, buf); + goto leave; + case -2: + // trace already emitted by function above + goto leave; + case -1: + /* As we provide qc_build_pkt() with an enough big buffer to fulfill an + * MTU, we are here because of the congestion control window. There is + * no need to try to reuse this buffer. + */ + TRACE_PROTO("could not prepare anymore packet", QUIC_EV_CONN_PHPKTS, qc, qel); + goto out; + default: + break; + } + + /* This is to please to GCC. 
We cannot have (err >= 0 && !pkt) */ + BUG_ON(!pkt); + + if (qc->flags & QUIC_FL_CONN_RETRANS_OLD_DATA) + pkt->flags |= QUIC_FL_TX_PACKET_PROBE_WITH_OLD_DATA; + + total += pkt->len; + + /* Write datagram header. */ + qc_txb_store(buf, pkt->len, pkt); + /* Build only one datagram when an immediate close is required. */ + if (cc) + break; + } + + out: + if (total && cc) { + BUG_ON(buf != &qc->tx.cc_buf); + qc->tx.cc_dgram_len = total; + } + ret = total; + leave: + TRACE_LEAVE(QUIC_EV_CONN_PHPKTS, qc); + return ret; +} + +/* Free all frames in <l> list. In addition also remove all these frames + * from the original ones if they are the results of duplications. + */ +static inline void qc_free_frm_list(struct quic_conn *qc, struct list *l) +{ + struct quic_frame *frm, *frmbak; + + TRACE_ENTER(QUIC_EV_CONN_TXPKT, qc); + + list_for_each_entry_safe(frm, frmbak, l, list) { + LIST_DEL_INIT(&frm->ref); + qc_frm_free(qc, &frm); + } + + TRACE_LEAVE(QUIC_EV_CONN_TXPKT, qc); +} + +/* Free <pkt> TX packet and all the packets coalesced to it. */ +static inline void qc_free_tx_coalesced_pkts(struct quic_conn *qc, + struct quic_tx_packet *p) +{ + struct quic_tx_packet *pkt, *nxt_pkt; + + TRACE_ENTER(QUIC_EV_CONN_TXPKT, qc); + + for (pkt = p; pkt; pkt = nxt_pkt) { + qc_free_frm_list(qc, &pkt->frms); + nxt_pkt = pkt->next; + pool_free(pool_head_quic_tx_packet, pkt); + } + + TRACE_LEAVE(QUIC_EV_CONN_TXPKT, qc); +} + +/* Purge <buf> TX buffer from its prepare packets. */ +static void qc_purge_tx_buf(struct quic_conn *qc, struct buffer *buf) +{ + while (b_contig_data(buf, 0)) { + uint16_t dglen; + struct quic_tx_packet *pkt; + size_t headlen = sizeof dglen + sizeof pkt; + + dglen = read_u16(b_head(buf)); + pkt = read_ptr(b_head(buf) + sizeof dglen); + qc_free_tx_coalesced_pkts(qc, pkt); + b_del(buf, dglen + headlen); + } + + BUG_ON(b_data(buf)); +} + +/* Send datagrams stored in <buf>. + * + * This function returns 1 for success. On error, there is several behavior + * depending on underlying sendto() error : + * - for an unrecoverable error, 0 is returned and connection is killed. + * - a transient error is handled differently if connection has its owned + * socket. If this is the case, 0 is returned and socket is subscribed on the + * poller. The other case is assimilated to a success case with 1 returned. + * Remaining data are purged from the buffer and will eventually be detected + * as lost which gives the opportunity to retry sending. + */ +int qc_send_ppkts(struct buffer *buf, struct ssl_sock_ctx *ctx) +{ + int ret = 0; + struct quic_conn *qc; + char skip_sendto = 0; + + qc = ctx->qc; + TRACE_ENTER(QUIC_EV_CONN_SPPKTS, qc); + while (b_contig_data(buf, 0)) { + unsigned char *pos; + struct buffer tmpbuf = { }; + struct quic_tx_packet *first_pkt, *pkt, *next_pkt; + uint16_t dglen; + size_t headlen = sizeof dglen + sizeof first_pkt; + unsigned int time_sent; + + pos = (unsigned char *)b_head(buf); + dglen = read_u16(pos); + BUG_ON_HOT(!dglen); /* this should not happen */ + + pos += sizeof dglen; + first_pkt = read_ptr(pos); + pos += sizeof first_pkt; + tmpbuf.area = (char *)pos; + tmpbuf.size = tmpbuf.data = dglen; + + TRACE_PROTO("TX dgram", QUIC_EV_CONN_SPPKTS, qc); + /* If sendto is on error just skip the call to it for the rest + * of the loop but continue to purge the buffer. Data will be + * transmitted when QUIC packets are detected as lost on our + * side. + * + * TODO use fd-monitoring to detect when send operation can be + * retry. 
This should improve the bandwidth without relying on + * retransmission timer. However, it requires a major rework on + * quic-conn fd management. + */ + if (!skip_sendto) { + int ret = qc_snd_buf(qc, &tmpbuf, tmpbuf.data, 0); + if (ret < 0) { + TRACE_ERROR("sendto fatal error", QUIC_EV_CONN_SPPKTS, qc, first_pkt); + qc_kill_conn(qc); + qc_free_tx_coalesced_pkts(qc, first_pkt); + b_del(buf, dglen + headlen); + qc_purge_tx_buf(qc, buf); + goto leave; + } + else if (!ret) { + /* Connection owned socket : poller will wake us up when transient error is cleared. */ + if (qc_test_fd(qc)) { + TRACE_ERROR("sendto error, subscribe to poller", QUIC_EV_CONN_SPPKTS, qc); + goto leave; + } + + /* No connection owned-socket : rely on retransmission to retry sending. */ + skip_sendto = 1; + TRACE_ERROR("sendto error, simulate sending for the rest of data", QUIC_EV_CONN_SPPKTS, qc); + } + } + + b_del(buf, dglen + headlen); + qc->bytes.tx += tmpbuf.data; + time_sent = now_ms; + + for (pkt = first_pkt; pkt; pkt = next_pkt) { + /* RFC 9000 14.1 Initial datagram size + * a server MUST expand the payload of all UDP datagrams carrying ack-eliciting + * Initial packets to at least the smallest allowed maximum datagram size of + * 1200 bytes. + */ + qc->cntrs.sent_pkt++; + BUG_ON_HOT(pkt->type == QUIC_PACKET_TYPE_INITIAL && + (pkt->flags & QUIC_FL_TX_PACKET_ACK_ELICITING) && + dglen < QUIC_INITIAL_PACKET_MINLEN); + + pkt->time_sent = time_sent; + if (pkt->flags & QUIC_FL_TX_PACKET_ACK_ELICITING) { + pkt->pktns->tx.time_of_last_eliciting = time_sent; + qc->path->ifae_pkts++; + if (qc->flags & QUIC_FL_CONN_IDLE_TIMER_RESTARTED_AFTER_READ) + qc_idle_timer_rearm(qc, 0, 0); + } + if (!(qc->flags & QUIC_FL_CONN_CLOSING) && + (pkt->flags & QUIC_FL_TX_PACKET_CC)) { + qc->flags |= QUIC_FL_CONN_CLOSING; + qc_detach_th_ctx_list(qc, 1); + + /* RFC 9000 10.2. Immediate Close: + * The closing and draining connection states exist to ensure + * that connections close cleanly and that delayed or reordered + * packets are properly discarded. These states SHOULD persist + * for at least three times the current PTO interval... + * + * Rearm the idle timeout only one time when entering closing + * state. + */ + qc_idle_timer_do_rearm(qc, 0); + if (qc->timer_task) { + task_destroy(qc->timer_task); + qc->timer_task = NULL; + } + } + qc->path->in_flight += pkt->in_flight_len; + pkt->pktns->tx.in_flight += pkt->in_flight_len; + if (pkt->in_flight_len) + qc_set_timer(qc); + TRACE_PROTO("TX pkt", QUIC_EV_CONN_SPPKTS, qc, pkt); + next_pkt = pkt->next; + quic_tx_packet_refinc(pkt); + eb64_insert(&pkt->pktns->tx.pkts, &pkt->pn_node); + } + } + + ret = 1; +leave: + TRACE_LEAVE(QUIC_EV_CONN_SPPKTS, qc); + + return ret; +} + +/* Flush txbuf for <qc> connection. This must be called prior to a packet + * preparation when txbuf contains older data. A send will be conducted for + * these data. + * + * Returns 1 on success : buffer is empty and can be use for packet + * preparation. On error 0 is returned. + */ +int qc_purge_txbuf(struct quic_conn *qc, struct buffer *buf) +{ + TRACE_ENTER(QUIC_EV_CONN_TXPKT, qc); + + /* This operation can only be conducted if txbuf is not empty. This + * case only happens for connection with their owned socket due to an + * older transient sendto() error. 
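
Seen from a caller, the return contract described above boils down to the following sketch; send_or_subscribe_example is illustrative and not part of the source, mirroring how qc_send_app_pkts() below consumes the result:

static int send_or_subscribe_example(struct quic_conn *qc, struct buffer *buf)
{
	if (!qc_send_ppkts(buf, qc->xprt_ctx)) {
		/* fatal error (connection killed) or transient error with a
		 * connection-owned socket (poller subscribed): stop for now,
		 * a retry will come from the poller or from loss detection.
		 */
		if (qc->flags & QUIC_FL_CONN_TO_KILL)
			qc_txb_release(qc);
		return 0;
	}
	return 1;
}
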
+ */ + BUG_ON(!qc_test_fd(qc)); + + if (b_data(buf) && !qc_send_ppkts(buf, qc->xprt_ctx)) { + if (qc->flags & QUIC_FL_CONN_TO_KILL) + qc_txb_release(qc); + TRACE_DEVEL("leaving in error", QUIC_EV_CONN_TXPKT, qc); + return 0; + } + + TRACE_LEAVE(QUIC_EV_CONN_TXPKT, qc); + return 1; +} + +/* Try to send application frames from list <frms> on connection <qc>. + * + * Use qc_send_app_probing wrapper when probing with old data. + * + * Returns 1 on success. Some data might not have been sent due to congestion, + * in this case they are left in <frms> input list. The caller may subscribe on + * quic-conn to retry later. + * + * Returns 0 on critical error. + * TODO review and classify more distinctly transient from definitive errors to + * allow callers to properly handle it. + */ +int qc_send_app_pkts(struct quic_conn *qc, struct list *frms) +{ + int status = 0, ret; + struct buffer *buf; + + TRACE_ENTER(QUIC_EV_CONN_TXPKT, qc); + + buf = qc_get_txb(qc); + if (!buf) { + TRACE_ERROR("could not get a buffer", QUIC_EV_CONN_TXPKT, qc); + goto err; + } + + if (b_data(buf) && !qc_purge_txbuf(qc, buf)) + goto err; + + /* Prepare and send packets until we could not further prepare packets. */ + do { + /* Currently buf cannot be non-empty at this stage. Even if a + * previous sendto() has failed it is emptied to simulate + * packet emission and rely on QUIC lost detection to try to + * emit it. + */ + BUG_ON_HOT(b_data(buf)); + b_reset(buf); + + ret = qc_prep_app_pkts(qc, buf, frms); + + if (b_data(buf) && !qc_send_ppkts(buf, qc->xprt_ctx)) { + if (qc->flags & QUIC_FL_CONN_TO_KILL) + qc_txb_release(qc); + goto err; + } + } while (ret > 0); + + qc_txb_release(qc); + if (ret < 0) + goto err; + + status = 1; + TRACE_LEAVE(QUIC_EV_CONN_TXPKT, qc); + return status; + + err: + TRACE_DEVEL("leaving in error", QUIC_EV_CONN_TXPKT, qc); + return 0; +} + +/* Try to send application frames from list <frms> on connection <qc>. Use this + * function when probing is required. + * + * Returns the result from qc_send_app_pkts function. + */ +static forceinline int qc_send_app_probing(struct quic_conn *qc, + struct list *frms) +{ + int ret; + + TRACE_ENTER(QUIC_EV_CONN_TXPKT, qc); + + TRACE_PROTO("preparing old data (probing)", QUIC_EV_CONN_FRMLIST, qc, frms); + qc->flags |= QUIC_FL_CONN_RETRANS_OLD_DATA; + ret = qc_send_app_pkts(qc, frms); + qc->flags &= ~QUIC_FL_CONN_RETRANS_OLD_DATA; + + TRACE_LEAVE(QUIC_EV_CONN_TXPKT, qc); + return ret; +} + +/* Try to send application frames from list <frms> on connection <qc>. This + * function is provided for MUX upper layer usage only. + * + * Returns the result from qc_send_app_pkts function. + */ +int qc_send_mux(struct quic_conn *qc, struct list *frms) +{ + int ret; + + TRACE_ENTER(QUIC_EV_CONN_TXPKT, qc); + BUG_ON(qc->mux_state != QC_MUX_READY); /* Only MUX can uses this function so it must be ready. */ + + if (qc->conn->flags & CO_FL_SOCK_WR_SH) { + qc->conn->flags |= CO_FL_ERROR | CO_FL_SOCK_RD_SH; + TRACE_DEVEL("connection on error", QUIC_EV_CONN_TXPKT, qc); + return 0; + } + + /* Try to send post handshake frames first unless on 0-RTT. 
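
The TX buffer lifecycle used by qc_send_app_pkts() above can be summarized as follows; txb_roundtrip_example is an illustrative sketch, not part of the source:

static int txb_roundtrip_example(struct quic_conn *qc)
{
	struct buffer *buf = qc_get_txb(qc);

	if (!buf)
		return 0;
	/* flush any leftover from an older transient sendto() error */
	if (b_data(buf) && !qc_purge_txbuf(qc, buf))
		return 0;
	/* ... prepare datagrams into <buf> and send them ... */
	qc_txb_release(qc);
	return 1;
}
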
*/ + if ((qc->flags & QUIC_FL_CONN_NEED_POST_HANDSHAKE_FRMS) && + qc->state >= QUIC_HS_ST_COMPLETE) { + quic_build_post_handshake_frames(qc); + qc_send_app_pkts(qc, &qc->ael->pktns->tx.frms); + } + + TRACE_STATE("preparing data (from MUX)", QUIC_EV_CONN_TXPKT, qc); + qc->flags |= QUIC_FL_CONN_TX_MUX_CONTEXT; + ret = qc_send_app_pkts(qc, frms); + qc->flags &= ~QUIC_FL_CONN_TX_MUX_CONTEXT; + + TRACE_LEAVE(QUIC_EV_CONN_TXPKT, qc); + return ret; +} + +/* Return the encryption level following the one which contains <el> list head + * depending on <retrans> TX mode (retranmission or not). + */ +static inline struct quic_enc_level *qc_list_next_qel(struct list *el, int retrans) +{ + return !retrans ? LIST_NEXT(el, struct quic_enc_level *, list) : + LIST_NEXT(el, struct quic_enc_level *, retrans); +} + +/* Return the encryption level following <qel> depending on <retrans> TX mode + * (retranmission or not). + */ +static inline struct quic_enc_level *qc_next_qel(struct quic_enc_level *qel, int retrans) +{ + struct list *el = !retrans ? &qel->list : &qel->retrans; + + return qc_list_next_qel(el, retrans); +} + +/* Return 1 if <qel> is at the head of its list, 0 if not. */ +static inline int qc_qel_is_head(struct quic_enc_level *qel, struct list *l, + int retrans) +{ + return !retrans ? &qel->list == l : &qel->retrans == l; +} + +/* Select <*tls_ctx>, <*frms> and <*ver> for the encryption level <qel> of <qc> QUIC + * connection, depending on its state, especially the negotiated version and if + * retransmissions are required. If this the case <qels> is the list of encryption + * levels to used, or NULL if no retransmissions are required. + * Never fails. + */ +static inline void qc_select_tls_frms_ver(struct quic_conn *qc, + struct quic_enc_level *qel, + struct quic_tls_ctx **tls_ctx, + struct list **frms, + const struct quic_version **ver, + struct list *qels) +{ + if (qc->negotiated_version) { + *ver = qc->negotiated_version; + if (qel == qc->iel) + *tls_ctx = qc->nictx; + else + *tls_ctx = &qel->tls_ctx; + } + else { + *ver = qc->original_version; + *tls_ctx = &qel->tls_ctx; + } + + if (!qels) + *frms = &qel->pktns->tx.frms; + else + *frms = qel->retrans_frms; +} + +/* Prepare as much as possible QUIC datagrams/packets for sending from <qels> + * list of encryption levels. Several packets can be coalesced into a single + * datagram. The result is written into <buf>. Note that if <qels> is NULL, + * the encryption levels which will be used are those currently allocated + * and attached to the connection. + * + * Each datagram is prepended by a two fields header : the datagram length and + * the address of first packet in the datagram. + * + * Returns the number of bytes prepared in datragrams/packets if succeeded + * (may be 0), or -1 if something wrong happened. + */ +int qc_prep_hpkts(struct quic_conn *qc, struct buffer *buf, struct list *qels) +{ + int ret, cc, retrans, padding; + struct quic_tx_packet *first_pkt, *prv_pkt; + unsigned char *end, *pos; + uint16_t dglen; + size_t total; + struct list *qel_list; + struct quic_enc_level *qel; + + TRACE_ENTER(QUIC_EV_CONN_IO_CB, qc); + /* Currently qc_prep_pkts() does not handle buffer wrapping so the + * caller must ensure that buf is reset. + */ + BUG_ON_HOT(buf->head || buf->data); + + ret = -1; + cc = qc->flags & QUIC_FL_CONN_IMMEDIATE_CLOSE; + retrans = !!qels; + padding = 0; + first_pkt = prv_pkt = NULL; + end = pos = (unsigned char *)b_head(buf); + dglen = 0; + total = 0; + + qel_list = qels ? 
qels : &qc->qel_list; + qel = qc_list_next_qel(qel_list, retrans); + while (!qc_qel_is_head(qel, qel_list, retrans)) { + struct quic_tls_ctx *tls_ctx; + const struct quic_version *ver; + struct list *frms, *next_frms; + struct quic_enc_level *next_qel; + + if (qel == qc->eel) { + /* Next encryption level */ + qel = qc_next_qel(qel, retrans); + continue; + } + + qc_select_tls_frms_ver(qc, qel, &tls_ctx, &frms, &ver, qels); + + next_qel = qc_next_qel(qel, retrans); + next_frms = qc_qel_is_head(next_qel, qel_list, retrans) ? NULL : + !qels ? &next_qel->pktns->tx.frms : next_qel->retrans_frms; + + /* Build as much as datagrams at <qel> encryption level. + * Each datagram is prepended with its length followed by the address + * of the first packet in the datagram (QUIC_DGRAM_HEADLEN). + */ + while ((!cc && b_contig_space(buf) >= (int)qc->path->mtu + QUIC_DGRAM_HEADLEN) || + (cc && b_contig_space(buf) >= QUIC_MIN_CC_PKTSIZE + QUIC_DGRAM_HEADLEN) || prv_pkt) { + int err, probe, must_ack; + enum quic_pkt_type pkt_type; + struct quic_tx_packet *cur_pkt; + + TRACE_PROTO("TX prep pkts", QUIC_EV_CONN_PHPKTS, qc, qel); + probe = 0; + /* We do not probe if an immediate close was asked */ + if (!cc) + probe = qel->pktns->tx.pto_probe; + + if (!qc_may_build_pkt(qc, frms, qel, cc, probe, &must_ack)) { + if (prv_pkt && qc_qel_is_head(next_qel, qel_list, retrans)) { + qc_txb_store(buf, dglen, first_pkt); + /* Build only one datagram when an immediate close is required. */ + if (cc) + goto out; + } + + TRACE_DEVEL("next encryption level", QUIC_EV_CONN_PHPKTS, qc); + break; + } + + if (!prv_pkt) { + /* Leave room for the datagram header */ + pos += QUIC_DGRAM_HEADLEN; + if (cc) { + end = pos + QUIC_MIN_CC_PKTSIZE; + } + else if (!quic_peer_validated_addr(qc) && qc_is_listener(qc)) { + end = pos + QUIC_MIN(qc->path->mtu, quic_may_send_bytes(qc)); + } + else { + end = pos + qc->path->mtu; + } + } + + /* RFC 9000 14.1 Initial datagram size + * a server MUST expand the payload of all UDP datagrams carrying ack-eliciting + * Initial packets to at least the smallest allowed maximum datagram size of + * 1200 bytes. + * + * Ensure that no ack-eliciting packets are sent into too small datagrams + */ + if (qel == qc->iel && !LIST_ISEMPTY(frms)) { + if (end - pos < QUIC_INITIAL_PACKET_MINLEN) { + TRACE_PROTO("No more enough room to build an Initial packet", + QUIC_EV_CONN_PHPKTS, qc); + break; + } + + /* Pad this Initial packet if there is no ack-eliciting frames to send from + * the next packet number space. + */ + if (!next_frms || LIST_ISEMPTY(next_frms)) + padding = 1; + } + + pkt_type = quic_enc_level_pkt_type(qc, qel); + cur_pkt = qc_build_pkt(&pos, end, qel, tls_ctx, frms, + qc, ver, dglen, pkt_type, + must_ack, padding, probe, cc, &err); + switch (err) { + case -3: + if (first_pkt) + qc_txb_store(buf, dglen, first_pkt); + qc_purge_tx_buf(qc, buf); + goto leave; + case -2: + // trace already emitted by function above + goto leave; + case -1: + /* If there was already a correct packet present, set the + * current datagram as prepared into <cbuf>. + */ + if (prv_pkt) + qc_txb_store(buf, dglen, first_pkt); + TRACE_PROTO("could not prepare anymore packet", QUIC_EV_CONN_PHPKTS, qc, qel); + goto out; + default: + break; + } + + /* This is to please to GCC. 
We cannot have (err >= 0 && !cur_pkt) */ + BUG_ON(!cur_pkt); + + total += cur_pkt->len; + dglen += cur_pkt->len; + + if (qc->flags & QUIC_FL_CONN_RETRANS_OLD_DATA) + cur_pkt->flags |= QUIC_FL_TX_PACKET_PROBE_WITH_OLD_DATA; + + /* keep trace of the first packet in the datagram */ + if (!first_pkt) + first_pkt = cur_pkt; + + /* Attach the current one to the previous one and vice versa */ + if (prv_pkt) { + prv_pkt->next = cur_pkt; + cur_pkt->prev = prv_pkt; + cur_pkt->flags |= QUIC_FL_TX_PACKET_COALESCED; + } + + /* If there is no more packet to build for this encryption level, + * select the next one <next_qel>, if any, to coalesce a packet in + * the same datagram, except if <qel> is the Application data + * encryption level which cannot be selected to do that. + */ + if (LIST_ISEMPTY(frms) && qel != qc->ael && + !qc_qel_is_head(next_qel, qel_list, retrans)) { + if (qel == qc->iel && + (!qc_is_listener(qc) || + cur_pkt->flags & QUIC_FL_TX_PACKET_ACK_ELICITING)) + padding = 1; + + prv_pkt = cur_pkt; + break; + } + else { + qc_txb_store(buf, dglen, first_pkt); + /* Build only one datagram when an immediate close is required. */ + if (cc) + goto out; + first_pkt = NULL; + dglen = 0; + padding = 0; + prv_pkt = NULL; + } + } + + /* Next encryption level */ + qel = next_qel; + } + + out: + if (cc && total) { + BUG_ON(buf != &qc->tx.cc_buf); + BUG_ON(dglen != total); + qc->tx.cc_dgram_len = dglen; + } + + ret = total; + leave: + TRACE_LEAVE(QUIC_EV_CONN_PHPKTS, qc); + return ret; +} + +/* Sends handshake packets from up to two encryption levels <tel> and <next_te> + * with <tel_frms> and <next_tel_frms> as frame list respectively for <qc> + * QUIC connection. <old_data> is used as boolean to send data already sent but + * not already acknowledged (in flight). + * Returns 1 if succeeded, 0 if not. + */ +int qc_send_hdshk_pkts(struct quic_conn *qc, int old_data, + struct quic_enc_level *qel1, struct quic_enc_level *qel2) +{ + int ret, status = 0; + struct buffer *buf = qc_get_txb(qc); + struct list qels = LIST_HEAD_INIT(qels); + + TRACE_ENTER(QUIC_EV_CONN_TXPKT, qc); + + if (!buf) { + TRACE_ERROR("buffer allocation failed", QUIC_EV_CONN_TXPKT, qc); + goto leave; + } + + if (b_data(buf) && !qc_purge_txbuf(qc, buf)) { + TRACE_ERROR("Could not purge TX buffer", QUIC_EV_CONN_TXPKT, qc); + goto out; + } + + /* Currently buf cannot be non-empty at this stage. Even if a previous + * sendto() has failed it is emptied to simulate packet emission and + * rely on QUIC lost detection to try to emit it. 
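
For context, a sketch of how a caller is expected to drive this function when probing; probe_hdshk_example is illustrative only, qc_dgrams_retransmit() further below does the real version:

static int probe_hdshk_example(struct quic_conn *qc,
                               struct list *ifrms, struct list *hfrms)
{
	/* attach the prebuilt retransmission lists to their levels */
	qc->iel->retrans_frms = ifrms;
	if (qc->hel)
		qc->hel->retrans_frms = hfrms;
	/* 1 means "probe with old data" */
	return qc_send_hdshk_pkts(qc, 1, qc->iel, qc->hel);
}
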
+	 */
+	BUG_ON_HOT(b_data(buf));
+	b_reset(buf);
+
+	if (old_data) {
+		TRACE_STATE("old data for probing asked", QUIC_EV_CONN_TXPKT, qc);
+		qc->flags |= QUIC_FL_CONN_RETRANS_OLD_DATA;
+	}
+
+	if (qel1) {
+		BUG_ON(LIST_INLIST(&qel1->retrans));
+		LIST_APPEND(&qels, &qel1->retrans);
+	}
+
+	if (qel2) {
+		BUG_ON(LIST_INLIST(&qel2->retrans));
+		LIST_APPEND(&qels, &qel2->retrans);
+	}
+
+	ret = qc_prep_hpkts(qc, buf, &qels);
+	if (ret == -1) {
+		qc_txb_release(qc);
+		TRACE_ERROR("Could not build some packets", QUIC_EV_CONN_TXPKT, qc);
+		goto out;
+	}
+
+	if (ret && !qc_send_ppkts(buf, qc->xprt_ctx)) {
+		if (qc->flags & QUIC_FL_CONN_TO_KILL)
+			qc_txb_release(qc);
+		TRACE_ERROR("Could not send some packets", QUIC_EV_CONN_TXPKT, qc);
+		goto out;
+	}
+
+	qc_txb_release(qc);
+	status = 1;
+
+ out:
+	if (qel1) {
+		LIST_DEL_INIT(&qel1->retrans);
+		qel1->retrans_frms = NULL;
+	}
+
+	if (qel2) {
+		LIST_DEL_INIT(&qel2->retrans);
+		qel2->retrans_frms = NULL;
+	}
+
+	TRACE_STATE("no more need of old data for probing", QUIC_EV_CONN_TXPKT, qc);
+	qc->flags &= ~QUIC_FL_CONN_RETRANS_OLD_DATA;
+ leave:
+	TRACE_LEAVE(QUIC_EV_CONN_TXPKT, qc);
+	return status;
+}
+
+/* Retransmit up to two datagrams depending on packet number space.
+ * Returns 1 if succeeded, 0 if not.
+ */
+int qc_dgrams_retransmit(struct quic_conn *qc)
+{
+	int ret = 0;
+	int sret;
+	struct quic_pktns *ipktns = qc->ipktns;
+	struct quic_pktns *hpktns = qc->hpktns;
+	struct quic_pktns *apktns = qc->apktns;
+
+	TRACE_ENTER(QUIC_EV_CONN_TXPKT, qc);
+
+	/* Note that if the Initial packet number space is not discarded,
+	 * this is also the case for the Handshake packet number space.
+	 */
+	if (ipktns && (ipktns->flags & QUIC_FL_PKTNS_PROBE_NEEDED)) {
+		int i;
+
+		for (i = 0; i < QUIC_MAX_NB_PTO_DGRAMS; i++) {
+			struct list ifrms = LIST_HEAD_INIT(ifrms);
+			struct list hfrms = LIST_HEAD_INIT(hfrms);
+			struct list qels = LIST_HEAD_INIT(qels);
+
+			qc_prep_hdshk_fast_retrans(qc, &ifrms, &hfrms);
+			TRACE_DEVEL("Avail. ack eliciting frames", QUIC_EV_CONN_FRMLIST, qc, &ifrms);
+			TRACE_DEVEL("Avail. ack eliciting frames", QUIC_EV_CONN_FRMLIST, qc, &hfrms);
+			if (!LIST_ISEMPTY(&ifrms)) {
+				ipktns->tx.pto_probe = 1;
+				if (!LIST_ISEMPTY(&hfrms))
+					hpktns->tx.pto_probe = 1;
+				qc->iel->retrans_frms = &ifrms;
+				if (qc->hel)
+					qc->hel->retrans_frms = &hfrms;
+				sret = qc_send_hdshk_pkts(qc, 1, qc->iel, qc->hel);
+				qc_free_frm_list(qc, &ifrms);
+				qc_free_frm_list(qc, &hfrms);
+				if (!sret)
+					goto leave;
+			}
+			else {
+				/* We are in the case where the anti-amplification limit will be
+				 * reached after having sent this datagram or some handshake frames
+				 * could not be allocated. There is no need to send more than one
+				 * datagram.
+				 */
+				ipktns->tx.pto_probe = 1;
+				qc->iel->retrans_frms = &ifrms;
+				sret = qc_send_hdshk_pkts(qc, 0, qc->iel, NULL);
+				qc_free_frm_list(qc, &ifrms);
+				qc_free_frm_list(qc, &hfrms);
+				if (!sret)
+					goto leave;
+
+				break;
+			}
+		}
+		TRACE_STATE("no more need to probe Initial packet number space",
+		            QUIC_EV_CONN_TXPKT, qc);
+		ipktns->flags &= ~QUIC_FL_PKTNS_PROBE_NEEDED;
+		if (hpktns)
+			hpktns->flags &= ~QUIC_FL_PKTNS_PROBE_NEEDED;
+	}
+	else {
+		int i;
+
+		if (hpktns && (hpktns->flags & QUIC_FL_PKTNS_PROBE_NEEDED)) {
+			hpktns->tx.pto_probe = 0;
+			for (i = 0; i < QUIC_MAX_NB_PTO_DGRAMS; i++) {
+				struct list frms1 = LIST_HEAD_INIT(frms1);
+
+				qc_prep_fast_retrans(qc, hpktns, &frms1, NULL);
+				TRACE_DEVEL("Avail.
ack eliciting frames", QUIC_EV_CONN_FRMLIST, qc, &frms1); + if (!LIST_ISEMPTY(&frms1)) { + hpktns->tx.pto_probe = 1; + qc->hel->retrans_frms = &frms1; + sret = qc_send_hdshk_pkts(qc, 1, qc->hel, NULL); + qc_free_frm_list(qc, &frms1); + if (!sret) + goto leave; + } + } + TRACE_STATE("no more need to probe Handshake packet number space", + QUIC_EV_CONN_TXPKT, qc); + hpktns->flags &= ~QUIC_FL_PKTNS_PROBE_NEEDED; + } + else if (apktns && (apktns->flags & QUIC_FL_PKTNS_PROBE_NEEDED)) { + struct list frms2 = LIST_HEAD_INIT(frms2); + struct list frms1 = LIST_HEAD_INIT(frms1); + + apktns->tx.pto_probe = 0; + qc_prep_fast_retrans(qc, apktns, &frms1, &frms2); + TRACE_PROTO("Avail. ack eliciting frames", QUIC_EV_CONN_FRMLIST, qc, &frms1); + TRACE_PROTO("Avail. ack eliciting frames", QUIC_EV_CONN_FRMLIST, qc, &frms2); + + if (!LIST_ISEMPTY(&frms1)) { + apktns->tx.pto_probe = 1; + sret = qc_send_app_probing(qc, &frms1); + qc_free_frm_list(qc, &frms1); + if (!sret) { + qc_free_frm_list(qc, &frms2); + goto leave; + } + } + + if (!LIST_ISEMPTY(&frms2)) { + apktns->tx.pto_probe = 1; + sret = qc_send_app_probing(qc, &frms2); + qc_free_frm_list(qc, &frms2); + if (!sret) + goto leave; + } + TRACE_STATE("no more need to probe 01RTT packet number space", + QUIC_EV_CONN_TXPKT, qc); + apktns->flags &= ~QUIC_FL_PKTNS_PROBE_NEEDED; + } + } + + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_TXPKT, qc); + return ret; +} + +/* + * Send a Version Negotiation packet on response to <pkt> on socket <fd> to + * address <addr>. + * Implementation of RFC9000 6. Version Negotiation + * + * TODO implement a rate-limiting sending of Version Negotiation packets + * + * Returns 0 on success else non-zero + */ +int send_version_negotiation(int fd, struct sockaddr_storage *addr, + struct quic_rx_packet *pkt) +{ + char buf[256]; + int ret = 0, i = 0, j; + uint32_t version; + const socklen_t addrlen = get_addr_len(addr); + + TRACE_ENTER(QUIC_EV_CONN_TXPKT); + /* + * header form + * long header, fixed bit to 0 for Version Negotiation + */ + /* TODO: RAND_bytes() should be replaced? */ + if (RAND_bytes((unsigned char *)buf, 1) != 1) { + TRACE_ERROR("RAND_bytes() error", QUIC_EV_CONN_TXPKT); + goto out; + } + + buf[i++] |= '\x80'; + /* null version for Version Negotiation */ + buf[i++] = '\x00'; + buf[i++] = '\x00'; + buf[i++] = '\x00'; + buf[i++] = '\x00'; + + /* source connection id */ + buf[i++] = pkt->scid.len; + memcpy(&buf[i], pkt->scid.data, pkt->scid.len); + i += pkt->scid.len; + + /* destination connection id */ + buf[i++] = pkt->dcid.len; + memcpy(&buf[i], pkt->dcid.data, pkt->dcid.len); + i += pkt->dcid.len; + + /* supported version */ + for (j = 0; j < quic_versions_nb; j++) { + version = htonl(quic_versions[j].num); + memcpy(&buf[i], &version, sizeof(version)); + i += sizeof(version); + } + + if (sendto(fd, buf, i, 0, (struct sockaddr *)addr, addrlen) < 0) + goto out; + + ret = 1; + out: + TRACE_LEAVE(QUIC_EV_CONN_TXPKT); + return !ret; +} + +/* Send a stateless reset packet depending on <pkt> RX packet information + * from <fd> UDP socket to <dst> + * Return 1 if succeeded, 0 if not. 
+ */ +int send_stateless_reset(struct listener *l, struct sockaddr_storage *dstaddr, + struct quic_rx_packet *rxpkt) +{ + int ret = 0, pktlen, rndlen; + unsigned char pkt[64]; + const socklen_t addrlen = get_addr_len(dstaddr); + struct proxy *prx; + struct quic_counters *prx_counters; + + TRACE_ENTER(QUIC_EV_STATELESS_RST); + + prx = l->bind_conf->frontend; + prx_counters = EXTRA_COUNTERS_GET(prx->extra_counters_fe, &quic_stats_module); + /* 10.3 Stateless Reset (https://www.rfc-editor.org/rfc/rfc9000.html#section-10.3) + * The resulting minimum size of 21 bytes does not guarantee that a Stateless + * Reset is difficult to distinguish from other packets if the recipient requires + * the use of a connection ID. To achieve that end, the endpoint SHOULD ensure + * that all packets it sends are at least 22 bytes longer than the minimum + * connection ID length that it requests the peer to include in its packets, + * adding PADDING frames as necessary. This ensures that any Stateless Reset + * sent by the peer is indistinguishable from a valid packet sent to the endpoint. + * An endpoint that sends a Stateless Reset in response to a packet that is + * 43 bytes or shorter SHOULD send a Stateless Reset that is one byte shorter + * than the packet it responds to. + */ + + /* Note that we build at most a 42 bytes QUIC packet to mimic a short packet */ + pktlen = rxpkt->len <= 43 ? rxpkt->len - 1 : 0; + pktlen = QUIC_MAX(QUIC_STATELESS_RESET_PACKET_MINLEN, pktlen); + rndlen = pktlen - QUIC_STATELESS_RESET_TOKEN_LEN; + + /* Put a header of random bytes */ + /* TODO: RAND_bytes() should be replaced */ + if (RAND_bytes(pkt, rndlen) != 1) { + TRACE_ERROR("RAND_bytes() failed", QUIC_EV_STATELESS_RST); + goto leave; + } + + /* Clear the most significant bit, and set the second one */ + *pkt = (*pkt & ~0x80) | 0x40; + if (!quic_stateless_reset_token_cpy(pkt + rndlen, QUIC_STATELESS_RESET_TOKEN_LEN, + rxpkt->dcid.data, rxpkt->dcid.len)) + goto leave; + + if (sendto(l->rx.fd, pkt, pktlen, 0, (struct sockaddr *)dstaddr, addrlen) < 0) + goto leave; + + ret = 1; + HA_ATOMIC_INC(&prx_counters->stateless_reset_sent); + TRACE_PROTO("stateless reset sent", QUIC_EV_STATELESS_RST, NULL, &rxpkt->dcid); + leave: + TRACE_LEAVE(QUIC_EV_STATELESS_RST); + return ret; +} + +/* Return the long packet type matching with <qv> version and <type> */ +static inline int quic_pkt_type(int type, uint32_t version) +{ + if (version != QUIC_PROTOCOL_VERSION_2) + return type; + + switch (type) { + case QUIC_PACKET_TYPE_INITIAL: + return 1; + case QUIC_PACKET_TYPE_0RTT: + return 2; + case QUIC_PACKET_TYPE_HANDSHAKE: + return 3; + case QUIC_PACKET_TYPE_RETRY: + return 0; + } + + return -1; +} + + +/* Generate a Retry packet and send it on <fd> socket to <addr> in response to + * the Initial <pkt> packet. + * + * Returns 0 on success else non-zero. + */ +int send_retry(int fd, struct sockaddr_storage *addr, + struct quic_rx_packet *pkt, const struct quic_version *qv) +{ + int ret = 0; + unsigned char buf[128]; + int i = 0, token_len; + const socklen_t addrlen = get_addr_len(addr); + struct quic_cid scid; + + TRACE_ENTER(QUIC_EV_CONN_TXPKT); + + /* long header(1) | fixed bit(1) | packet type QUIC_PACKET_TYPE_RETRY(2) | unused random bits(4)*/ + buf[i++] = (QUIC_PACKET_LONG_HEADER_BIT | QUIC_PACKET_FIXED_BIT) | + (quic_pkt_type(QUIC_PACKET_TYPE_RETRY, qv->num) << QUIC_PACKET_TYPE_SHIFT) | + statistical_prng_range(16); + /* version */ + write_n32(&buf[i], qv->num); + i += sizeof(uint32_t); + + /* Use the SCID from <pkt> for Retry DCID. 
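
For reference, the Retry packet being assembled here has the following overall layout, sketched after RFC 9000 17.2.5 (field order only, no sizes assumed beyond those used in this function):

/*   byte 0        : long header + fixed bit + Retry type + 4 random bits
 *   bytes 1..4    : Version
 *   DCID          : length byte + bytes (echo of the client's SCID, above)
 *   SCID          : length byte + bytes (freshly generated below)
 *   Token         : opaque, from quic_generate_retry_token()
 *   trailing 16 B : Retry Integrity Tag (QUIC_TLS_TAG_LEN)
 */
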
 */
+	buf[i++] = pkt->scid.len;
+	memcpy(&buf[i], pkt->scid.data, pkt->scid.len);
+	i += pkt->scid.len;
+
+	/* Generate a new CID to be used as SCID for the Retry packet. */
+	scid.len = QUIC_HAP_CID_LEN;
+	/* TODO: RAND_bytes() should be replaced */
+	if (RAND_bytes(scid.data, scid.len) != 1) {
+		TRACE_ERROR("RAND_bytes() failed", QUIC_EV_CONN_TXPKT);
+		goto out;
+	}
+
+	buf[i++] = scid.len;
+	memcpy(&buf[i], scid.data, scid.len);
+	i += scid.len;
+
+	/* token */
+	if (!(token_len = quic_generate_retry_token(&buf[i], sizeof(buf) - i, qv->num,
+	                                            &pkt->dcid, &pkt->scid, addr))) {
+		TRACE_ERROR("quic_generate_retry_token() failed", QUIC_EV_CONN_TXPKT);
+		goto out;
+	}
+
+	i += token_len;
+
+	/* token integrity tag */
+	if ((sizeof(buf) - i < QUIC_TLS_TAG_LEN) ||
+	    !quic_tls_generate_retry_integrity_tag(pkt->dcid.data,
+	                                           pkt->dcid.len, buf, i, qv)) {
+		TRACE_ERROR("quic_tls_generate_retry_integrity_tag() failed", QUIC_EV_CONN_TXPKT);
+		goto out;
+	}
+
+	i += QUIC_TLS_TAG_LEN;
+
+	if (sendto(fd, buf, i, 0, (struct sockaddr *)addr, addrlen) < 0) {
+		TRACE_ERROR("sendto() failed", QUIC_EV_CONN_TXPKT);
+		goto out;
+	}
+
+	ret = 1;
+ out:
+	TRACE_LEAVE(QUIC_EV_CONN_TXPKT);
+	return !ret;
+}
+
+/* Write a 32-bit integer to a buffer with <buf> as address.
+ * Make <buf> point to the data after this 32-bit value on success.
+ * Note that these 32-bit integers are network byte ordered.
+ * Returns 0 on failure (not enough room in the buffer), 1 on success.
+ */
+static inline int quic_write_uint32(unsigned char **buf,
+                                    const unsigned char *end, uint32_t val)
+{
+	if (end - *buf < sizeof val)
+		return 0;
+
+	*(uint32_t *)*buf = htonl(val);
+	*buf += sizeof val;
+
+	return 1;
+}
+
+/* Return the maximum number of bytes which can completely fill a buffer of
+ * <sz> bytes with a data field prefixed by its QUIC variable-length encoding
+ * (which may be 0).
+ * Also put in <*len_sz> the size of this QUIC variable-length.
+ * So after returning from this function we have: <*len_sz> + <ret> <= <sz>,
+ * with <ret> the largest value satisfying this inequality.
+ */
+static inline size_t max_available_room(size_t sz, size_t *len_sz)
+{
+	size_t sz_sz, ret;
+	size_t diff;
+
+	sz_sz = quic_int_getsize(sz);
+	if (sz <= sz_sz)
+		return 0;
+
+	ret = sz - sz_sz;
+	*len_sz = quic_int_getsize(ret);
+	/* Difference between the two sizes. Note that <sz_sz> >= <*len_sz>. */
+	diff = sz_sz - *len_sz;
+	if (unlikely(diff > 0)) {
+		/* Let's try to take the remaining bytes into account.
+		 *
+		 *                   <----------------> <sz_sz>
+		 *  <--------------><-------->   +----> <max_int>
+		 *       <ret>       <len_sz>    |
+		 *  +----------------------------+----------....
+		 *  <--------------------------------> <sz>
+		 */
+		size_t max_int = quic_max_int(*len_sz);
+
+		if (max_int + *len_sz <= sz)
+			ret = max_int;
+		else
+			ret = sz - diff;
+	}
+
+	return ret;
+}
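
A couple of worked values may help picture max_available_room(), assuming the standard QUIC varint sizes (1 byte up to 63, 2 bytes up to 16383); the numbers below are illustrative, not from the source:

/* sz = 100: quic_int_getsize(100) == 2, so ret = 98; 98 also encodes on
 * 2 bytes, so diff == 0 and 98 is returned (2 + 98 == 100, a full fit).
 * sz = 65 : quic_int_getsize(65) == 2, ret = 63, but 63 encodes on a
 * single byte, so diff == 1; quic_max_int(1) == 63 and 63 + 1 <= 65,
 * hence 63 is returned (1 + 63 == 64, one byte is lost by design).
 */
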
+/* This function computes the maximum data we can put into a buffer with <sz> as
+ * size prefixed with a variable-length field "Length" whose value is the
+ * remaining data length, already filled with <ilen> bytes which must be taken
+ * into account by the "Length" field, and finally followed by the data we want
+ * to put in this buffer prefixed again by a variable-length field.
+ * <sz> is the size of the buffer to fill.
+ * <ilen> is the number of bytes already put after the "Length" field.
+ * <dlen> is the maximum number of bytes we want to put in the buffer.
+ * Also set <*dlen_sz> to the size of the data variable-length we want to put
+ * in the buffer. This is typically the function to use to fill as much as
+ * possible a QUIC packet made of only one CRYPTO or STREAM frame.
+ * Returns the computed size if there is enough room in the buffer, 0 if not.
+ */
+static inline size_t max_stream_data_size(size_t sz, size_t ilen, size_t dlen)
+{
+	size_t ret, len_sz, dlen_sz;
+
+	/*
+	 * The lengths of variable-length QUIC integers are powers of two.
+	 * Look for the first "Length" field value <len_sz> which matches our need.
+	 * As we must put <ilen> bytes in our buffer, the minimum value for
+	 * <len_sz> is the number of bytes required to encode <ilen>.
+	 */
+	for (len_sz = quic_int_getsize(ilen);
+	     len_sz <= QUIC_VARINT_MAX_SIZE;
+	     len_sz <<= 1) {
+		if (sz < len_sz + ilen)
+			return 0;
+
+		ret = max_available_room(sz - len_sz - ilen, &dlen_sz);
+		if (!ret)
+			return 0;
+
+		/* Check that <*len_sz> matches <ret> value */
+		if (len_sz + ilen + dlen_sz + ret <= quic_max_int(len_sz))
+			return ret < dlen ? ret : dlen;
+	}
+
+	return 0;
+}
+
+/* Return the length in bytes of <pn> packet number depending on
+ * <largest_acked_pn>, the largest acknowledged packet number.
+ */
+static inline size_t quic_packet_number_length(int64_t pn,
+                                               int64_t largest_acked_pn)
+{
+	int64_t max_nack_pkts;
+
+	/* About packet number encoding, the RFC says:
+	 * The sender MUST use a packet number size able to represent more than
+	 * twice as large a range than the difference between the largest
+	 * acknowledged packet and packet number being sent.
+	 */
+	max_nack_pkts = 2 * (pn - largest_acked_pn) + 1;
+	if (max_nack_pkts > 0xffffff)
+		return 4;
+	if (max_nack_pkts > 0xffff)
+		return 3;
+	if (max_nack_pkts > 0xff)
+		return 2;
+
+	return 1;
+}
+
+/* Encode <pn> packet number with <pn_len> as length in bytes into a buffer with
+ * <buf> as current copy address and <end> as pointer to one past the end of
+ * this buffer. It is the caller's responsibility to check there is enough room
+ * in the buffer to copy <pn_len> bytes.
+ * Returns 1 on success, 0 if there was not enough room (which should not
+ * happen if the caller checked beforehand).
+ */
+static inline int quic_packet_number_encode(unsigned char **buf,
+                                            const unsigned char *end,
+                                            uint64_t pn, size_t pn_len)
+{
+	if (end - *buf < pn_len)
+		return 0;
+
+	/* Encode the packet number. */
+	switch (pn_len) {
+	case 1:
+		**buf = pn;
+		break;
+	case 2:
+		write_n16(*buf, pn);
+		break;
+	case 3:
+		(*buf)[0] = pn >> 16;
+		(*buf)[1] = pn >> 8;
+		(*buf)[2] = pn;
+		break;
+	case 4:
+		write_n32(*buf, pn);
+		break;
+	}
+	*buf += pn_len;
+
+	return 1;
+}
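
To make the packet number sizing rule above concrete, a worked example (the values are illustrative, not from the source):

/* pn = 1000 with largest_acked_pn = 800 gives max_nack_pkts = 401,
 * which exceeds 0xff, so 2 bytes are needed; with largest_acked_pn = 995
 * the value is only 11 and a single byte suffices.
 */

+/* This function builds into a buffer at <pos> position a QUIC long packet header,
+ * <end> being one byte past the end of this buffer.
+ * Return 1 if enough room to build this header, 0 if not.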
+ */ +static int quic_build_packet_long_header(unsigned char **pos, const unsigned char *end, + int type, size_t pn_len, + struct quic_conn *qc, const struct quic_version *ver) +{ + int ret = 0; + + TRACE_ENTER(QUIC_EV_CONN_LPKT, qc); + + if (end - *pos < sizeof ver->num + qc->dcid.len + qc->scid.len + 3) { + TRACE_DEVEL("not enough room", QUIC_EV_CONN_LPKT, qc); + goto leave; + } + + type = quic_pkt_type(type, ver->num); + /* #0 byte flags */ + *(*pos)++ = QUIC_PACKET_FIXED_BIT | QUIC_PACKET_LONG_HEADER_BIT | + (type << QUIC_PACKET_TYPE_SHIFT) | (pn_len - 1); + /* Version */ + quic_write_uint32(pos, end, ver->num); + *(*pos)++ = qc->dcid.len; + /* Destination connection ID */ + if (qc->dcid.len) { + memcpy(*pos, qc->dcid.data, qc->dcid.len); + *pos += qc->dcid.len; + } + /* Source connection ID */ + *(*pos)++ = qc->scid.len; + if (qc->scid.len) { + memcpy(*pos, qc->scid.data, qc->scid.len); + *pos += qc->scid.len; + } + + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_LPKT, qc); + return ret; +} + +/* This function builds into a buffer at <pos> position a QUIC short packet header, + * <end> being one byte past the end of this buffer. + * Return 1 if enough room to build this header, 0 if not. + */ +static int quic_build_packet_short_header(unsigned char **pos, const unsigned char *end, + size_t pn_len, struct quic_conn *qc, + unsigned char tls_flags) +{ + int ret = 0; + unsigned char spin_bit = + (qc->flags & QUIC_FL_CONN_SPIN_BIT) ? QUIC_PACKET_SPIN_BIT : 0; + + TRACE_ENTER(QUIC_EV_CONN_TXPKT, qc); + + if (end - *pos < 1 + qc->dcid.len) { + TRACE_DEVEL("not enough room", QUIC_EV_CONN_LPKT, qc); + goto leave; + } + + /* #0 byte flags */ + *(*pos)++ = QUIC_PACKET_FIXED_BIT | spin_bit | + ((tls_flags & QUIC_FL_TLS_KP_BIT_SET) ? QUIC_PACKET_KEY_PHASE_BIT : 0) | (pn_len - 1); + /* Destination connection ID */ + if (qc->dcid.len) { + memcpy(*pos, qc->dcid.data, qc->dcid.len); + *pos += qc->dcid.len; + } + + ret = 1; + leave: + TRACE_LEAVE(QUIC_EV_CONN_TXPKT, qc); + return ret; +} + +/* Apply QUIC header protection to the packet with <pos> as first byte address, + * <pn> as address of the Packet number field, <pnlen> being this field length + * with <aead> as AEAD cipher and <key> as secret key. + * + * TODO no error is expected as encryption is done in place but encryption + * manual is unclear. <fail> will be set to true if an error is detected. + */ +void quic_apply_header_protection(struct quic_conn *qc, unsigned char *pos, + unsigned char *pn, size_t pnlen, + struct quic_tls_ctx *tls_ctx, int *fail) + +{ + int i; + /* We need an IV of at least 5 bytes: one byte for bytes #0 + * and at most 4 bytes for the packet number + */ + unsigned char mask[5] = {0}; + EVP_CIPHER_CTX *aes_ctx = tls_ctx->tx.hp_ctx; + + TRACE_ENTER(QUIC_EV_CONN_TXPKT, qc); + + *fail = 0; + + if (!quic_tls_aes_encrypt(mask, pn + QUIC_PACKET_PN_MAXLEN, sizeof mask, aes_ctx)) { + TRACE_ERROR("could not apply header protection", QUIC_EV_CONN_TXPKT, qc); + *fail = 1; + goto out; + } + + *pos ^= mask[0] & (*pos & QUIC_PACKET_LONG_HEADER_BIT ? 
0xf : 0x1f); + for (i = 0; i < pnlen; i++) + pn[i] ^= mask[i + 1]; + + out: + TRACE_LEAVE(QUIC_EV_CONN_TXPKT, qc); +} + +/* Prepare into <outlist> as most as possible ack-eliciting frame from their + * <inlist> prebuilt frames for <qel> encryption level to be encoded in a buffer + * with <room> as available room, and <*len> the packet Length field initialized + * with the number of bytes already present in this buffer which must be taken + * into an account for the Length packet field value. <headlen> is the number of + * bytes already present in this packet before building frames. + * + * Update consequently <*len> to reflect the size of these frames built + * by this function. Also attach these frames to <l> frame list. + * Return 1 if at least one ack-eleciting frame could be built, 0 if not. + */ +static int qc_build_frms(struct list *outlist, struct list *inlist, + size_t room, size_t *len, size_t headlen, + struct quic_enc_level *qel, + struct quic_conn *qc) +{ + int ret; + struct quic_frame *cf, *cfbak; + + TRACE_ENTER(QUIC_EV_CONN_BCFRMS, qc); + + ret = 0; + if (*len > room) + goto leave; + + /* If we are not probing we must take into an account the congestion + * control window. + */ + if (!qel->pktns->tx.pto_probe) { + size_t remain = quic_cc_path_prep_data(qc->path); + + if (headlen > remain) + goto leave; + + room = QUIC_MIN(room, remain - headlen); + } + + TRACE_PROTO("TX frms build (headlen)", + QUIC_EV_CONN_BCFRMS, qc, &headlen); + + /* NOTE: switch/case block inside a loop, a successful status must be + * returned by this function only if at least one frame could be built + * in the switch/case block. + */ + list_for_each_entry_safe(cf, cfbak, inlist, list) { + /* header length, data length, frame length. */ + size_t hlen, dlen, dlen_sz, avail_room, flen; + + if (!room) + break; + + switch (cf->type) { + case QUIC_FT_CRYPTO: + TRACE_DEVEL(" New CRYPTO frame build (room, len)", + QUIC_EV_CONN_BCFRMS, qc, &room, len); + /* Compute the length of this CRYPTO frame header */ + hlen = 1 + quic_int_getsize(cf->crypto.offset); + /* Compute the data length of this CRyPTO frame. */ + dlen = max_stream_data_size(room, *len + hlen, cf->crypto.len); + TRACE_DEVEL(" CRYPTO data length (hlen, crypto.len, dlen)", + QUIC_EV_CONN_BCFRMS, qc, &hlen, &cf->crypto.len, &dlen); + if (!dlen) + continue; + + /* CRYPTO frame length. */ + flen = hlen + quic_int_getsize(dlen) + dlen; + TRACE_DEVEL(" CRYPTO frame length (flen)", + QUIC_EV_CONN_BCFRMS, qc, &flen); + /* Add the CRYPTO data length and its encoded length to the packet + * length and the length of this length. + */ + *len += flen; + room -= flen; + if (dlen == cf->crypto.len) { + /* <cf> CRYPTO data have been consumed. */ + LIST_DEL_INIT(&cf->list); + LIST_APPEND(outlist, &cf->list); + } + else { + struct quic_frame *new_cf; + + new_cf = qc_frm_alloc(QUIC_FT_CRYPTO); + if (!new_cf) { + TRACE_ERROR("No memory for new crypto frame", QUIC_EV_CONN_BCFRMS, qc); + continue; + } + + new_cf->crypto.len = dlen; + new_cf->crypto.offset = cf->crypto.offset; + new_cf->crypto.qel = qel; + TRACE_DEVEL("split frame", QUIC_EV_CONN_PRSAFRM, qc, new_cf); + if (cf->origin) { + TRACE_DEVEL("duplicated frame", QUIC_EV_CONN_PRSAFRM, qc); + /* This <cf> frame was duplicated */ + LIST_APPEND(&cf->origin->reflist, &new_cf->ref); + new_cf->origin = cf->origin; + /* Detach the remaining CRYPTO frame from its original frame */ + LIST_DEL_INIT(&cf->ref); + cf->origin = NULL; + } + LIST_APPEND(outlist, &new_cf->list); + /* Consume <dlen> bytes of the current frame. 
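
As an illustration of the CRYPTO frame split performed here (the numbers are made up):

/* a CRYPTO frame with offset 0 and len 1000, of which only dlen == 600
 * fits, is emitted as a new 600-byte frame for this packet while the
 * original is rewound to offset 600 / len 400 for a later packet.
 */
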
*/ + cf->crypto.len -= dlen; + cf->crypto.offset += dlen; + } + break; + + case QUIC_FT_STREAM_8 ... QUIC_FT_STREAM_F: + if (cf->stream.dup) { + struct eb64_node *node = NULL; + struct qc_stream_desc *stream_desc = NULL; + struct qf_stream *strm_frm = &cf->stream; + + /* As this frame has been already lost, ensure the stream is always + * available or the range of this frame is not consumed before + * resending it. + */ + node = eb64_lookup(&qc->streams_by_id, strm_frm->id); + if (!node) { + TRACE_DEVEL("released stream", QUIC_EV_CONN_PRSAFRM, qc, cf); + qc_frm_free(qc, &cf); + continue; + } + + stream_desc = eb64_entry(node, struct qc_stream_desc, by_id); + if (strm_frm->offset.key + strm_frm->len <= stream_desc->ack_offset) { + TRACE_DEVEL("ignored frame frame in already acked range", + QUIC_EV_CONN_PRSAFRM, qc, cf); + qc_frm_free(qc, &cf); + continue; + } + else if (strm_frm->offset.key < stream_desc->ack_offset) { + uint64_t diff = stream_desc->ack_offset - strm_frm->offset.key; + + qc_stream_frm_mv_fwd(cf, diff); + TRACE_DEVEL("updated partially acked frame", + QUIC_EV_CONN_PRSAFRM, qc, cf); + } + } + /* Note that these frames are accepted in short packets only without + * "Length" packet field. Here, <*len> is used only to compute the + * sum of the lengths of the already built frames for this packet. + * + * Compute the length of this STREAM frame "header" made a all the field + * excepting the variable ones. Note that +1 is for the type of this frame. + */ + hlen = 1 + quic_int_getsize(cf->stream.id) + + ((cf->type & QUIC_STREAM_FRAME_TYPE_OFF_BIT) ? quic_int_getsize(cf->stream.offset.key) : 0); + /* Compute the data length of this STREAM frame. */ + avail_room = room - hlen - *len; + if ((ssize_t)avail_room <= 0) + continue; + + TRACE_DEVEL(" New STREAM frame build (room, len)", + QUIC_EV_CONN_BCFRMS, qc, &room, len); + + /* hlen contains STREAM id and offset. Ensure there is + * enough room for length field. + */ + if (cf->type & QUIC_STREAM_FRAME_TYPE_LEN_BIT) { + dlen = QUIC_MIN((uint64_t)max_available_room(avail_room, &dlen_sz), + cf->stream.len); + dlen_sz = quic_int_getsize(dlen); + flen = hlen + dlen_sz + dlen; + } + else { + dlen = QUIC_MIN((uint64_t)avail_room, cf->stream.len); + flen = hlen + dlen; + } + + if (cf->stream.len && !dlen) { + /* Only a small gap is left on buffer, not + * enough to encode the STREAM data length. + */ + continue; + } + + TRACE_DEVEL(" STREAM data length (hlen, stream.len, dlen)", + QUIC_EV_CONN_BCFRMS, qc, &hlen, &cf->stream.len, &dlen); + TRACE_DEVEL(" STREAM frame length (flen)", + QUIC_EV_CONN_BCFRMS, qc, &flen); + /* Add the STREAM data length and its encoded length to the packet + * length and the length of this length. + */ + *len += flen; + room -= flen; + if (dlen == cf->stream.len) { + /* <cf> STREAM data have been consumed. */ + LIST_DEL_INIT(&cf->list); + LIST_APPEND(outlist, &cf->list); + + /* Do not notify MUX on retransmission. 
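
A quick sizing example for the STREAM header computed above (illustrative values only):

/* stream id 4 (1-byte varint) at offset 1200 (2-byte varint) gives
 * hlen = 1 + 1 + 2 = 4; with the LEN bit set, a Length varint of
 * 2 more bytes covers any dlen between 64 and 16383.
 */
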
*/ + if (qc->flags & QUIC_FL_CONN_TX_MUX_CONTEXT) { + qcc_streams_sent_done(cf->stream.stream->ctx, + cf->stream.len, + cf->stream.offset.key); + } + } + else { + struct quic_frame *new_cf; + struct buffer cf_buf; + + new_cf = qc_frm_alloc(cf->type); + if (!new_cf) { + TRACE_ERROR("No memory for new STREAM frame", QUIC_EV_CONN_BCFRMS, qc); + continue; + } + + new_cf->stream.stream = cf->stream.stream; + new_cf->stream.buf = cf->stream.buf; + new_cf->stream.id = cf->stream.id; + new_cf->stream.offset = cf->stream.offset; + new_cf->stream.len = dlen; + new_cf->type |= QUIC_STREAM_FRAME_TYPE_LEN_BIT; + /* FIN bit reset */ + new_cf->type &= ~QUIC_STREAM_FRAME_TYPE_FIN_BIT; + new_cf->stream.data = cf->stream.data; + new_cf->stream.dup = cf->stream.dup; + TRACE_DEVEL("split frame", QUIC_EV_CONN_PRSAFRM, qc, new_cf); + if (cf->origin) { + TRACE_DEVEL("duplicated frame", QUIC_EV_CONN_PRSAFRM, qc); + /* This <cf> frame was duplicated */ + LIST_APPEND(&cf->origin->reflist, &new_cf->ref); + new_cf->origin = cf->origin; + /* Detach this STREAM frame from its origin */ + LIST_DEL_INIT(&cf->ref); + cf->origin = NULL; + } + LIST_APPEND(outlist, &new_cf->list); + cf->type |= QUIC_STREAM_FRAME_TYPE_OFF_BIT; + /* Consume <dlen> bytes of the current frame. */ + cf_buf = b_make(b_orig(cf->stream.buf), + b_size(cf->stream.buf), + (char *)cf->stream.data - b_orig(cf->stream.buf), 0); + cf->stream.len -= dlen; + cf->stream.offset.key += dlen; + cf->stream.data = (unsigned char *)b_peek(&cf_buf, dlen); + + /* Do not notify MUX on retransmission. */ + if (qc->flags & QUIC_FL_CONN_TX_MUX_CONTEXT) { + qcc_streams_sent_done(new_cf->stream.stream->ctx, + new_cf->stream.len, + new_cf->stream.offset.key); + } + } + + /* TODO the MUX is notified about the frame sending via + * previous qcc_streams_sent_done call. However, the + * sending can fail later, for example if the sendto + * system call returns an error. As the MUX has been + * notified, the transport layer is responsible to + * bufferize and resent the announced data later. + */ + + break; + + default: + flen = qc_frm_len(cf); + BUG_ON(!flen); + if (flen > room) + continue; + + *len += flen; + room -= flen; + LIST_DEL_INIT(&cf->list); + LIST_APPEND(outlist, &cf->list); + break; + } + + /* Successful status as soon as a frame could be built */ + ret = 1; + } + + leave: + TRACE_LEAVE(QUIC_EV_CONN_BCFRMS, qc); + return ret; +} + +/* Generate a CONNECTION_CLOSE frame for <qc> on <qel> encryption level. <out> + * is used as return parameter and should be zero'ed by the caller. + */ +static void qc_build_cc_frm(struct quic_conn *qc, struct quic_enc_level *qel, + struct quic_frame *out) +{ + /* TODO improve CONNECTION_CLOSE on Initial/Handshake encryption levels + * + * A CONNECTION_CLOSE frame should be sent in several packets with + * different encryption levels depending on the client context. This is + * to ensure that the client can decrypt it. See RFC 9000 10.2.3 for + * more details on how to implement it. + */ + TRACE_ENTER(QUIC_EV_CONN_BFRM, qc); + + + if (qc->err.app) { + if (unlikely(qel == qc->iel || qel == qc->hel)) { + /* RFC 9000 10.2.3. Immediate Close during the Handshake + * + * Sending a CONNECTION_CLOSE of type 0x1d in an Initial or Handshake + * packet could expose application state or be used to alter application + * state. A CONNECTION_CLOSE of type 0x1d MUST be replaced by a + * CONNECTION_CLOSE of type 0x1c when sending the frame in Initial or + * Handshake packets. Otherwise, information about the application + * state might be revealed. 
Endpoints MUST clear the value of the
+			 * Reason Phrase field and SHOULD use the APPLICATION_ERROR code when
+			 * converting to a CONNECTION_CLOSE of type 0x1c.
+			 */
+			out->type = QUIC_FT_CONNECTION_CLOSE;
+			out->connection_close.error_code = QC_ERR_APPLICATION_ERROR;
+			out->connection_close.reason_phrase_len = 0;
+		}
+		else {
+			out->type = QUIC_FT_CONNECTION_CLOSE_APP;
+			out->connection_close_app.error_code = qc->err.code;
+			out->connection_close_app.reason_phrase_len = 0;
+		}
+	}
+	else {
+		out->type = QUIC_FT_CONNECTION_CLOSE;
+		out->connection_close.error_code = qc->err.code;
+		out->connection_close.reason_phrase_len = 0;
+	}
+	TRACE_LEAVE(QUIC_EV_CONN_BFRM, qc);
+
+}
+
+/* Returns the <ack_delay> field value in microseconds to be set in an ACK frame
+ * depending on the time the packet with a new largest packet number was received.
+ */
+static inline uint64_t quic_compute_ack_delay_us(unsigned int time_received,
+                                                 struct quic_conn *conn)
+{
+	return ((now_ms - time_received) * 1000) >> conn->tx.params.ack_delay_exponent;
+}
+
+/* This function builds a clear packet from <pkt> information (its type) into a
+ * buffer with <pos> as position pointer, at <qel> QUIC TLS encryption level for
+ * the <qc> QUIC connection, filling the buffer with as many frames as possible
+ * from the <frms> list of prebuilt frames.
+ * The trailing QUIC_TLS_TAG_LEN bytes of this packet are not built, but they
+ * are reserved to ensure there is enough room to build the AEAD tag after
+ * having returned from this function.
+ * This function also updates the value of the <buf_pn> pointer to point to the
+ * packet number field in this packet. <pn_len> will also have the packet
+ * number length as value.
+ *
+ * Return 1 on success (enough room to build this packet), 0 if not.
+ */
+static int qc_do_build_pkt(unsigned char *pos, const unsigned char *end,
+                           size_t dglen, struct quic_tx_packet *pkt,
+                           int64_t pn, size_t *pn_len, unsigned char **buf_pn,
+                           int must_ack, int padding, int cc, int probe,
+                           struct quic_enc_level *qel, struct quic_conn *qc,
+                           const struct quic_version *ver, struct list *frms)
+{
+	unsigned char *beg, *payload;
+	size_t len, len_sz, len_frms, padding_len;
+	struct quic_frame frm;
+	struct quic_frame ack_frm;
+	struct quic_frame cc_frm;
+	size_t ack_frm_len, head_len;
+	int64_t rx_largest_acked_pn;
+	int add_ping_frm;
+	struct list frm_list = LIST_HEAD_INIT(frm_list);
+	struct quic_frame *cf;
+	int ret = 0;
+
+	TRACE_ENTER(QUIC_EV_CONN_TXPKT, qc);
+
+	/* Length field value with CRYPTO frames if present. */
+	len_frms = 0;
+	beg = pos;
+	/* When not probing, and no immediate close is required, reduce the size of
+	 * this buffer to respect the congestion controller window.
+	 * This size will be limited if we have ack-eliciting frames to send from
+	 * <frms>.
+	 */
+	if (!probe && !LIST_ISEMPTY(frms) && !cc) {
+		size_t path_room;
+
+		path_room = quic_cc_path_prep_data(qc->path);
+		if (end - beg > path_room)
+			end = beg + path_room;
+	}
+
+	/* Ensure there is enough room for the TLS encryption tag and a zero token
+	 * length field if any.
+	 */
+	if (end - pos < QUIC_TLS_TAG_LEN +
+	    (pkt->type == QUIC_PACKET_TYPE_INITIAL ?
1 : 0)) + goto no_room; + + end -= QUIC_TLS_TAG_LEN; + rx_largest_acked_pn = qel->pktns->rx.largest_acked_pn; + /* packet number length */ + *pn_len = quic_packet_number_length(pn, rx_largest_acked_pn); + /* Build the header */ + if ((pkt->type == QUIC_PACKET_TYPE_SHORT && + !quic_build_packet_short_header(&pos, end, *pn_len, qc, qel->tls_ctx.flags)) || + (pkt->type != QUIC_PACKET_TYPE_SHORT && + !quic_build_packet_long_header(&pos, end, pkt->type, *pn_len, qc, ver))) + goto no_room; + + /* Encode the token length (0) for an Initial packet. */ + if (pkt->type == QUIC_PACKET_TYPE_INITIAL) { + if (end <= pos) + goto no_room; + + *pos++ = 0; + } + + head_len = pos - beg; + /* Build an ACK frame if required. */ + ack_frm_len = 0; + /* Do not ack and probe at the same time. */ + if ((must_ack || (qel->pktns->flags & QUIC_FL_PKTNS_ACK_REQUIRED)) && !qel->pktns->tx.pto_probe) { + struct quic_arngs *arngs = &qel->pktns->rx.arngs; + BUG_ON(eb_is_empty(&qel->pktns->rx.arngs.root)); + ack_frm.type = QUIC_FT_ACK; + ack_frm.tx_ack.arngs = arngs; + if (qel->pktns->flags & QUIC_FL_PKTNS_NEW_LARGEST_PN) { + qel->pktns->tx.ack_delay = + quic_compute_ack_delay_us(qel->pktns->rx.largest_time_received, qc); + qel->pktns->flags &= ~QUIC_FL_PKTNS_NEW_LARGEST_PN; + } + ack_frm.tx_ack.ack_delay = qel->pktns->tx.ack_delay; + /* XXX BE CAREFUL XXX : here we reserved at least one byte for the + * smallest frame (PING) and <*pn_len> more for the packet number. Note + * that from here, we do not know if we will have to send a PING frame. + * This will be decided after having computed the ack-eliciting frames + * to be added to this packet. + */ + if (end - pos <= 1 + *pn_len) + goto no_room; + + ack_frm_len = qc_frm_len(&ack_frm); + if (ack_frm_len > end - 1 - *pn_len - pos) + goto no_room; + } + + /* Length field value without the ack-eliciting frames. */ + len = ack_frm_len + *pn_len; + len_frms = 0; + if (!cc && !LIST_ISEMPTY(frms)) { + ssize_t room = end - pos; + + TRACE_PROTO("Avail. ack eliciting frames", QUIC_EV_CONN_FRMLIST, qc, frms); + /* Initialize the length of the frames built below to <len>. + * If any frame could be successfully built by qc_build_frms(), + * we will have len_frms > len. + */ + len_frms = len; + if (!qc_build_frms(&frm_list, frms, + end - pos, &len_frms, pos - beg, qel, qc)) { + TRACE_PROTO("Not enough room", QUIC_EV_CONN_TXPKT, + qc, NULL, NULL, &room); + if (padding) { + len_frms = 0; + goto comp_pkt_len; + } + + if (!ack_frm_len && !qel->pktns->tx.pto_probe) + goto no_room; + } + } + + comp_pkt_len: + /* Length (of the remaining data). Must not fail because, the buffer size + * has been checked above. Note that we have reserved QUIC_TLS_TAG_LEN bytes + * for the encryption tag. It must be taken into an account for the length + * of this packet. + */ + if (len_frms) + len = len_frms + QUIC_TLS_TAG_LEN; + else + len += QUIC_TLS_TAG_LEN; + /* CONNECTION_CLOSE frame */ + if (cc) { + qc_build_cc_frm(qc, qel, &cc_frm); + len += qc_frm_len(&cc_frm); + } + add_ping_frm = 0; + padding_len = 0; + len_sz = quic_int_getsize(len); + /* Add this packet size to <dglen> */ + dglen += head_len + len_sz + len; + /* Note that <padding> is true only when building an Handshake packet + * coalesced to an Initial packet. 
+ */ + if (padding && dglen < QUIC_INITIAL_PACKET_MINLEN) { + /* This is a maximum padding size */ + padding_len = QUIC_INITIAL_PACKET_MINLEN - dglen; + /* The length field value is of this packet is <len> + <padding_len> + * the size of which may be greater than the initial computed size + * <len_sz>. So, let's deduce the difference between these to packet + * sizes from <padding_len>. + */ + padding_len -= quic_int_getsize(len + padding_len) - len_sz; + len += padding_len; + } + else if (len_frms && len_frms < QUIC_PACKET_PN_MAXLEN) { + len += padding_len = QUIC_PACKET_PN_MAXLEN - len_frms; + } + else if (LIST_ISEMPTY(&frm_list)) { + if (qel->pktns->tx.pto_probe) { + /* If we cannot send a frame, we send a PING frame. */ + add_ping_frm = 1; + len += 1; + dglen += 1; + /* Note that only we are in the case where this Initial packet + * is not coalesced to an Handshake packet. We must directly + * pad the datragram. + */ + if (pkt->type == QUIC_PACKET_TYPE_INITIAL) { + if (dglen < QUIC_INITIAL_PACKET_MINLEN) { + padding_len = QUIC_INITIAL_PACKET_MINLEN - dglen; + padding_len -= quic_int_getsize(len + padding_len) - len_sz; + len += padding_len; + } + } + else { + /* Note that +1 is for the PING frame */ + if (*pn_len + 1 < QUIC_PACKET_PN_MAXLEN) + len += padding_len = QUIC_PACKET_PN_MAXLEN - *pn_len - 1; + } + } + else { + /* If there is no frame at all to follow, add at least a PADDING frame. */ + if (!ack_frm_len && !cc) + len += padding_len = QUIC_PACKET_PN_MAXLEN - *pn_len; + } + } + + if (pkt->type != QUIC_PACKET_TYPE_SHORT && !quic_enc_int(&pos, end, len)) + goto no_room; + + /* Packet number field address. */ + *buf_pn = pos; + + /* Packet number encoding. */ + if (!quic_packet_number_encode(&pos, end, pn, *pn_len)) + goto no_room; + + /* payload building (ack-eliciting or not frames) */ + payload = pos; + if (ack_frm_len) { + if (!qc_build_frm(&pos, end, &ack_frm, pkt, qc)) + goto no_room; + + pkt->largest_acked_pn = quic_pktns_get_largest_acked_pn(qel->pktns); + pkt->flags |= QUIC_FL_TX_PACKET_ACK; + } + + /* Ack-eliciting frames */ + if (!LIST_ISEMPTY(&frm_list)) { + struct quic_frame *tmp_cf; + list_for_each_entry_safe(cf, tmp_cf, &frm_list, list) { + if (!qc_build_frm(&pos, end, cf, pkt, qc)) { + ssize_t room = end - pos; + TRACE_PROTO("Not enough room", QUIC_EV_CONN_TXPKT, + qc, NULL, NULL, &room); + /* Note that <cf> was added from <frms> to <frm_list> list by + * qc_build_frms(). + */ + LIST_DEL_INIT(&cf->list); + LIST_INSERT(frms, &cf->list); + continue; + } + + quic_tx_packet_refinc(pkt); + cf->pkt = pkt; + } + } + + /* Build a PING frame if needed. */ + if (add_ping_frm) { + frm.type = QUIC_FT_PING; + if (!qc_build_frm(&pos, end, &frm, pkt, qc)) + goto no_room; + } + + /* Build a CONNECTION_CLOSE frame if needed. */ + if (cc) { + if (!qc_build_frm(&pos, end, &cc_frm, pkt, qc)) + goto no_room; + + pkt->flags |= QUIC_FL_TX_PACKET_CC; + } + + /* Build a PADDING frame if needed. */ + if (padding_len) { + frm.type = QUIC_FT_PADDING; + frm.padding.len = padding_len; + if (!qc_build_frm(&pos, end, &frm, pkt, qc)) + goto no_room; + } + + if (pos == payload) { + /* No payload was built because of congestion control */ + TRACE_PROTO("limited by congestion control", QUIC_EV_CONN_TXPKT, qc); + goto no_room; + } + + /* If this packet is ack-eliciting and we are probing let's + * decrement the PTO probe counter. 
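
The varint-size correction applied above deserves a worked example (the values are illustrative):

/* with dglen == 900, reaching QUIC_INITIAL_PACKET_MINLEN (1200) asks
 * for 300 bytes of padding; if len thereby crosses a varint boundary,
 * say from a 1-byte to a 2-byte Length field, one padding byte is
 * traded for the bigger Length so the datagram still totals 1200.
 */
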
+	 */
+	if ((pkt->flags & QUIC_FL_TX_PACKET_ACK_ELICITING) &&
+	    qel->pktns->tx.pto_probe)
+		qel->pktns->tx.pto_probe--;
+
+	pkt->len = pos - beg;
+	LIST_SPLICE(&pkt->frms, &frm_list);
+
+	ret = 1;
+	TRACE_PROTO("Packet ack-eliciting frames", QUIC_EV_CONN_TXPKT, qc, pkt);
+ leave:
+	TRACE_LEAVE(QUIC_EV_CONN_TXPKT, qc);
+	return ret;
+
+ no_room:
+	/* Put back the pre-built frames which could not be added to this packet */
+	LIST_SPLICE(frms, &frm_list);
+	TRACE_PROTO("Remaining ack-eliciting frames", QUIC_EV_CONN_FRMLIST, qc, frms);
+	goto leave;
+}
+
+static inline void quic_tx_packet_init(struct quic_tx_packet *pkt, int type)
+{
+	pkt->type = type;
+	pkt->len = 0;
+	pkt->in_flight_len = 0;
+	pkt->pn_node.key = (uint64_t)-1;
+	LIST_INIT(&pkt->frms);
+	pkt->time_sent = TICK_ETERNITY;
+	pkt->next = NULL;
+	pkt->prev = NULL;
+	pkt->largest_acked_pn = -1;
+	pkt->flags = 0;
+	pkt->refcnt = 0;
+}
+
+/* Build a packet into a buffer at <pos> position, <end> pointing to one byte past
+ * the end of this buffer, with <pkt_type> as packet type for <qc> QUIC connection
+ * at <qel> encryption level with <frms> list of prebuilt frames.
+ *
+ * Return -3 if the packet could not be allocated, -2 if it could not be encrypted
+ * for any reason, -1 if there was not enough room to build a packet.
+ * XXX NOTE XXX
+ * If you provide qc_build_pkt() with a big enough buffer to build a packet as big as
+ * possible (to fill an MTU), the only reason why this function may fail is the
+ * congestion control window limitation.
+ */
+static struct quic_tx_packet *qc_build_pkt(unsigned char **pos,
+                                           const unsigned char *end,
+                                           struct quic_enc_level *qel,
+                                           struct quic_tls_ctx *tls_ctx, struct list *frms,
+                                           struct quic_conn *qc, const struct quic_version *ver,
+                                           size_t dglen, int pkt_type, int must_ack,
+                                           int padding, int probe, int cc, int *err)
+{
+	struct quic_tx_packet *ret_pkt = NULL;
+	/* The pointer to the packet number field. */
+	unsigned char *buf_pn;
+	unsigned char *first_byte, *last_byte, *payload;
+	int64_t pn;
+	size_t pn_len, payload_len, aad_len;
+	struct quic_tx_packet *pkt;
+	int encrypt_failure = 0;
+
+	TRACE_ENTER(QUIC_EV_CONN_TXPKT, qc);
+	TRACE_PROTO("TX pkt build", QUIC_EV_CONN_TXPKT, qc, NULL, qel);
+	*err = 0;
+	pkt = pool_alloc(pool_head_quic_tx_packet);
+	if (!pkt) {
+		TRACE_DEVEL("Not enough memory for a new packet", QUIC_EV_CONN_TXPKT, qc);
+		*err = -3;
+		goto err;
+	}
+
+	quic_tx_packet_init(pkt, pkt_type);
+	first_byte = *pos;
+	pn_len = 0;
+	buf_pn = NULL;
+
+	pn = qel->pktns->tx.next_pn + 1;
+	if (!qc_do_build_pkt(*pos, end, dglen, pkt, pn, &pn_len, &buf_pn,
+	                     must_ack, padding, cc, probe, qel, qc, ver, frms)) {
+		// trace already emitted by the function above
+		*err = -1;
+		goto err;
+	}
+
+	last_byte = first_byte + pkt->len;
+	payload = buf_pn + pn_len;
+	payload_len = last_byte - payload;
+	aad_len = payload - first_byte;
+
+	quic_packet_encrypt(payload, payload_len, first_byte, aad_len, pn, tls_ctx, qc, &encrypt_failure);
+	if (encrypt_failure) {
+		/* TODO Unrecoverable failure, unencrypted data should be returned to the caller. */
+		WARN_ON("quic_packet_encrypt failure");
+		*err = -2;
+		goto err;
+	}
+
+	last_byte += QUIC_TLS_TAG_LEN;
+	pkt->len += QUIC_TLS_TAG_LEN;
+	quic_apply_header_protection(qc, first_byte, buf_pn, pn_len, tls_ctx, &encrypt_failure);
+	if (encrypt_failure) {
+		/* TODO Unrecoverable failure, unencrypted data should be returned to the caller.
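+	 */
+
+/* Illustrative sketch, not from the patch: header protection (RFC 9001,
+ * section 5.4) is applied only after the payload has been AEAD-encrypted,
+ * since the mask is derived from a sample of the ciphertext. Assuming <mask>
+ * was already computed from such a sample, the masking step itself is just:
+ */
+static void quic_hp_apply_sketch(unsigned char *first_byte, unsigned char *pn,
+                                 size_t pn_len, const unsigned char mask[5])
+{
+	size_t i;
+
+	/* long headers protect the low 4 bits of the first byte, short
+	 * headers the low 5 bits (both include the pn length bits)
+	 */
+	if (*first_byte & 0x80)
+		*first_byte ^= mask[0] & 0x0f;
+	else
+		*first_byte ^= mask[0] & 0x1f;
+
+	for (i = 0; i < pn_len; i++)
+		pn[i] ^= mask[i + 1];
+}
+
+/* end of sketch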
*/ + WARN_ON("quic_apply_header_protection failure"); + *err = -2; + goto err; + } + + /* Consume a packet number */ + qel->pktns->tx.next_pn++; + qc->bytes.prep += pkt->len; + if (qc->bytes.prep >= 3 * qc->bytes.rx && !quic_peer_validated_addr(qc)) { + qc->flags |= QUIC_FL_CONN_ANTI_AMPLIFICATION_REACHED; + TRACE_PROTO("anti-amplification limit reached", QUIC_EV_CONN_TXPKT, qc); + } + + /* Now that a correct packet is built, let us consume <*pos> buffer. */ + *pos = last_byte; + /* Attach the built packet to its tree. */ + pkt->pn_node.key = pn; + /* Set the packet in fligth length for in flight packet only. */ + if (pkt->flags & QUIC_FL_TX_PACKET_IN_FLIGHT) { + pkt->in_flight_len = pkt->len; + qc->path->prep_in_flight += pkt->len; + } + /* Always reset this flag */ + qc->flags &= ~QUIC_FL_CONN_IMMEDIATE_CLOSE; + if (pkt->flags & QUIC_FL_TX_PACKET_ACK) { + qel->pktns->flags &= ~QUIC_FL_PKTNS_ACK_REQUIRED; + qel->pktns->rx.nb_aepkts_since_last_ack = 0; + qc->flags &= ~QUIC_FL_CONN_ACK_TIMER_FIRED; + if (tick_isset(qc->ack_expire)) { + qc->ack_expire = TICK_ETERNITY; + qc->idle_timer_task->expire = qc->idle_expire; + task_queue(qc->idle_timer_task); + TRACE_PROTO("ack timer cancelled", QUIC_EV_CONN_IDLE_TIMER, qc); + } + } + + pkt->pktns = qel->pktns; + + ret_pkt = pkt; + leave: + TRACE_PROTO("TX pkt built", QUIC_EV_CONN_TXPKT, qc, ret_pkt); + TRACE_LEAVE(QUIC_EV_CONN_TXPKT, qc); + return ret_pkt; + + err: + /* TODO: what about the frames which have been built + * for this packet. + */ + free_quic_tx_packet(qc, pkt); + goto leave; +} +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/raw_sock.c b/src/raw_sock.c new file mode 100644 index 0000000..1287dc5 --- /dev/null +++ b/src/raw_sock.c @@ -0,0 +1,489 @@ +/* + * RAW transport layer over SOCK_STREAM sockets. + * + * Copyright 2000-2012 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> + +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <netinet/tcp.h> + +#include <haproxy/api.h> +#include <haproxy/buf.h> +#include <haproxy/connection.h> +#include <haproxy/errors.h> +#include <haproxy/fd.h> +#include <haproxy/global.h> +#include <haproxy/pipe.h> +#include <haproxy/proxy.h> +#include <haproxy/tools.h> + + +#if defined(USE_LINUX_SPLICE) + +/* A pipe contains 16 segments max, and it's common to see segments of 1448 bytes + * because of timestamps. Use this as a hint for not looping on splice(). + */ +#define SPLICE_FULL_HINT 16*1448 + +/* how many data we attempt to splice at once when the buffer is configured for + * infinite forwarding */ +#define MAX_SPLICE_AT_ONCE (1<<30) + +/* Returns : + * -1 if splice() is not supported + * >= 0 to report the amount of spliced bytes. + * connection flags are updated (error, read0, wait_room, wait_data). + * The caller must have previously allocated the pipe. 
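+	 */
+
+/* Illustrative sketch, not from the patch: the splice() call used below moves
+ * bytes from the socket into an anonymous pipe without copying them through
+ * user space. A minimal stand-alone reader, assuming <fd> is a non-blocking
+ * connected socket and <pipefd> was created with pipe():
+ */
+#include <errno.h>
+#include <fcntl.h>
+
+static ssize_t sock_to_pipe_sketch(int fd, int pipefd[2], size_t count)
+{
+	ssize_t ret = splice(fd, NULL, pipefd[1], NULL, count,
+	                     SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
+
+	if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
+		return 0; /* socket empty or pipe full, as discussed below */
+	return ret;       /* 0 means end of stream, like the out_read0 path */
+}
+
+/* end of sketch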
+ */ +int raw_sock_to_pipe(struct connection *conn, void *xprt_ctx, struct pipe *pipe, unsigned int count) +{ + int ret; + int retval = 0; + + + if (!conn_ctrl_ready(conn)) + return 0; + + BUG_ON(conn->flags & CO_FL_FDLESS); + + if (!fd_recv_ready(conn->handle.fd)) + return 0; + + conn->flags &= ~CO_FL_WAIT_ROOM; + errno = 0; + + /* Under Linux, if FD_POLL_HUP is set, we have reached the end. + * Since older splice() implementations were buggy and returned + * EAGAIN on end of read, let's bypass the call to splice() now. + */ + if (unlikely(!(fdtab[conn->handle.fd].state & FD_POLL_IN))) { + /* stop here if we reached the end of data */ + if ((fdtab[conn->handle.fd].state & (FD_POLL_ERR|FD_POLL_HUP)) == FD_POLL_HUP) + goto out_read0; + + /* report error on POLL_ERR before connection establishment */ + if ((fdtab[conn->handle.fd].state & FD_POLL_ERR) && (conn->flags & CO_FL_WAIT_L4_CONN)) { + conn->flags |= CO_FL_ERROR | CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH; + errno = 0; /* let the caller do a getsockopt() if it wants it */ + goto leave; + } + } + + while (count) { + if (count > MAX_SPLICE_AT_ONCE) + count = MAX_SPLICE_AT_ONCE; + + ret = splice(conn->handle.fd, NULL, pipe->prod, NULL, count, + SPLICE_F_MOVE|SPLICE_F_NONBLOCK); + + if (ret <= 0) { + if (ret == 0) + goto out_read0; + + if (errno == EAGAIN || errno == EWOULDBLOCK) { + /* there are two reasons for EAGAIN : + * - nothing in the socket buffer (standard) + * - pipe is full + * The difference between these two situations + * is problematic. Since we don't know if the + * pipe is full, we'll stop if the pipe is not + * empty. Anyway, we will almost always fill or + * empty the pipe. + */ + if (pipe->data) { + /* always stop reading until the pipe is flushed */ + conn->flags |= CO_FL_WAIT_ROOM; + break; + } + /* socket buffer exhausted */ + fd_cant_recv(conn->handle.fd); + break; + } + else if (errno == ENOSYS || errno == EINVAL || errno == EBADF) { + /* splice not supported on this end, disable it. + * We can safely return -1 since there is no + * chance that any data has been piped yet. + */ + retval = -1; + goto leave; + } + else if (errno == EINTR) { + /* try again */ + continue; + } + /* here we have another error */ + conn->flags |= CO_FL_ERROR; + break; + } /* ret <= 0 */ + + retval += ret; + pipe->data += ret; + count -= ret; + + if (pipe->data >= SPLICE_FULL_HINT || ret >= global.tune.recv_enough) { + /* We've read enough of it for this time, let's stop before + * being asked to poll. + */ + conn->flags |= CO_FL_WAIT_ROOM; + break; + } + } /* while */ + + if (unlikely(conn->flags & CO_FL_WAIT_L4_CONN) && retval) + conn->flags &= ~CO_FL_WAIT_L4_CONN; + + leave: + if (retval > 0) + increment_send_rate(retval, 1); + + return retval; + + out_read0: + conn_sock_read0(conn); + conn->flags &= ~CO_FL_WAIT_L4_CONN; + goto leave; +} + +/* Send as many bytes as possible from the pipe to the connection's socket. 
+ */ +int raw_sock_from_pipe(struct connection *conn, void *xprt_ctx, struct pipe *pipe, unsigned int count) +{ + int ret, done; + + if (!conn_ctrl_ready(conn)) + return 0; + + BUG_ON(conn->flags & CO_FL_FDLESS); + + if (!fd_send_ready(conn->handle.fd)) + return 0; + + if (conn->flags & CO_FL_SOCK_WR_SH) { + /* it's already closed */ + conn->flags |= CO_FL_ERROR | CO_FL_SOCK_RD_SH; + errno = EPIPE; + return 0; + } + + if (unlikely(count > pipe->data)) + count = pipe->data; + + done = 0; + while (count) { + ret = splice(pipe->cons, NULL, conn->handle.fd, NULL, count, + SPLICE_F_MOVE|SPLICE_F_NONBLOCK); + + if (ret <= 0) { + if (ret == 0 || errno == EAGAIN || errno == EWOULDBLOCK) { + fd_cant_send(conn->handle.fd); + break; + } + else if (errno == EINTR) + continue; + + /* here we have another error */ + conn->flags |= CO_FL_ERROR; + break; + } + + done += ret; + count -= ret; + pipe->data -= ret; + } + if (unlikely(conn->flags & CO_FL_WAIT_L4_CONN) && done) { + conn->flags &= ~CO_FL_WAIT_L4_CONN; + } + + return done; +} + +#endif /* USE_LINUX_SPLICE */ + + +/* Receive up to <count> bytes from connection <conn>'s socket and store them + * into buffer <buf>. Only one call to recv() is performed, unless the + * buffer wraps, in which case a second call may be performed. The connection's + * flags are updated with whatever special event is detected (error, read0, + * empty). The caller is responsible for taking care of those events and + * avoiding the call if inappropriate. The function does not call the + * connection's polling update function, so the caller is responsible for this. + * errno is cleared before starting so that the caller knows that if it spots an + * error without errno, it's pending and can be retrieved via getsockopt(SO_ERROR). + */ +static size_t raw_sock_to_buf(struct connection *conn, void *xprt_ctx, struct buffer *buf, size_t count, int flags) +{ + ssize_t ret; + size_t try, done = 0; + + if (!conn_ctrl_ready(conn)) + return 0; + + BUG_ON(conn->flags & CO_FL_FDLESS); + + if (!fd_recv_ready(conn->handle.fd)) + return 0; + + conn->flags &= ~CO_FL_WAIT_ROOM; + errno = 0; + + if (unlikely(!(fdtab[conn->handle.fd].state & FD_POLL_IN))) { + /* stop here if we reached the end of data */ + if ((fdtab[conn->handle.fd].state & (FD_POLL_ERR|FD_POLL_HUP)) == FD_POLL_HUP) + goto read0; + + /* report error on POLL_ERR before connection establishment */ + if ((fdtab[conn->handle.fd].state & FD_POLL_ERR) && (conn->flags & CO_FL_WAIT_L4_CONN)) { + conn->flags |= CO_FL_ERROR | CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH; + goto leave; + } + } + + /* read the largest possible block. For this, we perform only one call + * to recv() unless the buffer wraps and we exactly fill the first hunk, + * in which case we accept to do it once again. A new attempt is made on + * EINTR too. + */ + while (count > 0) { + try = b_contig_space(buf); + if (!try) + break; + + if (try > count) + try = count; + + ret = recv(conn->handle.fd, b_tail(buf), try, 0); + + if (ret > 0) { + b_add(buf, ret); + done += ret; + if (ret < try) { + /* socket buffer exhausted */ + fd_cant_recv(conn->handle.fd); + + /* unfortunately, on level-triggered events, POLL_HUP + * is generally delivered AFTER the system buffer is + * empty, unless the poller supports POLL_RDHUP. If + * we know this is the case, we don't try to read more + * as we know there's no more available. Similarly, if + * there's no problem with lingering we don't even try + * to read an unlikely close from the client since we'll + * close first anyway. 
+ */ + if (fdtab[conn->handle.fd].state & FD_POLL_HUP) + goto read0; + + if (!(fdtab[conn->handle.fd].state & FD_LINGER_RISK) || + (cur_poller.flags & HAP_POLL_F_RDHUP)) { + break; + } + } + count -= ret; + + if (flags & CO_RFL_READ_ONCE) + break; + } + else if (ret == 0) { + goto read0; + } + else if (errno == EAGAIN || errno == EWOULDBLOCK || errno == ENOTCONN) { + /* socket buffer exhausted */ + fd_cant_recv(conn->handle.fd); + break; + } + else if (errno != EINTR) { + conn->flags |= CO_FL_ERROR | CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH; + break; + } + } + + if (unlikely(conn->flags & CO_FL_WAIT_L4_CONN) && done) + conn->flags &= ~CO_FL_WAIT_L4_CONN; + + leave: + return done; + + read0: + conn_sock_read0(conn); + conn->flags &= ~CO_FL_WAIT_L4_CONN; + + /* Now a final check for a possible asynchronous low-level error + * report. This can happen when a connection receives a reset + * after a shutdown, both POLL_HUP and POLL_ERR are queued, and + * we might have come from there by just checking POLL_HUP instead + * of recv()'s return value 0, so we have no way to tell there was + * an error without checking. + */ + if (unlikely(!done && fdtab[conn->handle.fd].state & FD_POLL_ERR)) + conn->flags |= CO_FL_ERROR | CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH; + goto leave; +} + + +/* Send up to <count> pending bytes from buffer <buf> to connection <conn>'s + * socket. <flags> may contain some CO_SFL_* flags to hint the system about + * other pending data for example, but this flag is ignored at the moment. + * Only one call to send() is performed, unless the buffer wraps, in which case + * a second call may be performed. The connection's flags are updated with + * whatever special event is detected (error, empty). The caller is responsible + * for taking care of those events and avoiding the call if inappropriate. The + * function does not call the connection's polling update function, so the caller + * is responsible for this. It's up to the caller to update the buffer's contents + * based on the return value. + */ +static size_t raw_sock_from_buf(struct connection *conn, void *xprt_ctx, const struct buffer *buf, size_t count, int flags) +{ + ssize_t ret; + size_t try, done; + int send_flag; + + if (!conn_ctrl_ready(conn)) + return 0; + + BUG_ON(conn->flags & CO_FL_FDLESS); + + if (!fd_send_ready(conn->handle.fd)) + return 0; + + if (unlikely(fdtab[conn->handle.fd].state & FD_POLL_ERR)) { + /* an error was reported on the FD, we can't send anymore */ + conn->flags |= CO_FL_ERROR | CO_FL_SOCK_WR_SH | CO_FL_SOCK_RD_SH; + errno = EPIPE; + return 0; + } + + if (conn->flags & CO_FL_SOCK_WR_SH) { + /* it's already closed */ + conn->flags |= CO_FL_ERROR | CO_FL_SOCK_RD_SH; + errno = EPIPE; + return 0; + } + + done = 0; + /* send the largest possible block. For this we perform only one call + * to send() unless the buffer wraps and we exactly fill the first hunk, + * in which case we accept to do it once again. 
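+	 */
+
+/* Illustrative sketch, not from the patch: the loop below sets MSG_MORE
+ * whenever it knows more data will follow, either because the buffer wraps
+ * (try < count) or because the caller passed CO_SFL_MSG_MORE, letting the
+ * kernel coalesce small writes into fewer TCP segments. The flag computation
+ * in isolation, with hypothetical parameter names:
+ */
+#include <sys/socket.h>
+
+static int send_flags_sketch(size_t chunk, size_t remaining, int caller_has_more)
+{
+	int flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+
+	if (chunk < remaining || caller_has_more)
+		flags |= MSG_MORE; /* more data is coming, let the kernel wait */
+	return flags;
+}
+
+/* end of sketch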
+ */ + while (count) { + try = b_contig_data(buf, done); + if (try > count) + try = count; + + send_flag = MSG_DONTWAIT | MSG_NOSIGNAL; + if (try < count || flags & CO_SFL_MSG_MORE) + send_flag |= MSG_MORE; + + ret = send(conn->handle.fd, b_peek(buf, done), try, send_flag); + + if (ret > 0) { + count -= ret; + done += ret; + + /* if the system buffer is full, don't insist */ + if (ret < try) { + fd_cant_send(conn->handle.fd); + break; + } + if (!count) + fd_stop_send(conn->handle.fd); + } + else if (ret == 0 || errno == EAGAIN || errno == EWOULDBLOCK || errno == ENOTCONN || errno == EINPROGRESS) { + /* nothing written, we need to poll for write first */ + fd_cant_send(conn->handle.fd); + break; + } + else if (errno != EINTR) { + conn->flags |= CO_FL_ERROR | CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH; + break; + } + } + if (unlikely(conn->flags & CO_FL_WAIT_L4_CONN) && done) { + conn->flags &= ~CO_FL_WAIT_L4_CONN; + } + + if (done > 0) + increment_send_rate(done, 0); + + return done; +} + +/* Called from the upper layer, to subscribe <es> to events <event_type>. The + * event subscriber <es> is not allowed to change from a previous call as long + * as at least one event is still subscribed. The <event_type> must only be a + * combination of SUB_RETRY_RECV and SUB_RETRY_SEND. It always returns 0. + */ +static int raw_sock_subscribe(struct connection *conn, void *xprt_ctx, int event_type, struct wait_event *es) +{ + return conn_subscribe(conn, xprt_ctx, event_type, es); +} + +/* Called from the upper layer, to unsubscribe <es> from events <event_type>. + * The <es> pointer is not allowed to differ from the one passed to the + * subscribe() call. It always returns zero. + */ +static int raw_sock_unsubscribe(struct connection *conn, void *xprt_ctx, int event_type, struct wait_event *es) +{ + return conn_unsubscribe(conn, xprt_ctx, event_type, es); +} + +static void raw_sock_close(struct connection *conn, void *xprt_ctx) +{ + if (conn->subs != NULL) { + conn_unsubscribe(conn, NULL, conn->subs->events, conn->subs); + } +} + +/* We can't have an underlying XPRT, so just return -1 to signify failure */ +static int raw_sock_remove_xprt(struct connection *conn, void *xprt_ctx, void *toremove_ctx, const struct xprt_ops *newops, void *newctx) +{ + /* This is the lowest xprt we can have, so if we get there we didn't + * find the xprt we wanted to remove, that's a bug + */ + BUG_ON(1); + return -1; +} + +/* transport-layer operations for RAW sockets */ +static struct xprt_ops raw_sock = { + .snd_buf = raw_sock_from_buf, + .rcv_buf = raw_sock_to_buf, + .subscribe = raw_sock_subscribe, + .unsubscribe = raw_sock_unsubscribe, + .remove_xprt = raw_sock_remove_xprt, +#if defined(USE_LINUX_SPLICE) + .rcv_pipe = raw_sock_to_pipe, + .snd_pipe = raw_sock_from_pipe, +#endif + .shutr = NULL, + .shutw = NULL, + .close = raw_sock_close, + .name = "RAW", +}; + + +static void __raw_sock_init(void) +{ + xprt_register(XPRT_RAW, &raw_sock); +} + +INITCALL0(STG_REGISTER, __raw_sock_init); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/regex.c b/src/regex.c new file mode 100644 index 0000000..19c7eda --- /dev/null +++ b/src/regex.c @@ -0,0 +1,459 @@ +/* + * Regex and string management functions. 
+ *
+ * Copyright 2000-2010 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <haproxy/api.h>
+#include <haproxy/errors.h>
+#include <haproxy/global.h>
+#include <haproxy/regex.h>
+#include <haproxy/tools.h>
+
+/* regex trash buffer used by various regex tests */
+THREAD_LOCAL regmatch_t pmatch[MAX_MATCH];  /* rm_so, rm_eo for regular expressions */
+
+int exp_replace(char *dst, unsigned int dst_size, char *src, const char *str, const regmatch_t *matches)
+{
+	char *old_dst = dst;
+	char *dst_end = dst + dst_size;
+
+	while (*str) {
+		if (*str == '\\') {
+			str++;
+			if (!*str)
+				return -1;
+
+			if (isdigit((unsigned char)*str)) {
+				int len, num;
+
+				num = *str - '0';
+				str++;
+
+				if (matches[num].rm_eo > -1 && matches[num].rm_so > -1) {
+					len = matches[num].rm_eo - matches[num].rm_so;
+
+					if (dst + len >= dst_end)
+						return -1;
+
+					memcpy(dst, src + matches[num].rm_so, len);
+					dst += len;
+				}
+
+			} else if (*str == 'x') {
+				unsigned char hex1, hex2;
+				str++;
+
+				if (!*str)
+					return -1;
+
+				hex1 = toupper((unsigned char)*str++) - '0';
+
+				if (!*str)
+					return -1;
+
+				hex2 = toupper((unsigned char)*str++) - '0';
+
+				if (hex1 > 9) hex1 -= 'A' - '9' - 1;
+				if (hex2 > 9) hex2 -= 'A' - '9' - 1;
+
+				if (dst >= dst_end)
+					return -1;
+
+				*dst++ = (hex1<<4) + hex2;
+			} else {
+				if (dst >= dst_end)
+					return -1;
+
+				*dst++ = *str++;
+			}
+		} else {
+			if (dst >= dst_end)
+				return -1;
+
+			*dst++ = *str++;
+		}
+	}
+	if (dst >= dst_end)
+		return -1;
+
+	*dst = '\0';
+	return dst - old_dst;
+}
+
+/* returns NULL if the replacement string <str> is valid, or a pointer to the first error */
+const char *check_replace_string(const char *str)
+{
+	const char *err = NULL;
+	while (*str) {
+		if (*str == '\\') {
+			err = str; /* in case of a backslash, we return the pointer to it */
+			str++;
+			if (!*str)
+				return err;
+			else if (isdigit((unsigned char)*str))
+				err = NULL;
+			else if (*str == 'x') {
+				str++;
+				if (!ishex(*str))
+					return err;
+				str++;
+				if (!ishex(*str))
+					return err;
+				err = NULL;
+			}
+			else {
+				ha_warning("'\\%c' : deprecated use of a backslash before something not '\\','x' or a digit.\n", *str);
+				err = NULL;
+			}
+		}
+		str++;
+	}
+	return err;
+}
+
+
+/* This function applies a regex. It takes a const null-terminated string as
+ * input. If the regex doesn't match, it returns false, otherwise it returns
+ * true. When compiled with JIT, this function executes strlen() on the
+ * subject. Currently the only supported flag is REG_NOTBOL.
+ */
+int regex_exec_match(const struct my_regex *preg, const char *subject,
+                     size_t nmatch, regmatch_t pmatch[], int flags) {
+#if defined(USE_PCRE) || defined(USE_PCRE_JIT) || defined(USE_PCRE2) || defined(USE_PCRE2_JIT)
+	int ret;
+#ifdef USE_PCRE2
+	PCRE2_SIZE *matches;
+	pcre2_match_data *pm;
+#else
+	int matches[MAX_MATCH * 3];
+#endif
+	int enmatch;
+	int i;
+	int options;
+
+	/* Silently cap the number of reported matches to MAX_MATCH: entries
+	 * beyond this limit are flagged as unmatched rather than rejected.
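+	 */
+
+/* Illustrative usage, not from the patch: exp_replace() above consumes the
+ * rm_so/rm_eo offsets produced by a prior match. With the libc backend the
+ * flow is roughly as follows (MAX_MATCH comes from haproxy/regex.h):
+ */
+#include <regex.h>
+#include <stdio.h>
+
+static int exp_replace_demo(void)
+{
+	regex_t re;
+	regmatch_t m[MAX_MATCH];
+	char out[256];
+	char subject[] = "host-42";
+
+	if (regcomp(&re, "([a-z]+)-([0-9]+)", REG_EXTENDED) != 0)
+		return -1;
+	if (regexec(&re, subject, MAX_MATCH, m, 0) == REG_NOMATCH) {
+		regfree(&re);
+		return -1;
+	}
+
+	/* "\2/\1" swaps the captured groups: "host-42" becomes "42/host" */
+	if (exp_replace(out, sizeof(out), subject, "\\2/\\1", m) == -1) {
+		regfree(&re);
+		return -1;
+	}
+	printf("%s\n", out);
+	regfree(&re);
+	return 0;
+}
+
+/* end of sketch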
+	 */
+	enmatch = nmatch;
+	if (enmatch > MAX_MATCH)
+		enmatch = MAX_MATCH;
+
+	options = 0;
+	if (flags & REG_NOTBOL)
+#ifdef USE_PCRE2
+		options |= PCRE2_NOTBOL;
+#else
+		options |= PCRE_NOTBOL;
+#endif
+
+	/* The value returned by pcre_exec()/pcre2_match() is one more than the highest numbered
+	 * pair that has been set. For example, if two substrings have been captured,
+	 * the returned value is 3. If there are no capturing subpatterns, the return
+	 * value from a successful match is 1, indicating that just the first pair of
+	 * offsets has been set.
+	 *
+	 * It seems that this function returns 0 if it detects more matches than available
+	 * space in the matches array.
+	 */
+#ifdef USE_PCRE2
+	pm = pcre2_match_data_create_from_pattern(preg->reg, NULL);
+	ret = preg->mfn(preg->reg, (PCRE2_SPTR)subject, (PCRE2_SIZE)strlen(subject), 0, options, pm, NULL);
+
+	if (ret < 0) {
+		pcre2_match_data_free(pm);
+		return 0;
+	}
+
+	matches = pcre2_get_ovector_pointer(pm);
+#else
+	ret = pcre_exec(preg->reg, preg->extra, subject, strlen(subject), 0, options, matches, enmatch * 3);
+
+	if (ret < 0)
+		return 0;
+#endif
+
+	if (ret == 0)
+		ret = enmatch;
+
+	for (i = 0; i < nmatch; i++) {
+		/* Copy the offsets. */
+		if (i < ret) {
+			pmatch[i].rm_so = matches[(i*2)];
+			pmatch[i].rm_eo = matches[(i*2)+1];
+			continue;
+		}
+		/* Set the unmatched flag (-1). */
+		pmatch[i].rm_so = -1;
+		pmatch[i].rm_eo = -1;
+	}
+#ifdef USE_PCRE2
+	pcre2_match_data_free(pm);
+#endif
+	return 1;
+#else
+	int match;
+
+	flags &= REG_NOTBOL;
+	match = regexec(&preg->regex, subject, nmatch, pmatch, flags);
+	if (match == REG_NOMATCH)
+		return 0;
+	return 1;
+#endif
+}
+
+/* This function applies a regex. It takes a "char *" and a length as input.
+ * The <subject> can be modified during the processing. If the function doesn't
+ * match, it returns false, otherwise it returns true.
+ * When compiled with the standard POSIX regex or PCRE, this function adds
+ * a temporary null character at the end of the <subject>. The <subject> must
+ * have a real length of <length> + 1. Currently the only supported flag is
+ * REG_NOTBOL.
+ */
+int regex_exec_match2(const struct my_regex *preg, char *subject, int length,
+                      size_t nmatch, regmatch_t pmatch[], int flags) {
+#if defined(USE_PCRE) || defined(USE_PCRE_JIT) || defined(USE_PCRE2) || defined(USE_PCRE2_JIT)
+	int ret;
+#ifdef USE_PCRE2
+	PCRE2_SIZE *matches;
+	pcre2_match_data *pm;
+#else
+	int matches[MAX_MATCH * 3];
+#endif
+	int enmatch;
+	int i;
+	int options;
+
+	/* Silently cap the number of reported matches to MAX_MATCH: entries
+	 * beyond this limit are flagged as unmatched rather than rejected.
+	 */
+	enmatch = nmatch;
+	if (enmatch > MAX_MATCH)
+		enmatch = MAX_MATCH;
+
+	options = 0;
+	if (flags & REG_NOTBOL)
+#ifdef USE_PCRE2
+		options |= PCRE2_NOTBOL;
+#else
+		options |= PCRE_NOTBOL;
+#endif
+
+	/* The value returned by pcre_exec()/pcre2_(jit)_match() is one more than the highest numbered
+	 * pair that has been set. For example, if two substrings have been captured,
+	 * the returned value is 3. If there are no capturing subpatterns, the return
+	 * value from a successful match is 1, indicating that just the first pair of
+	 * offsets has been set.
+	 *
+	 * It seems that this function returns 0 if it detects more matches than available
+	 * space in the matches array.
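+	 */
+
+/* Illustrative stand-alone PCRE2 usage, not from the patch, showing the
+ * ovector layout the code above copies into pmatch[]: entries 2*i and 2*i+1
+ * hold the start and end offsets of capture group i, group 0 being the whole
+ * match:
+ */
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
+#include <stdio.h>
+
+static int pcre2_demo(const char *subject)
+{
+	int errn, rc;
+	PCRE2_SIZE erroff, *ov;
+	pcre2_code *re;
+	pcre2_match_data *md;
+
+	re = pcre2_compile((PCRE2_SPTR)"(\\w+)=(\\w+)", PCRE2_ZERO_TERMINATED,
+	                   0, &errn, &erroff, NULL);
+	if (!re)
+		return -1;
+
+	md = pcre2_match_data_create_from_pattern(re, NULL);
+	if (!md) {
+		pcre2_code_free(re);
+		return -1;
+	}
+	rc = pcre2_match(re, (PCRE2_SPTR)subject, PCRE2_ZERO_TERMINATED,
+	                 0, 0, md, NULL);
+	if (rc > 0) {
+		ov = pcre2_get_ovector_pointer(md);
+		/* ov[2]..ov[3] delimit the first capture, ov[4]..ov[5] the second */
+		printf("key: %.*s\n", (int)(ov[3] - ov[2]), subject + ov[2]);
+	}
+	pcre2_match_data_free(md);
+	pcre2_code_free(re);
+	return rc > 0 ? 0 : -1;
+}
+
+/* end of sketch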
+	 */
+#ifdef USE_PCRE2
+	pm = pcre2_match_data_create_from_pattern(preg->reg, NULL);
+	ret = preg->mfn(preg->reg, (PCRE2_SPTR)subject, (PCRE2_SIZE)length, 0, options, pm, NULL);
+
+	if (ret < 0) {
+		pcre2_match_data_free(pm);
+		return 0;
+	}
+
+	matches = pcre2_get_ovector_pointer(pm);
+#else
+	ret = pcre_exec(preg->reg, preg->extra, subject, length, 0, options, matches, enmatch * 3);
+	if (ret < 0)
+		return 0;
+#endif
+
+	if (ret == 0)
+		ret = enmatch;
+
+	for (i = 0; i < nmatch; i++) {
+		/* Copy the offsets. */
+		if (i < ret) {
+			pmatch[i].rm_so = matches[(i*2)];
+			pmatch[i].rm_eo = matches[(i*2)+1];
+			continue;
+		}
+		/* Set the unmatched flag (-1). */
+		pmatch[i].rm_so = -1;
+		pmatch[i].rm_eo = -1;
+	}
+#ifdef USE_PCRE2
+	pcre2_match_data_free(pm);
+#endif
+	return 1;
+#else
+	char old_char = subject[length];
+	int match;
+
+	flags &= REG_NOTBOL;
+	subject[length] = 0;
+	match = regexec(&preg->regex, subject, nmatch, pmatch, flags);
+	subject[length] = old_char;
+	if (match == REG_NOMATCH)
+		return 0;
+	return 1;
+#endif
+}
+
+struct my_regex *regex_comp(const char *str, int cs, int cap, char **err)
+{
+	struct my_regex *regex = NULL;
+#if defined(USE_PCRE) || defined(USE_PCRE_JIT)
+	int flags = 0;
+	const char *error;
+	int erroffset;
+#elif defined(USE_PCRE2) || defined(USE_PCRE2_JIT)
+	int flags = 0;
+	int errn;
+#if defined(USE_PCRE2_JIT)
+	int jit;
+#endif
+	PCRE2_UCHAR error[256];
+	PCRE2_SIZE erroffset;
+#else
+	int flags = REG_EXTENDED;
+#endif
+
+	regex = calloc(1, sizeof(*regex));
+	if (!regex) {
+		memprintf(err, "not enough memory to build regex");
+		goto out_fail_alloc;
+	}
+
+#if defined(USE_PCRE) || defined(USE_PCRE_JIT)
+	if (!cs)
+		flags |= PCRE_CASELESS;
+	if (!cap)
+		flags |= PCRE_NO_AUTO_CAPTURE;
+
+	regex->reg = pcre_compile(str, flags, &error, &erroffset, NULL);
+	if (!regex->reg) {
+		memprintf(err, "regex '%s' is invalid (error=%s, erroffset=%d)", str, error, erroffset);
+		goto out_fail_alloc;
+	}
+
+	regex->extra = pcre_study(regex->reg, PCRE_STUDY_JIT_COMPILE, &error);
+	if (!regex->extra && error != NULL) {
+		pcre_free(regex->reg);
+		memprintf(err, "failed to compile regex '%s' (error=%s)", str, error);
+		goto out_fail_alloc;
+	}
+#elif defined(USE_PCRE2) || defined(USE_PCRE2_JIT)
+	if (!cs)
+		flags |= PCRE2_CASELESS;
+	if (!cap)
+		flags |= PCRE2_NO_AUTO_CAPTURE;
+
+	regex->reg = pcre2_compile((PCRE2_SPTR)str, PCRE2_ZERO_TERMINATED, flags, &errn, &erroffset, NULL);
+	if (!regex->reg) {
+		pcre2_get_error_message(errn, error, sizeof(error));
+		memprintf(err, "regex '%s' is invalid (error=%s, erroffset=%zu)", str, error, erroffset);
+		goto out_fail_alloc;
+	}
+
+	regex->mfn = &pcre2_match;
+#if defined(USE_PCRE2_JIT)
+	jit = pcre2_jit_compile(regex->reg, PCRE2_JIT_COMPLETE);
+	/* We only abort on errors unrelated to missing JIT support: when JIT
+	 * support is missing, pcre2_jit_compile() is a no-op and we simply
+	 * fall back to pcre2_match().
+	 */
+	if (!jit)
+		regex->mfn = &pcre2_jit_match;
+	else {
+		if (jit != PCRE2_ERROR_JIT_BADOPTION) {
+			pcre2_code_free(regex->reg);
+			memprintf(err, "regex '%s' jit compilation failed", str);
+			goto out_fail_alloc;
+		}
+		else
+			regex->mfn = &pcre2_match;
+	}
+#endif
+
+#else
+	if (!cs)
+		flags |= REG_ICASE;
+	if (!cap)
+		flags |= REG_NOSUB;
+
+	if (regcomp(&regex->regex, str, flags) != 0) {
+		memprintf(err, "regex '%s' is invalid", str);
+		goto out_fail_alloc;
+	}
+#endif
+	return regex;
+
+ out_fail_alloc:
+	free(regex);
+	return NULL;
+}
+
+static void regex_register_build_options(void)
+{
+	char *ptr = NULL;
+
+#ifdef USE_PCRE
+	memprintf(&ptr, "Built with PCRE version : 
%s", (HAP_XSTRING(Z PCRE_PRERELEASE)[1] == 0)? + HAP_XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE) : + HAP_XSTRING(PCRE_MAJOR.PCRE_MINOR) HAP_XSTRING(PCRE_PRERELEASE PCRE_DATE)); + memprintf(&ptr, "%s\nRunning on PCRE version : %s", ptr, pcre_version()); + + memprintf(&ptr, "%s\nPCRE library supports JIT : %s", ptr, +#ifdef USE_PCRE_JIT + ({ + int r; + pcre_config(PCRE_CONFIG_JIT, &r); + r ? "yes" : "no (libpcre build without JIT?)"; + }) +#else + "no (USE_PCRE_JIT not set)" +#endif + ); +#endif /* USE_PCRE */ + +#ifdef USE_PCRE2 + memprintf(&ptr, "Built with PCRE2 version : %s", (HAP_XSTRING(Z PCRE2_PRERELEASE)[1] == 0) ? + HAP_XSTRING(PCRE2_MAJOR.PCRE2_MINOR PCRE2_DATE) : + HAP_XSTRING(PCRE2_MAJOR.PCRE2_MINOR) HAP_XSTRING(PCRE2_PRERELEASE PCRE2_DATE)); + memprintf(&ptr, "%s\nPCRE2 library supports JIT : %s", ptr, +#ifdef USE_PCRE2_JIT + ({ + int r; + pcre2_config(PCRE2_CONFIG_JIT, &r); + r ? "yes" : "no (libpcre2 build without JIT?)"; + }) +#else + "no (USE_PCRE2_JIT not set)" +#endif + ); +#endif /* USE_PCRE2 */ + +#if !defined(USE_PCRE) && !defined(USE_PCRE2) + memprintf(&ptr, "Built without PCRE or PCRE2 support (using libc's regex instead)"); +#endif + hap_register_build_opts(ptr, 1); +} + +INITCALL0(STG_REGISTER, regex_register_build_options); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/resolvers.c b/src/resolvers.c new file mode 100644 index 0000000..3275cd2 --- /dev/null +++ b/src/resolvers.c @@ -0,0 +1,3813 @@ +/* + * Name server resolution + * + * Copyright 2014 Baptiste Assmann <bedis9@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <sys/types.h> + +#include <import/ebistree.h> + +#include <haproxy/action.h> +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/cfgparse.h> +#include <haproxy/channel.h> +#include <haproxy/check.h> +#include <haproxy/cli.h> +#include <haproxy/dns.h> +#include <haproxy/errors.h> +#include <haproxy/fd.h> +#include <haproxy/http_rules.h> +#include <haproxy/log.h> +#include <haproxy/net_helper.h> +#include <haproxy/protocol.h> +#include <haproxy/proxy.h> +#include <haproxy/resolvers.h> +#include <haproxy/ring.h> +#include <haproxy/sample.h> +#include <haproxy/sc_strm.h> +#include <haproxy/server.h> +#include <haproxy/stats.h> +#include <haproxy/stconn.h> +#include <haproxy/task.h> +#include <haproxy/tcp_rules.h> +#include <haproxy/ticks.h> +#include <haproxy/time.h> +#include <haproxy/tools.h> +#include <haproxy/vars.h> +#include <haproxy/xxhash.h> + + +struct list sec_resolvers = LIST_HEAD_INIT(sec_resolvers); +struct list resolv_srvrq_list = LIST_HEAD_INIT(resolv_srvrq_list); + +static THREAD_LOCAL struct list death_row; /* list of deferred resolutions to kill, local validity only */ +static THREAD_LOCAL unsigned int recurse = 0; /* counter to track calls to public functions */ +static THREAD_LOCAL uint64_t resolv_query_id_seed = 0; /* random seed */ +struct resolvers *curr_resolvers = NULL; + +DECLARE_STATIC_POOL(resolv_answer_item_pool, "resolv_answer_item", sizeof(struct resolv_answer_item)); +DECLARE_STATIC_POOL(resolv_resolution_pool, "resolv_resolution", sizeof(struct resolv_resolution)); +DECLARE_POOL(resolv_requester_pool, "resolv_requester", sizeof(struct resolv_requester)); + +static unsigned int resolution_uuid = 1; +unsigned int resolv_failed_resolutions = 0; +struct task *process_resolvers(struct task *t, void *context, unsigned int state); +static void resolv_free_resolution(struct resolv_resolution *resolution); +static void _resolv_unlink_resolution(struct resolv_requester *requester); +static void enter_resolver_code(); +static void leave_resolver_code(); + +enum { + RSLV_STAT_ID, + RSLV_STAT_PID, + RSLV_STAT_SENT, + RSLV_STAT_SND_ERROR, + RSLV_STAT_VALID, + RSLV_STAT_UPDATE, + RSLV_STAT_CNAME, + RSLV_STAT_CNAME_ERROR, + RSLV_STAT_ANY_ERR, + RSLV_STAT_NX, + RSLV_STAT_TIMEOUT, + RSLV_STAT_REFUSED, + RSLV_STAT_OTHER, + RSLV_STAT_INVALID, + RSLV_STAT_TOO_BIG, + RSLV_STAT_TRUNCATED, + RSLV_STAT_OUTDATED, + RSLV_STAT_END, +}; + +static struct name_desc resolv_stats[] = { + [RSLV_STAT_ID] = { .name = "id", .desc = "ID" }, + [RSLV_STAT_PID] = { .name = "pid", .desc = "Parent ID" }, + [RSLV_STAT_SENT] = { .name = "sent", .desc = "Sent" }, + [RSLV_STAT_SND_ERROR] = { .name = "send_error", .desc = "Send error" }, + [RSLV_STAT_VALID] = { .name = "valid", .desc = "Valid" }, + [RSLV_STAT_UPDATE] = { .name = "update", .desc = "Update" }, + [RSLV_STAT_CNAME] = { .name = "cname", .desc = "CNAME" }, + [RSLV_STAT_CNAME_ERROR] = { .name = "cname_error", .desc = "CNAME error" }, + [RSLV_STAT_ANY_ERR] = { .name = "any_err", .desc = "Any errors" }, + [RSLV_STAT_NX] = { .name = "nx", .desc = "NX" }, + [RSLV_STAT_TIMEOUT] = { .name = "timeout", .desc = "Timeout" }, + [RSLV_STAT_REFUSED] = { .name = "refused", .desc = "Refused" }, + [RSLV_STAT_OTHER] = { .name = "other", .desc = "Other" }, + [RSLV_STAT_INVALID] = { .name = "invalid", .desc = "Invalid" }, + [RSLV_STAT_TOO_BIG] = { .name = "too_big", .desc = "Too big" }, + [RSLV_STAT_TRUNCATED] = { .name 
= "truncated", .desc = "Truncated" }, + [RSLV_STAT_OUTDATED] = { .name = "outdated", .desc = "Outdated" }, +}; + +static struct dns_counters dns_counters; + +static void resolv_fill_stats(void *d, struct field *stats) +{ + struct dns_counters *counters = d; + stats[RSLV_STAT_ID] = mkf_str(FO_CONFIG, counters->id); + stats[RSLV_STAT_PID] = mkf_str(FO_CONFIG, counters->pid); + stats[RSLV_STAT_SENT] = mkf_u64(FN_GAUGE, counters->sent); + stats[RSLV_STAT_SND_ERROR] = mkf_u64(FN_GAUGE, counters->snd_error); + stats[RSLV_STAT_VALID] = mkf_u64(FN_GAUGE, counters->app.resolver.valid); + stats[RSLV_STAT_UPDATE] = mkf_u64(FN_GAUGE, counters->app.resolver.update); + stats[RSLV_STAT_CNAME] = mkf_u64(FN_GAUGE, counters->app.resolver.cname); + stats[RSLV_STAT_CNAME_ERROR] = mkf_u64(FN_GAUGE, counters->app.resolver.cname_error); + stats[RSLV_STAT_ANY_ERR] = mkf_u64(FN_GAUGE, counters->app.resolver.any_err); + stats[RSLV_STAT_NX] = mkf_u64(FN_GAUGE, counters->app.resolver.nx); + stats[RSLV_STAT_TIMEOUT] = mkf_u64(FN_GAUGE, counters->app.resolver.timeout); + stats[RSLV_STAT_REFUSED] = mkf_u64(FN_GAUGE, counters->app.resolver.refused); + stats[RSLV_STAT_OTHER] = mkf_u64(FN_GAUGE, counters->app.resolver.other); + stats[RSLV_STAT_INVALID] = mkf_u64(FN_GAUGE, counters->app.resolver.invalid); + stats[RSLV_STAT_TOO_BIG] = mkf_u64(FN_GAUGE, counters->app.resolver.too_big); + stats[RSLV_STAT_TRUNCATED] = mkf_u64(FN_GAUGE, counters->app.resolver.truncated); + stats[RSLV_STAT_OUTDATED] = mkf_u64(FN_GAUGE, counters->app.resolver.outdated); +} + +static struct stats_module rslv_stats_module = { + .name = "resolvers", + .domain_flags = STATS_DOMAIN_RESOLVERS << STATS_DOMAIN, + .fill_stats = resolv_fill_stats, + .stats = resolv_stats, + .stats_count = RSLV_STAT_END, + .counters = &dns_counters, + .counters_size = sizeof(dns_counters), + .clearable = 0, +}; + +INITCALL1(STG_REGISTER, stats_register_module, &rslv_stats_module); + +/* CLI context used during "show resolvers" */ +struct show_resolvers_ctx { + struct resolvers *forced_section; + struct resolvers *resolvers; + struct dns_nameserver *ns; +}; + +/* Returns a pointer to the resolvers matching the id <id>. NULL is returned if + * no match is found. + */ +struct resolvers *find_resolvers_by_id(const char *id) +{ + struct resolvers *res; + + list_for_each_entry(res, &sec_resolvers, list) { + if (strcmp(res->id, id) == 0) + return res; + } + return NULL; +} + +/* Returns a pointer on the SRV request matching the name <name> for the proxy + * <px>. NULL is returned if no match is found. + */ +struct resolv_srvrq *find_srvrq_by_name(const char *name, struct proxy *px) +{ + struct resolv_srvrq *srvrq; + + list_for_each_entry(srvrq, &resolv_srvrq_list, list) { + if (srvrq->proxy == px && strcmp(srvrq->name, name) == 0) + return srvrq; + } + return NULL; +} + +/* Allocates a new SRVRQ for the given server with the name <fqdn>. It returns + * NULL if an error occurred. 
*/ +struct resolv_srvrq *new_resolv_srvrq(struct server *srv, char *fqdn) +{ + struct proxy *px = srv->proxy; + struct resolv_srvrq *srvrq = NULL; + int fqdn_len, hostname_dn_len; + + fqdn_len = strlen(fqdn); + hostname_dn_len = resolv_str_to_dn_label(fqdn, fqdn_len, trash.area, + trash.size); + if (hostname_dn_len == -1) { + ha_alert("%s '%s', server '%s': failed to parse FQDN '%s'\n", + proxy_type_str(px), px->id, srv->id, fqdn); + goto err; + } + + if ((srvrq = calloc(1, sizeof(*srvrq))) == NULL) { + ha_alert("%s '%s', server '%s': out of memory\n", + proxy_type_str(px), px->id, srv->id); + goto err; + } + srvrq->obj_type = OBJ_TYPE_SRVRQ; + srvrq->proxy = px; + srvrq->name = strdup(fqdn); + srvrq->hostname_dn = strdup(trash.area); + srvrq->hostname_dn_len = hostname_dn_len; + if (!srvrq->name || !srvrq->hostname_dn) { + ha_alert("%s '%s', server '%s': out of memory\n", + proxy_type_str(px), px->id, srv->id); + goto err; + } + LIST_INIT(&srvrq->attached_servers); + srvrq->named_servers = EB_ROOT; + LIST_APPEND(&resolv_srvrq_list, &srvrq->list); + return srvrq; + + err: + if (srvrq) { + free(srvrq->name); + free(srvrq->hostname_dn); + free(srvrq); + } + return NULL; +} + + +/* finds and return the SRV answer item associated to a requester (whose type is 'server'). + * + * returns NULL in case of error or not found. + */ +struct resolv_answer_item *find_srvrq_answer_record(const struct resolv_requester *requester) +{ + struct resolv_resolution *res; + struct eb32_node *eb32; + struct server *srv; + + if (!requester) + return NULL; + + if ((srv = objt_server(requester->owner)) == NULL) + return NULL; + /* check if the server is managed by a SRV record */ + if (srv->srvrq == NULL) + return NULL; + + res = srv->srvrq->requester->resolution; + + /* search an ANSWER record whose target points to the server's hostname and whose port is + * the same as server's svc_port */ + for (eb32 = eb32_first(&res->response.answer_tree); eb32 != NULL; eb32 = eb32_next(eb32)) { + struct resolv_answer_item *item = eb32_entry(eb32, typeof(*item), link); + + if (memcmp(srv->hostname_dn, item->data.target, srv->hostname_dn_len) == 0 && + (srv->svc_port == item->port)) + return item; + } + + return NULL; +} + +/* 2 bytes random generator to generate DNS query ID */ +static inline uint16_t resolv_rnd16(void) +{ + if (!resolv_query_id_seed) + resolv_query_id_seed = now_ms; + resolv_query_id_seed ^= resolv_query_id_seed << 13; + resolv_query_id_seed ^= resolv_query_id_seed >> 7; + resolv_query_id_seed ^= resolv_query_id_seed << 17; + return resolv_query_id_seed; +} + + +static inline int resolv_resolution_timeout(struct resolv_resolution *res) +{ + return res->resolvers->timeout.resolve; +} + +/* Updates a resolvers' task timeout for next wake up and queue it */ +static void resolv_update_resolvers_timeout(struct resolvers *resolvers) +{ + struct resolv_resolution *res; + int next = TICK_ETERNITY; + + if (!LIST_ISEMPTY(&resolvers->resolutions.curr)) { + res = LIST_NEXT(&resolvers->resolutions.curr, struct resolv_resolution *, list); + next = tick_add(now_ms, resolvers->timeout.resolve); + next = tick_first(next, tick_add(res->last_query, resolvers->timeout.retry)); + } + + list_for_each_entry(res, &resolvers->resolutions.wait, list) + next = tick_first(next, tick_add(res->last_resolution, resolv_resolution_timeout(res))); + + resolvers->t->expire = next; + task_queue(resolvers->t); +} + +/* Forges a DNS query. 
It needs the following information from the caller: + * - <query_id> : the DNS query id corresponding to this query + * - <query_type> : DNS_RTYPE_* request DNS record type (A, AAAA, ANY...) + * - <hostname_dn> : hostname in domain name format + * - <hostname_dn_len> : length of <hostname_dn> + * + * To store the query, the caller must pass a buffer <buf> and its size + * <bufsize>. It returns the number of written bytes in success, -1 if <buf> is + * too short. + */ +static int resolv_build_query(int query_id, int query_type, unsigned int accepted_payload_size, + char *hostname_dn, int hostname_dn_len, char *buf, int bufsize) +{ + struct dns_header dns_hdr; + struct dns_question qinfo; + struct dns_additional_record edns; + char *p = buf; + + if (sizeof(dns_hdr) + sizeof(qinfo) + sizeof(edns) + hostname_dn_len >= bufsize) + return -1; + + memset(buf, 0, bufsize); + + /* Set dns query headers */ + dns_hdr.id = (unsigned short) htons(query_id); + dns_hdr.flags = htons(0x0100); /* qr=0, opcode=0, aa=0, tc=0, rd=1, ra=0, z=0, rcode=0 */ + dns_hdr.qdcount = htons(1); /* 1 question */ + dns_hdr.ancount = 0; + dns_hdr.nscount = 0; + dns_hdr.arcount = htons(1); + memcpy(p, &dns_hdr, sizeof(dns_hdr)); + p += sizeof(dns_hdr); + + /* Set up query hostname */ + memcpy(p, hostname_dn, hostname_dn_len); + p += hostname_dn_len; + *p++ = 0; + + /* Set up query info (type and class) */ + qinfo.qtype = htons(query_type); + qinfo.qclass = htons(DNS_RCLASS_IN); + memcpy(p, &qinfo, sizeof(qinfo)); + p += sizeof(qinfo); + + /* Set the DNS extension */ + edns.name = 0; + edns.type = htons(DNS_RTYPE_OPT); + edns.udp_payload_size = htons(accepted_payload_size); + edns.extension = 0; + edns.data_length = 0; + memcpy(p, &edns, sizeof(edns)); + p += sizeof(edns); + + return (p - buf); +} + +/* Sends a DNS query to resolvers associated to a resolution. It returns 0 on + * success or -1 if trash buffer is not large enough to build a valid query. + */ +static int resolv_send_query(struct resolv_resolution *resolution) +{ + struct resolvers *resolvers = resolution->resolvers; + struct dns_nameserver *ns; + int len; + + /* Update resolution */ + resolution->nb_queries = 0; + resolution->nb_responses = 0; + resolution->last_query = now_ms; + + len = resolv_build_query(resolution->query_id, resolution->query_type, + resolvers->accepted_payload_size, + resolution->hostname_dn, resolution->hostname_dn_len, + trash.area, trash.size); + if (len < 0) { + send_log(NULL, LOG_NOTICE, + "can not build the query message for %s, in resolvers %s.\n", + resolution->hostname_dn, resolvers->id); + return -1; + } + + list_for_each_entry(ns, &resolvers->nameservers, list) { + if (dns_send_nameserver(ns, trash.area, len) >= 0) + resolution->nb_queries++; + } + + /* Push the resolution at the end of the active list */ + LIST_DEL_INIT(&resolution->list); + LIST_APPEND(&resolvers->resolutions.curr, &resolution->list); + return 0; +} + +/* Prepares and sends a DNS resolution. It returns 1 if the query was sent, 0 if + * skipped and -1 if an error occurred. + */ +static int +resolv_run_resolution(struct resolv_resolution *resolution) +{ + struct resolvers *resolvers = resolution->resolvers; + int query_id, i; + + /* Avoid sending requests for resolutions that don't yet have an + * hostname, ie resolutions linked to servers that do not yet have an + * fqdn */ + if (!resolution->hostname_dn) + return 0; + + /* Check if a resolution has already been started for this server return + * directly to avoid resolution pill up. 
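+	 */
+
+/* Illustrative sketch, not from the patch: the query assembled by
+ * resolv_build_query() above follows the RFC 1035 layout. For an A query on
+ * "example.com" with the EDNS OPT record, the wire image is, in bytes:
+ *
+ *   header(12) | qname(13) | qtype(2) | qclass(2) | OPT RR(11)
+ *
+ * A stand-alone writer for the fixed header, using the same 0x0100 flags
+ * (rd=1, everything else 0):
+ */
+#include <arpa/inet.h>
+#include <string.h>
+
+static size_t dns_write_header_sketch(unsigned char *buf, uint16_t query_id)
+{
+	uint16_t v;
+
+	v = htons(query_id); memcpy(buf +  0, &v, 2); /* id */
+	v = htons(0x0100);   memcpy(buf +  2, &v, 2); /* flags: rd=1 */
+	v = htons(1);        memcpy(buf +  4, &v, 2); /* qdcount: one question */
+	v = htons(0);        memcpy(buf +  6, &v, 2); /* ancount */
+	v = htons(0);        memcpy(buf +  8, &v, 2); /* nscount */
+	v = htons(1);        memcpy(buf + 10, &v, 2); /* arcount: the OPT RR */
+	return 12;
+}
+
+/* end of sketch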
*/ + if (resolution->step != RSLV_STEP_NONE) + return 0; + + /* Generates a new query id. We try at most 100 times to find a free + * query id */ + for (i = 0; i < 100; ++i) { + query_id = resolv_rnd16(); + if (!eb32_lookup(&resolvers->query_ids, query_id)) + break; + query_id = -1; + } + if (query_id == -1) { + send_log(NULL, LOG_NOTICE, + "could not generate a query id for %s, in resolvers %s.\n", + resolution->hostname_dn, resolvers->id); + return -1; + } + + /* Update resolution parameters */ + resolution->query_id = query_id; + resolution->qid.key = query_id; + resolution->step = RSLV_STEP_RUNNING; + resolution->query_type = resolution->prefered_query_type; + resolution->try = resolvers->resolve_retries; + eb32_insert(&resolvers->query_ids, &resolution->qid); + + /* Send the DNS query */ + resolution->try -= 1; + resolv_send_query(resolution); + return 1; +} + +/* Performs a name resolution for the requester <req> */ +void resolv_trigger_resolution(struct resolv_requester *req) +{ + struct resolvers *resolvers; + struct resolv_resolution *res; + int exp; + + if (!req || !req->resolution) + return; + res = req->resolution; + resolvers = res->resolvers; + + enter_resolver_code(); + + /* The resolution must not be triggered yet. Use the cached response, if + * valid */ + exp = tick_add(res->last_resolution, resolvers->hold.valid); + if (resolvers->t && (!tick_isset(resolvers->t->expire) || res->status != RSLV_STATUS_VALID || + !tick_isset(res->last_resolution) || tick_is_expired(exp, now_ms))) { + /* If the resolution is not running and the requester is a + * server, reset the resolution timer to force a quick + * resolution. + */ + if (res->step == RSLV_STEP_NONE && + (obj_type(req->owner) == OBJ_TYPE_SERVER || + obj_type(req->owner) == OBJ_TYPE_SRVRQ)) + res->last_resolution = TICK_ETERNITY; + task_wakeup(resolvers->t, TASK_WOKEN_OTHER); + } + + leave_resolver_code(); +} + + +/* Resets some resolution parameters to initial values and also delete the query + * ID from the resolver's tree. + */ +static void resolv_reset_resolution(struct resolv_resolution *resolution) +{ + /* update resolution status */ + resolution->step = RSLV_STEP_NONE; + resolution->try = 0; + resolution->last_resolution = now_ms; + resolution->nb_queries = 0; + resolution->nb_responses = 0; + resolution->query_type = resolution->prefered_query_type; + + /* clean up query id */ + eb32_delete(&resolution->qid); + resolution->query_id = 0; + resolution->qid.key = 0; +} + +/* Returns the query id contained in a DNS response */ +static inline unsigned short resolv_response_get_query_id(unsigned char *resp) +{ + return resp[0] * 256 + resp[1]; +} + + +/* Analyses, re-builds and copies the name <name> from the DNS response packet + * <buffer>. <name> must point to the 'data_len' information or pointer 'c0' + * for compressed data. The result is copied into <dest>, ensuring we don't + * overflow using <dest_len> Returns the number of bytes the caller can move + * forward. If 0 it means an error occurred while parsing the name. <offset> is + * the number of bytes the caller could move forward. 
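+	 */
+
+/* Illustrative sketch, not from the patch: DNS names on the wire are
+ * <length><label> runs ending with a zero byte, so "www.example.com" becomes
+ * "\x03www\x07example\x03com\x00"; a 2-byte field whose top bits are 11
+ * (0xc0) is instead a compression pointer, which resolv_read_name() below
+ * follows with a bounded recursion depth. A minimal dotted-string encoder for
+ * the label format:
+ */
+#include <string.h>
+
+static int dn_encode_sketch(const char *name, unsigned char *out, size_t outsz)
+{
+	size_t i, label_start = 0;
+	unsigned char *p = out;
+
+	for (i = 0; ; i++) {
+		if (name[i] == '.' || name[i] == '\0') {
+			size_t len = i - label_start;
+
+			/* reject empty or oversized labels, and lack of room for
+			 * this label plus the final zero byte
+			 */
+			if (len == 0 || len > 63 || (size_t)(p - out) + len + 2 > outsz)
+				return -1;
+			*p++ = len;
+			memcpy(p, name + label_start, len);
+			p += len;
+			label_start = i + 1;
+			if (name[i] == '\0')
+				break;
+		}
+	}
+	*p++ = 0; /* the root label terminates the name */
+	return p - out;
+}
+
+/* end of sketch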
+ */ +int resolv_read_name(unsigned char *buffer, unsigned char *bufend, + unsigned char *name, char *destination, int dest_len, + int *offset, unsigned int depth) +{ + int nb_bytes = 0, n = 0; + int label_len; + unsigned char *reader = name; + char *dest = destination; + + while (1) { + if (reader >= bufend) + goto err; + + /* Name compression is in use */ + if ((*reader & 0xc0) == 0xc0) { + if (reader + 1 >= bufend) + goto err; + + /* Must point BEFORE current position */ + if ((buffer + reader[1]) > reader) + goto err; + + if (depth++ > 100) + goto err; + + n = resolv_read_name(buffer, bufend, buffer + (*reader & 0x3f)*256 + reader[1], + dest, dest_len - nb_bytes, offset, depth); + if (n == 0) + goto err; + + dest += n; + nb_bytes += n; + goto out; + } + + label_len = *reader; + if (label_len == 0) + goto out; + + /* Check if: + * - we won't read outside the buffer + * - there is enough place in the destination + */ + if ((reader + label_len >= bufend) || (nb_bytes + label_len >= dest_len)) + goto err; + + /* +1 to take label len + label string */ + label_len++; + + memcpy(dest, reader, label_len); + + dest += label_len; + nb_bytes += label_len; + reader += label_len; + } + + out: + /* offset computation: + * parse from <name> until finding either NULL or a pointer "c0xx" + */ + reader = name; + *offset = 0; + while (reader < bufend) { + if ((reader[0] & 0xc0) == 0xc0) { + *offset += 2; + break; + } + else if (*reader == 0) { + *offset += 1; + break; + } + *offset += 1; + ++reader; + } + return nb_bytes; + + err: + return 0; +} + +/* Reinitialize the list of aborted resolutions before calling certain + * functions relying on it. The list must be processed by calling + * leave_resolver_code() after operations. + */ +static void enter_resolver_code() +{ + if (!recurse) + LIST_INIT(&death_row); + recurse++; +} + +/* Add a resolution to the death_row. */ +static void abort_resolution(struct resolv_resolution *res) +{ + /* Remove the resolution from query_ids tree and from any resolvers list */ + eb32_delete(&res->qid); + res->query_id = 0; + res->qid.key = 0; + + LIST_DEL_INIT(&res->list); + LIST_APPEND(&death_row, &res->list); +} + +/* This releases any aborted resolution found in the death row. It is mandatory + * to call enter_resolver_code() first before the function (or loop) that + * needs to defer deletions. Note that some of them are in relation via internal + * objects and might cause the deletion of other ones from the same list, so we + * must absolutely not use a list_for_each_entry_safe() nor any such thing here, + * and solely rely on each call to remove the first remaining list element. + */ +static void leave_resolver_code() +{ + struct resolv_resolution *res; + + recurse--; + if (recurse) + return; + + while (!LIST_ISEMPTY(&death_row)) { + res = LIST_NEXT(&death_row, struct resolv_resolution *, list); + resolv_free_resolution(res); + } + + /* make sure nobody tries to add anything without having initialized it */ + death_row = (struct list){ }; +} + +/* Cleanup fqdn/port and address of a server attached to a SRV resolution. This + * happens when an SRV item is purged or when the server status is considered as + * obsolete. + * + * Must be called with the DNS lock held, and with the death_row already + * initialized via enter_resolver_code(). 
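+	 */
+
+/* Illustrative sketch, not from the patch, of the deferral pattern above:
+ * public entry points bracket their work with enter/leave, and anything
+ * "deleted" in between is merely queued, so re-entrant calls never free an
+ * object a caller is still walking. With hypothetical types and the list
+ * macros already used in this file:
+ */
+struct victim_sketch {
+	struct list list;
+};
+
+static THREAD_LOCAL struct list death_row_sketch;
+static THREAD_LOCAL unsigned int recurse_sketch;
+
+static void enter_sketch(void)
+{
+	if (!recurse_sketch++)
+		LIST_INIT(&death_row_sketch);
+}
+
+static void kill_later_sketch(struct victim_sketch *v)
+{
+	LIST_DEL_INIT(&v->list);
+	LIST_APPEND(&death_row_sketch, &v->list);
+}
+
+static void leave_sketch(void)
+{
+	if (--recurse_sketch)
+		return;
+
+	/* only the outermost caller frees, one element at a time, because a
+	 * destructor may enqueue further victims
+	 */
+	while (!LIST_ISEMPTY(&death_row_sketch)) {
+		struct victim_sketch *v = LIST_NEXT(&death_row_sketch,
+		                                    struct victim_sketch *, list);
+		LIST_DEL_INIT(&v->list);
+		free(v);
+	}
+}
+
+/* end of sketch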
+ */ +static void resolv_srvrq_cleanup_srv(struct server *srv) +{ + _resolv_unlink_resolution(srv->resolv_requester); + HA_SPIN_LOCK(SERVER_LOCK, &srv->lock); + srvrq_update_srv_status(srv, 1); + ha_free(&srv->hostname); + ha_free(&srv->hostname_dn); + srv->hostname_dn_len = 0; + memset(&srv->addr, 0, sizeof(srv->addr)); + srv->svc_port = 0; + srv->flags |= SRV_F_NO_RESOLUTION; + + ebpt_delete(&srv->host_dn); + ha_free(&srv->host_dn.key); + + HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock); + LIST_DEL_INIT(&srv->srv_rec_item); + LIST_APPEND(&srv->srvrq->attached_servers, &srv->srv_rec_item); + + srv->srvrq_check->expire = TICK_ETERNITY; +} + +/* Takes care to cleanup a server resolution when it is outdated. This only + * happens for a server relying on a SRV record. + */ +static struct task *resolv_srvrq_expire_task(struct task *t, void *context, unsigned int state) +{ + struct server *srv = context; + + if (!tick_is_expired(t->expire, now_ms)) + goto end; + + enter_resolver_code(); + HA_SPIN_LOCK(DNS_LOCK, &srv->srvrq->resolvers->lock); + resolv_srvrq_cleanup_srv(srv); + HA_SPIN_UNLOCK(DNS_LOCK, &srv->srvrq->resolvers->lock); + leave_resolver_code(); + + end: + return t; +} + +/* Checks for any obsolete record, also identify any SRV request, and try to + * find a corresponding server. + */ +static void resolv_check_response(struct resolv_resolution *res) +{ + struct resolvers *resolvers = res->resolvers; + struct resolv_requester *req; + struct eb32_node *eb32, *eb32_back; + struct server *srv, *srvback; + struct resolv_srvrq *srvrq; + + for (eb32 = eb32_first(&res->response.answer_tree); eb32 && (eb32_back = eb32_next(eb32), 1); eb32 = eb32_back) { + struct resolv_answer_item *item = eb32_entry(eb32, typeof(*item), link); + struct resolv_answer_item *ar_item = item->ar_item; + + /* clean up obsolete Additional record */ + if (ar_item && tick_is_lt(tick_add(ar_item->last_seen, resolvers->hold.obsolete), now_ms)) { + /* Cleaning up the AR item will trigger an extra DNS resolution, except if the SRV + * item is also obsolete. 
+ */ + pool_free(resolv_answer_item_pool, ar_item); + item->ar_item = NULL; + } + + /* Remove obsolete items */ + if (tick_is_lt(tick_add(item->last_seen, resolvers->hold.obsolete), now_ms)) { + if (item->type == DNS_RTYPE_A || item->type == DNS_RTYPE_AAAA) { + /* Remove any associated server */ + list_for_each_entry_safe(srv, srvback, &item->attached_servers, ip_rec_item) { + LIST_DEL_INIT(&srv->ip_rec_item); + } + } + else if (item->type == DNS_RTYPE_SRV) { + /* Remove any associated server */ + list_for_each_entry_safe(srv, srvback, &item->attached_servers, srv_rec_item) + resolv_srvrq_cleanup_srv(srv); + } + + eb32_delete(&item->link); + if (item->ar_item) { + pool_free(resolv_answer_item_pool, item->ar_item); + item->ar_item = NULL; + } + pool_free(resolv_answer_item_pool, item); + continue; + } + + if (item->type != DNS_RTYPE_SRV) + continue; + + /* Now process SRV records */ + list_for_each_entry(req, &res->requesters, list) { + struct ebpt_node *node; + char target[DNS_MAX_NAME_SIZE+1]; + + int i; + if ((srvrq = objt_resolv_srvrq(req->owner)) == NULL) + continue; + + /* Check if a server already uses that record */ + srv = NULL; + list_for_each_entry(srv, &item->attached_servers, srv_rec_item) { + if (srv->srvrq == srvrq) { + HA_SPIN_LOCK(SERVER_LOCK, &srv->lock); + goto srv_found; + } + } + + + /* If not empty we try to match a server + * in server state file tree with the same hostname + */ + if (!eb_is_empty(&srvrq->named_servers)) { + srv = NULL; + + /* convert the key to lookup in lower case */ + for (i = 0 ; item->data.target[i] ; i++) + target[i] = tolower(item->data.target[i]); + target[i] = 0; + + node = ebis_lookup(&srvrq->named_servers, target); + if (node) { + srv = ebpt_entry(node, struct server, host_dn); + HA_SPIN_LOCK(SERVER_LOCK, &srv->lock); + + /* an entry was found with the same hostname + * let check this node if the port matches + * and try next node if the hostname + * is still the same + */ + while (1) { + if (srv->svc_port == item->port) { + /* server found, we remove it from tree */ + ebpt_delete(node); + ha_free(&srv->host_dn.key); + goto srv_found; + } + + HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock); + + node = ebpt_next(node); + if (!node) + break; + + srv = ebpt_entry(node, struct server, host_dn); + HA_SPIN_LOCK(SERVER_LOCK, &srv->lock); + + if ((item->data_len != srv->hostname_dn_len) + || memcmp(srv->hostname_dn, item->data.target, item->data_len) != 0) { + HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock); + break; + } + } + } + } + + /* Pick the first server listed in srvrq (those ones don't + * have hostname and are free to use) + */ + srv = NULL; + list_for_each_entry(srv, &srvrq->attached_servers, srv_rec_item) { + LIST_DEL_INIT(&srv->srv_rec_item); + HA_SPIN_LOCK(SERVER_LOCK, &srv->lock); + goto srv_found; + } + srv = NULL; + +srv_found: + /* And update this server, if found (srv is locked here) */ + if (srv) { + /* re-enable DNS resolution for this server by default */ + srv->flags &= ~SRV_F_NO_RESOLUTION; + srv->srvrq_check->expire = TICK_ETERNITY; + + srv->svc_port = item->port; + srv->flags &= ~SRV_F_MAPPORTS; + + /* Check if an Additional Record is associated to this SRV record. + * Perform some sanity checks too to ensure the record can be used. + * If all fine, we simply pick up the IP address found and associate + * it to the server. And DNS resolution is disabled for this server. 
+ */ + if ((item->ar_item != NULL) && + (item->ar_item->type == DNS_RTYPE_A || item->ar_item->type == DNS_RTYPE_AAAA)) + { + + switch (item->ar_item->type) { + case DNS_RTYPE_A: + srv_update_addr(srv, &item->ar_item->data.in4.sin_addr, AF_INET, "DNS additional record"); + break; + case DNS_RTYPE_AAAA: + srv_update_addr(srv, &item->ar_item->data.in6.sin6_addr, AF_INET6, "DNS additional record"); + break; + } + + srv->flags |= SRV_F_NO_RESOLUTION; + + /* Unlink A/AAAA resolution for this server if there is an AR item. + * It is usless to perform an extra resolution + */ + _resolv_unlink_resolution(srv->resolv_requester); + } + + if (!srv->hostname_dn) { + const char *msg = NULL; + char hostname[DNS_MAX_NAME_SIZE+1]; + + if (resolv_dn_label_to_str(item->data.target, item->data_len, + hostname, sizeof(hostname)) == -1) { + HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock); + continue; + } + msg = srv_update_fqdn(srv, hostname, "SRV record", 1); + if (msg) + send_log(srv->proxy, LOG_NOTICE, "%s", msg); + } + + if (!LIST_INLIST(&srv->srv_rec_item)) + LIST_APPEND(&item->attached_servers, &srv->srv_rec_item); + + if (!(srv->flags & SRV_F_NO_RESOLUTION)) { + /* If there is no AR item responsible of the FQDN resolution, + * trigger a dedicated DNS resolution + */ + if (!srv->resolv_requester || !srv->resolv_requester->resolution) + resolv_link_resolution(srv, OBJ_TYPE_SERVER, 1); + } + + /* Update the server status */ + srvrq_update_srv_status(srv, (srv->addr.ss_family != AF_INET && srv->addr.ss_family != AF_INET6)); + + if (!srv->resolv_opts.ignore_weight) { + char weight[9]; + int ha_weight; + + /* DNS weight range if from 0 to 65535 + * HAProxy weight is from 0 to 256 + * The rule below ensures that weight 0 is well respected + * while allowing a "mapping" from DNS weight into HAProxy's one. + */ + ha_weight = (item->weight + 255) / 256; + + snprintf(weight, sizeof(weight), "%d", ha_weight); + server_parse_weight_change_request(srv, weight); + } + HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock); + } + } + } +} + +/* Validates that the buffer DNS response provided in <resp> and finishing + * before <bufend> is valid from a DNS protocol point of view. + * + * The result is stored in <resolution>' response, buf_response, + * response_query_records and response_answer_records members. + * + * This function returns one of the RSLV_RESP_* code to indicate the type of + * error found. 
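+	 */
+
+/* Worked example, not from the patch, for the weight mapping above:
+ * ha_weight = (w + 255) / 256 is a ceiling division from the 0..65535 SRV
+ * weight range onto HAProxy's 0..256 range, so 0 stays 0 (server kept but
+ * never picked), 1..256 map to 1, 257..512 map to 2, and 65535 maps to 256.
+ */
+static inline int srv_weight_map_sketch(unsigned int dns_weight)
+{
+	return (dns_weight + 255) / 256;
+}
+
+/* end of sketch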
+ */ +static int resolv_validate_dns_response(unsigned char *resp, unsigned char *bufend, + struct resolv_resolution *resolution, int max_answer_records) +{ + unsigned char *reader; + char *previous_dname, tmpname[DNS_MAX_NAME_SIZE]; + int len, flags, offset; + int nb_saved_records; + struct resolv_query_item *query; + struct resolv_answer_item *answer_record, *tmp_record; + struct resolv_response *r_res; + struct eb32_node *eb32; + uint32_t key = 0; + int i, found = 0; + int cause = RSLV_RESP_ERROR; + + reader = resp; + len = 0; + previous_dname = NULL; + query = NULL; + answer_record = NULL; + + /* Initialization of response buffer and structure */ + r_res = &resolution->response; + + /* query id */ + if (reader + 2 >= bufend) + goto invalid_resp; + + r_res->header.id = reader[0] * 256 + reader[1]; + reader += 2; + + /* Flags and rcode are stored over 2 bytes + * First byte contains: + * - response flag (1 bit) + * - opcode (4 bits) + * - authoritative (1 bit) + * - truncated (1 bit) + * - recursion desired (1 bit) + */ + if (reader + 2 >= bufend) + goto invalid_resp; + + flags = reader[0] * 256 + reader[1]; + + if ((flags & DNS_FLAG_REPLYCODE) != DNS_RCODE_NO_ERROR) { + if ((flags & DNS_FLAG_REPLYCODE) == DNS_RCODE_NX_DOMAIN) { + cause = RSLV_RESP_NX_DOMAIN; + goto return_error; + } + else if ((flags & DNS_FLAG_REPLYCODE) == DNS_RCODE_REFUSED) { + cause = RSLV_RESP_REFUSED; + goto return_error; + } + else { + cause = RSLV_RESP_ERROR; + goto return_error; + } + } + + /* Move forward 2 bytes for flags */ + reader += 2; + + /* 2 bytes for question count */ + if (reader + 2 >= bufend) + goto invalid_resp; + r_res->header.qdcount = reader[0] * 256 + reader[1]; + /* (for now) we send one query only, so we expect only one in the + * response too */ + if (r_res->header.qdcount != 1) { + cause = RSLV_RESP_QUERY_COUNT_ERROR; + goto return_error; + } + + if (r_res->header.qdcount > DNS_MAX_QUERY_RECORDS) + goto invalid_resp; + reader += 2; + + /* 2 bytes for answer count */ + if (reader + 2 >= bufend) + goto invalid_resp; + r_res->header.ancount = reader[0] * 256 + reader[1]; + if (r_res->header.ancount == 0) { + cause = RSLV_RESP_ANCOUNT_ZERO; + goto return_error; + } + + /* Check if too many records are announced */ + if (r_res->header.ancount > max_answer_records) + goto invalid_resp; + reader += 2; + + /* 2 bytes authority count */ + if (reader + 2 >= bufend) + goto invalid_resp; + r_res->header.nscount = reader[0] * 256 + reader[1]; + reader += 2; + + /* 2 bytes additional count */ + if (reader + 2 >= bufend) + goto invalid_resp; + r_res->header.arcount = reader[0] * 256 + reader[1]; + reader += 2; + + /* Parsing dns queries. For now there is only one query and it exists + * because (qdcount == 1). + */ + query = &resolution->response_query_records[0]; + + /* Name is a NULL terminated string in our case, since we have + * one query per response and the first one can't be compressed + * (using the 0x0c format) */ + offset = 0; + len = resolv_read_name(resp, bufend, reader, query->name, DNS_MAX_NAME_SIZE, &offset, 0); + + if (len == 0) + goto invalid_resp; + + /* Now let's check the query's dname corresponds to the one we sent. 
*/ + if (len != resolution->hostname_dn_len || + memcmp(query->name, resolution->hostname_dn, resolution->hostname_dn_len) != 0) { + cause = RSLV_RESP_WRONG_NAME; + goto return_error; + } + + reader += offset; + previous_dname = query->name; + + /* move forward 2 bytes for question type */ + if (reader + 2 >= bufend) + goto invalid_resp; + query->type = reader[0] * 256 + reader[1]; + reader += 2; + + /* move forward 2 bytes for question class */ + if (reader + 2 >= bufend) + goto invalid_resp; + query->class = reader[0] * 256 + reader[1]; + reader += 2; + + /* TRUNCATED flag must be checked after we could read the query type + * because a TRUNCATED SRV query type response can still be exploited + */ + if (query->type != DNS_RTYPE_SRV && flags & DNS_FLAG_TRUNCATED) { + cause = RSLV_RESP_TRUNCATED; + goto return_error; + } + + /* now parsing response records */ + nb_saved_records = 0; + for (i = 0; i < r_res->header.ancount; i++) { + if (reader >= bufend) + goto invalid_resp; + + answer_record = pool_alloc(resolv_answer_item_pool); + if (answer_record == NULL) + goto invalid_resp; + + /* initialization */ + answer_record->ar_item = NULL; + answer_record->last_seen = TICK_ETERNITY; + LIST_INIT(&answer_record->attached_servers); + answer_record->link.node.leaf_p = NULL; + + offset = 0; + len = resolv_read_name(resp, bufend, reader, tmpname, DNS_MAX_NAME_SIZE, &offset, 0); + + if (len == 0) + goto invalid_resp; + + /* Check if the current record dname is valid. previous_dname + * points either to queried dname or last CNAME target */ + if (query->type != DNS_RTYPE_SRV && memcmp(previous_dname, tmpname, len) != 0) { + if (i == 0) { + /* First record, means a mismatch issue between + * queried dname and dname found in the first + * record */ + goto invalid_resp; + } + else { + /* If not the first record, this means we have a + * CNAME resolution error. + */ + cause = RSLV_RESP_CNAME_ERROR; + goto return_error; + } + + } + + memcpy(answer_record->name, tmpname, len); + answer_record->name[len] = 0; + + reader += offset; + if (reader >= bufend) + goto invalid_resp; + + /* 2 bytes for record type (A, AAAA, CNAME, etc...) */ + if (reader + 2 > bufend) + goto invalid_resp; + + answer_record->type = reader[0] * 256 + reader[1]; + reader += 2; + + /* 2 bytes for class (2) */ + if (reader + 2 > bufend) + goto invalid_resp; + + answer_record->class = reader[0] * 256 + reader[1]; + reader += 2; + + /* 4 bytes for ttl (4) */ + if (reader + 4 > bufend) + goto invalid_resp; + + answer_record->ttl = reader[0] * 16777216 + reader[1] * 65536 + + reader[2] * 256 + reader[3]; + reader += 4; + + /* Now reading data len */ + if (reader + 2 > bufend) + goto invalid_resp; + + answer_record->data_len = reader[0] * 256 + reader[1]; + + /* Move forward 2 bytes for data len */ + reader += 2; + + if (reader + answer_record->data_len > bufend) + goto invalid_resp; + + /* Analyzing record content */ + switch (answer_record->type) { + case DNS_RTYPE_A: + /* ipv4 is stored on 4 bytes */ + if (answer_record->data_len != 4) + goto invalid_resp; + + answer_record->data.in4.sin_family = AF_INET; + memcpy(&answer_record->data.in4.sin_addr, reader, answer_record->data_len); + key = XXH32(reader, answer_record->data_len, answer_record->type); + break; + + case DNS_RTYPE_CNAME: + /* Check if this is the last record and update the caller about the status: + * no IP could be found and last record was a CNAME. 
Could be triggered + * by a wrong query type + * + * + 1 because answer_record_id starts at 0 + * while number of answers is an integer and + * starts at 1. + */ + if (i + 1 == r_res->header.ancount) { + cause = RSLV_RESP_CNAME_ERROR; + goto return_error; + } + + offset = 0; + len = resolv_read_name(resp, bufend, reader, tmpname, DNS_MAX_NAME_SIZE, &offset, 0); + if (len == 0) + goto invalid_resp; + + memcpy(answer_record->data.target, tmpname, len); + answer_record->data.target[len] = 0; + key = XXH32(tmpname, len, answer_record->type); + previous_dname = answer_record->data.target; + break; + + + case DNS_RTYPE_SRV: + /* Answer must contain : + * - 2 bytes for the priority + * - 2 bytes for the weight + * - 2 bytes for the port + * - the target hostname + */ + if (answer_record->data_len <= 6) + goto invalid_resp; + + answer_record->priority = read_n16(reader); + reader += sizeof(uint16_t); + answer_record->weight = read_n16(reader); + reader += sizeof(uint16_t); + answer_record->port = read_n16(reader); + reader += sizeof(uint16_t); + offset = 0; + len = resolv_read_name(resp, bufend, reader, tmpname, DNS_MAX_NAME_SIZE, &offset, 0); + if (len == 0) + goto invalid_resp; + + answer_record->data_len = len; + memcpy(answer_record->data.target, tmpname, len); + answer_record->data.target[len] = 0; + key = XXH32(tmpname, len, answer_record->type); + if (answer_record->ar_item != NULL) { + pool_free(resolv_answer_item_pool, answer_record->ar_item); + answer_record->ar_item = NULL; + } + break; + + case DNS_RTYPE_AAAA: + /* ipv6 is stored on 16 bytes */ + if (answer_record->data_len != 16) + goto invalid_resp; + + answer_record->data.in6.sin6_family = AF_INET6; + memcpy(&answer_record->data.in6.sin6_addr, reader, answer_record->data_len); + key = XXH32(reader, answer_record->data_len, answer_record->type); + break; + + } /* switch (record type) */ + + /* Increment the counter for number of records saved into our + * local response */ + nb_saved_records++; + + /* Move forward answer_record->data_len for analyzing next + * record in the response */ + reader += ((answer_record->type == DNS_RTYPE_SRV) + ? 
offset + : answer_record->data_len); + + /* Lookup to see if we already had this entry */ + found = 0; + + for (eb32 = eb32_lookup(&r_res->answer_tree, key); eb32 != NULL; eb32 = eb32_next(eb32)) { + tmp_record = eb32_entry(eb32, typeof(*tmp_record), link); + if (tmp_record->type != answer_record->type) + continue; + + switch(tmp_record->type) { + case DNS_RTYPE_A: + if (!memcmp(&answer_record->data.in4.sin_addr, + &tmp_record->data.in4.sin_addr, + sizeof(answer_record->data.in4.sin_addr))) + found = 1; + break; + + case DNS_RTYPE_AAAA: + if (!memcmp(&answer_record->data.in6.sin6_addr, + &tmp_record->data.in6.sin6_addr, + sizeof(answer_record->data.in6.sin6_addr))) + found = 1; + break; + + case DNS_RTYPE_SRV: + if (answer_record->data_len == tmp_record->data_len && + memcmp(answer_record->data.target, tmp_record->data.target, answer_record->data_len) == 0 && + answer_record->port == tmp_record->port) { + tmp_record->weight = answer_record->weight; + found = 1; + } + break; + + default: + break; + } + + if (found == 1) + break; + } + + if (found == 1) { + tmp_record->last_seen = now_ms; + pool_free(resolv_answer_item_pool, answer_record); + answer_record = NULL; + } + else { + answer_record->last_seen = now_ms; + answer_record->ar_item = NULL; + answer_record->link.key = key; + eb32_insert(&r_res->answer_tree, &answer_record->link); + answer_record = NULL; + } + } /* for i 0 to ancount */ + + /* Save the number of records we really own */ + r_res->header.ancount = nb_saved_records; + + /* now parsing additional records for SRV queries only */ + if (query->type != DNS_RTYPE_SRV) + goto skip_parsing_additional_records; + + /* if we find Authority records, just skip them */ + for (i = 0; i < r_res->header.nscount; i++) { + offset = 0; + len = resolv_read_name(resp, bufend, reader, tmpname, DNS_MAX_NAME_SIZE, + &offset, 0); + if (len == 0) + continue; + + if (reader + offset + 10 >= bufend) + goto invalid_resp; + + reader += offset; + /* skip 2 bytes for class */ + reader += 2; + /* skip 2 bytes for type */ + reader += 2; + /* skip 4 bytes for ttl */ + reader += 4; + /* read data len */ + len = reader[0] * 256 + reader[1]; + reader += 2; + + if (reader + len >= bufend) + goto invalid_resp; + + reader += len; + } + + nb_saved_records = 0; + for (i = 0; i < r_res->header.arcount; i++) { + if (reader >= bufend) + goto invalid_resp; + + answer_record = pool_alloc(resolv_answer_item_pool); + if (answer_record == NULL) + goto invalid_resp; + answer_record->last_seen = TICK_ETERNITY; + LIST_INIT(&answer_record->attached_servers); + + offset = 0; + len = resolv_read_name(resp, bufend, reader, tmpname, DNS_MAX_NAME_SIZE, &offset, 0); + + if (len == 0) { + pool_free(resolv_answer_item_pool, answer_record); + answer_record = NULL; + continue; + } + + memcpy(answer_record->name, tmpname, len); + answer_record->name[len] = 0; + + reader += offset; + if (reader >= bufend) + goto invalid_resp; + + /* 2 bytes for record type (A, AAAA, CNAME, etc...) 
*/ + if (reader + 2 > bufend) + goto invalid_resp; + + answer_record->type = reader[0] * 256 + reader[1]; + reader += 2; + + /* 2 bytes for class (2) */ + if (reader + 2 > bufend) + goto invalid_resp; + + answer_record->class = reader[0] * 256 + reader[1]; + reader += 2; + + /* 4 bytes for ttl (4) */ + if (reader + 4 > bufend) + goto invalid_resp; + + answer_record->ttl = reader[0] * 16777216 + reader[1] * 65536 + + reader[2] * 256 + reader[3]; + reader += 4; + + /* Now reading data len */ + if (reader + 2 > bufend) + goto invalid_resp; + + answer_record->data_len = reader[0] * 256 + reader[1]; + + /* Move forward 2 bytes for data len */ + reader += 2; + + if (reader + answer_record->data_len > bufend) + goto invalid_resp; + + /* Analyzing record content */ + switch (answer_record->type) { + case DNS_RTYPE_A: + /* ipv4 is stored on 4 bytes */ + if (answer_record->data_len != 4) + goto invalid_resp; + + answer_record->data.in4.sin_family = AF_INET; + memcpy(&answer_record->data.in4.sin_addr, reader, answer_record->data_len); + break; + + case DNS_RTYPE_AAAA: + /* ipv6 is stored on 16 bytes */ + if (answer_record->data_len != 16) + goto invalid_resp; + + answer_record->data.in6.sin6_family = AF_INET6; + memcpy(&answer_record->data.in6.sin6_addr, reader, answer_record->data_len); + break; + + default: + pool_free(resolv_answer_item_pool, answer_record); + answer_record = NULL; + continue; + + } /* switch (record type) */ + + /* Increment the counter for number of records saved into our + * local response */ + nb_saved_records++; + + /* Move forward answer_record->data_len for analyzing next + * record in the response */ + reader += answer_record->data_len; + + /* Lookup to see if we already had this entry */ + found = 0; + + for (eb32 = eb32_first(&r_res->answer_tree); eb32 != NULL; eb32 = eb32_next(eb32)) { + struct resolv_answer_item *ar_item; + + tmp_record = eb32_entry(eb32, typeof(*tmp_record), link); + if (tmp_record->type != DNS_RTYPE_SRV || !tmp_record->ar_item) + continue; + + ar_item = tmp_record->ar_item; + if (ar_item->type != answer_record->type || ar_item->last_seen == now_ms || + len != tmp_record->data_len || + memcmp(answer_record->name, tmp_record->data.target, tmp_record->data_len) != 0) + continue; + + switch(ar_item->type) { + case DNS_RTYPE_A: + if (!memcmp(&answer_record->data.in4.sin_addr, + &ar_item->data.in4.sin_addr, + sizeof(answer_record->data.in4.sin_addr))) + found = 1; + break; + + case DNS_RTYPE_AAAA: + if (!memcmp(&answer_record->data.in6.sin6_addr, + &ar_item->data.in6.sin6_addr, + sizeof(answer_record->data.in6.sin6_addr))) + found = 1; + break; + + default: + break; + } + + if (found == 1) + break; + } + + if (found == 1) { + tmp_record->ar_item->last_seen = now_ms; + pool_free(resolv_answer_item_pool, answer_record); + answer_record = NULL; + } + else { + answer_record->last_seen = now_ms; + answer_record->ar_item = NULL; + + // looking for the SRV record in the response list linked to this additional record + for (eb32 = eb32_first(&r_res->answer_tree); eb32 != NULL; eb32 = eb32_next(eb32)) { + tmp_record = eb32_entry(eb32, typeof(*tmp_record), link); + + if (tmp_record->type == DNS_RTYPE_SRV && + tmp_record->ar_item == NULL && + memcmp(tmp_record->data.target, answer_record->name, tmp_record->data_len) == 0) { + /* Always use the received additional record to refresh info */ + pool_free(resolv_answer_item_pool, tmp_record->ar_item); + tmp_record->ar_item = answer_record; + answer_record = NULL; + break; + } + } + if (answer_record) { + 
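				/* no SRV record claimed this additional record: it is
+				 * useless on its own, so drop it */
+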
pool_free(resolv_answer_item_pool, answer_record); + answer_record = NULL; + } + } + } /* for i 0 to arcount */ + + skip_parsing_additional_records: + + /* Save the number of records we really own */ + r_res->header.arcount = nb_saved_records; + resolv_check_response(resolution); + return RSLV_RESP_VALID; + + invalid_resp: + cause = RSLV_RESP_INVALID; + + return_error: + pool_free(resolv_answer_item_pool, answer_record); + return cause; +} + +/* Searches dn_name resolution in resp. + * If existing IP not found, return the first IP matching family_priority, + * otherwise, first ip found + * The following tasks are the responsibility of the caller: + * - <r_res> contains an error free DNS response + * For both cases above, resolv_validate_dns_response is required + * returns one of the RSLV_UPD_* code + */ +int resolv_get_ip_from_response(struct resolv_response *r_res, + struct resolv_options *resolv_opts, void *currentip, + short currentip_sin_family, + void **newip, short *newip_sin_family, + struct server *owner) +{ + struct resolv_answer_item *record, *found_record = NULL; + struct eb32_node *eb32; + int family_priority; + int currentip_found; + unsigned char *newip4, *newip6; + int currentip_sel; + int j; + int score, max_score; + int allowed_duplicated_ip; + + /* srv is linked to an alive ip record */ + if (owner && LIST_INLIST(&owner->ip_rec_item)) + return RSLV_UPD_NO; + + family_priority = resolv_opts->family_prio; + allowed_duplicated_ip = resolv_opts->accept_duplicate_ip; + *newip = newip4 = newip6 = NULL; + currentip_found = 0; + *newip_sin_family = AF_UNSPEC; + max_score = -1; + + /* Select an IP regarding configuration preference. + * Top priority is the preferred network ip version, + * second priority is the preferred network. + * the last priority is the currently used IP, + * + * For these three priorities, a score is calculated. The + * weight are: + * 8 - preferred ip version. + * 4 - preferred network. + * 2 - if the ip in the record is not affected to any other server in the same backend (duplication) + * 1 - current ip. + * The result with the biggest score is returned. + */ + + for (eb32 = eb32_first(&r_res->answer_tree); eb32 != NULL; eb32 = eb32_next(eb32)) { + void *ip; + unsigned char ip_type; + + record = eb32_entry(eb32, typeof(*record), link); + if (record->type == DNS_RTYPE_A) { + ip_type = AF_INET; + ip = &record->data.in4.sin_addr; + } + else if (record->type == DNS_RTYPE_AAAA) { + ip_type = AF_INET6; + ip = &record->data.in6.sin6_addr; + } + else + continue; + score = 0; + + /* Check for preferred ip protocol. */ + if (ip_type == family_priority) + score += 8; + + /* Check for preferred network. */ + for (j = 0; j < resolv_opts->pref_net_nb; j++) { + + /* Compare only the same addresses class. */ + if (resolv_opts->pref_net[j].family != ip_type) + continue; + + if ((ip_type == AF_INET && + in_net_ipv4(ip, + &resolv_opts->pref_net[j].mask.in4, + &resolv_opts->pref_net[j].addr.in4)) || + (ip_type == AF_INET6 && + in_net_ipv6(ip, + &resolv_opts->pref_net[j].mask.in6, + &resolv_opts->pref_net[j].addr.in6))) { + score += 4; + break; + } + } + + /* Check if the IP found in the record is already affected to a + * member of a group. If not, the score should be incremented + * by 2. 
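		 * Illustrative example: with family_prio set to AF_INET, an IPv4
+		 * record inside a preferred network, not used by another server of
+		 * the same backend and matching the current address reaches the
+		 * maximum score of 8 + 4 + 2 + 1 = 15, which stops the scan early.
+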
		 */
+		if (owner) {
+			struct server *srv;
+			int already_used = 0;
+
+			list_for_each_entry(srv, &record->attached_servers, ip_rec_item) {
+				if (srv == owner)
+					continue;
+				if (srv->proxy == owner->proxy) {
+					already_used = 1;
+					break;
+				}
+			}
+			if (already_used) {
+				if (!allowed_duplicated_ip) {
+					continue;
+				}
+			}
+			else {
+				score += 2;
+			}
+		} else {
+			score += 2;
+		}
+
+		/* Check for current ip matching. */
+		if (ip_type == currentip_sin_family &&
+		    ((currentip_sin_family == AF_INET &&
+		      !memcmp(ip, currentip, 4)) ||
+		     (currentip_sin_family == AF_INET6 &&
+		      !memcmp(ip, currentip, 16)))) {
+			score++;
+			currentip_sel = 1;
+		}
+		else
+			currentip_sel = 0;
+
+		/* Keep the address if the score is better than the previous
+		 * score. The maximum score is 15; if this value is reached, we
+		 * stop parsing. Implicitly, when this score is reached, the
+		 * selected ip is the current one. */
+		if (score > max_score) {
+			if (ip_type == AF_INET)
+				newip4 = ip;
+			else
+				newip6 = ip;
+			found_record = record;
+			currentip_found = currentip_sel;
+			if (score == 15) {
+				/* this was not registered on the current record but it matches
+				 * let's fix it (it may come from the state file) */
+				if (owner)
+					LIST_APPEND(&found_record->attached_servers, &owner->ip_rec_item);
+				return RSLV_UPD_NO;
+			}
+			max_score = score;
+		}
+	} /* list for each record entries */
+
+	/* No IP found in the response */
+	if (!newip4 && !newip6)
+		return RSLV_UPD_NO_IP_FOUND;
+
+	/* Case when the caller looks first for an IPv4 address */
+	if (family_priority == AF_INET) {
+		if (newip4) {
+			*newip = newip4;
+			*newip_sin_family = AF_INET;
+		}
+		else if (newip6) {
+			*newip = newip6;
+			*newip_sin_family = AF_INET6;
+		}
+	}
+	/* Case when the caller looks first for an IPv6 address */
+	else if (family_priority == AF_INET6) {
+		if (newip6) {
+			*newip = newip6;
+			*newip_sin_family = AF_INET6;
+		}
+		else if (newip4) {
+			*newip = newip4;
+			*newip_sin_family = AF_INET;
+		}
+	}
+	/* Case when the caller has no preference (we prefer IPv6) */
+	else if (family_priority == AF_UNSPEC) {
+		if (newip6) {
+			*newip = newip6;
+			*newip_sin_family = AF_INET6;
+		}
+		else if (newip4) {
+			*newip = newip4;
+			*newip_sin_family = AF_INET;
+		}
+	}
+
+	/* the ip of this record was chosen for the server */
+	if (owner && found_record) {
+		LIST_DEL_INIT(&owner->ip_rec_item);
+		LIST_APPEND(&found_record->attached_servers, &owner->ip_rec_item);
+	}
+
+	eb32 = eb32_first(&r_res->answer_tree);
+	if (eb32) {
+		/* Move the first record to the end of the list, for internal
+		 * round robin.
+		 */
+		eb32_delete(eb32);
+		eb32_insert(&r_res->answer_tree, eb32);
+	}
+
+	return (currentip_found ? RSLV_UPD_NO : RSLV_UPD_SRVIP_NOT_FOUND);
+}
+
+/* Turns a domain name label into a string: 3www7haproxy3org into www.haproxy.org
+ *
+ * <dn> contains the input label, <dn_len> bytes long, and does not need to be
+ * null-terminated. <str> must be allocated large enough to contain a full host
+ * name plus the trailing zero, and the allocated size must be passed in
+ * <str_len>.
+ *
+ * In case of error, -1 is returned, otherwise, the number of bytes copied in
+ * <str> (including the terminating null byte).
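+ *
+ * For example (illustrative): the 16-byte input "\x03www\x07haproxy\x03org"
+ * decodes to "www.haproxy.org" and the function returns 16 (15 characters
+ * plus the terminating null byte).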
+ */ +int resolv_dn_label_to_str(const char *dn, int dn_len, char *str, int str_len) +{ + char *ptr; + int i, sz; + + if (str_len < dn_len) + return -1; + + ptr = str; + for (i = 0; i < dn_len; ++i) { + sz = dn[i]; + if (i) + *ptr++ = '.'; + /* copy the string at i+1 to lower case */ + for (; sz > 0; sz--) + *(ptr++) = tolower(dn[++i]); + } + *ptr++ = '\0'; + return (ptr - str); +} + +/* Turns a string into domain name label: www.haproxy.org into 3www7haproxy3org + * + * <str> contains the input string that is <str_len> bytes long (trailing zero + * not needed). <dn> buffer must be allocated large enough to contain the + * encoded string and a trailing zero, so it must be at least str_len+2, and + * this allocated buffer size must be passed in <dn_len>. + * + * In case of error, -1 is returned, otherwise, the number of bytes copied in + * <dn> (excluding the terminating null byte). + */ +int resolv_str_to_dn_label(const char *str, int str_len, char *dn, int dn_len) +{ + int i, offset; + + if (dn_len < str_len + 2) + return -1; + + /* First byte of dn will be used to store the length of the first + * label */ + offset = 0; + for (i = 0; i < str_len; ++i) { + if (str[i] == '.') { + /* 2 or more consecutive dots is invalid */ + if (i == offset) + return -1; + + /* ignore trailing dot */ + if (i + 1 == str_len) + break; + + dn[offset] = (i - offset); + offset = i+1; + continue; + } + dn[i+1] = tolower(str[i]); + } + dn[offset] = i - offset; + dn[i+1] = '\0'; + return i+1; +} + +/* Validates host name: + * - total size + * - each label size individually + * returns: + * 0 in case of error. If <err> is not NULL, an error message is stored there. + * 1 when no error. <err> is left unaffected. + */ +int resolv_hostname_validation(const char *string, char **err) +{ + int i; + + if (strlen(string) > DNS_MAX_NAME_SIZE) { + if (err) + *err = DNS_TOO_LONG_FQDN; + return 0; + } + + while (*string) { + i = 0; + while (*string && *string != '.' && i < DNS_MAX_LABEL_SIZE) { + if (!(*string == '-' || *string == '_' || + (*string >= 'a' && *string <= 'z') || + (*string >= 'A' && *string <= 'Z') || + (*string >= '0' && *string <= '9'))) { + if (err) + *err = DNS_INVALID_CHARACTER; + return 0; + } + i++; + string++; + } + + if (!(*string)) + break; + + if (*string != '.' && i >= DNS_MAX_LABEL_SIZE) { + if (err) + *err = DNS_LABEL_TOO_LONG; + return 0; + } + + string++; + } + return 1; +} + +/* Picks up an available resolution from the different resolution list + * associated to a resolvers section, in this order: + * 1. check in resolutions.curr for the same hostname and query_type + * 2. check in resolutions.wait for the same hostname and query_type + * 3. Get a new resolution from resolution pool + * + * Returns an available resolution, NULL if none found. 
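+ *
+ * (e.g. two servers resolving the same FQDN with the same query type end up
+ * sharing a single resolution rather than each querying the nameservers)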
+ */ +static struct resolv_resolution *resolv_pick_resolution(struct resolvers *resolvers, + char **hostname_dn, int hostname_dn_len, + int query_type) +{ + struct resolv_resolution *res; + + if (!*hostname_dn) + goto from_pool; + + /* Search for same hostname and query type in resolutions.curr */ + list_for_each_entry(res, &resolvers->resolutions.curr, list) { + if (!res->hostname_dn) + continue; + if ((query_type == res->prefered_query_type) && + hostname_dn_len == res->hostname_dn_len && + memcmp(*hostname_dn, res->hostname_dn, hostname_dn_len) == 0) + return res; + } + + /* Search for same hostname and query type in resolutions.wait */ + list_for_each_entry(res, &resolvers->resolutions.wait, list) { + if (!res->hostname_dn) + continue; + if ((query_type == res->prefered_query_type) && + hostname_dn_len == res->hostname_dn_len && + memcmp(*hostname_dn, res->hostname_dn, hostname_dn_len) == 0) + return res; + } + + from_pool: + /* No resolution could be found, so let's allocate a new one */ + res = pool_zalloc(resolv_resolution_pool); + if (res) { + res->resolvers = resolvers; + res->uuid = resolution_uuid; + res->status = RSLV_STATUS_NONE; + res->step = RSLV_STEP_NONE; + res->last_valid = now_ms; + + LIST_INIT(&res->requesters); + res->response.answer_tree = EB_ROOT; + + res->prefered_query_type = query_type; + res->query_type = query_type; + res->hostname_dn = *hostname_dn; + res->hostname_dn_len = hostname_dn_len; + + ++resolution_uuid; + + /* Move the resolution to the resolvers wait queue */ + LIST_APPEND(&resolvers->resolutions.wait, &res->list); + } + return res; +} + +/* deletes and frees all answer_items from the resolution's answer_list */ +static void resolv_purge_resolution_answer_records(struct resolv_resolution *resolution) +{ + struct eb32_node *eb32, *eb32_back; + struct resolv_answer_item *item; + + for (eb32 = eb32_first(&resolution->response.answer_tree); + eb32 && (eb32_back = eb32_next(eb32), 1); + eb32 = eb32_back) { + item = eb32_entry(eb32, typeof(*item), link); + eb32_delete(&item->link); + pool_free(resolv_answer_item_pool, item->ar_item); + pool_free(resolv_answer_item_pool, item); + } +} + +/* Releases a resolution from its requester(s) and move it back to the pool */ +static void resolv_free_resolution(struct resolv_resolution *resolution) +{ + struct resolv_requester *req, *reqback; + + /* clean up configuration */ + resolv_reset_resolution(resolution); + resolution->hostname_dn = NULL; + resolution->hostname_dn_len = 0; + + list_for_each_entry_safe(req, reqback, &resolution->requesters, list) { + LIST_DEL_INIT(&req->list); + req->resolution = NULL; + } + resolv_purge_resolution_answer_records(resolution); + + LIST_DEL_INIT(&resolution->list); + pool_free(resolv_resolution_pool, resolution); +} + +/* If *<req> is not NULL, returns it, otherwise tries to allocate a requester + * and makes it owned by this obj_type, with the proposed callback and error + * callback. On success, *req is assigned the allocated requester. Returns + * NULL on allocation failure. 
+ */ +static struct resolv_requester * +resolv_get_requester(struct resolv_requester **req, enum obj_type *owner, + int (*cb)(struct resolv_requester *, struct dns_counters *), + int (*err_cb)(struct resolv_requester *, int)) +{ + struct resolv_requester *tmp; + + if (*req) + return *req; + + tmp = pool_alloc(resolv_requester_pool); + if (!tmp) + goto end; + + LIST_INIT(&tmp->list); + tmp->owner = owner; + tmp->resolution = NULL; + tmp->requester_cb = cb; + tmp->requester_error_cb = err_cb; + *req = tmp; + end: + return tmp; +} + +/* Links a requester (a server or a resolv_srvrq) with a resolution. It returns 0 + * on success, -1 otherwise. + */ +int resolv_link_resolution(void *requester, int requester_type, int requester_locked) +{ + struct resolv_resolution *res = NULL; + struct resolv_requester *req; + struct resolvers *resolvers; + struct server *srv = NULL; + struct resolv_srvrq *srvrq = NULL; + struct stream *stream = NULL; + char **hostname_dn; + int hostname_dn_len, query_type; + + enter_resolver_code(); + switch (requester_type) { + case OBJ_TYPE_SERVER: + srv = (struct server *)requester; + + if (!requester_locked) + HA_SPIN_LOCK(SERVER_LOCK, &srv->lock); + + req = resolv_get_requester(&srv->resolv_requester, + &srv->obj_type, + snr_resolution_cb, + snr_resolution_error_cb); + + if (!requester_locked) + HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock); + + if (!req) + goto err; + + hostname_dn = &srv->hostname_dn; + hostname_dn_len = srv->hostname_dn_len; + resolvers = srv->resolvers; + query_type = ((srv->resolv_opts.family_prio == AF_INET) + ? DNS_RTYPE_A + : DNS_RTYPE_AAAA); + break; + + case OBJ_TYPE_SRVRQ: + srvrq = (struct resolv_srvrq *)requester; + + req = resolv_get_requester(&srvrq->requester, + &srvrq->obj_type, + snr_resolution_cb, + srvrq_resolution_error_cb); + if (!req) + goto err; + + hostname_dn = &srvrq->hostname_dn; + hostname_dn_len = srvrq->hostname_dn_len; + resolvers = srvrq->resolvers; + query_type = DNS_RTYPE_SRV; + break; + + case OBJ_TYPE_STREAM: + stream = (struct stream *)requester; + + req = resolv_get_requester(&stream->resolv_ctx.requester, + &stream->obj_type, + act_resolution_cb, + act_resolution_error_cb); + if (!req) + goto err; + + hostname_dn = &stream->resolv_ctx.hostname_dn; + hostname_dn_len = stream->resolv_ctx.hostname_dn_len; + resolvers = stream->resolv_ctx.parent->arg.resolv.resolvers; + query_type = ((stream->resolv_ctx.parent->arg.resolv.opts->family_prio == AF_INET) + ? DNS_RTYPE_A + : DNS_RTYPE_AAAA); + break; + default: + goto err; + } + + /* Get a resolution from the resolvers' wait queue or pool */ + if ((res = resolv_pick_resolution(resolvers, hostname_dn, hostname_dn_len, query_type)) == NULL) + goto err; + + req->resolution = res; + + LIST_APPEND(&res->requesters, &req->list); + leave_resolver_code(); + return 0; + + err: + if (res && LIST_ISEMPTY(&res->requesters)) + resolv_free_resolution(res); + leave_resolver_code(); + return -1; +} + +/* This function removes all server/srvrq references on answer items. 
 */
+void resolv_detach_from_resolution_answer_items(struct resolv_resolution *res, struct resolv_requester *req)
+{
+	struct eb32_node *eb32, *eb32_back;
+	struct resolv_answer_item *item;
+	struct server *srv, *srvback;
+	struct resolv_srvrq *srvrq;
+
+	enter_resolver_code();
+	if ((srv = objt_server(req->owner)) != NULL) {
+		LIST_DEL_INIT(&srv->ip_rec_item);
+	}
+	else if ((srvrq = objt_resolv_srvrq(req->owner)) != NULL) {
+		for (eb32 = eb32_first(&res->response.answer_tree);
+		     eb32 && (eb32_back = eb32_next(eb32), 1);
+		     eb32 = eb32_back) {
+			item = eb32_entry(eb32, typeof(*item), link);
+			if (item->type == DNS_RTYPE_SRV) {
+				list_for_each_entry_safe(srv, srvback, &item->attached_servers, srv_rec_item) {
+					if (srv->srvrq == srvrq)
+						resolv_srvrq_cleanup_srv(srv);
+				}
+			}
+		}
+	}
+	leave_resolver_code();
+}
+
+/* Removes a requester from a DNS resolution. It takes care of all the
+ * consequences. It also cleans up some parameters from the requester.
+ */
+static void _resolv_unlink_resolution(struct resolv_requester *requester)
+{
+	struct resolv_resolution *res;
+	struct resolv_requester *req;
+
+	/* Nothing to do */
+	if (!requester || !requester->resolution)
+		return;
+	res = requester->resolution;
+
+	/* Clean up the requester */
+	LIST_DEL_INIT(&requester->list);
+	requester->resolution = NULL;
+
+	/* remove ref from the resolution answer item list to the requester */
+	resolv_detach_from_resolution_answer_items(res, requester);
+
+	/* We need to find another requester linked on this resolution */
+	if (!LIST_ISEMPTY(&res->requesters))
+		req = LIST_NEXT(&res->requesters, struct resolv_requester *, list);
+	else {
+		abort_resolution(res);
+		return;
+	}
+
+	/* Move hostname_dn related pointers to the next requester */
+	switch (obj_type(req->owner)) {
+	case OBJ_TYPE_SERVER:
+		res->hostname_dn = __objt_server(req->owner)->hostname_dn;
+		res->hostname_dn_len = __objt_server(req->owner)->hostname_dn_len;
+		break;
+	case OBJ_TYPE_SRVRQ:
+		res->hostname_dn = __objt_resolv_srvrq(req->owner)->hostname_dn;
+		res->hostname_dn_len = __objt_resolv_srvrq(req->owner)->hostname_dn_len;
+		break;
+	case OBJ_TYPE_STREAM:
+		res->hostname_dn = __objt_stream(req->owner)->resolv_ctx.hostname_dn;
+		res->hostname_dn_len = __objt_stream(req->owner)->resolv_ctx.hostname_dn_len;
+		break;
+	default:
+		res->hostname_dn = NULL;
+		res->hostname_dn_len = 0;
+		break;
+	}
+}
+
+/* The public version of the function above that deals with the death row. */
+void resolv_unlink_resolution(struct resolv_requester *requester)
+{
+	enter_resolver_code();
+	_resolv_unlink_resolution(requester);
+	leave_resolver_code();
+}
+
+/* Called when a network IO is generated on a name server socket for an incoming
+ * packet.
It performs the following actions: + * - check if the packet requires processing (not outdated resolution) + * - ensure the DNS packet received is valid and call requester's callback + * - call requester's error callback if invalid response + * - check the dn_name in the packet against the one sent + */ +static int resolv_process_responses(struct dns_nameserver *ns) +{ + struct dns_counters *tmpcounters; + struct resolvers *resolvers; + struct resolv_resolution *res; + unsigned char buf[DNS_MAX_UDP_MESSAGE + 1]; + unsigned char *bufend; + int buflen, dns_resp; + int max_answer_records; + unsigned short query_id; + struct eb32_node *eb; + struct resolv_requester *req; + int keep_answer_items; + + resolvers = ns->parent; + enter_resolver_code(); + HA_SPIN_LOCK(DNS_LOCK, &resolvers->lock); + + /* process all pending input messages */ + while (1) { + /* read message received */ + memset(buf, '\0', resolvers->accepted_payload_size + 1); + if ((buflen = dns_recv_nameserver(ns, (void *)buf, sizeof(buf))) <= 0) { + break; + } + + /* message too big */ + if (buflen > resolvers->accepted_payload_size) { + ns->counters->app.resolver.too_big++; + continue; + } + + /* initializing variables */ + bufend = buf + buflen; /* pointer to mark the end of the buffer */ + + /* read the query id from the packet (16 bits) */ + if (buf + 2 > bufend) { + ns->counters->app.resolver.invalid++; + continue; + } + query_id = resolv_response_get_query_id(buf); + + /* search the query_id in the pending resolution tree */ + eb = eb32_lookup(&resolvers->query_ids, query_id); + if (eb == NULL) { + /* unknown query id means an outdated response and can be safely ignored */ + ns->counters->app.resolver.outdated++; + continue; + } + + /* known query id means a resolution in progress */ + res = eb32_entry(eb, struct resolv_resolution, qid); + /* number of responses received */ + res->nb_responses++; + + max_answer_records = (resolvers->accepted_payload_size - DNS_HEADER_SIZE) / DNS_MIN_RECORD_SIZE; + dns_resp = resolv_validate_dns_response(buf, bufend, res, max_answer_records); + + switch (dns_resp) { + case RSLV_RESP_VALID: + break; + + case RSLV_RESP_INVALID: + case RSLV_RESP_QUERY_COUNT_ERROR: + case RSLV_RESP_WRONG_NAME: + res->status = RSLV_STATUS_INVALID; + ns->counters->app.resolver.invalid++; + break; + + case RSLV_RESP_NX_DOMAIN: + res->status = RSLV_STATUS_NX; + ns->counters->app.resolver.nx++; + break; + + case RSLV_RESP_REFUSED: + res->status = RSLV_STATUS_REFUSED; + ns->counters->app.resolver.refused++; + break; + + case RSLV_RESP_ANCOUNT_ZERO: + res->status = RSLV_STATUS_OTHER; + ns->counters->app.resolver.any_err++; + break; + + case RSLV_RESP_CNAME_ERROR: + res->status = RSLV_STATUS_OTHER; + ns->counters->app.resolver.cname_error++; + break; + + case RSLV_RESP_TRUNCATED: + res->status = RSLV_STATUS_OTHER; + ns->counters->app.resolver.truncated++; + break; + + case RSLV_RESP_NO_EXPECTED_RECORD: + case RSLV_RESP_ERROR: + case RSLV_RESP_INTERNAL: + res->status = RSLV_STATUS_OTHER; + ns->counters->app.resolver.other++; + break; + } + + /* Wait all nameservers response to handle errors */ + if (dns_resp != RSLV_RESP_VALID && res->nb_responses < res->nb_queries) + continue; + + /* Process error codes */ + if (dns_resp != RSLV_RESP_VALID) { + if (res->prefered_query_type != res->query_type) { + /* The fallback on the query type was already performed, + * so check the try counter. If it falls to 0, we can + * report an error. Else, wait the next attempt. 
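			 * (e.g. an A query that already fell back to AAAA is only
+			 * reported as an error once its retry budget is exhausted)
+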
			 */
+				if (!res->try)
+					goto report_res_error;
+			}
+			else {
+				/* Fallback from A to AAAA or the opposite and re-send
+				 * the resolution immediately. try counter is not
+				 * decremented. */
+				if (res->prefered_query_type == DNS_RTYPE_A) {
+					res->query_type = DNS_RTYPE_AAAA;
+					resolv_send_query(res);
+				}
+				else if (res->prefered_query_type == DNS_RTYPE_AAAA) {
+					res->query_type = DNS_RTYPE_A;
+					resolv_send_query(res);
+				}
+			}
+			continue;
+		}
+
+		/* So the resolution succeeded */
+		res->status = RSLV_STATUS_VALID;
+		res->last_valid = now_ms;
+		ns->counters->app.resolver.valid++;
+		goto report_res_success;
+
+	report_res_error:
+		keep_answer_items = 0;
+		list_for_each_entry(req, &res->requesters, list)
+			keep_answer_items |= req->requester_error_cb(req, dns_resp);
+		if (!keep_answer_items)
+			resolv_purge_resolution_answer_records(res);
+		resolv_reset_resolution(res);
+		LIST_DEL_INIT(&res->list);
+		LIST_APPEND(&resolvers->resolutions.wait, &res->list);
+		continue;
+
+	report_res_success:
+		/* Only the first requester is managed by the server, others are
+		 * from the cache */
+		tmpcounters = ns->counters;
+		list_for_each_entry(req, &res->requesters, list) {
+			struct server *s = objt_server(req->owner);
+
+			if (s)
+				HA_SPIN_LOCK(SERVER_LOCK, &s->lock);
+			req->requester_cb(req, tmpcounters);
+			if (s)
+				HA_SPIN_UNLOCK(SERVER_LOCK, &s->lock);
+			tmpcounters = NULL;
+		}
+
+		resolv_reset_resolution(res);
+		LIST_DEL_INIT(&res->list);
+		LIST_APPEND(&resolvers->resolutions.wait, &res->list);
+		continue;
+	}
+	resolv_update_resolvers_timeout(resolvers);
+	HA_SPIN_UNLOCK(DNS_LOCK, &resolvers->lock);
+	leave_resolver_code();
+	return buflen;
+}
+
+/* Processes DNS resolution. First, it checks the active list to detect expired
+ * resolutions and retry them if possible. Else a timeout is reported. Then, it
+ * checks the wait list to trigger new resolutions.
+ */
+struct task *process_resolvers(struct task *t, void *context, unsigned int state)
+{
+	struct resolvers *resolvers = context;
+	struct resolv_resolution *res, *resback;
+	int exp;
+
+	enter_resolver_code();
+	HA_SPIN_LOCK(DNS_LOCK, &resolvers->lock);
+
+	/* Handle all expired resolutions from the active list. Elements that
+	 * need to be removed will in fact be moved to the death_row. Other
+	 * ones will be handled normally.
+	 */
+
+	res = LIST_NEXT(&resolvers->resolutions.curr, struct resolv_resolution *, list);
+	while (&res->list != &resolvers->resolutions.curr) {
+		resback = LIST_NEXT(&res->list, struct resolv_resolution *, list);
+
+		if (LIST_ISEMPTY(&res->requesters)) {
+			abort_resolution(res);
+			res = resback;
+			continue;
+		}
+
+		/* When we find the first resolution in the future, then we can
+		 * stop here */
+		exp = tick_add(res->last_query, resolvers->timeout.retry);
+		if (!tick_is_expired(exp, now_ms))
+			break;
+
+		/* If current resolution has been tried too many times and
+		 * finishes in timeout we update its status and remove it from
+		 * the list */
+		if (!res->try) {
+			struct resolv_requester *req;
+			int keep_answer_items = 0;
+
+			/* Notify the result to the requesters */
+			if (!res->nb_responses)
+				res->status = RSLV_STATUS_TIMEOUT;
+			list_for_each_entry(req, &res->requesters, list)
+				keep_answer_items |= req->requester_error_cb(req, res->status);
+			if (!keep_answer_items)
+				resolv_purge_resolution_answer_records(res);
+
+			/* Clean up resolution info and remove it from the
+			 * current list */
+			resolv_reset_resolution(res);
+
+			/* subsequent entries might have been deleted here */
+			resback = LIST_NEXT(&res->list, struct resolv_resolution *, list);
+			LIST_DEL_INIT(&res->list);
+			LIST_APPEND(&resolvers->resolutions.wait, &res->list);
+			res = resback;
+		}
+		else {
+			/* Otherwise resend the DNS query and requeue the resolution */
+			if (!res->nb_responses || res->prefered_query_type != res->query_type) {
+				/* No response received (a real timeout) or fallback already done */
+				res->query_type = res->prefered_query_type;
+				res->try--;
+			}
+			else {
+				/* Fallback from A to AAAA or the opposite and re-send
+				 * the resolution immediately. try counter is not
+				 * decremented. */
+				if (res->prefered_query_type == DNS_RTYPE_A)
+					res->query_type = DNS_RTYPE_AAAA;
+				else if (res->prefered_query_type == DNS_RTYPE_AAAA)
+					res->query_type = DNS_RTYPE_A;
+				else
+					res->try--;
+			}
+			resolv_send_query(res);
+			resback = LIST_NEXT(&res->list, struct resolv_resolution *, list);
+			res = resback;
+		}
+	}
+
+	/* Handle all resolutions in the wait list */
+	list_for_each_entry_safe(res, resback, &resolvers->resolutions.wait, list) {
+
+		if (unlikely(stopping)) {
+			/* If haproxy is stopping, check the resolution to know if it must be run or not.
+			 * If at least one requester is a stream (because of a do-resolv action) or if there
+			 * is a requester attached to a running proxy, the resolution is performed.
+			 * Otherwise, it is skipped for now.
+			 */
+			struct resolv_requester *req;
+			int must_run = 0;
+
+			list_for_each_entry(req, &res->requesters, list) {
+				struct proxy *px = NULL;
+
+				switch (obj_type(req->owner)) {
+				case OBJ_TYPE_SERVER:
+					px = __objt_server(req->owner)->proxy;
+					break;
+				case OBJ_TYPE_SRVRQ:
+					px = __objt_resolv_srvrq(req->owner)->proxy;
+					break;
+				case OBJ_TYPE_STREAM:
+					/* Always perform the resolution */
+					must_run = 1;
+					break;
+				default:
+					break;
+				}
+				/* Perform the resolution if the proxy is not stopped or disabled */
+				if (px && !(px->flags & (PR_FL_DISABLED|PR_FL_STOPPED)))
+					must_run = 1;
+
+				if (must_run)
+					break;
+			}
+
+			if (!must_run) {
+				/* Skip the resolution.
reset it and wait for the next wakeup */ + resolv_reset_resolution(res); + continue; + } + } + + if (LIST_ISEMPTY(&res->requesters)) { + abort_resolution(res); + continue; + } + + exp = tick_add(res->last_resolution, resolv_resolution_timeout(res)); + if (tick_isset(res->last_resolution) && !tick_is_expired(exp, now_ms)) + continue; + + if (resolv_run_resolution(res) != 1) { + res->last_resolution = now_ms; + LIST_DEL_INIT(&res->list); + LIST_APPEND(&resolvers->resolutions.wait, &res->list); + } + } + + resolv_update_resolvers_timeout(resolvers); + HA_SPIN_UNLOCK(DNS_LOCK, &resolvers->lock); + + if (unlikely(stopping)) { + struct dns_nameserver *ns; + + if (LIST_ISEMPTY(&resolvers->resolutions.curr)) + t->expire = TICK_ETERNITY; + + list_for_each_entry(ns, &resolvers->nameservers, list) { + if (ns->stream) + task_wakeup(ns->stream->task_idle, TASK_WOKEN_MSG); + } + } + + /* now we can purge all queued deletions */ + leave_resolver_code(); + return t; +} + + +/* destroy a resolvers */ +static void resolvers_destroy(struct resolvers *resolvers) +{ + struct dns_nameserver *ns, *nsback; + struct resolv_resolution *res, *resback; + struct resolv_requester *req, *reqback; + + list_for_each_entry_safe(ns, nsback, &resolvers->nameservers, list) { + free(ns->id); + free((char *)ns->conf.file); + if (ns->dgram) { + if (ns->dgram->conn.t.sock.fd != -1) { + fd_delete(ns->dgram->conn.t.sock.fd); + close(ns->dgram->conn.t.sock.fd); + } + ring_free(ns->dgram->ring_req); + free(ns->dgram); + } + if (ns->stream) { + ring_free(ns->stream->ring_req); + task_destroy(ns->stream->task_req); + task_destroy(ns->stream->task_rsp); + free(ns->stream); + } + LIST_DEL_INIT(&ns->list); + EXTRA_COUNTERS_FREE(ns->extra_counters); + free(ns); + } + + list_for_each_entry_safe(res, resback, &resolvers->resolutions.curr, list) { + list_for_each_entry_safe(req, reqback, &res->requesters, list) { + LIST_DEL_INIT(&req->list); + pool_free(resolv_requester_pool, req); + } + resolv_free_resolution(res); + } + + list_for_each_entry_safe(res, resback, &resolvers->resolutions.wait, list) { + list_for_each_entry_safe(req, reqback, &res->requesters, list) { + LIST_DEL_INIT(&req->list); + pool_free(resolv_requester_pool, req); + } + resolv_free_resolution(res); + } + + free_proxy(resolvers->px); + free(resolvers->id); + free((char *)resolvers->conf.file); + task_destroy(resolvers->t); + LIST_DEL_INIT(&resolvers->list); + free(resolvers); +} + +/* Release memory allocated by DNS */ +static void resolvers_deinit(void) +{ + struct resolvers *resolvers, *resolversback; + struct resolv_srvrq *srvrq, *srvrqback; + + list_for_each_entry_safe(resolvers, resolversback, &sec_resolvers, list) { + resolvers_destroy(resolvers); + } + + list_for_each_entry_safe(srvrq, srvrqback, &resolv_srvrq_list, list) { + free(srvrq->name); + free(srvrq->hostname_dn); + LIST_DEL_INIT(&srvrq->list); + free(srvrq); + } +} + +/* Finalizes the DNS configuration by allocating required resources and checking + * live parameters. + * Returns 0 on success, 1 on error. 
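+ *
+ * Note that nameserver connectivity problems only abort startup for
+ * explicitly configured resolvers sections; implicitly created ones (e.g.
+ * a section built from /etc/resolv.conf) are merely skipped, as handled
+ * below.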
+ */ +static int resolvers_finalize_config(void) +{ + struct resolvers *resolvers; + struct proxy *px; + int err_code = 0; + + enter_resolver_code(); + + /* allocate pool of resolution per resolvers */ + list_for_each_entry(resolvers, &sec_resolvers, list) { + struct dns_nameserver *ns; + struct task *t; + + /* Check if we can create the socket with nameservers info */ + list_for_each_entry(ns, &resolvers->nameservers, list) { + int fd; + + if (ns->dgram) { + /* Check nameserver info */ + if ((fd = socket(ns->dgram->conn.addr.to.ss_family, SOCK_DGRAM, IPPROTO_UDP)) == -1) { + if (!resolvers->conf.implicit) { /* emit a warning only if it was configured manually */ + ha_alert("resolvers '%s': can't create socket for nameserver '%s'.\n", + resolvers->id, ns->id); + err_code |= (ERR_ALERT|ERR_ABORT); + } + continue; + } + if (connect(fd, (struct sockaddr*)&ns->dgram->conn.addr.to, get_addr_len(&ns->dgram->conn.addr.to)) == -1) { + if (!resolvers->conf.implicit) { /* emit a warning only if it was configured manually */ + ha_warning("resolvers '%s': can't connect socket for nameserver '%s'.\n", + resolvers->id, ns->id); + } + close(fd); + err_code |= ERR_WARN; + continue; + } + close(fd); + } + } + + /* Create the task associated to the resolvers section */ + if ((t = task_new_anywhere()) == NULL) { + ha_alert("resolvers '%s' : out of memory.\n", resolvers->id); + err_code |= (ERR_ALERT|ERR_ABORT); + goto err; + } + + /* Update task's parameters */ + t->process = process_resolvers; + t->context = resolvers; + resolvers->t = t; + task_wakeup(t, TASK_WOKEN_INIT); + } + + for (px = proxies_list; px; px = px->next) { + struct server *srv; + + if (px->flags & PR_FL_DISABLED) { + /* must not run and will not work anyway since + * nothing in the proxy is initialized. 
+ */ + continue; + } + + for (srv = px->srv; srv; srv = srv->next) { + struct resolvers *resolvers; + + if (!srv->resolvers_id) + continue; + + if ((resolvers = find_resolvers_by_id(srv->resolvers_id)) == NULL) { + ha_alert("%s '%s', server '%s': unable to find required resolvers '%s'\n", + proxy_type_str(px), px->id, srv->id, srv->resolvers_id); + err_code |= (ERR_ALERT|ERR_ABORT); + continue; + } + srv->resolvers = resolvers; + srv->srvrq_check = NULL; + if (srv->srvrq) { + if (!srv->srvrq->resolvers) { + srv->srvrq->resolvers = srv->resolvers; + if (resolv_link_resolution(srv->srvrq, OBJ_TYPE_SRVRQ, 0) == -1) { + ha_alert("%s '%s' : unable to set DNS resolution for server '%s'.\n", + proxy_type_str(px), px->id, srv->id); + err_code |= (ERR_ALERT|ERR_ABORT); + continue; + } + } + + srv->srvrq_check = task_new_anywhere(); + if (!srv->srvrq_check) { + ha_alert("%s '%s' : unable to create SRVRQ task for server '%s'.\n", + proxy_type_str(px), px->id, srv->id); + err_code |= (ERR_ALERT|ERR_ABORT); + goto err; + } + srv->srvrq_check->process = resolv_srvrq_expire_task; + srv->srvrq_check->context = srv; + srv->srvrq_check->expire = TICK_ETERNITY; + } + else if (resolv_link_resolution(srv, OBJ_TYPE_SERVER, 0) == -1) { + ha_alert("%s '%s', unable to set DNS resolution for server '%s'.\n", + proxy_type_str(px), px->id, srv->id); + err_code |= (ERR_ALERT|ERR_ABORT); + continue; + } + + srv->flags |= SRV_F_NON_PURGEABLE; + } + } + + if (err_code & (ERR_ALERT|ERR_ABORT)) + goto err; + + leave_resolver_code(); + return 0; + err: + leave_resolver_code(); + resolvers_deinit(); + return 1; + +} + +static int stats_dump_resolv_to_buffer(struct stconn *sc, + struct dns_nameserver *ns, + struct field *stats, size_t stats_count, + struct list *stat_modules) +{ + struct appctx *appctx = __sc_appctx(sc); + struct stats_module *mod; + size_t idx = 0; + + memset(stats, 0, sizeof(struct field) * stats_count); + + list_for_each_entry(mod, stat_modules, list) { + struct counters_node *counters = EXTRA_COUNTERS_GET(ns->extra_counters, mod); + + mod->fill_stats(counters, stats + idx); + idx += mod->stats_count; + } + + if (!stats_dump_one_line(stats, idx, appctx)) + return 0; + + if (!stats_putchk(appctx, NULL)) + goto full; + + return 1; + + full: + return 0; +} + +/* Uses <appctx.ctx.stats.obj1> as a pointer to the current resolver and <obj2> + * as a pointer to the current nameserver. + */ +int stats_dump_resolvers(struct stconn *sc, + struct field *stats, size_t stats_count, + struct list *stat_modules) +{ + struct appctx *appctx = __sc_appctx(sc); + struct show_stat_ctx *ctx = appctx->svcctx; + struct channel *rep = sc_ic(sc); + struct resolvers *resolver = ctx->obj1; + struct dns_nameserver *ns = ctx->obj2; + + if (!resolver) + resolver = LIST_NEXT(&sec_resolvers, struct resolvers *, list); + + /* dump resolvers */ + list_for_each_entry_from(resolver, &sec_resolvers, list) { + ctx->obj1 = resolver; + + ns = ctx->obj2 ? 
+ ctx->obj2 : + LIST_NEXT(&resolver->nameservers, struct dns_nameserver *, list); + + list_for_each_entry_from(ns, &resolver->nameservers, list) { + ctx->obj2 = ns; + + if (buffer_almost_full(&rep->buf)) { + sc_need_room(sc, b_size(&rep->buf) / 2); + goto full; + } + + if (!stats_dump_resolv_to_buffer(sc, ns, + stats, stats_count, + stat_modules)) { + return 0; + } + } + + ctx->obj2 = NULL; + } + + return 1; + + full: + return 0; +} + +void resolv_stats_clear_counters(int clrall, struct list *stat_modules) +{ + struct resolvers *resolvers; + struct dns_nameserver *ns; + struct stats_module *mod; + void *counters; + + list_for_each_entry(mod, stat_modules, list) { + if (!mod->clearable && !clrall) + continue; + + list_for_each_entry(resolvers, &sec_resolvers, list) { + list_for_each_entry(ns, &resolvers->nameservers, list) { + counters = EXTRA_COUNTERS_GET(ns->extra_counters, mod); + memcpy(counters, mod->counters, mod->counters_size); + } + } + } + +} + +int resolv_allocate_counters(struct list *stat_modules) +{ + struct stats_module *mod; + struct resolvers *resolvers; + struct dns_nameserver *ns; + + list_for_each_entry(resolvers, &sec_resolvers, list) { + list_for_each_entry(ns, &resolvers->nameservers, list) { + EXTRA_COUNTERS_REGISTER(&ns->extra_counters, COUNTERS_RSLV, + alloc_failed); + + list_for_each_entry(mod, stat_modules, list) { + EXTRA_COUNTERS_ADD(mod, + ns->extra_counters, + mod->counters, + mod->counters_size); + } + + EXTRA_COUNTERS_ALLOC(ns->extra_counters, alloc_failed); + + list_for_each_entry(mod, stat_modules, list) { + memcpy(ns->extra_counters->data + mod->counters_off[ns->extra_counters->type], + mod->counters, mod->counters_size); + + /* Store the ns counters pointer */ + if (strcmp(mod->name, "resolvers") == 0) { + ns->counters = (struct dns_counters *)ns->extra_counters->data + mod->counters_off[COUNTERS_RSLV]; + ns->counters->id = ns->id; + ns->counters->pid = resolvers->id; + } + } + } + } + + return 1; + +alloc_failed: + return 0; +} + +/* if an arg is found, it sets the optional resolvers section pointer into a + * show_resolvers_ctx struct pointed to by svcctx, or NULL when dumping all. + */ +static int cli_parse_stat_resolvers(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_resolvers_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + struct resolvers *presolvers; + + if (*args[2]) { + list_for_each_entry(presolvers, &sec_resolvers, list) { + if (strcmp(presolvers->id, args[2]) == 0) { + ctx->forced_section = presolvers; + break; + } + } + if (ctx->forced_section == NULL) + return cli_err(appctx, "Can't find that resolvers section\n"); + } + return 0; +} + +/* Dumps counters from all resolvers section and associated name servers. It + * returns 0 if the output buffer is full and it needs to be called again, + * otherwise non-zero. It may limit itself to the resolver pointed to by the + * <resolvers> field of struct show_resolvers_ctx pointed to by <svcctx> if + * it's not null. 
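+ *
+ * (this backs the CLI command "show resolvers [id]": without an argument
+ * every section is dumped, otherwise only the designated one)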
+ */ +static int cli_io_handler_dump_resolvers_to_buffer(struct appctx *appctx) +{ + struct show_resolvers_ctx *ctx = appctx->svcctx; + struct resolvers *resolvers = ctx->resolvers; + struct dns_nameserver *ns; + + chunk_reset(&trash); + + if (LIST_ISEMPTY(&sec_resolvers)) { + if (applet_putstr(appctx, "No resolvers found\n") == -1) + goto full; + } + else { + if (!resolvers) + resolvers = LIST_ELEM(sec_resolvers.n, typeof(resolvers), list); + + list_for_each_entry_from(resolvers, &sec_resolvers, list) { + if (ctx->forced_section != NULL && ctx->forced_section != resolvers) + continue; + + ctx->resolvers = resolvers; + ns = ctx->ns; + + if (!ns) { + chunk_printf(&trash, "Resolvers section %s\n", resolvers->id); + if (applet_putchk(appctx, &trash) == -1) + goto full; + + ns = LIST_ELEM(resolvers->nameservers.n, typeof(ns), list); + ctx->ns = ns; + } + + list_for_each_entry_from(ns, &resolvers->nameservers, list) { + chunk_reset(&trash); + chunk_appendf(&trash, " nameserver %s:\n", ns->id); + chunk_appendf(&trash, " sent: %lld\n", ns->counters->sent); + chunk_appendf(&trash, " snd_error: %lld\n", ns->counters->snd_error); + chunk_appendf(&trash, " valid: %lld\n", ns->counters->app.resolver.valid); + chunk_appendf(&trash, " update: %lld\n", ns->counters->app.resolver.update); + chunk_appendf(&trash, " cname: %lld\n", ns->counters->app.resolver.cname); + chunk_appendf(&trash, " cname_error: %lld\n", ns->counters->app.resolver.cname_error); + chunk_appendf(&trash, " any_err: %lld\n", ns->counters->app.resolver.any_err); + chunk_appendf(&trash, " nx: %lld\n", ns->counters->app.resolver.nx); + chunk_appendf(&trash, " timeout: %lld\n", ns->counters->app.resolver.timeout); + chunk_appendf(&trash, " refused: %lld\n", ns->counters->app.resolver.refused); + chunk_appendf(&trash, " other: %lld\n", ns->counters->app.resolver.other); + chunk_appendf(&trash, " invalid: %lld\n", ns->counters->app.resolver.invalid); + chunk_appendf(&trash, " too_big: %lld\n", ns->counters->app.resolver.too_big); + chunk_appendf(&trash, " truncated: %lld\n", ns->counters->app.resolver.truncated); + chunk_appendf(&trash, " outdated: %lld\n", ns->counters->app.resolver.outdated); + if (applet_putchk(appctx, &trash) == -1) + goto full; + ctx->ns = ns; + } + + ctx->ns = NULL; + + /* was this the only section to dump ? */ + if (ctx->forced_section) + break; + } + } + + /* done! */ + return 1; + full: + /* the output buffer is full, retry later */ + return 0; +} + +/* register cli keywords */ +static struct cli_kw_list cli_kws = {{ }, { + { { "show", "resolvers", NULL }, "show resolvers [id] : dumps counters from all resolvers section and associated name servers", + cli_parse_stat_resolvers, cli_io_handler_dump_resolvers_to_buffer }, + {{},} + } +}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + +/* + * Prepare <rule> for hostname resolution. + * Returns -1 in case of any allocation failure, 0 if not. + * On error, a global failure counter is also incremented. 
+ */ +static int action_prepare_for_resolution(struct stream *stream, const char *hostname, int hostname_len) +{ + char *hostname_dn; + int hostname_dn_len; + struct buffer *tmp = get_trash_chunk(); + + if (!hostname) + return 0; + + hostname_dn = tmp->area; + hostname_dn_len = resolv_str_to_dn_label(hostname, hostname_len, + hostname_dn, tmp->size); + if (hostname_dn_len == -1) + goto err; + + + stream->resolv_ctx.hostname_dn = strdup(hostname_dn); + stream->resolv_ctx.hostname_dn_len = hostname_dn_len; + if (!stream->resolv_ctx.hostname_dn) + goto err; + + return 0; + + err: + ha_free(&stream->resolv_ctx.hostname_dn); + resolv_failed_resolutions += 1; + return -1; +} + + +/* + * Execute the "do-resolution" action. May be called from {tcp,http}request. + */ +enum act_return resolv_action_do_resolve(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct resolv_resolution *resolution; + struct sample *smp; + struct resolv_requester *req; + struct resolvers *resolvers; + struct resolv_resolution *res; + int exp, locked = 0; + enum act_return ret = ACT_RET_CONT; + + resolvers = rule->arg.resolv.resolvers; + + enter_resolver_code(); + + /* we have a response to our DNS resolution */ + use_cache: + if (s->resolv_ctx.requester && s->resolv_ctx.requester->resolution != NULL) { + resolution = s->resolv_ctx.requester->resolution; + if (!locked) { + HA_SPIN_LOCK(DNS_LOCK, &resolvers->lock); + locked = 1; + } + + if (resolution->step == RSLV_STEP_RUNNING) + goto yield; + if (resolution->step == RSLV_STEP_NONE) { + /* We update the variable only if we have a valid + * response. If the response was not received yet, we + * must yield. + */ + if (resolution->status == RSLV_STATUS_NONE) + goto yield; + if (resolution->status == RSLV_STATUS_VALID) { + struct sample smp; + short ip_sin_family = 0; + void *ip = NULL; + + resolv_get_ip_from_response(&resolution->response, rule->arg.resolv.opts, NULL, + 0, &ip, &ip_sin_family, NULL); + + switch (ip_sin_family) { + case AF_INET: + smp.data.type = SMP_T_IPV4; + memcpy(&smp.data.u.ipv4, ip, 4); + break; + case AF_INET6: + smp.data.type = SMP_T_IPV6; + memcpy(&smp.data.u.ipv6, ip, 16); + break; + default: + ip = NULL; + } + + if (ip) { + smp.px = px; + smp.sess = sess; + smp.strm = s; + + vars_set_by_name(rule->arg.resolv.varname, strlen(rule->arg.resolv.varname), &smp); + } + } + } + + goto release_requester; + } + + /* need to configure and start a new DNS resolution */ + smp = sample_fetch_as_type(px, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL, rule->arg.resolv.expr, SMP_T_STR); + if (smp == NULL) + goto end; + + if (action_prepare_for_resolution(s, smp->data.u.str.area, smp->data.u.str.data) == -1) + goto end; /* on error, ignore the action */ + + s->resolv_ctx.parent = rule; + + HA_SPIN_LOCK(DNS_LOCK, &resolvers->lock); + locked = 1; + + resolv_link_resolution(s, OBJ_TYPE_STREAM, 0); + + /* Check if there is a fresh enough response in the cache of our associated resolution */ + req = s->resolv_ctx.requester; + if (!req || !req->resolution) + goto release_requester; /* on error, ignore the action */ + res = req->resolution; + + exp = tick_add(res->last_resolution, resolvers->hold.valid); + if (resolvers->t && res->status == RSLV_STATUS_VALID && tick_isset(res->last_resolution) + && !tick_is_expired(exp, now_ms)) { + goto use_cache; + } + + resolv_trigger_resolution(s->resolv_ctx.requester); + + yield: + if (flags & ACT_OPT_FINAL) + goto release_requester; + ret = ACT_RET_YIELD; + + end: + leave_resolver_code(); + 
if (locked)
+		HA_SPIN_UNLOCK(DNS_LOCK, &resolvers->lock);
+	return ret;
+
+  release_requester:
+	ha_free(&s->resolv_ctx.hostname_dn);
+	s->resolv_ctx.hostname_dn_len = 0;
+	if (s->resolv_ctx.requester) {
+		_resolv_unlink_resolution(s->resolv_ctx.requester);
+		pool_free(resolv_requester_pool, s->resolv_ctx.requester);
+		s->resolv_ctx.requester = NULL;
+	}
+	goto end;
+}
+
+static void release_resolv_action(struct act_rule *rule)
+{
+	release_sample_expr(rule->arg.resolv.expr);
+	free(rule->arg.resolv.varname);
+	free(rule->arg.resolv.resolvers_id);
+	free(rule->arg.resolv.opts);
+}
+
+
+/* parse "do-resolve" action
+ * This action takes the following arguments:
+ *   do-resolve(<varName>,<resolversSectionName>,<resolvePrefer>) <expr>
+ *
+ *   - <varName> is the variable name where the result of the DNS resolution will be stored
+ *     (mandatory)
+ *   - <resolversSectionName> is the name of the resolvers section to use to perform the resolution
+ *     (mandatory)
+ *   - <resolvePrefer> can be either 'ipv4' or 'ipv6' and is the IP family we would like to resolve first
+ *     (optional), defaults to ipv6
+ *   - <expr> is an HAProxy expression used to fetch the name to be resolved
+ */
+enum act_parse_ret resolv_parse_do_resolve(const char **args, int *orig_arg, struct proxy *px, struct act_rule *rule, char **err)
+{
+	int cur_arg;
+	struct sample_expr *expr;
+	unsigned int where;
+	const char *beg, *end;
+
+	/* orig_arg points to the first argument, but we need to analyse the command itself first */
+	cur_arg = *orig_arg - 1;
+
+	/* locate varName, which is mandatory */
+	beg = strchr(args[cur_arg], '(');
+	if (beg == NULL)
+		goto do_resolve_parse_error;
+	beg = beg + 1; /* beg should point to the first character after the opening parenthesis '(' */
+	end = strchr(beg, ',');
+	if (end == NULL)
+		goto do_resolve_parse_error;
+	rule->arg.resolv.varname = my_strndup(beg, end - beg);
+	if (rule->arg.resolv.varname == NULL)
+		goto do_resolve_parse_error;
+
+
+	/* locate resolversSectionName, which is mandatory.
+ * Since next parameters are optional, the delimiter may be comma ',' + * or closing parenthesis ')' + */ + beg = end + 1; + end = strchr(beg, ','); + if (end == NULL) + end = strchr(beg, ')'); + if (end == NULL) + goto do_resolve_parse_error; + rule->arg.resolv.resolvers_id = my_strndup(beg, end - beg); + if (rule->arg.resolv.resolvers_id == NULL) + goto do_resolve_parse_error; + + + rule->arg.resolv.opts = calloc(1, sizeof(*rule->arg.resolv.opts)); + if (rule->arg.resolv.opts == NULL) + goto do_resolve_parse_error; + + /* Default priority is ipv6 */ + rule->arg.resolv.opts->family_prio = AF_INET6; + + /* optional arguments accepted for now: + * ipv4 or ipv6 + */ + while (*end != ')') { + beg = end + 1; + end = strchr(beg, ','); + if (end == NULL) + end = strchr(beg, ')'); + if (end == NULL) + goto do_resolve_parse_error; + + if (strncmp(beg, "ipv4", end - beg) == 0) { + rule->arg.resolv.opts->family_prio = AF_INET; + } + else if (strncmp(beg, "ipv6", end - beg) == 0) { + rule->arg.resolv.opts->family_prio = AF_INET6; + } + else { + goto do_resolve_parse_error; + } + } + + cur_arg = cur_arg + 1; + + expr = sample_parse_expr((char **)args, &cur_arg, px->conf.args.file, px->conf.args.line, err, &px->conf.args, NULL); + if (!expr) + goto do_resolve_parse_error; + + + where = 0; + if (px->cap & PR_CAP_FE) + where |= SMP_VAL_FE_HRQ_HDR; + if (px->cap & PR_CAP_BE) + where |= SMP_VAL_BE_HRQ_HDR; + + if (!(expr->fetch->val & where)) { + memprintf(err, + "fetch method '%s' extracts information from '%s', none of which is available here", + args[cur_arg-1], sample_src_names(expr->fetch->use)); + free(expr); + return ACT_RET_PRS_ERR; + } + rule->arg.resolv.expr = expr; + rule->action = ACT_CUSTOM; + rule->action_ptr = resolv_action_do_resolve; + *orig_arg = cur_arg; + + rule->check_ptr = check_action_do_resolve; + rule->release_ptr = release_resolv_action; + + return ACT_RET_PRS_OK; + + do_resolve_parse_error: + ha_free(&rule->arg.resolv.varname); + ha_free(&rule->arg.resolv.resolvers_id); + memprintf(err, "Can't parse '%s'. Expects 'do-resolve(<varname>,<resolvers>[,<options>]) <expr>'. Available options are 'ipv4' and 'ipv6'", + args[cur_arg]); + return ACT_RET_PRS_ERR; +} + +static struct action_kw_list http_req_kws = { { }, { + { "do-resolve", resolv_parse_do_resolve, KWF_MATCH_PREFIX }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, http_req_keywords_register, &http_req_kws); + +static struct action_kw_list tcp_req_cont_actions = {ILH, { + { "do-resolve", resolv_parse_do_resolve, KWF_MATCH_PREFIX }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, tcp_req_cont_keywords_register, &tcp_req_cont_actions); + +/* Check an "http-request do-resolve" action. + * + * The function returns 1 in success case, otherwise, it returns 0 and err is + * filled. 
+ */ +int check_action_do_resolve(struct act_rule *rule, struct proxy *px, char **err) +{ + struct resolvers *resolvers = NULL; + + if (rule->arg.resolv.resolvers_id == NULL) { + memprintf(err,"Proxy '%s': %s", px->id, "do-resolve action without resolvers"); + return 0; + } + + resolvers = find_resolvers_by_id(rule->arg.resolv.resolvers_id); + if (resolvers == NULL) { + memprintf(err,"Can't find resolvers section '%s' for do-resolve action", rule->arg.resolv.resolvers_id); + return 0; + } + rule->arg.resolv.resolvers = resolvers; + + return 1; +} + +void resolvers_setup_proxy(struct proxy *px) +{ + px->last_change = ns_to_sec(now_ns); + px->cap = PR_CAP_FE | PR_CAP_BE; + px->maxconn = 0; + px->conn_retries = 1; + px->timeout.server = TICK_ETERNITY; + px->timeout.client = TICK_ETERNITY; + px->timeout.connect = 1000; // by default same than timeout.resolve + px->accept = NULL; + px->options2 |= PR_O2_INDEPSTR | PR_O2_SMARTCON; +} + +static int parse_resolve_conf(char **errmsg, char **warnmsg) +{ + struct dns_nameserver *newnameserver = NULL; + const char *whitespace = "\r\n\t "; + char *resolv_line = NULL; + int resolv_linenum = 0; + FILE *f = NULL; + char *address = NULL; + struct sockaddr_storage *sk = NULL; + struct protocol *proto; + int duplicate_name = 0; + int err_code = 0; + + if ((resolv_line = malloc(sizeof(*resolv_line) * LINESIZE)) == NULL) { + memprintf(errmsg, "out of memory.\n"); + err_code |= ERR_ALERT | ERR_FATAL; + goto resolv_out; + } + + if ((f = fopen("/etc/resolv.conf", "r")) == NULL) { + if (errmsg) + memprintf(errmsg, "failed to open /etc/resolv.conf."); + err_code |= ERR_ALERT | ERR_FATAL; + goto resolv_out; + } + + sk = calloc(1, sizeof(*sk)); + if (sk == NULL) { + if (errmsg) + memprintf(errmsg, "parsing [/etc/resolv.conf:%d] : out of memory.", resolv_linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto resolv_out; + } + + while (fgets(resolv_line, LINESIZE, f) != NULL) { + resolv_linenum++; + if (strncmp(resolv_line, "nameserver", 10) != 0) + continue; + + address = strtok(resolv_line + 10, whitespace); + if (address == resolv_line + 10) + continue; + + if (address == NULL) { + if (warnmsg) + memprintf(warnmsg, "%sparsing [/etc/resolv.conf:%d] : nameserver line is missing address.\n", + *warnmsg ? *warnmsg : "", resolv_linenum); + err_code |= ERR_WARN; + continue; + } + + duplicate_name = 0; + list_for_each_entry(newnameserver, &curr_resolvers->nameservers, list) { + if (strcmp(newnameserver->id, address) == 0) { + if (warnmsg) + memprintf(warnmsg, "%sParsing [/etc/resolv.conf:%d] : generated name for /etc/resolv.conf nameserver '%s' conflicts with another nameserver (declared at %s:%d), it appears to be a duplicate and will be excluded.\n", + *warnmsg ? *warnmsg : "", resolv_linenum, address, newnameserver->conf.file, newnameserver->conf.line); + err_code |= ERR_WARN; + duplicate_name = 1; + } + } + + if (duplicate_name) + continue; + + memset(sk, 0, sizeof(*sk)); + if (!str2ip2(address, sk, 1)) { + if (warnmsg) + memprintf(warnmsg, "%sparsing [/etc/resolv.conf:%d] : address '%s' could not be recognized, nameserver will be excluded.\n", + *warnmsg ? *warnmsg : "", resolv_linenum, address); + err_code |= ERR_WARN; + continue; + } + + set_host_port(sk, 53); + + proto = protocol_lookup(sk->ss_family, PROTO_TYPE_STREAM, 0); + if (!proto || !proto->connect) { + if (warnmsg) + memprintf(warnmsg, "%sparsing [/etc/resolv.conf:%d] : '%s' : connect() not supported for this address family.\n", + *warnmsg ? 
*warnmsg : "", resolv_linenum, address); + err_code |= ERR_WARN; + continue; + } + + if ((newnameserver = calloc(1, sizeof(*newnameserver))) == NULL) { + if (errmsg) + memprintf(errmsg, "parsing [/etc/resolv.conf:%d] : out of memory.", resolv_linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto resolv_out; + } + + if (dns_dgram_init(newnameserver, sk) < 0) { + if (errmsg) + memprintf(errmsg, "parsing [/etc/resolv.conf:%d] : out of memory.", resolv_linenum); + err_code |= ERR_ALERT | ERR_FATAL; + free(newnameserver); + goto resolv_out; + } + + newnameserver->conf.file = strdup("/etc/resolv.conf"); + if (newnameserver->conf.file == NULL) { + if (errmsg) + memprintf(errmsg, "parsing [/etc/resolv.conf:%d] : out of memory.", resolv_linenum); + err_code |= ERR_ALERT | ERR_FATAL; + free(newnameserver); + goto resolv_out; + } + + newnameserver->id = strdup(address); + if (newnameserver->id == NULL) { + if (errmsg) + memprintf(errmsg, "parsing [/etc/resolv.conf:%d] : out of memory.", resolv_linenum); + err_code |= ERR_ALERT | ERR_FATAL; + free((char *)newnameserver->conf.file); + free(newnameserver); + goto resolv_out; + } + + newnameserver->parent = curr_resolvers; + newnameserver->process_responses = resolv_process_responses; + newnameserver->conf.line = resolv_linenum; + LIST_APPEND(&curr_resolvers->nameservers, &newnameserver->list); + } + +resolv_out: + free(sk); + free(resolv_line); + if (f != NULL) + fclose(f); + + return err_code; +} + +static int resolvers_new(struct resolvers **resolvers, const char *id, const char *file, int linenum) +{ + struct resolvers *r = NULL; + struct proxy *p = NULL; + int err_code = 0; + + if ((r = calloc(1, sizeof(*r))) == NULL) { + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + /* allocate new proxy to tcp servers */ + p = calloc(1, sizeof *p); + if (!p) { + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + init_new_proxy(p); + resolvers_setup_proxy(p); + p->parent = r; + p->id = strdup(id); + p->conf.args.file = p->conf.file = strdup(file); + p->conf.args.line = p->conf.line = linenum; + r->px = p; + + /* default values */ + LIST_APPEND(&sec_resolvers, &r->list); + r->conf.file = strdup(file); + r->conf.line = linenum; + r->id = strdup(id); + r->query_ids = EB_ROOT; + /* default maximum response size */ + r->accepted_payload_size = 512; + /* default hold period for nx, other, refuse and timeout is 30s */ + r->hold.nx = 30000; + r->hold.other = 30000; + r->hold.refused = 30000; + r->hold.timeout = 30000; + r->hold.obsolete = 0; + /* default hold period for valid is 10s */ + r->hold.valid = 10000; + r->timeout.resolve = 1000; + r->timeout.retry = 1000; + r->resolve_retries = 3; + LIST_INIT(&r->nameservers); + LIST_INIT(&r->resolutions.curr); + LIST_INIT(&r->resolutions.wait); + HA_SPIN_INIT(&r->lock); + + *resolvers = r; + +out: + if (err_code & (ERR_FATAL|ERR_ABORT)) { + ha_free(&r); + ha_free(&p); + } + + return err_code; +} + + +/* + * Parse a <resolvers> section. + * Returns the error code, 0 if OK, or any combination of : + * - ERR_ABORT: must abort ASAP + * - ERR_FATAL: we can continue parsing but not start the service + * - ERR_WARN: a warning has been emitted + * - ERR_ALERT: an alert has been emitted + * Only the two first ones can stop processing, the two others are just + * indicators. 
+ */ +int cfg_parse_resolvers(const char *file, int linenum, char **args, int kwm) +{ + const char *err; + int err_code = 0; + char *errmsg = NULL; + char *warnmsg = NULL; + + if (strcmp(args[0], "resolvers") == 0) { /* new resolvers section */ + if (!*args[1]) { + ha_alert("parsing [%s:%d] : missing name for resolvers section.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + err = invalid_char(args[1]); + if (err) { + ha_alert("parsing [%s:%d] : character '%c' is not permitted in '%s' name '%s'.\n", + file, linenum, *err, args[0], args[1]); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + list_for_each_entry(curr_resolvers, &sec_resolvers, list) { + /* Error if two resolvers owns the same name */ + if (strcmp(curr_resolvers->id, args[1]) == 0) { + ha_alert("Parsing [%s:%d]: resolvers '%s' has same name as another resolvers (declared at %s:%d).\n", + file, linenum, args[1], curr_resolvers->conf.file, curr_resolvers->conf.line); + err_code |= ERR_ALERT | ERR_ABORT; + } + } + + err_code |= resolvers_new(&curr_resolvers, args[1], file, linenum); + if (err_code & ERR_ALERT) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + goto out; + } + + } + else if (strcmp(args[0], "nameserver") == 0) { /* nameserver definition */ + struct dns_nameserver *newnameserver = NULL; + struct sockaddr_storage *sk; + int port1, port2; + struct protocol *proto; + + if (!*args[2]) { + ha_alert("parsing [%s:%d] : '%s' expects <name> and <addr>[:<port>] as arguments.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + err = invalid_char(args[1]); + if (err) { + ha_alert("parsing [%s:%d] : character '%c' is not permitted in server name '%s'.\n", + file, linenum, *err, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + list_for_each_entry(newnameserver, &curr_resolvers->nameservers, list) { + /* Error if two resolvers owns the same name */ + if (strcmp(newnameserver->id, args[1]) == 0) { + ha_alert("Parsing [%s:%d]: nameserver '%s' has same name as another nameserver (declared at %s:%d).\n", + file, linenum, args[1], newnameserver->conf.file, newnameserver->conf.line); + err_code |= ERR_ALERT | ERR_FATAL; + } + } + + sk = str2sa_range(args[2], NULL, &port1, &port2, NULL, &proto, NULL, + &errmsg, NULL, NULL, PA_O_RESOLVE | PA_O_PORT_OK | PA_O_PORT_MAND | PA_O_DGRAM | PA_O_STREAM | PA_O_DEFAULT_DGRAM); + if (!sk) { + ha_alert("parsing [%s:%d] : '%s %s' : %s\n", file, linenum, args[0], args[1], errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if ((newnameserver = calloc(1, sizeof(*newnameserver))) == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + if (proto && proto->xprt_type == PROTO_TYPE_STREAM) { + err_code |= parse_server(file, linenum, args, curr_resolvers->px, NULL, + SRV_PARSE_PARSE_ADDR|SRV_PARSE_INITIAL_RESOLVE); + if (err_code & (ERR_FATAL|ERR_ABORT)) { + err_code |= ERR_ABORT; + goto out; + } + + if (dns_stream_init(newnameserver, curr_resolvers->px->srv) < 0) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT|ERR_ABORT; + goto out; + } + } + else if (dns_dgram_init(newnameserver, sk) < 0) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + if ((newnameserver->conf.file = strdup(file)) == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + 
goto out; + } + + if ((newnameserver->id = strdup(args[1])) == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, linenum); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + newnameserver->parent = curr_resolvers; + newnameserver->process_responses = resolv_process_responses; + newnameserver->conf.line = linenum; + /* the nameservers are linked backward first */ + LIST_APPEND(&curr_resolvers->nameservers, &newnameserver->list); + } + else if (strcmp(args[0], "parse-resolv-conf") == 0) { + err_code |= parse_resolve_conf(&errmsg, &warnmsg); + if (err_code & ERR_WARN) { + indent_msg(&warnmsg, 8); + ha_warning("parsing [%s:%d]: %s\n", file, linenum, warnmsg); + ha_free(&warnmsg); + } + if (err_code & ERR_ALERT) { + indent_msg(&errmsg, 8); + ha_alert("parsing [%s:%d]: %s\n", file, linenum, errmsg); + ha_free(&errmsg); + goto out; + } + } + else if (strcmp(args[0], "hold") == 0) { /* hold periods */ + const char *res; + unsigned int time; + + if (!*args[2]) { + ha_alert("parsing [%s:%d] : '%s' expects an <event> and a <time> as arguments.\n", + file, linenum, args[0]); + ha_alert("<event> can be either 'valid', 'nx', 'refused', 'timeout', or 'other'\n"); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + res = parse_time_err(args[2], &time, TIME_UNIT_MS); + if (res == PARSE_TIME_OVER) { + ha_alert("parsing [%s:%d]: timer overflow in argument <%s> to <%s>, maximum value is 2147483647 ms (~24.8 days).\n", + file, linenum, args[1], args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (res == PARSE_TIME_UNDER) { + ha_alert("parsing [%s:%d]: timer underflow in argument <%s> to <%s>, minimum non-null value is 1 ms.\n", + file, linenum, args[1], args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (res) { + ha_alert("parsing [%s:%d]: unexpected character '%c' in argument to <%s>.\n", + file, linenum, *res, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (strcmp(args[1], "nx") == 0) + curr_resolvers->hold.nx = time; + else if (strcmp(args[1], "other") == 0) + curr_resolvers->hold.other = time; + else if (strcmp(args[1], "refused") == 0) + curr_resolvers->hold.refused = time; + else if (strcmp(args[1], "timeout") == 0) + curr_resolvers->hold.timeout = time; + else if (strcmp(args[1], "valid") == 0) + curr_resolvers->hold.valid = time; + else if (strcmp(args[1], "obsolete") == 0) + curr_resolvers->hold.obsolete = time; + else { + ha_alert("parsing [%s:%d] : '%s' unknown <event>: '%s', expects either 'nx', 'timeout', 'valid', 'obsolete' or 'other'.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + } + else if (strcmp(args[0], "accepted_payload_size") == 0) { + int i = 0; + + if (!*args[1]) { + ha_alert("parsing [%s:%d] : '%s' expects <nb> as argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + i = atoi(args[1]); + if (i < DNS_HEADER_SIZE || i > DNS_MAX_UDP_MESSAGE) { + ha_alert("parsing [%s:%d] : '%s' must be between %d and %d inclusive (was %s).\n", + file, linenum, args[0], DNS_HEADER_SIZE, DNS_MAX_UDP_MESSAGE, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + curr_resolvers->accepted_payload_size = i; + } + else if (strcmp(args[0], "resolution_pool_size") == 0) { + ha_alert("parsing [%s:%d] : '%s' directive is not supported anymore (it never appeared in a stable release).\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[0], "resolve_retries") == 0) { + if 
(!*args[1]) { + ha_alert("parsing [%s:%d] : '%s' expects <nb> as argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + curr_resolvers->resolve_retries = atoi(args[1]); + } + else if (strcmp(args[0], "timeout") == 0) { + if (!*args[1]) { + ha_alert("parsing [%s:%d] : '%s' expects 'retry' or 'resolve' and <time> as arguments.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (strcmp(args[1], "retry") == 0 || + strcmp(args[1], "resolve") == 0) { + const char *res; + unsigned int tout; + + if (!*args[2]) { + ha_alert("parsing [%s:%d] : '%s %s' expects <time> as argument.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + res = parse_time_err(args[2], &tout, TIME_UNIT_MS); + if (res == PARSE_TIME_OVER) { + ha_alert("parsing [%s:%d]: timer overflow in argument <%s> to <%s %s>, maximum value is 2147483647 ms (~24.8 days).\n", + file, linenum, args[2], args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (res == PARSE_TIME_UNDER) { + ha_alert("parsing [%s:%d]: timer underflow in argument <%s> to <%s %s>, minimum non-null value is 1 ms.\n", + file, linenum, args[2], args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (res) { + ha_alert("parsing [%s:%d]: unexpected character '%c' in argument to <%s %s>.\n", + file, linenum, *res, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (args[1][2] == 't') + curr_resolvers->timeout.retry = tout; + else { + curr_resolvers->timeout.resolve = tout; + curr_resolvers->px->timeout.connect = tout; + } + + } + else { + ha_alert("parsing [%s:%d] : '%s' expects 'retry' or 'resolve' and <time> as arguments got '%s'.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (*args[0] != 0) { + ha_alert("parsing [%s:%d] : unknown keyword '%s' in '%s' section\n", file, linenum, args[0], cursection); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + +out: + free(errmsg); + free(warnmsg); + return err_code; +} + +/* try to create a "default" resolvers section which uses "/etc/resolv.conf" + * + * This function is opportunistic and does not try to display errors or warnings. 
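The hold and timeout values parsed in this section feed the tick-based freshness test used by the do-resolve action (exp = tick_add(last_resolution, hold.valid), then tick_is_expired()). A small sketch of that tick arithmetic, loosely mirroring the real API (0 is reserved for "eternity", and expiry uses a wrap-safe signed comparison):

#include <stdio.h>

static unsigned int tick_add(unsigned int t, unsigned int ms)
{
    t += ms;
    if (!t)
        t++;            /* never return 0, which means "unset" here */
    return t;
}

static int tick_is_expired(unsigned int t, unsigned int now)
{
    if (!t)
        return 0;                   /* eternity never expires */
    return (int)(t - now) <= 0;     /* wrap-safe signed comparison */
}

int main(void)
{
    unsigned int now = 1000;
    unsigned int exp = tick_add(now, 10000);  /* e.g. "hold valid 10s" */

    printf("%d\n", tick_is_expired(exp, now + 5000));   /* 0: still fresh */
    printf("%d\n", tick_is_expired(exp, now + 15000));  /* 1: expired */
    return 0;
}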
+ */ +int resolvers_create_default() +{ + int err_code = 0; + + if (global.mode & MODE_MWORKER_WAIT) /* does not create the section if in wait mode */ + return 0; + + /* if the section already exists, do nothing */ + if (find_resolvers_by_id("default")) + return 0; + + curr_resolvers = NULL; + err_code |= resolvers_new(&curr_resolvers, "default", "<internal>", 0); + if (err_code & ERR_CODE) + goto err; + + curr_resolvers->conf.implicit = 1; + + err_code |= parse_resolve_conf(NULL, NULL); + if (err_code & ERR_CODE) + goto err; + /* check if there was any nameserver in the resolvconf file */ + if (LIST_ISEMPTY(&curr_resolvers->nameservers)) { + err_code |= ERR_FATAL; + goto err; + } + +err: + if (err_code & ERR_CODE) { + resolvers_destroy(curr_resolvers); + curr_resolvers = NULL; + } + + /* we never return an error there, we only try to create this section + * if that's possible */ + return 0; +} + +int cfg_post_parse_resolvers() +{ + int err_code = 0; + struct server *srv; + + if (curr_resolvers) { + + /* prepare forward server descriptors */ + if (curr_resolvers->px) { + srv = curr_resolvers->px->srv; + while (srv) { + /* init ssl if needed */ + if (srv->use_ssl == 1 && xprt_get(XPRT_SSL) && xprt_get(XPRT_SSL)->prepare_srv) { + if (xprt_get(XPRT_SSL)->prepare_srv(srv)) { + ha_alert("unable to prepare SSL for server '%s' in resolvers section '%s'.\n", srv->id, curr_resolvers->id); + err_code |= ERR_ALERT | ERR_FATAL; + break; + } + } + srv = srv->next; + } + } + } + curr_resolvers = NULL; + return err_code; +} + +REGISTER_CONFIG_SECTION("resolvers", cfg_parse_resolvers, cfg_post_parse_resolvers); +REGISTER_POST_DEINIT(resolvers_deinit); +REGISTER_CONFIG_POSTPARSER("dns runtime resolver", resolvers_finalize_config); +REGISTER_PRE_CHECK(resolvers_create_default); diff --git a/src/ring.c b/src/ring.c new file mode 100644 index 0000000..849221e --- /dev/null +++ b/src/ring.c @@ -0,0 +1,482 @@ +/* + * Ring buffer management + * + * Copyright (C) 2000-2019 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdlib.h> +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/buf.h> +#include <haproxy/cli.h> +#include <haproxy/ring.h> +#include <haproxy/sc_strm.h> +#include <haproxy/stconn.h> +#include <haproxy/thread.h> + +/* context used to dump the contents of a ring via "show events" or "show errors" */ +struct show_ring_ctx { + struct ring *ring; /* ring to be dumped */ + size_t ofs; /* storage offset to restart from; ~0=oldest */ + uint flags; /* set of RING_WF_* */ +}; + +/* Initialize a pre-allocated ring with the buffer area + * of size */ +void ring_init(struct ring *ring, void *area, size_t size) +{ + HA_RWLOCK_INIT(&ring->lock); + LIST_INIT(&ring->waiters); + ring->readers_count = 0; + ring->buf = b_make(area, size, 0, 0); + /* write the initial RC byte */ + b_putchr(&ring->buf, 0); +} + +/* Creates and returns a ring buffer of size <size> bytes. Returns NULL on + * allocation failure. + */ +struct ring *ring_new(size_t size) +{ + struct ring *ring = NULL; + void *area = NULL; + + if (size < 2) + goto fail; + + ring = malloc(sizeof(*ring)); + if (!ring) + goto fail; + + area = malloc(size); + if (!area) + goto fail; + + ring_init(ring, area, size); + return ring; + fail: + free(area); + free(ring); + return NULL; +} + +/* Creates a unified ring + storage area at address <area> for <size> bytes. + * If <area> is null, then it's allocated of the requested size. The ring + * struct is part of the area so the usable area is slightly reduced. However + * the ring storage is immediately adjacent to the struct. ring_free() will + * ignore such rings, so the caller is responsible for releasing them. + */ +struct ring *ring_make_from_area(void *area, size_t size) +{ + struct ring *ring = NULL; + + if (size < sizeof(*ring)) + return NULL; + + if (!area) + area = malloc(size); + if (!area) + return NULL; + + ring = area; + area += sizeof(*ring); + ring_init(ring, area, size - sizeof(*ring)); + return ring; +} + +/* Cast an unified ring + storage area to a ring from <area>, without + * reinitializing the data buffer. + * + * Reinitialize the waiters and the lock. + */ +struct ring *ring_cast_from_area(void *area) +{ + struct ring *ring = NULL; + + ring = area; + ring->buf.area = area + sizeof(*ring); + + HA_RWLOCK_INIT(&ring->lock); + LIST_INIT(&ring->waiters); + ring->readers_count = 0; + + return ring; +} + +/* Resizes existing ring <ring> to <size> which must be larger, without losing + * its contents. The new size must be at least as large as the previous one or + * no change will be performed. The pointer to the ring is returned on success, + * or NULL on allocation failure. This will lock the ring for writes. 
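The unified layout created by ring_make_from_area() puts the struct at the head of the area with the storage immediately behind it, which is why the usable size shrinks by sizeof(*ring). A toy sketch of this layout (struct toy_ring is hypothetical):

#include <stdlib.h>
#include <stdio.h>

struct toy_ring {
    size_t size;
    /* storage bytes immediately follow the struct */
};

int main(void)
{
    size_t area_size = 4096;
    void *area = malloc(area_size);
    struct toy_ring *r = area;                 /* struct at the start */

    r->size = area_size - sizeof(*r);          /* usable storage size */
    char *storage = (char *)area + sizeof(*r); /* adjacent storage */

    printf("struct at %p, storage at %p, usable %zu bytes\n",
           (void *)r, (void *)storage, r->size);
    free(area);
    return 0;
}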
+ */ +struct ring *ring_resize(struct ring *ring, size_t size) +{ + void *area; + + if (b_size(&ring->buf) >= size) + return ring; + + area = malloc(size); + if (!area) + return NULL; + + HA_RWLOCK_WRLOCK(RING_LOCK, &ring->lock); + + /* recheck the buffer's size, it may have changed during the malloc */ + if (b_size(&ring->buf) < size) { + /* copy old contents */ + b_getblk(&ring->buf, area, ring->buf.data, 0); + area = HA_ATOMIC_XCHG(&ring->buf.area, area); + ring->buf.size = size; + } + + HA_RWLOCK_WRUNLOCK(RING_LOCK, &ring->lock); + + free(area); + return ring; +} + +/* destroys and frees ring <ring> */ +void ring_free(struct ring *ring) +{ + if (!ring) + return; + + /* make sure it was not allocated by ring_make_from_area */ + if (ring->buf.area == (void *)ring + sizeof(*ring)) + return; + + free(ring->buf.area); + free(ring); +} + +/* Tries to send <npfx> parts from <prefix> followed by <nmsg> parts from <msg> + * to ring <ring>. The message is sent atomically. It may be truncated to + * <maxlen> bytes if <maxlen> is non-null. There is no distinction between the + * two lists, it's just a convenience to help the caller prepend some prefixes + * when necessary. It takes the ring's write lock to make sure no other thread + * will touch the buffer during the update. Returns the number of bytes sent, + * or <=0 on failure. + */ +ssize_t ring_write(struct ring *ring, size_t maxlen, const struct ist pfx[], size_t npfx, const struct ist msg[], size_t nmsg) +{ + struct buffer *buf = &ring->buf; + struct appctx *appctx; + size_t totlen = 0; + size_t lenlen; + uint64_t dellen; + int dellenlen; + ssize_t sent = 0; + int i; + + /* we have to find some room to add our message (the buffer is + * never empty and at least contains the previous counter) and + * to update both the buffer contents and heads at the same + * time (it's doable using atomic ops but not worth the + * trouble, let's just lock). For this we first need to know + * the total message's length. We cannot measure it while + * copying due to the varint encoding of the length. + */ + for (i = 0; i < npfx; i++) + totlen += pfx[i].len; + for (i = 0; i < nmsg; i++) + totlen += msg[i].len; + + if (totlen > maxlen) + totlen = maxlen; + + lenlen = varint_bytes(totlen); + + HA_RWLOCK_WRLOCK(RING_LOCK, &ring->lock); + if (lenlen + totlen + 1 + 1 > b_size(buf)) + goto done_buf; + + while (b_room(buf) < lenlen + totlen + 1) { + /* we need to delete the oldest message (from the end), + * and we have to stop if there's a reader stuck there. + * Unless there's corruption in the buffer it's guaranteed + * that we have enough data to find 1 counter byte, a + * varint-encoded length (1 byte min) and the message + * payload (0 bytes min). 
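Each ring message costs one reader-counter byte plus a varint-encoded length plus the payload, which is exactly what the room computation in ring_write() accounts for and what ring_max_payload() later inverts. A sketch of that accounting (varint_size() is a simplified stand-in; HAProxy's real varint encoding in intops.h draws its byte boundaries differently):

#include <stdio.h>
#include <stdint.h>

static size_t varint_size(uint64_t v)
{
    size_t len = 1;

    v >>= 4;                 /* pretend the first byte carries 4 bits */
    while (v) {
        v >>= 7;             /* following bytes carry 7 bits each */
        len++;
    }
    return len;
}

int main(void)
{
    size_t ring_size = 16384, payload = 100;
    size_t needed = 1 + varint_size(payload) + payload;
    size_t max;

    printf("a %zu-byte message consumes %zu ring bytes\n", payload, needed);

    /* max payload, as in ring_max_payload(): size minus the initial RC
     * byte and the message RC byte, minus the length varint itself */
    max = ring_size - 1 - 1;
    max -= varint_size(max);
    printf("max payload for a %zu-byte ring: %zu\n", ring_size, max);
    return 0;
}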
+ */ + if (*b_head(buf)) + goto done_buf; + dellenlen = b_peek_varint(buf, 1, &dellen); + if (!dellenlen) + goto done_buf; + BUG_ON(b_data(buf) < 1 + dellenlen + dellen); + + b_del(buf, 1 + dellenlen + dellen); + } + + /* OK now we do have room */ + __b_put_varint(buf, totlen); + + totlen = 0; + for (i = 0; i < npfx; i++) { + size_t len = pfx[i].len; + + if (len + totlen > maxlen) + len = maxlen - totlen; + if (len) + __b_putblk(buf, pfx[i].ptr, len); + totlen += len; + } + + for (i = 0; i < nmsg; i++) { + size_t len = msg[i].len; + + if (len + totlen > maxlen) + len = maxlen - totlen; + if (len) + __b_putblk(buf, msg[i].ptr, len); + totlen += len; + } + + *b_tail(buf) = 0; buf->data++; // new read counter + sent = lenlen + totlen + 1; + + /* notify potential readers */ + list_for_each_entry(appctx, &ring->waiters, wait_entry) + appctx_wakeup(appctx); + + done_buf: + HA_RWLOCK_WRUNLOCK(RING_LOCK, &ring->lock); + return sent; +} + +/* Tries to attach appctx <appctx> as a new reader on ring <ring>. This is + * meant to be used by low level appctx code such as CLI or ring forwarding. + * For higher level functions, please see the relevant parts in appctx or CLI. + * It returns non-zero on success or zero on failure if too many users are + * already attached. On success, the caller MUST call ring_detach_appctx() + * to detach itself, even if it was never woken up. + */ +int ring_attach(struct ring *ring) +{ + int users = ring->readers_count; + + do { + if (users >= 255) + return 0; + } while (!_HA_ATOMIC_CAS(&ring->readers_count, &users, users + 1)); + return 1; +} + +/* detach an appctx from a ring. The appctx is expected to be waiting at offset + * <ofs> relative to the beginning of the storage, or ~0 if not waiting yet. + * Nothing is done if <ring> is NULL. + */ +void ring_detach_appctx(struct ring *ring, struct appctx *appctx, size_t ofs) +{ + if (!ring) + return; + + HA_RWLOCK_WRLOCK(RING_LOCK, &ring->lock); + if (ofs != ~0) { + /* reader was still attached */ + if (ofs < b_head_ofs(&ring->buf)) + ofs += b_size(&ring->buf) - b_head_ofs(&ring->buf); + else + ofs -= b_head_ofs(&ring->buf); + + BUG_ON(ofs >= b_size(&ring->buf)); + LIST_DEL_INIT(&appctx->wait_entry); + HA_ATOMIC_DEC(b_peek(&ring->buf, ofs)); + } + HA_ATOMIC_DEC(&ring->readers_count); + HA_RWLOCK_WRUNLOCK(RING_LOCK, &ring->lock); +} + +/* Tries to attach CLI handler <appctx> as a new reader on ring <ring>. This is + * meant to be used when registering a CLI function to dump a buffer, so it + * returns zero on success, or non-zero on failure with a message in the appctx + * CLI context. It automatically sets the io_handler and io_release callbacks if + * they were not set. The <flags> take a combination of RING_WF_*. + */ +int ring_attach_cli(struct ring *ring, struct appctx *appctx, uint flags) +{ + struct show_ring_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + if (!ring_attach(ring)) + return cli_err(appctx, + "Sorry, too many watchers (255) on this ring buffer. 
" + "What could it have so interesting to attract so many watchers ?"); + + if (!appctx->io_handler) + appctx->io_handler = cli_io_handler_show_ring; + if (!appctx->io_release) + appctx->io_release = cli_io_release_show_ring; + + memset(ctx, 0, sizeof(*ctx)); + ctx->ring = ring; + ctx->ofs = ~0; // start from the oldest event + ctx->flags = flags; + return 0; +} + +/* This function dumps all events from the ring whose pointer is in <p0> into + * the appctx's output buffer, and takes from <o0> the seek offset into the + * buffer's history (0 for oldest known event). It looks at <i0> for boolean + * options: bit0 means it must wait for new data or any key to be pressed. Bit1 + * means it must seek directly to the end to wait for new contents. It returns + * 0 if the output buffer or events are missing is full and it needs to be + * called again, otherwise non-zero. It is meant to be used with + * cli_release_show_ring() to clean up. + */ +int cli_io_handler_show_ring(struct appctx *appctx) +{ + struct show_ring_ctx *ctx = appctx->svcctx; + struct stconn *sc = appctx_sc(appctx); + struct ring *ring = ctx->ring; + struct buffer *buf = &ring->buf; + size_t ofs; + size_t last_ofs; + uint64_t msg_len; + size_t len, cnt; + int ret; + + /* FIXME: Don't watch the other side !*/ + if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE)) + return 1; + + HA_RWLOCK_WRLOCK(RING_LOCK, &ring->lock); + LIST_DEL_INIT(&appctx->wait_entry); + HA_RWLOCK_WRUNLOCK(RING_LOCK, &ring->lock); + + HA_RWLOCK_RDLOCK(RING_LOCK, &ring->lock); + + /* explanation for the initialization below: it would be better to do + * this in the parsing function but this would occasionally result in + * dropped events because we'd take a reference on the oldest message + * and keep it while being scheduled. Thus instead let's take it the + * first time we enter here so that we have a chance to pass many + * existing messages before grabbing a reference to a location. This + * value cannot be produced after initialization. + */ + if (unlikely(ctx->ofs == ~0)) { + /* going to the end means looking at tail-1 */ + ctx->ofs = b_peek_ofs(buf, (ctx->flags & RING_WF_SEEK_NEW) ? b_data(buf) - 1 : 0); + HA_ATOMIC_INC(b_orig(buf) + ctx->ofs); + } + + /* we were already there, adjust the offset to be relative to + * the buffer's head and remove us from the counter. + */ + ofs = ctx->ofs - b_head_ofs(buf); + if (ctx->ofs < b_head_ofs(buf)) + ofs += b_size(buf); + + BUG_ON(ofs >= buf->size); + HA_ATOMIC_DEC(b_peek(buf, ofs)); + + /* in this loop, ofs always points to the counter byte that precedes + * the message so that we can take our reference there if we have to + * stop before the end (ret=0). 
+ */ + ret = 1; + while (ofs + 1 < b_data(buf)) { + cnt = 1; + len = b_peek_varint(buf, ofs + cnt, &msg_len); + if (!len) + break; + cnt += len; + BUG_ON(msg_len + ofs + cnt + 1 > b_data(buf)); + + if (unlikely(msg_len + 1 > b_size(&trash))) { + /* too large a message to ever fit, let's skip it */ + ofs += cnt + msg_len; + continue; + } + + chunk_reset(&trash); + len = b_getblk(buf, trash.area, msg_len, ofs + cnt); + trash.data += len; + trash.area[trash.data++] = '\n'; + + if (applet_putchk(appctx, &trash) == -1) { + ret = 0; + break; + } + ofs += cnt + msg_len; + } + + HA_ATOMIC_INC(b_peek(buf, ofs)); + last_ofs = b_tail_ofs(buf); + ctx->ofs = b_peek_ofs(buf, ofs); + HA_RWLOCK_RDUNLOCK(RING_LOCK, &ring->lock); + + if (ret && (ctx->flags & RING_WF_WAIT_MODE)) { + /* we've drained everything and are configured to wait for more + * data or an event (keypress, close) + */ + if (!sc_oc(sc)->output && !(sc->flags & SC_FL_SHUT_DONE)) { + /* let's be woken up once new data arrive */ + HA_RWLOCK_WRLOCK(RING_LOCK, &ring->lock); + LIST_APPEND(&ring->waiters, &appctx->wait_entry); + ofs = b_tail_ofs(&ring->buf); + HA_RWLOCK_WRUNLOCK(RING_LOCK, &ring->lock); + if (ofs != last_ofs) { + /* more data was added into the ring between the + * unlock and the lock, and the writer might not + * have seen us. We need to reschedule a read. + */ + applet_have_more_data(appctx); + } else + applet_have_no_more_data(appctx); + ret = 0; + } + /* always drain all the request */ + co_skip(sc_oc(sc), sc_oc(sc)->output); + } + + applet_expect_no_data(appctx); + return ret; +} + +/* must be called after cli_io_handler_show_ring() above */ +void cli_io_release_show_ring(struct appctx *appctx) +{ + struct show_ring_ctx *ctx = appctx->svcctx; + struct ring *ring = ctx->ring; + size_t ofs = ctx->ofs; + + ring_detach_appctx(ring, appctx, ofs); +} + +/* Returns the MAXIMUM payload len that could theoretically fit into the ring + * based on ring buffer size. + * + * Computation logic relies on implementation details from 'ring-t.h'. + */ +size_t ring_max_payload(const struct ring *ring) +{ + size_t max; + + /* initial max = bufsize - 1 (initial RC) - 1 (payload RC) */ + max = b_size(&ring->buf) - 1 - 1; + + /* subtract payload VI (varint-encoded size) */ + max -= varint_bytes(max); + return max; +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/sample.c b/src/sample.c new file mode 100644 index 0000000..89de612 --- /dev/null +++ b/src/sample.c @@ -0,0 +1,5173 @@ +/* + * Sample management functions. + * + * Copyright 2009-2010 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr> + * Copyright (C) 2012 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <ctype.h> +#include <string.h> +#include <arpa/inet.h> +#include <stdio.h> + +#include <import/mjson.h> +#include <import/sha1.h> + +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/auth.h> +#include <haproxy/base64.h> +#include <haproxy/buf.h> +#include <haproxy/chunk.h> +#include <haproxy/clock.h> +#include <haproxy/errors.h> +#include <haproxy/fix.h> +#include <haproxy/global.h> +#include <haproxy/hash.h> +#include <haproxy/http.h> +#include <haproxy/istbuf.h> +#include <haproxy/mqtt.h> +#include <haproxy/net_helper.h> +#include <haproxy/protobuf.h> +#include <haproxy/proxy.h> +#include <haproxy/regex.h> +#include <haproxy/sample.h> +#include <haproxy/sink.h> +#include <haproxy/stick_table.h> +#include <haproxy/time.h> +#include <haproxy/tools.h> +#include <haproxy/uri_auth-t.h> +#include <haproxy/vars.h> +#include <haproxy/xxhash.h> +#include <haproxy/jwt.h> + +/* sample type names */ +const char *smp_to_type[SMP_TYPES] = { + [SMP_T_ANY] = "any", + [SMP_T_SAME] = "same", + [SMP_T_BOOL] = "bool", + [SMP_T_SINT] = "sint", + [SMP_T_ADDR] = "addr", + [SMP_T_IPV4] = "ipv4", + [SMP_T_IPV6] = "ipv6", + [SMP_T_STR] = "str", + [SMP_T_BIN] = "bin", + [SMP_T_METH] = "meth", +}; + +/* static sample used in sample_process() when <p> is NULL */ +static THREAD_LOCAL struct sample temp_smp; + +/* list head of all known sample fetch keywords */ +static struct sample_fetch_kw_list sample_fetches = { + .list = LIST_HEAD_INIT(sample_fetches.list) +}; + +/* list head of all known sample format conversion keywords */ +static struct sample_conv_kw_list sample_convs = { + .list = LIST_HEAD_INIT(sample_convs.list) +}; + +const unsigned int fetch_cap[SMP_SRC_ENTRIES] = { + [SMP_SRC_CONST] = (SMP_VAL_FE_CON_ACC | SMP_VAL_FE_SES_ACC | SMP_VAL_FE_REQ_CNT | + SMP_VAL_FE_HRQ_HDR | SMP_VAL_FE_HRQ_BDY | SMP_VAL_FE_SET_BCK | + SMP_VAL_BE_REQ_CNT | SMP_VAL_BE_HRQ_HDR | SMP_VAL_BE_HRQ_BDY | + SMP_VAL_BE_SET_SRV | SMP_VAL_BE_SRV_CON | SMP_VAL_BE_RES_CNT | + SMP_VAL_BE_HRS_HDR | SMP_VAL_BE_HRS_BDY | SMP_VAL_BE_STO_RUL | + SMP_VAL_FE_RES_CNT | SMP_VAL_FE_HRS_HDR | SMP_VAL_FE_HRS_BDY | + SMP_VAL_FE_LOG_END | SMP_VAL_BE_CHK_RUL | SMP_VAL_CFG_PARSER | + SMP_VAL_CLI_PARSER ), + + [SMP_SRC_INTRN] = (SMP_VAL_FE_CON_ACC | SMP_VAL_FE_SES_ACC | SMP_VAL_FE_REQ_CNT | + SMP_VAL_FE_HRQ_HDR | SMP_VAL_FE_HRQ_BDY | SMP_VAL_FE_SET_BCK | + SMP_VAL_BE_REQ_CNT | SMP_VAL_BE_HRQ_HDR | SMP_VAL_BE_HRQ_BDY | + SMP_VAL_BE_SET_SRV | SMP_VAL_BE_SRV_CON | SMP_VAL_BE_RES_CNT | + SMP_VAL_BE_HRS_HDR | SMP_VAL_BE_HRS_BDY | SMP_VAL_BE_STO_RUL | + SMP_VAL_FE_RES_CNT | SMP_VAL_FE_HRS_HDR | SMP_VAL_FE_HRS_BDY | + SMP_VAL_FE_LOG_END | SMP_VAL_BE_CHK_RUL | SMP_VAL___________ | + SMP_VAL_CLI_PARSER ), + + [SMP_SRC_LISTN] = (SMP_VAL_FE_CON_ACC | SMP_VAL_FE_SES_ACC | SMP_VAL_FE_REQ_CNT | + SMP_VAL_FE_HRQ_HDR | SMP_VAL_FE_HRQ_BDY | SMP_VAL_FE_SET_BCK | + SMP_VAL_BE_REQ_CNT | SMP_VAL_BE_HRQ_HDR | SMP_VAL_BE_HRQ_BDY | + SMP_VAL_BE_SET_SRV | SMP_VAL_BE_SRV_CON | SMP_VAL_BE_RES_CNT | + SMP_VAL_BE_HRS_HDR | SMP_VAL_BE_HRS_BDY | SMP_VAL_BE_STO_RUL | + SMP_VAL_FE_RES_CNT | SMP_VAL_FE_HRS_HDR | SMP_VAL_FE_HRS_BDY | + SMP_VAL_FE_LOG_END | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_FTEND] = (SMP_VAL_FE_CON_ACC | SMP_VAL_FE_SES_ACC | SMP_VAL_FE_REQ_CNT | + SMP_VAL_FE_HRQ_HDR | SMP_VAL_FE_HRQ_BDY | SMP_VAL_FE_SET_BCK | + SMP_VAL_BE_REQ_CNT | SMP_VAL_BE_HRQ_HDR | SMP_VAL_BE_HRQ_BDY | + SMP_VAL_BE_SET_SRV | SMP_VAL_BE_SRV_CON | SMP_VAL_BE_RES_CNT | + SMP_VAL_BE_HRS_HDR | SMP_VAL_BE_HRS_BDY | 
SMP_VAL_BE_STO_RUL | + SMP_VAL_FE_RES_CNT | SMP_VAL_FE_HRS_HDR | SMP_VAL_FE_HRS_BDY | + SMP_VAL_FE_LOG_END | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_L4CLI] = (SMP_VAL_FE_CON_ACC | SMP_VAL_FE_SES_ACC | SMP_VAL_FE_REQ_CNT | + SMP_VAL_FE_HRQ_HDR | SMP_VAL_FE_HRQ_BDY | SMP_VAL_FE_SET_BCK | + SMP_VAL_BE_REQ_CNT | SMP_VAL_BE_HRQ_HDR | SMP_VAL_BE_HRQ_BDY | + SMP_VAL_BE_SET_SRV | SMP_VAL_BE_SRV_CON | SMP_VAL_BE_RES_CNT | + SMP_VAL_BE_HRS_HDR | SMP_VAL_BE_HRS_BDY | SMP_VAL_BE_STO_RUL | + SMP_VAL_FE_RES_CNT | SMP_VAL_FE_HRS_HDR | SMP_VAL_FE_HRS_BDY | + SMP_VAL_FE_LOG_END | SMP_VAL_BE_CHK_RUL | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_L5CLI] = (SMP_VAL___________ | SMP_VAL_FE_SES_ACC | SMP_VAL_FE_REQ_CNT | + SMP_VAL_FE_HRQ_HDR | SMP_VAL_FE_HRQ_BDY | SMP_VAL_FE_SET_BCK | + SMP_VAL_BE_REQ_CNT | SMP_VAL_BE_HRQ_HDR | SMP_VAL_BE_HRQ_BDY | + SMP_VAL_BE_SET_SRV | SMP_VAL_BE_SRV_CON | SMP_VAL_BE_RES_CNT | + SMP_VAL_BE_HRS_HDR | SMP_VAL_BE_HRS_BDY | SMP_VAL_BE_STO_RUL | + SMP_VAL_FE_RES_CNT | SMP_VAL_FE_HRS_HDR | SMP_VAL_FE_HRS_BDY | + SMP_VAL_FE_LOG_END | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_TRACK] = (SMP_VAL_FE_CON_ACC | SMP_VAL_FE_SES_ACC | SMP_VAL_FE_REQ_CNT | + SMP_VAL_FE_HRQ_HDR | SMP_VAL_FE_HRQ_BDY | SMP_VAL_FE_SET_BCK | + SMP_VAL_BE_REQ_CNT | SMP_VAL_BE_HRQ_HDR | SMP_VAL_BE_HRQ_BDY | + SMP_VAL_BE_SET_SRV | SMP_VAL_BE_SRV_CON | SMP_VAL_BE_RES_CNT | + SMP_VAL_BE_HRS_HDR | SMP_VAL_BE_HRS_BDY | SMP_VAL_BE_STO_RUL | + SMP_VAL_FE_RES_CNT | SMP_VAL_FE_HRS_HDR | SMP_VAL_FE_HRS_BDY | + SMP_VAL_FE_LOG_END | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_L6REQ] = (SMP_VAL___________ | SMP_VAL___________ | SMP_VAL_FE_REQ_CNT | + SMP_VAL_FE_HRQ_HDR | SMP_VAL_FE_HRQ_BDY | SMP_VAL_FE_SET_BCK | + SMP_VAL_BE_REQ_CNT | SMP_VAL_BE_HRQ_HDR | SMP_VAL_BE_HRQ_BDY | + SMP_VAL_BE_SET_SRV | SMP_VAL_BE_SRV_CON | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_HRQHV] = (SMP_VAL___________ | SMP_VAL___________ | SMP_VAL_FE_REQ_CNT | + SMP_VAL_FE_HRQ_HDR | SMP_VAL_FE_HRQ_BDY | SMP_VAL_FE_SET_BCK | + SMP_VAL_BE_REQ_CNT | SMP_VAL_BE_HRQ_HDR | SMP_VAL_BE_HRQ_BDY | + SMP_VAL_BE_SET_SRV | SMP_VAL_BE_SRV_CON | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_HRQHP] = (SMP_VAL___________ | SMP_VAL___________ | SMP_VAL_FE_REQ_CNT | + SMP_VAL_FE_HRQ_HDR | SMP_VAL_FE_HRQ_BDY | SMP_VAL_FE_SET_BCK | + SMP_VAL_BE_REQ_CNT | SMP_VAL_BE_HRQ_HDR | SMP_VAL_BE_HRQ_BDY | + SMP_VAL_BE_SET_SRV | SMP_VAL_BE_SRV_CON | SMP_VAL_BE_RES_CNT | + SMP_VAL_BE_HRS_HDR | SMP_VAL_BE_HRS_BDY | SMP_VAL_BE_STO_RUL | + SMP_VAL_FE_RES_CNT | SMP_VAL_FE_HRS_HDR | SMP_VAL_FE_HRS_BDY | + SMP_VAL_FE_LOG_END | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_HRQBO] = (SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL_FE_HRQ_BDY | SMP_VAL_FE_SET_BCK | + SMP_VAL_BE_REQ_CNT | SMP_VAL_BE_HRQ_HDR | SMP_VAL_BE_HRQ_BDY | + SMP_VAL_BE_SET_SRV | SMP_VAL_BE_SRV_CON | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | 
SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_BKEND] = (SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL_BE_REQ_CNT | SMP_VAL_BE_HRQ_HDR | SMP_VAL_BE_HRQ_BDY | + SMP_VAL_BE_SET_SRV | SMP_VAL_BE_SRV_CON | SMP_VAL_BE_RES_CNT | + SMP_VAL_BE_HRS_HDR | SMP_VAL_BE_HRS_BDY | SMP_VAL_BE_STO_RUL | + SMP_VAL_FE_RES_CNT | SMP_VAL_FE_HRS_HDR | SMP_VAL_FE_HRS_BDY | + SMP_VAL_FE_LOG_END | SMP_VAL_BE_CHK_RUL | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_SERVR] = (SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL_BE_SRV_CON | SMP_VAL_BE_RES_CNT | + SMP_VAL_BE_HRS_HDR | SMP_VAL_BE_HRS_BDY | SMP_VAL_BE_STO_RUL | + SMP_VAL_FE_RES_CNT | SMP_VAL_FE_HRS_HDR | SMP_VAL_FE_HRS_BDY | + SMP_VAL_FE_LOG_END | SMP_VAL_BE_CHK_RUL | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_L4SRV] = (SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL_BE_RES_CNT | + SMP_VAL_BE_HRS_HDR | SMP_VAL_BE_HRS_BDY | SMP_VAL_BE_STO_RUL | + SMP_VAL_FE_RES_CNT | SMP_VAL_FE_HRS_HDR | SMP_VAL_FE_HRS_BDY | + SMP_VAL_FE_LOG_END | SMP_VAL_BE_CHK_RUL | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_L5SRV] = (SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL_BE_RES_CNT | + SMP_VAL_BE_HRS_HDR | SMP_VAL_BE_HRS_BDY | SMP_VAL_BE_STO_RUL | + SMP_VAL_FE_RES_CNT | SMP_VAL_FE_HRS_HDR | SMP_VAL_FE_HRS_BDY | + SMP_VAL_FE_LOG_END | SMP_VAL_BE_CHK_RUL | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_L6RES] = (SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL_BE_RES_CNT | + SMP_VAL_BE_HRS_HDR | SMP_VAL_BE_HRS_BDY | SMP_VAL_BE_STO_RUL | + SMP_VAL_FE_RES_CNT | SMP_VAL_FE_HRS_HDR | SMP_VAL_FE_HRS_BDY | + SMP_VAL___________ | SMP_VAL_BE_CHK_RUL | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_HRSHV] = (SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL_BE_RES_CNT | + SMP_VAL_BE_HRS_HDR | SMP_VAL_BE_HRS_BDY | SMP_VAL_BE_STO_RUL | + SMP_VAL_FE_RES_CNT | SMP_VAL_FE_HRS_HDR | SMP_VAL_FE_HRS_BDY | + SMP_VAL___________ | SMP_VAL_BE_CHK_RUL | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_HRSHP] = (SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL_BE_RES_CNT | + SMP_VAL_BE_HRS_HDR | SMP_VAL_BE_HRS_BDY | SMP_VAL_BE_STO_RUL | + SMP_VAL_FE_RES_CNT | SMP_VAL_FE_HRS_HDR | SMP_VAL_FE_HRS_BDY | + 
SMP_VAL_FE_LOG_END | SMP_VAL_BE_CHK_RUL | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_HRSBO] = (SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL_BE_HRS_BDY | SMP_VAL_BE_STO_RUL | + SMP_VAL_FE_RES_CNT | SMP_VAL_FE_HRS_HDR | SMP_VAL_FE_HRS_BDY | + SMP_VAL___________ | SMP_VAL_BE_CHK_RUL | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_RQFIN] = (SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL_FE_LOG_END | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_RSFIN] = (SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL_FE_LOG_END | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_TXFIN] = (SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL_FE_LOG_END | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ ), + + [SMP_SRC_SSFIN] = (SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL_FE_LOG_END | SMP_VAL___________ | SMP_VAL___________ | + SMP_VAL___________ ), +}; + +static const char *fetch_src_names[SMP_SRC_ENTRIES] = { + [SMP_SRC_INTRN] = "internal state", + [SMP_SRC_LISTN] = "listener", + [SMP_SRC_FTEND] = "frontend", + [SMP_SRC_L4CLI] = "client address", + [SMP_SRC_L5CLI] = "client-side connection", + [SMP_SRC_TRACK] = "track counters", + [SMP_SRC_L6REQ] = "request buffer", + [SMP_SRC_HRQHV] = "HTTP request headers", + [SMP_SRC_HRQHP] = "HTTP request", + [SMP_SRC_HRQBO] = "HTTP request body", + [SMP_SRC_BKEND] = "backend", + [SMP_SRC_SERVR] = "server", + [SMP_SRC_L4SRV] = "server address", + [SMP_SRC_L5SRV] = "server-side connection", + [SMP_SRC_L6RES] = "response buffer", + [SMP_SRC_HRSHV] = "HTTP response headers", + [SMP_SRC_HRSHP] = "HTTP response", + [SMP_SRC_HRSBO] = "HTTP response body", + [SMP_SRC_RQFIN] = "request buffer statistics", + [SMP_SRC_RSFIN] = "response buffer statistics", + [SMP_SRC_TXFIN] = "transaction statistics", + [SMP_SRC_SSFIN] = "session statistics", +}; + +static const char 
*fetch_ckp_names[SMP_CKP_ENTRIES] = { + [SMP_CKP_FE_CON_ACC] = "frontend tcp-request connection rule", + [SMP_CKP_FE_SES_ACC] = "frontend tcp-request session rule", + [SMP_CKP_FE_REQ_CNT] = "frontend tcp-request content rule", + [SMP_CKP_FE_HRQ_HDR] = "frontend http-request header rule", + [SMP_CKP_FE_HRQ_BDY] = "frontend http-request body rule", + [SMP_CKP_FE_SET_BCK] = "frontend use-backend rule", + [SMP_CKP_BE_REQ_CNT] = "backend tcp-request content rule", + [SMP_CKP_BE_HRQ_HDR] = "backend http-request header rule", + [SMP_CKP_BE_HRQ_BDY] = "backend http-request body rule", + [SMP_CKP_BE_SET_SRV] = "backend use-server, balance or stick-match rule", + [SMP_CKP_BE_SRV_CON] = "server source selection", + [SMP_CKP_BE_RES_CNT] = "backend tcp-response content rule", + [SMP_CKP_BE_HRS_HDR] = "backend http-response header rule", + [SMP_CKP_BE_HRS_BDY] = "backend http-response body rule", + [SMP_CKP_BE_STO_RUL] = "backend stick-store rule", + [SMP_CKP_FE_RES_CNT] = "frontend tcp-response content rule", + [SMP_CKP_FE_HRS_HDR] = "frontend http-response header rule", + [SMP_CKP_FE_HRS_BDY] = "frontend http-response body rule", + [SMP_CKP_FE_LOG_END] = "logs", + [SMP_CKP_BE_CHK_RUL] = "backend tcp-check rule", + [SMP_CKP_CFG_PARSER] = "configuration parser", + [SMP_CKP_CLI_PARSER] = "CLI parser", +}; + +/* This function returns the most accurate expected type of the data returned + * by the sample_expr. It assumes that the <expr> and all of its converters are + * properly initialized. + */ +int smp_expr_output_type(struct sample_expr *expr) +{ + struct sample_conv_expr *cur_smp = NULL; + int cur_type = SMP_T_ANY; /* current type in the chain */ + int next_type = SMP_T_ANY; /* next type in the chain */ + + if (!LIST_ISEMPTY(&expr->conv_exprs)) { + /* Ignore converters that output SMP_T_SAME if switching to them is + * conversion-free. (such converter's output match with input, thus only + * their input is considered) + * + * We start looking at the end of conv list and then loop back until the + * sample fetch for better performance (it is more likely to find the last + * effective output type near the end of the chain) + */ + do { + struct list *cur_head = (cur_smp) ? &cur_smp->list : &expr->conv_exprs; + + cur_smp = LIST_PREV(cur_head, struct sample_conv_expr *, list); + if (cur_smp->conv->out_type != SMP_T_SAME) { + /* current converter has effective out_type */ + cur_type = cur_smp->conv->out_type; + goto out; + } + else if (sample_casts[cur_type][next_type] != c_none) + return next_type; /* switching to next type is not conversion-free */ + + next_type = cur_smp->conv->in_type; + } while (cur_smp != LIST_NEXT(&expr->conv_exprs, struct sample_conv_expr *, list)); + } + /* conv list empty or doesn't have effective out_type, + * falling back to sample fetch out_type + */ + cur_type = expr->fetch->out_type; + out: + if (sample_casts[cur_type][next_type] != c_none) + return next_type; /* switching to next type is not conversion-free */ + return cur_type; +} + + +/* fill the trash with a comma-delimited list of source names for the <use> bit + * field which must be composed of a non-null set of SMP_USE_* flags. The return + * value is the pointer to the string in the trash buffer. 
+ */
+const char *sample_src_names(unsigned int use)
+{
+	int bit;
+
+	trash.data = 0;
+	trash.area[0] = '\0';
+	for (bit = 0; bit < SMP_SRC_ENTRIES; bit++) {
+		if (!(use & ~((1 << bit) - 1)))
+			break; /* no more bits */
+
+		if (!(use & (1 << bit)))
+			continue; /* bit not set */
+
+		trash.data += snprintf(trash.area + trash.data,
+				       trash.size - trash.data, "%s%s",
+				       (use & ((1 << bit) - 1)) ? "," : "",
+				       fetch_src_names[bit]);
+	}
+	return trash.area;
+}
+
+/* return a pointer to the correct sample checkpoint name, or "unknown" when
+ * the flags are invalid. Only the lowest bit is used, higher bits are ignored
+ * if set.
+ */
+const char *sample_ckp_names(unsigned int use)
+{
+	int bit;
+
+	for (bit = 0; bit < SMP_CKP_ENTRIES; bit++)
+		if (use & (1 << bit))
+			return fetch_ckp_names[bit];
+	return "unknown sample check place, please report this bug";
+}
+
+/*
+ * Registers the sample fetch keyword list <kwl> as a list of valid keywords
+ * for next parsing sessions. The fetch keywords capabilities are also computed
+ * from their ->use field.
+ */
+void sample_register_fetches(struct sample_fetch_kw_list *kwl)
+{
+	struct sample_fetch *sf;
+	int bit;
+
+	for (sf = kwl->kw; sf->kw != NULL; sf++) {
+		for (bit = 0; bit < SMP_SRC_ENTRIES; bit++)
+			if (sf->use & (1 << bit))
+				sf->val |= fetch_cap[bit];
+	}
+	LIST_APPEND(&sample_fetches.list, &kwl->list);
+}
+
+/*
+ * Registers the sample format conversion keyword list <pckl> as a list of valid keywords for next
+ * parsing sessions.
+ */
+void sample_register_convs(struct sample_conv_kw_list *pckl)
+{
+	LIST_APPEND(&sample_convs.list, &pckl->list);
+}
+
+/*
+ * Returns a pointer to the sample fetch keyword structure identified by the
+ * string of length <len> in buffer <kw>.
+ */
+struct sample_fetch *find_sample_fetch(const char *kw, int len)
+{
+	int index;
+	struct sample_fetch_kw_list *kwl;
+
+	list_for_each_entry(kwl, &sample_fetches.list, list) {
+		for (index = 0; kwl->kw[index].kw != NULL; index++) {
+			if (strncmp(kwl->kw[index].kw, kw, len) == 0 &&
+			    kwl->kw[index].kw[len] == '\0')
+				return &kwl->kw[index];
+		}
+	}
+	return NULL;
+}
+
+/* dump list of registered sample fetch keywords on stdout */
+void smp_dump_fetch_kw(void)
+{
+	struct sample_fetch_kw_list *kwl;
+	struct sample_fetch *kwp, *kw;
+	uint64_t mask;
+	int index;
+	int arg;
+	int bit;
+
+	for (bit = 0; bit <= SMP_CKP_ENTRIES + 1; bit++) {
+		putchar('#');
+		for (index = 0; bit + index <= SMP_CKP_ENTRIES; index++)
+			putchar(' ');
+		for (index = 0; index < bit && index < SMP_CKP_ENTRIES; index++)
+			printf((bit <= SMP_CKP_ENTRIES) ? "/ " : " |");
+		for (index = bit; bit < SMP_CKP_ENTRIES && index < SMP_CKP_ENTRIES + 2; index++)
+			if (index == bit)
+				putchar('_');
+			else if (index == bit + 1)
+				putchar('.');
+			else
+				putchar('-');
+		printf(" %s\n", (bit < SMP_CKP_ENTRIES) ? fetch_ckp_names[bit] : "");
+	}
+
+	for (kw = kwp = NULL;; kwp = kw) {
+		list_for_each_entry(kwl, &sample_fetches.list, list) {
+			for (index = 0; kwl->kw[index].kw != NULL; index++) {
+				if (strordered(kwp ? kwp->kw : NULL,
+					       kwl->kw[index].kw,
+					       kw != kwp ? kw->kw : NULL))
+					kw = &kwl->kw[index];
+			}
+		}
+
+		if (kw == kwp)
+			break;
+
+		printf("[ ");
+		for (bit = 0; bit < SMP_CKP_ENTRIES; bit++)
+			printf("%s", (kw->val & (1 << bit)) ? "Y " : ". ");
"); + + printf("] %s", kw->kw); + if (kw->arg_mask) { + mask = kw->arg_mask >> ARGM_BITS; + printf("("); + for (arg = 0; + arg < ARGM_NBARGS && ((mask >> (arg * ARGT_BITS)) & ARGT_MASK); + arg++) { + if (arg == (kw->arg_mask & ARGM_MASK)) { + /* now dumping extra args */ + printf("["); + } + if (arg) + printf(","); + printf("%s", arg_type_names[(mask >> (arg * ARGT_BITS)) & ARGT_MASK]); + } + if (arg > (kw->arg_mask & ARGM_MASK)) { + /* extra args were dumped */ + printf("]"); + } + printf(")"); + } + printf(": %s", smp_to_type[kw->out_type]); + printf("\n"); + } +} + +/* dump list of registered sample converter keywords on stdout */ +void smp_dump_conv_kw(void) +{ + struct sample_conv_kw_list *kwl; + struct sample_conv *kwp, *kw; + uint64_t mask; + int index; + int arg; + + for (kw = kwp = NULL;; kwp = kw) { + list_for_each_entry(kwl, &sample_convs.list, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) { + if (strordered(kwp ? kwp->kw : NULL, + kwl->kw[index].kw, + kw != kwp ? kw->kw : NULL)) + kw = &kwl->kw[index]; + } + } + + if (kw == kwp) + break; + + printf("%s", kw->kw); + if (kw->arg_mask) { + mask = kw->arg_mask >> ARGM_BITS; + printf("("); + for (arg = 0; + arg < ARGM_NBARGS && ((mask >> (arg * ARGT_BITS)) & ARGT_MASK); + arg++) { + if (arg == (kw->arg_mask & ARGM_MASK)) { + /* now dumping extra args */ + printf("["); + } + if (arg) + printf(","); + printf("%s", arg_type_names[(mask >> (arg * ARGT_BITS)) & ARGT_MASK]); + } + if (arg > (kw->arg_mask & ARGM_MASK)) { + /* extra args were dumped */ + printf("]"); + } + printf(")"); + } + printf(": %s => %s", smp_to_type[kw->out_type], smp_to_type[kw->in_type]); + printf("\n"); + } +} + +/* This function browses the list of available sample fetches. <current> is + * the last used sample fetch. If it is the first call, it must set to NULL. + * <idx> is the index of the next sample fetch entry. It is used as private + * value. It is useless to initiate it. + * + * It returns always the new fetch_sample entry, and NULL when the end of + * the list is reached. + */ +struct sample_fetch *sample_fetch_getnext(struct sample_fetch *current, int *idx) +{ + struct sample_fetch_kw_list *kwl; + struct sample_fetch *base; + + if (!current) { + /* Get first kwl entry. */ + kwl = LIST_NEXT(&sample_fetches.list, struct sample_fetch_kw_list *, list); + (*idx) = 0; + } else { + /* Get kwl corresponding to the current entry. */ + base = current + 1 - (*idx); + kwl = container_of(base, struct sample_fetch_kw_list, kw); + } + + while (1) { + + /* Check if kwl is the last entry. */ + if (&kwl->list == &sample_fetches.list) + return NULL; + + /* idx contain the next keyword. If it is available, return it. */ + if (kwl->kw[*idx].kw) { + (*idx)++; + return &kwl->kw[(*idx)-1]; + } + + /* get next entry in the main list, and return NULL if the end is reached. */ + kwl = LIST_NEXT(&kwl->list, struct sample_fetch_kw_list *, list); + + /* Set index to 0, ans do one other loop. */ + (*idx) = 0; + } +} + +/* This function browses the list of available converters. <current> is + * the last used converter. If it is the first call, it must set to NULL. + * <idx> is the index of the next converter entry. It is used as private + * value. It is useless to initiate it. + * + * It returns always the next sample_conv entry, and NULL when the end of + * the list is reached. 
+ */ +struct sample_conv *sample_conv_getnext(struct sample_conv *current, int *idx) +{ + struct sample_conv_kw_list *kwl; + struct sample_conv *base; + + if (!current) { + /* Get first kwl entry. */ + kwl = LIST_NEXT(&sample_convs.list, struct sample_conv_kw_list *, list); + (*idx) = 0; + } else { + /* Get kwl corresponding to the current entry. */ + base = current + 1 - (*idx); + kwl = container_of(base, struct sample_conv_kw_list, kw); + } + + while (1) { + /* Check if kwl is the last entry. */ + if (&kwl->list == &sample_convs.list) + return NULL; + + /* idx contains the next keyword. If it is available, return it. */ + if (kwl->kw[*idx].kw) { + (*idx)++; + return &kwl->kw[(*idx)-1]; + } + + /* get next entry in the main list, and return NULL if the end is reached. */ + kwl = LIST_NEXT(&kwl->list, struct sample_conv_kw_list *, list); + + /* Set index to 0, and loop again. */ + (*idx) = 0; + } +} + +/* + * Returns a pointer to the sample format conversion keyword structure identified + * by the string of length <len> in buffer <kw>. + * + */ +struct sample_conv *find_sample_conv(const char *kw, int len) +{ + int index; + struct sample_conv_kw_list *kwl; + + list_for_each_entry(kwl, &sample_convs.list, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) { + if (strncmp(kwl->kw[index].kw, kw, len) == 0 && + kwl->kw[index].kw[len] == '\0') + return &kwl->kw[index]; + } + } + return NULL; +} + +/******************************************************************/ +/* Sample casts functions */ +/******************************************************************/ + +static int c_ip2int(struct sample *smp) +{ + smp->data.u.sint = ntohl(smp->data.u.ipv4.s_addr); + smp->data.type = SMP_T_SINT; + return 1; +} + +static int c_ip2str(struct sample *smp) +{ + struct buffer *trash = get_trash_chunk(); + + if (!inet_ntop(AF_INET, (void *)&smp->data.u.ipv4, trash->area, trash->size)) + return 0; + + trash->data = strlen(trash->area); + smp->data.u.str = *trash; + smp->data.type = SMP_T_STR; + smp->flags &= ~SMP_F_CONST; + + return 1; +} + +static int c_ip2ipv6(struct sample *smp) +{ + v4tov6(&smp->data.u.ipv6, &smp->data.u.ipv4); + smp->data.type = SMP_T_IPV6; + return 1; +} + +static int c_ipv62ip(struct sample *smp) +{ + if (!v6tov4(&smp->data.u.ipv4, &smp->data.u.ipv6)) + return 0; + smp->data.type = SMP_T_IPV4; + return 1; +} + +static int c_ipv62str(struct sample *smp) +{ + struct buffer *trash = get_trash_chunk(); + + if (!inet_ntop(AF_INET6, (void *)&smp->data.u.ipv6, trash->area, trash->size)) + return 0; + + trash->data = strlen(trash->area); + smp->data.u.str = *trash; + smp->data.type = SMP_T_STR; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +/* +static int c_ipv62ip(struct sample *smp) +{ + return v6tov4(&smp->data.u.ipv4, &smp->data.u.ipv6); +} +*/ + +static int c_int2ip(struct sample *smp) +{ + smp->data.u.ipv4.s_addr = htonl((unsigned int)smp->data.u.sint); + smp->data.type = SMP_T_IPV4; + return 1; +} + +static int c_int2ipv6(struct sample *smp) +{ + smp->data.u.ipv4.s_addr = htonl((unsigned int)smp->data.u.sint); + v4tov6(&smp->data.u.ipv6, &smp->data.u.ipv4); + smp->data.type = SMP_T_IPV6; + return 1; +} + +static int c_str2addr(struct sample *smp) +{ + if (!buf2ip(smp->data.u.str.area, smp->data.u.str.data, &smp->data.u.ipv4)) { + if (!buf2ip6(smp->data.u.str.area, smp->data.u.str.data, &smp->data.u.ipv6)) + return 0; + smp->data.type = SMP_T_IPV6; + smp->flags &= ~SMP_F_CONST; + return 1; + } + smp->data.type = SMP_T_IPV4; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +static 
int c_str2ip(struct sample *smp) +{ + if (!buf2ip(smp->data.u.str.area, smp->data.u.str.data, &smp->data.u.ipv4)) + return 0; + smp->data.type = SMP_T_IPV4; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +static int c_str2ipv6(struct sample *smp) +{ + if (!buf2ip6(smp->data.u.str.area, smp->data.u.str.data, &smp->data.u.ipv6)) + return 0; + smp->data.type = SMP_T_IPV6; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +/* + * The NULL char always enforces the end of string if it is met. + * Data is never changed, so we can ignore the CONST case + */ +static int c_bin2str(struct sample *smp) +{ + int i; + + for (i = 0; i < smp->data.u.str.data; i++) { + if (!smp->data.u.str.area[i]) { + smp->data.u.str.data = i; + break; + } + } + smp->data.type = SMP_T_STR; + return 1; +} + +static int c_int2str(struct sample *smp) +{ + struct buffer *trash = get_trash_chunk(); + char *pos; + + pos = lltoa_r(smp->data.u.sint, trash->area, trash->size); + if (!pos) + return 0; + + trash->size = trash->size - (pos - trash->area); + trash->area = pos; + trash->data = strlen(pos); + smp->data.u.str = *trash; + smp->data.type = SMP_T_STR; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +/* This function unconditionally duplicates data and removes the "const" flag. + * For strings and binary blocks, it also provides a known allocated size with + * a length that is capped to the size, and ensures a trailing zero is always + * appended for strings. This is necessary for some operations which may + * require to extend the length. It returns 0 if it fails, 1 on success. + */ +int smp_dup(struct sample *smp) +{ + struct buffer *trash; + + switch (smp->data.type) { + case SMP_T_BOOL: + case SMP_T_SINT: + case SMP_T_ADDR: + case SMP_T_IPV4: + case SMP_T_IPV6: + /* These type are not const. */ + break; + + case SMP_T_METH: + if (smp->data.u.meth.meth != HTTP_METH_OTHER) + break; + __fallthrough; + + case SMP_T_STR: + trash = get_trash_chunk(); + trash->data = smp->data.type == SMP_T_STR ? + smp->data.u.str.data : smp->data.u.meth.str.data; + if (trash->data > trash->size - 1) + trash->data = trash->size - 1; + + memcpy(trash->area, smp->data.type == SMP_T_STR ? + smp->data.u.str.area : smp->data.u.meth.str.area, + trash->data); + trash->area[trash->data] = 0; + smp->data.u.str = *trash; + break; + + case SMP_T_BIN: + trash = get_trash_chunk(); + trash->data = smp->data.u.str.data; + if (trash->data > trash->size) + trash->data = trash->size; + + memcpy(trash->area, smp->data.u.str.area, trash->data); + smp->data.u.str = *trash; + break; + + default: + /* Other cases are unexpected. */ + return 0; + } + + /* remove const flag */ + smp->flags &= ~SMP_F_CONST; + return 1; +} + +int c_none(struct sample *smp) +{ + return 1; +} + +/* special converter function used by pseudo types in the compatibility matrix + * to inform that the conversion is theoretically allowed at parsing time. + * + * However, being a pseudo type, it may not be emitted by fetches or converters + * so this function should never be called. If this is the case, then it means + * that a pseudo type has been used as a final output type at runtime, which is + * considered as a bug and should be fixed. To help spot this kind of bug, the + * process will crash in this case. 
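+ *
+ * (Going back to smp_dup() above: a hypothetical in-place converter sketch,
+ * illustrative only and not part of this file, shows why a possibly-const
+ * sample must be duplicated before being rewritten:
+ *
+ *	static int sample_conv_rot13(const struct arg *arg_p, struct sample *smp, void *private)
+ *	{
+ *		int i;
+ *		char ch;
+ *
+ *		if (!smp_dup(smp))	// copy to a trash chunk, drop SMP_F_CONST
+ *			return 0;
+ *		for (i = 0; i < smp->data.u.str.data; i++) {
+ *			ch = smp->data.u.str.area[i];
+ *			if (ch >= 'a' && ch <= 'z')
+ *				smp->data.u.str.area[i] = 'a' + (ch - 'a' + 13) % 26;
+ *			else if (ch >= 'A' && ch <= 'Z')
+ *				smp->data.u.str.area[i] = 'A' + (ch - 'A' + 13) % 26;
+ *		}
+ *		return 1;
+ *	}
+ *
+ * The real in-place converters further down, sample_conv_str2lower() and
+ * sample_conv_str2upper(), rely on smp_make_rw() for the same guarantee.)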
+ */ +int c_pseudo(struct sample *smp) +{ + ABORT_NOW(); // die loudly + /* never reached */ + return 0; +} + +static int c_str2int(struct sample *smp) +{ + const char *str; + const char *end; + + if (smp->data.u.str.data == 0) + return 0; + + str = smp->data.u.str.area; + end = smp->data.u.str.area + smp->data.u.str.data; + + smp->data.u.sint = read_int64(&str, end); + smp->data.type = SMP_T_SINT; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +static int c_str2meth(struct sample *smp) +{ + enum http_meth_t meth; + int len; + + meth = find_http_meth(smp->data.u.str.area, smp->data.u.str.data); + if (meth == HTTP_METH_OTHER) { + len = smp->data.u.str.data; + smp->data.u.meth.str.area = smp->data.u.str.area; + smp->data.u.meth.str.data = len; + } + else + smp->flags &= ~SMP_F_CONST; + smp->data.u.meth.meth = meth; + smp->data.type = SMP_T_METH; + return 1; +} + +static int c_meth2str(struct sample *smp) +{ + int len; + enum http_meth_t meth; + + if (smp->data.u.meth.meth == HTTP_METH_OTHER) { + /* The method is unknown. Copy the original pointer. */ + len = smp->data.u.meth.str.data; + smp->data.u.str.area = smp->data.u.meth.str.area; + smp->data.u.str.data = len; + smp->data.type = SMP_T_STR; + } + else if (smp->data.u.meth.meth < HTTP_METH_OTHER) { + /* The method is known, copy the pointer containing the string. */ + meth = smp->data.u.meth.meth; + smp->data.u.str.area = http_known_methods[meth].ptr; + smp->data.u.str.data = http_known_methods[meth].len; + smp->flags |= SMP_F_CONST; + smp->data.type = SMP_T_STR; + } + else { + /* Unknown method */ + return 0; + } + return 1; +} + +static int c_addr2bin(struct sample *smp) +{ + struct buffer *chk = get_trash_chunk(); + + if (smp->data.type == SMP_T_IPV4) { + chk->data = 4; + memcpy(chk->area, &smp->data.u.ipv4, chk->data); + } + else if (smp->data.type == SMP_T_IPV6) { + chk->data = 16; + memcpy(chk->area, &smp->data.u.ipv6, chk->data); + } + else + return 0; + + smp->data.u.str = *chk; + smp->data.type = SMP_T_BIN; + return 1; +} + +static int c_int2bin(struct sample *smp) +{ + struct buffer *chk = get_trash_chunk(); + + *(unsigned long long int *) chk->area = my_htonll(smp->data.u.sint); + chk->data = 8; + + smp->data.u.str = *chk; + smp->data.type = SMP_T_BIN; + return 1; +} + +static int c_bool2bin(struct sample *smp) +{ + struct buffer *chk = get_trash_chunk(); + + *(unsigned long long int *)chk->area = my_htonll(!!smp->data.u.sint); + chk->data = 8; + smp->data.u.str = *chk; + smp->data.type = SMP_T_BIN; + return 1; +} + + +/*****************************************************************/ +/* Sample casts matrix: */ +/* sample_casts[from type][to type] */ +/* NULL pointer used for impossible sample casts */ +/*****************************************************************/ + +sample_cast_fct sample_casts[SMP_TYPES][SMP_TYPES] = { +/* to: ANY SAME BOOL SINT ADDR IPV4 IPV6 STR BIN METH */ +/* from: ANY */ { c_none, NULL, c_pseudo, c_pseudo, c_pseudo, c_pseudo, c_pseudo, c_pseudo, c_pseudo, c_pseudo }, +/* SAME */ { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }, +/* BOOL */ { c_none, NULL, c_none, c_none, NULL, NULL, NULL, c_int2str, c_bool2bin, NULL }, +/* SINT */ { c_none, NULL, c_none, c_none, c_int2ip, c_int2ip, c_int2ipv6, c_int2str, c_int2bin, NULL }, +/* ADDR */ { c_none, NULL, NULL, NULL, c_pseudo, c_pseudo, c_pseudo, c_pseudo, c_pseudo, NULL }, +/* IPV4 */ { c_none, NULL, NULL, c_ip2int, c_none, c_none, c_ip2ipv6, c_ip2str, c_addr2bin, NULL }, +/* IPV6 */ { c_none, NULL, NULL, NULL, c_none, c_ipv62ip, 
c_none, c_ipv62str, c_addr2bin, NULL }, +/* STR */ { c_none, NULL, c_str2int, c_str2int, c_str2addr, c_str2ip, c_str2ipv6, c_none, c_none, c_str2meth }, +/* BIN */ { c_none, NULL, NULL, NULL, NULL, NULL, NULL, c_bin2str, c_none, c_str2meth }, +/* METH */ { c_none, NULL, NULL, NULL, NULL, NULL, NULL, c_meth2str, c_meth2str, c_none } +}; + +/* Process the converters (if any) for a sample expr after the first fetch + * keyword. We have two supported syntaxes for the converters, which can be + * combined: + * - comma-delimited list of converters just after the keyword and args ; + * - one converter per keyword (if <idx> != NULL) + * FIXME: should we continue to support this old syntax? + * The combination allows each keyword to be a comma-delimited + * series of converters. + * + * We want to process the former first, then the latter. For this we start + * from the beginning of the supposed place in the existing conv chain, which + * starts at the last comma (<start> which is then referred to as endt). + * + * If <endptr> is non-NULL, it will be set to the first unparsed character + * (which may be the final '\0') on success. If it is NULL, the expression + * must be properly terminated by a '\0' otherwise an error is reported. + * + * <expr> should point to the sample expression that is already initialized + * with the sample fetch that precedes the converters chain. + * + * The function returns a positive value for success and 0 for failure, in which + * case <err_msg> will point to an allocated string that brings some info + * about the failure. It is the caller's responsibility to free it. + */ +int sample_parse_expr_cnv(char **str, int *idx, char **endptr, char **err_msg, struct arg_list *al, const char *file, int line, + struct sample_expr *expr, const char *start) +{ + struct sample_conv *conv; + const char *endt = start; /* end of term */ + const char *begw; /* beginning of word */ + const char *endw; /* end of word */ + char *ckw = NULL; + unsigned long prev_type = expr->fetch->out_type; + int success = 1; + + while (1) { + struct sample_conv_expr *conv_expr; + int err_arg; + int argcnt; + + if (*endt && *endt != ',') { + if (endptr) { + /* end found, let's stop here */ + break; + } + if (ckw) + memprintf(err_msg, "missing comma after converter '%s'", ckw); + else + memprintf(err_msg, "missing comma after fetch keyword"); + goto out_error; + } + + /* FIXME: how long should we support such idiocies ? Maybe we + * should already warn ? + */ + while (*endt == ',') /* then trailing commas */ + endt++; + + begw = endt; /* start of converter */ + + if (!*begw) { + /* none ? 
skip to next string if idx is set */ + if (!idx) + break; /* end of converters */ + (*idx)++; + begw = str[*idx]; + if (!begw || !*begw) + break; + } + + for (endw = begw; is_idchar(*endw); endw++) + ; + + ha_free(&ckw); + ckw = my_strndup(begw, endw - begw); + + conv = find_sample_conv(begw, endw - begw); + if (!conv) { + /* we found an isolated keyword that we don't know, it's not ours */ + if (idx && begw == str[*idx]) { + endt = begw; + break; + } + memprintf(err_msg, "unknown converter '%s'", ckw); + goto out_error; + } + + if (conv->in_type >= SMP_TYPES || conv->out_type >= SMP_TYPES) { + memprintf(err_msg, "return type of converter '%s' is unknown", ckw); + goto out_error; + } + + /* If impossible type conversion */ + if (!sample_casts[prev_type][conv->in_type]) { + memprintf(err_msg, "converter '%s' cannot be applied", ckw); + goto out_error; + } + + /* Ignore converters that output SMP_T_SAME if switching to them is + * conversion-free. (such converter's output match with input, thus only + * their input is considered) + */ + if (conv->out_type != SMP_T_SAME) + prev_type = conv->out_type; + else if (sample_casts[prev_type][conv->in_type] != c_none) + prev_type = conv->in_type; + + conv_expr = calloc(1, sizeof(*conv_expr)); + if (!conv_expr) + goto out_error; + + LIST_APPEND(&(expr->conv_exprs), &(conv_expr->list)); + conv_expr->conv = conv; + + if (al) { + al->kw = expr->fetch->kw; + al->conv = conv_expr->conv->kw; + } + argcnt = make_arg_list(endw, -1, conv->arg_mask, &conv_expr->arg_p, err_msg, &endt, &err_arg, al); + if (argcnt < 0) { + memprintf(err_msg, "invalid arg %d in converter '%s' : %s", err_arg+1, ckw, *err_msg); + goto out_error; + } + + if (argcnt && !conv->arg_mask) { + memprintf(err_msg, "converter '%s' does not support any args", ckw); + goto out_error; + } + + if (!conv_expr->arg_p) + conv_expr->arg_p = empty_arg_list; + + if (conv->val_args && !conv->val_args(conv_expr->arg_p, conv, file, line, err_msg)) { + memprintf(err_msg, "invalid args in converter '%s' : %s", ckw, *err_msg); + goto out_error; + } + } + + if (endptr) { + /* end found, let's stop here */ + *endptr = (char *)endt; + } + out: + free(ckw); + return success; + + out_error: + success = 0; + goto out; +} + +/* + * Parse a sample expression configuration: + * fetch keyword followed by format conversion keywords. + * + * <al> is an arg_list serving as a list head to report missing dependencies. + * It may be NULL if such dependencies are not allowed. Otherwise, the caller + * must have set al->ctx if al is set. + * + * Returns a pointer on allocated sample expression structure or NULL in case + * of error, in which case <err_msg> will point to an allocated string that + * brings some info about the failure. It is the caller's responsibility to + * free it. 
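+ *
+ * A minimal hypothetical call site (assuming the usual "hdr" fetch and
+ * "lower" converter are registered):
+ *
+ *	char *args[] = { "hdr(host),lower", NULL };
+ *	int idx = 0;
+ *	char *err = NULL;
+ *	struct sample_expr *expr;
+ *
+ *	expr = sample_parse_expr(args, &idx, "example.cfg", 42, &err, NULL, NULL);
+ *	if (!expr) {
+ *		fprintf(stderr, "parse error: %s\n", err);	// <err> was allocated
+ *		free(err);
+ *	}
+ *	else {
+ *		// ... use the expression, then release it
+ *		release_sample_expr(expr);
+ *	}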
+ */ +struct sample_expr *sample_parse_expr(char **str, int *idx, const char *file, int line, char **err_msg, struct arg_list *al, char **endptr) +{ + const char *begw; /* beginning of word */ + const char *endw; /* end of word */ + const char *endt; /* end of term */ + struct sample_expr *expr = NULL; + struct sample_fetch *fetch; + char *fkw = NULL; + int err_arg; + + begw = str[*idx]; + for (endw = begw; is_idchar(*endw); endw++) + ; + + if (endw == begw) { + memprintf(err_msg, "missing fetch method"); + goto out_error; + } + + /* keep a copy of the current fetch keyword for error reporting */ + fkw = my_strndup(begw, endw - begw); + + fetch = find_sample_fetch(begw, endw - begw); + if (!fetch) { + memprintf(err_msg, "unknown fetch method '%s'", fkw); + goto out_error; + } + + /* At this point, we have : + * - begw : beginning of the keyword + * - endw : end of the keyword, first character not part of keyword + */ + + if (fetch->out_type >= SMP_TYPES) { + memprintf(err_msg, "return type of fetch method '%s' is unknown", fkw); + goto out_error; + } + + expr = calloc(1, sizeof(*expr)); + if (!expr) + goto out_error; + + LIST_INIT(&(expr->conv_exprs)); + expr->fetch = fetch; + expr->arg_p = empty_arg_list; + + /* Note that we call the argument parser even with an empty string, + * this allows it to automatically create entries for mandatory + * implicit arguments (eg: local proxy name). + */ + if (al) { + al->kw = expr->fetch->kw; + al->conv = NULL; + } + if (make_arg_list(endw, -1, fetch->arg_mask, &expr->arg_p, err_msg, &endt, &err_arg, al) < 0) { + memprintf(err_msg, "fetch method '%s' : %s", fkw, *err_msg); + goto out_error; + } + + /* now endt is our first char not part of the arg list, typically the + * comma after the sample fetch name or after the closing parenthesis, + * or the NUL char. + */ + + if (!expr->arg_p) { + expr->arg_p = empty_arg_list; + } + else if (fetch->val_args && !fetch->val_args(expr->arg_p, err_msg)) { + memprintf(err_msg, "invalid args in fetch method '%s' : %s", fkw, *err_msg); + goto out_error; + } + + if (!sample_parse_expr_cnv(str, idx, endptr, err_msg, al, file, line, expr, endt)) + goto out_error; + + out: + free(fkw); + return expr; + +out_error: + release_sample_expr(expr); + expr = NULL; + goto out; +} + +/* + * Helper function to process the converter list of a given sample expression + * <expr> using the sample <p> (which is assumed to be properly initialized) + * as input. + * + * Returns 1 on success and 0 on failure. + */ +int sample_process_cnv(struct sample_expr *expr, struct sample *p) +{ + struct sample_conv_expr *conv_expr; + + list_for_each_entry(conv_expr, &expr->conv_exprs, list) { + /* we want to ensure that p->type can be cast into + * conv_expr->conv->in_type. We have 3 possibilities : + * - NULL => not castable. + * - c_none => nothing to do (let's optimize it) + * - other => apply cast and prepare to fail + */ + if (!sample_casts[p->data.type][conv_expr->conv->in_type]) + return 0; + + if (sample_casts[p->data.type][conv_expr->conv->in_type] != c_none && + !sample_casts[p->data.type][conv_expr->conv->in_type](p)) + return 0; + + /* OK cast succeeded */ + + if (!conv_expr->conv->process(conv_expr->arg_p, p, conv_expr->conv->private)) + return 0; + } + return 1; +} + +/* + * Process a fetch + format conversion as defined by the sample expression <expr> + * on request or response considering the <opt> parameter. 
+ * Returns a pointer to a typed sample structure containing the result or NULL if + * sample is not found or when format conversion failed. + * If <p> is not NULL, the function returns its result in the structure pointed to by <p>. + * If <p> is NULL, it returns a pointer to a static sample structure. + * + * Note: the fetch functions are required to properly set the return type. The + * conversion functions must do so too. However the cast functions do not need + * to since they're made to cast multiple types according to what is required. + * + * The caller may indicate in <opt> if it considers the result final or not. + * The caller needs to check the SMP_F_MAY_CHANGE flag in p->flags to verify + * if the result is stable or not, according to the following table : + * + * return MAY_CHANGE FINAL Meaning for the sample + * NULL 0 * Not present and will never be (eg: header) + * NULL 1 0 Not present yet, could change (eg: POST param) + * NULL 1 1 Not present yet, will not change anymore + * smp 0 * Present and will not change (eg: header) + * smp 1 0 Present, may change (eg: request length) + * smp 1 1 Present, last known value (eg: request length) + */ +struct sample *sample_process(struct proxy *px, struct session *sess, + struct stream *strm, unsigned int opt, + struct sample_expr *expr, struct sample *p) +{ + if (p == NULL) { + p = &temp_smp; + memset(p, 0, sizeof(*p)); + } + + smp_set_owner(p, px, sess, strm, opt); + if (!expr->fetch->process(expr->arg_p, p, expr->fetch->kw, expr->fetch->private)) + return NULL; + + if (!sample_process_cnv(expr, p)) + return NULL; + return p; +} + +/* + * Resolve all remaining arguments in proxy <p>. Returns the number of + * errors or 0 if everything is fine. If at least one error is met, it will + * be appended to *err. If *err==NULL it will be allocated first. 
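+ *
+ * (A note on the sample_process() stability table above: a hypothetical
+ * caller typically applies it as follows, with px/sess/strm/opt/expr
+ * assumed in scope:
+ *
+ *	struct sample smp;
+ *
+ *	memset(&smp, 0, sizeof(smp));
+ *	if (!sample_process(px, sess, strm, opt, expr, &smp)) {
+ *		if ((smp.flags & SMP_F_MAY_CHANGE) && !(opt & SMP_OPT_FINAL))
+ *			return 0;	// not present yet, worth retrying later
+ *		return -1;		// will never be present
+ *	}
+ *	// present; check SMP_F_MAY_CHANGE the same way before trusting it
+ *
+ * sample_fetch_as_type() below builds on the same contract.)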
+ */ +int smp_resolve_args(struct proxy *p, char **err) +{ + struct arg_list *cur, *bak; + const char *ctx, *where; + const char *conv_ctx, *conv_pre, *conv_pos; + struct userlist *ul; + struct my_regex *reg; + struct arg *arg; + int cfgerr = 0; + int rflags; + + list_for_each_entry_safe(cur, bak, &p->conf.args.list, list) { + struct proxy *px; + struct server *srv; + struct stktable *t; + char *pname, *sname, *stktname; + char *err2; + + arg = cur->arg; + + /* prepare output messages */ + conv_pre = conv_pos = conv_ctx = ""; + if (cur->conv) { + conv_ctx = cur->conv; + conv_pre = "conversion keyword '"; + conv_pos = "' for "; + } + + where = "in"; + ctx = "sample fetch keyword"; + switch (cur->ctx) { + case ARGC_STK: where = "in stick rule in"; break; + case ARGC_TRK: where = "in tracking rule in"; break; + case ARGC_LOG: where = "in log-format string in"; break; + case ARGC_LOGSD: where = "in log-format-sd string in"; break; + case ARGC_HRQ: where = "in http-request expression in"; break; + case ARGC_HRS: where = "in http-response response in"; break; + case ARGC_UIF: where = "in unique-id-format string in"; break; + case ARGC_RDR: where = "in redirect format string in"; break; + case ARGC_CAP: where = "in capture rule in"; break; + case ARGC_ACL: ctx = "ACL keyword"; break; + case ARGC_SRV: where = "in server directive in"; break; + case ARGC_SPOE: where = "in spoe-message directive in"; break; + case ARGC_UBK: where = "in use_backend expression in"; break; + case ARGC_USRV: where = "in use-server or balance expression in"; break; + case ARGC_HERR: where = "in http-error directive in"; break; + case ARGC_OT: where = "in ot-scope directive in"; break; + case ARGC_OPT: where = "in option directive in"; break; + case ARGC_TCO: where = "in tcp-request connection expression in"; break; + case ARGC_TSE: where = "in tcp-request session expression in"; break; + case ARGC_TRQ: where = "in tcp-request content expression in"; break; + case ARGC_TRS: where = "in tcp-response content expression in"; break; + case ARGC_TCK: where = "in tcp-check expression in"; break; + case ARGC_CFG: where = "in configuration expression in"; break; + case ARGC_CLI: where = "in CLI expression in"; break; + } + + /* set a few default settings */ + px = p; + pname = p->id; + + switch (arg->type) { + case ARGT_SRV: + if (!arg->data.str.data) { + memprintf(err, "%sparsing [%s:%d]: missing server name in arg %d of %s%s%s%s '%s' %s proxy '%s'.\n", + *err ? *err : "", cur->file, cur->line, + cur->arg_pos + 1, conv_pre, conv_ctx, conv_pos, ctx, cur->kw, where, p->id); + cfgerr++; + continue; + } + + /* we support two formats : "bck/srv" and "srv" */ + sname = strrchr(arg->data.str.area, '/'); + + if (sname) { + *sname++ = '\0'; + pname = arg->data.str.area; + + px = proxy_be_by_name(pname); + if (!px) { + memprintf(err, "%sparsing [%s:%d]: unable to find proxy '%s' referenced in arg %d of %s%s%s%s '%s' %s proxy '%s'.\n", + *err ? *err : "", cur->file, cur->line, pname, + cur->arg_pos + 1, conv_pre, conv_ctx, conv_pos, ctx, cur->kw, where, p->id); + cfgerr++; + break; + } + } + else { + if (px->cap & PR_CAP_DEF) { + memprintf(err, "%sparsing [%s:%d]: backend name must be set in arg %d of %s%s%s%s '%s' %s proxy '%s'.\n", + *err ? 
*err : "", cur->file, cur->line, + cur->arg_pos + 1, conv_pre, conv_ctx, conv_pos, ctx, cur->kw, where, p->id); + cfgerr++; + break; + } + sname = arg->data.str.area; + } + + srv = findserver(px, sname); + if (!srv) { + memprintf(err, "%sparsing [%s:%d]: unable to find server '%s' in proxy '%s', referenced in arg %d of %s%s%s%s '%s' %s proxy '%s'.\n", + *err ? *err : "", cur->file, cur->line, sname, pname, + cur->arg_pos + 1, conv_pre, conv_ctx, conv_pos, ctx, cur->kw, where, p->id); + cfgerr++; + break; + } + + srv->flags |= SRV_F_NON_PURGEABLE; + + chunk_destroy(&arg->data.str); + arg->unresolved = 0; + arg->data.srv = srv; + break; + + case ARGT_FE: + if (arg->data.str.data) { + pname = arg->data.str.area; + px = proxy_fe_by_name(pname); + } + + if (!px) { + memprintf(err, "%sparsing [%s:%d]: unable to find frontend '%s' referenced in arg %d of %s%s%s%s '%s' %s proxy '%s'.\n", + *err ? *err : "", cur->file, cur->line, pname, + cur->arg_pos + 1, conv_pre, conv_ctx, conv_pos, ctx, cur->kw, where, p->id); + cfgerr++; + break; + } + + if (!(px->cap & PR_CAP_FE)) { + memprintf(err, "%sparsing [%s:%d]: proxy '%s', referenced in arg %d of %s%s%s%s '%s' %s proxy '%s', has not frontend capability.\n", + *err ? *err : "", cur->file, cur->line, pname, + cur->arg_pos + 1, conv_pre, conv_ctx, conv_pos, ctx, cur->kw, where, p->id); + cfgerr++; + break; + } + + chunk_destroy(&arg->data.str); + arg->unresolved = 0; + arg->data.prx = px; + break; + + case ARGT_BE: + if (arg->data.str.data) { + pname = arg->data.str.area; + px = proxy_be_by_name(pname); + } + + if (!px) { + memprintf(err, "%sparsing [%s:%d]: unable to find backend '%s' referenced in arg %d of %s%s%s%s '%s' %s proxy '%s'.\n", + *err ? *err : "", cur->file, cur->line, pname, + cur->arg_pos + 1, conv_pre, conv_ctx, conv_pos, ctx, cur->kw, where, p->id); + cfgerr++; + break; + } + + if (!(px->cap & PR_CAP_BE)) { + memprintf(err, "%sparsing [%s:%d]: proxy '%s', referenced in arg %d of %s%s%s%s '%s' %s proxy '%s', has not backend capability.\n", + *err ? *err : "", cur->file, cur->line, pname, + cur->arg_pos + 1, conv_pre, conv_ctx, conv_pos, ctx, cur->kw, where, p->id); + cfgerr++; + break; + } + + chunk_destroy(&arg->data.str); + arg->unresolved = 0; + arg->data.prx = px; + break; + + case ARGT_TAB: + if (arg->data.str.data) + stktname = arg->data.str.area; + else { + if (px->cap & PR_CAP_DEF) { + memprintf(err, "%sparsing [%s:%d]: table name must be set in arg %d of %s%s%s%s '%s' %s proxy '%s'.\n", + *err ? *err : "", cur->file, cur->line, + cur->arg_pos + 1, conv_pre, conv_ctx, conv_pos, ctx, cur->kw, where, p->id); + cfgerr++; + break; + } + stktname = px->id; + } + + t = stktable_find_by_name(stktname); + if (!t) { + memprintf(err, "%sparsing [%s:%d]: unable to find table '%s' referenced in arg %d of %s%s%s%s '%s' %s proxy '%s'.\n", + *err ? *err : "", cur->file, cur->line, stktname, + cur->arg_pos + 1, conv_pre, conv_ctx, conv_pos, ctx, cur->kw, where, p->id); + cfgerr++; + break; + } + + if (!t->size) { + memprintf(err, "%sparsing [%s:%d]: no table in proxy '%s' referenced in arg %d of %s%s%s%s '%s' %s proxy '%s'.\n", + *err ? 
*err : "", cur->file, cur->line, stktname, + cur->arg_pos + 1, conv_pre, conv_ctx, conv_pos, ctx, cur->kw, where, p->id); + cfgerr++; + break; + } + + if (!in_proxies_list(t->proxies_list, p)) { + p->next_stkt_ref = t->proxies_list; + t->proxies_list = p; + } + + chunk_destroy(&arg->data.str); + arg->unresolved = 0; + arg->data.t = t; + break; + + case ARGT_USR: + if (!arg->data.str.data) { + memprintf(err, "%sparsing [%s:%d]: missing userlist name in arg %d of %s%s%s%s '%s' %s proxy '%s'.\n", + *err ? *err : "", cur->file, cur->line, + cur->arg_pos + 1, conv_pre, conv_ctx, conv_pos, ctx, cur->kw, where, p->id); + cfgerr++; + break; + } + + if (p->uri_auth && p->uri_auth->userlist && + strcmp(p->uri_auth->userlist->name, arg->data.str.area) == 0) + ul = p->uri_auth->userlist; + else + ul = auth_find_userlist(arg->data.str.area); + + if (!ul) { + memprintf(err, "%sparsing [%s:%d]: unable to find userlist '%s' referenced in arg %d of %s%s%s%s '%s' %s proxy '%s'.\n", + *err ? *err : "", cur->file, cur->line, + arg->data.str.area, + cur->arg_pos + 1, conv_pre, conv_ctx, conv_pos, ctx, cur->kw, where, p->id); + cfgerr++; + break; + } + + chunk_destroy(&arg->data.str); + arg->unresolved = 0; + arg->data.usr = ul; + break; + + case ARGT_REG: + if (!arg->data.str.data) { + memprintf(err, "%sparsing [%s:%d]: missing regex in arg %d of %s%s%s%s '%s' %s proxy '%s'.\n", + *err ? *err : "", cur->file, cur->line, + cur->arg_pos + 1, conv_pre, conv_ctx, conv_pos, ctx, cur->kw, where, p->id); + cfgerr++; + continue; + } + + rflags = 0; + rflags |= (arg->type_flags & ARGF_REG_ICASE) ? REG_ICASE : 0; + err2 = NULL; + + if (!(reg = regex_comp(arg->data.str.area, !(rflags & REG_ICASE), 1 /* capture substr */, &err2))) { + memprintf(err, "%sparsing [%s:%d]: error in regex '%s' in arg %d of %s%s%s%s '%s' %s proxy '%s' : %s.\n", + *err ? *err : "", cur->file, cur->line, + arg->data.str.area, + cur->arg_pos + 1, conv_pre, conv_ctx, conv_pos, ctx, cur->kw, where, p->id, err2); + cfgerr++; + continue; + } + + chunk_destroy(&arg->data.str); + arg->unresolved = 0; + arg->data.reg = reg; + break; + + + } + + LIST_DELETE(&cur->list); + free(cur); + } /* end of args processing */ + + return cfgerr; +} + +/* + * Process a fetch + format conversion as defined by the sample expression + * <expr> on request or response considering the <opt> parameter. The output is + * not explicitly set to <smp_type>, but shall be compatible with it as + * specified by 'sample_casts' table. If a stable sample can be fetched, or an + * unstable one when <opt> contains SMP_OPT_FINAL, the sample is converted and + * returned without the SMP_F_MAY_CHANGE flag. If an unstable sample is found + * and <opt> does not contain SMP_OPT_FINAL, then the sample is returned as-is + * with its SMP_F_MAY_CHANGE flag so that the caller can check it and decide to + * take actions (eg: wait longer). If a sample could not be found or could not + * be converted, NULL is returned. The caller MUST NOT use the sample if the + * SMP_F_MAY_CHANGE flag is present, as it is used only as a hint that there is + * still hope to get it after waiting longer, and is not converted to string. 
+ * The possible output combinations are the following : + * + * return MAY_CHANGE FINAL Meaning for the sample + * NULL * * Not present and will never be (eg: header) + * smp 0 * Final value converted (eg: header) + * smp 1 0 Not present yet, may appear later (eg: header) + * smp 1 1 never happens (either flag is cleared on output) + */ +struct sample *sample_fetch_as_type(struct proxy *px, struct session *sess, + struct stream *strm, unsigned int opt, + struct sample_expr *expr, int smp_type) +{ + struct sample *smp = &temp_smp; + + memset(smp, 0, sizeof(*smp)); + + if (!sample_process(px, sess, strm, opt, expr, smp)) { + if ((smp->flags & SMP_F_MAY_CHANGE) && !(opt & SMP_OPT_FINAL)) + return smp; + return NULL; + } + + if (!sample_casts[smp->data.type][smp_type]) + return NULL; + + if (sample_casts[smp->data.type][smp_type] != c_none && + !sample_casts[smp->data.type][smp_type](smp)) + return NULL; + + smp->flags &= ~SMP_F_MAY_CHANGE; + return smp; +} + +static void release_sample_arg(struct arg *p) +{ + struct arg *p_back = p; + + if (!p) + return; + + while (p->type != ARGT_STOP) { + if (p->type == ARGT_STR || p->unresolved) { + chunk_destroy(&p->data.str); + p->unresolved = 0; + } + else if (p->type == ARGT_REG) { + regex_free(p->data.reg); + p->data.reg = NULL; + } + p++; + } + + if (p_back != empty_arg_list) + free(p_back); +} + +void release_sample_expr(struct sample_expr *expr) +{ + struct sample_conv_expr *conv_expr, *conv_exprb; + + if (!expr) + return; + + list_for_each_entry_safe(conv_expr, conv_exprb, &expr->conv_exprs, list) { + LIST_DELETE(&conv_expr->list); + release_sample_arg(conv_expr->arg_p); + free(conv_expr); + } + + release_sample_arg(expr->arg_p); + free(expr); +} + +/*****************************************************************/ +/* Sample format convert functions */ +/* These functions set the data type on return. */ +/*****************************************************************/ + +static int sample_conv_debug(const struct arg *arg_p, struct sample *smp, void *private) +{ + int i; + struct sample tmp; + struct buffer *buf; + struct sink *sink; + struct ist line; + char *pfx; + + buf = alloc_trash_chunk(); + if (!buf) + goto end; + + sink = (struct sink *)arg_p[1].data.ptr; + BUG_ON(!sink); + + pfx = arg_p[0].data.str.area; + BUG_ON(!pfx); + + chunk_printf(buf, "[debug] %s: type=%s ", pfx, smp_to_type[smp->data.type]); + if (!sample_casts[smp->data.type][SMP_T_STR]) + goto nocast; + + /* Copy sample fetch. This puts the sample as const, the + * cast will copy data if a transformation is required. + */ + memcpy(&tmp, smp, sizeof(struct sample)); + tmp.flags = SMP_F_CONST; + + if (!sample_casts[smp->data.type][SMP_T_STR](&tmp)) + goto nocast; + + /* Display the displayable chars*. */ + b_putchr(buf, '<'); + for (i = 0; i < tmp.data.u.str.data; i++) { + if (isprint((unsigned char)tmp.data.u.str.area[i])) + b_putchr(buf, tmp.data.u.str.area[i]); + else + b_putchr(buf, '.'); + } + b_putchr(buf, '>'); + + done: + line = ist2(buf->area, buf->data); + sink_write(sink, LOG_HEADER_NONE, 0, &line, 1); + end: + free_trash_chunk(buf); + return 1; + nocast: + chunk_appendf(buf, "(undisplayable)"); + goto done; +} + +// This function checks the "debug" converter's arguments. 
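+/* For example, with prefix "req" and sink "buf0", a string sample "foo"
+ * passing through the debug converter above produces a line like:
+ *
+ *	[debug] req: type=str <foo>
+ *
+ * on the "buf0" ring (hypothetical values; the actual prefix and sink come
+ * from the converter's arguments, checked by the function below).
+ */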
+static int smp_check_debug(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err) +{ + const char *name = "buf0"; + struct sink *sink = NULL; + + if (args[0].type != ARGT_STR) { + /* optional prefix */ + args[0].data.str.area = ""; + args[0].data.str.data = 0; + } + + if (args[1].type == ARGT_STR) + name = args[1].data.str.area; + + sink = sink_find(name); + if (!sink) { + memprintf(err, "No such sink '%s'", name); + return 0; + } + + chunk_destroy(&args[1].data.str); + args[1].type = ARGT_PTR; + args[1].data.ptr = sink; + return 1; +} + +static int sample_conv_base642bin(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct buffer *trash = get_trash_chunk(); + int bin_len; + + trash->data = 0; + bin_len = base64dec(smp->data.u.str.area, smp->data.u.str.data, + trash->area, trash->size); + if (bin_len < 0) + return 0; + + trash->data = bin_len; + smp->data.u.str = *trash; + smp->data.type = SMP_T_BIN; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +static int sample_conv_base64url2bin(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct buffer *trash = get_trash_chunk(); + int bin_len; + + trash->data = 0; + bin_len = base64urldec(smp->data.u.str.area, smp->data.u.str.data, + trash->area, trash->size); + if (bin_len < 0) + return 0; + + trash->data = bin_len; + smp->data.u.str = *trash; + smp->data.type = SMP_T_BIN; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +static int sample_conv_bin2base64(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct buffer *trash = get_trash_chunk(); + int b64_len; + + trash->data = 0; + b64_len = a2base64(smp->data.u.str.area, smp->data.u.str.data, + trash->area, trash->size); + if (b64_len < 0) + return 0; + + trash->data = b64_len; + smp->data.u.str = *trash; + smp->data.type = SMP_T_STR; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +static int sample_conv_bin2base64url(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct buffer *trash = get_trash_chunk(); + int b64_len; + + trash->data = 0; + b64_len = a2base64url(smp->data.u.str.area, smp->data.u.str.data, + trash->area, trash->size); + if (b64_len < 0) + return 0; + + trash->data = b64_len; + smp->data.u.str = *trash; + smp->data.type = SMP_T_STR; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +/* This function returns a sample struct filled with the conversion of variable + * <var> to sample type <type> (SMP_T_*), via a cast to the target type. If the + * variable cannot be retrieved or casted, 0 is returned, otherwise 1. + * + * Keep in mind that the sample content may be written to a pre-allocated + * trash chunk as returned by get_trash_chunk(). + */ +int sample_conv_var2smp(const struct var_desc *var, struct sample *smp, int type) +{ + if (!vars_get_by_desc(var, smp, NULL)) + return 0; + if (!sample_casts[smp->data.type][type]) + return 0; + if (!sample_casts[smp->data.type][type](smp)) + return 0; + return 1; +} + +static int sample_conv_sha1(const struct arg *arg_p, struct sample *smp, void *private) +{ + blk_SHA_CTX ctx; + struct buffer *trash = get_trash_chunk(); + + memset(&ctx, 0, sizeof(ctx)); + + blk_SHA1_Init(&ctx); + blk_SHA1_Update(&ctx, smp->data.u.str.area, smp->data.u.str.data); + blk_SHA1_Final((unsigned char *) trash->area, &ctx); + + trash->data = 20; + smp->data.u.str = *trash; + smp->data.type = SMP_T_BIN; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +/* This function returns a sample struct filled with an <arg> content. 
+ * If the <arg> contains a string, it is returned in the sample flagged as + * SMP_F_CONST. If the <arg> contains a variable descriptor, the sample is + * filled with the content of the variable by using vars_get_by_desc(). + * + * Keep in mind that the sample content may be written to a pre-allocated + * trash chunk as returned by get_trash_chunk(). + * + * This function returns 0 if an error occurs, otherwise it returns 1. + */ +int sample_conv_var2smp_str(const struct arg *arg, struct sample *smp) +{ + switch (arg->type) { + case ARGT_STR: + smp->data.type = SMP_T_STR; + smp->data.u.str = arg->data.str; + smp->flags = SMP_F_CONST; + return 1; + case ARGT_VAR: + return sample_conv_var2smp(&arg->data.var, smp, SMP_T_STR); + default: + return 0; + } +} + +static int sample_conv_be2dec_check(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err) +{ + if (args[1].data.sint <= 0 || args[1].data.sint > sizeof(unsigned long long)) { + memprintf(err, "chunk_size out of [1..%u] range (%lld)", (uint)sizeof(unsigned long long), args[1].data.sint); + return 0; + } + + if (args[2].data.sint != 0 && args[2].data.sint != 1) { + memprintf(err, "Unsupported truncate value (%lld)", args[2].data.sint); + return 0; + } + + return 1; +} + +/* Converts big-endian binary input sample to a string containing an unsigned + * integer number per <chunk_size> input bytes separated with <separator>. + * Optional <truncate> flag indicates if input is truncated at <chunk_size> + * boundaries. + * Arguments: separator (string), chunk_size (integer), truncate (0,1) + */ +static int sample_conv_be2dec(const struct arg *args, struct sample *smp, void *private) +{ + struct buffer *trash = get_trash_chunk(); + const int last = args[2].data.sint ? smp->data.u.str.data - args[1].data.sint + 1 : smp->data.u.str.data; + int max_size = trash->size - 2; + int i; + int start; + int ptr = 0; + unsigned long long number; + char *pos; + + trash->data = 0; + + while (ptr < last && trash->data <= max_size) { + start = trash->data; + if (ptr) { + /* Add separator */ + memcpy(trash->area + trash->data, args[0].data.str.area, args[0].data.str.data); + trash->data += args[0].data.str.data; + } + else + max_size -= args[0].data.str.data; + + /* Add integer */ + for (number = 0, i = 0; i < args[1].data.sint && ptr < smp->data.u.str.data; i++) + number = (number << 8) + (unsigned char)smp->data.u.str.area[ptr++]; + + pos = ulltoa(number, trash->area + trash->data, trash->size - trash->data); + if (pos) + trash->data = pos - trash->area; + else { + trash->data = start; + break; + } + } + + smp->data.u.str = *trash; + smp->data.type = SMP_T_STR; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +static int sample_conv_be2hex_check(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err) +{ + if (args[1].data.sint <= 0 && (args[0].data.str.data > 0 || args[2].data.sint != 0)) { + memprintf(err, "chunk_size needs to be positive (%lld)", args[1].data.sint); + return 0; + } + + if (args[2].data.sint != 0 && args[2].data.sint != 1) { + memprintf(err, "Unsupported truncate value (%lld)", args[2].data.sint); + return 0; + } + + return 1; +} + +/* Converts big-endian binary input sample to a hex string containing two hex + * digits per input byte. <separator> is put every <chunk_size> binary input + * bytes if specified. Optional <truncate> flag indicates if input is truncated + * at <chunk_size> boundaries. 
+ * Arguments: separator (string), chunk_size (integer), truncate (0,1) + */ +static int sample_conv_be2hex(const struct arg *args, struct sample *smp, void *private) +{ + struct buffer *trash = get_trash_chunk(); + int chunk_size = args[1].data.sint; + const int last = args[2].data.sint ? smp->data.u.str.data - chunk_size + 1 : smp->data.u.str.data; + int i; + int max_size; + int ptr = 0; + unsigned char c; + + trash->data = 0; + if (args[0].data.str.data == 0 && args[2].data.sint == 0) + chunk_size = smp->data.u.str.data; + max_size = trash->size - 2 * chunk_size; + + while (ptr < last && trash->data <= max_size) { + if (ptr) { + /* Add separator */ + memcpy(trash->area + trash->data, args[0].data.str.area, args[0].data.str.data); + trash->data += args[0].data.str.data; + } + else + max_size -= args[0].data.str.data; + + /* Add hex */ + for (i = 0; i < chunk_size && ptr < smp->data.u.str.data; i++) { + c = smp->data.u.str.area[ptr++]; + trash->area[trash->data++] = hextab[(c >> 4) & 0xF]; + trash->area[trash->data++] = hextab[c & 0xF]; + } + } + + smp->data.u.str = *trash; + smp->data.type = SMP_T_STR; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +static int sample_conv_bin2hex(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct buffer *trash = get_trash_chunk(); + unsigned char c; + int ptr = 0; + + trash->data = 0; + while (ptr < smp->data.u.str.data && trash->data <= trash->size - 2) { + c = smp->data.u.str.area[ptr++]; + trash->area[trash->data++] = hextab[(c >> 4) & 0xF]; + trash->area[trash->data++] = hextab[c & 0xF]; + } + smp->data.u.str = *trash; + smp->data.type = SMP_T_STR; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +static int sample_conv_hex2int(const struct arg *arg_p, struct sample *smp, void *private) +{ + long long int n = 0; + int i, c; + + for (i = 0; i < smp->data.u.str.data; i++) { + if ((c = hex2i(smp->data.u.str.area[i])) < 0) + return 0; + n = (n << 4) + c; + } + + smp->data.u.sint = n; + smp->data.type = SMP_T_SINT; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +/* hashes the binary input into a 32-bit unsigned int */ +static int sample_conv_djb2(const struct arg *arg_p, struct sample *smp, void *private) +{ + smp->data.u.sint = hash_djb2(smp->data.u.str.area, + smp->data.u.str.data); + if (arg_p->data.sint) + smp->data.u.sint = full_hash(smp->data.u.sint); + smp->data.type = SMP_T_SINT; + return 1; +} + +static int sample_conv_length(const struct arg *arg_p, struct sample *smp, void *private) +{ + int i = smp->data.u.str.data; + smp->data.u.sint = i; + smp->data.type = SMP_T_SINT; + return 1; +} + + +static int sample_conv_str2lower(const struct arg *arg_p, struct sample *smp, void *private) +{ + int i; + + if (!smp_make_rw(smp)) + return 0; + + for (i = 0; i < smp->data.u.str.data; i++) { + if ((smp->data.u.str.area[i] >= 'A') && (smp->data.u.str.area[i] <= 'Z')) + smp->data.u.str.area[i] += 'a' - 'A'; + } + return 1; +} + +static int sample_conv_str2upper(const struct arg *arg_p, struct sample *smp, void *private) +{ + int i; + + if (!smp_make_rw(smp)) + return 0; + + for (i = 0; i < smp->data.u.str.data; i++) { + if ((smp->data.u.str.area[i] >= 'a') && (smp->data.u.str.area[i] <= 'z')) + smp->data.u.str.area[i] += 'A' - 'a'; + } + return 1; +} + +/* takes the IPv4 mask in args[0] and an optional IPv6 mask in args[1] */ +static int sample_conv_ipmask(const struct arg *args, struct sample *smp, void *private) +{ + /* Attempt to convert to IPv4 to apply the correct mask. 
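+ * (For instance, an IPv6-mapped IPv4 address such as ::ffff:192.0.2.1 is
+ * first turned back into a plain IPv4 sample by c_ipv62ip() so that the
+ * IPv4 mask in args[0] applies; a genuine IPv6 address keeps its type and
+ * is only masked when the optional IPv6 mask in args[1] is provided.)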
*/ + c_ipv62ip(smp); + + if (smp->data.type == SMP_T_IPV4) { + smp->data.u.ipv4.s_addr &= args[0].data.ipv4.s_addr; + smp->data.type = SMP_T_IPV4; + } + else if (smp->data.type == SMP_T_IPV6) { + /* IPv6 cannot be converted without an IPv6 mask. */ + if (args[1].type != ARGT_IPV6) + return 0; + + write_u64(&smp->data.u.ipv6.s6_addr[0], + read_u64(&smp->data.u.ipv6.s6_addr[0]) & read_u64(&args[1].data.ipv6.s6_addr[0])); + write_u64(&smp->data.u.ipv6.s6_addr[8], + read_u64(&smp->data.u.ipv6.s6_addr[8]) & read_u64(&args[1].data.ipv6.s6_addr[8])); + smp->data.type = SMP_T_IPV6; + } + + return 1; +} + +/* + * This function implements a conversion specifier seeker for %N so it can be + * replaced before calling strftime. + * + * <format> is the input format string which is used as a haystack + * + * The function fills multiple variables: + * <skip> is the length of the conversion specifier string which was found (ex: strlen(%N):2, strlen(%3N):3 strlen(%123N): 5) + * <width> is the width argument, default width is 9 (ex: %3N: 3, %4N: 4: %N: 9, %5N: 5) + * + * Returns a ptr to the first character of the conversion specifier or NULL if not found + */ +static const char *lookup_convspec_N(const char *format, int *skip, int *width) +{ + const char *p, *needle; + const char *digits; + int state; + + p = format; + + /* this looks for '%' in a loop. The iteration stops when a %N conversion + * specifier is found or when there is no '%' left */ +lookagain: + while (p && *p) { + state = 0; + digits = NULL; + + p = needle = strchr(p, '%'); + /* Once we find a % we try to move forward in the string + * + * state 0: found % + * state 1: digits (precision) + * state 2: N + */ + while (p && *p) { + switch (state) { + case 0: + state = 1; + break; + + case 1: + if (isdigit((unsigned char)*p) && !digits) /* set the start of the digits */ + digits = p; + + if (isdigit((unsigned char)*p)) + break; + else + state = 2; + /* if this is not a number anymore, we + * don't want to increment p but try the + * next state directly */ + __fallthrough; + case 2: + if (*p == 'N') + goto found; + else + /* this was not a %N, start again */ + goto lookagain; + break; + } + p++; + } + } + + *skip = 0; + *width = 0; + return NULL; + +found: + *skip = p - needle + 1; + if (digits) + *width = atoi(digits); + else + *width = 9; + return needle; +} + + /* + * strftime(3) does not implement nanoseconds, but we still want them in our + * date format. 
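+ *
+ * (The seeker above yields, for instance, with a hypothetical driver:
+ *
+ *	p = lookup_convspec_N("timestamp %4N ns", &skip, &width);
+ *		// p points at '%', skip == 3, width == 4
+ *	p = lookup_convspec_N("%N", &skip, &width);
+ *		// skip == 2, width == 9 (the default)
+ *	p = lookup_convspec_N("no specifier", &skip, &width);
+ *		// p == NULL, skip == 0, width == 0
+ * )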
+ * + * This function implements %N like in date(1) which gives you the nanoseconds part of the timestamp + * An optional field width can be specified, a maximum width of 9 is supported (ex: %3N %6N %9N) + * + * <format> is the format string + * <curr_date> in seconds since epoch + * <ns> only the nanoseconds part of the timestamp + * <local> selects local time instead of UTC + * + * Returns the result of strftime() in the trash buffer + */ +static struct buffer *conv_time_common(const char *format, time_t curr_date, uint64_t ns, int local) +{ + struct buffer *tmp_format = NULL; + struct buffer *res = NULL; + struct tm tm; + const char *p; + char ns_str[10] = {}; + int set = 0; + + if (local) + get_localtime(curr_date, &tm); + else + get_gmtime(curr_date, &tm); + + + /* we need to iterate in order to replace all the %N in the string */ + + p = format; + while (*p) { + const char *needle; + int skip = 0; + int cpy = 0; + int width = 0; + + /* look for the next %N conversion specifier */ + if (!(needle = lookup_convspec_N(p, &skip, &width))) + break; + + if (width > 9) /* we don't handle more than 9 */ + width = 9; + cpy = needle - p; + + if (!tmp_format) + tmp_format = alloc_trash_chunk(); + if (!tmp_format) + goto error; + + if (set != 9) /* if the snprintf wasn't done yet */ + set = snprintf(ns_str, sizeof(ns_str), "%.9llu", (unsigned long long)ns); + + if (chunk_istcat(tmp_format, ist2(p, cpy)) == 0) /* copy before the %N */ + goto error; + if (chunk_istcat(tmp_format, ist2(ns_str, width)) == 0) /* copy the %N result with the right precision */ + goto error; + + p += skip + cpy; /* skip the %N */ + } + + + if (tmp_format) { /* %N was found */ + if (chunk_strcat(tmp_format, p) == 0) /* copy the end of the string if needed or just the \0 */ + goto error; + res = get_trash_chunk(); + res->data = strftime(res->area, res->size, tmp_format->area , &tm); + } else { + res = get_trash_chunk(); + res->data = strftime(res->area, res->size, format, &tm); + } + +error: + free_trash_chunk(tmp_format); + return res; +} + + + +/* + * same as sample_conv_ltime but input is us and %N is supported + */ +static int sample_conv_us_ltime(const struct arg *args, struct sample *smp, void *private) +{ + struct buffer *temp; + time_t curr_date = smp->data.u.sint / 1000000; /* convert us to s */ + uint64_t ns = (smp->data.u.sint % 1000000) * 1000; /* us part to ns */ + + /* add offset */ + if (args[1].type == ARGT_SINT) + curr_date += args[1].data.sint; + + temp = conv_time_common(args[0].data.str.area, curr_date, ns, 1); + smp->data.u.str = *temp; + smp->data.type = SMP_T_STR; + return 1; +} + +/* + * same as sample_conv_ltime but input is ms and %N is supported + */ +static int sample_conv_ms_ltime(const struct arg *args, struct sample *smp, void *private) +{ + struct buffer *temp; + time_t curr_date = smp->data.u.sint / 1000; /* convert ms to s */ + uint64_t ns = (smp->data.u.sint % 1000) * 1000000; /* ms part to ns */ + + /* add offset */ + if (args[1].type == ARGT_SINT) + curr_date += args[1].data.sint; + + temp = conv_time_common(args[0].data.str.area, curr_date, ns, 1); + smp->data.u.str = *temp; + smp->data.type = SMP_T_STR; + return 1; +} + + +/* takes a UINT value on input supposed to represent the time since EPOCH, + * adds an optional offset found in args[1] and emits a string representing + * the local time in the format specified in args[0] using strftime(). 
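+ *
+ * (Worked example for the %N handling above, assuming a strftime() that
+ * supports "%s": an input of 1700000123456789 microseconds fed to
+ * us_ltime("%s.%6N") is split into curr_date = 1700000123 and
+ * ns = 456789000; the nine-digit string "456789000" is then cut to the
+ * requested width of 6, producing "1700000123.456789".)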
+ */ +static int sample_conv_ltime(const struct arg *args, struct sample *smp, void *private) +{ + struct buffer *temp; + /* With high numbers, the date returned can be negative, the 55-bit mask prevents this. */ + time_t curr_date = smp->data.u.sint & 0x007fffffffffffffLL; + struct tm tm; + + /* add offset */ + if (args[1].type == ARGT_SINT) + curr_date += args[1].data.sint; + + get_localtime(curr_date, &tm); + + temp = get_trash_chunk(); + temp->data = strftime(temp->area, temp->size, args[0].data.str.area, &tm); + smp->data.u.str = *temp; + smp->data.type = SMP_T_STR; + return 1; +} + +/* hashes the binary input into a 32-bit unsigned int */ +static int sample_conv_sdbm(const struct arg *arg_p, struct sample *smp, void *private) +{ + smp->data.u.sint = hash_sdbm(smp->data.u.str.area, + smp->data.u.str.data); + if (arg_p->data.sint) + smp->data.u.sint = full_hash(smp->data.u.sint); + smp->data.type = SMP_T_SINT; + return 1; +} + +/* + * same as sample_conv_utime but input is us and %N is supported + */ +static int sample_conv_us_utime(const struct arg *args, struct sample *smp, void *private) +{ + struct buffer *temp; + time_t curr_date = smp->data.u.sint / 1000000; /* convert us to s */ + uint64_t ns = (smp->data.u.sint % 1000000) * 1000; /* us part to ns */ + + /* add offset */ + if (args[1].type == ARGT_SINT) + curr_date += args[1].data.sint; + + temp = conv_time_common(args[0].data.str.area, curr_date, ns, 0); + smp->data.u.str = *temp; + smp->data.type = SMP_T_STR; + return 1; +} + +/* + * same as sample_conv_utime but input is ms and %N is supported + */ +static int sample_conv_ms_utime(const struct arg *args, struct sample *smp, void *private) +{ + struct buffer *temp; + time_t curr_date = smp->data.u.sint / 1000; /* convert ms to s */ + uint64_t ns = (smp->data.u.sint % 1000) * 1000000; /* ms part to ns */ + + /* add offset */ + if (args[1].type == ARGT_SINT) + curr_date += args[1].data.sint; + + temp = conv_time_common(args[0].data.str.area, curr_date, ns, 0); + smp->data.u.str = *temp; + smp->data.type = SMP_T_STR; + return 1; +} + +/* takes a UINT value on input supposed to represent the time since EPOCH, + * adds an optional offset found in args[1] and emits a string representing + * the UTC date in the format specified in args[0] using strftime(). + */ +static int sample_conv_utime(const struct arg *args, struct sample *smp, void *private) +{ + struct buffer *temp; + /* With high numbers, the date returned can be negative, the 55-bit mask prevents this. */ + time_t curr_date = smp->data.u.sint & 0x007fffffffffffffLL; + struct tm tm; + + /* add offset */ + if (args[1].type == ARGT_SINT) + curr_date += args[1].data.sint; + + get_gmtime(curr_date, &tm); + + temp = get_trash_chunk(); + temp->data = strftime(temp->area, temp->size, args[0].data.str.area, &tm); + smp->data.u.str = *temp; + smp->data.type = SMP_T_STR; + return 1; +} + +/* hashes the binary input into a 32-bit unsigned int */ +static int sample_conv_wt6(const struct arg *arg_p, struct sample *smp, void *private) +{ + smp->data.u.sint = hash_wt6(smp->data.u.str.area, + smp->data.u.str.data); + if (arg_p->data.sint) + smp->data.u.sint = full_hash(smp->data.u.sint); + smp->data.type = SMP_T_SINT; + return 1; +} + +/* hashes the binary input into a 32-bit unsigned int using xxh. + * The seed of the hash defaults to 0 but can be changed in argument 1. 
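+ *
+ * (A quick sanity check of the underlying primitive, as a standalone sketch
+ * against the bundled xxhash; the expected constants are the library's
+ * published test vectors for empty input with seed 0:
+ *
+ *	printf("%08x\n", (unsigned int)XXH32("", 0, 0));          // 02cc5d05
+ *	printf("%016llx\n", (unsigned long long)XXH64("", 0, 0)); // ef46db3751d8e999
+ * )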
+ */
+static int sample_conv_xxh32(const struct arg *arg_p, struct sample *smp, void *private)
+{
+    unsigned int seed;
+
+    if (arg_p->data.sint)
+        seed = arg_p->data.sint;
+    else
+        seed = 0;
+    smp->data.u.sint = XXH32(smp->data.u.str.area, smp->data.u.str.data,
+                             seed);
+    smp->data.type = SMP_T_SINT;
+    return 1;
+}
+
+/* hashes the binary input into a 64-bit unsigned int using xxh.
+ * In fact, the function returns a 64-bit unsigned int, but the sample
+ * storage of haproxy only offers a 64-bit signed one, so the value is
+ * cast as signed. This cast doesn't affect the hash distribution.
+ * The seed of the hash defaults to 0 but can be changed in argument 1.
+ */
+static int sample_conv_xxh64(const struct arg *arg_p, struct sample *smp, void *private)
+{
+    unsigned long long int seed;
+
+    if (arg_p->data.sint)
+        seed = (unsigned long long int)arg_p->data.sint;
+    else
+        seed = 0;
+    smp->data.u.sint = (long long int)XXH64(smp->data.u.str.area,
+                                            smp->data.u.str.data, seed);
+    smp->data.type = SMP_T_SINT;
+    return 1;
+}
+
+static int sample_conv_xxh3(const struct arg *arg_p, struct sample *smp, void *private)
+{
+    unsigned long long int seed;
+
+    if (arg_p->data.sint)
+        seed = (unsigned long long int)arg_p->data.sint;
+    else
+        seed = 0;
+    smp->data.u.sint = (long long int)XXH3(smp->data.u.str.area,
+                                           smp->data.u.str.data, seed);
+    smp->data.type = SMP_T_SINT;
+    return 1;
+}
+
+/* hashes the binary input into a 32-bit unsigned int */
+static int sample_conv_crc32(const struct arg *arg_p, struct sample *smp, void *private)
+{
+    smp->data.u.sint = hash_crc32(smp->data.u.str.area,
+                                  smp->data.u.str.data);
+    if (arg_p->data.sint)
+        smp->data.u.sint = full_hash(smp->data.u.sint);
+    smp->data.type = SMP_T_SINT;
+    return 1;
+}
+
+/* hashes the binary input into crc32c (RFC4960, Appendix B [8].) */
+static int sample_conv_crc32c(const struct arg *arg_p, struct sample *smp, void *private)
+{
+    smp->data.u.sint = hash_crc32c(smp->data.u.str.area,
+                                   smp->data.u.str.data);
+    if (arg_p->data.sint)
+        smp->data.u.sint = full_hash(smp->data.u.sint);
+    smp->data.type = SMP_T_SINT;
+    return 1;
+}
+
+/* This function escapes special JSON characters. The returned string can be
+ * safely set between two '"' and used as a JSON string. A JSON string is
+ * defined like this:
+ *
+ *    any Unicode character except '"' or '\' or control character
+ *    \", \\, \/, \b, \f, \n, \r, \t, \u + four-hex-digits
+ *
+ * The enum input_type contains all the allowed modes for decoding the input
+ * string.
+ */
+enum input_type {
+    IT_ASCII = 0,
+    IT_UTF8,
+    IT_UTF8S,
+    IT_UTF8P,
+    IT_UTF8PS,
+};
+
+static int sample_conv_json_check(struct arg *arg, struct sample_conv *conv,
+                                  const char *file, int line, char **err)
+{
+    enum input_type type;
+
+    if (strcmp(arg->data.str.area, "") == 0)
+        type = IT_ASCII;
+    else if (strcmp(arg->data.str.area, "ascii") == 0)
+        type = IT_ASCII;
+    else if (strcmp(arg->data.str.area, "utf8") == 0)
+        type = IT_UTF8;
+    else if (strcmp(arg->data.str.area, "utf8s") == 0)
+        type = IT_UTF8S;
+    else if (strcmp(arg->data.str.area, "utf8p") == 0)
+        type = IT_UTF8P;
+    else if (strcmp(arg->data.str.area, "utf8ps") == 0)
+        type = IT_UTF8PS;
+    else {
+        memprintf(err, "Unexpected input code type. "
+                  "Allowed values are 'ascii', 'utf8', 'utf8s', 'utf8p' and 'utf8ps'");
+        return 0;
+    }
+
+    chunk_destroy(&arg->data.str);
+    arg->type = ARGT_SINT;
+    arg->data.sint = type;
+    return 1;
+}
+
+static int sample_conv_json(const struct arg *arg_p, struct sample *smp, void *private)
+{
+    struct buffer *temp;
+    char _str[7]; /* \u + 4 hex digits + null char for sprintf. */
+    const char *str;
+    int len;
+    enum input_type input_type = IT_ASCII;
+    unsigned int c;
+    unsigned int ret;
+    char *p;
+
+    input_type = arg_p->data.sint;
+
+    temp = get_trash_chunk();
+    temp->data = 0;
+
+    p = smp->data.u.str.area;
+    while (p < smp->data.u.str.area + smp->data.u.str.data) {
+
+        if (input_type == IT_ASCII) {
+            /* Read input as ASCII. */
+            c = *(unsigned char *)p;
+            p++;
+        }
+        else {
+            /* Read input as UTF8. */
+            ret = utf8_next(p,
+                            smp->data.u.str.data - (p - smp->data.u.str.area),
+                            &c);
+            p += utf8_return_length(ret);
+
+            if (input_type == IT_UTF8 && utf8_return_code(ret) != UTF8_CODE_OK)
+                return 0;
+            if (input_type == IT_UTF8S && utf8_return_code(ret) != UTF8_CODE_OK)
+                continue;
+            if (input_type == IT_UTF8P && utf8_return_code(ret) & (UTF8_CODE_INVRANGE|UTF8_CODE_BADSEQ))
+                return 0;
+            if (input_type == IT_UTF8PS && utf8_return_code(ret) & (UTF8_CODE_INVRANGE|UTF8_CODE_BADSEQ))
+                continue;
+
+            /* Check too big values. */
+            if ((unsigned int)c > 0xffff) {
+                if (input_type == IT_UTF8 || input_type == IT_UTF8P)
+                    return 0;
+                continue;
+            }
+        }
+
+        /* Convert character. */
+        if (c == '"') {
+            len = 2;
+            str = "\\\"";
+        }
+        else if (c == '\\') {
+            len = 2;
+            str = "\\\\";
+        }
+        else if (c == '/') {
+            len = 2;
+            str = "\\/";
+        }
+        else if (c == '\b') {
+            len = 2;
+            str = "\\b";
+        }
+        else if (c == '\f') {
+            len = 2;
+            str = "\\f";
+        }
+        else if (c == '\r') {
+            len = 2;
+            str = "\\r";
+        }
+        else if (c == '\n') {
+            len = 2;
+            str = "\\n";
+        }
+        else if (c == '\t') {
+            len = 2;
+            str = "\\t";
+        }
+        else if (c > 0xff || !isprint((unsigned char)c)) {
+            /* isprint() may crash if c is too big: the man page says that
+             * c must have the value of an unsigned char or EOF.
+             */
+            len = 6;
+            _str[0] = '\\';
+            _str[1] = 'u';
+            snprintf(&_str[2], 5, "%04x", (unsigned short)c);
+            str = _str;
+        }
+        else {
+            len = 1;
+            _str[0] = c;
+            str = _str;
+        }
+
+        /* Check length */
+        if (temp->data + len > temp->size)
+            return 0;
+
+        /* Copy string. */
+        memcpy(temp->area + temp->data, str, len);
+        temp->data += len;
+    }
+
+    smp->flags &= ~SMP_F_CONST;
+    smp->data.u.str = *temp;
+    smp->data.type = SMP_T_STR;
+
+    return 1;
+}
+
+/* This sample function is designed to extract some bytes from an input buffer.
+ * First arg is the offset (e.g. with input "ABCDEF", an offset of 1 makes the
+ * output start at "B").
+ * Optional second arg is the length to truncate */ +static int sample_conv_bytes(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct sample smp_arg0, smp_arg1; + long long start_idx, length; + + // determine the start_idx and length of the output + smp_set_owner(&smp_arg0, smp->px, smp->sess, smp->strm, smp->opt); + if (!sample_conv_var2smp_sint(&arg_p[0], &smp_arg0) || smp_arg0.data.u.sint < 0) { + /* invalid or negative value */ + goto fail; + } + + if (smp_arg0.data.u.sint >= smp->data.u.str.data) { + // arg0 >= the input length + if (smp->opt & SMP_OPT_FINAL) { + // empty output value on final smp + smp->data.u.str.data = 0; + goto end; + } + goto wait; + } + start_idx = smp_arg0.data.u.sint; + + // length comes from arg1 if present, otherwise it's the remaining length + length = smp->data.u.str.data - start_idx; + if (arg_p[1].type != ARGT_STOP) { + smp_set_owner(&smp_arg1, smp->px, smp->sess, smp->strm, smp->opt); + if (!sample_conv_var2smp_sint(&arg_p[1], &smp_arg1) || smp_arg1.data.u.sint < 0) { + // invalid or negative value + goto fail; + } + + if (smp_arg1.data.u.sint > (smp->data.u.str.data - start_idx)) { + // arg1 value is greater than the remaining length + if (smp->opt & SMP_OPT_FINAL) { + // truncate to remaining length + length = smp->data.u.str.data - start_idx; + goto end; + } + goto wait; + } + length = smp_arg1.data.u.sint; + } + + // update the output using the start_idx and length + smp->data.u.str.area += start_idx; + smp->data.u.str.data = length; + + end: + return 1; + + fail: + smp->flags &= ~SMP_F_MAY_CHANGE; + wait: + smp->data.u.str.data = 0; + return 0; +} + +static int sample_conv_field_check(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err) +{ + struct arg *arg = args; + + if (arg->type != ARGT_SINT) { + memprintf(err, "Unexpected arg type"); + return 0; + } + + if (!arg->data.sint) { + memprintf(err, "Unexpected value 0 for index"); + return 0; + } + + arg++; + + if (arg->type != ARGT_STR) { + memprintf(err, "Unexpected arg type"); + return 0; + } + + if (!arg->data.str.data) { + memprintf(err, "Empty separators list"); + return 0; + } + + return 1; +} + +/* This sample function is designed to a return selected part of a string (field). + * First arg is the index of the field (start at 1) + * Second arg is a char list of separators (type string) + */ +static int sample_conv_field(const struct arg *arg_p, struct sample *smp, void *private) +{ + int field; + char *start, *end; + int i; + int count = (arg_p[2].type == ARGT_SINT) ? 
arg_p[2].data.sint : 1; + + if (!arg_p[0].data.sint) + return 0; + + if (arg_p[0].data.sint < 0) { + field = -1; + end = start = smp->data.u.str.area + smp->data.u.str.data; + while (start > smp->data.u.str.area) { + for (i = 0 ; i < arg_p[1].data.str.data; i++) { + if (*(start-1) == arg_p[1].data.str.area[i]) { + if (field == arg_p[0].data.sint) { + if (count == 1) + goto found; + else if (count > 1) + count--; + } else { + end = start-1; + field--; + } + break; + } + } + start--; + } + } else { + field = 1; + end = start = smp->data.u.str.area; + while (end - smp->data.u.str.area < smp->data.u.str.data) { + for (i = 0 ; i < arg_p[1].data.str.data; i++) { + if (*end == arg_p[1].data.str.area[i]) { + if (field == arg_p[0].data.sint) { + if (count == 1) + goto found; + else if (count > 1) + count--; + } else { + start = end+1; + field++; + } + break; + } + } + end++; + } + } + + /* Field not found */ + if (field != arg_p[0].data.sint) { + smp->data.u.str.data = 0; + return 0; + } +found: + smp->data.u.str.data = end - start; + /* If ret string is len 0, no need to + change pointers or to update size */ + if (!smp->data.u.str.data) + return 1; + + /* Compute remaining size if needed + Note: smp->data.u.str.size cannot be set to 0 */ + if (smp->data.u.str.size) + smp->data.u.str.size -= start - smp->data.u.str.area; + + smp->data.u.str.area = start; + + return 1; +} + +/* This sample function is designed to return a word from a string. + * First arg is the index of the word (start at 1) + * Second arg is a char list of words separators (type string) + */ +static int sample_conv_word(const struct arg *arg_p, struct sample *smp, void *private) +{ + int word; + char *start, *end; + int i, issep, inword; + int count = (arg_p[2].type == ARGT_SINT) ? arg_p[2].data.sint : 1; + + if (!arg_p[0].data.sint) + return 0; + + word = 0; + inword = 0; + if (arg_p[0].data.sint < 0) { + end = start = smp->data.u.str.area + smp->data.u.str.data; + while (start > smp->data.u.str.area) { + issep = 0; + for (i = 0 ; i < arg_p[1].data.str.data; i++) { + if (*(start-1) == arg_p[1].data.str.area[i]) { + issep = 1; + break; + } + } + if (!inword) { + if (!issep) { + if (word != arg_p[0].data.sint) { + word--; + end = start; + } + inword = 1; + } + } + else if (issep) { + if (word == arg_p[0].data.sint) { + if (count == 1) + goto found; + else if (count > 1) + count--; + } + inword = 0; + } + start--; + } + } else { + end = start = smp->data.u.str.area; + while (end - smp->data.u.str.area < smp->data.u.str.data) { + issep = 0; + for (i = 0 ; i < arg_p[1].data.str.data; i++) { + if (*end == arg_p[1].data.str.area[i]) { + issep = 1; + break; + } + } + if (!inword) { + if (!issep) { + if (word != arg_p[0].data.sint) { + word++; + start = end; + } + inword = 1; + } + } + else if (issep) { + if (word == arg_p[0].data.sint) { + if (count == 1) + goto found; + else if (count > 1) + count--; + } + inword = 0; + } + end++; + } + } + + /* Field not found */ + if (word != arg_p[0].data.sint) { + smp->data.u.str.data = 0; + return 0; + } +found: + smp->data.u.str.data = end - start; + /* If ret string is len 0, no need to + change pointers or to update size */ + if (!smp->data.u.str.data) + return 1; + + + /* Compute remaining size if needed + Note: smp->data.u.str.size cannot be set to 0 */ + if (smp->data.u.str.size) + smp->data.u.str.size -= start - smp->data.u.str.area; + + smp->data.u.str.area = start; + + return 1; +} + +static int sample_conv_param_check(struct arg *arg, struct sample_conv *conv, + const char *file, int 
line, char **err) +{ + if (arg[1].type == ARGT_STR && arg[1].data.str.data != 1) { + memprintf(err, "Delimiter must be exactly 1 character."); + return 0; + } + + return 1; +} + +static int sample_conv_param(const struct arg *arg_p, struct sample *smp, void *private) +{ + char *pos, *end, *pend, *equal; + char delim = '&'; + const char *name = arg_p[0].data.str.area; + size_t name_l = arg_p[0].data.str.data; + + if (arg_p[1].type == ARGT_STR) + delim = *arg_p[1].data.str.area; + + pos = smp->data.u.str.area; + end = pos + smp->data.u.str.data; + while (pos < end) { + equal = pos + name_l; + /* Parameter not found */ + if (equal > end) + break; + + if (equal == end || *equal == delim) { + if (memcmp(pos, name, name_l) == 0) { + /* input contains parameter, but no value is supplied */ + smp->data.u.str.data = 0; + return 1; + } + pos = equal + 1; + continue; + } + + if (*equal == '=' && memcmp(pos, name, name_l) == 0) { + pos = equal + 1; + pend = memchr(pos, delim, end - pos); + if (pend == NULL) + pend = end; + + if (smp->data.u.str.size) + smp->data.u.str.size -= pos - smp->data.u.str.area; + smp->data.u.str.area = pos; + smp->data.u.str.data = pend - pos; + return 1; + } + /* find the next delimiter and set position to character after that */ + pos = memchr(pos, delim, end - pos); + if (pos == NULL) + pos = end; + else + pos++; + } + /* Parameter not found */ + smp->data.u.str.data = 0; + return 0; +} + +static int sample_conv_regsub_check(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err) +{ + struct arg *arg = args; + char *p; + int len; + + /* arg0 is a regex, it uses type_flag for ICASE and global match */ + arg[0].type_flags = 0; + + if (arg[2].type != ARGT_STR) + return 1; + + p = arg[2].data.str.area; + len = arg[2].data.str.data; + while (len) { + if (*p == 'i') { + arg[0].type_flags |= ARGF_REG_ICASE; + } + else if (*p == 'g') { + arg[0].type_flags |= ARGF_REG_GLOB; + } + else { + memprintf(err, "invalid regex flag '%c', only 'i' and 'g' are supported", *p); + return 0; + } + p++; + len--; + } + return 1; +} + +/* This sample function is designed to do the equivalent of s/match/replace/ on + * the input string. It applies a regex and restarts from the last matched + * location until nothing matches anymore. First arg is the regex to apply to + * the input string, second arg is the replacement expression. + */ +static int sample_conv_regsub(const struct arg *arg_p, struct sample *smp, void *private) +{ + char *start, *end; + struct my_regex *reg = arg_p[0].data.reg; + regmatch_t pmatch[MAX_MATCH]; + struct buffer *trash = get_trash_chunk(); + struct buffer *output; + int flag, max; + int found; + + start = smp->data.u.str.area; + end = start + smp->data.u.str.data; + + flag = 0; + while (1) { + /* check for last round which is used to copy remaining parts + * when not running in global replacement mode. 
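+ *
+ * (Editor's illustration of the converter as a whole: "aba",regsub(a,X)
+ * rewrites only the first match and yields "Xba", while
+ * "aba",regsub(a,X,g) keeps looping here and yields "XbX".)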
+ */
+        found = 0;
+        if ((arg_p[0].type_flags & ARGF_REG_GLOB) || !(flag & REG_NOTBOL)) {
+            /* Note: we can have start == end on empty strings or at the end */
+            found = regex_exec_match2(reg, start, end - start, MAX_MATCH, pmatch, flag);
+        }
+
+        if (!found)
+            pmatch[0].rm_so = end - start;
+
+        /* copy the heading non-matching part (which may also be the tail if nothing matches) */
+        max = trash->size - trash->data;
+        if (max && pmatch[0].rm_so > 0) {
+            if (max > pmatch[0].rm_so)
+                max = pmatch[0].rm_so;
+            memcpy(trash->area + trash->data, start, max);
+            trash->data += max;
+        }
+
+        if (!found)
+            break;
+
+        output = alloc_trash_chunk();
+        if (!output)
+            break;
+
+        output->data = exp_replace(output->area, output->size, start, arg_p[1].data.str.area, pmatch);
+
+        /* replace the matching part, bounded by the room left in <trash> */
+        max = trash->size - trash->data;
+        if (max) {
+            if (max > output->data)
+                max = output->data;
+            memcpy(trash->area + trash->data,
+                   output->area, max);
+            trash->data += max;
+        }
+
+        free_trash_chunk(output);
+
+        /* stop here if we're done with this string */
+        if (start >= end)
+            break;
+
+        /* We have a special case for matches of length 0 (eg: "x*y*").
+         * These ones are considered to match in front of a character,
+         * so we have to copy that character and skip to the next one.
+         */
+        if (!pmatch[0].rm_eo) {
+            if (trash->data < trash->size)
+                trash->area[trash->data++] = start[pmatch[0].rm_eo];
+            pmatch[0].rm_eo++;
+        }
+
+        start += pmatch[0].rm_eo;
+        flag |= REG_NOTBOL;
+    }
+
+    smp->data.u.str = *trash;
+    return 1;
+}
+
+/* This function checks an operator entry. It expects a string.
+ * The string can be an integer or a variable name.
+ */
+static int check_operator(struct arg *args, struct sample_conv *conv,
+                          const char *file, int line, char **err)
+{
+    const char *str;
+    const char *end;
+    long long int i;
+
+    /* Try to decode a variable. The 'err' variable is intentionally left
+     * NULL since the operators accept an integer as argument in which case
+     * the vars_check_arg call will fail.
+     */
+    if (vars_check_arg(&args[0], NULL))
+        return 1;
+
+    /* Try to convert an integer */
+    str = args[0].data.str.area;
+    end = str + strlen(str);
+    i = read_int64(&str, end);
+    if (*str != '\0') {
+        memprintf(err, "expects an integer or a variable name");
+        return 0;
+    }
+
+    chunk_destroy(&args[0].data.str);
+    args[0].type = ARGT_SINT;
+    args[0].data.sint = i;
+    return 1;
+}
+
+/* This function returns a sample struct filled with an arg content.
+ * If the arg contains an integer, the integer is returned in the
+ * sample. If the arg contains a variable descriptor, it returns the
+ * variable value.
+ *
+ * This function returns 0 if an error occurs, otherwise it returns 1.
+ */
+int sample_conv_var2smp_sint(const struct arg *arg, struct sample *smp)
+{
+    switch (arg->type) {
+    case ARGT_SINT:
+        smp->data.type = SMP_T_SINT;
+        smp->data.u.sint = arg->data.sint;
+        return 1;
+    case ARGT_VAR:
+        return sample_conv_var2smp(&arg->data.var, smp, SMP_T_SINT);
+    default:
+        return 0;
+    }
+}
+
+/* Takes a SINT on input, applies a bitwise (one's) complement and returns
+ * the SINT result.
+ */
+static int sample_conv_binary_cpl(const struct arg *arg_p, struct sample *smp, void *private)
+{
+    smp->data.u.sint = ~smp->data.u.sint;
+    return 1;
+}
+
+/* Takes a SINT on input, applies a binary "and" with the SINT directly in
+ * arg_p or in the variable described in arg_p, and returns the SINT result.
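+ *
+ * (Editor's illustration: 6,and(3) yields 2; the argument may also name a
+ * variable, e.g. and(txn.mask), resolved via sample_conv_var2smp_sint()
+ * above.)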
+ */ +static int sample_conv_binary_and(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct sample tmp; + + smp_set_owner(&tmp, smp->px, smp->sess, smp->strm, smp->opt); + if (!sample_conv_var2smp_sint(arg_p, &tmp)) + return 0; + smp->data.u.sint &= tmp.data.u.sint; + return 1; +} + +/* Takes a SINT on input, applies a binary "or" with the SINT directly in + * arg_p or in the variable described in arg_p, and returns the SINT result. + */ +static int sample_conv_binary_or(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct sample tmp; + + smp_set_owner(&tmp, smp->px, smp->sess, smp->strm, smp->opt); + if (!sample_conv_var2smp_sint(arg_p, &tmp)) + return 0; + smp->data.u.sint |= tmp.data.u.sint; + return 1; +} + +/* Takes a SINT on input, applies a binary "xor" with the SINT directly in + * arg_p or in the variable described in arg_p, and returns the SINT result. + */ +static int sample_conv_binary_xor(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct sample tmp; + + smp_set_owner(&tmp, smp->px, smp->sess, smp->strm, smp->opt); + if (!sample_conv_var2smp_sint(arg_p, &tmp)) + return 0; + smp->data.u.sint ^= tmp.data.u.sint; + return 1; +} + +static inline long long int arith_add(long long int a, long long int b) +{ + /* Prevent overflow and makes capped calculus. + * We must ensure that the check calculus doesn't + * exceed the signed 64 bits limits. + * + * +----------+----------+ + * | a<0 | a>=0 | + * +------+----------+----------+ + * | b<0 | MIN-a>b | no check | + * +------+----------+----------+ + * | b>=0 | no check | MAX-a<b | + * +------+----------+----------+ + */ + if ((a ^ b) >= 0) { + /* signs are same. */ + if (a < 0) { + if (LLONG_MIN - a > b) + return LLONG_MIN; + } + else if (LLONG_MAX - a < b) + return LLONG_MAX; + } + return a + b; +} + +/* Takes a SINT on input, applies an arithmetic "add" with the SINT directly in + * arg_p or in the variable described in arg_p, and returns the SINT result. + */ +static int sample_conv_arith_add(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct sample tmp; + + smp_set_owner(&tmp, smp->px, smp->sess, smp->strm, smp->opt); + if (!sample_conv_var2smp_sint(arg_p, &tmp)) + return 0; + smp->data.u.sint = arith_add(smp->data.u.sint, tmp.data.u.sint); + return 1; +} + +/* Takes a SINT on input, applies an arithmetic "sub" with the SINT directly in + * arg_p or in the variable described in arg_p, and returns the SINT result. + */ +static int sample_conv_arith_sub(const struct arg *arg_p, + struct sample *smp, void *private) +{ + struct sample tmp; + + smp_set_owner(&tmp, smp->px, smp->sess, smp->strm, smp->opt); + if (!sample_conv_var2smp_sint(arg_p, &tmp)) + return 0; + + /* We cannot represent -LLONG_MIN because abs(LLONG_MIN) is greater + * than abs(LLONG_MAX). So, the following code use LLONG_MAX in place + * of -LLONG_MIN and correct the result. + */ + if (tmp.data.u.sint == LLONG_MIN) { + smp->data.u.sint = arith_add(smp->data.u.sint, LLONG_MAX); + if (smp->data.u.sint < LLONG_MAX) + smp->data.u.sint++; + return 1; + } + + /* standard subtraction: we use the "add" function and negate + * the second operand. + */ + smp->data.u.sint = arith_add(smp->data.u.sint, -tmp.data.u.sint); + return 1; +} + +/* Takes a SINT on input, applies an arithmetic "mul" with the SINT directly in + * arg_p or in the variable described in arg_p, and returns the SINT result. + * If the result makes an overflow, then the largest possible quantity is + * returned. 
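+ *
+ * (Editor's illustration of the capped behaviour below: LLONG_MAX,mul(2)
+ * returns LLONG_MAX, and LLONG_MIN,mul(-1) returns LLONG_MAX instead of
+ * trapping on the unrepresentable -LLONG_MIN.)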
+ */
+static int sample_conv_arith_mul(const struct arg *arg_p,
+                                 struct sample *smp, void *private)
+{
+    struct sample tmp;
+    long long int c;
+
+    smp_set_owner(&tmp, smp->px, smp->sess, smp->strm, smp->opt);
+    if (!sample_conv_var2smp_sint(arg_p, &tmp))
+        return 0;
+
+    /* prevent divide by 0 during the check */
+    if (!smp->data.u.sint || !tmp.data.u.sint) {
+        smp->data.u.sint = 0;
+        return 1;
+    }
+
+    /* Multiplying LLONG_MIN by -1 raises a "floating point exception". */
+    if (smp->data.u.sint == LLONG_MIN && tmp.data.u.sint == -1) {
+        smp->data.u.sint = LLONG_MAX;
+        return 1;
+    }
+
+    /* execute standard multiplication. */
+    c = smp->data.u.sint * tmp.data.u.sint;
+
+    /* check for overflow and cap the multiply. */
+    if (smp->data.u.sint != c / tmp.data.u.sint) {
+        if ((smp->data.u.sint < 0) == (tmp.data.u.sint < 0)) {
+            smp->data.u.sint = LLONG_MAX;
+            return 1;
+        }
+        smp->data.u.sint = LLONG_MIN;
+        return 1;
+    }
+    smp->data.u.sint = c;
+    return 1;
+}
+
+/* Takes a SINT on input, applies an arithmetic "div" with the SINT directly in
+ * arg_p or in the variable described in arg_p, and returns the SINT result.
+ * If arg_p makes the result overflow, then the largest possible quantity is
+ * returned.
+ */
+static int sample_conv_arith_div(const struct arg *arg_p,
+                                 struct sample *smp, void *private)
+{
+    struct sample tmp;
+
+    smp_set_owner(&tmp, smp->px, smp->sess, smp->strm, smp->opt);
+    if (!sample_conv_var2smp_sint(arg_p, &tmp))
+        return 0;
+
+    if (tmp.data.u.sint) {
+        /* Dividing LLONG_MIN by -1 raises a "floating point exception". */
+        if (smp->data.u.sint == LLONG_MIN && tmp.data.u.sint == -1) {
+            smp->data.u.sint = LLONG_MAX;
+            return 1;
+        }
+        smp->data.u.sint /= tmp.data.u.sint;
+        return 1;
+    }
+    smp->data.u.sint = LLONG_MAX;
+    return 1;
+}
+
+/* Takes a SINT on input, applies an arithmetic "mod" with the SINT directly in
+ * arg_p or in the variable described in arg_p, and returns the SINT result.
+ * If arg_p makes the result overflow, then 0 is returned.
+ */
+static int sample_conv_arith_mod(const struct arg *arg_p,
+                                 struct sample *smp, void *private)
+{
+    struct sample tmp;
+
+    smp_set_owner(&tmp, smp->px, smp->sess, smp->strm, smp->opt);
+    if (!sample_conv_var2smp_sint(arg_p, &tmp))
+        return 0;
+
+    if (tmp.data.u.sint) {
+        /* Dividing LLONG_MIN by -1 raises a "floating point exception". */
+        if (smp->data.u.sint == LLONG_MIN && tmp.data.u.sint == -1) {
+            smp->data.u.sint = 0;
+            return 1;
+        }
+        smp->data.u.sint %= tmp.data.u.sint;
+        return 1;
+    }
+    smp->data.u.sint = 0;
+    return 1;
+}
+
+/* Takes a SINT on input, applies an arithmetic "neg" and returns the SINT
+ * result.
+ */
+static int sample_conv_arith_neg(const struct arg *arg_p,
+                                 struct sample *smp, void *private)
+{
+    if (smp->data.u.sint == LLONG_MIN)
+        smp->data.u.sint = LLONG_MAX;
+    else
+        smp->data.u.sint = -smp->data.u.sint;
+    return 1;
+}
+
+/* Takes a SINT on input, returns true if the value is non-zero, otherwise
+ * false. The output is a BOOL.
+ */
+static int sample_conv_arith_bool(const struct arg *arg_p,
+                                  struct sample *smp, void *private)
+{
+    smp->data.u.sint = !!smp->data.u.sint;
+    smp->data.type = SMP_T_BOOL;
+    return 1;
+}
+
+/* Takes a SINT on input, returns false if the value is non-zero, otherwise
+ * true. The output is a BOOL.
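+ *
+ * (Editor's illustration: 0,not yields true while any non-zero input,
+ * e.g. 42,not, yields false; together with "bool" above this forms the
+ * usual truthiness pair.)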
+ */
+static int sample_conv_arith_not(const struct arg *arg_p,
+                                 struct sample *smp, void *private)
+{
+    smp->data.u.sint = !smp->data.u.sint;
+    smp->data.type = SMP_T_BOOL;
+    return 1;
+}
+
+/* Takes a SINT on input, returns true if the value is odd, otherwise false.
+ * The output is a BOOL.
+ */
+static int sample_conv_arith_odd(const struct arg *arg_p,
+                                 struct sample *smp, void *private)
+{
+    smp->data.u.sint = smp->data.u.sint & 1;
+    smp->data.type = SMP_T_BOOL;
+    return 1;
+}
+
+/* Takes a SINT on input, returns true if the value is even, otherwise false.
+ * The output is a BOOL.
+ */
+static int sample_conv_arith_even(const struct arg *arg_p,
+                                  struct sample *smp, void *private)
+{
+    smp->data.u.sint = !(smp->data.u.sint & 1);
+    smp->data.type = SMP_T_BOOL;
+    return 1;
+}
+
+/* appends an optional const string, an optional variable contents and another
+ * optional const string to an existing string.
+ */
+static int sample_conv_concat(const struct arg *arg_p, struct sample *smp, void *private)
+{
+    struct buffer *trash;
+    struct sample tmp;
+    int max;
+
+    trash = alloc_trash_chunk();
+    if (!trash)
+        return 0;
+
+    trash->data = smp->data.u.str.data;
+    if (trash->data > trash->size - 1)
+        trash->data = trash->size - 1;
+
+    memcpy(trash->area, smp->data.u.str.area, trash->data);
+    trash->area[trash->data] = 0;
+
+    /* append first string */
+    max = arg_p[0].data.str.data;
+    if (max > trash->size - 1 - trash->data)
+        max = trash->size - 1 - trash->data;
+
+    if (max) {
+        memcpy(trash->area + trash->data, arg_p[0].data.str.area, max);
+        trash->data += max;
+        trash->area[trash->data] = 0;
+    }
+
+    /* append second string (variable) if it's found and we can turn it
+     * into a string.
+     */
+    smp_set_owner(&tmp, smp->px, smp->sess, smp->strm, smp->opt);
+    if (arg_p[1].type == ARGT_VAR && vars_get_by_desc(&arg_p[1].data.var, &tmp, NULL) &&
+        (sample_casts[tmp.data.type][SMP_T_STR] == c_none ||
+         sample_casts[tmp.data.type][SMP_T_STR](&tmp))) {
+
+        max = tmp.data.u.str.data;
+        if (max > trash->size - 1 - trash->data)
+            max = trash->size - 1 - trash->data;
+
+        if (max) {
+            memcpy(trash->area + trash->data, tmp.data.u.str.area,
+                   max);
+            trash->data += max;
+            trash->area[trash->data] = 0;
+        }
+    }
+
+    /* append third string */
+    max = arg_p[2].data.str.data;
+    if (max > trash->size - 1 - trash->data)
+        max = trash->size - 1 - trash->data;
+
+    if (max) {
+        memcpy(trash->area + trash->data, arg_p[2].data.str.area, max);
+        trash->data += max;
+        trash->area[trash->data] = 0;
+    }
+
+    smp->data.u.str = *trash;
+    smp->data.type = SMP_T_STR;
+    smp_dup(smp);
+    free_trash_chunk(trash);
+    return 1;
+}
+
+/* This function checks the "concat" converter's arguments and extracts the
+ * variable name and its scope.
+ */
+static int smp_check_concat(struct arg *args, struct sample_conv *conv,
+                            const char *file, int line, char **err)
+{
+    /* Try to decode a variable. */
+    if (args[1].data.str.data > 0 && !vars_check_arg(&args[1], NULL)) {
+        memprintf(err, "failed to register variable name '%s'",
+                  args[1].data.str.area);
+        return 0;
+    }
+    return 1;
+}
+
+/* Appends a delimiter (only to a non-empty input) followed by the optional
+ * variable contents concatenated with the optional suffix.
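+ *
+ * (Editor's illustration: starting from an empty string, two successive
+ * applications of add_item(',',txn.tag) produce "a" and then "a,b" when
+ * the variable holds "a" then "b": the delimiter is only emitted once the
+ * accumulated input is non-empty.)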
+ */ +static int sample_conv_add_item(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct buffer *tmpbuf; + struct sample tmp; + size_t max; + int var_available; + + tmpbuf = alloc_trash_chunk(); + if (!tmpbuf) + return 0; + + tmpbuf->data = smp->data.u.str.data; + if (tmpbuf->data > tmpbuf->size - 1) + tmpbuf->data = tmpbuf->size - 1; + + memcpy(tmpbuf->area, smp->data.u.str.area, tmpbuf->data); + tmpbuf->area[tmpbuf->data] = 0; + + /* Check if variable is found and we can turn into a string. */ + var_available = 0; + smp_set_owner(&tmp, smp->px, smp->sess, smp->strm, smp->opt); + if (arg_p[1].type == ARGT_VAR && vars_get_by_desc(&arg_p[1].data.var, &tmp, NULL) && + (sample_casts[tmp.data.type][SMP_T_STR] == c_none || + sample_casts[tmp.data.type][SMP_T_STR](&tmp))) + var_available = 1; + + /* Append delimiter only if input is not empty and either + * the variable or the suffix are not empty + */ + if (smp->data.u.str.data && ((var_available && tmp.data.u.str.data) || + arg_p[2].data.str.data)) { + max = arg_p[0].data.str.data; + if (max > tmpbuf->size - 1 - tmpbuf->data) + max = tmpbuf->size - 1 - tmpbuf->data; + + if (max) { + memcpy(tmpbuf->area + tmpbuf->data, arg_p[0].data.str.area, max); + tmpbuf->data += max; + tmpbuf->area[tmpbuf->data] = 0; + } + } + + /* Append variable contents if variable is found and turned into string. */ + if (var_available) { + max = tmp.data.u.str.data; + if (max > tmpbuf->size - 1 - tmpbuf->data) + max = tmpbuf->size - 1 - tmpbuf->data; + + if (max) { + memcpy(tmpbuf->area + tmpbuf->data, tmp.data.u.str.area, max); + tmpbuf->data += max; + tmpbuf->area[tmpbuf->data] = 0; + } + } + + /* Append optional suffix. */ + max = arg_p[2].data.str.data; + if (max > tmpbuf->size - 1 - tmpbuf->data) + max = tmpbuf->size - 1 - tmpbuf->data; + + if (max) { + memcpy(tmpbuf->area + tmpbuf->data, arg_p[2].data.str.area, max); + tmpbuf->data += max; + tmpbuf->area[tmpbuf->data] = 0; + } + + smp->data.u.str = *tmpbuf; + smp->data.type = SMP_T_STR; + smp_dup(smp); + free_trash_chunk(tmpbuf); + return 1; +} + +/* Check the "add_item" converter's arguments and extracts the + * variable name and its scope. + */ +static int smp_check_add_item(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err) +{ + /* Try to decode a variable. */ + if (args[1].data.str.data > 0 && !vars_check_arg(&args[1], NULL)) { + memprintf(err, "failed to register variable name '%s'", + args[1].data.str.area); + return 0; + } + + if (args[1].data.str.data == 0 && args[2].data.str.data == 0) { + memprintf(err, "one of the optional arguments has to be nonempty"); + return 0; + } + + return 1; +} + +/* Compares string with a variable containing a string. Return value + * is compatible with strcmp(3)'s return value. 
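+ *
+ * (Editor's illustration: with a variable txn.ref holding "abc", an input
+ * of "abd" run through strcmp(txn.ref) yields a positive value, "abc"
+ * yields 0 and "ab" a negative one, just like strcmp(3).)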
+ */
+static int sample_conv_strcmp(const struct arg *arg_p, struct sample *smp, void *private)
+{
+    struct sample tmp;
+    int max, result;
+
+    smp_set_owner(&tmp, smp->px, smp->sess, smp->strm, smp->opt);
+    if (arg_p[0].type != ARGT_VAR)
+        return 0;
+
+    if (!sample_conv_var2smp(&arg_p[0].data.var, &tmp, SMP_T_STR))
+        return 0;
+
+    max = MIN(smp->data.u.str.data, tmp.data.u.str.data);
+    result = strncmp(smp->data.u.str.area, tmp.data.u.str.area, max);
+    if (result == 0) {
+        if (smp->data.u.str.data != tmp.data.u.str.data) {
+            if (smp->data.u.str.data < tmp.data.u.str.data) {
+                result = -1;
+            }
+            else {
+                result = 1;
+            }
+        }
+    }
+
+    smp->data.u.sint = result;
+    smp->data.type = SMP_T_SINT;
+    return 1;
+}
+
+/*
+ * This converter takes a Host header value as defined by rfc9110#section-7.2
+ *     Host = uri-host [ ":" port ] ;
+ * It returns the uri-host value in lowercase with the port stripped.
+ */
+static int sample_conv_host_only(const struct arg *arg_p, struct sample *smp, void *private)
+{
+    /* Working cases: hostname00, hostname00:80, 127.0.0.1, 127.0.0.1:80, [::1], [::1]:80 */
+    char *beg = smp->data.u.str.area;
+    char *end = smp->data.u.str.area + smp->data.u.str.data - 1;
+    char *p;
+
+    for (p = end; p >= beg; p--) {
+        if (*p == ':' || *p == ']')
+            break;
+    }
+
+    if (p >= beg && *p == ':')
+        smp->data.u.str.data = p - beg;
+    /* if no port part was found, the hostname is the whole string */
+
+    smp->data.type = SMP_T_STR;
+
+    return sample_conv_str2lower(arg_p, smp, NULL);
+}
+
+/*
+ * This converter takes a Host header value as defined by rfc9110#section-7.2
+ *     Host = uri-host [ ":" port ] ;
+ * It returns the port value as an int.
+ */
+static int sample_conv_port_only(const struct arg *arg_p, struct sample *smp, void *private)
+{
+    /* Working cases: hostname00, hostname00:80, 127.0.0.1, 127.0.0.1:80, [::1], [::1]:80 */
+    char *beg = smp->data.u.str.area;
+    char *end = smp->data.u.str.area + smp->data.u.str.data - 1;
+    char *p;
+
+    for (p = end; p >= beg; p--) {
+        if (*p == ':' || *p == ']')
+            break;
+    }
+
+    smp->data.type = SMP_T_SINT;
+    if (p >= beg && *p == ':' && ++p <= end) {
+        smp->data.u.sint = strl2ui(p, smp->data.u.str.data + smp->data.u.str.area - p);
+    } else {
+        smp->data.u.sint = 0;
+    }
+    return 1;
+}
+
+/* Takes a boolean as input. Returns the first argument if that boolean is true and
+ * the second argument otherwise.
+ */
+static int sample_conv_iif(const struct arg *arg_p, struct sample *smp, void *private)
+{
+    smp->data.type = SMP_T_STR;
+    smp->flags |= SMP_F_CONST;
+
+    if (smp->data.u.sint) {
+        smp->data.u.str.data = arg_p[0].data.str.data;
+        smp->data.u.str.area = arg_p[0].data.str.area;
+    }
+    else {
+        smp->data.u.str.data = arg_p[1].data.str.data;
+        smp->data.u.str.area = arg_p[1].data.str.area;
+    }
+
+    return 1;
+}
+
+#define GRPC_MSG_COMPRESS_FLAG_SZ 1 /* 1 byte */
+#define GRPC_MSG_LENGTH_SZ        4 /* 4 bytes */
+#define GRPC_MSG_HEADER_SZ        (GRPC_MSG_COMPRESS_FLAG_SZ + GRPC_MSG_LENGTH_SZ)
+
+/*
+ * Extract the field value of an input binary sample. Takes a mandatory argument:
+ * the protocol buffers field identifier (dotted notation) internally represented
+ * as an array of unsigned integers and its size.
+ * Return 1 if the field was found, 0 if not.
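+ *
+ * (Editor's note: the loop below walks gRPC framing, i.e. a 1-byte
+ * compression flag plus a 4-byte network-order length -- the
+ * GRPC_MSG_HEADER_SZ defined above -- followed by the protobuf payload;
+ * e.g. ungrpc(1.2,varint) would look up field 2 nested inside field 1.)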
+ */ +static int sample_conv_ungrpc(const struct arg *arg_p, struct sample *smp, void *private) +{ + unsigned char *pos; + size_t grpc_left; + + pos = (unsigned char *)smp->data.u.str.area; + grpc_left = smp->data.u.str.data; + + while (grpc_left > GRPC_MSG_HEADER_SZ) { + size_t grpc_msg_len, left; + + grpc_msg_len = left = ntohl(*(uint32_t *)(pos + GRPC_MSG_COMPRESS_FLAG_SZ)); + + pos += GRPC_MSG_HEADER_SZ; + grpc_left -= GRPC_MSG_HEADER_SZ; + + if (grpc_left < left) + return 0; + + if (protobuf_field_lookup(arg_p, smp, &pos, &left)) + return 1; + + grpc_left -= grpc_msg_len; + } + + return 0; +} + +static int sample_conv_protobuf(const struct arg *arg_p, struct sample *smp, void *private) +{ + unsigned char *pos; + size_t left; + + pos = (unsigned char *)smp->data.u.str.area; + left = smp->data.u.str.data; + + return protobuf_field_lookup(arg_p, smp, &pos, &left); +} + +static int sample_conv_protobuf_check(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err) +{ + if (!args[1].type) { + args[1].type = ARGT_SINT; + args[1].data.sint = PBUF_T_BINARY; + } + else { + int pbuf_type; + + pbuf_type = protobuf_type(args[1].data.str.area); + if (pbuf_type == -1) { + memprintf(err, "Wrong protocol buffer type '%s'", args[1].data.str.area); + return 0; + } + + chunk_destroy(&args[1].data.str); + args[1].type = ARGT_SINT; + args[1].data.sint = pbuf_type; + } + + return 1; +} + +/* + * Extract the tag value of an input binary sample. Takes a mandatory argument: + * the FIX protocol tag identifier. + * Return 1 if the tag was found, 0 if not. + */ +static int sample_conv_fix_tag_value(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct ist value; + + smp->flags &= ~SMP_F_MAY_CHANGE; + value = fix_tag_value(ist2(smp->data.u.str.area, smp->data.u.str.data), + arg_p[0].data.sint); + if (!istlen(value)) { + if (isttest(value)) { + /* value != IST_NULL, need more data */ + smp->flags |= SMP_F_MAY_CHANGE; + } + return 0; + } + + smp->data.u.str = ist2buf(value); + smp->flags |= SMP_F_CONST; + + return 1; +} + +/* This function checks the "fix_tag_value" converter configuration. + * It expects a "known" (by HAProxy) tag name or ID. + * Tag string names are converted to their ID counterpart because this is the + * format they are sent over the wire. + */ +static int sample_conv_fix_value_check(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err) +{ + struct ist str; + unsigned int tag; + + str = ist2(args[0].data.str.area, args[0].data.str.data); + tag = fix_tagid(str); + if (!tag) { + memprintf(err, "Unknown FIX tag name '%s'", args[0].data.str.area); + return 0; + } + + chunk_destroy(&args[0].data.str); + args[0].type = ARGT_SINT; + args[0].data.sint = tag; + + return 1; +} + +/* + * Checks that a buffer contains a valid FIX message + * + * Return 1 if the check could be run, 0 if not. 
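+ * (Editor's illustration: a FIX message is a SOH-separated sequence of
+ * tag=value pairs such as "8=FIX.4.4|9=..|35=A|..|10=ccc" with '|'
+ * standing for 0x01; the message is only reported valid when the framing
+ * and checksum are consistent.)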
+ * The result of the analyse itself is stored in <smp> as a boolean + */ +static int sample_conv_fix_is_valid(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct ist msg; + + msg = ist2(smp->data.u.str.area, smp->data.u.str.data); + + smp->flags &= ~SMP_F_MAY_CHANGE; + switch (fix_validate_message(msg)) { + case FIX_VALID_MESSAGE: + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = 1; + return 1; + case FIX_NEED_MORE_DATA: + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + case FIX_INVALID_MESSAGE: + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = 0; + return 1; + } + return 0; +} + +/* + * Extract the field value of an input binary sample containing an MQTT packet. + * Takes 2 mandatory arguments: + * - packet type + * - field name + * + * return 1 if the field was found, 0 if not. + */ +static int sample_conv_mqtt_field_value(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct ist pkt, value; + int type, fieldname_id; + + pkt = ist2(smp->data.u.str.area, smp->data.u.str.data); + type = arg_p[0].data.sint; + fieldname_id = arg_p[1].data.sint; + + smp->flags &= ~SMP_F_MAY_CHANGE; + value = mqtt_field_value(pkt, type, fieldname_id); + if (!istlen(value)) { + if (isttest(value)) { + /* value != IST_NULL, need more data */ + smp->flags |= SMP_F_MAY_CHANGE; + } + return 0; + } + + smp->data.u.str = ist2buf(value); + smp->flags |= SMP_F_CONST; + return 1; +} + +/* + * this function checks the "mqtt_field_value" converter configuration. + * It expects a known packet type name or ID and a field name, in this order + * + * Args[0] will be turned into a MQTT_CPT_* value for direct matching when parsing + * a packet. + */ +static int sample_conv_mqtt_field_value_check(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err) +{ + int type, fieldname_id; + + /* check the MQTT packet type is valid */ + type = mqtt_typeid(ist2(args[0].data.str.area, args[0].data.str.data)); + if (type == MQTT_CPT_INVALID) { + memprintf(err, "Unknown MQTT type '%s'", args[0].data.str.area); + return 0; + } + + /* check the field name belongs to the MQTT packet type */ + fieldname_id = mqtt_check_type_fieldname(type, ist2(args[1].data.str.area, args[1].data.str.data)); + if (fieldname_id == MQTT_FN_INVALID) { + memprintf(err, "Unknown MQTT field name '%s' for packet type '%s'", args[1].data.str.area, + args[0].data.str.area); + return 0; + } + + /* save numeric counterparts of type and field name */ + chunk_destroy(&args[0].data.str); + chunk_destroy(&args[1].data.str); + args[0].type = ARGT_SINT; + args[0].data.sint = type; + args[1].type = ARGT_SINT; + args[1].data.sint = fieldname_id; + + return 1; +} + +/* + * Checks that <smp> contains a valid MQTT message + * + * The function returns 1 if the check was run to its end, 0 otherwise. + * The result of the analyse itself is stored in <smp> as a boolean. + */ +static int sample_conv_mqtt_is_valid(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct ist msg; + + msg = ist2(smp->data.u.str.area, smp->data.u.str.data); + + smp->flags &= ~SMP_F_MAY_CHANGE; + switch (mqtt_validate_message(msg, NULL)) { + case FIX_VALID_MESSAGE: + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = 1; + return 1; + case FIX_NEED_MORE_DATA: + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + case FIX_INVALID_MESSAGE: + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = 0; + return 1; + } + return 0; +} + +/* This function checks the "strcmp" converter's arguments and extracts the + * variable name and its scope. 
+ */
+static int smp_check_strcmp(struct arg *args, struct sample_conv *conv,
+                            const char *file, int line, char **err)
+{
+    if (!args[0].data.str.data) {
+        memprintf(err, "missing variable name");
+        return 0;
+    }
+
+    /* Try to decode a variable. */
+    if (vars_check_arg(&args[0], NULL))
+        return 1;
+
+    memprintf(err, "failed to register variable name '%s'",
+              args[0].data.str.area);
+    return 0;
+}
+
+/* Takes a SINT on input and returns its 32 low-order bits in network byte
+ * order as a BIN sample.
+ */
+static int sample_conv_htonl(const struct arg *arg_p, struct sample *smp, void *private)
+{
+    struct buffer *tmp;
+    uint32_t n;
+
+    n = htonl((uint32_t)smp->data.u.sint);
+    tmp = get_trash_chunk();
+
+    memcpy(b_head(tmp), &n, 4);
+    b_add(tmp, 4);
+
+    smp->data.u.str = *tmp;
+    smp->data.type = SMP_T_BIN;
+    return 1;
+}
+
+/* Truncates the input string at the first CR ('\r') or LF ('\n') found. */
+static int sample_conv_cut_crlf(const struct arg *arg_p, struct sample *smp, void *private)
+{
+    char *p;
+    size_t l;
+
+    p = smp->data.u.str.area;
+    for (l = 0; l < smp->data.u.str.data; l++) {
+        if (*(p+l) == '\r' || *(p+l) == '\n')
+            break;
+    }
+    smp->data.u.str.data = l;
+    return 1;
+}
+
+/* Strips all leading characters listed in arg_p[0] from the input string. */
+static int sample_conv_ltrim(const struct arg *arg_p, struct sample *smp, void *private)
+{
+    char *delimiters, *p;
+    size_t dlen, l;
+
+    delimiters = arg_p[0].data.str.area;
+    dlen = arg_p[0].data.str.data;
+
+    l = smp->data.u.str.data;
+    p = smp->data.u.str.area;
+    while (l && memchr(delimiters, *p, dlen) != NULL) {
+        p++;
+        l--;
+    }
+
+    smp->data.u.str.area = p;
+    smp->data.u.str.data = l;
+    return 1;
+}
+
+/* Strips all trailing characters listed in arg_p[0] from the input string. */
+static int sample_conv_rtrim(const struct arg *arg_p, struct sample *smp, void *private)
+{
+    char *delimiters, *p;
+    size_t dlen, l;
+
+    delimiters = arg_p[0].data.str.area;
+    dlen = arg_p[0].data.str.data;
+
+    l = smp->data.u.str.data;
+    p = smp->data.u.str.area + l - 1;
+    while (l && memchr(delimiters, *p, dlen) != NULL) {
+        p--;
+        l--;
+    }
+
+    smp->data.u.str.data = l;
+    return 1;
+}
+
+/* This function checks the "json_query" converter's arguments. */
+static int sample_check_json_query(struct arg *arg, struct sample_conv *conv,
+                                   const char *file, int line, char **err)
+{
+    if (arg[0].data.str.data == 0) {
+        memprintf(err, "json_path must not be empty");
+        return 0;
+    }
+
+    if (arg[1].data.str.data != 0) {
+        if (strcmp(arg[1].data.str.area, "int") != 0) {
+            memprintf(err, "output_type only supports \"int\" as argument");
+            return 0;
+        } else {
+            arg[1].type = ARGT_SINT;
+            arg[1].data.sint = 0;
+        }
+    }
+    return 1;
+}
+
+/* Limit JSON integer values to the range [-(2**53)+1, (2**53)-1] as per
+ * the recommendation for interoperable integers in section 6 of RFC 7159.
+ */
+#define JSON_INT_MAX ((1LL << 53) - 1)
+#define JSON_INT_MIN (-JSON_INT_MAX)
+
+/* This sample function gets a value from a given JSON string.
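+ * (Editor's illustration: against the input '{"user":{"id":42}}',
+ * json_query('$.user.id','int') returns the integer 42, while
+ * json_query('$.user') returns nothing since object tokens are
+ * rejected below.)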
+ * The mjson library is used to parse the JSON struct + */ +static int sample_conv_json_query(const struct arg *args, struct sample *smp, void *private) +{ + struct buffer *trash = get_trash_chunk(); + const char *token; /* holds the temporary string from mjson_find */ + int token_size; /* holds the length of <token> */ + + enum mjson_tok token_type; + + token_type = mjson_find(smp->data.u.str.area, smp->data.u.str.data, args[0].data.str.area, &token, &token_size); + + switch (token_type) { + case MJSON_TOK_NUMBER: + if (args[1].type == ARGT_SINT) { + smp->data.u.sint = strtoll(token, NULL, 0); + + if (smp->data.u.sint < JSON_INT_MIN || smp->data.u.sint > JSON_INT_MAX) + return 0; + + smp->data.type = SMP_T_SINT; + + return 1; + } else { + double double_val; + + if (mjson_get_number(smp->data.u.str.area, smp->data.u.str.data, args[0].data.str.area, &double_val) == 0) + return 0; + + trash->data = snprintf(trash->area,trash->size,"%g",double_val); + smp->data.u.str = *trash; + smp->data.type = SMP_T_STR; + + return 1; + } + case MJSON_TOK_TRUE: + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = 1; + + return 1; + case MJSON_TOK_FALSE: + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = 0; + + return 1; + case MJSON_TOK_STRING: { + int len; + + len = mjson_get_string(smp->data.u.str.area, smp->data.u.str.data, args[0].data.str.area, trash->area, trash->size); + + if (len == -1) { + /* invalid string */ + return 0; + } + + trash->data = len; + smp->data.u.str = *trash; + smp->data.type = SMP_T_STR; + + return 1; + } + case MJSON_TOK_ARRAY: { + // We copy the complete array, including square brackets into the return buffer + // result looks like: ["manage-account","manage-account-links","view-profile"] + trash->data = b_putblk(trash, token, token_size); + smp->data.u.str = *trash; + smp->data.type = SMP_T_STR; + return 1; + } + case MJSON_TOK_NULL: + case MJSON_TOK_OBJECT: + /* We cannot handle these. */ + return 0; + case MJSON_TOK_INVALID: + /* Nothing matches the query. */ + return 0; + case MJSON_TOK_KEY: + /* This is not a valid return value according to the + * mjson documentation, but we handle it to benefit + * from '-Wswitch'. 
+ */ + return 0; + } + + my_unreachable(); + return 0; +} + +#ifdef USE_OPENSSL +static int sample_conv_jwt_verify_check(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err) +{ + vars_check_arg(&args[0], NULL); + vars_check_arg(&args[1], NULL); + + if (args[0].type == ARGT_STR) { + enum jwt_alg alg = jwt_parse_alg(args[0].data.str.area, args[0].data.str.data); + + if (alg == JWT_ALG_DEFAULT) { + memprintf(err, "unknown JWT algorithm: %s", args[0].data.str.area); + return 0; + } + } + + if (args[1].type == ARGT_STR) { + jwt_tree_load_cert(args[1].data.str.area, args[1].data.str.data, err); + } + + return 1; +} + +/* Check that a JWT's signature is correct */ +static int sample_conv_jwt_verify(const struct arg *args, struct sample *smp, void *private) +{ + struct sample alg_smp, key_smp; + enum jwt_vrfy_status ret; + + smp_set_owner(&alg_smp, smp->px, smp->sess, smp->strm, smp->opt); + smp_set_owner(&key_smp, smp->px, smp->sess, smp->strm, smp->opt); + if (!sample_conv_var2smp_str(&args[0], &alg_smp)) + return 0; + if (!sample_conv_var2smp_str(&args[1], &key_smp)) + return 0; + + ret = jwt_verify(&smp->data.u.str, &alg_smp.data.u.str, &key_smp.data.u.str); + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = ret; + return 1; +} + + +/* + * Returns the decoded header or payload of a JWT if no parameter is given, or + * the value of the specified field of the corresponding JWT subpart if a + * parameter is given. + */ +static int sample_conv_jwt_member_query(const struct arg *args, struct sample *smp, + void *private, enum jwt_elt member) +{ + struct jwt_item items[JWT_ELT_MAX] = { { 0 } }; + unsigned int item_num = member + 1; /* We don't need to tokenize the full token */ + struct buffer *decoded_header = get_trash_chunk(); + int retval = 0; + int ret; + + jwt_tokenize(&smp->data.u.str, items, &item_num); + + if (item_num < member + 1) + goto end; + + ret = base64urldec(items[member].start, items[member].length, + decoded_header->area, decoded_header->size); + if (ret == -1) + goto end; + + decoded_header->data = ret; + if (args[0].type != ARGT_STR) { + smp->data.u.str = *decoded_header; + smp->data.type = SMP_T_STR; + goto end; + } + + /* We look for a specific field of the header or payload part of the JWT */ + smp->data.u.str = *decoded_header; + + retval = sample_conv_json_query(args, smp, private); + +end: + return retval; +} + +/* This function checks the "jwt_header_query" and "jwt_payload_query" converters' arguments. + * It is based on the "json_query" converter's check with the only difference + * being that the jwt converters can take 0 parameters as well. + */ +static int sample_conv_jwt_query_check(struct arg *arg, struct sample_conv *conv, + const char *file, int line, char **err) +{ + if (arg[1].data.str.data != 0) { + if (strcmp(arg[1].data.str.area, "int") != 0) { + memprintf(err, "output_type only supports \"int\" as argument"); + return 0; + } else { + arg[1].type = ARGT_SINT; + arg[1].data.sint = 0; + } + } + return 1; +} + +/* + * If no parameter is given, return the decoded header part of a JWT (the first + * base64 encoded part, corresponding to the JOSE header). + * If a parameter is given, this converter acts as a "json_query" on this + * decoded JSON. 
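+ *
+ * (Editor's illustration, assuming the documented converter syntax:
+ *  http-request set-var(txn.iss) req.hdr(authorization),word(2,' '),jwt_payload_query('$.iss')
+ *  would extract the "iss" claim of a Bearer token.)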
+ */ +static int sample_conv_jwt_header_query(const struct arg *args, struct sample *smp, void *private) +{ + return sample_conv_jwt_member_query(args, smp, private, JWT_ELT_JOSE); +} + +/* + * If no parameter is given, return the decoded payload part of a JWT (the + * second base64 encoded part, which contains all the claims). If a parameter + * is given, this converter acts as a "json_query" on this decoded JSON. + */ +static int sample_conv_jwt_payload_query(const struct arg *args, struct sample *smp, void *private) +{ + return sample_conv_jwt_member_query(args, smp, private, JWT_ELT_CLAIMS); +} + +#endif /* USE_OPENSSL */ + +/************************************************************************/ +/* All supported sample fetch functions must be declared here */ +/************************************************************************/ + + +/* returns the actconn */ +static int +smp_fetch_actconn(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->data.type = SMP_T_SINT; + smp->data.u.sint = actconn; + return 1; +} + + +/* force TRUE to be returned at the fetch level */ +static int +smp_fetch_true(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + if (!smp_make_rw(smp)) + return 0; + + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = 1; + return 1; +} + +/* force FALSE to be returned at the fetch level */ +static int +smp_fetch_false(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = 0; + return 1; +} + +/* retrieve environment variable $1 as a string */ +static int +smp_fetch_env(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + char *env; + + if (args[0].type != ARGT_STR) + return 0; + + env = getenv(args[0].data.str.area); + if (!env) + return 0; + + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_CONST; + smp->data.u.str.area = env; + smp->data.u.str.data = strlen(env); + return 1; +} + +/* Validates the data unit argument passed to "date" fetch. Argument 1 support an + * optional string representing the unit of the result: "s" for seconds, "ms" for + * milliseconds and "us" for microseconds. + * Returns 0 on error and non-zero if OK. + */ +int smp_check_date_unit(struct arg *args, char **err) +{ + if (args[1].type == ARGT_STR) { + long long int unit; + + if (strcmp(args[1].data.str.area, "s") == 0) { + unit = TIME_UNIT_S; + } + else if (strcmp(args[1].data.str.area, "ms") == 0) { + unit = TIME_UNIT_MS; + } + else if (strcmp(args[1].data.str.area, "us") == 0) { + unit = TIME_UNIT_US; + } + else { + memprintf(err, "expects 's', 'ms' or 'us', got '%s'", + args[1].data.str.area); + return 0; + } + + chunk_destroy(&args[1].data.str); + args[1].type = ARGT_SINT; + args[1].data.sint = unit; + } + else if (args[1].type != ARGT_STOP) { + memprintf(err, "Unexpected arg type"); + return 0; + } + + return 1; +} + +/* retrieve the current local date in epoch time, converts it to milliseconds + * or microseconds if asked to in optional args[1] unit param, and applies an + * optional args[0] offset. 
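+ *
+ * (Editor's illustration: date returns the current epoch second,
+ * date(3600) the same instant plus one hour, and with the "ms" or "us"
+ * unit argument the value is scaled accordingly, as validated above.)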
+ */
+static int
+smp_fetch_date(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+    smp->data.u.sint = date.tv_sec;
+
+    /* report in milliseconds */
+    if (args[1].type == ARGT_SINT && args[1].data.sint == TIME_UNIT_MS) {
+        smp->data.u.sint *= 1000;
+        smp->data.u.sint += date.tv_usec / 1000;
+    }
+    /* report in microseconds */
+    else if (args[1].type == ARGT_SINT && args[1].data.sint == TIME_UNIT_US) {
+        smp->data.u.sint *= 1000000;
+        smp->data.u.sint += date.tv_usec;
+    }
+
+    /* add offset */
+    if (args[0].type == ARGT_SINT)
+        smp->data.u.sint += args[0].data.sint;
+
+    smp->data.type = SMP_T_SINT;
+    smp->flags |= SMP_F_VOL_TEST | SMP_F_MAY_CHANGE;
+    return 1;
+}
+
+/* retrieve the current microsecond part of the date */
+static int
+smp_fetch_date_us(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+    smp->data.u.sint = date.tv_usec;
+    smp->data.type = SMP_T_SINT;
+    smp->flags |= SMP_F_VOL_TEST | SMP_F_MAY_CHANGE;
+    return 1;
+}
+
+/* returns the hostname */
+static int
+smp_fetch_hostname(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+    smp->data.type = SMP_T_STR;
+    smp->flags = SMP_F_CONST;
+    smp->data.u.str.area = hostname;
+    smp->data.u.str.data = strlen(hostname);
+    return 1;
+}
+
+/* returns the number of processes */
+static int
+smp_fetch_nbproc(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+    smp->data.type = SMP_T_SINT;
+    smp->data.u.sint = 1;
+    return 1;
+}
+
+/* returns the PID of the current process */
+static int
+smp_fetch_pid(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+    smp->data.type = SMP_T_SINT;
+    smp->data.u.sint = pid;
+    return 1;
+}
+
+/* returns the number of the current process (between 1 and nbproc) */
+static int
+smp_fetch_proc(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+    smp->data.type = SMP_T_SINT;
+    smp->data.u.sint = 1;
+    return 1;
+}
+
+/* returns the number of the current thread (between 0 and nbthread-1) */
+static int
+smp_fetch_thread(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+    smp->data.type = SMP_T_SINT;
+    smp->data.u.sint = tid;
+    return 1;
+}
+
+/* generate a random 32-bit integer for whatever purpose, with an optional
+ * range specified in argument.
+ */
+static int
+smp_fetch_rand(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+    smp->data.u.sint = statistical_prng();
+
+    /* reduce if needed. Don't do a modulo, use all bits!
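+     * (editor's note: ((u64)r * N) >> 32 keeps the high-order bits of the
+     * product so the whole 32-bit state contributes to the result in
+     * [0, N-1], whereas a modulo would only keep the low-order bits)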
*/ + if (args[0].type == ARGT_SINT) + smp->data.u.sint = ((u64)smp->data.u.sint * (u64)args[0].data.sint) >> 32; + + smp->data.type = SMP_T_SINT; + smp->flags |= SMP_F_VOL_TEST | SMP_F_MAY_CHANGE; + return 1; +} + +/* returns true if the current process is stopping */ +static int +smp_fetch_stopping(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = stopping; + return 1; +} + +/* returns the number of calls of the current stream's process_stream() */ +static int +smp_fetch_cpu_calls(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + if (!smp->strm) + return 0; + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = smp->strm->task->calls; + return 1; +} + +/* returns the average number of nanoseconds spent processing the stream per call */ +static int +smp_fetch_cpu_ns_avg(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + if (!smp->strm) + return 0; + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = smp->strm->task->calls ? smp->strm->cpu_time / smp->strm->task->calls : 0; + return 1; +} + +/* returns the total number of nanoseconds spent processing the stream */ +static int +smp_fetch_cpu_ns_tot(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + if (!smp->strm) + return 0; + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = smp->strm->cpu_time; + return 1; +} + +/* returns the average number of nanoseconds per call spent waiting for other tasks to be processed */ +static int +smp_fetch_lat_ns_avg(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + if (!smp->strm) + return 0; + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = smp->strm->task->calls ? smp->strm->lat_time / smp->strm->task->calls : 0; + return 1; +} + +/* returns the total number of nanoseconds per call spent waiting for other tasks to be processed */ +static int +smp_fetch_lat_ns_tot(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + if (!smp->strm) + return 0; + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = smp->strm->lat_time; + return 1; +} + +static int smp_fetch_const_str(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->flags |= SMP_F_CONST; + smp->data.type = SMP_T_STR; + smp->data.u.str.area = args[0].data.str.area; + smp->data.u.str.data = args[0].data.str.data; + return 1; +} + +static int smp_check_const_bool(struct arg *args, char **err) +{ + if (strcasecmp(args[0].data.str.area, "true") == 0 || + strcasecmp(args[0].data.str.area, "1") == 0) { + chunk_destroy(&args[0].data.str); + args[0].type = ARGT_SINT; + args[0].data.sint = 1; + return 1; + } + if (strcasecmp(args[0].data.str.area, "false") == 0 || + strcasecmp(args[0].data.str.area, "0") == 0) { + chunk_destroy(&args[0].data.str); + args[0].type = ARGT_SINT; + args[0].data.sint = 0; + return 1; + } + memprintf(err, "Expects 'true', 'false', '0' or '1'"); + return 0; +} + +static int smp_fetch_const_bool(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = args[0].data.sint; + return 1; +} + +static int smp_fetch_const_int(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->data.type = SMP_T_SINT; + smp->data.u.sint = args[0].data.sint; + return 1; +} + +static int smp_fetch_const_ipv4(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->data.type = 
SMP_T_IPV4; + smp->data.u.ipv4 = args[0].data.ipv4; + return 1; +} + +static int smp_fetch_const_ipv6(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->data.type = SMP_T_IPV6; + smp->data.u.ipv6 = args[0].data.ipv6; + return 1; +} + +static int smp_check_const_bin(struct arg *args, char **err) +{ + char *binstr = NULL; + int binstrlen; + + if (!parse_binary(args[0].data.str.area, &binstr, &binstrlen, err)) + return 0; + chunk_destroy(&args[0].data.str); + args[0].type = ARGT_STR; + args[0].data.str.area = binstr; + args[0].data.str.data = binstrlen; + return 1; +} + +static int smp_fetch_const_bin(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->flags |= SMP_F_CONST; + smp->data.type = SMP_T_BIN; + smp->data.u.str.area = args[0].data.str.area; + smp->data.u.str.data = args[0].data.str.data; + return 1; +} + +static int smp_check_const_meth(struct arg *args, char **err) +{ + enum http_meth_t meth; + int i; + + meth = find_http_meth(args[0].data.str.area, args[0].data.str.data); + if (meth != HTTP_METH_OTHER) { + chunk_destroy(&args[0].data.str); + args[0].type = ARGT_SINT; + args[0].data.sint = meth; + } else { + /* Check method availability. A method is a token defined as : + * tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." / + * "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA + * token = 1*tchar + */ + for (i = 0; i < args[0].data.str.data; i++) { + if (!HTTP_IS_TOKEN(args[0].data.str.area[i])) { + memprintf(err, "expects a valid method."); + return 0; + } + } + } + return 1; +} + +static int smp_fetch_const_meth(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->data.type = SMP_T_METH; + if (args[0].type == ARGT_SINT) { + smp->flags &= ~SMP_F_CONST; + smp->data.u.meth.meth = args[0].data.sint; + smp->data.u.meth.str.area = ""; + smp->data.u.meth.str.data = 0; + } else { + smp->flags |= SMP_F_CONST; + smp->data.u.meth.meth = HTTP_METH_OTHER; + smp->data.u.meth.str.area = args[0].data.str.area; + smp->data.u.meth.str.data = args[0].data.str.data; + } + return 1; +} + +// This function checks the "uuid" sample's arguments. +// It won't get called when no parameter is specified (maybe a bug?) +static int smp_check_uuid(struct arg *args, char **err) +{ + if (!args[0].type) { + args[0].type = ARGT_SINT; + args[0].data.sint = 4; + } + else if (args[0].data.sint != 4) { + memprintf(err, "Unsupported UUID version: '%lld'", args[0].data.sint); + return 0; + } + + return 1; +} + +// Generate an RFC4122 UUID (default is v4 = fully random) +static int smp_fetch_uuid(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + if (args[0].data.sint == 4 || !args[0].type) { + ha_generate_uuid(&trash); + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_VOL_TEST | SMP_F_MAY_CHANGE; + smp->data.u.str = trash; + return 1; + } + + // more implementations of other uuid formats possible here + return 0; +} + +/* Check if QUIC support was compiled and was not disabled by "no-quic" global option */ +static int smp_fetch_quic_enabled(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->data.type = SMP_T_BOOL; + smp->flags = 0; +#ifdef USE_QUIC + smp->data.u.sint = !(global.tune.options & GTUNE_NO_QUIC); +#else + smp->data.u.sint = 0; +#endif + return smp->data.u.sint; +} + +/* Timing events re{q,s}.timer. 
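+ * A single fetch function serves both families: kw[2] ('q' or 's') selects + * the request or response side, and kw[10], the first character after the + * 10-character "re?.timer." prefix, selects the individual timer; the + * matching log-format field (%Ti, %Tq, ...) is recalled next to each case + * below.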
*/ +static int smp_fetch_reX_timers(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct strm_logs *logs; + int t_request = -1; + + if (!smp->strm) + return 0; + + smp->data.type = SMP_T_SINT; + smp->flags = 0; + + logs = &smp->strm->logs; + + if ((llong)(logs->request_ts - logs->accept_ts) >= 0) + t_request = ns_to_ms(logs->request_ts - logs->accept_ts); + + /* req.timer. */ + if (kw[2] == 'q') { + + switch (kw[10]) { + + /* req.timer.idle (%Ti) */ + case 'i': + smp->data.u.sint = logs->t_idle; + break; + + /* req.timer.tq (%Tq) */ + case 't': + smp->data.u.sint = t_request; + break; + + /* req.timer.hdr (%TR) */ + case 'h': + smp->data.u.sint = (t_request >= 0) ? t_request - logs->t_idle - logs->t_handshake : -1; + break; + + /* req.timer.queue (%Tw) */ + case 'q': + smp->data.u.sint = (logs->t_queue >= 0) ? logs->t_queue - t_request : -1; + break; + + default: + goto error; + + } + } else { + /* res.timer. */ + switch (kw[10]) { + /* res.timer.hdr (%Tr) */ + case 'h': + smp->data.u.sint = (logs->t_data >= 0) ? logs->t_data - logs->t_connect : -1; + break; + + /* res.timer.data (%Td) */ + case 'd': + smp->data.u.sint = (logs->t_data >= 0) ? logs->t_close - logs->t_data : -1; + break; + + default: + goto error; + + } + + } + + return 1; +error: + + return 0; +} + + +/* Timing events txn. */ +static int smp_fetch_txn_timers(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct strm_logs *logs; + + if (!smp->strm) + return 0; + + smp->data.type = SMP_T_SINT; + smp->flags = 0; + + logs = &smp->strm->logs; + + /* txn.timer. */ + switch (kw[10]) { + + /* txn.timer.total (%Ta) */ + case 't': + smp->data.u.sint = logs->t_close - (logs->t_idle >= 0 ? logs->t_idle + logs->t_handshake : 0); + break; + + /* txn.timer.user (%Tu) */ + case 'u': + smp->data.u.sint = logs->t_close - (logs->t_idle >= 0 ? logs->t_idle : 0); + break; + + default: + goto error; + + } + + return 1; +error: + + return 0; +} + +/* Timing events {f,b}c.timer. */ +static int smp_fetch_conn_timers(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct strm_logs *logs; + + if (!smp->strm) + return 0; + + smp->data.type = SMP_T_SINT; + smp->flags = 0; + + logs = &smp->strm->logs; + + if (kw[0] == 'b') { + /* bc.timer. */ + switch (kw[9]) { + + /* bc.timer.connect (%Tc) */ + case 'c': + smp->data.u.sint = (logs->t_connect >= 0) ? logs->t_connect - logs->t_queue : -1; + break; + + default: + goto error; + } + + } else { + + /* fc.timer. 
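+ * Here kw[9] is the first character after "fc.timer." / "bc.timer.", both + * prefixes being 9 characters long.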
*/ + switch (kw[9]) { + + /* fc.timer.handshake (%Th) */ + case 'h': + smp->data.u.sint = logs->t_handshake; + break; + + /* fc.timer.total (%Tt) */ + case 't': + smp->data.u.sint = logs->t_close; + break; + + default: + goto error; + } + + } + + return 1; +error: + + return 0; +} + +/* bytes_{in,out} */ +static int smp_fetch_bytes(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct strm_logs *logs; + + if (!smp->strm) + return 0; + + smp->data.type = SMP_T_SINT; + smp->flags = 0; + + logs = &smp->strm->logs; + if (!logs) + return 0; + + if (kw[6] == 'i') { /* bytes_in */ + smp->data.u.sint = logs->bytes_in; + } else { /* bytes_out */ + smp->data.u.sint = logs->bytes_out; + } + + return 1; +} + +static int sample_conv_bytes_check(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err) +{ + // arg0 is not optional, must be >= 0 + if (!check_operator(&args[0], conv, file, line, err)) { + return 0; + } + if (args[0].type != ARGT_VAR) { + if (args[0].type != ARGT_SINT || args[0].data.sint < 0) { + memprintf(err, "expects a non-negative integer"); + return 0; + } + } + // arg1 is optional, must be > 0 + if (args[1].type != ARGT_STOP) { + if (!check_operator(&args[1], conv, file, line, err)) { + return 0; + } + if (args[1].type != ARGT_VAR) { + if (args[1].type != ARGT_SINT || args[1].data.sint <= 0) { + memprintf(err, "expects a positive integer"); + return 0; + } + } + } + + return 1; +} + +static struct sample_fetch_kw_list smp_logs_kws = {ILH, { + { "bytes_in", smp_fetch_bytes, 0, NULL, SMP_T_SINT, SMP_USE_INTRN }, + { "bytes_out", smp_fetch_bytes, 0, NULL, SMP_T_SINT, SMP_USE_INTRN }, + + { "txn.timer.total", smp_fetch_txn_timers, 0, NULL, SMP_T_SINT, SMP_USE_TXFIN }, /* "Ta" */ + { "txn.timer.user", smp_fetch_txn_timers, 0, NULL, SMP_T_SINT, SMP_USE_TXFIN }, /* "Tu" */ + + { "bc.timer.connect", smp_fetch_conn_timers, 0, NULL, SMP_T_SINT, SMP_USE_L4SRV }, /* "Tc" */ + { "fc.timer.handshake", smp_fetch_conn_timers, 0, NULL, SMP_T_SINT, SMP_USE_L4CLI }, /* "Th" */ + { "fc.timer.total", smp_fetch_conn_timers, 0, NULL, SMP_T_SINT, SMP_USE_SSFIN }, /* "Tt" */ + + { "req.timer.idle", smp_fetch_reX_timers, 0, NULL, SMP_T_SINT, SMP_USE_HRQHV }, /* "Ti" */ + { "req.timer.tq", smp_fetch_reX_timers, 0, NULL, SMP_T_SINT, SMP_USE_HRQHV }, /* "Tq" */ + { "req.timer.hdr", smp_fetch_reX_timers, 0, NULL, SMP_T_SINT, SMP_USE_HRQHV }, /* "TR" */ + { "req.timer.queue", smp_fetch_reX_timers, 0, NULL, SMP_T_SINT, SMP_USE_L4SRV }, /* "Tw" */ + { "res.timer.data", smp_fetch_reX_timers, 0, NULL, SMP_T_SINT, SMP_USE_RSFIN }, /* "Td" */ + { "res.timer.hdr", smp_fetch_reX_timers, 0, NULL, SMP_T_SINT, SMP_USE_HRSHV }, /* "Tr" */ + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &smp_logs_kws); + +/* Note: must not be declared <const> as its list will be overwritten. + * Note: fetches that may return multiple types should be declared using the + * appropriate pseudo-type. If not available it must be declared as the lowest + * common denominator, the type that can be cast into all other ones. 
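+ * + * Illustrative configuration usage of a few of the fetches registered below + * (assumed examples, not part of this file): + * http-request set-header X-Now %[date(0,ms)] + * http-request set-header X-Draw %[rand(100)] + * http-request set-header X-Node %[hostname]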
+ */ +static struct sample_fetch_kw_list smp_kws = {ILH, { + { "act_conn", smp_fetch_actconn, 0, NULL, SMP_T_SINT, SMP_USE_CONST }, + { "always_false", smp_fetch_false, 0, NULL, SMP_T_BOOL, SMP_USE_CONST }, + { "always_true", smp_fetch_true, 0, NULL, SMP_T_BOOL, SMP_USE_CONST }, + { "env", smp_fetch_env, ARG1(1,STR), NULL, SMP_T_STR, SMP_USE_CONST }, + { "date", smp_fetch_date, ARG2(0,SINT,STR), smp_check_date_unit, SMP_T_SINT, SMP_USE_CONST }, + { "date_us", smp_fetch_date_us, 0, NULL, SMP_T_SINT, SMP_USE_CONST }, + { "hostname", smp_fetch_hostname, 0, NULL, SMP_T_STR, SMP_USE_CONST }, + { "nbproc", smp_fetch_nbproc,0, NULL, SMP_T_SINT, SMP_USE_CONST }, + { "pid", smp_fetch_pid, 0, NULL, SMP_T_SINT, SMP_USE_CONST }, + { "proc", smp_fetch_proc, 0, NULL, SMP_T_SINT, SMP_USE_CONST }, + { "quic_enabled", smp_fetch_quic_enabled, 0, NULL, SMP_T_BOOL, SMP_USE_CONST }, + { "thread", smp_fetch_thread, 0, NULL, SMP_T_SINT, SMP_USE_CONST }, + { "rand", smp_fetch_rand, ARG1(0,SINT), NULL, SMP_T_SINT, SMP_USE_CONST }, + { "stopping", smp_fetch_stopping, 0, NULL, SMP_T_BOOL, SMP_USE_INTRN }, + { "uuid", smp_fetch_uuid, ARG1(0, SINT), smp_check_uuid, SMP_T_STR, SMP_USE_CONST }, + + { "cpu_calls", smp_fetch_cpu_calls, 0, NULL, SMP_T_SINT, SMP_USE_INTRN }, + { "cpu_ns_avg", smp_fetch_cpu_ns_avg, 0, NULL, SMP_T_SINT, SMP_USE_INTRN }, + { "cpu_ns_tot", smp_fetch_cpu_ns_tot, 0, NULL, SMP_T_SINT, SMP_USE_INTRN }, + { "lat_ns_avg", smp_fetch_lat_ns_avg, 0, NULL, SMP_T_SINT, SMP_USE_INTRN }, + { "lat_ns_tot", smp_fetch_lat_ns_tot, 0, NULL, SMP_T_SINT, SMP_USE_INTRN }, + + { "str", smp_fetch_const_str, ARG1(1,STR), NULL , SMP_T_STR, SMP_USE_CONST }, + { "bool", smp_fetch_const_bool, ARG1(1,STR), smp_check_const_bool, SMP_T_BOOL, SMP_USE_CONST }, + { "int", smp_fetch_const_int, ARG1(1,SINT), NULL , SMP_T_SINT, SMP_USE_CONST }, + { "ipv4", smp_fetch_const_ipv4, ARG1(1,IPV4), NULL , SMP_T_IPV4, SMP_USE_CONST }, + { "ipv6", smp_fetch_const_ipv6, ARG1(1,IPV6), NULL , SMP_T_IPV6, SMP_USE_CONST }, + { "bin", smp_fetch_const_bin, ARG1(1,STR), smp_check_const_bin , SMP_T_BIN, SMP_USE_CONST }, + { "meth", smp_fetch_const_meth, ARG1(1,STR), smp_check_const_meth, SMP_T_METH, SMP_USE_CONST }, + + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &smp_kws); + +/* Note: must not be declared <const> as its list will be overwritten */ +static struct sample_conv_kw_list sample_conv_kws = {ILH, { + { "add_item",sample_conv_add_item, ARG3(2,STR,STR,STR), smp_check_add_item, SMP_T_STR, SMP_T_STR }, + { "debug", sample_conv_debug, ARG2(0,STR,STR), smp_check_debug, SMP_T_ANY, SMP_T_SAME }, + { "b64dec", sample_conv_base642bin, 0, NULL, SMP_T_STR, SMP_T_BIN }, + { "base64", sample_conv_bin2base64, 0, NULL, SMP_T_BIN, SMP_T_STR }, + { "concat", sample_conv_concat, ARG3(1,STR,STR,STR), smp_check_concat, SMP_T_STR, SMP_T_STR }, + { "ub64enc", sample_conv_bin2base64url,0, NULL, SMP_T_BIN, SMP_T_STR }, + { "ub64dec", sample_conv_base64url2bin,0, NULL, SMP_T_STR, SMP_T_BIN }, + { "upper", sample_conv_str2upper, 0, NULL, SMP_T_STR, SMP_T_STR }, + { "lower", sample_conv_str2lower, 0, NULL, SMP_T_STR, SMP_T_STR }, + { "length", sample_conv_length, 0, NULL, SMP_T_STR, SMP_T_SINT }, + { "be2dec", sample_conv_be2dec, ARG3(1,STR,SINT,SINT), sample_conv_be2dec_check, SMP_T_BIN, SMP_T_STR }, + { "be2hex", sample_conv_be2hex, ARG3(1,STR,SINT,SINT), sample_conv_be2hex_check, SMP_T_BIN, SMP_T_STR }, + { "hex", sample_conv_bin2hex, 0, NULL, SMP_T_BIN, SMP_T_STR }, + { "hex2i", sample_conv_hex2int, 0, NULL, SMP_T_STR, SMP_T_SINT }, + 
{ "ipmask", sample_conv_ipmask, ARG2(1,MSK4,MSK6), NULL, SMP_T_ADDR, SMP_T_ADDR }, + { "ltime", sample_conv_ltime, ARG2(1,STR,SINT), NULL, SMP_T_SINT, SMP_T_STR }, + { "ms_ltime", sample_conv_ms_ltime, ARG2(1,STR,SINT), NULL, SMP_T_SINT, SMP_T_STR }, + { "us_ltime", sample_conv_us_ltime, ARG2(1,STR,SINT), NULL, SMP_T_SINT, SMP_T_STR }, + { "utime", sample_conv_utime, ARG2(1,STR,SINT), NULL, SMP_T_SINT, SMP_T_STR }, + { "ms_utime", sample_conv_ms_utime, ARG2(1,STR,SINT), NULL, SMP_T_SINT, SMP_T_STR }, + { "us_utime", sample_conv_us_utime, ARG2(1,STR,SINT), NULL, SMP_T_SINT, SMP_T_STR }, + { "crc32", sample_conv_crc32, ARG1(0,SINT), NULL, SMP_T_BIN, SMP_T_SINT }, + { "crc32c", sample_conv_crc32c, ARG1(0,SINT), NULL, SMP_T_BIN, SMP_T_SINT }, + { "djb2", sample_conv_djb2, ARG1(0,SINT), NULL, SMP_T_BIN, SMP_T_SINT }, + { "sdbm", sample_conv_sdbm, ARG1(0,SINT), NULL, SMP_T_BIN, SMP_T_SINT }, + { "wt6", sample_conv_wt6, ARG1(0,SINT), NULL, SMP_T_BIN, SMP_T_SINT }, + { "xxh3", sample_conv_xxh3, ARG1(0,SINT), NULL, SMP_T_BIN, SMP_T_SINT }, + { "xxh32", sample_conv_xxh32, ARG1(0,SINT), NULL, SMP_T_BIN, SMP_T_SINT }, + { "xxh64", sample_conv_xxh64, ARG1(0,SINT), NULL, SMP_T_BIN, SMP_T_SINT }, + { "json", sample_conv_json, ARG1(1,STR), sample_conv_json_check, SMP_T_STR, SMP_T_STR }, + { "bytes", sample_conv_bytes, ARG2(1,STR,STR), sample_conv_bytes_check, SMP_T_BIN, SMP_T_BIN }, + { "field", sample_conv_field, ARG3(2,SINT,STR,SINT), sample_conv_field_check, SMP_T_STR, SMP_T_STR }, + { "word", sample_conv_word, ARG3(2,SINT,STR,SINT), sample_conv_field_check, SMP_T_STR, SMP_T_STR }, + { "param", sample_conv_param, ARG2(1,STR,STR), sample_conv_param_check, SMP_T_STR, SMP_T_STR }, + { "regsub", sample_conv_regsub, ARG3(2,REG,STR,STR), sample_conv_regsub_check, SMP_T_STR, SMP_T_STR }, + { "sha1", sample_conv_sha1, 0, NULL, SMP_T_BIN, SMP_T_BIN }, + { "strcmp", sample_conv_strcmp, ARG1(1,STR), smp_check_strcmp, SMP_T_STR, SMP_T_SINT }, + { "host_only", sample_conv_host_only, 0, NULL, SMP_T_STR, SMP_T_STR }, + { "port_only", sample_conv_port_only, 0, NULL, SMP_T_STR, SMP_T_SINT }, + + /* gRPC converters. 
*/ + { "ungrpc", sample_conv_ungrpc, ARG2(1,PBUF_FNUM,STR), sample_conv_protobuf_check, SMP_T_BIN, SMP_T_BIN }, + { "protobuf", sample_conv_protobuf, ARG2(1,PBUF_FNUM,STR), sample_conv_protobuf_check, SMP_T_BIN, SMP_T_BIN }, + + /* FIX converters */ + { "fix_is_valid", sample_conv_fix_is_valid, 0, NULL, SMP_T_BIN, SMP_T_BOOL }, + { "fix_tag_value", sample_conv_fix_tag_value, ARG1(1,STR), sample_conv_fix_value_check, SMP_T_BIN, SMP_T_BIN }, + + /* MQTT converters */ + { "mqtt_is_valid", sample_conv_mqtt_is_valid, 0, NULL, SMP_T_BIN, SMP_T_BOOL }, + { "mqtt_field_value", sample_conv_mqtt_field_value, ARG2(2,STR,STR), sample_conv_mqtt_field_value_check, SMP_T_BIN, SMP_T_STR }, + + { "iif", sample_conv_iif, ARG2(2, STR, STR), NULL, SMP_T_BOOL, SMP_T_STR }, + + { "and", sample_conv_binary_and, ARG1(1,STR), check_operator, SMP_T_SINT, SMP_T_SINT }, + { "or", sample_conv_binary_or, ARG1(1,STR), check_operator, SMP_T_SINT, SMP_T_SINT }, + { "xor", sample_conv_binary_xor, ARG1(1,STR), check_operator, SMP_T_SINT, SMP_T_SINT }, + { "cpl", sample_conv_binary_cpl, 0, NULL, SMP_T_SINT, SMP_T_SINT }, + { "bool", sample_conv_arith_bool, 0, NULL, SMP_T_SINT, SMP_T_BOOL }, + { "not", sample_conv_arith_not, 0, NULL, SMP_T_SINT, SMP_T_BOOL }, + { "odd", sample_conv_arith_odd, 0, NULL, SMP_T_SINT, SMP_T_BOOL }, + { "even", sample_conv_arith_even, 0, NULL, SMP_T_SINT, SMP_T_BOOL }, + { "add", sample_conv_arith_add, ARG1(1,STR), check_operator, SMP_T_SINT, SMP_T_SINT }, + { "sub", sample_conv_arith_sub, ARG1(1,STR), check_operator, SMP_T_SINT, SMP_T_SINT }, + { "mul", sample_conv_arith_mul, ARG1(1,STR), check_operator, SMP_T_SINT, SMP_T_SINT }, + { "div", sample_conv_arith_div, ARG1(1,STR), check_operator, SMP_T_SINT, SMP_T_SINT }, + { "mod", sample_conv_arith_mod, ARG1(1,STR), check_operator, SMP_T_SINT, SMP_T_SINT }, + { "neg", sample_conv_arith_neg, 0, NULL, SMP_T_SINT, SMP_T_SINT }, + + { "htonl", sample_conv_htonl, 0, NULL, SMP_T_SINT, SMP_T_BIN }, + { "cut_crlf", sample_conv_cut_crlf, 0, NULL, SMP_T_STR, SMP_T_STR }, + { "ltrim", sample_conv_ltrim, ARG1(1,STR), NULL, SMP_T_STR, SMP_T_STR }, + { "rtrim", sample_conv_rtrim, ARG1(1,STR), NULL, SMP_T_STR, SMP_T_STR }, + { "json_query", sample_conv_json_query, ARG2(1,STR,STR), sample_check_json_query , SMP_T_STR, SMP_T_ANY }, + +#ifdef USE_OPENSSL + /* JSON Web Token converters */ + { "jwt_header_query", sample_conv_jwt_header_query, ARG2(0,STR,STR), sample_conv_jwt_query_check, SMP_T_BIN, SMP_T_ANY }, + { "jwt_payload_query", sample_conv_jwt_payload_query, ARG2(0,STR,STR), sample_conv_jwt_query_check, SMP_T_BIN, SMP_T_ANY }, + { "jwt_verify", sample_conv_jwt_verify, ARG2(2,STR,STR), sample_conv_jwt_verify_check, SMP_T_BIN, SMP_T_SINT }, +#endif + { NULL, NULL, 0, 0, 0 }, +}}; + +INITCALL1(STG_REGISTER, sample_register_convs, &sample_conv_kws); diff --git a/src/server.c b/src/server.c new file mode 100644 index 0000000..829fbb3 --- /dev/null +++ b/src/server.c @@ -0,0 +1,6765 @@ +/* + * Server management functions. + * + * Copyright 2000-2012 Willy Tarreau <w@1wt.eu> + * Copyright 2007-2008 Krzysztof Piotr Oledzki <ole@ans.pl> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <sys/types.h> +#include <netinet/tcp.h> +#include <ctype.h> +#include <errno.h> + +#include <import/ebmbtree.h> + +#include <haproxy/api.h> +#include <haproxy/applet-t.h> +#include <haproxy/backend.h> +#include <haproxy/cfgparse.h> +#include <haproxy/check.h> +#include <haproxy/cli.h> +#include <haproxy/connection.h> +#include <haproxy/dict-t.h> +#include <haproxy/errors.h> +#include <haproxy/global.h> +#include <haproxy/log.h> +#include <haproxy/mailers.h> +#include <haproxy/namespace.h> +#include <haproxy/port_range.h> +#include <haproxy/protocol.h> +#include <haproxy/proxy.h> +#include <haproxy/queue.h> +#include <haproxy/resolvers.h> +#include <haproxy/sample.h> +#include <haproxy/sc_strm.h> +#include <haproxy/server.h> +#include <haproxy/stats.h> +#include <haproxy/stconn.h> +#include <haproxy/stream.h> +#include <haproxy/task.h> +#include <haproxy/tcpcheck.h> +#include <haproxy/time.h> +#include <haproxy/tools.h> +#include <haproxy/xxhash.h> +#include <haproxy/event_hdl.h> + + +static void srv_update_status(struct server *s, int type, int cause); +static int srv_apply_lastaddr(struct server *srv, int *err_code); +static void srv_cleanup_connections(struct server *srv); + +/* extra keywords used as value for other arguments. They are used as + * suggestions for mistyped words. + */ +static const char *extra_kw_list[] = { + "ipv4", "ipv6", "legacy", "octet-count", + "fail-check", "sudden-death", "mark-down", + NULL /* must be last */ +}; + +/* List head of all known server keywords */ +struct srv_kw_list srv_keywords = { + .list = LIST_HEAD_INIT(srv_keywords.list) +}; + +__decl_thread(HA_SPINLOCK_T idle_conn_srv_lock); +struct eb_root idle_conn_srv = EB_ROOT; +struct task *idle_conn_task __read_mostly = NULL; +struct list servers_list = LIST_HEAD_INIT(servers_list); +static struct task *server_atomic_sync_task = NULL; +static event_hdl_async_equeue server_atomic_sync_queue; + +/* SERVER DELETE(n)->ADD global tracker: + * This is meant to provide the srv->rid (revision id) value. + * The revision id makes it possible to differentiate between a previously + * existing deleted server and a new server reusing the deleted server's + * name/id. + * + * The start value is 0 (even value). + * The LSB is used to indicate that one or more server deletions in a row + * were performed. + * When adding a new server, increment by 1 if the current value is odd + * (odd = LSB set), because adding a new server after one or more deletions + * means we could potentially be reusing old names: increase the revision + * id to prevent mixups between old and new names. + * + * srv->rid is calculated from cnt even values only. + * sizeof(srv_id_reuse_cnt) must be twice sizeof(srv->rid) + * + * Wraparound is expected and should not cause issues + * (with the current design we allow up to 4 billion unique revisions) + * + * The counter is only used under thread_isolate (cli_add/cli_del), + * no need for atomic ops. 
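+ * + * Worked example of the scheme above: cnt starts at 0 and new servers get + * rid 0 (cnt/2); a deletion sets the LSB (cnt becomes 1); the next addition + * rounds cnt up to the next even value (2), so servers added from then on + * get rid 1 and cannot be mixed up with the deleted ones.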
+ */ +static uint64_t srv_id_reuse_cnt = 0; + +/* The server names dictionary */ +struct dict server_key_dict = { + .name = "server keys", + .values = EB_ROOT_UNIQUE, +}; + +static const char *srv_adm_st_chg_cause_str[] = { + [SRV_ADM_STCHGC_NONE] = "", + [SRV_ADM_STCHGC_DNS_NOENT] = "entry removed from SRV record", + [SRV_ADM_STCHGC_DNS_NOIP] = "No IP for server ", + [SRV_ADM_STCHGC_DNS_NX] = "DNS NX status", + [SRV_ADM_STCHGC_DNS_TIMEOUT] = "DNS timeout status", + [SRV_ADM_STCHGC_DNS_REFUSED] = "DNS refused status", + [SRV_ADM_STCHGC_DNS_UNSPEC] = "unspecified DNS error", + [SRV_ADM_STCHGC_STATS_DISABLE] = "'disable' on stats page", + [SRV_ADM_STCHGC_STATS_STOP] = "'stop' on stats page" +}; + +const char *srv_adm_st_chg_cause(enum srv_adm_st_chg_cause cause) +{ + return srv_adm_st_chg_cause_str[cause]; +} + +static const char *srv_op_st_chg_cause_str[] = { + [SRV_OP_STCHGC_NONE] = "", + [SRV_OP_STCHGC_HEALTH] = "", + [SRV_OP_STCHGC_AGENT] = "", + [SRV_OP_STCHGC_CLI] = "changed from CLI", + [SRV_OP_STCHGC_LUA] = "changed from Lua script", + [SRV_OP_STCHGC_STATS_WEB] = "changed from Web interface", + [SRV_OP_STCHGC_STATEFILE] = "changed from server-state after a reload" +}; + +const char *srv_op_st_chg_cause(enum srv_op_st_chg_cause cause) +{ + return srv_op_st_chg_cause_str[cause]; +} + +int srv_downtime(const struct server *s) +{ + if ((s->cur_state != SRV_ST_STOPPED) || s->last_change >= ns_to_sec(now_ns)) // ignore negative time + return s->down_time; + + return ns_to_sec(now_ns) - s->last_change + s->down_time; +} + +int srv_lastsession(const struct server *s) +{ + if (s->counters.last_sess) + return ns_to_sec(now_ns) - s->counters.last_sess; + + return -1; +} + +int srv_getinter(const struct check *check) +{ + const struct server *s = check->server; + + if ((check->state & (CHK_ST_CONFIGURED|CHK_ST_FASTINTER)) == CHK_ST_CONFIGURED && + (check->health == check->rise + check->fall - 1)) + return check->inter; + + if ((s->next_state == SRV_ST_STOPPED) && check->health == 0) + return (check->downinter)?(check->downinter):(check->inter); + + return (check->fastinter)?(check->fastinter):(check->inter); +} + +/* Update server's addr:svc_port tuple in INET context + * + * Must be called under thread isolation to ensure consistent readings across + * all threads (addr:svc_port might be read without srv lock being held). 
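+ * Isolation is required because a sockaddr_storage cannot be swapped + * atomically: without it, a concurrent lockless reader could observe a + * torn addr:port pair.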
+ */ +static void _srv_set_inetaddr_port(struct server *srv, + const struct sockaddr_storage *addr, + unsigned int svc_port, uint8_t mapped_port) +{ + ipcpy(addr, &srv->addr); + srv->svc_port = svc_port; + if (mapped_port) + srv->flags |= SRV_F_MAPPORTS; + else + srv->flags &= ~SRV_F_MAPPORTS; + + if (srv->log_target && srv->log_target->type == LOG_TARGET_DGRAM) { + /* server is used as a log target, manually update log target addr for DGRAM */ + ipcpy(addr, srv->log_target->addr); + set_host_port(srv->log_target->addr, svc_port); + } +} + +/* same as _srv_set_inetaddr_port() but only updates the addr part + */ +static void _srv_set_inetaddr(struct server *srv, + const struct sockaddr_storage *addr) +{ + _srv_set_inetaddr_port(srv, addr, srv->svc_port, !!(srv->flags & SRV_F_MAPPORTS)); +} + +/* + * Function executed by server_atomic_sync_task to perform atomic updates on + * compatible server struct members that are not guarded by any lock since + * they are not supposed to change often and are subject to being used in + * sensitive codepaths + * + * Some updates may require thread isolation: we start without isolation + * but as soon as we encounter an event that requires isolation, we do so. + * Once the event is processed, we keep the isolation until we've processed + * the whole batch of events and leave isolation once we're done, as it would + * be very costly to try to acquire isolation multiple times in a row. + * The task will limit itself to a number of events per run to prevent + * thread contention (see: "tune.events.max-events-at-once"). + * + * TODO: if we find out that enforcing isolation is too costly, we may + * consider adding thread_isolate_try_full(timeout) or equivalent to the + * thread API so that we can do our best not to block harmless threads + * for too long if one or multiple threads are still heavily busy. This + * would mean that the task would be capable of rescheduling itself to + * start again on the current event if it failed to acquire thread + * isolation. This would also imply that the event_hdl API allows us + * to check an event without popping it from the queue first (remove the + * event once it is successfully processed). + */ +static void srv_set_addr_desc(struct server *s, int reattach); +static struct task *server_atomic_sync(struct task *task, void *context, unsigned int state) +{ + unsigned int remain = event_hdl_tune.max_events_at_once; // to limit max number of events per batch + struct event_hdl_async_event *event; + + /* check for new server events that we care about */ + while ((event = event_hdl_async_equeue_pop(&server_atomic_sync_queue))) { + if (event_hdl_sub_type_equal(event->type, EVENT_HDL_SUB_END)) { + /* ending event: no more events to come */ + event_hdl_async_free_event(event); + task_destroy(task); + task = NULL; + break; + } + + if (!remain) { + /* STOP: we've already spent all our budget here, and + * considering we possibly are under isolation, we cannot + * keep blocking other threads any longer. + * + * Reschedule the task to finish where we left off if + * there are remaining events in the queue. 
+ */ + if (!event_hdl_async_equeue_isempty(&server_atomic_sync_queue)) + task_wakeup(task, TASK_WOKEN_OTHER); + break; + } + remain--; + + /* new event to process */ + if (event_hdl_sub_type_equal(event->type, EVENT_HDL_SUB_SERVER_INETADDR)) { + struct sockaddr_storage new_addr; + struct event_hdl_cb_data_server_inetaddr *data = event->data; + struct proxy *px; + struct server *srv; + + /* server ip:port changed, we must atomically update data members + * to prevent invalid reads by other threads. + */ + + /* check if related server still exists */ + px = proxy_find_by_id(data->server.safe.proxy_uuid, PR_CAP_BE, 0); + if (!px) + continue; + srv = findserver_unique_id(px, data->server.safe.puid, data->server.safe.rid); + if (!srv) + continue; + + /* prepare new addr based on event cb data */ + memset(&new_addr, 0, sizeof(new_addr)); + new_addr.ss_family = data->safe.next.family; + switch (new_addr.ss_family) { + case AF_INET: + ((struct sockaddr_in *)&new_addr)->sin_addr.s_addr = + data->safe.next.addr.v4.s_addr; + break; + case AF_INET6: + memcpy(&((struct sockaddr_in6 *)&new_addr)->sin6_addr, + &data->safe.next.addr.v6, + sizeof(struct in6_addr)); + break; + case AF_UNSPEC: + /* addr reset, nothing to do */ + break; + default: + /* should not happen */ + break; + } + /* + * this requires thread isolation, which is safe since we're the only + * task working for the current subscription and we don't hold locks + * or resources that other threads may depend on to complete a running + * cycle. Note that we do it this way because we assume that this event + * is rather rare. + */ + if (!thread_isolated()) + thread_isolate_full(); + + /* apply new addr:port combination */ + _srv_set_inetaddr_port(srv, &new_addr, + data->safe.next.port.svc, data->safe.next.port.map); + + /* propagate the changes */ + if (data->safe.purge_conn) /* force connection cleanup on the given server? */ + srv_cleanup_connections(srv); + srv_set_dyncookie(srv); + srv_set_addr_desc(srv, 1); + } + event_hdl_async_free_event(event); + } + + /* some events possibly required thread_isolation: + * now that we are done, we must leave thread isolation before + * returning + */ + if (thread_isolated()) + thread_release(); + + return task; +} + +/* Try to start the atomic server sync task. 
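+ * It is registered as a post-check hook (see REGISTER_POST_CHECK below), + * so it runs once after the configuration has been checked.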
+ * + * Returns ERR_NONE on success and a combination of ERR_* codes on failure + */ +static int server_atomic_sync_start() +{ + struct event_hdl_sub_type subscriptions = EVENT_HDL_SUB_NONE; + + if (server_atomic_sync_task) + return ERR_NONE; // nothing to do + server_atomic_sync_task = task_new_anywhere(); + if (!server_atomic_sync_task) + goto fail; + server_atomic_sync_task->process = server_atomic_sync; + event_hdl_async_equeue_init(&server_atomic_sync_queue); + + /* task created, now subscribe to relevant server events in the global list */ + subscriptions = event_hdl_sub_type_add(subscriptions, EVENT_HDL_SUB_SERVER_INETADDR); + if (!event_hdl_subscribe(NULL, subscriptions, + EVENT_HDL_ASYNC_TASK(&server_atomic_sync_queue, + server_atomic_sync_task, + NULL, + NULL))) + goto fail; + + return ERR_NONE; + + fail: + task_destroy(server_atomic_sync_task); + server_atomic_sync_task = NULL; + return ERR_ALERT | ERR_FATAL; +} +REGISTER_POST_CHECK(server_atomic_sync_start); + +/* fill the common server event data members of the struct + * must be called with server lock or under thread isolate + */ +static inline void _srv_event_hdl_prepare(struct event_hdl_cb_data_server *cb_data, + struct server *srv, uint8_t thread_isolate) +{ + /* safe data assignments */ + cb_data->safe.puid = srv->puid; + cb_data->safe.rid = srv->rid; + cb_data->safe.flags = srv->flags; + snprintf(cb_data->safe.name, sizeof(cb_data->safe.name), "%s", srv->id); + cb_data->safe.proxy_name[0] = '\0'; + cb_data->safe.proxy_uuid = -1; /* default value */ + if (srv->proxy) { + cb_data->safe.proxy_uuid = srv->proxy->uuid; + snprintf(cb_data->safe.proxy_name, sizeof(cb_data->safe.proxy_name), "%s", srv->proxy->id); + } + /* unsafe data assignments */ + cb_data->unsafe.ptr = srv; + cb_data->unsafe.thread_isolate = thread_isolate; + cb_data->unsafe.srv_lock = !thread_isolate; +} + +/* take an event-check snapshot from a live check */ +void _srv_event_hdl_prepare_checkres(struct event_hdl_cb_data_server_checkres *checkres, + struct check *check) +{ + checkres->agent = !!(check->state & CHK_ST_AGENT); + checkres->result = check->result; + checkres->duration = check->duration; + checkres->reason.status = check->status; + checkres->reason.code = check->code; + checkres->health.cur = check->health; + checkres->health.rise = check->rise; + checkres->health.fall = check->fall; +} + +/* Prepare SERVER_STATE event + * + * This special event will contain extra hints related to the state change + * + * Must be called with server lock held + */ +void _srv_event_hdl_prepare_state(struct event_hdl_cb_data_server_state *cb_data, + struct server *srv, int type, int cause, + enum srv_state prev_state, int requeued) +{ + /* state event provides additional info about the server state change */ + cb_data->safe.type = type; + cb_data->safe.new_state = srv->cur_state; + cb_data->safe.old_state = prev_state; + cb_data->safe.requeued = requeued; + if (type) { + /* administrative */ + cb_data->safe.adm_st_chg.cause = cause; + } + else { + /* operational */ + cb_data->safe.op_st_chg.cause = cause; + if (cause == SRV_OP_STCHGC_HEALTH || cause == SRV_OP_STCHGC_AGENT) { + struct check *check = (cause == SRV_OP_STCHGC_HEALTH) ? &srv->check : &srv->agent; + + /* provide additional check-related state change result */ + _srv_event_hdl_prepare_checkres(&cb_data->safe.op_st_chg.check, check); + } + } +} + +/* Prepare SERVER_INETADDR event, prev data is learned from the current + * server settings. 
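+ * ("prev" here means srv->addr, srv->svc_port and the SRV_F_MAPPORTS flag + * as snapshotted by the function below.)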
+ * + * This special event will contain extra hints related to the addr change + * + * Must be called with the server lock held. + */ +static void _srv_event_hdl_prepare_inetaddr(struct event_hdl_cb_data_server_inetaddr *cb_data, + struct server *srv, + const struct sockaddr_storage *next_addr, + unsigned int next_port, uint8_t next_mapports, + uint8_t purge_conn) +{ + struct sockaddr_storage *prev_addr = &srv->addr; + unsigned int prev_port = srv->svc_port; + uint8_t prev_mapports = !!(srv->flags & SRV_F_MAPPORTS); + + /* only INET families are supported */ + BUG_ON((prev_addr->ss_family != AF_UNSPEC && + prev_addr->ss_family != AF_INET && prev_addr->ss_family != AF_INET6) || + (next_addr->ss_family != AF_UNSPEC && + next_addr->ss_family != AF_INET && next_addr->ss_family != AF_INET6)); + + /* prev */ + cb_data->safe.prev.family = prev_addr->ss_family; + memset(&cb_data->safe.prev.addr, 0, sizeof(cb_data->safe.prev.addr)); + if (prev_addr->ss_family == AF_INET) + cb_data->safe.prev.addr.v4.s_addr = + ((struct sockaddr_in *)prev_addr)->sin_addr.s_addr; + else if (prev_addr->ss_family == AF_INET6) + memcpy(&cb_data->safe.prev.addr.v6, + &((struct sockaddr_in6 *)prev_addr)->sin6_addr, + sizeof(struct in6_addr)); + cb_data->safe.prev.port.svc = prev_port; + cb_data->safe.prev.port.map = prev_mapports; + + /* next */ + cb_data->safe.next.family = next_addr->ss_family; + memset(&cb_data->safe.next.addr, 0, sizeof(cb_data->safe.next.addr)); + if (next_addr->ss_family == AF_INET) + cb_data->safe.next.addr.v4.s_addr = + ((struct sockaddr_in *)next_addr)->sin_addr.s_addr; + else if (next_addr->ss_family == AF_INET6) + memcpy(&cb_data->safe.next.addr.v6, + &((struct sockaddr_in6 *)next_addr)->sin6_addr, + sizeof(struct in6_addr)); + cb_data->safe.next.port.svc = next_port; + cb_data->safe.next.port.map = next_mapports; + + cb_data->safe.purge_conn = purge_conn; +} + +/* server event publishing helper: publish in both global and + * server dedicated subscription list. + */ +#define _srv_event_hdl_publish(e, d, s) \ + ({ \ + /* publish in server dedicated sub list */ \ + event_hdl_publish(&s->e_subs, e, EVENT_HDL_CB_DATA(&d));\ + /* publish in global subscription list */ \ + event_hdl_publish(NULL, e, EVENT_HDL_CB_DATA(&d)); \ + }) + +/* General server event publishing: + * Use this to publish EVENT_HDL_SUB_SERVER family type event + * from srv facility. + * + * server ptr must be valid. + * Must be called with srv lock or under thread_isolate. + */ +static void srv_event_hdl_publish(struct event_hdl_sub_type event, + struct server *srv, uint8_t thread_isolate) +{ + struct event_hdl_cb_data_server cb_data; + + /* prepare event data */ + _srv_event_hdl_prepare(&cb_data, srv, thread_isolate); + _srv_event_hdl_publish(event, cb_data, srv); +} + +/* Publish SERVER_CHECK event + * + * This special event will contain extra hints related to the check itself + * + * Must be called with server lock held + */ +void srv_event_hdl_publish_check(struct server *srv, struct check *check) +{ + struct event_hdl_cb_data_server_check cb_data; + + /* check event provides additional info about the server check */ + _srv_event_hdl_prepare_checkres(&cb_data.safe.res, check); + + cb_data.unsafe.ptr = check; + + /* prepare event data (common server data) */ + _srv_event_hdl_prepare((struct event_hdl_cb_data_server *)&cb_data, srv, 0); + + _srv_event_hdl_publish(EVENT_HDL_SUB_SERVER_CHECK, cb_data, srv); +} + +/* + * Check that we did not get a hash collision. + * Unlikely, but it can happen. 
The server's proxy must be at least + * read-locked. + */ +static inline void srv_check_for_dup_dyncookie(struct server *s) +{ + struct proxy *p = s->proxy; + struct server *tmpserv; + + for (tmpserv = p->srv; tmpserv != NULL; + tmpserv = tmpserv->next) { + if (tmpserv == s) + continue; + if (tmpserv->next_admin & SRV_ADMF_FMAINT) + continue; + if (tmpserv->cookie && + strcmp(tmpserv->cookie, s->cookie) == 0) { + ha_warning("We generated two equal cookies for two different servers.\n" + "Please change the secret key for '%s'.\n", + s->proxy->id); + } + } + +} + +/* + * Must be called with the server lock held, and will read-lock the proxy. + */ +void srv_set_dyncookie(struct server *s) +{ + struct proxy *p = s->proxy; + char *tmpbuf; + unsigned long long hash_value; + size_t key_len; + size_t buffer_len; + int addr_len; + int port; + + HA_RWLOCK_RDLOCK(PROXY_LOCK, &p->lock); + + if ((s->flags & SRV_F_COOKIESET) || + !(s->proxy->ck_opts & PR_CK_DYNAMIC) || + s->proxy->dyncookie_key == NULL) + goto out; + key_len = strlen(p->dyncookie_key); + + if (s->addr.ss_family != AF_INET && + s->addr.ss_family != AF_INET6) + goto out; + /* + * Buffer to calculate the cookie value. + * The buffer contains the secret key + the server IP address + * + the TCP port. + */ + addr_len = (s->addr.ss_family == AF_INET) ? 4 : 16; + /* + * The TCP port should use only 2 bytes, but is stored in + * an unsigned int in struct server, so let's use 4, to be + * on the safe side. + */ + buffer_len = key_len + addr_len + 4; + tmpbuf = trash.area; + memcpy(tmpbuf, p->dyncookie_key, key_len); + memcpy(&(tmpbuf[key_len]), + s->addr.ss_family == AF_INET ? + (void *)&((struct sockaddr_in *)&s->addr)->sin_addr.s_addr : + (void *)&(((struct sockaddr_in6 *)&s->addr)->sin6_addr.s6_addr), + addr_len); + /* + * Make sure it's the same across all the load balancers, + * no matter their endianness. + */ + port = htonl(s->svc_port); + memcpy(&tmpbuf[key_len + addr_len], &port, 4); + hash_value = XXH64(tmpbuf, buffer_len, 0); + memprintf(&s->cookie, "%016llx", hash_value); + if (!s->cookie) + goto out; + s->cklen = 16; + + /* Don't bother checking if the dyncookie is duplicated if + * the server is marked as "disabled", maybe it doesn't have + * its real IP yet, but just a placeholder. + */ + if (!(s->next_admin & SRV_ADMF_FMAINT)) + srv_check_for_dup_dyncookie(s); + out: + HA_RWLOCK_RDUNLOCK(PROXY_LOCK, &p->lock); +} + +/* Returns true if it's possible to reuse an idle connection from server <srv> + * for a websocket stream. This is the case if the server is configured to use + * the same protocol for both HTTP and websocket streams. This depends on the + * value of the "proto", "alpn" and "ws" keywords. + */ +int srv_check_reuse_ws(struct server *srv) +{ + if (srv->mux_proto || srv->use_ssl != 1 || !srv->ssl_ctx.alpn_str) { + /* explicit srv.mux_proto or no ALPN : srv.mux_proto is used + * for mux selection. + */ + const struct ist srv_mux = srv->mux_proto ? + srv->mux_proto->token : IST_NULL; + + switch (srv->ws) { + /* "auto" means use the same protocol : reuse is possible. */ + case SRV_WS_AUTO: + return 1; + + /* "h2" means use h2 for websocket : reuse is possible if + * server mux is h2. + */ + case SRV_WS_H2: + if (srv->mux_proto && isteq(srv_mux, ist("h2"))) + return 1; + break; + + /* "h1" means use h1 for websocket : reuse is possible if + * server mux is h1. + */ + case SRV_WS_H1: + if (!srv->mux_proto || isteq(srv_mux, ist("h1"))) + return 1; + break; + } + } + else { + /* ALPN selection. 
+ * Based on the assumption that only "h2" and "http/1.1" tokens + * are used on server ALPN. + */ + const struct ist alpn = ist2(srv->ssl_ctx.alpn_str, + srv->ssl_ctx.alpn_len); + + switch (srv->ws) { + case SRV_WS_AUTO: + /* for auto mode, consider reuse as possible if the + * server uses a single protocol ALPN + */ + if (!istchr(alpn, ',')) + return 1; + break; + + case SRV_WS_H2: + return isteq(alpn, ist("\x02h2")); + + case SRV_WS_H1: + return isteq(alpn, ist("\x08http/1.1")); + } + } + + return 0; +} + +/* Return the proto to use for a websocket stream on <srv> without ALPN. NULL + * is a valid value indicating to use the fallback mux. + */ +const struct mux_ops *srv_get_ws_proto(struct server *srv) +{ + const struct mux_proto_list *mux = NULL; + + switch (srv->ws) { + case SRV_WS_AUTO: + mux = srv->mux_proto; + break; + + case SRV_WS_H1: + mux = get_mux_proto(ist("h1")); + break; + + case SRV_WS_H2: + mux = get_mux_proto(ist("h2")); + break; + } + + return mux ? mux->mux : NULL; +} + +/* + * Must be called with the server lock held. The server is first removed from + * the proxy tree if it was already attached. If <reattach> is true, the server + * will then be attached in the proxy tree. The proxy lock is held to + * manipulate the tree. + */ +static void srv_set_addr_desc(struct server *s, int reattach) +{ + struct proxy *p = s->proxy; + char *key; + + key = sa2str(&s->addr, s->svc_port, s->flags & SRV_F_MAPPORTS); + + if (s->addr_node.key) { + if (key && strcmp(key, s->addr_node.key) == 0) { + free(key); + return; + } + + HA_RWLOCK_WRLOCK(PROXY_LOCK, &p->lock); + ebpt_delete(&s->addr_node); + HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &p->lock); + + free(s->addr_node.key); + } + + s->addr_node.key = key; + + if (reattach) { + if (s->addr_node.key) { + HA_RWLOCK_WRLOCK(PROXY_LOCK, &p->lock); + ebis_insert(&p->used_server_addr, &s->addr_node); + HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &p->lock); + } + } +} + +/* + * Registers the server keyword list <kwl> as a list of valid keywords for next + * parsing sessions. + */ +void srv_register_keywords(struct srv_kw_list *kwl) +{ + LIST_APPEND(&srv_keywords.list, &kwl->list); +} + +/* Return a pointer to the server keyword <kw>, or NULL if not found. If the + * keyword is found with a NULL ->parse() function, then an attempt is made to + * find one with a valid ->parse() function. This way it is possible to declare + * platform-dependent, known keywords as NULL, then only declare them as valid + * if some options are met. Note that if the requested keyword contains an + * opening parenthesis, everything from this point is ignored. + */ +struct srv_kw *srv_find_kw(const char *kw) +{ + int index; + const char *kwend; + struct srv_kw_list *kwl; + struct srv_kw *ret = NULL; + + kwend = strchr(kw, '('); + if (!kwend) + kwend = kw + strlen(kw); + + list_for_each_entry(kwl, &srv_keywords.list, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) { + if ((strncmp(kwl->kw[index].kw, kw, kwend - kw) == 0) && + kwl->kw[index].kw[kwend-kw] == 0) { + if (kwl->kw[index].parse) + return &kwl->kw[index]; /* found it !*/ + else + ret = &kwl->kw[index]; /* may be OK */ + } + } + } + return ret; +} + +/* Dumps all registered "server" keywords to the <out> string pointer. The + * unsupported keywords are only dumped if their supported form was not + * found. 
+ */ +void srv_dump_kws(char **out) +{ + struct srv_kw_list *kwl; + int index; + + if (!out) + return; + + *out = NULL; + list_for_each_entry(kwl, &srv_keywords.list, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) { + if (kwl->kw[index].parse || + srv_find_kw(kwl->kw[index].kw) == &kwl->kw[index]) { + memprintf(out, "%s[%4s] %s%s%s%s\n", *out ? *out : "", + kwl->scope, + kwl->kw[index].kw, + kwl->kw[index].skip ? " <arg>" : "", + kwl->kw[index].default_ok ? " [dflt_ok]" : "", + kwl->kw[index].parse ? "" : " (not supported)"); + } + } + } +} + +/* Try to find in srv_keywords the word that looks closest to <word> by + * counting transitions between letters, digits and other characters. Will + * return the best matching word if found, otherwise NULL. The static + * extra_kw_list array of extra words is compared as well; it must be + * terminated by a NULL entry. + */ +static const char *srv_find_best_kw(const char *word) +{ + uint8_t word_sig[1024]; + uint8_t list_sig[1024]; + const struct srv_kw_list *kwl; + const char *best_ptr = NULL; + int dist, best_dist = INT_MAX; + const char **extra; + int index; + + make_word_fingerprint(word_sig, word); + list_for_each_entry(kwl, &srv_keywords.list, list) { + for (index = 0; kwl->kw[index].kw != NULL; index++) { + make_word_fingerprint(list_sig, kwl->kw[index].kw); + dist = word_fingerprint_distance(word_sig, list_sig); + if (dist < best_dist) { + best_dist = dist; + best_ptr = kwl->kw[index].kw; + } + } + } + + for (extra = extra_kw_list; *extra; extra++) { + make_word_fingerprint(list_sig, *extra); + dist = word_fingerprint_distance(word_sig, list_sig); + if (dist < best_dist) { + best_dist = dist; + best_ptr = *extra; + } + } + + if (best_dist > 2 * strlen(word) || (best_ptr && best_dist > 2 * strlen(best_ptr))) + best_ptr = NULL; + + return best_ptr; +} + +/* Parse the "backup" server keyword */ +static int srv_parse_backup(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + newsrv->flags |= SRV_F_BACKUP; + return 0; +} + + +/* Parse the "cookie" server keyword */ +static int srv_parse_cookie(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + char *arg; + + arg = args[*cur_arg + 1]; + if (!*arg) { + memprintf(err, "'%s' expects <value> as argument.\n", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + free(newsrv->cookie); + newsrv->cookie = strdup(arg); + newsrv->cklen = strlen(arg); + newsrv->flags |= SRV_F_COOKIESET; + return 0; +} + +/* Parse the "disabled" server keyword */ +static int srv_parse_disabled(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + newsrv->next_admin |= SRV_ADMF_CMAINT | SRV_ADMF_FMAINT; + newsrv->next_state = SRV_ST_STOPPED; + newsrv->check.state |= CHK_ST_PAUSED; + newsrv->check.health = 0; + return 0; +} + +/* Parse the "enabled" server keyword */ +static int srv_parse_enabled(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + newsrv->next_admin &= ~SRV_ADMF_CMAINT & ~SRV_ADMF_FMAINT; + newsrv->next_state = SRV_ST_RUNNING; + newsrv->check.state &= ~CHK_ST_PAUSED; + newsrv->check.health = newsrv->check.rise; + return 0; +} + +/* Parse the "error-limit" server keyword */ +static int srv_parse_error_limit(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + if (!*args[*cur_arg + 1]) { + memprintf(err, "'%s' expects an integer argument.", + args[*cur_arg]); 
+ return ERR_ALERT | ERR_FATAL; + } + + newsrv->consecutive_errors_limit = atoi(args[*cur_arg + 1]); + + if (newsrv->consecutive_errors_limit <= 0) { + memprintf(err, "%s has to be > 0.", + args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +/* Parse the "ws" keyword */ +static int srv_parse_ws(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + if (!args[*cur_arg + 1]) { + memprintf(err, "'%s' expects 'auto', 'h1' or 'h2' value", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if (strcmp(args[*cur_arg + 1], "h1") == 0) { + newsrv->ws = SRV_WS_H1; + } + else if (strcmp(args[*cur_arg + 1], "h2") == 0) { + newsrv->ws = SRV_WS_H2; + } + else if (strcmp(args[*cur_arg + 1], "auto") == 0) { + newsrv->ws = SRV_WS_AUTO; + } + else { + memprintf(err, "'%s' has to be 'auto', 'h1' or 'h2'", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + + return 0; +} + +/* Parse the "init-addr" server keyword */ +static int srv_parse_init_addr(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + char *p, *end; + int done; + struct sockaddr_storage sa; + + newsrv->init_addr_methods = 0; + memset(&newsrv->init_addr, 0, sizeof(newsrv->init_addr)); + + for (p = args[*cur_arg + 1]; *p; p = end) { + /* cut on next comma */ + for (end = p; *end && *end != ','; end++); + if (*end) + *(end++) = 0; + + memset(&sa, 0, sizeof(sa)); + if (strcmp(p, "libc") == 0) { + done = srv_append_initaddr(&newsrv->init_addr_methods, SRV_IADDR_LIBC); + } + else if (strcmp(p, "last") == 0) { + done = srv_append_initaddr(&newsrv->init_addr_methods, SRV_IADDR_LAST); + } + else if (strcmp(p, "none") == 0) { + done = srv_append_initaddr(&newsrv->init_addr_methods, SRV_IADDR_NONE); + } + else if (str2ip2(p, &sa, 0)) { + if (is_addr(&newsrv->init_addr)) { + memprintf(err, "'%s' : initial address already specified, cannot add '%s'.", + args[*cur_arg], p); + return ERR_ALERT | ERR_FATAL; + } + newsrv->init_addr = sa; + done = srv_append_initaddr(&newsrv->init_addr_methods, SRV_IADDR_IP); + } + else { + memprintf(err, "'%s' : unknown init-addr method '%s', supported methods are 'libc', 'last', 'none'.", + args[*cur_arg], p); + return ERR_ALERT | ERR_FATAL; + } + if (!done) { + memprintf(err, "'%s' : too many init-addr methods when trying to add '%s'", + args[*cur_arg], p); + return ERR_ALERT | ERR_FATAL; + } + } + + return 0; +} + +/* Parse the "log-bufsize" server keyword */ +static int srv_parse_log_bufsize(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + if (!*args[*cur_arg + 1]) { + memprintf(err, "'%s' expects an integer argument.", + args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + newsrv->log_bufsize = atoi(args[*cur_arg + 1]); + + if (newsrv->log_bufsize <= 0) { + memprintf(err, "%s has to be > 0.", + args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +/* Parse the "log-proto" server keyword */ +static int srv_parse_log_proto(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + if (strcmp(args[*cur_arg + 1], "legacy") == 0) + newsrv->log_proto = SRV_LOG_PROTO_LEGACY; + else if (strcmp(args[*cur_arg + 1], "octet-count") == 0) + newsrv->log_proto = SRV_LOG_PROTO_OCTET_COUNTING; + else { + memprintf(err, "'%s' expects one of 'legacy' or 'octet-count' but got '%s'", + args[*cur_arg], args[*cur_arg + 1]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +/* Parse the "maxconn" server keyword */ +static int 
srv_parse_maxconn(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + newsrv->maxconn = atol(args[*cur_arg + 1]); + return 0; +} + +/* Parse the "maxqueue" server keyword */ +static int srv_parse_maxqueue(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + newsrv->maxqueue = atol(args[*cur_arg + 1]); + return 0; +} + +/* Parse the "minconn" server keyword */ +static int srv_parse_minconn(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + newsrv->minconn = atol(args[*cur_arg + 1]); + return 0; +} + +static int srv_parse_max_reuse(char **args, int *cur_arg, struct proxy *curproxy, struct server *newsrv, char **err) +{ + char *arg; + + arg = args[*cur_arg + 1]; + if (!*arg) { + memprintf(err, "'%s' expects <value> as argument.\n", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + newsrv->max_reuse = atoi(arg); + + return 0; +} + +static int srv_parse_pool_purge_delay(char **args, int *cur_arg, struct proxy *curproxy, struct server *newsrv, char **err) +{ + const char *res; + char *arg; + unsigned int time; + + arg = args[*cur_arg + 1]; + if (!*arg) { + memprintf(err, "'%s' expects <value> as argument.\n", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + res = parse_time_err(arg, &time, TIME_UNIT_MS); + if (res == PARSE_TIME_OVER) { + memprintf(err, "timer overflow in argument '%s' to '%s' (maximum value is 2147483647 ms or ~24.8 days)", + args[*cur_arg+1], args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + else if (res == PARSE_TIME_UNDER) { + memprintf(err, "timer underflow in argument '%s' to '%s' (minimum non-null value is 1 ms)", + args[*cur_arg+1], args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + else if (res) { + memprintf(err, "unexpected character '%c' in argument to <%s>.\n", + *res, args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + newsrv->pool_purge_delay = time; + + return 0; +} + +static int srv_parse_pool_low_conn(char **args, int *cur_arg, struct proxy *curproxy, struct server *newsrv, char **err) +{ + char *arg; + + arg = args[*cur_arg + 1]; + if (!*arg) { + memprintf(err, "'%s' expects <value> as argument.\n", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + newsrv->low_idle_conns = atoi(arg); + return 0; +} + +static int srv_parse_pool_max_conn(char **args, int *cur_arg, struct proxy *curproxy, struct server *newsrv, char **err) +{ + char *arg; + + arg = args[*cur_arg + 1]; + if (!*arg) { + memprintf(err, "'%s' expects <value> as argument.\n", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + newsrv->max_idle_conns = atoi(arg); + if ((int)newsrv->max_idle_conns < -1) { + memprintf(err, "'%s' must be >= -1", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +/* parse the "id" server keyword */ +static int srv_parse_id(char **args, int *cur_arg, struct proxy *curproxy, struct server *newsrv, char **err) +{ + struct eb32_node *node; + + if (!*args[*cur_arg + 1]) { + memprintf(err, "'%s' : expects an integer argument", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + newsrv->puid = atol(args[*cur_arg + 1]); + newsrv->conf.id.key = newsrv->puid; + + if (newsrv->puid <= 0) { + memprintf(err, "'%s' : custom id has to be > 0", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + node = eb32_lookup(&curproxy->conf.used_server_id, newsrv->puid); + if (node) { + struct server *target = container_of(node, struct server, conf.id); + memprintf(err, "'%s' : custom id %d already used at %s:%d 
('server %s')", + args[*cur_arg], newsrv->puid, target->conf.file, target->conf.line, + target->id); + return ERR_ALERT | ERR_FATAL; + } + + newsrv->flags |= SRV_F_FORCED_ID; + return 0; +} + +/* Parse the "namespace" server keyword */ +static int srv_parse_namespace(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ +#ifdef USE_NS + char *arg; + + arg = args[*cur_arg + 1]; + if (!*arg) { + memprintf(err, "'%s' : expects <name> as argument", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if (strcmp(arg, "*") == 0) { + /* Use the namespace associated with the connection (if present). */ + newsrv->flags |= SRV_F_USE_NS_FROM_PP; + return 0; + } + + /* + * As this parser may be called several times for the same 'default-server' + * object, or for a new 'server' instance deriving from a 'default-server' + * one with SRV_F_USE_NS_FROM_PP flag enabled, let's reset it. + */ + newsrv->flags &= ~SRV_F_USE_NS_FROM_PP; + + newsrv->netns = netns_store_lookup(arg, strlen(arg)); + if (!newsrv->netns) + newsrv->netns = netns_store_insert(arg); + + if (!newsrv->netns) { + memprintf(err, "Cannot open namespace '%s'", arg); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +#else + memprintf(err, "'%s': '%s' option not implemented", args[0], args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; +#endif +} + +/* Parse the "no-backup" server keyword */ +static int srv_parse_no_backup(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + newsrv->flags &= ~SRV_F_BACKUP; + return 0; +} + + +/* Disable server PROXY protocol flags. */ +static inline int srv_disable_pp_flags(struct server *srv, unsigned int flags) +{ + srv->pp_opts &= ~flags; + return 0; +} + +/* Parse the "no-send-proxy" server keyword */ +static int srv_parse_no_send_proxy(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + return srv_disable_pp_flags(newsrv, SRV_PP_V1); +} + +/* Parse the "no-send-proxy-v2" server keyword */ +static int srv_parse_no_send_proxy_v2(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + return srv_disable_pp_flags(newsrv, SRV_PP_V2); +} + +/* Parse the "shard" server keyword */ +static int srv_parse_shard(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + newsrv->shard = atol(args[*cur_arg + 1]); + return 0; +} + +/* Parse the "no-tfo" server keyword */ +static int srv_parse_no_tfo(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + newsrv->flags &= ~SRV_F_FASTOPEN; + return 0; +} + +/* Parse the "non-stick" server keyword */ +static int srv_parse_non_stick(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + newsrv->flags |= SRV_F_NON_STICK; + return 0; +} + +/* Enable server PROXY protocol flags. 
*/ +static inline int srv_enable_pp_flags(struct server *srv, unsigned int flags) +{ + srv->pp_opts |= flags; + return 0; +} +/* parse the "proto" server keyword */ +static int srv_parse_proto(char **args, int *cur_arg, + struct proxy *px, struct server *newsrv, char **err) +{ + struct ist proto; + + if (!*args[*cur_arg + 1]) { + memprintf(err, "'%s' : missing value", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + proto = ist(args[*cur_arg + 1]); + newsrv->mux_proto = get_mux_proto(proto); + if (!newsrv->mux_proto) { + memprintf(err, "'%s' : unknown MUX protocol '%s'", args[*cur_arg], args[*cur_arg+1]); + return ERR_ALERT | ERR_FATAL; + } + return 0; +} + +/* parse the "proxy-v2-options" */ +static int srv_parse_proxy_v2_options(char **args, int *cur_arg, + struct proxy *px, struct server *newsrv, char **err) +{ + char *p, *n; + for (p = args[*cur_arg+1]; p; p = n) { + n = strchr(p, ','); + if (n) + *n++ = '\0'; + if (strcmp(p, "ssl") == 0) { + newsrv->pp_opts |= SRV_PP_V2_SSL; + } else if (strcmp(p, "cert-cn") == 0) { + newsrv->pp_opts |= SRV_PP_V2_SSL; + newsrv->pp_opts |= SRV_PP_V2_SSL_CN; + } else if (strcmp(p, "cert-key") == 0) { + newsrv->pp_opts |= SRV_PP_V2_SSL; + newsrv->pp_opts |= SRV_PP_V2_SSL_KEY_ALG; + } else if (strcmp(p, "cert-sig") == 0) { + newsrv->pp_opts |= SRV_PP_V2_SSL; + newsrv->pp_opts |= SRV_PP_V2_SSL_SIG_ALG; + } else if (strcmp(p, "ssl-cipher") == 0) { + newsrv->pp_opts |= SRV_PP_V2_SSL; + newsrv->pp_opts |= SRV_PP_V2_SSL_CIPHER; + } else if (strcmp(p, "authority") == 0) { + newsrv->pp_opts |= SRV_PP_V2_AUTHORITY; + } else if (strcmp(p, "crc32c") == 0) { + newsrv->pp_opts |= SRV_PP_V2_CRC32C; + } else if (strcmp(p, "unique-id") == 0) { + newsrv->pp_opts |= SRV_PP_V2_UNIQUE_ID; + } else + goto fail; + } + return 0; + fail: + if (err) + memprintf(err, "'%s' : proxy v2 option not implemented", p); + return ERR_ALERT | ERR_FATAL; +} + +/* Parse the "observe" server keyword */ +static int srv_parse_observe(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + char *arg; + + arg = args[*cur_arg + 1]; + if (!*arg) { + memprintf(err, "'%s' expects <mode> as argument.\n", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if (strcmp(arg, "none") == 0) { + newsrv->observe = HANA_OBS_NONE; + } + else if (strcmp(arg, "layer4") == 0) { + newsrv->observe = HANA_OBS_LAYER4; + } + else if (strcmp(arg, "layer7") == 0) { + if (curproxy->mode != PR_MODE_HTTP) { + memprintf(err, "'%s' can only be used in http proxies.\n", arg); + return ERR_ALERT; + } + newsrv->observe = HANA_OBS_LAYER7; + } + else { + memprintf(err, "'%s' expects one of 'none', 'layer4', 'layer7' " + "but got '%s'\n", args[*cur_arg], arg); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +/* Parse the "on-error" server keyword */ +static int srv_parse_on_error(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + if (strcmp(args[*cur_arg + 1], "fastinter") == 0) + newsrv->onerror = HANA_ONERR_FASTINTER; + else if (strcmp(args[*cur_arg + 1], "fail-check") == 0) + newsrv->onerror = HANA_ONERR_FAILCHK; + else if (strcmp(args[*cur_arg + 1], "sudden-death") == 0) + newsrv->onerror = HANA_ONERR_SUDDTH; + else if (strcmp(args[*cur_arg + 1], "mark-down") == 0) + newsrv->onerror = HANA_ONERR_MARKDWN; + else { + memprintf(err, "'%s' expects one of 'fastinter', " + "'fail-check', 'sudden-death' or 'mark-down' but got '%s'", + args[*cur_arg], args[*cur_arg + 1]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +/* Parse the 
"on-marked-down" server keyword */ +static int srv_parse_on_marked_down(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + if (strcmp(args[*cur_arg + 1], "shutdown-sessions") == 0) + newsrv->onmarkeddown = HANA_ONMARKEDDOWN_SHUTDOWNSESSIONS; + else { + memprintf(err, "'%s' expects 'shutdown-sessions' but got '%s'", + args[*cur_arg], args[*cur_arg + 1]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +/* Parse the "on-marked-up" server keyword */ +static int srv_parse_on_marked_up(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + if (strcmp(args[*cur_arg + 1], "shutdown-backup-sessions") == 0) + newsrv->onmarkedup = HANA_ONMARKEDUP_SHUTDOWNBACKUPSESSIONS; + else { + memprintf(err, "'%s' expects 'shutdown-backup-sessions' but got '%s'", + args[*cur_arg], args[*cur_arg + 1]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + +/* Parse the "redir" server keyword */ +static int srv_parse_redir(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + char *arg; + + arg = args[*cur_arg + 1]; + if (!*arg) { + memprintf(err, "'%s' expects <prefix> as argument.\n", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + free(newsrv->rdr_pfx); + newsrv->rdr_pfx = strdup(arg); + newsrv->rdr_len = strlen(arg); + + return 0; +} + +/* Parse the "resolvers" server keyword */ +static int srv_parse_resolvers(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + free(newsrv->resolvers_id); + newsrv->resolvers_id = strdup(args[*cur_arg + 1]); + return 0; +} + +/* Parse the "resolve-net" server keyword */ +static int srv_parse_resolve_net(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + char *p, *e; + unsigned char mask; + struct resolv_options *opt; + + if (!args[*cur_arg + 1] || args[*cur_arg + 1][0] == '\0') { + memprintf(err, "'%s' expects a list of networks.", + args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + opt = &newsrv->resolv_opts; + + /* Split arguments by comma, and convert it from ipv4 or ipv6 + * string network in in_addr or in6_addr. + */ + p = args[*cur_arg + 1]; + e = p; + while (*p != '\0') { + /* If no room available, return error. */ + if (opt->pref_net_nb >= SRV_MAX_PREF_NET) { + memprintf(err, "'%s' exceed %d networks.", + args[*cur_arg], SRV_MAX_PREF_NET); + return ERR_ALERT | ERR_FATAL; + } + /* look for end or comma. */ + while (*e != ',' && *e != '\0') + e++; + if (*e == ',') { + *e = '\0'; + e++; + } + if (str2net(p, 0, &opt->pref_net[opt->pref_net_nb].addr.in4, + &opt->pref_net[opt->pref_net_nb].mask.in4)) { + /* Try to convert input string from ipv4 or ipv6 network. */ + opt->pref_net[opt->pref_net_nb].family = AF_INET; + } else if (str62net(p, &opt->pref_net[opt->pref_net_nb].addr.in6, + &mask)) { + /* Try to convert input string from ipv6 network. */ + len2mask6(mask, &opt->pref_net[opt->pref_net_nb].mask.in6); + opt->pref_net[opt->pref_net_nb].family = AF_INET6; + } else { + /* All network conversions fail, return error. 
+/* Parse the "resolve-opts" server keyword */
+static int srv_parse_resolve_opts(char **args, int *cur_arg,
+ struct proxy *curproxy, struct server *newsrv, char **err)
+{
+ char *p, *end;
+
+ for (p = args[*cur_arg + 1]; *p; p = end) {
+ /* cut on next comma */
+ for (end = p; *end && *end != ','; end++);
+ if (*end)
+ *(end++) = 0;
+
+ if (strcmp(p, "allow-dup-ip") == 0) {
+ newsrv->resolv_opts.accept_duplicate_ip = 1;
+ }
+ else if (strcmp(p, "ignore-weight") == 0) {
+ newsrv->resolv_opts.ignore_weight = 1;
+ }
+ else if (strcmp(p, "prevent-dup-ip") == 0) {
+ newsrv->resolv_opts.accept_duplicate_ip = 0;
+ }
+ else {
+ memprintf(err, "'%s' : unknown resolve-opts option '%s', supported methods are 'allow-dup-ip', 'ignore-weight', and 'prevent-dup-ip'.",
+ args[*cur_arg], p);
+ return ERR_ALERT | ERR_FATAL;
+ }
+ }
+
+ return 0;
+}
+
+/* Parse the "resolve-prefer" server keyword */
+static int srv_parse_resolve_prefer(char **args, int *cur_arg,
+ struct proxy *curproxy, struct server *newsrv, char **err)
+{
+ if (strcmp(args[*cur_arg + 1], "ipv4") == 0)
+ newsrv->resolv_opts.family_prio = AF_INET;
+ else if (strcmp(args[*cur_arg + 1], "ipv6") == 0)
+ newsrv->resolv_opts.family_prio = AF_INET6;
+ else {
+ memprintf(err, "'%s' expects either ipv4 or ipv6 as argument.",
+ args[*cur_arg]);
+ return ERR_ALERT | ERR_FATAL;
+ }
+
+ return 0;
+}
+
+/* Parse the "send-proxy" server keyword */
+static int srv_parse_send_proxy(char **args, int *cur_arg,
+ struct proxy *curproxy, struct server *newsrv, char **err)
+{
+ return srv_enable_pp_flags(newsrv, SRV_PP_V1);
+}
+
+/* Parse the "send-proxy-v2" server keyword */
+static int srv_parse_send_proxy_v2(char **args, int *cur_arg,
+ struct proxy *curproxy, struct server *newsrv, char **err)
+{
+ return srv_enable_pp_flags(newsrv, SRV_PP_V2);
+}
+
+/* Parse the "set-proxy-v2-tlv-fmt" server keyword */
+static int srv_parse_set_proxy_v2_tlv_fmt(char **args, int *cur_arg,
+ struct proxy *px, struct server *newsrv, char **err)
+{
+ char *error = NULL, *cmd = NULL;
+ unsigned int tlv_type = 0;
+ struct srv_pp_tlv_list *srv_tlv = NULL;
+
+ cmd = args[*cur_arg];
+ if (!*cmd) {
+ memprintf(err, "'%s' : could not read set-proxy-v2-tlv-fmt command", args[*cur_arg]);
+ goto fail;
+ }
+
+ cmd += strlen("set-proxy-v2-tlv-fmt");
+
+ if (*cmd == '(') {
+ cmd++; /* skip the '(' */
+ errno = 0;
+ tlv_type = strtoul(cmd, &error, 0); /* convert TLV ID */
+ if (unlikely((cmd == error) || (errno != 0))) {
+ memprintf(err, "'%s' : could not convert TLV ID", args[*cur_arg]);
+ goto fail;
+ }
+ if (*error != ')') {
+ memprintf(err, "'%s' : expects set-proxy-v2-tlv-fmt(<TLV ID>)", args[*cur_arg]);
+ goto fail;
+ }
+ if (tlv_type > 0xFF) {
+ memprintf(err, "'%s' : the maximum allowed TLV ID is %d", args[*cur_arg], 0xFF);
+ goto fail;
+ }
+ }
+
+ srv_tlv = malloc(sizeof(*srv_tlv));
+ if (unlikely(!srv_tlv)) {
+ memprintf(err, "'%s' : failed to allocate TLV entry", args[*cur_arg]);
+ goto fail;
+ }
+ srv_tlv->type = tlv_type;
+ srv_tlv->fmt_string = strdup(args[*cur_arg + 1]);
+ if (unlikely(!srv_tlv->fmt_string)) {
+ memprintf(err, "'%s' : failed to save the format string", args[*cur_arg]);
+ goto fail;
+ }
+
+ LIST_APPEND(&newsrv->pp_tlvs, &srv_tlv->list);
+
+ (*cur_arg)++;
+
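+ /* Editor's note (illustration, not part of the patch): with a
+ * hypothetical configuration line such as:
+ *   server srv1 192.0.2.1:80 send-proxy-v2 set-proxy-v2-tlv-fmt(0xE1) %[str(foo)]
+ * we reach this point with tlv_type == 0xE1 and fmt_string == "%[str(foo)]",
+ * queued on newsrv->pp_tlvs for later log-format parsing.
+ */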
return 0; + + fail: + free(srv_tlv); + errno = 0; + return ERR_ALERT | ERR_FATAL; +} + +/* Parse the "slowstart" server keyword */ +static int srv_parse_slowstart(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + /* slowstart is stored in seconds */ + unsigned int val; + const char *time_err = parse_time_err(args[*cur_arg + 1], &val, TIME_UNIT_MS); + + if (time_err == PARSE_TIME_OVER) { + memprintf(err, "overflow in argument <%s> to <%s> of server %s, maximum value is 2147483647 ms (~24.8 days).", + args[*cur_arg+1], args[*cur_arg], newsrv->id); + return ERR_ALERT | ERR_FATAL; + } + else if (time_err == PARSE_TIME_UNDER) { + memprintf(err, "underflow in argument <%s> to <%s> of server %s, minimum non-null value is 1 ms.", + args[*cur_arg+1], args[*cur_arg], newsrv->id); + return ERR_ALERT | ERR_FATAL; + } + else if (time_err) { + memprintf(err, "unexpected character '%c' in 'slowstart' argument of server %s.", + *time_err, newsrv->id); + return ERR_ALERT | ERR_FATAL; + } + newsrv->slowstart = (val + 999) / 1000; + + return 0; +} + +/* Parse the "source" server keyword */ +static int srv_parse_source(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + char *errmsg; + int port_low, port_high; + struct sockaddr_storage *sk; + + errmsg = NULL; + + if (!*args[*cur_arg + 1]) { + memprintf(err, "'%s' expects <addr>[:<port>[-<port>]], and optionally '%s' <addr>, " + "and '%s' <name> as argument.\n", args[*cur_arg], "usesrc", "interface"); + goto err; + } + + /* 'sk' is statically allocated (no need to be freed). */ + sk = str2sa_range(args[*cur_arg + 1], NULL, &port_low, &port_high, NULL, NULL, NULL, + &errmsg, NULL, NULL, + PA_O_RESOLVE | PA_O_PORT_OK | PA_O_PORT_RANGE | PA_O_STREAM | PA_O_CONNECT); + if (!sk) { + memprintf(err, "'%s %s' : %s\n", args[*cur_arg], args[*cur_arg + 1], errmsg); + goto err; + } + + newsrv->conn_src.opts |= CO_SRC_BIND; + newsrv->conn_src.source_addr = *sk; + + if (port_low != port_high) { + int i; + + newsrv->conn_src.sport_range = port_range_alloc_range(port_high - port_low + 1); + if (!newsrv->conn_src.sport_range) { + ha_alert("Server '%s': Out of memory (sport_range)\n", args[0]); + goto err; + } + for (i = 0; i < newsrv->conn_src.sport_range->size; i++) + newsrv->conn_src.sport_range->ports[i] = port_low + i; + } + + *cur_arg += 2; + while (*(args[*cur_arg])) { + if (strcmp(args[*cur_arg], "usesrc") == 0) { /* address to use outside */ +#if defined(CONFIG_HAP_TRANSPARENT) + if (!*args[*cur_arg + 1]) { + ha_alert("'usesrc' expects <addr>[:<port>], 'client', 'clientip', " + "or 'hdr_ip(name,#)' as argument.\n"); + goto err; + } + if (strcmp(args[*cur_arg + 1], "client") == 0) { + newsrv->conn_src.opts &= ~CO_SRC_TPROXY_MASK; + newsrv->conn_src.opts |= CO_SRC_TPROXY_CLI; + } + else if (strcmp(args[*cur_arg + 1], "clientip") == 0) { + newsrv->conn_src.opts &= ~CO_SRC_TPROXY_MASK; + newsrv->conn_src.opts |= CO_SRC_TPROXY_CIP; + } + else if (!strncmp(args[*cur_arg + 1], "hdr_ip(", 7)) { + char *name, *end; + + name = args[*cur_arg + 1] + 7; + while (isspace((unsigned char)*name)) + name++; + + end = name; + while (*end && !isspace((unsigned char)*end) && *end != ',' && *end != ')') + end++; + + newsrv->conn_src.opts &= ~CO_SRC_TPROXY_MASK; + newsrv->conn_src.opts |= CO_SRC_TPROXY_DYN; + free(newsrv->conn_src.bind_hdr_name); + newsrv->conn_src.bind_hdr_name = calloc(1, end - name + 1); + if (!newsrv->conn_src.bind_hdr_name) { + ha_alert("Server '%s': Out of memory (bind_hdr_name)\n", 
args[0]);
+ goto err;
+ }
+ newsrv->conn_src.bind_hdr_len = end - name;
+ memcpy(newsrv->conn_src.bind_hdr_name, name, end - name);
+ newsrv->conn_src.bind_hdr_name[end - name] = '\0';
+ newsrv->conn_src.bind_hdr_occ = -1;
+
+ /* now look for an occurrence number */
+ while (isspace((unsigned char)*end))
+ end++;
+ if (*end == ',') {
+ end++;
+ name = end;
+ if (*end == '-')
+ end++;
+ while (isdigit((unsigned char)*end))
+ end++;
+ newsrv->conn_src.bind_hdr_occ = strl2ic(name, end - name);
+ }
+
+ if (newsrv->conn_src.bind_hdr_occ < -MAX_HDR_HISTORY) {
+ ha_alert("usesrc hdr_ip(name,num) does not support occurrence values smaller than -%d.\n", MAX_HDR_HISTORY);
+ goto err;
+ }
+ }
+ else {
+ struct sockaddr_storage *sk;
+ int port1, port2;
+
+ /* 'sk' is statically allocated (no need to be freed). */
+ sk = str2sa_range(args[*cur_arg + 1], NULL, &port1, &port2, NULL, NULL, NULL,
+ &errmsg, NULL, NULL,
+ PA_O_RESOLVE | PA_O_PORT_OK | PA_O_STREAM | PA_O_CONNECT);
+ if (!sk) {
+ ha_alert("'%s %s' : %s\n", args[*cur_arg], args[*cur_arg + 1], errmsg);
+ goto err;
+ }
+
+ newsrv->conn_src.tproxy_addr = *sk;
+ newsrv->conn_src.opts |= CO_SRC_TPROXY_ADDR;
+ }
+ global.last_checks |= LSTCHK_NETADM;
+ *cur_arg += 2;
+ continue;
+#else /* no TPROXY support */
+ ha_alert("'usesrc' not allowed here because support for TPROXY was not compiled in.\n");
+ goto err;
+#endif /* defined(CONFIG_HAP_TRANSPARENT) */
+ } /* "usesrc" */
+
+ if (strcmp(args[*cur_arg], "interface") == 0) { /* specifically bind to this interface */
+#ifdef SO_BINDTODEVICE
+ if (!*args[*cur_arg + 1]) {
+ ha_alert("'%s' : missing interface name.\n", args[0]);
+ goto err;
+ }
+ free(newsrv->conn_src.iface_name);
+ newsrv->conn_src.iface_name = strdup(args[*cur_arg + 1]);
+ newsrv->conn_src.iface_len = strlen(newsrv->conn_src.iface_name);
+ global.last_checks |= LSTCHK_NETADM;
+#else
+ ha_alert("'%s' : '%s' option not implemented.\n", args[0], args[*cur_arg]);
+ goto err;
+#endif
+ *cur_arg += 2;
+ continue;
+ }
+ /* this keyword is not an option of "source" */
+ break;
+ } /* while */
+
+ return 0;
+
+ err:
+ free(errmsg);
+ return ERR_ALERT | ERR_FATAL;
+}
+
+/* Parse the "stick" server keyword */
+static int srv_parse_stick(char **args, int *cur_arg,
+ struct proxy *curproxy, struct server *newsrv, char **err)
+{
+ newsrv->flags &= ~SRV_F_NON_STICK;
+ return 0;
+}
+
+/* Parse the "track" server keyword */
+static int srv_parse_track(char **args, int *cur_arg,
+ struct proxy *curproxy, struct server *newsrv, char **err)
+{
+ char *arg;
+
+ arg = args[*cur_arg + 1];
+ if (!*arg) {
+ memprintf(err, "'track' expects [<proxy>/]<server> as argument.\n");
+ return ERR_ALERT | ERR_FATAL;
+ }
+
+ free(newsrv->trackit);
+ newsrv->trackit = strdup(arg);
+
+ return 0;
+}
+
+/* Parse the "socks4" server keyword */
+static int srv_parse_socks4(char **args, int *cur_arg,
+ struct proxy *curproxy, struct server *newsrv, char **err)
+{
+ char *errmsg;
+ int port_low, port_high;
+ struct sockaddr_storage *sk;
+
+ errmsg = NULL;
+
+ if (!*args[*cur_arg + 1]) {
+ memprintf(err, "'%s' expects <addr>:<port> as argument.\n", args[*cur_arg]);
+ goto err;
+ }
+
+ /* 'sk' is statically allocated (no need to be freed).
*/ + sk = str2sa_range(args[*cur_arg + 1], NULL, &port_low, &port_high, NULL, NULL, NULL, + &errmsg, NULL, NULL, + PA_O_RESOLVE | PA_O_PORT_OK | PA_O_PORT_MAND | PA_O_STREAM | PA_O_CONNECT); + if (!sk) { + memprintf(err, "'%s %s' : %s\n", args[*cur_arg], args[*cur_arg + 1], errmsg); + goto err; + } + + newsrv->flags |= SRV_F_SOCKS4_PROXY; + newsrv->socks4_addr = *sk; + + return 0; + + err: + free(errmsg); + return ERR_ALERT | ERR_FATAL; +} + + +/* parse the "tfo" server keyword */ +static int srv_parse_tfo(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + newsrv->flags |= SRV_F_FASTOPEN; + return 0; +} + +/* parse the "usesrc" server keyword */ +static int srv_parse_usesrc(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + memprintf(err, "'%s' only allowed after a '%s' statement.", + "usesrc", "source"); + return ERR_ALERT | ERR_FATAL; +} + +/* parse the "weight" server keyword */ +static int srv_parse_weight(char **args, int *cur_arg, struct proxy *px, struct server *newsrv, char **err) +{ + int w; + + w = atol(args[*cur_arg + 1]); + if (w < 0 || w > SRV_UWGHT_MAX) { + memprintf(err, "weight of server %s is not within 0 and %d (%d).", + newsrv->id, SRV_UWGHT_MAX, w); + return ERR_ALERT | ERR_FATAL; + } + newsrv->uweight = newsrv->iweight = w; + + return 0; +} + +/* Returns 1 if the server has streams pointing to it, and 0 otherwise. + * + * Must be called with the server lock held. + */ +static int srv_has_streams(struct server *srv) +{ + int thr; + + for (thr = 0; thr < global.nbthread; thr++) + if (!MT_LIST_ISEMPTY(&srv->per_thr[thr].streams)) + return 1; + return 0; +} + +/* Shutdown all connections of a server. The caller must pass a termination + * code in <why>, which must be one of SF_ERR_* indicating the reason for the + * shutdown. + * + * Must be called with the server lock held. + */ +void srv_shutdown_streams(struct server *srv, int why) +{ + struct stream *stream; + struct mt_list *elt1, elt2; + int thr; + + for (thr = 0; thr < global.nbthread; thr++) + mt_list_for_each_entry_safe(stream, &srv->per_thr[thr].streams, by_srv, elt1, elt2) + if (stream->srv_conn == srv) + stream_shutdown(stream, why); +} + +/* Shutdown all connections of all backup servers of a proxy. The caller must + * pass a termination code in <why>, which must be one of SF_ERR_* indicating + * the reason for the shutdown. + * + * Must be called with the server lock held. + */ +void srv_shutdown_backup_streams(struct proxy *px, int why) +{ + struct server *srv; + + for (srv = px->srv; srv != NULL; srv = srv->next) + if (srv->flags & SRV_F_BACKUP) + srv_shutdown_streams(srv, why); +} + +static void srv_append_op_chg_cause(struct buffer *msg, struct server *s, enum srv_op_st_chg_cause cause) +{ + switch (cause) { + case SRV_OP_STCHGC_NONE: + break; /* do nothing */ + case SRV_OP_STCHGC_HEALTH: + check_append_info(msg, &s->check); + break; + case SRV_OP_STCHGC_AGENT: + check_append_info(msg, &s->agent); + break; + default: + chunk_appendf(msg, ", %s", srv_op_st_chg_cause(cause)); + break; + } +} + +static void srv_append_adm_chg_cause(struct buffer *msg, struct server *s, enum srv_adm_st_chg_cause cause) +{ + if (cause) + chunk_appendf(msg, " (%s)", srv_adm_st_chg_cause(cause)); +} + +/* Appends some information to a message string related to a server tracking + * or requeued connections info. 
+ *
+ * If <forced> is null and the server tracks another one, a "via"
+ * indication is appended to the message to report the tracked server.
+ * If <xferred> is non-negative, some information about requeued sessions is
+ * provided.
+ *
+ * Must be called with the server lock held.
+ */
+static void srv_append_more(struct buffer *msg, struct server *s,
+ int xferred, int forced)
+{
+ if (!forced && s->track) {
+ chunk_appendf(msg, " via %s/%s", s->track->proxy->id, s->track->id);
+ }
+
+ if (xferred >= 0) {
+ if (s->next_state == SRV_ST_STOPPED)
+ chunk_appendf(msg, ". %d active and %d backup servers left.%s"
+ " %d sessions active, %d requeued, %d remaining in queue",
+ s->proxy->srv_act, s->proxy->srv_bck,
+ (s->proxy->srv_bck && !s->proxy->srv_act) ? " Running on backup." : "",
+ s->cur_sess, xferred, s->queue.length);
+ else
+ chunk_appendf(msg, ". %d active and %d backup servers online.%s"
+ " %d sessions requeued, %d total in queue",
+ s->proxy->srv_act, s->proxy->srv_bck,
+ (s->proxy->srv_bck && !s->proxy->srv_act) ? " Running on backup." : "",
+ xferred, s->queue.length);
+ }
+}
+
+/* Marks server <s> down, regardless of its checks' statuses. The server
+ * transfers queued streams whenever possible to other servers at a sync
+ * point. Maintenance servers are ignored.
+ *
+ * Must be called with the server lock held.
+ */
+void srv_set_stopped(struct server *s, enum srv_op_st_chg_cause cause)
+{
+ struct server *srv;
+
+ if ((s->cur_admin & SRV_ADMF_MAINT) || s->next_state == SRV_ST_STOPPED)
+ return;
+
+ s->next_state = SRV_ST_STOPPED;
+
+ /* propagate changes */
+ srv_update_status(s, 0, cause);
+
+ for (srv = s->trackers; srv; srv = srv->tracknext) {
+ HA_SPIN_LOCK(SERVER_LOCK, &srv->lock);
+ srv_set_stopped(srv, SRV_OP_STCHGC_NONE);
+ HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock);
+ }
+}
+
+/* Marks server <s> up regardless of its checks' statuses and provided it isn't
+ * in maintenance. The server tries to grab requests from the proxy at a sync
+ * point. Maintenance servers are ignored.
+ *
+ * Must be called with the server lock held.
+ */
+void srv_set_running(struct server *s, enum srv_op_st_chg_cause cause)
+{
+ struct server *srv;
+
+ if (s->cur_admin & SRV_ADMF_MAINT)
+ return;
+
+ if (s->next_state == SRV_ST_STARTING || s->next_state == SRV_ST_RUNNING)
+ return;
+
+ s->next_state = SRV_ST_STARTING;
+
+ if (s->slowstart <= 0)
+ s->next_state = SRV_ST_RUNNING;
+
+ /* propagate changes */
+ srv_update_status(s, 0, cause);
+
+ for (srv = s->trackers; srv; srv = srv->tracknext) {
+ HA_SPIN_LOCK(SERVER_LOCK, &srv->lock);
+ srv_set_running(srv, SRV_OP_STCHGC_NONE);
+ HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock);
+ }
+}
+
+/* Marks server <s> stopping regardless of its checks' statuses and provided it
+ * isn't in maintenance. The server tries to redispatch pending requests
+ * to the proxy. Maintenance servers are ignored.
+ *
+ * Must be called with the server lock held.
+ */
+void srv_set_stopping(struct server *s, enum srv_op_st_chg_cause cause)
+{
+ struct server *srv;
+
+ if (s->cur_admin & SRV_ADMF_MAINT)
+ return;
+
+ if (s->next_state == SRV_ST_STOPPING)
+ return;
+
+ s->next_state = SRV_ST_STOPPING;
+
+ /* propagate changes */
+ srv_update_status(s, 0, cause);
+
+ for (srv = s->trackers; srv; srv = srv->tracknext) {
+ HA_SPIN_LOCK(SERVER_LOCK, &srv->lock);
+ srv_set_stopping(srv, SRV_OP_STCHGC_NONE);
+ HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock);
+ }
+}
+
+/* Enables admin flag <mode> (among SRV_ADMF_*) on server <s>. This is used to
+ * enforce either maint mode or drain mode. It is not allowed to set more than
+ * one flag at once.
The equivalent "inherited" flag is propagated to all + * tracking servers. Maintenance mode disables health checks (but not agent + * checks). When either the flag is already set or no flag is passed, nothing + * is done. If <cause> is non-null, it will be displayed at the end of the log + * lines to justify the state change. + * + * Must be called with the server lock held. + */ +void srv_set_admin_flag(struct server *s, enum srv_admin mode, enum srv_adm_st_chg_cause cause) +{ + struct server *srv; + + if (!mode) + return; + + /* stop going down as soon as we meet a server already in the same state */ + if (s->next_admin & mode) + return; + + s->next_admin |= mode; + + /* propagate changes */ + srv_update_status(s, 1, cause); + + /* stop going down if the equivalent flag was already present (forced or inherited) */ + if (((mode & SRV_ADMF_MAINT) && (s->next_admin & ~mode & SRV_ADMF_MAINT)) || + ((mode & SRV_ADMF_DRAIN) && (s->next_admin & ~mode & SRV_ADMF_DRAIN))) + return; + + /* compute the inherited flag to propagate */ + if (mode & SRV_ADMF_MAINT) + mode = SRV_ADMF_IMAINT; + else if (mode & SRV_ADMF_DRAIN) + mode = SRV_ADMF_IDRAIN; + + for (srv = s->trackers; srv; srv = srv->tracknext) { + HA_SPIN_LOCK(SERVER_LOCK, &srv->lock); + srv_set_admin_flag(srv, mode, cause); + HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock); + } +} + +/* Disables admin flag <mode> (among SRV_ADMF_*) on server <s>. This is used to + * stop enforcing either maint mode or drain mode. It is not allowed to set more + * than one flag at once. The equivalent "inherited" flag is propagated to all + * tracking servers. Leaving maintenance mode re-enables health checks. When + * either the flag is already cleared or no flag is passed, nothing is done. + * + * Must be called with the server lock held. + */ +void srv_clr_admin_flag(struct server *s, enum srv_admin mode) +{ + struct server *srv; + + if (!mode) + return; + + /* stop going down as soon as we see the flag is not there anymore */ + if (!(s->next_admin & mode)) + return; + + s->next_admin &= ~mode; + + /* propagate changes */ + srv_update_status(s, 1, SRV_ADM_STCHGC_NONE); + + /* stop going down if the equivalent flag is still present (forced or inherited) */ + if (((mode & SRV_ADMF_MAINT) && (s->next_admin & SRV_ADMF_MAINT)) || + ((mode & SRV_ADMF_DRAIN) && (s->next_admin & SRV_ADMF_DRAIN))) + return; + + if (mode & SRV_ADMF_MAINT) + mode = SRV_ADMF_IMAINT; + else if (mode & SRV_ADMF_DRAIN) + mode = SRV_ADMF_IDRAIN; + + for (srv = s->trackers; srv; srv = srv->tracknext) { + HA_SPIN_LOCK(SERVER_LOCK, &srv->lock); + srv_clr_admin_flag(srv, mode); + HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock); + } +} + +/* principle: propagate maint and drain to tracking servers. This is useful + * upon startup so that inherited states are correct. + */ +static void srv_propagate_admin_state(struct server *srv) +{ + struct server *srv2; + + if (!srv->trackers) + return; + + for (srv2 = srv->trackers; srv2; srv2 = srv2->tracknext) { + HA_SPIN_LOCK(SERVER_LOCK, &srv2->lock); + if (srv->next_admin & (SRV_ADMF_MAINT | SRV_ADMF_CMAINT)) + srv_set_admin_flag(srv2, SRV_ADMF_IMAINT, SRV_ADM_STCHGC_NONE); + + if (srv->next_admin & SRV_ADMF_DRAIN) + srv_set_admin_flag(srv2, SRV_ADMF_IDRAIN, SRV_ADM_STCHGC_NONE); + HA_SPIN_UNLOCK(SERVER_LOCK, &srv2->lock); + } +} + +/* Compute and propagate the admin states for all servers in proxy <px>. + * Only servers *not* tracking another one are considered, because other + * ones will be handled when the server they track is visited. 
+ */
+void srv_compute_all_admin_states(struct proxy *px)
+{
+ struct server *srv;
+
+ for (srv = px->srv; srv; srv = srv->next) {
+ if (srv->track)
+ continue;
+ srv_propagate_admin_state(srv);
+ }
+}
+
+/* Note: must not be declared <const> as its list will be overwritten.
+ * Please take care of keeping this list alphabetically sorted, doing so helps
+ * all code contributors.
+ * Optional keywords are also declared with a NULL ->parse() function so that
+ * the config parser can report an appropriate error when a known keyword was
+ * not enabled.
+ * Note: -1 as ->skip value means that the number of arguments is variable.
+ */
+static struct srv_kw_list srv_kws = { "ALL", { }, {
+ { "backup", srv_parse_backup, 0, 1, 1 }, /* Flag as backup server */
+ { "cookie", srv_parse_cookie, 1, 1, 0 }, /* Assign a cookie to the server */
+ { "disabled", srv_parse_disabled, 0, 1, 1 }, /* Start the server in 'disabled' state */
+ { "enabled", srv_parse_enabled, 0, 1, 1 }, /* Start the server in 'enabled' state */
+ { "error-limit", srv_parse_error_limit, 1, 1, 1 }, /* Configure the consecutive count of check failures to consider a server on error */
+ { "ws", srv_parse_ws, 1, 1, 1 }, /* websocket protocol */
+ { "id", srv_parse_id, 1, 0, 1 }, /* set id# of server */
+ { "init-addr", srv_parse_init_addr, 1, 1, 0 }, /* */
+ { "log-bufsize", srv_parse_log_bufsize, 1, 1, 0 }, /* Set the ring bufsize for log server (only for log backends) */
+ { "log-proto", srv_parse_log_proto, 1, 1, 0 }, /* Set the protocol for event messages, only relevant in a log or ring section */
+ { "maxconn", srv_parse_maxconn, 1, 1, 1 }, /* Set the max number of concurrent connections */
+ { "maxqueue", srv_parse_maxqueue, 1, 1, 1 }, /* Set the max number of connections to put in queue */
+ { "max-reuse", srv_parse_max_reuse, 1, 1, 0 }, /* Set the max number of requests on a connection, -1 means unlimited */
+ { "minconn", srv_parse_minconn, 1, 1, 1 }, /* Enable a dynamic maxconn limit */
+ { "namespace", srv_parse_namespace, 1, 1, 0 }, /* Namespace the server socket belongs to (if supported) */
+ { "no-backup", srv_parse_no_backup, 0, 1, 1 }, /* Flag as non-backup server */
+ { "no-send-proxy", srv_parse_no_send_proxy, 0, 1, 1 }, /* Disable use of PROXY V1 protocol */
+ { "no-send-proxy-v2", srv_parse_no_send_proxy_v2, 0, 1, 1 }, /* Disable use of PROXY V2 protocol */
+ { "no-tfo", srv_parse_no_tfo, 0, 1, 1 }, /* Disable use of TCP Fast Open */
+ { "non-stick", srv_parse_non_stick, 0, 1, 0 }, /* Disable stick-table persistence */
+ { "observe", srv_parse_observe, 1, 1, 1 }, /* Enables health adjusting based on observing communication with the server */
+ { "on-error", srv_parse_on_error, 1, 1, 1 }, /* Configure the action on check failure */
+ { "on-marked-down", srv_parse_on_marked_down, 1, 1, 1 }, /* Configure the action when a server is marked down */
+ { "on-marked-up", srv_parse_on_marked_up, 1, 1, 1 }, /* Configure the action when a server is marked up */
+ { "pool-low-conn", srv_parse_pool_low_conn, 1, 1, 1 }, /* Set the min number of orphan idle connections before being allowed to pick from other threads */
+ { "pool-max-conn", srv_parse_pool_max_conn, 1, 1, 1 }, /* Set the max number of orphan idle connections, -1 means unlimited */
+ { "pool-purge-delay", srv_parse_pool_purge_delay, 1, 1, 1 }, /* Set the time before we destroy orphan idle connections, defaults to 1s */
+ { "proto", srv_parse_proto, 1, 1, 1 }, /* Set the proto to use for all outgoing connections */
+ { "proxy-v2-options", srv_parse_proxy_v2_options, 1, 1, 1
}, /* options for send-proxy-v2 */ + { "redir", srv_parse_redir, 1, 1, 0 }, /* Enable redirection mode */ + { "resolve-net", srv_parse_resolve_net, 1, 1, 0 }, /* Set the preferred network range for name resolution */ + { "resolve-opts", srv_parse_resolve_opts, 1, 1, 0 }, /* Set options for name resolution */ + { "resolve-prefer", srv_parse_resolve_prefer, 1, 1, 0 }, /* Set the preferred family for name resolution */ + { "resolvers", srv_parse_resolvers, 1, 1, 0 }, /* Configure the resolver to use for name resolution */ + { "send-proxy", srv_parse_send_proxy, 0, 1, 1 }, /* Enforce use of PROXY V1 protocol */ + { "send-proxy-v2", srv_parse_send_proxy_v2, 0, 1, 1 }, /* Enforce use of PROXY V2 protocol */ + { "set-proxy-v2-tlv-fmt", srv_parse_set_proxy_v2_tlv_fmt, 0, 1, 1 }, /* Set TLV of PROXY V2 protocol */ + { "shard", srv_parse_shard, 1, 1, 1 }, /* Server shard (only in peers protocol context) */ + { "slowstart", srv_parse_slowstart, 1, 1, 1 }, /* Set the warm-up timer for a previously failed server */ + { "source", srv_parse_source, -1, 1, 1 }, /* Set the source address to be used to connect to the server */ + { "stick", srv_parse_stick, 0, 1, 0 }, /* Enable stick-table persistence */ + { "tfo", srv_parse_tfo, 0, 1, 1 }, /* enable TCP Fast Open of server */ + { "track", srv_parse_track, 1, 1, 1 }, /* Set the current state of the server, tracking another one */ + { "socks4", srv_parse_socks4, 1, 1, 0 }, /* Set the socks4 proxy of the server*/ + { "usesrc", srv_parse_usesrc, 0, 1, 1 }, /* safe-guard against usesrc without preceding <source> keyword */ + { "weight", srv_parse_weight, 1, 1, 1 }, /* Set the load-balancing weight */ + { NULL, NULL, 0 }, +}}; + +INITCALL1(STG_REGISTER, srv_register_keywords, &srv_kws); + +/* Recomputes the server's eweight based on its state, uweight, the current time, + * and the proxy's algorithm. To be used after updating sv->uweight. The warmup + * state is automatically disabled if the time is elapsed. If <must_update> is + * not zero, the update will be propagated immediately. + * + * Must be called with the server lock held. + */ +void server_recalc_eweight(struct server *sv, int must_update) +{ + struct proxy *px = sv->proxy; + unsigned w; + + if (ns_to_sec(now_ns) < sv->last_change || ns_to_sec(now_ns) >= sv->last_change + sv->slowstart) { + /* go to full throttle if the slowstart interval is reached */ + if (sv->next_state == SRV_ST_STARTING) + sv->next_state = SRV_ST_RUNNING; + } + + /* We must take care of not pushing the server to full throttle during slow starts. + * It must also start immediately, at least at the minimal step when leaving maintenance. + */ + if ((sv->next_state == SRV_ST_STARTING) && (px->lbprm.algo & BE_LB_PROP_DYN)) + w = (px->lbprm.wdiv * (ns_to_sec(now_ns) - sv->last_change) + sv->slowstart) / sv->slowstart; + else + w = px->lbprm.wdiv; + + sv->next_eweight = (sv->uweight * w + px->lbprm.wmult - 1) / px->lbprm.wmult; + + /* propagate changes only if needed (i.e. not recursively) */ + if (must_update) + srv_update_status(sv, 0, SRV_OP_STCHGC_NONE); +} + +/* + * Parses weight_str and configures sv accordingly. + * Returns NULL on success, error message string otherwise. + * + * Must be called with the server lock held. + */ +const char *server_parse_weight_change_request(struct server *sv, + const char *weight_str) +{ + struct proxy *px; + long int w; + char *end; + + px = sv->proxy; + + /* if the weight is terminated with '%', it is set relative to + * the initial weight, otherwise it is absolute. 
+ */
+ if (!*weight_str)
+ return "Require <weight> or <weight%>.\n";
+
+ w = strtol(weight_str, &end, 10);
+ if (end == weight_str)
+ return "Weight string empty or preceded by garbage";
+ else if (end[0] == '%' && end[1] == '\0') {
+ if (w < 0)
+ return "Relative weight must be positive.\n";
+ /* Avoid integer overflow */
+ if (w > 25600)
+ w = 25600;
+ w = sv->iweight * w / 100;
+ if (w > 256)
+ w = 256;
+ }
+ else if (w < 0 || w > 256)
+ return "Absolute weight can only be between 0 and 256 inclusive.\n";
+ else if (end[0] != '\0')
+ return "Trailing garbage in weight string";
+
+ if (w && w != sv->iweight && !(px->lbprm.algo & BE_LB_PROP_DYN))
+ return "Backend is using a static LB algorithm and only accepts weights '0%' and '100%'.\n";
+
+ sv->uweight = w;
+ server_recalc_eweight(sv, 1);
+
+ return NULL;
+}
+
+/*
+ * Parses <addr_str> and configures <sv> accordingly. <updater> describes
+ * the source of the change for the associated log message.
+ * Returns:
+ * - error string on error
+ * - NULL on success
+ *
+ * Must be called with the server lock held.
+ */
+const char *server_parse_addr_change_request(struct server *sv,
+ const char *addr_str, const char *updater)
+{
+ unsigned char ip[INET6_ADDRSTRLEN];
+
+ if (inet_pton(AF_INET6, addr_str, ip)) {
+ srv_update_addr(sv, ip, AF_INET6, updater);
+ return NULL;
+ }
+ if (inet_pton(AF_INET, addr_str, ip)) {
+ srv_update_addr(sv, ip, AF_INET, updater);
+ return NULL;
+ }
+
+ return "Could not understand IP address format.\n";
+}
+
+/*
+ * Must be called with the server lock held.
+ */
+const char *server_parse_maxconn_change_request(struct server *sv,
+ const char *maxconn_str)
+{
+ long int v;
+ char *end;
+
+ if (!*maxconn_str)
+ return "Require <maxconn>.\n";
+
+ v = strtol(maxconn_str, &end, 10);
+ if (end == maxconn_str)
+ return "maxconn string empty or preceded by garbage";
+ else if (end[0] != '\0')
+ return "Trailing garbage in maxconn string";
+
+ if (sv->maxconn == sv->minconn) { // static maxconn
+ sv->maxconn = sv->minconn = v;
+ } else { // dynamic maxconn
+ sv->maxconn = v;
+ }
+
+ if (may_dequeue_tasks(sv, sv->proxy))
+ process_srv_queue(sv);
+
+ return NULL;
+}
+
+static struct sample_expr *srv_sni_sample_parse_expr(struct server *srv, struct proxy *px,
+ const char *file, int linenum, char **err)
+{
+ int idx;
+ const char *args[] = {
+ srv->sni_expr,
+ NULL,
+ };
+
+ idx = 0;
+ px->conf.args.ctx = ARGC_SRV;
+
+ return sample_parse_expr((char **)args, &idx, file, linenum, err, &px->conf.args, NULL);
+}
+
+int server_parse_sni_expr(struct server *newsrv, struct proxy *px, char **err)
+{
+ struct sample_expr *expr;
+
+ expr = srv_sni_sample_parse_expr(newsrv, px, px->conf.file, px->conf.line, err);
+ if (!expr) {
+ memprintf(err, "error detected while parsing sni expression : %s", *err);
+ return ERR_ALERT | ERR_FATAL;
+ }
+
+ if (!(expr->fetch->val & SMP_VAL_BE_SRV_CON)) {
+ memprintf(err, "error detected while parsing sni expression : "
+ "fetch method '%s' extracts information from '%s', "
+ "none of which is available here.",
+ newsrv->sni_expr, sample_src_names(expr->fetch->use));
+ return ERR_ALERT | ERR_FATAL;
+ }
+
+ px->http_needed |= !!(expr->fetch->use & SMP_USE_HTTP_ANY);
+ release_sample_expr(newsrv->ssl_ctx.sni);
+ newsrv->ssl_ctx.sni = expr;
+
+ return 0;
+}
+
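+/* Editor's note (not part of the patch): a hypothetical configuration line
+ * whose "sni" expression would go through server_parse_sni_expr() above:
+ *
+ *   server srv1 192.0.2.1:443 ssl verify none sni req.hdr(host)
+ *
+ * "req.hdr(host)" is valid here because its fetch method is usable at
+ * backend connection time (SMP_VAL_BE_SRV_CON) in HTTP mode.
+ */
+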
+static void display_parser_err(const char *file, int linenum, char **args, int cur_arg, int err_code, char **err)
+{
+ char *msg = "error encountered while processing ";
+ char *quote = "'";
+ char *token = args[cur_arg];
+
+ if (err && *err) {
+ indent_msg(err, 2);
+ msg = *err;
+ quote = "";
+ token = "";
+ }
+
+ if (err_code & ERR_WARN && !(err_code & ERR_ALERT))
+ ha_warning("%s%s%s%s.\n", msg, quote, token, quote);
+ else
+ ha_alert("%s%s%s%s.\n", msg, quote, token, quote);
+}
+
+static void srv_conn_src_sport_range_cpy(struct server *srv, const struct server *src)
+{
+ int range_sz;
+
+ range_sz = src->conn_src.sport_range->size;
+ if (range_sz > 0) {
+ srv->conn_src.sport_range = port_range_alloc_range(range_sz);
+ if (srv->conn_src.sport_range != NULL) {
+ int i;
+
+ for (i = 0; i < range_sz; i++) {
+ srv->conn_src.sport_range->ports[i] =
+ src->conn_src.sport_range->ports[i];
+ }
+ }
+ }
+}
+
+/*
+ * Copy <src> server connection source settings to <srv> server, allocating
+ * everything needed.
+ */
+static void srv_conn_src_cpy(struct server *srv, const struct server *src)
+{
+ srv->conn_src.opts = src->conn_src.opts;
+ srv->conn_src.source_addr = src->conn_src.source_addr;
+
+ /* Source port range copy. */
+ if (src->conn_src.sport_range != NULL)
+ srv_conn_src_sport_range_cpy(srv, src);
+
+#ifdef CONFIG_HAP_TRANSPARENT
+ if (src->conn_src.bind_hdr_name != NULL) {
+ srv->conn_src.bind_hdr_name = strdup(src->conn_src.bind_hdr_name);
+ srv->conn_src.bind_hdr_len = strlen(src->conn_src.bind_hdr_name);
+ }
+ srv->conn_src.bind_hdr_occ = src->conn_src.bind_hdr_occ;
+ srv->conn_src.tproxy_addr = src->conn_src.tproxy_addr;
+#endif
+ if (src->conn_src.iface_name != NULL)
+ srv->conn_src.iface_name = strdup(src->conn_src.iface_name);
+}
+
+/*
+ * Copy <src> server SSL settings to <srv> server allocating
+ * everything needed.
+ */
+#if defined(USE_OPENSSL)
+static void srv_ssl_settings_cpy(struct server *srv, const struct server *src)
+{
+ /* <src> is the current proxy's default server and SSL is enabled */
+ BUG_ON(src->ssl_ctx.ctx != NULL); /* the SSL_CTX must never be initialized in a default-server */
+
+ if (src == &srv->proxy->defsrv && src->use_ssl == 1)
+ srv->flags |= SRV_F_DEFSRV_USE_SSL;
+
+ if (src->ssl_ctx.ca_file != NULL)
+ srv->ssl_ctx.ca_file = strdup(src->ssl_ctx.ca_file);
+ if (src->ssl_ctx.crl_file != NULL)
+ srv->ssl_ctx.crl_file = strdup(src->ssl_ctx.crl_file);
+ if (src->ssl_ctx.client_crt != NULL)
+ srv->ssl_ctx.client_crt = strdup(src->ssl_ctx.client_crt);
+
+ srv->ssl_ctx.verify = src->ssl_ctx.verify;
+
+ if (src->ssl_ctx.verify_host != NULL)
+ srv->ssl_ctx.verify_host = strdup(src->ssl_ctx.verify_host);
+ if (src->ssl_ctx.ciphers != NULL)
+ srv->ssl_ctx.ciphers = strdup(src->ssl_ctx.ciphers);
+ if (src->ssl_ctx.options)
+ srv->ssl_ctx.options = src->ssl_ctx.options;
+ if (src->ssl_ctx.methods.flags)
+ srv->ssl_ctx.methods.flags = src->ssl_ctx.methods.flags;
+ if (src->ssl_ctx.methods.min)
+ srv->ssl_ctx.methods.min = src->ssl_ctx.methods.min;
+ if (src->ssl_ctx.methods.max)
+ srv->ssl_ctx.methods.max = src->ssl_ctx.methods.max;
+
+ if (src->ssl_ctx.ciphersuites != NULL)
+ srv->ssl_ctx.ciphersuites = strdup(src->ssl_ctx.ciphersuites);
+ if (src->sni_expr != NULL)
+ srv->sni_expr = strdup(src->sni_expr);
+
+ if (src->ssl_ctx.alpn_str) {
+ srv->ssl_ctx.alpn_str = malloc(src->ssl_ctx.alpn_len);
+ if (srv->ssl_ctx.alpn_str) {
+ memcpy(srv->ssl_ctx.alpn_str, src->ssl_ctx.alpn_str,
+ src->ssl_ctx.alpn_len);
+ srv->ssl_ctx.alpn_len = src->ssl_ctx.alpn_len;
+ }
+ }
+
+ if (src->ssl_ctx.npn_str) {
+ srv->ssl_ctx.npn_str = malloc(src->ssl_ctx.npn_len);
+ if (srv->ssl_ctx.npn_str) {
+ memcpy(srv->ssl_ctx.npn_str, src->ssl_ctx.npn_str,
+ src->ssl_ctx.npn_len);
+ srv->ssl_ctx.npn_len = src->ssl_ctx.npn_len;
+ }
+ }
+}
+
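+/* Editor's note (not part of the patch): srv_ssl_settings_cpy() is what makes
+ * default-server SSL settings inherited, as in this hypothetical snippet:
+ *
+ *   backend be_app
+ *       default-server ssl verify required ca-file /etc/haproxy/ca.pem
+ *       server s1 192.0.2.10:443        # inherits ssl, verify and ca-file
+ *
+ * Strings are strdup()'ed so every server owns private copies of them.
+ */
+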
+/* Activate SSL on server <s>.
+ * Does nothing if there is no change to apply.
+ *
+ * Must be called with the server lock held.
+ */
+void srv_set_ssl(struct server *s, int use_ssl)
+{
+ if (s->use_ssl == use_ssl)
+ return;
+
+ s->use_ssl = use_ssl;
+ if (s->use_ssl)
+ s->xprt = xprt_get(XPRT_SSL);
+ else
+ s->xprt = xprt_get(XPRT_RAW);
+}
+
+#endif /* USE_OPENSSL */
+
+/*
+ * Prepare <srv> for hostname resolution.
+ * May be safely called for a default server, i.e. with a NULL <hostname>.
+ * Returns -1 in case of any allocation failure, 0 if not.
+ */
+int srv_prepare_for_resolution(struct server *srv, const char *hostname)
+{
+ char *hostname_dn;
+ int hostname_len, hostname_dn_len;
+
+ if (!hostname)
+ return 0;
+
+ hostname_len = strlen(hostname);
+ hostname_dn = trash.area;
+ hostname_dn_len = resolv_str_to_dn_label(hostname, hostname_len,
+ hostname_dn, trash.size);
+ if (hostname_dn_len == -1)
+ goto err;
+
+ free(srv->hostname);
+ free(srv->hostname_dn);
+ srv->hostname = strdup(hostname);
+ srv->hostname_dn = strdup(hostname_dn);
+ srv->hostname_dn_len = hostname_dn_len;
+ if (!srv->hostname || !srv->hostname_dn)
+ goto err;
+
+ return 0;
+
+ err:
+ ha_free(&srv->hostname);
+ ha_free(&srv->hostname_dn);
+ return -1;
+}
+
+/*
+ * Copy <src> server settings to <srv> server allocating
+ * everything needed.
+ * This function is not meant to be called at just any time, but only
+ * during server settings parsing or during server allocations from
+ * a server template, and just after having calloc()'ed a new server.
+ * So, <src> may only be a default server (when parsing server settings)
+ * or a server template (during server allocations from a server template).
+ * <srv_tmpl> distinguishes these two cases (must be 1 if <srv> is a template,
+ * 0 if not).
+ */
+void srv_settings_cpy(struct server *srv, const struct server *src, int srv_tmpl)
+{
+ struct srv_pp_tlv_list *srv_tlv = NULL, *new_srv_tlv = NULL;
+
+ /* Connection source settings copy */
+ srv_conn_src_cpy(srv, src);
+
+ if (srv_tmpl) {
+ srv->addr = src->addr;
+ srv->addr_type = src->addr_type;
+ srv->svc_port = src->svc_port;
+ }
+
+ srv->pp_opts = src->pp_opts;
+ if (src->rdr_pfx != NULL) {
+ srv->rdr_pfx = strdup(src->rdr_pfx);
+ srv->rdr_len = src->rdr_len;
+ }
+ if (src->cookie != NULL) {
+ srv->cookie = strdup(src->cookie);
+ srv->cklen = src->cklen;
+ }
+ srv->use_ssl = src->use_ssl;
+ srv->check.addr = src->check.addr;
+ srv->agent.addr = src->agent.addr;
+ srv->check.use_ssl = src->check.use_ssl;
+ srv->check.port = src->check.port;
+ srv->check.sni = src->check.sni;
+ srv->check.alpn_str = src->check.alpn_str;
+ srv->check.alpn_len = src->check.alpn_len;
+ /* Note: 'flags' field has potentially been already initialized.
*/ + srv->flags |= src->flags; + srv->do_check = src->do_check; + srv->do_agent = src->do_agent; + srv->check.inter = src->check.inter; + srv->check.fastinter = src->check.fastinter; + srv->check.downinter = src->check.downinter; + srv->agent.use_ssl = src->agent.use_ssl; + srv->agent.port = src->agent.port; + + if (src->agent.tcpcheck_rules) { + srv->agent.tcpcheck_rules = calloc(1, sizeof(*srv->agent.tcpcheck_rules)); + if (srv->agent.tcpcheck_rules) { + srv->agent.tcpcheck_rules->flags = src->agent.tcpcheck_rules->flags; + srv->agent.tcpcheck_rules->list = src->agent.tcpcheck_rules->list; + LIST_INIT(&srv->agent.tcpcheck_rules->preset_vars); + dup_tcpcheck_vars(&srv->agent.tcpcheck_rules->preset_vars, + &src->agent.tcpcheck_rules->preset_vars); + } + } + + srv->agent.inter = src->agent.inter; + srv->agent.fastinter = src->agent.fastinter; + srv->agent.downinter = src->agent.downinter; + srv->maxqueue = src->maxqueue; + srv->ws = src->ws; + srv->minconn = src->minconn; + srv->maxconn = src->maxconn; + srv->slowstart = src->slowstart; + srv->observe = src->observe; + srv->onerror = src->onerror; + srv->onmarkeddown = src->onmarkeddown; + srv->onmarkedup = src->onmarkedup; + if (src->trackit != NULL) + srv->trackit = strdup(src->trackit); + srv->consecutive_errors_limit = src->consecutive_errors_limit; + srv->uweight = srv->iweight = src->iweight; + + srv->check.send_proxy = src->check.send_proxy; + /* health: up, but will fall down at first failure */ + srv->check.rise = srv->check.health = src->check.rise; + srv->check.fall = src->check.fall; + + /* Here we check if 'disabled' is the default server state */ + if (src->next_admin & (SRV_ADMF_CMAINT | SRV_ADMF_FMAINT)) { + srv->next_admin |= SRV_ADMF_CMAINT | SRV_ADMF_FMAINT; + srv->next_state = SRV_ST_STOPPED; + srv->check.state |= CHK_ST_PAUSED; + srv->check.health = 0; + } + + /* health: up but will fall down at first failure */ + srv->agent.rise = srv->agent.health = src->agent.rise; + srv->agent.fall = src->agent.fall; + + if (src->resolvers_id != NULL) + srv->resolvers_id = strdup(src->resolvers_id); + srv->resolv_opts.family_prio = src->resolv_opts.family_prio; + srv->resolv_opts.accept_duplicate_ip = src->resolv_opts.accept_duplicate_ip; + srv->resolv_opts.ignore_weight = src->resolv_opts.ignore_weight; + if (srv->resolv_opts.family_prio == AF_UNSPEC) + srv->resolv_opts.family_prio = AF_INET6; + memcpy(srv->resolv_opts.pref_net, + src->resolv_opts.pref_net, + sizeof srv->resolv_opts.pref_net); + srv->resolv_opts.pref_net_nb = src->resolv_opts.pref_net_nb; + + srv->init_addr_methods = src->init_addr_methods; + srv->init_addr = src->init_addr; +#if defined(USE_OPENSSL) + srv_ssl_settings_cpy(srv, src); +#endif +#ifdef TCP_USER_TIMEOUT + srv->tcp_ut = src->tcp_ut; +#endif + srv->mux_proto = src->mux_proto; + srv->pool_purge_delay = src->pool_purge_delay; + srv->low_idle_conns = src->low_idle_conns; + srv->max_idle_conns = src->max_idle_conns; + srv->max_reuse = src->max_reuse; + + if (srv_tmpl) + srv->srvrq = src->srvrq; + + srv->netns = src->netns; + srv->check.via_socks4 = src->check.via_socks4; + srv->socks4_addr = src->socks4_addr; + srv->log_bufsize = src->log_bufsize; + + LIST_INIT(&srv->pp_tlvs); + + list_for_each_entry(srv_tlv, &src->pp_tlvs, list) { + new_srv_tlv = malloc(sizeof(*new_srv_tlv)); + if (unlikely(!new_srv_tlv)) { + break; + } + new_srv_tlv->fmt_string = strdup(srv_tlv->fmt_string); + if (unlikely(!new_srv_tlv->fmt_string)) { + free(new_srv_tlv); + break; + } + new_srv_tlv->type = srv_tlv->type; + 
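+ /* editor's note: the new entry now owns a private copy of the format
+ * string and the same TLV type; it only remains to link it below */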
LIST_APPEND(&srv->pp_tlvs, &new_srv_tlv->list);
+ }
+}
+
+/* allocate a server and attach it to the global servers_list. Returns
+ * the server on success, otherwise NULL.
+ */
+struct server *new_server(struct proxy *proxy)
+{
+ struct server *srv;
+
+ srv = calloc(1, sizeof *srv);
+ if (!srv)
+ return NULL;
+
+ srv_take(srv);
+
+ srv->obj_type = OBJ_TYPE_SERVER;
+ srv->proxy = proxy;
+ queue_init(&srv->queue, proxy, srv);
+ LIST_APPEND(&servers_list, &srv->global_list);
+ LIST_INIT(&srv->srv_rec_item);
+ LIST_INIT(&srv->ip_rec_item);
+ LIST_INIT(&srv->pp_tlvs);
+ MT_LIST_INIT(&srv->prev_deleted);
+ event_hdl_sub_list_init(&srv->e_subs);
+ srv->rid = 0; /* rid defaults to 0 */
+
+ srv->next_state = SRV_ST_RUNNING; /* early server setup */
+ srv->last_change = ns_to_sec(now_ns);
+
+ srv->check.obj_type = OBJ_TYPE_CHECK;
+ srv->check.status = HCHK_STATUS_INI;
+ srv->check.server = srv;
+ srv->check.proxy = proxy;
+ srv->check.tcpcheck_rules = &proxy->tcpcheck_rules;
+
+ srv->agent.obj_type = OBJ_TYPE_CHECK;
+ srv->agent.status = HCHK_STATUS_INI;
+ srv->agent.server = srv;
+ srv->agent.proxy = proxy;
+ srv->xprt = srv->check.xprt = srv->agent.xprt = xprt_get(XPRT_RAW);
+
+ srv->extra_counters = NULL;
+#ifdef USE_OPENSSL
+ HA_RWLOCK_INIT(&srv->ssl_ctx.lock);
+#endif
+
+ /* please don't put default server settings here, they are set in
+ * proxy_preset_defaults().
+ */
+ return srv;
+}
+
+/* Increment the server refcount. */
+void srv_take(struct server *srv)
+{
+ HA_ATOMIC_INC(&srv->refcount);
+}
+
+/* deallocate common server parameters (may be used by default-servers) */
+void srv_free_params(struct server *srv)
+{
+ free(srv->cookie);
+ free(srv->rdr_pfx);
+ free(srv->hostname);
+ free(srv->hostname_dn);
+ free((char*)srv->conf.file);
+ free(srv->per_thr);
+ free(srv->per_tgrp);
+ free(srv->curr_idle_thr);
+ free(srv->resolvers_id);
+ free(srv->addr_node.key);
+ free(srv->lb_nodes);
+ if (srv->log_target) {
+ deinit_log_target(srv->log_target);
+ free(srv->log_target);
+ }
+
+ if (xprt_get(XPRT_SSL) && xprt_get(XPRT_SSL)->destroy_srv)
+ xprt_get(XPRT_SSL)->destroy_srv(srv);
+}
+
+/* Deallocate a server <srv> and its members. <srv> must be allocated. For
+ * dynamic servers, its refcount is decremented first. The free operations are
+ * conducted only if the refcount reaches zero.
+ *
+ * As a convenience, <srv->next> is returned if srv is not NULL. It may be useful
+ * when calling srv_drop on the list of servers.
+ */
+struct server *srv_drop(struct server *srv)
+{
+ struct server *next = NULL;
+
+ if (!srv)
+ goto end;
+
+ next = srv->next;
+
+ /* For dynamic servers, decrement the reference counter. Only free the
+ * server when reaching zero.
+ */
+ if (HA_ATOMIC_SUB_FETCH(&srv->refcount, 1))
+ goto end;
+
+ /* make sure we are removed from our 'next->prev_deleted' list
+ * This doesn't require full thread isolation as we're using mt lists
+ * However this could easily be turned into regular list if required
+ * (with the proper use of thread isolation)
+ */
+ MT_LIST_DELETE(&srv->prev_deleted);
+
+ task_destroy(srv->warmup);
+ task_destroy(srv->srvrq_check);
+
+ free(srv->id);
+ srv_free_params(srv);
+
+ HA_SPIN_DESTROY(&srv->lock);
+
+ LIST_DELETE(&srv->global_list);
+ event_hdl_sub_list_destroy(&srv->e_subs);
+
+ EXTRA_COUNTERS_FREE(srv->extra_counters);
+
+ ha_free(&srv);
+
+ end:
+ return next;
+}
+
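+/* Editor's note (not part of the patch): a sketch of the refcounting pattern
+ * implemented by srv_take()/srv_drop(), assuming a valid proxy <px>:
+ *
+ *   struct server *srv = new_server(px);  // new_server() calls srv_take(): refcount 1
+ *   srv_take(srv);                        // second owner: refcount 2
+ *   srv_drop(srv);                        // refcount 1, server kept
+ *   srv_drop(srv);                        // refcount 0, server freed
+ */
+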
+/* Detach server from proxy list. It is supported to call this
+ * even if the server is not yet in the list.
+ */
+static void _srv_detach(struct server *srv)
+{
+ struct proxy *be = srv->proxy;
+
+ if (be->srv == srv) {
+ be->srv = srv->next;
+ }
+ else {
+ struct server *prev;
+
+ for (prev = be->srv; prev && prev->next != srv; prev = prev->next)
+ ;
+ if (prev)
+ prev->next = srv->next;
+ }
+}
+
+/* Remove a server <srv> from a tracking list if <srv> is tracking another
+ * server. No special care is taken if <srv> is tracked itself by another one :
+ * this situation should be avoided by the caller.
+ *
+ * Not thread-safe.
+ */
+static void release_server_track(struct server *srv)
+{
+ struct server *strack = srv->track;
+ struct server **base;
+
+ if (!strack)
+ return;
+
+ for (base = &strack->trackers; *base; base = &((*base)->tracknext)) {
+ if (*base == srv) {
+ *base = srv->tracknext;
+ return;
+ }
+ }
+
+ /* srv not found on the tracking list, this should never happen */
+ BUG_ON(!*base);
+}
+
+/*
+ * Parse as much as possible such a range string argument: low[-high]
+ * Set <nb_low> and <nb_high> values so that they may be reused by this loop
+ * for(int i = nb_low; i <= nb_high; i++)... with nb_low >= 1.
+ * Fails if 'low' < 0 or 'high' is present and not higher than 'low'.
+ * Returns 0 if succeeded, -1 if not.
+ */
+static int _srv_parse_tmpl_range(struct server *srv, const char *arg,
+ int *nb_low, int *nb_high)
+{
+ char *nb_high_arg;
+
+ *nb_high = 0;
+ chunk_printf(&trash, "%s", arg);
+ *nb_low = atoi(trash.area);
+
+ if ((nb_high_arg = strchr(trash.area, '-'))) {
+ *nb_high_arg++ = '\0';
+ *nb_high = atoi(nb_high_arg);
+ }
+ else {
+ *nb_high += *nb_low;
+ *nb_low = 1;
+ }
+
+ if (*nb_low < 0 || *nb_high < *nb_low)
+ return -1;
+
+ return 0;
+}
+
+/* Set the ID of server <srv> by concatenating <prefix> with the number <nb>,
+ * e.g. ("srv", 3) yields the server ID "srv3".
+ *
+ * This function is first intended to be used through parse_server to
+ * initialize a new server on startup.
+ */
+static inline void _srv_parse_set_id_from_prefix(struct server *srv,
+ const char *prefix, int nb)
+{
+ chunk_printf(&trash, "%s%d", prefix, nb);
+ free(srv->id);
+ srv->id = strdup(trash.area);
+}
+
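+/* Editor's note (not part of the patch): per the code of the two helpers
+ * above, hypothetical "server-template" arguments resolve as follows:
+ *
+ *   server-template srv 1-3 ...   -> nb_low = 1, nb_high = 3
+ *   server-template srv 3 ...     -> nb_low = 1, nb_high = 3 as well
+ *
+ * and _srv_parse_set_id_from_prefix(srv, "srv", 2) names a server "srv2".
+ */
+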
+/* Initialize as many servers as possible from the <srv> server template.
+ * Note that a server template is a special server with
+ * a few parameters that differ from a regular server, though it has
+ * been parsed mostly the same way as a server.
+ *
+ * This function is first intended to be used through parse_server to
+ * initialize a new server on startup.
+ *
+ * Returns the number of servers successfully allocated,
+ * 'srv' template included.
+ */
+static int _srv_parse_tmpl_init(struct server *srv, struct proxy *px)
+{
+ int i;
+ struct server *newsrv;
+
+ for (i = srv->tmpl_info.nb_low + 1; i <= srv->tmpl_info.nb_high; i++) {
+ newsrv = new_server(px);
+ if (!newsrv)
+ goto err;
+
+ newsrv->conf.file = strdup(srv->conf.file);
+ newsrv->conf.line = srv->conf.line;
+
+ srv_settings_cpy(newsrv, srv, 1);
+ srv_prepare_for_resolution(newsrv, srv->hostname);
+
+ if (newsrv->sni_expr) {
+ newsrv->ssl_ctx.sni = srv_sni_sample_parse_expr(newsrv, px, NULL, 0, NULL);
+ if (!newsrv->ssl_ctx.sni)
+ goto err;
+ }
+
+ /* append to the list of servers available to receive a hostname */
+ if (newsrv->srvrq)
+ LIST_APPEND(&newsrv->srvrq->attached_servers, &newsrv->srv_rec_item);
+
+ /* Set this new server ID. */
+ _srv_parse_set_id_from_prefix(newsrv, srv->tmpl_info.prefix, i);
+
+ /* Linked backwards first. This will be reestablished after parsing. */
+ newsrv->next = px->srv;
+ px->srv = newsrv;
+ }
+ _srv_parse_set_id_from_prefix(srv, srv->tmpl_info.prefix, srv->tmpl_info.nb_low);
+
+ return i - srv->tmpl_info.nb_low;
+
+ err:
+ _srv_parse_set_id_from_prefix(srv, srv->tmpl_info.prefix, srv->tmpl_info.nb_low);
+ if (newsrv) {
+ release_sample_expr(newsrv->ssl_ctx.sni);
+ free_check(&newsrv->agent);
+ free_check(&newsrv->check);
+ LIST_DELETE(&newsrv->global_list);
+ }
+ free(newsrv);
+ return i - srv->tmpl_info.nb_low;
+}
+
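+/* Editor's note (not part of the patch): the check below matters for log
+ * backends; a hypothetical configuration it would accept:
+ *
+ *   backend be_syslog
+ *       mode log
+ *       server s1 udp@192.0.2.5:514    # dgram address form, accepted
+ *       server s2 tcp@192.0.2.5:601    # stream address form, accepted
+ */
+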
+/* Ensure server config will work with effective proxy mode
+ *
+ * This function is expected to be called after _srv_parse_init() initialization
+ * but only when the effective server's proxy mode is known, which is not always
+ * the case during parsing time, in which case the function will be called during
+ * postparsing thanks to the _srv_postparse() hook below.
+ *
+ * Returns ERR_NONE on success else a combination of ERR_* codes.
+ */
+static int _srv_check_proxy_mode(struct server *srv, char postparse)
+{
+ int err_code = ERR_NONE;
+
+ if (postparse && !(srv->proxy->cap & PR_CAP_LB))
+ return ERR_NONE; /* nothing to do, the check was already performed during parsing */
+
+ if (srv->conf.file)
+ set_usermsgs_ctx(srv->conf.file, srv->conf.line, NULL);
+
+ if (!srv->proxy) {
+ /* proxy mode not known, cannot perform checks (i.e. defaults section) */
+ goto out;
+ }
+
+ if (srv->proxy->mode == PR_MODE_SYSLOG) {
+ /* log backend server (belongs to proxy with mode log enabled):
+ * perform some compatibility checks
+ */
+
+ /* supported address family types are:
+ * - ipv4
+ * - ipv6
+ * (UNSPEC is supported because it means it will be resolved later)
+ */
+ if (srv->addr.ss_family != AF_UNSPEC &&
+ srv->addr.ss_family != AF_INET && srv->addr.ss_family != AF_INET6) {
+ ha_alert("log server address family not supported for log backend server.\n");
+ err_code |= ERR_ALERT | ERR_FATAL;
+ goto out;
+ }
+
+ /* only @tcp or @udp address forms (or equivalent) are supported */
+ if (!(srv->addr_type.xprt_type == PROTO_TYPE_DGRAM && srv->addr_type.proto_type == PROTO_TYPE_DGRAM) &&
+ !(srv->addr_type.xprt_type == PROTO_TYPE_STREAM && srv->addr_type.proto_type == PROTO_TYPE_STREAM)) {
+ ha_alert("log server address type not supported for log backend server.\n");
+ err_code |= ERR_ALERT | ERR_FATAL;
+ }
+ }
+ else {
+ /* for all other proxy modes: only TCP expected as srv's transport type for now */
+ if (srv->addr_type.xprt_type != PROTO_TYPE_STREAM) {
+ ha_alert("unsupported transport for server address in '%s' backend.\n", proxy_mode_str(srv->proxy->mode));
+ err_code |= ERR_ALERT | ERR_FATAL;
+ }
+ }
+ out:
+ if (srv->conf.file)
+ reset_usermsgs_ctx();
+
+ return err_code;
+}
+
+/* Perform some server postparsing checks / tasks:
+ * We must be careful that checks / postinits performed within this function
+ * don't depend on or conflict with other postcheck functions that are registered
+ * using the REGISTER_POST_SERVER_CHECK() hook.
+ *
+ * Returns ERR_NONE on success else a combination of ERR_* codes.
+ */
+static int _srv_postparse(struct server *srv)
+{
+ int err_code = ERR_NONE;
+
+ err_code |= _srv_check_proxy_mode(srv, 1);
+
+ return err_code;
+}
+REGISTER_POST_SERVER_CHECK(_srv_postparse);
+
+/* Allocate a new server pointed to by <srv> and try to parse the first arguments
+ * in <args> as an address for a server or an address-range for a template or
+ * nothing for a default-server. <cur_arg> is incremented to the next argument.
+ *
+ * This function is first intended to be used through parse_server to
+ * initialize a new server on startup.
+ *
+ * A mask of errors is returned. On a parsing error, ERR_FATAL is set. In case
+ * of memory exhaustion, ERR_ABORT is set. If the server cannot be allocated,
+ * <srv> will be set to NULL.
+ */
+static int _srv_parse_init(struct server **srv, char **args, int *cur_arg,
+ struct proxy *curproxy,
+ int parse_flags)
+{
+ struct server *newsrv = NULL;
+ const char *err = NULL;
+ int err_code = 0;
+ char *fqdn = NULL;
+ int tmpl_range_low = 0, tmpl_range_high = 0;
+ char *errmsg = NULL;
+
+ *srv = NULL;
+
+ /* There are no mandatory first arguments for a default server. */
+ if (parse_flags & SRV_PARSE_PARSE_ADDR) {
+ if (parse_flags & SRV_PARSE_TEMPLATE) {
+ if (!*args[3]) {
+ /* 'server-template' line number of argument check.
*/ + ha_alert("'%s' expects <prefix> <nb | range> <addr>[:<port>] as arguments.\n", + args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + err = invalid_prefix_char(args[1]); + } + else { + if (!*args[2]) { + /* 'server' line number of argument check. */ + ha_alert("'%s' expects <name> and <addr>[:<port>] as arguments.\n", + args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + err = invalid_char(args[1]); + } + + if (err) { + ha_alert("character '%c' is not permitted in %s %s '%s'.\n", + *err, args[0], !(parse_flags & SRV_PARSE_TEMPLATE) ? "name" : "prefix", args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + + *cur_arg = 2; + if (parse_flags & SRV_PARSE_TEMPLATE) { + /* Parse server-template <nb | range> arg. */ + if (_srv_parse_tmpl_range(newsrv, args[*cur_arg], &tmpl_range_low, &tmpl_range_high) < 0) { + ha_alert("Wrong %s number or range arg '%s'.\n", + args[0], args[*cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + (*cur_arg)++; + } + + if (!(parse_flags & SRV_PARSE_DEFAULT_SERVER)) { + struct sockaddr_storage *sk; + int port1, port2, port; + + *srv = newsrv = new_server(curproxy); + if (!newsrv) { + ha_alert("out of memory.\n"); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + register_parsing_obj(&newsrv->obj_type); + + if (parse_flags & SRV_PARSE_TEMPLATE) { + newsrv->tmpl_info.nb_low = tmpl_range_low; + newsrv->tmpl_info.nb_high = tmpl_range_high; + } + + if (parse_flags & SRV_PARSE_DYNAMIC) + newsrv->flags |= SRV_F_DYNAMIC; + + /* Note: for a server template, its id is its prefix. + * This is a temporary id which will be used for server allocations to come + * after parsing. + */ + if (!(parse_flags & SRV_PARSE_TEMPLATE)) + newsrv->id = strdup(args[1]); + else + newsrv->tmpl_info.prefix = strdup(args[1]); + + /* several ways to check the port component : + * - IP => port=+0, relative (IPv4 only) + * - IP: => port=+0, relative + * - IP:N => port=N, absolute + * - IP:+N => port=+N, relative + * - IP:-N => port=-N, relative + */ + if (!(parse_flags & SRV_PARSE_PARSE_ADDR)) + goto skip_addr; + + sk = str2sa_range(args[*cur_arg], &port, &port1, &port2, NULL, NULL, &newsrv->addr_type, + &errmsg, NULL, &fqdn, + (parse_flags & SRV_PARSE_INITIAL_RESOLVE ? PA_O_RESOLVE : 0) | PA_O_PORT_OK | + (parse_flags & SRV_PARSE_IN_PEER_SECTION ? PA_O_PORT_MAND : PA_O_PORT_OFS) | + PA_O_STREAM | PA_O_DGRAM | PA_O_XPRT); + if (!sk) { + ha_alert("%s\n", errmsg); + err_code |= ERR_ALERT | ERR_FATAL; + ha_free(&errmsg); + goto out; + } + + if (!port1 || !port2) { + if (sk->ss_family != AF_CUST_RHTTP_SRV) { + /* no port specified, +offset, -offset */ + newsrv->flags |= SRV_F_MAPPORTS; + } + else { + newsrv->flags |= SRV_F_RHTTP; + } + } + + /* save hostname and create associated name resolution */ + if (fqdn) { + if (fqdn[0] == '_') { /* SRV record */ + /* Check if a SRV request already exists, and if not, create it */ + if ((newsrv->srvrq = find_srvrq_by_name(fqdn, curproxy)) == NULL) + newsrv->srvrq = new_resolv_srvrq(newsrv, fqdn); + if (newsrv->srvrq == NULL) { + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + LIST_APPEND(&newsrv->srvrq->attached_servers, &newsrv->srv_rec_item); + } + else if (srv_prepare_for_resolution(newsrv, fqdn) == -1) { + ha_alert("Can't create DNS resolution for server '%s'\n", + newsrv->id); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + + newsrv->addr = *sk; + newsrv->svc_port = port; + /* + * we don't need to lock the server here, because + * we are in the process of initializing. 
+ * + * Note that the server is not attached into the proxy tree if + * this is a dynamic server. + */ + srv_set_addr_desc(newsrv, !(parse_flags & SRV_PARSE_DYNAMIC)); + + if (!newsrv->srvrq && !newsrv->hostname && + !protocol_lookup(newsrv->addr.ss_family, PROTO_TYPE_STREAM, 0)) { + ha_alert("Unknown protocol family %d '%s'\n", + newsrv->addr.ss_family, args[*cur_arg]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + (*cur_arg)++; + skip_addr: + if (!(parse_flags & SRV_PARSE_DYNAMIC)) { + /* Copy default server settings to new server */ + srv_settings_cpy(newsrv, &curproxy->defsrv, 0); + } else { + /* Initialize dynamic server weight to 1 */ + newsrv->uweight = newsrv->iweight = 1; + + /* A dynamic server is disabled on startup */ + newsrv->next_admin = SRV_ADMF_FMAINT; + newsrv->next_state = SRV_ST_STOPPED; + server_recalc_eweight(newsrv, 0); + + /* Set default values for checks */ + newsrv->check.inter = DEF_CHKINTR; + newsrv->check.rise = DEF_RISETIME; + newsrv->check.fall = DEF_FALLTIME; + + newsrv->agent.inter = DEF_CHKINTR; + newsrv->agent.rise = DEF_AGENT_RISETIME; + newsrv->agent.fall = DEF_AGENT_FALLTIME; + } + HA_SPIN_INIT(&newsrv->lock); + } + else { + *srv = newsrv = &curproxy->defsrv; + *cur_arg = 1; + newsrv->resolv_opts.family_prio = AF_INET6; + newsrv->resolv_opts.accept_duplicate_ip = 0; + } + + free(fqdn); + if (!(curproxy->cap & PR_CAP_LB)) { + /* No need to wait for effective proxy mode, it is already known: + * Only general purpose user-declared proxies ("listen", "frontend", "backend") + * offer the possibility to configure the mode of the proxy. Hopefully for us, + * they have the PR_CAP_LB set. + */ + return _srv_check_proxy_mode(newsrv, 0); + } + return 0; + +out: + free(fqdn); + return err_code; +} + +/* Parse the server keyword in <args>. + * <cur_arg> is incremented beyond the keyword optional value. Note that this + * might not be the case if an error is reported. + * + * This function is first intended to be used through parse_server to + * initialize a new server on startup. + * + * A mask of errors is returned. ERR_FATAL is set if the parsing should be + * interrupted. + */ +static int _srv_parse_kw(struct server *srv, char **args, int *cur_arg, + struct proxy *curproxy, + int parse_flags) +{ + int err_code = 0; + struct srv_kw *kw; + const char *best; + char *errmsg = NULL; + + kw = srv_find_kw(args[*cur_arg]); + if (!kw) { + best = srv_find_best_kw(args[*cur_arg]); + if (best) + ha_alert("unknown keyword '%s'; did you mean '%s' maybe ?%s\n", + args[*cur_arg], best, + (parse_flags & SRV_PARSE_PARSE_ADDR) ? "" : + " Hint: no address was expected for this server."); + else + ha_alert("unknown keyword '%s'.%s\n", args[*cur_arg], + (parse_flags & SRV_PARSE_PARSE_ADDR) ? 
"" : + " Hint: no address was expected for this server."); + + return ERR_ALERT | ERR_FATAL; + } + + if (!kw->parse) { + ha_alert("'%s' option is not implemented in this version (check build options)\n", + args[*cur_arg]); + err_code = ERR_ALERT | ERR_FATAL; + goto out; + } + + if ((parse_flags & SRV_PARSE_DEFAULT_SERVER) && !kw->default_ok) { + ha_alert("'%s' option is not accepted in default-server sections\n", + args[*cur_arg]); + err_code = ERR_ALERT; + goto out; + } + else if ((parse_flags & SRV_PARSE_DYNAMIC) && !kw->dynamic_ok) { + ha_alert("'%s' option is not accepted for dynamic server\n", + args[*cur_arg]); + err_code |= ERR_ALERT; + goto out; + } + + err_code = kw->parse(args, cur_arg, curproxy, srv, &errmsg); + if (err_code) { + display_parser_err(NULL, 0, args, *cur_arg, err_code, &errmsg); + free(errmsg); + } + +out: + if (kw->skip != -1) + *cur_arg += 1 + kw->skip; + + return err_code; +} + +/* This function is first intended to be used through parse_server to + * initialize a new server on startup. + */ +static int _srv_parse_sni_expr_init(char **args, int cur_arg, + struct server *srv, struct proxy *proxy, + char **errmsg) +{ + int ret; + + if (!srv->sni_expr) + return 0; + + ret = server_parse_sni_expr(srv, proxy, errmsg); + if (!ret) + return 0; + + return ret; +} + +/* Server initializations finalization. + * Initialize health check, agent check, SNI expression and outgoing TLVs if enabled. + * Must not be called for a default server instance. + * + * This function is first intended to be used through parse_server to + * initialize a new server on startup. + */ +static int _srv_parse_finalize(char **args, int cur_arg, + struct server *srv, struct proxy *px, + int parse_flags) +{ + int ret; + char *errmsg = NULL; + struct srv_pp_tlv_list *srv_tlv = NULL; + + if (srv->do_check && srv->trackit) { + ha_alert("unable to enable checks and tracking at the same time!\n"); + return ERR_ALERT | ERR_FATAL; + } + + if (srv->do_agent && !srv->agent.port) { + ha_alert("server %s does not have agent port. Agent check has been disabled.\n", + srv->id); + return ERR_ALERT | ERR_FATAL; + } + + if ((ret = _srv_parse_sni_expr_init(args, cur_arg, srv, px, &errmsg)) != 0) { + if (errmsg) { + ha_alert("%s\n", errmsg); + free(errmsg); + } + return ret; + } + + /* A dynamic server is disabled on startup. It must not be counted as + * an active backend entry. 
+ */
+ if (!(parse_flags & SRV_PARSE_DYNAMIC)) {
+ if (srv->flags & SRV_F_BACKUP)
+ px->srv_bck++;
+ else
+ px->srv_act++;
+ }
+
+ list_for_each_entry(srv_tlv, &srv->pp_tlvs, list) {
+ LIST_INIT(&srv_tlv->fmt);
+ if (srv_tlv->fmt_string && unlikely(!parse_logformat_string(srv_tlv->fmt_string,
+ srv->proxy, &srv_tlv->fmt, 0, SMP_VAL_BE_SRV_CON, &errmsg))) {
+ if (errmsg) {
+ ha_alert("%s\n", errmsg);
+ free(errmsg);
+ }
+ return ERR_ALERT | ERR_FATAL;
+ }
+ }
+
+ srv_lb_commit_status(srv);
+
+ return 0;
+}
+
+int parse_server(const char *file, int linenum, char **args,
+ struct proxy *curproxy, const struct proxy *defproxy,
+ int parse_flags)
+{
+ struct server *newsrv = NULL;
+ int err_code = 0;
+
+ int cur_arg;
+
+ set_usermsgs_ctx(file, linenum, NULL);
+
+ if (!(parse_flags & SRV_PARSE_DEFAULT_SERVER) && curproxy == defproxy) {
+ ha_alert("'%s' not allowed in 'defaults' section.\n", args[0]);
+ err_code |= ERR_ALERT | ERR_FATAL;
+ goto out;
+ }
+ else if (failifnotcap(curproxy, PR_CAP_BE, file, linenum, args[0], NULL)) {
+ err_code |= ERR_ALERT | ERR_FATAL;
+ goto out;
+ }
+
+ if ((parse_flags & (SRV_PARSE_IN_PEER_SECTION|SRV_PARSE_PARSE_ADDR)) ==
+ (SRV_PARSE_IN_PEER_SECTION|SRV_PARSE_PARSE_ADDR)) {
+ if (!*args[2])
+ return 0;
+ }
+
+ err_code = _srv_parse_init(&newsrv, args, &cur_arg, curproxy,
+ parse_flags);
+
+ /* the servers are linked backwards first */
+ if (newsrv && !(parse_flags & SRV_PARSE_DEFAULT_SERVER)) {
+ newsrv->next = curproxy->srv;
+ curproxy->srv = newsrv;
+ }
+
+ if (err_code & ERR_CODE)
+ goto out;
+
+ if (!newsrv->conf.file) // note: do it only once for default-server
+ newsrv->conf.file = strdup(file);
+ newsrv->conf.line = linenum;
+
+ while (*args[cur_arg]) {
+ err_code = _srv_parse_kw(newsrv, args, &cur_arg, curproxy,
+ parse_flags);
+ if (err_code & ERR_FATAL)
+ goto out;
+ }
+
+ if (!(parse_flags & SRV_PARSE_DEFAULT_SERVER)) {
+ err_code |= _srv_parse_finalize(args, cur_arg, newsrv, curproxy, parse_flags);
+ if (err_code & ERR_FATAL)
+ goto out;
+ }
+
+ if (parse_flags & SRV_PARSE_TEMPLATE)
+ _srv_parse_tmpl_init(newsrv, curproxy);
+
+ /* If the server id is fixed, insert it in the proxy used_id tree.
+ * This is needed to detect a later duplicate id via srv_parse_id.
+ *
+ * If no id is specified, a dynamic one is generated in
+ * check_config_validity.
+ */
+ if (newsrv->flags & SRV_F_FORCED_ID)
+ eb32_insert(&curproxy->conf.used_server_id, &newsrv->conf.id);
+
+ HA_DIAG_WARNING_COND((curproxy->cap & PR_CAP_LB) && !newsrv->uweight,
+ "configured with weight of 0 will never be selected by load balancing algorithms\n");
+
+ reset_usermsgs_ctx();
+ return 0;
+
+ out:
+ reset_usermsgs_ctx();
+ return err_code;
+}
+
+/* Returns a pointer to the first server matching id <id>.
+ * NULL is returned if no match is found.
+ * The lookup is performed in the backend <bk>.
+ */
+struct server *server_find_by_id(struct proxy *bk, int id)
+{
+ struct eb32_node *eb32;
+ struct server *curserver;
+
+ if (!bk || (id == 0))
+ return NULL;
+
+ /* <bk> has no backend capability, so it can't have servers */
+ if (!(bk->cap & PR_CAP_BE))
+ return NULL;
+
+ curserver = NULL;
+
+ eb32 = eb32_lookup(&bk->conf.used_server_id, id);
+ if (eb32)
+ curserver = container_of(eb32, struct server, conf.id);
+
+ return curserver;
+}
+
+/* Returns a pointer to the first server matching either name <name>, or id
+ * if <name> starts with a '#'. NULL is returned if no match is found.
+ * The lookup is performed in the backend <bk>.
+ */
+struct server *server_find_by_name(struct proxy *bk, const char *name)
+{
+ struct server *curserver;
+
+ if (!bk || !name)
+ return NULL;
+
+ /* <bk> has no backend capability, so it can't have servers */
+ if (!(bk->cap & PR_CAP_BE))
+ return NULL;
+
+ curserver = NULL;
+ if (*name == '#') {
+ curserver = server_find_by_id(bk, atoi(name + 1));
+ if (curserver)
+ return curserver;
+ }
+ else {
+ curserver = bk->srv;
+
+ while (curserver && (strcmp(curserver->id, name) != 0))
+ curserver = curserver->next;
+
+ if (curserver)
+ return curserver;
+ }
+
+ return NULL;
+}
+
+struct server *server_find_best_match(struct proxy *bk, char *name, int id, int *diff)
+{
+ struct server *byname;
+ struct server *byid;
+
+ if (!name && !id)
+ return NULL;
+
+ if (diff)
+ *diff = 0;
+
+ byname = byid = NULL;
+
+ if (name) {
+ byname = server_find_by_name(bk, name);
+ if (byname && (!id || byname->puid == id))
+ return byname;
+ }
+
+ /* remaining possibilities:
+ * - name not set
+ * - name set but not found
+ * - name found but ID doesn't match
+ */
+ if (id) {
+ byid = server_find_by_id(bk, id);
+ if (byid) {
+ if (byname) {
+ /* use id only if forced by configuration */
+ if (byid->flags & SRV_F_FORCED_ID) {
+ if (diff)
+ *diff |= 2;
+ return byid;
+ }
+ else {
+ if (diff)
+ *diff |= 1;
+ return byname;
+ }
+ }
+
+ /* remaining possibilities:
+ * - name not set
+ * - name set but not found
+ */
+ if (name && diff)
+ *diff |= 2;
+ return byid;
+ }
+
+ /* id not found */
+ if (byname) {
+ if (diff)
+ *diff |= 1;
+ return byname;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Update a server's current IP address.
+ * ip is a pointer to the new IP address, whose address family is ip_sin_family.
+ * ip is in network format.
+ * updater is a string which contains information about the requester of the update.
+ * updater is used if not NULL.
+ *
+ * A log line and a stderr warning message are generated based on the server's backend options.
+ *
+ * Must be called with the server lock held.
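A short usage sketch of the two lookups above (backend pointer and names hypothetical):

    struct server *s1 = server_find_by_name(be, "web1"); /* linear scan on the name */
    struct server *s2 = server_find_by_name(be, "#3");   /* '#' prefix: lookup by puid 3 */

    int diff;
    struct server *best = server_find_best_match(be, "web1", 3, &diff);
    /* on return, bit 0x1 of <diff> means the name won over a missing or
     * mismatching id, and bit 0x2 means the id won over the name */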
+ */ +int srv_update_addr(struct server *s, void *ip, int ip_sin_family, const char *updater) +{ + union { + struct event_hdl_cb_data_server_inetaddr addr; + struct event_hdl_cb_data_server common; + } cb_data; + struct sockaddr_storage new_addr = { }; // shut up gcc warning + + /* save the new IP family & address if necessary */ + switch (ip_sin_family) { + case AF_INET: + if (s->addr.ss_family == ip_sin_family && + !memcmp(ip, &((struct sockaddr_in *)&s->addr)->sin_addr.s_addr, 4)) + return 0; + break; + case AF_INET6: + if (s->addr.ss_family == ip_sin_family && + !memcmp(ip, &((struct sockaddr_in6 *)&s->addr)->sin6_addr.s6_addr, 16)) + return 0; + break; + }; + + /* generates a log line and a warning on stderr */ + if (1) { + /* book enough space for both IPv4 and IPv6 */ + char oldip[INET6_ADDRSTRLEN]; + char newip[INET6_ADDRSTRLEN]; + + memset(oldip, '\0', INET6_ADDRSTRLEN); + memset(newip, '\0', INET6_ADDRSTRLEN); + + /* copy old IP address in a string */ + switch (s->addr.ss_family) { + case AF_INET: + inet_ntop(s->addr.ss_family, &((struct sockaddr_in *)&s->addr)->sin_addr, oldip, INET_ADDRSTRLEN); + break; + case AF_INET6: + inet_ntop(s->addr.ss_family, &((struct sockaddr_in6 *)&s->addr)->sin6_addr, oldip, INET6_ADDRSTRLEN); + break; + default: + strlcpy2(oldip, "(none)", sizeof(oldip)); + break; + }; + + /* copy new IP address in a string */ + switch (ip_sin_family) { + case AF_INET: + inet_ntop(ip_sin_family, ip, newip, INET_ADDRSTRLEN); + break; + case AF_INET6: + inet_ntop(ip_sin_family, ip, newip, INET6_ADDRSTRLEN); + break; + }; + + /* save log line into a buffer */ + chunk_printf(&trash, "%s/%s changed its IP from %s to %s by %s", + s->proxy->id, s->id, oldip, newip, updater); + + /* write the buffer on stderr */ + ha_warning("%s.\n", trash.area); + + /* send a log */ + send_log(s->proxy, LOG_NOTICE, "%s.\n", trash.area); + } + + /* save the new IP family */ + new_addr.ss_family = ip_sin_family; + /* save the new IP address */ + switch (ip_sin_family) { + case AF_INET: + memcpy(&((struct sockaddr_in *)&new_addr)->sin_addr.s_addr, ip, 4); + break; + case AF_INET6: + memcpy(((struct sockaddr_in6 *)&new_addr)->sin6_addr.s6_addr, ip, 16); + break; + }; + + _srv_event_hdl_prepare(&cb_data.common, s, 0); + _srv_event_hdl_prepare_inetaddr(&cb_data.addr, s, + &new_addr, s->svc_port, !!(s->flags & SRV_F_MAPPORTS), + 0); + + /* server_atomic_sync_task will apply the changes for us */ + _srv_event_hdl_publish(EVENT_HDL_SUB_SERVER_INETADDR, cb_data, s); + + return 0; +} + +/* update agent health check address and port + * addr can be ip4/ip6 or a hostname + * if one error occurs, don't apply anything + * must be called with the server lock held. 
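A hedged usage sketch of srv_update_addr() (address, updater string and the surrounding <srv> pointer are hypothetical); note that the function only publishes an EVENT_HDL_SUB_SERVER_INETADDR event and lets server_atomic_sync_task apply the change:

    struct in_addr ip4;

    if (inet_pton(AF_INET, "203.0.113.10", &ip4) == 1) {
        HA_SPIN_LOCK(SERVER_LOCK, &srv->lock);       /* lock required, see above */
        srv_update_addr(srv, &ip4.s_addr, AF_INET, "example-updater");
        HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock);
    }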
+ */
+const char *srv_update_agent_addr_port(struct server *s, const char *addr, const char *port)
+{
+ struct sockaddr_storage sk;
+ struct buffer *msg;
+ int new_port;
+
+ msg = get_trash_chunk();
+ chunk_reset(msg);
+
+ if (!(s->agent.state & CHK_ST_ENABLED)) {
+ chunk_strcat(msg, "agent checks are not enabled on this server");
+ goto out;
+ }
+ if (addr) {
+ memset(&sk, 0, sizeof(struct sockaddr_storage));
+ if (str2ip(addr, &sk) == NULL) {
+ chunk_appendf(msg, "invalid addr '%s'", addr);
+ goto out;
+ }
+ }
+ if (port) {
+ if (strl2irc(port, strlen(port), &new_port) != 0) {
+ chunk_appendf(msg, "provided port is not an integer");
+ goto out;
+ }
+ if (new_port < 0 || new_port > 65535) {
+ chunk_appendf(msg, "provided port is invalid");
+ goto out;
+ }
+ }
+out:
+ if (msg->data)
+ return msg->area;
+ else {
+ if (addr)
+ set_srv_agent_addr(s, &sk);
+ if (port)
+ set_srv_agent_port(s, new_port);
+ }
+ return NULL;
+}
+
+/* Update the server health check address and port.
+ * addr must be IPv4 or IPv6; it won't be resolved.
+ * If an error occurs, nothing is applied.
+ * Must be called with the server lock held.
+ */
+const char *srv_update_check_addr_port(struct server *s, const char *addr, const char *port)
+{
+ struct sockaddr_storage sk;
+ struct buffer *msg;
+ int new_port;
+
+ msg = get_trash_chunk();
+ chunk_reset(msg);
+
+ if (!(s->check.state & CHK_ST_ENABLED)) {
+ chunk_strcat(msg, "health checks are not enabled on this server");
+ goto out;
+ }
+ if (addr) {
+ memset(&sk, 0, sizeof(struct sockaddr_storage));
+ if (str2ip2(addr, &sk, 0) == NULL) {
+ chunk_appendf(msg, "invalid addr '%s'", addr);
+ goto out;
+ }
+ }
+ if (port) {
+ if (strl2irc(port, strlen(port), &new_port) != 0) {
+ chunk_appendf(msg, "provided port is not an integer");
+ goto out;
+ }
+ if (new_port < 0 || new_port > 65535) {
+ chunk_appendf(msg, "provided port is invalid");
+ goto out;
+ }
+ /* prevent the update of port to 0 if MAPPORTS are in use */
+ if ((s->flags & SRV_F_MAPPORTS) && new_port == 0) {
+ chunk_appendf(msg, "can't unset 'port' since MAPPORTS is in use");
+ goto out;
+ }
+ }
+out:
+ if (msg->data)
+ return msg->area;
+ else {
+ if (addr)
+ s->check.addr = sk;
+ if (port)
+ s->check.port = new_port;
+ }
+ return NULL;
+}
+
+/*
+ * This function updates a server's addr and port only for the AF_INET and
+ * AF_INET6 families.
+ *
+ * The caller can pass its name through <updater> to have it integrated into
+ * the response message returned by the function.
+ *
+ * The function first does the following, in that order:
+ * - validates the new addr and/or port
+ * - checks if an update is required (the new IP or port differs from the current ones)
+ * - checks that the update is allowed:
+ * - don't switch from/to a family other than AF_INET and AF_INET6
+ * - allow all changes if no CHECKS are configured
+ * - if a CHECK is configured:
+ * - when switching to port mapping (SRV_F_MAPPORTS), ensure health checks have their own ports
+ * - applies the required changes to both ADDR and PORT if both 'required' and 'allowed'
+ * conditions are met
+ *
+ * Must be called with the server lock held.
+ */
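At run time this logic is typically exercised through the CLI's 'set server' command handled further below; for instance (hypothetical names):

    set server be1/srv1 addr 203.0.113.7 port 8080   # fixed port
    set server be1/srv1 addr 203.0.113.7 port +100   # switch to a mapped port; rejected
                                                     # unless the health check has its own 'port'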
+const char *srv_update_addr_port(struct server *s, const char *addr, const char *port, char *updater)
+{
+ union {
+ struct event_hdl_cb_data_server_inetaddr addr;
+ struct event_hdl_cb_data_server common;
+ } cb_data;
+ struct sockaddr_storage sa;
+ int ret;
+ char current_addr[INET6_ADDRSTRLEN];
+ uint16_t current_port, new_port = 0;
+ struct buffer *msg;
+ int ip_change = 0;
+ int port_change = 0;
+ uint8_t mapports = !!(s->flags & SRV_F_MAPPORTS);
+
+ msg = get_trash_chunk();
+ chunk_reset(msg);
+
+ if (addr) {
+ memset(&sa, 0, sizeof(struct sockaddr_storage));
+ if (str2ip2(addr, &sa, 0) == NULL) {
+ chunk_printf(msg, "Invalid addr '%s'", addr);
+ goto out;
+ }
+
+ /* changes are allowed on AF_INET* families only */
+ if ((sa.ss_family != AF_INET) && (sa.ss_family != AF_INET6)) {
+ chunk_printf(msg, "Update to families other than AF_INET and AF_INET6 is supported only through the configuration file");
+ goto out;
+ }
+
+ /* collect the currently configured data */
+ memset(current_addr, '\0', sizeof(current_addr));
+ ret = addr_to_str(&s->addr, current_addr, sizeof(current_addr));
+ /* changes are allowed on AF_INET* families only */
+ if ((ret != AF_INET) && (ret != AF_INET6)) {
+ chunk_printf(msg, "Update for the current server address family is only supported through the configuration file");
+ goto out;
+ }
+
+ /* apply ADDR changes if required and allowed
+ * ipcmp returns 0 when both addresses are the same
+ */
+ if (ipcmp(&s->addr, &sa, 0) == 0) {
+ chunk_appendf(msg, "no need to change the addr");
+ goto port;
+ }
+ ip_change = 1;
+
+ /* update report for caller */
+ chunk_printf(msg, "IP changed from '%s' to '%s'", current_addr, addr);
+ }
+
+ port:
+ if (port) {
+ char sign = '\0';
+ char *endptr;
+
+ if (addr)
+ chunk_appendf(msg, ", ");
+
+ /* collect the currently configured data */
+ current_port = s->svc_port;
+
+ sign = *port;
+ errno = 0;
+ new_port = strtol(port, &endptr, 10);
+ if ((errno != 0) || (port == endptr)) {
+ chunk_appendf(msg, "problem converting port '%s' to an int", port);
+ goto out;
+ }
+
+ /* check if the caller requests a mapped port or an offset */
+ if (sign == '-' || (sign == '+')) {
+ /* check if server currently uses port map */
+ if (!(s->flags & SRV_F_MAPPORTS)) {
+ /* a check is configured and we're switching from a fixed port to a
+ * mapped (SRV_F_MAPPORTS) port: prevent the PORT change if the check
+ * doesn't have its own dedicated port */
+ if (!s->check.port) {
+ chunk_appendf(msg, "can't change <port> to port map because it is incompatible with current health check port configuration (use the 'port' statement from the 'server' directive).");
+ goto out;
+ }
+ /* switching from a fixed port to a mapped port always triggers
+ * a port change */
+ port_change = 1;
+ }
+ /* we're already using port maps */
+ else {
+ port_change = current_port != new_port;
+ }
+ }
+ /* fixed port */
+ else {
+ port_change = current_port != new_port;
+ }
+
+ /* apply PORT changes if required and update the response message */
+ if (port_change) {
+ uint16_t new_port_print = new_port;
+
+ /* prepare message */
+ chunk_appendf(msg, "port changed from '");
+ if (s->flags & SRV_F_MAPPORTS)
+ chunk_appendf(msg, "+");
+ chunk_appendf(msg, "%d' to '", current_port);
+
+ if (sign == '-') {
+ mapports = 1;
+ chunk_appendf(msg, "%c", sign);
+ /* only used for the result output */
+ new_port_print = -new_port_print;
+ }
+ else if (sign == '+') {
+ mapports = 1;
+ chunk_appendf(msg, "%c", sign);
+ }
+ else {
+ mapports = 0;
+ }
+
+ chunk_appendf(msg, "%d'", new_port_print);
+ }
+ else { +
chunk_appendf(msg, "no need to change the port"); + } + } + +out: + if (ip_change || port_change) { + _srv_event_hdl_prepare(&cb_data.common, s, 0); + _srv_event_hdl_prepare_inetaddr(&cb_data.addr, s, + ((ip_change) ? &sa : &s->addr), + ((port_change) ? new_port : s->svc_port), mapports, + 1); + + /* server_atomic_sync_task will apply the changes for us */ + _srv_event_hdl_publish(EVENT_HDL_SUB_SERVER_INETADDR, cb_data, s); + } + if (updater) + chunk_appendf(msg, " by '%s'", updater); + chunk_appendf(msg, "\n"); + return msg->area; +} + +/* + * update server status based on result of SRV resolution + * returns: + * 0 if server status is updated + * 1 if server status has not changed + * + * Must be called with the server lock held. + */ +int srvrq_update_srv_status(struct server *s, int has_no_ip) +{ + if (!s->srvrq) + return 1; + + /* since this server has an IP, it can go back in production */ + if (has_no_ip == 0) { + srv_clr_admin_flag(s, SRV_ADMF_RMAINT); + return 1; + } + + if (s->next_admin & SRV_ADMF_RMAINT) + return 1; + + srv_set_admin_flag(s, SRV_ADMF_RMAINT, SRV_ADM_STCHGC_DNS_NOENT); + return 0; +} + +/* + * update server status based on result of name resolution + * returns: + * 0 if server status is updated + * 1 if server status has not changed + * + * Must be called with the server lock held. + */ +int snr_update_srv_status(struct server *s, int has_no_ip) +{ + struct resolvers *resolvers = s->resolvers; + struct resolv_resolution *resolution = (s->resolv_requester ? s->resolv_requester->resolution : NULL); + int exp; + + /* If resolution is NULL we're dealing with SRV records Additional records */ + if (resolution == NULL) + return srvrq_update_srv_status(s, has_no_ip); + + switch (resolution->status) { + case RSLV_STATUS_NONE: + /* status when HAProxy has just (re)started. 
+ * Nothing to do, since the task is already automatically started */ + break; + + case RSLV_STATUS_VALID: + /* + * resume health checks + * server will be turned back on if health check is safe + */ + if (has_no_ip) { + if (s->next_admin & SRV_ADMF_RMAINT) + return 1; + srv_set_admin_flag(s, SRV_ADMF_RMAINT, SRV_ADM_STCHGC_DNS_NOIP); + return 0; + } + + if (!(s->next_admin & SRV_ADMF_RMAINT)) + return 1; + srv_clr_admin_flag(s, SRV_ADMF_RMAINT); + chunk_printf(&trash, "Server %s/%s administratively READY thanks to valid DNS answer", + s->proxy->id, s->id); + + ha_warning("%s.\n", trash.area); + send_log(s->proxy, LOG_NOTICE, "%s.\n", trash.area); + return 0; + + case RSLV_STATUS_NX: + /* stop server if resolution is NX for a long enough period */ + exp = tick_add(resolution->last_valid, resolvers->hold.nx); + if (!tick_is_expired(exp, now_ms)) + break; + + if (s->next_admin & SRV_ADMF_RMAINT) + return 1; + srv_set_admin_flag(s, SRV_ADMF_RMAINT, SRV_ADM_STCHGC_DNS_NX); + return 0; + + case RSLV_STATUS_TIMEOUT: + /* stop server if resolution is TIMEOUT for a long enough period */ + exp = tick_add(resolution->last_valid, resolvers->hold.timeout); + if (!tick_is_expired(exp, now_ms)) + break; + + if (s->next_admin & SRV_ADMF_RMAINT) + return 1; + srv_set_admin_flag(s, SRV_ADMF_RMAINT, SRV_ADM_STCHGC_DNS_TIMEOUT); + return 0; + + case RSLV_STATUS_REFUSED: + /* stop server if resolution is REFUSED for a long enough period */ + exp = tick_add(resolution->last_valid, resolvers->hold.refused); + if (!tick_is_expired(exp, now_ms)) + break; + + if (s->next_admin & SRV_ADMF_RMAINT) + return 1; + srv_set_admin_flag(s, SRV_ADMF_RMAINT, SRV_ADM_STCHGC_DNS_REFUSED); + return 0; + + default: + /* stop server if resolution failed for a long enough period */ + exp = tick_add(resolution->last_valid, resolvers->hold.other); + if (!tick_is_expired(exp, now_ms)) + break; + + if (s->next_admin & SRV_ADMF_RMAINT) + return 1; + srv_set_admin_flag(s, SRV_ADMF_RMAINT, SRV_ADM_STCHGC_DNS_UNSPEC); + return 0; + } + + return 1; +} + +/* + * Server Name Resolution valid response callback + * It expects: + * - <nameserver>: the name server which answered the valid response + * - <response>: buffer containing a valid DNS response + * - <response_len>: size of <response> + * It performs the following actions: + * - ignore response if current ip found and server family not met + * - update with first new ip found if family is met and current IP is not found + * returns: + * 0 on error + * 1 when no error or safe ignore + * + * Must be called with server lock held + */ +int snr_resolution_cb(struct resolv_requester *requester, struct dns_counters *counters) +{ + struct server *s = NULL; + struct resolv_resolution *resolution = NULL; + void *serverip, *firstip; + short server_sin_family, firstip_sin_family; + int ret; + struct buffer *chk = get_trash_chunk(); + int has_no_ip = 0; + + s = objt_server(requester->owner); + if (!s) + return 1; + + if (s->srvrq) { + /* If DNS resolution is disabled ignore it. + * This is the case if the server was associated to + * a SRV record and this record is now expired. + */ + if (s->flags & SRV_F_NO_RESOLUTION) + return 1; + } + + resolution = (s->resolv_requester ? 
s->resolv_requester->resolution : NULL);
+ if (!resolution)
+ return 1;
+
+ /* initializing variables */
+ firstip = NULL; /* pointer to the first valid response found */
+ /* it will be used as the new IP if a change is required */
+ firstip_sin_family = AF_UNSPEC;
+ serverip = NULL; /* current server IP address */
+
+ /* initializing server IP pointer */
+ server_sin_family = s->addr.ss_family;
+ switch (server_sin_family) {
+ case AF_INET:
+ serverip = &((struct sockaddr_in *)&s->addr)->sin_addr.s_addr;
+ break;
+
+ case AF_INET6:
+ serverip = &((struct sockaddr_in6 *)&s->addr)->sin6_addr.s6_addr;
+ break;
+
+ case AF_UNSPEC:
+ break;
+
+ default:
+ goto invalid;
+ }
+
+ ret = resolv_get_ip_from_response(&resolution->response, &s->resolv_opts,
+ serverip, server_sin_family, &firstip,
+ &firstip_sin_family, s);
+
+ switch (ret) {
+ case RSLV_UPD_NO:
+ goto update_status;
+
+ case RSLV_UPD_SRVIP_NOT_FOUND:
+ goto save_ip;
+
+ case RSLV_UPD_NO_IP_FOUND:
+ has_no_ip = 1;
+ goto update_status;
+
+ case RSLV_UPD_NAME_ERROR:
+ /* update resolution status to OTHER error type */
+ resolution->status = RSLV_STATUS_OTHER;
+ has_no_ip = 1;
+ goto update_status;
+
+ default:
+ has_no_ip = 1;
+ goto invalid;
+
+ }
+
+ save_ip:
+ if (counters) {
+ counters->app.resolver.update++;
+ /* save the first ip we found */
+ chunk_printf(chk, "%s/%s", counters->pid, counters->id);
+ }
+ else
+ chunk_printf(chk, "DNS cache");
+ srv_update_addr(s, firstip, firstip_sin_family, (char *) chk->area);
+
+ update_status:
+ if (!snr_update_srv_status(s, has_no_ip) && has_no_ip)
+ memset(&s->addr, 0, sizeof(s->addr));
+ return 1;
+
+ invalid:
+ if (counters) {
+ counters->app.resolver.invalid++;
+ goto update_status;
+ }
+ if (!snr_update_srv_status(s, has_no_ip) && has_no_ip)
+ memset(&s->addr, 0, sizeof(s->addr));
+ return 0;
+}
+
+/*
+ * SRV record error management callback
+ * returns:
+ * 0 if we can trash the answer items.
+ * 1 when safely ignored and we must keep the answer items
+ *
+ * Grabs the server's lock.
+ */
+int srvrq_resolution_error_cb(struct resolv_requester *requester, int error_code)
+{
+ struct resolv_srvrq *srvrq;
+ struct resolv_resolution *res;
+ struct resolvers *resolvers;
+ int exp;
+
+ /* SRV records */
+ srvrq = objt_resolv_srvrq(requester->owner);
+ if (!srvrq)
+ return 0;
+
+ resolvers = srvrq->resolvers;
+ res = requester->resolution;
+
+ switch (res->status) {
+
+ case RSLV_STATUS_NX:
+ /* stop server if resolution is NX for a long enough period */
+ exp = tick_add(res->last_valid, resolvers->hold.nx);
+ if (!tick_is_expired(exp, now_ms))
+ return 1;
+ break;
+
+ case RSLV_STATUS_TIMEOUT:
+ /* stop server if resolution is TIMEOUT for a long enough period */
+ exp = tick_add(res->last_valid, resolvers->hold.timeout);
+ if (!tick_is_expired(exp, now_ms))
+ return 1;
+ break;
+
+ case RSLV_STATUS_REFUSED:
+ /* stop server if resolution is REFUSED for a long enough period */
+ exp = tick_add(res->last_valid, resolvers->hold.refused);
+ if (!tick_is_expired(exp, now_ms))
+ return 1;
+ break;
+
+ default:
+ /* stop server if resolution failed for a long enough period */
+ exp = tick_add(res->last_valid, resolvers->hold.other);
+ if (!tick_is_expired(exp, now_ms))
+ return 1;
+ }
+
+ /* Remove any associated server ref */
+ resolv_detach_from_resolution_answer_items(res, requester);
+
+ return 0;
+}
+
+/*
+ * Server Name Resolution error management callback
+ * returns:
+ * 0 if we can trash the answer items.
+ * 1 when safely ignored and we must keep the answer items
+ *
+ * Grabs the server's lock.
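The hold periods consulted above come from the 'resolvers' configuration section; a minimal sketch (section name, nameserver and durations hypothetical):

    resolvers mydns
        nameserver ns1 192.0.2.53:53
        hold nx      30s   # resolvers->hold.nx
        hold timeout 30s   # resolvers->hold.timeout
        hold refused 30s   # resolvers->hold.refused
        hold other   30s   # resolvers->hold.other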
+ */ +int snr_resolution_error_cb(struct resolv_requester *requester, int error_code) +{ + struct server *s; + + s = objt_server(requester->owner); + if (!s) + return 0; + + HA_SPIN_LOCK(SERVER_LOCK, &s->lock); + if (!snr_update_srv_status(s, 1)) { + memset(&s->addr, 0, sizeof(s->addr)); + HA_SPIN_UNLOCK(SERVER_LOCK, &s->lock); + resolv_detach_from_resolution_answer_items(requester->resolution, requester); + return 0; + } + HA_SPIN_UNLOCK(SERVER_LOCK, &s->lock); + + return 1; +} + +/* + * Function to check if <ip> is already affected to a server in the backend + * which owns <srv> and is up. + * It returns a pointer to the first server found or NULL if <ip> is not yet + * assigned. + * + * Must be called with server lock held + */ +struct server *snr_check_ip_callback(struct server *srv, void *ip, unsigned char *ip_family) +{ + struct server *tmpsrv; + struct proxy *be; + + if (!srv) + return NULL; + + be = srv->proxy; + for (tmpsrv = be->srv; tmpsrv; tmpsrv = tmpsrv->next) { + /* we found the current server is the same, ignore it */ + if (srv == tmpsrv) + continue; + + /* We want to compare the IP in the record with the IP of the servers in the + * same backend, only if: + * * DNS resolution is enabled on the server + * * the hostname used for the resolution by our server is the same than the + * one used for the server found in the backend + * * the server found in the backend is not our current server + */ + HA_SPIN_LOCK(SERVER_LOCK, &tmpsrv->lock); + if ((tmpsrv->hostname_dn == NULL) || + (srv->hostname_dn_len != tmpsrv->hostname_dn_len) || + (strcasecmp(srv->hostname_dn, tmpsrv->hostname_dn) != 0) || + (srv->puid == tmpsrv->puid)) { + HA_SPIN_UNLOCK(SERVER_LOCK, &tmpsrv->lock); + continue; + } + + /* If the server has been taken down, don't consider it */ + if (tmpsrv->next_admin & SRV_ADMF_RMAINT) { + HA_SPIN_UNLOCK(SERVER_LOCK, &tmpsrv->lock); + continue; + } + + /* At this point, we have 2 different servers using the same DNS hostname + * for their respective resolution. + */ + if (*ip_family == tmpsrv->addr.ss_family && + ((tmpsrv->addr.ss_family == AF_INET && + memcmp(ip, &((struct sockaddr_in *)&tmpsrv->addr)->sin_addr, 4) == 0) || + (tmpsrv->addr.ss_family == AF_INET6 && + memcmp(ip, &((struct sockaddr_in6 *)&tmpsrv->addr)->sin6_addr, 16) == 0))) { + HA_SPIN_UNLOCK(SERVER_LOCK, &tmpsrv->lock); + return tmpsrv; + } + HA_SPIN_UNLOCK(SERVER_LOCK, &tmpsrv->lock); + } + + + return NULL; +} + +/* Sets the server's address (srv->addr) from srv->hostname using the libc's + * resolver. This is suited for initial address configuration. Returns 0 on + * success otherwise a non-zero error code. In case of error, *err_code, if + * not NULL, is filled up. + */ +int srv_set_addr_via_libc(struct server *srv, int *err_code) +{ + struct sockaddr_storage new_addr; + + memset(&new_addr, 0, sizeof(new_addr)); + + /* Use the preferred family, if configured */ + new_addr.ss_family = srv->addr.ss_family; + if (str2ip2(srv->hostname, &new_addr, 1) == NULL) { + if (err_code) + *err_code |= ERR_WARN; + return 1; + } + _srv_set_inetaddr(srv, &new_addr); + return 0; +} + +/* Set the server's FDQN (->hostname) from <hostname>. + * Returns -1 if failed, 0 if not. + * + * Must be called with the server lock held. + */ +int srv_set_fqdn(struct server *srv, const char *hostname, int resolv_locked) +{ + struct resolv_resolution *resolution; + char *hostname_dn; + int hostname_len, hostname_dn_len; + + /* Note that the server lock is already held. 
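srv_set_addr_via_libc() above calls str2ip2() with its <resolve> argument set, while srv_apply_lastaddr() further below passes 0 to forbid any resolution; a hedged sketch of the distinction (addresses hypothetical):

    struct sockaddr_storage ss;

    memset(&ss, 0, sizeof(ss));
    ss.ss_family = AF_UNSPEC;              /* or a preferred family */
    str2ip2("www.example.com", &ss, 1);    /* resolve=1: may query the libc resolver */
    str2ip2("192.0.2.1", &ss, 0);          /* resolve=0: pure text parsing, never resolves */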
*/ + if (!srv->resolvers) + return -1; + + if (!resolv_locked) + HA_SPIN_LOCK(DNS_LOCK, &srv->resolvers->lock); + /* run time DNS/SRV resolution was not active for this server + * and we can't enable it at run time for now. + */ + if (!srv->resolv_requester && !srv->srvrq) + goto err; + + chunk_reset(&trash); + hostname_len = strlen(hostname); + hostname_dn = trash.area; + hostname_dn_len = resolv_str_to_dn_label(hostname, hostname_len, + hostname_dn, trash.size); + if (hostname_dn_len == -1) + goto err; + + resolution = (srv->resolv_requester ? srv->resolv_requester->resolution : NULL); + if (resolution && + resolution->hostname_dn && + resolution->hostname_dn_len == hostname_dn_len && + strcasecmp(resolution->hostname_dn, hostname_dn) == 0) + goto end; + + resolv_unlink_resolution(srv->resolv_requester); + + free(srv->hostname); + free(srv->hostname_dn); + srv->hostname = strdup(hostname); + srv->hostname_dn = strdup(hostname_dn); + srv->hostname_dn_len = hostname_dn_len; + if (!srv->hostname || !srv->hostname_dn) + goto err; + + if (srv->flags & SRV_F_NO_RESOLUTION) + goto end; + + if (resolv_link_resolution(srv, OBJ_TYPE_SERVER, 1) == -1) + goto err; + + end: + if (!resolv_locked) + HA_SPIN_UNLOCK(DNS_LOCK, &srv->resolvers->lock); + return 0; + + err: + if (!resolv_locked) + HA_SPIN_UNLOCK(DNS_LOCK, &srv->resolvers->lock); + return -1; +} + +/* Sets the server's address (srv->addr) from srv->lastaddr which was filled + * from the state file. This is suited for initial address configuration. + * Returns 0 on success otherwise a non-zero error code. In case of error, + * *err_code, if not NULL, is filled up. + */ +static int srv_apply_lastaddr(struct server *srv, int *err_code) +{ + struct sockaddr_storage new_addr; + + memset(&new_addr, 0, sizeof(new_addr)); + + /* Use the preferred family, if configured */ + new_addr.ss_family = srv->addr.ss_family; + if (!str2ip2(srv->lastaddr, &new_addr, 0)) { + if (err_code) + *err_code |= ERR_WARN; + return 1; + } + _srv_set_inetaddr(srv, &new_addr); + return 0; +} + +/* returns 0 if no error, otherwise a combination of ERR_* flags */ +static int srv_iterate_initaddr(struct server *srv) +{ + char *name = srv->hostname; + int return_code = 0; + int err_code; + unsigned int methods; + + /* If no addr and no hostname set, get the name from the DNS SRV request */ + if (!name && srv->srvrq) + name = srv->srvrq->name; + + methods = srv->init_addr_methods; + if (!methods) { + /* otherwise default to "last,libc" */ + srv_append_initaddr(&methods, SRV_IADDR_LAST); + srv_append_initaddr(&methods, SRV_IADDR_LIBC); + if (srv->resolvers_id) { + /* dns resolution is configured, add "none" to not fail on startup */ + srv_append_initaddr(&methods, SRV_IADDR_NONE); + } + } + + /* "-dr" : always append "none" so that server addresses resolution + * failures are silently ignored, this is convenient to validate some + * configs out of their environment. 
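The method list iterated below is what the 'init-addr' server keyword controls; the implicit default built above is roughly equivalent to this sketch (hypothetical names):

    server web1 www.example.com:80 check init-addr last,libc,none

Here 'last' tries the address recorded in the server-state file, 'libc' the system resolver, and 'none' places the server in maintenance instead of failing the startup.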
+ */ + if (global.tune.options & GTUNE_RESOLVE_DONTFAIL) + srv_append_initaddr(&methods, SRV_IADDR_NONE); + + while (methods) { + err_code = 0; + switch (srv_get_next_initaddr(&methods)) { + case SRV_IADDR_LAST: + if (!srv->lastaddr) + continue; + if (srv_apply_lastaddr(srv, &err_code) == 0) + goto out; + return_code |= err_code; + break; + + case SRV_IADDR_LIBC: + if (!srv->hostname) + continue; + if (srv_set_addr_via_libc(srv, &err_code) == 0) + goto out; + return_code |= err_code; + break; + + case SRV_IADDR_NONE: + srv_set_admin_flag(srv, SRV_ADMF_RMAINT, SRV_ADM_STCHGC_NONE); + if (return_code) { + ha_notice("could not resolve address '%s', disabling server.\n", + name); + } + return return_code; + + case SRV_IADDR_IP: + _srv_set_inetaddr(srv, &srv->init_addr); + if (return_code) { + ha_warning("could not resolve address '%s', falling back to configured address.\n", + name); + } + goto out; + + default: /* unhandled method */ + break; + } + } + + if (!return_code) + ha_alert("no method found to resolve address '%s'.\n", name); + else + ha_alert("could not resolve address '%s'.\n", name); + + return_code |= ERR_ALERT | ERR_FATAL; + return return_code; +out: + srv_set_dyncookie(srv); + srv_set_addr_desc(srv, 1); + return return_code; +} + +/* + * This function parses all backends and all servers within each backend + * and performs servers' addr resolution based on information provided by: + * - configuration file + * - server-state file (states provided by an 'old' haproxy process) + * + * Returns 0 if no error, otherwise, a combination of ERR_ flags. + */ +int srv_init_addr(void) +{ + struct proxy *curproxy; + int return_code = 0; + + curproxy = proxies_list; + while (curproxy) { + struct server *srv; + + /* servers are in backend only */ + if (!(curproxy->cap & PR_CAP_BE) || (curproxy->flags & (PR_FL_DISABLED|PR_FL_STOPPED))) + goto srv_init_addr_next; + + for (srv = curproxy->srv; srv; srv = srv->next) { + set_usermsgs_ctx(srv->conf.file, srv->conf.line, &srv->obj_type); + if (srv->hostname || srv->srvrq) + return_code |= srv_iterate_initaddr(srv); + reset_usermsgs_ctx(); + } + + srv_init_addr_next: + curproxy = curproxy->next; + } + + return return_code; +} + +/* + * Must be called with the server lock held. + */ +const char *srv_update_fqdn(struct server *server, const char *fqdn, const char *updater, int resolv_locked) +{ + + struct buffer *msg; + + msg = get_trash_chunk(); + chunk_reset(msg); + + if (server->hostname && strcmp(fqdn, server->hostname) == 0) { + chunk_appendf(msg, "no need to change the FDQN"); + goto out; + } + + if (strlen(fqdn) > DNS_MAX_NAME_SIZE || invalid_domainchar(fqdn)) { + chunk_appendf(msg, "invalid fqdn '%s'", fqdn); + goto out; + } + + chunk_appendf(msg, "%s/%s changed its FQDN from %s to %s", + server->proxy->id, server->id, server->hostname, fqdn); + + if (srv_set_fqdn(server, fqdn, resolv_locked) < 0) { + chunk_reset(msg); + chunk_appendf(msg, "could not update %s/%s FQDN", + server->proxy->id, server->id); + goto out; + } + + /* Flag as FQDN set from stats socket. */ + server->next_admin |= SRV_ADMF_HMAINT; + + out: + if (updater) + chunk_appendf(msg, " by '%s'", updater); + chunk_appendf(msg, "\n"); + + return msg->area; +} + + +/* Expects to find a backend and a server in <arg> under the form <backend>/<server>, + * and returns the pointer to the server. Otherwise, display adequate error messages + * on the CLI, sets the CLI's state to CLI_ST_PRINT and returns NULL. This is only + * used for CLI commands requiring a server name. 
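srv_update_fqdn() above is typically reached through the CLI; a hypothetical invocation (names invented):

    set server be1/srv1 fqdn new-host.example.com

As enforced by the 'set server' handler below, this requires a 'resolvers' setup on the server and is refused when the server is driven by a SRV record.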
+ * Important: the <arg> is modified to remove the '/'. + */ +struct server *cli_find_server(struct appctx *appctx, char *arg) +{ + struct proxy *px; + struct server *sv; + struct ist be_name, sv_name = ist(arg); + + be_name = istsplit(&sv_name, '/'); + if (!istlen(sv_name)) { + cli_err(appctx, "Require 'backend/server'."); + return NULL; + } + + if (!(px = proxy_be_by_name(ist0(be_name)))) { + cli_err(appctx, "No such backend."); + return NULL; + } + if (!(sv = server_find_by_name(px, ist0(sv_name)))) { + cli_err(appctx, "No such server."); + return NULL; + } + + if (px->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) { + cli_err(appctx, "Proxy is disabled.\n"); + return NULL; + } + + return sv; +} + + +/* grabs the server lock */ +static int cli_parse_set_server(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct server *sv; + const char *warning; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + sv = cli_find_server(appctx, args[2]); + if (!sv) + return 1; + + if (strcmp(args[3], "weight") == 0) { + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + warning = server_parse_weight_change_request(sv, args[4]); + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + if (warning) + cli_err(appctx, warning); + } + else if (strcmp(args[3], "state") == 0) { + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + if (strcmp(args[4], "ready") == 0) + srv_adm_set_ready(sv); + else if (strcmp(args[4], "drain") == 0) + srv_adm_set_drain(sv); + else if (strcmp(args[4], "maint") == 0) + srv_adm_set_maint(sv); + else + cli_err(appctx, "'set server <srv> state' expects 'ready', 'drain' and 'maint'.\n"); + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + } + else if (strcmp(args[3], "health") == 0) { + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + if (sv->track) + cli_err(appctx, "cannot change health on a tracking server.\n"); + else if (strcmp(args[4], "up") == 0) { + sv->check.health = sv->check.rise + sv->check.fall - 1; + srv_set_running(sv, SRV_OP_STCHGC_CLI); + } + else if (strcmp(args[4], "stopping") == 0) { + sv->check.health = sv->check.rise + sv->check.fall - 1; + srv_set_stopping(sv, SRV_OP_STCHGC_CLI); + } + else if (strcmp(args[4], "down") == 0) { + sv->check.health = 0; + srv_set_stopped(sv, SRV_OP_STCHGC_CLI); + } + else + cli_err(appctx, "'set server <srv> health' expects 'up', 'stopping', or 'down'.\n"); + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + } + else if (strcmp(args[3], "agent") == 0) { + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + if (!(sv->agent.state & CHK_ST_ENABLED)) + cli_err(appctx, "agent checks are not enabled on this server.\n"); + else if (strcmp(args[4], "up") == 0) { + sv->agent.health = sv->agent.rise + sv->agent.fall - 1; + srv_set_running(sv, SRV_OP_STCHGC_CLI); + } + else if (strcmp(args[4], "down") == 0) { + sv->agent.health = 0; + srv_set_stopped(sv, SRV_OP_STCHGC_CLI); + } + else + cli_err(appctx, "'set server <srv> agent' expects 'up' or 'down'.\n"); + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + } + else if (strcmp(args[3], "agent-addr") == 0) { + char *addr = NULL; + char *port = NULL; + if (strlen(args[4]) == 0) { + cli_err(appctx, "set server <b>/<s> agent-addr requires" + " an address and optionally a port.\n"); + goto out; + } + addr = args[4]; + if (strcmp(args[5], "port") == 0) + port = args[6]; + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + warning = srv_update_agent_addr_port(sv, addr, port); + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + if (warning) + cli_msg(appctx, LOG_WARNING, warning); + } + else if (strcmp(args[3], "agent-port") == 0) { + char *port = NULL; + if 
(strlen(args[4]) == 0) { + cli_err(appctx, "set server <b>/<s> agent-port requires" + " a port.\n"); + goto out; + } + port = args[4]; + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + warning = srv_update_agent_addr_port(sv, NULL, port); + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + if (warning) + cli_msg(appctx, LOG_WARNING, warning); + } + else if (strcmp(args[3], "agent-send") == 0) { + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + if (!(sv->agent.state & CHK_ST_ENABLED)) + cli_err(appctx, "agent checks are not enabled on this server.\n"); + else { + if (!set_srv_agent_send(sv, args[4])) + cli_err(appctx, "cannot allocate memory for new string.\n"); + } + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + } + else if (strcmp(args[3], "check-addr") == 0) { + char *addr = NULL; + char *port = NULL; + if (strlen(args[4]) == 0) { + cli_err(appctx, "set server <b>/<s> check-addr requires" + " an address and optionally a port.\n"); + goto out; + } + addr = args[4]; + if (strcmp(args[5], "port") == 0) + port = args[6]; + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + warning = srv_update_check_addr_port(sv, addr, port); + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + if (warning) + cli_msg(appctx, LOG_WARNING, warning); + } + else if (strcmp(args[3], "check-port") == 0) { + char *port = NULL; + if (strlen(args[4]) == 0) { + cli_err(appctx, "set server <b>/<s> check-port requires" + " a port.\n"); + goto out; + } + port = args[4]; + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + warning = srv_update_check_addr_port(sv, NULL, port); + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + if (warning) + cli_msg(appctx, LOG_WARNING, warning); + } + else if (strcmp(args[3], "addr") == 0) { + char *addr = NULL; + char *port = NULL; + if (strlen(args[4]) == 0) { + cli_err(appctx, "set server <b>/<s> addr requires an address and optionally a port.\n"); + goto out; + } + else { + addr = args[4]; + } + if (strcmp(args[5], "port") == 0) { + port = args[6]; + } + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + warning = srv_update_addr_port(sv, addr, port, "stats socket command"); + if (warning) + cli_msg(appctx, LOG_WARNING, warning); + srv_clr_admin_flag(sv, SRV_ADMF_RMAINT); + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + } + else if (strcmp(args[3], "fqdn") == 0) { + if (!*args[4]) { + cli_err(appctx, "set server <b>/<s> fqdn requires a FQDN.\n"); + goto out; + } + if (!sv->resolvers) { + cli_err(appctx, "set server <b>/<s> fqdn failed because no resolution is configured.\n"); + goto out; + } + if (sv->srvrq) { + cli_err(appctx, "set server <b>/<s> fqdn failed because SRV resolution is configured.\n"); + goto out; + } + HA_SPIN_LOCK(DNS_LOCK, &sv->resolvers->lock); + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + /* ensure runtime resolver will process this new fqdn */ + if (sv->flags & SRV_F_NO_RESOLUTION) { + sv->flags &= ~SRV_F_NO_RESOLUTION; + } + warning = srv_update_fqdn(sv, args[4], "stats socket command", 1); + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + HA_SPIN_UNLOCK(DNS_LOCK, &sv->resolvers->lock); + if (warning) + cli_msg(appctx, LOG_WARNING, warning); + } + else if (strcmp(args[3], "ssl") == 0) { +#ifdef USE_OPENSSL + if (sv->flags & SRV_F_DYNAMIC) { + cli_err(appctx, "'set server <srv> ssl' not supported on dynamic servers\n"); + goto out; + } + + if (sv->ssl_ctx.ctx == NULL) { + cli_err(appctx, "'set server <srv> ssl' cannot be set. 
" + " default-server should define ssl settings\n"); + goto out; + } + + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + if (strcmp(args[4], "on") == 0) { + srv_set_ssl(sv, 1); + } else if (strcmp(args[4], "off") == 0) { + srv_set_ssl(sv, 0); + } else { + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + cli_err(appctx, "'set server <srv> ssl' expects 'on' or 'off'.\n"); + goto out; + } + srv_cleanup_connections(sv); + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + cli_msg(appctx, LOG_NOTICE, "server ssl setting updated.\n"); +#else + cli_msg(appctx, LOG_NOTICE, "server ssl setting not supported.\n"); +#endif + } else { + cli_err(appctx, + "usage: set server <backend>/<server> " + "addr | agent | agent-addr | agent-port | agent-send | " + "check-addr | check-port | fqdn | health | ssl | " + "state | weight\n"); + } + out: + return 1; +} + +static int cli_parse_get_weight(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct proxy *be; + struct server *sv; + struct ist be_name, sv_name = ist(args[2]); + + be_name = istsplit(&sv_name, '/'); + if (!istlen(sv_name)) + return cli_err(appctx, "Require 'backend/server'."); + + if (!(be = proxy_be_by_name(ist0(be_name)))) + return cli_err(appctx, "No such backend."); + if (!(sv = server_find_by_name(be, ist0(sv_name)))) + return cli_err(appctx, "No such server."); + + /* return server's effective weight at the moment */ + snprintf(trash.area, trash.size, "%d (initial %d)\n", sv->uweight, + sv->iweight); + if (applet_putstr(appctx, trash.area) == -1) + return 0; + return 1; +} + +/* Parse a "set weight" command. + * + * Grabs the server lock. + */ +static int cli_parse_set_weight(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct server *sv; + const char *warning; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + sv = cli_find_server(appctx, args[2]); + if (!sv) + return 1; + + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + + warning = server_parse_weight_change_request(sv, args[3]); + if (warning) + cli_err(appctx, warning); + + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + + return 1; +} + +/* parse a "set maxconn server" command. It always returns 1. + * + * Grabs the server lock. + */ +static int cli_parse_set_maxconn_server(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct server *sv; + const char *warning; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + sv = cli_find_server(appctx, args[3]); + if (!sv) + return 1; + + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + + warning = server_parse_maxconn_change_request(sv, args[4]); + if (warning) + cli_err(appctx, warning); + + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + + return 1; +} + +/* parse a "disable agent" command. It always returns 1. + * + * Grabs the server lock. + */ +static int cli_parse_disable_agent(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct server *sv; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + sv = cli_find_server(appctx, args[2]); + if (!sv) + return 1; + + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + sv->agent.state &= ~CHK_ST_ENABLED; + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + return 1; +} + +/* parse a "disable health" command. It always returns 1. + * + * Grabs the server lock. 
+ */ +static int cli_parse_disable_health(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct server *sv; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + sv = cli_find_server(appctx, args[2]); + if (!sv) + return 1; + + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + sv->check.state &= ~CHK_ST_ENABLED; + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + return 1; +} + +/* parse a "disable server" command. It always returns 1. + * + * Grabs the server lock. + */ +static int cli_parse_disable_server(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct server *sv; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + sv = cli_find_server(appctx, args[2]); + if (!sv) + return 1; + + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + srv_adm_set_maint(sv); + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + return 1; +} + +/* parse a "enable agent" command. It always returns 1. + * + * Grabs the server lock. + */ +static int cli_parse_enable_agent(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct server *sv; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + sv = cli_find_server(appctx, args[2]); + if (!sv) + return 1; + + if (!(sv->agent.state & CHK_ST_CONFIGURED)) + return cli_err(appctx, "Agent was not configured on this server, cannot enable.\n"); + + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + sv->agent.state |= CHK_ST_ENABLED; + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + return 1; +} + +/* parse a "enable health" command. It always returns 1. + * + * Grabs the server lock. + */ +static int cli_parse_enable_health(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct server *sv; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + sv = cli_find_server(appctx, args[2]); + if (!sv) + return 1; + + if (!(sv->check.state & CHK_ST_CONFIGURED)) + return cli_err(appctx, "Health check was not configured on this server, cannot enable.\n"); + + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + sv->check.state |= CHK_ST_ENABLED; + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + return 1; +} + +/* parse a "enable server" command. It always returns 1. + * + * Grabs the server lock. + */ +static int cli_parse_enable_server(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct server *sv; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + sv = cli_find_server(appctx, args[2]); + if (!sv) + return 1; + + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + srv_adm_set_ready(sv); + if (!(sv->flags & SRV_F_COOKIESET) + && (sv->proxy->ck_opts & PR_CK_DYNAMIC) && + sv->cookie) + srv_check_for_dup_dyncookie(sv); + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + return 1; +} + +/* Allocates data structure related to load balancing for the server <sv>. It + * is only required for dynamic servers. + * + * At the moment, the server lock is not used as this function is only called + * for a dynamic server not yet registered. + * + * Returns 1 on success, 0 on allocation failure. + */ +static int srv_alloc_lb(struct server *sv, struct proxy *be) +{ + int node; + + sv->lb_tree = (sv->flags & SRV_F_BACKUP) ? 
+ &be->lbprm.chash.bck : &be->lbprm.chash.act;
+ sv->lb_nodes_tot = sv->uweight * BE_WEIGHT_SCALE;
+ sv->lb_nodes_now = 0;
+
+ if (((be->lbprm.algo & (BE_LB_KIND | BE_LB_PARM)) == (BE_LB_KIND_RR | BE_LB_RR_RANDOM)) ||
+ ((be->lbprm.algo & (BE_LB_KIND | BE_LB_HASH_TYPE)) == (BE_LB_KIND_HI | BE_LB_HASH_CONS))) {
+ sv->lb_nodes = calloc(sv->lb_nodes_tot, sizeof(*sv->lb_nodes));
+
+ if (!sv->lb_nodes)
+ return 0;
+
+ for (node = 0; node < sv->lb_nodes_tot; node++) {
+ sv->lb_nodes[node].server = sv;
+ sv->lb_nodes[node].node.key = full_hash(sv->puid * SRV_EWGHT_RANGE + node);
+ }
+ }
+
+ return 1;
+}
+
+/* updates the server's weight during a warmup stage. Once the final weight is
+ * reached, the task automatically stops. Note that any server status change
+ * must have updated s->last_change accordingly.
+ */
+static struct task *server_warmup(struct task *t, void *context, unsigned int state)
+{
+ struct server *s = context;
+
+ /* by default, plan on stopping the task */
+ t->expire = TICK_ETERNITY;
+ if ((s->next_admin & SRV_ADMF_MAINT) ||
+ (s->next_state != SRV_ST_STARTING))
+ return t;
+
+ HA_SPIN_LOCK(SERVER_LOCK, &s->lock);
+
+ /* recalculate the weights and update the state */
+ server_recalc_eweight(s, 1);
+
+ /* we can probably refill this server with a few more connections */
+ pendconn_grab_from_px(s);
+
+ HA_SPIN_UNLOCK(SERVER_LOCK, &s->lock);
+
+ /* get back there in 1 second or 1/20th of the slowstart interval,
+ * whichever is greater, resulting in small 5% steps.
+ */
+ if (s->next_state == SRV_ST_STARTING)
+ t->expire = tick_add(now_ms, MS_TO_TICKS(MAX(1000, s->slowstart / 20)));
+ return t;
+}
+
+/* Allocate the slowstart task if the server is configured with a slowstart
+ * timer. If the server's next_state is SRV_ST_STARTING, the task is scheduled.
+ *
+ * Returns 0 on success, non-zero otherwise.
+ */
+static int init_srv_slowstart(struct server *srv)
+{
+ struct task *t;
+
+ if (srv->slowstart) {
+ if ((t = task_new_anywhere()) == NULL) {
+ ha_alert("Cannot activate slowstart for server %s/%s: out of memory.\n", srv->proxy->id, srv->id);
+ return ERR_ALERT | ERR_FATAL;
+ }
+
+ /* We need a warmup task that will be called when the server
+ * state switches from down to up.
+ */
+ srv->warmup = t;
+ t->process = server_warmup;
+ t->context = srv;
+
+ /* the server can be in this state only because of a state restore from a server-state file */
+ if (srv->next_state == SRV_ST_STARTING) {
+ task_schedule(srv->warmup,
+ tick_add(now_ms,
+ MS_TO_TICKS(MAX(1000, (ns_to_sec(now_ns) - srv->last_change)) / 20)));
+ }
+ }
+
+ return ERR_NONE;
+}
+REGISTER_POST_SERVER_CHECK(init_srv_slowstart);
+
+/* Memory allocation and initialization of the per_thr and per_tgrp fields.
+ * Returns 0 if the fields have been successfully initialized, -1 on failure.
+ */
+int srv_init_per_thr(struct server *srv)
+{
+ int i;
+
+ srv->per_thr = calloc(global.nbthread, sizeof(*srv->per_thr));
+ srv->per_tgrp = calloc(global.nbtgroups, sizeof(*srv->per_tgrp));
+ if (!srv->per_thr || !srv->per_tgrp)
+ return -1;
+
+ for (i = 0; i < global.nbthread; i++) {
+ srv->per_thr[i].idle_conns = EB_ROOT;
+ srv->per_thr[i].safe_conns = EB_ROOT;
+ srv->per_thr[i].avail_conns = EB_ROOT;
+ MT_LIST_INIT(&srv->per_thr[i].streams);
+
+ LIST_INIT(&srv->per_thr[i].idle_conn_list);
+ }
+
+ return 0;
+}
+
+/* Parse an "add server" command
+ * Returns 0 if the server has been successfully initialized, 1 on failure.
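The warmup task above implements the 'slowstart' server option; a configuration sketch (hypothetical names):

    server web1 192.0.2.1:80 check weight 100 slowstart 60s

With this, a server coming back up ramps its effective weight in roughly 5% steps, each step being rescheduled after max(1s, slowstart/20), i.e. every 3 seconds here.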
+ */ +static int cli_parse_add_server(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct proxy *be; + struct server *srv; + char *be_name, *sv_name; + int errcode, argc; + int next_id; + const int parse_flags = SRV_PARSE_DYNAMIC|SRV_PARSE_PARSE_ADDR; + + usermsgs_clr("CLI"); + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + ++args; + + sv_name = be_name = args[1]; + /* split backend/server arg */ + while (*sv_name && *(++sv_name)) { + if (*sv_name == '/') { + *sv_name = '\0'; + ++sv_name; + break; + } + } + + if (!*sv_name) + return cli_err(appctx, "Require 'backend/server'."); + + be = proxy_be_by_name(be_name); + if (!be) + return cli_err(appctx, "No such backend."); + + if (!(be->lbprm.algo & BE_LB_PROP_DYN)) { + cli_err(appctx, "Backend must use a dynamic load balancing to support dynamic servers."); + return 1; + } + + if (be->mode == PR_MODE_SYSLOG) { + cli_err(appctx," Dynamic servers cannot be used with log backends."); + return 1; + } + + /* At this point, some operations might not be thread-safe anymore. This + * might be the case for parsing handlers which were designed to run + * only at the starting stage on single-thread mode. + * + * Activate thread isolation to ensure thread-safety. + */ + thread_isolate(); + + args[1] = sv_name; + errcode = _srv_parse_init(&srv, args, &argc, be, parse_flags); + if (errcode) + goto out; + + while (*args[argc]) { + errcode = _srv_parse_kw(srv, args, &argc, be, parse_flags); + + if (errcode) + goto out; + } + + errcode = _srv_parse_finalize(args, argc, srv, be, parse_flags); + if (errcode) + goto out; + + /* A dynamic server does not currently support resolution. + * + * Initialize it explicitly to the "none" method to ensure no + * resolution will ever be executed. + */ + srv->init_addr_methods = SRV_IADDR_NONE; + + if (srv->mux_proto) { + int proto_mode = conn_pr_mode_to_proto_mode(be->mode); + const struct mux_proto_list *mux_ent; + + mux_ent = conn_get_best_mux_entry(srv->mux_proto->token, PROTO_SIDE_BE, proto_mode); + + if (!mux_ent || !isteq(mux_ent->token, srv->mux_proto->token)) { + ha_alert("MUX protocol is not usable for server.\n"); + goto out; + } + } + + if (srv_init_per_thr(srv) == -1) { + ha_alert("failed to allocate per-thread lists for server.\n"); + goto out; + } + + if (srv->max_idle_conns != 0) { + srv->curr_idle_thr = calloc(global.nbthread, sizeof(*srv->curr_idle_thr)); + if (!srv->curr_idle_thr) { + ha_alert("failed to allocate counters for server.\n"); + goto out; + } + } + + if (!srv_alloc_lb(srv, be)) { + ha_alert("Failed to initialize load-balancing data.\n"); + goto out; + } + + if (!stats_allocate_proxy_counters_internal(&srv->extra_counters, + COUNTERS_SV, + STATS_PX_CAP_SRV)) { + ha_alert("failed to allocate extra counters for server.\n"); + goto out; + } + + /* ensure minconn/maxconn consistency */ + srv_minmax_conn_apply(srv); + + if (srv->use_ssl == 1 || (srv->proxy->options & PR_O_TCPCHK_SSL) || + srv->check.use_ssl == 1) { + if (xprt_get(XPRT_SSL) && xprt_get(XPRT_SSL)->prepare_srv) { + if (xprt_get(XPRT_SSL)->prepare_srv(srv)) + goto out; + } + } + + if (srv->trackit) { + if (srv_apply_track(srv, be)) + goto out; + } + + /* Init check/agent if configured. The check is manually disabled + * because a dynamic server is started in a disable state. It must be + * manually activated via a "enable health/agent" command. 
+ */ + if (srv->do_check) { + if (init_srv_check(srv)) + goto out; + + srv->check.state &= ~CHK_ST_ENABLED; + } + + if (srv->do_agent) { + if (init_srv_agent_check(srv)) + goto out; + + srv->agent.state &= ~CHK_ST_ENABLED; + } + + /* Init slowstart if needed. */ + if (init_srv_slowstart(srv)) + goto out; + + /* Attach the server to the end of the proxy linked list. Note that this + * operation is not thread-safe so this is executed under thread + * isolation. + * + * If a server with the same name is found, reject the new one. + */ + + /* TODO use a double-linked list for px->srv */ + if (be->srv) { + struct server *next = be->srv; + + while (1) { + /* check for duplicate server */ + if (strcmp(srv->id, next->id) == 0) { + ha_alert("A server with the same name already exists in this backend.\n"); + goto out; + } + + if (!next->next) + break; + + next = next->next; + } + + next->next = srv; + } + else { + srv->next = be->srv; + be->srv = srv; + } + + /* generate the server id if not manually specified */ + if (!srv->puid) { + next_id = get_next_id(&be->conf.used_server_id, 1); + if (!next_id) { + ha_alert("Cannot attach server: no ID left in proxy\n"); + goto out; + } + + srv->conf.id.key = srv->puid = next_id; + } + srv->conf.name.key = srv->id; + + /* insert the server in the backend trees */ + eb32_insert(&be->conf.used_server_id, &srv->conf.id); + ebis_insert(&be->conf.used_server_name, &srv->conf.name); + /* addr_node.key could be NULL if FQDN resolution is postponed (ie: add server from cli) */ + if (srv->addr_node.key) + ebis_insert(&be->used_server_addr, &srv->addr_node); + + /* check if LSB bit (odd bit) is set for reuse_cnt */ + if (srv_id_reuse_cnt & 1) { + /* cnt must be increased */ + srv_id_reuse_cnt++; + } + /* srv_id_reuse_cnt is always even at this stage, divide by 2 to + * save some space + * (sizeof(srv->rid) is half of sizeof(srv_id_reuse_cnt)) + */ + srv->rid = (srv_id_reuse_cnt) ? (srv_id_reuse_cnt / 2) : 0; + + /* adding the server cannot fail once we reach this point: + * publishing EVENT_HDL_SUB_SERVER_ADD + */ + srv_event_hdl_publish(EVENT_HDL_SUB_SERVER_ADD, srv, 1); + + thread_release(); + + /* Start the check task. The server must be fully initialized. + * + * <srvpos> and <nbcheck> parameters are set to 1 as there should be no + * need to randomly spread the task interval for dynamic servers. + */ + if (srv->check.state & CHK_ST_CONFIGURED) { + if (!start_check_task(&srv->check, 0, 1, 1)) + ha_alert("System might be unstable, consider executing a reload.\n"); + } + if (srv->agent.state & CHK_ST_CONFIGURED) { + if (!start_check_task(&srv->agent, 0, 1, 1)) + ha_alert("System might be unstable, consider executing a reload.\n"); + } + + ha_notice("New server registered.\n"); + cli_umsg(appctx, LOG_INFO); + + return 0; + +out: + if (srv) { + if (srv->track) + release_server_track(srv); + + if (srv->check.state & CHK_ST_CONFIGURED) + free_check(&srv->check); + if (srv->agent.state & CHK_ST_CONFIGURED) + free_check(&srv->agent); + + /* remove the server from the proxy linked list */ + _srv_detach(srv); + } + + thread_release(); + + if (!usermsgs_empty()) + cli_umsgerr(appctx); + + if (srv) + srv_drop(srv); + + return 1; +} + +/* Parse a "del server" command + * Returns 0 if the server has been successfully removed, 1 on failure.
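+ * + * Editorial usage sketch (illustrative names): the server must first be put in + * maintenance mode before it can be removed, e.g.: + *   $ echo "set server be_app/srv3 state maint" | socat stdio /var/run/haproxy.sock + *   $ echo "del server be_app/srv3" | socat stdio /var/run/haproxy.sock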
+ */ +static int cli_parse_delete_server(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct proxy *be; + struct server *srv; + struct server *prev_del; + struct ist be_name, sv_name; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + ++args; + + /* The proxy servers list is currently not protected by a lock so this + * requires thread isolation. In addition, any place referencing the + * server about to be deleted would be unsafe after our operation, so + * we must be certain to be alone so that no other thread has even + * started to grab a temporary reference to this server. + */ + thread_isolate_full(); + + sv_name = ist(args[1]); + be_name = istsplit(&sv_name, '/'); + if (!istlen(sv_name)) { + cli_err(appctx, "Require 'backend/server'."); + goto out; + } + + if (!(be = proxy_be_by_name(ist0(be_name)))) { + cli_err(appctx, "No such backend."); + goto out; + } + if (!(srv = server_find_by_name(be, ist0(sv_name)))) { + cli_err(appctx, "No such server."); + goto out; + } + + if (srv->flags & SRV_F_NON_PURGEABLE) { + cli_err(appctx, "This server cannot be removed at runtime due to other configuration elements pointing to it."); + goto out; + } + + /* Only servers in maintenance can be deleted. This ensures that the + * server is not present anymore in the lb structures (through + * lbprm.set_server_status_down). + */ + if (!(srv->cur_admin & SRV_ADMF_MAINT)) { + cli_err(appctx, "Only servers in maintenance mode can be deleted."); + goto out; + } + + /* Ensure that there is no active/idle/pending connection on the server. + * + * TODO idle connections should not prevent server deletion. A proper + * cleanup function should be implemented to be used here. + */ + if (srv->cur_sess || srv->curr_idle_conns || + !eb_is_empty(&srv->queue.head) || srv_has_streams(srv)) { + cli_err(appctx, "Server still has connections attached to it, cannot remove it."); + goto out; + } + + /* removing cannot fail anymore when we reach this: + * publishing EVENT_HDL_SUB_SERVER_DEL + */ + srv_event_hdl_publish(EVENT_HDL_SUB_SERVER_DEL, srv, 1); + + /* remove srv from tracking list */ + if (srv->track) + release_server_track(srv); + + /* stop the check task if running */ + if (srv->check.state & CHK_ST_CONFIGURED) + check_purge(&srv->check); + if (srv->agent.state & CHK_ST_CONFIGURED) + check_purge(&srv->agent); + + /* detach the server from the proxy linked list + * The proxy servers list is currently not protected by a lock, so this + * requires thread_isolate/release. 
+ */ + _srv_detach(srv); + + /* Some deleted servers could still point to us using their 'next'; + * update them as needed. + * Note the small race between the POP and APPEND; in this situation it + * is not an issue as we are under full thread isolation. + */ + while ((prev_del = MT_LIST_POP(&srv->prev_deleted, struct server *, prev_deleted))) { + /* update its 'next' ptr */ + prev_del->next = srv->next; + if (srv->next) { + /* now it is our 'next' responsibility */ + MT_LIST_APPEND(&srv->next->prev_deleted, &prev_del->prev_deleted); + } + } + + /* we ourselves need to inform our 'next' that we will still point to it */ + if (srv->next) + MT_LIST_APPEND(&srv->next->prev_deleted, &srv->prev_deleted); + + /* remove srv from the backend trees (id, name and addr) */ + eb32_delete(&srv->conf.id); + ebpt_delete(&srv->conf.name); + if (srv->addr_node.key) + ebpt_delete(&srv->addr_node); + + /* remove srv from idle_node tree for idle conn cleanup */ + eb32_delete(&srv->idle_node); + + /* flag the server as deleted + * (despite the server being removed from the primary server list, + * one could still access the server data from a valid ptr). + * The deleted flag helps detecting when a server is in transient removal + * state, ie: removed from the list but not yet freed/purged from memory. + */ + srv->flags |= SRV_F_DELETED; + + /* set LSB bit (odd bit) for reuse_cnt */ + srv_id_reuse_cnt |= 1; + + thread_release(); + + ha_notice("Server deleted.\n"); + srv_drop(srv); + + cli_msg(appctx, LOG_INFO, "Server deleted."); + + return 0; + +out: + thread_release(); + + return 1; +} + +/* register cli keywords */ +static struct cli_kw_list cli_kws = {{ },{ + { { "disable", "agent", NULL }, "disable agent : disable agent checks", cli_parse_disable_agent, NULL }, + { { "disable", "health", NULL }, "disable health : disable health checks", cli_parse_disable_health, NULL }, + { { "disable", "server", NULL }, "disable server (DEPRECATED) : disable a server for maintenance (use 'set server' instead)", cli_parse_disable_server, NULL }, + { { "enable", "agent", NULL }, "enable agent : enable agent checks", cli_parse_enable_agent, NULL }, + { { "enable", "health", NULL }, "enable health : enable health checks", cli_parse_enable_health, NULL }, + { { "enable", "server", NULL }, "enable server (DEPRECATED) : enable a disabled server (use 'set server' instead)", cli_parse_enable_server, NULL }, + { { "set", "maxconn", "server", NULL }, "set maxconn server <bk>/<srv> : change a server's maxconn setting", cli_parse_set_maxconn_server, NULL }, + { { "set", "server", NULL }, "set server <bk>/<srv> [opts] : change a server's state, weight, address or ssl", cli_parse_set_server }, + { { "get", "weight", NULL }, "get weight <bk>/<srv> : report a server's current weight", cli_parse_get_weight }, + { { "set", "weight", NULL }, "set weight <bk>/<srv> (DEPRECATED) : change a server's weight (use 'set server' instead)", cli_parse_set_weight }, + { { "add", "server", NULL }, "add server <bk>/<srv> : create a new server", cli_parse_add_server, NULL }, + { { "del", "server", NULL }, "del server <bk>/<srv> : remove a dynamically added server", cli_parse_delete_server, NULL }, + {{},} +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + +/* Prepare a server <srv> to track check status of another one. The + * <srv>.<trackit> field is used to retrieve the identifier of the tracked + * server, either with the format "proxy/server" or just "server".
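+ * (Editorial illustration with made-up names: a configuration line such as + * "server srv3 192.0.2.10:80 track be_main/srv1" leaves "be_main/srv1" in + * <trackit> for this function to resolve.)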
<curproxy> must point to the + * backend owning <srv>; if no proxy is specified in <trackit>, it will be used + * to find the tracked server. + * + * Returns 0 if server tracking has been activated, otherwise non-zero. + * + * Not thread-safe. + */ +int srv_apply_track(struct server *srv, struct proxy *curproxy) +{ + struct proxy *px; + struct server *strack, *loop; + char *pname, *sname; + + if (!srv->trackit) + return 1; + + pname = srv->trackit; + sname = strrchr(pname, '/'); + + if (sname) { + *sname++ = '\0'; + } + else { + sname = pname; + pname = NULL; + } + + if (pname) { + px = proxy_be_by_name(pname); + if (!px) { + ha_alert("unable to find required proxy '%s' for tracking.\n", + pname); + return 1; + } + } + else { + px = curproxy; + } + + strack = findserver(px, sname); + if (!strack) { + ha_alert("unable to find required server '%s' for tracking.\n", + sname); + return 1; + } + + if (strack->flags & SRV_F_DYNAMIC) { + ha_alert("unable to use %s/%s for tracking as it is a dynamic server.\n", + px->id, strack->id); + return 1; + } + + if (!strack->do_check && !strack->do_agent && !strack->track && + !strack->trackit) { + ha_alert("unable to use %s/%s for " + "tracking as it does not have any check or agent enabled.\n", + px->id, strack->id); + return 1; + } + + for (loop = strack->track; loop && loop != srv; loop = loop->track) + ; + + if (srv == strack || loop) { + ha_alert("unable to track %s/%s as it " + "belongs to a tracking chain looping back to %s/%s.\n", + px->id, strack->id, px->id, + srv == strack ? strack->id : loop->id); + return 1; + } + + if (curproxy != px && + (curproxy->options & PR_O_DISABLE404) != (px->options & PR_O_DISABLE404)) { + ha_alert("unable to use %s/%s for " + "tracking: disable-on-404 option inconsistency.\n", + px->id, strack->id); + return 1; + } + + srv->track = strack; + srv->tracknext = strack->trackers; + strack->trackers = srv; + strack->flags |= SRV_F_NON_PURGEABLE; + + ha_free(&srv->trackit); + + return 0; +} + +/* This function propagates srv state change to lb algorithms */ +static void srv_lb_propagate(struct server *s) +{ + struct proxy *px = s->proxy; + + if (px->lbprm.update_server_eweight) + px->lbprm.update_server_eweight(s); + else if (srv_willbe_usable(s)) { + if (px->lbprm.set_server_status_up) + px->lbprm.set_server_status_up(s); + } + else { + if (px->lbprm.set_server_status_down) + px->lbprm.set_server_status_down(s); + } +} + +/* directly update server state based on an operational change + * (compare current and next state to know which transition to apply) + * + * The function returns the number of requeued sessions (either taken by + * the server or redispatched to other servers) due to the server state + * change. + */ +static int _srv_update_status_op(struct server *s, enum srv_op_st_chg_cause cause) +{ + struct buffer *tmptrash = NULL; + int log_level; + int srv_was_stopping = (s->cur_state == SRV_ST_STOPPING) || (s->cur_admin & SRV_ADMF_DRAIN); + int xferred = 0; + + if ((s->cur_state != SRV_ST_STOPPED) && (s->next_state == SRV_ST_STOPPED)) { + srv_lb_propagate(s); + + if (s->onmarkeddown & HANA_ONMARKEDDOWN_SHUTDOWNSESSIONS) + srv_shutdown_streams(s, SF_ERR_DOWN); + + /* we might have streams queued on this server and waiting for + * a connection. Those which are redispatchable will be queued + * to another server or to the proxy itself. + */ + xferred = pendconn_redistribute(s); + + tmptrash = alloc_trash_chunk(); + if (tmptrash) { + chunk_printf(tmptrash, + "%sServer %s/%s is DOWN", s->flags & SRV_F_BACKUP ?
"Backup " : "", + s->proxy->id, s->id); + + srv_append_op_chg_cause(tmptrash, s, cause); + srv_append_more(tmptrash, s, xferred, 0); + + ha_warning("%s.\n", tmptrash->area); + + /* we don't send an alert if the server was previously paused */ + log_level = srv_was_stopping ? LOG_NOTICE : LOG_ALERT; + send_log(s->proxy, log_level, "%s.\n", + tmptrash->area); + send_email_alert(s, log_level, "%s", + tmptrash->area); + free_trash_chunk(tmptrash); + } + } + else if ((s->cur_state != SRV_ST_STOPPING) && (s->next_state == SRV_ST_STOPPING)) { + srv_lb_propagate(s); + + /* we might have streams queued on this server and waiting for + * a connection. Those which are redispatchable will be queued + * to another server or to the proxy itself. + */ + xferred = pendconn_redistribute(s); + + tmptrash = alloc_trash_chunk(); + if (tmptrash) { + chunk_printf(tmptrash, + "%sServer %s/%s is stopping", s->flags & SRV_F_BACKUP ? "Backup " : "", + s->proxy->id, s->id); + + srv_append_op_chg_cause(tmptrash, s, cause); + srv_append_more(tmptrash, s, xferred, 0); + + ha_warning("%s.\n", tmptrash->area); + send_log(s->proxy, LOG_NOTICE, "%s.\n", + tmptrash->area); + free_trash_chunk(tmptrash); + } + } + else if (((s->cur_state != SRV_ST_RUNNING) && (s->next_state == SRV_ST_RUNNING)) + || ((s->cur_state != SRV_ST_STARTING) && (s->next_state == SRV_ST_STARTING))) { + + if (s->next_state == SRV_ST_STARTING && s->warmup) + task_schedule(s->warmup, tick_add(now_ms, MS_TO_TICKS(MAX(1000, s->slowstart / 20)))); + + server_recalc_eweight(s, 0); + /* now propagate the status change to any LB algorithms */ + srv_lb_propagate(s); + + /* If the server is set with "on-marked-up shutdown-backup-sessions", + * and it's not a backup server and its effective weight is > 0, + * then it can accept new connections, so we shut down all streams + * on all backup servers. + */ + if ((s->onmarkedup & HANA_ONMARKEDUP_SHUTDOWNBACKUPSESSIONS) && + !(s->flags & SRV_F_BACKUP) && s->next_eweight) + srv_shutdown_backup_streams(s->proxy, SF_ERR_UP); + + /* check if we can handle some connections queued at the proxy. We + * will take as many as we can handle. + */ + xferred = pendconn_grab_from_px(s); + + tmptrash = alloc_trash_chunk(); + if (tmptrash) { + chunk_printf(tmptrash, + "%sServer %s/%s is UP", s->flags & SRV_F_BACKUP ? "Backup " : "", + s->proxy->id, s->id); + + srv_append_op_chg_cause(tmptrash, s, cause); + srv_append_more(tmptrash, s, xferred, 0); + + ha_warning("%s.\n", tmptrash->area); + send_log(s->proxy, LOG_NOTICE, "%s.\n", + tmptrash->area); + send_email_alert(s, LOG_NOTICE, "%s", + tmptrash->area); + free_trash_chunk(tmptrash); + } + } + else if (s->cur_eweight != s->next_eweight) { + /* now propagate the status change to any LB algorithms */ + srv_lb_propagate(s); + } + return xferred; +} + +/* deduct and update server state from an administrative change + * (use current and next admin to deduct the administrative transition that + * may result in server state update) + * + * The function returns the number of requeued sessions (either taken by + * the server or redispatched to others servers) due to the server state + * change. 
+ */ +static int _srv_update_status_adm(struct server *s, enum srv_adm_st_chg_cause cause) +{ + struct buffer *tmptrash = NULL; + int srv_was_stopping = (s->cur_state == SRV_ST_STOPPING) || (s->cur_admin & SRV_ADMF_DRAIN); + int xferred = 0; + + /* Maintenance must also disable health checks */ + if (!(s->cur_admin & SRV_ADMF_MAINT) && (s->next_admin & SRV_ADMF_MAINT)) { + if (s->check.state & CHK_ST_ENABLED) { + s->check.state |= CHK_ST_PAUSED; + s->check.health = 0; + } + + if (s->cur_state == SRV_ST_STOPPED) { /* server was already down */ + tmptrash = alloc_trash_chunk(); + if (tmptrash) { + chunk_printf(tmptrash, + "%sServer %s/%s was DOWN and now enters maintenance", + s->flags & SRV_F_BACKUP ? "Backup " : "", s->proxy->id, s->id); + srv_append_adm_chg_cause(tmptrash, s, cause); + srv_append_more(tmptrash, s, -1, (s->next_admin & SRV_ADMF_FMAINT)); + + if (!(global.mode & MODE_STARTING)) { + ha_warning("%s.\n", tmptrash->area); + send_log(s->proxy, LOG_NOTICE, "%s.\n", + tmptrash->area); + } + free_trash_chunk(tmptrash); + } + } + else { /* server was still running */ + s->check.health = 0; /* failure */ + + s->next_state = SRV_ST_STOPPED; + srv_lb_propagate(s); + + if (s->onmarkeddown & HANA_ONMARKEDDOWN_SHUTDOWNSESSIONS) + srv_shutdown_streams(s, SF_ERR_DOWN); + + /* force connection cleanup on the given server */ + srv_cleanup_connections(s); + /* we might have streams queued on this server and waiting for + * a connection. Those which are redispatchable will be queued + * to another server or to the proxy itself. + */ + xferred = pendconn_redistribute(s); + + tmptrash = alloc_trash_chunk(); + if (tmptrash) { + chunk_printf(tmptrash, + "%sServer %s/%s is going DOWN for maintenance", + s->flags & SRV_F_BACKUP ? "Backup " : "", + s->proxy->id, s->id); + srv_append_adm_chg_cause(tmptrash, s, cause); + srv_append_more(tmptrash, s, xferred, (s->next_admin & SRV_ADMF_FMAINT)); + + if (!(global.mode & MODE_STARTING)) { + ha_warning("%s.\n", tmptrash->area); + send_log(s->proxy, srv_was_stopping ? LOG_NOTICE : LOG_ALERT, "%s.\n", + tmptrash->area); + } + free_trash_chunk(tmptrash); + } + } + } + else if ((s->cur_admin & SRV_ADMF_MAINT) && !(s->next_admin & SRV_ADMF_MAINT)) { + /* OK here we're leaving maintenance, and we have many things to check, + * because the server might be coming back up depending on its state. + * In practice, leaving maintenance means that we should immediately + * turn to UP (subject to slowstart) under the following conditions: + * - server is neither checked nor tracked + * - server tracks another server which is not checked + * - server tracks another server which is already up + * Which sums up as something simpler: + * "either the tracking server is up or the server's checks are disabled + * or up". Otherwise we only re-enable health checks. There's a special + * case associated with the stopping state which can be inherited. Note + * that the server might still be in drain mode, which is naturally dealt + * with by the lower level functions.
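+ * In compact form (editorial restatement of the code below): + *   if ((!track || track not STOPPED) && agent healthy-or-absent + *       && check healthy-or-absent) + *       next_state = STOPPING (inherited) or STARTING/RUNNING (slowstart); + *   else only the health checks are re-enabled.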
+ */ + if (s->check.state & CHK_ST_ENABLED) { + s->check.state &= ~CHK_ST_PAUSED; + s->check.health = s->check.rise; /* start OK but check immediately */ + } + + if ((!s->track || s->track->next_state != SRV_ST_STOPPED) && + (!(s->agent.state & CHK_ST_ENABLED) || (s->agent.health >= s->agent.rise)) && + (!(s->check.state & CHK_ST_ENABLED) || (s->check.health >= s->check.rise))) { + if (s->track && s->track->next_state == SRV_ST_STOPPING) { + s->next_state = SRV_ST_STOPPING; + } + else { + s->next_state = SRV_ST_STARTING; + if (s->slowstart > 0) { + if (s->warmup) + task_schedule(s->warmup, tick_add(now_ms, MS_TO_TICKS(MAX(1000, s->slowstart / 20)))); + } + else + s->next_state = SRV_ST_RUNNING; + } + + } + + tmptrash = alloc_trash_chunk(); + if (tmptrash) { + if (!(s->next_admin & SRV_ADMF_FMAINT) && (s->cur_admin & SRV_ADMF_FMAINT)) { + chunk_printf(tmptrash, + "%sServer %s/%s is %s/%s (leaving forced maintenance)", + s->flags & SRV_F_BACKUP ? "Backup " : "", + s->proxy->id, s->id, + (s->next_state == SRV_ST_STOPPED) ? "DOWN" : "UP", + (s->next_admin & SRV_ADMF_DRAIN) ? "DRAIN" : "READY"); + } + if (!(s->next_admin & SRV_ADMF_RMAINT) && (s->cur_admin & SRV_ADMF_RMAINT)) { + chunk_printf(tmptrash, + "%sServer %s/%s ('%s') is %s/%s (resolves again)", + s->flags & SRV_F_BACKUP ? "Backup " : "", + s->proxy->id, s->id, s->hostname, + (s->next_state == SRV_ST_STOPPED) ? "DOWN" : "UP", + (s->next_admin & SRV_ADMF_DRAIN) ? "DRAIN" : "READY"); + } + if (!(s->next_admin & SRV_ADMF_IMAINT) && (s->cur_admin & SRV_ADMF_IMAINT)) { + chunk_printf(tmptrash, + "%sServer %s/%s is %s/%s (leaving maintenance)", + s->flags & SRV_F_BACKUP ? "Backup " : "", + s->proxy->id, s->id, + (s->next_state == SRV_ST_STOPPED) ? "DOWN" : "UP", + (s->next_admin & SRV_ADMF_DRAIN) ? "DRAIN" : "READY"); + } + ha_warning("%s.\n", tmptrash->area); + send_log(s->proxy, LOG_NOTICE, "%s.\n", + tmptrash->area); + free_trash_chunk(tmptrash); + } + + server_recalc_eweight(s, 0); + /* now propagate the status change to any LB algorithms */ + srv_lb_propagate(s); + + /* If the server is set with "on-marked-up shutdown-backup-sessions", + * and it's not a backup server and its effective weight is > 0, + * then it can accept new connections, so we shut down all streams + * on all backup servers. + */ + if ((s->onmarkedup & HANA_ONMARKEDUP_SHUTDOWNBACKUPSESSIONS) && + !(s->flags & SRV_F_BACKUP) && s->next_eweight) + srv_shutdown_backup_streams(s->proxy, SF_ERR_UP); + + /* check if we can handle some connections queued at the proxy. We + * will take as many as we can handle. + */ + xferred = pendconn_grab_from_px(s); + } + else if (s->next_admin & SRV_ADMF_MAINT) { + /* remaining in maintenance mode, let's inform precisely about the + * situation. + */ + if (!(s->next_admin & SRV_ADMF_FMAINT) && (s->cur_admin & SRV_ADMF_FMAINT)) { + tmptrash = alloc_trash_chunk(); + if (tmptrash) { + chunk_printf(tmptrash, + "%sServer %s/%s is leaving forced maintenance but remains in maintenance", + s->flags & SRV_F_BACKUP ? 
"Backup " : "", + s->proxy->id, s->id); + + if (s->track) /* normally it's mandatory here */ + chunk_appendf(tmptrash, " via %s/%s", + s->track->proxy->id, s->track->id); + ha_warning("%s.\n", tmptrash->area); + send_log(s->proxy, LOG_NOTICE, "%s.\n", + tmptrash->area); + free_trash_chunk(tmptrash); + } + } + if (!(s->next_admin & SRV_ADMF_RMAINT) && (s->cur_admin & SRV_ADMF_RMAINT)) { + tmptrash = alloc_trash_chunk(); + if (tmptrash) { + chunk_printf(tmptrash, + "%sServer %s/%s ('%s') resolves again but remains in maintenance", + s->flags & SRV_F_BACKUP ? "Backup " : "", + s->proxy->id, s->id, s->hostname); + + if (s->track) /* normally it's mandatory here */ + chunk_appendf(tmptrash, " via %s/%s", + s->track->proxy->id, s->track->id); + ha_warning("%s.\n", tmptrash->area); + send_log(s->proxy, LOG_NOTICE, "%s.\n", + tmptrash->area); + free_trash_chunk(tmptrash); + } + } + else if (!(s->next_admin & SRV_ADMF_IMAINT) && (s->cur_admin & SRV_ADMF_IMAINT)) { + tmptrash = alloc_trash_chunk(); + if (tmptrash) { + chunk_printf(tmptrash, + "%sServer %s/%s remains in forced maintenance", + s->flags & SRV_F_BACKUP ? "Backup " : "", + s->proxy->id, s->id); + ha_warning("%s.\n", tmptrash->area); + send_log(s->proxy, LOG_NOTICE, "%s.\n", + tmptrash->area); + free_trash_chunk(tmptrash); + } + } + /* don't report anything when leaving drain mode and remaining in maintenance */ + } + + if (!(s->next_admin & SRV_ADMF_MAINT)) { + if (!(s->cur_admin & SRV_ADMF_DRAIN) && (s->next_admin & SRV_ADMF_DRAIN)) { + /* drain state is applied only if not yet in maint */ + + srv_lb_propagate(s); + + /* we might have streams queued on this server and waiting for + * a connection. Those which are redispatchable will be queued + * to another server or to the proxy itself. + */ + xferred = pendconn_redistribute(s); + + tmptrash = alloc_trash_chunk(); + if (tmptrash) { + chunk_printf(tmptrash, "%sServer %s/%s enters drain state", + s->flags & SRV_F_BACKUP ? "Backup " : "", s->proxy->id, s->id); + srv_append_adm_chg_cause(tmptrash, s, cause); + srv_append_more(tmptrash, s, xferred, (s->next_admin & SRV_ADMF_FDRAIN)); + + if (!(global.mode & MODE_STARTING)) { + ha_warning("%s.\n", tmptrash->area); + send_log(s->proxy, LOG_NOTICE, "%s.\n", + tmptrash->area); + send_email_alert(s, LOG_NOTICE, "%s", + tmptrash->area); + } + free_trash_chunk(tmptrash); + } + } + else if ((s->cur_admin & SRV_ADMF_DRAIN) && !(s->next_admin & SRV_ADMF_DRAIN)) { + /* OK completely leaving drain mode */ + server_recalc_eweight(s, 0); + + tmptrash = alloc_trash_chunk(); + if (tmptrash) { + if (s->cur_admin & SRV_ADMF_FDRAIN) { + chunk_printf(tmptrash, + "%sServer %s/%s is %s (leaving forced drain)", + s->flags & SRV_F_BACKUP ? "Backup " : "", + s->proxy->id, s->id, + (s->next_state == SRV_ST_STOPPED) ? "DOWN" : "UP"); + } + else { + chunk_printf(tmptrash, + "%sServer %s/%s is %s (leaving drain)", + s->flags & SRV_F_BACKUP ? "Backup " : "", + s->proxy->id, s->id, + (s->next_state == SRV_ST_STOPPED) ? 
"DOWN" : "UP"); + if (s->track) /* normally it's mandatory here */ + chunk_appendf(tmptrash, " via %s/%s", + s->track->proxy->id, s->track->id); + } + + ha_warning("%s.\n", tmptrash->area); + send_log(s->proxy, LOG_NOTICE, "%s.\n", + tmptrash->area); + free_trash_chunk(tmptrash); + } + + /* now propagate the status change to any LB algorithms */ + srv_lb_propagate(s); + } + else if ((s->next_admin & SRV_ADMF_DRAIN)) { + /* remaining in drain mode after removing one of its flags */ + + tmptrash = alloc_trash_chunk(); + if (tmptrash) { + if (!(s->next_admin & SRV_ADMF_FDRAIN)) { + chunk_printf(tmptrash, + "%sServer %s/%s remains in drain mode", + s->flags & SRV_F_BACKUP ? "Backup " : "", + s->proxy->id, s->id); + + if (s->track) /* normally it's mandatory here */ + chunk_appendf(tmptrash, " via %s/%s", + s->track->proxy->id, s->track->id); + } + else { + chunk_printf(tmptrash, + "%sServer %s/%s remains in forced drain mode", + s->flags & SRV_F_BACKUP ? "Backup " : "", + s->proxy->id, s->id); + } + ha_warning("%s.\n", tmptrash->area); + send_log(s->proxy, LOG_NOTICE, "%s.\n", + tmptrash->area); + free_trash_chunk(tmptrash); + } + } + } + return xferred; +} + +/* + * This function applies server's status changes. + * + * Must be called with the server lock held. This may also be called at init + * time as the result of parsing the state file, in which case no lock will be + * held, and the server's warmup task can be null. + * <type> should be 0 for operational and 1 for administrative + * <cause> must be srv_op_st_chg_cause enum for operational and + * srv_adm_st_chg_cause enum for administrative + */ +static void srv_update_status(struct server *s, int type, int cause) +{ + int prev_srv_count = s->proxy->srv_bck + s->proxy->srv_act; + enum srv_state srv_prev_state = s->cur_state; + union { + struct event_hdl_cb_data_server_state state; + struct event_hdl_cb_data_server_admin admin; + struct event_hdl_cb_data_server common; + } cb_data; + int requeued; + + /* prepare common server event data */ + _srv_event_hdl_prepare(&cb_data.common, s, 0); + + if (type) { + cb_data.admin.safe.cause = cause; + cb_data.admin.safe.old_admin = s->cur_admin; + cb_data.admin.safe.new_admin = s->next_admin; + requeued = _srv_update_status_adm(s, cause); + cb_data.admin.safe.requeued = requeued; + /* publish admin change */ + _srv_event_hdl_publish(EVENT_HDL_SUB_SERVER_ADMIN, cb_data.admin, s); + } + else + requeued = _srv_update_status_op(s, cause); + + /* explicitly commit state changes (even if it was already applied implicitly + * by some lb state change function), so we don't miss anything + */ + srv_lb_commit_status(s); + + /* check if server stats must be updated due the the server state change */ + if (srv_prev_state != s->cur_state) { + if (srv_prev_state == SRV_ST_STOPPED) { + /* server was down and no longer is */ + if (s->last_change < ns_to_sec(now_ns)) // ignore negative times + s->down_time += ns_to_sec(now_ns) - s->last_change; + _srv_event_hdl_publish(EVENT_HDL_SUB_SERVER_UP, cb_data.common, s); + } + else if (s->cur_state == SRV_ST_STOPPED) { + /* server was up and is currently down */ + s->counters.down_trans++; + _srv_event_hdl_publish(EVENT_HDL_SUB_SERVER_DOWN, cb_data.common, s); + } + s->last_change = ns_to_sec(now_ns); + + /* publish the state change */ + _srv_event_hdl_prepare_state(&cb_data.state, + s, type, cause, srv_prev_state, requeued); + _srv_event_hdl_publish(EVENT_HDL_SUB_SERVER_STATE, cb_data.state, s); + } + + /* check if backend stats must be updated due to the server state 
change */ + if (prev_srv_count && s->proxy->srv_bck == 0 && s->proxy->srv_act == 0) + set_backend_down(s->proxy); /* backend going down */ + else if (!prev_srv_count && (s->proxy->srv_bck || s->proxy->srv_act)) { + /* backend was down and is back up again: + * no helper function, updating last_change and backend downtime stats + */ + if (s->proxy->last_change < ns_to_sec(now_ns)) // ignore negative times + s->proxy->down_time += ns_to_sec(now_ns) - s->proxy->last_change; + s->proxy->last_change = ns_to_sec(now_ns); + } +} + +struct task *srv_cleanup_toremove_conns(struct task *task, void *context, unsigned int state) +{ + struct connection *conn; + + while ((conn = MT_LIST_POP(&idle_conns[tid].toremove_conns, + struct connection *, toremove_list)) != NULL) { + conn->mux->destroy(conn->ctx); + } + + return task; +} + +/* Move <toremove_nb> count connections from <list> storage to <toremove_list> + * list storage. -1 means moving all of them. + * + * Returns the number of connections moved. + * + * Must be called with idle_conns_lock held. + */ +static int srv_migrate_conns_to_remove(struct list *list, struct mt_list *toremove_list, int toremove_nb) +{ + struct connection *conn; + int i = 0; + + while (!LIST_ISEMPTY(list)) { + if (toremove_nb != -1 && i >= toremove_nb) + break; + + conn = LIST_ELEM(list->n, struct connection *, idle_list); + conn_delete_from_tree(conn); + MT_LIST_APPEND(toremove_list, &conn->toremove_list); + i++; + } + + return i; +} +/* cleanup connections for a given server + * might be useful when going on forced maintenance or live changing ip/port + */ +static void srv_cleanup_connections(struct server *srv) +{ + int did_remove; + int i; + + /* nothing to do if pool-max-conn is null */ + if (!srv->max_idle_conns) + return; + + /* check all threads starting with ours */ + for (i = tid;;) { + did_remove = 0; + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[i].idle_conns_lock); + if (srv_migrate_conns_to_remove(&srv->per_thr[i].idle_conn_list, &idle_conns[i].toremove_conns, -1) > 0) + did_remove = 1; + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[i].idle_conns_lock); + if (did_remove) + task_wakeup(idle_conns[i].cleanup_task, TASK_WOKEN_OTHER); + + if ((i = ((i + 1 == global.nbthread) ? 0 : i + 1)) == tid) + break; + } +} + +/* removes an idle conn after updating the server idle conns counters */ +void srv_release_conn(struct server *srv, struct connection *conn) +{ + if (conn->flags & CO_FL_LIST_MASK) { + /* The connection is currently in the server's idle list, so tell it + * there's one less connection available in that list. + */ + _HA_ATOMIC_DEC(&srv->curr_idle_conns); + _HA_ATOMIC_DEC(conn->flags & CO_FL_SAFE_LIST ? 
&srv->curr_safe_nb : &srv->curr_idle_nb); + _HA_ATOMIC_DEC(&srv->curr_idle_thr[tid]); + } + else { + /* The connection is not private and not in any server's idle + * list, so decrement the current number of used connections + */ + _HA_ATOMIC_DEC(&srv->curr_used_conns); + } + + /* Remove the connection from any tree (safe, idle or available) */ + if (conn->hash_node) { + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + conn_delete_from_tree(conn); + conn->flags &= ~CO_FL_LIST_MASK; + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + } +} + +/* retrieve a connection from its <hash> in <tree> + * returns NULL if no connection found + */ +struct connection *srv_lookup_conn(struct eb_root *tree, uint64_t hash) +{ + struct eb64_node *node = NULL; + struct connection *conn = NULL; + struct conn_hash_node *hash_node = NULL; + + node = eb64_lookup(tree, hash); + if (node) { + hash_node = ebmb_entry(node, struct conn_hash_node, node); + conn = hash_node->conn; + } + + return conn; +} + +/* retrieve the next connection sharing the same hash as <conn> + * returns NULL if no connection found + */ +struct connection *srv_lookup_conn_next(struct connection *conn) +{ + struct eb64_node *node = NULL; + struct connection *next_conn = NULL; + struct conn_hash_node *hash_node = NULL; + + node = eb64_next_dup(&conn->hash_node->node); + if (node) { + hash_node = eb64_entry(node, struct conn_hash_node, node); + next_conn = hash_node->conn; + } + + return next_conn; +} + +/* Add <conn> in <srv> idle trees. Set <is_safe> if connection is deemed safe + * for reuse. + * + * This function is a simple wrapper for tree insert. It should only be used + * for internal usage or when removing briefly the connection to avoid takeover + * on it before reinserting it with this function. In other context, prefer to + * use the full feature srv_add_to_idle_list(). + * + * Must be called with idle_conns_lock. + */ +void _srv_add_idle(struct server *srv, struct connection *conn, int is_safe) +{ + struct eb_root *tree = is_safe ? &srv->per_thr[tid].safe_conns : + &srv->per_thr[tid].idle_conns; + + /* first insert in idle or safe tree. */ + eb64_insert(tree, &conn->hash_node->node); + + /* insert in list sorted by connection usage. */ + LIST_APPEND(&srv->per_thr[tid].idle_conn_list, &conn->idle_list); +} + +/* This adds an idle connection to the server's list if the connection is + * reusable, not held by any owner anymore, but still has available streams. + */ +int srv_add_to_idle_list(struct server *srv, struct connection *conn, int is_safe) +{ + /* we try to keep the connection in the server's idle list + * if we don't have too many FD in use, and if the number of + * idle+current conns is lower than what was observed before + * last purge, or if we already don't have idle conns for the + * current thread and we don't exceed last count by global.nbthread. 
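+ * Illustrative numbers (editorial note): with curr_used_conns=10, + * est_need_conns=8 and low_idle_conns=4, a connection is parked as long as + * used+idle <= MAX(10, 8) + 4 = 14, i.e. while at most 4 connections are idle.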
+ */ + if (!(conn->flags & CO_FL_PRIVATE) && + srv && srv->pool_purge_delay > 0 && + ((srv->proxy->options & PR_O_REUSE_MASK) != PR_O_REUSE_NEVR) && + ha_used_fds < global.tune.pool_high_count && + (srv->max_idle_conns == -1 || srv->max_idle_conns > srv->curr_idle_conns) && + ((eb_is_empty(&srv->per_thr[tid].safe_conns) && + (is_safe || eb_is_empty(&srv->per_thr[tid].idle_conns))) || + (ha_used_fds < global.tune.pool_low_count && + (srv->curr_used_conns + srv->curr_idle_conns <= + MAX(srv->curr_used_conns, srv->est_need_conns) + srv->low_idle_conns || + (conn->flags & CO_FL_REVERSED)))) && + !conn->mux->used_streams(conn) && conn->mux->avail_streams(conn)) { + int retadd; + + retadd = _HA_ATOMIC_ADD_FETCH(&srv->curr_idle_conns, 1); + if (retadd > srv->max_idle_conns) { + _HA_ATOMIC_DEC(&srv->curr_idle_conns); + return 0; + } + _HA_ATOMIC_DEC(&srv->curr_used_conns); + + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + conn_delete_from_tree(conn); + + if (is_safe) { + conn->flags = (conn->flags & ~CO_FL_LIST_MASK) | CO_FL_SAFE_LIST; + _srv_add_idle(srv, conn, 1); + _HA_ATOMIC_INC(&srv->curr_safe_nb); + } else { + conn->flags = (conn->flags & ~CO_FL_LIST_MASK) | CO_FL_IDLE_LIST; + _srv_add_idle(srv, conn, 0); + _HA_ATOMIC_INC(&srv->curr_idle_nb); + } + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + _HA_ATOMIC_INC(&srv->curr_idle_thr[tid]); + + __ha_barrier_full(); + if ((volatile void *)srv->idle_node.node.leaf_p == NULL) { + HA_SPIN_LOCK(OTHER_LOCK, &idle_conn_srv_lock); + if ((volatile void *)srv->idle_node.node.leaf_p == NULL) { + srv->idle_node.key = tick_add(srv->pool_purge_delay, + now_ms); + eb32_insert(&idle_conn_srv, &srv->idle_node); + if (!task_in_wq(idle_conn_task) && + !task_in_rq(idle_conn_task)) { + task_schedule(idle_conn_task, + srv->idle_node.key); + } + + } + HA_SPIN_UNLOCK(OTHER_LOCK, &idle_conn_srv_lock); + } + return 1; + } + return 0; +} + +/* Insert the <conn> connection into the <srv> server's available list. This is + * reserved for backend connections currently in use with usable streams left. + */ +void srv_add_to_avail_list(struct server *srv, struct connection *conn) +{ + /* connection cannot be in idle list if used as an avail idle conn. */ + BUG_ON(LIST_INLIST(&conn->idle_list)); + eb64_insert(&srv->per_thr[tid].avail_conns, &conn->hash_node->node); +} + +struct task *srv_cleanup_idle_conns(struct task *task, void *context, unsigned int state) +{ + struct server *srv; + struct eb32_node *eb; + int i; + unsigned int next_wakeup; + + next_wakeup = TICK_ETERNITY; + HA_SPIN_LOCK(OTHER_LOCK, &idle_conn_srv_lock); + while (1) { + int exceed_conns; + int to_kill; + int curr_idle; + + eb = eb32_lookup_ge(&idle_conn_srv, now_ms - TIMER_LOOK_BACK); + if (!eb) { + /* we might have reached the end of the tree, typically because + * <now_ms> is in the first half and we're first scanning the last + * half. Let's loop back to the beginning of the tree now. + */ + + eb = eb32_first(&idle_conn_srv); + if (likely(!eb)) + break; + } + if (tick_is_lt(now_ms, eb->key)) { + /* timer not expired yet, revisit it later */ + next_wakeup = eb->key; + break; + } + srv = eb32_entry(eb, struct server, idle_node); + + /* Calculate how many idle connections we want to kill: + * we want to remove half the difference between the total + * of established connections (used or idle) and the max + * number of used connections.
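+ * Illustrative numbers (editorial note): with curr_used_conns=4, + * curr_idle_conns=10 and MAX(max_used_conns, est_need_conns)=6, the excess is + * 4+10-6 = 8, so 8/2 = 4 connections are scheduled for removal on this pass.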
+ */ + curr_idle = srv->curr_idle_conns; + if (curr_idle == 0) + goto remove; + exceed_conns = srv->curr_used_conns + curr_idle - MAX(srv->max_used_conns, srv->est_need_conns); + exceed_conns = to_kill = exceed_conns / 2 + (exceed_conns & 1); + + srv->est_need_conns = (srv->est_need_conns + srv->max_used_conns) / 2; + if (srv->est_need_conns < srv->max_used_conns) + srv->est_need_conns = srv->max_used_conns; + + HA_ATOMIC_STORE(&srv->max_used_conns, srv->curr_used_conns); + + if (exceed_conns <= 0) + goto remove; + + /* check all threads starting with ours */ + for (i = tid;;) { + int max_conn; + int j; + int did_remove = 0; + + max_conn = (exceed_conns * srv->curr_idle_thr[i]) / + curr_idle + 1; + + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[i].idle_conns_lock); + j = srv_migrate_conns_to_remove(&srv->per_thr[i].idle_conn_list, &idle_conns[i].toremove_conns, max_conn); + if (j > 0) + did_remove = 1; + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[i].idle_conns_lock); + + if (did_remove) + task_wakeup(idle_conns[i].cleanup_task, TASK_WOKEN_OTHER); + + if ((i = ((i + 1 == global.nbthread) ? 0 : i + 1)) == tid) + break; + } +remove: + eb32_delete(&srv->idle_node); + + if (srv->curr_idle_conns) { + /* There are still more idle connections, add the + * server back in the tree. + */ + srv->idle_node.key = tick_add(srv->pool_purge_delay, now_ms); + eb32_insert(&idle_conn_srv, &srv->idle_node); + next_wakeup = tick_first(next_wakeup, srv->idle_node.key); + } + } + HA_SPIN_UNLOCK(OTHER_LOCK, &idle_conn_srv_lock); + + task->expire = next_wakeup; + return task; +} + +/* Close remaining idle connections. This function is designed to be run on + * process shutdown. This guarantees a proper socket shutdown to avoid + * TIME_WAIT state. For a quick operation, only the ctrl layer is closed, the + * xprt stack is bypassed. + * + * This function is not thread-safe so it must only be called via a global + * deinit function.
+ */ +static void srv_close_idle_conns(struct server *srv) +{ + struct eb_root **cleaned_tree; + int i; + + for (i = 0; i < global.nbthread; ++i) { + struct eb_root *conn_trees[] = { + &srv->per_thr[i].idle_conns, + &srv->per_thr[i].safe_conns, + &srv->per_thr[i].avail_conns, + NULL + }; + + for (cleaned_tree = conn_trees; *cleaned_tree; ++cleaned_tree) { + while (!eb_is_empty(*cleaned_tree)) { + struct ebmb_node *node = ebmb_first(*cleaned_tree); + struct conn_hash_node *conn_hash_node = ebmb_entry(node, struct conn_hash_node, node); + struct connection *conn = conn_hash_node->conn; + + if (conn->ctrl->ctrl_close) + conn->ctrl->ctrl_close(conn); + conn_delete_from_tree(conn); + } + } + } +} + +REGISTER_SERVER_DEINIT(srv_close_idle_conns); + +/* config parser for global "tune.idle-pool.shared", accepts "on" or "off" */ +static int cfg_parse_idle_pool_shared(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + if (too_many_args(1, args, err, NULL)) + return -1; + + if (strcmp(args[1], "on") == 0) + global.tune.options |= GTUNE_IDLE_POOL_SHARED; + else if (strcmp(args[1], "off") == 0) + global.tune.options &= ~GTUNE_IDLE_POOL_SHARED; + else { + memprintf(err, "'%s' expects either 'on' or 'off' but got '%s'.", args[0], args[1]); + return -1; + } + return 0; +} + +/* config parser for global "tune.pool-{low,high}-fd-ratio" */ +static int cfg_parse_pool_fd_ratio(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + int arg = -1; + + if (too_many_args(1, args, err, NULL)) + return -1; + + if (*(args[1]) != 0) + arg = atoi(args[1]); + + if (arg < 0 || arg > 100) { + memprintf(err, "'%s' expects an integer argument between 0 and 100.", args[0]); + return -1; + } + + if (args[0][10] == 'h') + global.tune.pool_high_ratio = arg; + else + global.tune.pool_low_ratio = arg; + return 0; +} + +/* config keyword parsers */ +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_GLOBAL, "tune.idle-pool.shared", cfg_parse_idle_pool_shared }, + { CFG_GLOBAL, "tune.pool-high-fd-ratio", cfg_parse_pool_fd_ratio }, + { CFG_GLOBAL, "tune.pool-low-fd-ratio", cfg_parse_pool_fd_ratio }, + { 0, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/server_state.c b/src/server_state.c new file mode 100644 index 0000000..ebdcf3c --- /dev/null +++ b/src/server_state.c @@ -0,0 +1,947 @@ +/* + * Server-state management functions. + * + * Copyright (C) 2021 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <errno.h> + +#include <import/eb64tree.h> +#include <import/ebistree.h> + +#include <haproxy/api.h> +#include <haproxy/backend.h> +#include <haproxy/cfgparse.h> +#include <haproxy/check.h> +#include <haproxy/errors.h> +#include <haproxy/global.h> +#include <haproxy/log.h> +#include <haproxy/port_range.h> +#include <haproxy/proxy.h> +#include <haproxy/resolvers.h> +#include <haproxy/server.h> +#include <haproxy/tools.h> +#include <haproxy/xxhash.h> + + +/* Update a server state using the parameters available in the params list. 
+ * The caller must provide a supported version + * Grabs the server lock during operation. + */ +static void srv_state_srv_update(struct server *srv, int version, char **params) +{ + char *p; + struct buffer *msg; + const char *warning; + + /* fields since version 1 + * and common to all other upcoming versions + */ + enum srv_state srv_op_state; + enum srv_admin srv_admin_state; + unsigned srv_uweight, srv_iweight; + unsigned long srv_last_time_change; + short srv_check_status; + enum chk_result srv_check_result; + int srv_check_health; + int srv_check_state, srv_agent_state; + int bk_f_forced_id; + int srv_f_forced_id; + int fqdn_set_by_cli; + const char *fqdn; + const char *port_st; + unsigned int port_svc; + char *srvrecord; + char *addr; + int partial_apply = 0; +#ifdef USE_OPENSSL + int use_ssl; +#endif + + fqdn = NULL; + port_svc = 0; + msg = alloc_trash_chunk(); + if (!msg) + goto end; + + HA_SPIN_LOCK(SERVER_LOCK, &srv->lock); + + /* Only version 1 supported for now, don't check it. Fields are : + * srv_addr: params[0] + * srv_op_state: params[1] + * srv_admin_state: params[2] + * srv_uweight: params[3] + * srv_iweight: params[4] + * srv_last_time_change: params[5] + * srv_check_status: params[6] + * srv_check_result: params[7] + * srv_check_health: params[8] + * srv_check_state: params[9] + * srv_agent_state: params[10] + * bk_f_forced_id: params[11] + * srv_f_forced_id: params[12] + * srv_fqdn: params[13] + * srv_port: params[14] + * srvrecord: params[15] + * srv_use_ssl: params[16] + * srv_check_port: params[17] + * srv_check_addr: params[18] + * srv_agent_addr: params[19] + * srv_agent_port: params[20] + */ + + /* validating srv_op_state */ + p = NULL; + errno = 0; + srv_op_state = strtol(params[1], &p, 10); + if ((p == params[1]) || errno == EINVAL || errno == ERANGE || + (srv_op_state != SRV_ST_STOPPED && + srv_op_state != SRV_ST_STARTING && + srv_op_state != SRV_ST_RUNNING && + srv_op_state != SRV_ST_STOPPING)) { + chunk_appendf(msg, ", invalid srv_op_state value '%s'", params[1]); + } + + /* validating srv_admin_state */ + p = NULL; + errno = 0; + srv_admin_state = strtol(params[2], &p, 10); + fqdn_set_by_cli = !!(srv_admin_state & SRV_ADMF_HMAINT); + + /* inherited statuses will be recomputed later. + * Also disable SRV_ADMF_HMAINT flag (set from stats socket fqdn). 
+ */ + srv_admin_state &= ~SRV_ADMF_IDRAIN & ~SRV_ADMF_IMAINT & ~SRV_ADMF_HMAINT & ~SRV_ADMF_RMAINT; + + if ((p == params[2]) || errno == EINVAL || errno == ERANGE || + (srv_admin_state != 0 && + srv_admin_state != SRV_ADMF_FMAINT && + srv_admin_state != SRV_ADMF_CMAINT && + srv_admin_state != (SRV_ADMF_CMAINT | SRV_ADMF_FMAINT) && + srv_admin_state != (SRV_ADMF_CMAINT | SRV_ADMF_FDRAIN) && + srv_admin_state != SRV_ADMF_FDRAIN)) { + chunk_appendf(msg, ", invalid srv_admin_state value '%s'", params[2]); + } + + /* validating srv_uweight */ + p = NULL; + errno = 0; + srv_uweight = strtol(params[3], &p, 10); + if ((p == params[3]) || errno == EINVAL || errno == ERANGE || (srv_uweight > SRV_UWGHT_MAX)) + chunk_appendf(msg, ", invalid srv_uweight value '%s'", params[3]); + + /* validating srv_iweight */ + p = NULL; + errno = 0; + srv_iweight = strtol(params[4], &p, 10); + if ((p == params[4]) || errno == EINVAL || errno == ERANGE || (srv_iweight > SRV_UWGHT_MAX)) + chunk_appendf(msg, ", invalid srv_iweight value '%s'", params[4]); + + /* validating srv_last_time_change */ + p = NULL; + errno = 0; + srv_last_time_change = strtol(params[5], &p, 10); + if ((p == params[5]) || errno == EINVAL || errno == ERANGE) + chunk_appendf(msg, ", invalid srv_last_time_change value '%s'", params[5]); + + /* validating srv_check_status */ + p = NULL; + errno = 0; + srv_check_status = strtol(params[6], &p, 10); + if (p == params[6] || errno == EINVAL || errno == ERANGE || + (srv_check_status >= HCHK_STATUS_SIZE)) + chunk_appendf(msg, ", invalid srv_check_status value '%s'", params[6]); + + /* validating srv_check_result */ + p = NULL; + errno = 0; + srv_check_result = strtol(params[7], &p, 10); + if ((p == params[7]) || errno == EINVAL || errno == ERANGE || + (srv_check_result != CHK_RES_UNKNOWN && + srv_check_result != CHK_RES_NEUTRAL && + srv_check_result != CHK_RES_FAILED && + srv_check_result != CHK_RES_PASSED && + srv_check_result != CHK_RES_CONDPASS)) { + chunk_appendf(msg, ", invalid srv_check_result value '%s'", params[7]); + } + + /* validating srv_check_health */ + p = NULL; + errno = 0; + srv_check_health = strtol(params[8], &p, 10); + if (p == params[8] || errno == EINVAL || errno == ERANGE) + chunk_appendf(msg, ", invalid srv_check_health value '%s'", params[8]); + + /* validating srv_check_state */ + p = NULL; + errno = 0; + srv_check_state = strtol(params[9], &p, 10); + if (p == params[9] || errno == EINVAL || errno == ERANGE || + (srv_check_state & ~(CHK_ST_INPROGRESS | CHK_ST_CONFIGURED | CHK_ST_ENABLED | CHK_ST_PAUSED | CHK_ST_AGENT))) + chunk_appendf(msg, ", invalid srv_check_state value '%s'", params[9]); + + /* validating srv_agent_state */ + p = NULL; + errno = 0; + srv_agent_state = strtol(params[10], &p, 10); + if (p == params[10] || errno == EINVAL || errno == ERANGE || + (srv_agent_state & ~(CHK_ST_INPROGRESS | CHK_ST_CONFIGURED | CHK_ST_ENABLED | CHK_ST_PAUSED | CHK_ST_AGENT))) + chunk_appendf(msg, ", invalid srv_agent_state value '%s'", params[10]); + + /* validating bk_f_forced_id */ + p = NULL; + errno = 0; + bk_f_forced_id = strtol(params[11], &p, 10); + if (p == params[11] || errno == EINVAL || errno == ERANGE || !((bk_f_forced_id == 0) || (bk_f_forced_id == 1))) + chunk_appendf(msg, ", invalid bk_f_forced_id value '%s'", params[11]); + + /* validating srv_f_forced_id */ + p = NULL; + errno = 0; + srv_f_forced_id = strtol(params[12], &p, 10); + if (p == params[12] || errno == EINVAL || errno == ERANGE || !((srv_f_forced_id == 0) || (srv_f_forced_id == 1))) + chunk_appendf(msg, ", 
invalid srv_f_forced_id value '%s'", params[12]); + + /* validating srv_fqdn */ + fqdn = params[13]; + if (fqdn && *fqdn == '-') + fqdn = NULL; + if (fqdn && (strlen(fqdn) > DNS_MAX_NAME_SIZE || invalid_domainchar(fqdn))) { + chunk_appendf(msg, ", invalid srv_fqdn value '%s'", params[13]); + fqdn = NULL; + } + + port_st = params[14]; + if (port_st) { + port_svc = strl2uic(port_st, strlen(port_st)); + if (port_svc > USHRT_MAX) { + chunk_appendf(msg, ", invalid srv_port value '%s'", port_st); + port_st = NULL; + } + } + + /* SRV record + * NOTE: in HAProxy, SRV records must start with an underscore '_' + */ + srvrecord = params[15]; + if (srvrecord && *srvrecord != '_') + srvrecord = NULL; + + /* don't apply anything if one error has been detected */ + if (msg->data) + goto out; + partial_apply = 1; + + /* recover operational state and apply it to this server + * and all servers tracking this one */ + srv->check.health = srv_check_health; + switch (srv_op_state) { + case SRV_ST_STOPPED: + srv->check.health = 0; + srv_set_stopped(srv, SRV_OP_STCHGC_STATEFILE); + break; + case SRV_ST_STARTING: + /* If rise == 1 there is no STARTING state, let's switch to + * RUNNING + */ + if (srv->check.rise == 1) { + srv->check.health = srv->check.rise + srv->check.fall - 1; + srv_set_running(srv, SRV_OP_STCHGC_NONE); + break; + } + if (srv->check.health < 1 || srv->check.health >= srv->check.rise) + srv->check.health = srv->check.rise - 1; + srv->next_state = srv_op_state; + break; + case SRV_ST_STOPPING: + /* If fall == 1 there is no STOPPING state, let's switch to + * STOPPED + */ + if (srv->check.fall == 1) { + srv->check.health = 0; + srv_set_stopped(srv, SRV_OP_STCHGC_STATEFILE); + break; + } + if (srv->check.health < srv->check.rise || + srv->check.health > srv->check.rise + srv->check.fall - 2) + srv->check.health = srv->check.rise; + srv_set_stopping(srv, SRV_OP_STCHGC_STATEFILE); + break; + case SRV_ST_RUNNING: + srv->check.health = srv->check.rise + srv->check.fall - 1; + srv_set_running(srv, SRV_OP_STCHGC_NONE); + break; + } + + /* When applying server state, the following rules apply: + * - in case of a configuration change, we apply the setting from the new + * configuration, regardless of old running state + * - if no configuration change, we apply old running state only if old running + * state is different from new configuration state + */ + /* configuration has changed */ + if ((srv_admin_state & SRV_ADMF_CMAINT) != (srv->next_admin & SRV_ADMF_CMAINT)) { + if (srv->next_admin & SRV_ADMF_CMAINT) + srv_adm_set_maint(srv); + else + srv_adm_set_ready(srv); + } + /* configuration is the same, let's compare old running state and new conf state */ + else { + if (srv_admin_state & SRV_ADMF_FMAINT && !(srv->next_admin & SRV_ADMF_CMAINT)) + srv_adm_set_maint(srv); + else if (!(srv_admin_state & SRV_ADMF_FMAINT) && (srv->next_admin & SRV_ADMF_CMAINT)) + srv_adm_set_ready(srv); + } + /* apply drain mode if server is currently enabled */ + if (!(srv->next_admin & SRV_ADMF_FMAINT) && (srv_admin_state & SRV_ADMF_FDRAIN)) { + /* The SRV_ADMF_FDRAIN flag is inherited when srv->iweight is 0 + * (srv->iweight is the weight set up in configuration).
+ * There are two possible reasons for FDRAIN to have been present: + * - previous config weight was zero + * - "set server b/s drain" was sent to the CLI + * + * In the first case, we simply want to drop this drain state + * if the new weight is not zero anymore, meaning the administrator + * has intentionally turned the weight back to a positive value to + * enable the server again after an operation. In the second case, + * the drain state was forced on the CLI regardless of the config's + * weight so we don't want a change to the config weight to lose this + * status. What this means is: + * - if previous weight was 0 and new one is >0, drop the DRAIN state. + * - if the previous weight was >0, keep it. + */ + if (srv_iweight > 0 || srv->iweight == 0) + srv_adm_set_drain(srv); + } + + srv->last_change = ns_to_sec(now_ns) - srv_last_time_change; + srv->check.status = srv_check_status; + srv->check.result = srv_check_result; + + /* The only case we want to apply is removing the ENABLED flag, which + * could have been done by the "disable health" command over the stats socket + */ + if ((srv->check.state & CHK_ST_CONFIGURED) && + (srv_check_state & CHK_ST_CONFIGURED) && + !(srv_check_state & CHK_ST_ENABLED)) + srv->check.state &= ~CHK_ST_ENABLED; + + /* The only case we want to apply is removing the ENABLED flag, which + * could have been done by the "disable agent" command over the stats socket + */ + if ((srv->agent.state & CHK_ST_CONFIGURED) && + (srv_agent_state & CHK_ST_CONFIGURED) && + !(srv_agent_state & CHK_ST_ENABLED)) + srv->agent.state &= ~CHK_ST_ENABLED; + + /* We want to apply the previous 'running' weight (srv_uweight) only if there + * was no change in the configuration: both previous and new iweight are equal. + * + * This means that a configuration file change takes precedence over a unix + * socket change for the server's weight. + * + * by default, HAProxy applies the following weight when parsing the configuration + * srv->uweight = srv->iweight + */ + if (srv_iweight == srv->iweight) { + srv->uweight = srv_uweight; + } + server_recalc_eweight(srv, 1); + + /* load server IP address */ + if (strcmp(params[0], "-") != 0) + srv->lastaddr = strdup(params[0]); + + if (fqdn && srv->hostname) { + if (strcmp(srv->hostname, fqdn) == 0) { + /* Here we reset the 'set from stats socket FQDN' flag + * to support such transitions: + * Let's say initial FQDN value is foo1 (in configuration file). + * - FQDN changed from stats socket, from foo1 to foo2 value, + * - FQDN changed again from file configuration (with the same previous value + set from stats socket, from foo1 to foo2 value), + * - reload for any other reason than a FQDN modification, + * the configuration file FQDN matches the fqdn server state file value. + * So we must reset the 'set from stats socket FQDN' flag to be consistent with + * any further FQDN modification. + */ + srv->next_admin &= ~SRV_ADMF_HMAINT; + } + else { + /* If the FQDN has been changed from stats socket, + * apply fqdn state file value (which is the value set + * from stats socket). + * Also ensure the runtime resolver will process this resolution.
+ */ + if (fqdn_set_by_cli) { + srv_set_fqdn(srv, fqdn, 0); + srv->flags &= ~SRV_F_NO_RESOLUTION; + srv->next_admin |= SRV_ADMF_HMAINT; + } + } + } + /* If all the conditions below are validated, this means + * we're evaluating a server managed by SRV resolution + */ + else if (fqdn && !srv->hostname && srvrecord) { + int res; + int i; + char *tmp; + + /* we can't apply previous state if SRV record has changed */ + if (!srv->srvrq) { + chunk_appendf(msg, ", no SRV resolution for server '%s'. Previous state not applied", srv->id); + goto out; + } + if (strcmp(srv->srvrq->name, srvrecord) != 0) { + chunk_appendf(msg, ", SRV record mismatch between configuration ('%s') and state file ('%s') for server '%s'. Previous state not applied", srv->srvrq->name, srvrecord, srv->id); + goto out; + } + + /* prepare DNS resolution for this server */ + res = srv_prepare_for_resolution(srv, fqdn); + if (res == -1) { + chunk_appendf(msg, ", can't allocate memory for DNS resolution for server '%s'", srv->id); + goto out; + } + + /* Remove from available list and insert in tree + * since this server has a hostname + */ + LIST_DEL_INIT(&srv->srv_rec_item); + srv->host_dn.key = tmp = strdup(srv->hostname_dn); + + /* convert the key to lowercase because the tree + * lookup is case-sensitive and case must not matter + */ + for (i = 0; tmp[i]; i++) + tmp[i] = tolower(tmp[i]); + + /* insert in tree and set the srvrq expiration date */ + ebis_insert(&srv->srvrq->named_servers, &srv->host_dn); + task_schedule(srv->srvrq_check, tick_add(now_ms, srv->srvrq->resolvers->hold.timeout)); + + /* Unset SRV_F_MAPPORTS for SRV records. + * SRV_F_MAPPORTS is unfortunately set by parse_server() + * because no ports are provided in the configuration file. + * This is because HAProxy will use the port found in the SRV record. + */ + srv->flags &= ~SRV_F_MAPPORTS; + } + + if (port_st) + srv->svc_port = port_svc; + + + if (params[16]) { +#ifdef USE_OPENSSL + use_ssl = strtol(params[16], &p, 10); + + /* configure ssl if connection has been initiated at startup */ + if (srv->ssl_ctx.ctx != NULL) + srv_set_ssl(srv, use_ssl); +#endif + } + + port_st = NULL; + if (params[17] && strcmp(params[17], "0") != 0) + port_st = params[17]; + addr = NULL; + if (params[18] && strcmp(params[18], "-") != 0) + addr = params[18]; + if (addr || port_st) { + warning = srv_update_check_addr_port(srv, addr, port_st); + if (warning) { + chunk_appendf(msg, ", %s", warning); + goto out; + } + } + + port_st = NULL; + if (params[20] && strcmp(params[20], "0") != 0) + port_st = params[20]; + addr = NULL; + if (params[19] && strcmp(params[19], "-") != 0) + addr = params[19]; + if (addr || port_st) { + warning = srv_update_agent_addr_port(srv, addr, port_st); + if (warning) { + chunk_appendf(msg, ", %s", warning); + goto out; + } + } + + out: + HA_SPIN_UNLOCK(SERVER_LOCK, &srv->lock); + if (msg->data) { + if (partial_apply == 1) + ha_warning("server-state partially applied for server '%s/%s'%s\n", + srv->proxy->id, srv->id, msg->area); + else + ha_warning("server-state application failed for server '%s/%s'%s\n", + srv->proxy->id, srv->id, msg->area); + } + end: + free_trash_chunk(msg); +} + +/* + * Loop on the proxy's servers and try to load each one's state from <st_tree> + * using srv_state_srv_update(). The proxy name and the server name are + * concatenated to form the key. If found, the entry is removed from the tree.
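+ * (Editorial note: the key is XXH3("<px->id> <srv->id>"), which must match + * the insertion side in srv_state_parse_and_store_line() below.)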
+ */
+static void srv_state_px_update(const struct proxy *px, int vsn, struct eb_root *st_tree)
+{
+	struct server_state_line *st_line;
+	struct eb64_node *node;
+	struct server *srv;
+	unsigned long key;
+
+	for (srv = px->srv; srv; srv = srv->next) {
+		chunk_printf(&trash, "%s %s", px->id, srv->id);
+		key = XXH3(trash.area, trash.data, 0);
+		node = eb64_lookup(st_tree, key);
+		if (!node)
+			continue; /* next server */
+		st_line = eb64_entry(node, typeof(*st_line), node);
+		srv_state_srv_update(srv, vsn, st_line->params+4);
+
+		/* the node may be released now */
+		eb64_delete(node);
+		free(st_line->line);
+		free(st_line);
+	}
+}
+
+/*
+ * Read the next line from file <f> and return the server state version if one
+ * is found. If the file is empty, then -1 is returned.
+ * If no version is found, then 0 is returned.
+ * Note that this should be the first read on <f>.
+ */
+static int srv_state_get_version(FILE *f) {
+	char mybuf[SRV_STATE_LINE_MAXLEN];
+	char *endptr;
+	long int vsn;
+
+	/* the first character of the first line of the file must contain the version of the export */
+	if (fgets(mybuf, SRV_STATE_LINE_MAXLEN, f) == NULL)
+		return -1;
+
+	vsn = strtol(mybuf, &endptr, 10);
+	if (endptr == mybuf || *endptr != '\n') {
+		/* Empty or truncated line */
+		return 0;
+	}
+
+	if (vsn < SRV_STATE_FILE_VERSION_MIN || vsn > SRV_STATE_FILE_VERSION_MAX) {
+		/* Wrong version number */
+		return 0;
+	}
+
+	return vsn;
+}
+
+
+/*
+ * Parses the server state line stored in <buf>, supposedly in version <version>.
+ * Sets <params> accordingly on success. It returns 1 on success, 0 if the line
+ * must be ignored and -1 on error.
+ * The caller must provide a supported version.
+ */
+static int srv_state_parse_line(char *buf, const int version, char **params)
+{
+	int buflen, arg, ret;
+	char *cur;
+
+	buflen = strlen(buf);
+	cur = buf;
+	ret = 1; /* be optimistic and pretend a success */
+
+	/* we need at least one character and a non-truncated line */
+	if (buflen == 0 || buf[buflen - 1] != '\n') {
+		ret = -1;
+		goto out;
+	}
+
+	/* skip blank characters at the beginning of the line */
+	while (*cur == ' ' || *cur == '\t')
+		++cur;
+
+	/* ignore empty or commented lines */
+	if (!*cur || *cur == '\n' || *cur == '#') {
+		ret = 0;
+		goto out;
+	}
+
+	/* Remove the trailing '\n' to ease parsing */
+	buf[buflen - 1] = '\0';
+
+	/* we're now ready to move the line into <params> */
+	memset(params, 0, SRV_STATE_FILE_MAX_FIELDS * sizeof(*params));
+	arg = 0;
+	while (*cur) {
+		/* first of all, stop if there are too many fields */
+		if (arg >= SRV_STATE_FILE_MAX_FIELDS)
+			break;
+
+		/* then skip leading spaces */
+		while (*cur && (*cur == ' ' || *cur == '\t')) {
+			++cur;
+			if (!*cur)
+				break;
+		}
+
+		/*
+		 * idx:
+		 *   be_id:                params[0]
+		 *   be_name:              params[1]
+		 *   srv_id:               params[2]
+		 *   srv_name:             params[3]
+		 * v1
+		 *   srv_addr:             params[4]
+		 *   srv_op_state:         params[5]
+		 *   srv_admin_state:      params[6]
+		 *   srv_uweight:          params[7]
+		 *   srv_iweight:          params[8]
+		 *   srv_last_time_change: params[9]
+		 *   srv_check_status:     params[10]
+		 *   srv_check_result:     params[11]
+		 *   srv_check_health:     params[12]
+		 *   srv_check_state:      params[13]
+		 *   srv_agent_state:      params[14]
+		 *   bk_f_forced_id:       params[15]
+		 *   srv_f_forced_id:      params[16]
+		 *   srv_fqdn:             params[17]
+		 *   srv_port:             params[18]
+		 *   srvrecord:            params[19]
+		 *
+		 *   srv_use_ssl:          params[20] (optional field)
+		 *   srv_check_port:       params[21] (optional field)
+		 *   srv_check_addr:       params[22] (optional field)
+		 *   srv_agent_addr:       params[23] (optional field)
+		 *   srv_agent_port:       params[24] (optional field)
+		 */
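+		/* Note that srv_state_srv_update() receives <st_line->params + 4>,
+		 * so inside that function params[0] is srv_addr, params[16] is
+		 * srv_use_ssl and params[20] is srv_agent_port, i.e. the indexes
+		 * listed above minus 4.
+		 */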
+		params[arg++] = cur;
+
+		/* look for the end of the current field */
+		while (*cur && *cur != ' ' && *cur != '\t') {
+			++cur;
+			if (!*cur)
+				break;
+		}
+
+		/* otherwise, cut the field and move to the next one */
+		*cur++ = '\0';
+	}
+
+	/* if the number of fields does not match the version, then return an error */
+	if (version == 1 &&
+	    (arg < SRV_STATE_FILE_MIN_FIELDS_VERSION_1 ||
+	     arg > SRV_STATE_FILE_MAX_FIELDS_VERSION_1))
+		ret = -1;
+
+ out:
+	return ret;
+}
+
+
+/*
+ * Parses a server state line using srv_state_parse_line() and stores the result
+ * in <st_tree>. If an error occurs during parsing, the line is ignored. If <px>
+ * is defined, it is used to check the backend id/name against the parsed params
+ * and to compute the key of the line.
+ */
+static int srv_state_parse_and_store_line(char *line, int vsn, struct eb_root *st_tree,
+                                          struct proxy *px)
+{
+	struct server_state_line *st_line;
+	int ret = 0;
+
+	/* store the line in the tree and duplicate it */
+	st_line = calloc(1, sizeof(*st_line));
+	if (st_line == NULL)
+		goto skip_line;
+	st_line->line = strdup(line);
+	if (st_line->line == NULL)
+		goto skip_line;
+
+	ret = srv_state_parse_line(st_line->line, vsn, st_line->params);
+	if (ret <= 0)
+		goto skip_line;
+
+	/* Check the backend name against the params if <px> is defined */
+	if (px) {
+		int check_id = (atoi(st_line->params[0]) == px->uuid);
+		int check_name = (strcmp(px->id, st_line->params[1]) == 0);
+		int bk_f_forced_id = (atoi(st_line->params[15]) & PR_O_FORCED_ID);
+
+
+		if (!check_id && !check_name) {
+			/* backend does not match at all: skip the line */
+			goto skip_line;
+		}
+		else if (!check_id) {
+			/* Id mismatch: warn but continue */
+			ha_warning("Proxy '%s': backend ID mismatch: from server state file: '%s', from running config '%d'\n",
+				   px->id, st_line->params[0], px->uuid);
+			send_log(px, LOG_NOTICE, "backend ID mismatch: from server state file: '%s', from running config '%d'\n",
+				 st_line->params[0], px->uuid);
+		}
+		else if (!check_name) {
+			/* Name mismatch: warn and skip the line, except if the backend id was forced
+			 * in the previous configuration */
+			ha_warning("Proxy '%s': backend name mismatch: from server state file: '%s', from running config '%s'\n",
+				   px->id, st_line->params[1], px->id);
+			send_log(px, LOG_NOTICE, "backend name mismatch: from server state file: '%s', from running config '%s'\n",
+				 st_line->params[1], px->id);
+			if (!bk_f_forced_id)
+				goto skip_line;
+		}
+	}
+
+	/*
+	 * The key: "be_name srv_name"
+	 *   if <px> is defined: be_name == px->id
+	 *   otherwise:          be_name == params[1]
+	 */
+	chunk_printf(&trash, "%s %s", (px ? px->id : st_line->params[1]), st_line->params[3]);
+	st_line->node.key = XXH3(trash.area, trash.data, 0);
+	if (eb64_insert(st_tree, &st_line->node) != &st_line->node) {
+		/* this is a duplicate key, probably a hand-crafted file, drop it! */
+		goto skip_line;
+	}
+
+	return ret;
+
+ skip_line:
+	/* free up memory in case of error during the processing of the line */
+	if (st_line) {
+		free(st_line->line);
+		free(st_line);
+	}
+	return ret;
+}
+
+/* Helper function to get the server-state file path.
+ * If <filename> starts with a '/', it is considered an absolute path. In
+ * this case, or if <global.server_state_base> is not set, only <filename> is
+ * considered. Otherwise, <global.server_state_base> is concatenated to
+ * <filename> to produce the file path, which is copied to <dst_path>. In both
+ * cases, the result must not exceed <maxpathlen>.
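+ *
+ * For example (illustrative values only):
+ *   server_state_base "/var/lib/haproxy" + filename "server-state"
+ *     => "/var/lib/haproxy/server-state"
+ *   filename "/etc/haproxy/server-state" (absolute) => used as-is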
+ *
+ * The len is returned on success or -1 if the path is too long. On error, the
+ * caller must not rely on <dst_path>.
+ */
+static inline int srv_state_get_filepath(char *dst_path, int maxpathlen, const char *filename)
+{
+	char *sep;
+	int len = 0;
+
+	/* create the globalfilepath variable */
+	if (*filename == '/' || !global.server_state_base) {
+		/* absolute path or no base directory provided */
+		len = strlcpy2(dst_path, filename, maxpathlen);
+	}
+	else {
+		/* concat the base directory and the global server-state file */
+		sep = (global.server_state_base[strlen(global.server_state_base)-1] != '/' ? "/": "");
+		len = snprintf(dst_path, maxpathlen, "%s%s%s", global.server_state_base, sep, filename);
+	}
+	return (len < maxpathlen ? len: -1);
+}
+
+
+/* This function parses all the proxies and only takes care of the backends
+ * (since we're looking for servers). For each proxy, it does the following:
+ *   - opens its server state file (either the global one or a local one)
+ *   - reads the whole file, line by line
+ *   - analyses each line to check if it matches our current backend:
+ *     - backend name matches
+ *     - backend id matches if the id is forced and the name doesn't match
+ *   - if the server pointed to by the line is found, then the state is applied
+ *
+ * If the running backend uuid or id differs from the state file, then HAProxy
+ * reports a warning.
+ *
+ * Grabs the server's lock via srv_state_srv_update().
+ */
+void apply_server_state(void)
+{
+	/* tree where the global state_file is loaded */
+	struct eb_root global_state_tree = EB_ROOT_UNIQUE;
+	struct proxy *curproxy;
+	struct server_state_line *st_line;
+	struct eb64_node *node, *next_node;
+	FILE *f;
+	char mybuf[SRV_STATE_LINE_MAXLEN];
+	char file[MAXPATHLEN];
+	int local_vsn, global_vsn, len, linenum;
+
+	global_vsn = 0; /* no global file */
+	if (!global.server_state_file)
+		goto no_globalfile;
+	len = srv_state_get_filepath(file, MAXPATHLEN, global.server_state_file);
+	if (len == -1) {
+		ha_warning("config: Can't load global server state file: file too long.\n");
+		goto no_globalfile;
+	}
+
+	/* Load the global server state in a tree */
+	errno = 0;
+	f = fopen(file, "r");
+	if (!f) {
+		if (errno == ENOENT)
+			ha_notice("config: Can't open global server state file '%s': %s\n", file, strerror(errno));
+		else
+			ha_warning("config: Can't open global server state file '%s': %s\n", file, strerror(errno));
+		goto no_globalfile;
+	}
+
+	global_vsn = srv_state_get_version(f);
+	if (global_vsn < 1) {
+		if (global_vsn == -1)
+			ha_notice("config: Empty global server state file '%s'.\n",
+				  file);
+		if (global_vsn == 0)
+			ha_warning("config: Can't get version of the global server state file '%s'.\n",
+				   file);
+		goto close_globalfile;
+	}
+
+	for (linenum = 1; fgets(mybuf, SRV_STATE_LINE_MAXLEN, f); linenum++) {
+		int ret;
+
+		ret = srv_state_parse_and_store_line(mybuf, global_vsn, &global_state_tree, NULL);
+		if (ret == -1) {
+			ha_warning("config: corrupted global server state file '%s' at line %d.\n",
+				   file, linenum);
+			global_vsn = 0;
+			break;
+		}
+	}
+
+ close_globalfile:
+	fclose(f);
+
+ no_globalfile:
+	/* parse all proxies and load states from the tree (global file) or from a local file */
+	for (curproxy = proxies_list; curproxy != NULL; curproxy = curproxy->next) {
+		struct eb_root local_state_tree = EB_ROOT_UNIQUE;
+
+		/* Must be an enabled backend with at least one server */
+		if (!(curproxy->cap & PR_CAP_BE) || (curproxy->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) || !curproxy->srv)
+			continue; /* next proxy */
+
+		/* Mode must be specified */
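+		/* (Three cases follow: _NONE skips the proxy, _GLOBAL looks the
+		 * servers up in the tree built above, and any other value means
+		 * a local server-state file dedicated to this proxy.)
+		 */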
+		BUG_ON(curproxy->load_server_state_from_file == PR_SRV_STATE_FILE_UNSPEC);
+
+		/* No server-state file for this proxy */
+		if (curproxy->load_server_state_from_file == PR_SRV_STATE_FILE_NONE)
+			continue; /* next proxy */
+
+		if (curproxy->load_server_state_from_file == PR_SRV_STATE_FILE_GLOBAL) {
+			/* When the global file is used, we get the data from the tree.
+			 * Note that in this case we check neither the backend name nor the uuid.
+			 * The backend name can't be wrong since it's used as a key to retrieve
+			 * the server state line from the tree.
+			 */
+			if (global_vsn)
+				srv_state_px_update(curproxy, global_vsn, &global_state_tree);
+			continue; /* next proxy */
+		}
+
+		/*
+		 * Here we load a local server state-file
+		 */
+
+		/* create the file variable */
+		len = srv_state_get_filepath(file, MAXPATHLEN, curproxy->server_state_file_name);
+		if (len == -1) {
+			ha_warning("Proxy '%s': Can't load local server state file: file too long.\n", curproxy->id);
+			continue; /* next proxy */
+		}
+
+		/* Load the local server state in a tree */
+		errno = 0;
+		f = fopen(file, "r");
+		if (!f) {
+			if (errno == ENOENT)
+				ha_notice("Proxy '%s': Can't open server state file '%s': %s.\n",
+					  curproxy->id, file, strerror(errno));
+			else
+				ha_warning("Proxy '%s': Can't open server state file '%s': %s.\n",
+					   curproxy->id, file, strerror(errno));
+			continue; /* next proxy */
+		}
+
+		/* the first character of the first line of the file must contain the version of the export */
+		local_vsn = srv_state_get_version(f);
+		if (local_vsn < 1) {
+			if (local_vsn == -1)
+				ha_notice("Proxy '%s': Empty server state file '%s'.\n",
+					  curproxy->id, file);
+			if (local_vsn == 0)
+				ha_warning("Proxy '%s': Can't get version of the server state file '%s'.\n",
+					   curproxy->id, file);
+			goto close_localfile;
+		}
+
+		/* First, parse the lines of the local server-state file and store them in an eb-tree */
+		for (linenum = 1; fgets(mybuf, SRV_STATE_LINE_MAXLEN, f); linenum++) {
+			int ret;
+
+			ret = srv_state_parse_and_store_line(mybuf, local_vsn, &local_state_tree, curproxy);
+			if (ret == -1) {
+				ha_warning("Proxy '%s': corrupted server state file '%s' at line %d.\n",
+					   curproxy->id, file, linenum);
+				local_vsn = 0;
+				break;
+			}
+		}
+
+		if (local_vsn)
+			srv_state_px_update(curproxy, local_vsn, &local_state_tree);
+
+		/* Remove unused server-state lines */
+		node = eb64_first(&local_state_tree);
+		while (node) {
+			st_line = eb64_entry(node, typeof(*st_line), node);
+			next_node = eb64_next(node);
+			eb64_delete(node);
+
+			if (local_vsn) {
+				/* if no server was found, then warn */
+				ha_warning("Proxy '%s': can't find server '%s' in backend '%s'\n",
+					   curproxy->id, st_line->params[3], curproxy->id);
+				send_log(curproxy, LOG_NOTICE, "can't find server '%s' in backend '%s'\n",
+					 st_line->params[3], curproxy->id);
+			}
+
+			free(st_line->line);
+			free(st_line);
+			node = next_node;
+		}
+
+ close_localfile:
+		fclose(f);
+	}
+
+	node = eb64_first(&global_state_tree);
+	while (node) {
+		st_line = eb64_entry(node, typeof(*st_line), node);
+		next_node = eb64_next(node);
+		eb64_delete(node);
+		free(st_line->line);
+		free(st_line);
+		node = next_node;
+	}
+}
diff --git a/src/session.c b/src/session.c
new file mode 100644
index 0000000..ce9ccbf
--- /dev/null
+++ b/src/session.c
@@ -0,0 +1,528 @@
+/*
+ * Session management functions.
+ *
+ * Copyright 2000-2015 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <haproxy/ssl_sock-t.h>
+
+#include <haproxy/api.h>
+#include <haproxy/connection.h>
+#include <haproxy/global.h>
+#include <haproxy/http.h>
+#include <haproxy/listener.h>
+#include <haproxy/log.h>
+#include <haproxy/pool.h>
+#include <haproxy/proxy.h>
+#include <haproxy/session.h>
+#include <haproxy/tcp_rules.h>
+#include <haproxy/tools.h>
+#include <haproxy/vars.h>
+
+
+DECLARE_POOL(pool_head_session, "session", sizeof(struct session));
+DECLARE_POOL(pool_head_sess_srv_list, "session server list",
+             sizeof(struct sess_srv_list));
+
+int conn_complete_session(struct connection *conn);
+
+/* Create a new session and assign it to frontend <fe>, listener <li>,
+ * origin <origin>, set the current date and clear the stick counters pointers.
+ * Returns the session upon success or NULL. The session may be released using
+ * session_free(). Note: <li> may be NULL.
+ */
+struct session *session_new(struct proxy *fe, struct listener *li, enum obj_type *origin)
+{
+	struct session *sess;
+
+	sess = pool_alloc(pool_head_session);
+	if (sess) {
+		sess->listener = li;
+		sess->fe = fe;
+		sess->origin = origin;
+		sess->accept_date = date; /* user-visible date for logging */
+		sess->accept_ts = now_ns;  /* corrected date for internal use */
+		sess->stkctr = NULL;
+		if (pool_head_stk_ctr) {
+			sess->stkctr = pool_alloc(pool_head_stk_ctr);
+			if (!sess->stkctr)
+				goto out_fail_alloc;
+			memset(sess->stkctr, 0, sizeof(sess->stkctr[0]) * global.tune.nb_stk_ctr);
+		}
+		vars_init_head(&sess->vars, SCOPE_SESS);
+		sess->task = NULL;
+		sess->t_handshake = -1; /* handshake not done yet */
+		sess->t_idle = -1;
+		_HA_ATOMIC_INC(&totalconn);
+		_HA_ATOMIC_INC(&jobs);
+		LIST_INIT(&sess->srv_list);
+		sess->idle_conns = 0;
+		sess->flags = SESS_FL_NONE;
+		sess->src = NULL;
+		sess->dst = NULL;
+	}
+	return sess;
+ out_fail_alloc:
+	pool_free(pool_head_session, sess);
+	return NULL;
+}
+
+void session_free(struct session *sess)
+{
+	struct connection *conn, *conn_back;
+	struct sess_srv_list *srv_list, *srv_list_back;
+
+	if (sess->listener)
+		listener_release(sess->listener);
+	session_store_counters(sess);
+	pool_free(pool_head_stk_ctr, sess->stkctr);
+	vars_prune_per_sess(&sess->vars);
+	conn = objt_conn(sess->origin);
+	if (conn != NULL && conn->mux)
+		conn->mux->destroy(conn->ctx);
+	list_for_each_entry_safe(srv_list, srv_list_back, &sess->srv_list, srv_list) {
+		list_for_each_entry_safe(conn, conn_back, &srv_list->conn_list, session_list) {
+			LIST_DEL_INIT(&conn->session_list);
+			if (conn->mux) {
+				conn->owner = NULL;
+				conn->flags &= ~CO_FL_SESS_IDLE;
+				conn->mux->destroy(conn->ctx);
+			} else {
+				/* We have a connection, but not yet an associated mux.
+				 * So destroy it now.
+				 */
+				conn_stop_tracking(conn);
+				conn_full_close(conn);
+				conn_free(conn);
+			}
+		}
+		pool_free(pool_head_sess_srv_list, srv_list);
+	}
+	sockaddr_free(&sess->src);
+	sockaddr_free(&sess->dst);
+	pool_free(pool_head_session, sess);
+	_HA_ATOMIC_DEC(&jobs);
+}
+
+/* callback used from the connection/mux layer to notify that a connection is
+ * going to be released.
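+ * It is installed as the connection owner's release callback by
+ * conn_complete_session() via conn_set_owner() once the session is
+ * fully set up.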
+ */
+void conn_session_free(struct connection *conn)
+{
+	session_free(conn->owner);
+	conn->owner = NULL;
+}
+
+/* count a new session to keep the frontend, listener and tracked stats up to date */
+static void session_count_new(struct session *sess)
+{
+	struct stkctr *stkctr;
+	void *ptr;
+	int i;
+
+	proxy_inc_fe_sess_ctr(sess->listener, sess->fe);
+
+	for (i = 0; i < global.tune.nb_stk_ctr; i++) {
+		stkctr = &sess->stkctr[i];
+		if (!stkctr_entry(stkctr))
+			continue;
+
+		ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_SESS_CNT);
+		if (ptr)
+			HA_ATOMIC_INC(&stktable_data_cast(ptr, std_t_uint));
+
+		ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_SESS_RATE);
+		if (ptr)
+			update_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp),
+					       stkctr->table->data_arg[STKTABLE_DT_SESS_RATE].u, 1);
+	}
+}
+
+/* This function is called from the protocol layer accept() in order to
+ * instantiate a new session on behalf of a given listener and frontend. It
+ * returns a positive value upon success, 0 if the connection can be ignored,
+ * or a negative value upon critical failure. The accepted connection is
+ * closed if we return <= 0. If no handshake is needed, it immediately tries
+ * to instantiate a new stream. The connection must already have been filled
+ * with the incoming connection handle (a fd), a target (the listener) and a
+ * source address.
+ */
+int session_accept_fd(struct connection *cli_conn)
+{
+	struct listener *l = __objt_listener(cli_conn->target);
+	struct proxy *p = l->bind_conf->frontend;
+	int cfd = cli_conn->handle.fd;
+	struct session *sess;
+	int ret;
+
+	ret = -1; /* assume unrecoverable error by default */
+
+	cli_conn->proxy_netns = l->rx.settings->netns;
+
+	/* An active reversed connection has already been initialized before being
+	 * accepted. It must not be reset.
+	 * TODO use a dedicated accept_fd callback for the reverse protocol
+	 */
+	if (!cli_conn->xprt) {
+		if (conn_prepare(cli_conn, l->rx.proto, l->bind_conf->xprt) < 0)
+			goto out_free_conn;
+
+		conn_ctrl_init(cli_conn);
+
+		/* wait for a PROXY protocol header */
+		if (l->bind_conf->options & BC_O_ACC_PROXY)
+			cli_conn->flags |= CO_FL_ACCEPT_PROXY;
+
+		/* wait for a NetScaler client IP insertion protocol header */
+		if (l->bind_conf->options & BC_O_ACC_CIP)
+			cli_conn->flags |= CO_FL_ACCEPT_CIP;
+
+		/* Add the handshake pseudo-XPRT */
+		if (cli_conn->flags & (CO_FL_ACCEPT_PROXY | CO_FL_ACCEPT_CIP)) {
+			if (xprt_add_hs(cli_conn) != 0)
+				goto out_free_conn;
+		}
+	}
+
+	sess = session_new(p, l, &cli_conn->obj_type);
+	if (!sess)
+		goto out_free_conn;
+
+	conn_set_owner(cli_conn, sess, NULL);
+
+	/* now evaluate the tcp-request layer4 rules. We only need a session
+	 * and no stream for these rules.
+	 */
+	if (!LIST_ISEMPTY(&p->tcp_req.l4_rules) && !tcp_exec_l4_rules(sess)) {
+		/* let's do a no-linger now to close with a single RST. */
+		if (!(cli_conn->flags & CO_FL_FDLESS))
+			setsockopt(cfd, SOL_SOCKET, SO_LINGER, (struct linger *) &nolinger, sizeof(struct linger));
+		ret = 0; /* successful termination */
+		goto out_free_sess;
+	}
+	/* TCP rules may flag the connection as needing proxy protocol; now that it's done we can start our xprt */
+	if (conn_xprt_start(cli_conn) < 0)
+		goto out_free_sess;
+
+	/* FIXME/WTA: we should implement the setsockopt() calls at the proto
+	 * level instead and let non-inet protocols implement their own equivalent.
+	 */
+	if (cli_conn->flags & CO_FL_FDLESS)
+		goto skip_fd_setup;
+
+	/* Adjust some socket options */
+	if (l->rx.addr.ss_family == AF_INET || l->rx.addr.ss_family == AF_INET6) {
+		setsockopt(cfd, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one));
+
+		if (p->options & PR_O_TCP_CLI_KA) {
+			setsockopt(cfd, SOL_SOCKET, SO_KEEPALIVE, (char *) &one, sizeof(one));
+
+#ifdef TCP_KEEPCNT
+			if (p->clitcpka_cnt)
+				setsockopt(cfd, IPPROTO_TCP, TCP_KEEPCNT, &p->clitcpka_cnt, sizeof(p->clitcpka_cnt));
+#endif
+
+#ifdef TCP_KEEPIDLE
+			if (p->clitcpka_idle)
+				setsockopt(cfd, IPPROTO_TCP, TCP_KEEPIDLE, &p->clitcpka_idle, sizeof(p->clitcpka_idle));
+#endif
+
+#ifdef TCP_KEEPINTVL
+			if (p->clitcpka_intvl)
+				setsockopt(cfd, IPPROTO_TCP, TCP_KEEPINTVL, &p->clitcpka_intvl, sizeof(p->clitcpka_intvl));
+#endif
+		}
+
+		if (p->options & PR_O_TCP_NOLING)
+			HA_ATOMIC_OR(&fdtab[cfd].state, FD_LINGER_RISK);
+
+#if defined(TCP_MAXSEG)
+		if (l->bind_conf->maxseg < 0) {
+			/* we just want to reduce the current MSS by that value */
+			int mss;
+			socklen_t mss_len = sizeof(mss);
+			if (getsockopt(cfd, IPPROTO_TCP, TCP_MAXSEG, &mss, &mss_len) == 0) {
+				mss += l->bind_conf->maxseg; /* remember, it's < 0 */
+				setsockopt(cfd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss));
+			}
+		}
+#endif
+	}
+
+	if (global.tune.client_sndbuf)
+		setsockopt(cfd, SOL_SOCKET, SO_SNDBUF, &global.tune.client_sndbuf, sizeof(global.tune.client_sndbuf));
+
+	if (global.tune.client_rcvbuf)
+		setsockopt(cfd, SOL_SOCKET, SO_RCVBUF, &global.tune.client_rcvbuf, sizeof(global.tune.client_rcvbuf));
+
+ skip_fd_setup:
+	/* OK, now either we have a pending handshake to execute, in which case
+	 * we must return to the I/O layer, or we can proceed with the end of
+	 * the stream initialization. In case of handshake, we also set the I/O
+	 * timeout to the frontend's client timeout and register a task in the
+	 * session for this purpose. The connection's owner is left to the
+	 * session during this period.
+	 *
+	 * At this point we set the relation between sess/task/conn this way:
+	 *
+	 *          +----------------- task
+	 *          |                    |
+	 *   orig -- sess <-- context    |
+	 *    |       ^          |       |
+	 *    v       |          |       |
+	 *   conn -- owner ---> task <---+
+	 */
+	if (cli_conn->flags & (CO_FL_WAIT_XPRT | CO_FL_EARLY_SSL_HS)) {
+		int timeout;
+		int clt_tmt = p->timeout.client;
+		int hs_tmt = p->timeout.client_hs;
+
+		if (unlikely((sess->task = task_new_here()) == NULL))
+			goto out_free_sess;
+
+		/* Handshake timeout as default timeout */
+		timeout = hs_tmt ? hs_tmt : clt_tmt;
+		sess->task->context = sess;
+		sess->task->nice    = l->bind_conf->nice;
+		sess->task->process = session_expire_embryonic;
+		sess->task->expire  = tick_add_ifset(now_ms, timeout);
+		task_queue(sess->task);
+		return 1;
+	}
+
+	/* OK let's complete the stream initialization since there is no handshake */
+	if (conn_complete_session(cli_conn) >= 0)
+		return 1;
+
+	/* if we reach here we have deliberately decided not to keep this
+	 * session (e.g. tcp-request rule), so that's not an error we should
+	 * try to protect against.
+	 */
+	ret = 0;
+
+	/* error unrolling */
+ out_free_sess:
+	/* prevent a call to listener_release during session_free. It will be
+	 * done below, for all errors.
+	 */
+	sess->listener = NULL;
+	session_free(sess);
+
+ out_free_conn:
+	if (ret < 0 && l->bind_conf->xprt == xprt_get(XPRT_RAW) &&
+	    p->mode == PR_MODE_HTTP && l->bind_conf->mux_proto == NULL &&
+	    !(cli_conn->flags & CO_FL_FDLESS)) {
+		/* critical error, no more memory, try to emit a 500 response */
+		send(cfd, http_err_msgs[HTTP_ERR_500], strlen(http_err_msgs[HTTP_ERR_500]),
+		     MSG_DONTWAIT|MSG_NOSIGNAL);
+	}
+
+	if (cli_conn->mux) {
+		/* The mux is already initialized for an active reversed connection. */
+		cli_conn->mux->destroy(cli_conn->ctx);
+	}
+	else {
+		conn_stop_tracking(cli_conn);
+		conn_full_close(cli_conn);
+		conn_free(cli_conn);
+	}
+	listener_release(l);
+	return ret;
+}
+
+
+/* prepare the trash with a log prefix for session <sess>. It only works with
+ * embryonic sessions based on a real connection. This function requires that
+ * sess->origin points to the incoming connection.
+ */
+static void session_prepare_log_prefix(struct session *sess)
+{
+	const struct sockaddr_storage *src;
+	struct tm tm;
+	char pn[INET6_ADDRSTRLEN];
+	int ret;
+	char *end;
+
+	src = sess_src(sess);
+	ret = (src ? addr_to_str(src, pn, sizeof(pn)) : 0);
+	if (ret <= 0)
+		chunk_printf(&trash, "unknown [");
+	else if (ret == AF_UNIX)
+		chunk_printf(&trash, "%s:%d [", pn, sess->listener->luid);
+	else
+		chunk_printf(&trash, "%s:%d [", pn, get_host_port(src));
+
+	get_localtime(sess->accept_date.tv_sec, &tm);
+	end = date2str_log(trash.area + trash.data, &tm, &(sess->accept_date),
+			   trash.size - trash.data);
+	trash.data = end - trash.area;
+	if (sess->listener->name)
+		chunk_appendf(&trash, "] %s/%s", sess->fe->id, sess->listener->name);
+	else
+		chunk_appendf(&trash, "] %s/%d", sess->fe->id, sess->listener->luid);
+}
+
+
+/* fill the trash buffer with the string to use for send_log during
+ * session_kill_embryonic(). Add the log prefix and the error string.
+ *
+ * The function is able to dump an SSL error string when CO_ER_SSL_HANDSHAKE
+ * is met.
+ */
+static void session_build_err_string(struct session *sess)
+{
+	struct connection *conn = __objt_conn(sess->origin);
+	const char *err_msg;
+	struct ssl_sock_ctx __maybe_unused *ssl_ctx;
+
+	err_msg = conn_err_code_str(conn);
+	session_prepare_log_prefix(sess); /* use the trash buffer */
+
+#ifdef USE_OPENSSL
+	ssl_ctx = conn_get_ssl_sock_ctx(conn);
+
+	/* when the SSL error code is present, during an SSL handshake failure,
+	 * try to dump the error string from OpenSSL */
+	if (conn->err_code == CO_ER_SSL_HANDSHAKE && ssl_ctx && ssl_ctx->error_code != 0) {
+		chunk_appendf(&trash, ": SSL handshake failure (");
+		ERR_error_string_n(ssl_ctx->error_code, b_orig(&trash)+b_data(&trash), b_room(&trash));
+		trash.data = strlen(b_orig(&trash));
+		chunk_appendf(&trash, ")\n");
+	}
+
+	else
+#endif /* ! USE_OPENSSL */
+
+	if (err_msg)
+		chunk_appendf(&trash, ": %s\n", err_msg);
+	else
+		chunk_appendf(&trash, ": unknown connection error (code=%d flags=%08x)\n",
+			      conn->err_code, conn->flags);
+
+	return;
+}
+
+
+
+/* This function kills an existing embryonic session. It stops the connection's
+ * transport layer, releases assigned resources, resumes the listener if it was
+ * disabled and finally kills the file descriptor. This function requires that
+ * sess->origin points to the incoming connection.
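+ * It is reached either from session_expire_embryonic() when the handshake
+ * timeout strikes, or from conn_complete_session() when a rule rejects the
+ * connection or an error is reported during initialization.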
+ */
+static void session_kill_embryonic(struct session *sess, unsigned int state)
+{
+	int level = LOG_INFO;
+	struct connection *conn = __objt_conn(sess->origin);
+	struct task *task = sess->task;
+	unsigned int log = sess->fe->to_log;
+
+	if (sess->fe->options2 & PR_O2_LOGERRORS)
+		level = LOG_ERR;
+
+	if (log && (sess->fe->options & PR_O_NULLNOLOG)) {
+		/* with "option dontlognull", we don't log connections with no transfer */
+		if (!conn->err_code ||
+		    conn->err_code == CO_ER_PRX_EMPTY || conn->err_code == CO_ER_PRX_ABORT ||
+		    conn->err_code == CO_ER_CIP_EMPTY || conn->err_code == CO_ER_CIP_ABORT ||
+		    conn->err_code == CO_ER_SSL_EMPTY || conn->err_code == CO_ER_SSL_ABORT)
+			log = 0;
+	}
+
+	if (log) {
+		if (!conn->err_code && (state & TASK_WOKEN_TIMER)) {
+			if (conn->flags & CO_FL_ACCEPT_PROXY)
+				conn->err_code = CO_ER_PRX_TIMEOUT;
+			else if (conn->flags & CO_FL_ACCEPT_CIP)
+				conn->err_code = CO_ER_CIP_TIMEOUT;
+			else if (conn->flags & CO_FL_SSL_WAIT_HS)
+				conn->err_code = CO_ER_SSL_TIMEOUT;
+		}
+
+		if (!LIST_ISEMPTY(&sess->fe->logformat_error)) {
+			/* Display a log line following the configured error-log-format. */
+			sess_log(sess);
+		}
+		else {
+			session_build_err_string(sess);
+			send_log(sess->fe, level, "%s", trash.area);
+		}
+	}
+
+	/* kill the connection now */
+	conn_stop_tracking(conn);
+	conn_full_close(conn);
+	conn_free(conn);
+	sess->origin = NULL;
+
+	task_destroy(task);
+	session_free(sess);
+}
+
+/* Manages the embryonic session timeout. It is only called when the timeout
+ * strikes and performs the required cleanup. It's only exported to make it
+ * resolve in "show tasks".
+ */
+struct task *session_expire_embryonic(struct task *t, void *context, unsigned int state)
+{
+	struct session *sess = context;
+
+	if (!(state & TASK_WOKEN_TIMER))
+		return t;
+
+	session_kill_embryonic(sess, state);
+	return NULL;
+}
+
+/* Finish initializing a session from a connection, or kill it if the
+ * connection shows an error. Returns <0 if the connection was killed. It may
+ * be called either asynchronously when the SSL handshake is done with an
+ * embryonic session, or synchronously to finalize the session. The distinction
+ * is made on sess->task which is only set in the embryonic session case.
+ */
+int conn_complete_session(struct connection *conn)
+{
+	struct session *sess = conn->owner;
+
+	sess->t_handshake = ns_to_ms(now_ns - sess->accept_ts);
+
+	if (conn->flags & CO_FL_ERROR)
+		goto fail;
+
+	/* if logs require transport layer information, note it on the connection */
+	if (sess->fe->to_log & LW_XPRT)
+		conn->flags |= CO_FL_XPRT_TRACKED;
+
+	/* we may have some tcp-request-session rules */
+	if (!LIST_ISEMPTY(&sess->fe->tcp_req.l5_rules) && !tcp_exec_l5_rules(sess))
+		goto fail;
+
+	session_count_new(sess);
+	if (!conn->mux) {
+		if (conn_install_mux_fe(conn, NULL) < 0)
+			goto fail;
+	}
+
+	/* the embryonic session's task is not needed anymore */
+	task_destroy(sess->task);
+	sess->task = NULL;
+	conn_set_owner(conn, sess, conn_session_free);
+
+	return 0;
+
+ fail:
+	if (sess->task)
+		session_kill_embryonic(sess, 0);
+	return -1;
+}
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/src/sha1.c b/src/sha1.c
new file mode 100644
index 0000000..b7c2d70
--- /dev/null
+++ b/src/sha1.c
@@ -0,0 +1,308 @@
+/*
+ * Based on the git SHA1 Implementation.
+ *
+ * Copyright (C) 2009-2015, Linus Torvalds and others.
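+ *
+ * Typical use of this API (illustrative sketch; <buf> and <len> are
+ * assumed to describe the data to hash):
+ *
+ *   blk_SHA_CTX ctx;
+ *   unsigned char digest[20];
+ *
+ *   blk_SHA1_Init(&ctx);
+ *   blk_SHA1_Update(&ctx, buf, len);
+ *   blk_SHA1_Final(digest, &ctx);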
+ * + * SHA1 routine optimized to do word accesses rather than byte accesses, + * and to avoid unnecessary copies into the context array. + * + * This was initially based on the Mozilla SHA1 implementation, although + * none of the original Mozilla code remains. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* this is only to get definitions for memcpy(), ntohl() and htonl() */ +#include <string.h> +#include <inttypes.h> +#include <arpa/inet.h> + +#include <import/sha1.h> + +/* + * Performance might be improved if the CPU architecture is OK with + * unaligned 32-bit loads and a fast ntohl() is available. + * Otherwise fall back to byte loads and shifts which is portable, + * and is faster on architectures with memory alignment issues. + */ + +#if defined(__i386__) || defined(__x86_64__) || \ + defined(__ppc__) || defined(__ppc64__) || \ + defined(__powerpc__) || defined(__powerpc64__) || \ + defined(__s390__) || defined(__s390x__) + +#define get_be32(p) ntohl(*(unsigned int *)(p)) +#define put_be32(p, v) do { *(unsigned int *)(p) = htonl(v); } while (0) + +#else + +static inline uint32_t get_be32(const void *ptr) +{ + const unsigned char *p = ptr; + return (uint32_t)p[0] << 24 | + (uint32_t)p[1] << 16 | + (uint32_t)p[2] << 8 | + (uint32_t)p[3] << 0; +} + +static inline void put_be32(void *ptr, uint32_t value) +{ + unsigned char *p = ptr; + p[0] = value >> 24; + p[1] = value >> 16; + p[2] = value >> 8; + p[3] = value >> 0; +} + +#endif + +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) + +/* + * Force usage of rol or ror by selecting the one with the smaller constant. + * It _can_ generate slightly smaller code (a constant of 1 is special), but + * perhaps more importantly it's possibly faster on any uarch that does a + * rotate with a loop. + */ + +#define SHA_ASM(op, x, n) ({ unsigned int __res; __asm__(op " %1,%0":"=r" (__res):"i" (n), "0" (x)); __res; }) +#define SHA_ROL(x,n) SHA_ASM("rol", x, n) +#define SHA_ROR(x,n) SHA_ASM("ror", x, n) + +#else + +#define SHA_ROT(X,l,r) (((X) << (l)) | ((X) >> (r))) +#define SHA_ROL(X,n) SHA_ROT(X,n,32-(n)) +#define SHA_ROR(X,n) SHA_ROT(X,32-(n),n) + +#endif + +/* + * If you have 32 registers or more, the compiler can (and should) + * try to change the array[] accesses into registers. However, on + * machines with less than ~25 registers, that won't really work, + * and at least gcc will make an unholy mess of it. + * + * So to avoid that mess which just slows things down, we force + * the stores to memory to actually happen (we might be better off + * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as + * suggested by Artur Skawina - that will also make gcc unable to + * try to do the silly "optimize away loads" part because it won't + * see what the value will be). 
+ * + * Ben Herrenschmidt reports that on PPC, the C version comes close + * to the optimized asm with this (ie on PPC you don't want that + * 'volatile', since there are lots of registers). + * + * On ARM we get the best code generation by forcing a full memory barrier + * between each SHA_ROUND, otherwise gcc happily get wild with spilling and + * the stack frame size simply explode and performance goes down the drain. + */ + +#if defined(__i386__) || defined(__x86_64__) + #define setW(x, val) (*(volatile unsigned int *)&W(x) = (val)) +#elif defined(__GNUC__) && defined(__arm__) + #define setW(x, val) do { W(x) = (val); __asm__("":::"memory"); } while (0) +#else + #define setW(x, val) (W(x) = (val)) +#endif + +/* This "rolls" over the 512-bit array */ +#define W(x) (array[(x)&15]) + +/* + * Where do we get the source from? The first 16 iterations get it from + * the input data, the next mix it from the 512-bit array. + */ +#define SHA_SRC(t) get_be32((unsigned char *) block + (t)*4) +#define SHA_MIX(t) SHA_ROL(W((t)+13) ^ W((t)+8) ^ W((t)+2) ^ W(t), 1); + +#define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \ + unsigned int TEMP = input(t); setW(t, TEMP); \ + E += TEMP + SHA_ROL(A,5) + (fn) + (constant); \ + B = SHA_ROR(B, 2); } while (0) + +#define T_0_15(t, A, B, C, D, E) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) +#define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) +#define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E ) +#define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E ) +#define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6, A, B, C, D, E ) + +static void blk_SHA1_Block(blk_SHA_CTX *ctx, const void *block) +{ + unsigned int A,B,C,D,E; + unsigned int array[16]; + + A = ctx->H[0]; + B = ctx->H[1]; + C = ctx->H[2]; + D = ctx->H[3]; + E = ctx->H[4]; + + /* Round 1 - iterations 0-16 take their input from 'block' */ + T_0_15( 0, A, B, C, D, E); + T_0_15( 1, E, A, B, C, D); + T_0_15( 2, D, E, A, B, C); + T_0_15( 3, C, D, E, A, B); + T_0_15( 4, B, C, D, E, A); + T_0_15( 5, A, B, C, D, E); + T_0_15( 6, E, A, B, C, D); + T_0_15( 7, D, E, A, B, C); + T_0_15( 8, C, D, E, A, B); + T_0_15( 9, B, C, D, E, A); + T_0_15(10, A, B, C, D, E); + T_0_15(11, E, A, B, C, D); + T_0_15(12, D, E, A, B, C); + T_0_15(13, C, D, E, A, B); + T_0_15(14, B, C, D, E, A); + T_0_15(15, A, B, C, D, E); + + /* Round 1 - tail. 
Input from 512-bit mixing array */ + T_16_19(16, E, A, B, C, D); + T_16_19(17, D, E, A, B, C); + T_16_19(18, C, D, E, A, B); + T_16_19(19, B, C, D, E, A); + + /* Round 2 */ + T_20_39(20, A, B, C, D, E); + T_20_39(21, E, A, B, C, D); + T_20_39(22, D, E, A, B, C); + T_20_39(23, C, D, E, A, B); + T_20_39(24, B, C, D, E, A); + T_20_39(25, A, B, C, D, E); + T_20_39(26, E, A, B, C, D); + T_20_39(27, D, E, A, B, C); + T_20_39(28, C, D, E, A, B); + T_20_39(29, B, C, D, E, A); + T_20_39(30, A, B, C, D, E); + T_20_39(31, E, A, B, C, D); + T_20_39(32, D, E, A, B, C); + T_20_39(33, C, D, E, A, B); + T_20_39(34, B, C, D, E, A); + T_20_39(35, A, B, C, D, E); + T_20_39(36, E, A, B, C, D); + T_20_39(37, D, E, A, B, C); + T_20_39(38, C, D, E, A, B); + T_20_39(39, B, C, D, E, A); + + /* Round 3 */ + T_40_59(40, A, B, C, D, E); + T_40_59(41, E, A, B, C, D); + T_40_59(42, D, E, A, B, C); + T_40_59(43, C, D, E, A, B); + T_40_59(44, B, C, D, E, A); + T_40_59(45, A, B, C, D, E); + T_40_59(46, E, A, B, C, D); + T_40_59(47, D, E, A, B, C); + T_40_59(48, C, D, E, A, B); + T_40_59(49, B, C, D, E, A); + T_40_59(50, A, B, C, D, E); + T_40_59(51, E, A, B, C, D); + T_40_59(52, D, E, A, B, C); + T_40_59(53, C, D, E, A, B); + T_40_59(54, B, C, D, E, A); + T_40_59(55, A, B, C, D, E); + T_40_59(56, E, A, B, C, D); + T_40_59(57, D, E, A, B, C); + T_40_59(58, C, D, E, A, B); + T_40_59(59, B, C, D, E, A); + + /* Round 4 */ + T_60_79(60, A, B, C, D, E); + T_60_79(61, E, A, B, C, D); + T_60_79(62, D, E, A, B, C); + T_60_79(63, C, D, E, A, B); + T_60_79(64, B, C, D, E, A); + T_60_79(65, A, B, C, D, E); + T_60_79(66, E, A, B, C, D); + T_60_79(67, D, E, A, B, C); + T_60_79(68, C, D, E, A, B); + T_60_79(69, B, C, D, E, A); + T_60_79(70, A, B, C, D, E); + T_60_79(71, E, A, B, C, D); + T_60_79(72, D, E, A, B, C); + T_60_79(73, C, D, E, A, B); + T_60_79(74, B, C, D, E, A); + T_60_79(75, A, B, C, D, E); + T_60_79(76, E, A, B, C, D); + T_60_79(77, D, E, A, B, C); + T_60_79(78, C, D, E, A, B); + T_60_79(79, B, C, D, E, A); + + ctx->H[0] += A; + ctx->H[1] += B; + ctx->H[2] += C; + ctx->H[3] += D; + ctx->H[4] += E; +} + +void blk_SHA1_Init(blk_SHA_CTX *ctx) +{ + ctx->size = 0; + + /* Initialize H with the magic constants (see FIPS180 for constants) */ + ctx->H[0] = 0x67452301; + ctx->H[1] = 0xefcdab89; + ctx->H[2] = 0x98badcfe; + ctx->H[3] = 0x10325476; + ctx->H[4] = 0xc3d2e1f0; +} + +void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *data, unsigned long len) +{ + unsigned int lenW = ctx->size & 63; + + ctx->size += len; + + /* Read the data into W and process blocks as they get full */ + if (lenW) { + unsigned int left = 64 - lenW; + if (len < left) + left = len; + memcpy(lenW + (char *)ctx->W, data, left); + lenW = (lenW + left) & 63; + len -= left; + data = ((const char *)data + left); + if (lenW) + return; + blk_SHA1_Block(ctx, ctx->W); + } + while (len >= 64) { + blk_SHA1_Block(ctx, data); + data = ((const char *)data + 64); + len -= 64; + } + if (len) + memcpy(ctx->W, data, len); +} + +void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx) +{ + static const unsigned char pad[64] = { 0x80 }; + unsigned int padlen[2]; + int i; + + /* Pad with a binary 1 (ie 0x80), then zeroes, then length */ + padlen[0] = htonl((uint32_t)(ctx->size >> 29)); + padlen[1] = htonl((uint32_t)(ctx->size << 3)); + + i = ctx->size & 63; + blk_SHA1_Update(ctx, pad, 1 + (63 & (55 - i))); + blk_SHA1_Update(ctx, padlen, 8); + + /* Output hash */ + for (i = 0; i < 5; i++) + put_be32(hashout + i * 4, ctx->H[i]); +} diff --git a/src/shctx.c b/src/shctx.c new 
file mode 100644
index 0000000..be59053
--- /dev/null
+++ b/src/shctx.c
@@ -0,0 +1,320 @@
+/*
+ * shctx.c - shared context management functions for SSL
+ *
+ * Copyright (C) 2011-2012 EXCELIANCE
+ *
+ * Author: Emeric Brun - emeric@exceliance.fr
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <sys/mman.h>
+#include <arpa/inet.h>
+#include <import/ebmbtree.h>
+#include <haproxy/list.h>
+#include <haproxy/shctx.h>
+
+/*
+ * Reserve a new row if <first> is null, put it in the hot list and set the
+ * refcount to 1, or append new blocks to the row whose first block is <first>
+ * if non null.
+ *
+ * Reserve blocks in the avail list and put them in the hot list.
+ * Return the first block put in the hot list, or NULL if not enough blocks
+ * are available.
+ */
+struct shared_block *shctx_row_reserve_hot(struct shared_context *shctx,
+                                           struct shared_block *first, int data_len)
+{
+	struct shared_block *last = NULL, *block, *sblock;
+	struct shared_block *ret = first;
+	int remain = 1;
+
+	BUG_ON(data_len < 0);
+
+	/* Check the object size limit. */
+	if (shctx->max_obj_size > 0) {
+		if ((first && first->len + data_len > shctx->max_obj_size) ||
+		    (!first && data_len > shctx->max_obj_size))
+			goto out;
+	}
+
+	if (first) {
+		/* Check that there is some block to reserve.
+		 * In this first block of code we compute the remaining room in the
+		 * current list of blocks already reserved for this object.
+		 * We return asap if there is enough room to copy <data_len> bytes.
+		 */
+		last = first->last_reserved;
+		/* Remaining room. */
+		remain = (shctx->block_size * first->block_count - first->len);
+		if (remain) {
+			if (remain > data_len) {
+				return last ? last : first;
+			} else {
+				data_len -= remain;
+				if (data_len <= 0)
+					return last ? last : first;
+			}
+		}
+	}
+
+	shctx_wrlock(shctx);
+
+	/* not enough usable blocks */
+	if (data_len > shctx->nbav * shctx->block_size) {
+		shctx_wrunlock(shctx);
+		goto out;
+	}
+
+
+	if (data_len <= 0 || LIST_ISEMPTY(&shctx->avail)) {
+		ret = NULL;
+		shctx_wrunlock(shctx);
+		goto out;
+	}
+
+	list_for_each_entry_safe(block, sblock, &shctx->avail, list) {
+
+		/* release callback */
+		if (block->len && shctx->free_block)
+			shctx->free_block(block, shctx->cb_data);
+		block->len = 0;
+
+		if (ret) {
+			shctx_block_append_hot(shctx, ret, block);
+			if (!remain) {
+				first->last_append = block;
+				remain = 1;
+			}
+		} else {
+			ret = shctx_block_detach(shctx, block);
+			ret->len = 0;
+			ret->block_count = 0;
+			ret->last_append = NULL;
+			ret->refcount = 1;
+		}
+
+		++ret->block_count;
+
+		data_len -= shctx->block_size;
+
+		if (data_len <= 0) {
+			ret->last_reserved = block;
+			break;
+		}
+	}
+
+	shctx_wrunlock(shctx);
+
+	if (shctx->reserve_finish)
+		shctx->reserve_finish(shctx);
+
+out:
+	return ret;
+}
+
+/*
+ * If the refcount is 0, move the row to the hot list. Increment the refcount.
+ */
+void shctx_row_detach(struct shared_context *shctx, struct shared_block *first)
+{
+	if (first->refcount <= 0) {
+
+		BUG_ON(!first->last_reserved);
+
+		/* Detach the row from the avail list: link the first item's prev
+		 * to the last item's next. This allows using the
+		 * LIST_SPLICE_END_DETACHED macro.
+		 */
+		first->list.p->n = first->last_reserved->list.n;
+		first->last_reserved->list.n->p = first->list.p;
+
+		first->list.p = &first->last_reserved->list;
+		first->last_reserved->list.n = &first->list;
+
+		shctx->nbav -= first->block_count;
+	}
+
+	first->refcount++;
+}
+
+/*
+ * Decrement the refcount and move the row to the end of the avail list if it
+ * reaches 0.
+ */
+void shctx_row_reattach(struct shared_context *shctx, struct shared_block *first)
+{
+	first->refcount--;
+
+	if (first->refcount <= 0) {
+
+		BUG_ON(!first->last_reserved);
+
+		/* Reattach to the avail list */
+		first->list.p = &first->last_reserved->list;
+		LIST_SPLICE_END_DETACHED(&shctx->avail, &first->list);
+
+		shctx->nbav += first->block_count;
+	}
+}
+
+
+/*
+ * Append data to the row if there is enough space.
+ * The row should be in the hot list.
+ *
+ * Return the amount of appended data if ret >= 0,
+ * or how much more space it needs to contain the data if < 0.
+ */
+int shctx_row_data_append(struct shared_context *shctx, struct shared_block *first,
+                          unsigned char *data, int len)
+{
+	int remain, start;
+	struct shared_block *block;
+
+	/* return -<len> needed to work */
+	if (len > first->block_count * shctx->block_size - first->len)
+		return (first->block_count * shctx->block_size - first->len) - len;
+
+	block = first->last_append ? first->last_append : first;
+	do {
+		/* end of copy */
+		if (len <= 0)
+			break;
+
+		/* remaining unwritten bytes in the current block. */
+		remain = (shctx->block_size * first->block_count - first->len) % shctx->block_size;
+		BUG_ON(remain < 0);
+
+		/* if remain == 0, the previous buffers are full, or first->len == 0 */
+		if (!remain) {
+			remain = shctx->block_size;
+			start = 0;
+		}
+		else {
+			/* start must be calculated before remain is modified */
+			start = shctx->block_size - remain;
+			BUG_ON(start < 0);
+		}
+
+		/* must not try to copy more than len */
+		remain = MIN(remain, len);
+
+		memcpy(block->data + start, data, remain);
+
+		data += remain;
+		len -= remain;
+		first->len += remain; /* update len in the head of the row */
+		first->last_append = block;
+
+		block = LIST_ELEM(block->list.n, struct shared_block*, list);
+	} while (block != first);
+
+	return len;
+}
+
+/*
+ * Copy <len> bytes of data from a row of blocks, and return the remaining
+ * amount to copy. If 0 is returned, the full data has successfully been
+ * copied.
+ *
+ * The row should be in the hot list.
+ */
+int shctx_row_data_get(struct shared_context *shctx, struct shared_block *first,
+                       unsigned char *dst, int offset, int len)
+{
+	int count = 0, size = 0, start = -1;
+	struct shared_block *block;
+
+	/* can't copy more */
+	if (len > first->len)
+		len = first->len;
+
+	block = first;
+	count = 0;
+	/* Pass through the blocks to copy them */
+	do {
+		if (count >= first->block_count || len <= 0)
+			break;
+
+		count++;
+		/* continue until we are in the right block,
+		 * corresponding to the offset */
+		if (count < offset / shctx->block_size + 1)
+			continue;
+
+		/* on the first block, data might not begin at offset 0 */
+		if (start == -1)
+			start = offset - (count - 1) * shctx->block_size;
+
+		BUG_ON(start < 0);
+
+		/* size can be lower than a block when copying the last block */
+		size = MIN(shctx->block_size - start, len);
+		BUG_ON(size < 0);
+
+		memcpy(dst, block->data + start, size);
+		dst += size;
+		len -= size;
+		start = 0;
+
+		block = LIST_ELEM(block->list.n, struct shared_block*, list);
+	} while (block != first);
+	return len;
+}
+
+/* Allocate a shared memory context.
+ * <maxblocks> is the maximum number of blocks.
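+ * (For instance, a call such as shctx_init(&shctx, 1024, 256, 0, 0) would
+ * reserve 1024 blocks of 256 bytes each; these values are purely
+ * illustrative.)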
+ * If <maxblocks> is set to a value less than or equal to 0, the ssl cache is
+ * disabled.
+ * Returns: -1 on alloc failure, <maxblocks> if it performs the context alloc,
+ * and 0 if the cache is already allocated.
+ */
+int shctx_init(struct shared_context **orig_shctx, int maxblocks, int blocksize,
+               unsigned int maxobjsz, int extra)
+{
+	int i;
+	struct shared_context *shctx;
+	int ret;
+	void *cur;
+	int maptype = MAP_SHARED;
+
+	if (maxblocks <= 0)
+		return 0;
+
+	/* make sure to align the records on a pointer size */
+	blocksize = (blocksize + sizeof(void *) - 1) & -sizeof(void *);
+	extra     = (extra     + sizeof(void *) - 1) & -sizeof(void *);
+
+	shctx = (struct shared_context *)mmap(NULL, sizeof(struct shared_context) + extra + (maxblocks * (sizeof(struct shared_block) + blocksize)),
+	                                      PROT_READ | PROT_WRITE, maptype | MAP_ANON, -1, 0);
+	if (!shctx || shctx == MAP_FAILED) {
+		shctx = NULL;
+		ret = SHCTX_E_ALLOC_CACHE;
+		goto err;
+	}
+
+	shctx->nbav = 0;
+
+	LIST_INIT(&shctx->avail);
+	HA_RWLOCK_INIT(&shctx->lock);
+
+	shctx->block_size = blocksize;
+	shctx->max_obj_size = maxobjsz == (unsigned int)-1 ? 0 : maxobjsz;
+
+	/* init the free blocks after the shared context struct */
+	cur = (void *)shctx + sizeof(struct shared_context) + extra;
+	for (i = 0; i < maxblocks; i++) {
+		struct shared_block *cur_block = (struct shared_block *)cur;
+		cur_block->len = 0;
+		cur_block->refcount = 0;
+		cur_block->block_count = 1;
+		LIST_APPEND(&shctx->avail, &cur_block->list);
+		shctx->nbav++;
+		cur += sizeof(struct shared_block) + blocksize;
+	}
+	ret = maxblocks;
+
+err:
+	*orig_shctx = shctx;
+	return ret;
+}
+
diff --git a/src/signal.c b/src/signal.c
new file mode 100644
index 0000000..1bb60eb
--- /dev/null
+++ b/src/signal.c
@@ -0,0 +1,284 @@
+/*
+ * Asynchronous signal delivery functions.
+ *
+ * Copyright 2000-2010 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <signal.h>
+#include <string.h>
+
+#include <haproxy/errors.h>
+#include <haproxy/signal.h>
+#include <haproxy/task.h>
+
+/* Principle: we keep an in-order list of the first occurrence of all received
+ * signals. All occurrences of the same signal are grouped though. The signal
+ * queue does not need to be deeper than the number of signals we can handle.
+ * The handlers will be called asynchronously with the signal number. They can
+ * check the number of calls themselves via this signal's descriptor.
+ */
+
+int signal_queue_len;                 /* length of signal queue, <= MAX_SIGNAL (1 entry per signal max) */
+int signal_queue[MAX_SIGNAL];         /* in-order queue of received signals */
+struct signal_descriptor signal_state[MAX_SIGNAL];
+sigset_t blocked_sig;
+int signal_pending = 0;               /* non-zero if at least one signal remains unprocessed */
+
+DECLARE_STATIC_POOL(pool_head_sig_handlers, "sig_handlers", sizeof(struct sig_handler));
+
+/* Common signal handler, used by all signals. Received signals are queued.
+ * Signal number zero has a specific status: since it cannot be delivered by
+ * the system, any function may call it to perform asynchronous signal delivery.
+ */
+void signal_handler(int sig)
+{
+	if (sig < 0 || sig >= MAX_SIGNAL) {
+		/* unhandled signal */
+		signal(sig, SIG_IGN);
+		qfprintf(stderr, "Received unhandled signal %d.
Signal has been disabled.\n", sig); + return; + } + + if (!signal_state[sig].count) { + /* signal was not queued yet */ + if (signal_queue_len < MAX_SIGNAL) + signal_queue[signal_queue_len++] = sig; + else + qfprintf(stderr, "Signal %d : signal queue is unexpectedly full.\n", sig); + } + + signal_state[sig].count++; + if (sig) + signal(sig, signal_handler); /* re-arm signal */ + + /* If the thread is TH_FL_SLEEPING we need to wake it */ + wake_thread(tid); +} + +/* Call handlers of all pending signals and clear counts and queue length. The + * handlers may unregister themselves by calling signal_register() while they + * are called, just like it is done with normal signal handlers. + * Note that it is more efficient to call the inline version which checks the + * queue length before getting here. + */ +void __signal_process_queue() +{ + int sig, cur_pos = 0; + struct signal_descriptor *desc; + sigset_t old_sig; + + /* block signal delivery during processing */ + ha_sigmask(SIG_SETMASK, &blocked_sig, &old_sig); + + /* It is important that we scan the queue forwards so that we can + * catch any signal that would have been queued by another signal + * handler. That allows real signal handlers to redistribute signals + * to tasks subscribed to signal zero. + */ + for (cur_pos = 0; cur_pos < signal_queue_len; cur_pos++) { + sig = signal_queue[cur_pos]; + desc = &signal_state[sig]; + if (desc->count) { + struct sig_handler *sh, *shb; + list_for_each_entry_safe(sh, shb, &desc->handlers, list) { + if ((sh->flags & SIG_F_TYPE_FCT) && sh->handler) + ((void (*)(struct sig_handler *))sh->handler)(sh); + else if ((sh->flags & SIG_F_TYPE_TASK) && sh->handler) + task_wakeup(sh->handler, TASK_WOKEN_SIGNAL); + } + desc->count = 0; + } + } + signal_queue_len = 0; + + /* restore signal delivery */ + ha_sigmask(SIG_SETMASK, &old_sig, NULL); +} + +/* perform minimal initializations */ +static void signal_init() +{ + int sig; + + signal_queue_len = 0; + memset(signal_queue, 0, sizeof(signal_queue)); + memset(signal_state, 0, sizeof(signal_state)); + + sigfillset(&blocked_sig); + sigdelset(&blocked_sig, SIGPROF); + /* man sigprocmask: If SIGBUS, SIGFPE, SIGILL, or SIGSEGV are + generated while they are blocked, the result is undefined, unless + the signal was generated by kill(2), + sigqueue(3), or raise(3). + Do not ignore WDTSIG or DEBUGSIG either, or it may deadlock the + watchdog */ + sigdelset(&blocked_sig, SIGBUS); + sigdelset(&blocked_sig, SIGFPE); + sigdelset(&blocked_sig, SIGILL); + sigdelset(&blocked_sig, SIGSEGV); +#ifdef DEBUGSIG + sigdelset(&blocked_sig, DEBUGSIG); +#endif +#ifdef WDTSIG + sigdelset(&blocked_sig, WDTSIG); +#endif + for (sig = 0; sig < MAX_SIGNAL; sig++) + LIST_INIT(&signal_state[sig].handlers); +} + +/* + * This function should be called to unblock all signals + */ +void haproxy_unblock_signals() +{ + sigset_t set; + + /* Ensure signals are not blocked. Some shells or service managers may + * accidentally block all of our signals unfortunately, causing lots of + * zombie processes to remain in the background during reloads. 
+	 */
+	sigemptyset(&set);
+	ha_sigmask(SIG_SETMASK, &set, NULL);
+}
+
+/* releases all registered signal handlers */
+void deinit_signals()
+{
+	int sig;
+	struct sig_handler *sh, *shb;
+
+	for (sig = 0; sig < MAX_SIGNAL; sig++) {
+		if (sig != SIGPROF)
+			signal(sig, SIG_DFL);
+		list_for_each_entry_safe(sh, shb, &signal_state[sig].handlers, list) {
+			LIST_DELETE(&sh->list);
+			pool_free(pool_head_sig_handlers, sh);
+		}
+	}
+}
+
+/* Register a function and an integer argument on a signal. A pointer to the
+ * newly allocated sig_handler is returned, or NULL in case of any error. The
+ * caller is responsible for unregistering the function when not used anymore.
+ * Note that passing a NULL as the function pointer enables interception of the
+ * signal without processing, which is identical to SIG_IGN. If the signal is
+ * zero (which the system cannot deliver), only internal functions will be able
+ * to notify the registered functions.
+ */
+struct sig_handler *signal_register_fct(int sig, void (*fct)(struct sig_handler *), int arg)
+{
+	struct sig_handler *sh;
+
+	if (sig < 0 || sig >= MAX_SIGNAL)
+		return NULL;
+
+	if (sig)
+		signal(sig, fct ? signal_handler : SIG_IGN);
+
+	if (!fct)
+		return NULL;
+
+	sh = pool_alloc(pool_head_sig_handlers);
+	if (!sh)
+		return NULL;
+
+	sh->handler = fct;
+	sh->arg = arg;
+	sh->flags = SIG_F_TYPE_FCT;
+	LIST_APPEND(&signal_state[sig].handlers, &sh->list);
+	return sh;
+}
+
+/* Register a task and a wake-up reason on a signal. A pointer to the newly
+ * allocated sig_handler is returned, or NULL in case of any error. The caller
+ * is responsible for unregistering the task when not used anymore. Note that
+ * passing a NULL as the task pointer enables interception of the signal
+ * without processing, which is identical to SIG_IGN. If the signal is zero
+ * (which the system cannot deliver), only internal functions will be able to
+ * notify the registered functions.
+ */
+struct sig_handler *signal_register_task(int sig, struct task *task, int reason)
+{
+	struct sig_handler *sh;
+
+	if (sig < 0 || sig >= MAX_SIGNAL)
+		return NULL;
+
+	if (sig)
+		signal(sig, signal_handler);
+
+	if (!task)
+		return NULL;
+
+	sh = pool_alloc(pool_head_sig_handlers);
+	if (!sh)
+		return NULL;
+
+	sh->handler = task;
+	sh->arg = reason & ~TASK_WOKEN_ANY;
+	sh->flags = SIG_F_TYPE_TASK;
+	LIST_APPEND(&signal_state[sig].handlers, &sh->list);
+	return sh;
+}
+
+/* Immediately unregister a handler so that no further signals may be delivered
+ * to it. The struct is released so the caller may not reference it anymore.
+ */
+void signal_unregister_handler(struct sig_handler *handler)
+{
+	LIST_DELETE(&handler->list);
+	pool_free(pool_head_sig_handlers, handler);
+}
+
+/* Immediately unregister a handler so that no further signals may be delivered
+ * to it. The handler struct does not need to be known, only the function or
+ * task pointer. This method is expensive because it scans the whole list, so
+ * it should only be used for rare cases (eg: exit). The struct is released so
+ * the caller may not reference it anymore.
+ */
+void signal_unregister_target(int sig, void *target)
+{
+	struct sig_handler *sh, *shb;
+
+	if (sig < 0 || sig >= MAX_SIGNAL)
+		return;
+
+	if (!target)
+		return;
+
+	list_for_each_entry_safe(sh, shb, &signal_state[sig].handlers, list) {
+		if (sh->handler == target) {
+			LIST_DELETE(&sh->list);
+			pool_free(pool_head_sig_handlers, sh);
+			break;
+		}
+	}
+}
+
+/*
+ * Immediately unregister every handler assigned to a signal <sig>.
+ * Once the handler list is empty, the signal is ignored with SIG_IGN. + */ + +void signal_unregister(int sig) +{ + struct sig_handler *sh, *shb; + + if (sig < 0 || sig >= MAX_SIGNAL) + return; + + list_for_each_entry_safe(sh, shb, &signal_state[sig].handlers, list) { + LIST_DELETE(&sh->list); + pool_free(pool_head_sig_handlers, sh); + } + + signal(sig, SIG_IGN); +} + +INITCALL0(STG_PREPARE, signal_init); diff --git a/src/sink.c b/src/sink.c new file mode 100644 index 0000000..66c2b8c --- /dev/null +++ b/src/sink.c @@ -0,0 +1,1406 @@ +/* + * Event sink management + * + * Copyright (C) 2000-2019 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <sys/mman.h> +#include <errno.h> +#include <fcntl.h> + +#include <import/ist.h> +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/cfgparse.h> +#include <haproxy/cli.h> +#include <haproxy/errors.h> +#include <haproxy/list.h> +#include <haproxy/log.h> +#include <haproxy/proxy.h> +#include <haproxy/ring.h> +#include <haproxy/sc_strm.h> +#include <haproxy/signal.h> +#include <haproxy/sink.h> +#include <haproxy/stconn.h> +#include <haproxy/time.h> +#include <haproxy/tools.h> + +struct list sink_list = LIST_HEAD_INIT(sink_list); + +/* sink proxies list */ +struct proxy *sink_proxies_list; + +struct sink *cfg_sink; + +struct sink *sink_find(const char *name) +{ + struct sink *sink; + + list_for_each_entry(sink, &sink_list, sink_list) + if (strcmp(sink->name, name) == 0) + return sink; + return NULL; +} + +/* creates a new sink and adds it to the list, it's still generic and not fully + * initialized. Returns NULL on allocation failure. If another one already + * exists with the same name, it will be returned. The caller can detect it as + * a newly created one has type SINK_TYPE_NEW. + */ +static struct sink *__sink_new(const char *name, const char *desc, int fmt) +{ + struct sink *sink; + + sink = sink_find(name); + if (sink) + goto end; + + sink = calloc(1, sizeof(*sink)); + if (!sink) + goto end; + + sink->name = strdup(name); + if (!sink->name) + goto err; + + sink->desc = strdup(desc); + if (!sink->desc) + goto err; + + sink->fmt = fmt; + sink->type = SINK_TYPE_NEW; + sink->maxlen = BUFSIZE; + /* address will be filled by the caller if needed */ + sink->ctx.fd = -1; + sink->ctx.dropped = 0; + HA_RWLOCK_INIT(&sink->ctx.lock); + LIST_APPEND(&sink_list, &sink->sink_list); + end: + return sink; + + err: + ha_free(&sink->name); + ha_free(&sink->desc); + ha_free(&sink); + + return NULL; +} + +/* creates a sink called <name> of type FD associated to fd <fd>, format <fmt>, + * and description <desc>. Returns NULL on allocation failure or conflict. + * Perfect duplicates are merged (same type, fd, and name). 
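
The __sink_new() helper above is a find-or-create registry: the caller always gets a sink back and can tell a freshly created one from a pre-existing one because new entries carry the SINK_TYPE_NEW sentinel until their creator specializes them. A minimal standalone sketch of that convention (hypothetical names, not part of this commit):

#include <stdlib.h>
#include <string.h>

/* illustrative sketch only, not the HAProxy implementation */
enum node_type { NODE_NEW, NODE_FD, NODE_BUFFER };

struct node {
        struct node *next;
        char *name;
        enum node_type type;
};

static struct node *nodes;

static struct node *node_get(const char *name)
{
        struct node *n;

        for (n = nodes; n; n = n->next)
                if (strcmp(n->name, name) == 0)
                        return n;     /* existing entry: type != NODE_NEW */

        n = calloc(1, sizeof(*n));
        if (!n || !(n->name = strdup(name))) {
                free(n);
                return NULL;
        }
        n->type = NODE_NEW;           /* marks the entry as newly created */
        n->next = nodes;
        nodes = n;
        return n;
}
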
+ */ +struct sink *sink_new_fd(const char *name, const char *desc, enum log_fmt fmt, int fd) +{ + struct sink *sink; + + sink = __sink_new(name, desc, fmt); + if (!sink || (sink->type == SINK_TYPE_FD && sink->ctx.fd == fd)) + goto end; + + if (sink->type != SINK_TYPE_NEW) { + sink = NULL; + goto end; + } + + sink->type = SINK_TYPE_FD; + sink->ctx.fd = fd; + end: + return sink; +} + +/* creates a sink called <name> of type BUF of size <size>, format <fmt>, + * and description <desc>. Returns NULL on allocation failure or conflict. + * Perfect duplicates are merged (same type and name). If sizes differ, the + * largest one is kept. + */ +struct sink *sink_new_buf(const char *name, const char *desc, enum log_fmt fmt, size_t size) +{ + struct sink *sink; + + sink = __sink_new(name, desc, fmt); + if (!sink) + goto fail; + + if (sink->type == SINK_TYPE_BUFFER) { + /* such a buffer already exists, we may have to resize it */ + if (!ring_resize(sink->ctx.ring, size)) + goto fail; + goto end; + } + + if (sink->type != SINK_TYPE_NEW) { + /* already exists of another type */ + goto fail; + } + + sink->ctx.ring = ring_new(size); + if (!sink->ctx.ring) { + LIST_DELETE(&sink->sink_list); + free(sink->name); + free(sink->desc); + free(sink); + goto fail; + } + + sink->type = SINK_TYPE_BUFFER; + end: + return sink; + fail: + return NULL; +} + +/* tries to send <nmsg> message parts from message array <msg> to sink <sink>. + * Formatting according to the sink's preference is done here, unless sink->fmt + * is unspecified, in which case the caller formatting will be used instead. + * Lost messages are NOT accounted for. It is preferable to call sink_write() + * instead which will also try to emit the number of dropped messages when there + * are any. + * + * It will stop writing at <maxlen> instead of sink->maxlen if <maxlen> is + * positive and inferior to sink->maxlen. + * + * It returns >0 if it could write anything, <=0 otherwise. + */ + ssize_t __sink_write(struct sink *sink, struct log_header hdr, + size_t maxlen, const struct ist msg[], size_t nmsg) + { + struct ist *pfx = NULL; + size_t npfx = 0; + + if (sink->fmt == LOG_FORMAT_RAW) + goto send; + + if (sink->fmt != LOG_FORMAT_UNSPEC) + hdr.format = sink->fmt; /* sink format prevails over log one */ + pfx = build_log_header(hdr, &npfx); + +send: + if (!maxlen) + maxlen = ~0; + if (sink->type == SINK_TYPE_FD) { + return fd_write_frag_line(sink->ctx.fd, MIN(maxlen, sink->maxlen), pfx, npfx, msg, nmsg, 1); + } + else if (sink->type == SINK_TYPE_BUFFER) { + return ring_write(sink->ctx.ring, MIN(maxlen, sink->maxlen), pfx, npfx, msg, nmsg); + } + return 0; +} + +/* Tries to emit a message indicating the number of dropped events. + * The log header of the original message that we tried to emit is reused + * here with the only difference that we override the log level. This is + * possible since the announce message will be sent from the same context. + * + * In case of success, the amount of drops is reduced by as much. It's supposed + * to be called under an exclusive lock on the sink to avoid multiple producers + * doing the same. On success, >0 is returned, otherwise <=0 on failure. + */ +int sink_announce_dropped(struct sink *sink, struct log_header hdr) +{ + unsigned int dropped; + struct buffer msg; + struct ist msgvec[1]; + char logbuf[64]; + + while (unlikely((dropped = sink->ctx.dropped) > 0)) { + chunk_init(&msg, logbuf, sizeof(logbuf)); + chunk_printf(&msg, "%u event%s dropped", dropped, dropped > 1 ? 
"s" : ""); + msgvec[0] = ist2(msg.area, msg.data); + + hdr.level = LOG_NOTICE; /* override level but keep original log header data */ + + if (__sink_write(sink, hdr, 0, msgvec, 1) <= 0) + return 0; + /* success! */ + HA_ATOMIC_SUB(&sink->ctx.dropped, dropped); + } + return 1; +} + +/* parse the "show events" command, returns 1 if a message is returned, otherwise zero */ +static int cli_parse_show_events(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct sink *sink; + uint ring_flags; + int arg; + + args++; // make args[1] the 1st arg + + if (!*args[1]) { + /* no arg => report the list of supported sink */ + chunk_printf(&trash, "Supported events sinks are listed below. Add -w(wait), -n(new). Any key to stop\n"); + list_for_each_entry(sink, &sink_list, sink_list) { + chunk_appendf(&trash, " %-10s : type=%s, %u dropped, %s\n", + sink->name, + sink->type == SINK_TYPE_NEW ? "init" : + sink->type == SINK_TYPE_FD ? "fd" : + sink->type == SINK_TYPE_BUFFER ? "buffer" : "?", + sink->ctx.dropped, sink->desc); + } + + trash.area[trash.data] = 0; + return cli_msg(appctx, LOG_WARNING, trash.area); + } + + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return 1; + + sink = sink_find(args[1]); + if (!sink) + return cli_err(appctx, "No such event sink"); + + if (sink->type != SINK_TYPE_BUFFER) + return cli_msg(appctx, LOG_NOTICE, "Nothing to report for this sink"); + + ring_flags = 0; + for (arg = 2; *args[arg]; arg++) { + if (strcmp(args[arg], "-w") == 0) + ring_flags |= RING_WF_WAIT_MODE; + else if (strcmp(args[arg], "-n") == 0) + ring_flags |= RING_WF_SEEK_NEW; + else if (strcmp(args[arg], "-nw") == 0 || strcmp(args[arg], "-wn") == 0) + ring_flags |= RING_WF_WAIT_MODE | RING_WF_SEEK_NEW; + else + return cli_err(appctx, "unknown option"); + } + return ring_attach_cli(sink->ctx.ring, appctx, ring_flags); +} + +/* Pre-configures a ring proxy to emit connections */ +void sink_setup_proxy(struct proxy *px) +{ + px->last_change = ns_to_sec(now_ns); + px->cap = PR_CAP_BE; + px->maxconn = 0; + px->conn_retries = 1; + px->timeout.server = TICK_ETERNITY; + px->timeout.client = TICK_ETERNITY; + px->timeout.connect = TICK_ETERNITY; + px->accept = NULL; + px->options2 |= PR_O2_INDEPSTR | PR_O2_SMARTCON | PR_O2_SMARTACC; + px->next = sink_proxies_list; + sink_proxies_list = px; +} + +/* + * IO Handler to handle message push to syslog tcp server. + * It takes its context from appctx->svcctx. + */ +static void sink_forward_io_handler(struct appctx *appctx) +{ + struct stconn *sc = appctx_sc(appctx); + struct sink_forward_target *sft = appctx->svcctx; + struct sink *sink = sft->sink; + struct ring *ring = sink->ctx.ring; + struct buffer *buf = &ring->buf; + uint64_t msg_len; + size_t len, cnt, ofs, last_ofs; + int ret = 0; + + if (unlikely(se_fl_test(appctx->sedesc, (SE_FL_EOS|SE_FL_ERROR|SE_FL_SHR|SE_FL_SHW)))) + goto out; + + /* if stopping was requested, close immediately */ + if (unlikely(stopping)) + goto close; + + /* if the connection is not established, inform the stream that we want + * to be notified whenever the connection completes. 
+ */ + if (sc_opposite(sc)->state < SC_ST_EST) { + applet_need_more_data(appctx); + se_need_remote_conn(appctx->sedesc); + applet_have_more_data(appctx); + goto out; + } + + HA_SPIN_LOCK(SFT_LOCK, &sft->lock); + if (appctx != sft->appctx) { + HA_SPIN_UNLOCK(SFT_LOCK, &sft->lock); + goto close; + } + + HA_RWLOCK_WRLOCK(RING_LOCK, &ring->lock); + LIST_DEL_INIT(&appctx->wait_entry); + HA_RWLOCK_WRUNLOCK(RING_LOCK, &ring->lock); + + HA_RWLOCK_RDLOCK(RING_LOCK, &ring->lock); + + /* explanation for the initialization below: it would be better to do + * this in the parsing function but this would occasionally result in + * dropped events because we'd take a reference on the oldest message + * and keep it while being scheduled. Thus instead let's take it the + * first time we enter here so that we have a chance to pass many + * existing messages before grabbing a reference to a location. This + * value cannot be produced after initialization. + */ + if (unlikely(sft->ofs == ~0)) { + sft->ofs = b_peek_ofs(buf, 0); + HA_ATOMIC_INC(b_orig(buf) + sft->ofs); + } + + /* we were already there, adjust the offset to be relative to + * the buffer's head and remove us from the counter. + */ + ofs = sft->ofs - b_head_ofs(buf); + if (sft->ofs < b_head_ofs(buf)) + ofs += b_size(buf); + BUG_ON(ofs >= buf->size); + HA_ATOMIC_DEC(b_peek(buf, ofs)); + + /* in this loop, ofs always points to the counter byte that precedes + * the message so that we can take our reference there if we have to + * stop before the end (ret=0). + */ + ret = 1; + while (ofs + 1 < b_data(buf)) { + cnt = 1; + len = b_peek_varint(buf, ofs + cnt, &msg_len); + if (!len) + break; + cnt += len; + BUG_ON(msg_len + ofs + cnt + 1 > b_data(buf)); + + if (unlikely(msg_len + 1 > b_size(&trash))) { + /* too large a message to ever fit, let's skip it */ + ofs += cnt + msg_len; + continue; + } + + chunk_reset(&trash); + len = b_getblk(buf, trash.area, msg_len, ofs + cnt); + trash.data += len; + trash.area[trash.data++] = '\n'; + + if (applet_putchk(appctx, &trash) == -1) { + ret = 0; + break; + } + ofs += cnt + msg_len; + } + + HA_ATOMIC_INC(b_peek(buf, ofs)); + last_ofs = b_tail_ofs(buf); + sft->ofs = b_peek_ofs(buf, ofs); + + HA_RWLOCK_RDUNLOCK(RING_LOCK, &ring->lock); + + if (ret) { + /* let's be woken up once new data arrive */ + HA_RWLOCK_WRLOCK(RING_LOCK, &ring->lock); + LIST_APPEND(&ring->waiters, &appctx->wait_entry); + ofs = b_tail_ofs(buf); + HA_RWLOCK_WRUNLOCK(RING_LOCK, &ring->lock); + if (ofs != last_ofs) { + /* more data was added into the ring between the + * unlock and the lock, and the writer might not + * have seen us. We need to reschedule a read. + */ + applet_have_more_data(appctx); + } else + applet_have_no_more_data(appctx); + } + HA_SPIN_UNLOCK(SFT_LOCK, &sft->lock); + +out: + /* always drain data from server */ + co_skip(sc_oc(sc), sc_oc(sc)->output); + return; + +close: + se_fl_set(appctx->sedesc, SE_FL_EOS|SE_FL_EOI); +} + +/* + * IO Handler to handle message push to syslog tcp server + * using octet counting frames + * It takes its context from appctx->svcctx. 
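
The reader loop above walks variable-length records: each ring message is preceded by a one-byte watcher count and a varint-encoded length, so a consumer can stop anywhere and resume from a saved offset. A standalone sketch of length-prefixed framing in a flat buffer, using plain LEB128 varints (HAProxy's b_peek_varint() uses its own encoding; names here are hypothetical and this is not part of this commit):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* illustrative sketch only, not the HAProxy ring format */
static size_t varint_put(uint8_t *p, uint64_t v)
{
        size_t n = 0;

        while (v >= 0x80) {
                p[n++] = (uint8_t)v | 0x80;
                v >>= 7;
        }
        p[n++] = (uint8_t)v;
        return n;
}

static size_t varint_get(const uint8_t *p, size_t avail, uint64_t *v)
{
        size_t n = 0;
        int shift = 0;

        *v = 0;
        while (n < avail) {
                *v |= (uint64_t)(p[n] & 0x7f) << shift;
                if (!(p[n++] & 0x80))
                        return n;     /* bytes consumed */
                shift += 7;
        }
        return 0;                     /* truncated record */
}

int main(void)
{
        uint8_t buf[64], *p = buf;
        const char *msgs[] = { "hello", "ring" };
        size_t i, ofs, used, n;
        uint64_t len;

        for (i = 0; i < 2; i++) {
                p += varint_put(p, strlen(msgs[i]));
                memcpy(p, msgs[i], strlen(msgs[i]));
                p += strlen(msgs[i]);
        }
        used = p - buf;

        for (ofs = 0; ofs < used; ofs += len) {
                n = varint_get(buf + ofs, used - ofs, &len);
                if (!n)
                        break;
                ofs += n;
                printf("%.*s\n", (int)len, buf + ofs);
        }
        return 0;
}
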
+ */ +static void sink_forward_oc_io_handler(struct appctx *appctx) +{ + struct stconn *sc = appctx_sc(appctx); + struct sink_forward_target *sft = appctx->svcctx; + struct sink *sink = sft->sink; + struct ring *ring = sink->ctx.ring; + struct buffer *buf = &ring->buf; + uint64_t msg_len; + size_t len, cnt, ofs; + int ret = 0; + char *p; + + if (unlikely(se_fl_test(appctx->sedesc, (SE_FL_EOS|SE_FL_ERROR|SE_FL_SHR|SE_FL_SHW)))) + goto out; + + /* if stopping was requested, close immediately */ + if (unlikely(stopping)) + goto close; + + /* if the connection is not established, inform the stream that we want + * to be notified whenever the connection completes. + */ + if (sc_opposite(sc)->state < SC_ST_EST) { + applet_need_more_data(appctx); + se_need_remote_conn(appctx->sedesc); + applet_have_more_data(appctx); + goto out; + } + + HA_SPIN_LOCK(SFT_LOCK, &sft->lock); + if (appctx != sft->appctx) { + HA_SPIN_UNLOCK(SFT_LOCK, &sft->lock); + goto close; + } + + HA_RWLOCK_WRLOCK(RING_LOCK, &ring->lock); + LIST_DEL_INIT(&appctx->wait_entry); + HA_RWLOCK_WRUNLOCK(RING_LOCK, &ring->lock); + + HA_RWLOCK_RDLOCK(RING_LOCK, &ring->lock); + + /* explanation for the initialization below: it would be better to do + * this in the parsing function but this would occasionally result in + * dropped events because we'd take a reference on the oldest message + * and keep it while being scheduled. Thus instead let's take it the + * first time we enter here so that we have a chance to pass many + * existing messages before grabbing a reference to a location. This + * value cannot be produced after initialization. + */ + if (unlikely(sft->ofs == ~0)) { + sft->ofs = b_peek_ofs(buf, 0); + HA_ATOMIC_INC(b_orig(buf) + sft->ofs); + } + + /* we were already there, adjust the offset to be relative to + * the buffer's head and remove us from the counter. + */ + ofs = sft->ofs - b_head_ofs(buf); + if (sft->ofs < b_head_ofs(buf)) + ofs += b_size(buf); + BUG_ON(ofs >= buf->size); + HA_ATOMIC_DEC(b_peek(buf, ofs)); + + /* in this loop, ofs always points to the counter byte that precedes + * the message so that we can take our reference there if we have to + * stop before the end (ret=0). 
+ */ + ret = 1; + while (ofs + 1 < b_data(buf)) { + cnt = 1; + len = b_peek_varint(buf, ofs + cnt, &msg_len); + if (!len) + break; + cnt += len; + BUG_ON(msg_len + ofs + cnt + 1 > b_data(buf)); + + chunk_reset(&trash); + p = ulltoa(msg_len, trash.area, b_size(&trash)); + if (p) { + trash.data = (p - trash.area) + 1; + *p = ' '; + } + + if (!p || (trash.data + msg_len > b_size(&trash))) { + /* too large a message to ever fit, let's skip it */ + ofs += cnt + msg_len; + continue; + } + + trash.data += b_getblk(buf, p + 1, msg_len, ofs + cnt); + + if (applet_putchk(appctx, &trash) == -1) { + ret = 0; + break; + } + ofs += cnt + msg_len; + } + + HA_ATOMIC_INC(b_peek(buf, ofs)); + sft->ofs = b_peek_ofs(buf, ofs); + + HA_RWLOCK_RDUNLOCK(RING_LOCK, &ring->lock); + + if (ret) { + /* let's be woken up once new data arrive */ + HA_RWLOCK_WRLOCK(RING_LOCK, &ring->lock); + LIST_APPEND(&ring->waiters, &appctx->wait_entry); + HA_RWLOCK_WRUNLOCK(RING_LOCK, &ring->lock); + applet_have_no_more_data(appctx); + } + HA_SPIN_UNLOCK(SFT_LOCK, &sft->lock); + + out: + /* always drain data from server */ + co_skip(sc_oc(sc), sc_oc(sc)->output); + return; + +close: + se_fl_set(appctx->sedesc, SE_FL_EOS|SE_FL_EOI); + goto out; +} + +void __sink_forward_session_deinit(struct sink_forward_target *sft) +{ + struct sink *sink; + + sink = sft->sink; + if (!sink) + return; + + HA_RWLOCK_WRLOCK(RING_LOCK, &sink->ctx.ring->lock); + LIST_DEL_INIT(&sft->appctx->wait_entry); + HA_RWLOCK_WRUNLOCK(RING_LOCK, &sink->ctx.ring->lock); + + sft->appctx = NULL; + task_wakeup(sink->forward_task, TASK_WOKEN_MSG); +} + +static int sink_forward_session_init(struct appctx *appctx) +{ + struct sink_forward_target *sft = appctx->svcctx; + struct stream *s; + struct sockaddr_storage *addr = NULL; + + if (!sockaddr_alloc(&addr, &sft->srv->addr, sizeof(sft->srv->addr))) + goto out_error; + /* srv port should be learned from srv->svc_port not from srv->addr */ + set_host_port(addr, sft->srv->svc_port); + + if (appctx_finalize_startup(appctx, sft->srv->proxy, &BUF_NULL) == -1) + goto out_free_addr; + + s = appctx_strm(appctx); + s->scb->dst = addr; + s->scb->flags |= (SC_FL_RCV_ONCE|SC_FL_NOLINGER); + + s->target = &sft->srv->obj_type; + s->flags = SF_ASSIGNED; + + s->do_log = NULL; + s->uniq_id = 0; + + applet_expect_no_data(appctx); + sft->appctx = appctx; + + return 0; + + out_free_addr: + sockaddr_free(&addr); + out_error: + return -1; +} + +static void sink_forward_session_release(struct appctx *appctx) +{ + struct sink_forward_target *sft = appctx->svcctx; + + if (!sft) + return; + + HA_SPIN_LOCK(SFT_LOCK, &sft->lock); + if (sft->appctx == appctx) + __sink_forward_session_deinit(sft); + HA_SPIN_UNLOCK(SFT_LOCK, &sft->lock); +} + +static struct applet sink_forward_applet = { + .obj_type = OBJ_TYPE_APPLET, + .name = "<SINKFWD>", /* used for logging */ + .fct = sink_forward_io_handler, + .init = sink_forward_session_init, + .release = sink_forward_session_release, +}; + +static struct applet sink_forward_oc_applet = { + .obj_type = OBJ_TYPE_APPLET, + .name = "<SINKFWDOC>", /* used for logging */ + .fct = sink_forward_oc_io_handler, + .init = sink_forward_session_init, + .release = sink_forward_session_release, +}; + +/* + * Create a new peer session in assigned state (connect will start automatically) + * It sets its context into appctx->svcctx. 
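
The octet-counting handler above builds each frame as the payload length in decimal, a space, then the payload, which is the octet-counting framing from RFC 6587 for syslog over TCP (the other handler terminates messages with LF instead). A standalone sketch of the frame construction (hypothetical names, not part of this commit):

#include <stdio.h>
#include <string.h>

/* illustrative sketch only: "<decimal length><SP><message>" framing */
static int oc_frame(char *out, size_t outsz, const char *msg)
{
        int n = snprintf(out, outsz, "%zu %s", strlen(msg), msg);

        return (n < 0 || (size_t)n >= outsz) ? -1 : n;
}

int main(void)
{
        char frame[256];

        if (oc_frame(frame, sizeof(frame), "<134>msg from ring") > 0)
                fputs(frame, stdout);    /* emits "18 <134>msg from ring" */
        return 0;
}
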
+ */ +static struct appctx *sink_forward_session_create(struct sink *sink, struct sink_forward_target *sft) +{ + struct appctx *appctx; + struct applet *applet = &sink_forward_applet; + + if (sft->srv->log_proto == SRV_LOG_PROTO_OCTET_COUNTING) + applet = &sink_forward_oc_applet; + + appctx = appctx_new_here(applet, NULL); + if (!appctx) + goto out_close; + appctx->svcctx = (void *)sft; + + if (appctx_init(appctx) == -1) + goto out_free_appctx; + + return appctx; + + /* Error unrolling */ + out_free_appctx: + appctx_free_on_early_error(appctx); + out_close: + return NULL; +} + +/* + * Task to handle connections to forward servers + */ +static struct task *process_sink_forward(struct task * task, void *context, unsigned int state) +{ + struct sink *sink = (struct sink *)context; + struct sink_forward_target *sft = sink->sft; + + task->expire = TICK_ETERNITY; + + if (!stopping) { + while (sft) { + HA_SPIN_LOCK(SFT_LOCK, &sft->lock); + /* if appctx is NULL, start a new session */ + if (!sft->appctx) + sft->appctx = sink_forward_session_create(sink, sft); + HA_SPIN_UNLOCK(SFT_LOCK, &sft->lock); + sft = sft->next; + } + } + else { + while (sft) { + HA_SPIN_LOCK(SFT_LOCK, &sft->lock); + /* awake applet to perform a clean close */ + if (sft->appctx) + appctx_wakeup(sft->appctx); + HA_SPIN_UNLOCK(SFT_LOCK, &sft->lock); + sft = sft->next; + } + } + + return task; +} +/* + * Init task to manage connections to forward servers + * + * returns 0 in case of error. + */ +int sink_init_forward(struct sink *sink) +{ + sink->forward_task = task_new_anywhere(); + if (!sink->forward_task) + return 0; + + sink->forward_task->process = process_sink_forward; + sink->forward_task->context = (void *)sink; + sink->forward_sighandler = signal_register_task(0, sink->forward_task, 0); + task_wakeup(sink->forward_task, TASK_WOKEN_INIT); + return 1; +} + +/* This tries to rotate a file-backed ring, but only if it contains contents. + * This way empty rings will not cause backups to be overwritten and it's safe + * to reload multiple times. That's only best effort, failures are silently + * ignored. + */ +void sink_rotate_file_backed_ring(const char *name) +{ + struct ring ring; + char *oldback; + int ret; + int fd; + + fd = open(name, O_RDONLY); + if (fd < 0) + return; + + /* check for contents validity */ + ret = read(fd, &ring, sizeof(ring)); + close(fd); + + if (ret != sizeof(ring)) + goto rotate; + + /* contents are present, we want to keep them => rotate. Note that + * an empty ring buffer has one byte (the marker). + */ + if (ring.buf.data > 1) + goto rotate; + + /* nothing to keep, let's scratch the file and preserve the backup */ + return; + + rotate: + oldback = NULL; + memprintf(&oldback, "%s.bak", name); + if (oldback) { + /* try to rename any possibly existing ring file to + * ".bak" and delete remains of older ones. This will + * ensure we don't wipe useful debug info upon restart. 
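
sink_rotate_file_backed_ring() above keeps exactly one previous generation: the stale ".bak" is removed first, the current file is renamed over it, and every failure is silently ignored since rotation is only best effort. The same sequence as a standalone sketch (hypothetical names, not part of this commit):

#include <stdio.h>
#include <unistd.h>

/* illustrative sketch only: best-effort rotation to "<name>.bak" */
static void rotate_to_bak(const char *name)
{
        char bak[4096];

        if (snprintf(bak, sizeof(bak), "%s.bak", name) >= (int)sizeof(bak))
                return;
        unlink(bak);                /* drop remains of an older backup */
        if (rename(name, bak) < 0)
                unlink(bak);        /* best effort, failures are ignored */
}
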
+ */ + unlink(oldback); + if (rename(name, oldback) < 0) + unlink(oldback); + ha_free(&oldback); + } +} + + +/* helper function to completely deallocate a sink struct + */ +static void sink_free(struct sink *sink) +{ + struct sink_forward_target *sft_next; + + if (!sink) + return; + if (sink->type == SINK_TYPE_BUFFER) { + if (sink->store) { + size_t size = (sink->ctx.ring->buf.size + 4095UL) & -4096UL; + void *area = (sink->ctx.ring->buf.area - sizeof(*sink->ctx.ring)); + + msync(area, size, MS_SYNC); + munmap(area, size); + ha_free(&sink->store); + } + else + ring_free(sink->ctx.ring); + } + LIST_DEL_INIT(&sink->sink_list); // remove from parent list + task_destroy(sink->forward_task); + free_proxy(sink->forward_px); + ha_free(&sink->name); + ha_free(&sink->desc); + while (sink->sft) { + sft_next = sink->sft->next; + ha_free(&sink->sft); + sink->sft = sft_next; + } + ha_free(&sink); +} + +/* Helper function to create new high-level ring buffer (as in ring section from + * the config): will create a new sink of buf type, and a new forward proxy, + * which will be stored in forward_px to know that the sink is responsible for + * it. + * + * Returns NULL on failure + */ +static struct sink *sink_new_ringbuf(const char *id, const char *description, + const char *file, int linenum, char **err_msg) +{ + struct sink *sink; + struct proxy *p = NULL; // forward_px + + /* allocate new proxy to handle forwards */ + p = calloc(1, sizeof(*p)); + if (!p) { + memprintf(err_msg, "out of memory"); + goto err; + } + + init_new_proxy(p); + sink_setup_proxy(p); + p->id = strdup(id); + p->conf.args.file = p->conf.file = strdup(file); + p->conf.args.line = p->conf.line = linenum; + + sink = sink_new_buf(id, description, LOG_FORMAT_RAW, BUFSIZE); + if (!sink) { + memprintf(err_msg, "unable to create a new sink buffer for ring '%s'", id); + goto err; + } + + /* link sink to proxy */ + sink->forward_px = p; + + return sink; + + err: + free_proxy(p); + return NULL; +} + +/* helper function: add a new server to an existing sink + * + * Returns 1 on success and 0 on failure + */ +static int sink_add_srv(struct sink *sink, struct server *srv) +{ + struct sink_forward_target *sft; + + /* allocate new sink_forward_target descriptor */ + sft = calloc(1, sizeof(*sft)); + if (!sft) { + ha_alert("memory allocation error initializing server '%s' in ring '%s'.\n", srv->id, sink->name); + return 0; + } + sft->srv = srv; + sft->appctx = NULL; + sft->ofs = ~0; /* init ring offset */ + sft->sink = sink; + sft->next = sink->sft; + HA_SPIN_INIT(&sft->lock); + + /* mark server attached to the ring */ + if (!ring_attach(sink->ctx.ring)) { + ha_alert("server '%s' sets too many watchers > 255 on ring '%s'.\n", srv->id, sink->name); + ha_free(&sft); + return 0; + } + sink->sft = sft; + return 1; +} + +/* Finalize sink struct to ensure configuration consistency and + * allocate final struct members + * + * Returns ERR_NONE on success, ERR_WARN on warning + * Returns a composition of ERR_ALERT, ERR_ABORT, ERR_FATAL on failure + */ +static int sink_finalize(struct sink *sink) +{ + int err_code = ERR_NONE; + struct server *srv; + + if (sink && (sink->type == SINK_TYPE_BUFFER)) { + if (!sink->maxlen) + sink->maxlen = ~0; // maxlen not set: no implicit truncation + else if (sink->maxlen > ring_max_payload(sink->ctx.ring)) { + /* maxlen set by user however it doesn't fit: set to max value */ + ha_warning("ring '%s' event max length '%u' exceeds max payload size, forced to '%lu'.\n", + sink->name, sink->maxlen, (unsigned 
long)ring_max_payload(sink->ctx.ring)); + sink->maxlen = ring_max_payload(sink->ctx.ring); + err_code |= ERR_WARN; + } + + /* prepare forward server descriptors */ + if (sink->forward_px) { + /* sink proxy is set: register all servers from the proxy */ + srv = sink->forward_px->srv; + while (srv) { + if (!sink_add_srv(sink, srv)) { + err_code |= ERR_ALERT | ERR_FATAL; + break; + } + srv = srv->next; + } + } + /* init forwarding if at least one sft is registered */ + if (sink->sft && sink_init_forward(sink) == 0) { + ha_alert("error when trying to initialize sink buffer forwarding.\n"); + err_code |= ERR_ALERT | ERR_FATAL; + } + } + return err_code; +} + +/* + * Parse "ring" section and create corresponding sink buffer. + * + * The function returns 0 on success, otherwise it returns error + * flags. + */ +int cfg_parse_ring(const char *file, int linenum, char **args, int kwm) +{ + int err_code = 0; + char *err_msg = NULL; + const char *inv; + + if (strcmp(args[0], "ring") == 0) { /* new ring section */ + if (!*args[1]) { + ha_alert("parsing [%s:%d] : missing ring name.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + inv = invalid_char(args[1]); + if (inv) { + ha_alert("parsing [%s:%d] : invalid ring name '%s' (character '%c' is not permitted).\n", file, linenum, args[1], *inv); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + if (sink_find(args[1])) { + ha_alert("parsing [%s:%d] : sink named '%s' already exists.\n", file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + cfg_sink = sink_new_ringbuf(args[1], args[1], file, linenum, &err_msg); + if (!cfg_sink) { + ha_alert("parsing [%s:%d] : %s.\n", file, linenum, err_msg); + ha_free(&err_msg); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + /* set maxlen value to 0 for now, we rely on this in postparsing + * to know if it was explicitly set using the "maxlen" parameter + */ + cfg_sink->maxlen = 0; + } + else if (strcmp(args[0], "size") == 0) { + size_t size; + + if (!cfg_sink || (cfg_sink->type != SINK_TYPE_BUFFER)) { + ha_alert("parsing [%s:%d] : 'size' directive not usable with this type of sink.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + size = atol(args[1]); + if (!size) { + ha_alert("parsing [%s:%d] : invalid size '%s' for new sink buffer.\n", file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + if (cfg_sink->store) { + ha_alert("parsing [%s:%d] : cannot resize an already mapped file, please specify 'size' before 'backing-file'.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + if (size < cfg_sink->ctx.ring->buf.size) { + ha_warning("parsing [%s:%d] : ignoring new size '%llu' that is smaller than current size '%llu' for ring '%s'.\n", + file, linenum, (ullong)size, (ullong)cfg_sink->ctx.ring->buf.size, cfg_sink->name); + err_code |= ERR_WARN; + goto err; + } + + if (!ring_resize(cfg_sink->ctx.ring, size)) { + ha_alert("parsing [%s:%d] : fail to set sink buffer size '%llu' for ring '%s'.\n", file, linenum, + (ullong)cfg_sink->ctx.ring->buf.size, cfg_sink->name); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + } + else if (strcmp(args[0], "backing-file") == 0) { + /* This tries to mmap file <file> for size <size> and to use it as a backing store + * for ring <ring>. Existing data are deleted. NULL is returned on error. 
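
The 'backing-file' directive parsed below maps the ring onto a file so its contents survive a crash and can be inspected post mortem: the size is rounded up to a whole page, the file is extended with ftruncate(), mapped shared, and the descriptor closed immediately since the mapping keeps the file referenced. A standalone sketch of those mechanics (hypothetical names, not part of this commit):

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

/* illustrative sketch only: map <path> as a shared writable area */
static void *map_backing_file(const char *path, size_t *size)
{
        void *area;
        int fd;

        *size = (*size + 4095UL) & ~4095UL;  /* page-align like the parser */
        fd = open(path, O_RDWR | O_CREAT, 0600);
        if (fd < 0)
                return NULL;
        if (ftruncate(fd, *size) != 0) {
                close(fd);
                return NULL;
        }
        area = mmap(NULL, *size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        close(fd);                           /* the mapping survives the close */
        return area == MAP_FAILED ? NULL : area;
}
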
+ */ + const char *backing = args[1]; + size_t size; + void *area; + int fd; + + if (!cfg_sink || (cfg_sink->type != SINK_TYPE_BUFFER)) { + ha_alert("parsing [%s:%d] : 'backing-file' only usable with existing rings.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + if (cfg_sink->store) { + ha_alert("parsing [%s:%d] : 'backing-file' already specified for ring '%s' (was '%s').\n", file, linenum, cfg_sink->name, cfg_sink->store); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + /* let's check if the file exists and is not empty. That's the + * only condition under which we'll trigger a rotate, so that + * config checks, reloads, or restarts that don't emit anything + * do not rotate it again. + */ + sink_rotate_file_backed_ring(backing); + + fd = open(backing, O_RDWR | O_CREAT, 0600); + if (fd < 0) { + ha_alert("parsing [%s:%d] : cannot open backing-file '%s' for ring '%s': %s.\n", file, linenum, backing, cfg_sink->name, strerror(errno)); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + size = (cfg_sink->ctx.ring->buf.size + 4095UL) & -4096UL; + if (ftruncate(fd, size) != 0) { + close(fd); + ha_alert("parsing [%s:%d] : could not adjust size of backing-file for ring '%s': %s.\n", file, linenum, cfg_sink->name, strerror(errno)); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + area = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (area == MAP_FAILED) { + close(fd); + ha_alert("parsing [%s:%d] : failed to use '%s' as a backing file for ring '%s': %s.\n", file, linenum, backing, cfg_sink->name, strerror(errno)); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + /* we don't need the file anymore */ + close(fd); + cfg_sink->store = strdup(backing); + + /* never fails */ + ring_free(cfg_sink->ctx.ring); + cfg_sink->ctx.ring = ring_make_from_area(area, size); + } + else if (strcmp(args[0],"server") == 0) { + if (!cfg_sink || (cfg_sink->type != SINK_TYPE_BUFFER)) { + ha_alert("parsing [%s:%d] : unable to create server '%s'.\n", file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + err_code |= parse_server(file, linenum, args, cfg_sink->forward_px, NULL, + SRV_PARSE_PARSE_ADDR|SRV_PARSE_INITIAL_RESOLVE); + } + else if (strcmp(args[0],"timeout") == 0) { + if (!cfg_sink || !cfg_sink->forward_px) { + ha_alert("parsing [%s:%d] : unable to set timeout '%s'.\n", file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + if (strcmp(args[1], "connect") == 0 || + strcmp(args[1], "server") == 0) { + const char *res; + unsigned int tout; + + if (!*args[2]) { + ha_alert("parsing [%s:%d] : '%s %s' expects <time> as argument.\n", + file, linenum, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + res = parse_time_err(args[2], &tout, TIME_UNIT_MS); + if (res == PARSE_TIME_OVER) { + ha_alert("parsing [%s:%d]: timer overflow in argument <%s> to <%s %s>, maximum value is 2147483647 ms (~24.8 days).\n", + file, linenum, args[2], args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + else if (res == PARSE_TIME_UNDER) { + ha_alert("parsing [%s:%d]: timer underflow in argument <%s> to <%s %s>, minimum non-null value is 1 ms.\n", + file, linenum, args[2], args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + else if (res) { + ha_alert("parsing [%s:%d]: unexpected character '%c' in argument to <%s %s>.\n", + file, linenum, *res, args[0], args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + if (args[1][0] == 'c') + 
cfg_sink->forward_px->timeout.connect = tout; + else + cfg_sink->forward_px->timeout.server = tout; + } + } + else if (strcmp(args[0],"format") == 0) { + if (!cfg_sink) { + ha_alert("parsing [%s:%d] : unable to set format '%s'.\n", file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + cfg_sink->fmt = get_log_format(args[1]); + if (cfg_sink->fmt == LOG_FORMAT_UNSPEC) { + ha_alert("parsing [%s:%d] : unknown format '%s'.\n", file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + } + else if (strcmp(args[0],"maxlen") == 0) { + if (!cfg_sink) { + ha_alert("parsing [%s:%d] : unable to set event max length '%s'.\n", file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + cfg_sink->maxlen = atol(args[1]); + if (!cfg_sink->maxlen) { + ha_alert("parsing [%s:%d] : invalid size '%s' for new sink buffer.\n", file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + } + else if (strcmp(args[0],"description") == 0) { + if (!cfg_sink) { + ha_alert("parsing [%s:%d] : unable to set description '%s'.\n", file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + if (!*args[1]) { + ha_alert("parsing [%s:%d] : missing ring description text.\n", file, linenum); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + + free(cfg_sink->desc); + + cfg_sink->desc = strdup(args[1]); + if (!cfg_sink->desc) { + ha_alert("parsing [%s:%d] : fail to set description '%s'.\n", file, linenum, args[1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + } + else { + ha_alert("parsing [%s:%d] : unknown statement '%s'.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto err; + } + +err: + return err_code; +} + +/* Creates a new sink buffer from a logger. + * + * It uses the logger's address to declare a forward + * server for this buffer. And it initializes the + * forwarding. + * + * The function returns a pointer on the + * allocated struct sink if allocate + * and initialize succeed, else if it fails + * it returns NULL. 
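
The 'timeout connect' and 'timeout server' values above go through parse_time_err(), which accepts a unit suffix and distinguishes overflow from underflow. A much reduced standalone sketch of that style of parser (hypothetical names and simplified error handling, not part of this commit):

#include <limits.h>
#include <stdlib.h>
#include <string.h>

/* illustrative sketch only: parse "<num>[ms|s|m]" into milliseconds */
static long parse_timeout_ms(const char *text)
{
        char *end;
        long long v = strtoll(text, &end, 10);
        long long mult = 1;            /* default unit: milliseconds */

        if (end == text || v < 0)
                return -1;
        if (strcmp(end, "s") == 0)
                mult = 1000;
        else if (strcmp(end, "m") == 0)
                mult = 60000;
        else if (*end && strcmp(end, "ms") != 0)
                return -1;
        if (v > INT_MAX / mult)
                return -1;             /* overflow, like PARSE_TIME_OVER */
        return (long)(v * mult);
}
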
+ * + * Note: the sink is created using the name + * specified into logger->target.ring_name + */ +struct sink *sink_new_from_logger(struct logger *logger) +{ + struct sink *sink = NULL; + struct server *srv = NULL; + char *err_msg = NULL; + + /* prepare description for the sink */ + chunk_reset(&trash); + chunk_printf(&trash, "created from log directive declared into '%s' at line %d", logger->conf.file, logger->conf.line); + + /* allocate a new sink buffer */ + sink = sink_new_ringbuf(logger->target.ring_name, trash.area, logger->conf.file, logger->conf.line, &err_msg); + if (!sink) { + ha_alert("%s.\n", err_msg); + ha_free(&err_msg); + goto error; + } + + /* ring format normally defaults to RAW, but here we set ring format + * to UNSPEC to inherit from caller format in sink_write() since we + * cannot customize implicit ring settings + */ + sink->fmt = LOG_FORMAT_UNSPEC; + + /* for the same reason, we disable sink->maxlen to inherit from caller + * maxlen in sink_write() + */ + sink->maxlen = 0; + + /* Set default connect and server timeout for sink forward proxy */ + sink->forward_px->timeout.connect = MS_TO_TICKS(1000); + sink->forward_px->timeout.server = MS_TO_TICKS(5000); + + /* allocate a new server to forward messages + * from ring buffer + */ + srv = new_server(sink->forward_px); + if (!srv) + goto error; + + /* init server */ + srv->id = strdup(logger->target.ring_name); + srv->conf.file = strdup(logger->conf.file); + srv->conf.line = logger->conf.line; + srv->addr = *logger->target.addr; + srv->svc_port = get_host_port(logger->target.addr); + HA_SPIN_INIT(&srv->lock); + + /* process per thread init */ + if (srv_init_per_thr(srv) == -1) + goto error; + + /* link srv with sink forward proxy: the servers are linked + * backwards first into proxy + */ + srv->next = sink->forward_px->srv; + sink->forward_px->srv = srv; + + if (sink_finalize(sink) & ERR_CODE) + goto error_final; + + return sink; + error: + srv_drop(srv); + + error_final: + sink_free(sink); + + return NULL; +} + +/* This function is pretty similar to sink_from_logger(): + * But instead of creating a forward proxy and server from a logger struct + * it uses already existing srv to create the forwarding sink, so most of + * the initialization is bypassed. + * + * The function returns a pointer on the + * allocated struct sink if allocate + * and initialize succeed, else if it fails + * it returns NULL. + * + * <from> allows to specify a string that will be inserted into the sink + * description to describe where it was created from. + + * Note: the sink is created using the name + * specified into srv->id + */ +struct sink *sink_new_from_srv(struct server *srv, const char *from) +{ + struct sink *sink = NULL; + int bufsize = (srv->log_bufsize) ? 
srv->log_bufsize : BUFSIZE; + + /* prepare description for the sink */ + chunk_reset(&trash); + chunk_printf(&trash, "created from %s declared into '%s' at line %d", from, srv->conf.file, srv->conf.line); + + /* directly create a sink of BUF type, and use UNSPEC log format to + * inherit from caller fmt in sink_write() + */ + sink = sink_new_buf(srv->id, trash.area, LOG_FORMAT_UNSPEC, bufsize); + if (!sink) { + ha_alert("unable to create a new sink buffer for server '%s'.\n", srv->id); + goto error; + } + + /* we disable sink->maxlen to inherit from caller + * maxlen in sink_write() + */ + sink->maxlen = 0; + + /* add server to sink */ + if (!sink_add_srv(sink, srv)) + goto error; + + if (sink_finalize(sink) & ERR_CODE) + goto error; + + return sink; + + error: + sink_free(sink); + + return NULL; +} + +/* + * Post parsing "ring" section. + * + * The function returns 0 on success, otherwise it returns error + * flags. + */ +int cfg_post_parse_ring() +{ + int err_code; + + err_code = sink_finalize(cfg_sink); + cfg_sink = NULL; + + return err_code; +} + +/* function: resolve a single logger target of BUFFER type + * + * Returns err_code which defaults to ERR_NONE and can be set to a combination + * of ERR_WARN, ERR_ALERT, ERR_FATAL and ERR_ABORT in case of errors. + * <msg> could be set at any time (it will usually be set on error, but + * could also be set when no error occurred to report a diag warning), thus it is + * up to the caller to check it and to free it. + */ +int sink_resolve_logger_buffer(struct logger *logger, char **msg) +{ + struct log_target *target = &logger->target; + int err_code = ERR_NONE; + struct sink *sink; + + BUG_ON(target->type != LOG_TARGET_BUFFER || (target->flags & LOG_TARGET_FL_RESOLVED)); + if (target->addr) { + sink = sink_new_from_logger(logger); + if (!sink) { + memprintf(msg, "cannot be initialized (failed to create implicit ring)"); + err_code |= ERR_ALERT | ERR_FATAL; + goto end; + } + ha_free(&target->addr); /* we no longer need this */ + } + else { + sink = sink_find(target->ring_name); + if (!sink) { + memprintf(msg, "uses unknown ring named '%s'", target->ring_name); + err_code |= ERR_ALERT | ERR_FATAL; + goto end; + } + else if (sink->type != SINK_TYPE_BUFFER) { + memprintf(msg, "uses incompatible ring '%s'", target->ring_name); + err_code |= ERR_ALERT | ERR_FATAL; + goto end; + } + } + /* consistency checks */ + if (sink && logger->maxlen > ring_max_payload(sink->ctx.ring)) { + memprintf(msg, "uses a max length which exceeds ring capacity ('%s' supports %lu bytes at most)", + target->ring_name, (unsigned long)ring_max_payload(sink->ctx.ring)); + } + else if (sink && logger->maxlen > sink->maxlen) { + memprintf(msg, "uses a ring with a smaller maxlen than the one specified on the log directive ('%s' has maxlen = %d), logs will be truncated according to the lowest maxlen between the two", + target->ring_name, sink->maxlen); + } + end: + ha_free(&target->ring_name); /* sink is resolved and will replace ring_name hint */ + target->sink = sink; + return err_code; +} + +static void sink_init() +{ + sink_new_fd("stdout", "standard output (fd#1)", LOG_FORMAT_RAW, 1); + sink_new_fd("stderr", "standard error (fd#2)", LOG_FORMAT_RAW, 2); + sink_new_buf("buf0", "in-memory ring buffer", LOG_FORMAT_TIMED, 1048576); +} + +static void sink_deinit() +{ + struct sink *sink, *sb; + + list_for_each_entry_safe(sink, sb, &sink_list, sink_list) + sink_free(sink); +} + +INITCALL0(STG_REGISTER, sink_init); +REGISTER_POST_DEINIT(sink_deinit); + +static struct 
cli_kw_list cli_kws = {{ },{ + { { "show", "events", NULL }, "show events [<sink>] [-w] [-n] : show event sink state", cli_parse_show_events, NULL, NULL }, + {{},} +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + +/* config parsers for this section */ +REGISTER_CONFIG_SECTION("ring", cfg_parse_ring, cfg_post_parse_ring); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/slz.c b/src/slz.c new file mode 100644 index 0000000..1560bac --- /dev/null +++ b/src/slz.c @@ -0,0 +1,1421 @@ +/* + * Copyright (C) 2013-2015 Willy Tarreau <w@1wt.eu> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <inttypes.h> +#include <stdio.h> +#include <string.h> +#include <import/slz.h> +#include <import/slz-tables.h> + +/* First, RFC1951-specific declarations and extracts from the RFC. + * + * RFC1951 - deflate stream format + + + * Data elements are packed into bytes in order of + increasing bit number within the byte, i.e., starting + with the least-significant bit of the byte. + * Data elements other than Huffman codes are packed + starting with the least-significant bit of the data + element. + * Huffman codes are packed starting with the most- + significant bit of the code. + + 3.2.3. Details of block format + + Each block of compressed data begins with 3 header bits + containing the following data: + + first bit BFINAL + next 2 bits BTYPE + + Note that the header bits do not necessarily begin on a byte + boundary, since a block does not necessarily occupy an integral + number of bytes. + + BFINAL is set if and only if this is the last block of the data + set. + + BTYPE specifies how the data are compressed, as follows: + + 00 - no compression + 01 - compressed with fixed Huffman codes + 10 - compressed with dynamic Huffman codes + 11 - reserved (error) + + 3.2.4. Non-compressed blocks (BTYPE=00) + + Any bits of input up to the next byte boundary are ignored. + The rest of the block consists of the following information: + + 0 1 2 3 4... + +---+---+---+---+================================+ + | LEN | NLEN |... LEN bytes of literal data...| + +---+---+---+---+================================+ + + LEN is the number of data bytes in the block. NLEN is the + one's complement of LEN. + + 3.2.5. 
Compressed blocks (length and distance codes) + + As noted above, encoded data blocks in the "deflate" format + consist of sequences of symbols drawn from three conceptually + distinct alphabets: either literal bytes, from the alphabet of + byte values (0..255), or <length, backward distance> pairs, + where the length is drawn from (3..258) and the distance is + drawn from (1..32,768). In fact, the literal and length + alphabets are merged into a single alphabet (0..285), where + values 0..255 represent literal bytes, the value 256 indicates + end-of-block, and values 257..285 represent length codes + (possibly in conjunction with extra bits following the symbol + code) as follows: + +Length encoding : + Extra Extra Extra + Code Bits Length(s) Code Bits Lengths Code Bits Length(s) + ---- ---- ------ ---- ---- ------- ---- ---- ------- + 257 0 3 267 1 15,16 277 4 67-82 + 258 0 4 268 1 17,18 278 4 83-98 + 259 0 5 269 2 19-22 279 4 99-114 + 260 0 6 270 2 23-26 280 4 115-130 + 261 0 7 271 2 27-30 281 5 131-162 + 262 0 8 272 2 31-34 282 5 163-194 + 263 0 9 273 3 35-42 283 5 195-226 + 264 0 10 274 3 43-50 284 5 227-257 + 265 1 11,12 275 3 51-58 285 0 258 + 266 1 13,14 276 3 59-66 + +Distance encoding : + Extra Extra Extra + Code Bits Dist Code Bits Dist Code Bits Distance + ---- ---- ---- ---- ---- ------ ---- ---- -------- + 0 0 1 10 4 33-48 20 9 1025-1536 + 1 0 2 11 4 49-64 21 9 1537-2048 + 2 0 3 12 5 65-96 22 10 2049-3072 + 3 0 4 13 5 97-128 23 10 3073-4096 + 4 1 5,6 14 6 129-192 24 11 4097-6144 + 5 1 7,8 15 6 193-256 25 11 6145-8192 + 6 2 9-12 16 7 257-384 26 12 8193-12288 + 7 2 13-16 17 7 385-512 27 12 12289-16384 + 8 3 17-24 18 8 513-768 28 13 16385-24576 + 9 3 25-32 19 8 769-1024 29 13 24577-32768 + + 3.2.6. Compression with fixed Huffman codes (BTYPE=01) + + The Huffman codes for the two alphabets are fixed, and are not + represented explicitly in the data. The Huffman code lengths + for the literal/length alphabet are: + + Lit Value Bits Codes + --------- ---- ----- + 0 - 143 8 00110000 through + 10111111 + 144 - 255 9 110010000 through + 111111111 + 256 - 279 7 0000000 through + 0010111 + 280 - 287 8 11000000 through + 11000111 + + The code lengths are sufficient to generate the actual codes, + as described above; we show the codes in the table for added + clarity. Literal/length values 286-287 will never actually + occur in the compressed data, but participate in the code + construction. + + Distance codes 0-31 are represented by (fixed-length) 5-bit + codes, with possible additional bits as shown in the table + shown in Paragraph 3.2.5, above. Note that distance codes 30- + 31 will never actually occur in the compressed data. + +*/ + +/* back references, built in a way that is optimal for 32/64 bits */ +union ref { + struct { + uint32_t pos; + uint32_t word; + } by32; + uint64_t by64; +}; + +#if defined(USE_64BIT_QUEUE) && defined(UNALIGNED_LE_OK) + +/* enqueue code x of <xbits> bits (LSB aligned, at most 24) and copy complete + * 32-bit words into output buffer. X must not contain non-zero bits above + * xbits. 
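
Given the fixed-Huffman table quoted above, a literal's code can be derived arithmetically. One subtlety: RFC1951 packs Huffman codes MSB-first while the bit queue below fills LSB-first, so the code has to be bit-reversed before being enqueued (slz precomputes the reversed codes in its fixed_huff[] table; the helper below is a hypothetical illustration, not part of this commit):

#include <stdint.h>

/* illustrative sketch only: fixed-Huffman code for literal <lit>,
 * bit-reversed for an LSB-first bit queue */
static uint32_t bit_reverse(uint32_t code, uint32_t bits)
{
        uint32_t r = 0;

        while (bits--) {
                r = (r << 1) | (code & 1);
                code >>= 1;
        }
        return r;
}

static uint32_t fixed_lit_code(uint32_t lit, uint32_t *bits)
{
        uint32_t code;

        if (lit < 144) {                     /* 8 bits, 0x30..0xBF */
                code = 0x30 + lit;
                *bits = 8;
        } else {                             /* 9 bits, 0x190..0x1FF */
                code = 0x190 + (lit - 144);
                *bits = 9;
        }
        return bit_reverse(code, *bits);
}
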
+ */ +static inline void enqueue24(struct slz_stream *strm, uint32_t x, uint32_t xbits) +{ + uint64_t queue = strm->queue + ((uint64_t)x << strm->qbits); + uint32_t qbits = strm->qbits + xbits; + + if (__builtin_expect(qbits >= 32, 1)) { + *(uint32_t *)strm->outbuf = queue; + queue >>= 32; + qbits -= 32; + strm->outbuf += 4; + } + + strm->queue = queue; + strm->qbits = qbits; +} + +#define enqueue8 enqueue24 + +/* flush the queue and align to next byte */ +static inline void flush_bits(struct slz_stream *strm) +{ + if (strm->qbits > 0) + *strm->outbuf++ = strm->queue; + + if (strm->qbits > 8) + *strm->outbuf++ = strm->queue >> 8; + + if (strm->qbits > 16) + *strm->outbuf++ = strm->queue >> 16; + + if (strm->qbits > 24) + *strm->outbuf++ = strm->queue >> 24; + + strm->queue = 0; + strm->qbits = 0; +} + +#else /* non-64 bit or aligned or big endian */ + +/* enqueue code x of <xbits> bits (LSB aligned, at most 24) and copy complete + * bytes into out buf. X must not contain non-zero bits above xbits. Prefer + * enqueue8() when xbits is known for being 8 or less. + */ +static void enqueue24(struct slz_stream *strm, uint32_t x, uint32_t xbits) +{ + uint32_t queue = strm->queue + (x << strm->qbits); + uint32_t qbits = strm->qbits + xbits; + + if (qbits >= 16) { +#ifndef UNALIGNED_LE_OK + strm->outbuf[0] = queue; + strm->outbuf[1] = queue >> 8; +#else + *(uint16_t *)strm->outbuf = queue; +#endif + strm->outbuf += 2; + queue >>= 16; + qbits -= 16; + } + + if (qbits >= 8) { + qbits -= 8; + *strm->outbuf++ = queue; + queue >>= 8; + } + strm->qbits = qbits; + strm->queue = queue; + return; +} + +/* enqueue code x of <xbits> bits (at most 8) and copy complete bytes into + * out buf. X must not contain non-zero bits above xbits. + */ +static inline void enqueue8(struct slz_stream *strm, uint32_t x, uint32_t xbits) +{ + uint32_t queue = strm->queue + (x << strm->qbits); + uint32_t qbits = strm->qbits + xbits; + + if (__builtin_expect((signed)(qbits - 8) >= 0, 1)) { + qbits -= 8; + *strm->outbuf++ = queue; + queue >>= 8; + } + + strm->qbits = qbits; + strm->queue = queue; +} + +/* align to next byte */ +static inline void flush_bits(struct slz_stream *strm) +{ + if (strm->qbits > 0) + *strm->outbuf++ = strm->queue; + + if (strm->qbits > 8) + *strm->outbuf++ = strm->queue >> 8; + + strm->queue = 0; + strm->qbits = 0; +} +#endif + + +/* only valid if buffer is already aligned */ +static inline void copy_8b(struct slz_stream *strm, uint32_t x) +{ + *strm->outbuf++ = x; +} + +/* only valid if buffer is already aligned */ +static inline void copy_16b(struct slz_stream *strm, uint32_t x) +{ + strm->outbuf[0] = x; + strm->outbuf[1] = x >> 8; + strm->outbuf += 2; +} + +/* only valid if buffer is already aligned */ +static inline void copy_32b(struct slz_stream *strm, uint32_t x) +{ + strm->outbuf[0] = x; + strm->outbuf[1] = x >> 8; + strm->outbuf[2] = x >> 16; + strm->outbuf[3] = x >> 24; + strm->outbuf += 4; +} + +static inline void send_huff(struct slz_stream *strm, uint32_t code) +{ + uint32_t bits; + + code = fixed_huff[code]; + bits = code & 15; + code >>= 4; + enqueue24(strm, code, bits); +} + +static inline void send_eob(struct slz_stream *strm) +{ + enqueue8(strm, 0, 7); // direct encoding of 256 = EOB (cf RFC1951) +} + +/* copies <len> literals from <buf>. <more> indicates that there are data past + * buf + <len>. <len> must not be null. 
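
The enqueue/flush pair above is a bit queue: codes accumulate LSB-first in a word, complete bytes are written out as they form, and a final flush pads with zero bits to realign on a byte boundary. A portable standalone sketch of the same mechanism without the word-at-a-time optimizations (hypothetical names, not part of this commit):

#include <stdint.h>

/* illustrative sketch only, not the slz implementation */
struct bitq {
        uint32_t queue;     /* pending bits, LSB first */
        uint32_t qbits;     /* number of pending bits, < 8 between calls */
        uint8_t *out;
};

/* push <xbits> bits of <x> (at most 24, no stray high bits allowed) */
static void bitq_push(struct bitq *q, uint32_t x, uint32_t xbits)
{
        q->queue += x << q->qbits;
        q->qbits += xbits;
        while (q->qbits >= 8) {
                *q->out++ = (uint8_t)q->queue;
                q->queue >>= 8;
                q->qbits -= 8;
        }
}

/* pad with zero bits up to the next byte boundary */
static void bitq_align(struct bitq *q)
{
        if (q->qbits)
                *q->out++ = (uint8_t)q->queue;
        q->queue = 0;
        q->qbits = 0;
}
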
+ */ +static void copy_lit(struct slz_stream *strm, const void *buf, uint32_t len, int more) +{ + uint32_t len2; + + do { + len2 = len; + if (__builtin_expect(len2 > 65535, 0)) + len2 = 65535; + + len -= len2; + + if (strm->state != SLZ_ST_EOB) + send_eob(strm); + + strm->state = (more || len) ? SLZ_ST_EOB : SLZ_ST_DONE; + + enqueue8(strm, !(more || len), 3); // BFINAL = !more ; BTYPE = 00 + flush_bits(strm); + copy_16b(strm, len2); // len2 + copy_16b(strm, ~len2); // nlen2 + memcpy(strm->outbuf, buf, len2); + buf += len2; + strm->outbuf += len2; + } while (len); +} + +/* copies <len> literals from <buf>. <more> indicates that there are data past + * buf + <len>. <len> must not be null. + */ +static void copy_lit_huff(struct slz_stream *strm, const unsigned char *buf, uint32_t len, int more) +{ + uint32_t pos; + + /* This ugly construct limits the amount of tests and optimizes for the + * most common case (more > 0). + */ + if (strm->state == SLZ_ST_EOB) { + eob: + strm->state = more ? SLZ_ST_FIXED : SLZ_ST_LAST; + enqueue8(strm, 2 + !more, 3); // BFINAL = !more ; BTYPE = 01 + } + else if (!more) { + send_eob(strm); + goto eob; + } + + pos = 0; + do { + send_huff(strm, buf[pos++]); + } while (pos < len); +} + +/* format: + * bit0..31 = word + * bit32..63 = last position in buffer of similar content + */ + +/* This hash provides good average results on HTML contents, and is among the + * few which provide almost optimal results on various different pages. + */ +static inline uint32_t slz_hash(uint32_t a) +{ +#if defined(__ARM_FEATURE_CRC32) +# if defined(__ARM_ARCH_ISA_A64) + // 64 bit mode + __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(a) : "r"(0)); +# else + // 32 bit mode (e.g. armv7 compiler building for armv8) + __asm__ volatile("crc32w %0,%0,%1" : "+r"(a) : "r"(0)); +# endif + return a >> (32 - HASH_BITS); +#else + return ((a << 19) + (a << 6) - a) >> (32 - HASH_BITS); +#endif +} + +/* This function compares buffers <a> and <b> and reads 32 or 64 bits at a time + * during the approach. It makes use of unaligned little endian memory accesses + * on capable architectures. <max> is the maximum number of bytes that can be + * read, so both <a> and <b> must have at least <max> bytes ahead. <max> may + * safely be null or negative if that simplifies computations in the caller. + */ +static inline long memmatch(const unsigned char *a, const unsigned char *b, long max) +{ + long len = 0; + +#ifdef UNALIGNED_LE_OK + unsigned long xor; + + while (1) { + if ((long)(len + 2 * sizeof(long)) > max) { + while (len < max) { + if (a[len] != b[len]) + break; + len++; + } + return len; + } + + xor = *(long *)&a[len] ^ *(long *)&b[len]; + if (xor) + break; + len += sizeof(long); + + xor = *(long *)&a[len] ^ *(long *)&b[len]; + if (xor) + break; + len += sizeof(long); + } + +#if defined(__x86_64__) || defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) + /* x86 has bsf. We know that xor is non-null here */ + asm("bsf %1,%0\n" : "=r"(xor) : "0" (xor)); + return len + xor / 8; +#else + if (sizeof(long) > 4 && !(xor & 0xffffffff)) { + /* This code is optimized out on 32-bit archs, but we still + * need to shift in two passes to avoid a warning. It is + * properly optimized out as a single shift. 
+ */ + xor >>= 16; xor >>= 16; + if (xor & 0xffff) { + if (xor & 0xff) + return len + 4; + return len + 5; + } + if (xor & 0xffffff) + return len + 6; + return len + 7; + } + + if (xor & 0xffff) { + if (xor & 0xff) + return len; + return len + 1; + } + if (xor & 0xffffff) + return len + 2; + return len + 3; +#endif // x86 + +#else // UNALIGNED_LE_OK + /* This is the generic version for big endian or unaligned-incompatible + * architectures. + */ + while (len < max) { + if (a[len] != b[len]) + break; + len++; + } + return len; + +#endif +} + +/* sets <count> BYTES to -32769 in <refs> so that any uninitialized entry will + * verify (pos-last-1 >= 32768) and be ignored. <count> must be a multiple of + * 128 bytes and <refs> must be at least one count in length. It's supposed to + * be applied to 64-bit aligned data exclusively, which makes it slightly + * faster than the regular memset() since no alignment check is performed. + */ +static void reset_refs(union ref *refs, long count) +{ + /* avoid a shift/mask by casting to void* */ + union ref *end = (void *)refs + count; + + do { + refs[ 0].by64 = -32769; + refs[ 1].by64 = -32769; + refs[ 2].by64 = -32769; + refs[ 3].by64 = -32769; + refs[ 4].by64 = -32769; + refs[ 5].by64 = -32769; + refs[ 6].by64 = -32769; + refs[ 7].by64 = -32769; + refs[ 8].by64 = -32769; + refs[ 9].by64 = -32769; + refs[10].by64 = -32769; + refs[11].by64 = -32769; + refs[12].by64 = -32769; + refs[13].by64 = -32769; + refs[14].by64 = -32769; + refs[15].by64 = -32769; + refs += 16; + } while (refs < end); +} + +/* Compresses <ilen> bytes from <in> into <out> according to RFC1951. The + * output result may be up to 5 bytes larger than the input, to which 2 extra + * bytes may be added to send the last chunk due to BFINAL+EOB encoding (10 + * bits) when <more> is not set. The caller is responsible for ensuring there + * is enough room in the output buffer for this. The amount of output bytes is + * returned, and no CRC is computed. + */ +long slz_rfc1951_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more) +{ + long rem = ilen; + unsigned long pos = 0; + unsigned long last; + uint32_t word = 0; + long mlen; + uint32_t h; + uint64_t ent; + + uint32_t plit = 0; + uint32_t bit9 = 0; + uint32_t dist, code; + union ref refs[1 << HASH_BITS]; + + if (!strm->level) { + /* force to send as literals (eg to preserve CPU) */ + strm->outbuf = out; + plit = pos = ilen; + bit9 = 52; /* force literal dump */ + goto final_lit_dump; + } + + reset_refs(refs, sizeof(refs)); + + strm->outbuf = out; + +#ifndef UNALIGNED_FASTER + word = ((unsigned char)in[pos] << 8) + ((unsigned char)in[pos + 1] << 16) + ((unsigned char)in[pos + 2] << 24); +#endif + while (rem >= 4) { +#ifndef UNALIGNED_FASTER + word = ((unsigned char)in[pos + 3] << 24) + (word >> 8); +#else + word = *(uint32_t *)&in[pos]; +#endif + h = slz_hash(word); + asm volatile ("" ::); // prevent gcc from trying to be smart with the prefetch + + if (sizeof(long) >= 8) { + ent = refs[h].by64; + last = (uint32_t)ent; + ent >>= 32; + refs[h].by64 = ((uint64_t)pos) + ((uint64_t)word << 32); + } else { + ent = refs[h].by32.word; + last = refs[h].by32.pos; + refs[h].by32.pos = pos; + refs[h].by32.word = word; + } + +#ifdef FIND_OPTIMAL_MATCH + /* Experimental code to see what could be saved with an ideal + * longest match lookup algorithm. This one is very slow but + * scans the whole window. 
+                 * In short, here are the savings :
+                 *   file          orig    fast(ratio)   optimal(ratio)
+                 *   README        5185    3419 (65.9%)   3165 (61.0%)  -7.5%
+                 *   index.html   76799   35662 (46.4%)  29875 (38.9%) -16.3%
+                 *   rfc1952.c    29383   13442 (45.7%)  11793 (40.1%) -12.3%
+                 *
+                 * Thus the savings to expect for large files is at best 16%.
+                 *
+                 * A non-colliding hash gives 33025 instead of 35662 (-7.4%),
+                 * and keeping the last two entries gives 31724 (-11.0%).
+                 */
+                unsigned long scan;
+                int saved = 0;
+                int bestpos = 0;
+                int bestlen = 0;
+                int firstlen = 0;
+                int max_lookup = 2; // 0 = no limit
+
+                for (scan = pos - 1; scan < pos && (unsigned long)(pos - scan - 1) < 32768; scan--) {
+                        int len;
+
+                        if (*(uint32_t *)(in + scan) != word)
+                                continue;
+
+                        len = memmatch(in + pos, in + scan, rem);
+                        if (!bestlen)
+                                firstlen = len;
+
+                        if (len > bestlen) {
+                                bestlen = len;
+                                bestpos = scan;
+                        }
+                        if (!--max_lookup)
+                                break;
+                }
+                if (bestlen) {
+                        //printf("pos=%d last=%d bestpos=%d word=%08x ent=%08x len=%d\n",
+                        //       (int)pos, (int)last, (int)bestpos, (int)word, (int)ent, bestlen);
+                        last = bestpos;
+                        ent = word;
+                        saved += bestlen - firstlen;
+                }
+                //fprintf(stderr, "first=%d best=%d saved_total=%d\n", firstlen, bestlen, saved);
+#endif
+
+                if ((uint32_t)ent != word) {
+                send_as_lit:
+                        rem--;
+                        plit++;
+                        bit9 += ((unsigned char)word >= 144);
+                        pos++;
+                        continue;
+                }
+
+                /* We reject pos = last and pos > last+32768 */
+                if ((unsigned long)(pos - last - 1) >= 32768)
+                        goto send_as_lit;
+
+                /* Note: cannot encode a length larger than 258 bytes */
+                mlen = memmatch(in + pos + 4, in + last + 4, (rem > 258 ? 258 : rem) - 4) + 4;
+
+                /* found a matching entry */
+
+                if (bit9 >= 52 && mlen < 6)
+                        goto send_as_lit;
+
+                /* compute the output code, its size and the length's size in
+                 * bits to know if the reference is cheaper than literals.
+                 */
+                code = len_fh[mlen];
+
+                /* direct mapping of dist->huffman code */
+                dist = fh_dist_table[pos - last - 1];
+
+                /* if encoding the dist+length is more expensive than sending
+                 * the equivalent as bytes, let's keep the literals.
+                 */
+                if ((dist & 0x1f) + (code >> 16) + 8 >= 8 * mlen + bit9)
+                        goto send_as_lit;
+
+                /* first, copy pending literals */
+                if (plit) {
+                        /* Huffman encoding requires 9 bits for octets 144..255, so this
+                         * is a waste of space for binary data. Switching between Huffman
+                         * and no-comp then huffman consumes 52 bits (7 for EOB + 3 for
+                         * block type + 7 for alignment + 32 for LEN+NLEN + 3 for next
+                         * block. Only use plain literals if there are more than 52 bits
+                         * to save.
+                         */
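Editorial aside: the bit9 >= 52 threshold used just below follows directly from this accounting. As a hedged illustration (these helpers are not part of slz), the two encodings compare in bits as follows:

/* Hypothetical cost model (illustration only): for <plit> pending literals of
 * which <bit9> need 9-bit fixed-Huffman codes, compare both encodings in bits.
 */
static inline unsigned stored_cost_bits(unsigned plit)
{
        /* 7 (EOB) + 3 (BTYPE) + 7 (worst-case alignment) + 32 (LEN+NLEN)
         * + 3 (reopening the next huffman block) = 52 bits of overhead,
         * then 8 bits per literal.
         */
        return 52 + 8 * plit;
}

static inline unsigned huffman_cost_bits(unsigned plit, unsigned bit9)
{
        /* 8 bits per literal plus one extra bit per code in 144..255 */
        return 8 * plit + bit9;
}

/* huffman_cost_bits() >= stored_cost_bits() exactly when bit9 >= 52, which is
 * the test performed by the encoder.
 */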
+                        if (bit9 >= 52)
+                                copy_lit(strm, in + pos - plit, plit, 1);
+                        else
+                                copy_lit_huff(strm, in + pos - plit, plit, 1);
+
+                        plit = 0;
+                }
+
+                /* use mode 01 - fixed huffman */
+                if (strm->state == SLZ_ST_EOB) {
+                        strm->state = SLZ_ST_FIXED;
+                        enqueue8(strm, 0x02, 3); // BTYPE = 01, BFINAL = 0
+                }
+
+                /* copy the length first */
+                enqueue24(strm, code & 0xFFFF, code >> 16);
+
+                /* in fixed huffman mode, dist is fixed 5 bits */
+                enqueue24(strm, dist >> 5, dist & 0x1f);
+                bit9 = 0;
+                rem -= mlen;
+                pos += mlen;
+
+#ifndef UNALIGNED_FASTER
+#ifdef UNALIGNED_LE_OK
+                word = *(uint32_t *)&in[pos - 1];
+#else
+                word = ((unsigned char)in[pos] << 8) + ((unsigned char)in[pos + 1] << 16) + ((unsigned char)in[pos + 2] << 24);
+#endif
+#endif
+        }
+
+        if (__builtin_expect(rem, 0)) {
+                /* we're reading the 1..3 last bytes */
+                plit += rem;
+                do {
+                        bit9 += ((unsigned char)in[pos++] >= 144);
+                } while (--rem);
+        }
+
+ final_lit_dump:
+        /* now copy remaining literals or mark the end */
+        if (plit) {
+                if (bit9 >= 52)
+                        copy_lit(strm, in + pos - plit, plit, more);
+                else
+                        copy_lit_huff(strm, in + pos - plit, plit, more);
+
+                plit = 0;
+        }
+
+        strm->ilen += ilen;
+        return strm->outbuf - out;
+}
+
+/* Initializes stream <strm> for use with raw deflate (rfc1951). The CRC is
+ * unused but set to zero. The compression level passed in <level> is set. This
+ * value can only be 0 (no compression) or 1 (compression) and other values
+ * will lead to unpredictable behaviour. The function always returns 0.
+ */
+int slz_rfc1951_init(struct slz_stream *strm, int level)
+{
+        strm->state = SLZ_ST_EOB; // no header
+        strm->level = level;
+        strm->format = SLZ_FMT_DEFLATE;
+        strm->crc32 = 0;
+        strm->ilen = 0;
+        strm->qbits = 0;
+        strm->queue = 0;
+        return 0;
+}
+
+/* Flushes any pending data for stream <strm> into buffer <buf>, then emits an
+ * empty literal block to byte-align the output, allowing the queue to be
+ * completely flushed. This requires that the output buffer still has the size
+ * of the queue available (up to 4 bytes), plus one byte for (BFINAL,BTYPE),
+ * plus 4 bytes for LEN+NLEN, or a total of 9 bytes in the worst case. The
+ * number of bytes emitted is returned. It is guaranteed that the queue is
+ * empty on return. This may cause some overhead by adding needless 5-byte
+ * blocks if called too often.
+ */
+int slz_rfc1951_flush(struct slz_stream *strm, unsigned char *buf)
+{
+        strm->outbuf = buf;
+
+        /* The queue is always empty on INIT, DONE, and END */
+        if (!strm->qbits)
+                return 0;
+
+        /* we may need to terminate a huffman output. Lit is always in EOB state */
+        if (strm->state != SLZ_ST_EOB) {
+                strm->state = (strm->state == SLZ_ST_LAST) ? SLZ_ST_DONE : SLZ_ST_EOB;
+                send_eob(strm);
+        }
+
+        /* send BFINAL according to state, and BTYPE=00 (lit) */
+        enqueue8(strm, (strm->state == SLZ_ST_DONE) ? 1 : 0, 3);
+        flush_bits(strm);            // emit pending bits
+        copy_32b(strm, 0xFFFF0000U); // len=0, nlen=~0
+
+        /* Now the queue is empty, EOB was sent, BFINAL might have been sent if
+         * we completed the last block, and a zero-byte block was sent to byte-
+         * align the output. The last state reflects all this. Let's just
+         * return the number of bytes added to the output buffer.
+         */
+        return strm->outbuf - buf;
+}
+
+/* Flushes any pending data for stream <strm> into buffer <buf>, then sends
+ * BTYPE=1 and BFINAL=1 if needed. The stream ends in SLZ_ST_DONE. It returns
+ * the number of bytes emitted.
The trailer consists in flushing the possibly pending bits + * from the queue (up to 7 bits), then possibly EOB (7 bits), then 3 bits, EOB, + * a rounding to the next byte, which amounts to a total of 4 bytes max, that + * the caller must ensure are available before calling the function. + */ +int slz_rfc1951_finish(struct slz_stream *strm, unsigned char *buf) +{ + strm->outbuf = buf; + + if (strm->state == SLZ_ST_FIXED || strm->state == SLZ_ST_LAST) { + strm->state = (strm->state == SLZ_ST_LAST) ? SLZ_ST_DONE : SLZ_ST_EOB; + send_eob(strm); + } + + if (strm->state != SLZ_ST_DONE) { + /* send BTYPE=1, BFINAL=1 */ + enqueue8(strm, 3, 3); + send_eob(strm); + strm->state = SLZ_ST_DONE; + } + + flush_bits(strm); + return strm->outbuf - buf; +} + +/* Now RFC1952-specific declarations and extracts from RFC. + * From RFC1952 about the GZIP file format : + +A gzip file consists of a series of "members" ... + +2.3. Member format + + Each member has the following structure: + + +---+---+---+---+---+---+---+---+---+---+ + |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->) + +---+---+---+---+---+---+---+---+---+---+ + + (if FLG.FEXTRA set) + + +---+---+=================================+ + | XLEN |...XLEN bytes of "extra field"...| (more-->) + +---+---+=================================+ + + (if FLG.FNAME set) + + +=========================================+ + |...original file name, zero-terminated...| (more-->) + +=========================================+ + + (if FLG.FCOMMENT set) + + +===================================+ + |...file comment, zero-terminated...| (more-->) + +===================================+ + + (if FLG.FHCRC set) + + +---+---+ + | CRC16 | + +---+---+ + + +=======================+ + |...compressed blocks...| (more-->) + +=======================+ + + 0 1 2 3 4 5 6 7 + +---+---+---+---+---+---+---+---+ + | CRC32 | ISIZE | + +---+---+---+---+---+---+---+---+ + + +2.3.1. Member header and trailer + + ID1 (IDentification 1) + ID2 (IDentification 2) + These have the fixed values ID1 = 31 (0x1f, \037), ID2 = 139 + (0x8b, \213), to identify the file as being in gzip format. + + CM (Compression Method) + This identifies the compression method used in the file. CM + = 0-7 are reserved. CM = 8 denotes the "deflate" + compression method, which is the one customarily used by + gzip and which is documented elsewhere. + + FLG (FLaGs) + This flag byte is divided into individual bits as follows: + + bit 0 FTEXT + bit 1 FHCRC + bit 2 FEXTRA + bit 3 FNAME + bit 4 FCOMMENT + bit 5 reserved + bit 6 reserved + bit 7 reserved + + Reserved FLG bits must be zero. + + MTIME (Modification TIME) + This gives the most recent modification time of the original + file being compressed. The time is in Unix format, i.e., + seconds since 00:00:00 GMT, Jan. 1, 1970. (Note that this + may cause problems for MS-DOS and other systems that use + local rather than Universal time.) If the compressed data + did not come from a file, MTIME is set to the time at which + compression started. MTIME = 0 means no time stamp is + available. + + XFL (eXtra FLags) + These flags are available for use by specific compression + methods. The "deflate" method (CM = 8) sets these flags as + follows: + + XFL = 2 - compressor used maximum compression, + slowest algorithm + XFL = 4 - compressor used fastest algorithm + + OS (Operating System) + This identifies the type of file system on which compression + took place. This may be useful in determining end-of-line + convention for text files. 
The currently defined values are + as follows: + + 0 - FAT filesystem (MS-DOS, OS/2, NT/Win32) + 1 - Amiga + 2 - VMS (or OpenVMS) + 3 - Unix + 4 - VM/CMS + 5 - Atari TOS + 6 - HPFS filesystem (OS/2, NT) + 7 - Macintosh + 8 - Z-System + 9 - CP/M + 10 - TOPS-20 + 11 - NTFS filesystem (NT) + 12 - QDOS + 13 - Acorn RISCOS + 255 - unknown + + ==> A file compressed using "gzip -1" on Unix-like systems can be : + + 1F 8B 08 00 00 00 00 00 04 03 + <deflate-compressed stream> + crc32 size32 +*/ + +static const unsigned char gzip_hdr[] = { 0x1F, 0x8B, // ID1, ID2 + 0x08, 0x00, // Deflate, flags (none) + 0x00, 0x00, 0x00, 0x00, // mtime: none + 0x04, 0x03 }; // fastest comp, OS=Unix + +static inline uint32_t crc32_char(uint32_t crc, uint8_t x) +{ +#if defined(__ARM_FEATURE_CRC32) + crc = ~crc; +# if defined(__ARM_ARCH_ISA_A64) + // 64 bit mode + __asm__ volatile("crc32b %w0,%w0,%w1" : "+r"(crc) : "r"(x)); +# else + // 32 bit mode (e.g. armv7 compiler building for armv8 + __asm__ volatile("crc32b %0,%0,%1" : "+r"(crc) : "r"(x)); +# endif + crc = ~crc; +#else + crc = crc32_fast[0][(crc ^ x) & 0xff] ^ (crc >> 8); +#endif + return crc; +} + +static inline uint32_t crc32_uint32(uint32_t data) +{ +#if defined(__ARM_FEATURE_CRC32) +# if defined(__ARM_ARCH_ISA_A64) + // 64 bit mode + __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(data) : "r"(~0UL)); +# else + // 32 bit mode (e.g. armv7 compiler building for armv8 + __asm__ volatile("crc32w %0,%0,%1" : "+r"(data) : "r"(~0UL)); +# endif + data = ~data; +#else + data = crc32_fast[3][(data >> 0) & 0xff] ^ + crc32_fast[2][(data >> 8) & 0xff] ^ + crc32_fast[1][(data >> 16) & 0xff] ^ + crc32_fast[0][(data >> 24) & 0xff]; +#endif + return data; +} + +/* Modified version originally from RFC1952, working with non-inverting CRCs */ +uint32_t slz_crc32_by1(uint32_t crc, const unsigned char *buf, int len) +{ + int n; + + for (n = 0; n < len; n++) + crc = crc32_char(crc, buf[n]); + return crc; +} + +/* This version computes the crc32 of <buf> over <len> bytes, doing most of it + * in 32-bit chunks. + */ +uint32_t slz_crc32_by4(uint32_t crc, const unsigned char *buf, int len) +{ + const unsigned char *end = buf + len; + + while (buf <= end - 16) { +#ifdef UNALIGNED_LE_OK +#if defined(__ARM_FEATURE_CRC32) + crc = ~crc; +# if defined(__ARM_ARCH_ISA_A64) + // 64 bit mode + __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf))); + __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 4))); + __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 8))); + __asm__ volatile("crc32w %w0,%w0,%w1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 12))); +# else + // 32 bit mode (e.g. 
armv7 compiler building for armv8 + __asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf))); + __asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 4))); + __asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 8))); + __asm__ volatile("crc32w %0,%0,%1" : "+r"(crc) : "r"(*(uint32_t*)(buf + 12))); +# endif + crc = ~crc; +#else + crc ^= *(uint32_t *)buf; + crc = crc32_uint32(crc); + + crc ^= *(uint32_t *)(buf + 4); + crc = crc32_uint32(crc); + + crc ^= *(uint32_t *)(buf + 8); + crc = crc32_uint32(crc); + + crc ^= *(uint32_t *)(buf + 12); + crc = crc32_uint32(crc); +#endif +#else + crc = crc32_fast[3][(buf[0] ^ (crc >> 0)) & 0xff] ^ + crc32_fast[2][(buf[1] ^ (crc >> 8)) & 0xff] ^ + crc32_fast[1][(buf[2] ^ (crc >> 16)) & 0xff] ^ + crc32_fast[0][(buf[3] ^ (crc >> 24)) & 0xff]; + + crc = crc32_fast[3][(buf[4] ^ (crc >> 0)) & 0xff] ^ + crc32_fast[2][(buf[5] ^ (crc >> 8)) & 0xff] ^ + crc32_fast[1][(buf[6] ^ (crc >> 16)) & 0xff] ^ + crc32_fast[0][(buf[7] ^ (crc >> 24)) & 0xff]; + + crc = crc32_fast[3][(buf[8] ^ (crc >> 0)) & 0xff] ^ + crc32_fast[2][(buf[9] ^ (crc >> 8)) & 0xff] ^ + crc32_fast[1][(buf[10] ^ (crc >> 16)) & 0xff] ^ + crc32_fast[0][(buf[11] ^ (crc >> 24)) & 0xff]; + + crc = crc32_fast[3][(buf[12] ^ (crc >> 0)) & 0xff] ^ + crc32_fast[2][(buf[13] ^ (crc >> 8)) & 0xff] ^ + crc32_fast[1][(buf[14] ^ (crc >> 16)) & 0xff] ^ + crc32_fast[0][(buf[15] ^ (crc >> 24)) & 0xff]; +#endif + buf += 16; + } + + while (buf <= end - 4) { +#ifdef UNALIGNED_LE_OK + crc ^= *(uint32_t *)buf; + crc = crc32_uint32(crc); +#else + crc = crc32_fast[3][(buf[0] ^ (crc >> 0)) & 0xff] ^ + crc32_fast[2][(buf[1] ^ (crc >> 8)) & 0xff] ^ + crc32_fast[1][(buf[2] ^ (crc >> 16)) & 0xff] ^ + crc32_fast[0][(buf[3] ^ (crc >> 24)) & 0xff]; +#endif + buf += 4; + } + + while (buf < end) + crc = crc32_char(crc, *buf++); + return crc; +} + +/* uses the most suitable crc32 function to update crc on <buf, len> */ +static inline uint32_t update_crc(uint32_t crc, const void *buf, int len) +{ + return slz_crc32_by4(crc, buf, len); +} + +/* Sends the gzip header for stream <strm> into buffer <buf>. When it's done, + * the stream state is updated to SLZ_ST_EOB. It returns the number of bytes + * emitted which is always 10. The caller is responsible for ensuring there's + * always enough room in the buffer. + */ +int slz_rfc1952_send_header(struct slz_stream *strm, unsigned char *buf) +{ + memcpy(buf, gzip_hdr, sizeof(gzip_hdr)); + strm->state = SLZ_ST_EOB; + return sizeof(gzip_hdr); +} + +/* Encodes the block according to rfc1952. This means that the CRC of the input + * block is computed according to the CRC32 algorithm. If the header was never + * sent, it may be sent first. The number of output bytes is returned. + */ +long slz_rfc1952_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more) +{ + long ret = 0; + + if (__builtin_expect(strm->state == SLZ_ST_INIT, 0)) + ret += slz_rfc1952_send_header(strm, out); + + strm->crc32 = update_crc(strm->crc32, in, ilen); + ret += slz_rfc1951_encode(strm, out + ret, in, ilen, more); + return ret; +} + +/* Initializes stream <strm> for use with the gzip format (rfc1952). The + * compression level passed in <level> is set. This value can only be 0 (no + * compression) or 1 (compression) and other values will lead to unpredictable + * behaviour. The function always returns 0. 
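Editorial aside, before the gzip entry points are defined below: a minimal usage sketch of the three rfc1952 calls (a hypothetical helper, not part of this file; the output margin is chosen loosely after the size comments above, i.e. a 10-byte header, a few bytes of per-block expansion, and a 12-byte trailer in the worst case):

/* Hypothetical sketch: compress one in-memory buffer to gzip format in a
 * single pass, assuming the slz declarations are in scope.
 */
#include <stdlib.h>

long gzip_once(const unsigned char *in, long ilen, unsigned char **out)
{
        struct slz_stream strm;
        long olen = 0;

        *out = malloc(ilen + ilen / 32 + 64);  /* generous safety margin */
        if (!*out)
                return -1;

        slz_rfc1952_init(&strm, 1);                           /* level 1 = compress */
        olen += slz_rfc1952_encode(&strm, *out, in, ilen, 0); /* more=0: last chunk */
        olen += slz_rfc1952_finish(&strm, *out + olen);       /* CRC32 + ISIZE trailer */
        return olen;
}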
+ */
+int slz_rfc1952_init(struct slz_stream *strm, int level)
+{
+        strm->state = SLZ_ST_INIT;
+        strm->level = level;
+        strm->format = SLZ_FMT_GZIP;
+        strm->crc32 = 0;
+        strm->ilen = 0;
+        strm->qbits = 0;
+        strm->queue = 0;
+        return 0;
+}
+
+/* Flushes any pending data for stream <strm> into buffer <buf>, then emits an
+ * empty literal block to byte-align the output, allowing the queue to be
+ * completely flushed. Note that if the initial header was never sent, it will
+ * be sent first as well (10 extra bytes). This requires that the output buffer
+ * still has this plus the size of the queue available (up to 4 bytes), plus
+ * one byte for (BFINAL,BTYPE), plus 4 bytes for LEN+NLEN, or a total of 19
+ * bytes in the worst case. The number of bytes emitted is returned. It is
+ * guaranteed that the queue is empty on return. This may cause some overhead
+ * by adding needless 5-byte blocks if called too often.
+ */
+int slz_rfc1952_flush(struct slz_stream *strm, unsigned char *buf)
+{
+        int sent = 0;
+
+        if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
+                sent = slz_rfc1952_send_header(strm, buf);
+
+        sent += slz_rfc1951_flush(strm, buf + sent);
+        return sent;
+}
+
+/* Flushes pending bits and sends the gzip trailer for stream <strm> into
+ * buffer <buf>. When it's done, the stream state is updated to SLZ_ST_END. It
+ * returns the number of bytes emitted. The trailer consists in flushing the
+ * possibly pending bits from the queue (up to 24 bits), rounding to the next
+ * byte, then 4 bytes for the CRC and another 4 bytes for the input length.
+ * That may amount to 4+4+4 = 12 bytes, that the caller must ensure are
+ * available before calling the function. Note that if the initial header was
+ * never sent, it will be sent first as well (10 extra bytes).
+ */
+int slz_rfc1952_finish(struct slz_stream *strm, unsigned char *buf)
+{
+        strm->outbuf = buf;
+
+        if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
+                strm->outbuf += slz_rfc1952_send_header(strm, strm->outbuf);
+
+        slz_rfc1951_finish(strm, strm->outbuf);
+        copy_32b(strm, strm->crc32);
+        copy_32b(strm, strm->ilen);
+        strm->state = SLZ_ST_END;
+
+        return strm->outbuf - buf;
+}
+
+
+/* RFC1950-specific stuff. This is for the Zlib stream format.
+ * From RFC1950 (zlib) :
+ *
+
+   2.2. Data format
+
+      A zlib stream has the following structure:
+
+           0   1
+         +---+---+
+         |CMF|FLG|   (more-->)
+         +---+---+
+
+
+      (if FLG.FDICT set)
+
+           0   1   2   3
+         +---+---+---+---+
+         |     DICTID    |   (more-->)
+         +---+---+---+---+
+
+         +=====================+---+---+---+---+
+         |...compressed data...|    ADLER32    |
+         +=====================+---+---+---+---+
+
+      Any data which may appear after ADLER32 are not part of the zlib
+      stream.
+
+      CMF (Compression Method and flags)
+         This byte is divided into a 4-bit compression method and a 4-
+         bit information field depending on the compression method.
+
+            bits 0 to 3  CM     Compression method
+            bits 4 to 7  CINFO  Compression info
+
+      CM (Compression method)
+         This identifies the compression method used in the file. CM = 8
+         denotes the "deflate" compression method with a window size up
+         to 32K. This is the method used by gzip and PNG (see
+         references [1] and [2] in Chapter 3, below, for the reference
+         documents). CM = 15 is reserved. It might be used in a future
+         version of this specification to indicate the presence of an
+         extra field before the compressed data.
+
+      CINFO (Compression info)
+         For CM = 8, CINFO is the base-2 logarithm of the LZ77 window
+         size, minus eight (CINFO=7 indicates a 32K window size).
Values + of CINFO above 7 are not allowed in this version of the + specification. CINFO is not defined in this specification for + CM not equal to 8. + + FLG (FLaGs) + This flag byte is divided as follows: + + bits 0 to 4 FCHECK (check bits for CMF and FLG) + bit 5 FDICT (preset dictionary) + bits 6 to 7 FLEVEL (compression level) + + The FCHECK value must be such that CMF and FLG, when viewed as + a 16-bit unsigned integer stored in MSB order (CMF*256 + FLG), + is a multiple of 31. + + + FDICT (Preset dictionary) + If FDICT is set, a DICT dictionary identifier is present + immediately after the FLG byte. The dictionary is a sequence of + bytes which are initially fed to the compressor without + producing any compressed output. DICT is the Adler-32 checksum + of this sequence of bytes (see the definition of ADLER32 + below). The decompressor can use this identifier to determine + which dictionary has been used by the compressor. + + FLEVEL (Compression level) + These flags are available for use by specific compression + methods. The "deflate" method (CM = 8) sets these flags as + follows: + + 0 - compressor used fastest algorithm + 1 - compressor used fast algorithm + 2 - compressor used default algorithm + 3 - compressor used maximum compression, slowest algorithm + + The information in FLEVEL is not needed for decompression; it + is there to indicate if recompression might be worthwhile. + + compressed data + For compression method 8, the compressed data is stored in the + deflate compressed data format as described in the document + "DEFLATE Compressed Data Format Specification" by L. Peter + Deutsch. (See reference [3] in Chapter 3, below) + + Other compressed data formats are not specified in this version + of the zlib specification. + + ADLER32 (Adler-32 checksum) + This contains a checksum value of the uncompressed data + (excluding any dictionary data) computed according to Adler-32 + algorithm. This algorithm is a 32-bit extension and improvement + of the Fletcher algorithm, used in the ITU-T X.224 / ISO 8073 + standard. See references [4] and [5] in Chapter 3, below) + + Adler-32 is composed of two sums accumulated per byte: s1 is + the sum of all bytes, s2 is the sum of all s1 values. Both sums + are done modulo 65521. s1 is initialized to 1, s2 to zero. The + Adler-32 checksum is stored as s2*65536 + s1 in most- + significant-byte first (network) order. + + ==> The stream can start with only 2 bytes : + - CM = 0x78 : CMINFO=7 (32kB window), CM=8 (deflate) + - FLG = 0x01 : FLEVEL = 0 (fastest), FDICT=0 (no dict), FCHECK=1 so + that 0x7801 is a multiple of 31 (30721 = 991 * 31). + + ==> and it ends with only 4 bytes, the Adler-32 checksum in big-endian format. + + */ + +static const unsigned char zlib_hdr[] = { 0x78, 0x01 }; // 32k win, deflate, chk=1 + + +/* Original version from RFC1950, verified and works OK */ +uint32_t slz_adler32_by1(uint32_t crc, const unsigned char *buf, int len) +{ + uint32_t s1 = crc & 0xffff; + uint32_t s2 = (crc >> 16) & 0xffff; + int n; + + for (n = 0; n < len; n++) { + s1 = (s1 + buf[n]) % 65521; + s2 = (s2 + s1) % 65521; + } + return (s2 << 16) + s1; +} + +/* Computes the adler32 sum on <buf> for <len> bytes. It avoids the expensive + * modulus by retrofitting the number of bytes missed between 65521 and 65536 + * which is easy to count : For every sum above 65536, the modulus is offset + * by (65536-65521) = 15. 
+ * So for any value, we can count the accumulated extra values by dividing the
+ * sum by 65536 and multiplying this value by (65536-65521). That's easier
+ * with a drawing with boxes and marbles. It gives this :
+ *          x % 65521 = (x % 65536) + (x / 65536) * (65536 - 65521)
+ *                    = (x & 0xffff) + (x >> 16) * 15.
+ */
+uint32_t slz_adler32_block(uint32_t crc, const unsigned char *buf, long len)
+{
+        long s1 = crc & 0xffff;
+        long s2 = (crc >> 16);
+        long blk;
+        long n;
+
+        do {
+                blk = len;
+                /* ensure we never overflow s2 (limit is about 2^((32-8)/2)) */
+                if (blk > (1U << 12))
+                        blk = 1U << 12;
+                len -= blk;
+
+                for (n = 0; n < blk; n++) {
+                        s1 = (s1 + buf[n]);
+                        s2 = (s2 + s1);
+                }
+
+                /* Largest value here is 2^12 * 255 = 1044480 < 2^20. We can
+                 * still overflow once, but not twice because the right hand
+                 * side is 225 max, so the total is 65761. However we also
+                 * have to take care of the values between 65521 and 65536.
+                 */
+                s1 = (s1 & 0xffff) + 15 * (s1 >> 16);
+                if (s1 >= 65521)
+                        s1 -= 65521;
+
+                /* For s2, the largest value is estimated to 2^32-1 for
+                 * simplicity, so the right hand side is about 15*65535
+                 * = 983025. We can overflow twice at most.
+                 */
+                s2 = (s2 & 0xffff) + 15 * (s2 >> 16);
+                s2 = (s2 & 0xffff) + 15 * (s2 >> 16);
+                if (s2 >= 65521)
+                        s2 -= 65521;
+
+                buf += blk;
+        } while (len);
+        return (s2 << 16) + s1;
+}
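Editorial aside: the folding identity derived above can be sanity-checked against the byte-wise RFC1950 reference implementation; a minimal hypothetical self-test, assuming the slz declarations are in scope:

/* Hypothetical self-check (illustration only): the folded version must agree
 * with the straightforward per-byte version for any input.
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static void adler32_selftest(void)
{
        unsigned char buf[4096];
        uint32_t ref, fast;

        memset(buf, 'x', sizeof(buf));
        ref  = slz_adler32_by1(1, buf, sizeof(buf));   /* RFC1950 reference, crc starts at 1 */
        fast = slz_adler32_block(1, buf, sizeof(buf)); /* folded version */
        assert(ref == fast);
}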
+
+/* Sends the zlib header for stream <strm> into buffer <buf>. When it's done,
+ * the stream state is updated to SLZ_ST_EOB. It returns the number of bytes
+ * emitted which is always 2. The caller is responsible for ensuring there's
+ * always enough room in the buffer.
+ */
+int slz_rfc1950_send_header(struct slz_stream *strm, unsigned char *buf)
+{
+        memcpy(buf, zlib_hdr, sizeof(zlib_hdr));
+        strm->state = SLZ_ST_EOB;
+        return sizeof(zlib_hdr);
+}
+
+/* Encodes the block according to rfc1950. This means that the CRC of the input
+ * block is computed according to the ADLER32 algorithm. If the header was never
+ * sent, it may be sent first. The number of output bytes is returned.
+ */
+long slz_rfc1950_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more)
+{
+        long ret = 0;
+
+        if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
+                ret += slz_rfc1950_send_header(strm, out);
+
+        strm->crc32 = slz_adler32_block(strm->crc32, in, ilen);
+        ret += slz_rfc1951_encode(strm, out + ret, in, ilen, more);
+        return ret;
+}
+
+/* Initializes stream <strm> for use with the zlib format (rfc1950). The
+ * compression level passed in <level> is set. This value can only be 0 (no
+ * compression) or 1 (compression) and other values will lead to unpredictable
+ * behaviour. The function always returns 0.
+ */
+int slz_rfc1950_init(struct slz_stream *strm, int level)
+{
+        strm->state = SLZ_ST_INIT;
+        strm->level = level;
+        strm->format = SLZ_FMT_ZLIB;
+        strm->crc32 = 1; // rfc1950/zlib starts with initial crc=1
+        strm->ilen = 0;
+        strm->qbits = 0;
+        strm->queue = 0;
+        return 0;
+}
+
+/* Flushes any pending data for stream <strm> into buffer <buf>, then emits an
+ * empty literal block to byte-align the output, allowing the queue to be
+ * completely flushed. Note that if the initial header was never sent, it will
+ * be sent first as well (2 extra bytes). This requires that the output buffer
+ * still has this plus the size of the queue available (up to 4 bytes), plus
+ * one byte for (BFINAL,BTYPE), plus 4 bytes for LEN+NLEN, or a total of 11
+ * bytes in the worst case. The number of bytes emitted is returned. It is
+ * guaranteed that the queue is empty on return. This may cause some overhead
+ * by adding needless 5-byte blocks if called too often.
+ */
+int slz_rfc1950_flush(struct slz_stream *strm, unsigned char *buf)
+{
+        int sent = 0;
+
+        if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
+                sent = slz_rfc1950_send_header(strm, buf);
+
+        sent += slz_rfc1951_flush(strm, buf + sent);
+        return sent;
+}
+
+/* Flushes pending bits and sends the zlib trailer for stream <strm> into
+ * buffer <buf>. When it's done, the stream state is updated to SLZ_ST_END. It
+ * returns the number of bytes emitted. The trailer consists in flushing the
+ * possibly pending bits from the queue (up to 24 bits), rounding to the next
+ * byte, then 4 bytes for the Adler-32 checksum. That may amount to 4+4 = 8
+ * bytes, that the caller must ensure are available before calling the
+ * function. Note that if the initial header was never sent, it will be sent
+ * first as well (2 extra bytes).
+ */
+int slz_rfc1950_finish(struct slz_stream *strm, unsigned char *buf)
+{
+        strm->outbuf = buf;
+
+        if (__builtin_expect(strm->state == SLZ_ST_INIT, 0))
+                strm->outbuf += slz_rfc1950_send_header(strm, strm->outbuf);
+
+        slz_rfc1951_finish(strm, strm->outbuf);
+        copy_8b(strm, (strm->crc32 >> 24) & 0xff);
+        copy_8b(strm, (strm->crc32 >> 16) & 0xff);
+        copy_8b(strm, (strm->crc32 >>  8) & 0xff);
+        copy_8b(strm, (strm->crc32 >>  0) & 0xff);
+        strm->state = SLZ_ST_END;
+        return strm->outbuf - buf;
+}
+
+__attribute__((constructor))
+static void __slz_initialize(void)
+{
+#if !defined(__ARM_FEATURE_CRC32)
+        __slz_make_crc_table();
+#endif
+        __slz_prepare_dist_table();
+}
diff --git a/src/sock.c b/src/sock.c
new file mode 100644
index 0000000..7fcdc10
--- /dev/null
+++ b/src/sock.c
@@ -0,0 +1,1072 @@
+/*
+ * Generic code for native (BSD-compatible) sockets
+ *
+ * Copyright 2000-2020 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <ctype.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include <net/if.h>
+
+#include <haproxy/api.h>
+#include <haproxy/activity.h>
+#include <haproxy/connection.h>
+#include <haproxy/listener.h>
+#include <haproxy/log.h>
+#include <haproxy/namespace.h>
+#include <haproxy/proto_sockpair.h>
+#include <haproxy/sock.h>
+#include <haproxy/sock_inet.h>
+#include <haproxy/tools.h>
+
+#define SOCK_XFER_OPT_FOREIGN 0x000000001
+#define SOCK_XFER_OPT_V6ONLY  0x000000002
+#define SOCK_XFER_OPT_DGRAM   0x000000004
+
+/* the list of remaining sockets transferred from an older process */
+struct xfer_sock_list {
+        int fd;
+        int options; /* socket options as SOCK_XFER_OPT_* */
+        char *iface;
+        char *namespace;
+        int if_namelen;
+        int ns_namelen;
+        struct xfer_sock_list *prev;
+        struct xfer_sock_list *next;
+        struct sockaddr_storage addr;
+};
+
+static struct xfer_sock_list *xfer_sock_list;
+
+
+/* Accept an incoming connection from listener <l>, and return it, as well as
+ * a CO_AC_* status code into <status> if not null. NULL is returned on error.
+ * <l> must be a valid listener with a valid frontend.
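Editorial aside, before the function body: a minimal sketch of how a caller might drive this function (hypothetical; the real driver is listener_accept(), which is considerably more elaborate):

/* Hypothetical caller sketch, assuming HAProxy's listener/connection types
 * are in scope: pull connections until the status says to stop.
 */
int accept_all(struct listener *l)
{
        struct connection *conn;
        int accepted = 0;
        int status;

        for (;;) {
                conn = sock_accept_conn(l, &status);
                if (conn) {
                        accepted++; /* hand <conn> to the upper layers here */
                        continue;
                }
                if (status == CO_AC_RETRY) /* transient failure (EINTR, ECONNABORTED) */
                        continue;
                break; /* CO_AC_DONE, CO_AC_PAUSE or CO_AC_YIELD: stop for now */
        }
        return accepted;
}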
+ */ +struct connection *sock_accept_conn(struct listener *l, int *status) +{ +#ifdef USE_ACCEPT4 + static int accept4_broken; +#endif + struct proxy *p = l->bind_conf->frontend; + struct connection *conn = NULL; + struct sockaddr_storage *addr = NULL; + socklen_t laddr; + int ret; + int cfd; + + if (!sockaddr_alloc(&addr, NULL, 0)) + goto fail_addr; + + /* accept() will mark all accepted FDs O_NONBLOCK and the ones accepted + * in the master process as FD_CLOEXEC. It's not done for workers + * because 1) workers are not supposed to execute anything so there's + * no reason for uselessly slowing down everything, and 2) that would + * prevent us from implementing fd passing in the future. + */ +#ifdef USE_ACCEPT4 + laddr = sizeof(*conn->src); + + /* only call accept4() if it's known to be safe, otherwise fallback to + * the legacy accept() + fcntl(). + */ + if (unlikely(accept4_broken) || + (((cfd = accept4(l->rx.fd, (struct sockaddr*)addr, &laddr, + SOCK_NONBLOCK | (master ? SOCK_CLOEXEC : 0))) == -1) && + (errno == ENOSYS || errno == EINVAL || errno == EBADF) && + ((accept4_broken = 1)))) +#endif + { + laddr = sizeof(*conn->src); + if ((cfd = accept(l->rx.fd, (struct sockaddr*)addr, &laddr)) != -1) { + fd_set_nonblock(cfd); + if (master) + fd_set_cloexec(cfd); + } + } + + if (likely(cfd != -1)) { + if (unlikely(cfd >= global.maxsock)) { + send_log(p, LOG_EMERG, + "Proxy %s reached the configured maximum connection limit. Please check the global 'maxconn' value.\n", + p->id); + goto fail_conn; + } + + /* Perfect, the connection was accepted */ + conn = conn_new(&l->obj_type); + if (!conn) + goto fail_conn; + + conn->src = addr; + conn->handle.fd = cfd; + ret = CO_AC_DONE; + goto done; + } + + /* error conditions below */ + sockaddr_free(&addr); + + switch (errno) { +#if defined(EWOULDBLOCK) && defined(EAGAIN) && EWOULDBLOCK != EAGAIN + case EWOULDBLOCK: +#endif + case EAGAIN: + ret = CO_AC_DONE; /* nothing more to accept */ + if (fdtab[l->rx.fd].state & (FD_POLL_HUP|FD_POLL_ERR)) { + /* the listening socket might have been disabled in a shared + * process and we're a collateral victim. We'll just pause for + * a while in case it comes back. In the mean time, we need to + * clear this sticky flag. + */ + _HA_ATOMIC_AND(&fdtab[l->rx.fd].state, ~(FD_POLL_HUP|FD_POLL_ERR)); + ret = CO_AC_PAUSE; + } + fd_cant_recv(l->rx.fd); + break; + + case EINVAL: + /* might be trying to accept on a shut fd (eg: soft stop) */ + ret = CO_AC_PAUSE; + break; + + case EINTR: + case ECONNABORTED: + ret = CO_AC_RETRY; + break; + + case ENFILE: + if (p) + send_log(p, LOG_EMERG, + "Proxy %s reached system FD limit (maxsock=%d). Please check system tunables.\n", + p->id, global.maxsock); + ret = CO_AC_PAUSE; + break; + + case EMFILE: + if (p) + send_log(p, LOG_EMERG, + "Proxy %s reached process FD limit (maxsock=%d). Please check 'ulimit-n' and restart.\n", + p->id, global.maxsock); + ret = CO_AC_PAUSE; + break; + + case ENOBUFS: + case ENOMEM: + if (p) + send_log(p, LOG_EMERG, + "Proxy %s reached system memory limit (maxsock=%d). Please check system tunables.\n", + p->id, global.maxsock); + ret = CO_AC_PAUSE; + break; + + default: + /* unexpected result, let's give up and let other tasks run */ + ret = CO_AC_YIELD; + } + done: + if (status) + *status = ret; + return conn; + + fail_conn: + sockaddr_free(&addr); + /* The accept call already succeeded by the time we try to allocate the connection, + * we need to close it in case of failure. 
*/ + close(cfd); + fail_addr: + ret = CO_AC_PAUSE; + goto done; +} + +/* Create a socket to connect to the server in conn->dst (which MUST be valid), + * using the configured namespace if needed, or the one passed by the proxy + * protocol if required to do so. It ultimately calls socket() or socketat() + * and returns the FD or error code. + */ +int sock_create_server_socket(struct connection *conn) +{ + const struct netns_entry *ns = NULL; + +#ifdef USE_NS + if (objt_server(conn->target)) { + if (__objt_server(conn->target)->flags & SRV_F_USE_NS_FROM_PP) + ns = conn->proxy_netns; + else + ns = __objt_server(conn->target)->netns; + } +#endif + return my_socketat(ns, conn->dst->ss_family, SOCK_STREAM, 0); +} + +/* Enables receiving on receiver <rx> once already bound. */ +void sock_enable(struct receiver *rx) +{ + if (rx->flags & RX_F_BOUND) + fd_want_recv_safe(rx->fd); +} + +/* Disables receiving on receiver <rx> once already bound. */ +void sock_disable(struct receiver *rx) +{ + if (rx->flags & RX_F_BOUND) + fd_stop_recv(rx->fd); +} + +/* stops, unbinds and possibly closes the FD associated with receiver rx */ +void sock_unbind(struct receiver *rx) +{ + /* There are a number of situations where we prefer to keep the FD and + * not to close it (unless we're stopping, of course): + * - worker process unbinding from a worker's non-suspendable FD (ABNS) => close + * - worker process unbinding from a worker's FD with socket transfer enabled => keep + * - master process unbinding from a master's inherited FD => keep + * - master process unbinding from a master's FD => close + * - master process unbinding from a worker's inherited FD => keep + * - master process unbinding from a worker's FD => close + * - worker process unbinding from a master's FD => close + * - worker process unbinding from a worker's FD => close + */ + if (rx->flags & RX_F_BOUND) + rx->proto->rx_disable(rx); + + if (!stopping && !master && + !(rx->flags & RX_F_MWORKER) && + !(rx->flags & RX_F_NON_SUSPENDABLE) && + (global.tune.options & GTUNE_SOCKET_TRANSFER)) + return; + + if (!stopping && master && + rx->flags & RX_F_INHERITED) + return; + + rx->flags &= ~RX_F_BOUND; + if (rx->fd != -1) + fd_delete(rx->fd); + rx->fd = -1; +} + +/* + * Retrieves the source address for the socket <fd>, with <dir> indicating + * if we're a listener (=0) or an initiator (!=0). It returns 0 in case of + * success, -1 in case of error. The socket's source address is stored in + * <sa> for <salen> bytes. + */ +int sock_get_src(int fd, struct sockaddr *sa, socklen_t salen, int dir) +{ + if (dir) + return getsockname(fd, sa, &salen); + else + return getpeername(fd, sa, &salen); +} + +/* + * Retrieves the original destination address for the socket <fd>, with <dir> + * indicating if we're a listener (=0) or an initiator (!=0). It returns 0 in + * case of success, -1 in case of error. The socket's source address is stored + * in <sa> for <salen> bytes. + */ +int sock_get_dst(int fd, struct sockaddr *sa, socklen_t salen, int dir) +{ + if (dir) + return getpeername(fd, sa, &salen); + else + return getsockname(fd, sa, &salen); +} + +/* Try to retrieve exported sockets from worker at CLI <unixsocket>. These + * ones will be placed into the xfer_sock_list for later use by function + * sock_find_compatible_fd(). Returns 0 on success, -1 on failure. 
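Editorial aside: the function below consumes file descriptors transferred as SCM_RIGHTS ancillary data over the CLI socket. For reference, a minimal hypothetical sender of a single fd could look like this (this is not HAProxy's actual "_getsocks" handler, which batches descriptors and appends namespace/interface metadata):

/* Hypothetical example: send one fd <fd> over the connected unix socket
 * <sock> as SCM_RIGHTS ancillary data, with a single dummy payload byte.
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

int send_one_fd(int sock, int fd)
{
        char cmsgbuf[CMSG_SPACE(sizeof(int))];
        char byte = 0;
        struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
        struct msghdr msg = { 0 };
        struct cmsghdr *cmsg;

        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
        msg.msg_control = cmsgbuf;
        msg.msg_controllen = sizeof(cmsgbuf);

        cmsg = CMSG_FIRSTHDR(&msg);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

        return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
}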
+ */
+int sock_get_old_sockets(const char *unixsocket)
+{
+        char *cmsgbuf = NULL, *tmpbuf = NULL;
+        int *tmpfd = NULL;
+        struct sockaddr_un addr;
+        struct cmsghdr *cmsg;
+        struct msghdr msghdr;
+        struct iovec iov;
+        struct xfer_sock_list *xfer_sock = NULL;
+        struct timeval tv = { .tv_sec = 1, .tv_usec = 0 };
+        int sock = -1;
+        int ret = -1;
+        int ret2 = -1;
+        int fd_nb;
+        int got_fd = 0;
+        int cur_fd = 0;
+        size_t maxoff = 0, curoff = 0;
+
+        if (strncmp("sockpair@", unixsocket, strlen("sockpair@")) == 0) {
+                /* sockpair for master-worker usage */
+                int sv[2];
+                int dst_fd;
+
+                dst_fd = strtoll(unixsocket + strlen("sockpair@"), NULL, 0);
+
+                if (socketpair(PF_UNIX, SOCK_STREAM, 0, sv) == -1) {
+                        ha_warning("socketpair(): Cannot create socketpair. Giving up.\n");
+                        goto out;
+                }
+
+                if (send_fd_uxst(dst_fd, sv[0]) == -1) {
+                        ha_alert("socketpair: Cannot transfer the fd %d over sockpair@%d. Giving up.\n", sv[0], dst_fd);
+                        close(sv[0]);
+                        close(sv[1]);
+                        goto out;
+                }
+
+                close(sv[0]); /* we don't need this side anymore */
+                sock = sv[1];
+
+        } else {
+                /* Unix socket */
+
+                sock = socket(PF_UNIX, SOCK_STREAM, 0);
+                if (sock < 0) {
+                        ha_warning("Failed to connect to the old process socket '%s'\n", unixsocket);
+                        goto out;
+                }
+
+                strncpy(addr.sun_path, unixsocket, sizeof(addr.sun_path) - 1);
+                addr.sun_path[sizeof(addr.sun_path) - 1] = 0;
+                addr.sun_family = PF_UNIX;
+
+                ret = connect(sock, (struct sockaddr *)&addr, sizeof(addr));
+                if (ret < 0) {
+                        ha_warning("Failed to connect to the old process socket '%s'\n", unixsocket);
+                        goto out;
+                }
+
+        }
+        memset(&msghdr, 0, sizeof(msghdr));
+        cmsgbuf = malloc(CMSG_SPACE(sizeof(int)) * MAX_SEND_FD);
+        if (!cmsgbuf) {
+                ha_warning("Failed to allocate memory to send sockets\n");
+                goto out;
+        }
+
+        setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (void *)&tv, sizeof(tv));
+        iov.iov_base = &fd_nb;
+        iov.iov_len = sizeof(fd_nb);
+        msghdr.msg_iov = &iov;
+        msghdr.msg_iovlen = 1;
+
+        if (send(sock, "_getsocks\n", strlen("_getsocks\n"), 0) != strlen("_getsocks\n")) {
+                ha_warning("Failed to get the number of sockets to be transferred !\n");
+                goto out;
+        }
+
+        /* First, get the number of file descriptors to be received */
+        if (recvmsg(sock, &msghdr, MSG_WAITALL) != sizeof(fd_nb)) {
+                ha_warning("Failed to get the number of sockets to be transferred !\n");
+                goto out;
+        }
+
+        if (fd_nb == 0) {
+                ret2 = 0;
+                goto out;
+        }
+
+        tmpbuf = malloc(fd_nb * (1 + MAXPATHLEN + 1 + IFNAMSIZ + sizeof(int)));
+        if (tmpbuf == NULL) {
+                ha_warning("Failed to allocate memory while receiving sockets\n");
+                goto out;
+        }
+
+        tmpfd = malloc(fd_nb * sizeof(int));
+        if (tmpfd == NULL) {
+                ha_warning("Failed to allocate memory while receiving sockets\n");
+                goto out;
+        }
+
+        msghdr.msg_control = cmsgbuf;
+        msghdr.msg_controllen = CMSG_SPACE(sizeof(int)) * MAX_SEND_FD;
+        iov.iov_len = MAX_SEND_FD * (1 + MAXPATHLEN + 1 + IFNAMSIZ + sizeof(int));
+
+        do {
+                int ret3;
+
+                iov.iov_base = tmpbuf + curoff;
+
+                ret = recvmsg(sock, &msghdr, 0);
+
+                if (ret == -1 && errno == EINTR)
+                        continue;
+
+                if (ret <= 0)
+                        break;
+
+                /* Send an ack to let the sender know we got the sockets
+                 * and it can send some more
+                 */
+                do {
+                        ret3 = send(sock, &got_fd, sizeof(got_fd), 0);
+                } while (ret3 == -1 && errno == EINTR);
+
+                for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL; cmsg = CMSG_NXTHDR(&msghdr, cmsg)) {
+                        if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
+                                size_t totlen = cmsg->cmsg_len - CMSG_LEN(0);
+
+                                if (totlen / sizeof(int) + got_fd > fd_nb) {
+                                        ha_warning("Got too many sockets !\n");
+ goto out; + } + + /* + * Be paranoid and use memcpy() to avoid any + * potential alignment issue. + */ + memcpy(&tmpfd[got_fd], CMSG_DATA(cmsg), totlen); + got_fd += totlen / sizeof(int); + } + } + curoff += ret; + } while (got_fd < fd_nb); + + if (got_fd != fd_nb) { + ha_warning("We didn't get the expected number of sockets (expecting %d got %d)\n", + fd_nb, got_fd); + goto out; + } + + maxoff = curoff; + curoff = 0; + + for (cur_fd = 0; cur_fd < got_fd; cur_fd++) { + int fd = tmpfd[cur_fd]; + socklen_t socklen; + int val; + int len; + + xfer_sock = calloc(1, sizeof(*xfer_sock)); + if (!xfer_sock) { + ha_warning("Failed to allocate memory in get_old_sockets() !\n"); + break; + } + xfer_sock->fd = -1; + + socklen = sizeof(xfer_sock->addr); + if (getsockname(fd, (struct sockaddr *)&xfer_sock->addr, &socklen) != 0) { + ha_warning("Failed to get socket address\n"); + ha_free(&xfer_sock); + continue; + } + + if (curoff >= maxoff) { + ha_warning("Inconsistency while transferring sockets\n"); + goto out; + } + + len = tmpbuf[curoff++]; + if (len > 0) { + /* We have a namespace */ + if (curoff + len > maxoff) { + ha_warning("Inconsistency while transferring sockets\n"); + goto out; + } + xfer_sock->namespace = malloc(len + 1); + if (!xfer_sock->namespace) { + ha_warning("Failed to allocate memory while transferring sockets\n"); + goto out; + } + memcpy(xfer_sock->namespace, &tmpbuf[curoff], len); + xfer_sock->namespace[len] = 0; + xfer_sock->ns_namelen = len; + curoff += len; + } + + if (curoff >= maxoff) { + ha_warning("Inconsistency while transferring sockets\n"); + goto out; + } + + len = tmpbuf[curoff++]; + if (len > 0) { + /* We have an interface */ + if (curoff + len > maxoff) { + ha_warning("Inconsistency while transferring sockets\n"); + goto out; + } + xfer_sock->iface = malloc(len + 1); + if (!xfer_sock->iface) { + ha_warning("Failed to allocate memory while transferring sockets\n"); + goto out; + } + memcpy(xfer_sock->iface, &tmpbuf[curoff], len); + xfer_sock->iface[len] = 0; + xfer_sock->if_namelen = len; + curoff += len; + } + + if (curoff + sizeof(int) > maxoff) { + ha_warning("Inconsistency while transferring sockets\n"); + goto out; + } + + /* we used to have 32 bits of listener options here but we don't + * use them anymore. + */ + curoff += sizeof(int); + + /* determine the foreign status directly from the socket itself */ + if (sock_inet_is_foreign(fd, xfer_sock->addr.ss_family)) + xfer_sock->options |= SOCK_XFER_OPT_FOREIGN; + + socklen = sizeof(val); + if (getsockopt(fd, SOL_SOCKET, SO_TYPE, &val, &socklen) == 0 && val == SOCK_DGRAM) + xfer_sock->options |= SOCK_XFER_OPT_DGRAM; + +#if defined(IPV6_V6ONLY) + /* keep only the v6only flag depending on what's currently + * active on the socket, and always drop the v4v6 one. 
+ */ + socklen = sizeof(val); + if (xfer_sock->addr.ss_family == AF_INET6 && + getsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, &socklen) == 0 && val > 0) + xfer_sock->options |= SOCK_XFER_OPT_V6ONLY; +#endif + + xfer_sock->fd = fd; + if (xfer_sock_list) + xfer_sock_list->prev = xfer_sock; + xfer_sock->next = xfer_sock_list; + xfer_sock->prev = NULL; + xfer_sock_list = xfer_sock; + xfer_sock = NULL; + } + + ret2 = 0; +out: + /* If we failed midway make sure to close the remaining + * file descriptors + */ + if (tmpfd != NULL && cur_fd < got_fd) { + for (; cur_fd < got_fd; cur_fd++) { + close(tmpfd[cur_fd]); + } + } + + free(tmpbuf); + free(tmpfd); + free(cmsgbuf); + + if (sock != -1) + close(sock); + + if (xfer_sock) { + free(xfer_sock->namespace); + free(xfer_sock->iface); + if (xfer_sock->fd != -1) + close(xfer_sock->fd); + free(xfer_sock); + } + return (ret2); +} + +/* When binding the receivers, check if a socket has been sent to us by the + * previous process that we could reuse, instead of creating a new one. Note + * that some address family-specific options are checked on the listener and + * on the socket. Typically for AF_INET and AF_INET6, we check for transparent + * mode, and for AF_INET6 we also check for "v4v6" or "v6only". The reused + * socket is automatically removed from the list so that it's not proposed + * anymore. + */ +int sock_find_compatible_fd(const struct receiver *rx) +{ + struct xfer_sock_list *xfer_sock = xfer_sock_list; + int options = 0; + int if_namelen = 0; + int ns_namelen = 0; + int ret = -1; + + if (!rx->proto->fam->addrcmp) + return -1; + + if (rx->proto->proto_type == PROTO_TYPE_DGRAM) + options |= SOCK_XFER_OPT_DGRAM; + + if (rx->settings->options & RX_O_FOREIGN) + options |= SOCK_XFER_OPT_FOREIGN; + + if (rx->addr.ss_family == AF_INET6) { + /* Prepare to match the v6only option against what we really want. Note + * that sadly the two options are not exclusive to each other and that + * v6only is stronger than v4v6. + */ + if ((rx->settings->options & RX_O_V6ONLY) || + (sock_inet6_v6only_default && !(rx->settings->options & RX_O_V4V6))) + options |= SOCK_XFER_OPT_V6ONLY; + } + + if (rx->settings->interface) + if_namelen = strlen(rx->settings->interface); +#ifdef USE_NS + if (rx->settings->netns) + ns_namelen = rx->settings->netns->name_len; +#endif + + while (xfer_sock) { + if ((options == xfer_sock->options) && + (if_namelen == xfer_sock->if_namelen) && + (ns_namelen == xfer_sock->ns_namelen) && + (!if_namelen || strcmp(rx->settings->interface, xfer_sock->iface) == 0) && +#ifdef USE_NS + (!ns_namelen || strcmp(rx->settings->netns->node.key, xfer_sock->namespace) == 0) && +#endif + rx->proto->fam->addrcmp(&xfer_sock->addr, &rx->addr) == 0) + break; + xfer_sock = xfer_sock->next; + } + + if (xfer_sock != NULL) { + ret = xfer_sock->fd; + if (xfer_sock == xfer_sock_list) + xfer_sock_list = xfer_sock->next; + if (xfer_sock->prev) + xfer_sock->prev->next = xfer_sock->next; + if (xfer_sock->next) + xfer_sock->next->prev = xfer_sock->prev; + free(xfer_sock->iface); + free(xfer_sock->namespace); + free(xfer_sock); + } + return ret; +} + +/* After all protocols are bound, there may remain some old sockets that have + * been removed between the previous config and the new one. These ones must + * be dropped, otherwise they will remain open and may prevent a service from + * restarting. 
+ */ +void sock_drop_unused_old_sockets() +{ + while (xfer_sock_list != NULL) { + struct xfer_sock_list *tmpxfer = xfer_sock_list->next; + + close(xfer_sock_list->fd); + free(xfer_sock_list->iface); + free(xfer_sock_list->namespace); + free(xfer_sock_list); + xfer_sock_list = tmpxfer; + } +} + +/* Tests if the receiver supports accepting connections. Returns positive on + * success, 0 if not possible, negative if the socket is non-recoverable. The + * rationale behind this is that inherited FDs may be broken and that shared + * FDs might have been paused by another process. + */ +int sock_accepting_conn(const struct receiver *rx) +{ + int opt_val = 0; + socklen_t opt_len = sizeof(opt_val); + + if (getsockopt(rx->fd, SOL_SOCKET, SO_ACCEPTCONN, &opt_val, &opt_len) == -1) + return -1; + + return opt_val; +} + +/* This is the FD handler IO callback for stream sockets configured for + * accepting incoming connections. It's a pass-through to listener_accept() + * which will iterate over the listener protocol's accept_conn() function. + * The FD's owner must be a listener. + */ +void sock_accept_iocb(int fd) +{ + struct listener *l = fdtab[fd].owner; + + if (!l) + return; + + BUG_ON(!!master != !!(l->rx.flags & RX_F_MWORKER)); + listener_accept(l); +} + +/* This completes the initialization of connection <conn> by inserting its FD + * into the fdtab, associating it with the regular connection handler. It will + * be bound to the current thread only. This call cannot fail. + */ +void sock_conn_ctrl_init(struct connection *conn) +{ + BUG_ON(conn->flags & CO_FL_FDLESS); + fd_insert(conn->handle.fd, conn, sock_conn_iocb, tgid, ti->ltid_bit); +} + +/* This completes the release of connection <conn> by removing its FD from the + * fdtab and deleting it. The connection must not use the FD anymore past this + * point. The FD may be modified in the connection. + */ +void sock_conn_ctrl_close(struct connection *conn) +{ + BUG_ON(conn->flags & CO_FL_FDLESS); + fd_delete(conn->handle.fd); + conn->handle.fd = DEAD_FD_MAGIC; +} + +/* This is the callback which is set when a connection establishment is pending + * and we have nothing to send. It may update the FD polling status to indicate + * !READY. It returns 0 if it fails in a fatal way or needs to poll to go + * further, otherwise it returns non-zero and removes the CO_FL_WAIT_L4_CONN + * flag from the connection's flags. In case of error, it sets CO_FL_ERROR and + * leaves the error code in errno. + */ +int sock_conn_check(struct connection *conn) +{ + struct sockaddr_storage *addr; + int fd = conn->handle.fd; + + if (conn->flags & CO_FL_ERROR) + return 0; + + if (!conn_ctrl_ready(conn)) + return 0; + + if (!(conn->flags & CO_FL_WAIT_L4_CONN)) + return 1; /* strange we were called while ready */ + + BUG_ON(conn->flags & CO_FL_FDLESS); + + if (!fd_send_ready(fd) && !(fdtab[fd].state & (FD_POLL_ERR|FD_POLL_HUP))) + return 0; + + /* Here we have 2 cases : + * - modern pollers, able to report ERR/HUP. If these ones return any + * of these flags then it's likely a failure, otherwise it possibly + * is a success (i.e. there may have been data received just before + * the error was reported). + * - select, which doesn't report these and with which it's always + * necessary either to try connect() again or to check for SO_ERROR. + * In order to simplify everything, we double-check using connect() as + * soon as we meet either of these delicate situations. Note that + * SO_ERROR would clear the error after reporting it! 
+ */ + if (cur_poller.flags & HAP_POLL_F_ERRHUP) { + /* modern poller, able to report ERR/HUP */ + if ((fdtab[fd].state & (FD_POLL_IN|FD_POLL_ERR|FD_POLL_HUP)) == FD_POLL_IN) + goto done; + if ((fdtab[fd].state & (FD_POLL_OUT|FD_POLL_ERR|FD_POLL_HUP)) == FD_POLL_OUT) + goto done; + if (!(fdtab[fd].state & (FD_POLL_ERR|FD_POLL_HUP))) + goto wait; + /* error present, fall through common error check path */ + } + + /* Use connect() to check the state of the socket. This has the double + * advantage of *not* clearing the error (so that health checks can + * still use getsockopt(SO_ERROR)) and giving us the following info : + * - error + * - connecting (EALREADY, EINPROGRESS) + * - connected (EISCONN, 0) + */ + addr = conn->dst; + if ((conn->flags & CO_FL_SOCKS4) && obj_type(conn->target) == OBJ_TYPE_SERVER) + addr = &objt_server(conn->target)->socks4_addr; + + if (connect(fd, (const struct sockaddr *)addr, get_addr_len(addr)) == -1) { + if (errno == EALREADY || errno == EINPROGRESS) + goto wait; + + if (errno && errno != EISCONN) + goto out_error; + } + + done: + /* The FD is ready now, we'll mark the connection as complete and + * forward the event to the transport layer which will notify the + * data layer. + */ + conn->flags &= ~CO_FL_WAIT_L4_CONN; + fd_may_send(fd); + fd_cond_recv(fd); + errno = 0; // make health checks happy + return 1; + + out_error: + /* Write error on the file descriptor. Report it to the connection + * and disable polling on this FD. + */ + conn->flags |= CO_FL_ERROR | CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH; + HA_ATOMIC_AND(&fdtab[fd].state, ~FD_LINGER_RISK); + fd_stop_both(fd); + return 0; + + wait: + fd_cant_send(fd); + fd_want_send(fd); + return 0; +} + +/* I/O callback for fd-based connections. It calls the read/write handlers + * provided by the connection's sock_ops, which must be valid. + */ +void sock_conn_iocb(int fd) +{ + struct connection *conn = fdtab[fd].owner; + unsigned int flags; + int need_wake = 0; + struct tasklet *t; + + if (unlikely(!conn)) { + activity[tid].conn_dead++; + return; + } + + flags = conn->flags & ~CO_FL_ERROR; /* ensure to call the wake handler upon error */ + + if (unlikely(conn->flags & CO_FL_WAIT_L4_CONN) && + ((fd_send_ready(fd) && fd_send_active(fd)) || + (fd_recv_ready(fd) && fd_recv_active(fd)))) { + /* Still waiting for a connection to establish and nothing was + * attempted yet to probe the connection. this will clear the + * CO_FL_WAIT_L4_CONN flag on success. + */ + if (!sock_conn_check(conn)) + goto leave; + need_wake = 1; + } + + if (fd_send_ready(fd) && fd_send_active(fd)) { + /* force reporting of activity by clearing the previous flags : + * we'll have at least ERROR or CONNECTED at the end of an I/O, + * both of which will be detected below. + */ + flags = 0; + if (conn->subs && conn->subs->events & SUB_RETRY_SEND) { + t = conn->subs->tasklet; + need_wake = 0; // wake will be called after this I/O + conn->subs->events &= ~SUB_RETRY_SEND; + if (!conn->subs->events) + conn->subs = NULL; + tasklet_wakeup(t); + } + fd_stop_send(fd); + } + + /* The data transfer starts here and stops on error and handshakes. Note + * that we must absolutely test conn->xprt at each step in case it suddenly + * changes due to a quick unexpected close(). + */ + if (fd_recv_ready(fd) && fd_recv_active(fd)) { + /* force reporting of activity by clearing the previous flags : + * we'll have at least ERROR or CONNECTED at the end of an I/O, + * both of which will be detected below. 
+                 */
+                flags = 0;
+                if (conn->subs && conn->subs->events & SUB_RETRY_RECV) {
+                        t = conn->subs->tasklet;
+                        need_wake = 0; // wake will be called after this I/O
+                        conn->subs->events &= ~SUB_RETRY_RECV;
+                        if (!conn->subs->events)
+                                conn->subs = NULL;
+                        tasklet_wakeup(t);
+                }
+                fd_stop_recv(fd);
+        }
+
+ leave:
+        /* we may have to finish to install a mux or to wake it up based on
+         * what was just done above. It may kill the connection so we have to
+         * be prepared not to use it anymore.
+         */
+        if (conn_notify_mux(conn, flags, need_wake) < 0)
+                return;
+
+        /* commit polling changes in case of error.
+         * WT: it seems that the last case where this could still be relevant
+         * is if a mux wake function above reports a connection error but does
+         * not stop polling. Shouldn't we enforce this into the mux instead of
+         * having to deal with this ?
+         */
+        if (unlikely(conn->flags & CO_FL_ERROR)) {
+                if (conn_ctrl_ready(conn))
+                        fd_stop_both(fd);
+
+                if (conn->subs) {
+                        t = conn->subs->tasklet;
+                        conn->subs->events = 0;
+                        if (!conn->subs->events)
+                                conn->subs = NULL;
+                        tasklet_wakeup(t);
+                }
+        }
+}
+
+/* Drains possibly pending incoming data on the file descriptor attached to the
+ * connection. This is used to know whether we need to disable lingering on
+ * close. Returns non-zero if it is safe to close without disabling lingering,
+ * otherwise zero.
+ */
+int sock_drain(struct connection *conn)
+{
+        int turns = 2;
+        int fd = conn->handle.fd;
+        int len;
+
+        BUG_ON(conn->flags & CO_FL_FDLESS);
+
+        if (fdtab[fd].state & (FD_POLL_ERR|FD_POLL_HUP))
+                goto shut;
+
+        if (!(conn->flags & CO_FL_WANT_DRAIN) && !fd_recv_ready(fd))
+                return 0;
+
+        /* no drain function defined, use the generic one */
+
+        while (turns) {
+#ifdef MSG_TRUNC_CLEARS_INPUT
+                len = recv(fd, NULL, INT_MAX, MSG_DONTWAIT | MSG_NOSIGNAL | MSG_TRUNC);
+                if (len == -1 && errno == EFAULT)
+#endif
+                        len = recv(fd, trash.area, trash.size, MSG_DONTWAIT | MSG_NOSIGNAL);
+
+                if (len == 0)
+                        goto shut;
+
+                if (len < 0) {
+                        if (errno == EAGAIN || errno == EWOULDBLOCK) {
+                                /* connection not closed yet */
+                                fd_cant_recv(fd);
+                                break;
+                        }
+                        if (errno == EINTR)  /* oops, try again */
+                                continue;
+                        /* other errors indicate a dead connection, fine. */
+                        goto shut;
+                }
+                /* OK we read some data, let's try again once */
+                turns--;
+        }
+
+        /* some data are still present, give up */
+        return 0;
+
+ shut:
+        /* we're certain the connection was shut down */
+        HA_ATOMIC_AND(&fdtab[fd].state, ~FD_LINGER_RISK);
+        return 1;
+}
+
+/* Checks the connection's FD for readiness of events <event_type>, which may
+ * only be a combination of SUB_RETRY_RECV and SUB_RETRY_SEND. Those which are
+ * ready are returned. The ones that are not ready are enabled. The caller is
+ * expected to do what is needed to handle ready events and to deal with
+ * subsequent wakeups caused by the requested events' readiness.
+ */
+int sock_check_events(struct connection *conn, int event_type)
+{
+        int ret = 0;
+
+        BUG_ON(conn->flags & CO_FL_FDLESS);
+
+        if (event_type & SUB_RETRY_RECV) {
+                if (fd_recv_ready(conn->handle.fd))
+                        ret |= SUB_RETRY_RECV;
+                else
+                        fd_want_recv(conn->handle.fd);
+        }
+
+        if (event_type & SUB_RETRY_SEND) {
+                if (fd_send_ready(conn->handle.fd))
+                        ret |= SUB_RETRY_SEND;
+                else
+                        fd_want_send(conn->handle.fd);
+        }
+
+        return ret;
+}
+
+/* Ignore readiness events from connection's FD for events of types <event_type>
+ * which may only be a combination of SUB_RETRY_RECV and SUB_RETRY_SEND.
+ */ +void sock_ignore_events(struct connection *conn, int event_type) +{ + BUG_ON(conn->flags & CO_FL_FDLESS); + + if (event_type & SUB_RETRY_RECV) + fd_stop_recv(conn->handle.fd); + + if (event_type & SUB_RETRY_SEND) + fd_stop_send(conn->handle.fd); +} + +/* Live check to see if a socket type supports SO_REUSEPORT for the specified + * family and socket() settings. Returns non-zero on success, 0 on failure. Use + * protocol_supports_flag() instead, which checks cached flags. + */ +int _sock_supports_reuseport(const struct proto_fam *fam, int type, int protocol) +{ + int ret = 0; +#ifdef SO_REUSEPORT + struct sockaddr_storage ss; + socklen_t sl = sizeof(ss); + int fd1, fd2; + + /* for the check, we'll need two sockets */ + fd1 = fd2 = -1; + + /* ignore custom sockets */ + if (!fam || fam->sock_domain >= AF_MAX) + goto leave; + + fd1 = socket(fam->sock_domain, type, protocol); + if (fd1 < 0) + goto leave; + + if (setsockopt(fd1, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) < 0) + goto leave; + + /* bind to any address assigned by the kernel, we'll then try to do it twice */ + memset(&ss, 0, sizeof(ss)); + ss.ss_family = fam->sock_family; + if (bind(fd1, (struct sockaddr *)&ss, fam->sock_addrlen) < 0) + goto leave; + + if (getsockname(fd1, (struct sockaddr *)&ss, &sl) < 0) + goto leave; + + fd2 = socket(fam->sock_domain, type, protocol); + if (fd2 < 0) + goto leave; + + if (setsockopt(fd2, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) < 0) + goto leave; + + if (bind(fd2, (struct sockaddr *)&ss, sl) < 0) + goto leave; + + /* OK we could bind twice to the same address:port, REUSEPORT + * is supported for this protocol. + */ + ret = 1; + + leave: + if (fd2 >= 0) + close(fd2); + if (fd1 >= 0) + close(fd1); +#endif + return ret; +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/sock_inet.c b/src/sock_inet.c new file mode 100644 index 0000000..028ffaa --- /dev/null +++ b/src/sock_inet.c @@ -0,0 +1,521 @@ +/* + * AF_INET/AF_INET6 socket management + * + * Copyright 2000-2020 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <errno.h> +#include <string.h> +#include <unistd.h> + +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include <netinet/tcp.h> +#include <netinet/in.h> + +#include <haproxy/api.h> +#include <haproxy/errors.h> +#include <haproxy/fd.h> +#include <haproxy/global.h> +#include <haproxy/namespace.h> +#include <haproxy/receiver-t.h> +#include <haproxy/sock.h> +#include <haproxy/sock_inet.h> +#include <haproxy/tools.h> + +struct proto_fam proto_fam_inet4 = { + .name = "inet4", + .sock_domain = PF_INET, + .sock_family = AF_INET, + .sock_addrlen = sizeof(struct sockaddr_in), + .l3_addrlen = 32/8, + .addrcmp = sock_inet4_addrcmp, + .bind = sock_inet_bind_receiver, + .get_src = sock_get_src, + .get_dst = sock_inet_get_dst, + .set_port = sock_inet_set_port, +}; + +struct proto_fam proto_fam_inet6 = { + .name = "inet6", + .sock_domain = PF_INET6, + .sock_family = AF_INET6, + .sock_addrlen = sizeof(struct sockaddr_in6), + .l3_addrlen = 128/8, + .addrcmp = sock_inet6_addrcmp, + .bind = sock_inet_bind_receiver, + .get_src = sock_get_src, + .get_dst = sock_get_dst, + .set_port = sock_inet_set_port, +}; + +/* PLEASE NOTE for function below: + * - sock_inet4_* is solely for AF_INET (IPv4) + * - sock_inet6_* is solely for AF_INET6 (IPv6) + * - sock_inet_* is for either + * + * The address family SHOULD always be checked. In some cases a function will + * be used in a situation where the address family is guaranteed (e.g. protocol + * definitions), so the test may be avoided. This special case must then be + * mentioned in the comment before the function definition. + */ + +/* determine if the operating system uses IPV6_V6ONLY by default. 0=no, 1=yes. + * It also remains if IPv6 is not enabled/configured. + */ +int sock_inet6_v6only_default = 0; + +/* Default TCPv4/TCPv6 MSS settings. -1=unknown. */ +int sock_inet_tcp_maxseg_default = -1; +int sock_inet6_tcp_maxseg_default = -1; + +/* Compares two AF_INET sockaddr addresses. Returns 0 if they match or non-zero + * if they do not match. + */ +int sock_inet4_addrcmp(const struct sockaddr_storage *a, const struct sockaddr_storage *b) +{ + const struct sockaddr_in *a4 = (const struct sockaddr_in *)a; + const struct sockaddr_in *b4 = (const struct sockaddr_in *)b; + + if (a->ss_family != b->ss_family) + return -1; + + if (a->ss_family != AF_INET) + return -1; + + if (a4->sin_port != b4->sin_port) + return -1; + + return memcmp(&a4->sin_addr, &b4->sin_addr, sizeof(a4->sin_addr)); +} + +/* Compares two AF_INET6 sockaddr addresses. Returns 0 if they match or + * non-zero if they do not match. + */ +int sock_inet6_addrcmp(const struct sockaddr_storage *a, const struct sockaddr_storage *b) +{ + const struct sockaddr_in6 *a6 = (const struct sockaddr_in6 *)a; + const struct sockaddr_in6 *b6 = (const struct sockaddr_in6 *)b; + + if (a->ss_family != b->ss_family) + return -1; + + if (a->ss_family != AF_INET6) + return -1; + + if (a6->sin6_port != b6->sin6_port) + return -1; + + return memcmp(&a6->sin6_addr, &b6->sin6_addr, sizeof(a6->sin6_addr)); +} + +/* Sets the port <port> on IPv4 or IPv6 address <addr>. The address family is + * determined from the sockaddr_storage's address family. Nothing is done for + * other families. 
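As a small usage sketch of the helpers above (the loopback address and port are arbitrary, chosen only for the example):

	struct sockaddr_storage a = { }, b = { };

	a.ss_family = b.ss_family = AF_INET;
	((struct sockaddr_in *)&a)->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	((struct sockaddr_in *)&b)->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	sock_inet_set_port(&a, 8080);	/* stores htons(8080) into sin_port */
	sock_inet_set_port(&b, 8080);
	/* same family, address and port, so sock_inet4_addrcmp(&a, &b) == 0 */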
+ */ +void sock_inet_set_port(struct sockaddr_storage *addr, int port) +{ + if (addr->ss_family == AF_INET) + ((struct sockaddr_in *)addr)->sin_port = htons(port); + else if (addr->ss_family == AF_INET6) + ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); +} + +/* + * Retrieves the original destination address for the socket <fd> which must be + * of family AF_INET (not AF_INET6), with <dir> indicating if we're a listener + * (=0) or an initiator (!=0). In the case of a listener, if the original + * destination address was translated, the original address is retrieved. It + * returns 0 in case of success, -1 in case of error. The socket's destination + * address is stored in <sa> for <salen> bytes. + */ +int sock_inet_get_dst(int fd, struct sockaddr *sa, socklen_t salen, int dir) +{ + if (dir) + return getpeername(fd, sa, &salen); + else { + int ret = getsockname(fd, sa, &salen); + + if (ret < 0) + return ret; + +#if defined(USE_TPROXY) && defined(SO_ORIGINAL_DST) + /* For TPROXY and Netfilter's NAT, we can retrieve the original + * IPv4 address before DNAT/REDIRECT. We must not do that with + * other families because v6-mapped IPv4 addresses are still + * reported as v4. + */ + if (getsockopt(fd, IPPROTO_IP, SO_ORIGINAL_DST, sa, &salen) == 0) + return 0; +#endif + return ret; + } +} + +/* Returns true if the passed FD corresponds to a socket bound with RX_O_FOREIGN + * according to the various supported socket options. The socket's address family + * must be passed in <family>. + */ +int sock_inet_is_foreign(int fd, sa_family_t family) +{ + int val __maybe_unused; + socklen_t len __maybe_unused; + + switch (family) { + case AF_INET: +#if defined(IP_TRANSPARENT) + val = 0; len = sizeof(val); + if (getsockopt(fd, IPPROTO_IP, IP_TRANSPARENT, &val, &len) == 0 && val) + return 1; +#endif +#if defined(IP_FREEBIND) + val = 0; len = sizeof(val); + if (getsockopt(fd, IPPROTO_IP, IP_FREEBIND, &val, &len) == 0 && val) + return 1; +#endif +#if defined(IP_BINDANY) + val = 0; len = sizeof(val); + if (getsockopt(fd, IPPROTO_IP, IP_BINDANY, &val, &len) == 0 && val) + return 1; +#endif +#if defined(SO_BINDANY) + val = 0; len = sizeof(val); + if (getsockopt(fd, SOL_SOCKET, SO_BINDANY, &val, &len) == 0 && val) + return 1; +#endif + break; + + case AF_INET6: +#if defined(IPV6_TRANSPARENT) + val = 0; len = sizeof(val); + if (getsockopt(fd, IPPROTO_IPV6, IPV6_TRANSPARENT, &val, &len) == 0 && val) + return 1; +#endif +#if defined(IP_FREEBIND) + val = 0; len = sizeof(val); + if (getsockopt(fd, IPPROTO_IP, IP_FREEBIND, &val, &len) == 0 && val) + return 1; +#endif +#if defined(IPV6_BINDANY) + val = 0; len = sizeof(val); + if (getsockopt(fd, IPPROTO_IPV6, IPV6_BINDANY, &val, &len) == 0 && val) + return 1; +#endif +#if defined(SO_BINDANY) + val = 0; len = sizeof(val); + if (getsockopt(fd, SOL_SOCKET, SO_BINDANY, &val, &len) == 0 && val) + return 1; +#endif + break; + } + return 0; +} + +/* Attempt all known socket options to prepare an AF_INET socket to be bound + * to a foreign address. The socket must already exist and must not be bound. + * 1 is returned on success, 0 on failure. The caller must check the address + * family before calling this function.
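A hedged sketch of how a transparent-proxy path can use the helper documented above; the address comes from the 192.0.2.0/24 documentation range and is purely illustrative:

	#include <arpa/inet.h>

	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd >= 0 && sock_inet4_make_foreign(fd)) {
		struct sockaddr_in sin = { .sin_family = AF_INET };

		inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr); /* not locally owned */
		sin.sin_port = htons(0);                        /* any port */
		bind(fd, (struct sockaddr *)&sin, sizeof(sin)); /* permitted once foreign */
	}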
+ */ +int sock_inet4_make_foreign(int fd) +{ + return +#if defined(IP_TRANSPARENT) + setsockopt(fd, IPPROTO_IP, IP_TRANSPARENT, &one, sizeof(one)) == 0 || +#endif +#if defined(IP_FREEBIND) + setsockopt(fd, IPPROTO_IP, IP_FREEBIND, &one, sizeof(one)) == 0 || +#endif +#if defined(IP_BINDANY) + setsockopt(fd, IPPROTO_IP, IP_BINDANY, &one, sizeof(one)) == 0 || +#endif +#if defined(SO_BINDANY) + setsockopt(fd, SOL_SOCKET, SO_BINDANY, &one, sizeof(one)) == 0 || +#endif + 0; +} + +/* Attempt all known socket options to prepare an AF_INET6 socket to be bound + * to a foreign address. The socket must already exist and must not be bound. + * 1 is returned on success, 0 on failure. The caller must check the address + * family before calling this function. + */ +int sock_inet6_make_foreign(int fd) +{ + return +#if defined(IPV6_TRANSPARENT) + setsockopt(fd, IPPROTO_IPV6, IPV6_TRANSPARENT, &one, sizeof(one)) == 0 || +#endif +#if defined(IP_FREEBIND) + setsockopt(fd, IPPROTO_IP, IP_FREEBIND, &one, sizeof(one)) == 0 || +#endif +#if defined(IPV6_BINDANY) + setsockopt(fd, IPPROTO_IPV6, IPV6_BINDANY, &one, sizeof(one)) == 0 || +#endif +#if defined(SO_BINDANY) + setsockopt(fd, SOL_SOCKET, SO_BINDANY, &one, sizeof(one)) == 0 || +#endif + 0; +} + +/* Binds receiver <rx>, and assigns rx->iocb and rx->owner as the callback and + * context, respectively. Returns an error code made of ERR_* bits on failure + * or ERR_NONE on success. On failure, an error message may be passed into + * <errmsg>. + */ +int sock_inet_bind_receiver(struct receiver *rx, char **errmsg) +{ + int fd, err, ext; + /* copy listener addr because sometimes we need to switch family */ + struct sockaddr_storage addr_inet = rx->addr; + + /* force to classic sock family, not AF_CUST_* */ + addr_inet.ss_family = rx->proto->fam->sock_family; + + /* ensure we never return garbage */ + if (errmsg) + *errmsg = 0; + + err = ERR_NONE; + + if (rx->flags & RX_F_BOUND) + return ERR_NONE; + + if (rx->flags & RX_F_MUST_DUP) { + /* this is a secondary receiver that is an exact copy of a + * reference which must already be bound (or has failed). + * We'll try to dup() the other one's FD and take it. We + * try hard not to reconfigure the socket since it's shared. + */ + BUG_ON(!rx->shard_info); + if (!(rx->shard_info->ref->flags & RX_F_BOUND)) { + /* it's assumed that the first one has already reported + * the error, let's not spam with another one, and do + * not set ERR_ALERT. + */ + err |= ERR_RETRYABLE; + goto bind_ret_err; + } + /* taking the other one's FD will result in it being marked + * extern and being dup()ed. Let's mark the receiver as + * inherited so that it properly bypasses all second-stage + * setup and avoids being passed to new processes. + */ + rx->flags |= RX_F_INHERITED; + rx->fd = rx->shard_info->ref->fd; + } + + /* if no FD was assigned yet, we'll have to either find a compatible + * one or create a new one. + */ + if (rx->fd == -1) + rx->fd = sock_find_compatible_fd(rx); + + /* if the receiver now has an fd assigned, then we were offered the fd + * by an external process (most likely the parent), and we don't want + * to create a new socket. However we still want to set a few flags on + * the socket.
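The ERR_* values returned by this function are cumulative bit flags rather than a single error code; a hypothetical caller (not part of the patch) could handle them like this:

	char *errmsg = NULL;
	int err = sock_inet_bind_receiver(rx, &errmsg);

	if (err != ERR_NONE && errmsg)
		ha_alert("binding failed: %s\n", errmsg);
	if (err & ERR_RETRYABLE) {
		/* e.g. the old process still owns the address, try again later */
	}
	ha_free(&errmsg);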
+ */ + fd = rx->fd; + ext = (fd >= 0); + + if (!ext) { + fd = my_socketat(rx->settings->netns, rx->proto->fam->sock_domain, + rx->proto->sock_type, rx->proto->sock_prot); + if (fd == -1) { + err |= ERR_RETRYABLE | ERR_ALERT; + memprintf(errmsg, "cannot create receiving socket (%s)", strerror(errno)); + goto bind_return; + } + } + + if (ext && fd < global.maxsock && fdtab[fd].owner) { + /* This FD was already bound so this means that it was already + * known and registered before parsing, hence it's an inherited + * FD. The only reason why it's already known here is that it + * has been registered multiple times (multiple listeners on the + * same, or a "shards" directive on the line). There cannot be + * multiple listeners on one FD but at least we can create a + * new one from the original one. We won't reconfigure it, + * however, as this was already done for the first one. + */ + fd = dup(fd); + if (fd == -1) { + err |= ERR_RETRYABLE | ERR_ALERT; + memprintf(errmsg, "cannot dup() receiving socket (%s)", strerror(errno)); + goto bind_return; + } + } + + if (fd >= global.maxsock) { + err |= ERR_FATAL | ERR_ABORT | ERR_ALERT; + memprintf(errmsg, "not enough free sockets (raise '-n' parameter)"); + goto bind_close_return; + } + + if (fd_set_nonblock(fd) == -1) { + err |= ERR_FATAL | ERR_ALERT; + memprintf(errmsg, "cannot make socket non-blocking"); + goto bind_close_return; + } + + if (!ext && setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) == -1) { + /* not fatal but should be reported */ + memprintf(errmsg, "cannot do so_reuseaddr"); + err |= ERR_ALERT; + } + +#ifdef SO_REUSEPORT + /* OpenBSD and Linux 3.9 support this. As it's present in old libc versions of + * Linux, it might return an error that we will silently ignore. + */ + if (!ext && (rx->proto->flags & PROTO_F_REUSEPORT_SUPPORTED)) + setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)); +#endif + +#ifdef SO_REUSEPORT_LB + /* FreeBSD 12 and above use this to load-balance incoming connections. + * This is limited to 256 listeners per group however. + */ + if (!ext && (rx->proto->flags & PROTO_F_REUSEPORT_SUPPORTED)) + setsockopt(fd, SOL_SOCKET, SO_REUSEPORT_LB, &one, sizeof(one)); +#endif + + if (!ext && (rx->settings->options & RX_O_FOREIGN)) { + switch (addr_inet.ss_family) { + case AF_INET: + if (!sock_inet4_make_foreign(fd)) { + memprintf(errmsg, "cannot make receiving socket transparent"); + err |= ERR_ALERT; + } + break; + case AF_INET6: + if (!sock_inet6_make_foreign(fd)) { + memprintf(errmsg, "cannot make receiving socket transparent"); + err |= ERR_ALERT; + } + break; + } + } + +#ifdef SO_BINDTODEVICE + /* Note: this might fail if not CAP_NET_RAW */ + if (!ext && rx->settings->interface) { + if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, + rx->settings->interface, + strlen(rx->settings->interface) + 1) == -1) { + memprintf(errmsg, "cannot bind receiver to device '%s' (%s)", rx->settings->interface, strerror(errno)); + err |= ERR_WARN; + } + } +#endif + +#if defined(IPV6_V6ONLY) + if (addr_inet.ss_family == AF_INET6 && !ext) { + /* Prepare to match the v6only option against what we really want. Note + * that sadly the two options are not exclusive to each other and that + * v6only is stronger than v4v6. 
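The decision applied just below can be summarized by this small helper, a sketch that is not part of the patch (the name want_v6only() is an assumption):

	static int want_v6only(unsigned int options)
	{
		if (options & RX_O_V6ONLY)
			return 1;                  /* explicit "v6only" always wins */
		if (options & RX_O_V4V6)
			return 0;                  /* explicit "v4v6" requested */
		return sock_inet6_v6only_default;  /* otherwise follow the OS default */
	}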
+ */ + if ((rx->settings->options & RX_O_V6ONLY) || + (sock_inet6_v6only_default && !(rx->settings->options & RX_O_V4V6))) + setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one)); + else + setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &zero, sizeof(zero)); + } +#endif + + if (!ext && bind(fd, (struct sockaddr *)&addr_inet, rx->proto->fam->sock_addrlen) == -1) { + err |= ERR_RETRYABLE | ERR_ALERT; + memprintf(errmsg, "cannot bind socket (%s)", strerror(errno)); + goto bind_close_return; + } + + rx->fd = fd; + rx->flags |= RX_F_BOUND; + + fd_insert(fd, rx->owner, rx->iocb, rx->bind_tgroup, rx->bind_thread); + + /* for now, all regularly bound TCP listeners are exportable */ + if (!(rx->flags & RX_F_INHERITED)) + HA_ATOMIC_OR(&fdtab[fd].state, FD_EXPORTED); + + bind_return: + if (errmsg && *errmsg) { + char pn[INET6_ADDRSTRLEN]; + + addr_to_str(&addr_inet, pn, sizeof(pn)); + memprintf(errmsg, "%s for [%s:%d]", *errmsg, pn, get_host_port(&addr_inet)); + } + bind_ret_err: + return err; + + bind_close_return: + close(fd); + goto bind_return; +} + +static void sock_inet_prepare() +{ + int fd, val; + socklen_t len; + + fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd >= 0) { +#ifdef TCP_MAXSEG + /* retrieve the OS' default mss for TCPv4 */ + len = sizeof(val); + if (getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &val, &len) == 0) + sock_inet_tcp_maxseg_default = val; +#endif + close(fd); + } + + fd = socket(AF_INET6, SOCK_STREAM, 0); + if (fd >= 0) { +#if defined(IPV6_V6ONLY) + /* retrieve the OS' bindv6only value */ + len = sizeof(val); + if (getsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, &len) == 0 && val > 0) + sock_inet6_v6only_default = 1; +#endif + +#ifdef TCP_MAXSEG + /* retrieve the OS' default mss for TCPv6 */ + len = sizeof(val); + if (getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &val, &len) == 0) + sock_inet6_tcp_maxseg_default = val; +#endif + close(fd); + } +} + +INITCALL0(STG_PREPARE, sock_inet_prepare); + + +REGISTER_BUILD_OPTS("Built with transparent proxy support using:" +#if defined(IP_TRANSPARENT) + " IP_TRANSPARENT" +#endif +#if defined(IPV6_TRANSPARENT) + " IPV6_TRANSPARENT" +#endif +#if defined(IP_FREEBIND) + " IP_FREEBIND" +#endif +#if defined(IP_BINDANY) + " IP_BINDANY" +#endif +#if defined(IPV6_BINDANY) + " IPV6_BINDANY" +#endif +#if defined(SO_BINDANY) + " SO_BINDANY" +#endif + ""); diff --git a/src/sock_unix.c b/src/sock_unix.c new file mode 100644 index 0000000..ef749a5 --- /dev/null +++ b/src/sock_unix.c @@ -0,0 +1,387 @@ +/* + * SOCK_UNIX socket management + * + * Copyright 2000-2020 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <ctype.h> +#include <errno.h> +#include <string.h> +#include <unistd.h> + +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/un.h> + +#include <haproxy/api.h> +#include <haproxy/errors.h> +#include <haproxy/fd.h> +#include <haproxy/global.h> +#include <haproxy/listener.h> +#include <haproxy/receiver-t.h> +#include <haproxy/namespace.h> +#include <haproxy/sock.h> +#include <haproxy/sock_unix.h> +#include <haproxy/tools.h> + + +struct proto_fam proto_fam_unix = { + .name = "unix", + .sock_domain = PF_UNIX, + .sock_family = AF_UNIX, + .sock_addrlen = sizeof(struct sockaddr_un), + .l3_addrlen = sizeof(((struct sockaddr_un*)0)->sun_path), + .addrcmp = sock_unix_addrcmp, + .bind = sock_unix_bind_receiver, + .get_src = sock_get_src, + .get_dst = sock_get_dst, +}; + +/* PLEASE NOTE for functions below: + * + * The address family SHOULD always be checked. In some cases a function will + * be used in a situation where the address family is guaranteed (e.g. protocol + * definitions), so the test may be avoided. This special case must then be + * mentioned in the comment before the function definition. + */ + + +/* Compares two AF_UNIX sockaddr addresses. Returns 0 if they match or non-zero + * if they do not match. It also supports ABNS socket addresses (those starting + * with \0). For regular UNIX sockets however, this does explicitly support + * matching names ending exactly with .XXXXX.tmp which are newly bound sockets + * about to be replaced; this suffix is then ignored. Note that our UNIX socket + * paths are always zero-terminated. + */ +int sock_unix_addrcmp(const struct sockaddr_storage *a, const struct sockaddr_storage *b) +{ + const struct sockaddr_un *au = (const struct sockaddr_un *)a; + const struct sockaddr_un *bu = (const struct sockaddr_un *)b; + int idx, dot, idx2; + + if (a->ss_family != b->ss_family) + return -1; + + if (a->ss_family != AF_UNIX) + return -1; + + if (au->sun_path[0] != bu->sun_path[0]) + return -1; + + if (au->sun_path[0] == 0) + return memcmp(au->sun_path, bu->sun_path, sizeof(au->sun_path)); + + idx = 1; dot = 0; + while (au->sun_path[idx] == bu->sun_path[idx]) { + if (au->sun_path[idx] == 0) + return 0; + if (au->sun_path[idx] == '.') + dot = idx; + idx++; + } + + /* Now we have a difference. It's OK if they are within or after a + * sequence of digits following a dot, and are followed by ".tmp". + * + * make sure to perform the check against tempname if the compared + * string is in "final" format (does not end with ".XXXX.tmp"). + * + * Examples: + * /tmp/test matches with /tmp/test.1822.tmp + * /tmp/test.1822.tmp matches with /tmp/test.XXXX.tmp + */ + if (au->sun_path[idx] == 0 || bu->sun_path[idx] == 0) { + if (au->sun_path[idx] == '.' 
|| bu->sun_path[idx] == '.') + dot = idx; /* try to match against temp path */ + else + return -1; /* invalid temp path */ + } + + if (!dot) + return -1; + + /* First, check in path "a" */ + if (au->sun_path[idx] != 0) { + for (idx2 = dot + 1; idx2 && isdigit((unsigned char)au->sun_path[idx2]);) + idx2++; + if (strcmp(au->sun_path + idx2, ".tmp") != 0) + return -1; + } + + /* Then check in path "b" */ + if (bu->sun_path[idx] != 0) { + for (idx2 = dot + 1; idx2 && isdigit((unsigned char)bu->sun_path[idx2]); idx2++) + ; + if (strcmp(bu->sun_path + idx2, ".tmp") != 0) + return -1; + } + + /* OK that's a match */ + return 0; +} + +/* Binds receiver <rx>, and assigns rx->iocb and rx->owner as the callback and + * context, respectively, with ->bind_thread as the thread mask. Returns an + * error code made of ERR_* bits on failure or ERR_NONE on success. On failure, + * an error message may be passed into <errmsg>. + */ +int sock_unix_bind_receiver(struct receiver *rx, char **errmsg) +{ + char tempname[MAXPATHLEN]; + char backname[MAXPATHLEN]; + struct sockaddr_un addr; + const char *path; + int maxpathlen; + int fd, err, ext, ret; + + /* ensure we never return garbage */ + if (errmsg) + *errmsg = 0; + + err = ERR_NONE; + + if (rx->flags & RX_F_BOUND) + return ERR_NONE; + + if (rx->flags & RX_F_MUST_DUP) { + /* this is a secondary receiver that is an exact copy of a + * reference which must already be bound (or has failed). + * We'll try to dup() the other one's FD and take it. We + * try hard not to reconfigure the socket since it's shared. + */ + BUG_ON(!rx->shard_info); + if (!(rx->shard_info->ref->flags & RX_F_BOUND)) { + /* it's assumed that the first one has already reported + * the error, let's not spam with another one, and do + * not set ERR_ALERT. + */ + err |= ERR_RETRYABLE; + goto bind_ret_err; + } + /* taking the other one's FD will result in it being marked + * extern and being dup()ed. Let's mark the receiver as + * inherited so that it properly bypasses all second-stage + * setup and avoids being passed to new processes. + */ + rx->flags |= RX_F_INHERITED; + rx->fd = rx->shard_info->ref->fd; + } + + /* if no FD was assigned yet, we'll have to either find a compatible + * one or create a new one. + */ + if (rx->fd == -1) + rx->fd = sock_find_compatible_fd(rx); + + path = ((struct sockaddr_un *)&rx->addr)->sun_path; + maxpathlen = MIN(MAXPATHLEN, sizeof(addr.sun_path)); + + /* if the listener already has an fd assigned, then we were offered the + * fd by an external process (most likely the parent), and we don't want + * to create a new socket. However we still want to set a few flags on + * the socket. + */ + fd = rx->fd; + ext = (fd >= 0); + if (ext) + goto fd_ready; + + if (path[0]) { + ret = snprintf(tempname, maxpathlen, "%s.%d.tmp", path, pid); + if (ret < 0 || ret >= sizeof(addr.sun_path)) { + err |= ERR_FATAL | ERR_ALERT; + memprintf(errmsg, "name too long for UNIX socket (limit usually 97)"); + goto bind_return; + } + + ret = snprintf(backname, maxpathlen, "%s.%d.bak", path, pid); + if (ret < 0 || ret >= maxpathlen) { + err |= ERR_FATAL | ERR_ALERT; + memprintf(errmsg, "name too long for UNIX socket (limit usually 97)"); + goto bind_return; + } + + /* 2. 
clean existing orphaned entries */ + if (unlink(tempname) < 0 && errno != ENOENT) { + err |= ERR_FATAL | ERR_ALERT; + memprintf(errmsg, "error when trying to unlink previous UNIX socket (%s)", strerror(errno)); + goto bind_return; + } + + if (unlink(backname) < 0 && errno != ENOENT) { + err |= ERR_FATAL | ERR_ALERT; + memprintf(errmsg, "error when trying to unlink previous UNIX socket (%s)", strerror(errno)); + goto bind_return; + } + + /* 3. backup existing socket */ + if (link(path, backname) < 0 && errno != ENOENT) { + err |= ERR_FATAL | ERR_ALERT; + memprintf(errmsg, "error when trying to preserve previous UNIX socket (%s)", strerror(errno)); + goto bind_return; + } + + /* Note: this test is redundant with the snprintf one above and + * will never trigger, it's just added as the only way to shut + * gcc's painfully dumb warning about possibly truncated output + * during strncpy(). Don't move it above or smart gcc will not + * see it! + */ + if (strlen(tempname) >= sizeof(addr.sun_path)) { + err |= ERR_FATAL | ERR_ALERT; + memprintf(errmsg, "name too long for UNIX socket (limit usually 97)"); + goto bind_return; + } + + strncpy(addr.sun_path, tempname, sizeof(addr.sun_path) - 1); + addr.sun_path[sizeof(addr.sun_path) - 1] = 0; + } + else { + /* first char is zero, it's an abstract socket whose address + * is defined by all the bytes past this zero. + */ + memcpy(addr.sun_path, path, sizeof(addr.sun_path)); + } + addr.sun_family = AF_UNIX; + + /* WT: shouldn't we use my_socketat(rx->netns) here instead ? */ + fd = socket(rx->proto->fam->sock_domain, rx->proto->sock_type, rx->proto->sock_prot); + if (fd < 0) { + err |= ERR_FATAL | ERR_ALERT; + memprintf(errmsg, "cannot create receiving socket (%s)", strerror(errno)); + goto bind_return; + } + + fd_ready: + if (ext && fd < global.maxsock && fdtab[fd].owner) { + /* This FD was already bound so this means that it was already + * known and registered before parsing, hence it's an inherited + * FD. The only reason why it's already known here is that it + * has been registered multiple times (multiple listeners on the + * same, or a "shards" directive on the line). There cannot be + * multiple listeners on one FD but at least we can create a + * new one from the original one. We won't reconfigure it, + * however, as this was already done for the first one. + */ + fd = dup(fd); + if (fd == -1) { + err |= ERR_RETRYABLE | ERR_ALERT; + memprintf(errmsg, "cannot dup() receiving socket (%s)", strerror(errno)); + goto bind_return; + } + } + + if (fd >= global.maxsock) { + err |= ERR_FATAL | ERR_ABORT | ERR_ALERT; + memprintf(errmsg, "not enough free sockets (raise '-n' parameter)"); + goto bind_close_return; + } + + if (fd_set_nonblock(fd) == -1) { + err |= ERR_FATAL | ERR_ALERT; + memprintf(errmsg, "cannot make socket non-blocking"); + goto bind_close_return; + } + + if (!ext && bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + /* note that bind() creates the socket <tempname> on the file system */ + if (errno == EADDRINUSE) { + /* the old process might still own it, let's retry */ + err |= ERR_RETRYABLE | ERR_ALERT; + memprintf(errmsg, "cannot bind UNIX socket (already in use)"); + goto bind_close_return; + } + else { + err |= ERR_FATAL | ERR_ALERT; + memprintf(errmsg, "cannot bind UNIX socket (%s)", strerror(errno)); + goto bind_close_return; + } + } + + /* <uid> and <gid> different of -1 will be used to change the socket owner. + * If <mode> is not 0, it will be used to restrict access to the socket. 
+ * While it is known not to be portable on every OS, it's still useful + * where it works. We also don't change permissions on abstract sockets. + */ + if (!ext && path[0] && + (((rx->settings->ux.uid != -1 || rx->settings->ux.gid != -1) && + (chown(tempname, rx->settings->ux.uid, rx->settings->ux.gid) == -1)) || + (rx->settings->ux.mode != 0 && chmod(tempname, rx->settings->ux.mode) == -1))) { + err |= ERR_FATAL | ERR_ALERT; + memprintf(errmsg, "cannot change UNIX socket ownership (%s)", strerror(errno)); + goto err_unlink_temp; + } + + /* Point of no return: we are ready, we'll switch the sockets. We don't + * fear losing the socket <path> because we have a copy of it in + * backname. Abstract sockets are not renamed. + */ + if (!ext && path[0] && rename(tempname, path) < 0) { + err |= ERR_FATAL | ERR_ALERT; + memprintf(errmsg, "cannot switch final and temporary UNIX sockets (%s)", strerror(errno)); + goto err_rename; + } + + /* Cleanup: only unlink if we didn't inherit the fd from the parent */ + if (!ext && path[0]) + unlink(backname); + + rx->fd = fd; + rx->flags |= RX_F_BOUND; + + if (!path[0]) { + /* ABNS sockets do not support suspend, and they conflict with + * other ones (no reuseport), so they must always be unbound. + */ + rx->flags |= RX_F_NON_SUSPENDABLE; + } + + fd_insert(fd, rx->owner, rx->iocb, rx->bind_tgroup, rx->bind_thread); + + /* for now, all regularly bound TCP listeners are exportable */ + if (!(rx->flags & RX_F_INHERITED)) + HA_ATOMIC_OR(&fdtab[fd].state, FD_EXPORTED); + + return err; + + err_rename: + ret = rename(backname, path); + if (ret < 0 && errno == ENOENT) + unlink(path); + err_unlink_temp: + if (!ext && path[0]) + unlink(tempname); + close(fd); + err_unlink_back: + if (!ext && path[0]) + unlink(backname); + bind_return: + if (errmsg && *errmsg) { + if (!ext) { + char *path_str; + + path_str = sa2str((struct sockaddr_storage *)&rx->addr, 0, 0); + memprintf(errmsg, "%s [%s]", *errmsg, ((path_str) ? path_str : "")); + ha_free(&path_str); + } + else + memprintf(errmsg, "%s [fd %d]", *errmsg, fd); + } + bind_ret_err: + return err; + + bind_close_return: + close(fd); + goto bind_return; +} diff --git a/src/ssl_ckch.c b/src/ssl_ckch.c new file mode 100644 index 0000000..ab39755 --- /dev/null +++ b/src/ssl_ckch.c @@ -0,0 +1,3968 @@ +/* + * + * Copyright (C) 2020 HAProxy Technologies, William Lallemand <wlallemand@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#define _GNU_SOURCE +#include <ctype.h> +#include <dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syslog.h> +#include <unistd.h> + +#include <sys/stat.h> +#include <sys/types.h> + +#include <import/ebpttree.h> +#include <import/ebsttree.h> + +#include <haproxy/applet.h> +#include <haproxy/base64.h> +#include <haproxy/channel.h> +#include <haproxy/cli.h> +#include <haproxy/errors.h> +#include <haproxy/sc_strm.h> +#include <haproxy/ssl_ckch.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/ssl_ocsp.h> +#include <haproxy/ssl_utils.h> +#include <haproxy/stconn.h> +#include <haproxy/tools.h> + +/* Uncommitted CKCH transaction */ + +static struct { + struct ckch_store *new_ckchs; + struct ckch_store *old_ckchs; + char *path; +} ckchs_transaction; + +/* Uncommitted CA file transaction */ + +static struct { + struct cafile_entry *old_cafile_entry; + struct cafile_entry *new_cafile_entry; + char *path; +} cafile_transaction; + +/* Uncommitted CRL file transaction */ + +static struct { + struct cafile_entry *old_crlfile_entry; + struct cafile_entry *new_crlfile_entry; + char *path; +} crlfile_transaction; + +/* CLI context used by "show cafile" */ +struct show_cafile_ctx { + struct cafile_entry *cur_cafile_entry; + struct cafile_entry *old_cafile_entry; + int ca_index; + int show_all; +}; + +/* CLI context used by "show crlfile" */ +struct show_crlfile_ctx { + struct cafile_entry *cafile_entry; + struct cafile_entry *old_crlfile_entry; + int index; +}; + +/* CLI context used by "show cert" */ +struct show_cert_ctx { + struct ckch_store *old_ckchs; + struct ckch_store *cur_ckchs; + int transaction; +}; + +/* CLI context used by "commit cert" */ +struct commit_cert_ctx { + struct ckch_store *old_ckchs; + struct ckch_store *new_ckchs; + struct ckch_inst *next_ckchi; + char *err; + enum { + CERT_ST_INIT = 0, + CERT_ST_GEN, + CERT_ST_INSERT, + CERT_ST_SUCCESS, + CERT_ST_FIN, + CERT_ST_ERROR, + } state; +}; + +/* CLI context used by "commit cafile" and "commit crlfile" */ +struct commit_cacrlfile_ctx { + struct cafile_entry *old_entry; + struct cafile_entry *new_entry; + struct ckch_inst_link *next_ckchi_link; + enum cafile_type cafile_type; /* either CA or CRL, depending on the current command */ + char *err; + enum { + CACRL_ST_INIT = 0, + CACRL_ST_GEN, + CACRL_ST_INSERT, + CACRL_ST_SUCCESS, + CACRL_ST_FIN, + CACRL_ST_ERROR, + } state; +}; + + +/******************** cert_key_and_chain functions ************************* + * These are the functions that fill a cert_key_and_chain structure. For the + * functions filling an SSL_CTX from a cert_key_and_chain, see ssl_sock.c + */ + +/* + * Try to parse a Signed Certificate Timestamp List structure. This function + * performs only a basic check that the data looks like an SCTL. No signature + * validation is performed.
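For reference, the framing checked by this parser is the RFC 6962 SignedCertificateTimestampList: a 2-byte big-endian total length followed by SCTs, each prefixed with its own 2-byte length. A minimal well-formed buffer looks like this (the payload bytes are arbitrary):

	const unsigned char sctl_example[] = {
		0x00, 0x05,            /* list length: 5 bytes follow */
		0x00, 0x03,            /* one SCT of 3 bytes */
		0xde, 0xad, 0xbe       /* opaque SCT payload */
	};
	/* total size 7 == 2 + list length, so the parser accepts it */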
+ */ +static int ssl_sock_parse_sctl(struct buffer *sctl) +{ + int ret = 1; + int len, pos, sct_len; + unsigned char *data; + + if (sctl->data < 2) + goto out; + + data = (unsigned char *) sctl->area; + len = (data[0] << 8) | data[1]; + + if (len + 2 != sctl->data) + goto out; + + data = data + 2; + pos = 0; + while (pos < len) { + if (len - pos < 2) + goto out; + + sct_len = (data[pos] << 8) | data[pos + 1]; + if (pos + sct_len + 2 > len) + goto out; + + pos += sct_len + 2; + } + + ret = 0; + +out: + return ret; +} + +/* Try to load an SCTL from a buffer <buf> if not NULL, or read the file <sctl_path>. + * It fills the ckch->sctl buffer. + * Return 0 on success or != 0 on failure */ +int ssl_sock_load_sctl_from_file(const char *sctl_path, char *buf, struct ckch_data *data, char **err) +{ + int fd = -1; + int r = 0; + int ret = 1; + struct buffer tmp; + struct buffer *src; + struct buffer *sctl; + + if (buf) { + chunk_initstr(&tmp, buf); + src = &tmp; + } else { + fd = open(sctl_path, O_RDONLY); + if (fd == -1) + goto end; + + trash.data = 0; + while (trash.data < trash.size) { + r = read(fd, trash.area + trash.data, trash.size - trash.data); + if (r < 0) { + if (errno == EINTR) + continue; + goto end; + } + else if (r == 0) { + break; + } + trash.data += r; + } + src = &trash; + } + + ret = ssl_sock_parse_sctl(src); + if (ret) + goto end; + + sctl = calloc(1, sizeof(*sctl)); + if (!chunk_dup(sctl, src)) { + ha_free(&sctl); + goto end; + } + /* no error, fill ckch with new context, old context must be freed */ + if (data->sctl) { + ha_free(&data->sctl->area); + free(data->sctl); + } + data->sctl = sctl; + ret = 0; +end: + if (fd != -1) + close(fd); + + return ret; +} + +#if ((defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) || defined OPENSSL_IS_BORINGSSL) +/* + * This function loads the OCSP Response in DER format contained in file at + * path 'ocsp_path' or base64 in a buffer <buf> + * + * Returns 0 on success, 1 on error.
+ */ +int ssl_sock_load_ocsp_response_from_file(const char *ocsp_path, char *buf, struct ckch_data *data, char **err) +{ + int fd = -1; + int r = 0; + int ret = 1; + struct buffer *ocsp_response; + struct buffer *src = NULL; + + if (buf) { + int i, j; + /* if it's from a buffer it will be base64 */ + + /* remove \r and \n from the payload */ + for (i = 0, j = 0; buf[i]; i++) { + if (buf[i] == '\r' || buf[i] == '\n') + continue; + buf[j++] = buf[i]; + } + buf[j] = 0; + + ret = base64dec(buf, j, trash.area, trash.size); + if (ret < 0) { + memprintf(err, "Error reading OCSP response in base64 format"); + goto end; + } + trash.data = ret; + src = &trash; + } else { + fd = open(ocsp_path, O_RDONLY); + if (fd == -1) { + memprintf(err, "Error opening OCSP response file"); + goto end; + } + + trash.data = 0; + while (trash.data < trash.size) { + r = read(fd, trash.area + trash.data, trash.size - trash.data); + if (r < 0) { + if (errno == EINTR) + continue; + + memprintf(err, "Error reading OCSP response from file"); + goto end; + } + else if (r == 0) { + break; + } + trash.data += r; + } + close(fd); + fd = -1; + src = &trash; + } + + ocsp_response = calloc(1, sizeof(*ocsp_response)); + if (!chunk_dup(ocsp_response, src)) { + ha_free(&ocsp_response); + goto end; + } + /* no error, fill data with new context, old context must be free */ + if (data->ocsp_response) { + ha_free(&data->ocsp_response->area); + free(data->ocsp_response); + } + data->ocsp_response = ocsp_response; + ret = 0; +end: + if (fd != -1) + close(fd); + + return ret; +} +#endif + +/* + * Try to load in a ckch every files related to a ckch. + * (PEM, sctl, ocsp, issuer etc.) + * + * This function is only used to load files during the configuration parsing, + * it is not used with the CLI. + * + * This allows us to carry the contents of the file without having to read the + * file multiple times. The caller must call + * ssl_sock_free_cert_key_and_chain_contents. + * + * returns: + * 0 on Success + * 1 on SSL Failure + */ +int ssl_sock_load_files_into_ckch(const char *path, struct ckch_data *data, char **err) +{ + struct buffer *fp = NULL; + int ret = 1; + struct stat st; + + /* try to load the PEM */ + if (ssl_sock_load_pem_into_ckch(path, NULL, data , err) != 0) { + goto end; + } + + fp = alloc_trash_chunk(); + if (!fp) { + memprintf(err, "%sCan't allocate memory\n", err && *err ? *err : ""); + goto end; + } + + if (!chunk_strcpy(fp, path) || (b_data(fp) > MAXPATHLEN)) { + memprintf(err, "%s '%s' filename too long'.\n", + err && *err ? *err : "", fp->area); + ret = 1; + goto end; + } + + /* remove the ".crt" extension */ + if (global_ssl.extra_files_noext) { + char *ext; + + /* look for the extension */ + if ((ext = strrchr(fp->area, '.'))) { + + if (strcmp(ext, ".crt") == 0) { + *ext = '\0'; + fp->data = strlen(fp->area); + } + } + + } + + if (data->key == NULL) { + /* If no private key was found yet and we cannot look for it in extra + * files, raise an error. + */ + if (!(global_ssl.extra_files & SSL_GF_KEY)) { + memprintf(err, "%sNo Private Key found in '%s'.\n", err && *err ? *err : "", fp->area); + goto end; + } + + /* try to load an external private key if it wasn't in the PEM */ + if (!chunk_strcat(fp, ".key") || (b_data(fp) > MAXPATHLEN)) { + memprintf(err, "%s '%s' filename too long'.\n", + err && *err ? 
*err : "", fp->area); + ret = 1; + goto end; + } + + if (stat(fp->area, &st) == 0) { + if (ssl_sock_load_key_into_ckch(fp->area, NULL, data, err)) { + memprintf(err, "%s '%s' is present but cannot be read or parsed'.\n", + err && *err ? *err : "", fp->area); + goto end; + } + } + + if (data->key == NULL) { + memprintf(err, "%sNo Private Key found in '%s'.\n", err && *err ? *err : "", fp->area); + goto end; + } + /* remove the added extension */ + *(fp->area + fp->data - strlen(".key")) = '\0'; + b_sub(fp, strlen(".key")); + } + + + if (!X509_check_private_key(data->cert, data->key)) { + memprintf(err, "%sinconsistencies between private key and certificate loaded '%s'.\n", + err && *err ? *err : "", path); + goto end; + } + +#ifdef HAVE_SSL_SCTL + /* try to load the sctl file */ + if (global_ssl.extra_files & SSL_GF_SCTL) { + struct stat st; + + if (!chunk_strcat(fp, ".sctl") || b_data(fp) > MAXPATHLEN) { + memprintf(err, "%s '%s' filename too long'.\n", + err && *err ? *err : "", fp->area); + ret = 1; + goto end; + } + + if (stat(fp->area, &st) == 0) { + if (ssl_sock_load_sctl_from_file(fp->area, NULL, data, err)) { + memprintf(err, "%s '%s.sctl' is present but cannot be read or parsed'.\n", + err && *err ? *err : "", fp->area); + ret = 1; + goto end; + } + } + /* remove the added extension */ + *(fp->area + fp->data - strlen(".sctl")) = '\0'; + b_sub(fp, strlen(".sctl")); + } +#endif + + /* try to load an ocsp response file */ + if (global_ssl.extra_files & SSL_GF_OCSP) { + struct stat st; + + if (!chunk_strcat(fp, ".ocsp") || b_data(fp) > MAXPATHLEN) { + memprintf(err, "%s '%s' filename too long'.\n", + err && *err ? *err : "", fp->area); + ret = 1; + goto end; + } + + if (stat(fp->area, &st) == 0) { + if (ssl_sock_load_ocsp_response_from_file(fp->area, NULL, data, err)) { + ret = 1; + goto end; + } + } + /* remove the added extension */ + *(fp->area + fp->data - strlen(".ocsp")) = '\0'; + b_sub(fp, strlen(".ocsp")); + } + +#ifndef OPENSSL_IS_BORINGSSL /* Useless for BoringSSL */ + if (data->ocsp_response && (global_ssl.extra_files & SSL_GF_OCSP_ISSUER)) { + /* if no issuer was found, try to load an issuer from the .issuer */ + if (!data->ocsp_issuer) { + struct stat st; + + if (!chunk_strcat(fp, ".issuer") || b_data(fp) > MAXPATHLEN) { + memprintf(err, "%s '%s' filename too long'.\n", + err && *err ? *err : "", fp->area); + ret = 1; + goto end; + } + + if (stat(fp->area, &st) == 0) { + if (ssl_sock_load_issuer_file_into_ckch(fp->area, NULL, data, err)) { + ret = 1; + goto end; + } + + if (X509_check_issued(data->ocsp_issuer, data->cert) != X509_V_OK) { + memprintf(err, "%s '%s' is not an issuer'.\n", + err && *err ? *err : "", fp->area); + ret = 1; + goto end; + } + } + /* remove the added extension */ + *(fp->area + fp->data - strlen(".issuer")) = '\0'; + b_sub(fp, strlen(".issuer")); + } + } +#endif + + ret = 0; + +end: + + ERR_clear_error(); + + /* Something went wrong in one of the reads */ + if (ret != 0) + ssl_sock_free_cert_key_and_chain_contents(data); + + free_trash_chunk(fp); + + return ret; +} + +/* + * Try to load a private key file from a <path> or a buffer <buf> + * + * If it failed you should not attempt to use the ckch but free it. 
+ * + * Return 0 on success or != 0 on failure + */ +int ssl_sock_load_key_into_ckch(const char *path, char *buf, struct ckch_data *data , char **err) +{ + BIO *in = NULL; + int ret = 1; + EVP_PKEY *key = NULL; + + if (buf) { + /* reading from a buffer */ + in = BIO_new_mem_buf(buf, -1); + if (in == NULL) { + memprintf(err, "%sCan't allocate memory\n", err && *err ? *err : ""); + goto end; + } + + } else { + /* reading from a file */ + in = BIO_new(BIO_s_file()); + if (in == NULL) + goto end; + + if (BIO_read_filename(in, path) <= 0) + goto end; + } + + /* Read Private Key */ + key = PEM_read_bio_PrivateKey(in, NULL, NULL, NULL); + if (key == NULL) { + memprintf(err, "%sunable to load private key from file '%s'.\n", + err && *err ? *err : "", path); + goto end; + } + + ret = 0; + + SWAP(data->key, key); + +end: + + ERR_clear_error(); + if (in) + BIO_free(in); + if (key) + EVP_PKEY_free(key); + + return ret; +} + +/* + * Try to load a PEM file from a <path> or a buffer <buf> + * The PEM must contain at least a Certificate, + * It could contain a DH, a certificate chain and a PrivateKey. + * + * If it failed you should not attempt to use the ckch but free it. + * + * Return 0 on success or != 0 on failure + */ +int ssl_sock_load_pem_into_ckch(const char *path, char *buf, struct ckch_data *data , char **err) +{ + BIO *in = NULL; + int ret = 1; + X509 *ca; + X509 *cert = NULL; + EVP_PKEY *key = NULL; + HASSL_DH *dh = NULL; + STACK_OF(X509) *chain = NULL; + + if (buf) { + /* reading from a buffer */ + in = BIO_new_mem_buf(buf, -1); + if (in == NULL) { + memprintf(err, "%sCan't allocate memory\n", err && *err ? *err : ""); + goto end; + } + + } else { + /* reading from a file */ + in = BIO_new(BIO_s_file()); + if (in == NULL) { + memprintf(err, "%sCan't allocate memory\n", err && *err ? *err : ""); + goto end; + } + + if (BIO_read_filename(in, path) <= 0) { + memprintf(err, "%scannot open the file '%s'.\n", + err && *err ? *err : "", path); + goto end; + } + } + + /* Read Private Key */ + key = PEM_read_bio_PrivateKey(in, NULL, NULL, NULL); + /* no need to check for errors here, because the private key could be loaded later */ + +#ifndef OPENSSL_NO_DH + /* Seek back to beginning of file */ + if (BIO_reset(in) == -1) { + memprintf(err, "%san error occurred while reading the file '%s'.\n", + err && *err ? *err : "", path); + goto end; + } + + dh = ssl_sock_get_dh_from_bio(in); + ERR_clear_error(); + /* no need to return an error there, dh is not mandatory */ +#endif + + /* Seek back to beginning of file */ + if (BIO_reset(in) == -1) { + memprintf(err, "%san error occurred while reading the file '%s'.\n", + err && *err ? *err : "", path); + goto end; + } + + /* Read Certificate */ + cert = PEM_read_bio_X509_AUX(in, NULL, NULL, NULL); + if (cert == NULL) { + ret = ERR_get_error(); + memprintf(err, "%sunable to load certificate from file '%s': %s.\n", + err && *err ? *err : "", path, ERR_reason_error_string(ret)); + goto end; + } + + /* Look for a Certificate Chain */ + while ((ca = PEM_read_bio_X509(in, NULL, NULL, NULL))) { + if (chain == NULL) + chain = sk_X509_new_null(); + if (!sk_X509_push(chain, ca)) { + X509_free(ca); + break; + } + } + + ret = ERR_get_error(); + if (ret && !(ERR_GET_LIB(ret) == ERR_LIB_PEM && ERR_GET_REASON(ret) == PEM_R_NO_START_LINE)) { + memprintf(err, "%sunable to load certificate chain from file '%s': %s\n", + err && *err ? 
*err : "", path, ERR_reason_error_string(ret)); + goto end; + } + + /* once it loaded the PEM, it should remove everything else in the data */ + if (data->ocsp_response) { + ha_free(&data->ocsp_response->area); + ha_free(&data->ocsp_response); + } + + if (data->sctl) { + ha_free(&data->sctl->area); + ha_free(&data->sctl); + } + + if (data->ocsp_issuer) { + X509_free(data->ocsp_issuer); + data->ocsp_issuer = NULL; + } + + /* no error, fill data with new context, old context will be free at end: */ + SWAP(data->key, key); + SWAP(data->dh, dh); + SWAP(data->cert, cert); + SWAP(data->chain, chain); + + ret = 0; + +end: + + ERR_clear_error(); + if (in) + BIO_free(in); + if (key) + EVP_PKEY_free(key); + if (dh) + HASSL_DH_free(dh); + if (cert) + X509_free(cert); + if (chain) + sk_X509_pop_free(chain, X509_free); + + return ret; +} + +/* Frees the contents of a cert_key_and_chain + */ +void ssl_sock_free_cert_key_and_chain_contents(struct ckch_data *data) +{ + if (!data) + return; + + /* Free the certificate and set pointer to NULL */ + if (data->cert) + X509_free(data->cert); + data->cert = NULL; + + /* Free the key and set pointer to NULL */ + if (data->key) + EVP_PKEY_free(data->key); + data->key = NULL; + + /* Free each certificate in the chain */ + if (data->chain) + sk_X509_pop_free(data->chain, X509_free); + data->chain = NULL; + + if (data->dh) + HASSL_DH_free(data->dh); + data->dh = NULL; + + if (data->sctl) { + ha_free(&data->sctl->area); + ha_free(&data->sctl); + } + + if (data->ocsp_response) { + ha_free(&data->ocsp_response->area); + ha_free(&data->ocsp_response); + } + + if (data->ocsp_issuer) + X509_free(data->ocsp_issuer); + data->ocsp_issuer = NULL; + + + /* We need to properly remove the reference to the corresponding + * certificate_ocsp structure if it exists (which it should). 
+ */ +#if ((defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) && !defined OPENSSL_IS_BORINGSSL) + if (data->ocsp_cid) { + struct certificate_ocsp *ocsp = NULL; + unsigned char certid[OCSP_MAX_CERTID_ASN1_LENGTH] = {}; + unsigned int certid_length = 0; + + if (ssl_ocsp_build_response_key(data->ocsp_cid, (unsigned char*)certid, &certid_length) >= 0) { + HA_SPIN_LOCK(OCSP_LOCK, &ocsp_tree_lock); + ocsp = (struct certificate_ocsp *)ebmb_lookup(&cert_ocsp_tree, certid, OCSP_MAX_CERTID_ASN1_LENGTH); + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + ssl_sock_free_ocsp(ocsp); + } + + OCSP_CERTID_free(data->ocsp_cid); + data->ocsp_cid = NULL; + } +#endif +} + +/* + * + * This function copies a cert_key_and_chain in memory + * + * It's used to try to apply changes on a ckch before committing them, because + * most of the time it's not possible to revert those changes. + * + * Return the dst or NULL + */ +struct ckch_data *ssl_sock_copy_cert_key_and_chain(struct ckch_data *src, + struct ckch_data *dst) +{ + if (!src || !dst) + return NULL; + + if (src->cert) { + dst->cert = src->cert; + X509_up_ref(src->cert); + } + + if (src->key) { + dst->key = src->key; + EVP_PKEY_up_ref(src->key); + } + + if (src->chain) { + dst->chain = X509_chain_up_ref(src->chain); + } + + if (src->dh) { +#ifndef USE_OPENSSL_WOLFSSL + HASSL_DH_up_ref(src->dh); + dst->dh = src->dh; +#else + dst->dh = wolfSSL_DH_dup(src->dh); + if (!dst->dh) + goto error; +#endif + } + + if (src->sctl) { + struct buffer *sctl; + + sctl = calloc(1, sizeof(*sctl)); + if (!chunk_dup(sctl, src->sctl)) { + ha_free(&sctl); + goto error; + } + dst->sctl = sctl; + } + + if (src->ocsp_response) { + struct buffer *ocsp_response; + + ocsp_response = calloc(1, sizeof(*ocsp_response)); + if (!chunk_dup(ocsp_response, src->ocsp_response)) { + ha_free(&ocsp_response); + goto error; + } + dst->ocsp_response = ocsp_response; + } + + if (src->ocsp_issuer) { + X509_up_ref(src->ocsp_issuer); + dst->ocsp_issuer = src->ocsp_issuer; + } + + dst->ocsp_cid = OCSP_CERTID_dup(src->ocsp_cid); + + dst->ocsp_update_mode = src->ocsp_update_mode; + + return dst; + +error: + + /* free everything */ + ssl_sock_free_cert_key_and_chain_contents(dst); + + return NULL; +} + +/* + * return 0 on success or != 0 on failure + */ +int ssl_sock_load_issuer_file_into_ckch(const char *path, char *buf, struct ckch_data *data, char **err) +{ + int ret = 1; + BIO *in = NULL; + X509 *issuer; + + if (buf) { + /* reading from a buffer */ + in = BIO_new_mem_buf(buf, -1); + if (in == NULL) { + memprintf(err, "%sCan't allocate memory\n", err && *err ? *err : ""); + goto end; + } + + } else { + /* reading from a file */ + in = BIO_new(BIO_s_file()); + if (in == NULL) + goto end; + + if (BIO_read_filename(in, path) <= 0) + goto end; + } + + issuer = PEM_read_bio_X509_AUX(in, NULL, NULL, NULL); + if (!issuer) { + memprintf(err, "%s'%s' cannot be read or parsed'.\n", + err && *err ?
*err : "", path); + goto end; + } + /* no error, fill data with new context, old context must be free */ + if (data->ocsp_issuer) + X509_free(data->ocsp_issuer); + data->ocsp_issuer = issuer; + ret = 0; + +end: + + ERR_clear_error(); + if (in) + BIO_free(in); + + return ret; +} + +/******************** ckch_store functions *********************************** + * The ckch_store is a structure used to cache and index the SSL files used in + * configuration + */ + +/* + * Free a ckch_store, its ckch, its instances and remove it from the ebtree + */ +void ckch_store_free(struct ckch_store *store) +{ + struct ckch_inst *inst, *inst_s; + + if (!store) + return; + + list_for_each_entry_safe(inst, inst_s, &store->ckch_inst, by_ckchs) { + ckch_inst_free(inst); + } + ebmb_delete(&store->node); + + ssl_sock_free_cert_key_and_chain_contents(store->data); + ha_free(&store->data); + + free(store); +} + +/* + * create and initialize a ckch_store + * <path> is the key name + * <nmemb> is the number of store->ckch objects to allocate + * + * Return a ckch_store or NULL upon failure. + */ +struct ckch_store *ckch_store_new(const char *filename) +{ + struct ckch_store *store; + int pathlen; + + pathlen = strlen(filename); + store = calloc(1, sizeof(*store) + pathlen + 1); + if (!store) + return NULL; + + memcpy(store->path, filename, pathlen + 1); + + LIST_INIT(&store->ckch_inst); + LIST_INIT(&store->crtlist_entry); + + store->data = calloc(1, sizeof(*store->data)); + if (!store->data) + goto error; + + return store; +error: + ckch_store_free(store); + return NULL; +} + +/* allocate and duplicate a ckch_store + * Return a new ckch_store or NULL */ +struct ckch_store *ckchs_dup(const struct ckch_store *src) +{ + struct ckch_store *dst; + + if (!src) + return NULL; + + dst = ckch_store_new(src->path); + if (!dst) + return NULL; + + if (!ssl_sock_copy_cert_key_and_chain(src->data, dst->data)) + goto error; + + return dst; + +error: + ckch_store_free(dst); + + return NULL; +} + +/* + * lookup a path into the ckchs tree. + */ +struct ckch_store *ckchs_lookup(char *path) +{ + struct ebmb_node *eb; + + eb = ebst_lookup(&ckchs_tree, path); + if (!eb) + return NULL; + + return ebmb_entry(eb, struct ckch_store, node); +} + +/* + * This function allocate a ckch_store and populate it with certificates from files. + */ +struct ckch_store *ckchs_load_cert_file(char *path, char **err) +{ + struct ckch_store *ckchs; + + ckchs = ckch_store_new(path); + if (!ckchs) { + memprintf(err, "%sunable to allocate memory.\n", err && *err ? 
*err : ""); + goto end; + } + + if (ssl_sock_load_files_into_ckch(path, ckchs->data, err) == 1) + goto end; + + /* insert into the ckchs tree */ + memcpy(ckchs->path, path, strlen(path) + 1); + ebst_insert(&ckchs_tree, &ckchs->node); + return ckchs; + +end: + ckch_store_free(ckchs); + + return NULL; +} + + +/******************** ckch_inst functions ******************************/ + +/* unlink a ckch_inst, free all SNIs, free the ckch_inst */ +/* The caller must use the lock of the bind_conf if used with inserted SNIs */ +void ckch_inst_free(struct ckch_inst *inst) +{ + struct sni_ctx *sni, *sni_s; + struct ckch_inst_link_ref *link_ref, *link_ref_s; + + if (inst == NULL) + return; + + list_for_each_entry_safe(sni, sni_s, &inst->sni_ctx, by_ckch_inst) { + SSL_CTX_free(sni->ctx); + LIST_DELETE(&sni->by_ckch_inst); + ebmb_delete(&sni->name); + free(sni); + } + SSL_CTX_free(inst->ctx); + inst->ctx = NULL; + LIST_DELETE(&inst->by_ckchs); + LIST_DELETE(&inst->by_crtlist_entry); + + /* Free the cafile_link_refs list */ + list_for_each_entry_safe(link_ref, link_ref_s, &inst->cafile_link_refs, list) { + if (link_ref->link && LIST_INLIST(&link_ref->link->list)) { + /* Try to detach and free the ckch_inst_link only if it + * was attached, this way it can be used to loop from + * the caller */ + LIST_DEL_INIT(&link_ref->link->list); + ha_free(&link_ref->link); + } + LIST_DELETE(&link_ref->list); + free(link_ref); + } + + free(inst); +} + +/* Alloc and init a ckch_inst */ +struct ckch_inst *ckch_inst_new() +{ + struct ckch_inst *ckch_inst; + + ckch_inst = calloc(1, sizeof *ckch_inst); + if (!ckch_inst) + return NULL; + + LIST_INIT(&ckch_inst->sni_ctx); + LIST_INIT(&ckch_inst->by_ckchs); + LIST_INIT(&ckch_inst->by_crtlist_entry); + LIST_INIT(&ckch_inst->cafile_link_refs); + + return ckch_inst; +} + + +/******************** ssl_store functions ******************************/ +struct eb_root cafile_tree = EB_ROOT; + +/* + * Returns the cafile_entry found in the cafile_tree indexed by the path 'path'. + * If 'oldest_entry' is 1, returns the "original" cafile_entry (since + * during a set cafile/commit cafile cycle there might be two entries for any + * given path, the original one and the new one set via the CLI but not + * committed yet). + */ +struct cafile_entry *ssl_store_get_cafile_entry(char *path, int oldest_entry) +{ + struct cafile_entry *ca_e = NULL; + struct ebmb_node *eb; + + eb = ebst_lookup(&cafile_tree, path); + while (eb) { + ca_e = ebmb_entry(eb, struct cafile_entry, node); + /* The ebst_lookup in a tree that has duplicates returns the + * oldest entry first. If we want the latest entry, we need to + * iterate over all the duplicates until we find the last one + * (in our case there should never be more than two entries for + * any given path). */ + if (oldest_entry) + return ca_e; + eb = ebmb_next_dup(eb); + } + return ca_e; +} + +int ssl_store_add_uncommitted_cafile_entry(struct cafile_entry *entry) +{ + return (ebst_insert(&cafile_tree, &entry->node) != &entry->node); +} + +X509_STORE* ssl_store_get0_locations_file(char *path) +{ + struct cafile_entry *ca_e = ssl_store_get_cafile_entry(path, 0); + + if (ca_e) + return ca_e->ca_store; + + return NULL; +} + +/* Create a cafile_entry object, without adding it to the cafile_tree. 
*/ +struct cafile_entry *ssl_store_create_cafile_entry(char *path, X509_STORE *store, enum cafile_type type) +{ + struct cafile_entry *ca_e; + int pathlen; + + pathlen = strlen(path); + + ca_e = calloc(1, sizeof(*ca_e) + pathlen + 1); + if (ca_e) { + memcpy(ca_e->path, path, pathlen + 1); + ca_e->ca_store = store; + ca_e->type = type; + LIST_INIT(&ca_e->ckch_inst_link); + } + return ca_e; +} + + +/* Duplicate a cafile_entry + * Allocate the X509_STORE and copy the X509 and CRL inside. + * + * Return the newly allocated cafile_entry or NULL. + * + */ +struct cafile_entry *ssl_store_dup_cafile_entry(struct cafile_entry *src) +{ + struct cafile_entry *dst = NULL; + X509_STORE *store = NULL; + STACK_OF(X509_OBJECT) *objs; + int i; + + if (!src) + return NULL; + + if (src->ca_store) { + /* if there was a store in the src, copy it */ + store = X509_STORE_new(); + if (!store) + goto err; + + objs = X509_STORE_get0_objects(src->ca_store); + for (i = 0; i < sk_X509_OBJECT_num(objs); i++) { + X509 *cert; + X509_CRL *crl; + + cert = X509_OBJECT_get0_X509(sk_X509_OBJECT_value(objs, i)); + if (cert) { + if (X509_STORE_add_cert(store, cert) == 0) { + /* only exits on error if the error is not about duplicate certificates */ + if (!(ERR_GET_REASON(ERR_get_error()) == X509_R_CERT_ALREADY_IN_HASH_TABLE)) { + goto err; + } + } + + } + crl = X509_OBJECT_get0_X509_CRL(sk_X509_OBJECT_value(objs, i)); + if (crl) { + if (X509_STORE_add_crl(store, crl) == 0) { + /* only exits on error if the error is not about duplicate certificates */ + if (!(ERR_GET_REASON(ERR_get_error()) == X509_R_CERT_ALREADY_IN_HASH_TABLE)) { + goto err; + } + } + + } + } + } + dst = ssl_store_create_cafile_entry(src->path, store, src->type); + + return dst; + +err: + X509_STORE_free(store); + ha_free(&dst); + + return NULL; +} + +/* Delete a cafile_entry. The caller is responsible for removing this entry + * from the cafile_tree first if it was previously added into it. */ +void ssl_store_delete_cafile_entry(struct cafile_entry *ca_e) +{ + struct ckch_inst_link *link, *link_s; + if (!ca_e) + return; + + X509_STORE_free(ca_e->ca_store); + + list_for_each_entry_safe(link, link_s, &ca_e->ckch_inst_link, list) { + struct ckch_inst *inst = link->ckch_inst; + struct ckch_inst_link_ref *link_ref, *link_ref_s; + list_for_each_entry_safe(link_ref, link_ref_s, &inst->cafile_link_refs, list) { + if (link_ref->link == link) { + LIST_DELETE(&link_ref->list); + free(link_ref); + break; + } + } + LIST_DELETE(&link->list); + free(link); + } + + free(ca_e); +} + +/* + * Fill a cafile_entry <ca_e> X509_STORE ca_e->store out of a buffer <cert_buf> + * instead of out of a file. The <append> field should be set to 1 if you want + * to keep the existing X509_STORE and append data to it. + * + * This function is used when the "set ssl ca-file" cli command is used. + * It can parse CERTIFICATE sections as well as CRL ones. + * Returns 0 in case of success, 1 otherwise. + * + * /!\ Warning: If there was an error the X509_STORE could have been modified so it's + * better not to use it after a return 1.
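Given the warning above, a cautious caller treats a failure as fatal for the entry; <pem_buf> is a hypothetical in-memory PEM bundle:

	if (ssl_store_load_ca_from_buf(ca_e, pem_buf, 0) != 0) {
		/* the X509_STORE may be half-updated: unlink and drop the entry */
		ebmb_delete(&ca_e->node);
		ssl_store_delete_cafile_entry(ca_e);
	}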
+ */ +int ssl_store_load_ca_from_buf(struct cafile_entry *ca_e, char *cert_buf, int append) +{ + BIO *bio = NULL; + STACK_OF(X509_INFO) *infos; + X509_INFO *info; + int i; + int retval = 1; + int retcert = 0; + + if (!ca_e) + return 1; + + if (!append) { + X509_STORE_free(ca_e->ca_store); + ca_e->ca_store = NULL; + } + + if (!ca_e->ca_store) + ca_e->ca_store = X509_STORE_new(); + + if (!ca_e->ca_store) + goto end; + + bio = BIO_new_mem_buf(cert_buf, strlen(cert_buf)); + if (!bio) + goto end; + + infos = PEM_X509_INFO_read_bio(bio, NULL, NULL, NULL); + if (!infos) + goto end; + + for (i = 0; i < sk_X509_INFO_num(infos) && !retcert; i++) { + info = sk_X509_INFO_value(infos, i); + + /* X509_STORE_add_cert and X509_STORE_add_crl return 1 on success */ + if (info->x509) + retcert = !X509_STORE_add_cert(ca_e->ca_store, info->x509); + if (!retcert && info->crl) + retcert = !X509_STORE_add_crl(ca_e->ca_store, info->crl); + } + + /* return an error if we didn't compute all the X509_INFO or if there was none + * set to 0 if everything was right */ + if (!(retcert || (i != sk_X509_INFO_num(infos)) || (sk_X509_INFO_num(infos) == 0))) + retval = 0; + + /* Cleanup */ + sk_X509_INFO_pop_free(infos, X509_INFO_free); + +end: + BIO_free(bio); + + return retval; +} + +/* + * Try to load a ca-file from disk into the ca-file cache. + * <shuterror> allows you to stop emitting the errors. + * Return 0 upon error + */ +int __ssl_store_load_locations_file(char *path, int create_if_none, enum cafile_type type, int shuterror) +{ + X509_STORE *store = ssl_store_get0_locations_file(path); + + /* If this function is called by the CLI, we should not call the + * X509_STORE_load_locations function because it performs forbidden disk + * accesses. */ + if (!store && create_if_none) { + STACK_OF(X509_OBJECT) *objs; + int cert_count = 0; + struct stat buf; + struct cafile_entry *ca_e; + const char *file = NULL; + const char *dir = NULL; + unsigned long e; + + store = X509_STORE_new(); + if (!store) { + if (!shuterror) + ha_alert("Cannot allocate memory!\n"); + goto err; + } + + if (strcmp(path, "@system-ca") == 0) { + dir = X509_get_default_cert_dir(); + if (!dir) { + if (!shuterror) + ha_alert("Couldn't get the system CA directory from X509_get_default_cert_dir().\n"); + goto err; + } + + } else { + + if (stat(path, &buf) == -1) { + if (!shuterror) + ha_alert("Couldn't open the ca-file '%s' (%s).\n", path, strerror(errno)); + goto err; + } + + if (S_ISDIR(buf.st_mode)) + dir = path; + else + file = path; + } + + if (file) { + if (!X509_STORE_load_locations(store, file, NULL)) { + e = ERR_get_error(); + if (!shuterror) + ha_alert("Couldn't open the ca-file '%s' (%s).\n", path, ERR_reason_error_string(e)); + goto err; + } + } else if (dir) { + int n, i; + struct dirent **de_list; + + n = scandir(dir, &de_list, 0, alphasort); + if (n < 0) + goto err; + + for (i = 0; i < n; i++) { + char *end; + struct dirent *de = de_list[i]; + BIO *in = NULL; + X509 *ca = NULL; + + ERR_clear_error(); + + /* we try to load the files that would have + * been loaded in a hashed directory loaded by + * X509_LOOKUP_hash_dir, so according to "man 1 + * c_rehash", we should load ".pem", ".crt", + * ".cer", or ".crl". Files starting with a dot + * are ignored. + */ + end = strrchr(de->d_name, '.'); + if (!end || de->d_name[0] == '.
|| + (strcmp(end, ".pem") != 0 && + strcmp(end, ".crt") != 0 && + strcmp(end, ".cer") != 0 && + strcmp(end, ".crl") != 0)) { + free(de); + continue; + } + in = BIO_new(BIO_s_file()); + if (in == NULL) + goto scandir_err; + + chunk_printf(&trash, "%s/%s", dir, de->d_name); + + if (BIO_read_filename(in, trash.area) == 0) + goto scandir_err; + + if (PEM_read_bio_X509_AUX(in, &ca, NULL, NULL) == NULL) + goto scandir_err; + + if (X509_STORE_add_cert(store, ca) == 0) { + /* only exit on error if the error is not about duplicate certificates */ + if (!(ERR_GET_REASON(ERR_get_error()) == X509_R_CERT_ALREADY_IN_HASH_TABLE)) { + goto scandir_err; + } + } + + X509_free(ca); + BIO_free(in); + free(de); + continue; + +scandir_err: + e = ERR_get_error(); + X509_free(ca); + BIO_free(in); + free(de); + /* warn if it can't load one of the files, but don't abort */ + if (!shuterror) + ha_warning("ca-file: '%s' couldn't load '%s' (%s)\n", path, trash.area, ERR_reason_error_string(e)); + + } + free(de_list); + } else { + if (!shuterror) + ha_alert("ca-file: couldn't load '%s'\n", path); + goto err; + } + + objs = X509_STORE_get0_objects(store); + cert_count = sk_X509_OBJECT_num(objs); + if (cert_count == 0) { + if (!shuterror) + ha_warning("ca-file: 0 CA were loaded from '%s'\n", path); + } + ca_e = ssl_store_create_cafile_entry(path, store, type); + if (!ca_e) { + if (!shuterror) + ha_alert("Cannot allocate memory!\n"); + goto err; + } + ebst_insert(&cafile_tree, &ca_e->node); + } + return (store != NULL); + +err: + X509_STORE_free(store); + store = NULL; + return 0; + +} + +int ssl_store_load_locations_file(char *path, int create_if_none, enum cafile_type type) +{ + return __ssl_store_load_locations_file(path, create_if_none, type, 0); +} + +/*************************** CLI commands ***********************/ + +/* Types of SSL payloads that can be updated over the CLI */ + +struct cert_exts cert_exts[] = { + { "", CERT_TYPE_PEM, &ssl_sock_load_pem_into_ckch }, /* default mode, no extensions */ + { "key", CERT_TYPE_KEY, &ssl_sock_load_key_into_ckch }, +#if ((defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) || defined OPENSSL_IS_BORINGSSL) + { "ocsp", CERT_TYPE_OCSP, &ssl_sock_load_ocsp_response_from_file }, +#endif +#ifdef HAVE_SSL_SCTL + { "sctl", CERT_TYPE_SCTL, &ssl_sock_load_sctl_from_file }, +#endif + { "issuer", CERT_TYPE_ISSUER, &ssl_sock_load_issuer_file_into_ckch }, + { NULL, CERT_TYPE_MAX, NULL }, +}; + + +/* release function of the `show ssl cert' command */ +static void cli_release_show_cert(struct appctx *appctx) +{ + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); +} + +/* IO handler of "show ssl cert <filename>". + * It makes use of a show_cert_ctx context, and ckchs_transaction in read-only.
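+ *
+ * As with the other IO handlers below, returning 1 ends the dump while
+ * returning 0 asks to be called back later; the current node is saved in
+ * the context so that the tree walk resumes where applet_putchk()
+ * reported a full buffer, e.g.:
+ *
+ *   if (applet_putchk(appctx, trash) == -1)
+ *       goto yield;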
+ */ +static int cli_io_handler_show_cert(struct appctx *appctx) +{ + struct show_cert_ctx *ctx = appctx->svcctx; + struct buffer *trash = alloc_trash_chunk(); + struct ebmb_node *node; + struct ckch_store *ckchs = NULL; + + if (trash == NULL) + return 1; + + if (!ctx->old_ckchs && ckchs_transaction.old_ckchs) { + ckchs = ckchs_transaction.old_ckchs; + chunk_appendf(trash, "# transaction\n"); + chunk_appendf(trash, "*%s\n", ckchs->path); + if (applet_putchk(appctx, trash) == -1) + goto yield; + ctx->old_ckchs = ckchs_transaction.old_ckchs; + } + + if (!ctx->cur_ckchs) { + chunk_appendf(trash, "# filename\n"); + node = ebmb_first(&ckchs_tree); + } else { + node = &ctx->cur_ckchs->node; + } + while (node) { + ckchs = ebmb_entry(node, struct ckch_store, node); + chunk_appendf(trash, "%s\n", ckchs->path); + + node = ebmb_next(node); + if (applet_putchk(appctx, trash) == -1) + goto yield; + } + + ctx->cur_ckchs = NULL; + free_trash_chunk(trash); + return 1; +yield: + + free_trash_chunk(trash); + ctx->cur_ckchs = ckchs; + return 0; /* should come back */ +} + +/* + * Extract and format the DNS SAN extensions and copy the result into a chunk. + * Always returns 0. + */ +#ifdef SSL_CTRL_SET_TLSEXT_HOSTNAME +static int ssl_sock_get_san_oneline(X509 *cert, struct buffer *out) +{ + int i; + char *str; + STACK_OF(GENERAL_NAME) *names = NULL; + + names = X509_get_ext_d2i(cert, NID_subject_alt_name, NULL, NULL); + if (names) { + for (i = 0; i < sk_GENERAL_NAME_num(names); i++) { + GENERAL_NAME *name = sk_GENERAL_NAME_value(names, i); + if (i > 0) + chunk_appendf(out, ", "); + if (name->type == GEN_DNS) { + if (ASN1_STRING_to_UTF8((unsigned char **)&str, name->d.dNSName) >= 0) { + chunk_appendf(out, "DNS:%s", str); + OPENSSL_free(str); + } + } + } + sk_GENERAL_NAME_pop_free(names, GENERAL_NAME_free); + } + return 0; +} +#endif + +/* + * Build the ckch_inst_link that will be chained in the CA file entry and the + * corresponding ckch_inst_link_ref that will be chained in the ckch instance. + * Return 0 in case of success. + */ +static int do_chain_inst_and_cafile(struct cafile_entry *cafile_entry, struct ckch_inst *ckch_inst) +{ + struct ckch_inst_link *new_link; + if (!LIST_ISEMPTY(&cafile_entry->ckch_inst_link)) { + struct ckch_inst_link *link = LIST_ELEM(cafile_entry->ckch_inst_link.n, + typeof(link), list); + /* Do not add multiple references to the same + * instance in a cafile_entry */ + if (link->ckch_inst == ckch_inst) { + return 1; + } + } + + new_link = calloc(1, sizeof(*new_link)); + if (new_link) { + struct ckch_inst_link_ref *new_link_ref = calloc(1, sizeof(*new_link_ref)); + if (!new_link_ref) { + free(new_link); + return 1; + } + + new_link->ckch_inst = ckch_inst; + new_link_ref->link = new_link; + LIST_INIT(&new_link->list); + LIST_INIT(&new_link_ref->list); + + LIST_APPEND(&cafile_entry->ckch_inst_link, &new_link->list); + LIST_APPEND(&ckch_inst->cafile_link_refs, &new_link_ref->list); + } + + return 0; +} + + +/* + * Link a CA file tree entry to the ckch instance that uses it. + * To determine if and which CA file tree entries need to be linked to the + * instance, we follow the same logic performed in ssl_sock_prepare_ctx when + * processing the verify option. + * This function works for a frontend as well as for a backend, depending on the + * configuration parameters given (bind_conf or server).
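+ *
+ * For example (illustrative configuration), with a bind line such as:
+ *
+ *   bind :443 ssl crt srv.pem ca-file ca.pem verify required
+ *
+ * the verify mode includes SSL_VERIFY_PEER, so the instance built for
+ * srv.pem is linked to the cafile_entry of ca.pem and can be rebuilt
+ * when "commit ssl ca-file ca.pem" is issued.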
+ */ +void ckch_inst_add_cafile_link(struct ckch_inst *ckch_inst, struct bind_conf *bind_conf, + struct ssl_bind_conf *ssl_conf, const struct server *srv) +{ + int verify = SSL_VERIFY_NONE; + + if (srv) { + + if (global.ssl_server_verify == SSL_SERVER_VERIFY_REQUIRED) + verify = SSL_VERIFY_PEER; + switch (srv->ssl_ctx.verify) { + case SSL_SOCK_VERIFY_NONE: + verify = SSL_VERIFY_NONE; + break; + case SSL_SOCK_VERIFY_REQUIRED: + verify = SSL_VERIFY_PEER; + break; + } + } + else { + switch ((ssl_conf && ssl_conf->verify) ? ssl_conf->verify : bind_conf->ssl_conf.verify) { + case SSL_SOCK_VERIFY_NONE: + verify = SSL_VERIFY_NONE; + break; + case SSL_SOCK_VERIFY_OPTIONAL: + verify = SSL_VERIFY_PEER; + break; + case SSL_SOCK_VERIFY_REQUIRED: + verify = SSL_VERIFY_PEER|SSL_VERIFY_FAIL_IF_NO_PEER_CERT; + break; + } + } + + if (verify & SSL_VERIFY_PEER) { + struct cafile_entry *ca_file_entry = NULL; + struct cafile_entry *ca_verify_file_entry = NULL; + struct cafile_entry *crl_file_entry = NULL; + if (srv) { + if (srv->ssl_ctx.ca_file) { + ca_file_entry = ssl_store_get_cafile_entry(srv->ssl_ctx.ca_file, 0); + + } + if (srv->ssl_ctx.crl_file) { + crl_file_entry = ssl_store_get_cafile_entry(srv->ssl_ctx.crl_file, 0); + } + } + else { + char *ca_file = (ssl_conf && ssl_conf->ca_file) ? ssl_conf->ca_file : bind_conf->ssl_conf.ca_file; + char *ca_verify_file = (ssl_conf && ssl_conf->ca_verify_file) ? ssl_conf->ca_verify_file : bind_conf->ssl_conf.ca_verify_file; + char *crl_file = (ssl_conf && ssl_conf->crl_file) ? ssl_conf->crl_file : bind_conf->ssl_conf.crl_file; + + if (ca_file) + ca_file_entry = ssl_store_get_cafile_entry(ca_file, 0); + if (ca_verify_file) + ca_verify_file_entry = ssl_store_get_cafile_entry(ca_verify_file, 0); + if (crl_file) + crl_file_entry = ssl_store_get_cafile_entry(crl_file, 0); + } + + if (ca_file_entry) { + /* If we have a ckch instance that is not already in the + * cafile_entry's list, add it to it. */ + if (do_chain_inst_and_cafile(ca_file_entry, ckch_inst)) + return; + + } + if (ca_verify_file_entry && (ca_file_entry != ca_verify_file_entry)) { + /* If we have a ckch instance that is not already in the + * cafile_entry's list, add it to it. */ + if (do_chain_inst_and_cafile(ca_verify_file_entry, ckch_inst)) + return; + } + if (crl_file_entry) { + /* If we have a ckch instance that is not already in the + * cafile_entry's list, add it to it. 
*/ + if (do_chain_inst_and_cafile(crl_file_entry, ckch_inst)) + return; + } + } +} + + + +static int show_cert_detail(X509 *cert, STACK_OF(X509) *chain, struct buffer *out) +{ + BIO *bio = NULL; + struct buffer *tmp = alloc_trash_chunk(); + int i; + int write = -1; + unsigned int len = 0; + X509_NAME *name = NULL; + + if (!tmp) + return -1; + + if (!cert) + goto end; + + if (chain == NULL) { + struct issuer_chain *issuer; + issuer = ssl_get0_issuer_chain(cert); + if (issuer) { + chain = issuer->chain; + chunk_appendf(out, "Chain Filename: "); + chunk_appendf(out, "%s\n", issuer->path); + } + } + chunk_appendf(out, "Serial: "); + if (ssl_sock_get_serial(cert, tmp) == -1) + goto end; + dump_binary(out, tmp->area, tmp->data); + chunk_appendf(out, "\n"); + + chunk_appendf(out, "notBefore: "); + chunk_reset(tmp); + if ((bio = BIO_new(BIO_s_mem())) == NULL) + goto end; + if (ASN1_TIME_print(bio, X509_getm_notBefore(cert)) == 0) + goto end; + write = BIO_read(bio, tmp->area, tmp->size-1); + tmp->area[write] = '\0'; + BIO_free(bio); + bio = NULL; + chunk_appendf(out, "%s\n", tmp->area); + + chunk_appendf(out, "notAfter: "); + chunk_reset(tmp); + if ((bio = BIO_new(BIO_s_mem())) == NULL) + goto end; + if (ASN1_TIME_print(bio, X509_getm_notAfter(cert)) == 0) + goto end; + if ((write = BIO_read(bio, tmp->area, tmp->size-1)) <= 0) + goto end; + tmp->area[write] = '\0'; + BIO_free(bio); + bio = NULL; + chunk_appendf(out, "%s\n", tmp->area); + +#ifdef SSL_CTRL_SET_TLSEXT_HOSTNAME + chunk_appendf(out, "Subject Alternative Name: "); + if (ssl_sock_get_san_oneline(cert, out) == -1) + goto end; + *(out->area + out->data) = '\0'; + chunk_appendf(out, "\n"); +#endif + chunk_reset(tmp); + chunk_appendf(out, "Algorithm: "); + if (cert_get_pkey_algo(cert, tmp) == 0) + goto end; + chunk_appendf(out, "%s\n", tmp->area); + + chunk_reset(tmp); + chunk_appendf(out, "SHA1 FingerPrint: "); + if (X509_digest(cert, EVP_sha1(), (unsigned char *) tmp->area, &len) == 0) + goto end; + tmp->data = len; + dump_binary(out, tmp->area, tmp->data); + chunk_appendf(out, "\n"); + + chunk_appendf(out, "Subject: "); + if ((name = X509_get_subject_name(cert)) == NULL) + goto end; + if ((ssl_sock_get_dn_oneline(name, tmp)) == -1) + goto end; + *(tmp->area + tmp->data) = '\0'; + chunk_appendf(out, "%s\n", tmp->area); + + chunk_appendf(out, "Issuer: "); + if ((name = X509_get_issuer_name(cert)) == NULL) + goto end; + if ((ssl_sock_get_dn_oneline(name, tmp)) == -1) + goto end; + *(tmp->area + tmp->data) = '\0'; + chunk_appendf(out, "%s\n", tmp->area); + + /* Displays subject of each certificate in the chain */ + for (i = 0; i < sk_X509_num(chain); i++) { + X509 *ca = sk_X509_value(chain, i); + + chunk_appendf(out, "Chain Subject: "); + if ((name = X509_get_subject_name(ca)) == NULL) + goto end; + if ((ssl_sock_get_dn_oneline(name, tmp)) == -1) + goto end; + *(tmp->area + tmp->data) = '\0'; + chunk_appendf(out, "%s\n", tmp->area); + + chunk_appendf(out, "Chain Issuer: "); + if ((name = X509_get_issuer_name(ca)) == NULL) + goto end; + if ((ssl_sock_get_dn_oneline(name, tmp)) == -1) + goto end; + *(tmp->area + tmp->data) = '\0'; + chunk_appendf(out, "%s\n", tmp->area); + } + +end: + if (bio) + BIO_free(bio); + free_trash_chunk(tmp); + + return 0; +} + +/* + * Dump the OCSP certificate key (if it exists) of certificate <ckch> into + * buffer <out>. + * Returns 0 in case of success. 
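+ *
+ * The dump is a single line of hex digits, e.g. (made-up bytes):
+ *
+ *   OCSP Response Key: 303b300906052b0e03021a05000414...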
+ */ +static int ckch_store_show_ocsp_certid(struct ckch_store *ckch_store, struct buffer *out) +{ +#if ((defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) && !defined OPENSSL_IS_BORINGSSL) + unsigned char key[OCSP_MAX_CERTID_ASN1_LENGTH] = {}; + unsigned int key_length = 0; + int i; + + if (ssl_ocsp_build_response_key(ckch_store->data->ocsp_cid, (unsigned char*)key, &key_length) >= 0) { + /* Dump the CERTID info */ + chunk_appendf(out, "OCSP Response Key: "); + for (i = 0; i < key_length; ++i) { + chunk_appendf(out, "%02x", key[i]); + } + chunk_appendf(out, "\n"); + } +#endif + + return 0; +} + + +/* IO handler of the details "show ssl cert <filename>". + * It uses a struct show_cert_ctx and ckchs_transaction in read-only. + */ +static int cli_io_handler_show_cert_detail(struct appctx *appctx) +{ + struct show_cert_ctx *ctx = appctx->svcctx; + struct ckch_store *ckchs = ctx->cur_ckchs; + struct buffer *out = alloc_trash_chunk(); + int retval = 0; + + if (!out) + goto end_no_putchk; + + chunk_appendf(out, "Filename: "); + if (ckchs == ckchs_transaction.new_ckchs) + chunk_appendf(out, "*"); + chunk_appendf(out, "%s\n", ckchs->path); + + chunk_appendf(out, "Status: "); + if (ckchs->data->cert == NULL) + chunk_appendf(out, "Empty\n"); + else if (LIST_ISEMPTY(&ckchs->ckch_inst)) + chunk_appendf(out, "Unused\n"); + else + chunk_appendf(out, "Used\n"); + + retval = show_cert_detail(ckchs->data->cert, ckchs->data->chain, out); + if (retval < 0) + goto end_no_putchk; + else if (retval) + goto end; + + ckch_store_show_ocsp_certid(ckchs, out); + +end: + if (applet_putchk(appctx, out) == -1) + goto yield; + +end_no_putchk: + free_trash_chunk(out); + return 1; +yield: + free_trash_chunk(out); + return 0; /* should come back */ +} + + +/* IO handler of the details "show ssl cert <filename.ocsp>". + * It uses a show_cert_ctx. + */ +static int cli_io_handler_show_cert_ocsp_detail(struct appctx *appctx) +{ +#if ((defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) && !defined OPENSSL_IS_BORINGSSL) + struct show_cert_ctx *ctx = appctx->svcctx; + struct ckch_store *ckchs = ctx->cur_ckchs; + struct buffer *out = alloc_trash_chunk(); + int from_transaction = ctx->transaction; + + if (!out) + goto end_no_putchk; + + /* If we try to display an ongoing transaction's OCSP response, we + * need to dump the ckch's ocsp_response buffer directly. + * Otherwise, we must rebuild the certificate's certid in order to + * look for the current OCSP response in the tree. 
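+ * The key rebuilt here by ssl_ocsp_build_response_key() is derived from
+ * the certificate's OCSP_CERTID; it is the same key that indexes the
+ * OCSP response tree queried by ssl_get_ocspresponse_detail().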
*/ + if (from_transaction && ckchs->data->ocsp_response) { + if (ssl_ocsp_response_print(ckchs->data->ocsp_response, out)) + goto end_no_putchk; + } + else { + unsigned char key[OCSP_MAX_CERTID_ASN1_LENGTH] = {}; + unsigned int key_length = 0; + + if (ssl_ocsp_build_response_key(ckchs->data->ocsp_cid, (unsigned char*)key, &key_length) < 0) + goto end_no_putchk; + + if (ssl_get_ocspresponse_detail(key, out)) + goto end_no_putchk; + } + + if (applet_putchk(appctx, out) == -1) + goto yield; + +end_no_putchk: + free_trash_chunk(out); + return 1; +yield: + free_trash_chunk(out); + return 0; /* should come back */ +#else + return cli_err(appctx, "HAProxy was compiled against a version of OpenSSL that doesn't support OCSP stapling.\n"); +#endif +} + +/* parsing function for 'show ssl cert [certfile]' */ +static int cli_parse_show_cert(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_cert_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + struct ckch_store *ckchs; + + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return cli_err(appctx, "Can't allocate memory!\n"); + + /* The operations on the CKCH architecture are locked so we can + * manipulate ckch_store and ckch_inst */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't show!\nOperations on certificates are currently locked!\n"); + + /* check if there is a certificate to lookup */ + if (*args[3]) { + int show_ocsp_detail = 0; + int from_transaction = 0; + char *end; + + /* We manage the special case "certname.ocsp" through which we + * can show the details of an OCSP response. */ + end = strrchr(args[3], '.'); + if (end && strcmp(end+1, "ocsp") == 0) { + *end = '\0'; + show_ocsp_detail = 1; + } + + if (*args[3] == '*') { + from_transaction = 1; + if (!ckchs_transaction.new_ckchs) + goto error; + + ckchs = ckchs_transaction.new_ckchs; + + if (strcmp(args[3] + 1, ckchs->path) != 0) + goto error; + + } else { + if ((ckchs = ckchs_lookup(args[3])) == NULL) + goto error; + + } + + ctx->cur_ckchs = ckchs; + /* use the IO handler that shows details */ + if (show_ocsp_detail) { + ctx->transaction = from_transaction; + appctx->io_handler = cli_io_handler_show_cert_ocsp_detail; + } + else + appctx->io_handler = cli_io_handler_show_cert_detail; + } + + return 0; + +error: + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_err(appctx, "Can't display the certificate: Not found or the certificate is a bundle!\n"); +} + +/* release function of the `set ssl cert' command, free things and unlock the spinlock */ +static void cli_release_commit_cert(struct appctx *appctx) +{ + struct commit_cert_ctx *ctx = appctx->svcctx; + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + /* free every new sni_ctx and the new store, which are not in the trees so no spinlock there */ + if (ctx->new_ckchs) + ckch_store_free(ctx->new_ckchs); + ha_free(&ctx->err); +} + + +/* + * Rebuild a new instance 'new_inst' based on an old instance 'ckchi' and a + * specific ckch_store. + * Returns 0 in case of success, 1 otherwise. 
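+ *
+ * Sketch of the expected calling pattern (this is how the commit IO
+ * handlers below use it; <new_ckchs> is the duplicated store and
+ * <ckchi> an instance of the old one):
+ *
+ *   struct ckch_inst *new_inst;
+ *   if (ckch_inst_rebuild(new_ckchs, ckchi, &new_inst, &err))
+ *       goto error;
+ *   LIST_APPEND(&new_ckchs->ckch_inst, &new_inst->by_ckchs);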
+ */ +int ckch_inst_rebuild(struct ckch_store *ckch_store, struct ckch_inst *ckchi, + struct ckch_inst **new_inst, char **err) +{ + int retval = 0; + int errcode = 0; + struct sni_ctx *sc0, *sc0s; + char **sni_filter = NULL; + int fcount = 0; + + if (ckchi->crtlist_entry) { + sni_filter = ckchi->crtlist_entry->filters; + fcount = ckchi->crtlist_entry->fcount; + } + + if (ckchi->is_server_instance) + errcode |= ckch_inst_new_load_srv_store(ckch_store->path, ckch_store, new_inst, err); + else + errcode |= ckch_inst_new_load_store(ckch_store->path, ckch_store, ckchi->bind_conf, ckchi->ssl_conf, sni_filter, fcount, new_inst, err); + + if (errcode & ERR_CODE) + return 1; + + /* if the previous ckchi was used as the default */ + if (ckchi->is_default) + (*new_inst)->is_default = 1; + + (*new_inst)->is_server_instance = ckchi->is_server_instance; + (*new_inst)->server = ckchi->server; + /* Create a new SSL_CTX and link it to the new instance. */ + if ((*new_inst)->is_server_instance) { + retval = ssl_sock_prep_srv_ctx_and_inst(ckchi->server, (*new_inst)->ctx, (*new_inst)); + if (retval) + return 1; + } + + /* create the link to the crtlist_entry */ + (*new_inst)->crtlist_entry = ckchi->crtlist_entry; + + /* we need to initialize the generated SSL_CTXs */ + /* this iterates over the newly generated SNIs in the new instance to prepare their SSL_CTX */ + list_for_each_entry_safe(sc0, sc0s, &(*new_inst)->sni_ctx, by_ckch_inst) { + if (!sc0->order) { /* we initialized only the first SSL_CTX because it's the same in the other sni_ctx's */ + errcode |= ssl_sock_prep_ctx_and_inst(ckchi->bind_conf, ckchi->ssl_conf, sc0->ctx, *new_inst, err); + if (errcode & ERR_CODE) + return 1; + } + } + + return 0; +} + +/* + * Load all the new SNIs of a newly built ckch instance in the trees, or replace + * a server's main ckch instance. + */ +static void __ssl_sock_load_new_ckch_instance(struct ckch_inst *ckchi) +{ + /* The bind_conf will be null on server ckch_instances. */ + if (ckchi->is_server_instance) { + int i; + /* a lock is needed here since we have to free the SSL cache */ + HA_RWLOCK_WRLOCK(SSL_SERVER_LOCK, &ckchi->server->ssl_ctx.lock); + /* free the server's current SSL_CTX */ + SSL_CTX_free(ckchi->server->ssl_ctx.ctx); + /* Actual ssl context update */ + SSL_CTX_up_ref(ckchi->ctx); + ckchi->server->ssl_ctx.ctx = ckchi->ctx; + ckchi->server->ssl_ctx.inst = ckchi; + + /* flush the session cache of the server */ + for (i = 0; i < global.nbthread; i++) { + ha_free(&ckchi->server->ssl_ctx.reused_sess[i].sni); + ha_free(&ckchi->server->ssl_ctx.reused_sess[i].ptr); + } + HA_RWLOCK_WRUNLOCK(SSL_SERVER_LOCK, &ckchi->server->ssl_ctx.lock); + + } else { + HA_RWLOCK_WRLOCK(SNI_LOCK, &ckchi->bind_conf->sni_lock); + ssl_sock_load_cert_sni(ckchi, ckchi->bind_conf); + HA_RWLOCK_WRUNLOCK(SNI_LOCK, &ckchi->bind_conf->sni_lock); + } +} + +/* + * Delete a ckch instance that was replaced after a CLI command. + */ +static void __ckch_inst_free_locked(struct ckch_inst *ckchi) +{ + if (ckchi->is_server_instance) { + /* no lock for servers */ + ckch_inst_free(ckchi); + } else { + struct bind_conf __maybe_unused *bind_conf = ckchi->bind_conf; + + HA_RWLOCK_WRLOCK(SNI_LOCK, &bind_conf->sni_lock); + ckch_inst_free(ckchi); + HA_RWLOCK_WRUNLOCK(SNI_LOCK, &bind_conf->sni_lock); + } +} + +/* Replace a ckch_store in the ckch tree and insert all its dependencies, +* then free the previous dependencies and store. +* Used in the case of a certificate update. +* +* All dependencies must be allocated before using this function.
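+*
+* (The dependencies are everything hanging off the store: the ckch
+* instances, their sni_ctxs and the crtlist_entry links.)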
+* +* This function can't fail as it only updates pointers, and does not allocate anything. +* +* /!\ This function must be used under the ckch lock. /!\ +* +* - Insert every dependency (SNI, crtlist_entry, ckch_inst, etc.) +* - Delete the old ckch_store from the tree +* - Insert the new ckch_store +* - Free the old dependencies and the old ckch_store +*/ +void ckch_store_replace(struct ckch_store *old_ckchs, struct ckch_store *new_ckchs) +{ + struct crtlist_entry *entry; + struct ckch_inst *ckchi, *ckchis; + + LIST_SPLICE(&new_ckchs->crtlist_entry, &old_ckchs->crtlist_entry); + list_for_each_entry(entry, &new_ckchs->crtlist_entry, by_ckch_store) { + ebpt_delete(&entry->node); + /* change the ptr and reinsert the node */ + entry->node.key = new_ckchs; + ebpt_insert(&entry->crtlist->entries, &entry->node); + } + /* insert the new ckch_insts in the crtlist_entry */ + list_for_each_entry(ckchi, &new_ckchs->ckch_inst, by_ckchs) { + if (ckchi->crtlist_entry) + LIST_INSERT(&ckchi->crtlist_entry->ckch_inst, &ckchi->by_crtlist_entry); + } + /* First, we insert every new SNI in the trees, and also replace the default_ctx */ + list_for_each_entry_safe(ckchi, ckchis, &new_ckchs->ckch_inst, by_ckchs) { + __ssl_sock_load_new_ckch_instance(ckchi); + } + /* delete the old sni_ctx, the old ckch_insts and the ckch_store */ + list_for_each_entry_safe(ckchi, ckchis, &old_ckchs->ckch_inst, by_ckchs) { + __ckch_inst_free_locked(ckchi); + } + + ckch_store_free(old_ckchs); + ebst_insert(&ckchs_tree, &new_ckchs->node); +} + + +/* + * This function tries to create the new ckch_insts and their SNIs + * + * /!\ don't forget to update __hlua_ckch_commit() if you change things here. /!\ + */ +static int cli_io_handler_commit_cert(struct appctx *appctx) +{ + struct commit_cert_ctx *ctx = appctx->svcctx; + struct stconn *sc = appctx_sc(appctx); + int y = 0; + struct ckch_store *old_ckchs, *new_ckchs = NULL; + struct ckch_inst *ckchi; + + usermsgs_clr("CLI"); + /* FIXME: Don't watch the other side !*/ + if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE)) + goto end; + + while (1) { + switch (ctx->state) { + case CERT_ST_INIT: + /* This state just prints the update message */ + chunk_printf(&trash, "Committing %s", ckchs_transaction.path); + if (applet_putchk(appctx, &trash) == -1) + goto yield; + + ctx->state = CERT_ST_GEN; + __fallthrough; + case CERT_ST_GEN: + /* + * This state generates the ckch instances with their + * sni_ctxs and SSL_CTX. + * + * Since the SSL_CTX generation can be CPU-intensive, we + * yield every 10 instances.
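+ * Yielding is done by calling applet_have_more_data() and returning 0;
+ * ctx->next_ckchi keeps the resume point for the next call.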
+ */ + + old_ckchs = ctx->old_ckchs; + new_ckchs = ctx->new_ckchs; + + /* get the next ckchi to regenerate */ + ckchi = ctx->next_ckchi; + /* we didn't start yet, set it to the first elem */ + if (ckchi == NULL) + ckchi = LIST_ELEM(old_ckchs->ckch_inst.n, typeof(ckchi), by_ckchs); + + /* walk through the old ckch_insts and create new ones using the updated ckchs */ + list_for_each_entry_from(ckchi, &old_ckchs->ckch_inst, by_ckchs) { + struct ckch_inst *new_inst; + + /* save the next ckchi to compute in case of yield */ + ctx->next_ckchi = ckchi; + + /* it takes a lot of CPU to create SSL_CTXs, so we yield every 10 CKCH instances */ + if (y >= 10) { + applet_have_more_data(appctx); /* let's come back later */ + goto yield; + } + + /* display one dot per new instance */ + if (applet_putstr(appctx, ".") == -1) + goto yield; + + ctx->err = NULL; + if (ckch_inst_rebuild(new_ckchs, ckchi, &new_inst, &ctx->err)) { + ctx->state = CERT_ST_ERROR; + goto error; + } + + /* link the new ckch_inst to the duplicate */ + LIST_APPEND(&new_ckchs->ckch_inst, &new_inst->by_ckchs); + y++; + } + ctx->state = CERT_ST_INSERT; + __fallthrough; + case CERT_ST_INSERT: + /* The generation is finished, we can insert everything */ + + old_ckchs = ctx->old_ckchs; + new_ckchs = ctx->new_ckchs; + + /* insert everything and remove the previous objects */ + ckch_store_replace(old_ckchs, new_ckchs); + ctx->new_ckchs = ctx->old_ckchs = NULL; + ctx->state = CERT_ST_SUCCESS; + __fallthrough; + case CERT_ST_SUCCESS: + chunk_printf(&trash, "\n%sSuccess!\n", usermsgs_str()); + if (applet_putchk(appctx, &trash) == -1) + goto yield; + ctx->state = CERT_ST_FIN; + __fallthrough; + case CERT_ST_FIN: + /* the transaction is done, we can set everything to NULL */ + ckchs_transaction.new_ckchs = NULL; + ckchs_transaction.old_ckchs = NULL; + ckchs_transaction.path = NULL; + goto end; + + case CERT_ST_ERROR: + error: + chunk_printf(&trash, "\n%s%sFailed!\n", usermsgs_str(), ctx->err); + if (applet_putchk(appctx, &trash) == -1) + goto yield; + ctx->state = CERT_ST_FIN; + break; + } + } +end: + usermsgs_clr(NULL); + /* success: call the release function and don't come back */ + return 1; + +yield: + usermsgs_clr(NULL); + return 0; /* should come back */ +} + +/* + * Parsing function of 'commit ssl cert' + */ +static int cli_parse_commit_cert(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct commit_cert_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + char *err = NULL; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3]) + return cli_err(appctx, "'commit ssl cert' expects a filename\n"); + + /* The operations on the CKCH architecture are locked so we can + * manipulate ckch_store and ckch_inst */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't commit the certificate!\nOperations on certificates are currently locked!\n"); + + if (!ckchs_transaction.path) { + memprintf(&err, "No ongoing transaction
!\n"); + goto error; + } + + if (strcmp(ckchs_transaction.path, args[3]) != 0) { + memprintf(&err, "The ongoing transaction is about '%s' but you are trying to set '%s'\n", ckchs_transaction.path, args[3]); + goto error; + } + + /* if a certificate is here, a private key must be here too */ + if (ckchs_transaction.new_ckchs->data->cert && !ckchs_transaction.new_ckchs->data->key) { + memprintf(&err, "The transaction must contain at least a certificate and a private key!\n"); + goto error; + } + + if (!X509_check_private_key(ckchs_transaction.new_ckchs->data->cert, ckchs_transaction.new_ckchs->data->key)) { + memprintf(&err, "inconsistencies between private key and certificate loaded '%s'.\n", ckchs_transaction.path); + goto error; + } + + /* init the appctx structure */ + ctx->state = CERT_ST_INIT; + ctx->next_ckchi = NULL; + ctx->new_ckchs = ckchs_transaction.new_ckchs; + ctx->old_ckchs = ckchs_transaction.old_ckchs; + + /* we don't unlock there, it will be unlock after the IO handler, in the release handler */ + return 0; + +error: + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + err = memprintf(&err, "%sCan't commit %s!\n", err ? err : "", args[3]); + + return cli_dynerr(appctx, err); +} + + + + +/* + * Parsing function of `set ssl cert`, it updates or creates a temporary ckch. + * It uses a set_cert_ctx context, and ckchs_transaction under a lock. + */ +static int cli_parse_set_cert(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct ckch_store *new_ckchs = NULL; + struct ckch_store *old_ckchs = NULL; + char *err = NULL; + int i; + int errcode = 0; + char *end; + struct cert_exts *cert_ext = &cert_exts[0]; /* default one, PEM */ + struct ckch_data *data; + struct buffer *buf; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3] || !payload) + return cli_err(appctx, "'set ssl cert' expects a filename and a certificate as a payload\n"); + + /* The operations on the CKCH architecture are locked so we can + * manipulate ckch_store and ckch_inst */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't update the certificate!\nOperations on certificates are currently locked!\n"); + + if ((buf = alloc_trash_chunk()) == NULL) { + memprintf(&err, "%sCan't allocate memory\n", err ? err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + if (!chunk_strcpy(buf, args[3])) { + memprintf(&err, "%sCan't allocate memory\n", err ? err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + /* check which type of file we want to update */ + for (i = 0; cert_exts[i].ext != NULL; i++) { + end = strrchr(buf->area, '.'); + if (end && *cert_exts[i].ext && (strcmp(end + 1, cert_exts[i].ext) == 0)) { + *end = '\0'; + buf->data = strlen(buf->area); + cert_ext = &cert_exts[i]; + break; + } + } + + /* if there is an ongoing transaction */ + if (ckchs_transaction.path) { + /* if there is an ongoing transaction, check if this is the same file */ + if (strcmp(ckchs_transaction.path, buf->area) != 0) { + /* we didn't find the transaction, must try more cases below */ + + /* if the del-ext option is activated we should try to take a look at a ".crt" too. */ + if (cert_ext->type != CERT_TYPE_PEM && global_ssl.extra_files_noext) { + if (!chunk_strcat(buf, ".crt")) { + memprintf(&err, "%sCan't allocate memory\n", err ? 
err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + if (strcmp(ckchs_transaction.path, buf->area) != 0) { + /* remove .crt of the error message */ + *(b_orig(buf) + b_data(buf) + strlen(".crt")) = '\0'; + b_sub(buf, strlen(".crt")); + + memprintf(&err, "The ongoing transaction is about '%s' but you are trying to set '%s'\n", ckchs_transaction.path, buf->area); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + } + } + + old_ckchs = ckchs_transaction.new_ckchs; + + } else { + + /* lookup for the certificate in the tree */ + old_ckchs = ckchs_lookup(buf->area); + + if (!old_ckchs) { + /* if the del-ext option is activated we should try to take a look at a ".crt" too. */ + if (cert_ext->type != CERT_TYPE_PEM && global_ssl.extra_files_noext) { + if (!chunk_strcat(buf, ".crt")) { + memprintf(&err, "%sCan't allocate memory\n", err ? err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + old_ckchs = ckchs_lookup(buf->area); + } + } + } + + if (!old_ckchs) { + memprintf(&err, "%sCan't replace a certificate which is not referenced by the configuration!\n", + err ? err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + /* duplicate the ckch store */ + new_ckchs = ckchs_dup(old_ckchs); + if (!new_ckchs) { + memprintf(&err, "%sCannot allocate memory!\n", + err ? err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + /* Reset the OCSP CID */ + if (cert_ext->type == CERT_TYPE_PEM || cert_ext->type == CERT_TYPE_KEY || + cert_ext->type == CERT_TYPE_ISSUER) { + OCSP_CERTID_free(new_ckchs->data->ocsp_cid); + new_ckchs->data->ocsp_cid = NULL; + } + + data = new_ckchs->data; + + /* apply the change on the duplicate */ + if (cert_ext->load(buf->area, payload, data, &err) != 0) { + memprintf(&err, "%sCan't load the payload\n", err ? err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + /* we succeed, we can save the ckchs in the transaction */ + + /* if there wasn't a transaction, update the old ckchs */ + if (!ckchs_transaction.old_ckchs) { + ckchs_transaction.old_ckchs = old_ckchs; + ckchs_transaction.path = old_ckchs->path; + err = memprintf(&err, "Transaction created for certificate %s!\n", ckchs_transaction.path); + } else { + err = memprintf(&err, "Transaction updated for certificate %s!\n", ckchs_transaction.path); + + } + + /* free the previous ckchs if there was a transaction */ + ckch_store_free(ckchs_transaction.new_ckchs); + + ckchs_transaction.new_ckchs = new_ckchs; + + + /* creates the SNI ctxs later in the IO handler */ + +end: + free_trash_chunk(buf); + + if (errcode & ERR_CODE) { + ckch_store_free(new_ckchs); + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynerr(appctx, memprintf(&err, "%sCan't update %s!\n", err ? 
err : "", args[3])); + } else { + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynmsg(appctx, LOG_NOTICE, err); + } + /* TODO: handle the ERR_WARN which are not handled because of the io_handler */ +} + +/* parsing function of 'abort ssl cert' */ +static int cli_parse_abort_cert(char **args, char *payload, struct appctx *appctx, void *private) +{ + char *err = NULL; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3]) + return cli_err(appctx, "'abort ssl cert' expects a filename\n"); + + /* The operations on the CKCH architecture are locked so we can + * manipulate ckch_store and ckch_inst */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't abort!\nOperations on certificates are currently locked!\n"); + + if (!ckchs_transaction.path) { + memprintf(&err, "No ongoing transaction!\n"); + goto error; + } + + if (strcmp(ckchs_transaction.path, args[3]) != 0) { + memprintf(&err, "The ongoing transaction is about '%s' but you are trying to abort a transaction for '%s'\n", ckchs_transaction.path, args[3]); + goto error; + } + + /* Only free the ckchs there, because the SNI and instances were not generated yet */ + ckch_store_free(ckchs_transaction.new_ckchs); + ckchs_transaction.new_ckchs = NULL; + ckchs_transaction.old_ckchs = NULL; + ckchs_transaction.path = NULL; + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + + err = memprintf(&err, "Transaction aborted for certificate '%s'!\n", args[3]); + return cli_dynmsg(appctx, LOG_NOTICE, err); + +error: + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + + return cli_dynerr(appctx, err); +} + +/* parsing function of 'new ssl cert' */ +static int cli_parse_new_cert(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct ckch_store *store; + char *err = NULL; + char *path; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3]) + return cli_err(appctx, "'new ssl cert' expects a filename\n"); + + path = args[3]; + + /* The operations on the CKCH architecture are locked so we can + * manipulate ckch_store and ckch_inst */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't create a certificate!\nOperations on certificates are currently locked!\n"); + + store = ckchs_lookup(path); + if (store != NULL) { + memprintf(&err, "Certificate '%s' already exists!\n", path); + store = NULL; /* we don't want to free it */ + goto error; + } + /* we won't support multi-certificate bundle here */ + store = ckch_store_new(path); + if (!store) { + memprintf(&err, "unable to allocate memory.\n"); + goto error; + } + + /* insert into the ckchs tree */ + ebst_insert(&ckchs_tree, &store->node); + memprintf(&err, "New empty certificate store '%s'!\n", args[3]); + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynmsg(appctx, LOG_NOTICE, err); +error: + free(store); + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynerr(appctx, err); +} + +/* parsing function of 'del ssl cert' */ +static int cli_parse_del_cert(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct ckch_store *store; + char *err = NULL; + char *filename; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3]) + return cli_err(appctx, "'del ssl cert' expects a certificate name\n"); + + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't delete the certificate!\nOperations on certificates are currently locked!\n"); + + filename = args[3]; + + if (ckchs_transaction.path && strcmp(ckchs_transaction.path, 
filename) == 0) { + memprintf(&err, "ongoing transaction for the certificate '%s'", filename); + goto error; + } + + store = ckchs_lookup(filename); + if (store == NULL) { + memprintf(&err, "certificate '%s' doesn't exist!\n", filename); + goto error; + } + if (!LIST_ISEMPTY(&store->ckch_inst)) { + memprintf(&err, "certificate '%s' in use, can't be deleted!\n", filename); + goto error; + } + + ebmb_delete(&store->node); + ckch_store_free(store); + + memprintf(&err, "Certificate '%s' deleted!\n", filename); + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynmsg(appctx, LOG_NOTICE, err); + +error: + memprintf(&err, "Can't remove the certificate: %s\n", err ? err : ""); + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynerr(appctx, err); +} + + + +/* parsing function of 'new ssl ca-file' */ +static int cli_parse_new_cafile(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct cafile_entry *cafile_entry; + char *err = NULL; + char *path; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3]) + return cli_err(appctx, "'new ssl ca-file' expects a filename\n"); + + path = args[3]; + + /* The operations on the CKCH architecture are locked so we can + * manipulate ckch_store and ckch_inst */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't create a CA file!\nOperations on certificates are currently locked!\n"); + + cafile_entry = ssl_store_get_cafile_entry(path, 0); + if (cafile_entry) { + memprintf(&err, "CA file '%s' already exists!\n", path); + goto error; + } + + cafile_entry = ssl_store_create_cafile_entry(path, NULL, CAFILE_CERT); + if (!cafile_entry) { + memprintf(&err, "%sCannot allocate memory!\n", + err ? err : ""); + goto error; + } + + /* Add the newly created cafile_entry to the tree so that + * any new ckch instance created from now can use it. */ + if (ssl_store_add_uncommitted_cafile_entry(cafile_entry)) + goto error; + + memprintf(&err, "New CA file created '%s'!\n", path); + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynmsg(appctx, LOG_NOTICE, err); +error: + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynerr(appctx, err); +} + +/* + * Parsing function of `set ssl ca-file` + */ +static int cli_parse_set_cafile(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct cafile_entry *old_cafile_entry = NULL; + struct cafile_entry *new_cafile_entry = NULL; + char *err = NULL; + int errcode = 0; + struct buffer *buf; + int add_cmd = 0; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + /* this is "add ssl ca-file" */ + if (*args[0] == 'a') + add_cmd = 1; + + if (!*args[3] || !payload) + return cli_err(appctx, "'set ssl ca-file' expects a filename and CAs as a payload\n"); + + /* The operations on the CKCH architecture are locked so we can + * manipulate ckch_store and ckch_inst */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't update the CA file!\nOperations on certificates are currently locked!\n"); + + if ((buf = alloc_trash_chunk()) == NULL) { + memprintf(&err, "%sCan't allocate memory\n", err ? err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + if (!chunk_strcpy(buf, args[3])) { + memprintf(&err, "%sCan't allocate memory\n", err ? 
err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + old_cafile_entry = NULL; + new_cafile_entry = NULL; + + /* if there is an ongoing transaction */ + if (cafile_transaction.path) { + /* if there is an ongoing transaction, check if this is the same file */ + if (strcmp(cafile_transaction.path, buf->area) != 0) { + memprintf(&err, "The ongoing transaction is about '%s' but you are trying to set '%s'\n", cafile_transaction.path, buf->area); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + old_cafile_entry = cafile_transaction.old_cafile_entry; + } else { + /* lookup for the certificate in the tree */ + old_cafile_entry = ssl_store_get_cafile_entry(buf->area, 0); + } + + if (!old_cafile_entry) { + memprintf(&err, "%sCan't replace a CA file which is not referenced by the configuration!\n", + err ? err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + /* if the transaction is new, duplicate the old_ca_file_entry, otherwise duplicate the cafile in the current transaction */ + if (cafile_transaction.new_cafile_entry) + new_cafile_entry = ssl_store_dup_cafile_entry(cafile_transaction.new_cafile_entry); + else + new_cafile_entry = ssl_store_dup_cafile_entry(old_cafile_entry); + + if (!new_cafile_entry) { + memprintf(&err, "%sCan't allocate memory\n", err ? err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + /* Fill the new entry with the new CAs. The add_cmd variable determine + if we flush the X509_STORE or not */ + if (ssl_store_load_ca_from_buf(new_cafile_entry, payload, add_cmd)) { + memprintf(&err, "%sInvalid payload\n", err ? err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + /* we succeed, we can save the ca in the transaction */ + + /* if there wasn't a transaction, update the old CA */ + if (!cafile_transaction.old_cafile_entry) { + cafile_transaction.old_cafile_entry = old_cafile_entry; + cafile_transaction.path = old_cafile_entry->path; + err = memprintf(&err, "transaction created for CA %s!\n", cafile_transaction.path); + } else { + err = memprintf(&err, "transaction updated for CA %s!\n", cafile_transaction.path); + } + + /* free the previous CA if there was a transaction */ + ssl_store_delete_cafile_entry(cafile_transaction.new_cafile_entry); + + cafile_transaction.new_cafile_entry = new_cafile_entry; + + /* creates the SNI ctxs later in the IO handler */ + +end: + free_trash_chunk(buf); + + if (errcode & ERR_CODE) { + ssl_store_delete_cafile_entry(new_cafile_entry); + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynerr(appctx, memprintf(&err, "%sCan't update %s!\n", err ? err : "", args[3])); + } else { + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynmsg(appctx, LOG_NOTICE, err); + } +} + + +/* + * Parsing function of 'commit ssl ca-file'. + * It uses a commit_cacrlfile_ctx that's also shared with "commit ssl crl-file". 
+ */ +static int cli_parse_commit_cafile(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct commit_cacrlfile_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + char *err = NULL; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3]) + return cli_err(appctx, "'commit ssl ca-file' expects a filename\n"); + + /* The operations on the CKCH architecture are locked so we can + * manipulate ckch_store and ckch_inst */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't commit the CA file!\nOperations on certificates are currently locked!\n"); + + if (!cafile_transaction.path) { + memprintf(&err, "No ongoing transaction!\n"); + goto error; + } + + if (strcmp(cafile_transaction.path, args[3]) != 0) { + memprintf(&err, "The ongoing transaction is about '%s' but you are trying to set '%s'\n", cafile_transaction.path, args[3]); + goto error; + } + /* init the appctx structure */ + ctx->state = CACRL_ST_INIT; + ctx->next_ckchi_link = NULL; + ctx->old_entry = cafile_transaction.old_cafile_entry; + ctx->new_entry = cafile_transaction.new_cafile_entry; + ctx->cafile_type = CAFILE_CERT; + + return 0; + +error: + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + err = memprintf(&err, "%sCan't commit %s!\n", err ? err : "", args[3]); + + return cli_dynerr(appctx, err); +} + +/* + * This function tries to create new ckch instances and their SNIs using a newly + * set certificate authority (CA file) or a newly set Certificate Revocation + * List (CRL), depending on the command being called. + */ +static int cli_io_handler_commit_cafile_crlfile(struct appctx *appctx) +{ + struct commit_cacrlfile_ctx *ctx = appctx->svcctx; + struct stconn *sc = appctx_sc(appctx); + int y = 0; + struct cafile_entry *old_cafile_entry = ctx->old_entry; + struct cafile_entry *new_cafile_entry = ctx->new_entry; + struct ckch_inst_link *ckchi_link; + char *path; + + /* FIXME: Don't watch the other side !*/ + if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE)) + goto end; + + /* The ctx was already validated by the ca-file/crl-file parsing + * function. Entries can only be NULL in CACRL_ST_SUCCESS or + * CACRL_ST_FIN states. + */ + switch (ctx->cafile_type) { + case CAFILE_CERT: + path = cafile_transaction.path; + break; + case CAFILE_CRL: + path = crlfile_transaction.path; + break; + default: + path = NULL; + goto error; + } + + while (1) { + switch (ctx->state) { + case CACRL_ST_INIT: + /* This state just prints the update message */ + chunk_printf(&trash, "Committing %s", path); + if (applet_putchk(appctx, &trash) == -1) + goto yield; + + ctx->state = CACRL_ST_GEN; + __fallthrough; + case CACRL_ST_GEN: + /* + * This state generates the ckch instances with their + * sni_ctxs and SSL_CTX. + * + * Since the SSL_CTX generation can be CPU-intensive, we + * yield every 10 instances. + */ + + /* get the next ckchi to regenerate */ + ckchi_link = ctx->next_ckchi_link; + + /* we didn't start yet, set it to the first elem */ + if (ckchi_link == NULL) { + ckchi_link = LIST_ELEM(old_cafile_entry->ckch_inst_link.n, typeof(ckchi_link), list); + /* Add the newly created cafile_entry to the tree so that + * any new ckch instance created from now on can use it.
*/ + if (ssl_store_add_uncommitted_cafile_entry(new_cafile_entry)) { + ctx->state = CACRL_ST_ERROR; + goto error; + } + } + + list_for_each_entry_from(ckchi_link, &old_cafile_entry->ckch_inst_link, list) { + struct ckch_inst *new_inst; + + /* save the next ckchi to compute */ + ctx->next_ckchi_link = ckchi_link; + + /* it takes a lot of CPU to creates SSL_CTXs, so we yield every 10 CKCH instances */ + if (y >= 10) { + applet_have_more_data(appctx); /* let's come back later */ + goto yield; + } + + /* display one dot per new instance */ + if (applet_putstr(appctx, ".") == -1) + goto yield; + + /* Rebuild a new ckch instance that uses the same ckch_store + * than a reference ckchi instance but will use a new CA file. */ + ctx->err = NULL; + if (ckch_inst_rebuild(ckchi_link->ckch_inst->ckch_store, ckchi_link->ckch_inst, &new_inst, &ctx->err)) { + ctx->state = CACRL_ST_ERROR; + goto error; + } + + y++; + } + + ctx->state = CACRL_ST_INSERT; + __fallthrough; + case CACRL_ST_INSERT: + /* The generation is finished, we can insert everything */ + + /* insert the new ckch_insts in the crtlist_entry */ + list_for_each_entry(ckchi_link, &new_cafile_entry->ckch_inst_link, list) { + if (ckchi_link->ckch_inst->crtlist_entry) + LIST_INSERT(&ckchi_link->ckch_inst->crtlist_entry->ckch_inst, + &ckchi_link->ckch_inst->by_crtlist_entry); + } + + /* First, we insert every new SNIs in the trees, also replace the default_ctx */ + list_for_each_entry(ckchi_link, &new_cafile_entry->ckch_inst_link, list) { + __ssl_sock_load_new_ckch_instance(ckchi_link->ckch_inst); + } + + /* delete the old sni_ctx, the old ckch_insts + * and the ckch_store. ckch_inst_free() also + * manipulates the list so it's cleaner to loop + * until it's empty */ + while (!LIST_ISEMPTY(&old_cafile_entry->ckch_inst_link)) { + ckchi_link = LIST_ELEM(old_cafile_entry->ckch_inst_link.n, typeof(ckchi_link), list); + + LIST_DEL_INIT(&ckchi_link->list); /* must reinit because ckch_inst checks the list */ + __ckch_inst_free_locked(ckchi_link->ckch_inst); + free(ckchi_link); + } + + /* Remove the old cafile entry from the tree */ + ebmb_delete(&old_cafile_entry->node); + ssl_store_delete_cafile_entry(old_cafile_entry); + + ctx->old_entry = ctx->new_entry = NULL; + ctx->state = CACRL_ST_SUCCESS; + __fallthrough; + case CACRL_ST_SUCCESS: + if (applet_putstr(appctx, "\nSuccess!\n") == -1) + goto yield; + ctx->state = CACRL_ST_FIN; + __fallthrough; + case CACRL_ST_FIN: + /* we achieved the transaction, we can set everything to NULL */ + switch (ctx->cafile_type) { + case CAFILE_CERT: + cafile_transaction.old_cafile_entry = NULL; + cafile_transaction.new_cafile_entry = NULL; + cafile_transaction.path = NULL; + break; + case CAFILE_CRL: + crlfile_transaction.old_crlfile_entry = NULL; + crlfile_transaction.new_crlfile_entry = NULL; + crlfile_transaction.path = NULL; + break; + } + goto end; + + case CACRL_ST_ERROR: + error: + chunk_printf(&trash, "\n%sFailed!\n", ctx->err); + if (applet_putchk(appctx, &trash) == -1) + goto yield; + ctx->state = CACRL_ST_FIN; + break; + } + } +end: + /* success: call the release function and don't come back */ + return 1; +yield: + return 0; /* should come back */ +} + + +/* parsing function of 'abort ssl ca-file' */ +static int cli_parse_abort_cafile(char **args, char *payload, struct appctx *appctx, void *private) +{ + char *err = NULL; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3]) + return cli_err(appctx, "'abort ssl ca-file' expects a filename\n"); + + /* The operations on the CKCH 
architecture are locked so we can + * manipulate ckch_store and ckch_inst */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't abort!\nOperations on certificates are currently locked!\n"); + + if (!cafile_transaction.path) { + memprintf(&err, "No ongoing transaction!\n"); + goto error; + } + + if (strcmp(cafile_transaction.path, args[3]) != 0) { + memprintf(&err, "The ongoing transaction is about '%s' but you are trying to abort a transaction for '%s'\n", cafile_transaction.path, args[3]); + goto error; + } + + /* Only free the uncommitted cafile_entry here, because the SNI and instances were not generated yet */ + ssl_store_delete_cafile_entry(cafile_transaction.new_cafile_entry); + cafile_transaction.new_cafile_entry = NULL; + cafile_transaction.old_cafile_entry = NULL; + cafile_transaction.path = NULL; + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + + err = memprintf(&err, "Transaction aborted for certificate '%s'!\n", args[3]); + return cli_dynmsg(appctx, LOG_NOTICE, err); + +error: + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + + return cli_dynerr(appctx, err); +} + +/* release function of the `commit ssl ca-file' command, free things and unlock the spinlock. + * It uses a commit_cacrlfile_ctx context. + */ +static void cli_release_commit_cafile(struct appctx *appctx) +{ + struct commit_cacrlfile_ctx *ctx = appctx->svcctx; + struct cafile_entry *new_cafile_entry = ctx->new_entry; + + /* Remove the uncommitted cafile_entry from the tree. */ + if (new_cafile_entry) { + ebmb_delete(&new_cafile_entry->node); + ssl_store_delete_cafile_entry(new_cafile_entry); + } + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + ha_free(&ctx->err); +} + + +/* IO handler of details "show ssl ca-file <filename[:index]>". + * It uses a show_cafile_ctx context, and the global + * cafile_transaction.new_cafile_entry in read-only. 
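+ *
+ * For example, "show ssl ca-file ca.pem:2" dumps only the second
+ * certificate of ca.pem (indexes are 1-based on the CLI), while
+ * "show ssl ca-file ca.pem" dumps all of them.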
+ */ +static int cli_io_handler_show_cafile_detail(struct appctx *appctx) +{ + struct show_cafile_ctx *ctx = appctx->svcctx; + struct cafile_entry *cafile_entry = ctx->cur_cafile_entry; + struct buffer *out = alloc_trash_chunk(); + int i = 0; + X509 *cert; + STACK_OF(X509_OBJECT) *objs; + int retval = 0; + int ca_index = ctx->ca_index; + int show_all = ctx->show_all; + + if (!out) + goto end_no_putchk; + + chunk_appendf(out, "Filename: "); + if (cafile_entry == cafile_transaction.new_cafile_entry) + chunk_appendf(out, "*"); + chunk_appendf(out, "%s\n", cafile_entry->path); + + chunk_appendf(out, "Status: "); + if (!cafile_entry->ca_store) + chunk_appendf(out, "Empty\n"); + else if (LIST_ISEMPTY(&cafile_entry->ckch_inst_link)) + chunk_appendf(out, "Unused\n"); + else + chunk_appendf(out, "Used\n"); + + if (!cafile_entry->ca_store) + goto end; + + objs = X509_STORE_get0_objects(cafile_entry->ca_store); + for (i = ca_index; i < sk_X509_OBJECT_num(objs); i++) { + + cert = X509_OBJECT_get0_X509(sk_X509_OBJECT_value(objs, i)); + if (!cert) + continue; + + /* file starts at line 1 */ + chunk_appendf(out, " \nCertificate #%d:\n", i+1); + retval = show_cert_detail(cert, NULL, out); + if (retval < 0) + goto end_no_putchk; + else if (retval) + goto yield; + + if (applet_putchk(appctx, out) == -1) + goto yield; + + if (!show_all) /* only need to dump one certificate */ + goto end; + } + +end: + free_trash_chunk(out); + return 1; /* end, don't come back */ + +end_no_putchk: + free_trash_chunk(out); + return 1; +yield: + /* save the current state */ + ctx->ca_index = i; + free_trash_chunk(out); + return 0; /* should come back */ +} + + +/* parsing function for 'show ssl ca-file [cafile[:index]]'. + * It prepares a show_cafile_ctx context, and checks the global + * cafile_transaction under the ckch_lock (read only). + */ +static int cli_parse_show_cafile(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_cafile_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + struct cafile_entry *cafile_entry; + int ca_index = 0; + char *colons; + char *err = NULL; + + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return cli_err(appctx, "Can't allocate memory!\n"); + + /* The operations on the CKCH architecture are locked so we can + * manipulate ckch_store and ckch_inst */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't show!\nOperations on certificates are currently locked!\n"); + + ctx->show_all = 1; /* show all certificates */ + ctx->ca_index = 0; + /* check if there is a certificate to lookup */ + if (*args[3]) { + + /* Look for an optional CA index after the CA file name */ + colons = strchr(args[3], ':'); + if (colons) { + char *endptr; + + ca_index = strtol(colons + 1, &endptr, 10); + /* Indexes start at 1 */ + if (colons + 1 == endptr || *endptr != '\0' || ca_index <= 0) { + memprintf(&err, "wrong CA index after colons in '%s'!", args[3]); + goto error; + } + *colons = '\0'; + ctx->ca_index = ca_index - 1; /* we start counting at 0 in the ca_store, but at 1 on the CLI */ + ctx->show_all = 0; /* show only one certificate */ + } + + if (*args[3] == '*') { + if (!cafile_transaction.new_cafile_entry) + goto error; + + cafile_entry = cafile_transaction.new_cafile_entry; + + if (strcmp(args[3] + 1, cafile_entry->path) != 0) + goto error; + + } else { + /* Get the "original" cafile_entry and not the + * uncommitted one if it exists. 
*/ + if ((cafile_entry = ssl_store_get_cafile_entry(args[3], 1)) == NULL || cafile_entry->type != CAFILE_CERT) + goto error; + } + + ctx->cur_cafile_entry = cafile_entry; + /* use the IO handler that shows details */ + appctx->io_handler = cli_io_handler_show_cafile_detail; + } + + return 0; + +error: + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + if (err) + return cli_dynerr(appctx, err); + return cli_err(appctx, "Can't display the CA file : Not found!\n"); +} + + +/* release function of the 'show ssl ca-file' command */ +static void cli_release_show_cafile(struct appctx *appctx) +{ + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); +} + + +/* This function returns the number of certificates in a cafile_entry. */ +static int get_certificate_count(struct cafile_entry *cafile_entry) +{ + int cert_count = 0; + STACK_OF(X509_OBJECT) *objs; + + if (cafile_entry && cafile_entry->ca_store) { + objs = X509_STORE_get0_objects(cafile_entry->ca_store); + if (objs) + cert_count = sk_X509_OBJECT_num(objs); + } + return cert_count; +} + +/* IO handler of "show ssl ca-file". The command taking a specific CA file name + * is managed in cli_io_handler_show_cafile_detail. + * It uses a show_cafile_ctx and the global cafile_transaction.new_cafile_entry + * in read-only. + */ +static int cli_io_handler_show_cafile(struct appctx *appctx) +{ + struct show_cafile_ctx *ctx = appctx->svcctx; + struct buffer *trash = alloc_trash_chunk(); + struct ebmb_node *node; + struct cafile_entry *cafile_entry = NULL; + + if (trash == NULL) + return 1; + + if (!ctx->old_cafile_entry && cafile_transaction.old_cafile_entry) { + chunk_appendf(trash, "# transaction\n"); + chunk_appendf(trash, "*%s", cafile_transaction.old_cafile_entry->path); + chunk_appendf(trash, " - %d certificate(s)\n", get_certificate_count(cafile_transaction.new_cafile_entry)); + if (applet_putchk(appctx, trash) == -1) + goto yield; + ctx->old_cafile_entry = cafile_transaction.new_cafile_entry; + } + + /* First time in this io_handler. */ + if (!ctx->cur_cafile_entry) { + chunk_appendf(trash, "# filename\n"); + node = ebmb_first(&cafile_tree); + } else { + /* We yielded during a previous call. 
*/ + node = &ctx->cur_cafile_entry->node; + } + + while (node) { + cafile_entry = ebmb_entry(node, struct cafile_entry, node); + if (cafile_entry->type == CAFILE_CERT) { + chunk_appendf(trash, "%s", cafile_entry->path); + + chunk_appendf(trash, " - %d certificate(s)\n", get_certificate_count(cafile_entry)); + } + + node = ebmb_next(node); + if (applet_putchk(appctx, trash) == -1) + goto yield; + } + + ctx->cur_cafile_entry = NULL; + free_trash_chunk(trash); + return 1; +yield: + + free_trash_chunk(trash); + ctx->cur_cafile_entry = cafile_entry; + return 0; /* should come back */ +} + +/* parsing function of 'del ssl ca-file' */ +static int cli_parse_del_cafile(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct cafile_entry *cafile_entry; + char *err = NULL; + char *filename; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3]) + return cli_err(appctx, "'del ssl ca-file' expects a CA file name\n"); + + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't delete the CA file!\nOperations on certificates are currently locked!\n"); + + filename = args[3]; + + if (cafile_transaction.path && strcmp(cafile_transaction.path, filename) == 0) { + memprintf(&err, "ongoing transaction for the CA file '%s'", filename); + goto error; + } + + cafile_entry = ssl_store_get_cafile_entry(filename, 0); + if (!cafile_entry) { + memprintf(&err, "CA file '%s' doesn't exist!\n", filename); + goto error; + } + + if (!LIST_ISEMPTY(&cafile_entry->ckch_inst_link)) { + memprintf(&err, "CA file '%s' in use, can't be deleted!\n", filename); + goto error; + } + + /* Remove the cafile_entry from the tree */ + ebmb_delete(&cafile_entry->node); + ssl_store_delete_cafile_entry(cafile_entry); + + memprintf(&err, "CA file '%s' deleted!\n", filename); + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynmsg(appctx, LOG_NOTICE, err); + +error: + memprintf(&err, "Can't remove the CA file: %s\n", err ? err : ""); + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynerr(appctx, err); +} + +/* parsing function of 'new ssl crl-file' */ +static int cli_parse_new_crlfile(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct cafile_entry *cafile_entry; + char *err = NULL; + char *path; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3]) + return cli_err(appctx, "'new ssl crl-file' expects a filename\n"); + + path = args[3]; + + /* The operations on the CKCH architecture are locked so we can + * manipulate ckch_store and ckch_inst */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't create a CRL file!\nOperations on certificates are currently locked!\n"); + + cafile_entry = ssl_store_get_cafile_entry(path, 0); + if (cafile_entry) { + memprintf(&err, "CRL file '%s' already exists!\n", path); + goto error; + } + + cafile_entry = ssl_store_create_cafile_entry(path, NULL, CAFILE_CRL); + if (!cafile_entry) { + memprintf(&err, "%sCannot allocate memory!\n", err ? err : ""); + goto error; + } + + /* Add the newly created cafile_entry to the tree so that + * any new ckch instance created from now can use it. 
*/ + if (ssl_store_add_uncommitted_cafile_entry(cafile_entry)) + goto error; + + memprintf(&err, "New CRL file created '%s'!\n", path); + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynmsg(appctx, LOG_NOTICE, err); +error: + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynerr(appctx, err); +} + +/* Parsing function of `set ssl crl-file` */ +static int cli_parse_set_crlfile(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct cafile_entry *old_crlfile_entry = NULL; + struct cafile_entry *new_crlfile_entry = NULL; + char *err = NULL; + int errcode = 0; + struct buffer *buf; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3] || !payload) + return cli_err(appctx, "'set ssl crl-file' expects a filename and CRLs as a payload\n"); + + /* The operations on the CKCH architecture are locked so we can + * manipulate ckch_store and ckch_inst */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't update the CRL file!\nOperations on certificates are currently locked!\n"); + + if ((buf = alloc_trash_chunk()) == NULL) { + memprintf(&err, "%sCan't allocate memory\n", err ? err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + if (!chunk_strcpy(buf, args[3])) { + memprintf(&err, "%sCan't allocate memory\n", err ? err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + old_crlfile_entry = NULL; + new_crlfile_entry = NULL; + + /* if there is an ongoing transaction */ + if (crlfile_transaction.path) { + /* if there is an ongoing transaction, check if this is the same file */ + if (strcmp(crlfile_transaction.path, buf->area) != 0) { + memprintf(&err, "The ongoing transaction is about '%s' but you are trying to set '%s'\n", crlfile_transaction.path, buf->area); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + old_crlfile_entry = crlfile_transaction.old_crlfile_entry; + } + else { + /* lookup for the certificate in the tree */ + old_crlfile_entry = ssl_store_get_cafile_entry(buf->area, 0); + } + + if (!old_crlfile_entry) { + memprintf(&err, "%sCan't replace a CRL file which is not referenced by the configuration!\n", + err ? err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + /* Create a new cafile_entry without adding it to the cafile tree. */ + new_crlfile_entry = ssl_store_create_cafile_entry(old_crlfile_entry->path, NULL, CAFILE_CRL); + if (!new_crlfile_entry) { + memprintf(&err, "%sCannot allocate memory!\n", err ? err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + /* Fill the new entry with the new CRL. */ + if (ssl_store_load_ca_from_buf(new_crlfile_entry, payload, 0)) { + memprintf(&err, "%sInvalid payload\n", err ? 
err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + + /* we succeed, we can save the crl in the transaction */ + + /* if there wasn't a transaction, update the old CRL */ + if (!crlfile_transaction.old_crlfile_entry) { + crlfile_transaction.old_crlfile_entry = old_crlfile_entry; + crlfile_transaction.path = old_crlfile_entry->path; + err = memprintf(&err, "transaction created for CRL %s!\n", crlfile_transaction.path); + } else { + err = memprintf(&err, "transaction updated for CRL %s!\n", crlfile_transaction.path); + } + + /* free the previous CRL file if there was a transaction */ + ssl_store_delete_cafile_entry(crlfile_transaction.new_crlfile_entry); + + crlfile_transaction.new_crlfile_entry = new_crlfile_entry; + + /* creates the SNI ctxs later in the IO handler */ + +end: + free_trash_chunk(buf); + + if (errcode & ERR_CODE) { + ssl_store_delete_cafile_entry(new_crlfile_entry); + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynerr(appctx, memprintf(&err, "%sCan't update %s!\n", err ? err : "", args[3])); + } else { + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynmsg(appctx, LOG_NOTICE, err); + } +} + +/* Parsing function of 'commit ssl crl-file'. + * It uses a commit_cacrlfile_ctx that's also shared with "commit ssl ca-file". + */ +static int cli_parse_commit_crlfile(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct commit_cacrlfile_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + char *err = NULL; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3]) + return cli_err(appctx, "'commit ssl ca-file' expects a filename\n"); + + /* The operations on the CKCH architecture are locked so we can + * manipulate ckch_store and ckch_inst */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't commit the CRL file!\nOperations on certificates are currently locked!\n"); + + if (!crlfile_transaction.path) { + memprintf(&err, "No ongoing transaction! !\n"); + goto error; + } + + if (strcmp(crlfile_transaction.path, args[3]) != 0) { + memprintf(&err, "The ongoing transaction is about '%s' but you are trying to set '%s'\n", crlfile_transaction.path, args[3]); + goto error; + } + /* init the appctx structure */ + ctx->state = CACRL_ST_INIT; + ctx->next_ckchi_link = NULL; + ctx->old_entry = crlfile_transaction.old_crlfile_entry; + ctx->new_entry = crlfile_transaction.new_crlfile_entry; + ctx->cafile_type = CAFILE_CRL; + + return 0; + +error: + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + err = memprintf(&err, "%sCan't commit %s!\n", err ? err : "", args[3]); + + return cli_dynerr(appctx, err); +} + + +/* release function of the `commit ssl crl-file' command, free things and unlock the spinlock. + * it uses a commit_cacrlfile_ctx that's the same as for "commit ssl ca-file". + */ +static void cli_release_commit_crlfile(struct appctx *appctx) +{ + struct commit_cacrlfile_ctx *ctx = appctx->svcctx; + struct cafile_entry *new_crlfile_entry = ctx->new_entry; + + /* Remove the uncommitted cafile_entry from the tree. 
*/ + if (new_crlfile_entry) { + ebmb_delete(&new_crlfile_entry->node); + ssl_store_delete_cafile_entry(new_crlfile_entry); + } + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + ha_free(&ctx->err); +} + +/* parsing function of 'del ssl crl-file' */ +static int cli_parse_del_crlfile(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct cafile_entry *cafile_entry; + char *err = NULL; + char *filename; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3]) + return cli_err(appctx, "'del ssl crl-file' expects a CRL file name\n"); + + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't delete the CRL file!\nOperations on certificates are currently locked!\n"); + + filename = args[3]; + + if (crlfile_transaction.path && strcmp(crlfile_transaction.path, filename) == 0) { + memprintf(&err, "ongoing transaction for the CRL file '%s'", filename); + goto error; + } + + cafile_entry = ssl_store_get_cafile_entry(filename, 0); + if (!cafile_entry) { + memprintf(&err, "CRL file '%s' doesn't exist!\n", filename); + goto error; + } + if (cafile_entry->type != CAFILE_CRL) { + memprintf(&err, "'del ssl crl-file' does not work on CA files!\n"); + goto error; + } + + if (!LIST_ISEMPTY(&cafile_entry->ckch_inst_link)) { + memprintf(&err, "CRL file '%s' in use, can't be deleted!\n", filename); + goto error; + } + + /* Remove the cafile_entry from the tree */ + ebmb_delete(&cafile_entry->node); + ssl_store_delete_cafile_entry(cafile_entry); + + memprintf(&err, "CRL file '%s' deleted!\n", filename); + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynmsg(appctx, LOG_NOTICE, err); + +error: + memprintf(&err, "Can't remove the CRL file: %s\n", err ? err : ""); + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + return cli_dynerr(appctx, err); +} + +/* parsing function of 'abort ssl crl-file' */ +static int cli_parse_abort_crlfile(char **args, char *payload, struct appctx *appctx, void *private) +{ + char *err = NULL; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3]) + return cli_err(appctx, "'abort ssl crl-file' expects a filename\n"); + + /* The operations on the CKCH architecture are locked so we can + * manipulate ckch_store and ckch_inst */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't abort!\nOperations on certificates are currently locked!\n"); + + if (!crlfile_transaction.path) { + memprintf(&err, "No ongoing transaction!\n"); + goto error; + } + + if (strcmp(crlfile_transaction.path, args[3]) != 0) { + memprintf(&err, "The ongoing transaction is about '%s' but you are trying to abort a transaction for '%s'\n", crlfile_transaction.path, args[3]); + goto error; + } + + /* Only free the uncommitted cafile_entry here, because the SNI and instances were not generated yet */ + ssl_store_delete_cafile_entry(crlfile_transaction.new_crlfile_entry); + crlfile_transaction.new_crlfile_entry = NULL; + crlfile_transaction.old_crlfile_entry = NULL; + crlfile_transaction.path = NULL; + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + + err = memprintf(&err, "Transaction aborted for CRL file '%s'!\n", args[3]); + return cli_dynmsg(appctx, LOG_NOTICE, err); + +error: + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + + return cli_dynerr(appctx, err); +} + + +/* + * Display a Certificate Revocation List's information. + * The information displayed is inspired by the output of 'openssl crl -in + * crl.pem -text'. + * Returns 0 in case of success. 
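+ * Illustrative output (all values are made up): + *   Version 1 + *   Signature Algorithm: sha256WithRSAEncryption + *   Issuer: /C=FR/O=Example/CN=Example CA + *   Last Update: Apr  1 12:00:00 2023 GMT + *   Next Update: Apr  1 12:00:00 2024 GMT + *   Revoked Certificates: + *     Serial Number: 1000 + *     Revocation Date: May  2 12:00:00 2023 GMT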
+ */ +static int show_crl_detail(X509_CRL *crl, struct buffer *out) +{ + BIO *bio = NULL; + struct buffer *tmp = alloc_trash_chunk(); + long version; + X509_NAME *issuer; + int write = -1; +#ifndef USE_OPENSSL_WOLFSSL + STACK_OF(X509_REVOKED) *rev = NULL; + X509_REVOKED *rev_entry = NULL; + int i; +#endif + + if (!tmp) + return -1; + + if ((bio = BIO_new(BIO_s_mem())) == NULL) + goto end; + + /* Version (as displayed by 'openssl crl') */ + version = X509_CRL_get_version(crl); + chunk_appendf(out, "Version %ld\n", version + 1); + + /* Signature Algorithm */ + chunk_appendf(out, "Signature Algorithm: %s\n", OBJ_nid2ln(X509_CRL_get_signature_nid(crl))); + + /* Issuer */ + chunk_appendf(out, "Issuer: "); + if ((issuer = X509_CRL_get_issuer(crl)) == NULL) + goto end; + if ((ssl_sock_get_dn_oneline(issuer, tmp)) == -1) + goto end; + *(tmp->area + tmp->data) = '\0'; + chunk_appendf(out, "%s\n", tmp->area); + + /* Last Update */ + chunk_appendf(out, "Last Update: "); + chunk_reset(tmp); + if (BIO_reset(bio) == -1) + goto end; + if (ASN1_TIME_print(bio, X509_CRL_get0_lastUpdate(crl)) == 0) + goto end; + write = BIO_read(bio, tmp->area, tmp->size-1); + tmp->area[write] = '\0'; + chunk_appendf(out, "%s\n", tmp->area); + + + /* Next Update */ + chunk_appendf(out, "Next Update: "); + chunk_reset(tmp); + if (BIO_reset(bio) == -1) + goto end; + if (ASN1_TIME_print(bio, X509_CRL_get0_nextUpdate(crl)) == 0) + goto end; + write = BIO_read(bio, tmp->area, tmp->size-1); + tmp->area[write] = '\0'; + chunk_appendf(out, "%s\n", tmp->area); + +#ifndef USE_OPENSSL_WOLFSSL + /* Revoked Certificates */ + rev = X509_CRL_get_REVOKED(crl); + if (sk_X509_REVOKED_num(rev) > 0) + chunk_appendf(out, "Revoked Certificates:\n"); + else + chunk_appendf(out, "No Revoked Certificates.\n"); + + for (i = 0; i < sk_X509_REVOKED_num(rev); i++) { + rev_entry = sk_X509_REVOKED_value(rev, i); + + /* Serial Number and Revocation Date */ + if (BIO_reset(bio) == -1) + goto end; + BIO_printf(bio, " Serial Number: "); + i2a_ASN1_INTEGER(bio, (ASN1_INTEGER*)X509_REVOKED_get0_serialNumber(rev_entry)); + BIO_printf(bio, "\n Revocation Date: "); + if (ASN1_TIME_print(bio, X509_REVOKED_get0_revocationDate(rev_entry)) == 0) + goto end; + BIO_printf(bio, "\n"); + + write = BIO_read(bio, tmp->area, tmp->size-1); + tmp->area[write] = '\0'; + chunk_appendf(out, "%s", tmp->area); + } +#endif /* not USE_OPENSSL_WOLFSSL */ + +end: + free_trash_chunk(tmp); + if (bio) + BIO_free(bio); + + return 0; +} + +/* IO handler of details "show ssl crl-file <filename[:index]>". + * It uses show_crlfile_ctx and the global + * crlfile_transaction.new_crlfile_entry in read-only. 
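+ * Illustrative session (the socket path is an example): + *   $ echo "show ssl crl-file crl.pem:1" | socat /var/run/haproxy.sock - + * dumps only the first CRL of the file; prefixing the name with '*' + * addresses the uncommitted transaction entry instead.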
+ */ +static int cli_io_handler_show_crlfile_detail(struct appctx *appctx) +{ + struct show_crlfile_ctx *ctx = appctx->svcctx; + struct cafile_entry *cafile_entry = ctx->cafile_entry; + struct buffer *out = alloc_trash_chunk(); + int i; + X509_CRL *crl; + STACK_OF(X509_OBJECT) *objs; + int retval = 0; + int index = ctx->index; + + if (!out) + goto end_no_putchk; + + chunk_appendf(out, "Filename: "); + if (cafile_entry == crlfile_transaction.new_crlfile_entry) + chunk_appendf(out, "*"); + chunk_appendf(out, "%s\n", cafile_entry->path); + + chunk_appendf(out, "Status: "); + if (!cafile_entry->ca_store) + chunk_appendf(out, "Empty\n"); + else if (LIST_ISEMPTY(&cafile_entry->ckch_inst_link)) + chunk_appendf(out, "Unused\n"); + else + chunk_appendf(out, "Used\n"); + + if (!cafile_entry->ca_store) + goto end; + + objs = X509_STORE_get0_objects(cafile_entry->ca_store); + for (i = 0; i < sk_X509_OBJECT_num(objs); i++) { + crl = X509_OBJECT_get0_X509_CRL(sk_X509_OBJECT_value(objs, i)); + if (!crl) + continue; + + /* CRL indexes start at 1 on the CLI output. */ + if (index && index-1 != i) + continue; + + chunk_appendf(out, " \nCertificate Revocation List #%d:\n", i+1); + retval = show_crl_detail(crl, out); + if (retval < 0) + goto end_no_putchk; + else if (retval || index) + goto end; + } + +end: + if (applet_putchk(appctx, out) == -1) + goto yield; + +end_no_putchk: + free_trash_chunk(out); + return 1; +yield: + free_trash_chunk(out); + return 0; /* should come back */ +} + +/* parsing function for 'show ssl crl-file [crlfile[:index]]'. + * It sets the context to a show_crlfile_ctx, and the global + * crlfile_transaction.new_crlfile_entry under the ckch_lock. + */ +static int cli_parse_show_crlfile(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_crlfile_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + struct cafile_entry *cafile_entry; + long index = 0; + char *colons; + char *err = NULL; + + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return 1; + + /* The operations on the CKCH architecture are locked so we can + * manipulate ckch_store and ckch_inst */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't show!\nOperations on certificates are currently locked!\n"); + + /* check if there is a certificate to look up */ + if (*args[3]) { + + /* Look for an optional index after the CRL file name */ + colons = strchr(args[3], ':'); + if (colons) { + char *endptr; + + index = strtol(colons + 1, &endptr, 10); + /* Indexes start at 1 */ + if (colons + 1 == endptr || *endptr != '\0' || index <= 0) { + memprintf(&err, "wrong CRL index after the colon in '%s'!", args[3]); + goto error; + } + *colons = '\0'; + } + + if (*args[3] == '*') { + if (!crlfile_transaction.new_crlfile_entry) + goto error; + + cafile_entry = crlfile_transaction.new_crlfile_entry; + + if (strcmp(args[3] + 1, cafile_entry->path) != 0) + goto error; + + } else { + /* Get the "original" cafile_entry and not the + * uncommitted one if it exists. 
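+ * (The second argument of ssl_store_get_cafile_entry() is assumed to + * request the oldest entry, skipping the transaction's pending one.)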
*/ + if ((cafile_entry = ssl_store_get_cafile_entry(args[3], 1)) == NULL || cafile_entry->type != CAFILE_CRL) + goto error; + } + + ctx->cafile_entry = cafile_entry; + ctx->index = index; + /* use the IO handler that shows details */ + appctx->io_handler = cli_io_handler_show_crlfile_detail; + } + + return 0; + +error: + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + if (err) + return cli_dynerr(appctx, err); + return cli_err(appctx, "Can't display the CRL file : Not found!\n"); +} + +/* IO handler of "show ssl crl-file". The command taking a specific CRL file name + * is managed in cli_io_handler_show_crlfile_detail. */ +static int cli_io_handler_show_crlfile(struct appctx *appctx) +{ + struct show_crlfile_ctx *ctx = appctx->svcctx; + struct buffer *trash = alloc_trash_chunk(); + struct ebmb_node *node; + struct cafile_entry *cafile_entry = NULL; + + if (trash == NULL) + return 1; + + if (!ctx->old_crlfile_entry && crlfile_transaction.old_crlfile_entry) { + chunk_appendf(trash, "# transaction\n"); + chunk_appendf(trash, "*%s\n", crlfile_transaction.old_crlfile_entry->path); + if (applet_putchk(appctx, trash) == -1) + goto yield; + ctx->old_crlfile_entry = crlfile_transaction.old_crlfile_entry; + } + + /* First time in this io_handler. */ + if (!ctx->cafile_entry) { + chunk_appendf(trash, "# filename\n"); + node = ebmb_first(&cafile_tree); + } else { + /* We yielded during a previous call. */ + node = &ctx->cafile_entry->node; + } + + while (node) { + cafile_entry = ebmb_entry(node, struct cafile_entry, node); + if (cafile_entry->type == CAFILE_CRL) { + chunk_appendf(trash, "%s\n", cafile_entry->path); + } + + node = ebmb_next(node); + if (applet_putchk(appctx, trash) == -1) + goto yield; + } + + ctx->cafile_entry = NULL; + free_trash_chunk(trash); + return 1; +yield: + + free_trash_chunk(trash); + ctx->cafile_entry = cafile_entry; + return 0; /* should come back */ +} + + +/* release function of the 'show ssl crl-file' command */ +static void cli_release_show_crlfile(struct appctx *appctx) +{ + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); +} + + +void ckch_deinit() +{ + struct eb_node *node, *next; + struct ckch_store *store; + struct ebmb_node *canode; + + /* deinit the ckch stores */ + node = eb_first(&ckchs_tree); + while (node) { + next = eb_next(node); + store = ebmb_entry(node, struct ckch_store, node); + ckch_store_free(store); + node = next; + } + + /* deinit the ca-file store */ + canode = ebmb_first(&cafile_tree); + while (canode) { + struct cafile_entry *entry = NULL; + + entry = ebmb_entry(canode, struct cafile_entry, node); + canode = ebmb_next(canode); + ebmb_delete(&entry->node); + ssl_store_delete_cafile_entry(entry); + } +} + +/* register cli keywords */ +static struct cli_kw_list cli_kws = {{ },{ + { { "new", "ssl", "cert", NULL }, "new ssl cert <certfile> : create a new certificate file to be used in a crt-list or a directory", cli_parse_new_cert, NULL, NULL }, + { { "set", "ssl", "cert", NULL }, "set ssl cert <certfile> <payload> : replace a certificate file", cli_parse_set_cert, NULL, NULL }, + { { "commit", "ssl", "cert", NULL }, "commit ssl cert <certfile> : commit a certificate file", cli_parse_commit_cert, cli_io_handler_commit_cert, cli_release_commit_cert }, + { { "abort", "ssl", "cert", NULL }, "abort ssl cert <certfile> : abort a transaction for a certificate file", cli_parse_abort_cert, NULL, NULL }, + { { "del", "ssl", "cert", NULL }, "del ssl cert <certfile> : delete an unused certificate file", cli_parse_del_cert, NULL, NULL }, + { { "show", "ssl", "cert", NULL }, 
"show ssl cert [<certfile>] : display the SSL certificates used in memory, or the details of a file", cli_parse_show_cert, cli_io_handler_show_cert, cli_release_show_cert }, + + { { "new", "ssl", "ca-file", NULL }, "new ssl ca-file <cafile> : create a new CA file to be used in a crt-list", cli_parse_new_cafile, NULL, NULL }, + { { "add", "ssl", "ca-file", NULL }, "add ssl ca-file <cafile> <payload> : add a certificate into the CA file", cli_parse_set_cafile, NULL, NULL }, + { { "set", "ssl", "ca-file", NULL }, "set ssl ca-file <cafile> <payload> : replace a CA file", cli_parse_set_cafile, NULL, NULL }, + { { "commit", "ssl", "ca-file", NULL }, "commit ssl ca-file <cafile> : commit a CA file", cli_parse_commit_cafile, cli_io_handler_commit_cafile_crlfile, cli_release_commit_cafile }, + { { "abort", "ssl", "ca-file", NULL }, "abort ssl ca-file <cafile> : abort a transaction for a CA file", cli_parse_abort_cafile, NULL, NULL }, + { { "del", "ssl", "ca-file", NULL }, "del ssl ca-file <cafile> : delete an unused CA file", cli_parse_del_cafile, NULL, NULL }, + { { "show", "ssl", "ca-file", NULL }, "show ssl ca-file [<cafile>[:<index>]] : display the SSL CA files used in memory, or the details of a <cafile>, or a single certificate of index <index> of a CA file <cafile>", cli_parse_show_cafile, cli_io_handler_show_cafile, cli_release_show_cafile }, + + { { "new", "ssl", "crl-file", NULL }, "new ssl crlfile <crlfile> : create a new CRL file to be used in a crt-list", cli_parse_new_crlfile, NULL, NULL }, + { { "set", "ssl", "crl-file", NULL }, "set ssl crl-file <crlfile> <payload> : replace a CRL file", cli_parse_set_crlfile, NULL, NULL }, + { { "commit", "ssl", "crl-file", NULL },"commit ssl crl-file <crlfile> : commit a CRL file", cli_parse_commit_crlfile, cli_io_handler_commit_cafile_crlfile, cli_release_commit_crlfile }, + { { "abort", "ssl", "crl-file", NULL }, "abort ssl crl-file <crlfile> : abort a transaction for a CRL file", cli_parse_abort_crlfile, NULL, NULL }, + { { "del", "ssl", "crl-file", NULL }, "del ssl crl-file <crlfile> : delete an unused CRL file", cli_parse_del_crlfile, NULL, NULL }, + { { "show", "ssl", "crl-file", NULL }, "show ssl crl-file [<crlfile[:<index>>]] : display the SSL CRL files used in memory, or the details of a <crlfile>, or a single CRL of index <index> of CRL file <crlfile>", cli_parse_show_crlfile, cli_io_handler_show_crlfile, cli_release_show_crlfile }, + { { NULL }, NULL, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + diff --git a/src/ssl_crtlist.c b/src/ssl_crtlist.c new file mode 100644 index 0000000..dcd9171 --- /dev/null +++ b/src/ssl_crtlist.c @@ -0,0 +1,1577 @@ +/* + * + * Copyright (C) 2020 HAProxy Technologies, William Lallemand <wlallemand@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ +#include <sys/stat.h> +#include <sys/types.h> + +#include <dirent.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <syslog.h> + +#include <import/ebpttree.h> +#include <import/ebsttree.h> + +#include <haproxy/applet.h> +#include <haproxy/channel.h> +#include <haproxy/cli.h> +#include <haproxy/errors.h> +#include <haproxy/sc_strm.h> +#include <haproxy/ssl_ckch.h> +#include <haproxy/ssl_crtlist.h> +#include <haproxy/ssl_ocsp.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/stconn.h> +#include <haproxy/tools.h> + +/* CLI context for "show ssl crt-list" or "dump ssl crt-list" */ +struct show_crtlist_ctx { + struct ebmb_node *crtlist_node; /* ebmb_node for the current crtlist */ + struct crtlist_entry *entry; /* current entry */ + int mode; /* 'd' for dump, 's' for show */ +}; + +/* CLI context for "add ssl crt-list" */ +struct add_crtlist_ctx { + struct crtlist *crtlist; + struct crtlist_entry *entry; + struct bind_conf_list *bind_conf_node; + char *err; + enum { + ADDCRT_ST_INIT = 0, + ADDCRT_ST_GEN, + ADDCRT_ST_INSERT, + ADDCRT_ST_SUCCESS, + ADDCRT_ST_ERROR, + ADDCRT_ST_FIN, + } state; +}; + +/* release ssl bind conf */ +void ssl_sock_free_ssl_conf(struct ssl_bind_conf *conf) +{ + if (conf) { +#if defined(OPENSSL_NPN_NEGOTIATED) && !defined(OPENSSL_NO_NEXTPROTONEG) + ha_free(&conf->npn_str); +#endif +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation + ha_free(&conf->alpn_str); +#endif + ha_free(&conf->ca_file); + ha_free(&conf->ca_verify_file); + ha_free(&conf->crl_file); + ha_free(&conf->ciphers); +#ifdef HAVE_SSL_CTX_SET_CIPHERSUITES + ha_free(&conf->ciphersuites); +#endif + ha_free(&conf->curves); + ha_free(&conf->ecdhe); +#if defined(SSL_CTX_set1_sigalgs_list) + ha_free(&conf->sigalgs); +#endif +#if defined(SSL_CTX_set1_client_sigalgs_list) + ha_free(&conf->client_sigalgs); +#endif + } +} + +/* + * Allocate and copy a ssl_bind_conf structure + */ +struct ssl_bind_conf *crtlist_dup_ssl_conf(struct ssl_bind_conf *src) +{ + struct ssl_bind_conf *dst; + + if (!src) + return NULL; + + dst = calloc(1, sizeof(*dst)); + if (!dst) + return NULL; + +#if defined(OPENSSL_NPN_NEGOTIATED) && !defined(OPENSSL_NO_NEXTPROTONEG) + if (src->npn_str) { + dst->npn_str = strdup(src->npn_str); + if (!dst->npn_str) + goto error; + } +#endif +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation + if (src->alpn_str) { + dst->alpn_str = strdup(src->alpn_str); + if (!dst->alpn_str) + goto error; + } +#endif + if (src->ca_file) { + dst->ca_file = strdup(src->ca_file); + if (!dst->ca_file) + goto error; + } + if (src->ca_verify_file) { + dst->ca_verify_file = strdup(src->ca_verify_file); + if (!dst->ca_verify_file) + goto error; + } + if (src->crl_file) { + dst->crl_file = strdup(src->crl_file); + if (!dst->crl_file) + goto error; + } + if (src->ciphers) { + dst->ciphers = strdup(src->ciphers); + if (!dst->ciphers) + goto error; + } +#ifdef HAVE_SSL_CTX_SET_CIPHERSUITES + if (src->ciphersuites) { + dst->ciphersuites = strdup(src->ciphersuites); + if (!dst->ciphersuites) + goto error; + } +#endif + if (src->curves) { + dst->curves = strdup(src->curves); + if (!dst->curves) + goto error; + } + if (src->ecdhe) { + dst->ecdhe = strdup(src->ecdhe); + if (!dst->ecdhe) + goto error; + } + + dst->ssl_methods_cfg.flags = src->ssl_methods_cfg.flags; + dst->ssl_methods_cfg.min = src->ssl_methods_cfg.min; + dst->ssl_methods_cfg.max = src->ssl_methods_cfg.max; + + dst->ssl_methods.flags = src->ssl_methods.flags; + dst->ssl_methods.min = src->ssl_methods.min; + 
dst->ssl_methods.max = src->ssl_methods.max; + +#if defined(SSL_CTX_set1_sigalgs_list) + if (src->sigalgs) { + dst->sigalgs = strdup(src->sigalgs); + if (!dst->sigalgs) + goto error; + } +#endif +#if defined(SSL_CTX_set1_client_sigalgs_list) + if (src->client_sigalgs) { + dst->client_sigalgs = strdup(src->client_sigalgs); + if (!dst->client_sigalgs) + goto error; + } +#endif + return dst; + +error: + ssl_sock_free_ssl_conf(dst); + free(dst); + + return NULL; +} + +/* free sni filters */ +void crtlist_free_filters(char **args) +{ + int i; + + if (!args) + return; + + for (i = 0; args[i]; i++) + free(args[i]); + + free(args); +} + +/* Alloc and duplicate a char ** array */ +char **crtlist_dup_filters(char **args, int fcount) +{ + char **dst; + int i; + + if (fcount == 0) + return NULL; + + dst = calloc(fcount + 1, sizeof(*dst)); + if (!dst) + return NULL; + + for (i = 0; i < fcount; i++) { + dst[i] = strdup(args[i]); + if (!dst[i]) + goto error; + } + return dst; + +error: + crtlist_free_filters(dst); + return NULL; +} + +/* + * Detach and free a crtlist_entry. + * Free the filters, the ssl_conf and call ckch_inst_free() for each ckch_inst + */ +void crtlist_entry_free(struct crtlist_entry *entry) +{ + struct ckch_inst *inst, *inst_s; + + if (entry == NULL) + return; + + ebpt_delete(&entry->node); + LIST_DELETE(&entry->by_crtlist); + LIST_DELETE(&entry->by_ckch_store); + crtlist_free_filters(entry->filters); + ssl_sock_free_ssl_conf(entry->ssl_conf); + free(entry->ssl_conf); + list_for_each_entry_safe(inst, inst_s, &entry->ckch_inst, by_crtlist_entry) { + ckch_inst_free(inst); + } + free(entry); +} +/* + * Duplicate a crt_list entry and its content (ssl_conf, filters/fcount) + * Return a pointer to the new entry + */ +struct crtlist_entry *crtlist_entry_dup(struct crtlist_entry *src) +{ + struct crtlist_entry *entry; + + if (src == NULL) + return NULL; + + entry = crtlist_entry_new(); + if (entry == NULL) + return NULL; + + if (src->filters) { + entry->filters = crtlist_dup_filters(src->filters, src->fcount); + if (!entry->filters) + goto error; + } + entry->fcount = src->fcount; + if (src->ssl_conf) { + entry->ssl_conf = crtlist_dup_ssl_conf(src->ssl_conf); + if (!entry->ssl_conf) + goto error; + } + entry->crtlist = src->crtlist; + + return entry; + +error: + + crtlist_free_filters(entry->filters); + ssl_sock_free_ssl_conf(entry->ssl_conf); + free(entry->ssl_conf); + free(entry); + + return NULL; +} + +/* + * Allocate and initialize a crtlist_entry + */ +struct crtlist_entry *crtlist_entry_new() +{ + struct crtlist_entry *entry; + + entry = calloc(1, sizeof(*entry)); + if (entry == NULL) + return NULL; + + LIST_INIT(&entry->ckch_inst); + + /* initialize the nodes so we can LIST_DELETE in any cases */ + LIST_INIT(&entry->by_crtlist); + LIST_INIT(&entry->by_ckch_store); + + return entry; +} + +/* Free a crtlist, from the crt_entry to the content of the ssl_conf */ +void crtlist_free(struct crtlist *crtlist) +{ + struct crtlist_entry *entry, *s_entry; + struct bind_conf_list *bind_conf_node; + + if (crtlist == NULL) + return; + + bind_conf_node = crtlist->bind_conf; + while (bind_conf_node) { + struct bind_conf_list *next = bind_conf_node->next; + free(bind_conf_node); + bind_conf_node = next; + } + + list_for_each_entry_safe(entry, s_entry, &crtlist->ord_entries, by_crtlist) { + crtlist_entry_free(entry); + } + ebmb_delete(&crtlist->node); + free(crtlist); +} + +/* Alloc and initialize a struct crtlist + * <filename> is the key of the ebmb_node + * <unique> initialize the list of entries to 
be unique (1) or not (0) + */ +struct crtlist *crtlist_new(const char *filename, int unique) +{ + struct crtlist *newlist; + + newlist = calloc(1, sizeof(*newlist) + strlen(filename) + 1); + if (newlist == NULL) + return NULL; + + memcpy(newlist->node.key, filename, strlen(filename) + 1); + if (unique) + newlist->entries = EB_ROOT_UNIQUE; + else + newlist->entries = EB_ROOT; + + LIST_INIT(&newlist->ord_entries); + + return newlist; +} + +/* + * Read a single crt-list line. /!\ alters the <line> string. + * Fills <crt_path> and <crtlist_entry> + * <crtlist_entry> must be allocated and freed by the caller + * <crtlist_entry->ssl_conf> is allocated by the function + * <crtlist_entry->filters> is allocated by the function + * <crt_path> is a ptr in <line> + * Returns an error code + */ +int crtlist_parse_line(char *line, char **crt_path, struct crtlist_entry *entry, const char *file, int linenum, int from_cli, char **err) +{ + int cfgerr = 0; + int arg, newarg, cur_arg, i, ssl_b = 0, ssl_e = 0; + char *end; + char *args[MAX_CRT_ARGS + 1]; + struct ssl_bind_conf *ssl_conf = NULL; + + if (!line || !crt_path || !entry) + return ERR_ALERT | ERR_FATAL; + + end = line + strlen(line); + if (end-line >= CRT_LINESIZE-1 && *(end-1) != '\n') { + /* Check if we reached the limit and the last char is not \n. + * Watch out for the last line without the terminating '\n'! + */ + memprintf(err, "parsing [%s:%d]: line too long, limit is %d characters", + file, linenum, CRT_LINESIZE-1); + cfgerr |= ERR_ALERT | ERR_FATAL; + goto error; + } + arg = 0; + newarg = 1; + while (*line) { + if (isspace((unsigned char)*line)) { + newarg = 1; + *line = 0; + } else if (*line == '[') { + if (ssl_b) { + memprintf(err, "parsing [%s:%d]: too many '['", file, linenum); + cfgerr |= ERR_ALERT | ERR_FATAL; + goto error; + } + if (!arg) { + memprintf(err, "parsing [%s:%d]: file must start with a cert", file, linenum); + cfgerr |= ERR_ALERT | ERR_FATAL; + goto error; + } + ssl_b = arg; + newarg = 1; + *line = 0; + } else if (*line == ']') { + if (ssl_e) { + memprintf(err, "parsing [%s:%d]: too many ']'", file, linenum); + cfgerr |= ERR_ALERT | ERR_FATAL; + goto error; + } + if (!ssl_b) { + memprintf(err, "parsing [%s:%d]: missing '['", file, linenum); + cfgerr |= ERR_ALERT | ERR_FATAL; + goto error; + } + ssl_e = arg; + newarg = 1; + *line = 0; + } else if (newarg) { + if (arg == MAX_CRT_ARGS) { + memprintf(err, "parsing [%s:%d]: too many args", file, linenum); + cfgerr |= ERR_ALERT | ERR_FATAL; + goto error; + } + newarg = 0; + args[arg++] = line; + } + line++; + } + args[arg++] = line; + + /* empty line */ + if (!*args[0]) { + cfgerr |= ERR_NONE; + goto error; + } + + *crt_path = args[0]; + + if (ssl_b) { + if (ssl_b > 1) { + memprintf(err, "parsing [%s:%d]: malformed line, filters can't be between filename and options!", file, linenum); + cfgerr |= ERR_WARN; + } + + ssl_conf = calloc(1, sizeof *ssl_conf); + if (!ssl_conf) { + memprintf(err, "not enough memory!"); + cfgerr |= ERR_ALERT | ERR_FATAL; + goto error; + } + } + + cur_arg = ssl_b ? 
ssl_b : 1; + while (cur_arg < ssl_e) { + newarg = 0; + for (i = 0; ssl_crtlist_kws[i].kw != NULL; i++) { + if (strcmp(ssl_crtlist_kws[i].kw, args[cur_arg]) == 0) { + newarg = 1; + cfgerr |= ssl_crtlist_kws[i].parse(args, cur_arg, NULL, ssl_conf, from_cli, err); + if (cur_arg + 1 + ssl_crtlist_kws[i].skip > ssl_e) { + memprintf(err, "parsing [%s:%d]: ssl args out of '[]' for %s", + file, linenum, args[cur_arg]); + cfgerr |= ERR_ALERT | ERR_FATAL; + goto error; + } + cur_arg += 1 + ssl_crtlist_kws[i].skip; + break; + } + } + if (!cfgerr && !newarg) { + memprintf(err, "parsing [%s:%d]: unknown ssl keyword %s", + file, linenum, args[cur_arg]); + cfgerr |= ERR_ALERT | ERR_FATAL; + goto error; + } + } + entry->linenum = linenum; + entry->ssl_conf = ssl_conf; + entry->filters = crtlist_dup_filters(&args[cur_arg], arg - cur_arg - 1); + entry->fcount = arg - cur_arg - 1; + + return cfgerr; + +error: + crtlist_free_filters(entry->filters); + entry->filters = NULL; + ssl_sock_free_ssl_conf(entry->ssl_conf); + ha_free(&entry->ssl_conf); + return cfgerr; +} + + + +/* This function parses a crt-list file and stores it in a struct crtlist; each line is a crtlist_entry structure + * Fills the <crtlist> argument with a pointer to a new crtlist struct + * + * This function tries to open and store certificate files. + */ +int crtlist_parse_file(char *file, struct bind_conf *bind_conf, struct proxy *curproxy, struct crtlist **crtlist, char **err) +{ + struct crtlist *newlist; + struct crtlist_entry *entry = NULL; + char thisline[CRT_LINESIZE]; + FILE *f; + struct stat buf; + int linenum = 0; + int cfgerr = 0; + int missing_lf = -1; + + if ((f = fopen(file, "r")) == NULL) { + memprintf(err, "cannot open file '%s' : %s", file, strerror(errno)); + return ERR_ALERT | ERR_FATAL; + } + + newlist = crtlist_new(file, 0); + if (newlist == NULL) { + memprintf(err, "Not enough memory!"); + cfgerr |= ERR_ALERT | ERR_FATAL; + goto error; + } + + while (fgets(thisline, sizeof(thisline), f) != NULL) { + char *end; + char *line = thisline; + char *crt_path; + char path[MAXPATHLEN+1]; + struct ckch_store *ckchs; + int found = 0; + + if (missing_lf != -1) { + memprintf(err, "parsing [%s:%d]: Stray NUL character at position %d.\n", + file, linenum, (missing_lf + 1)); + cfgerr |= ERR_ALERT | ERR_FATAL; + missing_lf = -1; + break; + } + + linenum++; + end = line + strlen(line); + if (end-line == sizeof(thisline)-1 && *(end-1) != '\n') { + /* Check if we reached the limit and the last char is not \n. + * Watch out for the last line without the terminating '\n'! 
+ */ + memprintf(err, "parsing [%s:%d]: line too long, limit is %d characters", + file, linenum, (int)sizeof(thisline)-1); + cfgerr |= ERR_ALERT | ERR_FATAL; + break; + } + + if (*line == '#' || *line == '\n' || *line == '\r') + continue; + + if (end > line && *(end-1) == '\n') { + /* kill trailing LF */ + *(end - 1) = 0; + } + else { + /* mark this line as truncated */ + missing_lf = end - line; + } + + entry = crtlist_entry_new(); + if (entry == NULL) { + memprintf(err, "Not enough memory!"); + cfgerr |= ERR_ALERT | ERR_FATAL; + goto error; + } + + cfgerr |= crtlist_parse_line(thisline, &crt_path, entry, file, linenum, 0, err); + if (cfgerr & ERR_CODE) + goto error; + + /* empty line */ + if (!crt_path || !*crt_path) { + crtlist_entry_free(entry); + entry = NULL; + continue; + } + + if (*crt_path != '/' && global_ssl.crt_base) { + if ((strlen(global_ssl.crt_base) + 1 + strlen(crt_path)) > sizeof(path) || + snprintf(path, sizeof(path), "%s/%s", global_ssl.crt_base, crt_path) > sizeof(path)) { + memprintf(err, "parsing [%s:%d]: '%s' : path too long", + file, linenum, crt_path); + cfgerr |= ERR_ALERT | ERR_FATAL; + goto error; + } + crt_path = path; + } + + /* Look for a ckch_store or create one */ + ckchs = ckchs_lookup(crt_path); + if (ckchs == NULL) { + if (stat(crt_path, &buf) == 0) { + found++; + + ckchs = ckchs_load_cert_file(crt_path, err); + if (ckchs == NULL) { + cfgerr |= ERR_ALERT | ERR_FATAL; + goto error; + } + + entry->node.key = ckchs; + entry->crtlist = newlist; + if (entry->ssl_conf) + ckchs->data->ocsp_update_mode = entry->ssl_conf->ocsp_update; + ebpt_insert(&newlist->entries, &entry->node); + LIST_APPEND(&newlist->ord_entries, &entry->by_crtlist); + LIST_APPEND(&ckchs->crtlist_entry, &entry->by_ckch_store); + + } else if (global_ssl.extra_files & SSL_GF_BUNDLE) { + /* If we didn't find the file, this could be a + bundle; since 2.3 we don't support multiple + certificates in the same OpenSSL store, so we + emulate it by loading each file separately. 
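+ (Illustration, assuming the usual keytype suffixes: a line + "cert.pem" is looked up as "cert.pem.rsa", "cert.pem.ecdsa" + and "cert.pem.dsa".)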
To + do so we need to duplicate the entry in the + crt-list because it becomes independent */ + char fp[MAXPATHLEN+1] = {0}; + int n = 0; + struct crtlist_entry *entry_dup = entry; /* use the previous created entry */ + for (n = 0; n < SSL_SOCK_NUM_KEYTYPES; n++) { + struct stat buf; + int ret; + + ret = snprintf(fp, sizeof(fp), "%s.%s", crt_path, SSL_SOCK_KEYTYPE_NAMES[n]); + if (ret > sizeof(fp)) + continue; + + ckchs = ckchs_lookup(fp); + if (!ckchs) { + if (stat(fp, &buf) == 0) { + ckchs = ckchs_load_cert_file(fp, err); + if (!ckchs) { + cfgerr |= ERR_ALERT | ERR_FATAL; + goto error; + } + } else { + continue; /* didn't find this extension, skip */ + } + } + found++; + linenum++; /* we duplicate the line for this entry in the bundle */ + if (!entry_dup) { /* if the entry was used, duplicate one */ + linenum++; + entry_dup = crtlist_entry_dup(entry); + if (!entry_dup) { + cfgerr |= ERR_ALERT | ERR_FATAL; + goto error; + } + entry_dup->linenum = linenum; + } + + entry_dup->node.key = ckchs; + entry_dup->crtlist = newlist; + + cfgerr |= ocsp_update_check_cfg_consistency(ckchs, entry, crt_path, err); + if (cfgerr & ERR_FATAL) + goto error; + + if (entry->ssl_conf) + ckchs->data->ocsp_update_mode = entry->ssl_conf->ocsp_update; + ebpt_insert(&newlist->entries, &entry_dup->node); + LIST_APPEND(&newlist->ord_entries, &entry_dup->by_crtlist); + LIST_APPEND(&ckchs->crtlist_entry, &entry_dup->by_ckch_store); + + entry_dup = NULL; /* the entry was used, we need a new one next round */ + } +#if HA_OPENSSL_VERSION_NUMBER < 0x10101000L + if (found) { + memprintf(err, "%sCan't load '%s'. Loading a multi certificates bundle requires OpenSSL >= 1.1.1\n", + err && *err ? *err : "", crt_path); + cfgerr |= ERR_ALERT | ERR_FATAL; + } +#endif + } + if (!found) { + memprintf(err, "%sunable to stat SSL certificate from file '%s' : %s.\n", + err && *err ? *err : "", crt_path, strerror(errno)); + cfgerr |= ERR_ALERT | ERR_FATAL; + } + + } else { + entry->node.key = ckchs; + entry->crtlist = newlist; + + cfgerr |= ocsp_update_check_cfg_consistency(ckchs, entry, crt_path, err); + if (cfgerr & ERR_FATAL) + goto error; + + if (entry->ssl_conf) + ckchs->data->ocsp_update_mode = entry->ssl_conf->ocsp_update; + ebpt_insert(&newlist->entries, &entry->node); + LIST_APPEND(&newlist->ord_entries, &entry->by_crtlist); + LIST_APPEND(&ckchs->crtlist_entry, &entry->by_ckch_store); + found++; + } + entry = NULL; + } + + if (missing_lf != -1) { + memprintf(err, "parsing [%s:%d]: Missing LF on last line, file might have been truncated at position %d.\n", + file, linenum, (missing_lf + 1)); + cfgerr |= ERR_ALERT | ERR_FATAL; + } + + if (cfgerr & ERR_CODE) + goto error; + + newlist->linecount = linenum; + + fclose(f); + *crtlist = newlist; + + return cfgerr; +error: + crtlist_entry_free(entry); + + fclose(f); + crtlist_free(newlist); + return cfgerr; +} + +/* This function reads a directory and stores it in a struct crtlist, each file is a crtlist_entry structure + * Fill the <crtlist> argument with a pointer to a new crtlist struct + * + * This function tries to open and store certificate files. 
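+ * Entries whose name starts with a dot or ends in .issuer, .ocsp, + * .sctl or .key are skipped, as the filter below shows.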
+ */ +int crtlist_load_cert_dir(char *path, struct bind_conf *bind_conf, struct crtlist **crtlist, char **err) +{ + struct crtlist *dir; + struct dirent **de_list; + int i, n; + struct stat buf; + char *end; + char fp[MAXPATHLEN+1]; + int cfgerr = 0; + struct ckch_store *ckchs; + + dir = crtlist_new(path, 1); + if (dir == NULL) { + memprintf(err, "not enough memory"); + return ERR_ALERT | ERR_FATAL; + } + + n = scandir(path, &de_list, 0, alphasort); + if (n < 0) { + memprintf(err, "%sunable to scan directory '%s' : %s.\n", + err && *err ? *err : "", path, strerror(errno)); + cfgerr |= ERR_ALERT | ERR_FATAL; + } + else { + for (i = 0; i < n; i++) { + struct crtlist_entry *entry; + struct dirent *de = de_list[i]; + + end = strrchr(de->d_name, '.'); + if (end && (de->d_name[0] == '.' || + strcmp(end, ".issuer") == 0 || strcmp(end, ".ocsp") == 0 || + strcmp(end, ".sctl") == 0 || strcmp(end, ".key") == 0)) + goto ignore_entry; + + snprintf(fp, sizeof(fp), "%s/%s", path, de->d_name); + if (stat(fp, &buf) != 0) { + memprintf(err, "%sunable to stat SSL certificate from file '%s' : %s.\n", + err && *err ? *err : "", fp, strerror(errno)); + cfgerr |= ERR_ALERT | ERR_FATAL; + goto ignore_entry; + } + if (!S_ISREG(buf.st_mode)) + goto ignore_entry; + + entry = crtlist_entry_new(); + if (entry == NULL) { + memprintf(err, "not enough memory '%s'", fp); + cfgerr |= ERR_ALERT | ERR_FATAL; + goto ignore_entry; + } + + ckchs = ckchs_lookup(fp); + if (ckchs == NULL) + ckchs = ckchs_load_cert_file(fp, err); + if (ckchs == NULL) { + free(de); + free(entry); + cfgerr |= ERR_ALERT | ERR_FATAL; + goto end; + } + entry->node.key = ckchs; + entry->crtlist = dir; + LIST_APPEND(&ckchs->crtlist_entry, &entry->by_ckch_store); + LIST_APPEND(&dir->ord_entries, &entry->by_crtlist); + ebpt_insert(&dir->entries, &entry->node); + +ignore_entry: + free(de); + } +end: + free(de_list); + } + + if (cfgerr & ERR_CODE) { + /* free the dir and entries on error */ + crtlist_free(dir); + } else { + *crtlist = dir; + } + return cfgerr; + +} + +/* + * Take an ssl_bind_conf structure and append the configuration line used to + * create it in the buffer + */ +static void dump_crtlist_sslconf(struct buffer *buf, const struct ssl_bind_conf *conf) +{ + int space = 0; + + if (conf == NULL) + return; + + chunk_appendf(buf, " ["); +#ifdef OPENSSL_NPN_NEGOTIATED + if (conf->npn_str) { + int len = conf->npn_len; + char *ptr = conf->npn_str; + int comma = 0; + + if (space) chunk_appendf(buf, " "); + chunk_appendf(buf, "npn "); + while (len) { + unsigned short size; + + size = *ptr; + ptr++; + if (comma) + chunk_memcat(buf, ",", 1); + chunk_memcat(buf, ptr, size); + ptr += size; + len -= size + 1; + comma = 1; + } + chunk_memcat(buf, "", 1); /* finish with a \0 */ + space++; + } +#endif +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation + if (conf->alpn_str) { + int len = conf->alpn_len; + char *ptr = conf->alpn_str; + int comma = 0; + + if (space) chunk_appendf(buf, " "); + if (len) + chunk_appendf(buf, "alpn "); + else + chunk_appendf(buf, "no-alpn"); + while (len) { + unsigned short size; + + size = *ptr; + ptr++; + if (comma) + chunk_memcat(buf, ",", 1); + chunk_memcat(buf, ptr, size); + ptr += size; + len -= size + 1; + comma = 1; + } + chunk_memcat(buf, "", 1); /* finish with a \0 */ + space++; + } +#endif + /* verify */ + { + if (conf->verify == SSL_SOCK_VERIFY_NONE) { + if (space) chunk_appendf(buf, " "); + chunk_appendf(buf, "verify none"); + space++; + } else if (conf->verify == SSL_SOCK_VERIFY_OPTIONAL) { + if (space) 
chunk_appendf(buf, " "); + chunk_appendf(buf, "verify optional"); + space++; + } else if (conf->verify == SSL_SOCK_VERIFY_REQUIRED) { + if (space) chunk_appendf(buf, " "); + chunk_appendf(buf, "verify required"); + space++; + } + } + + if (conf->no_ca_names) { + if (space) chunk_appendf(buf, " "); + chunk_appendf(buf, "no-ca-names"); + space++; + } + + if (conf->early_data) { + if (space) chunk_appendf(buf, " "); + chunk_appendf(buf, "allow-0rtt"); + space++; + } + if (conf->ca_file) { + if (space) chunk_appendf(buf, " "); + chunk_appendf(buf, "ca-file %s", conf->ca_file); + space++; + } + if (conf->crl_file) { + if (space) chunk_appendf(buf, " "); + chunk_appendf(buf, "crl-file %s", conf->crl_file); + space++; + } + if (conf->ciphers) { + if (space) chunk_appendf(buf, " "); + chunk_appendf(buf, "ciphers %s", conf->ciphers); + space++; + } +#ifdef HAVE_SSL_CTX_SET_CIPHERSUITES + if (conf->ciphersuites) { + if (space) chunk_appendf(buf, " "); + chunk_appendf(buf, "ciphersuites %s", conf->ciphersuites); + space++; + } +#endif + if (conf->curves) { + if (space) chunk_appendf(buf, " "); + chunk_appendf(buf, "curves %s", conf->curves); + space++; + } + if (conf->ecdhe) { + if (space) chunk_appendf(buf, " "); + chunk_appendf(buf, "ecdhe %s", conf->ecdhe); + space++; + } + + /* the crt-lists only support ssl-min-ver and ssl-max-ver */ + if (conf->ssl_methods_cfg.min) { + if (space) chunk_appendf(buf, " "); + chunk_appendf(buf, "ssl-min-ver %s", methodVersions[conf->ssl_methods_cfg.min].name); + space++; + } + + if (conf->ssl_methods_cfg.max) { + if (space) chunk_appendf(buf, " "); + chunk_appendf(buf, "ssl-max-ver %s", methodVersions[conf->ssl_methods_cfg.max].name); + space++; + } + + if (conf->ocsp_update != SSL_SOCK_OCSP_UPDATE_DFLT) { + if (space) chunk_appendf(buf, " "); + chunk_appendf(buf, "ocsp-update %s", + conf->ocsp_update == SSL_SOCK_OCSP_UPDATE_OFF ? "off" : "on"); + space++; + } + + chunk_appendf(buf, "]"); + + return; +} + +/* dump a list of filters */ +static void dump_crtlist_filters(struct buffer *buf, struct crtlist_entry *entry) +{ + int i; + + if (!entry->fcount) + return; + + for (i = 0; i < entry->fcount; i++) { + chunk_appendf(buf, " %s", entry->filters[i]); + } + return; +} + +/************************** CLI functions ****************************/ + + +/* CLI IO handler for '(show|dump) ssl crt-list'. + * It uses show_crtlist_ctx for the context. 
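+ * Illustrative output, one known crt-list path per line (paths are + * examples): + *   /etc/haproxy/list.crtlist + *   /etc/haproxy/certs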
+ */ +static int cli_io_handler_dump_crtlist(struct appctx *appctx) +{ + struct show_crtlist_ctx *ctx = appctx->svcctx; + struct buffer *trash = alloc_trash_chunk(); + struct ebmb_node *lnode; + + if (trash == NULL) + return 1; + + /* dump the list of crt-lists */ + lnode = ctx->crtlist_node; + if (lnode == NULL) + lnode = ebmb_first(&crtlists_tree); + while (lnode) { + chunk_appendf(trash, "%s\n", lnode->key); + if (applet_putchk(appctx, trash) == -1) + goto yield; + lnode = ebmb_next(lnode); + } + free_trash_chunk(trash); + return 1; +yield: + ctx->crtlist_node = lnode; + free_trash_chunk(trash); + return 0; +} + +/* CLI IO handler for '(show|dump) ssl crt-list <filename>' */ +static int cli_io_handler_dump_crtlist_entries(struct appctx *appctx) +{ + struct show_crtlist_ctx *ctx = appctx->svcctx; + struct buffer *trash = alloc_trash_chunk(); + struct crtlist *crtlist; + struct crtlist_entry *entry; + + if (trash == NULL) + return 1; + + crtlist = ebmb_entry(ctx->crtlist_node, struct crtlist, node); + + entry = ctx->entry; + if (entry == NULL) { + entry = LIST_ELEM((crtlist->ord_entries).n, typeof(entry), by_crtlist); + chunk_appendf(trash, "# %s\n", crtlist->node.key); + if (applet_putchk(appctx, trash) == -1) + goto yield; + } + + list_for_each_entry_from(entry, &crtlist->ord_entries, by_crtlist) { + struct ckch_store *store; + const char *filename; + + store = entry->node.key; + filename = store->path; + chunk_appendf(trash, "%s", filename); + if (ctx->mode == 's') /* show */ + chunk_appendf(trash, ":%d", entry->linenum); + dump_crtlist_sslconf(trash, entry->ssl_conf); + dump_crtlist_filters(trash, entry); + chunk_appendf(trash, "\n"); + + if (applet_putchk(appctx, trash) == -1) + goto yield; + } + free_trash_chunk(trash); + return 1; +yield: + ctx->entry = entry; + free_trash_chunk(trash); + return 0; +} + +/* CLI argument parser for '(show|dump) ssl crt-list' */ +static int cli_parse_dump_crtlist(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_crtlist_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + struct ebmb_node *lnode; + char *filename = NULL; + int mode; + char *end; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (*args[3] && strcmp(args[3], "-n") == 0) { + mode = 's'; + filename = args[4]; + } else { + mode = 'd'; + filename = args[3]; + } + + if (mode == 's' && !*args[4]) + return cli_err(appctx, "'show ssl crt-list -n' expects a filename or a directory\n"); + + if (filename && *filename) { + + + /* strip trailing slashes, including first one */ + for (end = filename + strlen(filename) - 1; end >= filename && *end == '/'; end--) + *end = 0; + + lnode = ebst_lookup(&crtlists_tree, filename); + if (lnode == NULL) + return cli_err(appctx, "didn't find the specified filename\n"); + + ctx->crtlist_node = lnode; + appctx->io_handler = cli_io_handler_dump_crtlist_entries; + } + ctx->mode = mode; + + return 0; +} + +/* release function of the "add ssl crt-list' command, free things and unlock + * the spinlock. It uses the add_crtlist_ctx. 
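+ * (ctx->entry is expected to be non-NULL only when the IO handler did + * not reach its success state; the partially inserted entry is then + * freed here.)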
+ */ +static void cli_release_add_crtlist(struct appctx *appctx) +{ + struct add_crtlist_ctx *ctx = appctx->svcctx; + struct crtlist_entry *entry = ctx->entry; + + if (entry) { + struct ckch_inst *inst, *inst_s; + + /* upon error free the ckch_inst and everything inside */ + ebpt_delete(&entry->node); + LIST_DELETE(&entry->by_crtlist); + LIST_DELETE(&entry->by_ckch_store); + + list_for_each_entry_safe(inst, inst_s, &entry->ckch_inst, by_ckchs) { + ckch_inst_free(inst); + } + crtlist_free_filters(entry->filters); + ssl_sock_free_ssl_conf(entry->ssl_conf); + free(entry->ssl_conf); + free(entry); + } + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + ha_free(&ctx->err); +} + + +/* IO Handler for the "add ssl crt-list" command It adds a new entry in the + * crt-list and generates the ckch_insts for each bind_conf that uses this crt-list + * + * The logic is the same as the "commit ssl cert" command but without the + * freeing of the old structures, because there are none. + * + * It uses the add_crtlist_ctx for the context. + */ +static int cli_io_handler_add_crtlist(struct appctx *appctx) +{ + struct add_crtlist_ctx *ctx = appctx->svcctx; + struct bind_conf_list *bind_conf_node; + struct stconn *sc = appctx_sc(appctx); + struct crtlist *crtlist = ctx->crtlist; + struct crtlist_entry *entry = ctx->entry; + struct ckch_store *store = entry->node.key; + struct ckch_inst *new_inst; + int i = 0; + int errcode = 0; + + /* for each bind_conf which use the crt-list, a new ckch_inst must be + * created. + */ + /* FIXME: Don't watch the other side !*/ + if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE)) + goto end; + + switch (ctx->state) { + case ADDCRT_ST_INIT: + /* This state just print the update message */ + chunk_printf(&trash, "Inserting certificate '%s' in crt-list '%s'", store->path, crtlist->node.key); + if (applet_putchk(appctx, &trash) == -1) + goto yield; + ctx->state = ADDCRT_ST_GEN; + __fallthrough; + case ADDCRT_ST_GEN: + bind_conf_node = ctx->bind_conf_node; /* get the previous ptr from the yield */ + if (bind_conf_node == NULL) + bind_conf_node = crtlist->bind_conf; + for (; bind_conf_node; bind_conf_node = bind_conf_node->next) { + struct bind_conf *bind_conf = bind_conf_node->bind_conf; + struct sni_ctx *sni; + + ctx->bind_conf_node = bind_conf_node; + + /* yield every 10 generations */ + if (i > 10) { + applet_have_more_data(appctx); /* let's come back later */ + goto yield; + } + + /* display one dot for each new instance */ + if (applet_putstr(appctx, ".") == -1) + goto yield; + + /* we don't support multi-cert bundles, only simple ones */ + ctx->err = NULL; + errcode |= ckch_inst_new_load_store(store->path, store, bind_conf, entry->ssl_conf, entry->filters, entry->fcount, &new_inst, &ctx->err); + if (errcode & ERR_CODE) { + ctx->state = ADDCRT_ST_ERROR; + goto error; + } + + /* we need to initialize the SSL_CTX generated */ + /* this iterate on the newly generated SNIs in the new instance to prepare their SSL_CTX */ + list_for_each_entry(sni, &new_inst->sni_ctx, by_ckch_inst) { + if (!sni->order) { /* we initialized only the first SSL_CTX because it's the same in the other sni_ctx's */ + ctx->err = NULL; + errcode |= ssl_sock_prep_ctx_and_inst(bind_conf, new_inst->ssl_conf, sni->ctx, sni->ckch_inst, &ctx->err); + if (errcode & ERR_CODE) { + ctx->state = ADDCRT_ST_ERROR; + goto error; + } + } + } + + i++; + LIST_APPEND(&store->ckch_inst, &new_inst->by_ckchs); + LIST_APPEND(&entry->ckch_inst, &new_inst->by_crtlist_entry); + new_inst->crtlist_entry = entry; + } + ctx->state = 
ADDCRT_ST_INSERT; + __fallthrough; + case ADDCRT_ST_INSERT: + /* the insertion is called for every instance of the store, not + * only the one we generated. + * But ssl_sock_load_cert_sni() skips the SNIs already + * inserted. Not every instance has a bind_conf, it could be + * the store of a server so we should be careful */ + + list_for_each_entry(new_inst, &store->ckch_inst, by_ckchs) { + if (!new_inst->bind_conf) /* this is a server instance */ + continue; + HA_RWLOCK_WRLOCK(SNI_LOCK, &new_inst->bind_conf->sni_lock); + ssl_sock_load_cert_sni(new_inst, new_inst->bind_conf); + HA_RWLOCK_WRUNLOCK(SNI_LOCK, &new_inst->bind_conf->sni_lock); + } + entry->linenum = ++crtlist->linecount; + ctx->entry = NULL; + ctx->state = ADDCRT_ST_SUCCESS; + __fallthrough; + case ADDCRT_ST_SUCCESS: + chunk_reset(&trash); + chunk_appendf(&trash, "\n"); + if (ctx->err) + chunk_appendf(&trash, "%s", ctx->err); + chunk_appendf(&trash, "Success!\n"); + if (applet_putchk(appctx, &trash) == -1) + goto yield; + ctx->state = ADDCRT_ST_FIN; + break; + + case ADDCRT_ST_ERROR: + error: + chunk_printf(&trash, "\n%sFailed!\n", ctx->err); + if (applet_putchk(appctx, &trash) == -1) + goto yield; + break; + + default: + break; + } + +end: + /* success: call the release function and don't come back */ + return 1; +yield: + return 0; /* should come back */ +} + + +/* + * Parse an "add ssl crt-list <crt-list> <certfile>" line. + * Filters and options must be passed through the payload. + * It sets a struct add_crtlist_ctx. + */ +static int cli_parse_add_crtlist(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct add_crtlist_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + int cfgerr = 0; + struct ckch_store *store; + char *err = NULL; + char path[MAXPATHLEN+1]; + char *crtlist_path; + char *cert_path = NULL; + struct ebmb_node *eb; + struct ebpt_node *inserted; + struct crtlist *crtlist; + struct crtlist_entry *entry = NULL; + char *end; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3] || (!payload && !*args[4])) + return cli_err(appctx, "'add ssl crt-list' expects a filename and a certificate name\n"); + + crtlist_path = args[3]; + + /* strip trailing slashes, including first one */ + for (end = crtlist_path + strlen(crtlist_path) - 1; end >= crtlist_path && *end == '/'; end--) + *end = 0; + + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Operations on certificates are currently locked!\n"); + + eb = ebst_lookup(&crtlists_tree, crtlist_path); + if (!eb) { + memprintf(&err, "crt-list '%s' does not exist!", crtlist_path); + goto error; + } + crtlist = ebmb_entry(eb, struct crtlist, node); + + entry = crtlist_entry_new(); + if (entry == NULL) { + memprintf(&err, "Not enough memory!"); + goto error; + } + + if (payload) { + char *lf; + + lf = strrchr(payload, '\n'); + if (lf) { + memprintf(&err, "only one line of payload is supported!"); + goto error; + } + /* cert_path is filled here */ + cfgerr |= crtlist_parse_line(payload, &cert_path, entry, "CLI", 1, 1, &err); + if (cfgerr & ERR_CODE) + goto error; + } else { + cert_path = args[4]; + } + + if (!cert_path) { + memprintf(&err, "'add ssl crt-list' should contain the certificate name in the payload"); + cfgerr |= ERR_ALERT | ERR_FATAL; + goto error; + } + + if (eb_gettag(crtlist->entries.b[EB_RGHT])) { + char *slash; + + slash = strrchr(cert_path, '/'); + if (!slash) { + memprintf(&err, "'%s' is a directory, certificate path '%s' must contain the directory path", (char *)crtlist->node.key, 
cert_path); + goto error; + } + /* temporarily replace '/' by 0 to do a strcmp */ + *slash = '\0'; + if (strcmp(cert_path, (char*)crtlist->node.key) != 0) { + *slash = '/'; + memprintf(&err, "'%s' is a directory, certificate path '%s' must contain the directory path", (char *)crtlist->node.key, cert_path); + goto error; + } + *slash = '/'; + } + + if (*cert_path != '/' && global_ssl.crt_base) { + if ((strlen(global_ssl.crt_base) + 1 + strlen(cert_path)) > sizeof(path) || + snprintf(path, sizeof(path), "%s/%s", global_ssl.crt_base, cert_path) > sizeof(path)) { + memprintf(&err, "'%s' : path too long", cert_path); + cfgerr |= ERR_ALERT | ERR_FATAL; + goto error; + } + cert_path = path; + } + + store = ckchs_lookup(cert_path); + if (store == NULL) { + memprintf(&err, "certificate '%s' does not exist!", cert_path); + goto error; + } + if (store->data == NULL || store->data->cert == NULL) { + memprintf(&err, "certificate '%s' is empty!", cert_path); + goto error; + } + + /* No need to check 'ocsp-update' inconsistency on a store that is not + * used yet (it was just added through the CLI for instance). + */ + if (!LIST_ISEMPTY(&store->ckch_inst) && + ocsp_update_check_cfg_consistency(store, entry, cert_path, &err)) + goto error; + + if (entry->ssl_conf) + store->data->ocsp_update_mode = entry->ssl_conf->ocsp_update; + + /* check if it's possible to insert this new crtlist_entry */ + entry->node.key = store; + inserted = ebpt_insert(&crtlist->entries, &entry->node); + if (inserted != &entry->node) { + memprintf(&err, "file already exists in this directory!"); + goto error; + } + + /* this is supposed to be a directory (EB_ROOT_UNIQUE), so no ssl_conf is allowed */ + if ((entry->ssl_conf || entry->filters) && eb_gettag(crtlist->entries.b[EB_RGHT])) { + memprintf(&err, "this is a directory, SSL configuration and filters are not allowed"); + goto error; + } + + LIST_APPEND(&crtlist->ord_entries, &entry->by_crtlist); + entry->crtlist = crtlist; + LIST_APPEND(&store->crtlist_entry, &entry->by_ckch_store); + + ctx->state = ADDCRT_ST_INIT; + ctx->crtlist = crtlist; + ctx->entry = entry; + + /* unlock is done in the release handler */ + return 0; + +error: + crtlist_entry_free(entry); + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + err = memprintf(&err, "Can't edit the crt-list: %s\n", err ? err : ""); + return cli_dynerr(appctx, err); +} + +/* Parse a "del ssl crt-list <crt-list> <certfile>" line. 
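+ * An optional line number can be appended to <certfile> after a colon to + * disambiguate entries that appear several times; illustrative use: + *   del ssl crt-list /etc/haproxy/list.crtlist /etc/haproxy/certs/site.pem:42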
*/ +static int cli_parse_del_crtlist(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct ckch_store *store; + char *err = NULL; + char *crtlist_path, *cert_path; + struct ebmb_node *ebmb; + struct ebpt_node *ebpt; + struct crtlist *crtlist; + struct crtlist_entry *entry = NULL; + struct ckch_inst *inst, *inst_s; + int linenum = 0; + char *colons; + char *end; + int error_message_dumped = 0; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + if (!*args[3] || !*args[4]) + return cli_err(appctx, "'del ssl crtlist' expects a filename and a certificate name\n"); + + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) + return cli_err(appctx, "Can't delete!\nOperations on certificates are currently locked!\n"); + + crtlist_path = args[3]; + cert_path = args[4]; + + colons = strchr(cert_path, ':'); + if (colons) { + char *endptr; + + linenum = strtol(colons + 1, &endptr, 10); + if (colons + 1 == endptr || *endptr != '\0') { + memprintf(&err, "wrong line number after colon in '%s'!", cert_path); + goto error; + } + *colons = '\0'; + } + + /* strip trailing slashes, including first one */ + for (end = crtlist_path + strlen(crtlist_path) - 1; end >= crtlist_path && *end == '/'; end--) + *end = 0; + + /* look for crtlist */ + ebmb = ebst_lookup(&crtlists_tree, crtlist_path); + if (!ebmb) { + memprintf(&err, "crt-list '%s' does not exist!", crtlist_path); + goto error; + } + crtlist = ebmb_entry(ebmb, struct crtlist, node); + + /* look for store */ + store = ckchs_lookup(cert_path); + if (store == NULL) { + memprintf(&err, "certificate '%s' does not exist!", cert_path); + goto error; + } + if (store->data == NULL || store->data->cert == NULL) { + memprintf(&err, "certificate '%s' is empty!", cert_path); + goto error; + } + + ebpt = ebpt_lookup(&crtlist->entries, store); + if (!ebpt) { + memprintf(&err, "certificate '%s' can't be found in crt-list '%s'!", cert_path, crtlist_path); + goto error; + } + + /* list the line numbers of the duplicate entries in err (for the error message), and select the right ebpt */ + for (; ebpt; ebpt = ebpt_next_dup(ebpt)) { + struct crtlist_entry *tmp; + + tmp = ebpt_entry(ebpt, struct crtlist_entry, node); + memprintf(&err, "%s%s%d", err ? err : "", err ? ", " : "", tmp->linenum); + + /* select the entry we wanted */ + if (linenum == 0 || tmp->linenum == linenum) { + if (!entry) + entry = tmp; + } + } + + /* we didn't find the specified entry */ + if (!entry) { + memprintf(&err, "found a certificate '%s' but the line number is incorrect, please specify a correct line number preceded by a colon (%s)!", cert_path, err ? err : ""); + goto error; + } + + /* no line number was specified but there were several entries */ + if (linenum == 0 && ebpt_next_dup(&entry->node)) { + memprintf(&err, "found the certificate '%s' in several entries, please specify a line number preceded by a colon (%s)!", cert_path, err ? err : ""); + goto error; + } + + /* Iterate over all the instances in order to see if any of them is a + * default instance. If this is the case, the entry won't be deleted. 
*/ + list_for_each_entry_safe(inst, inst_s, &entry->ckch_inst, by_crtlist_entry) { + if (inst->is_default && !inst->bind_conf->strict_sni) { + if (!error_message_dumped) { + memprintf(&err, "certificate '%s' cannot be deleted, it is used as default certificate by the following frontends:\n", cert_path); + error_message_dumped = 1; + } + memprintf(&err, "%s\t- %s:%d\n", err, inst->bind_conf->file, inst->bind_conf->line); + } + } + if (error_message_dumped) + goto error; + + /* upon error free the ckch_inst and everything inside */ + + ebpt_delete(&entry->node); + LIST_DELETE(&entry->by_crtlist); + LIST_DELETE(&entry->by_ckch_store); + + list_for_each_entry_safe(inst, inst_s, &entry->ckch_inst, by_crtlist_entry) { + struct sni_ctx *sni, *sni_s; + struct ckch_inst_link_ref *link_ref, *link_ref_s; + + HA_RWLOCK_WRLOCK(SNI_LOCK, &inst->bind_conf->sni_lock); + list_for_each_entry_safe(sni, sni_s, &inst->sni_ctx, by_ckch_inst) { + ebmb_delete(&sni->name); + LIST_DELETE(&sni->by_ckch_inst); + SSL_CTX_free(sni->ctx); + free(sni); + } + HA_RWLOCK_WRUNLOCK(SNI_LOCK, &inst->bind_conf->sni_lock); + LIST_DELETE(&inst->by_ckchs); + list_for_each_entry_safe(link_ref, link_ref_s, &inst->cafile_link_refs, list) { + LIST_DELETE(&link_ref->link->list); + LIST_DELETE(&link_ref->list); + free(link_ref); + } + ckch_inst_free(inst); + } + + crtlist_free_filters(entry->filters); + ssl_sock_free_ssl_conf(entry->ssl_conf); + free(entry->ssl_conf); + free(entry); + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + err = memprintf(&err, "Entry '%s' deleted in crtlist '%s'!\n", cert_path, crtlist_path); + return cli_dynmsg(appctx, LOG_NOTICE, err); + +error: + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + err = memprintf(&err, "Can't delete the entry: %s\n", err ? err : ""); + return cli_dynerr(appctx, err); +} + + +/* unlink and free all crt-list and crt-list entries */ +void crtlist_deinit() +{ + struct eb_node *node, *next; + struct crtlist *crtlist; + + node = eb_first(&crtlists_tree); + while (node) { + next = eb_next(node); + crtlist = ebmb_entry(node, struct crtlist, node); + crtlist_free(crtlist); + node = next; + } +} + + +/* register cli keywords */ +static struct cli_kw_list cli_kws = {{ },{ + { { "add", "ssl", "crt-list", NULL }, "add ssl crt-list <list> <cert> [opts]* : add to crt-list file <list> a line <cert> or a payload", cli_parse_add_crtlist, cli_io_handler_add_crtlist, cli_release_add_crtlist }, + { { "del", "ssl", "crt-list", NULL }, "del ssl crt-list <list> <cert[:line]> : delete a line <cert> from crt-list file <list>", cli_parse_del_crtlist, NULL, NULL }, + { { "show", "ssl", "crt-list", NULL }, "show ssl crt-list [-n] [<list>] : show the list of crt-lists or the content of a crt-list file <list>", cli_parse_dump_crtlist, cli_io_handler_dump_crtlist, NULL }, + { { NULL }, NULL, NULL, NULL } } +}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + diff --git a/src/ssl_ocsp.c b/src/ssl_ocsp.c new file mode 100644 index 0000000..1adddc4 --- /dev/null +++ b/src/ssl_ocsp.c @@ -0,0 +1,1986 @@ + +/* + * SSL/TLS OCSP-related functions + * + * Copyright (C) 2022 HAProxy Technologies, Remi Tricot-Le Breton <rlebreton@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Acknowledgement: + * We'd like to specially thank the Stud project authors for a very clean + * and well documented code which helped us understand how the OpenSSL API + * ought to be used in non-blocking mode. This is one difficult part which + * is not easy to get from the OpenSSL doc, and reading the Stud code made + * it much more obvious than the examples in the OpenSSL package. Keep up + * the good works, guys ! + * + * Stud is an extremely efficient and scalable SSL/TLS proxy which combines + * particularly well with haproxy. For more info about this project, visit : + * https://github.com/bumptech/stud + * + */ + +/* Note: do NOT include openssl/xxx.h here, do it in openssl-compat.h */ +#define _GNU_SOURCE +#include <ctype.h> +#include <dirent.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <netdb.h> +#include <netinet/tcp.h> + +#include <import/ebpttree.h> +#include <import/ebsttree.h> +#include <import/lru.h> + +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/arg.h> +#include <haproxy/base64.h> +#include <haproxy/channel.h> +#include <haproxy/chunk.h> +#include <haproxy/cli.h> +#include <haproxy/connection.h> +#include <haproxy/dynbuf.h> +#include <haproxy/errors.h> +#include <haproxy/fd.h> +#include <haproxy/freq_ctr.h> +#include <haproxy/frontend.h> +#include <haproxy/global.h> +#include <haproxy/http_rules.h> +#include <haproxy/log.h> +#include <haproxy/openssl-compat.h> +#include <haproxy/pattern-t.h> +#include <haproxy/proto_tcp.h> +#include <haproxy/proxy.h> +#include <haproxy/sample.h> +#include <haproxy/sc_strm.h> +#include <haproxy/quic_conn.h> +#include <haproxy/quic_tp.h> +#include <haproxy/server.h> +#include <haproxy/shctx.h> +#include <haproxy/ssl_ckch.h> +#include <haproxy/ssl_crtlist.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/ssl_utils.h> +#include <haproxy/stats.h> +#include <haproxy/stconn.h> +#include <haproxy/stream-t.h> +#include <haproxy/task.h> +#include <haproxy/ticks.h> +#include <haproxy/time.h> +#include <haproxy/tools.h> +#include <haproxy/vars.h> +#include <haproxy/xxhash.h> +#include <haproxy/istbuf.h> +#include <haproxy/ssl_ocsp-t.h> +#include <haproxy/http_client.h> + + +/* ***** READ THIS before adding code here! ***** + * + * Due to API incompatibilities between multiple OpenSSL versions and their + * derivatives, it's often tempting to add macros to (re-)define certain + * symbols. Please do not do this here, and do it in common/openssl-compat.h + * exclusively so that the whole code consistently uses the same macros. + * + * Whenever possible if a macro is missing in certain versions, it's better + * to conditionally define it in openssl-compat.h than using lots of ifdefs. + */ + +#ifndef OPENSSL_NO_OCSP +int ocsp_ex_index = -1; + +int ssl_sock_get_ocsp_arg_kt_index(int evp_keytype) +{ + switch (evp_keytype) { + case EVP_PKEY_RSA: + return 2; + case EVP_PKEY_DSA: + return 0; + case EVP_PKEY_EC: + return 1; + } + + return -1; +} + +/* + * Callback used to set OCSP status extension content in server hello. 
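+ * A registration sketch (the actual wiring happens where the OCSP data + * of the certificate is loaded; shown here for context only, roughly): + *   SSL_CTX_set_tlsext_status_cb(ctx, ssl_sock_ocsp_stapling_cbk); + *   SSL_CTX_set_ex_data(ctx, ocsp_ex_index, ocsp_arg); + * The callback below then retrieves <ocsp_arg> via SSL_CTX_get_ex_data().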
+ */ +int ssl_sock_ocsp_stapling_cbk(SSL *ssl, void *arg) +{ + struct certificate_ocsp *ocsp; + struct ocsp_cbk_arg *ocsp_arg; + char *ssl_buf; + SSL_CTX *ctx; + EVP_PKEY *ssl_pkey; + int key_type; + int index; + + ctx = SSL_get_SSL_CTX(ssl); + if (!ctx) + return SSL_TLSEXT_ERR_NOACK; + + ocsp_arg = SSL_CTX_get_ex_data(ctx, ocsp_ex_index); + if (!ocsp_arg) + return SSL_TLSEXT_ERR_NOACK; + + ssl_pkey = SSL_get_privatekey(ssl); + if (!ssl_pkey) + return SSL_TLSEXT_ERR_NOACK; + + key_type = EVP_PKEY_base_id(ssl_pkey); + + if (ocsp_arg->is_single && ocsp_arg->single_kt == key_type) + ocsp = ocsp_arg->s_ocsp; + else { + /* For multiple certs per context, we have to find the correct OCSP response based on + * the certificate type + */ + index = ssl_sock_get_ocsp_arg_kt_index(key_type); + + if (index < 0) + return SSL_TLSEXT_ERR_NOACK; + + ocsp = ocsp_arg->m_ocsp[index]; + + } + + if (!ocsp || + !ocsp->response.area || + !ocsp->response.data || + (ocsp->expire < date.tv_sec)) + return SSL_TLSEXT_ERR_NOACK; + + ssl_buf = OPENSSL_malloc(ocsp->response.data); + if (!ssl_buf) + return SSL_TLSEXT_ERR_NOACK; + + memcpy(ssl_buf, ocsp->response.area, ocsp->response.data); + SSL_set_tlsext_status_ocsp_resp(ssl, (unsigned char*)ssl_buf, ocsp->response.data); + + return SSL_TLSEXT_ERR_OK; +} + +#endif /* !defined(OPENSSL_NO_OCSP) */ + + +#if (defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) + +struct eb_root cert_ocsp_tree = EB_ROOT_UNIQUE; + +__decl_thread(HA_SPINLOCK_T ocsp_tree_lock); + +struct eb_root ocsp_update_tree = EB_ROOT; /* updatable ocsp responses sorted by next_update in absolute time */ + +/* + * Convert an OCSP_CERTID structure into a char buffer that can be used as a key + * in the OCSP response tree. It takes an <ocsp_cid> as parameter and builds a + * key of length <key_length> into the <certid> buffer. The key length cannot + * exceed OCSP_MAX_CERTID_ASN1_LENGTH bytes. + * Returns a negative value in case of error. + */ +int ssl_ocsp_build_response_key(OCSP_CERTID *ocsp_cid, unsigned char certid[OCSP_MAX_CERTID_ASN1_LENGTH], unsigned int *key_length) +{ + unsigned char *p = NULL; + int i; + + if (!key_length) + return -1; + + *key_length = 0; + + if (!ocsp_cid) + return 0; + + i = i2d_OCSP_CERTID(ocsp_cid, NULL); + if (!i || (i > OCSP_MAX_CERTID_ASN1_LENGTH)) + return 0; + + p = certid; + *key_length = i2d_OCSP_CERTID(ocsp_cid, &p); + +end: + return *key_length > 0; +} + +/* This function starts to check if the OCSP response (in DER format) contained + * in chunk 'ocsp_response' is valid (else exits on error). + * If 'cid' is not NULL, it will be compared to the OCSP certificate ID + * contained in the OCSP Response and exits on error if no match. + * If it's a valid OCSP Response: + * If 'ocsp' is not NULL, the chunk is copied in the OCSP response's container + * pointed by 'ocsp'. + * If 'ocsp' is NULL, the function looks up into the OCSP response's + * containers tree (using as index the ASN1 form of the OCSP Certificate ID extracted + * from the response) and exits on error if not found. Finally, If an OCSP response is + * already present in the container, it will be overwritten. + * + * Note: OCSP response containing more than one OCSP Single response is not + * considered valid. + * + * Returns 0 on success, 1 in error case. 
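+ * + * Minimal caller sketch (illustrative; <buf> holds a DER-encoded response): + *   char *err = NULL; + *   if (ssl_sock_load_ocsp_response(buf, NULL, NULL, &err) != 0) + *       ha_alert("%s\n", err ? err : "unknown OCSP error");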
+ */ +int ssl_sock_load_ocsp_response(struct buffer *ocsp_response, + struct certificate_ocsp *ocsp, + OCSP_CERTID *cid, char **err) +{ + OCSP_RESPONSE *resp; + OCSP_BASICRESP *bs = NULL; + OCSP_SINGLERESP *sr; + OCSP_CERTID *id; + unsigned char *p = (unsigned char *) ocsp_response->area; + int rc , count_sr; + ASN1_GENERALIZEDTIME *revtime, *thisupd, *nextupd = NULL; + int reason; + int ret = 1; +#ifdef HAVE_ASN1_TIME_TO_TM + struct tm nextupd_tm = {0}; +#endif + + resp = d2i_OCSP_RESPONSE(NULL, (const unsigned char **)&p, + ocsp_response->data); + if (!resp) { + memprintf(err, "Unable to parse OCSP response"); + goto out; + } + + rc = OCSP_response_status(resp); + if (rc != OCSP_RESPONSE_STATUS_SUCCESSFUL) { + memprintf(err, "OCSP response status not successful"); + goto out; + } + + bs = OCSP_response_get1_basic(resp); + if (!bs) { + memprintf(err, "Failed to get basic response from OCSP Response"); + goto out; + } + + count_sr = OCSP_resp_count(bs); + if (count_sr > 1) { + memprintf(err, "OCSP response ignored because contains multiple single responses (%d)", count_sr); + goto out; + } + + sr = OCSP_resp_get0(bs, 0); + if (!sr) { + memprintf(err, "Failed to get OCSP single response"); + goto out; + } + + id = (OCSP_CERTID*)OCSP_SINGLERESP_get0_id(sr); + + rc = OCSP_single_get0_status(sr, &reason, &revtime, &thisupd, &nextupd); + if (rc != V_OCSP_CERTSTATUS_GOOD && rc != V_OCSP_CERTSTATUS_REVOKED) { + memprintf(err, "OCSP single response: certificate status is unknown"); + goto out; + } + + if (!nextupd) { + memprintf(err, "OCSP single response: missing nextupdate"); + goto out; + } + + rc = OCSP_check_validity(thisupd, nextupd, OCSP_MAX_RESPONSE_TIME_SKEW, -1); + if (!rc) { + memprintf(err, "OCSP single response: no longer valid."); + goto out; + } + + if (cid) { + if (OCSP_id_cmp(id, cid)) { + memprintf(err, "OCSP single response: Certificate ID does not match certificate and issuer"); + goto out; + } + } + + if (!ocsp) { + unsigned char key[OCSP_MAX_CERTID_ASN1_LENGTH]; + unsigned char *p; + + rc = i2d_OCSP_CERTID(id, NULL); + if (!rc) { + memprintf(err, "OCSP single response: Unable to encode Certificate ID"); + goto out; + } + + if (rc > OCSP_MAX_CERTID_ASN1_LENGTH) { + memprintf(err, "OCSP single response: Certificate ID too long"); + goto out; + } + + p = key; + memset(key, 0, OCSP_MAX_CERTID_ASN1_LENGTH); + i2d_OCSP_CERTID(id, &p); + HA_SPIN_LOCK(OCSP_LOCK, &ocsp_tree_lock); + ocsp = (struct certificate_ocsp *)ebmb_lookup(&cert_ocsp_tree, key, OCSP_MAX_CERTID_ASN1_LENGTH); + if (!ocsp) { + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + memprintf(err, "OCSP single response: Certificate ID does not match any certificate or issuer"); + goto out; + } + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + } + + /* According to comments on "chunk_dup", the + previous chunk buffer will be freed */ + if (!chunk_dup(&ocsp->response, ocsp_response)) { + memprintf(err, "OCSP response: Memory allocation error"); + goto out; + } + +#ifdef HAVE_ASN1_TIME_TO_TM + if (ASN1_TIME_to_tm(nextupd, &nextupd_tm) == 0) { + memprintf(err, "OCSP single response: Invalid \"Next Update\" time"); + goto out; + } + ocsp->expire = my_timegm(&nextupd_tm) - OCSP_MAX_RESPONSE_TIME_SKEW; +#else + ocsp->expire = asn1_generalizedtime_to_epoch(nextupd) - OCSP_MAX_RESPONSE_TIME_SKEW; + if (ocsp->expire < 0) { + memprintf(err, "OCSP single response: Invalid \"Next Update\" time"); + goto out; + } +#endif + + ret = 0; +out: + ERR_clear_error(); + + if (bs) + OCSP_BASICRESP_free(bs); + + if (resp) + OCSP_RESPONSE_free(resp); + 
+ return ret; +} +/* + * External function use to update the OCSP response in the OCSP response's + * containers tree. The chunk 'ocsp_response' must contain the OCSP response + * to update in DER format. + * + * Returns 0 on success, 1 in error case. + */ +int ssl_sock_update_ocsp_response(struct buffer *ocsp_response, char **err) +{ + return ssl_sock_load_ocsp_response(ocsp_response, NULL, NULL, err); +} + + + +#if !defined OPENSSL_IS_BORINGSSL +/* + * Decrease the refcount of the struct ocsp_response and frees it if it's not + * used anymore. Also removes it from the tree if free'd. + */ +void ssl_sock_free_ocsp(struct certificate_ocsp *ocsp) +{ + if (!ocsp) + return; + + HA_SPIN_LOCK(OCSP_LOCK, &ocsp_tree_lock); + ocsp->refcount_store--; + if (ocsp->refcount_store <= 0) { + BUG_ON(ocsp->refcount_instance > 0); + ebmb_delete(&ocsp->key); + eb64_delete(&ocsp->next_update); + X509_free(ocsp->issuer); + ocsp->issuer = NULL; + sk_X509_pop_free(ocsp->chain, X509_free); + ocsp->chain = NULL; + chunk_destroy(&ocsp->response); + if (ocsp->uri) { + ha_free(&ocsp->uri->area); + ha_free(&ocsp->uri); + } + + free(ocsp); + } + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); +} + +void ssl_sock_free_ocsp_instance(struct certificate_ocsp *ocsp) +{ + if (!ocsp) + return; + + HA_SPIN_LOCK(OCSP_LOCK, &ocsp_tree_lock); + ocsp->refcount_instance--; + if (ocsp->refcount_instance <= 0) { + eb64_delete(&ocsp->next_update); + } + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); +} + + +/* + * This function dumps the details of an OCSP_CERTID. It is based on + * ocsp_certid_print in OpenSSL. + */ +static inline int ocsp_certid_print(BIO *bp, OCSP_CERTID *certid, int indent) +{ + ASN1_OCTET_STRING *piNameHash = NULL; + ASN1_OCTET_STRING *piKeyHash = NULL; + ASN1_INTEGER *pSerial = NULL; + + if (OCSP_id_get0_info(&piNameHash, NULL, &piKeyHash, &pSerial, certid)) { + + BIO_printf(bp, "%*sCertificate ID:\n", indent, ""); + indent += 2; + BIO_printf(bp, "%*sIssuer Name Hash: ", indent, ""); +#ifndef USE_OPENSSL_WOLFSSL + i2a_ASN1_STRING(bp, piNameHash, 0); +#else + wolfSSL_ASN1_STRING_print(bp, piNameHash); +#endif + BIO_printf(bp, "\n%*sIssuer Key Hash: ", indent, ""); +#ifndef USE_OPENSSL_WOLFSSL + i2a_ASN1_STRING(bp, piKeyHash, 0); +#else + wolfSSL_ASN1_STRING_print(bp, piNameHash); +#endif + BIO_printf(bp, "\n%*sSerial Number: ", indent, ""); + i2a_ASN1_INTEGER(bp, pSerial); + } + return 1; +} + + +enum { + SHOW_OCSPRESP_FMT_DFLT, + SHOW_OCSPRESP_FMT_TEXT, + SHOW_OCSPRESP_FMT_B64 +}; + +struct show_ocspresp_cli_ctx { + struct certificate_ocsp *ocsp; + int format; +}; + +/* + * Dump the details about an OCSP response in DER format stored in + * <ocsp_response> into buffer <out>. + * Returns 0 in case of success. 
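+ * Note: empty lines of the textual dump are padded with a leading space + * below because an empty line acts as an end-of-response delimiter on + * the CLI.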
+ */ +int ssl_ocsp_response_print(struct buffer *ocsp_response, struct buffer *out) +{ + BIO *bio = NULL; + int write = -1; + OCSP_RESPONSE *resp; + const unsigned char *p; + int retval = -1; + + if (!ocsp_response) + return -1; + + if ((bio = BIO_new(BIO_s_mem())) == NULL) + return -1; + + p = (const unsigned char*)ocsp_response->area; + + resp = d2i_OCSP_RESPONSE(NULL, &p, ocsp_response->data); + if (!resp) { + chunk_appendf(out, "Unable to parse OCSP response"); + goto end; + } + +#ifndef USE_OPENSSL_WOLFSSL + if (OCSP_RESPONSE_print(bio, resp, 0) != 0) { +#else + if (wolfSSL_d2i_OCSP_RESPONSE_bio(bio, &resp) != 0) { +#endif + struct buffer *trash = get_trash_chunk(); + struct ist ist_block = IST_NULL; + struct ist ist_double_lf = IST_NULL; + static struct ist double_lf = IST("\n\n"); + + write = BIO_read(bio, trash->area, trash->size - 1); + if (write <= 0) + goto end; + trash->data = write; + + /* Look for empty lines in the 'trash' buffer and add a space to + * the beginning to avoid having empty lines in the output + * (without changing the appearance of the information + * displayed). + */ + ist_block = ist2(b_orig(trash), b_data(trash)); + + ist_double_lf = istist(ist_block, double_lf); + + while (istlen(ist_double_lf)) { + /* istptr(ist_double_lf) points to the first \n of a + * \n\n pattern. + */ + uint empty_line_offset = istptr(ist_double_lf) + 1 - istptr(ist_block); + + /* Write up to the first '\n' of the "\n\n" pattern into + * the output buffer. + */ + b_putblk(out, istptr(ist_block), empty_line_offset); + /* Add an extra space. */ + b_putchr(out, ' '); + + /* Keep looking for empty lines in the rest of the data. */ + ist_block = istadv(ist_block, empty_line_offset); + + ist_double_lf = istist(ist_block, double_lf); + } + + retval = (b_istput(out, ist_block) <= 0); + } + +end: + if (bio) + BIO_free(bio); + + OCSP_RESPONSE_free(resp); + + return retval; +} + +/* + * Dump the contents of an OCSP response in DER format stored in + * <ocsp_response> into buffer <out> after converting it to base64. + * Returns 0 in case of success. + */ +static int ssl_ocsp_response_print_base64(struct buffer *ocsp_response, struct buffer *out) +{ + int b64len = 0; + + b64len = a2base64(b_orig(ocsp_response), b_data(ocsp_response), + b_orig(out), b_size(out)); + + if (b64len < 0) + return 1; + + out->data = b64len; + + /* Add empty line */ + chunk_appendf(ocsp_response, "\n"); + + return 0; +} + +/* + * Dump the details of the OCSP response of ID <ocsp_certid> into buffer <out>. + * Returns 0 in case of success. + */ +int ssl_get_ocspresponse_detail(unsigned char *ocsp_certid, struct buffer *out) +{ + struct certificate_ocsp *ocsp; + int ret = 0; + + HA_SPIN_LOCK(OCSP_LOCK, &ocsp_tree_lock); + ocsp = (struct certificate_ocsp *)ebmb_lookup(&cert_ocsp_tree, ocsp_certid, OCSP_MAX_CERTID_ASN1_LENGTH); + if (!ocsp) { + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + return -1; + } + + ret = ssl_ocsp_response_print(&ocsp->response, out); + + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + + return ret; +} + + +/* IO handler of details "show ssl ocsp-response <id>". + * The current entry is taken from appctx->svcctx. 
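+ * Usage sketch (certid shortened, values illustrative): + *   echo "show ssl ocsp-response text 303b300906052b0e03021a..." | socat stdio unix-connect:/var/run/haproxy.sock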
+ */ +static int cli_io_handler_show_ocspresponse_detail(struct appctx *appctx) +{ + struct buffer *trash = get_trash_chunk(); + struct show_ocspresp_cli_ctx *ctx = appctx->svcctx; + struct certificate_ocsp *ocsp = ctx->ocsp; + int retval = 0; + + switch (ctx->format) { + case SHOW_OCSPRESP_FMT_DFLT: + case SHOW_OCSPRESP_FMT_TEXT: + retval = ssl_ocsp_response_print(&ocsp->response, trash); + break; + case SHOW_OCSPRESP_FMT_B64: + retval = ssl_ocsp_response_print_base64(&ocsp->response, trash); + break; + } + + if (retval) + return 1; + + if (applet_putchk(appctx, trash) == -1) + goto yield; + + appctx->svcctx = NULL; + return 1; + +yield: + return 0; +} + +void ssl_sock_ocsp_free_func(void *parent, void *ptr, CRYPTO_EX_DATA *ad, int idx, long argl, void *argp) +{ + struct ocsp_cbk_arg *ocsp_arg; + + if (ptr) { + ocsp_arg = ptr; + + if (ocsp_arg->is_single) { + ssl_sock_free_ocsp_instance(ocsp_arg->s_ocsp); + ocsp_arg->s_ocsp = NULL; + } else { + int i; + + for (i = 0; i < SSL_SOCK_NUM_KEYTYPES; i++) { + ssl_sock_free_ocsp_instance(ocsp_arg->m_ocsp[i]); + ocsp_arg->m_ocsp[i] = NULL; + } + } + free(ocsp_arg); + } +} + +/* + * Extract the first OCSP URI (if any) contained in <cert> and write it into + * <out>. + * Returns 0 in case of success, 1 otherwise. + */ +int ssl_ocsp_get_uri_from_cert(X509 *cert, struct buffer *out, char **err) +{ + STACK_OF(OPENSSL_STRING) *ocsp_uri_stk = NULL; + int ret = 1; + + if (!cert || !out) + goto end; + + ocsp_uri_stk = X509_get1_ocsp(cert); + if (ocsp_uri_stk == NULL) { + memprintf(err, "%sNo OCSP URL stack!\n", *err ? *err : ""); + goto end; + } + + if (!chunk_strcpy(out, sk_OPENSSL_STRING_value(ocsp_uri_stk, 0))) { + memprintf(err, "%sOCSP URI too long!\n", *err ? *err : ""); + goto end; + } + if (b_data(out) == 0) { + memprintf(err, "%sNo OCSP URL!\n", *err ? *err : ""); + goto end; + } + + ret = 0; + +end: + X509_email_free(ocsp_uri_stk); + return ret; +} + +/* + * Create the url and request body that make a proper OCSP request for the + * <certid>. The <req_url> parameter should already hold the OCSP URI that was + * extracted from the corresponding certificate. Depending on the size of the + * certid we will either append data to the <req_url> to create a proper URL + * that will be sent with a GET command, or the <req_body> will be constructed + * in case of a POST. + * Returns 0 in case of success. + */ +int ssl_ocsp_create_request_details(const OCSP_CERTID *certid, struct buffer *req_url, + struct buffer *req_body, char **err) +{ + int errcode = -1; + OCSP_REQUEST *ocsp; + struct buffer *bin_request = get_trash_chunk(); + unsigned char *outbuf = (unsigned char*)b_orig(bin_request); + + ocsp = OCSP_REQUEST_new(); + if (ocsp == NULL) { + memprintf(err, "%sCan't create OCSP_REQUEST\n", *err ? *err : ""); + goto end; + } + + if (OCSP_request_add0_id(ocsp, (OCSP_CERTID*)certid) == NULL) { + memprintf(err, "%sOCSP_request_add0_id() error\n", *err ? *err : ""); + goto end; + } + + bin_request->data = i2d_OCSP_REQUEST(ocsp, &outbuf); + if (b_data(bin_request) <= 0) { + memprintf(err, "%si2d_OCSP_REQUEST() error\n", *err ? *err : ""); + goto end; + } + + /* HTTP based OCSP requests can use either the GET or the POST method to + * submit their requests. To enable HTTP caching, small requests (that + * after encoding are less than 255 bytes), MAY be submitted using GET. + * If HTTP caching is not important, or the request is greater than 255 + * bytes, the request SHOULD be submitted using POST. 
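+ * (This mirrors RFC 6960, appendix A.1.) As an illustrative order of + * magnitude, a single-certid request is roughly 70-85 bytes of DER, + * i.e. about 100 base64 characters before URL-encoding, so GET is the + * common case unless the responder URL is unusually long.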
+ */ + if (b_data(bin_request) + b_data(req_url) < 0xff) { + struct buffer *b64buf = get_trash_chunk(); + char *ret = NULL; + int base64_ret = 0; + + chunk_strcat(req_url, "/"); + + base64_ret = a2base64(b_orig(bin_request), b_data(bin_request), + b_orig(b64buf), b_size(b64buf)); + + if (base64_ret < 0) { + memprintf(err, "%sa2base64() error\n", *err ? *err : ""); + goto end; + } + + b64buf->data = base64_ret; + + ret = encode_chunk((char*)b_stop(req_url), b_orig(req_url) + b_size(req_url), '%', + query_encode_map, b64buf); + if (ret && *ret == '\0') { + req_url->data = ret - b_orig(req_url); + errcode = 0; + } + } + else { + chunk_cpy(req_body, bin_request); + errcode = 0; + } + + +end: + OCSP_REQUEST_free(ocsp); + + return errcode; +} + +/* + * Parse an OCSP_RESPONSE contained in <respbuf> and check its validity in + * regard to the contents of <ckch> or the <issuer> certificate. + * Certificate_ocsp structure does not keep a reference to the corresponding + * ckch_store so outside of a CLI context (see "send ssl ocsp-response" + * command), we only have an easy access to the issuer's certificate whose + * reference is held in the structure. + * Return 0 in case of success, 1 otherwise. + */ +int ssl_ocsp_check_response(STACK_OF(X509) *chain, X509 *issuer, + struct buffer *respbuf, char **err) +{ + int ret = 1; + int n; + OCSP_RESPONSE *response = NULL; + OCSP_BASICRESP *basic = NULL; + X509_STORE *store = NULL; + const unsigned char *start = (const unsigned char*)b_orig(respbuf); + + if (!chain && !issuer) { + memprintf(err, "check_ocsp_response needs a certificate validation chain or an issuer certificate"); + goto end; + } + + response = d2i_OCSP_RESPONSE(NULL, &start, b_data(respbuf)); + if (!response) { + memprintf(err, "d2i_OCSP_RESPONSE() failed"); + goto end; + } + + n = OCSP_response_status(response); + + if (n != OCSP_RESPONSE_STATUS_SUCCESSFUL) { + memprintf(err, "OCSP response not successful (%d: %s)", + n, OCSP_response_status_str(n)); + goto end; + } + + basic = OCSP_response_get1_basic(response); + if (basic == NULL) { + memprintf(err, "OCSP_response_get1_basic() failed"); + goto end; + } + + /* Create a temporary store in which we add the certificate's chain + * certificates. We assume that all those certificates can be trusted + * because they were provided by the user. + * The only ssl item that needs to be verified here is the OCSP + * response. 
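+ * OCSP_TRUSTOTHER below makes OCSP_basic_verify() trust a responder + * certificate that is found among these explicitly supplied ones, which + * is why this user-provided store is sufficient.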
+ */ + store = X509_STORE_new(); + if (!store) { + memprintf(err, "X509_STORE_new() failed"); + goto end; + } + + if (chain) { + int i = 0; + for (i = 0; i < sk_X509_num(chain); i++) { + X509 *cert = sk_X509_value(chain, i); + X509_STORE_add_cert(store, cert); + } + } + + if (issuer) + X509_STORE_add_cert(store, issuer); + + if (OCSP_basic_verify(basic, chain, store, OCSP_TRUSTOTHER) != 1) { + memprintf(err, "OCSP_basic_verify() failed"); + goto end; + } + + ret = 0; + +end: + X509_STORE_free(store); + OCSP_RESPONSE_free(response); + OCSP_BASICRESP_free(basic); + return ret; +} + + +/* + * OCSP-UPDATE RELATED FUNCTIONS AND STRUCTURES + */ + +struct task *ocsp_update_task __read_mostly = NULL; +static struct proxy *httpclient_ocsp_update_px; + +static struct ssl_ocsp_task_ctx { + struct certificate_ocsp *cur_ocsp; + struct httpclient *hc; + struct appctx *appctx; + int flags; + int update_status; +} ssl_ocsp_task_ctx; + +const struct http_hdr ocsp_request_hdrs[] = { + { IST("Content-Type"), IST("application/ocsp-request") }, + { IST_NULL, IST_NULL } +}; + +enum { + OCSP_UPDT_UNKNOWN = 0, + OCSP_UPDT_OK = 1, + OCSP_UPDT_ERR_HTTP_STATUS = 2, + OCSP_UPDT_ERR_HTTP_HDR = 3, + OCSP_UPDT_ERR_CHECK = 4, + OCSP_UPDT_ERR_INSERT = 5, + OCSP_UPDT_ERR_LAST /* Must be last */ +}; + +const struct ist ocsp_update_errors[] = { + [OCSP_UPDT_UNKNOWN] = IST("Unknown"), + [OCSP_UPDT_OK] = IST("Update successful"), + [OCSP_UPDT_ERR_HTTP_STATUS] = IST("HTTP error"), + [OCSP_UPDT_ERR_HTTP_HDR] = IST("Missing \"ocsp-response\" header"), + [OCSP_UPDT_ERR_CHECK] = IST("OCSP response check failure"), + [OCSP_UPDT_ERR_INSERT] = IST("Error during insertion") +}; + +static struct task *ssl_ocsp_update_responses(struct task *task, void *context, unsigned int state); + +/* + * Create the main OCSP update task that will iterate over the OCSP responses + * stored in ocsp_update_tree and send an OCSP request via the http_client + * applet to the corresponding OCSP responder. The task will then be in charge + * of processing the response, verifying it and resinserting it in the actual + * ocsp response tree if the response is valid. + * Returns 0 in case of success. + */ +int ssl_create_ocsp_update_task(char **err) +{ + if (ocsp_update_task) + return 0; /* Already created */ + + ocsp_update_task = task_new_anywhere(); + if (!ocsp_update_task) { + memprintf(err, "parsing : failed to allocate global ocsp update task."); + return -1; + } + + ocsp_update_task->process = ssl_ocsp_update_responses; + ocsp_update_task->context = NULL; + + return 0; +} + +static int ssl_ocsp_task_schedule() +{ + if (ocsp_update_task) + task_schedule(ocsp_update_task, now_ms); + + return 0; +} +REGISTER_POST_CHECK(ssl_ocsp_task_schedule); + +void ssl_sock_free_ocsp(struct certificate_ocsp *ocsp); + +void ssl_destroy_ocsp_update_task(void) +{ + struct eb64_node *node, *next; + if (!ocsp_update_task) + return; + + HA_SPIN_LOCK(OCSP_LOCK, &ocsp_tree_lock); + + node = eb64_first(&ocsp_update_tree); + while (node) { + next = eb64_next(node); + eb64_delete(node); + node = next; + } + + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + + task_destroy(ocsp_update_task); + ocsp_update_task = NULL; + + ssl_sock_free_ocsp(ssl_ocsp_task_ctx.cur_ocsp); + ssl_ocsp_task_ctx.cur_ocsp = NULL; + + if (ssl_ocsp_task_ctx.hc) { + httpclient_stop_and_destroy(ssl_ocsp_task_ctx.hc); + ssl_ocsp_task_ctx.hc = NULL; + } +} + +static inline void ssl_ocsp_set_next_update(struct certificate_ocsp *ocsp) +{ + int update_margin = (ocsp->expire >= SSL_OCSP_UPDATE_MARGIN) ? 
SSL_OCSP_UPDATE_MARGIN : 0; + + ocsp->next_update.key = MIN(date.tv_sec + global_ssl.ocsp_update.delay_max, + ocsp->expire - update_margin); + + /* An already existing valid OCSP response that expires within less than + * SSL_OCSP_UPDATE_DELAY_MIN or has no 'Next Update' field should not be + * updated more than once every 5 minutes in order to avoid continuous + * update of the same response. */ + if (b_data(&ocsp->response)) + ocsp->next_update.key = MAX(ocsp->next_update.key, + date.tv_sec + global_ssl.ocsp_update.delay_min); +} + +/* + * Insert a certificate_ocsp structure into the ocsp_update_tree tree, in which + * entries are sorted by absolute date of the next update. The next_update key + * will be the smallest out of the actual expire value of the response and + * now+1H. This arbitrary 1H value ensures that ocsp responses are updated + * periodically even when they have a long expire time, while not overloading + * the system too much (in theory). Likewise, a minimum 5 minutes interval is + * defined in order to avoid updating too often responses that have a really + * short expire time or even no 'Next Update' at all. + */ +int ssl_ocsp_update_insert(struct certificate_ocsp *ocsp) +{ + /* Set next_update based on current time and the various OCSP + * minimum/maximum update times. + */ + ssl_ocsp_set_next_update(ocsp); + + ocsp->fail_count = 0; + + HA_SPIN_LOCK(OCSP_LOCK, &ocsp_tree_lock); + ocsp->updating = 0; + /* An entry with update_once set to 1 was only supposed to be updated + * once, it does not need to be reinserted into the update tree. + */ + if (!ocsp->update_once) + eb64_insert(&ocsp_update_tree, &ocsp->next_update); + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + + return 0; +} + +/* + * Reinsert an entry in the update tree. The entry's next update time can not + * occur before now+SSL_OCSP_HTTP_ERR_REPLAY. + * This is supposed to be used in case of http error (ocsp responder unreachable + * for instance). This ensures that the entry does not get reinserted at the + * beginning of the tree every time. + */ +int ssl_ocsp_update_insert_after_error(struct certificate_ocsp *ocsp) +{ + int replay_delay = 0; + + /* + * Set next_update based on current time and the various OCSP + * minimum/maximum update times. + */ + ssl_ocsp_set_next_update(ocsp); + + ++ocsp->fail_count; + + /* + * The replay delay will be increased for every consecutive update + * failure, up to the SSL_OCSP_UPDATE_DELAY_MAX delay. It will ensure + * that the replay delay will be one minute for the first failure and + * will be multiplied by 2 for every subsequent failures, while still + * being at most 1 hour (with the current default values). + */ + replay_delay = MIN(SSL_OCSP_HTTP_ERR_REPLAY * (1 << ocsp->fail_count), + global_ssl.ocsp_update.delay_max); + + if (ocsp->next_update.key < date.tv_sec + replay_delay) + ocsp->next_update.key = date.tv_sec + replay_delay; + + HA_SPIN_LOCK(OCSP_LOCK, &ocsp_tree_lock); + ocsp->updating = 0; + /* An entry with update_once set to 1 was only supposed to be updated + * once, it does not need to be reinserted into the update tree. 
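+ * For entries that are reinserted, the replay delay computed above + * roughly doubles on each consecutive failure, from the one-minute base + * up to the cap configured via tune.ssl.ocsp-update.maxdelay (defaults + * as cited above; figures illustrative).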
+ */ + if (!ocsp->update_once) + eb64_insert(&ocsp_update_tree, &ocsp->next_update); + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + + return 0; +} + +void ocsp_update_response_stline_cb(struct httpclient *hc) +{ + struct task *task = hc->caller; + + if (!task) + return; + + ssl_ocsp_task_ctx.flags |= HC_F_RES_STLINE; + task_wakeup(task, TASK_WOKEN_MSG); +} + +void ocsp_update_response_headers_cb(struct httpclient *hc) +{ + struct task *task = hc->caller; + + if (!task) + return; + + ssl_ocsp_task_ctx.flags |= HC_F_RES_HDR; + task_wakeup(task, TASK_WOKEN_MSG); +} + +void ocsp_update_response_body_cb(struct httpclient *hc) +{ + struct task *task = hc->caller; + + if (!task) + return; + + ssl_ocsp_task_ctx.flags |= HC_F_RES_BODY; + task_wakeup(task, TASK_WOKEN_MSG); +} + +void ocsp_update_response_end_cb(struct httpclient *hc) +{ + struct task *task = hc->caller; + + if (!task) + return; + + ssl_ocsp_task_ctx.flags |= HC_F_RES_END; + task_wakeup(task, TASK_WOKEN_MSG); +} + + +/* + * Send a log line that will use the dedicated proxy's error_logformat string. + * It uses the sess_log function instead of app_log in order to + * benefit from the "generic" items that can be added to a log format line such + * as the date and frontend name that can be found at the beginning of the + * ocspupdate_log_format line. + */ +static void ssl_ocsp_send_log() +{ + if (!ssl_ocsp_task_ctx.appctx) + return; + + sess_log(ssl_ocsp_task_ctx.appctx->sess); +} + +/* + * This is the main function of the ocsp auto update mechanism. It has two + * distinct parts and the branching to one or the other is completely based on + * the fact that the cur_ocsp pointer of the ssl_ocsp_task_ctx member is set. + * + * If the pointer is not set, we need to look at the first item of the update + * tree and see if it needs to be updated. If it does not, we simply wait until + * the time is right and leave the task asleep. If it does need to be updated, we + * simply build and send the corresponding ocsp request thanks to the + * http_client. The task is then sent to sleep with an expire time set to + * infinity. The http_client will wake it back up once the response is received + * (or a timeout occurs). Just note that during this whole process the + * certificate_ocsp object corresponding to the entry being updated is taken out + * of the update tree and only stored in the ssl_ocsp_task_ctx context. + * + * Once the task is woken up by the http_client, it branches on the response + * processing part of the function which basically checks that the response is + * valid and inserts it into the ocsp_response tree. The task then goes back to + * sleep until another entry needs to be updated. + */ +static struct task *ssl_ocsp_update_responses(struct task *task, void *context, unsigned int state) +{ + unsigned int next_wakeup = 0; + struct eb64_node *eb; + struct certificate_ocsp *ocsp; + struct httpclient *hc = NULL; + struct buffer *req_url = NULL; + struct buffer *req_body = NULL; + OCSP_CERTID *certid = NULL; + struct ssl_ocsp_task_ctx *ctx = &ssl_ocsp_task_ctx; + + if (ctx->cur_ocsp) { + /* An update is in process */ + ocsp = ctx->cur_ocsp; + hc = ctx->hc; + if (ctx->flags & HC_F_RES_STLINE) { + if (hc->res.status != 200) { + ctx->update_status = OCSP_UPDT_ERR_HTTP_STATUS; + goto http_error; + } + ctx->flags &= ~HC_F_RES_STLINE; + } + + if (ctx->flags & HC_F_RES_HDR) { + struct http_hdr *hdr; + int found = 0; + /* Look for "Content-Type" header which should have + * "application/ocsp-response" value. 
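+ * (RFC 6960, appendix A.2 mandates this media type for OCSP responses.)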
*/ + for (hdr = hc->res.hdrs; isttest(hdr->v); hdr++) { + if (isteqi(hdr->n, ist("Content-Type")) && + isteqi(hdr->v, ist("application/ocsp-response"))) { + found = 1; + break; + } + } + if (!found) { + ctx->update_status = OCSP_UPDT_ERR_HTTP_HDR; + goto http_error; + } + ctx->flags &= ~HC_F_RES_HDR; + } + + /* If the HC_F_RES_BODY is set, we still need for the + * HC_F_RES_END flag to be set as well in order to be sure that + * the body is complete. */ + + /* we must close only if F_RES_END is the last flag */ + if (ctx->flags & HC_F_RES_END) { + + /* Process the body that must be complete since + * HC_F_RES_END is set. */ + if (ctx->flags & HC_F_RES_BODY) { + if (ssl_ocsp_check_response(ocsp->chain, ocsp->issuer, &hc->res.buf, NULL)) { + ctx->update_status = OCSP_UPDT_ERR_CHECK; + goto http_error; + } + + if (ssl_sock_update_ocsp_response(&hc->res.buf, NULL) != 0) { + ctx->update_status = OCSP_UPDT_ERR_INSERT; + goto http_error; + } + + ctx->flags &= ~HC_F_RES_BODY; + } + + ctx->flags &= ~HC_F_RES_END; + + ++ocsp->num_success; + ocsp->last_update = date.tv_sec; + ctx->update_status = OCSP_UPDT_OK; + ocsp->last_update_status = ctx->update_status; + + ssl_ocsp_send_log(); + + /* Reinsert the entry into the update list so that it can be updated later */ + ssl_ocsp_update_insert(ocsp); + /* Release the reference kept on the updated ocsp response. */ + ssl_sock_free_ocsp_instance(ctx->cur_ocsp); + ctx->cur_ocsp = NULL; + + HA_SPIN_LOCK(OCSP_LOCK, &ocsp_tree_lock); + /* Set next_wakeup to the new first entry of the tree */ + eb = eb64_first(&ocsp_update_tree); + if (eb) { + if (eb->key > date.tv_sec) + next_wakeup = (eb->key - date.tv_sec)*1000; + else + next_wakeup = 0; + } + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + goto leave; + } + + /* We did not receive the HC_F_RES_END flag yet, wait for it + * before trying to update a new ocsp response. */ + goto wait; + } else { + /* Look for next entry that needs to be updated. */ + const unsigned char *p = NULL; + + HA_SPIN_LOCK(OCSP_LOCK, &ocsp_tree_lock); + + eb = eb64_first(&ocsp_update_tree); + if (!eb) { + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + goto wait; + } + + if (eb->key > date.tv_sec) { + next_wakeup = (eb->key - date.tv_sec)*1000; + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + goto leave; + } + + ocsp = eb64_entry(eb, struct certificate_ocsp, next_update); + + /* Take the current entry out of the update tree, it will be + * reinserted after the response is processed. */ + eb64_delete(&ocsp->next_update); + + ocsp->updating = 1; + ocsp->refcount_instance++; + ctx->cur_ocsp = ocsp; + ocsp->last_update_status = OCSP_UPDT_UNKNOWN; + + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + + req_url = alloc_trash_chunk(); + if (!req_url) { + goto leave; + } + req_body = alloc_trash_chunk(); + if (!req_body) { + goto leave; + } + + p = ocsp->key_data; + + d2i_OCSP_CERTID(&certid, &p, ocsp->key_length); + if (!certid) + goto leave; + + /* Copy OCSP URI stored in ocsp structure into req_url */ + chunk_cpy(req_url, ocsp->uri); + + /* Create ocsp request */ + if (ssl_ocsp_create_request_details(certid, req_url, req_body, NULL) != 0) { + goto leave; + } + + /* Depending on the processing that occurred in + * ssl_ocsp_create_request_details we could either have to send + * a GET or a POST request. */ + hc = httpclient_new_from_proxy(httpclient_ocsp_update_px, task, + b_data(req_body) ? 
HTTP_METH_POST : HTTP_METH_GET, + ist2(b_orig(req_url), b_data(req_url))); + if (!hc) { + goto leave; + } + + if (httpclient_req_gen(hc, hc->req.url, hc->req.meth, + b_data(req_body) ? ocsp_request_hdrs : NULL, + b_data(req_body) ? ist2(b_orig(req_body), b_data(req_body)) : IST_NULL) != ERR_NONE) { + goto leave; + } + + hc->ops.res_stline = ocsp_update_response_stline_cb; + hc->ops.res_headers = ocsp_update_response_headers_cb; + hc->ops.res_payload = ocsp_update_response_body_cb; + hc->ops.res_end = ocsp_update_response_end_cb; + + if (!(ctx->appctx = httpclient_start(hc))) { + goto leave; + } + + ctx->flags = 0; + ctx->hc = hc; + + /* We keep the lock, this indicates that an update is in process. */ + goto wait; + } + +leave: + if (ctx->cur_ocsp) { + /* Something went wrong, reinsert the entry in the tree. */ + ++ctx->cur_ocsp->num_failure; + ssl_ocsp_update_insert_after_error(ctx->cur_ocsp); + /* Release the reference kept on the updated ocsp response. */ + ssl_sock_free_ocsp_instance(ctx->cur_ocsp); + ctx->cur_ocsp = NULL; + } + if (hc) + httpclient_stop_and_destroy(hc); + ctx->hc = NULL; + free_trash_chunk(req_url); + free_trash_chunk(req_body); + task->expire = tick_add(now_ms, next_wakeup); + return task; + +wait: + free_trash_chunk(req_url); + free_trash_chunk(req_body); + task->expire = TICK_ETERNITY; + return task; + +http_error: + ssl_ocsp_send_log(); + /* Reinsert certificate into update list so that it can be updated later */ + if (ocsp) { + ++ocsp->num_failure; + ocsp->last_update_status = ctx->update_status; + ssl_ocsp_update_insert_after_error(ocsp); + } + + if (hc) + httpclient_stop_and_destroy(hc); + /* Release the reference kept on the updated ocsp response. */ + ssl_sock_free_ocsp_instance(ctx->cur_ocsp); + HA_SPIN_LOCK(OCSP_LOCK, &ocsp_tree_lock); + /* Set next_wakeup to the new first entry of the tree */ + eb = eb64_first(&ocsp_update_tree); + if (eb) { + if (eb->key > date.tv_sec) + next_wakeup = (eb->key - date.tv_sec)*1000; + else + next_wakeup = 0; + } + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + ctx->cur_ocsp = NULL; + ctx->hc = NULL; + ctx->flags = 0; + task->expire = tick_add(now_ms, next_wakeup); + return task; +} + +char ocspupdate_log_format[] = "%ci:%cp [%tr] %ft %[ssl_ocsp_certname] %[ssl_ocsp_status] %{+Q}[ssl_ocsp_status_str] %[ssl_ocsp_fail_cnt] %[ssl_ocsp_success_cnt]"; + +/* + * Initialize the proxy for the OCSP update HTTP client with 2 servers, one for + * raw HTTP, the other for HTTPS. 
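+ * In configuration terms, the precheck below is roughly equivalent to + * this illustrative sketch (not actual shipped configuration): + *   backend <OCSP-UPDATE> + *       option dontlog-normal + *       error-log-format "%ci:%cp [%tr] %ft %[ssl_ocsp_certname] ..."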
+ */ +static int ssl_ocsp_update_precheck() +{ + /* initialize the OCSP update dedicated httpclient */ + httpclient_ocsp_update_px = httpclient_create_proxy("<OCSP-UPDATE>"); + if (!httpclient_ocsp_update_px) + return 1; + httpclient_ocsp_update_px->conf.error_logformat_string = strdup(ocspupdate_log_format); + httpclient_ocsp_update_px->conf.logformat_string = httpclient_log_format; + httpclient_ocsp_update_px->options2 |= PR_O2_NOLOGNORM; + + return 0; +} + +/* initialize the proxy and servers for the HTTP client */ + +REGISTER_PRE_CHECK(ssl_ocsp_update_precheck); + + +static int cli_parse_update_ocsp_response(char **args, char *payload, struct appctx *appctx, void *private) +{ + char *err = NULL; + struct ckch_store *ckch_store = NULL; + struct certificate_ocsp *ocsp = NULL; + int update_once = 0; + unsigned char key[OCSP_MAX_CERTID_ASN1_LENGTH] = {}; + unsigned char *p; + + if (!*args[3]) { + memprintf(&err, "'update ssl ocsp-response' expects a filename\n"); + return cli_dynerr(appctx, err); + } + + /* The operations on the CKCH architecture are locked so we can + * manipulate ckch_store and ckch_inst */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) { + memprintf(&err, "%sCan't update the certificate!\nOperations on certificates are currently locked!\n", err ? err : ""); + goto end; + } + + ckch_store = ckchs_lookup(args[3]); + + if (!ckch_store) { + memprintf(&err, "%sUnknown certificate! 'update ssl ocsp-response' expects an already known certificate file name.\n", err ? err : ""); + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + goto end; + } + + p = key; + i2d_OCSP_CERTID(ckch_store->data->ocsp_cid, &p); + + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + + + HA_SPIN_LOCK(OCSP_LOCK, &ocsp_tree_lock); + ocsp = (struct certificate_ocsp *)ebmb_lookup(&cert_ocsp_tree, key, OCSP_MAX_CERTID_ASN1_LENGTH); + if (!ocsp) { + memprintf(&err, "%s'update ssl ocsp-response' only works on certificates that already have a known OCSP response.\n", err ? err : ""); + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + goto end; + } + + /* No need to try to update this response, it is already being updated. */ + if (!ocsp->updating) { + update_once = (ocsp->next_update.node.leaf_p == NULL); + eb64_delete(&ocsp->next_update); + + /* Insert the entry at the beginning of the update tree. + * We don't need to increase the reference counter on the + * certificate_ocsp structure because we would not have a way to + * decrease it afterwards since this update operation is asynchronous. + * If the corresponding entry were to be destroyed before the update can + * be performed, which is pretty unlikely, it would not be such a + * problem because that would mean that the OCSP response is not + * actually used. + */ + ocsp->next_update.key = 0; + eb64_insert(&ocsp_update_tree, &ocsp->next_update); + ocsp->update_once = update_once; + } + + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + + if (!ocsp_update_task) + ssl_create_ocsp_update_task(&err); + + task_wakeup(ocsp_update_task, TASK_WOKEN_MSG); + + free(err); + + return 0; + +end: + return cli_dynerr(appctx, memprintf(&err, "%sCan't send ocsp request for %s!\n", err ? 
err : "", args[3])); +} + +#endif /* !defined OPENSSL_IS_BORINGSSL */ + + +#endif /* (defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) */ + + +static int cli_parse_set_ocspresponse(char **args, char *payload, struct appctx *appctx, void *private) +{ +#if (defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) + char *err = NULL; + int i, j, ret; + + if (!payload) + payload = args[3]; + + /* Expect one parameter: the new response in base64 encoding */ + if (!*payload) + return cli_err(appctx, "'set ssl ocsp-response' expects response in base64 encoding.\n"); + + /* remove \r and \n from the payload */ + for (i = 0, j = 0; payload[i]; i++) { + if (payload[i] == '\r' || payload[i] == '\n') + continue; + payload[j++] = payload[i]; + } + payload[j] = 0; + + ret = base64dec(payload, j, trash.area, trash.size); + if (ret < 0) + return cli_err(appctx, "'set ssl ocsp-response' received invalid base64 encoded response.\n"); + + trash.data = ret; + if (ssl_sock_update_ocsp_response(&trash, &err)) { + if (err) + return cli_dynerr(appctx, memprintf(&err, "%s.\n", err)); + else + return cli_err(appctx, "Failed to update OCSP response.\n"); + } + + return cli_msg(appctx, LOG_INFO, "OCSP Response updated!\n"); +#else + return cli_err(appctx, "HAProxy was compiled against a version of OpenSSL that doesn't support OCSP stapling.\n"); +#endif + +} + +/* parsing function for 'show ssl ocsp-response [id]'. If an entry is forced, + * it's set into appctx->svcctx. + */ +static int cli_parse_show_ocspresponse(char **args, char *payload, struct appctx *appctx, void *private) +{ +#if ((defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) && !defined OPENSSL_IS_BORINGSSL) + + struct show_ocspresp_cli_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + int arg_idx = 3; + + if (*args[3]) { + struct certificate_ocsp *ocsp = NULL; + char key[OCSP_MAX_CERTID_ASN1_LENGTH] = {}; + int key_length = OCSP_MAX_CERTID_ASN1_LENGTH; + char *key_ptr = key; + unsigned char *p; + struct ckch_store *ckch_store = NULL; + + if (strcmp(args[3], "text") == 0) { + ctx->format = SHOW_OCSPRESP_FMT_TEXT; + ++arg_idx; + } else if (strcmp(args[3], "base64") == 0) { + ctx->format = SHOW_OCSPRESP_FMT_B64; + ++arg_idx; + } + + if (ctx->format != SHOW_OCSPRESP_FMT_DFLT && !*args[arg_idx]) + return cli_err(appctx, "'show ssl ocsp-response [text|base64]' expects a valid certid.\n"); + + /* Try to convert parameter into an OCSP certid first, and consider it + * as a filename if it fails. 
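+ * e.g. both forms are accepted (values illustrative): + *   show ssl ocsp-response 303b300906052b0e03021a05000414... + *   show ssl ocsp-response /etc/haproxy/certs/site.pem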
*/ + if (strlen(args[arg_idx]) > OCSP_MAX_CERTID_ASN1_LENGTH*2 || + !parse_binary(args[arg_idx], &key_ptr, &key_length, NULL)) { + + key_ptr = key; + key_length = 0; + + /* The operations on the CKCH architecture are locked so we can + * manipulate ckch_store and ckch_inst */ + if (HA_SPIN_TRYLOCK(CKCH_LOCK, &ckch_lock)) { + return cli_err(appctx, "Operations on certificates are currently locked!\n"); + } + + ckch_store = ckchs_lookup(args[arg_idx]); + + if (ckch_store) { + p = (unsigned char*)key; + key_length = i2d_OCSP_CERTID(ckch_store->data->ocsp_cid, &p); + } + HA_SPIN_UNLOCK(CKCH_LOCK, &ckch_lock); + } + + if (key_length == 0) { + return cli_err(appctx, "'show ssl ocsp-response' expects a valid certid or certificate path.\n"); + } + + HA_SPIN_LOCK(OCSP_LOCK, &ocsp_tree_lock); + ocsp = (struct certificate_ocsp *)ebmb_lookup(&cert_ocsp_tree, key, OCSP_MAX_CERTID_ASN1_LENGTH); + + if (!ocsp) { + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + return cli_err(appctx, "Certificate ID or path does not match any certificate.\n"); + } + ocsp->refcount_instance++; + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + + ctx->ocsp = ocsp; + appctx->io_handler = cli_io_handler_show_ocspresponse_detail; + } + + return 0; + +#else + return cli_err(appctx, "HAProxy was compiled against a version of OpenSSL that doesn't support OCSP stapling.\n"); +#endif +} + +/* + * IO handler of "show ssl ocsp-response". The command taking a specific ID + * is managed in cli_io_handler_show_ocspresponse_detail. + * The current entry is taken from appctx->svcctx. + */ +static int cli_io_handler_show_ocspresponse(struct appctx *appctx) +{ +#if ((defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) && !defined OPENSSL_IS_BORINGSSL) + struct buffer *trash = alloc_trash_chunk(); + struct buffer *tmp = NULL; + struct ebmb_node *node; + struct certificate_ocsp *ocsp = NULL; + BIO *bio = NULL; + int write = -1; + struct show_ocspresp_cli_ctx *ctx = appctx->svcctx; + + if (trash == NULL) + return 1; + + HA_SPIN_LOCK(OCSP_LOCK, &ocsp_tree_lock); + + tmp = alloc_trash_chunk(); + if (!tmp) + goto end; + + if ((bio = BIO_new(BIO_s_mem())) == NULL) + goto end; + + if (!ctx->ocsp) { + chunk_appendf(trash, "# Certificate IDs\n"); + node = ebmb_first(&cert_ocsp_tree); + } else { + node = &ctx->ocsp->key; + } + + while (node) { + OCSP_CERTID *certid = NULL; + const unsigned char *p = NULL; + int i; + + ocsp = ebmb_entry(node, struct certificate_ocsp, key); + + /* Dump the key in hexadecimal */ + chunk_appendf(trash, "Certificate ID key : "); + for (i = 0; i < ocsp->key_length; ++i) { + chunk_appendf(trash, "%02x", ocsp->key_data[i]); + } + chunk_appendf(trash, "\n"); + + /* Dump the certificate path */ + chunk_appendf(trash, "Certificate path : %s\n", ocsp->path); + + p = ocsp->key_data; + + /* Decode the certificate ID (serialized into the key). 
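+ * d2i_OCSP_CERTID() allocates a fresh OCSP_CERTID and advances <p>; it + * is released with OCSP_CERTID_free() once printed below.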
*/ + d2i_OCSP_CERTID(&certid, &p, ocsp->key_length); + if (!certid) + goto end; + + /* Dump the CERTID info */ + ocsp_certid_print(bio, certid, 1); + OCSP_CERTID_free(certid); + write = BIO_read(bio, tmp->area, tmp->size-1); + /* strip trailing LFs */ + while (write > 0 && tmp->area[write-1] == '\n') + write--; + tmp->area[write] = '\0'; + + chunk_appendf(trash, "%s\n", tmp->area); + + node = ebmb_next(node); + if (applet_putchk(appctx, trash) == -1) + goto yield; + } + +end: + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + free_trash_chunk(trash); + free_trash_chunk(tmp); + BIO_free(bio); + return 1; + +yield: + free_trash_chunk(trash); + free_trash_chunk(tmp); + BIO_free(bio); + + ocsp->refcount_instance++; + ctx->ocsp = ocsp; + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + return 0; +#else + return cli_err(appctx, "HAProxy was compiled against a version of OpenSSL that doesn't support OCSP stapling.\n"); +#endif +} + +static void cli_release_show_ocspresponse(struct appctx *appctx) +{ + struct show_ocspresp_cli_ctx *ctx = appctx->svcctx; + + if (ctx) + ssl_sock_free_ocsp(ctx->ocsp); +} + +/* Check if the ckch_store and the entry does have the same configuration */ +int ocsp_update_check_cfg_consistency(struct ckch_store *store, struct crtlist_entry *entry, char *crt_path, char **err) +{ + int err_code = ERR_NONE; + + if (store->data->ocsp_update_mode != SSL_SOCK_OCSP_UPDATE_DFLT || entry->ssl_conf) { + if ((!entry->ssl_conf && store->data->ocsp_update_mode == SSL_SOCK_OCSP_UPDATE_ON) + || (entry->ssl_conf && store->data->ocsp_update_mode != entry->ssl_conf->ocsp_update)) { + memprintf(err, "%sIncompatibilities found in OCSP update mode for certificate %s\n", err && *err ? *err : "", crt_path); + err_code |= ERR_ALERT | ERR_FATAL; + } + } + return err_code; +} + +struct show_ocsp_updates_ctx { + struct certificate_ocsp *cur_ocsp; +}; + +/* + * Parsing function for 'show ssl ocsp-updates [nb]'. + */ +static int cli_parse_show_ocsp_updates(char **args, char *payload, struct appctx *appctx, void *private) +{ +#if ((defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) && !defined OPENSSL_IS_BORINGSSL) + struct show_ocsp_updates_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + HA_SPIN_LOCK(OCSP_LOCK, &ocsp_tree_lock); + + return 0; +#else + return cli_err(appctx, "HAProxy was compiled against a version of OpenSSL that doesn't support OCSP stapling.\n"); +#endif +} + +/* + * Dump information about an ocsp response concerning ocsp auto update. + * It follows the following format : + * OCSP Certid | Path | Next Update | Last Update | Successes | Failures | Last Update Status | Last Update Status (str) + * Return 0 in case of success. 
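+ * Illustrative output line (certid shortened): + *   303b3009...0414 | /etc/haproxy/certs/site.pem | 11/Jan/2024:10:00:00 +0100 | - | 0 | 1 | 2 | HTTP error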
+ */ +static int dump_ocsp_update_info(struct certificate_ocsp *ocsp, struct buffer *out) +{ + struct tm tm = {}; + char *ret; + int i; + time_t next_update; + + /* Dump OCSP certid */ + for (i = 0; i < ocsp->key_length; ++i) { + chunk_appendf(out, "%02x", ocsp->key_data[i]); + } + + chunk_appendf(out, " | "); + + /* Dump path */ + chunk_appendf(out, "%s", ocsp->path); + + chunk_appendf(out, " | "); + + /* Dump next update time */ + if (ocsp->next_update.key != 0) { + next_update = ocsp->next_update.key; + get_localtime(ocsp->next_update.key, &tm); + } else { + next_update = date.tv_sec; + get_localtime(date.tv_sec, &tm); + } + ret = localdate2str_log(b_orig(out)+b_data(out), next_update, &tm, b_size(out)-b_data(out)); + + if (ret == NULL) + return 1; + + out->data = (ret - out->area); + + chunk_appendf(out, " | "); + + /* Dump last update time or "-" if no update occurred yet */ + if (ocsp->last_update) { + get_localtime(ocsp->last_update, &tm); + ret = localdate2str_log(b_orig(out)+b_data(out), ocsp->last_update, &tm, b_size(out)-b_data(out)); + + if (ret == NULL) + return 1; + + out->data = (ret - out->area); + } else + chunk_appendf(out, "-"); + + chunk_appendf(out, " | "); + + /* Number of successful updates */ + chunk_appendf(out, "%d", ocsp->num_success); + + chunk_appendf(out, " | "); + + /* Number of failed updates */ + chunk_appendf(out, "%d", ocsp->num_failure); + + chunk_appendf(out, " | "); + + /* Last update status */ + chunk_appendf(out, "%d", ocsp->last_update_status); + + chunk_appendf(out, " | "); + + /* Last update status str */ + if (ocsp->last_update_status >= OCSP_UPDT_ERR_LAST) + chunk_appendf(out, "-"); + else + chunk_appendf(out, "%s", istptr(ocsp_update_errors[ocsp->last_update_status])); + + chunk_appendf(out, "\n"); + + return 0; +} + +static int cli_io_handler_show_ocsp_updates(struct appctx *appctx) +{ + struct show_ocsp_updates_ctx *ctx = appctx->svcctx; + struct eb64_node *node; + struct certificate_ocsp *ocsp = NULL; + struct buffer *trash = get_trash_chunk(); + + if (!ctx->cur_ocsp) { + node = eb64_first(&ocsp_update_tree); + chunk_appendf(trash, "OCSP Certid | Path | Next Update | Last Update | Successes | Failures | Last Update Status | Last Update Status (str)\n"); + + /* Look for an entry currently being updated */ + ocsp = ssl_ocsp_task_ctx.cur_ocsp; + if (ocsp) { + if (dump_ocsp_update_info(ocsp, trash)) + goto end; + } + + if (applet_putchk(appctx, trash) == -1) + goto yield; + + } else { + node = &((struct certificate_ocsp*)ctx->cur_ocsp)->next_update; + } + + while (node) { + ocsp = eb64_entry(node, struct certificate_ocsp, next_update); + + chunk_reset(trash); + if (dump_ocsp_update_info(ocsp, trash)) + goto end; + + if (applet_putchk(appctx, trash) == -1) { + ctx->cur_ocsp = ocsp; + goto yield; + } + + node = eb64_next(node); + } + +end: + return 1; + +yield: + return 0; /* should come back */ +} + +static void cli_release_show_ocsp_updates(struct appctx *appctx) +{ + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); +} + + +static int +smp_fetch_ssl_ocsp_certid(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct buffer *data = get_trash_chunk(); + struct certificate_ocsp *ocsp = ssl_ocsp_task_ctx.cur_ocsp; + + if (!ocsp) + return 0; + + dump_binary(data, (char *)ocsp->key_data, ocsp->key_length); + + smp->data.type = SMP_T_STR; + smp->data.u.str = *data; + return 1; +} + +static int +smp_fetch_ssl_ocsp_certname(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct certificate_ocsp 
*ocsp = ssl_ocsp_task_ctx.cur_ocsp; + + if (!ocsp) + return 0; + + smp->data.type = SMP_T_STR; + smp->data.u.str.area = ocsp->path; + smp->data.u.str.data = strlen(ocsp->path); + return 1; +} + +static int +smp_fetch_ssl_ocsp_status(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct certificate_ocsp *ocsp = ssl_ocsp_task_ctx.cur_ocsp; + + if (!ocsp) + return 0; + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = ssl_ocsp_task_ctx.update_status; + return 1; +} + +static int +smp_fetch_ssl_ocsp_status_str(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct certificate_ocsp *ocsp = ssl_ocsp_task_ctx.cur_ocsp; + + if (!ocsp) + return 0; + + if (ssl_ocsp_task_ctx.update_status >= OCSP_UPDT_ERR_LAST) + return 0; + + smp->data.type = SMP_T_STR; + smp->data.u.str = ist2buf(ocsp_update_errors[ssl_ocsp_task_ctx.update_status]); + + return 1; +} + +static int +smp_fetch_ssl_ocsp_fail_cnt(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct certificate_ocsp *ocsp = ssl_ocsp_task_ctx.cur_ocsp; + + if (!ocsp) + return 0; + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = ocsp->num_failure; + return 1; +} + +static int +smp_fetch_ssl_ocsp_success_cnt(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct certificate_ocsp *ocsp = ssl_ocsp_task_ctx.cur_ocsp; + + if (!ocsp) + return 0; + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = ocsp->num_success; + return 1; +} + + +static struct cli_kw_list cli_kws = {{ },{ + { { "set", "ssl", "ocsp-response", NULL }, "set ssl ocsp-response <resp|payload> : update a certificate's OCSP Response from a base64-encoded DER", cli_parse_set_ocspresponse, NULL }, + + { { "show", "ssl", "ocsp-response", NULL },"show ssl ocsp-response [[text|base64] id] : display the IDs of the OCSP responses used in memory, or the details of a single OCSP response (in text or base64 format)", cli_parse_show_ocspresponse, cli_io_handler_show_ocspresponse, cli_release_show_ocspresponse }, + { { "show", "ssl", "ocsp-updates", NULL }, "show ssl ocsp-updates : display information about the next 'nb' ocsp responses that will be updated automatically", cli_parse_show_ocsp_updates, cli_io_handler_show_ocsp_updates, cli_release_show_ocsp_updates }, +#if ((defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) && !defined OPENSSL_IS_BORINGSSL) + { { "update", "ssl", "ocsp-response", NULL }, "update ssl ocsp-response <certfile> : send ocsp request and update stored ocsp response", cli_parse_update_ocsp_response, NULL, NULL }, +#endif + { { NULL }, NULL, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + + +/* Note: must not be declared <const> as its list will be overwritten. + * Please take care of keeping this list alphabetically sorted. + * + * Those fetches only have a valid value during an OCSP update process, so they + * can only be used in the log-format of a log line built by the update process + * task itself.
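 * A hypothetical log-format using them (illustrative only, not taken from
 * this patch) could look like:
 *   log-format "%[ssl_ocsp_certname] %[ssl_ocsp_status] (%[ssl_ocsp_status_str]) ok=%[ssl_ocsp_success_cnt] ko=%[ssl_ocsp_fail_cnt]"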
+ */ +static struct sample_fetch_kw_list sample_fetch_keywords = {ILH, { + { "ssl_ocsp_certid", smp_fetch_ssl_ocsp_certid, 0, NULL, SMP_T_STR, SMP_USE_L5SRV }, + { "ssl_ocsp_certname", smp_fetch_ssl_ocsp_certname, 0, NULL, SMP_T_STR, SMP_USE_L5SRV }, + { "ssl_ocsp_status", smp_fetch_ssl_ocsp_status, 0, NULL, SMP_T_SINT, SMP_USE_L5SRV }, + { "ssl_ocsp_status_str", smp_fetch_ssl_ocsp_status_str, 0, NULL, SMP_T_STR, SMP_USE_L5SRV }, + { "ssl_ocsp_fail_cnt", smp_fetch_ssl_ocsp_fail_cnt, 0, NULL, SMP_T_SINT, SMP_USE_L5SRV }, + { "ssl_ocsp_success_cnt", smp_fetch_ssl_ocsp_success_cnt, 0, NULL, SMP_T_SINT, SMP_USE_L5SRV }, + { NULL, NULL, 0, 0, 0 }, +}}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &sample_fetch_keywords); + + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/ssl_sample.c b/src/ssl_sample.c new file mode 100644 index 0000000..789637f --- /dev/null +++ b/src/ssl_sample.c @@ -0,0 +1,2389 @@ +/* + * This file contains the sample fetches related to SSL + * + * Copyright (C) 2012 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr> + * Copyright (C) 2020 HAProxy Technologies, William Lallemand <wlallemand@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define _GNU_SOURCE +#include <ctype.h> +#include <dirent.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <haproxy/acl.h> +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/base64.h> +#include <haproxy/buf-t.h> +#include <haproxy/connection.h> +#include <haproxy/obj_type.h> +#include <haproxy/openssl-compat.h> +#include <haproxy/sample.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/ssl_utils.h> +#include <haproxy/stconn.h> +#include <haproxy/tools.h> +#include <haproxy/vars.h> + + +/***** Below are some sample fetching functions for ACL/patterns *****/ + +#if defined(HAVE_CRYPTO_memcmp) +/* Compares a bytestring with a variable containing a bytestring. Return value + * is `true` if both bytestrings are bytewise identical and `false` otherwise. + * + * Comparison will be performed in constant time if both bytestrings are of + * the same length. If the lengths differ, execution time will not be constant. + */ +static int sample_conv_secure_memcmp(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct sample tmp; + int result; + + smp_set_owner(&tmp, smp->px, smp->sess, smp->strm, smp->opt); + if (arg_p[0].type != ARGT_VAR) + return 0; + + if (!sample_conv_var2smp(&arg_p[0].data.var, &tmp, SMP_T_BIN)) + return 0; + + if (smp->data.u.str.data != tmp.data.u.str.data) { + smp->data.u.sint = 0; + smp->data.type = SMP_T_BOOL; + return 1; + } + + /* The following comparison is performed in constant time. */ + result = CRYPTO_memcmp(smp->data.u.str.area, tmp.data.u.str.area, smp->data.u.str.data); + + smp->data.u.sint = result == 0; + smp->data.type = SMP_T_BOOL; + return 1; +} + +/* This function checks the "secure_memcmp" converter's arguments and extracts the + * variable name and its scope. + */ +static int smp_check_secure_memcmp(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err) +{ + if (!args[0].data.str.data) { + memprintf(err, "missing variable name"); + return 0; + } + + /* Try to decode a variable.
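 * Variable names use the usual scope.name form; "txn.my_key" or
 * "sess.token" would be purely illustrative examples, not names taken
 * from this patch.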
*/ + if (vars_check_arg(&args[0], NULL)) + return 1; + + memprintf(err, "failed to register variable name '%s'", + args[0].data.str.area); + return 0; +} +#endif // HAVE_CRYPTO_memcmp + +static int smp_check_sha2(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err) +{ + if (args[0].type == ARGT_STOP) + return 1; + if (args[0].type != ARGT_SINT) { + memprintf(err, "Invalid type '%s'", arg_type_names[args[0].type]); + return 0; + } + + switch (args[0].data.sint) { + case 224: + case 256: + case 384: + case 512: + /* this is okay */ + return 1; + default: + memprintf(err, "Unsupported number of bits: '%lld'", args[0].data.sint); + return 0; + } +} + +static int sample_conv_sha2(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct buffer *trash = get_trash_chunk(); + int bits = 256; + EVP_MD_CTX *mdctx; + const EVP_MD *evp = NULL; + unsigned int digest_length = 0; + if (arg_p->data.sint) + bits = arg_p->data.sint; + + switch (bits) { + case 224: + evp = EVP_sha224(); + break; + case 256: + evp = EVP_sha256(); + break; + case 384: + evp = EVP_sha384(); + break; + case 512: + evp = EVP_sha512(); + break; + default: + return 0; + } + + mdctx = EVP_MD_CTX_new(); + if (!mdctx) + return 0; + EVP_DigestInit_ex(mdctx, evp, NULL); + EVP_DigestUpdate(mdctx, smp->data.u.str.area, smp->data.u.str.data); + EVP_DigestFinal_ex(mdctx, (unsigned char*)trash->area, &digest_length); + trash->data = digest_length; + + EVP_MD_CTX_free(mdctx); + + smp->data.u.str = *trash; + smp->data.type = SMP_T_BIN; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +/* This function checks an <arg> and fills it with a variable type if the + * <arg> string contains a valid variable name. If that fails, the function + * tries to perform a base64 decode operation on the same string, and + * fills the <arg> with the decoded content. + * + * Validation is skipped if the <arg> string is empty. + * + * This function returns 0 if the variable lookup fails and the specified + * <arg> string is not a valid base64 encoded string, as well as when an + * unexpected argument type is specified or a memory allocation error + * occurs. Otherwise it returns 1. + */ +static inline int sample_check_arg_base64(struct arg *arg, char **err) +{ + char *dec = NULL; + int dec_size; + + if (arg->type != ARGT_STR) { + memprintf(err, "unexpected argument type"); + return 0; + } + + if (arg->data.str.data == 0) /* empty */ + return 1; + + if (vars_check_arg(arg, NULL)) + return 1; + + if (arg->data.str.data % 4) { + memprintf(err, "argument needs to be base64 encoded, and " + "can either be a string or a variable"); + return 0; + } + + dec_size = (arg->data.str.data / 4 * 3) + - (arg->data.str.area[arg->data.str.data-1] == '=' ? 1 : 0) + - (arg->data.str.area[arg->data.str.data-2] == '=' ?
1 : 0); + + if ((dec = malloc(dec_size)) == NULL) { + memprintf(err, "memory allocation error"); + return 0; + } + + dec_size = base64dec(arg->data.str.area, arg->data.str.data, dec, dec_size); + if (dec_size < 0) { + memprintf(err, "argument needs to be base64 encoded, and " + "can either be a string or a variable"); + free(dec); + return 0; + } + + /* base64 decoded */ + chunk_destroy(&arg->data.str); + arg->data.str.area = dec; + arg->data.str.data = dec_size; + return 1; +} + +#ifdef EVP_CIPH_GCM_MODE +static int check_aes_gcm(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err) +{ + switch(args[0].data.sint) { + case 128: + case 192: + case 256: + break; + default: + memprintf(err, "key size must be 128, 192 or 256 (bits)."); + return 0; + } + + /* Try to decode variables. */ + if (!sample_check_arg_base64(&args[1], err)) { + memprintf(err, "failed to parse nonce : %s", *err); + return 0; + } + if (!sample_check_arg_base64(&args[2], err)) { + memprintf(err, "failed to parse key : %s", *err); + return 0; + } + if (!sample_check_arg_base64(&args[3], err)) { + memprintf(err, "failed to parse aead_tag : %s", *err); + return 0; + } + + return 1; +} + +/* Arguments: AES size in bits, nonce, key, tag. The last three arguments are base64 encoded */ +static int sample_conv_aes_gcm_dec(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct sample nonce, key, aead_tag; + struct buffer *smp_trash = NULL, *smp_trash_alloc = NULL; + EVP_CIPHER_CTX *ctx; + int dec_size, ret; + + smp_trash_alloc = alloc_trash_chunk(); + if (!smp_trash_alloc) + return 0; + + /* smp copy */ + smp_trash_alloc->data = smp->data.u.str.data; + if (unlikely(smp_trash_alloc->data > smp_trash_alloc->size)) + smp_trash_alloc->data = smp_trash_alloc->size; + memcpy(smp_trash_alloc->area, smp->data.u.str.area, smp_trash_alloc->data); + + ctx = EVP_CIPHER_CTX_new(); + + if (!ctx) + goto err; + + smp_trash = alloc_trash_chunk(); + if (!smp_trash) + goto err; + + smp_set_owner(&nonce, smp->px, smp->sess, smp->strm, smp->opt); + if (!sample_conv_var2smp_str(&arg_p[1], &nonce)) + goto err; + + if (arg_p[1].type == ARGT_VAR) { + dec_size = base64dec(nonce.data.u.str.area, nonce.data.u.str.data, smp_trash->area, smp_trash->size); + if (dec_size < 0) + goto err; + smp_trash->data = dec_size; + nonce.data.u.str = *smp_trash; + } + + /* Set cipher type and mode */ + switch(arg_p[0].data.sint) { + case 128: + EVP_DecryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, NULL, NULL); + break; + case 192: + EVP_DecryptInit_ex(ctx, EVP_aes_192_gcm(), NULL, NULL, NULL); + break; + case 256: + EVP_DecryptInit_ex(ctx, EVP_aes_256_gcm(), NULL, NULL, NULL); + break; + } + + EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_SET_IVLEN, nonce.data.u.str.data, NULL); + + /* Initialise IV */ + if(!EVP_DecryptInit_ex(ctx, NULL, NULL, NULL, (unsigned char *) nonce.data.u.str.area)) + goto err; + + smp_set_owner(&key, smp->px, smp->sess, smp->strm, smp->opt); + if (!sample_conv_var2smp_str(&arg_p[2], &key)) + goto err; + + if (arg_p[2].type == ARGT_VAR) { + dec_size = base64dec(key.data.u.str.area, key.data.u.str.data, smp_trash->area, smp_trash->size); + if (dec_size < 0) + goto err; + smp_trash->data = dec_size; + key.data.u.str = *smp_trash; + } + + /* Initialise key */ + if (!EVP_DecryptInit_ex(ctx, NULL, NULL, (unsigned char *) key.data.u.str.area, NULL)) + goto err; + + if (!EVP_DecryptUpdate(ctx, (unsigned char *) smp_trash->area, (int *) &smp_trash->data, + (unsigned char *) smp_trash_alloc->area, (int) 
smp_trash_alloc->data)) + goto err; + + smp_set_owner(&aead_tag, smp->px, smp->sess, smp->strm, smp->opt); + if (!sample_conv_var2smp_str(&arg_p[3], &aead_tag)) + goto err; + + if (arg_p[3].type == ARGT_VAR) { + dec_size = base64dec(aead_tag.data.u.str.area, aead_tag.data.u.str.data, smp_trash_alloc->area, smp_trash_alloc->size); + if (dec_size < 0) + goto err; + smp_trash_alloc->data = dec_size; + aead_tag.data.u.str = *smp_trash_alloc; + } + + dec_size = smp_trash->data; + + EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_SET_TAG, aead_tag.data.u.str.data, (void *) aead_tag.data.u.str.area); + ret = EVP_DecryptFinal_ex(ctx, (unsigned char *) smp_trash->area + smp_trash->data, (int *) &smp_trash->data); + + if (ret <= 0) + goto err; + + smp->data.u.str.data = dec_size + smp_trash->data; + smp->data.u.str.area = smp_trash->area; + smp->data.type = SMP_T_BIN; + smp_dup(smp); + free_trash_chunk(smp_trash_alloc); + free_trash_chunk(smp_trash); + return 1; + +err: + free_trash_chunk(smp_trash_alloc); + free_trash_chunk(smp_trash); + return 0; +} +#endif + +static int check_crypto_digest(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err) +{ + const EVP_MD *evp = EVP_get_digestbyname(args[0].data.str.area); + + if (evp) + return 1; + + memprintf(err, "algorithm must be a valid OpenSSL message digest name."); + return 0; +} + +static int sample_conv_crypto_digest(const struct arg *args, struct sample *smp, void *private) +{ + struct buffer *trash = get_trash_chunk(); + unsigned char *md = (unsigned char*) trash->area; + unsigned int md_len = trash->size; + EVP_MD_CTX *ctx = EVP_MD_CTX_new(); + const EVP_MD *evp = EVP_get_digestbyname(args[0].data.str.area); + + if (!ctx) + return 0; + + if (!EVP_DigestInit_ex(ctx, evp, NULL) || + !EVP_DigestUpdate(ctx, smp->data.u.str.area, smp->data.u.str.data) || + !EVP_DigestFinal_ex(ctx, md, &md_len)) { + EVP_MD_CTX_free(ctx); + return 0; + } + + EVP_MD_CTX_free(ctx); + + trash->data = md_len; + smp->data.u.str = *trash; + smp->data.type = SMP_T_BIN; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +/* Take a numerical X509_V_ERR and return its constant name */ +static int sample_conv_x509_v_err(const struct arg *arg_p, struct sample *smp, void *private) +{ + const char *res = x509_v_err_int_to_str(smp->data.u.sint); + + /* if the value was found return its string */ + if (res) { + smp->data.u.str.area = (char *)res; + smp->data.u.str.data = strlen(res); + smp->data.type = SMP_T_STR; + smp->flags |= SMP_F_CONST; + + return 1; + } else { + struct buffer *smp_trash = get_trash_chunk(); + + /* if the conversion failed, output the numbers as string */ + chunk_printf(smp_trash, "%llu", smp->data.u.sint); + + smp->data.u.str = *smp_trash; + smp->data.type = SMP_T_STR; + smp->flags &= ~SMP_F_CONST; + + return 1; + } + + return 0; +} + +static int check_crypto_hmac(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err) +{ + if (!check_crypto_digest(args, conv, file, line, err)) + return 0; + + if (!sample_check_arg_base64(&args[1], err)) { + memprintf(err, "failed to parse key : %s", *err); + return 0; + } + + return 1; +} + +static int sample_conv_crypto_hmac(const struct arg *args, struct sample *smp, void *private) +{ + struct sample key; + struct buffer *trash = NULL, *key_trash = NULL; + unsigned char *md; + unsigned int md_len; + const EVP_MD *evp = EVP_get_digestbyname(args[0].data.str.area); + int dec_size; + + smp_set_owner(&key, smp->px, smp->sess, smp->strm, smp->opt); + if 
(!sample_conv_var2smp_str(&args[1], &key)) + return 0; + + if (args[1].type == ARGT_VAR) { + key_trash = alloc_trash_chunk(); + if (!key_trash) + goto err; + + dec_size = base64dec(key.data.u.str.area, key.data.u.str.data, key_trash->area, key_trash->size); + if (dec_size < 0) + goto err; + key_trash->data = dec_size; + key.data.u.str = *key_trash; + } + + trash = alloc_trash_chunk(); + if (!trash) + goto err; + + md = (unsigned char*) trash->area; + md_len = trash->size; + if (!HMAC(evp, key.data.u.str.area, key.data.u.str.data, (const unsigned char*) smp->data.u.str.area, + smp->data.u.str.data, md, &md_len)) + goto err; + + free_trash_chunk(key_trash); + + trash->data = md_len; + smp->data.u.str = *trash; + smp->data.type = SMP_T_BIN; + smp_dup(smp); + free_trash_chunk(trash); + return 1; + +err: + free_trash_chunk(key_trash); + free_trash_chunk(trash); + return 0; +} + +static int +smp_fetch_ssl_fc_has_early(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + SSL *ssl; + struct connection *conn; + + conn = objt_conn(smp->sess->origin); + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + smp->flags = 0; + smp->data.type = SMP_T_BOOL; +#ifdef OPENSSL_IS_BORINGSSL + { + smp->data.u.sint = (SSL_in_early_data(ssl) && + SSL_early_data_accepted(ssl)); + } +#else + smp->data.u.sint = ((conn->flags & CO_FL_EARLY_DATA) && + (conn->flags & (CO_FL_EARLY_SSL_HS | CO_FL_SSL_WAIT_HS))) ? 1 : 0; +#endif + return 1; +} + +/* boolean, returns true if client cert was present */ +static int +smp_fetch_ssl_fc_has_crt(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn = objt_conn(smp->sess->origin); + struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn); + + if (!ctx) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + smp->flags = SMP_F_VOL_SESS; + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = SSL_SOCK_ST_FL_VERIFY_DONE & ctx->xprt_st ? 1 : 0; + + return 1; +} + +/* string, returns a string of a formatted full dn \C=..\O=..\OU=.. \CN=.. of the + * client certificate's root CA. + */ +#ifdef HAVE_SSL_get0_verified_chain +static int +smp_fetch_ssl_r_dn(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + X509 *crt = NULL; + X509_NAME *name; + int ret = 0; + struct buffer *smp_trash; + struct connection *conn; + SSL *ssl; + + conn = objt_conn(smp->sess->origin); + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + crt = ssl_sock_get_verified_chain_root(ssl); + if (!crt) + goto out; + + name = X509_get_subject_name(crt); + if (!name) + goto out; + + smp_trash = get_trash_chunk(); + if (args[0].type == ARGT_STR && args[0].data.str.data > 0) { + int pos = 1; + + if (args[1].type == ARGT_SINT) + pos = args[1].data.sint; + + if (ssl_sock_get_dn_entry(name, &args[0].data.str, pos, smp_trash) <= 0) + goto out; + } + else if (args[2].type == ARGT_STR && args[2].data.str.data > 0) { + if (ssl_sock_get_dn_formatted(name, &args[2].data.str, smp_trash) <= 0) + goto out; + } + else if (ssl_sock_get_dn_oneline(name, smp_trash) <= 0) + goto out; + + smp->flags = SMP_F_VOL_SESS; + smp->data.type = SMP_T_STR; + smp->data.u.str = *smp_trash; + ret = 1; +out: + return ret; +} +#endif + +/* binary, returns a certificate in a binary chunk (der/raw). 
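 * For instance "ssl_c_der", registered in the keyword table below, applies
 * this fetch to the frontend peer certificate; an illustrative (not
 * normative) configuration use would be:
 *   http-request set-header X-Client-Cert %[ssl_c_der,base64]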
+ * The 5th keyword char is used to know if SSL_get_certificate or SSL_get_peer_certificate + * should be used. + */ +static int +smp_fetch_ssl_x_der(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int cert_peer = (kw[4] == 'c' || kw[4] == 's') ? 1 : 0; + int conn_server = (kw[4] == 's') ? 1 : 0; + + X509 *crt = NULL; + int ret = 0; + struct buffer *smp_trash; + struct connection *conn; + SSL *ssl; + + if (conn_server) + conn = smp->strm ? sc_conn(smp->strm->scb) : NULL; + else + conn = objt_conn(smp->sess->origin); + + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + if (cert_peer) + crt = ssl_sock_get_peer_certificate(ssl); + else + crt = SSL_get_certificate(ssl); + + if (!crt) + goto out; + + smp_trash = get_trash_chunk(); + if (ssl_sock_crt2der(crt, smp_trash) <= 0) + goto out; + + smp->flags = SMP_F_VOL_SESS; + smp->data.u.str = *smp_trash; + smp->data.type = SMP_T_BIN; + ret = 1; +out: + /* SSL_get_peer_certificate increases the X509 refcount */ + if (cert_peer && crt) + X509_free(crt); + return ret; +} + +/* binary, returns the certificate chain in a binary chunk (der/raw). + * The 5th keyword char is used to support only peer cert + */ +static int +smp_fetch_ssl_x_chain_der(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int cert_peer = (kw[4] == 'c' || kw[4] == 's') ? 1 : 0; + int conn_server = (kw[4] == 's') ? 1 : 0; + struct buffer *smp_trash; + struct buffer *tmp_trash = NULL; + struct connection *conn; + STACK_OF(X509) *certs = NULL; + X509 *crt = NULL; + SSL *ssl; + int ret = 0; + int num_certs; + int i; + + if (conn_server) + conn = smp->strm ? sc_conn(smp->strm->scb) : NULL; + else + conn = objt_conn(smp->sess->origin); + + if (!conn) + return 0; + + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + if (!cert_peer) + return 0; + + certs = SSL_get_peer_cert_chain(ssl); + if (!certs) + return 0; + + num_certs = sk_X509_num(certs); + if (!num_certs) + goto out; + smp_trash = get_trash_chunk(); + tmp_trash = alloc_trash_chunk(); + if (!tmp_trash) + goto out; + for (i = 0; i < num_certs; i++) { + crt = sk_X509_value(certs, i); + if (ssl_sock_crt2der(crt, tmp_trash) <= 0) + goto out; + chunk_cat(smp_trash, tmp_trash); + } + + smp->flags = SMP_F_VOL_SESS; + smp->data.u.str = *smp_trash; + smp->data.type = SMP_T_BIN; + ret = 1; +out: + if (tmp_trash) + free_trash_chunk(tmp_trash); + return ret; +} + +/* binary, returns the serial of the certificate in a binary chunk. + * The 5th keyword char is used to know if SSL_get_certificate or SSL_get_peer_certificate + * should be used. + */ +static int +smp_fetch_ssl_x_serial(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int cert_peer = (kw[4] == 'c' || kw[4] == 's') ? 1 : 0; + int conn_server = (kw[4] == 's') ? 1 : 0; + X509 *crt = NULL; + int ret = 0; + struct buffer *smp_trash; + struct connection *conn; + SSL *ssl; + + if (conn_server) + conn = smp->strm ?
sc_conn(smp->strm->scb) : NULL; + else + conn = objt_conn(smp->sess->origin); + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + if (cert_peer) + crt = ssl_sock_get_peer_certificate(ssl); + else + crt = SSL_get_certificate(ssl); + + if (!crt) + goto out; + + smp_trash = get_trash_chunk(); + if (ssl_sock_get_serial(crt, smp_trash) <= 0) + goto out; + + smp->flags = SMP_F_VOL_SESS; + smp->data.u.str = *smp_trash; + smp->data.type = SMP_T_BIN; + ret = 1; +out: + /* SSL_get_peer_certificate increases the X509 refcount */ + if (cert_peer && crt) + X509_free(crt); + return ret; +} + +/* binary, returns the client certificate's SHA-1 fingerprint (SHA-1 hash of DER-encoded certificate) in a binary chunk. + * The 5th keyword char is used to know if SSL_get_certificate or SSL_get_peer_certificate + * should be used. + */ +static int +smp_fetch_ssl_x_sha1(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int cert_peer = (kw[4] == 'c' || kw[4] == 's') ? 1 : 0; + int conn_server = (kw[4] == 's') ? 1 : 0; + X509 *crt = NULL; + const EVP_MD *digest; + int ret = 0; + unsigned int len = 0; + struct buffer *smp_trash; + struct connection *conn; + SSL *ssl; + + if (conn_server) + conn = smp->strm ? sc_conn(smp->strm->scb) : NULL; + else + conn = objt_conn(smp->sess->origin); + + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + if (cert_peer) + crt = ssl_sock_get_peer_certificate(ssl); + else + crt = SSL_get_certificate(ssl); + if (!crt) + goto out; + + smp_trash = get_trash_chunk(); + digest = EVP_sha1(); + X509_digest(crt, digest, (unsigned char *) smp_trash->area, &len); + smp_trash->data = len; + smp->flags = SMP_F_VOL_SESS; + smp->data.u.str = *smp_trash; + smp->data.type = SMP_T_BIN; + ret = 1; +out: + /* SSL_get_peer_certificate increases the X509 refcount */ + if (cert_peer && crt) + X509_free(crt); + return ret; +} + +/* string, returns the certificate's notafter date in ASN1_UTCTIME format. + * The 5th keyword char is used to know if SSL_get_certificate or SSL_get_peer_certificate + * should be used. + */ +static int +smp_fetch_ssl_x_notafter(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int cert_peer = (kw[4] == 'c' || kw[4] == 's') ? 1 : 0; + int conn_server = (kw[4] == 's') ? 1 : 0; + X509 *crt = NULL; + int ret = 0; + struct buffer *smp_trash; + struct connection *conn; + SSL *ssl; + + if (conn_server) + conn = smp->strm ? sc_conn(smp->strm->scb) : NULL; + else + conn = objt_conn(smp->sess->origin); + + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + if (cert_peer) + crt = ssl_sock_get_peer_certificate(ssl); + else + crt = SSL_get_certificate(ssl); + if (!crt) + goto out; + + smp_trash = get_trash_chunk(); + if (ssl_sock_get_time(X509_getm_notAfter(crt), smp_trash) <= 0) + goto out; + + smp->flags = SMP_F_VOL_SESS; + smp->data.u.str = *smp_trash; + smp->data.type = SMP_T_STR; + ret = 1; +out: + /* SSL_get_peer_certificate increases the X509 refcount */ + if (cert_peer && crt) + X509_free(crt); + return ret; +} + +/* string, returns a string of a formatted full dn \C=..\O=..\OU=.. \CN=..
of the certificate's issuer + * The 5th keyword char is used to know if SSL_get_certificate or SSL_get_peer_certificate + * should be used. + */ +static int +smp_fetch_ssl_x_i_dn(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int cert_peer = (kw[4] == 'c' || kw[4] == 's') ? 1 : 0; + int conn_server = (kw[4] == 's') ? 1 : 0; + X509 *crt = NULL; + X509_NAME *name; + int ret = 0; + struct buffer *smp_trash; + struct connection *conn; + SSL *ssl; + + if (conn_server) + conn = smp->strm ? sc_conn(smp->strm->scb) : NULL; + else + conn = objt_conn(smp->sess->origin); + + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + if (cert_peer) + crt = ssl_sock_get_peer_certificate(ssl); + else + crt = SSL_get_certificate(ssl); + if (!crt) + goto out; + + name = X509_get_issuer_name(crt); + if (!name) + goto out; + + smp_trash = get_trash_chunk(); + if (args[0].type == ARGT_STR && args[0].data.str.data > 0) { + int pos = 1; + + if (args[1].type == ARGT_SINT) + pos = args[1].data.sint; + + if (ssl_sock_get_dn_entry(name, &args[0].data.str, pos, smp_trash) <= 0) + goto out; + } + else if (args[2].type == ARGT_STR && args[2].data.str.data > 0) { + if (ssl_sock_get_dn_formatted(name, &args[2].data.str, smp_trash) <= 0) + goto out; + } + else if (ssl_sock_get_dn_oneline(name, smp_trash) <= 0) + goto out; + + smp->flags = SMP_F_VOL_SESS; + smp->data.type = SMP_T_STR; + smp->data.u.str = *smp_trash; + ret = 1; +out: + /* SSL_get_peer_certificate increases the X509 refcount */ + if (cert_peer && crt) + X509_free(crt); + return ret; +} + +/* string, returns the certificate's notbefore date in ASN1_UTCTIME format. + * The 5th keyword char is used to know if SSL_get_certificate or SSL_get_peer_certificate + * should be used. + */ +static int +smp_fetch_ssl_x_notbefore(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int cert_peer = (kw[4] == 'c' || kw[4] == 's') ? 1 : 0; + int conn_server = (kw[4] == 's') ? 1 : 0; + X509 *crt = NULL; + int ret = 0; + struct buffer *smp_trash; + struct connection *conn; + SSL *ssl; + + if (conn_server) + conn = smp->strm ? sc_conn(smp->strm->scb) : NULL; + else + conn = objt_conn(smp->sess->origin); + + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + if (cert_peer) + crt = ssl_sock_get_peer_certificate(ssl); + else + crt = SSL_get_certificate(ssl); + if (!crt) + goto out; + + smp_trash = get_trash_chunk(); + if (ssl_sock_get_time(X509_getm_notBefore(crt), smp_trash) <= 0) + goto out; + + smp->flags = SMP_F_VOL_SESS; + smp->data.u.str = *smp_trash; + smp->data.type = SMP_T_STR; + ret = 1; +out: + /* SSL_get_peer_certificate increases the X509 refcount */ + if (cert_peer && crt) + X509_free(crt); + return ret; +} + +/* string, returns a string of a formatted full dn \C=..\O=..\OU=.. \CN=.. of the certificate's subject + * The 5th keyword char is used to know if SSL_get_certificate or SSL_get_peer_certificate + * should be used. + */ +static int +smp_fetch_ssl_x_s_dn(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int cert_peer = (kw[4] == 'c' || kw[4] == 's') ? 1 : 0; + int conn_server = (kw[4] == 's') ?
1 : 0; + X509 *crt = NULL; + X509_NAME *name; + int ret = 0; + struct buffer *smp_trash; + struct connection *conn; + SSL *ssl; + + if (conn_server) + conn = smp->strm ? sc_conn(smp->strm->scb) : NULL; + else + conn = objt_conn(smp->sess->origin); + + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + if (cert_peer) + crt = ssl_sock_get_peer_certificate(ssl); + else + crt = SSL_get_certificate(ssl); + if (!crt) + goto out; + + name = X509_get_subject_name(crt); + if (!name) + goto out; + + smp_trash = get_trash_chunk(); + if (args[0].type == ARGT_STR && args[0].data.str.data > 0) { + int pos = 1; + + if (args[1].type == ARGT_SINT) + pos = args[1].data.sint; + + if (ssl_sock_get_dn_entry(name, &args[0].data.str, pos, smp_trash) <= 0) + goto out; + } + else if (args[2].type == ARGT_STR && args[2].data.str.data > 0) { + if (ssl_sock_get_dn_formatted(name, &args[2].data.str, smp_trash) <= 0) + goto out; + } + else if (ssl_sock_get_dn_oneline(name, smp_trash) <= 0) + goto out; + + smp->flags = SMP_F_VOL_SESS; + smp->data.type = SMP_T_STR; + smp->data.u.str = *smp_trash; + ret = 1; +out: + /* SSL_get_peer_certificate increases the X509 refcount */ + if (cert_peer && crt) + X509_free(crt); + return ret; +} + +/* boolean, returns true if the current session uses a client certificate */ +static int +smp_fetch_ssl_c_used(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + X509 *crt; + struct connection *conn; + SSL *ssl; + + conn = objt_conn(smp->sess->origin); + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + /* SSL_get_peer_certificate returns a pointer to an allocated X509 struct */ + crt = ssl_sock_get_peer_certificate(ssl); + if (crt) { + X509_free(crt); + } + + smp->flags = SMP_F_VOL_SESS; + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = (crt != NULL); + return 1; +} + +/* integer, returns the certificate version + * The 5th keyword char is used to know if SSL_get_certificate or SSL_get_peer_certificate + * should be used. + */ +static int +smp_fetch_ssl_x_version(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int cert_peer = (kw[4] == 'c' || kw[4] == 's') ? 1 : 0; + int conn_server = (kw[4] == 's') ? 1 : 0; + + X509 *crt; + struct connection *conn; + SSL *ssl; + + if (conn_server) + conn = smp->strm ? sc_conn(smp->strm->scb) : NULL; + else + conn = objt_conn(smp->sess->origin); + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + if (cert_peer) + crt = ssl_sock_get_peer_certificate(ssl); + else + crt = SSL_get_certificate(ssl); + if (!crt) + return 0; + + smp->flags = SMP_F_VOL_SESS; + smp->data.u.sint = (unsigned int)(1 + X509_get_version(crt)); + /* SSL_get_peer_certificate increases the X509 refcount */ + if (cert_peer) + X509_free(crt); + smp->data.type = SMP_T_SINT; + + return 1; +} + +/* string, returns the certificate's signature algorithm. + * The 5th keyword char is used to know if SSL_get_certificate or SSL_get_peer_certificate + * should be used. + */ +static int +smp_fetch_ssl_x_sig_alg(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int cert_peer = (kw[4] == 'c' || kw[4] == 's') ? 1 : 0; + int conn_server = (kw[4] == 's') ?
1 : 0; + X509 *crt; + __OPENSSL_110_CONST__ ASN1_OBJECT *algorithm; + int nid; + struct connection *conn; + SSL *ssl; + + if (conn_server) + conn = smp->strm ? sc_conn(smp->strm->scb) : NULL; + else + conn = objt_conn(smp->sess->origin); + + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + if (cert_peer) + crt = ssl_sock_get_peer_certificate(ssl); + else + crt = SSL_get_certificate(ssl); + if (!crt) + return 0; + + X509_ALGOR_get0(&algorithm, NULL, NULL, X509_get0_tbs_sigalg(crt)); + nid = OBJ_obj2nid(algorithm); + + smp->data.u.str.area = (char *)OBJ_nid2sn(nid); + if (!smp->data.u.str.area) { + /* SSL_get_peer_certificate increases the X509 refcount */ + if (cert_peer) + X509_free(crt); + return 0; + } + + smp->data.type = SMP_T_STR; + smp->flags |= SMP_F_VOL_SESS | SMP_F_CONST; + smp->data.u.str.data = strlen(smp->data.u.str.area); + /* SSL_get_peer_certificate increases the X509 refcount */ + if (cert_peer) + X509_free(crt); + + return 1; +} + +/* string, returns the certificate's key algorithm. + * The 5th keyword char is used to know if SSL_get_certificate or SSL_get_peer_certificate + * should be used. + */ +static int +smp_fetch_ssl_x_key_alg(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + int cert_peer = (kw[4] == 'c' || kw[4] == 's') ? 1 : 0; + int conn_server = (kw[4] == 's') ? 1 : 0; + X509 *crt; + ASN1_OBJECT *algorithm; + int nid; + struct connection *conn; + SSL *ssl; + + if (conn_server) + conn = smp->strm ? sc_conn(smp->strm->scb) : NULL; + else + conn = objt_conn(smp->sess->origin); + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + if (cert_peer) + crt = ssl_sock_get_peer_certificate(ssl); + else + crt = SSL_get_certificate(ssl); + if (!crt) + return 0; + + X509_PUBKEY_get0_param(&algorithm, NULL, NULL, NULL, X509_get_X509_PUBKEY(crt)); + nid = OBJ_obj2nid(algorithm); + + smp->data.u.str.area = (char *)OBJ_nid2sn(nid); + if (!smp->data.u.str.area) { + /* SSL_get_peer_certificate increases the X509 refcount */ + if (cert_peer) + X509_free(crt); + return 0; + } + + smp->data.type = SMP_T_STR; + smp->flags |= SMP_F_VOL_SESS | SMP_F_CONST; + smp->data.u.str.data = strlen(smp->data.u.str.area); + if (cert_peer) + X509_free(crt); + + return 1; +} + +/* boolean, returns true if front conn. transport layer is SSL. + * This function is also usable on backend conn if the fetch keyword 5th + * char is 'b'. + */ +static int +smp_fetch_ssl_fc(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[4] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[4] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ?
sc_conn(smp->strm->scb) : NULL; + + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = conn_is_ssl(conn); + return 1; +} + +/* boolean, returns true if the client presented an SNI */ +static int +smp_fetch_ssl_fc_has_sni(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ +#ifdef SSL_CTRL_SET_TLSEXT_HOSTNAME + struct connection *conn = objt_conn(smp->sess->origin); + SSL *ssl = ssl_sock_get_ssl_object(conn); + + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = ssl && SSL_get_servername(ssl, TLSEXT_NAMETYPE_host_name) != NULL; + return 1; +#else + return 0; +#endif +} + +/* boolean, returns true if the client session has been resumed. + * This function is also usable on backend conn if the fetch keyword 5th + * char is 'b'. + */ +static int +smp_fetch_ssl_fc_is_resumed(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + SSL *ssl; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[4] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[4] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ? sc_conn(smp->strm->scb) : NULL; + + ssl = ssl_sock_get_ssl_object(conn); + + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = ssl && SSL_session_reused(ssl); + return 1; +} + +/* + * string, returns the EC curve used for key agreement on the + * frontend and backend connections. + * + * The function used to get the curve name (SSL_get_negotiated_group) is only + * available from OpenSSL 3.0 onwards. + */ +#if (HA_OPENSSL_VERSION_NUMBER >= 0x3000000fL) +static int +smp_fetch_ssl_fc_ec(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + SSL *ssl; + int __maybe_unused nid; + char *curve_name; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[4] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[4] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ? sc_conn(smp->strm->scb) : NULL; + + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + /* + * SSL_get0_group_name returns the curve name and is available from + * OpenSSL 3.2 onwards. For OpenSSL >=3.0 and <3.2, we keep using + * SSL_get_negotiated_group to get the curve name. + */ + #if (HA_OPENSSL_VERSION_NUMBER >= 0x3020000fL) + curve_name = (char *)SSL_get0_group_name(ssl); + if (curve_name == NULL) + return 0; + else { + /** + * The curve name returned by SSL_get0_group_name is in lowercase whereas the curve + * name returned when we use `SSL_get_negotiated_group` and `OBJ_nid2sn` is the + * short name and is in upper case. To keep the return value consistent across the + * different function calls and across OpenSSL version upgrades, + * we convert the curve name returned by SSL_get0_group_name to upper case. + */ + for (int i = 0; curve_name[i]; i++) + curve_name[i] = toupper(curve_name[i]); + } + #else + nid = SSL_get_negotiated_group(ssl); + if (!nid) + return 0; + curve_name = (char *)OBJ_nid2sn(nid); + if (curve_name == NULL) + return 0; + #endif + + smp->data.u.str.area = curve_name; + if (!smp->data.u.str.area) + return 0; + + smp->data.type = SMP_T_STR; + smp->flags |= SMP_F_VOL_SESS | SMP_F_CONST; + smp->data.u.str.data = strlen(smp->data.u.str.area); + + return 1; +} +#endif + +/* string, returns the used cipher if front conn. transport layer is SSL. + * This function is also usable on backend conn if the fetch keyword 5th + * char is 'b'.
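 * Following this convention, "ssl_fc_cipher" reports the frontend cipher
 * while "ssl_bc_cipher" reports the backend one; an illustrative (not
 * normative) log-format would be:
 *   log-format "%[ssl_fc_cipher] %[ssl_fc_protocol]"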
+ */ +static int +smp_fetch_ssl_fc_cipher(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + SSL *ssl; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[4] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[4] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ? sc_conn(smp->strm->scb) : NULL; + + smp->flags = 0; + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + smp->data.u.str.area = (char *)SSL_get_cipher_name(ssl); + if (!smp->data.u.str.area) + return 0; + + smp->data.type = SMP_T_STR; + smp->flags |= SMP_F_VOL_SESS | SMP_F_CONST; + smp->data.u.str.data = strlen(smp->data.u.str.area); + + return 1; +} + +/* integer, returns the algorithm's keysize if front conn. transport layer + * is SSL. + * This function is also usable on backend conn if the fetch keyword 5th + * char is 'b'. + */ +static int +smp_fetch_ssl_fc_alg_keysize(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + SSL *ssl; + int sint; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[4] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[4] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ? sc_conn(smp->strm->scb) : NULL; + + smp->flags = 0; + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + if (!SSL_get_cipher_bits(ssl, &sint)) + return 0; + + smp->flags = SMP_F_VOL_SESS; + smp->data.u.sint = sint; + smp->data.type = SMP_T_SINT; + + return 1; +} + +/* integer, returns the used keysize if front conn. transport layer is SSL. + * This function is also usable on backend conn if the fetch keyword 5th + * char is 'b'. + */ +static int +smp_fetch_ssl_fc_use_keysize(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + SSL *ssl; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[4] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[4] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ? sc_conn(smp->strm->scb) : NULL; + + smp->flags = 0; + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + smp->data.u.sint = (unsigned int)SSL_get_cipher_bits(ssl, NULL); + if (!smp->data.u.sint) + return 0; + + smp->flags = SMP_F_VOL_SESS; + smp->data.type = SMP_T_SINT; + + return 1; +} + +#if defined(OPENSSL_NPN_NEGOTIATED) && !defined(OPENSSL_NO_NEXTPROTONEG) +static int +smp_fetch_ssl_fc_npn(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + SSL *ssl; + unsigned int len = 0; + + smp->flags = SMP_F_CONST; + smp->data.type = SMP_T_STR; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[4] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[4] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ?
sc_conn(smp->strm->scb) : NULL; + + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + smp->flags = SMP_F_VOL_SESS; + smp->data.u.str.area = NULL; + SSL_get0_next_proto_negotiated(ssl, + (const unsigned char **)&smp->data.u.str.area, + &len); + + if (!smp->data.u.str.area) + return 0; + + smp->data.u.str.data = len; + return 1; +} +#endif + +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation +static int +smp_fetch_ssl_fc_alpn(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + SSL *ssl; + unsigned int len = 0; + + smp->flags = SMP_F_VOL_SESS | SMP_F_CONST; + smp->data.type = SMP_T_STR; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[4] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[4] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ? sc_conn(smp->strm->scb) : NULL; + + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + smp->data.u.str.area = NULL; + SSL_get0_alpn_selected(ssl, + (const unsigned char **)&smp->data.u.str.area, + &len); + + if (!smp->data.u.str.area) + return 0; + + smp->data.u.str.data = len; + return 1; +} +#endif + +/* string, returns the used protocol if front conn. transport layer is SSL. + * This function is also usable on backend conn if the fetch keyword 5th + * char is 'b'. + */ +static int +smp_fetch_ssl_fc_protocol(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + SSL *ssl; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[4] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[4] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ? sc_conn(smp->strm->scb) : NULL; + + smp->flags = 0; + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + smp->data.u.str.area = (char *)SSL_get_version(ssl); + if (!smp->data.u.str.area) + return 0; + + smp->data.type = SMP_T_STR; + smp->flags = SMP_F_VOL_SESS | SMP_F_CONST; + smp->data.u.str.data = strlen(smp->data.u.str.area); + + return 1; +} + +/* binary, returns the SSL stream id if front conn. transport layer is SSL. + * This function is also usable on backend conn if the fetch keyword 5th + * char is 'b'. + */ +#if HA_OPENSSL_VERSION_NUMBER > 0x0090800fL +static int +smp_fetch_ssl_fc_session_id(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + SSL_SESSION *ssl_sess; + SSL *ssl; + unsigned int len = 0; + + smp->flags = SMP_F_VOL_SESS | SMP_F_CONST; + smp->data.type = SMP_T_BIN; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[4] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[4] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ? sc_conn(smp->strm->scb) : NULL; + + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + ssl_sess = SSL_get_session(ssl); + if (!ssl_sess) + return 0; + + smp->data.u.str.area = (char *)SSL_SESSION_get_id(ssl_sess, &len); + if (!smp->data.u.str.area || !len) + return 0; + + smp->data.u.str.data = len; + return 1; +} +#endif + + +#ifdef HAVE_SSL_EXTRACT_RANDOM +static int +smp_fetch_ssl_fc_random(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + struct buffer *data; + SSL *ssl; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[4] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[4] != 'b') ? 
objt_conn(smp->sess->origin) : + smp->strm ? sc_conn(smp->strm->scb) : NULL; + + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + data = get_trash_chunk(); + if (kw[7] == 'c') + data->data = SSL_get_client_random(ssl, + (unsigned char *) data->area, + data->size); + else + data->data = SSL_get_server_random(ssl, + (unsigned char *) data->area, + data->size); + if (!data->data) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_BIN; + smp->data.u.str = *data; + + return 1; +} + +static int +smp_fetch_ssl_fc_session_key(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + SSL_SESSION *ssl_sess; + struct buffer *data; + SSL *ssl; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[4] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[4] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ? sc_conn(smp->strm->scb) : NULL; + + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + ssl_sess = SSL_get_session(ssl); + if (!ssl_sess) + return 0; + + data = get_trash_chunk(); + data->data = SSL_SESSION_get_master_key(ssl_sess, + (unsigned char *) data->area, + data->size); + if (!data->data) + return 0; + + smp->flags = SMP_F_VOL_SESS; + smp->data.type = SMP_T_BIN; + smp->data.u.str = *data; + + return 1; +} +#endif + +static int +smp_fetch_ssl_fc_sni(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ +#ifdef SSL_CTRL_SET_TLSEXT_HOSTNAME + struct connection *conn; + SSL *ssl; + + smp->flags = SMP_F_VOL_SESS | SMP_F_CONST; + smp->data.type = SMP_T_STR; + + conn = objt_conn(smp->sess->origin); + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + smp->data.u.str.area = (char *)SSL_get_servername(ssl, TLSEXT_NAMETYPE_host_name); + if (!smp->data.u.str.area) { + /* We might have stored the SNI ourselves, look for it in the + * context's ex_data. + */ + smp->data.u.str.area = SSL_get_ex_data(ssl, ssl_client_sni_index); + + if (!smp->data.u.str.area) + return 0; + } + + smp->data.u.str.data = strlen(smp->data.u.str.area); + + return 1; +#else + /* SNI not supported */ + return 0; +#endif +} + +/* binary, returns tls client hello cipher list. + * Arguments: filter_option (0,1) + */ +static int +smp_fetch_ssl_fc_cl_bin(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct buffer *smp_trash; + struct connection *conn; + struct ssl_capture *capture; + SSL *ssl; + + conn = objt_conn(smp->sess->origin); + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + capture = SSL_get_ex_data(ssl, ssl_capture_ptr_index); + if (!capture) + return 0; + + if (args[0].data.sint) { + smp_trash = get_trash_chunk(); + exclude_tls_grease(capture->data + capture->ciphersuite_offset, capture->ciphersuite_len, smp_trash); + smp->data.u.str.area = smp_trash->area; + smp->data.u.str.data = smp_trash->data; + smp->flags = SMP_F_VOL_SESS; + } + else { + smp->data.u.str.area = capture->data + capture->ciphersuite_offset; + smp->data.u.str.data = capture->ciphersuite_len; + smp->flags = SMP_F_VOL_TEST | SMP_F_CONST; + } + + smp->data.type = SMP_T_BIN; + return 1; +} + +/* binary, returns tls client hello cipher list as hexadecimal string. 
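 * The hex dump simply concatenates the captured two-byte cipher ids: a
 * client offering 0x1302 then 0x1301 would render as "13021301"
 * (hypothetical capture), with GREASE values removed first when
 * filter_option is set to 1.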
+ * Arguments: filter_option (0,1) + */ +static int +smp_fetch_ssl_fc_cl_hex(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct buffer *data; + + if (!smp_fetch_ssl_fc_cl_bin(args, smp, kw, private)) + return 0; + + data = get_trash_chunk(); + dump_binary(data, smp->data.u.str.area, smp->data.u.str.data); + smp->flags = SMP_F_VOL_SESS; + smp->data.type = SMP_T_BIN; + smp->data.u.str = *data; + return 1; +} + +/* integer, returns xxh64 hash of tls client hello cipher list. */ +static int +smp_fetch_ssl_fc_cl_xxh64(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + struct ssl_capture *capture; + SSL *ssl; + + conn = objt_conn(smp->sess->origin); + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + capture = SSL_get_ex_data(ssl, ssl_capture_ptr_index); + if (!capture) + return 0; + + smp->flags = SMP_F_VOL_SESS; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = capture->xxh64; + return 1; +} + +static int +smp_fetch_ssl_fc_err(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + struct ssl_sock_ctx *ctx; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[4] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[4] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ? sc_conn(smp->strm->scb) : NULL; + + if (!conn) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags = SMP_F_MAY_CHANGE; + return 0; + } + + ctx = conn_get_ssl_sock_ctx(conn); + if (!ctx) + return 0; + + smp->flags = SMP_F_VOL_SESS; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = ctx->error_code; + return 1; +} + +static int +smp_fetch_ssl_fc_protocol_hello_id(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + struct ssl_capture *capture; + SSL *ssl; + + conn = objt_conn(smp->sess->origin); + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + capture = SSL_get_ex_data(ssl, ssl_capture_ptr_index); + if (!capture) + return 0; + + smp->flags = SMP_F_VOL_SESS; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = capture->protocol_version; + return 1; +} + +static int +smp_fetch_ssl_fc_err_str(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + struct ssl_sock_ctx *ctx; + const char *err_code_str; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[4] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[4] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ? sc_conn(smp->strm->scb) : NULL; + + if (!conn) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags = SMP_F_MAY_CHANGE; + return 0; + } + + ctx = conn_get_ssl_sock_ctx(conn); + if (!ctx || !ctx->error_code) + return 0; + + err_code_str = ERR_error_string(ctx->error_code, NULL); + + smp->flags = SMP_F_VOL_SESS; + smp->data.type = SMP_T_STR; + smp->data.u.str.area = (char*)err_code_str; + smp->data.u.str.data = strlen(err_code_str); + + return 1; +} + +/* binary, returns tls client hello extensions list. 
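 * Together with the cipher list, elliptic curve and point-format captures
 * around it, this capture is typically used to build a JA3-style client
 * fingerprint; the exact fetch names are those registered in this file's
 * keyword table.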
+ * Arguments: filter_option (0,1) + */ +static int +smp_fetch_ssl_fc_ext_bin(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct buffer *smp_trash; + struct connection *conn; + struct ssl_capture *capture; + SSL *ssl; + + conn = objt_conn(smp->sess->origin); + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + capture = SSL_get_ex_data(ssl, ssl_capture_ptr_index); + if (!capture) + return 0; + + if (args[0].data.sint) { + smp_trash = get_trash_chunk(); + exclude_tls_grease(capture->data + capture->extensions_offset, capture->extensions_len, smp_trash); + smp->data.u.str.area = smp_trash->area; + smp->data.u.str.data = smp_trash->data; + smp->flags = SMP_F_VOL_SESS; + } + else { + smp->data.u.str.area = capture->data + capture->extensions_offset; + smp->data.u.str.data = capture->extensions_len; + smp->flags = SMP_F_VOL_TEST | SMP_F_CONST; + } + + smp->data.type = SMP_T_BIN; + return 1; +} + +/* binary, returns tls client hello supported elliptic curves. + * Arguments: filter_option (0,1) + */ +static int +smp_fetch_ssl_fc_ecl_bin(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct buffer *smp_trash; + struct connection *conn; + struct ssl_capture *capture; + SSL *ssl; + + conn = objt_conn(smp->sess->origin); + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + capture = SSL_get_ex_data(ssl, ssl_capture_ptr_index); + if (!capture) + return 0; + + if (args[0].data.sint) { + smp_trash = get_trash_chunk(); + exclude_tls_grease(capture->data + capture->ec_offset, capture->ec_len, smp_trash); + smp->data.u.str.area = smp_trash->area; + smp->data.u.str.data = smp_trash->data; + smp->flags = SMP_F_VOL_SESS; + } + else { + smp->data.u.str.area = capture->data + capture->ec_offset; + smp->data.u.str.data = capture->ec_len; + smp->flags = SMP_F_VOL_TEST | SMP_F_CONST; + } + + smp->data.type = SMP_T_BIN; + return 1; +} + +/* binary, returns tls client hello supported elliptic curve point formats */ +static int +smp_fetch_ssl_fc_ecf_bin(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + struct ssl_capture *capture; + SSL *ssl; + + conn = objt_conn(smp->sess->origin); + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + capture = SSL_get_ex_data(ssl, ssl_capture_ptr_index); + if (!capture) + return 0; + + smp->flags = SMP_F_VOL_TEST | SMP_F_CONST; + smp->data.type = SMP_T_BIN; + smp->data.u.str.area = capture->data + capture->ec_formats_offset; + smp->data.u.str.data = capture->ec_formats_len; + return 1; +} + +/* Dump the SSL keylog, it only works with "tune.ssl.keylog 1" */ +#ifdef HAVE_SSL_KEYLOG +static int smp_fetch_ssl_x_keylog(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + struct ssl_keylog *keylog; + SSL *ssl; + char *src = NULL; + const char *sfx; + + if (global_ssl.keylog <= 0) + return 0; + + conn = (kw[4] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ? 
sc_conn(smp->strm->scb) : NULL; + + if (!conn) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + keylog = SSL_get_ex_data(ssl, ssl_keylog_index); + if (!keylog) + return 0; + + sfx = kw + strlen("ssl_xx_"); + + if (strcmp(sfx, "client_early_traffic_secret") == 0) { + src = keylog->client_early_traffic_secret; + } else if (strcmp(sfx, "client_handshake_traffic_secret") == 0) { + src = keylog->client_handshake_traffic_secret; + } else if (strcmp(sfx, "server_handshake_traffic_secret") == 0) { + src = keylog->server_handshake_traffic_secret; + } else if (strcmp(sfx, "client_traffic_secret_0") == 0) { + src = keylog->client_traffic_secret_0; + } else if (strcmp(sfx, "server_traffic_secret_0") == 0) { + src = keylog->server_traffic_secret_0; + } else if (strcmp(sfx, "exporter_secret") == 0) { + src = keylog->exporter_secret; + } else if (strcmp(sfx, "early_exporter_secret") == 0) { + src = keylog->early_exporter_secret; + } + + if (!src || !*src) + return 0; + + smp->data.u.str.area = src; + smp->data.type = SMP_T_STR; + smp->flags |= SMP_F_VOL_TEST | SMP_F_CONST; + smp->data.u.str.data = strlen(smp->data.u.str.area); + return 1; +} +#endif + +static int +smp_fetch_ssl_fc_cl_str(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ +#if defined(OPENSSL_IS_BORINGSSL) || defined(SSL_CTRL_GET_RAW_CIPHERLIST) + struct buffer *data; + int i; + + if (!smp_fetch_ssl_fc_cl_bin(args, smp, kw, private)) + return 0; + + data = get_trash_chunk(); + for (i = 0; i + 1 < smp->data.u.str.data; i += 2) { + const char *str; + const SSL_CIPHER *cipher; + const unsigned char *bin = (const unsigned char *) smp->data.u.str.area + i; + uint16_t id = (bin[0] << 8) | bin[1]; +#if defined(OPENSSL_IS_BORINGSSL) + cipher = SSL_get_cipher_by_value(id); +#else + struct connection *conn = __objt_conn(smp->sess->origin); + SSL *ssl = ssl_sock_get_ssl_object(conn); + cipher = SSL_CIPHER_find(ssl, bin); +#endif + str = SSL_CIPHER_get_name(cipher); + if (!str || strcmp(str, "(NONE)") == 0) + chunk_appendf(data, "%sUNKNOWN(%04x)", i == 0 ? "" : ",", id); + else + chunk_appendf(data, "%s%s", i == 0 ? "" : ",", str); + } + smp->data.type = SMP_T_STR; + smp->data.u.str = *data; + return 1; +#else + return smp_fetch_ssl_fc_cl_xxh64(args, smp, kw, private); +#endif +} + +#if HA_OPENSSL_VERSION_NUMBER > 0x0090800fL +static int +smp_fetch_ssl_fc_unique_id(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + int finished_len; + struct buffer *finished_trash; + SSL *ssl; + + if (obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + conn = (kw[4] == 'b') ? sc_conn(__objt_check(smp->sess->origin)->sc) : NULL; + else + conn = (kw[4] != 'b') ? objt_conn(smp->sess->origin) : + smp->strm ? 
sc_conn(smp->strm->scb) : NULL; + + smp->flags = 0; + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + finished_trash = get_trash_chunk(); + if (!SSL_session_reused(ssl)) + finished_len = SSL_get_peer_finished(ssl, + finished_trash->area, + finished_trash->size); + else + finished_len = SSL_get_finished(ssl, + finished_trash->area, + finished_trash->size); + + if (!finished_len) + return 0; + + finished_trash->data = finished_len; + smp->flags = SMP_F_VOL_SESS; + smp->data.u.str = *finished_trash; + smp->data.type = SMP_T_BIN; + + return 1; +} +#endif + +/* integer, returns the first verify error in CA chain of client certificate chain. */ +static int +smp_fetch_ssl_c_ca_err(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn = objt_conn(smp->sess->origin); + struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn); + + if (conn && conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags = SMP_F_MAY_CHANGE; + return 0; + } + + if (!ctx) + return 0; + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = (unsigned long long int)SSL_SOCK_ST_TO_CA_ERROR(ctx->xprt_st); + smp->flags = SMP_F_VOL_SESS; + + return 1; +} + +/* integer, returns the depth of the first verify error in CA chain of client certificate chain. */ +static int +smp_fetch_ssl_c_ca_err_depth(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn = objt_conn(smp->sess->origin); + struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn); + + if (conn && conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags = SMP_F_MAY_CHANGE; + return 0; + } + + if (!ctx) + return 0; + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = (long long int)SSL_SOCK_ST_TO_CAEDEPTH(ctx->xprt_st); + smp->flags = SMP_F_VOL_SESS; + + return 1; +} + +/* integer, returns the first verify error on client certificate */ +static int +smp_fetch_ssl_c_err(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn = objt_conn(smp->sess->origin); + struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn); + + if (conn && conn->flags & CO_FL_WAIT_XPRT && !conn->err_code) { + smp->flags = SMP_F_MAY_CHANGE; + return 0; + } + + if (!ctx) + return 0; + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = (long long int)SSL_SOCK_ST_TO_CRTERROR(ctx->xprt_st); + smp->flags = SMP_F_VOL_SESS; + + return 1; +} + +/* integer, returns the verify result on client cert */ +static int +smp_fetch_ssl_c_verify(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + SSL *ssl; + + conn = objt_conn(smp->sess->origin); + ssl = ssl_sock_get_ssl_object(conn); + if (!ssl) + return 0; + + if (conn->flags & CO_FL_WAIT_XPRT) { + smp->flags = SMP_F_MAY_CHANGE; + return 0; + } + + smp->data.type = SMP_T_SINT; + smp->data.u.sint = (long long int)SSL_get_verify_result(ssl); + smp->flags = SMP_F_VOL_SESS; + + return 1; +} + +/* Argument validation functions */ + +/* This function is used to validate the arguments passed to any "x_dn" ssl + * keywords. These keywords support specifying a third parameter that must be + * either empty or the value "rfc2253". Returns 0 on error, non-zero if OK. 
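+ * As an illustration, a hypothetical configuration line such as:
+ *
+ *     http-request set-header X-Client-DN %[ssl_c_s_dn(,0,rfc2253)]
+ *
+ * passes this check, while any non-empty format value other than
+ * "rfc2253" is rejected at parsing time.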
+ */ +int val_dnfmt(struct arg *arg, char **err_msg) +{ + if (arg && arg[2].type == ARGT_STR && arg[2].data.str.data > 0 && (strcmp(arg[2].data.str.area, "rfc2253") != 0)) { + memprintf(err_msg, "only rfc2253 or a blank value are currently supported as the format argument."); + return 0; + } + return 1; +} + +/* Note: must not be declared <const> as its list will be overwritten. + * Please take care of keeping this list alphabetically sorted. + */ +static struct sample_fetch_kw_list sample_fetch_keywords = {ILH, { + { "ssl_bc", smp_fetch_ssl_fc, 0, NULL, SMP_T_BOOL, SMP_USE_L5SRV }, + { "ssl_bc_alg_keysize", smp_fetch_ssl_fc_alg_keysize, 0, NULL, SMP_T_SINT, SMP_USE_L5SRV }, +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation + { "ssl_bc_alpn", smp_fetch_ssl_fc_alpn, 0, NULL, SMP_T_STR, SMP_USE_L5SRV }, +#endif + { "ssl_bc_cipher", smp_fetch_ssl_fc_cipher, 0, NULL, SMP_T_STR, SMP_USE_L5SRV }, +#if (HA_OPENSSL_VERSION_NUMBER >= 0x3000000fL) + { "ssl_bc_curve", smp_fetch_ssl_fc_ec, 0, NULL, SMP_T_STR, SMP_USE_L5SRV }, +#endif +#if defined(OPENSSL_NPN_NEGOTIATED) && !defined(OPENSSL_NO_NEXTPROTONEG) + { "ssl_bc_npn", smp_fetch_ssl_fc_npn, 0, NULL, SMP_T_STR, SMP_USE_L5SRV }, +#endif + { "ssl_bc_is_resumed", smp_fetch_ssl_fc_is_resumed, 0, NULL, SMP_T_BOOL, SMP_USE_L5SRV }, + { "ssl_bc_protocol", smp_fetch_ssl_fc_protocol, 0, NULL, SMP_T_STR, SMP_USE_L5SRV }, + { "ssl_bc_unique_id", smp_fetch_ssl_fc_unique_id, 0, NULL, SMP_T_BIN, SMP_USE_L5SRV }, + { "ssl_bc_use_keysize", smp_fetch_ssl_fc_use_keysize, 0, NULL, SMP_T_SINT, SMP_USE_L5SRV }, +#if HA_OPENSSL_VERSION_NUMBER > 0x0090800fL + { "ssl_bc_session_id", smp_fetch_ssl_fc_session_id, 0, NULL, SMP_T_BIN, SMP_USE_L5SRV }, +#endif +#ifdef HAVE_SSL_EXTRACT_RANDOM + { "ssl_bc_client_random", smp_fetch_ssl_fc_random, 0, NULL, SMP_T_BIN, SMP_USE_L5SRV }, + { "ssl_bc_server_random", smp_fetch_ssl_fc_random, 0, NULL, SMP_T_BIN, SMP_USE_L5SRV }, + { "ssl_bc_session_key", smp_fetch_ssl_fc_session_key, 0, NULL, SMP_T_BIN, SMP_USE_L5SRV }, +#endif + { "ssl_bc_err", smp_fetch_ssl_fc_err, 0, NULL, SMP_T_SINT, SMP_USE_L5SRV }, + { "ssl_bc_err_str", smp_fetch_ssl_fc_err_str, 0, NULL, SMP_T_STR, SMP_USE_L5SRV }, + { "ssl_c_ca_err", smp_fetch_ssl_c_ca_err, 0, NULL, SMP_T_SINT, SMP_USE_L5CLI }, + { "ssl_c_ca_err_depth", smp_fetch_ssl_c_ca_err_depth, 0, NULL, SMP_T_SINT, SMP_USE_L5CLI }, + { "ssl_c_der", smp_fetch_ssl_x_der, 0, NULL, SMP_T_BIN, SMP_USE_L5CLI }, + { "ssl_c_chain_der", smp_fetch_ssl_x_chain_der, 0, NULL, SMP_T_BIN, SMP_USE_L5CLI }, + { "ssl_c_err", smp_fetch_ssl_c_err, 0, NULL, SMP_T_SINT, SMP_USE_L5CLI }, + { "ssl_c_i_dn", smp_fetch_ssl_x_i_dn, ARG3(0,STR,SINT,STR),val_dnfmt, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_c_key_alg", smp_fetch_ssl_x_key_alg, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_c_notafter", smp_fetch_ssl_x_notafter, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_c_notbefore", smp_fetch_ssl_x_notbefore, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, +#ifdef HAVE_SSL_get0_verified_chain + { "ssl_c_r_dn", smp_fetch_ssl_r_dn, ARG3(0,STR,SINT,STR),val_dnfmt, SMP_T_STR, SMP_USE_L5CLI }, +#endif + { "ssl_c_sig_alg", smp_fetch_ssl_x_sig_alg, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_c_s_dn", smp_fetch_ssl_x_s_dn, ARG3(0,STR,SINT,STR),val_dnfmt, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_c_serial", smp_fetch_ssl_x_serial, 0, NULL, SMP_T_BIN, SMP_USE_L5CLI }, + { "ssl_c_sha1", smp_fetch_ssl_x_sha1, 0, NULL, SMP_T_BIN, SMP_USE_L5CLI }, + { "ssl_c_used", smp_fetch_ssl_c_used, 0, NULL, SMP_T_BOOL, SMP_USE_L5CLI }, + { "ssl_c_verify", 
smp_fetch_ssl_c_verify, 0, NULL, SMP_T_SINT, SMP_USE_L5CLI }, + { "ssl_c_version", smp_fetch_ssl_x_version, 0, NULL, SMP_T_SINT, SMP_USE_L5CLI }, + { "ssl_f_der", smp_fetch_ssl_x_der, 0, NULL, SMP_T_BIN, SMP_USE_L5CLI }, + { "ssl_f_i_dn", smp_fetch_ssl_x_i_dn, ARG3(0,STR,SINT,STR),val_dnfmt, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_f_key_alg", smp_fetch_ssl_x_key_alg, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_f_notafter", smp_fetch_ssl_x_notafter, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_f_notbefore", smp_fetch_ssl_x_notbefore, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_f_sig_alg", smp_fetch_ssl_x_sig_alg, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_f_s_dn", smp_fetch_ssl_x_s_dn, ARG3(0,STR,SINT,STR),val_dnfmt, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_f_serial", smp_fetch_ssl_x_serial, 0, NULL, SMP_T_BIN, SMP_USE_L5CLI }, + { "ssl_f_sha1", smp_fetch_ssl_x_sha1, 0, NULL, SMP_T_BIN, SMP_USE_L5CLI }, + { "ssl_f_version", smp_fetch_ssl_x_version, 0, NULL, SMP_T_SINT, SMP_USE_L5CLI }, + { "ssl_fc", smp_fetch_ssl_fc, 0, NULL, SMP_T_BOOL, SMP_USE_L5CLI }, + { "ssl_fc_alg_keysize", smp_fetch_ssl_fc_alg_keysize, 0, NULL, SMP_T_SINT, SMP_USE_L5CLI }, + { "ssl_fc_cipher", smp_fetch_ssl_fc_cipher, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, +#if (HA_OPENSSL_VERSION_NUMBER >= 0x3000000fL) + { "ssl_fc_curve", smp_fetch_ssl_fc_ec, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, +#endif + { "ssl_fc_has_crt", smp_fetch_ssl_fc_has_crt, 0, NULL, SMP_T_BOOL, SMP_USE_L5CLI }, + { "ssl_fc_has_early", smp_fetch_ssl_fc_has_early, 0, NULL, SMP_T_BOOL, SMP_USE_L5CLI }, + { "ssl_fc_has_sni", smp_fetch_ssl_fc_has_sni, 0, NULL, SMP_T_BOOL, SMP_USE_L5CLI }, + { "ssl_fc_is_resumed", smp_fetch_ssl_fc_is_resumed, 0, NULL, SMP_T_BOOL, SMP_USE_L5CLI }, +#if defined(OPENSSL_NPN_NEGOTIATED) && !defined(OPENSSL_NO_NEXTPROTONEG) + { "ssl_fc_npn", smp_fetch_ssl_fc_npn, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, +#endif +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation + { "ssl_fc_alpn", smp_fetch_ssl_fc_alpn, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, +#endif + { "ssl_fc_protocol", smp_fetch_ssl_fc_protocol, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, +#if HA_OPENSSL_VERSION_NUMBER > 0x0090800fL + { "ssl_fc_unique_id", smp_fetch_ssl_fc_unique_id, 0, NULL, SMP_T_BIN, SMP_USE_L5CLI }, +#endif + { "ssl_fc_use_keysize", smp_fetch_ssl_fc_use_keysize, 0, NULL, SMP_T_SINT, SMP_USE_L5CLI }, +#if HA_OPENSSL_VERSION_NUMBER > 0x0090800fL + { "ssl_fc_session_id", smp_fetch_ssl_fc_session_id, 0, NULL, SMP_T_BIN, SMP_USE_L5CLI }, +#endif +#ifdef HAVE_SSL_EXTRACT_RANDOM + { "ssl_fc_client_random", smp_fetch_ssl_fc_random, 0, NULL, SMP_T_BIN, SMP_USE_L5CLI }, + { "ssl_fc_server_random", smp_fetch_ssl_fc_random, 0, NULL, SMP_T_BIN, SMP_USE_L5CLI }, + { "ssl_fc_session_key", smp_fetch_ssl_fc_session_key, 0, NULL, SMP_T_BIN, SMP_USE_L5CLI }, +#endif + +#ifdef HAVE_SSL_KEYLOG + { "ssl_fc_client_early_traffic_secret", smp_fetch_ssl_x_keylog, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_fc_client_handshake_traffic_secret", smp_fetch_ssl_x_keylog, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_fc_server_handshake_traffic_secret", smp_fetch_ssl_x_keylog, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_fc_client_traffic_secret_0", smp_fetch_ssl_x_keylog, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_fc_server_traffic_secret_0", smp_fetch_ssl_x_keylog, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_fc_exporter_secret", smp_fetch_ssl_x_keylog, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_fc_early_exporter_secret", smp_fetch_ssl_x_keylog, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, 
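	/* A typical use of the keylog fetches above is rebuilding an
	 * SSLKEYLOGFILE-style log for traffic decryption tools; a
	 * hypothetical log-format line would look like:
	 *
	 *   log-format "CLIENT_HANDSHAKE_TRAFFIC_SECRET %[ssl_fc_client_random,hex] %[ssl_fc_client_handshake_traffic_secret]"
	 *
	 * (this requires tune.ssl.keylog to be enabled in the global
	 * section, as noted on smp_fetch_ssl_x_keylog above)
	 */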
+#endif + + { "ssl_fc_sni", smp_fetch_ssl_fc_sni, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_fc_cipherlist_bin", smp_fetch_ssl_fc_cl_bin, ARG1(0,SINT), NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_fc_cipherlist_hex", smp_fetch_ssl_fc_cl_hex, ARG1(0,SINT), NULL, SMP_T_BIN, SMP_USE_L5CLI }, + { "ssl_fc_cipherlist_str", smp_fetch_ssl_fc_cl_str, ARG1(0,SINT), NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_fc_cipherlist_xxh", smp_fetch_ssl_fc_cl_xxh64, 0, NULL, SMP_T_SINT, SMP_USE_L5CLI }, + { "ssl_fc_err", smp_fetch_ssl_fc_err, 0, NULL, SMP_T_SINT, SMP_USE_L5CLI }, + { "ssl_fc_err_str", smp_fetch_ssl_fc_err_str, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_fc_protocol_hello_id",smp_fetch_ssl_fc_protocol_hello_id,0, NULL, SMP_T_SINT, SMP_USE_L5CLI }, + { "ssl_fc_extlist_bin", smp_fetch_ssl_fc_ext_bin, ARG1(0,SINT), NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_fc_eclist_bin", smp_fetch_ssl_fc_ecl_bin, ARG1(0,SINT), NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_fc_ecformats_bin", smp_fetch_ssl_fc_ecf_bin, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + +/* SSL server certificate fetches */ + { "ssl_s_der", smp_fetch_ssl_x_der, 0, NULL, SMP_T_BIN, SMP_USE_L5CLI }, + { "ssl_s_chain_der", smp_fetch_ssl_x_chain_der, 0, NULL, SMP_T_BIN, SMP_USE_L5CLI }, + { "ssl_s_key_alg", smp_fetch_ssl_x_key_alg, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_s_notafter", smp_fetch_ssl_x_notafter, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_s_notbefore", smp_fetch_ssl_x_notbefore, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_s_sig_alg", smp_fetch_ssl_x_sig_alg, 0, NULL, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_s_s_dn", smp_fetch_ssl_x_s_dn, ARG3(0,STR,SINT,STR),val_dnfmt, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_s_i_dn", smp_fetch_ssl_x_i_dn, ARG3(0,STR,SINT,STR),val_dnfmt, SMP_T_STR, SMP_USE_L5CLI }, + { "ssl_s_serial", smp_fetch_ssl_x_serial, 0, NULL, SMP_T_BIN, SMP_USE_L5CLI }, + { "ssl_s_sha1", smp_fetch_ssl_x_sha1, 0, NULL, SMP_T_BIN, SMP_USE_L5CLI }, + { "ssl_s_version", smp_fetch_ssl_x_version, 0, NULL, SMP_T_SINT, SMP_USE_L5CLI }, + { NULL, NULL, 0, 0, 0 }, +}}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &sample_fetch_keywords); + +/* Note: must not be declared <const> as its list will be overwritten */ +static struct sample_conv_kw_list sample_conv_kws = {ILH, { + { "sha2", sample_conv_sha2, ARG1(0, SINT), smp_check_sha2, SMP_T_BIN, SMP_T_BIN }, +#ifdef EVP_CIPH_GCM_MODE + { "aes_gcm_dec", sample_conv_aes_gcm_dec, ARG4(4,SINT,STR,STR,STR), check_aes_gcm, SMP_T_BIN, SMP_T_BIN }, +#endif + { "x509_v_err_str", sample_conv_x509_v_err, 0, NULL, SMP_T_SINT, SMP_T_STR }, + { "digest", sample_conv_crypto_digest, ARG1(1,STR), check_crypto_digest, SMP_T_BIN, SMP_T_BIN }, + { "hmac", sample_conv_crypto_hmac, ARG2(2,STR,STR), check_crypto_hmac, SMP_T_BIN, SMP_T_BIN }, +#if defined(HAVE_CRYPTO_memcmp) + { "secure_memcmp", sample_conv_secure_memcmp, ARG1(1,STR), smp_check_secure_memcmp, SMP_T_BIN, SMP_T_BOOL }, +#endif + { NULL, NULL, 0, 0, 0 }, +}}; + +INITCALL1(STG_REGISTER, sample_register_convs, &sample_conv_kws); + + +/* Note: must not be declared <const> as its list will be overwritten. + * Please take care of keeping this list alphabetically sorted. 
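+ * For example (hypothetical ACL), the "ssl_fc_sni_end" entry below lets
+ *     acl site ssl_fc_sni_end .example.com
+ * apply the PAT_MATCH_END (suffix) matcher to the ssl_fc_sni fetch.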
+ */ +static struct acl_kw_list acl_kws = {ILH, { + { "ssl_fc_sni_end", "ssl_fc_sni", PAT_MATCH_END }, + { "ssl_fc_sni_reg", "ssl_fc_sni", PAT_MATCH_REG }, + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, acl_register_keywords, &acl_kws); diff --git a/src/ssl_sock.c b/src/ssl_sock.c new file mode 100644 index 0000000..6fbabb4 --- /dev/null +++ b/src/ssl_sock.c @@ -0,0 +1,8100 @@ + +/* + * SSL/TLS transport layer over SOCK_STREAM sockets + * + * Copyright (C) 2012 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Acknowledgement: + * We'd like to specially thank the Stud project authors for a very clean + * and well documented code which helped us understand how the OpenSSL API + * ought to be used in non-blocking mode. This is one difficult part which + * is not easy to get from the OpenSSL doc, and reading the Stud code made + * it much more obvious than the examples in the OpenSSL package. Keep up + * the good works, guys ! + * + * Stud is an extremely efficient and scalable SSL/TLS proxy which combines + * particularly well with haproxy. For more info about this project, visit : + * https://github.com/bumptech/stud + * + */ + +/* Note: do NOT include openssl/xxx.h here, do it in openssl-compat.h */ +#define _GNU_SOURCE +#include <ctype.h> +#include <dirent.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <netdb.h> +#include <netinet/tcp.h> + +#include <import/ebpttree.h> +#include <import/ebsttree.h> +#include <import/lru.h> + +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/arg.h> +#include <haproxy/base64.h> +#include <haproxy/channel.h> +#include <haproxy/chunk.h> +#include <haproxy/cli.h> +#include <haproxy/connection.h> +#include <haproxy/dynbuf.h> +#include <haproxy/errors.h> +#include <haproxy/fd.h> +#include <haproxy/freq_ctr.h> +#include <haproxy/frontend.h> +#include <haproxy/global.h> +#include <haproxy/http_rules.h> +#include <haproxy/log.h> +#include <haproxy/openssl-compat.h> +#include <haproxy/pattern-t.h> +#include <haproxy/proto_tcp.h> +#include <haproxy/proxy.h> +#include <haproxy/quic_conn.h> +#include <haproxy/quic_openssl_compat.h> +#include <haproxy/quic_tp.h> +#include <haproxy/sample.h> +#include <haproxy/sc_strm.h> +#include <haproxy/server.h> +#include <haproxy/shctx.h> +#include <haproxy/ssl_ckch.h> +#include <haproxy/ssl_crtlist.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/ssl_utils.h> +#include <haproxy/stats.h> +#include <haproxy/stconn.h> +#include <haproxy/stream-t.h> +#include <haproxy/task.h> +#include <haproxy/ticks.h> +#include <haproxy/time.h> +#include <haproxy/tools.h> +#include <haproxy/vars.h> +#include <haproxy/xxhash.h> +#include <haproxy/istbuf.h> +#include <haproxy/ssl_ocsp.h> + + +/* ***** READ THIS before adding code here! ***** + * + * Due to API incompatibilities between multiple OpenSSL versions and their + * derivatives, it's often tempting to add macros to (re-)define certain + * symbols. Please do not do this here, and do it in common/openssl-compat.h + * exclusively so that the whole code consistently uses the same macros. 
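+ * As a sketch of the intent (SOME_MISSING_MACRO is hypothetical), rather
+ * than writing
+ *     #if HA_OPENSSL_VERSION_NUMBER < 0x10100000L
+ *     #define SOME_MISSING_MACRO ...
+ *     #endif
+ * at each call site, the definition belongs once in openssl-compat.h.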
+ * + * Whenever possible if a macro is missing in certain versions, it's better + * to conditionally define it in openssl-compat.h than using lots of ifdefs. + */ + +int nb_engines = 0; + +static struct eb_root cert_issuer_tree = EB_ROOT; /* issuers tree from "issuers-chain-path" */ + +struct global_ssl global_ssl = { +#ifdef LISTEN_DEFAULT_CIPHERS + .listen_default_ciphers = LISTEN_DEFAULT_CIPHERS, +#endif +#ifdef CONNECT_DEFAULT_CIPHERS + .connect_default_ciphers = CONNECT_DEFAULT_CIPHERS, +#endif +#ifdef HAVE_SSL_CTX_SET_CIPHERSUITES + .listen_default_ciphersuites = LISTEN_DEFAULT_CIPHERSUITES, + .connect_default_ciphersuites = CONNECT_DEFAULT_CIPHERSUITES, +#endif + .listen_default_ssloptions = BC_SSL_O_NONE, + .connect_default_ssloptions = SRV_SSL_O_NONE, + + .listen_default_sslmethods.flags = MC_SSL_O_ALL, + .listen_default_sslmethods.min = CONF_TLSV_NONE, + .listen_default_sslmethods.max = CONF_TLSV_NONE, + .connect_default_sslmethods.flags = MC_SSL_O_ALL, + .connect_default_sslmethods.min = CONF_TLSV_NONE, + .connect_default_sslmethods.max = CONF_TLSV_NONE, + +#ifdef DEFAULT_SSL_MAX_RECORD + .max_record = DEFAULT_SSL_MAX_RECORD, +#endif + .hard_max_record = 0, + .default_dh_param = SSL_DEFAULT_DH_PARAM, + .ctx_cache = DEFAULT_SSL_CTX_CACHE, + .capture_buffer_size = 0, + .extra_files = SSL_GF_ALL, + .extra_files_noext = 0, +#ifdef HAVE_SSL_KEYLOG + .keylog = 0, +#endif +#ifndef OPENSSL_NO_OCSP + .ocsp_update.delay_max = SSL_OCSP_UPDATE_DELAY_MAX, + .ocsp_update.delay_min = SSL_OCSP_UPDATE_DELAY_MIN, +#endif +}; + +static BIO_METHOD *ha_meth; + +DECLARE_STATIC_POOL(ssl_sock_ctx_pool, "ssl_sock_ctx", sizeof(struct ssl_sock_ctx)); + +DECLARE_STATIC_POOL(ssl_sock_client_sni_pool, "ssl_sock_client_sni", TLSEXT_MAXLEN_host_name + 1); + +/* ssl stats module */ +enum { + SSL_ST_SESS, + SSL_ST_REUSED_SESS, + SSL_ST_FAILED_HANDSHAKE, + + SSL_ST_STATS_COUNT /* must be the last member of the enum */ +}; + +static struct name_desc ssl_stats[] = { + [SSL_ST_SESS] = { .name = "ssl_sess", + .desc = "Total number of ssl sessions established" }, + [SSL_ST_REUSED_SESS] = { .name = "ssl_reused_sess", + .desc = "Total number of ssl sessions reused" }, + [SSL_ST_FAILED_HANDSHAKE] = { .name = "ssl_failed_handshake", + .desc = "Total number of failed handshake" }, +}; + +static struct ssl_counters { + long long sess; + long long reused_sess; + long long failed_handshake; +} ssl_counters; + +static void ssl_fill_stats(void *data, struct field *stats) +{ + struct ssl_counters *counters = data; + + stats[SSL_ST_SESS] = mkf_u64(FN_COUNTER, counters->sess); + stats[SSL_ST_REUSED_SESS] = mkf_u64(FN_COUNTER, counters->reused_sess); + stats[SSL_ST_FAILED_HANDSHAKE] = mkf_u64(FN_COUNTER, counters->failed_handshake); +} + +static struct stats_module ssl_stats_module = { + .name = "ssl", + .fill_stats = ssl_fill_stats, + .stats = ssl_stats, + .stats_count = SSL_ST_STATS_COUNT, + .counters = &ssl_counters, + .counters_size = sizeof(ssl_counters), + .domain_flags = MK_STATS_PROXY_DOMAIN(STATS_PX_CAP_FE|STATS_PX_CAP_LI|STATS_PX_CAP_BE|STATS_PX_CAP_SRV), + .clearable = 1, +}; + +INITCALL1(STG_REGISTER, stats_register_module, &ssl_stats_module); + +/* CLI context for "show tls-keys" */ +struct show_keys_ctx { + struct tls_keys_ref *next_ref; /* next reference to be dumped */ + int names_only; /* non-zero = only show file names */ + int next_index; /* next index to be dumped */ + int dump_entries; /* dump entries also */ + enum { + SHOW_KEYS_INIT = 0, + SHOW_KEYS_LIST, + SHOW_KEYS_DONE, + } state; /* phase of the current 
dump */ +}; + +/* ssl_sock_io_cb is exported to see it resolved in "show fd" */ +struct task *ssl_sock_io_cb(struct task *, void *, unsigned int); +static int ssl_sock_handshake(struct connection *conn, unsigned int flag); + +/* Methods to implement OpenSSL BIO */ +static int ha_ssl_write(BIO *h, const char *buf, int num) +{ + struct buffer tmpbuf; + struct ssl_sock_ctx *ctx; + uint flags; + int ret; + + ctx = BIO_get_data(h); + tmpbuf.size = num; + tmpbuf.area = (void *)(uintptr_t)buf; + tmpbuf.data = num; + tmpbuf.head = 0; + flags = (ctx->xprt_st & SSL_SOCK_SEND_MORE) ? CO_SFL_MSG_MORE : 0; + ret = ctx->xprt->snd_buf(ctx->conn, ctx->xprt_ctx, &tmpbuf, num, flags); + BIO_clear_retry_flags(h); + if (ret == 0 && !(ctx->conn->flags & (CO_FL_ERROR | CO_FL_SOCK_WR_SH))) { + BIO_set_retry_write(h); + ret = -1; + } + return ret; +} + +static int ha_ssl_gets(BIO *h, char *buf, int size) +{ + + return 0; +} + +static int ha_ssl_puts(BIO *h, const char *str) +{ + + return ha_ssl_write(h, str, strlen(str)); +} + +static int ha_ssl_read(BIO *h, char *buf, int size) +{ + struct buffer tmpbuf; + struct ssl_sock_ctx *ctx; + int ret; + + ctx = BIO_get_data(h); + tmpbuf.size = size; + tmpbuf.area = buf; + tmpbuf.data = 0; + tmpbuf.head = 0; + ret = ctx->xprt->rcv_buf(ctx->conn, ctx->xprt_ctx, &tmpbuf, size, 0); + BIO_clear_retry_flags(h); + if (ret == 0 && !(ctx->conn->flags & (CO_FL_ERROR | CO_FL_SOCK_RD_SH))) { + BIO_set_retry_read(h); + ret = -1; + } + + return ret; +} + +static long ha_ssl_ctrl(BIO *h, int cmd, long arg1, void *arg2) +{ + int ret = 0; + switch (cmd) { + case BIO_CTRL_DUP: + case BIO_CTRL_FLUSH: + ret = 1; + break; + } + return ret; +} + +static int ha_ssl_new(BIO *h) +{ + BIO_set_init(h, 1); + BIO_set_data(h, NULL); + BIO_clear_flags(h, ~0); + return 1; +} + +static int ha_ssl_free(BIO *data) +{ + + return 1; +} + + +#if defined(USE_THREAD) && (HA_OPENSSL_VERSION_NUMBER < 0x10100000L) + +static HA_RWLOCK_T *ssl_rwlocks; + + +unsigned long ssl_id_function(void) +{ + return (unsigned long)tid; +} + +void ssl_locking_function(int mode, int n, const char * file, int line) +{ + if (mode & CRYPTO_LOCK) { + if (mode & CRYPTO_READ) + HA_RWLOCK_RDLOCK(SSL_LOCK, &ssl_rwlocks[n]); + else + HA_RWLOCK_WRLOCK(SSL_LOCK, &ssl_rwlocks[n]); + } + else { + if (mode & CRYPTO_READ) + HA_RWLOCK_RDUNLOCK(SSL_LOCK, &ssl_rwlocks[n]); + else + HA_RWLOCK_WRUNLOCK(SSL_LOCK, &ssl_rwlocks[n]); + } +} + +static int ssl_locking_init(void) +{ + int i; + + ssl_rwlocks = malloc(sizeof(HA_RWLOCK_T)*CRYPTO_num_locks()); + if (!ssl_rwlocks) + return -1; + + for (i = 0 ; i < CRYPTO_num_locks() ; i++) + HA_RWLOCK_INIT(&ssl_rwlocks[i]); + + CRYPTO_set_id_callback(ssl_id_function); + CRYPTO_set_locking_callback(ssl_locking_function); + + return 0; +} + +#endif + +__decl_thread(HA_SPINLOCK_T ckch_lock); + + + +/* mimic what X509_STORE_load_locations do with store_ctx */ +static int ssl_set_cert_crl_file(X509_STORE *store_ctx, char *path) +{ + X509_STORE *store = NULL; + struct cafile_entry *ca_e = ssl_store_get_cafile_entry(path, 0); + if (ca_e) + store = ca_e->ca_store; + if (store_ctx && store) { + int i; + X509_OBJECT *obj; + STACK_OF(X509_OBJECT) *objs = X509_STORE_get0_objects(store); + for (i = 0; i < sk_X509_OBJECT_num(objs); i++) { + obj = sk_X509_OBJECT_value(objs, i); + switch (X509_OBJECT_get_type(obj)) { + case X509_LU_X509: + X509_STORE_add_cert(store_ctx, X509_OBJECT_get0_X509(obj)); + break; + case X509_LU_CRL: + X509_STORE_add_crl(store_ctx, X509_OBJECT_get0_X509_CRL(obj)); + break; + default: + break; + } + } 
+ return 1; + } + return 0; +} + +/* SSL_CTX_load_verify_locations substitute, internally call X509_STORE_load_locations */ +static int ssl_set_verify_locations_file(SSL_CTX *ctx, char *path) +{ + X509_STORE *store_ctx = SSL_CTX_get_cert_store(ctx); + return ssl_set_cert_crl_file(store_ctx, path); +} + +/* + Extract CA_list from CA_file already in tree. + Duplicate ca_name is tracking with ebtree. It's simplify openssl compatibility. + Return a shared ca_list: SSL_dup_CA_list must be used before set it on SSL_CTX. +*/ +static STACK_OF(X509_NAME)* ssl_get_client_ca_file(char *path) +{ + struct ebmb_node *eb; + struct cafile_entry *ca_e; + + eb = ebst_lookup(&cafile_tree, path); + if (!eb) + return NULL; + ca_e = ebmb_entry(eb, struct cafile_entry, node); + + if (ca_e->ca_list == NULL) { + int i; + unsigned long key; + struct eb_root ca_name_tree = EB_ROOT; + struct eb64_node *node, *back; + struct { + struct eb64_node node; + X509_NAME *xname; + } *ca_name; + STACK_OF(X509_OBJECT) *objs; + STACK_OF(X509_NAME) *skn; + X509 *x; + X509_NAME *xn; + + skn = sk_X509_NAME_new_null(); + /* take x509 from cafile_tree */ + objs = X509_STORE_get0_objects(ca_e->ca_store); + for (i = 0; i < sk_X509_OBJECT_num(objs); i++) { + x = X509_OBJECT_get0_X509(sk_X509_OBJECT_value(objs, i)); + if (!x) + continue; + xn = X509_get_subject_name(x); + if (!xn) + continue; + /* Check for duplicates. */ + key = X509_NAME_hash(xn); + for (node = eb64_lookup(&ca_name_tree, key), ca_name = NULL; + node && ca_name == NULL; + node = eb64_next(node)) { + ca_name = container_of(node, typeof(*ca_name), node); + if (X509_NAME_cmp(xn, ca_name->xname) != 0) + ca_name = NULL; + } + /* find a duplicate */ + if (ca_name) + continue; + ca_name = calloc(1, sizeof *ca_name); + xn = X509_NAME_dup(xn); + if (!ca_name || + !xn || + !sk_X509_NAME_push(skn, xn)) { + free(ca_name); + X509_NAME_free(xn); + sk_X509_NAME_pop_free(skn, X509_NAME_free); + sk_X509_NAME_free(skn); + skn = NULL; + break; + } + ca_name->node.key = key; + ca_name->xname = xn; + eb64_insert(&ca_name_tree, &ca_name->node); + } + ca_e->ca_list = skn; + /* remove temporary ca_name tree */ + node = eb64_first(&ca_name_tree); + while (node) { + ca_name = container_of(node, typeof(*ca_name), node); + back = eb64_next(node); + eb64_delete(node); + free(ca_name); + node = back; + } + } + return ca_e->ca_list; +} + +struct pool_head *pool_head_ssl_capture __read_mostly = NULL; +int ssl_capture_ptr_index = -1; +int ssl_app_data_index = -1; +#ifdef USE_QUIC +int ssl_qc_app_data_index = -1; +#endif /* USE_QUIC */ + +#ifdef HAVE_SSL_KEYLOG +int ssl_keylog_index = -1; +struct pool_head *pool_head_ssl_keylog __read_mostly = NULL; +struct pool_head *pool_head_ssl_keylog_str __read_mostly = NULL; +#endif + +int ssl_client_crt_ref_index = -1; + +/* Used to store the client's SNI in case of ClientHello callback error */ +int ssl_client_sni_index = -1; + +#if (defined SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB && TLS_TICKETS_NO > 0) +struct list tlskeys_reference = LIST_HEAD_INIT(tlskeys_reference); +#endif + +#if defined(USE_ENGINE) && !defined(OPENSSL_NO_ENGINE) +unsigned int openssl_engines_initialized; +struct list openssl_engines = LIST_HEAD_INIT(openssl_engines); +struct ssl_engine_list { + struct list list; + ENGINE *e; +}; +#endif + +#ifdef HAVE_SSL_PROVIDERS +struct list openssl_providers = LIST_HEAD_INIT(openssl_providers); +struct ssl_provider_list { + struct list list; + OSSL_PROVIDER *provider; +}; +#endif + +#ifndef OPENSSL_NO_DH +static int ssl_dh_ptr_index = -1; +static HASSL_DH 
*global_dh = NULL; +static HASSL_DH *local_dh_1024 = NULL; +static HASSL_DH *local_dh_2048 = NULL; +static HASSL_DH *local_dh_4096 = NULL; +#if (HA_OPENSSL_VERSION_NUMBER < 0x3000000fL) +static DH *ssl_get_tmp_dh_cbk(SSL *ssl, int export, int keylen); +#else +static void ssl_sock_set_tmp_dh_from_pkey(SSL_CTX *ctx, EVP_PKEY *pkey); +#endif +#endif /* OPENSSL_NO_DH */ + +#if (defined SSL_CTRL_SET_TLSEXT_HOSTNAME && !defined SSL_NO_GENERATE_CERTIFICATES) +/* X509V3 Extensions that will be added on generated certificates */ +#define X509V3_EXT_SIZE 5 +static char *x509v3_ext_names[X509V3_EXT_SIZE] = { + "basicConstraints", + "nsComment", + "subjectKeyIdentifier", + "authorityKeyIdentifier", + "keyUsage", +}; +static char *x509v3_ext_values[X509V3_EXT_SIZE] = { + "CA:FALSE", + "\"OpenSSL Generated Certificate\"", + "hash", + "keyid,issuer:always", + "nonRepudiation,digitalSignature,keyEncipherment" +}; +/* LRU cache to store generated certificate */ +static struct lru64_head *ssl_ctx_lru_tree = NULL; +static unsigned int ssl_ctx_lru_seed = 0; +static unsigned int ssl_ctx_serial; +__decl_rwlock(ssl_ctx_lru_rwlock); + +#endif // SSL_CTRL_SET_TLSEXT_HOSTNAME + +/* The order here matters for picking a default context, + * keep the most common keytype at the bottom of the list + */ +const char *SSL_SOCK_KEYTYPE_NAMES[] = { + "dsa", + "ecdsa", + "rsa" +}; + +static struct shared_context *ssl_shctx = NULL; /* ssl shared session cache */ +static struct eb_root *sh_ssl_sess_tree; /* ssl shared session tree */ + +/* Dedicated callback functions for heartbeat and clienthello. + */ +#ifdef TLS1_RT_HEARTBEAT +static void ssl_sock_parse_heartbeat(struct connection *conn, int write_p, int version, + int content_type, const void *buf, size_t len, + SSL *ssl); +#endif +static void ssl_sock_parse_clienthello(struct connection *conn, int write_p, int version, + int content_type, const void *buf, size_t len, + SSL *ssl); + +#ifdef HAVE_SSL_KEYLOG +static void ssl_init_keylog(struct connection *conn, int write_p, int version, + int content_type, const void *buf, size_t len, + SSL *ssl); +#endif + +/* List head of all registered SSL/TLS protocol message callbacks. */ +struct list ssl_sock_msg_callbacks = LIST_HEAD_INIT(ssl_sock_msg_callbacks); + +/* Registers the function <func> in order to be called on SSL/TLS protocol + * message processing. It will return 0 if the function <func> is not set + * or if it fails to allocate memory. + */ +int ssl_sock_register_msg_callback(ssl_sock_msg_callback_func func) +{ + struct ssl_sock_msg_callback *cbk; + + if (!func) + return 0; + + cbk = calloc(1, sizeof(*cbk)); + if (!cbk) { + ha_alert("out of memory in ssl_sock_register_msg_callback().\n"); + return 0; + } + + cbk->func = func; + + LIST_APPEND(&ssl_sock_msg_callbacks, &cbk->list); + + return 1; +} + +/* Used to register dedicated SSL/TLS protocol message callbacks. 
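+ * As a minimal sketch, an external subsystem could hook records the same
+ * way (my_msg_cb being a hypothetical function matching the
+ * ssl_sock_msg_callback_func signature used by the parsers above):
+ *
+ *     static void my_msg_cb(struct connection *conn, int write_p,
+ *                           int version, int content_type,
+ *                           const void *buf, size_t len, SSL *ssl)
+ *     {
+ *         ... inspect one SSL/TLS record ...
+ *     }
+ *
+ *     ssl_sock_register_msg_callback(my_msg_cb);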
+ */
+static int ssl_sock_register_msg_callbacks(void)
+{
+#ifdef TLS1_RT_HEARTBEAT
+	if (!ssl_sock_register_msg_callback(ssl_sock_parse_heartbeat))
+		return ERR_ABORT;
+#endif
+	if (global_ssl.capture_buffer_size > 0) {
+		if (!ssl_sock_register_msg_callback(ssl_sock_parse_clienthello))
+			return ERR_ABORT;
+	}
+#ifdef HAVE_SSL_KEYLOG
+	if (global_ssl.keylog > 0) {
+		if (!ssl_sock_register_msg_callback(ssl_init_keylog))
+			return ERR_ABORT;
+	}
+#endif
+#ifdef USE_QUIC_OPENSSL_COMPAT
+	if (!ssl_sock_register_msg_callback(quic_tls_compat_msg_callback))
+		return ERR_ABORT;
+#endif
+
+	return ERR_NONE;
+}
+
+/* Used to free all SSL/TLS protocol message callbacks that were
+ * registered by using ssl_sock_register_msg_callback().
+ */
+static void ssl_sock_unregister_msg_callbacks(void)
+{
+	struct ssl_sock_msg_callback *cbk, *cbkback;
+
+	list_for_each_entry_safe(cbk, cbkback, &ssl_sock_msg_callbacks, list) {
+		LIST_DELETE(&cbk->list);
+		free(cbk);
+	}
+}
+
+static struct ssl_sock_ctx *ssl_sock_get_ctx(struct connection *conn)
+{
+	if (!conn || conn->xprt != xprt_get(XPRT_SSL) || !conn->xprt_ctx)
+		return NULL;
+
+	return (struct ssl_sock_ctx *)conn->xprt_ctx;
+}
+
+SSL *ssl_sock_get_ssl_object(struct connection *conn)
+{
+	struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn);
+
+	return ctx ? ctx->ssl : NULL;
+}
+/*
+ * This function gives the details of the SSL errors. It is used only
+ * if debug mode and verbose mode are both activated. It dumps all the
+ * SSL errors until the stack is empty.
+ */
+static forceinline void ssl_sock_dump_errors(struct connection *conn,
+                                             struct quic_conn *qc)
+{
+	unsigned long ret;
+
+	if (unlikely(global.mode & MODE_DEBUG)) {
+		while (1) {
+			const char *func = NULL;
+			ERR_peek_error_func(&func);
+
+			ret = ERR_get_error();
+			if (ret == 0)
+				return;
+			if (conn) {
+				fprintf(stderr, "fd[%#x] OpenSSL error[0x%lx] %s: %s\n",
+				        conn_fd(conn), ret,
+				        func, ERR_reason_error_string(ret));
+			}
+#ifdef USE_QUIC
+			else {
+				/* TODO: we are not sure <conn> is always initialized for QUIC connections */
+				fprintf(stderr, "qc @%p OpenSSL error[0x%lx] %s: %s\n", qc, ret,
+				        func, ERR_reason_error_string(ret));
+			}
+#endif
+		}
+	}
+}
+
+
+#if defined(USE_ENGINE) && !defined(OPENSSL_NO_ENGINE)
+int ssl_init_single_engine(const char *engine_id, const char *def_algorithms)
+{
+	int err_code = ERR_ABORT;
+	ENGINE *engine;
+	struct ssl_engine_list *el;
+
+	/* grab the structural reference to the engine */
+	engine = ENGINE_by_id(engine_id);
+	if (engine == NULL) {
+		ha_alert("ssl-engine %s: failed to get structural reference\n", engine_id);
+		goto fail_get;
+	}
+
+	if (!ENGINE_init(engine)) {
+		/* the engine couldn't initialise, release it */
+		ha_alert("ssl-engine %s: failed to initialize\n", engine_id);
+		goto fail_init;
+	}
+
+	if (ENGINE_set_default_string(engine, def_algorithms) == 0) {
+		ha_alert("ssl-engine %s: failed on ENGINE_set_default_string\n", engine_id);
+		goto fail_set_method;
+	}
+
+	el = calloc(1, sizeof(*el));
+	if (!el)
+		goto fail_alloc;
+	el->e = engine;
+	LIST_INSERT(&openssl_engines, &el->list);
+	nb_engines++;
+	if (global_ssl.async)
+		global.ssl_used_async_engines = nb_engines;
+	return 0;
+
+fail_alloc:
+fail_set_method:
+	/* release the functional reference from ENGINE_init() */
+	ENGINE_finish(engine);
+
+fail_init:
+	/* release the structural reference from ENGINE_by_id() */
+	ENGINE_free(engine);
+
+fail_get:
+	return err_code;
+}
+#endif
+
+#ifdef HAVE_SSL_PROVIDERS
+int ssl_init_provider(const char *provider_name)
+{
+	int err_code = ERR_ABORT;
+	struct ssl_provider_list *prov = NULL;
+
+	prov = calloc(1, sizeof(*prov));
+	if (!prov) {
+		ha_alert("ssl-provider %s: memory allocation failure\n", provider_name);
+		goto error;
+	}
+
+	if ((prov->provider = OSSL_PROVIDER_load(NULL, provider_name)) == NULL) {
+		ha_alert("ssl-provider %s: unknown provider\n", provider_name);
+		goto error;
+	}
+
+	LIST_INSERT(&openssl_providers, &prov->list);
+
+	return 0;
+
+error:
+	ha_free(&prov);
+	return err_code;
+}
+#endif /* HAVE_SSL_PROVIDERS */
+
+#ifdef SSL_MODE_ASYNC
+/*
+ * openssl async fd handler
+ */
+void ssl_async_fd_handler(int fd)
+{
+	struct ssl_sock_ctx *ctx = fdtab[fd].owner;
+
+	/* fd is an async engine fd, we must stop polling
+	 * this fd until it is requested again
+	 */
+	fd_stop_recv(fd);
+	fd_cant_recv(fd);
+
+	/* crypto engine is available, let's notify the associated
+	 * connection that it can pursue its processing.
+	 */
+	tasklet_wakeup(ctx->wait_event.tasklet);
+}
+
+/*
+ * openssl async delayed SSL_free handler
+ */
+void ssl_async_fd_free(int fd)
+{
+	SSL *ssl = fdtab[fd].owner;
+	OSSL_ASYNC_FD all_fd[32];
+	size_t num_all_fds = 0;
+	int i;
+
+	/* We assume that the async jobs for a same SSL * are serialized,
+	 * so if we are woken up it is because the running job has just
+	 * finished and we can safely remove all async fds.
+	 */
+	SSL_get_all_async_fds(ssl, NULL, &num_all_fds);
+	if (num_all_fds > 32) {
+		send_log(NULL, LOG_EMERG, "haproxy: openssl returns too many async fds. It seems a bug. Process may crash\n");
+		return;
+	}
+
+	SSL_get_all_async_fds(ssl, all_fd, &num_all_fds);
+	for (i=0 ; i < num_all_fds ; i++) {
+		/* We want to remove the fd from the fdtab
+		 * but we flag it to disown because the
+		 * close is performed by the engine itself
+		 */
+		fdtab[all_fd[i]].state |= FD_DISOWN;
+		fd_delete(all_fd[i]);
+	}
+
+	/* Now we can safely call SSL_free, no more pending jobs in engines */
+	SSL_free(ssl);
+	_HA_ATOMIC_DEC(&global.sslconns);
+	_HA_ATOMIC_DEC(&jobs);
+}
+/*
+ * function used to manage a returned SSL_ERROR_WANT_ASYNC
+ * and enable/disable polling for async fds
+ */
+static inline void ssl_async_process_fds(struct ssl_sock_ctx *ctx)
+{
+	OSSL_ASYNC_FD add_fd[32];
+	OSSL_ASYNC_FD del_fd[32];
+	SSL *ssl = ctx->ssl;
+	size_t num_add_fds = 0;
+	size_t num_del_fds = 0;
+	int i;
+
+	SSL_get_changed_async_fds(ssl, NULL, &num_add_fds, NULL,
+	                          &num_del_fds);
+	if (num_add_fds > 32 || num_del_fds > 32) {
+		send_log(NULL, LOG_EMERG, "haproxy: openssl returns too many async fds. It seems a bug. Process may crash\n");
+		return;
+	}
+
+	SSL_get_changed_async_fds(ssl, add_fd, &num_add_fds, del_fd, &num_del_fds);
+
+	/* We remove unused fds from the fdtab */
+	for (i=0 ; i < num_del_fds ; i++) {
+		/* We want to remove the fd from the fdtab
+		 * but we flag it to disown because the
+		 * close is performed by the engine itself
+		 */
+		fdtab[del_fd[i]].state |= FD_DISOWN;
+		fd_delete(del_fd[i]);
+	}
+
+	/* We add new fds to the fdtab */
+	for (i=0 ; i < num_add_fds ; i++) {
+		fd_insert(add_fd[i], ctx, ssl_async_fd_handler, tgid, ti->ltid_bit);
+	}
+
+	num_add_fds = 0;
+	SSL_get_all_async_fds(ssl, NULL, &num_add_fds);
+	if (num_add_fds > 32) {
+		send_log(NULL, LOG_EMERG, "haproxy: openssl returns too many async fds. It seems a bug. 
Process may crash\n"); + return; + } + + /* We activate the polling for all known async fds */ + SSL_get_all_async_fds(ssl, add_fd, &num_add_fds); + for (i=0 ; i < num_add_fds ; i++) { + fd_want_recv(add_fd[i]); + /* To ensure that the fd cache won't be used + * We'll prefer to catch a real RD event + * because handling an EAGAIN on this fd will + * result in a context switch and also + * some engines uses a fd in blocking mode. + */ + fd_cant_recv(add_fd[i]); + } + +} +#endif + + +/* + * Initialize an HMAC context <hctx> using the <key> and <md> parameters. + * Returns -1 in case of error, 1 otherwise. + */ +static int ssl_hmac_init(MAC_CTX *hctx, unsigned char *key, int key_len, const EVP_MD *md) +{ +#ifdef HAVE_OSSL_PARAM + OSSL_PARAM params[3]; + + params[0] = OSSL_PARAM_construct_octet_string(OSSL_MAC_PARAM_KEY, key, key_len); + params[1] = OSSL_PARAM_construct_utf8_string(OSSL_MAC_PARAM_DIGEST, (char*)EVP_MD_name(md), 0); + params[2] = OSSL_PARAM_construct_end(); + if (EVP_MAC_CTX_set_params(hctx, params) == 0) + return -1; /* error in mac initialisation */ + +#else + HMAC_Init_ex(hctx, key, key_len, md, NULL); +#endif + return 1; +} + +#if (defined SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB && TLS_TICKETS_NO > 0) + +static int ssl_tlsext_ticket_key_cb(SSL *s, unsigned char key_name[16], unsigned char *iv, EVP_CIPHER_CTX *ectx, MAC_CTX *hctx, int enc) +{ + struct tls_keys_ref *ref = NULL; + union tls_sess_key *keys; + int head; + int i; + int ret = -1; /* error by default */ + struct connection *conn = SSL_get_ex_data(s, ssl_app_data_index); +#ifdef USE_QUIC + struct quic_conn *qc = SSL_get_ex_data(s, ssl_qc_app_data_index); +#endif + + if (conn) + ref = __objt_listener(conn->target)->bind_conf->keys_ref; +#ifdef USE_QUIC + else if (qc) + ref = qc->li->bind_conf->keys_ref; +#endif + + if (!ref) { + /* must never happen */ + ABORT_NOW(); + } + + HA_RWLOCK_RDLOCK(TLSKEYS_REF_LOCK, &ref->lock); + + keys = ref->tlskeys; + head = ref->tls_ticket_enc_index; + + if (enc) { + memcpy(key_name, keys[head].name, 16); + + if(!RAND_pseudo_bytes(iv, EVP_MAX_IV_LENGTH)) + goto end; + + if (ref->key_size_bits == 128) { + + if(!EVP_EncryptInit_ex(ectx, EVP_aes_128_cbc(), NULL, keys[head].key_128.aes_key, iv)) + goto end; + + if (ssl_hmac_init(hctx, keys[head].key_128.hmac_key, 16, TLS_TICKET_HASH_FUNCT()) < 0) + goto end; + ret = 1; + } + else if (ref->key_size_bits == 256 ) { + + if(!EVP_EncryptInit_ex(ectx, EVP_aes_256_cbc(), NULL, keys[head].key_256.aes_key, iv)) + goto end; + + if (ssl_hmac_init(hctx, keys[head].key_256.hmac_key, 32, TLS_TICKET_HASH_FUNCT()) < 0) + goto end; + ret = 1; + } + } else { + for (i = 0; i < TLS_TICKETS_NO; i++) { + if (!memcmp(key_name, keys[(head + i) % TLS_TICKETS_NO].name, 16)) + goto found; + } + ret = 0; + goto end; + + found: + if (ref->key_size_bits == 128) { + if (ssl_hmac_init(hctx, keys[(head + i) % TLS_TICKETS_NO].key_128.hmac_key, 16, TLS_TICKET_HASH_FUNCT()) < 0) + goto end; + if(!EVP_DecryptInit_ex(ectx, EVP_aes_128_cbc(), NULL, keys[(head + i) % TLS_TICKETS_NO].key_128.aes_key, iv)) + goto end; + /* 2 for key renewal, 1 if current key is still valid */ + ret = i ? 2 : 1; + } + else if (ref->key_size_bits == 256) { + if (ssl_hmac_init(hctx, keys[(head + i) % TLS_TICKETS_NO].key_256.hmac_key, 32, TLS_TICKET_HASH_FUNCT()) < 0) + goto end; + if(!EVP_DecryptInit_ex(ectx, EVP_aes_256_cbc(), NULL, keys[(head + i) % TLS_TICKETS_NO].key_256.aes_key, iv)) + goto end; + /* 2 for key renewal, 1 if current key is still valid */ + ret = i ? 
2 : 1;
+		}
+	}
+
+  end:
+	HA_RWLOCK_RDUNLOCK(TLSKEYS_REF_LOCK, &ref->lock);
+	return ret;
+}
+
+struct tls_keys_ref *tlskeys_ref_lookup(const char *filename)
+{
+	struct tls_keys_ref *ref;
+
+	list_for_each_entry(ref, &tlskeys_reference, list)
+		if (ref->filename && strcmp(filename, ref->filename) == 0)
+			return ref;
+	return NULL;
+}
+
+struct tls_keys_ref *tlskeys_ref_lookupid(int unique_id)
+{
+	struct tls_keys_ref *ref;
+
+	list_for_each_entry(ref, &tlskeys_reference, list)
+		if (ref->unique_id == unique_id)
+			return ref;
+	return NULL;
+}
+
+/* Update the key in <ref>: if the key size doesn't match the existing
+ * ones, this function returns -1, else it returns 0 on success.
+ */
+int ssl_sock_update_tlskey_ref(struct tls_keys_ref *ref,
+                               struct buffer *tlskey)
+{
+	if (ref->key_size_bits == 128) {
+		if (tlskey->data != sizeof(struct tls_sess_key_128))
+			return -1;
+	}
+	else if (ref->key_size_bits == 256) {
+		if (tlskey->data != sizeof(struct tls_sess_key_256))
+			return -1;
+	}
+	else
+		return -1;
+
+	HA_RWLOCK_WRLOCK(TLSKEYS_REF_LOCK, &ref->lock);
+	memcpy((char *) (ref->tlskeys + ((ref->tls_ticket_enc_index + 2) % TLS_TICKETS_NO)),
+	       tlskey->area, tlskey->data);
+	ref->tls_ticket_enc_index = (ref->tls_ticket_enc_index + 1) % TLS_TICKETS_NO;
+	HA_RWLOCK_WRUNLOCK(TLSKEYS_REF_LOCK, &ref->lock);
+
+	return 0;
+}
+
+int ssl_sock_update_tlskey(char *filename, struct buffer *tlskey, char **err)
+{
+	struct tls_keys_ref *ref = tlskeys_ref_lookup(filename);
+
+	if (!ref) {
+		memprintf(err, "Unable to locate the referenced filename: %s", filename);
+		return 1;
+	}
+	if (ssl_sock_update_tlskey_ref(ref, tlskey) < 0) {
+		memprintf(err, "Invalid key size");
+		return 1;
+	}
+
+	return 0;
+}
+
+/* This function finalizes the configuration parsing. It sets all the
+ * automatic ids. It's called just after the basic checks. It returns
+ * 0 on success, otherwise ERR_*.
+ */
+static int tlskeys_finalize_config(void)
+{
+	int i = 0;
+	struct tls_keys_ref *ref, *ref2, *ref3;
+	struct list tkr = LIST_HEAD_INIT(tkr);
+
+	list_for_each_entry(ref, &tlskeys_reference, list) {
+		if (ref->unique_id == -1) {
+			/* Look for the first free id. */
+			while (1) {
+				list_for_each_entry(ref2, &tlskeys_reference, list) {
+					if (ref2->unique_id == i) {
+						i++;
+						break;
+					}
+				}
+				if (&ref2->list == &tlskeys_reference)
+					break;
+			}
+
+			/* Use the unique id and increment it for the next entry. */
+			ref->unique_id = i;
+			i++;
+		}
+	}
+
+	/* Sort the reference list by id. */
+	list_for_each_entry_safe(ref, ref2, &tlskeys_reference, list) {
+		LIST_DELETE(&ref->list);
+		list_for_each_entry(ref3, &tkr, list) {
+			if (ref->unique_id < ref3->unique_id) {
+				LIST_APPEND(&ref3->list, &ref->list);
+				break;
+			}
+		}
+		if (&ref3->list == &tkr)
+			LIST_APPEND(&tkr, &ref->list);
+	}
+
+	/* swap root */
+	LIST_SPLICE(&tlskeys_reference, &tkr);
+	return ERR_NONE;
+}
+#endif /* SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB */
+
+
+#if ((defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) && !defined OPENSSL_IS_BORINGSSL)
+/*
+ * This function enables the handling of the OCSP status extension on 'ctx'
+ * if an ocsp_response buffer was found in the cert_key_and_chain. To enable
+ * the OCSP status extension, the issuer's certificate is mandatory. It
+ * should be present in ckch->ocsp_issuer.
+ *
+ * In addition, the ckch->ocsp_response buffer is expected to contain a DER
+ * encoded OCSP response. If the file is empty or its content is not a valid
+ * OCSP response, the OCSP status extension is enabled but the OCSP response
+ * is ignored (a warning is displayed).
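+ *
+ * For instance (hypothetical crt-list line), a certificate declared as
+ *     /etc/ssl/site.pem [ocsp-update on]
+ * reaches this function with ocsp_update_mode set to
+ * SSL_SOCK_OCSP_UPDATE_ON, in which case an "OCSP URI" field must be
+ * present in the certificate for the auto update to work.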
+ * + * Returns 1 if no ".ocsp" file found, 0 if OCSP status extension is + * successfully enabled, or -1 in other error case. + */ +static int ssl_sock_load_ocsp(const char *path, SSL_CTX *ctx, struct ckch_data *data, STACK_OF(X509) *chain) +{ + X509 *x, *issuer; + int i, ret = -1; + struct certificate_ocsp *ocsp = NULL, *iocsp; + char *warn = NULL; + unsigned char *p; +#ifndef USE_OPENSSL_WOLFSSL +#if (HA_OPENSSL_VERSION_NUMBER >= 0x10101000L) + int (*callback) (SSL *, void *); +#else + void (*callback) (void); +#endif +#else + tlsextStatusCb callback; +#endif + struct buffer *ocsp_uri = get_trash_chunk(); + char *err = NULL; + size_t path_len; + int inc_refcount_store = 0; + + x = data->cert; + if (!x) + goto out; + + ssl_ocsp_get_uri_from_cert(x, ocsp_uri, &err); + /* We should have an "OCSP URI" field in order for auto update to work. */ + if (data->ocsp_update_mode == SSL_SOCK_OCSP_UPDATE_ON && b_data(ocsp_uri) == 0) + goto out; + + /* In case of ocsp update mode set to 'on', this function might be + * called with no known ocsp response. If no ocsp uri can be found in + * the certificate, nothing needs to be done here. */ + if (!data->ocsp_response && !data->ocsp_cid) { + if (data->ocsp_update_mode != SSL_SOCK_OCSP_UPDATE_ON || b_data(ocsp_uri) == 0) { + ret = 0; + goto out; + } + } + + issuer = data->ocsp_issuer; + /* take issuer from chain over ocsp_issuer, is what is done historicaly */ + if (chain) { + /* check if one of the certificate of the chain is the issuer */ + for (i = 0; i < sk_X509_num(chain); i++) { + X509 *ti = sk_X509_value(chain, i); + if (X509_check_issued(ti, x) == X509_V_OK) { + issuer = ti; + break; + } + } + } + if (!issuer) + goto out; + + if (!data->ocsp_cid) { + data->ocsp_cid = OCSP_cert_to_id(0, x, issuer); + inc_refcount_store = 1; + } + if (!data->ocsp_cid) + goto out; + + i = i2d_OCSP_CERTID(data->ocsp_cid, NULL); + if (!i || (i > OCSP_MAX_CERTID_ASN1_LENGTH)) + goto out; + + path_len = strlen(path); + ocsp = calloc(1, sizeof(*ocsp) + path_len + 1); + if (!ocsp) + goto out; + + p = ocsp->key_data; + ocsp->key_length = i2d_OCSP_CERTID(data->ocsp_cid, &p); + + HA_SPIN_LOCK(OCSP_LOCK, &ocsp_tree_lock); + iocsp = (struct certificate_ocsp *)ebmb_insert(&cert_ocsp_tree, &ocsp->key, OCSP_MAX_CERTID_ASN1_LENGTH); + if (iocsp == ocsp) + ocsp = NULL; + +#ifndef SSL_CTX_get_tlsext_status_cb +# define SSL_CTX_get_tlsext_status_cb(ctx, cb) \ + *cb = (void (*) (void))ctx->tlsext_status_cb; +#endif + SSL_CTX_get_tlsext_status_cb(ctx, &callback); + + if (inc_refcount_store) + iocsp->refcount_store++; + + if (!callback) { + struct ocsp_cbk_arg *cb_arg; + EVP_PKEY *pkey; + + cb_arg = calloc(1, sizeof(*cb_arg)); + if (!cb_arg) + goto out; + + cb_arg->is_single = 1; + cb_arg->s_ocsp = iocsp; + iocsp->refcount_instance++; + + pkey = X509_get_pubkey(x); + cb_arg->single_kt = EVP_PKEY_base_id(pkey); + EVP_PKEY_free(pkey); + + SSL_CTX_set_tlsext_status_cb(ctx, ssl_sock_ocsp_stapling_cbk); + SSL_CTX_set_ex_data(ctx, ocsp_ex_index, cb_arg); /* we use the ex_data instead of the cb_arg function here, so we can use the cleanup callback to free */ + + } else { + /* + * If the ctx has a status CB, then we have previously set an OCSP staple for this ctx + * Update that cb_arg with the new cert's staple + */ + struct ocsp_cbk_arg *cb_arg; + struct certificate_ocsp *tmp_ocsp; + int index; + int key_type; + EVP_PKEY *pkey; + + cb_arg = SSL_CTX_get_ex_data(ctx, ocsp_ex_index); + + /* + * The following few lines will convert cb_arg from a single ocsp to multi ocsp + * the order of 
operations below matter, take care when changing it + */ + tmp_ocsp = cb_arg->s_ocsp; + index = ssl_sock_get_ocsp_arg_kt_index(cb_arg->single_kt); + cb_arg->s_ocsp = NULL; + cb_arg->m_ocsp[index] = tmp_ocsp; + cb_arg->is_single = 0; + cb_arg->single_kt = 0; + + pkey = X509_get_pubkey(x); + key_type = EVP_PKEY_base_id(pkey); + EVP_PKEY_free(pkey); + + index = ssl_sock_get_ocsp_arg_kt_index(key_type); + if (index >= 0 && !cb_arg->m_ocsp[index]) { + cb_arg->m_ocsp[index] = iocsp; + iocsp->refcount_instance++; + } + } + HA_SPIN_UNLOCK(OCSP_LOCK, &ocsp_tree_lock); + + ret = 0; + + warn = NULL; + if (data->ocsp_response && ssl_sock_load_ocsp_response(data->ocsp_response, iocsp, data->ocsp_cid, &warn)) { + memprintf(&warn, "Loading: %s. Content will be ignored", warn ? warn : "failure"); + ha_warning("%s.\n", warn); + } + + + /* Do not insert the same certificate_ocsp structure in the + * update tree more than once. */ + if (!ocsp) { + /* Issuer certificate is not included in the certificate + * chain, it will have to be treated separately during + * ocsp response validation. */ + if (issuer == data->ocsp_issuer) { + iocsp->issuer = issuer; + X509_up_ref(issuer); + } + if (data->chain) + iocsp->chain = X509_chain_up_ref(data->chain); + + iocsp->uri = calloc(1, sizeof(*iocsp->uri)); + if (!chunk_dup(iocsp->uri, ocsp_uri)) { + ha_free(&iocsp->uri); + goto out; + } + + /* Note: if we arrive here, ocsp==NULL because iocsp==ocsp + * after the ebmb_insert(), which indicates that we've + * just inserted this new node and that it's the one for + * which we previously allocated enough room for path_len+1 + * chars. + */ + memcpy(iocsp->path, path, path_len + 1); + + if (data->ocsp_update_mode == SSL_SOCK_OCSP_UPDATE_ON) { + ssl_ocsp_update_insert(iocsp); + /* If we are during init the update task is not + * scheduled yet so a wakeup won't do anything. + * Otherwise, if the OCSP was added through the CLI, we + * wake the task up to manage the case of a new entry + * that needs to be updated before the previous first + * entry. + */ + if (ocsp_update_task) + task_wakeup(ocsp_update_task, TASK_WOKEN_MSG); + } + } else if (iocsp->uri && data->ocsp_update_mode == SSL_SOCK_OCSP_UPDATE_ON) { + /* This unlikely case can happen if a series of "del ssl + * crt-list" / "add ssl crt-list" commands are made on the CLI. + * In such a case, the OCSP response tree entry will be created + * prior to the activation of the ocsp auto update and in such a + * case we must "force" insertion in the auto update tree. + */ + if (iocsp->next_update.node.leaf_p == NULL) { + ssl_ocsp_update_insert(iocsp); + /* If we are during init the update task is not + * scheduled yet so a wakeup won't do anything. + * Otherwise, if the OCSP was added through the CLI, we + * wake the task up to manage the case of a new entry + * that needs to be updated before the previous first + * entry. 
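+			 * As an illustrative timeline: if the tree's first
+			 * entry is due at T+60s and a new one inserted from
+			 * the CLI is due at T+10s, the task must be woken up
+			 * so that it reschedules itself on the earlier
+			 * deadline.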
+			 */
+			if (ocsp_update_task)
+				task_wakeup(ocsp_update_task, TASK_WOKEN_MSG);
+		}
+	}
+
+out:
+	if (ret && data->ocsp_cid) {
+		OCSP_CERTID_free(data->ocsp_cid);
+		data->ocsp_cid = NULL;
+	}
+
+	if (!ret && data->ocsp_response) {
+		ha_free(&data->ocsp_response->area);
+		ha_free(&data->ocsp_response);
+	}
+
+	if (ocsp)
+		ssl_sock_free_ocsp(ocsp);
+
+	if (warn)
+		free(warn);
+
+	free(err);
+
+	return ret;
+}
+
+#endif
+
+#ifdef OPENSSL_IS_BORINGSSL
+static int ssl_sock_load_ocsp(const char *path, SSL_CTX *ctx, struct ckch_data *data, STACK_OF(X509) *chain)
+{
+	return SSL_CTX_set_ocsp_response(ctx, (const uint8_t *)data->ocsp_response->area, data->ocsp_response->data);
+}
+#endif
+
+
+#ifdef HAVE_SSL_CTX_ADD_SERVER_CUSTOM_EXT
+
+#define CT_EXTENSION_TYPE 18
+
+int sctl_ex_index = -1;
+
+int ssl_sock_sctl_add_cbk(SSL *ssl, unsigned ext_type, const unsigned char **out, size_t *outlen, int *al, void *add_arg)
+{
+	struct buffer *sctl = add_arg;
+
+	*out = (unsigned char *) sctl->area;
+	*outlen = sctl->data;
+
+	return 1;
+}
+
+int ssl_sock_sctl_parse_cbk(SSL *s, unsigned int ext_type, const unsigned char *in, size_t inlen, int *al, void *parse_arg)
+{
+	return 1;
+}
+
+static int ssl_sock_load_sctl(SSL_CTX *ctx, struct buffer *sctl)
+{
+	int ret = -1;
+
+	if (!SSL_CTX_add_server_custom_ext(ctx, CT_EXTENSION_TYPE, ssl_sock_sctl_add_cbk, NULL, sctl, ssl_sock_sctl_parse_cbk, NULL))
+		goto out;
+
+	SSL_CTX_set_ex_data(ctx, sctl_ex_index, sctl);
+
+	ret = 0;
+
+out:
+	return ret;
+}
+
+#endif
+
+void ssl_sock_infocbk(const SSL *ssl, int where, int ret)
+{
+	struct connection *conn = SSL_get_ex_data(ssl, ssl_app_data_index);
+#ifdef USE_QUIC
+	struct quic_conn *qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index);
+#endif /* USE_QUIC */
+	struct ssl_sock_ctx *ctx = NULL;
+
+	BIO *write_bio;
+	(void)ret; /* shut gcc stupid warning */
+
+	if (conn)
+		ctx = conn_get_ssl_sock_ctx(conn);
+#ifdef USE_QUIC
+	else if (qc)
+		ctx = qc->xprt_ctx;
+#endif /* USE_QUIC */
+
+	if (!ctx) {
+		/* must never happen */
+		ABORT_NOW();
+		return;
+	}
+
+#ifndef SSL_OP_NO_RENEGOTIATION
+	/* Please note that BoringSSL defines this macro to zero so don't
+	 * change this to #if and do not assign a default value to this macro!
+	 */
+	if (where & SSL_CB_HANDSHAKE_START) {
+		/* Disable renegotiation (CVE-2009-3555) */
+		if (conn && (conn->flags & (CO_FL_WAIT_L6_CONN | CO_FL_EARLY_SSL_HS | CO_FL_EARLY_DATA)) == 0) {
+			conn->flags |= CO_FL_ERROR;
+			conn->err_code = CO_ER_SSL_RENEG;
+		}
+	}
+#endif
+
+	if ((where & SSL_CB_ACCEPT_LOOP) == SSL_CB_ACCEPT_LOOP) {
+		if (!(ctx->xprt_st & SSL_SOCK_ST_FL_16K_WBFSIZE)) {
+			/* Long certificate chains optimization: if the write
+			 * and read BIOs are different, we consider that
+			 * buffering was activated, so we raise the output
+			 * buffer size from 4k to 16k.
+			 */
+			write_bio = SSL_get_wbio(ssl);
+			if (write_bio != SSL_get_rbio(ssl)) {
+				BIO_set_write_buffer_size(write_bio, 16384);
+				ctx->xprt_st |= SSL_SOCK_ST_FL_16K_WBFSIZE;
+			}
+		}
+	}
+}
+
+/* This callback is called for each certificate of the chain during
+ * verification. <ok> is set to 1 if the pre-verification detected no
+ * error on the current certificate. Returns 0 to break the handshake,
+ * 1 otherwise.
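+ *
+ * For example (hypothetical bind line), with
+ *     bind :443 ssl crt site.pem verify required ca-ignore-err all
+ * a CA-depth verification error is recorded in ctx->xprt_st below but
+ * the handshake is still allowed to proceed.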
*/ +int ssl_sock_bind_verifycbk(int ok, X509_STORE_CTX *x_store) +{ + SSL *ssl; + struct connection *conn; + struct ssl_sock_ctx *ctx = NULL; + int err, depth; + X509 *client_crt; + STACK_OF(X509) *certs; + struct bind_conf *bind_conf = NULL; + struct quic_conn *qc = NULL; + + ssl = X509_STORE_CTX_get_ex_data(x_store, SSL_get_ex_data_X509_STORE_CTX_idx()); + conn = SSL_get_ex_data(ssl, ssl_app_data_index); + client_crt = SSL_get_ex_data(ssl, ssl_client_crt_ref_index); + + if (conn) { + bind_conf = __objt_listener(conn->target)->bind_conf; + ctx = __conn_get_ssl_sock_ctx(conn); + } +#ifdef USE_QUIC + else { + qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index); + BUG_ON(!qc); /* Must never happen */ + bind_conf = qc->li->bind_conf; + ctx = qc->xprt_ctx; + } +#endif + + BUG_ON(!ctx || !bind_conf); + ALREADY_CHECKED(ctx); + ALREADY_CHECKED(bind_conf); + + ctx->xprt_st |= SSL_SOCK_ST_FL_VERIFY_DONE; + + depth = X509_STORE_CTX_get_error_depth(x_store); + err = X509_STORE_CTX_get_error(x_store); + + if (ok) /* no errors */ + return ok; + + /* Keep a reference to the client's certificate in order to be able to + * dump some fetches values in a log even when the verification process + * fails. */ + if (depth == 0) { + X509_free(client_crt); + client_crt = X509_STORE_CTX_get0_cert(x_store); + if (client_crt) { + X509_up_ref(client_crt); + SSL_set_ex_data(ssl, ssl_client_crt_ref_index, client_crt); + } + } + else { + /* An error occurred on a CA certificate of the certificate + * chain, we might never call this verify callback on the client + * certificate's depth (which is 0) so we try to store the + * reference right now. */ + certs = X509_STORE_CTX_get1_chain(x_store); + if (certs) { + client_crt = sk_X509_value(certs, 0); + if (client_crt) { + X509_up_ref(client_crt); + SSL_set_ex_data(ssl, ssl_client_crt_ref_index, client_crt); + } + sk_X509_pop_free(certs, X509_free); + } + } + + /* check if CA error needs to be ignored */ + if (depth > 0) { + if (!SSL_SOCK_ST_TO_CA_ERROR(ctx->xprt_st)) { + ctx->xprt_st |= SSL_SOCK_CA_ERROR_TO_ST(err); + ctx->xprt_st |= SSL_SOCK_CAEDEPTH_TO_ST(depth); + } + + if (err <= SSL_MAX_VFY_ERROR_CODE && + cert_ignerr_bitfield_get(bind_conf->ca_ignerr_bitfield, err)) + goto err_ignored; + + /* TODO: for QUIC connection, this error code is lost */ + if (conn) + conn->err_code = CO_ER_SSL_CA_FAIL; + return 0; + } + + if (!SSL_SOCK_ST_TO_CRTERROR(ctx->xprt_st)) + ctx->xprt_st |= SSL_SOCK_CRTERROR_TO_ST(err); + + /* check if certificate error needs to be ignored */ + if (err <= SSL_MAX_VFY_ERROR_CODE && + cert_ignerr_bitfield_get(bind_conf->crt_ignerr_bitfield, err)) + goto err_ignored; + + /* TODO: for QUIC connection, this error code is lost */ + if (conn) + conn->err_code = CO_ER_SSL_CRT_FAIL; + return 0; + + err_ignored: + ssl_sock_dump_errors(conn, qc); + ERR_clear_error(); + return 1; +} + +#ifdef TLS1_RT_HEARTBEAT +static void ssl_sock_parse_heartbeat(struct connection *conn, int write_p, int version, + int content_type, const void *buf, size_t len, + SSL *ssl) +{ + /* test heartbeat received (write_p is set to 0 + for a received record) */ + if ((content_type == TLS1_RT_HEARTBEAT) && (write_p == 0)) { + struct ssl_sock_ctx *ctx = __conn_get_ssl_sock_ctx(conn); + const unsigned char *p = buf; + unsigned int payload; + + ctx->xprt_st |= SSL_SOCK_RECV_HEARTBEAT; + + /* Check if this is a CVE-2014-0160 exploitation attempt. 
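+		 * As a worked example with illustrative numbers: for a
+		 * received record of len == 64, the checks below require
+		 * 3 + payload + 16 <= 64, i.e. a payload of at most 45
+		 * bytes; a request advertising, say, a 16384-byte payload
+		 * in such a record is treated as an attack.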
 */
+		if (*p != TLS1_HB_REQUEST)
+			return;
+
+		if (len < 1 + 2 + 16) /* 1 type + 2 size + 0 payload + 16 padding */
+			goto kill_it;
+
+		payload = (p[1] * 256) + p[2];
+		if (3 + payload + 16 <= len)
+			return; /* OK no problem */
+	kill_it:
+		/* We have a clear heartbleed attack (CVE-2014-0160), the
+		 * advertised payload is larger than the advertised packet
+		 * length, so we have garbage in the buffer between the
+		 * payload and the end of the buffer (p+len). We can't know
+		 * if the SSL stack is patched, and we don't know if we can
+		 * safely wipe out the area between p+3+len and payload.
+		 * So instead, we prevent the response from being sent by
+		 * setting the max_send_fragment to 0 and we report an SSL
+		 * error, which will kill this connection. It will be reported
+		 * above as SSL_ERROR_SSL while another handshake failure with
+		 * a heartbeat message will be reported as SSL_ERROR_SYSCALL.
+		 */
+		ssl->max_send_fragment = 0;
+		SSLerr(SSL_F_TLS1_HEARTBEAT, SSL_R_SSL_HANDSHAKE_FAILURE);
+	}
+}
+#endif
+
+static void ssl_sock_parse_clienthello(struct connection *conn, int write_p, int version,
+                                       int content_type, const void *buf, size_t len,
+                                       SSL *ssl)
+{
+	struct ssl_capture *capture;
+	uchar *msg;
+	uchar *end;
+	uchar *extensions_end;
+	uchar *ec_start = NULL;
+	uchar *ec_formats_start = NULL;
+	uchar *list_end;
+	ushort protocol_version;
+	ushort extension_id;
+	ushort ec_len = 0;
+	uchar ec_formats_len = 0;
+	int offset = 0;
+	int rec_len;
+
+	/* This function is called for "from client" and "to server"
+	 * connections. The combination of write_p == 0 and content_type == 22
+	 * is only available during "from client" connections.
+	 */
+
+	/* "write_p" is set to 0 if the bytes were received, otherwise
+	 * it is set to 1.
+	 */
+	if (write_p != 0)
+		return;
+
+	/* content_type contains the type of message received or sent
+	 * according to the SSL/TLS protocol spec. This type is encoded
+	 * on one byte. The value 256 (two bytes) is used to designate
+	 * the SSL/TLS record layer itself. According to RFC 6101, the
+	 * expected messages (other than 256) are:
+	 *  - change_cipher_spec(20)
+	 *  - alert(21)
+	 *  - handshake(22)
+	 *  - application_data(23)
+	 *  - (255)
+	 * We are interested in the handshake and especially the client
+	 * hello.
+	 */
+	if (content_type != 22)
+		return;
+
+	/* The message length is at least 4 bytes, containing the
+	 * message type and the message length.
+	 */
+	if (len < 4)
+		return;
+
+	/* The first byte of the handshake message is the message type.
+	 * The known types are:
+	 *  - hello_request(0)
+	 *  - client_hello(1)
+	 *  - server_hello(2)
+	 *  - certificate(11)
+	 *  - server_key_exchange (12)
+	 *  - certificate_request(13)
+	 *  - server_hello_done(14)
+	 * We are interested in the client hello.
+	 */
+	msg = (unsigned char *)buf;
+	if (msg[0] != 1)
+		return;
+
+	/* Next three bytes are the length of the message. The total length
+	 * must be this decoded length + 4. If the length given as argument
+	 * is not the same, we abort the protocol dissector.
+	 */
+	rec_len = (msg[1] << 16) + (msg[2] << 8) + msg[3];
+	if (len < rec_len + 4)
+		return;
+	msg += 4;
+	end = msg + rec_len;
+	if (end < msg)
+		return;
+
+	/* Expect 2 bytes for the protocol version
+	 * (1 byte for major and 1 byte for minor)
+	 */
+	if (msg + 2 > end)
+		return;
+	protocol_version = (msg[0] << 8) + msg[1];
+	msg += 2;
+
+	/* Expect the random, composed of 4 bytes of unix time and
+	 * 28 bytes of random payload, so we jump 4 + 28.
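+	 * As an illustrative recap of the RFC 5246 layout, the bytes
+	 * consumed so far from the handshake message are:
+	 *   byte 0      handshake type (0x01 = client_hello)
+	 *   bytes 1..3  24-bit handshake message length
+	 *   bytes 4..5  client protocol version
+	 * and the 32-byte random comes next, before the session id.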
+ */ + msg += 4 + 28; + if (msg > end) + return; + + /* Next, is session id: + * if present, we have to jump by length + 1 for the size information + * if not present, we have to jump by 1 only + */ + if (msg[0] > 0) + msg += msg[0]; + msg += 1; + if (msg > end) + return; + + /* Next two bytes are the ciphersuite length. */ + if (msg + 2 > end) + return; + rec_len = (msg[0] << 8) + msg[1]; + msg += 2; + if (msg + rec_len > end || msg + rec_len < msg) + return; + + capture = pool_zalloc(pool_head_ssl_capture); + if (!capture) + return; + /* Compute the xxh64 of the ciphersuite. */ + capture->xxh64 = XXH64(msg, rec_len, 0); + + /* Capture the ciphersuite. */ + capture->ciphersuite_len = MIN(global_ssl.capture_buffer_size, rec_len); + capture->ciphersuite_offset = 0; + memcpy(capture->data, msg, capture->ciphersuite_len); + msg += rec_len; + offset += capture->ciphersuite_len; + + /* Initialize other data */ + capture->protocol_version = protocol_version; + + /* Next, compression methods: + * if present, we have to jump by length + 1 for the size information + * if not present, we have to jump by 1 only + */ + if (msg[0] > 0) + msg += msg[0]; + msg += 1; + if (msg > end) + goto store_capture; + + /* We reached extensions */ + if (msg + 2 > end) + goto store_capture; + rec_len = (msg[0] << 8) + msg[1]; + msg += 2; + if (msg + rec_len > end || msg + rec_len < msg) + goto store_capture; + extensions_end = msg + rec_len; + capture->extensions_offset = offset; + + /* Parse each extension */ + while (msg + 4 < extensions_end) { + /* Add 2 bytes of extension_id */ + if (global_ssl.capture_buffer_size >= offset + 2) { + capture->data[offset++] = msg[0]; + capture->data[offset++] = msg[1]; + capture->extensions_len += 2; + } + else + break; + extension_id = (msg[0] << 8) + msg[1]; + /* Length of the extension */ + rec_len = (msg[2] << 8) + msg[3]; + + /* Expect 2 bytes extension id + 2 bytes extension size */ + msg += 2 + 2; + if (msg + rec_len > extensions_end || msg + rec_len < msg) + goto store_capture; + /* TLS Extensions + * https://www.iana.org/assignments/tls-extensiontype-values/tls-extensiontype-values.xhtml */ + if (extension_id == 0x000a) { + /* Elliptic Curves: + * https://www.rfc-editor.org/rfc/rfc8422.html + * https://www.rfc-editor.org/rfc/rfc7919.html */ + list_end = msg + rec_len; + if (msg + 2 > list_end) + goto store_capture; + rec_len = (msg[0] << 8) + msg[1]; + msg += 2; + + if (msg + rec_len > list_end || msg + rec_len < msg) + goto store_capture; + /* Store location/size of the list */ + ec_start = msg; + ec_len = rec_len; + } + else if (extension_id == 0x000b) { + /* Elliptic Curves Point Formats: + * https://www.rfc-editor.org/rfc/rfc8422.html */ + list_end = msg + rec_len; + if (msg + 1 > list_end) + goto store_capture; + rec_len = msg[0]; + msg += 1; + + if (msg + rec_len > list_end || msg + rec_len < msg) + goto store_capture; + /* Store location/size of the list */ + ec_formats_start = msg; + ec_formats_len = rec_len; + } + msg += rec_len; + } + + if (ec_start) { + rec_len = ec_len; + if (offset + rec_len > global_ssl.capture_buffer_size) + rec_len = global_ssl.capture_buffer_size - offset; + memcpy(capture->data + offset, ec_start, rec_len); + capture->ec_offset = offset; + capture->ec_len = rec_len; + offset += rec_len; + } + if (ec_formats_start) { + rec_len = ec_formats_len; + if (offset + rec_len > global_ssl.capture_buffer_size) + rec_len = global_ssl.capture_buffer_size - offset; + memcpy(capture->data + offset, ec_formats_start, rec_len); + 
capture->ec_formats_offset = offset; + capture->ec_formats_len = rec_len; + offset += rec_len; + } + + store_capture: + SSL_set_ex_data(ssl, ssl_capture_ptr_index, capture); +} + + +#ifdef HAVE_SSL_KEYLOG +static void ssl_init_keylog(struct connection *conn, int write_p, int version, + int content_type, const void *buf, size_t len, + SSL *ssl) +{ + struct ssl_keylog *keylog; + + if (SSL_get_ex_data(ssl, ssl_keylog_index)) + return; + + keylog = pool_zalloc(pool_head_ssl_keylog); + if (!keylog) + return; + + if (!SSL_set_ex_data(ssl, ssl_keylog_index, keylog)) { + pool_free(pool_head_ssl_keylog, keylog); + return; + } +} +#endif + +/* Callback is called for ssl protocol analyse */ +void ssl_sock_msgcbk(int write_p, int version, int content_type, const void *buf, size_t len, SSL *ssl, void *arg) +{ + struct connection *conn = SSL_get_ex_data(ssl, ssl_app_data_index); + struct ssl_sock_msg_callback *cbk; + + /* Try to call all callback functions that were registered by using + * ssl_sock_register_msg_callback(). + */ + list_for_each_entry(cbk, &ssl_sock_msg_callbacks, list) { + cbk->func(conn, write_p, version, content_type, buf, len, ssl); + } +} + +#if defined(OPENSSL_NPN_NEGOTIATED) && !defined(OPENSSL_NO_NEXTPROTONEG) +static int ssl_sock_srv_select_protos(SSL *s, unsigned char **out, unsigned char *outlen, + const unsigned char *in, unsigned int inlen, + void *arg) +{ + struct server *srv = arg; + + if (SSL_select_next_proto(out, outlen, in, inlen, (unsigned char *)srv->ssl_ctx.npn_str, + srv->ssl_ctx.npn_len) == OPENSSL_NPN_NEGOTIATED) + return SSL_TLSEXT_ERR_OK; + return SSL_TLSEXT_ERR_NOACK; +} +#endif + +#if defined(OPENSSL_NPN_NEGOTIATED) && !defined(OPENSSL_NO_NEXTPROTONEG) +/* This callback is used so that the server advertises the list of + * negotiable protocols for NPN. + */ +static int ssl_sock_advertise_npn_protos(SSL *s, const unsigned char **data, + unsigned int *len, void *arg) +{ + struct ssl_bind_conf *conf = arg; + + *data = (const unsigned char *)conf->npn_str; + *len = conf->npn_len; + return SSL_TLSEXT_ERR_OK; +} +#endif + +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation +/* This callback is used so that the server advertises the list of + * negotiable protocols for ALPN. + */ +static int ssl_sock_advertise_alpn_protos(SSL *s, const unsigned char **out, + unsigned char *outlen, + const unsigned char *server, + unsigned int server_len, void *arg) +{ + struct ssl_bind_conf *conf = arg; +#ifdef USE_QUIC + struct quic_conn *qc = SSL_get_ex_data(s, ssl_qc_app_data_index); +#endif + + if (SSL_select_next_proto((unsigned char**) out, outlen, (const unsigned char *)conf->alpn_str, + conf->alpn_len, server, server_len) != OPENSSL_NPN_NEGOTIATED) { +#ifdef USE_QUIC + if (qc) + quic_set_tls_alert(qc, SSL_AD_NO_APPLICATION_PROTOCOL); +#endif + return SSL_TLSEXT_ERR_NOACK; + } + +#ifdef USE_QUIC + if (qc && !quic_set_app_ops(qc, *out, *outlen)) { + quic_set_tls_alert(qc, SSL_AD_NO_APPLICATION_PROTOCOL); + return SSL_TLSEXT_ERR_NOACK; + } +#endif + + return SSL_TLSEXT_ERR_OK; +} +#endif + +#ifdef SSL_CTRL_SET_TLSEXT_HOSTNAME +#ifndef SSL_NO_GENERATE_CERTIFICATES + +/* Configure a DNS SAN extension on a certificate. 
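+ * For example (hypothetical name), calling it with servername
+ * "internal.example.test" appends an X509v3 extension equivalent to
+ * "subjectAltName = DNS:internal.example.test" to the generated cert.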
*/ +int ssl_sock_add_san_ext(X509V3_CTX* ctx, X509* cert, const char *servername) { + int failure = 0; + X509_EXTENSION *san_ext = NULL; + CONF *conf = NULL; + struct buffer *san_name = get_trash_chunk(); + + conf = NCONF_new(NULL); + if (!conf) { + failure = 1; + goto cleanup; + } + + /* Build an extension based on the DNS entry above */ + chunk_appendf(san_name, "DNS:%s", servername); + san_ext = X509V3_EXT_nconf_nid(conf, ctx, NID_subject_alt_name, san_name->area); + if (!san_ext) { + failure = 1; + goto cleanup; + } + + /* Add the extension */ + if (!X509_add_ext(cert, san_ext, -1 /* Add to end */)) { + failure = 1; + goto cleanup; + } + + /* Success */ + failure = 0; + +cleanup: + if (NULL != san_ext) X509_EXTENSION_free(san_ext); + if (NULL != conf) NCONF_free(conf); + + return failure; +} + +/* Create a X509 certificate with the specified servername and serial. This + * function returns a SSL_CTX object or NULL if an error occurs. */ +static SSL_CTX * +ssl_sock_do_create_cert(const char *servername, struct bind_conf *bind_conf, SSL *ssl) +{ + X509 *cacert = bind_conf->ca_sign_ckch->cert; + EVP_PKEY *capkey = bind_conf->ca_sign_ckch->key; + SSL_CTX *ssl_ctx = NULL; + X509 *newcrt = NULL; + EVP_PKEY *pkey = NULL; + SSL *tmp_ssl = NULL; + CONF *ctmp = NULL; + X509_NAME *name; + const EVP_MD *digest; + X509V3_CTX ctx; + unsigned int i; + int key_type; + + /* Get the private key of the default certificate and use it */ +#ifdef HAVE_SSL_CTX_get0_privatekey + pkey = SSL_CTX_get0_privatekey(bind_conf->default_ctx); +#else + tmp_ssl = SSL_new(bind_conf->default_ctx); + if (tmp_ssl) + pkey = SSL_get_privatekey(tmp_ssl); +#endif + if (!pkey) + goto mkcert_error; + + /* Create the certificate */ + if (!(newcrt = X509_new())) + goto mkcert_error; + + /* Set version number for the certificate (X509v3) and the serial + * number */ + if (X509_set_version(newcrt, 2L) != 1) + goto mkcert_error; + ASN1_INTEGER_set(X509_get_serialNumber(newcrt), _HA_ATOMIC_ADD_FETCH(&ssl_ctx_serial, 1)); + + /* Set duration for the certificate */ + if (!X509_gmtime_adj(X509_getm_notBefore(newcrt), (long)-60*60*24) || + !X509_gmtime_adj(X509_getm_notAfter(newcrt),(long)60*60*24*365)) + goto mkcert_error; + + /* set public key in the certificate */ + if (X509_set_pubkey(newcrt, pkey) != 1) + goto mkcert_error; + + /* Set issuer name from the CA */ + if (!(name = X509_get_subject_name(cacert))) + goto mkcert_error; + if (X509_set_issuer_name(newcrt, name) != 1) + goto mkcert_error; + + /* Set the subject name using the same, but the CN */ + name = X509_NAME_dup(name); + if (X509_NAME_add_entry_by_txt(name, "CN", MBSTRING_ASC, + (const unsigned char *)servername, + -1, -1, 0) != 1) { + X509_NAME_free(name); + goto mkcert_error; + } + if (X509_set_subject_name(newcrt, name) != 1) { + X509_NAME_free(name); + goto mkcert_error; + } + X509_NAME_free(name); + + /* Add x509v3 extensions as specified */ + ctmp = NCONF_new(NULL); + X509V3_set_ctx(&ctx, cacert, newcrt, NULL, NULL, 0); + for (i = 0; i < X509V3_EXT_SIZE; i++) { + X509_EXTENSION *ext; + + if (!(ext = X509V3_EXT_nconf(ctmp, &ctx, x509v3_ext_names[i], x509v3_ext_values[i]))) + goto mkcert_error; + if (!X509_add_ext(newcrt, ext, -1)) { + X509_EXTENSION_free(ext); + goto mkcert_error; + } + X509_EXTENSION_free(ext); + } + + /* Add SAN extension */ + if (ssl_sock_add_san_ext(&ctx, newcrt, servername)) { + goto mkcert_error; + } + + /* Sign the certificate with the CA private key */ + + key_type = EVP_PKEY_base_id(capkey); + + if (key_type == EVP_PKEY_DSA) + digest = 
EVP_sha1(); + else if (key_type == EVP_PKEY_RSA) + digest = EVP_sha256(); + else if (key_type == EVP_PKEY_EC) + digest = EVP_sha256(); + else { +#ifdef ASN1_PKEY_CTRL_DEFAULT_MD_NID + int nid; + + if (EVP_PKEY_get_default_digest_nid(capkey, &nid) <= 0) + goto mkcert_error; + if (!(digest = EVP_get_digestbynid(nid))) + goto mkcert_error; +#else + goto mkcert_error; +#endif + } + + if (!(X509_sign(newcrt, capkey, digest))) + goto mkcert_error; + + /* Create and set the new SSL_CTX */ + if (!(ssl_ctx = SSL_CTX_new(SSLv23_server_method()))) + goto mkcert_error; + if (!SSL_CTX_use_PrivateKey(ssl_ctx, pkey)) + goto mkcert_error; + if (!SSL_CTX_use_certificate(ssl_ctx, newcrt)) + goto mkcert_error; + if (!SSL_CTX_check_private_key(ssl_ctx)) + goto mkcert_error; + + /* Build chaining the CA cert and the rest of the chain, keep these order */ +#if defined(SSL_CTX_add1_chain_cert) + if (!SSL_CTX_add1_chain_cert(ssl_ctx, bind_conf->ca_sign_ckch->cert)) { + goto mkcert_error; + } + + if (bind_conf->ca_sign_ckch->chain) { + for (i = 0; i < sk_X509_num(bind_conf->ca_sign_ckch->chain); i++) { + X509 *chain_cert = sk_X509_value(bind_conf->ca_sign_ckch->chain, i); + if (!SSL_CTX_add1_chain_cert(ssl_ctx, chain_cert)) { + goto mkcert_error; + } + } + } +#endif + + if (newcrt) X509_free(newcrt); + +#ifndef OPENSSL_NO_DH +#if (HA_OPENSSL_VERSION_NUMBER < 0x3000000fL) + SSL_CTX_set_tmp_dh_callback(ssl_ctx, ssl_get_tmp_dh_cbk); +#else + ssl_sock_set_tmp_dh_from_pkey(ssl_ctx, pkey); +#endif +#endif + +#if (HA_OPENSSL_VERSION_NUMBER >= 0x10101000L) +#if defined(SSL_CTX_set1_curves_list) + { + const char *ecdhe = (bind_conf->ssl_conf.ecdhe ? bind_conf->ssl_conf.ecdhe : ECDHE_DEFAULT_CURVE); + if (!SSL_CTX_set1_curves_list(ssl_ctx, ecdhe)) + goto end; + } +#endif +#else +#if defined(SSL_CTX_set_tmp_ecdh) && !defined(OPENSSL_NO_ECDH) + { + const char *ecdhe = (bind_conf->ssl_conf.ecdhe ? bind_conf->ssl_conf.ecdhe : ECDHE_DEFAULT_CURVE); + EC_KEY *ecc; + int nid; + + if ((nid = OBJ_sn2nid(ecdhe)) == NID_undef) + goto end; + if (!(ecc = EC_KEY_new_by_curve_name(nid))) + goto end; + SSL_CTX_set_tmp_ecdh(ssl_ctx, ecc); + EC_KEY_free(ecc); + } +#endif /* defined(SSL_CTX_set_tmp_ecdh) && !defined(OPENSSL_NO_ECDH) */ +#endif /* HA_OPENSSL_VERSION_NUMBER >= 0x10101000L */ + end: + return ssl_ctx; + + mkcert_error: + if (ctmp) NCONF_free(ctmp); + if (tmp_ssl) SSL_free(tmp_ssl); + if (ssl_ctx) SSL_CTX_free(ssl_ctx); + if (newcrt) X509_free(newcrt); + return NULL; +} + + +/* Do a lookup for a certificate in the LRU cache used to store generated + * certificates and immediately assign it to the SSL session if not null. */ +SSL_CTX * +ssl_sock_assign_generated_cert(unsigned int key, struct bind_conf *bind_conf, SSL *ssl) +{ + struct lru64 *lru = NULL; + + if (ssl_ctx_lru_tree) { + HA_RWLOCK_WRLOCK(SSL_GEN_CERTS_LOCK, &ssl_ctx_lru_rwlock); + lru = lru64_lookup(key, ssl_ctx_lru_tree, bind_conf->ca_sign_ckch->cert, 0); + if (lru && lru->domain) { + if (ssl) + SSL_set_SSL_CTX(ssl, (SSL_CTX *)lru->data); + HA_RWLOCK_WRUNLOCK(SSL_GEN_CERTS_LOCK, &ssl_ctx_lru_rwlock); + return (SSL_CTX *)lru->data; + } + HA_RWLOCK_WRUNLOCK(SSL_GEN_CERTS_LOCK, &ssl_ctx_lru_rwlock); + } + return NULL; +} + +/* Same as <ssl_sock_assign_generated_cert> but without SSL session. This + * function is not thread-safe, it should only be used to check if a certificate + * exists in the lru cache (with no warranty it will not be removed by another + * thread). It is kept for backward compatibility. 
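+ * A minimal usage sketch (illustrative only, using helpers from this
+ * file):
+ *
+ *   unsigned int key = ssl_sock_generated_cert_key(name, strlen(name));
+ *   SSL_CTX *ssl_ctx = ssl_sock_get_generated_cert(key, bind_conf);
+ *
+ * where <name> is a hypothetical servername; a non-NULL result may
+ * still be evicted by another thread right after the lookup.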
*/ +SSL_CTX * +ssl_sock_get_generated_cert(unsigned int key, struct bind_conf *bind_conf) +{ + return ssl_sock_assign_generated_cert(key, bind_conf, NULL); +} + +/* Set a certificate int the LRU cache used to store generated + * certificate. Return 0 on success, otherwise -1 */ +int +ssl_sock_set_generated_cert(SSL_CTX *ssl_ctx, unsigned int key, struct bind_conf *bind_conf) +{ + struct lru64 *lru = NULL; + + if (ssl_ctx_lru_tree) { + HA_RWLOCK_WRLOCK(SSL_GEN_CERTS_LOCK, &ssl_ctx_lru_rwlock); + lru = lru64_get(key, ssl_ctx_lru_tree, bind_conf->ca_sign_ckch->cert, 0); + if (!lru) { + HA_RWLOCK_WRUNLOCK(SSL_GEN_CERTS_LOCK, &ssl_ctx_lru_rwlock); + return -1; + } + if (lru->domain && lru->data) + lru->free((SSL_CTX *)lru->data); + lru64_commit(lru, ssl_ctx, bind_conf->ca_sign_ckch->cert, 0, (void (*)(void *))SSL_CTX_free); + HA_RWLOCK_WRUNLOCK(SSL_GEN_CERTS_LOCK, &ssl_ctx_lru_rwlock); + return 0; + } + return -1; +} + +/* Compute the key of the certificate. */ +unsigned int +ssl_sock_generated_cert_key(const void *data, size_t len) +{ + return XXH32(data, len, ssl_ctx_lru_seed); +} + +/* Generate a cert and immediately assign it to the SSL session so that the cert's + * refcount is maintained regardless of the cert's presence in the LRU cache. + */ +static int +ssl_sock_generate_certificate(const char *servername, struct bind_conf *bind_conf, SSL *ssl) +{ + X509 *cacert = bind_conf->ca_sign_ckch->cert; + SSL_CTX *ssl_ctx = NULL; + struct lru64 *lru = NULL; + unsigned int key; + + key = ssl_sock_generated_cert_key(servername, strlen(servername)); + if (ssl_ctx_lru_tree) { + HA_RWLOCK_WRLOCK(SSL_GEN_CERTS_LOCK, &ssl_ctx_lru_rwlock); + lru = lru64_get(key, ssl_ctx_lru_tree, cacert, 0); + if (lru && lru->domain) + ssl_ctx = (SSL_CTX *)lru->data; + if (!ssl_ctx && lru) { + ssl_ctx = ssl_sock_do_create_cert(servername, bind_conf, ssl); + lru64_commit(lru, ssl_ctx, cacert, 0, (void (*)(void *))SSL_CTX_free); + } + SSL_set_SSL_CTX(ssl, ssl_ctx); + HA_RWLOCK_WRUNLOCK(SSL_GEN_CERTS_LOCK, &ssl_ctx_lru_rwlock); + return 1; + } + else { + ssl_ctx = ssl_sock_do_create_cert(servername, bind_conf, ssl); + SSL_set_SSL_CTX(ssl, ssl_ctx); + /* No LRU cache, this CTX will be released as soon as the session dies */ + SSL_CTX_free(ssl_ctx); + return 1; + } + return 0; +} +static int +ssl_sock_generate_certificate_from_conn(struct bind_conf *bind_conf, SSL *ssl) +{ + unsigned int key; + struct connection *conn = SSL_get_ex_data(ssl, ssl_app_data_index); + + if (conn_get_dst(conn)) { + key = ssl_sock_generated_cert_key(conn->dst, get_addr_len(conn->dst)); + if (ssl_sock_assign_generated_cert(key, bind_conf, ssl)) + return 1; + } + return 0; +} +#endif /* !defined SSL_NO_GENERATE_CERTIFICATES */ + +#if (HA_OPENSSL_VERSION_NUMBER < 0x1010000fL) + +static void ctx_set_SSLv3_func(SSL_CTX *ctx, set_context_func c) +{ +#if SSL_OP_NO_SSLv3 + c == SET_SERVER ? SSL_CTX_set_ssl_version(ctx, SSLv3_server_method()) + : SSL_CTX_set_ssl_version(ctx, SSLv3_client_method()); +#endif +} +static void ctx_set_TLSv10_func(SSL_CTX *ctx, set_context_func c) { + c == SET_SERVER ? SSL_CTX_set_ssl_version(ctx, TLSv1_server_method()) + : SSL_CTX_set_ssl_version(ctx, TLSv1_client_method()); +} +static void ctx_set_TLSv11_func(SSL_CTX *ctx, set_context_func c) { +#if SSL_OP_NO_TLSv1_1 + c == SET_SERVER ? SSL_CTX_set_ssl_version(ctx, TLSv1_1_server_method()) + : SSL_CTX_set_ssl_version(ctx, TLSv1_1_client_method()); +#endif +} +static void ctx_set_TLSv12_func(SSL_CTX *ctx, set_context_func c) { +#if SSL_OP_NO_TLSv1_2 + c == SET_SERVER ? 
SSL_CTX_set_ssl_version(ctx, TLSv1_2_server_method()) + : SSL_CTX_set_ssl_version(ctx, TLSv1_2_client_method()); +#endif +} +/* TLSv1.2 is the last supported version in this context. */ +static void ctx_set_TLSv13_func(SSL_CTX *ctx, set_context_func c) {} +/* Unusable in this context. */ +static void ssl_set_SSLv3_func(SSL *ssl, set_context_func c) {} +static void ssl_set_TLSv10_func(SSL *ssl, set_context_func c) {} +static void ssl_set_TLSv11_func(SSL *ssl, set_context_func c) {} +static void ssl_set_TLSv12_func(SSL *ssl, set_context_func c) {} +static void ssl_set_TLSv13_func(SSL *ssl, set_context_func c) {} +#else /* openssl >= 1.1.0 */ + +static void ctx_set_SSLv3_func(SSL_CTX *ctx, set_context_func c) { + c == SET_MAX ? SSL_CTX_set_max_proto_version(ctx, SSL3_VERSION) + : SSL_CTX_set_min_proto_version(ctx, SSL3_VERSION); +} +static void ssl_set_SSLv3_func(SSL *ssl, set_context_func c) { + c == SET_MAX ? SSL_set_max_proto_version(ssl, SSL3_VERSION) + : SSL_set_min_proto_version(ssl, SSL3_VERSION); +} +static void ctx_set_TLSv10_func(SSL_CTX *ctx, set_context_func c) { + c == SET_MAX ? SSL_CTX_set_max_proto_version(ctx, TLS1_VERSION) + : SSL_CTX_set_min_proto_version(ctx, TLS1_VERSION); +} +static void ssl_set_TLSv10_func(SSL *ssl, set_context_func c) { + c == SET_MAX ? SSL_set_max_proto_version(ssl, TLS1_VERSION) + : SSL_set_min_proto_version(ssl, TLS1_VERSION); +} +static void ctx_set_TLSv11_func(SSL_CTX *ctx, set_context_func c) { + c == SET_MAX ? SSL_CTX_set_max_proto_version(ctx, TLS1_1_VERSION) + : SSL_CTX_set_min_proto_version(ctx, TLS1_1_VERSION); +} +static void ssl_set_TLSv11_func(SSL *ssl, set_context_func c) { + c == SET_MAX ? SSL_set_max_proto_version(ssl, TLS1_1_VERSION) + : SSL_set_min_proto_version(ssl, TLS1_1_VERSION); +} +static void ctx_set_TLSv12_func(SSL_CTX *ctx, set_context_func c) { + c == SET_MAX ? SSL_CTX_set_max_proto_version(ctx, TLS1_2_VERSION) + : SSL_CTX_set_min_proto_version(ctx, TLS1_2_VERSION); +} +static void ssl_set_TLSv12_func(SSL *ssl, set_context_func c) { + c == SET_MAX ? SSL_set_max_proto_version(ssl, TLS1_2_VERSION) + : SSL_set_min_proto_version(ssl, TLS1_2_VERSION); +} +static void ctx_set_TLSv13_func(SSL_CTX *ctx, set_context_func c) { +#if (HA_OPENSSL_VERSION_NUMBER >= 0x10101000L) + c == SET_MAX ? SSL_CTX_set_max_proto_version(ctx, TLS1_3_VERSION) + : SSL_CTX_set_min_proto_version(ctx, TLS1_3_VERSION); +#endif +} +static void ssl_set_TLSv13_func(SSL *ssl, set_context_func c) { +#if (HA_OPENSSL_VERSION_NUMBER >= 0x10101000L) + c == SET_MAX ? 
SSL_set_max_proto_version(ssl, TLS1_3_VERSION) + : SSL_set_min_proto_version(ssl, TLS1_3_VERSION); +#endif +} +#endif +static void ctx_set_None_func(SSL_CTX *ctx, set_context_func c) { } +static void ssl_set_None_func(SSL *ssl, set_context_func c) { } + +struct methodVersions methodVersions[] = { + {0, 0, ctx_set_None_func, ssl_set_None_func, "NONE"}, /* CONF_TLSV_NONE */ + {SSL_OP_NO_SSLv3, MC_SSL_O_NO_SSLV3, ctx_set_SSLv3_func, ssl_set_SSLv3_func, "SSLv3"}, /* CONF_SSLV3 */ + {SSL_OP_NO_TLSv1, MC_SSL_O_NO_TLSV10, ctx_set_TLSv10_func, ssl_set_TLSv10_func, "TLSv1.0"}, /* CONF_TLSV10 */ + {SSL_OP_NO_TLSv1_1, MC_SSL_O_NO_TLSV11, ctx_set_TLSv11_func, ssl_set_TLSv11_func, "TLSv1.1"}, /* CONF_TLSV11 */ + {SSL_OP_NO_TLSv1_2, MC_SSL_O_NO_TLSV12, ctx_set_TLSv12_func, ssl_set_TLSv12_func, "TLSv1.2"}, /* CONF_TLSV12 */ + {SSL_OP_NO_TLSv1_3, MC_SSL_O_NO_TLSV13, ctx_set_TLSv13_func, ssl_set_TLSv13_func, "TLSv1.3"}, /* CONF_TLSV13 */ +}; + +static void ssl_sock_switchctx_set(SSL *ssl, SSL_CTX *ctx) +{ + SSL_set_verify(ssl, SSL_CTX_get_verify_mode(ctx), ssl_sock_bind_verifycbk); + SSL_set_client_CA_list(ssl, SSL_dup_CA_list(SSL_CTX_get_client_CA_list(ctx))); + SSL_set_SSL_CTX(ssl, ctx); +} + +/* + * Return the right sni_ctx for a <bind_conf> and a chosen <servername> (must be in lowercase) + * RSA <have_rsa_sig> and ECDSA <have_ecdsa_sig> capabilities of the client can also be used. + * + * This function does a lookup in the bind_conf sni tree so the caller should lock its tree. + */ +static __maybe_unused struct sni_ctx *ssl_sock_chose_sni_ctx(struct bind_conf *s, const char *servername, + int have_rsa_sig, int have_ecdsa_sig) +{ + struct ebmb_node *node, *n, *node_ecdsa = NULL, *node_rsa = NULL, *node_anonymous = NULL; + const char *wildp = NULL; + int i; + + /* look for the first dot for wildcard search */ + for (i = 0; servername[i] != '\0'; i++) { + if (servername[i] == '.') { + wildp = &servername[i]; + break; + } + } + + /* Look for an ECDSA, RSA and DSA certificate, first in the single + * name and if not found in the wildcard */ + for (i = 0; i < 2; i++) { + if (i == 0) /* lookup in full qualified names */ + node = ebst_lookup(&s->sni_ctx, trash.area); + else if (i == 1 && wildp) /* lookup in wildcards names */ + node = ebst_lookup(&s->sni_w_ctx, wildp); + else + break; + + for (n = node; n; n = ebmb_next_dup(n)) { + + /* lookup a not neg filter */ + if (!container_of(n, struct sni_ctx, name)->neg) { + struct sni_ctx *sni, *sni_tmp; + int skip = 0; + + if (i == 1 && wildp) { /* wildcard */ + /* If this is a wildcard, look for an exclusion on the same crt-list line */ + sni = container_of(n, struct sni_ctx, name); + list_for_each_entry(sni_tmp, &sni->ckch_inst->sni_ctx, by_ckch_inst) { + if (sni_tmp->neg && (strcmp((const char *)sni_tmp->name.key, trash.area) == 0)) { + skip = 1; + break; + } + } + if (skip) + continue; + } + + switch(container_of(n, struct sni_ctx, name)->kinfo.sig) { + case TLSEXT_signature_ecdsa: + if (!node_ecdsa) + node_ecdsa = n; + break; + case TLSEXT_signature_rsa: + if (!node_rsa) + node_rsa = n; + break; + default: /* TLSEXT_signature_anonymous|dsa */ + if (!node_anonymous) + node_anonymous = n; + break; + } + } + } + } + /* Once the certificates are found, select them depending on what is + * supported in the client and by key_signature priority order: EDSA > + * RSA > DSA */ + if (have_ecdsa_sig && node_ecdsa) + node = node_ecdsa; + else if (have_rsa_sig && node_rsa) + node = node_rsa; + else if (node_anonymous) + node = node_anonymous; + else if (node_ecdsa) + node 
= node_ecdsa; /* no ecdsa signature case (< TLSv1.2) */ + else + node = node_rsa; /* no rsa signature case (far far away) */ + + if (node) + return container_of(node, struct sni_ctx, name); + + return NULL; +} + +#ifdef HAVE_SSL_CLIENT_HELLO_CB + +int ssl_sock_switchctx_err_cbk(SSL *ssl, int *al, void *priv) +{ + struct bind_conf *s = priv; + (void)al; /* shut gcc stupid warning */ + + if (SSL_get_servername(ssl, TLSEXT_NAMETYPE_host_name) || (s->options & BC_O_GENERATE_CERTS)) + return SSL_TLSEXT_ERR_OK; + return SSL_TLSEXT_ERR_NOACK; +} + +#ifdef OPENSSL_IS_BORINGSSL +int ssl_sock_switchctx_cbk(const struct ssl_early_callback_ctx *ctx) +{ + SSL *ssl = ctx->ssl; +#else +int ssl_sock_switchctx_cbk(SSL *ssl, int *al, void *arg) +{ +#endif + struct connection *conn = SSL_get_ex_data(ssl, ssl_app_data_index); +#ifdef USE_QUIC + struct quic_conn *qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index); +#endif /* USE_QUIC */ + struct bind_conf *s = NULL; + const uint8_t *extension_data; + size_t extension_len; + int has_rsa_sig = 0, has_ecdsa_sig = 0; + struct sni_ctx *sni_ctx; + const char *servername; + size_t servername_len; + int allow_early = 0; + int i; + + if (conn) + s = __objt_listener(conn->target)->bind_conf; +#ifdef USE_QUIC + else if (qc) + s = qc->li->bind_conf; +#endif /* USE_QUIC */ + + if (!s) { + /* must never happen */ + ABORT_NOW(); + return 0; + } + +#ifdef USE_QUIC + if (qc) { + /* Look for the QUIC transport parameters. */ +#ifdef OPENSSL_IS_BORINGSSL + if (!SSL_early_callback_ctx_extension_get(ctx, qc->tps_tls_ext, + &extension_data, &extension_len)) +#else + if (!SSL_client_hello_get0_ext(ssl, qc->tps_tls_ext, + &extension_data, &extension_len)) +#endif + { + /* This is not redundant. It we only return 0 without setting + * <*al>, this has as side effect to generate another TLS alert + * which would be set after calling quic_set_tls_alert(). + */ + *al = SSL_AD_MISSING_EXTENSION; + quic_set_tls_alert(qc, SSL_AD_MISSING_EXTENSION); + return 0; + } + + if (!quic_transport_params_store(qc, 0, extension_data, + extension_data + extension_len)) + goto abort; + + qc->flags |= QUIC_FL_CONN_TX_TP_RECEIVED; + } +#endif /* USE_QUIC */ + + if (s->ssl_conf.early_data) + allow_early = 1; +#ifdef OPENSSL_IS_BORINGSSL + if (SSL_early_callback_ctx_extension_get(ctx, TLSEXT_TYPE_server_name, + &extension_data, &extension_len)) { +#else + if (SSL_client_hello_get0_ext(ssl, TLSEXT_TYPE_server_name, &extension_data, &extension_len)) { +#endif + /* + * The server_name extension was given too much extensibility when it + * was written, so parsing the normal case is a bit complex. + */ + size_t len; + if (extension_len <= 2) + goto abort; + /* Extract the length of the supplied list of names. */ + len = (*extension_data++) << 8; + len |= *extension_data++; + if (len + 2 != extension_len) + goto abort; + /* + * The list in practice only has a single element, so we only consider + * the first one. + */ + if (len == 0 || *extension_data++ != TLSEXT_NAMETYPE_host_name) + goto abort; + extension_len = len - 1; + /* Now we can finally pull out the byte array with the actual hostname. 
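+	 * As an illustrative example, the extension body for
+	 * "example.com" (11 bytes) is laid out as:
+	 *   00 0e        server_name_list length (14)
+	 *   00           name_type: host_name
+	 *   00 0b        host_name length (11)
+	 *   65 78 61 6d 70 6c 65 2e 63 6f 6d   "example.com"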
 */
+		if (extension_len <= 2)
+			goto abort;
+		len = (*extension_data++) << 8;
+		len |= *extension_data++;
+		if (len == 0 || len + 2 > extension_len || len > TLSEXT_MAXLEN_host_name
+		    || memchr(extension_data, 0, len) != NULL)
+			goto abort;
+		servername = (char *)extension_data;
+		servername_len = len;
+	} else {
+#if (!defined SSL_NO_GENERATE_CERTIFICATES)
+		if (s->options & BC_O_GENERATE_CERTS && ssl_sock_generate_certificate_from_conn(s, ssl)) {
+			goto allow_early;
+		}
+#endif
+		/* without SNI extension, use the default_ctx (needs SSL_TLSEXT_ERR_NOACK) */
+		if (!s->strict_sni) {
+			HA_RWLOCK_RDLOCK(SNI_LOCK, &s->sni_lock);
+			ssl_sock_switchctx_set(ssl, s->default_ctx);
+			HA_RWLOCK_RDUNLOCK(SNI_LOCK, &s->sni_lock);
+			goto allow_early;
+		}
+		goto abort;
+	}
+
+	/* extract/check clientHello information */
+#ifdef OPENSSL_IS_BORINGSSL
+	if (SSL_early_callback_ctx_extension_get(ctx, TLSEXT_TYPE_signature_algorithms, &extension_data, &extension_len)) {
+#else
+	if (SSL_client_hello_get0_ext(ssl, TLSEXT_TYPE_signature_algorithms, &extension_data, &extension_len)) {
+#endif
+		uint8_t sign;
+		size_t len;
+		if (extension_len < 2)
+			goto abort;
+		len = (*extension_data++) << 8;
+		len |= *extension_data++;
+		if (len + 2 != extension_len)
+			goto abort;
+		if (len % 2 != 0)
+			goto abort;
+		for (; len > 0; len -= 2) {
+			extension_data++; /* hash */
+			sign = *extension_data++;
+			switch (sign) {
+			case TLSEXT_signature_rsa:
+				has_rsa_sig = 1;
+				break;
+			case TLSEXT_signature_ecdsa:
+				has_ecdsa_sig = 1;
+				break;
+			default:
+				continue;
+			}
+			if (has_ecdsa_sig && has_rsa_sig)
+				break;
+		}
+	} else {
+		/* without TLSEXT_TYPE_signature_algorithms extension (< TLSv1.2) */
+		has_rsa_sig = 1;
+	}
+	if (has_ecdsa_sig) {  /* rare case: the client advertises an ECDSA signature but no ECDSA cipher */
+		const SSL_CIPHER *cipher;
+		uint32_t cipher_id;
+		size_t len;
+		const uint8_t *cipher_suites;
+		has_ecdsa_sig = 0;
+#ifdef OPENSSL_IS_BORINGSSL
+		len = ctx->cipher_suites_len;
+		cipher_suites = ctx->cipher_suites;
+#else
+		len = SSL_client_hello_get0_ciphers(ssl, &cipher_suites);
+#endif
+		if (len % 2 != 0)
+			goto abort;
+		for (; len != 0; len -= 2, cipher_suites += 2) {
+#ifdef OPENSSL_IS_BORINGSSL
+			uint16_t cipher_suite = (cipher_suites[0] << 8) | cipher_suites[1];
+			cipher = SSL_get_cipher_by_value(cipher_suite);
+#else
+			cipher = SSL_CIPHER_find(ssl, cipher_suites);
+#endif
+			if (!cipher)
+				continue;
+
+			cipher_id = SSL_CIPHER_get_id(cipher);
+			/* skip the SCSV "fake" signaling ciphersuites because they are NID_auth_any (RFC 7507) */
+			if (cipher_id == SSL3_CK_SCSV || cipher_id == SSL3_CK_FALLBACK_SCSV)
+				continue;
+
+			if (SSL_CIPHER_get_auth_nid(cipher) == NID_auth_ecdsa
+			    || SSL_CIPHER_get_auth_nid(cipher) == NID_auth_any) {
+				has_ecdsa_sig = 1;
+				break;
+			}
+		}
+	}
+
+	/* we need to transform this into a NULL-terminated lowercase string */
+	for (i = 0; i < trash.size && i < servername_len; i++)
+		trash.area[i] = tolower(servername[i]);
+	trash.area[i] = 0;
+	servername = trash.area;
+
+	HA_RWLOCK_RDLOCK(SNI_LOCK, &s->sni_lock);
+	sni_ctx = ssl_sock_chose_sni_ctx(s, servername, has_rsa_sig, has_ecdsa_sig);
+	if (sni_ctx) {
+		/* switch ctx */
+		struct ssl_bind_conf *conf = sni_ctx->conf;
+		ssl_sock_switchctx_set(ssl, sni_ctx->ctx);
+		if (conf) {
+			methodVersions[conf->ssl_methods.min].ssl_set_version(ssl, SET_MIN);
+			methodVersions[conf->ssl_methods.max].ssl_set_version(ssl, SET_MAX);
+			if (conf->early_data)
+				allow_early = 1;
+		}
+		HA_RWLOCK_RDUNLOCK(SNI_LOCK, &s->sni_lock);
+		goto allow_early;
+	}
+
+	HA_RWLOCK_RDUNLOCK(SNI_LOCK,
&s->sni_lock); +#if (!defined SSL_NO_GENERATE_CERTIFICATES) + if (s->options & BC_O_GENERATE_CERTS && ssl_sock_generate_certificate(servername, s, ssl)) { + /* switch ctx done in ssl_sock_generate_certificate */ + goto allow_early; + } +#endif + if (!s->strict_sni) { + /* no certificate match, is the default_ctx */ + HA_RWLOCK_RDLOCK(SNI_LOCK, &s->sni_lock); + ssl_sock_switchctx_set(ssl, s->default_ctx); + HA_RWLOCK_RDUNLOCK(SNI_LOCK, &s->sni_lock); + goto allow_early; + } + + /* We are about to raise an handshake error so the servername extension + * callback will never be called and the SNI will never be stored in the + * SSL context. In order for the ssl_fc_sni sample fetch to still work + * in such a case, we store the SNI ourselves as an ex_data information + * in the SSL context. + */ + { + char *client_sni = pool_alloc(ssl_sock_client_sni_pool); + if (client_sni) { + strncpy(client_sni, servername, TLSEXT_MAXLEN_host_name); + client_sni[TLSEXT_MAXLEN_host_name] = '\0'; + SSL_set_ex_data(ssl, ssl_client_sni_index, client_sni); + } + } + + /* other cases fallback on abort, if strict-sni is set but no node was found */ + + abort: + /* abort handshake (was SSL_TLSEXT_ERR_ALERT_FATAL) */ + if (conn) + conn->err_code = CO_ER_SSL_HANDSHAKE; +#ifdef OPENSSL_IS_BORINGSSL + return ssl_select_cert_error; +#else + *al = SSL_AD_UNRECOGNIZED_NAME; + return 0; +#endif + +allow_early: +#ifdef OPENSSL_IS_BORINGSSL + if (allow_early) + SSL_set_early_data_enabled(ssl, 1); +#else + if (!allow_early) + SSL_set_max_early_data(ssl, 0); +#endif + return 1; +} + +#else /* ! HAVE_SSL_CLIENT_HELLO_CB */ + +/* Sets the SSL ctx of <ssl> to match the advertised server name. Returns a + * warning when no match is found, which implies the default (first) cert + * will keep being used. + */ +int ssl_sock_switchctx_cbk(SSL *ssl, int *al, void *priv) +{ + const char *servername; + const char *wildp = NULL; + struct ebmb_node *node, *n; + struct bind_conf *s = priv; +#ifdef USE_QUIC + const uint8_t *extension_data; + size_t extension_len; + struct quic_conn *qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index); +#endif /* USE_QUIC */ + int i; + (void)al; /* shut gcc stupid warning */ + +#ifdef USE_QUIC + if (qc) { + + /* Look for the QUIC transport parameters. */ + SSL_get_peer_quic_transport_params(ssl, &extension_data, &extension_len); + if (extension_len == 0) { + /* This is not redundant. It we only return 0 without setting + * <*al>, this has as side effect to generate another TLS alert + * which would be set after calling quic_set_tls_alert(). 
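+		 * (Setting <*al> here keeps the alert emitted by the TLS
+		 * stack consistent with the one recorded on the QUIC
+		 * connection, instead of a second, different alert.)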
+ */ + *al = SSL_AD_MISSING_EXTENSION; + quic_set_tls_alert(qc, SSL_AD_MISSING_EXTENSION); + return SSL_TLSEXT_ERR_NOACK; + } + + if (!quic_transport_params_store(qc, 0, extension_data, + extension_data + extension_len)) + return SSL_TLSEXT_ERR_NOACK; + + qc->flags |= QUIC_FL_CONN_TX_TP_RECEIVED; + } +#endif /* USE_QUIC */ + + servername = SSL_get_servername(ssl, TLSEXT_NAMETYPE_host_name); + if (!servername) { +#if (!defined SSL_NO_GENERATE_CERTIFICATES) + if (s->options & BC_O_GENERATE_CERTS && ssl_sock_generate_certificate_from_conn(s, ssl)) + return SSL_TLSEXT_ERR_OK; +#endif + if (s->strict_sni) + return SSL_TLSEXT_ERR_ALERT_FATAL; + HA_RWLOCK_RDLOCK(SNI_LOCK, &s->sni_lock); + ssl_sock_switchctx_set(ssl, s->default_ctx); + HA_RWLOCK_RDUNLOCK(SNI_LOCK, &s->sni_lock); + return SSL_TLSEXT_ERR_NOACK; + } + + for (i = 0; i < trash.size; i++) { + if (!servername[i]) + break; + trash.area[i] = tolower((unsigned char)servername[i]); + if (!wildp && (trash.area[i] == '.')) + wildp = &trash.area[i]; + } + trash.area[i] = 0; + + HA_RWLOCK_RDLOCK(SNI_LOCK, &s->sni_lock); + node = NULL; + /* lookup in full qualified names */ + for (n = ebst_lookup(&s->sni_ctx, trash.area); n; n = ebmb_next_dup(n)) { + /* lookup a not neg filter */ + if (!container_of(n, struct sni_ctx, name)->neg) { + node = n; + break; + } + } + if (!node && wildp) { + /* lookup in wildcards names */ + for (n = ebst_lookup(&s->sni_w_ctx, wildp); n; n = ebmb_next_dup(n)) { + /* lookup a not neg filter */ + if (!container_of(n, struct sni_ctx, name)->neg) { + node = n; + break; + } + } + } + if (!node) { +#if (!defined SSL_NO_GENERATE_CERTIFICATES) + if (s->options & BC_O_GENERATE_CERTS && ssl_sock_generate_certificate(servername, s, ssl)) { + /* switch ctx done in ssl_sock_generate_certificate */ + HA_RWLOCK_RDUNLOCK(SNI_LOCK, &s->sni_lock); + return SSL_TLSEXT_ERR_OK; + } +#endif + if (s->strict_sni) { + HA_RWLOCK_RDUNLOCK(SNI_LOCK, &s->sni_lock); + return SSL_TLSEXT_ERR_ALERT_FATAL; + } + ssl_sock_switchctx_set(ssl, s->default_ctx); + HA_RWLOCK_RDUNLOCK(SNI_LOCK, &s->sni_lock); + return SSL_TLSEXT_ERR_OK; + } + + /* switch ctx */ + ssl_sock_switchctx_set(ssl, container_of(node, struct sni_ctx, name)->ctx); + HA_RWLOCK_RDUNLOCK(SNI_LOCK, &s->sni_lock); + return SSL_TLSEXT_ERR_OK; +} +#endif /* (!) OPENSSL_IS_BORINGSSL */ +#endif /* SSL_CTRL_SET_TLSEXT_HOSTNAME */ + +#if 0 && defined(USE_OPENSSL_WOLFSSL) +/* This implement the equivalent of the clientHello Callback but using the cert_cb. + * WolfSSL is able to extract the sigalgs and ciphers of the client byt using the API + * provided in https://github.com/wolfSSL/wolfssl/pull/6963 + * + * Not activated for now since the PR is not merged. 
+ */ +static int ssl_sock_switchctx_wolfSSL_cbk(WOLFSSL* ssl, void* arg) +{ + struct bind_conf *s = arg; + int has_rsa_sig = 0, has_ecdsa_sig = 0; + const char *servername; + struct sni_ctx *sni_ctx; + int i; + + if (!s) { + /* must never happen */ + ABORT_NOW(); + return 0; + } + + servername = SSL_get_servername(ssl, TLSEXT_NAMETYPE_host_name); + if (!servername) { + /* without SNI extension, is the default_ctx (need SSL_TLSEXT_ERR_NOACK) */ + if (!s->strict_sni) { + HA_RWLOCK_RDLOCK(SNI_LOCK, &s->sni_lock); + ssl_sock_switchctx_set(ssl, s->default_ctx); + HA_RWLOCK_RDUNLOCK(SNI_LOCK, &s->sni_lock); + goto allow_early; + } + goto abort; + } + + /* extract sigalgs and ciphers */ + { + const byte* suites = NULL; + word16 suiteSz = 0; + const byte* hashSigAlgo = NULL; + word16 hashSigAlgoSz = 0; + word16 idx = 0; + + wolfSSL_get_client_suites_sigalgs(ssl, &suites, &suiteSz, &hashSigAlgo, &hashSigAlgoSz); + if (suites == NULL || suiteSz == 0 || hashSigAlgo == NULL || hashSigAlgoSz == 0) + return 0; + + if (SSL_version(ssl) != TLS1_3_VERSION) { + for (idx = 0; idx < suiteSz; idx += 2) { + WOLFSSL_CIPHERSUITE_INFO info; + info = wolfSSL_get_ciphersuite_info(suites[idx], suites[idx+1]); + if (info.rsaAuth) + has_rsa_sig = 1; + else if (info.eccAuth) + has_ecdsa_sig = 1; + } + } + + if (hashSigAlgoSz > 0) { + /* sigalgs extension takes precedence over ciphersuites */ + has_ecdsa_sig = 0; + has_rsa_sig = 0; + } + for (idx = 0; idx < hashSigAlgoSz; idx += 2) { + int hashAlgo; + int sigAlgo; + + wolfSSL_get_sigalg_info(hashSigAlgo[idx+0], hashSigAlgo[idx+1], &hashAlgo, &sigAlgo); + + if (sigAlgo == RSAk || sigAlgo == RSAPSSk) + has_rsa_sig = 1; + else if (sigAlgo == ECDSAk) + has_ecdsa_sig = 1; + } + } + + /* we need to transform this into a NULL-ended string in lowecase */ + for (i = 0; i < trash.size && servername[i] != '\0'; i++) + trash.area[i] = tolower(servername[i]); + trash.area[i] = 0; + servername = trash.area; + + HA_RWLOCK_RDLOCK(SNI_LOCK, &s->sni_lock); + sni_ctx = ssl_sock_chose_sni_ctx(s, servername, has_rsa_sig, has_ecdsa_sig); + if (sni_ctx) { + /* switch ctx */ + struct ssl_bind_conf *conf = sni_ctx->conf; + ssl_sock_switchctx_set(ssl, sni_ctx->ctx); + if (conf) { + methodVersions[conf->ssl_methods.min].ssl_set_version(ssl, SET_MIN); + methodVersions[conf->ssl_methods.max].ssl_set_version(ssl, SET_MAX); + } + HA_RWLOCK_RDUNLOCK(SNI_LOCK, &s->sni_lock); + goto allow_early; + } + + HA_RWLOCK_RDUNLOCK(SNI_LOCK, &s->sni_lock); + if (!s->strict_sni) { + /* no certificate match, is the default_ctx */ + HA_RWLOCK_RDLOCK(SNI_LOCK, &s->sni_lock); + ssl_sock_switchctx_set(ssl, s->default_ctx); + HA_RWLOCK_RDUNLOCK(SNI_LOCK, &s->sni_lock); + goto allow_early; + } + + /* We are about to raise an handshake error so the servername extension + * callback will never be called and the SNI will never be stored in the + * SSL context. In order for the ssl_fc_sni sample fetch to still work + * in such a case, we store the SNI ourselves as an ex_data information + * in the SSL context. 
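+ * (Illustrative note: the sample fetch can later retrieve it with
+ * SSL_get_ex_data(ssl, ssl_client_sni_index) even though the handshake
+ * itself is being aborted.)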
+ */ + { + char *client_sni = pool_alloc(ssl_sock_client_sni_pool); + if (client_sni) { + strncpy(client_sni, servername, TLSEXT_MAXLEN_host_name); + client_sni[TLSEXT_MAXLEN_host_name] = '\0'; + SSL_set_ex_data(ssl, ssl_client_sni_index, client_sni); + } + } + + /* other cases fallback on abort, if strict-sni is set but no node was found */ + + abort: + /* abort handshake (was SSL_TLSEXT_ERR_ALERT_FATAL) */ + return 0; + +allow_early: + return 1; +} +#endif + +#ifndef OPENSSL_NO_DH + +static inline HASSL_DH *ssl_new_dh_fromdata(BIGNUM *p, BIGNUM *g) +{ +#if (HA_OPENSSL_VERSION_NUMBER >= 0x3000000fL) + OSSL_PARAM_BLD *tmpl = NULL; + OSSL_PARAM *params = NULL; + EVP_PKEY_CTX *ctx = NULL; + EVP_PKEY *pkey = NULL; + + if ((tmpl = OSSL_PARAM_BLD_new()) == NULL + || !OSSL_PARAM_BLD_push_BN(tmpl, OSSL_PKEY_PARAM_FFC_P, p) + || !OSSL_PARAM_BLD_push_BN(tmpl, OSSL_PKEY_PARAM_FFC_G, g) + || (params = OSSL_PARAM_BLD_to_param(tmpl)) == NULL) { + goto end; + } + ctx = EVP_PKEY_CTX_new_from_name(NULL, "DH", NULL); + if (ctx == NULL + || !EVP_PKEY_fromdata_init(ctx) + || !EVP_PKEY_fromdata(ctx, &pkey, EVP_PKEY_KEY_PARAMETERS, params)) { + goto end; + } + +end: + EVP_PKEY_CTX_free(ctx); + OSSL_PARAM_free(params); + OSSL_PARAM_BLD_free(tmpl); + BN_free(p); + BN_free(g); + return pkey; +#else + + HASSL_DH *dh = DH_new(); + + if (!dh) + return NULL; + + DH_set0_pqg(dh, p, NULL, g); + + return dh; +#endif +} + +#if (HA_OPENSSL_VERSION_NUMBER >= 0x10101000L) +static inline HASSL_DH *ssl_get_dh_by_nid(int nid) +{ +#if (HA_OPENSSL_VERSION_NUMBER >= 0x3000000fL) + OSSL_PARAM params[2]; + EVP_PKEY *pkey = NULL; + EVP_PKEY_CTX *pctx = EVP_PKEY_CTX_new_from_name(NULL, "DH", NULL); + const char *named_group = NULL; + + if (!pctx) + goto end; + + named_group = OBJ_nid2ln(nid); + + if (!named_group) + goto end; + + params[0] = OSSL_PARAM_construct_utf8_string("group", (char*)named_group, 0); + params[1] = OSSL_PARAM_construct_end(); + + if (EVP_PKEY_keygen_init(pctx) && EVP_PKEY_CTX_set_params(pctx, params)) + EVP_PKEY_generate(pctx, &pkey); + +end: + EVP_PKEY_CTX_free(pctx); + return pkey; +#else + + HASSL_DH *dh = NULL; + dh = DH_new_by_nid(nid); + return dh; +#endif +} +#endif + + +static HASSL_DH * ssl_get_dh_1024(void) +{ + static unsigned char dh1024_p[]={ + 0xFA,0xF9,0x2A,0x22,0x2A,0xA7,0x7F,0xE1,0x67,0x4E,0x53,0xF7, + 0x56,0x13,0xC3,0xB1,0xE3,0x29,0x6B,0x66,0x31,0x6A,0x7F,0xB3, + 0xC2,0x68,0x6B,0xCB,0x1D,0x57,0x39,0x1D,0x1F,0xFF,0x1C,0xC9, + 0xA6,0xA4,0x98,0x82,0x31,0x5D,0x25,0xFF,0x8A,0xE0,0x73,0x96, + 0x81,0xC8,0x83,0x79,0xC1,0x5A,0x04,0xF8,0x37,0x0D,0xA8,0x3D, + 0xAE,0x74,0xBC,0xDB,0xB6,0xA4,0x75,0xD9,0x71,0x8A,0xA0,0x17, + 0x9E,0x2D,0xC8,0xA8,0xDF,0x2C,0x5F,0x82,0x95,0xF8,0x92,0x9B, + 0xA7,0x33,0x5F,0x89,0x71,0xC8,0x2D,0x6B,0x18,0x86,0xC4,0x94, + 0x22,0xA5,0x52,0x8D,0xF6,0xF6,0xD2,0x37,0x92,0x0F,0xA5,0xCC, + 0xDB,0x7B,0x1D,0x3D,0xA1,0x31,0xB7,0x80,0x8F,0x0B,0x67,0x5E, + 0x36,0xA5,0x60,0x0C,0xF1,0x95,0x33,0x8B, + }; + static unsigned char dh1024_g[]={ + 0x02, + }; + + BIGNUM *p; + BIGNUM *g; + + HASSL_DH *dh = NULL; + + p = BN_bin2bn(dh1024_p, sizeof dh1024_p, NULL); + g = BN_bin2bn(dh1024_g, sizeof dh1024_g, NULL); + + if (p && g) + dh = ssl_new_dh_fromdata(p, g); + + return dh; +} + +static HASSL_DH *ssl_get_dh_2048(void) +{ +#if (HA_OPENSSL_VERSION_NUMBER < 0x10101000L) + static unsigned char dh2048_p[]={ + 0xEC,0x86,0xF8,0x70,0xA0,0x33,0x16,0xEC,0x05,0x1A,0x73,0x59, + 0xCD,0x1F,0x8B,0xF8,0x29,0xE4,0xD2,0xCF,0x52,0xDD,0xC2,0x24, + 0x8D,0xB5,0x38,0x9A,0xFB,0x5C,0xA4,0xE4,0xB2,0xDA,0xCE,0x66, + 
0x50,0x74,0xA6,0x85,0x4D,0x4B,0x1D,0x30,0xB8,0x2B,0xF3,0x10, + 0xE9,0xA7,0x2D,0x05,0x71,0xE7,0x81,0xDF,0x8B,0x59,0x52,0x3B, + 0x5F,0x43,0x0B,0x68,0xF1,0xDB,0x07,0xBE,0x08,0x6B,0x1B,0x23, + 0xEE,0x4D,0xCC,0x9E,0x0E,0x43,0xA0,0x1E,0xDF,0x43,0x8C,0xEC, + 0xBE,0xBE,0x90,0xB4,0x51,0x54,0xB9,0x2F,0x7B,0x64,0x76,0x4E, + 0x5D,0xD4,0x2E,0xAE,0xC2,0x9E,0xAE,0x51,0x43,0x59,0xC7,0x77, + 0x9C,0x50,0x3C,0x0E,0xED,0x73,0x04,0x5F,0xF1,0x4C,0x76,0x2A, + 0xD8,0xF8,0xCF,0xFC,0x34,0x40,0xD1,0xB4,0x42,0x61,0x84,0x66, + 0x42,0x39,0x04,0xF8,0x68,0xB2,0x62,0xD7,0x55,0xED,0x1B,0x74, + 0x75,0x91,0xE0,0xC5,0x69,0xC1,0x31,0x5C,0xDB,0x7B,0x44,0x2E, + 0xCE,0x84,0x58,0x0D,0x1E,0x66,0x0C,0xC8,0x44,0x9E,0xFD,0x40, + 0x08,0x67,0x5D,0xFB,0xA7,0x76,0x8F,0x00,0x11,0x87,0xE9,0x93, + 0xF9,0x7D,0xC4,0xBC,0x74,0x55,0x20,0xD4,0x4A,0x41,0x2F,0x43, + 0x42,0x1A,0xC1,0xF2,0x97,0x17,0x49,0x27,0x37,0x6B,0x2F,0x88, + 0x7E,0x1C,0xA0,0xA1,0x89,0x92,0x27,0xD9,0x56,0x5A,0x71,0xC1, + 0x56,0x37,0x7E,0x3A,0x9D,0x05,0xE7,0xEE,0x5D,0x8F,0x82,0x17, + 0xBC,0xE9,0xC2,0x93,0x30,0x82,0xF9,0xF4,0xC9,0xAE,0x49,0xDB, + 0xD0,0x54,0xB4,0xD9,0x75,0x4D,0xFA,0x06,0xB8,0xD6,0x38,0x41, + 0xB7,0x1F,0x77,0xF3, + }; + static unsigned char dh2048_g[]={ + 0x02, + }; + + BIGNUM *p; + BIGNUM *g; + + HASSL_DH *dh = NULL; + + p = BN_bin2bn(dh2048_p, sizeof dh2048_p, NULL); + g = BN_bin2bn(dh2048_g, sizeof dh2048_g, NULL); + + if (p && g) + dh = ssl_new_dh_fromdata(p, g); + + return dh; +#else + return ssl_get_dh_by_nid(NID_ffdhe2048); +#endif +} + +static HASSL_DH *ssl_get_dh_4096(void) +{ +#if (HA_OPENSSL_VERSION_NUMBER < 0x10101000L) + static unsigned char dh4096_p[]={ + 0xDE,0x16,0x94,0xCD,0x99,0x58,0x07,0xF1,0xF7,0x32,0x96,0x11, + 0x04,0x82,0xD4,0x84,0x72,0x80,0x99,0x06,0xCA,0xF0,0xA3,0x68, + 0x07,0xCE,0x64,0x50,0xE7,0x74,0x45,0x20,0x80,0x5E,0x4D,0xAD, + 0xA5,0xB6,0xED,0xFA,0x80,0x6C,0x3B,0x35,0xC4,0x9A,0x14,0x6B, + 0x32,0xBB,0xFD,0x1F,0x17,0x8E,0xB7,0x1F,0xD6,0xFA,0x3F,0x7B, + 0xEE,0x16,0xA5,0x62,0x33,0x0D,0xED,0xBC,0x4E,0x58,0xE5,0x47, + 0x4D,0xE9,0xAB,0x8E,0x38,0xD3,0x6E,0x90,0x57,0xE3,0x22,0x15, + 0x33,0xBD,0xF6,0x43,0x45,0xB5,0x10,0x0A,0xBE,0x2C,0xB4,0x35, + 0xB8,0x53,0x8D,0xAD,0xFB,0xA7,0x1F,0x85,0x58,0x41,0x7A,0x79, + 0x20,0x68,0xB3,0xE1,0x3D,0x08,0x76,0xBF,0x86,0x0D,0x49,0xE3, + 0x82,0x71,0x8C,0xB4,0x8D,0x81,0x84,0xD4,0xE7,0xBE,0x91,0xDC, + 0x26,0x39,0x48,0x0F,0x35,0xC4,0xCA,0x65,0xE3,0x40,0x93,0x52, + 0x76,0x58,0x7D,0xDD,0x51,0x75,0xDC,0x69,0x61,0xBF,0x47,0x2C, + 0x16,0x68,0x2D,0xC9,0x29,0xD3,0xE6,0xC0,0x99,0x48,0xA0,0x9A, + 0xC8,0x78,0xC0,0x6D,0x81,0x67,0x12,0x61,0x3F,0x71,0xBA,0x41, + 0x1F,0x6C,0x89,0x44,0x03,0xBA,0x3B,0x39,0x60,0xAA,0x28,0x55, + 0x59,0xAE,0xB8,0xFA,0xCB,0x6F,0xA5,0x1A,0xF7,0x2B,0xDD,0x52, + 0x8A,0x8B,0xE2,0x71,0xA6,0x5E,0x7E,0xD8,0x2E,0x18,0xE0,0x66, + 0xDF,0xDD,0x22,0x21,0x99,0x52,0x73,0xA6,0x33,0x20,0x65,0x0E, + 0x53,0xE7,0x6B,0x9B,0xC5,0xA3,0x2F,0x97,0x65,0x76,0xD3,0x47, + 0x23,0x77,0x12,0xB6,0x11,0x7B,0x24,0xED,0xF1,0xEF,0xC0,0xE2, + 0xA3,0x7E,0x67,0x05,0x3E,0x96,0x4D,0x45,0xC2,0x18,0xD1,0x73, + 0x9E,0x07,0xF3,0x81,0x6E,0x52,0x63,0xF6,0x20,0x76,0xB9,0x13, + 0xD2,0x65,0x30,0x18,0x16,0x09,0x16,0x9E,0x8F,0xF1,0xD2,0x10, + 0x5A,0xD3,0xD4,0xAF,0x16,0x61,0xDA,0x55,0x2E,0x18,0x5E,0x14, + 0x08,0x54,0x2E,0x2A,0x25,0xA2,0x1A,0x9B,0x8B,0x32,0xA9,0xFD, + 0xC2,0x48,0x96,0xE1,0x80,0xCA,0xE9,0x22,0x17,0xBB,0xCE,0x3E, + 0x9E,0xED,0xC7,0xF1,0x1F,0xEC,0x17,0x21,0xDC,0x7B,0x82,0x48, + 0x8E,0xBB,0x4B,0x9D,0x5B,0x04,0x04,0xDA,0xDB,0x39,0xDF,0x01, + 0x40,0xC3,0xAA,0x26,0x23,0x89,0x75,0xC6,0x0B,0xD0,0xA2,0x60, + 
0x6A,0xF1,0xCC,0x65,0x18,0x98,0x1B,0x52,0xD2,0x74,0x61,0xCC, + 0xBD,0x60,0xAE,0xA3,0xA0,0x66,0x6A,0x16,0x34,0x92,0x3F,0x41, + 0x40,0x31,0x29,0xC0,0x2C,0x63,0xB2,0x07,0x8D,0xEB,0x94,0xB8, + 0xE8,0x47,0x92,0x52,0x93,0x6A,0x1B,0x7E,0x1A,0x61,0xB3,0x1B, + 0xF0,0xD6,0x72,0x9B,0xF1,0xB0,0xAF,0xBF,0x3E,0x65,0xEF,0x23, + 0x1D,0x6F,0xFF,0x70,0xCD,0x8A,0x4C,0x8A,0xA0,0x72,0x9D,0xBE, + 0xD4,0xBB,0x24,0x47,0x4A,0x68,0xB5,0xF5,0xC6,0xD5,0x7A,0xCD, + 0xCA,0x06,0x41,0x07,0xAD,0xC2,0x1E,0xE6,0x54,0xA7,0xAD,0x03, + 0xD9,0x12,0xC1,0x9C,0x13,0xB1,0xC9,0x0A,0x43,0x8E,0x1E,0x08, + 0xCE,0x50,0x82,0x73,0x5F,0xA7,0x55,0x1D,0xD9,0x59,0xAC,0xB5, + 0xEA,0x02,0x7F,0x6C,0x5B,0x74,0x96,0x98,0x67,0x24,0xA3,0x0F, + 0x15,0xFC,0xA9,0x7D,0x3E,0x67,0xD1,0x70,0xF8,0x97,0xF3,0x67, + 0xC5,0x8C,0x88,0x44,0x08,0x02,0xC7,0x2B, + }; + static unsigned char dh4096_g[]={ + 0x02, + }; + + BIGNUM *p; + BIGNUM *g; + + HASSL_DH *dh = NULL; + + p = BN_bin2bn(dh4096_p, sizeof dh4096_p, NULL); + g = BN_bin2bn(dh4096_g, sizeof dh4096_g, NULL); + + if (p && g) + dh = ssl_new_dh_fromdata(p, g); + + return dh; +#else + return ssl_get_dh_by_nid(NID_ffdhe4096); +#endif +} + +static HASSL_DH *ssl_get_tmp_dh(EVP_PKEY *pkey) +{ + HASSL_DH *dh = NULL; + int type; + int keylen = 0; + + type = pkey ? EVP_PKEY_base_id(pkey) : EVP_PKEY_NONE; + + if (type == EVP_PKEY_EC) { + keylen = global_ssl.default_dh_param; + } + + /* The keylen supplied by OpenSSL can only be 512 or 1024. + See ssl3_send_server_key_exchange() in ssl/s3_srvr.c + */ + if (type == EVP_PKEY_RSA || type == EVP_PKEY_DSA) { + keylen = EVP_PKEY_bits(pkey); + } + + if (keylen > global_ssl.default_dh_param) { + keylen = global_ssl.default_dh_param; + } + + if (keylen >= 4096) { + if (!local_dh_4096) + local_dh_4096 = ssl_get_dh_4096(); + dh = local_dh_4096; + } + else if (keylen >= 2048) { + if (!local_dh_2048) + local_dh_2048 = ssl_get_dh_2048(); + dh = local_dh_2048; + } + else { + if (!local_dh_1024) + local_dh_1024 = ssl_get_dh_1024(); + dh = local_dh_1024; + } + + return dh; +} + +#if (HA_OPENSSL_VERSION_NUMBER < 0x3000000fL) +/* Returns Diffie-Hellman parameters matching the private key length + but not exceeding global_ssl.default_dh_param */ +static HASSL_DH *ssl_get_tmp_dh_cbk(SSL *ssl, int export, int keylen) +{ + EVP_PKEY *pkey = SSL_get_privatekey(ssl); + + return ssl_get_tmp_dh(pkey); +} +#endif + +static int ssl_sock_set_tmp_dh(SSL_CTX *ctx, HASSL_DH *dh) +{ +#if (HA_OPENSSL_VERSION_NUMBER < 0x3000000fL) + return SSL_CTX_set_tmp_dh(ctx, dh); +#else + int retval = 0; + HASSL_DH_up_ref(dh); + + retval = SSL_CTX_set0_tmp_dh_pkey(ctx, dh); + + if (!retval) + HASSL_DH_free(dh); + + return retval; +#endif +} + +#if (HA_OPENSSL_VERSION_NUMBER >= 0x3000000fL) +static void ssl_sock_set_tmp_dh_from_pkey(SSL_CTX *ctx, EVP_PKEY *pkey) +{ + HASSL_DH *dh = NULL; + if (pkey && (dh = ssl_get_tmp_dh(pkey))) { + HASSL_DH_up_ref(dh); + if (!SSL_CTX_set0_tmp_dh_pkey(ctx, dh)) + HASSL_DH_free(dh); + } +} +#endif + +HASSL_DH *ssl_sock_get_dh_from_bio(BIO *bio) +{ +#if (HA_OPENSSL_VERSION_NUMBER >= 0x3000000fL) + HASSL_DH *dh = NULL; + OSSL_DECODER_CTX *dctx = NULL; + const char *format = "PEM"; + const char *keytype = "DH"; + + dctx = OSSL_DECODER_CTX_new_for_pkey(&dh, format, NULL, keytype, + OSSL_KEYMGMT_SELECT_DOMAIN_PARAMETERS, + NULL, NULL); + + if (dctx == NULL || OSSL_DECODER_CTX_get_num_decoders(dctx) == 0) + goto end; + + /* The DH parameters might not be the first section found in the PEM + * file so we need to iterate over all of them until we find the right + * one. 
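+	 * (For instance, a PEM bundle may hold an "EC PARAMETERS" or
+	 * "CERTIFICATE" section before the "DH PARAMETERS" one; as the
+	 * loop below assumes, each OSSL_DECODER_from_bio() call consumes
+	 * one section and <dh> is only set once a DH section is decoded.)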
+ */ + while (!BIO_eof(bio) && !dh) + OSSL_DECODER_from_bio(dctx, bio); + +end: + OSSL_DECODER_CTX_free(dctx); + return dh; +#else + HASSL_DH *dh = NULL; + + dh = PEM_read_bio_DHparams(bio, NULL, NULL, NULL); + + return dh; +#endif +} + +static HASSL_DH * ssl_sock_get_dh_from_file(const char *filename) +{ + HASSL_DH *dh = NULL; + BIO *in = BIO_new(BIO_s_file()); + + if (in == NULL) + goto end; + + if (BIO_read_filename(in, filename) <= 0) + goto end; + + dh = ssl_sock_get_dh_from_bio(in); + +end: + if (in) + BIO_free(in); + + ERR_clear_error(); + + return dh; +} + +int ssl_sock_load_global_dh_param_from_file(const char *filename) +{ + global_dh = ssl_sock_get_dh_from_file(filename); + + if (global_dh) { + return 0; + } + + return -1; +} +#endif + +/* This function allocates a sni_ctx and adds it to the ckch_inst */ +static int ckch_inst_add_cert_sni(SSL_CTX *ctx, struct ckch_inst *ckch_inst, + struct bind_conf *s, struct ssl_bind_conf *conf, + struct pkey_info kinfo, char *name, int order) +{ + struct sni_ctx *sc; + int wild = 0, neg = 0; + + if (*name == '!') { + neg = 1; + name++; + } + if (*name == '*') { + wild = 1; + name++; + } + /* !* filter is a nop */ + if (neg && wild) + return order; + if (*name) { + int j, len; + len = strlen(name); + for (j = 0; j < len && j < trash.size; j++) + trash.area[j] = tolower((unsigned char)name[j]); + if (j >= trash.size) + return -1; + trash.area[j] = 0; + + sc = malloc(sizeof(struct sni_ctx) + len + 1); + if (!sc) + return -1; + memcpy(sc->name.key, trash.area, len + 1); + SSL_CTX_up_ref(ctx); + sc->ctx = ctx; + sc->conf = conf; + sc->kinfo = kinfo; + sc->order = order++; + sc->neg = neg; + sc->wild = wild; + sc->name.node.leaf_p = NULL; + sc->ckch_inst = ckch_inst; + LIST_APPEND(&ckch_inst->sni_ctx, &sc->by_ckch_inst); + } + return order; +} + +/* + * Insert the sni_ctxs that are listed in the ckch_inst, in the bind_conf's sni_ctx tree + * This function can't return an error. + * + * *CAUTION*: The caller must lock the sni tree if called in multithreading mode + */ +void ssl_sock_load_cert_sni(struct ckch_inst *ckch_inst, struct bind_conf *bind_conf) +{ + + struct sni_ctx *sc0, *sc0b, *sc1; + struct ebmb_node *node; + + list_for_each_entry_safe(sc0, sc0b, &ckch_inst->sni_ctx, by_ckch_inst) { + + /* ignore if sc0 was already inserted in a tree */ + if (sc0->name.node.leaf_p) + continue; + + /* Check for duplicates. */ + if (sc0->wild) + node = ebst_lookup(&bind_conf->sni_w_ctx, (char *)sc0->name.key); + else + node = ebst_lookup(&bind_conf->sni_ctx, (char *)sc0->name.key); + + for (; node; node = ebmb_next_dup(node)) { + sc1 = ebmb_entry(node, struct sni_ctx, name); + if (sc1->ctx == sc0->ctx && sc1->conf == sc0->conf + && sc1->neg == sc0->neg && sc1->wild == sc0->wild) { + /* it's a duplicate, we should remove and free it */ + LIST_DELETE(&sc0->by_ckch_inst); + SSL_CTX_free(sc0->ctx); + ha_free(&sc0); + break; + } + } + + /* if duplicate, ignore the insertion */ + if (!sc0) + continue; + + if (sc0->wild) + ebst_insert(&bind_conf->sni_w_ctx, &sc0->name); + else + ebst_insert(&bind_conf->sni_ctx, &sc0->name); + } + + /* replace the default_ctx if required with the instance's ctx. 
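+ * (The default_ctx is the context used when no SNI matches; the
+ * previous one is released and the instance's ctx takes its place
+ * under an extra reference.)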
*/ + if (ckch_inst->is_default) { + SSL_CTX_free(bind_conf->default_ctx); + SSL_CTX_up_ref(ckch_inst->ctx); + bind_conf->default_ctx = ckch_inst->ctx; + bind_conf->default_inst = ckch_inst; + } +} + +/* + * tree used to store the ckchs ordered by filename/bundle name + */ +struct eb_root ckchs_tree = EB_ROOT_UNIQUE; + +/* tree of crtlist (crt-list/directory) */ +struct eb_root crtlists_tree = EB_ROOT_UNIQUE; + +/* Loads Diffie-Hellman parameter from a ckchs to an SSL_CTX. + * If there is no DH parameter available in the ckchs, the global + * DH parameter is loaded into the SSL_CTX and if there is no + * DH parameter available in ckchs nor in global, the default + * DH parameters are applied on the SSL_CTX. + * Returns a bitfield containing the flags: + * ERR_FATAL in any fatal error case + * ERR_ALERT if a reason of the error is availabine in err + * ERR_WARN if a warning is available into err + * The value 0 means there is no error nor warning and + * the operation succeed. + */ +#ifndef OPENSSL_NO_DH +static int ssl_sock_load_dh_params(SSL_CTX *ctx, const struct ckch_data *data, + const char *path, char **err) +{ + int ret = 0; + HASSL_DH *dh = NULL; + + if (data && data->dh) { + dh = data->dh; + if (!ssl_sock_set_tmp_dh(ctx, dh)) { + memprintf(err, "%sunable to load the DH parameter specified in '%s'", + err && *err ? *err : "", path); + memprintf(err, "%s, DH ciphers won't be available.\n", + err && *err ? *err : ""); + ret |= ERR_WARN; + goto end; + } + + if (ssl_dh_ptr_index >= 0) { + /* store a pointer to the DH params to avoid complaining about + ssl-default-dh-param not being set for this SSL_CTX */ + SSL_CTX_set_ex_data(ctx, ssl_dh_ptr_index, dh); + } + } + else if (global_dh) { + if (!ssl_sock_set_tmp_dh(ctx, global_dh)) { + memprintf(err, "%sunable to use the global DH parameter for certificate '%s'", + err && *err ? *err : "", path); + memprintf(err, "%s, DH ciphers won't be available.\n", + err && *err ? *err : ""); + ret |= ERR_WARN; + goto end; + } + } + else { + /* Clear openssl global errors stack */ + ERR_clear_error(); + + /* We do not want DHE ciphers to be added to the cipher list + * unless there is an explicit global dh option in the conf. + */ + if (global_ssl.default_dh_param) { + if (global_ssl.default_dh_param <= 1024) { + /* we are limited to DH parameter of 1024 bits anyway */ + if (local_dh_1024 == NULL) + local_dh_1024 = ssl_get_dh_1024(); + + if (local_dh_1024 == NULL) { + memprintf(err, "%sunable to load default 1024 bits DH parameter for certificate '%s'.\n", + err && *err ? *err : "", path); + ret |= ERR_ALERT | ERR_FATAL; + goto end; + } + + if (!ssl_sock_set_tmp_dh(ctx, local_dh_1024)) { + memprintf(err, "%sunable to load default 1024 bits DH parameter for certificate '%s'.\n", + err && *err ? *err : "", path); + memprintf(err, "%s, DH ciphers won't be available.\n", + err && *err ? *err : ""); + ret |= ERR_WARN; + goto end; + } + } + else { +#if (HA_OPENSSL_VERSION_NUMBER < 0x3000000fL) + SSL_CTX_set_tmp_dh_callback(ctx, ssl_get_tmp_dh_cbk); +#else + ssl_sock_set_tmp_dh_from_pkey(ctx, data ? data->key : NULL); +#endif + } + } + } + +end: + ERR_clear_error(); + return ret; +} +#endif + + +/* Load a certificate chain into an SSL context. + * Returns a bitfield containing the flags: + * ERR_FATAL in any fatal error case + * ERR_ALERT if the reason of the error is available in err + * ERR_WARN if a warning is available into err + * The caller is responsible of freeing the newly built or newly refcounted + * find_chain element. 
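+ * It is typically freed via sk_X509_pop_free(find_chain, X509_free),
+ * as the callers below do.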
+ * The value 0 means there is no error nor warning and
+ * the operation succeeded.
+ */
+static int ssl_sock_load_cert_chain(const char *path, const struct ckch_data *data,
+ SSL_CTX *ctx, STACK_OF(X509) **find_chain, char **err)
+{
+ int errcode = 0;
+ int ret;
+
+ ERR_clear_error();
+
+ if (find_chain == NULL) {
+ errcode |= ERR_FATAL;
+ goto end;
+ }
+
+ if (!SSL_CTX_use_certificate(ctx, data->cert)) {
+ ret = ERR_get_error();
+ memprintf(err, "%sunable to load SSL certificate into SSL Context '%s': %s.\n",
+ err && *err ? *err : "", path, ERR_reason_error_string(ret));
+ errcode |= ERR_ALERT | ERR_FATAL;
+ goto end;
+ }
+
+ if (data->chain) {
+ *find_chain = X509_chain_up_ref(data->chain);
+ } else {
+ /* Find Certificate Chain in global */
+ struct issuer_chain *issuer;
+ issuer = ssl_get0_issuer_chain(data->cert);
+ if (issuer)
+ *find_chain = X509_chain_up_ref(issuer->chain);
+ }
+
+ if (!*find_chain) {
+ /* always put a null chain stack in the SSL_CTX so it does not
+ * try to build the chain from the verify store */
+ *find_chain = sk_X509_new_null();
+ }
+
+ /* Load all certs in the data into the ctx_chain for the ssl_ctx */
+#ifdef SSL_CTX_set1_chain
+ if (!SSL_CTX_set1_chain(ctx, *find_chain)) {
+ ret = ERR_get_error();
+ memprintf(err, "%sunable to load chain certificate into SSL Context '%s': %s.\n",
+ err && *err ? *err : "", path, ERR_reason_error_string(ret));
+ errcode |= ERR_ALERT | ERR_FATAL;
+ goto end;
+ }
+#else
+ { /* legacy compat (< openssl 1.0.2) */
+ X509 *ca;
+ while ((ca = sk_X509_shift(*find_chain)))
+ if (!SSL_CTX_add_extra_chain_cert(ctx, ca)) {
+ memprintf(err, "%sunable to load chain certificate into SSL Context '%s'.\n",
+ err && *err ? *err : "", path);
+ X509_free(ca);
+ errcode |= ERR_ALERT | ERR_FATAL;
+ goto end;
+ }
+ }
+#endif
+
+#ifdef SSL_CTX_build_cert_chain
+ /* remove the Root CA from the SSL_CTX if the option is activated */
+ if (global_ssl.skip_self_issued_ca) {
+ if (!SSL_CTX_build_cert_chain(ctx, SSL_BUILD_CHAIN_FLAG_NO_ROOT|SSL_BUILD_CHAIN_FLAG_UNTRUSTED|SSL_BUILD_CHAIN_FLAG_IGNORE_ERROR)) {
+ memprintf(err, "%sunable to load chain certificate into SSL Context '%s'.\n",
+ err && *err ? *err : "", path);
+ errcode |= ERR_ALERT | ERR_FATAL;
+ goto end;
+ }
+ }
+#endif
+
+end:
+ return errcode;
+}
+
+
+/* Loads the info in ckch into ctx
+ * Returns a bitfield containing the flags:
+ * ERR_FATAL in any fatal error case
+ * ERR_ALERT if the reason of the error is available in err
+ * ERR_WARN if a warning is available into err
+ * The value 0 means there is no error nor warning and
+ * the operation succeeded.
+ */
+static int ssl_sock_put_ckch_into_ctx(const char *path, struct ckch_data *data, SSL_CTX *ctx, char **err)
+{
+ int errcode = 0;
+ STACK_OF(X509) *find_chain = NULL;
+
+ ERR_clear_error();
+
+ if (SSL_CTX_use_PrivateKey(ctx, data->key) <= 0) {
+ int ret;
+
+ ret = ERR_get_error();
+ memprintf(err, "%sunable to load SSL private key into SSL Context '%s': %s.\n",
+ err && *err ? *err : "", path, ERR_reason_error_string(ret));
*err : "", path, ERR_reason_error_string(ret)); + errcode |= ERR_ALERT | ERR_FATAL; + return errcode; + } + + /* Load certificate chain */ + errcode |= ssl_sock_load_cert_chain(path, data, ctx, &find_chain, err); + if (errcode & ERR_CODE) + goto end; + +#ifndef OPENSSL_NO_DH + /* store a NULL pointer to indicate we have not yet loaded + a custom DH param file */ + if (ssl_dh_ptr_index >= 0) { + SSL_CTX_set_ex_data(ctx, ssl_dh_ptr_index, NULL); + } + + errcode |= ssl_sock_load_dh_params(ctx, data, path, err); + if (errcode & ERR_CODE) { + memprintf(err, "%sunable to load DH parameters from file '%s'.\n", + err && *err ? *err : "", path); + goto end; + } +#endif + +#ifdef HAVE_SSL_CTX_ADD_SERVER_CUSTOM_EXT + if (sctl_ex_index >= 0 && data->sctl) { + if (ssl_sock_load_sctl(ctx, data->sctl) < 0) { + memprintf(err, "%s '%s.sctl' is present but cannot be read or parsed'.\n", + err && *err ? *err : "", path); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } + } +#endif + +#if ((defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) || defined OPENSSL_IS_BORINGSSL) + /* Load OCSP Info into context + * If OCSP update mode is set to 'on', an entry will be created in the + * ocsp tree even if no ocsp_response was known during init, unless the + * frontend's conf disables ocsp update explicitly. + */ + if (ssl_sock_load_ocsp(path, ctx, data, find_chain) < 0) { + if (data->ocsp_response) + memprintf(err, "%s '%s.ocsp' is present and activates OCSP but it is impossible to compute the OCSP certificate ID (maybe the issuer could not be found)'.\n", + err && *err ? *err : "", path); + else + memprintf(err, "%s '%s' has an OCSP auto-update set to 'on' but an error occurred (maybe the OCSP URI or the issuer could not be found)'.\n", + err && *err ? *err : "", path); + errcode |= ERR_ALERT | ERR_FATAL; + goto end; + } +#endif + + end: + sk_X509_pop_free(find_chain, X509_free); + return errcode; +} + + +/* Loads the info of a ckch built out of a backend certificate into an SSL ctx + * Returns a bitfield containing the flags: + * ERR_FATAL in any fatal error case + * ERR_ALERT if the reason of the error is available in err + * ERR_WARN if a warning is available into err + * The value 0 means there is no error nor warning and + * the operation succeed. + */ +static int ssl_sock_put_srv_ckch_into_ctx(const char *path, const struct ckch_data *data, + SSL_CTX *ctx, char **err) +{ + int errcode = 0; + STACK_OF(X509) *find_chain = NULL; + + /* Load the private key */ + if (SSL_CTX_use_PrivateKey(ctx, data->key) <= 0) { + memprintf(err, "%sunable to load SSL private key into SSL Context '%s'.\n", + err && *err ? *err : "", path); + errcode |= ERR_ALERT | ERR_FATAL; + } + + /* Load certificate chain */ + errcode |= ssl_sock_load_cert_chain(path, data, ctx, &find_chain, err); + if (errcode & ERR_CODE) + goto end; + + if (SSL_CTX_check_private_key(ctx) <= 0) { + memprintf(err, "%sinconsistencies between private key and certificate loaded from PEM file '%s'.\n", + err && *err ? 
*err : "", path); + errcode |= ERR_ALERT | ERR_FATAL; + } + +end: + sk_X509_pop_free(find_chain, X509_free); + return errcode; +} + + +/* + * This function allocate a ckch_inst and create its snis + * + * Returns a bitfield containing the flags: + * ERR_FATAL in any fatal error case + * ERR_ALERT if the reason of the error is available in err + * ERR_WARN if a warning is available into err + */ +int ckch_inst_new_load_store(const char *path, struct ckch_store *ckchs, struct bind_conf *bind_conf, + struct ssl_bind_conf *ssl_conf, char **sni_filter, int fcount, struct ckch_inst **ckchi, char **err) +{ + SSL_CTX *ctx; + int i; + int order = 0; + X509_NAME *xname; + char *str; + EVP_PKEY *pkey; + struct pkey_info kinfo = { .sig = TLSEXT_signature_anonymous, .bits = 0 }; +#ifdef SSL_CTRL_SET_TLSEXT_HOSTNAME + STACK_OF(GENERAL_NAME) *names; +#endif + struct ckch_data *data; + struct ckch_inst *ckch_inst = NULL; + int errcode = 0; + + *ckchi = NULL; + + if (!ckchs || !ckchs->data) + return ERR_FATAL; + + data = ckchs->data; + + ctx = SSL_CTX_new(SSLv23_server_method()); + if (!ctx) { + memprintf(err, "%sunable to allocate SSL context for cert '%s'.\n", + err && *err ? *err : "", path); + errcode |= ERR_ALERT | ERR_FATAL; + goto error; + } + + errcode |= ssl_sock_put_ckch_into_ctx(path, data, ctx, err); + if (errcode & ERR_CODE) + goto error; + + ckch_inst = ckch_inst_new(); + if (!ckch_inst) { + memprintf(err, "%sunable to allocate SSL context for cert '%s'.\n", + err && *err ? *err : "", path); + errcode |= ERR_ALERT | ERR_FATAL; + goto error; + } + + pkey = X509_get_pubkey(data->cert); + if (pkey) { + kinfo.bits = EVP_PKEY_bits(pkey); + switch(EVP_PKEY_base_id(pkey)) { + case EVP_PKEY_RSA: + kinfo.sig = TLSEXT_signature_rsa; + break; + case EVP_PKEY_EC: + kinfo.sig = TLSEXT_signature_ecdsa; + break; + case EVP_PKEY_DSA: + kinfo.sig = TLSEXT_signature_dsa; + break; + } + EVP_PKEY_free(pkey); + } + + if (fcount) { + while (fcount--) { + order = ckch_inst_add_cert_sni(ctx, ckch_inst, bind_conf, ssl_conf, kinfo, sni_filter[fcount], order); + if (order < 0) { + memprintf(err, "%sunable to create a sni context.\n", err && *err ? *err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto error; + } + } + } + else { +#ifdef SSL_CTRL_SET_TLSEXT_HOSTNAME + names = X509_get_ext_d2i(data->cert, NID_subject_alt_name, NULL, NULL); + if (names) { + for (i = 0; i < sk_GENERAL_NAME_num(names); i++) { + GENERAL_NAME *name = sk_GENERAL_NAME_value(names, i); + if (name->type == GEN_DNS) { + if (ASN1_STRING_to_UTF8((unsigned char **)&str, name->d.dNSName) >= 0) { + order = ckch_inst_add_cert_sni(ctx, ckch_inst, bind_conf, ssl_conf, kinfo, str, order); + OPENSSL_free(str); + if (order < 0) { + memprintf(err, "%sunable to create a sni context.\n", err && *err ? *err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto error; + } + } + } + } + sk_GENERAL_NAME_pop_free(names, GENERAL_NAME_free); + } +#endif /* SSL_CTRL_SET_TLSEXT_HOSTNAME */ + xname = X509_get_subject_name(data->cert); + i = -1; + while ((i = X509_NAME_get_index_by_NID(xname, NID_commonName, i)) != -1) { + X509_NAME_ENTRY *entry = X509_NAME_get_entry(xname, i); + ASN1_STRING *value; + + value = X509_NAME_ENTRY_get_data(entry); + if (ASN1_STRING_to_UTF8((unsigned char **)&str, value) >= 0) { + order = ckch_inst_add_cert_sni(ctx, ckch_inst, bind_conf, ssl_conf, kinfo, str, order); + OPENSSL_free(str); + if (order < 0) { + memprintf(err, "%sunable to create a sni context.\n", err && *err ? 
*err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto error; + } + } + } + } + /* we must not free the SSL_CTX anymore below, since it's already in + * the tree, so it will be discovered and cleaned in time. + */ + +#ifndef SSL_CTRL_SET_TLSEXT_HOSTNAME + if (bind_conf->default_ctx) { + memprintf(err, "%sthis version of openssl cannot load multiple SSL certificates.\n", + err && *err ? *err : ""); + errcode |= ERR_ALERT | ERR_FATAL; + goto error; + } +#endif + if (!bind_conf->default_ctx) { + bind_conf->default_ctx = ctx; + bind_conf->default_ssl_conf = ssl_conf; + ckch_inst->is_default = 1; + SSL_CTX_up_ref(ctx); + bind_conf->default_inst = ckch_inst; + } + + /* Always keep a reference to the newly constructed SSL_CTX in the + * instance. This way if the instance has no SNIs, the SSL_CTX will + * still be linked. */ + SSL_CTX_up_ref(ctx); + ckch_inst->ctx = ctx; + + /* everything succeed, the ckch instance can be used */ + ckch_inst->bind_conf = bind_conf; + ckch_inst->ssl_conf = ssl_conf; + ckch_inst->ckch_store = ckchs; + + SSL_CTX_free(ctx); /* we need to free the ctx since we incremented the refcount where it's used */ + + *ckchi = ckch_inst; + return errcode; + +error: + /* free the allocated sni_ctxs */ + if (ckch_inst) { + if (ckch_inst->is_default) + SSL_CTX_free(ctx); + + ckch_inst_free(ckch_inst); + ckch_inst = NULL; + } + SSL_CTX_free(ctx); + + return errcode; +} + + +/* + * This function allocate a ckch_inst that will be used on the backend side + * (server line) + * + * Returns a bitfield containing the flags: + * ERR_FATAL in any fatal error case + * ERR_ALERT if the reason of the error is available in err + * ERR_WARN if a warning is available into err + */ +int ckch_inst_new_load_srv_store(const char *path, struct ckch_store *ckchs, + struct ckch_inst **ckchi, char **err) +{ + SSL_CTX *ctx; + struct ckch_data *data; + struct ckch_inst *ckch_inst = NULL; + int errcode = 0; + + *ckchi = NULL; + + if (!ckchs || !ckchs->data) + return ERR_FATAL; + + data = ckchs->data; + + ctx = SSL_CTX_new(SSLv23_client_method()); + if (!ctx) { + memprintf(err, "%sunable to allocate SSL context for cert '%s'.\n", + err && *err ? *err : "", path); + errcode |= ERR_ALERT | ERR_FATAL; + goto error; + } + + errcode |= ssl_sock_put_srv_ckch_into_ctx(path, data, ctx, err); + if (errcode & ERR_CODE) + goto error; + + ckch_inst = ckch_inst_new(); + if (!ckch_inst) { + memprintf(err, "%sunable to allocate SSL context for cert '%s'.\n", + err && *err ? *err : "", path); + errcode |= ERR_ALERT | ERR_FATAL; + goto error; + } + + /* everything succeed, the ckch instance can be used */ + ckch_inst->bind_conf = NULL; + ckch_inst->ssl_conf = NULL; + ckch_inst->ckch_store = ckchs; + ckch_inst->ctx = ctx; + ckch_inst->is_server_instance = 1; + + *ckchi = ckch_inst; + return errcode; + +error: + SSL_CTX_free(ctx); + + return errcode; +} + +/* Returns a set of ERR_* flags possibly with an error in <err>. 
+ */
+static int ssl_sock_load_ckchs(const char *path, struct ckch_store *ckchs,
+ struct bind_conf *bind_conf, struct ssl_bind_conf *ssl_conf,
+ char **sni_filter, int fcount, struct ckch_inst **ckch_inst, char **err)
+{
+ int errcode = 0;
+
+ /* we found the ckchs in the tree, we can use it directly */
+ errcode |= ckch_inst_new_load_store(path, ckchs, bind_conf, ssl_conf, sni_filter, fcount, ckch_inst, err);
+
+ if (errcode & ERR_CODE)
+ return errcode;
+
+ ssl_sock_load_cert_sni(*ckch_inst, bind_conf);
+
+ /* success: add the instance to the ckch_store's list of instances */
+ LIST_APPEND(&ckchs->ckch_inst, &((*ckch_inst)->by_ckchs));
+ return errcode;
+}
+
+/* This function generates a <struct ckch_inst *> for a <struct server *>, and
+ * fills the SSL_CTX of the server.
+ *
+ * Returns a set of ERR_* flags possibly with an error in <err>. */
+static int ssl_sock_load_srv_ckchs(const char *path, struct ckch_store *ckchs,
+ struct server *server, struct ckch_inst **ckch_inst, char **err)
+{
+ int errcode = 0;
+
+ /* we found the ckchs in the tree, we can use it directly */
+ errcode |= ckch_inst_new_load_srv_store(path, ckchs, ckch_inst, err);
+
+ if (errcode & ERR_CODE)
+ return errcode;
+
+ (*ckch_inst)->server = server;
+ /* Keep the reference to the SSL_CTX in the server. */
+ SSL_CTX_up_ref((*ckch_inst)->ctx);
+ server->ssl_ctx.ctx = (*ckch_inst)->ctx;
+ /* success: add the instance to the ckch_store's list of instances */
+ LIST_APPEND(&ckchs->ckch_inst, &((*ckch_inst)->by_ckchs));
+ return errcode;
+}
+
+
+
+
+/* Make sure openssl opens /dev/urandom before the chroot. The work is only
+ * done once. Zero is returned if the operation fails. No error is returned
+ * if randomness is reported as not implemented, because we expect that openssl
+ * will use another method once needed.
+ */
+int ssl_initialize_random(void)
+{
+ unsigned char random;
+ static int random_initialized = 0;
+
+ if (!random_initialized && RAND_bytes(&random, 1) != 0)
+ random_initialized = 1;
+
+ return random_initialized;
+}
+
+/* Load a crt-list file, this is done in 2 parts:
+ * - store the content of the file in a crtlist structure with crtlist_entry structures
+ * - generate the instances by iterating on entries in the crtlist struct
+ * (a typical crt-list line, with hypothetical values: "cert.pem [ssl-min-ver TLSv1.2] *.example.com")
+ *
+ * Nothing is locked here, this function is used in the configuration parser.
+ *
+ * Returns a set of ERR_* flags possibly with an error in <err>.
+ */
+int ssl_sock_load_cert_list_file(char *file, int dir, struct bind_conf *bind_conf, struct proxy *curproxy, char **err)
+{
+ struct crtlist *crtlist = NULL;
+ struct ebmb_node *eb;
+ struct crtlist_entry *entry = NULL;
+ struct bind_conf_list *bind_conf_node = NULL;
+ int cfgerr = 0;
+ char *end;
+
+ bind_conf_node = malloc(sizeof(*bind_conf_node));
+ if (!bind_conf_node) {
+ memprintf(err, "%sCan't alloc memory!\n", err && *err ? *err : "");
*err : ""); + cfgerr |= ERR_FATAL | ERR_ALERT; + goto error; + } + bind_conf_node->next = NULL; + bind_conf_node->bind_conf = bind_conf; + + /* strip trailing slashes, including first one */ + for (end = file + strlen(file) - 1; end >= file && *end == '/'; end--) + *end = 0; + + /* look for an existing crtlist or create one */ + eb = ebst_lookup(&crtlists_tree, file); + if (eb) { + crtlist = ebmb_entry(eb, struct crtlist, node); + } else { + /* load a crt-list OR a directory */ + if (dir) + cfgerr |= crtlist_load_cert_dir(file, bind_conf, &crtlist, err); + else + cfgerr |= crtlist_parse_file(file, bind_conf, curproxy, &crtlist, err); + + if (!(cfgerr & ERR_CODE)) + ebst_insert(&crtlists_tree, &crtlist->node); + } + + if (cfgerr & ERR_CODE) { + cfgerr |= ERR_FATAL | ERR_ALERT; + goto error; + } + + /* generates ckch instance from the crtlist_entry */ + list_for_each_entry(entry, &crtlist->ord_entries, by_crtlist) { + struct ckch_store *store; + struct ckch_inst *ckch_inst = NULL; + + store = entry->node.key; + cfgerr |= ssl_sock_load_ckchs(store->path, store, bind_conf, entry->ssl_conf, entry->filters, entry->fcount, &ckch_inst, err); + if (cfgerr & ERR_CODE) { + memprintf(err, "error processing line %d in file '%s' : %s", entry->linenum, file, *err); + goto error; + } + LIST_APPEND(&entry->ckch_inst, &ckch_inst->by_crtlist_entry); + ckch_inst->crtlist_entry = entry; + } + + /* add the bind_conf to the list */ + bind_conf_node->next = crtlist->bind_conf; + crtlist->bind_conf = bind_conf_node; + + return cfgerr; +error: + { + struct crtlist_entry *lastentry; + struct ckch_inst *inst, *s_inst; + + lastentry = entry; /* which entry we tried to generate last */ + if (lastentry) { + list_for_each_entry(entry, &crtlist->ord_entries, by_crtlist) { + if (entry == lastentry) /* last entry we tried to generate, no need to go further */ + break; + + list_for_each_entry_safe(inst, s_inst, &entry->ckch_inst, by_crtlist_entry) { + + /* this was not generated for this bind_conf, skip */ + if (inst->bind_conf != bind_conf) + continue; + + /* free the sni_ctx and instance */ + ckch_inst_free(inst); + } + } + } + free(bind_conf_node); + } + return cfgerr; +} + +/* Returns a set of ERR_* flags possibly with an error in <err>. */ +int ssl_sock_load_cert(char *path, struct bind_conf *bind_conf, char **err) +{ + struct stat buf; + int cfgerr = 0; + struct ckch_store *ckchs; + struct ckch_inst *ckch_inst = NULL; + int found = 0; /* did we found a file to load ? 
+
+ if ((ckchs = ckchs_lookup(path))) {
+ /* we found the ckchs in the tree, we can use it directly */
+ cfgerr |= ssl_sock_load_ckchs(path, ckchs, bind_conf, NULL, NULL, 0, &ckch_inst, err);
+ found++;
+ } else if (stat(path, &buf) == 0) {
+ found++;
+ if (S_ISDIR(buf.st_mode) == 0) {
+ ckchs = ckchs_load_cert_file(path, err);
+ if (!ckchs)
+ cfgerr |= ERR_ALERT | ERR_FATAL;
+ cfgerr |= ssl_sock_load_ckchs(path, ckchs, bind_conf, NULL, NULL, 0, &ckch_inst, err);
+ } else {
+ cfgerr |= ssl_sock_load_cert_list_file(path, 1, bind_conf, bind_conf->frontend, err);
+ }
+ } else {
+ /* stat failed, could be a bundle */
+ if (global_ssl.extra_files & SSL_GF_BUNDLE) {
+ char fp[MAXPATHLEN+1] = {0};
+ int n = 0;
+
+ /* Load all possible certs and keys in separate ckch_store */
+ for (n = 0; n < SSL_SOCK_NUM_KEYTYPES; n++) {
+ struct stat buf;
+ int ret;
+
+ ret = snprintf(fp, sizeof(fp), "%s.%s", path, SSL_SOCK_KEYTYPE_NAMES[n]);
+ if (ret > sizeof(fp))
+ continue;
+
+ if ((ckchs = ckchs_lookup(fp))) {
+ cfgerr |= ssl_sock_load_ckchs(fp, ckchs, bind_conf, NULL, NULL, 0, &ckch_inst, err);
+ found++;
+ } else {
+ if (stat(fp, &buf) == 0) {
+ found++;
+ ckchs = ckchs_load_cert_file(fp, err);
+ if (!ckchs)
+ cfgerr |= ERR_ALERT | ERR_FATAL;
+ cfgerr |= ssl_sock_load_ckchs(fp, ckchs, bind_conf, NULL, NULL, 0, &ckch_inst, err);
+ }
+ }
+ }
+#if HA_OPENSSL_VERSION_NUMBER < 0x10101000L
+ if (found) {
+ memprintf(err, "%sCan't load '%s'. Loading a multi certificates bundle requires OpenSSL >= 1.1.1\n",
+ err && *err ? *err : "", path);
+ cfgerr |= ERR_ALERT | ERR_FATAL;
+ }
+#endif
+ }
+ }
+ if (!found) {
+ memprintf(err, "%sunable to stat SSL certificate from file '%s' : %s.\n",
+ err && *err ? *err : "", path, strerror(errno));
+ cfgerr |= ERR_ALERT | ERR_FATAL;
+ }
+
+ return cfgerr;
+}
+
+
+/* Create a full ssl context and ckch instance that will be used for a specific
+ * backend server (server configuration line).
+ * Returns a set of ERR_* flags possibly with an error in <err>.
+ */
+int ssl_sock_load_srv_cert(char *path, struct server *server, int create_if_none, char **err)
+{
+ struct stat buf;
+ int cfgerr = 0;
+ struct ckch_store *ckchs;
+ int found = 0; /* did we find a file to load ? */
+
+ if ((ckchs = ckchs_lookup(path))) {
+ /* we found the ckchs in the tree, we can use it directly */
+ cfgerr |= ssl_sock_load_srv_ckchs(path, ckchs, server, &server->ssl_ctx.inst, err);
+ found++;
+ } else {
+ if (!create_if_none) {
+ memprintf(err, "%sunable to stat SSL certificate '%s'.\n",
+ err && *err ? *err : "", path);
+ cfgerr |= ERR_ALERT | ERR_FATAL;
+ goto out;
+ }
+
+ if (stat(path, &buf) == 0) {
+ /* We do not manage directories on the backend side. */
+ if (S_ISDIR(buf.st_mode) == 0) {
+ ++found;
+ ckchs = ckchs_load_cert_file(path, err);
+ if (!ckchs)
+ cfgerr |= ERR_ALERT | ERR_FATAL;
+ cfgerr |= ssl_sock_load_srv_ckchs(path, ckchs, server, &server->ssl_ctx.inst, err);
+ }
+ }
+ }
+ if (!found) {
+ memprintf(err, "%sunable to stat SSL certificate from file '%s' : %s.\n",
+ err && *err ? *err : "", path, strerror(errno));
*err : "", path, strerror(errno)); + cfgerr |= ERR_ALERT | ERR_FATAL; + } + +out: + return cfgerr; +} + +/* Create an initial CTX used to start the SSL connection before switchctx */ +static int +ssl_sock_initial_ctx(struct bind_conf *bind_conf) +{ + SSL_CTX *ctx = NULL; + long options = + SSL_OP_ALL | /* all known workarounds for bugs */ + SSL_OP_NO_SSLv2 | + SSL_OP_NO_COMPRESSION | + SSL_OP_SINGLE_DH_USE | + SSL_OP_SINGLE_ECDH_USE | + SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION | + SSL_OP_PRIORITIZE_CHACHA | + SSL_OP_CIPHER_SERVER_PREFERENCE; + long mode = + SSL_MODE_ENABLE_PARTIAL_WRITE | + SSL_MODE_ACCEPT_MOVING_WRITE_BUFFER | + SSL_MODE_RELEASE_BUFFERS | + SSL_MODE_SMALL_BUFFERS; + struct tls_version_filter *conf_ssl_methods = &bind_conf->ssl_conf.ssl_methods; + int i, min, max, hole; + int flags = MC_SSL_O_ALL; + int cfgerr = 0; + const int default_min_ver = CONF_TLSV12; + + ctx = SSL_CTX_new(SSLv23_server_method()); + bind_conf->initial_ctx = ctx; + + if (conf_ssl_methods->flags && (conf_ssl_methods->min || conf_ssl_methods->max)) + ha_warning("Proxy '%s': no-sslv3/no-tlsv1x are ignored for bind '%s' at [%s:%d]. " + "Use only 'ssl-min-ver' and 'ssl-max-ver' to fix.\n", + bind_conf->frontend->id, bind_conf->arg, bind_conf->file, bind_conf->line); + else + flags = conf_ssl_methods->flags; + + min = conf_ssl_methods->min; + max = conf_ssl_methods->max; + + /* default minimum is TLSV12, */ + if (!min) { + if (!max || (max >= default_min_ver)) { + min = default_min_ver; + } else { + ha_warning("Proxy '%s': Ambiguous configuration for bind '%s' at [%s:%d]: the ssl-min-ver value is not configured and the ssl-max-ver value is lower than the default ssl-min-ver value (%s). " + "Setting the ssl-min-ver to %s. Use 'ssl-min-ver' to fix this.\n", + bind_conf->frontend->id, bind_conf->arg, bind_conf->file, bind_conf->line, methodVersions[default_min_ver].name, methodVersions[max].name); + min = max; + } + } + /* Real min and max should be determinate with configuration and openssl's capabilities */ + if (min) + flags |= (methodVersions[min].flag - 1); + if (max) + flags |= ~((methodVersions[max].flag << 1) - 1); + /* find min, max and holes */ + min = max = CONF_TLSV_NONE; + hole = 0; + for (i = CONF_TLSV_MIN; i <= CONF_TLSV_MAX; i++) + /* version is in openssl && version not disable in configuration */ + if (methodVersions[i].option && !(flags & methodVersions[i].flag)) { + if (min) { + if (hole) { + ha_warning("Proxy '%s': SSL/TLS versions range not contiguous for bind '%s' at [%s:%d]. " + "Hole find for %s. Use only 'ssl-min-ver' and 'ssl-max-ver' to fix.\n", + bind_conf->frontend->id, bind_conf->arg, bind_conf->file, bind_conf->line, + methodVersions[hole].name); + hole = 0; + } + max = i; + } + else { + min = max = i; + } + } + else { + if (min) + hole = i; + } + if (!min) { + ha_alert("Proxy '%s': all SSL/TLS versions are disabled for bind '%s' at [%s:%d].\n", + bind_conf->frontend->id, bind_conf->arg, bind_conf->file, bind_conf->line); + cfgerr += 1; + } + /* save real min/max in bind_conf */ + conf_ssl_methods->min = min; + conf_ssl_methods->max = max; + +#if (HA_OPENSSL_VERSION_NUMBER < 0x1010000fL) + /* Keep force-xxx implementation as it is in older haproxy. It's a + precautionary measure to avoid any surprise with older openssl version. 
+ */
+ if (min == max)
+ methodVersions[min].ctx_set_version(ctx, SET_SERVER);
+ else
+ for (i = CONF_TLSV_MIN; i <= CONF_TLSV_MAX; i++) {
+ /* clear every version flag in case SSL_CTX_new()
+ * returns an SSL_CTX with disabled versions */
+ SSL_CTX_clear_options(ctx, methodVersions[i].option);
+
+ if (flags & methodVersions[i].flag)
+ options |= methodVersions[i].option;
+
+ }
+#else /* openssl >= 1.1.0 */
+ /* setting the max_version is required to cap the TLS version or activate new TLS versions (v1.3) */
+ methodVersions[min].ctx_set_version(ctx, SET_MIN);
+ methodVersions[max].ctx_set_version(ctx, SET_MAX);
+#endif
+
+ if (bind_conf->ssl_options & BC_SSL_O_NO_TLS_TICKETS)
+ options |= SSL_OP_NO_TICKET;
+ if (bind_conf->ssl_options & BC_SSL_O_PREF_CLIE_CIPH)
+ options &= ~SSL_OP_CIPHER_SERVER_PREFERENCE;
+
+#ifdef SSL_OP_NO_RENEGOTIATION
+ options |= SSL_OP_NO_RENEGOTIATION;
+#endif
+
+ SSL_CTX_set_options(ctx, options);
+
+#ifdef SSL_MODE_ASYNC
+ if (global_ssl.async)
+ mode |= SSL_MODE_ASYNC;
+#endif
+ SSL_CTX_set_mode(ctx, mode);
+ if (global_ssl.life_time)
+ SSL_CTX_set_timeout(ctx, global_ssl.life_time);
+
+#ifdef SSL_CTRL_SET_TLSEXT_HOSTNAME
+# ifdef OPENSSL_IS_BORINGSSL
+ SSL_CTX_set_select_certificate_cb(ctx, ssl_sock_switchctx_cbk);
+ SSL_CTX_set_tlsext_servername_callback(ctx, ssl_sock_switchctx_err_cbk);
+# elif defined(HAVE_SSL_CLIENT_HELLO_CB)
+# if defined(SSL_OP_NO_ANTI_REPLAY)
+ if (bind_conf->ssl_conf.early_data)
+ SSL_CTX_set_options(ctx, SSL_OP_NO_ANTI_REPLAY);
+# endif /* ! SSL_OP_NO_ANTI_REPLAY */
+ SSL_CTX_set_client_hello_cb(ctx, ssl_sock_switchctx_cbk, NULL);
+ SSL_CTX_set_tlsext_servername_callback(ctx, ssl_sock_switchctx_err_cbk);
+# elif 0 && defined(USE_OPENSSL_WOLFSSL)
+ SSL_CTX_set_cert_cb(ctx, ssl_sock_switchctx_wolfSSL_cbk, bind_conf);
+# else
+ /* ! OPENSSL_IS_BORINGSSL && ! HAVE_SSL_CLIENT_HELLO_CB */
+ SSL_CTX_set_tlsext_servername_callback(ctx, ssl_sock_switchctx_cbk);
+# endif
+ SSL_CTX_set_tlsext_servername_arg(ctx, bind_conf);
+#endif /* ! SSL_CTRL_SET_TLSEXT_HOSTNAME */
+ return cfgerr;
+}
+
+
+static inline void sh_ssl_sess_free_blocks(struct shared_block *first, void *data)
+{
+ struct sh_ssl_sess_hdr *sh_ssl_sess = (struct sh_ssl_sess_hdr *)first->data;
+ if (first->len > 0)
+ sh_ssl_sess_tree_delete(sh_ssl_sess);
+}
+
+/* return first block from sh_ssl_sess */
+static inline struct shared_block *sh_ssl_sess_first_block(struct sh_ssl_sess_hdr *sh_ssl_sess)
+{
+ return (struct shared_block *)((unsigned char *)sh_ssl_sess - offsetof(struct shared_block, data));
+
+}
+
+/* store a session into the cache
+ * s_id : session id padded with zero to SSL_MAX_SSL_SESSION_ID_LENGTH
+ * data: asn1 encoded session
+ * data_len: asn1 encoded session length
+ * Returns 1 if the session was stored (else 0)
+ */
+static int sh_ssl_sess_store(unsigned char *s_id, unsigned char *data, int data_len)
+{
+ struct shared_block *first;
+ struct sh_ssl_sess_hdr *sh_ssl_sess, *oldsh_ssl_sess;
+
+ first = shctx_row_reserve_hot(ssl_shctx, NULL, data_len + sizeof(struct sh_ssl_sess_hdr));
+ if (!first) {
+ /* Could not retrieve enough free blocks to store that session */
+ return 0;
+ }
+
+ shctx_wrlock(ssl_shctx);
+
+ /* STORE the key in the first elem */
+ sh_ssl_sess = (struct sh_ssl_sess_hdr *)first->data;
+ memcpy(sh_ssl_sess->key_data, s_id, SSL_MAX_SSL_SESSION_ID_LENGTH);
+ first->len = sizeof(struct sh_ssl_sess_hdr);
+
+ /* it returns the already existing node
+ or the current node if none, it never returns null */
+ oldsh_ssl_sess = sh_ssl_sess_tree_insert(sh_ssl_sess);
+ if (oldsh_ssl_sess != sh_ssl_sess) {
+ /* NOTE: the row cannot be in use because we lock the read & write functions */
+ /* release the reserved row */
+ first->len = 0; /* the len must be set to 0 in order not to call the release callback on it */
+ shctx_row_reattach(ssl_shctx, first);
+ /* replace the previous session already in the tree */
+ sh_ssl_sess = oldsh_ssl_sess;
+ /* ignore the previous session data, only use the header */
+ first = sh_ssl_sess_first_block(sh_ssl_sess);
+ shctx_row_detach(ssl_shctx, first);
+ first->len = sizeof(struct sh_ssl_sess_hdr);
+ }
+
+ if (shctx_row_data_append(ssl_shctx, first, data, data_len) < 0) {
+ shctx_row_reattach(ssl_shctx, first);
+ return 0;
+ }
+
+ shctx_row_reattach(ssl_shctx, first);
+
+ shctx_wrunlock(ssl_shctx);
+
+ return 1;
+}
+
+/* SSL callback used when a new session is created while connecting to a server */
+static int ssl_sess_new_srv_cb(SSL *ssl, SSL_SESSION *sess)
+{
+ struct connection *conn = SSL_get_ex_data(ssl, ssl_app_data_index);
+ struct server *s;
+ uint old_tid;
+
+ s = __objt_server(conn->target);
+
+ /* RWLOCK: only read lock the SSL cache even when writing in it because there is
+ * one cache per thread, it only prevents to flush it from the CLI in
+ * another thread. However, we also write-lock our session element while
+ * updating it to make sure no other thread is reading it while we're copying
+ * or releasing it.
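+ * (Concretely, the pattern below is: take ssl_ctx.lock for reading, then
+ * reused_sess[tid].sess_lock for writing around the update, and release
+ * both when done.)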
+ */
+
+ if (!(s->ssl_ctx.options & SRV_SSL_O_NO_REUSE)) {
+ int len;
+ unsigned char *ptr;
+ const char *sni;
+
+ /* determine the required len to store this new session */
+ len = i2d_SSL_SESSION(sess, NULL);
+ sni = SSL_get_servername(ssl, TLSEXT_NAMETYPE_host_name);
+ HA_RWLOCK_RDLOCK(SSL_SERVER_LOCK, &s->ssl_ctx.lock);
+
+ ptr = s->ssl_ctx.reused_sess[tid].ptr;
+
+ /* we're updating the possibly shared session right now */
+ HA_RWLOCK_WRLOCK(SSL_SERVER_LOCK, &s->ssl_ctx.reused_sess[tid].sess_lock);
+
+ if (!ptr || s->ssl_ctx.reused_sess[tid].allocated_size < len) {
+ /* insufficient storage, reallocate */
+ len = (len + 7) & -8; /* round up to a multiple of 8 bytes */
+ ptr = realloc(ptr, len);
+ if (!ptr)
+ free(s->ssl_ctx.reused_sess[tid].ptr);
+ s->ssl_ctx.reused_sess[tid].ptr = ptr;
+ s->ssl_ctx.reused_sess[tid].allocated_size = len;
+ }
+
+ if (ptr) {
+ /* store the new session into ptr and advance it; save the
+ * resulting size. It's guaranteed to be equal to the returned
+ * len above, and the pointer to be advanced by as much.
+ */
+ s->ssl_ctx.reused_sess[tid].size = i2d_SSL_SESSION(sess, &ptr);
+ }
+
+ /* done updating the session */
+
+ /* Now we'll try to add or remove this entry as a valid one:
+ * - if no entry is set and we have one, let's share it
+ * - if our entry was set and we have no more, let's clear it
+ */
+ old_tid = HA_ATOMIC_LOAD(&s->ssl_ctx.last_ssl_sess_tid); // 0=none, >0 = tid + 1
+ if (!s->ssl_ctx.reused_sess[tid].ptr && old_tid == tid + 1)
+ HA_ATOMIC_CAS(&s->ssl_ctx.last_ssl_sess_tid, &old_tid, 0); // no more valid
+ else if (s->ssl_ctx.reused_sess[tid].ptr && !old_tid)
+ HA_ATOMIC_CAS(&s->ssl_ctx.last_ssl_sess_tid, &old_tid, tid + 1);
+
+ if (s->ssl_ctx.reused_sess[tid].sni) {
+ /* if the new sni is empty or isn't the same as the old one */
+ if ((!sni) || strcmp(s->ssl_ctx.reused_sess[tid].sni, sni) != 0) {
+ ha_free(&s->ssl_ctx.reused_sess[tid].sni);
+ if (sni)
+ s->ssl_ctx.reused_sess[tid].sni = strdup(sni);
+ }
+ } else if (sni) {
+ /* if there wasn't an old sni but there is a new one */
+ s->ssl_ctx.reused_sess[tid].sni = strdup(sni);
+ }
+ HA_RWLOCK_WRUNLOCK(SSL_SERVER_LOCK, &s->ssl_ctx.reused_sess[tid].sess_lock);
+ HA_RWLOCK_RDUNLOCK(SSL_SERVER_LOCK, &s->ssl_ctx.lock);
+ } else {
+ HA_RWLOCK_RDLOCK(SSL_SERVER_LOCK, &s->ssl_ctx.lock);
+
+ if (s->ssl_ctx.reused_sess[tid].ptr) {
+ HA_RWLOCK_WRLOCK(SSL_SERVER_LOCK, &s->ssl_ctx.reused_sess[tid].sess_lock);
+ ha_free(&s->ssl_ctx.reused_sess[tid].ptr);
+ HA_RWLOCK_WRUNLOCK(SSL_SERVER_LOCK, &s->ssl_ctx.reused_sess[tid].sess_lock);
+ }
+
+ old_tid = HA_ATOMIC_LOAD(&s->ssl_ctx.last_ssl_sess_tid); // 0=none, >0 = tid + 1
+ if (old_tid == tid + 1)
+ HA_ATOMIC_CAS(&s->ssl_ctx.last_ssl_sess_tid, &old_tid, 0); // no more valid
+
+ HA_RWLOCK_RDUNLOCK(SSL_SERVER_LOCK, &s->ssl_ctx.lock);
+ }
+
+ return 0;
+}
+
+
+/* SSL callback used on new session creation */
+int sh_ssl_sess_new_cb(SSL *ssl, SSL_SESSION *sess)
+{
+ unsigned char encsess[SHSESS_MAX_DATA_LEN]; /* encoded session */
+ unsigned char encid[SSL_MAX_SSL_SESSION_ID_LENGTH]; /* encoded id */
+ unsigned char *p;
+ int data_len;
+ unsigned int sid_length;
+ const unsigned char *sid_data;
+
+ /* The session id is already stored in the key and its length is known,
+ * so we don't store it again in order to save space.
+ * note: SSL_SESSION_set1_id uses
+ * a memcpy so we need to use a different pointer
+ * than sid_data or sid_ctx_data to avoid valgrind
+ * complaining.
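+ * (Hence the <encid> copy below: the id is duplicated, the session id length
+ * is forced to 0 before i2d_SSL_SESSION(), and the original length is
+ * restored once the encoded session has been stored in the cache.)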
+ */
+
+ sid_data = SSL_SESSION_get_id(sess, &sid_length);
+
+ /* copy the value into another buffer */
+ memcpy(encid, sid_data, sid_length);
+
+ /* pad with 0 */
+ if (sid_length < SSL_MAX_SSL_SESSION_ID_LENGTH)
+ memset(encid + sid_length, 0, SSL_MAX_SSL_SESSION_ID_LENGTH-sid_length);
+
+ /* force length to zero to avoid ASN1 encoding */
+ SSL_SESSION_set1_id(sess, encid, 0);
+
+ /* force length to zero to avoid ASN1 encoding */
+ SSL_SESSION_set1_id_context(sess, (const unsigned char *)SHCTX_APPNAME, 0);
+
+ /* check if buffer is large enough for the ASN1 encoded session */
+ data_len = i2d_SSL_SESSION(sess, NULL);
+ if (data_len > SHSESS_MAX_DATA_LEN)
+ goto err;
+
+ p = encsess;
+
+ /* process ASN1 session encoding before the lock */
+ i2d_SSL_SESSION(sess, &p);
+
+
+ /* store to cache */
+ sh_ssl_sess_store(encid, encsess, data_len);
+err:
+ /* reset original length values */
+ SSL_SESSION_set1_id(sess, encid, sid_length);
+ SSL_SESSION_set1_id_context(sess, (const unsigned char *)SHCTX_APPNAME, strlen(SHCTX_APPNAME));
+
+ return 0; /* do not increment session reference count */
+}
+
+/* SSL callback used to look up an existing session because none was found in the internal cache */
+SSL_SESSION *sh_ssl_sess_get_cb(SSL *ssl, __OPENSSL_110_CONST__ unsigned char *key, int key_len, int *do_copy)
+{
+ struct sh_ssl_sess_hdr *sh_ssl_sess;
+ unsigned char data[SHSESS_MAX_DATA_LEN], *p;
+ unsigned char tmpkey[SSL_MAX_SSL_SESSION_ID_LENGTH];
+ SSL_SESSION *sess;
+ struct shared_block *first;
+
+ _HA_ATOMIC_INC(&global.shctx_lookups);
+
+ /* allow the session to be freed automatically by openssl */
+ *do_copy = 0;
+
+ /* the tree key is the zero-padded session id */
+ if (key_len < SSL_MAX_SSL_SESSION_ID_LENGTH) {
+ memcpy(tmpkey, key, key_len);
+ memset(tmpkey + key_len, 0, SSL_MAX_SSL_SESSION_ID_LENGTH - key_len);
+ key = tmpkey;
+ }
+
+ /* lock cache */
+ shctx_wrlock(ssl_shctx);
+
+ /* look up the session */
+ sh_ssl_sess = sh_ssl_sess_tree_lookup(key);
+ if (!sh_ssl_sess) {
+ /* no session found: unlock cache and exit */
+ shctx_wrunlock(ssl_shctx);
+ _HA_ATOMIC_INC(&global.shctx_misses);
+ return NULL;
+ }
+
+ /* sh_ssl_sess (shared_block->data) is at the end of shared_block */
+ first = sh_ssl_sess_first_block(sh_ssl_sess);
+
+ shctx_row_data_get(ssl_shctx, first, data, sizeof(struct sh_ssl_sess_hdr), first->len-sizeof(struct sh_ssl_sess_hdr));
+
+ shctx_wrunlock(ssl_shctx);
+
+ /* decode ASN1 session */
+ p = data;
+ sess = d2i_SSL_SESSION(NULL, (const unsigned char **)&p, first->len-sizeof(struct sh_ssl_sess_hdr));
+ /* Reset session id and session id context */
+ if (sess) {
+ SSL_SESSION_set1_id(sess, key, key_len);
+ SSL_SESSION_set1_id_context(sess, (const unsigned char *)SHCTX_APPNAME, strlen(SHCTX_APPNAME));
+ }
+
+ return sess;
+}
+
+
+/* SSL callback used to signal that a session is no longer used in the internal cache */
+void sh_ssl_sess_remove_cb(SSL_CTX *ctx, SSL_SESSION *sess)
+{
+ struct sh_ssl_sess_hdr *sh_ssl_sess;
+ unsigned char tmpkey[SSL_MAX_SSL_SESSION_ID_LENGTH];
+ unsigned int sid_length;
+ const unsigned char *sid_data;
+ (void)ctx;
+
+ sid_data = SSL_SESSION_get_id(sess, &sid_length);
+ /* the tree key is the zero-padded session id */
+ if (sid_length < SSL_MAX_SSL_SESSION_ID_LENGTH) {
+ memcpy(tmpkey, sid_data, sid_length);
+ memset(tmpkey+sid_length, 0, SSL_MAX_SSL_SESSION_ID_LENGTH - sid_length);
+ sid_data = tmpkey;
+ }
+
+ shctx_wrlock(ssl_shctx);
+
+ /* look up the session */
+ sh_ssl_sess = sh_ssl_sess_tree_lookup(sid_data);
+ if (sh_ssl_sess) {
+ /* free session */
+ sh_ssl_sess_tree_delete(sh_ssl_sess);
+ }
+
+ /* unlock cache */
+ shctx_wrunlock(ssl_shctx);
+}
+
+/* Set session cache mode to server and disable openssl internal cache.
+ * Set shared cache callbacks on an ssl context.
+ * The shared context MUST be initialized first */
+void ssl_set_shctx(SSL_CTX *ctx)
+{
+ SSL_CTX_set_session_id_context(ctx, (const unsigned char *)SHCTX_APPNAME, strlen(SHCTX_APPNAME));
+
+ if (!ssl_shctx) {
+ SSL_CTX_set_session_cache_mode(ctx, SSL_SESS_CACHE_OFF);
+ return;
+ }
+
+ SSL_CTX_set_session_cache_mode(ctx, SSL_SESS_CACHE_SERVER |
+ SSL_SESS_CACHE_NO_INTERNAL |
+ SSL_SESS_CACHE_NO_AUTO_CLEAR);
+
+ /* Set callbacks */
+ SSL_CTX_sess_set_new_cb(ctx, sh_ssl_sess_new_cb);
+ SSL_CTX_sess_set_get_cb(ctx, sh_ssl_sess_get_cb);
+ SSL_CTX_sess_set_remove_cb(ctx, sh_ssl_sess_remove_cb);
+}
+
+/*
+ * https://developer.mozilla.org/en-US/docs/Mozilla/Projects/NSS/Key_Log_Format
+ *
+ * The format is:
+ * <Label> <space> <ClientRandom> <space> <Secret>
+ * e.g. (illustrative values): CLIENT_TRAFFIC_SECRET_0 <hex ClientRandom> <hex secret>
+ * We only need to copy the secret as there is a sample fetch for the ClientRandom
+ */
+
+#ifdef HAVE_SSL_KEYLOG
+void SSL_CTX_keylog(const SSL *ssl, const char *line)
+{
+ struct ssl_keylog *keylog;
+ char *lastarg = NULL;
+ char *dst = NULL;
+
+#ifdef USE_QUIC_OPENSSL_COMPAT
+ quic_tls_compat_keylog_callback(ssl, line);
+#endif
+ keylog = SSL_get_ex_data(ssl, ssl_keylog_index);
+ if (!keylog)
+ return;
+
+ lastarg = strrchr(line, ' ');
+ if (lastarg == NULL || ++lastarg == NULL)
+ return;
+
+ dst = pool_alloc(pool_head_ssl_keylog_str);
+ if (!dst)
+ return;
+
+ strncpy(dst, lastarg, SSL_KEYLOG_MAX_SECRET_SIZE-1);
+ dst[SSL_KEYLOG_MAX_SECRET_SIZE-1] = '\0';
+
+ if (strncmp(line, "CLIENT_RANDOM ", strlen("CLIENT_RANDOM ")) == 0) {
+ if (keylog->client_random)
+ goto error;
+ keylog->client_random = dst;
+
+ } else if (strncmp(line, "CLIENT_EARLY_TRAFFIC_SECRET ", strlen("CLIENT_EARLY_TRAFFIC_SECRET ")) == 0) {
+ if (keylog->client_early_traffic_secret)
+ goto error;
+ keylog->client_early_traffic_secret = dst;
+
+ } else if (strncmp(line, "CLIENT_HANDSHAKE_TRAFFIC_SECRET ", strlen("CLIENT_HANDSHAKE_TRAFFIC_SECRET ")) == 0) {
+ if(keylog->client_handshake_traffic_secret)
+ goto error;
+ keylog->client_handshake_traffic_secret = dst;
+
+ } else if (strncmp(line, "SERVER_HANDSHAKE_TRAFFIC_SECRET ", strlen("SERVER_HANDSHAKE_TRAFFIC_SECRET ")) == 0) {
+ if (keylog->server_handshake_traffic_secret)
+ goto error;
+ keylog->server_handshake_traffic_secret = dst;
+
+ } else if (strncmp(line, "CLIENT_TRAFFIC_SECRET_0 ", strlen("CLIENT_TRAFFIC_SECRET_0 ")) == 0) {
+ if (keylog->client_traffic_secret_0)
+ goto error;
+ keylog->client_traffic_secret_0 = dst;
+
+ } else if (strncmp(line, "SERVER_TRAFFIC_SECRET_0 ", strlen("SERVER_TRAFFIC_SECRET_0 ")) == 0) {
+ if (keylog->server_traffic_secret_0)
+ goto error;
+ keylog->server_traffic_secret_0 = dst;
+
+ } else if (strncmp(line, "EARLY_EXPORTER_SECRET ", strlen("EARLY_EXPORTER_SECRET ")) == 0) {
+ if (keylog->early_exporter_secret)
+ goto error;
+ keylog->early_exporter_secret = dst;
+
+ } else if (strncmp(line, "EXPORTER_SECRET ", strlen("EXPORTER_SECRET ")) == 0) {
+ if (keylog->exporter_secret)
+ goto error;
+ keylog->exporter_secret = dst;
+ } else {
+ goto error;
+ }
+
+ return;
+
+error:
+ pool_free(pool_head_ssl_keylog_str, dst);
+
+ return;
+}
+#endif
+
+/*
+ * This function applies the SSL configuration on an SSL_CTX.
+ * It returns an error code and fills the <err> buffer
+ */
+static int ssl_sock_prepare_ctx(struct bind_conf *bind_conf, struct ssl_bind_conf *ssl_conf, SSL_CTX *ctx, char **err)
+{
+ struct proxy *curproxy = bind_conf->frontend;
+ int cfgerr = 0;
+ int verify = SSL_VERIFY_NONE;
+ struct ssl_bind_conf __maybe_unused *ssl_conf_cur;
+ const char *conf_ciphers;
+#ifdef HAVE_SSL_CTX_SET_CIPHERSUITES
+ const char *conf_ciphersuites;
+#endif
+ const char *conf_curves = NULL;
+ X509_STORE *store = SSL_CTX_get_cert_store(ctx);
+#if defined(SSL_CTX_set1_sigalgs_list)
+ const char *conf_sigalgs = NULL;
+#endif
+#if defined(SSL_CTX_set1_client_sigalgs_list)
+ const char *conf_client_sigalgs = NULL;
+#endif
+
+ if (ssl_conf) {
+ struct tls_version_filter *conf_ssl_methods = &ssl_conf->ssl_methods;
+ int i, min, max;
+ int flags = MC_SSL_O_ALL;
+
+ /* Real min and max should be determined from the configuration and openssl's capabilities */
+ min = conf_ssl_methods->min ? conf_ssl_methods->min : bind_conf->ssl_conf.ssl_methods.min;
+ max = conf_ssl_methods->max ? conf_ssl_methods->max : bind_conf->ssl_conf.ssl_methods.max;
+ if (min)
+ flags |= (methodVersions[min].flag - 1);
+ if (max)
+ flags |= ~((methodVersions[max].flag << 1) - 1);
+ min = max = CONF_TLSV_NONE;
+ for (i = CONF_TLSV_MIN; i <= CONF_TLSV_MAX; i++)
+ if (methodVersions[i].option && !(flags & methodVersions[i].flag)) {
+ if (min)
+ max = i;
+ else
+ min = max = i;
+ }
+ /* save real min/max */
+ conf_ssl_methods->min = min;
+ conf_ssl_methods->max = max;
+ if (!min) {
+ memprintf(err, "%sProxy '%s': all SSL/TLS versions are disabled for bind '%s' at [%s:%d].\n",
+ err && *err ? *err : "", bind_conf->frontend->id, bind_conf->arg, bind_conf->file, bind_conf->line);
+ cfgerr |= ERR_ALERT | ERR_FATAL;
+ }
+ }
+
+ switch ((ssl_conf && ssl_conf->verify) ? ssl_conf->verify : bind_conf->ssl_conf.verify) {
+ case SSL_SOCK_VERIFY_NONE:
+ verify = SSL_VERIFY_NONE;
+ break;
+ case SSL_SOCK_VERIFY_OPTIONAL:
+ verify = SSL_VERIFY_PEER;
+ break;
+ case SSL_SOCK_VERIFY_REQUIRED:
+ verify = SSL_VERIFY_PEER|SSL_VERIFY_FAIL_IF_NO_PEER_CERT;
+ break;
+ }
+ SSL_CTX_set_verify(ctx, verify, ssl_sock_bind_verifycbk);
+ if (verify & SSL_VERIFY_PEER) {
+ char *ca_file = (ssl_conf && ssl_conf->ca_file) ? ssl_conf->ca_file : bind_conf->ssl_conf.ca_file;
+ char *ca_verify_file = (ssl_conf && ssl_conf->ca_verify_file) ? ssl_conf->ca_verify_file : bind_conf->ssl_conf.ca_verify_file;
+ char *crl_file = (ssl_conf && ssl_conf->crl_file) ? ssl_conf->crl_file : bind_conf->ssl_conf.crl_file;
+ if (ca_file || ca_verify_file) {
+ /* set CAfile to verify */
+ if (ca_file && !ssl_set_verify_locations_file(ctx, ca_file)) {
+ memprintf(err, "%sProxy '%s': unable to set CA file '%s' for bind '%s' at [%s:%d].\n",
+ err && *err ? *err : "", curproxy->id, ca_file, bind_conf->arg, bind_conf->file, bind_conf->line);
+ cfgerr |= ERR_ALERT | ERR_FATAL;
+ }
+ if (ca_verify_file && !ssl_set_verify_locations_file(ctx, ca_verify_file)) {
+ memprintf(err, "%sProxy '%s': unable to set CA-no-names file '%s' for bind '%s' at [%s:%d].\n",
+ err && *err ? *err : "", curproxy->id, ca_verify_file, bind_conf->arg, bind_conf->file, bind_conf->line);
*err : "", curproxy->id, ca_verify_file, bind_conf->arg, bind_conf->file, bind_conf->line); + cfgerr |= ERR_ALERT | ERR_FATAL; + } + if (ca_file && !((ssl_conf && ssl_conf->no_ca_names) || bind_conf->ssl_conf.no_ca_names)) { + /* set CA names for client cert request, function returns void */ + SSL_CTX_set_client_CA_list(ctx, SSL_dup_CA_list(ssl_get_client_ca_file(ca_file))); + } +#ifdef USE_OPENSSL_WOLFSSL + /* WolfSSL activates CRL checks by default so we need to disable it */ + X509_STORE_set_flags(store, 0) ; +#endif + } + else { + memprintf(err, "%sProxy '%s': verify is enabled but no CA file specified for bind '%s' at [%s:%d].\n", + err && *err ? *err : "", curproxy->id, bind_conf->arg, bind_conf->file, bind_conf->line); + cfgerr |= ERR_ALERT | ERR_FATAL; + } +#ifdef X509_V_FLAG_CRL_CHECK + if (crl_file) { + + if (!ssl_set_cert_crl_file(store, crl_file)) { + memprintf(err, "%sProxy '%s': unable to configure CRL file '%s' for bind '%s' at [%s:%d].\n", + err && *err ? *err : "", curproxy->id, crl_file, bind_conf->arg, bind_conf->file, bind_conf->line); + cfgerr |= ERR_ALERT | ERR_FATAL; + } + else { + X509_STORE_set_flags(store, X509_V_FLAG_CRL_CHECK|X509_V_FLAG_CRL_CHECK_ALL); + } + } +#endif + ERR_clear_error(); + } +#if (defined SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB && TLS_TICKETS_NO > 0) + if(bind_conf->keys_ref) { + if (!SSL_CTX_set_tlsext_ticket_key_evp_cb(ctx, ssl_tlsext_ticket_key_cb)) { + memprintf(err, "%sProxy '%s': unable to set callback for TLS ticket validation for bind '%s' at [%s:%d].\n", + err && *err ? *err : "", curproxy->id, bind_conf->arg, bind_conf->file, bind_conf->line); + cfgerr |= ERR_ALERT | ERR_FATAL; + } + } +#endif + + ssl_set_shctx(ctx); + conf_ciphers = (ssl_conf && ssl_conf->ciphers) ? ssl_conf->ciphers : bind_conf->ssl_conf.ciphers; + if (conf_ciphers && + !SSL_CTX_set_cipher_list(ctx, conf_ciphers)) { + memprintf(err, "%sProxy '%s': unable to set SSL cipher list to '%s' for bind '%s' at [%s:%d].\n", + err && *err ? *err : "", curproxy->id, conf_ciphers, bind_conf->arg, bind_conf->file, bind_conf->line); + cfgerr |= ERR_ALERT | ERR_FATAL; + } + +#ifdef HAVE_SSL_CTX_SET_CIPHERSUITES + conf_ciphersuites = (ssl_conf && ssl_conf->ciphersuites) ? ssl_conf->ciphersuites : bind_conf->ssl_conf.ciphersuites; + if (conf_ciphersuites && + !SSL_CTX_set_ciphersuites(ctx, conf_ciphersuites)) { + memprintf(err, "%sProxy '%s': unable to set TLS 1.3 cipher suites to '%s' for bind '%s' at [%s:%d].\n", + err && *err ? 
*err : "", curproxy->id, conf_ciphersuites, bind_conf->arg, bind_conf->file, bind_conf->line); + cfgerr |= ERR_ALERT | ERR_FATAL; + } +#endif + +#ifndef OPENSSL_NO_DH + if (!local_dh_1024) + local_dh_1024 = ssl_get_dh_1024(); + if (!local_dh_2048) + local_dh_2048 = ssl_get_dh_2048(); + if (!local_dh_4096) + local_dh_4096 = ssl_get_dh_4096(); +#endif /* OPENSSL_NO_DH */ + + SSL_CTX_set_info_callback(ctx, ssl_sock_infocbk); +#ifdef SSL_CTRL_SET_MSG_CALLBACK + SSL_CTX_set_msg_callback(ctx, ssl_sock_msgcbk); +#endif +#ifdef HAVE_SSL_KEYLOG + /* only activate the keylog callback if it was required to prevent performance loss */ + if (global_ssl.keylog > 0) + SSL_CTX_set_keylog_callback(ctx, SSL_CTX_keylog); +#endif + +#if defined(OPENSSL_NPN_NEGOTIATED) && !defined(OPENSSL_NO_NEXTPROTONEG) + ssl_conf_cur = NULL; + if (ssl_conf && ssl_conf->npn_str) + ssl_conf_cur = ssl_conf; + else if (bind_conf->ssl_conf.npn_str) + ssl_conf_cur = &bind_conf->ssl_conf; + if (ssl_conf_cur) + SSL_CTX_set_next_protos_advertised_cb(ctx, ssl_sock_advertise_npn_protos, ssl_conf_cur); +#endif +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation + ssl_conf_cur = NULL; + if (ssl_conf && ssl_conf->alpn_str) + ssl_conf_cur = ssl_conf; + else if (bind_conf->ssl_conf.alpn_str) + ssl_conf_cur = &bind_conf->ssl_conf; + if (ssl_conf_cur && ssl_conf_cur->alpn_len) + SSL_CTX_set_alpn_select_cb(ctx, ssl_sock_advertise_alpn_protos, ssl_conf_cur); +#endif +#if defined(SSL_CTX_set1_curves_list) + conf_curves = (ssl_conf && ssl_conf->curves) ? ssl_conf->curves : bind_conf->ssl_conf.curves; + if (conf_curves) { + if (!SSL_CTX_set1_curves_list(ctx, conf_curves)) { + memprintf(err, "%sProxy '%s': unable to set SSL curves list to '%s' for bind '%s' at [%s:%d].\n", + err && *err ? *err : "", curproxy->id, conf_curves, bind_conf->arg, bind_conf->file, bind_conf->line); + cfgerr |= ERR_ALERT | ERR_FATAL; + } + (void)SSL_CTX_set_ecdh_auto(ctx, 1); + } +#endif /* defined(SSL_CTX_set1_curves_list) */ + + if (!conf_curves) { +#if (HA_OPENSSL_VERSION_NUMBER >= 0x10101000L) +#if defined(SSL_CTX_set1_curves_list) + const char *ecdhe = (ssl_conf && ssl_conf->ecdhe) ? ssl_conf->ecdhe : + (bind_conf->ssl_conf.ecdhe ? bind_conf->ssl_conf.ecdhe : + NULL); + + if (ecdhe && SSL_CTX_set1_curves_list(ctx, ecdhe) == 0) { + memprintf(err, "%sProxy '%s': unable to set elliptic named curve to '%s' for bind '%s' at [%s:%d].\n", + err && *err ? *err : "", curproxy->id, ecdhe, bind_conf->arg, bind_conf->file, bind_conf->line); + cfgerr |= ERR_ALERT | ERR_FATAL; + } +#endif /* defined(SSL_CTX_set1_curves_list) */ +#else +#if defined(SSL_CTX_set_tmp_ecdh) && !defined(OPENSSL_NO_ECDH) + int i; + EC_KEY *ecdh; + + const char *ecdhe = (ssl_conf && ssl_conf->ecdhe) ? ssl_conf->ecdhe : + (bind_conf->ssl_conf.ecdhe ? bind_conf->ssl_conf.ecdhe : + ECDHE_DEFAULT_CURVE); + + i = OBJ_sn2nid(ecdhe); + if (!i || ((ecdh = EC_KEY_new_by_curve_name(i)) == NULL)) { + memprintf(err, "%sProxy '%s': unable to set elliptic named curve to '%s' for bind '%s' at [%s:%d].\n", + err && *err ? *err : "", curproxy->id, ecdhe, bind_conf->arg, bind_conf->file, bind_conf->line); + cfgerr |= ERR_ALERT | ERR_FATAL; + } + else { + SSL_CTX_set_tmp_ecdh(ctx, ecdh); + EC_KEY_free(ecdh); + } +#endif /* defined(SSL_CTX_set_tmp_ecdh) && !defined(OPENSSL_NO_ECDH) */ +#endif /* HA_OPENSSL_VERSION_NUMBER >= 0x10101000L */ + } + +#if defined(SSL_CTX_set1_sigalgs_list) + conf_sigalgs = (ssl_conf && ssl_conf->sigalgs) ? 
+ if (conf_sigalgs) {
+ if (!SSL_CTX_set1_sigalgs_list(ctx, conf_sigalgs)) {
+ memprintf(err, "%sProxy '%s': unable to set SSL Signature Algorithm list to '%s' for bind '%s' at [%s:%d].\n",
+ err && *err ? *err : "", curproxy->id, conf_sigalgs, bind_conf->arg, bind_conf->file, bind_conf->line);
+ cfgerr |= ERR_ALERT | ERR_FATAL;
+ }
+ }
+#endif
+
+#if defined(SSL_CTX_set1_client_sigalgs_list)
+ conf_client_sigalgs = (ssl_conf && ssl_conf->client_sigalgs) ? ssl_conf->client_sigalgs : bind_conf->ssl_conf.client_sigalgs;
+ if (conf_client_sigalgs) {
+ if (!SSL_CTX_set1_client_sigalgs_list(ctx, conf_client_sigalgs)) {
+ memprintf(err, "%sProxy '%s': unable to set SSL Signature Algorithm list to '%s' for bind '%s' at [%s:%d].\n",
+ err && *err ? *err : "", curproxy->id, conf_client_sigalgs, bind_conf->arg, bind_conf->file, bind_conf->line);
+ cfgerr |= ERR_ALERT | ERR_FATAL;
+ }
+ }
+#endif
+
+#ifdef USE_QUIC_OPENSSL_COMPAT
+ if (!quic_tls_compat_init(bind_conf, ctx))
+ cfgerr |= ERR_ALERT | ERR_FATAL;
+#endif
+
+ return cfgerr;
+}
+
+
+/*
+ * Prepare the SSL_CTX based on the bind line configuration.
+ * Since the CA file loading is made depending on the verify option of the bind
+ * line, the link between the SSL_CTX and the CA file tree entry is made here.
+ * If we want to create a link between the CA file entry and the corresponding
+ * ckch instance (for CA file hot update), it needs to be done after
+ * ssl_sock_prepare_ctx.
+ * Returns 0 in case of success.
+ */
+int ssl_sock_prep_ctx_and_inst(struct bind_conf *bind_conf, struct ssl_bind_conf *ssl_conf,
+ SSL_CTX *ctx, struct ckch_inst *ckch_inst, char **err)
+{
+ int errcode = 0;
+
+ errcode |= ssl_sock_prepare_ctx(bind_conf, ssl_conf, ctx, err);
+ if (!errcode && ckch_inst)
+ ckch_inst_add_cafile_link(ckch_inst, bind_conf, ssl_conf, NULL);
+
+ return errcode;
+}
+
+static int ssl_sock_srv_hostcheck(const char *pattern, const char *hostname)
+{
+ const char *pattern_wildcard, *pattern_left_label_end, *hostname_left_label_end;
+ size_t prefixlen, suffixlen;
+
+ /* Trivial case */
+ if (strcasecmp(pattern, hostname) == 0)
+ return 1;
+
+ /* The rest of this logic is based on RFC 6125, section 6.4.3
+ * (http://tools.ietf.org/html/rfc6125#section-6.4.3);
+ * e.g. the pattern "*.example.com" matches "foo.example.com" but
+ * neither "example.com" nor "a.b.example.com". */
+
+ pattern_wildcard = NULL;
+ pattern_left_label_end = pattern;
+ while (*pattern_left_label_end != '.') {
+ switch (*pattern_left_label_end) {
+ case 0:
+ /* End of label not found */
+ return 0;
+ case '*':
+ /* If there is more than one wildcard */
+ if (pattern_wildcard)
+ return 0;
+ pattern_wildcard = pattern_left_label_end;
+ break;
+ }
+ pattern_left_label_end++;
+ }
+
+ /* If it's not trivial and there is no wildcard, it can't
+ * match */
+ if (!pattern_wildcard)
+ return 0;
+
+ /* Make sure all labels match except the leftmost */
+ hostname_left_label_end = strchr(hostname, '.');
+ if (!hostname_left_label_end
+ || strcasecmp(pattern_left_label_end, hostname_left_label_end) != 0)
+ return 0;
+
+ /* Make sure the leftmost label of the hostname is long enough
+ * that the wildcard can match */
+ if (hostname_left_label_end - hostname < (pattern_left_label_end - pattern) - 1)
+ return 0;
+
+ /* Finally compare the string on either side of the
+ * wildcard */
+ prefixlen = pattern_wildcard - pattern;
+ suffixlen = pattern_left_label_end - (pattern_wildcard + 1);
+ if ((prefixlen && (strncasecmp(pattern, hostname, prefixlen) != 0))
+ || (suffixlen && (strncasecmp(pattern_wildcard + 1, hostname_left_label_end - suffixlen, suffixlen) != 0)))
+ return 0;
+
+ return 1;
+}
+
+static int ssl_sock_srv_verifycbk(int ok, X509_STORE_CTX *ctx)
+{
+ SSL *ssl;
+ struct connection *conn;
+ struct ssl_sock_ctx *ssl_ctx;
+ const char *servername;
+ const char *sni;
+
+ int depth;
+ X509 *cert;
+ STACK_OF(GENERAL_NAME) *alt_names;
+ int i;
+ X509_NAME *cert_subject;
+ char *str;
+
+ if (ok == 0)
+ return ok;
+
+ ssl = X509_STORE_CTX_get_ex_data(ctx, SSL_get_ex_data_X509_STORE_CTX_idx());
+ conn = SSL_get_ex_data(ssl, ssl_app_data_index);
+ ssl_ctx = __conn_get_ssl_sock_ctx(conn);
+
+ /* We're checking if the provided hostnames match the desired one. The
+ * desired hostname comes from the SNI we presented if any, or if not
+ * provided then it may have been explicitly stated using a "verifyhost"
+ * directive. If neither is set, we don't care about the name so the
+ * verification is OK.
+ */
+ servername = SSL_get_servername(ssl_ctx->ssl, TLSEXT_NAMETYPE_host_name);
+ sni = servername;
+ if (!servername) {
+ servername = __objt_server(conn->target)->ssl_ctx.verify_host;
+ if (!servername)
+ return ok;
+ }
+
+ /* We only need to verify the CN on the actual server cert,
+ * not the indirect CAs */
+ depth = X509_STORE_CTX_get_error_depth(ctx);
+ if (depth != 0)
+ return ok;
+
+ /* At this point, the cert is *not* OK unless we can find a
+ * hostname match */
+ ok = 0;
+
+ cert = X509_STORE_CTX_get_current_cert(ctx);
+ /* It seems like this might happen if verify peer isn't set */
+ if (!cert)
+ return ok;
+
+ alt_names = X509_get_ext_d2i(cert, NID_subject_alt_name, NULL, NULL);
+ if (alt_names) {
+ for (i = 0; !ok && i < sk_GENERAL_NAME_num(alt_names); i++) {
+ GENERAL_NAME *name = sk_GENERAL_NAME_value(alt_names, i);
+ if (name->type == GEN_DNS) {
+#if HA_OPENSSL_VERSION_NUMBER < 0x00907000L
+ if (ASN1_STRING_to_UTF8((unsigned char **)&str, name->d.ia5) >= 0) {
+#else
+ if (ASN1_STRING_to_UTF8((unsigned char **)&str, name->d.dNSName) >= 0) {
+#endif
+ ok = ssl_sock_srv_hostcheck(str, servername);
+ OPENSSL_free(str);
+ }
+ }
+ }
+ sk_GENERAL_NAME_pop_free(alt_names, GENERAL_NAME_free);
+ }
+
+ cert_subject = X509_get_subject_name(cert);
+ i = -1;
+ while (!ok && (i = X509_NAME_get_index_by_NID(cert_subject, NID_commonName, i)) != -1) {
+ X509_NAME_ENTRY *entry = X509_NAME_get_entry(cert_subject, i);
+ ASN1_STRING *value;
+ value = X509_NAME_ENTRY_get_data(entry);
+ if (ASN1_STRING_to_UTF8((unsigned char **)&str, value) >= 0) {
+ ok = ssl_sock_srv_hostcheck(str, servername);
+ OPENSSL_free(str);
+ }
+ }
+
+ /* report the mismatch and indicate if SNI was used or not */
+ if (!ok && !conn->err_code)
+ conn->err_code = sni ? CO_ER_SSL_MISMATCH_SNI : CO_ER_SSL_MISMATCH;
+ return ok;
+}
+
+/* prepare the ssl context from the server's options. Returns an error count */
+int ssl_sock_prepare_srv_ctx(struct server *srv)
+{
+ int cfgerr = 0;
+ SSL_CTX *ctx;
+ /* Automatic memory computations need to know we use SSL there
+ * If this is an internal proxy, don't use it for the computation */
+ if (!(srv->proxy->cap & PR_CAP_INT))
+ global.ssl_used_backend = 1;
+
+ /* Initialize the SSL context for the current server */
+ if (!srv->ssl_ctx.reused_sess) {
+ if ((srv->ssl_ctx.reused_sess = calloc(1, global.nbthread*sizeof(*srv->ssl_ctx.reused_sess))) == NULL) {
+ ha_alert("out of memory.\n");
+ cfgerr++;
+ return cfgerr;
+ }
+ }
+ if (srv->use_ssl == 1)
+ srv->xprt = &ssl_sock;
+
+ if (srv->ssl_ctx.client_crt) {
+ const int create_if_none = srv->flags & SRV_F_DYNAMIC ? 0 : 1;
+ char *err = NULL;
+ int err_code = 0;
+
+ /* If there is a crt keyword there, the SSL_CTX will be created here. */
+ err_code = ssl_sock_load_srv_cert(srv->ssl_ctx.client_crt, srv, create_if_none, &err);
+ if (err_code != ERR_NONE) {
+ if ((err_code & ERR_WARN) && !(err_code & ERR_ALERT))
+ ha_warning("%s", err);
+ else
+ ha_alert("%s", err);
+
+ if (err_code & (ERR_FATAL|ERR_ABORT))
+ cfgerr++;
+ }
+ ha_free(&err);
+ }
+
+ ctx = srv->ssl_ctx.ctx;
+
+ /* The context will be uninitialized if there wasn't any "cert" option
+ * in the server line. */
+ if (!ctx) {
+ ctx = SSL_CTX_new(SSLv23_client_method());
+ if (!ctx) {
+ ha_alert("unable to allocate ssl context.\n");
+ cfgerr++;
+ return cfgerr;
+ }
+
+ srv->ssl_ctx.ctx = ctx;
+ }
+
+ cfgerr += ssl_sock_prep_srv_ctx_and_inst(srv, srv->ssl_ctx.ctx, srv->ssl_ctx.inst);
+
+ return cfgerr;
+}
+
+/* Initialize an SSL context that will be used on the backend side.
+ * Returns an error count.
+ */
+static int ssl_sock_prepare_srv_ssl_ctx(const struct server *srv, SSL_CTX *ctx)
+{
+ struct proxy *curproxy = srv->proxy;
+ int cfgerr = 0;
+ long options =
+ SSL_OP_ALL | /* all known workarounds for bugs */
+ SSL_OP_NO_SSLv2 |
+ SSL_OP_NO_COMPRESSION;
+ long mode =
+ SSL_MODE_ENABLE_PARTIAL_WRITE |
+ SSL_MODE_ACCEPT_MOVING_WRITE_BUFFER |
+ SSL_MODE_RELEASE_BUFFERS |
+ SSL_MODE_SMALL_BUFFERS;
+ int verify = SSL_VERIFY_NONE;
+ const struct tls_version_filter *conf_ssl_methods = &srv->ssl_ctx.methods;
+ int i, min, max, hole;
+ int flags = MC_SSL_O_ALL;
+#if defined(SSL_CTX_set1_sigalgs_list)
+ const char *conf_sigalgs = NULL;
+#endif
+#if defined(SSL_CTX_set1_client_sigalgs_list)
+ const char *conf_client_sigalgs = NULL;
+#endif
+#if defined(SSL_CTX_set1_curves_list)
+ const char *conf_curves = NULL;
+#endif
+
+ if (conf_ssl_methods->flags && (conf_ssl_methods->min || conf_ssl_methods->max))
+ ha_warning("no-sslv3/no-tlsv1x are ignored for this server. "
+ "Use only 'ssl-min-ver' and 'ssl-max-ver' to fix.\n");
+ else
+ flags = conf_ssl_methods->flags;
+
+ /* Real min and max should be determined from the configuration and openssl's capabilities */
+ if (conf_ssl_methods->min)
+ flags |= (methodVersions[conf_ssl_methods->min].flag - 1);
+ if (conf_ssl_methods->max)
+ flags |= ~((methodVersions[conf_ssl_methods->max].flag << 1) - 1);
+
+ /* find min, max and holes */
+ min = max = CONF_TLSV_NONE;
+ hole = 0;
+ for (i = CONF_TLSV_MIN; i <= CONF_TLSV_MAX; i++)
+ /* version is in openssl && version not disabled in configuration */
+ if (methodVersions[i].option && !(flags & methodVersions[i].flag)) {
+ if (min) {
+ if (hole) {
+ ha_warning("%s '%s': SSL/TLS versions range not contiguous for server '%s'. "
+ "Hole find for %s. Use only 'ssl-min-ver' and 'ssl-max-ver' to fix.\n",
+ proxy_type_str(curproxy), curproxy->id, srv->id,
+ methodVersions[hole].name);
+ hole = 0;
+ }
+ max = i;
+ }
+ else {
+ min = max = i;
+ }
+ }
+ else {
+ if (min)
+ hole = i;
+ }
+ if (!min) {
+ ha_alert("%s '%s': all SSL/TLS versions are disabled for server '%s'.\n",
+ proxy_type_str(curproxy), curproxy->id, srv->id);
+ cfgerr += 1;
+ }
+
+#if (HA_OPENSSL_VERSION_NUMBER < 0x1010000fL)
+ /* Keep force-xxx implementation as it is in older haproxy. It's a
+ precautionary measure to avoid any surprise with older openssl versions.
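+ (Note, added for clarity: this is the same version-pinning logic as on the
+ bind side above, except that SET_CLIENT is used below since this context
+ drives outgoing connections.)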
*/ + if (min == max) + methodVersions[min].ctx_set_version(ctx, SET_CLIENT); + else + for (i = CONF_TLSV_MIN; i <= CONF_TLSV_MAX; i++) + if (flags & methodVersions[i].flag) + options |= methodVersions[i].option; +#else /* openssl >= 1.1.0 */ + /* set the max_version is required to cap TLS version or activate new TLS (v1.3) */ + methodVersions[min].ctx_set_version(ctx, SET_MIN); + methodVersions[max].ctx_set_version(ctx, SET_MAX); +#endif + + if (srv->ssl_ctx.options & SRV_SSL_O_NO_TLS_TICKETS) + options |= SSL_OP_NO_TICKET; + SSL_CTX_set_options(ctx, options); + +#ifdef SSL_MODE_ASYNC + if (global_ssl.async) + mode |= SSL_MODE_ASYNC; +#endif + SSL_CTX_set_mode(ctx, mode); + + if (global.ssl_server_verify == SSL_SERVER_VERIFY_REQUIRED) + verify = SSL_VERIFY_PEER; + switch (srv->ssl_ctx.verify) { + case SSL_SOCK_VERIFY_NONE: + verify = SSL_VERIFY_NONE; + break; + case SSL_SOCK_VERIFY_REQUIRED: + verify = SSL_VERIFY_PEER; + break; + } + SSL_CTX_set_verify(ctx, verify, + (srv->ssl_ctx.verify_host || (verify & SSL_VERIFY_PEER)) ? ssl_sock_srv_verifycbk : NULL); + if (verify & SSL_VERIFY_PEER) { + if (srv->ssl_ctx.ca_file) { + /* set CAfile to verify */ + if (!ssl_set_verify_locations_file(ctx, srv->ssl_ctx.ca_file)) { + ha_alert("unable to set CA file '%s'.\n", + srv->ssl_ctx.ca_file); + cfgerr++; + } + } + else { + if (global.ssl_server_verify == SSL_SERVER_VERIFY_REQUIRED) + ha_alert("verify is enabled by default but no CA file specified. If you're running on a LAN where you're certain to trust the server's certificate, please set an explicit 'verify none' statement on the 'server' line, or use 'ssl-server-verify none' in the global section to disable server-side verifications by default.\n"); + else + ha_alert("verify is enabled but no CA file specified.\n"); + cfgerr++; + } +#ifdef X509_V_FLAG_CRL_CHECK + if (srv->ssl_ctx.crl_file) { + X509_STORE *store = SSL_CTX_get_cert_store(ctx); + + if (!ssl_set_cert_crl_file(store, srv->ssl_ctx.crl_file)) { + ha_alert("unable to configure CRL file '%s'.\n", + srv->ssl_ctx.crl_file); + cfgerr++; + } + else { + X509_STORE_set_flags(store, X509_V_FLAG_CRL_CHECK|X509_V_FLAG_CRL_CHECK_ALL); + } + } +#endif + } + + SSL_CTX_set_session_cache_mode(ctx, SSL_SESS_CACHE_CLIENT | SSL_SESS_CACHE_NO_INTERNAL_STORE); + SSL_CTX_sess_set_new_cb(ctx, ssl_sess_new_srv_cb); + if (srv->ssl_ctx.ciphers && + !SSL_CTX_set_cipher_list(ctx, srv->ssl_ctx.ciphers)) { + ha_alert("unable to set SSL cipher list to '%s'.\n", + srv->ssl_ctx.ciphers); + cfgerr++; + } + +#ifdef HAVE_SSL_CTX_SET_CIPHERSUITES + if (srv->ssl_ctx.ciphersuites && + !SSL_CTX_set_ciphersuites(ctx, srv->ssl_ctx.ciphersuites)) { + ha_alert("unable to set TLS 1.3 cipher suites to '%s'.\n", + srv->ssl_ctx.ciphersuites); + cfgerr++; + } +#endif +#if defined(OPENSSL_NPN_NEGOTIATED) && !defined(OPENSSL_NO_NEXTPROTONEG) + if (srv->ssl_ctx.npn_str) + SSL_CTX_set_next_proto_select_cb(ctx, ssl_sock_srv_select_protos, (struct server*)srv); +#endif +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation + if (srv->ssl_ctx.alpn_str && srv->ssl_ctx.alpn_len) + SSL_CTX_set_alpn_protos(ctx, (unsigned char *)srv->ssl_ctx.alpn_str, srv->ssl_ctx.alpn_len); +#endif + +#if defined(SSL_CTX_set1_sigalgs_list) + conf_sigalgs = srv->ssl_ctx.sigalgs; + if (conf_sigalgs) { + if (!SSL_CTX_set1_sigalgs_list(ctx, conf_sigalgs)) { + ha_alert("Proxy '%s': unable to set SSL Signature Algorithm list to '%s' for server '%s'.\n", + curproxy->id, conf_sigalgs, srv->id); + cfgerr++; + } + } +#endif +#if defined(SSL_CTX_set1_client_sigalgs_list) 
+ conf_client_sigalgs = srv->ssl_ctx.client_sigalgs; + if (conf_client_sigalgs) { + if (!SSL_CTX_set1_client_sigalgs_list(ctx, conf_client_sigalgs)) { + ha_alert("Proxy '%s': unable to set SSL Client Signature Algorithm list to '%s' for server '%s'.\n", + curproxy->id, conf_client_sigalgs, srv->id); + cfgerr++; + } + } +#endif + +#if defined(SSL_CTX_set1_curves_list) + conf_curves = srv->ssl_ctx.curves; + if (conf_curves) { + if (!SSL_CTX_set1_curves_list(ctx, conf_curves)) { + ha_alert("Proxy '%s': unable to set SSL curves list to '%s' for server '%s'.\n", + curproxy->id, conf_curves, srv->id); + cfgerr++; + } + } +#endif /* defined(SSL_CTX_set1_curves_list) */ + + return cfgerr; +} + +/* + * Prepare the frontend's SSL_CTX based on the server line configuration. + * Since the CA file loading is made depending on the verify option of the + * server line, the link between the SSL_CTX and the CA file tree entry is + * made here. + * If we want to create a link between the CA file entry and the corresponding + * ckch instance (for CA file hot update), it needs to be done after + * ssl_sock_prepare_srv_ssl_ctx. + * Returns an error count. + */ +int ssl_sock_prep_srv_ctx_and_inst(const struct server *srv, SSL_CTX *ctx, + struct ckch_inst *ckch_inst) +{ + int cfgerr = 0; + + cfgerr += ssl_sock_prepare_srv_ssl_ctx(srv, ctx); + if (!cfgerr && ckch_inst) + ckch_inst_add_cafile_link(ckch_inst, NULL, NULL, srv); + + return cfgerr; +} + + +/* + * Create an initial CTX used to start the SSL connections. + * May be used by QUIC xprt which makes usage of SSL sessions initialized from SSL_CTXs. + * Returns 0 if succeeded, or something >0 if not. + */ +#ifdef USE_QUIC +static int ssl_initial_ctx(struct bind_conf *bind_conf) +{ + if (bind_conf->xprt == xprt_get(XPRT_QUIC)) + return ssl_quic_initial_ctx(bind_conf); + else + return ssl_sock_initial_ctx(bind_conf); +} +#else +static int ssl_initial_ctx(struct bind_conf *bind_conf) +{ + return ssl_sock_initial_ctx(bind_conf); +} +#endif + +/* Walks down the two trees in bind_conf and prepares all certs. The pointer may + * be NULL, in which case nothing is done. Returns the number of errors + * encountered. + */ +int ssl_sock_prepare_all_ctx(struct bind_conf *bind_conf) +{ + struct ebmb_node *node; + struct sni_ctx *sni; + int err = 0; + int errcode = 0; + char *errmsg = NULL; + + /* Automatic memory computations need to know we use SSL there */ + global.ssl_used_frontend = 1; + + /* Create initial_ctx used to start the ssl connection before do switchctx */ + if (!bind_conf->initial_ctx) { + err += ssl_initial_ctx(bind_conf); + /* It should not be necessary to call this function, but it's + necessary first to check and move all initialisation related + to initial_ctx in ssl_initial_ctx. 
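Aside — stripped of HAProxy's configuration plumbing, the knobs applied above (version capping, ticket suppression, curves) map onto a handful of stock OpenSSL calls. A hedged sketch on a bare client context follows; the TLS1.2..TLS1.3 range and curve list are example choices, not defaults taken from the source.

    /* sketch: a plain OpenSSL >= 1.1.0 client context configured the way
     * ssl_sock_prepare_srv_ssl_ctx() configures a server-line context */
    #include <openssl/ssl.h>

    static SSL_CTX *make_client_ctx(void)
    {
        SSL_CTX *ctx = SSL_CTX_new(TLS_client_method());

        if (!ctx)
            return NULL;
        /* the [min,max] range computed from the config is pinned here */
        SSL_CTX_set_min_proto_version(ctx, TLS1_2_VERSION);
        SSL_CTX_set_max_proto_version(ctx, TLS1_3_VERSION);
        /* mirror of "no-tls-tickets" on a server line */
        SSL_CTX_set_options(ctx, SSL_OP_NO_TICKET);
    #if defined(SSL_CTX_set1_curves_list)
        SSL_CTX_set1_curves_list(ctx, "X25519:P-256");
    #endif
        return ctx;
    }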
*/ + errcode |= ssl_sock_prep_ctx_and_inst(bind_conf, NULL, bind_conf->initial_ctx, NULL, &errmsg); + } + if (bind_conf->default_ctx) { + errcode |= ssl_sock_prep_ctx_and_inst(bind_conf, bind_conf->default_ssl_conf, bind_conf->default_ctx, bind_conf->default_inst, &errmsg); + } + + node = ebmb_first(&bind_conf->sni_ctx); + while (node) { + sni = ebmb_entry(node, struct sni_ctx, name); + if (!sni->order && sni->ctx != bind_conf->default_ctx) { + /* only initialize the CTX on its first occurrence and + if it is not the default_ctx */ + errcode |= ssl_sock_prep_ctx_and_inst(bind_conf, sni->conf, sni->ctx, sni->ckch_inst, &errmsg); + } + node = ebmb_next(node); + } + + node = ebmb_first(&bind_conf->sni_w_ctx); + while (node) { + sni = ebmb_entry(node, struct sni_ctx, name); + if (!sni->order && sni->ctx != bind_conf->default_ctx) { + /* only initialize the CTX on its first occurrence and + if it is not the default_ctx */ + errcode |= ssl_sock_prep_ctx_and_inst(bind_conf, sni->conf, sni->ctx, sni->ckch_inst, &errmsg); + } + node = ebmb_next(node); + } + + if (errcode & ERR_WARN) { + ha_warning("%s", errmsg); + } else if (errcode & ERR_CODE) { + ha_alert("%s", errmsg); + err++; + } + + free(errmsg); + return err; +} + +/* Prepares all the contexts for a bind_conf and allocates the shared SSL + * context if needed. Returns < 0 on error, 0 on success. The warnings and + * alerts are directly emitted since the rest of the stack does it below. + */ +int ssl_sock_prepare_bind_conf(struct bind_conf *bind_conf) +{ + struct proxy *px = bind_conf->frontend; + int alloc_ctx; + int err; + + if (!(bind_conf->options & BC_O_USE_SSL)) { + if (bind_conf->default_ctx) { + ha_warning("Proxy '%s': A certificate was specified but SSL was not enabled on bind '%s' at [%s:%d] (use 'ssl').\n", + px->id, bind_conf->arg, bind_conf->file, bind_conf->line); + } + return 0; + } + if (!bind_conf->default_ctx) { + if (bind_conf->strict_sni && !(bind_conf->options & BC_O_GENERATE_CERTS)) { + ha_warning("Proxy '%s': no SSL certificate specified for bind '%s' at [%s:%d], ssl connections will fail (use 'crt').\n", + px->id, bind_conf->arg, bind_conf->file, bind_conf->line); + } + else { + ha_alert("Proxy '%s': no SSL certificate specified for bind '%s' at [%s:%d] (use 'crt').\n", + px->id, bind_conf->arg, bind_conf->file, bind_conf->line); + return -1; + } + } + if (!ssl_shctx && global.tune.sslcachesize) { + alloc_ctx = shctx_init(&ssl_shctx, global.tune.sslcachesize, + sizeof(struct sh_ssl_sess_hdr) + SHSESS_BLOCK_MIN_SIZE, -1, + sizeof(*sh_ssl_sess_tree)); + if (alloc_ctx <= 0) { + if (alloc_ctx == SHCTX_E_INIT_LOCK) + ha_alert("Unable to initialize the lock for the shared SSL session cache. You can retry using the global statement 'tune.ssl.force-private-cache' but it could increase CPU usage due to renegotiations if nbproc > 1.\n"); + else + ha_alert("Unable to allocate SSL session cache.\n"); + return -1; + } + /* free block callback */ + ssl_shctx->free_block = sh_ssl_sess_free_blocks; + /* init the root tree within the extra space */ + sh_ssl_sess_tree = (void *)ssl_shctx + sizeof(struct shared_context); + *sh_ssl_sess_tree = EB_ROOT_UNIQUE; + } + err = 0; + /* initialize all certificate contexts */ + err += ssl_sock_prepare_all_ctx(bind_conf); + + /* initialize CA variables if the certificates generation is enabled */ + err += ssl_sock_load_ca(bind_conf); + + return -err; +} + +/* release ssl context allocated for servers. 
Most of the fields freed here
+ * must also be allocated in srv_ssl_settings_cpy() */
+void ssl_sock_free_srv_ctx(struct server *srv)
+{
+#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation
+	ha_free(&srv->ssl_ctx.alpn_str);
+#endif
+#ifdef OPENSSL_NPN_NEGOTIATED
+	ha_free(&srv->ssl_ctx.npn_str);
+#endif
+	if (srv->ssl_ctx.reused_sess) {
+		int i;
+
+		for (i = 0; i < global.nbthread; i++) {
+			ha_free(&srv->ssl_ctx.reused_sess[i].ptr);
+			ha_free(&srv->ssl_ctx.reused_sess[i].sni);
+		}
+		ha_free(&srv->ssl_ctx.reused_sess);
+	}
+
+	if (srv->ssl_ctx.ctx) {
+		SSL_CTX_free(srv->ssl_ctx.ctx);
+		srv->ssl_ctx.ctx = NULL;
+	}
+
+	ha_free(&srv->ssl_ctx.ca_file);
+	ha_free(&srv->ssl_ctx.crl_file);
+	ha_free(&srv->ssl_ctx.client_crt);
+	ha_free(&srv->ssl_ctx.verify_host);
+#ifdef SSL_CTRL_SET_TLSEXT_HOSTNAME
+	ha_free(&srv->sni_expr);
+	release_sample_expr(srv->ssl_ctx.sni);
+	srv->ssl_ctx.sni = NULL;
+#endif
+	ha_free(&srv->ssl_ctx.ciphers);
+#ifdef HAVE_SSL_CTX_SET_CIPHERSUITES
+	ha_free(&srv->ssl_ctx.ciphersuites);
+#endif
+	/* If there is a certificate we must unlink the ckch instance */
+	ckch_inst_free(srv->ssl_ctx.inst);
+}
+
+/* Walks down the two trees in bind_conf and frees all the certs. The pointer may
+ * be NULL, in which case nothing is done. The default_ctx is nullified too.
+ */
+void ssl_sock_free_all_ctx(struct bind_conf *bind_conf)
+{
+	struct ebmb_node *node, *back;
+	struct sni_ctx *sni;
+
+	node = ebmb_first(&bind_conf->sni_ctx);
+	while (node) {
+		sni = ebmb_entry(node, struct sni_ctx, name);
+		back = ebmb_next(node);
+		ebmb_delete(node);
+		SSL_CTX_free(sni->ctx);
+		LIST_DELETE(&sni->by_ckch_inst);
+		free(sni);
+		node = back;
+	}
+
+	node = ebmb_first(&bind_conf->sni_w_ctx);
+	while (node) {
+		sni = ebmb_entry(node, struct sni_ctx, name);
+		back = ebmb_next(node);
+		ebmb_delete(node);
+		SSL_CTX_free(sni->ctx);
+		LIST_DELETE(&sni->by_ckch_inst);
+		free(sni);
+		node = back;
+	}
+
+	SSL_CTX_free(bind_conf->initial_ctx);
+	bind_conf->initial_ctx = NULL;
+	SSL_CTX_free(bind_conf->default_ctx);
+	bind_conf->default_ctx = NULL;
+	bind_conf->default_inst = NULL;
+	bind_conf->default_ssl_conf = NULL;
+}
+
+
+void ssl_sock_deinit()
+{
+	crtlist_deinit(); /* must be freed before the ckchs */
+	ckch_deinit();
+}
+REGISTER_POST_DEINIT(ssl_sock_deinit);
+
+/* Destroys all the contexts for a bind_conf. This is used during deinit().
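Aside — ssl_sock_free_srv_ctx() above tears down a per-thread array of serialized sessions, one slot per thread, so allocation and release must stay symmetrical. A minimal sketch of that lifecycle follows; the struct and function names are invented for illustration and do not match HAProxy's types.

    /* sketch: a per-thread cache of serialized TLS sessions; each slot
     * holds an i2d-encoded session blob plus the SNI it was built with */
    #include <stdlib.h>

    struct sess_slot {
        unsigned char *ptr;   /* serialized SSL_SESSION, or NULL */
        size_t size;
        char *sni;
    };

    static struct sess_slot *slots_alloc(int nbthread)
    {
        return calloc(nbthread, sizeof(struct sess_slot));
    }

    static void slots_free(struct sess_slot *slots, int nbthread)
    {
        int i;

        if (!slots)
            return;
        for (i = 0; i < nbthread; i++) {
            free(slots[i].ptr);
            free(slots[i].sni);
        }
        free(slots);
    }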
*/ +void ssl_sock_destroy_bind_conf(struct bind_conf *bind_conf) +{ + ssl_sock_free_ca(bind_conf); + ssl_sock_free_all_ctx(bind_conf); + ssl_sock_free_ssl_conf(&bind_conf->ssl_conf); + free(bind_conf->ca_sign_file); + free(bind_conf->ca_sign_pass); + if (bind_conf->keys_ref && !--bind_conf->keys_ref->refcount) { + free(bind_conf->keys_ref->filename); + free(bind_conf->keys_ref->tlskeys); + LIST_DELETE(&bind_conf->keys_ref->list); + free(bind_conf->keys_ref); + } + bind_conf->keys_ref = NULL; + bind_conf->ca_sign_pass = NULL; + bind_conf->ca_sign_file = NULL; +} + +/* Load CA cert file and private key used to generate certificates */ +int +ssl_sock_load_ca(struct bind_conf *bind_conf) +{ + struct proxy *px = bind_conf->frontend; + struct ckch_data *data = NULL; + int ret = 0; + char *err = NULL; + + if (!(bind_conf->options & BC_O_GENERATE_CERTS)) + return ret; + +#if (defined SSL_CTRL_SET_TLSEXT_HOSTNAME && !defined SSL_NO_GENERATE_CERTIFICATES) + if (global_ssl.ctx_cache) { + ssl_ctx_lru_tree = lru64_new(global_ssl.ctx_cache); + } + ssl_ctx_lru_seed = (unsigned int)time(NULL); + ssl_ctx_serial = now_ms; +#endif + + if (!bind_conf->ca_sign_file) { + ha_alert("Proxy '%s': cannot enable certificate generation, " + "no CA certificate File configured at [%s:%d].\n", + px->id, bind_conf->file, bind_conf->line); + goto failed; + } + + /* Allocate cert structure */ + data = calloc(1, sizeof(*data)); + if (!data) { + ha_alert("Proxy '%s': Failed to read CA certificate file '%s' at [%s:%d]. Chain allocation failure\n", + px->id, bind_conf->ca_sign_file, bind_conf->file, bind_conf->line); + goto failed; + } + + /* Try to parse file */ + if (ssl_sock_load_files_into_ckch(bind_conf->ca_sign_file, data, &err)) { + ha_alert("Proxy '%s': Failed to read CA certificate file '%s' at [%s:%d]. Chain loading failed: %s\n", + px->id, bind_conf->ca_sign_file, bind_conf->file, bind_conf->line, err); + free(err); + goto failed; + } + + /* Fail if missing cert or pkey */ + if ((!data->cert) || (!data->key)) { + ha_alert("Proxy '%s': Failed to read CA certificate file '%s' at [%s:%d]. Chain missing certificate or private key\n", + px->id, bind_conf->ca_sign_file, bind_conf->file, bind_conf->line); + goto failed; + } + + /* Final assignment to bind */ + bind_conf->ca_sign_ckch = data; + return ret; + + failed: + if (data) { + ssl_sock_free_cert_key_and_chain_contents(data); + free(data); + } + + bind_conf->options &= ~BC_O_GENERATE_CERTS; + ret++; + return ret; +} + +/* Release CA cert and private key used to generate certificated */ +void +ssl_sock_free_ca(struct bind_conf *bind_conf) +{ + if (bind_conf->ca_sign_ckch) { + ssl_sock_free_cert_key_and_chain_contents(bind_conf->ca_sign_ckch); + ha_free(&bind_conf->ca_sign_ckch); + } +} + +/* + * Try to allocate the BIO and SSL session objects of <conn> connection with <bio> and + * <ssl> as addresses, <bio_meth> as BIO method and <ssl_ctx> as SSL context inherited settings. + * Connect the allocated BIO to the allocated SSL session. Also set <ctx> as address of custom + * data for the BIO and store <conn> as user data of the SSL session object. + * This is the responsibility of the caller to check the validity of all the pointers passed + * as parameters to this function. + * Return 0 if succeeded, -1 if not. If failed, sets the ->err_code member of <conn> to + * CO_ER_SSL_NO_MEM. + */ +int ssl_bio_and_sess_init(struct connection *conn, SSL_CTX *ssl_ctx, + SSL **ssl, BIO **bio, BIO_METHOD *bio_meth, void *ctx) +{ + int retry = 1; + + retry: + /* Alloc a new SSL session. 
*/ + *ssl = SSL_new(ssl_ctx); + if (!*ssl) { + if (!retry--) + goto err; + + pool_gc(NULL); + goto retry; + } + + *bio = BIO_new(bio_meth); + if (!*bio) { + SSL_free(*ssl); + *ssl = NULL; + if (!retry--) + goto err; + + pool_gc(NULL); + goto retry; + } + + BIO_set_data(*bio, ctx); + SSL_set_bio(*ssl, *bio, *bio); + + /* set connection pointer. */ + if (!SSL_set_ex_data(*ssl, ssl_app_data_index, conn)) { + SSL_free(*ssl); + *ssl = NULL; + if (!retry--) + goto err; + + pool_gc(NULL); + goto retry; + } + + return 0; + + err: + conn->err_code = CO_ER_SSL_NO_MEM; + return -1; +} + +/* This function is called when all the XPRT have been initialized. We can + * now attempt to start the SSL handshake. + */ +static int ssl_sock_start(struct connection *conn, void *xprt_ctx) +{ + struct ssl_sock_ctx *ctx = xprt_ctx; + + if (ctx->xprt->start) { + int ret; + + ret = ctx->xprt->start(conn, ctx->xprt_ctx); + if (ret < 0) + return ret; + } + tasklet_wakeup(ctx->wait_event.tasklet); + + return 0; +} + +/* Similar to increment_actconn() but for SSL connections. */ +int increment_sslconn() +{ + unsigned int count, next_sslconn; + + do { + count = global.sslconns; + if (global.maxsslconn && count >= global.maxsslconn) { + /* maxconn reached */ + next_sslconn = 0; + goto end; + } + + /* try to increment sslconns */ + next_sslconn = count + 1; + } while (!_HA_ATOMIC_CAS(&global.sslconns, &count, next_sslconn) && __ha_cpu_relax()); + + end: + return next_sslconn; +} + +/* + * This function is called if SSL * context is not yet allocated. The function + * is designed to be called before any other data-layer operation and sets the + * handshake flag on the connection. It is safe to call it multiple times. + * It returns 0 on success and -1 in error case. + */ +static int ssl_sock_init(struct connection *conn, void **xprt_ctx) +{ + struct ssl_sock_ctx *ctx; + int next_sslconn = 0; + + /* already initialized */ + if (*xprt_ctx) + return 0; + + ctx = pool_alloc(ssl_sock_ctx_pool); + if (!ctx) { + conn->err_code = CO_ER_SSL_NO_MEM; + return -1; + } + ctx->wait_event.tasklet = tasklet_new(); + if (!ctx->wait_event.tasklet) { + conn->err_code = CO_ER_SSL_NO_MEM; + pool_free(ssl_sock_ctx_pool, ctx); + return -1; + } + ctx->wait_event.tasklet->process = ssl_sock_io_cb; + ctx->wait_event.tasklet->context = ctx; + ctx->wait_event.tasklet->state |= TASK_HEAVY; // assign it to the bulk queue during handshake + ctx->wait_event.events = 0; + ctx->sent_early_data = 0; + ctx->early_buf = BUF_NULL; + ctx->conn = conn; + ctx->subs = NULL; + ctx->xprt_st = 0; + ctx->xprt_ctx = NULL; + ctx->error_code = 0; + + next_sslconn = increment_sslconn(); + if (!next_sslconn) { + conn->err_code = CO_ER_SSL_TOO_MANY; + goto err; + } + + /* Only work with sockets for now, this should be adapted when we'll + * add QUIC support. + */ + ctx->xprt = xprt_get(XPRT_RAW); + if (ctx->xprt->init) { + if (ctx->xprt->init(conn, &ctx->xprt_ctx) != 0) + goto err; + } + + /* If it is in client mode initiate SSL session + in connect state otherwise accept state */ + if (objt_server(conn->target)) { + struct server *srv = __objt_server(conn->target); + + if (ssl_bio_and_sess_init(conn, srv->ssl_ctx.ctx, + &ctx->ssl, &ctx->bio, ha_meth, ctx) == -1) + goto err; + + SSL_set_connect_state(ctx->ssl); + HA_RWLOCK_RDLOCK(SSL_SERVER_LOCK, &srv->ssl_ctx.lock); + if (srv->ssl_ctx.reused_sess[tid].ptr) { + /* let's recreate a session from (ptr,size) and assign + * it to ctx->ssl. 
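Aside — increment_sslconn() above is a classic bounded atomic increment: read, test the ceiling, then CAS until it sticks, so no slot past maxsslconn is ever taken even under contention. Here is the same pattern written with portable C11 atomics instead of HAProxy's _HA_ATOMIC_CAS macro; the function name is illustrative.

    /* sketch: increment *ctr atomically unless it already reached limit;
     * returns the new value on success, 0 when the limit is hit */
    #include <stdatomic.h>

    static unsigned int bounded_inc(_Atomic unsigned int *ctr, unsigned int limit)
    {
        unsigned int cur = atomic_load(ctr);

        do {
            if (limit && cur >= limit)
                return 0;            /* limit reached, don't take a slot */
            /* on failure, cur is reloaded with the current value */
        } while (!atomic_compare_exchange_weak(ctr, &cur, cur + 1));
        return cur + 1;
    }

Note the failure path in ssl_sock_init() decrements the counter again, which is why the function tracks whether next_sslconn was actually taken.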
Its refcount will be updated by the + * creation and by the assignment, so after assigning + * it or failing to, we must always free it to decrement + * the refcount. + */ + const unsigned char *ptr = srv->ssl_ctx.reused_sess[tid].ptr; + SSL_SESSION *sess = d2i_SSL_SESSION(NULL, &ptr, srv->ssl_ctx.reused_sess[tid].size); + + if (sess && !SSL_set_session(ctx->ssl, sess)) { + uint old_tid = HA_ATOMIC_LOAD(&srv->ssl_ctx.last_ssl_sess_tid); // 0=none, >0 = tid + 1 + if (old_tid == tid + 1) + HA_ATOMIC_CAS(&srv->ssl_ctx.last_ssl_sess_tid, &old_tid, 0); // no more valid + SSL_SESSION_free(sess); + HA_RWLOCK_WRLOCK(SSL_SERVER_LOCK, &srv->ssl_ctx.reused_sess[tid].sess_lock); + ha_free(&srv->ssl_ctx.reused_sess[tid].ptr); + HA_RWLOCK_WRTORD(SSL_SERVER_LOCK, &srv->ssl_ctx.reused_sess[tid].sess_lock); + if (srv->ssl_ctx.reused_sess[tid].sni) + SSL_set_tlsext_host_name(ctx->ssl, srv->ssl_ctx.reused_sess[tid].sni); + HA_RWLOCK_RDUNLOCK(SSL_SERVER_LOCK, &srv->ssl_ctx.reused_sess[tid].sess_lock); + } else if (sess) { + /* already assigned, not needed anymore */ + SSL_SESSION_free(sess); + HA_RWLOCK_RDLOCK(SSL_SERVER_LOCK, &srv->ssl_ctx.reused_sess[tid].sess_lock); + if (srv->ssl_ctx.reused_sess[tid].sni) + SSL_set_tlsext_host_name(ctx->ssl, srv->ssl_ctx.reused_sess[tid].sni); + HA_RWLOCK_RDUNLOCK(SSL_SERVER_LOCK, &srv->ssl_ctx.reused_sess[tid].sess_lock); + } + } else { + /* No session available yet, let's see if we can pick one + * from another thread. If old_tid is non-null, it designates + * the index of a recently updated thread that might still have + * a usable session. All threads are collectively responsible + * for resetting the index if it fails. + */ + const unsigned char *ptr; + SSL_SESSION *sess; + uint old_tid = HA_ATOMIC_LOAD(&srv->ssl_ctx.last_ssl_sess_tid); // 0=none, >0 = tid + 1 + + if (old_tid) { + HA_RWLOCK_RDLOCK(SSL_SERVER_LOCK, &srv->ssl_ctx.reused_sess[old_tid-1].sess_lock); + + ptr = srv->ssl_ctx.reused_sess[old_tid-1].ptr; + if (ptr) { + sess = d2i_SSL_SESSION(NULL, &ptr, srv->ssl_ctx.reused_sess[old_tid-1].size); + if (sess) { + if (!SSL_set_session(ctx->ssl, sess)) + HA_ATOMIC_CAS(&srv->ssl_ctx.last_ssl_sess_tid, &old_tid, 0); // no more valid + SSL_SESSION_free(sess); + } + } + + if (srv->ssl_ctx.reused_sess[old_tid-1].sni) + SSL_set_tlsext_host_name(ctx->ssl, srv->ssl_ctx.reused_sess[old_tid-1].sni); + + HA_RWLOCK_RDUNLOCK(SSL_SERVER_LOCK, &srv->ssl_ctx.reused_sess[old_tid-1].sess_lock); + } + } + HA_RWLOCK_RDUNLOCK(SSL_SERVER_LOCK, &srv->ssl_ctx.lock); + + /* leave init state and start handshake */ + conn->flags |= CO_FL_SSL_WAIT_HS | CO_FL_WAIT_L6_CONN; + + _HA_ATOMIC_INC(&global.totalsslconns); + *xprt_ctx = ctx; + return 0; + } + else if (objt_listener(conn->target)) { + struct bind_conf *bc = __objt_listener(conn->target)->bind_conf; + + if (ssl_bio_and_sess_init(conn, bc->initial_ctx, + &ctx->ssl, &ctx->bio, ha_meth, ctx) == -1) + goto err; + +#ifdef SSL_READ_EARLY_DATA_SUCCESS + if (bc->ssl_conf.early_data) { + b_alloc(&ctx->early_buf); + SSL_set_max_early_data(ctx->ssl, + /* Only allow early data if we managed to allocate + * a buffer. + */ + (!b_is_null(&ctx->early_buf)) ? 
+ global.tune.bufsize - global.tune.maxrewrite : 0); + } +#endif + + SSL_set_accept_state(ctx->ssl); + + /* leave init state and start handshake */ + conn->flags |= CO_FL_SSL_WAIT_HS | CO_FL_WAIT_L6_CONN; +#ifdef SSL_READ_EARLY_DATA_SUCCESS + if (bc->ssl_conf.early_data) + conn->flags |= CO_FL_EARLY_SSL_HS; +#endif + + _HA_ATOMIC_INC(&global.totalsslconns); + *xprt_ctx = ctx; + return 0; + } + /* don't know how to handle such a target */ + conn->err_code = CO_ER_SSL_NO_TARGET; +err: + if (next_sslconn) + _HA_ATOMIC_DEC(&global.sslconns); + if (ctx && ctx->wait_event.tasklet) + tasklet_free(ctx->wait_event.tasklet); + pool_free(ssl_sock_ctx_pool, ctx); + return -1; +} + + +/* This is the callback which is used when an SSL handshake is pending. It + * updates the FD status if it wants some polling before being called again. + * It returns 0 if it fails in a fatal way or needs to poll to go further, + * otherwise it returns non-zero and removes itself from the connection's + * flags (the bit is provided in <flag> by the caller). + */ +static int ssl_sock_handshake(struct connection *conn, unsigned int flag) +{ + struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn); + int ret; + struct ssl_counters *counters = NULL; + struct ssl_counters *counters_px = NULL; + struct listener *li; + struct server *srv; + socklen_t lskerr; + int skerr; + + + if (!conn_ctrl_ready(conn)) + return 0; + + /* get counters */ + switch (obj_type(conn->target)) { + case OBJ_TYPE_LISTENER: + li = __objt_listener(conn->target); + counters = EXTRA_COUNTERS_GET(li->extra_counters, &ssl_stats_module); + counters_px = EXTRA_COUNTERS_GET(li->bind_conf->frontend->extra_counters_fe, + &ssl_stats_module); + break; + + case OBJ_TYPE_SERVER: + srv = __objt_server(conn->target); + counters = EXTRA_COUNTERS_GET(srv->extra_counters, &ssl_stats_module); + counters_px = EXTRA_COUNTERS_GET(srv->proxy->extra_counters_be, + &ssl_stats_module); + break; + + default: + break; + } + + if (!ctx) + goto out_error; + + /* don't start calculating a handshake on a dead connection */ + if (conn->flags & (CO_FL_ERROR | CO_FL_SOCK_RD_SH | CO_FL_SOCK_WR_SH)) + goto out_error; + + /* FIXME/WT: for now we don't have a clear way to inspect the connection + * status from the lower layers, so let's check the FD directly. Ideally + * the xprt layers should provide some status indicating their knowledge + * of shutdowns or error. + */ + BUG_ON(conn->flags & CO_FL_FDLESS); + + skerr = 0; + lskerr = sizeof(skerr); + if ((getsockopt(conn->handle.fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr) < 0) || + skerr != 0) + goto out_error; + +#ifdef SSL_READ_EARLY_DATA_SUCCESS + /* + * Check if we have early data. If we do, we have to read them + * before SSL_do_handshake() is called, And there's no way to + * detect early data, except to try to read them + */ + if (conn->flags & CO_FL_EARLY_SSL_HS) { + size_t read_data = 0; + + while (1) { + ret = SSL_read_early_data(ctx->ssl, + b_tail(&ctx->early_buf), b_room(&ctx->early_buf), + &read_data); + if (ret == SSL_READ_EARLY_DATA_ERROR) + goto check_error; + if (read_data > 0) { + conn->flags |= CO_FL_EARLY_DATA; + b_add(&ctx->early_buf, read_data); + } + if (ret == SSL_READ_EARLY_DATA_FINISH) { + conn->flags &= ~CO_FL_EARLY_SSL_HS; + if (!b_data(&ctx->early_buf)) + b_free(&ctx->early_buf); + break; + } + } + } +#endif + /* If we use SSL_do_handshake to process a reneg initiated by + * the remote peer, it sometimes returns SSL_ERROR_SSL. 
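Aside — the early-data branch above has to drain 0-RTT bytes with SSL_read_early_data() before SSL_do_handshake() may run. A condensed, hedged sketch of that loop on a flat buffer follows; the buffer handling is simplified and the name drain_early_data is invented.

    /* sketch: read all pending TLS 1.3 early data into buf; returns 0 when
     * finished, 1 when out of room, -1 on error (check SSL_get_error) */
    #include <openssl/ssl.h>

    static int drain_early_data(SSL *ssl, unsigned char *buf, size_t room, size_t *total)
    {
        size_t rd = 0;

        *total = 0;
        for (;;) {
            int rc;

            if (*total == room)
                return 1;                 /* out of room; caller must grow */
            rc = SSL_read_early_data(ssl, buf + *total, room - *total, &rd);
            if (rc == SSL_READ_EARLY_DATA_ERROR)
                return -1;
            *total += rd;
            if (rc == SSL_READ_EARLY_DATA_FINISH)
                return 0;                 /* all early data consumed */
        }
    }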
+ * Usually SSL_write and SSL_read are used and process implicitly + * the reneg handshake. + * Here we use SSL_peek as a workaround for reneg. + */ + if (!(conn->flags & CO_FL_WAIT_L6_CONN) && SSL_renegotiate_pending(ctx->ssl)) { + char c; + + ret = SSL_peek(ctx->ssl, &c, 1); + if (ret <= 0) { + /* handshake may have not been completed, let's find why */ + ret = SSL_get_error(ctx->ssl, ret); + + if (ret == SSL_ERROR_WANT_WRITE) { + /* SSL handshake needs to write, L4 connection may not be ready */ + if (!(ctx->wait_event.events & SUB_RETRY_SEND)) + ctx->xprt->subscribe(conn, ctx->xprt_ctx, SUB_RETRY_SEND, &ctx->wait_event); + return 0; + } + else if (ret == SSL_ERROR_WANT_READ) { + /* handshake may have been completed but we have + * no more data to read. + */ + if (!SSL_renegotiate_pending(ctx->ssl)) { + ret = 1; + goto reneg_ok; + } + /* SSL handshake needs to read, L4 connection is ready */ + if (!(ctx->wait_event.events & SUB_RETRY_RECV)) + ctx->xprt->subscribe(conn, ctx->xprt_ctx, SUB_RETRY_RECV, &ctx->wait_event); + return 0; + } +#ifdef SSL_MODE_ASYNC + else if (ret == SSL_ERROR_WANT_ASYNC) { + ssl_async_process_fds(ctx); + return 0; + } +#endif + else if (ret == SSL_ERROR_SYSCALL) { + /* if errno is null, then connection was successfully established */ + if (!errno && conn->flags & CO_FL_WAIT_L4_CONN) + conn->flags &= ~CO_FL_WAIT_L4_CONN; + if (!conn->err_code) { +#if defined(OPENSSL_IS_BORINGSSL) || defined(LIBRESSL_VERSION_NUMBER) + /* do not handle empty handshakes in BoringSSL or LibreSSL */ + conn->err_code = CO_ER_SSL_HANDSHAKE; +#else + int empty_handshake; +#if (HA_OPENSSL_VERSION_NUMBER >= 0x1010000fL) + /* use SSL_get_state() in OpenSSL >= 1.1.0; SSL_state() is broken */ + OSSL_HANDSHAKE_STATE state = SSL_get_state((SSL *)ctx->ssl); + empty_handshake = state == TLS_ST_BEFORE; +#else + /* access packet_length directly in OpenSSL <= 1.0.2; SSL_state() is broken */ + empty_handshake = !ctx->ssl->packet_length; +#endif + if (empty_handshake) { + if (!errno) { + if (ctx->xprt_st & SSL_SOCK_RECV_HEARTBEAT) + conn->err_code = CO_ER_SSL_HANDSHAKE_HB; + else + conn->err_code = CO_ER_SSL_EMPTY; + } + else { + if (ctx->xprt_st & SSL_SOCK_RECV_HEARTBEAT) + conn->err_code = CO_ER_SSL_HANDSHAKE_HB; + else + conn->err_code = CO_ER_SSL_ABORT; + } + } + else { + if (ctx->xprt_st & SSL_SOCK_RECV_HEARTBEAT) + conn->err_code = CO_ER_SSL_HANDSHAKE_HB; + else + conn->err_code = CO_ER_SSL_HANDSHAKE; + } +#endif /* BoringSSL or LibreSSL */ + } + goto out_error; + } + else { + /* Fail on all other handshake errors */ + /* Note: OpenSSL may leave unread bytes in the socket's + * buffer, causing an RST to be emitted upon close() on + * TCP sockets. We first try to drain possibly pending + * data to avoid this as much as possible. + */ + conn_ctrl_drain(conn); + if (!conn->err_code) + conn->err_code = (ctx->xprt_st & SSL_SOCK_RECV_HEARTBEAT) ? 
+ CO_ER_SSL_KILLED_HB : CO_ER_SSL_HANDSHAKE; + goto out_error; + } + } + /* read some data: consider handshake completed */ + goto reneg_ok; + } + ret = SSL_do_handshake(ctx->ssl); +check_error: + if (ret != 1) { + /* handshake did not complete, let's find why */ + ret = SSL_get_error(ctx->ssl, ret); + + if (!ctx->error_code) + ctx->error_code = ERR_peek_error(); + + if (ret == SSL_ERROR_WANT_WRITE) { + /* SSL handshake needs to write, L4 connection may not be ready */ + if (!(ctx->wait_event.events & SUB_RETRY_SEND)) + ctx->xprt->subscribe(conn, ctx->xprt_ctx, SUB_RETRY_SEND, &ctx->wait_event); + return 0; + } + else if (ret == SSL_ERROR_WANT_READ) { + /* SSL handshake needs to read, L4 connection is ready */ + if (!(ctx->wait_event.events & SUB_RETRY_RECV)) + ctx->xprt->subscribe(conn, ctx->xprt_ctx, + SUB_RETRY_RECV, &ctx->wait_event); + return 0; + } +#ifdef SSL_MODE_ASYNC + else if (ret == SSL_ERROR_WANT_ASYNC) { + ssl_async_process_fds(ctx); + return 0; + } +#endif + else if (ret == SSL_ERROR_SYSCALL) { + /* if errno is null, then connection was successfully established */ + if (!errno && conn->flags & CO_FL_WAIT_L4_CONN) + conn->flags &= ~CO_FL_WAIT_L4_CONN; + if (!conn->err_code) { +#if defined(OPENSSL_IS_BORINGSSL) || defined(LIBRESSL_VERSION_NUMBER) + /* do not handle empty handshakes in BoringSSL or LibreSSL */ + conn->err_code = CO_ER_SSL_HANDSHAKE; +#else + int empty_handshake; +#if (HA_OPENSSL_VERSION_NUMBER >= 0x1010000fL) + /* use SSL_get_state() in OpenSSL >= 1.1.0; SSL_state() is broken */ + OSSL_HANDSHAKE_STATE state = SSL_get_state(ctx->ssl); + empty_handshake = state == TLS_ST_BEFORE; +#else + /* access packet_length directly in OpenSSL <= 1.0.2; SSL_state() is broken */ + empty_handshake = !ctx->ssl->packet_length; +#endif + if (empty_handshake) { + if (!errno) { + if (ctx->xprt_st & SSL_SOCK_RECV_HEARTBEAT) + conn->err_code = CO_ER_SSL_HANDSHAKE_HB; + else + conn->err_code = CO_ER_SSL_EMPTY; + } + else { + if (ctx->xprt_st & SSL_SOCK_RECV_HEARTBEAT) + conn->err_code = CO_ER_SSL_HANDSHAKE_HB; + else + conn->err_code = CO_ER_SSL_ABORT; + } + } + else { + if (ctx->xprt_st & SSL_SOCK_RECV_HEARTBEAT) + conn->err_code = CO_ER_SSL_HANDSHAKE_HB; + else + conn->err_code = CO_ER_SSL_HANDSHAKE; + } +#endif /* BoringSSL or LibreSSL */ + } + goto out_error; + + } else if (ret == SSL_ERROR_ZERO_RETURN) { + /* The peer has closed the SSL session for writing by + * sending a close_notify alert */ + conn_ctrl_drain(conn); + conn->err_code = CO_ER_SSL_EMPTY; + goto out_error; + + } + else { + /* Fail on all other handshake errors */ + /* Note: OpenSSL may leave unread bytes in the socket's + * buffer, causing an RST to be emitted upon close() on + * TCP sockets. We first try to drain possibly pending + * data to avoid this as much as possible. + */ + conn_ctrl_drain(conn); + if (!conn->err_code) + conn->err_code = (ctx->xprt_st & SSL_SOCK_RECV_HEARTBEAT) ? + CO_ER_SSL_KILLED_HB : CO_ER_SSL_HANDSHAKE; + goto out_error; + } + } +#ifdef SSL_READ_EARLY_DATA_SUCCESS + else { + /* + * If the server refused the early data, we have to send a + * 425 to the client, as we no longer have the data to sent + * them again. + */ + if ((conn->flags & CO_FL_EARLY_DATA) && (objt_server(conn->target))) { + if (SSL_get_early_data_status(ctx->ssl) == SSL_EARLY_DATA_REJECTED) { + conn->err_code = CO_ER_SSL_EARLY_FAILED; + goto out_error; + } + } + } +#endif + + +reneg_ok: + +#ifdef SSL_MODE_ASYNC + /* ASYNC engine API doesn't support moving read/write + * buffers. 
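Aside — boiled down, the long error ladder above is a triage of SSL_get_error() after SSL_do_handshake(): retry on WANT_READ/WANT_WRITE by subscribing for the matching I/O event, treat everything else as fatal. A compact sketch of that shape follows; the hs_res enum and function name are illustrative, not HAProxy's.

    /* sketch: one handshake step and its triage */
    #include <openssl/ssl.h>
    #include <openssl/err.h>

    enum hs_res { HS_DONE, HS_WANT_RD, HS_WANT_WR, HS_FAILED };

    static enum hs_res handshake_step(SSL *ssl)
    {
        int rc = SSL_do_handshake(ssl);

        if (rc == 1)
            return HS_DONE;
        switch (SSL_get_error(ssl, rc)) {
        case SSL_ERROR_WANT_READ:   return HS_WANT_RD;  /* subscribe for recv */
        case SSL_ERROR_WANT_WRITE:  return HS_WANT_WR;  /* subscribe for send */
        case SSL_ERROR_SYSCALL:     /* socket-level error; inspect errno */
        case SSL_ERROR_ZERO_RETURN: /* peer closed mid-handshake */
        default:
            ERR_clear_error();      /* don't leave stale errors on the stack */
            return HS_FAILED;
        }
    }

The real code refines HS_FAILED further (empty handshake, heartbeat, killed connection) purely to report a more precise err_code.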
So we disable ASYNC mode right after + * the handshake to avoid buffer overflow. + */ + if (global_ssl.async) + SSL_clear_mode(ctx->ssl, SSL_MODE_ASYNC); +#endif + /* Handshake succeeded */ + if (!SSL_session_reused(ctx->ssl)) { + if (objt_server(conn->target)) { + update_freq_ctr(&global.ssl_be_keys_per_sec, 1); + if (global.ssl_be_keys_per_sec.curr_ctr > global.ssl_be_keys_max) + global.ssl_be_keys_max = global.ssl_be_keys_per_sec.curr_ctr; + } + else { + update_freq_ctr(&global.ssl_fe_keys_per_sec, 1); + if (global.ssl_fe_keys_per_sec.curr_ctr > global.ssl_fe_keys_max) + global.ssl_fe_keys_max = global.ssl_fe_keys_per_sec.curr_ctr; + } + + if (counters) { + HA_ATOMIC_INC(&counters->sess); + HA_ATOMIC_INC(&counters_px->sess); + } + } + else if (counters) { + HA_ATOMIC_INC(&counters->reused_sess); + HA_ATOMIC_INC(&counters_px->reused_sess); + } + + /* The connection is now established at both layers, it's time to leave */ + conn->flags &= ~(flag | CO_FL_WAIT_L4_CONN | CO_FL_WAIT_L6_CONN); + return 1; + + out_error: + /* Clear openssl global errors stack */ + ssl_sock_dump_errors(conn, NULL); + ERR_clear_error(); + + /* free resumed session if exists */ + if (objt_server(conn->target)) { + struct server *s = __objt_server(conn->target); + /* RWLOCK: only rdlock the SSL cache even when writing in it because there is + * one cache per thread, it only prevents to flush it from the CLI in + * another thread */ + + HA_RWLOCK_RDLOCK(SSL_SERVER_LOCK, &s->ssl_ctx.lock); + if (s->ssl_ctx.reused_sess[tid].ptr) + ha_free(&s->ssl_ctx.reused_sess[tid].ptr); + HA_RWLOCK_RDUNLOCK(SSL_SERVER_LOCK, &s->ssl_ctx.lock); + } + + if (counters) { + HA_ATOMIC_INC(&counters->failed_handshake); + HA_ATOMIC_INC(&counters_px->failed_handshake); + } + + /* Fail on all other handshake errors */ + conn->flags |= CO_FL_ERROR; + if (!conn->err_code) + conn->err_code = CO_ER_SSL_HANDSHAKE; + return 0; +} + +/* Called from the upper layer, to subscribe <es> to events <event_type>. The + * event subscriber <es> is not allowed to change from a previous call as long + * as at least one event is still subscribed. The <event_type> must only be a + * combination of SUB_RETRY_RECV and SUB_RETRY_SEND. It always returns 0, + * unless the transport layer was already released. + */ +static int ssl_subscribe(struct connection *conn, void *xprt_ctx, int event_type, struct wait_event *es) +{ + struct ssl_sock_ctx *ctx = xprt_ctx; + + if (!ctx) + return -1; + + BUG_ON(event_type & ~(SUB_RETRY_SEND|SUB_RETRY_RECV)); + BUG_ON(ctx->subs && ctx->subs != es); + + ctx->subs = es; + es->events |= event_type; + + /* we may have to subscribe to lower layers for new events */ + event_type &= ~ctx->wait_event.events; + if (event_type && !(conn->flags & CO_FL_SSL_WAIT_HS)) + ctx->xprt->subscribe(conn, ctx->xprt_ctx, event_type, &ctx->wait_event); + return 0; +} + +/* Called from the upper layer, to unsubscribe <es> from events <event_type>. + * The <es> pointer is not allowed to differ from the one passed to the + * subscribe() call. It always returns zero. 
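Aside — the accounting right after the handshake hinges on SSL_session_reused(): a full handshake costs an asymmetric key computation (hence the keys-per-second rate counters), a resumption does not. A minimal sketch, with stand-in counters instead of HAProxy's stats module:

    /* sketch: classify a completed handshake as new vs resumed */
    #include <openssl/ssl.h>

    static void account_handshake(SSL *ssl, unsigned *new_sess, unsigned *reused)
    {
        if (SSL_session_reused(ssl))
            (*reused)++;    /* resumption: no new key computation */
        else
            (*new_sess)++;  /* full handshake: feeds the keys/sec rate */
    }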
+ */ +static int ssl_unsubscribe(struct connection *conn, void *xprt_ctx, int event_type, struct wait_event *es) +{ + struct ssl_sock_ctx *ctx = xprt_ctx; + + BUG_ON(event_type & ~(SUB_RETRY_SEND|SUB_RETRY_RECV)); + BUG_ON(ctx->subs && ctx->subs != es); + + es->events &= ~event_type; + if (!es->events) + ctx->subs = NULL; + + /* If we subscribed, and we're not doing the handshake, + * then we subscribed because the upper layer asked for it, + * as the upper layer is no longer interested, we can + * unsubscribe too. + */ + event_type &= ctx->wait_event.events; + if (event_type && !(ctx->conn->flags & CO_FL_SSL_WAIT_HS)) + conn_unsubscribe(conn, ctx->xprt_ctx, event_type, &ctx->wait_event); + + return 0; +} + +/* The connection has been taken over, so destroy the old tasklet and create + * a new one. The original thread ID must be passed into orig_tid + * It should be called with the takeover lock for the old thread held. + * Returns 0 on success, and -1 on failure + */ +static int ssl_takeover(struct connection *conn, void *xprt_ctx, int orig_tid) +{ + struct ssl_sock_ctx *ctx = xprt_ctx; + struct tasklet *tl = tasklet_new(); + + if (!tl) + return -1; + + ctx->wait_event.tasklet->context = NULL; + tasklet_wakeup_on(ctx->wait_event.tasklet, orig_tid); + ctx->wait_event.tasklet = tl; + ctx->wait_event.tasklet->process = ssl_sock_io_cb; + ctx->wait_event.tasklet->context = ctx; + return 0; +} + +/* notify the next xprt that the connection is about to become idle and that it + * may be stolen at any time after the function returns and that any tasklet in + * the chain must be careful before dereferencing its context. + */ +static void ssl_set_idle(struct connection *conn, void *xprt_ctx) +{ + struct ssl_sock_ctx *ctx = xprt_ctx; + + if (!ctx || !ctx->wait_event.tasklet) + return; + + HA_ATOMIC_OR(&ctx->wait_event.tasklet->state, TASK_F_USR1); + if (ctx->xprt) + xprt_set_idle(conn, ctx->xprt, ctx->xprt_ctx); +} + +/* notify the next xprt that the connection is not idle anymore and that it may + * not be stolen before the next xprt_set_idle(). + */ +static void ssl_set_used(struct connection *conn, void *xprt_ctx) +{ + struct ssl_sock_ctx *ctx = xprt_ctx; + + if (!ctx || !ctx->wait_event.tasklet) + return; + + HA_ATOMIC_OR(&ctx->wait_event.tasklet->state, TASK_F_USR1); + if (ctx->xprt) + xprt_set_used(conn, ctx->xprt, ctx->xprt_ctx); +} + +/* Use the provided XPRT as an underlying XPRT, and provide the old one. + * Returns 0 on success, and non-zero on failure. + */ +static int ssl_add_xprt(struct connection *conn, void *xprt_ctx, void *toadd_ctx, const struct xprt_ops *toadd_ops, void **oldxprt_ctx, const struct xprt_ops **oldxprt_ops) +{ + struct ssl_sock_ctx *ctx = xprt_ctx; + + if (oldxprt_ops != NULL) + *oldxprt_ops = ctx->xprt; + if (oldxprt_ctx != NULL) + *oldxprt_ctx = ctx->xprt_ctx; + ctx->xprt = toadd_ops; + ctx->xprt_ctx = toadd_ctx; + return 0; +} + +/* Remove the specified xprt. If if it our underlying XPRT, remove it and + * return 0, otherwise just call the remove_xprt method from the underlying + * XPRT. 
+ */ +static int ssl_remove_xprt(struct connection *conn, void *xprt_ctx, void *toremove_ctx, const struct xprt_ops *newops, void *newctx) +{ + struct ssl_sock_ctx *ctx = xprt_ctx; + + if (ctx->xprt_ctx == toremove_ctx) { + ctx->xprt_ctx = newctx; + ctx->xprt = newops; + return 0; + } + return (ctx->xprt->remove_xprt(conn, ctx->xprt_ctx, toremove_ctx, newops, newctx)); +} + +struct task *ssl_sock_io_cb(struct task *t, void *context, unsigned int state) +{ + struct tasklet *tl = (struct tasklet *)t; + struct ssl_sock_ctx *ctx = context; + struct connection *conn; + int conn_in_list; + int ret = 0; + + if (state & TASK_F_USR1) { + /* the tasklet was idling on an idle connection, it might have + * been stolen, let's be careful! + */ + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + if (tl->context == NULL) { + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + tasklet_free(tl); + return NULL; + } + conn = ctx->conn; + conn_in_list = conn->flags & CO_FL_LIST_MASK; + if (conn_in_list) + conn_delete_from_tree(conn); + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + } else { + conn = ctx->conn; + conn_in_list = 0; + } + + /* First if we're doing an handshake, try that */ + if (ctx->conn->flags & CO_FL_SSL_WAIT_HS) { + ssl_sock_handshake(ctx->conn, CO_FL_SSL_WAIT_HS); + if (!(ctx->conn->flags & CO_FL_SSL_WAIT_HS)) { + /* handshake completed, leave the bulk queue */ + _HA_ATOMIC_AND(&tl->state, ~TASK_HEAVY); + } + } + /* If we had an error, or the handshake is done and I/O is available, + * let the upper layer know. + * If no mux was set up yet, then call conn_create_mux() + * we can't be sure conn_fd_handler() will be called again. + */ + if ((ctx->conn->flags & CO_FL_ERROR) || + !(ctx->conn->flags & CO_FL_SSL_WAIT_HS)) { + int woke = 0; + + /* On error, wake any waiter */ + if (ctx->subs) { + tasklet_wakeup(ctx->subs->tasklet); + ctx->subs->events = 0; + woke = 1; + ctx->subs = NULL; + } + + /* If we're the first xprt for the connection, let the + * upper layers know. If we have no mux, create it, + * and once we have a mux, call its wake method if we didn't + * woke a tasklet already. + */ + if (ctx->conn->xprt_ctx == ctx) { + if (!ctx->conn->mux) + ret = conn_create_mux(ctx->conn); + if (ret >= 0 && !woke && ctx->conn->mux && ctx->conn->mux->wake) + ret = ctx->conn->mux->wake(ctx->conn); + goto leave; + } + } +#ifdef SSL_READ_EARLY_DATA_SUCCESS + /* If we have early data and somebody wants to receive, let them */ + else if (b_data(&ctx->early_buf) && ctx->subs && + ctx->subs->events & SUB_RETRY_RECV) { + tasklet_wakeup(ctx->subs->tasklet); + ctx->subs->events &= ~SUB_RETRY_RECV; + if (!ctx->subs->events) + ctx->subs = NULL; + } +#endif +leave: + if (!ret && conn_in_list) { + struct server *srv = objt_server(conn->target); + + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + _srv_add_idle(srv, conn, conn_in_list == CO_FL_SAFE_LIST); + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); + } + return t; +} + +/* Receive up to <count> bytes from connection <conn>'s socket and store them + * into buffer <buf>. Only one call to recv() is performed, unless the + * buffer wraps, in which case a second call may be performed. The connection's + * flags are updated with whatever special event is detected (error, read0, + * empty). The caller is responsible for taking care of those events and + * avoiding the call if inappropriate. 
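Aside — ssl_add_xprt()/ssl_remove_xprt() above implement transport stacking as a simple pointer swap: each layer remembers the ops and context of the layer below it. A sketch of that idea with invented types (struct xops, struct layer are not HAProxy names):

    /* sketch: splice a new layer under the current one, returning the old */
    struct xops;

    struct layer {
        const struct xops *below_ops;
        void *below_ctx;
    };

    static void layer_insert(struct layer *l, const struct xops *ops, void *ctx,
                             const struct xops **old_ops, void **old_ctx)
    {
        *old_ops = l->below_ops;
        *old_ctx = l->below_ctx;
        l->below_ops = ops;
        l->below_ctx = ctx;
    }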
The function does not call the + * connection's polling update function, so the caller is responsible for this. + */ +static size_t ssl_sock_to_buf(struct connection *conn, void *xprt_ctx, struct buffer *buf, size_t count, int flags) +{ + struct ssl_sock_ctx *ctx = xprt_ctx; + ssize_t ret; + size_t try, done = 0; + + if (!ctx) + goto out_error; + +#ifdef SSL_READ_EARLY_DATA_SUCCESS + if (b_data(&ctx->early_buf)) { + try = b_contig_space(buf); + if (try > b_data(&ctx->early_buf)) + try = b_data(&ctx->early_buf); + memcpy(b_tail(buf), b_head(&ctx->early_buf), try); + b_add(buf, try); + b_del(&ctx->early_buf, try); + if (b_data(&ctx->early_buf) == 0) + b_free(&ctx->early_buf); + return try; + } +#endif + + if (conn->flags & (CO_FL_WAIT_XPRT | CO_FL_SSL_WAIT_HS)) + /* a handshake was requested */ + return 0; + + /* read the largest possible block. For this, we perform only one call + * to recv() unless the buffer wraps and we exactly fill the first hunk, + * in which case we accept to do it once again. A new attempt is made on + * EINTR too. + */ + while (count > 0) { + + try = b_contig_space(buf); + if (!try) + break; + + if (try > count) + try = count; + + ret = SSL_read(ctx->ssl, b_tail(buf), try); + + if (conn->flags & CO_FL_ERROR) { + /* CO_FL_ERROR may be set by ssl_sock_infocbk */ + goto out_error; + } + if (ret > 0) { + b_add(buf, ret); + done += ret; + count -= ret; + } + else { + ret = SSL_get_error(ctx->ssl, ret); + if (ret == SSL_ERROR_WANT_WRITE) { + /* handshake is running, and it needs to enable write */ + conn->flags |= CO_FL_SSL_WAIT_HS; + ctx->xprt->subscribe(conn, ctx->xprt_ctx, SUB_RETRY_SEND, &ctx->wait_event); +#ifdef SSL_MODE_ASYNC + /* Async mode can be re-enabled, because we're leaving data state.*/ + if (global_ssl.async) + SSL_set_mode(ctx->ssl, SSL_MODE_ASYNC); +#endif + break; + } + else if (ret == SSL_ERROR_WANT_READ) { + if (SSL_renegotiate_pending(ctx->ssl)) { + ctx->xprt->subscribe(conn, ctx->xprt_ctx, + SUB_RETRY_RECV, + &ctx->wait_event); + /* handshake is running, and it may need to re-enable read */ + conn->flags |= CO_FL_SSL_WAIT_HS; +#ifdef SSL_MODE_ASYNC + /* Async mode can be re-enabled, because we're leaving data state.*/ + if (global_ssl.async) + SSL_set_mode(ctx->ssl, SSL_MODE_ASYNC); +#endif + break; + } + break; + } else if (ret == SSL_ERROR_ZERO_RETURN) + goto read0; + else if (ret == SSL_ERROR_SSL) { + struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn); + if (ctx && !ctx->error_code) + ctx->error_code = ERR_peek_error(); + conn->err_code = CO_ERR_SSL_FATAL; + } + /* For SSL_ERROR_SYSCALL, make sure to clear the error + * stack before shutting down the connection for + * reading. */ + if (ret == SSL_ERROR_SYSCALL && (!errno || errno == EAGAIN || errno == EWOULDBLOCK)) + goto clear_ssl_error; + /* otherwise it's a real error */ + goto out_error; + } + } + leave: + return done; + + clear_ssl_error: + /* Clear openssl global errors stack */ + ssl_sock_dump_errors(conn, NULL); + ERR_clear_error(); + read0: + conn_sock_read0(conn); + goto leave; + + out_error: + conn->flags |= CO_FL_ERROR; + /* Clear openssl global errors stack */ + ssl_sock_dump_errors(conn, NULL); + ERR_clear_error(); + goto leave; +} + + +/* Send up to <count> pending bytes from buffer <buf> to connection <conn>'s + * socket. <flags> may contain some CO_SFL_* flags to hint the system about + * other pending data for example, but this flag is ignored at the moment. + * Only one call to send() is performed, unless the buffer wraps, in which case + * a second call may be performed. 
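Aside — without HAProxy's buffer API, the receive loop above reduces to: SSL_read as much as fits, keep going on progress, stop and wait on WANT_READ/WANT_WRITE, and distinguish a clean close from a hard error. A hedged sketch on a flat buffer (ssl_read_some is an invented name):

    /* sketch: returns bytes read, 0 when the caller must wait for an
     * I/O event, -1 on clean close with nothing read or on hard error */
    #include <sys/types.h>
    #include <openssl/ssl.h>

    static ssize_t ssl_read_some(SSL *ssl, unsigned char *buf, size_t room)
    {
        size_t done = 0;

        while (done < room) {
            int rc = SSL_read(ssl, buf + done, room - done);

            if (rc > 0) {
                done += rc;
                continue;
            }
            switch (SSL_get_error(ssl, rc)) {
            case SSL_ERROR_WANT_READ:
            case SSL_ERROR_WANT_WRITE:
                return done;        /* subscribe and come back later */
            case SSL_ERROR_ZERO_RETURN:
                return done ? (ssize_t)done : -1; /* clean close_notify */
            default:
                return -1;          /* fatal; clear the error stack */
            }
        }
        return done;
    }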
The connection's flags are updated with + * whatever special event is detected (error, empty). The caller is responsible + * for taking care of those events and avoiding the call if inappropriate. The + * function does not call the connection's polling update function, so the caller + * is responsible for this. The buffer's output is not adjusted, it's up to the + * caller to take care of this. It's up to the caller to update the buffer's + * contents based on the return value. + */ +static size_t ssl_sock_from_buf(struct connection *conn, void *xprt_ctx, const struct buffer *buf, size_t count, int flags) +{ + struct ssl_sock_ctx *ctx = xprt_ctx; + ssize_t ret; + size_t try, done; + + done = 0; + + if (!ctx) + goto out_error; + + if (conn->flags & (CO_FL_WAIT_XPRT | CO_FL_SSL_WAIT_HS | CO_FL_EARLY_SSL_HS)) + /* a handshake was requested */ + return 0; + + /* send the largest possible block. For this we perform only one call + * to send() unless the buffer wraps and we exactly fill the first hunk, + * in which case we accept to do it once again. + */ + while (count) { +#ifdef SSL_READ_EARLY_DATA_SUCCESS + size_t written_data; +#endif + + try = b_contig_data(buf, done); + if (try > count) + try = count; + + if (global_ssl.hard_max_record && try > global_ssl.hard_max_record) + try = global_ssl.hard_max_record; + + if (!(flags & CO_SFL_STREAMER) && + !(ctx->xprt_st & SSL_SOCK_SEND_UNLIMITED) && + global_ssl.max_record && try > global_ssl.max_record) { + try = global_ssl.max_record; + } + else { + /* we need to keep the information about the fact that + * we're not limiting the upcoming send(), because if it + * fails, we'll have to retry with at least as many data. + */ + ctx->xprt_st |= SSL_SOCK_SEND_UNLIMITED; + } + + if (try < count || flags & CO_SFL_MSG_MORE) + ctx->xprt_st |= SSL_SOCK_SEND_MORE; + else + ctx->xprt_st &= ~SSL_SOCK_SEND_MORE; + +#ifdef SSL_READ_EARLY_DATA_SUCCESS + if (!SSL_is_init_finished(ctx->ssl) && conn_is_back(conn)) { + unsigned int max_early; + + if (objt_listener(conn->target)) + max_early = SSL_get_max_early_data(ctx->ssl); + else { + if (SSL_get0_session(ctx->ssl)) + max_early = SSL_SESSION_get_max_early_data(SSL_get0_session(ctx->ssl)); + else + max_early = 0; + } + + if (try + ctx->sent_early_data > max_early) { + try -= (try + ctx->sent_early_data) - max_early; + if (try <= 0) { + conn->flags |= CO_FL_SSL_WAIT_HS | CO_FL_WAIT_L6_CONN; + tasklet_wakeup(ctx->wait_event.tasklet); + break; + } + } + ret = SSL_write_early_data(ctx->ssl, b_peek(buf, done), try, &written_data); + if (ret == 1) { + ret = written_data; + ctx->sent_early_data += ret; + if (objt_server(conn->target)) { + conn->flags |= CO_FL_SSL_WAIT_HS | CO_FL_WAIT_L6_CONN | CO_FL_EARLY_DATA; + /* Initiate the handshake, now */ + tasklet_wakeup(ctx->wait_event.tasklet); + } + + } + + } else +#endif + ret = SSL_write(ctx->ssl, b_peek(buf, done), try); + + if (conn->flags & CO_FL_ERROR) { + /* CO_FL_ERROR may be set by ssl_sock_infocbk */ + goto out_error; + } + if (ret > 0) { + /* A send succeeded, so we can consider ourself connected */ + conn->flags &= ~CO_FL_WAIT_L4L6; + ctx->xprt_st &= ~SSL_SOCK_SEND_UNLIMITED; + count -= ret; + done += ret; + } + else { + ret = SSL_get_error(ctx->ssl, ret); + + if (ret == SSL_ERROR_WANT_WRITE) { + if (SSL_renegotiate_pending(ctx->ssl)) { + /* handshake is running, and it may need to re-enable write */ + conn->flags |= CO_FL_SSL_WAIT_HS; + ctx->xprt->subscribe(conn, ctx->xprt_ctx, SUB_RETRY_SEND, &ctx->wait_event); +#ifdef SSL_MODE_ASYNC + /* Async mode can be 
re-enabled, because we're leaving data state.*/ + if (global_ssl.async) + SSL_set_mode(ctx->ssl, SSL_MODE_ASYNC); +#endif + break; + } + + break; + } + else if (ret == SSL_ERROR_WANT_READ) { + /* handshake is running, and it needs to enable read */ + conn->flags |= CO_FL_SSL_WAIT_HS; + ctx->xprt->subscribe(conn, ctx->xprt_ctx, + SUB_RETRY_RECV, + &ctx->wait_event); +#ifdef SSL_MODE_ASYNC + /* Async mode can be re-enabled, because we're leaving data state.*/ + if (global_ssl.async) + SSL_set_mode(ctx->ssl, SSL_MODE_ASYNC); +#endif + break; + } + else if (ret == SSL_ERROR_SSL || ret == SSL_ERROR_SYSCALL) { + struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn); + + if (ctx && !ctx->error_code) + ctx->error_code = ERR_peek_error(); + conn->err_code = CO_ERR_SSL_FATAL; + } + goto out_error; + } + } + leave: + return done; + + out_error: + /* Clear openssl global errors stack */ + ssl_sock_dump_errors(conn, NULL); + ERR_clear_error(); + + conn->flags |= CO_FL_ERROR; + goto leave; +} + +void ssl_sock_close(struct connection *conn, void *xprt_ctx) { + + struct ssl_sock_ctx *ctx = xprt_ctx; + + + if (ctx) { + if (ctx->wait_event.events != 0) + ctx->xprt->unsubscribe(ctx->conn, ctx->xprt_ctx, + ctx->wait_event.events, + &ctx->wait_event); + if (ctx->subs) { + ctx->subs->events = 0; + tasklet_wakeup(ctx->subs->tasklet); + } + + if (ctx->xprt->close) + ctx->xprt->close(conn, ctx->xprt_ctx); +#ifdef SSL_MODE_ASYNC + if (global_ssl.async) { + OSSL_ASYNC_FD all_fd[32], afd; + size_t num_all_fds = 0; + int i; + + SSL_get_all_async_fds(ctx->ssl, NULL, &num_all_fds); + if (num_all_fds > 32) { + send_log(NULL, LOG_EMERG, "haproxy: openssl returns too many async fds. It seems a bug. Process may crash\n"); + return; + } + + SSL_get_all_async_fds(ctx->ssl, all_fd, &num_all_fds); + + /* If an async job is pending, we must try to + to catch the end using polling before calling + SSL_free */ + if (num_all_fds && SSL_waiting_for_async(ctx->ssl)) { + for (i=0 ; i < num_all_fds ; i++) { + /* switch on an handler designed to + * handle the SSL_free + */ + afd = all_fd[i]; + fdtab[afd].iocb = ssl_async_fd_free; + fdtab[afd].owner = ctx->ssl; + fd_want_recv(afd); + /* To ensure that the fd cache won't be used + * and we'll catch a real RD event. + */ + fd_cant_recv(afd); + } + tasklet_free(ctx->wait_event.tasklet); + pool_free(ssl_sock_ctx_pool, ctx); + _HA_ATOMIC_INC(&jobs); + return; + } + /* Else we can remove the fds from the fdtab + * and call SSL_free. + * note: we do a fd_stop_both and not a delete + * because the fd is owned by the engine. + * the engine is responsible to close + */ + for (i=0 ; i < num_all_fds ; i++) { + /* We want to remove the fd from the fdtab + * but we flag it to disown because the + * close is performed by the engine itself + */ + fdtab[all_fd[i]].state |= FD_DISOWN; + fd_delete(all_fd[i]); + } + } +#endif + SSL_free(ctx->ssl); + b_free(&ctx->early_buf); + tasklet_free(ctx->wait_event.tasklet); + pool_free(ssl_sock_ctx_pool, ctx); + _HA_ATOMIC_DEC(&global.sslconns); + } +} + +/* This function tries to perform a clean shutdown on an SSL connection, and in + * any case, flags the connection as reusable if no handshake was in progress. 
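Aside — the early-data send path in ssl_sock_from_buf() above must clamp what it writes to the session's advertised limit; anything beyond it waits for the handshake. A simplified, single-shot sketch of that cap (write_early is an invented name, and error handling is reduced to a single return code):

    /* sketch: send at most (max_early - already_sent) bytes of 0-RTT data;
     * returns bytes written, 0 when nothing may be sent yet, -1 on error */
    #include <stdint.h>
    #include <openssl/ssl.h>

    static int write_early(SSL *ssl, const void *data, size_t len, size_t already_sent)
    {
        SSL_SESSION *sess = SSL_get0_session(ssl);
        uint32_t max_early = sess ? SSL_SESSION_get_max_early_data(sess) : 0;
        size_t written = 0;

        if (already_sent >= max_early)
            return 0;                        /* wait for the handshake */
        if (already_sent + len > max_early)
            len = max_early - already_sent;  /* clamp to the advertised cap */
        if (SSL_write_early_data(ssl, data, len, &written) != 1)
            return -1;
        return (int)written;
    }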
 */
+static void ssl_sock_shutw(struct connection *conn, void *xprt_ctx, int clean)
+{
+	struct ssl_sock_ctx *ctx = xprt_ctx;
+
+	if (conn->flags & (CO_FL_WAIT_XPRT | CO_FL_SSL_WAIT_HS))
+		return;
+	if (!clean)
+		/* don't send a close_notify on SSL_shutdown */
+		SSL_set_quiet_shutdown(ctx->ssl, 1);
+	/* no handshake was in progress, try a clean ssl shutdown */
+	if (SSL_shutdown(ctx->ssl) <= 0) {
+		/* Clear openssl global errors stack */
+		ssl_sock_dump_errors(conn, NULL);
+		ERR_clear_error();
+	}
+}
+
+
+/* used for ppv2 pkey algo (can be used for logging) */
+int ssl_sock_get_pkey_algo(struct connection *conn, struct buffer *out)
+{
+	struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn);
+	X509 *crt;
+
+	if (!ctx)
+		return 0;
+	crt = SSL_get_certificate(ctx->ssl);
+	if (!crt)
+		return 0;
+
+	return cert_get_pkey_algo(crt, out);
+}
+
+/* used for ppv2 cert signature (can be used for logging) */
+const char *ssl_sock_get_cert_sig(struct connection *conn)
+{
+	struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn);
+
+	__OPENSSL_110_CONST__ ASN1_OBJECT *algorithm;
+	X509 *crt;
+
+	if (!ctx)
+		return NULL;
+	crt = SSL_get_certificate(ctx->ssl);
+	if (!crt)
+		return NULL;
+	X509_ALGOR_get0(&algorithm, NULL, NULL, X509_get0_tbs_sigalg(crt));
+	return OBJ_nid2sn(OBJ_obj2nid(algorithm));
+}
+
+/* used for ppv2 authority */
+const char *ssl_sock_get_sni(struct connection *conn)
+{
+#ifdef SSL_CTRL_SET_TLSEXT_HOSTNAME
+	struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn);
+
+	if (!ctx)
+		return NULL;
+	return SSL_get_servername(ctx->ssl, TLSEXT_NAMETYPE_host_name);
+#else
+	return NULL;
+#endif
+}
+
+/* used for logging/ppv2, may be changed for a sample fetch later */
+const char *ssl_sock_get_cipher_name(struct connection *conn)
+{
+	struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn);
+
+	if (!ctx)
+		return NULL;
+	return SSL_get_cipher_name(ctx->ssl);
+}
+
+/* used for logging/ppv2, may be changed for a sample fetch later */
+const char *ssl_sock_get_proto_version(struct connection *conn)
+{
+	struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn);
+
+	if (!ctx)
+		return NULL;
+	return SSL_get_version(ctx->ssl);
+}
+
+void ssl_sock_set_alpn(struct connection *conn, const unsigned char *alpn, int len)
+{
+#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation
+	struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn);
+
+	if (!ctx)
+		return;
+	SSL_set_alpn_protos(ctx->ssl, alpn, len);
+#endif
+}
+
+/* Sets the advertised SNI for outgoing connections. Set <hostname> to NULL
+ * to disable SNI.
+ */
+void ssl_sock_set_servername(struct connection *conn, const char *hostname)
+{
+#ifdef SSL_CTRL_SET_TLSEXT_HOSTNAME
+	struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn);
+	char *prev_name;
+
+	if (!ctx)
+		return;
+
+	BUG_ON(!(conn->flags & CO_FL_WAIT_L6_CONN));
+	BUG_ON(!(conn->flags & CO_FL_SSL_WAIT_HS));
+
+	/* If the SNI changes, we must destroy the reusable context so that a
+	 * new connection will present a new SNI. Compare with the SNI
+	 * previously stored in the reused_sess. If the session was reused,
+	 * the associated SNI (if any) has already been assigned to the SSL
+	 * during ssl_sock_init() so SSL_get_servername() will properly
+	 * retrieve the currently known hostname for the SSL.
+ */ + + prev_name = (char *)SSL_get_servername(ctx->ssl, TLSEXT_NAMETYPE_host_name); + if ((!prev_name && hostname) || + !hostname || + strcmp(hostname, prev_name) != 0) { + SSL_set_session(ctx->ssl, NULL); + SSL_set_tlsext_host_name(ctx->ssl, hostname); + } +#endif +} + +/* Extract peer certificate's common name into the chunk dest + * Returns + * the len of the extracted common name + * or 0 if no CN found in DN + * or -1 on error case (i.e. no peer certificate) + */ +int ssl_sock_get_remote_common_name(struct connection *conn, + struct buffer *dest) +{ + struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn); + X509 *crt = NULL; + X509_NAME *name; + const char find_cn[] = "CN"; + const struct buffer find_cn_chunk = { + .area = (char *)&find_cn, + .data = sizeof(find_cn)-1 + }; + int result = -1; + + if (!ctx) + goto out; + + /* SSL_get_peer_certificate, it increase X509 * ref count */ + crt = SSL_get_peer_certificate(ctx->ssl); + if (!crt) + goto out; + + name = X509_get_subject_name(crt); + if (!name) + goto out; + + result = ssl_sock_get_dn_entry(name, &find_cn_chunk, 1, dest); +out: + if (crt) + X509_free(crt); + + return result; +} + +/* returns 1 if client passed a certificate for this session, 0 if not */ +int ssl_sock_get_cert_used_sess(struct connection *conn) +{ + struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn); + X509 *crt = NULL; + + if (!ctx) + return 0; + + /* SSL_get_peer_certificate, it increase X509 * ref count */ + crt = SSL_get_peer_certificate(ctx->ssl); + if (!crt) + return 0; + + X509_free(crt); + return 1; +} + +/* returns 1 if client passed a certificate for this connection, 0 if not */ +int ssl_sock_get_cert_used_conn(struct connection *conn) +{ + struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn); + + if (!ctx) + return 0; + return SSL_SOCK_ST_FL_VERIFY_DONE & ctx->xprt_st ? 1 : 0; +} + +/* returns result from SSL verify */ +unsigned int ssl_sock_get_verify_result(struct connection *conn) +{ + struct ssl_sock_ctx *ctx = conn_get_ssl_sock_ctx(conn); + + if (!ctx) + return (unsigned int)X509_V_ERR_APPLICATION_VERIFICATION; + return (unsigned int)SSL_get_verify_result(ctx->ssl); +} + +/* Returns the application layer protocol name in <str> and <len> when known. + * Zero is returned if the protocol name was not found, otherwise non-zero is + * returned. The string is allocated in the SSL context and doesn't have to be + * freed by the caller. NPN is also checked if available since older versions + * of openssl (1.0.1) which are more common in field only support this one. 
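Aside — ssl_sock_get_remote_common_name() above walks the subject DN through ssl_sock_get_dn_entry() (defined elsewhere in this file); when only the first CN is needed, stock OpenSSL offers a shorter path. A sketch using X509_NAME_get_text_by_NID (peer_cn is an invented name):

    /* sketch: copy the peer certificate's first commonName into out;
     * returns its length, or -1 when there is no peer cert or no CN */
    #include <openssl/ssl.h>
    #include <openssl/x509.h>

    static int peer_cn(SSL *ssl, char *out, int outlen)
    {
        X509 *crt = SSL_get_peer_certificate(ssl); /* bumps the refcount */
        int len;

        if (!crt)
            return -1;
        len = X509_NAME_get_text_by_NID(X509_get_subject_name(crt),
                                        NID_commonName, out, outlen);
        X509_free(crt);                            /* drop our reference */
        return len;
    }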
+ */ +int ssl_sock_get_alpn(const struct connection *conn, void *xprt_ctx, const char **str, int *len) +{ +#if defined(TLSEXT_TYPE_application_layer_protocol_negotiation) || \ + defined(OPENSSL_NPN_NEGOTIATED) && !defined(OPENSSL_NO_NEXTPROTONEG) + struct ssl_sock_ctx *ctx = xprt_ctx; + if (!ctx) + return 0; + + *str = NULL; + +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation + SSL_get0_alpn_selected(ctx->ssl, (const unsigned char **)str, (unsigned *)len); + if (*str) + return 1; +#endif +#if defined(OPENSSL_NPN_NEGOTIATED) && !defined(OPENSSL_NO_NEXTPROTONEG) + SSL_get0_next_proto_negotiated(ctx->ssl, (const unsigned char **)str, (unsigned *)len); + if (*str) + return 1; +#endif +#endif + return 0; +} + +/* "issuers-chain-path" load chain certificate in global */ +int ssl_load_global_issuer_from_BIO(BIO *in, char *fp, char **err) +{ + X509 *ca; + X509_NAME *name = NULL; + ASN1_OCTET_STRING *skid = NULL; + STACK_OF(X509) *chain = NULL; + struct issuer_chain *issuer; + struct eb64_node *node; + char *path; + u64 key; + int ret = 0; + + while ((ca = PEM_read_bio_X509(in, NULL, NULL, NULL))) { + if (chain == NULL) { + chain = sk_X509_new_null(); + skid = X509_get_ext_d2i(ca, NID_subject_key_identifier, NULL, NULL); + name = X509_get_subject_name(ca); + } + if (!sk_X509_push(chain, ca)) { + X509_free(ca); + goto end; + } + } + if (!chain) { + memprintf(err, "unable to load issuers-chain %s : pem certificate not found.\n", fp); + goto end; + } + if (!skid) { + memprintf(err, "unable to load issuers-chain %s : SubjectKeyIdentifier not found.\n", fp); + goto end; + } + if (!name) { + memprintf(err, "unable to load issuers-chain %s : SubjectName not found.\n", fp); + goto end; + } + key = XXH3(ASN1_STRING_get0_data(skid), ASN1_STRING_length(skid), 0); + for (node = eb64_lookup(&cert_issuer_tree, key); node; node = eb64_next(node)) { + issuer = container_of(node, typeof(*issuer), node); + if (!X509_NAME_cmp(name, X509_get_subject_name(sk_X509_value(issuer->chain, 0)))) { + memprintf(err, "duplicate issuers-chain %s: %s already in store\n", fp, issuer->path); + goto end; + } + } + issuer = calloc(1, sizeof *issuer); + path = strdup(fp); + if (!issuer || !path) { + free(issuer); + free(path); + goto end; + } + issuer->node.key = key; + issuer->path = path; + issuer->chain = chain; + chain = NULL; + eb64_insert(&cert_issuer_tree, &issuer->node); + ret = 1; + end: + if (skid) + ASN1_OCTET_STRING_free(skid); + if (chain) + sk_X509_pop_free(chain, X509_free); + return ret; +} + + struct issuer_chain* ssl_get0_issuer_chain(X509 *cert) +{ + AUTHORITY_KEYID *akid; + struct issuer_chain *issuer = NULL; + + akid = X509_get_ext_d2i(cert, NID_authority_key_identifier, NULL, NULL); + if (akid && akid->keyid) { + struct eb64_node *node; + u64 hk; + hk = XXH3(ASN1_STRING_get0_data(akid->keyid), ASN1_STRING_length(akid->keyid), 0); + for (node = eb64_lookup(&cert_issuer_tree, hk); node; node = eb64_next(node)) { + struct issuer_chain *ti = container_of(node, typeof(*issuer), node); + if (X509_check_issued(sk_X509_value(ti->chain, 0), cert) == X509_V_OK) { + issuer = ti; + break; + } + } + } + AUTHORITY_KEYID_free(akid); + return issuer; +} + +void ssl_free_global_issuers(void) +{ + struct eb64_node *node, *back; + struct issuer_chain *issuer; + + node = eb64_first(&cert_issuer_tree); + while (node) { + issuer = container_of(node, typeof(*issuer), node); + back = eb64_next(node); + eb64_delete(node); + free(issuer->path); + sk_X509_pop_free(issuer->chain, X509_free); + free(issuer); + node = back; + } +} + 
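+
+/* Illustration only (a minimal sketch, not part of the code paths above):
+ * completing a leaf certificate whose file lacks its chain boils down to a
+ * lookup in the issuer tree filled by ssl_load_global_issuer_from_BIO(); the
+ * returned chain is shared, never duplicated, so the caller must not free
+ * it. The function name below is hypothetical.
+ */
+#if 0
+static STACK_OF(X509) *example_lookup_chain(X509 *leaf)
+{
+	struct issuer_chain *ic = ssl_get0_issuer_chain(leaf);
+
+	/* NULL when no stored issuer matches the leaf's AKID */
+	return ic ? ic->chain : NULL;
+}
+#endif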
+#if defined(USE_ENGINE) && !defined(OPENSSL_NO_ENGINE) +static int ssl_check_async_engine_count(void) { + int err_code = ERR_NONE; + + if (global_ssl.async && (openssl_engines_initialized > 32)) { + ha_alert("ssl-mode-async only supports a maximum of 32 engines.\n"); + err_code = ERR_ABORT; + } + return err_code; +} +#endif + +/* "show fd" helper to dump ssl internals. Warning: the output buffer is often + * the common trash! It returns non-zero if the connection entry looks suspicious. + */ +static int ssl_sock_show_fd(struct buffer *buf, const struct connection *conn, const void *ctx) +{ + const struct ssl_sock_ctx *sctx = ctx; + int ret = 0; + + if (!sctx) + return ret; + + if (sctx->conn != conn) { + chunk_appendf(&trash, " xctx.conn=%p(BOGUS)", sctx->conn); + ret = 1; + } + chunk_appendf(&trash, " xctx.st=%d .err=%ld", sctx->xprt_st, sctx->error_code); + + if (sctx->xprt) { + chunk_appendf(&trash, " .xprt=%s", sctx->xprt->name); + if (sctx->xprt_ctx) + chunk_appendf(&trash, " .xctx=%p", sctx->xprt_ctx); + } + + chunk_appendf(&trash, " .wait.ev=%d", sctx->wait_event.events); + + /* as soon as a shutdown is reported the lower layer unregisters its + * subscriber, so the situations below are transient and rare enough to + * be reported as suspicious. In any case they shouldn't last. + */ + if ((sctx->wait_event.events & 1) && (conn->flags & (CO_FL_SOCK_RD_SH|CO_FL_ERROR))) + ret = 1; + if ((sctx->wait_event.events & 2) && (conn->flags & (CO_FL_SOCK_WR_SH|CO_FL_ERROR))) + ret = 1; + + chunk_appendf(&trash, " .subs=%p", sctx->subs); + if (sctx->subs) { + chunk_appendf(&trash, "(ev=%d tl=%p", sctx->subs->events, sctx->subs->tasklet); + if (sctx->subs->tasklet->calls >= 1000000) + ret = 1; + chunk_appendf(&trash, " tl.calls=%d tl.ctx=%p tl.fct=", + sctx->subs->tasklet->calls, + sctx->subs->tasklet->context); + resolve_sym_name(&trash, NULL, sctx->subs->tasklet->process); + chunk_appendf(&trash, ")"); + } + chunk_appendf(&trash, " .sent_early=%d", sctx->sent_early_data); + chunk_appendf(&trash, " .early_in=%d", (int)sctx->early_buf.data); + return ret; +} + +#if (defined SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB && TLS_TICKETS_NO > 0) +/* This function is used with TLS ticket keys management. It permits to browse + * each reference. The variable <ref> must point to the current node's list + * element (which starts by the root), and <end> must point to the root node. + */ +static inline +struct tls_keys_ref *tlskeys_list_get_next(struct list *ref, struct list *end) +{ + /* Get next list entry. */ + ref = ref->n; + + /* If the entry is the last of the list, return NULL. */ + if (ref == end) + return NULL; + + return LIST_ELEM(ref, struct tls_keys_ref *, list); +} + +static inline +struct tls_keys_ref *tlskeys_ref_lookup_ref(const char *reference) +{ + int id; + char *error; + + /* If the reference starts by a '#', this is numeric id. */ + if (reference[0] == '#') { + /* Try to convert the numeric id. If the conversion fails, the lookup fails. */ + id = strtol(reference + 1, &error, 10); + if (*error != '\0') + return NULL; + + /* Perform the unique id lookup. */ + return tlskeys_ref_lookupid(id); + } + + /* Perform the string lookup. */ + return tlskeys_ref_lookup(reference); +} +#endif + + +#if (defined SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB && TLS_TICKETS_NO > 0) + +/* dumps all tls keys. Relies on the show_keys_ctx context from the appctx. 
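+ * The dump is resumable: <state>, <next_ref> and <next_index> persist in the
+ * svcctx between calls, so when the output buffer fills up the handler can
+ * return 0 and resume from the same entry on the next invocation.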
*/ +static int cli_io_handler_tlskeys_files(struct appctx *appctx) +{ + struct show_keys_ctx *ctx = appctx->svcctx; + + switch (ctx->state) { + case SHOW_KEYS_INIT: + /* Display the column headers. If the message cannot be sent, + * quit the function with returning 0. The function is called + * later and restart at the state "SHOW_KEYS_INIT". + */ + chunk_reset(&trash); + + if (ctx->dump_entries) + chunk_appendf(&trash, "# id secret\n"); + else + chunk_appendf(&trash, "# id (file)\n"); + + if (applet_putchk(appctx, &trash) == -1) + return 0; + + /* Now, we start the browsing of the references lists. + * Note that the following call to LIST_ELEM return bad pointer. The only + * available field of this pointer is <list>. It is used with the function + * tlskeys_list_get_next() for returning the first available entry + */ + if (ctx->next_ref == NULL) + ctx->next_ref = tlskeys_list_get_next(&tlskeys_reference, &tlskeys_reference); + + ctx->state = SHOW_KEYS_LIST; + __fallthrough; + + case SHOW_KEYS_LIST: + while (ctx->next_ref) { + struct tls_keys_ref *ref = ctx->next_ref; + + chunk_reset(&trash); + if (ctx->dump_entries && ctx->next_index == 0) + chunk_appendf(&trash, "# "); + + if (ctx->next_index == 0) + chunk_appendf(&trash, "%d (%s)\n", ref->unique_id, ref->filename); + + if (ctx->dump_entries) { + int head; + + HA_RWLOCK_RDLOCK(TLSKEYS_REF_LOCK, &ref->lock); + head = ref->tls_ticket_enc_index; + while (ctx->next_index < TLS_TICKETS_NO) { + struct buffer *t2 = get_trash_chunk(); + + chunk_reset(t2); + /* should never fail here because we dump only a key in the t2 buffer */ + if (ref->key_size_bits == 128) { + t2->data = a2base64((char *)(ref->tlskeys + (head + 2 + ctx->next_index) % TLS_TICKETS_NO), + sizeof(struct tls_sess_key_128), + t2->area, t2->size); + chunk_appendf(&trash, "%d.%d %s\n", ref->unique_id, ctx->next_index, + t2->area); + } + else if (ref->key_size_bits == 256) { + t2->data = a2base64((char *)(ref->tlskeys + (head + 2 + ctx->next_index) % TLS_TICKETS_NO), + sizeof(struct tls_sess_key_256), + t2->area, t2->size); + chunk_appendf(&trash, "%d.%d %s\n", ref->unique_id, ctx->next_index, + t2->area); + } + else { + /* This case should never happen */ + chunk_appendf(&trash, "%d.%d <unknown>\n", ref->unique_id, ctx->next_index); + } + + if (applet_putchk(appctx, &trash) == -1) { + /* let's try again later from this stream. We add ourselves into + * this stream's users so that it can remove us upon termination. + */ + HA_RWLOCK_RDUNLOCK(TLSKEYS_REF_LOCK, &ref->lock); + return 0; + } + ctx->next_index++; + } + HA_RWLOCK_RDUNLOCK(TLSKEYS_REF_LOCK, &ref->lock); + ctx->next_index = 0; + } + if (applet_putchk(appctx, &trash) == -1) { + /* let's try again later from this stream. We add ourselves into + * this stream's users so that it can remove us upon termination. 
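+				 * applet_putchk() == -1 means there was not
+				 * enough room in the response buffer; the ctx
+				 * is left untouched so this chunk is retried
+				 * as-is.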
+ */ + return 0; + } + + if (ctx->names_only == 0) /* don't display everything if not necessary */ + break; + + /* get next list entry and check the end of the list */ + ctx->next_ref = tlskeys_list_get_next(&ref->list, &tlskeys_reference); + } + ctx->state = SHOW_KEYS_DONE; + __fallthrough; + + default: + return 1; + } + return 0; +} + +/* Prepares a "show_keys_ctx" and sets the appropriate io_handler if needed */ +static int cli_parse_show_tlskeys(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_keys_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + /* no parameter, shows only file list */ + if (!*args[2]) { + ctx->names_only = 1; + return 0; + } + + if (args[2][0] == '*') { + /* list every TLS ticket keys */ + ctx->names_only = 1; + } else { + ctx->next_ref = tlskeys_ref_lookup_ref(args[2]); + if (!ctx->next_ref) + return cli_err(appctx, "'show tls-keys' unable to locate referenced filename\n"); + } + + ctx->dump_entries = 1; + return 0; +} + +static int cli_parse_set_tlskeys(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct tls_keys_ref *ref; + int ret; + + /* Expect two parameters: the filename and the new new TLS key in encoding */ + if (!*args[3] || !*args[4]) + return cli_err(appctx, "'set ssl tls-key' expects a filename and the new TLS key in base64 encoding.\n"); + + ref = tlskeys_ref_lookup_ref(args[3]); + if (!ref) + return cli_err(appctx, "'set ssl tls-key' unable to locate referenced filename\n"); + + ret = base64dec(args[4], strlen(args[4]), trash.area, trash.size); + if (ret < 0) + return cli_err(appctx, "'set ssl tls-key' received invalid base64 encoded TLS key.\n"); + + trash.data = ret; + if (ssl_sock_update_tlskey_ref(ref, &trash) < 0) + return cli_err(appctx, "'set ssl tls-key' received a key of wrong size.\n"); + + return cli_msg(appctx, LOG_INFO, "TLS ticket key updated!\n"); +} +#endif + + +#ifdef HAVE_SSL_PROVIDERS +struct provider_name { + const char *name; + struct list list; +}; + + +static int ssl_provider_get_name_cb(OSSL_PROVIDER *provider, void *cbdata) +{ + struct list *provider_names = cbdata; + struct provider_name *item = NULL; + const char *name = OSSL_PROVIDER_get0_name(provider); + + if (!provider_names) + return 0; + + item = calloc(1, sizeof(*item)); + + if (!item) + return 0; + + item->name = name; + LIST_APPEND(provider_names, &item->list); + + return 1; +} + +static void ssl_provider_get_name_list(struct list *provider_names) +{ + if (!provider_names) + return; + + OSSL_PROVIDER_do_all(NULL, ssl_provider_get_name_cb, provider_names); +} + +static void ssl_provider_clear_name_list(struct list *provider_names) +{ + struct provider_name *item = NULL, *item_s = NULL; + + if (provider_names) { + list_for_each_entry_safe(item, item_s, provider_names, list) { + LIST_DELETE(&item->list); + free(item); + } + } +} + +static int cli_io_handler_show_providers(struct appctx *appctx) +{ + struct buffer *trash = get_trash_chunk(); + struct list provider_names; + struct provider_name *name; + + LIST_INIT(&provider_names); + + chunk_appendf(trash, "Loaded providers : \n"); + + ssl_provider_get_name_list(&provider_names); + + list_for_each_entry(name, &provider_names, list) { + chunk_appendf(trash, "\t- %s\n", name->name); + } + + ssl_provider_clear_name_list(&provider_names); + + if (applet_putchk(appctx, trash) == -1) + goto yield; + + return 1; + +yield: + return 0; +} +#endif + + +/* register cli keywords */ +static struct cli_kw_list cli_kws = {{ },{ +#if (defined 
SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB && TLS_TICKETS_NO > 0) + { { "show", "tls-keys", NULL }, "show tls-keys [id|*] : show tls keys references or dump tls ticket keys when id specified", cli_parse_show_tlskeys, cli_io_handler_tlskeys_files }, + { { "set", "ssl", "tls-key", NULL }, "set ssl tls-key [id|file] <key> : set the next TLS key for the <id> or <file> listener to <key>", cli_parse_set_tlskeys, NULL }, +#endif +#ifdef HAVE_SSL_PROVIDERS + { { "show", "ssl", "providers", NULL }, "show ssl providers : show loaded SSL providers", NULL, cli_io_handler_show_providers }, +#endif + { { NULL }, NULL, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + +/* transport-layer operations for SSL sockets */ +struct xprt_ops ssl_sock = { + .snd_buf = ssl_sock_from_buf, + .rcv_buf = ssl_sock_to_buf, + .subscribe = ssl_subscribe, + .unsubscribe = ssl_unsubscribe, + .remove_xprt = ssl_remove_xprt, + .add_xprt = ssl_add_xprt, + .rcv_pipe = NULL, + .snd_pipe = NULL, + .shutr = NULL, + .shutw = ssl_sock_shutw, + .close = ssl_sock_close, + .init = ssl_sock_init, + .start = ssl_sock_start, + .prepare_bind_conf = ssl_sock_prepare_bind_conf, + .destroy_bind_conf = ssl_sock_destroy_bind_conf, + .prepare_srv = ssl_sock_prepare_srv_ctx, + .destroy_srv = ssl_sock_free_srv_ctx, + .get_alpn = ssl_sock_get_alpn, + .takeover = ssl_takeover, + .set_idle = ssl_set_idle, + .set_used = ssl_set_used, + .get_ssl_sock_ctx = ssl_sock_get_ctx, + .name = "SSL", + .show_fd = ssl_sock_show_fd, +}; + +enum act_return ssl_action_wait_for_hs(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct connection *conn; + + conn = objt_conn(sess->origin); + + if (conn) { + if (conn->flags & (CO_FL_EARLY_SSL_HS | CO_FL_SSL_WAIT_HS)) { + sc_ep_set(s->scf, SE_FL_WAIT_FOR_HS); + s->req.flags |= CF_READ_EVENT; + return ACT_RET_YIELD; + } + } + return (ACT_RET_CONT); +} + +static enum act_parse_ret ssl_parse_wait_for_hs(const char **args, int *orig_arg, struct proxy *px, struct act_rule *rule, char **err) +{ + rule->action_ptr = ssl_action_wait_for_hs; + + return ACT_RET_PRS_OK; +} + +static struct action_kw_list http_req_actions = {ILH, { + { "wait-for-handshake", ssl_parse_wait_for_hs }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, http_req_keywords_register, &http_req_actions); + +#ifdef HAVE_SSL_CTX_ADD_SERVER_CUSTOM_EXT + +static void ssl_sock_sctl_free_func(void *parent, void *ptr, CRYPTO_EX_DATA *ad, int idx, long argl, void *argp) +{ + if (ptr) { + chunk_destroy(ptr); + free(ptr); + } +} + +#endif + + +static void ssl_sock_capture_free_func(void *parent, void *ptr, CRYPTO_EX_DATA *ad, int idx, long argl, void *argp) +{ + pool_free(pool_head_ssl_capture, ptr); +} + +#ifdef HAVE_SSL_KEYLOG +static void ssl_sock_keylog_free_func(void *parent, void *ptr, CRYPTO_EX_DATA *ad, int idx, long argl, void *argp) +{ + struct ssl_keylog *keylog; + + if (!ptr) + return; + + keylog = ptr; + + pool_free(pool_head_ssl_keylog_str, keylog->client_random); + pool_free(pool_head_ssl_keylog_str, keylog->client_early_traffic_secret); + pool_free(pool_head_ssl_keylog_str, keylog->client_handshake_traffic_secret); + pool_free(pool_head_ssl_keylog_str, keylog->server_handshake_traffic_secret); + pool_free(pool_head_ssl_keylog_str, keylog->client_traffic_secret_0); + pool_free(pool_head_ssl_keylog_str, keylog->server_traffic_secret_0); + pool_free(pool_head_ssl_keylog_str, keylog->exporter_secret); + pool_free(pool_head_ssl_keylog_str, keylog->early_exporter_secret); + + 
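+	/* each secret string above has its own pool entry; the ssl_keylog
+	 * container itself is released last, right below.
+	 */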
pool_free(pool_head_ssl_keylog, ptr); +} +#endif + +static void ssl_sock_clt_crt_free_func(void *parent, void *ptr, CRYPTO_EX_DATA *ad, int idx, long argl, void *argp) +{ + if (!ptr) + return; + + X509_free((X509*)ptr); +} + +static void ssl_sock_clt_sni_free_func(void *parent, void *ptr, CRYPTO_EX_DATA *ad, int idx, long argl, void *argp) +{ + pool_free(ssl_sock_client_sni_pool, ptr); +} + +static void __ssl_sock_init(void) +{ +#if (!defined(OPENSSL_NO_COMP) && !defined(SSL_OP_NO_COMPRESSION)) + STACK_OF(SSL_COMP)* cm; + int n; +#endif + + if (global_ssl.listen_default_ciphers) + global_ssl.listen_default_ciphers = strdup(global_ssl.listen_default_ciphers); + if (global_ssl.connect_default_ciphers) + global_ssl.connect_default_ciphers = strdup(global_ssl.connect_default_ciphers); +#ifdef HAVE_SSL_CTX_SET_CIPHERSUITES + if (global_ssl.listen_default_ciphersuites) + global_ssl.listen_default_ciphersuites = strdup(global_ssl.listen_default_ciphersuites); + if (global_ssl.connect_default_ciphersuites) + global_ssl.connect_default_ciphersuites = strdup(global_ssl.connect_default_ciphersuites); +#endif + + xprt_register(XPRT_SSL, &ssl_sock); +#if HA_OPENSSL_VERSION_NUMBER < 0x10100000L + SSL_library_init(); +#endif +#if (!defined(OPENSSL_NO_COMP) && !defined(SSL_OP_NO_COMPRESSION)) + cm = SSL_COMP_get_compression_methods(); + n = sk_SSL_COMP_num(cm); + while (n--) { + (void) sk_SSL_COMP_pop(cm); + } +#endif + +#if defined(USE_THREAD) && (HA_OPENSSL_VERSION_NUMBER < 0x10100000L) + ssl_locking_init(); +#endif +#ifdef HAVE_SSL_CTX_ADD_SERVER_CUSTOM_EXT + sctl_ex_index = SSL_CTX_get_ex_new_index(0, NULL, NULL, NULL, ssl_sock_sctl_free_func); +#endif + +#if ((defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) && !defined OPENSSL_IS_BORINGSSL) + ocsp_ex_index = SSL_CTX_get_ex_new_index(0, NULL, NULL, NULL, ssl_sock_ocsp_free_func); +#endif + + ssl_app_data_index = SSL_get_ex_new_index(0, NULL, NULL, NULL, NULL); + ssl_capture_ptr_index = SSL_get_ex_new_index(0, NULL, NULL, NULL, ssl_sock_capture_free_func); +#ifdef USE_QUIC + ssl_qc_app_data_index = SSL_get_ex_new_index(0, NULL, NULL, NULL, NULL); +#endif /* USE_QUIC */ +#ifdef HAVE_SSL_KEYLOG + ssl_keylog_index = SSL_get_ex_new_index(0, NULL, NULL, NULL, ssl_sock_keylog_free_func); +#endif + ssl_client_crt_ref_index = SSL_get_ex_new_index(0, NULL, NULL, NULL, ssl_sock_clt_crt_free_func); + ssl_client_sni_index = SSL_get_ex_new_index(0, NULL, NULL, NULL, ssl_sock_clt_sni_free_func); +#if defined(USE_ENGINE) && !defined(OPENSSL_NO_ENGINE) + ENGINE_load_builtin_engines(); + hap_register_post_check(ssl_check_async_engine_count); +#endif +#if (defined SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB && TLS_TICKETS_NO > 0) + hap_register_post_check(tlskeys_finalize_config); +#endif + + global.ssl_session_max_cost = SSL_SESSION_MAX_COST; + global.ssl_handshake_max_cost = SSL_HANDSHAKE_MAX_COST; + + hap_register_post_deinit(ssl_free_global_issuers); + +#ifndef OPENSSL_NO_DH + ssl_dh_ptr_index = SSL_CTX_get_ex_new_index(0, NULL, NULL, NULL, NULL); + hap_register_post_deinit(ssl_free_dh); +#endif +#if defined(USE_ENGINE) && !defined(OPENSSL_NO_ENGINE) + hap_register_post_deinit(ssl_free_engines); +#endif +#ifdef HAVE_SSL_PROVIDERS + hap_register_post_deinit(ssl_unload_providers); +#endif +#if HA_OPENSSL_VERSION_NUMBER < 0x3000000fL + /* Load SSL string for the verbose & debug mode. 
*/ + ERR_load_SSL_strings(); +#endif + ha_meth = BIO_meth_new(0x666, "ha methods"); + if (ha_meth != NULL) { + BIO_meth_set_write(ha_meth, ha_ssl_write); + BIO_meth_set_read(ha_meth, ha_ssl_read); + BIO_meth_set_ctrl(ha_meth, ha_ssl_ctrl); + BIO_meth_set_create(ha_meth, ha_ssl_new); + BIO_meth_set_destroy(ha_meth, ha_ssl_free); + BIO_meth_set_puts(ha_meth, ha_ssl_puts); + BIO_meth_set_gets(ha_meth, ha_ssl_gets); + } + + HA_SPIN_INIT(&ckch_lock); + + HA_SPIN_INIT(&ocsp_tree_lock); + + /* Try to register dedicated SSL/TLS protocol message callbacks for + * heartbleed attack (CVE-2014-0160) and clienthello. + */ + hap_register_post_check(ssl_sock_register_msg_callbacks); + + /* Try to free all callbacks that were registered by using + * ssl_sock_register_msg_callback(). + */ + hap_register_post_deinit(ssl_sock_unregister_msg_callbacks); +} +INITCALL0(STG_REGISTER, __ssl_sock_init); + +/* Compute and register the version string */ +static void ssl_register_build_options() +{ + char *ptr = NULL; + int i; + + memprintf(&ptr, "Built with OpenSSL version : " +#ifdef OPENSSL_IS_BORINGSSL + "BoringSSL"); +#else /* OPENSSL_IS_BORINGSSL */ + OPENSSL_VERSION_TEXT + "\nRunning on OpenSSL version : %s%s", + OpenSSL_version(OPENSSL_VERSION), + ((OPENSSL_VERSION_NUMBER ^ OpenSSL_version_num()) >> 8) ? " (VERSIONS DIFFER!)" : ""); +#endif + memprintf(&ptr, "%s\nOpenSSL library supports TLS extensions : " +#if HA_OPENSSL_VERSION_NUMBER < 0x00907000L + "no (library version too old)" +#elif defined(OPENSSL_NO_TLSEXT) + "no (disabled via OPENSSL_NO_TLSEXT)" +#else + "yes" +#endif + "", ptr); + + memprintf(&ptr, "%s\nOpenSSL library supports SNI : " +#ifdef SSL_CTRL_SET_TLSEXT_HOSTNAME + "yes" +#else +#ifdef OPENSSL_NO_TLSEXT + "no (because of OPENSSL_NO_TLSEXT)" +#else + "no (version might be too old, 0.9.8f min needed)" +#endif +#endif + "", ptr); + + memprintf(&ptr, "%s\nOpenSSL library supports :", ptr); + for (i = CONF_TLSV_MIN; i <= CONF_TLSV_MAX; i++) + if (methodVersions[i].option) + memprintf(&ptr, "%s %s", ptr, methodVersions[i].name); + +#ifdef HAVE_SSL_PROVIDERS + { + struct list provider_names; + struct provider_name *name; + LIST_INIT(&provider_names); + ssl_provider_get_name_list(&provider_names); + + memprintf(&ptr, "%s\nOpenSSL providers loaded :", ptr); + + list_for_each_entry(name, &provider_names, list) { + memprintf(&ptr, "%s %s", ptr, name->name); + } + + ssl_provider_clear_name_list(&provider_names); + } +#endif + + hap_register_build_opts(ptr, 1); +} + +INITCALL0(STG_REGISTER, ssl_register_build_options); + +#if defined(USE_ENGINE) && !defined(OPENSSL_NO_ENGINE) +void ssl_free_engines(void) { + struct ssl_engine_list *wl, *wlb; + /* free up engine list */ + list_for_each_entry_safe(wl, wlb, &openssl_engines, list) { + ENGINE_finish(wl->e); + ENGINE_free(wl->e); + LIST_DELETE(&wl->list); + free(wl); + } +} +#endif + +#ifdef HAVE_SSL_PROVIDERS +void ssl_unload_providers(void) { + struct ssl_provider_list *prov, *provb; + list_for_each_entry_safe(prov, provb, &openssl_providers, list) { + OSSL_PROVIDER_unload(prov->provider); + LIST_DELETE(&prov->list); + free(prov); + } +} +#endif + +#ifndef OPENSSL_NO_DH +void ssl_free_dh(void) { + if (local_dh_1024) { + HASSL_DH_free(local_dh_1024); + local_dh_1024 = NULL; + } + if (local_dh_2048) { + HASSL_DH_free(local_dh_2048); + local_dh_2048 = NULL; + } + if (local_dh_4096) { + HASSL_DH_free(local_dh_4096); + local_dh_4096 = NULL; + } + if (global_dh) { + HASSL_DH_free(global_dh); + global_dh = NULL; + } +} +#endif + +static void 
__ssl_sock_deinit(void)
+{
+#if (defined SSL_CTRL_SET_TLSEXT_HOSTNAME && !defined SSL_NO_GENERATE_CERTIFICATES)
+	if (ssl_ctx_lru_tree) {
+		lru64_destroy(ssl_ctx_lru_tree);
+		HA_RWLOCK_DESTROY(&ssl_ctx_lru_rwlock);
+	}
+#endif
+
+#if (HA_OPENSSL_VERSION_NUMBER < 0x10100000L)
+	ERR_remove_state(0);
+	ERR_free_strings();
+
+	EVP_cleanup();
+#endif
+
+#if (HA_OPENSSL_VERSION_NUMBER >= 0x00907000L) && (HA_OPENSSL_VERSION_NUMBER < 0x10100000L)
+	CRYPTO_cleanup_all_ex_data();
+#endif
+	BIO_meth_free(ha_meth);
+
+#if !defined OPENSSL_NO_OCSP
+	ssl_destroy_ocsp_update_task();
+#endif
+}
+REGISTER_POST_DEINIT(__ssl_sock_deinit);
+
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/src/ssl_utils.c b/src/ssl_utils.c
new file mode 100644
index 0000000..4a85b89
--- /dev/null
+++ b/src/ssl_utils.c
@@ -0,0 +1,702 @@
+/*
+ * Utility functions for SSL:
+ * Mostly generic functions that retrieve information from certificates
+ *
+ * Copyright (C) 2012 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr>
+ * Copyright (C) 2020 HAProxy Technologies, William Lallemand <wlallemand@haproxy.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <haproxy/api.h>
+#include <haproxy/buf-t.h>
+#include <haproxy/chunk.h>
+#include <haproxy/openssl-compat.h>
+#include <haproxy/ssl_sock.h>
+#include <haproxy/ssl_utils.h>
+
+/* fill a buffer with the algorithm and size of a public key */
+int cert_get_pkey_algo(X509 *crt, struct buffer *out)
+{
+	int bits = 0;
+	int sig = TLSEXT_signature_anonymous;
+	int len = -1;
+	EVP_PKEY *pkey;
+
+	pkey = X509_get_pubkey(crt);
+	if (pkey) {
+		bits = EVP_PKEY_bits(pkey);
+		switch(EVP_PKEY_base_id(pkey)) {
+		case EVP_PKEY_RSA:
+			sig = TLSEXT_signature_rsa;
+			break;
+		case EVP_PKEY_EC:
+			sig = TLSEXT_signature_ecdsa;
+			break;
+		case EVP_PKEY_DSA:
+			sig = TLSEXT_signature_dsa;
+			break;
+		}
+		EVP_PKEY_free(pkey);
+	}
+
+	switch(sig) {
+	case TLSEXT_signature_rsa:
+		len = chunk_printf(out, "RSA%d", bits);
+		break;
+	case TLSEXT_signature_ecdsa:
+		len = chunk_printf(out, "EC%d", bits);
+		break;
+	case TLSEXT_signature_dsa:
+		len = chunk_printf(out, "DSA%d", bits);
+		break;
+	default:
+		return 0;
+	}
+	if (len < 0)
+		return 0;
+	return 1;
+}
+
+/* Extract a serial from a cert, and copy it to a chunk.
+ * Returns 1 if serial is found and copied, 0 if no serial found and
+ * -1 if output is not large enough.
+ */
+int ssl_sock_get_serial(X509 *crt, struct buffer *out)
+{
+	ASN1_INTEGER *serial;
+
+	serial = X509_get_serialNumber(crt);
+	if (!serial)
+		return 0;
+
+	if (out->size < serial->length)
+		return -1;
+
+	memcpy(out->area, serial->data, serial->length);
+	out->data = serial->length;
+	return 1;
+}
+
+/* Extract a cert to der, and copy it to a chunk.
+ * Returns 1 if the cert is found and copied, 0 on der conversion failure
+ * and -1 if the output is not large enough.
+ */
+int ssl_sock_crt2der(X509 *crt, struct buffer *out)
+{
+	int len;
+	unsigned char *p = (unsigned char *) out->area;
+
+	len = i2d_X509(crt, NULL);
+	if (len <= 0)
+		return 0;
+
+	if (out->size < len)
+		return -1;
+
+	i2d_X509(crt, &p);
+	out->data = len;
+	return 1;
+}
+
+
+/* Copy a date in ASN1_UTCTIME or ASN1_GENERALIZEDTIME format into struct
+ * buffer out.
+ * Returns 1 if the date is found and copied, 0 if no valid time found
+ * and -1 if output is not large enough.
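+ *
+ * For example, the GENERALIZEDTIME "20250102030405Z" and the UTCTIME
+ * "250102030405Z" both end up as "250102030405Z" in <out>: the leading
+ * century digits "20" are validated and stripped from the former.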
+ */ +int ssl_sock_get_time(ASN1_TIME *tm, struct buffer *out) +{ + if (tm->type == V_ASN1_GENERALIZEDTIME) { + ASN1_GENERALIZEDTIME *gentm = (ASN1_GENERALIZEDTIME *)tm; + + if (gentm->length < 12) + return 0; + if (gentm->data[0] != 0x32 || gentm->data[1] != 0x30) + return 0; + if (out->size < gentm->length-2) + return -1; + + memcpy(out->area, gentm->data+2, gentm->length-2); + out->data = gentm->length-2; + return 1; + } + else if (tm->type == V_ASN1_UTCTIME) { + ASN1_UTCTIME *utctm = (ASN1_UTCTIME *)tm; + + if (utctm->length < 10) + return 0; + if (utctm->data[0] >= 0x35) + return 0; + if (out->size < utctm->length) + return -1; + + memcpy(out->area, utctm->data, utctm->length); + out->data = utctm->length; + return 1; + } + + return 0; +} + +/* Extract an entry from a X509_NAME and copy its value to an output chunk. + * Returns 1 if entry found, 0 if entry not found, or -1 if output not large enough. + */ +int ssl_sock_get_dn_entry(X509_NAME *a, const struct buffer *entry, int pos, + struct buffer *out) +{ + X509_NAME_ENTRY *ne; + ASN1_OBJECT *obj; + ASN1_STRING *data; + const unsigned char *data_ptr; + int data_len; + int i, j, n; + int cur = 0; + const char *s; + char tmp[128]; + int name_count; + + name_count = X509_NAME_entry_count(a); + + out->data = 0; + for (i = 0; i < name_count; i++) { + if (pos < 0) + j = (name_count-1) - i; + else + j = i; + + ne = X509_NAME_get_entry(a, j); + obj = X509_NAME_ENTRY_get_object(ne); + data = X509_NAME_ENTRY_get_data(ne); + data_ptr = ASN1_STRING_get0_data(data); + data_len = ASN1_STRING_length(data); + n = OBJ_obj2nid(obj); + if ((n == NID_undef) || ((s = OBJ_nid2sn(n)) == NULL)) { + i2t_ASN1_OBJECT(tmp, sizeof(tmp), obj); + s = tmp; + } + + if (chunk_strcasecmp(entry, s) != 0) + continue; + + if (pos < 0) + cur--; + else + cur++; + + if (cur != pos) + continue; + + if (data_len > out->size) + return -1; + + memcpy(out->area, data_ptr, data_len); + out->data = data_len; + return 1; + } + + return 0; + +} + +/* + * Extract the DN in the specified format from the X509_NAME and copy result to a chunk. + * Currently supports rfc2253 for returning LDAP V3 DNs. + * Returns 1 if dn entries exist, 0 if no dn entry was found. + */ +int ssl_sock_get_dn_formatted(X509_NAME *a, const struct buffer *format, struct buffer *out) +{ + BIO *bio = NULL; + int ret = 0; + int data_len = 0; + + if (chunk_strcmp(format, "rfc2253") == 0) { + bio = BIO_new(BIO_s_mem()); + if (bio == NULL) + goto out; + + if (X509_NAME_print_ex(bio, a, 0, XN_FLAG_RFC2253) < 0) + goto out; + + if ((data_len = BIO_read(bio, out->area, out->size)) <= 0) + goto out; + + out->data = data_len; + + ret = 1; + } +out: + if (bio) + BIO_free(bio); + return ret; +} + +/* Extract and format full DN from a X509_NAME and copy result into a chunk + * Returns 1 if dn entries exits, 0 if no dn entry found or -1 if output is not large enough. 
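+ * The output looks like "/C=FR/O=HAProxy Technologies/CN=client1", where the
+ * values shown are only illustrative.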
+ */ +int ssl_sock_get_dn_oneline(X509_NAME *a, struct buffer *out) +{ + X509_NAME_ENTRY *ne; + ASN1_OBJECT *obj; + ASN1_STRING *data; + const unsigned char *data_ptr; + int data_len; + int i, n, ln; + int l = 0; + const char *s; + char *p; + char tmp[128]; + int name_count; + + + name_count = X509_NAME_entry_count(a); + + out->data = 0; + p = out->area; + for (i = 0; i < name_count; i++) { + ne = X509_NAME_get_entry(a, i); + obj = X509_NAME_ENTRY_get_object(ne); + data = X509_NAME_ENTRY_get_data(ne); + data_ptr = ASN1_STRING_get0_data(data); + data_len = ASN1_STRING_length(data); + n = OBJ_obj2nid(obj); + if ((n == NID_undef) || ((s = OBJ_nid2sn(n)) == NULL)) { + i2t_ASN1_OBJECT(tmp, sizeof(tmp), obj); + s = tmp; + } + ln = strlen(s); + + l += 1 + ln + 1 + data_len; + if (l > out->size) + return -1; + out->data = l; + + *(p++)='/'; + memcpy(p, s, ln); + p += ln; + *(p++)='='; + memcpy(p, data_ptr, data_len); + p += data_len; + } + + if (!out->data) + return 0; + + return 1; +} + + +extern int ssl_client_crt_ref_index; + +/* + * This function fetches the SSL certificate for a specific connection (either + * client certificate or server certificate depending on the cert_peer + * parameter). + * When trying to get the peer certificate from the server side, we first try to + * use the dedicated SSL_get_peer_certificate function, but we fall back to + * trying to get the client certificate reference that might have been stored in + * the SSL structure's ex_data during the verification process. + * Returns NULL in case of failure. + */ +X509* ssl_sock_get_peer_certificate(SSL *ssl) +{ + X509* cert; + + cert = SSL_get_peer_certificate(ssl); + /* Get the client certificate reference stored in the SSL + * structure's ex_data during the verification process. */ + if (!cert) { + cert = SSL_get_ex_data(ssl, ssl_client_crt_ref_index); + if (cert) + X509_up_ref(cert); + } + + return cert; +} + +/* + * This function fetches the x509* for the root CA of client certificate + * from the verified chain. We use the SSL_get0_verified_chain and get the + * last certificate in the x509 stack. + * + * Returns NULL in case of failure. +*/ +#ifdef HAVE_SSL_get0_verified_chain +X509* ssl_sock_get_verified_chain_root(SSL *ssl) +{ + STACK_OF(X509) *chain = NULL; + X509 *crt = NULL; + int i; + + chain = SSL_get0_verified_chain(ssl); + if (!chain) + return NULL; + + for (i = 0; i < sk_X509_num(chain); i++) { + crt = sk_X509_value(chain, i); + + if (X509_check_issued(crt, crt) == X509_V_OK) + break; + } + + return crt; +} +#endif + +/* + * Take an OpenSSL version in text format and return a numeric openssl version + * Return 0 if it failed to parse the version + * + * https://www.openssl.org/docs/man1.1.1/man3/OPENSSL_VERSION_NUMBER.html + * + * MNNFFPPS: major minor fix patch status + * + * The status nibble has one of the values 0 for development, 1 to e for betas + * 1 to 14, and f for release. + * + * for example + * + * 0x0090821f 0.9.8zh + * 0x1000215f 1.0.2u + * 0x30000000 3.0.0-alpha17 + * 0x30000002 3.0.0-beta2 + * 0x3000000e 3.0.0-beta14 + * 0x3000000f 3.0.0 + */ +unsigned int openssl_version_parser(const char *version) +{ + unsigned int numversion; + unsigned int major = 0, minor = 0, fix = 0, patch = 0, status = 0; + char *p, *end; + + p = (char *)version; + + if (!p || !*p) + return 0; + + major = strtol(p, &end, 10); + if (*end != '.' || major > 0xf) + goto error; + p = end + 1; + + minor = strtol(p, &end, 10); + if (*end != '.' 
|| minor > 0xff)
+		goto error;
+	p = end + 1;
+
+	fix = strtol(p, &end, 10);
+	if (fix > 0xff)
+		goto error;
+	p = end;
+
+	if (!*p) {
+		/* end of the string, that's a release */
+		status = 0xf;
+	} else if (*p == '-') {
+		/* after the hyphen, only a beta increments the status
+		 * counter; all other versions are considered "dev" and
+		 * do not increment anything */
+		p++;
+
+		if (!strncmp(p, "beta", 4)) {
+			p += 4;
+			status = strtol(p, &end, 10);
+			if (status > 14)
+				goto error;
+		}
+	} else {
+		/* that's a patch release */
+		patch = 1;
+
+		/* add the value of each letter */
+		while (*p) {
+			patch += (*p & ~0x20) - 'A';
+			p++;
+		}
+		status = 0xf;
+	}
+
+end:
+	numversion = ((major & 0xf) << 28) | ((minor & 0xff) << 20) | ((fix & 0xff) << 12) | ((patch & 0xff) << 4) | (status & 0xf);
+	return numversion;
+
+error:
+	return 0;
+
+}
+
+/* Exclude GREASE (RFC8701) values from input buffer */
+void exclude_tls_grease(char *input, int len, struct buffer *output)
+{
+	int ptr = 0;
+
+	while (ptr < len - 1) {
+		if (input[ptr] != input[ptr+1] || (input[ptr] & 0x0f) != 0x0a) {
+			if (output->data <= output->size - 2) {
+				memcpy(output->area + output->data, input + ptr, 2);
+				output->data += 2;
+			} else
+				break;
+		}
+		ptr += 2;
+	}
+	if (output->size - output->data > 0 && len - ptr > 0)
+		output->area[output->data++] = input[ptr];
+}
+
+/*
+ * The following generates an array <x509_v_codes> in which the X509_V_ERR_*
+ * codes are populated with their string equivalent. Depending on the version
+ * of the SSL library, some codes do not exist; these will be populated as
+ * "-1" in the array.
+ *
+ * The list was taken from
+ * https://github.com/openssl/openssl/blob/master/include/openssl/x509_vfy.h.in
+ * and must be updated when new constants are introduced.
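+ *
+ * The two-level stringification is what makes this work: in V(x), #x always
+ * yields the constant's name, while _Q(x) expands x first, so .value holds
+ * the numeric value as a string whenever the macro is defined. When it is
+ * not, .value and .string come out identical, which init_x509_v_err_tab()
+ * detects with strcmp() and leaves .code at -1.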
+ */ + +#undef _Q +#define _Q(x) (#x) +#undef V +#define V(x) { .code = -1, .value = _Q(x), .string = #x } + +static struct x509_v_codes { + int code; // integer value of the code or -1 if undefined + const char *value; // value of the macro as a string or its name + const char *string; // name of the macro +} x509_v_codes[] = { + V(X509_V_OK), + V(X509_V_ERR_UNSPECIFIED), + V(X509_V_ERR_UNABLE_TO_GET_ISSUER_CERT), + V(X509_V_ERR_UNABLE_TO_GET_CRL), + V(X509_V_ERR_UNABLE_TO_DECRYPT_CERT_SIGNATURE), + V(X509_V_ERR_UNABLE_TO_DECRYPT_CRL_SIGNATURE), + V(X509_V_ERR_UNABLE_TO_DECODE_ISSUER_PUBLIC_KEY), + V(X509_V_ERR_CERT_SIGNATURE_FAILURE), + V(X509_V_ERR_CRL_SIGNATURE_FAILURE), + V(X509_V_ERR_CERT_NOT_YET_VALID), + V(X509_V_ERR_CERT_HAS_EXPIRED), + V(X509_V_ERR_CRL_NOT_YET_VALID), + V(X509_V_ERR_CRL_HAS_EXPIRED), + V(X509_V_ERR_ERROR_IN_CERT_NOT_BEFORE_FIELD), + V(X509_V_ERR_ERROR_IN_CERT_NOT_AFTER_FIELD), + V(X509_V_ERR_ERROR_IN_CRL_LAST_UPDATE_FIELD), + V(X509_V_ERR_ERROR_IN_CRL_NEXT_UPDATE_FIELD), + V(X509_V_ERR_OUT_OF_MEM), + V(X509_V_ERR_DEPTH_ZERO_SELF_SIGNED_CERT), + V(X509_V_ERR_SELF_SIGNED_CERT_IN_CHAIN), + V(X509_V_ERR_UNABLE_TO_GET_ISSUER_CERT_LOCALLY), + V(X509_V_ERR_UNABLE_TO_VERIFY_LEAF_SIGNATURE), + V(X509_V_ERR_CERT_CHAIN_TOO_LONG), + V(X509_V_ERR_CERT_REVOKED), + V(X509_V_ERR_NO_ISSUER_PUBLIC_KEY), + V(X509_V_ERR_PATH_LENGTH_EXCEEDED), + V(X509_V_ERR_INVALID_PURPOSE), + V(X509_V_ERR_CERT_UNTRUSTED), + V(X509_V_ERR_CERT_REJECTED), + V(X509_V_ERR_SUBJECT_ISSUER_MISMATCH), + V(X509_V_ERR_AKID_SKID_MISMATCH), + V(X509_V_ERR_AKID_ISSUER_SERIAL_MISMATCH), + V(X509_V_ERR_KEYUSAGE_NO_CERTSIGN), + V(X509_V_ERR_UNABLE_TO_GET_CRL_ISSUER), + V(X509_V_ERR_UNHANDLED_CRITICAL_EXTENSION), + V(X509_V_ERR_KEYUSAGE_NO_CRL_SIGN), + V(X509_V_ERR_UNHANDLED_CRITICAL_CRL_EXTENSION), + V(X509_V_ERR_INVALID_NON_CA), + V(X509_V_ERR_PROXY_PATH_LENGTH_EXCEEDED), + V(X509_V_ERR_KEYUSAGE_NO_DIGITAL_SIGNATURE), + V(X509_V_ERR_PROXY_CERTIFICATES_NOT_ALLOWED), + V(X509_V_ERR_INVALID_EXTENSION), + V(X509_V_ERR_INVALID_POLICY_EXTENSION), + V(X509_V_ERR_NO_EXPLICIT_POLICY), + V(X509_V_ERR_DIFFERENT_CRL_SCOPE), + V(X509_V_ERR_UNSUPPORTED_EXTENSION_FEATURE), + V(X509_V_ERR_UNNESTED_RESOURCE), + V(X509_V_ERR_PERMITTED_VIOLATION), + V(X509_V_ERR_EXCLUDED_VIOLATION), + V(X509_V_ERR_SUBTREE_MINMAX), + V(X509_V_ERR_APPLICATION_VERIFICATION), + V(X509_V_ERR_UNSUPPORTED_CONSTRAINT_TYPE), + V(X509_V_ERR_UNSUPPORTED_CONSTRAINT_SYNTAX), + V(X509_V_ERR_UNSUPPORTED_NAME_SYNTAX), + V(X509_V_ERR_CRL_PATH_VALIDATION_ERROR), + V(X509_V_ERR_PATH_LOOP), + V(X509_V_ERR_SUITE_B_INVALID_VERSION), + V(X509_V_ERR_SUITE_B_INVALID_ALGORITHM), + V(X509_V_ERR_SUITE_B_INVALID_CURVE), + V(X509_V_ERR_SUITE_B_INVALID_SIGNATURE_ALGORITHM), + V(X509_V_ERR_SUITE_B_LOS_NOT_ALLOWED), + V(X509_V_ERR_SUITE_B_CANNOT_SIGN_P_384_WITH_P_256), + V(X509_V_ERR_HOSTNAME_MISMATCH), + V(X509_V_ERR_EMAIL_MISMATCH), + V(X509_V_ERR_IP_ADDRESS_MISMATCH), + V(X509_V_ERR_DANE_NO_MATCH), + V(X509_V_ERR_EE_KEY_TOO_SMALL), + V(X509_V_ERR_CA_KEY_TOO_SMALL), + V(X509_V_ERR_CA_MD_TOO_WEAK), + V(X509_V_ERR_INVALID_CALL), + V(X509_V_ERR_STORE_LOOKUP), + V(X509_V_ERR_NO_VALID_SCTS), + V(X509_V_ERR_PROXY_SUBJECT_NAME_VIOLATION), + V(X509_V_ERR_OCSP_VERIFY_NEEDED), + V(X509_V_ERR_OCSP_VERIFY_FAILED), + V(X509_V_ERR_OCSP_CERT_UNKNOWN), + V(X509_V_ERR_UNSUPPORTED_SIGNATURE_ALGORITHM), + V(X509_V_ERR_SIGNATURE_ALGORITHM_MISMATCH), + V(X509_V_ERR_SIGNATURE_ALGORITHM_INCONSISTENCY), + V(X509_V_ERR_INVALID_CA), + V(X509_V_ERR_PATHLEN_INVALID_FOR_NON_CA), + 
V(X509_V_ERR_PATHLEN_WITHOUT_KU_KEY_CERT_SIGN),
+	V(X509_V_ERR_KU_KEY_CERT_SIGN_INVALID_FOR_NON_CA),
+	V(X509_V_ERR_ISSUER_NAME_EMPTY),
+	V(X509_V_ERR_SUBJECT_NAME_EMPTY),
+	V(X509_V_ERR_MISSING_AUTHORITY_KEY_IDENTIFIER),
+	V(X509_V_ERR_MISSING_SUBJECT_KEY_IDENTIFIER),
+	V(X509_V_ERR_EMPTY_SUBJECT_ALT_NAME),
+	V(X509_V_ERR_EMPTY_SUBJECT_SAN_NOT_CRITICAL),
+	V(X509_V_ERR_CA_BCONS_NOT_CRITICAL),
+	V(X509_V_ERR_AUTHORITY_KEY_IDENTIFIER_CRITICAL),
+	V(X509_V_ERR_SUBJECT_KEY_IDENTIFIER_CRITICAL),
+	V(X509_V_ERR_CA_CERT_MISSING_KEY_USAGE),
+	V(X509_V_ERR_EXTENSIONS_REQUIRE_VERSION_3),
+	V(X509_V_ERR_EC_KEY_EXPLICIT_PARAMS),
+	{ 0, NULL, NULL },
+};
+
+/*
+ * Return the X509_V_ERR code corresponding to the name of the constant.
+ * See https://github.com/openssl/openssl/blob/master/include/openssl/x509_vfy.h.in
+ * If not found, return -1.
+ */
+int x509_v_err_str_to_int(const char *str)
+{
+	int i;
+
+	for (i = 0; x509_v_codes[i].string; i++) {
+		if (strcmp(str, x509_v_codes[i].string) == 0) {
+			return x509_v_codes[i].code;
+		}
+	}
+
+	return -1;
+}
+
+/*
+ * Return the constant name corresponding to the X509_V_ERR code.
+ * See https://github.com/openssl/openssl/blob/master/include/openssl/x509_vfy.h.in
+ * If not found, return NULL.
+ */
+const char *x509_v_err_int_to_str(int code)
+{
+	int i;
+
+	if (code == -1)
+		return NULL;
+
+	for (i = 0; x509_v_codes[i].string; i++) {
+		if (x509_v_codes[i].code == code) {
+			return x509_v_codes[i].string;
+		}
+	}
+	return NULL;
+}
+
+void init_x509_v_err_tab(void)
+{
+	int i;
+
+	for (i = 0; x509_v_codes[i].string; i++) {
+		/* either the macro exists, or it's equal to its own name */
+		if (strcmp(x509_v_codes[i].string, x509_v_codes[i].value) == 0)
+			continue;
+		x509_v_codes[i].code = atoi(x509_v_codes[i].value);
+	}
+}
+
+INITCALL0(STG_REGISTER, init_x509_v_err_tab);
+
+
+/*
+ * This function returns the number of seconds elapsed between the Epoch,
+ * 1970-01-01 00:00:00 +0000 (UTC), and the date presented in
+ * ASN1_GENERALIZEDTIME.
+ *
+ * In case of a parsing error, it returns -1.
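+ *
+ * As a worked example, "20240101000000Z" yields 1704067200, i.e.
+ * 2024-01-01T00:00:00Z.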
+ */
+long asn1_generalizedtime_to_epoch(ASN1_GENERALIZEDTIME *d)
+{
+	long epoch;
+	char *p, *end;
+	const unsigned short month_offset[12] = {
+		0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334
+	};
+	unsigned long year, month;
+
+	if (!d || (d->type != V_ASN1_GENERALIZEDTIME)) return -1;
+
+	p = (char *)d->data;
+	end = p + d->length;
+
+	if (end - p < 4) return -1;
+	year = 1000 * (p[0] - '0') + 100 * (p[1] - '0') + 10 * (p[2] - '0') + p[3] - '0';
+	p += 4;
+	if (end - p < 2) return -1;
+	month = 10 * (p[0] - '0') + p[1] - '0';
+	if (month < 1 || month > 12) return -1;
+	/* Compute the number of seconds between 1 Jan 1970 and the beginning
+	   of the current month. We account for leap years, whose extra day
+	   depends on whether the current month is before March or not. */
+	epoch = (  ((year - 1970) * 365)
+		 + ((year - (month < 3)) / 4 - (year - (month < 3)) / 100 + (year - (month < 3)) / 400)
+		 - ((1970 - 1) / 4 - (1970 - 1) / 100 + (1970 - 1) / 400)
+		 + month_offset[month-1]
+		) * 24 * 60 * 60;
+	p += 2;
+	if (end - p < 2) return -1;
+	/* Add the number of seconds of completed days of current month */
+	epoch += (10 * (p[0] - '0') + p[1] - '0' - 1) * 24 * 60 * 60;
+	p += 2;
+	if (end - p < 2) return -1;
+	/* Add the completed hours of the current day */
+	epoch += (10 * (p[0] - '0') + p[1] - '0') * 60 * 60;
+	p += 2;
+	if (end - p < 2) return -1;
+	/* Add the completed minutes of the current hour */
+	epoch += (10 * (p[0] - '0') + p[1] - '0') * 60;
+	p += 2;
+	if (p == end) return -1;
+	/* Check whether seconds are present */
+	if (p[0] < '0' || p[0] > '9')
+		goto nosec;
+	if (end - p < 2) return -1;
+	/* Add the seconds of the current minute */
+	epoch += 10 * (p[0] - '0') + p[1] - '0';
+	p += 2;
+	if (p == end) return -1;
+	/* Ignore the fractional part of seconds if present */
+	if (p[0] == '.') {
+		do {
+			if (++p == end) return -1;
+		} while (p[0] >= '0' && p[0] <= '9');
+	}
+
+nosec:
+	if (p[0] == 'Z') {
+		if (end - p != 1) return -1;
+		return epoch;
+	}
+	else if (p[0] == '+') {
+		if (end - p != 5) return -1;
+		/* Apply timezone offset */
+		return epoch - ((10 * (p[1] - '0') + p[2] - '0') * 60 * 60 + (10 * (p[3] - '0') + p[4] - '0')) * 60;
+	}
+	else if (p[0] == '-') {
+		if (end - p != 5) return -1;
+		/* Apply timezone offset */
+		return epoch + ((10 * (p[1] - '0') + p[2] - '0') * 60 * 60 + (10 * (p[3] - '0') + p[4] - '0')) * 60;
+	}
+
+	return -1;
+}
diff --git a/src/stats.c b/src/stats.c
new file mode 100644
index 0000000..0ed5758
--- /dev/null
+++ b/src/stats.c
@@ -0,0 +1,5521 @@
+/*
+ * Functions dedicated to statistics output and the stats socket
+ *
+ * Copyright 2000-2012 Willy Tarreau <w@1wt.eu>
+ * Copyright 2007-2009 Krzysztof Piotr Oledzki <ole@ans.pl>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ * + */ + +#include <ctype.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <pwd.h> +#include <grp.h> + +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> + +#include <haproxy/api.h> +#include <haproxy/activity.h> +#include <haproxy/applet.h> +#include <haproxy/backend.h> +#include <haproxy/base64.h> +#include <haproxy/cfgparse.h> +#include <haproxy/channel.h> +#include <haproxy/check.h> +#include <haproxy/cli.h> +#include <haproxy/clock.h> +#include <haproxy/compression.h> +#include <haproxy/debug.h> +#include <haproxy/errors.h> +#include <haproxy/fd.h> +#include <haproxy/freq_ctr.h> +#include <haproxy/frontend.h> +#include <haproxy/global.h> +#include <haproxy/http.h> +#include <haproxy/http_ana.h> +#include <haproxy/http_htx.h> +#include <haproxy/htx.h> +#include <haproxy/list.h> +#include <haproxy/listener.h> +#include <haproxy/log.h> +#include <haproxy/map-t.h> +#include <haproxy/pattern-t.h> +#include <haproxy/pipe.h> +#include <haproxy/pool.h> +#include <haproxy/proxy.h> +#include <haproxy/resolvers.h> +#include <haproxy/sc_strm.h> +#include <haproxy/server.h> +#include <haproxy/session.h> +#include <haproxy/stats.h> +#include <haproxy/stconn.h> +#include <haproxy/stream.h> +#include <haproxy/task.h> +#include <haproxy/ticks.h> +#include <haproxy/time.h> +#include <haproxy/tools.h> +#include <haproxy/uri_auth-t.h> +#include <haproxy/version.h> + + +/* status codes available for the stats admin page (strictly 4 chars length) */ +const char *stat_status_codes[STAT_STATUS_SIZE] = { + [STAT_STATUS_DENY] = "DENY", + [STAT_STATUS_DONE] = "DONE", + [STAT_STATUS_ERRP] = "ERRP", + [STAT_STATUS_EXCD] = "EXCD", + [STAT_STATUS_NONE] = "NONE", + [STAT_STATUS_PART] = "PART", + [STAT_STATUS_UNKN] = "UNKN", + [STAT_STATUS_IVAL] = "IVAL", +}; + +/* These are the field names for each INF_* field position. Please pay attention + * to always use the exact same name except that the strings for new names must + * be lower case or CamelCase while the enum entries must be upper case. 
+ */ +const struct name_desc info_fields[INF_TOTAL_FIELDS] = { + [INF_NAME] = { .name = "Name", .desc = "Product name" }, + [INF_VERSION] = { .name = "Version", .desc = "Product version" }, + [INF_RELEASE_DATE] = { .name = "Release_date", .desc = "Date of latest source code update" }, + [INF_NBTHREAD] = { .name = "Nbthread", .desc = "Number of started threads (global.nbthread)" }, + [INF_NBPROC] = { .name = "Nbproc", .desc = "Number of started worker processes (historical, always 1)" }, + [INF_PROCESS_NUM] = { .name = "Process_num", .desc = "Relative worker process number (1)" }, + [INF_PID] = { .name = "Pid", .desc = "This worker process identifier for the system" }, + [INF_UPTIME] = { .name = "Uptime", .desc = "How long ago this worker process was started (days+hours+minutes+seconds)" }, + [INF_UPTIME_SEC] = { .name = "Uptime_sec", .desc = "How long ago this worker process was started (seconds)" }, + [INF_START_TIME_SEC] = { .name = "Start_time_sec", .desc = "Start time in seconds" }, + [INF_MEMMAX_MB] = { .name = "Memmax_MB", .desc = "Worker process's hard limit on memory usage in MB (-m on command line)" }, + [INF_MEMMAX_BYTES] = { .name = "Memmax_bytes", .desc = "Worker process's hard limit on memory usage in byes (-m on command line)" }, + [INF_POOL_ALLOC_MB] = { .name = "PoolAlloc_MB", .desc = "Amount of memory allocated in pools (in MB)" }, + [INF_POOL_ALLOC_BYTES] = { .name = "PoolAlloc_bytes", .desc = "Amount of memory allocated in pools (in bytes)" }, + [INF_POOL_USED_MB] = { .name = "PoolUsed_MB", .desc = "Amount of pool memory currently used (in MB)" }, + [INF_POOL_USED_BYTES] = { .name = "PoolUsed_bytes", .desc = "Amount of pool memory currently used (in bytes)" }, + [INF_POOL_FAILED] = { .name = "PoolFailed", .desc = "Number of failed pool allocations since this worker was started" }, + [INF_ULIMIT_N] = { .name = "Ulimit-n", .desc = "Hard limit on the number of per-process file descriptors" }, + [INF_MAXSOCK] = { .name = "Maxsock", .desc = "Hard limit on the number of per-process sockets" }, + [INF_MAXCONN] = { .name = "Maxconn", .desc = "Hard limit on the number of per-process connections (configured or imposed by Ulimit-n)" }, + [INF_HARD_MAXCONN] = { .name = "Hard_maxconn", .desc = "Hard limit on the number of per-process connections (imposed by Memmax_MB or Ulimit-n)" }, + [INF_CURR_CONN] = { .name = "CurrConns", .desc = "Current number of connections on this worker process" }, + [INF_CUM_CONN] = { .name = "CumConns", .desc = "Total number of connections on this worker process since started" }, + [INF_CUM_REQ] = { .name = "CumReq", .desc = "Total number of requests on this worker process since started" }, + [INF_MAX_SSL_CONNS] = { .name = "MaxSslConns", .desc = "Hard limit on the number of per-process SSL endpoints (front+back), 0=unlimited" }, + [INF_CURR_SSL_CONNS] = { .name = "CurrSslConns", .desc = "Current number of SSL endpoints on this worker process (front+back)" }, + [INF_CUM_SSL_CONNS] = { .name = "CumSslConns", .desc = "Total number of SSL endpoints on this worker process since started (front+back)" }, + [INF_MAXPIPES] = { .name = "Maxpipes", .desc = "Hard limit on the number of pipes for splicing, 0=unlimited" }, + [INF_PIPES_USED] = { .name = "PipesUsed", .desc = "Current number of pipes in use in this worker process" }, + [INF_PIPES_FREE] = { .name = "PipesFree", .desc = "Current number of allocated and available pipes in this worker process" }, + [INF_CONN_RATE] = { .name = "ConnRate", .desc = "Number of front connections created on this worker process 
over the last second" }, + [INF_CONN_RATE_LIMIT] = { .name = "ConnRateLimit", .desc = "Hard limit for ConnRate (global.maxconnrate)" }, + [INF_MAX_CONN_RATE] = { .name = "MaxConnRate", .desc = "Highest ConnRate reached on this worker process since started (in connections per second)" }, + [INF_SESS_RATE] = { .name = "SessRate", .desc = "Number of sessions created on this worker process over the last second" }, + [INF_SESS_RATE_LIMIT] = { .name = "SessRateLimit", .desc = "Hard limit for SessRate (global.maxsessrate)" }, + [INF_MAX_SESS_RATE] = { .name = "MaxSessRate", .desc = "Highest SessRate reached on this worker process since started (in sessions per second)" }, + [INF_SSL_RATE] = { .name = "SslRate", .desc = "Number of SSL connections created on this worker process over the last second" }, + [INF_SSL_RATE_LIMIT] = { .name = "SslRateLimit", .desc = "Hard limit for SslRate (global.maxsslrate)" }, + [INF_MAX_SSL_RATE] = { .name = "MaxSslRate", .desc = "Highest SslRate reached on this worker process since started (in connections per second)" }, + [INF_SSL_FRONTEND_KEY_RATE] = { .name = "SslFrontendKeyRate", .desc = "Number of SSL keys created on frontends in this worker process over the last second" }, + [INF_SSL_FRONTEND_MAX_KEY_RATE] = { .name = "SslFrontendMaxKeyRate", .desc = "Highest SslFrontendKeyRate reached on this worker process since started (in SSL keys per second)" }, + [INF_SSL_FRONTEND_SESSION_REUSE_PCT] = { .name = "SslFrontendSessionReuse_pct", .desc = "Percent of frontend SSL connections which did not require a new key" }, + [INF_SSL_BACKEND_KEY_RATE] = { .name = "SslBackendKeyRate", .desc = "Number of SSL keys created on backends in this worker process over the last second" }, + [INF_SSL_BACKEND_MAX_KEY_RATE] = { .name = "SslBackendMaxKeyRate", .desc = "Highest SslBackendKeyRate reached on this worker process since started (in SSL keys per second)" }, + [INF_SSL_CACHE_LOOKUPS] = { .name = "SslCacheLookups", .desc = "Total number of SSL session ID lookups in the SSL session cache on this worker since started" }, + [INF_SSL_CACHE_MISSES] = { .name = "SslCacheMisses", .desc = "Total number of SSL session ID lookups that didn't find a session in the SSL session cache on this worker since started" }, + [INF_COMPRESS_BPS_IN] = { .name = "CompressBpsIn", .desc = "Number of bytes submitted to the HTTP compressor in this worker process over the last second" }, + [INF_COMPRESS_BPS_OUT] = { .name = "CompressBpsOut", .desc = "Number of bytes emitted by the HTTP compressor in this worker process over the last second" }, + [INF_COMPRESS_BPS_RATE_LIM] = { .name = "CompressBpsRateLim", .desc = "Limit of CompressBpsOut beyond which HTTP compression is automatically disabled" }, + [INF_ZLIB_MEM_USAGE] = { .name = "ZlibMemUsage", .desc = "Amount of memory currently used by HTTP compression on the current worker process (in bytes)" }, + [INF_MAX_ZLIB_MEM_USAGE] = { .name = "MaxZlibMemUsage", .desc = "Limit on the amount of memory used by HTTP compression above which it is automatically disabled (in bytes, see global.maxzlibmem)" }, + [INF_TASKS] = { .name = "Tasks", .desc = "Total number of tasks in the current worker process (active + sleeping)" }, + [INF_RUN_QUEUE] = { .name = "Run_queue", .desc = "Total number of active tasks+tasklets in the current worker process" }, + [INF_IDLE_PCT] = { .name = "Idle_pct", .desc = "Percentage of last second spent waiting in the current worker thread" }, + [INF_NODE] = { .name = "node", .desc = "Node name (global.node)" }, + [INF_DESCRIPTION] = { .name = 
"description", .desc = "Node description (global.description)" }, + [INF_STOPPING] = { .name = "Stopping", .desc = "1 if the worker process is currently stopping, otherwise zero" }, + [INF_JOBS] = { .name = "Jobs", .desc = "Current number of active jobs on the current worker process (frontend connections, master connections, listeners)" }, + [INF_UNSTOPPABLE_JOBS] = { .name = "Unstoppable Jobs", .desc = "Current number of unstoppable jobs on the current worker process (master connections)" }, + [INF_LISTENERS] = { .name = "Listeners", .desc = "Current number of active listeners on the current worker process" }, + [INF_ACTIVE_PEERS] = { .name = "ActivePeers", .desc = "Current number of verified active peers connections on the current worker process" }, + [INF_CONNECTED_PEERS] = { .name = "ConnectedPeers", .desc = "Current number of peers having passed the connection step on the current worker process" }, + [INF_DROPPED_LOGS] = { .name = "DroppedLogs", .desc = "Total number of dropped logs for current worker process since started" }, + [INF_BUSY_POLLING] = { .name = "BusyPolling", .desc = "1 if busy-polling is currently in use on the worker process, otherwise zero (config.busy-polling)" }, + [INF_FAILED_RESOLUTIONS] = { .name = "FailedResolutions", .desc = "Total number of failed DNS resolutions in current worker process since started" }, + [INF_TOTAL_BYTES_OUT] = { .name = "TotalBytesOut", .desc = "Total number of bytes emitted by current worker process since started" }, + [INF_TOTAL_SPLICED_BYTES_OUT] = { .name = "TotalSplicedBytesOut", .desc = "Total number of bytes emitted by current worker process through a kernel pipe since started" }, + [INF_BYTES_OUT_RATE] = { .name = "BytesOutRate", .desc = "Number of bytes emitted by current worker process over the last second" }, + [INF_DEBUG_COMMANDS_ISSUED] = { .name = "DebugCommandsIssued", .desc = "Number of debug commands issued on this process (anything > 0 is unsafe)" }, + [INF_CUM_LOG_MSGS] = { .name = "CumRecvLogs", .desc = "Total number of log messages received by log-forwarding listeners on this worker process since started" }, + [INF_BUILD_INFO] = { .name = "Build info", .desc = "Build info" }, + [INF_TAINTED] = { .name = "Tainted", .desc = "Experimental features used" }, + [INF_WARNINGS] = { .name = "TotalWarnings", .desc = "Total warnings issued" }, + [INF_MAXCONN_REACHED] = { .name = "MaxconnReached", .desc = "Number of times an accepted connection resulted in Maxconn being reached" }, + [INF_BOOTTIME_MS] = { .name = "BootTime_ms", .desc = "How long ago it took to parse and process the config before being ready (milliseconds)" }, + [INF_NICED_TASKS] = { .name = "Niced_tasks", .desc = "Total number of active tasks+tasklets in the current worker process (Run_queue) that are niced" }, +}; + +const struct name_desc stat_fields[ST_F_TOTAL_FIELDS] = { + [ST_F_PXNAME] = { .name = "pxname", .desc = "Proxy name" }, + [ST_F_SVNAME] = { .name = "svname", .desc = "Server name" }, + [ST_F_QCUR] = { .name = "qcur", .desc = "Number of current queued connections" }, + [ST_F_QMAX] = { .name = "qmax", .desc = "Highest value of queued connections encountered since process started" }, + [ST_F_SCUR] = { .name = "scur", .desc = "Number of current sessions on the frontend, backend or server" }, + [ST_F_SMAX] = { .name = "smax", .desc = "Highest value of current sessions encountered since process started" }, + [ST_F_SLIM] = { .name = "slim", .desc = "Frontend/listener/server's maxconn, backend's fullconn" }, + [ST_F_STOT] = { .name = "stot", .desc = "Total 
number of sessions since process started" }, + [ST_F_BIN] = { .name = "bin", .desc = "Total number of request bytes since process started" }, + [ST_F_BOUT] = { .name = "bout", .desc = "Total number of response bytes since process started" }, + [ST_F_DREQ] = { .name = "dreq", .desc = "Total number of denied requests since process started" }, + [ST_F_DRESP] = { .name = "dresp", .desc = "Total number of denied responses since process started" }, + [ST_F_EREQ] = { .name = "ereq", .desc = "Total number of invalid requests since process started" }, + [ST_F_ECON] = { .name = "econ", .desc = "Total number of failed connections to server since the worker process started" }, + [ST_F_ERESP] = { .name = "eresp", .desc = "Total number of invalid responses since the worker process started" }, + [ST_F_WRETR] = { .name = "wretr", .desc = "Total number of server connection retries since the worker process started" }, + [ST_F_WREDIS] = { .name = "wredis", .desc = "Total number of server redispatches due to connection failures since the worker process started" }, + [ST_F_STATUS] = { .name = "status", .desc = "Frontend/listen status: OPEN/WAITING/FULL/STOP; backend: UP/DOWN; server: last check status" }, + [ST_F_WEIGHT] = { .name = "weight", .desc = "Server's effective weight, or sum of active servers' effective weights for a backend" }, + [ST_F_ACT] = { .name = "act", .desc = "Total number of active UP servers with a non-zero weight" }, + [ST_F_BCK] = { .name = "bck", .desc = "Total number of backup UP servers with a non-zero weight" }, + [ST_F_CHKFAIL] = { .name = "chkfail", .desc = "Total number of failed individual health checks per server/backend, since the worker process started" }, + [ST_F_CHKDOWN] = { .name = "chkdown", .desc = "Total number of failed checks causing UP to DOWN server transitions, per server/backend, since the worker process started" }, + [ST_F_LASTCHG] = { .name = "lastchg", .desc = "How long ago the last server state changed, in seconds" }, + [ST_F_DOWNTIME] = { .name = "downtime", .desc = "Total time spent in DOWN state, for server or backend" }, + [ST_F_QLIMIT] = { .name = "qlimit", .desc = "Limit on the number of connections in queue, for servers only (maxqueue argument)" }, + [ST_F_PID] = { .name = "pid", .desc = "Relative worker process number (1)" }, + [ST_F_IID] = { .name = "iid", .desc = "Frontend or Backend numeric identifier ('id' setting)" }, + [ST_F_SID] = { .name = "sid", .desc = "Server numeric identifier ('id' setting)" }, + [ST_F_THROTTLE] = { .name = "throttle", .desc = "Throttling ratio applied to a server's maxconn and weight during the slowstart period (0 to 100%)" }, + [ST_F_LBTOT] = { .name = "lbtot", .desc = "Total number of requests routed by load balancing since the worker process started (ignores queue pop and stickiness)" }, + [ST_F_TRACKED] = { .name = "tracked", .desc = "Name of the other server this server tracks for its state" }, + [ST_F_TYPE] = { .name = "type", .desc = "Type of the object (Listener, Frontend, Backend, Server)" }, + [ST_F_RATE] = { .name = "rate", .desc = "Total number of sessions processed by this object over the last second (sessions for listeners/frontends, requests for backends/servers)" }, + [ST_F_RATE_LIM] = { .name = "rate_lim", .desc = "Limit on the number of sessions accepted in a second (frontend only, 'rate-limit sessions' setting)" }, + [ST_F_RATE_MAX] = { .name = "rate_max", .desc = "Highest value of sessions per second observed since the worker process started" }, + [ST_F_CHECK_STATUS] = { .name = "check_status", .desc = 
"Status report of the server's latest health check, prefixed with '*' if a check is currently in progress" }, + [ST_F_CHECK_CODE] = { .name = "check_code", .desc = "HTTP/SMTP/LDAP status code reported by the latest server health check" }, + [ST_F_CHECK_DURATION] = { .name = "check_duration", .desc = "Total duration of the latest server health check, in milliseconds" }, + [ST_F_HRSP_1XX] = { .name = "hrsp_1xx", .desc = "Total number of HTTP responses with status 100-199 returned by this object since the worker process started" }, + [ST_F_HRSP_2XX] = { .name = "hrsp_2xx", .desc = "Total number of HTTP responses with status 200-299 returned by this object since the worker process started" }, + [ST_F_HRSP_3XX] = { .name = "hrsp_3xx", .desc = "Total number of HTTP responses with status 300-399 returned by this object since the worker process started" }, + [ST_F_HRSP_4XX] = { .name = "hrsp_4xx", .desc = "Total number of HTTP responses with status 400-499 returned by this object since the worker process started" }, + [ST_F_HRSP_5XX] = { .name = "hrsp_5xx", .desc = "Total number of HTTP responses with status 500-599 returned by this object since the worker process started" }, + [ST_F_HRSP_OTHER] = { .name = "hrsp_other", .desc = "Total number of HTTP responses with status <100, >599 returned by this object since the worker process started (error -1 included)" }, + [ST_F_HANAFAIL] = { .name = "hanafail", .desc = "Total number of failed checks caused by an 'on-error' directive after an 'observe' condition matched" }, + [ST_F_REQ_RATE] = { .name = "req_rate", .desc = "Number of HTTP requests processed over the last second on this object" }, + [ST_F_REQ_RATE_MAX] = { .name = "req_rate_max", .desc = "Highest value of http requests observed since the worker process started" }, + [ST_F_REQ_TOT] = { .name = "req_tot", .desc = "Total number of HTTP requests processed by this object since the worker process started" }, + [ST_F_CLI_ABRT] = { .name = "cli_abrt", .desc = "Total number of requests or connections aborted by the client since the worker process started" }, + [ST_F_SRV_ABRT] = { .name = "srv_abrt", .desc = "Total number of requests or connections aborted by the server since the worker process started" }, + [ST_F_COMP_IN] = { .name = "comp_in", .desc = "Total number of bytes submitted to the HTTP compressor for this object since the worker process started" }, + [ST_F_COMP_OUT] = { .name = "comp_out", .desc = "Total number of bytes emitted by the HTTP compressor for this object since the worker process started" }, + [ST_F_COMP_BYP] = { .name = "comp_byp", .desc = "Total number of bytes that bypassed HTTP compression for this object since the worker process started (CPU/memory/bandwidth limitation)" }, + [ST_F_COMP_RSP] = { .name = "comp_rsp", .desc = "Total number of HTTP responses that were compressed for this object since the worker process started" }, + [ST_F_LASTSESS] = { .name = "lastsess", .desc = "How long ago some traffic was seen on this object on this worker process, in seconds" }, + [ST_F_LAST_CHK] = { .name = "last_chk", .desc = "Short description of the latest health check report for this server (see also check_desc)" }, + [ST_F_LAST_AGT] = { .name = "last_agt", .desc = "Short description of the latest agent check report for this server (see also agent_desc)" }, + [ST_F_QTIME] = { .name = "qtime", .desc = "Time spent in the queue, in milliseconds, averaged over the 1024 last requests (backend/server)" }, + [ST_F_CTIME] = { .name = "ctime", .desc = "Time spent waiting for a connection to 
complete, in milliseconds, averaged over the 1024 last requests (backend/server)" }, + [ST_F_RTIME] = { .name = "rtime", .desc = "Time spent waiting for a server response, in milliseconds, averaged over the 1024 last requests (backend/server)" }, + [ST_F_TTIME] = { .name = "ttime", .desc = "Total request+response time (request+queue+connect+response+processing), in milliseconds, averaged over the 1024 last requests (backend/server)" }, + [ST_F_AGENT_STATUS] = { .name = "agent_status", .desc = "Status report of the server's latest agent check, prefixed with '*' if a check is currently in progress" }, + [ST_F_AGENT_CODE] = { .name = "agent_code", .desc = "Status code reported by the latest server agent check" }, + [ST_F_AGENT_DURATION] = { .name = "agent_duration", .desc = "Total duration of the latest server agent check, in milliseconds" }, + [ST_F_CHECK_DESC] = { .name = "check_desc", .desc = "Textual description of the latest health check report for this server" }, + [ST_F_AGENT_DESC] = { .name = "agent_desc", .desc = "Textual description of the latest agent check report for this server" }, + [ST_F_CHECK_RISE] = { .name = "check_rise", .desc = "Number of successful health checks before declaring a server UP (server 'rise' setting)" }, + [ST_F_CHECK_FALL] = { .name = "check_fall", .desc = "Number of failed health checks before declaring a server DOWN (server 'fall' setting)" }, + [ST_F_CHECK_HEALTH] = { .name = "check_health", .desc = "Current server health check level (0..fall-1=DOWN, fall..rise-1=UP)" }, + [ST_F_AGENT_RISE] = { .name = "agent_rise", .desc = "Number of successful agent checks before declaring a server UP (server 'rise' setting)" }, + [ST_F_AGENT_FALL] = { .name = "agent_fall", .desc = "Number of failed agent checks before declaring a server DOWN (server 'fall' setting)" }, + [ST_F_AGENT_HEALTH] = { .name = "agent_health", .desc = "Current server agent check level (0..fall-1=DOWN, fall..rise-1=UP)" }, + [ST_F_ADDR] = { .name = "addr", .desc = "Server's address:port, shown only if show-legends is set, or at levels oper/admin for the CLI" }, + [ST_F_COOKIE] = { .name = "cookie", .desc = "Backend's cookie name or Server's cookie value, shown only if show-legends is set, or at levels oper/admin for the CLI" }, + [ST_F_MODE] = { .name = "mode", .desc = "'mode' setting (tcp/http/health/cli)" }, + [ST_F_ALGO] = { .name = "algo", .desc = "Backend's load balancing algorithm, shown only if show-legends is set, or at levels oper/admin for the CLI" }, + [ST_F_CONN_RATE] = { .name = "conn_rate", .desc = "Number of new connections accepted over the last second on the frontend for this worker process" }, + [ST_F_CONN_RATE_MAX] = { .name = "conn_rate_max", .desc = "Highest value of connections per second observed since the worker process started" }, + [ST_F_CONN_TOT] = { .name = "conn_tot", .desc = "Total number of new connections accepted on this frontend since the worker process started" }, + [ST_F_INTERCEPTED] = { .name = "intercepted", .desc = "Total number of HTTP requests intercepted on the frontend (redirects/stats/services) since the worker process started" }, + [ST_F_DCON] = { .name = "dcon", .desc = "Total number of incoming connections blocked on a listener/frontend by a tcp-request connection rule since the worker process started" }, + [ST_F_DSES] = { .name = "dses", .desc = "Total number of incoming sessions blocked on a listener/frontend by a tcp-request connection rule since the worker process started" }, + [ST_F_WREW] = { .name = "wrew", .desc = "Total number of failed 
HTTP header rewrites since the worker process started" }, + [ST_F_CONNECT] = { .name = "connect", .desc = "Total number of outgoing connection attempts on this backend/server since the worker process started" }, + [ST_F_REUSE] = { .name = "reuse", .desc = "Total number of reused connections on this backend/server since the worker process started" }, + [ST_F_CACHE_LOOKUPS] = { .name = "cache_lookups", .desc = "Total number of HTTP requests looked up in the cache on this frontend/backend since the worker process started" }, + [ST_F_CACHE_HITS] = { .name = "cache_hits", .desc = "Total number of HTTP requests found in the cache on this frontend/backend since the worker process started" }, + [ST_F_SRV_ICUR] = { .name = "srv_icur", .desc = "Current number of idle connections available for reuse on this server" }, + [ST_F_SRV_ILIM] = { .name = "srv_ilim", .desc = "Limit on the number of available idle connections on this server (server 'pool_max_conn' directive)" }, + [ST_F_QT_MAX] = { .name = "qtime_max", .desc = "Maximum observed time spent in the queue, in milliseconds (backend/server)" }, + [ST_F_CT_MAX] = { .name = "ctime_max", .desc = "Maximum observed time spent waiting for a connection to complete, in milliseconds (backend/server)" }, + [ST_F_RT_MAX] = { .name = "rtime_max", .desc = "Maximum observed time spent waiting for a server response, in milliseconds (backend/server)" }, + [ST_F_TT_MAX] = { .name = "ttime_max", .desc = "Maximum observed total request+response time (request+queue+connect+response+processing), in milliseconds (backend/server)" }, + [ST_F_EINT] = { .name = "eint", .desc = "Total number of internal errors since process started"}, + [ST_F_IDLE_CONN_CUR] = { .name = "idle_conn_cur", .desc = "Current number of unsafe idle connections"}, + [ST_F_SAFE_CONN_CUR] = { .name = "safe_conn_cur", .desc = "Current number of safe idle connections"}, + [ST_F_USED_CONN_CUR] = { .name = "used_conn_cur", .desc = "Current number of connections in use"}, + [ST_F_NEED_CONN_EST] = { .name = "need_conn_est", .desc = "Estimated needed number of connections"}, + [ST_F_UWEIGHT] = { .name = "uweight", .desc = "Server's user weight, or sum of active servers' user weights for a backend" }, + [ST_F_AGG_SRV_CHECK_STATUS] = { .name = "agg_server_check_status", .desc = "[DEPRECATED] Backend's aggregated gauge of servers' status" }, + [ST_F_AGG_SRV_STATUS] = { .name = "agg_server_status", .desc = "Backend's aggregated gauge of servers' status" }, + [ST_F_AGG_CHECK_STATUS] = { .name = "agg_check_status", .desc = "Backend's aggregated gauge of servers' state check status" }, + [ST_F_SRID] = { .name = "srid", .desc = "Server id revision, to prevent server id reuse mixups" }, + [ST_F_SESS_OTHER] = { .name = "sess_other", .desc = "Total number of sessions other than HTTP since process started" }, + [ST_F_H1SESS] = { .name = "h1sess", .desc = "Total number of HTTP/1 sessions since process started" }, + [ST_F_H2SESS] = { .name = "h2sess", .desc = "Total number of HTTP/2 sessions since process started" }, + [ST_F_H3SESS] = { .name = "h3sess", .desc = "Total number of HTTP/3 sessions since process started" }, + [ST_F_REQ_OTHER] = { .name = "req_other", .desc = "Total number of requests other than HTTP processed by this object since the worker process started" }, + [ST_F_H1REQ] = { .name = "h1req", .desc = "Total number of HTTP/1 requests processed by this object since the worker process started" }, + [ST_F_H2REQ] = { .name = "h2req", .desc = "Total number of HTTP/2 requests processed by this object since
the worker process started" }, + [ST_F_H3REQ] = { .name = "h3req", .desc = "Total number of HTTP/3 requests processed by this object since the worker process started" }, + [ST_F_PROTO] = { .name = "proto", .desc = "Protocol" }, +}; + +/* one line of info */ +THREAD_LOCAL struct field info[INF_TOTAL_FIELDS]; + +/* description of statistics (static and dynamic) */ +static struct name_desc *stat_f[STATS_DOMAIN_COUNT]; +static size_t stat_count[STATS_DOMAIN_COUNT]; + +/* one line for stats */ +THREAD_LOCAL struct field *stat_l[STATS_DOMAIN_COUNT]; + +/* list of all registered stats modules */ +static struct list stats_module_list[STATS_DOMAIN_COUNT] = { + LIST_HEAD_INIT(stats_module_list[STATS_DOMAIN_PROXY]), + LIST_HEAD_INIT(stats_module_list[STATS_DOMAIN_RESOLVERS]), +}; + +THREAD_LOCAL void *trash_counters; +static THREAD_LOCAL struct buffer trash_chunk = BUF_NULL; + + +static inline uint8_t stats_get_domain(uint32_t domain) +{ + return domain >> STATS_DOMAIN & STATS_DOMAIN_MASK; +} + +static inline enum stats_domain_px_cap stats_px_get_cap(uint32_t domain) +{ + return domain >> STATS_PX_CAP & STATS_PX_CAP_MASK; +} + +static void stats_dump_json_schema(struct buffer *out); + +int stats_putchk(struct appctx *appctx, struct htx *htx) +{ + struct stconn *sc = appctx_sc(appctx); + struct channel *chn = sc_ic(sc); + struct buffer *chk = &trash_chunk; + + if (htx) { + if (chk->data >= channel_htx_recv_max(chn, htx)) { + sc_need_room(sc, chk->data); + return 0; + } + if (!htx_add_data_atonce(htx, ist2(chk->area, chk->data))) { + sc_need_room(sc, 0); + return 0; + } + channel_add_input(chn, chk->data); + chk->data = 0; + } + else { + if (applet_putchk(appctx, chk) == -1) + return 0; + } + return 1; +} + +static const char *stats_scope_ptr(struct appctx *appctx, struct stconn *sc) +{ + struct show_stat_ctx *ctx = appctx->svcctx; + struct channel *req = sc_oc(sc); + struct htx *htx = htxbuf(&req->buf); + struct htx_blk *blk; + struct ist uri; + + blk = htx_get_head_blk(htx); + BUG_ON(!blk || htx_get_blk_type(blk) != HTX_BLK_REQ_SL); + ALREADY_CHECKED(blk); + uri = htx_sl_req_uri(htx_get_blk_ptr(htx, blk)); + return uri.ptr + ctx->scope_str; +} + +/* + * http_stats_io_handler() + * -> stats_dump_stat_to_buffer() // same as above, but used for CSV or HTML + * -> stats_dump_csv_header() // emits the CSV headers (same as above) + * -> stats_dump_json_header() // emits the JSON headers (same as above) + * -> stats_dump_html_head() // emits the HTML headers + * -> stats_dump_html_info() // emits the equivalent of "show info" at the top + * -> stats_dump_proxy_to_buffer() // same as above, valid for CSV and HTML + * -> stats_dump_html_px_hdr() + * -> stats_dump_fe_stats() + * -> stats_dump_li_stats() + * -> stats_dump_sv_stats() + * -> stats_dump_be_stats() + * -> stats_dump_html_px_end() + * -> stats_dump_html_end() // emits HTML trailer + * -> stats_dump_json_end() // emits JSON trailer + */ + + +/* Dumps the stats CSV header to the local trash buffer. The caller is + * responsible for clearing it if needed. + * NOTE: Some tools happen to rely on the field position instead of its name, + * so please only append new fields at the end, never in the middle.
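The note above about field positions is the contract consumers rely on: a client that addresses CSV columns by index keeps working when fields are appended, and only breaks if one is inserted in the middle. Below is a minimal sketch of such a consumer, assuming the historical layout where scur is the fifth column; strsep() (BSD/glibc, not ISO C) is used instead of strtok() so that HAProxy's many empty CSV fields are not collapsed:

    #include <stdio.h>
    #include <string.h>

    /* Illustrative only: return column <idx> (0-based) of a mutable CSV
     * line, or NULL if the line has fewer columns. strsep() preserves
     * empty fields, which strtok() would silently skip.
     */
    static char *csv_col(char *line, int idx)
    {
        char *tok;

        while ((tok = strsep(&line, ",")) != NULL)
            if (idx-- == 0)
                return tok;
        return NULL;
    }

A caller would then read, e.g., csv_col(line, 4) to fetch scur regardless of how many columns later versions append.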
+ */ +static void stats_dump_csv_header(enum stats_domain domain) +{ + int field; + + chunk_appendf(&trash_chunk, "# "); + if (stat_f[domain]) { + for (field = 0; field < stat_count[domain]; ++field) { + chunk_appendf(&trash_chunk, "%s,", stat_f[domain][field].name); + + /* print special delimiter on proxy stats to mark end of + static fields */ + if (domain == STATS_DOMAIN_PROXY && field + 1 == ST_F_TOTAL_FIELDS) + chunk_appendf(&trash_chunk, "-,"); + } + } + + chunk_appendf(&trash_chunk, "\n"); +} + +/* Emits a stats field without any surrounding element and properly encoded to + * resist CSV output. Returns non-zero on success, 0 if the buffer is full. + */ +int stats_emit_raw_data_field(struct buffer *out, const struct field *f) +{ + switch (field_format(f, 0)) { + case FF_EMPTY: return 1; + case FF_S32: return chunk_appendf(out, "%d", f->u.s32); + case FF_U32: return chunk_appendf(out, "%u", f->u.u32); + case FF_S64: return chunk_appendf(out, "%lld", (long long)f->u.s64); + case FF_U64: return chunk_appendf(out, "%llu", (unsigned long long)f->u.u64); + case FF_FLT: { + size_t prev_data = out->data; + out->data = flt_trim(out->area, prev_data, chunk_appendf(out, "%f", f->u.flt)); + return out->data; + } + case FF_STR: return csv_enc_append(field_str(f, 0), 1, 2, out) != NULL; + default: return chunk_appendf(out, "[INCORRECT_FIELD_TYPE_%08x]", f->type); + } +} + +const char *field_to_html_str(const struct field *f) +{ + switch (field_format(f, 0)) { + case FF_S32: return U2H(f->u.s32); + case FF_S64: return U2H(f->u.s64); + case FF_U64: return U2H(f->u.u64); + case FF_U32: return U2H(f->u.u32); + case FF_FLT: return F2H(f->u.flt); + case FF_STR: return field_str(f, 0); + case FF_EMPTY: + default: + return ""; + } +} + +/* Emits a stats field prefixed with its type. No CSV encoding is prepared, the + * output is supposed to be used on its own line. Returns non-zero on success, 0 + * if the buffer is full. + */ +int stats_emit_typed_data_field(struct buffer *out, const struct field *f) +{ + switch (field_format(f, 0)) { + case FF_EMPTY: return 1; + case FF_S32: return chunk_appendf(out, "s32:%d", f->u.s32); + case FF_U32: return chunk_appendf(out, "u32:%u", f->u.u32); + case FF_S64: return chunk_appendf(out, "s64:%lld", (long long)f->u.s64); + case FF_U64: return chunk_appendf(out, "u64:%llu", (unsigned long long)f->u.u64); + case FF_FLT: { + size_t prev_data = out->data; + out->data = flt_trim(out->area, prev_data, chunk_appendf(out, "flt:%f", f->u.flt)); + return out->data; + } + case FF_STR: return chunk_appendf(out, "str:%s", field_str(f, 0)); + default: return chunk_appendf(out, "%08x:?", f->type); + } +} + +/* Limit JSON integer values to the range [-(2**53)+1, (2**53)-1] as per + * the recommendation for interoperable integers in section 6 of RFC 7159. + */ +#define JSON_INT_MAX ((1ULL << 53) - 1) +#define JSON_INT_MIN (0 - JSON_INT_MAX) + +/* Emits a stats field value and its type in JSON. + * Returns non-zero on success, 0 on error. 
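The JSON_INT_MAX/JSON_INT_MIN bounds above follow RFC 7159's interoperability advice because many JSON consumers store numbers as IEEE-754 doubles, whose 53-bit significand cannot distinguish larger integers. A standalone illustration of the edge (not part of the build):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t max = (1ULL << 53) - 1;     /* same value as JSON_INT_MAX */

        printf("%.0f\n", (double)max);       /* 9007199254740991, still exact */
        printf("%.0f\n", (double)(max + 2)); /* 2^53+1 rounds to 9007199254740992 */
        return 0;
    }

This is why stats_emit_json_data_field() below simply refuses to emit s64/u64 values outside that range and returns 0.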
+ */ +int stats_emit_json_data_field(struct buffer *out, const struct field *f) +{ + int old_len; + char buf[20]; + const char *type, *value = buf, *quote = ""; + + switch (field_format(f, 0)) { + case FF_EMPTY: return 1; + case FF_S32: type = "\"s32\""; + snprintf(buf, sizeof(buf), "%d", f->u.s32); + break; + case FF_U32: type = "\"u32\""; + snprintf(buf, sizeof(buf), "%u", f->u.u32); + break; + case FF_S64: type = "\"s64\""; + if (f->u.s64 < JSON_INT_MIN || f->u.s64 > JSON_INT_MAX) + return 0; + snprintf(buf, sizeof(buf), "%lld", (long long)f->u.s64); + break; + case FF_U64: if (f->u.u64 > JSON_INT_MAX) + return 0; + type = "\"u64\""; + snprintf(buf, sizeof(buf), "%llu", + (unsigned long long) f->u.u64); + break; + case FF_FLT: type = "\"flt\""; + flt_trim(buf, 0, snprintf(buf, sizeof(buf), "%f", f->u.flt)); + break; + case FF_STR: type = "\"str\""; + value = field_str(f, 0); + quote = "\""; + break; + default: snprintf(buf, sizeof(buf), "%u", f->type); + type = buf; + value = "unknown"; + quote = "\""; + break; + } + + old_len = out->data; + chunk_appendf(out, ",\"value\":{\"type\":%s,\"value\":%s%s%s}", + type, quote, value, quote); + return !(old_len == out->data); +} + +/* Emits an encoding of the field type on 3 characters followed by a delimiter. + * Returns non-zero on success, 0 if the buffer is full. + */ +int stats_emit_field_tags(struct buffer *out, const struct field *f, + char delim) +{ + char origin, nature, scope; + + switch (field_origin(f, 0)) { + case FO_METRIC: origin = 'M'; break; + case FO_STATUS: origin = 'S'; break; + case FO_KEY: origin = 'K'; break; + case FO_CONFIG: origin = 'C'; break; + case FO_PRODUCT: origin = 'P'; break; + default: origin = '?'; break; + } + + switch (field_nature(f, 0)) { + case FN_GAUGE: nature = 'G'; break; + case FN_LIMIT: nature = 'L'; break; + case FN_MIN: nature = 'm'; break; + case FN_MAX: nature = 'M'; break; + case FN_RATE: nature = 'R'; break; + case FN_COUNTER: nature = 'C'; break; + case FN_DURATION: nature = 'D'; break; + case FN_AGE: nature = 'A'; break; + case FN_TIME: nature = 'T'; break; + case FN_NAME: nature = 'N'; break; + case FN_OUTPUT: nature = 'O'; break; + case FN_AVG: nature = 'a'; break; + default: nature = '?'; break; + } + + switch (field_scope(f, 0)) { + case FS_PROCESS: scope = 'P'; break; + case FS_SERVICE: scope = 'S'; break; + case FS_SYSTEM: scope = 's'; break; + case FS_CLUSTER: scope = 'C'; break; + default: scope = '?'; break; + } + + return chunk_appendf(out, "%c%c%c%c", origin, nature, scope, delim); +} + +/* Emits an encoding of the field type as JSON. + * Returns non-zero on success, 0 if the buffer is full.
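stats_emit_field_tags() above condenses the origin/nature/scope triplet into three letters, which end up between the field identifier and the typed value in "show stat typed" output. One such line, with illustrative values (position 7 for stot assumes the historical field order, and the 'MCP' tag assumes the default metric origin and process scope):

    F.2.0.7.stot.1:MCP:u64:12345

    F           object type (frontend)
    2           proxy id (iid)
    0           server/listener id (sid), 0 for the proxy itself
    7           field position, here ST_F_STOT
    stot        field name
    1           process number
    MCP         tags: origin=Metric, nature=Counter, scope=Process
    u64:12345   typed value from stats_emit_typed_data_field()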
+ */ +int stats_emit_json_field_tags(struct buffer *out, const struct field *f) +{ + const char *origin, *nature, *scope; + int old_len; + + switch (field_origin(f, 0)) { + case FO_METRIC: origin = "Metric"; break; + case FO_STATUS: origin = "Status"; break; + case FO_KEY: origin = "Key"; break; + case FO_CONFIG: origin = "Config"; break; + case FO_PRODUCT: origin = "Product"; break; + default: origin = "Unknown"; break; + } + + switch (field_nature(f, 0)) { + case FN_GAUGE: nature = "Gauge"; break; + case FN_LIMIT: nature = "Limit"; break; + case FN_MIN: nature = "Min"; break; + case FN_MAX: nature = "Max"; break; + case FN_RATE: nature = "Rate"; break; + case FN_COUNTER: nature = "Counter"; break; + case FN_DURATION: nature = "Duration"; break; + case FN_AGE: nature = "Age"; break; + case FN_TIME: nature = "Time"; break; + case FN_NAME: nature = "Name"; break; + case FN_OUTPUT: nature = "Output"; break; + case FN_AVG: nature = "Avg"; break; + default: nature = "Unknown"; break; + } + + switch (field_scope(f, 0)) { + case FS_PROCESS: scope = "Process"; break; + case FS_SERVICE: scope = "Service"; break; + case FS_SYSTEM: scope = "System"; break; + case FS_CLUSTER: scope = "Cluster"; break; + default: scope = "Unknown"; break; + } + + old_len = out->data; + chunk_appendf(out, "\"tags\":{" + "\"origin\":\"%s\"," + "\"nature\":\"%s\"," + "\"scope\":\"%s\"" + "}", origin, nature, scope); + return !(old_len == out->data); +} + +/* Dump all fields from <stats> into <out> using CSV format */ +static int stats_dump_fields_csv(struct buffer *out, + const struct field *stats, size_t stats_count, + struct show_stat_ctx *ctx) +{ + int domain = ctx->domain; + int field; + + for (field = 0; field < stats_count; ++field) { + if (!stats_emit_raw_data_field(out, &stats[field])) + return 0; + if (!chunk_strcat(out, ",")) + return 0; + + /* print special delimiter on proxy stats to mark end of + static fields */ + if (domain == STATS_DOMAIN_PROXY && field + 1 == ST_F_TOTAL_FIELDS) { + if (!chunk_strcat(out, "-,")) + return 0; + } + } + + chunk_strcat(out, "\n"); + return 1; +} + +/* Dump all fields from <stats> into <out> using a typed "field:desc:type:value" format */ +static int stats_dump_fields_typed(struct buffer *out, + const struct field *stats, + size_t stats_count, + struct show_stat_ctx * ctx) +{ + int flags = ctx->flags; + int domain = ctx->domain; + int field; + + for (field = 0; field < stats_count; ++field) { + if (!stats[field].type) + continue; + + switch (domain) { + case STATS_DOMAIN_PROXY: + chunk_appendf(out, "%c.%u.%u.%d.%s.%u:", + stats[ST_F_TYPE].u.u32 == STATS_TYPE_FE ? 'F' : + stats[ST_F_TYPE].u.u32 == STATS_TYPE_BE ? 'B' : + stats[ST_F_TYPE].u.u32 == STATS_TYPE_SO ? 'L' : + stats[ST_F_TYPE].u.u32 == STATS_TYPE_SV ? 
'S' : + '?', + stats[ST_F_IID].u.u32, stats[ST_F_SID].u.u32, + field, + stat_f[domain][field].name, + stats[ST_F_PID].u.u32); + break; + + case STATS_DOMAIN_RESOLVERS: + chunk_appendf(out, "N.%d.%s:", field, + stat_f[domain][field].name); + break; + + default: + break; + } + + if (!stats_emit_field_tags(out, &stats[field], ':')) + return 0; + if (!stats_emit_typed_data_field(out, &stats[field])) + return 0; + + if (flags & STAT_SHOW_FDESC && + !chunk_appendf(out, ":\"%s\"", stat_f[domain][field].desc)) { + return 0; + } + + if (!chunk_strcat(out, "\n")) + return 0; + } + return 1; +} + +/* Dump all fields from <stats> into <out> using the "show info json" format */ +static int stats_dump_json_info_fields(struct buffer *out, + const struct field *info, + struct show_stat_ctx *ctx) +{ + int started = (ctx->field) ? 1 : 0; + int ready_data = 0; + + if (!started && !chunk_strcat(out, "[")) + return 0; + + for (; ctx->field < INF_TOTAL_FIELDS; ctx->field++) { + int old_len; + int field = ctx->field; + + if (!field_format(info, field)) + continue; + + if (started && !chunk_strcat(out, ",")) + goto err; + started = 1; + + old_len = out->data; + chunk_appendf(out, + "{\"field\":{\"pos\":%d,\"name\":\"%s\"}," + "\"processNum\":%u,", + field, info_fields[field].name, + info[INF_PROCESS_NUM].u.u32); + if (old_len == out->data) + goto err; + + if (!stats_emit_json_field_tags(out, &info[field])) + goto err; + + if (!stats_emit_json_data_field(out, &info[field])) + goto err; + + if (!chunk_strcat(out, "}")) + goto err; + ready_data = out->data; + } + + if (!chunk_strcat(out, "]\n")) + goto err; + ctx->field = 0; /* we're done */ + return 1; + +err: + if (!ready_data) { + /* not enough buffer space for a single entry... */ + chunk_reset(out); + chunk_appendf(out, "{\"errorStr\":\"output buffer too short\"}\n"); + return 0; /* hard error */ + } + /* push ready data and wait for a new buffer to complete the dump */ + out->data = ready_data; + return 1; +} + +static void stats_print_proxy_field_json(struct buffer *out, + const struct field *stat, + const char *name, + int pos, + uint32_t field_type, + uint32_t iid, + uint32_t sid, + uint32_t pid) +{ + const char *obj_type; + switch (field_type) { + case STATS_TYPE_FE: obj_type = "Frontend"; break; + case STATS_TYPE_BE: obj_type = "Backend"; break; + case STATS_TYPE_SO: obj_type = "Listener"; break; + case STATS_TYPE_SV: obj_type = "Server"; break; + default: obj_type = "Unknown"; break; + } + + chunk_appendf(out, + "{" + "\"objType\":\"%s\"," + "\"proxyId\":%u," + "\"id\":%u," + "\"field\":{\"pos\":%d,\"name\":\"%s\"}," + "\"processNum\":%u,", + obj_type, iid, sid, pos, name, pid); +} + +static void stats_print_rslv_field_json(struct buffer *out, + const struct field *stat, + const char *name, + int pos) +{ + chunk_appendf(out, + "{" + "\"field\":{\"pos\":%d,\"name\":\"%s\"},", + pos, name); +} + + +/* Dump all fields from <stats> into <out> using the JSON format */ +static int stats_dump_fields_json(struct buffer *out, + const struct field *stats, size_t stats_count, + struct show_stat_ctx *ctx) +{ + int flags = ctx->flags; + int domain = ctx->domain; + int started = (ctx->field) ?
1 : 0; + int ready_data = 0; + + if (!started && (flags & STAT_STARTED) && !chunk_strcat(out, ",")) + return 0; + if (!started && !chunk_strcat(out, "[")) + return 0; + + for (; ctx->field < stats_count; ctx->field++) { + int old_len; + int field = ctx->field; + + if (!stats[field].type) + continue; + + if (started && !chunk_strcat(out, ",")) + goto err; + started = 1; + + old_len = out->data; + if (domain == STATS_DOMAIN_PROXY) { + stats_print_proxy_field_json(out, &stats[field], + stat_f[domain][field].name, + field, + stats[ST_F_TYPE].u.u32, + stats[ST_F_IID].u.u32, + stats[ST_F_SID].u.u32, + stats[ST_F_PID].u.u32); + } else if (domain == STATS_DOMAIN_RESOLVERS) { + stats_print_rslv_field_json(out, &stats[field], + stat_f[domain][field].name, + field); + } + + if (old_len == out->data) + goto err; + + if (!stats_emit_json_field_tags(out, &stats[field])) + goto err; + + if (!stats_emit_json_data_field(out, &stats[field])) + goto err; + + if (!chunk_strcat(out, "}")) + goto err; + ready_data = out->data; + } + + if (!chunk_strcat(out, "]")) + goto err; + + ctx->field = 0; /* we're done */ + return 1; + +err: + if (!ready_data) { + /* not enough buffer space for a single entry... */ + chunk_reset(out); + if (ctx->flags & STAT_STARTED) + chunk_strcat(out, ","); + chunk_appendf(out, "{\"errorStr\":\"output buffer too short\"}"); + return 0; /* hard error */ + } + /* push ready data and wait for a new buffer to complete the dump */ + out->data = ready_data; + return 1; +} + +/* Dump all fields from <stats> into <out> using the HTML format. A column is + * reserved for the checkbox if STAT_ADMIN is set in <flags>. Some extra info + * is provided if STAT_SHLGNDS is present in <flags>. The statistics from + * extra modules are displayed at the end of the lines if STAT_SHMODULES is + * present in <flags>.
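The STAT_* flags tested throughout the HTML dump below combine freely; a hypothetical caller-side setup, reduced to the fields the dump actually reads, could look like this:

    /* Hypothetical: request the HTML rendering with admin checkboxes,
     * legend tooltips and per-module columns. These are the same flags
     * stats_dump_fields_html() tests below.
     */
    struct show_stat_ctx ctx = { 0 };

    ctx.flags |= STAT_FMT_HTML;   /* HTML rather than CSV/typed/JSON */
    ctx.flags |= STAT_ADMIN;      /* reserve the leading checkbox column */
    ctx.flags |= STAT_SHLGNDS;    /* show address/cookie/algo tooltips */
    ctx.flags |= STAT_SHMODULES;  /* append stats-module columns */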
+ */ +static int stats_dump_fields_html(struct buffer *out, + const struct field *stats, + struct show_stat_ctx *ctx) +{ + struct buffer src; + struct stats_module *mod; + int flags = ctx->flags; + int i = 0, j = 0; + + if (stats[ST_F_TYPE].u.u32 == STATS_TYPE_FE) { + chunk_appendf(out, + /* name, queue */ + "<tr class=\"frontend\">"); + + if (flags & STAT_ADMIN) { + /* Column sub-heading for Enable or Disable server */ + chunk_appendf(out, "<td></td>"); + } + + chunk_appendf(out, + "<td class=ac>" + "<a name=\"%s/Frontend\"></a>" + "<a class=lfsb href=\"#%s/Frontend\">Frontend</a></td>" + "<td colspan=3></td>" + "", + field_str(stats, ST_F_PXNAME), field_str(stats, ST_F_PXNAME)); + + chunk_appendf(out, + /* sessions rate : current */ + "<td><u>%s<div class=tips><table class=det>" + "<tr><th>Current connection rate:</th><td>%s/s</td></tr>" + "<tr><th>Current session rate:</th><td>%s/s</td></tr>" + "", + U2H(stats[ST_F_RATE].u.u32), + U2H(stats[ST_F_CONN_RATE].u.u32), + U2H(stats[ST_F_RATE].u.u32)); + + if (strcmp(field_str(stats, ST_F_MODE), "http") == 0) + chunk_appendf(out, + "<tr><th>Current request rate:</th><td>%s/s</td></tr>", + U2H(stats[ST_F_REQ_RATE].u.u32)); + + chunk_appendf(out, + "</table></div></u></td>" + /* sessions rate : max */ + "<td><u>%s<div class=tips><table class=det>" + "<tr><th>Max connection rate:</th><td>%s/s</td></tr>" + "<tr><th>Max session rate:</th><td>%s/s</td></tr>" + "", + U2H(stats[ST_F_RATE_MAX].u.u32), + U2H(stats[ST_F_CONN_RATE_MAX].u.u32), + U2H(stats[ST_F_RATE_MAX].u.u32)); + + if (strcmp(field_str(stats, ST_F_MODE), "http") == 0) + chunk_appendf(out, + "<tr><th>Max request rate:</th><td>%s/s</td></tr>", + U2H(stats[ST_F_REQ_RATE_MAX].u.u32)); + + chunk_appendf(out, + "</table></div></u></td>" + /* sessions rate : limit */ + "<td>%s</td>", + LIM2A(stats[ST_F_RATE_LIM].u.u32, "-")); + + chunk_appendf(out, + /* sessions: current, max, limit, total */ + "<td>%s</td><td>%s</td><td>%s</td>" + "<td><u>%s<div class=tips><table class=det>" + "<tr><th>Cum. connections:</th><td>%s</td></tr>" + "<tr><th>Cum. sessions:</th><td>%s</td></tr>" + "", + U2H(stats[ST_F_SCUR].u.u32), U2H(stats[ST_F_SMAX].u.u32), U2H(stats[ST_F_SLIM].u.u32), + U2H(stats[ST_F_STOT].u.u64), + U2H(stats[ST_F_CONN_TOT].u.u64), + U2H(stats[ST_F_STOT].u.u64)); + + /* http response (via hover): 1xx, 2xx, 3xx, 4xx, 5xx, other */ + if (strcmp(field_str(stats, ST_F_MODE), "http") == 0) { + chunk_appendf(out, + "<tr><th>- HTTP/1 sessions:</th><td>%s</td></tr>" + "<tr><th>- HTTP/2 sessions:</th><td>%s</td></tr>" + "<tr><th>- HTTP/3 sessions:</th><td>%s</td></tr>" + "<tr><th>- other sessions:</th><td>%s</td></tr>" + "<tr><th>Cum. 
HTTP requests:</th><td>%s</td></tr>" + "<tr><th>- HTTP/1 requests:</th><td>%s</td></tr>" + "<tr><th>- HTTP/2 requests:</th><td>%s</td></tr>" + "<tr><th>- HTTP/3 requests:</th><td>%s</td></tr>" + "<tr><th>- other requests:</th><td>%s</td></tr>" + "", + U2H(stats[ST_F_H1SESS].u.u64), + U2H(stats[ST_F_H2SESS].u.u64), + U2H(stats[ST_F_H3SESS].u.u64), + U2H(stats[ST_F_SESS_OTHER].u.u64), + U2H(stats[ST_F_REQ_TOT].u.u64), + U2H(stats[ST_F_H1REQ].u.u64), + U2H(stats[ST_F_H2REQ].u.u64), + U2H(stats[ST_F_H3REQ].u.u64), + U2H(stats[ST_F_REQ_OTHER].u.u64)); + + chunk_appendf(out, + "<tr><th>- HTTP 1xx responses:</th><td>%s</td></tr>" + "<tr><th>- HTTP 2xx responses:</th><td>%s</td></tr>" + "<tr><th> Compressed 2xx:</th><td>%s</td><td>(%d%%)</td></tr>" + "<tr><th>- HTTP 3xx responses:</th><td>%s</td></tr>" + "<tr><th>- HTTP 4xx responses:</th><td>%s</td></tr>" + "<tr><th>- HTTP 5xx responses:</th><td>%s</td></tr>" + "<tr><th>- other responses:</th><td>%s</td></tr>" + "", + U2H(stats[ST_F_HRSP_1XX].u.u64), + U2H(stats[ST_F_HRSP_2XX].u.u64), + U2H(stats[ST_F_COMP_RSP].u.u64), + stats[ST_F_HRSP_2XX].u.u64 ? + (int)(100 * stats[ST_F_COMP_RSP].u.u64 / stats[ST_F_HRSP_2XX].u.u64) : 0, + U2H(stats[ST_F_HRSP_3XX].u.u64), + U2H(stats[ST_F_HRSP_4XX].u.u64), + U2H(stats[ST_F_HRSP_5XX].u.u64), + U2H(stats[ST_F_HRSP_OTHER].u.u64)); + + chunk_appendf(out, + "<tr><th>Intercepted requests:</th><td>%s</td></tr>" + "<tr><th>Cache lookups:</th><td>%s</td></tr>" + "<tr><th>Cache hits:</th><td>%s</td><td>(%d%%)</td></tr>" + "<tr><th>Failed hdr rewrites:</th><td>%s</td></tr>" + "<tr><th>Internal errors:</th><td>%s</td></tr>" + "", + U2H(stats[ST_F_INTERCEPTED].u.u64), + U2H(stats[ST_F_CACHE_LOOKUPS].u.u64), + U2H(stats[ST_F_CACHE_HITS].u.u64), + stats[ST_F_CACHE_LOOKUPS].u.u64 ? + (int)(100 * stats[ST_F_CACHE_HITS].u.u64 / stats[ST_F_CACHE_LOOKUPS].u.u64) : 0, + U2H(stats[ST_F_WREW].u.u64), + U2H(stats[ST_F_EINT].u.u64)); + } + + chunk_appendf(out, + "</table></div></u></td>" + /* sessions: lbtot, lastsess */ + "<td></td><td></td>" + /* bytes : in */ + "<td>%s</td>" + "", + U2H(stats[ST_F_BIN].u.u64)); + + chunk_appendf(out, + /* bytes:out + compression stats (via hover): comp_in, comp_out, comp_byp */ + "<td>%s%s<div class=tips><table class=det>" + "<tr><th>Response bytes in:</th><td>%s</td></tr>" + "<tr><th>Compression in:</th><td>%s</td></tr>" + "<tr><th>Compression out:</th><td>%s</td><td>(%d%%)</td></tr>" + "<tr><th>Compression bypass:</th><td>%s</td></tr>" + "<tr><th>Total bytes saved:</th><td>%s</td><td>(%d%%)</td></tr>" + "</table></div>%s</td>", + (stats[ST_F_COMP_IN].u.u64 || stats[ST_F_COMP_BYP].u.u64) ? "<u>":"", + U2H(stats[ST_F_BOUT].u.u64), + U2H(stats[ST_F_BOUT].u.u64), + U2H(stats[ST_F_COMP_IN].u.u64), + U2H(stats[ST_F_COMP_OUT].u.u64), + stats[ST_F_COMP_IN].u.u64 ? (int)(stats[ST_F_COMP_OUT].u.u64 * 100 / stats[ST_F_COMP_IN].u.u64) : 0, + U2H(stats[ST_F_COMP_BYP].u.u64), + U2H(stats[ST_F_COMP_IN].u.u64 - stats[ST_F_COMP_OUT].u.u64), + stats[ST_F_BOUT].u.u64 ? (int)((stats[ST_F_COMP_IN].u.u64 - stats[ST_F_COMP_OUT].u.u64) * 100 / stats[ST_F_BOUT].u.u64) : 0, + (stats[ST_F_COMP_IN].u.u64 || stats[ST_F_COMP_BYP].u.u64) ? 
"</u>":""); + + chunk_appendf(out, + /* denied: req, resp */ + "<td>%s</td><td>%s</td>" + /* errors : request, connect, response */ + "<td>%s</td><td></td><td></td>" + /* warnings: retries, redispatches */ + "<td></td><td></td>" + /* server status : reflect frontend status */ + "<td class=ac>%s</td>" + /* rest of server: nothing */ + "<td class=ac colspan=8></td>" + "", + U2H(stats[ST_F_DREQ].u.u64), U2H(stats[ST_F_DRESP].u.u64), + U2H(stats[ST_F_EREQ].u.u64), + field_str(stats, ST_F_STATUS)); + + if (flags & STAT_SHMODULES) { + list_for_each_entry(mod, &stats_module_list[STATS_DOMAIN_PROXY], list) { + chunk_appendf(out, "<td>"); + + if (stats_px_get_cap(mod->domain_flags) & STATS_PX_CAP_FE) { + chunk_appendf(out, + "<u>%s<div class=tips><table class=det>", + mod->name); + for (j = 0; j < mod->stats_count; ++j) { + chunk_appendf(out, + "<tr><th>%s</th><td>%s</td></tr>", + mod->stats[j].desc, field_to_html_str(&stats[ST_F_TOTAL_FIELDS + i])); + ++i; + } + chunk_appendf(out, "</table></div></u>"); + } else { + i += mod->stats_count; + } + + chunk_appendf(out, "</td>"); + } + } + + chunk_appendf(out, "</tr>"); + } + else if (stats[ST_F_TYPE].u.u32 == STATS_TYPE_SO) { + chunk_appendf(out, "<tr class=socket>"); + if (flags & STAT_ADMIN) { + /* Column sub-heading for Enable or Disable server */ + chunk_appendf(out, "<td></td>"); + } + + chunk_appendf(out, + /* frontend name, listener name */ + "<td class=ac><a name=\"%s/+%s\"></a>%s" + "<a class=lfsb href=\"#%s/+%s\">%s</a>" + "", + field_str(stats, ST_F_PXNAME), field_str(stats, ST_F_SVNAME), + (flags & STAT_SHLGNDS)?"<u>":"", + field_str(stats, ST_F_PXNAME), field_str(stats, ST_F_SVNAME), field_str(stats, ST_F_SVNAME)); + + if (flags & STAT_SHLGNDS) { + chunk_appendf(out, "<div class=tips>"); + + if (isdigit((unsigned char)*field_str(stats, ST_F_ADDR))) + chunk_appendf(out, "IPv4: %s, ", field_str(stats, ST_F_ADDR)); + else if (*field_str(stats, ST_F_ADDR) == '[') + chunk_appendf(out, "IPv6: %s, ", field_str(stats, ST_F_ADDR)); + else if (*field_str(stats, ST_F_ADDR)) + chunk_appendf(out, "%s, ", field_str(stats, ST_F_ADDR)); + + chunk_appendf(out, "proto=%s, ", field_str(stats, ST_F_PROTO)); + + /* id */ + chunk_appendf(out, "id: %d</div>", stats[ST_F_SID].u.u32); + } + + chunk_appendf(out, + /* queue */ + "%s</td><td colspan=3></td>" + /* sessions rate: current, max, limit */ + "<td colspan=3> </td>" + /* sessions: current, max, limit, total, lbtot, lastsess */ + "<td>%s</td><td>%s</td><td>%s</td>" + "<td>%s</td><td> </td><td> </td>" + /* bytes: in, out */ + "<td>%s</td><td>%s</td>" + "", + (flags & STAT_SHLGNDS)?"</u>":"", + U2H(stats[ST_F_SCUR].u.u32), U2H(stats[ST_F_SMAX].u.u32), U2H(stats[ST_F_SLIM].u.u32), + U2H(stats[ST_F_STOT].u.u64), U2H(stats[ST_F_BIN].u.u64), U2H(stats[ST_F_BOUT].u.u64)); + + chunk_appendf(out, + /* denied: req, resp */ + "<td>%s</td><td>%s</td>" + /* errors: request, connect, response */ + "<td>%s</td><td></td><td></td>" + /* warnings: retries, redispatches */ + "<td></td><td></td>" + /* server status: reflect listener status */ + "<td class=ac>%s</td>" + /* rest of server: nothing */ + "<td class=ac colspan=8></td>" + "", + U2H(stats[ST_F_DREQ].u.u64), U2H(stats[ST_F_DRESP].u.u64), + U2H(stats[ST_F_EREQ].u.u64), + field_str(stats, ST_F_STATUS)); + + if (flags & STAT_SHMODULES) { + list_for_each_entry(mod, &stats_module_list[STATS_DOMAIN_PROXY], list) { + chunk_appendf(out, "<td>"); + + if (stats_px_get_cap(mod->domain_flags) & STATS_PX_CAP_LI) { + chunk_appendf(out, + "<u>%s<div class=tips><table class=det>", + 
mod->name); + for (j = 0; j < mod->stats_count; ++j) { + chunk_appendf(out, + "<tr><th>%s</th><td>%s</td></tr>", + mod->stats[j].desc, field_to_html_str(&stats[ST_F_TOTAL_FIELDS + i])); + ++i; + } + chunk_appendf(out, "</table></div></u>"); + } else { + i += mod->stats_count; + } + + chunk_appendf(out, "</td>"); + } + } + + chunk_appendf(out, "</tr>"); + } + else if (stats[ST_F_TYPE].u.u32 == STATS_TYPE_SV) { + const char *style; + + /* determine the style to use depending on the server's state, + * its health and weight. There isn't a 1-to-1 mapping between + * state and styles for the cases where the server is (still) + * up. The reason is that we don't want to report nolb and + * drain with the same color. + */ + + if (strcmp(field_str(stats, ST_F_STATUS), "DOWN") == 0 || + strcmp(field_str(stats, ST_F_STATUS), "DOWN (agent)") == 0) { + style = "down"; + } + else if (strncmp(field_str(stats, ST_F_STATUS), "DOWN ", strlen("DOWN ")) == 0) { + style = "going_up"; + } + else if (strcmp(field_str(stats, ST_F_STATUS), "DRAIN") == 0) { + style = "draining"; + } + else if (strncmp(field_str(stats, ST_F_STATUS), "NOLB ", strlen("NOLB ")) == 0) { + style = "going_down"; + } + else if (strcmp(field_str(stats, ST_F_STATUS), "NOLB") == 0) { + style = "nolb"; + } + else if (strcmp(field_str(stats, ST_F_STATUS), "no check") == 0) { + style = "no_check"; + } + else if (!stats[ST_F_CHKFAIL].type || + stats[ST_F_CHECK_HEALTH].u.u32 == stats[ST_F_CHECK_RISE].u.u32 + stats[ST_F_CHECK_FALL].u.u32 - 1) { + /* no check or max health = UP */ + if (stats[ST_F_WEIGHT].u.u32) + style = "up"; + else + style = "draining"; + } + else { + style = "going_down"; + } + + if (strncmp(field_str(stats, ST_F_STATUS), "MAINT", 5) == 0) + chunk_appendf(out, "<tr class=\"maintain\">"); + else + chunk_appendf(out, + "<tr class=\"%s_%s\">", + (stats[ST_F_BCK].u.u32) ? "backup" : "active", style); + + + if (flags & STAT_ADMIN) + chunk_appendf(out, + "<td><input class='%s-checkbox' type=\"checkbox\" name=\"s\" value=\"%s\"></td>", + field_str(stats, ST_F_PXNAME), + field_str(stats, ST_F_SVNAME)); + + chunk_appendf(out, + "<td class=ac><a name=\"%s/%s\"></a>%s" + "<a class=lfsb href=\"#%s/%s\">%s</a>" + "", + field_str(stats, ST_F_PXNAME), field_str(stats, ST_F_SVNAME), + (flags & STAT_SHLGNDS) ? "<u>" : "", + field_str(stats, ST_F_PXNAME), field_str(stats, ST_F_SVNAME), field_str(stats, ST_F_SVNAME)); + + if (flags & STAT_SHLGNDS) { + chunk_appendf(out, "<div class=tips>"); + + if (isdigit((unsigned char)*field_str(stats, ST_F_ADDR))) + chunk_appendf(out, "IPv4: %s, ", field_str(stats, ST_F_ADDR)); + else if (*field_str(stats, ST_F_ADDR) == '[') + chunk_appendf(out, "IPv6: %s, ", field_str(stats, ST_F_ADDR)); + else if (*field_str(stats, ST_F_ADDR)) + chunk_appendf(out, "%s, ", field_str(stats, ST_F_ADDR)); + + /* id */ + chunk_appendf(out, "id: %d, rid: %d", stats[ST_F_SID].u.u32, stats[ST_F_SRID].u.u32); + + /* cookie */ + if (stats[ST_F_COOKIE].type) { + chunk_appendf(out, ", cookie: '"); + chunk_initstr(&src, field_str(stats, ST_F_COOKIE)); + chunk_htmlencode(out, &src); + chunk_appendf(out, "'"); + } + + chunk_appendf(out, "</div>"); + } + + chunk_appendf(out, + /* queue : current, max, limit */ + "%s</td><td>%s</td><td>%s</td><td>%s</td>" + /* sessions rate : current, max, limit */ + "<td>%s</td><td>%s</td><td></td>" + "", + (flags & STAT_SHLGNDS) ? 
"</u>" : "", + U2H(stats[ST_F_QCUR].u.u32), U2H(stats[ST_F_QMAX].u.u32), LIM2A(stats[ST_F_QLIMIT].u.u32, "-"), + U2H(stats[ST_F_RATE].u.u32), U2H(stats[ST_F_RATE_MAX].u.u32)); + + chunk_appendf(out, + /* sessions: current, max, limit, total */ + "<td><u>%s<div class=tips>" + "<table class=det>" + "<tr><th>Current active connections:</th><td>%s</td></tr>" + "<tr><th>Current used connections:</th><td>%s</td></tr>" + "<tr><th>Current idle connections:</th><td>%s</td></tr>" + "<tr><th>- unsafe:</th><td>%s</td></tr>" + "<tr><th>- safe:</th><td>%s</td></tr>" + "<tr><th>Estimated need of connections:</th><td>%s</td></tr>" + "<tr><th>Active connections limit:</th><td>%s</td></tr>" + "<tr><th>Idle connections limit:</th><td>%s</td></tr>" + "</table></div></u>" + "</td><td>%s</td><td>%s</td>" + "<td><u>%s<div class=tips><table class=det>" + "<tr><th>Cum. sessions:</th><td>%s</td></tr>" + "", + U2H(stats[ST_F_SCUR].u.u32), + U2H(stats[ST_F_SCUR].u.u32), + U2H(stats[ST_F_USED_CONN_CUR].u.u32), + U2H(stats[ST_F_SRV_ICUR].u.u32), + U2H(stats[ST_F_IDLE_CONN_CUR].u.u32), + U2H(stats[ST_F_SAFE_CONN_CUR].u.u32), + U2H(stats[ST_F_NEED_CONN_EST].u.u32), + + LIM2A(stats[ST_F_SLIM].u.u32, "-"), + stats[ST_F_SRV_ILIM].type ? U2H(stats[ST_F_SRV_ILIM].u.u32) : "-", + U2H(stats[ST_F_SMAX].u.u32), LIM2A(stats[ST_F_SLIM].u.u32, "-"), + U2H(stats[ST_F_STOT].u.u64), + U2H(stats[ST_F_STOT].u.u64)); + + /* http response (via hover): 1xx, 2xx, 3xx, 4xx, 5xx, other */ + if (strcmp(field_str(stats, ST_F_MODE), "http") == 0) { + chunk_appendf(out, + "<tr><th>New connections:</th><td>%s</td></tr>" + "<tr><th>Reused connections:</th><td>%s</td><td>(%d%%)</td></tr>" + "<tr><th>Cum. HTTP requests:</th><td>%s</td></tr>" + "<tr><th>- HTTP 1xx responses:</th><td>%s</td><td>(%d%%)</td></tr>" + "<tr><th>- HTTP 2xx responses:</th><td>%s</td><td>(%d%%)</td></tr>" + "<tr><th>- HTTP 3xx responses:</th><td>%s</td><td>(%d%%)</td></tr>" + "<tr><th>- HTTP 4xx responses:</th><td>%s</td><td>(%d%%)</td></tr>" + "<tr><th>- HTTP 5xx responses:</th><td>%s</td><td>(%d%%)</td></tr>" + "<tr><th>- other responses:</th><td>%s</td><td>(%d%%)</td></tr>" + "<tr><th>Failed hdr rewrites:</th><td>%s</td></tr>" + "<tr><th>Internal error:</th><td>%s</td></tr>" + "", + U2H(stats[ST_F_CONNECT].u.u64), + U2H(stats[ST_F_REUSE].u.u64), + (stats[ST_F_CONNECT].u.u64 + stats[ST_F_REUSE].u.u64) ? + (int)(100 * stats[ST_F_REUSE].u.u64 / (stats[ST_F_CONNECT].u.u64 + stats[ST_F_REUSE].u.u64)) : 0, + U2H(stats[ST_F_REQ_TOT].u.u64), + U2H(stats[ST_F_HRSP_1XX].u.u64), stats[ST_F_REQ_TOT].u.u64 ? + (int)(100 * stats[ST_F_HRSP_1XX].u.u64 / stats[ST_F_REQ_TOT].u.u64) : 0, + U2H(stats[ST_F_HRSP_2XX].u.u64), stats[ST_F_REQ_TOT].u.u64 ? + (int)(100 * stats[ST_F_HRSP_2XX].u.u64 / stats[ST_F_REQ_TOT].u.u64) : 0, + U2H(stats[ST_F_HRSP_3XX].u.u64), stats[ST_F_REQ_TOT].u.u64 ? + (int)(100 * stats[ST_F_HRSP_3XX].u.u64 / stats[ST_F_REQ_TOT].u.u64) : 0, + U2H(stats[ST_F_HRSP_4XX].u.u64), stats[ST_F_REQ_TOT].u.u64 ? + (int)(100 * stats[ST_F_HRSP_4XX].u.u64 / stats[ST_F_REQ_TOT].u.u64) : 0, + U2H(stats[ST_F_HRSP_5XX].u.u64), stats[ST_F_REQ_TOT].u.u64 ? + (int)(100 * stats[ST_F_HRSP_5XX].u.u64 / stats[ST_F_REQ_TOT].u.u64) : 0, + U2H(stats[ST_F_HRSP_OTHER].u.u64), stats[ST_F_REQ_TOT].u.u64 ? + (int)(100 * stats[ST_F_HRSP_OTHER].u.u64 / stats[ST_F_REQ_TOT].u.u64) : 0, + U2H(stats[ST_F_WREW].u.u64), + U2H(stats[ST_F_EINT].u.u64)); + } + + chunk_appendf(out, "<tr><th colspan=3>Max / Avg over last 1024 success. 
conn.</th></tr>"); + chunk_appendf(out, "<tr><th>- Queue time:</th><td>%s / %s</td><td>ms</td></tr>", + U2H(stats[ST_F_QT_MAX].u.u32), U2H(stats[ST_F_QTIME].u.u32)); + chunk_appendf(out, "<tr><th>- Connect time:</th><td>%s / %s</td><td>ms</td></tr>", + U2H(stats[ST_F_CT_MAX].u.u32), U2H(stats[ST_F_CTIME].u.u32)); + if (strcmp(field_str(stats, ST_F_MODE), "http") == 0) + chunk_appendf(out, "<tr><th>- Responses time:</th><td>%s / %s</td><td>ms</td></tr>", + U2H(stats[ST_F_RT_MAX].u.u32), U2H(stats[ST_F_RTIME].u.u32)); + chunk_appendf(out, "<tr><th>- Total time:</th><td>%s / %s</td><td>ms</td></tr>", + U2H(stats[ST_F_TT_MAX].u.u32), U2H(stats[ST_F_TTIME].u.u32)); + + chunk_appendf(out, + "</table></div></u></td>" + /* sessions: lbtot, last */ + "<td>%s</td><td>%s</td>", + U2H(stats[ST_F_LBTOT].u.u64), + human_time(stats[ST_F_LASTSESS].u.s32, 1)); + + chunk_appendf(out, + /* bytes : in, out */ + "<td>%s</td><td>%s</td>" + /* denied: req, resp */ + "<td></td><td>%s</td>" + /* errors : request, connect */ + "<td></td><td>%s</td>" + /* errors : response */ + "<td><u>%s<div class=tips>Connection resets during transfers: %lld client, %lld server</div></u></td>" + /* warnings: retries, redispatches */ + "<td>%lld</td><td>%lld</td>" + "", + U2H(stats[ST_F_BIN].u.u64), U2H(stats[ST_F_BOUT].u.u64), + U2H(stats[ST_F_DRESP].u.u64), + U2H(stats[ST_F_ECON].u.u64), + U2H(stats[ST_F_ERESP].u.u64), + (long long)stats[ST_F_CLI_ABRT].u.u64, + (long long)stats[ST_F_SRV_ABRT].u.u64, + (long long)stats[ST_F_WRETR].u.u64, + (long long)stats[ST_F_WREDIS].u.u64); + + /* status, last change */ + chunk_appendf(out, "<td class=ac>"); + + /* FIXME!!!! + * LASTCHG should contain the last change for *this* server and must be computed + * properly above, as was done below, ie: this server if maint, otherwise ref server + * if tracking. Note that ref is either local or remote depending on tracking. 
+ */ + + + if (strncmp(field_str(stats, ST_F_STATUS), "MAINT", 5) == 0) { + chunk_appendf(out, "%s MAINT", human_time(stats[ST_F_LASTCHG].u.u32, 1)); + } + else if (strcmp(field_str(stats, ST_F_STATUS), "no check") == 0) { + chunk_strcat(out, "<i>no check</i>"); + } + else { + chunk_appendf(out, "%s %s", human_time(stats[ST_F_LASTCHG].u.u32, 1), field_str(stats, ST_F_STATUS)); + if (strncmp(field_str(stats, ST_F_STATUS), "DOWN", 4) == 0) { + if (stats[ST_F_CHECK_HEALTH].u.u32) + chunk_strcat(out, " ↑"); + } + else if (stats[ST_F_CHECK_HEALTH].u.u32 < stats[ST_F_CHECK_RISE].u.u32 + stats[ST_F_CHECK_FALL].u.u32 - 1) + chunk_strcat(out, " ↓"); + } + + if (strncmp(field_str(stats, ST_F_STATUS), "DOWN", 4) == 0 && + stats[ST_F_AGENT_STATUS].type && !stats[ST_F_AGENT_HEALTH].u.u32) { + chunk_appendf(out, + "</td><td class=ac><u> %s", + field_str(stats, ST_F_AGENT_STATUS)); + + if (stats[ST_F_AGENT_CODE].type) + chunk_appendf(out, "/%d", stats[ST_F_AGENT_CODE].u.u32); + + if (stats[ST_F_AGENT_DURATION].type) + chunk_appendf(out, " in %lums", (long)stats[ST_F_AGENT_DURATION].u.u64); + + chunk_appendf(out, "<div class=tips>%s", field_str(stats, ST_F_AGENT_DESC)); + + if (*field_str(stats, ST_F_LAST_AGT)) { + chunk_appendf(out, ": "); + chunk_initstr(&src, field_str(stats, ST_F_LAST_AGT)); + chunk_htmlencode(out, &src); + } + chunk_appendf(out, "</div></u>"); + } + else if (stats[ST_F_CHECK_STATUS].type) { + chunk_appendf(out, + "</td><td class=ac><u> %s", + field_str(stats, ST_F_CHECK_STATUS)); + + if (stats[ST_F_CHECK_CODE].type) + chunk_appendf(out, "/%d", stats[ST_F_CHECK_CODE].u.u32); + + if (stats[ST_F_CHECK_DURATION].type) + chunk_appendf(out, " in %lums", (long)stats[ST_F_CHECK_DURATION].u.u64); + + chunk_appendf(out, "<div class=tips>%s", field_str(stats, ST_F_CHECK_DESC)); + + if (*field_str(stats, ST_F_LAST_CHK)) { + chunk_appendf(out, ": "); + chunk_initstr(&src, field_str(stats, ST_F_LAST_CHK)); + chunk_htmlencode(out, &src); + } + chunk_appendf(out, "</div></u>"); + } + else + chunk_appendf(out, "</td><td>"); + + chunk_appendf(out, + /* weight / uweight */ + "</td><td class=ac>%d/%d</td>" + /* act, bck */ + "<td class=ac>%s</td><td class=ac>%s</td>" + "", + stats[ST_F_WEIGHT].u.u32, stats[ST_F_UWEIGHT].u.u32, + stats[ST_F_BCK].u.u32 ? "-" : "Y", + stats[ST_F_BCK].u.u32 ? "Y" : "-"); + + /* check failures: unique, fatal, down time */ + if (strcmp(field_str(stats, ST_F_STATUS), "MAINT (resolution)") == 0) { + chunk_appendf(out, "<td class=ac colspan=3>resolution</td>"); + } + else if (stats[ST_F_CHKFAIL].type) { + chunk_appendf(out, "<td><u>%lld", (long long)stats[ST_F_CHKFAIL].u.u64); + + if (stats[ST_F_HANAFAIL].type) + chunk_appendf(out, "/%lld", (long long)stats[ST_F_HANAFAIL].u.u64); + + chunk_appendf(out, + "<div class=tips>Failed Health Checks%s</div></u></td>" + "<td>%lld</td><td>%s</td>" + "", + stats[ST_F_HANAFAIL].type ? 
"/Health Analyses" : "", + (long long)stats[ST_F_CHKDOWN].u.u64, human_time(stats[ST_F_DOWNTIME].u.u32, 1)); + } + else if (strcmp(field_str(stats, ST_F_STATUS), "MAINT") != 0 && field_format(stats, ST_F_TRACKED) == FF_STR) { + /* tracking a server (hence inherited maint would appear as "MAINT (via...)" */ + chunk_appendf(out, + "<td class=ac colspan=3><a class=lfsb href=\"#%s\">via %s</a></td>", + field_str(stats, ST_F_TRACKED), field_str(stats, ST_F_TRACKED)); + } + else + chunk_appendf(out, "<td colspan=3></td>"); + + /* throttle */ + if (stats[ST_F_THROTTLE].type) + chunk_appendf(out, "<td class=ac>%d %%</td>\n", stats[ST_F_THROTTLE].u.u32); + else + chunk_appendf(out, "<td class=ac>-</td>"); + + if (flags & STAT_SHMODULES) { + list_for_each_entry(mod, &stats_module_list[STATS_DOMAIN_PROXY], list) { + chunk_appendf(out, "<td>"); + + if (stats_px_get_cap(mod->domain_flags) & STATS_PX_CAP_SRV) { + chunk_appendf(out, + "<u>%s<div class=tips><table class=det>", + mod->name); + for (j = 0; j < mod->stats_count; ++j) { + chunk_appendf(out, + "<tr><th>%s</th><td>%s</td></tr>", + mod->stats[j].desc, field_to_html_str(&stats[ST_F_TOTAL_FIELDS + i])); + ++i; + } + chunk_appendf(out, "</table></div></u>"); + } else { + i += mod->stats_count; + } + + chunk_appendf(out, "</td>"); + } + } + + chunk_appendf(out, "</tr>\n"); + } + else if (stats[ST_F_TYPE].u.u32 == STATS_TYPE_BE) { + chunk_appendf(out, "<tr class=\"backend\">"); + if (flags & STAT_ADMIN) { + /* Column sub-heading for Enable or Disable server */ + chunk_appendf(out, "<td></td>"); + } + chunk_appendf(out, + "<td class=ac>" + /* name */ + "%s<a name=\"%s/Backend\"></a>" + "<a class=lfsb href=\"#%s/Backend\">Backend</a>" + "", + (flags & STAT_SHLGNDS)?"<u>":"", + field_str(stats, ST_F_PXNAME), field_str(stats, ST_F_PXNAME)); + + if (flags & STAT_SHLGNDS) { + /* balancing */ + chunk_appendf(out, "<div class=tips>balancing: %s", + field_str(stats, ST_F_ALGO)); + + /* cookie */ + if (stats[ST_F_COOKIE].type) { + chunk_appendf(out, ", cookie: '"); + chunk_initstr(&src, field_str(stats, ST_F_COOKIE)); + chunk_htmlencode(out, &src); + chunk_appendf(out, "'"); + } + chunk_appendf(out, "</div>"); + } + + chunk_appendf(out, + "%s</td>" + /* queue : current, max */ + "<td>%s</td><td>%s</td><td></td>" + /* sessions rate : current, max, limit */ + "<td>%s</td><td>%s</td><td></td>" + "", + (flags & STAT_SHLGNDS)?"</u>":"", + U2H(stats[ST_F_QCUR].u.u32), U2H(stats[ST_F_QMAX].u.u32), + U2H(stats[ST_F_RATE].u.u32), U2H(stats[ST_F_RATE_MAX].u.u32)); + + chunk_appendf(out, + /* sessions: current, max, limit, total */ + "<td>%s</td><td>%s</td><td>%s</td>" + "<td><u>%s<div class=tips><table class=det>" + "<tr><th>Cum. sessions:</th><td>%s</td></tr>" + "", + U2H(stats[ST_F_SCUR].u.u32), U2H(stats[ST_F_SMAX].u.u32), U2H(stats[ST_F_SLIM].u.u32), + U2H(stats[ST_F_STOT].u.u64), + U2H(stats[ST_F_STOT].u.u64)); + + /* http response (via hover): 1xx, 2xx, 3xx, 4xx, 5xx, other */ + if (strcmp(field_str(stats, ST_F_MODE), "http") == 0) { + chunk_appendf(out, + "<tr><th>New connections:</th><td>%s</td></tr>" + "<tr><th>Reused connections:</th><td>%s</td><td>(%d%%)</td></tr>" + "<tr><th>Cum. 
HTTP requests:</th><td>%s</td></tr>" + "<tr><th>- HTTP 1xx responses:</th><td>%s</td></tr>" + "<tr><th>- HTTP 2xx responses:</th><td>%s</td></tr>" + "<tr><th> Compressed 2xx:</th><td>%s</td><td>(%d%%)</td></tr>" + "<tr><th>- HTTP 3xx responses:</th><td>%s</td></tr>" + "<tr><th>- HTTP 4xx responses:</th><td>%s</td></tr>" + "<tr><th>- HTTP 5xx responses:</th><td>%s</td></tr>" + "<tr><th>- other responses:</th><td>%s</td></tr>" + "<tr><th>Cache lookups:</th><td>%s</td></tr>" + "<tr><th>Cache hits:</th><td>%s</td><td>(%d%%)</td></tr>" + "<tr><th>Failed hdr rewrites:</th><td>%s</td></tr>" + "<tr><th>Internal errors:</th><td>%s</td></tr>" + "", + U2H(stats[ST_F_CONNECT].u.u64), + U2H(stats[ST_F_REUSE].u.u64), + (stats[ST_F_CONNECT].u.u64 + stats[ST_F_REUSE].u.u64) ? + (int)(100 * stats[ST_F_REUSE].u.u64 / (stats[ST_F_CONNECT].u.u64 + stats[ST_F_REUSE].u.u64)) : 0, + U2H(stats[ST_F_REQ_TOT].u.u64), + U2H(stats[ST_F_HRSP_1XX].u.u64), + U2H(stats[ST_F_HRSP_2XX].u.u64), + U2H(stats[ST_F_COMP_RSP].u.u64), + stats[ST_F_HRSP_2XX].u.u64 ? + (int)(100 * stats[ST_F_COMP_RSP].u.u64 / stats[ST_F_HRSP_2XX].u.u64) : 0, + U2H(stats[ST_F_HRSP_3XX].u.u64), + U2H(stats[ST_F_HRSP_4XX].u.u64), + U2H(stats[ST_F_HRSP_5XX].u.u64), + U2H(stats[ST_F_HRSP_OTHER].u.u64), + U2H(stats[ST_F_CACHE_LOOKUPS].u.u64), + U2H(stats[ST_F_CACHE_HITS].u.u64), + stats[ST_F_CACHE_LOOKUPS].u.u64 ? + (int)(100 * stats[ST_F_CACHE_HITS].u.u64 / stats[ST_F_CACHE_LOOKUPS].u.u64) : 0, + U2H(stats[ST_F_WREW].u.u64), + U2H(stats[ST_F_EINT].u.u64)); + } + + chunk_appendf(out, "<tr><th colspan=3>Max / Avg over last 1024 success. conn.</th></tr>"); + chunk_appendf(out, "<tr><th>- Queue time:</th><td>%s / %s</td><td>ms</td></tr>", + U2H(stats[ST_F_QT_MAX].u.u32), U2H(stats[ST_F_QTIME].u.u32)); + chunk_appendf(out, "<tr><th>- Connect time:</th><td>%s / %s</td><td>ms</td></tr>", + U2H(stats[ST_F_CT_MAX].u.u32), U2H(stats[ST_F_CTIME].u.u32)); + if (strcmp(field_str(stats, ST_F_MODE), "http") == 0) + chunk_appendf(out, "<tr><th>- Responses time:</th><td>%s / %s</td><td>ms</td></tr>", + U2H(stats[ST_F_RT_MAX].u.u32), U2H(stats[ST_F_RTIME].u.u32)); + chunk_appendf(out, "<tr><th>- Total time:</th><td>%s / %s</td><td>ms</td></tr>", + U2H(stats[ST_F_TT_MAX].u.u32), U2H(stats[ST_F_TTIME].u.u32)); + + chunk_appendf(out, + "</table></div></u></td>" + /* sessions: lbtot, last */ + "<td>%s</td><td>%s</td>" + /* bytes: in */ + "<td>%s</td>" + "", + U2H(stats[ST_F_LBTOT].u.u64), + human_time(stats[ST_F_LASTSESS].u.s32, 1), + U2H(stats[ST_F_BIN].u.u64)); + + chunk_appendf(out, + /* bytes:out + compression stats (via hover): comp_in, comp_out, comp_byp */ + "<td>%s%s<div class=tips><table class=det>" + "<tr><th>Response bytes in:</th><td>%s</td></tr>" + "<tr><th>Compression in:</th><td>%s</td></tr>" + "<tr><th>Compression out:</th><td>%s</td><td>(%d%%)</td></tr>" + "<tr><th>Compression bypass:</th><td>%s</td></tr>" + "<tr><th>Total bytes saved:</th><td>%s</td><td>(%d%%)</td></tr>" + "</table></div>%s</td>", + (stats[ST_F_COMP_IN].u.u64 || stats[ST_F_COMP_BYP].u.u64) ? "<u>":"", + U2H(stats[ST_F_BOUT].u.u64), + U2H(stats[ST_F_BOUT].u.u64), + U2H(stats[ST_F_COMP_IN].u.u64), + U2H(stats[ST_F_COMP_OUT].u.u64), + stats[ST_F_COMP_IN].u.u64 ? (int)(stats[ST_F_COMP_OUT].u.u64 * 100 / stats[ST_F_COMP_IN].u.u64) : 0, + U2H(stats[ST_F_COMP_BYP].u.u64), + U2H(stats[ST_F_COMP_IN].u.u64 - stats[ST_F_COMP_OUT].u.u64), + stats[ST_F_BOUT].u.u64 ? 
(int)((stats[ST_F_COMP_IN].u.u64 - stats[ST_F_COMP_OUT].u.u64) * 100 / stats[ST_F_BOUT].u.u64) : 0, + (stats[ST_F_COMP_IN].u.u64 || stats[ST_F_COMP_BYP].u.u64) ? "</u>":""); + + chunk_appendf(out, + /* denied: req, resp */ + "<td>%s</td><td>%s</td>" + /* errors : request, connect */ + "<td></td><td>%s</td>" + /* errors : response */ + "<td><u>%s<div class=tips>Connection resets during transfers: %lld client, %lld server</div></u></td>" + /* warnings: retries, redispatches */ + "<td>%lld</td><td>%lld</td>" + /* backend status: reflect backend status (up/down): we display UP + * if the backend has known working servers or if it has no server at + * all (eg: for stats). Then we display the total weight, number of + * active and backups. */ + "<td class=ac>%s %s</td><td class=ac> </td><td class=ac>%d/%d</td>" + "<td class=ac>%d</td><td class=ac>%d</td>" + "", + U2H(stats[ST_F_DREQ].u.u64), U2H(stats[ST_F_DRESP].u.u64), + U2H(stats[ST_F_ECON].u.u64), + U2H(stats[ST_F_ERESP].u.u64), + (long long)stats[ST_F_CLI_ABRT].u.u64, + (long long)stats[ST_F_SRV_ABRT].u.u64, + (long long)stats[ST_F_WRETR].u.u64, (long long)stats[ST_F_WREDIS].u.u64, + human_time(stats[ST_F_LASTCHG].u.u32, 1), + strcmp(field_str(stats, ST_F_STATUS), "DOWN") ? field_str(stats, ST_F_STATUS) : "<font color=\"red\"><b>DOWN</b></font>", + stats[ST_F_WEIGHT].u.u32, stats[ST_F_UWEIGHT].u.u32, + stats[ST_F_ACT].u.u32, stats[ST_F_BCK].u.u32); + + chunk_appendf(out, + /* rest of backend: nothing, down transitions, total downtime, throttle */ + "<td class=ac> </td><td>%d</td>" + "<td>%s</td>" + "<td></td>", + stats[ST_F_CHKDOWN].u.u32, + stats[ST_F_DOWNTIME].type ? human_time(stats[ST_F_DOWNTIME].u.u32, 1) : " "); + + if (flags & STAT_SHMODULES) { + list_for_each_entry(mod, &stats_module_list[STATS_DOMAIN_PROXY], list) { + chunk_appendf(out, "<td>"); + + if (stats_px_get_cap(mod->domain_flags) & STATS_PX_CAP_BE) { + chunk_appendf(out, + "<u>%s<div class=tips><table class=det>", + mod->name); + for (j = 0; j < mod->stats_count; ++j) { + chunk_appendf(out, + "<tr><th>%s</th><td>%s</td></tr>", + mod->stats[j].desc, field_to_html_str(&stats[ST_F_TOTAL_FIELDS + i])); + ++i; + } + chunk_appendf(out, "</table></div></u>"); + } else { + i += mod->stats_count; + } + + chunk_appendf(out, "</td>"); + } + } + + chunk_appendf(out, "</tr>"); + } + + return 1; +} + +int stats_dump_one_line(const struct field *stats, size_t stats_count, + struct appctx *appctx) +{ + struct show_stat_ctx *ctx = appctx->svcctx; + int ret; + + if (ctx->flags & STAT_FMT_HTML) + ret = stats_dump_fields_html(&trash_chunk, stats, ctx); + else if (ctx->flags & STAT_FMT_TYPED) + ret = stats_dump_fields_typed(&trash_chunk, stats, stats_count, ctx); + else if (ctx->flags & STAT_FMT_JSON) + ret = stats_dump_fields_json(&trash_chunk, stats, stats_count, ctx); + else + ret = stats_dump_fields_csv(&trash_chunk, stats, stats_count, ctx); + + return ret; +} + +/* Fill <stats> with the frontend statistics. <stats> is preallocated array of + * length <len>. If <selected_field> is != NULL, only fill this one. The length + * of the array must be at least ST_F_TOTAL_FIELDS. If this length is less than + * this value, or if the selected field is not implemented for frontends, the + * function returns 0, otherwise, it returns 1. + */ +int stats_fill_fe_stats(struct proxy *px, struct field *stats, int len, + enum stat_field *selected_field) +{ + enum stat_field current_field = (selected_field != NULL ? 
*selected_field : 0); + + if (len < ST_F_TOTAL_FIELDS) + return 0; + + for (; current_field < ST_F_TOTAL_FIELDS; current_field++) { + struct field metric = { 0 }; + + switch (current_field) { + case ST_F_PXNAME: + metric = mkf_str(FO_KEY|FN_NAME|FS_SERVICE, px->id); + break; + case ST_F_SVNAME: + metric = mkf_str(FO_KEY|FN_NAME|FS_SERVICE, "FRONTEND"); + break; + case ST_F_MODE: + metric = mkf_str(FO_CONFIG|FS_SERVICE, proxy_mode_str(px->mode)); + break; + case ST_F_SCUR: + metric = mkf_u32(0, px->feconn); + break; + case ST_F_SMAX: + metric = mkf_u32(FN_MAX, px->fe_counters.conn_max); + break; + case ST_F_SLIM: + metric = mkf_u32(FO_CONFIG|FN_LIMIT, px->maxconn); + break; + case ST_F_STOT: + metric = mkf_u64(FN_COUNTER, px->fe_counters.cum_sess); + break; + case ST_F_BIN: + metric = mkf_u64(FN_COUNTER, px->fe_counters.bytes_in); + break; + case ST_F_BOUT: + metric = mkf_u64(FN_COUNTER, px->fe_counters.bytes_out); + break; + case ST_F_DREQ: + metric = mkf_u64(FN_COUNTER, px->fe_counters.denied_req); + break; + case ST_F_DRESP: + metric = mkf_u64(FN_COUNTER, px->fe_counters.denied_resp); + break; + case ST_F_EREQ: + metric = mkf_u64(FN_COUNTER, px->fe_counters.failed_req); + break; + case ST_F_DCON: + metric = mkf_u64(FN_COUNTER, px->fe_counters.denied_conn); + break; + case ST_F_DSES: + metric = mkf_u64(FN_COUNTER, px->fe_counters.denied_sess); + break; + case ST_F_STATUS: { + const char *state; + + if (px->flags & (PR_FL_DISABLED|PR_FL_STOPPED)) + state = "STOP"; + else if (px->flags & PR_FL_PAUSED) + state = "PAUSED"; + else + state = "OPEN"; + metric = mkf_str(FO_STATUS, state); + break; + } + case ST_F_PID: + metric = mkf_u32(FO_KEY, 1); + break; + case ST_F_IID: + metric = mkf_u32(FO_KEY|FS_SERVICE, px->uuid); + break; + case ST_F_SID: + metric = mkf_u32(FO_KEY|FS_SERVICE, 0); + break; + case ST_F_TYPE: + metric = mkf_u32(FO_CONFIG|FS_SERVICE, STATS_TYPE_FE); + break; + case ST_F_RATE: + metric = mkf_u32(FN_RATE, read_freq_ctr(&px->fe_sess_per_sec)); + break; + case ST_F_RATE_LIM: + metric = mkf_u32(FO_CONFIG|FN_LIMIT, px->fe_sps_lim); + break; + case ST_F_RATE_MAX: + metric = mkf_u32(FN_MAX, px->fe_counters.sps_max); + break; + case ST_F_WREW: + metric = mkf_u64(FN_COUNTER, px->fe_counters.failed_rewrites); + break; + case ST_F_EINT: + metric = mkf_u64(FN_COUNTER, px->fe_counters.internal_errors); + break; + case ST_F_HRSP_1XX: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->fe_counters.p.http.rsp[1]); + break; + case ST_F_HRSP_2XX: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->fe_counters.p.http.rsp[2]); + break; + case ST_F_HRSP_3XX: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->fe_counters.p.http.rsp[3]); + break; + case ST_F_HRSP_4XX: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->fe_counters.p.http.rsp[4]); + break; + case ST_F_HRSP_5XX: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->fe_counters.p.http.rsp[5]); + break; + case ST_F_HRSP_OTHER: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->fe_counters.p.http.rsp[0]); + break; + case ST_F_INTERCEPTED: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->fe_counters.intercepted_req); + break; + case ST_F_CACHE_LOOKUPS: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->fe_counters.p.http.cache_lookups); + break; + case ST_F_CACHE_HITS: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->fe_counters.p.http.cache_hits); + break; + case ST_F_REQ_RATE: + metric = 
mkf_u32(FN_RATE, read_freq_ctr(&px->fe_req_per_sec)); + break; + case ST_F_REQ_RATE_MAX: + metric = mkf_u32(FN_MAX, px->fe_counters.p.http.rps_max); + break; + case ST_F_REQ_TOT: { + int i; + uint64_t total_req; + size_t nb_reqs = + sizeof(px->fe_counters.p.http.cum_req) / sizeof(*px->fe_counters.p.http.cum_req); + + total_req = 0; + for (i = 0; i < nb_reqs; i++) + total_req += px->fe_counters.p.http.cum_req[i]; + metric = mkf_u64(FN_COUNTER, total_req); + break; + } + case ST_F_COMP_IN: + metric = mkf_u64(FN_COUNTER, px->fe_counters.comp_in[COMP_DIR_RES]); + break; + case ST_F_COMP_OUT: + metric = mkf_u64(FN_COUNTER, px->fe_counters.comp_out[COMP_DIR_RES]); + break; + case ST_F_COMP_BYP: + metric = mkf_u64(FN_COUNTER, px->fe_counters.comp_byp[COMP_DIR_RES]); + break; + case ST_F_COMP_RSP: + metric = mkf_u64(FN_COUNTER, px->fe_counters.p.http.comp_rsp); + break; + case ST_F_CONN_RATE: + metric = mkf_u32(FN_RATE, read_freq_ctr(&px->fe_conn_per_sec)); + break; + case ST_F_CONN_RATE_MAX: + metric = mkf_u32(FN_MAX, px->fe_counters.cps_max); + break; + case ST_F_CONN_TOT: + metric = mkf_u64(FN_COUNTER, px->fe_counters.cum_conn); + break; + case ST_F_SESS_OTHER: { + int i; + uint64_t total_sess; + size_t nb_sess = + sizeof(px->fe_counters.cum_sess_ver) / sizeof(*px->fe_counters.cum_sess_ver); + + total_sess = px->fe_counters.cum_sess; + for (i = 0; i < nb_sess; i++) + total_sess -= px->fe_counters.cum_sess_ver[i]; + total_sess = (int64_t)total_sess < 0 ? 0 : total_sess; + metric = mkf_u64(FN_COUNTER, total_sess); + break; + } + case ST_F_H1SESS: + metric = mkf_u64(FN_COUNTER, px->fe_counters.cum_sess_ver[0]); + break; + case ST_F_H2SESS: + metric = mkf_u64(FN_COUNTER, px->fe_counters.cum_sess_ver[1]); + break; + case ST_F_H3SESS: + metric = mkf_u64(FN_COUNTER, px->fe_counters.cum_sess_ver[2]); + break; + case ST_F_REQ_OTHER: + metric = mkf_u64(FN_COUNTER, px->fe_counters.p.http.cum_req[0]); + break; + case ST_F_H1REQ: + metric = mkf_u64(FN_COUNTER, px->fe_counters.p.http.cum_req[1]); + break; + case ST_F_H2REQ: + metric = mkf_u64(FN_COUNTER, px->fe_counters.p.http.cum_req[2]); + break; + case ST_F_H3REQ: + metric = mkf_u64(FN_COUNTER, px->fe_counters.p.http.cum_req[3]); + break; + default: + /* not used for frontends. If a specific metric + * is requested, return an error. Otherwise continue. + */ + if (selected_field != NULL) + return 0; + continue; + } + stats[current_field] = metric; + if (selected_field != NULL) + break; + } + return 1; +} + +/* Dumps a frontend's line to the local trash buffer for the current proxy <px> + * and uses the state from stream connector <sc>. The caller is responsible for + * clearing the local trash buffer if needed. Returns non-zero if it emits + * anything, zero otherwise. 
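+ *
+ * As an illustration only (this caller does not exist in this file), the
+ * fill/dump pair is used roughly as:
+ *
+ *   struct field stats[ST_F_TOTAL_FIELDS];
+ *
+ *   memset(stats, 0, sizeof(stats));
+ *   if (stats_fill_fe_stats(px, stats, ST_F_TOTAL_FIELDS, NULL))
+ *           stats_dump_one_line(stats, ST_F_TOTAL_FIELDS, appctx);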
+ */
+static int stats_dump_fe_stats(struct stconn *sc, struct proxy *px)
+{
+	struct appctx *appctx = __sc_appctx(sc);
+	struct show_stat_ctx *ctx = appctx->svcctx;
+	struct field *stats = stat_l[STATS_DOMAIN_PROXY];
+	struct stats_module *mod;
+	size_t stats_count = ST_F_TOTAL_FIELDS;
+
+	if (!(px->cap & PR_CAP_FE))
+		return 0;
+
+	if ((ctx->flags & STAT_BOUND) && !(ctx->type & (1 << STATS_TYPE_FE)))
+		return 0;
+
+	memset(stats, 0, sizeof(struct field) * stat_count[STATS_DOMAIN_PROXY]);
+
+	if (!stats_fill_fe_stats(px, stats, ST_F_TOTAL_FIELDS, NULL))
+		return 0;
+
+	list_for_each_entry(mod, &stats_module_list[STATS_DOMAIN_PROXY], list) {
+		void *counters;
+
+		if (!(stats_px_get_cap(mod->domain_flags) & STATS_PX_CAP_FE)) {
+			stats_count += mod->stats_count;
+			continue;
+		}
+
+		counters = EXTRA_COUNTERS_GET(px->extra_counters_fe, mod);
+		mod->fill_stats(counters, stats + stats_count);
+		stats_count += mod->stats_count;
+	}
+
+	return stats_dump_one_line(stats, stats_count, appctx);
+}
+
+/* Fill <stats> with the listener statistics. <stats> is a preallocated array
+ * of length <len>. The length of the array must be at least ST_F_TOTAL_FIELDS.
+ * If this length is less than this value, the function returns 0, otherwise,
+ * it returns 1. If <selected_field> is != NULL, only fill this one. <flags>
+ * can take the value STAT_SHLGNDS.
+ */
+int stats_fill_li_stats(struct proxy *px, struct listener *l, int flags,
+                        struct field *stats, int len, enum stat_field *selected_field)
+{
+	enum stat_field current_field = (selected_field != NULL ? *selected_field : 0);
+	struct buffer *out = get_trash_chunk();
+
+	if (len < ST_F_TOTAL_FIELDS)
+		return 0;
+
+	if (!l->counters)
+		return 0;
+
+	chunk_reset(out);
+
+	for (; current_field < ST_F_TOTAL_FIELDS; current_field++) {
+		struct field metric = { 0 };
+
+		switch (current_field) {
+		case ST_F_PXNAME:
+			metric = mkf_str(FO_KEY|FN_NAME|FS_SERVICE, px->id);
+			break;
+		case ST_F_SVNAME:
+			metric = mkf_str(FO_KEY|FN_NAME|FS_SERVICE, l->name);
+			break;
+		case ST_F_MODE:
+			metric = mkf_str(FO_CONFIG|FS_SERVICE, proxy_mode_str(px->mode));
+			break;
+		case ST_F_SCUR:
+			metric = mkf_u32(0, l->nbconn);
+			break;
+		case ST_F_SMAX:
+			metric = mkf_u32(FN_MAX, l->counters->conn_max);
+			break;
+		case ST_F_SLIM:
+			metric = mkf_u32(FO_CONFIG|FN_LIMIT, l->bind_conf->maxconn);
+			break;
+		case ST_F_STOT:
+			metric = mkf_u64(FN_COUNTER, l->counters->cum_conn);
+			break;
+		case ST_F_BIN:
+			metric = mkf_u64(FN_COUNTER, l->counters->bytes_in);
+			break;
+		case ST_F_BOUT:
+			metric = mkf_u64(FN_COUNTER, l->counters->bytes_out);
+			break;
+		case ST_F_DREQ:
+			metric = mkf_u64(FN_COUNTER, l->counters->denied_req);
+			break;
+		case ST_F_DRESP:
+			metric = mkf_u64(FN_COUNTER, l->counters->denied_resp);
+			break;
+		case ST_F_EREQ:
+			metric = mkf_u64(FN_COUNTER, l->counters->failed_req);
+			break;
+		case ST_F_DCON:
+			metric = mkf_u64(FN_COUNTER, l->counters->denied_conn);
+			break;
+		case ST_F_DSES:
+			metric = mkf_u64(FN_COUNTER, l->counters->denied_sess);
+			break;
+		case ST_F_STATUS:
+			metric = mkf_str(FO_STATUS, li_status_st[get_li_status(l)]);
+			break;
+		case ST_F_PID:
+			metric = mkf_u32(FO_KEY, 1);
+			break;
+		case ST_F_IID:
+			metric = mkf_u32(FO_KEY|FS_SERVICE, px->uuid);
+			break;
+		case ST_F_SID:
+			metric = mkf_u32(FO_KEY|FS_SERVICE, l->luid);
+			break;
+		case ST_F_TYPE:
+			metric = mkf_u32(FO_CONFIG|FS_SERVICE, STATS_TYPE_SO);
+			break;
+		case ST_F_WREW:
+			metric = mkf_u64(FN_COUNTER, l->counters->failed_rewrites);
+			break;
+		case ST_F_EINT:
+			metric = mkf_u64(FN_COUNTER,
l->counters->internal_errors); + break; + case ST_F_ADDR: + if (flags & STAT_SHLGNDS) { + char str[INET6_ADDRSTRLEN]; + int port; + + port = get_host_port(&l->rx.addr); + switch (addr_to_str(&l->rx.addr, str, sizeof(str))) { + case AF_INET: + metric = mkf_str(FO_CONFIG|FS_SERVICE, chunk_newstr(out)); + chunk_appendf(out, "%s:%d", str, port); + break; + case AF_INET6: + metric = mkf_str(FO_CONFIG|FS_SERVICE, chunk_newstr(out)); + chunk_appendf(out, "[%s]:%d", str, port); + break; + case AF_UNIX: + metric = mkf_str(FO_CONFIG|FS_SERVICE, "unix"); + break; + case -1: + metric = mkf_str(FO_CONFIG|FS_SERVICE, chunk_newstr(out)); + chunk_strcat(out, strerror(errno)); + break; + default: /* address family not supported */ + break; + } + } + break; + case ST_F_PROTO: + metric = mkf_str(FO_STATUS, l->rx.proto->name); + break; + default: + /* not used for listen. If a specific metric + * is requested, return an error. Otherwise continue. + */ + if (selected_field != NULL) + return 0; + continue; + } + stats[current_field] = metric; + if (selected_field != NULL) + break; + } + return 1; +} + +/* Dumps a line for listener <l> and proxy <px> to the local trash buffer and + * uses the state from stream connector <sc>. The caller is responsible for + * clearing the local trash buffer if needed. Returns non-zero if it emits + * anything, zero otherwise. + */ +static int stats_dump_li_stats(struct stconn *sc, struct proxy *px, struct listener *l) +{ + struct appctx *appctx = __sc_appctx(sc); + struct show_stat_ctx *ctx = appctx->svcctx; + struct field *stats = stat_l[STATS_DOMAIN_PROXY]; + struct stats_module *mod; + size_t stats_count = ST_F_TOTAL_FIELDS; + + memset(stats, 0, sizeof(struct field) * stat_count[STATS_DOMAIN_PROXY]); + + if (!stats_fill_li_stats(px, l, ctx->flags, stats, + ST_F_TOTAL_FIELDS, NULL)) + return 0; + + list_for_each_entry(mod, &stats_module_list[STATS_DOMAIN_PROXY], list) { + void *counters; + + if (!(stats_px_get_cap(mod->domain_flags) & STATS_PX_CAP_LI)) { + stats_count += mod->stats_count; + continue; + } + + counters = EXTRA_COUNTERS_GET(l->extra_counters, mod); + mod->fill_stats(counters, stats + stats_count); + stats_count += mod->stats_count; + } + + return stats_dump_one_line(stats, stats_count, appctx); +} + +enum srv_stats_state { + SRV_STATS_STATE_DOWN = 0, + SRV_STATS_STATE_DOWN_AGENT, + SRV_STATS_STATE_GOING_UP, + SRV_STATS_STATE_UP_GOING_DOWN, + SRV_STATS_STATE_UP, + SRV_STATS_STATE_NOLB_GOING_DOWN, + SRV_STATS_STATE_NOLB, + SRV_STATS_STATE_DRAIN_GOING_DOWN, + SRV_STATS_STATE_DRAIN, + SRV_STATS_STATE_DRAIN_AGENT, + SRV_STATS_STATE_NO_CHECK, + + SRV_STATS_STATE_COUNT, /* Must be last */ +}; + +static const char *srv_hlt_st[SRV_STATS_STATE_COUNT] = { + [SRV_STATS_STATE_DOWN] = "DOWN", + [SRV_STATS_STATE_DOWN_AGENT] = "DOWN (agent)", + [SRV_STATS_STATE_GOING_UP] = "DOWN %d/%d", + [SRV_STATS_STATE_UP_GOING_DOWN] = "UP %d/%d", + [SRV_STATS_STATE_UP] = "UP", + [SRV_STATS_STATE_NOLB_GOING_DOWN] = "NOLB %d/%d", + [SRV_STATS_STATE_NOLB] = "NOLB", + [SRV_STATS_STATE_DRAIN_GOING_DOWN] = "DRAIN %d/%d", + [SRV_STATS_STATE_DRAIN] = "DRAIN", + [SRV_STATS_STATE_DRAIN_AGENT] = "DRAIN (agent)", + [SRV_STATS_STATE_NO_CHECK] = "no check" +}; + +/* Compute server state helper + */ +static void stats_fill_sv_stats_computestate(struct server *sv, struct server *ref, + enum srv_stats_state *state) +{ + if (sv->cur_state == SRV_ST_RUNNING || sv->cur_state == SRV_ST_STARTING) { + if ((ref->check.state & CHK_ST_ENABLED) && + (ref->check.health < ref->check.rise + ref->check.fall - 1)) { + 
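			/* a running server's health counter lives in the range
+			 * [rise .. rise+fall-1] and only the top of that window
+			 * means fully healthy; e.g. with rise=2 and fall=3, full
+			 * health is 4 and health=3 is reported as "UP 2/3"
+			 * (going down), matching the srv_hlt_st[] formats above.
+			 */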
+			*state = SRV_STATS_STATE_UP_GOING_DOWN;
+		} else {
+			*state = SRV_STATS_STATE_UP;
+		}
+
+		if (sv->cur_admin & SRV_ADMF_DRAIN) {
+			if (ref->agent.state & CHK_ST_ENABLED)
+				*state = SRV_STATS_STATE_DRAIN_AGENT;
+			else if (*state == SRV_STATS_STATE_UP_GOING_DOWN)
+				*state = SRV_STATS_STATE_DRAIN_GOING_DOWN;
+			else
+				*state = SRV_STATS_STATE_DRAIN;
+		}
+
+		if (*state == SRV_STATS_STATE_UP && !(ref->check.state & CHK_ST_ENABLED)) {
+			*state = SRV_STATS_STATE_NO_CHECK;
+		}
+	}
+	else if (sv->cur_state == SRV_ST_STOPPING) {
+		if ((!(sv->check.state & CHK_ST_ENABLED) && !sv->track) ||
+		    (ref->check.health == ref->check.rise + ref->check.fall - 1)) {
+			*state = SRV_STATS_STATE_NOLB;
+		} else {
+			*state = SRV_STATS_STATE_NOLB_GOING_DOWN;
+		}
+	}
+	else {	/* stopped */
+		if ((ref->agent.state & CHK_ST_ENABLED) && !ref->agent.health) {
+			*state = SRV_STATS_STATE_DOWN_AGENT;
+		} else if ((ref->check.state & CHK_ST_ENABLED) && !ref->check.health) {
+			*state = SRV_STATS_STATE_DOWN; /* DOWN */
+		} else if ((ref->agent.state & CHK_ST_ENABLED) || (ref->check.state & CHK_ST_ENABLED)) {
+			*state = SRV_STATS_STATE_GOING_UP;
+		} else {
+			*state = SRV_STATS_STATE_DOWN; /* DOWN, unchecked */
+		}
+	}
+}
+
+/* Fill <stats> with the server statistics. <stats> is a preallocated array of
+ * length <len>. If <selected_field> is != NULL, only fill this one. The length
+ * of the array must be at least ST_F_TOTAL_FIELDS. If this length is less than
+ * this value, or if the selected field is not implemented for servers, the
+ * function returns 0, otherwise, it returns 1. <flags> can take the value
+ * STAT_SHLGNDS.
+ */
+int stats_fill_sv_stats(struct proxy *px, struct server *sv, int flags,
+                        struct field *stats, int len,
+                        enum stat_field *selected_field)
+{
+	enum stat_field current_field = (selected_field != NULL ? *selected_field : 0);
+	struct server *via = sv->track ? sv->track : sv;
+	struct server *ref = via;
+	enum srv_stats_state state = 0;
+	char str[INET6_ADDRSTRLEN];
+	struct buffer *out = get_trash_chunk();
+	char *fld_status;
+	long long srv_samples_counter;
+	unsigned int srv_samples_window = TIME_STATS_SAMPLES;
+
+	if (len < ST_F_TOTAL_FIELDS)
+		return 0;
+
+	chunk_reset(out);
+
+	/* compute state for later use */
+	if (selected_field == NULL || *selected_field == ST_F_STATUS ||
+	    *selected_field == ST_F_CHECK_RISE || *selected_field == ST_F_CHECK_FALL ||
+	    *selected_field == ST_F_CHECK_HEALTH || *selected_field == ST_F_HANAFAIL) {
+		/* we have "via" which is the tracked server as described in the configuration,
+		 * and "ref" which is the checked server and the end of the chain.
+		 */
+		while (ref->track)
+			ref = ref->track;
+		stats_fill_sv_stats_computestate(sv, ref, &state);
+	}
+
+	/* compute time values for later use */
+	if (selected_field == NULL || *selected_field == ST_F_QTIME ||
+	    *selected_field == ST_F_CTIME || *selected_field == ST_F_RTIME ||
+	    *selected_field == ST_F_TTIME) {
+		srv_samples_counter = (px->mode == PR_MODE_HTTP) ?
sv->counters.p.http.cum_req : sv->counters.cum_lbconn; + if (srv_samples_counter < TIME_STATS_SAMPLES && srv_samples_counter > 0) + srv_samples_window = srv_samples_counter; + } + + for (; current_field < ST_F_TOTAL_FIELDS; current_field++) { + struct field metric = { 0 }; + + switch (current_field) { + case ST_F_PXNAME: + metric = mkf_str(FO_KEY|FN_NAME|FS_SERVICE, px->id); + break; + case ST_F_SVNAME: + metric = mkf_str(FO_KEY|FN_NAME|FS_SERVICE, sv->id); + break; + case ST_F_MODE: + metric = mkf_str(FO_CONFIG|FS_SERVICE, proxy_mode_str(px->mode)); + break; + case ST_F_QCUR: + metric = mkf_u32(0, sv->queue.length); + break; + case ST_F_QMAX: + metric = mkf_u32(FN_MAX, sv->counters.nbpend_max); + break; + case ST_F_SCUR: + metric = mkf_u32(0, sv->cur_sess); + break; + case ST_F_SMAX: + metric = mkf_u32(FN_MAX, sv->counters.cur_sess_max); + break; + case ST_F_SLIM: + if (sv->maxconn) + metric = mkf_u32(FO_CONFIG|FN_LIMIT, sv->maxconn); + break; + case ST_F_SRV_ICUR: + metric = mkf_u32(0, sv->curr_idle_conns); + break; + case ST_F_SRV_ILIM: + if (sv->max_idle_conns != -1) + metric = mkf_u32(FO_CONFIG|FN_LIMIT, sv->max_idle_conns); + break; + case ST_F_STOT: + metric = mkf_u64(FN_COUNTER, sv->counters.cum_sess); + break; + case ST_F_BIN: + metric = mkf_u64(FN_COUNTER, sv->counters.bytes_in); + break; + case ST_F_BOUT: + metric = mkf_u64(FN_COUNTER, sv->counters.bytes_out); + break; + case ST_F_DRESP: + metric = mkf_u64(FN_COUNTER, sv->counters.denied_resp); + break; + case ST_F_ECON: + metric = mkf_u64(FN_COUNTER, sv->counters.failed_conns); + break; + case ST_F_ERESP: + metric = mkf_u64(FN_COUNTER, sv->counters.failed_resp); + break; + case ST_F_WRETR: + metric = mkf_u64(FN_COUNTER, sv->counters.retries); + break; + case ST_F_WREDIS: + metric = mkf_u64(FN_COUNTER, sv->counters.redispatches); + break; + case ST_F_WREW: + metric = mkf_u64(FN_COUNTER, sv->counters.failed_rewrites); + break; + case ST_F_EINT: + metric = mkf_u64(FN_COUNTER, sv->counters.internal_errors); + break; + case ST_F_CONNECT: + metric = mkf_u64(FN_COUNTER, sv->counters.connect); + break; + case ST_F_REUSE: + metric = mkf_u64(FN_COUNTER, sv->counters.reuse); + break; + case ST_F_IDLE_CONN_CUR: + metric = mkf_u32(0, sv->curr_idle_nb); + break; + case ST_F_SAFE_CONN_CUR: + metric = mkf_u32(0, sv->curr_safe_nb); + break; + case ST_F_USED_CONN_CUR: + metric = mkf_u32(0, sv->curr_used_conns); + break; + case ST_F_NEED_CONN_EST: + metric = mkf_u32(0, sv->est_need_conns); + break; + case ST_F_STATUS: + fld_status = chunk_newstr(out); + if (sv->cur_admin & SRV_ADMF_RMAINT) + chunk_appendf(out, "MAINT (resolution)"); + else if (sv->cur_admin & SRV_ADMF_IMAINT) + chunk_appendf(out, "MAINT (via %s/%s)", via->proxy->id, via->id); + else if (sv->cur_admin & SRV_ADMF_MAINT) + chunk_appendf(out, "MAINT"); + else + chunk_appendf(out, + srv_hlt_st[state], + (ref->cur_state != SRV_ST_STOPPED) ? (ref->check.health - ref->check.rise + 1) : (ref->check.health), + (ref->cur_state != SRV_ST_STOPPED) ? (ref->check.fall) : (ref->check.rise)); + + metric = mkf_str(FO_STATUS, fld_status); + break; + case ST_F_LASTCHG: + metric = mkf_u32(FN_AGE, ns_to_sec(now_ns) - sv->last_change); + break; + case ST_F_WEIGHT: + metric = mkf_u32(FN_AVG, (sv->cur_eweight * px->lbprm.wmult + px->lbprm.wdiv - 1) / px->lbprm.wdiv); + break; + case ST_F_UWEIGHT: + metric = mkf_u32(FN_AVG, sv->uweight); + break; + case ST_F_ACT: + metric = mkf_u32(FO_STATUS, (sv->flags & SRV_F_BACKUP) ? 
0 : 1); + break; + case ST_F_BCK: + metric = mkf_u32(FO_STATUS, (sv->flags & SRV_F_BACKUP) ? 1 : 0); + break; + case ST_F_CHKFAIL: + if (sv->check.state & CHK_ST_ENABLED) + metric = mkf_u64(FN_COUNTER, sv->counters.failed_checks); + break; + case ST_F_CHKDOWN: + if (sv->check.state & CHK_ST_ENABLED) + metric = mkf_u64(FN_COUNTER, sv->counters.down_trans); + break; + case ST_F_DOWNTIME: + if (sv->check.state & CHK_ST_ENABLED) + metric = mkf_u32(FN_COUNTER, srv_downtime(sv)); + break; + case ST_F_QLIMIT: + if (sv->maxqueue) + metric = mkf_u32(FO_CONFIG|FS_SERVICE, sv->maxqueue); + break; + case ST_F_PID: + metric = mkf_u32(FO_KEY, 1); + break; + case ST_F_IID: + metric = mkf_u32(FO_KEY|FS_SERVICE, px->uuid); + break; + case ST_F_SID: + metric = mkf_u32(FO_KEY|FS_SERVICE, sv->puid); + break; + case ST_F_SRID: + metric = mkf_u32(FN_COUNTER, sv->rid); + break; + case ST_F_THROTTLE: + if (sv->cur_state == SRV_ST_STARTING && !server_is_draining(sv)) + metric = mkf_u32(FN_AVG, server_throttle_rate(sv)); + break; + case ST_F_LBTOT: + metric = mkf_u64(FN_COUNTER, sv->counters.cum_lbconn); + break; + case ST_F_TRACKED: + if (sv->track) { + char *fld_track = chunk_newstr(out); + chunk_appendf(out, "%s/%s", sv->track->proxy->id, sv->track->id); + metric = mkf_str(FO_CONFIG|FN_NAME|FS_SERVICE, fld_track); + } + break; + case ST_F_TYPE: + metric = mkf_u32(FO_CONFIG|FS_SERVICE, STATS_TYPE_SV); + break; + case ST_F_RATE: + metric = mkf_u32(FN_RATE, read_freq_ctr(&sv->sess_per_sec)); + break; + case ST_F_RATE_MAX: + metric = mkf_u32(FN_MAX, sv->counters.sps_max); + break; + case ST_F_CHECK_STATUS: + if ((sv->check.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) == CHK_ST_ENABLED) { + const char *fld_chksts; + + fld_chksts = chunk_newstr(out); + chunk_strcat(out, "* "); // for check in progress + chunk_strcat(out, get_check_status_info(sv->check.status)); + if (!(sv->check.state & CHK_ST_INPROGRESS)) + fld_chksts += 2; // skip "* " + metric = mkf_str(FN_OUTPUT, fld_chksts); + } + break; + case ST_F_CHECK_CODE: + if ((sv->check.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) == CHK_ST_ENABLED && + sv->check.status >= HCHK_STATUS_L57DATA) + metric = mkf_u32(FN_OUTPUT, sv->check.code); + break; + case ST_F_CHECK_DURATION: + if ((sv->check.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) == CHK_ST_ENABLED && + sv->check.status >= HCHK_STATUS_CHECKED) + metric = mkf_u64(FN_DURATION, MAX(sv->check.duration, 0)); + break; + case ST_F_CHECK_DESC: + if ((sv->check.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) == CHK_ST_ENABLED) + metric = mkf_str(FN_OUTPUT, get_check_status_description(sv->check.status)); + break; + case ST_F_LAST_CHK: + if ((sv->check.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) == CHK_ST_ENABLED) + metric = mkf_str(FN_OUTPUT, sv->check.desc); + break; + case ST_F_CHECK_RISE: + if ((sv->check.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) == CHK_ST_ENABLED) + metric = mkf_u32(FO_CONFIG|FS_SERVICE, ref->check.rise); + break; + case ST_F_CHECK_FALL: + if ((sv->check.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) == CHK_ST_ENABLED) + metric = mkf_u32(FO_CONFIG|FS_SERVICE, ref->check.fall); + break; + case ST_F_CHECK_HEALTH: + if ((sv->check.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) == CHK_ST_ENABLED) + metric = mkf_u32(FO_CONFIG|FS_SERVICE, ref->check.health); + break; + case ST_F_AGENT_STATUS: + if ((sv->agent.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) == CHK_ST_ENABLED) { + const char *fld_chksts; + + fld_chksts = chunk_newstr(out); + chunk_strcat(out, "* "); // for check in progress + chunk_strcat(out, get_check_status_info(sv->agent.status)); + if 
(!(sv->agent.state & CHK_ST_INPROGRESS)) + fld_chksts += 2; // skip "* " + metric = mkf_str(FN_OUTPUT, fld_chksts); + } + break; + case ST_F_AGENT_CODE: + if ((sv->agent.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) == CHK_ST_ENABLED && + (sv->agent.status >= HCHK_STATUS_L57DATA)) + metric = mkf_u32(FN_OUTPUT, sv->agent.code); + break; + case ST_F_AGENT_DURATION: + if ((sv->agent.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) == CHK_ST_ENABLED) + metric = mkf_u64(FN_DURATION, sv->agent.duration); + break; + case ST_F_AGENT_DESC: + if ((sv->agent.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) == CHK_ST_ENABLED) + metric = mkf_str(FN_OUTPUT, get_check_status_description(sv->agent.status)); + break; + case ST_F_LAST_AGT: + if ((sv->agent.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) == CHK_ST_ENABLED) + metric = mkf_str(FN_OUTPUT, sv->agent.desc); + break; + case ST_F_AGENT_RISE: + if ((sv->agent.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) == CHK_ST_ENABLED) + metric = mkf_u32(FO_CONFIG|FS_SERVICE, sv->agent.rise); + break; + case ST_F_AGENT_FALL: + if ((sv->agent.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) == CHK_ST_ENABLED) + metric = mkf_u32(FO_CONFIG|FS_SERVICE, sv->agent.fall); + break; + case ST_F_AGENT_HEALTH: + if ((sv->agent.state & (CHK_ST_ENABLED|CHK_ST_PAUSED)) == CHK_ST_ENABLED) + metric = mkf_u32(FO_CONFIG|FS_SERVICE, sv->agent.health); + break; + case ST_F_REQ_TOT: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, sv->counters.p.http.cum_req); + break; + case ST_F_HRSP_1XX: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, sv->counters.p.http.rsp[1]); + break; + case ST_F_HRSP_2XX: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, sv->counters.p.http.rsp[2]); + break; + case ST_F_HRSP_3XX: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, sv->counters.p.http.rsp[3]); + break; + case ST_F_HRSP_4XX: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, sv->counters.p.http.rsp[4]); + break; + case ST_F_HRSP_5XX: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, sv->counters.p.http.rsp[5]); + break; + case ST_F_HRSP_OTHER: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, sv->counters.p.http.rsp[0]); + break; + case ST_F_HANAFAIL: + if (ref->observe) + metric = mkf_u64(FN_COUNTER, sv->counters.failed_hana); + break; + case ST_F_CLI_ABRT: + metric = mkf_u64(FN_COUNTER, sv->counters.cli_aborts); + break; + case ST_F_SRV_ABRT: + metric = mkf_u64(FN_COUNTER, sv->counters.srv_aborts); + break; + case ST_F_LASTSESS: + metric = mkf_s32(FN_AGE, srv_lastsession(sv)); + break; + case ST_F_QTIME: + metric = mkf_u32(FN_AVG, swrate_avg(sv->counters.q_time, srv_samples_window)); + break; + case ST_F_CTIME: + metric = mkf_u32(FN_AVG, swrate_avg(sv->counters.c_time, srv_samples_window)); + break; + case ST_F_RTIME: + metric = mkf_u32(FN_AVG, swrate_avg(sv->counters.d_time, srv_samples_window)); + break; + case ST_F_TTIME: + metric = mkf_u32(FN_AVG, swrate_avg(sv->counters.t_time, srv_samples_window)); + break; + case ST_F_QT_MAX: + metric = mkf_u32(FN_MAX, sv->counters.qtime_max); + break; + case ST_F_CT_MAX: + metric = mkf_u32(FN_MAX, sv->counters.ctime_max); + break; + case ST_F_RT_MAX: + metric = mkf_u32(FN_MAX, sv->counters.dtime_max); + break; + case ST_F_TT_MAX: + metric = mkf_u32(FN_MAX, sv->counters.ttime_max); + break; + case ST_F_ADDR: + if (flags & STAT_SHLGNDS) { + switch (addr_to_str(&sv->addr, str, sizeof(str))) { + case AF_INET: + metric = mkf_str(FO_CONFIG|FS_SERVICE, chunk_newstr(out)); + chunk_appendf(out, "%s:%d", str, 
sv->svc_port);
+				break;
+			case AF_INET6:
+				metric = mkf_str(FO_CONFIG|FS_SERVICE, chunk_newstr(out));
+				chunk_appendf(out, "[%s]:%d", str, sv->svc_port);
+				break;
+			case AF_UNIX:
+				metric = mkf_str(FO_CONFIG|FS_SERVICE, "unix");
+				break;
+			case -1:
+				metric = mkf_str(FO_CONFIG|FS_SERVICE, chunk_newstr(out));
+				chunk_strcat(out, strerror(errno));
+				break;
+			default: /* address family not supported */
+				break;
+			}
+		}
+		break;
+	case ST_F_COOKIE:
+		if (flags & STAT_SHLGNDS && sv->cookie)
+			metric = mkf_str(FO_CONFIG|FN_NAME|FS_SERVICE, sv->cookie);
+		break;
+	default:
+		/* not used for servers. If a specific metric
+		 * is requested, return an error. Otherwise continue.
+		 */
+		if (selected_field != NULL)
+			return 0;
+		continue;
+	}
+	stats[current_field] = metric;
+	if (selected_field != NULL)
+		break;
+	}
+	return 1;
+}
+
+/* Dumps a line for server <sv> and proxy <px> to the local trash buffer and
+ * uses the state from stream connector <sc>. The caller is responsible for
+ * clearing the local trash buffer if needed. Returns non-zero if it emits
+ * anything, zero otherwise.
+ */
+static int stats_dump_sv_stats(struct stconn *sc, struct proxy *px, struct server *sv)
+{
+	struct appctx *appctx = __sc_appctx(sc);
+	struct show_stat_ctx *ctx = appctx->svcctx;
+	struct stats_module *mod;
+	struct field *stats = stat_l[STATS_DOMAIN_PROXY];
+	size_t stats_count = ST_F_TOTAL_FIELDS;
+
+	memset(stats, 0, sizeof(struct field) * stat_count[STATS_DOMAIN_PROXY]);
+
+	if (!stats_fill_sv_stats(px, sv, ctx->flags, stats,
+	                         ST_F_TOTAL_FIELDS, NULL))
+		return 0;
+
+	list_for_each_entry(mod, &stats_module_list[STATS_DOMAIN_PROXY], list) {
+		void *counters;
+
+		if (stats_get_domain(mod->domain_flags) != STATS_DOMAIN_PROXY)
+			continue;
+
+		if (!(stats_px_get_cap(mod->domain_flags) & STATS_PX_CAP_SRV)) {
+			stats_count += mod->stats_count;
+			continue;
+		}
+
+		counters = EXTRA_COUNTERS_GET(sv->extra_counters, mod);
+		mod->fill_stats(counters, stats + stats_count);
+		stats_count += mod->stats_count;
+	}
+
+	return stats_dump_one_line(stats, stats_count, appctx);
+}
+
+/* Helper to compute srv values for a given backend
+ */
+static void stats_fill_be_stats_computesrv(struct proxy *px, int *nbup, int *nbsrv, int *totuw)
+{
+	int nbup_tmp, nbsrv_tmp, totuw_tmp;
+	const struct server *srv;
+
+	nbup_tmp = nbsrv_tmp = totuw_tmp = 0;
+	for (srv = px->srv; srv; srv = srv->next) {
+		if (srv->cur_state != SRV_ST_STOPPED) {
+			nbup_tmp++;
+			if (srv_currently_usable(srv) &&
+			    (!px->srv_act ^ !(srv->flags & SRV_F_BACKUP)))
+				totuw_tmp += srv->uweight;
+		}
+		nbsrv_tmp++;
+	}
+
+	HA_RWLOCK_RDLOCK(LBPRM_LOCK, &px->lbprm.lock);
+	if (!px->srv_act && px->lbprm.fbck)
+		totuw_tmp = px->lbprm.fbck->uweight;
+	HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &px->lbprm.lock);
+
+	/* use tmp variable then assign result to make gcc happy */
+	*nbup = nbup_tmp;
+	*nbsrv = nbsrv_tmp;
+	*totuw = totuw_tmp;
+}
+
+/* Fill <stats> with the backend statistics. <stats> is a preallocated array of
+ * length <len>. If <selected_field> is != NULL, only fill this one. The length
+ * of the array must be at least ST_F_TOTAL_FIELDS. If this length is less than
+ * this value, or if the selected field is not implemented for backends, the
+ * function returns 0, otherwise, it returns 1. <flags> can take the value
+ * STAT_SHLGNDS.
+ */
+int stats_fill_be_stats(struct proxy *px, int flags, struct field *stats, int len,
+                        enum stat_field *selected_field)
+{
+	enum stat_field current_field = (selected_field != NULL ? *selected_field : 0);
+	long long be_samples_counter;
+	unsigned int be_samples_window = TIME_STATS_SAMPLES;
+	struct buffer *out = get_trash_chunk();
+	int nbup, nbsrv, totuw;
+	char *fld;
+
+	if (len < ST_F_TOTAL_FIELDS)
+		return 0;
+
+	nbup = nbsrv = totuw = 0;
+	/* compute some server values for later use, either because all fields
+	 * are selected or because one of the mentioned ones needs them */
+	if (selected_field == NULL || *selected_field == ST_F_STATUS ||
+	    *selected_field == ST_F_UWEIGHT)
+		stats_fill_be_stats_computesrv(px, &nbup, &nbsrv, &totuw);
+
+	/* same here but specific to time fields */
+	if (selected_field == NULL || *selected_field == ST_F_QTIME ||
+	    *selected_field == ST_F_CTIME || *selected_field == ST_F_RTIME ||
+	    *selected_field == ST_F_TTIME) {
+		be_samples_counter = (px->mode == PR_MODE_HTTP) ? px->be_counters.p.http.cum_req : px->be_counters.cum_lbconn;
+		if (be_samples_counter < TIME_STATS_SAMPLES && be_samples_counter > 0)
+			be_samples_window = be_samples_counter;
+	}
+
+	for (; current_field < ST_F_TOTAL_FIELDS; current_field++) {
+		struct field metric = { 0 };
+
+		switch (current_field) {
+		case ST_F_PXNAME:
+			metric = mkf_str(FO_KEY|FN_NAME|FS_SERVICE, px->id);
+			break;
+		case ST_F_SVNAME:
+			metric = mkf_str(FO_KEY|FN_NAME|FS_SERVICE, "BACKEND");
+			break;
+		case ST_F_MODE:
+			metric = mkf_str(FO_CONFIG|FS_SERVICE, proxy_mode_str(px->mode));
+			break;
+		case ST_F_QCUR:
+			metric = mkf_u32(0, px->queue.length);
+			break;
+		case ST_F_QMAX:
+			metric = mkf_u32(FN_MAX, px->be_counters.nbpend_max);
+			break;
+		case ST_F_SCUR:
+			metric = mkf_u32(0, px->beconn);
+			break;
+		case ST_F_SMAX:
+			metric = mkf_u32(FN_MAX, px->be_counters.conn_max);
+			break;
+		case ST_F_SLIM:
+			metric = mkf_u32(FO_CONFIG|FN_LIMIT, px->fullconn);
+			break;
+		case ST_F_STOT:
+			metric = mkf_u64(FN_COUNTER, px->be_counters.cum_conn);
+			break;
+		case ST_F_BIN:
+			metric = mkf_u64(FN_COUNTER, px->be_counters.bytes_in);
+			break;
+		case ST_F_BOUT:
+			metric = mkf_u64(FN_COUNTER, px->be_counters.bytes_out);
+			break;
+		case ST_F_DREQ:
+			metric = mkf_u64(FN_COUNTER, px->be_counters.denied_req);
+			break;
+		case ST_F_DRESP:
+			metric = mkf_u64(FN_COUNTER, px->be_counters.denied_resp);
+			break;
+		case ST_F_ECON:
+			metric = mkf_u64(FN_COUNTER, px->be_counters.failed_conns);
+			break;
+		case ST_F_ERESP:
+			metric = mkf_u64(FN_COUNTER, px->be_counters.failed_resp);
+			break;
+		case ST_F_WRETR:
+			metric = mkf_u64(FN_COUNTER, px->be_counters.retries);
+			break;
+		case ST_F_WREDIS:
+			metric = mkf_u64(FN_COUNTER, px->be_counters.redispatches);
+			break;
+		case ST_F_WREW:
+			metric = mkf_u64(FN_COUNTER, px->be_counters.failed_rewrites);
+			break;
+		case ST_F_EINT:
+			metric = mkf_u64(FN_COUNTER, px->be_counters.internal_errors);
+			break;
+		case ST_F_CONNECT:
+			metric = mkf_u64(FN_COUNTER, px->be_counters.connect);
+			break;
+		case ST_F_REUSE:
+			metric = mkf_u64(FN_COUNTER, px->be_counters.reuse);
+			break;
+		case ST_F_STATUS:
+			fld = chunk_newstr(out);
+			chunk_appendf(out, "%s", (px->lbprm.tot_weight > 0 || !px->srv) ?
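+			      /* a backend with no server at all (e.g. a stats
+			       * proxy) is also reported UP */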
"UP" : "DOWN"); + if (flags & (STAT_HIDE_MAINT|STAT_HIDE_DOWN)) + chunk_appendf(out, " (%d/%d)", nbup, nbsrv); + metric = mkf_str(FO_STATUS, fld); + break; + case ST_F_AGG_SRV_CHECK_STATUS: // DEPRECATED + case ST_F_AGG_SRV_STATUS: + metric = mkf_u32(FN_GAUGE, 0); + break; + case ST_F_AGG_CHECK_STATUS: + metric = mkf_u32(FN_GAUGE, 0); + break; + case ST_F_WEIGHT: + metric = mkf_u32(FN_AVG, (px->lbprm.tot_weight * px->lbprm.wmult + px->lbprm.wdiv - 1) / px->lbprm.wdiv); + break; + case ST_F_UWEIGHT: + metric = mkf_u32(FN_AVG, totuw); + break; + case ST_F_ACT: + metric = mkf_u32(0, px->srv_act); + break; + case ST_F_BCK: + metric = mkf_u32(0, px->srv_bck); + break; + case ST_F_CHKDOWN: + metric = mkf_u64(FN_COUNTER, px->down_trans); + break; + case ST_F_LASTCHG: + metric = mkf_u32(FN_AGE, ns_to_sec(now_ns) - px->last_change); + break; + case ST_F_DOWNTIME: + if (px->srv) + metric = mkf_u32(FN_COUNTER, be_downtime(px)); + break; + case ST_F_PID: + metric = mkf_u32(FO_KEY, 1); + break; + case ST_F_IID: + metric = mkf_u32(FO_KEY|FS_SERVICE, px->uuid); + break; + case ST_F_SID: + metric = mkf_u32(FO_KEY|FS_SERVICE, 0); + break; + case ST_F_LBTOT: + metric = mkf_u64(FN_COUNTER, px->be_counters.cum_lbconn); + break; + case ST_F_TYPE: + metric = mkf_u32(FO_CONFIG|FS_SERVICE, STATS_TYPE_BE); + break; + case ST_F_RATE: + metric = mkf_u32(0, read_freq_ctr(&px->be_sess_per_sec)); + break; + case ST_F_RATE_MAX: + metric = mkf_u32(0, px->be_counters.sps_max); + break; + case ST_F_COOKIE: + if (flags & STAT_SHLGNDS && px->cookie_name) + metric = mkf_str(FO_CONFIG|FN_NAME|FS_SERVICE, px->cookie_name); + break; + case ST_F_ALGO: + if (flags & STAT_SHLGNDS) + metric = mkf_str(FO_CONFIG|FS_SERVICE, backend_lb_algo_str(px->lbprm.algo & BE_LB_ALGO)); + break; + case ST_F_REQ_TOT: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->be_counters.p.http.cum_req); + break; + case ST_F_HRSP_1XX: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->be_counters.p.http.rsp[1]); + break; + case ST_F_HRSP_2XX: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->be_counters.p.http.rsp[2]); + break; + case ST_F_HRSP_3XX: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->be_counters.p.http.rsp[3]); + break; + case ST_F_HRSP_4XX: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->be_counters.p.http.rsp[4]); + break; + case ST_F_HRSP_5XX: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->be_counters.p.http.rsp[5]); + break; + case ST_F_HRSP_OTHER: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->be_counters.p.http.rsp[0]); + break; + case ST_F_CACHE_LOOKUPS: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->be_counters.p.http.cache_lookups); + break; + case ST_F_CACHE_HITS: + if (px->mode == PR_MODE_HTTP) + metric = mkf_u64(FN_COUNTER, px->be_counters.p.http.cache_hits); + break; + case ST_F_CLI_ABRT: + metric = mkf_u64(FN_COUNTER, px->be_counters.cli_aborts); + break; + case ST_F_SRV_ABRT: + metric = mkf_u64(FN_COUNTER, px->be_counters.srv_aborts); + break; + case ST_F_COMP_IN: + metric = mkf_u64(FN_COUNTER, px->be_counters.comp_in[COMP_DIR_RES]); + break; + case ST_F_COMP_OUT: + metric = mkf_u64(FN_COUNTER, px->be_counters.comp_out[COMP_DIR_RES]); + break; + case ST_F_COMP_BYP: + metric = mkf_u64(FN_COUNTER, px->be_counters.comp_byp[COMP_DIR_RES]); + break; + case ST_F_COMP_RSP: + metric = mkf_u64(FN_COUNTER, px->be_counters.p.http.comp_rsp); + break; + case ST_F_LASTSESS: + metric = 
mkf_s32(FN_AGE, be_lastsession(px));
+			break;
+		case ST_F_QTIME:
+			metric = mkf_u32(FN_AVG, swrate_avg(px->be_counters.q_time, be_samples_window));
+			break;
+		case ST_F_CTIME:
+			metric = mkf_u32(FN_AVG, swrate_avg(px->be_counters.c_time, be_samples_window));
+			break;
+		case ST_F_RTIME:
+			metric = mkf_u32(FN_AVG, swrate_avg(px->be_counters.d_time, be_samples_window));
+			break;
+		case ST_F_TTIME:
+			metric = mkf_u32(FN_AVG, swrate_avg(px->be_counters.t_time, be_samples_window));
+			break;
+		case ST_F_QT_MAX:
+			metric = mkf_u32(FN_MAX, px->be_counters.qtime_max);
+			break;
+		case ST_F_CT_MAX:
+			metric = mkf_u32(FN_MAX, px->be_counters.ctime_max);
+			break;
+		case ST_F_RT_MAX:
+			metric = mkf_u32(FN_MAX, px->be_counters.dtime_max);
+			break;
+		case ST_F_TT_MAX:
+			metric = mkf_u32(FN_MAX, px->be_counters.ttime_max);
+			break;
+		default:
+			/* not used for backends. If a specific metric
+			 * is requested, return an error. Otherwise continue.
+			 */
+			if (selected_field != NULL)
+				return 0;
+			continue;
+		}
+		stats[current_field] = metric;
+		if (selected_field != NULL)
+			break;
+	}
+	return 1;
+}
+
+/* Dumps a line for backend <px> to the local trash buffer and uses the state
+ * from stream connector <sc>. The caller is responsible for clearing the
+ * local trash buffer if needed. Returns non-zero if it emits anything, zero
+ * otherwise.
+ */
+static int stats_dump_be_stats(struct stconn *sc, struct proxy *px)
+{
+	struct appctx *appctx = __sc_appctx(sc);
+	struct show_stat_ctx *ctx = appctx->svcctx;
+	struct field *stats = stat_l[STATS_DOMAIN_PROXY];
+	struct stats_module *mod;
+	size_t stats_count = ST_F_TOTAL_FIELDS;
+
+	if (!(px->cap & PR_CAP_BE))
+		return 0;
+
+	if ((ctx->flags & STAT_BOUND) && !(ctx->type & (1 << STATS_TYPE_BE)))
+		return 0;
+
+	memset(stats, 0, sizeof(struct field) * stat_count[STATS_DOMAIN_PROXY]);
+
+	if (!stats_fill_be_stats(px, ctx->flags, stats, ST_F_TOTAL_FIELDS, NULL))
+		return 0;
+
+	list_for_each_entry(mod, &stats_module_list[STATS_DOMAIN_PROXY], list) {
+		struct extra_counters *counters;
+
+		if (stats_get_domain(mod->domain_flags) != STATS_DOMAIN_PROXY)
+			continue;
+
+		if (!(stats_px_get_cap(mod->domain_flags) & STATS_PX_CAP_BE)) {
+			stats_count += mod->stats_count;
+			continue;
+		}
+
+		counters = EXTRA_COUNTERS_GET(px->extra_counters_be, mod);
+		mod->fill_stats(counters, stats + stats_count);
+		stats_count += mod->stats_count;
+	}
+
+	return stats_dump_one_line(stats, stats_count, appctx);
+}
+
+/* Dumps the HTML table header for proxy <px> to the local trash buffer and
+ * uses the state from stream connector <sc>. The caller is responsible for
+ * clearing the local trash buffer if needed.
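+ *
+ * A simplified sketch of how stats_dump_proxy_to_buffer() below drives it
+ * (error handling trimmed):
+ *
+ *   stats_dump_html_px_hdr(sc, px);
+ *   if (!stats_putchk(appctx, htx))
+ *           goto full;  -- buffer full, resume here on the next call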
+ */ +static void stats_dump_html_px_hdr(struct stconn *sc, struct proxy *px) +{ + struct appctx *appctx = __sc_appctx(sc); + struct show_stat_ctx *ctx = appctx->svcctx; + char scope_txt[STAT_SCOPE_TXT_MAXLEN + sizeof STAT_SCOPE_PATTERN]; + struct stats_module *mod; + int stats_module_len = 0; + + if (px->cap & PR_CAP_BE && px->srv && (ctx->flags & STAT_ADMIN)) { + /* A form to enable/disable this proxy servers */ + + /* scope_txt = search pattern + search query, ctx->scope_len is always <= STAT_SCOPE_TXT_MAXLEN */ + scope_txt[0] = 0; + if (ctx->scope_len) { + const char *scope_ptr = stats_scope_ptr(appctx, sc); + + strlcpy2(scope_txt, STAT_SCOPE_PATTERN, sizeof(scope_txt)); + memcpy(scope_txt + strlen(STAT_SCOPE_PATTERN), scope_ptr, ctx->scope_len); + scope_txt[strlen(STAT_SCOPE_PATTERN) + ctx->scope_len] = 0; + } + + chunk_appendf(&trash_chunk, + "<form method=\"post\">"); + } + + /* print a new table */ + chunk_appendf(&trash_chunk, + "<table class=\"tbl\" width=\"100%%\">\n" + "<tr class=\"titre\">" + "<th class=\"pxname\" width=\"10%%\">"); + + chunk_appendf(&trash_chunk, + "<a name=\"%s\"></a>%s" + "<a class=px href=\"#%s\">%s</a>", + px->id, + (ctx->flags & STAT_SHLGNDS) ? "<u>":"", + px->id, px->id); + + if (ctx->flags & STAT_SHLGNDS) { + /* cap, mode, id */ + chunk_appendf(&trash_chunk, "<div class=tips>cap: %s, mode: %s, id: %d", + proxy_cap_str(px->cap), proxy_mode_str(px->mode), + px->uuid); + chunk_appendf(&trash_chunk, "</div>"); + } + + chunk_appendf(&trash_chunk, + "%s</th>" + "<th class=\"%s\" width=\"90%%\">%s</th>" + "</tr>\n" + "</table>\n" + "<table class=\"tbl\" width=\"100%%\">\n" + "<tr class=\"titre\">", + (ctx->flags & STAT_SHLGNDS) ? "</u>":"", + px->desc ? "desc" : "empty", px->desc ? px->desc : ""); + + if (ctx->flags & STAT_ADMIN) { + /* Column heading for Enable or Disable server */ + if ((px->cap & PR_CAP_BE) && px->srv) + chunk_appendf(&trash_chunk, + "<th rowspan=2 width=1><input type=\"checkbox\" " + "onclick=\"for(c in document.getElementsByClassName('%s-checkbox')) " + "document.getElementsByClassName('%s-checkbox').item(c).checked = this.checked\"></th>", + px->id, + px->id); + else + chunk_appendf(&trash_chunk, "<th rowspan=2></th>"); + } + + chunk_appendf(&trash_chunk, + "<th rowspan=2></th>" + "<th colspan=3>Queue</th>" + "<th colspan=3>Session rate</th><th colspan=6>Sessions</th>" + "<th colspan=2>Bytes</th><th colspan=2>Denied</th>" + "<th colspan=3>Errors</th><th colspan=2>Warnings</th>" + "<th colspan=9>Server</th>"); + + if (ctx->flags & STAT_SHMODULES) { + // calculate the count of module for colspan attribute + list_for_each_entry(mod, &stats_module_list[STATS_DOMAIN_PROXY], list) { + ++stats_module_len; + } + chunk_appendf(&trash_chunk, "<th colspan=%d>Extra modules</th>", + stats_module_len); + } + + chunk_appendf(&trash_chunk, + "</tr>\n" + "<tr class=\"titre\">" + "<th>Cur</th><th>Max</th><th>Limit</th>" + "<th>Cur</th><th>Max</th><th>Limit</th><th>Cur</th><th>Max</th>" + "<th>Limit</th><th>Total</th><th>LbTot</th><th>Last</th><th>In</th><th>Out</th>" + "<th>Req</th><th>Resp</th><th>Req</th><th>Conn</th>" + "<th>Resp</th><th>Retr</th><th>Redis</th>" + "<th>Status</th><th>LastChk</th><th>Wght</th><th>Act</th>" + "<th>Bck</th><th>Chk</th><th>Dwn</th><th>Dwntme</th>" + "<th>Thrtle</th>\n"); + + if (ctx->flags & STAT_SHMODULES) { + list_for_each_entry(mod, &stats_module_list[STATS_DOMAIN_PROXY], list) { + chunk_appendf(&trash_chunk, "<th>%s</th>", mod->name); + } + } + + chunk_appendf(&trash_chunk, "</tr>"); +} + +/* Dumps the HTML table 
trailer for proxy <px> to the local trash buffer and
+ * uses the state from stream connector <sc>. The caller is responsible for
+ * clearing the local trash buffer if needed.
+ */
+static void stats_dump_html_px_end(struct stconn *sc, struct proxy *px)
+{
+	struct appctx *appctx = __sc_appctx(sc);
+	struct show_stat_ctx *ctx = appctx->svcctx;
+
+	chunk_appendf(&trash_chunk, "</table>");
+
+	if ((px->cap & PR_CAP_BE) && px->srv && (ctx->flags & STAT_ADMIN)) {
+		/* close the form used to enable/disable this proxy's servers */
+		chunk_appendf(&trash_chunk,
+			      "Choose the action to perform on the checked servers : "
+			      "<select name=action>"
+			      "<option value=\"\"></option>"
+			      "<option value=\"ready\">Set state to READY</option>"
+			      "<option value=\"drain\">Set state to DRAIN</option>"
+			      "<option value=\"maint\">Set state to MAINT</option>"
+			      "<option value=\"dhlth\">Health: disable checks</option>"
+			      "<option value=\"ehlth\">Health: enable checks</option>"
+			      "<option value=\"hrunn\">Health: force UP</option>"
+			      "<option value=\"hnolb\">Health: force NOLB</option>"
+			      "<option value=\"hdown\">Health: force DOWN</option>"
+			      "<option value=\"dagent\">Agent: disable checks</option>"
+			      "<option value=\"eagent\">Agent: enable checks</option>"
+			      "<option value=\"arunn\">Agent: force UP</option>"
+			      "<option value=\"adown\">Agent: force DOWN</option>"
+			      "<option value=\"shutdown\">Kill Sessions</option>"
+			      "</select>"
+			      "<input type=\"hidden\" name=\"b\" value=\"#%d\">"
+			      " <input type=\"submit\" value=\"Apply\">"
+			      "</form>",
+			      px->uuid);
+	}
+
+	chunk_appendf(&trash_chunk, "<p>\n");
+}
+
+/*
+ * Dumps statistics for a proxy. The output is sent to the stream connector's
+ * input buffer. Returns 0 if it had to stop dumping data because of lack of
+ * buffer space, or non-zero if everything completed. This function is used
+ * both by the CLI and the HTTP entry points, and is able to dump the output
+ * in HTML or CSV formats.
+ */
+int stats_dump_proxy_to_buffer(struct stconn *sc, struct htx *htx,
+                               struct proxy *px)
+{
+	struct appctx *appctx = __sc_appctx(sc);
+	struct show_stat_ctx *ctx = appctx->svcctx;
+	struct channel *rep = sc_ic(sc);
+	struct server *sv, *svs;	/* server and server-state, server-state=server or server->track */
+	struct listener *l;
+	struct uri_auth *uri = NULL;
+	int current_field;
+	int px_st = ctx->px_st;
+
+	if (ctx->http_px)
+		uri = ctx->http_px->uri_auth;
+	chunk_reset(&trash_chunk);
+more:
+	current_field = ctx->field;
+
+	switch (ctx->px_st) {
+	case STAT_PX_ST_INIT:
+		/* we are on a new proxy */
+		if (uri && uri->scope) {
+			/* we have a limited scope, we have to check the proxy name */
+			struct stat_scope *scope;
+			int len;
+
+			len = strlen(px->id);
+			scope = uri->scope;
+
+			while (scope) {
+				/* match exact proxy name */
+				if (scope->px_len == len && !memcmp(px->id, scope->px_id, len))
+					break;
+
+				/* match '.' which means 'self' proxy */
+				if (strcmp(scope->px_id, ".") == 0 && px == ctx->http_px)
+					break;
+				scope = scope->next;
+			}
+
+			/* proxy name not found: don't dump anything */
+			if (scope == NULL)
+				return 1;
+		}
+
+		/* if the user has requested a limited output and the proxy
+		 * name does not match, skip it.
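+		 * For example, a scope filter of "web" only keeps proxies
+		 * whose name contains that substring (see the strnistr()
+		 * check right below).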
+		 */
+		if (ctx->scope_len) {
+			const char *scope_ptr = stats_scope_ptr(appctx, sc);
+
+			if (strnistr(px->id, strlen(px->id), scope_ptr, ctx->scope_len) == NULL)
+				return 1;
+		}
+
+		if ((ctx->flags & STAT_BOUND) &&
+		    (ctx->iid != -1) &&
+		    (px->uuid != ctx->iid))
+			return 1;
+
+		ctx->px_st = STAT_PX_ST_TH;
+		__fallthrough;
+
+	case STAT_PX_ST_TH:
+		if (ctx->flags & STAT_FMT_HTML) {
+			stats_dump_html_px_hdr(sc, px);
+			if (!stats_putchk(appctx, htx))
+				goto full;
+		}
+
+		ctx->px_st = STAT_PX_ST_FE;
+		__fallthrough;
+
+	case STAT_PX_ST_FE:
+		/* print the frontend */
+		if (stats_dump_fe_stats(sc, px)) {
+			if (!stats_putchk(appctx, htx))
+				goto full;
+			ctx->flags |= STAT_STARTED;
+			if (ctx->field)
+				goto more;
+		}
+
+		current_field = 0;
+		ctx->obj2 = px->conf.listeners.n;
+		ctx->px_st = STAT_PX_ST_LI;
+		__fallthrough;
+
+	case STAT_PX_ST_LI:
+		/* obj2 points to listeners list as initialized above */
+		for (; ctx->obj2 != &px->conf.listeners; ctx->obj2 = l->by_fe.n) {
+			if (htx) {
+				if (htx_almost_full(htx)) {
+					sc_need_room(sc, htx->size / 2);
+					goto full;
+				}
+			}
+			else {
+				if (buffer_almost_full(&rep->buf)) {
+					sc_need_room(sc, b_size(&rep->buf) / 2);
+					goto full;
+				}
+			}
+
+			l = LIST_ELEM(ctx->obj2, struct listener *, by_fe);
+			if (!l->counters)
+				continue;
+
+			if (ctx->flags & STAT_BOUND) {
+				if (!(ctx->type & (1 << STATS_TYPE_SO)))
+					break;
+
+				if (ctx->sid != -1 && l->luid != ctx->sid)
+					continue;
+			}
+
+			/* print the listener */
+			if (stats_dump_li_stats(sc, px, l)) {
+				if (!stats_putchk(appctx, htx))
+					goto full;
+				ctx->flags |= STAT_STARTED;
+				if (ctx->field)
+					goto more;
+			}
+			current_field = 0;
+		}
+
+		ctx->obj2 = px->srv; /* may be NULL */
+		ctx->px_st = STAT_PX_ST_SV;
+		__fallthrough;
+
+	case STAT_PX_ST_SV:
+		/* check for dump resumption */
+		if (px_st == STAT_PX_ST_SV) {
+			struct server *cur = ctx->obj2;
+
+			/* re-entrant dump */
+			BUG_ON(!cur);
+			if (cur->flags & SRV_F_DELETED) {
+				/* the server could have been marked as deleted
+				 * between two dumping attempts, skip it.
+				 */
+				cur = cur->next;
+			}
+			srv_drop(ctx->obj2); /* drop old srv taken on last dumping attempt */
+			ctx->obj2 = cur; /* could be NULL */
+			/* back to normal */
+		}
+
+		/* obj2 points to servers list as initialized above.
+		 *
+		 * A server may be removed during the stats dumping.
+		 * Temporarily increment its refcount to prevent it from
+		 * being freed too early. Call srv_drop() to release it.
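+		 *
+		 * The protection below follows the usual take/drop shape
+		 * (simplified):
+		 *
+		 *   sv = ctx->obj2;
+		 *   srv_take(sv);              -- pin sv across the dump
+		 *   ... dump it, possibly yielding on a full buffer ...
+		 *   ctx->obj2 = srv_drop(sv);  -- unpin and advance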
+ */ + for (; ctx->obj2 != NULL; + ctx->obj2 = srv_drop(sv)) { + + sv = ctx->obj2; + srv_take(sv); + + if (htx) { + if (htx_almost_full(htx)) { + sc_need_room(sc, htx->size / 2); + goto full; + } + } + else { + if (buffer_almost_full(&rep->buf)) { + sc_need_room(sc, b_size(&rep->buf) / 2); + goto full; + } + } + + if (ctx->flags & STAT_BOUND) { + if (!(ctx->type & (1 << STATS_TYPE_SV))) { + srv_drop(sv); + break; + } + + if (ctx->sid != -1 && sv->puid != ctx->sid) + continue; + } + + /* do not report disabled servers */ + if (ctx->flags & STAT_HIDE_MAINT && + sv->cur_admin & SRV_ADMF_MAINT) { + continue; + } + + svs = sv; + while (svs->track) + svs = svs->track; + + /* do not report servers which are DOWN and not changing state */ + if ((ctx->flags & STAT_HIDE_DOWN) && + ((sv->cur_admin & SRV_ADMF_MAINT) || /* server is in maintenance */ + (sv->cur_state == SRV_ST_STOPPED && /* server is down */ + (!((svs->agent.state | svs->check.state) & CHK_ST_ENABLED) || + ((svs->agent.state & CHK_ST_ENABLED) && !svs->agent.health) || + ((svs->check.state & CHK_ST_ENABLED) && !svs->check.health))))) { + continue; + } + + if (stats_dump_sv_stats(sc, px, sv)) { + if (!stats_putchk(appctx, htx)) + goto full; + ctx->flags |= STAT_STARTED; + if (ctx->field) + goto more; + } + current_field = 0; + } /* for sv */ + + ctx->px_st = STAT_PX_ST_BE; + __fallthrough; + + case STAT_PX_ST_BE: + /* print the backend */ + if (stats_dump_be_stats(sc, px)) { + if (!stats_putchk(appctx, htx)) + goto full; + ctx->flags |= STAT_STARTED; + if (ctx->field) + goto more; + } + + current_field = 0; + ctx->px_st = STAT_PX_ST_END; + __fallthrough; + + case STAT_PX_ST_END: + if (ctx->flags & STAT_FMT_HTML) { + stats_dump_html_px_end(sc, px); + if (!stats_putchk(appctx, htx)) + goto full; + } + + ctx->px_st = STAT_PX_ST_FIN; + __fallthrough; + + case STAT_PX_ST_FIN: + return 1; + + default: + /* unknown state, we should put an abort() here ! */ + return 1; + } + + full: + /* restore previous field */ + ctx->field = current_field; + return 0; +} + +/* Dumps the HTTP stats head block to the local trash buffer and uses the + * per-uri parameters from the parent proxy. The caller is responsible for + * clearing the local trash buffer if needed. + */ +static void stats_dump_html_head(struct appctx *appctx) +{ + struct show_stat_ctx *ctx = appctx->svcctx; + struct uri_auth *uri; + + BUG_ON(!ctx->http_px); + uri = ctx->http_px->uri_auth; + + /* WARNING! This must fit in the first buffer !!! 
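+ * (it is emitted in one piece with no resumption point, so it can never be
+ * split across two buffers)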
*/ + chunk_appendf(&trash_chunk, + "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"\n" + "\"http://www.w3.org/TR/html4/loose.dtd\">\n" + "<html><head><title>Statistics Report for " PRODUCT_NAME "%s%s</title>\n" + "<link rel=\"icon\" href=\"data:,\">\n" + "<meta http-equiv=\"content-type\" content=\"text/html; charset=iso-8859-1\">\n" + "<style type=\"text/css\"><!--\n" + "body {" + " font-family: arial, helvetica, sans-serif;" + " font-size: 12px;" + " font-weight: normal;" + " color: black;" + " background: white;" + "}\n" + "th,td {" + " font-size: 10px;" + "}\n" + "h1 {" + " font-size: x-large;" + " margin-bottom: 0.5em;" + "}\n" + "h2 {" + " font-family: helvetica, arial;" + " font-size: x-large;" + " font-weight: bold;" + " font-style: italic;" + " color: #6020a0;" + " margin-top: 0em;" + " margin-bottom: 0em;" + "}\n" + "h3 {" + " font-family: helvetica, arial;" + " font-size: 16px;" + " font-weight: bold;" + " color: #b00040;" + " background: #e8e8d0;" + " margin-top: 0em;" + " margin-bottom: 0em;" + "}\n" + "li {" + " margin-top: 0.25em;" + " margin-right: 2em;" + "}\n" + ".hr {margin-top: 0.25em;" + " border-color: black;" + " border-bottom-style: solid;" + "}\n" + ".titre {background: #20D0D0;color: #000000; font-weight: bold; text-align: center;}\n" + ".total {background: #20D0D0;color: #ffff80;}\n" + ".frontend {background: #e8e8d0;}\n" + ".socket {background: #d0d0d0;}\n" + ".backend {background: #e8e8d0;}\n" + ".active_down {background: #ff9090;}\n" + ".active_going_up {background: #ffd020;}\n" + ".active_going_down {background: #ffffa0;}\n" + ".active_up {background: #c0ffc0;}\n" + ".active_nolb {background: #20a0ff;}\n" + ".active_draining {background: #20a0FF;}\n" + ".active_no_check {background: #e0e0e0;}\n" + ".backup_down {background: #ff9090;}\n" + ".backup_going_up {background: #ff80ff;}\n" + ".backup_going_down {background: #c060ff;}\n" + ".backup_up {background: #b0d0ff;}\n" + ".backup_nolb {background: #90b0e0;}\n" + ".backup_draining {background: #cc9900;}\n" + ".backup_no_check {background: #e0e0e0;}\n" + ".maintain {background: #c07820;}\n" + ".rls {letter-spacing: 0.2em; margin-right: 1px;}\n" /* right letter spacing (used for grouping digits) */ + "\n" + "a.px:link {color: #ffff40; text-decoration: none;}" + "a.px:visited {color: #ffff40; text-decoration: none;}" + "a.px:hover {color: #ffffff; text-decoration: none;}" + "a.lfsb:link {color: #000000; text-decoration: none;}" + "a.lfsb:visited {color: #000000; text-decoration: none;}" + "a.lfsb:hover {color: #505050; text-decoration: none;}" + "\n" + "table.tbl { border-collapse: collapse; border-style: none;}\n" + "table.tbl td { text-align: right; border-width: 1px 1px 1px 1px; border-style: solid solid solid solid; padding: 2px 3px; border-color: gray; white-space: nowrap;}\n" + "table.tbl td.ac { text-align: center;}\n" + "table.tbl th { border-width: 1px; border-style: solid solid solid solid; border-color: gray;}\n" + "table.tbl th.pxname { background: #b00040; color: #ffff40; font-weight: bold; border-style: solid solid none solid; padding: 2px 3px; white-space: nowrap;}\n" + "table.tbl th.empty { border-style: none; empty-cells: hide; background: white;}\n" + "table.tbl th.desc { background: white; border-style: solid solid none solid; text-align: left; padding: 2px 3px;}\n" + "\n" + "table.lgd { border-collapse: collapse; border-width: 1px; border-style: none none none solid; border-color: black;}\n" + "table.lgd td { border-width: 1px; border-style: solid solid solid solid; 
border-color: gray; padding: 2px;}\n" + "table.lgd td.noborder { border-style: none; padding: 2px; white-space: nowrap;}\n" + "table.det { border-collapse: collapse; border-style: none; }\n" + "table.det th { text-align: left; border-width: 0px; padding: 0px 1px 0px 0px; font-style:normal;font-size:11px;font-weight:bold;font-family: sans-serif;}\n" + "table.det td { text-align: right; border-width: 0px; padding: 0px 0px 0px 4px; white-space: nowrap; font-style:normal;font-size:11px;font-weight:normal;}\n" + "u {text-decoration:none; border-bottom: 1px dotted black;}\n" + "div.tips {\n" + " display:block;\n" + " visibility:hidden;\n" + " z-index:2147483647;\n" + " position:absolute;\n" + " padding:2px 4px 3px;\n" + " background:#f0f060; color:#000000;\n" + " border:1px solid #7040c0;\n" + " white-space:nowrap;\n" + " font-style:normal;font-size:11px;font-weight:normal;\n" + " -moz-border-radius:3px;-webkit-border-radius:3px;border-radius:3px;\n" + " -moz-box-shadow:gray 2px 2px 3px;-webkit-box-shadow:gray 2px 2px 3px;box-shadow:gray 2px 2px 3px;\n" + "}\n" + "u:hover div.tips {visibility:visible;}\n" + "@media (prefers-color-scheme: dark) {\n" + " body { font-family: arial, helvetica, sans-serif; font-size: 12px; font-weight: normal; color: #e8e6e3; background: #131516;}\n" + " h1 { color: #a265e0!important; }\n" + " h2 { color: #a265e0; }\n" + " h3 { color: #ff5190; background-color: #3e3e1f; }\n" + " a { color: #3391ff; }\n" + " input { background-color: #2f3437; }\n" + " .hr { border-color: #8c8273; }\n" + " .titre { background-color: #1aa6a6; color: #e8e6e3; }\n" + " .frontend {background: #2f3437;}\n" + " .socket {background: #2a2d2f;}\n" + " .backend {background: #2f3437;}\n" + " .active_down {background: #760000;}\n" + " .active_going_up {background: #b99200;}\n" + " .active_going_down {background: #6c6c00;}\n" + " .active_up {background: #165900;}\n" + " .active_nolb {background: #006ab9;}\n" + " .active_draining {background: #006ab9;}\n" + " .active_no_check {background: #2a2d2f;}\n" + " .backup_down {background: #760000;}\n" + " .backup_going_up {background: #7f007f;}\n" + " .backup_going_down {background: #580092;}\n" + " .backup_up {background: #2e3234;}\n" + " .backup_nolb {background: #1e3c6a;}\n" + " .backup_draining {background: #a37a00;}\n" + " .backup_no_check {background: #2a2d2f;}\n" + " .maintain {background: #9a601a;}\n" + " a.px:link {color: #d8d83b; text-decoration: none;}\n" + " a.px:visited {color: #d8d83b; text-decoration: none;}\n" + " a.px:hover {color: #ffffff; text-decoration: none;}\n" + " a.lfsb:link {color: #e8e6e3; text-decoration: none;}\n" + " a.lfsb:visited {color: #e8e6e3; text-decoration: none;}\n" + " a.lfsb:hover {color: #b5afa6; text-decoration: none;}\n" + " table.tbl th.empty { background-color: #181a1b; }\n" + " table.tbl th.desc { background: #181a1b; }\n" + " table.tbl th.pxname { background-color: #8d0033; color: #ffff46; }\n" + " table.tbl th { border-color: #808080; }\n" + " table.tbl td { border-color: #808080; }\n" + " u {text-decoration:none; border-bottom: 1px dotted #e8e6e3;}\n" + " div.tips {\n" + " background:#8e8e0d;\n" + " color:#e8e6e3;\n" + " border-color: #4e2c86;\n" + " -moz-box-shadow: #60686c 2px 2px 3px;\n" + " -webkit-box-shadow: #60686c 2px 2px 3px;\n" + " box-shadow: #60686c 2px 2px 3px;\n" + " }\n" + "}\n" + "-->\n" + "</style></head>\n", + (ctx->flags & STAT_SHNODE) ? " on " : "", + (ctx->flags & STAT_SHNODE) ? (uri && uri->node ? 
uri->node : global.node) : ""
+	              );
+}
+
+/* Dumps the HTML stats information block to the local trash buffer and uses
+ * the state from stream connector <sc> and per-uri parameter from the parent
+ * proxy. The caller is responsible for clearing the local trash buffer if
+ * needed.
+ */
+static void stats_dump_html_info(struct stconn *sc)
+{
+	struct appctx *appctx = __sc_appctx(sc);
+	struct show_stat_ctx *ctx = appctx->svcctx;
+	unsigned int up = ns_to_sec(now_ns - start_time_ns);
+	char scope_txt[STAT_SCOPE_TXT_MAXLEN + sizeof STAT_SCOPE_PATTERN];
+	const char *scope_ptr = stats_scope_ptr(appctx, sc);
+	struct uri_auth *uri;
+	unsigned long long bps;
+	int thr;
+
+	BUG_ON(!ctx->http_px);
+	uri = ctx->http_px->uri_auth;
+	for (bps = thr = 0; thr < global.nbthread; thr++)
+		bps += 32ULL * read_freq_ctr(&ha_thread_ctx[thr].out_32bps);
+
+	/* Turn the bytes per second to bits per second and take care of the
+	 * usual ethernet overhead in order to help figure how far we are from
+	 * interface saturation since it's the only case which usually matters.
+	 * For this we count the total size of an Ethernet frame on the wire
+	 * including preamble and IFG (1538) for the largest TCP segment it
+	 * transports (1448 with TCP timestamps). This is not valid for smaller
+	 * packets (under-estimated), but it gives a reasonably accurate
+	 * estimation of how far we are from uplink saturation. For example,
+	 * 120 MB/s of measured payload reports as roughly 1.02 Gbps on the
+	 * wire.
+	 */
+	bps = bps * 8 * 1538 / 1448;
+
+	/* WARNING! this has to fit the first packet too.
+	 * We are around 3.5 kB, and adding entries will
+	 * become tricky if we want to support 4kB buffers !
+	 */
+	chunk_appendf(&trash_chunk,
+	              "<body><h1><a href=\"" PRODUCT_URL "\" style=\"text-decoration: none;\">"
+	              PRODUCT_NAME "%s</a></h1>\n"
+	              "<h2>Statistics Report for pid %d%s%s%s%s</h2>\n"
+	              "<hr width=\"100%%\" class=\"hr\">\n"
+	              "<h3>&gt; General process information</h3>\n"
+	              "<table border=0><tr><td align=\"left\" nowrap width=\"1%%\">\n"
+	              "<p><b>pid = </b> %d (process #%d, nbproc = %d, nbthread = %d)<br>\n"
+	              "<b>uptime = </b> %dd %dh%02dm%02ds; warnings = %u<br>\n"
+	              "<b>system limits:</b> memmax = %s%s; ulimit-n = %d<br>\n"
+	              "<b>maxsock = </b> %d; <b>maxconn = </b> %d; <b>reached = </b> %llu; <b>maxpipes = </b> %d<br>\n"
+	              "current conns = %d; current pipes = %d/%d; conn rate = %d/sec; bit rate = %.3f %cbps<br>\n"
+	              "Running tasks: %d/%d (%d niced); idle = %d %%<br>\n"
+	              "</td><td align=\"center\" nowrap>\n"
+	              "<table class=\"lgd\"><tr>\n"
+	              "<td class=\"active_up\">&nbsp;</td><td class=\"noborder\">active UP </td>"
+	              "<td class=\"backup_up\">&nbsp;</td><td class=\"noborder\">backup UP </td>"
+	              "</tr><tr>\n"
+	              "<td class=\"active_going_down\"></td><td class=\"noborder\">active UP, going down </td>"
+	              "<td class=\"backup_going_down\"></td><td class=\"noborder\">backup UP, going down </td>"
+	              "</tr><tr>\n"
+	              "<td class=\"active_going_up\"></td><td class=\"noborder\">active DOWN, going up </td>"
+	              "<td class=\"backup_going_up\"></td><td class=\"noborder\">backup DOWN, going up </td>"
+	              "</tr><tr>\n"
+	              "<td class=\"active_down\"></td><td class=\"noborder\">active or backup DOWN </td>"
+	              "<td class=\"active_no_check\"></td><td class=\"noborder\">not checked </td>"
+	              "</tr><tr>\n"
+	              "<td class=\"maintain\"></td><td class=\"noborder\" colspan=\"3\">active or backup DOWN for maintenance (MAINT) </td>"
+	              "</tr><tr>\n"
+	              "<td class=\"active_draining\"></td><td class=\"noborder\" colspan=\"3\">active or backup SOFT STOPPED for maintenance </td>"
+	              "</tr></table>\n"
+	              "Note: \"NOLB\"/\"DRAIN\" = UP with load-balancing disabled."
+ "</td>" + "<td align=\"left\" valign=\"top\" nowrap width=\"1%%\">" + "<b>Display option:</b><ul style=\"margin-top: 0.25em;\">" + "", + (ctx->flags & STAT_HIDEVER) ? "" : (stats_version_string), + pid, (ctx->flags & STAT_SHNODE) ? " on " : "", + (ctx->flags & STAT_SHNODE) ? (uri->node ? uri->node : global.node) : "", + (ctx->flags & STAT_SHDESC) ? ": " : "", + (ctx->flags & STAT_SHDESC) ? (uri->desc ? uri->desc : global.desc) : "", + pid, 1, 1, global.nbthread, + up / 86400, (up % 86400) / 3600, + (up % 3600) / 60, (up % 60), + HA_ATOMIC_LOAD(&tot_warnings), + global.rlimit_memmax ? ultoa(global.rlimit_memmax) : "unlimited", + global.rlimit_memmax ? " MB" : "", + global.rlimit_nofile, + global.maxsock, global.maxconn, HA_ATOMIC_LOAD(&maxconn_reached), global.maxpipes, + actconn, pipes_used, pipes_used+pipes_free, read_freq_ctr(&global.conn_per_sec), + bps >= 1000000000UL ? (bps / 1000000000.0) : bps >= 1000000UL ? (bps / 1000000.0) : (bps / 1000.0), + bps >= 1000000000UL ? 'G' : bps >= 1000000UL ? 'M' : 'k', + total_run_queues(), total_allocated_tasks(), total_niced_running_tasks(), clock_report_idle()); + + /* scope_txt = search query, ctx->scope_len is always <= STAT_SCOPE_TXT_MAXLEN */ + memcpy(scope_txt, scope_ptr, ctx->scope_len); + scope_txt[ctx->scope_len] = '\0'; + + chunk_appendf(&trash_chunk, + "<li><form method=\"GET\">Scope : <input value=\"%s\" name=\"" STAT_SCOPE_INPUT_NAME "\" size=\"8\" maxlength=\"%d\" tabindex=\"1\"/></form>\n", + (ctx->scope_len > 0) ? scope_txt : "", + STAT_SCOPE_TXT_MAXLEN); + + /* scope_txt = search pattern + search query, ctx->scope_len is always <= STAT_SCOPE_TXT_MAXLEN */ + scope_txt[0] = 0; + if (ctx->scope_len) { + strlcpy2(scope_txt, STAT_SCOPE_PATTERN, sizeof(scope_txt)); + memcpy(scope_txt + strlen(STAT_SCOPE_PATTERN), scope_ptr, ctx->scope_len); + scope_txt[strlen(STAT_SCOPE_PATTERN) + ctx->scope_len] = 0; + } + + if (ctx->flags & STAT_HIDE_DOWN) + chunk_appendf(&trash_chunk, + "<li><a href=\"%s%s%s%s\">Show all servers</a><br>\n", + uri->uri_prefix, + "", + (ctx->flags & STAT_NO_REFRESH) ? ";norefresh" : "", + scope_txt); + else + chunk_appendf(&trash_chunk, + "<li><a href=\"%s%s%s%s\">Hide 'DOWN' servers</a><br>\n", + uri->uri_prefix, + ";up", + (ctx->flags & STAT_NO_REFRESH) ? ";norefresh" : "", + scope_txt); + + if (uri->refresh > 0) { + if (ctx->flags & STAT_NO_REFRESH) + chunk_appendf(&trash_chunk, + "<li><a href=\"%s%s%s%s\">Enable refresh</a><br>\n", + uri->uri_prefix, + (ctx->flags & STAT_HIDE_DOWN) ? ";up" : "", + "", + scope_txt); + else + chunk_appendf(&trash_chunk, + "<li><a href=\"%s%s%s%s\">Disable refresh</a><br>\n", + uri->uri_prefix, + (ctx->flags & STAT_HIDE_DOWN) ? ";up" : "", + ";norefresh", + scope_txt); + } + + chunk_appendf(&trash_chunk, + "<li><a href=\"%s%s%s%s\">Refresh now</a><br>\n", + uri->uri_prefix, + (ctx->flags & STAT_HIDE_DOWN) ? ";up" : "", + (ctx->flags & STAT_NO_REFRESH) ? ";norefresh" : "", + scope_txt); + + chunk_appendf(&trash_chunk, + "<li><a href=\"%s;csv%s%s\">CSV export</a><br>\n", + uri->uri_prefix, + (uri->refresh > 0) ? ";norefresh" : "", + scope_txt); + + chunk_appendf(&trash_chunk, + "<li><a href=\"%s;json%s%s\">JSON export</a> (<a href=\"%s;json-schema\">schema</a>)<br>\n", + uri->uri_prefix, + (uri->refresh > 0) ? 
";norefresh" : "", + scope_txt, uri->uri_prefix); + + chunk_appendf(&trash_chunk, + "</ul></td>" + "<td align=\"left\" valign=\"top\" nowrap width=\"1%%\">" + "<b>External resources:</b><ul style=\"margin-top: 0.25em;\">\n" + "<li><a href=\"" PRODUCT_URL "\">Primary site</a><br>\n" + "<li><a href=\"" PRODUCT_URL_UPD "\">Updates (v" PRODUCT_BRANCH ")</a><br>\n" + "<li><a href=\"" PRODUCT_URL_DOC "\">Online manual</a><br>\n" + "</ul>" + "</td>" + "</tr></table>\n" + "" + ); + + if (ctx->st_code) { + switch (ctx->st_code) { + case STAT_STATUS_DONE: + chunk_appendf(&trash_chunk, + "<p><div class=active_up>" + "<a class=lfsb href=\"%s%s%s%s\" title=\"Remove this message\">[X]</a> " + "Action processed successfully." + "</div>\n", uri->uri_prefix, + (ctx->flags & STAT_HIDE_DOWN) ? ";up" : "", + (ctx->flags & STAT_NO_REFRESH) ? ";norefresh" : "", + scope_txt); + break; + case STAT_STATUS_NONE: + chunk_appendf(&trash_chunk, + "<p><div class=active_going_down>" + "<a class=lfsb href=\"%s%s%s%s\" title=\"Remove this message\">[X]</a> " + "Nothing has changed." + "</div>\n", uri->uri_prefix, + (ctx->flags & STAT_HIDE_DOWN) ? ";up" : "", + (ctx->flags & STAT_NO_REFRESH) ? ";norefresh" : "", + scope_txt); + break; + case STAT_STATUS_PART: + chunk_appendf(&trash_chunk, + "<p><div class=active_going_down>" + "<a class=lfsb href=\"%s%s%s%s\" title=\"Remove this message\">[X]</a> " + "Action partially processed.<br>" + "Some server names are probably unknown or ambiguous (duplicated names in the backend)." + "</div>\n", uri->uri_prefix, + (ctx->flags & STAT_HIDE_DOWN) ? ";up" : "", + (ctx->flags & STAT_NO_REFRESH) ? ";norefresh" : "", + scope_txt); + break; + case STAT_STATUS_ERRP: + chunk_appendf(&trash_chunk, + "<p><div class=active_down>" + "<a class=lfsb href=\"%s%s%s%s\" title=\"Remove this message\">[X]</a> " + "Action not processed because of invalid parameters." + "<ul>" + "<li>The action is maybe unknown.</li>" + "<li>Invalid key parameter (empty or too long).</li>" + "<li>The backend name is probably unknown or ambiguous (duplicated names).</li>" + "<li>Some server names are probably unknown or ambiguous (duplicated names in the backend).</li>" + "</ul>" + "</div>\n", uri->uri_prefix, + (ctx->flags & STAT_HIDE_DOWN) ? ";up" : "", + (ctx->flags & STAT_NO_REFRESH) ? ";norefresh" : "", + scope_txt); + break; + case STAT_STATUS_EXCD: + chunk_appendf(&trash_chunk, + "<p><div class=active_down>" + "<a class=lfsb href=\"%s%s%s%s\" title=\"Remove this message\">[X]</a> " + "<b>Action not processed : the buffer couldn't store all the data.<br>" + "You should retry with less servers at a time.</b>" + "</div>\n", uri->uri_prefix, + (ctx->flags & STAT_HIDE_DOWN) ? ";up" : "", + (ctx->flags & STAT_NO_REFRESH) ? ";norefresh" : "", + scope_txt); + break; + case STAT_STATUS_DENY: + chunk_appendf(&trash_chunk, + "<p><div class=active_down>" + "<a class=lfsb href=\"%s%s%s%s\" title=\"Remove this message\">[X]</a> " + "<b>Action denied.</b>" + "</div>\n", uri->uri_prefix, + (ctx->flags & STAT_HIDE_DOWN) ? ";up" : "", + (ctx->flags & STAT_NO_REFRESH) ? ";norefresh" : "", + scope_txt); + break; + case STAT_STATUS_IVAL: + chunk_appendf(&trash_chunk, + "<p><div class=active_down>" + "<a class=lfsb href=\"%s%s%s%s\" title=\"Remove this message\">[X]</a> " + "<b>Invalid requests (unsupported method or chunked encoded request).</b>" + "</div>\n", uri->uri_prefix, + (ctx->flags & STAT_HIDE_DOWN) ? ";up" : "", + (ctx->flags & STAT_NO_REFRESH) ? 
";norefresh" : "", + scope_txt); + break; + default: + chunk_appendf(&trash_chunk, + "<p><div class=active_no_check>" + "<a class=lfsb href=\"%s%s%s%s\" title=\"Remove this message\">[X]</a> " + "Unexpected result." + "</div>\n", uri->uri_prefix, + (ctx->flags & STAT_HIDE_DOWN) ? ";up" : "", + (ctx->flags & STAT_NO_REFRESH) ? ";norefresh" : "", + scope_txt); + } + chunk_appendf(&trash_chunk, "<p>\n"); + } +} + +/* Dumps the HTML stats trailer block to the local trash buffer. The caller is + * responsible for clearing the local trash buffer if needed. + */ +static void stats_dump_html_end() +{ + chunk_appendf(&trash_chunk, "</body></html>\n"); +} + +/* Dumps the stats JSON header to the local trash buffer buffer which. The + * caller is responsible for clearing it if needed. + */ +static void stats_dump_json_header() +{ + chunk_strcat(&trash_chunk, "["); +} + + +/* Dumps the JSON stats trailer block to the local trash buffer. The caller is + * responsible for clearing the local trash buffer if needed. + */ +static void stats_dump_json_end() +{ + chunk_strcat(&trash_chunk, "]\n"); +} + +/* Uses <appctx.ctx.stats.obj1> as a pointer to the current proxy and <obj2> as + * a pointer to the current server/listener. + */ +static int stats_dump_proxies(struct stconn *sc, + struct htx *htx) +{ + struct appctx *appctx = __sc_appctx(sc); + struct show_stat_ctx *ctx = appctx->svcctx; + struct channel *rep = sc_ic(sc); + struct proxy *px; + + /* dump proxies */ + while (ctx->obj1) { + if (htx) { + if (htx_almost_full(htx)) { + sc_need_room(sc, htx->size / 2); + goto full; + } + } + else { + if (buffer_almost_full(&rep->buf)) { + sc_need_room(sc, b_size(&rep->buf) / 2); + goto full; + } + } + + px = ctx->obj1; + /* Skip the global frontend proxies and non-networked ones. + * Also skip proxies that were disabled in the configuration + * This change allows retrieving stats from "old" proxies after a reload. + */ + if (!(px->flags & PR_FL_DISABLED) && px->uuid > 0 && + (px->cap & (PR_CAP_FE | PR_CAP_BE)) && !(px->cap & PR_CAP_INT)) { + if (stats_dump_proxy_to_buffer(sc, htx, px) == 0) + return 0; + } + + ctx->obj1 = px->next; + ctx->px_st = STAT_PX_ST_INIT; + ctx->field = 0; + } + + return 1; + + full: + return 0; +} + +/* This function dumps statistics onto the stream connector's read buffer in + * either CSV or HTML format. It returns 0 if it had to stop writing data and + * an I/O is needed, 1 if the dump is finished and the stream must be closed, + * or -1 in case of any error. This function is used by both the CLI and the + * HTTP handlers. 
+ */ +static int stats_dump_stat_to_buffer(struct stconn *sc, struct htx *htx) +{ + struct appctx *appctx = __sc_appctx(sc); + struct show_stat_ctx *ctx = appctx->svcctx; + enum stats_domain domain = ctx->domain; + + chunk_reset(&trash_chunk); + + switch (ctx->state) { + case STAT_STATE_INIT: + ctx->state = STAT_STATE_HEAD; /* let's start producing data */ + __fallthrough; + + case STAT_STATE_HEAD: + if (ctx->flags & STAT_FMT_HTML) + stats_dump_html_head(appctx); + else if (ctx->flags & STAT_JSON_SCHM) + stats_dump_json_schema(&trash_chunk); + else if (ctx->flags & STAT_FMT_JSON) + stats_dump_json_header(); + else if (!(ctx->flags & STAT_FMT_TYPED)) + stats_dump_csv_header(ctx->domain); + + if (!stats_putchk(appctx, htx)) + goto full; + + if (ctx->flags & STAT_JSON_SCHM) { + ctx->state = STAT_STATE_FIN; + return 1; + } + ctx->state = STAT_STATE_INFO; + __fallthrough; + + case STAT_STATE_INFO: + if (ctx->flags & STAT_FMT_HTML) { + stats_dump_html_info(sc); + if (!stats_putchk(appctx, htx)) + goto full; + } + + if (domain == STATS_DOMAIN_PROXY) + ctx->obj1 = proxies_list; + + ctx->px_st = STAT_PX_ST_INIT; + ctx->field = 0; + ctx->state = STAT_STATE_LIST; + __fallthrough; + + case STAT_STATE_LIST: + switch (domain) { + case STATS_DOMAIN_RESOLVERS: + if (!stats_dump_resolvers(sc, stat_l[domain], + stat_count[domain], + &stats_module_list[domain])) { + return 0; + } + break; + + case STATS_DOMAIN_PROXY: + default: + /* dump proxies */ + if (!stats_dump_proxies(sc, htx)) + return 0; + break; + } + + ctx->state = STAT_STATE_END; + __fallthrough; + + case STAT_STATE_END: + if (ctx->flags & (STAT_FMT_HTML|STAT_FMT_JSON)) { + if (ctx->flags & STAT_FMT_HTML) + stats_dump_html_end(); + else + stats_dump_json_end(); + if (!stats_putchk(appctx, htx)) + goto full; + } + + ctx->state = STAT_STATE_FIN; + __fallthrough; + + case STAT_STATE_FIN: + return 1; + + default: + /* unknown state ! */ + ctx->state = STAT_STATE_FIN; + return -1; + } + + full: + return 0; + +} + +/* We reached the stats page through a POST request. The appctx is + * expected to have already been allocated by the caller. + * Parse the posted data and enable/disable servers if necessary. + * Returns 1 if request was parsed or zero if it needs more data. + */ +static int stats_process_http_post(struct stconn *sc) +{ + struct stream *s = __sc_strm(sc); + struct appctx *appctx = __sc_appctx(sc); + struct show_stat_ctx *ctx = appctx->svcctx; + + struct proxy *px = NULL; + struct server *sv = NULL; + + char key[LINESIZE]; + int action = ST_ADM_ACTION_NONE; + int reprocess = 0; + + int total_servers = 0; + int altered_servers = 0; + + char *first_param, *cur_param, *next_param, *end_params; + char *st_cur_param = NULL; + char *st_next_param = NULL; + + struct buffer *temp = get_trash_chunk(); + + struct htx *htx = htxbuf(&s->req.buf); + struct htx_blk *blk; + + /* we need more data */ + if (s->txn->req.msg_state < HTTP_MSG_DONE) { + /* check if we can receive more */ + if (htx_free_data_space(htx) <= global.tune.maxrewrite) { + ctx->st_code = STAT_STATUS_EXCD; + goto out; + } + goto wait; + } + + /* The request was fully received. 
Copy data */ + blk = htx_get_head_blk(htx); + while (blk) { + enum htx_blk_type type = htx_get_blk_type(blk); + + if (type == HTX_BLK_TLR || type == HTX_BLK_EOT) + break; + if (type == HTX_BLK_DATA) { + struct ist v = htx_get_blk_value(htx, blk); + + if (!chunk_memcat(temp, v.ptr, v.len)) { + ctx->st_code = STAT_STATUS_EXCD; + goto out; + } + } + blk = htx_get_next_blk(htx, blk); + } + + first_param = temp->area; + end_params = temp->area + temp->data; + cur_param = next_param = end_params; + *end_params = '\0'; + + ctx->st_code = STAT_STATUS_NONE; + + /* + * Parse the parameters in reverse order to only store the last value. + * From the html form, the backend and the action are at the end. + */ + while (cur_param > first_param) { + char *value; + int poffset, plen; + + cur_param--; + + if ((*cur_param == '&') || (cur_param == first_param)) { + reprocess_servers: + /* Parse the key */ + poffset = (cur_param != first_param ? 1 : 0); + plen = next_param - cur_param + (cur_param == first_param ? 1 : 0); + if ((plen > 0) && (plen <= sizeof(key))) { + strncpy(key, cur_param + poffset, plen); + key[plen - 1] = '\0'; + } else { + ctx->st_code = STAT_STATUS_ERRP; + goto out; + } + + /* Parse the value */ + value = key; + while (*value != '\0' && *value != '=') { + value++; + } + if (*value == '=') { + /* Ok, a value is found, we can mark the end of the key */ + *value++ = '\0'; + } + if (url_decode(key, 1) < 0 || url_decode(value, 1) < 0) + break; + + /* Now we can check the key to see what to do */ + if (!px && (strcmp(key, "b") == 0)) { + if ((px = proxy_be_by_name(value)) == NULL) { + /* the backend name is unknown or ambiguous (duplicate names) */ + ctx->st_code = STAT_STATUS_ERRP; + goto out; + } + } + else if (!action && (strcmp(key, "action") == 0)) { + if (strcmp(value, "ready") == 0) { + action = ST_ADM_ACTION_READY; + } + else if (strcmp(value, "drain") == 0) { + action = ST_ADM_ACTION_DRAIN; + } + else if (strcmp(value, "maint") == 0) { + action = ST_ADM_ACTION_MAINT; + } + else if (strcmp(value, "shutdown") == 0) { + action = ST_ADM_ACTION_SHUTDOWN; + } + else if (strcmp(value, "dhlth") == 0) { + action = ST_ADM_ACTION_DHLTH; + } + else if (strcmp(value, "ehlth") == 0) { + action = ST_ADM_ACTION_EHLTH; + } + else if (strcmp(value, "hrunn") == 0) { + action = ST_ADM_ACTION_HRUNN; + } + else if (strcmp(value, "hnolb") == 0) { + action = ST_ADM_ACTION_HNOLB; + } + else if (strcmp(value, "hdown") == 0) { + action = ST_ADM_ACTION_HDOWN; + } + else if (strcmp(value, "dagent") == 0) { + action = ST_ADM_ACTION_DAGENT; + } + else if (strcmp(value, "eagent") == 0) { + action = ST_ADM_ACTION_EAGENT; + } + else if (strcmp(value, "arunn") == 0) { + action = ST_ADM_ACTION_ARUNN; + } + else if (strcmp(value, "adown") == 0) { + action = ST_ADM_ACTION_ADOWN; + } + /* else these are the old supported methods */ + else if (strcmp(value, "disable") == 0) { + action = ST_ADM_ACTION_DISABLE; + } + else if (strcmp(value, "enable") == 0) { + action = ST_ADM_ACTION_ENABLE; + } + else if (strcmp(value, "stop") == 0) { + action = ST_ADM_ACTION_STOP; + } + else if (strcmp(value, "start") == 0) { + action = ST_ADM_ACTION_START; + } + else { + ctx->st_code = STAT_STATUS_ERRP; + goto out; + } + } + else if (strcmp(key, "s") == 0) { + if (!(px && action)) { + /* + * Indicates that we'll need to reprocess the parameters + * as soon as backend and action are known + */ + if (!reprocess) { + st_cur_param = cur_param; + st_next_param = next_param; + } + reprocess = 1; + } + else if ((sv = findserver(px, value)) != NULL) 
{ + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + switch (action) { + case ST_ADM_ACTION_DISABLE: + if (!(sv->cur_admin & SRV_ADMF_FMAINT)) { + altered_servers++; + total_servers++; + srv_set_admin_flag(sv, SRV_ADMF_FMAINT, SRV_ADM_STCHGC_STATS_DISABLE); + } + break; + case ST_ADM_ACTION_ENABLE: + if (sv->cur_admin & SRV_ADMF_FMAINT) { + altered_servers++; + total_servers++; + srv_clr_admin_flag(sv, SRV_ADMF_FMAINT); + } + break; + case ST_ADM_ACTION_STOP: + if (!(sv->cur_admin & SRV_ADMF_FDRAIN)) { + srv_set_admin_flag(sv, SRV_ADMF_FDRAIN, SRV_ADM_STCHGC_STATS_STOP); + altered_servers++; + total_servers++; + } + break; + case ST_ADM_ACTION_START: + if (sv->cur_admin & SRV_ADMF_FDRAIN) { + srv_clr_admin_flag(sv, SRV_ADMF_FDRAIN); + altered_servers++; + total_servers++; + } + break; + case ST_ADM_ACTION_DHLTH: + if (sv->check.state & CHK_ST_CONFIGURED) { + sv->check.state &= ~CHK_ST_ENABLED; + altered_servers++; + total_servers++; + } + break; + case ST_ADM_ACTION_EHLTH: + if (sv->check.state & CHK_ST_CONFIGURED) { + sv->check.state |= CHK_ST_ENABLED; + altered_servers++; + total_servers++; + } + break; + case ST_ADM_ACTION_HRUNN: + if (!(sv->track)) { + sv->check.health = sv->check.rise + sv->check.fall - 1; + srv_set_running(sv, SRV_OP_STCHGC_STATS_WEB); + altered_servers++; + total_servers++; + } + break; + case ST_ADM_ACTION_HNOLB: + if (!(sv->track)) { + sv->check.health = sv->check.rise + sv->check.fall - 1; + srv_set_stopping(sv, SRV_OP_STCHGC_STATS_WEB); + altered_servers++; + total_servers++; + } + break; + case ST_ADM_ACTION_HDOWN: + if (!(sv->track)) { + sv->check.health = 0; + srv_set_stopped(sv, SRV_OP_STCHGC_STATS_WEB); + altered_servers++; + total_servers++; + } + break; + case ST_ADM_ACTION_DAGENT: + if (sv->agent.state & CHK_ST_CONFIGURED) { + sv->agent.state &= ~CHK_ST_ENABLED; + altered_servers++; + total_servers++; + } + break; + case ST_ADM_ACTION_EAGENT: + if (sv->agent.state & CHK_ST_CONFIGURED) { + sv->agent.state |= CHK_ST_ENABLED; + altered_servers++; + total_servers++; + } + break; + case ST_ADM_ACTION_ARUNN: + if (sv->agent.state & CHK_ST_ENABLED) { + sv->agent.health = sv->agent.rise + sv->agent.fall - 1; + srv_set_running(sv, SRV_OP_STCHGC_STATS_WEB); + altered_servers++; + total_servers++; + } + break; + case ST_ADM_ACTION_ADOWN: + if (sv->agent.state & CHK_ST_ENABLED) { + sv->agent.health = 0; + srv_set_stopped(sv, SRV_OP_STCHGC_STATS_WEB); + altered_servers++; + total_servers++; + } + break; + case ST_ADM_ACTION_READY: + srv_adm_set_ready(sv); + altered_servers++; + total_servers++; + break; + case ST_ADM_ACTION_DRAIN: + srv_adm_set_drain(sv); + altered_servers++; + total_servers++; + break; + case ST_ADM_ACTION_MAINT: + srv_adm_set_maint(sv); + altered_servers++; + total_servers++; + break; + case ST_ADM_ACTION_SHUTDOWN: + if (!(px->flags & (PR_FL_DISABLED|PR_FL_STOPPED))) { + srv_shutdown_streams(sv, SF_ERR_KILLED); + altered_servers++; + total_servers++; + } + break; + } + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + } else { + /* the server name is unknown or ambiguous (duplicate names) */ + total_servers++; + } + } + if (reprocess && px && action) { + /* Now, we know the backend and the action chosen by the user. 
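+				 * (Server fields may have been scanned before both the backend
+				 * and the action were known; their position was recorded so the
+				 * scan can be redone from that point.)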
+				 * We can safely restart from the first server parameter
+				 * to reprocess them.
+				 */
+				cur_param = st_cur_param;
+				next_param = st_next_param;
+				reprocess = 0;
+				goto reprocess_servers;
+			}
+
+			next_param = cur_param;
+		}
+	}
+
+	if (total_servers == 0) {
+		ctx->st_code = STAT_STATUS_NONE;
+	}
+	else if (altered_servers == 0) {
+		ctx->st_code = STAT_STATUS_ERRP;
+	}
+	else if (altered_servers == total_servers) {
+		ctx->st_code = STAT_STATUS_DONE;
+	}
+	else {
+		ctx->st_code = STAT_STATUS_PART;
+	}
+ out:
+	return 1;
+ wait:
+	ctx->st_code = STAT_STATUS_NONE;
+	return 0;
+}
+
+
+static int stats_send_http_headers(struct stconn *sc, struct htx *htx)
+{
+	struct stream *s = __sc_strm(sc);
+	struct uri_auth *uri;
+	struct appctx *appctx = __sc_appctx(sc);
+	struct show_stat_ctx *ctx = appctx->svcctx;
+	struct htx_sl *sl;
+	unsigned int flags;
+
+	BUG_ON(!ctx->http_px);
+	uri = ctx->http_px->uri_auth;
+
+	flags = (HTX_SL_F_IS_RESP|HTX_SL_F_VER_11|HTX_SL_F_XFER_ENC|HTX_SL_F_XFER_LEN|HTX_SL_F_CHNK);
+	sl = htx_add_stline(htx, HTX_BLK_RES_SL, flags, ist("HTTP/1.1"), ist("200"), ist("OK"));
+	if (!sl)
+		goto full;
+	sl->info.res.status = 200;
+
+	if (!htx_add_header(htx, ist("Cache-Control"), ist("no-cache")))
+		goto full;
+	if (ctx->flags & STAT_FMT_HTML) {
+		if (!htx_add_header(htx, ist("Content-Type"), ist("text/html")))
+			goto full;
+	}
+	else if (ctx->flags & (STAT_FMT_JSON|STAT_JSON_SCHM)) {
+		if (!htx_add_header(htx, ist("Content-Type"), ist("application/json")))
+			goto full;
+	}
+	else {
+		if (!htx_add_header(htx, ist("Content-Type"), ist("text/plain")))
+			goto full;
+	}
+
+	if (uri->refresh > 0 && !(ctx->flags & STAT_NO_REFRESH)) {
+		const char *refresh = U2A(uri->refresh);
+		if (!htx_add_header(htx, ist("Refresh"), ist(refresh)))
+			goto full;
+	}
+
+	if (ctx->flags & STAT_CHUNKED) {
+		if (!htx_add_header(htx, ist("Transfer-Encoding"), ist("chunked")))
+			goto full;
+	}
+
+	if (!htx_add_endof(htx, HTX_BLK_EOH))
+		goto full;
+
+	channel_add_input(&s->res, htx->data);
+	return 1;
+
+  full:
+	htx_reset(htx);
+	sc_need_room(sc, 0);
+	return 0;
+}
+
+
+static int stats_send_http_redirect(struct stconn *sc, struct htx *htx)
+{
+	char scope_txt[STAT_SCOPE_TXT_MAXLEN + sizeof STAT_SCOPE_PATTERN];
+	struct stream *s = __sc_strm(sc);
+	struct uri_auth *uri;
+	struct appctx *appctx = __sc_appctx(sc);
+	struct show_stat_ctx *ctx = appctx->svcctx;
+	struct htx_sl *sl;
+	unsigned int flags;
+
+	BUG_ON(!ctx->http_px);
+	uri = ctx->http_px->uri_auth;
+
+	/* scope_txt = search pattern + search query, ctx->scope_len is always <= STAT_SCOPE_TXT_MAXLEN */
+	scope_txt[0] = 0;
+	if (ctx->scope_len) {
+		const char *scope_ptr = stats_scope_ptr(appctx, sc);
+
+		strlcpy2(scope_txt, STAT_SCOPE_PATTERN, sizeof(scope_txt));
+		memcpy(scope_txt + strlen(STAT_SCOPE_PATTERN), scope_ptr, ctx->scope_len);
+		scope_txt[strlen(STAT_SCOPE_PATTERN) + ctx->scope_len] = 0;
+	}
+
+	/* We don't want to land on the posted stats page because a refresh will
+	 * repost the data. We don't want this to happen by accident so we redirect
+	 * the browser to the stats page with a GET.
+	 */
+	chunk_printf(&trash, "%s;st=%s%s%s%s",
+	             uri->uri_prefix,
+	             ((ctx->st_code > STAT_STATUS_INIT) &&
+	              (ctx->st_code < STAT_STATUS_SIZE) &&
+	              stat_status_codes[ctx->st_code]) ?
+	             stat_status_codes[ctx->st_code] :
+	             stat_status_codes[STAT_STATUS_UNKN],
+	             (ctx->flags & STAT_HIDE_DOWN) ? ";up" : "",
+	             (ctx->flags & STAT_NO_REFRESH) ?
";norefresh" : "", + scope_txt); + + flags = (HTX_SL_F_IS_RESP|HTX_SL_F_VER_11|HTX_SL_F_XFER_LEN|HTX_SL_F_CLEN|HTX_SL_F_CHNK); + sl = htx_add_stline(htx, HTX_BLK_RES_SL, flags, ist("HTTP/1.1"), ist("303"), ist("See Other")); + if (!sl) + goto full; + sl->info.res.status = 303; + + if (!htx_add_header(htx, ist("Cache-Control"), ist("no-cache")) || + !htx_add_header(htx, ist("Content-Type"), ist("text/plain")) || + !htx_add_header(htx, ist("Content-Length"), ist("0")) || + !htx_add_header(htx, ist("Location"), ist2(trash.area, trash.data))) + goto full; + + if (!htx_add_endof(htx, HTX_BLK_EOH)) + goto full; + + channel_add_input(&s->res, htx->data); + return 1; + +full: + htx_reset(htx); + sc_need_room(sc, 0); + return 0; +} + +/* This I/O handler runs as an applet embedded in a stream connector. It is + * used to send HTTP stats over a TCP socket. The mechanism is very simple. + * appctx->st0 contains the operation in progress (dump, done). The handler + * automatically unregisters itself once transfer is complete. + */ +static void http_stats_io_handler(struct appctx *appctx) +{ + struct show_stat_ctx *ctx = appctx->svcctx; + struct stconn *sc = appctx_sc(appctx); + struct stream *s = __sc_strm(sc); + struct channel *req = sc_oc(sc); + struct channel *res = sc_ic(sc); + struct htx *req_htx, *res_htx; + + /* only proxy stats are available via http */ + ctx->domain = STATS_DOMAIN_PROXY; + + res_htx = htx_from_buf(&res->buf); + + if (unlikely(se_fl_test(appctx->sedesc, (SE_FL_EOS|SE_FL_ERROR|SE_FL_SHR|SE_FL_SHW)))) { + appctx->st0 = STAT_HTTP_END; + goto out; + } + + /* Check if the input buffer is available. */ + if (!b_size(&res->buf)) { + sc_need_room(sc, 0); + goto out; + } + + /* all states are processed in sequence */ + if (appctx->st0 == STAT_HTTP_HEAD) { + if (stats_send_http_headers(sc, res_htx)) { + if (s->txn->meth == HTTP_METH_HEAD) + appctx->st0 = STAT_HTTP_DONE; + else + appctx->st0 = STAT_HTTP_DUMP; + } + } + + if (appctx->st0 == STAT_HTTP_DUMP) { + trash_chunk = b_make(trash.area, res->buf.size, 0, 0); + /* adjust buffer size to take htx overhead into account, + * make sure to perform this call on an empty buffer + */ + trash_chunk.size = buf_room_for_htx_data(&trash_chunk); + if (stats_dump_stat_to_buffer(sc, res_htx)) + appctx->st0 = STAT_HTTP_DONE; + } + + if (appctx->st0 == STAT_HTTP_POST) { + if (stats_process_http_post(sc)) + appctx->st0 = STAT_HTTP_LAST; + else if (s->scf->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) + appctx->st0 = STAT_HTTP_DONE; + } + + if (appctx->st0 == STAT_HTTP_LAST) { + if (stats_send_http_redirect(sc, res_htx)) + appctx->st0 = STAT_HTTP_DONE; + } + + if (appctx->st0 == STAT_HTTP_DONE) { + /* no more data are expected. If the response buffer is empty, + * be sure to add something (EOT block in this case) to have + * something to send. It is important to be sure the EOM flags + * will be handled by the endpoint. + */ + if (htx_is_empty(res_htx)) { + if (!htx_add_endof(res_htx, HTX_BLK_EOT)) { + sc_need_room(sc, sizeof(struct htx_blk) + 1); + goto out; + } + channel_add_input(res, 1); + } + res_htx->flags |= HTX_FL_EOM; + se_fl_set(appctx->sedesc, SE_FL_EOI); + appctx->st0 = STAT_HTTP_END; + } + + if (appctx->st0 == STAT_HTTP_END) { + se_fl_set(appctx->sedesc, SE_FL_EOS); + applet_will_consume(appctx); + } + + out: + /* we have left the request in the buffer for the case where we + * process a POST, and this automatically re-enables activity on + * read. 
It's better to indicate that we want to stop reading when
+	 * we're sending, so that we know there's at most one direction
+	 * deciding to wake the applet up. It saves it from looping when
+	 * emitting large blocks into small TCP windows.
+	 */
+	htx_to_buf(res_htx, &res->buf);
+	if (appctx->st0 == STAT_HTTP_END) {
+		/* eat the whole request */
+		if (co_data(req)) {
+			req_htx = htx_from_buf(&req->buf);
+			co_htx_skip(req, req_htx, co_data(req));
+			htx_to_buf(req_htx, &req->buf);
+		}
+	}
+	else if (co_data(res))
+		applet_wont_consume(appctx);
+}
+
+/* Dump all fields from <info> into <out> using the "show info" format (name: value) */
+static int stats_dump_info_fields(struct buffer *out,
+                                  const struct field *info,
+                                  struct show_stat_ctx *ctx)
+{
+	int flags = ctx->flags;
+	int field;
+
+	for (field = 0; field < INF_TOTAL_FIELDS; field++) {
+		if (!field_format(info, field))
+			continue;
+
+		if (!chunk_appendf(out, "%s: ", info_fields[field].name))
+			return 0;
+		if (!stats_emit_raw_data_field(out, &info[field]))
+			return 0;
+		if ((flags & STAT_SHOW_FDESC) && !chunk_appendf(out, ":\"%s\"", info_fields[field].desc))
+			return 0;
+		if (!chunk_strcat(out, "\n"))
+			return 0;
+	}
+	return 1;
+}
+
+/* Dump all fields from <info> into <out> using the "show info typed" format */
+static int stats_dump_typed_info_fields(struct buffer *out,
+                                        const struct field *info,
+                                        struct show_stat_ctx *ctx)
+{
+	int flags = ctx->flags;
+	int field;
+
+	for (field = 0; field < INF_TOTAL_FIELDS; field++) {
+		if (!field_format(info, field))
+			continue;
+
+		if (!chunk_appendf(out, "%d.%s.%u:", field, info_fields[field].name, info[INF_PROCESS_NUM].u.u32))
+			return 0;
+		if (!stats_emit_field_tags(out, &info[field], ':'))
+			return 0;
+		if (!stats_emit_typed_data_field(out, &info[field]))
+			return 0;
+		if ((flags & STAT_SHOW_FDESC) && !chunk_appendf(out, ":\"%s\"", info_fields[field].desc))
+			return 0;
+		if (!chunk_strcat(out, "\n"))
+			return 0;
+	}
+	return 1;
+}
+
+/* Fill <info> with HAProxy global info. <info> is a preallocated array of
+ * length <len>. The length of the array must be at least INF_TOTAL_FIELDS;
+ * if it is less than this value, the function returns 0, otherwise it
+ * returns 1. Some fields' presence or precision may depend on some of the
+ * STAT_* flags present in <flags>.
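+ *
+ * A minimal caller sketch (stats_dump_info_to_buffer() below does the
+ * equivalent with the global <info> array):
+ *
+ *     struct field info[INF_TOTAL_FIELDS];
+ *
+ *     if (stats_fill_info(info, INF_TOTAL_FIELDS, 0))
+ *             printf("uptime: %us\n", info[INF_UPTIME_SEC].u.u32);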
+ */ +int stats_fill_info(struct field *info, int len, uint flags) +{ + struct buffer *out = get_trash_chunk(); + uint64_t glob_out_bytes, glob_spl_bytes, glob_out_b32; + uint up_sec, up_usec; + ullong up; + ulong boot; + int thr; + +#ifdef USE_OPENSSL + double ssl_sess_rate = read_freq_ctr_flt(&global.ssl_per_sec); + double ssl_key_rate = read_freq_ctr_flt(&global.ssl_fe_keys_per_sec); + double ssl_reuse = 0; + + if (ssl_key_rate < ssl_sess_rate) + ssl_reuse = 100.0 * (1.0 - ssl_key_rate / ssl_sess_rate); +#endif + + /* sum certain per-thread totals (mostly byte counts) */ + glob_out_bytes = glob_spl_bytes = glob_out_b32 = 0; + for (thr = 0; thr < global.nbthread; thr++) { + glob_out_bytes += HA_ATOMIC_LOAD(&ha_thread_ctx[thr].out_bytes); + glob_spl_bytes += HA_ATOMIC_LOAD(&ha_thread_ctx[thr].spliced_out_bytes); + glob_out_b32 += read_freq_ctr(&ha_thread_ctx[thr].out_32bps); + } + glob_out_b32 *= 32; // values are 32-byte units + + up = now_ns - start_time_ns; + up_sec = ns_to_sec(up); + up_usec = (up / 1000U) % 1000000U; + + boot = tv_ms_remain(&start_date, &ready_date); + + if (len < INF_TOTAL_FIELDS) + return 0; + + chunk_reset(out); + memset(info, 0, sizeof(*info) * len); + + info[INF_NAME] = mkf_str(FO_PRODUCT|FN_OUTPUT|FS_SERVICE, PRODUCT_NAME); + info[INF_VERSION] = mkf_str(FO_PRODUCT|FN_OUTPUT|FS_SERVICE, haproxy_version); + info[INF_BUILD_INFO] = mkf_str(FO_PRODUCT|FN_OUTPUT|FS_SERVICE, haproxy_version); + info[INF_RELEASE_DATE] = mkf_str(FO_PRODUCT|FN_OUTPUT|FS_SERVICE, haproxy_date); + + info[INF_NBTHREAD] = mkf_u32(FO_CONFIG|FS_SERVICE, global.nbthread); + info[INF_NBPROC] = mkf_u32(FO_CONFIG|FS_SERVICE, 1); + info[INF_PROCESS_NUM] = mkf_u32(FO_KEY, 1); + info[INF_PID] = mkf_u32(FO_STATUS, pid); + + info[INF_UPTIME] = mkf_str(FN_DURATION, chunk_newstr(out)); + chunk_appendf(out, "%ud %uh%02um%02us", up_sec / 86400, (up_sec % 86400) / 3600, (up_sec % 3600) / 60, (up_sec % 60)); + + info[INF_UPTIME_SEC] = (flags & STAT_USE_FLOAT) ? mkf_flt(FN_DURATION, up_sec + up_usec / 1000000.0) : mkf_u32(FN_DURATION, up_sec); + info[INF_START_TIME_SEC] = (flags & STAT_USE_FLOAT) ? 
mkf_flt(FN_DURATION, start_date.tv_sec + start_date.tv_usec / 1000000.0) : mkf_u32(FN_DURATION, start_date.tv_sec); + info[INF_MEMMAX_MB] = mkf_u32(FO_CONFIG|FN_LIMIT, global.rlimit_memmax); + info[INF_MEMMAX_BYTES] = mkf_u32(FO_CONFIG|FN_LIMIT, global.rlimit_memmax * 1048576L); + info[INF_POOL_ALLOC_MB] = mkf_u32(0, (unsigned)(pool_total_allocated() / 1048576L)); + info[INF_POOL_ALLOC_BYTES] = mkf_u64(0, pool_total_allocated()); + info[INF_POOL_USED_MB] = mkf_u32(0, (unsigned)(pool_total_used() / 1048576L)); + info[INF_POOL_USED_BYTES] = mkf_u64(0, pool_total_used()); + info[INF_POOL_FAILED] = mkf_u32(FN_COUNTER, pool_total_failures()); + info[INF_ULIMIT_N] = mkf_u32(FO_CONFIG|FN_LIMIT, global.rlimit_nofile); + info[INF_MAXSOCK] = mkf_u32(FO_CONFIG|FN_LIMIT, global.maxsock); + info[INF_MAXCONN] = mkf_u32(FO_CONFIG|FN_LIMIT, global.maxconn); + info[INF_HARD_MAXCONN] = mkf_u32(FO_CONFIG|FN_LIMIT, global.hardmaxconn); + info[INF_CURR_CONN] = mkf_u32(0, actconn); + info[INF_CUM_CONN] = mkf_u32(FN_COUNTER, totalconn); + info[INF_CUM_REQ] = mkf_u32(FN_COUNTER, global.req_count); +#ifdef USE_OPENSSL + info[INF_MAX_SSL_CONNS] = mkf_u32(FN_MAX, global.maxsslconn); + info[INF_CURR_SSL_CONNS] = mkf_u32(0, global.sslconns); + info[INF_CUM_SSL_CONNS] = mkf_u32(FN_COUNTER, global.totalsslconns); +#endif + info[INF_MAXPIPES] = mkf_u32(FO_CONFIG|FN_LIMIT, global.maxpipes); + info[INF_PIPES_USED] = mkf_u32(0, pipes_used); + info[INF_PIPES_FREE] = mkf_u32(0, pipes_free); + info[INF_CONN_RATE] = (flags & STAT_USE_FLOAT) ? mkf_flt(FN_RATE, read_freq_ctr_flt(&global.conn_per_sec)) : mkf_u32(FN_RATE, read_freq_ctr(&global.conn_per_sec)); + info[INF_CONN_RATE_LIMIT] = mkf_u32(FO_CONFIG|FN_LIMIT, global.cps_lim); + info[INF_MAX_CONN_RATE] = mkf_u32(FN_MAX, global.cps_max); + info[INF_SESS_RATE] = (flags & STAT_USE_FLOAT) ? mkf_flt(FN_RATE, read_freq_ctr_flt(&global.sess_per_sec)) : mkf_u32(FN_RATE, read_freq_ctr(&global.sess_per_sec)); + info[INF_SESS_RATE_LIMIT] = mkf_u32(FO_CONFIG|FN_LIMIT, global.sps_lim); + info[INF_MAX_SESS_RATE] = mkf_u32(FN_RATE, global.sps_max); + +#ifdef USE_OPENSSL + info[INF_SSL_RATE] = (flags & STAT_USE_FLOAT) ? mkf_flt(FN_RATE, ssl_sess_rate) : mkf_u32(FN_RATE, ssl_sess_rate); + info[INF_SSL_RATE_LIMIT] = mkf_u32(FO_CONFIG|FN_LIMIT, global.ssl_lim); + info[INF_MAX_SSL_RATE] = mkf_u32(FN_MAX, global.ssl_max); + info[INF_SSL_FRONTEND_KEY_RATE] = (flags & STAT_USE_FLOAT) ? mkf_flt(FN_RATE, ssl_key_rate) : mkf_u32(0, ssl_key_rate); + info[INF_SSL_FRONTEND_MAX_KEY_RATE] = mkf_u32(FN_MAX, global.ssl_fe_keys_max); + info[INF_SSL_FRONTEND_SESSION_REUSE_PCT] = (flags & STAT_USE_FLOAT) ? mkf_flt(FN_RATE, ssl_reuse) : mkf_u32(0, ssl_reuse); + info[INF_SSL_BACKEND_KEY_RATE] = (flags & STAT_USE_FLOAT) ? mkf_flt(FN_RATE, read_freq_ctr_flt(&global.ssl_be_keys_per_sec)) : mkf_u32(FN_RATE, read_freq_ctr(&global.ssl_be_keys_per_sec)); + info[INF_SSL_BACKEND_MAX_KEY_RATE] = mkf_u32(FN_MAX, global.ssl_be_keys_max); + info[INF_SSL_CACHE_LOOKUPS] = mkf_u32(FN_COUNTER, global.shctx_lookups); + info[INF_SSL_CACHE_MISSES] = mkf_u32(FN_COUNTER, global.shctx_misses); +#endif + info[INF_COMPRESS_BPS_IN] = (flags & STAT_USE_FLOAT) ? mkf_flt(FN_RATE, read_freq_ctr_flt(&global.comp_bps_in)) : mkf_u32(FN_RATE, read_freq_ctr(&global.comp_bps_in)); + info[INF_COMPRESS_BPS_OUT] = (flags & STAT_USE_FLOAT) ? 
mkf_flt(FN_RATE, read_freq_ctr_flt(&global.comp_bps_out)) : mkf_u32(FN_RATE, read_freq_ctr(&global.comp_bps_out)); + info[INF_COMPRESS_BPS_RATE_LIM] = mkf_u32(FO_CONFIG|FN_LIMIT, global.comp_rate_lim); +#ifdef USE_ZLIB + info[INF_ZLIB_MEM_USAGE] = mkf_u32(0, zlib_used_memory); + info[INF_MAX_ZLIB_MEM_USAGE] = mkf_u32(FO_CONFIG|FN_LIMIT, global.maxzlibmem); +#endif + info[INF_TASKS] = mkf_u32(0, total_allocated_tasks()); + info[INF_RUN_QUEUE] = mkf_u32(0, total_run_queues()); + info[INF_IDLE_PCT] = mkf_u32(FN_AVG, clock_report_idle()); + info[INF_NODE] = mkf_str(FO_CONFIG|FN_OUTPUT|FS_SERVICE, global.node); + if (global.desc) + info[INF_DESCRIPTION] = mkf_str(FO_CONFIG|FN_OUTPUT|FS_SERVICE, global.desc); + info[INF_STOPPING] = mkf_u32(0, stopping); + info[INF_JOBS] = mkf_u32(0, jobs); + info[INF_UNSTOPPABLE_JOBS] = mkf_u32(0, unstoppable_jobs); + info[INF_LISTENERS] = mkf_u32(0, listeners); + info[INF_ACTIVE_PEERS] = mkf_u32(0, active_peers); + info[INF_CONNECTED_PEERS] = mkf_u32(0, connected_peers); + info[INF_DROPPED_LOGS] = mkf_u32(0, dropped_logs); + info[INF_BUSY_POLLING] = mkf_u32(0, !!(global.tune.options & GTUNE_BUSY_POLLING)); + info[INF_FAILED_RESOLUTIONS] = mkf_u32(0, resolv_failed_resolutions); + info[INF_TOTAL_BYTES_OUT] = mkf_u64(0, glob_out_bytes); + info[INF_TOTAL_SPLICED_BYTES_OUT] = mkf_u64(0, glob_spl_bytes); + info[INF_BYTES_OUT_RATE] = mkf_u64(FN_RATE, glob_out_b32); + info[INF_DEBUG_COMMANDS_ISSUED] = mkf_u32(0, debug_commands_issued); + info[INF_CUM_LOG_MSGS] = mkf_u32(FN_COUNTER, cum_log_messages); + + info[INF_TAINTED] = mkf_str(FO_STATUS, chunk_newstr(out)); + chunk_appendf(out, "%#x", get_tainted()); + info[INF_WARNINGS] = mkf_u32(FN_COUNTER, HA_ATOMIC_LOAD(&tot_warnings)); + info[INF_MAXCONN_REACHED] = mkf_u32(FN_COUNTER, HA_ATOMIC_LOAD(&maxconn_reached)); + info[INF_BOOTTIME_MS] = mkf_u32(FN_DURATION, boot); + info[INF_NICED_TASKS] = mkf_u32(0, total_niced_running_tasks()); + + return 1; +} + +/* This function dumps information onto the stream connector's read buffer. + * It returns 0 as long as it does not complete, non-zero upon completion. + * No state is used. + */ +static int stats_dump_info_to_buffer(struct stconn *sc) +{ + struct appctx *appctx = __sc_appctx(sc); + struct show_stat_ctx *ctx = appctx->svcctx; + int ret; + int current_field; + + if (!stats_fill_info(info, INF_TOTAL_FIELDS, ctx->flags)) + return 0; + + chunk_reset(&trash_chunk); +more: + current_field = ctx->field; + + if (ctx->flags & STAT_FMT_TYPED) + ret = stats_dump_typed_info_fields(&trash_chunk, info, ctx); + else if (ctx->flags & STAT_FMT_JSON) + ret = stats_dump_json_info_fields(&trash_chunk, info, ctx); + else + ret = stats_dump_info_fields(&trash_chunk, info, ctx); + + if (applet_putchk(appctx, &trash_chunk) == -1) { + /* restore previous field */ + ctx->field = current_field; + return 0; + } + if (ret && ctx->field) { + /* partial dump */ + goto more; + } + ctx->field = 0; + return 1; +} + +/* This function dumps the schema onto the stream connector's read buffer. + * It returns 0 as long as it does not complete, non-zero upon completion. + * No state is used. + * + * Integer values bounded to the range [-(2**53)+1, (2**53)-1] as + * per the recommendation for interoperable integers in section 6 of RFC 7159. 
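+ * Concretely, (2**53)-1 = 9007199254740991, which is why the s64 and u64
+ * bounds in the schema below are clamped to that value instead of the full
+ * 64-bit range.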
+ */ +static void stats_dump_json_schema(struct buffer *out) +{ + + int old_len = out->data; + + chunk_strcat(out, + "{" + "\"$schema\":\"http://json-schema.org/draft-04/schema#\"," + "\"oneOf\":[" + "{" + "\"title\":\"Info\"," + "\"type\":\"array\"," + "\"items\":{" + "\"title\":\"InfoItem\"," + "\"type\":\"object\"," + "\"properties\":{" + "\"field\":{\"$ref\":\"#/definitions/field\"}," + "\"processNum\":{\"$ref\":\"#/definitions/processNum\"}," + "\"tags\":{\"$ref\":\"#/definitions/tags\"}," + "\"value\":{\"$ref\":\"#/definitions/typedValue\"}" + "}," + "\"required\":[\"field\",\"processNum\",\"tags\"," + "\"value\"]" + "}" + "}," + "{" + "\"title\":\"Stat\"," + "\"type\":\"array\"," + "\"items\":{" + "\"title\":\"InfoItem\"," + "\"type\":\"object\"," + "\"properties\":{" + "\"objType\":{" + "\"enum\":[\"Frontend\",\"Backend\",\"Listener\"," + "\"Server\",\"Unknown\"]" + "}," + "\"proxyId\":{" + "\"type\":\"integer\"," + "\"minimum\":0" + "}," + "\"id\":{" + "\"type\":\"integer\"," + "\"minimum\":0" + "}," + "\"field\":{\"$ref\":\"#/definitions/field\"}," + "\"processNum\":{\"$ref\":\"#/definitions/processNum\"}," + "\"tags\":{\"$ref\":\"#/definitions/tags\"}," + "\"typedValue\":{\"$ref\":\"#/definitions/typedValue\"}" + "}," + "\"required\":[\"objType\",\"proxyId\",\"id\"," + "\"field\",\"processNum\",\"tags\"," + "\"value\"]" + "}" + "}," + "{" + "\"title\":\"Error\"," + "\"type\":\"object\"," + "\"properties\":{" + "\"errorStr\":{" + "\"type\":\"string\"" + "}" + "}," + "\"required\":[\"errorStr\"]" + "}" + "]," + "\"definitions\":{" + "\"field\":{" + "\"type\":\"object\"," + "\"pos\":{" + "\"type\":\"integer\"," + "\"minimum\":0" + "}," + "\"name\":{" + "\"type\":\"string\"" + "}," + "\"required\":[\"pos\",\"name\"]" + "}," + "\"processNum\":{" + "\"type\":\"integer\"," + "\"minimum\":1" + "}," + "\"tags\":{" + "\"type\":\"object\"," + "\"origin\":{" + "\"type\":\"string\"," + "\"enum\":[\"Metric\",\"Status\",\"Key\"," + "\"Config\",\"Product\",\"Unknown\"]" + "}," + "\"nature\":{" + "\"type\":\"string\"," + "\"enum\":[\"Gauge\",\"Limit\",\"Min\",\"Max\"," + "\"Rate\",\"Counter\",\"Duration\"," + "\"Age\",\"Time\",\"Name\",\"Output\"," + "\"Avg\", \"Unknown\"]" + "}," + "\"scope\":{" + "\"type\":\"string\"," + "\"enum\":[\"Cluster\",\"Process\",\"Service\"," + "\"System\",\"Unknown\"]" + "}," + "\"required\":[\"origin\",\"nature\",\"scope\"]" + "}," + "\"typedValue\":{" + "\"type\":\"object\"," + "\"oneOf\":[" + "{\"$ref\":\"#/definitions/typedValue/definitions/s32Value\"}," + "{\"$ref\":\"#/definitions/typedValue/definitions/s64Value\"}," + "{\"$ref\":\"#/definitions/typedValue/definitions/u32Value\"}," + "{\"$ref\":\"#/definitions/typedValue/definitions/u64Value\"}," + "{\"$ref\":\"#/definitions/typedValue/definitions/strValue\"}" + "]," + "\"definitions\":{" + "\"s32Value\":{" + "\"properties\":{" + "\"type\":{" + "\"type\":\"string\"," + "\"enum\":[\"s32\"]" + "}," + "\"value\":{" + "\"type\":\"integer\"," + "\"minimum\":-2147483648," + "\"maximum\":2147483647" + "}" + "}," + "\"required\":[\"type\",\"value\"]" + "}," + "\"s64Value\":{" + "\"properties\":{" + "\"type\":{" + "\"type\":\"string\"," + "\"enum\":[\"s64\"]" + "}," + "\"value\":{" + "\"type\":\"integer\"," + "\"minimum\":-9007199254740991," + "\"maximum\":9007199254740991" + "}" + "}," + "\"required\":[\"type\",\"value\"]" + "}," + "\"u32Value\":{" + "\"properties\":{" + "\"type\":{" + "\"type\":\"string\"," + "\"enum\":[\"u32\"]" + "}," + "\"value\":{" + "\"type\":\"integer\"," + "\"minimum\":0," + 
"\"maximum\":4294967295" + "}" + "}," + "\"required\":[\"type\",\"value\"]" + "}," + "\"u64Value\":{" + "\"properties\":{" + "\"type\":{" + "\"type\":\"string\"," + "\"enum\":[\"u64\"]" + "}," + "\"value\":{" + "\"type\":\"integer\"," + "\"minimum\":0," + "\"maximum\":9007199254740991" + "}" + "}," + "\"required\":[\"type\",\"value\"]" + "}," + "\"strValue\":{" + "\"properties\":{" + "\"type\":{" + "\"type\":\"string\"," + "\"enum\":[\"str\"]" + "}," + "\"value\":{\"type\":\"string\"}" + "}," + "\"required\":[\"type\",\"value\"]" + "}," + "\"unknownValue\":{" + "\"properties\":{" + "\"type\":{" + "\"type\":\"integer\"," + "\"minimum\":0" + "}," + "\"value\":{" + "\"type\":\"string\"," + "\"enum\":[\"unknown\"]" + "}" + "}," + "\"required\":[\"type\",\"value\"]" + "}" + "}" + "}" + "}" + "}"); + + if (old_len == out->data) { + chunk_reset(out); + chunk_appendf(out, + "{\"errorStr\":\"output buffer too short\"}"); + } + chunk_appendf(out, "\n"); +} + +/* This function dumps the schema onto the stream connector's read buffer. + * It returns 0 as long as it does not complete, non-zero upon completion. + * No state is used. + */ +static int stats_dump_json_schema_to_buffer(struct appctx *appctx) +{ + + chunk_reset(&trash_chunk); + + stats_dump_json_schema(&trash_chunk); + + if (applet_putchk(appctx, &trash_chunk) == -1) + return 0; + + return 1; +} + +static int cli_parse_clear_counters(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct proxy *px; + struct server *sv; + struct listener *li; + struct stats_module *mod; + int clrall = 0; + + if (strcmp(args[2], "all") == 0) + clrall = 1; + + /* check permissions */ + if (!cli_has_level(appctx, ACCESS_LVL_OPER) || + (clrall && !cli_has_level(appctx, ACCESS_LVL_ADMIN))) + return 1; + + for (px = proxies_list; px; px = px->next) { + if (clrall) { + memset(&px->be_counters, 0, sizeof(px->be_counters)); + memset(&px->fe_counters, 0, sizeof(px->fe_counters)); + } + else { + px->be_counters.conn_max = 0; + px->be_counters.p.http.rps_max = 0; + px->be_counters.sps_max = 0; + px->be_counters.cps_max = 0; + px->be_counters.nbpend_max = 0; + px->be_counters.qtime_max = 0; + px->be_counters.ctime_max = 0; + px->be_counters.dtime_max = 0; + px->be_counters.ttime_max = 0; + + px->fe_counters.conn_max = 0; + px->fe_counters.p.http.rps_max = 0; + px->fe_counters.sps_max = 0; + px->fe_counters.cps_max = 0; + } + + for (sv = px->srv; sv; sv = sv->next) + if (clrall) + memset(&sv->counters, 0, sizeof(sv->counters)); + else { + sv->counters.cur_sess_max = 0; + sv->counters.nbpend_max = 0; + sv->counters.sps_max = 0; + sv->counters.qtime_max = 0; + sv->counters.ctime_max = 0; + sv->counters.dtime_max = 0; + sv->counters.ttime_max = 0; + } + + list_for_each_entry(li, &px->conf.listeners, by_fe) + if (li->counters) { + if (clrall) + memset(li->counters, 0, sizeof(*li->counters)); + else + li->counters->conn_max = 0; + } + } + + global.cps_max = 0; + global.sps_max = 0; + global.ssl_max = 0; + global.ssl_fe_keys_max = 0; + global.ssl_be_keys_max = 0; + + list_for_each_entry(mod, &stats_module_list[STATS_DOMAIN_PROXY], list) { + if (!mod->clearable && !clrall) + continue; + + for (px = proxies_list; px; px = px->next) { + enum stats_domain_px_cap mod_cap = stats_px_get_cap(mod->domain_flags); + + if (px->cap & PR_CAP_FE && mod_cap & STATS_PX_CAP_FE) { + EXTRA_COUNTERS_INIT(px->extra_counters_fe, + mod, + mod->counters, + mod->counters_size); + } + + if (px->cap & PR_CAP_BE && mod_cap & STATS_PX_CAP_BE) { + 
EXTRA_COUNTERS_INIT(px->extra_counters_be, + mod, + mod->counters, + mod->counters_size); + } + + if (mod_cap & STATS_PX_CAP_SRV) { + for (sv = px->srv; sv; sv = sv->next) { + EXTRA_COUNTERS_INIT(sv->extra_counters, + mod, + mod->counters, + mod->counters_size); + } + } + + if (mod_cap & STATS_PX_CAP_LI) { + list_for_each_entry(li, &px->conf.listeners, by_fe) { + EXTRA_COUNTERS_INIT(li->extra_counters, + mod, + mod->counters, + mod->counters_size); + } + } + } + } + + resolv_stats_clear_counters(clrall, &stats_module_list[STATS_DOMAIN_RESOLVERS]); + + memset(activity, 0, sizeof(activity)); + return 1; +} + + +static int cli_parse_show_info(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_stat_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + int arg = 2; + + ctx->scope_str = 0; + ctx->scope_len = 0; + ctx->flags = 0; + ctx->field = 0; /* explicit default value */ + + while (*args[arg]) { + if (strcmp(args[arg], "typed") == 0) + ctx->flags = (ctx->flags & ~STAT_FMT_MASK) | STAT_FMT_TYPED; + else if (strcmp(args[arg], "json") == 0) + ctx->flags = (ctx->flags & ~STAT_FMT_MASK) | STAT_FMT_JSON; + else if (strcmp(args[arg], "desc") == 0) + ctx->flags |= STAT_SHOW_FDESC; + else if (strcmp(args[arg], "float") == 0) + ctx->flags |= STAT_USE_FLOAT; + arg++; + } + return 0; +} + + +static int cli_parse_show_stat(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_stat_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + int arg = 2; + + ctx->scope_str = 0; + ctx->scope_len = 0; + ctx->http_px = NULL; // not under http context + ctx->flags = STAT_SHNODE | STAT_SHDESC; + + if ((strm_li(appctx_strm(appctx))->bind_conf->level & ACCESS_LVL_MASK) >= ACCESS_LVL_OPER) + ctx->flags |= STAT_SHLGNDS; + + /* proxy is the default domain */ + ctx->domain = STATS_DOMAIN_PROXY; + if (strcmp(args[arg], "domain") == 0) { + ++args; + + if (strcmp(args[arg], "proxy") == 0) { + ++args; + } else if (strcmp(args[arg], "resolvers") == 0) { + ctx->domain = STATS_DOMAIN_RESOLVERS; + ++args; + } else { + return cli_err(appctx, "Invalid statistics domain.\n"); + } + } + + if (ctx->domain == STATS_DOMAIN_PROXY + && *args[arg] && *args[arg+1] && *args[arg+2]) { + struct proxy *px; + + px = proxy_find_by_name(args[arg], 0, 0); + if (px) + ctx->iid = px->uuid; + else + ctx->iid = atoi(args[arg]); + + if (!ctx->iid) + return cli_err(appctx, "No such proxy.\n"); + + ctx->flags |= STAT_BOUND; + ctx->type = atoi(args[arg+1]); + ctx->sid = atoi(args[arg+2]); + arg += 3; + } + + while (*args[arg]) { + if (strcmp(args[arg], "typed") == 0) + ctx->flags = (ctx->flags & ~STAT_FMT_MASK) | STAT_FMT_TYPED; + else if (strcmp(args[arg], "json") == 0) + ctx->flags = (ctx->flags & ~STAT_FMT_MASK) | STAT_FMT_JSON; + else if (strcmp(args[arg], "desc") == 0) + ctx->flags |= STAT_SHOW_FDESC; + else if (strcmp(args[arg], "no-maint") == 0) + ctx->flags |= STAT_HIDE_MAINT; + else if (strcmp(args[arg], "up") == 0) + ctx->flags |= STAT_HIDE_DOWN; + arg++; + } + + return 0; +} + +static int cli_io_handler_dump_info(struct appctx *appctx) +{ + trash_chunk = b_make(trash.area, trash.size, 0, 0); + return stats_dump_info_to_buffer(appctx_sc(appctx)); +} + +/* This I/O handler runs as an applet embedded in a stream connector. It is + * used to send raw stats over a socket. 
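+ * Typical use from a shell, assuming a stats socket is bound at the
+ * hypothetical path /var/run/haproxy.sock:
+ *
+ *     echo "show stat" | socat stdio /var/run/haproxy.sock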
+ */ +static int cli_io_handler_dump_stat(struct appctx *appctx) +{ + trash_chunk = b_make(trash.area, trash.size, 0, 0); + return stats_dump_stat_to_buffer(appctx_sc(appctx), NULL); +} + +static int cli_io_handler_dump_json_schema(struct appctx *appctx) +{ + trash_chunk = b_make(trash.area, trash.size, 0, 0); + return stats_dump_json_schema_to_buffer(appctx); +} + +int stats_allocate_proxy_counters_internal(struct extra_counters **counters, + int type, int px_cap) +{ + struct stats_module *mod; + + EXTRA_COUNTERS_REGISTER(counters, type, alloc_failed); + + list_for_each_entry(mod, &stats_module_list[STATS_DOMAIN_PROXY], list) { + if (!(stats_px_get_cap(mod->domain_flags) & px_cap)) + continue; + + EXTRA_COUNTERS_ADD(mod, *counters, mod->counters, mod->counters_size); + } + + EXTRA_COUNTERS_ALLOC(*counters, alloc_failed); + + list_for_each_entry(mod, &stats_module_list[STATS_DOMAIN_PROXY], list) { + if (!(stats_px_get_cap(mod->domain_flags) & px_cap)) + continue; + + EXTRA_COUNTERS_INIT(*counters, mod, mod->counters, mod->counters_size); + } + + return 1; + + alloc_failed: + return 0; +} + +/* Initialize and allocate all extra counters for a proxy and its attached + * servers/listeners with all already registered stats module + */ +int stats_allocate_proxy_counters(struct proxy *px) +{ + struct server *sv; + struct listener *li; + + if (px->cap & PR_CAP_FE) { + if (!stats_allocate_proxy_counters_internal(&px->extra_counters_fe, + COUNTERS_FE, + STATS_PX_CAP_FE)) { + return 0; + } + } + + if (px->cap & PR_CAP_BE) { + if (!stats_allocate_proxy_counters_internal(&px->extra_counters_be, + COUNTERS_BE, + STATS_PX_CAP_BE)) { + return 0; + } + } + + for (sv = px->srv; sv; sv = sv->next) { + if (!stats_allocate_proxy_counters_internal(&sv->extra_counters, + COUNTERS_SV, + STATS_PX_CAP_SRV)) { + return 0; + } + } + + list_for_each_entry(li, &px->conf.listeners, by_fe) { + if (!stats_allocate_proxy_counters_internal(&li->extra_counters, + COUNTERS_LI, + STATS_PX_CAP_LI)) { + return 0; + } + } + + return 1; +} + +void stats_register_module(struct stats_module *m) +{ + const uint8_t domain = stats_get_domain(m->domain_flags); + + LIST_APPEND(&stats_module_list[domain], &m->list); + stat_count[domain] += m->stats_count; +} + +static int allocate_stats_px_postcheck(void) +{ + struct stats_module *mod; + size_t i = ST_F_TOTAL_FIELDS; + int err_code = 0; + struct proxy *px; + + stat_count[STATS_DOMAIN_PROXY] += ST_F_TOTAL_FIELDS; + + stat_f[STATS_DOMAIN_PROXY] = malloc(stat_count[STATS_DOMAIN_PROXY] * sizeof(struct name_desc)); + if (!stat_f[STATS_DOMAIN_PROXY]) { + ha_alert("stats: cannot allocate all fields for proxy statistics\n"); + err_code |= ERR_ALERT | ERR_FATAL; + return err_code; + } + + memcpy(stat_f[STATS_DOMAIN_PROXY], stat_fields, + ST_F_TOTAL_FIELDS * sizeof(struct name_desc)); + + list_for_each_entry(mod, &stats_module_list[STATS_DOMAIN_PROXY], list) { + memcpy(stat_f[STATS_DOMAIN_PROXY] + i, + mod->stats, + mod->stats_count * sizeof(struct name_desc)); + i += mod->stats_count; + } + + for (px = proxies_list; px; px = px->next) { + if (!stats_allocate_proxy_counters(px)) { + ha_alert("stats: cannot allocate all counters for proxy statistics\n"); + err_code |= ERR_ALERT | ERR_FATAL; + return err_code; + } + } + + /* wait per-thread alloc to perform corresponding stat_l allocation */ + + return err_code; +} + +REGISTER_CONFIG_POSTPARSER("allocate-stats-px", allocate_stats_px_postcheck); + +static int allocate_stats_rslv_postcheck(void) +{ + struct stats_module *mod; + size_t i = 0; + int 
err_code = 0; + + stat_f[STATS_DOMAIN_RESOLVERS] = malloc(stat_count[STATS_DOMAIN_RESOLVERS] * sizeof(struct name_desc)); + if (!stat_f[STATS_DOMAIN_RESOLVERS]) { + ha_alert("stats: cannot allocate all fields for resolver statistics\n"); + err_code |= ERR_ALERT | ERR_FATAL; + return err_code; + } + + list_for_each_entry(mod, &stats_module_list[STATS_DOMAIN_RESOLVERS], list) { + memcpy(stat_f[STATS_DOMAIN_RESOLVERS] + i, + mod->stats, + mod->stats_count * sizeof(struct name_desc)); + i += mod->stats_count; + } + + if (!resolv_allocate_counters(&stats_module_list[STATS_DOMAIN_RESOLVERS])) { + ha_alert("stats: cannot allocate all counters for resolver statistics\n"); + err_code |= ERR_ALERT | ERR_FATAL; + return err_code; + } + + /* wait per-thread alloc to perform corresponding stat_l allocation */ + + return err_code; +} + +REGISTER_CONFIG_POSTPARSER("allocate-stats-resolver", allocate_stats_rslv_postcheck); + +static int allocate_stat_lines_per_thread(void) +{ + int domains[] = { STATS_DOMAIN_PROXY, STATS_DOMAIN_RESOLVERS }, i; + + for (i = 0; i < STATS_DOMAIN_COUNT; ++i) { + const int domain = domains[i]; + + stat_l[domain] = malloc(stat_count[domain] * sizeof(struct field)); + if (!stat_l[domain]) + return 0; + } + return 1; +} + +REGISTER_PER_THREAD_ALLOC(allocate_stat_lines_per_thread); + +static int allocate_trash_counters(void) +{ + struct stats_module *mod; + int domains[] = { STATS_DOMAIN_PROXY, STATS_DOMAIN_RESOLVERS }, i; + size_t max_counters_size = 0; + + /* calculate the greatest counters used by any stats modules */ + for (i = 0; i < STATS_DOMAIN_COUNT; ++i) { + list_for_each_entry(mod, &stats_module_list[domains[i]], list) { + max_counters_size = mod->counters_size > max_counters_size ? + mod->counters_size : max_counters_size; + } + } + + /* allocate the trash with the size of the greatest counters */ + if (max_counters_size) { + trash_counters = malloc(max_counters_size); + if (!trash_counters) { + ha_alert("stats: cannot allocate trash counters for statistics\n"); + return 0; + } + } + + return 1; +} + +REGISTER_PER_THREAD_ALLOC(allocate_trash_counters); + +static void deinit_stat_lines_per_thread(void) +{ + int domains[] = { STATS_DOMAIN_PROXY, STATS_DOMAIN_RESOLVERS }, i; + + for (i = 0; i < STATS_DOMAIN_COUNT; ++i) { + const int domain = domains[i]; + + ha_free(&stat_l[domain]); + } +} + + +REGISTER_PER_THREAD_FREE(deinit_stat_lines_per_thread); + +static void deinit_stats(void) +{ + int domains[] = { STATS_DOMAIN_PROXY, STATS_DOMAIN_RESOLVERS }, i; + + for (i = 0; i < STATS_DOMAIN_COUNT; ++i) { + const int domain = domains[i]; + + if (stat_f[domain]) + free(stat_f[domain]); + } +} + +REGISTER_POST_DEINIT(deinit_stats); + +static void free_trash_counters(void) +{ + if (trash_counters) + free(trash_counters); +} + +REGISTER_PER_THREAD_FREE(free_trash_counters); + +/* register cli keywords */ +static struct cli_kw_list cli_kws = {{ },{ + { { "clear", "counters", NULL }, "clear counters [all] : clear max statistics counters (or all counters)", cli_parse_clear_counters, NULL, NULL }, + { { "show", "info", NULL }, "show info [desc|json|typed|float]* : report information about the running process", cli_parse_show_info, cli_io_handler_dump_info, NULL }, + { { "show", "stat", NULL }, "show stat [desc|json|no-maint|typed|up]*: report counters for each proxy and server", cli_parse_show_stat, cli_io_handler_dump_stat, NULL }, + { { "show", "schema", "json", NULL }, "show schema json : report schema used for stats", NULL, cli_io_handler_dump_json_schema, NULL }, + {{},} +}}; + 
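+/* Note on extension: stats modules add their own columns by filling a
+ * struct stats_module and registering it at boot through
+ * stats_register_module() above. A minimal sketch, modeled on in-tree
+ * users such as the http_comp filter ("mymod" and its fields are
+ * hypothetical; the authoritative field list lives in haproxy/stats-t.h):
+ *
+ *     static struct name_desc mymod_stats[] = {
+ *         { .name = "mymod_hits", .desc = "Total hits seen by mymod" },
+ *     };
+ *
+ *     static struct mymod_counters {
+ *         long long hits;    // one counter per exported column
+ *     } mymod_counters;
+ *
+ *     static struct stats_module mymod_stats_module = {
+ *         .name          = "mymod",
+ *         .stats         = mymod_stats,
+ *         .stats_count   = 1,
+ *         .counters      = &mymod_counters,
+ *         .counters_size = sizeof(mymod_counters),
+ *         .domain_flags  = MK_STATS_PROXY_DOMAIN(STATS_PX_CAP_FE),
+ *         .clearable     = 1,
+ *     };
+ *
+ *     INITCALL1(STG_REGISTER, stats_register_module, &mymod_stats_module);
+ */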
+INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + +struct applet http_stats_applet = { + .obj_type = OBJ_TYPE_APPLET, + .name = "<STATS>", /* used for logging */ + .fct = http_stats_io_handler, + .release = NULL, +}; + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/stconn.c b/src/stconn.c new file mode 100644 index 0000000..8e3ae7e --- /dev/null +++ b/src/stconn.c @@ -0,0 +1,2050 @@ +/* + * stream connector management functions + * + * Copyright 2021 Christopher Faulet <cfaulet@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/connection.h> +#include <haproxy/check.h> +#include <haproxy/http_ana.h> +#include <haproxy/pipe.h> +#include <haproxy/pool.h> +#include <haproxy/sample.h> +#include <haproxy/sc_strm.h> +#include <haproxy/stconn.h> +#include <haproxy/xref.h> + +DECLARE_POOL(pool_head_connstream, "stconn", sizeof(struct stconn)); +DECLARE_POOL(pool_head_sedesc, "sedesc", sizeof(struct sedesc)); + +/* functions used by default on a detached stream connector */ +static void sc_app_abort(struct stconn *sc); +static void sc_app_shut(struct stconn *sc); +static void sc_app_chk_rcv(struct stconn *sc); +static void sc_app_chk_snd(struct stconn *sc); + +/* functions used on a mux-based stream connector */ +static void sc_app_abort_conn(struct stconn *sc); +static void sc_app_shut_conn(struct stconn *sc); +static void sc_app_chk_rcv_conn(struct stconn *sc); +static void sc_app_chk_snd_conn(struct stconn *sc); + +/* functions used on an applet-based stream connector */ +static void sc_app_abort_applet(struct stconn *sc); +static void sc_app_shut_applet(struct stconn *sc); +static void sc_app_chk_rcv_applet(struct stconn *sc); +static void sc_app_chk_snd_applet(struct stconn *sc); + +static int sc_conn_process(struct stconn *sc); +static int sc_conn_recv(struct stconn *sc); +static int sc_conn_send(struct stconn *sc); +static int sc_applet_process(struct stconn *sc); + +/* stream connector operations for connections */ +struct sc_app_ops sc_app_conn_ops = { + .chk_rcv = sc_app_chk_rcv_conn, + .chk_snd = sc_app_chk_snd_conn, + .abort = sc_app_abort_conn, + .shutdown= sc_app_shut_conn, + .wake = sc_conn_process, + .name = "STRM", +}; + +/* stream connector operations for embedded tasks */ +struct sc_app_ops sc_app_embedded_ops = { + .chk_rcv = sc_app_chk_rcv, + .chk_snd = sc_app_chk_snd, + .abort = sc_app_abort, + .shutdown= sc_app_shut, + .wake = NULL, /* may never be used */ + .name = "NONE", /* may never be used */ +}; + +/* stream connector operations for applets */ +struct sc_app_ops sc_app_applet_ops = { + .chk_rcv = sc_app_chk_rcv_applet, + .chk_snd = sc_app_chk_snd_applet, + .abort = sc_app_abort_applet, + .shutdown= sc_app_shut_applet, + .wake = sc_applet_process, + .name = "STRM", +}; + +/* stream connector for health checks on connections */ +struct sc_app_ops sc_app_check_ops = { + .chk_rcv = NULL, + .chk_snd = NULL, + .abort = NULL, + .shutdown= NULL, + .wake = wake_srv_chk, + .name = "CHCK", +}; + +/* Initializes an endpoint */ +void sedesc_init(struct sedesc *sedesc) +{ + sedesc->se = NULL; + sedesc->conn = NULL; + sedesc->sc = NULL; + sedesc->lra = TICK_ETERNITY; + sedesc->fsb = TICK_ETERNITY; + sedesc->xref.peer = NULL; + 
se_fl_setall(sedesc, SE_FL_NONE); + + sedesc->iobuf.pipe = NULL; + sedesc->iobuf.buf = NULL; + sedesc->iobuf.offset = sedesc->iobuf.data = 0; + sedesc->iobuf.flags = IOBUF_FL_NONE; +} + +/* Tries to alloc an endpoint and initialize it. Returns NULL on failure. */ +struct sedesc *sedesc_new() +{ + struct sedesc *sedesc; + + sedesc = pool_alloc(pool_head_sedesc); + if (unlikely(!sedesc)) + return NULL; + + sedesc_init(sedesc); + return sedesc; +} + +/* Releases an endpoint. It is the caller's responsibility to make sure it is + * safe to do so and that it is not shared with another entity. + */ +void sedesc_free(struct sedesc *sedesc) +{ + if (sedesc) { + if (sedesc->iobuf.pipe) + put_pipe(sedesc->iobuf.pipe); + pool_free(pool_head_sedesc, sedesc); + } +} + +/* Tries to allocate a new stconn and initialize its main fields. On + * failure, nothing is allocated and NULL is returned. It is an internal + * function. The caller must, at least, set the SE_FL_ORPHAN or SE_FL_DETACHED + * flag. + */ +static struct stconn *sc_new(struct sedesc *sedesc) +{ + struct stconn *sc; + + sc = pool_alloc(pool_head_connstream); + + if (unlikely(!sc)) + goto alloc_error; + + sc->obj_type = OBJ_TYPE_SC; + sc->flags = SC_FL_NONE; + sc->state = SC_ST_INI; + sc->ioto = TICK_ETERNITY; + sc->room_needed = 0; + sc->app = NULL; + sc->app_ops = NULL; + sc->src = NULL; + sc->dst = NULL; + sc->wait_event.tasklet = NULL; + sc->wait_event.events = 0; + + /* If there is no endpoint, allocate a new one now */ + if (!sedesc) { + sedesc = sedesc_new(); + if (unlikely(!sedesc)) + goto alloc_error; + } + sc->sedesc = sedesc; + sedesc->sc = sc; + + return sc; + + alloc_error: + pool_free(pool_head_connstream, sc); + return NULL; +} + +/* Creates a new stream connector and its associated stream from a mux. <sd> must + * be defined. It returns NULL on error. On success, the new stream connector is + * returned. In this case, SE_FL_ORPHAN flag is removed. + */ +struct stconn *sc_new_from_endp(struct sedesc *sd, struct session *sess, struct buffer *input) +{ + struct stconn *sc; + + sc = sc_new(sd); + if (unlikely(!sc)) + return NULL; + if (unlikely(!stream_new(sess, sc, input))) { + sd->sc = NULL; + if (sc->sedesc != sd) { + /* none was provided so sc_new() allocated one */ + sedesc_free(sc->sedesc); + } + pool_free(pool_head_connstream, sc); + se_fl_set(sd, SE_FL_ORPHAN); + return NULL; + } + se_fl_clr(sd, SE_FL_ORPHAN); + return sc; +} + +/* Creates a new stream connector from a stream. There is no endpoint here, thus it + * will be created by sc_new(). So the SE_FL_DETACHED flag is set. It returns + * NULL on error. On success, the new stream connector is returned. + */ +struct stconn *sc_new_from_strm(struct stream *strm, unsigned int flags) +{ + struct stconn *sc; + + sc = sc_new(NULL); + if (unlikely(!sc)) + return NULL; + sc->flags |= flags; + sc_ep_set(sc, SE_FL_DETACHED); + sc->app = &strm->obj_type; + sc->app_ops = &sc_app_embedded_ops; + return sc; +} + +/* Creates a new stream connector from a health-check. There is no endpoint here, + * thus it will be created by sc_new(). So the SE_FL_DETACHED flag is set. It + * returns NULL on error. On success, the new stream connector is returned. 
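+ *
+ * (Editor's aside, not part of the patch: the three sc_new_from_*()
+ * constructors only differ in who supplies the endpoint and which app_ops
+ * get wired. The check code conceptually does, flag value assumed:
+ *
+ *	sc = sc_new_from_check(check, SC_FL_ISBACK);
+ *	if (!sc)
+ *		goto fail;	// pool exhaustion, abort the check
+ *
+ * while muxes pass a pre-filled sedesc through sc_new_from_endp().)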
+ */ +struct stconn *sc_new_from_check(struct check *check, unsigned int flags) +{ + struct stconn *sc; + + sc = sc_new(NULL); + if (unlikely(!sc)) + return NULL; + sc->flags |= flags; + sc_ep_set(sc, SE_FL_DETACHED); + sc->app = &check->obj_type; + sc->app_ops = &sc_app_check_ops; + return sc; +} + +/* Releases a stconn previously allocated by sc_new(), as well as its + * endpoint, if it exists. This function is called internally or on the error path. + */ +void sc_free(struct stconn *sc) +{ + sockaddr_free(&sc->src); + sockaddr_free(&sc->dst); + if (sc->sedesc) { + BUG_ON(!sc_ep_test(sc, SE_FL_DETACHED)); + sedesc_free(sc->sedesc); + } + tasklet_free(sc->wait_event.tasklet); + pool_free(pool_head_connstream, sc); +} + +/* Conditionally removes a stream connector if it is detached and if there is no app + * layer defined. Except on the error path, this function must be used. If released, + * the pointer to the SC is set to NULL. + */ +static void sc_free_cond(struct stconn **scp) +{ + struct stconn *sc = *scp; + + if (!sc->app && (!sc->sedesc || sc_ep_test(sc, SE_FL_DETACHED))) { + sc_free(sc); + *scp = NULL; + } +} + + +/* Attaches a stconn to a mux endpoint and sets the endpoint ctx. Returns + * -1 on error and 0 on success. SE_FL_DETACHED flag is removed. This function is + * called from a mux when it is attached to a stream or a health-check. + */ +int sc_attach_mux(struct stconn *sc, void *sd, void *ctx) +{ + struct connection *conn = ctx; + struct sedesc *sedesc = sc->sedesc; + + if (sc_strm(sc)) { + if (!sc->wait_event.tasklet) { + sc->wait_event.tasklet = tasklet_new(); + if (!sc->wait_event.tasklet) + return -1; + sc->wait_event.tasklet->process = sc_conn_io_cb; + sc->wait_event.tasklet->context = sc; + sc->wait_event.events = 0; + } + + sc->app_ops = &sc_app_conn_ops; + xref_create(&sc->sedesc->xref, &sc_opposite(sc)->sedesc->xref); + } + else if (sc_check(sc)) { + if (!sc->wait_event.tasklet) { + sc->wait_event.tasklet = tasklet_new(); + if (!sc->wait_event.tasklet) + return -1; + sc->wait_event.tasklet->process = srv_chk_io_cb; + sc->wait_event.tasklet->context = sc; + sc->wait_event.events = 0; + } + + sc->app_ops = &sc_app_check_ops; + } + + sedesc->se = sd; + sedesc->conn = ctx; + se_fl_set(sedesc, SE_FL_T_MUX); + se_fl_clr(sedesc, SE_FL_DETACHED); + if (!conn->ctx) + conn->ctx = sc; + return 0; +} + +/* Attaches a stconn to an applet endpoint and sets the endpoint + * ctx. The SE_FL_DETACHED flag is + * removed. This function is called by a stream when a backend applet is + * registered. + */ +static void sc_attach_applet(struct stconn *sc, void *sd) +{ + sc->sedesc->se = sd; + sc_ep_set(sc, SE_FL_T_APPLET); + sc_ep_clr(sc, SE_FL_DETACHED); + if (sc_strm(sc)) { + sc->app_ops = &sc_app_applet_ops; + xref_create(&sc->sedesc->xref, &sc_opposite(sc)->sedesc->xref); + } +} + +/* Attaches a stconn to an app layer and sets the relevant + * callbacks. Returns -1 on error and 0 on success. SE_FL_ORPHAN flag is + * removed. This function is called by a stream when it is created to attach it + * on the stream connector on the client side. 
+ */ +int sc_attach_strm(struct stconn *sc, struct stream *strm) +{ + sc->app = &strm->obj_type; + sc_ep_clr(sc, SE_FL_ORPHAN); + sc_ep_report_read_activity(sc); + if (sc_ep_test(sc, SE_FL_T_MUX)) { + sc->wait_event.tasklet = tasklet_new(); + if (!sc->wait_event.tasklet) + return -1; + sc->wait_event.tasklet->process = sc_conn_io_cb; + sc->wait_event.tasklet->context = sc; + sc->wait_event.events = 0; + + sc->app_ops = &sc_app_conn_ops; + } + else if (sc_ep_test(sc, SE_FL_T_APPLET)) { + sc->app_ops = &sc_app_applet_ops; + } + else { + sc->app_ops = &sc_app_embedded_ops; + } + return 0; +} + +/* Detaches the stconn from the endpoint, if any. For a connection, if a + * mux owns it, the ->detach() callback is called. Otherwise, it means + * the stream connector owns the connection. In this case the connection is closed + * and released. For an applet, the appctx is released. If still allocated, the + * endpoint is reset and flagged as detached. If the app layer is also detached, + * the stream connector is released. + */ +static void sc_detach_endp(struct stconn **scp) +{ + struct stconn *sc = *scp; + struct xref *peer; + + if (!sc) + return; + + + /* Remove my link in the original objects. */ + peer = xref_get_peer_and_lock(&sc->sedesc->xref); + if (peer) + xref_disconnect(&sc->sedesc->xref, peer); + + if (sc_ep_test(sc, SE_FL_T_MUX)) { + struct connection *conn = __sc_conn(sc); + struct sedesc *sedesc = sc->sedesc; + + if (conn->mux) { + if (sc->wait_event.events != 0) + conn->mux->unsubscribe(sc, sc->wait_event.events, &sc->wait_event); + se_fl_set(sedesc, SE_FL_ORPHAN); + sedesc->sc = NULL; + sc->sedesc = NULL; + conn->mux->detach(sedesc); + } + else { + /* It's too early to have a mux, let's just destroy + * the connection + */ + conn_stop_tracking(conn); + conn_full_close(conn); + if (conn->destroy_cb) + conn->destroy_cb(conn); + conn_free(conn); + } + } + else if (sc_ep_test(sc, SE_FL_T_APPLET)) { + struct appctx *appctx = __sc_appctx(sc); + + sc_ep_set(sc, SE_FL_ORPHAN); + sc->sedesc->sc = NULL; + sc->sedesc = NULL; + appctx_shut(appctx); + appctx_free(appctx); + } + + if (sc->sedesc) { + /* the SD wasn't used and can be recycled */ + sc->sedesc->se = NULL; + sc->sedesc->conn = NULL; + sc->sedesc->flags = 0; + sc_ep_set(sc, SE_FL_DETACHED); + } + + /* FIXME: Reset the SC for now but this must be reviewed. SC flags are only + * connection-related for now but this will evolve + */ + sc->flags &= SC_FL_ISBACK; + if (sc_strm(sc)) + sc->app_ops = &sc_app_embedded_ops; + else + sc->app_ops = NULL; + sc_free_cond(scp); +} + +/* Detaches the stconn from the app layer. If there is no endpoint attached + * to the stconn, it is also released. + */ +static void sc_detach_app(struct stconn **scp) +{ + struct stconn *sc = *scp; + + if (!sc) + return; + + sc->app = NULL; + sc->app_ops = NULL; + sockaddr_free(&sc->src); + sockaddr_free(&sc->dst); + + tasklet_free(sc->wait_event.tasklet); + sc->wait_event.tasklet = NULL; + sc->wait_event.events = 0; + sc_free_cond(scp); +} + +/* Destroy the stconn. It is detached from its endpoint and its + * application. After this call, the stconn must be considered as released. + */ +void sc_destroy(struct stconn *sc) +{ + sc_detach_endp(&sc); + sc_detach_app(&sc); + BUG_ON_HOT(sc); +} + +/* Resets the stream connector endpoint. It happens when the app layer wants to renew + * its endpoint, for a connection retry for instance. If a mux or an applet is + * attached, a new endpoint is created. Returns -1 on error and 0 on success. 
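+ *
+ * (Editor's aside, not part of the patch: the typical caller is the
+ * connection retry path, which conceptually does:
+ *
+ *	if (sc_reset_endp(s->scb) < 0)
+ *		goto abort;	// no memory for a fresh endpoint
+ *
+ * leaving the app layer attached while only the endpoint is renewed.)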
+ */ +int sc_reset_endp(struct stconn *sc) +{ + struct sedesc *new_sd; + + BUG_ON(!sc->app); + + if (!__sc_endp(sc)) { + /* endpoint not attached or attached to a mux with no + * target. Thus the endpoint will not be released but just + * reset. The app is still attached, the sc will not be + * released. + */ + sc_detach_endp(&sc); + return 0; + } + + /* allocate the new endpoint first to be able to set error if it + * fails */ + new_sd = sedesc_new(); + if (unlikely(!new_sd)) + return -1; + + /* The app is still attached, the sc will not be released */ + sc_detach_endp(&sc); + BUG_ON(!sc); + BUG_ON(sc->sedesc); + sc->sedesc = new_sd; + sc->sedesc->sc = sc; + sc_ep_set(sc, SE_FL_DETACHED); + return 0; +} + + +/* Create an applet to handle a stream connector as a new appctx. The SC will + * wake it up every time it is solicited. The appctx must be deleted by the task + * handler using sc_detach_endp(), possibly from within the function itself. + * It also pre-initializes the applet's context and returns it (or NULL in case + * it could not be allocated). + */ +struct appctx *sc_applet_create(struct stconn *sc, struct applet *app) +{ + struct appctx *appctx; + + appctx = appctx_new_here(app, sc->sedesc); + if (!appctx) + return NULL; + sc_attach_applet(sc, appctx); + appctx->t->nice = __sc_strm(sc)->task->nice; + applet_need_more_data(appctx); + appctx_wakeup(appctx); + + sc->state = SC_ST_RDY; + return appctx; +} + +/* Conditionally forward the close to the write side. It returns 1 if it can be + * forwarded. It is the caller's responsibility to forward the close to the write + * side. Otherwise, 0 is returned. In this case, SC_FL_SHUT_WANTED flag may be set on + * the consumer SC if we are only waiting for the outgoing data to be flushed. + */ +static inline int sc_cond_forward_shut(struct stconn *sc) +{ + /* The close must not be forwarded */ + if (!(sc->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) || !(sc->flags & SC_FL_NOHALF)) + return 0; + + if (co_data(sc_ic(sc)) && !(sc_ic(sc)->flags & CF_WRITE_TIMEOUT)) { + /* the shutdown cannot be forwarded now because + * we should flush outgoing data first. But instruct the output + * channel it should be done ASAP. + */ + sc_schedule_shutdown(sc); + return 0; + } + + /* the close can be immediately forwarded to the write side */ + return 1; +} + + +static inline int sc_is_fastfwd_supported(struct stconn *sc) +{ + return (!(global.tune.no_zero_copy_fwd & NO_ZERO_COPY_FWD) && + sc_ep_test(sc, SE_FL_MAY_FASTFWD_PROD) && + sc_ep_test(sc_opposite(sc), SE_FL_MAY_FASTFWD_CONS) && + sc_ic(sc)->to_forward); +} +/* + * This function performs a shutdown-read on a detached stream connector in a + * connected or init state (it does nothing for other states). It either shuts + * the read side or marks itself as closed. The buffer flags are updated to + * reflect the new state. If the stream connector has SC_FL_NOHALF, we also + * forward the close to the write side. The owner task is woken up if it exists. 
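+ *
+ * (Editor's note: SC_FL_NOHALF means half-closed connections are not
+ * desired, so sc_cond_forward_shut() above promotes a read-side close to
+ * a full close, unless pending output must be flushed first, in which
+ * case only SC_FL_SHUT_WANTED is scheduled via sc_schedule_shutdown().)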
+ */ +static void sc_app_abort(struct stconn *sc) +{ + struct channel *ic = sc_ic(sc); + + if (sc->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) + return; + + sc->flags |= SC_FL_ABRT_DONE; + ic->flags |= CF_READ_EVENT; + + if (!sc_state_in(sc->state, SC_SB_CON|SC_SB_RDY|SC_SB_EST)) + return; + + if (sc->flags & SC_FL_SHUT_DONE) { + sc->state = SC_ST_DIS; + if (sc->flags & SC_FL_ISBACK) + __sc_strm(sc)->conn_exp = TICK_ETERNITY; + } + else if (sc_cond_forward_shut(sc)) + return sc_app_shut(sc); + + /* note that if the task exists, it must unregister itself once it runs */ + if (!(sc->flags & SC_FL_DONT_WAKE)) + task_wakeup(sc_strm_task(sc), TASK_WOKEN_IO); +} + +/* + * This function performs a shutdown-write on a detached stream connector in a + * connected or init state (it does nothing for other states). It either shuts + * the write side or marks itself as closed. The buffer flags are updated to + * reflect the new state. It does also close everything if the SC was marked as + * being in error state. The owner task is woken up if it exists. + */ +static void sc_app_shut(struct stconn *sc) +{ + struct channel *ic = sc_ic(sc); + struct channel *oc = sc_oc(sc); + + sc->flags &= ~SC_FL_SHUT_WANTED; + if (sc->flags & SC_FL_SHUT_DONE) + return; + sc->flags |= SC_FL_SHUT_DONE; + oc->flags |= CF_WRITE_EVENT; + sc_set_hcto(sc); + + switch (sc->state) { + case SC_ST_RDY: + case SC_ST_EST: + /* we have to shut before closing, otherwise some short messages + * may never leave the system, especially when there are remaining + * unread data in the socket input buffer, or when nolinger is set. + * However, if SC_FL_NOLINGER is explicitly set, we know there is + * no risk so we close both sides immediately. + */ + if (!(sc->flags & (SC_FL_ERROR|SC_FL_NOLINGER|SC_FL_EOS|SC_FL_ABRT_DONE)) && + !(ic->flags & CF_DONT_READ)) + return; + + __fallthrough; + case SC_ST_CON: + case SC_ST_CER: + case SC_ST_QUE: + case SC_ST_TAR: + /* Note that none of these states may happen with applets */ + sc->state = SC_ST_DIS; + __fallthrough; + default: + sc->flags &= ~SC_FL_NOLINGER; + sc->flags |= SC_FL_ABRT_DONE; + if (sc->flags & SC_FL_ISBACK) + __sc_strm(sc)->conn_exp = TICK_ETERNITY; + } + + /* note that if the task exists, it must unregister itself once it runs */ + if (!(sc->flags & SC_FL_DONT_WAKE)) + task_wakeup(sc_strm_task(sc), TASK_WOKEN_IO); +} + +/* default chk_rcv function for scheduled tasks */ +static void sc_app_chk_rcv(struct stconn *sc) +{ + if (sc_ep_have_ff_data(sc_opposite(sc))) { + /* stop reading */ + sc_need_room(sc, -1); + } + else { + /* (re)start reading */ + if (!(sc->flags & SC_FL_DONT_WAKE)) + task_wakeup(sc_strm_task(sc), TASK_WOKEN_IO); + } +} + +/* default chk_snd function for scheduled tasks */ +static void sc_app_chk_snd(struct stconn *sc) +{ + struct channel *oc = sc_oc(sc); + + if (unlikely(sc->state != SC_ST_EST || (sc->flags & SC_FL_SHUT_DONE))) + return; + + if (!sc_ep_test(sc, SE_FL_WAIT_DATA) || /* not waiting for data */ + (!co_data(oc) && !sc_ep_have_ff_data(sc))) /* called with nothing to send ! */ + return; + + /* Otherwise there are remaining data to be sent in the buffer, + * so we tell the handler. + */ + sc_ep_clr(sc, SE_FL_WAIT_DATA); + if (!(sc->flags & SC_FL_DONT_WAKE)) + task_wakeup(sc_strm_task(sc), TASK_WOKEN_IO); +} + +/* + * This function performs a shutdown-read on a stream connector attached to + * a connection in a connected or init state (it does nothing for other + * states). It either shuts the read side or marks itself as closed. 
The buffer + * flags are updated to reflect the new state. If the stream connector has + * SC_FL_NOHALF, we also forward the close to the write side. If a control + * layer is defined, then it is supposed to be a socket layer and file + * descriptors are then shutdown or closed accordingly. The function + * automatically disables polling if needed. + */ +static void sc_app_abort_conn(struct stconn *sc) +{ + struct channel *ic = sc_ic(sc); + + BUG_ON(!sc_conn(sc)); + + if (sc->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) + return; + sc->flags |= SC_FL_ABRT_DONE; + ic->flags |= CF_READ_EVENT; + + if (!sc_state_in(sc->state, SC_SB_CON|SC_SB_RDY|SC_SB_EST)) + return; + + if (sc->flags & SC_FL_SHUT_DONE) { + sc_conn_shut(sc); + sc->state = SC_ST_DIS; + if (sc->flags & SC_FL_ISBACK) + __sc_strm(sc)->conn_exp = TICK_ETERNITY; + } + else if (sc_cond_forward_shut(sc)) + return sc_app_shut_conn(sc); +} + +/* + * This function performs a shutdown-write on a stream connector attached to + * a connection in a connected or init state (it does nothing for other + * states). It either shuts the write side or marks itself as closed. The + * buffer flags are updated to reflect the new state. It does also close + * everything if the SC was marked as being in error state. If there is a + * data-layer shutdown, it is called. + */ +static void sc_app_shut_conn(struct stconn *sc) +{ + struct channel *ic = sc_ic(sc); + struct channel *oc = sc_oc(sc); + + BUG_ON(!sc_conn(sc)); + + sc->flags &= ~SC_FL_SHUT_WANTED; + if (sc->flags & SC_FL_SHUT_DONE) + return; + sc->flags |= SC_FL_SHUT_DONE; + oc->flags |= CF_WRITE_EVENT; + sc_set_hcto(sc); + + switch (sc->state) { + case SC_ST_RDY: + case SC_ST_EST: + /* we have to shut before closing, otherwise some short messages + * may never leave the system, especially when there are remaining + * unread data in the socket input buffer, or when nolinger is set. + * However, if SC_FL_NOLINGER is explicitly set, we know there is + * no risk so we close both sides immediately. + */ + if (sc->flags & SC_FL_NOLINGER) { + /* unclean data-layer shutdown, typically an aborted request + * or a forwarded shutdown from a client to a server due to + * option abortonclose. No need for the TLS layer to try to + * emit a shutdown message. + */ + sc_conn_shutw(sc, CO_SHW_SILENT); + } + else { + /* clean data-layer shutdown. This only happens on the + * frontend side, or on the backend side when forwarding + * a client close in TCP mode or in HTTP TUNNEL mode + * while option abortonclose is set. We want the TLS + * layer to try to signal it to the peer before we close. + */ + sc_conn_shutw(sc, CO_SHW_NORMAL); + + if (!(sc->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) && !(ic->flags & CF_DONT_READ)) + return; + } + + __fallthrough; + case SC_ST_CON: + /* we may have to close a pending connection, and mark the + * response buffer as abort + */ + sc_conn_shut(sc); + __fallthrough; + case SC_ST_CER: + case SC_ST_QUE: + case SC_ST_TAR: + sc->state = SC_ST_DIS; + __fallthrough; + default: + sc->flags &= ~SC_FL_NOLINGER; + sc->flags |= SC_FL_ABRT_DONE; + if (sc->flags & SC_FL_ISBACK) + __sc_strm(sc)->conn_exp = TICK_ETERNITY; + } +} + +/* This function is used for inter-stream connector calls. It is called by the + * consumer to inform the producer side that it may be interested in checking + * for free space in the buffer. Note that it intentionally does not update + * timeouts, so that we can still check them later at wake-up. This function is + * dedicated to connection-based stream connectors. 
+ */ +static void sc_app_chk_rcv_conn(struct stconn *sc) +{ + BUG_ON(!sc_conn(sc)); + + /* (re)start reading */ + if (sc_state_in(sc->state, SC_SB_CON|SC_SB_RDY|SC_SB_EST)) + tasklet_wakeup(sc->wait_event.tasklet); +} + + +/* This function is used for inter-stream connector calls. It is called by the + * producer to inform the consumer side that it may be interested in checking + * for data in the buffer. Note that it intentionally does not update timeouts, + * so that we can still check them later at wake-up. + */ +static void sc_app_chk_snd_conn(struct stconn *sc) +{ + struct channel *oc = sc_oc(sc); + + BUG_ON(!sc_conn(sc)); + + if (unlikely(!sc_state_in(sc->state, SC_SB_RDY|SC_SB_EST) || + (sc->flags & SC_FL_SHUT_DONE))) + return; + + if (unlikely(!co_data(oc) && !sc_ep_have_ff_data(sc))) /* called with nothing to send ! */ + return; + + if (!sc_ep_have_ff_data(sc) && /* data wants to be fast-forwarded ASAP */ + !sc_ep_test(sc, SE_FL_WAIT_DATA)) /* not waiting for data */ + return; + + if (!(sc->wait_event.events & SUB_RETRY_SEND)) + sc_conn_send(sc); + + if (sc_ep_test(sc, SE_FL_ERROR | SE_FL_ERR_PENDING) || sc_is_conn_error(sc)) { + /* Write error on the file descriptor */ + BUG_ON(sc_ep_test(sc, SE_FL_EOS|SE_FL_ERROR|SE_FL_ERR_PENDING) == (SE_FL_EOS|SE_FL_ERR_PENDING)); + goto out_wakeup; + } + + /* OK, so now we know that some data might have been sent, and that we may + * have to poll first. We have to do that too if the buffer is not empty. + */ + if (!co_data(oc)) { + /* the connection is established but we can't write. Either the + * buffer is empty, or we just refrain from sending because the + * ->o limit was reached. Maybe we just wrote the last + * chunk and need to close. + */ + if ((oc->flags & CF_AUTO_CLOSE) && + ((sc->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) == SC_FL_SHUT_WANTED) && + sc_state_in(sc->state, SC_SB_RDY|SC_SB_EST)) { + sc_shutdown(sc); + goto out_wakeup; + } + + if ((sc->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) == 0) + sc_ep_set(sc, SE_FL_WAIT_DATA); + } + else { + /* Otherwise there are remaining data to be sent in the buffer, + * which means we have to poll before doing so. + */ + sc_ep_clr(sc, SE_FL_WAIT_DATA); + } + + /* in case of special condition (error, shutdown, end of write...), we + * have to notify the task. + */ + if (likely((sc->flags & SC_FL_SHUT_DONE) || + ((oc->flags & CF_WRITE_EVENT) && sc->state < SC_ST_EST) || + ((oc->flags & CF_WAKE_WRITE) && + ((!co_data(oc) && !oc->to_forward) || + !sc_state_in(sc->state, SC_SB_EST))))) { + out_wakeup: + if (!(sc->flags & SC_FL_DONT_WAKE)) + task_wakeup(sc_strm_task(sc), TASK_WOKEN_IO); + } +} + +/* + * This function performs a shutdown-read on a stream connector attached to an + * applet in a connected or init state (it does nothing for other states). It + * either shuts the read side or marks itself as closed. The buffer flags are + * updated to reflect the new state. If the stream connector has SC_FL_NOHALF, + * we also forward the close to the write side. The owner task is woken up if + * it exists. 
+ */ +static void sc_app_abort_applet(struct stconn *sc) +{ + struct channel *ic = sc_ic(sc); + + BUG_ON(!sc_appctx(sc)); + + if (sc->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) + return; + sc->flags |= SC_FL_ABRT_DONE; + ic->flags |= CF_READ_EVENT; + + /* Note: on abort, we don't call the applet */ + + if (!sc_state_in(sc->state, SC_SB_CON|SC_SB_RDY|SC_SB_EST)) + return; + + if (sc->flags & SC_FL_SHUT_DONE) { + appctx_shut(__sc_appctx(sc)); + sc->state = SC_ST_DIS; + if (sc->flags & SC_FL_ISBACK) + __sc_strm(sc)->conn_exp = TICK_ETERNITY; + } + else if (sc_cond_forward_shut(sc)) + return sc_app_shut_applet(sc); +} + +/* + * This function performs a shutdown-write on a stream connector attached to an + * applet in a connected or init state (it does nothing for other states). It + * either shuts the write side or marks itself as closed. The buffer flags are + * updated to reflect the new state. It does also close everything if the SI + * was marked as being in error state. The owner task is woken up if it exists. + */ +static void sc_app_shut_applet(struct stconn *sc) +{ + struct channel *ic = sc_ic(sc); + struct channel *oc = sc_oc(sc); + + BUG_ON(!sc_appctx(sc)); + + sc->flags &= ~SC_FL_SHUT_WANTED; + if (sc->flags & SC_FL_SHUT_DONE) + return; + sc->flags |= SC_FL_SHUT_DONE; + oc->flags |= CF_WRITE_EVENT; + sc_set_hcto(sc); + + /* on shutw we always wake the applet up */ + appctx_wakeup(__sc_appctx(sc)); + + switch (sc->state) { + case SC_ST_RDY: + case SC_ST_EST: + /* we have to shut before closing, otherwise some short messages + * may never leave the system, especially when there are remaining + * unread data in the socket input buffer, or when nolinger is set. + * However, if SC_FL_NOLINGER is explicitly set, we know there is + * no risk so we close both sides immediately. + */ + if (!(sc->flags & (SC_FL_ERROR|SC_FL_NOLINGER|SC_FL_EOS|SC_FL_ABRT_DONE)) && + !(ic->flags & CF_DONT_READ)) + return; + + __fallthrough; + case SC_ST_CON: + case SC_ST_CER: + case SC_ST_QUE: + case SC_ST_TAR: + /* Note that none of these states may happen with applets */ + appctx_shut(__sc_appctx(sc)); + sc->state = SC_ST_DIS; + __fallthrough; + default: + sc->flags &= ~SC_FL_NOLINGER; + sc->flags |= SC_FL_ABRT_DONE; + if (sc->flags & SC_FL_ISBACK) + __sc_strm(sc)->conn_exp = TICK_ETERNITY; + } +} + +/* chk_rcv function for applets */ +static void sc_app_chk_rcv_applet(struct stconn *sc) +{ + BUG_ON(!sc_appctx(sc)); + + if (!sc_ep_have_ff_data(sc_opposite(sc))) { + /* (re)start reading */ + appctx_wakeup(__sc_appctx(sc)); + } +} + +/* chk_snd function for applets */ +static void sc_app_chk_snd_applet(struct stconn *sc) +{ + struct channel *oc = sc_oc(sc); + + BUG_ON(!sc_appctx(sc)); + + if (unlikely(sc->state != SC_ST_EST || (sc->flags & SC_FL_SHUT_DONE))) + return; + + /* we only wake the applet up if it was waiting for some data and is ready to consume it */ + if (!sc_ep_test(sc, SE_FL_WAIT_DATA|SE_FL_WONT_CONSUME)) + return; + + if (co_data(oc) || sc_ep_have_ff_data(sc)) { + /* (re)start sending */ + appctx_wakeup(__sc_appctx(sc)); + } +} + + +/* This function is designed to be called from within the stream handler to + * update the input channel's expiration timer and the stream connector's + * Rx flags based on the channel's flags. It needs to be called only once + * after the channel's flags have settled down, and before they are cleared, + * though it doesn't harm to call it as often as desired (it just slightly + * hurts performance). 
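+ * (Editor's note on the room_needed convention used below: 0 means the SC
+ * may be unblocked as soon as any room is reported, a positive value means
+ * it is unblocked once channel_recv_max() reaches that many bytes, and -1
+ * means only an explicit sc_have_room() from the consumer side unblocks it.)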
It must not be called from outside of the stream + * handler, as what it does will be used to compute the stream task's + * expiration. + */ +void sc_update_rx(struct stconn *sc) +{ + struct channel *ic = sc_ic(sc); + + if (sc->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) + return; + + /* Unblock the SC if it needs room and the free space is large enough (0 + * means it can always be unblocked). Do not unblock it if -1 was + * specified. + */ + if (!sc->room_needed || (sc->room_needed > 0 && channel_recv_max(ic) >= sc->room_needed)) + sc_have_room(sc); + + /* Read not closed, update FD status and timeout for reads */ + if (ic->flags & CF_DONT_READ) + sc_wont_read(sc); + else + sc_will_read(sc); + + sc_chk_rcv(sc); +} + +/* This function is designed to be called from within the stream handler to + * update the output channel's expiration timer and the stream connector's + * Tx flags based on the channel's flags. It needs to be called only once + * after the channel's flags have settled down, and before they are cleared, + * though it doesn't harm to call it as often as desired (it just slightly + * hurts performance). It must not be called from outside of the stream + * handler, as what it does will be used to compute the stream task's + * expiration. + */ +void sc_update_tx(struct stconn *sc) +{ + struct channel *oc = sc_oc(sc); + + if (sc->flags & SC_FL_SHUT_DONE) + return; + + /* Write not closed, update FD status and timeout for writes */ + if (!co_data(oc)) { + /* stop writing */ + if (!sc_ep_test(sc, SE_FL_WAIT_DATA)) { + if ((sc->flags & SC_FL_SHUT_WANTED) == 0) + sc_ep_set(sc, SE_FL_WAIT_DATA); + } + return; + } + + /* (re)start writing */ + sc_ep_clr(sc, SE_FL_WAIT_DATA); +} + +/* This function is the equivalent to sc_update() except that it's + * designed to be called from outside the stream handlers, typically the lower + * layers (applets, connections) after I/O completion. After updating the stream + * interface and timeouts, it will try to forward what can be forwarded, then to + * wake the associated task up if an important event requires special handling. + * It may update SE_FL_WAIT_DATA and/or SC_FL_NEED_ROOM, that the callers are + * encouraged to watch to take appropriate action. + * It should not be called from within the stream itself, sc_update() + * is designed for this. Please do not statify this function, it's often + * present in backtraces, it's useful to recognize it. + */ +void sc_notify(struct stconn *sc) +{ + struct channel *ic = sc_ic(sc); + struct channel *oc = sc_oc(sc); + struct stconn *sco = sc_opposite(sc); + struct task *task = sc_strm_task(sc); + + /* process consumer side */ + if (!co_data(oc)) { + struct connection *conn = sc_conn(sc); + + if (((sc->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) == SC_FL_SHUT_WANTED) && + (sc->state == SC_ST_EST) && (!conn || !(conn->flags & (CO_FL_WAIT_XPRT | CO_FL_EARLY_SSL_HS)))) + sc_shutdown(sc); + } + + /* indicate that we may be waiting for data from the output channel or + * we're about to close and can't expect more data if SC_FL_SHUT_WANTED is there. + */ + if (!(sc->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED))) + sc_ep_set(sc, SE_FL_WAIT_DATA); + else if ((sc->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) == SC_FL_SHUT_WANTED) + sc_ep_clr(sc, SE_FL_WAIT_DATA); + + if (oc->flags & CF_DONT_READ) + sc_wont_read(sco); + else + sc_will_read(sco); + + /* Notify the other side when we've injected data into the IC that + * needs to be forwarded. 
We can do fast-forwarding as soon as there + * are output data, but we avoid doing this if some of the data are + * not yet scheduled for being forwarded, because it is very likely + * that it will be done again immediately afterwards once the following + * data are parsed (eg: HTTP chunking). We only clear SC_FL_NEED_ROOM + * once we've emptied *some* of the output buffer, and not just when + * there is available room, because applets are often forced to stop + * before the buffer is full. We must not stop based on input data + * alone because an HTTP parser might need more data to complete the + * parsing. + */ + if (sc_ep_have_ff_data(sc_opposite(sc)) || + (co_data(ic) && sc_ep_test(sco, SE_FL_WAIT_DATA) && + (!(sc->flags & SC_FL_SND_EXP_MORE) || channel_full(ic, co_data(ic)) || channel_input_data(ic) == 0))) { + int new_len, last_len; + + last_len = co_data(ic) + sc_ep_ff_data(sco); + sc_chk_snd(sco); + new_len = co_data(ic) + sc_ep_ff_data(sco); + + /* check if the consumer has freed some space either in the + * buffer or in the pipe. + */ + if (!sc->room_needed || (new_len < last_len && (sc->room_needed < 0 || channel_recv_max(ic) >= sc->room_needed))) + sc_have_room(sc); + } + + if (!(ic->flags & CF_DONT_READ)) + sc_will_read(sc); + + sc_chk_rcv(sc); + sc_chk_rcv(sco); + + /* wake the task up only when needed */ + if (/* changes on the production side that must be handled: + * - An error on receipt: SC_FL_ERROR + * - A read event: shutdown for reads (CF_READ_EVENT + EOS/ABRT_DONE) + * end of input (CF_READ_EVENT + SC_FL_EOI) + * data received and no fast-forwarding (CF_READ_EVENT + !to_forward) + * read event while consumer side is not established (CF_READ_EVENT + sco->state != SC_ST_EST) + */ + ((ic->flags & CF_READ_EVENT) && ((sc->flags & SC_FL_EOI) || (sc->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) || !ic->to_forward || sco->state != SC_ST_EST)) || + (sc->flags & SC_FL_ERROR) || + + /* changes on the consumption side */ + sc_ep_test(sc, SE_FL_ERR_PENDING) || + ((oc->flags & CF_WRITE_EVENT) && + ((sc->state < SC_ST_EST) || + (sc->flags & SC_FL_SHUT_DONE) || + (((oc->flags & CF_WAKE_WRITE) || + (!(oc->flags & CF_AUTO_CLOSE) && + !(sc->flags & (SC_FL_SHUT_WANTED|SC_FL_SHUT_DONE)))) && + (sco->state != SC_ST_EST || + (!co_data(oc) && !oc->to_forward)))))) { + task_wakeup(task, TASK_WOKEN_IO); + } + else { + /* Update expiration date for the task and requeue it if not already expired */ + if (!tick_is_expired(task->expire, now_ms)) { + task->expire = tick_first(task->expire, sc_ep_rcv_ex(sc)); + task->expire = tick_first(task->expire, sc_ep_snd_ex(sc)); + task->expire = tick_first(task->expire, sc_ep_rcv_ex(sco)); + task->expire = tick_first(task->expire, sc_ep_snd_ex(sco)); + task->expire = tick_first(task->expire, ic->analyse_exp); + task->expire = tick_first(task->expire, oc->analyse_exp); + task->expire = tick_first(task->expire, __sc_strm(sc)->conn_exp); + + /* WARNING: Don't forget to remove this BUG_ON before 2.9.0 */ + BUG_ON(tick_is_expired(task->expire, now_ms)); + task_queue(task); + } + } + + if (ic->flags & CF_READ_EVENT) + sc->flags &= ~SC_FL_RCV_ONCE; +} + +/* + * This function propagates an end-of-stream received on a socket-based connection. + * It updates the stream connector. If the stream connector has SC_FL_NOHALF, + * the close is also forwarded to the write side as an abort. 
+ */ +static void sc_conn_eos(struct stconn *sc) +{ + struct channel *ic = sc_ic(sc); + + BUG_ON(!sc_conn(sc)); + + if (sc->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) + return; + sc->flags |= SC_FL_EOS; + ic->flags |= CF_READ_EVENT; + sc_ep_report_read_activity(sc); + + if (!sc_state_in(sc->state, SC_SB_CON|SC_SB_RDY|SC_SB_EST)) + return; + + if (sc->flags & SC_FL_SHUT_DONE) + goto do_close; + + if (sc_cond_forward_shut(sc)) { + /* we want to immediately forward this close to the write side */ + /* force flag on ssl to keep stream in cache */ + sc_conn_shutw(sc, CO_SHW_SILENT); + goto do_close; + } + + /* otherwise that's just a normal read shutdown */ + return; + + do_close: + /* OK we completely close the socket here just as if we went through sc_shut[rw]() */ + sc_conn_shut(sc); + + sc->flags &= ~SC_FL_SHUT_WANTED; + sc->flags |= SC_FL_SHUT_DONE; + + sc->state = SC_ST_DIS; + if (sc->flags & SC_FL_ISBACK) + __sc_strm(sc)->conn_exp = TICK_ETERNITY; + return; +} + +/* + * This is the callback which is called by the connection layer to receive data + * into the buffer from the connection. It iterates over the mux layer's + * rcv_buf function. Please do not statify this function, it's often present in + * backtraces, it's useful to recognize it. + */ +int sc_conn_recv(struct stconn *sc) +{ + struct connection *conn = __sc_conn(sc); + struct channel *ic = sc_ic(sc); + int ret, max, cur_read = 0; + int read_poll = MAX_READ_POLL_LOOPS; + int flags = 0; + + /* If not established yet, do nothing. */ + if (sc->state != SC_ST_EST) + return 0; + + /* If another call to sc_conn_recv() failed, and we subscribed to + * recv events already, give up now. + */ + if ((sc->wait_event.events & SUB_RETRY_RECV) || sc_waiting_room(sc)) + return 0; + + /* maybe we were called immediately after an asynchronous abort */ + if (sc->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) + return 1; + + /* we must wait because the mux is not installed yet */ + if (!conn->mux) + return 0; + + /* stop immediately on errors. Note that we DON'T want to stop on + * POLL_ERR, as the poller might report a write error while there + * are still data available in the recv buffer. This typically + * happens when we send too large a request to a backend server + * which rejects it before reading it all. + */ + if (!sc_ep_test(sc, SE_FL_RCV_MORE)) { + if (!conn_xprt_ready(conn)) + return 0; + if (sc_ep_test(sc, SE_FL_ERROR)) + goto end_recv; + } + + /* prepare to detect if the mux needs more room */ + sc_ep_clr(sc, SE_FL_WANT_ROOM); + + if ((ic->flags & (CF_STREAMER | CF_STREAMER_FAST)) && !co_data(ic) && + global.tune.idle_timer && + (unsigned short)(now_ms - ic->last_read) >= global.tune.idle_timer) { + /* The buffer was empty and nothing was transferred for more + * than one second. This was caused by a pause and not by + * congestion. Reset any streaming mode to reduce latency. 
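+ *
+ * (Editor's note: the cast to unsigned short in the test above makes the
+ * age check wrap-safe: both timestamps are truncated to 16 bits, so
+ * now_ms - ic->last_read stays correct modulo 65536, i.e. for pauses of
+ * up to about 65s, far above tune.idle-timer's millisecond scale.)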
+ */ + ic->xfer_small = 0; + ic->xfer_large = 0; + ic->flags &= ~(CF_STREAMER | CF_STREAMER_FAST); + } + +#if defined(USE_LINUX_SPLICE) + /* Detect if the splicing is possible depending on the stream policy */ + if ((global.tune.options & GTUNE_USE_SPLICE) && + (ic->to_forward >= MIN_SPLICE_FORWARD) && + ((!(sc->flags & SC_FL_ISBACK) && ((strm_fe(__sc_strm(sc))->options2|__sc_strm(sc)->be->options2) & PR_O2_SPLIC_REQ)) || + ((sc->flags & SC_FL_ISBACK) && ((strm_fe(__sc_strm(sc))->options2|__sc_strm(sc)->be->options2) & PR_O2_SPLIC_RTR)) || + ((ic->flags & CF_STREAMER_FAST) && ((strm_sess(__sc_strm(sc))->fe->options2|__sc_strm(sc)->be->options2) & PR_O2_SPLIC_AUT)))) + flags |= CO_RFL_MAY_SPLICE; +#endif + + /* First, let's see if we may fast-forward data from a side to the other + * one without using the channel buffer. + */ + if (sc_is_fastfwd_supported(sc)) { + if (channel_data(ic)) { + /* We're embarrassed, there are already data pending in + * the buffer and we don't want to have them at two + * locations at a time. Let's indicate we need some + * place and ask the consumer to hurry. + */ + flags |= CO_RFL_BUF_FLUSH; + goto abort_fastfwd; + } + ret = conn->mux->fastfwd(sc, ic->to_forward, flags); + if (ret < 0) + goto abort_fastfwd; + else if (ret > 0) { + if (ic->to_forward != CHN_INFINITE_FORWARD) + ic->to_forward -= ret; + ic->total += ret; + cur_read += ret; + ic->flags |= CF_READ_EVENT; + } + + if (sc_ep_test(sc, SE_FL_EOS | SE_FL_ERROR)) + goto end_recv; + + if (sc_ep_test(sc, SE_FL_WANT_ROOM)) + sc_need_room(sc, -1); + + if (sc_ep_test(sc, SE_FL_MAY_FASTFWD_PROD) && ic->to_forward) + goto done_recv; + } + + abort_fastfwd: + /* now we'll need an input buffer for the stream */ + if (!sc_alloc_ibuf(sc, &(__sc_strm(sc)->buffer_wait))) + goto end_recv; + + /* For an HTX stream, if the buffer is stuck (no output data with some + * input data) and if the HTX message is fragmented or if its free space + * wraps, we force an HTX defragmentation. It is a way to get a + * contiguous free space and to let the mux copy as much data as + * possible. + * + * NOTE: A possible optim may be to let the mux decide if a defrag is + * required or not, depending on the amount of data to be xferred. + */ + if (IS_HTX_STRM(__sc_strm(sc)) && !co_data(ic)) { + struct htx *htx = htxbuf(&ic->buf); + + if (htx_is_not_empty(htx) && ((htx->flags & HTX_FL_FRAGMENTED) || htx_space_wraps(htx))) + htx_defrag(htx, NULL, 0); + } + + /* Instruct the mux it must subscribe for read events */ + if (!(sc->flags & SC_FL_ISBACK) && /* for frontend conns only */ + (sc_opposite(sc)->state != SC_ST_INI) && /* before backend connection setup */ + (__sc_strm(sc)->be->options & PR_O_ABRT_CLOSE)) /* if abortonclose option is set for the current backend */ + flags |= CO_RFL_KEEP_RECV; + + /* Important note : if we're called with POLL_IN|POLL_HUP, it means the read polling + * was enabled, which implies that the recv buffer was not full. So we have a guarantee + * that if such an event is not handled above in splice, it will be handled here by + * recv(). + */ + while (sc_ep_test(sc, SE_FL_RCV_MORE) || + (!(conn->flags & CO_FL_HANDSHAKE) && + (!sc_ep_test(sc, SE_FL_ERROR | SE_FL_EOS)) && !(sc->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)))) { + int cur_flags = flags; + + /* Compute transient CO_RFL_* flags */ + if (co_data(ic)) { + cur_flags |= (CO_RFL_BUF_WET | CO_RFL_BUF_NOT_STUCK); + } + + /* <max> may be null. It is the mux's responsibility to set + * SE_FL_RCV_MORE on the SC if more space is needed. 
+ */ + max = channel_recv_max(ic); + ret = conn->mux->rcv_buf(sc, &ic->buf, max, cur_flags); + + if (sc_ep_test(sc, SE_FL_WANT_ROOM)) { + /* SE_FL_WANT_ROOM must not be reported if the channel's + * buffer is empty. + */ + BUG_ON(c_empty(ic)); + + sc_need_room(sc, channel_recv_max(ic) + 1); + /* Add READ_PARTIAL because some data are pending but + * cannot be xferred to the channel + */ + ic->flags |= CF_READ_EVENT; + sc_ep_report_read_activity(sc); + } + + if (ret <= 0) { + /* if we refrained from reading because we asked for a + * flush to satisfy rcv_pipe(), we must not subscribe + * and instead report that there's not enough room + * here to proceed. + */ + if (flags & CO_RFL_BUF_FLUSH) + sc_need_room(sc, -1); + break; + } + + cur_read += ret; + + /* if we're allowed to directly forward data, we must update ->o */ + if (ic->to_forward && !(sc_opposite(sc)->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED))) { + unsigned long fwd = ret; + if (ic->to_forward != CHN_INFINITE_FORWARD) { + if (fwd > ic->to_forward) + fwd = ic->to_forward; + ic->to_forward -= fwd; + } + c_adv(ic, fwd); + } + + ic->flags |= CF_READ_EVENT; + ic->total += ret; + + /* End-of-input reached, we can leave. In this case, it is + * important to break the loop to not block the SC because of + * the channel's policies.This way, we are still able to receive + * shutdowns. + */ + if (sc_ep_test(sc, SE_FL_EOI)) + break; + + if ((sc->flags & SC_FL_RCV_ONCE) || --read_poll <= 0) { + /* we don't expect to read more data */ + sc_wont_read(sc); + break; + } + + /* if too many bytes were missing from last read, it means that + * it's pointless trying to read again because the system does + * not have them in buffers. + */ + if (ret < max) { + /* if a streamer has read few data, it may be because we + * have exhausted system buffers. It's not worth trying + * again. + */ + if (ic->flags & CF_STREAMER) { + /* we're stopped by the channel's policy */ + sc_wont_read(sc); + break; + } + + /* if we read a large block smaller than what we requested, + * it's almost certain we'll never get anything more. + */ + if (ret >= global.tune.recv_enough) { + /* we're stopped by the channel's policy */ + sc_wont_read(sc); + break; + } + } + + /* if we are waiting for more space, don't try to read more data + * right now. + */ + if (sc->flags & (SC_FL_WONT_READ|SC_FL_NEED_BUFF|SC_FL_NEED_ROOM)) + break; + } /* while !flags */ + + done_recv: + if (!cur_read) + se_have_no_more_data(sc->sedesc); + else { + if ((ic->flags & (CF_STREAMER | CF_STREAMER_FAST)) && + (cur_read <= ic->buf.size / 2)) { + ic->xfer_large = 0; + ic->xfer_small++; + if (ic->xfer_small >= 3) { + /* we have read less than half of the buffer in + * one pass, and this happened at least 3 times. + * This is definitely not a streamer. + */ + ic->flags &= ~(CF_STREAMER | CF_STREAMER_FAST); + } + else if (ic->xfer_small >= 2) { + /* if the buffer has been at least half full twice, + * we receive faster than we send, so at least it + * is not a "fast streamer". + */ + ic->flags &= ~CF_STREAMER_FAST; + } + } + else if (!(ic->flags & CF_STREAMER_FAST) && (cur_read >= channel_data_limit(ic))) { + /* we read a full buffer at once */ + ic->xfer_small = 0; + ic->xfer_large++; + if (ic->xfer_large >= 3) { + /* we call this buffer a fast streamer if it manages + * to be filled in one call 3 consecutive times. 
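+ *
+ * (Editor's recap, not part of the patch: this whole block is a 3-strikes
+ * classifier — three half-empty reads in a row clear CF_STREAMER and
+ * CF_STREAMER_FAST, two of them already drop the FAST bit, three
+ * full-buffer reads in a row set both flags, and any other pattern
+ * resets both counters.)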
+ */ + ic->flags |= (CF_STREAMER | CF_STREAMER_FAST); + } + } + else { + ic->xfer_small = 0; + ic->xfer_large = 0; + } + ic->last_read = now_ms; + sc_ep_report_read_activity(sc); + } + + end_recv: + ret = (cur_read != 0); + + /* Report EOI on the channel if it was reached from the mux point of + * view. */ + if (sc_ep_test(sc, SE_FL_EOI) && !(sc->flags & SC_FL_EOI)) { + sc_ep_report_read_activity(sc); + sc->flags |= SC_FL_EOI; + ic->flags |= CF_READ_EVENT; + ret = 1; + } + + if (sc_ep_test(sc, SE_FL_EOS)) { + /* we received a shutdown */ + if (ic->flags & CF_AUTO_CLOSE) + sc_schedule_shutdown(sc_opposite(sc)); + sc_conn_eos(sc); + ret = 1; + } + + if (sc_ep_test(sc, SE_FL_ERROR)) { + sc->flags |= SC_FL_ERROR; + ret = 1; + } + else if (!cur_read && + !(sc->flags & (SC_FL_WONT_READ|SC_FL_NEED_BUFF|SC_FL_NEED_ROOM)) && + !(sc->flags & (SC_FL_EOS|SC_FL_ABRT_DONE))) { + /* Subscribe to receive events if we're blocking on I/O */ + conn->mux->subscribe(sc, SUB_RETRY_RECV, &sc->wait_event); + se_have_no_more_data(sc->sedesc); + } + else { + se_have_more_data(sc->sedesc); + ret = 1; + } + + return ret; +} + +/* This tries to perform a synchronous receive on the stream connector to + * try to collect last arrived data. In practice it's only implemented on + * stconns. Returns 0 if nothing was done, non-zero if new data or a + * shutdown were collected. This may result in some delayed receive calls + * being programmed and performed later, though it doesn't provide any + * such guarantee. + */ +int sc_conn_sync_recv(struct stconn *sc) +{ + if (!sc_state_in(sc->state, SC_SB_RDY|SC_SB_EST)) + return 0; + + if (!sc_mux_ops(sc)) + return 0; // only stconns are supported + + if (sc->wait_event.events & SUB_RETRY_RECV) + return 0; // already subscribed + + if (!sc_is_recv_allowed(sc)) + return 0; // already failed + + return sc_conn_recv(sc); +} + +/* + * This function is called to send buffer data to a stream socket. + * It calls the mux layer's snd_buf function. It relies on the + * caller to commit polling changes. The caller should check conn->flags + * for errors. Please do not statify this function, it's often present in + * backtraces, it's useful to recognize it. + */ +int sc_conn_send(struct stconn *sc) +{ + struct connection *conn = __sc_conn(sc); + struct stconn *sco = sc_opposite(sc); + struct stream *s = __sc_strm(sc); + struct channel *oc = sc_oc(sc); + int ret; + int did_send = 0; + + if (sc_ep_test(sc, SE_FL_ERROR | SE_FL_ERR_PENDING) || sc_is_conn_error(sc)) { + /* We're probably there because the tasklet was woken up, + * but process_stream() ran before, detected there was an + * error and put the SC back to SC_ST_TAR. 
There's still + * CO_FL_ERROR on the connection but we don't want to add + * SE_FL_ERROR back, so give up + */ + if (sc->state < SC_ST_CON) + return 0; + BUG_ON(sc_ep_test(sc, SE_FL_EOS|SE_FL_ERROR|SE_FL_ERR_PENDING) == (SE_FL_EOS|SE_FL_ERR_PENDING)); + return 1; + } + + /* We're already waiting to be able to send, give up */ + if (sc->wait_event.events & SUB_RETRY_SEND) + return 0; + + /* we might have been called just after an asynchronous shutw */ + if (sc->flags & SC_FL_SHUT_DONE) + return 1; + + /* we must wait because the mux is not installed yet */ + if (!conn->mux) + return 0; + + if (sc_ep_have_ff_data(sc)) { + unsigned int send_flag = 0; + + if ((!(sc->flags & (SC_FL_SND_ASAP|SC_FL_SND_NEVERWAIT)) && + ((oc->to_forward && oc->to_forward != CHN_INFINITE_FORWARD) || + (sc->flags & SC_FL_SND_EXP_MORE) || + (IS_HTX_STRM(s) && + (!(sco->flags & (SC_FL_EOI|SC_FL_EOS|SC_FL_ABRT_DONE)) && htx_expect_more(htxbuf(&oc->buf)))))) || + ((oc->flags & CF_ISRESP) && + (oc->flags & CF_AUTO_CLOSE) && + (sc->flags & SC_FL_SHUT_WANTED))) + send_flag |= CO_SFL_MSG_MORE; + + if (oc->flags & CF_STREAMER) + send_flag |= CO_SFL_STREAMER; + + ret = conn->mux->resume_fastfwd(sc, send_flag); + if (ret > 0) + did_send = 1; + + if (sc_ep_have_ff_data(sc)) + goto end; + } + + /* At this point, the pipe is empty, but we may still have data pending + * in the normal buffer. + */ + if (co_data(oc)) { + /* when we're here, we already know that there is no spliced + * data left, and that there are sendable buffered data. + */ + + /* check if we want to inform the kernel that we're interested in + * sending more data after this call. We want this if : + * - we're about to close after this last send and want to merge + * the ongoing FIN with the last segment. + * - we know we can't send everything at once and must get back + * here because of unaligned data + * - there is still a finite amount of data to forward + * The test is arranged so that the most common case does only 2 + * tests. + */ + unsigned int send_flag = 0; + + if ((!(sc->flags & (SC_FL_SND_ASAP|SC_FL_SND_NEVERWAIT)) && + ((oc->to_forward && oc->to_forward != CHN_INFINITE_FORWARD) || + (sc->flags & SC_FL_SND_EXP_MORE) || + (IS_HTX_STRM(s) && + (!(sco->flags & (SC_FL_EOI|SC_FL_EOS|SC_FL_ABRT_DONE)) && htx_expect_more(htxbuf(&oc->buf)))))) || + ((oc->flags & CF_ISRESP) && + (oc->flags & CF_AUTO_CLOSE) && + (sc->flags & SC_FL_SHUT_WANTED))) + send_flag |= CO_SFL_MSG_MORE; + + if (oc->flags & CF_STREAMER) + send_flag |= CO_SFL_STREAMER; + + if (s->txn && s->txn->flags & TX_L7_RETRY && !b_data(&s->txn->l7_buffer)) { + /* If we want to be able to do L7 retries, copy + * the data we're about to send, so that we are able + * to resend them if needed + */ + /* Try to allocate a buffer if we had none. + * If it fails, the next test will just + * disable the l7 retries by setting + * l7_conn_retries to 0. 
+ */ + if (s->txn->req.msg_state != HTTP_MSG_DONE) + s->txn->flags &= ~TX_L7_RETRY; + else { + if (b_alloc(&s->txn->l7_buffer) == NULL) + s->txn->flags &= ~TX_L7_RETRY; + else { + memcpy(b_orig(&s->txn->l7_buffer), + b_orig(&oc->buf), + b_size(&oc->buf)); + s->txn->l7_buffer.head = co_data(oc); + b_add(&s->txn->l7_buffer, co_data(oc)); + } + + } + } + + ret = conn->mux->snd_buf(sc, &oc->buf, co_data(oc), send_flag); + if (ret > 0) { + did_send = 1; + c_rew(oc, ret); + c_realign_if_empty(oc); + + if (!co_data(oc)) { + /* Always clear both flags once everything has been sent, they're one-shot */ + sc->flags &= ~(SC_FL_SND_ASAP|SC_FL_SND_EXP_MORE); + } + /* if some data remain in the buffer, it's only because the + * system buffers are full, we will try next time. + */ + } + } + + end: + if (did_send) { + oc->flags |= CF_WRITE_EVENT | CF_WROTE_DATA; + if (sc->state == SC_ST_CON) + sc->state = SC_ST_RDY; + } + + if (!sco->room_needed || (did_send && (sco->room_needed < 0 || channel_recv_max(sc_oc(sc)) >= sco->room_needed))) + sc_have_room(sco); + + if (sc_ep_test(sc, SE_FL_ERROR | SE_FL_ERR_PENDING)) { + oc->flags |= CF_WRITE_EVENT; + BUG_ON(sc_ep_test(sc, SE_FL_EOS|SE_FL_ERROR|SE_FL_ERR_PENDING) == (SE_FL_EOS|SE_FL_ERR_PENDING)); + if (sc_ep_test(sc, SE_FL_ERROR)) + sc->flags |= SC_FL_ERROR; + return 1; + } + + /* FIXME: Must be reviewed for FF */ + if (!co_data(oc) && !sc_ep_have_ff_data(sc)) { + if (did_send) + sc_ep_report_send_activity(sc); + /* If fast-forwarding is blocked, unblock it now to check for + * receive on the other side + */ + if (sc->sedesc->iobuf.flags & IOBUF_FL_FF_BLOCKED) { + sc->sedesc->iobuf.flags &= ~IOBUF_FL_FF_BLOCKED; + sc_have_room(sco); + did_send = 1; + } + } + else { + /* We couldn't send all of our data, let the mux know we'd like to send more */ + conn->mux->subscribe(sc, SUB_RETRY_SEND, &sc->wait_event); + if (sc_state_in(sc->state, SC_SB_EST|SC_SB_DIS|SC_SB_CLO)) + sc_ep_report_blocked_send(sc, did_send); + } + + return did_send; +} + +/* perform a synchronous send() for the stream connector. The CF_WRITE_EVENT + * flag is cleared prior to the attempt, and will possibly be updated in case + * of success. + */ +void sc_conn_sync_send(struct stconn *sc) +{ + struct channel *oc = sc_oc(sc); + + oc->flags &= ~CF_WRITE_EVENT; + + if (sc->flags & SC_FL_SHUT_DONE) + return; + + if (!co_data(oc)) + return; + + if (!sc_state_in(sc->state, SC_SB_CON|SC_SB_RDY|SC_SB_EST)) + return; + + if (!sc_mux_ops(sc)) + return; + + sc_conn_send(sc); +} + +/* Called by I/O handlers after completion. It propagates + * connection flags to the stream connector, updates the stream (which may or + * may not take this opportunity to try to forward data), then updates the + * connection's polling based on the channels and stream connector's final + * states. The function always returns 0. Please do not statify this function, + * it's often present in backtraces, it's useful to recognize it. + */ +int sc_conn_process(struct stconn *sc) +{ + struct connection *conn = __sc_conn(sc); + struct channel *ic = sc_ic(sc); + struct channel *oc = sc_oc(sc); + + BUG_ON(!conn); + + /* If we have data to send, try it now */ + if ((co_data(oc) || sc_ep_have_ff_data(sc)) && + !(sc->wait_event.events & SUB_RETRY_SEND)) + sc_conn_send(sc); + + /* First step, report to the stream connector what was detected at the + * connection layer : errors and connection establishment. 
+ * Only add SC_FL_ERROR if we're connected, or we're attempting to + * connect. We may get there because we got woken up, but only run + * after process_stream() noticed there was an error and decided + * to retry to connect; the connection may still have CO_FL_ERROR, + * and we don't want to add SC_FL_ERROR back + * + * Note: This test is only required because sc_conn_process is also the + * wake callback. Otherwise sc_conn_recv()/sc_conn_send() already take + * care of it. + */ + + if (sc->state >= SC_ST_CON) { + if (sc_is_conn_error(sc)) + sc->flags |= SC_FL_ERROR; + } + + /* If we had early data, and the handshake ended, then + * we can remove the flag, and attempt to wake the task up, + * in the event there's an analyser waiting for the end of + * the handshake. + */ + if (!(conn->flags & (CO_FL_WAIT_XPRT | CO_FL_EARLY_SSL_HS)) && + sc_ep_test(sc, SE_FL_WAIT_FOR_HS)) { + sc_ep_clr(sc, SE_FL_WAIT_FOR_HS); + task_wakeup(sc_strm_task(sc), TASK_WOKEN_MSG); + } + + if (!sc_state_in(sc->state, SC_SB_EST|SC_SB_DIS|SC_SB_CLO) && + (conn->flags & CO_FL_WAIT_XPRT) == 0) { + if (sc->flags & SC_FL_ISBACK) + __sc_strm(sc)->conn_exp = TICK_ETERNITY; + oc->flags |= CF_WRITE_EVENT; + if (sc->state == SC_ST_CON) + sc->state = SC_ST_RDY; + } + + /* Report EOS on the channel if it was reached from the mux point of + * view. + * + * Note: This test is only required because sc_conn_process is also the + * wake callback. Otherwise sc_conn_recv()/sc_conn_send() already take + * care of it. + */ + if (sc_ep_test(sc, SE_FL_EOS) && !(sc->flags & SC_FL_EOS)) { + /* we received a shutdown */ + if (ic->flags & CF_AUTO_CLOSE) + sc_schedule_shutdown(sc_opposite(sc)); + sc_conn_eos(sc); + } + + /* Report EOI on the channel if it was reached from the mux point of + * view. + * + * Note: This test is only required because sc_conn_process is also the + * wake callback. Otherwise sc_conn_recv()/sc_conn_send() already take + * care of it. + */ + if (sc_ep_test(sc, SE_FL_EOI) && !(sc->flags & SC_FL_EOI)) { + sc->flags |= SC_FL_EOI; + ic->flags |= CF_READ_EVENT; + sc_ep_report_read_activity(sc); + } + + if (sc_ep_test(sc, SE_FL_ERROR)) + sc->flags |= SC_FL_ERROR; + + /* Second step : update the stream connector and channels, try to forward any + * pending data, then possibly wake the stream up based on the new + * stream connector status. + */ + sc_notify(sc); + stream_release_buffers(__sc_strm(sc)); + return 0; +} + +/* This is the ->process() function for any stream connector's wait_event task. + * It's assigned during the stream connector's initialization, for any type of + * stream connector. Thus it is always safe to perform a tasklet_wakeup() on a + * stream connector, as the presence of the SC is checked there. + */ +struct task *sc_conn_io_cb(struct task *t, void *ctx, unsigned int state) +{ + struct stconn *sc = ctx; + int ret = 0; + + if (!sc_conn(sc)) + return t; + + if (!(sc->wait_event.events & SUB_RETRY_SEND) && (co_data(sc_oc(sc)) || sc_ep_have_ff_data(sc) || (sc->sedesc->iobuf.flags & IOBUF_FL_FF_BLOCKED))) + ret = sc_conn_send(sc); + if (!(sc->wait_event.events & SUB_RETRY_RECV)) + ret |= sc_conn_recv(sc); + if (ret != 0) + sc_conn_process(sc); + + stream_release_buffers(__sc_strm(sc)); + return t; +} + +/* + * This function propagates an end-of-stream received from an applet. It + * updates the stream connector. If it is already shut, the applet is + * released. Otherwise, we try to forward the shutdown, immediately or ASAP. 
+ +/* + * This function propagates an end-of-stream received from an applet. It + * updates the stream connector. If it is already shut, the applet is + * released. Otherwise, we try to forward the shutdown, immediately or ASAP. + */ +static void sc_applet_eos(struct stconn *sc) +{ + struct channel *ic = sc_ic(sc); + + BUG_ON(!sc_appctx(sc)); + + if (sc->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) + return; + sc->flags |= SC_FL_EOS; + ic->flags |= CF_READ_EVENT; + sc_ep_report_read_activity(sc); + + /* Note: on abort, we don't call the applet */ + + if (!sc_state_in(sc->state, SC_SB_CON|SC_SB_RDY|SC_SB_EST)) + return; + + if (sc->flags & SC_FL_SHUT_DONE) { + appctx_shut(__sc_appctx(sc)); + sc->state = SC_ST_DIS; + if (sc->flags & SC_FL_ISBACK) + __sc_strm(sc)->conn_exp = TICK_ETERNITY; + } + else if (sc_cond_forward_shut(sc)) + return sc_app_shut_applet(sc); +} + +/* Callback to be used by applet handlers upon completion. It updates the stream + * (which may or may not take this opportunity to try to forward data), then + * may re-enable the applet based on the channels and stream connector's final + * states. Please do not statify this function, it's often present in backtraces, + * it's useful to recognize it. + */ +int sc_applet_process(struct stconn *sc) +{ + struct channel *ic = sc_ic(sc); + + BUG_ON(!sc_appctx(sc)); + + /* Report EOI on the channel if it was reached from the applet point of + * view. */ + if (sc_ep_test(sc, SE_FL_EOI) && !(sc->flags & SC_FL_EOI)) { + sc_ep_report_read_activity(sc); + sc->flags |= SC_FL_EOI; + ic->flags |= CF_READ_EVENT; + } + + if (sc_ep_test(sc, SE_FL_ERROR)) + sc->flags |= SC_FL_ERROR; + + if (sc_ep_test(sc, SE_FL_EOS)) { + /* we received a shutdown */ + sc_applet_eos(sc); + } + + BUG_ON(sc_ep_test(sc, SE_FL_HAVE_NO_DATA|SE_FL_EOI) == SE_FL_EOI); + + /* If the applet wants to write and the channel is closed, it's a + * broken pipe and it must be reported. + */ + if (!sc_ep_test(sc, SE_FL_HAVE_NO_DATA) && (sc->flags & (SC_FL_EOS|SC_FL_ABRT_DONE))) + sc_ep_set(sc, SE_FL_ERROR); + + /* automatically mark the applet as having data available if it reported + * being blocked by the channel. + */ + if ((sc->flags & (SC_FL_WONT_READ|SC_FL_NEED_BUFF|SC_FL_NEED_ROOM)) || + sc_ep_test(sc, SE_FL_APPLET_NEED_CONN)) + applet_have_more_data(__sc_appctx(sc)); + + /* update the stream connector, channels, and possibly wake the stream up */ + sc_notify(sc); + stream_release_buffers(__sc_strm(sc)); + + /* sc_notify may have passed through chk_snd and released some blocking + * flags. Process_stream will consider those flags to wake up the + * appctx but in the case the task is not in runqueue we may have to + * wakeup the appctx immediately. + */ + if (sc_is_recv_allowed(sc) || sc_is_send_allowed(sc)) + appctx_wakeup(__sc_appctx(sc)); + return 0; +} + + +/* Prepares an endpoint upgrade. We don't know at this stage if the upgrade will + * succeed or not and if the stconn will be reused by the new endpoint. Thus, + * for now, only pretend the stconn is detached. + */ +void sc_conn_prepare_endp_upgrade(struct stconn *sc) +{ + BUG_ON(!sc_conn(sc) || !sc->app); + sc_ep_clr(sc, SE_FL_T_MUX); + sc_ep_set(sc, SE_FL_DETACHED); +} + +/* Endpoint upgrade failed. Restore the stconn state. */ +void sc_conn_abort_endp_upgrade(struct stconn *sc) +{ + sc_ep_set(sc, SE_FL_T_MUX); + sc_ep_clr(sc, SE_FL_DETACHED); +}
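[Editor's note: the helpers above and below form a prepare/abort/commit protocol around a mux upgrade. A sketch of how a caller is expected to sequence them; install_new_mux() is hypothetical and merely stands for whatever the actual upgrade step is.]

struct stconn;   /* opaque here */

void sc_conn_prepare_endp_upgrade(struct stconn *sc);
void sc_conn_abort_endp_upgrade(struct stconn *sc);
void sc_conn_commit_endp_upgrade(struct stconn *sc);

int install_new_mux(struct stconn *sc);   /* hypothetical upgrade step */

static int upgrade_endpoint(struct stconn *sc)
{
    sc_conn_prepare_endp_upgrade(sc);    /* pretend sc is detached */

    if (!install_new_mux(sc)) {
        sc_conn_abort_endp_upgrade(sc);  /* restore the previous state */
        return 0;
    }

    sc_conn_commit_endp_upgrade(sc);     /* finalize the detach if the new
                                          * endpoint did not reuse sc */
    return 1;
}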
+ +/* Commit the endpoint upgrade. If the stconn is attached, it means the new + * endpoint uses it, so we do nothing. Otherwise, the stconn will be destroyed + * with the overlying stream, so we must commit the detach. +*/ +void sc_conn_commit_endp_upgrade(struct stconn *sc) +{ + if (!sc_ep_test(sc, SE_FL_DETACHED)) + return; + sc_detach_endp(&sc); + /* Because it was already set as detached, the sedesc must be preserved */ + BUG_ON(!sc); + BUG_ON(!sc->sedesc); +} + +/* return the frontend or backend mux stream ID. + */ +static int +smp_fetch_sid(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct connection *conn; + struct stconn *sc; + int64_t sid = 0; + + if (!smp->strm) + return 0; + + sc = (kw[0] == 'f' ? smp->strm->scf : smp->strm->scb); + conn = sc_conn(sc); + + /* No connection */ + if (!conn) + return 0; + + /* No mux installed yet, this may change */ + if (!conn->mux) { + smp->flags |= SMP_F_MAY_CHANGE; + return 0; + } + + /* Without a mux sctl method, sid=0 is reported */ + if (conn->mux->sctl) { + if (conn->mux->sctl(sc, MUX_SCTL_SID, &sid) == -1) + return 0; + } + + smp->flags = SMP_F_VOL_TXN; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = sid; + + return 1; +} + +/* Note: must not be declared <const> as its list will be overwritten. + * Note: fetches that may return multiple types should be declared using the + * appropriate pseudo-type. If not available it must be declared as the lowest + * common denominator, the type that can be casted into all other ones. + */ +static struct sample_fetch_kw_list sample_fetch_keywords = {ILH, { + { "bs.id", smp_fetch_sid, 0, NULL, SMP_T_SINT, SMP_USE_L6REQ }, + { "fs.id", smp_fetch_sid, 0, NULL, SMP_T_STR, SMP_USE_L6RES }, + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &sample_fetch_keywords);
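[Editor's note: the bs.id and fs.id fetches registered above report the mux stream ID of the backend and frontend connection respectively (e.g. the H2 stream ID). An illustrative configuration use, not part of this patch:

    log-format "%ci:%cp fs_sid=%[fs.id] bs_sid=%[bs.id]"
]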
diff --git a/src/stick_table.c b/src/stick_table.c new file mode 100644 index 0000000..6427568 --- /dev/null +++ b/src/stick_table.c @@ -0,0 +1,5658 @@ +/* + * Stick tables management functions. + * + * Copyright 2009-2010 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr> + * Copyright (C) 2010 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <string.h> +#include <errno.h> + +#include <import/ebmbtree.h> +#include <import/ebsttree.h> +#include <import/ebistree.h> + +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/arg.h> +#include <haproxy/cfgparse.h> +#include <haproxy/cli.h> +#include <haproxy/dict.h> +#include <haproxy/errors.h> +#include <haproxy/global.h> +#include <haproxy/http_rules.h> +#include <haproxy/list.h> +#include <haproxy/log.h> +#include <haproxy/net_helper.h> +#include <haproxy/peers.h> +#include <haproxy/pool.h> +#include <haproxy/proto_tcp.h> +#include <haproxy/proxy.h> +#include <haproxy/sample.h> +#include <haproxy/sc_strm.h> +#include <haproxy/stats-t.h> +#include <haproxy/stconn.h> +#include <haproxy/stick_table.h> +#include <haproxy/stream.h> +#include <haproxy/task.h> +#include <haproxy/tcp_rules.h> +#include <haproxy/ticks.h> +#include <haproxy/tools.h> +#include <haproxy/xxhash.h> + + +/* structure used to return a table key built from a sample */ +static THREAD_LOCAL struct stktable_key static_table_key; +static int (*smp_fetch_src)(const struct arg *, struct sample *, const char *, void *); +struct pool_head *pool_head_stk_ctr __read_mostly = NULL; +struct stktable *stktables_list; +struct eb_root stktable_by_name = EB_ROOT; + +#define round_ptr_size(i) (((i) + (sizeof(void *) - 1)) &~ (sizeof(void *) - 1)) + +/* This function inserts stktable <t> into the tree of known stick-tables. + * The stick-table ID is used as the storing key so it must already have + * been initialized. + */ +void stktable_store_name(struct stktable *t) +{ + t->name.key = t->id; + ebis_insert(&stktable_by_name, &t->name); +} + +struct stktable *stktable_find_by_name(const char *name) +{ + struct ebpt_node *node; + struct stktable *t; + + node = ebis_lookup(&stktable_by_name, name); + if (node) { + t = container_of(node, struct stktable, name); + if (strcmp(t->id, name) == 0) + return t; + } + + return NULL; +} + +/* + * Free an allocated sticky session <ts>, and decrease sticky sessions counter + * in table <t>. It's safe to call it under or out of a lock. + */ +void __stksess_free(struct stktable *t, struct stksess *ts) +{ + HA_ATOMIC_DEC(&t->current); + pool_free(t->pool, (void *)ts - round_ptr_size(t->data_size)); +} + +/* + * Free an allocated sticky session <ts>, and decrease sticky sessions counter + * in table <t>. + * This function locks the table + */ +void stksess_free(struct stktable *t, struct stksess *ts) +{ + void *data; + data = stktable_data_ptr(t, ts, STKTABLE_DT_SERVER_KEY); + if (data) { + dict_entry_unref(&server_key_dict, stktable_data_cast(data, std_t_dict)); + stktable_data_cast(data, std_t_dict) = NULL; + } + HA_RWLOCK_RDLOCK(STK_TABLE_LOCK, &t->lock); + __stksess_free(t, ts); + HA_RWLOCK_RDUNLOCK(STK_TABLE_LOCK, &t->lock); +} + +/* + * Kill a stksess (only if its ref_cnt is zero). This must be called under the + * write lock. Returns zero if it could not be deleted, non-zero otherwise. + */ +int __stksess_kill(struct stktable *t, struct stksess *ts) +{ + if (HA_ATOMIC_LOAD(&ts->ref_cnt)) + return 0; + + eb32_delete(&ts->exp); + if (ts->upd.node.leaf_p) { + HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &t->updt_lock); + eb32_delete(&ts->upd); + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &t->updt_lock); + } + ebmb_delete(&ts->key); + __stksess_free(t, ts); + return 1; +} + +/* + * Decrease the refcount if decrefcnt is not 0, and try to kill the stksess. + * Returns non-zero if deleted, zero otherwise.
+ * This function locks the table + */ +int stksess_kill(struct stktable *t, struct stksess *ts, int decrefcnt) +{ + int ret; + + if (decrefcnt && HA_ATOMIC_SUB_FETCH(&ts->ref_cnt, 1) != 0) + return 0; + + HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &t->lock); + ret = __stksess_kill(t, ts); + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &t->lock); + + return ret; +} + +/* + * Initialize or update the key in the sticky session <ts> present in table <t> + * from the value present in <key>. + */ +void stksess_setkey(struct stktable *t, struct stksess *ts, struct stktable_key *key) +{ + if (t->type != SMP_T_STR) + memcpy(ts->key.key, key->key, t->key_size); + else { + memcpy(ts->key.key, key->key, MIN(t->key_size - 1, key->key_len)); + ts->key.key[MIN(t->key_size - 1, key->key_len)] = 0; + } +} + +/* return a shard number for key <key> of len <len> present in table <t>. This + * takes into account the presence or absence of a peers section with shards + * and the number of shards, the table's hash_seed, and of course the key. The + * caller must pass a valid <key> and <len>. The shard number to be used by the + * entry is returned (from 1 to nb_shards, otherwise 0 for none). + */ +int stktable_get_key_shard(struct stktable *t, const void *key, size_t len) +{ + /* no peers section or no shards in the peers section */ + if (!t->peers.p || !t->peers.p->nb_shards) + return 0; + + return XXH64(key, len, t->hash_seed) % t->peers.p->nb_shards + 1; +} + +/* + * Set the shard for <key> key of <ts> sticky session attached to <t> stick table. + * Use zero for stick-table without peers synchronisation. + */ +static void stksess_setkey_shard(struct stktable *t, struct stksess *ts, + struct stktable_key *key) +{ + size_t keylen; + + if (t->type == SMP_T_STR) + keylen = key->key_len; + else + keylen = t->key_size; + + ts->shard = stktable_get_key_shard(t, key->key, keylen); +} + +/* + * Init sticky session <ts> of table <t>. The data parts are cleared and <ts> + * is returned. + */ +static struct stksess *__stksess_init(struct stktable *t, struct stksess * ts) +{ + memset((void *)ts - t->data_size, 0, t->data_size); + ts->ref_cnt = 0; + ts->shard = 0; + ts->key.node.leaf_p = NULL; + ts->exp.node.leaf_p = NULL; + ts->upd.node.leaf_p = NULL; + ts->expire = tick_add(now_ms, MS_TO_TICKS(t->expire)); + HA_RWLOCK_INIT(&ts->lock); + return ts; +} + +/* + * Trash oldest <to_batch> sticky sessions from table <t> + * Returns number of trashed sticky sessions. It may actually trash less + * than expected if finding these requires too long a search time (e.g. + * most of them have ts->ref_cnt>0). + */ +int __stktable_trash_oldest(struct stktable *t, int to_batch) +{ + struct stksess *ts; + struct eb32_node *eb; + int max_search = to_batch * 2; // no more than 50% misses + int batched = 0; + int looped = 0; + + eb = eb32_lookup_ge(&t->exps, now_ms - TIMER_LOOK_BACK); + + while (batched < to_batch) { + + if (unlikely(!eb)) { + /* we might have reached the end of the tree, typically because + * <now_ms> is in the first half and we're first scanning the last + * half. Let's loop back to the beginning of the tree now if we + * have not yet visited it. 
+ */ + if (looped) + break; + looped = 1; + eb = eb32_first(&t->exps); + if (likely(!eb)) + break; + } + + if (--max_search < 0) + break; + + /* timer looks expired, detach it from the queue */ + ts = eb32_entry(eb, struct stksess, exp); + eb = eb32_next(eb); + + /* don't delete an entry which is currently referenced */ + if (HA_ATOMIC_LOAD(&ts->ref_cnt) != 0) + continue; + + eb32_delete(&ts->exp); + + if (ts->expire != ts->exp.key) { + if (!tick_isset(ts->expire)) + continue; + + ts->exp.key = ts->expire; + eb32_insert(&t->exps, &ts->exp); + + /* the update might have jumped beyond the next element, + * possibly causing a wrapping. We need to check whether + * the next element should be used instead. If the next + * element doesn't exist it means we're on the right + * side and have to check the first one then. If it + * exists and is closer, we must use it, otherwise we + * use the current one. + */ + if (!eb) + eb = eb32_first(&t->exps); + + if (!eb || tick_is_lt(ts->exp.key, eb->key)) + eb = &ts->exp; + + continue; + } + + /* session expired, trash it */ + ebmb_delete(&ts->key); + if (ts->upd.node.leaf_p) { + HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &t->updt_lock); + eb32_delete(&ts->upd); + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &t->updt_lock); + } + __stksess_free(t, ts); + batched++; + } + + return batched; +} + +/* + * Trash oldest <to_batch> sticky sessions from table <t> + * Returns number of trashed sticky sessions. + * This function locks the table + */ +int stktable_trash_oldest(struct stktable *t, int to_batch) +{ + int ret; + + HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &t->lock); + ret = __stktable_trash_oldest(t, to_batch); + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &t->lock); + + return ret; +} +/* + * Allocate and initialise a new sticky session. + * The new sticky session is returned or NULL in case of lack of memory. + * Sticky sessions should only be allocated this way, and must be freed using + * stksess_free(). Table <t>'s sticky session counter is increased. If <key> + * is not NULL, it is assigned to the new session. It must be called unlocked + * as it may rely on a lock to trash older entries. + */ +struct stksess *stksess_new(struct stktable *t, struct stktable_key *key) +{ + struct stksess *ts; + unsigned int current; + + current = HA_ATOMIC_FETCH_ADD(&t->current, 1); + + if (unlikely(current >= t->size)) { + /* the table was already full, we may have to purge entries */ + if (t->nopurge || !stktable_trash_oldest(t, (t->size >> 8) + 1)) { + HA_ATOMIC_DEC(&t->current); + return NULL; + } + } + + ts = pool_alloc(t->pool); + if (ts) { + ts = (void *)ts + round_ptr_size(t->data_size); + __stksess_init(t, ts); + if (key) { + stksess_setkey(t, ts, key); + stksess_setkey_shard(t, ts, key); + } + } + + return ts; +} + +/* + * Looks in table <t> for a sticky session matching key <key>. + * Returns pointer on requested sticky session or NULL if none was found. + */ +struct stksess *__stktable_lookup_key(struct stktable *t, struct stktable_key *key) +{ + struct ebmb_node *eb; + + if (t->type == SMP_T_STR) + eb = ebst_lookup_len(&t->keys, key->key, key->key_len+1 < t->key_size ? key->key_len : t->key_size-1); + else + eb = ebmb_lookup(&t->keys, key->key, t->key_size); + + if (unlikely(!eb)) { + /* no session found */ + return NULL; + } + + return ebmb_entry(eb, struct stksess, key); +} + +/* + * Looks in table <t> for a sticky session matching key <key>. + * Returns pointer on requested sticky session or NULL if none was found. 
+ * The refcount of the found entry is increased and this function + * is protected using the table lock + */ +struct stksess *stktable_lookup_key(struct stktable *t, struct stktable_key *key) +{ + struct stksess *ts; + + HA_RWLOCK_RDLOCK(STK_TABLE_LOCK, &t->lock); + ts = __stktable_lookup_key(t, key); + if (ts) + HA_ATOMIC_INC(&ts->ref_cnt); + HA_RWLOCK_RDUNLOCK(STK_TABLE_LOCK, &t->lock); + + return ts; +} + +/* + * Looks in table <t> for a sticky session with same key as <ts>. + * Returns pointer on requested sticky session or NULL if none was found. + */ +struct stksess *__stktable_lookup(struct stktable *t, struct stksess *ts) +{ + struct ebmb_node *eb; + + if (t->type == SMP_T_STR) + eb = ebst_lookup(&(t->keys), (char *)ts->key.key); + else + eb = ebmb_lookup(&(t->keys), ts->key.key, t->key_size); + + if (unlikely(!eb)) + return NULL; + + return ebmb_entry(eb, struct stksess, key); +} + +/* + * Looks in table <t> for a sticky session with same key as <ts>. + * Returns pointer on requested sticky session or NULL if none was found. + * The refcount of the found entry is increased and this function + * is protected using the table lock + */ +struct stksess *stktable_lookup(struct stktable *t, struct stksess *ts) +{ + struct stksess *lts; + + HA_RWLOCK_RDLOCK(STK_TABLE_LOCK, &t->lock); + lts = __stktable_lookup(t, ts); + if (lts) + HA_ATOMIC_INC(&lts->ref_cnt); + HA_RWLOCK_RDUNLOCK(STK_TABLE_LOCK, &t->lock); + + return lts; +}
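[Editor's note: stktable_touch_with_exp() below decides whether an entry still needs to be pushed to peers by comparing 32-bit update serials with the signed-difference idiom `(int)(a - b) >= 0`, which stays correct across counter wrap; the same idiom underlies the tick comparisons used throughout this file, and remote updates are keyed into the opposite half of the 32-bit space by the `+ 2147483648U` offset. A minimal runnable demonstration:]

#include <stdint.h>
#include <stdio.h>

/* Serial-number comparison as used on t->update/t->localupdate below:
 * "a is at or after b" even when the 32-bit counter has wrapped. */
static int serial_ge(uint32_t a, uint32_t b)
{
    return (int32_t)(a - b) >= 0;
}

int main(void)
{
    printf("%d\n", serial_ge(5, 3));             /* 1 */
    printf("%d\n", serial_ge(3, 5));             /* 0 */
    /* across a wrap: 2 is still "after" 0xfffffffe */
    printf("%d\n", serial_ge(2, 0xfffffffeU));   /* 1 */
    return 0;
}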
+ +/* Update the expiration timer for <ts> but do not touch its expiration node. + * The table's expiration timer is updated if set. + * The node will also be inserted into the update tree if needed, at a position + * that depends on whether the update is local or coming from a remote node. + * If <decrefcnt> is set, the ts entry's ref_cnt will be decremented. The table's + * write lock may be taken. + */ +void stktable_touch_with_exp(struct stktable *t, struct stksess *ts, int local, int expire, int decrefcnt) +{ + struct eb32_node * eb; + int use_wrlock = 0; + int do_wakeup = 0; + + if (expire != HA_ATOMIC_LOAD(&ts->expire)) { + /* we'll need to set the expiration and to wake up the expiration timer. */ + HA_ATOMIC_STORE(&ts->expire, expire); + stktable_requeue_exp(t, ts); + } + + /* If sync is enabled */ + if (t->sync_task) { + try_lock_again: + /* We'll need to reliably check that the entry is in the tree. + * It's only inserted/deleted using a write lock so a read lock + * is sufficient to verify this. We may then need to upgrade it + * to perform an update (which is rare under load), and if the + * upgrade fails, we'll try again with a write lock directly. + */ + if (use_wrlock) + HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &t->updt_lock); + else + HA_RWLOCK_RDLOCK(STK_TABLE_LOCK, &t->updt_lock); + + if (local) { + /* Check if this entry is not in the tree or not + * scheduled for at least one peer. + */ + if (!ts->upd.node.leaf_p + || (int)(t->commitupdate - ts->upd.key) >= 0 + || (int)(ts->upd.key - t->localupdate) >= 0) { + /* Time to upgrade the read lock to write lock if needed */ + if (!use_wrlock) { + if (HA_RWLOCK_TRYRDTOSK(STK_TABLE_LOCK, &t->updt_lock) != 0) { + /* failed, try again */ + HA_RWLOCK_RDUNLOCK(STK_TABLE_LOCK, &t->updt_lock); + use_wrlock = 1; + goto try_lock_again; + } + HA_RWLOCK_SKTOWR(STK_TABLE_LOCK, &t->updt_lock); + use_wrlock = 1; + } + + /* here we're write-locked */ + + ts->upd.key = ++t->update; + t->localupdate = t->update; + eb32_delete(&ts->upd); + eb = eb32_insert(&t->updates, &ts->upd); + if (eb != &ts->upd) { + eb32_delete(eb); + eb32_insert(&t->updates, &ts->upd); + } + } + do_wakeup = 1; + } + else { + /* If this entry is not in the tree */ + + if (!ts->upd.node.leaf_p) { + /* Time to upgrade the read lock to write lock if needed */ + if (!use_wrlock) { + if (HA_RWLOCK_TRYRDTOSK(STK_TABLE_LOCK, &t->updt_lock) != 0) { + /* failed, try again */ + HA_RWLOCK_RDUNLOCK(STK_TABLE_LOCK, &t->updt_lock); + use_wrlock = 1; + goto try_lock_again; + } + HA_RWLOCK_SKTOWR(STK_TABLE_LOCK, &t->updt_lock); + use_wrlock = 1; + } + + /* here we're write-locked */ + + ts->upd.key = (++t->update) + (2147483648U); + eb = eb32_insert(&t->updates, &ts->upd); + if (eb != &ts->upd) { + eb32_delete(eb); + eb32_insert(&t->updates, &ts->upd); + } + } + } + + /* drop the lock now */ + if (use_wrlock) + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &t->updt_lock); + else + HA_RWLOCK_RDUNLOCK(STK_TABLE_LOCK, &t->updt_lock); + } + + if (decrefcnt) + HA_ATOMIC_DEC(&ts->ref_cnt); + + if (do_wakeup) + task_wakeup(t->sync_task, TASK_WOKEN_MSG); +} + +/* Update the expiration timer for <ts> but do not touch its expiration node. + * The expiration date already carried by <ts> (coming from the remote node) is + * preserved. + * The node will also be inserted into the update tree if needed, at a position + * reflecting that the update is coming from a remote node + */ +void stktable_touch_remote(struct stktable *t, struct stksess *ts, int decrefcnt) +{ + stktable_touch_with_exp(t, ts, 0, ts->expire, decrefcnt); +} + +/* Update the expiration timer for <ts> but do not touch its expiration node. + * The table's expiration timer is updated using the date of expiration coming from + * <t> stick-table configuration. + * The node will also be inserted into the update tree if needed, at a position + * reflecting that the update was made locally + */ +void stktable_touch_local(struct stktable *t, struct stksess *ts, int decrefcnt) +{ + int expire = tick_add(now_ms, MS_TO_TICKS(t->expire)); + + stktable_touch_with_exp(t, ts, 1, expire, decrefcnt); +} + +/* Just decrease the ref_cnt of the current session. Does nothing if <ts> is NULL. + * Note that we still need to take the read lock because a number of other places + * (including in Lua and peers) update the ref_cnt non-atomically under the write + * lock. + */ +static void stktable_release(struct stktable *t, struct stksess *ts) +{ + if (!ts) + return; + HA_ATOMIC_DEC(&ts->ref_cnt); +}
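[Editor's note: stktable_get_entry() below allocates a node without holding any lock, inserts it under the write lock, and when __stktable_store() returns an already-present node it frees its own copy and adopts the winner. The same lose-the-race-and-adopt pattern, reduced to a single slot, as a runnable C11 sketch:]

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* One-slot "table": insert-if-absent returns the winning pointer,
 * mirroring __stktable_store() returning the node already present. */
static _Atomic(int *) slot;

static int *get_entry(int val)
{
    int *mine = malloc(sizeof(*mine));
    int *expected = NULL;

    *mine = val;
    if (!atomic_compare_exchange_strong(&slot, &expected, mine)) {
        /* somebody else inserted in the meantime: use theirs */
        free(mine);
        return expected;
    }
    return mine;
}

int main(void)
{
    int *a = get_entry(1);
    int *b = get_entry(2);   /* loses the race: gets the same node */
    printf("%d %d %s\n", *a, *b, a == b ? "shared" : "distinct");
    free(a);
    return 0;
}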
+ +/* Insert new sticky session <ts> in the table. It is assumed that it does not + * yet exist (the caller must check this). The table's timeout is updated if it + * is set. <ts> is returned if properly inserted, otherwise the one already + * present if any. + */ +struct stksess *__stktable_store(struct stktable *t, struct stksess *ts) +{ + struct ebmb_node *eb; + + eb = ebmb_insert(&t->keys, &ts->key, t->key_size); + if (likely(eb == &ts->key)) { + ts->exp.key = ts->expire; + eb32_insert(&t->exps, &ts->exp); + } + return ebmb_entry(eb, struct stksess, key); // most commonly this is <ts> +} + +/* requeues the table's expiration task to take the recently added <ts> into + * account. The task's expiration date is updated atomically, under the table's + * write lock. + */ +void stktable_requeue_exp(struct stktable *t, const struct stksess *ts) +{ + int old_exp, new_exp; + int expire = ts->expire; + + if (!t->expire) + return; + + /* set the task's expire to the earliest expiration date. */ + old_exp = HA_ATOMIC_LOAD(&t->exp_task->expire); + new_exp = tick_first(expire, old_exp); + + /* let's not go further if we're already up to date */ + if (new_exp == old_exp) + return; + + HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &t->lock); + + while (new_exp != old_exp && + !HA_ATOMIC_CAS(&t->exp_task->expire, &old_exp, new_exp)) { + __ha_cpu_relax(); + new_exp = tick_first(expire, old_exp); + } + + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &t->lock); + + task_queue(t->exp_task); +} + +/* Returns a valid or initialized stksess for the specified stktable_key in the + * specified table, or NULL if the key was NULL, or if no entry was found nor + * could be created. The entry's expiration is updated. This function locks the + * table, and the refcount of the entry is increased. + */ +struct stksess *stktable_get_entry(struct stktable *table, struct stktable_key *key) +{ + struct stksess *ts, *ts2; + + if (!key) + return NULL; + + ts = stktable_lookup_key(table, key); + if (ts) + return ts; + + /* No such entry exists, let's try to create a new one. this doesn't + * require locking yet. + */ + + ts = stksess_new(table, key); + if (!ts) + return NULL; + + /* Now we're certain to have a ts. We need to store it. For this we'll + * need exclusive access. We don't need an atomic upgrade, this is + * rare and an unlock+lock sequence will do the job fine. Given that + * this will not be atomic, the missing entry might appear in the + * meantime so we have to be careful that the one we try to insert is + * the one we find. + */ + + HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &table->lock); + + ts2 = __stktable_store(table, ts); + + HA_ATOMIC_INC(&ts2->ref_cnt); + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &table->lock); + + if (unlikely(ts2 != ts)) { + /* another entry was added in the meantime, let's + * switch to it. + */ + __stksess_free(table, ts); + ts = ts2; + } + + stktable_requeue_exp(table, ts); + return ts; +} + +/* Looks up an entry with the same key and stores the submitted + * stksess if not found. This function locks the table either shared or + * exclusively, and the refcount of the entry is increased.
+ */ +struct stksess *stktable_set_entry(struct stktable *table, struct stksess *nts) +{ + struct stksess *ts; + + HA_RWLOCK_RDLOCK(STK_TABLE_LOCK, &table->lock); + ts = __stktable_lookup(table, nts); + if (ts) { + HA_ATOMIC_INC(&ts->ref_cnt); + HA_RWLOCK_RDUNLOCK(STK_TABLE_LOCK, &table->lock); + return ts; + } + ts = nts; + + /* let's increment it before switching to exclusive */ + HA_ATOMIC_INC(&ts->ref_cnt); + + if (HA_RWLOCK_TRYRDTOSK(STK_TABLE_LOCK, &table->lock) != 0) { + /* upgrade to seek lock failed, let's drop and take */ + HA_RWLOCK_RDUNLOCK(STK_TABLE_LOCK, &table->lock); + HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &table->lock); + } + else + HA_RWLOCK_SKTOWR(STK_TABLE_LOCK, &table->lock); + + /* now we're write-locked */ + + __stktable_store(table, ts); + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &table->lock); + + stktable_requeue_exp(table, ts); + return ts; +} + +/* + * Task processing function to trash expired sticky sessions. A pointer to the + * task itself is returned since it never dies. + */ +struct task *process_table_expire(struct task *task, void *context, unsigned int state) +{ + struct stktable *t = context; + struct stksess *ts; + struct eb32_node *eb; + int updt_locked = 0; + int looped = 0; + int exp_next; + + HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &t->lock); + eb = eb32_lookup_ge(&t->exps, now_ms - TIMER_LOOK_BACK); + + while (1) { + if (unlikely(!eb)) { + /* we might have reached the end of the tree, typically because + * <now_ms> is in the first half and we're first scanning the last + * half. Let's loop back to the beginning of the tree now if we + * have not yet visited it. + */ + if (looped) + break; + looped = 1; + eb = eb32_first(&t->exps); + if (likely(!eb)) + break; + } + + if (likely(tick_is_lt(now_ms, eb->key))) { + /* timer not expired yet, revisit it later */ + exp_next = eb->key; + goto out_unlock; + } + + /* timer looks expired, detach it from the queue */ + ts = eb32_entry(eb, struct stksess, exp); + eb = eb32_next(eb); + + /* don't delete an entry which is currently referenced */ + if (HA_ATOMIC_LOAD(&ts->ref_cnt) != 0) + continue; + + eb32_delete(&ts->exp); + + if (!tick_is_expired(ts->expire, now_ms)) { + if (!tick_isset(ts->expire)) + continue; + + ts->exp.key = ts->expire; + eb32_insert(&t->exps, &ts->exp); + + /* the update might have jumped beyond the next element, + * possibly causing a wrapping. We need to check whether + * the next element should be used instead. If the next + * element doesn't exist it means we're on the right + * side and have to check the first one then. If it + * exists and is closer, we must use it, otherwise we + * use the current one. + */ + if (!eb) + eb = eb32_first(&t->exps); + + if (!eb || tick_is_lt(ts->exp.key, eb->key)) + eb = &ts->exp; + continue; + } + + /* session expired, trash it */ + ebmb_delete(&ts->key); + if (ts->upd.node.leaf_p) { + if (!updt_locked) { + updt_locked = 1; + HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &t->updt_lock); + } + eb32_delete(&ts->upd); + } + __stksess_free(t, ts); + } + + /* We have found no task to expire in any tree */ + exp_next = TICK_ETERNITY; + +out_unlock: + task->expire = exp_next; + if (updt_locked) + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &t->updt_lock); + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &t->lock); + return task; +} + +/* Perform minimal stick table initialization. In case of error, the + * function will return 0 and <err_msg> will contain hints about the + * error and it is up to the caller to free it. 
+ * + * Returns 1 on success + */ +int stktable_init(struct stktable *t, char **err_msg) +{ + int peers_retval = 0; + + t->hash_seed = XXH64(t->id, t->idlen, 0); + + if (t->size) { + t->keys = EB_ROOT_UNIQUE; + memset(&t->exps, 0, sizeof(t->exps)); + t->updates = EB_ROOT_UNIQUE; + HA_RWLOCK_INIT(&t->lock); + + t->pool = create_pool("sticktables", sizeof(struct stksess) + round_ptr_size(t->data_size) + t->key_size, MEM_F_SHARED); + + if (t->expire) { + t->exp_task = task_new_anywhere(); + if (!t->exp_task) + goto mem_error; + t->exp_task->process = process_table_expire; + t->exp_task->context = (void *)t; + } + if (t->peers.p && t->peers.p->peers_fe && !(t->peers.p->peers_fe->flags & (PR_FL_DISABLED|PR_FL_STOPPED))) { + peers_retval = peers_register_table(t->peers.p, t); + } + + if (t->pool == NULL || peers_retval) + goto mem_error; + } + if (t->write_to.name) { + struct stktable *table; + + /* postresolve write_to table */ + table = stktable_find_by_name(t->write_to.name); + if (!table) { + memprintf(err_msg, "write-to: table '%s' doesn't exist", t->write_to.name); + ha_free(&t->write_to.name); /* no longer need this */ + return 0; + } + ha_free(&t->write_to.name); /* no longer need this */ + if (table->write_to.ptr) { + memprintf(err_msg, "write-to: table '%s' is already used as a source table", table->id); + return 0; + } + if (table->type != t->type) { + memprintf(err_msg, "write-to: cannot mix table types ('%s' has '%s' type and '%s' has '%s' type)", + table->id, stktable_types[table->type].kw, + t->id, stktable_types[t->type].kw); + return 0; + } + if (table->key_size != t->key_size) { + memprintf(err_msg, "write-to: cannot mix key sizes ('%s' has '%ld' key_size and '%s' has '%ld' key_size)", + table->id, (long)table->key_size, + t->id, (long)t->key_size); + return 0; + } + + t->write_to.t = table; + } + return 1; + + mem_error: + memprintf(err_msg, "memory allocation error"); + return 0; +} + +/* Performs stick table cleanup: it's meant to be called after the table + * has been initialized with stktable_init(), else it will lead to undefined + * behavior. + * + * However it does not free the table pointer itself + */ +void stktable_deinit(struct stktable *t) +{ + if (!t) + return; + task_destroy(t->exp_task); + pool_destroy(t->pool); +} + +/* + * Configuration keywords of known table types + */ +struct stktable_type stktable_types[SMP_TYPES] = { + [SMP_T_SINT] = { "integer", 0, 4 }, + [SMP_T_IPV4] = { "ip", 0, 4 }, + [SMP_T_IPV6] = { "ipv6", 0, 16 }, + [SMP_T_STR] = { "string", STK_F_CUSTOM_KEYSIZE, 32 }, + [SMP_T_BIN] = { "binary", STK_F_CUSTOM_KEYSIZE, 32 } +};
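[Editor's note: a stksess is allocated with its extra data placed before the structure: pool objects are sized sizeof(struct stksess) + round_ptr_size(data_size) + key_size (see stktable_init() above), stksess_new() returns the address past the aligned data block, and each data type is reached through the negative offset recorded in t->data_ofs[] by stktable_alloc_data_type() below. A runnable sketch of that arithmetic, simplified to ignore key storage:]

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define round_ptr_size(i) (((i) + (sizeof(void *) - 1)) & ~(sizeof(void *) - 1))

struct sess { unsigned key; };   /* stand-in for struct stksess */

int main(void)
{
    size_t data_size = 12;   /* e.g. three 32-bit counters */
    void *blob = malloc(round_ptr_size(data_size) + sizeof(struct sess));

    /* the session pointer lives past the (aligned) data block,
     * exactly like ts = pool_alloc() + round_ptr_size(data_size) */
    struct sess *ts = (void *)((char *)blob + round_ptr_size(data_size));
    memset((char *)ts - data_size, 0, data_size);   /* as __stksess_init() */

    int ofs = -(int)data_size;                      /* like t->data_ofs[type] */
    unsigned *ctr0 = (unsigned *)((char *)ts + ofs);
    *ctr0 = 42;
    printf("counter 0 = %u\n", *ctr0);

    free(blob);                                     /* as __stksess_free() */
    return 0;
}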
+ +/* + * Parse table type configuration. + * Returns 0 on successful parsing, else 1. + * <myidx> is set to the next configuration <args> index. + */ +int stktable_parse_type(char **args, int *myidx, unsigned long *type, size_t *key_size, const char *file, int linenum) +{ + for (*type = 0; *type < SMP_TYPES; (*type)++) { + if (!stktable_types[*type].kw) + continue; + if (strcmp(args[*myidx], stktable_types[*type].kw) != 0) + continue; + + *key_size = stktable_types[*type].default_size; + (*myidx)++; + + if (stktable_types[*type].flags & STK_F_CUSTOM_KEYSIZE) { + if (strcmp("len", args[*myidx]) == 0) { + char *stop; + + (*myidx)++; + *key_size = strtol(args[*myidx], &stop, 10); + if (*stop != '\0' || !*key_size) { + ha_alert("parsing [%s:%d] : 'len' expects a positive integer argument.\n", file, linenum); + return 1; + } + if (*type == SMP_T_STR) { + /* null terminated string needs +1 for '\0'. */ + (*key_size)++; + } + (*myidx)++; + } + } + return 0; + } + ha_alert("parsing [%s:%d] : %s: unknown type '%s'.\n", file, linenum, args[0], args[*myidx]); + return 1; +} + +/* Reserve some space for data type <type>. There are two optional arguments + * at <sa> and <sa2> to configure this data type; they can be NULL if unused + * for a given type. + * Returns PE_NONE (0) if OK or an error code among : + * - PE_ENUM_OOR if <type> does not exist + * - PE_EXIST if <type> is already registered + * - PE_ARG_NOT_USED if <sa>/<sa2> was provided but not expected + * - PE_ARG_MISSING if <sa>/<sa2> was expected but not provided + * - PE_ARG_VALUE_OOR if type is an array and <sa> is out of the allowed array size range. + */ +int stktable_alloc_data_type(struct stktable *t, int type, const char *sa, const char *sa2) +{ + if (type >= STKTABLE_DATA_TYPES) + return PE_ENUM_OOR; + + if (t->data_ofs[type]) + /* already allocated */ + return PE_EXIST; + + t->data_nbelem[type] = 1; + if (stktable_data_types[type].is_array) { + /* arrays take their element count on first argument */ + if (!sa) + return PE_ARG_MISSING; + t->data_nbelem[type] = atoi(sa); + if (!t->data_nbelem[type] || (t->data_nbelem[type] > STKTABLE_MAX_DT_ARRAY_SIZE)) + return PE_ARG_VALUE_OOR; + sa = sa2; + } + + switch (stktable_data_types[type].arg_type) { + case ARG_T_NONE: + if (sa) + return PE_ARG_NOT_USED; + break; + case ARG_T_INT: + if (!sa) + return PE_ARG_MISSING; + t->data_arg[type].i = atoi(sa); + break; + case ARG_T_DELAY: + if (!sa) + return PE_ARG_MISSING; + sa = parse_time_err(sa, &t->data_arg[type].u, TIME_UNIT_MS); + if (sa) + return PE_ARG_INVC; /* invalid char */ + break; + } + + t->data_size += t->data_nbelem[type] * stktable_type_size(stktable_data_types[type].std_type); + t->data_ofs[type] = -t->data_size; + return PE_NONE; +}
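[Editor's note: for illustration, a "store gpc(3),conn_rate(10s)" declaration handled by parse_stick_table() below ends up as calls of this form; this is a sketch only, with error handling omitted:]

/* "gpc(3)": array type, first argument is the element count */
stktable_alloc_data_type(t, STKTABLE_DT_GPC, "3", NULL);
/* "conn_rate(10s)": ARG_T_DELAY argument parsed by parse_time_err() */
stktable_alloc_data_type(t, STKTABLE_DT_CONN_RATE, "10s", NULL);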
+ +/* + * Parse a line with <linenum> as number in <file> configuration file to configure + * the stick-table with <t> as address and <id> as ID. + * <peers> provides the "peers" section pointer only if this function is called + * from a "peers" section. + * <nid> is the stick-table name which is sent over the network. It must be equal + * to <id> if this stick-table is parsed from a proxy section, and prefixed by <peers> + * "peers" section name followed by a '/' character if parsed from a "peers" section. + * It is the responsibility of the caller to check this. + * Return an error status with ERR_* flags set if required, 0 if no error was encountered. + */ +int parse_stick_table(const char *file, int linenum, char **args, + struct stktable *t, char *id, char *nid, struct peers *peers) +{ + int err_code = 0; + int idx = 1; + unsigned int val; + + if (!id || !*id) { + ha_alert("parsing [%s:%d] : %s: ID not provided.\n", file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_ABORT; + goto out; + } + + /* Store the "peers" section if this function is called from a "peers" section. */ + if (peers) { + t->peers.p = peers; + idx++; + } + + t->id = id; + t->idlen = strlen(id); + t->nid = nid; + t->type = (unsigned int)-1; + t->conf.file = file; + t->conf.line = linenum; + t->write_to.name = NULL; + + while (*args[idx]) { + const char *err; + + if (strcmp(args[idx], "size") == 0) { + idx++; + if (!*(args[idx])) { + ha_alert("parsing [%s:%d] : %s: missing argument after '%s'.\n", + file, linenum, args[0], args[idx-1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if ((err = parse_size_err(args[idx], &t->size))) { + ha_alert("parsing [%s:%d] : %s: unexpected character '%c' in argument of '%s'.\n", + file, linenum, args[0], *err, args[idx-1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + idx++; + } + /* This argument does not exist in a "peers" section. */ + else if (!peers && strcmp(args[idx], "peers") == 0) { + idx++; + if (!*(args[idx])) { + ha_alert("parsing [%s:%d] : %s: missing argument after '%s'.\n", + file, linenum, args[0], args[idx-1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + ha_free(&t->peers.name); + t->peers.name = strdup(args[idx++]); + } + else if (strcmp(args[idx], "expire") == 0) { + idx++; + if (!*(args[idx])) { + ha_alert("parsing [%s:%d] : %s: missing argument after '%s'.\n", + file, linenum, args[0], args[idx-1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + err = parse_time_err(args[idx], &val, TIME_UNIT_MS); + if (err == PARSE_TIME_OVER) { + ha_alert("parsing [%s:%d]: %s: timer overflow in argument <%s> to <%s>, maximum value is 2147483647 ms (~24.8 days).\n", + file, linenum, args[0], args[idx], args[idx-1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (err == PARSE_TIME_UNDER) { + ha_alert("parsing [%s:%d]: %s: timer underflow in argument <%s> to <%s>, minimum non-null value is 1 ms.\n", + file, linenum, args[0], args[idx], args[idx-1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (err) { + ha_alert("parsing [%s:%d] : %s: unexpected character '%c' in argument of '%s'.\n", + file, linenum, args[0], *err, args[idx-1]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + t->expire = val; + idx++; + } + else if (strcmp(args[idx], "nopurge") == 0) { + t->nopurge = 1; + idx++; + } + else if (strcmp(args[idx], "type") == 0) { + idx++; + if (stktable_parse_type(args, &idx, &t->type, &t->key_size, file, linenum) != 0) { + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + /* idx already points to next arg */ + } + else if (strcmp(args[idx], "store") == 0) { + int type, err; + char *cw, *nw, *sa, *sa2; + + idx++; + nw = args[idx]; + while (*nw) { + /* the "store" keyword supports a comma-separated list */ + cw = nw; + sa = NULL; /* store arg */ + sa2 = NULL; + while (*nw && *nw != ',') { + if (*nw == '(') { + *nw = 0; + sa = ++nw; + while (*nw != ')') { + if (!*nw) { + ha_alert("parsing [%s:%d] : %s: missing closing parenthesis after store option '%s'.\n", + file, linenum, args[0], cw); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + if (*nw == ',') { + *nw = '\0'; + sa2 = nw + 1; + } + nw++; + } + *nw = '\0'; + } + nw++; + } + if (*nw) + *nw++ = '\0'; + type = stktable_get_data_type(cw); + if (type < 0) { + ha_alert("parsing [%s:%d] : %s: unknown store option '%s'.\n", + file, linenum, args[0], cw); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + err = stktable_alloc_data_type(t, type, sa, sa2); + switch (err) { + case PE_NONE: break; + case PE_EXIST: + ha_warning("parsing [%s:%d]: %s: store option '%s' already enabled, ignored.\n", + file, linenum, args[0],
cw); + err_code |= ERR_WARN; + break; + + case PE_ARG_MISSING: + ha_alert("parsing [%s:%d] : %s: missing argument to store option '%s'.\n", + file, linenum, args[0], cw); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + + case PE_ARG_NOT_USED: + ha_alert("parsing [%s:%d] : %s: unexpected argument to store option '%s'.\n", + file, linenum, args[0], cw); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + case PE_ARG_VALUE_OOR: + ha_alert("parsing [%s:%d] : %s: array size is out of allowed range (1-%d) for store option '%s'.\n", + file, linenum, args[0], STKTABLE_MAX_DT_ARRAY_SIZE, cw); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + + default: + ha_alert("parsing [%s:%d] : %s: error when processing store option '%s'.\n", + file, linenum, args[0], cw); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + idx++; + if (t->data_ofs[STKTABLE_DT_GPT] && t->data_ofs[STKTABLE_DT_GPT0]) { + ha_alert("parsing [%s:%d] : %s: simultaneous usage of 'gpt' and 'gpt0' in the same table is not permitted as 'gpt' overrides 'gpt0'.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (t->data_ofs[STKTABLE_DT_GPC] && (t->data_ofs[STKTABLE_DT_GPC0] || t->data_ofs[STKTABLE_DT_GPC1])) { + ha_alert("parsing [%s:%d] : %s: simultaneous usage of 'gpc' and 'gpc[0/1]' in the same table is not permitted as 'gpc' overrides 'gpc[0/1]'.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + else if (t->data_ofs[STKTABLE_DT_GPC_RATE] && (t->data_ofs[STKTABLE_DT_GPC0_RATE] || t->data_ofs[STKTABLE_DT_GPC1_RATE])) { + ha_alert("parsing [%s:%d] : %s: simultaneous usage of 'gpc_rate' and 'gpc[0/1]_rate' in the same table is not permitted as 'gpc_rate' overrides 'gpc[0/1]_rate'.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + else if (strcmp(args[idx], "srvkey") == 0) { + char *keytype; + idx++; + keytype = args[idx]; + if (strcmp(keytype, "name") == 0) { + t->server_key_type = STKTABLE_SRV_NAME; + } + else if (strcmp(keytype, "addr") == 0) { + t->server_key_type = STKTABLE_SRV_ADDR; + } + else { + ha_alert("parsing [%s:%d] : %s : unknown server key type '%s'.\n", + file, linenum, args[0], keytype); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + + } + idx++; + } + else if (strcmp(args[idx], "write-to") == 0) { + char *write_to; + + idx++; + write_to = args[idx]; + if (!write_to[0]) { + ha_alert("parsing [%s:%d] : %s : write-to requires a table name.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + + } + ha_free(&t->write_to.name); + t->write_to.name = strdup(write_to); + idx++; + } + else { + ha_alert("parsing [%s:%d] : %s: unknown argument '%s'.\n", + file, linenum, args[0], args[idx]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + } + + if (!t->size) { + ha_alert("parsing [%s:%d] : %s: missing size.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + if (t->type == (unsigned int)-1) { + ha_alert("parsing [%s:%d] : %s: missing type.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + + out: + return err_code; +}
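[Editor's note: the "store" branch above tokenizes its comma-separated list in place, turning '(', ')' and ',' into NUL terminators while remembering up to two arguments per element. A standalone, runnable reduction of the same technique, simplified to a single argument per element:]

#include <stdio.h>

/* Split "name(arg)[,name(arg)...]" in place, as the "store" parser
 * above does with cw/sa: separators become NUL bytes. */
int main(void)
{
    char spec[] = "gpc(3),conn_rate(10s),conn_cur";
    char *nw = spec;

    while (*nw) {
        char *cw = nw;          /* current word */
        char *sa = NULL;        /* its optional argument */

        while (*nw && *nw != ',') {
            if (*nw == '(') {
                *nw = 0;        /* terminate the option name */
                sa = ++nw;
                while (*nw && *nw != ')')
                    nw++;
                if (*nw)
                    *nw = 0;    /* terminate the argument */
            }
            nw++;
        }
        if (*nw)
            *nw++ = 0;          /* terminate this list element */
        printf("option '%s' arg '%s'\n", cw, sa ? sa : "(none)");
    }
    return 0;
}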
+ +/* Prepares a stktable_key from a sample <smp> to search into table <t>. + * Note that the sample *is* modified and that the returned key may point + * to it, so the sample must not be modified afterwards before the lookup. + * Returns NULL if the sample could not be converted (eg: no matching type), + * otherwise a pointer to the static stktable_key filled with what is needed + * for the lookup. + */ +struct stktable_key *smp_to_stkey(struct sample *smp, struct stktable *t) +{ + /* Convert sample. */ + if (!sample_convert(smp, t->type)) + return NULL; + + /* Fill static_table_key. */ + switch (t->type) { + + case SMP_T_IPV4: + static_table_key.key = &smp->data.u.ipv4; + static_table_key.key_len = 4; + break; + + case SMP_T_IPV6: + static_table_key.key = &smp->data.u.ipv6; + static_table_key.key_len = 16; + break; + + case SMP_T_SINT: + /* The stick table requires a 32-bit unsigned int, and "sint" is a + * signed 64-bit int, so we can convert it in place. + */ + smp->data.u.sint = (unsigned int)smp->data.u.sint; + static_table_key.key = &smp->data.u.sint; + static_table_key.key_len = 4; + break; + + case SMP_T_STR: + if (!smp_make_safe(smp)) + return NULL; + static_table_key.key = smp->data.u.str.area; + static_table_key.key_len = smp->data.u.str.data; + break; + + case SMP_T_BIN: + if (smp->data.u.str.data < t->key_size) { + /* This type needs padding with 0. */ + if (!smp_make_rw(smp)) + return NULL; + + if (smp->data.u.str.size < t->key_size) + if (!smp_dup(smp)) + return NULL; + if (smp->data.u.str.size < t->key_size) + return NULL; + memset(smp->data.u.str.area + smp->data.u.str.data, 0, + t->key_size - smp->data.u.str.data); + smp->data.u.str.data = t->key_size; + } + static_table_key.key = smp->data.u.str.area; + static_table_key.key_len = smp->data.u.str.data; + break; + + default: /* impossible case. */ + return NULL; + } + + return &static_table_key; +} + +/* + * Process a fetch + format conversion as defined by the sample expression <expr> + * on request or response considering the <opt> parameter. Returns either NULL if + * no key could be extracted, or a pointer to the converted result stored in + * static_table_key in format <table_type>. If <smp> is not NULL, it will be reset + * and its flags will be initialized so that the caller gets a copy of the input + * sample, and knows why it was not accepted (eg: SMP_F_MAY_CHANGE is present + * without SMP_OPT_FINAL). The output will be usable like this : + * + * return MAY_CHANGE FINAL Meaning for the sample + * NULL 0 * Not present and will never be (eg: header) + * NULL 1 0 Not present or unstable, could change (eg: req_len) + * NULL 1 1 Not present, will not change anymore + * smp 0 * Present and will not change (eg: header) + * smp 1 0 not possible + * smp 1 1 Present, last known value (eg: request length) + */ +struct stktable_key *stktable_fetch_key(struct stktable *t, struct proxy *px, struct session *sess, struct stream *strm, + unsigned int opt, struct sample_expr *expr, struct sample *smp) +{ + if (smp) + memset(smp, 0, sizeof(*smp)); + + smp = sample_process(px, sess, strm, opt, expr, smp); + if (!smp) + return NULL; + + if ((smp->flags & SMP_F_MAY_CHANGE) && !(opt & SMP_OPT_FINAL)) + return NULL; /* we can only use stable samples */ + + return smp_to_stkey(smp, t); +} + +/* + * Returns 1 if sample expression <expr> result can be converted to table key of + * type <table_type>, otherwise zero. Used in configuration check. + */ +int stktable_compatible_sample(struct sample_expr *expr, unsigned long table_type) +{ + int out_type; + + if (table_type >= SMP_TYPES || !stktable_types[table_type].kw) + return 0; + + out_type = smp_expr_output_type(expr); + + /* Convert sample.
*/ + if (!sample_casts[out_type][table_type]) + return 0; + + return 1; +} + +/* Extra data types processing : after the last one, some room may remain + * before STKTABLE_DATA_TYPES that may be used to register extra data types + * at run time. + */ +struct stktable_data_type stktable_data_types[STKTABLE_DATA_TYPES] = { + [STKTABLE_DT_SERVER_ID] = { .name = "server_id", .std_type = STD_T_SINT, .as_is = 1 }, + [STKTABLE_DT_GPT0] = { .name = "gpt0", .std_type = STD_T_UINT, .as_is = 1 }, + [STKTABLE_DT_GPC0] = { .name = "gpc0", .std_type = STD_T_UINT }, + [STKTABLE_DT_GPC0_RATE] = { .name = "gpc0_rate", .std_type = STD_T_FRQP, .arg_type = ARG_T_DELAY }, + [STKTABLE_DT_CONN_CNT] = { .name = "conn_cnt", .std_type = STD_T_UINT }, + [STKTABLE_DT_CONN_RATE] = { .name = "conn_rate", .std_type = STD_T_FRQP, .arg_type = ARG_T_DELAY }, + [STKTABLE_DT_CONN_CUR] = { .name = "conn_cur", .std_type = STD_T_UINT, .is_local = 1 }, + [STKTABLE_DT_SESS_CNT] = { .name = "sess_cnt", .std_type = STD_T_UINT }, + [STKTABLE_DT_SESS_RATE] = { .name = "sess_rate", .std_type = STD_T_FRQP, .arg_type = ARG_T_DELAY }, + [STKTABLE_DT_HTTP_REQ_CNT] = { .name = "http_req_cnt", .std_type = STD_T_UINT }, + [STKTABLE_DT_HTTP_REQ_RATE] = { .name = "http_req_rate", .std_type = STD_T_FRQP, .arg_type = ARG_T_DELAY }, + [STKTABLE_DT_HTTP_ERR_CNT] = { .name = "http_err_cnt", .std_type = STD_T_UINT }, + [STKTABLE_DT_HTTP_ERR_RATE] = { .name = "http_err_rate", .std_type = STD_T_FRQP, .arg_type = ARG_T_DELAY }, + [STKTABLE_DT_BYTES_IN_CNT] = { .name = "bytes_in_cnt", .std_type = STD_T_ULL }, + [STKTABLE_DT_BYTES_IN_RATE] = { .name = "bytes_in_rate", .std_type = STD_T_FRQP, .arg_type = ARG_T_DELAY }, + [STKTABLE_DT_BYTES_OUT_CNT] = { .name = "bytes_out_cnt", .std_type = STD_T_ULL }, + [STKTABLE_DT_BYTES_OUT_RATE]= { .name = "bytes_out_rate", .std_type = STD_T_FRQP, .arg_type = ARG_T_DELAY }, + [STKTABLE_DT_GPC1] = { .name = "gpc1", .std_type = STD_T_UINT }, + [STKTABLE_DT_GPC1_RATE] = { .name = "gpc1_rate", .std_type = STD_T_FRQP, .arg_type = ARG_T_DELAY }, + [STKTABLE_DT_SERVER_KEY] = { .name = "server_key", .std_type = STD_T_DICT, .as_is = 1 }, + [STKTABLE_DT_HTTP_FAIL_CNT] = { .name = "http_fail_cnt", .std_type = STD_T_UINT }, + [STKTABLE_DT_HTTP_FAIL_RATE]= { .name = "http_fail_rate", .std_type = STD_T_FRQP, .arg_type = ARG_T_DELAY }, + [STKTABLE_DT_GPT] = { .name = "gpt", .std_type = STD_T_UINT, .is_array = 1, .as_is = 1 }, + [STKTABLE_DT_GPC] = { .name = "gpc", .std_type = STD_T_UINT, .is_array = 1 }, + [STKTABLE_DT_GPC_RATE] = { .name = "gpc_rate", .std_type = STD_T_FRQP, .is_array = 1, .arg_type = ARG_T_DELAY }, +}; + +/* Registers stick-table extra data type with index <idx>, name <name>, type + * <std_type> and arg type <arg_type>. If the index is negative, the next free + * index is automatically allocated. The allocated index is returned, or -1 if + * no free index was found or <name> was already registered. The <name> is used + * directly as a pointer, so if it's not stable, the caller must allocate it. 
+ */ +int stktable_register_data_store(int idx, const char *name, int std_type, int arg_type) +{ + if (idx < 0) { + for (idx = 0; idx < STKTABLE_DATA_TYPES; idx++) { + if (!stktable_data_types[idx].name) + break; + + if (strcmp(stktable_data_types[idx].name, name) == 0) + return -1; + } + } + + if (idx >= STKTABLE_DATA_TYPES) + return -1; + + if (stktable_data_types[idx].name != NULL) + return -1; + + stktable_data_types[idx].name = name; + stktable_data_types[idx].std_type = std_type; + stktable_data_types[idx].arg_type = arg_type; + return idx; +} + +/* + * Returns the data type number for the stktable_data_type whose name is <name>, + * or <0 if not found. + */ +int stktable_get_data_type(char *name) +{ + int type; + + for (type = 0; type < STKTABLE_DATA_TYPES; type++) { + if (!stktable_data_types[type].name) + continue; + if (strcmp(name, stktable_data_types[type].name) == 0) + return type; + } + /* For backwards compatibility */ + if (strcmp(name, "server_name") == 0) + return STKTABLE_DT_SERVER_KEY; + return -1; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns true if found, false otherwise. The input + * type is STR so that input samples are converted to string (since all types + * can be converted to strings), then the function casts the string again into + * the table's type. This is a double conversion, but in the future we might + * support automatic input types to perform the cast on the fly. + */ +static int sample_conv_in_table(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->data.type = SMP_T_BOOL; + smp->data.u.sint = !!ts; + smp->flags = SMP_F_VOL_TEST; + stktable_release(t, ts); + return 1; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the data rate received from clients in bytes/s + * if the key is present in the table, otherwise zero, so that comparisons can + * be easily performed. If the inspected parameter is not stored in the table, + * <not found> is returned. + */ +static int sample_conv_table_bytes_in_rate(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_BYTES_IN_RATE); + if (ptr) + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + t->data_arg[STKTABLE_DT_BYTES_IN_RATE].u); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the cumulated number of connections for the key + * if the key is present in the table, otherwise zero, so that comparisons can + * be easily performed. If the inspected parameter is not stored in the table, + * <not found> is returned. 
+ */ +static int sample_conv_table_conn_cnt(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_CONN_CNT); + if (ptr) + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the number of concurrent connections for the + * key if the key is present in the table, otherwise zero, so that comparisons + * can be easily performed. If the inspected parameter is not stored in the + * table, <not found> is returned. + */ +static int sample_conv_table_conn_cur(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_CONN_CUR); + if (ptr) + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the rate of incoming connections from the key + * if the key is present in the table, otherwise zero, so that comparisons can + * be easily performed. If the inspected parameter is not stored in the table, + * <not found> is returned. + */ +static int sample_conv_table_conn_rate(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_CONN_RATE); + if (ptr) + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + t->data_arg[STKTABLE_DT_CONN_RATE].u); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the expiration delay for the key if the key is + * present in the table, otherwise the default value provided as second argument + * if any, if not (no default value), <not found> is returned. 
+ */ +static int sample_conv_table_expire(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) { /* key not present */ + if (arg_p[1].type == ARGT_STOP) + return 0; + + /* default value */ + smp->data.u.sint = arg_p[1].data.sint; + return 1; + } + + smp->data.u.sint = tick_remain(now_ms, ts->expire); + + stktable_release(t, ts); + return 1; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the time the key remains unused if the key is + * present in the table, otherwise the default value provided as second argument + * if any, if not (no default value), <not found> is returned. + */ +static int sample_conv_table_idle(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) { /* key not present */ + if (arg_p[1].type == ARGT_STOP) + return 0; + + /* default value */ + smp->data.u.sint = arg_p[1].data.sint; + return 1; + } + + smp->data.u.sint = tick_remain(tick_remain(now_ms, ts->expire), t->expire); + + stktable_release(t, ts); + return 1; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the data rate sent to clients in bytes/s + * if the key is present in the table, otherwise zero, so that comparisons can + * be easily performed. If the inspected parameter is not stored in the table, + * <not found> is returned. + */ +static int sample_conv_table_bytes_out_rate(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_BYTES_OUT_RATE); + if (ptr) + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + t->data_arg[STKTABLE_DT_BYTES_OUT_RATE].u); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg_p(1), and looks + * it up into this table. Returns the value of the GPT[arg_p(0)] tag for the key + * if the key is present in the table, otherwise false, so that comparisons can + * be easily performed. If the inspected parameter is not stored in the table, + * <not found> is returned. 
+ */ +static int sample_conv_table_gpt(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + unsigned int idx; + + idx = arg_p[0].data.sint; + + t = arg_p[1].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr_idx(t, ts, STKTABLE_DT_GPT, idx); + if (ptr) + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the value of the GPT0 tag for the key + * if the key is present in the table, otherwise false, so that comparisons can + * be easily performed. If the inspected parameter is not stored in the table, + * <not found> is returned. + */ +static int sample_conv_table_gpt0(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_GPT0); + if (!ptr) + ptr = stktable_data_ptr_idx(t, ts, STKTABLE_DT_GPT, 0); + + if (ptr) + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg_p(1), and looks + * it up into this table. Returns the value of the GPC[arg_p(0)] counter for the key + * if the key is present in the table, otherwise zero, so that comparisons can + * be easily performed. If the inspected parameter is not stored in the table, + * <not found> is returned. + */ +static int sample_conv_table_gpc(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + unsigned int idx; + + idx = arg_p[0].data.sint; + + t = arg_p[1].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr_idx(t, ts, STKTABLE_DT_GPC, idx); + if (ptr) + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg_p(1), and looks + * it up into this table. Returns the event rate of the GPC[arg_p(0)] counter + * for the key if the key is present in the table, otherwise zero, so that + * comparisons can be easily performed. If the inspected parameter is not + * stored in the table, <not found> is returned. 
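+ *
+ * The rate only exists if the target table stores it with a period; an
+ * illustrative declaration (size, count and period are invented):
+ *
+ *     stick-table type ip size 1m expire 1h store gpc(2),gpc_rate(2,10s)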
+ */ +static int sample_conv_table_gpc_rate(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + unsigned int idx; + + idx = arg_p[0].data.sint; + + t = arg_p[1].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr_idx(t, ts, STKTABLE_DT_GPC_RATE, idx); + if (ptr) + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + t->data_arg[STKTABLE_DT_GPC_RATE].u); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the value of the GPC0 counter for the key + * if the key is present in the table, otherwise zero, so that comparisons can + * be easily performed. If the inspected parameter is not stored in the table, + * <not found> is returned. + */ +static int sample_conv_table_gpc0(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_GPC0); + if (!ptr) { + /* fallback on the gpc array */ + ptr = stktable_data_ptr_idx(t, ts, STKTABLE_DT_GPC, 0); + } + + if (ptr) + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the event rate of the GPC0 counter for the key + * if the key is present in the table, otherwise zero, so that comparisons can + * be easily performed. If the inspected parameter is not stored in the table, + * <not found> is returned. + */ +static int sample_conv_table_gpc0_rate(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_GPC0_RATE); + if (ptr) + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + t->data_arg[STKTABLE_DT_GPC0_RATE].u); + else { + /* fallback on the gpc array */ + ptr = stktable_data_ptr_idx(t, ts, STKTABLE_DT_GPC_RATE, 0); + if (ptr) + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + t->data_arg[STKTABLE_DT_GPC_RATE].u); + } + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the value of the GPC1 counter for the key + * if the key is present in the table, otherwise zero, so that comparisons can + * be easily performed. If the inspected parameter is not stored in the table, + * <not found> is returned. 
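+ *
+ * As the fallback in the function below shows, the legacy "gpc1" data
+ * type and slot 1 of the newer "gpc" array are read interchangeably, so
+ * either of these invented declarations would satisfy this converter:
+ *
+ *     stick-table type ip size 1m store gpc1,gpc1_rate(10s)
+ *     stick-table type ip size 1m store gpc(2),gpc_rate(2,10s)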
+ */ +static int sample_conv_table_gpc1(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_GPC1); + if (!ptr) { + /* fallback on the gpc array */ + ptr = stktable_data_ptr_idx(t, ts, STKTABLE_DT_GPC, 1); + } + + if (ptr) + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the event rate of the GPC1 counter for the key + * if the key is present in the table, otherwise zero, so that comparisons can + * be easily performed. If the inspected parameter is not stored in the table, + * <not found> is returned. + */ +static int sample_conv_table_gpc1_rate(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_GPC1_RATE); + if (ptr) + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + t->data_arg[STKTABLE_DT_GPC1_RATE].u); + else { + /* fallback on the gpc array */ + ptr = stktable_data_ptr_idx(t, ts, STKTABLE_DT_GPC_RATE, 1); + if (ptr) + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + t->data_arg[STKTABLE_DT_GPC_RATE].u); + } + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the cumulated number of HTTP request errors + * for the key if the key is present in the table, otherwise zero, so that + * comparisons can be easily performed. If the inspected parameter is not stored + * in the table, <not found> is returned. + */ +static int sample_conv_table_http_err_cnt(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_HTTP_ERR_CNT); + if (ptr) + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the HTTP request error rate for the key + * if the key is present in the table, otherwise zero, so that comparisons can + * be easily performed. If the inspected parameter is not stored in the table, + * <not found> is returned. 
+ */ +static int sample_conv_table_http_err_rate(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_HTTP_ERR_RATE); + if (ptr) + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + t->data_arg[STKTABLE_DT_HTTP_ERR_RATE].u); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the cumulated number of HTTP response failures + * for the key if the key is present in the table, otherwise zero, so that + * comparisons can be easily performed. If the inspected parameter is not stored + * in the table, <not found> is returned. + */ +static int sample_conv_table_http_fail_cnt(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_HTTP_FAIL_CNT); + if (ptr) + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the HTTP response failure rate for the key + * if the key is present in the table, otherwise zero, so that comparisons can + * be easily performed. If the inspected parameter is not stored in the table, + * <not found> is returned. + */ +static int sample_conv_table_http_fail_rate(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_HTTP_FAIL_RATE); + if (ptr) + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + t->data_arg[STKTABLE_DT_HTTP_FAIL_RATE].u); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the cumulated number of HTTP requests for the + * key if the key is present in the table, otherwise zero, so that comparisons + * can be easily performed. If the inspected parameter is not stored in the + * table, <not found> is returned. 
+ */ +static int sample_conv_table_http_req_cnt(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_HTTP_REQ_CNT); + if (ptr) + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the HTTP request rate for the key if the key is + * present in the table, otherwise zero, so that comparisons can be easily + * performed. If the inspected parameter is not stored in the table, <not found> + * is returned. + */ +static int sample_conv_table_http_req_rate(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_HTTP_REQ_RATE); + if (ptr) + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + t->data_arg[STKTABLE_DT_HTTP_REQ_RATE].u); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the volume of data received from clients in kbytes + * if the key is present in the table, otherwise zero, so that comparisons can + * be easily performed. If the inspected parameter is not stored in the table, + * <not found> is returned. + */ +static int sample_conv_table_kbytes_in(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_BYTES_IN_CNT); + if (ptr) + smp->data.u.sint = stktable_data_cast(ptr, std_t_ull) >> 10; + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the volume of data sent to clients in kbytes + * if the key is present in the table, otherwise zero, so that comparisons can + * be easily performed. If the inspected parameter is not stored in the table, + * <not found> is returned. 
+ */ +static int sample_conv_table_kbytes_out(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_BYTES_OUT_CNT); + if (ptr) + smp->data.u.sint = stktable_data_cast(ptr, std_t_ull) >> 10; + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the server ID associated with the key if the + * key is present in the table, otherwise zero, so that comparisons can be + * easily performed. If the inspected parameter is not stored in the table, + * <not found> is returned. + */ +static int sample_conv_table_server_id(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_SERVER_ID); + if (ptr) + smp->data.u.sint = stktable_data_cast(ptr, std_t_sint); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the cumulated number of sessions for the + * key if the key is present in the table, otherwise zero, so that comparisons + * can be easily performed. If the inspected parameter is not stored in the + * table, <not found> is returned. + */ +static int sample_conv_table_sess_cnt(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_SESS_CNT); + if (ptr) + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the session rate for the key if the key is + * present in the table, otherwise zero, so that comparisons can be easily + * performed. If the inspected parameter is not stored in the table, <not found> + * is returned. 
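+ *
+ * Illustrative only (table name and threshold are invented):
+ *
+ *     acl busy src,table_sess_rate(per_ip) gt 50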
+ */ +static int sample_conv_table_sess_rate(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + void *ptr; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) /* key not present */ + return 1; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_SESS_RATE); + if (ptr) + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + t->data_arg[STKTABLE_DT_SESS_RATE].u); + + stktable_release(t, ts); + return !!ptr; +} + +/* Casts sample <smp> to the type of the table specified in arg(0), and looks + * it up into this table. Returns the amount of concurrent connections tracking + * the same key if the key is present in the table, otherwise zero, so that + * comparisons can be easily performed. If the inspected parameter is not + * stored in the table, <not found> is returned. + */ +static int sample_conv_table_trackers(const struct arg *arg_p, struct sample *smp, void *private) +{ + struct stktable *t; + struct stktable_key *key; + struct stksess *ts; + + t = arg_p[0].data.t; + + key = smp_to_stkey(smp, t); + if (!key) + return 0; + + ts = stktable_lookup_key(t, key); + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!ts) + return 1; + + smp->data.u.sint = HA_ATOMIC_LOAD(&ts->ref_cnt); + + stktable_release(t, ts); + return 1; +} + +/* This function increments the gpc counter at index 'rule->arg.gpc.idx' of the + * array on the tracksc counter of index 'rule->arg.gpc.sc' stored into the + * <stream> or directly in the session <sess> if <stream> is set to NULL + * + * This function always returns ACT_RET_CONT and parameter flags is unused. + */ +static enum act_return action_inc_gpc(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct stksess *ts; + struct stkctr *stkctr; + + /* Extract the stksess, return OK if no stksess available. */ + if (s) + stkctr = &s->stkctr[rule->arg.gpc.sc]; + else + stkctr = &sess->stkctr[rule->arg.gpc.sc]; + + ts = stkctr_entry(stkctr); + if (ts) { + void *ptr1, *ptr2; + + /* First, update gpc_rate if it's tracked. Second, update its gpc if tracked. */ + ptr1 = stktable_data_ptr_idx(stkctr->table, ts, STKTABLE_DT_GPC_RATE, rule->arg.gpc.idx); + ptr2 = stktable_data_ptr_idx(stkctr->table, ts, STKTABLE_DT_GPC, rule->arg.gpc.idx); + + if (ptr1 || ptr2) { + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock); + + if (ptr1) + update_freq_ctr_period(&stktable_data_cast(ptr1, std_t_frqp), + stkctr->table->data_arg[STKTABLE_DT_GPC_RATE].u, 1); + + if (ptr2) + stktable_data_cast(ptr2, std_t_uint)++; + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock); + + /* If data was modified, we need to touch to re-schedule sync */ + stktable_touch_local(stkctr->table, ts, 0); + } + } + return ACT_RET_CONT; +} + +/* Same as action_inc_gpc() but for gpc0 only */ +static enum act_return action_inc_gpc0(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct stksess *ts; + struct stkctr *stkctr; + unsigned int period = 0; + + /* Extract the stksess, return OK if no stksess available. 
*/ + if (s) + stkctr = &s->stkctr[rule->arg.gpc.sc]; + else + stkctr = &sess->stkctr[rule->arg.gpc.sc]; + + ts = stkctr_entry(stkctr); + if (ts) { + void *ptr1, *ptr2; + + /* First, update gpc0_rate if it's tracked. Second, update its gpc0 if tracked. */ + ptr1 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_GPC0_RATE); + if (ptr1) { + period = stkctr->table->data_arg[STKTABLE_DT_GPC0_RATE].u; + } + else { + /* fallback on the gpc array */ + ptr1 = stktable_data_ptr_idx(stkctr->table, ts, STKTABLE_DT_GPC_RATE, 0); + if (ptr1) + period = stkctr->table->data_arg[STKTABLE_DT_GPC_RATE].u; + } + + ptr2 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_GPC0); + if (!ptr2) { + /* fallback on the gpc array */ + ptr2 = stktable_data_ptr_idx(stkctr->table, ts, STKTABLE_DT_GPC, 0); + } + + if (ptr1 || ptr2) { + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock); + + if (ptr1) + update_freq_ctr_period(&stktable_data_cast(ptr1, std_t_frqp), + period, 1); + + if (ptr2) + stktable_data_cast(ptr2, std_t_uint)++; + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock); + + /* If data was modified, we need to touch to re-schedule sync */ + stktable_touch_local(stkctr->table, ts, 0); + } + } + return ACT_RET_CONT; +} + +/* Same as action_inc_gpc() but for gpc1 only */ +static enum act_return action_inc_gpc1(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct stksess *ts; + struct stkctr *stkctr = NULL; + unsigned int period = 0; + + /* Extract the stksess, return OK if no stksess available. */ + if (s && s->stkctr) + stkctr = &s->stkctr[rule->arg.gpc.sc]; + else if (sess->stkctr) + stkctr = &sess->stkctr[rule->arg.gpc.sc]; + else + return ACT_RET_CONT; + + ts = stkctr_entry(stkctr); + if (ts) { + void *ptr1, *ptr2; + + /* First, update gpc1_rate if it's tracked. Second, update its gpc1 if tracked. */ + ptr1 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_GPC1_RATE); + if (ptr1) { + period = stkctr->table->data_arg[STKTABLE_DT_GPC1_RATE].u; + } + else { + /* fallback on the gpc array */ + ptr1 = stktable_data_ptr_idx(stkctr->table, ts, STKTABLE_DT_GPC_RATE, 1); + if (ptr1) + period = stkctr->table->data_arg[STKTABLE_DT_GPC_RATE].u; + } + + ptr2 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_GPC1); + if (!ptr2) { + /* fallback on the gpc array */ + ptr2 = stktable_data_ptr_idx(stkctr->table, ts, STKTABLE_DT_GPC, 1); + } + + if (ptr1 || ptr2) { + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock); + + if (ptr1) + update_freq_ctr_period(&stktable_data_cast(ptr1, std_t_frqp), + period, 1); + + if (ptr2) + stktable_data_cast(ptr2, std_t_uint)++; + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock); + + /* If data was modified, we need to touch to re-schedule sync */ + stktable_touch_local(stkctr->table, ts, 0); + } + } + return ACT_RET_CONT; +} + +/* This function is a common parser for actions incrementing the GPC + * (General Purpose Counters). It understands the formats: + * + * sc-inc-gpc(<gpc IDX>,<track ID>) + * sc-inc-gpc0([<track ID>]) + * sc-inc-gpc1([<track ID>]) + * + * It returns ACT_RET_PRS_ERR if fails and <err> is filled with an error + * message. Otherwise it returns ACT_RET_PRS_OK. 
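+ *
+ * For example (the table name, track IDs and GPC index are invented):
+ *
+ *     tcp-request connection track-sc0 src table per_ip
+ *     tcp-request connection sc-inc-gpc0
+ *     http-request sc-inc-gpc(1,0) if { path_beg /api }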
+ */ +static enum act_parse_ret parse_inc_gpc(const char **args, int *arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + const char *cmd_name = args[*arg-1]; + char *error; + + if (!global.tune.nb_stk_ctr) { + memprintf(err, "Cannot use '%s', stick-counters are disabled via tune.stick-counters", args[*arg-1]); + return ACT_RET_PRS_ERR; + } + + cmd_name += strlen("sc-inc-gpc"); + if (*cmd_name == '(') { + cmd_name++; /* skip the '(' */ + rule->arg.gpc.idx = strtoul(cmd_name, &error, 10); /* Convert stick table id. */ + if (*error != ',') { + memprintf(err, "Missing gpc ID '%s'. Expects sc-inc-gpc(<GPC ID>,<Track ID>)", args[*arg-1]); + return ACT_RET_PRS_ERR; + } + else { + cmd_name = error + 1; /* skip the ',' */ + rule->arg.gpc.sc = strtol(cmd_name, &error, 10); /* Convert stick table id. */ + if (*error != ')') { + memprintf(err, "invalid stick table track ID '%s'. Expects sc-inc-gpc(<GPC ID>,<Track ID>)", args[*arg-1]); + return ACT_RET_PRS_ERR; + } + + if (rule->arg.gpc.sc >= global.tune.nb_stk_ctr) { + memprintf(err, "invalid stick table track ID '%s'. The max allowed ID is %d (tune.stick-counters)", + args[*arg-1], global.tune.nb_stk_ctr-1); + return ACT_RET_PRS_ERR; + } + } + rule->action_ptr = action_inc_gpc; + } + else if (*cmd_name == '0' ||*cmd_name == '1') { + char c = *cmd_name; + + cmd_name++; + if (*cmd_name == '\0') { + /* default stick table id. */ + rule->arg.gpc.sc = 0; + } else { + /* parse the stick table id. */ + if (*cmd_name != '(') { + memprintf(err, "invalid stick table track ID. Expects %s(<Track ID>)", args[*arg-1]); + return ACT_RET_PRS_ERR; + } + cmd_name++; /* jump the '(' */ + rule->arg.gpc.sc = strtol(cmd_name, &error, 10); /* Convert stick table id. */ + if (*error != ')') { + memprintf(err, "invalid stick table track ID. Expects %s(<Track ID>)", args[*arg-1]); + return ACT_RET_PRS_ERR; + } + + if (rule->arg.gpc.sc >= global.tune.nb_stk_ctr) { + memprintf(err, "invalid stick table track ID. The max allowed ID is %d (tune.stick-counters)", + global.tune.nb_stk_ctr-1); + return ACT_RET_PRS_ERR; + } + } + if (c == '1') + rule->action_ptr = action_inc_gpc1; + else + rule->action_ptr = action_inc_gpc0; + } + else { + /* default stick table id. */ + memprintf(err, "invalid gpc ID '%s'. Expects sc-inc-gpc(<GPC ID>,<Track ID>)", args[*arg-1]); + return ACT_RET_PRS_ERR; + } + rule->action = ACT_CUSTOM; + return ACT_RET_PRS_OK; +} + +/* This function sets the gpt at index 'rule->arg.gpt.idx' of the array on the + * tracksc counter of index 'rule->arg.gpt.sc' stored into the <stream> or + * directly in the session <sess> if <stream> is set to NULL. This gpt is + * set to the value computed by the expression 'rule->arg.gpt.expr' or if + * 'rule->arg.gpt.expr' is null directly to the value of 'rule->arg.gpt.value'. + * + * This function always returns ACT_RET_CONT and parameter flags is unused. + */ +static enum act_return action_set_gpt(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + void *ptr; + struct stksess *ts; + struct stkctr *stkctr = NULL; + unsigned int value = 0; + struct sample *smp; + int smp_opt_dir; + + /* Extract the stksess, return OK if no stksess available. */ + if (s && s->stkctr) + stkctr = &s->stkctr[rule->arg.gpt.sc]; + else if (sess->stkctr) + stkctr = &sess->stkctr[rule->arg.gpt.sc]; + else + return ACT_RET_CONT; + + ts = stkctr_entry(stkctr); + if (!ts) + return ACT_RET_CONT; + + /* Store the sample in the required sc, and ignore errors. 
*/ + ptr = stktable_data_ptr_idx(stkctr->table, ts, STKTABLE_DT_GPT, rule->arg.gpt.idx); + if (ptr) { + + if (!rule->arg.gpt.expr) + value = (unsigned int)(rule->arg.gpt.value); + else { + switch (rule->from) { + case ACT_F_TCP_REQ_CON: smp_opt_dir = SMP_OPT_DIR_REQ; break; + case ACT_F_TCP_REQ_SES: smp_opt_dir = SMP_OPT_DIR_REQ; break; + case ACT_F_TCP_REQ_CNT: smp_opt_dir = SMP_OPT_DIR_REQ; break; + case ACT_F_TCP_RES_CNT: smp_opt_dir = SMP_OPT_DIR_RES; break; + case ACT_F_HTTP_REQ: smp_opt_dir = SMP_OPT_DIR_REQ; break; + case ACT_F_HTTP_RES: smp_opt_dir = SMP_OPT_DIR_RES; break; + default: + send_log(px, LOG_ERR, "stick table: internal error while setting gpt%u.", rule->arg.gpt.idx); + if (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)) + ha_alert("stick table: internal error while executing setting gpt%u.\n", rule->arg.gpt.idx); + return ACT_RET_CONT; + } + + /* Fetch and cast the expression. */ + smp = sample_fetch_as_type(px, sess, s, smp_opt_dir|SMP_OPT_FINAL, rule->arg.gpt.expr, SMP_T_SINT); + if (!smp) { + send_log(px, LOG_WARNING, "stick table: invalid expression or data type while setting gpt%u.", rule->arg.gpt.idx); + if (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)) + ha_alert("stick table: invalid expression or data type while setting gpt%u.\n", rule->arg.gpt.idx); + return ACT_RET_CONT; + } + value = (unsigned int)(smp->data.u.sint); + } + + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock); + + stktable_data_cast(ptr, std_t_uint) = value; + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock); + + stktable_touch_local(stkctr->table, ts, 0); + } + + return ACT_RET_CONT; +} + +/* Always returns 1. */ +static enum act_return action_set_gpt0(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + void *ptr; + struct stksess *ts; + struct stkctr *stkctr = NULL; + unsigned int value = 0; + struct sample *smp; + int smp_opt_dir; + + /* Extract the stksess, return OK if no stksess available. */ + if (s && s->stkctr) + stkctr = &s->stkctr[rule->arg.gpt.sc]; + else if (sess->stkctr) + stkctr = &sess->stkctr[rule->arg.gpt.sc]; + else + return ACT_RET_CONT; + + ts = stkctr_entry(stkctr); + if (!ts) + return ACT_RET_CONT; + + /* Store the sample in the required sc, and ignore errors. */ + ptr = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_GPT0); + if (!ptr) + ptr = stktable_data_ptr_idx(stkctr->table, ts, STKTABLE_DT_GPT, 0); + + if (ptr) { + if (!rule->arg.gpt.expr) + value = (unsigned int)(rule->arg.gpt.value); + else { + switch (rule->from) { + case ACT_F_TCP_REQ_CON: smp_opt_dir = SMP_OPT_DIR_REQ; break; + case ACT_F_TCP_REQ_SES: smp_opt_dir = SMP_OPT_DIR_REQ; break; + case ACT_F_TCP_REQ_CNT: smp_opt_dir = SMP_OPT_DIR_REQ; break; + case ACT_F_TCP_RES_CNT: smp_opt_dir = SMP_OPT_DIR_RES; break; + case ACT_F_HTTP_REQ: smp_opt_dir = SMP_OPT_DIR_REQ; break; + case ACT_F_HTTP_RES: smp_opt_dir = SMP_OPT_DIR_RES; break; + default: + send_log(px, LOG_ERR, "stick table: internal error while setting gpt0."); + if (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)) + ha_alert("stick table: internal error while executing setting gpt0.\n"); + return ACT_RET_CONT; + } + + /* Fetch and cast the expression. 
*/ + smp = sample_fetch_as_type(px, sess, s, smp_opt_dir|SMP_OPT_FINAL, rule->arg.gpt.expr, SMP_T_SINT); + if (!smp) { + send_log(px, LOG_WARNING, "stick table: invalid expression or data type while setting gpt0."); + if (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)) + ha_alert("stick table: invalid expression or data type while setting gpt0.\n"); + return ACT_RET_CONT; + } + value = (unsigned int)(smp->data.u.sint); + } + + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock); + + stktable_data_cast(ptr, std_t_uint) = value; + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock); + + stktable_touch_local(stkctr->table, ts, 0); + } + + return ACT_RET_CONT; +} + +/* This function is a parser for the "sc-set-gpt" and "sc-set-gpt0" actions. + * It understands the formats: + * + * sc-set-gpt(<gpt IDX>,<track ID>) <expression> + * sc-set-gpt0(<track ID>) <expression> + * + * It returns ACT_RET_PRS_ERR if fails and <err> is filled with an error message. + * Otherwise, it returns ACT_RET_PRS_OK and the variable 'rule->arg.gpt.expr' + * is filled with the pointer to the expression to execute or NULL if the arg + * is directly an integer stored into 'rule->arg.gpt.value'. + */ +static enum act_parse_ret parse_set_gpt(const char **args, int *arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + const char *cmd_name = args[*arg-1]; + char *error; + int smp_val; + + if (!global.tune.nb_stk_ctr) { + memprintf(err, "Cannot use '%s', stick-counters are disabled via tune.stick-counters", args[*arg-1]); + return ACT_RET_PRS_ERR; + } + + cmd_name += strlen("sc-set-gpt"); + if (*cmd_name == '(') { + cmd_name++; /* skip the '(' */ + rule->arg.gpt.idx = strtoul(cmd_name, &error, 10); /* Convert stick table id. */ + if (*error != ',') { + memprintf(err, "Missing gpt ID '%s'. Expects sc-set-gpt(<GPT ID>,<Track ID>)", args[*arg-1]); + return ACT_RET_PRS_ERR; + } + else { + cmd_name = error + 1; /* skip the ',' */ + rule->arg.gpt.sc = strtol(cmd_name, &error, 10); /* Convert stick table id. */ + if (*error != ')') { + memprintf(err, "invalid stick table track ID '%s'. Expects sc-set-gpt(<GPT ID>,<Track ID>)", args[*arg-1]); + return ACT_RET_PRS_ERR; + } + + if (rule->arg.gpt.sc >= global.tune.nb_stk_ctr) { + memprintf(err, "invalid stick table track ID '%s'. The max allowed ID is %d", + args[*arg-1], global.tune.nb_stk_ctr-1); + return ACT_RET_PRS_ERR; + } + } + rule->action_ptr = action_set_gpt; + } + else if (*cmd_name == '0') { + cmd_name++; + if (*cmd_name == '\0') { + /* default stick table id. */ + rule->arg.gpt.sc = 0; + } else { + /* parse the stick table id. */ + if (*cmd_name != '(') { + memprintf(err, "invalid stick table track ID '%s'. Expects sc-set-gpt0(<Track ID>)", args[*arg-1]); + return ACT_RET_PRS_ERR; + } + cmd_name++; /* jump the '(' */ + rule->arg.gpt.sc = strtol(cmd_name, &error, 10); /* Convert stick table id. */ + if (*error != ')') { + memprintf(err, "invalid stick table track ID '%s'. Expects sc-set-gpt0(<Track ID>)", args[*arg-1]); + return ACT_RET_PRS_ERR; + } + + if (rule->arg.gpt.sc >= global.tune.nb_stk_ctr) { + memprintf(err, "invalid stick table track ID '%s'. The max allowed ID is %d", + args[*arg-1], global.tune.nb_stk_ctr-1); + return ACT_RET_PRS_ERR; + } + } + rule->action_ptr = action_set_gpt0; + } + else { + /* default stick table id. */ + memprintf(err, "invalid gpt ID '%s'. 
Expects sc-set-gpt(<GPT ID>,<Track ID>)", args[*arg-1]); + return ACT_RET_PRS_ERR; + } + + /* value may be either an integer or an expression */ + rule->arg.gpt.expr = NULL; + rule->arg.gpt.value = strtol(args[*arg], &error, 10); + if (*error == '\0') { + /* valid integer, skip it */ + (*arg)++; + } else { + rule->arg.gpt.expr = sample_parse_expr((char **)args, arg, px->conf.args.file, + px->conf.args.line, err, &px->conf.args, NULL); + if (!rule->arg.gpt.expr) + return ACT_RET_PRS_ERR; + + switch (rule->from) { + case ACT_F_TCP_REQ_CON: smp_val = SMP_VAL_FE_CON_ACC; break; + case ACT_F_TCP_REQ_SES: smp_val = SMP_VAL_FE_SES_ACC; break; + case ACT_F_TCP_REQ_CNT: smp_val = SMP_VAL_FE_REQ_CNT; break; + case ACT_F_TCP_RES_CNT: smp_val = SMP_VAL_BE_RES_CNT; break; + case ACT_F_HTTP_REQ: smp_val = SMP_VAL_FE_HRQ_HDR; break; + case ACT_F_HTTP_RES: smp_val = SMP_VAL_BE_HRS_HDR; break; + default: + memprintf(err, "internal error, unexpected rule->from=%d, please report this bug!", rule->from); + return ACT_RET_PRS_ERR; + } + if (!(rule->arg.gpt.expr->fetch->val & smp_val)) { + memprintf(err, "fetch method '%s' extracts information from '%s', none of which is available here", args[*arg-1], + sample_src_names(rule->arg.gpt.expr->fetch->use)); + free(rule->arg.gpt.expr); + return ACT_RET_PRS_ERR; + } + } + + rule->action = ACT_CUSTOM; + + return ACT_RET_PRS_OK; +} + +/* This function updates the gpc at index 'rule->arg.gpc.idx' of the array on + * the tracksc counter of index 'rule->arg.gpc.sc' stored into the <stream> or + * directly in the session <sess> if <stream> is set to NULL. This gpc is + * set to the value computed by the expression 'rule->arg.gpc.expr' or if + * 'rule->arg.gpc.expr' is null directly to the value of 'rule->arg.gpc.value'. + * + * This function always returns ACT_RET_CONT and parameter flags is unused. + */ +static enum act_return action_add_gpc(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + void *ptr1, *ptr2; + struct stksess *ts; + struct stkctr *stkctr; + unsigned int value = 0; + struct sample *smp; + int smp_opt_dir; + + /* Extract the stksess, return OK if no stksess available. */ + if (s) + stkctr = &s->stkctr[rule->arg.gpc.sc]; + else + stkctr = &sess->stkctr[rule->arg.gpc.sc]; + + ts = stkctr_entry(stkctr); + if (!ts) + return ACT_RET_CONT; + + /* First, update gpc_rate if it's tracked. Second, update its gpc if tracked. */ + ptr1 = stktable_data_ptr_idx(stkctr->table, ts, STKTABLE_DT_GPC_RATE, rule->arg.gpc.idx); + ptr2 = stktable_data_ptr_idx(stkctr->table, ts, STKTABLE_DT_GPC, rule->arg.gpc.idx); + + if (ptr1 || ptr2) { + if (!rule->arg.gpc.expr) + value = (unsigned int)(rule->arg.gpc.value); + else { + switch (rule->from) { + case ACT_F_TCP_REQ_CON: smp_opt_dir = SMP_OPT_DIR_REQ; break; + case ACT_F_TCP_REQ_SES: smp_opt_dir = SMP_OPT_DIR_REQ; break; + case ACT_F_TCP_REQ_CNT: smp_opt_dir = SMP_OPT_DIR_REQ; break; + case ACT_F_TCP_RES_CNT: smp_opt_dir = SMP_OPT_DIR_RES; break; + case ACT_F_HTTP_REQ: smp_opt_dir = SMP_OPT_DIR_REQ; break; + case ACT_F_HTTP_RES: smp_opt_dir = SMP_OPT_DIR_RES; break; + default: + send_log(px, LOG_ERR, "stick table: internal error while setting gpc%u.", rule->arg.gpc.idx); + if (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)) + ha_alert("stick table: internal error while executing setting gpc%u.\n", rule->arg.gpc.idx); + return ACT_RET_CONT; + } + + /* Fetch and cast the expression. 
*/ + smp = sample_fetch_as_type(px, sess, s, smp_opt_dir|SMP_OPT_FINAL, rule->arg.gpc.expr, SMP_T_SINT); + if (!smp) { + send_log(px, LOG_WARNING, "stick table: invalid expression or data type while setting gpc%u.", rule->arg.gpc.idx); + if (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)) + ha_alert("stick table: invalid expression or data type while setting gpc%u.\n", rule->arg.gpc.idx); + return ACT_RET_CONT; + } + value = (unsigned int)(smp->data.u.sint); + } + + if (value) { + /* only update the value if non-null increment */ + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock); + + if (ptr1) + update_freq_ctr_period(&stktable_data_cast(ptr1, std_t_frqp), + stkctr->table->data_arg[STKTABLE_DT_GPC_RATE].u, value); + + if (ptr2) + stktable_data_cast(ptr2, std_t_uint) += value; + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock); + } + /* always touch the table so that it doesn't expire */ + stktable_touch_local(stkctr->table, ts, 0); + } + + return ACT_RET_CONT; +} + +/* This function is a parser for the "sc-add-gpc" action. It understands the + * format: + * + * sc-add-gpc(<gpc IDX>,<track ID>) <expression> + * + * It returns ACT_RET_PRS_ERR if fails and <err> is filled with an error message. + * Otherwise, it returns ACT_RET_PRS_OK and the variable 'rule->arg.gpc.expr' + * is filled with the pointer to the expression to execute or NULL if the arg + * is directly an integer stored into 'rule->arg.gpt.value'. + */ +static enum act_parse_ret parse_add_gpc(const char **args, int *arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + const char *cmd_name = args[*arg-1]; + char *error; + int smp_val; + + cmd_name += strlen("sc-add-gpc"); + if (*cmd_name != '(') { + memprintf(err, "Missing or invalid arguments for '%s'. Expects sc-add-gpc(<GPC ID>,<Track ID>)", args[*arg-1]); + return ACT_RET_PRS_ERR; + } + cmd_name++; /* skip the '(' */ + rule->arg.gpc.idx = strtoul(cmd_name, &error, 10); /* Convert stick table id. */ + if (*error != ',') { + memprintf(err, "Missing gpc ID. Expects %s(<GPC ID>,<Track ID>)", args[*arg-1]); + return ACT_RET_PRS_ERR; + } + else { + cmd_name = error + 1; /* skip the ',' */ + rule->arg.gpc.sc = strtol(cmd_name, &error, 10); /* Convert stick table id. */ + if (*error != ')') { + memprintf(err, "invalid stick table track ID '%s'. Expects %s(<GPC ID>,<Track ID>)", cmd_name, args[*arg-1]); + return ACT_RET_PRS_ERR; + } + + if (rule->arg.gpc.sc >= MAX_SESS_STKCTR) { + memprintf(err, "invalid stick table track ID '%s' for '%s'. 
The max allowed ID is %d", + cmd_name, args[*arg-1], MAX_SESS_STKCTR-1); + return ACT_RET_PRS_ERR; + } + } + rule->action_ptr = action_add_gpc; + + /* value may be either an integer or an expression */ + rule->arg.gpc.expr = NULL; + rule->arg.gpc.value = strtol(args[*arg], &error, 10); + if (*error == '\0') { + /* valid integer, skip it */ + (*arg)++; + } else { + rule->arg.gpc.expr = sample_parse_expr((char **)args, arg, px->conf.args.file, + px->conf.args.line, err, &px->conf.args, NULL); + if (!rule->arg.gpc.expr) + return ACT_RET_PRS_ERR; + + switch (rule->from) { + case ACT_F_TCP_REQ_CON: smp_val = SMP_VAL_FE_CON_ACC; break; + case ACT_F_TCP_REQ_SES: smp_val = SMP_VAL_FE_SES_ACC; break; + case ACT_F_TCP_REQ_CNT: smp_val = SMP_VAL_FE_REQ_CNT; break; + case ACT_F_TCP_RES_CNT: smp_val = SMP_VAL_BE_RES_CNT; break; + case ACT_F_HTTP_REQ: smp_val = SMP_VAL_FE_HRQ_HDR; break; + case ACT_F_HTTP_RES: smp_val = SMP_VAL_BE_HRS_HDR; break; + default: + memprintf(err, "internal error, unexpected rule->from=%d, please report this bug!", rule->from); + return ACT_RET_PRS_ERR; + } + + if (!(rule->arg.gpc.expr->fetch->val & smp_val)) { + memprintf(err, "fetch method '%s' extracts information from '%s', none of which is available here", args[*arg-1], + sample_src_names(rule->arg.gpc.expr->fetch->use)); + free(rule->arg.gpc.expr); + return ACT_RET_PRS_ERR; + } + } + + rule->action = ACT_CUSTOM; + + return ACT_RET_PRS_OK; +} + +/* set temp integer to the number of used entries in the table pointed to by expr. + * Accepts exactly 1 argument of type table. + */ +static int +smp_fetch_table_cnt(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = args->data.t->current; + return 1; +} + +/* set temp integer to the number of free entries in the table pointed to by expr. + * Accepts exactly 1 argument of type table. + */ +static int +smp_fetch_table_avl(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stktable *t; + + t = args->data.t; + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = t->size - t->current; + return 1; +} + +/* Returns a pointer to a stkctr depending on the fetch keyword name. + * It is designed to be called as sc[0-9]_* sc_* or src_* exclusively. + * sc[0-9]_* will return a pointer to the respective field in the + * stream <l4>. sc_* requires an UINT argument specifying the stick + * counter number. src_* will fill a locally allocated structure with + * the table and entry corresponding to what is specified with src_*. + * NULL may be returned if the designated stkctr is not tracked. For + * the sc_* and sc[0-9]_* forms, an optional table argument may be + * passed. When present, the currently tracked key is then looked up + * in the specified table instead of the current table. The purpose is + * to be able to convert multiple values per key (eg: have gpc0 from + * multiple tables). <strm> is allowed to be NULL, in which case only + * the session will be consulted. 
+ */ +struct stkctr * +smp_fetch_sc_stkctr(struct session *sess, struct stream *strm, const struct arg *args, const char *kw, struct stkctr *stkctr) +{ + struct stkctr *stkptr; + struct stksess *stksess; + unsigned int num = kw[2] - '0'; + int arg = 0; + + if (num == '_' - '0') { + /* sc_* variant, args[0] = ctr# (mandatory) */ + num = args[arg++].data.sint; + } + else if (num > 9) { /* src_* variant, args[0] = table */ + struct stktable_key *key; + struct connection *conn = objt_conn(sess->origin); + struct sample smp; + + if (!conn) + return NULL; + + /* Fetch source address in a sample. */ + smp.px = NULL; + smp.sess = sess; + smp.strm = strm; + if (!smp_fetch_src || !smp_fetch_src(empty_arg_list, &smp, "src", NULL)) + return NULL; + + /* Converts into key. */ + key = smp_to_stkey(&smp, args->data.t); + if (!key) + return NULL; + + stkctr->table = args->data.t; + stkctr_set_entry(stkctr, stktable_lookup_key(stkctr->table, key)); + return stkctr; + } + + /* Here, <num> contains the counter number from 0 to 9 for + * the sc[0-9]_ form, or even higher using sc_(num) if needed. + * args[arg] is the first optional argument. We first look up the + * ctr from the stream, then from the session if it was not there. + * But we must be sure the counter does not exceed global.tune.nb_stk_ctr. + */ + if (num >= global.tune.nb_stk_ctr) + return NULL; + + stkptr = NULL; + if (strm && strm->stkctr) + stkptr = &strm->stkctr[num]; + if (!strm || !stkptr || !stkctr_entry(stkptr)) { + if (sess->stkctr) + stkptr = &sess->stkctr[num]; + else + return NULL; + if (!stkctr_entry(stkptr)) + return NULL; + } + + stksess = stkctr_entry(stkptr); + if (!stksess) + return NULL; + + if (unlikely(args[arg].type == ARGT_TAB)) { + /* an alternate table was specified, let's look up the same key there */ + stkctr->table = args[arg].data.t; + stkctr_set_entry(stkctr, stktable_lookup(stkctr->table, stksess)); + return stkctr; + } + return stkptr; +} + +/* same as smp_fetch_sc_stkctr() but dedicated to src_* and can create + * the entry if it doesn't exist yet. This is needed for a few fetch + * functions which need to create an entry, such as src_inc_gpc* and + * src_clr_gpc*. + */ +struct stkctr * +smp_create_src_stkctr(struct session *sess, struct stream *strm, const struct arg *args, const char *kw, struct stkctr *stkctr) +{ + struct stktable_key *key; + struct connection *conn = objt_conn(sess->origin); + struct sample smp; + + if (strncmp(kw, "src_", 4) != 0) + return NULL; + + if (!conn) + return NULL; + + /* Fetch source address in a sample. */ + smp.px = NULL; + smp.sess = sess; + smp.strm = strm; + if (!smp_fetch_src || !smp_fetch_src(empty_arg_list, &smp, "src", NULL)) + return NULL; + + /* Converts into key. */ + key = smp_to_stkey(&smp, args->data.t); + if (!key) + return NULL; + + stkctr->table = args->data.t; + stkctr_set_entry(stkctr, stktable_get_entry(stkctr->table, key)); + return stkctr; +} + +/* set <smp> to a boolean indicating whether the requested stream counter is + * currently being tracked or not. + * Supports being called as "sc[0-9]_tracked" only. 
+ */ +static int +smp_fetch_sc_tracked(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_BOOL; + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + smp->data.u.sint = !!stkctr; + + /* release the ref count */ + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + + return 1; +} + +/* set <smp> to the value of the General Purpose Tag of the index given as + * first arg, from the stream's tracked frontend counters or from the src. + * Supports being called as "sc_get_gpt(<gpt-idx>,<sc-idx>[,<table>])" or + * "src_get_gpt(<gpt-idx>[,<table>])" only. Value zero is returned if + * the key is new or gpt is not stored. + */ +static int +smp_fetch_sc_get_gpt(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + unsigned int idx; + + idx = args[0].data.sint; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args + 1, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (stkctr_entry(stkctr)) { + void *ptr; + + ptr = stktable_data_ptr_idx(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPT, idx); + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* set <smp> to the General Purpose Flag 0 value from the stream's tracked + * frontend counters or from the src. + * Supports being called as "sc[0-9]_get_gpt0" or "src_get_gpt0" only. Value + * zero is returned if the key is new. + */ +static int +smp_fetch_sc_get_gpt0(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (stkctr_entry(stkctr)) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPT0); + if (!ptr) + ptr = stktable_data_ptr_idx(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPT, 0); + + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* set <smp> to the GPC[args(0)]'s value from the stream's tracked + * frontend counters or from the src. + * Supports being called as "sc_get_gpc(<gpc-idx>,<sc-idx>[,<table>])" or + * "src_get_gpc(<gpc-idx>[,<table>])" only. + * Value zero is returned if the key is new or gpc is not stored. 
+ */ +static int +smp_fetch_sc_get_gpc(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + unsigned int idx; + + idx = args[0].data.sint; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args + 1, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (stkctr_entry(stkctr) != NULL) { + void *ptr; + + ptr = stktable_data_ptr_idx(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC, idx); + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* set <smp> to the General Purpose Counter 0 value from the stream's tracked + * frontend counters or from the src. + * Supports being called as "sc[0-9]_get_gpc0" or "src_get_gpc0" only. Value + * zero is returned if the key is new. + */ +static int +smp_fetch_sc_get_gpc0(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (stkctr_entry(stkctr) != NULL) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC0); + if (!ptr) { + /* fallback on the gpc array */ + ptr = stktable_data_ptr_idx(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC, 0); + } + + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* set <smp> to the General Purpose Counter 1 value from the stream's tracked + * frontend counters or from the src. + * Supports being called as "sc[0-9]_get_gpc1" or "src_get_gpc1" only. Value + * zero is returned if the key is new. 
+ */ +static int +smp_fetch_sc_get_gpc1(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (stkctr_entry(stkctr) != NULL) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC1); + if (!ptr) { + /* fallback on the gpc array */ + ptr = stktable_data_ptr_idx(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC, 1); + } + + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* set <smp> to the GPC[args(0)]'s event rate from the stream's + * tracked frontend counters or from the src. + * Supports being called as "sc_gpc_rate(<gpc-idx>,<sc-idx>[,<table])" + * or "src_gpc_rate(<gpc-idx>[,<table>])" only. + * Value zero is returned if the key is new or gpc_rate is not stored. + */ +static int +smp_fetch_sc_gpc_rate(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + unsigned int idx; + + idx = args[0].data.sint; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args + 1, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + if (stkctr_entry(stkctr) != NULL) { + void *ptr; + + ptr = stktable_data_ptr_idx(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC_RATE, idx); + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + stkctr->table->data_arg[STKTABLE_DT_GPC_RATE].u); + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* set <smp> to the General Purpose Counter 0's event rate from the stream's + * tracked frontend counters or from the src. + * Supports being called as "sc[0-9]_gpc0_rate" or "src_gpc0_rate" only. + * Value zero is returned if the key is new. 
+ */ +static int +smp_fetch_sc_gpc0_rate(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + unsigned int period; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + if (stkctr_entry(stkctr) != NULL) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC0_RATE); + if (ptr) { + period = stkctr->table->data_arg[STKTABLE_DT_GPC0_RATE].u; + } + else { + /* fallback on the gpc array */ + ptr = stktable_data_ptr_idx(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC_RATE, 0); + if (ptr) + period = stkctr->table->data_arg[STKTABLE_DT_GPC_RATE].u; + } + + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), period); + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* set <smp> to the General Purpose Counter 1's event rate from the stream's + * tracked frontend counters or from the src. + * Supports being called as "sc[0-9]_gpc1_rate" or "src_gpc1_rate" only. + * Value zero is returned if the key is new. + */ +static int +smp_fetch_sc_gpc1_rate(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + unsigned int period; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + if (stkctr_entry(stkctr) != NULL) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC1_RATE); + if (ptr) { + period = stkctr->table->data_arg[STKTABLE_DT_GPC1_RATE].u; + } + else { + /* fallback on the gpc array */ + ptr = stktable_data_ptr_idx(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC_RATE, 1); + if (ptr) + period = stkctr->table->data_arg[STKTABLE_DT_GPC_RATE].u; + } + + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), period); + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* Increment the GPC[args(0)] value from the stream's tracked + * frontend counters and return it into temp integer. + * Supports being called as "sc_inc_gpc(<gpc-idx>,<sc-idx>[,<table>])" + * or "src_inc_gpc(<gpc-idx>[,<table>])" only. 
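+ *
+ * Unlike the sc-inc-gpc action, this fetch also returns the updated
+ * value, e.g. (variable name, GPC index and table are invented):
+ *
+ *     http-request set-var(txn.gpc) src_inc_gpc(0,per_ip)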
+ */
+static int
+smp_fetch_sc_inc_gpc(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	struct stkctr tmpstkctr;
+	struct stkctr *stkctr;
+	unsigned int idx;
+
+	idx = args[0].data.sint;
+
+	stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args + 1, kw, &tmpstkctr);
+	if (!stkctr)
+		return 0;
+
+	smp->flags = SMP_F_VOL_TEST;
+	smp->data.type = SMP_T_SINT;
+	smp->data.u.sint = 0;
+
+	if (!stkctr_entry(stkctr))
+		stkctr = smp_create_src_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr);
+
+	if (stkctr && stkctr_entry(stkctr)) {
+		void *ptr1, *ptr2;
+
+		/* First, update the indexed gpc_rate if it's tracked. Second, update
+		 * the indexed gpc if tracked. Returns the gpc value if tracked,
+		 * otherwise the rate's curr_ctr.
+		 */
+		ptr1 = stktable_data_ptr_idx(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC_RATE, idx);
+		ptr2 = stktable_data_ptr_idx(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC, idx);
+		if (ptr1 || ptr2) {
+			HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock);
+
+			if (ptr1) {
+				update_freq_ctr_period(&stktable_data_cast(ptr1, std_t_frqp),
+				                       stkctr->table->data_arg[STKTABLE_DT_GPC_RATE].u, 1);
+				smp->data.u.sint = (&stktable_data_cast(ptr1, std_t_frqp))->curr_ctr;
+			}
+
+			if (ptr2)
+				smp->data.u.sint = ++stktable_data_cast(ptr2, std_t_uint);
+
+			HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock);
+
+			/* If data was modified, we need to touch to re-schedule sync */
+			stktable_touch_local(stkctr->table, stkctr_entry(stkctr), (stkctr == &tmpstkctr) ? 1 : 0);
+		}
+		else if (stkctr == &tmpstkctr)
+			stktable_release(stkctr->table, stkctr_entry(stkctr));
+	}
+	return 1;
+}
+
+/* Increment the General Purpose Counter 0 value from the stream's tracked
+ * frontend counters and return it into temp integer.
+ * Supports being called as "sc[0-9]_inc_gpc0" or "src_inc_gpc0" only.
+ */
+static int
+smp_fetch_sc_inc_gpc0(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	struct stkctr tmpstkctr;
+	struct stkctr *stkctr;
+	unsigned int period = 0;
+
+	stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr);
+	if (!stkctr)
+		return 0;
+
+	smp->flags = SMP_F_VOL_TEST;
+	smp->data.type = SMP_T_SINT;
+	smp->data.u.sint = 0;
+
+	if (!stkctr_entry(stkctr))
+		stkctr = smp_create_src_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr);
+
+	if (stkctr && stkctr_entry(stkctr)) {
+		void *ptr1, *ptr2;
+
+		/* First, update gpc0_rate if it's tracked. Second, update its
+		 * gpc0 if tracked. Returns gpc0's value if tracked, otherwise
+		 * the rate's curr_ctr.
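+		 *
+		 * Illustrative fetch usage reaching this code, not taken from
+		 * this source (the ACL name and threshold are hypothetical),
+		 * assuming gpc0 is stored in the tracked table:
+		 *   acl bumped sc0_inc_gpc0 gt 10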
+ */ + ptr1 = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC0_RATE); + if (ptr1) { + period = stkctr->table->data_arg[STKTABLE_DT_GPC0_RATE].u; + } + else { + /* fallback on the gpc array */ + ptr1 = stktable_data_ptr_idx(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC_RATE, 0); + if (ptr1) + period = stkctr->table->data_arg[STKTABLE_DT_GPC_RATE].u; + } + + ptr2 = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC0); + if (!ptr2) { + /* fallback on the gpc array */ + ptr2 = stktable_data_ptr_idx(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC, 0); + } + + if (ptr1 || ptr2) { + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (ptr1) { + update_freq_ctr_period(&stktable_data_cast(ptr1, std_t_frqp), + period, 1); + smp->data.u.sint = (&stktable_data_cast(ptr1, std_t_frqp))->curr_ctr; + } + + if (ptr2) + smp->data.u.sint = ++stktable_data_cast(ptr2, std_t_uint); + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + /* If data was modified, we need to touch to re-schedule sync */ + stktable_touch_local(stkctr->table, stkctr_entry(stkctr), (stkctr == &tmpstkctr) ? 1 : 0); + } + else if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* Increment the General Purpose Counter 1 value from the stream's tracked + * frontend counters and return it into temp integer. + * Supports being called as "sc[0-9]_inc_gpc1" or "src_inc_gpc1" only. + */ +static int +smp_fetch_sc_inc_gpc1(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + unsigned int period = 0; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!stkctr_entry(stkctr)) + stkctr = smp_create_src_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + + if (stkctr && stkctr_entry(stkctr)) { + void *ptr1,*ptr2; + + + /* First, update gpc1_rate if it's tracked. Second, update its + * gpc1 if tracked. Returns gpc1's value otherwise the curr_ctr. + */ + ptr1 = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC1_RATE); + if (ptr1) { + period = stkctr->table->data_arg[STKTABLE_DT_GPC1_RATE].u; + } + else { + /* fallback on the gpc array */ + ptr1 = stktable_data_ptr_idx(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC_RATE, 1); + if (ptr1) + period = stkctr->table->data_arg[STKTABLE_DT_GPC_RATE].u; + } + + ptr2 = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC1); + if (!ptr2) { + /* fallback on the gpc array */ + ptr2 = stktable_data_ptr_idx(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC, 1); + } + + if (ptr1 || ptr2) { + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (ptr1) { + update_freq_ctr_period(&stktable_data_cast(ptr1, std_t_frqp), + period, 1); + smp->data.u.sint = (&stktable_data_cast(ptr1, std_t_frqp))->curr_ctr; + } + + if (ptr2) + smp->data.u.sint = ++stktable_data_cast(ptr2, std_t_uint); + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + /* If data was modified, we need to touch to re-schedule sync */ + stktable_touch_local(stkctr->table, stkctr_entry(stkctr), (stkctr == &tmpstkctr) ? 
1 : 0); + } + else if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* Clear the GPC[args(0)] value from the stream's tracked + * frontend counters and return its previous value into temp integer. + * Supports being called as "sc_clr_gpc(<gpc-idx>,<sc-idx>[,<table>])" + * or "src_clr_gpc(<gpc-idx>[,<table>])" only. + */ +static int +smp_fetch_sc_clr_gpc(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + unsigned int idx; + + idx = args[0].data.sint; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args + 1, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!stkctr_entry(stkctr)) + stkctr = smp_create_src_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + + if (stkctr && stkctr_entry(stkctr)) { + void *ptr; + + ptr = stktable_data_ptr_idx(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC, idx); + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + stktable_data_cast(ptr, std_t_uint) = 0; + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + /* If data was modified, we need to touch to re-schedule sync */ + stktable_touch_local(stkctr->table, stkctr_entry(stkctr), (stkctr == &tmpstkctr) ? 1 : 0); + } + return 1; +} + +/* Clear the General Purpose Counter 0 value from the stream's tracked + * frontend counters and return its previous value into temp integer. + * Supports being called as "sc[0-9]_clr_gpc0" or "src_clr_gpc0" only. + */ +static int +smp_fetch_sc_clr_gpc0(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!stkctr_entry(stkctr)) + stkctr = smp_create_src_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + + if (stkctr && stkctr_entry(stkctr)) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC0); + if (!ptr) { + /* fallback on the gpc array */ + ptr = stktable_data_ptr_idx(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC, 0); + } + + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + stktable_data_cast(ptr, std_t_uint) = 0; + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + /* If data was modified, we need to touch to re-schedule sync */ + stktable_touch_local(stkctr->table, stkctr_entry(stkctr), (stkctr == &tmpstkctr) ? 1 : 0); + } + return 1; +} + +/* Clear the General Purpose Counter 1 value from the stream's tracked + * frontend counters and return its previous value into temp integer. + * Supports being called as "sc[0-9]_clr_gpc1" or "src_clr_gpc1" only. 
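+ *
+ * Illustrative usage, not taken from this source (the variable name is
+ * hypothetical); the fetch returns the old value and resets the counter:
+ *   http-request set-var(txn.prev_gpc1) sc0_clr_gpc1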
+ */ +static int +smp_fetch_sc_clr_gpc1(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + + if (!stkctr_entry(stkctr)) + stkctr = smp_create_src_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + + if (stkctr && stkctr_entry(stkctr)) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC1); + if (!ptr) { + /* fallback on the gpc array */ + ptr = stktable_data_ptr_idx(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_GPC, 1); + } + + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + stktable_data_cast(ptr, std_t_uint) = 0; + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + /* If data was modified, we need to touch to re-schedule sync */ + stktable_touch_local(stkctr->table, stkctr_entry(stkctr), (stkctr == &tmpstkctr) ? 1 : 0); + } + return 1; +} + +/* set <smp> to the cumulated number of connections from the stream's tracked + * frontend counters. Supports being called as "sc[0-9]_conn_cnt" or + * "src_conn_cnt" only. + */ +static int +smp_fetch_sc_conn_cnt(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + if (stkctr_entry(stkctr) != NULL) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_CONN_CNT); + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + + + } + return 1; +} + +/* set <smp> to the connection rate from the stream's tracked frontend + * counters. Supports being called as "sc[0-9]_conn_rate" or "src_conn_rate" + * only. 
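+ *
+ * Illustrative usage, not taken from this source (the threshold is
+ * hypothetical), assuming conn_rate(10s) is stored:
+ *   tcp-request connection reject if { src_conn_rate gt 50 }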
+ */
+static int
+smp_fetch_sc_conn_rate(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	struct stkctr tmpstkctr;
+	struct stkctr *stkctr;
+
+	stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr);
+	if (!stkctr)
+		return 0;
+
+	smp->flags = SMP_F_VOL_TEST;
+	smp->data.type = SMP_T_SINT;
+	smp->data.u.sint = 0;
+	if (stkctr_entry(stkctr) != NULL) {
+		void *ptr;
+
+		ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_CONN_RATE);
+		if (!ptr) {
+			if (stkctr == &tmpstkctr)
+				stktable_release(stkctr->table, stkctr_entry(stkctr));
+			return 0; /* parameter not stored */
+		}
+
+		HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock);
+
+		smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp),
+		                                        stkctr->table->data_arg[STKTABLE_DT_CONN_RATE].u);
+
+		HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock);
+
+		if (stkctr == &tmpstkctr)
+			stktable_release(stkctr->table, stkctr_entry(stkctr));
+	}
+	return 1;
+}
+
+/* set temp integer to the number of connections from the stream's source address
+ * in the table pointed to by expr, after updating it.
+ * Accepts exactly 1 argument of type table.
+ */
+static int
+smp_fetch_src_updt_conn_cnt(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	struct connection *conn = objt_conn(smp->sess->origin);
+	struct stksess *ts;
+	struct stktable_key *key;
+	void *ptr;
+	struct stktable *t;
+
+	if (!conn)
+		return 0;
+
+	/* Fetch the source address as a sample. */
+	if (!smp_fetch_src || !smp_fetch_src(empty_arg_list, smp, "src", NULL))
+		return 0;
+
+	/* Convert it into a table key. */
+	key = smp_to_stkey(smp, args->data.t);
+	if (!key)
+		return 0;
+
+	t = args->data.t;
+
+	if ((ts = stktable_get_entry(t, key)) == NULL)
+		/* entry does not exist and could not be created */
+		return 0;
+
+	ptr = stktable_data_ptr(t, ts, STKTABLE_DT_CONN_CNT);
+	if (!ptr) {
+		return 0; /* parameter not stored in this table */
+	}
+
+	smp->data.type = SMP_T_SINT;
+
+	HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock);
+
+	smp->data.u.sint = ++stktable_data_cast(ptr, std_t_uint);
+
+	HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock);
+
+	smp->flags = SMP_F_VOL_TEST;
+
+	/* The entry was modified locally, touch it to schedule a peer sync. */
+	stktable_touch_local(t, ts, 1);
+
+	return 1;
+}
+
+/* set <smp> to the number of concurrent connections from the stream's tracked
+ * frontend counters. Supports being called as "sc[0-9]_conn_cur" or
+ * "src_conn_cur" only.
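+ *
+ * Illustrative usage, not taken from this source (the limit is
+ * hypothetical), assuming conn_cur is stored:
+ *   tcp-request connection reject if { src_conn_cur ge 10 }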
+ */ +static int +smp_fetch_sc_conn_cur(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + if (stkctr_entry(stkctr) != NULL) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_CONN_CUR); + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* set <smp> to the cumulated number of streams from the stream's tracked + * frontend counters. Supports being called as "sc[0-9]_sess_cnt" or + * "src_sess_cnt" only. + */ +static int +smp_fetch_sc_sess_cnt(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + if (stkctr_entry(stkctr) != NULL) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_SESS_CNT); + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* set <smp> to the stream rate from the stream's tracked frontend counters. + * Supports being called as "sc[0-9]_sess_rate" or "src_sess_rate" only. + */ +static int +smp_fetch_sc_sess_rate(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + if (stkctr_entry(stkctr) != NULL) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_SESS_RATE); + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + stkctr->table->data_arg[STKTABLE_DT_SESS_RATE].u); + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* set <smp> to the cumulated number of HTTP requests from the stream's tracked + * frontend counters. Supports being called as "sc[0-9]_http_req_cnt" or + * "src_http_req_cnt" only. 
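+ *
+ * Illustrative usage, not taken from this source (the threshold is
+ * hypothetical), assuming http_req_cnt is stored:
+ *   http-request deny if { sc0_http_req_cnt gt 1000 }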
+ */
+static int
+smp_fetch_sc_http_req_cnt(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	struct stkctr tmpstkctr;
+	struct stkctr *stkctr;
+
+	stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr);
+	if (!stkctr)
+		return 0;
+
+	smp->flags = SMP_F_VOL_TEST;
+	smp->data.type = SMP_T_SINT;
+	smp->data.u.sint = 0;
+	if (stkctr_entry(stkctr) != NULL) {
+		void *ptr;
+
+		ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_HTTP_REQ_CNT);
+		if (!ptr) {
+			if (stkctr == &tmpstkctr)
+				stktable_release(stkctr->table, stkctr_entry(stkctr));
+			return 0; /* parameter not stored */
+		}
+
+		HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock);
+
+		smp->data.u.sint = stktable_data_cast(ptr, std_t_uint);
+
+		HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock);
+
+		if (stkctr == &tmpstkctr)
+			stktable_release(stkctr->table, stkctr_entry(stkctr));
+	}
+	return 1;
+}
+
+/* set <smp> to the HTTP request rate from the stream's tracked frontend
+ * counters. Supports being called as "sc[0-9]_http_req_rate" or
+ * "src_http_req_rate" only.
+ */
+static int
+smp_fetch_sc_http_req_rate(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	struct stkctr tmpstkctr;
+	struct stkctr *stkctr;
+
+	stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr);
+	if (!stkctr)
+		return 0;
+
+	smp->flags = SMP_F_VOL_TEST;
+	smp->data.type = SMP_T_SINT;
+	smp->data.u.sint = 0;
+	if (stkctr_entry(stkctr) != NULL) {
+		void *ptr;
+
+		ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_HTTP_REQ_RATE);
+		if (!ptr) {
+			if (stkctr == &tmpstkctr)
+				stktable_release(stkctr->table, stkctr_entry(stkctr));
+			return 0; /* parameter not stored */
+		}
+
+		HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock);
+
+		smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp),
+		                                        stkctr->table->data_arg[STKTABLE_DT_HTTP_REQ_RATE].u);
+
+		HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock);
+
+		if (stkctr == &tmpstkctr)
+			stktable_release(stkctr->table, stkctr_entry(stkctr));
+	}
+	return 1;
+}
+
+/* set <smp> to the cumulated number of HTTP request errors from the stream's
+ * tracked frontend counters. Supports being called as "sc[0-9]_http_err_cnt" or
+ * "src_http_err_cnt" only.
+ */
+static int
+smp_fetch_sc_http_err_cnt(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	struct stkctr tmpstkctr;
+	struct stkctr *stkctr;
+
+	stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr);
+	if (!stkctr)
+		return 0;
+
+	smp->flags = SMP_F_VOL_TEST;
+	smp->data.type = SMP_T_SINT;
+	smp->data.u.sint = 0;
+	if (stkctr_entry(stkctr) != NULL) {
+		void *ptr;
+
+		ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_HTTP_ERR_CNT);
+		if (!ptr) {
+			if (stkctr == &tmpstkctr)
+				stktable_release(stkctr->table, stkctr_entry(stkctr));
+			return 0; /* parameter not stored */
+		}
+
+		HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock);
+
+		smp->data.u.sint = stktable_data_cast(ptr, std_t_uint);
+
+		HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock);
+
+		if (stkctr == &tmpstkctr)
+			stktable_release(stkctr->table, stkctr_entry(stkctr));
+	}
+	return 1;
+}
+
+/* set <smp> to the HTTP request error rate from the stream's tracked frontend
+ * counters. Supports being called as "sc[0-9]_http_err_rate" or
+ * "src_http_err_rate" only.
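+ *
+ * Illustrative usage, not taken from this source (the threshold is
+ * hypothetical), assuming http_err_rate(10s) is stored:
+ *   http-request deny if { sc0_http_err_rate gt 20 }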
+ */ +static int +smp_fetch_sc_http_err_rate(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + if (stkctr_entry(stkctr) != NULL) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_HTTP_ERR_RATE); + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + stkctr->table->data_arg[STKTABLE_DT_HTTP_ERR_RATE].u); + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* set <smp> to the cumulated number of HTTP response failures from the stream's + * tracked frontend counters. Supports being called as "sc[0-9]_http_fail_cnt" or + * "src_http_fail_cnt" only. + */ +static int +smp_fetch_sc_http_fail_cnt(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + if (stkctr_entry(stkctr) != NULL) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_HTTP_FAIL_CNT); + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = stktable_data_cast(ptr, std_t_uint); + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* set <smp> to the HTTP response failure rate from the stream's tracked frontend + * counters. Supports being called as "sc[0-9]_http_fail_rate" or + * "src_http_fail_rate" only. + */ +static int +smp_fetch_sc_http_fail_rate(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + if (stkctr_entry(stkctr) != NULL) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_HTTP_FAIL_RATE); + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + stkctr->table->data_arg[STKTABLE_DT_HTTP_FAIL_RATE].u); + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* set <smp> to the number of kbytes received from clients, as found in the + * stream's tracked frontend counters. 
Supports being called as + * "sc[0-9]_kbytes_in" or "src_kbytes_in" only. + */ +static int +smp_fetch_sc_kbytes_in(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + if (stkctr_entry(stkctr) != NULL) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_BYTES_IN_CNT); + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = stktable_data_cast(ptr, std_t_ull) >> 10; + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* set <smp> to the data rate received from clients in bytes/s, as found + * in the stream's tracked frontend counters. Supports being called as + * "sc[0-9]_bytes_in_rate" or "src_bytes_in_rate" only. + */ +static int +smp_fetch_sc_bytes_in_rate(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + if (stkctr_entry(stkctr) != NULL) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_BYTES_IN_RATE); + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + stkctr->table->data_arg[STKTABLE_DT_BYTES_IN_RATE].u); + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* set <smp> to the number of kbytes sent to clients, as found in the + * stream's tracked frontend counters. Supports being called as + * "sc[0-9]_kbytes_out" or "src_kbytes_out" only. + */ +static int +smp_fetch_sc_kbytes_out(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + if (stkctr_entry(stkctr) != NULL) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_BYTES_OUT_CNT); + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = stktable_data_cast(ptr, std_t_ull) >> 10; + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* set <smp> to the data rate sent to clients in bytes/s, as found in the + * stream's tracked frontend counters. 
Supports being called as + * "sc[0-9]_bytes_out_rate" or "src_bytes_out_rate" only. + */ +static int +smp_fetch_sc_bytes_out_rate(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + smp->data.u.sint = 0; + if (stkctr_entry(stkctr) != NULL) { + void *ptr; + + ptr = stktable_data_ptr(stkctr->table, stkctr_entry(stkctr), STKTABLE_DT_BYTES_OUT_RATE); + if (!ptr) { + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + return 0; /* parameter not stored */ + } + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + smp->data.u.sint = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + stkctr->table->data_arg[STKTABLE_DT_BYTES_OUT_RATE].u); + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &stkctr_entry(stkctr)->lock); + + if (stkctr == &tmpstkctr) + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + return 1; +} + +/* set <smp> to the number of active trackers on the SC entry in the stream's + * tracked frontend counters. Supports being called as "sc[0-9]_trackers" only. + */ +static int +smp_fetch_sc_trackers(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct stkctr tmpstkctr; + struct stkctr *stkctr; + + stkctr = smp_fetch_sc_stkctr(smp->sess, smp->strm, args, kw, &tmpstkctr); + if (!stkctr) + return 0; + + smp->flags = SMP_F_VOL_TEST; + smp->data.type = SMP_T_SINT; + if (stkctr == &tmpstkctr) { + smp->data.u.sint = stkctr_entry(stkctr) ? (HA_ATOMIC_LOAD(&stkctr_entry(stkctr)->ref_cnt) - 1) : 0; + stktable_release(stkctr->table, stkctr_entry(stkctr)); + } + else { + smp->data.u.sint = stkctr_entry(stkctr) ? HA_ATOMIC_LOAD(&stkctr_entry(stkctr)->ref_cnt) : 0; + } + + return 1; +} + + +/* The functions below are used to manipulate table contents from the CLI. + * There are 3 main actions, "clear", "set" and "show". The code is shared + * between all actions, and the action is encoded in the void *private in + * the appctx as well as in the keyword registration, among one of the + * following values. + */ + +enum { + STK_CLI_ACT_CLR, + STK_CLI_ACT_SET, + STK_CLI_ACT_SHOW, +}; + +/* Dump the status of a table to a stream connector's + * read buffer. It returns 0 if the output buffer is full + * and needs to be called again, otherwise non-zero. + */ +static int table_dump_head_to_buffer(struct buffer *msg, + struct appctx *appctx, + struct stktable *t, struct stktable *target) +{ + struct stream *s = __sc_strm(appctx_sc(appctx)); + + chunk_appendf(msg, "# table: %s, type: %s, size:%d, used:%d\n", + t->id, stktable_types[t->type].kw, t->size, t->current); + + /* any other information should be dumped here */ + + if (target && (strm_li(s)->bind_conf->level & ACCESS_LVL_MASK) < ACCESS_LVL_OPER) + chunk_appendf(msg, "# contents not dumped due to insufficient privileges\n"); + + if (applet_putchk(appctx, msg) == -1) + return 0; + + return 1; +} + +/* Dump a table entry to a stream connector's + * read buffer. It returns 0 if the output buffer is full + * and needs to be called again, otherwise non-zero. 
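+ *
+ * Illustrative output line, not taken from this source (the pointer, key
+ * and values are hypothetical), for a table storing conn_rate(10s) and
+ * gpc0:
+ *   0x55d2c9aa1b40: key=192.0.2.1 use=0 exp=35982 shard=0 conn_rate(10000)=3 gpc0=2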
+ */ +static int table_dump_entry_to_buffer(struct buffer *msg, + struct appctx *appctx, + struct stktable *t, struct stksess *entry) +{ + int dt; + + chunk_appendf(msg, "%p:", entry); + + if (t->type == SMP_T_IPV4) { + char addr[INET_ADDRSTRLEN]; + inet_ntop(AF_INET, (const void *)&entry->key.key, addr, sizeof(addr)); + chunk_appendf(msg, " key=%s", addr); + } + else if (t->type == SMP_T_IPV6) { + char addr[INET6_ADDRSTRLEN]; + inet_ntop(AF_INET6, (const void *)&entry->key.key, addr, sizeof(addr)); + chunk_appendf(msg, " key=%s", addr); + } + else if (t->type == SMP_T_SINT) { + chunk_appendf(msg, " key=%u", read_u32(entry->key.key)); + } + else if (t->type == SMP_T_STR) { + chunk_appendf(msg, " key="); + dump_text(msg, (const char *)entry->key.key, t->key_size); + } + else { + chunk_appendf(msg, " key="); + dump_binary(msg, (const char *)entry->key.key, t->key_size); + } + + chunk_appendf(msg, " use=%d exp=%d shard=%d", HA_ATOMIC_LOAD(&entry->ref_cnt) - 1, tick_remain(now_ms, entry->expire), entry->shard); + + for (dt = 0; dt < STKTABLE_DATA_TYPES; dt++) { + void *ptr; + + if (t->data_ofs[dt] == 0) + continue; + if (stktable_data_types[dt].is_array) { + char tmp[16] = {}; + const char *name_pfx = stktable_data_types[dt].name; + const char *name_sfx = NULL; + unsigned int idx = 0; + int i = 0; + + /* split name to show index before first _ of the name + * for example: 'gpc3_rate' if array name is 'gpc_rate'. + */ + for (i = 0 ; i < (sizeof(tmp) - 1); i++) { + if (!name_pfx[i]) + break; + if (name_pfx[i] == '_') { + name_pfx = &tmp[0]; + name_sfx = &stktable_data_types[dt].name[i]; + break; + } + tmp[i] = name_pfx[i]; + } + + ptr = stktable_data_ptr_idx(t, entry, dt, idx); + while (ptr) { + if (stktable_data_types[dt].arg_type == ARG_T_DELAY) + chunk_appendf(msg, " %s%u%s(%u)=", name_pfx, idx, name_sfx ? name_sfx : "", t->data_arg[dt].u); + else + chunk_appendf(msg, " %s%u%s=", name_pfx, idx, name_sfx ? name_sfx : ""); + switch (stktable_data_types[dt].std_type) { + case STD_T_SINT: + chunk_appendf(msg, "%d", stktable_data_cast(ptr, std_t_sint)); + break; + case STD_T_UINT: + chunk_appendf(msg, "%u", stktable_data_cast(ptr, std_t_uint)); + break; + case STD_T_ULL: + chunk_appendf(msg, "%llu", stktable_data_cast(ptr, std_t_ull)); + break; + case STD_T_FRQP: + chunk_appendf(msg, "%u", + read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + t->data_arg[dt].u)); + break; + } + ptr = stktable_data_ptr_idx(t, entry, dt, ++idx); + } + continue; + } + if (stktable_data_types[dt].arg_type == ARG_T_DELAY) + chunk_appendf(msg, " %s(%u)=", stktable_data_types[dt].name, t->data_arg[dt].u); + else + chunk_appendf(msg, " %s=", stktable_data_types[dt].name); + + ptr = stktable_data_ptr(t, entry, dt); + switch (stktable_data_types[dt].std_type) { + case STD_T_SINT: + chunk_appendf(msg, "%d", stktable_data_cast(ptr, std_t_sint)); + break; + case STD_T_UINT: + chunk_appendf(msg, "%u", stktable_data_cast(ptr, std_t_uint)); + break; + case STD_T_ULL: + chunk_appendf(msg, "%llu", stktable_data_cast(ptr, std_t_ull)); + break; + case STD_T_FRQP: + chunk_appendf(msg, "%u", + read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + t->data_arg[dt].u)); + break; + case STD_T_DICT: { + struct dict_entry *de; + de = stktable_data_cast(ptr, std_t_dict); + chunk_appendf(msg, "%s", de ? 
(char *)de->value.key : "-"); + break; + } + } + } + chunk_appendf(msg, "\n"); + + if (applet_putchk(appctx, msg) == -1) + return 0; + + return 1; +} + +/* appctx context used by the "show table" command */ +struct show_table_ctx { + void *target; /* table we want to dump, or NULL for all */ + struct stktable *t; /* table being currently dumped (first if NULL) */ + struct stksess *entry; /* last entry we were trying to dump (or first if NULL) */ + long long value[STKTABLE_FILTER_LEN]; /* value to compare against */ + signed char data_type[STKTABLE_FILTER_LEN]; /* type of data to compare, or -1 if none */ + signed char data_op[STKTABLE_FILTER_LEN]; /* operator (STD_OP_*) when data_type set */ + enum { + STATE_NEXT = 0, /* px points to next table, entry=NULL */ + STATE_DUMP, /* px points to curr table, entry is valid, refcount held */ + STATE_DONE, /* done dumping */ + } state; + char action; /* action on the table : one of STK_CLI_ACT_* */ +}; + +/* Processes a single table entry matching a specific key passed in argument. + * returns 0 if wants to be called again, 1 if has ended processing. + */ +static int table_process_entry_per_key(struct appctx *appctx, char **args) +{ + struct show_table_ctx *ctx = appctx->svcctx; + struct stktable *t = ctx->target; + struct stksess *ts; + struct sample key; + long long value; + int data_type; + int cur_arg; + void *ptr; + struct freq_ctr *frqp; + + if (!*args[4]) + return cli_err(appctx, "Key value expected\n"); + + memset(&key, 0, sizeof(key)); + key.data.type = SMP_T_STR; + key.data.u.str.area = args[4]; + key.data.u.str.data = strlen(args[4]); + + switch (t->type) { + case SMP_T_IPV4: + case SMP_T_IPV6: + /* prefer input format over table type when parsing ip addresses, + * then let smp_to_stkey() do the conversion for us when needed + */ + BUG_ON(!sample_casts[key.data.type][SMP_T_ADDR]); + if (!sample_casts[key.data.type][SMP_T_ADDR](&key)) + return cli_err(appctx, "Invalid key\n"); + break; + case SMP_T_SINT: + case SMP_T_STR: + break; + default: + switch (ctx->action) { + case STK_CLI_ACT_SHOW: + return cli_err(appctx, "Showing keys from tables of type other than ip, ipv6, string and integer is not supported\n"); + case STK_CLI_ACT_CLR: + return cli_err(appctx, "Removing keys from tables of type other than ip, ipv6, string and integer is not supported\n"); + case STK_CLI_ACT_SET: + return cli_err(appctx, "Inserting keys into tables of type other than ip, ipv6, string and integer is not supported\n"); + default: + return cli_err(appctx, "Unknown action\n"); + } + } + + /* try to convert key according to table type + * (it will fill static_table_key on success) + */ + if (!smp_to_stkey(&key, t)) + return cli_err(appctx, "Invalid key\n"); + + /* check permissions */ + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return 1; + + switch (ctx->action) { + case STK_CLI_ACT_SHOW: + ts = stktable_lookup_key(t, &static_table_key); + if (!ts) + return 1; + chunk_reset(&trash); + if (!table_dump_head_to_buffer(&trash, appctx, t, t)) { + stktable_release(t, ts); + return 0; + } + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &ts->lock); + if (!table_dump_entry_to_buffer(&trash, appctx, t, ts)) { + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &ts->lock); + stktable_release(t, ts); + return 0; + } + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &ts->lock); + stktable_release(t, ts); + break; + + case STK_CLI_ACT_CLR: + ts = stktable_lookup_key(t, &static_table_key); + if (!ts) + return 1; + + if (!stksess_kill(t, ts, 1)) { + /* don't delete an entry which is currently referenced */ + return 
cli_err(appctx, "Entry currently in use, cannot remove\n");
+		}
+		break;
+
+	case STK_CLI_ACT_SET:
+		ts = stktable_get_entry(t, &static_table_key);
+		if (!ts) {
+			/* allocation failed: the entry could not be created */
+			return cli_err(appctx, "Unable to allocate a new entry\n");
+		}
+		HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock);
+		for (cur_arg = 5; *args[cur_arg]; cur_arg += 2) {
+			if (strncmp(args[cur_arg], "data.", 5) != 0) {
+				cli_err(appctx, "\"data.<type>\" followed by a value expected\n");
+				HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock);
+				stktable_touch_local(t, ts, 1);
+				return 1;
+			}
+
+			data_type = stktable_get_data_type(args[cur_arg] + 5);
+			if (data_type < 0) {
+				cli_err(appctx, "Unknown data type\n");
+				HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock);
+				stktable_touch_local(t, ts, 1);
+				return 1;
+			}
+
+			if (!t->data_ofs[data_type]) {
+				cli_err(appctx, "Data type not stored in this table\n");
+				HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock);
+				stktable_touch_local(t, ts, 1);
+				return 1;
+			}
+
+			if (!*args[cur_arg+1] || strl2llrc(args[cur_arg+1], strlen(args[cur_arg+1]), &value) != 0) {
+				cli_err(appctx, "Require a valid integer value to store\n");
+				HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock);
+				stktable_touch_local(t, ts, 1);
+				return 1;
+			}
+
+			ptr = stktable_data_ptr(t, ts, data_type);
+
+			switch (stktable_data_types[data_type].std_type) {
+			case STD_T_SINT:
+				stktable_data_cast(ptr, std_t_sint) = value;
+				break;
+			case STD_T_UINT:
+				stktable_data_cast(ptr, std_t_uint) = value;
+				break;
+			case STD_T_ULL:
+				stktable_data_cast(ptr, std_t_ull) = value;
+				break;
+			case STD_T_FRQP:
+				/* We set both the current and previous values. That way
+				 * the reported frequency is stable during all the period
+				 * then slowly fades out. This allows external tools to
+				 * push measures without having to update them too often.
+				 */
+				frqp = &stktable_data_cast(ptr, std_t_frqp);
+				/* First bit is reserved for the freq_ctr lock.
+				 * Note: here we're still protected by the stksess lock
+				 * so we don't need to update the freq_ctr using its
+				 * internal lock.
+				 */
+				frqp->curr_tick = now_ms & ~0x1;
+				frqp->prev_ctr = 0;
+				frqp->curr_ctr = value;
+				break;
+			}
+		}
+		HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock);
+		stktable_touch_local(t, ts, 1);
+		break;
+
+	default:
+		return cli_err(appctx, "Unknown action\n");
+	}
+	return 1;
+}
+
+/* Prepares the appctx fields with the data-based filters from the command line.
+ * Returns 0 if the dump can proceed, 1 if has ended processing.
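+ *
+ * Illustrative CLI input parsed here, not taken from this source (the
+ * table name and values are hypothetical):
+ *   show table mytable data.conn_rate gt 100 data.gpc0 eq 0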
+ */
+static int table_prepare_data_request(struct appctx *appctx, char **args)
+{
+	struct show_table_ctx *ctx = appctx->svcctx;
+	int i;
+	char *err = NULL;
+
+	if (ctx->action != STK_CLI_ACT_SHOW && ctx->action != STK_CLI_ACT_CLR)
+		return cli_err(appctx, "content-based lookup is only supported with the \"show\" and \"clear\" actions\n");
+
+	for (i = 0; i < STKTABLE_FILTER_LEN; i++) {
+		if (i > 0 && !*args[3+3*i]) // number of filter entries can be less than STKTABLE_FILTER_LEN
+			break;
+		/* condition on stored data value */
+		ctx->data_type[i] = stktable_get_data_type(args[3+3*i] + 5);
+		if (ctx->data_type[i] < 0)
+			return cli_dynerr(appctx, memprintf(&err, "Filter entry #%i: Unknown data type\n", i + 1));
+
+		if (!((struct stktable *)ctx->target)->data_ofs[ctx->data_type[i]])
+			return cli_dynerr(appctx, memprintf(&err, "Filter entry #%i: Data type not stored in this table\n", i + 1));
+
+		ctx->data_op[i] = get_std_op(args[4+3*i]);
+		if (ctx->data_op[i] < 0)
+			return cli_dynerr(appctx, memprintf(&err, "Filter entry #%i: Require an operator among \"eq\", \"ne\", \"le\", \"ge\", \"lt\", \"gt\"\n", i + 1));
+
+		if (!*args[5+3*i] || strl2llrc(args[5+3*i], strlen(args[5+3*i]), &ctx->value[i]) != 0)
+			return cli_dynerr(appctx, memprintf(&err, "Filter entry #%i: Require a valid integer value to compare against\n", i + 1));
+	}
+
+	if (*args[3+3*i]) {
+		return cli_dynerr(appctx, memprintf(&err, "Detected extra data in filter, %ith word of input, after '%s'\n", 3+3*i + 1, args[2+3*i]));
+	}
+
+	/* OK we're done, all the fields are set */
+	return 0;
+}
+
+/* returns 0 if wants to be called again, 1 if has ended processing */
+static int cli_parse_table_req(char **args, char *payload, struct appctx *appctx, void *private)
+{
+	struct show_table_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx));
+	int i;
+
+	for (i = 0; i < STKTABLE_FILTER_LEN; i++)
+		ctx->data_type[i] = -1;
+	ctx->target = NULL;
+	ctx->entry = NULL;
+	ctx->action = (long)private; // keyword argument, one of STK_CLI_ACT_*
+
+	if (*args[2]) {
+		ctx->t = ctx->target = stktable_find_by_name(args[2]);
+		if (!ctx->target)
+			return cli_err(appctx, "No such table\n");
+	}
+	else {
+		ctx->t = stktables_list;
+		if (ctx->action != STK_CLI_ACT_SHOW)
+			goto err_args;
+		return 0;
+	}
+
+	if (strcmp(args[3], "key") == 0)
+		return table_process_entry_per_key(appctx, args);
+	else if (strncmp(args[3], "data.", 5) == 0)
+		return table_prepare_data_request(appctx, args);
+	else if (*args[3])
+		goto err_args;
+
+	return 0;
+
+err_args:
+	switch (ctx->action) {
+	case STK_CLI_ACT_SHOW:
+		return cli_err(appctx, "Optional argument only supports \"data.<store_data_type>\" <operator> <value> and key <key>\n");
+	case STK_CLI_ACT_CLR:
+		return cli_err(appctx, "Required arguments: <table> \"data.<store_data_type>\" <operator> <value> or <table> key <key>\n");
+	case STK_CLI_ACT_SET:
+		return cli_err(appctx, "Required arguments: <table> key <key> [data.<store_data_type> <value>]*\n");
+	default:
+		return cli_err(appctx, "Unknown action\n");
+	}
+}
+
+/* This function is used to deal with table operations (dump or clear depending
+ * on the action stored in appctx->private). It returns 0 if the output buffer is
+ * full and it needs to be called again, otherwise non-zero.
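+ *
+ * Illustrative CLI invocations routed here, not taken from this source
+ * (the table name is hypothetical):
+ *   show table mytable
+ *   clear table mytable data.conn_cnt eq 0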
+ */ +static int cli_io_handler_table(struct appctx *appctx) +{ + struct show_table_ctx *ctx = appctx->svcctx; + struct stconn *sc = appctx_sc(appctx); + struct stream *s = __sc_strm(sc); + struct ebmb_node *eb; + int skip_entry; + int show = ctx->action == STK_CLI_ACT_SHOW; + + /* + * We have 3 possible states in ctx->state : + * - STATE_NEXT : the proxy pointer points to the next table to + * dump, the entry pointer is NULL ; + * - STATE_DUMP : the proxy pointer points to the current table + * and the entry pointer points to the next entry to be dumped, + * and the refcount on the next entry is held ; + * - STATE_DONE : nothing left to dump, the buffer may contain some + * data though. + */ + /* FIXME: Don't watch the other side !*/ + if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE)) { + /* in case of abort, remove any refcount we might have set on an entry */ + if (ctx->state == STATE_DUMP) { + stksess_kill_if_expired(ctx->t, ctx->entry, 1); + } + return 1; + } + + chunk_reset(&trash); + + while (ctx->state != STATE_DONE) { + switch (ctx->state) { + case STATE_NEXT: + if (!ctx->t || + (ctx->target && + ctx->t != ctx->target)) { + ctx->state = STATE_DONE; + break; + } + + if (ctx->t->size) { + if (show && !table_dump_head_to_buffer(&trash, appctx, ctx->t, ctx->target)) + return 0; + + if (ctx->target && + (strm_li(s)->bind_conf->level & ACCESS_LVL_MASK) >= ACCESS_LVL_OPER) { + /* dump entries only if table explicitly requested */ + HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &ctx->t->lock); + eb = ebmb_first(&ctx->t->keys); + if (eb) { + ctx->entry = ebmb_entry(eb, struct stksess, key); + HA_ATOMIC_INC(&ctx->entry->ref_cnt); + ctx->state = STATE_DUMP; + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &ctx->t->lock); + break; + } + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &ctx->t->lock); + } + } + ctx->t = ctx->t->next; + break; + + case STATE_DUMP: + skip_entry = 0; + + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &ctx->entry->lock); + + if (ctx->data_type[0] >= 0) { + /* we're filtering on some data contents */ + void *ptr; + int dt, i; + signed char op; + long long data, value; + + + for (i = 0; i < STKTABLE_FILTER_LEN; i++) { + if (ctx->data_type[i] == -1) + break; + dt = ctx->data_type[i]; + ptr = stktable_data_ptr(ctx->t, + ctx->entry, + dt); + + data = 0; + switch (stktable_data_types[dt].std_type) { + case STD_T_SINT: + data = stktable_data_cast(ptr, std_t_sint); + break; + case STD_T_UINT: + data = stktable_data_cast(ptr, std_t_uint); + break; + case STD_T_ULL: + data = stktable_data_cast(ptr, std_t_ull); + break; + case STD_T_FRQP: + data = read_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + ctx->t->data_arg[dt].u); + break; + } + + op = ctx->data_op[i]; + value = ctx->value[i]; + + /* skip the entry if the data does not match the test and the value */ + if ((data < value && + (op == STD_OP_EQ || op == STD_OP_GT || op == STD_OP_GE)) || + (data == value && + (op == STD_OP_NE || op == STD_OP_GT || op == STD_OP_LT)) || + (data > value && + (op == STD_OP_EQ || op == STD_OP_LT || op == STD_OP_LE))) { + skip_entry = 1; + break; + } + } + } + + if (show && !skip_entry && + !table_dump_entry_to_buffer(&trash, appctx, ctx->t, ctx->entry)) { + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &ctx->entry->lock); + return 0; + } + + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &ctx->entry->lock); + + HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &ctx->t->lock); + HA_ATOMIC_DEC(&ctx->entry->ref_cnt); + + eb = ebmb_next(&ctx->entry->key); + if (eb) { + struct stksess *old = ctx->entry; + ctx->entry = ebmb_entry(eb, struct stksess, key); + if (show) + 
__stksess_kill_if_expired(ctx->t, old); + else if (!skip_entry && !ctx->entry->ref_cnt) + __stksess_kill(ctx->t, old); + HA_ATOMIC_INC(&ctx->entry->ref_cnt); + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &ctx->t->lock); + break; + } + + + if (show) + __stksess_kill_if_expired(ctx->t, ctx->entry); + else if (!skip_entry && !HA_ATOMIC_LOAD(&ctx->entry->ref_cnt)) + __stksess_kill(ctx->t, ctx->entry); + + HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &ctx->t->lock); + + ctx->t = ctx->t->next; + ctx->state = STATE_NEXT; + break; + + default: + break; + } + } + return 1; +} + +static void cli_release_show_table(struct appctx *appctx) +{ + struct show_table_ctx *ctx = appctx->svcctx; + + if (ctx->state == STATE_DUMP) { + stksess_kill_if_expired(ctx->t, ctx->entry, 1); + } +} + +static int stk_parse_stick_counters(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + char *error; + int counters; + + counters = strtol(args[1], &error, 10); + if (*error != 0) { + memprintf(err, "%s: '%s' is an invalid number", args[0], args[1]); + return -1; + } + + if (counters < 0) { + memprintf(err, "%s: the number of stick-counters may not be negative (was %d)", args[0], counters); + return -1; + } + + global.tune.nb_stk_ctr = counters; + return 0; +} + +/* This function creates the stk_ctr pools after the configuration parsing. It + * returns 0 on success otherwise ERR_*. If nb_stk_ctr is 0, the pool remains + * NULL. + */ +static int stkt_create_stk_ctr_pool(void) +{ + if (!global.tune.nb_stk_ctr) + return 0; + + pool_head_stk_ctr = create_pool("stk_ctr", sizeof(*((struct session*)0)->stkctr) * global.tune.nb_stk_ctr, MEM_F_SHARED); + if (!pool_head_stk_ctr) { + ha_alert("out of memory while creating the stick-counters pool.\n"); + return ERR_ABORT; + } + return 0; +} + +static void stkt_late_init(void) +{ + struct sample_fetch *f; + + f = find_sample_fetch("src", strlen("src")); + if (f) + smp_fetch_src = f->process; + hap_register_post_check(stkt_create_stk_ctr_pool); +} + +INITCALL0(STG_INIT, stkt_late_init); + +/* register cli keywords */ +static struct cli_kw_list cli_kws = {{ },{ + { { "clear", "table", NULL }, "clear table <table> [<filter>]* : remove an entry from a table (filter: data/key)", cli_parse_table_req, cli_io_handler_table, cli_release_show_table, (void *)STK_CLI_ACT_CLR }, + { { "set", "table", NULL }, "set table <table> key <k> [data.* <v>]* : update or create a table entry's data", cli_parse_table_req, cli_io_handler_table, NULL, (void *)STK_CLI_ACT_SET }, + { { "show", "table", NULL }, "show table <table> [<filter>]* : report table usage stats or dump this table's contents (filter: data/key)", cli_parse_table_req, cli_io_handler_table, cli_release_show_table, (void *)STK_CLI_ACT_SHOW }, + {{},} +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + +static struct action_kw_list tcp_conn_kws = { { }, { + { "sc-add-gpc", parse_add_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc0", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc1", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-set-gpt", parse_set_gpt, KWF_MATCH_PREFIX }, + { "sc-set-gpt0", parse_set_gpt, KWF_MATCH_PREFIX }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, tcp_req_conn_keywords_register, &tcp_conn_kws); + +static struct action_kw_list tcp_sess_kws = { { }, { + { "sc-add-gpc", parse_add_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc0", parse_inc_gpc, KWF_MATCH_PREFIX }, + { 
"sc-inc-gpc1", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-set-gpt", parse_set_gpt, KWF_MATCH_PREFIX }, + { "sc-set-gpt0", parse_set_gpt, KWF_MATCH_PREFIX }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, tcp_req_sess_keywords_register, &tcp_sess_kws); + +static struct action_kw_list tcp_req_kws = { { }, { + { "sc-add-gpc", parse_add_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc0", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc1", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-set-gpt", parse_set_gpt, KWF_MATCH_PREFIX }, + { "sc-set-gpt0", parse_set_gpt, KWF_MATCH_PREFIX }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, tcp_req_cont_keywords_register, &tcp_req_kws); + +static struct action_kw_list tcp_res_kws = { { }, { + { "sc-add-gpc", parse_add_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc0", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc1", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-set-gpt", parse_set_gpt, KWF_MATCH_PREFIX }, + { "sc-set-gpt0", parse_set_gpt, KWF_MATCH_PREFIX }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, tcp_res_cont_keywords_register, &tcp_res_kws); + +static struct action_kw_list http_req_kws = { { }, { + { "sc-add-gpc", parse_add_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc0", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc1", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-set-gpt", parse_set_gpt, KWF_MATCH_PREFIX }, + { "sc-set-gpt0", parse_set_gpt, KWF_MATCH_PREFIX }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, http_req_keywords_register, &http_req_kws); + +static struct action_kw_list http_res_kws = { { }, { + { "sc-add-gpc", parse_add_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc0", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc1", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-set-gpt", parse_set_gpt, KWF_MATCH_PREFIX }, + { "sc-set-gpt0", parse_set_gpt, KWF_MATCH_PREFIX }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, http_res_keywords_register, &http_res_kws); + +static struct action_kw_list http_after_res_kws = { { }, { + { "sc-add-gpc", parse_add_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc0", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-inc-gpc1", parse_inc_gpc, KWF_MATCH_PREFIX }, + { "sc-set-gpt", parse_set_gpt, KWF_MATCH_PREFIX }, + { "sc-set-gpt0", parse_set_gpt, KWF_MATCH_PREFIX }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, http_after_res_keywords_register, &http_after_res_kws); + +/* Note: must not be declared <const> as its list will be overwritten. + * Please take care of keeping this list alphabetically sorted. 
+ */ +static struct sample_fetch_kw_list smp_fetch_keywords = {ILH, { + { "sc_bytes_in_rate", smp_fetch_sc_bytes_in_rate, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_bytes_out_rate", smp_fetch_sc_bytes_out_rate, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_clr_gpc", smp_fetch_sc_clr_gpc, ARG3(2,SINT,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_clr_gpc0", smp_fetch_sc_clr_gpc0, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_clr_gpc1", smp_fetch_sc_clr_gpc1, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN }, + { "sc_conn_cnt", smp_fetch_sc_conn_cnt, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_conn_cur", smp_fetch_sc_conn_cur, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_conn_rate", smp_fetch_sc_conn_rate, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_get_gpt", smp_fetch_sc_get_gpt, ARG3(2,SINT,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_get_gpt0", smp_fetch_sc_get_gpt0, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_get_gpc", smp_fetch_sc_get_gpc, ARG3(2,SINT,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_get_gpc0", smp_fetch_sc_get_gpc0, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_get_gpc1", smp_fetch_sc_get_gpc1, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN }, + { "sc_gpc_rate", smp_fetch_sc_gpc_rate, ARG3(2,SINT,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_gpc0_rate", smp_fetch_sc_gpc0_rate, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_gpc1_rate", smp_fetch_sc_gpc1_rate, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_http_err_cnt", smp_fetch_sc_http_err_cnt, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_http_err_rate", smp_fetch_sc_http_err_rate, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_http_fail_cnt", smp_fetch_sc_http_fail_cnt, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_http_fail_rate", smp_fetch_sc_http_fail_rate, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_http_req_cnt", smp_fetch_sc_http_req_cnt, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_http_req_rate", smp_fetch_sc_http_req_rate, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_inc_gpc", smp_fetch_sc_inc_gpc, ARG3(2,SINT,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_inc_gpc0", smp_fetch_sc_inc_gpc0, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_inc_gpc1", smp_fetch_sc_inc_gpc1, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_kbytes_in", smp_fetch_sc_kbytes_in, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "sc_kbytes_out", smp_fetch_sc_kbytes_out, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "sc_sess_cnt", smp_fetch_sc_sess_cnt, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_sess_rate", smp_fetch_sc_sess_rate, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc_tracked", smp_fetch_sc_tracked, ARG2(1,SINT,TAB), NULL, SMP_T_BOOL, SMP_USE_INTRN, }, + { "sc_trackers", smp_fetch_sc_trackers, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_bytes_in_rate", smp_fetch_sc_bytes_in_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_bytes_out_rate", smp_fetch_sc_bytes_out_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_clr_gpc0", smp_fetch_sc_clr_gpc0, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_clr_gpc1", smp_fetch_sc_clr_gpc1, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { 
"sc0_conn_cnt", smp_fetch_sc_conn_cnt, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_conn_cur", smp_fetch_sc_conn_cur, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_conn_rate", smp_fetch_sc_conn_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_get_gpt0", smp_fetch_sc_get_gpt0, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_get_gpc0", smp_fetch_sc_get_gpc0, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_get_gpc1", smp_fetch_sc_get_gpc1, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_gpc0_rate", smp_fetch_sc_gpc0_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_gpc1_rate", smp_fetch_sc_gpc1_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_http_err_cnt", smp_fetch_sc_http_err_cnt, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_http_err_rate", smp_fetch_sc_http_err_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_http_fail_cnt", smp_fetch_sc_http_fail_cnt, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_http_fail_rate", smp_fetch_sc_http_fail_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_http_req_cnt", smp_fetch_sc_http_req_cnt, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_http_req_rate", smp_fetch_sc_http_req_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_inc_gpc0", smp_fetch_sc_inc_gpc0, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_inc_gpc1", smp_fetch_sc_inc_gpc1, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_kbytes_in", smp_fetch_sc_kbytes_in, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "sc0_kbytes_out", smp_fetch_sc_kbytes_out, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "sc0_sess_cnt", smp_fetch_sc_sess_cnt, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_sess_rate", smp_fetch_sc_sess_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc0_tracked", smp_fetch_sc_tracked, ARG1(0,TAB), NULL, SMP_T_BOOL, SMP_USE_INTRN, }, + { "sc0_trackers", smp_fetch_sc_trackers, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_bytes_in_rate", smp_fetch_sc_bytes_in_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_bytes_out_rate", smp_fetch_sc_bytes_out_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_clr_gpc", smp_fetch_sc_clr_gpc, ARG2(1,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_clr_gpc0", smp_fetch_sc_clr_gpc0, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_clr_gpc1", smp_fetch_sc_clr_gpc1, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_conn_cnt", smp_fetch_sc_conn_cnt, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_conn_cur", smp_fetch_sc_conn_cur, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_conn_rate", smp_fetch_sc_conn_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_get_gpt0", smp_fetch_sc_get_gpt0, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_get_gpc0", smp_fetch_sc_get_gpc0, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_get_gpc1", smp_fetch_sc_get_gpc1, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_gpc0_rate", smp_fetch_sc_gpc0_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_gpc1_rate", smp_fetch_sc_gpc1_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_http_err_cnt", smp_fetch_sc_http_err_cnt, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_http_err_rate", smp_fetch_sc_http_err_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_http_fail_cnt", 
smp_fetch_sc_http_fail_cnt, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_http_fail_rate", smp_fetch_sc_http_fail_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_http_req_cnt", smp_fetch_sc_http_req_cnt, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_http_req_rate", smp_fetch_sc_http_req_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_inc_gpc0", smp_fetch_sc_inc_gpc0, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_inc_gpc1", smp_fetch_sc_inc_gpc1, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_kbytes_in", smp_fetch_sc_kbytes_in, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "sc1_kbytes_out", smp_fetch_sc_kbytes_out, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "sc1_sess_cnt", smp_fetch_sc_sess_cnt, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_sess_rate", smp_fetch_sc_sess_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc1_tracked", smp_fetch_sc_tracked, ARG1(0,TAB), NULL, SMP_T_BOOL, SMP_USE_INTRN, }, + { "sc1_trackers", smp_fetch_sc_trackers, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_bytes_in_rate", smp_fetch_sc_bytes_in_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_bytes_out_rate", smp_fetch_sc_bytes_out_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_clr_gpc0", smp_fetch_sc_clr_gpc0, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_clr_gpc1", smp_fetch_sc_clr_gpc1, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_conn_cnt", smp_fetch_sc_conn_cnt, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_conn_cur", smp_fetch_sc_conn_cur, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_conn_rate", smp_fetch_sc_conn_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_get_gpt0", smp_fetch_sc_get_gpt0, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_get_gpc0", smp_fetch_sc_get_gpc0, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_get_gpc1", smp_fetch_sc_get_gpc1, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_gpc0_rate", smp_fetch_sc_gpc0_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_gpc1_rate", smp_fetch_sc_gpc1_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_http_err_cnt", smp_fetch_sc_http_err_cnt, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_http_err_rate", smp_fetch_sc_http_err_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_http_fail_cnt", smp_fetch_sc_http_fail_cnt, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_http_fail_rate", smp_fetch_sc_http_fail_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_http_req_cnt", smp_fetch_sc_http_req_cnt, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_http_req_rate", smp_fetch_sc_http_req_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_inc_gpc0", smp_fetch_sc_inc_gpc0, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_inc_gpc1", smp_fetch_sc_inc_gpc1, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_kbytes_in", smp_fetch_sc_kbytes_in, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "sc2_kbytes_out", smp_fetch_sc_kbytes_out, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "sc2_sess_cnt", smp_fetch_sc_sess_cnt, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_sess_rate", smp_fetch_sc_sess_rate, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "sc2_tracked", smp_fetch_sc_tracked, ARG1(0,TAB), NULL, SMP_T_BOOL, SMP_USE_INTRN, }, + { "sc2_trackers", 
smp_fetch_sc_trackers, ARG1(0,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "src_bytes_in_rate", smp_fetch_sc_bytes_in_rate, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_bytes_out_rate", smp_fetch_sc_bytes_out_rate, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_clr_gpc", smp_fetch_sc_clr_gpc, ARG2(2,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_clr_gpc0", smp_fetch_sc_clr_gpc0, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_clr_gpc1", smp_fetch_sc_clr_gpc1, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_conn_cnt", smp_fetch_sc_conn_cnt, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_conn_cur", smp_fetch_sc_conn_cur, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_conn_rate", smp_fetch_sc_conn_rate, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_get_gpt" , smp_fetch_sc_get_gpt, ARG2(2,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_get_gpt0", smp_fetch_sc_get_gpt0, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_get_gpc", smp_fetch_sc_get_gpc, ARG2(2,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_get_gpc0", smp_fetch_sc_get_gpc0, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_get_gpc1", smp_fetch_sc_get_gpc1, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_gpc_rate", smp_fetch_sc_gpc_rate, ARG2(2,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_gpc0_rate", smp_fetch_sc_gpc0_rate, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_gpc1_rate", smp_fetch_sc_gpc1_rate, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_http_err_cnt", smp_fetch_sc_http_err_cnt, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_http_err_rate", smp_fetch_sc_http_err_rate, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_http_fail_cnt", smp_fetch_sc_http_fail_cnt, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_http_fail_rate", smp_fetch_sc_http_fail_rate, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_http_req_cnt", smp_fetch_sc_http_req_cnt, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_http_req_rate", smp_fetch_sc_http_req_rate, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_inc_gpc", smp_fetch_sc_inc_gpc, ARG2(2,SINT,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_inc_gpc0", smp_fetch_sc_inc_gpc0, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_inc_gpc1", smp_fetch_sc_inc_gpc1, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_kbytes_in", smp_fetch_sc_kbytes_in, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_kbytes_out", smp_fetch_sc_kbytes_out, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_sess_cnt", smp_fetch_sc_sess_cnt, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_sess_rate", smp_fetch_sc_sess_rate, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "src_updt_conn_cnt", smp_fetch_src_updt_conn_cnt, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_L4CLI, }, + { "table_avl", smp_fetch_table_avl, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "table_cnt", smp_fetch_table_cnt, ARG1(1,TAB), NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &smp_fetch_keywords); + +/* Note: must not be declared <const> as its list will be overwritten */ +static struct sample_conv_kw_list sample_conv_kws = {ILH, { + { "in_table", sample_conv_in_table, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_BOOL }, + { "table_bytes_in_rate", sample_conv_table_bytes_in_rate, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { 
"table_bytes_out_rate", sample_conv_table_bytes_out_rate, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_conn_cnt", sample_conv_table_conn_cnt, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_conn_cur", sample_conv_table_conn_cur, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_conn_rate", sample_conv_table_conn_rate, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_expire", sample_conv_table_expire, ARG2(1,TAB,SINT), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_gpt", sample_conv_table_gpt, ARG2(2,SINT,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_gpt0", sample_conv_table_gpt0, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_gpc", sample_conv_table_gpc, ARG2(2,SINT,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_gpc0", sample_conv_table_gpc0, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_gpc1", sample_conv_table_gpc1, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_gpc_rate", sample_conv_table_gpc_rate, ARG2(2,SINT,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_gpc0_rate", sample_conv_table_gpc0_rate, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_gpc1_rate", sample_conv_table_gpc1_rate, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_http_err_cnt", sample_conv_table_http_err_cnt, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_http_err_rate", sample_conv_table_http_err_rate, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_http_fail_cnt", sample_conv_table_http_fail_cnt, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_http_fail_rate", sample_conv_table_http_fail_rate, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_http_req_cnt", sample_conv_table_http_req_cnt, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_http_req_rate", sample_conv_table_http_req_rate, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_idle", sample_conv_table_idle, ARG2(1,TAB,SINT), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_kbytes_in", sample_conv_table_kbytes_in, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_kbytes_out", sample_conv_table_kbytes_out, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_server_id", sample_conv_table_server_id, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_sess_cnt", sample_conv_table_sess_cnt, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_sess_rate", sample_conv_table_sess_rate, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { "table_trackers", sample_conv_table_trackers, ARG1(1,TAB), NULL, SMP_T_ANY, SMP_T_SINT }, + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, sample_register_convs, &sample_conv_kws); + +static struct cfg_kw_list cfg_kws = {{ },{ + { CFG_GLOBAL, "tune.stick-counters", stk_parse_stick_counters }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); diff --git a/src/stream.c b/src/stream.c new file mode 100644 index 0000000..a3c0c93 --- /dev/null +++ b/src/stream.c @@ -0,0 +1,4045 @@ +/* + * Stream management functions. + * + * Copyright 2000-2012 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <stdlib.h> +#include <unistd.h> + +#include <import/ebistree.h> + +#include <haproxy/acl.h> +#include <haproxy/action.h> +#include <haproxy/activity.h> +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/arg.h> +#include <haproxy/backend.h> +#include <haproxy/capture.h> +#include <haproxy/cfgparse.h> +#include <haproxy/channel.h> +#include <haproxy/check.h> +#include <haproxy/cli.h> +#include <haproxy/connection.h> +#include <haproxy/dict.h> +#include <haproxy/dynbuf.h> +#include <haproxy/fd.h> +#include <haproxy/filters.h> +#include <haproxy/freq_ctr.h> +#include <haproxy/frontend.h> +#include <haproxy/global.h> +#include <haproxy/hlua.h> +#include <haproxy/http_ana.h> +#include <haproxy/http_rules.h> +#include <haproxy/htx.h> +#include <haproxy/istbuf.h> +#include <haproxy/log.h> +#include <haproxy/pipe.h> +#include <haproxy/pool.h> +#include <haproxy/proxy.h> +#include <haproxy/queue.h> +#include <haproxy/sc_strm.h> +#include <haproxy/server.h> +#include <haproxy/resolvers.h> +#include <haproxy/sample.h> +#include <haproxy/session.h> +#include <haproxy/stats-t.h> +#include <haproxy/stconn.h> +#include <haproxy/stick_table.h> +#include <haproxy/stream.h> +#include <haproxy/task.h> +#include <haproxy/tcp_rules.h> +#include <haproxy/thread.h> +#include <haproxy/tools.h> +#include <haproxy/trace.h> +#include <haproxy/vars.h> + + +DECLARE_POOL(pool_head_stream, "stream", sizeof(struct stream)); +DECLARE_POOL(pool_head_uniqueid, "uniqueid", UNIQUEID_LEN); + +/* incremented by each "show sess" to fix a delimiter between streams */ +unsigned stream_epoch = 0; + +/* List of all use-service keywords. */ +static struct list service_keywords = LIST_HEAD_INIT(service_keywords); + + +/* trace source and events */ +static void strm_trace(enum trace_level level, uint64_t mask, + const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4); + +/* The event representation is split like this: + * strm - stream + * sc - stream connector + * http - http analysis + * tcp - tcp analysis + * + * STRM_EV_* macros are defined in <proto/stream.h> + */ +static const struct trace_event strm_trace_events[] = { + { .mask = STRM_EV_STRM_NEW, .name = "strm_new", .desc = "new stream" }, + { .mask = STRM_EV_STRM_FREE, .name = "strm_free", .desc = "release stream" }, + { .mask = STRM_EV_STRM_ERR, .name = "strm_err", .desc = "error during stream processing" }, + { .mask = STRM_EV_STRM_ANA, .name = "strm_ana", .desc = "stream analyzers" }, + { .mask = STRM_EV_STRM_PROC, .name = "strm_proc", .desc = "stream processing" }, + + { .mask = STRM_EV_CS_ST, .name = "sc_state", .desc = "processing connector states" }, + + { .mask = STRM_EV_HTTP_ANA, .name = "http_ana", .desc = "HTTP analyzers" }, + { .mask = STRM_EV_HTTP_ERR, .name = "http_err", .desc = "error during HTTP analysis" }, + + { .mask = STRM_EV_TCP_ANA, .name = "tcp_ana", .desc = "TCP analyzers" }, + { .mask = STRM_EV_TCP_ERR, .name = "tcp_err", .desc = "error during TCP analysis" }, + + { .mask = STRM_EV_FLT_ANA, .name = "flt_ana", .desc = "Filter analyzers" }, + { .mask = STRM_EV_FLT_ERR, .name = "flt_err", .desc = "error during filter analysis" }, + {} +}; + +static const struct name_desc strm_trace_lockon_args[4] = { + /* arg1 */ { /* already used by the stream */ }, + /* arg2 */ { }, + /* arg3 */ { }, + /* arg4 */ { } +}; + +static const struct name_desc strm_trace_decoding[] = { +#define STRM_VERB_CLEAN 1 + { .name="clean", .desc="only
user-friendly stuff, generally suitable for level \"user\"" }, +#define STRM_VERB_MINIMAL 2 + { .name="minimal", .desc="report info on streams and connectors" }, +#define STRM_VERB_SIMPLE 3 + { .name="simple", .desc="add info on request and response channels" }, +#define STRM_VERB_ADVANCED 4 + { .name="advanced", .desc="add info on channel's buffer for data and developer levels only" }, +#define STRM_VERB_COMPLETE 5 + { .name="complete", .desc="add info on channel's buffer" }, + { /* end */ } +}; + +struct trace_source trace_strm = { + .name = IST("stream"), + .desc = "Applicative stream", + .arg_def = TRC_ARG1_STRM, // TRACE()'s first argument is always a stream + .default_cb = strm_trace, + .known_events = strm_trace_events, + .lockon_args = strm_trace_lockon_args, + .decoding = strm_trace_decoding, + .report_events = ~0, // report everything by default +}; + +#define TRACE_SOURCE &trace_strm +INITCALL1(STG_REGISTER, trace_register_source, TRACE_SOURCE); + +/* the stream traces always expect that arg1, if non-null, is of a stream (from + * which we can derive everything), that arg2, if non-null, is an http + * transaction, that arg3, if non-null, is an http message. + */ +static void strm_trace(enum trace_level level, uint64_t mask, const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4) +{ + const struct stream *s = a1; + const struct http_txn *txn = a2; + const struct http_msg *msg = a3; + struct task *task; + const struct channel *req, *res; + struct htx *htx; + + if (!s || src->verbosity < STRM_VERB_CLEAN) + return; + + task = s->task; + req = &s->req; + res = &s->res; + htx = (msg ? htxbuf(&msg->chn->buf) : NULL); + + /* General info about the stream (htx/tcp, id...) */ + chunk_appendf(&trace_buf, " : [%u,%s]", + s->uniq_id, ((s->flags & SF_HTX) ? "HTX" : "TCP")); + if (isttest(s->unique_id)) { + chunk_appendf(&trace_buf, " id="); + b_putist(&trace_buf, s->unique_id); + } + + /* Front and back stream connector state */ + chunk_appendf(&trace_buf, " SC=(%s,%s)", + sc_state_str(s->scf->state), sc_state_str(s->scb->state)); + + /* If txn is defined, HTTP req/rep states */ + if (txn) + chunk_appendf(&trace_buf, " HTTP=(%s,%s)", + h1_msg_state_str(txn->req.msg_state), h1_msg_state_str(txn->rsp.msg_state)); + if (msg) + chunk_appendf(&trace_buf, " %s", ((msg->chn->flags & CF_ISRESP) ? "RESPONSE" : "REQUEST")); + + if (src->verbosity == STRM_VERB_CLEAN) + return; + + /* If msg defined, display status-line if possible (verbosity > MINIMAL) */ + if (src->verbosity > STRM_VERB_MINIMAL && htx && htx_nbblks(htx)) { + const struct htx_blk *blk = __htx_get_head_blk(htx); + const struct htx_sl *sl = htx_get_blk_ptr(htx, blk); + enum htx_blk_type type = htx_get_blk_type(blk); + + if (type == HTX_BLK_REQ_SL || type == HTX_BLK_RES_SL) + chunk_appendf(&trace_buf, " - \"%.*s %.*s %.*s\"", + HTX_SL_P1_LEN(sl), HTX_SL_P1_PTR(sl), + HTX_SL_P2_LEN(sl), HTX_SL_P2_PTR(sl), + HTX_SL_P3_LEN(sl), HTX_SL_P3_PTR(sl)); + } + + chunk_appendf(&trace_buf, " - t=%p t.exp=%d s=(%p,0x%08x,0x%x)", + task, tick_isset(task->expire) ? TICKS_TO_MS(task->expire - now_ms) : TICK_ETERNITY, s, s->flags, s->conn_err_type); + + /* If txn defined info about HTTP msgs, otherwise info about SI. 
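+ * ("SI" is the historical name of the stream interface, which the stream + * connectors scf/scb dumped below replaced.)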
*/ + if (txn) { + chunk_appendf(&trace_buf, " txn.flags=0x%08x, http.flags=(0x%08x,0x%08x) status=%d", + txn->flags, txn->req.flags, txn->rsp.flags, txn->status); + } + else { + chunk_appendf(&trace_buf, " scf=(%p,%d,0x%08x,0x%x) scb=(%p,%d,0x%08x,0x%x) scf.exp(r,w)=(%d,%d) scb.exp(r,w)=(%d,%d) retries=%d", + s->scf, s->scf->state, s->scf->flags, s->scf->sedesc->flags, + s->scb, s->scb->state, s->scb->flags, s->scb->sedesc->flags, + tick_isset(sc_ep_rcv_ex(s->scf)) ? TICKS_TO_MS(sc_ep_rcv_ex(s->scf) - now_ms) : TICK_ETERNITY, + tick_isset(sc_ep_snd_ex(s->scf)) ? TICKS_TO_MS(sc_ep_snd_ex(s->scf) - now_ms) : TICK_ETERNITY, + tick_isset(sc_ep_rcv_ex(s->scb)) ? TICKS_TO_MS(sc_ep_rcv_ex(s->scb) - now_ms) : TICK_ETERNITY, + tick_isset(sc_ep_snd_ex(s->scb)) ? TICKS_TO_MS(sc_ep_snd_ex(s->scb) - now_ms) : TICK_ETERNITY, + s->conn_retries); + } + + if (src->verbosity == STRM_VERB_MINIMAL) + return; + + + /* If txn defined, don't display all channel info */ + if (src->verbosity == STRM_VERB_SIMPLE || txn) { + chunk_appendf(&trace_buf, " req=(%p .fl=0x%08x .exp=%d)", + req, req->flags, tick_isset(req->analyse_exp) ? TICKS_TO_MS(req->analyse_exp - now_ms) : TICK_ETERNITY); + chunk_appendf(&trace_buf, " res=(%p .fl=0x%08x .exp=%d)", + res, res->flags, tick_isset(res->analyse_exp) ? TICKS_TO_MS(res->analyse_exp - now_ms) : TICK_ETERNITY); + } + else { + chunk_appendf(&trace_buf, " req=(%p .fl=0x%08x .ana=0x%08x .exp=%u .o=%lu .tot=%llu .to_fwd=%u)", + req, req->flags, req->analysers, req->analyse_exp, + (long)req->output, req->total, req->to_forward); + chunk_appendf(&trace_buf, " res=(%p .fl=0x%08x .ana=0x%08x .exp=%u .o=%lu .tot=%llu .to_fwd=%u)", + res, res->flags, res->analysers, res->analyse_exp, + (long)res->output, res->total, res->to_forward); + } + + if (src->verbosity == STRM_VERB_SIMPLE || + (src->verbosity == STRM_VERB_ADVANCED && src->level < TRACE_LEVEL_DATA)) + return; + + /* channels' buffer info */ + if (s->flags & SF_HTX) { + struct htx *rqhtx = htxbuf(&req->buf); + struct htx *rphtx = htxbuf(&res->buf); + + chunk_appendf(&trace_buf, " htx=(%u/%u#%u, %u/%u#%u)", + rqhtx->data, rqhtx->size, htx_nbblks(rqhtx), + rphtx->data, rphtx->size, htx_nbblks(rphtx)); + } + else { + chunk_appendf(&trace_buf, " buf=(%u@%p+%u/%u, %u@%p+%u/%u)", + (unsigned int)b_data(&req->buf), b_orig(&req->buf), + (unsigned int)b_head_ofs(&req->buf), (unsigned int)b_size(&req->buf), + (unsigned int)b_data(&res->buf), b_orig(&res->buf), + (unsigned int)b_head_ofs(&res->buf), (unsigned int)b_size(&res->buf)); + } + + /* If msg defined, display htx info if defined (level > USER) */ + if (src->level > TRACE_LEVEL_USER && htx && htx_nbblks(htx)) { + int full = 0; + + /* Full htx info (level > STATE && verbosity > SIMPLE) */ + if (src->level > TRACE_LEVEL_STATE) { + if (src->verbosity == STRM_VERB_COMPLETE) + full = 1; + } + + chunk_memcat(&trace_buf, "\n\t", 2); + htx_dump(&trace_buf, htx, full); + } +} + +/* Upgrade an existing stream for stream connector <sc>. Return < 0 on error. This + * is only valid right after a TCP to H1 upgrade. The stream should be + * "reactivated" by removing the SF_IGNORE flag. And the right mode must be set. On + * success, <input> buffer is transferred to the stream and thus points to + * BUF_NULL. On error, it is unchanged and it is the caller's responsibility to + * release it (this never happens for now).
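+ * (In practice such an upgrade is typically requested by a "tcp-request + * content switch-mode http" rule; the actual call sites live in the mux + * code, outside this file.)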
+ */ +int stream_upgrade_from_sc(struct stconn *sc, struct buffer *input) +{ + struct stream *s = __sc_strm(sc); + const struct mux_ops *mux = sc_mux_ops(sc); + + if (mux) { + if (mux->flags & MX_FL_HTX) + s->flags |= SF_HTX; + } + + if (!b_is_null(input)) { + /* Xfer the input buffer to the request channel. <input> will + * then point to BUF_NULL. From this point, it is the stream's + * responsibility to release it. + */ + s->req.buf = *input; + *input = BUF_NULL; + s->req.total = (IS_HTX_STRM(s) ? htxbuf(&s->req.buf)->data : b_data(&s->req.buf)); + sc_ep_report_read_activity(s->scf); + } + + s->req.flags |= CF_READ_EVENT; /* Always report a read event */ + s->flags &= ~SF_IGNORE; + + task_wakeup(s->task, TASK_WOKEN_INIT); + return 0; +} + +/* Callback used to wake up a stream when an input buffer is available. The + * stream <s>'s stream connectors are checked for a failed buffer allocation + * as indicated by the presence of the SC_FL_NEED_BUFF flag and the lack of a + * buffer, and an input buffer is assigned there (at most one). The function + * returns 1 and wakes the stream up if a buffer was taken, otherwise zero. + * It's designed to be called from __offer_buffer(). + */ +int stream_buf_available(void *arg) +{ + struct stream *s = arg; + + if (!s->req.buf.size && !sc_ep_have_ff_data(s->scb) && s->scf->flags & SC_FL_NEED_BUFF && + b_alloc(&s->req.buf)) + sc_have_buff(s->scf); + else if (!s->res.buf.size && !sc_ep_have_ff_data(s->scf) && s->scb->flags & SC_FL_NEED_BUFF && + b_alloc(&s->res.buf)) + sc_have_buff(s->scb); + else + return 0; + + task_wakeup(s->task, TASK_WOKEN_RES); + return 1; + +} + +/* This function is called from the session handler which detects the end of + * handshake, in order to complete initialization of a valid stream. It must be + * called with a completely initialized session. It returns the pointer to + * the newly created stream, or NULL in case of fatal error. The client-facing + * end point is assigned to <origin>, which must be valid. The stream's task + * is configured with a nice value inherited from the listener's nice if any. + * The task's context is set to the new stream, and its function is set to + * process_stream(). Target and analysers are null. <input> is used as input + * buffer for the request channel and may contain data. On success, it is + * transferred to the stream and <input> is set to BUF_NULL. On error, <input> + * buffer is unchanged and it is the caller's responsibility to release it. + */ +struct stream *stream_new(struct session *sess, struct stconn *sc, struct buffer *input) +{ + struct stream *s; + struct task *t; + + DBG_TRACE_ENTER(STRM_EV_STRM_NEW); + if (unlikely((s = pool_alloc(pool_head_stream)) == NULL)) + goto out_fail_alloc; + + /* minimum stream initialization required for an embryonic stream is + * fairly low. We need very little to execute L4 ACLs, then we need a + * task to make the client-side connection live on its own.
+ * - flags + * - stick-entry tracking + */ + s->flags = 0; + s->logs.logwait = sess->fe->to_log; + s->logs.level = 0; + s->logs.request_ts = 0; + s->logs.t_queue = -1; + s->logs.t_connect = -1; + s->logs.t_data = -1; + s->logs.t_close = 0; + s->logs.bytes_in = s->logs.bytes_out = 0; + s->logs.prx_queue_pos = 0; /* we get the number of pending conns before us */ + s->logs.srv_queue_pos = 0; /* we will get this number soon */ + s->obj_type = OBJ_TYPE_STREAM; + + s->logs.accept_date = sess->accept_date; + s->logs.accept_ts = sess->accept_ts; + s->logs.t_handshake = sess->t_handshake; + s->logs.t_idle = sess->t_idle; + + /* default logging function */ + s->do_log = strm_log; + + /* default error reporting function, may be changed by analysers */ + s->srv_error = default_srv_error; + + /* Initialise the current rule list pointer to NULL. We are sure that + * any rulelist matches the NULL pointer. + */ + s->current_rule_list = NULL; + s->current_rule = NULL; + s->rules_exp = TICK_ETERNITY; + s->last_rule_file = NULL; + s->last_rule_line = 0; + + s->stkctr = NULL; + if (pool_head_stk_ctr) { + s->stkctr = pool_alloc(pool_head_stk_ctr); + if (!s->stkctr) + goto out_fail_alloc; + + /* Copy SC counters for the stream. We don't touch refcounts because + * any reference we have is inherited from the session. Since the stream + * doesn't exist without the session, the session's existence guarantees + * we don't lose the entry. During the store operation, the stream won't + * touch these ones. + */ + memcpy(s->stkctr, sess->stkctr, sizeof(s->stkctr[0]) * global.tune.nb_stk_ctr); + } + + s->sess = sess; + + s->stream_epoch = _HA_ATOMIC_LOAD(&stream_epoch); + s->uniq_id = _HA_ATOMIC_FETCH_ADD(&global.req_count, 1); + + /* OK, we're keeping the stream, so let's properly initialize the stream */ + LIST_INIT(&s->back_refs); + + LIST_INIT(&s->buffer_wait.list); + s->buffer_wait.target = s; + s->buffer_wait.wakeup_cb = stream_buf_available; + + s->lat_time = s->cpu_time = 0; + s->call_rate.curr_tick = s->call_rate.curr_ctr = s->call_rate.prev_ctr = 0; + s->pcli_next_pid = 0; + s->pcli_flags = 0; + s->unique_id = IST_NULL; + + if ((t = task_new_here()) == NULL) + goto out_fail_alloc; + + s->task = t; + s->pending_events = 0; + s->conn_retries = 0; + s->conn_exp = TICK_ETERNITY; + s->conn_err_type = STRM_ET_NONE; + s->prev_conn_state = SC_ST_INI; + t->process = process_stream; + t->context = s; + t->expire = TICK_ETERNITY; + if (sess->listener) + t->nice = sess->listener->bind_conf->nice; + + /* Note: initially, the stream's backend points to the frontend. + * This changes later when switching rules are executed or + * when the default backend is assigned. + */ + s->be = sess->fe; + s->req_cap = NULL; + s->res_cap = NULL; + + /* Initialize all the variables contexts even if not used. + * This permits pruning these contexts without errors. + * + * We need to make sure that those lists are not re-initialized + * by stream-dependent underlying code because we could lose + * track of already defined variables, leading to data inconsistency + * and memory leaks... + * + * For reference: we had a very old bug caused by vars_txn and + * vars_reqres being accidentally re-initialized in http_create_txn() + * (https://github.com/haproxy/haproxy/issues/1935) + */ + vars_init_head(&s->vars_txn, SCOPE_TXN); + vars_init_head(&s->vars_reqres, SCOPE_REQ); + + /* Set SF_HTX flag for HTTP frontends.
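+ * HTX is the internal structured representation of HTTP messages shared by + * all HTTP muxes; SF_HTX makes the rest of the stream code treat the channel + * buffers as HTX contents rather than raw bytes.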
*/ + if (sess->fe->mode == PR_MODE_HTTP) + s->flags |= SF_HTX; + + s->scf = sc; + if (sc_attach_strm(s->scf, s) < 0) + goto out_fail_attach_scf; + + s->scb = sc_new_from_strm(s, SC_FL_ISBACK); + if (!s->scb) + goto out_fail_alloc_scb; + + sc_set_state(s->scf, SC_ST_EST); + + if (likely(sess->fe->options2 & PR_O2_INDEPSTR)) + s->scf->flags |= SC_FL_INDEP_STR; + + if (likely(sess->fe->options2 & PR_O2_INDEPSTR)) + s->scb->flags |= SC_FL_INDEP_STR; + + if (sc_ep_test(sc, SE_FL_WEBSOCKET)) + s->flags |= SF_WEBSOCKET; + if (sc_conn(sc)) { + const struct mux_ops *mux = sc_mux_ops(sc); + + if (mux && mux->flags & MX_FL_HTX) + s->flags |= SF_HTX; + } + + stream_init_srv_conn(s); + s->target = sess->fe->default_target; + + s->pend_pos = NULL; + s->priority_class = 0; + s->priority_offset = 0; + + /* init store persistence */ + s->store_count = 0; + + channel_init(&s->req); + s->req.flags |= CF_READ_EVENT; /* the producer is already connected */ + s->req.analysers = sess->listener ? sess->listener->bind_conf->analysers : sess->fe->fe_req_ana; + + if (IS_HTX_STRM(s)) { + /* Be sure to have HTTP analysers because in case of + * "destructive" stream upgrade, they may be missing (e.g + * TCP>H2) + */ + s->req.analysers |= AN_REQ_WAIT_HTTP|AN_REQ_HTTP_PROCESS_FE; + } + + if (!sess->fe->fe_req_ana) { + channel_auto_connect(&s->req); /* don't wait to establish connection */ + channel_auto_close(&s->req); /* let the producer forward close requests */ + } + + s->scf->ioto = sess->fe->timeout.client; + s->req.analyse_exp = TICK_ETERNITY; + + channel_init(&s->res); + s->res.flags |= CF_ISRESP; + s->res.analysers = 0; + + if (sess->fe->options2 & PR_O2_NODELAY) { + s->scf->flags |= SC_FL_SND_NEVERWAIT; + s->scb->flags |= SC_FL_SND_NEVERWAIT; + } + + s->scb->ioto = TICK_ETERNITY; + s->res.analyse_exp = TICK_ETERNITY; + + s->txn = NULL; + s->hlua = NULL; + + s->resolv_ctx.requester = NULL; + s->resolv_ctx.hostname_dn = NULL; + s->resolv_ctx.hostname_dn_len = 0; + s->resolv_ctx.parent = NULL; + + s->tunnel_timeout = TICK_ETERNITY; + + LIST_APPEND(&th_ctx->streams, &s->list); + + if (flt_stream_init(s) < 0 || flt_stream_start(s) < 0) + goto out_fail_accept; + + /* just in case the caller would have pre-disabled it */ + se_will_consume(s->scf->sedesc); + + if (sess->fe->accept && sess->fe->accept(s) < 0) + goto out_fail_accept; + + if (!b_is_null(input)) { + /* Xfer the input buffer to the request channel. <input> will + * then point to BUF_NULL. From this point, it is the stream's + * responsibility to release it. + */ + s->req.buf = *input; + *input = BUF_NULL; + s->req.total = (IS_HTX_STRM(s) ? htxbuf(&s->req.buf)->data : b_data(&s->req.buf)); + sc_ep_report_read_activity(s->scf); + } + + /* it is important not to call the wakeup function directly but to + * pass through task_wakeup(), because this one knows how to apply + * priorities to tasks. When using multiple threads we must be sure that + * the stream is fully initialized before calling task_wakeup. So + * the caller must handle the task_wakeup + */ + DBG_TRACE_LEAVE(STRM_EV_STRM_NEW, s); + task_wakeup(s->task, TASK_WOKEN_INIT); + return s; + + /* Error unrolling */ + out_fail_accept: + flt_stream_release(s, 0); + LIST_DELETE(&s->list); + sc_free(s->scb); + out_fail_alloc_scb: + out_fail_attach_scf: + task_destroy(t); + out_fail_alloc: + if (s) + pool_free(pool_head_stk_ctr, s->stkctr); + pool_free(pool_head_stream, s); + DBG_TRACE_DEVEL("leaving on error", STRM_EV_STRM_NEW|STRM_EV_STRM_ERR); + return NULL; +} + +/* + * frees the context associated with a stream.
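+ * (That context covers everything released below: queue position, channel + * buffers, unique ID, filters, hlua state, HTTP txn, captures, variables, + * stick counters and the two stream connectors.)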
It must have been removed first. + */ +void stream_free(struct stream *s) +{ + struct session *sess = strm_sess(s); + struct proxy *fe = sess->fe; + struct bref *bref, *back; + int i; + + DBG_TRACE_POINT(STRM_EV_STRM_FREE, s); + + /* detach the stream from its own task before even releasing it so + * that walking over a task list never exhibits a dying stream. + */ + s->task->context = NULL; + __ha_barrier_store(); + + pendconn_free(s); + + if (objt_server(s->target)) { /* there may be requests left pending in queue */ + if (s->flags & SF_CURR_SESS) { + s->flags &= ~SF_CURR_SESS; + _HA_ATOMIC_DEC(&__objt_server(s->target)->cur_sess); + } + if (may_dequeue_tasks(__objt_server(s->target), s->be)) + process_srv_queue(__objt_server(s->target)); + } + + if (unlikely(s->srv_conn)) { + /* the stream still has a reserved slot on a server, but + * it should normally be only the same as the one above, + * so this should not happen in fact. + */ + sess_change_server(s, NULL); + } + + /* We may still be present in the buffer wait queue */ + if (LIST_INLIST(&s->buffer_wait.list)) + LIST_DEL_INIT(&s->buffer_wait.list); + + if (s->req.buf.size || s->res.buf.size) { + int count = !!s->req.buf.size + !!s->res.buf.size; + + b_free(&s->req.buf); + b_free(&s->res.buf); + offer_buffers(NULL, count); + } + + pool_free(pool_head_uniqueid, s->unique_id.ptr); + s->unique_id = IST_NULL; + + flt_stream_stop(s); + flt_stream_release(s, 0); + + hlua_ctx_destroy(s->hlua); + s->hlua = NULL; + if (s->txn) + http_destroy_txn(s); + + /* ensure the client-side transport layer is destroyed */ + /* Be sure it is useless !! */ + /* if (cli_cs) */ + /* cs_close(cli_cs); */ + + for (i = 0; i < s->store_count; i++) { + if (!s->store[i].ts) + continue; + stksess_free(s->store[i].table, s->store[i].ts); + s->store[i].ts = NULL; + } + + if (s->resolv_ctx.requester) { + __decl_thread(struct resolvers *resolvers = s->resolv_ctx.parent->arg.resolv.resolvers); + + HA_SPIN_LOCK(DNS_LOCK, &resolvers->lock); + ha_free(&s->resolv_ctx.hostname_dn); + s->resolv_ctx.hostname_dn_len = 0; + resolv_unlink_resolution(s->resolv_ctx.requester); + HA_SPIN_UNLOCK(DNS_LOCK, &resolvers->lock); + + pool_free(resolv_requester_pool, s->resolv_ctx.requester); + s->resolv_ctx.requester = NULL; + } + + if (fe) { + if (s->req_cap) { + struct cap_hdr *h; + for (h = fe->req_cap; h; h = h->next) + pool_free(h->pool, s->req_cap[h->index]); + pool_free(fe->req_cap_pool, s->req_cap); + } + + if (s->res_cap) { + struct cap_hdr *h; + for (h = fe->rsp_cap; h; h = h->next) + pool_free(h->pool, s->res_cap[h->index]); + pool_free(fe->rsp_cap_pool, s->res_cap); + } + } + + /* Cleanup all variable contexts. */ + if (!LIST_ISEMPTY(&s->vars_txn.head)) + vars_prune(&s->vars_txn, s->sess, s); + if (!LIST_ISEMPTY(&s->vars_reqres.head)) + vars_prune(&s->vars_reqres, s->sess, s); + + stream_store_counters(s); + pool_free(pool_head_stk_ctr, s->stkctr); + + list_for_each_entry_safe(bref, back, &s->back_refs, users) { + /* we have to unlink all watchers. We must not relink them if + * this stream was the last one in the list. This is safe to do + * here because we're touching our thread's list so we know + * that other streams are not active, and the watchers will + * only touch their node under thread isolation. 
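+ * (Such watchers are typically "show sess" dump contexts on the CLI, which + * keep a back-reference so that an interrupted dump can resume at the next + * stream.)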
+ */ + LIST_DEL_INIT(&bref->users); + if (s->list.n != &th_ctx->streams) + LIST_APPEND(&LIST_ELEM(s->list.n, struct stream *, list)->back_refs, &bref->users); + bref->ref = s->list.n; + __ha_barrier_store(); + } + LIST_DELETE(&s->list); + + sc_destroy(s->scb); + sc_destroy(s->scf); + + pool_free(pool_head_stream, s); + + /* We may want to free the maximum amount of pools if the proxy is stopping */ + if (fe && unlikely(fe->flags & (PR_FL_DISABLED|PR_FL_STOPPED))) { + pool_flush(pool_head_buffer); + pool_flush(pool_head_http_txn); + pool_flush(pool_head_requri); + pool_flush(pool_head_capture); + pool_flush(pool_head_stream); + pool_flush(pool_head_session); + pool_flush(pool_head_connection); + pool_flush(pool_head_pendconn); + pool_flush(fe->req_cap_pool); + pool_flush(fe->rsp_cap_pool); + } +} + + +/* Allocates a work buffer for stream <s>. It is meant to be called inside + * process_stream(). It will only allocate the side needed for the function + * to work fine, which is the response buffer so that an error message may be + * built and returned. Response buffers may be allocated from the reserve, this + * is critical to ensure that a response may always flow and will never block a + * server from releasing a connection. Returns 0 in case of failure, non-zero + * otherwise. + */ +static int stream_alloc_work_buffer(struct stream *s) +{ + if (b_alloc(&s->res.buf)) + return 1; + return 0; +} + +/* releases unused buffers after processing. Typically used at the end of the + * update() functions. It will try to wake up as many tasks/applets as the + * number of buffers that it releases. In practice, most often streams are + * blocked on a single buffer, so it makes sense to try to wake two up when two + * buffers are released at once. + */ +void stream_release_buffers(struct stream *s) +{ + int offer = 0; + + if (c_size(&s->req) && c_empty(&s->req)) { + offer++; + b_free(&s->req.buf); + } + if (c_size(&s->res) && c_empty(&s->res)) { + offer++; + b_free(&s->res.buf); + } + + /* if we're certain to have at least 1 buffer available, and there is + * someone waiting, we can wake up a waiter and offer them. 
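+ * A waiter is a struct buffer_wait; streams register theirs in stream_new() + * above: + * s->buffer_wait.target = s; + * s->buffer_wait.wakeup_cb = stream_buf_available; + * so stream_buf_available() is the callback that gets woken to grab a freed + * buffer.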
+ */ + if (offer) + offer_buffers(s, offer); +} + +void stream_process_counters(struct stream *s) +{ + struct session *sess = s->sess; + unsigned long long bytes; + int i; + + bytes = s->req.total - s->logs.bytes_in; + s->logs.bytes_in = s->req.total; + if (bytes) { + _HA_ATOMIC_ADD(&sess->fe->fe_counters.bytes_in, bytes); + _HA_ATOMIC_ADD(&s->be->be_counters.bytes_in, bytes); + + if (objt_server(s->target)) + _HA_ATOMIC_ADD(&__objt_server(s->target)->counters.bytes_in, bytes); + + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_ADD(&sess->listener->counters->bytes_in, bytes); + + for (i = 0; i < global.tune.nb_stk_ctr; i++) { + if (!stkctr_inc_bytes_in_ctr(&s->stkctr[i], bytes)) + stkctr_inc_bytes_in_ctr(&sess->stkctr[i], bytes); + } + } + + bytes = s->res.total - s->logs.bytes_out; + s->logs.bytes_out = s->res.total; + if (bytes) { + _HA_ATOMIC_ADD(&sess->fe->fe_counters.bytes_out, bytes); + _HA_ATOMIC_ADD(&s->be->be_counters.bytes_out, bytes); + + if (objt_server(s->target)) + _HA_ATOMIC_ADD(&__objt_server(s->target)->counters.bytes_out, bytes); + + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_ADD(&sess->listener->counters->bytes_out, bytes); + + for (i = 0; i < global.tune.nb_stk_ctr; i++) { + if (!stkctr_inc_bytes_out_ctr(&s->stkctr[i], bytes)) + stkctr_inc_bytes_out_ctr(&sess->stkctr[i], bytes); + } + } +} + +/* Abort processing on both channels at the same time */ +void stream_abort(struct stream *s) +{ + channel_abort(&s->req); + channel_abort(&s->res); +} + +/* + * Returns a message to the client; the connection is shut down for read, + * and the request is cleared so that no server connection can be initiated. + * The buffer is marked for read shutdown on the other side to protect the + * message, and the buffer write is enabled. The message is contained in a + * "chunk". If it is null, then an empty message is used. The reply buffer does + * not need to be empty before this, and its contents will not be overwritten. + * The primary goal of this function is to return error messages to a client. + */ +void stream_retnclose(struct stream *s, const struct buffer *msg) +{ + struct channel *ic = &s->req; + struct channel *oc = &s->res; + + channel_auto_read(ic); + channel_abort(ic); + channel_erase(ic); + channel_truncate(oc); + + if (likely(msg && msg->data)) + co_inject(oc, msg->area, msg->data); + + channel_auto_read(oc); + channel_auto_close(oc); + sc_schedule_abort(s->scb); +} + +int stream_set_timeout(struct stream *s, enum act_timeout_name name, int timeout) +{ + switch (name) { + case ACT_TIMEOUT_CLIENT: + s->scf->ioto = timeout; + return 1; + + case ACT_TIMEOUT_SERVER: + s->scb->ioto = timeout; + return 1; + + case ACT_TIMEOUT_TUNNEL: + s->tunnel_timeout = timeout; + return 1; + + default: + return 0; + } +} + +/* + * This function handles the transition between the SC_ST_CON state and the + * SC_ST_EST state. It must only be called after switching from SC_ST_CON (or + * SC_ST_INI or SC_ST_RDY) to SC_ST_EST, but only when a ->proto is defined. + * Note that it will switch the interface to SC_ST_DIS if we already have + * the SC_FL_ABRT_DONE flag; it means we were able to forward the request, and + * receive the response, before process_stream() had the opportunity to + * make the switch from SC_ST_CON to SC_ST_EST. When that happens, we want + * to go through back_establish() anyway, to make sure the analysers run. + * Timeouts are cleared. Errors are reported on the channel so that analysers + * can handle them.
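+ * (For reference, the nominal backend-side progression is roughly SC_ST_INI + * -> REQ -> QUE -> TAR -> ASS -> CON -> RDY -> EST, then DIS/CLO on close; + * the same states appear in sess_set_term_flags() below.)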
+ */ +void back_establish(struct stream *s) +{ + struct connection *conn = sc_conn(s->scb); + struct channel *req = &s->req; + struct channel *rep = &s->res; + + DBG_TRACE_ENTER(STRM_EV_STRM_PROC|STRM_EV_CS_ST, s); + /* First, centralize the timers information, and clear any irrelevant + * timeout. + */ + s->logs.t_connect = ns_to_ms(now_ns - s->logs.accept_ts); + s->conn_exp = TICK_ETERNITY; + s->flags &= ~SF_CONN_EXP; + + /* errors faced after sending data need to be reported */ + if ((s->scb->flags & SC_FL_ERROR) && req->flags & CF_WROTE_DATA) { + s->req.flags |= CF_WRITE_EVENT; + s->res.flags |= CF_READ_EVENT; + s->conn_err_type = STRM_ET_DATA_ERR; + DBG_TRACE_STATE("read/write error", STRM_EV_STRM_PROC|STRM_EV_CS_ST|STRM_EV_STRM_ERR, s); + } + + if (objt_server(s->target)) + health_adjust(__objt_server(s->target), HANA_STATUS_L4_OK); + + if (!IS_HTX_STRM(s)) { /* let's allow immediate data connection in this case */ + /* if the user wants to log as soon as possible, without counting + * bytes from the server, then this is the right moment. */ + if (!LIST_ISEMPTY(&strm_fe(s)->logformat) && !(s->logs.logwait & LW_BYTES)) { + /* note: no pend_pos here, session is established */ + s->logs.t_close = s->logs.t_connect; /* to get a valid end date */ + s->do_log(s); + } + } + else { + s->scb->flags |= SC_FL_RCV_ONCE; /* a single read is enough to get response headers */ + } + + rep->analysers |= strm_fe(s)->fe_rsp_ana | s->be->be_rsp_ana; + + se_have_more_data(s->scb->sedesc); + rep->flags |= CF_READ_EVENT; /* producer is now attached */ + sc_ep_report_read_activity(s->scb); + if (conn) { + /* real connections have timeouts + * if already defined, it means that a set-timeout rule has + * been executed so do not overwrite them + */ + if (!tick_isset(s->scb->ioto)) + s->scb->ioto = s->be->timeout.server; + if (!tick_isset(s->tunnel_timeout)) + s->tunnel_timeout = s->be->timeout.tunnel; + + /* The connection is now established, try to read data from the + * underlying layer, and subscribe to recv events. We use a + * delayed recv here to give a chance to the data to flow back + * by the time we process other tasks. + */ + sc_chk_rcv(s->scb); + } + /* If we managed to get the whole response, and we don't have anything + * left to send, or can't, switch to SC_ST_DIS now. */ + if ((s->scb->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) || (s->scf->flags & SC_FL_SHUT_DONE)) { + s->scb->state = SC_ST_DIS; + DBG_TRACE_STATE("response channel shutdown for read/write", STRM_EV_STRM_PROC|STRM_EV_CS_ST|STRM_EV_STRM_ERR, s); + } + + DBG_TRACE_LEAVE(STRM_EV_STRM_PROC|STRM_EV_CS_ST, s); +} + +/* Set correct stream termination flags in case no analyser has done it. It + * also counts a failed request if the server state has not reached the request + * stage. + */ +void sess_set_term_flags(struct stream *s) +{ + if (!(s->flags & SF_FINST_MASK)) { + if (s->scb->state == SC_ST_INI) { + /* anything before REQ in fact */ + _HA_ATOMIC_INC(&strm_fe(s)->fe_counters.failed_req); + if (strm_li(s) && strm_li(s)->counters) + _HA_ATOMIC_INC(&strm_li(s)->counters->failed_req); + + s->flags |= SF_FINST_R; + } + else if (s->scb->state == SC_ST_QUE) + s->flags |= SF_FINST_Q; + else if (sc_state_in(s->scb->state, SC_SB_REQ|SC_SB_TAR|SC_SB_ASS|SC_SB_CON|SC_SB_CER|SC_SB_RDY)) + s->flags |= SF_FINST_C; + else if (s->scb->state == SC_ST_EST || s->prev_conn_state == SC_ST_EST) + s->flags |= SF_FINST_D; + else + s->flags |= SF_FINST_L; + } +} + +/* This function parses the use-service action ruleset.
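+ * (use-service rules come from "http-request use-service" or "tcp-request + * content use-service" lines in the configuration.)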
It executes + * the associated ACL and sets an applet as the stream's or txn's final node. + * It returns ACT_RET_ERR if an error occurs, leaving the proxy in a + * consistent state. It returns ACT_RET_STOP on success because + * use-service must be a terminal action. Returns ACT_RET_YIELD + * if the initialisation function requires more data. + */ +enum act_return process_use_service(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) + +{ + struct appctx *appctx; + + /* Initialises the applet if it is required. */ + if (flags & ACT_OPT_FIRST) { + /* Register the applet. This function schedules the applet. */ + s->target = &rule->applet.obj_type; + appctx = sc_applet_create(s->scb, objt_applet(s->target)); + if (unlikely(!appctx)) + return ACT_RET_ERR; + + /* Finish initialisation of the context. */ + appctx->rule = rule; + if (appctx_init(appctx) == -1) + return ACT_RET_ERR; + } + else + appctx = __sc_appctx(s->scb); + + if (rule->from != ACT_F_HTTP_REQ) { + if (sess->fe == s->be) /* report it if the request was intercepted by the frontend */ + _HA_ATOMIC_INC(&sess->fe->fe_counters.intercepted_req); + + /* The SF_ASSIGNED flag prevents server assignment. */ + s->flags |= SF_ASSIGNED; + } + + /* Now we can schedule the applet. */ + applet_need_more_data(appctx); + appctx_wakeup(appctx); + return ACT_RET_STOP; +} + +/* This stream analyser checks the switching rules and changes the backend + * if appropriate. The default_backend rule is also considered, then the + * target backend's forced persistence rules are also evaluated last if any. + * It returns 1 if the processing can continue on next analysers, or zero if it + * either needs more data or wants to immediately abort the request. + */ +static int process_switching_rules(struct stream *s, struct channel *req, int an_bit) +{ + struct persist_rule *prst_rule; + struct session *sess = s->sess; + struct proxy *fe = sess->fe; + + req->analysers &= ~an_bit; + req->analyse_exp = TICK_ETERNITY; + + DBG_TRACE_ENTER(STRM_EV_STRM_ANA, s); + + /* now check whether we have some switching rules for this request */ + if (!(s->flags & SF_BE_ASSIGNED)) { + struct switching_rule *rule; + + list_for_each_entry(rule, &fe->switching_rules, list) { + int ret = 1; + + if (rule->cond) { + ret = acl_exec_cond(rule->cond, fe, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL); + ret = acl_pass(ret); + if (rule->cond->pol == ACL_COND_UNLESS) + ret = !ret; + } + + if (ret) { + /* If the backend name is dynamic, try to resolve the name. + * If we can't resolve the name, or if any error occurs, break + * the loop and fall back to the default backend. + */ + struct proxy *backend = NULL; + + if (rule->dynamic) { + struct buffer *tmp; + + tmp = alloc_trash_chunk(); + if (!tmp) + goto sw_failed; + + if (build_logline(s, tmp->area, tmp->size, &rule->be.expr)) + backend = proxy_be_by_name(tmp->area); + + free_trash_chunk(tmp); + tmp = NULL; + + if (!backend) + break; + } + else + backend = rule->be.backend; + + if (!stream_set_backend(s, backend)) + goto sw_failed; + break; + } + } + + /* To ensure correct connection accounting on the backend, we + * have to assign one if it was not set (eg: a listen). This + * measure also takes care of correctly setting the default + * backend if any. Don't do anything if an upgrade is already in + * progress. + */ + if (!(s->flags & (SF_BE_ASSIGNED|SF_IGNORE))) + if (!stream_set_backend(s, fe->defbe.be ? fe->defbe.be : s->be)) + goto sw_failed; + + /* No backend assigned but no error reported.
It happens when a + * TCP stream is upgraded to HTTP/2. + */ + if ((s->flags & (SF_BE_ASSIGNED|SF_IGNORE)) == SF_IGNORE) { + DBG_TRACE_DEVEL("leaving with no backend because of a destructive upgrade", STRM_EV_STRM_ANA, s); + return 0; + } + + } + + /* we don't want to run the TCP or HTTP filters again if the backend has not changed */ + if (fe == s->be) { + s->req.analysers &= ~AN_REQ_INSPECT_BE; + s->req.analysers &= ~AN_REQ_HTTP_PROCESS_BE; + s->req.analysers &= ~AN_REQ_FLT_START_BE; + } + + /* as soon as we know the backend, we must check if we have a matching forced or ignored + * persistence rule, and report that in the stream. + */ + list_for_each_entry(prst_rule, &s->be->persist_rules, list) { + int ret = 1; + + if (prst_rule->cond) { + ret = acl_exec_cond(prst_rule->cond, s->be, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL); + ret = acl_pass(ret); + if (prst_rule->cond->pol == ACL_COND_UNLESS) + ret = !ret; + } + + if (ret) { + /* no rule, or the rule matches */ + if (prst_rule->type == PERSIST_TYPE_FORCE) { + s->flags |= SF_FORCE_PRST; + } else { + s->flags |= SF_IGNORE_PRST; + } + break; + } + } + + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA, s); + return 1; + + sw_failed: + /* immediately abort this request in case of allocation failure */ + stream_abort(s); + + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_RESOURCE; + if (!(s->flags & SF_FINST_MASK)) + s->flags |= SF_FINST_R; + + if (s->txn) + s->txn->status = 500; + s->req.analysers &= AN_REQ_FLT_END; + s->req.analyse_exp = TICK_ETERNITY; + DBG_TRACE_DEVEL("leaving on error", STRM_EV_STRM_ANA|STRM_EV_STRM_ERR, s); + return 0; +} + +/* This stream analyser works on a request. It applies all use-server rules on + * it then returns 1. The data must already be present in the buffer otherwise + * they won't match. It always returns 1. + */ +static int process_server_rules(struct stream *s, struct channel *req, int an_bit) +{ + struct proxy *px = s->be; + struct session *sess = s->sess; + struct server_rule *rule; + + DBG_TRACE_ENTER(STRM_EV_STRM_ANA, s); + + if (!(s->flags & SF_ASSIGNED)) { + list_for_each_entry(rule, &px->server_rules, list) { + int ret; + + ret = acl_exec_cond(rule->cond, s->be, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL); + ret = acl_pass(ret); + if (rule->cond->pol == ACL_COND_UNLESS) + ret = !ret; + + if (ret) { + struct server *srv; + + if (rule->dynamic) { + struct buffer *tmp = get_trash_chunk(); + + if (!build_logline(s, tmp->area, tmp->size, &rule->expr)) + break; + + srv = findserver(s->be, tmp->area); + if (!srv) + break; + } + else + srv = rule->srv.ptr; + + if ((srv->cur_state != SRV_ST_STOPPED) || + (px->options & PR_O_PERSIST) || + (s->flags & SF_FORCE_PRST)) { + s->flags |= SF_DIRECT | SF_ASSIGNED; + s->target = &srv->obj_type; + break; + } + /* if the server is not UP, let's go on with next rules + * just in case another one is suited. 
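+ * (Rules are therefore evaluated in declaration order, and the first one + * whose condition matches an eligible server wins; a stopped server is only + * eligible when persistence is forced.)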
+ */ + } + } + } + + req->analysers &= ~an_bit; + req->analyse_exp = TICK_ETERNITY; + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA, s); + return 1; +} + +static inline void sticking_rule_find_target(struct stream *s, + struct stktable *t, struct stksess *ts) +{ + struct proxy *px = s->be; + struct eb32_node *node; + struct dict_entry *de; + void *ptr; + struct server *srv; + + /* Look for the server name previously stored in <t> stick-table */ + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &ts->lock); + ptr = __stktable_data_ptr(t, ts, STKTABLE_DT_SERVER_KEY); + de = stktable_data_cast(ptr, std_t_dict); + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &ts->lock); + + if (de) { + struct ebpt_node *node; + + if (t->server_key_type == STKTABLE_SRV_NAME) { + node = ebis_lookup(&px->conf.used_server_name, de->value.key); + if (node) { + srv = container_of(node, struct server, conf.name); + goto found; + } + } else if (t->server_key_type == STKTABLE_SRV_ADDR) { + HA_RWLOCK_RDLOCK(PROXY_LOCK, &px->lock); + node = ebis_lookup(&px->used_server_addr, de->value.key); + HA_RWLOCK_RDUNLOCK(PROXY_LOCK, &px->lock); + if (node) { + srv = container_of(node, struct server, addr_node); + goto found; + } + } + } + + /* Look for the server ID */ + HA_RWLOCK_RDLOCK(STK_SESS_LOCK, &ts->lock); + ptr = __stktable_data_ptr(t, ts, STKTABLE_DT_SERVER_ID); + node = eb32_lookup(&px->conf.used_server_id, stktable_data_cast(ptr, std_t_sint)); + HA_RWLOCK_RDUNLOCK(STK_SESS_LOCK, &ts->lock); + + if (!node) + return; + + srv = container_of(node, struct server, conf.id); + found: + if ((srv->cur_state != SRV_ST_STOPPED) || + (px->options & PR_O_PERSIST) || (s->flags & SF_FORCE_PRST)) { + s->flags |= SF_DIRECT | SF_ASSIGNED; + s->target = &srv->obj_type; + } +} + +/* This stream analyser works on a request. It applies all sticking rules on + * it then returns 1. The data must already be present in the buffer otherwise + * they won't match. It always returns 1. + */ +static int process_sticking_rules(struct stream *s, struct channel *req, int an_bit) +{ + struct proxy *px = s->be; + struct session *sess = s->sess; + struct sticking_rule *rule; + + DBG_TRACE_ENTER(STRM_EV_STRM_ANA, s); + + list_for_each_entry(rule, &px->sticking_rules, list) { + int ret = 1 ; + int i; + + /* Only the first stick store-request of each table is applied + * and other ones are ignored. The purpose is to allow complex + * configurations which look for multiple entries by decreasing + * order of precision and to stop at the first which matches. + * An example could be a store of the IP address from an HTTP + * header first, then from the source if not found. 
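+ * In configuration terms, that example would be two "stick store-request" + * rules on the same table, e.g. "stick store-request req.hdr(X-Forwarded-For)" + * followed by "stick store-request src" (hypothetical header name).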
+ */ + if (rule->flags & STK_IS_STORE) { + for (i = 0; i < s->store_count; i++) { + if (rule->table.t == s->store[i].table) + break; + } + + if (i != s->store_count) + continue; + } + + if (rule->cond) { + ret = acl_exec_cond(rule->cond, px, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL); + ret = acl_pass(ret); + if (rule->cond->pol == ACL_COND_UNLESS) + ret = !ret; + } + + if (ret) { + struct stktable_key *key; + + key = stktable_fetch_key(rule->table.t, px, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL, rule->expr, NULL); + if (!key) + continue; + + if (rule->flags & STK_IS_MATCH) { + struct stksess *ts; + + if ((ts = stktable_lookup_key(rule->table.t, key)) != NULL) { + if (!(s->flags & SF_ASSIGNED)) + sticking_rule_find_target(s, rule->table.t, ts); + stktable_touch_local(rule->table.t, ts, 1); + } + } + if (rule->flags & STK_IS_STORE) { + if (s->store_count < (sizeof(s->store) / sizeof(s->store[0]))) { + struct stksess *ts; + + ts = stksess_new(rule->table.t, key); + if (ts) { + s->store[s->store_count].table = rule->table.t; + s->store[s->store_count++].ts = ts; + } + } + } + } + } + + req->analysers &= ~an_bit; + req->analyse_exp = TICK_ETERNITY; + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA, s); + return 1; +} + +/* This stream analyser works on a response. It applies all store rules on it + * then returns 1. The data must already be present in the buffer otherwise + * they won't match. It always returns 1. + */ +static int process_store_rules(struct stream *s, struct channel *rep, int an_bit) +{ + struct proxy *px = s->be; + struct session *sess = s->sess; + struct sticking_rule *rule; + int i; + int nbreq = s->store_count; + + DBG_TRACE_ENTER(STRM_EV_STRM_ANA, s); + + list_for_each_entry(rule, &px->storersp_rules, list) { + int ret = 1 ; + + /* Only the first stick store-response of each table is applied + * and other ones are ignored. The purpose is to allow complex + * configurations which look for multiple entries by decreasing + * order of precision and to stop at the first which matches. + * An example could be a store of a set-cookie value, with a + * fallback to a parameter found in a 302 redirect. + * + * The store-response rules are not allowed to override the + * store-request rules for the same table, but they may coexist. + * Thus we can have up to one store-request entry and one store- + * response entry for the same table at any time. 
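+ * (The <nbreq> snapshot taken on entry marks where the request-time entries + * end, so the loop below only has to scan the entries added by the + * store-response rules above.)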
+		 */
+		for (i = nbreq; i < s->store_count; i++) {
+			if (rule->table.t == s->store[i].table)
+				break;
+		}
+
+		/* skip existing entries for this table */
+		if (i < s->store_count)
+			continue;
+
+		if (rule->cond) {
+			ret = acl_exec_cond(rule->cond, px, sess, s, SMP_OPT_DIR_RES|SMP_OPT_FINAL);
+			ret = acl_pass(ret);
+			if (rule->cond->pol == ACL_COND_UNLESS)
+				ret = !ret;
+		}
+
+		if (ret) {
+			struct stktable_key *key;
+
+			key = stktable_fetch_key(rule->table.t, px, sess, s, SMP_OPT_DIR_RES|SMP_OPT_FINAL, rule->expr, NULL);
+			if (!key)
+				continue;
+
+			if (s->store_count < (sizeof(s->store) / sizeof(s->store[0]))) {
+				struct stksess *ts;
+
+				ts = stksess_new(rule->table.t, key);
+				if (ts) {
+					s->store[s->store_count].table = rule->table.t;
+					s->store[s->store_count++].ts = ts;
+				}
+			}
+		}
+	}
+
+	/* process store request and store response */
+	for (i = 0; i < s->store_count; i++) {
+		struct stksess *ts;
+		void *ptr;
+		char *key;
+		struct dict_entry *de;
+		struct stktable *t = s->store[i].table;
+
+		if (!objt_server(s->target) || (__objt_server(s->target)->flags & SRV_F_NON_STICK)) {
+			stksess_free(s->store[i].table, s->store[i].ts);
+			s->store[i].ts = NULL;
+			continue;
+		}
+
+		ts = stktable_set_entry(t, s->store[i].ts);
+		if (ts != s->store[i].ts) {
+			/* the entry already existed, we can free ours */
+			stksess_free(t, s->store[i].ts);
+		}
+		s->store[i].ts = NULL;
+
+		if (t->server_key_type == STKTABLE_SRV_NAME)
+			key = __objt_server(s->target)->id;
+		else if (t->server_key_type == STKTABLE_SRV_ADDR)
+			key = __objt_server(s->target)->addr_node.key;
+		else
+			key = NULL;
+
+		HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock);
+		ptr = __stktable_data_ptr(t, ts, STKTABLE_DT_SERVER_ID);
+		stktable_data_cast(ptr, std_t_sint) = __objt_server(s->target)->puid;
+
+		if (key) {
+			de = dict_insert(&server_key_dict, key);
+			if (de) {
+				ptr = __stktable_data_ptr(t, ts, STKTABLE_DT_SERVER_KEY);
+				stktable_data_cast(ptr, std_t_dict) = de;
+			}
+		}
+
+		HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock);
+
+		stktable_touch_local(t, ts, 1);
+	}
+	s->store_count = 0; /* everything is stored */
+
+	rep->analysers &= ~an_bit;
+	rep->analyse_exp = TICK_ETERNITY;
+
+	DBG_TRACE_LEAVE(STRM_EV_STRM_ANA, s);
+	return 1;
+}
+
+/* Set the stream to HTTP mode, if necessary. The minimal request HTTP analysers
+ * are set and the client mux is upgraded. It returns 1 if the stream processing
+ * may continue or 0 if it should be stopped. It happens on error or if the
+ * upgrade required a new stream. The mux protocol may be specified.
+ */
+int stream_set_http_mode(struct stream *s, const struct mux_proto_list *mux_proto)
+{
+	struct stconn *sc = s->scf;
+	struct connection *conn;
+
+	/* Already an HTTP stream */
+	if (IS_HTX_STRM(s))
+		return 1;
+
+	s->req.analysers |= AN_REQ_WAIT_HTTP|AN_REQ_HTTP_PROCESS_FE;
+
+	if (unlikely(!s->txn && !http_create_txn(s)))
+		return 0;
+
+	conn = sc_conn(sc);
+	if (conn) {
+		se_have_more_data(s->scf->sedesc);
+		/* Make sure we're unsubscribed, the new
+		 * mux will probably want to subscribe to
+		 * the underlying XPRT
+		 */
+		if (s->scf->wait_event.events)
+			conn->mux->unsubscribe(sc, s->scf->wait_event.events, &(s->scf->wait_event));
+
+		if (conn->mux->flags & MX_FL_NO_UPG)
+			return 0;
+
+		sc_conn_prepare_endp_upgrade(sc);
+		if (conn_upgrade_mux_fe(conn, sc, &s->req.buf,
+					(mux_proto ? mux_proto->token : ist("")),
+					PROTO_MODE_HTTP) == -1) {
+			sc_conn_abort_endp_upgrade(sc);
+			return 0;
+		}
+		sc_conn_commit_endp_upgrade(sc);
+
+		s->req.flags &= ~(CF_READ_EVENT|CF_AUTO_CONNECT);
+		s->req.total = 0;
+		s->flags |= SF_IGNORE;
+		if (sc_ep_test(sc, SE_FL_DETACHED)) {
+			/* If stream connector is detached, it means it was not
+			 * reused by the new mux. So destroy it, disable
+			 * logging, and abort the stream process. Thus the
+			 * stream will be silently destroyed. The new mux will
+			 * create new streams.
+			 */
+			s->logs.logwait = 0;
+			s->logs.level = 0;
+			stream_abort(s);
+			s->req.analysers &= AN_REQ_FLT_END;
+			s->req.analyse_exp = TICK_ETERNITY;
+		}
+	}
+
+	return 1;
+}
+
+
+/* Updates at once the channel flags, and timers of both stream connectors of a
+ * same stream, to complete the work after the analysers, then updates the data
+ * layer below. This will ensure that any synchronous update performed at the
+ * data layer will be reflected in the channel flags and/or stream connector.
+ * Note that this does not change the stream connector's current state, though
+ * it updates the previous state to the current one.
+ */
+void stream_update_both_sc(struct stream *s)
+{
+	struct stconn *scf = s->scf;
+	struct stconn *scb = s->scb;
+	struct channel *req = &s->req;
+	struct channel *res = &s->res;
+
+	req->flags &= ~(CF_READ_EVENT|CF_WRITE_EVENT);
+	res->flags &= ~(CF_READ_EVENT|CF_WRITE_EVENT);
+
+	s->prev_conn_state = scb->state;
+
+	/* let's recompute both sides states */
+	if (sc_state_in(scf->state, SC_SB_RDY|SC_SB_EST))
+		sc_update(scf);
+
+	if (sc_state_in(scb->state, SC_SB_RDY|SC_SB_EST))
+		sc_update(scb);
+
+	/* stream connectors are processed outside of process_stream() and must be
+	 * handled at the latest moment.
+	 */
+	if (sc_appctx(scf)) {
+		if (sc_is_recv_allowed(scf) || sc_is_send_allowed(scf))
+			appctx_wakeup(__sc_appctx(scf));
+	}
+	if (sc_appctx(scb)) {
+		if (sc_is_recv_allowed(scb) || sc_is_send_allowed(scb))
+			appctx_wakeup(__sc_appctx(scb));
+	}
+}
+
+/* check SC and channel timeouts, and close the corresponding stream connectors
+ * for future reads or writes.
+ * Note: this will also concern upper layers but we do not touch any other
+ * flag. We must be careful and correctly detect state changes when calling
+ * them.
+ */
+static void stream_handle_timeouts(struct stream *s)
+{
+	stream_check_conn_timeout(s);
+
+	sc_check_timeouts(s->scf);
+	channel_check_timeout(&s->req);
+	sc_check_timeouts(s->scb);
+	channel_check_timeout(&s->res);
+
+	if (unlikely(!(s->scb->flags & SC_FL_SHUT_DONE) && (s->req.flags & CF_WRITE_TIMEOUT))) {
+		s->scb->flags |= SC_FL_NOLINGER;
+		sc_shutdown(s->scb);
+	}
+
+	if (unlikely(!(s->scf->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) && (s->req.flags & CF_READ_TIMEOUT))) {
+		if (s->scf->flags & SC_FL_NOHALF)
+			s->scf->flags |= SC_FL_NOLINGER;
+		sc_abort(s->scf);
+	}
+	if (unlikely(!(s->scf->flags & SC_FL_SHUT_DONE) && (s->res.flags & CF_WRITE_TIMEOUT))) {
+		s->scf->flags |= SC_FL_NOLINGER;
+		sc_shutdown(s->scf);
+	}
+
+	if (unlikely(!(s->scb->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) && (s->res.flags & CF_READ_TIMEOUT))) {
+		if (s->scb->flags & SC_FL_NOHALF)
+			s->scb->flags |= SC_FL_NOLINGER;
+		sc_abort(s->scb);
+	}
+
+	if (HAS_FILTERS(s))
+		flt_stream_check_timeouts(s);
+}
+
+/* if the current task's wake_date was set, it's being profiled, thus we may
+ * report latencies and CPU usages in logs, so it's desirable to update the
+ * latency when entering process_stream().
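+ * Concretely this accumulates (sched_call_date - sched_wake_date), both
+ * filled in by the scheduler, into s->lat_time, which can later be
+ * reported e.g. via the lat_ns_tot/lat_ns_avg sample fetches.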
+ */
+static void stream_cond_update_cpu_latency(struct stream *s)
+{
+	uint32_t lat = th_ctx->sched_call_date - th_ctx->sched_wake_date;
+
+	s->lat_time += lat;
+}
+
+/* if the current task's wake_date was set, it's being profiled, thus we may
+ * report latencies and CPU usages in logs, so it's desirable to do that before
+ * logging in order to report accurate CPU usage. In this case we count that
+ * final part and reset the wake date so that the scheduler doesn't do it a
+ * second time, and by doing so we also avoid an extra call to clock_gettime().
+ * The CPU usage will be off by the little time needed to run over stream_free()
+ * but that's only marginal.
+ */
+static void stream_cond_update_cpu_usage(struct stream *s)
+{
+	uint32_t cpu;
+
+	/* stats are only registered for non-zero wake dates */
+	if (likely(!th_ctx->sched_wake_date))
+		return;
+
+	cpu = (uint32_t)now_mono_time() - th_ctx->sched_call_date;
+	s->cpu_time += cpu;
+	HA_ATOMIC_ADD(&th_ctx->sched_profile_entry->cpu_time, cpu);
+	th_ctx->sched_wake_date = 0;
+}
+
+/* this function is called directly by the scheduler for tasks whose
+ * ->process points to process_stream(), and is used to keep latencies
+ * and CPU usage measurements accurate.
+ */
+void stream_update_timings(struct task *t, uint64_t lat, uint64_t cpu)
+{
+	struct stream *s = t->context;
+	s->lat_time += lat;
+	s->cpu_time += cpu;
+}
+
+
+/* This macro is very specific to the function below. See the comments in
+ * process_stream() below to understand the logic and the tests.
+ */
+#define UPDATE_ANALYSERS(real, list, back, flag) {			\
+		list = (((list) & ~(flag)) | ~(back)) & (real);		\
+		back = real;						\
+		if (!(list))						\
+			break;						\
+		if (((list) ^ ((list) & ((list) - 1))) < (flag))	\
+			continue;					\
+}
+
+/* These 2 following macros call an analyzer for the specified channel if the
+ * right flag is set. The first one is used for "filterable" analyzers. If a
+ * stream has some registered filters, pre and post analyze callbacks are
+ * called. The second one is used for other analyzers (AN_REQ/RES_FLT_* and
+ * AN_REQ/RES_HTTP_XFER_BODY) */
+#define FLT_ANALYZE(strm, chn, fun, list, back, flag, ...)			\
+	{									\
+		if ((list) & (flag)) {						\
+			if (HAS_FILTERS(strm)) {				\
+				if (!flt_pre_analyze((strm), (chn), (flag)))	\
+					break;					\
+				if (!fun((strm), (chn), (flag), ##__VA_ARGS__))	\
+					break;					\
+				if (!flt_post_analyze((strm), (chn), (flag)))	\
+					break;					\
+			}							\
+			else {							\
+				if (!fun((strm), (chn), (flag), ##__VA_ARGS__))	\
+					break;					\
+			}							\
+			UPDATE_ANALYSERS((chn)->analysers, (list),		\
+					 (back), (flag));			\
+		}								\
+	}
+
+#define ANALYZE(strm, chn, fun, list, back, flag, ...)			\
+	{								\
+		if ((list) & (flag)) {					\
+			if (!fun((strm), (chn), (flag), ##__VA_ARGS__))	\
+				break;					\
+			UPDATE_ANALYSERS((chn)->analysers, (list),	\
+					 (back), (flag));		\
+		}							\
+	}
+
+/* Processes the client, server, request and response jobs of a stream task,
+ * then puts it back to the wait queue in a clean state, or cleans up its
+ * resources if it must be deleted. Returns in <next> the date the task wants
+ * to be woken up, or TICK_ETERNITY. In order not to call all functions for
+ * nothing too many times, the request and response buffers flags are monitored
+ * and each function is called only if at least another function has changed at
+ * least one flag it is interested in.
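+ *
+ * As a worked example of the UPDATE_ANALYSERS() logic above (values are
+ * purely illustrative): with ana_list = ana_back = 0x0c, the analyser for
+ * bit 0x04 runs, removes itself and registers bit 0x02, so the channel's
+ * analysers become 0x0a. The macro then computes
+ * list = ((0x0c & ~0x04) | ~0x0c) & 0x0a = 0x0a, and since the lowest set
+ * bit of the new list (0x02) is below the current flag (0x04), it issues
+ * "continue" so the newly enabled lower analyser runs immediately on the
+ * next pass.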
+ */
+struct task *process_stream(struct task *t, void *context, unsigned int state)
+{
+	struct server *srv;
+	struct stream *s = context;
+	struct session *sess = s->sess;
+	unsigned int scf_flags, scb_flags;
+	unsigned int rqf_last, rpf_last;
+	unsigned int rq_prod_last, rq_cons_last;
+	unsigned int rp_cons_last, rp_prod_last;
+	unsigned int req_ana_back, res_ana_back;
+	struct channel *req, *res;
+	struct stconn *scf, *scb;
+	unsigned int rate;
+
+	DBG_TRACE_ENTER(STRM_EV_STRM_PROC, s);
+
+	activity[tid].stream_calls++;
+	stream_cond_update_cpu_latency(s);
+
+	req = &s->req;
+	res = &s->res;
+
+	scf = s->scf;
+	scb = s->scb;
+
+	/* First, attempt to receive pending data from I/O layers */
+	sc_conn_sync_recv(scf);
+	sc_conn_sync_recv(scb);
+
+	/* Let's check if we're looping without making any progress, e.g. due
+	 * to a bogus analyser or the fact that we're ignoring a read0. The
+	 * call_rate counter only counts calls with no progress made.
+	 */
+	if (!((req->flags | res->flags) & (CF_READ_EVENT|CF_WRITE_EVENT))) {
+		rate = update_freq_ctr(&s->call_rate, 1);
+		if (rate >= 100000 && s->call_rate.prev_ctr) // make sure to wait at least a full second
+			stream_dump_and_crash(&s->obj_type, read_freq_ctr(&s->call_rate));
+	}
+
+	/* this data may be no longer valid, clear it */
+	if (s->txn)
+		memset(&s->txn->auth, 0, sizeof(s->txn->auth));
+
+	/* This flag must explicitly be set every time */
+	req->flags &= ~CF_WAKE_WRITE;
+	res->flags &= ~CF_WAKE_WRITE;
+
+	/* Keep a copy of req/rep flags so that we can detect shutdowns */
+	rqf_last = req->flags & ~CF_MASK_ANALYSER;
+	rpf_last = res->flags & ~CF_MASK_ANALYSER;
+
+	/* we don't want the stream connector functions to recursively wake us up */
+	scf->flags |= SC_FL_DONT_WAKE;
+	scb->flags |= SC_FL_DONT_WAKE;
+
+	/* Keep a copy of SC flags */
+	scf_flags = scf->flags;
+	scb_flags = scb->flags;
+
+	/* update pending events */
+	s->pending_events |= (state & TASK_WOKEN_ANY);
+
+	/* 1a: Check for low level timeouts if needed. We just set a flag on
+	 * stream connectors when their timeouts have expired.
+	 */
+	if (unlikely(s->pending_events & TASK_WOKEN_TIMER)) {
+		stream_handle_timeouts(s);
+
+		/* Once in a while we're woken up because the task expires. But
+		 * this does not necessarily mean that a timeout has been reached.
+		 * So let's not run a whole stream processing if only an expiration
+		 * timeout needs to be refreshed.
+		 */
+		if (!((scf->flags | scb->flags) & (SC_FL_ERROR|SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_SHUT_DONE)) &&
+		    !((req->flags | res->flags) & (CF_READ_EVENT|CF_READ_TIMEOUT|CF_WRITE_EVENT|CF_WRITE_TIMEOUT)) &&
+		    !(s->flags & SF_CONN_EXP) &&
+		    ((s->pending_events & TASK_WOKEN_ANY) == TASK_WOKEN_TIMER)) {
+			scf->flags &= ~SC_FL_DONT_WAKE;
+			scb->flags &= ~SC_FL_DONT_WAKE;
+			goto update_exp_and_leave;
+		}
+	}
+
+ resync_stconns:
+	/* below we may emit error messages so we have to ensure that we have
+	 * our buffers properly allocated. If the allocation failed, an error is
+	 * triggered.
+	 *
+	 * NOTE: An error is returned because the mechanism to queue entities
+	 *       waiting for a buffer is totally broken for now. However, this
+	 *       part must be refactored. When it is handled, this part
+	 *       must be reviewed too.
+ */ + if (!stream_alloc_work_buffer(s)) { + scf->flags |= SC_FL_ERROR; + s->conn_err_type = STRM_ET_CONN_RES; + + scb->flags |= SC_FL_ERROR; + s->conn_err_type = STRM_ET_CONN_RES; + + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_RESOURCE; + sess_set_term_flags(s); + } + + /* 1b: check for low-level errors reported at the stream connector. + * First we check if it's a retryable error (in which case we don't + * want to tell the buffer). Otherwise we report the error one level + * upper by setting flags into the buffers. Note that the side towards + * the client cannot have connect (hence retryable) errors. Also, the + * connection setup code must be able to deal with any type of abort. + */ + srv = objt_server(s->target); + if (unlikely(scf->flags & SC_FL_ERROR)) { + if (sc_state_in(scf->state, SC_SB_EST|SC_SB_DIS)) { + sc_abort(scf); + sc_shutdown(scf); + //sc_report_error(scf); TODO: Be sure it is useless + if (!(req->analysers) && !(res->analysers)) { + _HA_ATOMIC_INC(&s->be->be_counters.cli_aborts); + _HA_ATOMIC_INC(&sess->fe->fe_counters.cli_aborts); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->cli_aborts); + if (srv) + _HA_ATOMIC_INC(&srv->counters.cli_aborts); + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_CLICL; + if (!(s->flags & SF_FINST_MASK)) + s->flags |= SF_FINST_D; + } + } + } + + if (unlikely(scb->flags & SC_FL_ERROR)) { + if (sc_state_in(scb->state, SC_SB_EST|SC_SB_DIS)) { + sc_abort(scb); + sc_shutdown(scb); + //sc_report_error(scb); TODO: Be sure it is useless + _HA_ATOMIC_INC(&s->be->be_counters.failed_resp); + if (srv) + _HA_ATOMIC_INC(&srv->counters.failed_resp); + if (!(req->analysers) && !(res->analysers)) { + _HA_ATOMIC_INC(&s->be->be_counters.srv_aborts); + _HA_ATOMIC_INC(&sess->fe->fe_counters.srv_aborts); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->srv_aborts); + if (srv) + _HA_ATOMIC_INC(&srv->counters.srv_aborts); + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_SRVCL; + if (!(s->flags & SF_FINST_MASK)) + s->flags |= SF_FINST_D; + } + } + /* note: maybe we should process connection errors here ? */ + } + + if (sc_state_in(scb->state, SC_SB_CON|SC_SB_RDY)) { + /* we were trying to establish a connection on the server side, + * maybe it succeeded, maybe it failed, maybe we timed out, ... + */ + if (scb->state == SC_ST_RDY) + back_handle_st_rdy(s); + else if (s->scb->state == SC_ST_CON) + back_handle_st_con(s); + + if (scb->state == SC_ST_CER) + back_handle_st_cer(s); + else if (scb->state == SC_ST_EST) + back_establish(s); + + /* state is now one of SC_ST_CON (still in progress), SC_ST_EST + * (established), SC_ST_DIS (abort), SC_ST_CLO (last error), + * SC_ST_ASS/SC_ST_TAR/SC_ST_REQ for retryable errors. + */ + } + + rq_prod_last = scf->state; + rq_cons_last = scb->state; + rp_cons_last = scf->state; + rp_prod_last = scb->state; + + /* Check for connection closure */ + DBG_TRACE_POINT(STRM_EV_STRM_PROC, s); + + /* nothing special to be done on client side */ + if (unlikely(scf->state == SC_ST_DIS)) { + scf->state = SC_ST_CLO; + + /* This is needed only when debugging is enabled, to indicate + * client-side close. 
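+		 * The line written on stdout then looks like, with made-up
+		 * values: "00000012:app.clicls[000a:000c]" (stream unique id,
+		 * backend name, then client-side and server-side fds).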
+ */ + if (unlikely((global.mode & MODE_DEBUG) && + (!(global.mode & MODE_QUIET) || + (global.mode & MODE_VERBOSE)))) { + chunk_printf(&trash, "%08x:%s.clicls[%04x:%04x]\n", + s->uniq_id, s->be->id, + (unsigned short)conn_fd(sc_conn(scf)), + (unsigned short)conn_fd(sc_conn(scb))); + DISGUISE(write(1, trash.area, trash.data)); + } + } + + /* When a server-side connection is released, we have to count it and + * check for pending connections on this server. + */ + if (unlikely(scb->state == SC_ST_DIS)) { + scb->state = SC_ST_CLO; + srv = objt_server(s->target); + if (srv) { + if (s->flags & SF_CURR_SESS) { + s->flags &= ~SF_CURR_SESS; + _HA_ATOMIC_DEC(&srv->cur_sess); + } + sess_change_server(s, NULL); + if (may_dequeue_tasks(srv, s->be)) + process_srv_queue(srv); + } + + /* This is needed only when debugging is enabled, to indicate + * server-side close. + */ + if (unlikely((global.mode & MODE_DEBUG) && + (!(global.mode & MODE_QUIET) || + (global.mode & MODE_VERBOSE)))) { + if (s->prev_conn_state == SC_ST_EST) { + chunk_printf(&trash, "%08x:%s.srvcls[%04x:%04x]\n", + s->uniq_id, s->be->id, + (unsigned short)conn_fd(sc_conn(scf)), + (unsigned short)conn_fd(sc_conn(scb))); + DISGUISE(write(1, trash.area, trash.data)); + } + } + } + + /* + * Note: of the transient states (REQ, CER, DIS), only REQ may remain + * at this point. + */ + + resync_request: + /* Analyse request */ + if (((req->flags & ~rqf_last) & CF_MASK_ANALYSER) || + ((scf->flags ^ scf_flags) & (SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_ABRT_WANTED)) || + ((scb->flags ^ scb_flags) & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) || + (req->analysers && (scb->flags & SC_FL_SHUT_DONE)) || + scf->state != rq_prod_last || + scb->state != rq_cons_last || + s->pending_events & TASK_WOKEN_MSG) { + unsigned int scf_flags_ana = scf->flags; + unsigned int scb_flags_ana = scb->flags; + + if (sc_state_in(scf->state, SC_SB_EST|SC_SB_DIS|SC_SB_CLO)) { + int max_loops = global.tune.maxpollevents; + unsigned int ana_list; + unsigned int ana_back; + + /* it's up to the analysers to stop new connections, + * disable reading or closing. Note: if an analyser + * disables any of these bits, it is responsible for + * enabling them again when it disables itself, so + * that other analysers are called in similar conditions. + */ + channel_auto_read(req); + channel_auto_connect(req); + channel_auto_close(req); + + /* We will call all analysers for which a bit is set in + * req->analysers, following the bit order from LSB + * to MSB. The analysers must remove themselves from + * the list when not needed. Any analyser may return 0 + * to break out of the loop, either because of missing + * data to take a decision, or because it decides to + * kill the stream. We loop at least once through each + * analyser, and we may loop again if other analysers + * are added in the middle. + * + * We build a list of analysers to run. We evaluate all + * of these analysers in the order of the lower bit to + * the higher bit. This ordering is very important. + * An analyser will often add/remove other analysers, + * including itself. Any changes to itself have no effect + * on the loop. If it removes any other analysers, we + * want those analysers not to be called anymore during + * this loop. If it adds an analyser that is located + * after itself, we want it to be scheduled for being + * processed during the loop. If it adds an analyser + * which is located before it, we want it to switch to + * it immediately, even if it has already been called + * once but removed since. 
+ * + * In order to achieve this, we compare the analyser + * list after the call with a copy of it before the + * call. The work list is fed with analyser bits that + * appeared during the call. Then we compare previous + * work list with the new one, and check the bits that + * appeared. If the lowest of these bits is lower than + * the current bit, it means we have enabled a previous + * analyser and must immediately loop again. + */ + + ana_list = ana_back = req->analysers; + while (ana_list && max_loops--) { + /* Warning! ensure that analysers are always placed in ascending order! */ + ANALYZE (s, req, flt_start_analyze, ana_list, ana_back, AN_REQ_FLT_START_FE); + FLT_ANALYZE(s, req, tcp_inspect_request, ana_list, ana_back, AN_REQ_INSPECT_FE); + FLT_ANALYZE(s, req, http_wait_for_request, ana_list, ana_back, AN_REQ_WAIT_HTTP); + FLT_ANALYZE(s, req, http_wait_for_request_body, ana_list, ana_back, AN_REQ_HTTP_BODY); + FLT_ANALYZE(s, req, http_process_req_common, ana_list, ana_back, AN_REQ_HTTP_PROCESS_FE, sess->fe); + FLT_ANALYZE(s, req, process_switching_rules, ana_list, ana_back, AN_REQ_SWITCHING_RULES); + ANALYZE (s, req, flt_start_analyze, ana_list, ana_back, AN_REQ_FLT_START_BE); + FLT_ANALYZE(s, req, tcp_inspect_request, ana_list, ana_back, AN_REQ_INSPECT_BE); + FLT_ANALYZE(s, req, http_process_req_common, ana_list, ana_back, AN_REQ_HTTP_PROCESS_BE, s->be); + FLT_ANALYZE(s, req, http_process_tarpit, ana_list, ana_back, AN_REQ_HTTP_TARPIT); + FLT_ANALYZE(s, req, process_server_rules, ana_list, ana_back, AN_REQ_SRV_RULES); + FLT_ANALYZE(s, req, http_process_request, ana_list, ana_back, AN_REQ_HTTP_INNER); + FLT_ANALYZE(s, req, tcp_persist_rdp_cookie, ana_list, ana_back, AN_REQ_PRST_RDP_COOKIE); + FLT_ANALYZE(s, req, process_sticking_rules, ana_list, ana_back, AN_REQ_STICKING_RULES); + ANALYZE (s, req, flt_analyze_http_headers, ana_list, ana_back, AN_REQ_FLT_HTTP_HDRS); + ANALYZE (s, req, http_request_forward_body, ana_list, ana_back, AN_REQ_HTTP_XFER_BODY); + ANALYZE (s, req, pcli_wait_for_request, ana_list, ana_back, AN_REQ_WAIT_CLI); + ANALYZE (s, req, flt_xfer_data, ana_list, ana_back, AN_REQ_FLT_XFER_DATA); + ANALYZE (s, req, flt_end_analyze, ana_list, ana_back, AN_REQ_FLT_END); + break; + } + } + + rq_prod_last = scf->state; + rq_cons_last = scb->state; + req->flags &= ~CF_WAKE_ONCE; + rqf_last = req->flags; + scf_flags = (scf_flags & ~(SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_ABRT_WANTED)) | (scf->flags & (SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_ABRT_WANTED)); + scb_flags = (scb_flags & ~(SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) | (scb->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)); + + if (((scf->flags ^ scf_flags_ana) & (SC_FL_EOS|SC_FL_ABRT_DONE)) || ((scb->flags ^ scb_flags_ana) & SC_FL_SHUT_DONE)) + goto resync_request; + } + + /* we'll monitor the request analysers while parsing the response, + * because some response analysers may indirectly enable new request + * analysers (eg: HTTP keep-alive). 
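+	 * For instance, once a transaction completes, the HTTP analysers
+	 * typically re-arm request analysers such as AN_REQ_WAIT_HTTP so
+	 * that the next keep-alive request gets parsed.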
+	 */
+	req_ana_back = req->analysers;
+
+ resync_response:
+	/* Analyse response */
+
+	if (((res->flags & ~rpf_last) & CF_MASK_ANALYSER) ||
+	    ((scb->flags ^ scb_flags) & (SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_ABRT_WANTED)) ||
+	    ((scf->flags ^ scf_flags) & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) ||
+	    (res->analysers && (scf->flags & SC_FL_SHUT_DONE)) ||
+	    scf->state != rp_cons_last ||
+	    scb->state != rp_prod_last ||
+	    s->pending_events & TASK_WOKEN_MSG) {
+		unsigned int scb_flags_ana = scb->flags;
+		unsigned int scf_flags_ana = scf->flags;
+
+		if (sc_state_in(scb->state, SC_SB_EST|SC_SB_DIS|SC_SB_CLO)) {
+			int max_loops = global.tune.maxpollevents;
+			unsigned int ana_list;
+			unsigned int ana_back;
+
+			/* it's up to the analysers to disable reading or
+			 * closing. Note: if an analyser disables any of these
+			 * bits, it is responsible for enabling them again when
+			 * it disables itself, so that other analysers are called
+			 * in similar conditions.
+			 */
+			channel_auto_read(res);
+			channel_auto_close(res);
+
+			/* We will call all analysers for which a bit is set in
+			 * res->analysers, following the bit order from LSB
+			 * to MSB. The analysers must remove themselves from
+			 * the list when not needed. Any analyser may return 0
+			 * to break out of the loop, either because of missing
+			 * data to take a decision, or because it decides to
+			 * kill the stream. We loop at least once through each
+			 * analyser, and we may loop again if other analysers
+			 * are added in the middle.
+			 */
+
+			ana_list = ana_back = res->analysers;
+			while (ana_list && max_loops--) {
+				/* Warning! ensure that analysers are always placed in ascending order! */
+				ANALYZE    (s, res, flt_start_analyze,          ana_list, ana_back, AN_RES_FLT_START_FE);
+				ANALYZE    (s, res, flt_start_analyze,          ana_list, ana_back, AN_RES_FLT_START_BE);
+				FLT_ANALYZE(s, res, tcp_inspect_response,       ana_list, ana_back, AN_RES_INSPECT);
+				FLT_ANALYZE(s, res, http_wait_for_response,     ana_list, ana_back, AN_RES_WAIT_HTTP);
+				FLT_ANALYZE(s, res, process_store_rules,        ana_list, ana_back, AN_RES_STORE_RULES);
+				FLT_ANALYZE(s, res, http_process_res_common,    ana_list, ana_back, AN_RES_HTTP_PROCESS_BE, s->be);
+				ANALYZE    (s, res, flt_analyze_http_headers,   ana_list, ana_back, AN_RES_FLT_HTTP_HDRS);
+				ANALYZE    (s, res, http_response_forward_body, ana_list, ana_back, AN_RES_HTTP_XFER_BODY);
+				ANALYZE    (s, res, pcli_wait_for_response,     ana_list, ana_back, AN_RES_WAIT_CLI);
+				ANALYZE    (s, res, flt_xfer_data,              ana_list, ana_back, AN_RES_FLT_XFER_DATA);
+				ANALYZE    (s, res, flt_end_analyze,            ana_list, ana_back, AN_RES_FLT_END);
+				break;
+			}
+		}
+
+		rp_cons_last = scf->state;
+		rp_prod_last = scb->state;
+		res->flags &= ~CF_WAKE_ONCE;
+		rpf_last = res->flags;
+		scb_flags = (scb_flags & ~(SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_ABRT_WANTED)) | (scb->flags & (SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_ABRT_WANTED));
+		scf_flags = (scf_flags & ~(SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) | (scf->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED));
+
+		if (((scb->flags ^ scb_flags_ana) & (SC_FL_EOS|SC_FL_ABRT_DONE)) || ((scf->flags ^ scf_flags_ana) & SC_FL_SHUT_DONE))
+			goto resync_response;
+	}
+
+	/* we'll monitor the response analysers because some response analysers
+	 * may be enabled/disabled later
+	 */
+	res_ana_back = res->analysers;
+
+	/* maybe someone has added some request analysers, so we must check and loop */
+	if (req->analysers & ~req_ana_back)
+		goto resync_request;
+
+	if ((req->flags & ~rqf_last) & CF_MASK_ANALYSER)
+		goto resync_request;
+
+	/* FIXME: here we should call protocol handlers which rely on
+	 * both
buffers. + */ + + + /* + * Now we propagate unhandled errors to the stream. Normally + * we're just in a data phase here since it means we have not + * seen any analyser who could set an error status. + */ + srv = objt_server(s->target); + if (unlikely(!(s->flags & SF_ERR_MASK))) { + if ((scf->flags & SC_FL_ERROR) || req->flags & (CF_READ_TIMEOUT|CF_WRITE_TIMEOUT)) { + /* Report it if the client got an error or a read timeout expired */ + req->analysers &= AN_REQ_FLT_END; + channel_auto_close(req); + if (scf->flags & SC_FL_ERROR) { + _HA_ATOMIC_INC(&s->be->be_counters.cli_aborts); + _HA_ATOMIC_INC(&sess->fe->fe_counters.cli_aborts); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->cli_aborts); + if (srv) + _HA_ATOMIC_INC(&srv->counters.cli_aborts); + s->flags |= SF_ERR_CLICL; + } + else if (req->flags & CF_READ_TIMEOUT) { + _HA_ATOMIC_INC(&s->be->be_counters.cli_aborts); + _HA_ATOMIC_INC(&sess->fe->fe_counters.cli_aborts); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->cli_aborts); + if (srv) + _HA_ATOMIC_INC(&srv->counters.cli_aborts); + s->flags |= SF_ERR_CLITO; + } + else { + _HA_ATOMIC_INC(&s->be->be_counters.srv_aborts); + _HA_ATOMIC_INC(&sess->fe->fe_counters.srv_aborts); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->srv_aborts); + if (srv) + _HA_ATOMIC_INC(&srv->counters.srv_aborts); + s->flags |= SF_ERR_SRVTO; + } + sess_set_term_flags(s); + + /* Abort the request if a client error occurred while + * the backend stream connector is in the SC_ST_INI + * state. It is switched into the SC_ST_CLO state and + * the request channel is erased. */ + if (scb->state == SC_ST_INI) { + s->scb->state = SC_ST_CLO; + channel_abort(req); + if (IS_HTX_STRM(s)) + channel_htx_erase(req, htxbuf(&req->buf)); + else + channel_erase(req); + } + } + else if ((scb->flags & SC_FL_ERROR) || res->flags & (CF_READ_TIMEOUT|CF_WRITE_TIMEOUT)) { + /* Report it if the server got an error or a read timeout expired */ + res->analysers &= AN_RES_FLT_END; + channel_auto_close(res); + if (scb->flags & SC_FL_ERROR) { + _HA_ATOMIC_INC(&s->be->be_counters.srv_aborts); + _HA_ATOMIC_INC(&sess->fe->fe_counters.srv_aborts); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->srv_aborts); + if (srv) + _HA_ATOMIC_INC(&srv->counters.srv_aborts); + s->flags |= SF_ERR_SRVCL; + } + else if (res->flags & CF_READ_TIMEOUT) { + _HA_ATOMIC_INC(&s->be->be_counters.srv_aborts); + _HA_ATOMIC_INC(&sess->fe->fe_counters.srv_aborts); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->srv_aborts); + if (srv) + _HA_ATOMIC_INC(&srv->counters.srv_aborts); + s->flags |= SF_ERR_SRVTO; + } + else { + _HA_ATOMIC_INC(&s->be->be_counters.cli_aborts); + _HA_ATOMIC_INC(&sess->fe->fe_counters.cli_aborts); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->cli_aborts); + if (srv) + _HA_ATOMIC_INC(&srv->counters.cli_aborts); + s->flags |= SF_ERR_CLITO; + } + sess_set_term_flags(s); + } + } + + /* + * Here we take care of forwarding unhandled data. This also includes + * connection establishments and shutdown requests. + */ + + + /* If no one is interested in analysing data, it's time to forward + * everything. We configure the buffer to forward indefinitely. + * Note that we're checking SC_FL_ABRT_WANTED as an indication of a possible + * recent call to channel_abort(). 
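+	 * This is the fast path used e.g. by plain TCP proxying: once the
+	 * channel is switched to CHN_INFINITE_FORWARD below, data moves
+	 * from one side to the other without any further per-chunk
+	 * analyser work.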
+ */ + if (unlikely((!req->analysers || (req->analysers == AN_REQ_FLT_END && !(req->flags & CF_FLT_ANALYZE))) && + !(scf->flags & SC_FL_ABRT_WANTED) && !(scb->flags & SC_FL_SHUT_DONE) && + (sc_state_in(scf->state, SC_SB_EST|SC_SB_DIS|SC_SB_CLO)) && + (req->to_forward != CHN_INFINITE_FORWARD))) { + /* This buffer is freewheeling, there's no analyser + * attached to it. If any data are left in, we'll permit them to + * move. + */ + channel_auto_read(req); + channel_auto_connect(req); + channel_auto_close(req); + + if (IS_HTX_STRM(s)) { + struct htx *htx = htxbuf(&req->buf); + + /* We'll let data flow between the producer (if still connected) + * to the consumer. + */ + co_set_data(req, htx->data); + if ((global.tune.options & GTUNE_USE_FAST_FWD) && + !(scf->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) && !(scb->flags & SC_FL_SHUT_WANTED)) + channel_htx_forward_forever(req, htx); + } + else { + /* We'll let data flow between the producer (if still connected) + * to the consumer (which might possibly not be connected yet). + */ + c_adv(req, ci_data(req)); + if ((global.tune.options & GTUNE_USE_FAST_FWD) && + !(scf->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) && !(scb->flags & SC_FL_SHUT_WANTED)) + channel_forward_forever(req); + } + } + + /* reflect what the L7 analysers have seen last */ + rqf_last = req->flags; + scf_flags = (scf_flags & ~(SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_ABRT_WANTED)) | (scf->flags & (SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_ABRT_WANTED)); + scb_flags = (scb_flags & ~(SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) | (scb->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)); + + /* it's possible that an upper layer has requested a connection setup or abort. + * There are 2 situations where we decide to establish a new connection : + * - there are data scheduled for emission in the buffer + * - the CF_AUTO_CONNECT flag is set (active connection) + */ + if (scb->state == SC_ST_INI) { + if (!(scb->flags & SC_FL_SHUT_DONE)) { + if ((req->flags & CF_AUTO_CONNECT) || co_data(req)) { + /* If we have an appctx, there is no connect method, so we + * immediately switch to the connected state, otherwise we + * perform a connection request. + */ + scb->state = SC_ST_REQ; /* new connection requested */ + s->conn_retries = 0; + if ((s->be->retry_type &~ PR_RE_CONN_FAILED) && + (s->be->mode == PR_MODE_HTTP) && + !(s->txn->flags & TX_D_L7_RETRY)) + s->txn->flags |= TX_L7_RETRY; + + if (s->be->options & PR_O_ABRT_CLOSE) { + struct connection *conn = sc_conn(scf); + + if (conn && conn->mux && conn->mux->ctl) + conn->mux->ctl(conn, MUX_CTL_SUBS_RECV, NULL); + } + } + } + else { + s->scb->state = SC_ST_CLO; /* shutw+ini = abort */ + sc_schedule_shutdown(scb); + sc_schedule_abort(scb); + } + } + + + /* we may have a pending connection request, or a connection waiting + * for completion. + */ + if (sc_state_in(scb->state, SC_SB_REQ|SC_SB_QUE|SC_SB_TAR|SC_SB_ASS)) { + /* prune the request variables and swap to the response variables. */ + if (s->vars_reqres.scope != SCOPE_RES) { + if (!LIST_ISEMPTY(&s->vars_reqres.head)) + vars_prune(&s->vars_reqres, s->sess, s); + vars_init_head(&s->vars_reqres, SCOPE_RES); + } + + do { + /* nb: step 1 might switch from QUE to ASS, but we first want + * to give a chance to step 2 to perform a redirect if needed. + */ + if (scb->state != SC_ST_REQ) + back_try_conn_req(s); + if (scb->state == SC_ST_REQ) + back_handle_st_req(s); + + /* get a chance to complete an immediate connection setup */ + if (scb->state == SC_ST_RDY) + goto resync_stconns; + + /* applets directly go to the ESTABLISHED state. 
Similarly, + * servers experience the same fate when their connection + * is reused. + */ + if (unlikely(scb->state == SC_ST_EST)) + back_establish(s); + + srv = objt_server(s->target); + if (scb->state == SC_ST_ASS && srv && srv->rdr_len && (s->flags & SF_REDIRECTABLE)) + http_perform_server_redirect(s, scb); + } while (scb->state == SC_ST_ASS); + } + + /* Let's see if we can send the pending request now */ + sc_conn_sync_send(scb); + + /* + * Now forward all shutdown requests between both sides of the request buffer + */ + + /* first, let's check if the request buffer needs to shutdown(write), which may + * happen either because the input is closed or because we want to force a close + * once the server has begun to respond. If a half-closed timeout is set, we adjust + * the other side's timeout as well. However this doesn't have effect during the + * connection setup unless the backend has abortonclose set. + */ + if (unlikely((req->flags & CF_AUTO_CLOSE) && (scf->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) && + !(scb->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) && + (scb->state != SC_ST_CON || (s->be->options & PR_O_ABRT_CLOSE)))) { + sc_schedule_shutdown(scb); + } + + /* shutdown(write) pending */ + if (unlikely((scb->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) == SC_FL_SHUT_WANTED && + (!co_data(req) || (req->flags & CF_WRITE_TIMEOUT)))) { + if (scf->flags & SC_FL_ERROR) + scb->flags |= SC_FL_NOLINGER; + sc_shutdown(scb); + } + + /* shutdown(write) done on server side, we must stop the client too */ + if (unlikely((scb->flags & SC_FL_SHUT_DONE) && !(scf->flags & (SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_ABRT_WANTED))) && + !req->analysers) + sc_schedule_abort(scf); + + /* shutdown(read) pending */ + if (unlikely((scf->flags & (SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_ABRT_WANTED)) == SC_FL_ABRT_WANTED)) { + if (scf->flags & SC_FL_NOHALF) + scf->flags |= SC_FL_NOLINGER; + sc_abort(scf); + } + + /* Benchmarks have shown that it's optimal to do a full resync now */ + if (scf->state == SC_ST_DIS || + sc_state_in(scb->state, SC_SB_RDY|SC_SB_DIS) || + ((scf->flags & SC_FL_ERROR) && scf->state != SC_ST_CLO) || + ((scb->flags & SC_FL_ERROR) && scb->state != SC_ST_CLO)) + goto resync_stconns; + + /* otherwise we want to check if we need to resync the req buffer or not */ + if (((scf->flags ^ scf_flags) & (SC_FL_EOS|SC_FL_ABRT_DONE)) || ((scb->flags ^ scb_flags) & SC_FL_SHUT_DONE)) + goto resync_request; + + /* perform output updates to the response buffer */ + + /* If no one is interested in analysing data, it's time to forward + * everything. We configure the buffer to forward indefinitely. + * Note that we're checking SC_FL_ABRT_WANTED as an indication of a possible + * recent call to channel_abort(). + */ + if (unlikely((!res->analysers || (res->analysers == AN_RES_FLT_END && !(res->flags & CF_FLT_ANALYZE))) && + !(scf->flags & SC_FL_ABRT_WANTED) && !(scb->flags & SC_FL_SHUT_WANTED) && + sc_state_in(scb->state, SC_SB_EST|SC_SB_DIS|SC_SB_CLO) && + (res->to_forward != CHN_INFINITE_FORWARD))) { + /* This buffer is freewheeling, there's no analyser + * attached to it. If any data are left in, we'll permit them to + * move. + */ + channel_auto_read(res); + channel_auto_close(res); + + if (IS_HTX_STRM(s)) { + struct htx *htx = htxbuf(&res->buf); + + /* We'll let data flow between the producer (if still connected) + * to the consumer. 
+ */ + co_set_data(res, htx->data); + if ((global.tune.options & GTUNE_USE_FAST_FWD) && + !(scf->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) && !(scb->flags & SC_FL_SHUT_WANTED)) + channel_htx_forward_forever(res, htx); + } + else { + /* We'll let data flow between the producer (if still connected) + * to the consumer. + */ + c_adv(res, ci_data(res)); + if ((global.tune.options & GTUNE_USE_FAST_FWD) && + !(scf->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) && !(scb->flags & SC_FL_SHUT_WANTED)) + channel_forward_forever(res); + } + + /* if we have no analyser anymore in any direction and have a + * tunnel timeout set, use it now. Note that we must respect + * the half-closed timeouts as well. + */ + if (!req->analysers && s->tunnel_timeout) { + scf->ioto = scb->ioto = s->tunnel_timeout; + + if (!IS_HTX_STRM(s)) { + if ((scf->flags & (SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_SHUT_DONE)) && tick_isset(sess->fe->timeout.clientfin)) + scf->ioto = sess->fe->timeout.clientfin; + if ((scb->flags & (SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_SHUT_DONE)) && tick_isset(s->be->timeout.serverfin)) + scb->ioto = s->be->timeout.serverfin; + } + } + } + + /* reflect what the L7 analysers have seen last */ + rpf_last = res->flags; + scb_flags = (scb_flags & ~(SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_ABRT_WANTED)) | (scb->flags & (SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_ABRT_WANTED)); + scf_flags = (scf_flags & ~(SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) | (scf->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)); + + /* Let's see if we can send the pending response now */ + sc_conn_sync_send(scf); + + /* + * Now forward all shutdown requests between both sides of the buffer + */ + + /* + * FIXME: this is probably where we should produce error responses. + */ + + /* first, let's check if the response buffer needs to shutdown(write) */ + if (unlikely((res->flags & CF_AUTO_CLOSE) && (scb->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) && + !(scf->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)))) { + sc_schedule_shutdown(scf); + } + + /* shutdown(write) pending */ + if (unlikely((scf->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) == SC_FL_SHUT_WANTED && + (!co_data(res) || (res->flags & CF_WRITE_TIMEOUT)))) { + sc_shutdown(scf); + } + + /* shutdown(write) done on the client side, we must stop the server too */ + if (unlikely((scf->flags & SC_FL_SHUT_DONE) && !(scb->flags & (SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_ABRT_WANTED))) && + !res->analysers) + sc_schedule_abort(scb); + + /* shutdown(read) pending */ + if (unlikely((scb->flags & (SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_ABRT_WANTED)) == SC_FL_ABRT_WANTED)) { + if (scb->flags & SC_FL_NOHALF) + scb->flags |= SC_FL_NOLINGER; + sc_abort(scb); + } + + if (scf->state == SC_ST_DIS || + sc_state_in(scb->state, SC_SB_RDY|SC_SB_DIS) || + ((scf->flags & SC_FL_ERROR) && scf->state != SC_ST_CLO) || + ((scb->flags & SC_FL_ERROR) && scb->state != SC_ST_CLO)) + goto resync_stconns; + + if ((req->flags & ~rqf_last) & CF_MASK_ANALYSER) + goto resync_request; + + if (((scb->flags ^ scb_flags) & (SC_FL_EOS|SC_FL_ABRT_DONE|SC_FL_ABRT_WANTED)) || + ((scf->flags ^ scf_flags) & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) || + (res->analysers ^ res_ana_back)) + goto resync_response; + + if ((((req->flags ^ rqf_last) | (res->flags ^ rpf_last)) & CF_MASK_ANALYSER) || + (req->analysers ^ req_ana_back)) + goto resync_request; + + /* we're interested in getting wakeups again */ + scf->flags &= ~SC_FL_DONT_WAKE; + scb->flags &= ~SC_FL_DONT_WAKE; + + if (likely((scf->state != SC_ST_CLO) || !sc_state_in(scb->state, SC_SB_INI|SC_SB_CLO) || + (req->analysers & AN_REQ_FLT_END) || 
(res->analysers & AN_RES_FLT_END))) {
+		if ((sess->fe->options & PR_O_CONTSTATS) && (s->flags & SF_BE_ASSIGNED) && !(s->flags & SF_IGNORE))
+			stream_process_counters(s);
+
+		stream_update_both_sc(s);
+
+		/* Reset pending events now */
+		s->pending_events = 0;
+
+	update_exp_and_leave:
+		/* Note: please ensure that if you branch here you disable SC_FL_DONT_WAKE */
+		if (!req->analysers)
+			req->analyse_exp = TICK_ETERNITY;
+		if (!res->analysers)
+			res->analyse_exp = TICK_ETERNITY;
+
+		if ((sess->fe->options & PR_O_CONTSTATS) && (s->flags & SF_BE_ASSIGNED) &&
+		    (!tick_isset(req->analyse_exp) || tick_is_expired(req->analyse_exp, now_ms)))
+			req->analyse_exp = tick_add(now_ms, 5000);
+
+		t->expire = (tick_is_expired(t->expire, now_ms) ? 0 : t->expire);
+		t->expire = tick_first(t->expire, sc_ep_rcv_ex(scf));
+		t->expire = tick_first(t->expire, sc_ep_snd_ex(scf));
+		t->expire = tick_first(t->expire, sc_ep_rcv_ex(scb));
+		t->expire = tick_first(t->expire, sc_ep_snd_ex(scb));
+		t->expire = tick_first(t->expire, req->analyse_exp);
+		t->expire = tick_first(t->expire, res->analyse_exp);
+		t->expire = tick_first(t->expire, s->conn_exp);
+
+		if (unlikely(tick_is_expired(t->expire, now_ms))) {
+			/* Some events prevented the timeouts from being handled
+			 * but nothing evolved. So do it now and resync the
+			 * stconns.
+			 */
+			stream_handle_timeouts(s);
+			goto resync_stconns;
+		}
+
+		s->pending_events &= ~(TASK_WOKEN_TIMER | TASK_WOKEN_RES);
+		stream_release_buffers(s);
+
+		DBG_TRACE_DEVEL("queuing", STRM_EV_STRM_PROC, s);
+		return t; /* nothing more to do */
+	}
+
+	DBG_TRACE_DEVEL("releasing", STRM_EV_STRM_PROC, s);
+
+	if (s->flags & SF_BE_ASSIGNED)
+		_HA_ATOMIC_DEC(&s->be->beconn);
+
+	if (unlikely((global.mode & MODE_DEBUG) &&
+		     (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)))) {
+		chunk_printf(&trash, "%08x:%s.closed[%04x:%04x]\n",
+			     s->uniq_id, s->be->id,
+			     (unsigned short)conn_fd(sc_conn(scf)),
+			     (unsigned short)conn_fd(sc_conn(scb)));
+		DISGUISE(write(1, trash.area, trash.data));
+	}
+
+	if (!(s->flags & SF_IGNORE)) {
+		s->logs.t_close = ns_to_ms(now_ns - s->logs.accept_ts);
+
+		stream_process_counters(s);
+
+		if (s->txn && s->txn->status) {
+			int n;
+
+			n = s->txn->status / 100;
+			if (n < 1 || n > 5)
+				n = 0;
+
+			if (sess->fe->mode == PR_MODE_HTTP) {
+				_HA_ATOMIC_INC(&sess->fe->fe_counters.p.http.rsp[n]);
+			}
+			if ((s->flags & SF_BE_ASSIGNED) &&
+			    (s->be->mode == PR_MODE_HTTP)) {
+				_HA_ATOMIC_INC(&s->be->be_counters.p.http.rsp[n]);
+				_HA_ATOMIC_INC(&s->be->be_counters.p.http.cum_req);
+			}
+		}
+
+		/* let's do a final log if we need it */
+		if (!LIST_ISEMPTY(&sess->fe->logformat) && s->logs.logwait &&
+		    !(s->flags & SF_MONITOR) &&
+		    (!(sess->fe->options & PR_O_NULLNOLOG) || req->total)) {
+			/* we may need to know the position in the queue */
+			pendconn_free(s);
+
+			stream_cond_update_cpu_usage(s);
+			s->do_log(s);
+		}
+
+		/* update time stats for this stream */
+		stream_update_time_stats(s);
+	}
+
+	/* the task MUST not be in the run queue anymore */
+	stream_free(s);
+	task_destroy(t);
+	return NULL;
+}
+
+/* Update the stream's backend and server time stats */
+void stream_update_time_stats(struct stream *s)
+{
+	int t_request;
+	int t_queue;
+	int t_connect;
+	int t_data;
+	int t_close;
+	struct server *srv;
+	unsigned int samples_window;
+
+	t_request = 0;
+	t_queue   = s->logs.t_queue;
+	t_connect = s->logs.t_connect;
+	t_close   = s->logs.t_close;
+	t_data    = s->logs.t_data;
+
+	if (s->be->mode != PR_MODE_HTTP)
+		t_data = t_connect;
+
+	if (t_connect < 0 || t_data < 0)
+		return;
+
+	if ((llong)(s->logs.request_ts - s->logs.accept_ts) >= 0)
+		t_request = ns_to_ms(s->logs.request_ts - s->logs.accept_ts);
+
+	t_data    -= t_connect;
+	t_connect -= t_queue;
+	t_queue   -= t_request;
+
+	srv = objt_server(s->target);
+	if (srv) {
+		samples_window = (((s->be->mode == PR_MODE_HTTP) ?
+			srv->counters.p.http.cum_req : srv->counters.cum_lbconn) > TIME_STATS_SAMPLES) ? TIME_STATS_SAMPLES : 0;
+		swrate_add_dynamic(&srv->counters.q_time, samples_window, t_queue);
+		swrate_add_dynamic(&srv->counters.c_time, samples_window, t_connect);
+		swrate_add_dynamic(&srv->counters.d_time, samples_window, t_data);
+		swrate_add_dynamic(&srv->counters.t_time, samples_window, t_close);
+		HA_ATOMIC_UPDATE_MAX(&srv->counters.qtime_max, t_queue);
+		HA_ATOMIC_UPDATE_MAX(&srv->counters.ctime_max, t_connect);
+		HA_ATOMIC_UPDATE_MAX(&srv->counters.dtime_max, t_data);
+		HA_ATOMIC_UPDATE_MAX(&srv->counters.ttime_max, t_close);
+	}
+	samples_window = (((s->be->mode == PR_MODE_HTTP) ?
+		s->be->be_counters.p.http.cum_req : s->be->be_counters.cum_lbconn) > TIME_STATS_SAMPLES) ? TIME_STATS_SAMPLES : 0;
+	swrate_add_dynamic(&s->be->be_counters.q_time, samples_window, t_queue);
+	swrate_add_dynamic(&s->be->be_counters.c_time, samples_window, t_connect);
+	swrate_add_dynamic(&s->be->be_counters.d_time, samples_window, t_data);
+	swrate_add_dynamic(&s->be->be_counters.t_time, samples_window, t_close);
+	HA_ATOMIC_UPDATE_MAX(&s->be->be_counters.qtime_max, t_queue);
+	HA_ATOMIC_UPDATE_MAX(&s->be->be_counters.ctime_max, t_connect);
+	HA_ATOMIC_UPDATE_MAX(&s->be->be_counters.dtime_max, t_data);
+	HA_ATOMIC_UPDATE_MAX(&s->be->be_counters.ttime_max, t_close);
+}
+
+/*
+ * This function adjusts sess->srv_conn and maintains the previous and new
+ * server's served stream counts. Setting newsrv to NULL is enough to release
+ * current connection slot. This function also notifies any LB algo which might
+ * expect to be informed about any change in the number of active streams on a
+ * server.
+ */
+void sess_change_server(struct stream *strm, struct server *newsrv)
+{
+	struct server *oldsrv = strm->srv_conn;
+
+	if (oldsrv == newsrv)
+		return;
+
+	if (oldsrv) {
+		_HA_ATOMIC_DEC(&oldsrv->served);
+		_HA_ATOMIC_DEC(&oldsrv->proxy->served);
+		__ha_barrier_atomic_store();
+		if (oldsrv->proxy->lbprm.server_drop_conn)
+			oldsrv->proxy->lbprm.server_drop_conn(oldsrv);
+		stream_del_srv_conn(strm);
+	}
+
+	if (newsrv) {
+		_HA_ATOMIC_INC(&newsrv->served);
+		_HA_ATOMIC_INC(&newsrv->proxy->served);
+		__ha_barrier_atomic_store();
+		if (newsrv->proxy->lbprm.server_take_conn)
+			newsrv->proxy->lbprm.server_take_conn(newsrv);
+		stream_add_srv_conn(strm, newsrv);
+	}
+}
+
+/* Handle server-side errors for default protocols. It is called whenever a
+ * connection setup is aborted or a request is aborted in queue. It sets the
+ * stream termination flags so that the caller does not have to worry about
+ * them. It's installed as ->srv_error for the server-side stream connector.
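+ * For example a connect timeout (STRM_ET_CONN_TO) maps to
+ * SF_ERR_SRVTO/SF_FINST_C, which shows up as "sC" in the termination
+ * flags of the logs.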
+ */
+void default_srv_error(struct stream *s, struct stconn *sc)
+{
+	int err_type = s->conn_err_type;
+	int err = 0, fin = 0;
+
+	if (err_type & STRM_ET_QUEUE_ABRT) {
+		err = SF_ERR_CLICL;
+		fin = SF_FINST_Q;
+	}
+	else if (err_type & STRM_ET_CONN_ABRT) {
+		err = SF_ERR_CLICL;
+		fin = SF_FINST_C;
+	}
+	else if (err_type & STRM_ET_QUEUE_TO) {
+		err = SF_ERR_SRVTO;
+		fin = SF_FINST_Q;
+	}
+	else if (err_type & STRM_ET_QUEUE_ERR) {
+		err = SF_ERR_SRVCL;
+		fin = SF_FINST_Q;
+	}
+	else if (err_type & STRM_ET_CONN_TO) {
+		err = SF_ERR_SRVTO;
+		fin = SF_FINST_C;
+	}
+	else if (err_type & STRM_ET_CONN_ERR) {
+		err = SF_ERR_SRVCL;
+		fin = SF_FINST_C;
+	}
+	else if (err_type & STRM_ET_CONN_RES) {
+		err = SF_ERR_RESOURCE;
+		fin = SF_FINST_C;
+	}
+	else /* STRM_ET_CONN_OTHER and others */ {
+		err = SF_ERR_INTERNAL;
+		fin = SF_FINST_C;
+	}
+
+	if (!(s->flags & SF_ERR_MASK))
+		s->flags |= err;
+	if (!(s->flags & SF_FINST_MASK))
+		s->flags |= fin;
+}
+
+/* kill a stream and set the termination flags to <why> (one of SF_ERR_*) */
+void stream_shutdown(struct stream *stream, int why)
+{
+	if (stream->scb->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED))
+		return;
+
+	sc_schedule_shutdown(stream->scb);
+	sc_schedule_abort(stream->scb);
+	stream->task->nice = 1024;
+	if (!(stream->flags & SF_ERR_MASK))
+		stream->flags |= why;
+	task_wakeup(stream->task, TASK_WOKEN_OTHER);
+}
+
+/* dumps an error message for the object <obj> (a stream, or an appctx attached
+ * to a stream), having reached loop rate <rate>, then aborts hoping to
+ * retrieve a core.
+ */
+void stream_dump_and_crash(enum obj_type *obj, int rate)
+{
+	struct stream *s;
+	char *msg = NULL;
+	const void *ptr;
+
+	ptr = s = objt_stream(obj);
+	if (!s) {
+		const struct appctx *appctx = objt_appctx(obj);
+		if (!appctx)
+			return;
+		ptr = appctx;
+		s = appctx_strm(appctx);
+		if (!s)
+			return;
+	}
+
+	chunk_reset(&trash);
+	chunk_printf(&trash, " ");
+	strm_dump_to_buffer(&trash, s, " ", HA_ATOMIC_LOAD(&global.anon_key));
+
+	if (ptr != s) { // that's an appctx
+		const struct appctx *appctx = ptr;
+
+		chunk_appendf(&trash, " applet=%p(", appctx->applet);
+		resolve_sym_name(&trash, NULL, appctx->applet);
+		chunk_appendf(&trash, ")");
+
+		chunk_appendf(&trash, " handler=%p(", appctx->applet->fct);
+		resolve_sym_name(&trash, NULL, appctx->applet->fct);
+		chunk_appendf(&trash, ")");
+	}
+
+	memprintf(&msg,
+	          "A bogus %s [%p] is spinning at %d calls per second and refuses to die, "
+	          "aborting now! Please report this error to developers:\n"
+	          "%s\n",
+	          obj_type_name(obj), ptr, rate, trash.area);
+
+	ha_alert("%s", msg);
+	send_log(NULL, LOG_EMERG, "%s", msg);
+	ABORT_NOW();
+}
+
+/* initialize the required structures */
+static void init_stream()
+{
+	int thr;
+
+	for (thr = 0; thr < MAX_THREADS; thr++)
+		LIST_INIT(&ha_thread_ctx[thr].streams);
+}
+INITCALL0(STG_INIT, init_stream);
+
+/* Generates a unique ID based on the given <format>, stores it in the given <strm> and
+ * returns the unique ID.
+ *
+ * If this function fails to allocate memory IST_NULL is returned.
+ *
+ * If an ID is already stored within the stream, nothing happens and the
+ * existing unique ID is returned.
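+ *
+ * The resulting ID is typically the one logged via the %ID log-format tag
+ * or sent in the header configured with "unique-id-header".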
+ */
+struct ist stream_generate_unique_id(struct stream *strm, struct list *format)
+{
+	if (isttest(strm->unique_id)) {
+		return strm->unique_id;
+	}
+	else {
+		char *unique_id;
+		int length;
+		if ((unique_id = pool_alloc(pool_head_uniqueid)) == NULL)
+			return IST_NULL;
+
+		length = build_logline(strm, unique_id, UNIQUEID_LEN, format);
+		strm->unique_id = ist2(unique_id, length);
+
+		return strm->unique_id;
+	}
+}
+
+/************************************************************************/
+/*           All supported ACL keywords must be declared here.          */
+/************************************************************************/
+static enum act_return stream_action_set_log_level(struct act_rule *rule, struct proxy *px,
+						   struct session *sess, struct stream *s, int flags)
+{
+	s->logs.level = (uintptr_t)rule->arg.act.p[0];
+	return ACT_RET_CONT;
+}
+
+
+/* Parse a "set-log-level" action. It takes the level value as argument. It
+ * returns ACT_RET_PRS_OK on success, ACT_RET_PRS_ERR on error.
+ */
+static enum act_parse_ret stream_parse_set_log_level(const char **args, int *cur_arg, struct proxy *px,
+						     struct act_rule *rule, char **err)
+{
+	int level;
+
+	if (!*args[*cur_arg]) {
+	  bad_log_level:
+		memprintf(err, "expects exactly 1 argument (log level name or 'silent')");
+		return ACT_RET_PRS_ERR;
+	}
+	if (strcmp(args[*cur_arg], "silent") == 0)
+		level = -1;
+	else if ((level = get_log_level(args[*cur_arg]) + 1) == 0)
+		goto bad_log_level;
+
+	(*cur_arg)++;
+
+	/* Register processing function. */
+	rule->action_ptr = stream_action_set_log_level;
+	rule->action = ACT_CUSTOM;
+	rule->arg.act.p[0] = (void *)(uintptr_t)level;
+	return ACT_RET_PRS_OK;
+}
+
+static enum act_return stream_action_set_nice(struct act_rule *rule, struct proxy *px,
+					      struct session *sess, struct stream *s, int flags)
+{
+	s->task->nice = (uintptr_t)rule->arg.act.p[0];
+	return ACT_RET_CONT;
+}
+
+
+/* Parse a "set-nice" action. It takes the nice value as argument. It returns
+ * ACT_RET_PRS_OK on success, ACT_RET_PRS_ERR on error.
+ */
+static enum act_parse_ret stream_parse_set_nice(const char **args, int *cur_arg, struct proxy *px,
+						struct act_rule *rule, char **err)
+{
+	int nice;
+
+	if (!*args[*cur_arg]) {
+		memprintf(err, "expects exactly 1 argument (integer value)");
+		return ACT_RET_PRS_ERR;
+	}
+
+	nice = atoi(args[*cur_arg]);
+	if (nice < -1024)
+		nice = -1024;
+	else if (nice > 1024)
+		nice = 1024;
+
+	(*cur_arg)++;
+
+	/* Register processing function. */
+	rule->action_ptr = stream_action_set_nice;
+	rule->action = ACT_CUSTOM;
+	rule->arg.act.p[0] = (void *)(uintptr_t)nice;
+	return ACT_RET_PRS_OK;
+}
+
+
+static enum act_return tcp_action_switch_stream_mode(struct act_rule *rule, struct proxy *px,
+						     struct session *sess, struct stream *s, int flags)
+{
+	enum pr_mode mode = (uintptr_t)rule->arg.act.p[0];
+	const struct mux_proto_list *mux_proto = rule->arg.act.p[1];
+
+	if (!IS_HTX_STRM(s) && mode == PR_MODE_HTTP) {
+		if (!stream_set_http_mode(s, mux_proto)) {
+			stream_abort(s);
+			return ACT_RET_ABRT;
+		}
+	}
+	return ACT_RET_STOP;
+}
+
+
+static int check_tcp_switch_stream_mode(struct act_rule *rule, struct proxy *px, char **err)
+{
+	const struct mux_proto_list *mux_ent;
+	const struct mux_proto_list *mux_proto = rule->arg.act.p[1];
+	enum pr_mode pr_mode = (uintptr_t)rule->arg.act.p[0];
+	enum proto_proxy_mode mode = conn_pr_mode_to_proto_mode(pr_mode);
+
+	if (pr_mode == PR_MODE_HTTP)
+		px->options |= PR_O_HTTP_UPG;
+
+	if (mux_proto) {
+		mux_ent = conn_get_best_mux_entry(mux_proto->token, PROTO_SIDE_FE, mode);
+		if (!mux_ent || !isteq(mux_ent->token, mux_proto->token)) {
+			memprintf(err, "MUX protocol '%.*s' is not compatible with the selected mode",
+				  (int)mux_proto->token.len, mux_proto->token.ptr);
+			return 0;
+		}
+	}
+	else {
+		mux_ent = conn_get_best_mux_entry(IST_NULL, PROTO_SIDE_FE, mode);
+		if (!mux_ent) {
+			memprintf(err, "Unable to find compatible MUX protocol with the selected mode");
+			return 0;
+		}
+	}
+
+	/* Update the mux */
+	rule->arg.act.p[1] = (void *)mux_ent;
+	return 1;
+}
+
+static enum act_parse_ret stream_parse_switch_mode(const char **args, int *cur_arg,
+						   struct proxy *px, struct act_rule *rule,
+						   char **err)
+{
+	const struct mux_proto_list *mux_proto = NULL;
+	struct ist proto;
+	enum pr_mode mode;
+
+	/* must have at least the mode */
+	if (*(args[*cur_arg]) == 0) {
+		memprintf(err, "'%s %s' expects a mode as argument.", args[0], args[*cur_arg-1]);
+		return ACT_RET_PRS_ERR;
+	}
+
+	if (!(px->cap & PR_CAP_FE)) {
+		memprintf(err, "'%s %s' not allowed because %s '%s' has no frontend capability",
+			  args[0], args[*cur_arg-1], proxy_type_str(px), px->id);
+		return ACT_RET_PRS_ERR;
+	}
+	/* Check the mode. For now "tcp" is disabled because downgrade is not
+	 * supported and PT is the only TCP mux.
+	 */
+	if (strcmp(args[*cur_arg], "http") == 0)
+		mode = PR_MODE_HTTP;
+	else {
+		memprintf(err, "'%s %s' expects a valid mode (got '%s').", args[0], args[*cur_arg-1], args[*cur_arg]);
+		return ACT_RET_PRS_ERR;
+	}
+
+	/* check the proto, if specified */
+	if (*(args[*cur_arg+1]) && strcmp(args[*cur_arg+1], "proto") == 0) {
+		if (*(args[*cur_arg+2]) == 0) {
+			memprintf(err, "'%s %s': '%s' expects a protocol as argument.",
+				  args[0], args[*cur_arg-1], args[*cur_arg+1]);
+			return ACT_RET_PRS_ERR;
+		}
+
+		proto = ist(args[*cur_arg + 2]);
+		mux_proto = get_mux_proto(proto);
+		if (!mux_proto) {
+			memprintf(err, "'%s %s': '%s' expects a valid MUX protocol, if specified (got '%s')",
+				  args[0], args[*cur_arg-1], args[*cur_arg+1], args[*cur_arg+2]);
+			return ACT_RET_PRS_ERR;
+		}
+		*cur_arg += 2;
+	}
+
+	(*cur_arg)++;
+
+	/* Register processing function. */
+	rule->action_ptr = tcp_action_switch_stream_mode;
+	rule->check_ptr = check_tcp_switch_stream_mode;
+	rule->action = ACT_CUSTOM;
+	rule->arg.act.p[0] = (void *)(uintptr_t)mode;
+	rule->arg.act.p[1] = (void *)mux_proto;
+	return ACT_RET_PRS_OK;
+}
+
+/* 0=OK, <0=Alert, >0=Warning */
+static enum act_parse_ret stream_parse_use_service(const char **args, int *cur_arg,
+						   struct proxy *px, struct act_rule *rule,
+						   char **err)
+{
+	struct action_kw *kw;
+
+	/* Check if the service name exists. */
+	if (*(args[*cur_arg]) == 0) {
+		memprintf(err, "'%s' expects a service name.", args[0]);
+		return ACT_RET_PRS_ERR;
+	}
+
+	/* lookup for keyword corresponding to a service. */
+	kw = action_lookup(&service_keywords, args[*cur_arg]);
+	if (!kw) {
+		memprintf(err, "'%s' unknown service name.", args[1]);
+		return ACT_RET_PRS_ERR;
+	}
+	(*cur_arg)++;
+
+	/* executes specific rule parser. */
+	rule->kw = kw;
+	if (kw->parse((const char **)args, cur_arg, px, rule, err) == ACT_RET_PRS_ERR)
+		return ACT_RET_PRS_ERR;
+
+	/* Register processing function. */
+	rule->action_ptr = process_use_service;
+	rule->action = ACT_CUSTOM;
+
+	return ACT_RET_PRS_OK;
+}
+
+void service_keywords_register(struct action_kw_list *kw_list)
+{
+	LIST_APPEND(&service_keywords, &kw_list->list);
+}
+
+struct action_kw *service_find(const char *kw)
+{
+	return action_lookup(&service_keywords, kw);
+}
+
+/* Lists the known services on <out>. If <out> is null, they are emitted on
+ * stdout one per line.
+ */
+void list_services(FILE *out)
+{
+	const struct action_kw *akwp, *akwn;
+	struct action_kw_list *kw_list;
+	int found = 0;
+	int i;
+
+	if (out)
+		fprintf(out, "Available services :");
+
+	for (akwn = akwp = NULL;; akwp = akwn) {
+		list_for_each_entry(kw_list, &service_keywords, list) {
+			for (i = 0; kw_list->kw[i].kw != NULL; i++) {
+				if (strordered(akwp ? akwp->kw : NULL,
+					       kw_list->kw[i].kw,
+					       akwn != akwp ? akwn->kw : NULL)) {
+					akwn = &kw_list->kw[i];
+					found = 1;
+				}
+			}
+		}
+		if (akwn == akwp)
+			break;
+		if (out)
+			fprintf(out, " %s", akwn->kw);
+		else
+			printf("%s\n", akwn->kw);
+	}
+	if (!found && out)
+		fprintf(out, " none\n");
+}
+
+/* appctx context used by the "show sess" command */
+/* flags used for show_sess_ctx.flags */
+#define CLI_SHOWSESS_F_SUSP  0x00000001   /* show only suspicious streams */
+
+struct show_sess_ctx {
+	struct bref bref;	/* back-reference from the session being dumped */
+	void *target;		/* session we want to dump, or NULL for all */
+	unsigned int thr;	/* the thread number being explored (0..MAX_THREADS-1) */
+	unsigned int uid;	/* if non-null, the uniq_id of the session being dumped */
+	unsigned int min_age;	/* minimum age of streams to dump */
+	unsigned int flags;	/* CLI_SHOWSESS_* */
+	int section;		/* section of the session being dumped */
+	int pos;		/* last position of the current session's buffer */
+};
+
+/* This function appends a complete dump of a stream state onto the buffer,
+ * possibly anonymizing using the specified anon_key. The caller is responsible
+ * for ensuring that enough room remains in the buffer to dump a complete
+ * stream at once. Each new output line will be prefixed with <pfx> if non-null,
+ * which is used to preserve indenting.
+ */
+void strm_dump_to_buffer(struct buffer *buf, const struct stream *strm, const char *pfx, uint32_t anon_key)
+{
+	struct stconn *scf, *scb;
+	struct tm tm;
+	extern const char *monthname[12];
+	char pn[INET6_ADDRSTRLEN];
+	struct connection *conn;
+	struct appctx *tmpctx;
+
+	pfx = pfx ?
pfx : ""; + + get_localtime(strm->logs.accept_date.tv_sec, &tm); + chunk_appendf(buf, + "%p: [%02d/%s/%04d:%02d:%02d:%02d.%06d] id=%u proto=%s", + strm, + tm.tm_mday, monthname[tm.tm_mon], tm.tm_year+1900, + tm.tm_hour, tm.tm_min, tm.tm_sec, (int)(strm->logs.accept_date.tv_usec), + strm->uniq_id, + strm_li(strm) ? strm_li(strm)->rx.proto->name : "?"); + + conn = objt_conn(strm_orig(strm)); + switch (conn && conn_get_src(conn) ? addr_to_str(conn->src, pn, sizeof(pn)) : AF_UNSPEC) { + case AF_INET: + case AF_INET6: + chunk_appendf(buf, " source=%s:%d\n", + HA_ANON_STR(anon_key, pn), get_host_port(conn->src)); + break; + case AF_UNIX: + chunk_appendf(buf, " source=unix:%d\n", strm_li(strm)->luid); + break; + default: + /* no more information to print right now */ + chunk_appendf(buf, "\n"); + break; + } + + chunk_appendf(buf, + "%s flags=0x%x, conn_retries=%d, conn_exp=%s conn_et=0x%03x srv_conn=%p, pend_pos=%p waiting=%d epoch=%#x\n", pfx, + strm->flags, strm->conn_retries, + strm->conn_exp ? + tick_is_expired(strm->conn_exp, now_ms) ? "<PAST>" : + human_time(TICKS_TO_MS(strm->conn_exp - now_ms), + TICKS_TO_MS(1000)) : "<NEVER>", + strm->conn_err_type, strm->srv_conn, strm->pend_pos, + LIST_INLIST(&strm->buffer_wait.list), strm->stream_epoch); + + chunk_appendf(buf, + "%s frontend=%s (id=%u mode=%s), listener=%s (id=%u)", pfx, + HA_ANON_STR(anon_key, strm_fe(strm)->id), strm_fe(strm)->uuid, proxy_mode_str(strm_fe(strm)->mode), + strm_li(strm) ? strm_li(strm)->name ? strm_li(strm)->name : "?" : "?", + strm_li(strm) ? strm_li(strm)->luid : 0); + + switch (conn && conn_get_dst(conn) ? addr_to_str(conn->dst, pn, sizeof(pn)) : AF_UNSPEC) { + case AF_INET: + case AF_INET6: + chunk_appendf(buf, " addr=%s:%d\n", + HA_ANON_STR(anon_key, pn), get_host_port(conn->dst)); + break; + case AF_UNIX: + chunk_appendf(buf, " addr=unix:%d\n", strm_li(strm)->luid); + break; + default: + /* no more information to print right now */ + chunk_appendf(buf, "\n"); + break; + } + + if (strm->be->cap & PR_CAP_BE) + chunk_appendf(buf, + "%s backend=%s (id=%u mode=%s)", pfx, + HA_ANON_STR(anon_key, strm->be->id), + strm->be->uuid, proxy_mode_str(strm->be->mode)); + else + chunk_appendf(buf, "%s backend=<NONE> (id=-1 mode=-)", pfx); + + conn = sc_conn(strm->scb); + switch (conn && conn_get_src(conn) ? addr_to_str(conn->src, pn, sizeof(pn)) : AF_UNSPEC) { + case AF_INET: + case AF_INET6: + chunk_appendf(buf, " addr=%s:%d\n", + HA_ANON_STR(anon_key, pn), get_host_port(conn->src)); + break; + case AF_UNIX: + chunk_appendf(buf, " addr=unix\n"); + break; + default: + /* no more information to print right now */ + chunk_appendf(buf, "\n"); + break; + } + + if (strm->be->cap & PR_CAP_BE) + chunk_appendf(buf, + "%s server=%s (id=%u)", pfx, + objt_server(strm->target) ? HA_ANON_STR(anon_key, __objt_server(strm->target)->id) : "<none>", + objt_server(strm->target) ? __objt_server(strm->target)->puid : 0); + else + chunk_appendf(buf, "%s server=<NONE> (id=-1)", pfx); + + switch (conn && conn_get_dst(conn) ? 
addr_to_str(conn->dst, pn, sizeof(pn)) : AF_UNSPEC) { + case AF_INET: + case AF_INET6: + chunk_appendf(buf, " addr=%s:%d\n", + HA_ANON_STR(anon_key, pn), get_host_port(conn->dst)); + break; + case AF_UNIX: + chunk_appendf(buf, " addr=unix\n"); + break; + default: + /* no more information to print right now */ + chunk_appendf(buf, "\n"); + break; + } + + chunk_appendf(buf, + "%s task=%p (state=0x%02x nice=%d calls=%u rate=%u exp=%s tid=%d(%d/%d)%s", pfx, + strm->task, + strm->task->state, + strm->task->nice, strm->task->calls, read_freq_ctr(&strm->call_rate), + strm->task->expire ? + tick_is_expired(strm->task->expire, now_ms) ? "<PAST>" : + human_time(TICKS_TO_MS(strm->task->expire - now_ms), + TICKS_TO_MS(1000)) : "<NEVER>", + strm->task->tid, + ha_thread_info[strm->task->tid].tgid, + ha_thread_info[strm->task->tid].ltid, + task_in_rq(strm->task) ? ", running" : ""); + + chunk_appendf(buf, + " age=%s)\n", + human_time(ns_to_sec(now_ns) - ns_to_sec(strm->logs.request_ts), 1)); + + if (strm->txn) + chunk_appendf(buf, + "%s txn=%p flags=0x%x meth=%d status=%d req.st=%s rsp.st=%s req.f=0x%02x rsp.f=0x%02x\n", pfx, + strm->txn, strm->txn->flags, strm->txn->meth, strm->txn->status, + h1_msg_state_str(strm->txn->req.msg_state), h1_msg_state_str(strm->txn->rsp.msg_state), + strm->txn->req.flags, strm->txn->rsp.flags); + + scf = strm->scf; + chunk_appendf(buf, "%s scf=%p flags=0x%08x ioto=%s state=%s endp=%s,%p,0x%08x sub=%d", pfx, + scf, scf->flags, human_time(scf->ioto, TICKS_TO_MS(1000)), sc_state_str(scf->state), + (sc_ep_test(scf, SE_FL_T_MUX) ? "CONN" : (sc_ep_test(scf, SE_FL_T_APPLET) ? "APPCTX" : "NONE")), + scf->sedesc->se, sc_ep_get(scf), scf->wait_event.events); + chunk_appendf(buf, " rex=%s", + sc_ep_rcv_ex(scf) ? human_time(TICKS_TO_MS(sc_ep_rcv_ex(scf) - now_ms), TICKS_TO_MS(1000)) : "<NEVER>"); + chunk_appendf(buf, " wex=%s", + sc_ep_snd_ex(scf) ? human_time(TICKS_TO_MS(sc_ep_snd_ex(scf) - now_ms), TICKS_TO_MS(1000)) : "<NEVER>"); + chunk_appendf(buf, " rto=%s", + tick_isset(scf->sedesc->lra) ? human_time(TICKS_TO_MS(tick_add(scf->sedesc->lra, scf->ioto) - now_ms), TICKS_TO_MS(1000)) : "<NEVER>"); + chunk_appendf(buf, " wto=%s\n", + tick_isset(scf->sedesc->fsb) ? human_time(TICKS_TO_MS(tick_add(scf->sedesc->fsb, scf->ioto) - now_ms), TICKS_TO_MS(1000)) : "<NEVER>"); + + chunk_appendf(&trash, "%s iobuf.flags=0x%08x .pipe=%d .buf=%u@%p+%u/%u\n", pfx, + scf->sedesc->iobuf.flags, + scf->sedesc->iobuf.pipe ? scf->sedesc->iobuf.pipe->data : 0, + scf->sedesc->iobuf.buf ? (unsigned int)b_data(scf->sedesc->iobuf.buf): 0, + scf->sedesc->iobuf.buf ? b_orig(scf->sedesc->iobuf.buf): NULL, + scf->sedesc->iobuf.buf ? (unsigned int)b_head_ofs(scf->sedesc->iobuf.buf): 0, + scf->sedesc->iobuf.buf ? (unsigned int)b_size(scf->sedesc->iobuf.buf): 0); + + if ((conn = sc_conn(scf)) != NULL) { + if (conn->mux && conn->mux->show_sd) { + char muxpfx[100] = ""; + + snprintf(muxpfx, sizeof(muxpfx), "%s ", pfx); + chunk_appendf(buf, "%s ", pfx); + conn->mux->show_sd(buf, scf->sedesc, muxpfx); + chunk_appendf(buf, "\n"); + } + + chunk_appendf(buf, + "%s co0=%p ctrl=%s xprt=%s mux=%s data=%s target=%s:%p\n", pfx, + conn, + conn_get_ctrl_name(conn), + conn_get_xprt_name(conn), + conn_get_mux_name(conn), + sc_get_data_name(scf), + obj_type_name(conn->target), + obj_base_ptr(conn->target)); + + chunk_appendf(buf, + "%s flags=0x%08x fd=%d fd.state=%02x updt=%d fd.tmask=0x%lx\n", pfx, + conn->flags, + conn_fd(conn), + conn_fd(conn) >= 0 ? fdtab[conn->handle.fd].state : 0, + conn_fd(conn) >= 0 ? 
!!(fdtab[conn->handle.fd].update_mask & ti->ltid_bit) : 0, + conn_fd(conn) >= 0 ? fdtab[conn->handle.fd].thread_mask: 0); + } + else if ((tmpctx = sc_appctx(scf)) != NULL) { + chunk_appendf(buf, + "%s app0=%p st0=%d st1=%d applet=%s tid=%d nice=%d calls=%u rate=%u\n", pfx, + tmpctx, + tmpctx->st0, + tmpctx->st1, + tmpctx->applet->name, + tmpctx->t->tid, + tmpctx->t->nice, tmpctx->t->calls, read_freq_ctr(&tmpctx->call_rate)); + } + + scb = strm->scb; + chunk_appendf(buf, "%s scb=%p flags=0x%08x ioto=%s state=%s endp=%s,%p,0x%08x sub=%d", pfx, + scb, scb->flags, human_time(scb->ioto, TICKS_TO_MS(1000)), sc_state_str(scb->state), + (sc_ep_test(scb, SE_FL_T_MUX) ? "CONN" : (sc_ep_test(scb, SE_FL_T_APPLET) ? "APPCTX" : "NONE")), + scb->sedesc->se, sc_ep_get(scb), scb->wait_event.events); + chunk_appendf(buf, " rex=%s", + sc_ep_rcv_ex(scb) ? human_time(TICKS_TO_MS(sc_ep_rcv_ex(scb) - now_ms), TICKS_TO_MS(1000)) : "<NEVER>"); + chunk_appendf(buf, " wex=%s", + sc_ep_snd_ex(scb) ? human_time(TICKS_TO_MS(sc_ep_snd_ex(scb) - now_ms), TICKS_TO_MS(1000)) : "<NEVER>"); + chunk_appendf(buf, " rto=%s", + tick_isset(scb->sedesc->lra) ? human_time(TICKS_TO_MS(tick_add(scb->sedesc->lra, scb->ioto) - now_ms), TICKS_TO_MS(1000)) : "<NEVER>"); + chunk_appendf(buf, " wto=%s\n", + tick_isset(scb->sedesc->fsb) ? human_time(TICKS_TO_MS(tick_add(scb->sedesc->fsb, scb->ioto) - now_ms), TICKS_TO_MS(1000)) : "<NEVER>"); + + chunk_appendf(&trash, "%s iobuf.flags=0x%08x .pipe=%d .buf=%u@%p+%u/%u\n", pfx, + scb->sedesc->iobuf.flags, + scb->sedesc->iobuf.pipe ? scb->sedesc->iobuf.pipe->data : 0, + scb->sedesc->iobuf.buf ? (unsigned int)b_data(scb->sedesc->iobuf.buf): 0, + scb->sedesc->iobuf.buf ? b_orig(scb->sedesc->iobuf.buf): NULL, + scb->sedesc->iobuf.buf ? (unsigned int)b_head_ofs(scb->sedesc->iobuf.buf): 0, + scb->sedesc->iobuf.buf ? (unsigned int)b_size(scb->sedesc->iobuf.buf): 0); + + if ((conn = sc_conn(scb)) != NULL) { + if (conn->mux && conn->mux->show_sd) { + char muxpfx[100] = ""; + + snprintf(muxpfx, sizeof(muxpfx), "%s ", pfx); + chunk_appendf(buf, "%s ", pfx); + conn->mux->show_sd(buf, scb->sedesc, muxpfx); + chunk_appendf(buf, "\n"); + } + + chunk_appendf(buf, + "%s co1=%p ctrl=%s xprt=%s mux=%s data=%s target=%s:%p\n", pfx, + conn, + conn_get_ctrl_name(conn), + conn_get_xprt_name(conn), + conn_get_mux_name(conn), + sc_get_data_name(scb), + obj_type_name(conn->target), + obj_base_ptr(conn->target)); + + chunk_appendf(buf, + "%s flags=0x%08x fd=%d fd.state=%02x updt=%d fd.tmask=0x%lx\n", pfx, + conn->flags, + conn_fd(conn), + conn_fd(conn) >= 0 ? fdtab[conn->handle.fd].state : 0, + conn_fd(conn) >= 0 ? !!(fdtab[conn->handle.fd].update_mask & ti->ltid_bit) : 0, + conn_fd(conn) >= 0 ? 
fdtab[conn->handle.fd].thread_mask: 0); + } + else if ((tmpctx = sc_appctx(scb)) != NULL) { + chunk_appendf(buf, + "%s app1=%p st0=%d st1=%d applet=%s tid=%d nice=%d calls=%u rate=%u\n", pfx, + tmpctx, + tmpctx->st0, + tmpctx->st1, + tmpctx->applet->name, + tmpctx->t->tid, + tmpctx->t->nice, tmpctx->t->calls, read_freq_ctr(&tmpctx->call_rate)); + } + + if (HAS_FILTERS(strm)) { + const struct filter *flt; + + chunk_appendf(buf, "%s filters={", pfx); + list_for_each_entry(flt, &strm->strm_flt.filters, list) { + if (flt->list.p != &strm->strm_flt.filters) + chunk_appendf(buf, ", "); + chunk_appendf(buf, "%p=\"%s\"", flt, FLT_ID(flt)); + } + chunk_appendf(buf, "}\n"); + } + + chunk_appendf(buf, + "%s req=%p (f=0x%06x an=0x%x tofwd=%d total=%lld)\n" + "%s an_exp=%s buf=%p data=%p o=%u p=%u i=%u size=%u\n", + pfx, + &strm->req, + strm->req.flags, strm->req.analysers, + strm->req.to_forward, strm->req.total, + pfx, + strm->req.analyse_exp ? + human_time(TICKS_TO_MS(strm->req.analyse_exp - now_ms), + TICKS_TO_MS(1000)) : "<NEVER>", + &strm->req.buf, + b_orig(&strm->req.buf), (unsigned int)co_data(&strm->req), + (unsigned int)ci_head_ofs(&strm->req), (unsigned int)ci_data(&strm->req), + (unsigned int)strm->req.buf.size); + + if (IS_HTX_STRM(strm)) { + struct htx *htx = htxbuf(&strm->req.buf); + + chunk_appendf(buf, + "%s htx=%p flags=0x%x size=%u data=%u used=%u wrap=%s extra=%llu\n", pfx, + htx, htx->flags, htx->size, htx->data, htx_nbblks(htx), + (htx->tail >= htx->head) ? "NO" : "YES", + (unsigned long long)htx->extra); + } + if (HAS_FILTERS(strm) && strm->strm_flt.current[0]) { + const struct filter *flt = strm->strm_flt.current[0]; + + chunk_appendf(buf, "%s current_filter=%p (id=\"%s\" flags=0x%x pre=0x%x post=0x%x) \n", pfx, + flt, flt->config->id, flt->flags, flt->pre_analyzers, flt->post_analyzers); + } + + chunk_appendf(buf, + "%s res=%p (f=0x%06x an=0x%x tofwd=%d total=%lld)\n" + "%s an_exp=%s buf=%p data=%p o=%u p=%u i=%u size=%u\n", + pfx, + &strm->res, + strm->res.flags, strm->res.analysers, + strm->res.to_forward, strm->res.total, + pfx, + strm->res.analyse_exp ? + human_time(TICKS_TO_MS(strm->res.analyse_exp - now_ms), + TICKS_TO_MS(1000)) : "<NEVER>", + &strm->res.buf, + b_orig(&strm->res.buf), (unsigned int)co_data(&strm->res), + (unsigned int)ci_head_ofs(&strm->res), (unsigned int)ci_data(&strm->res), + (unsigned int)strm->res.buf.size); + + if (IS_HTX_STRM(strm)) { + struct htx *htx = htxbuf(&strm->res.buf); + + chunk_appendf(buf, + "%s htx=%p flags=0x%x size=%u data=%u used=%u wrap=%s extra=%llu\n", pfx, + htx, htx->flags, htx->size, htx->data, htx_nbblks(htx), + (htx->tail >= htx->head) ? "NO" : "YES", + (unsigned long long)htx->extra); + } + + if (HAS_FILTERS(strm) && strm->strm_flt.current[1]) { + const struct filter *flt = strm->strm_flt.current[1]; + + chunk_appendf(buf, "%s current_filter=%p (id=\"%s\" flags=0x%x pre=0x%x post=0x%x) \n", pfx, + flt, flt->config->id, flt->flags, flt->pre_analyzers, flt->post_analyzers); + } + + if (strm->current_rule_list && strm->current_rule) { + const struct act_rule *rule = strm->current_rule; + chunk_appendf(buf, "%s current_rule=\"%s\" [%s:%d]\n", pfx, rule->kw->kw, rule->conf.file, rule->conf.line); + } +} + +/* This function dumps a complete stream state onto the stream connector's + * read buffer. The stream has to be set in strm. It returns 0 if the output + * buffer is full and it needs to be called again, otherwise non-zero. It is + * designed to be called from stats_dump_strm_to_buffer() below. 
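+ * From the CLI this path is reached through "show sess <ptr>", e.g.
+ * (illustrative pointer value; any stats socket client works):
+ *
+ *   $ echo "show sess 0x7f2c4c02e200" | socat stdio /var/run/haproxy.sock
+ *
+ * where the pointer comes from a previous plain "show sess" listing.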
+ */ +static int stats_dump_full_strm_to_buffer(struct stconn *sc, struct stream *strm) +{ + struct appctx *appctx = __sc_appctx(sc); + struct show_sess_ctx *ctx = appctx->svcctx; + + chunk_reset(&trash); + + if (ctx->section > 0 && ctx->uid != strm->uniq_id) { + /* stream changed, no need to go any further */ + chunk_appendf(&trash, " *** session terminated while we were watching it ***\n"); + if (applet_putchk(appctx, &trash) == -1) + goto full; + goto done; + } + + switch (ctx->section) { + case 0: /* main status of the stream */ + ctx->uid = strm->uniq_id; + ctx->section = 1; + __fallthrough; + + case 1: + strm_dump_to_buffer(&trash, strm, "", appctx->cli_anon_key); + if (applet_putchk(appctx, &trash) == -1) + goto full; + + /* use other states to dump the contents */ + } + /* end of dump */ + done: + ctx->uid = 0; + ctx->section = 0; + return 1; + full: + return 0; +} + +static int cli_parse_show_sess(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct show_sess_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx)); + + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return 1; + + /* now all sessions by default */ + ctx->target = NULL; + ctx->min_age = 0; + ctx->section = 0; /* start with stream status */ + ctx->pos = 0; + ctx->thr = 0; + + if (*args[2] && strcmp(args[2], "older") == 0) { + unsigned timeout; + const char *res; + + if (!*args[3]) + return cli_err(appctx, "Expects a minimum age (in seconds by default).\n"); + + res = parse_time_err(args[3], &timeout, TIME_UNIT_S); + if (res != 0) + return cli_err(appctx, "Invalid age.\n"); + + ctx->min_age = timeout; + ctx->target = (void *)-1; /* show all matching entries */ + } + else if (*args[2] && strcmp(args[2], "susp") == 0) { + ctx->flags |= CLI_SHOWSESS_F_SUSP; + ctx->target = (void *)-1; /* show all matching entries */ + } + else if (*args[2] && strcmp(args[2], "all") == 0) + ctx->target = (void *)-1; + else if (*args[2]) + ctx->target = (void *)strtoul(args[2], NULL, 0); + + /* The back-ref must be reset, it will be detected and set by + * the dump code upon first invocation. + */ + LIST_INIT(&ctx->bref.users); + + /* let's set our own stream's epoch to the current one and increment + * it so that we know which streams were already there before us. + */ + appctx_strm(appctx)->stream_epoch = _HA_ATOMIC_FETCH_ADD(&stream_epoch, 1); + return 0; +} + +/* This function dumps all streams' states onto the stream connector's + * read buffer. It returns 0 if the output buffer is full and it needs + * to be called again, otherwise non-zero. It proceeds in an isolated + * thread so there is no thread safety issue here. + */ +static int cli_io_handler_dump_sess(struct appctx *appctx) +{ + struct show_sess_ctx *ctx = appctx->svcctx; + struct stconn *sc = appctx_sc(appctx); + struct connection *conn; + + thread_isolate(); + + if (ctx->thr >= global.nbthread) { + /* already terminated */ + goto done; + } + + /* FIXME: Don't watch the other side !*/ + if (unlikely(sc_opposite(sc)->flags & SC_FL_SHUT_DONE)) { + /* If we're forced to shut down, we might have to remove our + * reference to the last stream being dumped. 
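+ * As a reminder of how the back-reference works: while the dump is
+ * paused, ctx->bref.users is linked into the watched stream's back_refs
+ * list so that, should that stream terminate meanwhile, the release code
+ * can advance our ctx->bref.ref instead of leaving it dangling. In sketch
+ * form (not the exact release code):
+ *
+ *   list_for_each_entry(bref, &strm->back_refs, users)
+ *       bref->ref = strm->list.n;   /- skip past the dying stream -/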
+ */ + if (!LIST_ISEMPTY(&ctx->bref.users)) { + LIST_DELETE(&ctx->bref.users); + LIST_INIT(&ctx->bref.users); + } + goto done; + } + + chunk_reset(&trash); + + /* first, let's detach the back-ref from a possible previous stream */ + if (!LIST_ISEMPTY(&ctx->bref.users)) { + LIST_DELETE(&ctx->bref.users); + LIST_INIT(&ctx->bref.users); + } else if (!ctx->bref.ref) { + /* first call, start with first stream */ + ctx->bref.ref = ha_thread_ctx[ctx->thr].streams.n; + } + + /* and start from where we stopped */ + while (1) { + char pn[INET6_ADDRSTRLEN]; + struct stream *curr_strm; + int done= 0; + + if (ctx->bref.ref == &ha_thread_ctx[ctx->thr].streams) + done = 1; + else { + /* check if we've found a stream created after issuing the "show sess" */ + curr_strm = LIST_ELEM(ctx->bref.ref, struct stream *, list); + if ((int)(curr_strm->stream_epoch - appctx_strm(appctx)->stream_epoch) > 0) + done = 1; + } + + if (done) { + ctx->thr++; + if (ctx->thr >= global.nbthread) + break; + ctx->bref.ref = ha_thread_ctx[ctx->thr].streams.n; + continue; + } + + if (ctx->min_age) { + uint age = ns_to_sec(now_ns) - ns_to_sec(curr_strm->logs.request_ts); + if (age < ctx->min_age) + goto next_sess; + } + + if (ctx->flags & CLI_SHOWSESS_F_SUSP) { + /* only show suspicious streams. Non-suspicious ones have a valid + * expiration date in the future and a valid front endpoint. + */ + if (curr_strm->task->expire && + !tick_is_expired(curr_strm->task->expire, now_ms) && + curr_strm->scf && curr_strm->scf->sedesc && curr_strm->scf->sedesc->se) + goto next_sess; + } + + if (ctx->target) { + if (ctx->target != (void *)-1 && ctx->target != curr_strm) + goto next_sess; + + LIST_APPEND(&curr_strm->back_refs, &ctx->bref.users); + /* call the proper dump() function and return if we're missing space */ + if (!stats_dump_full_strm_to_buffer(sc, curr_strm)) + goto full; + + /* stream dump complete */ + LIST_DELETE(&ctx->bref.users); + LIST_INIT(&ctx->bref.users); + if (ctx->target != (void *)-1) { + ctx->target = NULL; + break; + } + else + goto next_sess; + } + + chunk_appendf(&trash, + "%p: proto=%s", + curr_strm, + strm_li(curr_strm) ? strm_li(curr_strm)->rx.proto->name : "?"); + + conn = objt_conn(strm_orig(curr_strm)); + switch (conn && conn_get_src(conn) ? addr_to_str(conn->src, pn, sizeof(pn)) : AF_UNSPEC) { + case AF_INET: + case AF_INET6: + chunk_appendf(&trash, + " src=%s:%d fe=%s be=%s srv=%s", + HA_ANON_CLI(pn), + get_host_port(conn->src), + HA_ANON_CLI(strm_fe(curr_strm)->id), + (curr_strm->be->cap & PR_CAP_BE) ? HA_ANON_CLI(curr_strm->be->id) : "<NONE>", + objt_server(curr_strm->target) ? HA_ANON_CLI(__objt_server(curr_strm->target)->id) : "<none>" + ); + break; + case AF_UNIX: + chunk_appendf(&trash, + " src=unix:%d fe=%s be=%s srv=%s", + strm_li(curr_strm)->luid, + HA_ANON_CLI(strm_fe(curr_strm)->id), + (curr_strm->be->cap & PR_CAP_BE) ? HA_ANON_CLI(curr_strm->be->id) : "<NONE>", + objt_server(curr_strm->target) ? 
HA_ANON_CLI(__objt_server(curr_strm->target)->id) : "<none>" + ); + break; + } + + chunk_appendf(&trash, + " ts=%02x epoch=%#x age=%s calls=%u rate=%u cpu=%llu lat=%llu", + curr_strm->task->state, curr_strm->stream_epoch, + human_time(ns_to_sec(now_ns) - ns_to_sec(curr_strm->logs.request_ts), 1), + curr_strm->task->calls, read_freq_ctr(&curr_strm->call_rate), + (unsigned long long)curr_strm->cpu_time, (unsigned long long)curr_strm->lat_time); + + chunk_appendf(&trash, + " rq[f=%06xh,i=%u,an=%02xh", + curr_strm->req.flags, + (unsigned int)ci_data(&curr_strm->req), + curr_strm->req.analysers); + + chunk_appendf(&trash, + ",ax=%s]", + curr_strm->req.analyse_exp ? + human_time(TICKS_TO_MS(curr_strm->req.analyse_exp - now_ms), + TICKS_TO_MS(1000)) : ""); + + chunk_appendf(&trash, + " rp[f=%06xh,i=%u,an=%02xh", + curr_strm->res.flags, + (unsigned int)ci_data(&curr_strm->res), + curr_strm->res.analysers); + chunk_appendf(&trash, + ",ax=%s]", + curr_strm->res.analyse_exp ? + human_time(TICKS_TO_MS(curr_strm->res.analyse_exp - now_ms), + TICKS_TO_MS(1000)) : ""); + + conn = sc_conn(curr_strm->scf); + chunk_appendf(&trash," scf=[%d,%1xh,fd=%d", + curr_strm->scf->state, curr_strm->scf->flags, conn_fd(conn)); + chunk_appendf(&trash, ",rex=%s", + sc_ep_rcv_ex(curr_strm->scf) ? + human_time(TICKS_TO_MS(sc_ep_rcv_ex(curr_strm->scf) - now_ms), + TICKS_TO_MS(1000)) : ""); + chunk_appendf(&trash,",wex=%s]", + sc_ep_snd_ex(curr_strm->scf) ? + human_time(TICKS_TO_MS(sc_ep_snd_ex(curr_strm->scf) - now_ms), + TICKS_TO_MS(1000)) : ""); + + conn = sc_conn(curr_strm->scb); + chunk_appendf(&trash, " scb=[%d,%1xh,fd=%d", + curr_strm->scb->state, curr_strm->scb->flags, conn_fd(conn)); + chunk_appendf(&trash, ",rex=%s", + sc_ep_rcv_ex(curr_strm->scb) ? + human_time(TICKS_TO_MS(sc_ep_rcv_ex(curr_strm->scb) - now_ms), + TICKS_TO_MS(1000)) : ""); + chunk_appendf(&trash, ",wex=%s]", + sc_ep_snd_ex(curr_strm->scb) ? + human_time(TICKS_TO_MS(sc_ep_snd_ex(curr_strm->scb) - now_ms), + TICKS_TO_MS(1000)) : ""); + + chunk_appendf(&trash, + " exp=%s rc=%d c_exp=%s", + curr_strm->task->expire ? + human_time(TICKS_TO_MS(curr_strm->task->expire - now_ms), + TICKS_TO_MS(1000)) : "", + curr_strm->conn_retries, + curr_strm->conn_exp ? + human_time(TICKS_TO_MS(curr_strm->conn_exp - now_ms), + TICKS_TO_MS(1000)) : ""); + if (task_in_rq(curr_strm->task)) + chunk_appendf(&trash, " run(nice=%d)", curr_strm->task->nice); + + chunk_appendf(&trash, "\n"); + + if (applet_putchk(appctx, &trash) == -1) { + /* let's try again later from this stream. We add ourselves into + * this stream's users so that it can remove us upon termination. + */ + LIST_APPEND(&curr_strm->back_refs, &ctx->bref.users); + goto full; + } + + next_sess: + ctx->bref.ref = curr_strm->list.n; + } + + if (ctx->target && ctx->target != (void *)-1) { + /* specified stream not found */ + if (ctx->section > 0) + chunk_appendf(&trash, " *** session terminated while we were watching it ***\n"); + else + chunk_appendf(&trash, "Session not found.\n"); + + if (applet_putchk(appctx, &trash) == -1) + goto full; + + ctx->target = NULL; + ctx->uid = 0; + goto done; + } + + done: + thread_release(); + return 1; + full: + thread_release(); + return 0; +} + +static void cli_release_show_sess(struct appctx *appctx) +{ + struct show_sess_ctx *ctx = appctx->svcctx; + + if (ctx->thr < global.nbthread) { + /* a dump was aborted, either in error or timeout. We need to + * safely detach from the target stream's list. 
It's mandatory + * to lock because a stream on the target thread could be moving + * our node. + */ + thread_isolate(); + if (!LIST_ISEMPTY(&ctx->bref.users)) + LIST_DELETE(&ctx->bref.users); + thread_release(); + } +} + +/* Parses the "shutdown session" directive, it always returns 1 */ +static int cli_parse_shutdown_session(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct stream *strm, *ptr; + int thr; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + ptr = (void *)strtoul(args[2], NULL, 0); + if (!ptr) + return cli_err(appctx, "Session pointer expected (use 'show sess').\n"); + + strm = NULL; + + thread_isolate(); + + /* first, look for the requested stream in the stream table */ + for (thr = 0; strm != ptr && thr < global.nbthread; thr++) { + list_for_each_entry(strm, &ha_thread_ctx[thr].streams, list) { + if (strm == ptr) { + stream_shutdown(strm, SF_ERR_KILLED); + break; + } + } + } + + thread_release(); + + /* do we have the stream ? */ + if (strm != ptr) + return cli_err(appctx, "No such session (use 'show sess').\n"); + + return 1; +} + +/* Parses the "shutdown session server" directive, it always returns 1 */ +static int cli_parse_shutdown_sessions_server(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct server *sv; + + if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) + return 1; + + sv = cli_find_server(appctx, args[3]); + if (!sv) + return 1; + + /* kill all the stream that are on this server */ + HA_SPIN_LOCK(SERVER_LOCK, &sv->lock); + srv_shutdown_streams(sv, SF_ERR_KILLED); + HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); + return 1; +} + +/* register cli keywords */ +static struct cli_kw_list cli_kws = {{ },{ + { { "show", "sess", NULL }, "show sess [<id>|all|susp|older <age>] : report the list of current sessions or dump this exact session", cli_parse_show_sess, cli_io_handler_dump_sess, cli_release_show_sess }, + { { "shutdown", "session", NULL }, "shutdown session [id] : kill a specific session", cli_parse_shutdown_session, NULL, NULL }, + { { "shutdown", "sessions", "server" }, "shutdown sessions server <bk>/<srv> : kill sessions on a server", cli_parse_shutdown_sessions_server, NULL, NULL }, + {{},} +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + +/* main configuration keyword registration. */ +static struct action_kw_list stream_tcp_req_keywords = { ILH, { + { "set-log-level", stream_parse_set_log_level }, + { "set-nice", stream_parse_set_nice }, + { "switch-mode", stream_parse_switch_mode }, + { "use-service", stream_parse_use_service }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, tcp_req_cont_keywords_register, &stream_tcp_req_keywords); + +/* main configuration keyword registration. 
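+ * For reference, the actions registered in this file appear in a
+ * configuration as, e.g. (illustrative rules; the available service names
+ * depend on built-in applets and Lua registrations):
+ *
+ *   tcp-response content set-nice -512 if { srv_id 3 }
+ *   http-request set-log-level silent if { src 10.0.0.0/8 }
+ *   http-request use-service prometheus-exporter if { path /metrics }
+ *
+ * The first two merely tune per-stream properties and let evaluation
+ * continue, while use-service hands the stream to an applet in place of
+ * a server.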
*/ +static struct action_kw_list stream_tcp_res_keywords = { ILH, { + { "set-log-level", stream_parse_set_log_level }, + { "set-nice", stream_parse_set_nice }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, tcp_res_cont_keywords_register, &stream_tcp_res_keywords); + +static struct action_kw_list stream_http_req_keywords = { ILH, { + { "set-log-level", stream_parse_set_log_level }, + { "set-nice", stream_parse_set_nice }, + { "use-service", stream_parse_use_service }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, http_req_keywords_register, &stream_http_req_keywords); + +static struct action_kw_list stream_http_res_keywords = { ILH, { + { "set-log-level", stream_parse_set_log_level }, + { "set-nice", stream_parse_set_nice }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, http_res_keywords_register, &stream_http_res_keywords); + +static struct action_kw_list stream_http_after_res_actions = { ILH, { + { "set-log-level", stream_parse_set_log_level }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, http_after_res_keywords_register, &stream_http_after_res_actions); + +static int smp_fetch_cur_client_timeout(const struct arg *args, struct sample *smp, const char *km, void *private) +{ + smp->flags = SMP_F_VOL_TXN; + smp->data.type = SMP_T_SINT; + if (!smp->strm) + return 0; + + smp->data.u.sint = TICKS_TO_MS(smp->strm->scf->ioto); + return 1; +} + +static int smp_fetch_cur_server_timeout(const struct arg *args, struct sample *smp, const char *km, void *private) +{ + smp->flags = SMP_F_VOL_TXN; + smp->data.type = SMP_T_SINT; + if (!smp->strm) + return 0; + + smp->data.u.sint = TICKS_TO_MS(smp->strm->scb->ioto); + return 1; +} + +static int smp_fetch_cur_tunnel_timeout(const struct arg *args, struct sample *smp, const char *km, void *private) +{ + smp->flags = SMP_F_VOL_TXN; + smp->data.type = SMP_T_SINT; + if (!smp->strm) + return 0; + + smp->data.u.sint = TICKS_TO_MS(smp->strm->tunnel_timeout); + return 1; +} + +static int smp_fetch_last_rule_file(const struct arg *args, struct sample *smp, const char *km, void *private) +{ + smp->flags = SMP_F_VOL_TXN; + smp->data.type = SMP_T_STR; + if (!smp->strm || !smp->strm->last_rule_file) + return 0; + + smp->flags |= SMP_F_CONST; + smp->data.u.str.area = (char *)smp->strm->last_rule_file; + smp->data.u.str.data = strlen(smp->strm->last_rule_file); + return 1; +} + +static int smp_fetch_last_rule_line(const struct arg *args, struct sample *smp, const char *km, void *private) +{ + smp->flags = SMP_F_VOL_TXN; + smp->data.type = SMP_T_SINT; + if (!smp->strm || !smp->strm->last_rule_line) + return 0; + + smp->data.u.sint = smp->strm->last_rule_line; + return 1; +} + +static int smp_fetch_sess_term_state(const struct arg *args, struct sample *smp, const char *km, void *private) +{ + struct buffer *trash = get_trash_chunk(); + + smp->flags = SMP_F_VOLATILE; + smp->data.type = SMP_T_STR; + if (!smp->strm) + return 0; + + trash->area[trash->data++] = sess_term_cond[(smp->strm->flags & SF_ERR_MASK) >> SF_ERR_SHIFT]; + trash->area[trash->data++] = sess_fin_state[(smp->strm->flags & SF_FINST_MASK) >> SF_FINST_SHIFT]; + + smp->data.u.str = *trash; + smp->data.type = SMP_T_STR; + smp->flags &= ~SMP_F_CONST; + return 1; +} + +static int smp_fetch_conn_retries(const struct arg *args, struct sample *smp, const char *km, void *private) +{ + smp->flags = SMP_F_VOL_TXN; + smp->data.type = SMP_T_SINT; + if (!smp->strm) + return 0; + + if (!sc_state_in(smp->strm->scb->state, SC_SB_DIS|SC_SB_CLO)) + smp->flags |= SMP_F_VOL_TEST; + smp->data.u.sint = 
smp->strm->conn_retries; + return 1; +} + +static int smp_fetch_id32(const struct arg *args, struct sample *smp, const char *km, void *private) +{ + smp->flags = SMP_F_VOL_TXN; + smp->data.type = SMP_T_SINT; + if (!smp->strm) + return 0; + smp->data.u.sint = smp->strm->uniq_id; + return 1; +} + +/* Note: must not be declared <const> as its list will be overwritten. + * Please take care of keeping this list alphabetically sorted. + */ +static struct sample_fetch_kw_list smp_kws = {ILH, { + { "cur_client_timeout", smp_fetch_cur_client_timeout, 0, NULL, SMP_T_SINT, SMP_USE_FTEND, }, + { "cur_server_timeout", smp_fetch_cur_server_timeout, 0, NULL, SMP_T_SINT, SMP_USE_BKEND, }, + { "cur_tunnel_timeout", smp_fetch_cur_tunnel_timeout, 0, NULL, SMP_T_SINT, SMP_USE_BKEND, }, + { "last_rule_file", smp_fetch_last_rule_file, 0, NULL, SMP_T_STR, SMP_USE_INTRN, }, + { "last_rule_line", smp_fetch_last_rule_line, 0, NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "txn.conn_retries", smp_fetch_conn_retries, 0, NULL, SMP_T_SINT, SMP_USE_L4SRV, }, + { "txn.id32", smp_fetch_id32, 0, NULL, SMP_T_SINT, SMP_USE_INTRN, }, + { "txn.sess_term_state",smp_fetch_sess_term_state, 0, NULL, SMP_T_STR, SMP_USE_INTRN, }, + { NULL, NULL, 0, 0, 0 }, +}}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &smp_kws); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/task.c b/src/task.c new file mode 100644 index 0000000..1ab5212 --- /dev/null +++ b/src/task.c @@ -0,0 +1,979 @@ +/* + * Task management functions. + * + * Copyright 2000-2009 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <string.h> + +#include <import/eb32tree.h> + +#include <haproxy/api.h> +#include <haproxy/activity.h> +#include <haproxy/cfgparse.h> +#include <haproxy/clock.h> +#include <haproxy/fd.h> +#include <haproxy/list.h> +#include <haproxy/pool.h> +#include <haproxy/task.h> +#include <haproxy/tools.h> + +extern struct task *process_stream(struct task *t, void *context, unsigned int state); +extern void stream_update_timings(struct task *t, uint64_t lat, uint64_t cpu); + +DECLARE_POOL(pool_head_task, "task", sizeof(struct task)); +DECLARE_POOL(pool_head_tasklet, "tasklet", sizeof(struct tasklet)); + +/* This is the memory pool containing all the signal structs. These + * struct are used to store each required signal between two tasks. + */ +DECLARE_POOL(pool_head_notification, "notification", sizeof(struct notification)); + +/* The lock protecting all wait queues at once. For now we have no better + * alternative since a task may have to be removed from a queue and placed + * into another one. Storing the WQ index into the task doesn't seem to be + * sufficient either. + */ +__decl_aligned_rwlock(wq_lock); + +/* Flags the task <t> for immediate destruction and puts it into its first + * thread's shared tasklet list if not yet queued/running. This will bypass + * the priority scheduling and make the task show up as fast as possible in + * the other thread's queue. Note that this operation isn't idempotent and is + * not supposed to be run on the same task from multiple threads at once. It's + * the caller's responsibility to make sure it is the only one able to kill the + * task. 
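+ * A minimal (illustrative) caller sequence from another thread could be,
+ * with <owner> being a hypothetical structure holding the only reference:
+ *
+ *   t = HA_ATOMIC_XCHG(&owner->task, NULL);
+ *   if (t)
+ *       task_kill(t);
+ *
+ * after which the scheduler notices TASK_KILLED and frees the task itself.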
+ */ +void task_kill(struct task *t) +{ + unsigned int state = t->state; + unsigned int thr; + + BUG_ON(state & TASK_KILLED); + + while (1) { + while (state & (TASK_RUNNING | TASK_QUEUED)) { + /* task already in the queue and about to be executed, + * or even currently running. Just add the flag and be + * done with it, the process loop will detect it and kill + * it. The CAS will fail if we arrive too late. + */ + if (_HA_ATOMIC_CAS(&t->state, &state, state | TASK_KILLED)) + return; + } + + /* We'll have to wake it up, but we must also secure it so that + * it doesn't vanish under us. TASK_QUEUED guarantees nobody will + * add past us. + */ + if (_HA_ATOMIC_CAS(&t->state, &state, state | TASK_QUEUED | TASK_KILLED)) { + /* Bypass the tree and go directly into the shared tasklet list. + * Note: that's a task so it must be accounted for as such. Pick + * the task's first thread for the job. + */ + thr = t->tid >= 0 ? t->tid : tid; + + /* Beware: tasks that have never run don't have their ->list empty yet! */ + MT_LIST_APPEND(&ha_thread_ctx[thr].shared_tasklet_list, + list_to_mt_list(&((struct tasklet *)t)->list)); + _HA_ATOMIC_INC(&ha_thread_ctx[thr].rq_total); + _HA_ATOMIC_INC(&ha_thread_ctx[thr].tasks_in_list); + wake_thread(thr); + return; + } + } +} + +/* Equivalent of task_kill for tasklets. Mark the tasklet <t> for destruction. + * It will be deleted on the next scheduler invocation. This function is + * thread-safe : a thread can kill a tasklet of another thread. + */ +void tasklet_kill(struct tasklet *t) +{ + unsigned int state = t->state; + unsigned int thr; + + BUG_ON(state & TASK_KILLED); + + while (1) { + while (state & (TASK_IN_LIST)) { + /* Tasklet already in the list ready to be executed. Add + * the killed flag and wait for the process loop to + * detect it. + */ + if (_HA_ATOMIC_CAS(&t->state, &state, state | TASK_KILLED)) + return; + } + + /* Mark the tasklet as killed and wake the thread to process it + * as soon as possible. + */ + if (_HA_ATOMIC_CAS(&t->state, &state, state | TASK_IN_LIST | TASK_KILLED)) { + thr = t->tid >= 0 ? t->tid : tid; + MT_LIST_APPEND(&ha_thread_ctx[thr].shared_tasklet_list, + list_to_mt_list(&t->list)); + _HA_ATOMIC_INC(&ha_thread_ctx[thr].rq_total); + wake_thread(thr); + return; + } + } +} + +/* Do not call this one, please use tasklet_wakeup_on() instead, as this one is + * the slow path of tasklet_wakeup_on() which performs some preliminary checks + * and sets TASK_IN_LIST before calling this one. A negative <thr> designates + * the current thread. + */ +void __tasklet_wakeup_on(struct tasklet *tl, int thr) +{ + if (likely(thr < 0)) { + /* this tasklet runs on the caller thread */ + if (tl->state & TASK_HEAVY) { + LIST_APPEND(&th_ctx->tasklets[TL_HEAVY], &tl->list); + th_ctx->tl_class_mask |= 1 << TL_HEAVY; + } + else if (tl->state & TASK_SELF_WAKING) { + LIST_APPEND(&th_ctx->tasklets[TL_BULK], &tl->list); + th_ctx->tl_class_mask |= 1 << TL_BULK; + } + else if ((struct task *)tl == th_ctx->current) { + _HA_ATOMIC_OR(&tl->state, TASK_SELF_WAKING); + LIST_APPEND(&th_ctx->tasklets[TL_BULK], &tl->list); + th_ctx->tl_class_mask |= 1 << TL_BULK; + } + else if (th_ctx->current_queue < 0) { + LIST_APPEND(&th_ctx->tasklets[TL_URGENT], &tl->list); + th_ctx->tl_class_mask |= 1 << TL_URGENT; + } + else { + LIST_APPEND(&th_ctx->tasklets[th_ctx->current_queue], &tl->list); + th_ctx->tl_class_mask |= 1 << th_ctx->current_queue; + } + _HA_ATOMIC_INC(&th_ctx->rq_total); + } else { + /* this tasklet runs on a specific thread. 
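+ * To recap the local-thread classification above: TASK_HEAVY tasklets
+ * go to TL_HEAVY, self-waking ones (TASK_SELF_WAKING, or a tasklet that
+ * wakes itself while running) to TL_BULK, wakeups issued from outside
+ * any tasklet run (current_queue < 0) to TL_URGENT, and the rest stay in
+ * the class currently being served. The cross-thread case below bypasses
+ * these classes and lands in the target thread's shared tasklet list.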
*/ + MT_LIST_APPEND(&ha_thread_ctx[thr].shared_tasklet_list, list_to_mt_list(&tl->list)); + _HA_ATOMIC_INC(&ha_thread_ctx[thr].rq_total); + wake_thread(thr); + } +} + +/* Do not call this one, please use tasklet_wakeup_after_on() instead, as this one is + * the slow path of tasklet_wakeup_after() which performs some preliminary checks + * and sets TASK_IN_LIST before calling this one. + */ +struct list *__tasklet_wakeup_after(struct list *head, struct tasklet *tl) +{ + BUG_ON(tl->tid >= 0 && tid != tl->tid); + /* this tasklet runs on the caller thread */ + if (!head) { + if (tl->state & TASK_HEAVY) { + LIST_INSERT(&th_ctx->tasklets[TL_HEAVY], &tl->list); + th_ctx->tl_class_mask |= 1 << TL_HEAVY; + } + else if (tl->state & TASK_SELF_WAKING) { + LIST_INSERT(&th_ctx->tasklets[TL_BULK], &tl->list); + th_ctx->tl_class_mask |= 1 << TL_BULK; + } + else if ((struct task *)tl == th_ctx->current) { + _HA_ATOMIC_OR(&tl->state, TASK_SELF_WAKING); + LIST_INSERT(&th_ctx->tasklets[TL_BULK], &tl->list); + th_ctx->tl_class_mask |= 1 << TL_BULK; + } + else if (th_ctx->current_queue < 0) { + LIST_INSERT(&th_ctx->tasklets[TL_URGENT], &tl->list); + th_ctx->tl_class_mask |= 1 << TL_URGENT; + } + else { + LIST_INSERT(&th_ctx->tasklets[th_ctx->current_queue], &tl->list); + th_ctx->tl_class_mask |= 1 << th_ctx->current_queue; + } + } + else { + LIST_APPEND(head, &tl->list); + } + _HA_ATOMIC_INC(&th_ctx->rq_total); + return &tl->list; +} + +/* Puts the task <t> in run queue at a position depending on t->nice. <t> is + * returned. The nice value assigns boosts in 32th of the run queue size. A + * nice value of -1024 sets the task to -tasks_run_queue*32, while a nice value + * of 1024 sets the task to tasks_run_queue*32. The state flags are cleared, so + * the caller will have to set its flags after this call. + * The task must not already be in the run queue. If unsure, use the safer + * task_wakeup() function. + */ +void __task_wakeup(struct task *t) +{ + struct eb_root *root = &th_ctx->rqueue; + int thr __maybe_unused = t->tid >= 0 ? t->tid : tid; + +#ifdef USE_THREAD + if (thr != tid) { + root = &ha_thread_ctx[thr].rqueue_shared; + + _HA_ATOMIC_INC(&ha_thread_ctx[thr].rq_total); + HA_SPIN_LOCK(TASK_RQ_LOCK, &ha_thread_ctx[thr].rqsh_lock); + + t->rq.key = _HA_ATOMIC_ADD_FETCH(&ha_thread_ctx[thr].rqueue_ticks, 1); + __ha_barrier_store(); + } else +#endif + { + _HA_ATOMIC_INC(&th_ctx->rq_total); + t->rq.key = _HA_ATOMIC_ADD_FETCH(&th_ctx->rqueue_ticks, 1); + } + + if (likely(t->nice)) { + int offset; + + _HA_ATOMIC_INC(&tg_ctx->niced_tasks); + offset = t->nice * (int)global.tune.runqueue_depth; + t->rq.key += offset; + } + + if (_HA_ATOMIC_LOAD(&th_ctx->flags) & TH_FL_TASK_PROFILING) + t->wake_date = now_mono_time(); + + eb32_insert(root, &t->rq); + +#ifdef USE_THREAD + if (thr != tid) { + HA_SPIN_UNLOCK(TASK_RQ_LOCK, &ha_thread_ctx[thr].rqsh_lock); + + /* If all threads that are supposed to handle this task are sleeping, + * wake one. + */ + wake_thread(thr); + } +#endif + return; +} + +/* + * __task_queue() + * + * Inserts a task into wait queue <wq> at the position given by its expiration + * date. It does not matter if the task was already in the wait queue or not, + * as it will be unlinked. The task MUST NOT have an infinite expiration timer. + * Last, tasks must not be queued further than the end of the tree, which is + * between <now_ms> and <now_ms> + 2^31 ms (now+24days in 32bit). 
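+ *
+ * As a worked example of the wrapping arithmetic (assuming the usual
+ * signed-distance comparison of the tick_*() helpers): with
+ * now_ms = 0xFFFFFFF0, a key of 0x00000010 gives
+ * (int)(0x00000010 - 0xFFFFFFF0) = 32, i.e. 32ms in the future despite
+ * the numeric wrap, which is why keys may be queued up to 2^31 ms ahead
+ * but no further.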
+ * + * This function should not be used directly, it is meant to be called by the + * inline version of task_queue() which performs a few cheap preliminary tests + * before deciding to call __task_queue(). Moreover this function doesn't care + * at all about locking so the caller must be careful when deciding whether to + * lock or not around this call. + */ +void __task_queue(struct task *task, struct eb_root *wq) +{ +#ifdef USE_THREAD + BUG_ON((wq == &tg_ctx->timers && task->tid >= 0) || + (wq == &th_ctx->timers && task->tid < 0) || + (wq != &tg_ctx->timers && wq != &th_ctx->timers)); +#endif + /* if this happens the process is doomed anyway, so better catch it now + * so that we have the caller in the stack. + */ + BUG_ON(task->expire == TICK_ETERNITY); + + if (likely(task_in_wq(task))) + __task_unlink_wq(task); + + /* the task is not in the queue now */ + task->wq.key = task->expire; +#ifdef DEBUG_CHECK_INVALID_EXPIRATION_DATES + if (tick_is_lt(task->wq.key, now_ms)) + /* we're queuing too far away or in the past (most likely) */ + return; +#endif + + eb32_insert(wq, &task->wq); +} + +/* + * Extract all expired timers from the timer queue, and wakes up all + * associated tasks. + */ +void wake_expired_tasks() +{ + struct thread_ctx * const tt = th_ctx; // thread's tasks + int max_processed = global.tune.runqueue_depth; + struct task *task; + struct eb32_node *eb; + __decl_thread(int key); + + while (1) { + if (max_processed-- <= 0) + goto leave; + + eb = eb32_lookup_ge(&tt->timers, now_ms - TIMER_LOOK_BACK); + if (!eb) { + /* we might have reached the end of the tree, typically because + * <now_ms> is in the first half and we're first scanning the last + * half. Let's loop back to the beginning of the tree now. + */ + eb = eb32_first(&tt->timers); + if (likely(!eb)) + break; + } + + /* It is possible that this task was left at an earlier place in the + * tree because a recent call to task_queue() has not moved it. This + * happens when the new expiration date is later than the old one. + * Since it is very unlikely that we reach a timeout anyway, it's a + * lot cheaper to proceed like this because we almost never update + * the tree. We may also find disabled expiration dates there. Since + * we have detached the task from the tree, we simply call task_queue + * to take care of this. Note that we might occasionally requeue it at + * the same place, before <eb>, so we have to check if this happens, + * and adjust <eb>, otherwise we may skip it which is not what we want. + * We may also not requeue the task (and not point eb at it) if its + * expiration time is not set. We also make sure we leave the real + * expiration date for the next task in the queue so that when calling + * next_timer_expiry() we're guaranteed to see the next real date and + * not the next apparent date. This is in order to avoid useless + * wakeups. + */ + + task = eb32_entry(eb, struct task, wq); + if (tick_is_expired(task->expire, now_ms)) { + /* expired task, wake it up */ + __task_unlink_wq(task); + _task_wakeup(task, TASK_WOKEN_TIMER, 0); + } + else if (task->expire != eb->key) { + /* task is not expired but its key doesn't match so let's + * update it and skip to next apparently expired task. + */ + __task_unlink_wq(task); + if (tick_isset(task->expire)) + __task_queue(task, &tt->timers); + } + else { + /* task not expired and correctly placed. It may not be eternal. 
*/ + BUG_ON(task->expire == TICK_ETERNITY); + break; + } + } + +#ifdef USE_THREAD + if (eb_is_empty(&tg_ctx->timers)) + goto leave; + + HA_RWLOCK_RDLOCK(TASK_WQ_LOCK, &wq_lock); + eb = eb32_lookup_ge(&tg_ctx->timers, now_ms - TIMER_LOOK_BACK); + if (!eb) { + eb = eb32_first(&tg_ctx->timers); + if (likely(!eb)) { + HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &wq_lock); + goto leave; + } + } + key = eb->key; + + if (tick_is_lt(now_ms, key)) { + HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &wq_lock); + goto leave; + } + + /* There's really something of interest here, let's visit the queue */ + + if (HA_RWLOCK_TRYRDTOSK(TASK_WQ_LOCK, &wq_lock)) { + /* if we failed to grab the lock it means another thread is + * already doing the same here, so let it do the job. + */ + HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &wq_lock); + goto leave; + } + + while (1) { + lookup_next: + if (max_processed-- <= 0) + break; + eb = eb32_lookup_ge(&tg_ctx->timers, now_ms - TIMER_LOOK_BACK); + if (!eb) { + /* we might have reached the end of the tree, typically because + * <now_ms> is in the first half and we're first scanning the last + * half. Let's loop back to the beginning of the tree now. + */ + eb = eb32_first(&tg_ctx->timers); + if (likely(!eb)) + break; + } + + task = eb32_entry(eb, struct task, wq); + + /* Check for any competing run of the task (quite rare but may + * involve a dangerous concurrent access on task->expire). In + * order to protect against this, we'll take an exclusive access + * on TASK_RUNNING before checking/touching task->expire. If the + * task is already RUNNING on another thread, it will deal by + * itself with the requeuing so we must not do anything and + * simply quit the loop for now, because we cannot wait with the + * WQ lock held as this would prevent the running thread from + * requeuing the task. One annoying effect of holding RUNNING + * here is that a concurrent task_wakeup() will refrain from + * waking it up. This forces us to check for a wakeup after + * releasing the flag. + */ + if (HA_ATOMIC_FETCH_OR(&task->state, TASK_RUNNING) & TASK_RUNNING) + break; + + if (tick_is_expired(task->expire, now_ms)) { + /* expired task, wake it up */ + HA_RWLOCK_SKTOWR(TASK_WQ_LOCK, &wq_lock); + __task_unlink_wq(task); + HA_RWLOCK_WRTOSK(TASK_WQ_LOCK, &wq_lock); + task_drop_running(task, TASK_WOKEN_TIMER); + } + else if (task->expire != eb->key) { + /* task is not expired but its key doesn't match so let's + * update it and skip to next apparently expired task. + */ + HA_RWLOCK_SKTOWR(TASK_WQ_LOCK, &wq_lock); + __task_unlink_wq(task); + if (tick_isset(task->expire)) + __task_queue(task, &tg_ctx->timers); + HA_RWLOCK_WRTOSK(TASK_WQ_LOCK, &wq_lock); + task_drop_running(task, 0); + goto lookup_next; + } + else { + /* task not expired and correctly placed. It may not be eternal. */ + BUG_ON(task->expire == TICK_ETERNITY); + task_drop_running(task, 0); + break; + } + } + + HA_RWLOCK_SKUNLOCK(TASK_WQ_LOCK, &wq_lock); +#endif +leave: + return; +} + +/* Checks the next timer for the current thread by looking into its own timer + * list and the global one. It may return TICK_ETERNITY if no timer is present. + * Note that the next timer might very well be slightly in the past. 
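+ * Typical caller logic looks like this sketch (the real poller code
+ * differs in its details):
+ *
+ *   next = next_timer_expiry();
+ *   wait = tick_isset(next) ? MAX(0, tick_remain(now_ms, next)) : MAX_DELAY_MS;
+ *
+ * so a slightly-past timer simply yields a zero wait and an immediate
+ * wakeup.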
+ */ +int next_timer_expiry() +{ + struct thread_ctx * const tt = th_ctx; // thread's tasks + struct eb32_node *eb; + int ret = TICK_ETERNITY; + __decl_thread(int key = TICK_ETERNITY); + + /* first check in the thread-local timers */ + eb = eb32_lookup_ge(&tt->timers, now_ms - TIMER_LOOK_BACK); + if (!eb) { + /* we might have reached the end of the tree, typically because + * <now_ms> is in the first half and we're first scanning the last + * half. Let's loop back to the beginning of the tree now. + */ + eb = eb32_first(&tt->timers); + } + + if (eb) + ret = eb->key; + +#ifdef USE_THREAD + if (!eb_is_empty(&tg_ctx->timers)) { + HA_RWLOCK_RDLOCK(TASK_WQ_LOCK, &wq_lock); + eb = eb32_lookup_ge(&tg_ctx->timers, now_ms - TIMER_LOOK_BACK); + if (!eb) + eb = eb32_first(&tg_ctx->timers); + if (eb) + key = eb->key; + HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &wq_lock); + if (eb) + ret = tick_first(ret, key); + } +#endif + return ret; +} + +/* Walks over tasklet lists th_ctx->tasklets[0..TL_CLASSES-1] and run at most + * budget[TL_*] of them. Returns the number of entries effectively processed + * (tasks and tasklets merged). The count of tasks in the list for the current + * thread is adjusted. + */ +unsigned int run_tasks_from_lists(unsigned int budgets[]) +{ + struct task *(*process)(struct task *t, void *ctx, unsigned int state); + struct list *tl_queues = th_ctx->tasklets; + struct task *t; + uint8_t budget_mask = (1 << TL_CLASSES) - 1; + struct sched_activity *profile_entry = NULL; + unsigned int done = 0; + unsigned int queue; + unsigned int state; + void *ctx; + + for (queue = 0; queue < TL_CLASSES;) { + th_ctx->current_queue = queue; + + /* global.tune.sched.low-latency is set */ + if (global.tune.options & GTUNE_SCHED_LOW_LATENCY) { + if (unlikely(th_ctx->tl_class_mask & budget_mask & ((1 << queue) - 1))) { + /* a lower queue index has tasks again and still has a + * budget to run them. Let's switch to it now. + */ + queue = (th_ctx->tl_class_mask & 1) ? 0 : + (th_ctx->tl_class_mask & 2) ? 
1 : 2; + continue; + } + + if (unlikely(queue > TL_URGENT && + budget_mask & (1 << TL_URGENT) && + !MT_LIST_ISEMPTY(&th_ctx->shared_tasklet_list))) { + /* an urgent tasklet arrived from another thread */ + break; + } + + if (unlikely(queue > TL_NORMAL && + budget_mask & (1 << TL_NORMAL) && + (!eb_is_empty(&th_ctx->rqueue) || !eb_is_empty(&th_ctx->rqueue_shared)))) { + /* a task was woken up by a bulk tasklet or another thread */ + break; + } + } + + if (LIST_ISEMPTY(&tl_queues[queue])) { + th_ctx->tl_class_mask &= ~(1 << queue); + queue++; + continue; + } + + if (!budgets[queue]) { + budget_mask &= ~(1 << queue); + queue++; + continue; + } + + budgets[queue]--; + activity[tid].ctxsw++; + + t = (struct task *)LIST_ELEM(tl_queues[queue].n, struct tasklet *, list); + ctx = t->context; + process = t->process; + t->calls++; + + th_ctx->sched_wake_date = t->wake_date; + if (th_ctx->sched_wake_date) { + uint32_t now_ns = now_mono_time(); + uint32_t lat = now_ns - th_ctx->sched_wake_date; + + t->wake_date = 0; + th_ctx->sched_call_date = now_ns; + profile_entry = sched_activity_entry(sched_activity, t->process, t->caller); + th_ctx->sched_profile_entry = profile_entry; + HA_ATOMIC_ADD(&profile_entry->lat_time, lat); + HA_ATOMIC_INC(&profile_entry->calls); + } + __ha_barrier_store(); + + th_ctx->current = t; + _HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_STUCK); // this thread is still running + + _HA_ATOMIC_DEC(&th_ctx->rq_total); + LIST_DEL_INIT(&((struct tasklet *)t)->list); + __ha_barrier_store(); + + if (t->state & TASK_F_TASKLET) { + /* this is a tasklet */ + state = _HA_ATOMIC_FETCH_AND(&t->state, TASK_PERSISTENT); + __ha_barrier_atomic_store(); + + if (likely(!(state & TASK_KILLED))) { + process(t, ctx, state); + } + else { + done++; + th_ctx->current = NULL; + pool_free(pool_head_tasklet, t); + __ha_barrier_store(); + continue; + } + } else { + /* This is a regular task */ + + /* We must be the exclusive owner of the TASK_RUNNING bit, and + * have to be careful that the task is not being manipulated on + * another thread finding it expired in wake_expired_tasks(). + * The TASK_RUNNING bit will be set during these operations, + * they are extremely rare and do not last long so the best to + * do here is to wait. + */ + state = _HA_ATOMIC_LOAD(&t->state); + do { + while (unlikely(state & TASK_RUNNING)) { + __ha_cpu_relax(); + state = _HA_ATOMIC_LOAD(&t->state); + } + } while (!_HA_ATOMIC_CAS(&t->state, &state, (state & TASK_PERSISTENT) | TASK_RUNNING)); + + __ha_barrier_atomic_store(); + + _HA_ATOMIC_DEC(&ha_thread_ctx[tid].tasks_in_list); + + /* Note for below: if TASK_KILLED arrived before we've read the state, we + * directly free the task. Otherwise it will be seen after processing and + * it's freed on the exit path. + */ + if (likely(!(state & TASK_KILLED) && process == process_stream)) + t = process_stream(t, ctx, state); + else if (!(state & TASK_KILLED) && process != NULL) + t = process(t, ctx, state); + else { + task_unlink_wq(t); + __task_free(t); + th_ctx->current = NULL; + __ha_barrier_store(); + /* We don't want max_processed to be decremented if + * we're just freeing a destroyed task, we should only + * do so if we really ran a task. 
+ */ + continue; + } + + /* If there is a pending state we have to wake up the task + * immediately, else we defer it into wait queue + */ + if (t != NULL) { + state = _HA_ATOMIC_LOAD(&t->state); + if (unlikely(state & TASK_KILLED)) { + task_unlink_wq(t); + __task_free(t); + } + else { + task_queue(t); + task_drop_running(t, 0); + } + } + } + + th_ctx->current = NULL; + __ha_barrier_store(); + + /* stats are only registered for non-zero wake dates */ + if (unlikely(th_ctx->sched_wake_date)) + HA_ATOMIC_ADD(&profile_entry->cpu_time, (uint32_t)(now_mono_time() - th_ctx->sched_call_date)); + done++; + } + th_ctx->current_queue = -1; + + return done; +} + +/* The run queue is chronologically sorted in a tree. An insertion counter is + * used to assign a position to each task. This counter may be combined with + * other variables (eg: nice value) to set the final position in the tree. The + * counter may wrap without a problem, of course. We then limit the number of + * tasks processed to 200 in any case, so that general latency remains low and + * so that task positions have a chance to be considered. The function scans + * both the global and local run queues and picks the most urgent task between + * the two. We need to grab the global runqueue lock to touch it so it's taken + * on the very first access to the global run queue and is released as soon as + * it reaches the end. + * + * The function adjusts <next> if a new event is closer. + */ +void process_runnable_tasks() +{ + struct thread_ctx * const tt = th_ctx; + struct eb32_node *lrq; // next local run queue entry + struct eb32_node *grq; // next global run queue entry + struct task *t; + const unsigned int default_weights[TL_CLASSES] = { + [TL_URGENT] = 64, // ~50% of CPU bandwidth for I/O + [TL_NORMAL] = 48, // ~37% of CPU bandwidth for tasks + [TL_BULK] = 16, // ~13% of CPU bandwidth for self-wakers + [TL_HEAVY] = 1, // never more than 1 heavy task at once + }; + unsigned int max[TL_CLASSES]; // max to be run per class + unsigned int max_total; // sum of max above + struct mt_list *tmp_list; + unsigned int queue; + int max_processed; + int lpicked, gpicked; + int heavy_queued = 0; + int budget; + + _HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_STUCK); // this thread is still running + + if (!thread_has_tasks()) { + activity[tid].empty_rq++; + return; + } + + max_processed = global.tune.runqueue_depth; + + if (likely(tg_ctx->niced_tasks)) + max_processed = (max_processed + 3) / 4; + + if (max_processed < th_ctx->rq_total && th_ctx->rq_total <= 2*max_processed) { + /* If the run queue exceeds the budget by up to 50%, let's cut it + * into two identical halves to improve latency. + */ + max_processed = th_ctx->rq_total / 2; + } + + not_done_yet: + max[TL_URGENT] = max[TL_NORMAL] = max[TL_BULK] = 0; + + /* urgent tasklets list gets a default weight of ~50% */ + if ((tt->tl_class_mask & (1 << TL_URGENT)) || + !MT_LIST_ISEMPTY(&tt->shared_tasklet_list)) + max[TL_URGENT] = default_weights[TL_URGENT]; + + /* normal tasklets list gets a default weight of ~37% */ + if ((tt->tl_class_mask & (1 << TL_NORMAL)) || + !eb_is_empty(&th_ctx->rqueue) || !eb_is_empty(&th_ctx->rqueue_shared)) + max[TL_NORMAL] = default_weights[TL_NORMAL]; + + /* bulk tasklets list gets a default weight of ~13% */ + if ((tt->tl_class_mask & (1 << TL_BULK))) + max[TL_BULK] = default_weights[TL_BULK]; + + /* heavy tasks are processed only once and never refilled in a + * call round. That budget is not lost either as we don't reset + * it unless consumed. 
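+ *
+ * For scale: with the default weights 64/48/16/1 (sum 129) and the
+ * default runqueue depth of 200, the fair-share rounding computed just
+ * below yields per-round budgets of roughly 100/75/25/2; the ceiling in
+ * that computation is why the total may slightly exceed max_processed.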
+ */ + if (!heavy_queued) { + if ((tt->tl_class_mask & (1 << TL_HEAVY))) + max[TL_HEAVY] = default_weights[TL_HEAVY]; + else + max[TL_HEAVY] = 0; + heavy_queued = 1; + } + + /* Now compute a fair share of the weights. Total may slightly exceed + * 100% due to rounding, this is not a problem. Note that while in + * theory the sum cannot be NULL as we cannot get there without tasklets + * to process, in practice it seldom happens when multiple writers + * conflict and rollback on MT_LIST_TRY_APPEND(shared_tasklet_list), causing + * a first MT_LIST_ISEMPTY() to succeed for thread_has_task() and the + * one above to finally fail. This is extremely rare and not a problem. + */ + max_total = max[TL_URGENT] + max[TL_NORMAL] + max[TL_BULK] + max[TL_HEAVY]; + if (!max_total) + goto leave; + + for (queue = 0; queue < TL_CLASSES; queue++) + max[queue] = ((unsigned)max_processed * max[queue] + max_total - 1) / max_total; + + /* The heavy queue must never process more than very few tasks at once + * anyway. We set the limit to 1 if running on low_latency scheduling, + * given that we know that other values can have an impact on latency + * (~500us end-to-end connection achieved at 130kcps in SSL), 1 + one + * per 1024 tasks if there is at least one non-heavy task while still + * respecting the ratios above, or 1 + one per 128 tasks if only heavy + * tasks are present. This allows to drain excess SSL handshakes more + * efficiently if the queue becomes congested. + */ + if (max[TL_HEAVY] > 1) { + if (global.tune.options & GTUNE_SCHED_LOW_LATENCY) + budget = 1; + else if (tt->tl_class_mask & ~(1 << TL_HEAVY)) + budget = 1 + tt->rq_total / 1024; + else + budget = 1 + tt->rq_total / 128; + + if (max[TL_HEAVY] > budget) + max[TL_HEAVY] = budget; + } + + lrq = grq = NULL; + + /* pick up to max[TL_NORMAL] regular tasks from prio-ordered run queues */ + /* Note: the grq lock is always held when grq is not null */ + lpicked = gpicked = 0; + budget = max[TL_NORMAL] - tt->tasks_in_list; + while (lpicked + gpicked < budget) { + if (!eb_is_empty(&th_ctx->rqueue_shared) && !grq) { +#ifdef USE_THREAD + HA_SPIN_LOCK(TASK_RQ_LOCK, &th_ctx->rqsh_lock); + grq = eb32_lookup_ge(&th_ctx->rqueue_shared, _HA_ATOMIC_LOAD(&tt->rqueue_ticks) - TIMER_LOOK_BACK); + if (unlikely(!grq)) { + grq = eb32_first(&th_ctx->rqueue_shared); + if (!grq) + HA_SPIN_UNLOCK(TASK_RQ_LOCK, &th_ctx->rqsh_lock); + } +#endif + } + + /* If a global task is available for this thread, it's in grq + * now and the global RQ is locked. 
+ */
+
+ if (!lrq) {
+ lrq = eb32_lookup_ge(&tt->rqueue, _HA_ATOMIC_LOAD(&tt->rqueue_ticks) - TIMER_LOOK_BACK);
+ if (unlikely(!lrq))
+ lrq = eb32_first(&tt->rqueue);
+ }
+
+ if (!lrq && !grq)
+ break;
+
+ if (likely(!grq || (lrq && (int)(lrq->key - grq->key) <= 0))) {
+ t = eb32_entry(lrq, struct task, rq);
+ lrq = eb32_next(lrq);
+ eb32_delete(&t->rq);
+ lpicked++;
+ }
+#ifdef USE_THREAD
+ else {
+ t = eb32_entry(grq, struct task, rq);
+ grq = eb32_next(grq);
+ eb32_delete(&t->rq);
+
+ if (unlikely(!grq)) {
+ grq = eb32_first(&th_ctx->rqueue_shared);
+ if (!grq)
+ HA_SPIN_UNLOCK(TASK_RQ_LOCK, &th_ctx->rqsh_lock);
+ }
+ gpicked++;
+ }
+#endif
+ if (t->nice)
+ _HA_ATOMIC_DEC(&tg_ctx->niced_tasks);
+
+ /* Add it to the local task list */
+ LIST_APPEND(&tt->tasklets[TL_NORMAL], &((struct tasklet *)t)->list);
+ }
+
+ /* release the rqueue lock */
+ if (grq) {
+ HA_SPIN_UNLOCK(TASK_RQ_LOCK, &th_ctx->rqsh_lock);
+ grq = NULL;
+ }
+
+ if (lpicked + gpicked) {
+ tt->tl_class_mask |= 1 << TL_NORMAL;
+ _HA_ATOMIC_ADD(&tt->tasks_in_list, lpicked + gpicked);
+ activity[tid].tasksw += lpicked + gpicked;
+ }
+
+ /* Merge the list of tasklets woken up by other threads into the
+ * main list.
+ */
+ tmp_list = MT_LIST_BEHEAD(&tt->shared_tasklet_list);
+ if (tmp_list) {
+ LIST_SPLICE_END_DETACHED(&tt->tasklets[TL_URGENT], (struct list *)tmp_list);
+ if (!LIST_ISEMPTY(&tt->tasklets[TL_URGENT]))
+ tt->tl_class_mask |= 1 << TL_URGENT;
+ }
+
+ /* execute tasklets in each queue */
+ max_processed -= run_tasks_from_lists(max);
+
+ /* some tasks may have woken other ones up */
+ if (max_processed > 0 && thread_has_tasks())
+ goto not_done_yet;
+
+ leave:
+ if (tt->tl_class_mask)
+ activity[tid].long_rq++;
+}
+
+/*
+ * Delete every task before running the master polling loop
+ */
+void mworker_cleantasks()
+{
+ struct task *t;
+ int i;
+ struct eb32_node *tmp_wq = NULL;
+ struct eb32_node *tmp_rq = NULL;
+
+#ifdef USE_THREAD
+ /* cleanup the global run queue */
+ tmp_rq = eb32_first(&th_ctx->rqueue_shared);
+ while (tmp_rq) {
+ t = eb32_entry(tmp_rq, struct task, rq);
+ tmp_rq = eb32_next(tmp_rq);
+ task_destroy(t);
+ }
+ /* cleanup the timers queue */
+ tmp_wq = eb32_first(&tg_ctx->timers);
+ while (tmp_wq) {
+ t = eb32_entry(tmp_wq, struct task, wq);
+ tmp_wq = eb32_next(tmp_wq);
+ task_destroy(t);
+ }
+#endif
+ /* clean the per thread run queue */
+ for (i = 0; i < global.nbthread; i++) {
+ tmp_rq = eb32_first(&ha_thread_ctx[i].rqueue);
+ while (tmp_rq) {
+ t = eb32_entry(tmp_rq, struct task, rq);
+ tmp_rq = eb32_next(tmp_rq);
+ task_destroy(t);
+ }
+ /* cleanup the per thread timers queue */
+ tmp_wq = eb32_first(&ha_thread_ctx[i].timers);
+ while (tmp_wq) {
+ t = eb32_entry(tmp_wq, struct task, wq);
+ tmp_wq = eb32_next(tmp_wq);
+ task_destroy(t);
+ }
+ }
+}
+
+/* perform minimal initializations */
+static void init_task()
+{
+ int i, q;
+
+ for (i = 0; i < MAX_TGROUPS; i++)
+ memset(&ha_tgroup_ctx[i].timers, 0, sizeof(ha_tgroup_ctx[i].timers));
+
+ for (i = 0; i < MAX_THREADS; i++) {
+ for (q = 0; q < TL_CLASSES; q++)
+ LIST_INIT(&ha_thread_ctx[i].tasklets[q]);
+ MT_LIST_INIT(&ha_thread_ctx[i].shared_tasklet_list);
+ }
+}
+
+/* config parser for global "tune.sched.low-latency", accepts "on" or "off" */
+static int cfg_parse_tune_sched_low_latency(char **args, int section_type, struct proxy *curpx,
+ const struct proxy *defpx, const char *file, int line,
+ char **err)
+{
+ if (too_many_args(1, args, err, NULL))
+ return -1;
+
+ if (strcmp(args[1], "on") == 0)
+ global.tune.options |= GTUNE_SCHED_LOW_LATENCY;
+ else if (strcmp(args[1], "off") == 0)
+ global.tune.options &= ~GTUNE_SCHED_LOW_LATENCY;
+ else {
+ memprintf(err, "'%s' expects either 'on' or 'off' but got '%s'.", args[0], args[1]);
+ return -1;
+ }
+ return 0;
+}
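As an illustrative sketch (not part of this patch), the global keyword parsed above is used as follows; tune.runqueue-depth is shown at its compile-time default of 200 only to relate the two tunables:

    global
        tune.sched.low-latency on
        tune.runqueue-depth 200

When the option is set, the scheduler clamps the TL_HEAVY budget to a single task per round, trading a little throughput for shorter tail latency.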
+
+/* config keyword parsers */
+static struct cfg_kw_list cfg_kws = {ILH, {
+ { CFG_GLOBAL, "tune.sched.low-latency", cfg_parse_tune_sched_low_latency },
+ { 0, NULL, NULL }
+}};
+
+INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws);
+INITCALL0(STG_PREPARE, init_task);
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/src/tcp_act.c b/src/tcp_act.c
new file mode 100644
index 0000000..8b44047
--- /dev/null
+++ b/src/tcp_act.c
@@ -0,0 +1,749 @@
+/*
+ * AF_INET/AF_INET6 SOCK_STREAM protocol layer (tcp)
+ *
+ * Copyright 2000-2013 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <ctype.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include <netinet/tcp.h>
+#include <netinet/in.h>
+
+#include <haproxy/action-t.h>
+#include <haproxy/api.h>
+#include <haproxy/arg.h>
+#include <haproxy/channel.h>
+#include <haproxy/connection.h>
+#include <haproxy/global.h>
+#include <haproxy/http_rules.h>
+#include <haproxy/proto_tcp.h>
+#include <haproxy/proxy.h>
+#include <haproxy/sample.h>
+#include <haproxy/sc_strm.h>
+#include <haproxy/server.h>
+#include <haproxy/session.h>
+#include <haproxy/tcp_rules.h>
+#include <haproxy/tools.h>
+
+static enum act_return tcp_action_attach_srv(struct act_rule *rule, struct proxy *px,
+ struct session *sess, struct stream *s, int flags)
+{
+ struct server *srv = rule->arg.attach_srv.srv;
+ struct sample *name_smp;
+ struct connection *conn = objt_conn(sess->origin);
+ if (!conn)
+ return ACT_RET_ABRT;
+
+ conn_set_reverse(conn, &srv->obj_type);
+
+ if (rule->arg.attach_srv.name) {
+ name_smp = sample_fetch_as_type(sess->fe, sess, s,
+ SMP_OPT_DIR_REQ | SMP_OPT_FINAL,
+ rule->arg.attach_srv.name, SMP_T_STR);
+ /* TODO: strdup the sample buffer */
+ if (name_smp) {
+ struct buffer *buf = &name_smp->data.u.str;
+ char *area = malloc(b_data(buf));
+
+ if (!area)
+ return ACT_RET_ERR;
+
+ conn->reverse.name = b_make(area, b_data(buf), 0, 0);
+ b_ncat(&conn->reverse.name, buf, b_data(buf));
+ }
+ }
+
+ return ACT_RET_CONT;
+}
+
+/*
+ * Execute the "set-src" action. May be called from {tcp,http}request.
+ * It only changes the address and tries to preserve the original port. If the
+ * previous family was neither AF_INET nor AF_INET6, the port is set to zero.
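+ *
+ * Illustrative usage (an editor's sketch, not from the original sources):
+ *
+ *   tcp-request connection set-src src,ipmask(24)
+ *   http-request set-src hdr(x-forwarded-for)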
+ */ +static enum act_return tcp_action_req_set_src(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct connection *cli_conn; + struct sockaddr_storage *src; + struct sample *smp; + + switch (rule->from) { + case ACT_F_TCP_REQ_CON: + cli_conn = objt_conn(sess->origin); + if (!cli_conn || !conn_get_src(cli_conn)) + goto end; + src = cli_conn->src; + break; + + case ACT_F_TCP_REQ_SES: + if (!sess_get_src(sess)) + goto end; + src = sess->src; + break; + + case ACT_F_TCP_REQ_CNT: + case ACT_F_HTTP_REQ: + if (!sc_get_src(s->scf)) + goto end; + src = s->scf->src; + break; + + default: + goto end; + } + + smp = sample_fetch_as_type(px, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL, rule->arg.expr, SMP_T_ADDR); + if (smp) { + int port = get_net_port(src); + + if (smp->data.type == SMP_T_IPV4) { + ((struct sockaddr_in *)src)->sin_family = AF_INET; + ((struct sockaddr_in *)src)->sin_addr.s_addr = smp->data.u.ipv4.s_addr; + ((struct sockaddr_in *)src)->sin_port = port; + } else if (smp->data.type == SMP_T_IPV6) { + ((struct sockaddr_in6 *)src)->sin6_family = AF_INET6; + memcpy(&((struct sockaddr_in6 *)src)->sin6_addr, &smp->data.u.ipv6, sizeof(struct in6_addr)); + ((struct sockaddr_in6 *)src)->sin6_port = port; + } + } + + end: + return ACT_RET_CONT; +} + +/* + * Execute the "set-dst" action. May be called from {tcp,http}request. + * It only changes the address and tries to preserve the original port. If the + * previous family was neither AF_INET nor AF_INET6, the port is set to zero. + */ +static enum act_return tcp_action_req_set_dst(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct connection *cli_conn; + struct sockaddr_storage *dst; + struct sample *smp; + + switch (rule->from) { + case ACT_F_TCP_REQ_CON: + cli_conn = objt_conn(sess->origin); + if (!cli_conn || !conn_get_dst(cli_conn)) + goto end; + dst = cli_conn->dst; + break; + + case ACT_F_TCP_REQ_SES: + if (!sess_get_dst(sess)) + goto end; + dst = sess->dst; + break; + + case ACT_F_TCP_REQ_CNT: + case ACT_F_HTTP_REQ: + if (!sc_get_dst(s->scf)) + goto end; + dst = s->scf->dst; + break; + + default: + goto end; + } + + smp = sample_fetch_as_type(px, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL, rule->arg.expr, SMP_T_ADDR); + if (smp) { + int port = get_net_port(dst); + + if (smp->data.type == SMP_T_IPV4) { + ((struct sockaddr_in *)dst)->sin_family = AF_INET; + ((struct sockaddr_in *)dst)->sin_addr.s_addr = smp->data.u.ipv4.s_addr; + ((struct sockaddr_in *)dst)->sin_port = port; + } else if (smp->data.type == SMP_T_IPV6) { + ((struct sockaddr_in6 *)dst)->sin6_family = AF_INET6; + memcpy(&((struct sockaddr_in6 *)dst)->sin6_addr, &smp->data.u.ipv6, sizeof(struct in6_addr)); + ((struct sockaddr_in6 *)dst)->sin6_port = port; + } + } + + end: + return ACT_RET_CONT; +} + +/* + * Execute the "set-src-port" action. May be called from {tcp,http}request. + * We must test the sin_family before setting the port. If the address family + * is neither AF_INET nor AF_INET6, the address is forced to AF_INET "0.0.0.0" + * and the port is assigned. 
+ */ +static enum act_return tcp_action_req_set_src_port(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct connection *cli_conn; + struct sockaddr_storage *src; + struct sample *smp; + + switch (rule->from) { + case ACT_F_TCP_REQ_CON: + cli_conn = objt_conn(sess->origin); + if (!cli_conn || !conn_get_src(cli_conn)) + goto end; + src = cli_conn->src; + break; + + case ACT_F_TCP_REQ_SES: + if (!sess_get_src(sess)) + goto end; + src = sess->src; + break; + + case ACT_F_TCP_REQ_CNT: + case ACT_F_HTTP_REQ: + if (!sc_get_src(s->scf)) + goto end; + src = s->scf->src; + break; + + default: + goto end; + } + + smp = sample_fetch_as_type(px, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL, rule->arg.expr, SMP_T_SINT); + if (smp) { + if (src->ss_family == AF_INET6) { + ((struct sockaddr_in6 *)src)->sin6_port = htons(smp->data.u.sint); + } else { + if (src->ss_family != AF_INET) { + src->ss_family = AF_INET; + ((struct sockaddr_in *)src)->sin_addr.s_addr = 0; + } + ((struct sockaddr_in *)src)->sin_port = htons(smp->data.u.sint); + } + } + + end: + return ACT_RET_CONT; +} + +/* + * Execute the "set-dst-port" action. May be called from {tcp,http}request. + * We must test the sin_family before setting the port. If the address family + * is neither AF_INET nor AF_INET6, the address is forced to AF_INET "0.0.0.0" + * and the port is assigned. + */ +static enum act_return tcp_action_req_set_dst_port(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct connection *cli_conn; + struct sockaddr_storage *dst; + struct sample *smp; + + switch (rule->from) { + case ACT_F_TCP_REQ_CON: + cli_conn = objt_conn(sess->origin); + if (!cli_conn || !conn_get_dst(cli_conn)) + goto end; + dst = cli_conn->dst; + break; + + case ACT_F_TCP_REQ_SES: + if (!sess_get_dst(sess)) + goto end; + dst = sess->dst; + break; + + case ACT_F_TCP_REQ_CNT: + case ACT_F_HTTP_REQ: + if (!sc_get_dst(s->scf)) + goto end; + dst = s->scf->dst; + break; + + default: + goto end; + } + + smp = sample_fetch_as_type(px, sess, s, SMP_OPT_DIR_REQ|SMP_OPT_FINAL, rule->arg.expr, SMP_T_SINT); + if (smp) { + if (dst->ss_family == AF_INET6) { + ((struct sockaddr_in6 *)dst)->sin6_port = htons(smp->data.u.sint); + } else { + if (dst->ss_family != AF_INET) { + dst->ss_family = AF_INET; + ((struct sockaddr_in *)dst)->sin_addr.s_addr = 0; + } + ((struct sockaddr_in *)dst)->sin_port = htons(smp->data.u.sint); + } + } + + end: + return ACT_RET_CONT; +} + +/* Executes the "silent-drop" action. May be called from {tcp,http}{request,response}. + * If rule->arg.act.p[0] is 0, TCP_REPAIR is tried first, with a fallback to + * sending a RST with TTL 1 towards the client. If it is [1-255], we will skip + * TCP_REPAIR and prepare the socket to send a RST with the requested TTL when + * the connection is killed by channel_abort(). 
+ */ +static enum act_return tcp_exec_action_silent_drop(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *strm, int flags) +{ + struct connection *conn = objt_conn(sess->origin); + unsigned int ttl __maybe_unused = (uintptr_t)rule->arg.act.p[0]; + char tcp_repair_enabled __maybe_unused; + + if (ttl == 0) { + tcp_repair_enabled = 1; + ttl = 1; + } else { + tcp_repair_enabled = 0; + } + + if (!conn) + goto out; + + if (!conn_ctrl_ready(conn)) + goto out; + +#ifdef TCP_QUICKACK + /* drain is needed only to send the quick ACK */ + conn_ctrl_drain(conn); + + /* re-enable quickack if it was disabled to ack all data and avoid + * retransmits from the client that might trigger a real reset. + */ + setsockopt(conn->handle.fd, IPPROTO_TCP, TCP_QUICKACK, &one, sizeof(one)); +#endif + /* lingering must absolutely be disabled so that we don't send a + * shutdown(), this is critical to the TCP_REPAIR trick. When no stream + * is present, returning with ERR will cause lingering to be disabled. + */ + if (strm) + strm->scf->flags |= SC_FL_NOLINGER; + + if (conn->flags & CO_FL_FDLESS) + goto out; + + /* We're on the client-facing side, we must force to disable lingering to + * ensure we will use an RST exclusively and kill any pending data. + */ + HA_ATOMIC_OR(&fdtab[conn->handle.fd].state, FD_LINGER_RISK); + +#ifdef TCP_REPAIR + /* try to put socket in repair mode if sending a RST was not requested by + * config. this often fails due to missing permissions (CAP_NET_ADMIN capability) + */ + if (tcp_repair_enabled && (setsockopt(conn->handle.fd, IPPROTO_TCP, TCP_REPAIR, &one, sizeof(one)) == 0)) { + /* socket will be quiet now */ + goto out; + } +#endif + + /* Either TCP_REPAIR is not defined, it failed (eg: permissions), or was + * not executed because a RST with a specific TTL was requested to be sent. + * Set the TTL of the client connection before the connection is killed + * by channel_abort and a RST packet will be emitted by the TCP/IP stack. 
+ */ +#ifdef IP_TTL + if (conn->src && conn->src->ss_family == AF_INET) + setsockopt(conn->handle.fd, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl)); +#endif +#ifdef IPV6_UNICAST_HOPS + if (conn->src && conn->src->ss_family == AF_INET6) + setsockopt(conn->handle.fd, IPPROTO_IPV6, IPV6_UNICAST_HOPS, &ttl, sizeof(ttl)); +#endif + out: + /* kill the stream if any */ + if (strm) { + stream_abort(strm); + strm->req.analysers &= AN_REQ_FLT_END; + strm->res.analysers &= AN_RES_FLT_END; + if (strm->flags & SF_BE_ASSIGNED) + _HA_ATOMIC_INC(&strm->be->be_counters.denied_req); + if (!(strm->flags & SF_ERR_MASK)) + strm->flags |= SF_ERR_PRXCOND; + if (!(strm->flags & SF_FINST_MASK)) + strm->flags |= SF_FINST_R; + } + + _HA_ATOMIC_INC(&sess->fe->fe_counters.denied_req); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->denied_req); + + return ACT_RET_ABRT; +} + + +#if defined(SO_MARK) || defined(SO_USER_COOKIE) || defined(SO_RTABLE) +static enum act_return tcp_action_set_mark(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + conn_set_mark(objt_conn(sess->origin), (uintptr_t)rule->arg.act.p[0]); + return ACT_RET_CONT; +} +#endif + +#ifdef IP_TOS +static enum act_return tcp_action_set_tos(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + conn_set_tos(objt_conn(sess->origin), (uintptr_t)rule->arg.act.p[0]); + return ACT_RET_CONT; +} +#endif + +/* + * Release the sample expr when releasing attach-srv action + */ +static void release_attach_srv_action(struct act_rule *rule) +{ + ha_free(&rule->arg.attach_srv.srvname); + release_sample_expr(rule->arg.attach_srv.name); +} + +/* + * Release the sample expr when releasing a set src/dst action + */ +static void release_set_src_dst_action(struct act_rule *rule) +{ + release_sample_expr(rule->arg.expr); +} + +static int tcp_check_attach_srv(struct act_rule *rule, struct proxy *px, char **err) +{ + struct proxy *be = NULL; + struct server *srv = NULL; + char *name = rule->arg.attach_srv.srvname; + struct ist be_name, sv_name; + + if (px->mode != PR_MODE_HTTP) { + memprintf(err, "attach-srv rule requires HTTP proxy mode"); + return 0; + } + + sv_name = ist(name); + be_name = istsplit(&sv_name, '/'); + if (!istlen(sv_name)) { + memprintf(err, "attach-srv rule: invalid server name '%s'", name); + return 0; + } + + if (!(be = proxy_be_by_name(ist0(be_name)))) { + memprintf(err, "attach-srv rule: no such backend '%s/%s'", ist0(be_name), ist0(sv_name)); + return 0; + } + if (!(srv = server_find_by_name(be, ist0(sv_name)))) { + memprintf(err, "attach-srv rule: no such server '%s/%s'", ist0(be_name), ist0(sv_name)); + return 0; + } + + if ((rule->arg.attach_srv.name && (!srv->use_ssl || !srv->sni_expr)) || + (!rule->arg.attach_srv.name && srv->use_ssl && srv->sni_expr)) { + memprintf(err, "attach-srv rule: connection will never be used; either specify name argument in conjunction with defined SSL SNI on targeted server or none of these"); + return 0; + } + + rule->arg.attach_srv.srv = srv; + + return 1; +} + +static enum act_parse_ret tcp_parse_attach_srv(const char **args, int *cur_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + char *srvname; + struct sample_expr *expr; + + /* TODO duplicated code from check_kw_experimental() */ + if (!experimental_directives_allowed) { + memprintf(err, "parsing [%s:%d] : '%s' action is experimental, must be allowed via a global 'expose-experimental-directives'", + px->conf.args.file, 
px->conf.args.line, args[2]); + return ACT_RET_PRS_ERR; + } + mark_tainted(TAINTED_CONFIG_EXP_KW_DECLARED); + + rule->action = ACT_CUSTOM; + rule->action_ptr = tcp_action_attach_srv; + rule->release_ptr = release_attach_srv_action; + rule->check_ptr = tcp_check_attach_srv; + rule->arg.attach_srv.srvname = NULL; + rule->arg.attach_srv.name = NULL; + + srvname = my_strndup(args[*cur_arg], strlen(args[*cur_arg])); + if (!srvname) + goto err; + rule->arg.attach_srv.srvname = srvname; + + ++(*cur_arg); + + if (strcmp(args[*cur_arg], "name") == 0) { + if (!*args[*cur_arg + 1]) { + memprintf(err, "missing name value"); + return ACT_RET_PRS_ERR; + } + ++(*cur_arg); + + expr = sample_parse_expr((char **)args, cur_arg, px->conf.args.file, px->conf.args.line, + err, &px->conf.args, NULL); + if (!expr) + return ACT_RET_PRS_ERR; + + rule->arg.attach_srv.name = expr; + rule->release_ptr = release_attach_srv_action; + } + + return ACT_RET_PRS_OK; + + err: + ha_free(&rule->arg.attach_srv.srvname); + release_sample_expr(rule->arg.attach_srv.name); + return ACT_RET_PRS_ERR; +} + +/* parse "set-{src,dst}[-port]" action */ +static enum act_parse_ret tcp_parse_set_src_dst(const char **args, int *orig_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + int cur_arg; + struct sample_expr *expr; + unsigned int where; + + cur_arg = *orig_arg; + expr = sample_parse_expr((char **)args, &cur_arg, px->conf.args.file, px->conf.args.line, err, &px->conf.args, NULL); + if (!expr) + return ACT_RET_PRS_ERR; + + where = 0; + if (px->cap & PR_CAP_FE) + where |= SMP_VAL_FE_HRQ_HDR; + if (px->cap & PR_CAP_BE) + where |= SMP_VAL_BE_HRQ_HDR; + + if (!(expr->fetch->val & where)) { + memprintf(err, + "fetch method '%s' extracts information from '%s', none of which is available here", + args[cur_arg-1], sample_src_names(expr->fetch->use)); + free(expr); + return ACT_RET_PRS_ERR; + } + rule->arg.expr = expr; + rule->action = ACT_CUSTOM; + + if (strcmp(args[*orig_arg - 1], "set-src") == 0) { + rule->action_ptr = tcp_action_req_set_src; + } else if (strcmp(args[*orig_arg - 1], "set-src-port") == 0) { + rule->action_ptr = tcp_action_req_set_src_port; + } else if (strcmp(args[*orig_arg - 1], "set-dst") == 0) { + rule->action_ptr = tcp_action_req_set_dst; + } else if (strcmp(args[*orig_arg - 1], "set-dst-port") == 0) { + rule->action_ptr = tcp_action_req_set_dst_port; + } else { + return ACT_RET_PRS_ERR; + } + + rule->release_ptr = release_set_src_dst_action; + (*orig_arg)++; + + return ACT_RET_PRS_OK; +} + + +/* Parse a "set-mark" action. It takes the MARK value as argument. It returns + * ACT_RET_PRS_OK on success, ACT_RET_PRS_ERR on error. + */ +static enum act_parse_ret tcp_parse_set_mark(const char **args, int *cur_arg, struct proxy *px, + struct act_rule *rule, char **err) +{ +#if defined(SO_MARK) || defined(SO_USER_COOKIE) || defined(SO_RTABLE) + char *endp; + unsigned int mark; + + if (!*args[*cur_arg]) { + memprintf(err, "expects exactly 1 argument (integer/hex value)"); + return ACT_RET_PRS_ERR; + } + mark = strtoul(args[*cur_arg], &endp, 0); + if (endp && *endp != '\0') { + memprintf(err, "invalid character starting at '%s' (integer/hex value expected)", endp); + return ACT_RET_PRS_ERR; + } + + (*cur_arg)++; + + /* Register processing function. 
*/
+ rule->action_ptr = tcp_action_set_mark;
+ rule->action = ACT_CUSTOM;
+ rule->arg.act.p[0] = (void *)(uintptr_t)mark;
+ global.last_checks |= LSTCHK_NETADM;
+ return ACT_RET_PRS_OK;
+#else
+ memprintf(err, "not supported on this platform (SO_MARK|SO_USER_COOKIE|SO_RTABLE undefined)");
+ return ACT_RET_PRS_ERR;
+#endif
+}
+
+
+/* Parse a "set-tos" action. It takes the TOS value as argument. It returns
+ * ACT_RET_PRS_OK on success, ACT_RET_PRS_ERR on error.
+ */
+static enum act_parse_ret tcp_parse_set_tos(const char **args, int *cur_arg, struct proxy *px,
+ struct act_rule *rule, char **err)
+{
+#ifdef IP_TOS
+ char *endp;
+ int tos;
+
+ if (!*args[*cur_arg]) {
+ memprintf(err, "expects exactly 1 argument (integer/hex value)");
+ return ACT_RET_PRS_ERR;
+ }
+ tos = strtol(args[*cur_arg], &endp, 0);
+ if (endp && *endp != '\0') {
+ memprintf(err, "invalid character starting at '%s' (integer/hex value expected)", endp);
+ return ACT_RET_PRS_ERR;
+ }
+
+ (*cur_arg)++;
+
+ /* Register processing function. */
+ rule->action_ptr = tcp_action_set_tos;
+ rule->action = ACT_CUSTOM;
+ rule->arg.act.p[0] = (void *)(uintptr_t)tos;
+ return ACT_RET_PRS_OK;
+#else
+ memprintf(err, "not supported on this platform (IP_TOS undefined)");
+ return ACT_RET_PRS_ERR;
+#endif
+}
+
+/* Parse a "silent-drop" action. It may take two optional arguments defining a
+ * "rst-ttl" parameter. It returns ACT_RET_PRS_OK on success, ACT_RET_PRS_ERR
+ * on error.
+ */
+static enum act_parse_ret tcp_parse_silent_drop(const char **args, int *cur_arg, struct proxy *px,
+ struct act_rule *rule, char **err)
+{
+ unsigned int rst_ttl = 0;
+ char *endp;
+
+ rule->action = ACT_CUSTOM;
+ rule->action_ptr = tcp_exec_action_silent_drop;
+
+ if (strcmp(args[*cur_arg], "rst-ttl") == 0) {
+ if (!*args[*cur_arg + 1]) {
+ memprintf(err, "missing rst-ttl value");
+ return ACT_RET_PRS_ERR;
+ }
+
+ rst_ttl = (unsigned int)strtoul(args[*cur_arg + 1], &endp, 0);
+
+ if (endp && *endp != '\0') {
+ memprintf(err, "invalid character starting at '%s' (value 1-255 expected)",
+ endp);
+ return ACT_RET_PRS_ERR;
+ }
+ if ((rst_ttl == 0) || (rst_ttl > 255)) {
+ memprintf(err, "valid rst-ttl values are [1-255]");
+ return ACT_RET_PRS_ERR;
+ }
+
+ *cur_arg += 2;
+ }
+
+ rule->arg.act.p[0] = (void *)(uintptr_t)rst_ttl;
+ return ACT_RET_PRS_OK;
+}
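For illustration (an editor's sketch, not part of this patch; the ACL file name is hypothetical), the parsers above map to configuration lines such as:

    tcp-request connection set-mark 0x2
    tcp-request connection silent-drop if { src -f /etc/haproxy/abusers.lst }
    tcp-request content silent-drop rst-ttl 2

Without rst-ttl, silent-drop first tries TCP_REPAIR and only falls back to sending a TTL-1 RST, as implemented in tcp_exec_action_silent_drop() above.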
+
+
+static struct action_kw_list tcp_req_conn_actions = {ILH, {
+ { "set-dst" , tcp_parse_set_src_dst },
+ { "set-dst-port", tcp_parse_set_src_dst },
+ { "set-mark", tcp_parse_set_mark },
+ { "set-src", tcp_parse_set_src_dst },
+ { "set-src-port", tcp_parse_set_src_dst },
+ { "set-tos", tcp_parse_set_tos },
+ { "silent-drop", tcp_parse_silent_drop },
+ { /* END */ }
+}};
+
+INITCALL1(STG_REGISTER, tcp_req_conn_keywords_register, &tcp_req_conn_actions);
+
+static struct action_kw_list tcp_req_sess_actions = {ILH, {
+ { "attach-srv" , tcp_parse_attach_srv },
+ { "set-dst" , tcp_parse_set_src_dst },
+ { "set-dst-port", tcp_parse_set_src_dst },
+ { "set-mark", tcp_parse_set_mark },
+ { "set-src", tcp_parse_set_src_dst },
+ { "set-src-port", tcp_parse_set_src_dst },
+ { "set-tos", tcp_parse_set_tos },
+ { "silent-drop", tcp_parse_silent_drop },
+ { /* END */ }
+}};
+
+INITCALL1(STG_REGISTER, tcp_req_sess_keywords_register, &tcp_req_sess_actions);
+
+static struct action_kw_list tcp_req_cont_actions = {ILH, {
+ { "set-src", tcp_parse_set_src_dst },
+ { "set-src-port", tcp_parse_set_src_dst },
+ { "set-dst" , tcp_parse_set_src_dst },
+ { "set-dst-port", tcp_parse_set_src_dst },
+ { "set-mark", tcp_parse_set_mark },
+ { "set-tos", tcp_parse_set_tos },
+ { "silent-drop", tcp_parse_silent_drop },
+ { /* END */ }
+}};
+
+INITCALL1(STG_REGISTER, tcp_req_cont_keywords_register, &tcp_req_cont_actions);
+
+static struct action_kw_list tcp_res_cont_actions = {ILH, {
+ { "set-mark", tcp_parse_set_mark },
+ { "set-tos", tcp_parse_set_tos },
+ { "silent-drop", tcp_parse_silent_drop },
+ { /* END */ }
+}};
+
+INITCALL1(STG_REGISTER, tcp_res_cont_keywords_register, &tcp_res_cont_actions);
+
+static struct action_kw_list http_req_actions = {ILH, {
+ { "set-dst", tcp_parse_set_src_dst },
+ { "set-dst-port", tcp_parse_set_src_dst },
+ { "set-mark", tcp_parse_set_mark },
+ { "set-src", tcp_parse_set_src_dst },
+ { "set-src-port", tcp_parse_set_src_dst },
+ { "set-tos", tcp_parse_set_tos },
+ { "silent-drop", tcp_parse_silent_drop },
+ { /* END */ }
+}};
+
+INITCALL1(STG_REGISTER, http_req_keywords_register, &http_req_actions);
+
+static struct action_kw_list http_res_actions = {ILH, {
+ { "set-mark", tcp_parse_set_mark },
+ { "set-tos", tcp_parse_set_tos },
+ { "silent-drop", tcp_parse_silent_drop },
+ { /* END */ }
+}};
+
+INITCALL1(STG_REGISTER, http_res_keywords_register, &http_res_actions);
+
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/src/tcp_rules.c b/src/tcp_rules.c
new file mode 100644
index 0000000..9ce6c90
--- /dev/null
+++ b/src/tcp_rules.c
@@ -0,0 +1,1428 @@
+/*
+ * "tcp" rules processing
+ *
+ * Copyright 2000-2016 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+#include <haproxy/acl.h>
+#include <haproxy/action.h>
+#include <haproxy/api.h>
+#include <haproxy/arg-t.h>
+#include <haproxy/capture-t.h>
+#include <haproxy/cfgparse.h>
+#include <haproxy/channel.h>
+#include <haproxy/connection.h>
+#include <haproxy/global.h>
+#include <haproxy/list.h>
+#include <haproxy/log.h>
+#include <haproxy/proxy.h>
+#include <haproxy/sample.h>
+#include <haproxy/sc_strm.h>
+#include <haproxy/stconn.h>
+#include <haproxy/stick_table.h>
+#include <haproxy/stream-t.h>
+#include <haproxy/tcp_rules.h>
+#include <haproxy/ticks.h>
+#include <haproxy/tools.h>
+#include <haproxy/trace.h>
+
+
+#define TRACE_SOURCE &trace_strm
+
+/* List heads of all known action keywords for the "tcp-request" and
+ * "tcp-response" rule sets */
+struct list tcp_req_conn_keywords = LIST_HEAD_INIT(tcp_req_conn_keywords);
+struct list tcp_req_sess_keywords = LIST_HEAD_INIT(tcp_req_sess_keywords);
+struct list tcp_req_cont_keywords = LIST_HEAD_INIT(tcp_req_cont_keywords);
+struct list tcp_res_cont_keywords = LIST_HEAD_INIT(tcp_res_cont_keywords);
+
+/*
+ * Register keywords.
+ */
+void tcp_req_conn_keywords_register(struct action_kw_list *kw_list)
+{
+ LIST_APPEND(&tcp_req_conn_keywords, &kw_list->list);
+}
+
+void tcp_req_sess_keywords_register(struct action_kw_list *kw_list)
+{
+ LIST_APPEND(&tcp_req_sess_keywords, &kw_list->list);
+}
+
+void tcp_req_cont_keywords_register(struct action_kw_list *kw_list)
+{
+ LIST_APPEND(&tcp_req_cont_keywords, &kw_list->list);
+}
+
+void tcp_res_cont_keywords_register(struct action_kw_list *kw_list)
+{
+ LIST_APPEND(&tcp_res_cont_keywords, &kw_list->list);
+}
+
+/*
+ * Return the struct action_kw associated with a given keyword.
+ */
+struct action_kw *tcp_req_conn_action(const char *kw)
+{
+ return action_lookup(&tcp_req_conn_keywords, kw);
+}
+
+struct action_kw *tcp_req_sess_action(const char *kw)
+{
+ return action_lookup(&tcp_req_sess_keywords, kw);
+}
+
+struct action_kw *tcp_req_cont_action(const char *kw)
+{
+ return action_lookup(&tcp_req_cont_keywords, kw);
+}
+
+struct action_kw *tcp_res_cont_action(const char *kw)
+{
+ return action_lookup(&tcp_res_cont_keywords, kw);
+}
+
+/* This function performs the TCP request analysis on the current request. It
+ * returns 1 if the processing can continue on next analysers, or zero if it
+ * needs more data, encounters an error, or wants to immediately abort the
+ * request. It relies on buffer flags, and updates s->req->analysers. The
+ * function may be called for frontend rules and backend rules. It only relies
+ * on the backend pointer so this works for both cases.
+ */
+int tcp_inspect_request(struct stream *s, struct channel *req, int an_bit)
+{
+ struct list *def_rules, *rules;
+ struct session *sess = s->sess;
+ struct act_rule *rule;
+ int partial;
+ int act_opts = 0;
+
+ DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_TCP_ANA, s);
+
+ def_rules = ((s->be->defpx &&
+ (sess->fe->mode == PR_MODE_TCP || sess->fe->mode == PR_MODE_HTTP) &&
+ (an_bit == AN_REQ_INSPECT_FE || s->be->defpx != sess->fe->defpx)) ? &s->be->defpx->tcp_req.inspect_rules : NULL);
+ rules = &s->be->tcp_req.inspect_rules;
+
+ /* We don't know whether we have enough data, so we must proceed
+ * this way:
+ * - iterate through all rules in their declaration order
+ * - if one rule returns MISS, it means the inspect delay is
+ * not over yet, then return immediately, otherwise consider
+ * it as a non-match.
+ * - if one rule returns OK, then return OK
+ * - if one rule returns KO, then return KO
+ */
+
+ if ((s->scf->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) || channel_full(req, global.tune.maxrewrite) ||
+ sc_waiting_room(s->scf) ||
+ !s->be->tcp_req.inspect_delay || tick_is_expired(s->rules_exp, now_ms)) {
+ partial = SMP_OPT_FINAL;
+ /* Action may yield while the inspect_delay is not expired and there is no read error */
+ if ((s->scf->flags & SC_FL_ERROR) || !s->be->tcp_req.inspect_delay || tick_is_expired(s->rules_exp, now_ms))
+ act_opts |= ACT_OPT_FINAL;
+ }
+ else
+ partial = 0;
+
+ /* If the "current_rule_list" matches the executed rule list, we are in
+ * a resume condition. If a resume is needed it is always in the action
+ * and never in the ACL or converters. In this case, we initialise the
+ * current rule, and go to the action execution point.
+ */
+ if (s->current_rule) {
+ rule = s->current_rule;
+ s->current_rule = NULL;
+ if ((def_rules && s->current_rule_list == def_rules) || s->current_rule_list == rules)
+ goto resume_execution;
+ }
+ s->current_rule_list = ((!def_rules || s->current_rule_list == def_rules) ?
rules : def_rules); + + restart: + list_for_each_entry(rule, s->current_rule_list, list) { + enum acl_test_res ret = ACL_TEST_PASS; + + if (rule->cond) { + ret = acl_exec_cond(rule->cond, s->be, sess, s, SMP_OPT_DIR_REQ | partial); + if (ret == ACL_TEST_MISS) + goto missing_data; + + ret = acl_pass(ret); + if (rule->cond->pol == ACL_COND_UNLESS) + ret = !ret; + } + + if (ret) { + act_opts |= ACT_OPT_FIRST; +resume_execution: + /* Always call the action function if defined */ + if (rule->action_ptr) { + switch (rule->action_ptr(rule, s->be, s->sess, s, act_opts)) { + case ACT_RET_CONT: + break; + case ACT_RET_STOP: + case ACT_RET_DONE: + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + case ACT_RET_YIELD: + s->current_rule = rule; + if (act_opts & ACT_OPT_FINAL) { + send_log(s->be, LOG_WARNING, + "Internal error: yield not allowed if the inspect-delay expired " + "for the tcp-request content actions."); + goto internal; + } + goto missing_data; + case ACT_RET_DENY: + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto deny; + case ACT_RET_ABRT: + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto abort; + case ACT_RET_ERR: + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto internal; + case ACT_RET_INV: + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto invalid; + } + continue; /* eval the next rule */ + } + + /* If not action function defined, check for known actions */ + if (rule->action == ACT_ACTION_ALLOW) { + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + } + else if (rule->action == ACT_ACTION_DENY) { + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto deny; + } + } + } + + if (def_rules && s->current_rule_list == def_rules) { + s->current_rule_list = rules; + goto restart; + } + + end: + /* if we get there, it means we have no rule which matches, or + * we have an explicit accept, so we apply the default accept. + */ + req->analysers &= ~an_bit; + s->current_rule = s->current_rule_list = NULL; + req->analyse_exp = s->rules_exp = TICK_ETERNITY; + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_TCP_ANA, s); + return 1; + + missing_data: + channel_dont_connect(req); + /* just set the request timeout once at the beginning of the request */ + if (!tick_isset(s->rules_exp) && s->be->tcp_req.inspect_delay) + s->rules_exp = tick_add(now_ms, s->be->tcp_req.inspect_delay); + req->analyse_exp = tick_first((tick_is_expired(req->analyse_exp, now_ms) ? 
0 : req->analyse_exp), s->rules_exp);
+ DBG_TRACE_DEVEL("waiting for more data", STRM_EV_STRM_ANA|STRM_EV_TCP_ANA, s);
+ return 0;
+
+ deny:
+ _HA_ATOMIC_INC(&sess->fe->fe_counters.denied_req);
+ if (sess->listener && sess->listener->counters)
+ _HA_ATOMIC_INC(&sess->listener->counters->denied_req);
+ goto reject;
+
+ internal:
+ _HA_ATOMIC_INC(&sess->fe->fe_counters.internal_errors);
+ if (sess->listener && sess->listener->counters)
+ _HA_ATOMIC_INC(&sess->listener->counters->internal_errors);
+ if (!(s->flags & SF_ERR_MASK))
+ s->flags |= SF_ERR_INTERNAL;
+ goto reject;
+
+ invalid:
+ _HA_ATOMIC_INC(&sess->fe->fe_counters.failed_req);
+ if (sess->listener && sess->listener->counters)
+ _HA_ATOMIC_INC(&sess->listener->counters->failed_req);
+
+ reject:
+ sc_must_kill_conn(s->scf);
+ stream_abort(s);
+
+ abort:
+ req->analysers &= AN_REQ_FLT_END;
+ s->current_rule = s->current_rule_list = NULL;
+ req->analyse_exp = s->rules_exp = TICK_ETERNITY;
+
+ if (!(s->flags & SF_ERR_MASK))
+ s->flags |= SF_ERR_PRXCOND;
+ if (!(s->flags & SF_FINST_MASK))
+ s->flags |= SF_FINST_R;
+ DBG_TRACE_DEVEL("leaving on error|deny|abort", STRM_EV_STRM_ANA|STRM_EV_TCP_ANA|STRM_EV_TCP_ERR, s);
+ return 0;
+}
+
+/* This function performs the TCP response analysis on the current response. It
+ * returns 1 if the processing can continue on next analysers, or zero if it
+ * needs more data, encounters an error, or wants to immediately abort the
+ * response. It relies on buffer flags, and updates s->rep->analysers. The
+ * function may be called for backend rules.
+ */
+int tcp_inspect_response(struct stream *s, struct channel *rep, int an_bit)
+{
+ struct list *def_rules, *rules;
+ struct session *sess = s->sess;
+ struct act_rule *rule;
+ int partial;
+ int act_opts = 0;
+
+ DBG_TRACE_ENTER(STRM_EV_STRM_ANA|STRM_EV_TCP_ANA, s);
+
+ def_rules = (s->be->defpx && (s->be->mode == PR_MODE_TCP || s->be->mode == PR_MODE_HTTP) ? &s->be->defpx->tcp_rep.inspect_rules : NULL);
+ rules = &s->be->tcp_rep.inspect_rules;
+
+ /* We don't know whether we have enough data, so we must proceed
+ * this way:
+ * - iterate through all rules in their declaration order
+ * - if one rule returns MISS, it means the inspect delay is
+ * not over yet, then return immediately, otherwise consider
+ * it as a non-match.
+ * - if one rule returns OK, then return OK
+ * - if one rule returns KO, then return KO
+ */
+ if ((s->scb->flags & (SC_FL_EOS|SC_FL_ABRT_DONE)) || channel_full(rep, global.tune.maxrewrite) ||
+ sc_waiting_room(s->scb) ||
+ !s->be->tcp_rep.inspect_delay || tick_is_expired(s->rules_exp, now_ms)) {
+ partial = SMP_OPT_FINAL;
+ /* Action may yield while the inspect_delay is not expired and there is no read error */
+ if ((s->scb->flags & SC_FL_ERROR) || !s->be->tcp_rep.inspect_delay || tick_is_expired(s->rules_exp, now_ms))
+ act_opts |= ACT_OPT_FINAL;
+ }
+ else
+ partial = 0;
+
+ /* If the "current_rule_list" matches the executed rule list, we are in
+ * a resume condition. If a resume is needed it is always in the action
+ * and never in the ACL or converters. In this case, we initialise the
+ * current rule, and go to the action execution point.
+ */
+ if (s->current_rule) {
+ rule = s->current_rule;
+ s->current_rule = NULL;
+ if ((def_rules && s->current_rule_list == def_rules) || s->current_rule_list == rules)
+ goto resume_execution;
+ }
+ s->current_rule_list = ((!def_rules || s->current_rule_list == def_rules) ?
rules : def_rules); + + restart: + list_for_each_entry(rule, s->current_rule_list, list) { + enum acl_test_res ret = ACL_TEST_PASS; + + if (rule->cond) { + ret = acl_exec_cond(rule->cond, s->be, sess, s, SMP_OPT_DIR_RES | partial); + if (ret == ACL_TEST_MISS) + goto missing_data; + + ret = acl_pass(ret); + if (rule->cond->pol == ACL_COND_UNLESS) + ret = !ret; + } + + if (ret) { + act_opts |= ACT_OPT_FIRST; +resume_execution: + /* Always call the action function if defined */ + if (rule->action_ptr) { + switch (rule->action_ptr(rule, s->be, s->sess, s, act_opts)) { + case ACT_RET_CONT: + break; + case ACT_RET_STOP: + case ACT_RET_DONE: + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + case ACT_RET_YIELD: + s->current_rule = rule; + if (act_opts & ACT_OPT_FINAL) { + send_log(s->be, LOG_WARNING, + "Internal error: yield not allowed if the inspect-delay expired " + "for the tcp-response content actions."); + goto internal; + } + channel_dont_close(rep); + goto missing_data; + case ACT_RET_DENY: + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto deny; + case ACT_RET_ABRT: + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto abort; + case ACT_RET_ERR: + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto internal; + case ACT_RET_INV: + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto invalid; + } + continue; /* eval the next rule */ + } + + /* If not action function defined, check for known actions */ + if (rule->action == ACT_ACTION_ALLOW) { + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + } + else if (rule->action == ACT_ACTION_DENY) { + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto deny; + } + else if (rule->action == ACT_TCP_CLOSE) { + s->scb->flags |= SC_FL_NOLINGER | SC_FL_NOHALF; + sc_must_kill_conn(s->scb); + sc_abort(s->scb); + sc_shutdown(s->scb); + s->last_rule_file = rule->conf.file; + s->last_rule_line = rule->conf.line; + goto end; + } + } + } + + if (def_rules && s->current_rule_list == def_rules) { + s->current_rule_list = rules; + goto restart; + } + + end: + /* if we get there, it means we have no rule which matches, or + * we have an explicit accept, so we apply the default accept. + */ + rep->analysers &= ~an_bit; + s->current_rule = s->current_rule_list = NULL; + rep->analyse_exp = s->rules_exp = TICK_ETERNITY; + DBG_TRACE_LEAVE(STRM_EV_STRM_ANA|STRM_EV_TCP_ANA, s); + return 1; + + missing_data: + /* just set the analyser timeout once at the beginning of the response */ + if (!tick_isset(s->rules_exp) && s->be->tcp_rep.inspect_delay) + s->rules_exp = tick_add(now_ms, s->be->tcp_rep.inspect_delay); + rep->analyse_exp = tick_first((tick_is_expired(rep->analyse_exp, now_ms) ? 
0 : rep->analyse_exp), s->rules_exp); + DBG_TRACE_DEVEL("waiting for more data", STRM_EV_STRM_ANA|STRM_EV_TCP_ANA, s); + return 0; + + deny: + _HA_ATOMIC_INC(&s->sess->fe->fe_counters.denied_resp); + _HA_ATOMIC_INC(&s->be->be_counters.denied_resp); + if (s->sess->listener && s->sess->listener->counters) + _HA_ATOMIC_INC(&s->sess->listener->counters->denied_resp); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.denied_resp); + goto reject; + + internal: + _HA_ATOMIC_INC(&s->sess->fe->fe_counters.internal_errors); + _HA_ATOMIC_INC(&s->be->be_counters.internal_errors); + if (s->sess->listener && s->sess->listener->counters) + _HA_ATOMIC_INC(&s->sess->listener->counters->internal_errors); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.internal_errors); + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_INTERNAL; + goto reject; + + invalid: + _HA_ATOMIC_INC(&s->be->be_counters.failed_resp); + if (objt_server(s->target)) + _HA_ATOMIC_INC(&__objt_server(s->target)->counters.failed_resp); + + reject: + sc_must_kill_conn(s->scb); + stream_abort(s); + + abort: + rep->analysers &= AN_RES_FLT_END; + s->current_rule = s->current_rule_list = NULL; + rep->analyse_exp = s->rules_exp = TICK_ETERNITY; + + if (!(s->flags & SF_ERR_MASK)) + s->flags |= SF_ERR_PRXCOND; + if (!(s->flags & SF_FINST_MASK)) + s->flags |= SF_FINST_D; + DBG_TRACE_DEVEL("leaving on error", STRM_EV_STRM_ANA|STRM_EV_TCP_ANA|STRM_EV_TCP_ERR, s); + return 0; +} + + +/* This function performs the TCP layer4 analysis on the current request. It + * returns 0 if a reject rule matches, otherwise 1 if either an accept rule + * matches or if no more rule matches. It can only use rules which don't need + * any data. This only works on connection-based client-facing stream connectors. + */ +int tcp_exec_l4_rules(struct session *sess) +{ + struct proxy *px = sess->fe; + struct act_rule *rule; + struct connection *conn = objt_conn(sess->origin); + int result = 1; + enum acl_test_res ret; + + if (!conn) + return result; + + if (sess->fe->defpx && (sess->fe->mode == PR_MODE_TCP || sess->fe->mode == PR_MODE_HTTP)) + px = sess->fe->defpx; + + restart: + list_for_each_entry(rule, &px->tcp_req.l4_rules, list) { + ret = ACL_TEST_PASS; + + if (rule->cond) { + ret = acl_exec_cond(rule->cond, sess->fe, sess, NULL, SMP_OPT_DIR_REQ|SMP_OPT_FINAL); + ret = acl_pass(ret); + if (rule->cond->pol == ACL_COND_UNLESS) + ret = !ret; + } + + if (ret) { + /* Always call the action function if defined */ + if (rule->action_ptr) { + switch (rule->action_ptr(rule, sess->fe, sess, NULL, ACT_OPT_FINAL | ACT_OPT_FIRST)) { + case ACT_RET_YIELD: + /* yield is not allowed at this point. If this return code is + * used it is a bug, so I prefer to abort the process. 
+ */ + send_log(sess->fe, LOG_WARNING, + "Internal error: yield not allowed with tcp-request connection actions."); + /* fall through */ + case ACT_RET_STOP: + case ACT_RET_DONE: + goto end; + case ACT_RET_CONT: + break; + case ACT_RET_DENY: + case ACT_RET_ABRT: + case ACT_RET_ERR: + case ACT_RET_INV: + result = 0; + goto end; + } + continue; /* eval the next rule */ + } + + /* If not action function defined, check for known actions */ + if (rule->action == ACT_ACTION_ALLOW) { + goto end; + } + else if (rule->action == ACT_ACTION_DENY) { + _HA_ATOMIC_INC(&sess->fe->fe_counters.denied_conn); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->denied_conn); + + result = 0; + goto end; + } + else if (rule->action == ACT_TCP_EXPECT_PX) { + if (!(conn->flags & CO_FL_HANDSHAKE)) { + if (xprt_add_hs(conn) < 0) { + result = 0; + goto end; + } + } + conn->flags |= CO_FL_ACCEPT_PROXY; + } + else if (rule->action == ACT_TCP_EXPECT_CIP) { + if (!(conn->flags & CO_FL_HANDSHAKE)) { + if (xprt_add_hs(conn) < 0) { + result = 0; + goto end; + } + } + conn->flags |= CO_FL_ACCEPT_CIP; + } + } + } + + if (px != sess->fe) { + px = sess->fe; + goto restart; + } + end: + return result; +} + +/* This function performs the TCP layer5 analysis on the current request. It + * returns 0 if a reject rule matches, otherwise 1 if either an accept rule + * matches or if no more rule matches. It can only use rules which don't need + * any data. This only works on session-based client-facing stream connectors. + * An example of valid use case is to track a stick-counter on the source + * address extracted from the proxy protocol. + */ +int tcp_exec_l5_rules(struct session *sess) +{ + struct proxy *px = sess->fe; + struct act_rule *rule; + int result = 1; + enum acl_test_res ret; + + if (sess->fe->defpx && (sess->fe->mode == PR_MODE_TCP || sess->fe->mode == PR_MODE_HTTP)) + px = sess->fe->defpx; + + restart: + list_for_each_entry(rule, &px->tcp_req.l5_rules, list) { + ret = ACL_TEST_PASS; + + if (rule->cond) { + ret = acl_exec_cond(rule->cond, sess->fe, sess, NULL, SMP_OPT_DIR_REQ|SMP_OPT_FINAL); + ret = acl_pass(ret); + if (rule->cond->pol == ACL_COND_UNLESS) + ret = !ret; + } + + if (ret) { + /* Always call the action function if defined */ + if (rule->action_ptr) { + switch (rule->action_ptr(rule, sess->fe, sess, NULL, ACT_OPT_FINAL | ACT_OPT_FIRST)) { + case ACT_RET_YIELD: + /* yield is not allowed at this point. If this return code is + * used it is a bug, so I prefer to abort the process. + */ + send_log(sess->fe, LOG_WARNING, + "Internal error: yield not allowed with tcp-request session actions."); + /* fall through */ + case ACT_RET_STOP: + case ACT_RET_DONE: + goto end; + case ACT_RET_CONT: + break; + case ACT_RET_DENY: + case ACT_RET_ABRT: + case ACT_RET_ERR: + case ACT_RET_INV: + result = 0; + goto end; + } + continue; /* eval the next rule */ + } + + /* If not action function defined, check for known actions */ + if (rule->action == ACT_ACTION_ALLOW) { + goto end; + } + else if (rule->action == ACT_ACTION_DENY) { + _HA_ATOMIC_INC(&sess->fe->fe_counters.denied_sess); + if (sess->listener && sess->listener->counters) + _HA_ATOMIC_INC(&sess->listener->counters->denied_sess); + + result = 0; + goto end; + } + } + } + + if (px != sess->fe) { + px = sess->fe; + goto restart; + } + end: + return result; +} + +/* Parse a tcp-response rule. 
Return a negative value in case of failure */
+static int tcp_parse_response_rule(char **args, int arg, int section_type,
+ struct proxy *curpx, const struct proxy *defpx,
+ struct act_rule *rule, char **err,
+ unsigned int where,
+ const char *file, int line)
+{
+ if ((curpx == defpx && strlen(defpx->id) == 0) || !(curpx->cap & PR_CAP_BE)) {
+ memprintf(err, "%s %s is only allowed in 'backend' sections or 'defaults' section with a name",
+ args[0], args[1]);
+ return -1;
+ }
+
+ if (strcmp(args[arg], "accept") == 0) {
+ arg++;
+ rule->action = ACT_ACTION_ALLOW;
+ rule->flags |= ACT_FLAG_FINAL;
+ }
+ else if (strcmp(args[arg], "reject") == 0) {
+ arg++;
+ rule->action = ACT_ACTION_DENY;
+ rule->flags |= ACT_FLAG_FINAL;
+ }
+ else if (strcmp(args[arg], "close") == 0) {
+ arg++;
+ rule->action = ACT_TCP_CLOSE;
+ rule->flags |= ACT_FLAG_FINAL;
+ }
+ else {
+ struct action_kw *kw;
+ kw = tcp_res_cont_action(args[arg]);
+ if (kw) {
+ arg++;
+ rule->kw = kw;
+ if (kw->parse((const char **)args, &arg, curpx, rule, err) == ACT_RET_PRS_ERR)
+ return -1;
+ } else {
+ const char *extra[] = { "accept", "reject", "close", NULL };
+ const char *best = action_suggest(args[arg], &tcp_res_cont_keywords, extra);
+
+ action_build_list(&tcp_res_cont_keywords, &trash);
+ memprintf(err,
+ "'%s %s' expects 'accept', 'close', 'reject', %s in %s '%s' (got '%s').%s%s%s",
+ args[0], args[1], trash.area,
+ proxy_type_str(curpx), curpx->id, args[arg],
+ best ? " Did you mean '" : "",
+ best ? best : "",
+ best ? "' maybe ?" : "");
+ return -1;
+ }
+ }
+
+ if (strcmp(args[arg], "if") == 0 || strcmp(args[arg], "unless") == 0) {
+ if ((rule->cond = build_acl_cond(file, line, &curpx->acl, curpx, (const char **)args+arg, err)) == NULL) {
+ memprintf(err,
+ "'%s %s %s' : error detected in %s '%s' while parsing '%s' condition : %s",
+ args[0], args[1], args[2], proxy_type_str(curpx), curpx->id, args[arg], *err);
+ return -1;
+ }
+ }
+ else if (*args[arg]) {
+ memprintf(err,
+ "'%s %s %s' only accepts 'if' or 'unless', in %s '%s' (got '%s')",
+ args[0], args[1], args[2], proxy_type_str(curpx), curpx->id, args[arg]);
+ return -1;
+ }
+ return 0;
+}
+
+
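For illustration (an editor's sketch, not part of this patch; the payload check is hypothetical), the response-side keywords handled above are written like this, with the delay itself parsed further below by tcp_parse_tcp_rep():

    backend mysql
        tcp-response inspect-delay 5s
        acl valid_greeting res.payload(0,1) -m bin 0a
        tcp-response content accept if valid_greeting
        tcp-response content reject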
+/* This function executes a track-sc* action. On success, it returns
+ * ACT_RET_CONT. If it must yield, it returns ACT_RET_YIELD. Otherwise
+ * ACT_RET_ERR is returned.
+ */
+static enum act_return tcp_action_track_sc(struct act_rule *rule, struct proxy *px,
+ struct session *sess, struct stream *s, int flags)
+{
+ struct stksess *ts;
+ struct stktable *t;
+ struct stktable_key *key;
+ struct sample smp;
+ int opt;
+
+ opt = SMP_OPT_DIR_REQ;
+ if (flags & ACT_FLAG_FINAL)
+ opt |= SMP_OPT_FINAL;
+
+ t = rule->arg.trk_ctr.table.t;
+ if (rule->from == ACT_F_TCP_REQ_CNT) { /* L7 rules: use the stream */
+ if (stkctr_entry(&s->stkctr[rule->action]))
+ goto end;
+
+ key = stktable_fetch_key(t, s->be, sess, s, opt, rule->arg.trk_ctr.expr, &smp);
+
+ if ((smp.flags & SMP_F_MAY_CHANGE) && !(flags & ACT_FLAG_FINAL))
+ return ACT_RET_YIELD; /* key might appear later */
+
+ if (key && (ts = stktable_get_entry(t, key))) {
+ stream_track_stkctr(&s->stkctr[rule->action], t, ts);
+ stkctr_set_flags(&s->stkctr[rule->action], STKCTR_TRACK_CONTENT);
+ if (sess->fe != s->be)
+ stkctr_set_flags(&s->stkctr[rule->action], STKCTR_TRACK_BACKEND);
+ }
+ }
+ else { /* L4/L5 rules: use the session */
+ if (stkctr_entry(&sess->stkctr[rule->action]))
+ goto end;
+
+ key = stktable_fetch_key(t, sess->fe, sess, NULL, opt, rule->arg.trk_ctr.expr, NULL);
+ if (key && (ts = stktable_get_entry(t, key)))
+ stream_track_stkctr(&sess->stkctr[rule->action], t, ts);
+ }
+
+ end:
+ return ACT_RET_CONT;
+}
+
+/* This function executes a capture action. It executes a fetch expression,
+ * turns the result into a string and puts it in a capture slot. On success, it
+ * returns ACT_RET_CONT. If it must yield, it returns ACT_RET_YIELD. Otherwise
+ * ACT_RET_ERR is returned.
+ */
+static enum act_return tcp_action_capture(struct act_rule *rule, struct proxy *px,
+ struct session *sess, struct stream *s, int flags)
+{
+ struct sample *key;
+ struct cap_hdr *h = rule->arg.cap.hdr;
+ char **cap = s->req_cap;
+ int len, opt;
+
+ opt = ((rule->from == ACT_F_TCP_REQ_CNT) ? SMP_OPT_DIR_REQ : SMP_OPT_DIR_RES);
+ if (flags & ACT_FLAG_FINAL)
+ opt |= SMP_OPT_FINAL;
+
+ key = sample_fetch_as_type(s->be, sess, s, opt, rule->arg.cap.expr, SMP_T_STR);
+ if (!key)
+ goto end;
+
+ if ((key->flags & SMP_F_MAY_CHANGE) && !(flags & ACT_FLAG_FINAL))
+ return ACT_RET_YIELD; /* key might appear later */
+
+ if (cap[h->index] == NULL) {
+ cap[h->index] = pool_alloc(h->pool);
+ if (cap[h->index] == NULL) /* no more capture memory, ignore error */
+ goto end;
+ }
+
+ len = key->data.u.str.data;
+ if (len > h->len)
+ len = h->len;
+
+ memcpy(cap[h->index], key->data.u.str.area, len);
+ cap[h->index][len] = 0;
+
+ end:
+ return ACT_RET_CONT;
+}
+
+static void release_tcp_capture(struct act_rule * rule)
+{
+ release_sample_expr(rule->arg.cap.expr);
+}
+
+
+static void release_tcp_track_sc(struct act_rule * rule)
+{
+ release_sample_expr(rule->arg.trk_ctr.expr);
+}
+
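Another illustrative sketch (not part of this patch; the table name and payload offsets are hypothetical) showing how the capture and track-sc actions above are typically written:

    frontend smtp
        tcp-request inspect-delay 10s
        tcp-request content capture req.payload(0,16) len 32
        tcp-request content track-sc0 src table abuse
        tcp-request content reject if { sc0_conn_rate gt 50 }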
+/* Parse a tcp-request rule. Return a negative value in case of failure */
+static int tcp_parse_request_rule(char **args, int arg, int section_type,
+ struct proxy *curpx, const struct proxy *defpx,
+ struct act_rule *rule, char **err,
+ unsigned int where, const char *file, int line)
+{
+ if (curpx == defpx && strlen(defpx->id) == 0) {
+ memprintf(err, "%s %s is not allowed in anonymous 'defaults' sections",
+ args[0], args[1]);
+ return -1;
+ }
+
+ if (strcmp(args[arg], "accept") == 0) {
+ arg++;
+ rule->action = ACT_ACTION_ALLOW;
+ rule->flags |= ACT_FLAG_FINAL;
+ }
+ else if (strcmp(args[arg], "reject") == 0) {
+ arg++;
+ rule->action = ACT_ACTION_DENY;
+ rule->flags |= ACT_FLAG_FINAL;
+ }
+ else if (strcmp(args[arg], "capture") == 0) {
+ struct sample_expr *expr;
+ struct cap_hdr *hdr;
+ int kw = arg;
+ int len = 0;
+
+ if (!(curpx->cap & PR_CAP_FE)) {
+ memprintf(err,
+ "'%s %s %s' : proxy '%s' has no frontend capability",
+ args[0], args[1], args[kw], curpx->id);
+ return -1;
+ }
+
+ if (!(where & SMP_VAL_FE_REQ_CNT)) {
+ memprintf(err,
+ "'%s %s' is not allowed in '%s %s' rules in %s '%s'",
+ args[arg], args[arg+1], args[0], args[1], proxy_type_str(curpx), curpx->id);
+ return -1;
+ }
+
+ arg++;
+
+ curpx->conf.args.ctx = ARGC_CAP;
+ expr = sample_parse_expr(args, &arg, file, line, err, &curpx->conf.args, NULL);
+ if (!expr) {
+ memprintf(err,
+ "'%s %s %s' : %s",
+ args[0], args[1], args[kw], *err);
+ return -1;
+ }
+
+ if (!(expr->fetch->val & where)) {
+ memprintf(err,
+ "'%s %s %s' : fetch method '%s' extracts information from '%s', none of which is available here",
+ args[0], args[1], args[kw], args[arg-1], sample_src_names(expr->fetch->use));
+ release_sample_expr(expr);
+ return -1;
+ }
+
+ if (strcmp(args[arg], "len") == 0) {
+ arg++;
+ if (!args[arg]) {
+ memprintf(err,
+ "'%s %s %s' : missing length value",
+ args[0], args[1], args[kw]);
+ release_sample_expr(expr);
+ return -1;
+ }
+ /* convert the length value, it is checked just below */
+ len = atoi(args[arg]);
+ if (len <= 0) {
+ memprintf(err,
+ "'%s %s %s' : length must be > 0",
+ args[0], args[1], args[kw]);
+ release_sample_expr(expr);
+ return -1;
+ }
+ arg++;
+ }
+
+ if (!len) {
+ memprintf(err,
+ "'%s %s %s' : a positive 'len' argument is mandatory",
+ args[0], args[1], args[kw]);
+ release_sample_expr(expr);
+ return -1;
+ }
+
+ hdr = calloc(1, sizeof(*hdr));
+ if (!hdr) {
+ memprintf(err, "parsing [%s:%d] : out of memory", file, line);
+ release_sample_expr(expr);
+ return -1;
+ }
+ hdr->next = curpx->req_cap;
+ hdr->name = NULL; /* not a header capture */
+ hdr->namelen = 0;
+ hdr->len = len;
+ hdr->pool = create_pool("caphdr", hdr->len + 1, MEM_F_SHARED);
+ hdr->index = curpx->nb_req_cap++;
+
+ curpx->req_cap = hdr;
+ curpx->to_log |= LW_REQHDR;
+
+ /* check if we need to allocate an http_txn struct for HTTP parsing */
+ curpx->http_needed |= !!(expr->fetch->use & SMP_USE_HTTP_ANY);
+
+ rule->arg.cap.expr = expr;
+ rule->arg.cap.hdr = hdr;
+ rule->action = ACT_CUSTOM;
+ rule->action_ptr = tcp_action_capture;
+ rule->check_ptr = check_capture;
+ rule->release_ptr = release_tcp_capture;
+ }
+ else if (strncmp(args[arg], "track-sc", 8) == 0) {
+ struct sample_expr *expr;
+ int kw = arg;
+ unsigned int tsc_num;
+ const char *tsc_num_str;
+
+ arg++;
+
+ tsc_num_str = &args[kw][8];
+ if (cfg_parse_track_sc_num(&tsc_num, tsc_num_str, tsc_num_str + strlen(tsc_num_str), err) == -1) {
+ memprintf(err, "'%s %s %s' : %s", args[0], args[1], args[kw], *err);
+ return -1;
+ }
+
+ curpx->conf.args.ctx = ARGC_TRK;
+ expr = sample_parse_expr(args, &arg, file,
line, err, &curpx->conf.args, NULL); + if (!expr) { + memprintf(err, + "'%s %s %s' : %s", + args[0], args[1], args[kw], *err); + return -1; + } + + if (!(expr->fetch->val & where)) { + memprintf(err, + "'%s %s %s' : fetch method '%s' extracts information from '%s', none of which is available here", + args[0], args[1], args[kw], args[arg-1], sample_src_names(expr->fetch->use)); + release_sample_expr(expr); + return -1; + } + + /* check if we need to allocate an http_txn struct for HTTP parsing */ + curpx->http_needed |= !!(expr->fetch->use & SMP_USE_HTTP_ANY); + + if (strcmp(args[arg], "table") == 0) { + arg++; + if (!args[arg]) { + memprintf(err, + "'%s %s %s' : missing table name", + args[0], args[1], args[kw]); + release_sample_expr(expr); + return -1; + } + /* we copy the table name for now, it will be resolved later */ + rule->arg.trk_ctr.table.n = strdup(args[arg]); + arg++; + } + rule->action = tsc_num; + rule->arg.trk_ctr.expr = expr; + rule->action_ptr = tcp_action_track_sc; + rule->check_ptr = check_trk_action; + rule->release_ptr = release_tcp_track_sc; + } + else if (strcmp(args[arg], "expect-proxy") == 0) { + if (strcmp(args[arg+1], "layer4") != 0) { + memprintf(err, + "'%s %s %s' only supports 'layer4' in %s '%s' (got '%s')", + args[0], args[1], args[arg], proxy_type_str(curpx), curpx->id, args[arg+1]); + return -1; + } + + if (!(where & SMP_VAL_FE_CON_ACC)) { + memprintf(err, + "'%s %s' is not allowed in '%s %s' rules in %s '%s'", + args[arg], args[arg+1], args[0], args[1], proxy_type_str(curpx), curpx->id); + return -1; + } + + arg += 2; + rule->action = ACT_TCP_EXPECT_PX; + } + else if (strcmp(args[arg], "expect-netscaler-cip") == 0) { + if (strcmp(args[arg+1], "layer4") != 0) { + memprintf(err, + "'%s %s %s' only supports 'layer4' in %s '%s' (got '%s')", + args[0], args[1], args[arg], proxy_type_str(curpx), curpx->id, args[arg+1]); + return -1; + } + + if (!(where & SMP_VAL_FE_CON_ACC)) { + memprintf(err, + "'%s %s' is not allowed in '%s %s' rules in %s '%s'", + args[arg], args[arg+1], args[0], args[1], proxy_type_str(curpx), curpx->id); + return -1; + } + + arg += 2; + rule->action = ACT_TCP_EXPECT_CIP; + } + else { + struct action_kw *kw; + if (where & SMP_VAL_FE_CON_ACC) { + /* L4 */ + kw = tcp_req_conn_action(args[arg]); + rule->kw = kw; + } else if (where & SMP_VAL_FE_SES_ACC) { + /* L5 */ + kw = tcp_req_sess_action(args[arg]); + rule->kw = kw; + } else { + /* L6 */ + kw = tcp_req_cont_action(args[arg]); + rule->kw = kw; + } + if (kw) { + arg++; + if (kw->parse((const char **)args, &arg, curpx, rule, err) == ACT_RET_PRS_ERR) + return -1; + } else { + const char *extra[] = { "accept", "reject", "capture", "track-sc", "expect-proxy", "expect-netscaler-cip", NULL }; + const char *best = NULL; + + + if (where & SMP_VAL_FE_CON_ACC) { + action_build_list(&tcp_req_conn_keywords, &trash); + best = action_suggest(args[arg], &tcp_req_conn_keywords, extra); + } + else if (where & SMP_VAL_FE_SES_ACC) { + action_build_list(&tcp_req_sess_keywords, &trash); + best = action_suggest(args[arg], &tcp_req_sess_keywords, extra); + } + else { + action_build_list(&tcp_req_cont_keywords, &trash); + best = action_suggest(args[arg], &tcp_req_cont_keywords, extra); + } + + memprintf(err, + "'%s %s' expects 'accept', 'reject', 'capture', 'expect-proxy', 'expect-netscaler-cip', 'track-sc0' ... 'track-sc%d', %s " + "in %s '%s' (got '%s').%s%s%s\n", + args[0], args[1], global.tune.nb_stk_ctr-1, + trash.area, proxy_type_str(curpx), + curpx->id, args[arg], + best ? " Did you mean '" : "", + best ? 
best : "", + best ? "' maybe ?" : ""); + return -1; + } + } + + if (strcmp(args[arg], "if") == 0 || strcmp(args[arg], "unless") == 0) { + if ((rule->cond = build_acl_cond(file, line, &curpx->acl, curpx, (const char **)args+arg, err)) == NULL) { + memprintf(err, + "'%s %s %s' : error detected in %s '%s' while parsing '%s' condition : %s", + args[0], args[1], args[2], proxy_type_str(curpx), curpx->id, args[arg], *err); + return -1; + } + } + else if (*args[arg]) { + memprintf(err, + "'%s %s %s' only accepts 'if' or 'unless', in %s '%s' (got '%s')", + args[0], args[1], args[2], proxy_type_str(curpx), curpx->id, args[arg]); + return -1; + } + return 0; +} + +/* This function should be called to parse a line starting with the "tcp-response" + * keyword. + */ +static int tcp_parse_tcp_rep(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + const char *ptr = NULL; + unsigned int val; + int warn = 0; + int arg; + struct act_rule *rule; + unsigned int where; + const struct acl *acl; + const char *kw; + + if (!*args[1]) { + memprintf(err, "missing argument for '%s' in %s '%s'", + args[0], proxy_type_str(curpx), curpx->id); + return -1; + } + + if (strcmp(args[1], "inspect-delay") == 0) { + if ((curpx == defpx && strlen(defpx->id) == 0) || !(curpx->cap & PR_CAP_BE)) { + memprintf(err, "%s %s is only allowed in 'backend' sections or 'defaults' section with a name", + args[0], args[1]); + return -1; + } + + if (!*args[2] || (ptr = parse_time_err(args[2], &val, TIME_UNIT_MS))) { + memprintf(err, + "'%s %s' expects a positive delay in milliseconds, in %s '%s'", + args[0], args[1], proxy_type_str(curpx), curpx->id); + + if (ptr == PARSE_TIME_OVER) + memprintf(err, "%s (timer overflow in '%s', maximum value is 2147483647 ms or ~24.8 days)", *err, args[2]); + else if (ptr == PARSE_TIME_UNDER) + memprintf(err, "%s (timer underflow in '%s', minimum non-null value is 1 ms)", *err, args[2]); + else if (ptr) + memprintf(err, "%s (unexpected character '%c')", *err, *ptr); + return -1; + } + + if (curpx->tcp_rep.inspect_delay) { + memprintf(err, "ignoring %s %s (was already defined) in %s '%s'", + args[0], args[1], proxy_type_str(curpx), curpx->id); + return 1; + } + curpx->tcp_rep.inspect_delay = val; + return 0; + } + + rule = new_act_rule(ACT_F_TCP_RES_CNT, file, line); + if (!rule) { + memprintf(err, "parsing [%s:%d] : out of memory", file, line); + return -1; + } + LIST_INIT(&rule->list); + arg = 1; + where = 0; + + if (strcmp(args[1], "content") == 0) { + arg++; + + if (curpx->cap & PR_CAP_FE) + where |= SMP_VAL_FE_RES_CNT; + if (curpx->cap & PR_CAP_BE) + where |= SMP_VAL_BE_RES_CNT; + if (tcp_parse_response_rule(args, arg, section_type, curpx, defpx, rule, err, where, file, line) < 0) + goto error; + + acl = rule->cond ? 
acl_cond_conflicts(rule->cond, where) : NULL; + if (acl) { + if (acl->name && *acl->name) + memprintf(err, + "acl '%s' will never match in '%s %s' because it only involves keywords that are incompatible with '%s'", + acl->name, args[0], args[1], sample_ckp_names(where)); + else + memprintf(err, + "anonymous acl will never match in '%s %s' because it uses keyword '%s' which is incompatible with '%s'", + args[0], args[1], + LIST_ELEM(acl->expr.n, struct acl_expr *, list)->kw, + sample_ckp_names(where)); + + warn++; + } + else if (rule->cond && acl_cond_kw_conflicts(rule->cond, where, &acl, &kw)) { + if (acl->name && *acl->name) + memprintf(err, + "acl '%s' involves keyword '%s' which is incompatible with '%s'", + acl->name, kw, sample_ckp_names(where)); + else + memprintf(err, + "anonymous acl involves keyword '%s' which is incompatible with '%s'", + kw, sample_ckp_names(where)); + warn++; + } + + LIST_APPEND(&curpx->tcp_rep.inspect_rules, &rule->list); + } + else { + memprintf(err, + "'%s' expects 'inspect-delay' or 'content' in %s '%s' (got '%s')", + args[0], proxy_type_str(curpx), curpx->id, args[1]); + goto error; + } + + return warn; + error: + free_act_rule(rule); + return -1; +} + + +/* This function should be called to parse a line starting with the "tcp-request" + * keyword. + */ +static int tcp_parse_tcp_req(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + const char *ptr = NULL; + unsigned int val; + int warn = 0; + int arg; + struct act_rule *rule; + unsigned int where; + const struct acl *acl; + const char *kw; + + if (!*args[1]) { + if (curpx == defpx) + memprintf(err, "missing argument for '%s' in defaults section", args[0]); + else + memprintf(err, "missing argument for '%s' in %s '%s'", + args[0], proxy_type_str(curpx), curpx->id); + return -1; + } + + if (strcmp(args[1], "inspect-delay") == 0) { + if (curpx == defpx && strlen(defpx->id) == 0) { + memprintf(err, "%s %s is not allowed in anonymous 'defaults' sections", + args[0], args[1]); + return -1; + } + + if (!*args[2] || (ptr = parse_time_err(args[2], &val, TIME_UNIT_MS))) { + memprintf(err, + "'%s %s' expects a positive delay in milliseconds, in %s '%s'", + args[0], args[1], proxy_type_str(curpx), curpx->id); + + if (ptr == PARSE_TIME_OVER) + memprintf(err, "%s (timer overflow in '%s', maximum value is 2147483647 ms or ~24.8 days)", *err, args[2]); + else if (ptr == PARSE_TIME_UNDER) + memprintf(err, "%s (timer underflow in '%s', minimum non-null value is 1 ms)", *err, args[2]); + else if (ptr) + memprintf(err, "%s (unexpected character '%c')", *err, *ptr); + return -1; + } + + if (curpx->tcp_req.inspect_delay) { + memprintf(err, "ignoring %s %s (was already defined) in %s '%s'", + args[0], args[1], proxy_type_str(curpx), curpx->id); + return 1; + } + curpx->tcp_req.inspect_delay = val; + return 0; + } + + rule = new_act_rule(0, file, line); + if (!rule) { + memprintf(err, "parsing [%s:%d] : out of memory", file, line); + return -1; + } + LIST_INIT(&rule->list); + arg = 1; + where = 0; + + if (strcmp(args[1], "content") == 0) { + arg++; + + if (curpx->cap & PR_CAP_FE) + where |= SMP_VAL_FE_REQ_CNT; + if (curpx->cap & PR_CAP_BE) + where |= SMP_VAL_BE_REQ_CNT; + rule->from = ACT_F_TCP_REQ_CNT; + if (tcp_parse_request_rule(args, arg, section_type, curpx, defpx, rule, err, where, file, line) < 0) + goto error; + + acl = rule->cond ? 
acl_cond_conflicts(rule->cond, where) : NULL; + if (acl) { + if (acl->name && *acl->name) + memprintf(err, + "acl '%s' will never match in '%s %s' because it only involves keywords that are incompatible with '%s'", + acl->name, args[0], args[1], sample_ckp_names(where)); + else + memprintf(err, + "anonymous acl will never match in '%s %s' because it uses keyword '%s' which is incompatible with '%s'", + args[0], args[1], + LIST_ELEM(acl->expr.n, struct acl_expr *, list)->kw, + sample_ckp_names(where)); + + warn++; + } + else if (rule->cond && acl_cond_kw_conflicts(rule->cond, where, &acl, &kw)) { + if (acl->name && *acl->name) + memprintf(err, + "acl '%s' involves keyword '%s' which is incompatible with '%s'", + acl->name, kw, sample_ckp_names(where)); + else + memprintf(err, + "anonymous acl involves keyword '%s' which is incompatible with '%s'", + kw, sample_ckp_names(where)); + warn++; + } + + /* the following function directly emits the warning */ + warnif_misplaced_tcp_cont(curpx, file, line, args[0]); + LIST_APPEND(&curpx->tcp_req.inspect_rules, &rule->list); + } + else if (strcmp(args[1], "connection") == 0) { + arg++; + + if (!(curpx->cap & PR_CAP_FE)) { + memprintf(err, "%s %s is not allowed because %s %s is not a frontend", + args[0], args[1], proxy_type_str(curpx), curpx->id); + goto error; + } + + where |= SMP_VAL_FE_CON_ACC; + rule->from = ACT_F_TCP_REQ_CON; + if (tcp_parse_request_rule(args, arg, section_type, curpx, defpx, rule, err, where, file, line) < 0) + goto error; + + acl = rule->cond ? acl_cond_conflicts(rule->cond, where) : NULL; + if (acl) { + if (acl->name && *acl->name) + memprintf(err, + "acl '%s' will never match in '%s %s' because it only involves keywords that are incompatible with '%s'", + acl->name, args[0], args[1], sample_ckp_names(where)); + else + memprintf(err, + "anonymous acl will never match in '%s %s' because it uses keyword '%s' which is incompatible with '%s'", + args[0], args[1], + LIST_ELEM(acl->expr.n, struct acl_expr *, list)->kw, + sample_ckp_names(where)); + + warn++; + } + else if (rule->cond && acl_cond_kw_conflicts(rule->cond, where, &acl, &kw)) { + if (acl->name && *acl->name) + memprintf(err, + "acl '%s' involves keyword '%s' which is incompatible with '%s'", + acl->name, kw, sample_ckp_names(where)); + else + memprintf(err, + "anonymous acl involves keyword '%s' which is incompatible with '%s'", + kw, sample_ckp_names(where)); + warn++; + } + + /* the following function directly emits the warning */ + warnif_misplaced_tcp_conn(curpx, file, line, args[0]); + LIST_APPEND(&curpx->tcp_req.l4_rules, &rule->list); + } + else if (strcmp(args[1], "session") == 0) { + arg++; + + if (!(curpx->cap & PR_CAP_FE)) { + memprintf(err, "%s %s is not allowed because %s %s is not a frontend", + args[0], args[1], proxy_type_str(curpx), curpx->id); + goto error; + } + + where |= SMP_VAL_FE_SES_ACC; + rule->from = ACT_F_TCP_REQ_SES; + if (tcp_parse_request_rule(args, arg, section_type, curpx, defpx, rule, err, where, file, line) < 0) + goto error; + + acl = rule->cond ? 
acl_cond_conflicts(rule->cond, where) : NULL; + if (acl) { + if (acl->name && *acl->name) + memprintf(err, + "acl '%s' will never match in '%s %s' because it only involves keywords that are incompatible with '%s'", + acl->name, args[0], args[1], sample_ckp_names(where)); + else + memprintf(err, + "anonymous acl will never match in '%s %s' because it uses keyword '%s' which is incompatible with '%s'", + args[0], args[1], + LIST_ELEM(acl->expr.n, struct acl_expr *, list)->kw, + sample_ckp_names(where)); + warn++; + } + else if (rule->cond && acl_cond_kw_conflicts(rule->cond, where, &acl, &kw)) { + if (acl->name && *acl->name) + memprintf(err, + "acl '%s' involves keyword '%s' which is incompatible with '%s'", + acl->name, kw, sample_ckp_names(where)); + else + memprintf(err, + "anonymous acl involves keyword '%s' which is incompatible with '%s'", + kw, sample_ckp_names(where)); + warn++; + } + + /* the following function directly emits the warning */ + warnif_misplaced_tcp_sess(curpx, file, line, args[0]); + LIST_APPEND(&curpx->tcp_req.l5_rules, &rule->list); + } + else { + if (curpx == defpx) + memprintf(err, + "'%s' expects 'inspect-delay', 'connection', or 'content' in defaults section (got '%s')", + args[0], args[1]); + else + memprintf(err, + "'%s' expects 'inspect-delay', 'connection', or 'content' in %s '%s' (got '%s')", + args[0], proxy_type_str(curpx), curpx->id, args[1]); + goto error; + } + + return warn; + error: + free_act_rule(rule); + return -1; +} + +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_LISTEN, "tcp-request", tcp_parse_tcp_req }, + { CFG_LISTEN, "tcp-response", tcp_parse_tcp_rep }, + { 0, NULL, NULL }, +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/tcp_sample.c b/src/tcp_sample.c new file mode 100644 index 0000000..9fbf920 --- /dev/null +++ b/src/tcp_sample.c @@ -0,0 +1,641 @@ +/* + * AF_INET/AF_INET6 SOCK_STREAM protocol layer (tcp) + * + * Copyright 2000-2013 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +/* this is to have tcp_info defined on systems using musl + * library, such as Alpine Linux. + */ +#define _GNU_SOURCE + +#include <ctype.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include <netinet/tcp.h> +#include <netinet/in.h> + +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/connection.h> +#include <haproxy/errors.h> +#include <haproxy/global.h> +#include <haproxy/listener-t.h> +#include <haproxy/namespace.h> +#include <haproxy/proxy-t.h> +#include <haproxy/sample.h> +#include <haproxy/sc_strm.h> +#include <haproxy/session.h> +#include <haproxy/tools.h> + +/* Fetch the connection's source IPv4/IPv6 address. Depending on the keyword, it + * may be the frontend or the backend connection. + */ +static int +smp_fetch_src(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + const struct sockaddr_storage *src = NULL; + + if (kw[0] == 'b') { /* bc_src */ + struct connection *conn = ((obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + ? sc_conn(__objt_check(smp->sess->origin)->sc) + : (smp->strm ? 
sc_conn(smp->strm->scb): NULL)); + if (conn && conn_get_src(conn)) + src = conn_src(conn); + } + else if (kw[0] == 'f') { /* fc_src */ + struct connection *conn = objt_conn(smp->sess->origin); + + if (conn && conn_get_src(conn)) + src = conn_src(conn); + } + else /* src */ + src = (smp->strm ? sc_src(smp->strm->scf) : sess_src(smp->sess)); + + if (!src) + return 0; + + switch (src->ss_family) { + case AF_INET: + smp->data.u.ipv4 = ((struct sockaddr_in *)src)->sin_addr; + smp->data.type = SMP_T_IPV4; + break; + case AF_INET6: + smp->data.u.ipv6 = ((struct sockaddr_in6 *)src)->sin6_addr; + smp->data.type = SMP_T_IPV6; + break; + default: + return 0; + } + + smp->flags = 0; + return 1; +} + +/* set temp integer to the connection's source port. Depending on the + * keyword, it may be the frontend or the backend connection. + */ +static int +smp_fetch_sport(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + const struct sockaddr_storage *src = NULL; + + if (kw[0] == 'b') { /* bc_src_port */ + struct connection *conn = ((obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + ? sc_conn(__objt_check(smp->sess->origin)->sc) + : (smp->strm ? sc_conn(smp->strm->scb): NULL)); + if (conn && conn_get_src(conn)) + src = conn_src(conn); + } + else if (kw[0] == 'f') { /* fc_src_port */ + struct connection *conn = objt_conn(smp->sess->origin); + + if (conn && conn_get_src(conn)) + src = conn_src(conn); + } + else /* src_port */ + src = (smp->strm ? sc_src(smp->strm->scf) : sess_src(smp->sess)); + + if (!src) + return 0; + + smp->data.type = SMP_T_SINT; + if (!(smp->data.u.sint = get_host_port(src))) + return 0; + + smp->flags = 0; + return 1; +} + +/* fetch the connection's destination IPv4/IPv6 address. Depending on the + * keyword, it may be the frontend or the backend connection. + */ +static int +smp_fetch_dst(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + const struct sockaddr_storage *dst = NULL; + + if (kw[0] == 'b') { /* bc_dst */ + struct connection *conn = ((obj_type(smp->sess->origin) == OBJ_TYPE_CHECK) + ? sc_conn(__objt_check(smp->sess->origin)->sc) + : (smp->strm ? sc_conn(smp->strm->scb): NULL)); + if (conn && conn_get_dst(conn)) + dst = conn_dst(conn); + } + else if (kw[0] == 'f') { /* fc_dst */ + struct connection *conn = objt_conn(smp->sess->origin); + + if (conn && conn_get_dst(conn)) + dst = conn_dst(conn); + } + else /* dst */ + dst = (smp->strm ? sc_dst(smp->strm->scf) : sess_dst(smp->sess)); + + if (!dst) + return 0; + + switch (dst->ss_family) { + case AF_INET: + smp->data.u.ipv4 = ((struct sockaddr_in *)dst)->sin_addr; + smp->data.type = SMP_T_IPV4; + break; + case AF_INET6: + smp->data.u.ipv6 = ((struct sockaddr_in6 *)dst)->sin6_addr; + smp->data.type = SMP_T_IPV6; + break; + default: + return 0; + } + + smp->flags = 0; + return 1; +} + +/* check if the destination address of the front connection is local to the + * system or if it was intercepted. + */ +int smp_fetch_dst_is_local(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + struct listener *li = smp->sess->listener; + const struct sockaddr_storage *dst = NULL; + + if (kw[0] == 'f') { /* fc_dst_is_local */ + struct connection *conn = objt_conn(smp->sess->origin); + + if (conn && conn_get_dst(conn)) + dst = conn_dst(conn); + } + else /* dst_is_local */ + dst = (smp->strm ? 
sc_dst(smp->strm->scf) : sess_dst(smp->sess));
+
+	if (!dst)
+		return 0;
+
+	smp->data.type = SMP_T_BOOL;
+	smp->flags = 0;
+	smp->data.u.sint = addr_is_local(li->rx.settings->netns, dst);
+	return smp->data.u.sint >= 0;
+}
+
+/* check if the source address of the front connection is local to the system
+ * or not.
+ */
+int smp_fetch_src_is_local(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	struct listener *li = smp->sess->listener;
+	const struct sockaddr_storage *src = NULL;
+
+	if (kw[0] == 'f') { /* fc_src_is_local */
+		struct connection *conn = objt_conn(smp->sess->origin);
+
+		if (conn && conn_get_src(conn))
+			src = conn_src(conn);
+	}
+	else /* src_is_local */
+		src = (smp->strm ? sc_src(smp->strm->scf) : sess_src(smp->sess));
+
+	if (!src)
+		return 0;
+
+	smp->data.type = SMP_T_BOOL;
+	smp->flags = 0;
+	smp->data.u.sint = addr_is_local(li->rx.settings->netns, src);
+	return smp->data.u.sint >= 0;
+}
+
+/* set temp integer to the connection's destination port. Depending on the
+ * keyword, it may be the frontend or the backend connection.
+ */
+static int
+smp_fetch_dport(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	const struct sockaddr_storage *dst = NULL;
+
+	if (kw[0] == 'b') { /* bc_dst_port */
+		struct connection *conn = ((obj_type(smp->sess->origin) == OBJ_TYPE_CHECK)
+		                           ? sc_conn(__objt_check(smp->sess->origin)->sc)
+		                           : (smp->strm ? sc_conn(smp->strm->scb): NULL));
+		if (conn && conn_get_dst(conn))
+			dst = conn_dst(conn);
+	}
+	else if (kw[0] == 'f') { /* fc_dst_port */
+		struct connection *conn = objt_conn(smp->sess->origin);
+
+		if (conn && conn_get_dst(conn))
+			dst = conn_dst(conn);
+	}
+	else /* dst_port */
+		dst = (smp->strm ? sc_dst(smp->strm->scf) : sess_dst(smp->sess));
+
+	if (!dst)
+		return 0;
+
+	smp->data.type = SMP_T_SINT;
+	if (!(smp->data.u.sint = get_host_port(dst)))
+		return 0;
+
+	smp->flags = 0;
+	return 1;
+}
+
+#ifdef TCP_INFO
+
+
+/* Validates the arguments passed to "fc_*" fetch keywords returning a time
+ * value. These keywords support an optional string representing the unit of
+ * the result: "us" for microseconds and "ms" for milliseconds. Returns 0 on
+ * error and non-zero if OK.
+ */
+static int val_fc_time_value(struct arg *args, char **err)
+{
+	if (args[0].type == ARGT_STR) {
+		if (strcmp(args[0].data.str.area, "us") == 0) {
+			chunk_destroy(&args[0].data.str);
+			args[0].type = ARGT_SINT;
+			args[0].data.sint = TIME_UNIT_US;
+		}
+		else if (strcmp(args[0].data.str.area, "ms") == 0) {
+			chunk_destroy(&args[0].data.str);
+			args[0].type = ARGT_SINT;
+			args[0].data.sint = TIME_UNIT_MS;
+		}
+		else {
+			memprintf(err, "expects 'us' or 'ms', got '%s'",
+			          args[0].data.str.area);
+			return 0;
+		}
+	}
+	else {
+		memprintf(err, "Unexpected arg type");
+		return 0;
+	}
+
+	return 1;
+}
+
+/* Validates the arguments passed to "fc_*" fetch keywords returning a
+ * counter. These keywords should be used without any argument, but because of
+ * a bug in previous versions, an optional string argument may be passed. In
+ * such a case, the argument is ignored and a warning is emitted. Returns 0 on
+ * error and non-zero if OK.
+ */
+static int var_fc_counter(struct arg *args, char **err)
+{
+	if (args[0].type != ARGT_STOP) {
+		ha_warning("no argument supported for 'fc_*' sample expressions returning counters.\n");
+		if (args[0].type == ARGT_STR)
+			chunk_destroy(&args[0].data.str);
+		args[0].type = ARGT_STOP;
+	}
+
+	return 1;
+}
+
+/* Returns some tcp_info data if it's available.
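+ * (Illustrative note, not from the original sources: on Linux these values
+ * come from getsockopt(fd, IPPROTO_TCP, TCP_INFO), where e.g. tcpi_rtt is
+ * reported by the kernel in microseconds, hence the ms conversions performed
+ * in the callers below.)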
"dir" must be set to 0 if + * the client connection is required, otherwise it is set to 1. "val" represents + * the required value. + * If the function fails it returns 0, otherwise it returns 1 and "result" is filled. + */ +static inline int get_tcp_info(const struct arg *args, struct sample *smp, + int dir, int val) +{ + struct connection *conn; + struct tcp_info info; + socklen_t optlen; + + /* strm can be null. */ + if (!smp->strm) + return 0; + + /* get the object associated with the stream connector.The + * object can be other thing than a connection. For example, + * it be a appctx. + */ + conn = (dir == 0 ? sc_conn(smp->strm->scf) : sc_conn(smp->strm->scb)); + if (!conn) + return 0; + + /* The fd may not be available for the tcp_info struct, and the + syscal can fail. */ + optlen = sizeof(info); + if ((conn->flags & CO_FL_FDLESS) || + getsockopt(conn->handle.fd, IPPROTO_TCP, TCP_INFO, &info, &optlen) == -1) + return 0; + + /* extract the value. */ + smp->data.type = SMP_T_SINT; + switch (val) { +#if defined(__APPLE__) + case 0: smp->data.u.sint = info.tcpi_rttcur; break; + case 1: smp->data.u.sint = info.tcpi_rttvar; break; + case 2: smp->data.u.sint = info.tcpi_tfo_syn_data_acked; break; + case 4: smp->data.u.sint = info.tcpi_tfo_syn_loss; break; + case 5: smp->data.u.sint = info.tcpi_rto; break; +#else + /* all other platforms supporting TCP_INFO have these ones */ + case 0: smp->data.u.sint = info.tcpi_rtt; break; + case 1: smp->data.u.sint = info.tcpi_rttvar; break; +# if defined(__linux__) + /* these ones are common to all Linux versions */ + case 2: smp->data.u.sint = info.tcpi_unacked; break; + case 3: smp->data.u.sint = info.tcpi_sacked; break; + case 4: smp->data.u.sint = info.tcpi_lost; break; + case 5: smp->data.u.sint = info.tcpi_retrans; break; + case 6: smp->data.u.sint = info.tcpi_fackets; break; + case 7: smp->data.u.sint = info.tcpi_reordering; break; +# elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) + /* the ones are found on FreeBSD, NetBSD and OpenBSD featuring TCP_INFO */ + case 2: smp->data.u.sint = info.__tcpi_unacked; break; + case 3: smp->data.u.sint = info.__tcpi_sacked; break; + case 4: smp->data.u.sint = info.__tcpi_lost; break; + case 5: smp->data.u.sint = info.__tcpi_retrans; break; + case 6: smp->data.u.sint = info.__tcpi_fackets; break; + case 7: smp->data.u.sint = info.__tcpi_reordering; break; +# endif +#endif // apple + default: return 0; + } + + return 1; +} + +/* get the mean rtt of a client connection */ +static int +smp_fetch_fc_rtt(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + if (!get_tcp_info(args, smp, 0, 0)) + return 0; + + /* By default or if explicitly specified, convert rtt to ms */ + if (!args || args[0].type == ARGT_STOP || args[0].data.sint == TIME_UNIT_MS) + smp->data.u.sint = (smp->data.u.sint + 500) / 1000; + + return 1; +} + +/* get the variance of the mean rtt of a client connection */ +static int +smp_fetch_fc_rttvar(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + if (!get_tcp_info(args, smp, 0, 1)) + return 0; + + /* By default or if explicitly specified, convert rttvar to ms */ + if (!args || args[0].type == ARGT_STOP || args[0].data.sint == TIME_UNIT_MS) + smp->data.u.sint = (smp->data.u.sint + 500) / 1000; + + return 1; +} + +/* get the mean rtt of a backend connection */ +static int +smp_fetch_bc_rtt(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + if (!get_tcp_info(args, smp, 1, 0)) + return 0; + + 
+	/* By default or if explicitly specified, convert rtt to ms */
+	if (!args || args[0].type == ARGT_STOP || args[0].data.sint == TIME_UNIT_MS)
+		smp->data.u.sint = (smp->data.u.sint + 500) / 1000;
+
+	return 1;
+}
+
+/* get the variance of the mean rtt of a backend connection */
+static int
+smp_fetch_bc_rttvar(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	if (!get_tcp_info(args, smp, 1, 1))
+		return 0;
+
+	/* By default or if explicitly specified, convert rttvar to ms */
+	if (!args || args[0].type == ARGT_STOP || args[0].data.sint == TIME_UNIT_MS)
+		smp->data.u.sint = (smp->data.u.sint + 500) / 1000;
+
+	return 1;
+}
+
+
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__APPLE__)
+/* get the unacked counter on a client connection */
+static int
+smp_fetch_fc_unacked(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	if (!get_tcp_info(args, smp, 0, 2))
+		return 0;
+	return 1;
+}
+#endif
+
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)
+/* get the sacked counter on a client connection */
+static int
+smp_fetch_fc_sacked(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	if (!get_tcp_info(args, smp, 0, 3))
+		return 0;
+	return 1;
+}
+#endif
+
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__APPLE__)
+/* get the lost counter on a client connection */
+static int
+smp_fetch_fc_lost(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	if (!get_tcp_info(args, smp, 0, 4))
+		return 0;
+	return 1;
+}
+#endif
+
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__APPLE__)
+/* get the retrans counter on a client connection */
+static int
+smp_fetch_fc_retrans(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	if (!get_tcp_info(args, smp, 0, 5))
+		return 0;
+	return 1;
+}
+#endif
+
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)
+/* get the fackets counter on a client connection */
+static int
+smp_fetch_fc_fackets(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	if (!get_tcp_info(args, smp, 0, 6))
+		return 0;
+	return 1;
+}
+#endif
+
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)
+/* get the reordering counter on a client connection */
+static int
+smp_fetch_fc_reordering(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	if (!get_tcp_info(args, smp, 0, 7))
+		return 0;
+	return 1;
+}
+#endif
+#endif // TCP_INFO
+
+/* Validates the data unit argument passed to the "accept_date" fetch. Argument
+ * 0 supports an optional string representing the unit of the result: "s" for
+ * seconds, "ms" for milliseconds and "us" for microseconds.
+ * Returns 0 on error and non-zero if OK.
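+ * For example, an "accept_date(ms)" expression reaches this function with the
+ * string argument "ms", which is converted below into an ARGT_SINT argument
+ * holding TIME_UNIT_MS before the fetch itself runs.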
+ */
+int smp_check_accept_date_unit(struct arg *args, char **err)
+{
+	if (args[0].type == ARGT_STR) {
+		long long int unit;
+
+		if (strcmp(args[0].data.str.area, "s") == 0) {
+			unit = TIME_UNIT_S;
+		}
+		else if (strcmp(args[0].data.str.area, "ms") == 0) {
+			unit = TIME_UNIT_MS;
+		}
+		else if (strcmp(args[0].data.str.area, "us") == 0) {
+			unit = TIME_UNIT_US;
+		}
+		else {
+			memprintf(err, "expects 's', 'ms' or 'us', got '%s'",
+			          args[0].data.str.area);
+			return 0;
+		}
+
+		chunk_destroy(&args[0].data.str);
+		args[0].type = ARGT_SINT;
+		args[0].data.sint = unit;
+	}
+	else if (args[0].type != ARGT_STOP) {
+		memprintf(err, "Unexpected arg type");
+		return 0;
+	}
+
+	return 1;
+}
+
+/* Retrieves the accept or request date in epoch time, and converts it to
+ * milliseconds or microseconds when the optional unit argument (args[0])
+ * requests it.
+ */
+static int
+smp_fetch_accept_date(const struct arg *args, struct sample *smp, const char *kw, void *private)
+{
+	struct strm_logs *logs;
+	struct timeval tv;
+
+	if (!smp->strm)
+		return 0;
+
+	logs = &smp->strm->logs;
+
+	if (kw[0] == 'r') { /* request_date */
+		tv_ms_add(&tv, &logs->accept_date, logs->t_idle >= 0 ? logs->t_idle + logs->t_handshake : 0);
+	} else { /* accept_date */
+		tv.tv_sec = logs->accept_date.tv_sec;
+		tv.tv_usec = logs->accept_date.tv_usec;
+	}
+
+	smp->data.u.sint = tv.tv_sec;
+
+	/* report in milliseconds */
+	if (args[0].type == ARGT_SINT && args[0].data.sint == TIME_UNIT_MS) {
+		smp->data.u.sint *= 1000;
+		smp->data.u.sint += tv.tv_usec / 1000;
+	}
+	/* report in microseconds */
+	else if (args[0].type == ARGT_SINT && args[0].data.sint == TIME_UNIT_US) {
+		smp->data.u.sint *= 1000000;
+		smp->data.u.sint += tv.tv_usec;
+	}
+
+	smp->data.type = SMP_T_SINT;
+	smp->flags |= SMP_F_VOL_TEST | SMP_F_MAY_CHANGE;
+	return 1;
+}
+
+/* Note: must not be declared <const> as its list will be overwritten.
+ * Note: fetches that may return multiple types should be declared using the
+ * appropriate pseudo-type. If not available it must be declared as the lowest
+ * common denominator, the type that can be cast into all other ones.
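+ * As an illustrative configuration example (not part of these sources), such
+ * fetches are typically referenced as:
+ *   acl from_local src_is_local
+ *   http-request set-header X-Accept-TS %[accept_date(ms)]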
+ */ +static struct sample_fetch_kw_list sample_fetch_keywords = {ILH, { + /* timestamps */ + { "accept_date", smp_fetch_accept_date, ARG1(0,STR), smp_check_accept_date_unit, SMP_T_SINT, SMP_USE_L4CLI }, + { "request_date", smp_fetch_accept_date, ARG1(0,STR), smp_check_accept_date_unit, SMP_T_SINT, SMP_USE_HRQHP }, + + { "bc_dst", smp_fetch_dst, 0, NULL, SMP_T_ADDR, SMP_USE_L4SRV }, + { "bc_dst_port", smp_fetch_dport, 0, NULL, SMP_T_SINT, SMP_USE_L4SRV }, + { "bc_src", smp_fetch_src, 0, NULL, SMP_T_ADDR, SMP_USE_L4SRV }, + { "bc_src_port", smp_fetch_sport, 0, NULL, SMP_T_SINT, SMP_USE_L4SRV }, + + { "dst", smp_fetch_dst, 0, NULL, SMP_T_ADDR, SMP_USE_L4CLI }, + { "dst_is_local", smp_fetch_dst_is_local, 0, NULL, SMP_T_BOOL, SMP_USE_L4CLI }, + { "dst_port", smp_fetch_dport, 0, NULL, SMP_T_SINT, SMP_USE_L4CLI }, + + { "fc_dst", smp_fetch_dst, 0, NULL, SMP_T_ADDR, SMP_USE_L4CLI }, + { "fc_dst_is_local", smp_fetch_dst_is_local, 0, NULL, SMP_T_BOOL, SMP_USE_L4CLI }, + { "fc_dst_port", smp_fetch_dport, 0, NULL, SMP_T_SINT, SMP_USE_L4CLI }, + + { "fc_src", smp_fetch_src, 0, NULL, SMP_T_ADDR, SMP_USE_L4CLI }, + { "fc_src_is_local", smp_fetch_src_is_local, 0, NULL, SMP_T_BOOL, SMP_USE_L4CLI }, + { "fc_src_port", smp_fetch_sport, 0, NULL, SMP_T_SINT, SMP_USE_L4CLI }, + + { "src", smp_fetch_src, 0, NULL, SMP_T_ADDR, SMP_USE_L4CLI }, + { "src_is_local", smp_fetch_src_is_local, 0, NULL, SMP_T_BOOL, SMP_USE_L4CLI }, + { "src_port", smp_fetch_sport, 0, NULL, SMP_T_SINT, SMP_USE_L4CLI }, +#ifdef TCP_INFO + { "fc_rtt", smp_fetch_fc_rtt, ARG1(0,STR), val_fc_time_value, SMP_T_SINT, SMP_USE_L4CLI }, + { "fc_rttvar", smp_fetch_fc_rttvar, ARG1(0,STR), val_fc_time_value, SMP_T_SINT, SMP_USE_L4CLI }, + { "bc_rtt", smp_fetch_bc_rtt, ARG1(0,STR), val_fc_time_value, SMP_T_SINT, SMP_USE_L4CLI }, + { "bc_rttvar", smp_fetch_bc_rttvar, ARG1(0,STR), val_fc_time_value, SMP_T_SINT, SMP_USE_L4CLI }, + +#if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__APPLE__) + { "fc_unacked", smp_fetch_fc_unacked, ARG1(0,STR), var_fc_counter, SMP_T_SINT, SMP_USE_L4CLI }, +#endif +#if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) + { "fc_sacked", smp_fetch_fc_sacked, ARG1(0,STR), var_fc_counter, SMP_T_SINT, SMP_USE_L4CLI }, +#endif +#if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__APPLE__) + { "fc_retrans", smp_fetch_fc_retrans, ARG1(0,STR), var_fc_counter, SMP_T_SINT, SMP_USE_L4CLI }, +#endif +#if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) + { "fc_fackets", smp_fetch_fc_fackets, ARG1(0,STR), var_fc_counter, SMP_T_SINT, SMP_USE_L4CLI }, +#endif +#if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__APPLE__) + { "fc_lost", smp_fetch_fc_lost, ARG1(0,STR), var_fc_counter, SMP_T_SINT, SMP_USE_L4CLI }, +#endif +#if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) + { "fc_reordering", smp_fetch_fc_reordering, ARG1(0,STR), var_fc_counter, SMP_T_SINT, SMP_USE_L4CLI }, +#endif +#endif // TCP_INFO + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &sample_fetch_keywords); + + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/tcpcheck.c b/src/tcpcheck.c new file mode 100644 index 0000000..d30ecb5 --- /dev/null +++ b/src/tcpcheck.c @@ -0,0 +1,5150 @@ +/* + * Health-checks functions. 
+ * + * Copyright 2000-2009,2020 Willy Tarreau <w@1wt.eu> + * Copyright 2007-2010 Krzysztof Piotr Oledzki <ole@ans.pl> + * Copyright 2013 Baptiste Assmann <bedis9@gmail.com> + * Copyright 2020 Gaetan Rivet <grive@u256.net> + * Copyright 2020 Christopher Faulet <cfaulet@haproxy.com> + * Crown Copyright 2022 Defence Science and Technology Laboratory <dstlipgroup@dstl.gov.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <sys/resource.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <arpa/inet.h> + +#include <ctype.h> +#include <errno.h> +#include <signal.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +#include <haproxy/action.h> +#include <haproxy/api.h> +#include <haproxy/cfgparse.h> +#include <haproxy/check.h> +#include <haproxy/chunk.h> +#include <haproxy/connection.h> +#include <haproxy/errors.h> +#include <haproxy/global.h> +#include <haproxy/h1.h> +#include <haproxy/http.h> +#include <haproxy/http_htx.h> +#include <haproxy/htx.h> +#include <haproxy/istbuf.h> +#include <haproxy/list.h> +#include <haproxy/log.h> +#include <haproxy/net_helper.h> +#include <haproxy/protocol.h> +#include <haproxy/proxy-t.h> +#include <haproxy/regex.h> +#include <haproxy/sample.h> +#include <haproxy/server.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/stconn.h> +#include <haproxy/task.h> +#include <haproxy/tcpcheck.h> +#include <haproxy/ticks.h> +#include <haproxy/tools.h> +#include <haproxy/trace.h> +#include <haproxy/vars.h> + + +#define TRACE_SOURCE &trace_check + +/* Global tree to share all tcp-checks */ +struct eb_root shared_tcpchecks = EB_ROOT; + + +DECLARE_POOL(pool_head_tcpcheck_rule, "tcpcheck_rule", sizeof(struct tcpcheck_rule)); + +/**************************************************************************/ +/*************** Init/deinit tcp-check rules and ruleset ******************/ +/**************************************************************************/ +/* Releases memory allocated for a log-format string */ +static void free_tcpcheck_fmt(struct list *fmt) +{ + struct logformat_node *lf, *lfb; + + list_for_each_entry_safe(lf, lfb, fmt, list) { + LIST_DELETE(&lf->list); + release_sample_expr(lf->expr); + free(lf->arg); + free(lf); + } +} + +/* Releases memory allocated for an HTTP header used in a tcp-check send rule */ +void free_tcpcheck_http_hdr(struct tcpcheck_http_hdr *hdr) +{ + if (!hdr) + return; + + free_tcpcheck_fmt(&hdr->value); + istfree(&hdr->name); + free(hdr); +} + +/* Releases memory allocated for an HTTP header list used in a tcp-check send + * rule + */ +static void free_tcpcheck_http_hdrs(struct list *hdrs) +{ + struct tcpcheck_http_hdr *hdr, *bhdr; + + list_for_each_entry_safe(hdr, bhdr, hdrs, list) { + LIST_DELETE(&hdr->list); + free_tcpcheck_http_hdr(hdr); + } +} + +/* Releases memory allocated for a tcp-check. If in_pool is set, it means the + * tcp-check was allocated using a memory pool (it is used to instantiate email + * alerts). 
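+ * In that case the rule is returned to pool_head_tcpcheck_rule instead of
+ * being released with free().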
+ */ +void free_tcpcheck(struct tcpcheck_rule *rule, int in_pool) +{ + if (!rule) + return; + + free(rule->comment); + switch (rule->action) { + case TCPCHK_ACT_SEND: + switch (rule->send.type) { + case TCPCHK_SEND_STRING: + case TCPCHK_SEND_BINARY: + istfree(&rule->send.data); + break; + case TCPCHK_SEND_STRING_LF: + case TCPCHK_SEND_BINARY_LF: + free_tcpcheck_fmt(&rule->send.fmt); + break; + case TCPCHK_SEND_HTTP: + free(rule->send.http.meth.str.area); + if (!(rule->send.http.flags & TCPCHK_SND_HTTP_FL_URI_FMT)) + istfree(&rule->send.http.uri); + else + free_tcpcheck_fmt(&rule->send.http.uri_fmt); + istfree(&rule->send.http.vsn); + free_tcpcheck_http_hdrs(&rule->send.http.hdrs); + if (!(rule->send.http.flags & TCPCHK_SND_HTTP_FL_BODY_FMT)) + istfree(&rule->send.http.body); + else + free_tcpcheck_fmt(&rule->send.http.body_fmt); + break; + case TCPCHK_SEND_UNDEF: + break; + } + break; + case TCPCHK_ACT_EXPECT: + free_tcpcheck_fmt(&rule->expect.onerror_fmt); + free_tcpcheck_fmt(&rule->expect.onsuccess_fmt); + release_sample_expr(rule->expect.status_expr); + switch (rule->expect.type) { + case TCPCHK_EXPECT_HTTP_STATUS: + free(rule->expect.codes.codes); + break; + case TCPCHK_EXPECT_STRING: + case TCPCHK_EXPECT_BINARY: + case TCPCHK_EXPECT_HTTP_BODY: + istfree(&rule->expect.data); + break; + case TCPCHK_EXPECT_STRING_REGEX: + case TCPCHK_EXPECT_BINARY_REGEX: + case TCPCHK_EXPECT_HTTP_STATUS_REGEX: + case TCPCHK_EXPECT_HTTP_BODY_REGEX: + regex_free(rule->expect.regex); + break; + case TCPCHK_EXPECT_STRING_LF: + case TCPCHK_EXPECT_BINARY_LF: + case TCPCHK_EXPECT_HTTP_BODY_LF: + free_tcpcheck_fmt(&rule->expect.fmt); + break; + case TCPCHK_EXPECT_HTTP_HEADER: + if (rule->expect.flags & TCPCHK_EXPT_FL_HTTP_HNAME_REG) + regex_free(rule->expect.hdr.name_re); + else if (rule->expect.flags & TCPCHK_EXPT_FL_HTTP_HNAME_FMT) + free_tcpcheck_fmt(&rule->expect.hdr.name_fmt); + else + istfree(&rule->expect.hdr.name); + + if (rule->expect.flags & TCPCHK_EXPT_FL_HTTP_HVAL_REG) + regex_free(rule->expect.hdr.value_re); + else if (rule->expect.flags & TCPCHK_EXPT_FL_HTTP_HVAL_FMT) + free_tcpcheck_fmt(&rule->expect.hdr.value_fmt); + else if (!(rule->expect.flags & TCPCHK_EXPT_FL_HTTP_HVAL_NONE)) + istfree(&rule->expect.hdr.value); + break; + case TCPCHK_EXPECT_CUSTOM: + case TCPCHK_EXPECT_UNDEF: + break; + } + break; + case TCPCHK_ACT_CONNECT: + free(rule->connect.sni); + free(rule->connect.alpn); + release_sample_expr(rule->connect.port_expr); + break; + case TCPCHK_ACT_COMMENT: + break; + case TCPCHK_ACT_ACTION_KW: + free(rule->action_kw.rule); + break; + } + + if (in_pool) + pool_free(pool_head_tcpcheck_rule, rule); + else + free(rule); +} + +/* Creates a tcp-check variable used in preset variables before executing a + * tcp-check ruleset. 
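+ * Returns the newly allocated variable or NULL if the allocation fails.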
+ */ +struct tcpcheck_var *create_tcpcheck_var(const struct ist name) +{ + struct tcpcheck_var *var = NULL; + + var = calloc(1, sizeof(*var)); + if (var == NULL) + return NULL; + + var->name = istdup(name); + if (!isttest(var->name)) { + free(var); + return NULL; + } + + LIST_INIT(&var->list); + return var; +} + +/* Releases memory allocated for a preset tcp-check variable */ +void free_tcpcheck_var(struct tcpcheck_var *var) +{ + if (!var) + return; + + istfree(&var->name); + if (var->data.type == SMP_T_STR || var->data.type == SMP_T_BIN) + free(var->data.u.str.area); + else if (var->data.type == SMP_T_METH && var->data.u.meth.meth == HTTP_METH_OTHER) + free(var->data.u.meth.str.area); + free(var); +} + +/* Releases a list of preset tcp-check variables */ +void free_tcpcheck_vars(struct list *vars) +{ + struct tcpcheck_var *var, *back; + + list_for_each_entry_safe(var, back, vars, list) { + LIST_DELETE(&var->list); + free_tcpcheck_var(var); + } +} + +/* Duplicate a list of preset tcp-check variables */ +int dup_tcpcheck_vars(struct list *dst, const struct list *src) +{ + const struct tcpcheck_var *var; + struct tcpcheck_var *new = NULL; + + list_for_each_entry(var, src, list) { + new = create_tcpcheck_var(var->name); + if (!new) + goto error; + new->data.type = var->data.type; + if (var->data.type == SMP_T_STR || var->data.type == SMP_T_BIN) { + if (chunk_dup(&new->data.u.str, &var->data.u.str) == NULL) + goto error; + if (var->data.type == SMP_T_STR) + new->data.u.str.area[new->data.u.str.data] = 0; + } + else if (var->data.type == SMP_T_METH && var->data.u.meth.meth == HTTP_METH_OTHER) { + if (chunk_dup(&new->data.u.str, &var->data.u.str) == NULL) + goto error; + new->data.u.str.area[new->data.u.str.data] = 0; + new->data.u.meth.meth = var->data.u.meth.meth; + } + else + new->data.u = var->data.u; + LIST_APPEND(dst, &new->list); + } + return 1; + + error: + free(new); + return 0; +} + +/* Looks for a shared tcp-check ruleset given its name. */ +struct tcpcheck_ruleset *find_tcpcheck_ruleset(const char *name) +{ + struct tcpcheck_ruleset *rs; + struct ebpt_node *node; + + node = ebis_lookup_len(&shared_tcpchecks, name, strlen(name)); + if (node) { + rs = container_of(node, typeof(*rs), node); + return rs; + } + return NULL; +} + +/* Creates a new shared tcp-check ruleset and insert it in shared_tcpchecks + * tree. + */ +struct tcpcheck_ruleset *create_tcpcheck_ruleset(const char *name) +{ + struct tcpcheck_ruleset *rs; + + rs = calloc(1, sizeof(*rs)); + if (rs == NULL) + return NULL; + + rs->node.key = strdup(name); + if (rs->node.key == NULL) { + free(rs); + return NULL; + } + + LIST_INIT(&rs->rules); + ebis_insert(&shared_tcpchecks, &rs->node); + return rs; +} + +/* Releases memory allocated by a tcp-check ruleset. 
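+ * The ruleset is first removed from the shared_tcpchecks tree, then its rules
+ * and the ruleset itself are released.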
*/ +void free_tcpcheck_ruleset(struct tcpcheck_ruleset *rs) +{ + struct tcpcheck_rule *r, *rb; + + if (!rs) + return; + + ebpt_delete(&rs->node); + free(rs->node.key); + list_for_each_entry_safe(r, rb, &rs->rules, list) { + LIST_DELETE(&r->list); + free_tcpcheck(r, 0); + } + free(rs); +} + + +/**************************************************************************/ +/**************** Everything about tcp-checks execution *******************/ +/**************************************************************************/ +/* Returns the id of a step in a tcp-check ruleset */ +int tcpcheck_get_step_id(const struct check *check, const struct tcpcheck_rule *rule) +{ + if (!rule) + rule = check->current_step; + + /* no last started step => first step */ + if (!rule) + return 1; + + /* last step is the first implicit connect */ + if (rule->index == 0 && + rule->action == TCPCHK_ACT_CONNECT && + (rule->connect.options & TCPCHK_OPT_IMPLICIT)) + return 0; + + return rule->index + 1; +} + +/* Returns the first non COMMENT/ACTION_KW tcp-check rule from list <list> or + * NULL if none was found. + */ +struct tcpcheck_rule *get_first_tcpcheck_rule(const struct tcpcheck_rules *rules) +{ + struct tcpcheck_rule *r; + + list_for_each_entry(r, rules->list, list) { + if (r->action != TCPCHK_ACT_COMMENT && r->action != TCPCHK_ACT_ACTION_KW) + return r; + } + return NULL; +} + +/* Returns the last non COMMENT/ACTION_KW tcp-check rule from list <list> or + * NULL if none was found. + */ +static struct tcpcheck_rule *get_last_tcpcheck_rule(struct tcpcheck_rules *rules) +{ + struct tcpcheck_rule *r; + + list_for_each_entry_rev(r, rules->list, list) { + if (r->action != TCPCHK_ACT_COMMENT && r->action != TCPCHK_ACT_ACTION_KW) + return r; + } + return NULL; +} + +/* Returns the non COMMENT/ACTION_KW tcp-check rule from list <list> following + * <start> or NULL if non was found. If <start> is NULL, it relies on + * get_first_tcpcheck_rule(). + */ +static struct tcpcheck_rule *get_next_tcpcheck_rule(struct tcpcheck_rules *rules, struct tcpcheck_rule *start) +{ + struct tcpcheck_rule *r; + + if (!start) + return get_first_tcpcheck_rule(rules); + + r = LIST_NEXT(&start->list, typeof(r), list); + list_for_each_entry_from(r, rules->list, list) { + if (r->action != TCPCHK_ACT_COMMENT && r->action != TCPCHK_ACT_ACTION_KW) + return r; + } + return NULL; +} + + +/* Creates info message when a tcp-check healthcheck fails on an expect rule */ +static void tcpcheck_expect_onerror_message(struct buffer *msg, struct check *check, struct tcpcheck_rule *rule, + int match, struct ist info) +{ + struct sample *smp; + int is_empty; + + /* Follows these step to produce the info message: + * 1. if info field is already provided, copy it + * 2. if the expect rule provides an onerror log-format string, + * use it to produce the message + * 3. the expect rule is part of a protocol check (http, redis, mysql...), do nothing + * 4. Otherwise produce the generic tcp-check info message + */ + if (istlen(info)) { + chunk_istcat(msg, info); + goto comment; + } + else if (!LIST_ISEMPTY(&rule->expect.onerror_fmt)) { + msg->data += sess_build_logline(check->sess, NULL, b_tail(msg), b_room(msg), &rule->expect.onerror_fmt); + goto comment; + } + + is_empty = (IS_HTX_SC(check->sc) ? 
htx_is_empty(htxbuf(&check->bi)) : !b_data(&check->bi)); + if (is_empty) { + TRACE_ERROR("empty response", CHK_EV_RX_DATA|CHK_EV_RX_ERR, check); + chunk_printf(msg, "TCPCHK got an empty response at step %d", + tcpcheck_get_step_id(check, rule)); + goto comment; + } + + if (check->type == PR_O2_TCPCHK_CHK && + (check->tcpcheck_rules->flags & TCPCHK_RULES_PROTO_CHK) != TCPCHK_RULES_TCP_CHK) { + goto comment; + } + + chunk_strcat(msg, (match ? "TCPCHK matched unwanted content" : "TCPCHK did not match content")); + switch (rule->expect.type) { + case TCPCHK_EXPECT_HTTP_STATUS: + chunk_appendf(msg, "(status codes) at step %d", tcpcheck_get_step_id(check, rule)); + break; + case TCPCHK_EXPECT_STRING: + case TCPCHK_EXPECT_HTTP_BODY: + chunk_appendf(msg, " '%.*s' at step %d", (unsigned int)istlen(rule->expect.data), istptr(rule->expect.data), + tcpcheck_get_step_id(check, rule)); + break; + case TCPCHK_EXPECT_BINARY: + chunk_appendf(msg, " (binary) at step %d", tcpcheck_get_step_id(check, rule)); + break; + case TCPCHK_EXPECT_STRING_REGEX: + case TCPCHK_EXPECT_HTTP_STATUS_REGEX: + case TCPCHK_EXPECT_HTTP_BODY_REGEX: + chunk_appendf(msg, " (regex) at step %d", tcpcheck_get_step_id(check, rule)); + break; + case TCPCHK_EXPECT_BINARY_REGEX: + chunk_appendf(msg, " (binary regex) at step %d", tcpcheck_get_step_id(check, rule)); + break; + case TCPCHK_EXPECT_STRING_LF: + case TCPCHK_EXPECT_HTTP_BODY_LF: + chunk_appendf(msg, " (log-format string) at step %d", tcpcheck_get_step_id(check, rule)); + break; + case TCPCHK_EXPECT_BINARY_LF: + chunk_appendf(msg, " (log-format binary) at step %d", tcpcheck_get_step_id(check, rule)); + break; + case TCPCHK_EXPECT_CUSTOM: + chunk_appendf(msg, " (custom function) at step %d", tcpcheck_get_step_id(check, rule)); + break; + case TCPCHK_EXPECT_HTTP_HEADER: + chunk_appendf(msg, " (header pattern) at step %d", tcpcheck_get_step_id(check, rule)); + case TCPCHK_EXPECT_UNDEF: + /* Should never happen. */ + return; + } + + comment: + /* If the failing expect rule provides a comment, it is concatenated to + * the info message. + */ + if (rule->comment) { + chunk_strcat(msg, " comment: "); + chunk_strcat(msg, rule->comment); + } + + /* Finally, the check status code is set if the failing expect rule + * defines a status expression. + */ + if (rule->expect.status_expr) { + smp = sample_fetch_as_type(check->proxy, check->sess, NULL, SMP_OPT_DIR_RES | SMP_OPT_FINAL, + rule->expect.status_expr, SMP_T_STR); + + if (smp && sample_casts[smp->data.type][SMP_T_SINT] && + sample_casts[smp->data.type][SMP_T_SINT](smp)) + check->code = smp->data.u.sint; + } + + *(b_tail(msg)) = '\0'; +} + +/* Creates info message when a tcp-check healthcheck succeeds on an expect rule */ +static void tcpcheck_expect_onsuccess_message(struct buffer *msg, struct check *check, struct tcpcheck_rule *rule, + struct ist info) +{ + struct sample *smp; + + /* Follows these step to produce the info message: + * 1. if info field is already provided, copy it + * 2. if the expect rule provides an onsucces log-format string, + * use it to produce the message + * 3. the expect rule is part of a protocol check (http, redis, mysql...), do nothing + * 4. 
Otherwise produce the generic tcp-check info message + */ + if (istlen(info)) + chunk_istcat(msg, info); + if (!LIST_ISEMPTY(&rule->expect.onsuccess_fmt)) + msg->data += sess_build_logline(check->sess, NULL, b_tail(msg), b_room(msg), + &rule->expect.onsuccess_fmt); + else if (check->type == PR_O2_TCPCHK_CHK && + (check->tcpcheck_rules->flags & TCPCHK_RULES_PROTO_CHK) == TCPCHK_RULES_TCP_CHK) + chunk_strcat(msg, "(tcp-check)"); + + /* Finally, the check status code is set if the expect rule defines a + * status expression. + */ + if (rule->expect.status_expr) { + smp = sample_fetch_as_type(check->proxy, check->sess, NULL, SMP_OPT_DIR_RES | SMP_OPT_FINAL, + rule->expect.status_expr, SMP_T_STR); + + if (smp && sample_casts[smp->data.type][SMP_T_SINT] && + sample_casts[smp->data.type][SMP_T_SINT](smp)) + check->code = smp->data.u.sint; + } + + *(b_tail(msg)) = '\0'; +} + +/* Internal functions to parse and validate a MySQL packet in the context of an + * expect rule. It start to parse the input buffer at the offset <offset>. If + * <last_read> is set, no more data are expected. + */ +static enum tcpcheck_eval_ret tcpcheck_mysql_expect_packet(struct check *check, struct tcpcheck_rule *rule, + unsigned int offset, int last_read) +{ + enum tcpcheck_eval_ret ret = TCPCHK_EVAL_CONTINUE; + enum healthcheck_status status; + struct buffer *msg = NULL; + struct ist desc = IST_NULL; + unsigned int err = 0, plen = 0; + + + TRACE_ENTER(CHK_EV_TCPCHK_EXP, check); + + /* 3 Bytes for the packet length and 1 byte for the sequence id */ + if (b_data(&check->bi) < offset+4) { + if (!last_read) + goto wait_more_data; + + /* invalid length or truncated response */ + status = HCHK_STATUS_L7RSP; + goto error; + } + + plen = ((unsigned char) *b_peek(&check->bi, offset)) + + (((unsigned char) *(b_peek(&check->bi, offset+1))) << 8) + + (((unsigned char) *(b_peek(&check->bi, offset+2))) << 16); + + if (b_data(&check->bi) < offset+plen+4) { + if (!last_read) + goto wait_more_data; + + /* invalid length or truncated response */ + status = HCHK_STATUS_L7RSP; + goto error; + } + + if (*b_peek(&check->bi, offset+4) == '\xff') { + /* MySQL Error packet always begin with field_count = 0xff */ + status = HCHK_STATUS_L7STS; + err = ((unsigned char) *b_peek(&check->bi, offset+5)) + + (((unsigned char) *(b_peek(&check->bi, offset+6))) << 8); + desc = ist2(b_peek(&check->bi, offset+7), b_data(&check->bi) - offset - 7); + goto error; + } + + if (get_next_tcpcheck_rule(check->tcpcheck_rules, rule) != NULL) { + /* Not the last rule, continue */ + goto out; + } + + /* We set the MySQL Version in description for information purpose + * FIXME : it can be cool to use MySQL Version for other purpose, + * like mark as down old MySQL server. + */ + status = ((rule->expect.ok_status != HCHK_STATUS_UNKNOWN) ? rule->expect.ok_status : HCHK_STATUS_L7OKD); + set_server_check_status(check, status, b_peek(&check->bi, 5)); + + out: + free_trash_chunk(msg); + TRACE_LEAVE(CHK_EV_TCPCHK_EXP, check, 0, 0, (size_t[]){ret}); + return ret; + + error: + ret = TCPCHK_EVAL_STOP; + check->code = err; + msg = alloc_trash_chunk(); + if (msg) + tcpcheck_expect_onerror_message(msg, check, rule, 0, desc); + set_server_check_status(check, status, (msg ? b_head(msg) : NULL)); + goto out; + + wait_more_data: + TRACE_DEVEL("waiting for more data", CHK_EV_TCPCHK_EXP, check); + ret = TCPCHK_EVAL_WAIT; + goto out; +} + +/* Custom tcp-check expect function to parse and validate the MySQL initial + * handshake packet. 
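+ * (As decoded above, such a packet starts with a 3-byte little-endian payload
+ * length followed by a 1-byte sequence number, then the payload itself.)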
+ * Returns TCPCHK_EVAL_WAIT to wait for more data,
+ * TCPCHK_EVAL_CONTINUE to evaluate the next rule or TCPCHK_EVAL_STOP if an
+ * error occurred.
+ */
+enum tcpcheck_eval_ret tcpcheck_mysql_expect_iniths(struct check *check, struct tcpcheck_rule *rule, int last_read)
+{
+	return tcpcheck_mysql_expect_packet(check, rule, 0, last_read);
+}
+
+/* Custom tcp-check expect function to parse and validate the MySQL OK packet
+ * following the initial handshake. Returns TCPCHK_EVAL_WAIT to wait for more
+ * data, TCPCHK_EVAL_CONTINUE to evaluate the next rule or TCPCHK_EVAL_STOP if
+ * an error occurred.
+ */
+enum tcpcheck_eval_ret tcpcheck_mysql_expect_ok(struct check *check, struct tcpcheck_rule *rule, int last_read)
+{
+	unsigned int hslen = 0;
+
+	hslen = 4 + ((unsigned char) *b_head(&check->bi)) +
+	        (((unsigned char) *(b_peek(&check->bi, 1))) << 8) +
+	        (((unsigned char) *(b_peek(&check->bi, 2))) << 16);
+
+	return tcpcheck_mysql_expect_packet(check, rule, hslen, last_read);
+}
+
+/* Custom tcp-check expect function to parse and validate the LDAP bind
+ * response packet. Returns TCPCHK_EVAL_WAIT to wait for more data,
+ * TCPCHK_EVAL_CONTINUE to evaluate the next rule or TCPCHK_EVAL_STOP if an
+ * error occurred.
+ */
+enum tcpcheck_eval_ret tcpcheck_ldap_expect_bindrsp(struct check *check, struct tcpcheck_rule *rule, int last_read)
+{
+	enum tcpcheck_eval_ret ret = TCPCHK_EVAL_CONTINUE;
+	enum healthcheck_status status;
+	struct buffer *msg = NULL;
+	struct ist desc = IST_NULL;
+	char *ptr;
+	unsigned short nbytes = 0;
+	size_t msglen = 0;
+
+	TRACE_ENTER(CHK_EV_TCPCHK_EXP, check);
+
+	/* Check if the server speaks LDAP (ASN.1/BER)
+	 * http://en.wikipedia.org/wiki/Basic_Encoding_Rules
+	 * http://tools.ietf.org/html/rfc4511
+	 */
+	ptr = b_head(&check->bi) + 1;
+
+	/* size of LDAPMessage */
+	if (*ptr & 0x80) {
+		/* For a message size encoded on several bytes, we only handle
+		 * sizes encoded on 2 or 4 bytes. There is no reason to make
+		 * this part too complex because only Active Directory is known
+		 * to encode the BindResponse length on 4 bytes.
+		 */
+		nbytes = (*ptr & 0x7f);
+		if (b_data(&check->bi) < 1 + nbytes)
+			goto too_short;
+		switch (nbytes) {
+		case 4: msglen = read_n32(ptr+1); break;
+		case 2: msglen = read_n16(ptr+1); break;
+		default:
+			status = HCHK_STATUS_L7RSP;
+			desc = ist("Not LDAPv3 protocol");
+			goto error;
+		}
+	}
+	else
+		msglen = *ptr;
+	ptr += 1 + nbytes;
+
+	if (b_data(&check->bi) < 2 + nbytes + msglen)
+		goto too_short;
+
+	/* http://tools.ietf.org/html/rfc4511#section-4.2.2
+	 *   messageID: 0x02 0x01 0x01: INTEGER 1
+	 *   protocolOp: 0x61: bindResponse
+	 */
+	if (memcmp(ptr, "\x02\x01\x01\x61", 4) != 0) {
+		status = HCHK_STATUS_L7RSP;
+		desc = ist("Not LDAPv3 protocol");
+		goto error;
+	}
+	ptr += 4;
+
+	/* skip size of bindResponse */
+	nbytes = 0;
+	if (*ptr & 0x80)
+		nbytes = (*ptr & 0x7f);
+	ptr += 1 + nbytes;
+
+	/* http://tools.ietf.org/html/rfc4511#section-4.1.9
+	 *   ldapResult: 0x0a 0x01: ENUMERATION
+	 */
+	if (memcmp(ptr, "\x0a\x01", 2) != 0) {
+		status = HCHK_STATUS_L7RSP;
+		desc = ist("Not LDAPv3 protocol");
+		goto error;
+	}
+	ptr += 2;
+
+	/* http://tools.ietf.org/html/rfc4511#section-4.1.9
+	 *   resultCode
+	 */
+	check->code = *ptr;
+	if (check->code) {
+		status = HCHK_STATUS_L7STS;
+		desc = ist("See RFC: http://tools.ietf.org/html/rfc4511#section-4.1.9");
+		goto error;
+	}
+
+	status = ((rule->expect.ok_status != HCHK_STATUS_UNKNOWN) ?
rule->expect.ok_status : HCHK_STATUS_L7OKD); + set_server_check_status(check, status, "Success"); + + out: + free_trash_chunk(msg); + TRACE_LEAVE(CHK_EV_TCPCHK_EXP, check, 0, 0, (size_t[]){ret}); + return ret; + + error: + ret = TCPCHK_EVAL_STOP; + msg = alloc_trash_chunk(); + if (msg) + tcpcheck_expect_onerror_message(msg, check, rule, 0, desc); + set_server_check_status(check, status, (msg ? b_head(msg) : NULL)); + goto out; + + too_short: + if (!last_read) + goto wait_more_data; + /* invalid length or truncated response */ + status = HCHK_STATUS_L7RSP; + goto error; + + wait_more_data: + TRACE_DEVEL("waiting for more data", CHK_EV_TCPCHK_EXP, check); + ret = TCPCHK_EVAL_WAIT; + goto out; +} + +/* Custom tcp-check expect function to parse and validate the SPOP hello agent + * frame. Returns TCPCHK_EVAL_WAIT to wait for more data, TCPCHK_EVAL_CONTINUE + * to evaluate the next rule or TCPCHK_EVAL_STOP if an error occurred. + */ +enum tcpcheck_eval_ret tcpcheck_spop_expect_agenthello(struct check *check, struct tcpcheck_rule *rule, int last_read) +{ + enum tcpcheck_eval_ret ret = TCPCHK_EVAL_CONTINUE; + enum healthcheck_status status; + struct buffer *msg = NULL; + struct ist desc = IST_NULL; + unsigned int framesz; + + TRACE_ENTER(CHK_EV_TCPCHK_EXP, check); + + memcpy(&framesz, b_head(&check->bi), 4); + framesz = ntohl(framesz); + + if (!last_read && b_data(&check->bi) < (4+framesz)) + goto wait_more_data; + + memset(b_orig(&trash), 0, b_size(&trash)); + if (spoe_handle_healthcheck_response(b_peek(&check->bi, 4), framesz, b_orig(&trash), HCHK_DESC_LEN) == -1) { + status = HCHK_STATUS_L7RSP; + desc = ist2(b_orig(&trash), strlen(b_orig(&trash))); + goto error; + } + + status = ((rule->expect.ok_status != HCHK_STATUS_UNKNOWN) ? rule->expect.ok_status : HCHK_STATUS_L7OKD); + set_server_check_status(check, status, "SPOA server is ok"); + + out: + free_trash_chunk(msg); + TRACE_LEAVE(CHK_EV_TCPCHK_EXP, check, 0, 0, (size_t[]){ret}); + return ret; + + error: + ret = TCPCHK_EVAL_STOP; + msg = alloc_trash_chunk(); + if (msg) + tcpcheck_expect_onerror_message(msg, check, rule, 0, desc); + set_server_check_status(check, status, (msg ? b_head(msg) : NULL)); + goto out; + + wait_more_data: + TRACE_DEVEL("waiting for more data", CHK_EV_TCPCHK_EXP, check); + ret = TCPCHK_EVAL_WAIT; + goto out; +} + +/* Custom tcp-check expect function to parse and validate the agent-check + * reply. Returns TCPCHK_EVAL_WAIT to wait for more data, TCPCHK_EVAL_CONTINUE + * to evaluate the next rule or TCPCHK_EVAL_STOP if an error occurred. + */ +enum tcpcheck_eval_ret tcpcheck_agent_expect_reply(struct check *check, struct tcpcheck_rule *rule, int last_read) +{ + enum tcpcheck_eval_ret ret = TCPCHK_EVAL_STOP; + enum healthcheck_status status = HCHK_STATUS_CHECKED; + const char *hs = NULL; /* health status */ + const char *as = NULL; /* admin status */ + const char *ps = NULL; /* performance status */ + const char *sc = NULL; /* maxconn */ + const char *err = NULL; /* first error to report */ + const char *wrn = NULL; /* first warning to report */ + char *cmd, *p; + + TRACE_ENTER(CHK_EV_TCPCHK_EXP, check); + + /* We're getting an agent check response. The agent could + * have been disabled in the mean time with a long check + * still pending. It is important that we ignore the whole + * response. + */ + if (!(check->state & CHK_ST_ENABLED)) + goto out; + + /* The agent supports strings made of a single line ended by the + * first CR ('\r') or LF ('\n'). 
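+ * (For example, an agent might answer "up 75% maxconn:30 ready\n".)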
+	p = b_head(&check->bi);
+	while (*p && *p != '\n' && *p != '\r')
+		p++;
+
+	if (!*p) {
+		if (!last_read)
+			goto wait_more_data;
+
+		/* at least inform the admin that the agent is misbehaving */
+		set_server_check_status(check, check->status, "Ignoring incomplete line from agent");
+		goto out;
+	}
+
+	*p = 0;
+	cmd = b_head(&check->bi);
+
+	while (*cmd) {
+		/* look for next word */
+		if (*cmd == ' ' || *cmd == '\t' || *cmd == ',') {
+			cmd++;
+			continue;
+		}
+
+		if (*cmd == '#') {
+			/* this is the beginning of a health status description,
+			 * skip the sharp and blanks.
+			 */
+			cmd++;
+			while (*cmd == '\t' || *cmd == ' ')
+				cmd++;
+			break;
+		}
+
+		/* find the end of the word so that we have a null-terminated
+		 * word between <cmd> and <p>.
+		 */
+		p = cmd + 1;
+		while (*p && *p != '\t' && *p != ' ' && *p != '\n' && *p != ',')
+			p++;
+		if (*p)
+			*p++ = 0;
+
+		/* first, health statuses */
+		if (strcasecmp(cmd, "up") == 0) {
+			check->health = check->rise + check->fall - 1;
+			status = HCHK_STATUS_L7OKD;
+			hs = cmd;
+		}
+		else if (strcasecmp(cmd, "down") == 0) {
+			check->health = 0;
+			status = HCHK_STATUS_L7STS;
+			hs = cmd;
+		}
+		else if (strcasecmp(cmd, "stopped") == 0) {
+			check->health = 0;
+			status = HCHK_STATUS_L7STS;
+			hs = cmd;
+		}
+		else if (strcasecmp(cmd, "fail") == 0) {
+			check->health = 0;
+			status = HCHK_STATUS_L7STS;
+			hs = cmd;
+		}
+		/* admin statuses */
+		else if (strcasecmp(cmd, "ready") == 0) {
+			as = cmd;
+		}
+		else if (strcasecmp(cmd, "drain") == 0) {
+			as = cmd;
+		}
+		else if (strcasecmp(cmd, "maint") == 0) {
+			as = cmd;
+		}
+		/* try to parse a weight here and keep the last one */
+		else if (isdigit((unsigned char)*cmd) && strchr(cmd, '%') != NULL) {
+			ps = cmd;
+		}
+		/* try to parse a maxconn here */
+		else if (strncasecmp(cmd, "maxconn:", strlen("maxconn:")) == 0) {
+			sc = cmd;
+		}
+		else {
+			/* keep a copy of the first error */
+			if (!err)
+				err = cmd;
+		}
+		/* skip to next word */
+		cmd = p;
+	}
+	/* here, cmd points either to \0 or to the beginning of a
+	 * description. Skip possible leading spaces.
+	 */
+	while (*cmd == ' ' || *cmd == '\n')
+		cmd++;
+
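+	/* For instance (hypothetical input), after parsing "up 50% # lowered
+	 * weight", <hs> points to "up", <ps> to "50%" and <cmd> to the
+	 * description "lowered weight". The admin status is applied first
+	 * below, then the weight, then the health status.
+	 */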
+	/* First, update the admin status so that we avoid sending other
+	 * possibly useless warnings and can also update the health if
+	 * present after going back up.
+	 */
+	if (as) {
+		if (strcasecmp(as, "drain") == 0) {
+			TRACE_DEVEL("set server into DRAIN mode", CHK_EV_TCPCHK_EXP, check);
+			srv_adm_set_drain(check->server);
+		}
+		else if (strcasecmp(as, "maint") == 0) {
+			TRACE_DEVEL("set server into MAINT mode", CHK_EV_TCPCHK_EXP, check);
+			srv_adm_set_maint(check->server);
+		}
+		else {
+			TRACE_DEVEL("set server into READY mode", CHK_EV_TCPCHK_EXP, check);
+			srv_adm_set_ready(check->server);
+		}
+	}
+
+	/* now change weights */
+	if (ps) {
+		const char *msg;
+
+		TRACE_DEVEL("change server weight", CHK_EV_TCPCHK_EXP, check);
+		msg = server_parse_weight_change_request(check->server, ps);
+		if (!wrn || !*wrn)
+			wrn = msg;
+	}
+
+	if (sc) {
+		const char *msg;
+
+		sc += strlen("maxconn:");
+
+		TRACE_DEVEL("change server maxconn", CHK_EV_TCPCHK_EXP, check);
+		/* It is safe to call server_parse_maxconn_change_request
+		 * because the server lock is held during the check.
+		 */
+		msg = server_parse_maxconn_change_request(check->server, sc);
+		if (!wrn || !*wrn)
+			wrn = msg;
+	}
+
+	/* and finally health status */
+	if (hs) {
+		/* We'll report some of the warnings and errors we have
+		 * here. Down reports are critical, we leave them untouched.
+		 * Lack of report, or report of 'UP' leaves the room for
+		 * ERR first, then WARN.
+		 */
+		const char *msg = cmd;
+		struct buffer *t;
+
+		if (!*msg || status == HCHK_STATUS_L7OKD) {
+			if (err && *err)
+				msg = err;
+			else if (wrn && *wrn)
+				msg = wrn;
+		}
+
+		t = get_trash_chunk();
+		chunk_printf(t, "via agent : %s%s%s%s",
+			     hs, *msg ? " (" : "",
+			     msg, *msg ? ")" : "");
+		TRACE_DEVEL("update server health status", CHK_EV_TCPCHK_EXP, check);
+		set_server_check_status(check, status, t->area);
+	}
+	else if (err && *err) {
+		/* No status change but we'd like to report something odd.
+		 * Just report the current state and copy the message.
+		 */
+		TRACE_DEVEL("agent reports an error", CHK_EV_TCPCHK_EXP, check);
+		chunk_printf(&trash, "agent reports an error : %s", err);
+		set_server_check_status(check, status/*check->status*/, trash.area);
+	}
+	else if (wrn && *wrn) {
+		/* No status change but we'd like to report something odd.
+		 * Just report the current state and copy the message.
+		 */
+		TRACE_DEVEL("agent reports a warning", CHK_EV_TCPCHK_EXP, check);
+		chunk_printf(&trash, "agent warns : %s", wrn);
+		set_server_check_status(check, status/*check->status*/, trash.area);
+	}
+	else {
+		TRACE_DEVEL("update server health status", CHK_EV_TCPCHK_EXP, check);
+		set_server_check_status(check, status, NULL);
+	}
+
+  out:
+	TRACE_LEAVE(CHK_EV_TCPCHK_EXP, check, 0, 0, (size_t[]){ret});
+	return ret;
+
+  wait_more_data:
+	TRACE_DEVEL("waiting for more data", CHK_EV_TCPCHK_EXP, check);
+	ret = TCPCHK_EVAL_WAIT;
+	goto out;
+}
+
+/* Evaluates a TCPCHK_ACT_CONNECT rule. Returns TCPCHK_EVAL_WAIT to wait for
+ * the connection establishment, TCPCHK_EVAL_CONTINUE to evaluate the next rule
+ * or TCPCHK_EVAL_STOP if an error occurred.
+ */ +enum tcpcheck_eval_ret tcpcheck_eval_connect(struct check *check, struct tcpcheck_rule *rule) +{ + enum tcpcheck_eval_ret ret = TCPCHK_EVAL_CONTINUE; + struct tcpcheck_connect *connect = &rule->connect; + struct proxy *proxy = check->proxy; + struct server *s = check->server; + struct task *t = check->task; + struct connection *conn = sc_conn(check->sc); + struct protocol *proto; + struct xprt_ops *xprt; + struct tcpcheck_rule *next; + int status, port; + + TRACE_ENTER(CHK_EV_TCPCHK_CONN, check); + + next = get_next_tcpcheck_rule(check->tcpcheck_rules, rule); + + /* current connection already created, check if it is established or not */ + if (conn) { + if (conn->flags & CO_FL_WAIT_XPRT) { + /* We are still waiting for the connection establishment */ + if (next && next->action == TCPCHK_ACT_SEND) { + if (!(check->sc->wait_event.events & SUB_RETRY_SEND)) + conn->mux->subscribe(check->sc, SUB_RETRY_SEND, &check->sc->wait_event); + ret = TCPCHK_EVAL_WAIT; + TRACE_DEVEL("not connected yet", CHK_EV_TCPCHK_CONN, check); + } + else + ret = tcpcheck_eval_recv(check, rule); + } + goto out; + } + + /* Note: here check->sc = sc = conn = NULL */ + + /* Always release input and output buffer when a new connect is evaluated */ + check_release_buf(check, &check->bi); + check_release_buf(check, &check->bo); + + /* No connection, prepare a new one */ + conn = conn_new((s ? &s->obj_type : &proxy->obj_type)); + if (!conn) { + chunk_printf(&trash, "TCPCHK error allocating connection at step %d", + tcpcheck_get_step_id(check, rule)); + if (rule->comment) + chunk_appendf(&trash, " comment: '%s'", rule->comment); + set_server_check_status(check, HCHK_STATUS_SOCKERR, trash.area); + ret = TCPCHK_EVAL_STOP; + TRACE_ERROR("stconn allocation error", CHK_EV_TCPCHK_CONN|CHK_EV_TCPCHK_ERR, check); + goto out; + } + if (sc_attach_mux(check->sc, NULL, conn) < 0) { + TRACE_ERROR("mux attach error", CHK_EV_TCPCHK_CONN|CHK_EV_TCPCHK_ERR, check); + conn_free(conn); + conn = NULL; + status = SF_ERR_RESOURCE; + goto fail_check; + } + conn->ctx = check->sc; + conn_set_owner(conn, check->sess, NULL); + + /* no client address */ + if (!sockaddr_alloc(&conn->dst, NULL, 0)) { + TRACE_ERROR("sockaddr allocation error", CHK_EV_TCPCHK_CONN|CHK_EV_TCPCHK_ERR, check); + status = SF_ERR_RESOURCE; + goto fail_check; + } + + /* connect to the connect rule addr if specified, otherwise the check + * addr if specified on the server. otherwise, use the server addr (it + * MUST exist at this step). + */ + *conn->dst = (is_addr(&connect->addr) + ? connect->addr + : (is_addr(&check->addr) ? check->addr : s->addr)); + proto = protocol_lookup(conn->dst->ss_family, PROTO_TYPE_STREAM, 0); + + port = 0; + if (connect->port) + port = connect->port; + if (!port && connect->port_expr) { + struct sample *smp; + + smp = sample_fetch_as_type(check->proxy, check->sess, NULL, + SMP_OPT_DIR_REQ | SMP_OPT_FINAL, + connect->port_expr, SMP_T_SINT); + if (smp) + port = smp->data.u.sint; + } + if (!port && is_inet_addr(&connect->addr)) + port = get_host_port(&connect->addr); + if (!port && check->port) + port = check->port; + if (!port && is_inet_addr(&check->addr)) + port = get_host_port(&check->addr); + if (!port) { + /* The server MUST exist here */ + port = s->svc_port; + } + set_host_port(conn->dst, port); + TRACE_DEVEL("set port", CHK_EV_TCPCHK_CONN, check, 0, 0, (size_t[]){port}); + + xprt = ((connect->options & TCPCHK_OPT_SSL) + ? xprt_get(XPRT_SSL) + : ((connect->options & TCPCHK_OPT_DEFAULT_CONNECT) ? 
check->xprt : xprt_get(XPRT_RAW)));
+
+	if (conn_prepare(conn, proto, xprt) < 0) {
+		TRACE_ERROR("xprt allocation error", CHK_EV_TCPCHK_CONN|CHK_EV_TCPCHK_ERR, check);
+		status = SF_ERR_RESOURCE;
+		goto fail_check;
+	}
+
+	if ((connect->options & TCPCHK_OPT_SOCKS4) && s && (s->flags & SRV_F_SOCKS4_PROXY)) {
+		conn->send_proxy_ofs = 1;
+		conn->flags |= CO_FL_SOCKS4;
+		TRACE_DEVEL("configure SOCKS4 proxy", CHK_EV_TCPCHK_CONN);
+	}
+	else if ((connect->options & TCPCHK_OPT_DEFAULT_CONNECT) && s && s->check.via_socks4 && (s->flags & SRV_F_SOCKS4_PROXY)) {
+		conn->send_proxy_ofs = 1;
+		conn->flags |= CO_FL_SOCKS4;
+		TRACE_DEVEL("configure SOCKS4 proxy", CHK_EV_TCPCHK_CONN);
+	}
+
+	if (connect->options & TCPCHK_OPT_SEND_PROXY) {
+		conn->send_proxy_ofs = 1;
+		conn->flags |= CO_FL_SEND_PROXY;
+		TRACE_DEVEL("configure PROXY protocol", CHK_EV_TCPCHK_CONN, check);
+	}
+	else if ((connect->options & TCPCHK_OPT_DEFAULT_CONNECT) && s && s->check.send_proxy && !(check->state & CHK_ST_AGENT)) {
+		conn->send_proxy_ofs = 1;
+		conn->flags |= CO_FL_SEND_PROXY;
+		TRACE_DEVEL("configure PROXY protocol", CHK_EV_TCPCHK_CONN, check);
+	}
+
+	status = SF_ERR_INTERNAL;
+	if (proto && proto->connect) {
+		int flags = 0;
+
+		if (!next)
+			flags |= CONNECT_DELACK_ALWAYS;
+		if (connect->options & TCPCHK_OPT_HAS_DATA)
+			flags |= (CONNECT_HAS_DATA|CONNECT_DELACK_ALWAYS);
+		status = proto->connect(conn, flags);
+	}
+
+	if (status != SF_ERR_NONE)
+		goto fail_check;
+
+	conn_set_private(conn);
+	conn->ctx = check->sc;
+
+#ifdef USE_OPENSSL
+	if (connect->sni)
+		ssl_sock_set_servername(conn, connect->sni);
+	else if ((connect->options & TCPCHK_OPT_DEFAULT_CONNECT) && s && s->check.sni)
+		ssl_sock_set_servername(conn, s->check.sni);
+
+	if (connect->alpn)
+		ssl_sock_set_alpn(conn, (unsigned char *)connect->alpn, connect->alpn_len);
+	else if ((connect->options & TCPCHK_OPT_DEFAULT_CONNECT) && s && s->check.alpn_str)
+		ssl_sock_set_alpn(conn, (unsigned char *)s->check.alpn_str, s->check.alpn_len);
+#endif
+
+	if (conn_ctrl_ready(conn) && (connect->options & TCPCHK_OPT_LINGER) && !(conn->flags & CO_FL_FDLESS)) {
+		/* Some servers don't like reset on close */
+		HA_ATOMIC_AND(&fdtab[conn->handle.fd].state, ~FD_LINGER_RISK);
+	}
+
+	if (conn_ctrl_ready(conn) && (conn->flags & (CO_FL_SEND_PROXY | CO_FL_SOCKS4))) {
+		if (xprt_add_hs(conn) < 0)
+			status = SF_ERR_RESOURCE;
+	}
+
+	if (conn_xprt_start(conn) < 0) {
+		status = SF_ERR_RESOURCE;
+		goto fail_check;
+	}
+
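+	/* Note: when an ALPN is configured and no mux protocol is forced, the
+	 * mux installation is deferred until the TLS handshake completes, so
+	 * that it can be chosen from the protocol actually negotiated (e.g.
+	 * "h2" vs "http/1.1"). The condition below covers the cases where the
+	 * mux can be installed right away instead.
+	 */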
+	/* The mux may be initialized now if there is no server attached to the
+	 * check (email alerts) or if there is a mux proto specified or if there
+	 * is no alpn.
+	 */
+	if (!s || ((connect->options & TCPCHK_OPT_DEFAULT_CONNECT) && check->mux_proto) ||
+	    connect->mux_proto || (!connect->alpn && !check->alpn_str)) {
+		const struct mux_ops *mux_ops;
+
+		TRACE_DEVEL("try to install mux now", CHK_EV_TCPCHK_CONN, check);
+		if (connect->mux_proto)
+			mux_ops = connect->mux_proto->mux;
+		else if ((connect->options & TCPCHK_OPT_DEFAULT_CONNECT) && check->mux_proto)
+			mux_ops = check->mux_proto->mux;
+		else {
+			int mode = ((check->tcpcheck_rules->flags & TCPCHK_RULES_PROTO_CHK) == TCPCHK_RULES_HTTP_CHK
+				    ? PROTO_MODE_HTTP
+				    : PROTO_MODE_TCP);
+
+			mux_ops = conn_get_best_mux(conn, IST_NULL, PROTO_SIDE_BE, mode);
+		}
+		if (mux_ops && conn_install_mux(conn, mux_ops, check->sc, proxy, check->sess) < 0) {
+			TRACE_ERROR("failed to install mux", CHK_EV_TCPCHK_CONN|CHK_EV_TCPCHK_ERR, check);
+			status = SF_ERR_INTERNAL;
+			goto fail_check;
+		}
+	}
+
+  fail_check:
+	/* It can return one of :
+	 *  - SF_ERR_NONE if everything's OK
+	 *  - SF_ERR_SRVTO if there are no more servers
+	 *  - SF_ERR_SRVCL if the connection was refused by the server
+	 *  - SF_ERR_PRXCOND if the connection has been limited by the proxy (maxconn)
+	 *  - SF_ERR_RESOURCE if a system resource is lacking (eg: fd limits, ports, ...)
+	 *  - SF_ERR_INTERNAL for any other purely internal errors
+	 * Additionally, in the case of SF_ERR_RESOURCE, an emergency log will be emitted.
+	 * Note that we try to prevent the network stack from sending the ACK during the
+	 * connect() when a pure TCP check is used (without PROXY protocol).
+	 */
+	switch (status) {
+	case SF_ERR_NONE:
+		/* we allow up to min(inter, timeout.connect) for a connection
+		 * to establish but only when timeout.check is set as it may be
+		 * too short for a full check otherwise
+		 */
+		t->expire = tick_add(now_ms, MS_TO_TICKS(check->inter));
+
+		if (proxy->timeout.check && proxy->timeout.connect) {
+			int t_con = tick_add(now_ms, proxy->timeout.connect);
+			t->expire = tick_first(t->expire, t_con);
+		}
+		break;
+	case SF_ERR_SRVTO: /* ETIMEDOUT */
+	case SF_ERR_SRVCL: /* ECONNREFUSED, ENETUNREACH, ... */
+	case SF_ERR_PRXCOND:
+	case SF_ERR_RESOURCE:
+	case SF_ERR_INTERNAL:
+		TRACE_ERROR("report connection error", CHK_EV_TCPCHK_CONN|CHK_EV_TCPCHK_ERR, check, 0, 0, (size_t[]){status});
+		chk_report_conn_err(check, errno, 0);
+		ret = TCPCHK_EVAL_STOP;
+		goto out;
+	}
+
+	/* don't do anything until the connection is established */
+	if (conn->flags & CO_FL_WAIT_XPRT) {
+		if (conn->mux) {
+			if (next && next->action == TCPCHK_ACT_SEND)
+				conn->mux->subscribe(check->sc, SUB_RETRY_SEND, &check->sc->wait_event);
+			else
+				conn->mux->subscribe(check->sc, SUB_RETRY_RECV, &check->sc->wait_event);
+		}
+		ret = TCPCHK_EVAL_WAIT;
+		TRACE_DEVEL("not connected yet", CHK_EV_TCPCHK_CONN, check);
+		goto out;
+	}
+
+  out:
+	if (conn && check->result == CHK_RES_FAILED) {
+		conn->flags |= CO_FL_ERROR;
+		TRACE_ERROR("connect failed, report connection error", CHK_EV_TCPCHK_CONN|CHK_EV_TCPCHK_ERR, check);
+	}
+
+	if (ret == TCPCHK_EVAL_CONTINUE && check->proxy->timeout.check)
+		check->task->expire = tick_add_ifset(now_ms, check->proxy->timeout.check);
+
+	TRACE_LEAVE(CHK_EV_TCPCHK_CONN, check, 0, 0, (size_t[]){ret});
+	return ret;
+}
+
+/* Evaluates a TCPCHK_ACT_SEND rule. Returns TCPCHK_EVAL_WAIT if outgoing data
+ * were not fully sent, TCPCHK_EVAL_CONTINUE to evaluate the next rule or
+ * TCPCHK_EVAL_STOP if an error occurred.
+ */ +enum tcpcheck_eval_ret tcpcheck_eval_send(struct check *check, struct tcpcheck_rule *rule) +{ + enum tcpcheck_eval_ret ret = TCPCHK_EVAL_CONTINUE; + struct tcpcheck_send *send = &rule->send; + struct stconn *sc = check->sc; + struct connection *conn = __sc_conn(sc); + struct buffer *tmp = NULL; + struct htx *htx = NULL; + int connection_hdr = 0; + + TRACE_ENTER(CHK_EV_TCPCHK_SND|CHK_EV_TX_DATA, check); + + if (check->state & CHK_ST_OUT_ALLOC) { + ret = TCPCHK_EVAL_WAIT; + TRACE_STATE("waiting for output buffer allocation", CHK_EV_TCPCHK_SND|CHK_EV_TX_DATA|CHK_EV_TX_BLK, check); + goto out; + } + + if (!check_get_buf(check, &check->bo)) { + check->state |= CHK_ST_OUT_ALLOC; + ret = TCPCHK_EVAL_WAIT; + TRACE_STATE("waiting for output buffer allocation", CHK_EV_TCPCHK_SND|CHK_EV_TX_DATA|CHK_EV_TX_BLK, check); + goto out; + } + + /* Data already pending in the output buffer, send them now */ + if ((IS_HTX_CONN(conn) && !htx_is_empty(htxbuf(&check->bo))) || (!IS_HTX_CONN(conn) && b_data(&check->bo))) { + TRACE_DEVEL("Data still pending, try to send it now", CHK_EV_TCPCHK_SND|CHK_EV_TX_DATA, check); + goto do_send; + } + + /* Always release input buffer when a new send is evaluated */ + check_release_buf(check, &check->bi); + + switch (send->type) { + case TCPCHK_SEND_STRING: + case TCPCHK_SEND_BINARY: + if (istlen(send->data) >= b_size(&check->bo)) { + chunk_printf(&trash, "tcp-check send : string too large (%u) for buffer size (%u) at step %d", + (unsigned int)istlen(send->data), (unsigned int)b_size(&check->bo), + tcpcheck_get_step_id(check, rule)); + set_server_check_status(check, HCHK_STATUS_L7RSP, trash.area); + ret = TCPCHK_EVAL_STOP; + goto out; + } + b_putist(&check->bo, send->data); + break; + case TCPCHK_SEND_STRING_LF: + check->bo.data = sess_build_logline(check->sess, NULL, b_orig(&check->bo), b_size(&check->bo), &rule->send.fmt); + if (!b_data(&check->bo)) + goto out; + break; + case TCPCHK_SEND_BINARY_LF: { + int len = b_size(&check->bo); + + tmp = alloc_trash_chunk(); + if (!tmp) + goto error_lf; + tmp->data = sess_build_logline(check->sess, NULL, b_orig(tmp), b_size(tmp), &rule->send.fmt); + if (!b_data(tmp)) + goto out; + tmp->area[tmp->data] = '\0'; + if (parse_binary(b_orig(tmp), &check->bo.area, &len, NULL) == 0) + goto error_lf; + check->bo.data = len; + break; + } + case TCPCHK_SEND_HTTP: { + struct htx_sl *sl; + struct ist meth, uri, vsn, clen, body; + unsigned int slflags = 0; + + tmp = alloc_trash_chunk(); + if (!tmp) + goto error_htx; + + meth = ((send->http.meth.meth == HTTP_METH_OTHER) + ? ist2(send->http.meth.str.area, send->http.meth.str.data) + : http_known_methods[send->http.meth.meth]); + if (send->http.flags & TCPCHK_SND_HTTP_FL_URI_FMT) { + tmp->data = sess_build_logline(check->sess, NULL, b_orig(tmp), b_size(tmp), &send->http.uri_fmt); + uri = (b_data(tmp) ? ist2(b_orig(tmp), b_data(tmp)) : ist("/")); + } + else + uri = (isttest(send->http.uri) ? send->http.uri : ist("/")); + vsn = (isttest(send->http.vsn) ? 
send->http.vsn : ist("HTTP/1.0")); + + if ((istlen(vsn) == 6 && *(vsn.ptr+5) == '2') || + (istlen(vsn) == 8 && (*(vsn.ptr+5) > '1' || (*(vsn.ptr+5) == '1' && *(vsn.ptr+7) >= '1')))) + slflags |= HTX_SL_F_VER_11; + slflags |= (HTX_SL_F_XFER_LEN|HTX_SL_F_CLEN); + if (!(send->http.flags & TCPCHK_SND_HTTP_FL_BODY_FMT) && !isttest(send->http.body)) + slflags |= HTX_SL_F_BODYLESS; + + htx = htx_from_buf(&check->bo); + sl = htx_add_stline(htx, HTX_BLK_REQ_SL, slflags, meth, uri, vsn); + if (!sl) + goto error_htx; + sl->info.req.meth = send->http.meth.meth; + if (!http_update_host(htx, sl, uri)) + goto error_htx; + + if (!LIST_ISEMPTY(&send->http.hdrs)) { + struct tcpcheck_http_hdr *hdr; + struct ist hdr_value; + + list_for_each_entry(hdr, &send->http.hdrs, list) { + chunk_reset(tmp); + tmp->data = sess_build_logline(check->sess, NULL, b_orig(tmp), b_size(tmp), &hdr->value); + if (!b_data(tmp)) + continue; + hdr_value = ist2(b_orig(tmp), b_data(tmp)); + if (!htx_add_header(htx, hdr->name, hdr_value)) + goto error_htx; + if ((sl->flags & HTX_SL_F_HAS_AUTHORITY) && isteqi(hdr->name, ist("host"))) { + if (!http_update_authority(htx, sl, hdr_value)) + goto error_htx; + } + if (isteqi(hdr->name, ist("connection"))) + connection_hdr = 1; + } + + } + if (check->proxy->options2 & PR_O2_CHK_SNDST) { + chunk_reset(tmp); + httpchk_build_status_header(check->server, tmp); + if (!htx_add_header(htx, ist("X-Haproxy-Server-State"), ist2(b_orig(tmp), b_data(tmp)))) + goto error_htx; + } + + + if (send->http.flags & TCPCHK_SND_HTTP_FL_BODY_FMT) { + chunk_reset(tmp); + tmp->data = sess_build_logline(check->sess, NULL, b_orig(tmp), b_size(tmp), &send->http.body_fmt); + body = ist2(b_orig(tmp), b_data(tmp)); + } + else + body = send->http.body; + + if (!connection_hdr && !htx_add_header(htx, ist("Connection"), ist("close"))) + goto error_htx; + + if ((send->http.meth.meth != HTTP_METH_OPTIONS && + send->http.meth.meth != HTTP_METH_GET && + send->http.meth.meth != HTTP_METH_HEAD && + send->http.meth.meth != HTTP_METH_DELETE) || istlen(body)) { + clen = ist((!istlen(body) ? "0" : ultoa(istlen(body)))); + if (!htx_add_header(htx, ist("Content-length"), clen)) + goto error_htx; + } + + if (!htx_add_endof(htx, HTX_BLK_EOH) || + (istlen(body) && !htx_add_data_atonce(htx, body))) + goto error_htx; + + /* no more data are expected */ + htx->flags |= HTX_FL_EOM; + htx_to_buf(htx, &check->bo); + break; + } + case TCPCHK_SEND_UNDEF: + /* Should never happen. */ + ret = TCPCHK_EVAL_STOP; + goto out; + }; + + do_send: + TRACE_DATA("send data", CHK_EV_TCPCHK_SND|CHK_EV_TX_DATA, check); + if (conn->mux->snd_buf(sc, &check->bo, + (IS_HTX_CONN(conn) ? 
(htxbuf(&check->bo))->data: b_data(&check->bo)), 0) <= 0) { + if ((conn->flags & CO_FL_ERROR) || sc_ep_test(sc, SE_FL_ERROR)) { + ret = TCPCHK_EVAL_STOP; + TRACE_DEVEL("connection error during send", CHK_EV_TCPCHK_SND|CHK_EV_TX_DATA|CHK_EV_TX_ERR, check); + goto out; + } + } + if ((IS_HTX_CONN(conn) && !htx_is_empty(htxbuf(&check->bo))) || (!IS_HTX_CONN(conn) && b_data(&check->bo))) { + conn->mux->subscribe(sc, SUB_RETRY_SEND, &sc->wait_event); + ret = TCPCHK_EVAL_WAIT; + TRACE_DEVEL("data not fully sent, wait", CHK_EV_TCPCHK_SND|CHK_EV_TX_DATA, check); + goto out; + } + + out: + free_trash_chunk(tmp); + if (!b_data(&check->bo) || ret == TCPCHK_EVAL_STOP) + check_release_buf(check, &check->bo); + + TRACE_LEAVE(CHK_EV_TCPCHK_SND, check, 0, 0, (size_t[]){ret}); + return ret; + + error_htx: + if (htx) { + htx_reset(htx); + htx_to_buf(htx, &check->bo); + } + chunk_printf(&trash, "tcp-check send : failed to build HTTP request at step %d", + tcpcheck_get_step_id(check, rule)); + TRACE_ERROR("failed to build HTTP request", CHK_EV_TCPCHK_SND|CHK_EV_TX_DATA|CHK_EV_TCPCHK_ERR, check); + set_server_check_status(check, HCHK_STATUS_L7RSP, trash.area); + ret = TCPCHK_EVAL_STOP; + goto out; + + error_lf: + chunk_printf(&trash, "tcp-check send : failed to build log-format string at step %d", + tcpcheck_get_step_id(check, rule)); + TRACE_ERROR("failed to build log-format string", CHK_EV_TCPCHK_SND|CHK_EV_TX_DATA|CHK_EV_TCPCHK_ERR, check); + set_server_check_status(check, HCHK_STATUS_L7RSP, trash.area); + ret = TCPCHK_EVAL_STOP; + goto out; + +} + +/* Try to receive data before evaluating a tcp-check expect rule. Returns + * TCPCHK_EVAL_WAIT if it is already subscribed on receive events or if nothing + * was received, TCPCHK_EVAL_CONTINUE to evaluate the expect rule or + * TCPCHK_EVAL_STOP if an error occurred. + */ +enum tcpcheck_eval_ret tcpcheck_eval_recv(struct check *check, struct tcpcheck_rule *rule) +{ + struct stconn *sc = check->sc; + struct connection *conn = __sc_conn(sc); + enum tcpcheck_eval_ret ret = TCPCHK_EVAL_CONTINUE; + size_t max, read, cur_read = 0; + int is_empty; + int read_poll = MAX_READ_POLL_LOOPS; + + TRACE_ENTER(CHK_EV_RX_DATA, check); + + if (sc->wait_event.events & SUB_RETRY_RECV) { + TRACE_DEVEL("waiting for response", CHK_EV_RX_DATA, check); + goto wait_more_data; + } + + if (sc_ep_test(sc, SE_FL_EOS)) + goto end_recv; + + if (check->state & CHK_ST_IN_ALLOC) { + TRACE_STATE("waiting for input buffer allocation", CHK_EV_RX_DATA|CHK_EV_RX_BLK, check); + goto wait_more_data; + } + + if (!check_get_buf(check, &check->bi)) { + check->state |= CHK_ST_IN_ALLOC; + TRACE_STATE("waiting for input buffer allocation", CHK_EV_RX_DATA|CHK_EV_RX_BLK, check); + goto wait_more_data; + } + + /* errors on the connection and the stream connector were already checked */ + + /* prepare to detect if the mux needs more room */ + sc_ep_clr(sc, SE_FL_WANT_ROOM); + + while (sc_ep_test(sc, SE_FL_RCV_MORE) || + (!(conn->flags & CO_FL_ERROR) && !sc_ep_test(sc, SE_FL_ERROR | SE_FL_EOS))) { + max = (IS_HTX_SC(sc) ? htx_free_space(htxbuf(&check->bi)) : b_room(&check->bi)); + read = conn->mux->rcv_buf(sc, &check->bi, max, 0); + cur_read += read; + if (!read || + sc_ep_test(sc, SE_FL_WANT_ROOM) || + (--read_poll <= 0) || + (read < max && read >= global.tune.recv_enough)) + break; + } + + end_recv: + is_empty = (IS_HTX_SC(sc) ? 
htx_is_empty(htxbuf(&check->bi)) : !b_data(&check->bi));
+	if (is_empty && ((conn->flags & CO_FL_ERROR) || sc_ep_test(sc, SE_FL_ERROR))) {
+		/* Report network errors only if we got no other data. Otherwise
+		 * we'll let the upper layers decide whether the response is OK
+		 * or not. It is very common that an RST sent by the server is
+		 * reported as an error just after the last data chunk.
+		 */
+		TRACE_ERROR("connection error during recv", CHK_EV_RX_DATA|CHK_EV_RX_ERR, check);
+		goto stop;
+	}
+	else if (!cur_read && !sc_ep_test(sc, SE_FL_WANT_ROOM | SE_FL_ERROR | SE_FL_EOS)) {
+		conn->mux->subscribe(sc, SUB_RETRY_RECV, &sc->wait_event);
+		TRACE_DEVEL("waiting for response", CHK_EV_RX_DATA, check);
+		goto wait_more_data;
+	}
+	TRACE_DATA("data received", CHK_EV_RX_DATA, check, 0, 0, (size_t[]){cur_read});
+
+  out:
+	if (!b_data(&check->bi) || ret == TCPCHK_EVAL_STOP)
+		check_release_buf(check, &check->bi);
+
+	TRACE_LEAVE(CHK_EV_RX_DATA, check, 0, 0, (size_t[]){ret});
+	return ret;
+
+  stop:
+	ret = TCPCHK_EVAL_STOP;
+	goto out;
+
+  wait_more_data:
+	ret = TCPCHK_EVAL_WAIT;
+	goto out;
+}
+
+/* Evaluates an HTTP TCPCHK_ACT_EXPECT rule. If <last_read> is set, no more data
+ * are expected. Returns TCPCHK_EVAL_WAIT to wait for more data,
+ * TCPCHK_EVAL_CONTINUE to evaluate the next rule or TCPCHK_EVAL_STOP if an
+ * error occurred.
+ */
+enum tcpcheck_eval_ret tcpcheck_eval_expect_http(struct check *check, struct tcpcheck_rule *rule, int last_read)
+{
+	struct htx *htx = htxbuf(&check->bi);
+	struct htx_sl *sl;
+	struct htx_blk *blk;
+	enum tcpcheck_eval_ret ret = TCPCHK_EVAL_CONTINUE;
+	struct tcpcheck_expect *expect = &rule->expect;
+	struct buffer *msg = NULL, *tmp = NULL, *nbuf = NULL, *vbuf = NULL;
+	enum healthcheck_status status = HCHK_STATUS_L7RSP;
+	struct ist desc = IST_NULL;
+	int i, match, inverse;
+
+	TRACE_ENTER(CHK_EV_TCPCHK_EXP, check);
+
+	last_read |= (!htx_free_data_space(htx) || (htx->flags & HTX_FL_EOM));
+
+	if (htx->flags & HTX_FL_PARSING_ERROR) {
+		TRACE_ERROR("invalid response", CHK_EV_TCPCHK_EXP|CHK_EV_TCPCHK_ERR, check);
+		status = HCHK_STATUS_L7RSP;
+		goto error;
+	}
+
+	if (htx_is_empty(htx)) {
+		if (last_read) {
+			TRACE_ERROR("empty response received", CHK_EV_TCPCHK_EXP|CHK_EV_TCPCHK_ERR, check);
+			status = HCHK_STATUS_L7RSP;
+			goto error;
+		}
+		TRACE_DEVEL("waiting for more data", CHK_EV_TCPCHK_EXP, check);
+		goto wait_more_data;
+	}
+
+	sl = http_get_stline(htx);
+	check->code = sl->info.res.status;
+
+	if (check->server &&
+	    (check->server->proxy->options & PR_O_DISABLE404) &&
+	    (check->server->next_state != SRV_ST_STOPPED) &&
+	    (check->code == 404)) {
+		/* 404 may be accepted as "stopping" only if the server was up */
+		TRACE_STATE("404 response & disable-404", CHK_EV_TCPCHK_EXP, check);
+		goto out;
+	}
+
+	inverse = !!(expect->flags & TCPCHK_EXPT_FL_INV);
+	/* Make GCC happy ; initialize match to a failure state. */
+	match = inverse;
+	status = expect->err_status;
+
+	switch (expect->type) {
+	case TCPCHK_EXPECT_HTTP_STATUS:
+		match = 0;
+		for (i = 0; i < expect->codes.num; i++) {
+			if (sl->info.res.status >= expect->codes.codes[i][0] &&
+			    sl->info.res.status <= expect->codes.codes[i][1]) {
+				match = 1;
+				break;
+			}
+		}
+
+		/* Set status and description in case of error */
+		status = ((status != HCHK_STATUS_UNKNOWN) ?
status : HCHK_STATUS_L7STS); + if (LIST_ISEMPTY(&expect->onerror_fmt)) + desc = htx_sl_res_reason(sl); + break; + case TCPCHK_EXPECT_HTTP_STATUS_REGEX: + match = regex_exec2(expect->regex, HTX_SL_RES_CPTR(sl), HTX_SL_RES_CLEN(sl)); + + /* Set status and description in case of error */ + status = ((status != HCHK_STATUS_UNKNOWN) ? status : HCHK_STATUS_L7STS); + if (LIST_ISEMPTY(&expect->onerror_fmt)) + desc = htx_sl_res_reason(sl); + break; + + case TCPCHK_EXPECT_HTTP_HEADER: { + struct http_hdr_ctx ctx; + struct ist npat, vpat, value; + int full = (expect->flags & (TCPCHK_EXPT_FL_HTTP_HVAL_NONE|TCPCHK_EXPT_FL_HTTP_HVAL_FULL)); + + if (expect->flags & TCPCHK_EXPT_FL_HTTP_HNAME_FMT) { + nbuf = alloc_trash_chunk(); + if (!nbuf) { + status = HCHK_STATUS_L7RSP; + desc = ist("Failed to allocate buffer to eval log-format string"); + TRACE_ERROR("buffer allocation failure", CHK_EV_TCPCHK_EXP|CHK_EV_TCPCHK_ERR, check); + goto error; + } + nbuf->data = sess_build_logline(check->sess, NULL, b_orig(nbuf), b_size(nbuf), &expect->hdr.name_fmt); + if (!b_data(nbuf)) { + status = HCHK_STATUS_L7RSP; + desc = ist("log-format string evaluated to an empty string"); + TRACE_ERROR("invalid log-format string (hdr name)", CHK_EV_TCPCHK_EXP|CHK_EV_TCPCHK_ERR, check); + goto error; + } + npat = ist2(b_orig(nbuf), b_data(nbuf)); + } + else if (!(expect->flags & TCPCHK_EXPT_FL_HTTP_HNAME_REG)) + npat = expect->hdr.name; + + if (expect->flags & TCPCHK_EXPT_FL_HTTP_HVAL_FMT) { + vbuf = alloc_trash_chunk(); + if (!vbuf) { + status = HCHK_STATUS_L7RSP; + desc = ist("Failed to allocate buffer to eval log-format string"); + TRACE_ERROR("buffer allocation failure", CHK_EV_TCPCHK_EXP|CHK_EV_TCPCHK_ERR, check); + goto error; + } + vbuf->data = sess_build_logline(check->sess, NULL, b_orig(vbuf), b_size(vbuf), &expect->hdr.value_fmt); + if (!b_data(vbuf)) { + status = HCHK_STATUS_L7RSP; + desc = ist("log-format string evaluated to an empty string"); + TRACE_ERROR("invalid log-format string (hdr value)", CHK_EV_TCPCHK_EXP|CHK_EV_TCPCHK_ERR, check); + goto error; + } + vpat = ist2(b_orig(vbuf), b_data(vbuf)); + } + else if (!(expect->flags & TCPCHK_EXPT_FL_HTTP_HVAL_REG)) + vpat = expect->hdr.value; + + match = 0; + ctx.blk = NULL; + while (1) { + switch (expect->flags & TCPCHK_EXPT_FL_HTTP_HNAME_TYPE) { + case TCPCHK_EXPT_FL_HTTP_HNAME_STR: + if (!http_find_str_header(htx, npat, &ctx, full)) + goto end_of_match; + break; + case TCPCHK_EXPT_FL_HTTP_HNAME_BEG: + if (!http_find_pfx_header(htx, npat, &ctx, full)) + goto end_of_match; + break; + case TCPCHK_EXPT_FL_HTTP_HNAME_END: + if (!http_find_sfx_header(htx, npat, &ctx, full)) + goto end_of_match; + break; + case TCPCHK_EXPT_FL_HTTP_HNAME_SUB: + if (!http_find_sub_header(htx, npat, &ctx, full)) + goto end_of_match; + break; + case TCPCHK_EXPT_FL_HTTP_HNAME_REG: + if (!http_match_header(htx, expect->hdr.name_re, &ctx, full)) + goto end_of_match; + break; + default: + /* should never happen */ + goto end_of_match; + } + + /* A header has matched the name pattern, let's test its + * value now (always defined from there). If there is no + * value pattern, it is a good match. 
+	 */
+
+			if (expect->flags & TCPCHK_EXPT_FL_HTTP_HVAL_NONE) {
+				match = 1;
+				goto end_of_match;
+			}
+
+			value = ctx.value;
+			switch (expect->flags & TCPCHK_EXPT_FL_HTTP_HVAL_TYPE) {
+			case TCPCHK_EXPT_FL_HTTP_HVAL_STR:
+				if (isteq(value, vpat)) {
+					match = 1;
+					goto end_of_match;
+				}
+				break;
+			case TCPCHK_EXPT_FL_HTTP_HVAL_BEG:
+				if (istlen(value) < istlen(vpat))
+					break;
+				value = ist2(istptr(value), istlen(vpat));
+				if (isteq(value, vpat)) {
+					match = 1;
+					goto end_of_match;
+				}
+				break;
+			case TCPCHK_EXPT_FL_HTTP_HVAL_END:
+				if (istlen(value) < istlen(vpat))
+					break;
+				value = ist2(istend(value) - istlen(vpat), istlen(vpat));
+				if (isteq(value, vpat)) {
+					match = 1;
+					goto end_of_match;
+				}
+				break;
+			case TCPCHK_EXPT_FL_HTTP_HVAL_SUB:
+				if (isttest(istist(value, vpat))) {
+					match = 1;
+					goto end_of_match;
+				}
+				break;
+			case TCPCHK_EXPT_FL_HTTP_HVAL_REG:
+				if (regex_exec2(expect->hdr.value_re, istptr(value), istlen(value))) {
+					match = 1;
+					goto end_of_match;
+				}
+				break;
+			}
+		}
+
+	  end_of_match:
+		status = ((status != HCHK_STATUS_UNKNOWN) ? status : HCHK_STATUS_L7STS);
+		if (LIST_ISEMPTY(&expect->onerror_fmt))
+			desc = htx_sl_res_reason(sl);
+		break;
+	}
+
+	case TCPCHK_EXPECT_HTTP_BODY:
+	case TCPCHK_EXPECT_HTTP_BODY_REGEX:
+	case TCPCHK_EXPECT_HTTP_BODY_LF:
+		match = 0;
+		chunk_reset(&trash);
+		for (blk = htx_get_head_blk(htx); blk; blk = htx_get_next_blk(htx, blk)) {
+			enum htx_blk_type type = htx_get_blk_type(blk);
+
+			if (type == HTX_BLK_TLR || type == HTX_BLK_EOT)
+				break;
+			if (type == HTX_BLK_DATA) {
+				if (!chunk_istcat(&trash, htx_get_blk_value(htx, blk)))
+					break;
+			}
+		}
+
+		if (!b_data(&trash)) {
+			if (!last_read) {
+				TRACE_DEVEL("waiting for more data", CHK_EV_TCPCHK_EXP, check);
+				goto wait_more_data;
+			}
+			status = ((status != HCHK_STATUS_UNKNOWN) ? status : HCHK_STATUS_L7RSP);
+			if (LIST_ISEMPTY(&expect->onerror_fmt))
+				desc = ist("HTTP content check could not find a response body");
+			TRACE_ERROR("no response body found while one was expected", CHK_EV_TCPCHK_EXP|CHK_EV_TCPCHK_ERR, check);
+			goto error;
+		}
+
+		if (expect->type == TCPCHK_EXPECT_HTTP_BODY_LF) {
+			tmp = alloc_trash_chunk();
+			if (!tmp) {
+				status = HCHK_STATUS_L7RSP;
+				desc = ist("Failed to allocate buffer to eval log-format string");
+				TRACE_ERROR("buffer allocation failure", CHK_EV_TCPCHK_EXP|CHK_EV_TCPCHK_ERR, check);
+				goto error;
+			}
+			tmp->data = sess_build_logline(check->sess, NULL, b_orig(tmp), b_size(tmp), &expect->fmt);
+			if (!b_data(tmp)) {
+				status = HCHK_STATUS_L7RSP;
+				desc = ist("log-format string evaluated to an empty string");
+				TRACE_ERROR("invalid log-format string", CHK_EV_TCPCHK_EXP|CHK_EV_TCPCHK_ERR, check);
+				goto error;
+			}
+		}
+
+		if (!last_read &&
+		    ((expect->type == TCPCHK_EXPECT_HTTP_BODY && b_data(&trash) < istlen(expect->data)) ||
+		     ((expect->type == TCPCHK_EXPECT_HTTP_BODY_LF && b_data(&trash) < b_data(tmp))) ||
+		     (expect->min_recv > 0 && b_data(&trash) < expect->min_recv))) {
+			ret = TCPCHK_EVAL_WAIT;
+			goto out;
+		}
+
+		if (expect->type == TCPCHK_EXPECT_HTTP_BODY)
+			match = my_memmem(b_orig(&trash), b_data(&trash), istptr(expect->data), istlen(expect->data)) != NULL;
+		else if (expect->type == TCPCHK_EXPECT_HTTP_BODY_LF)
+			match = my_memmem(b_orig(&trash), b_data(&trash), b_orig(tmp), b_data(tmp)) != NULL;
+		else
+			match = regex_exec2(expect->regex, b_orig(&trash), b_data(&trash));
+
+		/* Wait for more data on mismatch only if no minimum is defined (-1),
+		 * otherwise the absence of match is already conclusive.
+	 */
+	if (!match && !last_read && (expect->min_recv == -1)) {
+		ret = TCPCHK_EVAL_WAIT;
+		TRACE_DEVEL("waiting for more data", CHK_EV_TCPCHK_EXP, check);
+		goto out;
+	}
+
+	/* Set status and description in case of error */
+	status = ((status != HCHK_STATUS_UNKNOWN) ? status : HCHK_STATUS_L7RSP);
+	if (LIST_ISEMPTY(&expect->onerror_fmt))
+		desc = (inverse
+			? ist("HTTP check matched unwanted content")
+			: ist("HTTP content check did not match"));
+	break;
+
+
+	default:
+		/* should never happen */
+		status = ((status != HCHK_STATUS_UNKNOWN) ? status : HCHK_STATUS_L7RSP);
+		goto error;
+	}
+
+	if (!(match ^ inverse)) {
+		TRACE_STATE("expect rule failed", CHK_EV_TCPCHK_EXP|CHK_EV_TCPCHK_ERR, check);
+		goto error;
+	}
+
+	TRACE_STATE("expect rule succeeded", CHK_EV_TCPCHK_EXP, check);
+
+  out:
+	free_trash_chunk(tmp);
+	free_trash_chunk(nbuf);
+	free_trash_chunk(vbuf);
+	free_trash_chunk(msg);
+	TRACE_LEAVE(CHK_EV_TCPCHK_EXP, check, 0, 0, (size_t[]){ret});
+	return ret;
+
+  error:
+	TRACE_STATE("expect rule failed", CHK_EV_TCPCHK_EXP|CHK_EV_TCPCHK_ERR, check);
+	ret = TCPCHK_EVAL_STOP;
+	msg = alloc_trash_chunk();
+	if (msg)
+		tcpcheck_expect_onerror_message(msg, check, rule, 0, desc);
+	set_server_check_status(check, status, (msg ? b_head(msg) : NULL));
+	goto out;
+
+  wait_more_data:
+	ret = TCPCHK_EVAL_WAIT;
+	goto out;
+}
+
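+/* tcpcheck_eval_expect() below evaluates rules built from configuration
+ * lines such as (hypothetical examples, not taken from this file):
+ *   tcp-check expect string +PONG
+ *   tcp-check expect rstring ^\+OK
+ *   tcp-check expect binary 48545450
+ *   tcp-check expect !string ERR
+ * "string", "rstring" and "binary" map to TCPCHK_EXPECT_STRING,
+ * TCPCHK_EXPECT_STRING_REGEX and TCPCHK_EXPECT_BINARY respectively, and the
+ * optional '!' prefix sets TCPCHK_EXPT_FL_INV to invert the match.
+ */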
+/* Evaluates a TCP TCPCHK_ACT_EXPECT rule. Returns TCPCHK_EVAL_WAIT to wait for
+ * more data, TCPCHK_EVAL_CONTINUE to evaluate the next rule or TCPCHK_EVAL_STOP
+ * if an error occurred.
+ */
+enum tcpcheck_eval_ret tcpcheck_eval_expect(struct check *check, struct tcpcheck_rule *rule, int last_read)
+{
+	enum tcpcheck_eval_ret ret = TCPCHK_EVAL_CONTINUE;
+	struct tcpcheck_expect *expect = &rule->expect;
+	struct buffer *msg = NULL, *tmp = NULL;
+	struct ist desc = IST_NULL;
+	enum healthcheck_status status;
+	int match, inverse;
+
+	TRACE_ENTER(CHK_EV_TCPCHK_EXP, check);
+
+	last_read |= b_full(&check->bi);
+
+	/* The current expect might need more data than the previous one, check again
+	 * that the minimum amount of data required to match is respected.
+	 */
+	if (!last_read) {
+		if ((expect->type == TCPCHK_EXPECT_STRING || expect->type == TCPCHK_EXPECT_BINARY) &&
+		    (b_data(&check->bi) < istlen(expect->data))) {
+			ret = TCPCHK_EVAL_WAIT;
+			TRACE_DEVEL("waiting for more data", CHK_EV_TCPCHK_EXP, check);
+			goto out;
+		}
+		if (expect->min_recv > 0 && (b_data(&check->bi) < expect->min_recv)) {
+			ret = TCPCHK_EVAL_WAIT;
+			TRACE_DEVEL("waiting for more data", CHK_EV_TCPCHK_EXP, check);
+			goto out;
+		}
+	}
+
+	inverse = !!(expect->flags & TCPCHK_EXPT_FL_INV);
+	/* Make GCC happy ; initialize match to a failure state. */
+	match = inverse;
+	status = ((expect->err_status != HCHK_STATUS_UNKNOWN) ?
+		  expect->err_status : HCHK_STATUS_L7RSP);
+
+	switch (expect->type) {
+	case TCPCHK_EXPECT_STRING:
+	case TCPCHK_EXPECT_BINARY:
+		match = my_memmem(b_head(&check->bi), b_data(&check->bi), istptr(expect->data), istlen(expect->data)) != NULL;
+		break;
+	case TCPCHK_EXPECT_STRING_REGEX:
+		match = regex_exec2(expect->regex, b_head(&check->bi), MIN(b_data(&check->bi), b_size(&check->bi)-1));
+		break;
+
+	case TCPCHK_EXPECT_BINARY_REGEX:
+		chunk_reset(&trash);
+		dump_binary(&trash, b_head(&check->bi), b_data(&check->bi));
+		match = regex_exec2(expect->regex, b_head(&trash), MIN(b_data(&trash), b_size(&trash)-1));
+		break;
+
+	case TCPCHK_EXPECT_STRING_LF:
+	case TCPCHK_EXPECT_BINARY_LF:
+		match = 0;
+		tmp = alloc_trash_chunk();
+		if (!tmp) {
+			status = HCHK_STATUS_L7RSP;
+			desc = ist("Failed to allocate buffer to eval format string");
+			TRACE_ERROR("buffer allocation failure", CHK_EV_TCPCHK_EXP|CHK_EV_TCPCHK_ERR, check);
+			goto error;
+		}
+		tmp->data = sess_build_logline(check->sess, NULL, b_orig(tmp), b_size(tmp), &expect->fmt);
+		if (!b_data(tmp)) {
+			status = HCHK_STATUS_L7RSP;
+			desc = ist("log-format string evaluated to an empty string");
+			TRACE_ERROR("invalid log-format string", CHK_EV_TCPCHK_EXP|CHK_EV_TCPCHK_ERR, check);
+			goto error;
+		}
+		if (expect->type == TCPCHK_EXPECT_BINARY_LF) {
+			int len = tmp->data;
+			if (parse_binary(b_orig(tmp), &tmp->area, &len, NULL) == 0) {
+				status = HCHK_STATUS_L7RSP;
+				desc = ist("Failed to parse hex string resulting from eval of a log-format string");
+				TRACE_ERROR("invalid binary log-format string", CHK_EV_TCPCHK_EXP|CHK_EV_TCPCHK_ERR, check);
+				goto error;
+			}
+			tmp->data = len;
+		}
+		if (b_data(&check->bi) < tmp->data) {
+			if (!last_read) {
+				ret = TCPCHK_EVAL_WAIT;
+				TRACE_DEVEL("waiting for more data", CHK_EV_TCPCHK_EXP, check);
+				goto out;
+			}
+			break;
+		}
+		match = my_memmem(b_head(&check->bi), b_data(&check->bi), b_orig(tmp), b_data(tmp)) != NULL;
+		break;
+
+	case TCPCHK_EXPECT_CUSTOM:
+		/* Don't eval custom function if the buffer is empty. It means
+		 * custom functions can't expect an empty response. If this
+		 * changes, don't forget to change this test and update all
+		 * custom functions.
+		 */
+		if (!b_data(&check->bi))
+			break;
+		if (expect->custom)
+			ret = expect->custom(check, rule, last_read);
+		goto out;
+	default:
+		/* Should never happen. */
+		ret = TCPCHK_EVAL_STOP;
+		goto out;
+	}
+
+
+	/* Wait for more data on mismatch only if no minimum is defined (-1),
+	 * otherwise the absence of match is already conclusive.
+	 */
+	if (!match && !last_read && (expect->min_recv == -1)) {
+		ret = TCPCHK_EVAL_WAIT;
+		TRACE_DEVEL("waiting for more data", CHK_EV_TCPCHK_EXP, check);
+		goto out;
+	}
+
+	/* Result as expected, next rule. */
+	if (match ^ inverse) {
+		TRACE_STATE("expect rule succeeded", CHK_EV_TCPCHK_EXP, check);
+		goto out;
+	}
+
+  error:
+	/* From this point on, we matched something we did not want, this is an error state. */
+	TRACE_STATE("expect rule failed", CHK_EV_TCPCHK_EXP|CHK_EV_TCPCHK_ERR, check);
+	ret = TCPCHK_EVAL_STOP;
+	msg = alloc_trash_chunk();
+	if (msg)
+		tcpcheck_expect_onerror_message(msg, check, rule, match, desc);
+	set_server_check_status(check, status, (msg ? b_head(msg) : NULL));
+	free_trash_chunk(msg);
+
+  out:
+	free_trash_chunk(tmp);
+	TRACE_LEAVE(CHK_EV_TCPCHK_EXP, check, 0, 0, (size_t[]){ret});
+	return ret;
+}
+
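+/* Action keyword rules typically come from configuration lines such as
+ * (hypothetical examples):
+ *   tcp-check set-var(check.port) int(8080)
+ *   tcp-check unset-var(check.port)
+ * Each of them is wrapped into a TCPCHK_ACT_ACTION_KW rule and executed by
+ * the function below through the rule's action_ptr callback.
+ */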
+/* Evaluates a TCPCHK_ACT_ACTION_KW rule. Returns TCPCHK_EVAL_CONTINUE to
+ * evaluate the next rule or TCPCHK_EVAL_STOP if an error occurred. It never
+ * waits.
+ */
+enum tcpcheck_eval_ret tcpcheck_eval_action_kw(struct check *check, struct tcpcheck_rule *rule)
+{
+	enum tcpcheck_eval_ret ret = TCPCHK_EVAL_CONTINUE;
+	struct act_rule *act_rule;
+	enum act_return act_ret;
+
+	act_rule = rule->action_kw.rule;
+	act_ret = act_rule->action_ptr(act_rule, check->proxy, check->sess, NULL, 0);
+	if (act_ret != ACT_RET_CONT) {
+		chunk_printf(&trash, "TCPCHK ACTION unexpected result at step %d\n",
+			     tcpcheck_get_step_id(check, rule));
+		set_server_check_status(check, HCHK_STATUS_L7RSP, trash.area);
+		ret = TCPCHK_EVAL_STOP;
+	}
+
+	return ret;
+}
+
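+/* For reference, tcpcheck_main() walks rulesets that a configuration would
+ * typically declare as (hypothetical example):
+ *   option tcp-check
+ *   tcp-check connect port 6379
+ *   tcp-check send PING\r\n
+ *   tcp-check expect string +PONG
+ * i.e. an ordered list of connect, send and expect rules, each evaluated by
+ * one of the dedicated functions above.
+ */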
+/* Executes a tcp-check ruleset. Note that this is called both from the
+ * connection's wake() callback and from the check scheduling task. It returns
+ * 0 on normal cases, or <0 if a close() has happened on an existing connection,
+ * presenting the risk of an fd replacement.
+ *
+ * Please do NOT place any return statement in this function and only leave
+ * via the out_end_tcpcheck label after setting retcode.
+ */
+int tcpcheck_main(struct check *check)
+{
+	struct tcpcheck_rule *rule;
+	struct stconn *sc = check->sc;
+	struct connection *conn = sc_conn(sc);
+	int must_read = 1, last_read = 0;
+	int retcode = 0;
+	enum tcpcheck_eval_ret eval_ret;
+
+	/* here, we know that the check is complete or that it failed */
+	if (check->result != CHK_RES_UNKNOWN)
+		goto out;
+
+	TRACE_ENTER(CHK_EV_TCPCHK_EVAL, check);
+
+	/* Note: the stream connector and the connection may only be undefined before
+	 * the first rule evaluation (it is always a connect rule) or when the
+	 * stream connector allocation failed on a connect rule, during sc allocation.
+	 */
+
+	/* 1- check for connection error, if any */
+	if ((conn && conn->flags & CO_FL_ERROR) || sc_ep_test(sc, SE_FL_ERROR))
+		goto out_end_tcpcheck;
+
+	/* 2- check if a rule must be resumed. It happens if check->current_step
+	 *    is defined. */
+	else if (check->current_step) {
+		rule = check->current_step;
+		TRACE_PROTO("resume rule evaluation", CHK_EV_TCPCHK_EVAL, check, 0, 0, (size_t[]){ tcpcheck_get_step_id(check, rule)});
+	}
+
+	/* 3- It is the first evaluation. We must create a session and preset
+	 *    tcp-check variables */
+	else {
+		struct tcpcheck_var *var;
+
+		/* First evaluation, create a session */
+		check->sess = session_new(&checks_fe, NULL, &check->obj_type);
+		if (!check->sess) {
+			chunk_printf(&trash, "TCPCHK error allocating check session");
+			TRACE_ERROR("session allocation failure", CHK_EV_TCPCHK_EVAL|CHK_EV_TCPCHK_ERR, check);
+			set_server_check_status(check, HCHK_STATUS_SOCKERR, trash.area);
+			goto out_end_tcpcheck;
+		}
+		vars_init_head(&check->vars, SCOPE_CHECK);
+		rule = LIST_NEXT(check->tcpcheck_rules->list, typeof(rule), list);
+
+		/* Preset tcp-check variables */
+		list_for_each_entry(var, &check->tcpcheck_rules->preset_vars, list) {
+			struct sample smp;
+
+			memset(&smp, 0, sizeof(smp));
+			smp_set_owner(&smp, check->proxy, check->sess, NULL, SMP_OPT_FINAL);
+			smp.data = var->data;
+			vars_set_by_name_ifexist(istptr(var->name), istlen(var->name), &smp);
+		}
+		TRACE_PROTO("start rules evaluation", CHK_EV_TCPCHK_EVAL, check);
+	}
+
+	/* Now evaluate the tcp-check rules */
+
+	list_for_each_entry_from(rule, check->tcpcheck_rules->list, list) {
+		check->code = 0;
+		switch (rule->action) {
+		case TCPCHK_ACT_CONNECT:
+			/* Not the first connection, release it first */
+			if (sc_conn(sc) && check->current_step != rule) {
+				check->state |= CHK_ST_CLOSE_CONN;
+				retcode = -1;
+			}
+
+			check->current_step = rule;
+
+			/* We are still waiting for the connection to be closed */
+			if (check->state & CHK_ST_CLOSE_CONN) {
+				TRACE_DEVEL("wait previous connection closure", CHK_EV_TCPCHK_EVAL|CHK_EV_TCPCHK_CONN, check);
+				eval_ret = TCPCHK_EVAL_WAIT;
+				break;
+			}
+
+			TRACE_PROTO("eval connect rule", CHK_EV_TCPCHK_EVAL|CHK_EV_TCPCHK_CONN, check);
+			eval_ret = tcpcheck_eval_connect(check, rule);
+
+			/* Refresh connection */
+			conn = sc_conn(sc);
+			last_read = 0;
+			must_read = (IS_HTX_SC(sc) ? htx_is_empty(htxbuf(&check->bi)) : !b_data(&check->bi));
+			break;
+		case TCPCHK_ACT_SEND:
+			check->current_step = rule;
+			TRACE_PROTO("eval send rule", CHK_EV_TCPCHK_EVAL|CHK_EV_TCPCHK_SND, check);
+			eval_ret = tcpcheck_eval_send(check, rule);
+			must_read = 1;
+			break;
+		case TCPCHK_ACT_EXPECT:
+			check->current_step = rule;
+			TRACE_PROTO("eval expect rule", CHK_EV_TCPCHK_EVAL|CHK_EV_TCPCHK_EXP, check);
+			if (must_read) {
+				eval_ret = tcpcheck_eval_recv(check, rule);
+				if (eval_ret == TCPCHK_EVAL_STOP)
+					goto out_end_tcpcheck;
+				else if (eval_ret == TCPCHK_EVAL_WAIT)
+					goto out;
+				last_read = ((conn->flags & CO_FL_ERROR) || sc_ep_test(sc, SE_FL_ERROR | SE_FL_EOS));
+				must_read = 0;
+			}
+
+			eval_ret = ((check->tcpcheck_rules->flags & TCPCHK_RULES_PROTO_CHK) == TCPCHK_RULES_HTTP_CHK
+				    ?
tcpcheck_eval_expect_http(check, rule, last_read)
+				    : tcpcheck_eval_expect(check, rule, last_read));
+
+			if (eval_ret == TCPCHK_EVAL_WAIT) {
+				check->current_step = rule->expect.head;
+				if (!(sc->wait_event.events & SUB_RETRY_RECV))
+					conn->mux->subscribe(sc, SUB_RETRY_RECV, &sc->wait_event);
+			}
+			break;
+		case TCPCHK_ACT_ACTION_KW:
+			/* Don't update the current step */
+			TRACE_PROTO("eval action kw rule", CHK_EV_TCPCHK_EVAL|CHK_EV_TCPCHK_ACT, check);
+			eval_ret = tcpcheck_eval_action_kw(check, rule);
+			break;
+		default:
+			/* Otherwise, just go to the next one and don't update
+			 * the current step
+			 */
+			eval_ret = TCPCHK_EVAL_CONTINUE;
+			break;
+		}
+
+		switch (eval_ret) {
+		case TCPCHK_EVAL_CONTINUE:
+			break;
+		case TCPCHK_EVAL_WAIT:
+			goto out;
+		case TCPCHK_EVAL_STOP:
+			goto out_end_tcpcheck;
+		}
+	}
+
+	/* All rules were evaluated */
+	if (check->current_step) {
+		rule = check->current_step;
+
+		TRACE_DEVEL("eval tcp-check result", CHK_EV_TCPCHK_EVAL, check);
+
+		if (rule->action == TCPCHK_ACT_EXPECT) {
+			struct buffer *msg;
+			enum healthcheck_status status;
+
+			if (check->server &&
+			    (check->server->proxy->options & PR_O_DISABLE404) &&
+			    (check->server->next_state != SRV_ST_STOPPED) &&
+			    (check->code == 404)) {
+				set_server_check_status(check, HCHK_STATUS_L7OKCD, NULL);
+				TRACE_PROTO("tcp-check conditionally passed (disable-404)", CHK_EV_TCPCHK_EVAL, check);
+				goto out_end_tcpcheck;
+			}
+
+			msg = alloc_trash_chunk();
+			if (msg)
+				tcpcheck_expect_onsuccess_message(msg, check, rule, IST_NULL);
+			status = ((rule->expect.ok_status != HCHK_STATUS_UNKNOWN) ? rule->expect.ok_status : HCHK_STATUS_L7OKD);
+			set_server_check_status(check, status, (msg ? b_head(msg) : "(tcp-check)"));
+			free_trash_chunk(msg);
+		}
+		else if (rule->action == TCPCHK_ACT_CONNECT) {
+			const char *msg = ((rule->connect.options & TCPCHK_OPT_IMPLICIT) ? NULL : "(tcp-check)");
+			enum healthcheck_status status = HCHK_STATUS_L4OK;
+#ifdef USE_OPENSSL
+			if (conn_is_ssl(conn))
+				status = HCHK_STATUS_L6OK;
+#endif
+			set_server_check_status(check, status, msg);
+		}
+		else
+			set_server_check_status(check, HCHK_STATUS_L7OKD, "(tcp-check)");
+	}
+	else {
+		set_server_check_status(check, HCHK_STATUS_L7OKD, "(tcp-check)");
+	}
+	TRACE_PROTO("tcp-check passed", CHK_EV_TCPCHK_EVAL, check);
+
+  out_end_tcpcheck:
+	if ((conn && conn->flags & CO_FL_ERROR) || sc_ep_test(sc, SE_FL_ERROR)) {
+		TRACE_ERROR("report connection error", CHK_EV_TCPCHK_EVAL|CHK_EV_TCPCHK_ERR, check);
+		chk_report_conn_err(check, errno, 0);
+	}
+
+	/* the tcpcheck is finished, release in/out buffer now */
+	check_release_buf(check, &check->bi);
+	check_release_buf(check, &check->bo);
+
+  out:
+	TRACE_LEAVE(CHK_EV_HCHK_RUN, check);
+	return retcode;
+}
+
+void tcp_check_keywords_register(struct action_kw_list *kw_list)
+{
+	LIST_APPEND(&tcp_check_keywords.list, &kw_list->list);
+}
+
+/**************************************************************************/
+/******************* Internals to parse tcp-check rules *******************/
+/**************************************************************************/
+struct action_kw_list tcp_check_keywords = {
+	.list = LIST_HEAD_INIT(tcp_check_keywords.list),
+};
+
+/* Creates a tcp-check rule resulting from parsing a custom keyword. NULL is
+ * returned on error.
+ */ +struct tcpcheck_rule *parse_tcpcheck_action(char **args, int cur_arg, struct proxy *px, + struct list *rules, struct action_kw *kw, + const char *file, int line, char **errmsg) +{ + struct tcpcheck_rule *chk = NULL; + struct act_rule *actrule = NULL; + + actrule = new_act_rule(ACT_F_TCP_CHK, file, line); + if (!actrule) { + memprintf(errmsg, "out of memory"); + goto error; + } + actrule->kw = kw; + + cur_arg++; + if (kw->parse((const char **)args, &cur_arg, px, actrule, errmsg) == ACT_RET_PRS_ERR) { + memprintf(errmsg, "'%s' : %s", kw->kw, *errmsg); + goto error; + } + + chk = calloc(1, sizeof(*chk)); + if (!chk) { + memprintf(errmsg, "out of memory"); + goto error; + } + chk->action = TCPCHK_ACT_ACTION_KW; + chk->action_kw.rule = actrule; + return chk; + + error: + free(actrule); + return NULL; +} + +/* Parses and creates a tcp-check connect or an http-check connect rule. NULL is + * returned on error. + */ +struct tcpcheck_rule *parse_tcpcheck_connect(char **args, int cur_arg, struct proxy *px, struct list *rules, + const char *file, int line, char **errmsg) +{ + struct tcpcheck_rule *chk = NULL; + struct sockaddr_storage *sk = NULL; + char *comment = NULL, *sni = NULL, *alpn = NULL; + struct sample_expr *port_expr = NULL; + const struct mux_proto_list *mux_proto = NULL; + unsigned short conn_opts = 0; + long port = 0; + int alpn_len = 0; + + list_for_each_entry(chk, rules, list) { + if (chk->action == TCPCHK_ACT_CONNECT) + break; + if (chk->action == TCPCHK_ACT_COMMENT || + chk->action == TCPCHK_ACT_ACTION_KW || + (chk->action == TCPCHK_ACT_SEND && (chk->send.http.flags & TCPCHK_SND_HTTP_FROM_OPT))) + continue; + + memprintf(errmsg, "first step MUST also be a 'connect', " + "optionally preceded by a 'set-var', an 'unset-var' or a 'comment', " + "when there is a 'connect' step in the tcp-check ruleset"); + goto error; + } + + cur_arg++; + while (*(args[cur_arg])) { + if (strcmp(args[cur_arg], "default") == 0) + conn_opts |= TCPCHK_OPT_DEFAULT_CONNECT; + else if (strcmp(args[cur_arg], "addr") == 0) { + int port1, port2; + + if (!*(args[cur_arg+1])) { + memprintf(errmsg, "'%s' expects <ipv4|ipv6> as argument.", args[cur_arg]); + goto error; + } + + sk = str2sa_range(args[cur_arg+1], NULL, &port1, &port2, NULL, NULL, NULL, + errmsg, NULL, NULL, PA_O_RESOLVE | PA_O_PORT_OK | PA_O_STREAM | PA_O_CONNECT); + if (!sk) { + memprintf(errmsg, "'%s' : %s.", args[cur_arg], *errmsg); + goto error; + } + + cur_arg++; + } + else if (strcmp(args[cur_arg], "port") == 0) { + const char *p, *end; + + if (!*(args[cur_arg+1])) { + memprintf(errmsg, "'%s' expects a port number or a sample expression as argument.", args[cur_arg]); + goto error; + } + cur_arg++; + + port = 0; + release_sample_expr(port_expr); + p = args[cur_arg]; end = p + strlen(p); + port = read_uint(&p, end); + if (p != end) { + int idx = 0; + + px->conf.args.ctx = ARGC_SRV; + port_expr = sample_parse_expr((char *[]){args[cur_arg], NULL}, &idx, + file, line, errmsg, &px->conf.args, NULL); + + if (!port_expr) { + memprintf(errmsg, "error detected while parsing port expression : %s", *errmsg); + goto error; + } + if (!(port_expr->fetch->val & SMP_VAL_BE_CHK_RUL)) { + memprintf(errmsg, "error detected while parsing port expression : " + " fetch method '%s' extracts information from '%s', " + "none of which is available here.\n", + args[cur_arg], sample_src_names(port_expr->fetch->use)); + goto error; + } + px->http_needed |= !!(port_expr->fetch->use & SMP_USE_HTTP_ANY); + } + else if (port > 65535 || port < 1) { + memprintf(errmsg, 
"expects a valid TCP port (from range 1 to 65535) or a sample expression, got %s.", + args[cur_arg]); + goto error; + } + } + else if (strcmp(args[cur_arg], "proto") == 0) { + if (!*(args[cur_arg+1])) { + memprintf(errmsg, "'%s' expects a MUX protocol as argument.", args[cur_arg]); + goto error; + } + mux_proto = get_mux_proto(ist(args[cur_arg + 1])); + if (!mux_proto) { + memprintf(errmsg, "'%s' : unknown MUX protocol '%s'.", args[cur_arg], args[cur_arg+1]); + goto error; + } + + if (strcmp(args[0], "tcp-check") == 0 && mux_proto->mode != PROTO_MODE_TCP) { + memprintf(errmsg, "'%s' : invalid MUX protocol '%s' for tcp-check", args[cur_arg], args[cur_arg+1]); + goto error; + } + else if (strcmp(args[0], "http-check") == 0 && mux_proto->mode != PROTO_MODE_HTTP) { + memprintf(errmsg, "'%s' : invalid MUX protocol '%s' for http-check", args[cur_arg], args[cur_arg+1]); + goto error; + } + + cur_arg++; + } + else if (strcmp(args[cur_arg], "comment") == 0) { + if (!*(args[cur_arg+1])) { + memprintf(errmsg, "'%s' expects a string as argument.", args[cur_arg]); + goto error; + } + cur_arg++; + free(comment); + comment = strdup(args[cur_arg]); + if (!comment) { + memprintf(errmsg, "out of memory"); + goto error; + } + } + else if (strcmp(args[cur_arg], "send-proxy") == 0) + conn_opts |= TCPCHK_OPT_SEND_PROXY; + else if (strcmp(args[cur_arg], "via-socks4") == 0) + conn_opts |= TCPCHK_OPT_SOCKS4; + else if (strcmp(args[cur_arg], "linger") == 0) + conn_opts |= TCPCHK_OPT_LINGER; +#ifdef USE_OPENSSL + else if (strcmp(args[cur_arg], "ssl") == 0) { + px->options |= PR_O_TCPCHK_SSL; + conn_opts |= TCPCHK_OPT_SSL; + } + else if (strcmp(args[cur_arg], "sni") == 0) { + if (!*(args[cur_arg+1])) { + memprintf(errmsg, "'%s' expects a string as argument.", args[cur_arg]); + goto error; + } + cur_arg++; + free(sni); + sni = strdup(args[cur_arg]); + if (!sni) { + memprintf(errmsg, "out of memory"); + goto error; + } + } + else if (strcmp(args[cur_arg], "alpn") == 0) { +#ifdef TLSEXT_TYPE_application_layer_protocol_negotiation + free(alpn); + if (ssl_sock_parse_alpn(args[cur_arg + 1], &alpn, &alpn_len, errmsg)) { + memprintf(errmsg, "'%s' : %s", args[cur_arg], *errmsg); + goto error; + } + cur_arg++; +#else + memprintf(errmsg, "'%s' : library does not support TLS ALPN extension.", args[cur_arg]); + goto error; +#endif + } +#endif /* USE_OPENSSL */ + + else { + memprintf(errmsg, "expects 'comment', 'port', 'addr', 'send-proxy'" +#ifdef USE_OPENSSL + ", 'ssl', 'sni', 'alpn'" +#endif /* USE_OPENSSL */ + " or 'via-socks4', 'linger', 'default' but got '%s' as argument.", + args[cur_arg]); + goto error; + } + cur_arg++; + } + + chk = calloc(1, sizeof(*chk)); + if (!chk) { + memprintf(errmsg, "out of memory"); + goto error; + } + chk->action = TCPCHK_ACT_CONNECT; + chk->comment = comment; + chk->connect.port = port; + chk->connect.options = conn_opts; + chk->connect.sni = sni; + chk->connect.alpn = alpn; + chk->connect.alpn_len= alpn_len; + chk->connect.port_expr= port_expr; + chk->connect.mux_proto= mux_proto; + if (sk) + chk->connect.addr = *sk; + return chk; + + error: + free(alpn); + free(sni); + free(comment); + release_sample_expr(port_expr); + return NULL; +} + +/* Parses and creates a tcp-check send rule. 
NULL is returned on error */
+struct tcpcheck_rule *parse_tcpcheck_send(char **args, int cur_arg, struct proxy *px, struct list *rules,
+                                          const char *file, int line, char **errmsg)
+{
+	struct tcpcheck_rule *chk = NULL;
+	char *comment = NULL, *data = NULL;
+	enum tcpcheck_send_type type = TCPCHK_SEND_UNDEF;
+
+	if (strcmp(args[cur_arg], "send-binary-lf") == 0)
+		type = TCPCHK_SEND_BINARY_LF;
+	else if (strcmp(args[cur_arg], "send-binary") == 0)
+		type = TCPCHK_SEND_BINARY;
+	else if (strcmp(args[cur_arg], "send-lf") == 0)
+		type = TCPCHK_SEND_STRING_LF;
+	else if (strcmp(args[cur_arg], "send") == 0)
+		type = TCPCHK_SEND_STRING;
+
+	if (!*(args[cur_arg+1])) {
+		memprintf(errmsg, "'%s' expects a %s as argument",
+			  args[cur_arg], (type == TCPCHK_SEND_BINARY ? "binary string" : "string"));
+		goto error;
+	}
+
+	data = args[cur_arg+1];
+
+	cur_arg += 2;
+	while (*(args[cur_arg])) {
+		if (strcmp(args[cur_arg], "comment") == 0) {
+			if (!*(args[cur_arg+1])) {
+				memprintf(errmsg, "'%s' expects a string as argument.", args[cur_arg]);
+				goto error;
+			}
+			cur_arg++;
+			free(comment);
+			comment = strdup(args[cur_arg]);
+			if (!comment) {
+				memprintf(errmsg, "out of memory");
+				goto error;
+			}
+		}
+		else {
+			memprintf(errmsg, "expects 'comment' but got '%s' as argument.",
+				  args[cur_arg]);
+			goto error;
+		}
+		cur_arg++;
+	}
+
+	chk = calloc(1, sizeof(*chk));
+	if (!chk) {
+		memprintf(errmsg, "out of memory");
+		goto error;
+	}
+	chk->action = TCPCHK_ACT_SEND;
+	chk->comment = comment;
+	chk->send.type = type;
+
+	switch (chk->send.type) {
+	case TCPCHK_SEND_STRING:
+		chk->send.data = ist(strdup(data));
+		if (!isttest(chk->send.data)) {
+			memprintf(errmsg, "out of memory");
+			goto error;
+		}
+		break;
+	case TCPCHK_SEND_BINARY: {
+		int len = chk->send.data.len;
+		if (parse_binary(data, &chk->send.data.ptr, &len, errmsg) == 0) {
+			memprintf(errmsg, "'%s' invalid binary string (%s).\n", data, *errmsg);
+			goto error;
+		}
+		chk->send.data.len = len;
+		break;
+	}
+	case TCPCHK_SEND_STRING_LF:
+	case TCPCHK_SEND_BINARY_LF:
+		LIST_INIT(&chk->send.fmt);
+		px->conf.args.ctx = ARGC_SRV;
+		if (!parse_logformat_string(data, px, &chk->send.fmt, 0, SMP_VAL_BE_CHK_RUL, errmsg)) {
+			memprintf(errmsg, "'%s' invalid log-format string (%s).\n", data, *errmsg);
+			goto error;
+		}
+		break;
+	case TCPCHK_SEND_HTTP:
+	case TCPCHK_SEND_UNDEF:
+		goto error;
+	}
+
+	return chk;
+
+  error:
+	free(chk);
+	free(comment);
+	return NULL;
+}
+
+/* Parses and creates a http-check send rule.
NULL is returned on error */ +struct tcpcheck_rule *parse_tcpcheck_send_http(char **args, int cur_arg, struct proxy *px, struct list *rules, + const char *file, int line, char **errmsg) +{ + struct tcpcheck_rule *chk = NULL; + struct tcpcheck_http_hdr *hdr = NULL; + struct http_hdr hdrs[global.tune.max_http_hdr]; + char *meth = NULL, *uri = NULL, *vsn = NULL; + char *body = NULL, *comment = NULL; + unsigned int flags = 0; + int i = 0, host_hdr = -1; + + cur_arg++; + while (*(args[cur_arg])) { + if (strcmp(args[cur_arg], "meth") == 0) { + if (!*(args[cur_arg+1])) { + memprintf(errmsg, "'%s' expects a string as argument.", args[cur_arg]); + goto error; + } + cur_arg++; + meth = args[cur_arg]; + } + else if (strcmp(args[cur_arg], "uri") == 0 || strcmp(args[cur_arg], "uri-lf") == 0) { + if (!*(args[cur_arg+1])) { + memprintf(errmsg, "'%s' expects a string as argument.", args[cur_arg]); + goto error; + } + flags &= ~TCPCHK_SND_HTTP_FL_URI_FMT; + if (strcmp(args[cur_arg], "uri-lf") == 0) + flags |= TCPCHK_SND_HTTP_FL_URI_FMT; + cur_arg++; + uri = args[cur_arg]; + } + else if (strcmp(args[cur_arg], "ver") == 0) { + if (!*(args[cur_arg+1])) { + memprintf(errmsg, "'%s' expects a string as argument.", args[cur_arg]); + goto error; + } + cur_arg++; + vsn = args[cur_arg]; + } + else if (strcmp(args[cur_arg], "hdr") == 0) { + if (!*args[cur_arg+1] || !*args[cur_arg+2]) { + memprintf(errmsg, "'%s' expects <name> and <value> as arguments", args[cur_arg]); + goto error; + } + + if (strcasecmp(args[cur_arg+1], "host") == 0) { + if (host_hdr >= 0) { + memprintf(errmsg, "'%s' header already defined (previous value is '%s')", + args[cur_arg+1], istptr(hdrs[host_hdr].v)); + goto error; + } + host_hdr = i; + } + else if (strcasecmp(args[cur_arg+1], "content-length") == 0 || + strcasecmp(args[cur_arg+1], "transfer-encoding") == 0) + goto skip_hdr; + + hdrs[i].n = ist(args[cur_arg + 1]); + hdrs[i].v = ist(args[cur_arg + 2]); + i++; + skip_hdr: + cur_arg += 2; + } + else if (strcmp(args[cur_arg], "body") == 0 || strcmp(args[cur_arg], "body-lf") == 0) { + if (!*(args[cur_arg+1])) { + memprintf(errmsg, "'%s' expects a string as argument.", args[cur_arg]); + goto error; + } + flags &= ~TCPCHK_SND_HTTP_FL_BODY_FMT; + if (strcmp(args[cur_arg], "body-lf") == 0) + flags |= TCPCHK_SND_HTTP_FL_BODY_FMT; + cur_arg++; + body = args[cur_arg]; + } + else if (strcmp(args[cur_arg], "comment") == 0) { + if (!*(args[cur_arg+1])) { + memprintf(errmsg, "'%s' expects a string as argument.", args[cur_arg]); + goto error; + } + cur_arg++; + free(comment); + comment = strdup(args[cur_arg]); + if (!comment) { + memprintf(errmsg, "out of memory"); + goto error; + } + } + else { + memprintf(errmsg, "expects 'comment', 'meth', 'uri', 'uri-lf', 'ver', 'hdr', 'body' or 'body-lf'" + " but got '%s' as argument.", args[cur_arg]); + goto error; + } + cur_arg++; + } + + hdrs[i].n = hdrs[i].v = IST_NULL; + + chk = calloc(1, sizeof(*chk)); + if (!chk) { + memprintf(errmsg, "out of memory"); + goto error; + } + chk->action = TCPCHK_ACT_SEND; + chk->comment = comment; comment = NULL; + chk->send.type = TCPCHK_SEND_HTTP; + chk->send.http.flags = flags; + LIST_INIT(&chk->send.http.hdrs); + + if (meth) { + chk->send.http.meth.meth = find_http_meth(meth, strlen(meth)); + chk->send.http.meth.str.area = strdup(meth); + chk->send.http.meth.str.data = strlen(meth); + if (!chk->send.http.meth.str.area) { + memprintf(errmsg, "out of memory"); + goto error; + } + } + if (uri) { + if (chk->send.http.flags & TCPCHK_SND_HTTP_FL_URI_FMT) { + 
LIST_INIT(&chk->send.http.uri_fmt); + px->conf.args.ctx = ARGC_SRV; + if (!parse_logformat_string(uri, px, &chk->send.http.uri_fmt, 0, SMP_VAL_BE_CHK_RUL, errmsg)) { + memprintf(errmsg, "'%s' invalid log-format string (%s).\n", uri, *errmsg); + goto error; + } + } + else { + chk->send.http.uri = ist(strdup(uri)); + if (!isttest(chk->send.http.uri)) { + memprintf(errmsg, "out of memory"); + goto error; + } + } + } + if (vsn) { + chk->send.http.vsn = ist(strdup(vsn)); + if (!isttest(chk->send.http.vsn)) { + memprintf(errmsg, "out of memory"); + goto error; + } + } + for (i = 0; istlen(hdrs[i].n); i++) { + hdr = calloc(1, sizeof(*hdr)); + if (!hdr) { + memprintf(errmsg, "out of memory"); + goto error; + } + LIST_INIT(&hdr->value); + hdr->name = istdup(hdrs[i].n); + if (!isttest(hdr->name)) { + memprintf(errmsg, "out of memory"); + goto error; + } + + ist0(hdrs[i].v); + if (!parse_logformat_string(istptr(hdrs[i].v), px, &hdr->value, 0, SMP_VAL_BE_CHK_RUL, errmsg)) + goto error; + LIST_APPEND(&chk->send.http.hdrs, &hdr->list); + hdr = NULL; + } + + if (body) { + if (chk->send.http.flags & TCPCHK_SND_HTTP_FL_BODY_FMT) { + LIST_INIT(&chk->send.http.body_fmt); + px->conf.args.ctx = ARGC_SRV; + if (!parse_logformat_string(body, px, &chk->send.http.body_fmt, 0, SMP_VAL_BE_CHK_RUL, errmsg)) { + memprintf(errmsg, "'%s' invalid log-format string (%s).\n", body, *errmsg); + goto error; + } + } + else { + chk->send.http.body = ist(strdup(body)); + if (!isttest(chk->send.http.body)) { + memprintf(errmsg, "out of memory"); + goto error; + } + } + } + + return chk; + + error: + free_tcpcheck_http_hdr(hdr); + free_tcpcheck(chk, 0); + free(comment); + return NULL; +} + +/* Parses and creates a http-check comment rule. NULL is returned on error */ +struct tcpcheck_rule *parse_tcpcheck_comment(char **args, int cur_arg, struct proxy *px, struct list *rules, + const char *file, int line, char **errmsg) +{ + struct tcpcheck_rule *chk = NULL; + char *comment = NULL; + + if (!*(args[cur_arg+1])) { + memprintf(errmsg, "expects a string as argument"); + goto error; + } + cur_arg++; + comment = strdup(args[cur_arg]); + if (!comment) { + memprintf(errmsg, "out of memory"); + goto error; + } + + chk = calloc(1, sizeof(*chk)); + if (!chk) { + memprintf(errmsg, "out of memory"); + goto error; + } + chk->action = TCPCHK_ACT_COMMENT; + chk->comment = comment; + return chk; + + error: + free(comment); + return NULL; +} + +/* Parses and creates a tcp-check or an http-check expect rule. NULL is returned + * on error. <proto> is set to the right protocol flags (covered by the + * TCPCHK_RULES_PROTO_CHK mask). 
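+ *
+ * As an editor's illustration of the documented syntax, this parser handles
+ * lines such as:
+ *   tcp-check  expect string +OK
+ *   http-check expect status 200-399
+ *   http-check expect ! rstatus ^5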
+ */
+struct tcpcheck_rule *parse_tcpcheck_expect(char **args, int cur_arg, struct proxy *px,
+                                            struct list *rules, unsigned int proto,
+                                            const char *file, int line, char **errmsg)
+{
+	struct tcpcheck_rule *prev_check, *chk = NULL;
+	struct sample_expr *status_expr = NULL;
+	char *on_success_msg, *on_error_msg, *comment, *pattern, *npat, *vpat;
+	enum tcpcheck_expect_type type = TCPCHK_EXPECT_UNDEF;
+	enum healthcheck_status ok_st = HCHK_STATUS_UNKNOWN;
+	enum healthcheck_status err_st = HCHK_STATUS_UNKNOWN;
+	enum healthcheck_status tout_st = HCHK_STATUS_UNKNOWN;
+	unsigned int flags = 0;
+	long min_recv = -1;
+	int inverse = 0;
+
+	on_success_msg = on_error_msg = comment = pattern = npat = vpat = NULL;
+	if (!*(args[cur_arg+1])) {
+		memprintf(errmsg, "expects at least a matching pattern as argument");
+		goto error;
+	}
+
+	cur_arg++;
+	while (*(args[cur_arg])) {
+		int in_pattern = 0;
+
+	  rescan:
+		if (strcmp(args[cur_arg], "min-recv") == 0) {
+			if (in_pattern) {
+				memprintf(errmsg, "[!] not supported with '%s'", args[cur_arg]);
+				goto error;
+			}
+			if (!*(args[cur_arg+1])) {
+				memprintf(errmsg, "'%s' expects an integer as argument", args[cur_arg]);
+				goto error;
+			}
+			/* Use a signed integer here because of bufsize */
+			cur_arg++;
+			min_recv = atol(args[cur_arg]);
+			if (min_recv < -1 || min_recv > INT_MAX) {
+				memprintf(errmsg, "'%s' expects -1 or an integer from 0 to INT_MAX", args[cur_arg-1]);
+				goto error;
+			}
+		}
+		else if (*(args[cur_arg]) == '!') {
+			in_pattern = 1;
+			while (*(args[cur_arg]) == '!') {
+				inverse = !inverse;
+				args[cur_arg]++;
+			}
+			if (!*(args[cur_arg]))
+				cur_arg++;
+			goto rescan;
+		}
+		else if (strcmp(args[cur_arg], "string") == 0 || strcmp(args[cur_arg], "rstring") == 0) {
+			if (type != TCPCHK_EXPECT_UNDEF) {
+				memprintf(errmsg, "only one pattern expected");
+				goto error;
+			}
+			if (proto != TCPCHK_RULES_HTTP_CHK)
+				type = ((*(args[cur_arg]) == 's') ? TCPCHK_EXPECT_STRING : TCPCHK_EXPECT_STRING_REGEX);
+			else
+				type = ((*(args[cur_arg]) == 's') ? TCPCHK_EXPECT_HTTP_BODY : TCPCHK_EXPECT_HTTP_BODY_REGEX);
+
+			if (!*(args[cur_arg+1])) {
+				memprintf(errmsg, "'%s' expects a <pattern> as argument", args[cur_arg]);
+				goto error;
+			}
+			cur_arg++;
+			pattern = args[cur_arg];
+		}
+		else if (strcmp(args[cur_arg], "binary") == 0 || strcmp(args[cur_arg], "rbinary") == 0) {
+			if (proto == TCPCHK_RULES_HTTP_CHK)
+				goto bad_http_kw;
+			if (type != TCPCHK_EXPECT_UNDEF) {
+				memprintf(errmsg, "only one pattern expected");
+				goto error;
+			}
+			type = ((*(args[cur_arg]) == 'b') ? TCPCHK_EXPECT_BINARY : TCPCHK_EXPECT_BINARY_REGEX);
+
+			if (!*(args[cur_arg+1])) {
+				memprintf(errmsg, "'%s' expects a <pattern> as argument", args[cur_arg]);
+				goto error;
+			}
+			cur_arg++;
+			pattern = args[cur_arg];
+		}
+		else if (strcmp(args[cur_arg], "string-lf") == 0 || strcmp(args[cur_arg], "binary-lf") == 0) {
+			if (type != TCPCHK_EXPECT_UNDEF) {
+				memprintf(errmsg, "only one pattern expected");
+				goto error;
+			}
+			if (proto != TCPCHK_RULES_HTTP_CHK)
+				type = ((*(args[cur_arg]) == 's') ? 
TCPCHK_EXPECT_STRING_LF : TCPCHK_EXPECT_BINARY_LF);
+			else {
+				if (*(args[cur_arg]) != 's')
+					goto bad_http_kw;
+				type = TCPCHK_EXPECT_HTTP_BODY_LF;
+			}
+
+			if (!*(args[cur_arg+1])) {
+				memprintf(errmsg, "'%s' expects a <pattern> as argument", args[cur_arg]);
+				goto error;
+			}
+			cur_arg++;
+			pattern = args[cur_arg];
+		}
+		else if (strcmp(args[cur_arg], "status") == 0 || strcmp(args[cur_arg], "rstatus") == 0) {
+			if (proto != TCPCHK_RULES_HTTP_CHK)
+				goto bad_tcp_kw;
+			if (type != TCPCHK_EXPECT_UNDEF) {
+				memprintf(errmsg, "only one pattern expected");
+				goto error;
+			}
+			type = ((*(args[cur_arg]) == 's') ? TCPCHK_EXPECT_HTTP_STATUS : TCPCHK_EXPECT_HTTP_STATUS_REGEX);
+
+			if (!*(args[cur_arg+1])) {
+				memprintf(errmsg, "'%s' expects a <pattern> as argument", args[cur_arg]);
+				goto error;
+			}
+			cur_arg++;
+			pattern = args[cur_arg];
+		}
+		else if (strcmp(args[cur_arg], "custom") == 0) {
+			if (in_pattern) {
+				memprintf(errmsg, "[!] not supported with '%s'", args[cur_arg]);
+				goto error;
+			}
+			if (type != TCPCHK_EXPECT_UNDEF) {
+				memprintf(errmsg, "only one pattern expected");
+				goto error;
+			}
+			type = TCPCHK_EXPECT_CUSTOM;
+		}
+		else if (strcmp(args[cur_arg], "hdr") == 0 || strcmp(args[cur_arg], "fhdr") == 0) {
+			int orig_arg = cur_arg;
+
+			if (proto != TCPCHK_RULES_HTTP_CHK)
+				goto bad_tcp_kw;
+			if (type != TCPCHK_EXPECT_UNDEF) {
+				memprintf(errmsg, "only one pattern expected");
+				goto error;
+			}
+			type = TCPCHK_EXPECT_HTTP_HEADER;
+
+			if (strcmp(args[cur_arg], "fhdr") == 0)
+				flags |= TCPCHK_EXPT_FL_HTTP_HVAL_FULL;
+
+			/* Parse the name pattern, mandatory */
+			if (!*(args[cur_arg+1]) || !*(args[cur_arg+2]) ||
+			    (strcmp(args[cur_arg+1], "name") != 0 && strcmp(args[cur_arg+1], "name-lf") != 0)) {
+				memprintf(errmsg, "'%s' expects the 'name' keyword as first argument followed by a pattern",
+					  args[orig_arg]);
+				goto error;
+			}
+
+			if (strcmp(args[cur_arg+1], "name-lf") == 0)
+				flags |= TCPCHK_EXPT_FL_HTTP_HNAME_FMT;
+
+			cur_arg += 2;
+			if (strcmp(args[cur_arg], "-m") == 0) {
+				if (!*(args[cur_arg+1])) {
+					memprintf(errmsg, "'%s' : '%s' expects a matching pattern ('str', 'beg', 'end', 'sub' or 'reg')",
+						  args[orig_arg], args[cur_arg]);
+					goto error;
+				}
+				if (strcmp(args[cur_arg+1], "str") == 0)
+					flags |= TCPCHK_EXPT_FL_HTTP_HNAME_STR;
+				else if (strcmp(args[cur_arg+1], "beg") == 0)
+					flags |= TCPCHK_EXPT_FL_HTTP_HNAME_BEG;
+				else if (strcmp(args[cur_arg+1], "end") == 0)
+					flags |= TCPCHK_EXPT_FL_HTTP_HNAME_END;
+				else if (strcmp(args[cur_arg+1], "sub") == 0)
+					flags |= TCPCHK_EXPT_FL_HTTP_HNAME_SUB;
+				else if (strcmp(args[cur_arg+1], "reg") == 0) {
+					if (flags & TCPCHK_EXPT_FL_HTTP_HNAME_FMT) {
+						memprintf(errmsg, "'%s': log-format string is not supported with a regex matching method",
+							  args[orig_arg]);
+						goto error;
+					}
+					flags |= TCPCHK_EXPT_FL_HTTP_HNAME_REG;
+				}
+				else {
+					memprintf(errmsg, "'%s' : '%s' only supports 'str', 'beg', 'end', 'sub' or 'reg' (got '%s')",
+						  args[orig_arg], args[cur_arg], args[cur_arg+1]);
+					goto error;
+				}
+				cur_arg += 2;
+			}
+			else
+				flags |= TCPCHK_EXPT_FL_HTTP_HNAME_STR;
+			npat = args[cur_arg];
+
+			if (!*(args[cur_arg+1]) ||
+			    (strcmp(args[cur_arg+1], "value") != 0 && strcmp(args[cur_arg+1], "value-lf") != 0)) {
+				flags |= TCPCHK_EXPT_FL_HTTP_HVAL_NONE;
+				goto next;
+			}
+			if (strcmp(args[cur_arg+1], "value-lf") == 0)
+				flags |= TCPCHK_EXPT_FL_HTTP_HVAL_FMT;
+
+			/* Parse the value pattern, optional */
+			if (strcmp(args[cur_arg+2], "-m") == 0) {
+				cur_arg += 2;
+				if (!*(args[cur_arg+1])) {
+					memprintf(errmsg, "'%s' : '%s' expects a matching pattern ('str', 
'beg', 'end', 'sub' or 'reg')",
+						  args[orig_arg], args[cur_arg]);
+					goto error;
+				}
+				if (strcmp(args[cur_arg+1], "str") == 0)
+					flags |= TCPCHK_EXPT_FL_HTTP_HVAL_STR;
+				else if (strcmp(args[cur_arg+1], "beg") == 0)
+					flags |= TCPCHK_EXPT_FL_HTTP_HVAL_BEG;
+				else if (strcmp(args[cur_arg+1], "end") == 0)
+					flags |= TCPCHK_EXPT_FL_HTTP_HVAL_END;
+				else if (strcmp(args[cur_arg+1], "sub") == 0)
+					flags |= TCPCHK_EXPT_FL_HTTP_HVAL_SUB;
+				else if (strcmp(args[cur_arg+1], "reg") == 0) {
+					if (flags & TCPCHK_EXPT_FL_HTTP_HVAL_FMT) {
+						memprintf(errmsg, "'%s': log-format string is not supported with a regex matching method",
+							  args[orig_arg]);
+						goto error;
+					}
+					flags |= TCPCHK_EXPT_FL_HTTP_HVAL_REG;
+				}
+				else {
+					memprintf(errmsg, "'%s' : '%s' only supports 'str', 'beg', 'end', 'sub' or 'reg' (got '%s')",
+						  args[orig_arg], args[cur_arg], args[cur_arg+1]);
+					goto error;
+				}
+			}
+			else
+				flags |= TCPCHK_EXPT_FL_HTTP_HVAL_STR;
+
+			if (!*(args[cur_arg+2])) {
+				memprintf(errmsg, "'%s' expects a pattern with the 'value' keyword", args[orig_arg]);
+				goto error;
+			}
+			vpat = args[cur_arg+2];
+			cur_arg += 2;
+		}
+		else if (strcmp(args[cur_arg], "comment") == 0) {
+			if (in_pattern) {
+				memprintf(errmsg, "[!] not supported with '%s'", args[cur_arg]);
+				goto error;
+			}
+			if (!*(args[cur_arg+1])) {
+				memprintf(errmsg, "'%s' expects a string as argument", args[cur_arg]);
+				goto error;
+			}
+			cur_arg++;
+			free(comment);
+			comment = strdup(args[cur_arg]);
+			if (!comment) {
+				memprintf(errmsg, "out of memory");
+				goto error;
+			}
+		}
+		else if (strcmp(args[cur_arg], "on-success") == 0) {
+			if (in_pattern) {
+				memprintf(errmsg, "[!] not supported with '%s'", args[cur_arg]);
+				goto error;
+			}
+			if (!*(args[cur_arg+1])) {
+				memprintf(errmsg, "'%s' expects a string as argument", args[cur_arg]);
+				goto error;
+			}
+			cur_arg++;
+			on_success_msg = args[cur_arg];
+		}
+		else if (strcmp(args[cur_arg], "on-error") == 0) {
+			if (in_pattern) {
+				memprintf(errmsg, "[!] not supported with '%s'", args[cur_arg]);
+				goto error;
+			}
+			if (!*(args[cur_arg+1])) {
+				memprintf(errmsg, "'%s' expects a string as argument", args[cur_arg]);
+				goto error;
+			}
+			cur_arg++;
+			on_error_msg = args[cur_arg];
+		}
+		else if (strcmp(args[cur_arg], "ok-status") == 0) {
+			if (in_pattern) {
+				memprintf(errmsg, "[!] not supported with '%s'", args[cur_arg]);
+				goto error;
+			}
+			if (!*(args[cur_arg+1])) {
+				memprintf(errmsg, "'%s' expects a string as argument", args[cur_arg]);
+				goto error;
+			}
+			if (strcasecmp(args[cur_arg+1], "L7OK") == 0)
+				ok_st = HCHK_STATUS_L7OKD;
+			else if (strcasecmp(args[cur_arg+1], "L7OKC") == 0)
+				ok_st = HCHK_STATUS_L7OKCD;
+			else if (strcasecmp(args[cur_arg+1], "L6OK") == 0)
+				ok_st = HCHK_STATUS_L6OK;
+			else if (strcasecmp(args[cur_arg+1], "L4OK") == 0)
+				ok_st = HCHK_STATUS_L4OK;
+			else {
+				memprintf(errmsg, "'%s' only supports 'L4OK', 'L6OK', 'L7OK' or 'L7OKC' status (got '%s').",
+					  args[cur_arg], args[cur_arg+1]);
+				goto error;
+			}
+			cur_arg++;
+		}
+		else if (strcmp(args[cur_arg], "error-status") == 0) {
+			if (in_pattern) {
+				memprintf(errmsg, "[!] 
not supported with '%s'", args[cur_arg]);
+				goto error;
+			}
+			if (!*(args[cur_arg+1])) {
+				memprintf(errmsg, "'%s' expects a string as argument", args[cur_arg]);
+				goto error;
+			}
+			if (strcasecmp(args[cur_arg+1], "L7RSP") == 0)
+				err_st = HCHK_STATUS_L7RSP;
+			else if (strcasecmp(args[cur_arg+1], "L7STS") == 0)
+				err_st = HCHK_STATUS_L7STS;
+			else if (strcasecmp(args[cur_arg+1], "L7OKC") == 0)
+				err_st = HCHK_STATUS_L7OKCD;
+			else if (strcasecmp(args[cur_arg+1], "L6RSP") == 0)
+				err_st = HCHK_STATUS_L6RSP;
+			else if (strcasecmp(args[cur_arg+1], "L4CON") == 0)
+				err_st = HCHK_STATUS_L4CON;
+			else {
+				memprintf(errmsg, "'%s' only supports 'L4CON', 'L6RSP', 'L7RSP' or 'L7STS' status (got '%s').",
+					  args[cur_arg], args[cur_arg+1]);
+				goto error;
+			}
+			cur_arg++;
+		}
+		else if (strcmp(args[cur_arg], "status-code") == 0) {
+			int idx = 0;
+
+			if (in_pattern) {
+				memprintf(errmsg, "[!] not supported with '%s'", args[cur_arg]);
+				goto error;
+			}
+			if (!*(args[cur_arg+1])) {
+				memprintf(errmsg, "'%s' expects an expression as argument", args[cur_arg]);
+				goto error;
+			}
+
+			cur_arg++;
+			release_sample_expr(status_expr);
+			px->conf.args.ctx = ARGC_SRV;
+			status_expr = sample_parse_expr((char *[]){args[cur_arg], NULL}, &idx,
+							file, line, errmsg, &px->conf.args, NULL);
+			if (!status_expr) {
+				memprintf(errmsg, "error detected while parsing status-code expression : %s", *errmsg);
+				goto error;
+			}
+			if (!(status_expr->fetch->val & SMP_VAL_BE_CHK_RUL)) {
+				memprintf(errmsg, "error detected while parsing status-code expression : "
+					  "fetch method '%s' extracts information from '%s', "
+					  "none of which is available here.\n",
+					  args[cur_arg], sample_src_names(status_expr->fetch->use));
+				goto error;
+			}
+			px->http_needed |= !!(status_expr->fetch->use & SMP_USE_HTTP_ANY);
+		}
+		else if (strcmp(args[cur_arg], "tout-status") == 0) {
+			if (in_pattern) {
+				memprintf(errmsg, "[!] not supported with '%s'", args[cur_arg]);
+				goto error;
+			}
+			if (!*(args[cur_arg+1])) {
+				memprintf(errmsg, "'%s' expects a string as argument", args[cur_arg]);
+				goto error;
+			}
+			if (strcasecmp(args[cur_arg+1], "L7TOUT") == 0)
+				tout_st = HCHK_STATUS_L7TOUT;
+			else if (strcasecmp(args[cur_arg+1], "L6TOUT") == 0)
+				tout_st = HCHK_STATUS_L6TOUT;
+			else if (strcasecmp(args[cur_arg+1], "L4TOUT") == 0)
+				tout_st = HCHK_STATUS_L4TOUT;
+			else {
+				memprintf(errmsg, "'%s' only supports 'L4TOUT', 'L6TOUT' or 'L7TOUT' status (got '%s').",
+					  args[cur_arg], args[cur_arg+1]);
+				goto error;
+			}
+			cur_arg++;
+		}
+		else {
+			if (proto == TCPCHK_RULES_HTTP_CHK) {
+			  bad_http_kw:
+				memprintf(errmsg, "only supports 'min-recv', '[!]string', '[!]rstring', '[!]string-lf', '[!]status', "
+					  "'[!]rstatus', '[!]hdr', '[!]fhdr' or 'comment' but got '%s' as argument.", args[cur_arg]);
+			}
+			else {
+			  bad_tcp_kw:
+				memprintf(errmsg, "only supports 'min-recv', '[!]binary', '[!]string', '[!]rstring', '[!]string-lf', "
+					  "'[!]rbinary', '[!]binary-lf' or 'comment' but got '%s' as argument.", args[cur_arg]);
+			}
+			goto error;
+		}
+	  next:
+		cur_arg++;
+	}
+
+	chk = calloc(1, sizeof(*chk));
+	if (!chk) {
+		memprintf(errmsg, "out of memory");
+		goto error;
+	}
+	chk->action = TCPCHK_ACT_EXPECT;
+	LIST_INIT(&chk->expect.onerror_fmt);
+	LIST_INIT(&chk->expect.onsuccess_fmt);
+	chk->comment = comment; comment = NULL;
+	chk->expect.type = type;
+	chk->expect.min_recv = min_recv;
+	chk->expect.flags = flags | (inverse ? 
TCPCHK_EXPT_FL_INV : 0); + chk->expect.ok_status = ok_st; + chk->expect.err_status = err_st; + chk->expect.tout_status = tout_st; + chk->expect.status_expr = status_expr; status_expr = NULL; + + if (on_success_msg) { + px->conf.args.ctx = ARGC_SRV; + if (!parse_logformat_string(on_success_msg, px, &chk->expect.onsuccess_fmt, 0, SMP_VAL_BE_CHK_RUL, errmsg)) { + memprintf(errmsg, "'%s' invalid log-format string (%s).\n", on_success_msg, *errmsg); + goto error; + } + } + if (on_error_msg) { + px->conf.args.ctx = ARGC_SRV; + if (!parse_logformat_string(on_error_msg, px, &chk->expect.onerror_fmt, 0, SMP_VAL_BE_CHK_RUL, errmsg)) { + memprintf(errmsg, "'%s' invalid log-format string (%s).\n", on_error_msg, *errmsg); + goto error; + } + } + + switch (chk->expect.type) { + case TCPCHK_EXPECT_HTTP_STATUS: { + const char *p = pattern; + unsigned int c1,c2; + + chk->expect.codes.codes = NULL; + chk->expect.codes.num = 0; + while (1) { + c1 = c2 = read_uint(&p, pattern + strlen(pattern)); + if (*p == '-') { + p++; + c2 = read_uint(&p, pattern + strlen(pattern)); + } + if (c1 > c2) { + memprintf(errmsg, "invalid range of status codes '%s'", pattern); + goto error; + } + + chk->expect.codes.num++; + chk->expect.codes.codes = my_realloc2(chk->expect.codes.codes, + chk->expect.codes.num * sizeof(*chk->expect.codes.codes)); + if (!chk->expect.codes.codes) { + memprintf(errmsg, "out of memory"); + goto error; + } + chk->expect.codes.codes[chk->expect.codes.num-1][0] = c1; + chk->expect.codes.codes[chk->expect.codes.num-1][1] = c2; + + if (*p == '\0') + break; + if (*p != ',') { + memprintf(errmsg, "invalid character '%c' in the list of status codes", *p); + goto error; + } + p++; + } + break; + } + case TCPCHK_EXPECT_STRING: + case TCPCHK_EXPECT_HTTP_BODY: + chk->expect.data = ist(strdup(pattern)); + if (!isttest(chk->expect.data)) { + memprintf(errmsg, "out of memory"); + goto error; + } + break; + case TCPCHK_EXPECT_BINARY: { + int len = chk->expect.data.len; + + if (parse_binary(pattern, &chk->expect.data.ptr, &len, errmsg) == 0) { + memprintf(errmsg, "invalid binary string (%s)", *errmsg); + goto error; + } + chk->expect.data.len = len; + break; + } + case TCPCHK_EXPECT_STRING_REGEX: + case TCPCHK_EXPECT_BINARY_REGEX: + case TCPCHK_EXPECT_HTTP_STATUS_REGEX: + case TCPCHK_EXPECT_HTTP_BODY_REGEX: + chk->expect.regex = regex_comp(pattern, 1, 0, errmsg); + if (!chk->expect.regex) + goto error; + break; + + case TCPCHK_EXPECT_STRING_LF: + case TCPCHK_EXPECT_BINARY_LF: + case TCPCHK_EXPECT_HTTP_BODY_LF: + LIST_INIT(&chk->expect.fmt); + px->conf.args.ctx = ARGC_SRV; + if (!parse_logformat_string(pattern, px, &chk->expect.fmt, 0, SMP_VAL_BE_CHK_RUL, errmsg)) { + memprintf(errmsg, "'%s' invalid log-format string (%s).\n", pattern, *errmsg); + goto error; + } + break; + + case TCPCHK_EXPECT_HTTP_HEADER: + if (!npat) { + memprintf(errmsg, "unexpected error, undefined header name pattern"); + goto error; + } + if (chk->expect.flags & TCPCHK_EXPT_FL_HTTP_HNAME_REG) { + chk->expect.hdr.name_re = regex_comp(npat, 0, 0, errmsg); + if (!chk->expect.hdr.name_re) + goto error; + } + else if (chk->expect.flags & TCPCHK_EXPT_FL_HTTP_HNAME_FMT) { + px->conf.args.ctx = ARGC_SRV; + LIST_INIT(&chk->expect.hdr.name_fmt); + if (!parse_logformat_string(npat, px, &chk->expect.hdr.name_fmt, 0, SMP_VAL_BE_CHK_RUL, errmsg)) { + memprintf(errmsg, "'%s' invalid log-format string (%s).\n", npat, *errmsg); + goto error; + } + } + else { + chk->expect.hdr.name = ist(strdup(npat)); + if (!isttest(chk->expect.hdr.name)) { + memprintf(errmsg, 
"out of memory"); + goto error; + } + } + + if (chk->expect.flags & TCPCHK_EXPT_FL_HTTP_HVAL_NONE) { + chk->expect.hdr.value = IST_NULL; + break; + } + + if (!vpat) { + memprintf(errmsg, "unexpected error, undefined header value pattern"); + goto error; + } + else if (chk->expect.flags & TCPCHK_EXPT_FL_HTTP_HVAL_REG) { + chk->expect.hdr.value_re = regex_comp(vpat, 1, 0, errmsg); + if (!chk->expect.hdr.value_re) + goto error; + } + else if (chk->expect.flags & TCPCHK_EXPT_FL_HTTP_HVAL_FMT) { + px->conf.args.ctx = ARGC_SRV; + LIST_INIT(&chk->expect.hdr.value_fmt); + if (!parse_logformat_string(vpat, px, &chk->expect.hdr.value_fmt, 0, SMP_VAL_BE_CHK_RUL, errmsg)) { + memprintf(errmsg, "'%s' invalid log-format string (%s).\n", npat, *errmsg); + goto error; + } + } + else { + chk->expect.hdr.value = ist(strdup(vpat)); + if (!isttest(chk->expect.hdr.value)) { + memprintf(errmsg, "out of memory"); + goto error; + } + } + + break; + case TCPCHK_EXPECT_CUSTOM: + chk->expect.custom = NULL; /* Must be defined by the caller ! */ + break; + case TCPCHK_EXPECT_UNDEF: + memprintf(errmsg, "pattern not found"); + goto error; + } + + /* All tcp-check expect points back to the first inverse expect rule in + * a chain of one or more expect rule, potentially itself. + */ + chk->expect.head = chk; + list_for_each_entry_rev(prev_check, rules, list) { + if (prev_check->action == TCPCHK_ACT_EXPECT) { + if (prev_check->expect.flags & TCPCHK_EXPT_FL_INV) + chk->expect.head = prev_check; + continue; + } + if (prev_check->action != TCPCHK_ACT_COMMENT && prev_check->action != TCPCHK_ACT_ACTION_KW) + break; + } + return chk; + + error: + free_tcpcheck(chk, 0); + free(comment); + release_sample_expr(status_expr); + return NULL; +} + +/* Overwrites fields of the old http send rule with those of the new one. When + * replaced, old values are freed and replaced by the new ones. New values are + * not copied but transferred. At the end <new> should be empty and can be + * safely released. This function never fails. 
+ */
+void tcpcheck_overwrite_send_http_rule(struct tcpcheck_rule *old, struct tcpcheck_rule *new)
+{
+	struct logformat_node *lf, *lfb;
+	struct tcpcheck_http_hdr *hdr, *bhdr;
+
+
+	if (new->send.http.meth.str.area) {
+		free(old->send.http.meth.str.area);
+		old->send.http.meth.meth = new->send.http.meth.meth;
+		old->send.http.meth.str.area = new->send.http.meth.str.area;
+		old->send.http.meth.str.data = new->send.http.meth.str.data;
+		new->send.http.meth.str = BUF_NULL;
+	}
+
+	if (!(new->send.http.flags & TCPCHK_SND_HTTP_FL_URI_FMT) && isttest(new->send.http.uri)) {
+		if (!(old->send.http.flags & TCPCHK_SND_HTTP_FL_URI_FMT))
+			istfree(&old->send.http.uri);
+		else
+			free_tcpcheck_fmt(&old->send.http.uri_fmt);
+		old->send.http.flags &= ~TCPCHK_SND_HTTP_FL_URI_FMT;
+		old->send.http.uri = new->send.http.uri;
+		new->send.http.uri = IST_NULL;
+	}
+	else if ((new->send.http.flags & TCPCHK_SND_HTTP_FL_URI_FMT) && !LIST_ISEMPTY(&new->send.http.uri_fmt)) {
+		if (!(old->send.http.flags & TCPCHK_SND_HTTP_FL_URI_FMT))
+			istfree(&old->send.http.uri);
+		else
+			free_tcpcheck_fmt(&old->send.http.uri_fmt);
+		old->send.http.flags |= TCPCHK_SND_HTTP_FL_URI_FMT;
+		LIST_INIT(&old->send.http.uri_fmt);
+		list_for_each_entry_safe(lf, lfb, &new->send.http.uri_fmt, list) {
+			LIST_DELETE(&lf->list);
+			LIST_APPEND(&old->send.http.uri_fmt, &lf->list);
+		}
+	}
+
+	if (isttest(new->send.http.vsn)) {
+		istfree(&old->send.http.vsn);
+		old->send.http.vsn = new->send.http.vsn;
+		new->send.http.vsn = IST_NULL;
+	}
+
+	if (!LIST_ISEMPTY(&new->send.http.hdrs)) {
+		free_tcpcheck_http_hdrs(&old->send.http.hdrs);
+		list_for_each_entry_safe(hdr, bhdr, &new->send.http.hdrs, list) {
+			LIST_DELETE(&hdr->list);
+			LIST_APPEND(&old->send.http.hdrs, &hdr->list);
+		}
+	}
+
+	if (!(new->send.http.flags & TCPCHK_SND_HTTP_FL_BODY_FMT) && isttest(new->send.http.body)) {
+		if (!(old->send.http.flags & TCPCHK_SND_HTTP_FL_BODY_FMT))
+			istfree(&old->send.http.body);
+		else
+			free_tcpcheck_fmt(&old->send.http.body_fmt);
+		old->send.http.flags &= ~TCPCHK_SND_HTTP_FL_BODY_FMT;
+		old->send.http.body = new->send.http.body;
+		new->send.http.body = IST_NULL;
+	}
+	else if ((new->send.http.flags & TCPCHK_SND_HTTP_FL_BODY_FMT) && !LIST_ISEMPTY(&new->send.http.body_fmt)) {
+		if (!(old->send.http.flags & TCPCHK_SND_HTTP_FL_BODY_FMT))
+			istfree(&old->send.http.body);
+		else
+			free_tcpcheck_fmt(&old->send.http.body_fmt);
+		old->send.http.flags |= TCPCHK_SND_HTTP_FL_BODY_FMT;
+		LIST_INIT(&old->send.http.body_fmt);
+		list_for_each_entry_safe(lf, lfb, &new->send.http.body_fmt, list) {
+			LIST_DELETE(&lf->list);
+			LIST_APPEND(&old->send.http.body_fmt, &lf->list);
+		}
+	}
+}
+
+/* Internal function used to add an http-check rule in a list during the config
+ * parsing step. Depending on its type, and the previously inserted rules, a
+ * specific action may be performed or an error may be reported. This function
+ * returns 1 on success and 0 on error and <errmsg> is filled with the error
+ * message.
+ */
+int tcpcheck_add_http_rule(struct tcpcheck_rule *chk, struct tcpcheck_rules *rules, char **errmsg)
+{
+	struct tcpcheck_rule *r;
+
+	/* the implicit send rule coming from an "option httpchk" line must be
+	 * merged with the first explicit http-check send rule, if
+	 * any. Depending on the declaration order some tests are required.
+	 *
+	 * Some tests are also required for other kinds of http-check rules to be
+	 * sure the ruleset remains valid. 
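+	 *
+	 * For example (editor's sketch), this sequence is accepted:
+	 *   http-check connect
+	 *   http-check send meth GET uri /status
+	 *   http-check expect status 200
+	 * while an expect rule that is preceded by neither a send nor another
+	 * expect rule is rejected below with a "missing send rule" error.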
+	 */
+
+	if (chk->action == TCPCHK_ACT_SEND && (chk->send.http.flags & TCPCHK_SND_HTTP_FROM_OPT)) {
+		/* Tries to add an implicit http-check send rule from an "option httpchk" line.
+		 * First, the first rule is retrieved, skipping the first CONNECT, if any, and
+		 * following tests are performed :
+		 *
+		 *  1- If there is no such rule or if it is not a send rule, the implicit send
+		 *     rule is pushed in front of the ruleset.
+		 *
+		 *  2- If it is another implicit send rule, it is replaced with the new one.
+		 *
+		 *  3- Otherwise, it means it is an explicit send rule. In this case we merge
+		 *     both, overwriting the old send rule (the explicit one) with info of the
+		 *     new send rule (the implicit one).
+		 */
+		r = get_first_tcpcheck_rule(rules);
+		if (r && r->action == TCPCHK_ACT_CONNECT)
+			r = get_next_tcpcheck_rule(rules, r);
+		if (!r || r->action != TCPCHK_ACT_SEND)
+			LIST_INSERT(rules->list, &chk->list);
+		else if (r->send.http.flags & TCPCHK_SND_HTTP_FROM_OPT) {
+			LIST_DELETE(&r->list);
+			free_tcpcheck(r, 0);
+			LIST_INSERT(rules->list, &chk->list);
+		}
+		else {
+			tcpcheck_overwrite_send_http_rule(r, chk);
+			free_tcpcheck(chk, 0);
+		}
+	}
+	else {
+		/* Tries to add an explicit http-check rule. First of all we check the type of the
+		 * last inserted rule to be sure it is valid. Then for a send rule, we try to merge it
+		 * with an existing implicit send rule, if any. At the end, if there is no error,
+		 * the rule is appended to the list.
+		 */
+
+		r = get_last_tcpcheck_rule(rules);
+		if (!r || (r->action == TCPCHK_ACT_SEND && (r->send.http.flags & TCPCHK_SND_HTTP_FROM_OPT)))
+			/* no error */;
+		else if (r->action != TCPCHK_ACT_CONNECT && chk->action == TCPCHK_ACT_SEND) {
+			memprintf(errmsg, "unable to add http-check send rule at step %d (missing connect rule).",
+				  chk->index+1);
+			return 0;
+		}
+		else if (r->action != TCPCHK_ACT_SEND && r->action != TCPCHK_ACT_EXPECT && chk->action == TCPCHK_ACT_EXPECT) {
+			memprintf(errmsg, "unable to add http-check expect rule at step %d (missing send rule).",
+				  chk->index+1);
+			return 0;
+		}
+		else if (r->action != TCPCHK_ACT_EXPECT && chk->action == TCPCHK_ACT_CONNECT) {
+			memprintf(errmsg, "unable to add http-check connect rule at step %d (missing expect rule).",
+				  chk->index+1);
+			return 0;
+		}
+
+		if (chk->action == TCPCHK_ACT_SEND) {
+			r = get_first_tcpcheck_rule(rules);
+			if (r && r->action == TCPCHK_ACT_SEND && (r->send.http.flags & TCPCHK_SND_HTTP_FROM_OPT)) {
+				tcpcheck_overwrite_send_http_rule(r, chk);
+				free_tcpcheck(chk, 0);
+				LIST_DELETE(&r->list);
+				r->send.http.flags &= ~TCPCHK_SND_HTTP_FROM_OPT;
+				chk = r;
+			}
+		}
+		LIST_APPEND(rules->list, &chk->list);
+	}
+	return 1;
+}
+
+/* Check tcp-check health-check configuration for the proxy <px>. */
+static int check_proxy_tcpcheck(struct proxy *px)
+{
+	struct tcpcheck_rule *chk, *back;
+	char *comment = NULL, *errmsg = NULL;
+	enum tcpcheck_rule_type prev_action = TCPCHK_ACT_COMMENT;
+	int ret = ERR_NONE;
+
+	if (!(px->cap & PR_CAP_BE) || (px->options2 & PR_O2_CHK_ANY) != PR_O2_TCPCHK_CHK) {
+		deinit_proxy_tcpcheck(px);
+		goto out;
+	}
+
+	ha_free(&px->check_command);
+	ha_free(&px->check_path);
+
+	if (!px->tcpcheck_rules.list) {
+		ha_alert("proxy '%s' : tcp-check configured but no ruleset defined.\n", px->id);
+		ret |= ERR_ALERT | ERR_FATAL;
+		goto out;
+	}
+
+	/* HTTP ruleset only : */
+	if ((px->tcpcheck_rules.flags & TCPCHK_RULES_PROTO_CHK) == TCPCHK_RULES_HTTP_CHK) {
+		struct tcpcheck_rule *next;
+
+		/* move remaining implicit send rule from "option httpchk" line to the right place. 
+		 * If such a rule exists, it must be the first one. In this case, the rule is moved
+		 * after the first connect rule, if any. Otherwise, nothing is done.
+		 */
+		chk = get_first_tcpcheck_rule(&px->tcpcheck_rules);
+		if (chk && chk->action == TCPCHK_ACT_SEND && (chk->send.http.flags & TCPCHK_SND_HTTP_FROM_OPT)) {
+			next = get_next_tcpcheck_rule(&px->tcpcheck_rules, chk);
+			if (next && next->action == TCPCHK_ACT_CONNECT) {
+				LIST_DELETE(&chk->list);
+				LIST_INSERT(&next->list, &chk->list);
+				chk->index = next->index + 1;
+			}
+		}
+
+		/* add an implicit expect rule if the last one is a send. It is inherited from previous
+		 * versions where the http expect rule was optional. Now it is possible to chain
+		 * send/expect rules but the last expect may still be implicit.
+		 */
+		chk = get_last_tcpcheck_rule(&px->tcpcheck_rules);
+		if (chk && chk->action == TCPCHK_ACT_SEND) {
+			next = parse_tcpcheck_expect((char *[]){"http-check", "expect", "status", "200-399", ""},
+						     1, px, px->tcpcheck_rules.list, TCPCHK_RULES_HTTP_CHK,
+						     px->conf.file, px->conf.line, &errmsg);
+			if (!next) {
+				ha_alert("proxy '%s': unable to add implicit http-check expect rule "
+					 "(%s).\n", px->id, errmsg);
+				free(errmsg);
+				ret |= ERR_ALERT | ERR_FATAL;
+				goto out;
+			}
+			LIST_APPEND(px->tcpcheck_rules.list, &next->list);
+			next->index = chk->index + 1;
+		}
+	}
+
+	/* For all rulesets: */
+
+	/* If there is no connect rule preceding all send / expect rules, an
+	 * implicit one is inserted before all others.
+	 */
+	chk = get_first_tcpcheck_rule(&px->tcpcheck_rules);
+	if (!chk || chk->action != TCPCHK_ACT_CONNECT) {
+		chk = calloc(1, sizeof(*chk));
+		if (!chk) {
+			ha_alert("proxy '%s': unable to add implicit tcp-check connect rule "
+				 "(out of memory).\n", px->id);
+			ret |= ERR_ALERT | ERR_FATAL;
+			goto out;
+		}
+		chk->action = TCPCHK_ACT_CONNECT;
+		chk->connect.options = (TCPCHK_OPT_DEFAULT_CONNECT|TCPCHK_OPT_IMPLICIT);
+		LIST_INSERT(px->tcpcheck_rules.list, &chk->list);
+	}
+
+	/* Remove all comment rules. To do so, when such a rule is found, the
+	 * comment is assigned to the following rule(s). 
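+	 *
+	 * For example (editor's sketch), with:
+	 *   tcp-check comment "ping the server"
+	 *   tcp-check send PING\r\n
+	 * the comment rule is deleted and "ping the server" becomes the comment
+	 * of the following send rule.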
+	 */
+	list_for_each_entry_safe(chk, back, px->tcpcheck_rules.list, list) {
+		struct tcpcheck_rule *next;
+
+		if (chk->action != prev_action && prev_action != TCPCHK_ACT_COMMENT)
+			ha_free(&comment);
+
+		prev_action = chk->action;
+		switch (chk->action) {
+		case TCPCHK_ACT_COMMENT:
+			free(comment);
+			comment = chk->comment;
+			LIST_DELETE(&chk->list);
+			free(chk);
+			break;
+		case TCPCHK_ACT_CONNECT:
+			if (!chk->comment && comment)
+				chk->comment = strdup(comment);
+			next = get_next_tcpcheck_rule(&px->tcpcheck_rules, chk);
+			if (next && next->action == TCPCHK_ACT_SEND)
+				chk->connect.options |= TCPCHK_OPT_HAS_DATA;
+			__fallthrough;
+		case TCPCHK_ACT_ACTION_KW:
+			ha_free(&comment);
+			break;
+		case TCPCHK_ACT_SEND:
+		case TCPCHK_ACT_EXPECT:
+			if (!chk->comment && comment)
+				chk->comment = strdup(comment);
+			break;
+		}
+	}
+	ha_free(&comment);
+
+  out:
+	return ret;
+}
+
+void deinit_proxy_tcpcheck(struct proxy *px)
+{
+	free_tcpcheck_vars(&px->tcpcheck_rules.preset_vars);
+	px->tcpcheck_rules.flags = 0;
+	px->tcpcheck_rules.list = NULL;
+}
+
+static void deinit_tcpchecks()
+{
+	struct tcpcheck_ruleset *rs;
+	struct tcpcheck_rule *r, *rb;
+	struct ebpt_node *node, *next;
+
+	node = ebpt_first(&shared_tcpchecks);
+	while (node) {
+		next = ebpt_next(node);
+		ebpt_delete(node);
+		free(node->key);
+		rs = container_of(node, typeof(*rs), node);
+		list_for_each_entry_safe(r, rb, &rs->rules, list) {
+			LIST_DELETE(&r->list);
+			free_tcpcheck(r, 0);
+		}
+		free(rs);
+		node = next;
+	}
+}
+
+int add_tcpcheck_expect_str(struct tcpcheck_rules *rules, const char *str)
+{
+	struct tcpcheck_rule *tcpcheck, *prev_check;
+	struct tcpcheck_expect *expect;
+
+	if ((tcpcheck = pool_zalloc(pool_head_tcpcheck_rule)) == NULL)
+		return 0;
+	tcpcheck->action = TCPCHK_ACT_EXPECT;
+
+	expect = &tcpcheck->expect;
+	expect->type = TCPCHK_EXPECT_STRING;
+	LIST_INIT(&expect->onerror_fmt);
+	LIST_INIT(&expect->onsuccess_fmt);
+	expect->ok_status = HCHK_STATUS_L7OKD;
+	expect->err_status = HCHK_STATUS_L7RSP;
+	expect->tout_status = HCHK_STATUS_L7TOUT;
+	expect->data = ist(strdup(str));
+	if (!isttest(expect->data)) {
+		pool_free(pool_head_tcpcheck_rule, tcpcheck);
+		return 0;
+	}
+
+	/* Each tcp-check expect rule points back to the first inverse expect rule
+	 * in a chain of one or more expect rules, potentially itself. 
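+	 *
+	 * For example (editor's note), in "tcp-check expect !string foo" followed
+	 * by "tcp-check expect string bar", both rules end up sharing the same
+	 * head, namely the first (inverse) expect rule.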
+ */ + tcpcheck->expect.head = tcpcheck; + list_for_each_entry_rev(prev_check, rules->list, list) { + if (prev_check->action == TCPCHK_ACT_EXPECT) { + if (prev_check->expect.flags & TCPCHK_EXPT_FL_INV) + tcpcheck->expect.head = prev_check; + continue; + } + if (prev_check->action != TCPCHK_ACT_COMMENT && prev_check->action != TCPCHK_ACT_ACTION_KW) + break; + } + LIST_APPEND(rules->list, &tcpcheck->list); + return 1; +} + +int add_tcpcheck_send_strs(struct tcpcheck_rules *rules, const char * const *strs) +{ + struct tcpcheck_rule *tcpcheck; + struct tcpcheck_send *send; + const char *in; + char *dst; + int i; + + if ((tcpcheck = pool_zalloc(pool_head_tcpcheck_rule)) == NULL) + return 0; + tcpcheck->action = TCPCHK_ACT_SEND; + + send = &tcpcheck->send; + send->type = TCPCHK_SEND_STRING; + + for (i = 0; strs[i]; i++) + send->data.len += strlen(strs[i]); + + send->data.ptr = malloc(istlen(send->data) + 1); + if (!isttest(send->data)) { + pool_free(pool_head_tcpcheck_rule, tcpcheck); + return 0; + } + + dst = istptr(send->data); + for (i = 0; strs[i]; i++) + for (in = strs[i]; (*dst = *in++); dst++); + *dst = 0; + + LIST_APPEND(rules->list, &tcpcheck->list); + return 1; +} + +/* Parses the "tcp-check" proxy keyword */ +static int proxy_parse_tcpcheck(char **args, int section, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **errmsg) +{ + struct tcpcheck_ruleset *rs = NULL; + struct tcpcheck_rule *chk = NULL; + int index, cur_arg, ret = 0; + + if (warnifnotcap(curpx, PR_CAP_BE, file, line, args[0], NULL)) + ret = 1; + + /* Deduce the ruleset name from the proxy info */ + chunk_printf(&trash, "*tcp-check-%s_%s-%d", + ((curpx == defpx) ? "defaults" : curpx->id), + curpx->conf.file, curpx->conf.line); + + rs = find_tcpcheck_ruleset(b_orig(&trash)); + if (rs == NULL) { + rs = create_tcpcheck_ruleset(b_orig(&trash)); + if (rs == NULL) { + memprintf(errmsg, "out of memory.\n"); + goto error; + } + } + + index = 0; + if (!LIST_ISEMPTY(&rs->rules)) { + chk = LIST_PREV(&rs->rules, typeof(chk), list); + index = chk->index + 1; + chk = NULL; + } + + cur_arg = 1; + if (strcmp(args[cur_arg], "connect") == 0) + chk = parse_tcpcheck_connect(args, cur_arg, curpx, &rs->rules, file, line, errmsg); + else if (strcmp(args[cur_arg], "send") == 0 || strcmp(args[cur_arg], "send-binary") == 0 || + strcmp(args[cur_arg], "send-lf") == 0 || strcmp(args[cur_arg], "send-binary-lf") == 0) + chk = parse_tcpcheck_send(args, cur_arg, curpx, &rs->rules, file, line, errmsg); + else if (strcmp(args[cur_arg], "expect") == 0) + chk = parse_tcpcheck_expect(args, cur_arg, curpx, &rs->rules, 0, file, line, errmsg); + else if (strcmp(args[cur_arg], "comment") == 0) + chk = parse_tcpcheck_comment(args, cur_arg, curpx, &rs->rules, file, line, errmsg); + else { + struct action_kw *kw = action_kw_tcp_check_lookup(args[cur_arg]); + + if (!kw) { + action_kw_tcp_check_build_list(&trash); + memprintf(errmsg, "'%s' only supports 'comment', 'connect', 'send', 'send-binary', 'expect'" + "%s%s. but got '%s'", + args[0], (*trash.area ? 
", " : ""), trash.area, args[1]); + goto error; + } + chk = parse_tcpcheck_action(args, cur_arg, curpx, &rs->rules, kw, file, line, errmsg); + } + + if (!chk) { + memprintf(errmsg, "'%s %s' : %s.", args[0], args[1], *errmsg); + goto error; + } + ret = (ret || (*errmsg != NULL)); /* Handle warning */ + + /* No error: add the tcp-check rule in the list */ + chk->index = index; + LIST_APPEND(&rs->rules, &chk->list); + + if ((curpx->options2 & PR_O2_CHK_ANY) == PR_O2_TCPCHK_CHK && + (curpx->tcpcheck_rules.flags & TCPCHK_RULES_PROTO_CHK) == TCPCHK_RULES_TCP_CHK) { + /* Use this ruleset if the proxy already has tcp-check enabled */ + curpx->tcpcheck_rules.list = &rs->rules; + curpx->tcpcheck_rules.flags &= ~TCPCHK_RULES_UNUSED_TCP_RS; + } + else { + /* mark this ruleset as unused for now */ + curpx->tcpcheck_rules.flags |= TCPCHK_RULES_UNUSED_TCP_RS; + } + + return ret; + + error: + free_tcpcheck(chk, 0); + free_tcpcheck_ruleset(rs); + return -1; +} + +/* Parses the "http-check" proxy keyword */ +static int proxy_parse_httpcheck(char **args, int section, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **errmsg) +{ + struct tcpcheck_ruleset *rs = NULL; + struct tcpcheck_rule *chk = NULL; + int index, cur_arg, ret = 0; + + if (warnifnotcap(curpx, PR_CAP_BE, file, line, args[0], NULL)) + ret = 1; + + cur_arg = 1; + if (strcmp(args[cur_arg], "disable-on-404") == 0) { + /* enable a graceful server shutdown on an HTTP 404 response */ + curpx->options |= PR_O_DISABLE404; + if (too_many_args(1, args, errmsg, NULL)) + goto error; + goto out; + } + else if (strcmp(args[cur_arg], "send-state") == 0) { + /* enable emission of the apparent state of a server in HTTP checks */ + curpx->options2 |= PR_O2_CHK_SNDST; + if (too_many_args(1, args, errmsg, NULL)) + goto error; + goto out; + } + + /* Deduce the ruleset name from the proxy info */ + chunk_printf(&trash, "*http-check-%s_%s-%d", + ((curpx == defpx) ? "defaults" : curpx->id), + curpx->conf.file, curpx->conf.line); + + rs = find_tcpcheck_ruleset(b_orig(&trash)); + if (rs == NULL) { + rs = create_tcpcheck_ruleset(b_orig(&trash)); + if (rs == NULL) { + memprintf(errmsg, "out of memory.\n"); + goto error; + } + } + + index = 0; + if (!LIST_ISEMPTY(&rs->rules)) { + chk = LIST_PREV(&rs->rules, typeof(chk), list); + if (chk->action != TCPCHK_ACT_SEND || !(chk->send.http.flags & TCPCHK_SND_HTTP_FROM_OPT)) + index = chk->index + 1; + chk = NULL; + } + + if (strcmp(args[cur_arg], "connect") == 0) + chk = parse_tcpcheck_connect(args, cur_arg, curpx, &rs->rules, file, line, errmsg); + else if (strcmp(args[cur_arg], "send") == 0) + chk = parse_tcpcheck_send_http(args, cur_arg, curpx, &rs->rules, file, line, errmsg); + else if (strcmp(args[cur_arg], "expect") == 0) + chk = parse_tcpcheck_expect(args, cur_arg, curpx, &rs->rules, TCPCHK_RULES_HTTP_CHK, + file, line, errmsg); + else if (strcmp(args[cur_arg], "comment") == 0) + chk = parse_tcpcheck_comment(args, cur_arg, curpx, &rs->rules, file, line, errmsg); + else { + struct action_kw *kw = action_kw_tcp_check_lookup(args[cur_arg]); + + if (!kw) { + action_kw_tcp_check_build_list(&trash); + memprintf(errmsg, "'%s' only supports 'disable-on-404', 'send-state', 'comment', 'connect'," + " 'send', 'expect'%s%s. but got '%s'", + args[0], (*trash.area ? 
", " : ""), trash.area, args[1]); + goto error; + } + chk = parse_tcpcheck_action(args, cur_arg, curpx, &rs->rules, kw, file, line, errmsg); + } + + if (!chk) { + memprintf(errmsg, "'%s %s' : %s.", args[0], args[1], *errmsg); + goto error; + } + ret = (*errmsg != NULL); /* Handle warning */ + + chk->index = index; + if ((curpx->options2 & PR_O2_CHK_ANY) == PR_O2_TCPCHK_CHK && + (curpx->tcpcheck_rules.flags & TCPCHK_RULES_PROTO_CHK) == TCPCHK_RULES_HTTP_CHK) { + /* Use this ruleset if the proxy already has http-check enabled */ + curpx->tcpcheck_rules.list = &rs->rules; + curpx->tcpcheck_rules.flags &= ~TCPCHK_RULES_UNUSED_HTTP_RS; + if (!tcpcheck_add_http_rule(chk, &curpx->tcpcheck_rules, errmsg)) { + memprintf(errmsg, "'%s %s' : %s.", args[0], args[1], *errmsg); + curpx->tcpcheck_rules.list = NULL; + goto error; + } + } + else { + /* mark this ruleset as unused for now */ + curpx->tcpcheck_rules.flags |= TCPCHK_RULES_UNUSED_HTTP_RS; + LIST_APPEND(&rs->rules, &chk->list); + } + + out: + return ret; + + error: + free_tcpcheck(chk, 0); + free_tcpcheck_ruleset(rs); + return -1; +} + +/* Parses the "option redis-check" proxy keyword */ +int proxy_parse_redis_check_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line) +{ + static char *redis_req = "*1\r\n$4\r\nPING\r\n"; + static char *redis_res = "+PONG\r\n"; + + struct tcpcheck_ruleset *rs = NULL; + struct tcpcheck_rules *rules = &curpx->tcpcheck_rules; + struct tcpcheck_rule *chk; + char *errmsg = NULL; + int err_code = 0; + + if (warnifnotcap(curpx, PR_CAP_BE, file, line, args[cur_arg+1], NULL)) + err_code |= ERR_WARN; + + if (alertif_too_many_args_idx(0, 1, file, line, args, &err_code)) + goto out; + + curpx->options2 &= ~PR_O2_CHK_ANY; + curpx->options2 |= PR_O2_TCPCHK_CHK; + + free_tcpcheck_vars(&rules->preset_vars); + rules->list = NULL; + rules->flags = 0; + + rs = find_tcpcheck_ruleset("*redis-check"); + if (rs) + goto ruleset_found; + + rs = create_tcpcheck_ruleset("*redis-check"); + if (rs == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, line); + goto error; + } + + chk = parse_tcpcheck_send((char *[]){"tcp-check", "send", redis_req, ""}, + 1, curpx, &rs->rules, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = 0; + LIST_APPEND(&rs->rules, &chk->list); + + chk = parse_tcpcheck_expect((char *[]){"tcp-check", "expect", "string", redis_res, + "error-status", "L7STS", + "on-error", "%[res.payload(0,0),cut_crlf]", + "on-success", "Redis server is ok", + ""}, + 1, curpx, &rs->rules, TCPCHK_RULES_REDIS_CHK, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = 1; + LIST_APPEND(&rs->rules, &chk->list); + + ruleset_found: + rules->list = &rs->rules; + rules->flags &= ~(TCPCHK_RULES_PROTO_CHK|TCPCHK_RULES_UNUSED_RS); + rules->flags |= TCPCHK_RULES_REDIS_CHK; + + out: + free(errmsg); + return err_code; + + error: + free_tcpcheck_ruleset(rs); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +} + + +/* Parses the "option ssl-hello-chk" proxy keyword */ +int proxy_parse_ssl_hello_chk_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line) +{ + /* This is the SSLv3 CLIENT HELLO packet used in conjunction with the + * ssl-hello-chk option to ensure that the remote server speaks SSL. + * + * Check RFC 2246 (TLSv1.0) sections A.3 and A.4 for details. 
+ */ + static char sslv3_client_hello[] = { + "16" /* ContentType : 0x16 = Handshake */ + "0300" /* ProtocolVersion : 0x0300 = SSLv3 */ + "0079" /* ContentLength : 0x79 bytes after this one */ + "01" /* HanshakeType : 0x01 = CLIENT HELLO */ + "000075" /* HandshakeLength : 0x75 bytes after this one */ + "0300" /* Hello Version : 0x0300 = v3 */ + "%[date(),htonl,hex]" /* Unix GMT Time (s) : filled with <now> (@0x0B) */ + "%[str(HAPROXYSSLCHK\nHAPROXYSSLCHK\n),hex]" /* Random : must be exactly 28 bytes */ + "00" /* Session ID length : empty (no session ID) */ + "004E" /* Cipher Suite Length : 78 bytes after this one */ + "0001" "0002" "0003" "0004" /* 39 most common ciphers : */ + "0005" "0006" "0007" "0008" /* 0x01...0x1B, 0x2F...0x3A */ + "0009" "000A" "000B" "000C" /* This covers RSA/DH, */ + "000D" "000E" "000F" "0010" /* various bit lengths, */ + "0011" "0012" "0013" "0014" /* SHA1/MD5, DES/3DES/AES... */ + "0015" "0016" "0017" "0018" + "0019" "001A" "001B" "002F" + "0030" "0031" "0032" "0033" + "0034" "0035" "0036" "0037" + "0038" "0039" "003A" + "01" /* Compression Length : 0x01 = 1 byte for types */ + "00" /* Compression Type : 0x00 = NULL compression */ + }; + + struct tcpcheck_ruleset *rs = NULL; + struct tcpcheck_rules *rules = &curpx->tcpcheck_rules; + struct tcpcheck_rule *chk; + char *errmsg = NULL; + int err_code = 0; + + if (warnifnotcap(curpx, PR_CAP_BE, file, line, args[cur_arg+1], NULL)) + err_code |= ERR_WARN; + + if (alertif_too_many_args_idx(0, 1, file, line, args, &err_code)) + goto out; + + curpx->options2 &= ~PR_O2_CHK_ANY; + curpx->options2 |= PR_O2_TCPCHK_CHK; + + free_tcpcheck_vars(&rules->preset_vars); + rules->list = NULL; + rules->flags = 0; + + rs = find_tcpcheck_ruleset("*ssl-hello-check"); + if (rs) + goto ruleset_found; + + rs = create_tcpcheck_ruleset("*ssl-hello-check"); + if (rs == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, line); + goto error; + } + + chk = parse_tcpcheck_send((char *[]){"tcp-check", "send-binary-lf", sslv3_client_hello, ""}, + 1, curpx, &rs->rules, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = 0; + LIST_APPEND(&rs->rules, &chk->list); + + chk = parse_tcpcheck_expect((char *[]){"tcp-check", "expect", "rbinary", "^1[56]", + "min-recv", "5", "ok-status", "L6OK", + "error-status", "L6RSP", "tout-status", "L6TOUT", + ""}, + 1, curpx, &rs->rules, TCPCHK_RULES_SSL3_CHK, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = 1; + LIST_APPEND(&rs->rules, &chk->list); + + ruleset_found: + rules->list = &rs->rules; + rules->flags &= ~(TCPCHK_RULES_PROTO_CHK|TCPCHK_RULES_UNUSED_RS); + rules->flags |= TCPCHK_RULES_SSL3_CHK; + + out: + free(errmsg); + return err_code; + + error: + free_tcpcheck_ruleset(rs); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +} + +/* Parses the "option smtpchk" proxy keyword */ +int proxy_parse_smtpchk_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line) +{ + static char *smtp_req = "%[var(check.smtp_cmd)]\r\n"; + + struct tcpcheck_ruleset *rs = NULL; + struct tcpcheck_rules *rules = &curpx->tcpcheck_rules; + struct tcpcheck_rule *chk; + struct tcpcheck_var *var = NULL; + char *cmd = NULL, *errmsg = NULL; + int err_code = 0; + + if (warnifnotcap(curpx, PR_CAP_BE, file, line, args[cur_arg+1], NULL)) + err_code |= ERR_WARN; + + if (alertif_too_many_args_idx(2, 1, file, line, args, &err_code)) + 
goto out; + + curpx->options2 &= ~PR_O2_CHK_ANY; + curpx->options2 |= PR_O2_TCPCHK_CHK; + + free_tcpcheck_vars(&rules->preset_vars); + rules->list = NULL; + rules->flags = 0; + + cur_arg += 2; + if (*args[cur_arg] && *args[cur_arg+1] && + (strcmp(args[cur_arg], "EHLO") == 0 || strcmp(args[cur_arg], "HELO") == 0)) { + /* <EHLO|HELO> + space (1) + <host> + null byte (1) */ + size_t len = strlen(args[cur_arg]) + 1 + strlen(args[cur_arg+1]) + 1; + cmd = calloc(1, len); + if (cmd) + snprintf(cmd, len, "%s %s", args[cur_arg], args[cur_arg+1]); + } + else { + /* this just hits the default for now, but you could potentially expand it to allow for other stuff + though, it's unlikely you'd want to send anything other than an EHLO or HELO */ + cmd = strdup("HELO localhost"); + } + + var = create_tcpcheck_var(ist("check.smtp_cmd")); + if (cmd == NULL || var == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, line); + goto error; + } + var->data.type = SMP_T_STR; + var->data.u.str.area = cmd; + var->data.u.str.data = strlen(cmd); + LIST_INIT(&var->list); + LIST_APPEND(&rules->preset_vars, &var->list); + cmd = NULL; + var = NULL; + + rs = find_tcpcheck_ruleset("*smtp-check"); + if (rs) + goto ruleset_found; + + rs = create_tcpcheck_ruleset("*smtp-check"); + if (rs == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, line); + goto error; + } + + chk = parse_tcpcheck_connect((char *[]){"tcp-check", "connect", "default", "linger", ""}, + 1, curpx, &rs->rules, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = 0; + LIST_APPEND(&rs->rules, &chk->list); + + chk = parse_tcpcheck_expect((char *[]){"tcp-check", "expect", "rstring", "^[0-9]{3}[ \r]", + "min-recv", "4", + "error-status", "L7RSP", + "on-error", "%[res.payload(0,0),cut_crlf]", + ""}, + 1, curpx, &rs->rules, TCPCHK_RULES_SMTP_CHK, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = 1; + LIST_APPEND(&rs->rules, &chk->list); + + chk = parse_tcpcheck_expect((char *[]){"tcp-check", "expect", "rstring", "^2[0-9]{2}[ \r]", + "min-recv", "4", + "error-status", "L7STS", + "on-error", "%[res.payload(4,0),ltrim(' '),cut_crlf]", + "status-code", "res.payload(0,3)", + ""}, + 1, curpx, &rs->rules, TCPCHK_RULES_SMTP_CHK, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = 2; + LIST_APPEND(&rs->rules, &chk->list); + + chk = parse_tcpcheck_send((char *[]){"tcp-check", "send-lf", smtp_req, ""}, + 1, curpx, &rs->rules, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = 3; + LIST_APPEND(&rs->rules, &chk->list); + + chk = parse_tcpcheck_expect((char *[]){"tcp-check", "expect", "rstring", "^(2[0-9]{2}-[^\r]*\r\n)*2[0-9]{2}[ \r]", + "error-status", "L7STS", + "on-error", "%[res.payload(4,0),ltrim(' '),cut_crlf]", + "on-success", "%[res.payload(4,0),ltrim(' '),cut_crlf]", + "status-code", "res.payload(0,3)", + ""}, + 1, curpx, &rs->rules, TCPCHK_RULES_SMTP_CHK, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = 4; + LIST_APPEND(&rs->rules, &chk->list); + + /* Send an SMTP QUIT to ensure clean disconnect (issue 1812), and expect a 2xx response code */ + + chk = parse_tcpcheck_send((char *[]){"tcp-check", "send", "QUIT\r\n", ""}, + 1, curpx, &rs->rules, file, line, &errmsg); + 
if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = 5; + LIST_APPEND(&rs->rules, &chk->list); + + chk = parse_tcpcheck_expect((char *[]){"tcp-check", "expect", "rstring", "^2[0-9]{2}[- \r]", + "min-recv", "4", + "error-status", "L7STS", + "on-error", "%[res.payload(4,0),ltrim(' '),cut_crlf]", + "on-success", "%[res.payload(4,0),ltrim(' '),cut_crlf]", + "status-code", "res.payload(0,3)", + ""}, + 1, curpx, &rs->rules, TCPCHK_RULES_SMTP_CHK, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = 6; + LIST_APPEND(&rs->rules, &chk->list); + + ruleset_found: + rules->list = &rs->rules; + rules->flags &= ~(TCPCHK_RULES_PROTO_CHK|TCPCHK_RULES_UNUSED_RS); + rules->flags |= TCPCHK_RULES_SMTP_CHK; + + out: + free(errmsg); + return err_code; + + error: + free(cmd); + free(var); + free_tcpcheck_vars(&rules->preset_vars); + free_tcpcheck_ruleset(rs); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +} + +/* Parses the "option pgsql-check" proxy keyword */ +int proxy_parse_pgsql_check_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line) +{ + static char pgsql_req[] = { + "%[var(check.plen),htonl,hex]" /* The packet length*/ + "00030000" /* the version 3.0 */ + "7573657200" /* "user" key */ + "%[var(check.username),hex]00" /* the username */ + "00" + }; + + struct tcpcheck_ruleset *rs = NULL; + struct tcpcheck_rules *rules = &curpx->tcpcheck_rules; + struct tcpcheck_rule *chk; + struct tcpcheck_var *var = NULL; + char *user = NULL, *errmsg = NULL; + size_t packetlen = 0; + int err_code = 0; + + if (warnifnotcap(curpx, PR_CAP_BE, file, line, args[cur_arg+1], NULL)) + err_code |= ERR_WARN; + + if (alertif_too_many_args_idx(2, 1, file, line, args, &err_code)) + goto out; + + curpx->options2 &= ~PR_O2_CHK_ANY; + curpx->options2 |= PR_O2_TCPCHK_CHK; + + free_tcpcheck_vars(&rules->preset_vars); + rules->list = NULL; + rules->flags = 0; + + cur_arg += 2; + if (!*args[cur_arg] || !*args[cur_arg+1]) { + ha_alert("parsing [%s:%d] : '%s %s' expects 'user <username>' as argument.\n", + file, line, args[0], args[1]); + goto error; + } + if (strcmp(args[cur_arg], "user") == 0) { + packetlen = 15 + strlen(args[cur_arg+1]); + user = strdup(args[cur_arg+1]); + + var = create_tcpcheck_var(ist("check.username")); + if (user == NULL || var == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, line); + goto error; + } + var->data.type = SMP_T_STR; + var->data.u.str.area = user; + var->data.u.str.data = strlen(user); + LIST_INIT(&var->list); + LIST_APPEND(&rules->preset_vars, &var->list); + user = NULL; + var = NULL; + + var = create_tcpcheck_var(ist("check.plen")); + if (var == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, line); + goto error; + } + var->data.type = SMP_T_SINT; + var->data.u.sint = packetlen; + LIST_INIT(&var->list); + LIST_APPEND(&rules->preset_vars, &var->list); + var = NULL; + } + else { + ha_alert("parsing [%s:%d] : '%s %s' only supports optional values: 'user'.\n", + file, line, args[0], args[1]); + goto error; + } + + rs = find_tcpcheck_ruleset("*pgsql-check"); + if (rs) + goto ruleset_found; + + rs = create_tcpcheck_ruleset("*pgsql-check"); + if (rs == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, line); + goto error; + } + + chk = parse_tcpcheck_connect((char *[]){"tcp-check", "connect", "default", "linger", ""}, + 1, curpx, &rs->rules, file, line, &errmsg); + if (!chk) { + 
ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = 0; + LIST_APPEND(&rs->rules, &chk->list); + + chk = parse_tcpcheck_send((char *[]){"tcp-check", "send-binary-lf", pgsql_req, ""}, + 1, curpx, &rs->rules, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = 1; + LIST_APPEND(&rs->rules, &chk->list); + + chk = parse_tcpcheck_expect((char *[]){"tcp-check", "expect", "!rstring", "^E", + "min-recv", "5", + "error-status", "L7RSP", + "on-error", "%[res.payload(6,0)]", + ""}, + 1, curpx, &rs->rules, TCPCHK_RULES_PGSQL_CHK, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = 2; + LIST_APPEND(&rs->rules, &chk->list); + + chk = parse_tcpcheck_expect((char *[]){"tcp-check", "expect", "rbinary", "^52000000[A-Z0-9]{2}000000(00|02|03|04|05|06|07|09|0A)", + "min-recv", "9", + "error-status", "L7STS", + "on-success", "PostgreSQL server is ok", + "on-error", "PostgreSQL unknown error", + ""}, + 1, curpx, &rs->rules, TCPCHK_RULES_PGSQL_CHK, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = 3; + LIST_APPEND(&rs->rules, &chk->list); + + ruleset_found: + rules->list = &rs->rules; + rules->flags &= ~(TCPCHK_RULES_PROTO_CHK|TCPCHK_RULES_UNUSED_RS); + rules->flags |= TCPCHK_RULES_PGSQL_CHK; + + out: + free(errmsg); + return err_code; + + error: + free(user); + free(var); + free_tcpcheck_vars(&rules->preset_vars); + free_tcpcheck_ruleset(rs); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +} + + +/* Parses the "option mysql-check" proxy keyword */ +int proxy_parse_mysql_check_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line) +{ + /* This is an example of a MySQL >=4.0 client Authentication packet kindly provided by Cyril Bonte. + * const char mysql40_client_auth_pkt[] = { + * "\x0e\x00\x00" // packet length + * "\x01" // packet number + * "\x00\x00" // client capabilities + * "\x00\x00\x01" // max packet + * "haproxy\x00" // username (null terminated string) + * "\x00" // filler (always 0x00) + * "\x01\x00\x00" // packet length + * "\x00" // packet number + * "\x01" // COM_QUIT command + * }; + */ + static char mysql40_rsname[] = "*mysql40-check"; + static char mysql40_req[] = { + "%[var(check.header),hex]" /* 3 bytes for the packet length and 1 byte for the sequence ID */ + "0080" /* client capabilities */ + "000001" /* max packet */ + "%[var(check.username),hex]00" /* the username */ + "00" /* filler (always 0x00) */ + "010000" /* packet length*/ + "00" /* sequence ID */ + "01" /* COM_QUIT command */ + }; + + /* This is an example of a MySQL >=4.1 client Authentication packet provided by Nenad Merdanovic. 
+ * const char mysql41_client_auth_pkt[] = { + * "\x0e\x00\x00" // packet length + * "\x01" // packet number + * "\x00\x00\x00\x00" // client capabilities + * "\x00\x00\x00\x01" // max packet + * "\x21" // character set (UTF-8) + * char[23] // All zeroes + * "haproxy\x00" // username (null terminated string) + * "\x00" // filler (always 0x00) + * "\x01\x00\x00" // packet length + * "\x00" // packet number + * "\x01" // COM_QUIT command + * }; + */ + static char mysql41_rsname[] = "*mysql41-check"; + static char mysql41_req[] = { + "%[var(check.header),hex]" /* 3 bytes for the packet length and 1 byte for the sequence ID */ + "00820000" /* client capabilities */ + "00800001" /* max packet */ + "21" /* character set (UTF-8) */ + "000000000000000000000000" /* 23 bytes, all zeroes */ + "0000000000000000000000" + "%[var(check.username),hex]00" /* the username */ + "00" /* filler (always 0x00) */ + "010000" /* packet length */ + "00" /* sequence ID */ + "01" /* COM_QUIT command */ + }; + + struct tcpcheck_ruleset *rs = NULL; + struct tcpcheck_rules *rules = &curpx->tcpcheck_rules; + struct tcpcheck_rule *chk; + struct tcpcheck_var *var = NULL; + char *mysql_rsname = "*mysql-check"; + char *mysql_req = NULL, *hdr = NULL, *user = NULL, *errmsg = NULL; + int index = 0, err_code = 0; + + if (warnifnotcap(curpx, PR_CAP_BE, file, line, args[cur_arg+1], NULL)) + err_code |= ERR_WARN; + + if (alertif_too_many_args_idx(3, 1, file, line, args, &err_code)) + goto out; + + curpx->options2 &= ~PR_O2_CHK_ANY; + curpx->options2 |= PR_O2_TCPCHK_CHK; + + free_tcpcheck_vars(&rules->preset_vars); + rules->list = NULL; + rules->flags = 0; + + cur_arg += 2; + if (*args[cur_arg]) { + int packetlen, userlen; + + if (strcmp(args[cur_arg], "user") != 0) { + ha_alert("parsing [%s:%d] : '%s %s' only supports optional values: 'user' (got '%s').\n", + file, line, args[0], args[1], args[cur_arg]); + goto error; + } + + if (*(args[cur_arg+1]) == 0) { + ha_alert("parsing [%s:%d] : '%s %s %s' expects <username> as argument.\n", + file, line, args[0], args[1], args[cur_arg]); + goto error; + } + + hdr = calloc(4, sizeof(*hdr)); + user = strdup(args[cur_arg+1]); + userlen = strlen(args[cur_arg+1]); + + if (hdr == NULL || user == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, line); + goto error; + } + + if (!*args[cur_arg+2] || strcmp(args[cur_arg+2], "post-41") == 0) { + packetlen = userlen + 7 + 27; + mysql_req = mysql41_req; + mysql_rsname = mysql41_rsname; + } + else if (strcmp(args[cur_arg+2], "pre-41") == 0) { + packetlen = userlen + 7; + mysql_req = mysql40_req; + mysql_rsname = mysql40_rsname; + } + else { + ha_alert("parsing [%s:%d] : keyword '%s' only supports 'post-41' and 'pre-41' (got '%s').\n", + file, line, args[cur_arg], args[cur_arg+2]); + goto error; + } + + hdr[0] = (unsigned char)(packetlen & 0xff); + hdr[1] = (unsigned char)((packetlen >> 8) & 0xff); + hdr[2] = (unsigned char)((packetlen >> 16) & 0xff); + hdr[3] = 1; + + var = create_tcpcheck_var(ist("check.header")); + if (var == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, line); + goto error; + } + var->data.type = SMP_T_STR; + var->data.u.str.area = hdr; + var->data.u.str.data = 4; + LIST_INIT(&var->list); + LIST_APPEND(&rules->preset_vars, &var->list); + hdr = NULL; + var = NULL; + + var = create_tcpcheck_var(ist("check.username")); + if (var == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, line); + goto error; + } + var->data.type = SMP_T_STR; + var->data.u.str.area = user; + var->data.u.str.data =
strlen(user); + LIST_INIT(&var->list); + LIST_APPEND(&rules->preset_vars, &var->list); + user = NULL; + var = NULL; + } + + rs = find_tcpcheck_ruleset(mysql_rsname); + if (rs) + goto ruleset_found; + + rs = create_tcpcheck_ruleset(mysql_rsname); + if (rs == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, line); + goto error; + } + + chk = parse_tcpcheck_connect((char *[]){"tcp-check", "connect", "default", "linger", ""}, + 1, curpx, &rs->rules, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = index++; + LIST_APPEND(&rs->rules, &chk->list); + + if (mysql_req) { + chk = parse_tcpcheck_send((char *[]){"tcp-check", "send-binary-lf", mysql_req, ""}, + 1, curpx, &rs->rules, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = index++; + LIST_APPEND(&rs->rules, &chk->list); + } + + chk = parse_tcpcheck_expect((char *[]){"tcp-check", "expect", "custom", ""}, + 1, curpx, &rs->rules, TCPCHK_RULES_MYSQL_CHK, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->expect.custom = tcpcheck_mysql_expect_iniths; + chk->index = index++; + LIST_APPEND(&rs->rules, &chk->list); + + if (mysql_req) { + chk = parse_tcpcheck_expect((char *[]){"tcp-check", "expect", "custom", ""}, + 1, curpx, &rs->rules, TCPCHK_RULES_MYSQL_CHK, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->expect.custom = tcpcheck_mysql_expect_ok; + chk->index = index++; + LIST_APPEND(&rs->rules, &chk->list); + } + + ruleset_found: + rules->list = &rs->rules; + rules->flags &= ~(TCPCHK_RULES_PROTO_CHK|TCPCHK_RULES_UNUSED_RS); + rules->flags |= TCPCHK_RULES_MYSQL_CHK; + + out: + free(errmsg); + return err_code; + + error: + free(hdr); + free(user); + free(var); + free_tcpcheck_vars(&rules->preset_vars); + free_tcpcheck_ruleset(rs); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +} + +int proxy_parse_ldap_check_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line) +{ + static char *ldap_req = "300C020101600702010304008000"; + + struct tcpcheck_ruleset *rs = NULL; + struct tcpcheck_rules *rules = &curpx->tcpcheck_rules; + struct tcpcheck_rule *chk; + char *errmsg = NULL; + int err_code = 0; + + if (warnifnotcap(curpx, PR_CAP_BE, file, line, args[cur_arg+1], NULL)) + err_code |= ERR_WARN; + + if (alertif_too_many_args_idx(0, 1, file, line, args, &err_code)) + goto out; + + curpx->options2 &= ~PR_O2_CHK_ANY; + curpx->options2 |= PR_O2_TCPCHK_CHK; + + free_tcpcheck_vars(&rules->preset_vars); + rules->list = NULL; + rules->flags = 0; + + rs = find_tcpcheck_ruleset("*ldap-check"); + if (rs) + goto ruleset_found; + + rs = create_tcpcheck_ruleset("*ldap-check"); + if (rs == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, line); + goto error; + } + + chk = parse_tcpcheck_send((char *[]){"tcp-check", "send-binary", ldap_req, ""}, + 1, curpx, &rs->rules, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = 0; + LIST_APPEND(&rs->rules, &chk->list); + + chk = parse_tcpcheck_expect((char *[]){"tcp-check", "expect", "rbinary", "^30", + "min-recv", "14", + "on-error", "Not LDAPv3 protocol", + ""}, + 1, curpx, &rs->rules, TCPCHK_RULES_LDAP_CHK, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, 
line, errmsg); + goto error; + } + chk->index = 1; + LIST_APPEND(&rs->rules, &chk->list); + + chk = parse_tcpcheck_expect((char *[]){"tcp-check", "expect", "custom", ""}, + 1, curpx, &rs->rules, TCPCHK_RULES_LDAP_CHK, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->expect.custom = tcpcheck_ldap_expect_bindrsp; + chk->index = 2; + LIST_APPEND(&rs->rules, &chk->list); + + ruleset_found: + rules->list = &rs->rules; + rules->flags &= ~(TCPCHK_RULES_PROTO_CHK|TCPCHK_RULES_UNUSED_RS); + rules->flags |= TCPCHK_RULES_LDAP_CHK; + + out: + free(errmsg); + return err_code; + + error: + free_tcpcheck_ruleset(rs); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +} + +int proxy_parse_spop_check_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line) +{ + struct tcpcheck_ruleset *rs = NULL; + struct tcpcheck_rules *rules = &curpx->tcpcheck_rules; + struct tcpcheck_rule *chk; + char *spop_req = NULL; + char *errmsg = NULL; + int spop_len = 0, err_code = 0; + + if (warnifnotcap(curpx, PR_CAP_BE, file, line, args[cur_arg+1], NULL)) + err_code |= ERR_WARN; + + if (alertif_too_many_args_idx(0, 1, file, line, args, &err_code)) + goto out; + + curpx->options2 &= ~PR_O2_CHK_ANY; + curpx->options2 |= PR_O2_TCPCHK_CHK; + + free_tcpcheck_vars(&rules->preset_vars); + rules->list = NULL; + rules->flags = 0; + + + rs = find_tcpcheck_ruleset("*spop-check"); + if (rs) + goto ruleset_found; + + rs = create_tcpcheck_ruleset("*spop-check"); + if (rs == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, line); + goto error; + } + + if (spoe_prepare_healthcheck_request(&spop_req, &spop_len) == -1) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, line); + goto error; + } + chunk_reset(&trash); + dump_binary(&trash, spop_req, spop_len); + trash.area[trash.data] = '\0'; + + chk = parse_tcpcheck_send((char *[]){"tcp-check", "send-binary", b_head(&trash), ""}, + 1, curpx, &rs->rules, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->index = 0; + LIST_APPEND(&rs->rules, &chk->list); + + chk = parse_tcpcheck_expect((char *[]){"tcp-check", "expect", "custom", "min-recv", "4", ""}, + 1, curpx, &rs->rules, TCPCHK_RULES_SPOP_CHK, file, line, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : %s\n", file, line, errmsg); + goto error; + } + chk->expect.custom = tcpcheck_spop_expect_agenthello; + chk->index = 1; + LIST_APPEND(&rs->rules, &chk->list); + + ruleset_found: + rules->list = &rs->rules; + rules->flags &= ~(TCPCHK_RULES_PROTO_CHK|TCPCHK_RULES_UNUSED_RS); + rules->flags |= TCPCHK_RULES_SPOP_CHK; + + out: + free(spop_req); + free(errmsg); + return err_code; + + error: + free_tcpcheck_ruleset(rs); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +} + + +static struct tcpcheck_rule *proxy_parse_httpchk_req(char **args, int cur_arg, struct proxy *px, char **errmsg) +{ + struct tcpcheck_rule *chk = NULL; + struct tcpcheck_http_hdr *hdr = NULL; + char *meth = NULL, *uri = NULL, *vsn = NULL; + char *hdrs, *body; + + hdrs = (*args[cur_arg+2] ? strstr(args[cur_arg+2], "\r\n") : NULL); + body = (*args[cur_arg+2] ? strstr(args[cur_arg+2], "\r\n\r\n") : NULL); + if (hdrs || body) { + memprintf(errmsg, "hiding headers or body at the end of the version string is unsupported." 
+ "Use 'http-check send' directive instead."); + goto error; + } + + chk = calloc(1, sizeof(*chk)); + if (!chk) { + memprintf(errmsg, "out of memory"); + goto error; + } + chk->action = TCPCHK_ACT_SEND; + chk->send.type = TCPCHK_SEND_HTTP; + chk->send.http.flags |= TCPCHK_SND_HTTP_FROM_OPT; + chk->send.http.meth.meth = HTTP_METH_OPTIONS; + LIST_INIT(&chk->send.http.hdrs); + + /* Copy the method, uri and version */ + if (*args[cur_arg]) { + if (!*args[cur_arg+1]) + uri = args[cur_arg]; + else + meth = args[cur_arg]; + } + if (*args[cur_arg+1]) + uri = args[cur_arg+1]; + if (*args[cur_arg+2]) + vsn = args[cur_arg+2]; + + if (meth) { + chk->send.http.meth.meth = find_http_meth(meth, strlen(meth)); + chk->send.http.meth.str.area = strdup(meth); + chk->send.http.meth.str.data = strlen(meth); + if (!chk->send.http.meth.str.area) { + memprintf(errmsg, "out of memory"); + goto error; + } + } + if (uri) { + chk->send.http.uri = ist(strdup(uri)); + if (!isttest(chk->send.http.uri)) { + memprintf(errmsg, "out of memory"); + goto error; + } + } + if (vsn) { + chk->send.http.vsn = ist(strdup(vsn)); + if (!isttest(chk->send.http.vsn)) { + memprintf(errmsg, "out of memory"); + goto error; + } + } + + return chk; + + error: + free_tcpcheck_http_hdr(hdr); + free_tcpcheck(chk, 0); + return NULL; +} + +/* Parses the "option httpchck" proxy keyword */ +int proxy_parse_httpchk_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line) +{ + struct tcpcheck_ruleset *rs = NULL; + struct tcpcheck_rules *rules = &curpx->tcpcheck_rules; + struct tcpcheck_rule *chk; + char *errmsg = NULL; + int err_code = 0; + + if (warnifnotcap(curpx, PR_CAP_BE, file, line, args[cur_arg+1], NULL)) + err_code |= ERR_WARN; + + if (alertif_too_many_args_idx(3, 1, file, line, args, &err_code)) + goto out; + + chk = proxy_parse_httpchk_req(args, cur_arg+2, curpx, &errmsg); + if (!chk) { + ha_alert("parsing [%s:%d] : '%s %s' : %s.\n", file, line, args[0], args[1], errmsg); + goto error; + } + if (errmsg) { + ha_warning("parsing [%s:%d]: '%s %s' : %s\n", file, line, args[0], args[1], errmsg); + err_code |= ERR_WARN; + ha_free(&errmsg); + } + + no_request: + curpx->options2 &= ~PR_O2_CHK_ANY; + curpx->options2 |= PR_O2_TCPCHK_CHK; + + free_tcpcheck_vars(&rules->preset_vars); + rules->list = NULL; + rules->flags |= TCPCHK_SND_HTTP_FROM_OPT; + + /* Deduce the ruleset name from the proxy info */ + chunk_printf(&trash, "*http-check-%s_%s-%d", + ((curpx == defpx) ? 
"defaults" : curpx->id), + curpx->conf.file, curpx->conf.line); + + rs = find_tcpcheck_ruleset(b_orig(&trash)); + if (rs == NULL) { + rs = create_tcpcheck_ruleset(b_orig(&trash)); + if (rs == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, line); + goto error; + } + } + + rules->list = &rs->rules; + rules->flags &= ~(TCPCHK_RULES_PROTO_CHK|TCPCHK_RULES_UNUSED_RS); + rules->flags |= TCPCHK_RULES_HTTP_CHK; + if (!tcpcheck_add_http_rule(chk, rules, &errmsg)) { + ha_alert("parsing [%s:%d] : '%s %s' : %s.\n", file, line, args[0], args[1], errmsg); + rules->list = NULL; + goto error; + } + + out: + free(errmsg); + return err_code; + + error: + free_tcpcheck_ruleset(rs); + free_tcpcheck(chk, 0); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +} + +/* Parses the "option tcp-check" proxy keyword */ +int proxy_parse_tcp_check_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line) +{ + struct tcpcheck_ruleset *rs = NULL; + struct tcpcheck_rules *rules = &curpx->tcpcheck_rules; + int err_code = 0; + + if (warnifnotcap(curpx, PR_CAP_BE, file, line, args[cur_arg+1], NULL)) + err_code |= ERR_WARN; + + if (alertif_too_many_args_idx(0, 1, file, line, args, &err_code)) + goto out; + + curpx->options2 &= ~PR_O2_CHK_ANY; + curpx->options2 |= PR_O2_TCPCHK_CHK; + + if ((rules->flags & TCPCHK_RULES_PROTO_CHK) == TCPCHK_RULES_TCP_CHK) { + /* If a tcp-check rulesset is already set, do nothing */ + if (rules->list) + goto out; + + /* If a tcp-check ruleset is waiting to be used for the current proxy, + * get it. + */ + if (rules->flags & TCPCHK_RULES_UNUSED_TCP_RS) + goto curpx_ruleset; + + /* Otherwise, try to get the tcp-check ruleset of the default proxy */ + chunk_printf(&trash, "*tcp-check-defaults_%s-%d", defpx->conf.file, defpx->conf.line); + rs = find_tcpcheck_ruleset(b_orig(&trash)); + if (rs) + goto ruleset_found; + } + + curpx_ruleset: + /* Deduce the ruleset name from the proxy info */ + chunk_printf(&trash, "*tcp-check-%s_%s-%d", + ((curpx == defpx) ? "defaults" : curpx->id), + curpx->conf.file, curpx->conf.line); + + rs = find_tcpcheck_ruleset(b_orig(&trash)); + if (rs == NULL) { + rs = create_tcpcheck_ruleset(b_orig(&trash)); + if (rs == NULL) { + ha_alert("parsing [%s:%d] : out of memory.\n", file, line); + goto error; + } + } + + ruleset_found: + free_tcpcheck_vars(&rules->preset_vars); + rules->list = &rs->rules; + rules->flags &= ~(TCPCHK_RULES_PROTO_CHK|TCPCHK_RULES_UNUSED_RS); + rules->flags |= TCPCHK_RULES_TCP_CHK; + + out: + return err_code; + + error: + err_code |= ERR_ALERT | ERR_FATAL; + goto out; +} + +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_LISTEN, "http-check", proxy_parse_httpcheck }, + { CFG_LISTEN, "tcp-check", proxy_parse_tcpcheck }, + { 0, NULL, NULL }, +}}; + +REGISTER_POST_PROXY_CHECK(check_proxy_tcpcheck); +REGISTER_PROXY_DEINIT(deinit_proxy_tcpcheck); +REGISTER_POST_DEINIT(deinit_tcpchecks); +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); diff --git a/src/thread.c b/src/thread.c new file mode 100644 index 0000000..ab4342d --- /dev/null +++ b/src/thread.c @@ -0,0 +1,1864 @@ +/* + * functions about threads. + * + * Copyright (C) 2017 Christopher Fauet - cfaulet@haproxy.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#define _GNU_SOURCE +#include <unistd.h> +#include <stdlib.h> + +#include <signal.h> +#include <unistd.h> +#ifdef _POSIX_PRIORITY_SCHEDULING +#include <sched.h> +#endif + +#ifdef USE_THREAD +# include <pthread.h> +#endif + +#ifdef USE_CPU_AFFINITY +# include <sched.h> +# if defined(__FreeBSD__) || defined(__DragonFly__) +# include <sys/param.h> +# ifdef __FreeBSD__ +# include <sys/cpuset.h> +# endif +# include <pthread_np.h> +# endif +# ifdef __APPLE__ +# include <mach/mach_types.h> +# include <mach/thread_act.h> +# include <mach/thread_policy.h> +# endif +# include <haproxy/cpuset.h> +#endif + +#include <haproxy/cfgparse.h> +#include <haproxy/clock.h> +#include <haproxy/fd.h> +#include <haproxy/global.h> +#include <haproxy/log.h> +#include <haproxy/thread.h> +#include <haproxy/tools.h> + +struct tgroup_info ha_tgroup_info[MAX_TGROUPS] = { }; +THREAD_LOCAL const struct tgroup_info *tg = &ha_tgroup_info[0]; + +struct thread_info ha_thread_info[MAX_THREADS] = { }; +THREAD_LOCAL const struct thread_info *ti = &ha_thread_info[0]; + +struct tgroup_ctx ha_tgroup_ctx[MAX_TGROUPS] = { }; +THREAD_LOCAL struct tgroup_ctx *tg_ctx = &ha_tgroup_ctx[0]; + +struct thread_ctx ha_thread_ctx[MAX_THREADS] = { }; +THREAD_LOCAL struct thread_ctx *th_ctx = &ha_thread_ctx[0]; + +#ifdef USE_THREAD + +volatile unsigned long all_tgroups_mask __read_mostly = 1; // nbtgroup 1 assumed by default +volatile unsigned int rdv_requests = 0; // total number of threads requesting RDV +volatile unsigned int isolated_thread = ~0; // ID of the isolated thread, or ~0 when none +THREAD_LOCAL unsigned int tgid = 1; // thread ID starts at 1 +THREAD_LOCAL unsigned int tid = 0; +int thread_cpus_enabled_at_boot = 1; +static pthread_t ha_pthread[MAX_THREADS] = { }; + +/* Marks the thread as harmless until the last thread using the rendez-vous + * point quits. Given that we can wait for a long time, sched_yield() is + * used when available to offer the CPU resources to competing threads if + * needed. + */ +void thread_harmless_till_end() +{ + _HA_ATOMIC_OR(&tg_ctx->threads_harmless, ti->ltid_bit); + while (_HA_ATOMIC_LOAD(&rdv_requests) != 0) { + ha_thread_relax(); + } +} + +/* Isolates the current thread : request the ability to work while all other + * threads are harmless, as defined by thread_harmless_now() (i.e. they're not + * going to touch any visible memory area). Only returns once all of them are + * harmless, with the current thread's bit in &tg_ctx->threads_harmless cleared. + * Needs to be completed using thread_release(). + */ +void thread_isolate() +{ + uint tgrp, thr; + + _HA_ATOMIC_OR(&tg_ctx->threads_harmless, ti->ltid_bit); + __ha_barrier_atomic_store(); + _HA_ATOMIC_INC(&rdv_requests); + + /* wait for all threads to become harmless. They cannot change their + * mind once seen thanks to rdv_requests above, unless they pass in + * front of us. For this reason we proceed in 4 steps: + * 1) wait for all threads to declare themselves harmless + * 2) try to grab the isolated_thread exclusivity + * 3) verify again that all threads are harmless, since another one + * that was isolating between 1 and 2 could have dropped its + * harmless state there. + * 4) drop harmless flag (which also has the benefit of leaving + * all other threads wait on reads instead of writes. 
+ */ + while (1) { + for (tgrp = 0; tgrp < global.nbtgroups; tgrp++) { + do { + ulong te = _HA_ATOMIC_LOAD(&ha_tgroup_info[tgrp].threads_enabled); + ulong th = _HA_ATOMIC_LOAD(&ha_tgroup_ctx[tgrp].threads_harmless); + + if ((th & te) == te) + break; + ha_thread_relax(); + } while (1); + } + + /* all other ones are harmless. isolated_thread will contain + * ~0U if no other one competes, !=tid if another one got it, + * tid if the current thread already grabbed it on the previous + * round. + */ + thr = _HA_ATOMIC_LOAD(&isolated_thread); + if (thr == tid) + break; // we won and we're certain everyone is harmless + + /* try to win the race against others */ + if (thr != ~0U || !_HA_ATOMIC_CAS(&isolated_thread, &thr, tid)) + ha_thread_relax(); + } + + /* the thread is no longer harmless as it runs */ + _HA_ATOMIC_AND(&tg_ctx->threads_harmless, ~ti->ltid_bit); + + /* the thread is isolated until it calls thread_release() which will + * 1) reset isolated_thread to ~0; + * 2) decrement rdv_requests. + */ +} + +/* Isolates the current thread : request the ability to work while all other + * threads are idle, as defined by thread_idle_now(). It only returns once + * all of them are both harmless and idle, with the current thread's bit in + * &tg_ctx->threads_harmless and idle_mask cleared. Needs to be completed using + * thread_release(). By doing so the thread also engages in being safe against + * any actions that other threads might be about to start under the same + * conditions. This specifically targets destruction of any internal structure, + * which implies that the current thread may not hold references to any object. + * + * Note that a concurrent thread_isolate() will usually win against + * thread_isolate_full() as it doesn't consider the idle_mask, allowing it to + * get back to the poller or any other fully idle location, that will + * ultimately release this one. + */ +void thread_isolate_full() +{ + uint tgrp, thr; + + _HA_ATOMIC_OR(&tg_ctx->threads_idle, ti->ltid_bit); + _HA_ATOMIC_OR(&tg_ctx->threads_harmless, ti->ltid_bit); + __ha_barrier_atomic_store(); + _HA_ATOMIC_INC(&rdv_requests); + + /* wait for all threads to become harmless. They cannot change their + * mind once seen thanks to rdv_requests above, unless they pass in + * front of us. For this reason we proceed in 4 steps: + * 1) wait for all threads to declare themselves harmless + * 2) try to grab the isolated_thread exclusivity + * 3) verify again that all threads are harmless, since another one + * that was isolating between 1 and 2 could have dropped its + * harmless state there. + * 4) drop harmless flag (which also has the benefit of leaving + * all other threads wait on reads instead of writes. + */ + while (1) { + for (tgrp = 0; tgrp < global.nbtgroups; tgrp++) { + do { + ulong te = _HA_ATOMIC_LOAD(&ha_tgroup_info[tgrp].threads_enabled); + ulong th = _HA_ATOMIC_LOAD(&ha_tgroup_ctx[tgrp].threads_harmless); + ulong id = _HA_ATOMIC_LOAD(&ha_tgroup_ctx[tgrp].threads_idle); + + if ((th & id & te) == te) + break; + ha_thread_relax(); + } while (1); + } + + /* all other ones are harmless and idle. isolated_thread will + * contain ~0U if no other one competes, !=tid if another one + * got it, tid if the current thread already grabbed it on the + * previous round. 
+ */ + thr = _HA_ATOMIC_LOAD(&isolated_thread); + if (thr == tid) + break; // we won and we're certain everyone is harmless + + if (thr != ~0U || !_HA_ATOMIC_CAS(&isolated_thread, &thr, tid)) + ha_thread_relax(); + } + + /* we're not idle nor harmless anymore at this point. Other threads + * waiting on this condition will need to wait until out next pass to + * the poller, or our next call to thread_isolate_full(). + */ + _HA_ATOMIC_AND(&tg_ctx->threads_idle, ~ti->ltid_bit); + _HA_ATOMIC_AND(&tg_ctx->threads_harmless, ~ti->ltid_bit); + + /* the thread is isolated until it calls thread_release() which will + * 1) reset isolated_thread to ~0; + * 2) decrement rdv_requests. + */ +} + +/* Cancels the effect of thread_isolate() by resetting the ID of the isolated + * thread and decrementing the number of RDV requesters. This immediately allows + * other threads to expect to be executed, though they will first have to wait + * for this thread to become harmless again (possibly by reaching the poller + * again). + */ +void thread_release() +{ + HA_ATOMIC_STORE(&isolated_thread, ~0U); + HA_ATOMIC_DEC(&rdv_requests); +} + +/* Sets up threads, signals and masks, and starts threads 2 and above. + * Does nothing when threads are disabled. + */ +void setup_extra_threads(void *(*handler)(void *)) +{ + sigset_t blocked_sig, old_sig; + int i; + + /* ensure the signals will be blocked in every thread */ + sigfillset(&blocked_sig); + sigdelset(&blocked_sig, SIGPROF); + sigdelset(&blocked_sig, SIGBUS); + sigdelset(&blocked_sig, SIGFPE); + sigdelset(&blocked_sig, SIGILL); + sigdelset(&blocked_sig, SIGSEGV); + pthread_sigmask(SIG_SETMASK, &blocked_sig, &old_sig); + + /* Create nbthread-1 thread. The first thread is the current process */ + ha_pthread[0] = pthread_self(); + for (i = 1; i < global.nbthread; i++) + pthread_create(&ha_pthread[i], NULL, handler, &ha_thread_info[i]); +} + +/* waits for all threads to terminate. Does nothing when threads are + * disabled. + */ +void wait_for_threads_completion() +{ + int i; + + /* Wait the end of other threads */ + for (i = 1; i < global.nbthread; i++) + pthread_join(ha_pthread[i], NULL); + +#if defined(DEBUG_THREAD) || defined(DEBUG_FULL) + show_lock_stats(); +#endif +} + +/* Tries to set the current thread's CPU affinity according to the cpu_map */ +void set_thread_cpu_affinity() +{ +#if defined(USE_CPU_AFFINITY) + /* no affinity setting for the master process */ + if (master) + return; + + /* Now the CPU affinity for all threads */ + if (ha_cpuset_count(&cpu_map[tgid - 1].thread[ti->ltid])) {/* only do this if the thread has a THREAD map */ +# if defined(__APPLE__) + /* Note: this API is limited to the first 32/64 CPUs */ + unsigned long set = cpu_map[tgid - 1].thread[ti->ltid].cpuset; + int j; + + while ((j = ffsl(set)) > 0) { + thread_affinity_policy_data_t cpu_set = { j - 1 }; + thread_port_t mthread; + + mthread = pthread_mach_thread_np(ha_pthread[tid]); + thread_policy_set(mthread, THREAD_AFFINITY_POLICY, (thread_policy_t)&cpu_set, 1); + set &= ~(1UL << (j - 1)); + } +# else + struct hap_cpuset *set = &cpu_map[tgid - 1].thread[ti->ltid]; + + pthread_setaffinity_np(ha_pthread[tid], sizeof(set->cpuset), &set->cpuset); +# endif + } +#endif /* USE_CPU_AFFINITY */ +} + +/* Retrieves the opaque pthread_t of thread <thr> cast to an unsigned long long + * since POSIX took great care of not specifying its representation, making it + * hard to export for post-mortem analysis. 
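To make the rendez-vous API above concrete, here is a minimal, hypothetical usage sketch: the caller name and the pruned structure are invented, only thread_isolate()/thread_release() come from this file. A thread isolates itself before destroying structures that other threads traverse lock-free, then releases them:

    /* hypothetical caller, not part of this commit */
    static void prune_shared_cache(void)
    {
        thread_isolate();   /* returns once all other threads are harmless */
        /* safe to unlink/free entries that readers normally access
         * without locks: no other thread makes progress here
         */
        thread_release();   /* resets isolated_thread, decrements rdv_requests */
    }

thread_isolate_full() would be used instead when the operation must also wait for the other threads to be idle, not merely harmless.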
For this reason we copy it into a + * union and will use the smallest scalar type at least as large as its size, + * which will keep endianness and alignment for all regular sizes. As a last + * resort we end up with a long long aligned to the first bytes in memory, which + * will be endian-dependent if pthread_t is larger than a long long (not seen + * yet). + */ +unsigned long long ha_get_pthread_id(unsigned int thr) +{ + union { + pthread_t t; + unsigned long long ll; + unsigned int i; + unsigned short s; + unsigned char c; + } u = { 0 }; + + u.t = ha_pthread[thr]; + + if (sizeof(u.t) <= sizeof(u.c)) + return u.c; + else if (sizeof(u.t) <= sizeof(u.s)) + return u.s; + else if (sizeof(u.t) <= sizeof(u.i)) + return u.i; + return u.ll; +} + +/* send signal <sig> to thread <thr> */ +void ha_tkill(unsigned int thr, int sig) +{ + pthread_kill(ha_pthread[thr], sig); +} + +/* send signal <sig> to all threads. The calling thread is signaled last in + * order to allow all threads to synchronize in the handler. + */ +void ha_tkillall(int sig) +{ + unsigned int thr; + + for (thr = 0; thr < global.nbthread; thr++) { + if (!(ha_thread_info[thr].tg->threads_enabled & ha_thread_info[thr].ltid_bit)) + continue; + if (thr == tid) + continue; + pthread_kill(ha_pthread[thr], sig); + } + raise(sig); +} + +void ha_thread_relax(void) +{ +#ifdef _POSIX_PRIORITY_SCHEDULING + sched_yield(); +#else + pl_cpu_relax(); +#endif +} + +/* these calls are used as callbacks at init time when debugging is on */ +void ha_spin_init(HA_SPINLOCK_T *l) +{ + HA_SPIN_INIT(l); +} + +/* these calls are used as callbacks at init time when debugging is on */ +void ha_rwlock_init(HA_RWLOCK_T *l) +{ + HA_RWLOCK_INIT(l); +} + +/* returns the number of CPUs the current process is enabled to run on, + * regardless of any MAX_THREADS limitation. + */ +static int thread_cpus_enabled() +{ + int ret = 1; + +#ifdef USE_CPU_AFFINITY +#if defined(__linux__) && defined(CPU_COUNT) + cpu_set_t mask; + + if (sched_getaffinity(0, sizeof(mask), &mask) == 0) + ret = CPU_COUNT(&mask); +#elif defined(__FreeBSD__) && defined(USE_CPU_AFFINITY) + cpuset_t cpuset; + if (cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, + sizeof(cpuset), &cpuset) == 0) + ret = CPU_COUNT(&cpuset); +#elif defined(__APPLE__) + ret = (int)sysconf(_SC_NPROCESSORS_ONLN); +#endif +#endif + ret = MAX(ret, 1); + return ret; +} + +/* Returns 1 if the cpu set is currently restricted for the process else 0. + * Currently only implemented for the Linux platform.
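The comparison that thread_cpu_mask_forced() performs can be reproduced in a standalone program. This sketch is illustrative only and assumes Linux (the same calls thread_cpus_enabled() uses there); run under e.g. "taskset -c 0-3" on a 16-CPU host, the affinity mask reports 4 usable CPUs while _SC_NPROCESSORS_ONLN still reports 16:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        cpu_set_t mask;
        long onln = sysconf(_SC_NPROCESSORS_ONLN);
        int usable = -1;

        if (sched_getaffinity(0, sizeof(mask), &mask) == 0)
            usable = CPU_COUNT(&mask);  /* CPUs the process may run on */
        printf("online=%ld usable=%d forced=%d\n", onln, usable, usable != onln);
        return 0;
    }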
+ */ +int thread_cpu_mask_forced() +{ +#if defined(__linux__) + const int cpus_avail = sysconf(_SC_NPROCESSORS_ONLN); + return cpus_avail != thread_cpus_enabled(); +#else + return 0; +#endif +} + +/* Below come the lock-debugging functions */ + +#if defined(DEBUG_THREAD) || defined(DEBUG_FULL) + +struct lock_stat lock_stats[LOCK_LABELS]; + +/* this is only used below */ +static const char *lock_label(enum lock_label label) +{ + switch (label) { + case TASK_RQ_LOCK: return "TASK_RQ"; + case TASK_WQ_LOCK: return "TASK_WQ"; + case LISTENER_LOCK: return "LISTENER"; + case PROXY_LOCK: return "PROXY"; + case SERVER_LOCK: return "SERVER"; + case LBPRM_LOCK: return "LBPRM"; + case SIGNALS_LOCK: return "SIGNALS"; + case STK_TABLE_LOCK: return "STK_TABLE"; + case STK_SESS_LOCK: return "STK_SESS"; + case APPLETS_LOCK: return "APPLETS"; + case PEER_LOCK: return "PEER"; + case SHCTX_LOCK: return "SHCTX"; + case SSL_LOCK: return "SSL"; + case SSL_GEN_CERTS_LOCK: return "SSL_GEN_CERTS"; + case PATREF_LOCK: return "PATREF"; + case PATEXP_LOCK: return "PATEXP"; + case VARS_LOCK: return "VARS"; + case COMP_POOL_LOCK: return "COMP_POOL"; + case LUA_LOCK: return "LUA"; + case NOTIF_LOCK: return "NOTIF"; + case SPOE_APPLET_LOCK: return "SPOE_APPLET"; + case DNS_LOCK: return "DNS"; + case PID_LIST_LOCK: return "PID_LIST"; + case EMAIL_ALERTS_LOCK: return "EMAIL_ALERTS"; + case PIPES_LOCK: return "PIPES"; + case TLSKEYS_REF_LOCK: return "TLSKEYS_REF"; + case AUTH_LOCK: return "AUTH"; + case RING_LOCK: return "RING"; + case DICT_LOCK: return "DICT"; + case PROTO_LOCK: return "PROTO"; + case QUEUE_LOCK: return "QUEUE"; + case CKCH_LOCK: return "CKCH"; + case SNI_LOCK: return "SNI"; + case SSL_SERVER_LOCK: return "SSL_SERVER"; + case SFT_LOCK: return "SFT"; + case IDLE_CONNS_LOCK: return "IDLE_CONNS"; + case OCSP_LOCK: return "OCSP"; + case QC_CID_LOCK: return "QC_CID"; + case CACHE_LOCK: return "CACHE"; + case OTHER_LOCK: return "OTHER"; + case DEBUG1_LOCK: return "DEBUG1"; + case DEBUG2_LOCK: return "DEBUG2"; + case DEBUG3_LOCK: return "DEBUG3"; + case DEBUG4_LOCK: return "DEBUG4"; + case DEBUG5_LOCK: return "DEBUG5"; + case LOCK_LABELS: break; /* keep compiler happy */ + }; + /* only way to come here is consecutive to an internal bug */ + abort(); +} + +void show_lock_stats() +{ + int lbl; + + for (lbl = 0; lbl < LOCK_LABELS; lbl++) { + if (!lock_stats[lbl].num_write_locked && + !lock_stats[lbl].num_seek_locked && + !lock_stats[lbl].num_read_locked) { + fprintf(stderr, + "Stats about Lock %s: not used\n", + lock_label(lbl)); + continue; + } + + fprintf(stderr, + "Stats about Lock %s: \n", + lock_label(lbl)); + + if (lock_stats[lbl].num_write_locked) + fprintf(stderr, + "\t # write lock : %llu\n" + "\t # write unlock: %llu (%lld)\n" + "\t # wait time for write : %.3f msec\n" + "\t # wait time for write/lock: %.3f nsec\n", + (ullong)lock_stats[lbl].num_write_locked, + (ullong)lock_stats[lbl].num_write_unlocked, + (llong)(lock_stats[lbl].num_write_unlocked - lock_stats[lbl].num_write_locked), + (double)lock_stats[lbl].nsec_wait_for_write / 1000000.0, + lock_stats[lbl].num_write_locked ? 
((double)lock_stats[lbl].nsec_wait_for_write / (double)lock_stats[lbl].num_write_locked) : 0); + + if (lock_stats[lbl].num_seek_locked) + fprintf(stderr, + "\t # seek lock : %llu\n" + "\t # seek unlock : %llu (%lld)\n" + "\t # wait time for seek : %.3f msec\n" + "\t # wait time for seek/lock : %.3f nsec\n", + (ullong)lock_stats[lbl].num_seek_locked, + (ullong)lock_stats[lbl].num_seek_unlocked, + (llong)(lock_stats[lbl].num_seek_unlocked - lock_stats[lbl].num_seek_locked), + (double)lock_stats[lbl].nsec_wait_for_seek / 1000000.0, + lock_stats[lbl].num_seek_locked ? ((double)lock_stats[lbl].nsec_wait_for_seek / (double)lock_stats[lbl].num_seek_locked) : 0); + + if (lock_stats[lbl].num_read_locked) + fprintf(stderr, + "\t # read lock : %llu\n" + "\t # read unlock : %llu (%lld)\n" + "\t # wait time for read : %.3f msec\n" + "\t # wait time for read/lock : %.3f nsec\n", + (ullong)lock_stats[lbl].num_read_locked, + (ullong)lock_stats[lbl].num_read_unlocked, + (llong)(lock_stats[lbl].num_read_unlocked - lock_stats[lbl].num_read_locked), + (double)lock_stats[lbl].nsec_wait_for_read / 1000000.0, + lock_stats[lbl].num_read_locked ? ((double)lock_stats[lbl].nsec_wait_for_read / (double)lock_stats[lbl].num_read_locked) : 0); + } +} + +void __ha_rwlock_init(struct ha_rwlock *l) +{ + memset(l, 0, sizeof(struct ha_rwlock)); + __RWLOCK_INIT(&l->lock); +} + +void __ha_rwlock_destroy(struct ha_rwlock *l) +{ + __RWLOCK_DESTROY(&l->lock); + memset(l, 0, sizeof(struct ha_rwlock)); +} + + +void __ha_rwlock_wrlock(enum lock_label lbl, struct ha_rwlock *l, + const char *func, const char *file, int line) +{ + ulong tbit = (ti && ti->ltid_bit) ? ti->ltid_bit : 1; + struct ha_rwlock_state *st = &l->info.st[tgid-1]; + uint64_t start_time; + + if ((st->cur_readers | st->cur_seeker | st->cur_writer) & tbit) + abort(); + + HA_ATOMIC_OR(&st->wait_writers, tbit); + + start_time = now_mono_time(); + __RWLOCK_WRLOCK(&l->lock); + HA_ATOMIC_ADD(&lock_stats[lbl].nsec_wait_for_write, (now_mono_time() - start_time)); + + HA_ATOMIC_INC(&lock_stats[lbl].num_write_locked); + + st->cur_writer = tbit; + l->info.last_location.function = func; + l->info.last_location.file = file; + l->info.last_location.line = line; + + HA_ATOMIC_AND(&st->wait_writers, ~tbit); +} + +int __ha_rwlock_trywrlock(enum lock_label lbl, struct ha_rwlock *l, + const char *func, const char *file, int line) +{ + ulong tbit = (ti && ti->ltid_bit) ? ti->ltid_bit : 1; + struct ha_rwlock_state *st = &l->info.st[tgid-1]; + uint64_t start_time; + int r; + + if ((st->cur_readers | st->cur_seeker | st->cur_writer) & tbit) + abort(); + + /* We set waiting writer because trywrlock could wait for readers to quit */ + HA_ATOMIC_OR(&st->wait_writers, tbit); + + start_time = now_mono_time(); + r = __RWLOCK_TRYWRLOCK(&l->lock); + HA_ATOMIC_ADD(&lock_stats[lbl].nsec_wait_for_write, (now_mono_time() - start_time)); + if (unlikely(r)) { + HA_ATOMIC_AND(&st->wait_writers, ~tbit); + return r; + } + HA_ATOMIC_INC(&lock_stats[lbl].num_write_locked); + + st->cur_writer = tbit; + l->info.last_location.function = func; + l->info.last_location.file = file; + l->info.last_location.line = line; + + HA_ATOMIC_AND(&st->wait_writers, ~tbit); + + return 0; +} + +void __ha_rwlock_wrunlock(enum lock_label lbl,struct ha_rwlock *l, + const char *func, const char *file, int line) +{ + ulong tbit = (ti && ti->ltid_bit) ? 
ti->ltid_bit : 1; + struct ha_rwlock_state *st = &l->info.st[tgid-1]; + + if (unlikely(!(st->cur_writer & tbit))) { + /* the thread is not owning the lock for write */ + abort(); + } + + st->cur_writer = 0; + l->info.last_location.function = func; + l->info.last_location.file = file; + l->info.last_location.line = line; + + __RWLOCK_WRUNLOCK(&l->lock); + + HA_ATOMIC_INC(&lock_stats[lbl].num_write_unlocked); +} + +void __ha_rwlock_rdlock(enum lock_label lbl,struct ha_rwlock *l) +{ + ulong tbit = (ti && ti->ltid_bit) ? ti->ltid_bit : 1; + struct ha_rwlock_state *st = &l->info.st[tgid-1]; + uint64_t start_time; + + if ((st->cur_readers | st->cur_seeker | st->cur_writer) & tbit) + abort(); + + HA_ATOMIC_OR(&st->wait_readers, tbit); + + start_time = now_mono_time(); + __RWLOCK_RDLOCK(&l->lock); + HA_ATOMIC_ADD(&lock_stats[lbl].nsec_wait_for_read, (now_mono_time() - start_time)); + HA_ATOMIC_INC(&lock_stats[lbl].num_read_locked); + + HA_ATOMIC_OR(&st->cur_readers, tbit); + + HA_ATOMIC_AND(&st->wait_readers, ~tbit); +} + +int __ha_rwlock_tryrdlock(enum lock_label lbl,struct ha_rwlock *l) +{ + ulong tbit = (ti && ti->ltid_bit) ? ti->ltid_bit : 1; + struct ha_rwlock_state *st = &l->info.st[tgid-1]; + int r; + + if ((st->cur_readers | st->cur_seeker | st->cur_writer) & tbit) + abort(); + + /* try read should never wait */ + r = __RWLOCK_TRYRDLOCK(&l->lock); + if (unlikely(r)) + return r; + HA_ATOMIC_INC(&lock_stats[lbl].num_read_locked); + + HA_ATOMIC_OR(&st->cur_readers, tbit); + + return 0; +} + +void __ha_rwlock_rdunlock(enum lock_label lbl,struct ha_rwlock *l) +{ + ulong tbit = (ti && ti->ltid_bit) ? ti->ltid_bit : 1; + struct ha_rwlock_state *st = &l->info.st[tgid-1]; + + if (unlikely(!(st->cur_readers & tbit))) { + /* the thread is not owning the lock for read */ + abort(); + } + + HA_ATOMIC_AND(&st->cur_readers, ~tbit); + + __RWLOCK_RDUNLOCK(&l->lock); + + HA_ATOMIC_INC(&lock_stats[lbl].num_read_unlocked); +} + +void __ha_rwlock_wrtord(enum lock_label lbl, struct ha_rwlock *l, + const char *func, const char *file, int line) +{ + ulong tbit = (ti && ti->ltid_bit) ? ti->ltid_bit : 1; + struct ha_rwlock_state *st = &l->info.st[tgid-1]; + uint64_t start_time; + + if ((st->cur_readers | st->cur_seeker) & tbit) + abort(); + + if (!(st->cur_writer & tbit)) + abort(); + + HA_ATOMIC_OR(&st->wait_readers, tbit); + + start_time = now_mono_time(); + __RWLOCK_WRTORD(&l->lock); + HA_ATOMIC_ADD(&lock_stats[lbl].nsec_wait_for_read, (now_mono_time() - start_time)); + + HA_ATOMIC_INC(&lock_stats[lbl].num_read_locked); + + HA_ATOMIC_OR(&st->cur_readers, tbit); + HA_ATOMIC_AND(&st->cur_writer, ~tbit); + l->info.last_location.function = func; + l->info.last_location.file = file; + l->info.last_location.line = line; + + HA_ATOMIC_AND(&st->wait_readers, ~tbit); +} + +void __ha_rwlock_wrtosk(enum lock_label lbl, struct ha_rwlock *l, + const char *func, const char *file, int line) +{ + ulong tbit = (ti && ti->ltid_bit) ? 
ti->ltid_bit : 1; + struct ha_rwlock_state *st = &l->info.st[tgid-1]; + uint64_t start_time; + + if ((st->cur_readers | st->cur_seeker) & tbit) + abort(); + + if (!(st->cur_writer & tbit)) + abort(); + + HA_ATOMIC_OR(&st->wait_seekers, tbit); + + start_time = now_mono_time(); + __RWLOCK_WRTOSK(&l->lock); + HA_ATOMIC_ADD(&lock_stats[lbl].nsec_wait_for_seek, (now_mono_time() - start_time)); + + HA_ATOMIC_INC(&lock_stats[lbl].num_seek_locked); + + HA_ATOMIC_OR(&st->cur_seeker, tbit); + HA_ATOMIC_AND(&st->cur_writer, ~tbit); + l->info.last_location.function = func; + l->info.last_location.file = file; + l->info.last_location.line = line; + + HA_ATOMIC_AND(&st->wait_seekers, ~tbit); +} + +void __ha_rwlock_sklock(enum lock_label lbl, struct ha_rwlock *l, + const char *func, const char *file, int line) +{ + ulong tbit = (ti && ti->ltid_bit) ? ti->ltid_bit : 1; + struct ha_rwlock_state *st = &l->info.st[tgid-1]; + uint64_t start_time; + + if ((st->cur_readers | st->cur_seeker | st->cur_writer) & tbit) + abort(); + + HA_ATOMIC_OR(&st->wait_seekers, tbit); + + start_time = now_mono_time(); + __RWLOCK_SKLOCK(&l->lock); + HA_ATOMIC_ADD(&lock_stats[lbl].nsec_wait_for_seek, (now_mono_time() - start_time)); + + HA_ATOMIC_INC(&lock_stats[lbl].num_seek_locked); + + HA_ATOMIC_OR(&st->cur_seeker, tbit); + l->info.last_location.function = func; + l->info.last_location.file = file; + l->info.last_location.line = line; + + HA_ATOMIC_AND(&st->wait_seekers, ~tbit); +} + +void __ha_rwlock_sktowr(enum lock_label lbl, struct ha_rwlock *l, + const char *func, const char *file, int line) +{ + ulong tbit = (ti && ti->ltid_bit) ? ti->ltid_bit : 1; + struct ha_rwlock_state *st = &l->info.st[tgid-1]; + uint64_t start_time; + + if ((st->cur_readers | st->cur_writer) & tbit) + abort(); + + if (!(st->cur_seeker & tbit)) + abort(); + + HA_ATOMIC_OR(&st->wait_writers, tbit); + + start_time = now_mono_time(); + __RWLOCK_SKTOWR(&l->lock); + HA_ATOMIC_ADD(&lock_stats[lbl].nsec_wait_for_write, (now_mono_time() - start_time)); + + HA_ATOMIC_INC(&lock_stats[lbl].num_write_locked); + + HA_ATOMIC_OR(&st->cur_writer, tbit); + HA_ATOMIC_AND(&st->cur_seeker, ~tbit); + l->info.last_location.function = func; + l->info.last_location.file = file; + l->info.last_location.line = line; + + HA_ATOMIC_AND(&st->wait_writers, ~tbit); +} + +void __ha_rwlock_sktord(enum lock_label lbl, struct ha_rwlock *l, + const char *func, const char *file, int line) +{ + ulong tbit = (ti && ti->ltid_bit) ? ti->ltid_bit : 1; + struct ha_rwlock_state *st = &l->info.st[tgid-1]; + uint64_t start_time; + + if ((st->cur_readers | st->cur_writer) & tbit) + abort(); + + if (!(st->cur_seeker & tbit)) + abort(); + + HA_ATOMIC_OR(&st->wait_readers, tbit); + + start_time = now_mono_time(); + __RWLOCK_SKTORD(&l->lock); + HA_ATOMIC_ADD(&lock_stats[lbl].nsec_wait_for_read, (now_mono_time() - start_time)); + + HA_ATOMIC_INC(&lock_stats[lbl].num_read_locked); + + HA_ATOMIC_OR(&st->cur_readers, tbit); + HA_ATOMIC_AND(&st->cur_seeker, ~tbit); + l->info.last_location.function = func; + l->info.last_location.file = file; + l->info.last_location.line = line; + + HA_ATOMIC_AND(&st->wait_readers, ~tbit); +} + +void __ha_rwlock_skunlock(enum lock_label lbl,struct ha_rwlock *l, + const char *func, const char *file, int line) +{ + ulong tbit = (ti && ti->ltid_bit) ? 
ti->ltid_bit : 1; + struct ha_rwlock_state *st = &l->info.st[tgid-1]; + if (!(st->cur_seeker & tbit)) + abort(); + + HA_ATOMIC_AND(&st->cur_seeker, ~tbit); + l->info.last_location.function = func; + l->info.last_location.file = file; + l->info.last_location.line = line; + + __RWLOCK_SKUNLOCK(&l->lock); + + HA_ATOMIC_INC(&lock_stats[lbl].num_seek_unlocked); +} + +int __ha_rwlock_trysklock(enum lock_label lbl, struct ha_rwlock *l, + const char *func, const char *file, int line) +{ + ulong tbit = (ti && ti->ltid_bit) ? ti->ltid_bit : 1; + struct ha_rwlock_state *st = &l->info.st[tgid-1]; + uint64_t start_time; + int r; + + if ((st->cur_readers | st->cur_seeker | st->cur_writer) & tbit) + abort(); + + HA_ATOMIC_OR(&st->wait_seekers, tbit); + + start_time = now_mono_time(); + r = __RWLOCK_TRYSKLOCK(&l->lock); + HA_ATOMIC_ADD(&lock_stats[lbl].nsec_wait_for_seek, (now_mono_time() - start_time)); + + if (likely(!r)) { + /* got the lock ! */ + HA_ATOMIC_INC(&lock_stats[lbl].num_seek_locked); + HA_ATOMIC_OR(&st->cur_seeker, tbit); + l->info.last_location.function = func; + l->info.last_location.file = file; + l->info.last_location.line = line; + } + + HA_ATOMIC_AND(&st->wait_seekers, ~tbit); + return r; +} + +int __ha_rwlock_tryrdtosk(enum lock_label lbl, struct ha_rwlock *l, + const char *func, const char *file, int line) +{ + ulong tbit = (ti && ti->ltid_bit) ? ti->ltid_bit : 1; + struct ha_rwlock_state *st = &l->info.st[tgid-1]; + uint64_t start_time; + int r; + + if ((st->cur_writer | st->cur_seeker) & tbit) + abort(); + + if (!(st->cur_readers & tbit)) + abort(); + + HA_ATOMIC_OR(&st->wait_seekers, tbit); + + start_time = now_mono_time(); + r = __RWLOCK_TRYRDTOSK(&l->lock); + HA_ATOMIC_ADD(&lock_stats[lbl].nsec_wait_for_seek, (now_mono_time() - start_time)); + + if (likely(!r)) { + /* got the lock ! */ + HA_ATOMIC_INC(&lock_stats[lbl].num_seek_locked); + HA_ATOMIC_OR(&st->cur_seeker, tbit); + HA_ATOMIC_AND(&st->cur_readers, ~tbit); + l->info.last_location.function = func; + l->info.last_location.file = file; + l->info.last_location.line = line; + } + + HA_ATOMIC_AND(&st->wait_seekers, ~tbit); + return r; +} + +void __spin_init(struct ha_spinlock *l) +{ + memset(l, 0, sizeof(struct ha_spinlock)); + __SPIN_INIT(&l->lock); +} + +void __spin_destroy(struct ha_spinlock *l) +{ + __SPIN_DESTROY(&l->lock); + memset(l, 0, sizeof(struct ha_spinlock)); +} + +void __spin_lock(enum lock_label lbl, struct ha_spinlock *l, + const char *func, const char *file, int line) +{ + ulong tbit = (ti && ti->ltid_bit) ? ti->ltid_bit : 1; + struct ha_spinlock_state *st = &l->info.st[tgid-1]; + uint64_t start_time; + + if (unlikely(st->owner & tbit)) { + /* the thread is already owning the lock */ + abort(); + } + + HA_ATOMIC_OR(&st->waiters, tbit); + + start_time = now_mono_time(); + __SPIN_LOCK(&l->lock); + HA_ATOMIC_ADD(&lock_stats[lbl].nsec_wait_for_write, (now_mono_time() - start_time)); + + HA_ATOMIC_INC(&lock_stats[lbl].num_write_locked); + + + st->owner = tbit; + l->info.last_location.function = func; + l->info.last_location.file = file; + l->info.last_location.line = line; + + HA_ATOMIC_AND(&st->waiters, ~tbit); +} + +int __spin_trylock(enum lock_label lbl, struct ha_spinlock *l, + const char *func, const char *file, int line) +{ + ulong tbit = (ti && ti->ltid_bit) ? 
ti->ltid_bit : 1; + struct ha_spinlock_state *st = &l->info.st[tgid-1]; + int r; + + if (unlikely(st->owner & tbit)) { + /* the thread is already owning the lock */ + abort(); + } + + /* try read should never wait */ + r = __SPIN_TRYLOCK(&l->lock); + if (unlikely(r)) + return r; + HA_ATOMIC_INC(&lock_stats[lbl].num_write_locked); + + st->owner = tbit; + l->info.last_location.function = func; + l->info.last_location.file = file; + l->info.last_location.line = line; + + return 0; +} + +void __spin_unlock(enum lock_label lbl, struct ha_spinlock *l, + const char *func, const char *file, int line) +{ + ulong tbit = (ti && ti->ltid_bit) ? ti->ltid_bit : 1; + struct ha_spinlock_state *st = &l->info.st[tgid-1]; + + if (unlikely(!(st->owner & tbit))) { + /* the thread is not owning the lock */ + abort(); + } + + st->owner = 0; + l->info.last_location.function = func; + l->info.last_location.file = file; + l->info.last_location.line = line; + + __SPIN_UNLOCK(&l->lock); + HA_ATOMIC_INC(&lock_stats[lbl].num_write_unlocked); +} + +#endif // defined(DEBUG_THREAD) || defined(DEBUG_FULL) + + +#if defined(USE_PTHREAD_EMULATION) + +/* pthread rwlock emulation using plocks (to avoid expensive futexes). + * these are a direct mapping on Progressive Locks, with the exception that + * since there's a common unlock operation in pthreads, we need to know if + * we need to unlock for reads or writes, so we set the topmost bit to 1 when + * a write lock is acquired to indicate that a write unlock needs to be + * performed. It's not a problem since this bit will never be used given that + * haproxy won't support as many threads as the plocks. + * + * The storage is the pthread_rwlock_t cast as an ulong + */ + +int pthread_rwlock_init(pthread_rwlock_t *restrict rwlock, const pthread_rwlockattr_t *restrict attr) +{ + ulong *lock = (ulong *)rwlock; + + *lock = 0; + return 0; +} + +int pthread_rwlock_destroy(pthread_rwlock_t *rwlock) +{ + ulong *lock = (ulong *)rwlock; + + *lock = 0; + return 0; +} + +int pthread_rwlock_rdlock(pthread_rwlock_t *rwlock) +{ + pl_lorw_rdlock((unsigned long *)rwlock); + return 0; +} + +int pthread_rwlock_tryrdlock(pthread_rwlock_t *rwlock) +{ + return !!pl_cmpxchg((unsigned long *)rwlock, 0, PLOCK_LORW_SHR_BASE); +} + +int pthread_rwlock_timedrdlock(pthread_rwlock_t *restrict rwlock, const struct timespec *restrict abstime) +{ + return pthread_rwlock_tryrdlock(rwlock); +} + +int pthread_rwlock_wrlock(pthread_rwlock_t *rwlock) +{ + pl_lorw_wrlock((unsigned long *)rwlock); + return 0; +} + +int pthread_rwlock_trywrlock(pthread_rwlock_t *rwlock) +{ + return !!pl_cmpxchg((unsigned long *)rwlock, 0, PLOCK_LORW_EXC_BASE); +} + +int pthread_rwlock_timedwrlock(pthread_rwlock_t *restrict rwlock, const struct timespec *restrict abstime) +{ + return pthread_rwlock_trywrlock(rwlock); +} + +int pthread_rwlock_unlock(pthread_rwlock_t *rwlock) +{ + pl_lorw_unlock((unsigned long *)rwlock); + return 0; +} +#endif // defined(USE_PTHREAD_EMULATION) + +/* Depending on the platform and how libpthread was built, pthread_exit() may + * involve some code in libgcc_s that would be loaded on exit for the first + * time, causing aborts if the process is chrooted. It's harmless but very + * dirty. There isn't much we can do to make sure libgcc_s is loaded only if + * needed, so what we do here is that during early boot we create a dummy + * thread that immediately exits. This will lead to libgcc_s being loaded + * during boot on the platforms where it's required.
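Stepping back to the USE_PTHREAD_EMULATION block above: because these functions carry the standard pthread names, any rwlock user linked into the process transparently ends up on the plock implementation. A hypothetical caller changes nothing; this is an illustration, not code from this commit:

    pthread_rwlock_t lk;

    pthread_rwlock_init(&lk, NULL);  /* zeroes the ulong behind the type */
    pthread_rwlock_rdlock(&lk);      /* pl_lorw_rdlock() underneath */
    pthread_rwlock_unlock(&lk);      /* pl_lorw_unlock() tells reads from
                                      * writes via the marker bit described
                                      * in the comment above */
    pthread_rwlock_wrlock(&lk);      /* pl_lorw_wrlock(), sets the marker */
    pthread_rwlock_unlock(&lk);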
+ */ +static void *dummy_thread_function(void *data) +{ + pthread_exit(NULL); + return NULL; +} + +static inline void preload_libgcc_s(void) +{ + pthread_t dummy_thread; + if (pthread_create(&dummy_thread, NULL, dummy_thread_function, NULL) == 0) + pthread_join(dummy_thread, NULL); +} + +static void __thread_init(void) +{ + char *ptr = NULL; + + preload_libgcc_s(); + + thread_cpus_enabled_at_boot = thread_cpus_enabled(); + thread_cpus_enabled_at_boot = MIN(thread_cpus_enabled_at_boot, MAX_THREADS); + + memprintf(&ptr, "Built with multi-threading support (MAX_TGROUPS=%d, MAX_THREADS=%d, default=%d).", + MAX_TGROUPS, MAX_THREADS, thread_cpus_enabled_at_boot); + hap_register_build_opts(ptr, 1); + +#if defined(DEBUG_THREAD) || defined(DEBUG_FULL) + memset(lock_stats, 0, sizeof(lock_stats)); +#endif +} +INITCALL0(STG_PREPARE, __thread_init); + +#else + +/* send signal <sig> to thread <thr> (send to process in fact) */ +void ha_tkill(unsigned int thr, int sig) +{ + raise(sig); +} + +/* send signal <sig> to all threads (send to process in fact) */ +void ha_tkillall(int sig) +{ + raise(sig); +} + +void ha_thread_relax(void) +{ +#ifdef _POSIX_PRIORITY_SCHEDULING + sched_yield(); +#endif +} + +REGISTER_BUILD_OPTS("Built without multi-threading support (USE_THREAD not set)."); + +#endif // USE_THREAD + + +/* Returns non-zero on anomaly (bound vs unbound), and emits a warning in this + * case. + */ +int thread_detect_binding_discrepancies(void) +{ +#if defined(USE_CPU_AFFINITY) + uint th, tg, id; + uint tot_b = 0, tot_u = 0; + int first_b = -1; + int first_u = -1; + + for (th = 0; th < global.nbthread; th++) { + tg = ha_thread_info[th].tgid; + id = ha_thread_info[th].ltid; + + if (ha_cpuset_count(&cpu_map[tg - 1].thread[id]) == 0) { + tot_u++; + if (first_u < 0) + first_u = th; + } else { + tot_b++; + if (first_b < 0) + first_b = th; + } + } + + if (tot_u > 0 && tot_b > 0) { + ha_warning("Found %u thread(s) mapped to a CPU and %u thread(s) not mapped to any CPU. " + "This will result in some threads being randomly assigned to the same CPU, " + "which will occasionally cause severe performance degradation. First thread " + "bound is %d and first thread not bound is %d. Please either bind all threads " + "or none (maybe some cpu-map directives are missing?).\n", + tot_b, tot_u, first_b, first_u); + return 1; + } +#endif + return 0; +} + +/* Returns non-zero on anomaly (more threads than CPUs), and emits a warning in + * this case. It checks against configured cpu-map if any, otherwise against + * the number of CPUs at boot if known. It's better to run it only after + * thread_detect_binding_discrepancies() so that mixed cases can be eliminated. 
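As an illustration of what these detectors flag, the following hypothetical global section binds only part of the threads, which is exactly the half-bound situation thread_detect_binding_discrepancies() warns about (all values invented):

    global
        nbthread 4
        cpu-map 1/1 0        # thread 1 pinned to CPU 0
        cpu-map 1/2 1        # thread 2 pinned to CPU 1
        # threads 3 and 4 left unbound -> triggers the
        # "Found 2 thread(s) mapped to a CPU and 2 thread(s) not mapped" warning

Binding all four threads onto fewer than four CPUs (say, every cpu-map entry pointing to CPU 0) would instead trip the checks in thread_detect_more_than_cpus() below.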
+ */ +int thread_detect_more_than_cpus(void) +{ +#if defined(USE_CPU_AFFINITY) + struct hap_cpuset cpuset_map, cpuset_boot, cpuset_all; + uint th, tg, id; + int bound; + int tot_map, tot_all; + + ha_cpuset_zero(&cpuset_boot); + ha_cpuset_zero(&cpuset_map); + ha_cpuset_zero(&cpuset_all); + bound = 0; + for (th = 0; th < global.nbthread; th++) { + tg = ha_thread_info[th].tgid; + id = ha_thread_info[th].ltid; + if (ha_cpuset_count(&cpu_map[tg - 1].thread[id])) { + ha_cpuset_or(&cpuset_map, &cpu_map[tg - 1].thread[id]); + bound++; + } + } + + ha_cpuset_assign(&cpuset_all, &cpuset_map); + if (bound != global.nbthread) { + if (ha_cpuset_detect_bound(&cpuset_boot)) + ha_cpuset_or(&cpuset_all, &cpuset_boot); + } + + tot_map = ha_cpuset_count(&cpuset_map); + tot_all = ha_cpuset_count(&cpuset_all); + + if (tot_map && bound > tot_map) { + ha_warning("This configuration binds %d threads to a total of %d CPUs via cpu-map " + "directives. This means that some threads will compete for the same CPU, " + "which will cause severe performance degradation. Please fix either the " + "'cpu-map' directives or set the global 'nbthread' value accordingly.\n", + bound, tot_map); + return 1; + } + else if (tot_all && global.nbthread > tot_all) { + ha_warning("This configuration enables %d threads running on a total of %d CPUs. " + "This means that some threads will compete for the same CPU, which will cause " + "severe performance degradation. Please either fix the 'cpu-map' directives to " + "adjust the CPUs to use, or fix the global 'nbthread' value.\n", + global.nbthread, tot_all); + return 1; + } +#endif + return 0; +} + + +/* scans the configured thread mapping and establishes the final one. Returns <0 + * on failure, >=0 on success. + */ +int thread_map_to_groups() +{ + int t, g, ut, ug; + int q, r; + ulong m __maybe_unused; + + ut = ug = 0; // unassigned threads & groups + + for (t = 0; t < global.nbthread; t++) { + if (!ha_thread_info[t].tg) + ut++; + } + + for (g = 0; g < global.nbtgroups; g++) { + if (!ha_tgroup_info[g].count) + ug++; + ha_tgroup_info[g].tgid_bit = 1UL << g; + } + + if (ug > ut) { + ha_alert("More unassigned thread-groups (%d) than threads (%d). Please reduce thread-groups\n", ug, ut); + return -1; + } + + /* look for first unassigned thread */ + for (t = 0; t < global.nbthread && ha_thread_info[t].tg; t++) + ; + + /* assign threads to empty groups */ + for (g = 0; ug && ut; ) { + /* due to sparse thread assignment we can end up with more threads + * per group on last assigned groups than former ones, so we must + * always try to pack the maximum remaining ones together first. + */ + q = ut / ug; + r = ut % ug; + if ((q + !!r) > MAX_THREADS_PER_GROUP) { + ha_alert("Too many remaining unassigned threads (%d) for thread groups (%d). Please increase thread-groups or make sure to keep thread numbers contiguous\n", ut, ug); + return -1; + } + + /* thread <t> is the next unassigned one.
Let's look for next + * unassigned group, we know there are some left + */ + while (ut >= ug && ha_tgroup_info[g].count) + g++; + + /* group g is unassigned, try to fill it with consecutive threads */ + while (ut && ut >= ug && ha_tgroup_info[g].count < q + !!r && + (!ha_tgroup_info[g].count || t == ha_tgroup_info[g].base + ha_tgroup_info[g].count)) { + + if (!ha_tgroup_info[g].count) { + /* assign new group */ + ha_tgroup_info[g].base = t; + ug--; + } + + ha_tgroup_info[g].count++; + ha_thread_info[t].tgid = g + 1; + ha_thread_info[t].tg = &ha_tgroup_info[g]; + ha_thread_info[t].tg_ctx = &ha_tgroup_ctx[g]; + + ut--; + /* switch to next unassigned thread */ + while (++t < global.nbthread && ha_thread_info[t].tg) + ; + } + } + + if (ut) { + ha_alert("Remaining unassigned threads found (%d) because all groups are in use. Please increase 'thread-groups', reduce 'nbthreads' or remove or extend 'thread-group' enumerations.\n", ut); + return -1; + } + + for (t = 0; t < global.nbthread; t++) { + ha_thread_info[t].tid = t; + ha_thread_info[t].ltid = t - ha_thread_info[t].tg->base; + ha_thread_info[t].ltid_bit = 1UL << ha_thread_info[t].ltid; + } + + m = 0; + for (g = 0; g < global.nbtgroups; g++) { + ha_tgroup_info[g].threads_enabled = nbits(ha_tgroup_info[g].count); + /* for now, additional threads are not started, so we should + * consider them as harmless and idle. + * This will get automatically updated when such threads are + * started in run_thread_poll_loop() + * Without this, thread_isolate() and thread_isolate_full() + * will fail to work as long as secondary threads did not enter + * the polling loop at least once. + */ + ha_tgroup_ctx[g].threads_harmless = ha_tgroup_info[g].threads_enabled; + ha_tgroup_ctx[g].threads_idle = ha_tgroup_info[g].threads_enabled; + if (!ha_tgroup_info[g].count) + continue; + m |= 1UL << g; + + } + +#ifdef USE_THREAD + all_tgroups_mask = m; +#endif + return 0; +} + +/* Converts a configuration thread set based on either absolute or relative + * thread numbers into a global group+mask. This is essentially for use with + * the "thread" directive on "bind" lines, where "thread 4-6,10-12" might be + * turned to "2/1-3,4/1-3". It cannot be used before the thread mapping above + * was completed and the thread group numbers configured. The thread_set is + * replaced by the resolved group-based one. It is possible to force a single + * default group for unspecified sets instead of enabling all groups by passing + * this group's non-zero value to defgrp. + * + * Returns <0 on failure, >=0 on success. + */ +int thread_resolve_group_mask(struct thread_set *ts, int defgrp, char **err) +{ + struct thread_set new_ts = { }; + ulong mask, imask; + uint g; + + if (!ts->grps) { + /* unspecified group, IDs are global */ + if (thread_set_is_empty(ts)) { + /* all threads of all groups, unless defgrp is set and + * we then set it as the only group. + */ + for (g = defgrp ? defgrp-1 : 0; g < (defgrp ? defgrp : global.nbtgroups); g++) { + new_ts.rel[g] = ha_tgroup_info[g].threads_enabled; + if (new_ts.rel[g]) + new_ts.grps |= 1UL << g; + } + } else { + /* some absolute threads are set, we must remap them to + * relative ones. Each group cannot have more than + * LONGBITS threads, thus it spans at most two absolute + * blocks. 
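A worked example of that remapping, with invented numbers and LONGBITS = 64: take a group whose base is absolute thread 40 with a count of 40 (absolute threads 40..79). Then block = 40/64 = 0 and base = 40%64 = 40, so the low part ts->abs[0] >> 40 contributes absolute threads 40..63 as relative bits 0..23; since count (40) exceeds LONGBITS - base (24), the high part ts->abs[1] << 24 contributes absolute threads 64..79 as relative bits 24..39, and nbits(40) finally clips anything beyond the group, as the loop below implements.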
+/* Converts a configuration thread set based on either absolute or relative + * thread numbers into a global group+mask. This is essentially for use with + * the "thread" directive on "bind" lines, where "thread 4-6,10-12" might be + * turned to "2/1-3,4/1-3". It cannot be used before the thread mapping above + * was completed and the thread group numbers configured. The thread_set is + * replaced by the resolved group-based one. It is possible to force a single + * default group for unspecified sets instead of enabling all groups by passing + * this group's non-zero value to defgrp. + * + * Returns <0 on failure, >=0 on success. + */ +int thread_resolve_group_mask(struct thread_set *ts, int defgrp, char **err) +{ + struct thread_set new_ts = { }; + ulong mask, imask; + uint g; + + if (!ts->grps) { + /* unspecified group, IDs are global */ + if (thread_set_is_empty(ts)) { + /* all threads of all groups, unless defgrp is set and + * we then set it as the only group. + */ + for (g = defgrp ? defgrp-1 : 0; g < (defgrp ? defgrp : global.nbtgroups); g++) { + new_ts.rel[g] = ha_tgroup_info[g].threads_enabled; + if (new_ts.rel[g]) + new_ts.grps |= 1UL << g; + } + } else { + /* some absolute threads are set, we must remap them to + * relative ones. Each group cannot have more than + * LONGBITS threads, thus it spans at most two absolute + * blocks. + */ + for (g = 0; g < global.nbtgroups; g++) { + uint block = ha_tgroup_info[g].base / LONGBITS; + uint base = ha_tgroup_info[g].base % LONGBITS; + + mask = ts->abs[block] >> base; + if (base && + (block + 1) < sizeof(ts->abs) / sizeof(ts->abs[0]) && + ha_tgroup_info[g].count > (LONGBITS - base)) + mask |= ts->abs[block + 1] << (LONGBITS - base); + mask &= nbits(ha_tgroup_info[g].count); + mask &= ha_tgroup_info[g].threads_enabled; + + /* now the mask exactly matches the threads to be enabled + * in this group. + */ + new_ts.rel[g] |= mask; + if (new_ts.rel[g]) + new_ts.grps |= 1UL << g; + } + } + } else { + /* groups were specified */ + for (g = 0; g < MAX_TGROUPS; g++) { + imask = ts->rel[g]; + if (!imask) + continue; + + if (g >= global.nbtgroups) { + memprintf(err, "'thread' directive references non-existing thread group %u", g+1); + return -1; + } + + /* some relative threads are set. Keep only existing ones for this group */ + mask = nbits(ha_tgroup_info[g].count); + + if (!(mask & imask)) { + /* no intersection between the thread group's + * threads and the bind line's. + */ +#ifdef THREAD_AUTO_ADJUST_GROUPS + unsigned long new_mask = 0; + + while (imask) { + new_mask |= imask & mask; + imask >>= ha_tgroup_info[g].count; + } + imask = new_mask; +#else + memprintf(err, "'thread' directive only references threads not belonging to group %u", g+1); + return -1; +#endif + } + + new_ts.rel[g] = imask & mask; + if (new_ts.rel[g]) + new_ts.grps |= 1UL << g; + } + } + + /* update the thread_set */ + if (!thread_set_nth_group(&new_ts, 0)) { + memprintf(err, "'thread' directive only references non-existing threads"); + return -1; + } + + *ts = new_ts; + return 0; +} +
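
Since a group's threads may straddle a word boundary of the absolute bitmap, the remapping above assembles the group-relative mask from at most two adjacent words. A sketch of that shift arithmetic alone, assuming fixed 64-bit words and a caller that guarantees two readable words (names are illustrative):

    #include <stdint.h>

    #define LONGBITS 64
    #define nbits(n) ((n) == 64 ? ~0ULL : (1ULL << (n)) - 1)

    /* Extract <count> bits starting at absolute bit <start> from a multi-word
     * bitmap <map>, as done above for one thread group.
     */
    static uint64_t rel_mask(const uint64_t *map, unsigned start, unsigned count)
    {
            unsigned block = start / LONGBITS;
            unsigned base  = start % LONGBITS;
            uint64_t mask  = map[block] >> base;

            /* the group spills into the next word when it is not aligned */
            if (base && count > LONGBITS - base)
                    mask |= map[block + 1] << (LONGBITS - base);
            return mask & nbits(count);
    }
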
otherwise it returns non-zero with an error + * message in <err>. + */ +int parse_thread_set(const char *arg, struct thread_set *ts, char **err) +{ + const char *set; + const char *sep; + int v, min, max, tg; + int is_rel; + + /* search for the first delimiter (',', '-' or '/') to decide whether + * we're facing an absolute or relative form. The relative form always + * starts with a number followed by a slash. + */ + for (sep = arg; isdigit((uchar)*sep); sep++) + ; + + is_rel = (/*sep > arg &&*/ *sep == '/'); /* relative form */ + + /* from there we have to cut the thread spec around commas */ + + set = arg; + tg = 0; + while (*set) { + /* note: we can't use strtol() here because "-3" would parse as + * (-3) while we want to stop before the "-", so we find the + * separator ourselves and rely on atoi() whose value we may + * ignore depending where the separator is. + */ + for (sep = set; isdigit((uchar)*sep); sep++) + ; + + if (sep != set && *sep && *sep != '/' && *sep != '-' && *sep != ',') { + memprintf(err, "invalid character '%c' in thread set specification: '%s'.", *sep, set); + return -1; + } + + v = (sep != set) ? atoi(set) : 0; + + /* Now we know that the string is made of an optional series of digits + * optionally followed by one of the delimiters above, or that it + * starts with a different character. + */ + + /* first, let's search for the thread group (digits before '/') */ + + if (tg || !is_rel) { + /* thread group already specified or not expected if absolute spec */ + if (*sep == '/') { + if (tg) + memprintf(err, "redundant thread group specification '%s' for group %d", set, tg); + else + memprintf(err, "group-relative thread specification '%s' is not permitted after a absolute thread range.", set); + return -1; + } + } else { + /* this is a group-relative spec, first field is the group number */ + if (sep == set && *sep == '/') { + memprintf(err, "thread group number expected before '%s'.", set); + return -1; + } + + if (*sep != '/') { + memprintf(err, "absolute thread specification '%s' is not permitted after a group-relative thread range.", set); + return -1; + } + + if (v < 1 || v > MAX_TGROUPS) { + memprintf(err, "invalid thread group number '%d', permitted range is 1..%d in '%s'.", v, MAX_TGROUPS, set); + return -1; + } + + tg = v; + + /* skip group number and go on with set,sep,v as if + * there was no group number. + */ + set = sep + 1; + continue; + } + + /* Now 'set' starts at the min thread number, whose value is in v if any, + * and preset the max to it, unless the range is filled at once via "all" + * (stored as 1:0), "odd" (stored as) 1:-1, or "even" (stored as 1:-2). + * 'sep' points to the next non-digit which may be set itself e.g. for + * "all" etc or "-xx". + */ + + if (!*set) { + /* empty set sets no restriction */ + min = 1; + max = is_rel ? MAX_THREADS_PER_GROUP : MAX_THREADS; + } + else { + if (sep != set && *sep && *sep != '-' && *sep != ',') { + // Only delimiters are permitted around digits. 
+ memprintf(err, "invalid character '%c' in thread set specification: '%s'.", *sep, set); + return -1; + } + + /* for non-digits, find next delim */ + for (; *sep && *sep != '-' && *sep != ','; sep++) + ; + + min = max = 1; + if (sep != set) { + /* non-empty first thread */ + if (isteq(ist2(set, sep-set), ist("all"))) + max = 0; + else if (isteq(ist2(set, sep-set), ist("odd"))) + max = -1; + else if (isteq(ist2(set, sep-set), ist("even"))) + max = -2; + else if (v) + min = max = v; + else + max = min = 0; // throw an error below + } + + if (min < 1 || min > MAX_THREADS || (is_rel && min > MAX_THREADS_PER_GROUP)) { + memprintf(err, "invalid first thread number '%s', permitted range is 1..%d, or 'all', 'odd', 'even'.", + set, is_rel ? MAX_THREADS_PER_GROUP : MAX_THREADS); + return -1; + } + + /* is this a range ? */ + if (*sep == '-') { + if (min != max) { + memprintf(err, "extraneous range after 'all', 'odd' or 'even': '%s'.", set); + return -1; + } + + /* this is a seemingly valid range, there may be another number */ + for (set = ++sep; isdigit((uchar)*sep); sep++) + ; + v = atoi(set); + + if (sep == set) { // no digit: to the max + max = is_rel ? MAX_THREADS_PER_GROUP : MAX_THREADS; + if (*sep && *sep != ',') + max = 0; // throw an error below + } else + max = v; + + if (max < 1 || max > MAX_THREADS || (is_rel && max > MAX_THREADS_PER_GROUP)) { + memprintf(err, "invalid last thread number '%s', permitted range is 1..%d.", + set, is_rel ? MAX_THREADS_PER_GROUP : MAX_THREADS); + return -1; + } + } + + /* here sep points to the first non-digit after the thread spec, + * must be a valid delimiter. + */ + if (*sep && *sep != ',') { + memprintf(err, "invalid character '%c' after thread set specification: '%s'.", *sep, set); + return -1; + } + } + + /* store values */ + if (ts) { + if (is_rel) { + /* group-relative thread numbers */ + ts->grps |= 1UL << (tg - 1); + + if (max >= min) { + for (v = min; v <= max; v++) + ts->rel[tg - 1] |= 1UL << (v - 1); + } else { + memset(&ts->rel[tg - 1], + (max == 0) ? 0xff /* all */ : (max == -1) ? 0x55 /* odd */: 0xaa /* even */, + sizeof(ts->rel[tg - 1])); + } + } else { + /* absolute thread numbers */ + if (max >= min) { + for (v = min; v <= max; v++) + ts->abs[(v - 1) / LONGBITS] |= 1UL << ((v - 1) % LONGBITS); + } else { + memset(&ts->abs, + (max == 0) ? 0xff /* all */ : (max == -1) ? 0x55 /* odd */: 0xaa /* even */, + sizeof(ts->abs)); + } + } + } + + set = *sep ? sep + 1 : sep; + tg = 0; + } + return 0; +} + +/* Parse the "nbthread" global directive, which takes an integer argument that + * contains the desired number of threads. + */ +static int cfg_parse_nbthread(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + long nbthread; + char *errptr; + + if (too_many_args(1, args, err, NULL)) + return -1; + + if (non_global_section_parsed == 1) { + memprintf(err, "'%s' not allowed if a non-global section was previously defined. This parameter must be declared in the first global section", args[0]); + return -1; + } + + nbthread = strtol(args[1], &errptr, 10); + if (!*args[1] || *errptr) { + memprintf(err, "'%s' passed a missing or unparsable integer value in '%s'", args[0], args[1]); + return -1; + } + +#ifndef USE_THREAD + if (nbthread != 1) { + memprintf(err, "'%s' specified with a value other than 1 while HAProxy is not compiled with threads support. 
Please check build options for USE_THREAD", args[0]); + return -1; + } +#else + if (nbthread < 1 || nbthread > MAX_THREADS) { + memprintf(err, "'%s' value must be between 1 and %d (was %ld)", args[0], MAX_THREADS, nbthread); + return -1; + } +#endif + + HA_DIAG_WARNING_COND(global.nbthread, + "parsing [%s:%d] : '%s' is already defined and will be overridden.\n", + file, line, args[0]); + + global.nbthread = nbthread; + return 0; +} + +/* Parse the "thread-group" global directive, which takes an integer argument + * that designates a thread group, and a list of threads to put into that group. + */ +static int cfg_parse_thread_group(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + char *errptr; + long tnum, tend, tgroup; + int arg, tot; + + if (non_global_section_parsed == 1) { + memprintf(err, "'%s' not allowed if a non-global section was previously defined. This parameter must be declared in the first global section", args[0]); + return -1; + } + + tgroup = strtol(args[1], &errptr, 10); + if (!*args[1] || *errptr) { + memprintf(err, "'%s' passed a missing or unparsable integer value in '%s'", args[0], args[1]); + return -1; + } + + if (tgroup < 1 || tgroup > MAX_TGROUPS) { + memprintf(err, "'%s' thread-group number must be between 1 and %d (was %ld)", args[0], MAX_TGROUPS, tgroup); + return -1; + } + + /* look for a preliminary definition of any thread pointing to this + * group, and remove them. + */ + if (ha_tgroup_info[tgroup-1].count) { + ha_warning("parsing [%s:%d] : '%s %ld' was already defined and will be overridden.\n", + file, line, args[0], tgroup); + + for (tnum = ha_tgroup_info[tgroup-1].base; + tnum < ha_tgroup_info[tgroup-1].base + ha_tgroup_info[tgroup-1].count; + tnum++) { + if (ha_thread_info[tnum-1].tg == &ha_tgroup_info[tgroup-1]) { + ha_thread_info[tnum-1].tg = NULL; + ha_thread_info[tnum-1].tgid = 0; + ha_thread_info[tnum-1].tg_ctx = NULL; + } + } + ha_tgroup_info[tgroup-1].count = ha_tgroup_info[tgroup-1].base = 0; + } + + tot = 0; + for (arg = 2; args[arg] && *args[arg]; arg++) { + tend = tnum = strtol(args[arg], &errptr, 10); + + if (*errptr == '-') + tend = strtol(errptr + 1, &errptr, 10); + + if (*errptr || tnum < 1 || tend < 1 || tnum > MAX_THREADS || tend > MAX_THREADS) { + memprintf(err, "'%s %ld' passed an unparsable or invalid thread number '%s' (valid range is 1 to %d)", args[0], tgroup, args[arg], MAX_THREADS); + return -1; + } + + for(; tnum <= tend; tnum++) { + if (ha_thread_info[tnum-1].tg == &ha_tgroup_info[tgroup-1]) { + ha_warning("parsing [%s:%d] : '%s %ld': thread %ld assigned more than once on the same line.\n", + file, line, args[0], tgroup, tnum); + } else if (ha_thread_info[tnum-1].tg) { + ha_warning("parsing [%s:%d] : '%s %ld': thread %ld was previously assigned to thread group %ld and will be overridden.\n", + file, line, args[0], tgroup, tnum, + (long)(ha_thread_info[tnum-1].tg - &ha_tgroup_info[0] + 1)); + } + + if (!ha_tgroup_info[tgroup-1].count) { + ha_tgroup_info[tgroup-1].base = tnum-1; + ha_tgroup_info[tgroup-1].count = 1; + } + else if (tnum >= ha_tgroup_info[tgroup-1].base + ha_tgroup_info[tgroup-1].count) { + ha_tgroup_info[tgroup-1].count = tnum - ha_tgroup_info[tgroup-1].base; + } + else if (tnum < ha_tgroup_info[tgroup-1].base) { + ha_tgroup_info[tgroup-1].count += ha_tgroup_info[tgroup-1].base - tnum-1; + ha_tgroup_info[tgroup-1].base = tnum - 1; + } + + ha_thread_info[tnum-1].tgid = tgroup; + ha_thread_info[tnum-1].tg = &ha_tgroup_info[tgroup-1]; + 
ha_thread_info[tnum-1].tg_ctx = &ha_tgroup_ctx[tgroup-1]; + tot++; + } + } + + if (ha_tgroup_info[tgroup-1].count > tot) { + memprintf(err, "'%s %ld' assigned sparse threads, only contiguous supported", args[0], tgroup); + return -1; + } + + if (ha_tgroup_info[tgroup-1].count > MAX_THREADS_PER_GROUP) { + memprintf(err, "'%s %ld' assigned too many threads (%d, max=%d)", args[0], tgroup, tot, MAX_THREADS_PER_GROUP); + return -1; + } + + return 0; +} + +/* Parse the "thread-groups" global directive, which takes an integer argument + * that contains the desired number of thread groups. + */ +static int cfg_parse_thread_groups(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + long nbtgroups; + char *errptr; + + if (too_many_args(1, args, err, NULL)) + return -1; + + if (non_global_section_parsed == 1) { + memprintf(err, "'%s' not allowed if a non-global section was previously defined. This parameter must be declared in the first global section", args[0]); + return -1; + } + + nbtgroups = strtol(args[1], &errptr, 10); + if (!*args[1] || *errptr) { + memprintf(err, "'%s' passed a missing or unparsable integer value in '%s'", args[0], args[1]); + return -1; + } + +#ifndef USE_THREAD + if (nbtgroups != 1) { + memprintf(err, "'%s' specified with a value other than 1 while HAProxy is not compiled with threads support. Please check build options for USE_THREAD", args[0]); + return -1; + } +#else + if (nbtgroups < 1 || nbtgroups > MAX_TGROUPS) { + memprintf(err, "'%s' value must be between 1 and %d (was %ld)", args[0], MAX_TGROUPS, nbtgroups); + return -1; + } +#endif + + HA_DIAG_WARNING_COND(global.nbtgroups, + "parsing [%s:%d] : '%s' is already defined and will be overridden.\n", + file, line, args[0]); + + global.nbtgroups = nbtgroups; + return 0; +} + +/* config keyword parsers */ +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_GLOBAL, "nbthread", cfg_parse_nbthread, 0 }, + { CFG_GLOBAL, "thread-group", cfg_parse_thread_group, 0 }, + { CFG_GLOBAL, "thread-groups", cfg_parse_thread_groups, 0 }, + { 0, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); diff --git a/src/time.c b/src/time.c new file mode 100644 index 0000000..280b522 --- /dev/null +++ b/src/time.c @@ -0,0 +1,147 @@ +/* + * Time calculation functions. + * + * Copyright 2000-2011 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <sys/time.h> + +#include <haproxy/api.h> +#include <haproxy/time.h> + + +/* + * adds <ms> ms to <from>, set the result to <tv> and returns a pointer <tv> + */ +struct timeval *_tv_ms_add(struct timeval *tv, const struct timeval *from, int ms) +{ + tv->tv_usec = from->tv_usec + (ms % 1000) * 1000; + tv->tv_sec = from->tv_sec + (ms / 1000); + while (tv->tv_usec >= 1000000) { + tv->tv_usec -= 1000000; + tv->tv_sec++; + } + return tv; +} + +/* + * compares <tv1> and <tv2> modulo 1ms: returns 0 if equal, -1 if tv1 < tv2, 1 if tv1 > tv2 + * Must not be used when either argument is eternity. Use tv_ms_cmp2() for that. 
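
A worked example of the tv_usec carry performed by _tv_ms_add() above, as a standalone copy of the same normalization loop: 1s 999500us plus 600ms yields 2s 599500us.

    #include <stdio.h>
    #include <sys/time.h>

    int main(void)
    {
            struct timeval from = { .tv_sec = 1, .tv_usec = 999500 };
            struct timeval tv;
            int ms = 600;

            /* same arithmetic as _tv_ms_add(): split ms, then carry */
            tv.tv_usec = from.tv_usec + (ms % 1000) * 1000;
            tv.tv_sec = from.tv_sec + (ms / 1000);
            while (tv.tv_usec >= 1000000) {
                    tv.tv_usec -= 1000000;
                    tv.tv_sec++;
            }
            printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec); /* 2.599500 */
            return 0;
    }
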
+ */ +int _tv_ms_cmp(const struct timeval *tv1, const struct timeval *tv2) +{ + return __tv_ms_cmp(tv1, tv2); +} + +/* + * compares <tv1> and <tv2> modulo 1 ms: returns 0 if equal, -1 if tv1 < tv2, 1 if tv1 > tv2, + * assuming that TV_ETERNITY is greater than everything. + */ +int _tv_ms_cmp2(const struct timeval *tv1, const struct timeval *tv2) +{ + return __tv_ms_cmp2(tv1, tv2); +} + +/* + * compares <tv1> and <tv2> modulo 1 ms: returns 1 if tv1 <= tv2, 0 if tv1 > tv2, + * assuming that TV_ETERNITY is greater than everything. Returns 0 if tv1 is + * TV_ETERNITY, and always assumes that tv2 != TV_ETERNITY. Designed to replace + * occurrences of (tv_ms_cmp2(tv,now) <= 0). + */ +int _tv_ms_le2(const struct timeval *tv1, const struct timeval *tv2) +{ + return __tv_ms_le2(tv1, tv2); +} + +/* + * returns the remaining time between tv1=now and event=tv2 + * if tv2 is passed, 0 is returned. + * Must not be used when either argument is eternity. + */ +unsigned long _tv_ms_remain(const struct timeval *tv1, const struct timeval *tv2) +{ + return __tv_ms_remain(tv1, tv2); +} + +/* + * returns the remaining time between tv1=now and event=tv2 + * if tv2 is passed, 0 is returned. + * Returns TIME_ETERNITY if tv2 is eternity. + */ +unsigned long _tv_ms_remain2(const struct timeval *tv1, const struct timeval *tv2) +{ + if (tv_iseternity(tv2)) + return TIME_ETERNITY; + + return __tv_ms_remain(tv1, tv2); +} + +/* + * Returns the time in ms elapsed between tv1 and tv2, assuming that tv1<=tv2. + * Must not be used when either argument is eternity. + */ +unsigned long _tv_ms_elapsed(const struct timeval *tv1, const struct timeval *tv2) +{ + return __tv_ms_elapsed(tv1, tv2); +} + +/* + * adds <inc> to <from>, set the result to <tv> and returns a pointer <tv> + */ +struct timeval *_tv_add(struct timeval *tv, const struct timeval *from, const struct timeval *inc) +{ + return __tv_add(tv, from, inc); +} + +/* + * If <inc> is set, then add it to <from> and set the result to <tv>, then + * return 1, otherwise return 0. It is meant to be used in if conditions. + */ +int _tv_add_ifset(struct timeval *tv, const struct timeval *from, const struct timeval *inc) +{ + return __tv_add_ifset(tv, from, inc); +} + +/* + * Computes the remaining time between tv1=now and event=tv2. if tv2 is passed, + * 0 is returned. The result is stored into tv. + */ +struct timeval *_tv_remain(const struct timeval *tv1, const struct timeval *tv2, struct timeval *tv) +{ + return __tv_remain(tv1, tv2, tv); +} + +/* + * Computes the remaining time between tv1=now and event=tv2. if tv2 is passed, + * 0 is returned. The result is stored into tv. Returns ETERNITY if tv2 is + * eternity. + */ +struct timeval *_tv_remain2(const struct timeval *tv1, const struct timeval *tv2, struct timeval *tv) +{ + return __tv_remain2(tv1, tv2, tv); +} + +/* tv_isle: compares <tv1> and <tv2> : returns 1 if tv1 <= tv2, otherwise 0 */ +int _tv_isle(const struct timeval *tv1, const struct timeval *tv2) +{ + return __tv_isle(tv1, tv2); +} + +/* tv_isgt: compares <tv1> and <tv2> : returns 1 if tv1 > tv2, otherwise 0 */ +int _tv_isgt(const struct timeval *tv1, const struct timeval *tv2) +{ + return __tv_isgt(tv1, tv2); +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/tools.c b/src/tools.c new file mode 100644 index 0000000..b2814b5 --- /dev/null +++ b/src/tools.c @@ -0,0 +1,6348 @@ +/* + * General purpose functions. 
+ * + * Copyright 2000-2010 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#if (defined(__ELF__) && !defined(__linux__)) || defined(USE_DL) +#define _GNU_SOURCE +#include <dlfcn.h> +#include <link.h> +#endif + +#if defined(__FreeBSD__) +#include <elf.h> +#include <dlfcn.h> +extern void *__elf_aux_vector; +#endif + +#if defined(__NetBSD__) +#include <sys/exec_elf.h> +#include <dlfcn.h> +#endif + +#include <ctype.h> +#include <errno.h> +#include <netdb.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/un.h> +#include <netinet/in.h> +#include <arpa/inet.h> + +#if defined(__linux__) && defined(__GLIBC__) && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 16)) +#include <sys/auxv.h> +#endif + +#include <import/eb32sctree.h> +#include <import/eb32tree.h> +#include <import/ebmbtree.h> + +#include <haproxy/api.h> +#include <haproxy/applet.h> +#include <haproxy/chunk.h> +#include <haproxy/dgram.h> +#include <haproxy/global.h> +#include <haproxy/hlua.h> +#include <haproxy/listener.h> +#include <haproxy/namespace.h> +#include <haproxy/net_helper.h> +#include <haproxy/protocol.h> +#include <haproxy/quic_sock.h> +#include <haproxy/resolvers.h> +#include <haproxy/sc_strm.h> +#include <haproxy/sock.h> +#include <haproxy/ssl_sock.h> +#include <haproxy/ssl_utils.h> +#include <haproxy/stconn.h> +#include <haproxy/task.h> +#include <haproxy/tools.h> +#include <haproxy/xxhash.h> + +/* This macro returns false if the test __x is false. Many + * of the following parsing functions must abort the processing + * if it returns 0, so this macro is useful for writing light code. + */ +#define RET0_UNLESS(__x) do { if (!(__x)) return 0; } while (0) + +/* Define the number of lines of hash_word */ +#define NB_L_HASH_WORD 15 + +/* return the hash of a string and length for a given key. All keys are valid. */ +#define HA_ANON(key, str, len) (XXH32(str, len, key) & 0xFFFFFF) + +/* enough to store NB_ITOA_STR integers of : + * 2^64-1 = 18446744073709551615 or + * -2^63 = -9223372036854775808 + * + * The HTML version needs room for adding the 25 characters + * '<span class="rls"></span>' around digits at positions 3N+1 in order + * to add spacing at up to 6 positions : 18 446 744 073 709 551 615 + */ +THREAD_LOCAL char itoa_str[NB_ITOA_STR][171]; +THREAD_LOCAL int itoa_idx = 0; /* index of next itoa_str to use */ + +/* sometimes we'll need to quote strings (eg: in stats), and we don't expect + * to quote strings larger than a max configuration line. + */ +THREAD_LOCAL char quoted_str[NB_QSTR][QSTR_SIZE + 1]; +THREAD_LOCAL int quoted_idx = 0; + +/* thread-local PRNG state. It's modified to start from a different sequence + * on all threads upon startup. It must not be used for anything beyond getting + * statistical values as it's 100% predictable. + */ +THREAD_LOCAL unsigned int statistical_prng_state = 2463534242U; +
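
The initial value 2463534242 is the seed from Marsaglia's xorshift paper, which suggests the kind of generator this state feeds. A minimal xorshift32 sketch; the 13/17/5 shift triplet is the canonical one from the paper and is an assumption here, not copied from this file:

    /* Marsaglia xorshift32: fast, statistically fine, trivially predictable,
     * hence only suitable for the "statistical values" use stated above.
     */
    static unsigned int prng_state = 2463534242U;

    static unsigned int xorshift32(void)
    {
            unsigned int x = prng_state;

            x ^= x << 13;
            x ^= x >> 17;
            x ^= x << 5;
            return prng_state = x;
    }
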
+/* set to true if this is a static build */ +int build_is_static = 0; + +/* A global static table to store hashed words */ +static THREAD_LOCAL char hash_word[NB_L_HASH_WORD][20]; +static THREAD_LOCAL int index_hash = 0; + +/* + * unsigned long long ASCII representation + * + * return the last char '\0' or NULL if not enough + * space in dst + */ +char *ulltoa(unsigned long long n, char *dst, size_t size) +{ + int i = 0; + char *res; + + switch(n) { + case 1ULL ... 9ULL: + i = 0; + break; + + case 10ULL ... 99ULL: + i = 1; + break; + + case 100ULL ... 999ULL: + i = 2; + break; + + case 1000ULL ... 9999ULL: + i = 3; + break; + + case 10000ULL ... 99999ULL: + i = 4; + break; + + case 100000ULL ... 999999ULL: + i = 5; + break; + + case 1000000ULL ... 9999999ULL: + i = 6; + break; + + case 10000000ULL ... 99999999ULL: + i = 7; + break; + + case 100000000ULL ... 999999999ULL: + i = 8; + break; + + case 1000000000ULL ... 9999999999ULL: + i = 9; + break; + + case 10000000000ULL ... 99999999999ULL: + i = 10; + break; + + case 100000000000ULL ... 999999999999ULL: + i = 11; + break; + + case 1000000000000ULL ... 9999999999999ULL: + i = 12; + break; + + case 10000000000000ULL ... 99999999999999ULL: + i = 13; + break; + + case 100000000000000ULL ... 999999999999999ULL: + i = 14; + break; + + case 1000000000000000ULL ... 9999999999999999ULL: + i = 15; + break; + + case 10000000000000000ULL ... 99999999999999999ULL: + i = 16; + break; + + case 100000000000000000ULL ... 999999999999999999ULL: + i = 17; + break; + + case 1000000000000000000ULL ... 9999999999999999999ULL: + i = 18; + break; + + case 10000000000000000000ULL ... ULLONG_MAX: + i = 19; + break; + } + if (i + 2 > size) // (i + 1) + '\0' + return NULL; // too long + res = dst + i + 1; + *res = '\0'; + for (; i >= 0; i--) { + dst[i] = n % 10ULL + '0'; + n /= 10ULL; + } + return res; +} +
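
ulltoa() above sizes its output with a division-free digit count built on GCC's case-range extension. A portable loop gives the same answer (illustrative helper, not from the file):

    #include <stdio.h>

    /* Count decimal digits of n; the switch in ulltoa() computes digits-1
     * without any division.
     */
    static int dec_digits(unsigned long long n)
    {
            int d = 1;

            while (n >= 10) {
                    n /= 10;
                    d++;
            }
            return d;
    }

    int main(void)
    {
            printf("%d\n", dec_digits(18446744073709551615ULL)); /* prints 20 */
            return 0;
    }
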
+/* + * unsigned long ASCII representation + * + * return the last char '\0' or NULL if not enough + * space in dst + */ +char *ultoa_o(unsigned long n, char *dst, size_t size) +{ + int i = 0; + char *res; + + switch (n) { + case 0U ... 9UL: + i = 0; + break; + + case 10U ... 99UL: + i = 1; + break; + + case 100U ... 999UL: + i = 2; + break; + + case 1000U ... 9999UL: + i = 3; + break; + + case 10000U ... 99999UL: + i = 4; + break; + + case 100000U ... 999999UL: + i = 5; + break; + + case 1000000U ... 9999999UL: + i = 6; + break; + + case 10000000U ... 99999999UL: + i = 7; + break; + + case 100000000U ... 999999999UL: + i = 8; + break; +#if __WORDSIZE == 32 + + case 1000000000ULL ... ULONG_MAX: + i = 9; + break; + +#elif __WORDSIZE == 64 + + case 1000000000ULL ... 9999999999UL: + i = 9; + break; + + case 10000000000ULL ... 99999999999UL: + i = 10; + break; + + case 100000000000ULL ... 999999999999UL: + i = 11; + break; + + case 1000000000000ULL ... 9999999999999UL: + i = 12; + break; + + case 10000000000000ULL ... 99999999999999UL: + i = 13; + break; + + case 100000000000000ULL ... 999999999999999UL: + i = 14; + break; + + case 1000000000000000ULL ... 9999999999999999UL: + i = 15; + break; + + case 10000000000000000ULL ... 99999999999999999UL: + i = 16; + break; + + case 100000000000000000ULL ... 999999999999999999UL: + i = 17; + break; + + case 1000000000000000000ULL ... 9999999999999999999UL: + i = 18; + break; + + case 10000000000000000000ULL ... ULONG_MAX: + i = 19; + break; + +#endif + } + if (i + 2 > size) // (i + 1) + '\0' + return NULL; // too long + res = dst + i + 1; + *res = '\0'; + for (; i >= 0; i--) { + dst[i] = n % 10U + '0'; + n /= 10U; + } + return res; +} + +/* + * signed long ASCII representation + * + * return the last char '\0' or NULL if not enough + * space in dst + */ +char *ltoa_o(long int n, char *dst, size_t size) +{ + char *pos = dst; + + if (n < 0) { + if (size < 3) + return NULL; // min size is '-' + digit + '\0' but another test in ultoa + *pos = '-'; + pos++; + dst = ultoa_o(-n, pos, size - 1); + } else { + dst = ultoa_o(n, dst, size); + } + return dst; +} + +/* + * signed long long ASCII representation + * + * return the last char '\0' or NULL if not enough + * space in dst + */ +char *lltoa(long long n, char *dst, size_t size) +{ + char *pos = dst; + + if (n < 0) { + if (size < 3) + return NULL; // min size is '-' + digit + '\0' but another test in ulltoa + *pos = '-'; + pos++; + dst = ulltoa(-n, pos, size - 1); + } else { + dst = ulltoa(n, dst, size); + } + return dst; +} + +/* + * write an ascii representation of an unsigned int into dst, + * return a pointer to the last character + * Pad the ascii representation with '0', using size. + */ +char *utoa_pad(unsigned int n, char *dst, size_t size) +{ + int i = 0; + char *ret; + + switch(n) { + case 0U ... 9U: + i = 0; + break; + + case 10U ... 99U: + i = 1; + break; + + case 100U ... 999U: + i = 2; + break; + + case 1000U ... 9999U: + i = 3; + break; + + case 10000U ... 99999U: + i = 4; + break; + + case 100000U ... 999999U: + i = 5; + break; + + case 1000000U ... 9999999U: + i = 6; + break; + + case 10000000U ... 99999999U: + i = 7; + break; + + case 100000000U ... 999999999U: + i = 8; + break; + + case 1000000000U ... 4294967295U: + i = 9; + break; + } + if (i + 2 > size) // (i + 1) + '\0' + return NULL; // too long + i = size - 2; // padding - '\0' + + ret = dst + i + 1; + *ret = '\0'; + for (; i >= 0; i--) { + dst[i] = n % 10U + '0'; + n /= 10U; + } + return ret; +} + +/* + * copies at most <size-1> chars from <src> to <dst>. Last char is always + * set to 0, unless <size> is 0. The number of chars copied is returned + * (excluding the terminating zero). + * This code has been optimized for size and speed : on x86, it's 45 bytes + * long, uses only registers, and consumes only 4 cycles per char. + */ +int strlcpy2(char *dst, const char *src, int size) +{ + char *orig = dst; + if (size) { + while (--size && (*dst = *src)) { + src++; dst++; + } + *dst = 0; + } + return dst - orig; +} + +/* + * This function simply returns a locally allocated string containing + * the ascii representation for number 'n' in decimal. + */ +char *ultoa_r(unsigned long n, char *buffer, int size) +{ + char *pos; + + pos = buffer + size - 1; + *pos-- = '\0'; + + do { + *pos-- = '0' + n % 10; + n /= 10; + } while (n && pos >= buffer); + return pos + 1; +} +
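
ultoa_r() fills its buffer from the end, which avoids both a digit-count pass and a final string reversal. The pattern in isolation (a sketch):

    #include <stdio.h>

    /* Render <n> in decimal by writing digits backwards from the end of
     * <buffer>, as ultoa_r() above does; returns the start of the string.
     */
    static char *utoa_rev(unsigned long n, char *buffer, int size)
    {
            char *pos = buffer + size - 1;

            *pos-- = '\0';
            do {
                    *pos-- = '0' + n % 10;
                    n /= 10;
            } while (n && pos >= buffer);
            return pos + 1;
    }

    /* char buf[24]; puts(utoa_rev(1234, buf, sizeof(buf))); prints "1234" */
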
+/* + * This function simply returns a locally allocated string containing + * the ascii representation for number 'n' in decimal. + */ +char *lltoa_r(long long int in, char *buffer, int size) +{ + char *pos; + int neg = 0; + unsigned long long int n; + + pos = buffer + size - 1; + *pos-- = '\0'; + + if (in < 0) { + neg = 1; + n = -in; + } + else + n = in; + + do { + *pos-- = '0' + n % 10; + n /= 10; + } while (n && pos >= buffer); + if (neg && pos > buffer) + *pos-- = '-'; + return pos + 1; +} + +/* + * This function simply returns a locally allocated string containing + * the ascii representation for signed number 'n' in decimal. + */ +char *sltoa_r(long n, char *buffer, int size) +{ + char *pos; + + if (n >= 0) + return ultoa_r(n, buffer, size); + + pos = ultoa_r(-n, buffer + 1, size - 1) - 1; + *pos = '-'; + return pos; +} + +/* + * This function simply returns a locally allocated string containing + * the ascii representation for number 'n' in decimal, formatted for + * HTML output with tags to create visual grouping by 3 digits. The + * output needs to support at least 171 characters. + */ +const char *ulltoh_r(unsigned long long n, char *buffer, int size) +{ + char *start; + int digit = 0; + + start = buffer + size; + *--start = '\0'; + + do { + if (digit == 3 && start >= buffer + 7) + memcpy(start -= 7, "</span>", 7); + + if (start >= buffer + 1) { + *--start = '0' + n % 10; + n /= 10; + } + + if (digit == 3 && start >= buffer + 18) + memcpy(start -= 18, "<span class=\"rls\">", 18); + + if (digit++ == 3) + digit = 1; + } while (n && start > buffer); + return start; +} + +/* + * This function simply returns a locally allocated string containing the ascii + * representation for number 'n' in decimal, unless n is 0 in which case it + * returns the alternate string (or an empty string if the alternate string is + * NULL). Its use is intended for limits reported in reports, where it's + * desirable not to display anything if there is no limit. Warning! it shares + * the same vector as ultoa_r(). + */ +const char *limit_r(unsigned long n, char *buffer, int size, const char *alt) +{ + return (n) ? ultoa_r(n, buffer, size) : (alt ? alt : ""); +} + +/* Trims the first "%f" float in a string to its minimum number of digits after + * the decimal point by trimming trailing zeroes, even dropping the decimal + * point if not needed. The string is in <buffer> of length <len>, and the + * number is expected to start at or after position <num_start> (the first + * point appearing there is considered). A NUL character is always placed at + * the end if some trimming occurs. The new buffer length is returned. + */ +size_t flt_trim(char *buffer, size_t num_start, size_t len) +{ + char *end = buffer + len; + char *p = buffer + num_start; + char *trim; + + do { + if (p >= end) + return len; + trim = p++; + } while (*trim != '.'); + + /* For now <trim> is on the decimal point. Let's look for any other + * meaningful digit after it. + */ + while (p < end) { + if (*p++ != '0') + trim = p; + } + + if (trim < end) + *trim = 0; + + return trim - buffer; +} + +/* + * This function simply returns a locally allocated string containing + * the ascii representation for number 'n' in decimal with useless trailing + * zeroes trimmed. + */ +char *ftoa_r(double n, char *buffer, int size) +{ + flt_trim(buffer, 0, snprintf(buffer, size, "%f", n)); + return buffer; +} +
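
flt_trim() exists so that ftoa_r() can print with plain "%f" and then discard the noise: "2.500000" becomes "2.5" and "3.000000" becomes "3". A standalone demo of the same trimming idea (re-implemented here, not calling the functions above):

    #include <stdio.h>
    #include <string.h>

    static void trim_demo(double v)
    {
            char buf[64];
            int len = snprintf(buf, sizeof(buf), "%f", v);
            char *dot = strchr(buf, '.');

            if (dot) {
                    char *end = buf + len;
                    char *trim = dot;       /* cut here if only zeroes follow */
                    char *p = dot + 1;

                    while (p < end) {
                            if (*p != '0')
                                    trim = p + 1; /* keep up to last non-zero */
                            p++;
                    }
                    *trim = '\0';
            }
            puts(buf);
    }

    /* trim_demo(2.5) prints "2.5", trim_demo(3.0) prints "3" */
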
+/* returns a locally allocated string containing the quoted encoding of the + * input string. The output may be truncated to QSTR_SIZE chars, but it is + * guaranteed that the string will always be properly terminated. Quotes are + * encoded by doubling them as is commonly done in CSV files. QSTR_SIZE must + * always be at least 4 chars. + */ +const char *qstr(const char *str) +{ + char *ret = quoted_str[quoted_idx]; + char *p, *end; + + if (++quoted_idx >= NB_QSTR) + quoted_idx = 0; + + p = ret; + end = ret + QSTR_SIZE; + + *p++ = '"'; + + /* always keep 3 chars to support passing "" and the ending " */ + while (*str && p < end - 3) { + if (*str == '"') { + *p++ = '"'; + *p++ = '"'; + } + else + *p++ = *str; + str++; + } + *p++ = '"'; + return ret; +} + +/* + * Returns non-zero if character <s> is a hex digit (0-9, a-f, A-F), else zero. + * + * It looks like this one would be a good candidate for inlining, but this is + * not interesting because it is around 35 bytes long and often called multiple + * times within the same function. + */ +int ishex(char s) +{ + s -= '0'; + if ((unsigned char)s <= 9) + return 1; + s -= 'A' - '0'; + if ((unsigned char)s <= 5) + return 1; + s -= 'a' - 'A'; + if ((unsigned char)s <= 5) + return 1; + return 0; +} + +/* rounds <i> down to the closest value having max 2 digits */ +unsigned int round_2dig(unsigned int i) +{ + unsigned int mul = 1; + + while (i >= 100) { + i /= 10; + mul *= 10; + } + return i * mul; +} + +/* + * Checks <name> for invalid characters. Valid chars are [A-Za-z0-9_:.-]. If an + * invalid character is found, a pointer to it is returned. If everything is + * fine, NULL is returned. + */ +const char *invalid_char(const char *name) +{ + if (!*name) + return name; + + while (*name) { + if (!isalnum((unsigned char)*name) && *name != '.' && *name != ':' && + *name != '_' && *name != '-') + return name; + name++; + } + return NULL; +} + +/* + * Checks <name> for invalid characters. Valid chars are [_.-] and those + * accepted by <f> function. + * If an invalid character is found, a pointer to it is returned. + * If everything is fine, NULL is returned. + */ +static inline const char *__invalid_char(const char *name, int (*f)(int)) { + + if (!*name) + return name; + + while (*name) { + if (!f((unsigned char)*name) && *name != '.' && + *name != '_' && *name != '-') + return name; + + name++; + } + + return NULL; +} + +/* + * Checks <name> for invalid characters. Valid chars are [A-Za-z0-9_.-]. + * If an invalid character is found, a pointer to it is returned. + * If everything is fine, NULL is returned. + */ +const char *invalid_domainchar(const char *name) { + return __invalid_char(name, isalnum); +} + +/* + * Checks <name> for invalid characters. Valid chars are [A-Za-z0-9_.-]. + * If an invalid character is found, a pointer to it is returned. + * If everything is fine, NULL is returned. + */ +const char *invalid_prefix_char(const char *name) { + return __invalid_char(name, isalnum); +} +
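
ishex() relies on the unsigned-subtraction range trick: after s -= '0', a single unsigned comparison covers the whole '0'..'9' range, and the two letter ranges are handled the same way with further shifts. The trick in isolation:

    /* c is in ['0'..'9'] iff the unsigned difference c - '0' is <= 9:
     * anything below '0' wraps around to a huge value and fails the test.
     */
    static int is_digit_fast(char c)
    {
            return (unsigned char)(c - '0') <= 9;
    }
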
+/* + * converts <str> to a struct sockaddr_storage* provided by the caller. The + * caller must have zeroed <sa> first, and may have set sa->ss_family to force + * parse a specific address format. If the ss_family is 0 or AF_UNSPEC, then + * the function tries to guess the address family from the syntax. If the + * family is forced and the format doesn't match, an error is returned. The + * string is assumed to contain only an address, no port. The address can be a + * dotted IPv4 address, an IPv6 address, a host name, or empty or "*" to + * indicate INADDR_ANY. NULL is returned if the host part cannot be resolved. + * The return address will only have the address family and the address set, + * all other fields remain zero. The string is not supposed to be modified. + * The IPv6 '::' address is IN6ADDR_ANY. If <resolve> is non-zero, the hostname + * is resolved, otherwise only IP addresses are resolved, and anything else + * returns NULL. If the address contains a port, this one is preserved. + */ +struct sockaddr_storage *str2ip2(const char *str, struct sockaddr_storage *sa, int resolve) +{ + struct hostent *he; + /* max IPv6 length, including brackets and terminating NULL */ + char tmpip[48]; + int port = get_host_port(sa); + + /* check IPv6 with square brackets */ + if (str[0] == '[') { + size_t iplength = strlen(str); + + if (iplength < 4) { + /* minimal size is 4 when using brackets "[::]" */ + goto fail; + } + else if (iplength >= sizeof(tmpip)) { + /* IPv6 literal cannot be larger than tmpip */ + goto fail; + } + else { + if (str[iplength - 1] != ']') { + /* if address started with bracket, it should end with bracket */ + goto fail; + } + else { + memcpy(tmpip, str + 1, iplength - 2); + tmpip[iplength - 2] = '\0'; + str = tmpip; + } + } + } + + /* Any IPv6 address */ + if (str[0] == ':' && str[1] == ':' && !str[2]) { + if (!sa->ss_family || sa->ss_family == AF_UNSPEC) + sa->ss_family = AF_INET6; + else if (sa->ss_family != AF_INET6) + goto fail; + set_host_port(sa, port); + return sa; + } + + /* Any address for the family, defaults to IPv4 */ + if (!str[0] || (str[0] == '*' && !str[1])) { + if (!sa->ss_family || sa->ss_family == AF_UNSPEC) + sa->ss_family = AF_INET; + set_host_port(sa, port); + return sa; + } + + /* check for IPv6 first */ + if ((!sa->ss_family || sa->ss_family == AF_UNSPEC || sa->ss_family == AF_INET6) && + inet_pton(AF_INET6, str, &((struct sockaddr_in6 *)sa)->sin6_addr)) { + sa->ss_family = AF_INET6; + set_host_port(sa, port); + return sa; + } + + /* then check for IPv4 */ + if ((!sa->ss_family || sa->ss_family == AF_UNSPEC || sa->ss_family == AF_INET) && + inet_pton(AF_INET, str, &((struct sockaddr_in *)sa)->sin_addr)) { + sa->ss_family = AF_INET; + set_host_port(sa, port); + return sa; + } + + if (!resolve) + return NULL; + + if (!resolv_hostname_validation(str, NULL)) + return NULL; + +#ifdef USE_GETADDRINFO + if (global.tune.options & GTUNE_USE_GAI) { + struct addrinfo hints, *result; + int success = 0; + + memset(&result, 0, sizeof(result)); + memset(&hints, 0, sizeof(hints)); + hints.ai_family = sa->ss_family ?
sa->ss_family : AF_UNSPEC; + hints.ai_socktype = SOCK_DGRAM; + hints.ai_flags = 0; + hints.ai_protocol = 0; + + if (getaddrinfo(str, NULL, &hints, &result) == 0) { + if (!sa->ss_family || sa->ss_family == AF_UNSPEC) + sa->ss_family = result->ai_family; + else if (sa->ss_family != result->ai_family) { + freeaddrinfo(result); + goto fail; + } + + switch (result->ai_family) { + case AF_INET: + memcpy((struct sockaddr_in *)sa, result->ai_addr, result->ai_addrlen); + set_host_port(sa, port); + success = 1; + break; + case AF_INET6: + memcpy((struct sockaddr_in6 *)sa, result->ai_addr, result->ai_addrlen); + set_host_port(sa, port); + success = 1; + break; + } + } + + if (result) + freeaddrinfo(result); + + if (success) + return sa; + } +#endif + /* try to resolve an IPv4/IPv6 hostname */ + he = gethostbyname(str); + if (he) { + if (!sa->ss_family || sa->ss_family == AF_UNSPEC) + sa->ss_family = he->h_addrtype; + else if (sa->ss_family != he->h_addrtype) + goto fail; + + switch (sa->ss_family) { + case AF_INET: + ((struct sockaddr_in *)sa)->sin_addr = *(struct in_addr *) *(he->h_addr_list); + set_host_port(sa, port); + return sa; + case AF_INET6: + ((struct sockaddr_in6 *)sa)->sin6_addr = *(struct in6_addr *) *(he->h_addr_list); + set_host_port(sa, port); + return sa; + } + } + + /* unsupported address family */ + fail: + return NULL; +} + +/* + * Converts <str> to a locally allocated struct sockaddr_storage *, and a port + * range or offset consisting in two integers that the caller will have to + * check to find the relevant input format. The following format are supported : + * + * String format | address | port | low | high + * addr | <addr> | 0 | 0 | 0 + * addr: | <addr> | 0 | 0 | 0 + * addr:port | <addr> | <port> | <port> | <port> + * addr:pl-ph | <addr> | <pl> | <pl> | <ph> + * addr:+port | <addr> | <port> | 0 | <port> + * addr:-port | <addr> |-<port> | <port> | 0 + * + * The detection of a port range or increment by the caller is made by + * comparing <low> and <high>. If both are equal, then port 0 means no port + * was specified. The caller may pass NULL for <low> and <high> if it is not + * interested in retrieving port ranges. + * + * Note that <addr> above may also be : + * - empty ("") => family will be AF_INET and address will be INADDR_ANY + * - "*" => family will be AF_INET and address will be INADDR_ANY + * - "::" => family will be AF_INET6 and address will be IN6ADDR_ANY + * - a host name => family and address will depend on host name resolving. + * + * A prefix may be passed in before the address above to force the family : + * - "ipv4@" => force address to resolve as IPv4 and fail if not possible. + * - "ipv6@" => force address to resolve as IPv6 and fail if not possible. + * - "unix@" => force address to be a path to a UNIX socket even if the + * path does not start with a '/' + * - 'abns@' -> force address to belong to the abstract namespace (Linux + * only). These sockets are just like Unix sockets but without + * the need for an underlying file system. The address is a + * string. Technically it's like a Unix socket with a zero in + * the first byte of the address. + * - "fd@" => an integer must follow, and is a file descriptor number. + * + * IPv6 addresses can be declared with or without square brackets. When using + * square brackets for IPv6 addresses, the port separator (colon) is optional. + * If not using square brackets, and in order to avoid any ambiguity with + * IPv6 addresses, the last colon ':' is mandatory even when no port is specified. 
+ * NULL is returned if the address cannot be parsed. The <low> and <high> ports + * are always initialized if non-null, even for non-IP families. + * + * If <pfx> is non-null, it is used as a string prefix before any path-based + * address (typically the path to a unix socket). + * + * if <fqdn> is non-null, it will be filled with : + * - a pointer to the FQDN of the server name to resolve if there's one, and + * that the caller will have to free(), + * - NULL if there was an explicit address that doesn't require resolution. + * + * Hostnames are only resolved if <opts> has PA_O_RESOLVE. Otherwise <fqdn> is + * still honored so it is possible for the caller to know whether a resolution + * failed by clearing this flag and checking if <fqdn> was filled, indicating + * the need for a resolution. + * + * When a file descriptor is passed, its value is put into the s_addr part of + * the address when cast to sockaddr_in and the address family is + * AF_CUST_EXISTING_FD. + * + * The matching protocol will be set into <proto> if non-null. + * The address protocol and transport types hints which are directly resolved + * will be set into <sa_type> if not NULL. + * + * Any known file descriptor is also assigned to <fd> if non-null, otherwise it + * is forced to -1. + */ +struct sockaddr_storage *str2sa_range(const char *str, int *port, int *low, int *high, int *fd, + struct protocol **proto, struct net_addr_type *sa_type, + char **err, const char *pfx, char **fqdn, unsigned int opts) +{ + static THREAD_LOCAL struct sockaddr_storage ss; + struct sockaddr_storage *ret = NULL; + struct protocol *new_proto = NULL; + char *back, *str2; + char *port1, *port2; + int portl, porth, porta; + int abstract = 0; + int new_fd = -1; + enum proto_type proto_type = 0; // to shut gcc warning + int ctrl_type = 0; // to shut gcc warning + + portl = porth = porta = 0; + if (fqdn) + *fqdn = NULL; + + str2 = back = env_expand(strdup(str)); + if (str2 == NULL) { + memprintf(err, "out of memory in '%s'", __FUNCTION__); + goto out; + } + + if (!*str2) { + memprintf(err, "'%s' resolves to an empty address (environment variable missing?)", str); + goto out; + } + + memset(&ss, 0, sizeof(ss)); + + /* prepare the default socket types */ + if ((opts & (PA_O_STREAM|PA_O_DGRAM)) == PA_O_DGRAM || + ((opts & (PA_O_STREAM|PA_O_DGRAM)) == (PA_O_DGRAM|PA_O_STREAM) && (opts & PA_O_DEFAULT_DGRAM))) { + proto_type = PROTO_TYPE_DGRAM; + ctrl_type = SOCK_DGRAM; + } else { + proto_type = PROTO_TYPE_STREAM; + ctrl_type = SOCK_STREAM; + } + + if (strncmp(str2, "stream+", 7) == 0) { + str2 += 7; + proto_type = PROTO_TYPE_STREAM; + ctrl_type = SOCK_STREAM; + } + else if (strncmp(str2, "dgram+", 6) == 0) { + str2 += 6; + proto_type = PROTO_TYPE_DGRAM; + ctrl_type = SOCK_DGRAM; + } + else if (strncmp(str2, "quic+", 5) == 0) { + str2 += 5; + proto_type = PROTO_TYPE_DGRAM; + ctrl_type = SOCK_STREAM; + } + + if (strncmp(str2, "unix@", 5) == 0) { + str2 += 5; + abstract = 0; + ss.ss_family = AF_UNIX; + } + else if (strncmp(str2, "uxdg@", 5) == 0) { + str2 += 5; + abstract = 0; + ss.ss_family = AF_UNIX; + proto_type = PROTO_TYPE_DGRAM; + ctrl_type = SOCK_DGRAM; + } + else if (strncmp(str2, "uxst@", 5) == 0) { + str2 += 5; + abstract = 0; + ss.ss_family = AF_UNIX; + proto_type = PROTO_TYPE_STREAM; + ctrl_type = SOCK_STREAM; + } + else if (strncmp(str2, "abns@", 5) == 0) { + str2 += 5; + abstract = 1; + ss.ss_family = AF_UNIX; + } + else if (strncmp(str2, "ip@", 3) == 0) { + str2 += 3; + ss.ss_family = AF_UNSPEC; + } + else if (strncmp(str2, "ipv4@", 5) 
== 0) { + str2 += 5; + ss.ss_family = AF_INET; + } + else if (strncmp(str2, "ipv6@", 5) == 0) { + str2 += 5; + ss.ss_family = AF_INET6; + } + else if (strncmp(str2, "tcp4@", 5) == 0) { + str2 += 5; + ss.ss_family = AF_INET; + proto_type = PROTO_TYPE_STREAM; + ctrl_type = SOCK_STREAM; + } + else if (strncmp(str2, "udp4@", 5) == 0) { + str2 += 5; + ss.ss_family = AF_INET; + proto_type = PROTO_TYPE_DGRAM; + ctrl_type = SOCK_DGRAM; + } + else if (strncmp(str2, "tcp6@", 5) == 0) { + str2 += 5; + ss.ss_family = AF_INET6; + proto_type = PROTO_TYPE_STREAM; + ctrl_type = SOCK_STREAM; + } + else if (strncmp(str2, "udp6@", 5) == 0) { + str2 += 5; + ss.ss_family = AF_INET6; + proto_type = PROTO_TYPE_DGRAM; + ctrl_type = SOCK_DGRAM; + } + else if (strncmp(str2, "tcp@", 4) == 0) { + str2 += 4; + ss.ss_family = AF_UNSPEC; + proto_type = PROTO_TYPE_STREAM; + ctrl_type = SOCK_STREAM; + } + else if (strncmp(str2, "udp@", 4) == 0) { + str2 += 4; + ss.ss_family = AF_UNSPEC; + proto_type = PROTO_TYPE_DGRAM; + ctrl_type = SOCK_DGRAM; + } + else if (strncmp(str2, "quic4@", 6) == 0) { + str2 += 6; + ss.ss_family = AF_INET; + proto_type = PROTO_TYPE_DGRAM; + ctrl_type = SOCK_STREAM; + } + else if (strncmp(str2, "quic6@", 6) == 0) { + str2 += 6; + ss.ss_family = AF_INET6; + proto_type = PROTO_TYPE_DGRAM; + ctrl_type = SOCK_STREAM; + } + else if (strncmp(str2, "fd@", 3) == 0) { + str2 += 3; + ss.ss_family = AF_CUST_EXISTING_FD; + } + else if (strncmp(str2, "sockpair@", 9) == 0) { + str2 += 9; + ss.ss_family = AF_CUST_SOCKPAIR; + } + else if (strncmp(str2, "rhttp@", 3) == 0) { + /* TODO duplicated code from check_kw_experimental() */ + if (!experimental_directives_allowed) { + memprintf(err, "Address '%s' is experimental, must be allowed via a global 'expose-experimental-directives'", str2); + goto out; + } + mark_tainted(TAINTED_CONFIG_EXP_KW_DECLARED); + + str2 += 4; + ss.ss_family = AF_CUST_RHTTP_SRV; + } + else if (*str2 == '/') { + ss.ss_family = AF_UNIX; + } + else + ss.ss_family = AF_UNSPEC; + + if (ss.ss_family == AF_CUST_SOCKPAIR) { + struct sockaddr_storage ss2; + socklen_t addr_len; + char *endptr; + + new_fd = strtol(str2, &endptr, 10); + if (!*str2 || new_fd < 0 || *endptr) { + memprintf(err, "file descriptor '%s' is not a valid integer in '%s'", str2, str); + goto out; + } + + /* just verify that it's a socket */ + addr_len = sizeof(ss2); + if (getsockname(new_fd, (struct sockaddr *)&ss2, &addr_len) == -1) { + memprintf(err, "cannot use file descriptor '%d' : %s.", new_fd, strerror(errno)); + goto out; + } + + ((struct sockaddr_in *)&ss)->sin_addr.s_addr = new_fd; + ((struct sockaddr_in *)&ss)->sin_port = 0; + } + else if (ss.ss_family == AF_CUST_EXISTING_FD) { + char *endptr; + + new_fd = strtol(str2, &endptr, 10); + if (!*str2 || new_fd < 0 || *endptr) { + memprintf(err, "file descriptor '%s' is not a valid integer in '%s'", str2, str); + goto out; + } + + if (opts & PA_O_SOCKET_FD) { + socklen_t addr_len; + int type; + + addr_len = sizeof(ss); + if (getsockname(new_fd, (struct sockaddr *)&ss, &addr_len) == -1) { + memprintf(err, "cannot use file descriptor '%d' : %s.", new_fd, strerror(errno)); + goto out; + } + + addr_len = sizeof(type); + if (getsockopt(new_fd, SOL_SOCKET, SO_TYPE, &type, &addr_len) != 0 || + (type == SOCK_STREAM) != (proto_type == PROTO_TYPE_STREAM)) { + memprintf(err, "socket on file descriptor '%d' is of the wrong type.", new_fd); + goto out; + } + + porta = portl = porth = get_host_port(&ss); + } else if (opts & PA_O_RAW_FD) { + ((struct sockaddr_in *)&ss)->sin_addr.s_addr = 
new_fd; + ((struct sockaddr_in *)&ss)->sin_port = 0; + } else { + memprintf(err, "a file descriptor is not acceptable here in '%s'", str); + goto out; + } + } + else if (ss.ss_family == AF_UNIX) { + struct sockaddr_un *un = (struct sockaddr_un *)&ss; + int prefix_path_len; + int max_path_len; + int adr_len; + + /* complete unix socket path name during startup or soft-restart is + * <unix_bind_prefix><path>.<pid>.<bak|tmp> + */ + prefix_path_len = (pfx && !abstract) ? strlen(pfx) : 0; + max_path_len = (sizeof(un->sun_path) - 1) - + (abstract ? 0 : prefix_path_len + 1 + 5 + 1 + 3); + + adr_len = strlen(str2); + if (adr_len > max_path_len) { + memprintf(err, "socket path '%s' too long (max %d)", str, max_path_len); + goto out; + } + + /* when abstract==1, we skip the first zero and copy all bytes except the trailing zero */ + memset(un->sun_path, 0, sizeof(un->sun_path)); + if (prefix_path_len) + memcpy(un->sun_path, pfx, prefix_path_len); + memcpy(un->sun_path + prefix_path_len + abstract, str2, adr_len + 1 - abstract); + } + else if (ss.ss_family == AF_CUST_RHTTP_SRV) { + /* Nothing to do here. */ + } + else { /* IPv4 and IPv6 */ + char *end = str2 + strlen(str2); + char *chr; + + /* search for : or ] whatever comes first */ + for (chr = end-1; chr > str2; chr--) { + if (*chr == ']' || *chr == ':') + break; + } + + if (*chr == ':') { + /* Found a colon before a closing-bracket, must be a port separator. + * This guarantee backward compatibility. + */ + if (!(opts & PA_O_PORT_OK)) { + memprintf(err, "port specification not permitted here in '%s'", str); + goto out; + } + *chr++ = '\0'; + port1 = chr; + } + else { + /* Either no colon and no closing-bracket + * or directly ending with a closing-bracket. + * However, no port. + */ + if (opts & PA_O_PORT_MAND) { + memprintf(err, "missing port specification in '%s'", str); + goto out; + } + port1 = ""; + } + + if (isdigit((unsigned char)*port1)) { /* single port or range */ + char *endptr; + + port2 = strchr(port1, '-'); + if (port2) { + if (!(opts & PA_O_PORT_RANGE)) { + memprintf(err, "port range not permitted here in '%s'", str); + goto out; + } + *port2++ = '\0'; + } + else + port2 = port1; + portl = strtol(port1, &endptr, 10); + if (*endptr != '\0') { + memprintf(err, "invalid character '%c' in port number '%s' in '%s'", *endptr, port1, str); + goto out; + } + porth = strtol(port2, &endptr, 10); + if (*endptr != '\0') { + memprintf(err, "invalid character '%c' in port number '%s' in '%s'", *endptr, port2, str); + goto out; + } + + if (portl < !!(opts & PA_O_PORT_MAND) || portl > 65535) { + memprintf(err, "invalid port '%s'", port1); + goto out; + } + + if (porth < !!(opts & PA_O_PORT_MAND) || porth > 65535) { + memprintf(err, "invalid port '%s'", port2); + goto out; + } + + if (portl > porth) { + memprintf(err, "invalid port range '%d-%d'", portl, porth); + goto out; + } + + porta = portl; + } + else if (*port1 == '-') { /* negative offset */ + char *endptr; + + if (!(opts & PA_O_PORT_OFS)) { + memprintf(err, "port offset not permitted here in '%s'", str); + goto out; + } + portl = strtol(port1 + 1, &endptr, 10); + if (*endptr != '\0') { + memprintf(err, "invalid character '%c' in port number '%s' in '%s'", *endptr, port1 + 1, str); + goto out; + } + porta = -portl; + } + else if (*port1 == '+') { /* positive offset */ + char *endptr; + + if (!(opts & PA_O_PORT_OFS)) { + memprintf(err, "port offset not permitted here in '%s'", str); + goto out; + } + porth = strtol(port1 + 1, &endptr, 10); + if (*endptr != '\0') { + memprintf(err, "invalid 
character '%c' in port number '%s' in '%s'", *endptr, port1 + 1, str); + goto out; + } + porta = porth; + } + else if (*port1) { /* any other unexpected char */ + memprintf(err, "invalid character '%c' in port number '%s' in '%s'", *port1, port1, str); + goto out; + } + else if (opts & PA_O_PORT_MAND) { + memprintf(err, "missing port specification in '%s'", str); + goto out; + } + + /* first try to parse the IP without resolving. If it fails, it + * tells us we need to keep a copy of the FQDN to resolve later + * and to enable DNS. In this case we can proceed if <fqdn> is + * set or if PA_O_RESOLVE is set, otherwise it's an error. + */ + if (str2ip2(str2, &ss, 0) == NULL) { + if ((!(opts & PA_O_RESOLVE) && !fqdn) || + ((opts & PA_O_RESOLVE) && str2ip2(str2, &ss, 1) == NULL)) { + memprintf(err, "invalid address: '%s' in '%s'", str2, str); + goto out; + } + + if (fqdn) { + if (str2 != back) + memmove(back, str2, strlen(str2) + 1); + *fqdn = back; + back = NULL; + } + } + set_host_port(&ss, porta); + } + + if (ctrl_type == SOCK_STREAM && !(opts & PA_O_STREAM)) { + memprintf(err, "stream-type address not acceptable in '%s'", str); + goto out; + } + else if (ctrl_type == SOCK_DGRAM && !(opts & PA_O_DGRAM)) { + memprintf(err, "dgram-type address not acceptable in '%s'", str); + goto out; + } + + if (proto || (opts & PA_O_CONNECT)) { + /* Note: if the caller asks for a proto, we must find one, + * except if we inherit from a raw FD (family == AF_CUST_EXISTING_FD) + * or if we return with an fqdn that will resolve later, + * in which case the address is not known yet (this is only + * for servers actually). + */ + new_proto = protocol_lookup(ss.ss_family, + proto_type, + ctrl_type == SOCK_DGRAM); + + if (!new_proto && (!fqdn || !*fqdn) && (ss.ss_family != AF_CUST_EXISTING_FD)) { + memprintf(err, "unsupported %s protocol for %s family %d address '%s'%s", + (ctrl_type == SOCK_DGRAM) ? "datagram" : "stream", + (proto_type == PROTO_TYPE_DGRAM) ? "datagram" : "stream", + ss.ss_family, + str, +#ifndef USE_QUIC + (ctrl_type == SOCK_STREAM && proto_type == PROTO_TYPE_DGRAM) + ? "; QUIC is not compiled in if this is what you were looking for." + : "" +#else + "" +#endif + ); + goto out; + } + + if ((opts & PA_O_CONNECT) && new_proto && !new_proto->connect) { + memprintf(err, "connect() not supported for this protocol family %d used by address '%s'", ss.ss_family, str); + goto out; + } + } + + ret = &ss; + out: + if (port) + *port = porta; + if (low) + *low = portl; + if (high) + *high = porth; + if (fd) + *fd = new_fd; + if (proto) + *proto = new_proto; + if (sa_type) { + sa_type->proto_type = proto_type; + sa_type->xprt_type = (ctrl_type == SOCK_DGRAM) ? PROTO_TYPE_DGRAM : PROTO_TYPE_STREAM; + } + free(back); + return ret; +} + +/* converts <addr> and <port> into a string representation of the address and port. This is sort + * of an inverse of str2sa_range, with some restrictions. The supported families are AF_INET, + * AF_INET6, AF_UNIX, and AF_CUST_SOCKPAIR. If the family is unsupported, NULL is returned. + * If map_ports is true, then the sign of the port is included in the output, to indicate it is + * relative to the incoming port. AF_INET and AF_INET6 will be in the form "<addr>:<port>". + * AF_UNIX will either be just the path (if using a pathname) or "abns@<path>" if it is abstract. + * AF_CUST_SOCKPAIR will be of the form "sockpair@<fd>". + * + * The returned char* is allocated, and it is the responsibility of the caller to free it.
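
sa2str() below leans on inet_ntop() for the numeric families. The basic call pattern, shown standalone for the IPv4 case only (a sketch, not the full function):

    #include <arpa/inet.h>
    #include <stdio.h>

    /* Format a sockaddr_in as "addr:port", the simplest slice of what
     * sa2str() handles.
     */
    static void print_ipv4(const struct sockaddr_in *sin)
    {
            char buf[INET_ADDRSTRLEN];

            if (inet_ntop(AF_INET, &sin->sin_addr, buf, sizeof(buf)))
                    printf("%s:%d\n", buf, ntohs(sin->sin_port));
    }
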
+ */ +char * sa2str(const struct sockaddr_storage *addr, int port, int map_ports) +{ + char buffer[INET6_ADDRSTRLEN]; + char *out = NULL; + const void *ptr; + const char *path; + + switch (addr->ss_family) { + case AF_INET: + ptr = &((struct sockaddr_in *)addr)->sin_addr; + break; + case AF_INET6: + ptr = &((struct sockaddr_in6 *)addr)->sin6_addr; + break; + case AF_UNIX: + path = ((struct sockaddr_un *)addr)->sun_path; + if (path[0] == '\0') { + const int max_length = sizeof(struct sockaddr_un) - offsetof(struct sockaddr_un, sun_path) - 1; + return memprintf(&out, "abns@%.*s", max_length, path+1); + } else { + return strdup(path); + } + case AF_CUST_SOCKPAIR: + return memprintf(&out, "sockpair@%d", ((struct sockaddr_in *)addr)->sin_addr.s_addr); + default: + return NULL; + } + if (inet_ntop(addr->ss_family, ptr, buffer, sizeof(buffer)) == NULL) { + BUG_ON(errno == ENOSPC); + return NULL; + } + if (map_ports) + return memprintf(&out, "%s:%+d", buffer, port); + else + return memprintf(&out, "%s:%d", buffer, port); +} + + +/* converts <str> to a struct in_addr containing a network mask. It can be + * passed in dotted form (255.255.255.0) or in CIDR form (24). It returns 1 + * if the conversion succeeds otherwise zero. + */ +int str2mask(const char *str, struct in_addr *mask) +{ + if (strchr(str, '.') != NULL) { /* dotted notation */ + if (!inet_pton(AF_INET, str, mask)) + return 0; + } + else { /* mask length */ + char *err; + unsigned long len = strtol(str, &err, 10); + + if (!*str || (err && *err) || (unsigned)len > 32) + return 0; + + len2mask4(len, mask); + } + return 1; +} + +/* converts <str> to a struct in6_addr containing a network mask. It can be + * passed in quadruplet form (ffff:ffff::) or in CIDR form (64). It returns 1 + * if the conversion succeeds otherwise zero. + */ +int str2mask6(const char *str, struct in6_addr *mask) +{ + if (strchr(str, ':') != NULL) { /* quadruplet notation */ + if (!inet_pton(AF_INET6, str, mask)) + return 0; + } + else { /* mask length */ + char *err; + unsigned long len = strtol(str, &err, 10); + + if (!*str || (err && *err) || (unsigned)len > 128) + return 0; + + len2mask6(len, mask); + } + return 1; +} + +/* convert <cidr> to struct in_addr <mask>. It returns 1 if the conversion + * succeeds otherwise zero. + */ +int cidr2dotted(int cidr, struct in_addr *mask) { + + if (cidr < 0 || cidr > 32) + return 0; + + mask->s_addr = cidr ? htonl(~0UL << (32 - cidr)) : 0; + return 1; +} + +/* Convert mask from bit length form to in_addr form. + * This function never fails. + */ +void len2mask4(int len, struct in_addr *addr) +{ + if (len >= 32) { + addr->s_addr = 0xffffffff; + return; + } + if (len <= 0) { + addr->s_addr = 0x00000000; + return; + } + addr->s_addr = 0xffffffff << (32 - len); + addr->s_addr = htonl(addr->s_addr); +} + +/* Convert mask from bit length form to in6_addr form. + * This function never fails. + */ +void len2mask6(int len, struct in6_addr *addr) +{ + len2mask4(len, (struct in_addr *)&addr->s6_addr[0]); /* msb */ + len -= 32; + len2mask4(len, (struct in_addr *)&addr->s6_addr[4]); + len -= 32; + len2mask4(len, (struct in_addr *)&addr->s6_addr[8]); + len -= 32; + len2mask4(len, (struct in_addr *)&addr->s6_addr[12]); /* lsb */ +} + +/* + * converts <str> to two struct in_addr* which must be pre-allocated. + * The format is "addr[/mask]", where "addr" cannot be empty, and mask + * is optional and either in the dotted or CIDR notation. + * Note: "addr" can also be a hostname. Returns 1 if OK, 0 if error. 
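+ *
+ * Illustrative example (editor's sketch, not upstream documentation):
+ *   struct in_addr a, m;
+ *   int ok = str2net("10.0.0.0/8", 0, &a, &m); // ok=1, m=255.0.0.0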
+ */ +int str2net(const char *str, int resolve, struct in_addr *addr, struct in_addr *mask) +{ + __label__ out_free, out_err; + char *c, *s; + int ret_val; + + s = strdup(str); + if (!s) + return 0; + + memset(mask, 0, sizeof(*mask)); + memset(addr, 0, sizeof(*addr)); + + if ((c = strrchr(s, '/')) != NULL) { + *c++ = '\0'; + /* c points to the mask */ + if (!str2mask(c, mask)) + goto out_err; + } + else { + mask->s_addr = ~0U; + } + if (!inet_pton(AF_INET, s, addr)) { + struct hostent *he; + + if (!resolve) + goto out_err; + + if ((he = gethostbyname(s)) == NULL) { + goto out_err; + } + else + *addr = *(struct in_addr *) *(he->h_addr_list); + } + + ret_val = 1; + out_free: + free(s); + return ret_val; + out_err: + ret_val = 0; + goto out_free; +} + + +/* + * converts <str> to two struct in6_addr* which must be pre-allocated. + * The format is "addr[/mask]", where "addr" cannot be empty, and mask + * is an optional number of bits (128 being the default). + * Returns 1 if OK, 0 if error. + */ +int str62net(const char *str, struct in6_addr *addr, unsigned char *mask) +{ + char *c, *s; + int ret_val = 0; + char *err; + unsigned long len = 128; + + s = strdup(str); + if (!s) + return 0; + + memset(mask, 0, sizeof(*mask)); + memset(addr, 0, sizeof(*addr)); + + if ((c = strrchr(s, '/')) != NULL) { + *c++ = '\0'; /* c points to the mask */ + if (!*c) + goto out_free; + + len = strtoul(c, &err, 10); + if ((err && *err) || (unsigned)len > 128) + goto out_free; + } + *mask = len; /* OK we have a valid mask in <len> */ + + if (!inet_pton(AF_INET6, s, addr)) + goto out_free; + + ret_val = 1; + out_free: + free(s); + return ret_val; +} + + +/* + * Parse IPv4 address found in url. Return the number of bytes parsed. It + * expects exactly 4 numbers between 0 and 255 delimited by dots, and returns + * zero in case of mismatch. + */ +int url2ipv4(const char *addr, struct in_addr *dst) +{ + int saw_digit, octets, ch; + u_char tmp[4], *tp; + const char *cp = addr; + + saw_digit = 0; + octets = 0; + *(tp = tmp) = 0; + + while (*addr) { + unsigned char digit = (ch = *addr) - '0'; + if (digit > 9 && ch != '.') + break; + addr++; + if (digit <= 9) { + u_int new = *tp * 10 + digit; + if (new > 255) + return 0; + *tp = new; + if (!saw_digit) { + if (++octets > 4) + return 0; + saw_digit = 1; + } + } else if (ch == '.' && saw_digit) { + if (octets == 4) + return 0; + *++tp = 0; + saw_digit = 0; + } else + return 0; + } + + if (octets < 4) + return 0; + + memcpy(&dst->s_addr, tmp, 4); + return addr - cp; +} + +/* + * Resolve destination server from URL. Convert <str> to a sockaddr_storage. + * <out> contains the code of the detected scheme, the start and length of + * the hostname. Currently only http and https are supported. <out> can be NULL. + * This function returns the consumed length. It is useful if you parse a complete + * url like http://host:port/path, because the consumed length corresponds to + * the first character of the path. If the conversion fails, it returns -1. + * + * This function tries to resolve the DNS name if haproxy is in starting mode. + * So, this function may be used during the configuration parsing. 
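+ *
+ * Illustrative example (editor's sketch, not upstream documentation):
+ *   struct sockaddr_storage ss;
+ *   struct split_url out;
+ *   int ret = url2sa("http://1.2.3.4:8080/path", 24, &ss, &out);
+ *   // ret is 19 (the offset of "/path"), ss holds 1.2.3.4:8080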
+ */ +int url2sa(const char *url, int ulen, struct sockaddr_storage *addr, struct split_url *out) +{ + const char *curr = url, *cp = url; + const char *end; + int ret, url_code = 0; + unsigned long long int http_code = 0; + int default_port; + struct hostent *he; + char *p; + + /* Firstly, try to find the :// pattern */ + while (curr < url+ulen && url_code != 0x3a2f2f) { + url_code = ((url_code & 0xffff) << 8); + url_code += (unsigned char)*curr++; + } + + /* Secondly, if the :// pattern is found, verify that what precedes + * it matches our http pattern. + * If so, parse the ip address and port in the uri. + * + * WARNING: the current code doesn't support a dynamic async dns resolver. + */ + if (url_code != 0x3a2f2f) + return -1; + + /* Copy scheme, and turn to lower case. */ + while (cp < curr - 3) + http_code = (http_code << 8) + *cp++; + http_code |= 0x2020202020202020ULL; /* Turn everything to lower case */ + + /* HTTP or HTTPS url matching */ + if (http_code == 0x2020202068747470ULL) { + default_port = 80; + if (out) + out->scheme = SCH_HTTP; + } + else if (http_code == 0x2020206874747073ULL) { + default_port = 443; + if (out) + out->scheme = SCH_HTTPS; + } + else + return -1; + + /* If the next char is '[', the host address is IPv6. */ + if (*curr == '[') { + curr++; + + /* Check trash size */ + if (trash.size < ulen) + return -1; + + /* Look for ']' and copy the address in a trash buffer. */ + p = trash.area; + for (end = curr; + end < url + ulen && *end != ']'; + end++, p++) + *p = *end; + if (*end != ']') + return -1; + *p = '\0'; + + /* Update out. */ + if (out) { + out->host = curr; + out->host_len = end - curr; + } + + /* Try IPv6 decoding. */ + if (!inet_pton(AF_INET6, trash.area, &((struct sockaddr_in6 *)addr)->sin6_addr)) + return -1; + end++; + + /* Decode port. */ + if (end < url + ulen && *end == ':') { + end++; + default_port = read_uint(&end, url + ulen); + } + ((struct sockaddr_in6 *)addr)->sin6_port = htons(default_port); + ((struct sockaddr_in6 *)addr)->sin6_family = AF_INET6; + return end - url; + } + else { + /* we need to copy the string into the trash because url2ipv4 + * needs a \0 at the end of the string */ + if (trash.size < ulen) + return -1; + + memcpy(trash.area, curr, ulen - (curr - url)); + trash.area[ulen - (curr - url)] = '\0'; + + /* We are looking for an IP address. If you want to parse and + * resolve a hostname found in the url, you can use str2sa_range(), but + * be warned this can slow down global daemon performance + * while handling lagging dns responses. + */ + ret = url2ipv4(trash.area, &((struct sockaddr_in *)addr)->sin_addr); + if (ret) { + /* Update out. */ + if (out) { + out->host = curr; + out->host_len = ret; + } + + curr += ret; + + /* Decode port. */ + if (curr < url + ulen && *curr == ':') { + curr++; + default_port = read_uint(&curr, url + ulen); + } + ((struct sockaddr_in *)addr)->sin_port = htons(default_port); + + /* Set family. */ + ((struct sockaddr_in *)addr)->sin_family = AF_INET; + return curr - url; + } + else if (global.mode & MODE_STARTING) { + /* The IPv4 and IPv6 decoding failed, maybe the url contains a name. Try to execute + * a synchronous DNS request only if HAProxy is in the starting state. + */ + + /* look for : or / or end */ + for (end = curr; + end < url + ulen && *end != '/' && *end != ':'; + end++); + memcpy(trash.area, curr, end - curr); + trash.area[end - curr] = '\0'; + + /* try to resolve an IPv4/IPv6 hostname */ + he = gethostbyname(trash.area); + if (!he) + return -1; + + /* Update out. 
*/ + if (out) { + out->host = curr; + out->host_len = end - curr; + } + + /* Decode port. */ + if (end < url + ulen && *end == ':') { + end++; + default_port = read_uint(&end, url + ulen); + } + + /* Copy IP address, set port and family. */ + switch (he->h_addrtype) { + case AF_INET: + ((struct sockaddr_in *)addr)->sin_addr = *(struct in_addr *) *(he->h_addr_list); + ((struct sockaddr_in *)addr)->sin_port = htons(default_port); + ((struct sockaddr_in *)addr)->sin_family = AF_INET; + return end - url; + + case AF_INET6: + ((struct sockaddr_in6 *)addr)->sin6_addr = *(struct in6_addr *) *(he->h_addr_list); + ((struct sockaddr_in6 *)addr)->sin6_port = htons(default_port); + ((struct sockaddr_in6 *)addr)->sin6_family = AF_INET6; + return end - url; + } + } + } + return -1; +} + +/* Tries to convert a sockaddr_storage address to text form. Upon success, the + * address family is returned so that it's easy for the caller to adapt to the + * output format. Zero is returned if the address family is not supported. -1 + * is returned upon error, with errno set. AF_INET, AF_INET6 and AF_UNIX are + * supported. + */ +int addr_to_str(const struct sockaddr_storage *addr, char *str, int size) +{ + + const void *ptr; + + if (size < 5) + return 0; + *str = '\0'; + + switch (addr->ss_family) { + case AF_INET: + ptr = &((struct sockaddr_in *)addr)->sin_addr; + break; + case AF_INET6: + ptr = &((struct sockaddr_in6 *)addr)->sin6_addr; + break; + case AF_UNIX: + memcpy(str, "unix", 5); + return addr->ss_family; + default: + return 0; + } + + if (inet_ntop(addr->ss_family, ptr, str, size)) + return addr->ss_family; + + /* failed */ + return -1; +} + +/* Tries to convert a sockaddr_storage port to text form. Upon success, the + * address family is returned so that it's easy for the caller to adapt to the + * output format. Zero is returned if the address family is not supported. -1 + * is returned upon error, with errno set. AF_INET, AF_INET6 and AF_UNIX are + * supported. + */ +int port_to_str(const struct sockaddr_storage *addr, char *str, int size) +{ + + uint16_t port; + + + if (size < 6) + return 0; + *str = '\0'; + + switch (addr->ss_family) { + case AF_INET: + port = ((struct sockaddr_in *)addr)->sin_port; + break; + case AF_INET6: + port = ((struct sockaddr_in6 *)addr)->sin6_port; + break; + case AF_UNIX: + memcpy(str, "unix", 5); + return addr->ss_family; + default: + return 0; + } + + snprintf(str, size, "%u", ntohs(port)); + return addr->ss_family; +} + +/* check if the given address is local to the system or not. It will return + * -1 when it's not possible to know, 0 when the address is not local, 1 when + * it is. We don't want to iterate over all interfaces for this (and it is not + * portable). So instead we try to bind in UDP to this address on a free non + * privileged port and to connect to the same address, port 0 (connect doesn't + * care). If it succeeds, we own the address. Note that non-inet addresses are + * considered local since they're most likely AF_UNIX. 
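+ *
+ * Illustrative usage (editor's sketch; assumes <ss> was previously filled
+ * with 127.0.0.1, e.g. by str2ip2(), and no specific netns is needed):
+ *   int r = addr_is_local(NULL, &ss); // 1 expected on the local host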
+ */ +int addr_is_local(const struct netns_entry *ns, + const struct sockaddr_storage *orig) +{ + struct sockaddr_storage addr; + int result; + int fd; + + if (!is_inet_addr(orig)) + return 1; + + memcpy(&addr, orig, sizeof(addr)); + set_host_port(&addr, 0); + + fd = my_socketat(ns, addr.ss_family, SOCK_DGRAM, IPPROTO_UDP); + if (fd < 0) + return -1; + + result = -1; + if (bind(fd, (struct sockaddr *)&addr, get_addr_len(&addr)) == 0) { + if (connect(fd, (struct sockaddr *)&addr, get_addr_len(&addr)) == -1) + result = 0; // fail, non-local address + else + result = 1; // success, local address + } + else { + if (errno == EADDRNOTAVAIL) + result = 0; // definitely not local :-) + } + close(fd); + + return result; +} + +/* will try to encode the string <string> replacing all characters tagged in + * <map> with the hexadecimal representation of their ASCII-code (2 digits) + * prefixed by <escape>, and will store the result between <start> (included) + * and <stop> (excluded), and will always terminate the string with a '\0' + * before <stop>. The position of the '\0' is returned if the conversion + * completes. If bytes are missing between <start> and <stop>, then the + * conversion will be incomplete and truncated. If <stop> <= <start>, the '\0' + * cannot even be stored so we return <start> without writing the 0. + * The input string must also be zero-terminated. + */ +const char hextab[16] = "0123456789ABCDEF"; +char *encode_string(char *start, char *stop, + const char escape, const long *map, + const char *string) +{ + if (start < stop) { + stop--; /* reserve one byte for the final '\0' */ + while (start < stop && *string != '\0') { + if (!ha_bit_test((unsigned char)(*string), map)) + *start++ = *string; + else { + if (start + 3 >= stop) + break; + *start++ = escape; + *start++ = hextab[(*string >> 4) & 15]; + *start++ = hextab[*string & 15]; + } + string++; + } + *start = '\0'; + } + return start; +} + +/* + * Same behavior as encode_string() above, except that it encodes chunk + * <chunk> instead of a string. + */ +char *encode_chunk(char *start, char *stop, + const char escape, const long *map, + const struct buffer *chunk) +{ + char *str = chunk->area; + char *end = chunk->area + chunk->data; + + if (start < stop) { + stop--; /* reserve one byte for the final '\0' */ + while (start < stop && str < end) { + if (!ha_bit_test((unsigned char)(*str), map)) + *start++ = *str; + else { + if (start + 3 >= stop) + break; + *start++ = escape; + *start++ = hextab[(*str >> 4) & 15]; + *start++ = hextab[*str & 15]; + } + str++; + } + *start = '\0'; + } + return start; +} + +/* + * Tries to prefix characters tagged in the <map> with the <escape> + * character. The input <string> is processed until string_stop + * is reached or NULL-byte is encountered. The result will + * be stored between <start> (included) and <stop> (excluded). This + * function will always try to terminate the resulting string with a '\0' + * before <stop>, and will return its position if the conversion + * completes. 
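+ *
+ * Illustrative usage (editor's sketch; assumes <map> is a bitmap in which
+ * the bits of the characters to escape were set beforehand, and <in> is a
+ * NUL-terminated input string):
+ *   char buf[64];
+ *   escape_string(buf, buf + sizeof(buf), '\\', map, in, in + strlen(in));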
+ */ +char *escape_string(char *start, char *stop, + const char escape, const long *map, + const char *string, const char *string_stop) +{ + if (start < stop) { + stop--; /* reserve one byte for the final '\0' */ + while (start < stop && string < string_stop && *string != '\0') { + if (!ha_bit_test((unsigned char)(*string), map)) + *start++ = *string; + else { + if (start + 2 >= stop) + break; + *start++ = escape; + *start++ = *string; + } + string++; + } + *start = '\0'; + } + return start; +} + +/* Check a string for using it in a CSV output format. If the string contains + * one of the following four chars <">, <,>, CR or LF, the string is + * encapsulated between <"> and the <"> are escaped by a <""> sequence. + * <str> is the input string to be escaped. The function assumes that + * the input string is null-terminated. + * + * If <quote> is 0, the result is returned escaped but without double quote. + * It is useful if the escaped string is used between double quotes in the + * format. + * + * printf("..., \"%s\", ...\r\n", csv_enc(str, 0, 0, &trash)); + * + * If <quote> is 1, the converter puts the quotes only if any reserved character + * is present. If <quote> is 2, the converter always puts the quotes. + * + * If <oneline> is not 0, CRs are skipped and LFs are replaced by spaces. + * This re-formats multi-line strings to only one line. The purpose is to + * allow a line by line parsing but also to keep the output compliant with + * the CLI which uses LF to define the end of the response. + * + * If <oneline> is 2, in addition to the previous action, the trailing spaces are + * removed. + * + * <output> is a struct buffer used for storing the output string. + * + * The function returns the converted string on its output. If an error + * occurs, the function returns an empty string. This type of output is useful + * for using the function directly as printf() argument. + * + * If the output buffer is too short to contain the input string, the result + * is truncated. + * + * This function appends the encoding to the existing output chunk, and it + * guarantees that it starts immediately at the first available character of + * the chunk. Please use csv_enc() instead if you want to replace the output + * chunk. + */ +const char *csv_enc_append(const char *str, int quote, int oneline, struct buffer *output) +{ + char *end = output->area + output->size; + char *out = output->area + output->data; + char *ptr = out; + + if (quote == 1) { + /* automatic quoting: first verify if we'll have to quote the string */ + if (!strpbrk(str, "\n\r,\"")) + quote = 0; + } + + if (quote) + *ptr++ = '"'; + + while (*str && ptr < end - 2) { /* -2 for reserving space for <"> and \0. */ + if (oneline) { + if (*str == '\n' ) { + /* replace LF by a space */ + *ptr++ = ' '; + str++; + continue; + } + else if (*str == '\r' ) { + /* skip CR */ + str++; + continue; + } + } + *ptr = *str; + if (*str == '"') { + ptr++; + if (ptr >= end - 2) { + ptr--; + break; + } + *ptr = '"'; + } + ptr++; + str++; + } + + if (oneline == 2) { + /* remove trailing spaces */ + while (ptr > out && *(ptr - 1) == ' ') + ptr--; + } + + if (quote) + *ptr++ = '"'; + + *ptr = '\0'; + output->data = ptr - output->area; + return out; +} + +/* Decode a URL-encoded string in-place. The resulting string might + * be shorter. If some forbidden characters are found, the conversion is + * aborted, the string is truncated before the issue and a negative value is + * returned, otherwise the operation returns the length of the decoded string. 
+ * If the 'in_form' argument is non-zero, the string is assumed to be part of + * an "application/x-www-form-urlencoded" encoded string, and the '+' will be + * turned to a space. If it's zero, this will only be done after a question + * mark ('?'). + */ +int url_decode(char *string, int in_form) +{ + char *in, *out; + int ret = -1; + + in = string; + out = string; + while (*in) { + switch (*in) { + case '+' : + *out++ = in_form ? ' ' : *in; + break; + case '%' : + if (!ishex(in[1]) || !ishex(in[2])) + goto end; + *out++ = (hex2i(in[1]) << 4) + hex2i(in[2]); + in += 2; + break; + case '?': + in_form = 1; + __fallthrough; + default: + *out++ = *in; + break; + } + in++; + } + ret = out - string; /* success */ + end: + *out = 0; + return ret; +} + +unsigned int str2ui(const char *s) +{ + return __str2ui(s); +} + +unsigned int str2uic(const char *s) +{ + return __str2uic(s); +} + +unsigned int strl2ui(const char *s, int len) +{ + return __strl2ui(s, len); +} + +unsigned int strl2uic(const char *s, int len) +{ + return __strl2uic(s, len); +} + +unsigned int read_uint(const char **s, const char *end) +{ + return __read_uint(s, end); +} + +/* This function reads an unsigned integer from the string pointed to by <s> and + * returns it. The <s> pointer is adjusted to point to the first unread char. The + * function automatically stops at <end>. If the number overflows, the 2^64-1 + * value is returned. + */ +unsigned long long int read_uint64(const char **s, const char *end) +{ + const char *ptr = *s; + unsigned long long int i = 0, tmp; + unsigned int j; + + while (ptr < end) { + + /* read next char */ + j = *ptr - '0'; + if (j > 9) + goto read_uint64_end; + + /* add char to the number and check overflow. */ + tmp = i * 10; + if (tmp / 10 != i) { + i = ULLONG_MAX; + goto read_uint64_eat; + } + if (ULLONG_MAX - tmp < j) { + i = ULLONG_MAX; + goto read_uint64_eat; + } + i = tmp + j; + ptr++; + } +read_uint64_eat: + /* eat each numeric char */ + while (ptr < end) { + if ((unsigned int)(*ptr - '0') > 9) + break; + ptr++; + } +read_uint64_end: + *s = ptr; + return i; +} + +/* This function reads an integer from the string pointed to by <s> and returns + * it. The <s> pointer is adjusted to point to the first unread char. The function + * automatically stops at <end>. If the number is bigger than 2^63-1, the 2^63-1 + * value is returned. If the number is lower than -2^63, the -2^63 value is + * returned. + */ +long long int read_int64(const char **s, const char *end) +{ + unsigned long long int i = 0; + int neg = 0; + + /* Look for minus char. */ + if (**s == '-') { + neg = 1; + (*s)++; + } + else if (**s == '+') + (*s)++; + + /* convert as positive number. */ + i = read_uint64(s, end); + + if (neg) { + if (i > 0x8000000000000000ULL) + return LLONG_MIN; + return -i; + } + if (i > 0x7fffffffffffffffULL) + return LLONG_MAX; + return i; +} + +/* This one is 7 times faster than strtol() on athlon with checks. + * It returns the value of the number composed of all valid digits read, + * and can process negative numbers too. 
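+ *
+ * Illustrative calls (editor's note, not upstream documentation):
+ *   strl2ic("123", 3) returns 123, and strl2ic("-45x", 4) returns -45
+ *   since parsing stops at the first non-digit.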
+ */ +int strl2ic(const char *s, int len) +{ + int i = 0; + int j, k; + + if (len > 0) { + if (*s != '-') { + /* positive number */ + while (len-- > 0) { + j = (*s++) - '0'; + k = i * 10; + if (j > 9) + break; + i = k + j; + } + } else { + /* negative number */ + s++; + while (--len > 0) { + j = (*s++) - '0'; + k = i * 10; + if (j > 9) + break; + i = k - j; + } + } + } + return i; +} + + +/* This function reads exactly <len> chars from <s> and converts them to a + * signed integer which it stores into <ret>. It accurately detects any error + * (truncated string, invalid chars, overflows). It is meant to be used in + * applications designed for hostile environments. It returns zero when the + * number has successfully been converted, non-zero otherwise. When an error + * is returned, the <ret> value is left untouched. It is still 5 to 40 times + * faster than strtol(). + */ +int strl2irc(const char *s, int len, int *ret) +{ + int i = 0; + int j; + + if (!len) + return 1; + + if (*s != '-') { + /* positive number */ + while (len-- > 0) { + j = (*s++) - '0'; + if (j > 9) return 1; /* invalid char */ + if (i > INT_MAX / 10) return 1; /* check for multiply overflow */ + i = i * 10; + if (i + j < i) return 1; /* check for addition overflow */ + i = i + j; + } + } else { + /* negative number */ + s++; + while (--len > 0) { + j = (*s++) - '0'; + if (j > 9) return 1; /* invalid char */ + if (i < INT_MIN / 10) return 1; /* check for multiply overflow */ + i = i * 10; + if (i - j > i) return 1; /* check for subtract overflow */ + i = i - j; + } + } + *ret = i; + return 0; +} + + +/* This function reads exactly <len> chars from <s> and converts them to a + * signed integer which it stores into <ret>. It accurately detects any error + * (truncated string, invalid chars, overflows). It is meant to be used in + * applications designed for hostile environments. It returns zero when the + * number has successfully been converted, non-zero otherwise. When an error + * is returned, the <ret> value is left untouched. It is about 3 times slower + * than strl2irc(). + */ + +int strl2llrc(const char *s, int len, long long *ret) +{ + long long i = 0; + int j; + + if (!len) + return 1; + + if (*s != '-') { + /* positive number */ + while (len-- > 0) { + j = (*s++) - '0'; + if (j > 9) return 1; /* invalid char */ + if (i > LLONG_MAX / 10LL) return 1; /* check for multiply overflow */ + i = i * 10LL; + if (i + j < i) return 1; /* check for addition overflow */ + i = i + j; + } + } else { + /* negative number */ + s++; + while (--len > 0) { + j = (*s++) - '0'; + if (j > 9) return 1; /* invalid char */ + if (i < LLONG_MIN / 10LL) return 1; /* check for multiply overflow */ + i = i * 10LL; + if (i - j > i) return 1; /* check for subtract overflow */ + i = i - j; + } + } + *ret = i; + return 0; +} + +/* This function is used with pat_parse_dotted_ver(). It converts a string + * composed of two numbers separated by a dot. Each part must fit in 16 bits + * because internally they will be represented as a 32-bit quantity stored in + * a 64-bit integer. It returns zero when the number has successfully been + * converted, non-zero otherwise. When an error is returned, the <ret> value + * is left untouched. + * + * "1.3" -> 0x0000000000010003 + * "65535.65535" -> 0x00000000ffffffff + */ +int strl2llrc_dotted(const char *text, int len, long long *ret) +{ + const char *end = &text[len]; + const char *p; + long long major, minor; + + /* Look for dot. 
*/ + for (p = text; p < end; p++) + if (*p == '.') + break; + + /* Convert major. */ + if (strl2llrc(text, p - text, &major) != 0) + return 1; + + /* Check major. */ + if (major >= 65536) + return 1; + + /* Convert minor. */ + minor = 0; + if (p < end) + if (strl2llrc(p + 1, end - (p + 1), &minor) != 0) + return 1; + + /* Check minor. */ + if (minor >= 65536) + return 1; + + /* Compose value. */ + *ret = (major << 16) | (minor & 0xffff); + return 0; +} + +/* This function parses a time value optionally followed by a unit suffix among + * "d", "h", "m", "s", "ms" or "us". It converts the value into the unit + * expected by the caller. The computation does its best to avoid overflows. + * The value is returned in <ret> if everything is fine, and NULL is returned + * by the function. In case of error, a pointer to the offending character is + * returned and <ret> is left untouched. Values are automatically rounded up + * when needed. Values larger than or equal to 2^31 after conversion are + * reported as an overflow with the special value PARSE_TIME_OVER. Non-zero + * values that round down to zero are reported as an underflow with the value + * PARSE_TIME_UNDER. + */ +const char *parse_time_err(const char *text, unsigned *ret, unsigned unit_flags) +{ + unsigned long long imult, idiv; + unsigned long long omult, odiv; + unsigned long long value, result; + const char *str = text; + + if (!isdigit((unsigned char)*text)) + return text; + + omult = odiv = 1; + + switch (unit_flags & TIME_UNIT_MASK) { + case TIME_UNIT_US: omult = 1000000; break; + case TIME_UNIT_MS: omult = 1000; break; + case TIME_UNIT_S: break; + case TIME_UNIT_MIN: odiv = 60; break; + case TIME_UNIT_HOUR: odiv = 3600; break; + case TIME_UNIT_DAY: odiv = 86400; break; + default: break; + } + + value = 0; + + while (1) { + unsigned int j; + + j = *text - '0'; + if (j > 9) + break; + text++; + value *= 10; + value += j; + } + + imult = idiv = 1; + switch (*text) { + case '\0': /* no unit = default unit */ + imult = omult = idiv = odiv = 1; + goto end; + case 's': /* second = unscaled unit */ + break; + case 'u': /* microsecond : "us" */ + if (text[1] == 's') { + idiv = 1000000; + text++; + break; + } + return text; + case 'm': /* millisecond : "ms" or minute: "m" */ + if (text[1] == 's') { + idiv = 1000; + text++; + } else + imult = 60; + break; + case 'h': /* hour : "h" */ + imult = 3600; + break; + case 'd': /* day : "d" */ + imult = 86400; + break; + default: + return text; + } + if (*(++text) != '\0') { + ha_warning("unexpected character '%c' after the timer value '%s', only " + "(us=microseconds,ms=milliseconds,s=seconds,m=minutes,h=hours,d=days) are supported." + " This will be reported as an error in future versions.\n", *text, str); + } + + end: + if (omult % idiv == 0) { omult /= idiv; idiv = 1; } + if (idiv % omult == 0) { idiv /= omult; omult = 1; } + if (imult % odiv == 0) { imult /= odiv; odiv = 1; } + if (odiv % imult == 0) { odiv /= imult; imult = 1; } + + result = (value * (imult * omult) + (idiv * odiv - 1)) / (idiv * odiv); + if (result >= 0x80000000) + return PARSE_TIME_OVER; + if (!result && value) + return PARSE_TIME_UNDER; + *ret = result; + return NULL; +} + +/* This function converts the string starting at <text> to an unsigned int + * stored in <ret>. If an error is detected, the pointer to the unexpected + * character is returned. If the conversion is successful, NULL is returned. 
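+ *
+ * Illustrative behaviour (editor's note): parse_size_err("64k", &v) sets
+ * v to 65536 and returns NULL, while parse_size_err("64x", &v) returns a
+ * pointer to the offending 'x' and leaves <v> untouched.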
+ */ +const char *parse_size_err(const char *text, unsigned *ret) { + unsigned value = 0; + + if (!isdigit((unsigned char)*text)) + return text; + + while (1) { + unsigned int j; + + j = *text - '0'; + if (j > 9) + break; + if (value > ~0U / 10) + return text; + value *= 10; + if (value > (value + j)) + return text; + value += j; + text++; + } + + switch (*text) { + case '\0': + break; + case 'K': + case 'k': + if (value > ~0U >> 10) + return text; + value = value << 10; + break; + case 'M': + case 'm': + if (value > ~0U >> 20) + return text; + value = value << 20; + break; + case 'G': + case 'g': + if (value > ~0U >> 30) + return text; + value = value << 30; + break; + default: + return text; + } + + if (*text != '\0' && *++text != '\0') + return text; + + *ret = value; + return NULL; +} + +/* + * Parse binary string written in hexadecimal (source) and store the decoded + * result into binstr and set binstrlen to the length of binstr. Memory for + * binstr is allocated by the function. In case of error, returns 0 with an + * error message in err. In success case, it returns the consumed length. + */ +int parse_binary(const char *source, char **binstr, int *binstrlen, char **err) +{ + int len; + const char *p = source; + int i,j; + int alloc; + + len = strlen(source); + if (len % 2) { + memprintf(err, "an even number of hex digits is expected"); + return 0; + } + + len = len >> 1; + + if (!*binstr) { + *binstr = calloc(len, sizeof(**binstr)); + if (!*binstr) { + memprintf(err, "out of memory while loading string pattern"); + return 0; + } + alloc = 1; + } + else { + if (*binstrlen < len) { + memprintf(err, "no space available in the buffer. expected %d, provided %d", + len, *binstrlen); + return 0; + } + alloc = 0; + } + *binstrlen = len; + + i = j = 0; + while (j < len) { + if (!ishex(p[i++])) + goto bad_input; + if (!ishex(p[i++])) + goto bad_input; + (*binstr)[j++] = (hex2i(p[i-2]) << 4) + hex2i(p[i-1]); + } + return len << 1; + +bad_input: + memprintf(err, "a hex digit is expected (found '%c')", p[i-1]); + if (alloc) + ha_free(binstr); + return 0; +} + +/* copies at most <n> characters from <src> and always terminates with '\0' */ +char *my_strndup(const char *src, int n) +{ + int len = 0; + char *ret; + + while (len < n && src[len]) + len++; + + ret = malloc(len + 1); + if (!ret) + return ret; + memcpy(ret, src, len); + ret[len] = '\0'; + return ret; +} + +/* + * search needle in haystack + * returns the pointer if found, returns NULL otherwise + */ +const void *my_memmem(const void *haystack, size_t haystacklen, const void *needle, size_t needlelen) +{ + const void *c = NULL; + unsigned char f; + + if ((haystack == NULL) || (needle == NULL) || (haystacklen < needlelen)) + return NULL; + + f = *(char *)needle; + c = haystack; + while ((c = memchr(c, f, haystacklen - (c - haystack))) != NULL) { + if ((haystacklen - (c - haystack)) < needlelen) + return NULL; + + if (memcmp(c, needle, needlelen) == 0) + return c; + ++c; + } + return NULL; +} + +/* get length of the initial segment consisting entirely of bytes in <accept> */ +size_t my_memspn(const void *str, size_t len, const void *accept, size_t acceptlen) +{ + size_t ret = 0; + + while (ret < len && memchr(accept, *(const unsigned char *)str, acceptlen)) { + str++; + ret++; + } + return ret; +} + +/* get length of the initial segment consisting entirely of bytes not in <reject> */ +size_t my_memcspn(const void *str, size_t len, const void *reject, size_t rejectlen) +{ + size_t ret = 0; + + while (ret < len) { + if (memchr(reject, *(const unsigned char *)str, 
rejectlen)) + return ret; + str++; + ret++; + } + return ret; +} + +/* This function returns the first unused key greater than or equal to <key> in + * ID tree <root>. Zero is returned if no place is found. + */ +unsigned int get_next_id(struct eb_root *root, unsigned int key) +{ + struct eb32_node *used; + + do { + used = eb32_lookup_ge(root, key); + if (!used || used->key > key) + return key; /* key is available */ + key++; + } while (key); + return key; +} + +/* dump the full tree to <file> in DOT format for debugging purposes. Will + * optionally highlight node <subj> if found, depending on operation <op> : + * 0 : nothing + * >0 : insertion, node/leaf are surrounded in red + * <0 : removal, node/leaf are dashed with no background + * Will optionally add "desc" as a label on the graph if set and non-null. + */ +void eb32sc_to_file(FILE *file, struct eb_root *root, const struct eb32sc_node *subj, int op, const char *desc) +{ + struct eb32sc_node *node; + unsigned long scope = -1; + + fprintf(file, "digraph ebtree {\n"); + + if (desc && *desc) { + fprintf(file, + " fontname=\"fixed\";\n" + " fontsize=8;\n" + " label=\"%s\";\n", desc); + } + + fprintf(file, + " node [fontname=\"fixed\" fontsize=8 shape=\"box\" style=\"filled\" color=\"black\" fillcolor=\"white\"];\n" + " edge [fontname=\"fixed\" fontsize=8 style=\"solid\" color=\"magenta\" dir=\"forward\"];\n" + " \"%lx_n\" [label=\"root\\n%lx\"]\n", (long)eb_root_to_node(root), (long)root + ); + + fprintf(file, " \"%lx_n\" -> \"%lx_%c\" [taillabel=\"L\"];\n", + (long)eb_root_to_node(root), + (long)eb_root_to_node(eb_clrtag(root->b[0])), + eb_gettag(root->b[0]) == EB_LEAF ? 'l' : 'n'); + + node = eb32sc_first(root, scope); + while (node) { + if (node->node.node_p) { + /* node part is used */ + fprintf(file, " \"%lx_n\" [label=\"%lx\\nkey=%u\\nscope=%lx\\nbit=%d\" fillcolor=\"lightskyblue1\" %s];\n", + (long)node, (long)node, node->key, node->node_s, node->node.bit, + (node == subj) ? (op < 0 ? "color=\"red\" style=\"dashed\"" : op > 0 ? "color=\"red\"" : "") : ""); + + fprintf(file, " \"%lx_n\" -> \"%lx_n\" [taillabel=\"%c\"];\n", + (long)node, + (long)eb_root_to_node(eb_clrtag(node->node.node_p)), + eb_gettag(node->node.node_p) ? 'R' : 'L'); + + fprintf(file, " \"%lx_n\" -> \"%lx_%c\" [taillabel=\"L\"];\n", + (long)node, + (long)eb_root_to_node(eb_clrtag(node->node.branches.b[0])), + eb_gettag(node->node.branches.b[0]) == EB_LEAF ? 'l' : 'n'); + + fprintf(file, " \"%lx_n\" -> \"%lx_%c\" [taillabel=\"R\"];\n", + (long)node, + (long)eb_root_to_node(eb_clrtag(node->node.branches.b[1])), + eb_gettag(node->node.branches.b[1]) == EB_LEAF ? 'l' : 'n'); + } + + fprintf(file, " \"%lx_l\" [label=\"%lx\\nkey=%u\\nscope=%lx\\npfx=%u\" fillcolor=\"yellow\" %s];\n", + (long)node, (long)node, node->key, node->leaf_s, node->node.pfx, + (node == subj) ? (op < 0 ? "color=\"red\" style=\"dashed\"" : op > 0 ? "color=\"red\"" : "") : ""); + + fprintf(file, " \"%lx_l\" -> \"%lx_n\" [taillabel=\"%c\"];\n", + (long)node, + (long)eb_root_to_node(eb_clrtag(node->node.leaf_p)), + eb_gettag(node->node.leaf_p) ? 'R' : 'L'); + node = eb32sc_next(node, scope); + } + fprintf(file, "}\n"); +} + +/* dump the full tree to <file> in DOT format for debugging purposes. Will + * optionally highlight node <subj> if found, depending on operation <op> : + * 0 : nothing + * >0 : insertion, node/leaf are surrounded in red + * <0 : removal, node/leaf are dashed with no background + * Will optionally add "desc" as a label on the graph if set and non-null. 
The + * key is printed as a u32 hex value. A full-sized hex dump would be better but + * is left to be implemented. + */ +void ebmb_to_file(FILE *file, struct eb_root *root, const struct ebmb_node *subj, int op, const char *desc) +{ + struct ebmb_node *node; + + fprintf(file, "digraph ebtree {\n"); + + if (desc && *desc) { + fprintf(file, + " fontname=\"fixed\";\n" + " fontsize=8;\n" + " label=\"%s\";\n", desc); + } + + fprintf(file, + " node [fontname=\"fixed\" fontsize=8 shape=\"box\" style=\"filled\" color=\"black\" fillcolor=\"white\"];\n" + " edge [fontname=\"fixed\" fontsize=8 style=\"solid\" color=\"magenta\" dir=\"forward\"];\n" + " \"%lx_n\" [label=\"root\\n%lx\"]\n", (long)eb_root_to_node(root), (long)root + ); + + fprintf(file, " \"%lx_n\" -> \"%lx_%c\" [taillabel=\"L\"];\n", + (long)eb_root_to_node(root), + (long)eb_root_to_node(eb_clrtag(root->b[0])), + eb_gettag(root->b[0]) == EB_LEAF ? 'l' : 'n'); + + node = ebmb_first(root); + while (node) { + if (node->node.node_p) { + /* node part is used */ + fprintf(file, " \"%lx_n\" [label=\"%lx\\nkey=%#x\\nbit=%d\" fillcolor=\"lightskyblue1\" %s];\n", + (long)node, (long)node, read_u32(node->key), node->node.bit, + (node == subj) ? (op < 0 ? "color=\"red\" style=\"dashed\"" : op > 0 ? "color=\"red\"" : "") : ""); + + fprintf(file, " \"%lx_n\" -> \"%lx_n\" [taillabel=\"%c\"];\n", + (long)node, + (long)eb_root_to_node(eb_clrtag(node->node.node_p)), + eb_gettag(node->node.node_p) ? 'R' : 'L'); + + fprintf(file, " \"%lx_n\" -> \"%lx_%c\" [taillabel=\"L\"];\n", + (long)node, + (long)eb_root_to_node(eb_clrtag(node->node.branches.b[0])), + eb_gettag(node->node.branches.b[0]) == EB_LEAF ? 'l' : 'n'); + + fprintf(file, " \"%lx_n\" -> \"%lx_%c\" [taillabel=\"R\"];\n", + (long)node, + (long)eb_root_to_node(eb_clrtag(node->node.branches.b[1])), + eb_gettag(node->node.branches.b[1]) == EB_LEAF ? 'l' : 'n'); + } + + fprintf(file, " \"%lx_l\" [label=\"%lx\\nkey=%#x\\npfx=%u\" fillcolor=\"yellow\" %s];\n", + (long)node, (long)node, read_u32(node->key), node->node.pfx, + (node == subj) ? (op < 0 ? "color=\"red\" style=\"dashed\"" : op > 0 ? "color=\"red\"" : "") : ""); + + fprintf(file, " \"%lx_l\" -> \"%lx_n\" [taillabel=\"%c\"];\n", + (long)node, + (long)eb_root_to_node(eb_clrtag(node->node.leaf_p)), + eb_gettag(node->node.leaf_p) ? 'R' : 'L'); + node = ebmb_next(node); + } + fprintf(file, "}\n"); +} + +/* This function compares a sample word possibly followed by blanks to another + * clean word. The compare is case-insensitive. 1 is returned if both are equal, + * otherwise zero. This intends to be used when checking HTTP headers for some + * values. Note that it validates a word followed only by blanks but does not + * validate a word followed by blanks then other chars. + */ +int word_match(const char *sample, int slen, const char *word, int wlen) +{ + if (slen < wlen) + return 0; + + while (wlen) { + char c = *sample ^ *word; + if (c && c != ('A' ^ 'a')) + return 0; + sample++; + word++; + slen--; + wlen--; + } + + while (slen) { + if (*sample != ' ' && *sample != '\t') + return 0; + sample++; + slen--; + } + return 1; +} + +/* Converts any text-formatted IPv4 address to a host-order IPv4 address. It + * is particularly fast because it avoids expensive operations such as + * multiplies, which are optimized away at the end. It requires a properly + * formatted address though (3 points). 
+ */ +unsigned int inetaddr_host(const char *text) +{ + const unsigned int ascii_zero = ('0' << 24) | ('0' << 16) | ('0' << 8) | '0'; + register unsigned int dig100, dig10, dig1; + int s; + const char *p, *d; + + dig1 = dig10 = dig100 = ascii_zero; + s = 24; + + p = text; + while (1) { + if (((unsigned)(*p - '0')) <= 9) { + p++; + continue; + } + + /* here, we have a complete byte between <text> and <p> (exclusive) */ + if (p == text) + goto end; + + d = p - 1; + dig1 |= (unsigned int)(*d << s); + if (d == text) + goto end; + + d--; + dig10 |= (unsigned int)(*d << s); + if (d == text) + goto end; + + d--; + dig100 |= (unsigned int)(*d << s); + end: + if (!s || *p != '.') + break; + + s -= 8; + text = ++p; + } + + dig100 -= ascii_zero; + dig10 -= ascii_zero; + dig1 -= ascii_zero; + return ((dig100 * 10) + dig10) * 10 + dig1; +} + +/* + * Idem except the first unparsed character has to be passed in <stop>. + */ +unsigned int inetaddr_host_lim(const char *text, const char *stop) +{ + const unsigned int ascii_zero = ('0' << 24) | ('0' << 16) | ('0' << 8) | '0'; + register unsigned int dig100, dig10, dig1; + int s; + const char *p, *d; + + dig1 = dig10 = dig100 = ascii_zero; + s = 24; + + p = text; + while (1) { + if (((unsigned)(*p - '0')) <= 9 && p < stop) { + p++; + continue; + } + + /* here, we have a complete byte between <text> and <p> (exclusive) */ + if (p == text) + goto end; + + d = p - 1; + dig1 |= (unsigned int)(*d << s); + if (d == text) + goto end; + + d--; + dig10 |= (unsigned int)(*d << s); + if (d == text) + goto end; + + d--; + dig100 |= (unsigned int)(*d << s); + end: + if (!s || p == stop || *p != '.') + break; + + s -= 8; + text = ++p; + } + + dig100 -= ascii_zero; + dig10 -= ascii_zero; + dig1 -= ascii_zero; + return ((dig100 * 10) + dig10) * 10 + dig1; +} + +/* + * Idem except the pointer to first unparsed byte is returned into <ret> which + * must not be NULL. + */ +unsigned int inetaddr_host_lim_ret(char *text, char *stop, char **ret) +{ + const unsigned int ascii_zero = ('0' << 24) | ('0' << 16) | ('0' << 8) | '0'; + register unsigned int dig100, dig10, dig1; + int s; + char *p, *d; + + dig1 = dig10 = dig100 = ascii_zero; + s = 24; + + p = text; + while (1) { + if (((unsigned)(*p - '0')) <= 9 && p < stop) { + p++; + continue; + } + + /* here, we have a complete byte between <text> and <p> (exclusive) */ + if (p == text) + goto end; + + d = p - 1; + dig1 |= (unsigned int)(*d << s); + if (d == text) + goto end; + + d--; + dig10 |= (unsigned int)(*d << s); + if (d == text) + goto end; + + d--; + dig100 |= (unsigned int)(*d << s); + end: + if (!s || p == stop || *p != '.') + break; + + s -= 8; + text = ++p; + } + + *ret = p; + dig100 -= ascii_zero; + dig10 -= ascii_zero; + dig1 -= ascii_zero; + return ((dig100 * 10) + dig10) * 10 + dig1; +} + +/* Convert a fixed-length string to an IP address. Returns 0 in case of error, + * or the number of chars read in case of success. Maybe this could be replaced + * by one of the functions above. Also, apparently this function does not support + * hosts above 255 and requires exactly 4 octets. + * The destination is only modified on success. 
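+ *
+ * Illustrative usage (editor's sketch, not upstream documentation):
+ *   struct in_addr in;
+ *   int n = buf2ip("10.0.0.1", 8, &in); // n=8, in holds 10.0.0.1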
+ */ +int buf2ip(const char *buf, size_t len, struct in_addr *dst) +{ + const char *addr; + int saw_digit, octets, ch; + u_char tmp[4], *tp; + const char *cp = buf; + + saw_digit = 0; + octets = 0; + *(tp = tmp) = 0; + + for (addr = buf; addr - buf < len; addr++) { + unsigned char digit = (ch = *addr) - '0'; + + if (digit > 9 && ch != '.') + break; + + if (digit <= 9) { + u_int new = *tp * 10 + digit; + + if (new > 255) + return 0; + + *tp = new; + + if (!saw_digit) { + if (++octets > 4) + return 0; + saw_digit = 1; + } + } else if (ch == '.' && saw_digit) { + if (octets == 4) + return 0; + + *++tp = 0; + saw_digit = 0; + } else + return 0; + } + + if (octets < 4) + return 0; + + memcpy(&dst->s_addr, tmp, 4); + return addr - cp; +} + +/* This function converts the string in <buf> of length <len> to + * struct in6_addr <dst> which must be allocated by the caller. + * This function returns 1 in success case, otherwise zero. + * The destination is only modified on success. + */ +int buf2ip6(const char *buf, size_t len, struct in6_addr *dst) +{ + char null_term_ip6[INET6_ADDRSTRLEN + 1]; + struct in6_addr out; + + if (len > INET6_ADDRSTRLEN) + return 0; + + memcpy(null_term_ip6, buf, len); + null_term_ip6[len] = '\0'; + + if (!inet_pton(AF_INET6, null_term_ip6, &out)) + return 0; + + *dst = out; + return 1; +} + +/* To be used to quote config arg positions. Returns the short string at <ptr> + * surrounded by single quotes if <ptr> is valid and non-empty, or "end of line" + * if ptr is NULL or empty. The string is locally allocated. + */ +const char *quote_arg(const char *ptr) +{ + static THREAD_LOCAL char val[32]; + int i; + + if (!ptr || !*ptr) + return "end of line"; + val[0] = '\''; + for (i = 1; i < sizeof(val) - 2 && *ptr; i++) + val[i] = *ptr++; + val[i++] = '\''; + val[i] = '\0'; + return val; +} + +/* returns an operator among STD_OP_* for string <str> or < 0 if unknown */ +int get_std_op(const char *str) +{ + int ret = -1; + + if (*str == 'e' && str[1] == 'q') + ret = STD_OP_EQ; + else if (*str == 'n' && str[1] == 'e') + ret = STD_OP_NE; + else if (*str == 'l') { + if (str[1] == 'e') ret = STD_OP_LE; + else if (str[1] == 't') ret = STD_OP_LT; + } + else if (*str == 'g') { + if (str[1] == 'e') ret = STD_OP_GE; + else if (str[1] == 't') ret = STD_OP_GT; + } + + if (ret == -1 || str[2] != '\0') + return -1; + return ret; +} + +/* hash a 32-bit integer to another 32-bit integer */ +unsigned int full_hash(unsigned int a) +{ + return __full_hash(a); +} + +/* Return the bit position in mask <m> of the nth bit set of rank <r>, between + * 0 and LONGBITS-1 included, starting from the left. For example ranks 0,1,2,3 + * for mask 0x55 will be 6, 4, 2 and 0 respectively. 
This algorithm is based on + * a popcount variant and is described here : + * https://graphics.stanford.edu/~seander/bithacks.html + */ +unsigned int mask_find_rank_bit(unsigned int r, unsigned long m) +{ + unsigned long a, b, c, d; + unsigned int s; + unsigned int t; + + a = m - ((m >> 1) & ~0UL/3); + b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5); + c = (b + (b >> 4)) & ~0UL/0x11; + d = (c + (c >> 8)) & ~0UL/0x101; + + r++; // make r be 1..64 + + t = 0; + s = LONGBITS; + if (s > 32) { + unsigned long d2 = (d >> 16) >> 16; + t = d2 + (d2 >> 16); + s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8)); + } + + t = (d >> (s - 16)) & 0xff; + s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8)); + t = (c >> (s - 8)) & 0xf; + s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8)); + t = (b >> (s - 4)) & 0x7; + s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8)); + t = (a >> (s - 2)) & 0x3; + s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8)); + t = (m >> (s - 1)) & 0x1; + s -= ((t - r) & 256) >> 8; + + return s - 1; +} + +/* Same as mask_find_rank_bit() above but makes use of pre-computed bitmaps + * based on <m>, in <a..d>. These ones must be updated whenever <m> changes + * using mask_prep_rank_map() below. + */ +unsigned int mask_find_rank_bit_fast(unsigned int r, unsigned long m, + unsigned long a, unsigned long b, + unsigned long c, unsigned long d) +{ + unsigned int s; + unsigned int t; + + r++; // make r be 1..64 + + t = 0; + s = LONGBITS; + if (s > 32) { + unsigned long d2 = (d >> 16) >> 16; + t = d2 + (d2 >> 16); + s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8)); + } + + t = (d >> (s - 16)) & 0xff; + s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8)); + t = (c >> (s - 8)) & 0xf; + s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8)); + t = (b >> (s - 4)) & 0x7; + s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8)); + t = (a >> (s - 2)) & 0x3; + s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8)); + t = (m >> (s - 1)) & 0x1; + s -= ((t - r) & 256) >> 8; + + return s - 1; +} + +/* Prepare the bitmaps used by the fast implementation of the find_rank_bit() + * above. + */ +void mask_prep_rank_map(unsigned long m, + unsigned long *a, unsigned long *b, + unsigned long *c, unsigned long *d) +{ + *a = m - ((m >> 1) & ~0UL/3); + *b = (*a & ~0UL/5) + ((*a >> 2) & ~0UL/5); + *c = (*b + (*b >> 4)) & ~0UL/0x11; + *d = (*c + (*c >> 8)) & ~0UL/0x101; +} + +/* Returns the position of one bit set in <v>, starting at position <bit>, and + * searching in other halves if not found. This is intended to be used to + * report the position of one bit set among several based on a counter or a + * random generator while preserving a relatively good distribution so that + * values made of holes in the middle do not see one of the bits around the + * hole being returned much more often than the other one. It can be seen as a + * disturbed ffsl() where the initial search starts at bit <bit>. The look up + * is performed in O(logN) time for N bit words, yielding a bit among 64 in + * about 16 cycles. Its usage differs from the rank find function in that the + * bit passed doesn't need to be limited to the value's popcount, making the + * function easier to use for random picking, and twice as fast. Passing value + * 0 for <v> makes no sense and -1 is returned in this case. + */ +int one_among_mask(unsigned long v, int bit) +{ + /* note, these masks may be produced by ~0UL/((1UL<<scale)+1) but + * that's more expensive. 
+ */ + static const unsigned long halves[] = { + (unsigned long)0x5555555555555555ULL, + (unsigned long)0x3333333333333333ULL, + (unsigned long)0x0F0F0F0F0F0F0F0FULL, + (unsigned long)0x00FF00FF00FF00FFULL, + (unsigned long)0x0000FFFF0000FFFFULL, + (unsigned long)0x00000000FFFFFFFFULL + }; + unsigned long halfword = ~0UL; + int scope = 0; + int mirror; + int scale; + + if (!v) + return -1; + + /* we check if the exact bit is set or if it's present in a mirror + * position based on the current scale we're checking, in which case + * it's returned with its current (or mirrored) value. Otherwise we'll + * make sure there's at least one bit in the half we're in, and will + * scale down to a smaller scope and try again, until we find the + * closest bit. + */ + for (scale = (sizeof(long) > 4) ? 5 : 4; scale >= 0; scale--) { + halfword >>= (1UL << scale); + scope |= (1UL << scale); + mirror = bit ^ (1UL << scale); + if (v & ((1UL << bit) | (1UL << mirror))) + return (v & (1UL << bit)) ? bit : mirror; + + if (!((v >> (bit & scope)) & halves[scale] & halfword)) + bit = mirror; + } + return bit; +} + +/* Return non-zero if IPv4 address is part of the network, + * otherwise zero. Note that <addr> may not necessarily be aligned + * while the two other ones must. + */ +int in_net_ipv4(const void *addr, const struct in_addr *mask, const struct in_addr *net) +{ + struct in_addr addr_copy; + + memcpy(&addr_copy, addr, sizeof(addr_copy)); + return((addr_copy.s_addr & mask->s_addr) == (net->s_addr & mask->s_addr)); +} + +/* Return non-zero if IPv6 address is part of the network, + * otherwise zero. Note that <addr> may not necessarily be aligned + * while the two other ones must. + */ +int in_net_ipv6(const void *addr, const struct in6_addr *mask, const struct in6_addr *net) +{ + int i; + struct in6_addr addr_copy; + + memcpy(&addr_copy, addr, sizeof(addr_copy)); + for (i = 0; i < sizeof(struct in6_addr) / sizeof(int); i++) + if (((((int *)&addr_copy)[i] & ((int *)mask)[i])) != + (((int *)net)[i] & ((int *)mask)[i])) + return 0; + return 1; +} + +/* Map IPv4 address on IPv6 address, as specified in RFC4291 + * "IPv4-Mapped IPv6 Address" (using the :ffff: prefix) + * + * Input and output may overlap. + */ +void v4tov6(struct in6_addr *sin6_addr, struct in_addr *sin_addr) +{ + uint32_t ip4_addr; + + ip4_addr = sin_addr->s_addr; + memset(&sin6_addr->s6_addr, 0, 10); + write_u16(&sin6_addr->s6_addr[10], htons(0xFFFF)); + write_u32(&sin6_addr->s6_addr[12], ip4_addr); +} + +/* Try to convert IPv6 address to IPv4 address thanks to the + * following mapping methods: + * - RFC4291 IPv4-Mapped IPv6 Address (preferred method) + * -> ::ffff:ip:v4 + * - RFC4291 IPv4-Compatible IPv6 Address (deprecated, RFC3513 legacy for + * "IPv6 Addresses with Embedded IPv4 Addresses") + * -> ::0000:ip:v4 + * - 6to4 (defined in RFC3056 proposal, seems deprecated nowadays) + * -> 2002:ip:v4:: + * Return true if conversion is possible and false otherwise. 
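+ *
+ * Illustrative behaviour (editor's note): for ::ffff:192.0.2.1 the function
+ * returns 1 and leaves 192.0.2.1 in <sin_addr>, while for 2001:db8::1 it
+ * returns 0 since no mapping applies.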
+ */ +int v6tov4(struct in_addr *sin_addr, struct in6_addr *sin6_addr) +{ + if (read_u64(&sin6_addr->s6_addr[0]) == 0 && + (read_u32(&sin6_addr->s6_addr[8]) == htonl(0xFFFF) || + read_u32(&sin6_addr->s6_addr[8]) == 0)) { + // RFC4291 ipv4 mapped or compatible ipv6 address + sin_addr->s_addr = read_u32(&sin6_addr->s6_addr[12]); + } else if (read_u16(&sin6_addr->s6_addr[0]) == htons(0x2002)) { + // RFC3056 6to4 address + sin_addr->s_addr = htonl((ntohs(read_u16(&sin6_addr->s6_addr[2])) << 16) + + ntohs(read_u16(&sin6_addr->s6_addr[4]))); + } + else + return 0; /* unrecognized input */ + return 1; /* mapping completed */ +} + +/* compare two struct sockaddr_storage, including port if <check_port> is true, + * and return: + * 0 (true) if the addr is the same in both + * 1 (false) if the addr is not the same in both + * -1 (unable) if one of the addr is not AF_INET* + */ +int ipcmp(const struct sockaddr_storage *ss1, const struct sockaddr_storage *ss2, int check_port) +{ + if ((ss1->ss_family != AF_INET) && (ss1->ss_family != AF_INET6)) + return -1; + + if ((ss2->ss_family != AF_INET) && (ss2->ss_family != AF_INET6)) + return -1; + + if (ss1->ss_family != ss2->ss_family) + return 1; + + switch (ss1->ss_family) { + case AF_INET: + return (memcmp(&((struct sockaddr_in *)ss1)->sin_addr, + &((struct sockaddr_in *)ss2)->sin_addr, + sizeof(struct in_addr)) != 0) || + (check_port && get_net_port(ss1) != get_net_port(ss2)); + case AF_INET6: + return (memcmp(&((struct sockaddr_in6 *)ss1)->sin6_addr, + &((struct sockaddr_in6 *)ss2)->sin6_addr, + sizeof(struct in6_addr)) != 0) || + (check_port && get_net_port(ss1) != get_net_port(ss2)); + } + + return 1; +} + +/* compare a struct sockaddr_storage to a struct net_addr and return : + * 0 (true) if <addr> is matching <net> + * 1 (false) if <addr> is not matching <net> + * -1 (unable) if <addr> or <net> is not AF_INET* + */ +int ipcmp2net(const struct sockaddr_storage *addr, const struct net_addr *net) +{ + if ((addr->ss_family != AF_INET) && (addr->ss_family != AF_INET6)) + return -1; + + if ((net->family != AF_INET) && (net->family != AF_INET6)) + return -1; + + if (addr->ss_family != net->family) + return 1; + + if (addr->ss_family == AF_INET && + (((struct sockaddr_in *)addr)->sin_addr.s_addr & net->addr.v4.mask.s_addr) == net->addr.v4.ip.s_addr) + return 0; + else { + const struct in6_addr *addr6 = &(((const struct sockaddr_in6*)addr)->sin6_addr); + const struct in6_addr *nip6 = &net->addr.v6.ip; + const struct in6_addr *nmask6 = &net->addr.v6.mask; + + if ((read_u32(&addr6->s6_addr[0]) & read_u32(&nmask6->s6_addr[0])) == read_u32(&nip6->s6_addr[0]) && + (read_u32(&addr6->s6_addr[4]) & read_u32(&nmask6->s6_addr[4])) == read_u32(&nip6->s6_addr[4]) && + (read_u32(&addr6->s6_addr[8]) & read_u32(&nmask6->s6_addr[8])) == read_u32(&nip6->s6_addr[8]) && + (read_u32(&addr6->s6_addr[12]) & read_u32(&nmask6->s6_addr[12])) == read_u32(&nip6->s6_addr[12])) + return 0; + } + + return 1; +} + +/* copy IP address from <source> into <dest> + * The caller must allocate and clear <dest> before calling. + * The source must be in either AF_INET or AF_INET6 family, or the destination + * address will be undefined. If the destination address used to hold a port, + * it is preserved, so that this function can be used to switch to another + * address family with no risk. Returns a pointer to the destination. 
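+ *
+ * Illustrative usage (editor's sketch; <src> and <dst> are assumed valid,
+ * <dst> previously set to an AF_INET6 address with port 443 and <src> to
+ * an AF_INET address):
+ *   ipcpy(&src, &dst); // dst now holds src's IPv4 address, port still 443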
+ */ +struct sockaddr_storage *ipcpy(const struct sockaddr_storage *source, struct sockaddr_storage *dest) +{ + int prev_port; + + prev_port = get_net_port(dest); + memset(dest, 0, sizeof(*dest)); + dest->ss_family = source->ss_family; + + /* copy new addr and apply it */ + switch (source->ss_family) { + case AF_INET: + ((struct sockaddr_in *)dest)->sin_addr.s_addr = ((struct sockaddr_in *)source)->sin_addr.s_addr; + ((struct sockaddr_in *)dest)->sin_port = prev_port; + break; + case AF_INET6: + memcpy(((struct sockaddr_in6 *)dest)->sin6_addr.s6_addr, ((struct sockaddr_in6 *)source)->sin6_addr.s6_addr, sizeof(struct in6_addr)); + ((struct sockaddr_in6 *)dest)->sin6_port = prev_port; + break; + } + + return dest; +} + +char *human_time(int t, short hz_div) { + static char rv[sizeof("24855d23h")+1]; // longest of "23h59m" and "59m59s" + char *p = rv; + char *end = rv + sizeof(rv); + int cnt=2; // print two numbers + + if (unlikely(t < 0 || hz_div <= 0)) { + snprintf(p, end - p, "?"); + return rv; + } + + if (unlikely(hz_div > 1)) + t /= hz_div; + + if (t >= DAY) { + p += snprintf(p, end - p, "%dd", t / DAY); + cnt--; + } + + if (cnt && t % DAY / HOUR) { + p += snprintf(p, end - p, "%dh", t % DAY / HOUR); + cnt--; + } + + if (cnt && t % HOUR / MINUTE) { + p += snprintf(p, end - p, "%dm", t % HOUR / MINUTE); + cnt--; + } + + if ((cnt && t % MINUTE) || !t) // also display '0s' + p += snprintf(p, end - p, "%ds", t % MINUTE / SEC); + + return rv; +} + +const char *monthname[12] = { + "Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" +}; + +/* date2str_log: write a date in the format : + * sprintf(str, "%02d/%s/%04d:%02d:%02d:%02d.%03d", + * tm.tm_mday, monthname[tm.tm_mon], tm.tm_year+1900, + * tm.tm_hour, tm.tm_min, tm.tm_sec, (int)date.tv_usec/1000); + * + * without using sprintf. Return a pointer to the last char written (\0) or + * NULL if there isn't enough space. + */ +char *date2str_log(char *dst, const struct tm *tm, const struct timeval *date, size_t size) +{ + + if (size < 25) /* the size is fixed: 24 chars + \0 */ + return NULL; + + dst = utoa_pad((unsigned int)tm->tm_mday, dst, 3); // day + if (!dst) + return NULL; + *dst++ = '/'; + + memcpy(dst, monthname[tm->tm_mon], 3); // month + dst += 3; + *dst++ = '/'; + + dst = utoa_pad((unsigned int)tm->tm_year+1900, dst, 5); // year + if (!dst) + return NULL; + *dst++ = ':'; + + dst = utoa_pad((unsigned int)tm->tm_hour, dst, 3); // hour + if (!dst) + return NULL; + *dst++ = ':'; + + dst = utoa_pad((unsigned int)tm->tm_min, dst, 3); // minutes + if (!dst) + return NULL; + *dst++ = ':'; + + dst = utoa_pad((unsigned int)tm->tm_sec, dst, 3); // seconds + if (!dst) + return NULL; + *dst++ = '.'; + + dst = utoa_pad((unsigned int)(date->tv_usec/1000)%1000, dst, 4); // milliseconds + if (!dst) + return NULL; + *dst = '\0'; + + return dst; +} + +/* Base year used to compute leap years */ +#define TM_YEAR_BASE 1900 + +/* Return the difference in seconds between two times (leap seconds are ignored). + * Retrieved from glibc 2.18 source code. + */ +static int my_tm_diff(const struct tm *a, const struct tm *b) +{ + /* Compute intervening leap days correctly even if year is negative. + * Take care to avoid int overflow in leap day calculations, + * but it's OK to assume that A and B are close to each other. + */ + int a4 = (a->tm_year >> 2) + (TM_YEAR_BASE >> 2) - ! (a->tm_year & 3); + int b4 = (b->tm_year >> 2) + (TM_YEAR_BASE >> 2) - ! 
(b->tm_year & 3); + int a100 = a4 / 25 - (a4 % 25 < 0); + int b100 = b4 / 25 - (b4 % 25 < 0); + int a400 = a100 >> 2; + int b400 = b100 >> 2; + int intervening_leap_days = (a4 - b4) - (a100 - b100) + (a400 - b400); + int years = a->tm_year - b->tm_year; + int days = (365 * years + intervening_leap_days + + (a->tm_yday - b->tm_yday)); + return (60 * (60 * (24 * days + (a->tm_hour - b->tm_hour)) + + (a->tm_min - b->tm_min)) + + (a->tm_sec - b->tm_sec)); +} + +/* Return the GMT offset for a specific local time. + * Both t and tm must represent the same time. + * The string returned has the same format as returned by strftime(... "%z", tm). + * Offsets are kept in an internal cache for better performance. + */ +const char *get_gmt_offset(time_t t, struct tm *tm) +{ + /* Cache offsets from GMT (depending on whether DST is active or not) */ + static THREAD_LOCAL char gmt_offsets[2][5+1] = { "", "" }; + + char *gmt_offset; + struct tm tm_gmt; + int diff; + int isdst = tm->tm_isdst; + + /* Pretend DST not active if its status is unknown */ + if (isdst < 0) + isdst = 0; + + /* Fetch the offset and initialize it if needed */ + gmt_offset = gmt_offsets[isdst & 0x01]; + if (unlikely(!*gmt_offset)) { + get_gmtime(t, &tm_gmt); + diff = my_tm_diff(tm, &tm_gmt); + if (diff < 0) { + diff = -diff; + *gmt_offset = '-'; + } else { + *gmt_offset = '+'; + } + diff %= 86400U; + diff /= 60; /* Convert to minutes */ + snprintf(gmt_offset+1, 4+1, "%02d%02d", diff/60, diff%60); + } + + return gmt_offset; +} + +/* gmt2str_log: write a date in the format : + * "%02d/%s/%04d:%02d:%02d:%02d +0000" without using snprintf + * return a pointer to the last char written (\0) or + * NULL if there isn't enough space. + */ +char *gmt2str_log(char *dst, struct tm *tm, size_t size) +{ + if (size < 27) /* the size is fixed: 26 chars + \0 */ + return NULL; + + dst = utoa_pad((unsigned int)tm->tm_mday, dst, 3); // day + if (!dst) + return NULL; + *dst++ = '/'; + + memcpy(dst, monthname[tm->tm_mon], 3); // month + dst += 3; + *dst++ = '/'; + + dst = utoa_pad((unsigned int)tm->tm_year+1900, dst, 5); // year + if (!dst) + return NULL; + *dst++ = ':'; + + dst = utoa_pad((unsigned int)tm->tm_hour, dst, 3); // hour + if (!dst) + return NULL; + *dst++ = ':'; + + dst = utoa_pad((unsigned int)tm->tm_min, dst, 3); // minutes + if (!dst) + return NULL; + *dst++ = ':'; + + dst = utoa_pad((unsigned int)tm->tm_sec, dst, 3); // seconds + if (!dst) + return NULL; + *dst++ = ' '; + *dst++ = '+'; + *dst++ = '0'; + *dst++ = '0'; + *dst++ = '0'; + *dst++ = '0'; + *dst = '\0'; + + return dst; +} + +/* localdate2str_log: write a date in the format : + * "%02d/%s/%04d:%02d:%02d:%02d +0000(local timezone)" without using snprintf + * Both t and tm must represent the same time. + * return a pointer to the last char written (\0) or + * NULL if there isn't enough space.
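+ * For example, with a CEST (UTC+2) local time this emits a 26-char string
+ * such as "08/Jul/2024:15:04:05 +0200" (hypothetical date).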
+ */ +char *localdate2str_log(char *dst, time_t t, struct tm *tm, size_t size) +{ + const char *gmt_offset; + if (size < 27) /* the size is fixed: 26 chars + \0 */ + return NULL; + + gmt_offset = get_gmt_offset(t, tm); + + dst = utoa_pad((unsigned int)tm->tm_mday, dst, 3); // day + if (!dst) + return NULL; + *dst++ = '/'; + + memcpy(dst, monthname[tm->tm_mon], 3); // month + dst += 3; + *dst++ = '/'; + + dst = utoa_pad((unsigned int)tm->tm_year+1900, dst, 5); // year + if (!dst) + return NULL; + *dst++ = ':'; + + dst = utoa_pad((unsigned int)tm->tm_hour, dst, 3); // hour + if (!dst) + return NULL; + *dst++ = ':'; + + dst = utoa_pad((unsigned int)tm->tm_min, dst, 3); // minutes + if (!dst) + return NULL; + *dst++ = ':'; + + dst = utoa_pad((unsigned int)tm->tm_sec, dst, 3); // seconds + if (!dst) + return NULL; + *dst++ = ' '; + + memcpy(dst, gmt_offset, 5); // Offset from local time to GMT + dst += 5; + *dst = '\0'; + + return dst; +} + +/* Returns the number of seconds since 01/01/1970 0:0:0 GMT for GMT date <tm>. + * It is meant as a portable replacement for timegm() for use with valid inputs. + * Returns undefined results for invalid dates (eg: months out of range 0..11). + */ +time_t my_timegm(const struct tm *tm) +{ + /* Each month has 28, 29, 30 or 31 days, or 28+N. The date in the year + * is thus (current month - 1)*28 + cumulated_N[month] to count the + * sum of the extra N days for elapsed months. The sum of all these N + * days doesn't exceed 30 for a complete year (366-12*28) so it fits + * in a 5-bit word. This means that with 60 bits we can represent a + * matrix of all these values at once, which is fast and efficient to + * access. The extra February day for leap years is not counted here. + * + * Jan : none = 0 (0) + * Feb : Jan = 3 (3) + * Mar : Jan..Feb = 3 (3 + 0) + * Apr : Jan..Mar = 6 (3 + 0 + 3) + * May : Jan..Apr = 8 (3 + 0 + 3 + 2) + * Jun : Jan..May = 11 (3 + 0 + 3 + 2 + 3) + * Jul : Jan..Jun = 13 (3 + 0 + 3 + 2 + 3 + 2) + * Aug : Jan..Jul = 16 (3 + 0 + 3 + 2 + 3 + 2 + 3) + * Sep : Jan..Aug = 19 (3 + 0 + 3 + 2 + 3 + 2 + 3 + 3) + * Oct : Jan..Sep = 21 (3 + 0 + 3 + 2 + 3 + 2 + 3 + 3 + 2) + * Nov : Jan..Oct = 24 (3 + 0 + 3 + 2 + 3 + 2 + 3 + 3 + 2 + 3) + * Dec : Jan..Nov = 26 (3 + 0 + 3 + 2 + 3 + 2 + 3 + 3 + 2 + 3 + 2) + */ + uint64_t extra = + ( 0ULL << 0*5) + ( 3ULL << 1*5) + ( 3ULL << 2*5) + /* Jan, Feb, Mar, */ + ( 6ULL << 3*5) + ( 8ULL << 4*5) + (11ULL << 5*5) + /* Apr, May, Jun, */ + (13ULL << 6*5) + (16ULL << 7*5) + (19ULL << 8*5) + /* Jul, Aug, Sep, */ + (21ULL << 9*5) + (24ULL << 10*5) + (26ULL << 11*5); /* Oct, Nov, Dec, */ + + unsigned int y = tm->tm_year + 1900; + unsigned int m = tm->tm_mon; + unsigned long days = 0; + + /* days since 1/1/1970 for full years */ + days += days_since_zero(y) - days_since_zero(1970); + + /* days for full months in the current year */ + days += 28 * m + ((extra >> (m * 5)) & 0x1f); + + /* count + 1 after March for leap years. A leap year is a year multiple + * of 4, unless it's multiple of 100 without being multiple of 400. 2000 + * is leap, 1900 isn't, 1904 is. + */ + if ((m > 1) && !(y & 3) && ((y % 100) || !(y % 400))) + days++; + + days += tm->tm_mday - 1; + return days * 86400ULL + tm->tm_hour * 3600 + tm->tm_min * 60 + tm->tm_sec; +} + +/* This function checks a char. It returns true and updates + * the <date> and <len> pointers to the new position if the + * character is found.
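+ * For example, parse_expect_char(&date, &len, ':') consumes a single ':'
+ * and returns 1, or returns 0 and leaves <date> and <len> untouched.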
+ */ +static inline int parse_expect_char(const char **date, int *len, char c) +{ + if (*len < 1 || **date != c) + return 0; + (*len)--; + (*date)++; + return 1; +} + +/* This function expects a string <str> of length <l>. It returns true and updates + * <date> and <len> if the string matches, otherwise it returns false. + */ +static inline int parse_strcmp(const char **date, int *len, char *str, int l) +{ + if (*len < l || strncmp(*date, str, l) != 0) + return 0; + (*len) -= l; + (*date) += l; + return 1; +} + +/* This macro converts a 3-char name into an integer. */ +#define STR2I3(__a, __b, __c) ((__a) * 65536 + (__b) * 256 + (__c)) + +/* day-name = %x4D.6F.6E ; "Mon", case-sensitive + * / %x54.75.65 ; "Tue", case-sensitive + * / %x57.65.64 ; "Wed", case-sensitive + * / %x54.68.75 ; "Thu", case-sensitive + * / %x46.72.69 ; "Fri", case-sensitive + * / %x53.61.74 ; "Sat", case-sensitive + * / %x53.75.6E ; "Sun", case-sensitive + * + * This array must be alphabetically sorted + */ +static inline int parse_http_dayname(const char **date, int *len, struct tm *tm) +{ + if (*len < 3) + return 0; + switch (STR2I3((*date)[0], (*date)[1], (*date)[2])) { + case STR2I3('M','o','n'): tm->tm_wday = 1; break; + case STR2I3('T','u','e'): tm->tm_wday = 2; break; + case STR2I3('W','e','d'): tm->tm_wday = 3; break; + case STR2I3('T','h','u'): tm->tm_wday = 4; break; + case STR2I3('F','r','i'): tm->tm_wday = 5; break; + case STR2I3('S','a','t'): tm->tm_wday = 6; break; + case STR2I3('S','u','n'): tm->tm_wday = 7; break; + default: return 0; + } + *len -= 3; + *date += 3; + return 1; +} + +/* month = %x4A.61.6E ; "Jan", case-sensitive + * / %x46.65.62 ; "Feb", case-sensitive + * / %x4D.61.72 ; "Mar", case-sensitive + * / %x41.70.72 ; "Apr", case-sensitive + * / %x4D.61.79 ; "May", case-sensitive + * / %x4A.75.6E ; "Jun", case-sensitive + * / %x4A.75.6C ; "Jul", case-sensitive + * / %x41.75.67 ; "Aug", case-sensitive + * / %x53.65.70 ; "Sep", case-sensitive + * / %x4F.63.74 ; "Oct", case-sensitive + * / %x4E.6F.76 ; "Nov", case-sensitive + * / %x44.65.63 ; "Dec", case-sensitive + * + * This array must be alphabetically sorted + */ +static inline int parse_http_monthname(const char **date, int *len, struct tm *tm) +{ + if (*len < 3) + return 0; + switch (STR2I3((*date)[0], (*date)[1], (*date)[2])) { + case STR2I3('J','a','n'): tm->tm_mon = 0; break; + case STR2I3('F','e','b'): tm->tm_mon = 1; break; + case STR2I3('M','a','r'): tm->tm_mon = 2; break; + case STR2I3('A','p','r'): tm->tm_mon = 3; break; + case STR2I3('M','a','y'): tm->tm_mon = 4; break; + case STR2I3('J','u','n'): tm->tm_mon = 5; break; + case STR2I3('J','u','l'): tm->tm_mon = 6; break; + case STR2I3('A','u','g'): tm->tm_mon = 7; break; + case STR2I3('S','e','p'): tm->tm_mon = 8; break; + case STR2I3('O','c','t'): tm->tm_mon = 9; break; + case STR2I3('N','o','v'): tm->tm_mon = 10; break; + case STR2I3('D','e','c'): tm->tm_mon = 11; break; + default: return 0; + } + *len -= 3; + *date += 3; + return 1; +} + +/* day-name-l = %x4D.6F.6E.64.61.79 ; "Monday", case-sensitive + * / %x54.75.65.73.64.61.79 ; "Tuesday", case-sensitive + * / %x57.65.64.6E.65.73.64.61.79 ; "Wednesday", case-sensitive + * / %x54.68.75.72.73.64.61.79 ; "Thursday", case-sensitive + * / %x46.72.69.64.61.79 ; "Friday", case-sensitive + * / %x53.61.74.75.72.64.61.79 ; "Saturday", case-sensitive + * / %x53.75.6E.64.61.79 ; "Sunday", case-sensitive + * + * This array must be alphabetically sorted + */ +static inline int parse_http_ldayname(const char **date, int *len, struct tm *tm) +{ + if
(*len < 6) /* Minimum length. */ + return 0; + switch (STR2I3((*date)[0], (*date)[1], (*date)[2])) { + case STR2I3('M','o','n'): + RET0_UNLESS(parse_strcmp(date, len, "Monday", 6)); + tm->tm_wday = 1; + return 1; + case STR2I3('T','u','e'): + RET0_UNLESS(parse_strcmp(date, len, "Tuesday", 7)); + tm->tm_wday = 2; + return 1; + case STR2I3('W','e','d'): + RET0_UNLESS(parse_strcmp(date, len, "Wednesday", 9)); + tm->tm_wday = 3; + return 1; + case STR2I3('T','h','u'): + RET0_UNLESS(parse_strcmp(date, len, "Thursday", 8)); + tm->tm_wday = 4; + return 1; + case STR2I3('F','r','i'): + RET0_UNLESS(parse_strcmp(date, len, "Friday", 6)); + tm->tm_wday = 5; + return 1; + case STR2I3('S','a','t'): + RET0_UNLESS(parse_strcmp(date, len, "Saturday", 8)); + tm->tm_wday = 6; + return 1; + case STR2I3('S','u','n'): + RET0_UNLESS(parse_strcmp(date, len, "Sunday", 6)); + tm->tm_wday = 7; + return 1; + } + return 0; +} + +/* This function parses exactly 1 digit and returns the numeric value in "digit". */ +static inline int parse_digit(const char **date, int *len, int *digit) +{ + if (*len < 1 || **date < '0' || **date > '9') + return 0; + *digit = (**date - '0'); + (*date)++; + (*len)--; + return 1; +} + +/* This function parses exactly 2 digits and returns the numeric value in "digit". */ +static inline int parse_2digit(const char **date, int *len, int *digit) +{ + int value; + + RET0_UNLESS(parse_digit(date, len, &value)); + (*digit) = value * 10; + RET0_UNLESS(parse_digit(date, len, &value)); + (*digit) += value; + + return 1; +} + +/* This function parses exactly 4 digits and returns the numeric value in "digit". */ +static inline int parse_4digit(const char **date, int *len, int *digit) +{ + int value; + + RET0_UNLESS(parse_digit(date, len, &value)); + (*digit) = value * 1000; + + RET0_UNLESS(parse_digit(date, len, &value)); + (*digit) += value * 100; + + RET0_UNLESS(parse_digit(date, len, &value)); + (*digit) += value * 10; + + RET0_UNLESS(parse_digit(date, len, &value)); + (*digit) += value; + + return 1; +} + +/* time-of-day = hour ":" minute ":" second + * ; 00:00:00 - 23:59:60 (leap second) + * + * hour = 2DIGIT + * minute = 2DIGIT + * second = 2DIGIT + */ +static inline int parse_http_time(const char **date, int *len, struct tm *tm) +{ + RET0_UNLESS(parse_2digit(date, len, &tm->tm_hour)); /* hour 2DIGIT */ + RET0_UNLESS(parse_expect_char(date, len, ':')); /* expect ":" */ + RET0_UNLESS(parse_2digit(date, len, &tm->tm_min)); /* min 2DIGIT */ + RET0_UNLESS(parse_expect_char(date, len, ':')); /* expect ":" */ + RET0_UNLESS(parse_2digit(date, len, &tm->tm_sec)); /* sec 2DIGIT */ + return 1; +} + +/* From RFC7231 + * https://tools.ietf.org/html/rfc7231#section-7.1.1.1 + * + * IMF-fixdate = day-name "," SP date1 SP time-of-day SP GMT + * ; fixed length/zone/capitalization subset of the format + * ; see Section 3.3 of [RFC5322] + * + * + * date1 = day SP month SP year + * ; e.g., 02 Jun 1982 + * + * day = 2DIGIT + * year = 4DIGIT + * + * GMT = %x47.4D.54 ; "GMT", case-sensitive + * + * time-of-day = hour ":" minute ":" second + * ; 00:00:00 - 23:59:60 (leap second) + * + * hour = 2DIGIT + * minute = 2DIGIT + * second = 2DIGIT + * + * DIGIT = decimal 0-9 + */ +int parse_imf_date(const char *date, int len, struct tm *tm) +{ + /* tm_gmtoff, if present, ought to be zero'ed */ + memset(tm, 0, sizeof(*tm)); + + RET0_UNLESS(parse_http_dayname(&date, &len, tm)); /* day-name */ + RET0_UNLESS(parse_expect_char(&date, &len, ',')); /* expect "," */ + RET0_UNLESS(parse_expect_char(&date, &len, ' ')); /* expect SP */ + 
RET0_UNLESS(parse_2digit(&date, &len, &tm->tm_mday)); /* day 2DIGIT */ + RET0_UNLESS(parse_expect_char(&date, &len, ' ')); /* expect SP */ + RET0_UNLESS(parse_http_monthname(&date, &len, tm)); /* Month */ + RET0_UNLESS(parse_expect_char(&date, &len, ' ')); /* expect SP */ + RET0_UNLESS(parse_4digit(&date, &len, &tm->tm_year)); /* year = 4DIGIT */ + tm->tm_year -= 1900; + RET0_UNLESS(parse_expect_char(&date, &len, ' ')); /* expect SP */ + RET0_UNLESS(parse_http_time(&date, &len, tm)); /* Parse time. */ + RET0_UNLESS(parse_expect_char(&date, &len, ' ')); /* expect SP */ + RET0_UNLESS(parse_strcmp(&date, &len, "GMT", 3)); /* GMT = %x47.4D.54 ; "GMT", case-sensitive */ + tm->tm_isdst = -1; + return 1; +} + +/* From RFC7231 + * https://tools.ietf.org/html/rfc7231#section-7.1.1.1 + * + * rfc850-date = day-name-l "," SP date2 SP time-of-day SP GMT + * date2 = day "-" month "-" 2DIGIT + * ; e.g., 02-Jun-82 + * + * day = 2DIGIT + */ +int parse_rfc850_date(const char *date, int len, struct tm *tm) +{ + int year; + + /* tm_gmtoff, if present, ought to be zero'ed */ + memset(tm, 0, sizeof(*tm)); + + RET0_UNLESS(parse_http_ldayname(&date, &len, tm)); /* Read the day name */ + RET0_UNLESS(parse_expect_char(&date, &len, ',')); /* expect "," */ + RET0_UNLESS(parse_expect_char(&date, &len, ' ')); /* expect SP */ + RET0_UNLESS(parse_2digit(&date, &len, &tm->tm_mday)); /* day 2DIGIT */ + RET0_UNLESS(parse_expect_char(&date, &len, '-')); /* expect "-" */ + RET0_UNLESS(parse_http_monthname(&date, &len, tm)); /* Month */ + RET0_UNLESS(parse_expect_char(&date, &len, '-')); /* expect "-" */ + + /* year = 2DIGIT + * + * Recipients of a timestamp value in rfc850-date format, which uses a + * two-digit year, MUST interpret a timestamp that appears to be more + * than 50 years in the future as representing the most recent year in + * the past that had the same last two digits. + */ + RET0_UNLESS(parse_2digit(&date, &len, &tm->tm_year)); + + /* expect SP */ + if (!parse_expect_char(&date, &len, ' ')) { + /* Maybe we have the date with 4 digits. */ + RET0_UNLESS(parse_2digit(&date, &len, &year)); + tm->tm_year = (tm->tm_year * 100 + year) - 1900; + /* expect SP */ + RET0_UNLESS(parse_expect_char(&date, &len, ' ')); + } else { + /* We use 60 as the pivot: >60 means 19xx, <=60 means 20xx. Note + * that tm_year is the number of years since 1900, so for 19xx we + * do nothing, and for 20xx we add 100. + */ + if (tm->tm_year <= 60) + tm->tm_year += 100; + } + + RET0_UNLESS(parse_http_time(&date, &len, tm)); /* Parse time. */ + RET0_UNLESS(parse_expect_char(&date, &len, ' ')); /* expect SP */ + RET0_UNLESS(parse_strcmp(&date, &len, "GMT", 3)); /* GMT = %x47.4D.54 ; "GMT", case-sensitive */ + tm->tm_isdst = -1; + + return 1; +} + +/* From RFC7231 + * https://tools.ietf.org/html/rfc7231#section-7.1.1.1 + * + * asctime-date = day-name SP date3 SP time-of-day SP year + * date3 = month SP ( 2DIGIT / ( SP 1DIGIT )) + * ; e.g., Jun 2 + * + * HTTP-date is case sensitive. A sender MUST NOT generate additional + * whitespace in an HTTP-date beyond that specifically included as SP in + * the grammar.
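+ *
+ * For example "Sun Nov  6 08:49:37 1994": note the two spaces before the
+ * one-digit day, matching the ( SP 1DIGIT ) alternative above.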
+ */ +int parse_asctime_date(const char *date, int len, struct tm *tm) +{ + /* tm_gmtoff, if present, ought to be zero'ed */ + memset(tm, 0, sizeof(*tm)); + + RET0_UNLESS(parse_http_dayname(&date, &len, tm)); /* day-name */ + RET0_UNLESS(parse_expect_char(&date, &len, ' ')); /* expect SP */ + RET0_UNLESS(parse_http_monthname(&date, &len, tm)); /* expect month */ + RET0_UNLESS(parse_expect_char(&date, &len, ' ')); /* expect SP */ + + /* expect SP and 1DIGIT or 2DIGIT */ + if (parse_expect_char(&date, &len, ' ')) + RET0_UNLESS(parse_digit(&date, &len, &tm->tm_mday)); + else + RET0_UNLESS(parse_2digit(&date, &len, &tm->tm_mday)); + + RET0_UNLESS(parse_expect_char(&date, &len, ' ')); /* expect SP */ + RET0_UNLESS(parse_http_time(&date, &len, tm)); /* Parse time. */ + RET0_UNLESS(parse_expect_char(&date, &len, ' ')); /* expect SP */ + RET0_UNLESS(parse_4digit(&date, &len, &tm->tm_year)); /* year = 4DIGIT */ + tm->tm_year -= 1900; + tm->tm_isdst = -1; + return 1; +} + +/* From RFC7231 + * https://tools.ietf.org/html/rfc7231#section-7.1.1.1 + * + * HTTP-date = IMF-fixdate / obs-date + * obs-date = rfc850-date / asctime-date + * + * parses an HTTP date in the RFC format and its accepted + * alternatives. <date> is the string containing the date, + * <len> is the length of the string. <tm> is filled with the + * parsed time, which must be considered as GMT. + */ +int parse_http_date(const char *date, int len, struct tm *tm) +{ + if (parse_imf_date(date, len, tm)) + return 1; + + if (parse_rfc850_date(date, len, tm)) + return 1; + + if (parse_asctime_date(date, len, tm)) + return 1; + + return 0; +} + +/* print the time <ns> in a short form (exactly 7 chars) at the end of buffer + * <out>. "-" is printed if the value is zero, "inf" if larger than 1000 years. + * It returns the new buffer length, or 0 if it doesn't fit. The value will be + * surrounded by <pfx> and <sfx> respectively if not NULL. + */ +int print_time_short(struct buffer *out, const char *pfx, uint64_t ns, const char *sfx) +{ + double val = ns; // 52 bits of mantissa keep ns accuracy over 52 days + const char *unit; + + if (!pfx) + pfx = ""; + if (!sfx) + sfx = ""; + + do { + unit = " - "; if (val <= 0.0) break; + unit = "ns"; if (val < 1000.0) break; + unit = "us"; val /= 1000.0; if (val < 1000.0) break; + unit = "ms"; val /= 1000.0; if (val < 1000.0) break; + unit = "s "; val /= 1000.0; if (val < 60.0) break; + unit = "m "; val /= 60.0; if (val < 60.0) break; + unit = "h "; val /= 60.0; if (val < 24.0) break; + unit = "d "; val /= 24.0; if (val < 365.0) break; + unit = "yr"; val /= 365.0; if (val < 1000.0) break; + unit = " inf "; val = 0.0; break; + } while (0); + + if (val <= 0.0) + return chunk_appendf(out, "%s%7s%s", pfx, unit, sfx); + else if (val < 10.0) + return chunk_appendf(out, "%s%1.3f%s%s", pfx, val, unit, sfx); + else if (val < 100.0) + return chunk_appendf(out, "%s%2.2f%s%s", pfx, val, unit, sfx); + else + return chunk_appendf(out, "%s%3.1f%s%s", pfx, val, unit, sfx); +} + +/* Dynamically allocates a string of the proper length to hold the formatted + * output. NULL is returned on error. The caller is responsible for freeing the + * memory area using free(). The resulting string is returned in <out> if the + * pointer is not NULL.
A previous version of <out> might be used to build the + * new string, and it will be freed before returning if it is not NULL, which + * makes it possible to build complex strings from iterative calls without + * having to care about freeing intermediate values, as in the example below : + * + * memprintf(&err, "invalid argument: '%s'", arg); + * ... + * memprintf(&err, "parser said : <%s>\n", err); + * ... + * free(err); + * + * This means that <err> must be initialized to NULL before first invocation. + * The return value also holds the allocated string, which eases error checking + * and immediate consumption. If the output pointer is not used, NULL must be + * passed instead and it will be ignored. The returned message will then also + * be NULL so that the caller does not have to bother with freeing anything. + * + * It is also convenient to use it without any free except the last one : + * err = NULL; + * if (!fct1(&err)) report(err); + * if (!fct2(&err)) report(err); + * if (!fct3(&err)) report(err); + * free(err); + * + * memprintf relies on memvprintf. The latter can be called from any + * function with variadic arguments. + */ +char *memvprintf(char **out, const char *format, va_list orig_args) +{ + va_list args; + char *ret = NULL; + int allocated = 0; + int needed = 0; + + if (!out) + return NULL; + + do { + char buf1; + + /* vsnprintf() will return the required length even when the + * target buffer is NULL. We do this in a loop just in case + * intermediate evaluations go wrong. + */ + va_copy(args, orig_args); + needed = vsnprintf(ret ? ret : &buf1, allocated, format, args); + va_end(args); + if (needed < allocated) { + /* Note: on Solaris 8, the first iteration always + * returns -1 if allocated is zero, so we force a + * retry. + */ + if (!allocated) + needed = 0; + else + break; + } + + allocated = needed + 1; + ret = my_realloc2(ret, allocated); + } while (ret); + + if (needed < 0) { + /* an error was encountered */ + ha_free(&ret); + } + + if (out) { + free(*out); + *out = ret; + } + + return ret; +} + +char *memprintf(char **out, const char *format, ...) +{ + va_list args; + char *ret = NULL; + + va_start(args, format); + ret = memvprintf(out, format, args); + va_end(args); + + return ret; +} + +/* Used to add <level> spaces before each line of <out>, unless there is only one line. + * The input argument is automatically freed and reassigned. The result will have to be + * freed by the caller. It also supports being passed a NULL which results in the same + * output. + * Example of use : + * parse(cmd, &err); (callee: memprintf(&err, ...)) + * fprintf(stderr, "Parser said: %s\n", indent_msg(&err, 2)); + * free(err); + */ +char *indent_msg(char **out, int level) +{ + char *ret, *in, *p; + int needed = 0; + int lf = 0; + int lastlf = 0; + int len; + + if (!out || !*out) + return NULL; + + in = *out - 1; + while ((in = strchr(in + 1, '\n')) != NULL) { + lastlf = in - *out; + lf++; + } + + if (!lf) /* single line, no LF, return it as-is */ + return *out; + + len = strlen(*out); + + if (lf == 1 && lastlf == len - 1) { + /* single line, LF at end, strip it and return as-is */ + (*out)[lastlf] = 0; + return *out; + } + + /* OK now we have at least one LF, we need to process the whole string + * as a multi-line string. What we'll do : + * - prefix with an LF if there is none + * - add <level> spaces before each line + * This means at most ( 1 + level + (len-lf) + lf*(1+level) ) = + * 1 + level + len + lf * level = 1 + level * (lf + 1) + len.
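+ * As a quick sanity check of this bound (hypothetical input): with
+ * level=4 and in="a\nb" (len=3, lf=1), it gives 1 + 4*2 + 3 = 12 chars
+ * for "\n    a\n    b", plus the trailing zero allocated below.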
+ */ + + needed = 1 + level * (lf + 1) + len + 1; + p = ret = malloc(needed); + in = *out; + + /* skip initial LFs */ + while (*in == '\n') + in++; + + /* copy each line, prefixed with LF and <level> spaces, and without the trailing LF */ + while (*in) { + *p++ = '\n'; + memset(p, ' ', level); + p += level; + do { + *p++ = *in++; + } while (*in && *in != '\n'); + if (*in) + in++; + } + *p = 0; + + free(*out); + *out = ret; + + return ret; +} + +/* makes a copy of message <in> into <out>, with each line prefixed with <pfx> + * and end of lines replaced with <eol> if not 0. The first line to indent has + * to be indicated in <first> (starts at zero), so that it is possible to skip + * indenting the first line if it has to be appended after an existing message. + * Empty strings are never indented, and NULL strings are considered empty both + * for <in> and <pfx>. It returns non-zero if an EOL was appended as the last + * character, zero otherwise. + */ +int append_prefixed_str(struct buffer *out, const char *in, const char *pfx, char eol, int first) +{ + int bol, lf; + int pfxlen = pfx ? strlen(pfx) : 0; + + if (!in) + return 0; + + bol = 1; + lf = 0; + while (*in) { + if (bol && pfxlen) { + if (first > 0) + first--; + else + b_putblk(out, pfx, pfxlen); + bol = 0; + } + + lf = (*in == '\n'); + bol |= lf; + b_putchr(out, (lf && eol) ? eol : *in); + in++; + } + return lf; +} + +/* removes environment variable <name> from the environment as found in + * environ. This is only provided as an alternative for systems without + * unsetenv() (old Solaris and AIX versions). THIS IS NOT THREAD SAFE. + * The principle is to scan environ for each occurrence of variable name + * <name> and to replace the matching pointers with the last pointer of + * the array (since variables are not ordered). + * It always returns 0 (success). + */ +int my_unsetenv(const char *name) +{ + extern char **environ; + char **p = environ; + int vars; + int next; + int len; + + len = strlen(name); + for (vars = 0; p[vars]; vars++) + ; + next = 0; + while (next < vars) { + if (strncmp(p[next], name, len) != 0 || p[next][len] != '=') { + next++; + continue; + } + if (next < vars - 1) + p[next] = p[vars - 1]; + p[--vars] = NULL; + } + return 0; +} + +/* Convert occurrences of environment variables in the input string to their + * corresponding value. A variable is identified as a series of alphanumeric + * characters or underscores following a '$' sign. The <in> string must be + * free()able. NULL returns NULL. The resulting string might be reallocated if + * some expansion is made. Variable names may also be enclosed into braces if + * needed (eg: to concatenate alphanum characters).
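+ *
+ * A minimal usage sketch (assuming HOME is set to "/root" here):
+ *
+ *   char *path = env_expand(strdup("${HOME}/maps"));
+ *   // path now holds "/root/maps" and must be free()d by the caller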
+ */ +char *env_expand(char *in) +{ + char *txt_beg; + char *out; + char *txt_end; + char *var_beg; + char *var_end; + char *value; + char *next; + int out_len; + int val_len; + + if (!in) + return in; + + value = out = NULL; + out_len = 0; + + txt_beg = in; + do { + /* look for next '$' sign in <in> */ + for (txt_end = txt_beg; *txt_end && *txt_end != '$'; txt_end++); + + if (!*txt_end && !out) /* end and no expansion performed */ + return in; + + val_len = 0; + next = txt_end; + if (*txt_end == '$') { + char save; + + var_beg = txt_end + 1; + if (*var_beg == '{') + var_beg++; + + var_end = var_beg; + while (isalnum((unsigned char)*var_end) || *var_end == '_') { + var_end++; + } + + next = var_end; + if (*var_end == '}' && (var_beg > txt_end + 1)) + next++; + + /* get value of the variable name at this location */ + save = *var_end; + *var_end = '\0'; + value = getenv(var_beg); + *var_end = save; + val_len = value ? strlen(value) : 0; + } + + out = my_realloc2(out, out_len + (txt_end - txt_beg) + val_len + 1); + if (txt_end > txt_beg) { + memcpy(out + out_len, txt_beg, txt_end - txt_beg); + out_len += txt_end - txt_beg; + } + if (val_len) { + memcpy(out + out_len, value, val_len); + out_len += val_len; + } + out[out_len] = 0; + txt_beg = next; + } while (*txt_beg); + + /* here we know that <out> was allocated and that we don't need <in> anymore */ + free(in); + return out; +} + + +/* same as strstr() but case-insensitive and with length limit */ +const char *strnistr(const char *str1, int len_str1, const char *str2, int len_str2) +{ + char *pptr, *sptr, *start; + unsigned int slen, plen; + unsigned int tmp1, tmp2; + + if (str1 == NULL || len_str1 == 0) // searching a pattern in an empty string => not found + return NULL; + + if (str2 == NULL || len_str2 == 0) // pattern is empty => every str1 matches + return str1; + + if (len_str1 < len_str2) // pattern is longer than string => search is not found + return NULL; + + for (tmp1 = 0, start = (char *)str1, pptr = (char *)str2, slen = len_str1, plen = len_str2; slen >= plen; start++, slen--) { + while (toupper((unsigned char)*start) != toupper((unsigned char)*str2)) { + start++; + slen--; + tmp1++; + + if (tmp1 >= len_str1) + return NULL; + + /* if pattern longer than string */ + if (slen < plen) + return NULL; + } + + sptr = start; + pptr = (char *)str2; + + tmp2 = 0; + while (toupper((unsigned char)*sptr) == toupper((unsigned char)*pptr)) { + sptr++; + pptr++; + tmp2++; + + if (*pptr == '\0' || tmp2 == len_str2) /* end of pattern found */ + return start; + if (*sptr == '\0' || tmp2 == len_str1) /* end of string found and the pattern is not fully found */ + return NULL; + } + } + return NULL; +} + +/* Returns true if s1 < s2 < s3 otherwise zero. Both s1 and s3 may be NULL and + * in this case only non-null strings are compared. This allows passing initial + * values in iterators and in sort functions. + */ +int strordered(const char *s1, const char *s2, const char *s3) +{ + return (!s1 || strcmp(s1, s2) < 0) && (!s3 || strcmp(s2, s3) < 0); +} + +/* This function reads the next valid UTF-8 char. + * <s> is the byte array to be decoded, <len> is its length. + * The function returns the decoded char encoded like this: + * The 4 msb are the return code (UTF8_CODE_*), the 4 lsb + * are the length read. The decoded character is stored in <c>.
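+ * For example, decoding the 2-byte sequence 0xC3 0xA9 ("é") returns
+ * UTF8_CODE_OK | 2 and stores 0xE9 in <c>.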
+ */ +unsigned char utf8_next(const char *s, int len, unsigned int *c) +{ + const unsigned char *p = (unsigned char *)s; + int dec; + unsigned char code = UTF8_CODE_OK; + + if (len < 1) + return UTF8_CODE_OK; + + /* Check the type of UTF8 sequence + * + * 0... .... 0x00 <= x <= 0x7f : 1 byte: ascii char + * 10.. .... 0x80 <= x <= 0xbf : invalid sequence + * 110. .... 0xc0 <= x <= 0xdf : 2 bytes + * 1110 .... 0xe0 <= x <= 0xef : 3 bytes + * 1111 0... 0xf0 <= x <= 0xf7 : 4 bytes + * 1111 10.. 0xf8 <= x <= 0xfb : 5 bytes + * 1111 110. 0xfc <= x <= 0xfd : 6 bytes + * 1111 111. 0xfe <= x <= 0xff : invalid sequence + */ + switch (*p) { + case 0x00 ... 0x7f: + *c = *p; + return UTF8_CODE_OK | 1; + + case 0x80 ... 0xbf: + *c = *p; + return UTF8_CODE_BADSEQ | 1; + + case 0xc0 ... 0xdf: + if (len < 2) { + *c = *p; + return UTF8_CODE_BADSEQ | 1; + } + *c = *p & 0x1f; + dec = 1; + break; + + case 0xe0 ... 0xef: + if (len < 3) { + *c = *p; + return UTF8_CODE_BADSEQ | 1; + } + *c = *p & 0x0f; + dec = 2; + break; + + case 0xf0 ... 0xf7: + if (len < 4) { + *c = *p; + return UTF8_CODE_BADSEQ | 1; + } + *c = *p & 0x07; + dec = 3; + break; + + case 0xf8 ... 0xfb: + if (len < 5) { + *c = *p; + return UTF8_CODE_BADSEQ | 1; + } + *c = *p & 0x03; + dec = 4; + break; + + case 0xfc ... 0xfd: + if (len < 6) { + *c = *p; + return UTF8_CODE_BADSEQ | 1; + } + *c = *p & 0x01; + dec = 5; + break; + + case 0xfe ... 0xff: + default: + *c = *p; + return UTF8_CODE_BADSEQ | 1; + } + + p++; + + while (dec > 0) { + + /* need 0x10 for the 2 first bits */ + if ( ( *p & 0xc0 ) != 0x80 ) + return UTF8_CODE_BADSEQ | ((p-(unsigned char *)s)&0xffff); + + /* add data at char */ + *c = ( *c << 6 ) | ( *p & 0x3f ); + + dec--; + p++; + } + + /* Check overlong encoding. + * 1 byte : 5 + 6 : 11 : 0x80 ... 0x7ff + * 2 bytes : 4 + 6 + 6 : 16 : 0x800 ... 0xffff + * 3 bytes : 3 + 6 + 6 + 6 : 21 : 0x10000 ... 0x1fffff + */ + if (( *c <= 0x7f && (p-(unsigned char *)s) > 1) || + (*c >= 0x80 && *c <= 0x7ff && (p-(unsigned char *)s) > 2) || + (*c >= 0x800 && *c <= 0xffff && (p-(unsigned char *)s) > 3) || + (*c >= 0x10000 && *c <= 0x1fffff && (p-(unsigned char *)s) > 4)) + code |= UTF8_CODE_OVERLONG; + + /* Check invalid UTF8 range. */ + if ((*c >= 0xd800 && *c <= 0xdfff) || + (*c >= 0xfffe && *c <= 0xffff)) + code |= UTF8_CODE_INVRANGE; + + return code | ((p-(unsigned char *)s)&0x0f); +} + +/* append a copy of string <str> (in a wordlist) at the end of the list <li> + * On failure : return 0 and <err> filled with an error message. + * The caller is responsible for freeing the <err> and <str> copy + * memory area using free() + */ +int list_append_word(struct list *li, const char *str, char **err) +{ + struct wordlist *wl; + + wl = calloc(1, sizeof(*wl)); + if (!wl) { + memprintf(err, "out of memory"); + goto fail_wl; + } + + wl->s = strdup(str); + if (!wl->s) { + memprintf(err, "out of memory"); + goto fail_wl_s; + } + + LIST_APPEND(li, &wl->list); + + return 1; + +fail_wl_s: + free(wl->s); +fail_wl: + free(wl); + return 0; +} + +/* indicates if a memory location may safely be read or not. The trick consists + * in performing a harmless syscall using this location as an input and letting + * the operating system report whether it's OK or not. For this we have the + * stat() syscall, which will return EFAULT when the memory location supposed + * to contain the file name is not readable. If it is readable it will then + * either return 0 if the area contains an existing file name, or -1 with + * another code.
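+ * For example, on Linux, may_access(NULL) returns 0 since stat() fails
+ * with EFAULT on the unmapped NULL page.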
This must not be abused, and some audit systems might detect + * this as abnormal activity. It's used only for unsafe dumps. + */ +int may_access(const void *ptr) +{ + struct stat buf; + + if (stat(ptr, &buf) == 0) + return 1; + if (errno == EFAULT) + return 0; + return 1; +} + +/* print a string of text buffer to <out>. The format is : + * Non-printable chars \t, \n, \r and \e are encoded in C format. + * Other non-printable chars are encoded "\xHH". Space, '\', and '=' are also escaped. + * Printing stops if a null char or <bsize> is reached, or if there is no more room in the chunk. + */ +int dump_text(struct buffer *out, const char *buf, int bsize) +{ + unsigned char c; + size_t ptr = 0; + + while (ptr < bsize && buf[ptr]) { + c = buf[ptr]; + if (isprint((unsigned char)c) && isascii((unsigned char)c) && c != '\\' && c != ' ' && c != '=') { + if (out->data > out->size - 1) + break; + out->area[out->data++] = c; + } + else if (c == '\t' || c == '\n' || c == '\r' || c == '\e' || c == '\\' || c == ' ' || c == '=') { + if (out->data > out->size - 2) + break; + out->area[out->data++] = '\\'; + switch (c) { + case ' ': c = ' '; break; + case '\t': c = 't'; break; + case '\n': c = 'n'; break; + case '\r': c = 'r'; break; + case '\e': c = 'e'; break; + case '\\': c = '\\'; break; + case '=': c = '='; break; + } + out->area[out->data++] = c; + } + else { + if (out->data > out->size - 4) + break; + out->area[out->data++] = '\\'; + out->area[out->data++] = 'x'; + out->area[out->data++] = hextab[(c >> 4) & 0xF]; + out->area[out->data++] = hextab[c & 0xF]; + } + ptr++; + } + + return ptr; +} + +/* print a buffer in hexa. + * Printing stops if <bsize> is reached, or if there is no more room in the chunk. + */ +int dump_binary(struct buffer *out, const char *buf, int bsize) +{ + unsigned char c; + int ptr = 0; + + while (ptr < bsize) { + c = buf[ptr]; + + if (out->data > out->size - 2) + break; + out->area[out->data++] = hextab[(c >> 4) & 0xF]; + out->area[out->data++] = hextab[c & 0xF]; + + ptr++; + } + return ptr; +} + +/* Appends into buffer <out> a hex dump of memory area <buf> for <len> bytes, + * prepending each line with prefix <pfx>. The output is *not* initialized. + * The output will not wrap past the buffer's end so it is more optimal if the + * caller makes sure the buffer is aligned first. A trailing zero will always + * be appended (and not counted) if there is room for it. The caller must make + * sure that the area is dumpable first. If <unsafe> is non-null, the memory + * locations are checked first for being readable. + */ +void dump_hex(struct buffer *out, const char *pfx, const void *buf, int len, int unsafe) +{ + const unsigned char *d = buf; + int i, j, start; + + d = (const unsigned char *)(((unsigned long)buf) & -16); + start = ((unsigned long)buf) & 15; + + for (i = 0; i < start + len; i += 16) { + chunk_appendf(out, (sizeof(void *) == 4) ?
"%s%8p: " : "%s%16p: ", pfx, d + i); + + // 0: unchecked, 1: checked safe, 2: danger + unsafe = !!unsafe; + if (unsafe && !may_access(d + i)) + unsafe = 2; + + for (j = 0; j < 16; j++) { + if ((i + j < start) || (i + j >= start + len)) + chunk_strcat(out, "'' "); + else if (unsafe > 1) + chunk_strcat(out, "** "); + else + chunk_appendf(out, "%02x ", d[i + j]); + + if (j == 7) + chunk_strcat(out, "- "); + } + chunk_strcat(out, " "); + for (j = 0; j < 16; j++) { + if ((i + j < start) || (i + j >= start + len)) + chunk_strcat(out, "'"); + else if (unsafe > 1) + chunk_strcat(out, "*"); + else if (isprint((unsigned char)d[i + j])) + chunk_appendf(out, "%c", d[i + j]); + else + chunk_strcat(out, "."); + } + chunk_strcat(out, "\n"); + } +} + +/* dumps <pfx> followed by <n> bytes from <addr> in hex form into buffer <buf> + * enclosed in brackets after the address itself, formatted on 14 chars + * including the "0x" prefix. This is meant to be used as a prefix for code + * areas. For example: + * "0x7f10b6557690 [48 c7 c0 0f 00 00 00 0f]" + * It relies on may_access() to know if the bytes are dumpable, otherwise "--" + * is emitted. A NULL <pfx> will be considered empty. + */ +void dump_addr_and_bytes(struct buffer *buf, const char *pfx, const void *addr, int n) +{ + int ok = 0; + int i; + + chunk_appendf(buf, "%s%#14lx [", pfx ? pfx : "", (long)addr); + + for (i = 0; i < n; i++) { + if (i == 0 || (((long)(addr + i) ^ (long)(addr)) & 4096)) + ok = may_access(addr + i); + if (ok) + chunk_appendf(buf, "%02x%s", ((uint8_t*)addr)[i], (i<n-1) ? " " : "]"); + else + chunk_appendf(buf, "--%s", (i<n-1) ? " " : "]"); + } +} + +/* print a line of text buffer (limited to 70 bytes) to <out>. The format is : + * <2 spaces> <offset=5 digits> <space or plus> <space> <70 chars max> <\n> + * which is 60 chars per line. Non-printable chars \t, \n, \r and \e are + * encoded in C format. Other non-printable chars are encoded "\xHH". Original + * lines are respected within the limit of 70 output chars. Lines that are + * continuation of a previous truncated line begin with "+" instead of " " + * after the offset. The new pointer is returned. + */ +int dump_text_line(struct buffer *out, const char *buf, int bsize, int len, + int *line, int ptr) +{ + int end; + unsigned char c; + + end = out->data + 80; + if (end > out->size) + return ptr; + + chunk_appendf(out, " %05d%c ", ptr, (ptr == *line) ? ' ' : '+'); + + while (ptr < len && ptr < bsize) { + c = buf[ptr]; + if (isprint((unsigned char)c) && isascii((unsigned char)c) && c != '\\') { + if (out->data > end - 2) + break; + out->area[out->data++] = c; + } else if (c == '\t' || c == '\n' || c == '\r' || c == '\e' || c == '\\') { + if (out->data > end - 3) + break; + out->area[out->data++] = '\\'; + switch (c) { + case '\t': c = 't'; break; + case '\n': c = 'n'; break; + case '\r': c = 'r'; break; + case '\e': c = 'e'; break; + case '\\': c = '\\'; break; + } + out->area[out->data++] = c; + } else { + if (out->data > end - 5) + break; + out->area[out->data++] = '\\'; + out->area[out->data++] = 'x'; + out->area[out->data++] = hextab[(c >> 4) & 0xF]; + out->area[out->data++] = hextab[c & 0xF]; + } + if (buf[ptr++] == '\n') { + /* we had a line break, let's return now */ + out->area[out->data++] = '\n'; + *line = ptr; + return ptr; + } + } + /* we have an incomplete line, we return it as-is */ + out->area[out->data++] = '\n'; + return ptr; +} + +/* displays a <len> long memory block at <buf>, assuming first byte of <buf> + * has address <baseaddr>. 
String <pfx> may be placed as a prefix in front of + * each line. It may be NULL if unused. The output is emitted to file <out>. + */ +void debug_hexdump(FILE *out, const char *pfx, const char *buf, + unsigned int baseaddr, int len) +{ + unsigned int i; + int b, j; + + for (i = 0; i < (len + (baseaddr & 15)); i += 16) { + b = i - (baseaddr & 15); + fprintf(out, "%s%08x: ", pfx ? pfx : "", i + (baseaddr & ~15)); + for (j = 0; j < 8; j++) { + if (b + j >= 0 && b + j < len) + fprintf(out, "%02x ", (unsigned char)buf[b + j]); + else + fprintf(out, " "); + } + + if (b + j >= 0 && b + j < len) + fputc('-', out); + else + fputc(' ', out); + + for (j = 8; j < 16; j++) { + if (b + j >= 0 && b + j < len) + fprintf(out, " %02x", (unsigned char)buf[b + j]); + else + fprintf(out, " "); + } + + fprintf(out, " "); + for (j = 0; j < 16; j++) { + if (b + j >= 0 && b + j < len) { + if (isprint((unsigned char)buf[b + j])) + fputc((unsigned char)buf[b + j], out); + else + fputc('.', out); + } + else + fputc(' ', out); + } + fputc('\n', out); + } +} + +/* Tries to report the executable path name on platforms supporting this. If + * not found or not possible, returns NULL. + */ +const char *get_exec_path() +{ + const char *ret = NULL; + +#if defined(__linux__) && defined(__GLIBC__) && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 16)) + long execfn = getauxval(AT_EXECFN); + + if (execfn && execfn != ENOENT) + ret = (const char *)execfn; +#elif defined(__FreeBSD__) + Elf_Auxinfo *auxv; + for (auxv = __elf_aux_vector; auxv->a_type != AT_NULL; ++auxv) { + if (auxv->a_type == AT_EXECPATH) { + ret = (const char *)auxv->a_un.a_ptr; + break; + } + } +#elif defined(__NetBSD__) + AuxInfo *auxv; + for (auxv = _dlauxinfo(); auxv->a_type != AT_NULL; ++auxv) { + if (auxv->a_type == AT_SUN_EXECNAME) { + ret = (const char *)auxv->a_v; + break; + } + } +#elif defined(__sun) + ret = getexecname(); +#endif + return ret; +} + +#if (defined(__ELF__) && !defined(__linux__)) || defined(USE_DL) +/* calls dladdr() or dladdr1() on <addr> and <dli>. If dladdr1 is available, + * also returns the symbol size in <size>, otherwise returns 0 there. + */ +static int dladdr_and_size(const void *addr, Dl_info *dli, size_t *size) +{ + int ret; +#if defined(__GLIBC__) && (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 3)) // most detailed one + const ElfW(Sym) *sym __attribute__((may_alias)); + + ret = dladdr1(addr, dli, (void **)&sym, RTLD_DL_SYMENT); + if (ret) + *size = sym ? sym->st_size : 0; +#else +#if defined(__sun) + ret = dladdr((void *)addr, dli); +#else + ret = dladdr(addr, dli); +#endif + *size = 0; +#endif + return ret; +} + +/* Sets build_is_static to true if we detect a static build. Some older glibcs + * tend to crash inside dlsym() in static builds, but tests show that at least + * dladdr() still works (and will fail to resolve anything of course). Thus we + * try to determine if we're on a static build to avoid calling dlsym() in this + * case. + */ +void check_if_static_build() +{ + Dl_info dli = { }; + size_t size = 0; + + /* Now let's try to be smarter */ + if (!dladdr_and_size(&main, &dli, &size)) + build_is_static = 1; + else + build_is_static = 0; +} + +INITCALL0(STG_PREPARE, check_if_static_build); + +/* Tries to retrieve the address of the first occurrence of symbol <name>. + * Note that NULL in return is not always an error as a symbol may have that + * address in special situations.
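+ * A possible usage sketch: comparing get_sym_curr_addr("free") with
+ * get_sym_next_addr("free") can reveal whether the symbol is interposed
+ * by another object loaded earlier.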
+ */ +void *get_sym_curr_addr(const char *name) +{ + void *ptr = NULL; + +#ifdef RTLD_DEFAULT + if (!build_is_static) + ptr = dlsym(RTLD_DEFAULT, name); +#endif + return ptr; +} + + +/* Tries to retrieve the address of the next occurrence of symbol <name> + * Note that NULL in return is not always an error as a symbol may have that + * address in special situations. + */ +void *get_sym_next_addr(const char *name) +{ + void *ptr = NULL; + +#ifdef RTLD_NEXT + if (!build_is_static) + ptr = dlsym(RTLD_NEXT, name); +#endif + return ptr; +} + +#else /* elf & linux & dl */ + +/* no possible resolving on other platforms at the moment */ +void *get_sym_curr_addr(const char *name) +{ + return NULL; +} + +void *get_sym_next_addr(const char *name) +{ + return NULL; +} + +#endif /* elf & linux & dl */ + +/* Tries to append to buffer <buf> some indications about the symbol at address + * <addr> using the following form: + * lib:+0xoffset (unresolvable address from lib's base) + * main+0xoffset (unresolvable address from main (+/-)) + * lib:main+0xoffset (unresolvable lib address from main (+/-)) + * name (resolved exact exec address) + * lib:name (resolved exact lib address) + * name+0xoffset/0xsize (resolved address within exec symbol) + * lib:name+0xoffset/0xsize (resolved address within lib symbol) + * + * The file name (lib or executable) is limited to what lies between the last + * '/' and the first following '.'. An optional prefix <pfx> is prepended before + * the output if not null. The file is not dumped when it's the same as the one + * that contains the "main" symbol, or when __ELF__ && USE_DL are not set. + * + * The symbol's base address is returned, or NULL when unresolved, in order to + * allow the caller to match it against known ones. + */ +const void *resolve_sym_name(struct buffer *buf, const char *pfx, const void *addr) +{ + const struct { + const void *func; + const char *name; + } fcts[] = { + { .func = process_stream, .name = "process_stream" }, + { .func = task_run_applet, .name = "task_run_applet" }, + { .func = sc_conn_io_cb, .name = "sc_conn_io_cb" }, + { .func = sock_conn_iocb, .name = "sock_conn_iocb" }, + { .func = dgram_fd_handler, .name = "dgram_fd_handler" }, + { .func = listener_accept, .name = "listener_accept" }, + { .func = manage_global_listener_queue, .name = "manage_global_listener_queue" }, + { .func = poller_pipe_io_handler, .name = "poller_pipe_io_handler" }, + { .func = mworker_accept_wrapper, .name = "mworker_accept_wrapper" }, + { .func = session_expire_embryonic, .name = "session_expire_embryonic" }, +#ifdef USE_THREAD + { .func = accept_queue_process, .name = "accept_queue_process" }, +#endif +#ifdef USE_LUA + { .func = hlua_process_task, .name = "hlua_process_task" }, +#endif +#ifdef SSL_MODE_ASYNC + { .func = ssl_async_fd_free, .name = "ssl_async_fd_free" }, + { .func = ssl_async_fd_handler, .name = "ssl_async_fd_handler" }, +#endif +#ifdef USE_QUIC + { .func = quic_conn_sock_fd_iocb, .name = "quic_conn_sock_fd_iocb" }, +#endif + }; + +#if (defined(__ELF__) && !defined(__linux__)) || defined(USE_DL) + Dl_info dli, dli_main; + size_t size; + const char *fname, *p; +#endif + int i; + + if (pfx) + chunk_appendf(buf, "%s", pfx); + + for (i = 0; i < sizeof(fcts) / sizeof(fcts[0]); i++) { + if (addr == fcts[i].func) { + chunk_appendf(buf, "%s", fcts[i].name); + return addr; + } + } + +#if (defined(__ELF__) && !defined(__linux__)) || defined(USE_DL) + /* Now let's try to be smarter */ + if (!dladdr_and_size(addr, &dli, &size)) + goto unknown; + + /* 1. 
prefix the library name if it's not the same object as the one + * that contains the main function. The name is picked between last '/' + * and first following '.'. + */ + if (!dladdr(main, &dli_main)) + dli_main.dli_fbase = NULL; + + if (dli_main.dli_fbase != dli.dli_fbase) { + fname = dli.dli_fname; + p = strrchr(fname, '/'); + if (p++) + fname = p; + p = strchr(fname, '.'); + if (!p) + p = fname + strlen(fname); + + chunk_appendf(buf, "%.*s:", (int)(long)(p - fname), fname); + } + + /* 2. symbol name */ + if (dli.dli_sname) { + /* known, dump it and return symbol's address (exact or relative) */ + chunk_appendf(buf, "%s", dli.dli_sname); + if (addr != dli.dli_saddr) { + chunk_appendf(buf, "+%#lx", (long)(addr - dli.dli_saddr)); + if (size) + chunk_appendf(buf, "/%#lx", (long)size); + } + return dli.dli_saddr; + } + else if (dli_main.dli_fbase != dli.dli_fbase) { + /* unresolved symbol from a known library, report relative offset */ + chunk_appendf(buf, "+%#lx", (long)(addr - dli.dli_fbase)); + return NULL; + } +#endif /* __ELF__ && !__linux__ || USE_DL */ + unknown: + /* unresolved symbol from the main file, report relative offset to main */ + if ((void*)addr < (void*)main) + chunk_appendf(buf, "main-%#lx", (long)((void*)main - addr)); + else + chunk_appendf(buf, "main+%#lx", (long)(addr - (void*)main)); + return NULL; +} + +/* On systems where this is supported, let's provide a possibility to enumerate + * the list of object files. The output is appended to a buffer initialized by + * the caller, with one name per line. A trailing zero is always emitted if data + * are written. Only real objects are dumped (executable and .so libs). The + * function returns non-zero if it dumps anything. These functions do not make + * use of the trash so that it is possible for the caller to call them with the + * trash on input. The output format may be platform-specific but at least one + * version must emit raw object file names when argument is zero. + */ +#if defined(HA_HAVE_DUMP_LIBS) +# if defined(HA_HAVE_DL_ITERATE_PHDR) +/* the private <data> we pass below is a dump context initialized like this */ +struct dl_dump_ctx { + struct buffer *buf; + int with_addr; +}; + +static int dl_dump_libs_cb(struct dl_phdr_info *info, size_t size, void *data) +{ + struct dl_dump_ctx *ctx = data; + const char *fname; + size_t p1, p2, beg, end; + int idx; + + if (!info || !info->dlpi_name) + goto leave; + + if (!*info->dlpi_name) + fname = get_exec_path(); + else if (strchr(info->dlpi_name, '/')) + fname = info->dlpi_name; + else + /* else it's a VDSO or similar and we're not interested */ + goto leave; + + if (!ctx->with_addr) + goto dump_name; + + /* virtual addresses are relative to the load address and are per + * pseudo-header, so we have to scan them all to find the furthest + * one from the beginning. In this case we only dump entries if + * they have at least one section. 
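+ * For example, two headers with non-zero p_memsz, at p_vaddr 0x1000
+ * (0x500 bytes) and 0x3000 (0x200 bytes), yield beg=0x1000 and
+ * end=0x31ff, i.e. a 0x2200-byte span added to dlpi_addr below.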
+ */ + beg = ~0; end = 0; + for (idx = 0; idx < info->dlpi_phnum; idx++) { + if (!info->dlpi_phdr[idx].p_memsz) + continue; + p1 = info->dlpi_phdr[idx].p_vaddr; + if (p1 < beg) + beg = p1; + p2 = p1 + info->dlpi_phdr[idx].p_memsz - 1; + if (p2 > end) + end = p2; + } + + if (!idx) + goto leave; + + chunk_appendf(ctx->buf, "0x%012llx-0x%012llx (0x%07llx) ", + (ullong)info->dlpi_addr + beg, + (ullong)info->dlpi_addr + end, + (ullong)(end - beg + 1)); + dump_name: + chunk_appendf(ctx->buf, "%s\n", fname); + leave: + return 0; +} + +/* dumps lib names and optionally address ranges */ +int dump_libs(struct buffer *output, int with_addr) +{ + struct dl_dump_ctx ctx = { .buf = output, .with_addr = with_addr }; + size_t old_data = output->data; + + dl_iterate_phdr(dl_dump_libs_cb, &ctx); + return output->data != old_data; +} +# else // no DL_ITERATE_PHDR +# error "No dump_libs() function for this platform" +# endif +#else // no HA_HAVE_DUMP_LIBS + +/* unsupported platform: do not dump anything */ +int dump_libs(struct buffer *output, int with_addr) +{ + return 0; +} + +#endif // HA_HAVE_DUMP_LIBS + +/* + * Allocate an array of unsigned int with <nums> as address from <str> string + * made of integer separated by dot characters. + * + * First, initializes the value with <sz> as address to 0 and initializes the + * array with <nums> as address to NULL. Then allocates the array with <nums> as + * address updating <sz> pointed value to the size of this array. + * + * Returns 1 if succeeded, 0 if not. + */ +int parse_dotted_uints(const char *str, unsigned int **nums, size_t *sz) +{ + unsigned int *n; + const char *s, *end; + + s = str; + *sz = 0; + end = str + strlen(str); + *nums = n = NULL; + + while (1) { + unsigned int r; + + if (s >= end) + break; + + r = read_uint(&s, end); + /* Expected characters after having read an uint: '\0' or '.', + * if '.', must not be terminal. + */ + if (*s != '\0'&& (*s++ != '.' || s == end)) { + free(n); + return 0; + } + + n = my_realloc2(n, (*sz + 1) * sizeof *n); + if (!n) + return 0; + + n[(*sz)++] = r; + } + *nums = n; + + return 1; +} + + +/* returns the number of bytes needed to encode <v> as a varint. An inline + * version exists for use with constants (__varint_bytes()). + */ +int varint_bytes(uint64_t v) +{ + int len = 1; + + if (v >= 240) { + v = (v - 240) >> 4; + while (1) { + len++; + if (v < 128) + break; + v = (v - 128) >> 7; + } + } + return len; +} + + +/* Random number generator state, see below */ +static uint64_t ha_random_state[2] ALIGNED(2*sizeof(uint64_t)); + +/* This is a thread-safe implementation of xoroshiro128** described below: + * http://prng.di.unimi.it/ + * It features a 2^128 long sequence, returns 64 high-quality bits on each call, + * supports fast jumps and passes all common quality tests. It is thread-safe, + * uses a double-cas on 64-bit architectures supporting it, and falls back to a + * local lock on other ones. 
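+ *
+ * A minimal usage sketch (assuming the state was seeded at boot via
+ * ha_random_seed()):
+ *
+ *   uint64_t r = ha_random64(); // 64 fresh bits, callable from any thread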
+ */ +uint64_t ha_random64() +{ + uint64_t old[2] ALIGNED(2*sizeof(uint64_t)); + uint64_t new[2] ALIGNED(2*sizeof(uint64_t)); + +#if defined(USE_THREAD) && (!defined(HA_CAS_IS_8B) || !defined(HA_HAVE_CAS_DW)) + static HA_SPINLOCK_T rand_lock; + + HA_SPIN_LOCK(OTHER_LOCK, &rand_lock); +#endif + + old[0] = ha_random_state[0]; + old[1] = ha_random_state[1]; + +#if defined(USE_THREAD) && defined(HA_CAS_IS_8B) && defined(HA_HAVE_CAS_DW) + do { +#endif + new[1] = old[0] ^ old[1]; + new[0] = rotl64(old[0], 24) ^ new[1] ^ (new[1] << 16); // a, b + new[1] = rotl64(new[1], 37); // c + +#if defined(USE_THREAD) && defined(HA_CAS_IS_8B) && defined(HA_HAVE_CAS_DW) + } while (unlikely(!_HA_ATOMIC_DWCAS(ha_random_state, old, new))); +#else + ha_random_state[0] = new[0]; + ha_random_state[1] = new[1]; +#if defined(USE_THREAD) + HA_SPIN_UNLOCK(OTHER_LOCK, &rand_lock); +#endif +#endif + return rotl64(old[0] * 5, 7) * 9; +} + +/* seeds the random state using up to <len> bytes from <seed>, starting with + * the first non-zero byte. + */ +void ha_random_seed(const unsigned char *seed, size_t len) +{ + size_t pos; + + /* the seed must not be all zeroes, so we pre-fill it with alternating + * bits and overwrite part of them with the block starting at the first + * non-zero byte from the seed. + */ + memset(ha_random_state, 0x55, sizeof(ha_random_state)); + + for (pos = 0; pos < len; pos++) + if (seed[pos] != 0) + break; + + if (pos == len) + return; + + seed += pos; + len -= pos; + + if (len > sizeof(ha_random_state)) + len = sizeof(ha_random_state); + + memcpy(ha_random_state, seed, len); +} + +/* This causes a jump to (dist * 2^96) places in the pseudo-random sequence, + * and is equivalent to calling ha_random64() as many times. It is used to + * provide non-overlapping sequences of 2^96 numbers (~7*10^28) to up to 2^32 + * different generators (i.e. different processes after a fork). The <dist> + * argument is the distance to jump to and is used in a loop so it rather not + * be too large if the processing time is a concern. + * + * BEWARE: this function is NOT thread-safe and must not be called during + * concurrent accesses to ha_random64(). + */ +void ha_random_jump96(uint32_t dist) +{ + while (dist--) { + uint64_t s0 = 0; + uint64_t s1 = 0; + int b; + + for (b = 0; b < 64; b++) { + if ((0xd2a98b26625eee7bULL >> b) & 1) { + s0 ^= ha_random_state[0]; + s1 ^= ha_random_state[1]; + } + ha_random64(); + } + + for (b = 0; b < 64; b++) { + if ((0xdddf9b1090aa7ac1ULL >> b) & 1) { + s0 ^= ha_random_state[0]; + s1 ^= ha_random_state[1]; + } + ha_random64(); + } + ha_random_state[0] = s0; + ha_random_state[1] = s1; + } +} + +/* Generates an RFC4122 UUID into chunk <output> which must be at least 37 + * bytes large. + */ +void ha_generate_uuid(struct buffer *output) +{ + uint32_t rnd[4]; + uint64_t last; + + last = ha_random64(); + rnd[0] = last; + rnd[1] = last >> 32; + + last = ha_random64(); + rnd[2] = last; + rnd[3] = last >> 32; + + chunk_printf(output, "%8.8x-%4.4x-%4.4x-%4.4x-%12.12llx", + rnd[0], + rnd[1] & 0xFFFF, + ((rnd[1] >> 16u) & 0xFFF) | 0x4000, // highest 4 bits indicate the uuid version + (rnd[2] & 0x3FFF) | 0x8000, // the highest 2 bits indicate the UUID variant (10), + (long long)((rnd[2] >> 14u) | ((uint64_t) rnd[3] << 18u)) & 0xFFFFFFFFFFFFull); +} + + +/* only used by parse_line() below. It supports writing in place provided that + * <in> is updated to the next location before calling it. In that case, the + * char at <in> may be overwritten. 
+ */ +#define EMIT_CHAR(x) \ + do { \ + char __c = (char)(x); \ + if ((opts & PARSE_OPT_INPLACE) && out+outpos > in) \ + err |= PARSE_ERR_OVERLAP; \ + if (outpos >= outmax) \ + err |= PARSE_ERR_TOOLARGE; \ + if (!err) \ + out[outpos] = __c; \ + outpos++; \ + } while (0) + +/* Parse <in>, copy it into <out> split into isolated words whose pointers + * are put in <args>. If more than <outlen> bytes have to be emitted, the + * extraneous ones are not emitted but <outlen> is updated so that the caller + * knows how much to realloc. Similarly, <args> are not updated beyond <nbargs> + * but the returned <nbargs> indicates how many were found. All trailing args + * up to <nbargs> point to the trailing zero, and as long as <nbargs> is > 0, + * it is guaranteed that at least one arg will point to the zero. It is safe + * to call it with a NULL <args> if <nbargs> is 0. + * + * <out> may overlap with <in> provided that it never goes further, in which + * case the parser will accept to perform in-place parsing and unquoting/ + * unescaping but only if environment variables do not lead to expansion that + * causes overlapping, otherwise, the input string being destroyed, the error + * will not be recoverable. Note that even during out-of-place parsing, <in> + * will experience temporary in-place modifications for variable resolution and + * must be writable, and will also receive zeroes to delimit words when using + * in-place copy. Parsing options <opts> are taken from PARSE_OPT_*. The return + * value is zero on success, otherwise a bitwise OR of PARSE_ERR_*. Upon error, + * the starting point of the first invalid character sequence or unmatched + * quote/brace is reported in <errptr> if not NULL. When using in-place parsing, + * error reporting might be difficult since zeroes will have been inserted into + * the string. One solution for the caller may consist in replacing all args + * delimiters with spaces in this case. + */ +uint32_t parse_line(char *in, char *out, size_t *outlen, char **args, int *nbargs, uint32_t opts, const char **errptr) +{ + char *quote = NULL; + char *brace = NULL; + char *word_expand = NULL; + unsigned char hex1, hex2; + size_t outmax = *outlen; + int argsmax = *nbargs - 1; + size_t outpos = 0; + int squote = 0; + int dquote = 0; + int arg = 0; + uint32_t err = 0; + + *nbargs = 0; + *outlen = 0; + + /* argsmax may be -1 here, protecting args[] from any write */ + if (arg < argsmax) + args[arg] = out; + + while (1) { + if (*in >= '-' && *in != '\\') { + /* speedup: directly send all regular chars starting + * with '-', '.', '/', alnum etc... + */ + EMIT_CHAR(*in++); + continue; + } + else if (*in == '\0' || *in == '\n' || *in == '\r') { + /* end of line */ + break; + } + else if (*in == '#' && (opts & PARSE_OPT_SHARP) && !squote && !dquote) { + /* comment */ + break; + } + else if (*in == '"' && !squote && (opts & PARSE_OPT_DQUOTE)) { /* double quote outside single quotes */ + if (dquote) { + dquote = 0; + quote = NULL; + } + else { + dquote = 1; + quote = in; + } + in++; + continue; + } + else if (*in == '\'' && !dquote && (opts & PARSE_OPT_SQUOTE)) { /* single quote outside double quotes */ + if (squote) { + squote = 0; + quote = NULL; + } + else { + squote = 1; + quote = in; + } + in++; + continue; + } + else if (*in == '\\' && !squote && (opts & PARSE_OPT_BKSLASH)) { + /* first, we'll replace \\, \<space>, \#, \r, \n, \t, \xXX with their + * C equivalent value but only when they have a special meaning and within + * double quotes for some of them. Other combinations are left unchanged (eg: \1).
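+ * For example the 4-char input sequence \x41 is emitted as the single
+ * byte 'A', and \" loses its backslash only when double quotes are
+ * supported and we are not inside single quotes.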
+ */ + char tosend = *in; + + switch (in[1]) { + case ' ': + case '\\': + tosend = in[1]; + in++; + break; + + case 't': + tosend = '\t'; + in++; + break; + + case 'n': + tosend = '\n'; + in++; + break; + + case 'r': + tosend = '\r'; + in++; + break; + + case '#': + /* escaping of "#" only if comments are supported */ + if (opts & PARSE_OPT_SHARP) + in++; + tosend = *in; + break; + + case '\'': + /* escaping of "'" only outside single quotes and only if single quotes are supported */ + if (opts & PARSE_OPT_SQUOTE && !squote) + in++; + tosend = *in; + break; + + case '"': + /* escaping of '"' only outside single quotes and only if double quotes are supported */ + if (opts & PARSE_OPT_DQUOTE && !squote) + in++; + tosend = *in; + break; + + case '$': + /* escaping of '$' only inside double quotes and only if env supported */ + if (opts & PARSE_OPT_ENV && dquote) + in++; + tosend = *in; + break; + + case 'x': + if (!ishex(in[2]) || !ishex(in[3])) { + /* invalid or incomplete hex sequence */ + err |= PARSE_ERR_HEX; + if (errptr) + *errptr = in; + goto leave; + } + hex1 = toupper((unsigned char)in[2]) - '0'; + hex2 = toupper((unsigned char)in[3]) - '0'; + if (hex1 > 9) hex1 -= 'A' - '9' - 1; + if (hex2 > 9) hex2 -= 'A' - '9' - 1; + tosend = (hex1 << 4) + hex2; + in += 3; + break; + + default: + /* other combinations are not escape sequences */ + break; + } + + in++; + EMIT_CHAR(tosend); + } + else if (isspace((unsigned char)*in) && !squote && !dquote) { + /* a non-escaped space is an argument separator */ + while (isspace((unsigned char)*in)) + in++; + EMIT_CHAR(0); + arg++; + if (arg < argsmax) + args[arg] = out + outpos; + else + err |= PARSE_ERR_TOOMANY; + } + else if (*in == '$' && (opts & PARSE_OPT_ENV) && (dquote || !(opts & PARSE_OPT_DQUOTE))) { + /* environment variables are evaluated anywhere, or only + * inside double quotes if they are supported. 
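+ * For example "$HOME" expands inside a double-quoted string, "${VAR-dflt}" + * falls back to "dflt" when VAR is unset, and "${LIST[*]}" (with + * PARSE_OPT_WORD_EXPAND) additionally splits the value into separate + * arguments; all three forms are handled just below.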
+ */ + char *var_name; + char save_char; + const char *value; + + in++; + + if (*in == '{') + brace = in++; + + if (!isalpha((unsigned char)*in) && *in != '_' && *in != '.') { + /* unacceptable character in variable name */ + err |= PARSE_ERR_VARNAME; + if (errptr) + *errptr = in; + goto leave; + } + + var_name = in; + if (*in == '.') + in++; + while (isalnum((unsigned char)*in) || *in == '_') + in++; + + save_char = *in; + *in = '\0'; + if (unlikely(*var_name == '.')) { + /* internal pseudo-variables */ + if (strcmp(var_name, ".LINE") == 0) + value = ultoa(global.cfg_curr_line); + else if (strcmp(var_name, ".FILE") == 0) + value = global.cfg_curr_file; + else if (strcmp(var_name, ".SECTION") == 0) + value = global.cfg_curr_section; + else { + /* unsupported internal variable name */ + err |= PARSE_ERR_VARNAME; + if (errptr) + *errptr = var_name; + goto leave; + } + } else { + value = getenv(var_name); + } + *in = save_char; + + /* support for '[*]' sequence to force word expansion, + * only available inside braces */ + if (*in == '[' && brace && (opts & PARSE_OPT_WORD_EXPAND)) { + word_expand = in++; + + if (*in++ != '*' || *in++ != ']') { + err |= PARSE_ERR_WRONG_EXPAND; + if (errptr) + *errptr = word_expand; + goto leave; + } + } + + if (brace) { + if (*in == '-') { + /* default value starts just after the '-' */ + if (!value) + value = in + 1; + + while (*in && *in != '}') + in++; + if (!*in) + goto no_brace; + *in = 0; // terminate the default value + } + else if (*in != '}') { + no_brace: + /* unmatched brace */ + err |= PARSE_ERR_BRACE; + if (errptr) + *errptr = brace; + goto leave; + } + + /* brace found, skip it */ + in++; + brace = NULL; + } + + if (value) { + while (*value) { + /* expand as individual parameters on a space character */ + if (word_expand && isspace((unsigned char)*value)) { + EMIT_CHAR(0); + ++arg; + if (arg < argsmax) + args[arg] = out + outpos; + else + err |= PARSE_ERR_TOOMANY; + + /* skip consecutive spaces */ + while (isspace((unsigned char)*++value)) + ; + } else { + EMIT_CHAR(*value++); + } + } + } + else { + /* An unmatched environment variable was parsed. + * Let's skip the trailing double-quote character + * and spaces. + */ + if (likely(*var_name != '.') && *in == '"') { + in++; + while (isspace((unsigned char)*in)) + in++; + if (dquote) { + dquote = 0; + quote = NULL; + } + } + } + word_expand = NULL; + } + else { + /* any other regular char */ + EMIT_CHAR(*in++); + } + } + + /* end of output string */ + EMIT_CHAR(0); + + /* Don't add an empty arg after trailing spaces. Note that args[arg] + * may contain some distances relative to NULL if <out> was NULL, or + * pointers beyond the end of <out> in case <outlen> is too short, thus + * we must not dereference it. + */ + if (arg < argsmax && args[arg] != out + outpos - 1) + arg++; + + if (quote) { + /* unmatched quote */ + err |= PARSE_ERR_QUOTE; + if (errptr) + *errptr = quote; + goto leave; + } + leave: + *nbargs = arg; + *outlen = outpos; + + /* empty all trailing args by making them point to the trailing zero, + * at least the last one in any case. + */ + if (arg > argsmax) + arg = argsmax; + + while (arg >= 0 && arg <= argsmax) + args[arg++] = out + outpos - 1; + + return err; +} +#undef EMIT_CHAR + +/* Use <path_fmt> and following arguments as a printf format to build up the + * name of a file, whose first line will be read into the trash buffer. The + * trailing CR and LF if any are stripped. On success, it sets trash.data to + * the number of resulting bytes in the trash and returns this value. 
Otherwise + * on failure it returns -1 if it could not build the path, -2 on file + * access error (e.g. permissions), or -3 on file read error. The trash is + * always reset before proceeding. Too large lines are truncated to the size + * of the trash. + */ +ssize_t read_line_to_trash(const char *path_fmt, ...) +{ + va_list args; + FILE *file; + ssize_t ret; + + chunk_reset(&trash); + + va_start(args, path_fmt); + ret = vsnprintf(trash.area, trash.size, path_fmt, args); + va_end(args); + + if (ret >= trash.size) + return -1; + + file = fopen(trash.area, "r"); + if (!file) + return -2; + + ret = -3; + chunk_reset(&trash); + if (fgets(trash.area, trash.size, file)) { + trash.data = strlen(trash.area); + while (trash.data && + (trash.area[trash.data - 1] == '\r' || + trash.area[trash.data - 1] == '\n')) + trash.data--; + trash.area[trash.data] = 0; + ret = trash.data; // success + } + + fclose(file); + return ret; +} + +/* This is used to sanitize an input line that's about to be used for error reporting. + * It will adjust <line> to print approximately <width> chars around <pos>, trying to + * preserve the beginning, with leading or trailing "..." when the line is truncated. + * Non-printable characters present in the output are replaced with '?'. It returns + * the new offset <pos> in the modified line. <width> must be at least 6 to support + * two "...", otherwise the result is undefined. The line itself must have at least + * 7 chars allocated for the same reason. + */ +size_t sanitize_for_printing(char *line, size_t pos, size_t width) +{ + size_t shift = 0; + char *out = line; + char *in = line; + char *end = line + width; + + if (pos >= width) { + /* if we have to shift, we'll be out of context, so let's + * try to put <pos> at the center of width. + */ + shift = pos - width / 2; + in += shift + 3; + end = out + width - 3; + out[0] = out[1] = out[2] = '.'; + out += 3; + } + + while (out < end && *in) { + if (isspace((unsigned char)*in)) + *out++ = ' '; + else if (isprint((unsigned char)*in)) + *out++ = *in; + else + *out++ = '?'; + in++; + } + + if (end < line + width) { + out[0] = out[1] = out[2] = '.'; + out += 3; + } + + *out++ = 0; + return pos - shift; +} + +/* Update array <fp> with the fingerprint of word <word> by counting the + * transitions between characters. <fp> is a 1024-entries array indexed as + * 32*from+to. Positions for 'from' and 'to' are: + * 1..26=letter, 27=digit, 28=other/begin/end. + * Row "from=0" is used to mark the character's presence. Others unused. + */ +void update_word_fingerprint(uint8_t *fp, const char *word) +{ + const char *p; + int from, to; + int c; + + from = 28; // begin + for (p = word; *p; p++) { + c = tolower(*p); + switch(c) { + case 'a'...'z': to = c - 'a' + 1; break; + case 'A'...'Z': to = tolower(c) - 'a' + 1; break; + case '0'...'9': to = 27; break; + default: to = 28; break; + } + fp[to] = 1; + fp[32 * from + to]++; + from = to; + } + to = 28; // end + fp[32 * from + to]++; +} + +/* This function hashes a word using <scramble> as the anonymizing key, and + * returns the hashed word when the key is non-zero, else the word itself. + * The result is stored in one of NB_L_HASH_WORD static buffers used in + * rotation, so at most NB_L_HASH_WORD results may be in use at the same + * time; older ones are silently overwritten.
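+ * Illustrative usage (a sketch; the variable names are arbitrary): + * + *   const char *masked = hash_anon(scramble, name, "", ""); + * + * When <scramble> is non-zero this yields a 6-hex-digit digest of <name>, + * framed by the given prefix and suffix (both empty here), otherwise <name> + * itself.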
+ */ +const char *hash_anon(uint32_t scramble, const char *string2hash, const char *prefix, const char *suffix) +{ + index_hash++; + if (index_hash == NB_L_HASH_WORD) + index_hash = 0; + + /* don't hash empty strings */ + if (!string2hash[0] || (string2hash[0] == ' ' && string2hash[1] == 0)) + return string2hash; + + if (scramble != 0) { + snprintf(hash_word[index_hash], sizeof(hash_word[index_hash]), "%s%06x%s", + prefix, HA_ANON(scramble, string2hash, strlen(string2hash)), suffix); + return hash_word[index_hash]; + } + else + return string2hash; +} + +/* This function hashes the IP address <ipstring> when <scramble>, the + * anonymizing key, is non-zero, and returns the hashed address with its port, + * or <ipstring> itself when there is nothing to hash. Set <hasport> to 0 to + * indicate that <ipstring> carries no port, and to any other value otherwise. + * Without a port, a simple hash of <ipstring> is returned. + */ +const char *hash_ipanon(uint32_t scramble, char *ipstring, int hasport) +{ + char *errmsg = NULL; + struct sockaddr_storage *sa; + struct sockaddr_storage ss; + char addr[46]; + int port; + + index_hash++; + if (index_hash == NB_L_HASH_WORD) { + index_hash = 0; + } + + if (scramble == 0) { + return ipstring; + } + if (strcmp(ipstring, "localhost") == 0 || + strcmp(ipstring, "stdout") == 0 || + strcmp(ipstring, "stderr") == 0 || + strncmp(ipstring, "fd@", 3) == 0 || + strncmp(ipstring, "sockpair@", 9) == 0) { + return ipstring; + } + else { + if (hasport == 0) { + memset(&ss, 0, sizeof(ss)); + if (str2ip2(ipstring, &ss, 1) == NULL) { + return HA_ANON_STR(scramble, ipstring); + } + sa = &ss; + } + else { + sa = str2sa_range(ipstring, NULL, NULL, NULL, NULL, NULL, NULL, &errmsg, NULL, NULL, + PA_O_PORT_OK | PA_O_STREAM | PA_O_DGRAM | PA_O_XPRT | PA_O_CONNECT | + PA_O_PORT_RANGE | PA_O_PORT_OFS | PA_O_RESOLVE); + if (sa == NULL) { + return HA_ANON_STR(scramble, ipstring); + } + } + addr_to_str(sa, addr, sizeof(addr)); + port = get_host_port(sa); + + switch(sa->ss_family) { + case AF_INET: + if (strncmp(addr, "127", 3) == 0 || strncmp(addr, "255", 3) == 0 || strncmp(addr, "0", 1) == 0) { + return ipstring; + } + else { + if (port != 0) { + snprintf(hash_word[index_hash], sizeof(hash_word[index_hash]), "IPV4(%06x):%d", HA_ANON(scramble, addr, strlen(addr)), port); + return hash_word[index_hash]; + } + else { + snprintf(hash_word[index_hash], sizeof(hash_word[index_hash]), "IPV4(%06x)", HA_ANON(scramble, addr, strlen(addr))); + return hash_word[index_hash]; + } + } + break; + + case AF_INET6: + if (strcmp(addr, "::1") == 0) { + return ipstring; + } + else { + if (port != 0) { + snprintf(hash_word[index_hash], sizeof(hash_word[index_hash]), "IPV6(%06x):%d", HA_ANON(scramble, addr, strlen(addr)), port); + return hash_word[index_hash]; + } + else { + snprintf(hash_word[index_hash], sizeof(hash_word[index_hash]), "IPV6(%06x)", HA_ANON(scramble, addr, strlen(addr))); + return hash_word[index_hash]; + } + } + break; + + case AF_UNIX: + return HA_ANON_STR(scramble, ipstring); + break; + + default: + return ipstring; + break; + }; + } + return ipstring; +} + +/* Initialize array <fp> with the fingerprint of word <word> by counting the + * transitions between characters. <fp> is a 1024-entries array indexed as + * 32*from+to, as described above for update_word_fingerprint(). Positions + * for 'from' and 'to' are: 1..26=letter, 27=digit, 28=other/begin/end; + * others unused.
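+ * Illustrative usage (a sketch comparing two config keywords): + * + *   uint8_t fp1[1024], fp2[1024]; + *   make_word_fingerprint(fp1, "maxconn"); + *   make_word_fingerprint(fp2, "maxcon"); + *   dist = word_fingerprint_distance(fp1, fp2); // small value: likely typo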
+ */ +void make_word_fingerprint(uint8_t *fp, const char *word) +{ + memset(fp, 0, 1024); + update_word_fingerprint(fp, word); +} + +/* Return the distance between two word fingerprints created by function + * make_word_fingerprint(). It's a positive integer calculated as the sum of + * the absolute differences at each location. + */ +int word_fingerprint_distance(const uint8_t *fp1, const uint8_t *fp2) +{ + int i, k, dist = 0; + + for (i = 0; i < 1024; i++) { + k = (int)fp1[i] - (int)fp2[i]; + dist += abs(k); + } + return dist; +} + +/* + * This function compares the loaded OpenSSL version with the string <version>. + * It uses the same return codes as compare_current_version: + * + * -1 : the version in argument is older than the current openssl version + * 0 : the version in argument is the same as the current openssl version + * 1 : the version in argument is newer than the current openssl version + * + * Or some errors: + * -2 : openssl is not available on this process + * -3 : the version in argument is not parsable + */ +int openssl_compare_current_version(const char *version) +{ +#ifdef USE_OPENSSL + int numversion; + + numversion = openssl_version_parser(version); + if (numversion == 0) + return -3; + + if (numversion < OPENSSL_VERSION_NUMBER) + return -1; + else if (numversion > OPENSSL_VERSION_NUMBER) + return 1; + else + return 0; +#else + return -2; +#endif +} + +/* + * This function compares the loaded OpenSSL name with the string <name>. + * It returns 0 if the OpenSSL name starts with <name>, non-zero otherwise. + */ +int openssl_compare_current_name(const char *name) +{ +#ifdef USE_OPENSSL + int name_len = 0; + const char *openssl_version = OpenSSL_version(OPENSSL_VERSION); + + if (name) { + name_len = strlen(name); + if (strlen(name) <= strlen(openssl_version)) + return strncmp(openssl_version, name, name_len); + } +#endif + return 1; +} + +#if defined(RTLD_DEFAULT) || defined(RTLD_NEXT) +/* redefine dlopen() so that we can detect unexpected replacement of some + * critical symbols, typically init/alloc/free functions coming from alternate + * libraries. When called, a tainted flag is set (TAINTED_SHARED_LIBS). + * It's important to understand that the dynamic linker will present the + * first loaded instance of each symbol to all libs, so that if haproxy is + * linked with a new lib that uses a static inline or a #define to replace + * an old function, and a dependency was linked against an older version of + * that lib that had a function there, that lib would use all of the newer + * versions of the functions that are already loaded in haproxy, except + * for that unique function which would continue to be the old one. This + * creates all sorts of problems when init code allocates smaller structs + * than required for example but uses new functions on them, etc. Thus what + * we do here is to try to detect API consistency: we take a fingerprint of + * a number of known functions, and verify that if they change in a loaded + * library, either they all appeared or they all disappeared, but not + * partially. We can check up to 64 symbols that belong to individual groups + * that are checked together.
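+ * + * For example, group 0x1f below covers libcrypto's init symbols: if the + * process resolved OPENSSL_init_crypto but a newly loaded library only + * brings the legacy EVP_CIPHER_CTX_init, some bits of that group differ + * between the two fingerprints and the redefinition is reported.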
+ */ +void *dlopen(const char *filename, int flags) +{ + static void *(*_dlopen)(const char *filename, int flags); + struct { + const char *name; + uint64_t bit, grp; + void *curr, *next; + } check_syms[] = { + /* openssl's libcrypto checks: group bits 0x1f */ + { .name="OPENSSL_init", .bit = 0x0000000000000001, .grp = 0x000000000000001f, }, // openssl 1.0 / 1.1 / 3.0 + { .name="OPENSSL_init_crypto", .bit = 0x0000000000000002, .grp = 0x000000000000001f, }, // openssl 1.1 / 3.0 + { .name="ENGINE_init", .bit = 0x0000000000000004, .grp = 0x000000000000001f, }, // openssl 1.x / 3.x with engine + { .name="EVP_CIPHER_CTX_init", .bit = 0x0000000000000008, .grp = 0x000000000000001f, }, // openssl 1.0 + { .name="HMAC_Init", .bit = 0x0000000000000010, .grp = 0x000000000000001f, }, // openssl 1.x + + /* openssl's libssl checks: group bits 0x3e0 */ + { .name="OPENSSL_init_ssl", .bit = 0x0000000000000020, .grp = 0x00000000000003e0, }, // openssl 1.1 / 3.0 + { .name="SSL_library_init", .bit = 0x0000000000000040, .grp = 0x00000000000003e0, }, // openssl 1.x + { .name="SSL_is_quic", .bit = 0x0000000000000080, .grp = 0x00000000000003e0, }, // quictls + { .name="SSL_CTX_new_ex", .bit = 0x0000000000000100, .grp = 0x00000000000003e0, }, // openssl 3.x + { .name="SSL_CTX_get0_security_ex_data", .bit = 0x0000000000000200, .grp = 0x00000000000003e0, }, // openssl 1.x / 3.x + + /* insert only above, 0 must be the last one */ + { 0 }, + }; + const char *trace; + uint64_t own_fp, lib_fp; // symbols fingerprints + void *addr; + void *ret; + int sym = 0; + + if (!_dlopen) { + _dlopen = get_sym_next_addr("dlopen"); + if (!_dlopen || _dlopen == dlopen) { + _dlopen = NULL; + return NULL; + } + } + + /* save a few pointers to critical symbols. We keep a copy of both the + * current and the next value, because we might already have replaced + * some of them in an inconsistent way (i.e. not all), and we're only + * interested in verifying that a loaded library doesn't come with a + * completely different definition that would be incompatible. We'll + * keep a fingerprint of our own symbols. + */ + own_fp = 0; + for (sym = 0; check_syms[sym].name; sym++) { + check_syms[sym].curr = get_sym_curr_addr(check_syms[sym].name); + check_syms[sym].next = get_sym_next_addr(check_syms[sym].name); + if (check_syms[sym].curr || check_syms[sym].next) + own_fp |= check_syms[sym].bit; + } + + /* now open the requested lib */ + ret = _dlopen(filename, flags); + if (!ret) + return ret; + + mark_tainted(TAINTED_SHARED_LIBS); + + /* and check that critical symbols didn't change */ + lib_fp = 0; + for (sym = 0; check_syms[sym].name; sym++) { + addr = dlsym(ret, check_syms[sym].name); + if (addr) + lib_fp |= check_syms[sym].bit; + } + + if (lib_fp != own_fp) { + /* let's check what changed: */ + uint64_t mask = 0; + + for (sym = 0; check_syms[sym].name; sym++) { + mask = check_syms[sym].grp; + + /* new group of symbols. If they all appeared together + * their use will be consistent. If none appears, it's + * just that the lib doesn't use them. If some appear + * or disappear, it means the lib relies on a different + * dependency and will end up with a mix. 
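+ * In short, (own_fp & grp) and (lib_fp & grp) must either be equal or one + * of them must be empty; any other combination reveals a partial + * redefinition.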
+ */ + if (!(own_fp & mask) || !(lib_fp & mask) || + (own_fp & mask) == (lib_fp & mask)) + continue; + + /* let's report a symbol that really changes */ + if (!((own_fp ^ lib_fp) & check_syms[sym].bit)) + continue; + + /* OK it's clear that this symbol was redefined */ + mark_tainted(TAINTED_REDEFINITION); + + trace = hlua_show_current_location("\n "); + ha_warning("dlopen(): shared library '%s' brings a different and inconsistent definition of symbol '%s'. The process cannot be trusted anymore!%s%s\n", + filename, check_syms[sym].name, + trace ? " Suspected call location: \n " : "", + trace ? trace : ""); + } + } + + return ret; +} +#endif + +static int init_tools_per_thread() +{ + /* Let's make each thread start from a different position */ + statistical_prng_state += tid * MAX_THREADS; + if (!statistical_prng_state) + statistical_prng_state++; + return 1; +} +REGISTER_PER_THREAD_INIT(init_tools_per_thread); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/trace.c b/src/trace.c new file mode 100644 index 0000000..a233c0d --- /dev/null +++ b/src/trace.c @@ -0,0 +1,997 @@ +/* + * Runtime tracing API + * + * Copyright (C) 2000-2019 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <import/ist.h> +#include <haproxy/api.h> +#include <haproxy/buf.h> +#include <haproxy/cfgparse.h> +#include <haproxy/cli.h> +#include <haproxy/errors.h> +#include <haproxy/istbuf.h> +#include <haproxy/list.h> +#include <haproxy/log.h> +#include <haproxy/sink.h> +#include <haproxy/trace.h> + +struct list trace_sources = LIST_HEAD_INIT(trace_sources); +THREAD_LOCAL struct buffer trace_buf = { }; + +/* allocates the trace buffers. Returns 0 in case of failure. It is safe to + * call this function multiple times if the size changes. + */ +static int alloc_trace_buffers_per_thread() +{ + chunk_init(&trace_buf, my_realloc2(trace_buf.area, global.tune.bufsize), global.tune.bufsize); + return !!trace_buf.area; +} + +static void free_trace_buffers_per_thread() +{ + chunk_destroy(&trace_buf); +} + +REGISTER_PER_THREAD_ALLOC(alloc_trace_buffers_per_thread); +REGISTER_PER_THREAD_FREE(free_trace_buffers_per_thread); + +/* pick the lowest non-null argument with a non-null arg_def mask */ +static inline const void *trace_pick_arg(uint32_t arg_def, const void *a1, const void *a2, const void *a3, const void *a4) +{ + if (arg_def & 0x0000FFFF) { + if ((arg_def & 0x000000FF) && a1) + return a1; + if ((arg_def & 0x0000FF00) && a2) + return a2; + } + + if (arg_def & 0xFFFF0000) { + if ((arg_def & 0x00FF0000) && a3) + return a3; + if ((arg_def & 0xFF000000) && a4) + return a4; + } + + return NULL; +} + +/* Reports whether the trace is enabled for the specified arguments, and whether + * tracking needs to be enabled or disabled.
It gets the same API as __trace() except for <cb> and <msg> + * which are not used and were dropped, and plockptr which is an optional pointer to + * the lockptr to be updated (or NULL) for tracking. The function returns: + * 0 if the trace is not enabled for the module or these values + * <0 if the trace matches some locking criteria but doesn't have the proper level. + * In this case the interested caller might have to consider disabling tracking. + * >0 if the trace is enabled for the given criteria. + * In all cases, <plockptr> will only be set if non-null and if a locking criterion + * matched. It will be up to the caller to enable tracking if desired. A casual + * tester not interested in adjusting tracking (i.e. calling the function before + * deciding to prepare a buffer to be dumped) will only need to pass 0 for plockptr + * and check if the result is >0. + */ +int __trace_enabled(enum trace_level level, uint64_t mask, struct trace_source *src, + const struct ist where, const char *func, + const void *a1, const void *a2, const void *a3, const void *a4, + const void **plockptr) +{ + const struct listener *li = NULL; + const struct proxy *fe = NULL; + const struct proxy *be = NULL; + const struct server *srv = NULL; + const struct session *sess = NULL; + const struct stream *strm = NULL; + const struct connection *conn = NULL; + const struct check *check = NULL; + const struct quic_conn *qc = NULL; + const struct appctx *appctx = NULL; + const void *lockon_ptr = NULL; + + if (likely(src->state == TRACE_STATE_STOPPED)) + return 0; + + /* check that at least one action is interested by this event */ + if (((src->report_events | src->start_events | src->pause_events | src->stop_events) & mask) == 0) + return 0; + + /* retrieve available information from the caller's arguments */ + if (src->arg_def & TRC_ARGS_CONN) + conn = trace_pick_arg(src->arg_def & TRC_ARGS_CONN, a1, a2, a3, a4); + + if (src->arg_def & TRC_ARGS_SESS) + sess = trace_pick_arg(src->arg_def & TRC_ARGS_SESS, a1, a2, a3, a4); + + if (src->arg_def & TRC_ARGS_STRM) + strm = trace_pick_arg(src->arg_def & TRC_ARGS_STRM, a1, a2, a3, a4); + + if (src->arg_def & TRC_ARGS_CHK) + check = trace_pick_arg(src->arg_def & TRC_ARGS_CHK, a1, a2, a3, a4); + + if (src->arg_def & TRC_ARGS_QCON) + qc = trace_pick_arg(src->arg_def & TRC_ARGS_QCON, a1, a2, a3, a4); + + if (src->arg_def & TRC_ARGS_APPCTX) + appctx = trace_pick_arg(src->arg_def & TRC_ARGS_APPCTX, a1, a2, a3, a4); + + if (!sess && strm) + sess = strm->sess; + else if (!sess && conn && LIST_INLIST(&conn->session_list)) + sess = conn->owner; + else if (!sess && check) + sess = check->sess; + else if (!sess && appctx) + sess = appctx->sess; + + if (sess) { + fe = sess->fe; + li = sess->listener; + } + + if (!li && conn) + li = objt_listener(conn->target); + + if (li && !fe) + fe = li->bind_conf->frontend; + + if (strm) { + be = strm->be; + srv = strm->srv_conn; + } + if (check) { + srv = check->server; + be = (srv ? 
srv->proxy : NULL); + } + + if (!srv && conn) + srv = objt_server(conn->target); + + if (srv && !be) + be = srv->proxy; + + if (!be && conn) + be = objt_proxy(conn->target); + + /* TODO: add handling of filters here, return if no match (not even update states) */ + + /* check if we need to start the trace now */ + if (src->state == TRACE_STATE_WAITING) { + if ((src->start_events & mask) == 0) + return 0; + + /* TODO: add update of lockon+lockon_ptr here */ + HA_ATOMIC_STORE(&src->state, TRACE_STATE_RUNNING); + } + + /* we may want to lock on a particular object */ + if (src->lockon != TRACE_LOCKON_NOTHING) { + switch (src->lockon) { + case TRACE_LOCKON_BACKEND: lockon_ptr = be; break; + case TRACE_LOCKON_CONNECTION: lockon_ptr = conn; break; + case TRACE_LOCKON_FRONTEND: lockon_ptr = fe; break; + case TRACE_LOCKON_LISTENER: lockon_ptr = li; break; + case TRACE_LOCKON_SERVER: lockon_ptr = srv; break; + case TRACE_LOCKON_SESSION: lockon_ptr = sess; break; + case TRACE_LOCKON_STREAM: lockon_ptr = strm; break; + case TRACE_LOCKON_CHECK: lockon_ptr = check; break; + case TRACE_LOCKON_THREAD: lockon_ptr = ti; break; + case TRACE_LOCKON_QCON: lockon_ptr = qc; break; + case TRACE_LOCKON_APPCTX: lockon_ptr = appctx; break; + case TRACE_LOCKON_ARG1: lockon_ptr = a1; break; + case TRACE_LOCKON_ARG2: lockon_ptr = a2; break; + case TRACE_LOCKON_ARG3: lockon_ptr = a3; break; + case TRACE_LOCKON_ARG4: lockon_ptr = a4; break; + default: break; // silence stupid gcc -Wswitch + } + + if (src->lockon_ptr && src->lockon_ptr != lockon_ptr) + return 0; + + if (plockptr && !src->lockon_ptr && lockon_ptr && src->state == TRACE_STATE_RUNNING) + *plockptr = lockon_ptr; + } + + /* here the trace is running and is tracking a desired item */ + if ((src->report_events & mask) == 0 || level > src->level) { + /* tracking did match, and might have to be disabled */ + return -1; + } + + /* OK trace still enabled */ + return 1; +} + +/* write a message for the given trace source */ +void __trace(enum trace_level level, uint64_t mask, struct trace_source *src, + const struct ist where, const char *func, + const void *a1, const void *a2, const void *a3, const void *a4, + void (*cb)(enum trace_level level, uint64_t mask, const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4), + const struct ist msg) +{ + const void *lockon_ptr; + struct ist ist_func = ist(func); + char tnum[4]; + struct ist line[12]; + int words = 0; + int ret; + + lockon_ptr = NULL; + ret = __trace_enabled(level, mask, src, where, func, a1, a2, a3, a4, &lockon_ptr); + if (lockon_ptr) + HA_ATOMIC_STORE(&src->lockon_ptr, lockon_ptr); + + if (ret <= 0) { + if (ret < 0) // may have to disable tracking + goto end; + return; + } + + /* log the logging location truncated to 13 chars from the right so that + * the line number and the end of the file name are there.
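+ * The assembled prefix then looks like, for instance, + * "[00|h1|5|mux_h1.c:2214] h1_io_cb(): " (thread id, source name, level + * digit, truncated location, optional function name).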
+ */ + line[words++] = ist("["); + tnum[0] = '0' + tid / 10; + tnum[1] = '0' + tid % 10; + tnum[2] = '|'; + tnum[3] = 0; + line[words++] = ist(tnum); + line[words++] = src->name; + line[words++] = ist("|"); + line[words++] = ist2("012345" + level, 1); // "0" to "5" + line[words++] = ist("|"); + line[words] = where; + if (line[words].len > 13) { + line[words].ptr += (line[words].len - 13); + line[words].len = 13; + } + words++; + line[words++] = ist("] "); + + if (isttest(ist_func)) { + line[words++] = ist_func; + line[words++] = ist("(): "); + } + + if (!cb) + cb = src->default_cb; + + if (cb && src->verbosity) { + /* decode function passed, we want to pre-fill the + * buffer with the message and let the decode function + * do its job, possibly even overwriting it. + */ + b_reset(&trace_buf); + b_istput(&trace_buf, msg); + cb(level, mask, src, where, ist_func, a1, a2, a3, a4); + line[words] = ist2(trace_buf.area, trace_buf.data); + words++; + } + else { + /* Note that here we could decide to print some args whose type + * is known, when verbosity is above the quiet level, and even + * to print the name and values of those which are declared for + * lock-on. + */ + line[words++] = msg; + } + + if (src->sink) + sink_write(src->sink, LOG_HEADER_NONE, 0, line, words); + + end: + /* check if we need to stop the trace now */ + if ((src->stop_events & mask) != 0) { + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + HA_ATOMIC_STORE(&src->state, TRACE_STATE_STOPPED); + } + else if ((src->pause_events & mask) != 0) { + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + HA_ATOMIC_STORE(&src->state, TRACE_STATE_WAITING); + } +} + +/* this callback may be used when no output modification is desired */ +void trace_no_cb(enum trace_level level, uint64_t mask, const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4) +{ + /* do nothing */ +} + +/* registers trace source <source>. Modifies the list element! + * The {start,pause,stop,report} events are not changed so the source may + * preset them. + */ +void trace_register_source(struct trace_source *source) +{ + source->lockon = TRACE_LOCKON_NOTHING; + source->level = TRACE_LEVEL_USER; + source->verbosity = 1; + source->sink = NULL; + source->state = TRACE_STATE_STOPPED; + source->lockon_ptr = NULL; + LIST_APPEND(&trace_sources, &source->source_link); +} + +struct trace_source *trace_find_source(const char *name) +{ + struct trace_source *src; + const struct ist iname = ist(name); + + list_for_each_entry(src, &trace_sources, source_link) + if (isteq(src->name, iname)) + return src; + return NULL; +} + +const struct trace_event *trace_find_event(const struct trace_event *ev, const char *name) +{ + for (; ev && ev->mask; ev++) + if (strcmp(ev->name, name) == 0) + return ev; + return NULL; +} + +/* Returns the level value or a negative error code. */ +static int trace_parse_level(const char *level) +{ + if (!level) + return -1; + + if (strcmp(level, "error") == 0) + return TRACE_LEVEL_ERROR; + else if (strcmp(level, "user") == 0) + return TRACE_LEVEL_USER; + else if (strcmp(level, "proto") == 0) + return TRACE_LEVEL_PROTO; + else if (strcmp(level, "state") == 0) + return TRACE_LEVEL_STATE; + else if (strcmp(level, "data") == 0) + return TRACE_LEVEL_DATA; + else if (strcmp(level, "developer") == 0) + return TRACE_LEVEL_DEVELOPER; + else + return -1; +} + +/* Returns the verbosity value or a negative error code. 
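+ * ("quiet" always maps to 0; "default" maps to 1 for sources without a + * decoding table, otherwise the 1-based index into <decoding> is returned.)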
*/ +static int trace_source_parse_verbosity(struct trace_source *src, + const char *verbosity) +{ + const struct name_desc *nd; + int ret; + + if (strcmp(verbosity, "quiet") == 0) { + ret = 0; + goto end; + } + + /* Only "quiet" is defined for all sources. Other identifiers are + * specific to trace source. + */ + BUG_ON(!src); + + if (!src->decoding || !src->decoding[0].name) { + if (strcmp(verbosity, "default") != 0) + return -1; + + ret = 1; + } + else { + for (nd = src->decoding; nd->name && nd->desc; nd++) + if (strcmp(verbosity, nd->name) == 0) + break; + + if (!nd->name || !nd->desc) + return -1; + + ret = nd - src->decoding + 1; + } + + end: + return ret; +} + +/* Parse a "trace" statement. Returns a severity as a LOG_* level and a status + * message that may be delivered to the user, in <msg>. The message will be + * nulled first and msg must be an allocated pointer. A null status message output + * indicates no error. Be careful not to use the return value as a boolean, as + * LOG_* values are not ordered as one could imagine (LOG_EMERG is zero). The + * function may/will use the trash buffer as the storage for the response + * message so that the caller never needs to release anything. + */ +static int trace_parse_statement(char **args, char **msg) +{ + struct trace_source *src; + uint64_t *ev_ptr = NULL; + + /* no error by default */ + *msg = NULL; + + if (!*args[1]) { + /* no arg => report the list of supported sources as a warning */ + chunk_printf(&trash, + "Supported trace sources and states (.=stopped, w=waiting, R=running) :\n" + " [.] 0 : not a source, will immediately stop all traces\n" + ); + + list_for_each_entry(src, &trace_sources, source_link) + chunk_appendf(&trash, " [%c] %-10s : %s\n", trace_state_char(src->state), src->name.ptr, src->desc); + + trash.area[trash.data] = 0; + *msg = strdup(trash.area); + return LOG_WARNING; + } + + if (strcmp(args[1], "0") == 0) { + /* emergency stop of all traces */ + list_for_each_entry(src, &trace_sources, source_link) + HA_ATOMIC_STORE(&src->state, TRACE_STATE_STOPPED); + *msg = strdup("All traces now stopped"); + return LOG_NOTICE; + } + + src = trace_find_source(args[1]); + if (!src) { + memprintf(msg, "No such trace source '%s'", args[1]); + return LOG_ERR; + } + + if (!*args[2]) { + *msg = "Supported commands:\n" + " event : list/enable/disable source-specific event reporting\n" + //" filter : list/enable/disable generic filters\n" + " level : list/set trace reporting level\n" + " lock : automatic lock on thread/connection/stream/...\n" + " pause : pause and automatically restart after a specific event\n" + " sink : list/set event sinks\n" + " start : start immediately or after a specific event\n" + " stop : stop immediately or after a specific event\n" + " verbosity : list/set trace output verbosity\n"; + *msg = strdup(*msg); + return LOG_WARNING; + } + else if ((strcmp(args[2], "event") == 0 && (ev_ptr = &src->report_events)) || + (strcmp(args[2], "pause") == 0 && (ev_ptr = &src->pause_events)) || + (strcmp(args[2], "start") == 0 && (ev_ptr = &src->start_events)) || + (strcmp(args[2], "stop") == 0 && (ev_ptr = &src->stop_events))) { + const struct trace_event *ev; + const char *name = args[3]; + int neg = 0; + int i; + + /* skip prefix '!', '-', '+' and remind negation */ + while (*name) { + if (*name == '!' 
|| *name == '-') + neg = 1; + else if (*name == '+') + neg = 0; + else + break; + name++; + } + + if (!*name) { + chunk_printf(&trash, "Supported events for source %s (+=enabled, -=disabled):\n", src->name.ptr); + if (ev_ptr != &src->report_events) + chunk_appendf(&trash, " - now : don't wait for events, immediately change the state\n"); + chunk_appendf(&trash, " - none : disable all event types\n"); + chunk_appendf(&trash, " - any : enable all event types\n"); + for (i = 0; src->known_events && src->known_events[i].mask; i++) { + chunk_appendf(&trash, " %c %-12s : %s\n", + trace_event_char(*ev_ptr, src->known_events[i].mask), + src->known_events[i].name, src->known_events[i].desc); + } + trash.area[trash.data] = 0; + *msg = strdup(trash.area); + return LOG_WARNING; + } + + if (strcmp(name, "now") == 0 && ev_ptr != &src->report_events) { + HA_ATOMIC_STORE(ev_ptr, 0); + if (ev_ptr == &src->pause_events) { + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + HA_ATOMIC_STORE(&src->state, TRACE_STATE_WAITING); + } + else if (ev_ptr == &src->start_events) { + HA_ATOMIC_STORE(&src->state, TRACE_STATE_RUNNING); + } + else if (ev_ptr == &src->stop_events) { + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + HA_ATOMIC_STORE(&src->state, TRACE_STATE_STOPPED); + } + return 0; + } + + if (strcmp(name, "none") == 0) + HA_ATOMIC_STORE(ev_ptr, 0); + else if (strcmp(name, "any") == 0) + HA_ATOMIC_STORE(ev_ptr, ~0); + else { + ev = trace_find_event(src->known_events, name); + if (!ev) { + memprintf(msg, "No such trace event '%s'", name); + return LOG_ERR; + } + + if (!neg) + HA_ATOMIC_OR(ev_ptr, ev->mask); + else + HA_ATOMIC_AND(ev_ptr, ~ev->mask); + } + } + else if (strcmp(args[2], "sink") == 0) { + const char *name = args[3]; + struct sink *sink; + + if (!*name) { + chunk_printf(&trash, "Supported sinks for source %s (*=current):\n", src->name.ptr); + chunk_appendf(&trash, " %c none : no sink\n", src->sink ? ' ' : '*'); + list_for_each_entry(sink, &sink_list, sink_list) { + chunk_appendf(&trash, " %c %-10s : %s\n", + src->sink == sink ? '*' : ' ', + sink->name, sink->desc); + } + trash.area[trash.data] = 0; + *msg = strdup(trash.area); + return LOG_WARNING; + } + + if (strcmp(name, "none") == 0) + sink = NULL; + else { + sink = sink_find(name); + if (!sink) { + memprintf(msg, "No such trace sink '%s'", name); + return LOG_ERR; + } + } + + HA_ATOMIC_STORE(&src->sink, sink); + } + else if (strcmp(args[2], "level") == 0) { + const char *name = args[3]; + int level; + + if (!*name) { + chunk_printf(&trash, "Supported trace levels for source %s:\n", src->name.ptr); + chunk_appendf(&trash, " %c error : report errors\n", + src->level == TRACE_LEVEL_ERROR ? '*' : ' '); + chunk_appendf(&trash, " %c user : also information useful to the end user\n", + src->level == TRACE_LEVEL_USER ? '*' : ' '); + chunk_appendf(&trash, " %c proto : also protocol-level updates\n", + src->level == TRACE_LEVEL_PROTO ? '*' : ' '); + chunk_appendf(&trash, " %c state : also report internal state changes\n", + src->level == TRACE_LEVEL_STATE ? '*' : ' '); + chunk_appendf(&trash, " %c data : also report data transfers\n", + src->level == TRACE_LEVEL_DATA ? '*' : ' '); + chunk_appendf(&trash, " %c developer : also report information useful only to the developer\n", + src->level == TRACE_LEVEL_DEVELOPER ? 
'*' : ' '); + trash.area[trash.data] = 0; + *msg = strdup(trash.area); + return LOG_WARNING; + } + + level = trace_parse_level(name); + if (level < 0) { + memprintf(msg, "No such trace level '%s'", name); + return LOG_ERR; + } + + HA_ATOMIC_STORE(&src->level, level); + } + else if (strcmp(args[2], "lock") == 0) { + const char *name = args[3]; + + if (!*name) { + chunk_printf(&trash, "Supported lock-on criteria for source %s:\n", src->name.ptr); + if (src->arg_def & (TRC_ARGS_CONN|TRC_ARGS_STRM)) + chunk_appendf(&trash, " %c backend : lock on the backend that started the trace\n", + src->lockon == TRACE_LOCKON_BACKEND ? '*' : ' '); + + if (src->arg_def & TRC_ARGS_CHK) + chunk_appendf(&trash, " %c check : lock on the check that started the trace\n", + src->lockon == TRACE_LOCKON_CHECK ? '*' : ' '); + + if (src->arg_def & TRC_ARGS_CONN) + chunk_appendf(&trash, " %c connection : lock on the connection that started the trace\n", + src->lockon == TRACE_LOCKON_CONNECTION ? '*' : ' '); + + if (src->arg_def & (TRC_ARGS_CONN|TRC_ARGS_SESS|TRC_ARGS_STRM)) + chunk_appendf(&trash, " %c frontend : lock on the frontend that started the trace\n", + src->lockon == TRACE_LOCKON_FRONTEND ? '*' : ' '); + + if (src->arg_def & (TRC_ARGS_CONN|TRC_ARGS_SESS|TRC_ARGS_STRM)) + chunk_appendf(&trash, " %c listener : lock on the listener that started the trace\n", + src->lockon == TRACE_LOCKON_LISTENER ? '*' : ' '); + + chunk_appendf(&trash, " %c nothing : do not lock on anything\n", + src->lockon == TRACE_LOCKON_NOTHING ? '*' : ' '); + + if (src->arg_def & (TRC_ARGS_CONN|TRC_ARGS_STRM)) + chunk_appendf(&trash, " %c server : lock on the server that started the trace\n", + src->lockon == TRACE_LOCKON_SERVER ? '*' : ' '); + + if (src->arg_def & (TRC_ARGS_CONN|TRC_ARGS_SESS|TRC_ARGS_STRM)) + chunk_appendf(&trash, " %c session : lock on the session that started the trace\n", + src->lockon == TRACE_LOCKON_SESSION ? '*' : ' '); + + if (src->arg_def & TRC_ARGS_STRM) + chunk_appendf(&trash, " %c stream : lock on the stream that started the trace\n", + src->lockon == TRACE_LOCKON_STREAM ? '*' : ' '); + + if (src->arg_def & TRC_ARGS_APPCTX) + chunk_appendf(&trash, " %c applet : lock on the applet that started the trace\n", + src->lockon == TRACE_LOCKON_APPCTX ? '*' : ' '); + + chunk_appendf(&trash, " %c thread : lock on the thread that started the trace\n", + src->lockon == TRACE_LOCKON_THREAD ? '*' : ' '); + + if (src->lockon_args && src->lockon_args[0].name) + chunk_appendf(&trash, " %c %-10s : %s\n", + src->lockon == TRACE_LOCKON_ARG1 ? '*' : ' ', + src->lockon_args[0].name, src->lockon_args[0].desc); + + if (src->lockon_args && src->lockon_args[1].name) + chunk_appendf(&trash, " %c %-10s : %s\n", + src->lockon == TRACE_LOCKON_ARG2 ? '*' : ' ', + src->lockon_args[1].name, src->lockon_args[1].desc); + + if (src->lockon_args && src->lockon_args[2].name) + chunk_appendf(&trash, " %c %-10s : %s\n", + src->lockon == TRACE_LOCKON_ARG3 ? '*' : ' ', + src->lockon_args[2].name, src->lockon_args[2].desc); + + if (src->lockon_args && src->lockon_args[3].name) + chunk_appendf(&trash, " %c %-10s : %s\n", + src->lockon == TRACE_LOCKON_ARG4 ? 
'*' : ' ', + src->lockon_args[3].name, src->lockon_args[3].desc); + + trash.area[trash.data] = 0; + *msg = strdup(trash.area); + return LOG_WARNING; + } + else if ((src->arg_def & (TRC_ARGS_CONN|TRC_ARGS_STRM)) && strcmp(name, "backend") == 0) { + HA_ATOMIC_STORE(&src->lockon, TRACE_LOCKON_BACKEND); + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + } + else if ((src->arg_def & TRC_ARGS_CHK) && strcmp(name, "check") == 0) { + HA_ATOMIC_STORE(&src->lockon, TRACE_LOCKON_CHECK); + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + } + else if ((src->arg_def & TRC_ARGS_CONN) && strcmp(name, "connection") == 0) { + HA_ATOMIC_STORE(&src->lockon, TRACE_LOCKON_CONNECTION); + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + } + else if ((src->arg_def & (TRC_ARGS_CONN|TRC_ARGS_SESS|TRC_ARGS_STRM)) && strcmp(name, "frontend") == 0) { + HA_ATOMIC_STORE(&src->lockon, TRACE_LOCKON_FRONTEND); + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + } + else if ((src->arg_def & (TRC_ARGS_CONN|TRC_ARGS_SESS|TRC_ARGS_STRM)) && strcmp(name, "listener") == 0) { + HA_ATOMIC_STORE(&src->lockon, TRACE_LOCKON_LISTENER); + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + } + else if (strcmp(name, "nothing") == 0) { + HA_ATOMIC_STORE(&src->lockon, TRACE_LOCKON_NOTHING); + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + } + else if ((src->arg_def & (TRC_ARGS_CONN|TRC_ARGS_STRM)) && strcmp(name, "server") == 0) { + HA_ATOMIC_STORE(&src->lockon, TRACE_LOCKON_SERVER); + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + } + else if ((src->arg_def & (TRC_ARGS_CONN|TRC_ARGS_SESS|TRC_ARGS_STRM)) && strcmp(name, "session") == 0) { + HA_ATOMIC_STORE(&src->lockon, TRACE_LOCKON_SESSION); + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + } + else if ((src->arg_def & TRC_ARGS_STRM) && strcmp(name, "stream") == 0) { + HA_ATOMIC_STORE(&src->lockon, TRACE_LOCKON_STREAM); + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + } + else if ((src->arg_def & TRC_ARGS_APPCTX) && strcmp(name, "appctx") == 0) { + HA_ATOMIC_STORE(&src->lockon, TRACE_LOCKON_APPCTX); + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + } + else if (strcmp(name, "thread") == 0) { + HA_ATOMIC_STORE(&src->lockon, TRACE_LOCKON_THREAD); + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + } + else if (src->lockon_args && src->lockon_args[0].name && strcmp(name, src->lockon_args[0].name) == 0) { + HA_ATOMIC_STORE(&src->lockon, TRACE_LOCKON_ARG1); + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + } + else if (src->lockon_args && src->lockon_args[1].name && strcmp(name, src->lockon_args[1].name) == 0) { + HA_ATOMIC_STORE(&src->lockon, TRACE_LOCKON_ARG2); + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + } + else if (src->lockon_args && src->lockon_args[2].name && strcmp(name, src->lockon_args[2].name) == 0) { + HA_ATOMIC_STORE(&src->lockon, TRACE_LOCKON_ARG3); + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + } + else if (src->lockon_args && src->lockon_args[3].name && strcmp(name, src->lockon_args[3].name) == 0) { + HA_ATOMIC_STORE(&src->lockon, TRACE_LOCKON_ARG4); + HA_ATOMIC_STORE(&src->lockon_ptr, NULL); + } + else { + memprintf(msg, "Unsupported lock-on criterion '%s'", name); + return LOG_ERR; + } + } + else if (strcmp(args[2], "verbosity") == 0) { + const char *name = args[3]; + const struct name_desc *nd; + int verbosity; + + if (!*name) { + chunk_printf(&trash, "Supported trace verbosities for source %s:\n", src->name.ptr); + chunk_appendf(&trash, " %c quiet : only report basic information with no decoding\n", + src->verbosity == 0 ? 
'*' : ' '); + if (!src->decoding || !src->decoding[0].name) { + chunk_appendf(&trash, " %c default : report extra information when available\n", + src->verbosity > 0 ? '*' : ' '); + } else { + for (nd = src->decoding; nd->name && nd->desc; nd++) + chunk_appendf(&trash, " %c %-10s : %s\n", + nd == (src->decoding + src->verbosity - 1) ? '*' : ' ', + nd->name, nd->desc); + } + trash.area[trash.data] = 0; + *msg = strdup(trash.area); + return LOG_WARNING; + } + + verbosity = trace_source_parse_verbosity(src, name); + if (verbosity < 0) { + memprintf(msg, "No such verbosity level '%s'", name); + return LOG_ERR; + } + + HA_ATOMIC_STORE(&src->verbosity, verbosity); + } + else { + memprintf(msg, "Unknown trace keyword '%s'", args[2]); + return LOG_ERR; + } + return 0; + +} + +void _trace_parse_cmd(struct trace_source *src, int level, int verbosity) +{ + src->sink = sink_find("stderr"); + src->level = level >= 0 ? level : TRACE_LEVEL_ERROR; + src->verbosity = verbosity >= 0 ? verbosity : 1; + src->state = TRACE_STATE_RUNNING; +} + +/* Parse a process argument specified via "-dt". + * + * Returns 0 on success else non-zero. + */ +int trace_parse_cmd(char *arg, char **errmsg) +{ + char *str; + + if (!arg) { + /* No trace specification, activate all sources on error level. */ + struct trace_source *src = NULL; + + list_for_each_entry(src, &trace_sources, source_link) + _trace_parse_cmd(src, -1, -1); + return 0; + } + + while ((str = strtok(arg, ","))) { + struct trace_source *src = NULL; + char *field, *name; + char *sep; + int level = -1, verbosity = -1; + + /* 1. name */ + name = str; + sep = strchr(str, ':'); + if (sep) { + str = sep + 1; + *sep = '\0'; + } + else { + str = NULL; + } + + if (strlen(name)) { + src = trace_find_source(name); + if (!src) { + memprintf(errmsg, "unknown trace source '%s'", name); + return 1; + } + } + + if (!str || !strlen(str)) + goto parse; + + /* 2. level */ + field = str; + sep = strchr(str, ':'); + if (sep) { + str = sep + 1; + *sep = '\0'; + } + else { + str = NULL; + } + + if (strlen(field)) { + level = trace_parse_level(field); + if (level < 0) { + memprintf(errmsg, "no such level '%s'", field); + return 1; + } + } + + if (!str || !strlen(str)) + goto parse; + + /* 3. verbosity */ + field = str; + if (strchr(field, ':')) { + memprintf(errmsg, "too many double-colon separator"); + return 1; + } + + if (!src && strcmp(field, "quiet") != 0) { + memprintf(errmsg, "trace source must be specified for verbosity other than 'quiet'"); + return 1; + } + + verbosity = trace_source_parse_verbosity(src, field); + if (verbosity < 0) { + memprintf(errmsg, "no such verbosity '%s' for source '%s'", field, name); + return 1; + } + + parse: + if (src) { + _trace_parse_cmd(src, level, verbosity); + } + else { + list_for_each_entry(src, &trace_sources, source_link) + _trace_parse_cmd(src, level, verbosity); + } + + /* Reset arg to NULL for strtok. 
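+ * (strtok() keeps its scanning position in internal state, so every call + * after the first one must receive NULL to continue on the same string.)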
*/ + arg = NULL; + } + + return 0; +} + +/* parse a "trace" statement in the "global" section, returns 1 if a message is returned, otherwise zero */ +static int cfg_parse_trace(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + char *msg; + int severity; + + severity = trace_parse_statement(args, &msg); + if (msg) { + if (severity >= LOG_NOTICE) + ha_notice("parsing [%s:%d] : '%s': %s\n", file, line, args[0], msg); + else if (severity >= LOG_WARNING) + ha_warning("parsing [%s:%d] : '%s': %s\n", file, line, args[0], msg); + else { + /* let the caller free the message */ + *err = msg; + return -1; + } + ha_free(&msg); + } + return 0; +} + +/* parse the command, returns 1 if a message is returned, otherwise zero */ +static int cli_parse_trace(char **args, char *payload, struct appctx *appctx, void *private) +{ + char *msg; + int severity; + + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return 1; + + severity = trace_parse_statement(args, &msg); + if (msg) + return cli_dynmsg(appctx, severity, msg); + + /* total success */ + return 0; +} + +/* parse the command, returns 1 if a message is returned, otherwise zero */ +static int cli_parse_show_trace(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct trace_source *src; + const struct sink *sink; + int i; + + args++; // make args[1] the 1st arg + + if (!*args[1]) { + /* no arg => report the list of supported sources */ + chunk_printf(&trash, + "Supported trace sources and states (.=stopped, w=waiting, R=running) :\n" + ); + + list_for_each_entry(src, &trace_sources, source_link) { + sink = src->sink; + chunk_appendf(&trash, " [%c] %-10s -> %s [drp %u] [%s]\n", + trace_state_char(src->state), src->name.ptr, + sink ? sink->name : "none", + sink ? sink->ctx.dropped : 0, + src->desc); + } + + trash.area[trash.data] = 0; + return cli_msg(appctx, LOG_INFO, trash.area); + } + + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return 1; + + src = trace_find_source(args[1]); + if (!src) + return cli_err(appctx, "No such trace source"); + + sink = src->sink; + chunk_printf(&trash, "Trace status for %s:\n", src->name.ptr); + chunk_appendf(&trash, " - sink: %s [%u dropped]\n", + sink ? sink->name : "none", sink ? 
sink->ctx.dropped : 0); + + chunk_appendf(&trash, " - event name : report start stop pause\n"); + for (i = 0; src->known_events && src->known_events[i].mask; i++) { + chunk_appendf(&trash, " %-12s : %c %c %c %c\n", + src->known_events[i].name, + trace_event_char(src->report_events, src->known_events[i].mask), + trace_event_char(src->start_events, src->known_events[i].mask), + trace_event_char(src->stop_events, src->known_events[i].mask), + trace_event_char(src->pause_events, src->known_events[i].mask)); + } + + trash.area[trash.data] = 0; + return cli_msg(appctx, LOG_WARNING, trash.area); +} + +static struct cli_kw_list cli_kws = {{ },{ + { { "trace", NULL }, "trace [<module>|0] [cmd [args...]] : manage live tracing (empty to list, 0 to stop all)", cli_parse_trace, NULL, NULL }, + { { "show", "trace", NULL }, "show trace [<module>] : show live tracing state", cli_parse_show_trace, NULL, NULL }, + {{},} +}}; + +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); + +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_GLOBAL, "trace", cfg_parse_trace, KWF_EXPERIMENTAL }, + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/uri_auth.c b/src/uri_auth.c new file mode 100644 index 0000000..db7e6c6 --- /dev/null +++ b/src/uri_auth.c @@ -0,0 +1,318 @@ +/* + * URI-based user authentication using the HTTP basic method. + * + * Copyright 2006-2007 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <stdlib.h> +#include <string.h> + +#include <haproxy/api.h> +#include <haproxy/base64.h> +#include <haproxy/errors.h> +#include <haproxy/list.h> +#include <haproxy/stats-t.h> +#include <haproxy/uri_auth.h> + + +/* + * Initializes a basic uri_auth structure header and returns a pointer to it. + * Uses the pointer provided if not NULL and not initialized. + */ +struct uri_auth *stats_check_init_uri_auth(struct uri_auth **root) +{ + struct uri_auth *u; + + if (!root || !*root) { + if ((u = calloc(1, sizeof (*u))) == NULL) + goto out_u; + + LIST_INIT(&u->http_req_rules); + LIST_INIT(&u->admin_rules); + } else + u = *root; + + if (!u->uri_prefix) { + u->uri_len = strlen(STATS_DEFAULT_URI); + if ((u->uri_prefix = strdup(STATS_DEFAULT_URI)) == NULL) + goto out_uri; + } + + if (root && !*root) + *root = u; + + return u; + + out_uri: + if (!root || !*root) + free(u); + out_u: + return NULL; +} + +/* + * Returns a default uri_auth with <uri> set as the uri_prefix. + * Uses the pointer provided if not NULL and not initialized. + */ +struct uri_auth *stats_set_uri(struct uri_auth **root, char *uri) +{ + struct uri_auth *u; + char *uri_copy; + int uri_len; + + uri_len = strlen(uri); + if ((uri_copy = strdup(uri)) == NULL) + goto out_uri; + + if ((u = stats_check_init_uri_auth(root)) == NULL) + goto out_u; + + free(u->uri_prefix); + u->uri_prefix = uri_copy; + u->uri_len = uri_len; + return u; + + out_u: + free(uri_copy); + out_uri: + return NULL; +} + +/* + * Returns a default uri_auth with <realm> set as the realm. + * Uses the pointer provided if not NULL and not initialized. 
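+ * Illustrative usage (a sketch; <curproxy> stands for the proxy being + * configured): + * + *   stats_set_realm(&curproxy->uri_auth, "HAProxy Statistics");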
+ */ +struct uri_auth *stats_set_realm(struct uri_auth **root, char *realm) +{ + struct uri_auth *u; + char *realm_copy; + + if ((realm_copy = strdup(realm)) == NULL) + goto out_realm; + + if ((u = stats_check_init_uri_auth(root)) == NULL) + goto out_u; + + free(u->auth_realm); + u->auth_realm = realm_copy; + return u; + + out_u: + free(realm_copy); + out_realm: + return NULL; +} + +/* + * Returns a default uri_auth with STAT_SHNODE flag enabled and + * <node> set as the name if it is not empty. + * Uses the pointer provided if not NULL and not initialized. + */ +struct uri_auth *stats_set_node(struct uri_auth **root, char *name) +{ + struct uri_auth *u; + char *node_copy = NULL; + + if (name && *name) { + node_copy = strdup(name); + if (node_copy == NULL) + goto out_realm; + } + + if ((u = stats_check_init_uri_auth(root)) == NULL) + goto out_u; + + if (!stats_set_flag(root, STAT_SHNODE)) + goto out_u; + + if (node_copy) { + free(u->node); + u->node = node_copy; + } + + return u; + + out_u: + free(node_copy); + out_realm: + return NULL; +} + +/* + * Returns a default uri_auth with STAT_SHDESC flag enabled and + * <description> set as the desc if it is not empty. + * Uses the pointer provided if not NULL and not initialized. + */ +struct uri_auth *stats_set_desc(struct uri_auth **root, char *desc) +{ + struct uri_auth *u; + char *desc_copy = NULL; + + if (desc && *desc) { + desc_copy = strdup(desc); + if (desc_copy == NULL) + goto out_realm; + } + + if ((u = stats_check_init_uri_auth(root)) == NULL) + goto out_u; + + if (!stats_set_flag(root, STAT_SHDESC)) + goto out_u; + + if (desc_copy) { + free(u->desc); + u->desc = desc_copy; + } + + return u; + + out_u: + free(desc_copy); + out_realm: + return NULL; +} + +/* + * Returns a default uri_auth with the <refresh> refresh interval. + * Uses the pointer provided if not NULL and not initialized. + */ +struct uri_auth *stats_set_refresh(struct uri_auth **root, int interval) +{ + struct uri_auth *u; + + if ((u = stats_check_init_uri_auth(root)) != NULL) + u->refresh = interval; + return u; +} + +/* + * Returns a default uri_auth with the <flag> set. + * Uses the pointer provided if not NULL and not initialized. + */ +struct uri_auth *stats_set_flag(struct uri_auth **root, int flag) +{ + struct uri_auth *u; + + if ((u = stats_check_init_uri_auth(root)) != NULL) + u->flags |= flag; + return u; +} + +/* + * Returns a default uri_auth with a <user:passwd> entry added to the list of + * authorized users. If a matching entry is found, no update will be performed. + * Uses the pointer provided if not NULL and not initialized. 
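+ * Illustrative usage (a sketch): + * + *   stats_add_auth(&curproxy->uri_auth, "admin:secret"); + * + * The string is split on the first ':' into user and password; without a + * colon, an empty password is used.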
+ */ +struct uri_auth *stats_add_auth(struct uri_auth **root, char *user) +{ + struct uri_auth *u; + struct auth_users *newuser; + char *pass; + + pass = strchr(user, ':'); + if (pass) + *pass++ = '\0'; + else + pass = ""; + + if ((u = stats_check_init_uri_auth(root)) == NULL) + return NULL; + + if (!u->userlist) + u->userlist = calloc(1, sizeof(*u->userlist)); + + if (!u->userlist) + return NULL; + + if (!u->userlist->name) + u->userlist->name = strdup(".internal-stats-userlist"); + + if (!u->userlist->name) + return NULL; + + for (newuser = u->userlist->users; newuser; newuser = newuser->next) + if (strcmp(newuser->user, user) == 0) { + ha_warning("uri auth: ignoring duplicated user '%s'.\n", + user); + return u; + } + + newuser = calloc(1, sizeof(*newuser)); + if (!newuser) + return NULL; + + newuser->user = strdup(user); + if (!newuser->user) { + free(newuser); + return NULL; + } + + newuser->pass = strdup(pass); + if (!newuser->pass) { + free(newuser->user); + free(newuser); + return NULL; + } + + newuser->flags |= AU_O_INSECURE; + newuser->next = u->userlist->users; + u->userlist->users = newuser; + + return u; +} + +/* + * Returns a default uri_auth with a <scope> entry added to the list of + * allowed scopes. If a matching entry is found, no update will be performed. + * Uses the pointer provided if not NULL and not initialized. + */ +struct uri_auth *stats_add_scope(struct uri_auth **root, char *scope) +{ + struct uri_auth *u; + char *new_name; + struct stat_scope *old_scope, **scope_list; + + if ((u = stats_check_init_uri_auth(root)) == NULL) + goto out; + + scope_list = &u->scope; + while ((old_scope = *scope_list)) { + if (strcmp(old_scope->px_id, scope) == 0) + break; + scope_list = &old_scope->next; + } + + if (!old_scope) { + if ((new_name = strdup(scope)) == NULL) + goto out_u; + + if ((old_scope = calloc(1, sizeof(*old_scope))) == NULL) + goto out_name; + + old_scope->px_id = new_name; + old_scope->px_len = strlen(new_name); + *scope_list = old_scope; + } + return u; + + out_name: + free(new_name); + out_u: + free(u); + out: + return NULL; +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/uri_normalizer.c b/src/uri_normalizer.c new file mode 100644 index 0000000..bc793f2 --- /dev/null +++ b/src/uri_normalizer.c @@ -0,0 +1,467 @@ +/* + * HTTP request URI normalization. + * + * Copyright 2021 Tim Duesterhus <tim@bastelstu.be> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <import/ist.h> + +#include <haproxy/api.h> +#include <haproxy/buf.h> +#include <haproxy/chunk.h> +#include <haproxy/tools.h> +#include <haproxy/uri_normalizer.h> + +/* Encodes '#' as '%23'. 
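+ * e.g. "/foo#bar#baz" becomes "/foo%23bar%23baz".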
*/ +enum uri_normalizer_err uri_normalizer_fragment_encode(const struct ist input, struct ist *dst) +{ + enum uri_normalizer_err err; + + const size_t size = istclear(dst); + struct ist output = *dst; + + struct ist scanner = input; + + while (istlen(scanner)) { + const struct ist before_hash = istsplit(&scanner, '#'); + + if (istcat(&output, before_hash, size) < 0) { + err = URI_NORMALIZER_ERR_ALLOC; + goto fail; + } + + if (istend(before_hash) != istend(scanner)) { + if (istcat(&output, ist("%23"), size) < 0) { + err = URI_NORMALIZER_ERR_ALLOC; + goto fail; + } + } + } + + *dst = output; + + return URI_NORMALIZER_ERR_NONE; + + fail: + + return err; +} + +/* Returns 1 if the given character is part of the 'unreserved' set in the + * RFC 3986 ABNF. + * Returns 0 if not. + */ +static int is_unreserved_character(unsigned char c) +{ + switch (c) { + case 'A'...'Z': /* ALPHA */ + case 'a'...'z': /* ALPHA */ + case '0'...'9': /* DIGIT */ + case '-': + case '.': + case '_': + case '~': + return 1; + default: + return 0; + } +} + +/* Decodes percent encoded characters that are part of the 'unreserved' set. + * + * RFC 3986, section 2.3: + * > URIs that differ in the replacement of an unreserved character with + * > its corresponding percent-encoded US-ASCII octet are equivalent [...] + * > when found in a URI, should be decoded to their corresponding unreserved + * > characters by URI normalizers. + * + * If `strict` is set to 0 then percent characters that are not followed by a + * hexadecimal digit are returned as-is without performing any decoding. + * If `strict` is set to 1 then `URI_NORMALIZER_ERR_INVALID_INPUT` is returned + * for invalid sequences. + */ +enum uri_normalizer_err uri_normalizer_percent_decode_unreserved(const struct ist input, int strict, struct ist *dst) +{ + enum uri_normalizer_err err; + + const size_t size = istclear(dst); + struct ist output = *dst; + + struct ist scanner = input; + + /* The output will either be shortened or have the same length. */ + if (size < istlen(input)) { + err = URI_NORMALIZER_ERR_ALLOC; + goto fail; + } + + while (istlen(scanner)) { + const char current = istshift(&scanner); + + if (current == '%') { + if (istlen(scanner) >= 2) { + if (ishex(istptr(scanner)[0]) && ishex(istptr(scanner)[1])) { + char hex1, hex2, c; + + hex1 = istshift(&scanner); + hex2 = istshift(&scanner); + c = (hex2i(hex1) << 4) + hex2i(hex2); + + if (is_unreserved_character(c)) { + output = __istappend(output, c); + } + else { + output = __istappend(output, current); + output = __istappend(output, hex1); + output = __istappend(output, hex2); + } + + continue; + } + } + + if (strict) { + err = URI_NORMALIZER_ERR_INVALID_INPUT; + goto fail; + } + else { + output = __istappend(output, current); + } + } + else { + output = __istappend(output, current); + } + } + + *dst = output; + + return URI_NORMALIZER_ERR_NONE; + + fail: + + return err; +} + +/* Uppercases letters used in percent encoding. + * + * If `strict` is set to 0 then percent characters that are not followed by a + * hexadecimal digit are returned as-is without modifying the following letters. + * If `strict` is set to 1 then `URI_NORMALIZER_ERR_INVALID_INPUT` is returned + * for invalid sequences. + */ +enum uri_normalizer_err uri_normalizer_percent_upper(const struct ist input, int strict, struct ist *dst) +{ + enum uri_normalizer_err err; + + const size_t size = istclear(dst); + struct ist output = *dst; + + struct ist scanner = input; + + /* The output will have the same length. 
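The decoder above only turns %XX back into a raw byte when that byte belongs to RFC 3986's unreserved set; reserved characters such as '/' stay encoded so the path structure cannot change. A self-contained sketch of the same rule on plain C strings, assuming hypothetical helpers hexval()/is_unreserved() (the real code uses HAProxy's ishex()/hex2i()); it turns "%7Ejoe%2Fx" into "~joe%2Fx":

#include <ctype.h>
#include <stdio.h>

/* value of a hex digit; caller must have checked isxdigit() first */
static int hexval(unsigned char c)
{
	return isdigit(c) ? c - '0' : tolower(c) - 'a' + 10;
}

/* RFC 3986 'unreserved' set: ALPHA / DIGIT / "-" / "." / "_" / "~" */
static int is_unreserved(unsigned char c)
{
	return isalnum(c) || c == '-' || c == '.' || c == '_' || c == '~';
}

/* Decode %XX only when it maps to an unreserved character; everything
 * else is copied verbatim. out must hold at least strlen(in)+1 bytes.
 */
static void decode_unreserved(const char *in, char *out)
{
	while (*in) {
		if (in[0] == '%' && isxdigit((unsigned char)in[1]) &&
		    isxdigit((unsigned char)in[2])) {
			unsigned char c = (hexval((unsigned char)in[1]) << 4) |
			                  hexval((unsigned char)in[2]);

			if (is_unreserved(c)) {
				*out++ = (char)c;
				in += 3;
				continue;
			}
		}
		*out++ = *in++;
	}
	*out = '\0';
}

int main(void)
{
	char buf[32];

	decode_unreserved("%7Ejoe%2Fx", buf);
	puts(buf); /* prints "~joe%2Fx": '~' decoded, '/' kept encoded */
	return 0;
}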
*/ + if (size < istlen(input)) { + err = URI_NORMALIZER_ERR_ALLOC; + goto fail; + } + + while (istlen(scanner)) { + const char current = istshift(&scanner); + + if (current == '%') { + if (istlen(scanner) >= 2) { + if (ishex(istptr(scanner)[0]) && ishex(istptr(scanner)[1])) { + output = __istappend(output, current); + output = __istappend(output, toupper(istshift(&scanner))); + output = __istappend(output, toupper(istshift(&scanner))); + continue; + } + } + + if (strict) { + err = URI_NORMALIZER_ERR_INVALID_INPUT; + goto fail; + } + else { + output = __istappend(output, current); + } + } + else { + output = __istappend(output, current); + } + } + + *dst = output; + + return URI_NORMALIZER_ERR_NONE; + + fail: + + return err; +} + +/* Removes `/./` from the given path. */ +enum uri_normalizer_err uri_normalizer_path_dot(const struct ist path, struct ist *dst) +{ + enum uri_normalizer_err err; + + const size_t size = istclear(dst); + struct ist newpath = *dst; + + struct ist scanner = path; + + /* The path will either be shortened or have the same length. */ + if (size < istlen(path)) { + err = URI_NORMALIZER_ERR_ALLOC; + goto fail; + } + + while (istlen(scanner) > 0) { + const struct ist segment = istsplit(&scanner, '/'); + + if (!isteq(segment, ist("."))) { + if (istcat(&newpath, segment, size) < 0) { + /* This is impossible, because we checked the size of the destination buffer. */ + my_unreachable(); + err = URI_NORMALIZER_ERR_INTERNAL_ERROR; + goto fail; + } + + if (istend(segment) != istend(scanner)) + newpath = __istappend(newpath, '/'); + } + } + + *dst = newpath; + + return URI_NORMALIZER_ERR_NONE; + + fail: + + return err; +} + +/* Merges `/../` with preceding path segments. + * + * If `full` is set to `0` then `/../` will be printed at the start of the resulting + * path if the number of `/../` exceeds the number of other segments. If `full` is + * set to `1` these will not be printed. + */ +enum uri_normalizer_err uri_normalizer_path_dotdot(const struct ist path, int full, struct ist *dst) +{ + enum uri_normalizer_err err; + + const size_t size = istclear(dst); + char * const tail = istptr(*dst) + size; + char *head = tail; + + ssize_t offset = istlen(path) - 1; + + int up = 0; + + /* The path will either be shortened or have the same length. */ + if (size < istlen(path)) { + err = URI_NORMALIZER_ERR_ALLOC; + goto fail; + } + + /* Handle `/..` at the end of the path without a trailing slash. */ + if (offset >= 2 && istmatch(istadv(path, offset - 2), ist("/.."))) { + up++; + offset -= 2; + } + + while (offset >= 0) { + if (offset >= 3 && istmatch(istadv(path, offset - 3), ist("/../"))) { + up++; + offset -= 3; + continue; + } + + if (up > 0) { + /* Skip the slash. */ + offset--; + + /* First check whether we already reached the start of the path, + * before popping the current `/../`. + */ + if (offset >= 0) { + up--; + + /* Skip the current path segment. */ + while (offset >= 0 && istptr(path)[offset] != '/') + offset--; + } + } + else { + /* Prepend the slash. */ + *(--head) = istptr(path)[offset]; + offset--; + + /* Prepend the current path segment. */ + while (offset >= 0 && istptr(path)[offset] != '/') { + *(--head) = istptr(path)[offset]; + offset--; + } + } + } + + if (up > 0) { + /* Prepend a trailing slash. */ + *(--head) = '/'; + + if (!full) { + /* Prepend unconsumed `/..`. 
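uri_normalizer_path_dot() walks the path segment by segment and simply skips segments equal to ".", re-inserting the separating slash only when the segment was not the last one. The same idea on a NUL-terminated string (path_drop_dot is a hypothetical name, not HAProxy API); "/a/./b" becomes "/a/b":

#include <string.h>

/* Drop "." segments from <path> into <out>, which must hold at least
 * strlen(path)+1 bytes. Other segments, including empty ones from
 * duplicate slashes, are kept as-is (slash merging is a separate pass).
 */
static void path_drop_dot(const char *path, char *out)
{
	const char *p = path;
	char *o = out;

	while (*p) {
		const char *seg = p;
		size_t len;

		while (*p && *p != '/')
			p++;
		len = p - seg;
		if (!(len == 1 && seg[0] == '.')) {
			memcpy(o, seg, len);
			o += len;
			if (*p)          /* keep '/' unless segment was last */
				*o++ = '/';
		}
		if (*p)
			p++;
	}
	*o = '\0';
}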
*/ + do { + *(--head) = '.'; + *(--head) = '.'; + *(--head) = '/'; + up--; + } while (up > 0); + } + } + + *dst = ist2(head, tail - head); + + return URI_NORMALIZER_ERR_NONE; + + fail: + + return err; +} + +/* Merges adjacent slashes in the given path. */ +enum uri_normalizer_err uri_normalizer_path_merge_slashes(const struct ist path, struct ist *dst) +{ + enum uri_normalizer_err err; + + const size_t size = istclear(dst); + struct ist newpath = *dst; + + struct ist scanner = path; + + /* The path will either be shortened or have the same length. */ + if (size < istlen(path)) { + err = URI_NORMALIZER_ERR_ALLOC; + goto fail; + } + + while (istlen(scanner) > 0) { + const char current = istshift(&scanner); + + if (current == '/') { + while (istlen(scanner) > 0 && *istptr(scanner) == '/') + scanner = istnext(scanner); + } + + newpath = __istappend(newpath, current); + } + + *dst = newpath; + + return URI_NORMALIZER_ERR_NONE; + + fail: + + return err; +} + +/* Compares two query parameters by name. Query parameters are ordered + * as with memcmp. Shorter parameter names are ordered lower. Identical + * parameter names are compared by their pointer to maintain a stable + * sort. + */ +static int query_param_cmp(const void *a, const void *b) +{ + const struct ist param_a = *(struct ist*)a; + const struct ist param_b = *(struct ist*)b; + const struct ist param_a_name = iststop(param_a, '='); + const struct ist param_b_name = iststop(param_b, '='); + + int cmp = istdiff(param_a_name, param_b_name); + + if (cmp != 0) + return cmp; + + /* The contents are identical: Compare the pointer. */ + if (istptr(param_a) < istptr(param_b)) + return -1; + + if (istptr(param_a) > istptr(param_b)) + return 1; + + return 0; +} + +/* Sorts the parameters within the given query string. */ +enum uri_normalizer_err uri_normalizer_query_sort(const struct ist query, const char delim, struct ist *dst) +{ + enum uri_normalizer_err err; + + const size_t size = istclear(dst); + struct ist newquery = *dst; + + struct ist scanner = query; + + const struct buffer *trash = get_trash_chunk(); + struct ist *params = (struct ist *)b_orig(trash); + const size_t max_param = b_size(trash) / sizeof(*params); + size_t param_count = 0; + + size_t i; + + /* The query will have the same length. */ + if (size < istlen(query)) { + err = URI_NORMALIZER_ERR_ALLOC; + goto fail; + } + + /* Handle the leading '?'. */ + newquery = __istappend(newquery, istshift(&scanner)); + + while (istlen(scanner) > 0) { + const struct ist param = istsplit(&scanner, delim); + + if (param_count + 1 > max_param) { + err = URI_NORMALIZER_ERR_ALLOC; + goto fail; + } + + params[param_count] = param; + param_count++; + } + + qsort(params, param_count, sizeof(*params), query_param_cmp); + + for (i = 0; i < param_count; i++) { + if (i > 0) + newquery = __istappend(newquery, delim); + + if (istcat(&newquery, params[i], size) < 0) { + /* This is impossible, because we checked the size of the destination buffer. 
*/ + my_unreachable(); + err = URI_NORMALIZER_ERR_INTERNAL_ERROR; + goto fail; + } + } + + *dst = newquery; + + return URI_NORMALIZER_ERR_NONE; + + fail: + + return err; +} + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/src/vars.c b/src/vars.c new file mode 100644 index 0000000..7ec753e --- /dev/null +++ b/src/vars.c @@ -0,0 +1,1454 @@ +#include <ctype.h> + +#include <haproxy/api.h> +#include <haproxy/arg.h> +#include <haproxy/buf.h> +#include <haproxy/cfgparse.h> +#include <haproxy/check.h> +#include <haproxy/cli.h> +#include <haproxy/global.h> +#include <haproxy/http.h> +#include <haproxy/http_rules.h> +#include <haproxy/list.h> +#include <haproxy/log.h> +#include <haproxy/sample.h> +#include <haproxy/session.h> +#include <haproxy/stream-t.h> +#include <haproxy/tcp_rules.h> +#include <haproxy/tcpcheck.h> +#include <haproxy/tools.h> +#include <haproxy/vars.h> +#include <haproxy/xxhash.h> + + +/* This contains a pool of struct vars */ +DECLARE_STATIC_POOL(var_pool, "vars", sizeof(struct var)); + +/* list of variables for the process scope. */ +struct vars proc_vars THREAD_ALIGNED(64); + +/* These variables hold the per-context system limits. */ +static unsigned int var_global_limit = 0; +static unsigned int var_proc_limit = 0; +static unsigned int var_sess_limit = 0; +static unsigned int var_txn_limit = 0; +static unsigned int var_reqres_limit = 0; +static unsigned int var_check_limit = 0; +static uint64_t var_name_hash_seed = 0; + +/* Structure and array matching set-var conditions to their respective flag + * value. + */ +struct var_set_condition { + const char *cond_str; + uint flag; +}; + +static struct var_set_condition conditions_array[] = { + { "ifexists", VF_COND_IFEXISTS }, + { "ifnotexists", VF_COND_IFNOTEXISTS }, + { "ifempty", VF_COND_IFEMPTY }, + { "ifnotempty", VF_COND_IFNOTEMPTY }, + { "ifset", VF_COND_IFSET }, + { "ifnotset", VF_COND_IFNOTSET }, + { "ifgt", VF_COND_IFGT }, + { "iflt", VF_COND_IFLT }, + { NULL, 0 } +}; + +/* returns the struct vars pointer for a session, stream and scope, or NULL if + * it does not exist. + */ +static inline struct vars *get_vars(struct session *sess, struct stream *strm, enum vars_scope scope) +{ + switch (scope) { + case SCOPE_PROC: + return &proc_vars; + case SCOPE_SESS: + return sess ? &sess->vars : NULL; + case SCOPE_CHECK: { + struct check *check = sess ? objt_check(sess->origin) : NULL; + + return check ? &check->vars : NULL; + } + case SCOPE_TXN: + return strm ? &strm->vars_txn : NULL; + case SCOPE_REQ: + case SCOPE_RES: + default: + return strm ? &strm->vars_reqres : NULL; + } +} + +/* This function adds or removes memory size from the accounting. The inner + * pointers may be null when setting the outer ones only.
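The accounting helpers that follow keep one atomic byte counter per scope and test it against an optional limit before reserving. Note that, as in the real code, the check and the increment are two separate atomic operations, so concurrent callers may overshoot a little: the limits are soft bounds. A minimal sketch with C11 atomics (struct acct and its helpers are hypothetical names, not HAProxy API):

#include <stdatomic.h>

struct acct {
	_Atomic unsigned int size;
	unsigned int limit;           /* 0 means unlimited */
};

/* Try to reserve <size> bytes; refuse when a non-zero limit would be
 * exceeded. The load and the add are not one atomic step, so the limit
 * is a soft bound under concurrency, as in var_accounting_add().
 */
static int acct_reserve(struct acct *a, unsigned int size)
{
	if (a->limit && atomic_load(&a->size) + size > a->limit)
		return 0;
	atomic_fetch_add(&a->size, size);
	return 1;
}

static void acct_release(struct acct *a, unsigned int size)
{
	atomic_fetch_sub(&a->size, size);
}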
+ */ +void var_accounting_diff(struct vars *vars, struct session *sess, struct stream *strm, int size) +{ + switch (vars->scope) { + case SCOPE_REQ: + case SCOPE_RES: + if (var_reqres_limit && strm) + _HA_ATOMIC_ADD(&strm->vars_reqres.size, size); + __fallthrough; + case SCOPE_TXN: + if (var_txn_limit && strm) + _HA_ATOMIC_ADD(&strm->vars_txn.size, size); + goto scope_sess; + case SCOPE_CHECK: + if (var_check_limit) { + struct check *check = objt_check(sess->origin); + + if (check) + _HA_ATOMIC_ADD(&check->vars.size, size); + } +scope_sess: + __fallthrough; + case SCOPE_SESS: + if (var_sess_limit) + _HA_ATOMIC_ADD(&sess->vars.size, size); + __fallthrough; + case SCOPE_PROC: + if (var_proc_limit || var_global_limit) + _HA_ATOMIC_ADD(&proc_vars.size, size); + } +} + +/* This function returns 1 if the <size> is available in the var + * pool <vars>, otherwise returns 0. If the space is available, + * the size is reserved. The inner pointers may be null when setting + * the outer ones only. The accounting uses either <sess> or <strm> + * depending on the scope. <strm> may be NULL when no stream is known + * and only the session exists (eg: tcp-request connection). + */ +static int var_accounting_add(struct vars *vars, struct session *sess, struct stream *strm, int size) +{ + switch (vars->scope) { + case SCOPE_REQ: + case SCOPE_RES: + if (var_reqres_limit && strm && strm->vars_reqres.size + size > var_reqres_limit) + return 0; + __fallthrough; + case SCOPE_TXN: + if (var_txn_limit && strm && strm->vars_txn.size + size > var_txn_limit) + return 0; + goto scope_sess; + case SCOPE_CHECK: { + struct check *check = objt_check(sess->origin); + + if (var_check_limit && check && check->vars.size + size > var_check_limit) + return 0; + } +scope_sess: + __fallthrough; + case SCOPE_SESS: + if (var_sess_limit && sess->vars.size + size > var_sess_limit) + return 0; + __fallthrough; + case SCOPE_PROC: + /* note: scope proc collects all others and is currently identical to the + * global limit. + */ + if (var_proc_limit && proc_vars.size + size > var_proc_limit) + return 0; + if (var_global_limit && proc_vars.size + size > var_global_limit) + return 0; + } + var_accounting_diff(vars, sess, strm, size); + return 1; +} + +/* This function removes a variable from the list and frees the memory it was + * using. If the variable is marked "VF_PERMANENT", the sample_data is only + * reset to SMP_T_ANY unless <force> is non-zero. Returns the freed size. + */ +unsigned int var_clear(struct var *var, int force) +{ + unsigned int size = 0; + + if (var->data.type == SMP_T_STR || var->data.type == SMP_T_BIN) { + ha_free(&var->data.u.str.area); + size += var->data.u.str.data; + } + else if (var->data.type == SMP_T_METH && var->data.u.meth.meth == HTTP_METH_OTHER) { + ha_free(&var->data.u.meth.str.area); + size += var->data.u.meth.str.data; + } + /* wipe the sample */ + var->data.type = SMP_T_ANY; + + if (!(var->flags & VF_PERMANENT) || force) { + LIST_DELETE(&var->l); + pool_free(var_pool, var); + size += sizeof(struct var); + } + return size; +} + +/* This function frees all the memory used by all the variables + * in the list.
+ */ +void vars_prune(struct vars *vars, struct session *sess, struct stream *strm) +{ + struct var *var, *tmp; + unsigned int size = 0; + + vars_wrlock(vars); + list_for_each_entry_safe(var, tmp, &vars->head, l) { + size += var_clear(var, 1); + } + vars_wrunlock(vars); + var_accounting_diff(vars, sess, strm, -size); +} + +/* This function frees all the memory used by all the session variables in the + * list starting at <vars>. + */ +void vars_prune_per_sess(struct vars *vars) +{ + struct var *var, *tmp; + unsigned int size = 0; + + vars_wrlock(vars); + list_for_each_entry_safe(var, tmp, &vars->head, l) { + size += var_clear(var, 1); + } + vars_wrunlock(vars); + + if (var_sess_limit) + _HA_ATOMIC_SUB(&vars->size, size); + if (var_proc_limit || var_global_limit) + _HA_ATOMIC_SUB(&proc_vars.size, size); +} + +/* This function initializes a variables list head */ +void vars_init_head(struct vars *vars, enum vars_scope scope) +{ + LIST_INIT(&vars->head); + vars->scope = scope; + vars->size = 0; + HA_RWLOCK_INIT(&vars->rwlock); +} + +/* This function returns a hash value and a scope for a variable name of a + * specified length. It makes sure that the scope is valid. It returns non-zero + * on success, 0 on failure. Neither hash nor scope may be NULL. + */ +static int vars_hash_name(const char *name, int len, enum vars_scope *scope, + uint64_t *hash, char **err) +{ + const char *tmp; + + /* Check length. */ + if (len == 0) { + memprintf(err, "Empty variable name cannot be accepted"); + return 0; + } + + /* Check scope. */ + if (len > 5 && strncmp(name, "proc.", 5) == 0) { + name += 5; + len -= 5; + *scope = SCOPE_PROC; + } + else if (len > 5 && strncmp(name, "sess.", 5) == 0) { + name += 5; + len -= 5; + *scope = SCOPE_SESS; + } + else if (len > 4 && strncmp(name, "txn.", 4) == 0) { + name += 4; + len -= 4; + *scope = SCOPE_TXN; + } + else if (len > 4 && strncmp(name, "req.", 4) == 0) { + name += 4; + len -= 4; + *scope = SCOPE_REQ; + } + else if (len > 4 && strncmp(name, "res.", 4) == 0) { + name += 4; + len -= 4; + *scope = SCOPE_RES; + } + else if (len > 6 && strncmp(name, "check.", 6) == 0) { + name += 6; + len -= 6; + *scope = SCOPE_CHECK; + } + else { + memprintf(err, "invalid variable name '%.*s'. A variable name must start with its scope. " + "The scope can be 'proc', 'sess', 'txn', 'req', 'res' or 'check'", len, name); + return 0; + } + + /* Check variable name syntax. */ + for (tmp = name; tmp < name + len; tmp++) { + if (!isalnum((unsigned char)*tmp) && *tmp != '_' && *tmp != '.') { + memprintf(err, "invalid syntax at char '%s'", tmp); + return 0; + } + } + + *hash = XXH3(name, len, var_name_hash_seed); + return 1; +} + +/* This function returns the variable from the given list that matches + * <name_hash> or returns NULL if not found. It's only a linked list since it + * is not expected to have many variables per scope (a few tens at best). + * The caller is responsible for ensuring that <vars> is properly locked. + */ +static struct var *var_get(struct vars *vars, uint64_t name_hash) +{ + struct var *var; + + list_for_each_entry(var, &vars->head, l) + if (var->name_hash == name_hash) + return var; + return NULL; +} + +/* Returns 0 on failure, 1 on success.
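vars_hash_name() recognizes the scope by a literal prefix and requires at least one character after the dot (hence the len > 5 style checks) before hashing the remainder with XXH3. The same matching can be written table-driven; a standalone sketch (parse_scope and the enum are hypothetical names, the real code returns SCOPE_* values):

#include <stdio.h>
#include <string.h>

enum scope { SC_PROC, SC_SESS, SC_TXN, SC_REQ, SC_RES, SC_CHECK, SC_BAD };

/* Strip a known "<scope>." prefix from *name and return the scope;
 * the remainder must be non-empty, mirroring the len > 5 checks above.
 */
static enum scope parse_scope(const char **name)
{
	static const struct { const char *pfx; enum scope sc; } map[] = {
		{ "proc.", SC_PROC }, { "sess.", SC_SESS }, { "txn.", SC_TXN },
		{ "req.",  SC_REQ },  { "res.",  SC_RES },  { "check.", SC_CHECK },
	};

	for (size_t i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
		size_t len = strlen(map[i].pfx);

		if (strncmp(*name, map[i].pfx, len) == 0 && (*name)[len]) {
			*name += len;
			return map[i].sc;
		}
	}
	return SC_BAD;
}

int main(void)
{
	const char *n = "txn.my_var";
	enum scope sc = parse_scope(&n);

	printf("scope=%d rest=%s\n", sc, n); /* scope=2 rest=my_var */
	return 0;
}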
*/ +static int smp_fetch_var(const struct arg *args, struct sample *smp, const char *kw, void *private) +{ + const struct var_desc *var_desc = &args[0].data.var; + const struct buffer *def = NULL; + + if (args[1].type == ARGT_STR) + def = &args[1].data.str; + + return vars_get_by_desc(var_desc, smp, def); +} + +/* + * Clear the contents of a variable so that it can be reset directly. + * This function is used just before a variable is filled out of a sample's + * content. + */ +static inline void var_clear_buffer(struct sample *smp, struct vars *vars, struct var *var, int var_type) +{ + if (var_type == SMP_T_STR || var_type == SMP_T_BIN) { + ha_free(&var->data.u.str.area); + var_accounting_diff(vars, smp->sess, smp->strm, + -var->data.u.str.data); + } + else if (var_type == SMP_T_METH && var->data.u.meth.meth == HTTP_METH_OTHER) { + ha_free(&var->data.u.meth.str.area); + var_accounting_diff(vars, smp->sess, smp->strm, + -var->data.u.meth.str.data); + } +} + +/* This function tries to create a variable whose name hash is <name_hash> in + * scope <scope> and store sample <smp> as its value. + * + * The stream and session are extracted from <smp>, whose stream may be NULL + * when scope is SCOPE_SESS. In case there wouldn't be enough memory to store + * the sample while the variable was already created, it would be changed to + * a bool (which is memory-less). + * + * Flags is a bitfield that may contain one of the following flags: + * - VF_CREATEONLY: do nothing if the variable already exists (success). + * - VF_PERMANENT: this flag will be passed to the variable upon creation + * + * - VF_COND_IFEXISTS: only set variable if it already exists + * - VF_COND_IFNOTEXISTS: only set variable if it did not exist yet + * - VF_COND_IFEMPTY: only set variable if sample is empty + * - VF_COND_IFNOTEMPTY: only set variable if sample is not empty + * - VF_COND_IFSET: only set variable if its type is not SMP_TYPE_ANY + * - VF_COND_IFNOTSET: only set variable if its type is ANY + * - VF_COND_IFGT: only set variable if its value is greater than the sample's + * - VF_COND_IFLT: only set variable if its value is less than the sample's + * + * It returns 0 on failure, non-zero on success. + */ +static int var_set(uint64_t name_hash, enum vars_scope scope, struct sample *smp, uint flags) +{ + struct vars *vars; + struct var *var; + int ret = 0; + int previous_type = SMP_T_ANY; + + vars = get_vars(smp->sess, smp->strm, scope); + if (!vars || vars->scope != scope) + return 0; + + vars_wrlock(vars); + + /* Look for existing variable name. */ + var = var_get(vars, name_hash); + + if (var) { + if (flags & VF_CREATEONLY) { + ret = 1; + goto unlock; + } + + if (flags & VF_COND_IFNOTEXISTS) + goto unlock; + } else { + if (flags & VF_COND_IFEXISTS) + goto unlock; + + /* Check memory available. */ + if (!var_accounting_add(vars, smp->sess, smp->strm, sizeof(struct var))) + goto unlock; + + /* Create new entry. */ + var = pool_alloc(var_pool); + if (!var) + goto unlock; + LIST_APPEND(&vars->head, &var->l); + var->name_hash = name_hash; + var->flags = flags & VF_PERMANENT; + var->data.type = SMP_T_ANY; + } + + /* A variable of type SMP_T_ANY is considered as unset (either created + * and never set or unset-var was called on it). + */ + if ((flags & VF_COND_IFSET && var->data.type == SMP_T_ANY) || + (flags & VF_COND_IFNOTSET && var->data.type != SMP_T_ANY)) + goto unlock; + + /* Set type. 
*/ + previous_type = var->data.type; + var->data.type = smp->data.type; + + if (flags & VF_COND_IFEMPTY) { + switch(smp->data.type) { + case SMP_T_ANY: + case SMP_T_STR: + case SMP_T_BIN: + /* The actual test on the contents of the sample will be + * performed later. + */ + break; + default: + /* The sample cannot be empty since it has a scalar type. */ + var->data.type = previous_type; + goto unlock; + } + } + + /* Copy data. If the data needs memory, the function can fail. */ + switch (var->data.type) { + case SMP_T_BOOL: + var_clear_buffer(smp, vars, var, previous_type); + var->data.u.sint = smp->data.u.sint; + break; + case SMP_T_SINT: + if (previous_type == var->data.type) { + if (((flags & VF_COND_IFGT) && !(var->data.u.sint > smp->data.u.sint)) || + ((flags & VF_COND_IFLT) && !(var->data.u.sint < smp->data.u.sint))) + goto unlock; + } + var_clear_buffer(smp, vars, var, previous_type); + var->data.u.sint = smp->data.u.sint; + break; + case SMP_T_IPV4: + var_clear_buffer(smp, vars, var, previous_type); + var->data.u.ipv4 = smp->data.u.ipv4; + break; + case SMP_T_IPV6: + var_clear_buffer(smp, vars, var, previous_type); + var->data.u.ipv6 = smp->data.u.ipv6; + break; + case SMP_T_STR: + case SMP_T_BIN: + if ((flags & VF_COND_IFNOTEMPTY && !smp->data.u.str.data) || + (flags & VF_COND_IFEMPTY && smp->data.u.str.data)) { + var->data.type = previous_type; + goto unlock; + } + var_clear_buffer(smp, vars, var, previous_type); + if (!var_accounting_add(vars, smp->sess, smp->strm, smp->data.u.str.data)) { + var->data.type = SMP_T_BOOL; /* This type doesn't use additional memory. */ + goto unlock; + } + + var->data.u.str.area = malloc(smp->data.u.str.data); + if (!var->data.u.str.area) { + var_accounting_diff(vars, smp->sess, smp->strm, + -smp->data.u.str.data); + var->data.type = SMP_T_BOOL; /* This type doesn't use additional memory. */ + goto unlock; + } + var->data.u.str.data = smp->data.u.str.data; + memcpy(var->data.u.str.area, smp->data.u.str.area, + var->data.u.str.data); + break; + case SMP_T_METH: + var_clear_buffer(smp, vars, var, previous_type); + var->data.u.meth.meth = smp->data.u.meth.meth; + if (smp->data.u.meth.meth != HTTP_METH_OTHER) + break; + + if (!var_accounting_add(vars, smp->sess, smp->strm, smp->data.u.meth.str.data)) { + var->data.type = SMP_T_BOOL; /* This type doesn't use additional memory. */ + goto unlock; + } + + var->data.u.meth.str.area = malloc(smp->data.u.meth.str.data); + if (!var->data.u.meth.str.area) { + var_accounting_diff(vars, smp->sess, smp->strm, + -smp->data.u.meth.str.data); + var->data.type = SMP_T_BOOL; /* This type doesn't use additional memory. */ + goto unlock; + } + var->data.u.meth.str.data = smp->data.u.meth.str.data; + var->data.u.meth.str.size = smp->data.u.meth.str.data; + memcpy(var->data.u.meth.str.area, smp->data.u.meth.str.area, + var->data.u.meth.str.data); + break; + } + + /* OK, now done */ + ret = 1; + unlock: + vars_wrunlock(vars); + return ret; +} + +/* Deletes a variable matching name hash <name_hash> and scope <scope> for the + * session and stream found in <smp>. Note that stream may be null for + * SCOPE_SESS. Returns 0 if the scope was not found otherwise 1. + */ +static int var_unset(uint64_t name_hash, enum vars_scope scope, struct sample *smp) +{ + struct vars *vars; + struct var *var; + unsigned int size = 0; + + vars = get_vars(smp->sess, smp->strm, scope); + if (!vars || vars->scope != scope) + return 0; + + /* Look for existing variable name. 
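When var_set() cannot allocate the buffer for a string sample, it does not fail the store: it rolls back the accounting and downgrades the variable to SMP_T_BOOL, which needs no extra memory. A simplified sketch of that degrade-on-OOM idea (enum vtype, struct value and set_string are hypothetical; unlike the original, this version frees the old buffer only after the new allocation succeeded):

#include <stdlib.h>
#include <string.h>

enum vtype { V_BOOL, V_STR };

struct value {
	enum vtype type;
	char *str;        /* valid only when type == V_STR */
	size_t len;
};

/* On OOM, keep the variable but drop it to the memory-less V_BOOL type
 * instead of reporting an error, like the SMP_T_BOOL fallback above.
 */
static void set_string(struct value *v, const char *s, size_t len)
{
	char *copy = malloc(len);

	if (!copy) {
		v->type = V_BOOL;     /* memory-less fallback */
		return;
	}
	memcpy(copy, s, len);
	if (v->type == V_STR)
		free(v->str);
	v->str = copy;
	v->len = len;
	v->type = V_STR;
}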
*/ + vars_wrlock(vars); + var = var_get(vars, name_hash); + if (var) { + size = var_clear(var, 0); + var_accounting_diff(vars, smp->sess, smp->strm, -size); + } + vars_wrunlock(vars); + return 1; +} + + +/* + * Convert a string set-var condition into its numerical value. + * The corresponding bit is set in the <cond_bitmap> parameter if the + * <cond> is known. + * Returns 1 in case of success. + */ +static int vars_parse_cond_param(const struct buffer *cond, uint *cond_bitmap, char **err) +{ + struct var_set_condition *cond_elt = &conditions_array[0]; + + /* The conditions array is NULL terminated. */ + while (cond_elt->cond_str) { + if (chunk_strcmp(cond, cond_elt->cond_str) == 0) { + *cond_bitmap |= cond_elt->flag; + break; + } + ++cond_elt; + } + + if (cond_elt->cond_str == NULL && err) + memprintf(err, "unknown condition \"%.*s\"", (int)cond->data, cond->area); + + return cond_elt->cond_str != NULL; +} + +/* Returns 0 on failure, 1 on success. */ +static int smp_conv_store(const struct arg *args, struct sample *smp, void *private) +{ + uint conditions = 0; + int cond_idx = 1; + + while (args[cond_idx].type == ARGT_STR) { + if (vars_parse_cond_param(&args[cond_idx++].data.str, &conditions, NULL) == 0) + break; + } + + return var_set(args[0].data.var.name_hash, args[0].data.var.scope, smp, conditions); +} + +/* Returns 0 on failure, 1 on success. */ +static int smp_conv_clear(const struct arg *args, struct sample *smp, void *private) +{ + return var_unset(args[0].data.var.name_hash, args[0].data.var.scope, smp); +} + +/* This function checks an argument entry and fills it with a variable + * type. The argument must be a string. If the variable lookup fails, + * the function returns 0 and fills <err>, otherwise it returns 1. + */ +int vars_check_arg(struct arg *arg, char **err) +{ + enum vars_scope scope; + struct sample empty_smp = { }; + uint64_t hash; + + /* Check arg type. */ + if (arg->type != ARGT_STR) { + memprintf(err, "unexpected argument type"); + return 0; + } + + /* Register new variable name. */ + if (!vars_hash_name(arg->data.str.area, arg->data.str.data, &scope, &hash, err)) + return 0; + + if (scope == SCOPE_PROC && !var_set(hash, scope, &empty_smp, VF_CREATEONLY|VF_PERMANENT)) + return 0; + + /* properly destroy the chunk */ + chunk_destroy(&arg->data.str); + + /* Use the global variable name pointer. */ + arg->type = ARGT_VAR; + arg->data.var.name_hash = hash; + arg->data.var.scope = scope; + return 1; +} + +/* This function stores a sample in a variable unless it is of type "proc" and + * not defined yet. + * Returns zero on failure and non-zero otherwise. The variable not being + * defined is treated as a failure. + */ +int vars_set_by_name_ifexist(const char *name, size_t len, struct sample *smp) +{ + enum vars_scope scope; + uint64_t hash; + + /* Resolve name and scope. */ + if (!vars_hash_name(name, len, &scope, &hash, NULL)) + return 0; + + /* Variable creation is allowed for all scopes apart from the PROC one. */ + return var_set(hash, scope, smp, (scope == SCOPE_PROC) ? VF_COND_IFEXISTS : 0); +} + + +/* This function stores a sample in a variable. + * Returns zero on failure and non-zero otherwise. + */ +int vars_set_by_name(const char *name, size_t len, struct sample *smp) +{ + enum vars_scope scope; + uint64_t hash; + + /* Resolve name and scope. */ + if (!vars_hash_name(name, len, &scope, &hash, NULL)) + return 0; + + return var_set(hash, scope, smp, 0); +} + +/* This function unsets a variable if it was already defined.
+ * Returns zero on failure and non-zero otherwise. + */ +int vars_unset_by_name_ifexist(const char *name, size_t len, struct sample *smp) +{ + enum vars_scope scope; + uint64_t hash; + + /* Resolve name and scope. */ + if (!vars_hash_name(name, len, &scope, &hash, NULL)) + return 0; + + return var_unset(hash, scope, smp); +} + + +/* This retrieves the variable whose hash matches <name_hash> from variables <vars>, + * and if found and not empty, duplicates the result into sample <smp>. + * smp_dup() is used in order to release the variables lock ASAP (so a pre- + * allocated chunk is obtained via get_trash_chunk()). The variables' lock is + * used for reads. + * + * The function returns 0 if the variable was not found and no default + * value was provided in <def>, otherwise 1 with the sample filled. + * Default values are always returned as strings. + */ +static int var_to_smp(struct vars *vars, uint64_t name_hash, struct sample *smp, const struct buffer *def) +{ + struct var *var; + + /* Get the variable entry. */ + vars_rdlock(vars); + var = var_get(vars, name_hash); + if (!var || !var->data.type) { + if (!def) { + vars_rdunlock(vars); + return 0; + } + + /* not found but we have a default value */ + smp->data.type = SMP_T_STR; + smp->data.u.str = *def; + } + else + smp->data = var->data; + + /* Copy sample. */ + smp_dup(smp); + + vars_rdunlock(vars); + return 1; +} + +/* This function fills a sample with the variable content. + * + * Keep in mind that a sample content is duplicated by using smp_dup() + * and it therefore uses a pre-allocated trash chunk as returned by + * get_trash_chunk(). + * + * If the variable is not valid in this scope, 0 is always returned. + * If the variable is valid but not found, either the default value + * <def> is returned if not NULL, or zero is returned. + * + * Returns 1 if the sample is filled, otherwise it returns 0. + */ +int vars_get_by_name(const char *name, size_t len, struct sample *smp, const struct buffer *def) +{ + struct vars *vars; + enum vars_scope scope; + uint64_t hash; + + /* Resolve name and scope. */ + if (!vars_hash_name(name, len, &scope, &hash, NULL)) + return 0; + + /* Select "vars" pool according to the scope. */ + vars = get_vars(smp->sess, smp->strm, scope); + if (!vars || vars->scope != scope) + return 0; + + return var_to_smp(vars, hash, smp, def); +} + +/* This function fills a sample with the content of the variable described + * by <var_desc>. + * + * Keep in mind that a sample content is duplicated by using smp_dup() + * and it therefore uses a pre-allocated trash chunk as returned by + * get_trash_chunk(). + * + * If the variable is not valid in this scope, 0 is always returned. + * If the variable is valid but not found, either the default value + * <def> is returned if not NULL, or zero is returned. + * + * Returns 1 if the sample is filled, otherwise it returns 0. + */ +int vars_get_by_desc(const struct var_desc *var_desc, struct sample *smp, const struct buffer *def) +{ + struct vars *vars; + + /* Select "vars" pool according to the scope. */ + vars = get_vars(smp->sess, smp->strm, var_desc->scope); + + /* Check if the scope is available at this point of processing. */ + if (!vars || vars->scope != var_desc->scope) + return 0; + + return var_to_smp(vars, var_desc->name_hash, smp, def); +} + +/* Always returns ACT_RET_CONT even if an error occurs.
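var_to_smp() holds the variables' read lock only for the duration of the copy: smp_dup() clones the value into a trash chunk so the lock can be dropped before the caller processes the sample. The same copy-then-unlock discipline with a plain pthread rwlock (struct shared_str and read_copy are hypothetical names, not HAProxy API):

#include <pthread.h>
#include <string.h>

struct shared_str {
	pthread_rwlock_t lock;    /* init with PTHREAD_RWLOCK_INITIALIZER */
	char val[64];
};

/* Copy the value out under the read lock, then release it immediately;
 * the caller works on its private copy. Assumes outsz > 0.
 */
static void read_copy(struct shared_str *s, char *out, size_t outsz)
{
	pthread_rwlock_rdlock(&s->lock);
	strncpy(out, s->val, outsz - 1);
	out[outsz - 1] = '\0';
	pthread_rwlock_unlock(&s->lock);
}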
*/ +static enum act_return action_store(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct buffer *fmtstr = NULL; + struct sample smp; + int dir; + + switch (rule->from) { + case ACT_F_TCP_REQ_CON: dir = SMP_OPT_DIR_REQ; break; + case ACT_F_TCP_REQ_SES: dir = SMP_OPT_DIR_REQ; break; + case ACT_F_TCP_REQ_CNT: dir = SMP_OPT_DIR_REQ; break; + case ACT_F_TCP_RES_CNT: dir = SMP_OPT_DIR_RES; break; + case ACT_F_HTTP_REQ: dir = SMP_OPT_DIR_REQ; break; + case ACT_F_HTTP_RES: dir = SMP_OPT_DIR_RES; break; + case ACT_F_TCP_CHK: dir = SMP_OPT_DIR_REQ; break; + case ACT_F_CFG_PARSER: dir = SMP_OPT_DIR_REQ; break; /* not used anyway */ + case ACT_F_CLI_PARSER: dir = SMP_OPT_DIR_REQ; break; /* not used anyway */ + default: + send_log(px, LOG_ERR, "Vars: internal error while executing the store action."); + if (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)) + ha_alert("Vars: internal error while executing the store action.\n"); + return ACT_RET_CONT; + } + + /* Process the expression. */ + memset(&smp, 0, sizeof(smp)); + + if (!LIST_ISEMPTY(&rule->arg.vars.fmt)) { + /* a format-string is used */ + + fmtstr = alloc_trash_chunk(); + if (!fmtstr) { + send_log(px, LOG_ERR, "Vars: memory allocation failure while processing store rule."); + if (!(global.mode & MODE_QUIET) || (global.mode & MODE_VERBOSE)) + ha_alert("Vars: memory allocation failure while processing store rule.\n"); + return ACT_RET_CONT; + } + + /* execute the log-format expression */ + fmtstr->data = sess_build_logline(sess, s, fmtstr->area, fmtstr->size, &rule->arg.vars.fmt); + + /* convert it to a sample of type string as it's what the vars + * API consumes, and store it. + */ + smp_set_owner(&smp, px, sess, s, 0); + smp.data.type = SMP_T_STR; + smp.data.u.str = *fmtstr; + var_set(rule->arg.vars.name_hash, rule->arg.vars.scope, &smp, rule->arg.vars.conditions); + } + else { + /* an expression is used */ + if (!sample_process(px, sess, s, dir|SMP_OPT_FINAL, + rule->arg.vars.expr, &smp)) + return ACT_RET_CONT; + } + + /* Store the sample, and ignore errors. */ + var_set(rule->arg.vars.name_hash, rule->arg.vars.scope, &smp, rule->arg.vars.conditions); + free_trash_chunk(fmtstr); + return ACT_RET_CONT; +} + +/* Always returns ACT_RET_CONT even if an error occurs. */ +static enum act_return action_clear(struct act_rule *rule, struct proxy *px, + struct session *sess, struct stream *s, int flags) +{ + struct sample smp; + + memset(&smp, 0, sizeof(smp)); + smp_set_owner(&smp, px, sess, s, SMP_OPT_FINAL); + + /* Clear the variable using the sample context, and ignore errors. */ + var_unset(rule->arg.vars.name_hash, rule->arg.vars.scope, &smp); + return ACT_RET_CONT; +} + +static void release_store_rule(struct act_rule *rule) +{ + struct logformat_node *lf, *lfb; + + list_for_each_entry_safe(lf, lfb, &rule->arg.vars.fmt, list) { + LIST_DELETE(&lf->list); + release_sample_expr(lf->expr); + free(lf->arg); + free(lf); + } + + release_sample_expr(rule->arg.vars.expr); +} + +/* These two functions check the variable name and replace the + * configuration string name with the global string name. It is + * the same string, but the global pointer is easier to + * compare. They return non-zero on success, zero on failure. + * + * The first function checks a sample-fetch and the second + * checks a converter.
+ */ +static int smp_check_var(struct arg *args, char **err) +{ + return vars_check_arg(&args[0], err); +} + +static int conv_check_var(struct arg *args, struct sample_conv *conv, + const char *file, int line, char **err_msg) +{ + int cond_idx = 1; + uint conditions = 0; + int retval = vars_check_arg(&args[0], err_msg); + + while (retval && args[cond_idx].type == ARGT_STR) + retval = vars_parse_cond_param(&args[cond_idx++].data.str, &conditions, err_msg); + + return retval; +} + +/* This function is a common parser for using variables. It understands + * the format: + * + * set-var-fmt(<variable-name>[,<cond> ...]) <format-string> + * set-var(<variable-name>[,<cond> ...]) <expression> + * unset-var(<variable-name>) + * + * It returns ACT_RET_PRS_ERR if fails and <err> is filled with an error + * message. Otherwise, it returns ACT_RET_PRS_OK and the variable <expr> + * is filled with the pointer to the expression to execute. The proxy is + * only used to retrieve the ->conf entries. + */ +static enum act_parse_ret parse_store(const char **args, int *arg, struct proxy *px, + struct act_rule *rule, char **err) +{ + const char *var_name = args[*arg-1]; + int var_len; + const char *kw_name; + int flags = 0, set_var = 0; /* 0=unset-var, 1=set-var, 2=set-var-fmt */ + struct sample empty_smp = { }; + struct ist condition = IST_NULL; + struct ist var = IST_NULL; + struct ist varname_ist = IST_NULL; + + if (strncmp(var_name, "set-var-fmt", 11) == 0) { + var_name += 11; + set_var = 2; + } + else if (strncmp(var_name, "set-var", 7) == 0) { + var_name += 7; + set_var = 1; + } + else if (strncmp(var_name, "unset-var", 9) == 0) { + var_name += 9; + set_var = 0; + } + + if (*var_name != '(') { + memprintf(err, "invalid or incomplete action '%s'. Expects 'set-var(<var-name>)', 'set-var-fmt(<var-name>)' or 'unset-var(<var-name>)'", + args[*arg-1]); + return ACT_RET_PRS_ERR; + } + var_name++; /* jump the '(' */ + var_len = strlen(var_name); + var_len--; /* remove the ')' */ + if (var_name[var_len] != ')') { + memprintf(err, "incomplete argument after action '%s'. Expects 'set-var(<var-name>)', 'set-var-fmt(<var-name>)' or 'unset-var(<var-name>)'", + args[*arg-1]); + return ACT_RET_PRS_ERR; + } + + /* Parse the optional conditions. */ + var = ist2(var_name, var_len); + varname_ist = istsplit(&var, ','); + var_len = istlen(varname_ist); + + condition = istsplit(&var, ','); + + if (istlen(condition) && set_var == 0) { + memprintf(err, "unset-var does not expect parameters after the variable name. Only \"set-var\" and \"set-var-fmt\" manage conditions"); + return ACT_RET_PRS_ERR; + } + + while (istlen(condition)) { + struct buffer cond = {}; + + chunk_initlen(&cond, istptr(condition), 0, istlen(condition)); + if (vars_parse_cond_param(&cond, &rule->arg.vars.conditions, err) == 0) + return ACT_RET_PRS_ERR; + + condition = istsplit(&var, ','); + } + + LIST_INIT(&rule->arg.vars.fmt); + if (!vars_hash_name(var_name, var_len, &rule->arg.vars.scope, &rule->arg.vars.name_hash, err)) + return ACT_RET_PRS_ERR; + + if (rule->arg.vars.scope == SCOPE_PROC && + !var_set(rule->arg.vars.name_hash, rule->arg.vars.scope, &empty_smp, VF_CREATEONLY|VF_PERMANENT)) + return 0; + + /* There is no fetch method when variable is unset. Just set the right + * action and return. 
*/ + if (!set_var) { + rule->action = ACT_CUSTOM; + rule->action_ptr = action_clear; + rule->release_ptr = release_store_rule; + return ACT_RET_PRS_OK; + } + + kw_name = args[*arg-1]; + + switch (rule->from) { + case ACT_F_TCP_REQ_CON: + flags = SMP_VAL_FE_CON_ACC; + px->conf.args.ctx = ARGC_TCO; + break; + case ACT_F_TCP_REQ_SES: + flags = SMP_VAL_FE_SES_ACC; + px->conf.args.ctx = ARGC_TSE; + break; + case ACT_F_TCP_REQ_CNT: + if (px->cap & PR_CAP_FE) + flags |= SMP_VAL_FE_REQ_CNT; + if (px->cap & PR_CAP_BE) + flags |= SMP_VAL_BE_REQ_CNT; + px->conf.args.ctx = ARGC_TRQ; + break; + case ACT_F_TCP_RES_CNT: + if (px->cap & PR_CAP_FE) + flags |= SMP_VAL_FE_RES_CNT; + if (px->cap & PR_CAP_BE) + flags |= SMP_VAL_BE_RES_CNT; + px->conf.args.ctx = ARGC_TRS; + break; + case ACT_F_HTTP_REQ: + if (px->cap & PR_CAP_FE) + flags |= SMP_VAL_FE_HRQ_HDR; + if (px->cap & PR_CAP_BE) + flags |= SMP_VAL_BE_HRQ_HDR; + px->conf.args.ctx = ARGC_HRQ; + break; + case ACT_F_HTTP_RES: + if (px->cap & PR_CAP_FE) + flags |= SMP_VAL_FE_HRS_HDR; + if (px->cap & PR_CAP_BE) + flags |= SMP_VAL_BE_HRS_HDR; + px->conf.args.ctx = ARGC_HRS; + break; + case ACT_F_TCP_CHK: + flags = SMP_VAL_BE_CHK_RUL; + px->conf.args.ctx = ARGC_TCK; + break; + case ACT_F_CFG_PARSER: + flags = SMP_VAL_CFG_PARSER; + px->conf.args.ctx = ARGC_CFG; + break; + case ACT_F_CLI_PARSER: + flags = SMP_VAL_CLI_PARSER; + px->conf.args.ctx = ARGC_CLI; + break; + default: + memprintf(err, + "internal error, unexpected rule->from=%d, please report this bug!", + rule->from); + return ACT_RET_PRS_ERR; + } + + if (set_var == 2) { /* set-var-fmt */ + if (!parse_logformat_string(args[*arg], px, &rule->arg.vars.fmt, 0, flags, err)) + return ACT_RET_PRS_ERR; + + (*arg)++; + + /* for late error reporting */ + free(px->conf.lfs_file); + px->conf.lfs_file = strdup(px->conf.args.file); + px->conf.lfs_line = px->conf.args.line; + } else { + /* set-var */ + rule->arg.vars.expr = sample_parse_expr((char **)args, arg, px->conf.args.file, + px->conf.args.line, err, &px->conf.args, NULL); + if (!rule->arg.vars.expr) + return ACT_RET_PRS_ERR; + + if (!(rule->arg.vars.expr->fetch->val & flags)) { + memprintf(err, + "fetch method '%s' extracts information from '%s', none of which is available here", + kw_name, sample_src_names(rule->arg.vars.expr->fetch->use)); + free(rule->arg.vars.expr); + return ACT_RET_PRS_ERR; + } + } + + rule->action = ACT_CUSTOM; + rule->action_ptr = action_store; + rule->release_ptr = release_store_rule; + return ACT_RET_PRS_OK; +} + + +/* parses a global "set-var" directive. It will create a temporary rule and + * expression that are parsed, processed, and released on the fly so that we + * respect the real set-var syntax. These directives take the following format: + * set-var <name> <expression> + * set-var-fmt <name> <fmt> + * Note that parse_store() expects "set-var(name) <expression>" so we have to + * temporarily replace the keyword here. 
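parse_store() extracts the variable name from between the parentheses of the keyword itself and then peels off optional comma-separated conditions. A standalone sketch of that splitting, modifying the buffer in place like the original (split_action_arg is a hypothetical helper; the real parser additionally insists that ')' is the very last character):

#include <stdio.h>
#include <string.h>

/* Split "kw(name[,cond...])" into name and the condition list; the '('
 * and ')' are overwritten/dropped in place. Returns 0 on malformed input.
 */
static int split_action_arg(char *arg, char **name, char **conds)
{
	char *open = strchr(arg, '(');
	char *close = open ? strrchr(open, ')') : NULL;
	char *comma;

	if (!open || !close || close <= open)
		return 0;
	*close = '\0';
	*name = open + 1;
	comma = strchr(*name, ',');
	if (comma)
		*comma++ = '\0';
	*conds = comma;               /* NULL when no condition was given */
	return 1;
}

int main(void)
{
	char arg[] = "set-var(txn.foo,ifexists,ifnotempty)";
	char *name, *conds;

	if (split_action_arg(arg, &name, &conds))
		printf("name=%s conds=%s\n", name, conds ? conds : "(none)");
	/* prints: name=txn.foo conds=ifexists,ifnotempty */
	return 0;
}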
+ */ +static int vars_parse_global_set_var(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + struct proxy px = { + .id = "CFG", + .conf.args = { .file = file, .line = line, }, + }; + struct act_rule rule = { + .arg.vars.scope = SCOPE_PROC, + .from = ACT_F_CFG_PARSER, + .conf = { .file = (char *)file, .line = line, }, + }; + enum obj_type objt = OBJ_TYPE_NONE; + struct session *sess = NULL; + enum act_parse_ret p_ret; + char *old_arg1; + char *tmp_arg1; + int arg = 2; // variable name + int ret = -1; + int use_fmt = 0; + + LIST_INIT(&px.conf.args.list); + + use_fmt = strcmp(args[0], "set-var-fmt") == 0; + + if (!*args[1] || !*args[2]) { + if (use_fmt) + memprintf(err, "'%s' requires a process-wide variable name ('proc.<name>') and a format string.", args[0]); + else + memprintf(err, "'%s' requires a process-wide variable name ('proc.<name>') and a sample expression.", args[0]); + goto end; + } + + tmp_arg1 = NULL; + if (!memprintf(&tmp_arg1, "set-var%s(%s)", use_fmt ? "-fmt" : "", args[1])) + goto end; + + /* parse_store() will always return a message in <err> on error */ + old_arg1 = args[1]; args[1] = tmp_arg1; + p_ret = parse_store((const char **)args, &arg, &px, &rule, err); + free(args[1]); args[1] = old_arg1; + + if (p_ret != ACT_RET_PRS_OK) + goto end; + + if (rule.arg.vars.scope != SCOPE_PROC) { + memprintf(err, "'%s': cannot set variable '%s', only scope 'proc' is permitted in the global section.", args[0], args[1]); + goto end; + } + + if (smp_resolve_args(&px, err) != 0) { + release_sample_expr(rule.arg.vars.expr); + indent_msg(err, 2); + goto end; + } + + if (use_fmt && !(sess = session_new(&px, NULL, &objt))) { + release_sample_expr(rule.arg.vars.expr); + memprintf(err, "'%s': out of memory when trying to set variable '%s' in the global section.", args[0], args[1]); + goto end; + } + + action_store(&rule, &px, sess, NULL, 0); + release_sample_expr(rule.arg.vars.expr); + if (sess) + session_free(sess); + + ret = 0; + end: + return ret; +} + +/* parse CLI's "get var <name>" */ +static int vars_parse_cli_get_var(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct vars *vars; + struct sample smp = { }; + int i; + + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return 1; + + if (!*args[2]) + return cli_err(appctx, "Missing process-wide variable identifier.\n"); + + vars = get_vars(NULL, NULL, SCOPE_PROC); + if (!vars || vars->scope != SCOPE_PROC) + return 0; + + if (!vars_get_by_name(args[2], strlen(args[2]), &smp, NULL)) + return cli_err(appctx, "Variable not found.\n"); + + /* the sample returned by vars_get_by_name() is allocated into a trash + * chunk so we have no constraint to manipulate it. + */ + chunk_printf(&trash, "%s: type=%s value=", args[2], smp_to_type[smp.data.type]); + + if (!sample_casts[smp.data.type][SMP_T_STR] || + !sample_casts[smp.data.type][SMP_T_STR](&smp)) { + chunk_appendf(&trash, "(undisplayable)\n"); + } else { + /* Display the displayable chars*. */ + b_putchr(&trash, '<'); + for (i = 0; i < smp.data.u.str.data; i++) { + if (isprint((unsigned char)smp.data.u.str.area[i])) + b_putchr(&trash, smp.data.u.str.area[i]); + else + b_putchr(&trash, '.'); + } + b_putchr(&trash, '>'); + b_putchr(&trash, '\n'); + b_putchr(&trash, 0); + } + return cli_msg(appctx, LOG_INFO, trash.area); +} + +/* parse CLI's "set var <name>". 
It accepts: + * - set var <name> <expression> + * - set var <name> expr <expression> + * - set var <name> fmt <format> + */ +static int vars_parse_cli_set_var(char **args, char *payload, struct appctx *appctx, void *private) +{ + struct proxy px = { + .id = "CLI", + .conf.args = { .file = "CLI", .line = 0, }, + }; + struct act_rule rule = { + .arg.vars.scope = SCOPE_PROC, + .from = ACT_F_CLI_PARSER, + .conf = { .file = "CLI", .line = 0, }, + }; + enum obj_type objt = OBJ_TYPE_NONE; + struct session *sess = NULL; + enum act_parse_ret p_ret; + const char *tmp_args[3]; + int tmp_arg; + char *tmp_act; + char *err = NULL; + int nberr; + int use_fmt = 0; + + LIST_INIT(&px.conf.args.list); + + if (!cli_has_level(appctx, ACCESS_LVL_OPER)) + return 1; + + if (!*args[2]) + return cli_err(appctx, "Missing process-wide variable identifier.\n"); + + if (!*args[3]) + return cli_err(appctx, "Missing either 'expr', 'fmt' or expression.\n"); + + if (*args[4]) { + /* this is the long format */ + if (strcmp(args[3], "fmt") == 0) + use_fmt = 1; + else if (strcmp(args[3], "expr") != 0) { + memprintf(&err, "'%s %s': arg type must be either 'expr' or 'fmt' but got '%s'.", args[0], args[1], args[3]); + goto fail; + } + } + + tmp_act = NULL; + if (!memprintf(&tmp_act, "set-var%s(%s)", use_fmt ? "-fmt" : "", args[2])) { + memprintf(&err, "memory allocation error."); + goto fail; + } + + /* parse_store() will always return a message in <err> on error */ + tmp_args[0] = tmp_act; + tmp_args[1] = (*args[4]) ? args[4] : args[3]; + tmp_args[2] = ""; + tmp_arg = 1; // must point to the first arg after the action + p_ret = parse_store(tmp_args, &tmp_arg, &px, &rule, &err); + free(tmp_act); + + if (p_ret != ACT_RET_PRS_OK) + goto fail; + + if (rule.arg.vars.scope != SCOPE_PROC) { + memprintf(&err, "'%s %s': cannot set variable '%s', only scope 'proc' is permitted here.", args[0], args[1], args[2]); + goto fail; + } + + err = NULL; + nberr = smp_resolve_args(&px, &err); + if (nberr) { + release_sample_expr(rule.arg.vars.expr); + indent_msg(&err, 2); + goto fail; + } + + if (use_fmt && !(sess = session_new(&px, NULL, &objt))) { + release_sample_expr(rule.arg.vars.expr); + memprintf(&err, "memory allocation error."); + goto fail; + } + + action_store(&rule, &px, sess, NULL, 0); + release_sample_expr(rule.arg.vars.expr); + if (sess) + session_free(sess); + + appctx->st0 = CLI_ST_PROMPT; + return 0; + fail: + return cli_dynerr(appctx, err); +} + +static int vars_max_size(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err, unsigned int *limit) +{ + char *error; + + *limit = strtol(args[1], &error, 10); + if (*error != 0) { + memprintf(err, "%s: '%s' is an invalid size", args[0], args[1]); + return -1; + } + return 0; +} + +static int vars_max_size_global(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + return vars_max_size(args, section_type, curpx, defpx, file, line, err, &var_global_limit); +} + +static int vars_max_size_proc(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + return vars_max_size(args, section_type, curpx, defpx, file, line, err, &var_proc_limit); +} + +static int vars_max_size_sess(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + return vars_max_size(args, section_type, curpx, defpx, file, line, err, 
&var_sess_limit); +} + +static int vars_max_size_txn(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + return vars_max_size(args, section_type, curpx, defpx, file, line, err, &var_txn_limit); +} + +static int vars_max_size_reqres(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + return vars_max_size(args, section_type, curpx, defpx, file, line, err, &var_reqres_limit); +} + +static int vars_max_size_check(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + return vars_max_size(args, section_type, curpx, defpx, file, line, err, &var_check_limit); +} + +/* early boot initialization */ +static void vars_init() +{ + var_name_hash_seed = ha_random64(); + /* Initialize process vars */ + vars_init_head(&proc_vars, SCOPE_PROC); +} + +INITCALL0(STG_PREPARE, vars_init); + +static struct sample_fetch_kw_list sample_fetch_keywords = {ILH, { + + { "var", smp_fetch_var, ARG2(1,STR,STR), smp_check_var, SMP_T_ANY, SMP_USE_CONST }, + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, sample_register_fetches, &sample_fetch_keywords); + +static struct sample_conv_kw_list sample_conv_kws = {ILH, { + { "set-var", smp_conv_store, ARG5(1,STR,STR,STR,STR,STR), conv_check_var, SMP_T_ANY, SMP_T_ANY }, + { "unset-var", smp_conv_clear, ARG1(1,STR), conv_check_var, SMP_T_ANY, SMP_T_ANY }, + { /* END */ }, +}}; + +INITCALL1(STG_REGISTER, sample_register_convs, &sample_conv_kws); + +static struct action_kw_list tcp_req_conn_kws = { { }, { + { "set-var-fmt", parse_store, KWF_MATCH_PREFIX }, + { "set-var", parse_store, KWF_MATCH_PREFIX }, + { "unset-var", parse_store, KWF_MATCH_PREFIX }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, tcp_req_conn_keywords_register, &tcp_req_conn_kws); + +static struct action_kw_list tcp_req_sess_kws = { { }, { + { "set-var-fmt", parse_store, KWF_MATCH_PREFIX }, + { "set-var", parse_store, KWF_MATCH_PREFIX }, + { "unset-var", parse_store, KWF_MATCH_PREFIX }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, tcp_req_sess_keywords_register, &tcp_req_sess_kws); + +static struct action_kw_list tcp_req_cont_kws = { { }, { + { "set-var-fmt", parse_store, KWF_MATCH_PREFIX }, + { "set-var", parse_store, KWF_MATCH_PREFIX }, + { "unset-var", parse_store, KWF_MATCH_PREFIX }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, tcp_req_cont_keywords_register, &tcp_req_cont_kws); + +static struct action_kw_list tcp_res_kws = { { }, { + { "set-var-fmt", parse_store, KWF_MATCH_PREFIX }, + { "set-var", parse_store, KWF_MATCH_PREFIX }, + { "unset-var", parse_store, KWF_MATCH_PREFIX }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, tcp_res_cont_keywords_register, &tcp_res_kws); + +static struct action_kw_list tcp_check_kws = {ILH, { + { "set-var-fmt", parse_store, KWF_MATCH_PREFIX }, + { "set-var", parse_store, KWF_MATCH_PREFIX }, + { "unset-var", parse_store, KWF_MATCH_PREFIX }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, tcp_check_keywords_register, &tcp_check_kws); + +static struct action_kw_list http_req_kws = { { }, { + { "set-var-fmt", parse_store, KWF_MATCH_PREFIX }, + { "set-var", parse_store, KWF_MATCH_PREFIX }, + { "unset-var", parse_store, KWF_MATCH_PREFIX }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, http_req_keywords_register, &http_req_kws); + +static struct action_kw_list http_res_kws = { { }, { + { "set-var-fmt", parse_store, KWF_MATCH_PREFIX }, + { "set-var", 
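vars_max_size() above converts its argument with strtol() and rejects trailing characters by checking the end pointer. A slightly stricter standalone version that also catches empty strings, range overflow and negative values (parse_limit is a hypothetical name, not HAProxy API):

#include <errno.h>
#include <limits.h>
#include <stdlib.h>

/* Parse a decimal size into *out; returns 0 on success, -1 on any error. */
static int parse_limit(const char *s, unsigned int *out)
{
	char *end;
	long v;

	errno = 0;
	v = strtol(s, &end, 10);
	if (end == s || *end != '\0')    /* empty string or trailing garbage */
		return -1;
	if (errno == ERANGE || v < 0 || (unsigned long)v > UINT_MAX)
		return -1;
	*out = (unsigned int)v;
	return 0;
}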
parse_store, KWF_MATCH_PREFIX }, + { "unset-var", parse_store, KWF_MATCH_PREFIX }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, http_res_keywords_register, &http_res_kws); + +static struct action_kw_list http_after_res_kws = { { }, { + { "set-var-fmt", parse_store, KWF_MATCH_PREFIX }, + { "set-var", parse_store, KWF_MATCH_PREFIX }, + { "unset-var", parse_store, KWF_MATCH_PREFIX }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, http_after_res_keywords_register, &http_after_res_kws); + +static struct cfg_kw_list cfg_kws = {{ },{ + { CFG_GLOBAL, "set-var", vars_parse_global_set_var }, + { CFG_GLOBAL, "set-var-fmt", vars_parse_global_set_var }, + { CFG_GLOBAL, "tune.vars.global-max-size", vars_max_size_global }, + { CFG_GLOBAL, "tune.vars.proc-max-size", vars_max_size_proc }, + { CFG_GLOBAL, "tune.vars.sess-max-size", vars_max_size_sess }, + { CFG_GLOBAL, "tune.vars.txn-max-size", vars_max_size_txn }, + { CFG_GLOBAL, "tune.vars.reqres-max-size", vars_max_size_reqres }, + { CFG_GLOBAL, "tune.vars.check-max-size", vars_max_size_check }, + { /* END */ } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); + + +/* register cli keywords */ +static struct cli_kw_list cli_kws = {{ },{ + { { "get", "var", NULL }, "get var <name> : retrieve contents of a process-wide variable", vars_parse_cli_get_var, NULL }, + { { "set", "var", NULL }, "set var <name> [fmt|expr] {<fmt>|<expr>}: set variable from an expression or a format", vars_parse_cli_set_var, NULL, NULL, NULL, ACCESS_EXPERIMENTAL }, + { { NULL }, NULL, NULL, NULL } +}}; +INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws); diff --git a/src/version.c b/src/version.c new file mode 100644 index 0000000..e7bb748 --- /dev/null +++ b/src/version.c @@ -0,0 +1,28 @@ +/* + * Version reporting : all user-visible version information should come from + * this file so that rebuilding only this one is enough to report the latest + * code version. + */ + +#include <haproxy/global.h> +#include <haproxy/version.h> + +/* These ones are made variables and not constants so that they are stored into + * the data region and prominently appear in core files. + */ +char haproxy_version_here[] = "HAProxy version follows"; +char haproxy_version[] = HAPROXY_VERSION; +char haproxy_date[] = HAPROXY_DATE; +char stats_version_string[] = STATS_VERSION_STRING; + +#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) +#define SANITIZE_STRING " with address sanitizer" +#else +#define SANITIZE_STRING "" +#endif + +#if defined(__clang_version__) +REGISTER_BUILD_OPTS("Built with clang compiler version " __clang_version__ "" SANITIZE_STRING); +#elif defined(__VERSION__) +REGISTER_BUILD_OPTS("Built with gcc compiler version " __VERSION__ "" SANITIZE_STRING); +#endif diff --git a/src/wdt.c b/src/wdt.c new file mode 100644 index 0000000..865bb7b --- /dev/null +++ b/src/wdt.c @@ -0,0 +1,193 @@ +/* + * Thread lockup detection + * + * Copyright 2000-2019 Willy Tarreau <willy@haproxy.org>. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <signal.h> +#include <time.h> + +#include <haproxy/api.h> +#include <haproxy/clock.h> +#include <haproxy/debug.h> +#include <haproxy/errors.h> +#include <haproxy/global.h> +#include <haproxy/signal-t.h> +#include <haproxy/thread.h> +#include <haproxy/tools.h> + + +/* + * It relies on timer_create() and timer_settime() which are only available in + * this case. + */ +#if defined(USE_RT) && defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) && defined(_POSIX_THREAD_CPUTIME) + +/* define a dummy value to designate "no timer". Use only 32 bits. */ +#ifndef TIMER_INVALID +#define TIMER_INVALID ((timer_t)(unsigned long)(0xfffffffful)) +#endif + +static timer_t per_thread_wd_timer[MAX_THREADS]; + +/* Setup (or ping) the watchdog timer for thread <thr>. Returns non-zero on + * success, zero on failure. It interrupts once per second of CPU time. It + * happens that timers based on the CPU time are not automatically re-armed + * so we only use the value and leave the interval unset. + */ +int wdt_ping(int thr) +{ + struct itimerspec its; + + its.it_value.tv_sec = 1; its.it_value.tv_nsec = 0; + its.it_interval.tv_sec = 0; its.it_interval.tv_nsec = 0; + return timer_settime(per_thread_wd_timer[thr], 0, &its, NULL) == 0; +} + +/* This is the WDTSIG signal handler */ +void wdt_handler(int sig, siginfo_t *si, void *arg) +{ + unsigned long long n, p; + ulong thr_bit; + int thr, tgrp; + + switch (si->si_code) { + case SI_TIMER: + /* A thread's timer fired, the thread ID is in si_int. We have + * no guarantee that the thread handling this signal is in any + * way related to the one triggering it, so we need to retrieve + * the thread number from there. Note: this thread might + * continue to execute in parallel. + */ + thr = si->si_value.sival_int; + + /* cannot happen unless an unknown timer tries to play with our + * nerves. Let's die for now if this happens. + */ + if (thr < 0 || thr >= global.nbthread) + break; + + tgrp = ha_thread_info[thr].tgid; + thr_bit = ha_thread_info[thr].ltid_bit; + p = ha_thread_ctx[thr].prev_cpu_time; + n = now_cpu_time_thread(thr); + + /* not yet reached the deadline of 1 sec, + * or p wasn't initialized yet + */ + if (!p || n - p < 1000000000UL) + goto update_and_leave; + + if ((_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_SLEEPING) || + (_HA_ATOMIC_LOAD(&ha_tgroup_ctx[tgrp-1].threads_harmless) & thr_bit)) { + /* This thread is currently doing exactly nothing + * waiting in the poll loop (unlikely but possible), + * waiting for all other threads to join the rendez-vous + * point (common), or waiting for another thread to + * finish an isolated operation (unlikely but possible). + */ + goto update_and_leave; + } + + /* So the thread indeed appears locked up. In order to be + * certain that we're not witnessing an exceptional spike of + * CPU usage due to a configuration issue (like running tens + * of thousands of tasks in a single loop), we'll check if the + * scheduler is still alive by setting the TH_FL_STUCK flag + * that the scheduler clears when switching to the next task. + * If it's already set, then it's our second call with no + * progress and the thread is dead. + */ + if (!(_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_STUCK)) { + _HA_ATOMIC_OR(&ha_thread_ctx[thr].flags, TH_FL_STUCK); + goto update_and_leave; + } + + /* No doubt now, there's no hope to recover, die loudly!
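wdt_ping() re-arms a one-shot timer measured against the thread's CPU clock; the interval is left at zero because CPU-time timers are not reliably re-armed automatically, so the handler pings again explicitly. A minimal sketch of creating such a timer with the POSIX APIs, assuming _POSIX_THREAD_CPUTIME support (make_cpu_watchdog and the use of SIGALRM are illustrative; HAProxy routes this through clock_setup_signal_timer() and its own WDTSIG signal; error handling elided, link with -lrt on older glibc):

#define _POSIX_C_SOURCE 200112L
#include <signal.h>
#include <string.h>
#include <time.h>

static timer_t make_cpu_watchdog(int thr_id)
{
	struct sigevent sev;
	struct itimerspec its;
	timer_t t;

	memset(&sev, 0, sizeof(sev));
	sev.sigev_notify = SIGEV_SIGNAL;
	sev.sigev_signo = SIGALRM;          /* illustrative; HAProxy uses WDTSIG */
	sev.sigev_value.sival_int = thr_id; /* recovered via si_value in the handler */
	timer_create(CLOCK_THREAD_CPUTIME_ID, &sev, &t);

	memset(&its, 0, sizeof(its));
	its.it_value.tv_sec = 1;  /* fire after 1s of CPU time; interval stays 0 */
	                          /* so the handler must re-arm, like wdt_ping() */
	timer_settime(t, 0, &its, NULL);
	return t;
}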
+/* This is the WDTSIG signal handler */
+void wdt_handler(int sig, siginfo_t *si, void *arg)
+{
+	unsigned long long n, p;
+	ulong thr_bit;
+	int thr, tgrp;
+
+	switch (si->si_code) {
+	case SI_TIMER:
+		/* A thread's timer fired, the thread ID is in si_int. We have
+		 * no guarantee that the thread handling this signal is in any
+		 * way related to the one triggering it, so we need to retrieve
+		 * the thread number from there. Note: this thread might
+		 * continue to execute in parallel.
+		 */
+		thr = si->si_value.sival_int;
+
+		/* cannot happen unless an unknown timer tries to play with our
+		 * nerves. Let's die for now if this happens.
+		 */
+		if (thr < 0 || thr >= global.nbthread)
+			break;
+
+		tgrp = ha_thread_info[thr].tgid;
+		thr_bit = ha_thread_info[thr].ltid_bit;
+		p = ha_thread_ctx[thr].prev_cpu_time;
+		n = now_cpu_time_thread(thr);
+
+		/* not yet reached the deadline of 1 sec,
+		 * or p wasn't initialized yet
+		 */
+		if (!p || n - p < 1000000000UL)
+			goto update_and_leave;
+
+		if ((_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_SLEEPING) ||
+		    (_HA_ATOMIC_LOAD(&ha_tgroup_ctx[tgrp-1].threads_harmless) & thr_bit)) {
+			/* This thread is currently doing exactly nothing
+			 * waiting in the poll loop (unlikely but possible),
+			 * waiting for all other threads to join the rendez-vous
+			 * point (common), or waiting for another thread to
+			 * finish an isolated operation (unlikely but possible).
+			 */
+			goto update_and_leave;
+		}
+
+		/* So the thread indeed appears locked up. In order to be
+		 * certain that we're not witnessing an exceptional spike of
+		 * CPU usage due to a configuration issue (like running tens
+		 * of thousands of tasks in a single loop), we'll check if the
+		 * scheduler is still alive by setting the TH_FL_STUCK flag
+		 * that the scheduler clears when switching to the next task.
+		 * If it's already set, then it's our second call with no
+		 * progress and the thread is dead.
+		 */
+		if (!(_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_STUCK)) {
+			_HA_ATOMIC_OR(&ha_thread_ctx[thr].flags, TH_FL_STUCK);
+			goto update_and_leave;
+		}
+
+		/* No doubt now, there's no hope to recover, die loudly! */
+		break;
+
+#if defined(USE_THREAD) && defined(SI_TKILL) /* Linux uses this */
+
+	case SI_TKILL:
+		/* we got a pthread_kill, stop on it */
+		thr = tid;
+		break;
+
+#elif defined(USE_THREAD) && defined(SI_LWP) /* FreeBSD uses this */
+
+	case SI_LWP:
+		/* we got a pthread_kill, stop on it */
+		thr = tid;
+		break;
+
+#endif
+	default:
+		/* unhandled other conditions */
+		return;
+	}
+
+	/* By default we terminate. If we're not on the victim thread, better
+	 * bounce the signal there so that we produce a cleaner stack trace
+	 * with the other thread interrupted exactly where it was running and
+	 * the current one not involved in this.
+	 */
+#ifdef USE_THREAD
+	if (thr != tid)
+		ha_tkill(thr, sig);
+	else
+#endif
+		ha_panic();
+	return;
+
+ update_and_leave:
+	wdt_ping(thr);
+}
+
+int init_wdt_per_thread()
+{
+	if (!clock_setup_signal_timer(&per_thread_wd_timer[tid], WDTSIG, tid))
+		goto fail1;
+
+	if (!wdt_ping(tid))
+		goto fail2;
+
+	return 1;
+
+ fail2:
+	timer_delete(per_thread_wd_timer[tid]);
+ fail1:
+	per_thread_wd_timer[tid] = TIMER_INVALID;
+	ha_warning("Failed to setup watchdog timer for thread %u, disabling lockup detection.\n", tid);
+	return 1;
+}
+
+void deinit_wdt_per_thread()
+{
+	if (per_thread_wd_timer[tid] != TIMER_INVALID)
+		timer_delete(per_thread_wd_timer[tid]);
+}
+
+/* registers the watchdog signal handler and returns 0. This sets up the signal
+ * handler for WDTSIG, so it must be called once per process.
+ */
+int init_wdt()
+{
+	struct sigaction sa;
+
+	sa.sa_handler = NULL;
+	sa.sa_sigaction = wdt_handler;
+	sigemptyset(&sa.sa_mask);
+	sa.sa_flags = SA_SIGINFO;
+	sigaction(WDTSIG, &sa, NULL);
+	return ERR_NONE;
+}
+
+REGISTER_POST_CHECK(init_wdt);
+REGISTER_PER_THREAD_INIT(init_wdt_per_thread);
+REGISTER_PER_THREAD_DEINIT(deinit_wdt_per_thread);
+#endif
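The heart of the handler above is a two-strike protocol: the first expiry only sets TH_FL_STUCK, and the thread is declared dead only if the flag is still set one tick later, meaning the scheduler never got a chance to clear it in between. A standalone sketch of that protocol (not part of the patch; flag and function names are invented):

/* demo.c: two-strike lockup detection with an atomically shared flag. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint th_flags;
#define FL_STUCK 0x1u

/* called by the scheduler between tasks, proves forward progress */
static void sched_progress(void)
{
	atomic_fetch_and(&th_flags, ~FL_STUCK);
}

/* called from the watchdog tick; returns 1 when the thread is dead */
static int wd_tick(void)
{
	if (!(atomic_load(&th_flags) & FL_STUCK)) {
		atomic_fetch_or(&th_flags, FL_STUCK); /* first strike */
		return 0;
	}
	return 1;                                     /* second strike */
}

int main(void)
{
	printf("tick 1 -> dead=%d\n", wd_tick()); /* first strike: 0 */
	sched_progress();                         /* scheduler ran */
	printf("tick 2 -> dead=%d\n", wd_tick()); /* flag was cleared: 0 */
	printf("tick 3 -> dead=%d\n", wd_tick()); /* no progress: 1 */
	return 0;
}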
diff --git a/src/xprt_handshake.c b/src/xprt_handshake.c
new file mode 100644
index 0000000..33f7750
--- /dev/null
+++ b/src/xprt_handshake.c
@@ -0,0 +1,299 @@
+/*
+ * Pseudo-xprt to handle any handshake except the SSL handshake
+ *
+ * Copyright 2019 HAProxy Technologies, Olivier Houchard <ohouchard@haproxy.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <haproxy/connection.h>
+
+struct xprt_handshake_ctx {
+	struct connection *conn;
+	struct wait_event *subs;
+	struct wait_event wait_event;
+	const struct xprt_ops *xprt;
+	void *xprt_ctx;
+};
+
+DECLARE_STATIC_POOL(xprt_handshake_ctx_pool, "xprt_handshake_ctx", sizeof(struct xprt_handshake_ctx));
+
+/* This XPRT doesn't take care of sending or receiving data, once its handshake
+ * is done, it just removes itself
+ */
+static size_t xprt_handshake_from_buf(struct connection *conn, void *xprt_ctx, const struct buffer *buf, size_t count, int flags)
+{
+	return 0;
+}
+
+static size_t xprt_handshake_to_buf(struct connection *conn, void *xprt_ctx, struct buffer *buf, size_t count, int flags)
+{
+	return 0;
+}
+
+/* xprt_handshake_io_cb is exported to see it resolved in "show fd" */
+struct task *xprt_handshake_io_cb(struct task *t, void *bctx, unsigned int state)
+{
+	struct xprt_handshake_ctx *ctx = bctx;
+	struct connection *conn = ctx->conn;
+
+	if (conn->flags & CO_FL_SOCKS4_SEND)
+		if (!conn_send_socks4_proxy_request(conn)) {
+			ctx->xprt->subscribe(conn, ctx->xprt_ctx, SUB_RETRY_SEND,
+			                     &ctx->wait_event);
+
+			goto out;
+		}
+
+	if (conn->flags & CO_FL_SOCKS4_RECV)
+		if (!conn_recv_socks4_proxy_response(conn)) {
+			ctx->xprt->subscribe(conn, ctx->xprt_ctx, SUB_RETRY_RECV,
+			                     &ctx->wait_event);
+			goto out;
+		}
+
+	if (conn->flags & CO_FL_ACCEPT_CIP)
+		if (!conn_recv_netscaler_cip(conn, CO_FL_ACCEPT_CIP)) {
+			ctx->xprt->subscribe(conn, ctx->xprt_ctx, SUB_RETRY_RECV,
+			                     &ctx->wait_event);
+			goto out;
+		}
+
+	if (conn->flags & CO_FL_ACCEPT_PROXY)
+		if (!conn_recv_proxy(conn, CO_FL_ACCEPT_PROXY)) {
+			ctx->xprt->subscribe(conn, ctx->xprt_ctx, SUB_RETRY_RECV,
+			                     &ctx->wait_event);
+			goto out;
+		}
+
+	if (conn->flags & CO_FL_SEND_PROXY)
+		if (!conn_send_proxy(conn, CO_FL_SEND_PROXY)) {
+			ctx->xprt->subscribe(conn, ctx->xprt_ctx, SUB_RETRY_SEND,
+			                     &ctx->wait_event);
+			goto out;
+		}
+
+out:
+	/* Wake the stream if we're done with the handshake, or we have a
+	 * connection error
+	 */
+	if ((conn->flags & CO_FL_ERROR) ||
+	    !(conn->flags & CO_FL_HANDSHAKE)) {
+		int ret = 0;
+		int woke = 0;
+		int was_conn_ctx = 0;
+
+		/* On error, wake any waiter */
+		if (ctx->subs) {
+			tasklet_wakeup(ctx->subs->tasklet);
+			ctx->subs->events = 0;
+			woke = 1;
+			ctx->subs = NULL;
+		}
+
+		/* Remove ourself from the xprt chain */
+		if (ctx->wait_event.events != 0)
+			ctx->xprt->unsubscribe(ctx->conn,
+			                       ctx->xprt_ctx,
+			                       ctx->wait_event.events,
+			                       &ctx->wait_event);
+		if (conn->xprt_ctx == ctx) {
+			conn->xprt_ctx = ctx->xprt_ctx;
+			conn->xprt = ctx->xprt;
+			was_conn_ctx = 1;
+		} else
+			conn->xprt->remove_xprt(conn, conn->xprt_ctx, ctx,
+			                        ctx->xprt, ctx->xprt_ctx);
+		/* If we're the first xprt for the connection, let the
+		 * upper layers know. If no mux was set up yet, then call
+		 * conn_create_mux, and if we have a mux, and it has a wake
+		 * method, call it too.
+		 */
+		if (was_conn_ctx) {
+			if (!ctx->conn->mux)
+				ret = conn_create_mux(ctx->conn);
+			if (ret >= 0 && !woke && ctx->conn->mux && ctx->conn->mux->wake)
+				ret = ctx->conn->mux->wake(ctx->conn);
+		}
+		tasklet_free(ctx->wait_event.tasklet);
+		pool_free(xprt_handshake_ctx_pool, ctx);
+		t = NULL;
+	}
+	return t;
+}
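xprt_handshake_io_cb() applies one pattern to every handshake type: try to complete it now and, if it cannot, subscribe for the missing event and return; the poller wakes the tasklet again later and the walk restarts from the top. A standalone sketch of that try-or-subscribe loop (not part of the patch; all types and names are invented, and "remaining" stands in for the I/O that is not ready yet):

/* demo.c: try-or-subscribe control flow, modeled on the callback above. */
#include <stdio.h>

enum { WANT_NOTHING = 0, WANT_RECV = 1, WANT_SEND = 2 };

struct step {
	const char *name;
	int remaining; /* wakeups still needed before completion */
	int wants;     /* event to subscribe to when incomplete */
};

/* returns 1 when complete, 0 when the caller must wait */
static int try_step(struct step *s)
{
	if (s->remaining > 0) {
		s->remaining--;
		return 0;
	}
	return 1;
}

/* one pass of the callback: stop at the first step that must wait */
static int io_cb(struct step *steps, int nb, int *subscribed)
{
	int i;

	for (i = 0; i < nb; i++) {
		if (!try_step(&steps[i])) {
			printf("%s incomplete, subscribing for %d\n",
			       steps[i].name, steps[i].wants);
			*subscribed = steps[i].wants;
			return 0;
		}
	}
	printf("handshake done, removing ourselves\n");
	*subscribed = WANT_NOTHING;
	return 1;
}

int main(void)
{
	struct step steps[] = {
		{ "send SOCKS4 request",  1, WANT_SEND },
		{ "recv SOCKS4 response", 1, WANT_RECV },
	};
	int subs;

	while (!io_cb(steps, 2, &subs))
		; /* a real wakeup would come from the poller */
	return 0;
}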
+static int xprt_handshake_start(struct connection *conn, void *xprt_ctx)
+{
+	struct xprt_handshake_ctx *ctx = xprt_ctx;
+
+	if (ctx->xprt->start) {
+		int ret;
+
+		ret = ctx->xprt->start(conn, ctx->xprt_ctx);
+		if (ret < 0)
+			return ret;
+	}
+	tasklet_wakeup(ctx->wait_event.tasklet);
+
+	return 0;
+}
+
+static int xprt_handshake_init(struct connection *conn, void **xprt_ctx)
+{
+	struct xprt_handshake_ctx *ctx;
+	/* already initialized */
+	if (*xprt_ctx)
+		return 0;
+
+	ctx = pool_alloc(xprt_handshake_ctx_pool);
+	if (!ctx) {
+		conn->err_code = CO_ER_SSL_NO_MEM;
+		return -1;
+	}
+	ctx->conn = conn;
+	ctx->wait_event.tasklet = tasklet_new();
+	if (!ctx->wait_event.tasklet) {
+		conn->err_code = CO_ER_SSL_NO_MEM;
+		pool_free(xprt_handshake_ctx_pool, ctx);
+		return -1;
+	}
+	ctx->wait_event.tasklet->process = xprt_handshake_io_cb;
+	ctx->wait_event.tasklet->context = ctx;
+	ctx->wait_event.events = 0;
+
+	ctx->xprt = NULL;
+	ctx->xprt_ctx = NULL;
+	ctx->subs = NULL;
+	*xprt_ctx = ctx;
+
+	return 0;
+}
+
+static void xprt_handshake_close(struct connection *conn, void *xprt_ctx)
+{
+	struct xprt_handshake_ctx *ctx = xprt_ctx;
+
+	if (ctx) {
+		if (ctx->wait_event.events != 0)
+			ctx->xprt->unsubscribe(ctx->conn, ctx->xprt_ctx,
+			                       ctx->wait_event.events,
+			                       &ctx->wait_event);
+		if (ctx->subs) {
+			ctx->subs->events = 0;
+			tasklet_wakeup(ctx->subs->tasklet);
+		}
+
+		if (ctx->xprt && ctx->xprt->close)
+			ctx->xprt->close(conn, ctx->xprt_ctx);
+		/* Remove any handshake flag, and if we were the connection
+		 * xprt, get back to XPRT_RAW. If we're here because we
+		 * failed an outgoing connection, it will be retried using
+		 * the same struct connection, and as xprt_handshake is a bit
+		 * magic, because it requires a call to add_xprt(), it's better
+		 * to fall back to the original XPRT to re-initiate the
+		 * connection.
+		 */
+		conn->flags &= ~CO_FL_HANDSHAKE;
+		if (conn->xprt == xprt_get(XPRT_HANDSHAKE))
+			conn->xprt = xprt_get(XPRT_RAW);
+		tasklet_free(ctx->wait_event.tasklet);
+		pool_free(xprt_handshake_ctx_pool, ctx);
+	}
+}
+
+/* Called from the upper layer, to subscribe <es> to events <event_type>. The
+ * event subscriber <es> is not allowed to change from a previous call as long
+ * as at least one event is still subscribed. The <event_type> must only be a
+ * combination of SUB_RETRY_RECV and SUB_RETRY_SEND. It always returns 0.
+ */
+static int xprt_handshake_subscribe(struct connection *conn, void *xprt_ctx, int event_type, struct wait_event *es)
+{
+	struct xprt_handshake_ctx *ctx = xprt_ctx;
+
+	BUG_ON(event_type & ~(SUB_RETRY_SEND|SUB_RETRY_RECV));
+	BUG_ON(ctx->subs && ctx->subs != es);
+
+	ctx->subs = es;
+	es->events |= event_type;
+	return 0;
+
+}
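The subscription contract stated in the comment above (a single subscriber whose identity may not change, an event mask restricted to RECV/SEND, detachment once the mask empties) also governs the unsubscribe() that follows, and it fits in a few lines. A standalone sketch with invented names, not part of the patch:

/* demo.c: single-subscriber event registration, as in the pair above. */
#include <assert.h>
#include <stdio.h>

#define RETRY_RECV 0x1
#define RETRY_SEND 0x2

struct waiter { int events; };

static struct waiter *subscriber;

static void subscribe(struct waiter *es, int events)
{
	assert(!(events & ~(RETRY_RECV | RETRY_SEND)));
	assert(!subscriber || subscriber == es); /* single subscriber */
	subscriber = es;
	es->events |= events;
}

static void unsubscribe(struct waiter *es, int events)
{
	assert(subscriber == es);
	es->events &= ~events;
	if (!es->events)
		subscriber = NULL; /* fully detached */
}

int main(void)
{
	struct waiter w = { 0 };

	subscribe(&w, RETRY_RECV);
	subscribe(&w, RETRY_SEND); /* same waiter may add events */
	unsubscribe(&w, RETRY_RECV);
	printf("still waiting for send: %d\n", w.events == RETRY_SEND);
	unsubscribe(&w, RETRY_SEND);
	printf("detached: %d\n", subscriber == NULL);
	return 0;
}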
+/* Called from the upper layer, to unsubscribe <es> from events <event_type>.
+ * The <es> pointer is not allowed to differ from the one passed to the
+ * subscribe() call. It always returns zero.
+ */
+static int xprt_handshake_unsubscribe(struct connection *conn, void *xprt_ctx, int event_type, struct wait_event *es)
+{
+	struct xprt_handshake_ctx *ctx = xprt_ctx;
+
+	BUG_ON(event_type & ~(SUB_RETRY_SEND|SUB_RETRY_RECV));
+	BUG_ON(ctx->subs && ctx->subs != es);
+
+	es->events &= ~event_type;
+	if (!es->events)
+		ctx->subs = NULL;
+
+	return 0;
+}
+
+/* Use the provided XPRT as an underlying XPRT, and provide the old one.
+ * Returns 0 on success, and non-zero on failure.
+ */
+static int xprt_handshake_add_xprt(struct connection *conn, void *xprt_ctx, void *toadd_ctx, const struct xprt_ops *toadd_ops, void **oldxprt_ctx, const struct xprt_ops **oldxprt_ops)
+{
+	struct xprt_handshake_ctx *ctx = xprt_ctx;
+
+	if (oldxprt_ops)
+		*oldxprt_ops = ctx->xprt;
+	if (oldxprt_ctx)
+		*oldxprt_ctx = ctx->xprt_ctx;
+	ctx->xprt = toadd_ops;
+	ctx->xprt_ctx = toadd_ctx;
+
+	return 0;
+}
+
+/* Remove the specified xprt. If it is our underlying XPRT, remove it and
+ * return 0, otherwise just call the remove_xprt method from the underlying
+ * XPRT.
+ */
+static int xprt_handshake_remove_xprt(struct connection *conn, void *xprt_ctx, void *toremove_ctx, const struct xprt_ops *newops, void *newctx)
+{
+	struct xprt_handshake_ctx *ctx = xprt_ctx;
+
+	if (ctx->xprt_ctx == toremove_ctx) {
+		ctx->xprt_ctx = newctx;
+		ctx->xprt = newops;
+		return 0;
+	}
+	return (ctx->xprt->remove_xprt(conn, ctx->xprt_ctx, toremove_ctx, newops, newctx));
+}
+
+struct xprt_ops xprt_handshake = {
+	.snd_buf = xprt_handshake_from_buf,
+	.rcv_buf = xprt_handshake_to_buf,
+	.subscribe = xprt_handshake_subscribe,
+	.unsubscribe = xprt_handshake_unsubscribe,
+	.remove_xprt = xprt_handshake_remove_xprt,
+	.add_xprt = xprt_handshake_add_xprt,
+	.init = xprt_handshake_init,
+	.start = xprt_handshake_start,
+	.close = xprt_handshake_close,
+	.rcv_pipe = NULL,
+	.snd_pipe = NULL,
+	.shutr = NULL,
+	.shutw = NULL,
+	.name = "HS",
+};
+
+static void __xprt_handshake_init(void)
+{
+	xprt_register(XPRT_HANDSHAKE, &xprt_handshake);
+}
+
+INITCALL0(STG_REGISTER, __xprt_handshake_init);
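add_xprt()/remove_xprt() above implement a chain of transport layers: each layer records the ops/context of the layer below it, a new layer is spliced in by saving the old pair, and removal either unlinks a direct child or delegates further down the chain. A standalone sketch of that chaining (not part of the patch; the struct and names are invented):

/* demo.c: splicing and unlinking layers in a transport chain. */
#include <stdio.h>

struct layer {
	const char *name;
	struct layer *below; /* underlying xprt, NULL at the bottom */
};

/* splice <l> on top of <old>, returning the new top of the chain */
static struct layer *add_layer(struct layer *l, struct layer *old)
{
	l->below = old;
	return l;
}

/* remove <victim> anywhere under <top>, relinking around it */
static void remove_layer(struct layer *top, struct layer *victim)
{
	if (top->below == victim)
		top->below = victim->below;        /* direct child: unlink */
	else if (top->below)
		remove_layer(top->below, victim);  /* delegate below */
}

static void dump(const struct layer *l)
{
	for (; l; l = l->below)
		printf("%s%s", l->name, l->below ? " -> " : "\n");
}

int main(void)
{
	struct layer raw = { "RAW", NULL };
	struct layer hs  = { "HS",  NULL };
	struct layer ssl = { "SSL", NULL };
	struct layer *top;

	top = add_layer(&hs, &raw);  /* HS -> RAW */
	top = add_layer(&ssl, top);  /* SSL -> HS -> RAW */
	dump(top);
	remove_layer(top, &hs);      /* handshake done, HS removes itself */
	dump(top);
	return 0;
}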
diff --git a/src/xprt_quic.c b/src/xprt_quic.c
new file mode 100644
index 0000000..eda113c
--- /dev/null
+++ b/src/xprt_quic.c
@@ -0,0 +1,175 @@
+/*
+ * QUIC xprt layer. Act as an abstraction between quic_conn and MUX layers.
+ *
+ * Copyright 2020 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <haproxy/api.h>
+#include <haproxy/connection.h>
+#include <haproxy/quic_conn.h>
+#include <haproxy/ssl_sock.h>
+#include <haproxy/quic_trace.h>
+#include <haproxy/trace.h>
+
+static void quic_close(struct connection *conn, void *xprt_ctx)
+{
+	struct ssl_sock_ctx *conn_ctx = xprt_ctx;
+	struct quic_conn *qc = conn_ctx->qc;
+
+	TRACE_ENTER(QUIC_EV_CONN_CLOSE, qc);
+
+	/* Next application data can be dropped. */
+	qc->mux_state = QC_MUX_RELEASED;
+
+	/* If the quic-conn timer has already expired or if already in "connection close"
+	 * state, free the quic-conn.
+	 */
+	if (qc->flags & (QUIC_FL_CONN_EXP_TIMER|QUIC_FL_CONN_CLOSING)) {
+		quic_conn_release(qc);
+		qc = NULL;
+		goto leave;
+	}
+
+	/* Schedule a CONNECTION_CLOSE emission. If process stopping is in
+	 * progress, quic-conn idle-timer will be scheduled immediately after
+	 * its emission to ensure an immediate connection closing.
+	 */
+	qc_check_close_on_released_mux(qc);
+ leave:
+	TRACE_LEAVE(QUIC_EV_CONN_CLOSE, qc);
+}
+
+/* Called from the upper layer, to subscribe <es> to events <event_type>. The
+ * event subscriber <es> is not allowed to change from a previous call as long
+ * as at least one event is still subscribed. The <event_type> must only be a
+ * combination of SUB_RETRY_RECV and SUB_RETRY_SEND. It always returns 0.
+ */
+static int quic_conn_subscribe(struct connection *conn, void *xprt_ctx, int event_type, struct wait_event *es)
+{
+	struct quic_conn *qc = conn->handle.qc;
+
+	TRACE_ENTER(QUIC_EV_CONN_SUB, qc);
+
+	BUG_ON(event_type & ~(SUB_RETRY_SEND|SUB_RETRY_RECV));
+	BUG_ON(qc->subs && qc->subs != es);
+
+	es->events |= event_type;
+	qc->subs = es;
+
+	/* TODO implement a check_events to detect if subscriber should be
+	 * woken up immediately?
+	 */
+
+	if (event_type & SUB_RETRY_RECV)
+		TRACE_DEVEL("subscribe(recv)", QUIC_EV_CONN_XPRTRECV, qc);
+
+	if (event_type & SUB_RETRY_SEND)
+		TRACE_DEVEL("subscribe(send)", QUIC_EV_CONN_XPRTSEND, qc);
+
+	TRACE_LEAVE(QUIC_EV_CONN_SUB, qc);
+
+	return 0;
+}
+
+/* Called from the upper layer, to unsubscribe <es> from events <event_type>.
+ * The <es> pointer is not allowed to differ from the one passed to the
+ * subscribe() call. It always returns zero.
+ */
+static int quic_conn_unsubscribe(struct connection *conn, void *xprt_ctx, int event_type, struct wait_event *es)
+{
+	struct quic_conn *qc = conn->handle.qc;
+
+	TRACE_ENTER(QUIC_EV_CONN_SUB, qc);
+
+	if (event_type & SUB_RETRY_RECV)
+		TRACE_DEVEL("unsubscribe(recv)", QUIC_EV_CONN_XPRTRECV, qc);
+	if (event_type & SUB_RETRY_SEND)
+		TRACE_DEVEL("unsubscribe(send)", QUIC_EV_CONN_XPRTSEND, qc);
+
+	es->events &= ~event_type;
+	if (!es->events)
+		qc->subs = NULL;
+
+	/* TODO implement ignore_events similar to conn_unsubscribe()? */
+
+	TRACE_LEAVE(QUIC_EV_CONN_SUB, qc);
+
+	return 0;
+}
+
+/* Store in <xprt_ctx> the context attached to <conn>.
+ * Always returns 0.
+ */
+static int qc_conn_init(struct connection *conn, void **xprt_ctx)
+{
+	struct quic_conn *qc = conn->handle.qc;
+
+	TRACE_ENTER(QUIC_EV_CONN_NEW, qc);
+
+	/* Ensure thread connection migration is finalized ASAP. */
+	if (qc->flags & QUIC_FL_CONN_AFFINITY_CHANGED)
+		qc_finalize_affinity_rebind(qc);
+
+	/* do not store the context if already set */
+	if (*xprt_ctx)
+		goto out;
+
+	*xprt_ctx = qc->xprt_ctx;
+
+ out:
+	TRACE_LEAVE(QUIC_EV_CONN_NEW, qc);
+
+	return 0;
+}
+
+/* Start the QUIC transport layer */
+static int qc_xprt_start(struct connection *conn, void *ctx)
+{
+	int ret = 0;
+	struct quic_conn *qc;
+
+	qc = conn->handle.qc;
+	TRACE_ENTER(QUIC_EV_CONN_NEW, qc);
+
+	/* mux-quic can now be considered ready. */
+	qc->mux_state = QC_MUX_READY;
+
+	ret = 1;
+ out:
+	TRACE_LEAVE(QUIC_EV_CONN_NEW, qc);
+	return ret;
+}
+
+static struct ssl_sock_ctx *qc_get_ssl_sock_ctx(struct connection *conn)
+{
+	if (!conn || conn->xprt != xprt_get(XPRT_QUIC) || !conn->handle.qc || !conn->xprt_ctx)
+		return NULL;
+
+	return conn->handle.qc->xprt_ctx;
+}
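Every function in this file brackets its body with TRACE_ENTER()/TRACE_LEAVE() and funnels early exits through a single label, as quic_close() does with its leave: label, so the leave trace can never be skipped. A standalone sketch of that single-exit tracing discipline (not part of the patch; the macros are printf stand-ins, not HAProxy's real trace API):

/* demo.c: paired enter/leave traces with one exit path per function. */
#include <stdio.h>

#define TRACE_ENTER(f) printf("-> %s\n", (f))
#define TRACE_LEAVE(f) printf("<- %s\n", (f))

static int do_close(int already_closing)
{
	int released = 0;

	TRACE_ENTER(__func__);
	if (already_closing) {
		released = 1; /* early exit still goes through leave: */
		goto leave;
	}
	/* ... normal close scheduling would happen here ... */
 leave:
	TRACE_LEAVE(__func__);
	return released;
}

int main(void)
{
	do_close(0);
	do_close(1);
	return 0;
}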
+/* transport-layer operations for QUIC connections. */
+static struct xprt_ops ssl_quic = {
+	.close = quic_close,
+	.subscribe = quic_conn_subscribe,
+	.unsubscribe = quic_conn_unsubscribe,
+	.init = qc_conn_init,
+	.start = qc_xprt_start,
+	.prepare_bind_conf = ssl_sock_prepare_bind_conf,
+	.destroy_bind_conf = ssl_sock_destroy_bind_conf,
+	.get_alpn = ssl_sock_get_alpn,
+	.get_ssl_sock_ctx = qc_get_ssl_sock_ctx,
+	.name = "QUIC",
+};
+
+static void __quic_conn_init(void)
+{
+	xprt_register(XPRT_QUIC, &ssl_quic);
+}
+INITCALL0(STG_REGISTER, __quic_conn_init);
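Both files end the same way: a static xprt_ops table is handed to xprt_register() from an INITCALL at the STG_REGISTER stage, so the rest of the code looks transports up by identifier instead of referencing a module directly. A standalone sketch of that registry pattern (not part of the patch; it uses a GCC/Clang constructor as a stand-in for HAProxy's INITCALL machinery, and every name is invented):

/* demo.c: ops table registered at startup, looked up by id later. */
#include <stdio.h>

enum demo_xprt { DEMO_XPRT_RAW, DEMO_XPRT_HS, DEMO_XPRT_MAX };

struct demo_ops { const char *name; };

static const struct demo_ops *registry[DEMO_XPRT_MAX];

static void demo_register(enum demo_xprt id, const struct demo_ops *ops)
{
	registry[id] = ops;
}

static const struct demo_ops demo_hs = { .name = "HS" };

/* stands in for INITCALL0(STG_REGISTER, __xprt_handshake_init) */
__attribute__((constructor))
static void demo_hs_init(void)
{
	demo_register(DEMO_XPRT_HS, &demo_hs);
}

int main(void)
{
	const struct demo_ops *ops = registry[DEMO_XPRT_HS];

	printf("registered: %s\n", ops ? ops->name : "(none)");
	return 0;
}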