summaryrefslogtreecommitdiffstats
path: root/net/netfilter/ipvs/ip_vs_est.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/netfilter/ipvs/ip_vs_est.c')
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c951
1 files changed, 951 insertions, 0 deletions
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
new file mode 100644
index 000000000..c5970ba41
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -0,0 +1,951 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * ip_vs_est.c: simple rate estimator for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * Network name space (netns) aware.
+ * Global data moved to netns i.e struct netns_ipvs
+ * Affected data: est_list and est_lock.
+ * estimation_timer() runs with timer per netns.
+ * get_stats()) do the per cpu summing.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/sysctl.h>
+#include <linux/list.h>
+
+#include <net/ip_vs.h>
+
+/*
+ This code is to estimate rate in a shorter interval (such as 8
+ seconds) for virtual services and real servers. For measure rate in a
+ long interval, it is easy to implement a user level daemon which
+ periodically reads those statistical counters and measure rate.
+
+ We measure rate during the last 8 seconds every 2 seconds:
+
+ avgrate = avgrate*(1-W) + rate*W
+
+ where W = 2^(-2)
+
+ NOTES.
+
+ * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10.
+
+ * Netlink users can see 64-bit values but sockopt users are restricted
+ to 32-bit values for conns, packets, bps, cps and pps.
+
+ * A lot of code is taken from net/core/gen_estimator.c
+
+ KEY POINTS:
+ - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled
+ - kthreads read the cpustats to update the estimators (svcs, dests, total)
+ - the states of estimators can be read (get stats) or modified (zero stats)
+ from processes
+
+ KTHREADS:
+ - estimators are added initially to est_temp_list and later kthread 0
+ distributes them to one or many kthreads for estimation
+ - kthread contexts are created and attached to array
+ - the kthread tasks are started when first service is added, before that
+ the total stats are not estimated
+ - when configuration (cpulist/nice) is changed, the tasks are restarted
+ by work (est_reload_work)
+ - kthread tasks are stopped while the cpulist is empty
+ - the kthread context holds lists with estimators (chains) which are
+ processed every 2 seconds
+ - as estimators can be added dynamically and in bursts, we try to spread
+ them to multiple chains which are estimated at different time
+ - on start, kthread 0 enters calculation phase to determine the chain limits
+ and the limit of estimators per kthread
+ - est_add_ktid: ktid where to add new ests, can point to empty slot where
+ we should add kt data
+ */
+
+static struct lock_class_key __ipvs_est_key;
+
+static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs);
+static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs);
+
+static void ip_vs_chain_estimation(struct hlist_head *chain)
+{
+ struct ip_vs_estimator *e;
+ struct ip_vs_cpu_stats *c;
+ struct ip_vs_stats *s;
+ u64 rate;
+
+ hlist_for_each_entry_rcu(e, chain, list) {
+ u64 conns, inpkts, outpkts, inbytes, outbytes;
+ u64 kconns = 0, kinpkts = 0, koutpkts = 0;
+ u64 kinbytes = 0, koutbytes = 0;
+ unsigned int start;
+ int i;
+
+ if (kthread_should_stop())
+ break;
+
+ s = container_of(e, struct ip_vs_stats, est);
+ for_each_possible_cpu(i) {
+ c = per_cpu_ptr(s->cpustats, i);
+ do {
+ start = u64_stats_fetch_begin(&c->syncp);
+ conns = u64_stats_read(&c->cnt.conns);
+ inpkts = u64_stats_read(&c->cnt.inpkts);
+ outpkts = u64_stats_read(&c->cnt.outpkts);
+ inbytes = u64_stats_read(&c->cnt.inbytes);
+ outbytes = u64_stats_read(&c->cnt.outbytes);
+ } while (u64_stats_fetch_retry(&c->syncp, start));
+ kconns += conns;
+ kinpkts += inpkts;
+ koutpkts += outpkts;
+ kinbytes += inbytes;
+ koutbytes += outbytes;
+ }
+
+ spin_lock(&s->lock);
+
+ s->kstats.conns = kconns;
+ s->kstats.inpkts = kinpkts;
+ s->kstats.outpkts = koutpkts;
+ s->kstats.inbytes = kinbytes;
+ s->kstats.outbytes = koutbytes;
+
+ /* scaled by 2^10, but divided 2 seconds */
+ rate = (s->kstats.conns - e->last_conns) << 9;
+ e->last_conns = s->kstats.conns;
+ e->cps += ((s64)rate - (s64)e->cps) >> 2;
+
+ rate = (s->kstats.inpkts - e->last_inpkts) << 9;
+ e->last_inpkts = s->kstats.inpkts;
+ e->inpps += ((s64)rate - (s64)e->inpps) >> 2;
+
+ rate = (s->kstats.outpkts - e->last_outpkts) << 9;
+ e->last_outpkts = s->kstats.outpkts;
+ e->outpps += ((s64)rate - (s64)e->outpps) >> 2;
+
+ /* scaled by 2^5, but divided 2 seconds */
+ rate = (s->kstats.inbytes - e->last_inbytes) << 4;
+ e->last_inbytes = s->kstats.inbytes;
+ e->inbps += ((s64)rate - (s64)e->inbps) >> 2;
+
+ rate = (s->kstats.outbytes - e->last_outbytes) << 4;
+ e->last_outbytes = s->kstats.outbytes;
+ e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
+ spin_unlock(&s->lock);
+ }
+}
+
+static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row)
+{
+ struct ip_vs_est_tick_data *td;
+ int cid;
+
+ rcu_read_lock();
+ td = rcu_dereference(kd->ticks[row]);
+ if (!td)
+ goto out;
+ for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) {
+ if (kthread_should_stop())
+ break;
+ ip_vs_chain_estimation(&td->chains[cid]);
+ cond_resched_rcu();
+ td = rcu_dereference(kd->ticks[row]);
+ if (!td)
+ break;
+ }
+
+out:
+ rcu_read_unlock();
+}
+
+static int ip_vs_estimation_kthread(void *data)
+{
+ struct ip_vs_est_kt_data *kd = data;
+ struct netns_ipvs *ipvs = kd->ipvs;
+ int row = kd->est_row;
+ unsigned long now;
+ int id = kd->id;
+ long gap;
+
+ if (id > 0) {
+ if (!ipvs->est_chain_max)
+ return 0;
+ } else {
+ if (!ipvs->est_chain_max) {
+ ipvs->est_calc_phase = 1;
+ /* commit est_calc_phase before reading est_genid */
+ smp_mb();
+ }
+
+ /* kthread 0 will handle the calc phase */
+ if (ipvs->est_calc_phase)
+ ip_vs_est_calc_phase(ipvs);
+ }
+
+ while (1) {
+ if (!id && !hlist_empty(&ipvs->est_temp_list))
+ ip_vs_est_drain_temp_list(ipvs);
+ set_current_state(TASK_IDLE);
+ if (kthread_should_stop())
+ break;
+
+ /* before estimation, check if we should sleep */
+ now = jiffies;
+ gap = kd->est_timer - now;
+ if (gap > 0) {
+ if (gap > IPVS_EST_TICK) {
+ kd->est_timer = now - IPVS_EST_TICK;
+ gap = IPVS_EST_TICK;
+ }
+ schedule_timeout(gap);
+ } else {
+ __set_current_state(TASK_RUNNING);
+ if (gap < -8 * IPVS_EST_TICK)
+ kd->est_timer = now;
+ }
+
+ if (kd->tick_len[row])
+ ip_vs_tick_estimation(kd, row);
+
+ row++;
+ if (row >= IPVS_EST_NTICKS)
+ row = 0;
+ WRITE_ONCE(kd->est_row, row);
+ kd->est_timer += IPVS_EST_TICK;
+ }
+ __set_current_state(TASK_RUNNING);
+
+ return 0;
+}
+
+/* Schedule stop/start for kthread tasks */
+void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
+{
+ /* Ignore reloads before first service is added */
+ if (!ipvs->enable)
+ return;
+ ip_vs_est_stopped_recalc(ipvs);
+ /* Bump the kthread configuration genid */
+ atomic_inc(&ipvs->est_genid);
+ queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
+}
+
+/* Start kthread task with current configuration */
+int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
+ struct ip_vs_est_kt_data *kd)
+{
+ unsigned long now;
+ int ret = 0;
+ long gap;
+
+ lockdep_assert_held(&ipvs->est_mutex);
+
+ if (kd->task)
+ goto out;
+ now = jiffies;
+ gap = kd->est_timer - now;
+ /* Sync est_timer if task is starting later */
+ if (abs(gap) > 4 * IPVS_EST_TICK)
+ kd->est_timer = now;
+ kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d",
+ ipvs->gen, kd->id);
+ if (IS_ERR(kd->task)) {
+ ret = PTR_ERR(kd->task);
+ kd->task = NULL;
+ goto out;
+ }
+
+ set_user_nice(kd->task, sysctl_est_nice(ipvs));
+ set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs));
+
+ pr_info("starting estimator thread %d...\n", kd->id);
+ wake_up_process(kd->task);
+
+out:
+ return ret;
+}
+
+void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd)
+{
+ if (kd->task) {
+ pr_info("stopping estimator thread %d...\n", kd->id);
+ kthread_stop(kd->task);
+ kd->task = NULL;
+ }
+}
+
+/* Apply parameters to kthread */
+static void ip_vs_est_set_params(struct netns_ipvs *ipvs,
+ struct ip_vs_est_kt_data *kd)
+{
+ kd->chain_max = ipvs->est_chain_max;
+ /* We are using single chain on RCU preemption */
+ if (IPVS_EST_TICK_CHAINS == 1)
+ kd->chain_max *= IPVS_EST_CHAIN_FACTOR;
+ kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max;
+ kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max;
+}
+
+/* Create and start estimation kthread in a free or new array slot */
+static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
+{
+ struct ip_vs_est_kt_data *kd = NULL;
+ int id = ipvs->est_kt_count;
+ int ret = -ENOMEM;
+ void *arr = NULL;
+ int i;
+
+ if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
+ ipvs->enable && ipvs->est_max_threads)
+ return -EINVAL;
+
+ mutex_lock(&ipvs->est_mutex);
+
+ for (i = 0; i < id; i++) {
+ if (!ipvs->est_kt_arr[i])
+ break;
+ }
+ if (i >= id) {
+ arr = krealloc_array(ipvs->est_kt_arr, id + 1,
+ sizeof(struct ip_vs_est_kt_data *),
+ GFP_KERNEL);
+ if (!arr)
+ goto out;
+ ipvs->est_kt_arr = arr;
+ } else {
+ id = i;
+ }
+
+ kd = kzalloc(sizeof(*kd), GFP_KERNEL);
+ if (!kd)
+ goto out;
+ kd->ipvs = ipvs;
+ bitmap_fill(kd->avail, IPVS_EST_NTICKS);
+ kd->est_timer = jiffies;
+ kd->id = id;
+ ip_vs_est_set_params(ipvs, kd);
+
+ /* Pre-allocate stats used in calc phase */
+ if (!id && !kd->calc_stats) {
+ kd->calc_stats = ip_vs_stats_alloc();
+ if (!kd->calc_stats)
+ goto out;
+ }
+
+ /* Start kthread tasks only when services are present */
+ if (ipvs->enable && !ip_vs_est_stopped(ipvs)) {
+ ret = ip_vs_est_kthread_start(ipvs, kd);
+ if (ret < 0)
+ goto out;
+ }
+
+ if (arr)
+ ipvs->est_kt_count++;
+ ipvs->est_kt_arr[id] = kd;
+ kd = NULL;
+ /* Use most recent kthread for new ests */
+ ipvs->est_add_ktid = id;
+ ret = 0;
+
+out:
+ mutex_unlock(&ipvs->est_mutex);
+ if (kd) {
+ ip_vs_stats_free(kd->calc_stats);
+ kfree(kd);
+ }
+
+ return ret;
+}
+
+/* Select ktid where to add new ests: available, unused or new slot */
+static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs)
+{
+ int ktid, best = ipvs->est_kt_count;
+ struct ip_vs_est_kt_data *kd;
+
+ for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) {
+ kd = ipvs->est_kt_arr[ktid];
+ if (kd) {
+ if (kd->est_count < kd->est_max_count) {
+ best = ktid;
+ break;
+ }
+ } else if (ktid < best) {
+ best = ktid;
+ }
+ }
+ ipvs->est_add_ktid = best;
+}
+
+/* Add estimator to current kthread (est_add_ktid) */
+static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs,
+ struct ip_vs_estimator *est)
+{
+ struct ip_vs_est_kt_data *kd = NULL;
+ struct ip_vs_est_tick_data *td;
+ int ktid, row, crow, cid, ret;
+ int delay = est->ktrow;
+
+ BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127,
+ "Too many chains for ktcid");
+
+ if (ipvs->est_add_ktid < ipvs->est_kt_count) {
+ kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
+ if (kd)
+ goto add_est;
+ }
+
+ ret = ip_vs_est_add_kthread(ipvs);
+ if (ret < 0)
+ goto out;
+ kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
+
+add_est:
+ ktid = kd->id;
+ /* For small number of estimators prefer to use few ticks,
+ * otherwise try to add into the last estimated row.
+ * est_row and add_row point after the row we should use
+ */
+ if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1)
+ crow = READ_ONCE(kd->est_row);
+ else
+ crow = kd->add_row;
+ crow += delay;
+ if (crow >= IPVS_EST_NTICKS)
+ crow -= IPVS_EST_NTICKS;
+ /* Assume initial delay ? */
+ if (delay >= IPVS_EST_NTICKS - 1) {
+ /* Preserve initial delay or decrease it if no space in tick */
+ row = crow;
+ if (crow < IPVS_EST_NTICKS - 1) {
+ crow++;
+ row = find_last_bit(kd->avail, crow);
+ }
+ if (row >= crow)
+ row = find_last_bit(kd->avail, IPVS_EST_NTICKS);
+ } else {
+ /* Preserve delay or increase it if no space in tick */
+ row = IPVS_EST_NTICKS;
+ if (crow > 0)
+ row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow);
+ if (row >= IPVS_EST_NTICKS)
+ row = find_first_bit(kd->avail, IPVS_EST_NTICKS);
+ }
+
+ td = rcu_dereference_protected(kd->ticks[row], 1);
+ if (!td) {
+ td = kzalloc(sizeof(*td), GFP_KERNEL);
+ if (!td) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ rcu_assign_pointer(kd->ticks[row], td);
+ }
+
+ cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS);
+
+ kd->est_count++;
+ kd->tick_len[row]++;
+ if (!td->chain_len[cid])
+ __set_bit(cid, td->present);
+ td->chain_len[cid]++;
+ est->ktid = ktid;
+ est->ktrow = row;
+ est->ktcid = cid;
+ hlist_add_head_rcu(&est->list, &td->chains[cid]);
+
+ if (td->chain_len[cid] >= kd->chain_max) {
+ __set_bit(cid, td->full);
+ if (kd->tick_len[row] >= kd->tick_max)
+ __clear_bit(row, kd->avail);
+ }
+
+ /* Update est_add_ktid to point to first available/empty kt slot */
+ if (kd->est_count == kd->est_max_count)
+ ip_vs_est_update_ktid(ipvs);
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/* Start estimation for stats */
+int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *est = &stats->est;
+ int ret;
+
+ if (!ipvs->est_max_threads && ipvs->enable)
+ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
+
+ est->ktid = -1;
+ est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */
+
+ /* We prefer this code to be short, kthread 0 will requeue the
+ * estimator to available chain. If tasks are disabled, we
+ * will not allocate much memory, just for kt 0.
+ */
+ ret = 0;
+ if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
+ ret = ip_vs_est_add_kthread(ipvs);
+ if (ret >= 0)
+ hlist_add_head(&est->list, &ipvs->est_temp_list);
+ else
+ INIT_HLIST_NODE(&est->list);
+ return ret;
+}
+
+static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
+{
+ if (kd) {
+ if (kd->task) {
+ pr_info("stop unused estimator thread %d...\n", kd->id);
+ kthread_stop(kd->task);
+ }
+ ip_vs_stats_free(kd->calc_stats);
+ kfree(kd);
+ }
+}
+
+/* Unlink estimator from chain */
+void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *est = &stats->est;
+ struct ip_vs_est_tick_data *td;
+ struct ip_vs_est_kt_data *kd;
+ int ktid = est->ktid;
+ int row = est->ktrow;
+ int cid = est->ktcid;
+
+ /* Failed to add to chain ? */
+ if (hlist_unhashed(&est->list))
+ return;
+
+ /* On return, estimator can be freed, dequeue it now */
+
+ /* In est_temp_list ? */
+ if (ktid < 0) {
+ hlist_del(&est->list);
+ goto end_kt0;
+ }
+
+ hlist_del_rcu(&est->list);
+ kd = ipvs->est_kt_arr[ktid];
+ td = rcu_dereference_protected(kd->ticks[row], 1);
+ __clear_bit(cid, td->full);
+ td->chain_len[cid]--;
+ if (!td->chain_len[cid])
+ __clear_bit(cid, td->present);
+ kd->tick_len[row]--;
+ __set_bit(row, kd->avail);
+ if (!kd->tick_len[row]) {
+ RCU_INIT_POINTER(kd->ticks[row], NULL);
+ kfree_rcu(td, rcu_head);
+ }
+ kd->est_count--;
+ if (kd->est_count) {
+ /* This kt slot can become available just now, prefer it */
+ if (ktid < ipvs->est_add_ktid)
+ ipvs->est_add_ktid = ktid;
+ return;
+ }
+
+ if (ktid > 0) {
+ mutex_lock(&ipvs->est_mutex);
+ ip_vs_est_kthread_destroy(kd);
+ ipvs->est_kt_arr[ktid] = NULL;
+ if (ktid == ipvs->est_kt_count - 1) {
+ ipvs->est_kt_count--;
+ while (ipvs->est_kt_count > 1 &&
+ !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
+ ipvs->est_kt_count--;
+ }
+ mutex_unlock(&ipvs->est_mutex);
+
+ /* This slot is now empty, prefer another available kt slot */
+ if (ktid == ipvs->est_add_ktid)
+ ip_vs_est_update_ktid(ipvs);
+ }
+
+end_kt0:
+ /* kt 0 is freed after all other kthreads and chains are empty */
+ if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
+ kd = ipvs->est_kt_arr[0];
+ if (!kd || !kd->est_count) {
+ mutex_lock(&ipvs->est_mutex);
+ if (kd) {
+ ip_vs_est_kthread_destroy(kd);
+ ipvs->est_kt_arr[0] = NULL;
+ }
+ ipvs->est_kt_count--;
+ mutex_unlock(&ipvs->est_mutex);
+ ipvs->est_add_ktid = 0;
+ }
+ }
+}
+
+/* Register all ests from est_temp_list to kthreads */
+static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
+{
+ struct ip_vs_estimator *est;
+
+ while (1) {
+ int max = 16;
+
+ mutex_lock(&__ip_vs_mutex);
+
+ while (max-- > 0) {
+ est = hlist_entry_safe(ipvs->est_temp_list.first,
+ struct ip_vs_estimator, list);
+ if (est) {
+ if (kthread_should_stop())
+ goto unlock;
+ hlist_del_init(&est->list);
+ if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
+ continue;
+ est->ktid = -1;
+ hlist_add_head(&est->list,
+ &ipvs->est_temp_list);
+ /* Abort, some entries will not be estimated
+ * until next attempt
+ */
+ }
+ goto unlock;
+ }
+ mutex_unlock(&__ip_vs_mutex);
+ cond_resched();
+ }
+
+unlock:
+ mutex_unlock(&__ip_vs_mutex);
+}
+
+/* Calculate limits for all kthreads */
+static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
+{
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ struct ip_vs_est_kt_data *kd;
+ struct hlist_head chain;
+ struct ip_vs_stats *s;
+ int cache_factor = 4;
+ int i, loops, ntest;
+ s32 min_est = 0;
+ ktime_t t1, t2;
+ int max = 8;
+ int ret = 1;
+ s64 diff;
+ u64 val;
+
+ INIT_HLIST_HEAD(&chain);
+ mutex_lock(&__ip_vs_mutex);
+ kd = ipvs->est_kt_arr[0];
+ mutex_unlock(&__ip_vs_mutex);
+ s = kd ? kd->calc_stats : NULL;
+ if (!s)
+ goto out;
+ hlist_add_head(&s->est.list, &chain);
+
+ loops = 1;
+ /* Get best result from many tests */
+ for (ntest = 0; ntest < 12; ntest++) {
+ if (!(ntest & 3)) {
+ /* Wait for cpufreq frequency transition */
+ wait_event_idle_timeout(wq, kthread_should_stop(),
+ HZ / 50);
+ if (!ipvs->enable || kthread_should_stop())
+ goto stop;
+ }
+
+ local_bh_disable();
+ rcu_read_lock();
+
+ /* Put stats in cache */
+ ip_vs_chain_estimation(&chain);
+
+ t1 = ktime_get();
+ for (i = loops * cache_factor; i > 0; i--)
+ ip_vs_chain_estimation(&chain);
+ t2 = ktime_get();
+
+ rcu_read_unlock();
+ local_bh_enable();
+
+ if (!ipvs->enable || kthread_should_stop())
+ goto stop;
+ cond_resched();
+
+ diff = ktime_to_ns(ktime_sub(t2, t1));
+ if (diff <= 1 * NSEC_PER_USEC) {
+ /* Do more loops on low time resolution */
+ loops *= 2;
+ continue;
+ }
+ if (diff >= NSEC_PER_SEC)
+ continue;
+ val = diff;
+ do_div(val, loops);
+ if (!min_est || val < min_est) {
+ min_est = val;
+ /* goal: 95usec per chain */
+ val = 95 * NSEC_PER_USEC;
+ if (val >= min_est) {
+ do_div(val, min_est);
+ max = (int)val;
+ } else {
+ max = 1;
+ }
+ }
+ }
+
+out:
+ if (s)
+ hlist_del_init(&s->est.list);
+ *chain_max = max;
+ return ret;
+
+stop:
+ ret = 0;
+ goto out;
+}
+
+/* Calculate the parameters and apply them in context of kt #0
+ * ECP: est_calc_phase
+ * ECM: est_chain_max
+ * ECP ECM Insert Chain enable Description
+ * ---------------------------------------------------------------------------
+ * 0 0 est_temp_list 0 create kt #0 context
+ * 0 0 est_temp_list 0->1 service added, start kthread #0 task
+ * 0->1 0 est_temp_list 1 kt task #0 started, enters calc phase
+ * 1 0 est_temp_list 1 kt #0: determine est_chain_max,
+ * stop tasks, move ests to est_temp_list
+ * and free kd for kthreads 1..last
+ * 1->0 0->N kt chains 1 ests can go to kthreads
+ * 0 N kt chains 1 drain est_temp_list, create new kthread
+ * contexts, start tasks, estimate
+ */
+static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
+{
+ int genid = atomic_read(&ipvs->est_genid);
+ struct ip_vs_est_tick_data *td;
+ struct ip_vs_est_kt_data *kd;
+ struct ip_vs_estimator *est;
+ struct ip_vs_stats *stats;
+ int id, row, cid, delay;
+ bool last, last_td;
+ int chain_max;
+ int step;
+
+ if (!ip_vs_est_calc_limits(ipvs, &chain_max))
+ return;
+
+ mutex_lock(&__ip_vs_mutex);
+
+ /* Stop all other tasks, so that we can immediately move the
+ * estimators to est_temp_list without RCU grace period
+ */
+ mutex_lock(&ipvs->est_mutex);
+ for (id = 1; id < ipvs->est_kt_count; id++) {
+ /* netns clean up started, abort */
+ if (!ipvs->enable)
+ goto unlock2;
+ kd = ipvs->est_kt_arr[id];
+ if (!kd)
+ continue;
+ ip_vs_est_kthread_stop(kd);
+ }
+ mutex_unlock(&ipvs->est_mutex);
+
+ /* Move all estimators to est_temp_list but carefully,
+ * all estimators and kthread data can be released while
+ * we reschedule. Even for kthread 0.
+ */
+ step = 0;
+
+ /* Order entries in est_temp_list in ascending delay, so now
+ * walk delay(desc), id(desc), cid(asc)
+ */
+ delay = IPVS_EST_NTICKS;
+
+next_delay:
+ delay--;
+ if (delay < 0)
+ goto end_dequeue;
+
+last_kt:
+ /* Destroy contexts backwards */
+ id = ipvs->est_kt_count;
+
+next_kt:
+ if (!ipvs->enable || kthread_should_stop())
+ goto unlock;
+ id--;
+ if (id < 0)
+ goto next_delay;
+ kd = ipvs->est_kt_arr[id];
+ if (!kd)
+ goto next_kt;
+ /* kt 0 can exist with empty chains */
+ if (!id && kd->est_count <= 1)
+ goto next_delay;
+
+ row = kd->est_row + delay;
+ if (row >= IPVS_EST_NTICKS)
+ row -= IPVS_EST_NTICKS;
+ td = rcu_dereference_protected(kd->ticks[row], 1);
+ if (!td)
+ goto next_kt;
+
+ cid = 0;
+
+walk_chain:
+ if (kthread_should_stop())
+ goto unlock;
+ step++;
+ if (!(step & 63)) {
+ /* Give chance estimators to be added (to est_temp_list)
+ * and deleted (releasing kthread contexts)
+ */
+ mutex_unlock(&__ip_vs_mutex);
+ cond_resched();
+ mutex_lock(&__ip_vs_mutex);
+
+ /* Current kt released ? */
+ if (id >= ipvs->est_kt_count)
+ goto last_kt;
+ if (kd != ipvs->est_kt_arr[id])
+ goto next_kt;
+ /* Current td released ? */
+ if (td != rcu_dereference_protected(kd->ticks[row], 1))
+ goto next_kt;
+ /* No fatal changes on the current kd and td */
+ }
+ est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator,
+ list);
+ if (!est) {
+ cid++;
+ if (cid >= IPVS_EST_TICK_CHAINS)
+ goto next_kt;
+ goto walk_chain;
+ }
+ /* We can cheat and increase est_count to protect kt 0 context
+ * from release but we prefer to keep the last estimator
+ */
+ last = kd->est_count <= 1;
+ /* Do not free kt #0 data */
+ if (!id && last)
+ goto next_delay;
+ last_td = kd->tick_len[row] <= 1;
+ stats = container_of(est, struct ip_vs_stats, est);
+ ip_vs_stop_estimator(ipvs, stats);
+ /* Tasks are stopped, move without RCU grace period */
+ est->ktid = -1;
+ est->ktrow = row - kd->est_row;
+ if (est->ktrow < 0)
+ est->ktrow += IPVS_EST_NTICKS;
+ hlist_add_head(&est->list, &ipvs->est_temp_list);
+ /* kd freed ? */
+ if (last)
+ goto next_kt;
+ /* td freed ? */
+ if (last_td)
+ goto next_kt;
+ goto walk_chain;
+
+end_dequeue:
+ /* All estimators removed while calculating ? */
+ if (!ipvs->est_kt_count)
+ goto unlock;
+ kd = ipvs->est_kt_arr[0];
+ if (!kd)
+ goto unlock;
+ kd->add_row = kd->est_row;
+ ipvs->est_chain_max = chain_max;
+ ip_vs_est_set_params(ipvs, kd);
+
+ pr_info("using max %d ests per chain, %d per kthread\n",
+ kd->chain_max, kd->est_max_count);
+
+ /* Try to keep tot_stats in kt0, enqueue it early */
+ if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) &&
+ ipvs->tot_stats->s.est.ktid == -1) {
+ hlist_del(&ipvs->tot_stats->s.est.list);
+ hlist_add_head(&ipvs->tot_stats->s.est.list,
+ &ipvs->est_temp_list);
+ }
+
+ mutex_lock(&ipvs->est_mutex);
+
+ /* We completed the calc phase, new calc phase not requested */
+ if (genid == atomic_read(&ipvs->est_genid))
+ ipvs->est_calc_phase = 0;
+
+unlock2:
+ mutex_unlock(&ipvs->est_mutex);
+
+unlock:
+ mutex_unlock(&__ip_vs_mutex);
+}
+
+void ip_vs_zero_estimator(struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *est = &stats->est;
+ struct ip_vs_kstats *k = &stats->kstats;
+
+ /* reset counters, caller must hold the stats->lock lock */
+ est->last_inbytes = k->inbytes;
+ est->last_outbytes = k->outbytes;
+ est->last_conns = k->conns;
+ est->last_inpkts = k->inpkts;
+ est->last_outpkts = k->outpkts;
+ est->cps = 0;
+ est->inpps = 0;
+ est->outpps = 0;
+ est->inbps = 0;
+ est->outbps = 0;
+}
+
+/* Get decoded rates */
+void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *e = &stats->est;
+
+ dst->cps = (e->cps + 0x1FF) >> 10;
+ dst->inpps = (e->inpps + 0x1FF) >> 10;
+ dst->outpps = (e->outpps + 0x1FF) >> 10;
+ dst->inbps = (e->inbps + 0xF) >> 5;
+ dst->outbps = (e->outbps + 0xF) >> 5;
+}
+
+int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
+{
+ INIT_HLIST_HEAD(&ipvs->est_temp_list);
+ ipvs->est_kt_arr = NULL;
+ ipvs->est_max_threads = 0;
+ ipvs->est_calc_phase = 0;
+ ipvs->est_chain_max = 0;
+ ipvs->est_kt_count = 0;
+ ipvs->est_add_ktid = 0;
+ atomic_set(&ipvs->est_genid, 0);
+ atomic_set(&ipvs->est_genid_done, 0);
+ __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key);
+ return 0;
+}
+
+void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
+{
+ int i;
+
+ for (i = 0; i < ipvs->est_kt_count; i++)
+ ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]);
+ kfree(ipvs->est_kt_arr);
+ mutex_destroy(&ipvs->est_mutex);
+}