Diffstat (limited to 'kernel/irq/affinity.c')
-rw-r--r-- | kernel/irq/affinity.c | 515
1 file changed, 515 insertions, 0 deletions
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
new file mode 100644
index 000000000..5fb78addf
--- /dev/null
+++ b/kernel/irq/affinity.c
@@ -0,0 +1,515 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2016 Thomas Gleixner.
+ * Copyright (C) 2016-2017 Christoph Hellwig.
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/sort.h>
+
+static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
+                                unsigned int cpus_per_vec)
+{
+        const struct cpumask *siblmsk;
+        int cpu, sibl;
+
+        for ( ; cpus_per_vec > 0; ) {
+                cpu = cpumask_first(nmsk);
+
+                /* Should not happen, but I'm too lazy to think about it */
+                if (cpu >= nr_cpu_ids)
+                        return;
+
+                cpumask_clear_cpu(cpu, nmsk);
+                cpumask_set_cpu(cpu, irqmsk);
+                cpus_per_vec--;
+
+                /* If the cpu has siblings, use them first */
+                siblmsk = topology_sibling_cpumask(cpu);
+                for (sibl = -1; cpus_per_vec > 0; ) {
+                        sibl = cpumask_next(sibl, siblmsk);
+                        if (sibl >= nr_cpu_ids)
+                                break;
+                        if (!cpumask_test_and_clear_cpu(sibl, nmsk))
+                                continue;
+                        cpumask_set_cpu(sibl, irqmsk);
+                        cpus_per_vec--;
+                }
+        }
+}
+
+static cpumask_var_t *alloc_node_to_cpumask(void)
+{
+        cpumask_var_t *masks;
+        int node;
+
+        masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
+        if (!masks)
+                return NULL;
+
+        for (node = 0; node < nr_node_ids; node++) {
+                if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
+                        goto out_unwind;
+        }
+
+        return masks;
+
+out_unwind:
+        while (--node >= 0)
+                free_cpumask_var(masks[node]);
+        kfree(masks);
+        return NULL;
+}
+
+static void free_node_to_cpumask(cpumask_var_t *masks)
+{
+        int node;
+
+        for (node = 0; node < nr_node_ids; node++)
+                free_cpumask_var(masks[node]);
+        kfree(masks);
+}
+
+static void build_node_to_cpumask(cpumask_var_t *masks)
+{
+        int cpu;
+
+        for_each_possible_cpu(cpu)
+                cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
+}
+
+static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
+                                const struct cpumask *mask, nodemask_t *nodemsk)
+{
+        int n, nodes = 0;
+
+        /* Calculate the number of nodes in the supplied affinity mask */
+        for_each_node(n) {
+                if (cpumask_intersects(mask, node_to_cpumask[n])) {
+                        node_set(n, *nodemsk);
+                        nodes++;
+                }
+        }
+        return nodes;
+}
+
+struct node_vectors {
+        unsigned id;
+
+        union {
+                unsigned nvectors;
+                unsigned ncpus;
+        };
+};
+
+static int ncpus_cmp_func(const void *l, const void *r)
+{
+        const struct node_vectors *ln = l;
+        const struct node_vectors *rn = r;
+
+        return ln->ncpus - rn->ncpus;
+}
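As an aside, the sibling-first placement done by irq_spread_init_one() above is easy to model in user space. The sketch below is illustrative only and not part of the patch: the unsigned bitmasks stand in for the kernel's cpumask API, and the pairing of CPU x with CPU x^1 as hyperthread siblings is an assumed topology.

/* sketch.c: user-space model of irq_spread_init_one() (illustrative only) */
#include <stdio.h>

#define NR_CPUS 8

/* Assumed topology: CPU x and CPU x^1 are hyperthread siblings. */
static unsigned sibling_mask(int cpu)
{
        return (1u << (cpu ^ 1)) | (1u << cpu);
}

static int first_cpu(unsigned mask)
{
        for (int c = 0; c < NR_CPUS; c++)
                if (mask & (1u << c))
                        return c;
        return NR_CPUS;         /* like cpu >= nr_cpu_ids */
}

/* Move cpus_per_vec CPUs from *nmsk to *irqmsk, siblings first. */
static void spread_one(unsigned *irqmsk, unsigned *nmsk, int cpus_per_vec)
{
        while (cpus_per_vec > 0) {
                int cpu = first_cpu(*nmsk);

                if (cpu >= NR_CPUS)
                        return;
                *nmsk &= ~(1u << cpu);
                *irqmsk |= 1u << cpu;
                cpus_per_vec--;

                /* Prefer the remaining siblings of the chosen CPU */
                unsigned sibl = sibling_mask(cpu) & *nmsk;

                for (int s = 0; s < NR_CPUS && cpus_per_vec > 0; s++) {
                        if (!(sibl & (1u << s)))
                                continue;
                        *nmsk &= ~(1u << s);
                        *irqmsk |= 1u << s;
                        cpus_per_vec--;
                }
        }
}

int main(void)
{
        unsigned nmsk = 0xff;   /* all 8 CPUs of one node */

        for (int v = 0; v < 4; v++) {
                unsigned irqmsk = 0;

                spread_one(&irqmsk, &nmsk, 2);
                printf("vector %d -> cpumask 0x%02x\n", v, irqmsk);
        }
        /* Each vector gets one full sibling pair: 0x03, 0x0c, 0x30, 0xc0 */
        return 0;
}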
+
+/*
+ * Allocate a vector count for each node, so that for each node:
+ *
+ * 1) the allocated number is >= 1
+ *
+ * 2) the allocated number is <= the number of active CPUs of this node
+ *
+ * The total number of allocated vectors may be less than @numvecs when
+ * the total number of active CPUs is less than @numvecs.
+ *
+ * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
+ * for each node.
+ */
+static void alloc_nodes_vectors(unsigned int numvecs,
+                                cpumask_var_t *node_to_cpumask,
+                                const struct cpumask *cpu_mask,
+                                const nodemask_t nodemsk,
+                                struct cpumask *nmsk,
+                                struct node_vectors *node_vectors)
+{
+        unsigned n, remaining_ncpus = 0;
+
+        for (n = 0; n < nr_node_ids; n++) {
+                node_vectors[n].id = n;
+                node_vectors[n].ncpus = UINT_MAX;
+        }
+
+        for_each_node_mask(n, nodemsk) {
+                unsigned ncpus;
+
+                cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+                ncpus = cpumask_weight(nmsk);
+
+                if (!ncpus)
+                        continue;
+                remaining_ncpus += ncpus;
+                node_vectors[n].ncpus = ncpus;
+        }
+
+        numvecs = min_t(unsigned, remaining_ncpus, numvecs);
+
+        sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
+             ncpus_cmp_func, NULL);
+
+        /*
+         * Allocate vectors for each node according to the ratio of this
+         * node's nr_cpus to the remaining unassigned ncpus. 'numvecs' is
+         * bigger than the number of active NUMA nodes. Always start the
+         * allocation from the node with the smallest nr_cpus.
+         *
+         * This guarantees that each active node gets allocated at least
+         * one vector: over-allocation only happens through the
+         * max(1, ...) clamp, i.e. when a node's proportional share rounds
+         * down to zero, and since 'numvecs' is bigger than the number of
+         * NUMA nodes, the remaining nodes can still be allocated >= 1
+         * vector each.
+         *
+         * The key invariant is that the number of vectors allocated to
+         * each node is <= the CPU count of this node:
+         *
+         * 1) suppose there are two nodes: A and B
+         *         ncpu(X) is the CPU count of node X
+         *         vecs(X) is the vector count allocated to node X via
+         *         this algorithm
+         *
+         *         ncpu(A) <= ncpu(B)
+         *         ncpu(A) + ncpu(B) = N
+         *         vecs(A) + vecs(B) = V
+         *
+         *         vecs(A) = max(1, round_down(V * ncpu(A) / N))
+         *         vecs(B) = V - vecs(A)
+         *
+         *         both N and V are integers, and 2 <= V <= N, suppose
+         *         V = N - delta, and 0 <= delta <= N - 2
+         *
+         * 2) obviously vecs(A) <= ncpu(A) because:
+         *
+         *         if vecs(A) is 1, then vecs(A) <= ncpu(A) given
+         *         ncpu(A) >= 1
+         *
+         *         otherwise,
+         *                 vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
+         *
+         * 3) prove that vecs(B) <= ncpu(B):
+         *
+         *         if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
+         *         over-allocated, so vecs(B) <= ncpu(B),
+         *
+         *         otherwise:
+         *
+         *         vecs(A) =
+         *                 round_down(V * ncpu(A) / N) =
+         *                 round_down((N - delta) * ncpu(A) / N) =
+         *                 round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
+         *                 round_down((N * ncpu(A) - delta * N) / N) =
+         *                 ncpu(A) - delta
+         *
+         *         then:
+         *
+         *         vecs(A) - V >= ncpu(A) - delta - V
+         *         =>
+         *         V - vecs(A) <= V + delta - ncpu(A)
+         *         =>
+         *         vecs(B) <= N - ncpu(A)
+         *         =>
+         *         vecs(B) <= ncpu(B)
+         *
+         * For three or more nodes, the argument reduces to one node versus
+         * one combined big node, which is exactly how this algorithm
+         * proceeds: 'remaining_ncpus' and 'numvecs' are recalculated after
+         * each node, so finally vecs(X) <= ncpu(X) holds for each node X.
+         */
+        for (n = 0; n < nr_node_ids; n++) {
+                unsigned nvectors, ncpus;
+
+                if (node_vectors[n].ncpus == UINT_MAX)
+                        continue;
+
+                WARN_ON_ONCE(numvecs == 0);
+
+                ncpus = node_vectors[n].ncpus;
+                nvectors = max_t(unsigned, 1,
+                                 numvecs * ncpus / remaining_ncpus);
+                WARN_ON_ONCE(nvectors > ncpus);
+
+                node_vectors[n].nvectors = nvectors;
+
+                remaining_ncpus -= ncpus;
+                numvecs -= nvectors;
+        }
+}
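The correctness argument above is easy to replay outside the kernel. The standalone sketch below (illustrative, not part of the patch; the node sizes are made up) runs the same loop over a hypothetical three-node machine, already sorted by CPU count, and checks the invariant vecs(X) <= ncpu(X):

/* alloc_sketch.c: user-space replay of the alloc_nodes_vectors() loop */
#include <assert.h>
#include <stdio.h>

int main(void)
{
        /* Hypothetical: 3 active nodes with 2, 4 and 10 CPUs, 8 vectors */
        unsigned ncpus[] = { 2, 4, 10 };        /* sorted ascending */
        unsigned nnodes = 3;
        unsigned remaining_ncpus = 2 + 4 + 10;
        unsigned numvecs = 8;

        for (unsigned n = 0; n < nnodes; n++) {
                unsigned nvectors = numvecs * ncpus[n] / remaining_ncpus;

                if (nvectors < 1)       /* the max_t(unsigned, 1, ...) clamp */
                        nvectors = 1;

                assert(nvectors <= ncpus[n]);   /* vecs(X) <= ncpu(X) */
                printf("node with %2u CPUs -> %u vectors\n",
                       ncpus[n], nvectors);

                remaining_ncpus -= ncpus[n];
                numvecs -= nvectors;
        }
        /* Prints 1, 2 and 5: all 8 vectors used, none exceeding ncpu(X) */
        return 0;
}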
+
+static int __irq_build_affinity_masks(unsigned int startvec,
+                                      unsigned int numvecs,
+                                      unsigned int firstvec,
+                                      cpumask_var_t *node_to_cpumask,
+                                      const struct cpumask *cpu_mask,
+                                      struct cpumask *nmsk,
+                                      struct irq_affinity_desc *masks)
+{
+        unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
+        unsigned int last_affv = firstvec + numvecs;
+        unsigned int curvec = startvec;
+        nodemask_t nodemsk = NODE_MASK_NONE;
+        struct node_vectors *node_vectors;
+
+        if (!cpumask_weight(cpu_mask))
+                return 0;
+
+        nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
+
+        /*
+         * If the number of nodes in the mask is greater than or equal to
+         * the number of vectors, just spread the vectors across the nodes.
+         */
+        if (numvecs <= nodes) {
+                for_each_node_mask(n, nodemsk) {
+                        /* Ensure that only CPUs which are in both masks are set */
+                        cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+                        cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk);
+                        if (++curvec == last_affv)
+                                curvec = firstvec;
+                }
+                return numvecs;
+        }
+
+        node_vectors = kcalloc(nr_node_ids,
+                               sizeof(struct node_vectors),
+                               GFP_KERNEL);
+        if (!node_vectors)
+                return -ENOMEM;
+
+        /* allocate vector number for each node */
+        alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
+                            nodemsk, nmsk, node_vectors);
+
+        for (i = 0; i < nr_node_ids; i++) {
+                unsigned int ncpus, v;
+                struct node_vectors *nv = &node_vectors[i];
+
+                if (nv->nvectors == UINT_MAX)
+                        continue;
+
+                /* Get the cpus on this node which are in the mask */
+                cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
+                ncpus = cpumask_weight(nmsk);
+                if (!ncpus)
+                        continue;
+
+                WARN_ON_ONCE(nv->nvectors > ncpus);
+
+                /* Account for rounding errors */
+                extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
+
+                /* Spread allocated vectors on CPUs of the current node */
+                for (v = 0; v < nv->nvectors; v++, curvec++) {
+                        cpus_per_vec = ncpus / nv->nvectors;
+
+                        /* Account for extra vectors to compensate rounding errors */
+                        if (extra_vecs) {
+                                cpus_per_vec++;
+                                --extra_vecs;
+                        }
+
+                        /*
+                         * wrapping has to be considered given 'startvec'
+                         * may start anywhere
+                         */
+                        if (curvec >= last_affv)
+                                curvec = firstvec;
+                        irq_spread_init_one(&masks[curvec].mask, nmsk,
+                                            cpus_per_vec);
+                }
+                done += nv->nvectors;
+        }
+        kfree(node_vectors);
+        return done;
+}
+
+/*
+ * build affinity in two stages:
+ *      1) spread present CPUs on these vectors
+ *      2) spread other possible CPUs on these vectors
+ */
+static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
+                                    unsigned int firstvec,
+                                    struct irq_affinity_desc *masks)
+{
+        unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
+        cpumask_var_t *node_to_cpumask;
+        cpumask_var_t nmsk, npresmsk;
+        int ret = -ENOMEM;
+
+        if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+                return ret;
+
+        if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
+                goto fail_nmsk;
+
+        node_to_cpumask = alloc_node_to_cpumask();
+        if (!node_to_cpumask)
+                goto fail_npresmsk;
+
+        /* Stabilize the cpumasks */
+        get_online_cpus();
+        build_node_to_cpumask(node_to_cpumask);
+
+        /* Spread on present CPUs starting from affd->pre_vectors */
+        ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+                                         node_to_cpumask, cpu_present_mask,
+                                         nmsk, masks);
+        if (ret < 0)
+                goto fail_build_affinity;
+        nr_present = ret;
+
+        /*
+         * Spread on non-present CPUs starting from the next vector to be
+         * handled. If the spreading of present CPUs already exhausted the
+         * vector space, assign the non-present CPUs to the already spread
+         * out vectors.
+         */
+        if (nr_present >= numvecs)
+                curvec = firstvec;
+        else
+                curvec = firstvec + nr_present;
+        cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
+        ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+                                         node_to_cpumask, npresmsk, nmsk,
+                                         masks);
+        if (ret >= 0)
+                nr_others = ret;
+
+ fail_build_affinity:
+        put_online_cpus();
+
+        if (ret >= 0)
+                WARN_ON(nr_present + nr_others < numvecs);
+
+        free_node_to_cpumask(node_to_cpumask);
+
+ fail_npresmsk:
+        free_cpumask_var(npresmsk);
+
+ fail_nmsk:
+        free_cpumask_var(nmsk);
+        return ret < 0 ? ret : 0;
+}
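The only subtle part of the two-stage scheme is the wrap-around of curvec back to firstvec: when spreading present CPUs already consumed the whole vector range, the non-present CPUs fold onto the already-spread vectors rather than running past last_affv. A compact user-space model of just the vector cursor follows; all numbers are assumed for illustration and this is not part of the patch:

/* curvec_sketch.c: the curvec wrap-around across the two stages */
#include <stdio.h>

int main(void)
{
        unsigned firstvec = 2, numvecs = 4;     /* vectors 2..5 */
        unsigned last_affv = firstvec + numvecs;
        unsigned nr_present = 4;        /* stage 1 used all 4 vectors */
        unsigned curvec;

        /* Stage 2 start: wrap to firstvec when stage 1 exhausted the range */
        curvec = (nr_present >= numvecs) ? firstvec : firstvec + nr_present;

        /* Assign 6 hypothetical non-present CPU groups, wrapping as needed */
        for (int grp = 0; grp < 6; grp++, curvec++) {
                if (curvec >= last_affv)
                        curvec = firstvec;
                printf("group %d -> vector %u\n", grp, curvec);
        }
        /* Vectors repeat 2,3,4,5,2,3: non-present CPUs share the vectors */
        return 0;
}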
+
+static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
+{
+        affd->nr_sets = 1;
+        affd->set_size[0] = affvecs;
+}
+
+/**
+ * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
+ * @nvecs:      The total number of vectors
+ * @affd:       Description of the affinity requirements
+ *
+ * Returns the irq_affinity_desc pointer or NULL if allocation failed.
+ */
+struct irq_affinity_desc *
+irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
+{
+        unsigned int affvecs, curvec, usedvecs, i;
+        struct irq_affinity_desc *masks = NULL;
+
+        /*
+         * Determine the number of vectors which need interrupt affinities
+         * assigned. If the pre/post request exhausts the available vectors
+         * then there is nothing to do here except invoke the calc_sets()
+         * callback so the device driver can adjust to the situation.
+         */
+        if (nvecs > affd->pre_vectors + affd->post_vectors)
+                affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
+        else
+                affvecs = 0;
+
+        /*
+         * Simple invocations do not provide a calc_sets() callback. Install
+         * the generic one.
+         */
+        if (!affd->calc_sets)
+                affd->calc_sets = default_calc_sets;
+
+        /* Recalculate the sets */
+        affd->calc_sets(affd, affvecs);
+
+        if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
+                return NULL;
+
+        /* Nothing to assign? */
+        if (!affvecs)
+                return NULL;
+
+        masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
+        if (!masks)
+                return NULL;
+
+        /* Fill out vectors at the beginning that don't need affinity */
+        for (curvec = 0; curvec < affd->pre_vectors; curvec++)
+                cpumask_copy(&masks[curvec].mask, irq_default_affinity);
+
+        /*
+         * Spread on present CPUs starting from affd->pre_vectors. If we
+         * have multiple sets, build each set's affinity mask separately.
+         */
+        for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
+                unsigned int this_vecs = affd->set_size[i];
+                int ret;
+
+                ret = irq_build_affinity_masks(curvec, this_vecs,
+                                               curvec, masks);
+                if (ret) {
+                        kfree(masks);
+                        return NULL;
+                }
+                curvec += this_vecs;
+                usedvecs += this_vecs;
+        }
+
+        /* Fill out vectors at the end that don't need affinity */
+        if (usedvecs >= affvecs)
+                curvec = affd->pre_vectors + affvecs;
+        else
+                curvec = affd->pre_vectors + usedvecs;
+        for (; curvec < nvecs; curvec++)
+                cpumask_copy(&masks[curvec].mask, irq_default_affinity);
+
+        /* Mark the managed interrupts */
+        for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
+                masks[i].is_managed = 1;
+
+        return masks;
+}
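The resulting masks array has three regions: affd->pre_vectors entries with default affinity, then the spread affinity sets, then the trailing entries up to nvecs with default affinity again; only the middle region is marked managed. A quick sketch of that index arithmetic with assumed numbers (not part of the patch):

/* layout_sketch.c: index ranges produced by irq_create_affinity_masks() */
#include <stdio.h>

int main(void)
{
        /* Hypothetical request: 16 vectors, 1 pre and 2 post vectors */
        unsigned nvecs = 16, pre = 1, post = 2;
        unsigned affvecs = (nvecs > pre + post) ? nvecs - pre - post : 0;

        printf("default affinity : vectors 0..%u\n", pre - 1);
        printf("spread + managed : vectors %u..%u (%u vectors)\n",
               pre, pre + affvecs - 1, affvecs);
        printf("default affinity : vectors %u..%u\n",
               pre + affvecs, nvecs - 1);
        /* Prints 0..0, 1..13 (13 vectors) and 14..15 */
        return 0;
}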
+
+/**
+ * irq_calc_affinity_vectors - Calculate the optimal number of vectors
+ * @minvec:     The minimum number of vectors available
+ * @maxvec:     The maximum number of vectors available
+ * @affd:       Description of the affinity requirements
+ */
+unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
+                                       const struct irq_affinity *affd)
+{
+        unsigned int resv = affd->pre_vectors + affd->post_vectors;
+        unsigned int set_vecs;
+
+        if (resv > minvec)
+                return 0;
+
+        if (affd->calc_sets) {
+                set_vecs = maxvec - resv;
+        } else {
+                get_online_cpus();
+                set_vecs = cpumask_weight(cpu_possible_mask);
+                put_online_cpus();
+        }
+
+        return resv + min(set_vecs, maxvec - resv);
+}
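The return value is simply resv + min(set_vecs, maxvec - resv), clamped to zero when the reserved pre/post vectors alone exceed minvec. Worked numbers for the case without a calc_sets() callback, where set_vecs is the possible-CPU count (all values assumed for illustration, not part of the patch):

/* calc_sketch.c: the clamping done by irq_calc_affinity_vectors() */
#include <stdio.h>

static unsigned calc(unsigned minvec, unsigned maxvec,
                     unsigned pre, unsigned post, unsigned possible_cpus)
{
        unsigned resv = pre + post;
        unsigned set_vecs;

        if (resv > minvec)
                return 0;       /* cannot even cover the reserved vectors */

        /* no calc_sets() callback: at most one vector per possible CPU */
        set_vecs = possible_cpus;

        return resv + (set_vecs < maxvec - resv ? set_vecs : maxvec - resv);
}

int main(void)
{
        /* 1 pre + 2 post reserved, 32 possible CPUs */
        printf("%u\n", calc(64, 64, 1, 2, 32)); /* 35: CPU count limits it */
        printf("%u\n", calc( 8, 16, 1, 2, 32)); /* 16: maxvec limits it    */
        printf("%u\n", calc( 2, 64, 1, 2, 32)); /*  0: resv > minvec       */
        return 0;
}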