diff options
Diffstat (limited to 'include/linux/reciprocal_div.h')
-rw-r--r-- | include/linux/reciprocal_div.h | 104 |
1 files changed, 104 insertions, 0 deletions
diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h new file mode 100644 index 000000000..585ce89c0 --- /dev/null +++ b/include/linux/reciprocal_div.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RECIPROCAL_DIV_H +#define _LINUX_RECIPROCAL_DIV_H + +#include <linux/types.h> + +/* + * This algorithm is based on the paper "Division by Invariant + * Integers Using Multiplication" by Torbjörn Granlund and Peter + * L. Montgomery. + * + * The assembler implementation from Agner Fog, which this code is + * based on, can be found here: + * http://www.agner.org/optimize/asmlib.zip + * + * This optimization for A/B is helpful if the divisor B is mostly + * runtime invariant. The reciprocal of B is calculated in the + * slow-path with reciprocal_value(). The fast-path can then just use + * a much faster multiplication operation with a variable dividend A + * to calculate the division A/B. + */ + +struct reciprocal_value { + u32 m; + u8 sh1, sh2; +}; + +/* "reciprocal_value" and "reciprocal_divide" together implement the basic + * version of the algorithm described in Figure 4.1 of the paper. + */ +struct reciprocal_value reciprocal_value(u32 d); + +static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R) +{ + u32 t = (u32)(((u64)a * R.m) >> 32); + return (t + ((a - t) >> R.sh1)) >> R.sh2; +} + +struct reciprocal_value_adv { + u32 m; + u8 sh, exp; + bool is_wide_m; +}; + +/* "reciprocal_value_adv" implements the advanced version of the algorithm + * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose + * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The + * exception case could be easily handled before calling "reciprocal_value_adv". + * + * The advanced version requires more complex calculation to get the reciprocal + * multiplier and other control variables, but then could reduce the required + * emulation operations. + * + * It makes no sense to use this advanced version for host divide emulation, + * those extra complexities for calculating multiplier etc could completely + * waive our saving on emulation operations. + * + * However, it makes sense to use it for JIT divide code generation for which + * we are willing to trade performance of JITed code with that of host. As shown + * by the following pseudo code, the required emulation operations could go down + * from 6 (the basic version) to 3 or 4. + * + * To use the result of "reciprocal_value_adv", suppose we want to calculate + * n/d, the pseudo C code will be: + * + * struct reciprocal_value_adv rvalue; + * u8 pre_shift, exp; + * + * // handle exception case. + * if (d >= (1U << 31)) { + * result = n >= d; + * return; + * } + * + * rvalue = reciprocal_value_adv(d, 32) + * exp = rvalue.exp; + * if (rvalue.is_wide_m && !(d & 1)) { + * // floor(log2(d & (2^32 -d))) + * pre_shift = fls(d & -d) - 1; + * rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift); + * } else { + * pre_shift = 0; + * } + * + * // code generation starts. + * if (imm == 1U << exp) { + * result = n >> exp; + * } else if (rvalue.is_wide_m) { + * // pre_shift must be zero when reached here. + * t = (n * rvalue.m) >> 32; + * result = n - t; + * result >>= 1; + * result += t; + * result >>= rvalue.sh - 1; + * } else { + * if (pre_shift) + * result = n >> pre_shift; + * result = ((u64)result * rvalue.m) >> 32; + * result >>= rvalue.sh; + * } + */ +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec); + +#endif /* _LINUX_RECIPROCAL_DIV_H */ |