diff options
Diffstat (limited to 'src/cmd/compile/internal/ssa/magic.go')
-rw-r--r-- | src/cmd/compile/internal/ssa/magic.go | 426 |
1 files changed, 426 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/ssa/magic.go b/src/cmd/compile/internal/ssa/magic.go new file mode 100644 index 0000000..df4b568 --- /dev/null +++ b/src/cmd/compile/internal/ssa/magic.go @@ -0,0 +1,426 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ssa + +import ( + "math/big" + "math/bits" +) + +// So you want to compute x / c for some constant c? +// Machine division instructions are slow, so we try to +// compute this division with a multiplication + a few +// other cheap instructions instead. +// (We assume here that c != 0, +/- 1, or +/- 2^i. Those +// cases are easy to handle in different ways). + +// Technique from https://gmplib.org/~tege/divcnst-pldi94.pdf + +// First consider unsigned division. +// Our strategy is to precompute 1/c then do +// ⎣x / c⎦ = ⎣x * (1/c)⎦. +// 1/c is less than 1, so we can't compute it directly in +// integer arithmetic. Let's instead compute 2^e/c +// for a value of e TBD (^ = exponentiation). Then +// ⎣x / c⎦ = ⎣x * (2^e/c) / 2^e⎦. +// Dividing by 2^e is easy. 2^e/c isn't an integer, unfortunately. +// So we must approximate it. Let's call its approximation m. +// We'll then compute +// ⎣x * m / 2^e⎦ +// Which we want to be equal to ⎣x / c⎦ for 0 <= x < 2^n-1 +// where n is the word size. +// Setting x = c gives us c * m >= 2^e. +// We'll chose m = ⎡2^e/c⎤ to satisfy that equation. +// What remains is to choose e. +// Let m = 2^e/c + delta, 0 <= delta < 1 +// ⎣x * (2^e/c + delta) / 2^e⎦ +// ⎣x / c + x * delta / 2^e⎦ +// We must have x * delta / 2^e < 1/c so that this +// additional term never rounds differently than ⎣x / c⎦ does. +// Rearranging, +// 2^e > x * delta * c +// x can be at most 2^n-1 and delta can be at most 1. +// So it is sufficient to have 2^e >= 2^n*c. +// So we'll choose e = n + s, with s = ⎡log2(c)⎤. +// +// An additional complication arises because m has n+1 bits in it. +// Hardware restricts us to n bit by n bit multiplies. +// We divide into 3 cases: +// +// Case 1: m is even. +// ⎣x / c⎦ = ⎣x * m / 2^(n+s)⎦ +// ⎣x / c⎦ = ⎣x * (m/2) / 2^(n+s-1)⎦ +// ⎣x / c⎦ = ⎣x * (m/2) / 2^n / 2^(s-1)⎦ +// ⎣x / c⎦ = ⎣⎣x * (m/2) / 2^n⎦ / 2^(s-1)⎦ +// multiply + shift +// +// Case 2: c is even. +// ⎣x / c⎦ = ⎣(x/2) / (c/2)⎦ +// ⎣x / c⎦ = ⎣⎣x/2⎦ / (c/2)⎦ +// This is just the original problem, with x' = ⎣x/2⎦, c' = c/2, n' = n-1. +// s' = s-1 +// m' = ⎡2^(n'+s')/c'⎤ +// = ⎡2^(n+s-1)/c⎤ +// = ⎡m/2⎤ +// ⎣x / c⎦ = ⎣x' * m' / 2^(n'+s')⎦ +// ⎣x / c⎦ = ⎣⎣x/2⎦ * ⎡m/2⎤ / 2^(n+s-2)⎦ +// ⎣x / c⎦ = ⎣⎣⎣x/2⎦ * ⎡m/2⎤ / 2^n⎦ / 2^(s-2)⎦ +// shift + multiply + shift +// +// Case 3: everything else +// let k = m - 2^n. k fits in n bits. +// ⎣x / c⎦ = ⎣x * m / 2^(n+s)⎦ +// ⎣x / c⎦ = ⎣x * (2^n + k) / 2^(n+s)⎦ +// ⎣x / c⎦ = ⎣(x + x * k / 2^n) / 2^s⎦ +// ⎣x / c⎦ = ⎣(x + ⎣x * k / 2^n⎦) / 2^s⎦ +// ⎣x / c⎦ = ⎣(x + ⎣x * k / 2^n⎦) / 2^s⎦ +// ⎣x / c⎦ = ⎣⎣(x + ⎣x * k / 2^n⎦) / 2⎦ / 2^(s-1)⎦ +// multiply + avg + shift +// +// These can be implemented in hardware using: +// ⎣a * b / 2^n⎦ - aka high n bits of an n-bit by n-bit multiply. +// ⎣(a+b) / 2⎦ - aka "average" of two n-bit numbers. +// (Not just a regular add & shift because the intermediate result +// a+b has n+1 bits in it. Nevertheless, can be done +// in 2 instructions on x86.) + +// umagicOK reports whether we should strength reduce a n-bit divide by c. +func umagicOK(n uint, c int64) bool { + // Convert from ConstX auxint values to the real uint64 constant they represent. + d := uint64(c) << (64 - n) >> (64 - n) + + // Doesn't work for 0. + // Don't use for powers of 2. + return d&(d-1) != 0 +} + +// umagicOKn reports whether we should strength reduce an unsigned n-bit divide by c. +// We can strength reduce when c != 0 and c is not a power of two. +func umagicOK8(c int8) bool { return c&(c-1) != 0 } +func umagicOK16(c int16) bool { return c&(c-1) != 0 } +func umagicOK32(c int32) bool { return c&(c-1) != 0 } +func umagicOK64(c int64) bool { return c&(c-1) != 0 } + +type umagicData struct { + s int64 // ⎡log2(c)⎤ + m uint64 // ⎡2^(n+s)/c⎤ - 2^n +} + +// umagic computes the constants needed to strength reduce unsigned n-bit divides by the constant uint64(c). +// The return values satisfy for all 0 <= x < 2^n +// +// floor(x / uint64(c)) = x * (m + 2^n) >> (n+s) +func umagic(n uint, c int64) umagicData { + // Convert from ConstX auxint values to the real uint64 constant they represent. + d := uint64(c) << (64 - n) >> (64 - n) + + C := new(big.Int).SetUint64(d) + s := C.BitLen() + M := big.NewInt(1) + M.Lsh(M, n+uint(s)) // 2^(n+s) + M.Add(M, C) // 2^(n+s)+c + M.Sub(M, big.NewInt(1)) // 2^(n+s)+c-1 + M.Div(M, C) // ⎡2^(n+s)/c⎤ + if M.Bit(int(n)) != 1 { + panic("n+1st bit isn't set") + } + M.SetBit(M, int(n), 0) + m := M.Uint64() + return umagicData{s: int64(s), m: m} +} + +func umagic8(c int8) umagicData { return umagic(8, int64(c)) } +func umagic16(c int16) umagicData { return umagic(16, int64(c)) } +func umagic32(c int32) umagicData { return umagic(32, int64(c)) } +func umagic64(c int64) umagicData { return umagic(64, c) } + +// For signed division, we use a similar strategy. +// First, we enforce a positive c. +// x / c = -(x / (-c)) +// This will require an additional Neg op for c<0. +// +// If x is positive we're in a very similar state +// to the unsigned case above. We define: +// s = ⎡log2(c)⎤-1 +// m = ⎡2^(n+s)/c⎤ +// Then +// ⎣x / c⎦ = ⎣x * m / 2^(n+s)⎦ +// If x is negative we have +// ⎡x / c⎤ = ⎣x * m / 2^(n+s)⎦ + 1 +// (TODO: derivation?) +// +// The multiply is a bit odd, as it is a signed n-bit value +// times an unsigned n-bit value. For n smaller than the +// word size, we can extend x and m appropriately and use the +// signed multiply instruction. For n == word size, +// we must use the signed multiply high and correct +// the result by adding x*2^n. +// +// Adding 1 if x<0 is done by subtracting x>>(n-1). + +func smagicOK(n uint, c int64) bool { + if c < 0 { + // Doesn't work for negative c. + return false + } + // Doesn't work for 0. + // Don't use it for powers of 2. + return c&(c-1) != 0 +} + +// smagicOKn reports whether we should strength reduce an signed n-bit divide by c. +func smagicOK8(c int8) bool { return smagicOK(8, int64(c)) } +func smagicOK16(c int16) bool { return smagicOK(16, int64(c)) } +func smagicOK32(c int32) bool { return smagicOK(32, int64(c)) } +func smagicOK64(c int64) bool { return smagicOK(64, c) } + +type smagicData struct { + s int64 // ⎡log2(c)⎤-1 + m uint64 // ⎡2^(n+s)/c⎤ +} + +// smagic computes the constants needed to strength reduce signed n-bit divides by the constant c. +// Must have c>0. +// The return values satisfy for all -2^(n-1) <= x < 2^(n-1) +// +// trunc(x / c) = x * m >> (n+s) + (x < 0 ? 1 : 0) +func smagic(n uint, c int64) smagicData { + C := new(big.Int).SetInt64(c) + s := C.BitLen() - 1 + M := big.NewInt(1) + M.Lsh(M, n+uint(s)) // 2^(n+s) + M.Add(M, C) // 2^(n+s)+c + M.Sub(M, big.NewInt(1)) // 2^(n+s)+c-1 + M.Div(M, C) // ⎡2^(n+s)/c⎤ + if M.Bit(int(n)) != 0 { + panic("n+1st bit is set") + } + if M.Bit(int(n-1)) == 0 { + panic("nth bit is not set") + } + m := M.Uint64() + return smagicData{s: int64(s), m: m} +} + +func smagic8(c int8) smagicData { return smagic(8, int64(c)) } +func smagic16(c int16) smagicData { return smagic(16, int64(c)) } +func smagic32(c int32) smagicData { return smagic(32, int64(c)) } +func smagic64(c int64) smagicData { return smagic(64, c) } + +// Divisibility x%c == 0 can be checked more efficiently than directly computing +// the modulus x%c and comparing against 0. +// +// The same "Division by invariant integers using multiplication" paper +// by Granlund and Montgomery referenced above briefly mentions this method +// and it is further elaborated in "Hacker's Delight" by Warren Section 10-17 +// +// The first thing to note is that for odd integers, exact division can be computed +// by using the modular inverse with respect to the word size 2^n. +// +// Given c, compute m such that (c * m) mod 2^n == 1 +// Then if c divides x (x%c ==0), the quotient is given by q = x/c == x*m mod 2^n +// +// x can range from 0, c, 2c, 3c, ... ⎣(2^n - 1)/c⎦ * c the maximum multiple +// Thus, x*m mod 2^n is 0, 1, 2, 3, ... ⎣(2^n - 1)/c⎦ +// i.e. the quotient takes all values from zero up to max = ⎣(2^n - 1)/c⎦ +// +// If x is not divisible by c, then x*m mod 2^n must take some larger value than max. +// +// This gives x*m mod 2^n <= ⎣(2^n - 1)/c⎦ as a test for divisibility +// involving one multiplication and compare. +// +// To extend this to even integers, consider c = d0 * 2^k where d0 is odd. +// We can test whether x is divisible by both d0 and 2^k. +// For d0, the test is the same as above. Let m be such that m*d0 mod 2^n == 1 +// Then x*m mod 2^n <= ⎣(2^n - 1)/d0⎦ is the first test. +// The test for divisibility by 2^k is a check for k trailing zeroes. +// Note that since d0 is odd, m is odd and thus x*m will have the same number of +// trailing zeroes as x. So the two tests are, +// +// x*m mod 2^n <= ⎣(2^n - 1)/d0⎦ +// and x*m ends in k zero bits +// +// These can be combined into a single comparison by the following +// (theorem ZRU in Hacker's Delight) for unsigned integers. +// +// x <= a and x ends in k zero bits if and only if RotRight(x ,k) <= ⎣a/(2^k)⎦ +// Where RotRight(x ,k) is right rotation of x by k bits. +// +// To prove the first direction, x <= a -> ⎣x/(2^k)⎦ <= ⎣a/(2^k)⎦ +// But since x ends in k zeroes all the rotated bits would be zero too. +// So RotRight(x, k) == ⎣x/(2^k)⎦ <= ⎣a/(2^k)⎦ +// +// If x does not end in k zero bits, then RotRight(x, k) +// has some non-zero bits in the k highest bits. +// ⎣x/(2^k)⎦ has all zeroes in the k highest bits, +// so RotRight(x, k) > ⎣x/(2^k)⎦ +// +// Finally, if x > a and has k trailing zero bits, then RotRight(x, k) == ⎣x/(2^k)⎦ +// and ⎣x/(2^k)⎦ must be greater than ⎣a/(2^k)⎦, that is the top n-k bits of x must +// be greater than the top n-k bits of a because the rest of x bits are zero. +// +// So the two conditions about can be replaced with the single test +// +// RotRight(x*m mod 2^n, k) <= ⎣(2^n - 1)/c⎦ +// +// Where d0*2^k was replaced by c on the right hand side. + +// udivisibleOK reports whether we should strength reduce an unsigned n-bit divisibilty check by c. +func udivisibleOK(n uint, c int64) bool { + // Convert from ConstX auxint values to the real uint64 constant they represent. + d := uint64(c) << (64 - n) >> (64 - n) + + // Doesn't work for 0. + // Don't use for powers of 2. + return d&(d-1) != 0 +} + +func udivisibleOK8(c int8) bool { return udivisibleOK(8, int64(c)) } +func udivisibleOK16(c int16) bool { return udivisibleOK(16, int64(c)) } +func udivisibleOK32(c int32) bool { return udivisibleOK(32, int64(c)) } +func udivisibleOK64(c int64) bool { return udivisibleOK(64, c) } + +type udivisibleData struct { + k int64 // trailingZeros(c) + m uint64 // m * (c>>k) mod 2^n == 1 multiplicative inverse of odd portion modulo 2^n + max uint64 // ⎣(2^n - 1)/ c⎦ max value to for divisibility +} + +func udivisible(n uint, c int64) udivisibleData { + // Convert from ConstX auxint values to the real uint64 constant they represent. + d := uint64(c) << (64 - n) >> (64 - n) + + k := bits.TrailingZeros64(d) + d0 := d >> uint(k) // the odd portion of the divisor + + mask := ^uint64(0) >> (64 - n) + + // Calculate the multiplicative inverse via Newton's method. + // Quadratic convergence doubles the number of correct bits per iteration. + m := d0 // initial guess correct to 3-bits d0*d0 mod 8 == 1 + m = m * (2 - m*d0) // 6-bits + m = m * (2 - m*d0) // 12-bits + m = m * (2 - m*d0) // 24-bits + m = m * (2 - m*d0) // 48-bits + m = m * (2 - m*d0) // 96-bits >= 64-bits + m = m & mask + + max := mask / d + + return udivisibleData{ + k: int64(k), + m: m, + max: max, + } +} + +func udivisible8(c int8) udivisibleData { return udivisible(8, int64(c)) } +func udivisible16(c int16) udivisibleData { return udivisible(16, int64(c)) } +func udivisible32(c int32) udivisibleData { return udivisible(32, int64(c)) } +func udivisible64(c int64) udivisibleData { return udivisible(64, c) } + +// For signed integers, a similar method follows. +// +// Given c > 1 and odd, compute m such that (c * m) mod 2^n == 1 +// Then if c divides x (x%c ==0), the quotient is given by q = x/c == x*m mod 2^n +// +// x can range from ⎡-2^(n-1)/c⎤ * c, ... -c, 0, c, ... ⎣(2^(n-1) - 1)/c⎦ * c +// Thus, x*m mod 2^n is ⎡-2^(n-1)/c⎤, ... -2, -1, 0, 1, 2, ... ⎣(2^(n-1) - 1)/c⎦ +// +// So, x is a multiple of c if and only if: +// ⎡-2^(n-1)/c⎤ <= x*m mod 2^n <= ⎣(2^(n-1) - 1)/c⎦ +// +// Since c > 1 and odd, this can be simplified by +// ⎡-2^(n-1)/c⎤ == ⎡(-2^(n-1) + 1)/c⎤ == -⎣(2^(n-1) - 1)/c⎦ +// +// -⎣(2^(n-1) - 1)/c⎦ <= x*m mod 2^n <= ⎣(2^(n-1) - 1)/c⎦ +// +// To extend this to even integers, consider c = d0 * 2^k where d0 is odd. +// We can test whether x is divisible by both d0 and 2^k. +// +// Let m be such that (d0 * m) mod 2^n == 1. +// Let q = x*m mod 2^n. Then c divides x if: +// +// -⎣(2^(n-1) - 1)/d0⎦ <= q <= ⎣(2^(n-1) - 1)/d0⎦ and q ends in at least k 0-bits +// +// To transform this to a single comparison, we use the following theorem (ZRS in Hacker's Delight). +// +// For a >= 0 the following conditions are equivalent: +// 1) -a <= x <= a and x ends in at least k 0-bits +// 2) RotRight(x+a', k) <= ⎣2a'/2^k⎦ +// +// Where a' = a & -2^k (a with its right k bits set to zero) +// +// To see that 1 & 2 are equivalent, note that -a <= x <= a is equivalent to +// -a' <= x <= a' if and only if x ends in at least k 0-bits. Adding -a' to each side gives, +// 0 <= x + a' <= 2a' and x + a' ends in at least k 0-bits if and only if x does since a' has +// k 0-bits by definition. We can use theorem ZRU above with x -> x + a' and a -> 2a' giving 1) == 2). +// +// Let m be such that (d0 * m) mod 2^n == 1. +// Let q = x*m mod 2^n. +// Let a' = ⎣(2^(n-1) - 1)/d0⎦ & -2^k +// +// Then the divisibility test is: +// +// RotRight(q+a', k) <= ⎣2a'/2^k⎦ +// +// Note that the calculation is performed using unsigned integers. +// Since a' can have n-1 bits, 2a' may have n bits and there is no risk of overflow. + +// sdivisibleOK reports whether we should strength reduce a signed n-bit divisibilty check by c. +func sdivisibleOK(n uint, c int64) bool { + if c < 0 { + // Doesn't work for negative c. + return false + } + // Doesn't work for 0. + // Don't use it for powers of 2. + return c&(c-1) != 0 +} + +func sdivisibleOK8(c int8) bool { return sdivisibleOK(8, int64(c)) } +func sdivisibleOK16(c int16) bool { return sdivisibleOK(16, int64(c)) } +func sdivisibleOK32(c int32) bool { return sdivisibleOK(32, int64(c)) } +func sdivisibleOK64(c int64) bool { return sdivisibleOK(64, c) } + +type sdivisibleData struct { + k int64 // trailingZeros(c) + m uint64 // m * (c>>k) mod 2^n == 1 multiplicative inverse of odd portion modulo 2^n + a uint64 // ⎣(2^(n-1) - 1)/ (c>>k)⎦ & -(1<<k) additive constant + max uint64 // ⎣(2 a) / (1<<k)⎦ max value to for divisibility +} + +func sdivisible(n uint, c int64) sdivisibleData { + d := uint64(c) + k := bits.TrailingZeros64(d) + d0 := d >> uint(k) // the odd portion of the divisor + + mask := ^uint64(0) >> (64 - n) + + // Calculate the multiplicative inverse via Newton's method. + // Quadratic convergence doubles the number of correct bits per iteration. + m := d0 // initial guess correct to 3-bits d0*d0 mod 8 == 1 + m = m * (2 - m*d0) // 6-bits + m = m * (2 - m*d0) // 12-bits + m = m * (2 - m*d0) // 24-bits + m = m * (2 - m*d0) // 48-bits + m = m * (2 - m*d0) // 96-bits >= 64-bits + m = m & mask + + a := ((mask >> 1) / d0) & -(1 << uint(k)) + max := (2 * a) >> uint(k) + + return sdivisibleData{ + k: int64(k), + m: m, + a: a, + max: max, + } +} + +func sdivisible8(c int8) sdivisibleData { return sdivisible(8, int64(c)) } +func sdivisible16(c int16) sdivisibleData { return sdivisible(16, int64(c)) } +func sdivisible32(c int32) sdivisibleData { return sdivisible(32, int64(c)) } +func sdivisible64(c int64) sdivisibleData { return sdivisible(64, c) } |