1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
|
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package ssa
import (
"math/big"
"math/bits"
)
// So you want to compute x / c for some constant c?
// Machine division instructions are slow, so we try to
// compute this division with a multiplication + a few
// other cheap instructions instead.
// (We assume here that c != 0, +/- 1, or +/- 2^i. Those
// cases are easy to handle in different ways).
// Technique from https://gmplib.org/~tege/divcnst-pldi94.pdf
// First consider unsigned division.
// Our strategy is to precompute 1/c then do
// ⎣x / c⎦ = ⎣x * (1/c)⎦.
// 1/c is less than 1, so we can't compute it directly in
// integer arithmetic. Let's instead compute 2^e/c
// for a value of e TBD (^ = exponentiation). Then
// ⎣x / c⎦ = ⎣x * (2^e/c) / 2^e⎦.
// Dividing by 2^e is easy. 2^e/c isn't an integer, unfortunately.
// So we must approximate it. Let's call its approximation m.
// We'll then compute
// ⎣x * m / 2^e⎦
// Which we want to be equal to ⎣x / c⎦ for 0 <= x < 2^n-1
// where n is the word size.
// Setting x = c gives us c * m >= 2^e.
// We'll chose m = ⎡2^e/c⎤ to satisfy that equation.
// What remains is to choose e.
// Let m = 2^e/c + delta, 0 <= delta < 1
// ⎣x * (2^e/c + delta) / 2^e⎦
// ⎣x / c + x * delta / 2^e⎦
// We must have x * delta / 2^e < 1/c so that this
// additional term never rounds differently than ⎣x / c⎦ does.
// Rearranging,
// 2^e > x * delta * c
// x can be at most 2^n-1 and delta can be at most 1.
// So it is sufficient to have 2^e >= 2^n*c.
// So we'll choose e = n + s, with s = ⎡log2(c)⎤.
//
// An additional complication arises because m has n+1 bits in it.
// Hardware restricts us to n bit by n bit multiplies.
// We divide into 3 cases:
//
// Case 1: m is even.
// ⎣x / c⎦ = ⎣x * m / 2^(n+s)⎦
// ⎣x / c⎦ = ⎣x * (m/2) / 2^(n+s-1)⎦
// ⎣x / c⎦ = ⎣x * (m/2) / 2^n / 2^(s-1)⎦
// ⎣x / c⎦ = ⎣⎣x * (m/2) / 2^n⎦ / 2^(s-1)⎦
// multiply + shift
//
// Case 2: c is even.
// ⎣x / c⎦ = ⎣(x/2) / (c/2)⎦
// ⎣x / c⎦ = ⎣⎣x/2⎦ / (c/2)⎦
// This is just the original problem, with x' = ⎣x/2⎦, c' = c/2, n' = n-1.
// s' = s-1
// m' = ⎡2^(n'+s')/c'⎤
// = ⎡2^(n+s-1)/c⎤
// = ⎡m/2⎤
// ⎣x / c⎦ = ⎣x' * m' / 2^(n'+s')⎦
// ⎣x / c⎦ = ⎣⎣x/2⎦ * ⎡m/2⎤ / 2^(n+s-2)⎦
// ⎣x / c⎦ = ⎣⎣⎣x/2⎦ * ⎡m/2⎤ / 2^n⎦ / 2^(s-2)⎦
// shift + multiply + shift
//
// Case 3: everything else
// let k = m - 2^n. k fits in n bits.
// ⎣x / c⎦ = ⎣x * m / 2^(n+s)⎦
// ⎣x / c⎦ = ⎣x * (2^n + k) / 2^(n+s)⎦
// ⎣x / c⎦ = ⎣(x + x * k / 2^n) / 2^s⎦
// ⎣x / c⎦ = ⎣(x + ⎣x * k / 2^n⎦) / 2^s⎦
// ⎣x / c⎦ = ⎣(x + ⎣x * k / 2^n⎦) / 2^s⎦
// ⎣x / c⎦ = ⎣⎣(x + ⎣x * k / 2^n⎦) / 2⎦ / 2^(s-1)⎦
// multiply + avg + shift
//
// These can be implemented in hardware using:
// ⎣a * b / 2^n⎦ - aka high n bits of an n-bit by n-bit multiply.
// ⎣(a+b) / 2⎦ - aka "average" of two n-bit numbers.
// (Not just a regular add & shift because the intermediate result
// a+b has n+1 bits in it. Nevertheless, can be done
// in 2 instructions on x86.)
// umagicOK reports whether we should strength reduce a n-bit divide by c.
func umagicOK(n uint, c int64) bool {
// Convert from ConstX auxint values to the real uint64 constant they represent.
d := uint64(c) << (64 - n) >> (64 - n)
// Doesn't work for 0.
// Don't use for powers of 2.
return d&(d-1) != 0
}
// umagicOKn reports whether we should strength reduce an unsigned n-bit divide by c.
// We can strength reduce when c != 0 and c is not a power of two.
func umagicOK8(c int8) bool { return c&(c-1) != 0 }
func umagicOK16(c int16) bool { return c&(c-1) != 0 }
func umagicOK32(c int32) bool { return c&(c-1) != 0 }
func umagicOK64(c int64) bool { return c&(c-1) != 0 }
type umagicData struct {
s int64 // ⎡log2(c)⎤
m uint64 // ⎡2^(n+s)/c⎤ - 2^n
}
// umagic computes the constants needed to strength reduce unsigned n-bit divides by the constant uint64(c).
// The return values satisfy for all 0 <= x < 2^n
//
// floor(x / uint64(c)) = x * (m + 2^n) >> (n+s)
func umagic(n uint, c int64) umagicData {
// Convert from ConstX auxint values to the real uint64 constant they represent.
d := uint64(c) << (64 - n) >> (64 - n)
C := new(big.Int).SetUint64(d)
s := C.BitLen()
M := big.NewInt(1)
M.Lsh(M, n+uint(s)) // 2^(n+s)
M.Add(M, C) // 2^(n+s)+c
M.Sub(M, big.NewInt(1)) // 2^(n+s)+c-1
M.Div(M, C) // ⎡2^(n+s)/c⎤
if M.Bit(int(n)) != 1 {
panic("n+1st bit isn't set")
}
M.SetBit(M, int(n), 0)
m := M.Uint64()
return umagicData{s: int64(s), m: m}
}
func umagic8(c int8) umagicData { return umagic(8, int64(c)) }
func umagic16(c int16) umagicData { return umagic(16, int64(c)) }
func umagic32(c int32) umagicData { return umagic(32, int64(c)) }
func umagic64(c int64) umagicData { return umagic(64, c) }
// For signed division, we use a similar strategy.
// First, we enforce a positive c.
// x / c = -(x / (-c))
// This will require an additional Neg op for c<0.
//
// If x is positive we're in a very similar state
// to the unsigned case above. We define:
// s = ⎡log2(c)⎤-1
// m = ⎡2^(n+s)/c⎤
// Then
// ⎣x / c⎦ = ⎣x * m / 2^(n+s)⎦
// If x is negative we have
// ⎡x / c⎤ = ⎣x * m / 2^(n+s)⎦ + 1
// (TODO: derivation?)
//
// The multiply is a bit odd, as it is a signed n-bit value
// times an unsigned n-bit value. For n smaller than the
// word size, we can extend x and m appropriately and use the
// signed multiply instruction. For n == word size,
// we must use the signed multiply high and correct
// the result by adding x*2^n.
//
// Adding 1 if x<0 is done by subtracting x>>(n-1).
func smagicOK(n uint, c int64) bool {
if c < 0 {
// Doesn't work for negative c.
return false
}
// Doesn't work for 0.
// Don't use it for powers of 2.
return c&(c-1) != 0
}
// smagicOKn reports whether we should strength reduce an signed n-bit divide by c.
func smagicOK8(c int8) bool { return smagicOK(8, int64(c)) }
func smagicOK16(c int16) bool { return smagicOK(16, int64(c)) }
func smagicOK32(c int32) bool { return smagicOK(32, int64(c)) }
func smagicOK64(c int64) bool { return smagicOK(64, c) }
type smagicData struct {
s int64 // ⎡log2(c)⎤-1
m uint64 // ⎡2^(n+s)/c⎤
}
// smagic computes the constants needed to strength reduce signed n-bit divides by the constant c.
// Must have c>0.
// The return values satisfy for all -2^(n-1) <= x < 2^(n-1)
//
// trunc(x / c) = x * m >> (n+s) + (x < 0 ? 1 : 0)
func smagic(n uint, c int64) smagicData {
C := new(big.Int).SetInt64(c)
s := C.BitLen() - 1
M := big.NewInt(1)
M.Lsh(M, n+uint(s)) // 2^(n+s)
M.Add(M, C) // 2^(n+s)+c
M.Sub(M, big.NewInt(1)) // 2^(n+s)+c-1
M.Div(M, C) // ⎡2^(n+s)/c⎤
if M.Bit(int(n)) != 0 {
panic("n+1st bit is set")
}
if M.Bit(int(n-1)) == 0 {
panic("nth bit is not set")
}
m := M.Uint64()
return smagicData{s: int64(s), m: m}
}
func smagic8(c int8) smagicData { return smagic(8, int64(c)) }
func smagic16(c int16) smagicData { return smagic(16, int64(c)) }
func smagic32(c int32) smagicData { return smagic(32, int64(c)) }
func smagic64(c int64) smagicData { return smagic(64, c) }
// Divisibility x%c == 0 can be checked more efficiently than directly computing
// the modulus x%c and comparing against 0.
//
// The same "Division by invariant integers using multiplication" paper
// by Granlund and Montgomery referenced above briefly mentions this method
// and it is further elaborated in "Hacker's Delight" by Warren Section 10-17
//
// The first thing to note is that for odd integers, exact division can be computed
// by using the modular inverse with respect to the word size 2^n.
//
// Given c, compute m such that (c * m) mod 2^n == 1
// Then if c divides x (x%c ==0), the quotient is given by q = x/c == x*m mod 2^n
//
// x can range from 0, c, 2c, 3c, ... ⎣(2^n - 1)/c⎦ * c the maximum multiple
// Thus, x*m mod 2^n is 0, 1, 2, 3, ... ⎣(2^n - 1)/c⎦
// i.e. the quotient takes all values from zero up to max = ⎣(2^n - 1)/c⎦
//
// If x is not divisible by c, then x*m mod 2^n must take some larger value than max.
//
// This gives x*m mod 2^n <= ⎣(2^n - 1)/c⎦ as a test for divisibility
// involving one multiplication and compare.
//
// To extend this to even integers, consider c = d0 * 2^k where d0 is odd.
// We can test whether x is divisible by both d0 and 2^k.
// For d0, the test is the same as above. Let m be such that m*d0 mod 2^n == 1
// Then x*m mod 2^n <= ⎣(2^n - 1)/d0⎦ is the first test.
// The test for divisibility by 2^k is a check for k trailing zeroes.
// Note that since d0 is odd, m is odd and thus x*m will have the same number of
// trailing zeroes as x. So the two tests are,
//
// x*m mod 2^n <= ⎣(2^n - 1)/d0⎦
// and x*m ends in k zero bits
//
// These can be combined into a single comparison by the following
// (theorem ZRU in Hacker's Delight) for unsigned integers.
//
// x <= a and x ends in k zero bits if and only if RotRight(x ,k) <= ⎣a/(2^k)⎦
// Where RotRight(x ,k) is right rotation of x by k bits.
//
// To prove the first direction, x <= a -> ⎣x/(2^k)⎦ <= ⎣a/(2^k)⎦
// But since x ends in k zeroes all the rotated bits would be zero too.
// So RotRight(x, k) == ⎣x/(2^k)⎦ <= ⎣a/(2^k)⎦
//
// If x does not end in k zero bits, then RotRight(x, k)
// has some non-zero bits in the k highest bits.
// ⎣x/(2^k)⎦ has all zeroes in the k highest bits,
// so RotRight(x, k) > ⎣x/(2^k)⎦
//
// Finally, if x > a and has k trailing zero bits, then RotRight(x, k) == ⎣x/(2^k)⎦
// and ⎣x/(2^k)⎦ must be greater than ⎣a/(2^k)⎦, that is the top n-k bits of x must
// be greater than the top n-k bits of a because the rest of x bits are zero.
//
// So the two conditions about can be replaced with the single test
//
// RotRight(x*m mod 2^n, k) <= ⎣(2^n - 1)/c⎦
//
// Where d0*2^k was replaced by c on the right hand side.
// udivisibleOK reports whether we should strength reduce an unsigned n-bit divisibilty check by c.
func udivisibleOK(n uint, c int64) bool {
// Convert from ConstX auxint values to the real uint64 constant they represent.
d := uint64(c) << (64 - n) >> (64 - n)
// Doesn't work for 0.
// Don't use for powers of 2.
return d&(d-1) != 0
}
func udivisibleOK8(c int8) bool { return udivisibleOK(8, int64(c)) }
func udivisibleOK16(c int16) bool { return udivisibleOK(16, int64(c)) }
func udivisibleOK32(c int32) bool { return udivisibleOK(32, int64(c)) }
func udivisibleOK64(c int64) bool { return udivisibleOK(64, c) }
type udivisibleData struct {
k int64 // trailingZeros(c)
m uint64 // m * (c>>k) mod 2^n == 1 multiplicative inverse of odd portion modulo 2^n
max uint64 // ⎣(2^n - 1)/ c⎦ max value to for divisibility
}
func udivisible(n uint, c int64) udivisibleData {
// Convert from ConstX auxint values to the real uint64 constant they represent.
d := uint64(c) << (64 - n) >> (64 - n)
k := bits.TrailingZeros64(d)
d0 := d >> uint(k) // the odd portion of the divisor
mask := ^uint64(0) >> (64 - n)
// Calculate the multiplicative inverse via Newton's method.
// Quadratic convergence doubles the number of correct bits per iteration.
m := d0 // initial guess correct to 3-bits d0*d0 mod 8 == 1
m = m * (2 - m*d0) // 6-bits
m = m * (2 - m*d0) // 12-bits
m = m * (2 - m*d0) // 24-bits
m = m * (2 - m*d0) // 48-bits
m = m * (2 - m*d0) // 96-bits >= 64-bits
m = m & mask
max := mask / d
return udivisibleData{
k: int64(k),
m: m,
max: max,
}
}
func udivisible8(c int8) udivisibleData { return udivisible(8, int64(c)) }
func udivisible16(c int16) udivisibleData { return udivisible(16, int64(c)) }
func udivisible32(c int32) udivisibleData { return udivisible(32, int64(c)) }
func udivisible64(c int64) udivisibleData { return udivisible(64, c) }
// For signed integers, a similar method follows.
//
// Given c > 1 and odd, compute m such that (c * m) mod 2^n == 1
// Then if c divides x (x%c ==0), the quotient is given by q = x/c == x*m mod 2^n
//
// x can range from ⎡-2^(n-1)/c⎤ * c, ... -c, 0, c, ... ⎣(2^(n-1) - 1)/c⎦ * c
// Thus, x*m mod 2^n is ⎡-2^(n-1)/c⎤, ... -2, -1, 0, 1, 2, ... ⎣(2^(n-1) - 1)/c⎦
//
// So, x is a multiple of c if and only if:
// ⎡-2^(n-1)/c⎤ <= x*m mod 2^n <= ⎣(2^(n-1) - 1)/c⎦
//
// Since c > 1 and odd, this can be simplified by
// ⎡-2^(n-1)/c⎤ == ⎡(-2^(n-1) + 1)/c⎤ == -⎣(2^(n-1) - 1)/c⎦
//
// -⎣(2^(n-1) - 1)/c⎦ <= x*m mod 2^n <= ⎣(2^(n-1) - 1)/c⎦
//
// To extend this to even integers, consider c = d0 * 2^k where d0 is odd.
// We can test whether x is divisible by both d0 and 2^k.
//
// Let m be such that (d0 * m) mod 2^n == 1.
// Let q = x*m mod 2^n. Then c divides x if:
//
// -⎣(2^(n-1) - 1)/d0⎦ <= q <= ⎣(2^(n-1) - 1)/d0⎦ and q ends in at least k 0-bits
//
// To transform this to a single comparison, we use the following theorem (ZRS in Hacker's Delight).
//
// For a >= 0 the following conditions are equivalent:
// 1) -a <= x <= a and x ends in at least k 0-bits
// 2) RotRight(x+a', k) <= ⎣2a'/2^k⎦
//
// Where a' = a & -2^k (a with its right k bits set to zero)
//
// To see that 1 & 2 are equivalent, note that -a <= x <= a is equivalent to
// -a' <= x <= a' if and only if x ends in at least k 0-bits. Adding -a' to each side gives,
// 0 <= x + a' <= 2a' and x + a' ends in at least k 0-bits if and only if x does since a' has
// k 0-bits by definition. We can use theorem ZRU above with x -> x + a' and a -> 2a' giving 1) == 2).
//
// Let m be such that (d0 * m) mod 2^n == 1.
// Let q = x*m mod 2^n.
// Let a' = ⎣(2^(n-1) - 1)/d0⎦ & -2^k
//
// Then the divisibility test is:
//
// RotRight(q+a', k) <= ⎣2a'/2^k⎦
//
// Note that the calculation is performed using unsigned integers.
// Since a' can have n-1 bits, 2a' may have n bits and there is no risk of overflow.
// sdivisibleOK reports whether we should strength reduce a signed n-bit divisibilty check by c.
func sdivisibleOK(n uint, c int64) bool {
if c < 0 {
// Doesn't work for negative c.
return false
}
// Doesn't work for 0.
// Don't use it for powers of 2.
return c&(c-1) != 0
}
func sdivisibleOK8(c int8) bool { return sdivisibleOK(8, int64(c)) }
func sdivisibleOK16(c int16) bool { return sdivisibleOK(16, int64(c)) }
func sdivisibleOK32(c int32) bool { return sdivisibleOK(32, int64(c)) }
func sdivisibleOK64(c int64) bool { return sdivisibleOK(64, c) }
type sdivisibleData struct {
k int64 // trailingZeros(c)
m uint64 // m * (c>>k) mod 2^n == 1 multiplicative inverse of odd portion modulo 2^n
a uint64 // ⎣(2^(n-1) - 1)/ (c>>k)⎦ & -(1<<k) additive constant
max uint64 // ⎣(2 a) / (1<<k)⎦ max value to for divisibility
}
func sdivisible(n uint, c int64) sdivisibleData {
d := uint64(c)
k := bits.TrailingZeros64(d)
d0 := d >> uint(k) // the odd portion of the divisor
mask := ^uint64(0) >> (64 - n)
// Calculate the multiplicative inverse via Newton's method.
// Quadratic convergence doubles the number of correct bits per iteration.
m := d0 // initial guess correct to 3-bits d0*d0 mod 8 == 1
m = m * (2 - m*d0) // 6-bits
m = m * (2 - m*d0) // 12-bits
m = m * (2 - m*d0) // 24-bits
m = m * (2 - m*d0) // 48-bits
m = m * (2 - m*d0) // 96-bits >= 64-bits
m = m & mask
a := ((mask >> 1) / d0) & -(1 << uint(k))
max := (2 * a) >> uint(k)
return sdivisibleData{
k: int64(k),
m: m,
a: a,
max: max,
}
}
func sdivisible8(c int8) sdivisibleData { return sdivisible(8, int64(c)) }
func sdivisible16(c int16) sdivisibleData { return sdivisible(16, int64(c)) }
func sdivisible32(c int32) sdivisibleData { return sdivisible(32, int64(c)) }
func sdivisible64(c int64) sdivisibleData { return sdivisible(64, c) }
|