diff options
Diffstat (limited to 'src/math/big/float.go')
-rw-r--r-- | src/math/big/float.go | 1730 |
1 files changed, 1730 insertions, 0 deletions
diff --git a/src/math/big/float.go b/src/math/big/float.go new file mode 100644 index 0000000..42050e2 --- /dev/null +++ b/src/math/big/float.go @@ -0,0 +1,1730 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file implements multi-precision floating-point numbers. +// Like in the GNU MPFR library (https://www.mpfr.org/), operands +// can be of mixed precision. Unlike MPFR, the rounding mode is +// not specified with each operation, but with each operand. The +// rounding mode of the result operand determines the rounding +// mode of an operation. This is a from-scratch implementation. + +package big + +import ( + "fmt" + "math" + "math/bits" +) + +const debugFloat = false // enable for debugging + +// A nonzero finite Float represents a multi-precision floating point number +// +// sign × mantissa × 2**exponent +// +// with 0.5 <= mantissa < 1.0, and MinExp <= exponent <= MaxExp. +// A Float may also be zero (+0, -0) or infinite (+Inf, -Inf). +// All Floats are ordered, and the ordering of two Floats x and y +// is defined by x.Cmp(y). +// +// Each Float value also has a precision, rounding mode, and accuracy. +// The precision is the maximum number of mantissa bits available to +// represent the value. The rounding mode specifies how a result should +// be rounded to fit into the mantissa bits, and accuracy describes the +// rounding error with respect to the exact result. +// +// Unless specified otherwise, all operations (including setters) that +// specify a *Float variable for the result (usually via the receiver +// with the exception of MantExp), round the numeric result according +// to the precision and rounding mode of the result variable. +// +// If the provided result precision is 0 (see below), it is set to the +// precision of the argument with the largest precision value before any +// rounding takes place, and the rounding mode remains unchanged. Thus, +// uninitialized Floats provided as result arguments will have their +// precision set to a reasonable value determined by the operands, and +// their mode is the zero value for RoundingMode (ToNearestEven). +// +// By setting the desired precision to 24 or 53 and using matching rounding +// mode (typically ToNearestEven), Float operations produce the same results +// as the corresponding float32 or float64 IEEE-754 arithmetic for operands +// that correspond to normal (i.e., not denormal) float32 or float64 numbers. +// Exponent underflow and overflow lead to a 0 or an Infinity for different +// values than IEEE-754 because Float exponents have a much larger range. +// +// The zero (uninitialized) value for a Float is ready to use and represents +// the number +0.0 exactly, with precision 0 and rounding mode ToNearestEven. +// +// Operations always take pointer arguments (*Float) rather +// than Float values, and each unique Float value requires +// its own unique *Float pointer. To "copy" a Float value, +// an existing (or newly allocated) Float must be set to +// a new value using the Float.Set method; shallow copies +// of Floats are not supported and may lead to errors. +type Float struct { + prec uint32 + mode RoundingMode + acc Accuracy + form form + neg bool + mant nat + exp int32 +} + +// An ErrNaN panic is raised by a Float operation that would lead to +// a NaN under IEEE-754 rules. An ErrNaN implements the error interface. +type ErrNaN struct { + msg string +} + +func (err ErrNaN) Error() string { + return err.msg +} + +// NewFloat allocates and returns a new Float set to x, +// with precision 53 and rounding mode ToNearestEven. +// NewFloat panics with ErrNaN if x is a NaN. +func NewFloat(x float64) *Float { + if math.IsNaN(x) { + panic(ErrNaN{"NewFloat(NaN)"}) + } + return new(Float).SetFloat64(x) +} + +// Exponent and precision limits. +const ( + MaxExp = math.MaxInt32 // largest supported exponent + MinExp = math.MinInt32 // smallest supported exponent + MaxPrec = math.MaxUint32 // largest (theoretically) supported precision; likely memory-limited +) + +// Internal representation: The mantissa bits x.mant of a nonzero finite +// Float x are stored in a nat slice long enough to hold up to x.prec bits; +// the slice may (but doesn't have to) be shorter if the mantissa contains +// trailing 0 bits. x.mant is normalized if the msb of x.mant == 1 (i.e., +// the msb is shifted all the way "to the left"). Thus, if the mantissa has +// trailing 0 bits or x.prec is not a multiple of the Word size _W, +// x.mant[0] has trailing zero bits. The msb of the mantissa corresponds +// to the value 0.5; the exponent x.exp shifts the binary point as needed. +// +// A zero or non-finite Float x ignores x.mant and x.exp. +// +// x form neg mant exp +// ---------------------------------------------------------- +// ±0 zero sign - - +// 0 < |x| < +Inf finite sign mantissa exponent +// ±Inf inf sign - - + +// A form value describes the internal representation. +type form byte + +// The form value order is relevant - do not change! +const ( + zero form = iota + finite + inf +) + +// RoundingMode determines how a Float value is rounded to the +// desired precision. Rounding may change the Float value; the +// rounding error is described by the Float's Accuracy. +type RoundingMode byte + +// These constants define supported rounding modes. +const ( + ToNearestEven RoundingMode = iota // == IEEE 754-2008 roundTiesToEven + ToNearestAway // == IEEE 754-2008 roundTiesToAway + ToZero // == IEEE 754-2008 roundTowardZero + AwayFromZero // no IEEE 754-2008 equivalent + ToNegativeInf // == IEEE 754-2008 roundTowardNegative + ToPositiveInf // == IEEE 754-2008 roundTowardPositive +) + +//go:generate stringer -type=RoundingMode + +// Accuracy describes the rounding error produced by the most recent +// operation that generated a Float value, relative to the exact value. +type Accuracy int8 + +// Constants describing the Accuracy of a Float. +const ( + Below Accuracy = -1 + Exact Accuracy = 0 + Above Accuracy = +1 +) + +//go:generate stringer -type=Accuracy + +// SetPrec sets z's precision to prec and returns the (possibly) rounded +// value of z. Rounding occurs according to z's rounding mode if the mantissa +// cannot be represented in prec bits without loss of precision. +// SetPrec(0) maps all finite values to ±0; infinite values remain unchanged. +// If prec > MaxPrec, it is set to MaxPrec. +func (z *Float) SetPrec(prec uint) *Float { + z.acc = Exact // optimistically assume no rounding is needed + + // special case + if prec == 0 { + z.prec = 0 + if z.form == finite { + // truncate z to 0 + z.acc = makeAcc(z.neg) + z.form = zero + } + return z + } + + // general case + if prec > MaxPrec { + prec = MaxPrec + } + old := z.prec + z.prec = uint32(prec) + if z.prec < old { + z.round(0) + } + return z +} + +func makeAcc(above bool) Accuracy { + if above { + return Above + } + return Below +} + +// SetMode sets z's rounding mode to mode and returns an exact z. +// z remains unchanged otherwise. +// z.SetMode(z.Mode()) is a cheap way to set z's accuracy to Exact. +func (z *Float) SetMode(mode RoundingMode) *Float { + z.mode = mode + z.acc = Exact + return z +} + +// Prec returns the mantissa precision of x in bits. +// The result may be 0 for |x| == 0 and |x| == Inf. +func (x *Float) Prec() uint { + return uint(x.prec) +} + +// MinPrec returns the minimum precision required to represent x exactly +// (i.e., the smallest prec before x.SetPrec(prec) would start rounding x). +// The result is 0 for |x| == 0 and |x| == Inf. +func (x *Float) MinPrec() uint { + if x.form != finite { + return 0 + } + return uint(len(x.mant))*_W - x.mant.trailingZeroBits() +} + +// Mode returns the rounding mode of x. +func (x *Float) Mode() RoundingMode { + return x.mode +} + +// Acc returns the accuracy of x produced by the most recent +// operation, unless explicitly documented otherwise by that +// operation. +func (x *Float) Acc() Accuracy { + return x.acc +} + +// Sign returns: +// +// -1 if x < 0 +// 0 if x is ±0 +// +1 if x > 0 +// +func (x *Float) Sign() int { + if debugFloat { + x.validate() + } + if x.form == zero { + return 0 + } + if x.neg { + return -1 + } + return 1 +} + +// MantExp breaks x into its mantissa and exponent components +// and returns the exponent. If a non-nil mant argument is +// provided its value is set to the mantissa of x, with the +// same precision and rounding mode as x. The components +// satisfy x == mant × 2**exp, with 0.5 <= |mant| < 1.0. +// Calling MantExp with a nil argument is an efficient way to +// get the exponent of the receiver. +// +// Special cases are: +// +// ( ±0).MantExp(mant) = 0, with mant set to ±0 +// (±Inf).MantExp(mant) = 0, with mant set to ±Inf +// +// x and mant may be the same in which case x is set to its +// mantissa value. +func (x *Float) MantExp(mant *Float) (exp int) { + if debugFloat { + x.validate() + } + if x.form == finite { + exp = int(x.exp) + } + if mant != nil { + mant.Copy(x) + if mant.form == finite { + mant.exp = 0 + } + } + return +} + +func (z *Float) setExpAndRound(exp int64, sbit uint) { + if exp < MinExp { + // underflow + z.acc = makeAcc(z.neg) + z.form = zero + return + } + + if exp > MaxExp { + // overflow + z.acc = makeAcc(!z.neg) + z.form = inf + return + } + + z.form = finite + z.exp = int32(exp) + z.round(sbit) +} + +// SetMantExp sets z to mant × 2**exp and returns z. +// The result z has the same precision and rounding mode +// as mant. SetMantExp is an inverse of MantExp but does +// not require 0.5 <= |mant| < 1.0. Specifically: +// +// mant := new(Float) +// new(Float).SetMantExp(mant, x.MantExp(mant)).Cmp(x) == 0 +// +// Special cases are: +// +// z.SetMantExp( ±0, exp) = ±0 +// z.SetMantExp(±Inf, exp) = ±Inf +// +// z and mant may be the same in which case z's exponent +// is set to exp. +func (z *Float) SetMantExp(mant *Float, exp int) *Float { + if debugFloat { + z.validate() + mant.validate() + } + z.Copy(mant) + + if z.form == finite { + // 0 < |mant| < +Inf + z.setExpAndRound(int64(z.exp)+int64(exp), 0) + } + return z +} + +// Signbit reports whether x is negative or negative zero. +func (x *Float) Signbit() bool { + return x.neg +} + +// IsInf reports whether x is +Inf or -Inf. +func (x *Float) IsInf() bool { + return x.form == inf +} + +// IsInt reports whether x is an integer. +// ±Inf values are not integers. +func (x *Float) IsInt() bool { + if debugFloat { + x.validate() + } + // special cases + if x.form != finite { + return x.form == zero + } + // x.form == finite + if x.exp <= 0 { + return false + } + // x.exp > 0 + return x.prec <= uint32(x.exp) || x.MinPrec() <= uint(x.exp) // not enough bits for fractional mantissa +} + +// debugging support +func (x *Float) validate() { + if !debugFloat { + // avoid performance bugs + panic("validate called but debugFloat is not set") + } + if x.form != finite { + return + } + m := len(x.mant) + if m == 0 { + panic("nonzero finite number with empty mantissa") + } + const msb = 1 << (_W - 1) + if x.mant[m-1]&msb == 0 { + panic(fmt.Sprintf("msb not set in last word %#x of %s", x.mant[m-1], x.Text('p', 0))) + } + if x.prec == 0 { + panic("zero precision finite number") + } +} + +// round rounds z according to z.mode to z.prec bits and sets z.acc accordingly. +// sbit must be 0 or 1 and summarizes any "sticky bit" information one might +// have before calling round. z's mantissa must be normalized (with the msb set) +// or empty. +// +// CAUTION: The rounding modes ToNegativeInf, ToPositiveInf are affected by the +// sign of z. For correct rounding, the sign of z must be set correctly before +// calling round. +func (z *Float) round(sbit uint) { + if debugFloat { + z.validate() + } + + z.acc = Exact + if z.form != finite { + // ±0 or ±Inf => nothing left to do + return + } + // z.form == finite && len(z.mant) > 0 + // m > 0 implies z.prec > 0 (checked by validate) + + m := uint32(len(z.mant)) // present mantissa length in words + bits := m * _W // present mantissa bits; bits > 0 + if bits <= z.prec { + // mantissa fits => nothing to do + return + } + // bits > z.prec + + // Rounding is based on two bits: the rounding bit (rbit) and the + // sticky bit (sbit). The rbit is the bit immediately before the + // z.prec leading mantissa bits (the "0.5"). The sbit is set if any + // of the bits before the rbit are set (the "0.25", "0.125", etc.): + // + // rbit sbit => "fractional part" + // + // 0 0 == 0 + // 0 1 > 0 , < 0.5 + // 1 0 == 0.5 + // 1 1 > 0.5, < 1.0 + + // bits > z.prec: mantissa too large => round + r := uint(bits - z.prec - 1) // rounding bit position; r >= 0 + rbit := z.mant.bit(r) & 1 // rounding bit; be safe and ensure it's a single bit + // The sticky bit is only needed for rounding ToNearestEven + // or when the rounding bit is zero. Avoid computation otherwise. + if sbit == 0 && (rbit == 0 || z.mode == ToNearestEven) { + sbit = z.mant.sticky(r) + } + sbit &= 1 // be safe and ensure it's a single bit + + // cut off extra words + n := (z.prec + (_W - 1)) / _W // mantissa length in words for desired precision + if m > n { + copy(z.mant, z.mant[m-n:]) // move n last words to front + z.mant = z.mant[:n] + } + + // determine number of trailing zero bits (ntz) and compute lsb mask of mantissa's least-significant word + ntz := n*_W - z.prec // 0 <= ntz < _W + lsb := Word(1) << ntz + + // round if result is inexact + if rbit|sbit != 0 { + // Make rounding decision: The result mantissa is truncated ("rounded down") + // by default. Decide if we need to increment, or "round up", the (unsigned) + // mantissa. + inc := false + switch z.mode { + case ToNegativeInf: + inc = z.neg + case ToZero: + // nothing to do + case ToNearestEven: + inc = rbit != 0 && (sbit != 0 || z.mant[0]&lsb != 0) + case ToNearestAway: + inc = rbit != 0 + case AwayFromZero: + inc = true + case ToPositiveInf: + inc = !z.neg + default: + panic("unreachable") + } + + // A positive result (!z.neg) is Above the exact result if we increment, + // and it's Below if we truncate (Exact results require no rounding). + // For a negative result (z.neg) it is exactly the opposite. + z.acc = makeAcc(inc != z.neg) + + if inc { + // add 1 to mantissa + if addVW(z.mant, z.mant, lsb) != 0 { + // mantissa overflow => adjust exponent + if z.exp >= MaxExp { + // exponent overflow + z.form = inf + return + } + z.exp++ + // adjust mantissa: divide by 2 to compensate for exponent adjustment + shrVU(z.mant, z.mant, 1) + // set msb == carry == 1 from the mantissa overflow above + const msb = 1 << (_W - 1) + z.mant[n-1] |= msb + } + } + } + + // zero out trailing bits in least-significant word + z.mant[0] &^= lsb - 1 + + if debugFloat { + z.validate() + } +} + +func (z *Float) setBits64(neg bool, x uint64) *Float { + if z.prec == 0 { + z.prec = 64 + } + z.acc = Exact + z.neg = neg + if x == 0 { + z.form = zero + return z + } + // x != 0 + z.form = finite + s := bits.LeadingZeros64(x) + z.mant = z.mant.setUint64(x << uint(s)) + z.exp = int32(64 - s) // always fits + if z.prec < 64 { + z.round(0) + } + return z +} + +// SetUint64 sets z to the (possibly rounded) value of x and returns z. +// If z's precision is 0, it is changed to 64 (and rounding will have +// no effect). +func (z *Float) SetUint64(x uint64) *Float { + return z.setBits64(false, x) +} + +// SetInt64 sets z to the (possibly rounded) value of x and returns z. +// If z's precision is 0, it is changed to 64 (and rounding will have +// no effect). +func (z *Float) SetInt64(x int64) *Float { + u := x + if u < 0 { + u = -u + } + // We cannot simply call z.SetUint64(uint64(u)) and change + // the sign afterwards because the sign affects rounding. + return z.setBits64(x < 0, uint64(u)) +} + +// SetFloat64 sets z to the (possibly rounded) value of x and returns z. +// If z's precision is 0, it is changed to 53 (and rounding will have +// no effect). SetFloat64 panics with ErrNaN if x is a NaN. +func (z *Float) SetFloat64(x float64) *Float { + if z.prec == 0 { + z.prec = 53 + } + if math.IsNaN(x) { + panic(ErrNaN{"Float.SetFloat64(NaN)"}) + } + z.acc = Exact + z.neg = math.Signbit(x) // handle -0, -Inf correctly + if x == 0 { + z.form = zero + return z + } + if math.IsInf(x, 0) { + z.form = inf + return z + } + // normalized x != 0 + z.form = finite + fmant, exp := math.Frexp(x) // get normalized mantissa + z.mant = z.mant.setUint64(1<<63 | math.Float64bits(fmant)<<11) + z.exp = int32(exp) // always fits + if z.prec < 53 { + z.round(0) + } + return z +} + +// fnorm normalizes mantissa m by shifting it to the left +// such that the msb of the most-significant word (msw) is 1. +// It returns the shift amount. It assumes that len(m) != 0. +func fnorm(m nat) int64 { + if debugFloat && (len(m) == 0 || m[len(m)-1] == 0) { + panic("msw of mantissa is 0") + } + s := nlz(m[len(m)-1]) + if s > 0 { + c := shlVU(m, m, s) + if debugFloat && c != 0 { + panic("nlz or shlVU incorrect") + } + } + return int64(s) +} + +// SetInt sets z to the (possibly rounded) value of x and returns z. +// If z's precision is 0, it is changed to the larger of x.BitLen() +// or 64 (and rounding will have no effect). +func (z *Float) SetInt(x *Int) *Float { + // TODO(gri) can be more efficient if z.prec > 0 + // but small compared to the size of x, or if there + // are many trailing 0's. + bits := uint32(x.BitLen()) + if z.prec == 0 { + z.prec = umax32(bits, 64) + } + z.acc = Exact + z.neg = x.neg + if len(x.abs) == 0 { + z.form = zero + return z + } + // x != 0 + z.mant = z.mant.set(x.abs) + fnorm(z.mant) + z.setExpAndRound(int64(bits), 0) + return z +} + +// SetRat sets z to the (possibly rounded) value of x and returns z. +// If z's precision is 0, it is changed to the largest of a.BitLen(), +// b.BitLen(), or 64; with x = a/b. +func (z *Float) SetRat(x *Rat) *Float { + if x.IsInt() { + return z.SetInt(x.Num()) + } + var a, b Float + a.SetInt(x.Num()) + b.SetInt(x.Denom()) + if z.prec == 0 { + z.prec = umax32(a.prec, b.prec) + } + return z.Quo(&a, &b) +} + +// SetInf sets z to the infinite Float -Inf if signbit is +// set, or +Inf if signbit is not set, and returns z. The +// precision of z is unchanged and the result is always +// Exact. +func (z *Float) SetInf(signbit bool) *Float { + z.acc = Exact + z.form = inf + z.neg = signbit + return z +} + +// Set sets z to the (possibly rounded) value of x and returns z. +// If z's precision is 0, it is changed to the precision of x +// before setting z (and rounding will have no effect). +// Rounding is performed according to z's precision and rounding +// mode; and z's accuracy reports the result error relative to the +// exact (not rounded) result. +func (z *Float) Set(x *Float) *Float { + if debugFloat { + x.validate() + } + z.acc = Exact + if z != x { + z.form = x.form + z.neg = x.neg + if x.form == finite { + z.exp = x.exp + z.mant = z.mant.set(x.mant) + } + if z.prec == 0 { + z.prec = x.prec + } else if z.prec < x.prec { + z.round(0) + } + } + return z +} + +// Copy sets z to x, with the same precision, rounding mode, and +// accuracy as x, and returns z. x is not changed even if z and +// x are the same. +func (z *Float) Copy(x *Float) *Float { + if debugFloat { + x.validate() + } + if z != x { + z.prec = x.prec + z.mode = x.mode + z.acc = x.acc + z.form = x.form + z.neg = x.neg + if z.form == finite { + z.mant = z.mant.set(x.mant) + z.exp = x.exp + } + } + return z +} + +// msb32 returns the 32 most significant bits of x. +func msb32(x nat) uint32 { + i := len(x) - 1 + if i < 0 { + return 0 + } + if debugFloat && x[i]&(1<<(_W-1)) == 0 { + panic("x not normalized") + } + switch _W { + case 32: + return uint32(x[i]) + case 64: + return uint32(x[i] >> 32) + } + panic("unreachable") +} + +// msb64 returns the 64 most significant bits of x. +func msb64(x nat) uint64 { + i := len(x) - 1 + if i < 0 { + return 0 + } + if debugFloat && x[i]&(1<<(_W-1)) == 0 { + panic("x not normalized") + } + switch _W { + case 32: + v := uint64(x[i]) << 32 + if i > 0 { + v |= uint64(x[i-1]) + } + return v + case 64: + return uint64(x[i]) + } + panic("unreachable") +} + +// Uint64 returns the unsigned integer resulting from truncating x +// towards zero. If 0 <= x <= math.MaxUint64, the result is Exact +// if x is an integer and Below otherwise. +// The result is (0, Above) for x < 0, and (math.MaxUint64, Below) +// for x > math.MaxUint64. +func (x *Float) Uint64() (uint64, Accuracy) { + if debugFloat { + x.validate() + } + + switch x.form { + case finite: + if x.neg { + return 0, Above + } + // 0 < x < +Inf + if x.exp <= 0 { + // 0 < x < 1 + return 0, Below + } + // 1 <= x < Inf + if x.exp <= 64 { + // u = trunc(x) fits into a uint64 + u := msb64(x.mant) >> (64 - uint32(x.exp)) + if x.MinPrec() <= 64 { + return u, Exact + } + return u, Below // x truncated + } + // x too large + return math.MaxUint64, Below + + case zero: + return 0, Exact + + case inf: + if x.neg { + return 0, Above + } + return math.MaxUint64, Below + } + + panic("unreachable") +} + +// Int64 returns the integer resulting from truncating x towards zero. +// If math.MinInt64 <= x <= math.MaxInt64, the result is Exact if x is +// an integer, and Above (x < 0) or Below (x > 0) otherwise. +// The result is (math.MinInt64, Above) for x < math.MinInt64, +// and (math.MaxInt64, Below) for x > math.MaxInt64. +func (x *Float) Int64() (int64, Accuracy) { + if debugFloat { + x.validate() + } + + switch x.form { + case finite: + // 0 < |x| < +Inf + acc := makeAcc(x.neg) + if x.exp <= 0 { + // 0 < |x| < 1 + return 0, acc + } + // x.exp > 0 + + // 1 <= |x| < +Inf + if x.exp <= 63 { + // i = trunc(x) fits into an int64 (excluding math.MinInt64) + i := int64(msb64(x.mant) >> (64 - uint32(x.exp))) + if x.neg { + i = -i + } + if x.MinPrec() <= uint(x.exp) { + return i, Exact + } + return i, acc // x truncated + } + if x.neg { + // check for special case x == math.MinInt64 (i.e., x == -(0.5 << 64)) + if x.exp == 64 && x.MinPrec() == 1 { + acc = Exact + } + return math.MinInt64, acc + } + // x too large + return math.MaxInt64, Below + + case zero: + return 0, Exact + + case inf: + if x.neg { + return math.MinInt64, Above + } + return math.MaxInt64, Below + } + + panic("unreachable") +} + +// Float32 returns the float32 value nearest to x. If x is too small to be +// represented by a float32 (|x| < math.SmallestNonzeroFloat32), the result +// is (0, Below) or (-0, Above), respectively, depending on the sign of x. +// If x is too large to be represented by a float32 (|x| > math.MaxFloat32), +// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x. +func (x *Float) Float32() (float32, Accuracy) { + if debugFloat { + x.validate() + } + + switch x.form { + case finite: + // 0 < |x| < +Inf + + const ( + fbits = 32 // float size + mbits = 23 // mantissa size (excluding implicit msb) + ebits = fbits - mbits - 1 // 8 exponent size + bias = 1<<(ebits-1) - 1 // 127 exponent bias + dmin = 1 - bias - mbits // -149 smallest unbiased exponent (denormal) + emin = 1 - bias // -126 smallest unbiased exponent (normal) + emax = bias // 127 largest unbiased exponent (normal) + ) + + // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float32 mantissa. + e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0 + + // Compute precision p for float32 mantissa. + // If the exponent is too small, we have a denormal number before + // rounding and fewer than p mantissa bits of precision available + // (the exponent remains fixed but the mantissa gets shifted right). + p := mbits + 1 // precision of normal float + if e < emin { + // recompute precision + p = mbits + 1 - emin + int(e) + // If p == 0, the mantissa of x is shifted so much to the right + // that its msb falls immediately to the right of the float32 + // mantissa space. In other words, if the smallest denormal is + // considered "1.0", for p == 0, the mantissa value m is >= 0.5. + // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal. + // If m == 0.5, it is rounded down to even, i.e., 0.0. + // If p < 0, the mantissa value m is <= "0.25" which is never rounded up. + if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ { + // underflow to ±0 + if x.neg { + var z float32 + return -z, Above + } + return 0.0, Below + } + // otherwise, round up + // We handle p == 0 explicitly because it's easy and because + // Float.round doesn't support rounding to 0 bits of precision. + if p == 0 { + if x.neg { + return -math.SmallestNonzeroFloat32, Below + } + return math.SmallestNonzeroFloat32, Above + } + } + // p > 0 + + // round + var r Float + r.prec = uint32(p) + r.Set(x) + e = r.exp - 1 + + // Rounding may have caused r to overflow to ±Inf + // (rounding never causes underflows to 0). + // If the exponent is too large, also overflow to ±Inf. + if r.form == inf || e > emax { + // overflow + if x.neg { + return float32(math.Inf(-1)), Below + } + return float32(math.Inf(+1)), Above + } + // e <= emax + + // Determine sign, biased exponent, and mantissa. + var sign, bexp, mant uint32 + if x.neg { + sign = 1 << (fbits - 1) + } + + // Rounding may have caused a denormal number to + // become normal. Check again. + if e < emin { + // denormal number: recompute precision + // Since rounding may have at best increased precision + // and we have eliminated p <= 0 early, we know p > 0. + // bexp == 0 for denormals + p = mbits + 1 - emin + int(e) + mant = msb32(r.mant) >> uint(fbits-p) + } else { + // normal number: emin <= e <= emax + bexp = uint32(e+bias) << mbits + mant = msb32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit) + } + + return math.Float32frombits(sign | bexp | mant), r.acc + + case zero: + if x.neg { + var z float32 + return -z, Exact + } + return 0.0, Exact + + case inf: + if x.neg { + return float32(math.Inf(-1)), Exact + } + return float32(math.Inf(+1)), Exact + } + + panic("unreachable") +} + +// Float64 returns the float64 value nearest to x. If x is too small to be +// represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result +// is (0, Below) or (-0, Above), respectively, depending on the sign of x. +// If x is too large to be represented by a float64 (|x| > math.MaxFloat64), +// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x. +func (x *Float) Float64() (float64, Accuracy) { + if debugFloat { + x.validate() + } + + switch x.form { + case finite: + // 0 < |x| < +Inf + + const ( + fbits = 64 // float size + mbits = 52 // mantissa size (excluding implicit msb) + ebits = fbits - mbits - 1 // 11 exponent size + bias = 1<<(ebits-1) - 1 // 1023 exponent bias + dmin = 1 - bias - mbits // -1074 smallest unbiased exponent (denormal) + emin = 1 - bias // -1022 smallest unbiased exponent (normal) + emax = bias // 1023 largest unbiased exponent (normal) + ) + + // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float64 mantissa. + e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0 + + // Compute precision p for float64 mantissa. + // If the exponent is too small, we have a denormal number before + // rounding and fewer than p mantissa bits of precision available + // (the exponent remains fixed but the mantissa gets shifted right). + p := mbits + 1 // precision of normal float + if e < emin { + // recompute precision + p = mbits + 1 - emin + int(e) + // If p == 0, the mantissa of x is shifted so much to the right + // that its msb falls immediately to the right of the float64 + // mantissa space. In other words, if the smallest denormal is + // considered "1.0", for p == 0, the mantissa value m is >= 0.5. + // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal. + // If m == 0.5, it is rounded down to even, i.e., 0.0. + // If p < 0, the mantissa value m is <= "0.25" which is never rounded up. + if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ { + // underflow to ±0 + if x.neg { + var z float64 + return -z, Above + } + return 0.0, Below + } + // otherwise, round up + // We handle p == 0 explicitly because it's easy and because + // Float.round doesn't support rounding to 0 bits of precision. + if p == 0 { + if x.neg { + return -math.SmallestNonzeroFloat64, Below + } + return math.SmallestNonzeroFloat64, Above + } + } + // p > 0 + + // round + var r Float + r.prec = uint32(p) + r.Set(x) + e = r.exp - 1 + + // Rounding may have caused r to overflow to ±Inf + // (rounding never causes underflows to 0). + // If the exponent is too large, also overflow to ±Inf. + if r.form == inf || e > emax { + // overflow + if x.neg { + return math.Inf(-1), Below + } + return math.Inf(+1), Above + } + // e <= emax + + // Determine sign, biased exponent, and mantissa. + var sign, bexp, mant uint64 + if x.neg { + sign = 1 << (fbits - 1) + } + + // Rounding may have caused a denormal number to + // become normal. Check again. + if e < emin { + // denormal number: recompute precision + // Since rounding may have at best increased precision + // and we have eliminated p <= 0 early, we know p > 0. + // bexp == 0 for denormals + p = mbits + 1 - emin + int(e) + mant = msb64(r.mant) >> uint(fbits-p) + } else { + // normal number: emin <= e <= emax + bexp = uint64(e+bias) << mbits + mant = msb64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit) + } + + return math.Float64frombits(sign | bexp | mant), r.acc + + case zero: + if x.neg { + var z float64 + return -z, Exact + } + return 0.0, Exact + + case inf: + if x.neg { + return math.Inf(-1), Exact + } + return math.Inf(+1), Exact + } + + panic("unreachable") +} + +// Int returns the result of truncating x towards zero; +// or nil if x is an infinity. +// The result is Exact if x.IsInt(); otherwise it is Below +// for x > 0, and Above for x < 0. +// If a non-nil *Int argument z is provided, Int stores +// the result in z instead of allocating a new Int. +func (x *Float) Int(z *Int) (*Int, Accuracy) { + if debugFloat { + x.validate() + } + + if z == nil && x.form <= finite { + z = new(Int) + } + + switch x.form { + case finite: + // 0 < |x| < +Inf + acc := makeAcc(x.neg) + if x.exp <= 0 { + // 0 < |x| < 1 + return z.SetInt64(0), acc + } + // x.exp > 0 + + // 1 <= |x| < +Inf + // determine minimum required precision for x + allBits := uint(len(x.mant)) * _W + exp := uint(x.exp) + if x.MinPrec() <= exp { + acc = Exact + } + // shift mantissa as needed + if z == nil { + z = new(Int) + } + z.neg = x.neg + switch { + case exp > allBits: + z.abs = z.abs.shl(x.mant, exp-allBits) + default: + z.abs = z.abs.set(x.mant) + case exp < allBits: + z.abs = z.abs.shr(x.mant, allBits-exp) + } + return z, acc + + case zero: + return z.SetInt64(0), Exact + + case inf: + return nil, makeAcc(x.neg) + } + + panic("unreachable") +} + +// Rat returns the rational number corresponding to x; +// or nil if x is an infinity. +// The result is Exact if x is not an Inf. +// If a non-nil *Rat argument z is provided, Rat stores +// the result in z instead of allocating a new Rat. +func (x *Float) Rat(z *Rat) (*Rat, Accuracy) { + if debugFloat { + x.validate() + } + + if z == nil && x.form <= finite { + z = new(Rat) + } + + switch x.form { + case finite: + // 0 < |x| < +Inf + allBits := int32(len(x.mant)) * _W + // build up numerator and denominator + z.a.neg = x.neg + switch { + case x.exp > allBits: + z.a.abs = z.a.abs.shl(x.mant, uint(x.exp-allBits)) + z.b.abs = z.b.abs[:0] // == 1 (see Rat) + // z already in normal form + default: + z.a.abs = z.a.abs.set(x.mant) + z.b.abs = z.b.abs[:0] // == 1 (see Rat) + // z already in normal form + case x.exp < allBits: + z.a.abs = z.a.abs.set(x.mant) + t := z.b.abs.setUint64(1) + z.b.abs = t.shl(t, uint(allBits-x.exp)) + z.norm() + } + return z, Exact + + case zero: + return z.SetInt64(0), Exact + + case inf: + return nil, makeAcc(x.neg) + } + + panic("unreachable") +} + +// Abs sets z to the (possibly rounded) value |x| (the absolute value of x) +// and returns z. +func (z *Float) Abs(x *Float) *Float { + z.Set(x) + z.neg = false + return z +} + +// Neg sets z to the (possibly rounded) value of x with its sign negated, +// and returns z. +func (z *Float) Neg(x *Float) *Float { + z.Set(x) + z.neg = !z.neg + return z +} + +func validateBinaryOperands(x, y *Float) { + if !debugFloat { + // avoid performance bugs + panic("validateBinaryOperands called but debugFloat is not set") + } + if len(x.mant) == 0 { + panic("empty mantissa for x") + } + if len(y.mant) == 0 { + panic("empty mantissa for y") + } +} + +// z = x + y, ignoring signs of x and y for the addition +// but using the sign of z for rounding the result. +// x and y must have a non-empty mantissa and valid exponent. +func (z *Float) uadd(x, y *Float) { + // Note: This implementation requires 2 shifts most of the + // time. It is also inefficient if exponents or precisions + // differ by wide margins. The following article describes + // an efficient (but much more complicated) implementation + // compatible with the internal representation used here: + // + // Vincent Lefèvre: "The Generic Multiple-Precision Floating- + // Point Addition With Exact Rounding (as in the MPFR Library)" + // http://www.vinc17.net/research/papers/rnc6.pdf + + if debugFloat { + validateBinaryOperands(x, y) + } + + // compute exponents ex, ey for mantissa with "binary point" + // on the right (mantissa.0) - use int64 to avoid overflow + ex := int64(x.exp) - int64(len(x.mant))*_W + ey := int64(y.exp) - int64(len(y.mant))*_W + + al := alias(z.mant, x.mant) || alias(z.mant, y.mant) + + // TODO(gri) having a combined add-and-shift primitive + // could make this code significantly faster + switch { + case ex < ey: + if al { + t := nat(nil).shl(y.mant, uint(ey-ex)) + z.mant = z.mant.add(x.mant, t) + } else { + z.mant = z.mant.shl(y.mant, uint(ey-ex)) + z.mant = z.mant.add(x.mant, z.mant) + } + default: + // ex == ey, no shift needed + z.mant = z.mant.add(x.mant, y.mant) + case ex > ey: + if al { + t := nat(nil).shl(x.mant, uint(ex-ey)) + z.mant = z.mant.add(t, y.mant) + } else { + z.mant = z.mant.shl(x.mant, uint(ex-ey)) + z.mant = z.mant.add(z.mant, y.mant) + } + ex = ey + } + // len(z.mant) > 0 + + z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0) +} + +// z = x - y for |x| > |y|, ignoring signs of x and y for the subtraction +// but using the sign of z for rounding the result. +// x and y must have a non-empty mantissa and valid exponent. +func (z *Float) usub(x, y *Float) { + // This code is symmetric to uadd. + // We have not factored the common code out because + // eventually uadd (and usub) should be optimized + // by special-casing, and the code will diverge. + + if debugFloat { + validateBinaryOperands(x, y) + } + + ex := int64(x.exp) - int64(len(x.mant))*_W + ey := int64(y.exp) - int64(len(y.mant))*_W + + al := alias(z.mant, x.mant) || alias(z.mant, y.mant) + + switch { + case ex < ey: + if al { + t := nat(nil).shl(y.mant, uint(ey-ex)) + z.mant = t.sub(x.mant, t) + } else { + z.mant = z.mant.shl(y.mant, uint(ey-ex)) + z.mant = z.mant.sub(x.mant, z.mant) + } + default: + // ex == ey, no shift needed + z.mant = z.mant.sub(x.mant, y.mant) + case ex > ey: + if al { + t := nat(nil).shl(x.mant, uint(ex-ey)) + z.mant = t.sub(t, y.mant) + } else { + z.mant = z.mant.shl(x.mant, uint(ex-ey)) + z.mant = z.mant.sub(z.mant, y.mant) + } + ex = ey + } + + // operands may have canceled each other out + if len(z.mant) == 0 { + z.acc = Exact + z.form = zero + z.neg = false + return + } + // len(z.mant) > 0 + + z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0) +} + +// z = x * y, ignoring signs of x and y for the multiplication +// but using the sign of z for rounding the result. +// x and y must have a non-empty mantissa and valid exponent. +func (z *Float) umul(x, y *Float) { + if debugFloat { + validateBinaryOperands(x, y) + } + + // Note: This is doing too much work if the precision + // of z is less than the sum of the precisions of x + // and y which is often the case (e.g., if all floats + // have the same precision). + // TODO(gri) Optimize this for the common case. + + e := int64(x.exp) + int64(y.exp) + if x == y { + z.mant = z.mant.sqr(x.mant) + } else { + z.mant = z.mant.mul(x.mant, y.mant) + } + z.setExpAndRound(e-fnorm(z.mant), 0) +} + +// z = x / y, ignoring signs of x and y for the division +// but using the sign of z for rounding the result. +// x and y must have a non-empty mantissa and valid exponent. +func (z *Float) uquo(x, y *Float) { + if debugFloat { + validateBinaryOperands(x, y) + } + + // mantissa length in words for desired result precision + 1 + // (at least one extra bit so we get the rounding bit after + // the division) + n := int(z.prec/_W) + 1 + + // compute adjusted x.mant such that we get enough result precision + xadj := x.mant + if d := n - len(x.mant) + len(y.mant); d > 0 { + // d extra words needed => add d "0 digits" to x + xadj = make(nat, len(x.mant)+d) + copy(xadj[d:], x.mant) + } + // TODO(gri): If we have too many digits (d < 0), we should be able + // to shorten x for faster division. But we must be extra careful + // with rounding in that case. + + // Compute d before division since there may be aliasing of x.mant + // (via xadj) or y.mant with z.mant. + d := len(xadj) - len(y.mant) + + // divide + var r nat + z.mant, r = z.mant.div(nil, xadj, y.mant) + e := int64(x.exp) - int64(y.exp) - int64(d-len(z.mant))*_W + + // The result is long enough to include (at least) the rounding bit. + // If there's a non-zero remainder, the corresponding fractional part + // (if it were computed), would have a non-zero sticky bit (if it were + // zero, it couldn't have a non-zero remainder). + var sbit uint + if len(r) > 0 { + sbit = 1 + } + + z.setExpAndRound(e-fnorm(z.mant), sbit) +} + +// ucmp returns -1, 0, or +1, depending on whether +// |x| < |y|, |x| == |y|, or |x| > |y|. +// x and y must have a non-empty mantissa and valid exponent. +func (x *Float) ucmp(y *Float) int { + if debugFloat { + validateBinaryOperands(x, y) + } + + switch { + case x.exp < y.exp: + return -1 + case x.exp > y.exp: + return +1 + } + // x.exp == y.exp + + // compare mantissas + i := len(x.mant) + j := len(y.mant) + for i > 0 || j > 0 { + var xm, ym Word + if i > 0 { + i-- + xm = x.mant[i] + } + if j > 0 { + j-- + ym = y.mant[j] + } + switch { + case xm < ym: + return -1 + case xm > ym: + return +1 + } + } + + return 0 +} + +// Handling of sign bit as defined by IEEE 754-2008, section 6.3: +// +// When neither the inputs nor result are NaN, the sign of a product or +// quotient is the exclusive OR of the operands’ signs; the sign of a sum, +// or of a difference x−y regarded as a sum x+(−y), differs from at most +// one of the addends’ signs; and the sign of the result of conversions, +// the quantize operation, the roundToIntegral operations, and the +// roundToIntegralExact (see 5.3.1) is the sign of the first or only operand. +// These rules shall apply even when operands or results are zero or infinite. +// +// When the sum of two operands with opposite signs (or the difference of +// two operands with like signs) is exactly zero, the sign of that sum (or +// difference) shall be +0 in all rounding-direction attributes except +// roundTowardNegative; under that attribute, the sign of an exact zero +// sum (or difference) shall be −0. However, x+x = x−(−x) retains the same +// sign as x even when x is zero. +// +// See also: https://play.golang.org/p/RtH3UCt5IH + +// Add sets z to the rounded sum x+y and returns z. If z's precision is 0, +// it is changed to the larger of x's or y's precision before the operation. +// Rounding is performed according to z's precision and rounding mode; and +// z's accuracy reports the result error relative to the exact (not rounded) +// result. Add panics with ErrNaN if x and y are infinities with opposite +// signs. The value of z is undefined in that case. +func (z *Float) Add(x, y *Float) *Float { + if debugFloat { + x.validate() + y.validate() + } + + if z.prec == 0 { + z.prec = umax32(x.prec, y.prec) + } + + if x.form == finite && y.form == finite { + // x + y (common case) + + // Below we set z.neg = x.neg, and when z aliases y this will + // change the y operand's sign. This is fine, because if an + // operand aliases the receiver it'll be overwritten, but we still + // want the original x.neg and y.neg values when we evaluate + // x.neg != y.neg, so we need to save y.neg before setting z.neg. + yneg := y.neg + + z.neg = x.neg + if x.neg == yneg { + // x + y == x + y + // (-x) + (-y) == -(x + y) + z.uadd(x, y) + } else { + // x + (-y) == x - y == -(y - x) + // (-x) + y == y - x == -(x - y) + if x.ucmp(y) > 0 { + z.usub(x, y) + } else { + z.neg = !z.neg + z.usub(y, x) + } + } + if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact { + z.neg = true + } + return z + } + + if x.form == inf && y.form == inf && x.neg != y.neg { + // +Inf + -Inf + // -Inf + +Inf + // value of z is undefined but make sure it's valid + z.acc = Exact + z.form = zero + z.neg = false + panic(ErrNaN{"addition of infinities with opposite signs"}) + } + + if x.form == zero && y.form == zero { + // ±0 + ±0 + z.acc = Exact + z.form = zero + z.neg = x.neg && y.neg // -0 + -0 == -0 + return z + } + + if x.form == inf || y.form == zero { + // ±Inf + y + // x + ±0 + return z.Set(x) + } + + // ±0 + y + // x + ±Inf + return z.Set(y) +} + +// Sub sets z to the rounded difference x-y and returns z. +// Precision, rounding, and accuracy reporting are as for Add. +// Sub panics with ErrNaN if x and y are infinities with equal +// signs. The value of z is undefined in that case. +func (z *Float) Sub(x, y *Float) *Float { + if debugFloat { + x.validate() + y.validate() + } + + if z.prec == 0 { + z.prec = umax32(x.prec, y.prec) + } + + if x.form == finite && y.form == finite { + // x - y (common case) + yneg := y.neg + z.neg = x.neg + if x.neg != yneg { + // x - (-y) == x + y + // (-x) - y == -(x + y) + z.uadd(x, y) + } else { + // x - y == x - y == -(y - x) + // (-x) - (-y) == y - x == -(x - y) + if x.ucmp(y) > 0 { + z.usub(x, y) + } else { + z.neg = !z.neg + z.usub(y, x) + } + } + if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact { + z.neg = true + } + return z + } + + if x.form == inf && y.form == inf && x.neg == y.neg { + // +Inf - +Inf + // -Inf - -Inf + // value of z is undefined but make sure it's valid + z.acc = Exact + z.form = zero + z.neg = false + panic(ErrNaN{"subtraction of infinities with equal signs"}) + } + + if x.form == zero && y.form == zero { + // ±0 - ±0 + z.acc = Exact + z.form = zero + z.neg = x.neg && !y.neg // -0 - +0 == -0 + return z + } + + if x.form == inf || y.form == zero { + // ±Inf - y + // x - ±0 + return z.Set(x) + } + + // ±0 - y + // x - ±Inf + return z.Neg(y) +} + +// Mul sets z to the rounded product x*y and returns z. +// Precision, rounding, and accuracy reporting are as for Add. +// Mul panics with ErrNaN if one operand is zero and the other +// operand an infinity. The value of z is undefined in that case. +func (z *Float) Mul(x, y *Float) *Float { + if debugFloat { + x.validate() + y.validate() + } + + if z.prec == 0 { + z.prec = umax32(x.prec, y.prec) + } + + z.neg = x.neg != y.neg + + if x.form == finite && y.form == finite { + // x * y (common case) + z.umul(x, y) + return z + } + + z.acc = Exact + if x.form == zero && y.form == inf || x.form == inf && y.form == zero { + // ±0 * ±Inf + // ±Inf * ±0 + // value of z is undefined but make sure it's valid + z.form = zero + z.neg = false + panic(ErrNaN{"multiplication of zero with infinity"}) + } + + if x.form == inf || y.form == inf { + // ±Inf * y + // x * ±Inf + z.form = inf + return z + } + + // ±0 * y + // x * ±0 + z.form = zero + return z +} + +// Quo sets z to the rounded quotient x/y and returns z. +// Precision, rounding, and accuracy reporting are as for Add. +// Quo panics with ErrNaN if both operands are zero or infinities. +// The value of z is undefined in that case. +func (z *Float) Quo(x, y *Float) *Float { + if debugFloat { + x.validate() + y.validate() + } + + if z.prec == 0 { + z.prec = umax32(x.prec, y.prec) + } + + z.neg = x.neg != y.neg + + if x.form == finite && y.form == finite { + // x / y (common case) + z.uquo(x, y) + return z + } + + z.acc = Exact + if x.form == zero && y.form == zero || x.form == inf && y.form == inf { + // ±0 / ±0 + // ±Inf / ±Inf + // value of z is undefined but make sure it's valid + z.form = zero + z.neg = false + panic(ErrNaN{"division of zero by zero or infinity by infinity"}) + } + + if x.form == zero || y.form == inf { + // ±0 / y + // x / ±Inf + z.form = zero + return z + } + + // x / ±0 + // ±Inf / y + z.form = inf + return z +} + +// Cmp compares x and y and returns: +// +// -1 if x < y +// 0 if x == y (incl. -0 == 0, -Inf == -Inf, and +Inf == +Inf) +// +1 if x > y +// +func (x *Float) Cmp(y *Float) int { + if debugFloat { + x.validate() + y.validate() + } + + mx := x.ord() + my := y.ord() + switch { + case mx < my: + return -1 + case mx > my: + return +1 + } + // mx == my + + // only if |mx| == 1 we have to compare the mantissae + switch mx { + case -1: + return y.ucmp(x) + case +1: + return x.ucmp(y) + } + + return 0 +} + +// ord classifies x and returns: +// +// -2 if -Inf == x +// -1 if -Inf < x < 0 +// 0 if x == 0 (signed or unsigned) +// +1 if 0 < x < +Inf +// +2 if x == +Inf +// +func (x *Float) ord() int { + var m int + switch x.form { + case finite: + m = 1 + case zero: + return 0 + case inf: + m = 2 + } + if x.neg { + m = -m + } + return m +} + +func umax32(x, y uint32) uint32 { + if x > y { + return x + } + return y +} |