summaryrefslogtreecommitdiffstats
path: root/src/math/big/float.go
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-16 19:23:18 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-16 19:23:18 +0000
commit43a123c1ae6613b3efeed291fa552ecd909d3acf (patch)
treefd92518b7024bc74031f78a1cf9e454b65e73665 /src/math/big/float.go
parentInitial commit. (diff)
downloadgolang-1.20-43a123c1ae6613b3efeed291fa552ecd909d3acf.tar.xz
golang-1.20-43a123c1ae6613b3efeed291fa552ecd909d3acf.zip
Adding upstream version 1.20.14.upstream/1.20.14upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/math/big/float.go')
-rw-r--r--src/math/big/float.go1729
1 files changed, 1729 insertions, 0 deletions
diff --git a/src/math/big/float.go b/src/math/big/float.go
new file mode 100644
index 0000000..84666d8
--- /dev/null
+++ b/src/math/big/float.go
@@ -0,0 +1,1729 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file implements multi-precision floating-point numbers.
+// Like in the GNU MPFR library (https://www.mpfr.org/), operands
+// can be of mixed precision. Unlike MPFR, the rounding mode is
+// not specified with each operation, but with each operand. The
+// rounding mode of the result operand determines the rounding
+// mode of an operation. This is a from-scratch implementation.
+
+package big
+
+import (
+ "fmt"
+ "math"
+ "math/bits"
+)
+
+const debugFloat = false // enable for debugging
+
+// A nonzero finite Float represents a multi-precision floating point number
+//
+// sign × mantissa × 2**exponent
+//
+// with 0.5 <= mantissa < 1.0, and MinExp <= exponent <= MaxExp.
+// A Float may also be zero (+0, -0) or infinite (+Inf, -Inf).
+// All Floats are ordered, and the ordering of two Floats x and y
+// is defined by x.Cmp(y).
+//
+// Each Float value also has a precision, rounding mode, and accuracy.
+// The precision is the maximum number of mantissa bits available to
+// represent the value. The rounding mode specifies how a result should
+// be rounded to fit into the mantissa bits, and accuracy describes the
+// rounding error with respect to the exact result.
+//
+// Unless specified otherwise, all operations (including setters) that
+// specify a *Float variable for the result (usually via the receiver
+// with the exception of MantExp), round the numeric result according
+// to the precision and rounding mode of the result variable.
+//
+// If the provided result precision is 0 (see below), it is set to the
+// precision of the argument with the largest precision value before any
+// rounding takes place, and the rounding mode remains unchanged. Thus,
+// uninitialized Floats provided as result arguments will have their
+// precision set to a reasonable value determined by the operands, and
+// their mode is the zero value for RoundingMode (ToNearestEven).
+//
+// By setting the desired precision to 24 or 53 and using matching rounding
+// mode (typically ToNearestEven), Float operations produce the same results
+// as the corresponding float32 or float64 IEEE-754 arithmetic for operands
+// that correspond to normal (i.e., not denormal) float32 or float64 numbers.
+// Exponent underflow and overflow lead to a 0 or an Infinity for different
+// values than IEEE-754 because Float exponents have a much larger range.
+//
+// The zero (uninitialized) value for a Float is ready to use and represents
+// the number +0.0 exactly, with precision 0 and rounding mode ToNearestEven.
+//
+// Operations always take pointer arguments (*Float) rather
+// than Float values, and each unique Float value requires
+// its own unique *Float pointer. To "copy" a Float value,
+// an existing (or newly allocated) Float must be set to
+// a new value using the Float.Set method; shallow copies
+// of Floats are not supported and may lead to errors.
+type Float struct {
+ prec uint32
+ mode RoundingMode
+ acc Accuracy
+ form form
+ neg bool
+ mant nat
+ exp int32
+}
+
+// An ErrNaN panic is raised by a Float operation that would lead to
+// a NaN under IEEE-754 rules. An ErrNaN implements the error interface.
+type ErrNaN struct {
+ msg string
+}
+
+func (err ErrNaN) Error() string {
+ return err.msg
+}
+
+// NewFloat allocates and returns a new Float set to x,
+// with precision 53 and rounding mode ToNearestEven.
+// NewFloat panics with ErrNaN if x is a NaN.
+func NewFloat(x float64) *Float {
+ if math.IsNaN(x) {
+ panic(ErrNaN{"NewFloat(NaN)"})
+ }
+ return new(Float).SetFloat64(x)
+}
+
+// Exponent and precision limits.
+const (
+ MaxExp = math.MaxInt32 // largest supported exponent
+ MinExp = math.MinInt32 // smallest supported exponent
+ MaxPrec = math.MaxUint32 // largest (theoretically) supported precision; likely memory-limited
+)
+
+// Internal representation: The mantissa bits x.mant of a nonzero finite
+// Float x are stored in a nat slice long enough to hold up to x.prec bits;
+// the slice may (but doesn't have to) be shorter if the mantissa contains
+// trailing 0 bits. x.mant is normalized if the msb of x.mant == 1 (i.e.,
+// the msb is shifted all the way "to the left"). Thus, if the mantissa has
+// trailing 0 bits or x.prec is not a multiple of the Word size _W,
+// x.mant[0] has trailing zero bits. The msb of the mantissa corresponds
+// to the value 0.5; the exponent x.exp shifts the binary point as needed.
+//
+// A zero or non-finite Float x ignores x.mant and x.exp.
+//
+// x form neg mant exp
+// ----------------------------------------------------------
+// ±0 zero sign - -
+// 0 < |x| < +Inf finite sign mantissa exponent
+// ±Inf inf sign - -
+
+// A form value describes the internal representation.
+type form byte
+
+// The form value order is relevant - do not change!
+const (
+ zero form = iota
+ finite
+ inf
+)
+
+// RoundingMode determines how a Float value is rounded to the
+// desired precision. Rounding may change the Float value; the
+// rounding error is described by the Float's Accuracy.
+type RoundingMode byte
+
+// These constants define supported rounding modes.
+const (
+ ToNearestEven RoundingMode = iota // == IEEE 754-2008 roundTiesToEven
+ ToNearestAway // == IEEE 754-2008 roundTiesToAway
+ ToZero // == IEEE 754-2008 roundTowardZero
+ AwayFromZero // no IEEE 754-2008 equivalent
+ ToNegativeInf // == IEEE 754-2008 roundTowardNegative
+ ToPositiveInf // == IEEE 754-2008 roundTowardPositive
+)
+
+//go:generate stringer -type=RoundingMode
+
+// Accuracy describes the rounding error produced by the most recent
+// operation that generated a Float value, relative to the exact value.
+type Accuracy int8
+
+// Constants describing the Accuracy of a Float.
+const (
+ Below Accuracy = -1
+ Exact Accuracy = 0
+ Above Accuracy = +1
+)
+
+//go:generate stringer -type=Accuracy
+
+// SetPrec sets z's precision to prec and returns the (possibly) rounded
+// value of z. Rounding occurs according to z's rounding mode if the mantissa
+// cannot be represented in prec bits without loss of precision.
+// SetPrec(0) maps all finite values to ±0; infinite values remain unchanged.
+// If prec > MaxPrec, it is set to MaxPrec.
+func (z *Float) SetPrec(prec uint) *Float {
+ z.acc = Exact // optimistically assume no rounding is needed
+
+ // special case
+ if prec == 0 {
+ z.prec = 0
+ if z.form == finite {
+ // truncate z to 0
+ z.acc = makeAcc(z.neg)
+ z.form = zero
+ }
+ return z
+ }
+
+ // general case
+ if prec > MaxPrec {
+ prec = MaxPrec
+ }
+ old := z.prec
+ z.prec = uint32(prec)
+ if z.prec < old {
+ z.round(0)
+ }
+ return z
+}
+
+func makeAcc(above bool) Accuracy {
+ if above {
+ return Above
+ }
+ return Below
+}
+
+// SetMode sets z's rounding mode to mode and returns an exact z.
+// z remains unchanged otherwise.
+// z.SetMode(z.Mode()) is a cheap way to set z's accuracy to Exact.
+func (z *Float) SetMode(mode RoundingMode) *Float {
+ z.mode = mode
+ z.acc = Exact
+ return z
+}
+
+// Prec returns the mantissa precision of x in bits.
+// The result may be 0 for |x| == 0 and |x| == Inf.
+func (x *Float) Prec() uint {
+ return uint(x.prec)
+}
+
+// MinPrec returns the minimum precision required to represent x exactly
+// (i.e., the smallest prec before x.SetPrec(prec) would start rounding x).
+// The result is 0 for |x| == 0 and |x| == Inf.
+func (x *Float) MinPrec() uint {
+ if x.form != finite {
+ return 0
+ }
+ return uint(len(x.mant))*_W - x.mant.trailingZeroBits()
+}
+
+// Mode returns the rounding mode of x.
+func (x *Float) Mode() RoundingMode {
+ return x.mode
+}
+
+// Acc returns the accuracy of x produced by the most recent
+// operation, unless explicitly documented otherwise by that
+// operation.
+func (x *Float) Acc() Accuracy {
+ return x.acc
+}
+
+// Sign returns:
+//
+// -1 if x < 0
+// 0 if x is ±0
+// +1 if x > 0
+func (x *Float) Sign() int {
+ if debugFloat {
+ x.validate()
+ }
+ if x.form == zero {
+ return 0
+ }
+ if x.neg {
+ return -1
+ }
+ return 1
+}
+
+// MantExp breaks x into its mantissa and exponent components
+// and returns the exponent. If a non-nil mant argument is
+// provided its value is set to the mantissa of x, with the
+// same precision and rounding mode as x. The components
+// satisfy x == mant × 2**exp, with 0.5 <= |mant| < 1.0.
+// Calling MantExp with a nil argument is an efficient way to
+// get the exponent of the receiver.
+//
+// Special cases are:
+//
+// ( ±0).MantExp(mant) = 0, with mant set to ±0
+// (±Inf).MantExp(mant) = 0, with mant set to ±Inf
+//
+// x and mant may be the same in which case x is set to its
+// mantissa value.
+func (x *Float) MantExp(mant *Float) (exp int) {
+ if debugFloat {
+ x.validate()
+ }
+ if x.form == finite {
+ exp = int(x.exp)
+ }
+ if mant != nil {
+ mant.Copy(x)
+ if mant.form == finite {
+ mant.exp = 0
+ }
+ }
+ return
+}
+
+func (z *Float) setExpAndRound(exp int64, sbit uint) {
+ if exp < MinExp {
+ // underflow
+ z.acc = makeAcc(z.neg)
+ z.form = zero
+ return
+ }
+
+ if exp > MaxExp {
+ // overflow
+ z.acc = makeAcc(!z.neg)
+ z.form = inf
+ return
+ }
+
+ z.form = finite
+ z.exp = int32(exp)
+ z.round(sbit)
+}
+
+// SetMantExp sets z to mant × 2**exp and returns z.
+// The result z has the same precision and rounding mode
+// as mant. SetMantExp is an inverse of MantExp but does
+// not require 0.5 <= |mant| < 1.0. Specifically, for a
+// given x of type *Float, SetMantExp relates to MantExp
+// as follows:
+//
+// mant := new(Float)
+// new(Float).SetMantExp(mant, x.MantExp(mant)).Cmp(x) == 0
+//
+// Special cases are:
+//
+// z.SetMantExp( ±0, exp) = ±0
+// z.SetMantExp(±Inf, exp) = ±Inf
+//
+// z and mant may be the same in which case z's exponent
+// is set to exp.
+func (z *Float) SetMantExp(mant *Float, exp int) *Float {
+ if debugFloat {
+ z.validate()
+ mant.validate()
+ }
+ z.Copy(mant)
+
+ if z.form == finite {
+ // 0 < |mant| < +Inf
+ z.setExpAndRound(int64(z.exp)+int64(exp), 0)
+ }
+ return z
+}
+
+// Signbit reports whether x is negative or negative zero.
+func (x *Float) Signbit() bool {
+ return x.neg
+}
+
+// IsInf reports whether x is +Inf or -Inf.
+func (x *Float) IsInf() bool {
+ return x.form == inf
+}
+
+// IsInt reports whether x is an integer.
+// ±Inf values are not integers.
+func (x *Float) IsInt() bool {
+ if debugFloat {
+ x.validate()
+ }
+ // special cases
+ if x.form != finite {
+ return x.form == zero
+ }
+ // x.form == finite
+ if x.exp <= 0 {
+ return false
+ }
+ // x.exp > 0
+ return x.prec <= uint32(x.exp) || x.MinPrec() <= uint(x.exp) // not enough bits for fractional mantissa
+}
+
+// debugging support
+func (x *Float) validate() {
+ if !debugFloat {
+ // avoid performance bugs
+ panic("validate called but debugFloat is not set")
+ }
+ if x.form != finite {
+ return
+ }
+ m := len(x.mant)
+ if m == 0 {
+ panic("nonzero finite number with empty mantissa")
+ }
+ const msb = 1 << (_W - 1)
+ if x.mant[m-1]&msb == 0 {
+ panic(fmt.Sprintf("msb not set in last word %#x of %s", x.mant[m-1], x.Text('p', 0)))
+ }
+ if x.prec == 0 {
+ panic("zero precision finite number")
+ }
+}
+
+// round rounds z according to z.mode to z.prec bits and sets z.acc accordingly.
+// sbit must be 0 or 1 and summarizes any "sticky bit" information one might
+// have before calling round. z's mantissa must be normalized (with the msb set)
+// or empty.
+//
+// CAUTION: The rounding modes ToNegativeInf, ToPositiveInf are affected by the
+// sign of z. For correct rounding, the sign of z must be set correctly before
+// calling round.
+func (z *Float) round(sbit uint) {
+ if debugFloat {
+ z.validate()
+ }
+
+ z.acc = Exact
+ if z.form != finite {
+ // ±0 or ±Inf => nothing left to do
+ return
+ }
+ // z.form == finite && len(z.mant) > 0
+ // m > 0 implies z.prec > 0 (checked by validate)
+
+ m := uint32(len(z.mant)) // present mantissa length in words
+ bits := m * _W // present mantissa bits; bits > 0
+ if bits <= z.prec {
+ // mantissa fits => nothing to do
+ return
+ }
+ // bits > z.prec
+
+ // Rounding is based on two bits: the rounding bit (rbit) and the
+ // sticky bit (sbit). The rbit is the bit immediately before the
+ // z.prec leading mantissa bits (the "0.5"). The sbit is set if any
+ // of the bits before the rbit are set (the "0.25", "0.125", etc.):
+ //
+ // rbit sbit => "fractional part"
+ //
+ // 0 0 == 0
+ // 0 1 > 0 , < 0.5
+ // 1 0 == 0.5
+ // 1 1 > 0.5, < 1.0
+
+ // bits > z.prec: mantissa too large => round
+ r := uint(bits - z.prec - 1) // rounding bit position; r >= 0
+ rbit := z.mant.bit(r) & 1 // rounding bit; be safe and ensure it's a single bit
+ // The sticky bit is only needed for rounding ToNearestEven
+ // or when the rounding bit is zero. Avoid computation otherwise.
+ if sbit == 0 && (rbit == 0 || z.mode == ToNearestEven) {
+ sbit = z.mant.sticky(r)
+ }
+ sbit &= 1 // be safe and ensure it's a single bit
+
+ // cut off extra words
+ n := (z.prec + (_W - 1)) / _W // mantissa length in words for desired precision
+ if m > n {
+ copy(z.mant, z.mant[m-n:]) // move n last words to front
+ z.mant = z.mant[:n]
+ }
+
+ // determine number of trailing zero bits (ntz) and compute lsb mask of mantissa's least-significant word
+ ntz := n*_W - z.prec // 0 <= ntz < _W
+ lsb := Word(1) << ntz
+
+ // round if result is inexact
+ if rbit|sbit != 0 {
+ // Make rounding decision: The result mantissa is truncated ("rounded down")
+ // by default. Decide if we need to increment, or "round up", the (unsigned)
+ // mantissa.
+ inc := false
+ switch z.mode {
+ case ToNegativeInf:
+ inc = z.neg
+ case ToZero:
+ // nothing to do
+ case ToNearestEven:
+ inc = rbit != 0 && (sbit != 0 || z.mant[0]&lsb != 0)
+ case ToNearestAway:
+ inc = rbit != 0
+ case AwayFromZero:
+ inc = true
+ case ToPositiveInf:
+ inc = !z.neg
+ default:
+ panic("unreachable")
+ }
+
+ // A positive result (!z.neg) is Above the exact result if we increment,
+ // and it's Below if we truncate (Exact results require no rounding).
+ // For a negative result (z.neg) it is exactly the opposite.
+ z.acc = makeAcc(inc != z.neg)
+
+ if inc {
+ // add 1 to mantissa
+ if addVW(z.mant, z.mant, lsb) != 0 {
+ // mantissa overflow => adjust exponent
+ if z.exp >= MaxExp {
+ // exponent overflow
+ z.form = inf
+ return
+ }
+ z.exp++
+ // adjust mantissa: divide by 2 to compensate for exponent adjustment
+ shrVU(z.mant, z.mant, 1)
+ // set msb == carry == 1 from the mantissa overflow above
+ const msb = 1 << (_W - 1)
+ z.mant[n-1] |= msb
+ }
+ }
+ }
+
+ // zero out trailing bits in least-significant word
+ z.mant[0] &^= lsb - 1
+
+ if debugFloat {
+ z.validate()
+ }
+}
+
+func (z *Float) setBits64(neg bool, x uint64) *Float {
+ if z.prec == 0 {
+ z.prec = 64
+ }
+ z.acc = Exact
+ z.neg = neg
+ if x == 0 {
+ z.form = zero
+ return z
+ }
+ // x != 0
+ z.form = finite
+ s := bits.LeadingZeros64(x)
+ z.mant = z.mant.setUint64(x << uint(s))
+ z.exp = int32(64 - s) // always fits
+ if z.prec < 64 {
+ z.round(0)
+ }
+ return z
+}
+
+// SetUint64 sets z to the (possibly rounded) value of x and returns z.
+// If z's precision is 0, it is changed to 64 (and rounding will have
+// no effect).
+func (z *Float) SetUint64(x uint64) *Float {
+ return z.setBits64(false, x)
+}
+
+// SetInt64 sets z to the (possibly rounded) value of x and returns z.
+// If z's precision is 0, it is changed to 64 (and rounding will have
+// no effect).
+func (z *Float) SetInt64(x int64) *Float {
+ u := x
+ if u < 0 {
+ u = -u
+ }
+ // We cannot simply call z.SetUint64(uint64(u)) and change
+ // the sign afterwards because the sign affects rounding.
+ return z.setBits64(x < 0, uint64(u))
+}
+
+// SetFloat64 sets z to the (possibly rounded) value of x and returns z.
+// If z's precision is 0, it is changed to 53 (and rounding will have
+// no effect). SetFloat64 panics with ErrNaN if x is a NaN.
+func (z *Float) SetFloat64(x float64) *Float {
+ if z.prec == 0 {
+ z.prec = 53
+ }
+ if math.IsNaN(x) {
+ panic(ErrNaN{"Float.SetFloat64(NaN)"})
+ }
+ z.acc = Exact
+ z.neg = math.Signbit(x) // handle -0, -Inf correctly
+ if x == 0 {
+ z.form = zero
+ return z
+ }
+ if math.IsInf(x, 0) {
+ z.form = inf
+ return z
+ }
+ // normalized x != 0
+ z.form = finite
+ fmant, exp := math.Frexp(x) // get normalized mantissa
+ z.mant = z.mant.setUint64(1<<63 | math.Float64bits(fmant)<<11)
+ z.exp = int32(exp) // always fits
+ if z.prec < 53 {
+ z.round(0)
+ }
+ return z
+}
+
+// fnorm normalizes mantissa m by shifting it to the left
+// such that the msb of the most-significant word (msw) is 1.
+// It returns the shift amount. It assumes that len(m) != 0.
+func fnorm(m nat) int64 {
+ if debugFloat && (len(m) == 0 || m[len(m)-1] == 0) {
+ panic("msw of mantissa is 0")
+ }
+ s := nlz(m[len(m)-1])
+ if s > 0 {
+ c := shlVU(m, m, s)
+ if debugFloat && c != 0 {
+ panic("nlz or shlVU incorrect")
+ }
+ }
+ return int64(s)
+}
+
+// SetInt sets z to the (possibly rounded) value of x and returns z.
+// If z's precision is 0, it is changed to the larger of x.BitLen()
+// or 64 (and rounding will have no effect).
+func (z *Float) SetInt(x *Int) *Float {
+ // TODO(gri) can be more efficient if z.prec > 0
+ // but small compared to the size of x, or if there
+ // are many trailing 0's.
+ bits := uint32(x.BitLen())
+ if z.prec == 0 {
+ z.prec = umax32(bits, 64)
+ }
+ z.acc = Exact
+ z.neg = x.neg
+ if len(x.abs) == 0 {
+ z.form = zero
+ return z
+ }
+ // x != 0
+ z.mant = z.mant.set(x.abs)
+ fnorm(z.mant)
+ z.setExpAndRound(int64(bits), 0)
+ return z
+}
+
+// SetRat sets z to the (possibly rounded) value of x and returns z.
+// If z's precision is 0, it is changed to the largest of a.BitLen(),
+// b.BitLen(), or 64; with x = a/b.
+func (z *Float) SetRat(x *Rat) *Float {
+ if x.IsInt() {
+ return z.SetInt(x.Num())
+ }
+ var a, b Float
+ a.SetInt(x.Num())
+ b.SetInt(x.Denom())
+ if z.prec == 0 {
+ z.prec = umax32(a.prec, b.prec)
+ }
+ return z.Quo(&a, &b)
+}
+
+// SetInf sets z to the infinite Float -Inf if signbit is
+// set, or +Inf if signbit is not set, and returns z. The
+// precision of z is unchanged and the result is always
+// Exact.
+func (z *Float) SetInf(signbit bool) *Float {
+ z.acc = Exact
+ z.form = inf
+ z.neg = signbit
+ return z
+}
+
+// Set sets z to the (possibly rounded) value of x and returns z.
+// If z's precision is 0, it is changed to the precision of x
+// before setting z (and rounding will have no effect).
+// Rounding is performed according to z's precision and rounding
+// mode; and z's accuracy reports the result error relative to the
+// exact (not rounded) result.
+func (z *Float) Set(x *Float) *Float {
+ if debugFloat {
+ x.validate()
+ }
+ z.acc = Exact
+ if z != x {
+ z.form = x.form
+ z.neg = x.neg
+ if x.form == finite {
+ z.exp = x.exp
+ z.mant = z.mant.set(x.mant)
+ }
+ if z.prec == 0 {
+ z.prec = x.prec
+ } else if z.prec < x.prec {
+ z.round(0)
+ }
+ }
+ return z
+}
+
+// Copy sets z to x, with the same precision, rounding mode, and
+// accuracy as x, and returns z. x is not changed even if z and
+// x are the same.
+func (z *Float) Copy(x *Float) *Float {
+ if debugFloat {
+ x.validate()
+ }
+ if z != x {
+ z.prec = x.prec
+ z.mode = x.mode
+ z.acc = x.acc
+ z.form = x.form
+ z.neg = x.neg
+ if z.form == finite {
+ z.mant = z.mant.set(x.mant)
+ z.exp = x.exp
+ }
+ }
+ return z
+}
+
+// msb32 returns the 32 most significant bits of x.
+func msb32(x nat) uint32 {
+ i := len(x) - 1
+ if i < 0 {
+ return 0
+ }
+ if debugFloat && x[i]&(1<<(_W-1)) == 0 {
+ panic("x not normalized")
+ }
+ switch _W {
+ case 32:
+ return uint32(x[i])
+ case 64:
+ return uint32(x[i] >> 32)
+ }
+ panic("unreachable")
+}
+
+// msb64 returns the 64 most significant bits of x.
+func msb64(x nat) uint64 {
+ i := len(x) - 1
+ if i < 0 {
+ return 0
+ }
+ if debugFloat && x[i]&(1<<(_W-1)) == 0 {
+ panic("x not normalized")
+ }
+ switch _W {
+ case 32:
+ v := uint64(x[i]) << 32
+ if i > 0 {
+ v |= uint64(x[i-1])
+ }
+ return v
+ case 64:
+ return uint64(x[i])
+ }
+ panic("unreachable")
+}
+
+// Uint64 returns the unsigned integer resulting from truncating x
+// towards zero. If 0 <= x <= math.MaxUint64, the result is Exact
+// if x is an integer and Below otherwise.
+// The result is (0, Above) for x < 0, and (math.MaxUint64, Below)
+// for x > math.MaxUint64.
+func (x *Float) Uint64() (uint64, Accuracy) {
+ if debugFloat {
+ x.validate()
+ }
+
+ switch x.form {
+ case finite:
+ if x.neg {
+ return 0, Above
+ }
+ // 0 < x < +Inf
+ if x.exp <= 0 {
+ // 0 < x < 1
+ return 0, Below
+ }
+ // 1 <= x < Inf
+ if x.exp <= 64 {
+ // u = trunc(x) fits into a uint64
+ u := msb64(x.mant) >> (64 - uint32(x.exp))
+ if x.MinPrec() <= 64 {
+ return u, Exact
+ }
+ return u, Below // x truncated
+ }
+ // x too large
+ return math.MaxUint64, Below
+
+ case zero:
+ return 0, Exact
+
+ case inf:
+ if x.neg {
+ return 0, Above
+ }
+ return math.MaxUint64, Below
+ }
+
+ panic("unreachable")
+}
+
+// Int64 returns the integer resulting from truncating x towards zero.
+// If math.MinInt64 <= x <= math.MaxInt64, the result is Exact if x is
+// an integer, and Above (x < 0) or Below (x > 0) otherwise.
+// The result is (math.MinInt64, Above) for x < math.MinInt64,
+// and (math.MaxInt64, Below) for x > math.MaxInt64.
+func (x *Float) Int64() (int64, Accuracy) {
+ if debugFloat {
+ x.validate()
+ }
+
+ switch x.form {
+ case finite:
+ // 0 < |x| < +Inf
+ acc := makeAcc(x.neg)
+ if x.exp <= 0 {
+ // 0 < |x| < 1
+ return 0, acc
+ }
+ // x.exp > 0
+
+ // 1 <= |x| < +Inf
+ if x.exp <= 63 {
+ // i = trunc(x) fits into an int64 (excluding math.MinInt64)
+ i := int64(msb64(x.mant) >> (64 - uint32(x.exp)))
+ if x.neg {
+ i = -i
+ }
+ if x.MinPrec() <= uint(x.exp) {
+ return i, Exact
+ }
+ return i, acc // x truncated
+ }
+ if x.neg {
+ // check for special case x == math.MinInt64 (i.e., x == -(0.5 << 64))
+ if x.exp == 64 && x.MinPrec() == 1 {
+ acc = Exact
+ }
+ return math.MinInt64, acc
+ }
+ // x too large
+ return math.MaxInt64, Below
+
+ case zero:
+ return 0, Exact
+
+ case inf:
+ if x.neg {
+ return math.MinInt64, Above
+ }
+ return math.MaxInt64, Below
+ }
+
+ panic("unreachable")
+}
+
+// Float32 returns the float32 value nearest to x. If x is too small to be
+// represented by a float32 (|x| < math.SmallestNonzeroFloat32), the result
+// is (0, Below) or (-0, Above), respectively, depending on the sign of x.
+// If x is too large to be represented by a float32 (|x| > math.MaxFloat32),
+// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
+func (x *Float) Float32() (float32, Accuracy) {
+ if debugFloat {
+ x.validate()
+ }
+
+ switch x.form {
+ case finite:
+ // 0 < |x| < +Inf
+
+ const (
+ fbits = 32 // float size
+ mbits = 23 // mantissa size (excluding implicit msb)
+ ebits = fbits - mbits - 1 // 8 exponent size
+ bias = 1<<(ebits-1) - 1 // 127 exponent bias
+ dmin = 1 - bias - mbits // -149 smallest unbiased exponent (denormal)
+ emin = 1 - bias // -126 smallest unbiased exponent (normal)
+ emax = bias // 127 largest unbiased exponent (normal)
+ )
+
+ // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float32 mantissa.
+ e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
+
+ // Compute precision p for float32 mantissa.
+ // If the exponent is too small, we have a denormal number before
+ // rounding and fewer than p mantissa bits of precision available
+ // (the exponent remains fixed but the mantissa gets shifted right).
+ p := mbits + 1 // precision of normal float
+ if e < emin {
+ // recompute precision
+ p = mbits + 1 - emin + int(e)
+ // If p == 0, the mantissa of x is shifted so much to the right
+ // that its msb falls immediately to the right of the float32
+ // mantissa space. In other words, if the smallest denormal is
+ // considered "1.0", for p == 0, the mantissa value m is >= 0.5.
+ // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
+ // If m == 0.5, it is rounded down to even, i.e., 0.0.
+ // If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
+ if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
+ // underflow to ±0
+ if x.neg {
+ var z float32
+ return -z, Above
+ }
+ return 0.0, Below
+ }
+ // otherwise, round up
+ // We handle p == 0 explicitly because it's easy and because
+ // Float.round doesn't support rounding to 0 bits of precision.
+ if p == 0 {
+ if x.neg {
+ return -math.SmallestNonzeroFloat32, Below
+ }
+ return math.SmallestNonzeroFloat32, Above
+ }
+ }
+ // p > 0
+
+ // round
+ var r Float
+ r.prec = uint32(p)
+ r.Set(x)
+ e = r.exp - 1
+
+ // Rounding may have caused r to overflow to ±Inf
+ // (rounding never causes underflows to 0).
+ // If the exponent is too large, also overflow to ±Inf.
+ if r.form == inf || e > emax {
+ // overflow
+ if x.neg {
+ return float32(math.Inf(-1)), Below
+ }
+ return float32(math.Inf(+1)), Above
+ }
+ // e <= emax
+
+ // Determine sign, biased exponent, and mantissa.
+ var sign, bexp, mant uint32
+ if x.neg {
+ sign = 1 << (fbits - 1)
+ }
+
+ // Rounding may have caused a denormal number to
+ // become normal. Check again.
+ if e < emin {
+ // denormal number: recompute precision
+ // Since rounding may have at best increased precision
+ // and we have eliminated p <= 0 early, we know p > 0.
+ // bexp == 0 for denormals
+ p = mbits + 1 - emin + int(e)
+ mant = msb32(r.mant) >> uint(fbits-p)
+ } else {
+ // normal number: emin <= e <= emax
+ bexp = uint32(e+bias) << mbits
+ mant = msb32(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
+ }
+
+ return math.Float32frombits(sign | bexp | mant), r.acc
+
+ case zero:
+ if x.neg {
+ var z float32
+ return -z, Exact
+ }
+ return 0.0, Exact
+
+ case inf:
+ if x.neg {
+ return float32(math.Inf(-1)), Exact
+ }
+ return float32(math.Inf(+1)), Exact
+ }
+
+ panic("unreachable")
+}
+
+// Float64 returns the float64 value nearest to x. If x is too small to be
+// represented by a float64 (|x| < math.SmallestNonzeroFloat64), the result
+// is (0, Below) or (-0, Above), respectively, depending on the sign of x.
+// If x is too large to be represented by a float64 (|x| > math.MaxFloat64),
+// the result is (+Inf, Above) or (-Inf, Below), depending on the sign of x.
+func (x *Float) Float64() (float64, Accuracy) {
+ if debugFloat {
+ x.validate()
+ }
+
+ switch x.form {
+ case finite:
+ // 0 < |x| < +Inf
+
+ const (
+ fbits = 64 // float size
+ mbits = 52 // mantissa size (excluding implicit msb)
+ ebits = fbits - mbits - 1 // 11 exponent size
+ bias = 1<<(ebits-1) - 1 // 1023 exponent bias
+ dmin = 1 - bias - mbits // -1074 smallest unbiased exponent (denormal)
+ emin = 1 - bias // -1022 smallest unbiased exponent (normal)
+ emax = bias // 1023 largest unbiased exponent (normal)
+ )
+
+ // Float mantissa m is 0.5 <= m < 1.0; compute exponent e for float64 mantissa.
+ e := x.exp - 1 // exponent for normal mantissa m with 1.0 <= m < 2.0
+
+ // Compute precision p for float64 mantissa.
+ // If the exponent is too small, we have a denormal number before
+ // rounding and fewer than p mantissa bits of precision available
+ // (the exponent remains fixed but the mantissa gets shifted right).
+ p := mbits + 1 // precision of normal float
+ if e < emin {
+ // recompute precision
+ p = mbits + 1 - emin + int(e)
+ // If p == 0, the mantissa of x is shifted so much to the right
+ // that its msb falls immediately to the right of the float64
+ // mantissa space. In other words, if the smallest denormal is
+ // considered "1.0", for p == 0, the mantissa value m is >= 0.5.
+ // If m > 0.5, it is rounded up to 1.0; i.e., the smallest denormal.
+ // If m == 0.5, it is rounded down to even, i.e., 0.0.
+ // If p < 0, the mantissa value m is <= "0.25" which is never rounded up.
+ if p < 0 /* m <= 0.25 */ || p == 0 && x.mant.sticky(uint(len(x.mant))*_W-1) == 0 /* m == 0.5 */ {
+ // underflow to ±0
+ if x.neg {
+ var z float64
+ return -z, Above
+ }
+ return 0.0, Below
+ }
+ // otherwise, round up
+ // We handle p == 0 explicitly because it's easy and because
+ // Float.round doesn't support rounding to 0 bits of precision.
+ if p == 0 {
+ if x.neg {
+ return -math.SmallestNonzeroFloat64, Below
+ }
+ return math.SmallestNonzeroFloat64, Above
+ }
+ }
+ // p > 0
+
+ // round
+ var r Float
+ r.prec = uint32(p)
+ r.Set(x)
+ e = r.exp - 1
+
+ // Rounding may have caused r to overflow to ±Inf
+ // (rounding never causes underflows to 0).
+ // If the exponent is too large, also overflow to ±Inf.
+ if r.form == inf || e > emax {
+ // overflow
+ if x.neg {
+ return math.Inf(-1), Below
+ }
+ return math.Inf(+1), Above
+ }
+ // e <= emax
+
+ // Determine sign, biased exponent, and mantissa.
+ var sign, bexp, mant uint64
+ if x.neg {
+ sign = 1 << (fbits - 1)
+ }
+
+ // Rounding may have caused a denormal number to
+ // become normal. Check again.
+ if e < emin {
+ // denormal number: recompute precision
+ // Since rounding may have at best increased precision
+ // and we have eliminated p <= 0 early, we know p > 0.
+ // bexp == 0 for denormals
+ p = mbits + 1 - emin + int(e)
+ mant = msb64(r.mant) >> uint(fbits-p)
+ } else {
+ // normal number: emin <= e <= emax
+ bexp = uint64(e+bias) << mbits
+ mant = msb64(r.mant) >> ebits & (1<<mbits - 1) // cut off msb (implicit 1 bit)
+ }
+
+ return math.Float64frombits(sign | bexp | mant), r.acc
+
+ case zero:
+ if x.neg {
+ var z float64
+ return -z, Exact
+ }
+ return 0.0, Exact
+
+ case inf:
+ if x.neg {
+ return math.Inf(-1), Exact
+ }
+ return math.Inf(+1), Exact
+ }
+
+ panic("unreachable")
+}
+
+// Int returns the result of truncating x towards zero;
+// or nil if x is an infinity.
+// The result is Exact if x.IsInt(); otherwise it is Below
+// for x > 0, and Above for x < 0.
+// If a non-nil *Int argument z is provided, Int stores
+// the result in z instead of allocating a new Int.
+func (x *Float) Int(z *Int) (*Int, Accuracy) {
+ if debugFloat {
+ x.validate()
+ }
+
+ if z == nil && x.form <= finite {
+ z = new(Int)
+ }
+
+ switch x.form {
+ case finite:
+ // 0 < |x| < +Inf
+ acc := makeAcc(x.neg)
+ if x.exp <= 0 {
+ // 0 < |x| < 1
+ return z.SetInt64(0), acc
+ }
+ // x.exp > 0
+
+ // 1 <= |x| < +Inf
+ // determine minimum required precision for x
+ allBits := uint(len(x.mant)) * _W
+ exp := uint(x.exp)
+ if x.MinPrec() <= exp {
+ acc = Exact
+ }
+ // shift mantissa as needed
+ if z == nil {
+ z = new(Int)
+ }
+ z.neg = x.neg
+ switch {
+ case exp > allBits:
+ z.abs = z.abs.shl(x.mant, exp-allBits)
+ default:
+ z.abs = z.abs.set(x.mant)
+ case exp < allBits:
+ z.abs = z.abs.shr(x.mant, allBits-exp)
+ }
+ return z, acc
+
+ case zero:
+ return z.SetInt64(0), Exact
+
+ case inf:
+ return nil, makeAcc(x.neg)
+ }
+
+ panic("unreachable")
+}
+
+// Rat returns the rational number corresponding to x;
+// or nil if x is an infinity.
+// The result is Exact if x is not an Inf.
+// If a non-nil *Rat argument z is provided, Rat stores
+// the result in z instead of allocating a new Rat.
+func (x *Float) Rat(z *Rat) (*Rat, Accuracy) {
+ if debugFloat {
+ x.validate()
+ }
+
+ if z == nil && x.form <= finite {
+ z = new(Rat)
+ }
+
+ switch x.form {
+ case finite:
+ // 0 < |x| < +Inf
+ allBits := int32(len(x.mant)) * _W
+ // build up numerator and denominator
+ z.a.neg = x.neg
+ switch {
+ case x.exp > allBits:
+ z.a.abs = z.a.abs.shl(x.mant, uint(x.exp-allBits))
+ z.b.abs = z.b.abs[:0] // == 1 (see Rat)
+ // z already in normal form
+ default:
+ z.a.abs = z.a.abs.set(x.mant)
+ z.b.abs = z.b.abs[:0] // == 1 (see Rat)
+ // z already in normal form
+ case x.exp < allBits:
+ z.a.abs = z.a.abs.set(x.mant)
+ t := z.b.abs.setUint64(1)
+ z.b.abs = t.shl(t, uint(allBits-x.exp))
+ z.norm()
+ }
+ return z, Exact
+
+ case zero:
+ return z.SetInt64(0), Exact
+
+ case inf:
+ return nil, makeAcc(x.neg)
+ }
+
+ panic("unreachable")
+}
+
+// Abs sets z to the (possibly rounded) value |x| (the absolute value of x)
+// and returns z.
+func (z *Float) Abs(x *Float) *Float {
+ z.Set(x)
+ z.neg = false
+ return z
+}
+
+// Neg sets z to the (possibly rounded) value of x with its sign negated,
+// and returns z.
+func (z *Float) Neg(x *Float) *Float {
+ z.Set(x)
+ z.neg = !z.neg
+ return z
+}
+
+func validateBinaryOperands(x, y *Float) {
+ if !debugFloat {
+ // avoid performance bugs
+ panic("validateBinaryOperands called but debugFloat is not set")
+ }
+ if len(x.mant) == 0 {
+ panic("empty mantissa for x")
+ }
+ if len(y.mant) == 0 {
+ panic("empty mantissa for y")
+ }
+}
+
+// z = x + y, ignoring signs of x and y for the addition
+// but using the sign of z for rounding the result.
+// x and y must have a non-empty mantissa and valid exponent.
+func (z *Float) uadd(x, y *Float) {
+ // Note: This implementation requires 2 shifts most of the
+ // time. It is also inefficient if exponents or precisions
+ // differ by wide margins. The following article describes
+ // an efficient (but much more complicated) implementation
+ // compatible with the internal representation used here:
+ //
+ // Vincent Lefèvre: "The Generic Multiple-Precision Floating-
+ // Point Addition With Exact Rounding (as in the MPFR Library)"
+ // http://www.vinc17.net/research/papers/rnc6.pdf
+
+ if debugFloat {
+ validateBinaryOperands(x, y)
+ }
+
+ // compute exponents ex, ey for mantissa with "binary point"
+ // on the right (mantissa.0) - use int64 to avoid overflow
+ ex := int64(x.exp) - int64(len(x.mant))*_W
+ ey := int64(y.exp) - int64(len(y.mant))*_W
+
+ al := alias(z.mant, x.mant) || alias(z.mant, y.mant)
+
+ // TODO(gri) having a combined add-and-shift primitive
+ // could make this code significantly faster
+ switch {
+ case ex < ey:
+ if al {
+ t := nat(nil).shl(y.mant, uint(ey-ex))
+ z.mant = z.mant.add(x.mant, t)
+ } else {
+ z.mant = z.mant.shl(y.mant, uint(ey-ex))
+ z.mant = z.mant.add(x.mant, z.mant)
+ }
+ default:
+ // ex == ey, no shift needed
+ z.mant = z.mant.add(x.mant, y.mant)
+ case ex > ey:
+ if al {
+ t := nat(nil).shl(x.mant, uint(ex-ey))
+ z.mant = z.mant.add(t, y.mant)
+ } else {
+ z.mant = z.mant.shl(x.mant, uint(ex-ey))
+ z.mant = z.mant.add(z.mant, y.mant)
+ }
+ ex = ey
+ }
+ // len(z.mant) > 0
+
+ z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
+}
+
+// z = x - y for |x| > |y|, ignoring signs of x and y for the subtraction
+// but using the sign of z for rounding the result.
+// x and y must have a non-empty mantissa and valid exponent.
+func (z *Float) usub(x, y *Float) {
+ // This code is symmetric to uadd.
+ // We have not factored the common code out because
+ // eventually uadd (and usub) should be optimized
+ // by special-casing, and the code will diverge.
+
+ if debugFloat {
+ validateBinaryOperands(x, y)
+ }
+
+ ex := int64(x.exp) - int64(len(x.mant))*_W
+ ey := int64(y.exp) - int64(len(y.mant))*_W
+
+ al := alias(z.mant, x.mant) || alias(z.mant, y.mant)
+
+ switch {
+ case ex < ey:
+ if al {
+ t := nat(nil).shl(y.mant, uint(ey-ex))
+ z.mant = t.sub(x.mant, t)
+ } else {
+ z.mant = z.mant.shl(y.mant, uint(ey-ex))
+ z.mant = z.mant.sub(x.mant, z.mant)
+ }
+ default:
+ // ex == ey, no shift needed
+ z.mant = z.mant.sub(x.mant, y.mant)
+ case ex > ey:
+ if al {
+ t := nat(nil).shl(x.mant, uint(ex-ey))
+ z.mant = t.sub(t, y.mant)
+ } else {
+ z.mant = z.mant.shl(x.mant, uint(ex-ey))
+ z.mant = z.mant.sub(z.mant, y.mant)
+ }
+ ex = ey
+ }
+
+ // operands may have canceled each other out
+ if len(z.mant) == 0 {
+ z.acc = Exact
+ z.form = zero
+ z.neg = false
+ return
+ }
+ // len(z.mant) > 0
+
+ z.setExpAndRound(ex+int64(len(z.mant))*_W-fnorm(z.mant), 0)
+}
+
+// z = x * y, ignoring signs of x and y for the multiplication
+// but using the sign of z for rounding the result.
+// x and y must have a non-empty mantissa and valid exponent.
+func (z *Float) umul(x, y *Float) {
+ if debugFloat {
+ validateBinaryOperands(x, y)
+ }
+
+ // Note: This is doing too much work if the precision
+ // of z is less than the sum of the precisions of x
+ // and y which is often the case (e.g., if all floats
+ // have the same precision).
+ // TODO(gri) Optimize this for the common case.
+
+ e := int64(x.exp) + int64(y.exp)
+ if x == y {
+ z.mant = z.mant.sqr(x.mant)
+ } else {
+ z.mant = z.mant.mul(x.mant, y.mant)
+ }
+ z.setExpAndRound(e-fnorm(z.mant), 0)
+}
+
+// z = x / y, ignoring signs of x and y for the division
+// but using the sign of z for rounding the result.
+// x and y must have a non-empty mantissa and valid exponent.
+func (z *Float) uquo(x, y *Float) {
+ if debugFloat {
+ validateBinaryOperands(x, y)
+ }
+
+ // mantissa length in words for desired result precision + 1
+ // (at least one extra bit so we get the rounding bit after
+ // the division)
+ n := int(z.prec/_W) + 1
+
+ // compute adjusted x.mant such that we get enough result precision
+ xadj := x.mant
+ if d := n - len(x.mant) + len(y.mant); d > 0 {
+ // d extra words needed => add d "0 digits" to x
+ xadj = make(nat, len(x.mant)+d)
+ copy(xadj[d:], x.mant)
+ }
+ // TODO(gri): If we have too many digits (d < 0), we should be able
+ // to shorten x for faster division. But we must be extra careful
+ // with rounding in that case.
+
+ // Compute d before division since there may be aliasing of x.mant
+ // (via xadj) or y.mant with z.mant.
+ d := len(xadj) - len(y.mant)
+
+ // divide
+ var r nat
+ z.mant, r = z.mant.div(nil, xadj, y.mant)
+ e := int64(x.exp) - int64(y.exp) - int64(d-len(z.mant))*_W
+
+ // The result is long enough to include (at least) the rounding bit.
+ // If there's a non-zero remainder, the corresponding fractional part
+ // (if it were computed), would have a non-zero sticky bit (if it were
+ // zero, it couldn't have a non-zero remainder).
+ var sbit uint
+ if len(r) > 0 {
+ sbit = 1
+ }
+
+ z.setExpAndRound(e-fnorm(z.mant), sbit)
+}
+
+// ucmp returns -1, 0, or +1, depending on whether
+// |x| < |y|, |x| == |y|, or |x| > |y|.
+// x and y must have a non-empty mantissa and valid exponent.
+func (x *Float) ucmp(y *Float) int {
+ if debugFloat {
+ validateBinaryOperands(x, y)
+ }
+
+ switch {
+ case x.exp < y.exp:
+ return -1
+ case x.exp > y.exp:
+ return +1
+ }
+ // x.exp == y.exp
+
+ // compare mantissas
+ i := len(x.mant)
+ j := len(y.mant)
+ for i > 0 || j > 0 {
+ var xm, ym Word
+ if i > 0 {
+ i--
+ xm = x.mant[i]
+ }
+ if j > 0 {
+ j--
+ ym = y.mant[j]
+ }
+ switch {
+ case xm < ym:
+ return -1
+ case xm > ym:
+ return +1
+ }
+ }
+
+ return 0
+}
+
+// Handling of sign bit as defined by IEEE 754-2008, section 6.3:
+//
+// When neither the inputs nor result are NaN, the sign of a product or
+// quotient is the exclusive OR of the operands’ signs; the sign of a sum,
+// or of a difference x−y regarded as a sum x+(−y), differs from at most
+// one of the addends’ signs; and the sign of the result of conversions,
+// the quantize operation, the roundToIntegral operations, and the
+// roundToIntegralExact (see 5.3.1) is the sign of the first or only operand.
+// These rules shall apply even when operands or results are zero or infinite.
+//
+// When the sum of two operands with opposite signs (or the difference of
+// two operands with like signs) is exactly zero, the sign of that sum (or
+// difference) shall be +0 in all rounding-direction attributes except
+// roundTowardNegative; under that attribute, the sign of an exact zero
+// sum (or difference) shall be −0. However, x+x = x−(−x) retains the same
+// sign as x even when x is zero.
+//
+// See also: https://play.golang.org/p/RtH3UCt5IH
+
+// Add sets z to the rounded sum x+y and returns z. If z's precision is 0,
+// it is changed to the larger of x's or y's precision before the operation.
+// Rounding is performed according to z's precision and rounding mode; and
+// z's accuracy reports the result error relative to the exact (not rounded)
+// result. Add panics with ErrNaN if x and y are infinities with opposite
+// signs. The value of z is undefined in that case.
+func (z *Float) Add(x, y *Float) *Float {
+ if debugFloat {
+ x.validate()
+ y.validate()
+ }
+
+ if z.prec == 0 {
+ z.prec = umax32(x.prec, y.prec)
+ }
+
+ if x.form == finite && y.form == finite {
+ // x + y (common case)
+
+ // Below we set z.neg = x.neg, and when z aliases y this will
+ // change the y operand's sign. This is fine, because if an
+ // operand aliases the receiver it'll be overwritten, but we still
+ // want the original x.neg and y.neg values when we evaluate
+ // x.neg != y.neg, so we need to save y.neg before setting z.neg.
+ yneg := y.neg
+
+ z.neg = x.neg
+ if x.neg == yneg {
+ // x + y == x + y
+ // (-x) + (-y) == -(x + y)
+ z.uadd(x, y)
+ } else {
+ // x + (-y) == x - y == -(y - x)
+ // (-x) + y == y - x == -(x - y)
+ if x.ucmp(y) > 0 {
+ z.usub(x, y)
+ } else {
+ z.neg = !z.neg
+ z.usub(y, x)
+ }
+ }
+ if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact {
+ z.neg = true
+ }
+ return z
+ }
+
+ if x.form == inf && y.form == inf && x.neg != y.neg {
+ // +Inf + -Inf
+ // -Inf + +Inf
+ // value of z is undefined but make sure it's valid
+ z.acc = Exact
+ z.form = zero
+ z.neg = false
+ panic(ErrNaN{"addition of infinities with opposite signs"})
+ }
+
+ if x.form == zero && y.form == zero {
+ // ±0 + ±0
+ z.acc = Exact
+ z.form = zero
+ z.neg = x.neg && y.neg // -0 + -0 == -0
+ return z
+ }
+
+ if x.form == inf || y.form == zero {
+ // ±Inf + y
+ // x + ±0
+ return z.Set(x)
+ }
+
+ // ±0 + y
+ // x + ±Inf
+ return z.Set(y)
+}
+
+// Sub sets z to the rounded difference x-y and returns z.
+// Precision, rounding, and accuracy reporting are as for Add.
+// Sub panics with ErrNaN if x and y are infinities with equal
+// signs. The value of z is undefined in that case.
+func (z *Float) Sub(x, y *Float) *Float {
+ if debugFloat {
+ x.validate()
+ y.validate()
+ }
+
+ if z.prec == 0 {
+ z.prec = umax32(x.prec, y.prec)
+ }
+
+ if x.form == finite && y.form == finite {
+ // x - y (common case)
+ yneg := y.neg
+ z.neg = x.neg
+ if x.neg != yneg {
+ // x - (-y) == x + y
+ // (-x) - y == -(x + y)
+ z.uadd(x, y)
+ } else {
+ // x - y == x - y == -(y - x)
+ // (-x) - (-y) == y - x == -(x - y)
+ if x.ucmp(y) > 0 {
+ z.usub(x, y)
+ } else {
+ z.neg = !z.neg
+ z.usub(y, x)
+ }
+ }
+ if z.form == zero && z.mode == ToNegativeInf && z.acc == Exact {
+ z.neg = true
+ }
+ return z
+ }
+
+ if x.form == inf && y.form == inf && x.neg == y.neg {
+ // +Inf - +Inf
+ // -Inf - -Inf
+ // value of z is undefined but make sure it's valid
+ z.acc = Exact
+ z.form = zero
+ z.neg = false
+ panic(ErrNaN{"subtraction of infinities with equal signs"})
+ }
+
+ if x.form == zero && y.form == zero {
+ // ±0 - ±0
+ z.acc = Exact
+ z.form = zero
+ z.neg = x.neg && !y.neg // -0 - +0 == -0
+ return z
+ }
+
+ if x.form == inf || y.form == zero {
+ // ±Inf - y
+ // x - ±0
+ return z.Set(x)
+ }
+
+ // ±0 - y
+ // x - ±Inf
+ return z.Neg(y)
+}
+
+// Mul sets z to the rounded product x*y and returns z.
+// Precision, rounding, and accuracy reporting are as for Add.
+// Mul panics with ErrNaN if one operand is zero and the other
+// operand an infinity. The value of z is undefined in that case.
+func (z *Float) Mul(x, y *Float) *Float {
+ if debugFloat {
+ x.validate()
+ y.validate()
+ }
+
+ if z.prec == 0 {
+ z.prec = umax32(x.prec, y.prec)
+ }
+
+ z.neg = x.neg != y.neg
+
+ if x.form == finite && y.form == finite {
+ // x * y (common case)
+ z.umul(x, y)
+ return z
+ }
+
+ z.acc = Exact
+ if x.form == zero && y.form == inf || x.form == inf && y.form == zero {
+ // ±0 * ±Inf
+ // ±Inf * ±0
+ // value of z is undefined but make sure it's valid
+ z.form = zero
+ z.neg = false
+ panic(ErrNaN{"multiplication of zero with infinity"})
+ }
+
+ if x.form == inf || y.form == inf {
+ // ±Inf * y
+ // x * ±Inf
+ z.form = inf
+ return z
+ }
+
+ // ±0 * y
+ // x * ±0
+ z.form = zero
+ return z
+}
+
+// Quo sets z to the rounded quotient x/y and returns z.
+// Precision, rounding, and accuracy reporting are as for Add.
+// Quo panics with ErrNaN if both operands are zero or infinities.
+// The value of z is undefined in that case.
+func (z *Float) Quo(x, y *Float) *Float {
+ if debugFloat {
+ x.validate()
+ y.validate()
+ }
+
+ if z.prec == 0 {
+ z.prec = umax32(x.prec, y.prec)
+ }
+
+ z.neg = x.neg != y.neg
+
+ if x.form == finite && y.form == finite {
+ // x / y (common case)
+ z.uquo(x, y)
+ return z
+ }
+
+ z.acc = Exact
+ if x.form == zero && y.form == zero || x.form == inf && y.form == inf {
+ // ±0 / ±0
+ // ±Inf / ±Inf
+ // value of z is undefined but make sure it's valid
+ z.form = zero
+ z.neg = false
+ panic(ErrNaN{"division of zero by zero or infinity by infinity"})
+ }
+
+ if x.form == zero || y.form == inf {
+ // ±0 / y
+ // x / ±Inf
+ z.form = zero
+ return z
+ }
+
+ // x / ±0
+ // ±Inf / y
+ z.form = inf
+ return z
+}
+
+// Cmp compares x and y and returns:
+//
+// -1 if x < y
+// 0 if x == y (incl. -0 == 0, -Inf == -Inf, and +Inf == +Inf)
+// +1 if x > y
+func (x *Float) Cmp(y *Float) int {
+ if debugFloat {
+ x.validate()
+ y.validate()
+ }
+
+ mx := x.ord()
+ my := y.ord()
+ switch {
+ case mx < my:
+ return -1
+ case mx > my:
+ return +1
+ }
+ // mx == my
+
+ // only if |mx| == 1 we have to compare the mantissae
+ switch mx {
+ case -1:
+ return y.ucmp(x)
+ case +1:
+ return x.ucmp(y)
+ }
+
+ return 0
+}
+
+// ord classifies x and returns:
+//
+// -2 if -Inf == x
+// -1 if -Inf < x < 0
+// 0 if x == 0 (signed or unsigned)
+// +1 if 0 < x < +Inf
+// +2 if x == +Inf
+func (x *Float) ord() int {
+ var m int
+ switch x.form {
+ case finite:
+ m = 1
+ case zero:
+ return 0
+ case inf:
+ m = 2
+ }
+ if x.neg {
+ m = -m
+ }
+ return m
+}
+
+func umax32(x, y uint32) uint32 {
+ if x > y {
+ return x
+ }
+ return y
+}