diff options
Diffstat (limited to 'third_party/rust/wpf-gpu-raster/src/real.rs')
-rw-r--r-- | third_party/rust/wpf-gpu-raster/src/real.rs | 163 |
1 files changed, 163 insertions, 0 deletions
// Origin: third_party/rust/wpf-gpu-raster/src/real.rs (added as a new file, blob a9144ec149).

/// Float-to-integer rounding helpers that round to the nearest integer and
/// move half-integers up (towards positive infinity).
///
/// [`CFloatFPU::Round`] is the only public entry point; it dispatches between a
/// fast bit-twiddling routine for small magnitudes and a general routine for
/// everything else.
pub mod CFloatFPU {
    // The largest argument `SmallRound` accepts is 0xFFFFF.
    // This constant is the binary representation of static_cast<float>(0xFFFFF),
    // i.e. the bit pattern of 1048575.0f32, so `Round` can compare magnitudes
    // with a single integer comparison.
    const sc_uBinaryFloatSmallMax: u32 = 0x497ffff0;

    // General-purpose rounding used when the argument is outside the range that
    // `SmallRound` supports. Half-integers are rounded up.
    //
    //XXX: the SSE2 version is probably slower than a naive SSE4 implementation that can use roundss
    #[cfg(target_feature = "sse2")]
    fn LargeRound(x: f32) -> i32 {
        #[cfg(target_arch = "x86")]
        use std::arch::x86::{__m128, _mm_set_ss, _mm_cvtss_si32, _mm_cvtsi32_ss, _mm_sub_ss, _mm_cmple_ss, _mm_store_ss, _mm_setzero_ps};
        #[cfg(target_arch = "x86_64")]
        use std::arch::x86_64::{__m128, _mm_set_ss, _mm_cvtss_si32, _mm_cvtsi32_ss, _mm_sub_ss, _mm_cmple_ss, _mm_store_ss, _mm_setzero_ps};

        // SAFETY: this function is only compiled when the `sse2` target feature is
        // enabled, so the intrinsics are available. `_mm_store_ss` writes one
        // 32-bit lane through the pointer, which refers to the live local
        // `correction` of the same size.
        unsafe {
            let given: __m128 = _mm_set_ss(x); // load given value
            let result = _mm_cvtss_si32(given); // convert it to integer using the current rounding mode
            let rounded = _mm_cvtsi32_ss(_mm_setzero_ps(), result); // convert back to float
            let diff = _mm_sub_ss(rounded, given); // diff = (rounded - given)
            let neg_half = _mm_set_ss(-0.5); // load -0.5f
            let mask = _mm_cmple_ss(diff, neg_half); // all-ones if (rounded - given) <= -0.5f, else zero
            let mut correction: i32 = 0;
            // Reinterpret the comparison mask as an integer: all-ones reads back as -1.
            _mm_store_ss(&mut correction as *mut i32 as *mut f32, mask);
            // Subtracting -1 bumps the result up by one when the conversion landed
            // half a unit (or more) below the given value.
            result - correction
        }
    }

    // General-purpose rounding used when the argument is outside the range that
    // `SmallRound` supports. Half-integers are rounded up.
    //
    // The addition is carried out in f64 on purpose: in f32, `x + 0.5` is an
    // exact tie for odd integers with 2^23 <= |x| < 2^24 and would be rounded
    // to the even neighbour, producing x + 1 (e.g. 8388609.0 -> 8388610).
    // In f64 the sum is exact for those inputs.
    #[cfg(not(target_feature = "sse2"))]
    fn LargeRound(x: f32) -> i32 {
        (f64::from(x) + 0.5).floor() as i32
    }

    //+------------------------------------------------------------------------
    //
    //  Function:   CFloatFPU::SmallRound
    //
    //  Synopsis:   Convert given floating point value to nearest integer.
    //              Half-integers are rounded up.
    //
    //  Important:  this routine is fast but restricted:
    //              given x should be within (-(0x100000-.5) < x < (0x100000-.5))
    //
    //  Details:    The implementation looks unusual and tends to confuse
    //              people. However, it does work, having been tested
    //              thoroughly on x86 and ia64 platforms for literally
    //              every possible argument value in the given range.
    //
    //  More details:
    //      The implementation is based on knowledge of the floating point
    //      value representation. This 32-bit value consists of three parts:
    //          v & 0x80000000 = sign
    //          v & 0x7F800000 = exponent
    //          v & 0x007FFFFF = mantissa
    //
    //      Let N be a floating point number within -0x400000 <= N <= 0x3FFFFF.
    //      The sum (S = 0xC00000 + N) thus will satisfy 0x800000 <= S <= 0xFFFFFF.
    //      All the numbers within this range (sometimes referred to as a "binade")
    //      have the same position of the most significant bit, i.e. 0x800000.
    //      Therefore they are normalized the same way, thus
    //      giving the mantissa's bits the same weights
    //      as integer numbers have. In other words, to get the
    //      integer value of floating point S, when 0x800000 <= S <= 0xFFFFFF,
    //      we can just throw away the exponent and sign, and add the assumed
    //      most significant bit (that is always 1 and therefore is not stored
    //      in the floating point value):
    //          (int)S = (<float S as int> & 0x7FFFFF | 0x800000);
    //      To get the given N as an integer, we need to subtract back
    //      the value 0xC00000 that was added in order to obtain
    //      proper normalization:
    //          N = (<float S as int> & 0x7FFFFF | 0x800000) - 0xC00000.
    //      or
    //          N = (<float S as int> & 0x7FFFFF           ) - 0x400000.
    //
    //      Hopefully, the text above explains how the
    //      following routine works:
    //          int SmallRound1(float x)
    //          {
    //              union
    //              {
    //                  __int32 i;
    //                  float f;
    //              } u;
    //
    //              u.f = x + float(0x00C00000);
    //              return ((u.i - (int)0x00400000) << 9) >> 9;
    //          }
    //      Unfortunately it is imperfect, due to the way the FPU
    //      rounds intermediate calculation results.
    //      By default, the rounding mode is set to "nearest".
    //      This means that when it calculates N+float(0x00C00000),
    //      the 80-bit precise result will not fit in a 32-bit float,
    //      so some least significant bits will be thrown away.
    //      Rounding to nearest means that S consisting of intS + fraction,
    //      where 0 <= fraction < 1, will be converted to intS
    //      when fraction < 0.5 and to intS+1 if fraction > 0.5.
    //      What would happen with a fraction exactly equal to 0.5?
    //      Smart thing: S will go to intS if intS is even and
    //      to intS+1 if intS is odd. In other words, half-integers
    //      are rounded to the nearest even number.
    //      This FPU feature apparently is useful to minimize
    //      average rounding error when somebody is, say,
    //      digitally simulating electrons' behavior in plasma.
    //      However for graphics this is not desired.
    //
    //      We want to move half-integers up, therefore
    //      define SmallRound(x) as {return SmallRound1(x*2+.5) >> 1;}.
    //      This may require more comments.
    //      Let given x = i+f, where i is integer and f is fraction, 0 <= f < 1.
    //      Let's see what y = x*2+.5 is:
    //          y = i*2 + (f*2 + .5) = i*2 + g, where g = f*2 + .5;
    //      If "f" is in the range 0 <= f < .5 (so the correct rounding result should be "i"),
    //      then the range for "g" is .5 <= g < 1.5. The very first value, .5, will force
    //      the SmallRound1 result to be "i*2", due to the round-to-even rule; the remaining
    //      values will lead to "i*2+1". The subsequent shift will throw away the extra "1"
    //      and give us the desired "i".
    //      When "f" is in the range .5 <= f < 1, then 1.5 <= g < 2.5.
    //      All these values will round to 2, so SmallRound1 will return (2*i+2),
    //      and the final shift will give the desired i+1.
    //
    //      To get the final form of the routine we need to transform the combined
    //      expression for u.f:
    //          (x*2) + .5 + float(0x00C00000) ==
    //          (x + (.25 + double(0x00600000)) )*2
    //      Note that the factor "2" means nothing for the following operations,
    //      since it affects only exponent bits that are ignored anyway.
    //      So we can save some processor cycles by avoiding this multiplication.
    //
    //      And, the very final beautification:
    //      to avoid subtracting 0x00400000 let's ignore this bit.
    //      This means that we effectively decrease the available range by 1 bit,
    //      but we're chasing performance and found it acceptable.
    //      So
    //          return ((u.i - (int)0x00400000) << 9) >> 9;
    //      is converted to
    //          return ((u.i                  ) << 10) >> 10;
    //      Finally, since the final shift by 10 bits may be combined
    //      with the shift by 1 in the definition {return SmallRound1(x*2+.5) >> 1;},
    //      we'll just shift by 11 bits. That's it.
    //
    //-------------------------------------------------------------------------
    fn SmallRound(x: f32) -> i32 {
        //AssertPrecisionAndRoundingMode();
        // Precondition from the header above: x must lie strictly inside
        // (-(0x100000 - 0.5), 0x100000 - 0.5).
        debug_assert!(-(0x100000 as f64 - 0.5) < x as f64 && (x as f64) < (0x100000 as f64 - 0.5));

        // Add the magic bias in f64, then narrow to f32; the narrowing is the
        // step that rounds the value into the mantissa bits read below.
        let fi = (x as f64 + (0x00600000 as f64 + 0.25)) as f32;
        // Shift left by 10 to discard sign, exponent and the two top mantissa
        // bits, then arithmetic-shift right by 11 to sign-extend the result and
        // drop the extra low bit introduced by the implicit "*2".
        let result = ((fi.to_bits() as i32) << 10) >> 11;

        // Post-condition: result - 0.5 <= x < result + 0.5 (half-integers go up).
        debug_assert!(x < (result as f32) + 0.5 && x >= (result as f32) - 0.5);
        result
    }

    /// Converts `x` to the nearest integer, rounding half-integers up.
    ///
    /// Values whose magnitude does not exceed 0xFFFFF take the fast
    /// `SmallRound` path; every other bit pattern (larger magnitudes,
    /// infinities and NaNs) is handled by `LargeRound`.
    pub fn Round(x: f32) -> i32 {
        // cut off sign
        let xAbs: u32 = x.to_bits() & 0x7FFFFFFF;

        if xAbs <= sc_uBinaryFloatSmallMax {
            SmallRound(x)
        } else {
            LargeRound(x)
        }
    }
}

// Casts an expression to `REAL` with `as`.
// NOTE(review): `REAL` is not declared in this part of the file; it must be in
// scope wherever the macro is expanded — confirm against the rest of the crate.
macro_rules! TOREAL { ($e: expr) => { $e as REAL } }