/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "jit/MacroAssembler.h"
#include "jit/x86-shared/MacroAssembler-x86-shared.h"

#include "jit/MacroAssembler-inl.h"

using namespace js;
using namespace js::jit;

using mozilla::DebugOnly;
using mozilla::FloatingPoint;
using mozilla::Maybe;
using mozilla::SpecificNaN;

// Broadcast the low byte of |input| into every byte lane of |output|.
//
// The value is first moved into |output| with vmovd.  With AVX2 a single
// vbroadcastb finishes the job; otherwise the scratch register is zeroed and
// used as the vpshufb index vector, so every lane selects byte 0.
void MacroAssemblerX86Shared::splatX16(Register input, FloatRegister output) {
  ScratchSimd128Scope scratch(asMasm());

  vmovd(input, output);
  if (HasAVX2()) {
    vbroadcastb(Operand(output), output);
    return;
  }
  vpxor(scratch, scratch, scratch);
  vpshufb(scratch, output, output);
}

// Broadcast the low 16 bits of |input| into every 16-bit lane of |output|.
//
// Without AVX2 the low word is first replicated across the low four words
// (vpshuflw with selector 0) and then the low dword is replicated across the
// vector (vpshufd with selector 0).
void MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output) {
  vmovd(input, output);
  if (HasAVX2()) {
    vbroadcastw(Operand(output), output);
    return;
  }
  vpshuflw(0, output, output);
  vpshufd(0, output, output);
}

// Broadcast the 32-bit value in |input| into every 32-bit lane of |output|,
// using vbroadcastd when AVX2 is available and vpshufd(0) otherwise.
void MacroAssemblerX86Shared::splatX4(Register input, FloatRegister output) {
  vmovd(input, output);
  if (HasAVX2()) {
    vbroadcastd(Operand(output), output);
    return;
  }
  vpshufd(0, output, output);
}

// Broadcast the float32 in |input| (asserted to be a single) into every
// 32-bit lane of |output| (asserted to be a Simd128 register).
void MacroAssemblerX86Shared::splatX4(FloatRegister input,
                                      FloatRegister output) {
  MOZ_ASSERT(input.isSingle() && output.isSimd128());
  if (HasAVX2()) {
    vbroadcastss(Operand(input), output);
    return;
  }
  // NOTE(review): moveSimd128FloatIfNotAVX presumably copies |input| into
  // |output| and returns |output| when only two-operand encodings are
  // available -- confirm against its definition.
  input = asMasm().moveSimd128FloatIfNotAVX(input.asSimd128(), output);
  vshufps(0, input, input, output);
}

// Broadcast the float64 in |input| (asserted to be a double) into both
// 64-bit lanes of |output| (asserted to be a Simd128 register) via vmovddup.
void MacroAssemblerX86Shared::splatX2(FloatRegister input,
                                      FloatRegister output) {
  MOZ_ASSERT(input.isDouble() && output.isSimd128());
  vmovddup(Operand(input.asSimd128()), output);
}

// Extract 32-bit lane |lane| of |input| into the general register |output|.
void MacroAssemblerX86Shared::extractLaneInt32x4(FloatRegister input,
                                                 Register output,
                                                 unsigned lane) {
  if (lane == 0) {
    // The value we want to extract is in the low double-word
    moveLowInt32(input, output);
  } else {
    vpextrd(lane, input, output);
  }
}

// Extract float32 lane |lane| of |input| (Simd128) into |output| (single).
//
// Lane 0 needs at most a register move and lane 2 uses
// moveHighPairToLowPairFloat32; the remaining lanes go through a vshufps whose
// selector comes from MacroAssembler::ComputeShuffleMask(lane).
void MacroAssemblerX86Shared::extractLaneFloat32x4(FloatRegister input,
                                                   FloatRegister output,
                                                   unsigned lane) {
  MOZ_ASSERT(input.isSimd128() && output.isSingle());
  if (lane == 0) {
    // The value we want to extract is in the low double-word
    if (input.asSingle() != output) {
      moveFloat32(input, output);
    }
  } else if (lane == 2) {
    moveHighPairToLowPairFloat32(input, output);
  } else {
    uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
    FloatRegister dest = output.asSimd128();
    input = moveSimd128FloatIfNotAVX(input, dest);
    vshufps(mask, input, input, dest);
  }
}

// Extract float64 lane |lane| of |input| (Simd128) into |output| (double).
//
// For the high lane, vpalignr with a byte offset of 8 shifts the high
// quadword of |input| down into the low quadword of |output|.
void MacroAssemblerX86Shared::extractLaneFloat64x2(FloatRegister input,
                                                   FloatRegister output,
                                                   unsigned lane) {
  MOZ_ASSERT(input.isSimd128() && output.isDouble());
  if (lane == 0) {
    // The value we want to extract is in the low quadword
    if (input.asDouble() != output) {
      moveDouble(input, output);
    }
  } else {
    vpalignr(Operand(input), output, output, 8);
  }
}

// Extract 16-bit lane |lane| of |input| into |output|.  vpextrw produces the
// raw 16-bit value; for SimdSign::Signed it is then sign-extended to 32 bits.
void MacroAssemblerX86Shared::extractLaneInt16x8(FloatRegister input,
                                                 Register output, unsigned lane,
                                                 SimdSign sign) {
  vpextrw(lane, input, Operand(output));
  if (sign == SimdSign::Signed) {
    movswl(output, output);
  }
}

// Extract 8-bit lane |lane| of |input| into |output|.  vpextrb produces the
// raw byte; for SimdSign::Signed it is then sign-extended to 32 bits.
void MacroAssemblerX86Shared::extractLaneInt8x16(FloatRegister input,
                                                 Register output, unsigned lane,
                                                 SimdSign sign) {
  vpextrb(lane, input, Operand(output));
  if (sign == SimdSign::Signed) {
    if (!AllocatableGeneralRegisterSet(Registers::SingleByteRegs).has(output)) {
      // |output| is not in the single-byte register set, so do the
      // sign-extension in eax and swap the value in and back out again.
      xchgl(eax, output);
      movsbl(eax, eax);
      xchgl(eax, output);
    } else {
      movsbl(output, output);
    }
  }
}

// Produce in |dest| a copy of |lhs| (Simd128) whose float32 lane |lane| has
// been replaced by the scalar |rhs| (single).
void MacroAssemblerX86Shared::replaceLaneFloat32x4(unsigned lane,
                                                   FloatRegister lhs,
                                                   FloatRegister rhs,
                                                   FloatRegister dest) {
  MOZ_ASSERT(lhs.isSimd128() && rhs.isSingle());

  if (lane == 0) {
    if (rhs.asSimd128() == lhs) {
      // no-op, although this should not normally happen for type checking
      // reasons higher up in the stack.
      moveSimd128Float(lhs, dest);
    } else {
      // move low dword of value into low dword of output
      vmovss(rhs, lhs, dest);
    }
  } else {
    vinsertps(vinsertpsMask(0, lane), rhs, lhs, dest);
  }
}

// Produce in |dest| a copy of |lhs| (Simd128) whose float64 lane |lane| has
// been replaced by the scalar |rhs| (double).
void MacroAssemblerX86Shared::replaceLaneFloat64x2(unsigned lane,
                                                   FloatRegister lhs,
                                                   FloatRegister rhs,
                                                   FloatRegister dest) {
  MOZ_ASSERT(lhs.isSimd128() && rhs.isDouble());

  if (lane == 0) {
    if (rhs.asSimd128() == lhs) {
      // no-op, although this should not normally happen for type checking
      // reasons higher up in the stack.
      moveSimd128Float(lhs, dest);
    } else {
      // move low qword of value into low qword of output
      vmovsd(rhs, lhs, dest);
    }
  } else {
    // move low qword of value into high qword of output
    vshufpd(0, rhs, lhs, dest);
  }
}

// Byte-wise blend of |lhs| and |rhs| into |output|, controlled by the 16
// per-lane selector bytes in |lanes|.  The selectors are materialized in
// |temp| and used as the vpblendvb mask.
void MacroAssemblerX86Shared::blendInt8x16(FloatRegister lhs, FloatRegister rhs,
                                           FloatRegister output,
                                           FloatRegister temp,
                                           const uint8_t lanes[16]) {
  // SimdConstant::CreateX16 is given signed bytes elsewhere in this file (see
  // shuffleInt8x16), so reinterpret the unsigned selector bytes as int8_t.
  // The cast must name its target type and must preserve constness.
  asMasm().loadConstantSimd128Int(
      SimdConstant::CreateX16(reinterpret_cast<const int8_t*>(lanes)), temp);
  vpblendvb(temp, rhs, lhs, output);
}

// 16-bit-lane blend of |lhs| and |rhs| into |output|.  Each nonzero entry of
// |lanes| sets the corresponding bit of the vpblendw immediate.
void MacroAssemblerX86Shared::blendInt16x8(FloatRegister lhs, FloatRegister rhs,
                                           FloatRegister output,
                                           const uint16_t lanes[8]) {
  uint32_t mask = 0;
  for (unsigned i = 0; i < 8; i++) {
    if (lanes[i]) {
      mask |= (1 << i);
    }
  }
  vpblendw(mask, rhs, lhs, output);
}

// Select between |lhs| and |rhs| into |output| under control of |mask| with a
// single vpblendvb.
void MacroAssemblerX86Shared::laneSelectSimd128(FloatRegister mask,
                                                FloatRegister lhs,
                                                FloatRegister rhs,
                                                FloatRegister output) {
  vpblendvb(mask, lhs, rhs, output);
}

// General byte shuffle of the 32 bytes of |lhs| and |rhs| into |output|.
// lanes[i] in [0, 16) picks byte lanes[i] of |lhs|; lanes[i] >= 16 picks byte
// lanes[i] - 16 of |rhs|.
void MacroAssemblerX86Shared::shuffleInt8x16(FloatRegister lhs,
                                             FloatRegister rhs,
                                             FloatRegister output,
                                             const uint8_t lanes[16]) {
  ScratchSimd128Scope scratch(asMasm());

  // Use pshufb instructions to gather the lanes from each source vector.
  // A negative index creates a zero lane, so the two vectors can be combined.

  // Set scratch = lanes from rhs.
  int8_t idx[16];
  for (unsigned i = 0; i < 16; i++) {
    idx[i] = lanes[i] >= 16 ? lanes[i] - 16 : -1;
  }
  rhs = moveSimd128IntIfNotAVX(rhs, scratch);
  asMasm().vpshufbSimd128(SimdConstant::CreateX16(idx), rhs, scratch);

  // Set output = lanes from lhs.
  for (unsigned i = 0; i < 16; i++) {
    idx[i] = lanes[i] < 16 ? lanes[i] : -1;
  }
  lhs = moveSimd128IntIfNotAVX(lhs, output);
  asMasm().vpshufbSimd128(SimdConstant::CreateX16(idx), lhs, output);

  // Combine.
  vpor(scratch, output, output);
}

// View the FP register named by |op| as a Simd128 register.  Callers in this
// file only use it after checking op.kind() == Operand::FPREG.
static inline FloatRegister ToSimdFloatRegister(const Operand& op) {
  return FloatRegister(op.fpu(), FloatRegister::Codes::ContentType::Simd128);
}

// Lane-wise comparison of 8-bit integers: output = (lhs <cond> rhs).
//
// Only equality (vpcmpeqb) and signed greater-than (vpcmpgtb) are emitted
// directly; everything else is derived:
//  - signed LessThan / GreaterThanOrEqual copy |rhs| into |output| and compute
//    rhs > lhs (negated for GreaterThanOrEqual), saving |lhs| in the scratch
//    register first if it aliases |output|;
//  - NotEqual / LessThanOrEqual negate Equal / GreaterThan;
//  - the unsigned conditions compare the unsigned min or max of the operands
//    against one of the operands, negating where needed.
//
// Any other condition crashes.
void MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs,
                                             Assembler::Condition cond,
                                             FloatRegister output) {
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vpcmpgtb(rhs, lhs, output);
      break;
    case Assembler::Condition::Equal:
      vpcmpeqb(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan: {
      ScratchSimd128Scope scratch(asMasm());
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtb(Operand(lhs), output, output);
      break;
    }
    case Assembler::Condition::NotEqual:
      vpcmpeqb(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual: {
      // The braces only limit the lifetime of |scratch|; the negation below
      // is still part of this case.
      ScratchSimd128Scope scratch(asMasm());
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtb(Operand(lhs), output, output);
    }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
      vpcmpgtb(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Above:
      // Compute lhs <= rhs (unsigned) and negate.  When |rhs| is |output| the
      // first instruction overwrites it, so the equality test must be made
      // against |lhs| instead: min(lhs, rhs) == lhs.  Otherwise test
      // max(lhs, rhs) == rhs.
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminub(rhs, lhs, output);
        vpcmpeqb(Operand(lhs), output, output);
      } else {
        vpmaxub(rhs, lhs, output);
        vpcmpeqb(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::BelowOrEqual:
      // Same computation as Above, without the negation.
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminub(rhs, lhs, output);
        vpcmpeqb(Operand(lhs), output, output);
      } else {
        vpmaxub(rhs, lhs, output);
        vpcmpeqb(rhs, output, output);
      }
      break;
    case Assembler::Below:
      // Compute lhs >= rhs (unsigned) and negate: max(lhs, rhs) == lhs when
      // |rhs| is |output|, min(lhs, rhs) == rhs otherwise.
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxub(rhs, lhs, output);
        vpcmpeqb(Operand(lhs), output, output);
      } else {
        vpminub(rhs, lhs, output);
        vpcmpeqb(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::AboveOrEqual:
      // Same computation as Below, without the negation.
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxub(rhs, lhs, output);
        vpcmpeqb(Operand(lhs), output, output);
      } else {
        vpminub(rhs, lhs, output);
        vpcmpeqb(rhs, output, output);
      }
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

// Lane-wise comparison of 8-bit integers against a constant vector.
//
// Only Equal, NotEqual, GreaterThan and LessThanOrEqual are accepted; the
// latter of each pair is the former followed by an XOR with all-ones.  Any
// other condition crashes.
void MacroAssemblerX86Shared::compareInt8x16(Assembler::Condition cond,
                                             FloatRegister lhs,
                                             const SimdConstant& rhs,
                                             FloatRegister dest) {
  bool complement = false;
  switch (cond) {
    case Assembler::Condition::NotEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::Equal:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpeqb,
                    &MacroAssembler::vpcmpeqbSimd128);
      break;
    case Assembler::Condition::LessThanOrEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::GreaterThan:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpgtb,
                    &MacroAssembler::vpcmpgtbSimd128);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
  if (complement) {
    asMasm().bitwiseXorSimd128(dest, SimdConstant::SplatX16(-1), dest);
  }
}

// Lane-wise comparison of 16-bit integers: output = (lhs <cond> rhs).
//
// Uses exactly the same scheme as the 8-bit version above, with the word
// forms of the instructions (vpcmpeqw, vpcmpgtw, vpminuw, vpmaxuw).
void MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs,
                                             Assembler::Condition cond,
                                             FloatRegister output) {
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vpcmpgtw(rhs, lhs, output);
      break;
    case Assembler::Condition::Equal:
      vpcmpeqw(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan: {
      ScratchSimd128Scope scratch(asMasm());
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtw(Operand(lhs), output, output);
      break;
    }
    case Assembler::Condition::NotEqual:
      vpcmpeqw(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual: {
      // The braces only limit the lifetime of |scratch|; the negation below
      // is still part of this case.
      ScratchSimd128Scope scratch(asMasm());
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtw(Operand(lhs), output, output);
    }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
      vpcmpgtw(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Above:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminuw(rhs, lhs, output);
        vpcmpeqw(Operand(lhs), output, output);
      } else {
        vpmaxuw(rhs, lhs, output);
        vpcmpeqw(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::BelowOrEqual:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminuw(rhs, lhs, output);
        vpcmpeqw(Operand(lhs), output, output);
      } else {
        vpmaxuw(rhs, lhs, output);
        vpcmpeqw(rhs, output, output);
      }
      break;
    case Assembler::Below:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxuw(rhs, lhs, output);
        vpcmpeqw(Operand(lhs), output, output);
      } else {
        vpminuw(rhs, lhs, output);
        vpcmpeqw(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::AboveOrEqual:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxuw(rhs, lhs, output);
        vpcmpeqw(Operand(lhs), output, output);
      } else {
        vpminuw(rhs, lhs, output);
        vpcmpeqw(rhs, output, output);
      }
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

// Lane-wise comparison of 16-bit integers against a constant vector.  Accepts
// Equal, NotEqual, GreaterThan and LessThanOrEqual; anything else crashes.
void MacroAssemblerX86Shared::compareInt16x8(Assembler::Condition cond,
                                             FloatRegister lhs,
                                             const SimdConstant& rhs,
                                             FloatRegister dest) {
  bool complement = false;
  switch (cond) {
    case Assembler::Condition::NotEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::Equal:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpeqw,
                    &MacroAssembler::vpcmpeqwSimd128);
      break;
    case Assembler::Condition::LessThanOrEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::GreaterThan:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpgtw,
                    &MacroAssembler::vpcmpgtwSimd128);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
  if (complement) {
    asMasm().bitwiseXorSimd128(dest, SimdConstant::SplatX16(-1), dest);
  }
}

// Lane-wise comparison of 32-bit integers: output = (lhs <cond> rhs).
//
// Uses exactly the same scheme as the 8-bit version above, with the dword
// forms of the instructions (vpcmpeqd, vpcmpgtd, vpminud, vpmaxud).
void MacroAssemblerX86Shared::compareInt32x4(FloatRegister lhs, Operand rhs,
                                             Assembler::Condition cond,
                                             FloatRegister output) {
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vpcmpgtd(rhs, lhs, output);
      break;
    case Assembler::Condition::Equal:
      vpcmpeqd(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan: {
      ScratchSimd128Scope scratch(asMasm());
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtd(Operand(lhs), output, output);
      break;
    }
    case Assembler::Condition::NotEqual:
      vpcmpeqd(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual: {
      // The braces only limit the lifetime of |scratch|; the negation below
      // is still part of this case.
      ScratchSimd128Scope scratch(asMasm());
      if (lhs == output) {
        moveSimd128Int(lhs, scratch);
        lhs = scratch;
      }
      if (rhs.kind() == Operand::FPREG) {
        moveSimd128Int(ToSimdFloatRegister(rhs), output);
      } else {
        loadAlignedSimd128Int(rhs, output);
      }
      vpcmpgtd(Operand(lhs), output, output);
    }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
      vpcmpgtd(rhs, lhs, output);
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::Above:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminud(rhs, lhs, output);
        vpcmpeqd(Operand(lhs), output, output);
      } else {
        vpmaxud(rhs, lhs, output);
        vpcmpeqd(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::BelowOrEqual:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpminud(rhs, lhs, output);
        vpcmpeqd(Operand(lhs), output, output);
      } else {
        vpmaxud(rhs, lhs, output);
        vpcmpeqd(rhs, output, output);
      }
      break;
    case Assembler::Below:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxud(rhs, lhs, output);
        vpcmpeqd(Operand(lhs), output, output);
      } else {
        vpminud(rhs, lhs, output);
        vpcmpeqd(rhs, output, output);
      }
      asMasm().bitwiseNotSimd128(output, output);
      break;
    case Assembler::AboveOrEqual:
      if (rhs.kind() == Operand::FPREG && ToSimdFloatRegister(rhs) == output) {
        vpmaxud(rhs, lhs, output);
        vpcmpeqd(Operand(lhs), output, output);
      } else {
        vpminud(rhs, lhs, output);
        vpcmpeqd(rhs, output, output);
      }
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

// Lane-wise comparison of 32-bit integers against a constant vector.  Accepts
// Equal, NotEqual, GreaterThan and LessThanOrEqual; anything else crashes.
void MacroAssemblerX86Shared::compareInt32x4(Assembler::Condition cond,
                                             FloatRegister lhs,
                                             const SimdConstant& rhs,
                                             FloatRegister dest) {
  bool complement = false;
  switch (cond) {
    case Assembler::Condition::NotEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::Equal:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpeqd,
                    &MacroAssembler::vpcmpeqdSimd128);
      break;
    case Assembler::Condition::LessThanOrEqual:
      complement = true;
      [[fallthrough]];
    case Assembler::Condition::GreaterThan:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vpcmpgtd,
                    &MacroAssembler::vpcmpgtdSimd128);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
  if (complement) {
    asMasm().bitwiseXorSimd128(dest, SimdConstant::SplatX16(-1), dest);
  }
}

// Lane-wise equality comparison of 64-bit integers.  Only Equal and NotEqual
// are accepted; NotEqual is Equal XORed with all-ones.
void MacroAssemblerX86Shared::compareForEqualityInt64x2(
    FloatRegister lhs, Operand rhs, Assembler::Condition cond,
    FloatRegister output) {
  static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
  switch (cond) {
    case Assembler::Condition::Equal:
      vpcmpeqq(rhs, lhs, output);
      break;
    case Assembler::Condition::NotEqual:
      vpcmpeqq(rhs, lhs, output);
      asMasm().bitwiseXorSimd128(output, allOnes, output);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

// Lane-wise signed ordering comparison of 64-bit integers, synthesized from
// 32-bit compares and a 64-bit subtract (no vpcmpgtq is used).  |temp1| and
// |temp2| are clobbered.  Accepts GreaterThan, LessThan, GreaterThanOrEqual
// and LessThanOrEqual; the latter two are the negations of LessThan and
// GreaterThan respectively.
void MacroAssemblerX86Shared::compareForOrderingInt64x2(
    FloatRegister lhs, Operand rhs, Assembler::Condition cond,
    FloatRegister temp1, FloatRegister temp2, FloatRegister output) {
  static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
  // The pseudo code is for (e.g. > comparison):
  //  __m128i pcmpgtq_sse2 (__m128i a, __m128i b) {
  //      __m128i r = _mm_and_si128(_mm_cmpeq_epi32(a, b), _mm_sub_epi64(b, a));
  //      r = _mm_or_si128(r, _mm_cmpgt_epi32(a, b));
  //      return _mm_shuffle_epi32(r, _MM_SHUFFLE(3,3,1,1));
  //  }
  // Credits to https://stackoverflow.com/a/65175746
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vmovdqa(rhs, temp1);
      vmovdqa(Operand(lhs), temp2);
      vpsubq(Operand(lhs), temp1, temp1);
      vpcmpeqd(rhs, temp2, temp2);
      vandpd(temp2, temp1, temp1);
      lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
      vpcmpgtd(rhs, lhs, output);
      vpor(Operand(temp1), output, output);
      vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
      break;
    case Assembler::Condition::LessThan:
      vmovdqa(rhs, temp1);
      vmovdqa(Operand(lhs), temp2);
      vpcmpgtd(Operand(lhs), temp1, temp1);
      vpcmpeqd(Operand(rhs), temp2, temp2);
      lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
      vpsubq(rhs, lhs, output);
      vandpd(temp2, output, output);
      vpor(Operand(temp1), output, output);
      vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
      // !(lhs < rhs)
      vmovdqa(rhs, temp1);
      vmovdqa(Operand(lhs), temp2);
      vpcmpgtd(Operand(lhs), temp1, temp1);
      vpcmpeqd(Operand(rhs), temp2, temp2);
      lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
      vpsubq(rhs, lhs, output);
      vandpd(temp2, output, output);
      vpor(Operand(temp1), output, output);
      vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
      asMasm().bitwiseXorSimd128(output, allOnes, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      // !(lhs > rhs)
      vmovdqa(rhs, temp1);
      vmovdqa(Operand(lhs), temp2);
      vpsubq(Operand(lhs), temp1, temp1);
      vpcmpeqd(rhs, temp2, temp2);
      vandpd(temp2, temp1, temp1);
      lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
      vpcmpgtd(rhs, lhs, output);
      vpor(Operand(temp1), output, output);
      vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
      asMasm().bitwiseXorSimd128(output, allOnes, output);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

// Lane-wise signed ordering comparison of 64-bit integers using vpcmpgtq
// directly (SSE4.2 is asserted).  LessThan swaps the operands; the "OrEqual"
// conditions negate the opposite strict comparison.
void MacroAssemblerX86Shared::compareForOrderingInt64x2AVX(
    FloatRegister lhs, FloatRegister rhs, Assembler::Condition cond,
    FloatRegister output) {
  MOZ_ASSERT(HasSSE42());
  static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
  switch (cond) {
    case Assembler::Condition::GreaterThan:
      vpcmpgtq(Operand(rhs), lhs, output);
      break;
    case Assembler::Condition::LessThan:
      vpcmpgtq(Operand(lhs), rhs, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
      vpcmpgtq(Operand(lhs), rhs, output);
      asMasm().bitwiseXorSimd128(output, allOnes, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      vpcmpgtq(Operand(rhs), lhs, output);
      asMasm().bitwiseXorSimd128(output, allOnes, output);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

// Lane-wise comparison of float32 values: output = (lhs <cond> rhs).
//
// Accepts Equal, LessThan, LessThanOrEqual and NotEqual.  GreaterThan and
// GreaterThanOrEqual crash: callers are expected to have swapped the operands
// already.
void MacroAssemblerX86Shared::compareFloat32x4(FloatRegister lhs, Operand rhs,
                                               Assembler::Condition cond,
                                               FloatRegister output) {
  // TODO Can do better here with three-address compares

  // Move lhs to output if lhs!=output; move rhs out of the way if rhs==output.
  // This is bad, but Ion does not need this fixup.
  ScratchSimd128Scope scratch(asMasm());
  if (!HasAVX() && !lhs.aliases(output)) {
    if (rhs.kind() == Operand::FPREG &&
        output.aliases(FloatRegister::FromCode(rhs.fpu()))) {
      vmovaps(rhs, scratch);
      rhs = Operand(scratch);
    }
    vmovaps(lhs, output);
    lhs = output;
  }

  switch (cond) {
    case Assembler::Condition::Equal:
      vcmpeqps(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan:
      vcmpltps(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      vcmpleps(rhs, lhs, output);
      break;
    case Assembler::Condition::NotEqual:
      vcmpneqps(rhs, lhs, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
    case Assembler::Condition::GreaterThan:
      // We reverse these operations in the -inl.h file so that we don't have to
      // copy into and out of temporaries after codegen.
      MOZ_CRASH("should have reversed this");
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

// Lane-wise comparison of float32 values against a constant vector.  Accepts
// Equal, LessThan, LessThanOrEqual and NotEqual; anything else crashes.
void MacroAssemblerX86Shared::compareFloat32x4(Assembler::Condition cond,
                                               FloatRegister lhs,
                                               const SimdConstant& rhs,
                                               FloatRegister dest) {
  switch (cond) {
    case Assembler::Condition::Equal:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpeqps,
                    &MacroAssembler::vcmpeqpsSimd128);
      break;
    case Assembler::Condition::LessThan:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpltps,
                    &MacroAssembler::vcmpltpsSimd128);
      break;
    case Assembler::Condition::LessThanOrEqual:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpleps,
                    &MacroAssembler::vcmplepsSimd128);
      break;
    case Assembler::Condition::NotEqual:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpneqps,
                    &MacroAssembler::vcmpneqpsSimd128);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

// Lane-wise comparison of float64 values: output = (lhs <cond> rhs).
//
// Accepts Equal, LessThan, LessThanOrEqual and NotEqual.  GreaterThan and
// GreaterThanOrEqual crash: callers are expected to have swapped the operands
// already.
void MacroAssemblerX86Shared::compareFloat64x2(FloatRegister lhs, Operand rhs,
                                               Assembler::Condition cond,
                                               FloatRegister output) {
  // TODO Can do better here with three-address compares

  // Move lhs to output if lhs!=output; move rhs out of the way if rhs==output.
  // This is bad, but Ion does not need this fixup.
  ScratchSimd128Scope scratch(asMasm());
  if (!HasAVX() && !lhs.aliases(output)) {
    if (rhs.kind() == Operand::FPREG &&
        output.aliases(FloatRegister::FromCode(rhs.fpu()))) {
      vmovapd(rhs, scratch);
      rhs = Operand(scratch);
    }
    vmovapd(lhs, output);
    lhs = output;
  }

  switch (cond) {
    case Assembler::Condition::Equal:
      vcmpeqpd(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThan:
      vcmpltpd(rhs, lhs, output);
      break;
    case Assembler::Condition::LessThanOrEqual:
      vcmplepd(rhs, lhs, output);
      break;
    case Assembler::Condition::NotEqual:
      vcmpneqpd(rhs, lhs, output);
      break;
    case Assembler::Condition::GreaterThanOrEqual:
    case Assembler::Condition::GreaterThan:
      // We reverse these operations in the -inl.h file so that we don't have to
      // copy into and out of temporaries after codegen.
      MOZ_CRASH("should have reversed this");
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

// Lane-wise comparison of float64 values against a constant vector.  Accepts
// Equal, LessThan, LessThanOrEqual and NotEqual; anything else crashes.
void MacroAssemblerX86Shared::compareFloat64x2(Assembler::Condition cond,
                                               FloatRegister lhs,
                                               const SimdConstant& rhs,
                                               FloatRegister dest) {
  switch (cond) {
    case Assembler::Condition::Equal:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpeqpd,
                    &MacroAssembler::vcmpeqpdSimd128);
      break;
    case Assembler::Condition::LessThan:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpltpd,
                    &MacroAssembler::vcmpltpdSimd128);
      break;
    case Assembler::Condition::LessThanOrEqual:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmplepd,
                    &MacroAssembler::vcmplepdSimd128);
      break;
    case Assembler::Condition::NotEqual:
      binarySimd128(lhs, rhs, dest, &MacroAssembler::vcmpneqpd,
                    &MacroAssembler::vcmpneqpdSimd128);
      break;
    default:
      MOZ_CRASH("unexpected condition op");
  }
}

// Semantics of wasm max and min.
//
//  * -0 < 0
//  * If one input is NaN then that NaN is the output
//  * If both inputs are NaN then the output is selected nondeterministically
//  * Any returned NaN is always made quiet
//  * The MVP spec 2.2.3 says "No distinction is made between signalling and
//    quiet NaNs", suggesting SNaN inputs are allowed and should not fault
//
// Semantics of maxps/minps/maxpd/minpd:
//
//  * If the values are both +/-0 the rhs is returned
//  * If the rhs is SNaN then the rhs is returned
//  * If either value is NaN then the rhs is returned
//  * An SNaN operand does not appear to give rise to an exception, at least
//    not in the JS shell on Linux, though the Intel spec lists Invalid
//    as one of the possible exceptions

// Various unaddressed considerations:
//
// It's pretty insane for this to take an Operand rhs - it really needs to be
// a register, given the number of times we access it.
//
// Constant load can be folded into the ANDPS.  Do we care?  It won't save us
// any registers, since output/temp1/temp2/scratch are all live at the same time
// after the first instruction of the slow path.
//
// Can we use blend for the NaN extraction/insertion?  We'd need xmm0 for the
// mask, which is no fun.  But it would be lhs UNORD lhs -> mask, blend;
// rhs UNORD rhs -> mask; blend.  Better than the mess we have below.  But
// we'd still need to set up the QNaN bits, unless we can blend those too
// with the lhs UNORD rhs mask?
//
// If we could determine that both input lanes are NaN then the result of the
// fast path should be fine modulo the QNaN bits, but it's not obvious this is
// much of an advantage.

void MacroAssemblerX86Shared::minMaxFloat32x4(bool isMin, FloatRegister lhs, Operand rhs, FloatRegister temp1, FloatRegister temp2, FloatRegister output) { ScratchSimd128Scope scratch(asMasm()); Label l; SimdConstant quietBits(SimdConstant::SplatX4(int32_t(0x00400000))); /* clang-format off */ /* leave my comments alone */ lhs = moveSimd128FloatIfNotAVXOrOther(lhs, scratch, output); if (isMin) { vmovaps(lhs, output); // compute vminps(rhs, output, output); // min lhs, rhs vmovaps(rhs, temp1); // compute vminps(Operand(lhs), temp1, temp1); // min rhs, lhs vorps(temp1, output, output); // fix min(-0, 0) with OR } else { vmovaps(lhs, output); // compute vmaxps(rhs, output, output); // max lhs, rhs vmovaps(rhs, temp1); // compute vmaxps(Operand(lhs), temp1, temp1); // max rhs, lhs vandps(temp1, output, output); // fix max(-0, 0) with AND } vmovaps(lhs, temp1); // compute vcmpunordps(rhs, temp1, temp1); // lhs UNORD rhs vptest(temp1, temp1); // check if any unordered j(Assembler::Equal, &l); // and exit if not // Slow path. // output has result for non-NaN lanes, garbage in NaN lanes. // temp1 has lhs UNORD rhs. // temp2 is dead. 
vmovaps(temp1, temp2); // clear NaN lanes of result vpandn(output, temp2, temp2); // result now in temp2 asMasm().vpandSimd128(quietBits, temp1, temp1); // setup QNaN bits in NaN lanes vorps(temp1, temp2, temp2); // and OR into result vmovaps(lhs, temp1); // find NaN lanes vcmpunordps(Operand(temp1), temp1, temp1); // in lhs vmovaps(temp1, output); // (and save them for later) vandps(lhs, temp1, temp1); // and extract the NaNs vorps(temp1, temp2, temp2); // and add to the result vmovaps(rhs, temp1); // find NaN lanes vcmpunordps(Operand(temp1), temp1, temp1); // in rhs vpandn(temp1, output, output); // except if they were in lhs vandps(rhs, output, output); // and extract the NaNs vorps(temp2, output, output); // and add to the result bind(&l); /* clang-format on */ } void MacroAssemblerX86Shared::minMaxFloat32x4AVX(bool isMin, FloatRegister lhs, FloatRegister rhs, FloatRegister temp1, FloatRegister temp2, FloatRegister output) { ScratchSimd128Scope scratch(asMasm()); Label l; SimdConstant quietBits(SimdConstant::SplatX4(int32_t(0x00400000))); /* clang-format off */ /* leave my comments alone */ FloatRegister lhsCopy = moveSimd128FloatIfEqual(lhs, scratch, output); // Allow rhs be assigned to scratch when rhs == lhs and == output -- // don't make a special case since the semantics require setup QNaN bits. FloatRegister rhsCopy = moveSimd128FloatIfEqual(rhs, scratch, output); if (isMin) { vminps(Operand(rhs), lhs, temp2); // min lhs, rhs vminps(Operand(lhs), rhs, temp1); // min rhs, lhs vorps(temp1, temp2, output); // fix min(-0, 0) with OR } else { vmaxps(Operand(rhs), lhs, temp2); // max lhs, rhs vmaxps(Operand(lhs), rhs, temp1); // max rhs, lhs vandps(temp1, temp2, output); // fix max(-0, 0) with AND } vcmpunordps(Operand(rhsCopy), lhsCopy, temp1); // lhs UNORD rhs vptest(temp1, temp1); // check if any unordered j(Assembler::Equal, &l); // and exit if not // Slow path. // output has result for non-NaN lanes, garbage in NaN lanes. // temp1 has lhs UNORD rhs. 
// temp2 is dead. vcmpunordps(Operand(lhsCopy), lhsCopy, temp2); // find NaN lanes in lhs vblendvps(temp2, lhsCopy, rhsCopy, temp2); // add other lines from rhs asMasm().vporSimd128(quietBits, temp2, temp2); // setup QNaN bits in NaN lanes vblendvps(temp1, temp2, output, output); // replace NaN lines from temp2 bind(&l); /* clang-format on */ } // Exactly as above. void MacroAssemblerX86Shared::minMaxFloat64x2(bool isMin, FloatRegister lhs, Operand rhs, FloatRegister temp1, FloatRegister temp2, FloatRegister output) { ScratchSimd128Scope scratch(asMasm()); Label l; SimdConstant quietBits(SimdConstant::SplatX2(int64_t(0x0008000000000000ull))); /* clang-format off */ /* leave my comments alone */ lhs = moveSimd128FloatIfNotAVXOrOther(lhs, scratch, output); if (isMin) { vmovapd(lhs, output); // compute vminpd(rhs, output, output); // min lhs, rhs vmovapd(rhs, temp1); // compute vminpd(Operand(lhs), temp1, temp1); // min rhs, lhs vorpd(temp1, output, output); // fix min(-0, 0) with OR } else { vmovapd(lhs, output); // compute vmaxpd(rhs, output, output); // max lhs, rhs vmovapd(rhs, temp1); // compute vmaxpd(Operand(lhs), temp1, temp1); // max rhs, lhs vandpd(temp1, output, output); // fix max(-0, 0) with AND } vmovapd(lhs, temp1); // compute vcmpunordpd(rhs, temp1, temp1); // lhs UNORD rhs vptest(temp1, temp1); // check if any unordered j(Assembler::Equal, &l); // and exit if not // Slow path. // output has result for non-NaN lanes, garbage in NaN lanes. // temp1 has lhs UNORD rhs. // temp2 is dead. 
vmovapd(temp1, temp2); // clear NaN lanes of result vpandn(output, temp2, temp2); // result now in temp2 asMasm().vpandSimd128(quietBits, temp1, temp1); // setup QNaN bits in NaN lanes vorpd(temp1, temp2, temp2); // and OR into result vmovapd(lhs, temp1); // find NaN lanes vcmpunordpd(Operand(temp1), temp1, temp1); // in lhs vmovapd(temp1, output); // (and save them for later) vandpd(lhs, temp1, temp1); // and extract the NaNs vorpd(temp1, temp2, temp2); // and add to the result vmovapd(rhs, temp1); // find NaN lanes vcmpunordpd(Operand(temp1), temp1, temp1); // in rhs vpandn(temp1, output, output); // except if they were in lhs vandpd(rhs, output, output); // and extract the NaNs vorpd(temp2, output, output); // and add to the result bind(&l); /* clang-format on */ } void MacroAssemblerX86Shared::minMaxFloat64x2AVX(bool isMin, FloatRegister lhs, FloatRegister rhs, FloatRegister temp1, FloatRegister temp2, FloatRegister output) { ScratchSimd128Scope scratch(asMasm()); Label l; SimdConstant quietBits(SimdConstant::SplatX2(int64_t(0x0008000000000000ull))); /* clang-format off */ /* leave my comments alone */ FloatRegister lhsCopy = moveSimd128FloatIfEqual(lhs, scratch, output); // Allow rhs be assigned to scratch when rhs == lhs and == output -- // don't make a special case since the semantics require setup QNaN bits. FloatRegister rhsCopy = moveSimd128FloatIfEqual(rhs, scratch, output); if (isMin) { vminpd(Operand(rhs), lhs, temp2); // min lhs, rhs vminpd(Operand(lhs), rhs, temp1); // min rhs, lhs vorpd(temp1, temp2, output); // fix min(-0, 0) with OR } else { vmaxpd(Operand(rhs), lhs, temp2); // max lhs, rhs vmaxpd(Operand(lhs), rhs, temp1); // max rhs, lhs vandpd(temp1, temp2, output); // fix max(-0, 0) with AND } vcmpunordpd(Operand(rhsCopy), lhsCopy, temp1); // lhs UNORD rhs vptest(temp1, temp1); // check if any unordered j(Assembler::Equal, &l); // and exit if not // Slow path. // output has result for non-NaN lanes, garbage in NaN lanes. 
// temp1 has lhs UNORD rhs. // temp2 is dead. vcmpunordpd(Operand(lhsCopy), lhsCopy, temp2); // find NaN lanes in lhs vblendvpd(temp2, lhsCopy, rhsCopy, temp2); // add other lines from rhs asMasm().vporSimd128(quietBits, temp2, temp2); // setup QNaN bits in NaN lanes vblendvpd(temp1, temp2, output, output); // replace NaN lines from temp2 bind(&l); /* clang-format on */ } void MacroAssemblerX86Shared::minFloat32x4(FloatRegister lhs, FloatRegister rhs, FloatRegister temp1, FloatRegister temp2, FloatRegister output) { if (HasAVX()) { minMaxFloat32x4AVX(/*isMin=*/true, lhs, rhs, temp1, temp2, output); return; } minMaxFloat32x4(/*isMin=*/true, lhs, Operand(rhs), temp1, temp2, output); } void MacroAssemblerX86Shared::maxFloat32x4(FloatRegister lhs, FloatRegister rhs, FloatRegister temp1, FloatRegister temp2, FloatRegister output) { if (HasAVX()) { minMaxFloat32x4AVX(/*isMin=*/false, lhs, rhs, temp1, temp2, output); return; } minMaxFloat32x4(/*isMin=*/false, lhs, Operand(rhs), temp1, temp2, output); } void MacroAssemblerX86Shared::minFloat64x2(FloatRegister lhs, FloatRegister rhs, FloatRegister temp1, FloatRegister temp2, FloatRegister output) { if (HasAVX()) { minMaxFloat64x2AVX(/*isMin=*/true, lhs, rhs, temp1, temp2, output); return; } minMaxFloat64x2(/*isMin=*/true, lhs, Operand(rhs), temp1, temp2, output); } void MacroAssemblerX86Shared::maxFloat64x2(FloatRegister lhs, FloatRegister rhs, FloatRegister temp1, FloatRegister temp2, FloatRegister output) { if (HasAVX()) { minMaxFloat64x2AVX(/*isMin=*/false, lhs, rhs, temp1, temp2, output); return; } minMaxFloat64x2(/*isMin=*/false, lhs, Operand(rhs), temp1, temp2, output); } void MacroAssemblerX86Shared::packedShiftByScalarInt8x16( FloatRegister in, Register count, FloatRegister xtmp, FloatRegister dest, void (MacroAssemblerX86Shared::*shift)(FloatRegister, FloatRegister, FloatRegister), void (MacroAssemblerX86Shared::*extend)(const Operand&, FloatRegister)) { ScratchSimd128Scope scratch(asMasm()); vmovd(count, scratch); 
// High bytes vpalignr(Operand(in), xtmp, xtmp, 8); (this->*extend)(Operand(xtmp), xtmp); (this->*shift)(scratch, xtmp, xtmp); // Low bytes (this->*extend)(Operand(dest), dest); (this->*shift)(scratch, dest, dest); // Mask off garbage to avoid saturation during packing asMasm().loadConstantSimd128Int(SimdConstant::SplatX4(int32_t(0x00FF00FF)), scratch); vpand(Operand(scratch), xtmp, xtmp); vpand(Operand(scratch), dest, dest); vpackuswb(Operand(xtmp), dest, dest); } void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16( FloatRegister in, Register count, FloatRegister xtmp, FloatRegister dest) { packedShiftByScalarInt8x16(in, count, xtmp, dest, &MacroAssemblerX86Shared::vpsllw, &MacroAssemblerX86Shared::vpmovzxbw); } void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16( Imm32 count, FloatRegister src, FloatRegister dest) { MOZ_ASSERT(count.value <= 7); if (MOZ_UNLIKELY(count.value == 0)) { moveSimd128Int(src, dest); return; } src = asMasm().moveSimd128IntIfNotAVX(src, dest); // Use the doubling trick for low shift counts, otherwise mask off the bits // that are shifted out of the low byte of each word and use word shifts. The // optimal cutoff remains to be explored. 
if (count.value <= 3) { vpaddb(Operand(src), src, dest); for (int32_t shift = count.value - 1; shift > 0; --shift) { vpaddb(Operand(dest), dest, dest); } } else { asMasm().bitwiseAndSimd128(src, SimdConstant::SplatX16(0xFF >> count.value), dest); vpsllw(count, dest, dest); } } void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16( FloatRegister in, Register count, FloatRegister xtmp, FloatRegister dest) { packedShiftByScalarInt8x16(in, count, xtmp, dest, &MacroAssemblerX86Shared::vpsraw, &MacroAssemblerX86Shared::vpmovsxbw); } void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16( Imm32 count, FloatRegister src, FloatRegister dest) { MOZ_ASSERT(count.value <= 7); ScratchSimd128Scope scratch(asMasm()); vpunpckhbw(src, scratch, scratch); vpunpcklbw(src, dest, dest); vpsraw(Imm32(count.value + 8), scratch, scratch); vpsraw(Imm32(count.value + 8), dest, dest); vpacksswb(Operand(scratch), dest, dest); } void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16( FloatRegister in, Register count, FloatRegister xtmp, FloatRegister dest) { packedShiftByScalarInt8x16(in, count, xtmp, dest, &MacroAssemblerX86Shared::vpsrlw, &MacroAssemblerX86Shared::vpmovzxbw); } void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16( Imm32 count, FloatRegister src, FloatRegister dest) { MOZ_ASSERT(count.value <= 7); src = asMasm().moveSimd128IntIfNotAVX(src, dest); asMasm().bitwiseAndSimd128( src, SimdConstant::SplatX16((0xFF << count.value) & 0xFF), dest); vpsrlw(count, dest, dest); } void MacroAssemblerX86Shared::packedLeftShiftByScalarInt16x8( FloatRegister in, Register count, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); vmovd(count, scratch); vpsllw(scratch, in, dest); } void MacroAssemblerX86Shared::packedRightShiftByScalarInt16x8( FloatRegister in, Register count, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); vmovd(count, scratch); vpsraw(scratch, in, dest); } void 
MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt16x8( FloatRegister in, Register count, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); vmovd(count, scratch); vpsrlw(scratch, in, dest); } void MacroAssemblerX86Shared::packedLeftShiftByScalarInt32x4( FloatRegister in, Register count, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); vmovd(count, scratch); vpslld(scratch, in, dest); } void MacroAssemblerX86Shared::packedRightShiftByScalarInt32x4( FloatRegister in, Register count, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); vmovd(count, scratch); vpsrad(scratch, in, dest); } void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt32x4( FloatRegister in, Register count, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); vmovd(count, scratch); vpsrld(scratch, in, dest); } void MacroAssemblerX86Shared::packedLeftShiftByScalarInt64x2( FloatRegister in, Register count, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); vmovd(count, scratch); vpsllq(scratch, in, dest); } void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2( FloatRegister in, Register count, FloatRegister temp, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); vmovd(count, temp); asMasm().signReplicationInt64x2(in, scratch); in = asMasm().moveSimd128FloatIfNotAVX(in, dest); // Invert if negative, shift all, invert back if negative. vpxor(Operand(scratch), in, dest); vpsrlq(temp, dest, dest); vpxor(Operand(scratch), dest, dest); } void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt64x2( FloatRegister in, Register count, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); vmovd(count, scratch); vpsrlq(scratch, in, dest); } void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2( Imm32 count, FloatRegister src, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); asMasm().signReplicationInt64x2(src, scratch); // Invert if negative, shift all, invert back if negative. 
src = asMasm().moveSimd128FloatIfNotAVX(src, dest); vpxor(Operand(scratch), src, dest); vpsrlq(Imm32(count.value & 63), dest, dest); vpxor(Operand(scratch), dest, dest); } void MacroAssemblerX86Shared::selectSimd128(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse, FloatRegister temp, FloatRegister output) { // Normally the codegen will attempt to enforce these register assignments so // that the moves are avoided. onTrue = asMasm().moveSimd128IntIfNotAVX(onTrue, output); if (MOZ_UNLIKELY(mask == onTrue)) { vpor(Operand(onFalse), onTrue, output); return; } mask = asMasm().moveSimd128IntIfNotAVX(mask, temp); vpand(Operand(mask), onTrue, output); vpandn(Operand(onFalse), mask, temp); vpor(Operand(temp), output, output); } // Code sequences for int32x4<->float32x4 culled from v8; commentary added. void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat32x4( FloatRegister src, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); src = asMasm().moveSimd128IntIfNotAVX(src, dest); vpxor(Operand(scratch), scratch, scratch); // extract low bits vpblendw(0x55, src, scratch, scratch); // into scratch vpsubd(Operand(scratch), src, dest); // and high bits into dest vcvtdq2ps(scratch, scratch); // convert low bits vpsrld(Imm32(1), dest, dest); // get high into unsigned range vcvtdq2ps(dest, dest); // convert vaddps(Operand(dest), dest, dest); // and back into signed vaddps(Operand(scratch), dest, dest); // combine high+low: may round } void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); // The cvttps2dq instruction is the workhorse but does not handle NaN or out // of range values as we need it to. We want to saturate too-large positive // values to 7FFFFFFFh and too-large negative values to 80000000h. NaN and -0 // become 0. // Convert NaN to 0 by masking away values that compare unordered to itself. 
if (HasAVX()) { vcmpeqps(Operand(src), src, scratch); vpand(Operand(scratch), src, dest); } else { vmovaps(src, scratch); vcmpeqps(Operand(scratch), scratch, scratch); moveSimd128Float(src, dest); vpand(Operand(scratch), dest, dest); } // Make lanes in scratch == 0xFFFFFFFFh, if dest overflows during cvttps2dq, // otherwise 0. static const SimdConstant minOverflowedInt = SimdConstant::SplatX4(2147483648.f); if (HasAVX()) { asMasm().vcmpgepsSimd128(minOverflowedInt, dest, scratch); } else { asMasm().loadConstantSimd128Float(minOverflowedInt, scratch); vcmpleps(Operand(dest), scratch, scratch); } // Convert. This will make the output 80000000h if the input is out of range. vcvttps2dq(dest, dest); // Convert overflow lanes to 0x7FFFFFFF. vpxor(Operand(scratch), dest, dest); } void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4( FloatRegister src, FloatRegister temp, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); src = asMasm().moveSimd128FloatIfNotAVX(src, dest); // The cvttps2dq instruction is the workhorse but does not handle NaN or out // of range values as we need it to. We want to saturate too-large positive // values to FFFFFFFFh and negative values to zero. NaN and -0 become 0. // Convert NaN and negative values to zeroes in dest. vxorps(Operand(scratch), scratch, scratch); vmaxps(Operand(scratch), src, dest); // Place the largest positive signed integer in all lanes in scratch. // We use it to bias the conversion to handle edge cases. asMasm().loadConstantSimd128Float(SimdConstant::SplatX4(2147483647.f), scratch); // temp = dest - 7FFFFFFFh (as floating), this brings integers in the unsigned // range but above the signed range into the signed range; 0 => -7FFFFFFFh. vmovaps(dest, temp); vsubps(Operand(scratch), temp, temp); // scratch = mask of biased values that are greater than 7FFFFFFFh. vcmpleps(Operand(temp), scratch, scratch); // Convert the biased values to integer. 
Positive values above 7FFFFFFFh will // have been converted to 80000000h, all others become the expected integer. vcvttps2dq(temp, temp); // As lanes of scratch are ~0 where the result overflows, this computes // 7FFFFFFF in lanes of temp that are 80000000h, and leaves other lanes // untouched as the biased integer. vpxor(Operand(scratch), temp, temp); // Convert negative biased lanes in temp to zero. After this, temp will be // zero where the result should be zero or is less than 80000000h, 7FFFFFFF // where the result overflows, and will have the converted biased result in // other lanes (for input values >= 80000000h). vpxor(Operand(scratch), scratch, scratch); vpmaxsd(Operand(scratch), temp, temp); // Convert. Overflow lanes above 7FFFFFFFh will be 80000000h, other lanes will // be what they should be. vcvttps2dq(dest, dest); // Add temp to the result. Overflow lanes with 80000000h becomes FFFFFFFFh, // biased high-value unsigned lanes become unbiased, everything else is left // unchanged. vpaddd(Operand(temp), dest, dest); } void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4Relaxed( FloatRegister src, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); src = asMasm().moveSimd128FloatIfNotAVX(src, dest); // Place lanes below 80000000h into dest, otherwise into scratch. // Keep dest or scratch 0 as default. asMasm().loadConstantSimd128Float(SimdConstant::SplatX4(0x4f000000), scratch); vcmpltps(Operand(src), scratch, scratch); vpand(Operand(src), scratch, scratch); vpxor(Operand(scratch), src, dest); // Convert lanes below 80000000h into unsigned int without issues. vcvttps2dq(dest, dest); // Knowing IEEE-754 number representation: convert lanes above 7FFFFFFFh, // mutiply by 2 (to add 1 in exponent) and shift to the left by 8 bits. vaddps(Operand(scratch), scratch, scratch); vpslld(Imm32(8), scratch, scratch); // Combine the results. 
vpaddd(Operand(scratch), dest, dest); } void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat64x2( FloatRegister src, FloatRegister dest) { src = asMasm().moveSimd128FloatIfNotAVX(src, dest); asMasm().vunpcklpsSimd128(SimdConstant::SplatX4(0x43300000), src, dest); asMasm().vsubpdSimd128(SimdConstant::SplatX2(4503599627370496.0), dest, dest); } void MacroAssemblerX86Shared::truncSatFloat64x2ToInt32x4(FloatRegister src, FloatRegister temp, FloatRegister dest) { FloatRegister srcForTemp = asMasm().moveSimd128FloatIfNotAVX(src, temp); vcmpeqpd(Operand(srcForTemp), srcForTemp, temp); src = asMasm().moveSimd128FloatIfNotAVX(src, dest); asMasm().vandpdSimd128(SimdConstant::SplatX2(2147483647.0), temp, temp); vminpd(Operand(temp), src, dest); vcvttpd2dq(dest, dest); } void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4( FloatRegister src, FloatRegister temp, FloatRegister dest) { src = asMasm().moveSimd128FloatIfNotAVX(src, dest); vxorpd(temp, temp, temp); vmaxpd(Operand(temp), src, dest); asMasm().vminpdSimd128(SimdConstant::SplatX2(4294967295.0), dest, dest); vroundpd(SSERoundingMode::Trunc, Operand(dest), dest); asMasm().vaddpdSimd128(SimdConstant::SplatX2(4503599627370496.0), dest, dest); // temp == 0 vshufps(0x88, temp, dest, dest); } void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4Relaxed( FloatRegister src, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); // The same as unsignedConvertInt32x4ToFloat64x2, but without NaN // and out-of-bounds checks. vroundpd(SSERoundingMode::Trunc, Operand(src), dest); asMasm().loadConstantSimd128Float(SimdConstant::SplatX2(4503599627370496.0), scratch); vaddpd(Operand(scratch), dest, dest); // The scratch has zeros in f32x4 lanes with index 0 and 2. The in-memory // representantation of the splatted double contantant contains zero in its // low bits. 
vshufps(0x88, scratch, dest, dest); } void MacroAssemblerX86Shared::popcntInt8x16(FloatRegister src, FloatRegister temp, FloatRegister output) { ScratchSimd128Scope scratch(asMasm()); asMasm().loadConstantSimd128Int(SimdConstant::SplatX16(0x0f), scratch); FloatRegister srcForTemp = asMasm().moveSimd128IntIfNotAVX(src, temp); vpand(scratch, srcForTemp, temp); vpandn(src, scratch, scratch); int8_t counts[] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; asMasm().loadConstantSimd128(SimdConstant::CreateX16(counts), output); vpsrlw(Imm32(4), scratch, scratch); vpshufb(temp, output, output); asMasm().loadConstantSimd128(SimdConstant::CreateX16(counts), temp); vpshufb(scratch, temp, temp); vpaddb(Operand(temp), output, output); } void MacroAssemblerX86Shared::dotBFloat16x8ThenAdd(FloatRegister lhs, FloatRegister rhs, FloatRegister dest, FloatRegister temp) { ScratchSimd128Scope scratch(asMasm()); FloatRegister lhsCopy = asMasm().moveSimd128IntIfNotAVX(lhs, scratch); FloatRegister rhsCopy = asMasm().moveSimd128IntIfNotAVX(rhs, temp); vpslld(Imm32(16), lhsCopy, scratch); vpslld(Imm32(16), rhsCopy, temp); if (HasFMA()) { vfmadd231ps(temp, scratch, dest); } else { asMasm().mulFloat32x4(scratch, temp, scratch); asMasm().addFloat32x4(dest, scratch, dest); } // The temp has 0 in low half-word. Use pblendw instead of `& 0xFFFF0000`. FloatRegister tempCopy = asMasm().moveSimd128IntIfNotAVX(temp, scratch); vpblendw(0xAA, lhs, tempCopy, scratch); vpblendw(0xAA, rhs, temp, temp); if (HasFMA()) { vfmadd231ps(temp, scratch, dest); } else { asMasm().mulFloat32x4(scratch, temp, scratch); asMasm().addFloat32x4(dest, scratch, dest); } }