1 files changed, 3862 insertions, 0 deletions
diff --git a/src/libs/dxvk-native-1.9.2a/src/dxso/dxso_compiler.cpp b/src/libs/dxvk-native-1.9.2a/src/dxso/dxso_compiler.cpp
new file mode 100644
index 00000000..f7bd312b
--- /dev/null
+++ b/src/libs/dxvk-native-1.9.2a/src/dxso/dxso_compiler.cpp
@@ -0,0 +1,3862 @@
+#include "dxso_compiler.h"
+
+#include "dxso_analysis.h"
+
+#include "../d3d9/d3d9_caps.h"
+#include "../d3d9/d3d9_constant_set.h"
+#include "../d3d9/d3d9_state.h"
+#include "../d3d9/d3d9_spec_constants.h"
+#include "../d3d9/d3d9_fixed_function.h"
+#include "dxso_util.h"
+
+#include "../dxvk/dxvk_spec_const.h"
+
+#include <cfloat>
+
+namespace dxvk {
+
+  DxsoCompiler::DxsoCompiler(
+    const std::string&        fileName,
+    const DxsoModuleInfo&     moduleInfo,
+    const DxsoProgramInfo&    programInfo,
+    const DxsoAnalysisInfo&   analysis,
+    const D3D9ConstantLayout& layout)
+    : m_moduleInfo ( moduleInfo )
+    , m_programInfo( programInfo )
+    , m_analysis   ( &analysis )
+    , m_layout     ( &layout )
+    , m_module     ( spvVersion(1, 3) ) {
+    // Declare an entry point ID. We'll need it during the
+    // initialization phase where the execution mode is set.
+    m_entryPointId = m_module.allocateId();
+
+    // Set the shader name so that we recognize it in renderdoc
+    m_module.setDebugSource(
+      spv::SourceLanguageUnknown, 0,
+      m_module.addDebugString(fileName.c_str()),
+      nullptr);
+
+    // Set the memory model. This is the same for all shaders.
+    m_module.setMemoryModel(
+      spv::AddressingModelLogical,
+      spv::MemoryModelGLSL450);
+
+    m_usedSamplers = 0;
+    m_usedRTs      = 0;
+
+    for (uint32_t i = 0; i < m_rRegs.size(); i++)
+      m_rRegs.at(i)  = DxsoRegisterPointer{ };
+
+    for (uint32_t i = 0; i < m_cFloat.size(); i++)
+      m_cFloat.at(i) = 0;
+
+    for (uint32_t i = 0; i < m_cInt.size(); i++)
+      m_cInt.at(i)   = 0;
+
+    for (uint32_t i = 0; i < m_cBool.size(); i++)
+      m_cBool.at(i)  = 0;
+
+    m_vs.addr        = DxsoRegisterPointer{ };
+    m_vs.oPos        = DxsoRegisterPointer{ };
+    m_fog            = DxsoRegisterPointer{ };
+    m_vs.oPSize      = DxsoRegisterPointer{ };
+
+    for (uint32_t i = 0; i < m_ps.oColor.size(); i++)
+      m_ps.oColor.at(i) = DxsoRegisterPointer{ };
+    m_ps.oDepth      = DxsoRegisterPointer{ };
+    m_ps.vFace       = DxsoRegisterPointer{ };
+    m_ps.vPos        = DxsoRegisterPointer{ };
+
+    m_loopCounter = DxsoRegisterPointer{ };
+
+    this->emitInit();
+  }
+
+
+  void DxsoCompiler::processInstruction(
+    const DxsoInstructionContext& ctx,
+          uint32_t                currentCoissueIdx) {
+    const DxsoOpcode opcode = ctx.instruction.opcode;
+
+    for (const auto& coissue : m_analysis->coissues) {
+      if (coissue.instructionIdx == ctx.instructionIdx &&
+          coissue.instructionIdx != currentCoissueIdx)
+        return;
+
+      if (coissue.instructionIdx == ctx.instructionIdx + 1)
+        processInstruction(coissue, coissue.instructionIdx);
+    }
+
+    switch (opcode) {
+    case DxsoOpcode::Nop:
+      return;
+
+    case DxsoOpcode::Dcl:
+      return this->emitDcl(ctx);
+
+    case DxsoOpcode::Def:
+    case DxsoOpcode::DefI:
+    case DxsoOpcode::DefB:
+      return this->emitDef(ctx);
+
+    case DxsoOpcode::Mov:
+    case DxsoOpcode::Mova:
+      return this->emitMov(ctx);
+
+    case DxsoOpcode::Add:
+    case DxsoOpcode::Sub:
+    case DxsoOpcode::Mad:
+    case DxsoOpcode::Mul:
+    case DxsoOpcode::Rcp:
+    case DxsoOpcode::Rsq:
+    case DxsoOpcode::Dp3:
+    case DxsoOpcode::Dp4:
+    case DxsoOpcode::Slt:
+    case DxsoOpcode::Sge:
+    case DxsoOpcode::Min:
+    case DxsoOpcode::ExpP:
+    case DxsoOpcode::Exp:
+    case DxsoOpcode::Max:
+    case DxsoOpcode::Pow:
+    case DxsoOpcode::Crs:
+    case DxsoOpcode::Abs:
+    case DxsoOpcode::Sgn:
+    case DxsoOpcode::Nrm:
+    case DxsoOpcode::SinCos:
+    case DxsoOpcode::Lit:
+    case DxsoOpcode::Dst:
+    case DxsoOpcode::LogP:
+    case DxsoOpcode::Log:
+    case DxsoOpcode::Lrp:
+    case DxsoOpcode::Frc:
+    case DxsoOpcode::Cmp:
+    case DxsoOpcode::Cnd:
+    case DxsoOpcode::Dp2Add:
+    case DxsoOpcode::DsX:
+    case DxsoOpcode::DsY:
+      return this->emitVectorAlu(ctx);
+
+    case DxsoOpcode::SetP:
+      return this->emitPredicateOp(ctx);
+
+    case DxsoOpcode::M3x2:
+    case DxsoOpcode::M3x3:
+    case DxsoOpcode::M3x4:
+    case DxsoOpcode::M4x3:
+    case DxsoOpcode::M4x4:
+      return this->emitMatrixAlu(ctx);
+
+    case DxsoOpcode::Loop:
+      return this->emitControlFlowLoop(ctx);
+    case DxsoOpcode::EndLoop:
+      return this->emitControlFlowEndLoop(ctx);
+
+    case DxsoOpcode::Rep:
+      return this->emitControlFlowRep(ctx);
+    case DxsoOpcode::EndRep:
+      return this->emitControlFlowEndRep(ctx);
+
+    case DxsoOpcode::Break:
+      return this->emitControlFlowBreak(ctx);
+    case DxsoOpcode::BreakC:
+      return this->emitControlFlowBreakC(ctx);
+
+    case DxsoOpcode::If:
+    case DxsoOpcode::Ifc:
+      return this->emitControlFlowIf(ctx);
+    case DxsoOpcode::Else:
+      return this->emitControlFlowElse(ctx);
+    case DxsoOpcode::EndIf:
+      return this->emitControlFlowEndIf(ctx);
+
+    case DxsoOpcode::TexCoord:
+      return this->emitTexCoord(ctx);
+
+    case DxsoOpcode::Tex:
+    case DxsoOpcode::TexLdl:
+    case DxsoOpcode::TexLdd:
+    case DxsoOpcode::TexDp3Tex:
+    case DxsoOpcode::TexReg2Ar:
+    case DxsoOpcode::TexReg2Gb:
+    case DxsoOpcode::TexReg2Rgb:
+    case DxsoOpcode::TexBem:
+    case DxsoOpcode::TexBemL:
+    case DxsoOpcode::TexM3x2Tex:
+    case DxsoOpcode::TexM3x3Tex:
+    case DxsoOpcode::TexM3x3Spec:
+    case DxsoOpcode::TexM3x3VSpec:
+      return this->emitTextureSample(ctx);
+    case DxsoOpcode::TexKill:
+      return this->emitTextureKill(ctx);
+    case DxsoOpcode::TexDepth:
+      return this->emitTextureDepth(ctx);
+
+    case DxsoOpcode::TexM3x3Pad:
+    case DxsoOpcode::TexM3x2Pad:
+      // We don't need to do anything here, these are just padding instructions
+      break;
+
+    case DxsoOpcode::End:
+    case DxsoOpcode::Comment:
+    case DxsoOpcode::Phase:
+      break;
+
+    default:
+      Logger::warn(str::format("DxsoCompiler::processInstruction: unhandled opcode: ", opcode));
+      break;
+    }
+  }
+
+  void DxsoCompiler::finalize() {
+    if (m_programInfo.type() == DxsoProgramTypes::VertexShader)
+      this->emitVsFinalize();
+    else
+      this->emitPsFinalize();
+
+    // Declare the entry point, we now have all the
+    // information we need, including the interfaces
+    m_module.addEntryPoint(m_entryPointId,
+      m_programInfo.executionModel(), "main",
+      m_entryPointInterfaces.size(),
+      m_entryPointInterfaces.data());
+    m_module.setDebugName(m_entryPointId, "main");
+  }
+
+
+  DxsoPermutations DxsoCompiler::compile() {
+    DxsoPermutations permutations = { };
+
+    // Create the shader module object
+    permutations[D3D9ShaderPermutations::None] = compileShader();
+
+    // If we need to add more permuations, might be worth making a copy of module
+    // before we do anything more. :-)
+    if (m_programInfo.type() == DxsoProgramType::PixelShader) {
+      if (m_ps.diffuseColorIn)
+        m_module.decorate(m_ps.diffuseColorIn, spv::DecorationFlat);
+
+      if (m_ps.specularColorIn)
+        m_module.decorate(m_ps.specularColorIn, spv::DecorationFlat);
+
+      permutations[D3D9ShaderPermutations::FlatShade] = compileShader();
+    }
+
+    return permutations;
+  }
+
+
+  Rc<DxvkShader> DxsoCompiler::compileShader() {
+    DxvkShaderOptions shaderOptions = { };
+    DxvkShaderConstData constData = { };
+
+    return new DxvkShader(
+      m_programInfo.shaderStage(),
+      m_resourceSlots.size(),
+      m_resourceSlots.data(),
+      m_interfaceSlots,
+      m_module.compile(),
+      shaderOptions,
+      std::move(constData));
+  }
+
+  void DxsoCompiler::emitInit() {
+    // Set up common capabilities for all shaders
+    m_module.enableCapability(spv::CapabilityShader);
+    m_module.enableCapability(spv::CapabilityImageQuery);
+
+    this->emitDclConstantBuffer();
+    this->emitDclInputArray();
+
+    // Initialize the shader module with capabilities
+    // etc. Each shader type has its own peculiarities.
+    switch (m_programInfo.type()) {
+      case DxsoProgramTypes::VertexShader: return this->emitVsInit();
+      case DxsoProgramTypes::PixelShader:  return this->emitPsInit();
+      default: break;
+    }
+  }
+
+
+  void DxsoCompiler::emitDclConstantBuffer() {
+    const bool asSsbo = m_moduleInfo.options.vertexConstantBufferAsSSBO &&
+      m_programInfo.type() == DxsoProgramType::VertexShader;
+
+    std::array<uint32_t, 3> members = {
+      // float f[256 or 224 or 8192]
+      m_module.defArrayTypeUnique(
+        getVectorTypeId({ DxsoScalarType::Float32, 4 }),
+        m_module.constu32(m_layout->floatCount)),
+
+      // int i[16 or 2048]
+      m_module.defArrayTypeUnique(
+        getVectorTypeId({ DxsoScalarType::Sint32, 4 }),
+        m_module.constu32(m_layout->intCount)),
+
+      //    uint32_t boolBitmask
+      // or uvec4    boolBitmask[512]
+      // Defined later...
+      0
+    };
+
+    // Decorate array strides, this is required.
+    m_module.decorateArrayStride(members[0], 16);
+    m_module.decorateArrayStride(members[1], 16);
+
+    const bool swvp = m_layout->bitmaskCount != 1;
+
+    if (swvp) {
+      // Must be a multiple of 4 otherwise.
+      members[2] = m_module.defArrayTypeUnique(
+        getVectorTypeId({ DxsoScalarType::Uint32, 4 }),
+        m_module.constu32(m_layout->bitmaskCount / 4));
+
+      m_module.decorateArrayStride(members[2], 16);
+    }
+
+    const uint32_t structType =
+      m_module.defStructType(swvp ? 3 : 2, members.data());
+
+    m_module.decorate(structType, asSsbo
+      ? spv::DecorationBufferBlock
+      : spv::DecorationBlock);
+
+    m_module.memberDecorateOffset(structType, 0, m_layout->floatOffset());
+    m_module.memberDecorateOffset(structType, 1, m_layout->intOffset());
+
+    if (swvp)
+      m_module.memberDecorateOffset(structType, 2, m_layout->bitmaskOffset());
+
+    m_module.setDebugName(structType, "cbuffer_t");
+    m_module.setDebugMemberName(structType, 0, "f");
+    m_module.setDebugMemberName(structType, 1, "i");
+
+    if (swvp)
+      m_module.setDebugMemberName(structType, 2, "b");
+
+    m_cBuffer = m_module.newVar(
+      m_module.defPointerType(structType, spv::StorageClassUniform),
+      spv::StorageClassUniform);
+
+    m_module.setDebugName(m_cBuffer, "c");
+
+    const uint32_t bindingId = computeResourceSlotId(
+      m_programInfo.type(), DxsoBindingType::ConstantBuffer,
+      0);
+
+    m_module.decorateDescriptorSet(m_cBuffer, 0);
+    m_module.decorateBinding(m_cBuffer, bindingId);
+
+    if (asSsbo)
+      m_module.decorate(m_cBuffer, spv::DecorationNonWritable);
+
+    DxvkResourceSlot resource;
+    resource.slot   = bindingId;
+    resource.type   = asSsbo
+      ? VK_DESCRIPTOR_TYPE_STORAGE_BUFFER
+      : VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
+    resource.view   = VK_IMAGE_VIEW_TYPE_MAX_ENUM;
+    resource.access = VK_ACCESS_UNIFORM_READ_BIT;
+    m_resourceSlots.push_back(resource);
+
+    m_boolSpecConstant = m_module.specConst32(m_module.defIntType(32, 0), 0);
+    m_module.decorateSpecId(m_boolSpecConstant, getSpecId(
+      m_programInfo.type() == DxsoProgramType::VertexShader
+      ? D3D9SpecConstantId::VertexShaderBools
+      : D3D9SpecConstantId::PixelShaderBools));
+    m_module.setDebugName(m_boolSpecConstant, "boolConstants");
+
+    m_depthSpecConstant = m_module.specConst32(m_module.defIntType(32, 0), 0);
+    m_module.decorateSpecId(m_depthSpecConstant, getSpecId(D3D9SpecConstantId::SamplerDepthMode));
+    m_module.setDebugName(m_depthSpecConstant, "depthSamplers");
+  }
+
+
+  void DxsoCompiler::emitDclInputArray() {
+    DxsoArrayType info;
+    info.ctype   = DxsoScalarType::Float32;
+    info.ccount  = 4;
+    info.alength = DxsoMaxInterfaceRegs;
+
+    uint32_t arrayTypeId = getArrayTypeId(info);
+
+    // Define the actual variable. Note that this is private
+    // because we will copy input registers
+    // to the array during the setup phase.
+    const uint32_t ptrTypeId = m_module.defPointerType(
+      arrayTypeId, spv::StorageClassPrivate);
+
+    m_vArray = m_module.newVar(
+      ptrTypeId, spv::StorageClassPrivate);
+    m_module.setDebugName(m_vArray, "v");
+  }
+
+  void DxsoCompiler::emitDclOutputArray() {
+    DxsoArrayType info;
+    info.ctype   = DxsoScalarType::Float32;
+    info.ccount  = 4;
+    info.alength = m_programInfo.type() == DxsoProgramTypes::VertexShader
+      ? DxsoMaxInterfaceRegs
+      : caps::MaxSimultaneousRenderTargets;
+
+    uint32_t arrayTypeId = getArrayTypeId(info);
+
+    // Define the actual variable. Note that this is private
+    // because we will copy input registers
+    // to the array during the setup phase.
+    const uint32_t ptrTypeId = m_module.defPointerType(
+      arrayTypeId, spv::StorageClassPrivate);
+
+    m_oArray = m_module.newVar(
+      ptrTypeId, spv::StorageClassPrivate);
+    m_module.setDebugName(m_oArray, "o");
+  }
+
+
+  void DxsoCompiler::emitVsInit() {
+    m_module.enableCapability(spv::CapabilityClipDistance);
+
+    // Only VS needs this, because PS has
+    // non-indexable specialized output regs
+    this->emitDclOutputArray();
+
+    // Main function of the vertex shader
+    m_vs.functionId = m_module.allocateId();
+    m_module.setDebugName(m_vs.functionId, "vs_main");
+
+    this->setupRenderStateInfo();
+
+    this->emitFunctionBegin(
+      m_vs.functionId,
+      m_module.defVoidType(),
+      m_module.defFunctionType(
+        m_module.defVoidType(), 0, nullptr));
+    this->emitFunctionLabel();
+  }
+
+
+  void DxsoCompiler::emitPsSharedConstants() {
+    m_ps.sharedState = GetSharedConstants(m_module);
+
+    const uint32_t bindingId = computeResourceSlotId(
+      m_programInfo.type(), DxsoBindingType::ConstantBuffer,
+      PSShared);
+
+    m_module.decorateDescriptorSet(m_ps.sharedState, 0);
+    m_module.decorateBinding(m_ps.sharedState, bindingId);
+
+    DxvkResourceSlot resource;
+    resource.slot   = bindingId;
+    resource.type   = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
+    resource.view   = VK_IMAGE_VIEW_TYPE_MAX_ENUM;
+    resource.access = VK_ACCESS_UNIFORM_READ_BIT;
+    m_resourceSlots.push_back(resource);
+  }
+
+
+  void DxsoCompiler::emitPsInit() {
+    m_module.enableCapability(spv::CapabilityDerivativeControl);
+
+    m_module.setExecutionMode(m_entryPointId,
+      spv::ExecutionModeOriginUpperLeft);
+
+
+    // Main function of the pixel shader
+    m_ps.functionId = m_module.allocateId();
+    m_module.setDebugName(m_ps.functionId, "ps_main");
+
+    if (m_programInfo.majorVersion() < 2 || m_moduleInfo.options.forceSamplerTypeSpecConstants) {
+      m_ps.samplerTypeSpec = m_module.specConst32(m_module.defIntType(32, 0), 0);
+      m_module.decorateSpecId(m_ps.samplerTypeSpec, getSpecId(D3D9SpecConstantId::SamplerType));
+      m_module.setDebugName(m_ps.samplerTypeSpec, "s_sampler_types");
+
+      if (m_programInfo.majorVersion() < 2) {
+        m_ps.projectionSpec = m_module.specConst32(m_module.defIntType(32, 0), 0);
+        m_module.decorateSpecId(m_ps.projectionSpec, getSpecId(D3D9SpecConstantId::ProjectionType));
+        m_module.setDebugName(m_ps.projectionSpec, "s_projections");
+      }
+    }
+
+    m_ps.fetch4Spec = m_module.specConst32(m_module.defIntType(32, 0), 0);
+    m_module.decorateSpecId(m_ps.fetch4Spec, getSpecId(D3D9SpecConstantId::Fetch4));
+    m_module.setDebugName(m_ps.fetch4Spec, "s_fetch4");
+
+    this->setupRenderStateInfo();
+    this->emitPsSharedConstants();
+
+    this->emitFunctionBegin(
+      m_ps.functionId,
+      m_module.defVoidType(),
+      m_module.defFunctionType(
+        m_module.defVoidType(), 0, nullptr));
+    this->emitFunctionLabel();
+
+    // We may have to defer kill operations to the end of
+    // the shader in order to keep derivatives correct.
+    if (m_analysis->usesKill && m_moduleInfo.options.useDemoteToHelperInvocation) {
+      // This extension basically implements D3D-style discard
+      m_module.enableExtension("SPV_EXT_demote_to_helper_invocation");
+      m_module.enableCapability(spv::CapabilityDemoteToHelperInvocationEXT);
+    }
+    else if (m_analysis->usesKill && m_analysis->usesDerivatives) {
+      m_ps.killState = m_module.newVarInit(
+        m_module.defPointerType(m_module.defBoolType(), spv::StorageClassPrivate),
+        spv::StorageClassPrivate, m_module.constBool(false));
+
+      m_module.setDebugName(m_ps.killState, "ps_kill");
+
+      if (m_moduleInfo.options.useSubgroupOpsForEarlyDiscard) {
+        m_module.enableCapability(spv::CapabilityGroupNonUniform);
+        m_module.enableCapability(spv::CapabilityGroupNonUniformBallot);
+
+        DxsoRegisterInfo laneId;
+        laneId.type = { DxsoScalarType::Uint32, 1, 0 };
+        laneId.sclass = spv::StorageClassInput;
+
+        m_ps.builtinLaneId = emitNewBuiltinVariable(
+          laneId, spv::BuiltInSubgroupLocalInvocationId,
+          "fLaneId", 0);
+      }
+    }
+  }
+
+
+  void DxsoCompiler::emitFunctionBegin(
+          uint32_t                entryPoint,
+          uint32_t                returnType,
+          uint32_t                funcType) {
+    this->emitFunctionEnd();
+
+    m_module.functionBegin(
+      returnType, entryPoint, funcType,
+      spv::FunctionControlMaskNone);
+
+    m_insideFunction = true;
+  }
+
+
+  void DxsoCompiler::emitFunctionEnd() {
+    if (m_insideFunction) {
+      m_module.opReturn();
+      m_module.functionEnd();
+    }
+
+    m_insideFunction = false;
+  }
+
+
+  uint32_t DxsoCompiler::emitFunctionLabel() {
+    uint32_t labelId = m_module.allocateId();
+    m_module.opLabel(labelId);
+    return labelId;
+  }
+
+
+  void DxsoCompiler::emitMainFunctionBegin() {
+    this->emitFunctionBegin(
+      m_entryPointId,
+      m_module.defVoidType(),
+      m_module.defFunctionType(
+        m_module.defVoidType(), 0, nullptr));
+    m_mainFuncLabel = this->emitFunctionLabel();
+  }
+
+
+  uint32_t DxsoCompiler::emitNewVariable(const DxsoRegisterInfo& info) {
+    const uint32_t ptrTypeId = this->getPointerTypeId(info);
+    return m_module.newVar(ptrTypeId, info.sclass);
+  }
+
+
+  uint32_t DxsoCompiler::emitNewVariableDefault(
+    const DxsoRegisterInfo& info,
+          uint32_t          value) {
+    const uint32_t ptrTypeId = this->getPointerTypeId(info);
+    if (value == 0)
+      return m_module.newVar(ptrTypeId, info.sclass);
+    else
+      return m_module.newVarInit(ptrTypeId, info.sclass, value);
+  }
+
+
+  uint32_t DxsoCompiler::emitNewBuiltinVariable(
+    const DxsoRegisterInfo& info,
+          spv::BuiltIn      builtIn,
+    const char*             name,
+          uint32_t          value) {
+    const uint32_t varId = emitNewVariableDefault(info, value);
+
+    m_module.setDebugName(varId, name);
+    m_module.decorateBuiltIn(varId, builtIn);
+
+    if (m_programInfo.type() == DxsoProgramTypes::PixelShader
+     && info.type.ctype != DxsoScalarType::Float32
+     && info.type.ctype != DxsoScalarType::Bool
+     && info.sclass == spv::StorageClassInput)
+      m_module.decorate(varId, spv::DecorationFlat);
+
+    m_entryPointInterfaces.push_back(varId);
+    return varId;
+  }
+
+  DxsoCfgBlock* DxsoCompiler::cfgFindBlock(
+    const std::initializer_list<DxsoCfgBlockType>& types) {
+    for (auto cur =  m_controlFlowBlocks.rbegin();
+              cur != m_controlFlowBlocks.rend(); cur++) {
+      for (auto type : types) {
+        if (cur->type == type)
+          return &(*cur);
+      }
+    }
+
+    return nullptr;
+  }
+
+  spv::BuiltIn semanticToBuiltIn(bool input, DxsoSemantic semantic) {
+    if (input)
+      return spv::BuiltInMax;
+
+    if (semantic == DxsoSemantic{ DxsoUsage::Position, 0 })
+      return spv::BuiltInPosition;
+
+    if (semantic == DxsoSemantic{ DxsoUsage::PointSize, 0 })
+      return spv::BuiltInPointSize;
+
+    return spv::BuiltInMax;
+  }
+
+  void DxsoCompiler::emitDclInterface(
+            bool         input,
+            uint32_t     regNumber,
+            DxsoSemantic semantic,
+            DxsoRegMask  mask,
+            bool         centroid) {
+    auto& sgn = input
+      ? m_isgn : m_osgn;
+
+    const bool pixel  = m_programInfo.type() == DxsoProgramTypes::PixelShader;
+    const bool vertex = !pixel;
+
+    if (pixel && input && semantic.usage == DxsoUsage::Color && m_programInfo.majorVersion() < 3)
+      centroid = true;
+
+    uint32_t slot = 0;
+
+    uint32_t& slots = input
+      ? m_interfaceSlots.inputSlots
+      : m_interfaceSlots.outputSlots;
+
+    uint16_t& explicits = input
+      ? m_explicitInputs
+      : m_explicitOutputs;
+
+    // Some things we consider builtins could be packed in an output reg.
+    bool builtin = semanticToBuiltIn(input, semantic) != spv::BuiltInMax;
+
+    uint32_t i = sgn.elemCount++;
+
+    if (input && vertex) {
+      // Any slot will do! Let's chose the next one
+      slot = i;
+    }
+    else if ( (!input && vertex)
+           || (input  && pixel ) ) {
+      // Don't register the slot if it belongs to a builtin
+      if (!builtin)
+        slot = RegisterLinkerSlot(semantic);
+    }
+    else { //if (!input && pixel)
+      // We want to make the output slot the same as the
+      // output register for pixel shaders so they go to
+      // the right render target.
+      slot = regNumber;
+    }
+
+    // Don't want to mark down any of these builtins.
+    if (!builtin)
+      slots   |= 1u << slot;
+    explicits |= 1u << regNumber;
+
+    auto& elem = sgn.elems[i];
+    elem.slot      = slot;
+    elem.regNumber = regNumber;
+    elem.semantic  = semantic;
+    elem.mask      = mask;
+    elem.centroid  = centroid;
+  }
+
+  void DxsoCompiler::emitDclSampler(
+          uint32_t        idx,
+          DxsoTextureType type) {
+    m_usedSamplers |= (1u << idx);
+
+    VkImageViewType viewType = VK_IMAGE_VIEW_TYPE_MAX_ENUM;
+
+    auto DclSampler = [this, &viewType](
+      uint32_t        idx,
+      uint32_t        bindingId,
+      DxsoSamplerType type,
+      bool            depth,
+      bool            implicit) {
+      // Setup our combines sampler.
+      DxsoSamplerInfo& sampler = !depth
+        ? m_samplers[idx].color[type]
+        : m_samplers[idx].depth[type];
+
+      spv::Dim dimensionality;
+
+      const char* suffix = "_2d";
+
+      switch (type) {
+        default:
+        case SamplerTypeTexture2D:
+          sampler.dimensions = 2;
+          dimensionality = spv::Dim2D;
+          viewType = VK_IMAGE_VIEW_TYPE_2D;
+          break;
+
+        case SamplerTypeTextureCube:
+          suffix = "_cube";
+          sampler.dimensions = 3;
+          dimensionality = spv::DimCube;
+          viewType = VK_IMAGE_VIEW_TYPE_CUBE;
+          break;
+
+        case SamplerTypeTexture3D:
+          suffix = "_3d";
+          sampler.dimensions = 3;
+          dimensionality = spv::Dim3D;
+          viewType = VK_IMAGE_VIEW_TYPE_3D;
+          break;
+      }
+
+      sampler.imageTypeId = m_module.defImageType(
+        m_module.defFloatType(32),
+        dimensionality, depth ? 1 : 0, 0, 0, 1,
+        spv::ImageFormatUnknown);
+
+      sampler.typeId = m_module.defSampledImageType(sampler.imageTypeId);
+
+      sampler.varId = m_module.newVar(
+        m_module.defPointerType(
+          sampler.typeId, spv::StorageClassUniformConstant),
+        spv::StorageClassUniformConstant);
+
+      std::string name = str::format("s", idx, suffix, depth ? "_shadow" : "");
+      m_module.setDebugName(sampler.varId, name.c_str());
+
+      m_module.decorateDescriptorSet(sampler.varId, 0);
+      m_module.decorateBinding      (sampler.varId, bindingId);
+    };
+
+    const uint32_t binding = computeResourceSlotId(m_programInfo.type(),
+      DxsoBindingType::Image,
+      idx);
+
+    const bool implicit = m_programInfo.majorVersion() < 2 || m_moduleInfo.options.forceSamplerTypeSpecConstants;
+
+    if (!implicit) {
+      DxsoSamplerType samplerType = 
+        SamplerTypeFromTextureType(type);
+
+      DclSampler(idx, binding, samplerType, false, implicit);
+
+      if (samplerType != SamplerTypeTexture3D) {
+        // We could also be depth compared!
+        DclSampler(idx, binding, samplerType, true, implicit);
+      }
+    }
+    else {
+      // Could be any of these!
+      // We will check with the spec constant at sample time.
+      for (uint32_t i = 0; i < SamplerTypeCount; i++) {
+        auto samplerType = static_cast<DxsoSamplerType>(i);
+
+        DclSampler(idx, binding, samplerType, false, implicit);
+
+        if (samplerType != SamplerTypeTexture3D)
+          DclSampler(idx, binding, samplerType, true, implicit);
+      }
+    }
+
+    DxsoSampler& sampler = m_samplers[idx];
+    sampler.boundConst = m_module.specConstBool(true);
+    sampler.type = type;
+    m_module.decorateSpecId(sampler.boundConst, binding);
+    m_module.setDebugName(sampler.boundConst,
+      str::format("s", idx, "_bound").c_str());
+
+    // Store descriptor info for the shader interface
+    DxvkResourceSlot resource;
+    resource.slot   = binding;
+    resource.type   = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
+    resource.view   = implicit ? VK_IMAGE_VIEW_TYPE_MAX_ENUM : viewType;
+    resource.access = VK_ACCESS_SHADER_READ_BIT;
+    m_resourceSlots.push_back(resource);
+  }
+
+
+  uint32_t DxsoCompiler::emitArrayIndex(
+            uint32_t          idx,
+      const DxsoBaseRegister* relative) {
+    uint32_t result = m_module.consti32(idx);
+
+    if (relative != nullptr) {
+      DxsoRegisterValue offset = emitRegisterLoad(*relative, DxsoRegMask(true, false, false, false), nullptr);
+
+      result = m_module.opIAdd(
+        getVectorTypeId(offset.type),
+        result, offset.id);
+    }
+
+    return result;
+  }
+
+
+  DxsoRegisterPointer DxsoCompiler::emitInputPtr(
+            bool              texture,
+      const DxsoBaseRegister& reg,
+      const DxsoBaseRegister* relative) {
+    uint32_t idx = reg.id.num;
+
+    // Account for the two color regs.
+    if (texture)
+      idx += 2;
+
+    DxsoRegisterPointer input;
+
+    input.type = DxsoVectorType{ DxsoScalarType::Float32, 4 };
+
+    uint32_t index = this->emitArrayIndex(idx, relative);
+
+    const uint32_t typeId = getVectorTypeId(input.type);
+    input.id = m_module.opAccessChain(
+      m_module.defPointerType(typeId, spv::StorageClassPrivate),
+      m_vArray,
+      1, &index);
+
+    return input;
+  }
+
+  DxsoRegisterPointer DxsoCompiler::emitRegisterPtr(
+      const char*             name,
+            DxsoScalarType    ctype,
+            uint32_t          ccount,
+            uint32_t          defaultVal,
+            spv::StorageClass storageClass,
+            spv::BuiltIn      builtIn) {
+    DxsoRegisterPointer result;
+
+    DxsoRegisterInfo info;
+    info.type.ctype    = ctype;
+    info.type.ccount   = ccount;
+    info.type.alength  = 1;
+    info.sclass        = storageClass;
+
+    result.type = DxsoVectorType{ ctype, ccount };
+    if (builtIn == spv::BuiltInMax) {
+      result.id = this->emitNewVariableDefault(info, defaultVal);
+      m_module.setDebugName(result.id, name);
+    }
+    else {
+      result.id = this->emitNewBuiltinVariable(
+        info, builtIn, name, defaultVal);
+    }
+
+    return result;
+  }
+
+
+  DxsoRegisterValue DxsoCompiler::emitLoadConstant(
+      const DxsoBaseRegister& reg,
+      const DxsoBaseRegister* relative) {
+    // struct cBuffer_t {
+    //
+    //   Type     Member        Index
+    //
+    //   float    f[256 or 224];       0
+    //   int32_t  i[16];        1
+    //   uint32_t boolBitmask;  2
+    // }
+    DxsoRegisterValue result = { };
+
+    switch (reg.id.type) {
+      case DxsoRegisterType::Const:
+        result.type = { DxsoScalarType::Float32, 4 };
+
+        if (!relative)
+          result.id = m_cFloat.at(reg.id.num);
+        break;
+      
+      case DxsoRegisterType::ConstInt:
+        result.type = { DxsoScalarType::Sint32, 4 };
+        result.id = m_cInt.at(reg.id.num);
+        break;
+      
+      case DxsoRegisterType::ConstBool:
+        result.type = { DxsoScalarType::Bool, 1 };
+        result.id = m_cBool.at(reg.id.num);
+        break;
+      
+      default: break;
+    }
+
+    if (result.id)
+      return result;
+
+    switch (reg.id.type) {
+      case DxsoRegisterType::Const:
+        if (!relative) {
+          m_meta.maxConstIndexF = std::max(m_meta.maxConstIndexF, reg.id.num + 1);
+          m_meta.maxConstIndexF = std::min(m_meta.maxConstIndexF, m_layout->floatCount);
+        } else {
+          m_meta.maxConstIndexF = m_layout->floatCount;
+          m_meta.needsConstantCopies |= m_moduleInfo.options.strictConstantCopies
+                                     || m_cFloat.at(reg.id.num) != 0;
+        }
+        break;
+      
+      case DxsoRegisterType::ConstInt:
+        m_meta.maxConstIndexI = std::max(m_meta.maxConstIndexI, reg.id.num + 1);
+        m_meta.maxConstIndexI = std::min(m_meta.maxConstIndexI, m_layout->intCount);
+        break;
+      
+      case DxsoRegisterType::ConstBool:
+        m_meta.maxConstIndexB = std::max(m_meta.maxConstIndexB, reg.id.num + 1);
+        m_meta.maxConstIndexB = std::min(m_meta.maxConstIndexB, m_layout->boolCount);
+        m_meta.boolConstantMask |= 1 << reg.id.num;
+        break;
+      
+      default: break;
+    }
+
+    uint32_t relativeIdx = this->emitArrayIndex(reg.id.num, relative);
+
+    if (reg.id.type != DxsoRegisterType::ConstBool) {
+      uint32_t structIdx = reg.id.type == DxsoRegisterType::Const
+        ? m_module.constu32(0)
+        : m_module.constu32(1);
+
+      std::array<uint32_t, 2> indices = { structIdx, relativeIdx };
+
+      uint32_t typeId = getVectorTypeId(result.type);
+      uint32_t ptrId = m_module.opAccessChain(
+        m_module.defPointerType(typeId, spv::StorageClassUniform),
+        m_cBuffer, indices.size(), indices.data());
+
+      result.id = m_module.opLoad(typeId, ptrId);
+
+      if (relative) {
+        uint32_t constCount = m_module.constu32(m_layout->floatCount);
+
+        // Expand condition to bvec4 since the result has four components
+        uint32_t cond = m_module.opULessThan(m_module.defBoolType(), relativeIdx, constCount);
+        std::array<uint32_t, 4> condIds = { cond, cond, cond, cond };
+
+        cond = m_module.opCompositeConstruct(
+          m_module.defVectorType(m_module.defBoolType(), 4),
+          condIds.size(), condIds.data());
+
+        result.id = m_module.opSelect(typeId, cond, result.id,
+          m_module.constvec4f32(0.0f, 0.0f, 0.0f, 0.0f));
+      }
+    } else {
+      // Bool constants have no relative indexing, so we can do the bitfield
+      // magic for SWVP at compile time.
+
+      uint32_t uintType  = getScalarTypeId(DxsoScalarType::Uint32);
+      uint32_t uvec4Type = getVectorTypeId({ DxsoScalarType::Uint32, 4 });
+
+      // If not SWVP, spec const this
+      uint32_t bitfield;
+      if (m_layout->bitmaskCount != 1) {
+        std::array<uint32_t, 2> indices = { m_module.constu32(2), m_module.constu32(reg.id.num / 128) };
+
+        uint32_t indexCount = m_layout->bitmaskCount == 1 ? 1 : 2;
+        uint32_t accessType = m_layout->bitmaskCount == 1 ? uintType : uvec4Type;
+
+        uint32_t ptrId = m_module.opAccessChain(
+          m_module.defPointerType(accessType, spv::StorageClassUniform),
+          m_cBuffer, indexCount, indices.data());
+
+        bitfield = m_module.opLoad(accessType, ptrId);
+      }
+      else
+        bitfield = m_boolSpecConstant;
+
+      uint32_t bitIdx = m_module.consti32(reg.id.num % 32);
+
+      if (m_layout->bitmaskCount != 1) {
+        uint32_t index = (reg.id.num % 128) / 32;
+        bitfield = m_module.opCompositeExtract(uintType, bitfield, 1, &index);
+      }
+      uint32_t bit = m_module.opBitFieldUExtract(
+        uintType, bitfield, bitIdx, m_module.consti32(1));
+
+      result.id = m_module.opINotEqual(
+        getVectorTypeId(result.type),
+        bit, m_module.constu32(0));
+    }
+
+    return result;
+  }
+
+
+  DxsoRegisterPointer DxsoCompiler::emitOutputPtr(
+            bool              texcrdOut,
+      const DxsoBaseRegister& reg,
+      const DxsoBaseRegister* relative) {
+    uint32_t idx = reg.id.num;
+
+    // Account for the two color regs.
+    if (texcrdOut)
+      idx += 2;
+
+    DxsoRegisterPointer input;
+
+    input.type = DxsoVectorType{ DxsoScalarType::Float32, 4 };
+
+    uint32_t index = this->emitArrayIndex(idx, relative);
+
+    const uint32_t typeId = getVectorTypeId(input.type);
+    input.id = m_module.opAccessChain(
+      m_module.defPointerType(typeId, spv::StorageClassPrivate),
+      m_oArray,
+      1, &index);
+
+    return input;
+  }
+
+
+  DxsoRegisterPointer DxsoCompiler::emitGetOperandPtr(
+      const DxsoBaseRegister& reg,
+      const DxsoBaseRegister* relative) {
+    switch (reg.id.type) {
+      case DxsoRegisterType::Temp: {
+        DxsoRegisterPointer& ptr = m_rRegs.at(reg.id.num);
+        if (ptr.id == 0) {
+          std::string name = str::format("r", reg.id.num);
+          ptr = this->emitRegisterPtr(
+            name.c_str(), DxsoScalarType::Float32, 4,
+            m_module.constvec4f32(0.0f, 0.0f, 0.0f, 0.0f));
+        }
+        return ptr;
+      }
+
+      case DxsoRegisterType::Input: {
+        if (!(m_explicitInputs & 1u << reg.id.num)) {
+          this->emitDclInterface(
+            true, reg.id.num,
+            DxsoSemantic{ DxsoUsage::Color, reg.id.num },
+            IdentityWriteMask, false);
+        }
+
+        return this->emitInputPtr(false, reg, relative);
+      }
+
+      case DxsoRegisterType::PixelTexcoord:
+      case DxsoRegisterType::Texture: {
+        if (m_programInfo.type() == DxsoProgramTypes::PixelShader) {
+          // Texture register
+
+          // SM2, or SM 1.4
+          if (reg.id.type == DxsoRegisterType::PixelTexcoord
+          ||  m_programInfo.majorVersion() >= 2
+          || (m_programInfo.majorVersion() == 1
+           && m_programInfo.minorVersion() == 4)) {
+            uint32_t adjustedNumber = reg.id.num + 2;
+            if (!(m_explicitInputs & 1u << adjustedNumber)) {
+              this->emitDclInterface(
+                true, adjustedNumber,
+                DxsoSemantic{ DxsoUsage::Texcoord, reg.id.num },
+                IdentityWriteMask, false);
+            }
+
+            return this->emitInputPtr(true, reg, relative);
+          }
+          else {
+            // User must use tex/texcoord to put data in this private register.
+            // We use the an oob id which fxc never generates for the texcoord data.
+            DxsoRegisterPointer& ptr = m_tRegs.at(reg.id.num);
+            if (ptr.id == 0) {
+              std::string name = str::format("t", reg.id.num);
+              ptr = this->emitRegisterPtr(
+                name.c_str(), DxsoScalarType::Float32, 4,
+                m_module.constvec4f32(0.0f, 0.0f, 0.0f, 0.0f));
+            }
+            return ptr;
+          }
+        }
+        else {
+          // Address register
+          if (m_vs.addr.id == 0) {
+            m_vs.addr = this->emitRegisterPtr(
+              "a0", DxsoScalarType::Sint32, 4,
+              m_module.constvec4i32(0, 0, 0, 0));
+          }
+          return m_vs.addr;
+        }
+      }
+
+      case DxsoRegisterType::RasterizerOut:
+        switch (reg.id.num) {
+          case RasterOutPosition:
+            if (m_vs.oPos.id == 0) {
+              m_vs.oPos = this->emitRegisterPtr(
+                "oPos", DxsoScalarType::Float32, 4,
+                m_module.constvec4f32(0.0f, 0.0f, 0.0f, 0.0f),
+                spv::StorageClassOutput, spv::BuiltInPosition);
+            }
+            return m_vs.oPos;
+
+          case RasterOutFog:
+            if (m_fog.id == 0) {
+              bool input = m_programInfo.type() == DxsoProgramType::PixelShader;
+              DxsoSemantic semantic = DxsoSemantic{ DxsoUsage::Fog, 0 };
+
+              uint32_t slot = RegisterLinkerSlot(semantic);
+
+              uint32_t& slots = input
+                ? m_interfaceSlots.inputSlots
+                : m_interfaceSlots.outputSlots;
+
+              slots |= 1u << slot;
+
+              m_fog = this->emitRegisterPtr(
+                input ? "vFog" : "oFog",
+                DxsoScalarType::Float32, 1,
+                input ? 0 : m_module.constf32(1.0f),
+                input ? spv::StorageClassInput : spv::StorageClassOutput);
+
+              m_entryPointInterfaces.push_back(m_fog.id);
+
+              m_module.decorateLocation(m_fog.id, slot);
+            }
+            return m_fog;
+
+          case RasterOutPointSize:
+            if (m_vs.oPSize.id == 0) {
+              m_vs.oPSize = this->emitRegisterPtr(
+                "oPSize", DxsoScalarType::Float32, 1,
+                m_module.constf32(0.0f),
+                spv::StorageClassOutput, spv::BuiltInPointSize);
+            }
+            return m_vs.oPSize;
+        }
+
+      case DxsoRegisterType::ColorOut: {
+        uint32_t idx = std::min(reg.id.num, 4u);
+
+        if (m_ps.oColor[idx].id == 0) {
+          std::string name = str::format("oC", idx);
+          m_ps.oColor[idx] = this->emitRegisterPtr(
+            name.c_str(), DxsoScalarType::Float32, 4,
+            m_module.constvec4f32(0.0f, 0.0f, 0.0f, 0.0f),
+            spv::StorageClassOutput);
+
+          m_interfaceSlots.outputSlots |= 1u << idx;
+          m_module.decorateLocation(m_ps.oColor[idx].id, idx);
+          m_module.decorateIndex(m_ps.oColor[idx].id, 0);
+
+          m_entryPointInterfaces.push_back(m_ps.oColor[idx].id);
+          m_usedRTs |= (1u << idx);
+        }
+        return m_ps.oColor[idx];
+      }
+
+      case DxsoRegisterType::AttributeOut: {
+        auto ptr = this->emitOutputPtr(false, reg, nullptr);
+
+        if (!(m_explicitOutputs & 1u << reg.id.num)) {
+          this->emitDclInterface(
+            false, reg.id.num,
+            DxsoSemantic{ DxsoUsage::Color, reg.id.num },
+            IdentityWriteMask, false);
+
+          m_module.opStore(ptr.id, m_module.constfReplicant(0, ptr.type.ccount));
+        }
+
+        return ptr;
+      }
+
+      case DxsoRegisterType::Output: {
+        bool texcrdOut = m_programInfo.type() == DxsoProgramTypes::VertexShader
+                      && m_programInfo.majorVersion() != 3;
+
+        auto ptr = this->emitOutputPtr(texcrdOut, reg, !texcrdOut ? relative : nullptr);
+
+        if (texcrdOut) {
+          uint32_t adjustedNumber = reg.id.num + 2;
+          if (!(m_explicitOutputs & 1u << adjustedNumber)) {
+            this->emitDclInterface(
+              false, adjustedNumber,
+              DxsoSemantic{ DxsoUsage::Texcoord, reg.id.num },
+              IdentityWriteMask, false);
+
+            m_module.opStore(ptr.id, m_module.constfReplicant(0, ptr.type.ccount));
+          }
+        }
+
+        return ptr;
+      }
+
+      case DxsoRegisterType::DepthOut:
+        if (m_ps.oDepth.id == 0) {
+          m_module.setExecutionMode(m_entryPointId,
+            spv::ExecutionModeDepthReplacing);
+
+          m_ps.oDepth = this->emitRegisterPtr(
+            "oDepth", DxsoScalarType::Float32, 1,
+            m_module.constf32(0.0f),
+            spv::StorageClassOutput, spv::BuiltInFragDepth);
+        }
+        return m_ps.oDepth;
+
+      case DxsoRegisterType::Loop:
+        if (m_loopCounter.id == 0) {
+          m_loopCounter = this->emitRegisterPtr(
+            "aL", DxsoScalarType::Sint32, 1,
+            m_module.consti32(0));
+        }
+        return m_loopCounter;
+
+      case DxsoRegisterType::MiscType:
+        if (reg.id.num == MiscTypePosition) {
+          if (m_ps.vPos.id == 0) {
+            m_ps.vPos = this->emitRegisterPtr(
+              "vPos", DxsoScalarType::Float32, 4, 0);
+          }
+          return m_ps.vPos;
+        }
+        else { // MiscTypeFace
+          if (m_ps.vFace.id == 0) {
+            m_ps.vFace = this->emitRegisterPtr(
+              "vFace", DxsoScalarType::Float32, 4, 0);
+          }
+          return m_ps.vFace;
+        }
+
+      case DxsoRegisterType::Predicate: {
+        DxsoRegisterPointer& ptr = m_pRegs.at(reg.id.num);
+        if (ptr.id == 0) {
+          std::string name = str::format("p", reg.id.num);
+          ptr = this->emitRegisterPtr(
+            name.c_str(), DxsoScalarType::Bool, 4,
+            m_module.constvec4b32(false, false, false, false));
+        }
+        return ptr;
+      }
+
+      default: {
+        //Logger::warn(str::format("emitGetOperandPtr: unhandled reg type: ", reg.id.type));
+
+        DxsoRegisterPointer nullPointer;
+        nullPointer.id = 0;
+        return nullPointer;
+      }
+    }
+  }
+
+
+  uint32_t DxsoCompiler::emitBoolComparison(DxsoVectorType type, DxsoComparison cmp, uint32_t a, uint32_t b) {
+    const uint32_t typeId = getVectorTypeId(type);
+    switch (cmp) {
+      default:
+      case DxsoComparison::Never:        return m_module.constbReplicant(false, type.ccount);  break;
+      case DxsoComparison::GreaterThan:  return m_module.opFOrdGreaterThan     (typeId, a, b); break;
+      case DxsoComparison::Equal:        return m_module.opFOrdEqual           (typeId, a, b); break;
+      case DxsoComparison::GreaterEqual: return m_module.opFOrdGreaterThanEqual(typeId, a, b); break;
+      case DxsoComparison::LessThan:     return m_module.opFOrdLessThan        (typeId, a, b); break;
+      case DxsoComparison::NotEqual:     return m_module.opFOrdNotEqual        (typeId, a, b); break;
+      case DxsoComparison::LessEqual:    return m_module.opFOrdLessThanEqual   (typeId, a, b); break;
+      case DxsoComparison::Always:       return m_module.constbReplicant(true, type.ccount);   break;
+    }
+}
+
+
+  DxsoRegisterValue DxsoCompiler::emitValueLoad(
+            DxsoRegisterPointer ptr) {
+    DxsoRegisterValue result;
+    result.type = ptr.type;
+    result.id   = m_module.opLoad(
+      getVectorTypeId(result.type),
+      ptr.id);
+    return result;
+  }
+
+
+  DxsoRegisterValue DxsoCompiler::applyPredicate(DxsoRegisterValue pred, DxsoRegisterValue dst, DxsoRegisterValue src) {
+    if (dst.type.ccount != pred.type.ccount) {
+      DxsoRegMask mask = DxsoRegMask(
+        pred.type.ccount > 0,
+        pred.type.ccount > 1,
+        pred.type.ccount > 2,
+        pred.type.ccount > 3);
+
+      pred = emitRegisterSwizzle(pred, IdentitySwizzle, mask);
+    }
+
+    dst.id = m_module.opSelect(
+      getVectorTypeId(dst.type),
+      pred.id,
+      src.id, dst.id);
+
+    return dst;
+  }
+
+
+  void DxsoCompiler::emitValueStore(
+          DxsoRegisterPointer     ptr,
+          DxsoRegisterValue       value,
+          DxsoRegMask             writeMask,
+          DxsoRegisterValue       predicate) {
+    // If the source value consists of only one component,
+    // it is stored in all components of the destination.
+    if (value.type.ccount == 1)
+      value = emitRegisterExtend(value, writeMask.popCount());
+
+    if (ptr.type.ccount == writeMask.popCount()) {
+      if (predicate.id)
+        value = applyPredicate(predicate, emitValueLoad(ptr), value);
+
+      // Simple case: We write to the entire register
+      m_module.opStore(ptr.id, value.id);
+    } else {
+      // We only write to part of the destination
+      // register, so we need to load and modify it
+      DxsoRegisterValue tmp = emitValueLoad(ptr);
+      tmp = emitRegisterInsert(tmp, value, writeMask);
+
+      if (predicate.id)
+        value = applyPredicate(predicate, emitValueLoad(ptr), tmp);
+
+      m_module.opStore(ptr.id, tmp.id);
+    }
+  }
+
+
+  DxsoRegisterValue DxsoCompiler::emitClampBoundReplicant(
+            DxsoRegisterValue       srcValue,
+            float                   lb,
+            float                   ub) {
+    srcValue.id = m_module.opFClamp(getVectorTypeId(srcValue.type), srcValue.id,
+      m_module.constfReplicant(lb, srcValue.type.ccount),
+      m_module.constfReplicant(ub, srcValue.type.ccount));
+
+    return srcValue;
+  }
+
+
+  DxsoRegisterValue DxsoCompiler::emitSaturate(
+            DxsoRegisterValue       srcValue) {
+    return emitClampBoundReplicant(srcValue, 0.0f, 1.0f);
+  }
+
+
+  DxsoRegisterValue DxsoCompiler::emitDot(
+            DxsoRegisterValue       a,
+            DxsoRegisterValue       b) {
+    DxsoRegisterValue dot;
+    dot.type        = a.type;
+    dot.type.ccount = 1;
+
+    dot.id = m_module.opDot(getVectorTypeId(dot.type), a.id, b.id);
+
+    return dot;
+  }
+
+
+  DxsoRegisterValue DxsoCompiler::emitRegisterInsert(
+            DxsoRegisterValue       dstValue,
+            DxsoRegisterValue       srcValue,
+            DxsoRegMask             srcMask) {
+    DxsoRegisterValue result;
+    result.type = dstValue.type;
+
+    const uint32_t typeId = getVectorTypeId(result.type);
+
+    if (srcMask.popCount() == 0) {
+      // Nothing to do if the insertion mask is empty
+      result.id = dstValue.id;
+    } else if (dstValue.type.ccount == 1) {
+      // Both values are scalar, so the first component
+      // of the write mask decides which one to take.
+      result.id = srcMask[0] ? srcValue.id : dstValue.id;
+    } else if (srcValue.type.ccount == 1) {
+      // The source value is scalar. Since OpVectorShuffle
+      // requires both arguments to be vectors, we have to
+      // use OpCompositeInsert to modify the vector instead.
+      const uint32_t componentId = srcMask.firstSet();
+
+      result.id = m_module.opCompositeInsert(typeId,
+        srcValue.id, dstValue.id, 1, &componentId);
+    } else {
+      // Both arguments are vectors. We can determine which
+      // components to take from which vector and use the
+      // OpVectorShuffle instruction.
+      std::array<uint32_t, 4> components;
+      uint32_t srcComponentId = dstValue.type.ccount;
+
+      for (uint32_t i = 0; i < dstValue.type.ccount; i++)
+        components.at(i) = srcMask[i] ? srcComponentId++ : i;
+
+      result.id = m_module.opVectorShuffle(
+        typeId, dstValue.id, srcValue.id,
+        dstValue.type.ccount, components.data());
+    }
+
+    return result;
+  }
+
+
+  DxsoRegisterValue DxsoCompiler::emitRegisterLoadRaw(
+      const DxsoBaseRegister& reg,
+      const DxsoBaseRegister* relative) {
+    switch (reg.id.type) {
+      case DxsoRegisterType::Const:
+      case DxsoRegisterType::ConstInt:
+      case DxsoRegisterType::ConstBool:
+        return emitLoadConstant(reg, relative);
+      
+      default:
+        return emitValueLoad(emitGetOperandPtr(reg, relative));
+    }
+  }
+
+
+  DxsoRegisterValue DxsoCompiler::emitRegisterExtend(
+            DxsoRegisterValue       value,
+            uint32_t                size) {
+    if (size == 1)
+      return value;
+
+    std::array<uint32_t, 4> ids = {{
+      value.id, value.id,
+      value.id, value.id,
+    }};
+
+    DxsoRegisterValue result;
+    result.type.ctype  = value.type.ctype;
+    result.type.ccount = size;
+    result.id = m_module.opCompositeConstruct(
+      getVectorTypeId(result.type),
+      size, ids.data());
+    return result;
+  }
+
+
+  DxsoRegisterValue DxsoCompiler::emitRegisterSwizzle(
+            DxsoRegisterValue       value,
+            DxsoRegSwizzle          swizzle,
+            DxsoRegMask             writeMask) {
+    if (value.type.ccount == 1)
+      return emitRegisterExtend(value, writeMask.popCount());
+
+    std::array<uint32_t, 4> indices;
+
+    uint32_t dstIndex = 0;
+
+    for (uint32_t i = 0; i < 4; i++) {
+      if (writeMask[i])
+        indices[dstIndex++] = swizzle[i];
+    }
+
+    // If the swizzle combined with the mask can be reduced
+    // to a no-op, we don't need to insert any instructions.
+    bool isIdentitySwizzle = dstIndex == value.type.ccount;
+
+    for (uint32_t i = 0; i < dstIndex && isIdentitySwizzle; i++)
+      isIdentitySwizzle &= indices[i] == i;
+
+    if (isIdentitySwizzle)
+      return value;
+
+    // Use OpCompositeExtract if the resulting vector contains
+    // only one component, and OpVectorShuffle if it is a vector.
+    DxsoRegisterValue result;
+    result.type.ctype  = value.type.ctype;
+    result.type.ccount = dstIndex;
+
+    const uint32_t typeId = getVectorTypeId(result.type);
+
+    if (dstIndex == 1) {
+      result.id = m_module.opCompositeExtract(
+        typeId, value.id, 1, indices.data());
+    } else {
+      result.id = m_module.opVectorShuffle(
+        typeId, value.id, value.id,
+        dstIndex, indices.data());
+    }
+
+    return result;
+  }
+
+
+  DxsoRegisterValue DxsoCompiler::emitSrcOperandPreSwizzleModifiers(
+            DxsoRegisterValue       value,
+            DxsoRegModifier         modifier) {
+    // r / r.z
+    // r / r.w
+    if (modifier == DxsoRegModifier::Dz
+     || modifier == DxsoRegModifier::Dw) {
+      const uint32_t index = modifier == DxsoRegModifier::Dz ? 2 : 3;
+
+      std::array<uint32_t, 4> indices = { index, index, index, index };
+
+      uint32_t component = m_module.opVectorShuffle(
+        getVectorTypeId(value.type), value.id, value.id, value.type.ccount, indices.data());
+
+      value.id = m_module.opFDiv(
+        getVectorTypeId(value.type), value.id, component);
+    }
+
+    return value;
+  }
+
+
+  DxsoRegisterValue DxsoCompiler::emitSrcOperandPostSwizzleModifiers(
+            DxsoRegisterValue       value,
+            DxsoRegModifier         modifier) {
+    // r - 0.5
+    if (modifier == DxsoRegModifier::Bias
+     || modifier == DxsoRegModifier::BiasNeg) {
+      uint32_t halfVec = m_module.constfReplicant(
+        0.5f, value.type.ccount);
+
+      value.id = m_module.opFSub(
+        getVectorTypeId(value.type), value.id, halfVec);
+    }
+
+    // fma(r, 2.0f, -1.0f)
+    if (modifier == DxsoRegModifier::Sign
+     || modifier == DxsoRegModifier::SignNeg) {
+      uint32_t twoVec = m_module.constfReplicant(
+        2.0f, value.type.ccount);
+
+      uint32_t minusOneVec = m_module.constfReplicant(
+        -1.0f, value.type.ccount);
+
+      value.id = m_module.opFFma(
+        getVectorTypeId(value.type), value.id, twoVec, minusOneVec);
+    }
+
+    // 1 - r
+    if (modifier == DxsoRegModifier::Comp) {
+      uint32_t oneVec = m_module.constfReplicant(
+        1.0f, value.type.ccount);
+
+      value.id = m_module.opFSub(
+        getVectorTypeId(value.type), oneVec, value.id);
+    }
+
+    // r * 2
+    if (modifier == DxsoRegModifier::X2
+     || modifier == DxsoRegModifier::X2Neg) {
+      uint32_t twoVec = m_module.constfReplicant(
+        2.0f, value.type.ccount);
+
+      value.id = m_module.opFMul(
+        getVectorTypeId(value.type), value.id, twoVec);
+    }
+
+    // abs( r )
+    if (modifier == DxsoRegModifier::Abs
+     || modifier == DxsoRegModifier::AbsNeg) {
+      value.id = m_module.opFAbs(
+        getVectorTypeId(value.type), value.id);
+    }
+
+    // !r
+    if (modifier == DxsoRegModifier::Not) {
+      value.id =
+        m_module.opLogicalNot(getVectorTypeId(value.type), value.id);
+    }
+
+    // -r
+    // Treating as -r
+    // Treating as -r
+    // -r * 2
+    // -abs(r)
+    if (modifier == DxsoRegModifier::Neg
+     || modifier == DxsoRegModifier::BiasNeg
+     || modifier == DxsoRegModifier::SignNeg
+     || modifier == DxsoRegModifier::X2Neg
+     || modifier == DxsoRegModifier::AbsNeg) {
+      value.id = m_module.opFNegate(
+        getVectorTypeId(value.type), value.id);
+    }
+
+    return value;
+  }
+
+  DxsoRegisterValue DxsoCompiler::emitRegisterLoad(
+      const DxsoBaseRegister& reg,
+            DxsoRegMask       writeMask,
+      const DxsoBaseRegister* relative) {
+    // Load operand from the operand pointer
+    DxsoRegisterValue result = emitRegisterLoadRaw(reg, relative);
+
+    // PS 1.x clamps float constants
+    if (m_programInfo.type() == DxsoProgramType::PixelShader && m_programInfo.majorVersion() == 1
+      && reg.id.type == DxsoRegisterType::Const)
+      result = emitClampBoundReplicant(result, -1.0f, 1.0f);
+
+    // Apply operand modifiers
+    result = emitSrcOperandPreSwizzleModifiers(result, reg.modifier);
+
+    // Apply operand swizzle to the operand value
+    result = emitRegisterSwizzle(result, reg.swizzle, writeMask);
+
+    // Apply operand modifiers
+    result = emitSrcOperandPostSwizzleModifiers(result, reg.modifier);
+    return result;
+  }
+
+  void DxsoCompiler::emitDcl(const DxsoInstructionContext& ctx) {
+    auto id = ctx.dst.id;
+
+    if (id.type == DxsoRegisterType::Sampler) {
+      this->emitDclSampler(
+        ctx.dst.id.num,
+        ctx.dcl.textureType);
+    }
+    else if (id.type == DxsoRegisterType::Input
+          || id.type == DxsoRegisterType::Texture
+          || id.type == DxsoRegisterType::Output) {
+      DxsoSemantic semantic = ctx.dcl.semantic;
+
+      uint32_t vIndex = id.num;
+
+      if (m_programInfo.type() == DxsoProgramTypes::PixelShader) {
+        // Semantic in PS < 3 is based upon id.
+        if (m_programInfo.majorVersion() < 3) {
+          // Account for the two color registers.
+          if (id.type == DxsoRegisterType::Texture)
+            vIndex += 2;
+
+          semantic = DxsoSemantic{
+            id.type == DxsoRegisterType::Texture ? DxsoUsage::Texcoord : DxsoUsage::Color,
+            id.num };
+        }
+      }
+
+      this->emitDclInterface(
+        id.type != DxsoRegisterType::Output,
+        vIndex,
+        semantic,
+        ctx.dst.mask,
+        ctx.dst.centroid);
+    }
+    else {
+      //Logger::warn(str::format("DxsoCompiler::emitDcl: unhandled register type ", id.type));
+    }
+  }
+
+  void DxsoCompiler::emitDef(const DxsoInstructionContext& ctx) {
+    switch (ctx.instruction.opcode) {
+      case DxsoOpcode::Def:  emitDefF(ctx); break;
+      case DxsoOpcode::DefI: emitDefI(ctx); break;
+      case DxsoOpcode::DefB: emitDefB(ctx); break;
+      default:
+        throw DxvkError("DxsoCompiler::emitDef: Invalid definition opcode");
+        break;
+    }
+  }
+
+  void DxsoCompiler::emitDefF(const DxsoInstructionContext& ctx) {
+    const float* data = ctx.def.float32;
+
+    uint32_t constId = m_module.constvec4f32(data[0], data[1], data[2], data[3]);
+    m_cFloat.at(ctx.dst.id.num) = constId;
+
+    std::string name = str::format("cF", ctx.dst.id.num, "_def");
+    m_module.setDebugName(constId, name.c_str());
+
+    DxsoDefinedConstant constant;
+    constant.uboIdx = ctx.dst.id.num;
+    for (uint32_t i = 0; i < 4; i++)
+      constant.float32[i] = data[i];
+    m_constants.push_back(constant);
+  }
+
+  void DxsoCompiler::emitDefI(const DxsoInstructionContext& ctx) {
+    const int32_t* data = ctx.def.int32;
+
+    uint32_t constId = m_module.constvec4i32(data[0], data[1], data[2], data[3]);
+    m_cInt.at(ctx.dst.id.num) = constId;
+
+    std::string name = str::format("cI", ctx.dst.id.num, "_def");
+    m_module.setDebugName(constId, name.c_str());
+  }
+
+  void DxsoCompiler::emitDefB(const DxsoInstructionContext& ctx) {
+    const int32_t* data = ctx.def.int32;
+
+    uint32_t constId = m_module.constBool(data[0] != 0);
+    m_cBool.at(ctx.dst.id.num) = constId;
+
+    std::string name = str::format("cB", ctx.dst.id.num, "_def");
+    m_module.setDebugName(constId, name.c_str());
+  }
+
+
+  bool DxsoCompiler::isScalarRegister(DxsoRegisterId id) {
+    return id == DxsoRegisterId{DxsoRegisterType::DepthOut, 0}
+        || id == DxsoRegisterId{DxsoRegisterType::RasterizerOut, RasterOutPointSize}
+        || id == DxsoRegisterId{DxsoRegisterType::RasterizerOut, RasterOutFog};
+  }
+
+
+  void DxsoCompiler::emitMov(const DxsoInstructionContext& ctx) {
+    DxsoRegisterPointer dst = emitGetOperandPtr(ctx.dst);
+
+    DxsoRegMask mask = ctx.dst.mask;
+
+    if (isScalarRegister(ctx.dst.id))
+      mask = DxsoRegMask(true, false, false, false);
+
+    DxsoRegisterValue src0 = emitRegisterLoad(ctx.src[0], mask);
+
+    DxsoRegisterValue result;
+    result.type.ctype  = dst.type.ctype;
+    result.type.ccount = mask.popCount();
+
+    const uint32_t typeId = getVectorTypeId(result.type);
+
+    if (dst.type.ctype != src0.type.ctype) {
+      // We have Mova for this... but it turns out Mov has the same behaviour in d3d9!
+
+      // Convert float -> int32_t
+      // and vice versa
+      if (dst.type.ctype == DxsoScalarType::Sint32) {
+        // We need to floor for VS 1.1 and below, the documentation is a dirty stinking liar.
+        if (m_programInfo.majorVersion() < 2 && m_programInfo.minorVersion() < 2)
+          result.id = m_module.opFloor(getVectorTypeId(src0.type), src0.id);
+        else
+          result.id = m_module.opRound(getVectorTypeId(src0.type), src0.id);
+
+        result.id = m_module.opConvertFtoS(typeId, result.id);
+      }
+      else // Float32
+        result.id = m_module.opConvertStoF(typeId, src0.id);
+    }
+    else // No special stuff needed!
+      result.id = src0.id;
+
+    this->emitDstStore(dst, result, mask, ctx.dst.saturate, emitPredicateLoad(ctx), ctx.dst.shift, ctx.dst.id);
+  }
+
+
+  void DxsoCompiler::emitVectorAlu(const DxsoInstructionContext& ctx) {
+    const auto& src = ctx.src;
+
+    DxsoRegMask mask = ctx.dst.mask;
+
+    DxsoRegisterPointer dst = emitGetOperandPtr(ctx.dst);
+
+    if (isScalarRegister(ctx.dst.id))
+      mask = DxsoRegMask(true, false, false, false);
+
+    DxsoRegisterValue result;
+    result.type.ctype  = dst.type.ctype;
+    result.type.ccount = mask.popCount();
+
+    DxsoVectorType scalarType = result.type;
+    scalarType.ccount = 1;
+
+    const uint32_t typeId       = getVectorTypeId(result.type);
+    const uint32_t scalarTypeId = getVectorTypeId(scalarType);
+
+    const DxsoOpcode opcode = ctx.instruction.opcode;
+    switch (opcode) {
+      case DxsoOpcode::Add:
+        result.id = m_module.opFAdd(typeId,
+          emitRegisterLoad(src[0], mask).id,
+          emitRegisterLoad(src[1], mask).id);
+        break;
+      case DxsoOpcode::Sub:
+        result.id = m_module.opFSub(typeId,
+          emitRegisterLoad(src[0], mask).id,
+          emitRegisterLoad(src[1], mask).id);
+        break;
+      case DxsoOpcode::Mad:
+        if (!m_moduleInfo.options.longMad) {
+          result.id = m_module.opFFma(typeId,
+            emitRegisterLoad(src[0], mask).id,
+            emitRegisterLoad(src[1], mask).id,
+            emitRegisterLoad(src[2], mask).id);
+        }
+        else {
+          result.id = m_module.opFMul(typeId,
+            emitRegisterLoad(src[0], mask).id,
+            emitRegisterLoad(src[1], mask).id);
+
+          result.id = m_module.opFAdd(typeId,
+            result.id,
+            emitRegisterLoad(src[2], mask).id);
+        }
+        break;
+      case DxsoOpcode::Mul:
+        result.id = m_module.opFMul(typeId,
+          emitRegisterLoad(src[0], mask).id,
+          emitRegisterLoad(src[1], mask).id);
+        break;
+      case DxsoOpcode::Rcp:
+        result.id = m_module.opFDiv(typeId,
+          m_module.constfReplicant(1.0f, result.type.ccount),
+          emitRegisterLoad(src[0], mask).id);
+
+        if (m_moduleInfo.options.d3d9FloatEmulation) {
+          result.id = m_module.opNMin(typeId, result.id,
+            m_module.constfReplicant(FLT_MAX, result.type.ccount));
+        }
+        break;
+      case DxsoOpcode::Rsq: 
+        result.id = m_module.opFAbs(typeId,
+          emitRegisterLoad(src[0], mask).id);
+
+        result.id = m_module.opInverseSqrt(typeId,
+          result.id);
+
+        if (m_moduleInfo.options.d3d9FloatEmulation) {
+          result.id = m_module.opNMin(typeId, result.id,
+            m_module.constfReplicant(FLT_MAX, result.type.ccount));
+        }
+        break;
+      case DxsoOpcode::Dp3: {
+        DxsoRegMask srcMask(true, true, true, false);
+        result = emitDot(
+          emitRegisterLoad(src[0], srcMask),
+          emitRegisterLoad(src[1], srcMask));
+        break;
+      }
+      case DxsoOpcode::Dp4:
+        result = emitDot(
+          emitRegisterLoad(src[0], IdentityWriteMask),
+          emitRegisterLoad(src[1], IdentityWriteMask));
+        break;
+      case DxsoOpcode::Slt:
+      case DxsoOpcode::Sge: {
+        const uint32_t boolTypeId =
+          getVectorTypeId({ DxsoScalarType::Bool, result.type.ccount });
+
+        uint32_t cmpResult = opcode == DxsoOpcode::Slt
+          ? m_module.opFOrdLessThan        (boolTypeId, emitRegisterLoad(src[0], mask).id, emitRegisterLoad(src[1], mask).id)
+          : m_module.opFOrdGreaterThanEqual(boolTypeId, emitRegisterLoad(src[0], mask).id, emitRegisterLoad(src[1], mask).id);
+
+        result.id = m_module.opSelect(typeId, cmpResult,
+          m_module.constfReplicant(1.0f, result.type.ccount),
+          m_module.constfReplicant(0.0f, result.type.ccount));
+        break;
+      }
+      case DxsoOpcode::Min:
+        result.id = m_module.opFMin(typeId,
+          emitRegisterLoad(src[0], mask).id,
+          emitRegisterLoad(src[1], mask).id);
+        break;
+      case DxsoOpcode::Max:
+        result.id = m_module.opFMax(typeId,
+          emitRegisterLoad(src[0], mask).id,
+          emitRegisterLoad(src[1], mask).id);
+        break;
+      case DxsoOpcode::ExpP:
+        if (m_programInfo.majorVersion() < 2) {
+          DxsoRegMask srcMask(true, false, false, false);
+          uint32_t src0 = emitRegisterLoad(src[0], srcMask).id;
+
+          uint32_t index = 0;
+
+          std::array<uint32_t, 4> resultIndices;
+
+          if (mask[0]) resultIndices[index++] = m_module.opExp2(scalarTypeId, m_module.opFloor(scalarTypeId, src0));
+          if (mask[1]) resultIndices[index++] = m_module.opFSub(scalarTypeId, src0, m_module.opFloor(scalarTypeId, src0));
+          if (mask[2]) resultIndices[index++] = m_module.opExp2(scalarTypeId, src0);
+          if (mask[3]) resultIndices[index++] = m_module.constf32(1.0f);
+
+          if (result.type.ccount == 1)
+            result.id = resultIndices[0];
+          else
+            result.id = m_module.opCompositeConstruct(typeId, result.type.ccount, resultIndices.data());
+
+          break;
+        }
+      case DxsoOpcode::Exp:
+        result.id = m_module.opExp2(typeId,
+          emitRegisterLoad(src[0], mask).id);
+        break;
+      case DxsoOpcode::Pow: {
+        uint32_t base = emitRegisterLoad(src[0], mask).id;
+        base = m_module.opFAbs(typeId, base);
+
+        uint32_t exponent = emitRegisterLoad(src[1], mask).id;
+
+        result.id = m_module.opPow(typeId, base, exponent);
+
+        if (m_moduleInfo.options.strictPow && m_moduleInfo.options.d3d9FloatEmulation) {
+          DxsoRegisterValue cmp;
+          cmp.type  = { DxsoScalarType::Bool, result.type.ccount };
+          cmp.id    = m_module.opFOrdEqual(getVectorTypeId(cmp.type),
+            exponent, m_module.constfReplicant(0.0f, cmp.type.ccount));
+
+          result.id = m_module.opSelect(typeId, cmp.id,
+            m_module.constfReplicant(1.0f, cmp.type.ccount), result.id);
+        }
+        break;
+      }
+      case DxsoOpcode::Crs: {
+        DxsoRegMask vec3Mask(true, true, true, false);
+        
+        DxsoRegisterValue crossValue;
+        crossValue.type = { DxsoScalarType::Float32, 3 };
+        crossValue.id = m_module.opCross(getVectorTypeId(crossValue.type),
+          emitRegisterLoad(src[0], vec3Mask).id,
+          emitRegisterLoad(src[1], vec3Mask).id);
+
+        std::array<uint32_t, 3> indices = { 0, 0, 0 };
+
+        uint32_t index = 0;
+        for (uint32_t i = 0; i < indices.size(); i++) {
+          if (mask[i])
+            indices[index++] = m_module.opCompositeExtract(m_module.defFloatType(32), crossValue.id, 1, &i);
+        }
+
+        result.id = m_module.opCompositeConstruct(getVectorTypeId(result.type), result.type.ccount, indices.data());
+
+        break;
+      }
+      case DxsoOpcode::Abs:
+        result.id = m_module.opFAbs(typeId,
+          emitRegisterLoad(src[0], mask).id);
+        break;
+      case DxsoOpcode::Sgn:
+        result.id = m_module.opFSign(typeId,
+          emitRegisterLoad(src[0], mask).id);
+        break;
+      case DxsoOpcode::Nrm: {
+        // Nrm is 3D...
+        DxsoRegMask srcMask(true, true, true, false);
+        auto vec3 = emitRegisterLoad(src[0], srcMask);
+
+        DxsoRegisterValue dot = emitDot(vec3, vec3);
+        dot.id = m_module.opInverseSqrt (scalarTypeId, dot.id);
+        if (m_moduleInfo.options.d3d9FloatEmulation) {
+          dot.id = m_module.opNMin        (scalarTypeId, dot.id,
+            m_module.constf32(FLT_MAX));
+        }
+
+        // r * rsq(r . r);
+        result.id = m_module.opVectorTimesScalar(
+          typeId,
+          emitRegisterLoad(src[0], mask).id,
+          dot.id);
+        break;
+      }
+      case DxsoOpcode::SinCos: {
+        DxsoRegMask srcMask(true, false, false, false);
+        uint32_t src0 = emitRegisterLoad(src[0], srcMask).id;
+
+        std::array<uint32_t, 4> sincosVectorIndices = { 0, 0, 0, 0 };
+
+        uint32_t index = 0;
+        if (mask[0])
+          sincosVectorIndices[index++] = m_module.opCos(scalarTypeId, src0);
+
+        if (mask[1])
+          sincosVectorIndices[index++] = m_module.opSin(scalarTypeId, src0);
+
+        for (; index < result.type.ccount; index++) {
+          if (sincosVectorIndices[index] == 0)
+            sincosVectorIndices[index] = m_module.constf32(0.0f);
+        }
+            
+        if (result.type.ccount == 1)
+          result.id = sincosVectorIndices[0];
+        else
+          result.id = m_module.opCompositeConstruct(typeId, result.type.ccount, sincosVectorIndices.data());
+
+        break;
+      }
+      case DxsoOpcode::Lit: {
+        DxsoRegMask srcMask(true, true, true, true);
+        uint32_t srcOp = emitRegisterLoad(src[0], srcMask).id;
+
+        const uint32_t x = 0;
+        const uint32_t y = 1;
+        const uint32_t w = 3;
+
+        uint32_t srcX = m_module.opCompositeExtract(scalarTypeId, srcOp, 1, &x);
+        uint32_t srcY = m_module.opCompositeExtract(scalarTypeId, srcOp, 1, &y);
+        uint32_t srcW = m_module.opCompositeExtract(scalarTypeId, srcOp, 1, &w);
+
+        uint32_t power = m_module.opFClamp(
+          scalarTypeId, srcW,
+          m_module.constf32(-127.9961f), m_module.constf32(127.9961f));
+
+        std::array<uint32_t, 4> resultIndices;
+
+        uint32_t index = 0;
+
+        if (mask[0]) resultIndices[index++] = m_module.constf32(1.0f);
+        if (mask[1]) resultIndices[index++] = m_module.opFMax(scalarTypeId, srcX, m_module.constf32(0));
+        if (mask[2]) resultIndices[index++] = m_module.opPow (scalarTypeId, m_module.opFMax(scalarTypeId, srcY, m_module.constf32(0)), power);
+        if (mask[3]) resultIndices[index++] = m_module.constf32(1.0f);
+
+        const uint32_t boolType = m_module.defBoolType();
+        uint32_t zTestX = m_module.opFOrdGreaterThanEqual(boolType, srcX, m_module.constf32(0));
+        uint32_t zTestY = m_module.opFOrdGreaterThanEqual(boolType, srcY, m_module.constf32(0));
+        uint32_t zTest  = m_module.opLogicalAnd(boolType, zTestX, zTestY);
+
+        if (result.type.ccount > 2)
+          resultIndices[2] = m_module.opSelect(
+            scalarTypeId,
+            zTest,
+            resultIndices[2],
+            m_module.constf32(0.0f));
+
+        if (result.type.ccount == 1)
+          result.id = resultIndices[0];
+        else
+          result.id = m_module.opCompositeConstruct(typeId, result.type.ccount, resultIndices.data());
+        break;
+      }
+      case DxsoOpcode::Dst: {
+        //dest.x = 1;
+        //dest.y = src0.y * src1.y;
+        //dest.z = src0.z;
+        //dest.w = src1.w;
+
+        DxsoRegMask srcMask(true, true, true, true);
+
+        uint32_t src0 = emitRegisterLoad(src[0], srcMask).id;
+        uint32_t src1 = emitRegisterLoad(src[1], srcMask).id;
+
+        const uint32_t y = 1;
+        const uint32_t z = 2;
+        const uint32_t w = 3;
+
+        uint32_t src0Y = m_module.opCompositeExtract(scalarTypeId, src0, 1, &y);
+        uint32_t src1Y = m_module.opCompositeExtract(scalarTypeId, src1, 1, &y);
+
+        uint32_t src0Z = m_module.opCompositeExtract(scalarTypeId, src0, 1, &z);
+        uint32_t src1W = m_module.opCompositeExtract(scalarTypeId, src1, 1, &w);
+
+        std::array<uint32_t, 4> resultIndices;
+        resultIndices[0] = m_module.constf32(1.0f);
+        resultIndices[1] = m_module.opFMul(scalarTypeId, src0Y, src1Y);
+        resultIndices[2] = src0Z;
+        resultIndices[3] = src1W;
+
+        if (result.type.ccount == 1)
+          result.id = resultIndices[0];
+        else
+          result.id = m_module.opCompositeConstruct(typeId, result.type.ccount, resultIndices.data());
+        break;
+      }
+      case DxsoOpcode::LogP:
+      case DxsoOpcode::Log:
+        result.id = m_module.opFAbs(typeId, emitRegisterLoad(src[0], mask).id);
+        result.id = m_module.opLog2(typeId, result.id);
+        if (m_moduleInfo.options.d3d9FloatEmulation) {
+          result.id = m_module.opNMax(typeId, result.id,
+            m_module.constfReplicant(-FLT_MAX, result.type.ccount));
+        }
+        break;
+      case DxsoOpcode::Lrp:
+        result.id = m_module.opFMix(typeId,
+          emitRegisterLoad(src[2], mask).id,
+          emitRegisterLoad(src[1], mask).id,
+          emitRegisterLoad(src[0], mask).id);
+        break;
+      case DxsoOpcode::Frc:
+        result.id = m_module.opFract(typeId,
+          emitRegisterLoad(src[0], mask).id);
+        break;
+      case DxsoOpcode::Cmp: {
+        const uint32_t boolTypeId =
+          getVectorTypeId({ DxsoScalarType::Bool, result.type.ccount });
+
+        uint32_t cmp = m_module.opFOrdGreaterThanEqual(
+          boolTypeId,
+          emitRegisterLoad(src[0], mask).id,
+          m_module.constfReplicant(0.0f, result.type.ccount));
+
+        result.id = m_module.opSelect(
+          typeId, cmp,
+          emitRegisterLoad(src[1], mask).id,
+          emitRegisterLoad(src[2], mask).id);
+        break;
+      }
+      case DxsoOpcode::Cnd: {
+        const uint32_t boolTypeId =
+          getVectorTypeId({ DxsoScalarType::Bool, result.type.ccount });
+
+        uint32_t cmp = m_module.opFOrdGreaterThan(
+          boolTypeId,
+          emitRegisterLoad(src[0], mask).id,
+          m_module.constfReplicant(0.5f, result.type.ccount));
+
+        result.id = m_module.opSelect(
+          typeId, cmp,
+          emitRegisterLoad(src[1], mask).id,
+          emitRegisterLoad(src[2], mask).id);
+        break;
+      }
+      case DxsoOpcode::Dp2Add: {
+        DxsoRegMask dotSrcMask(true, true, false, false);
+        DxsoRegMask addSrcMask(true, false, false, false);
+
+        DxsoRegisterValue dot = emitDot(
+          emitRegisterLoad(src[0], dotSrcMask),
+          emitRegisterLoad(src[1], dotSrcMask));
+
+        dot.id = m_module.opFAdd(scalarTypeId,
+          dot.id, emitRegisterLoad(src[2], addSrcMask).id);
+
+        result.id   = dot.id;
+        result.type = scalarType;
+        break;
+      }
+      case DxsoOpcode::DsX:
+        result.id = m_module.opDpdx(
+          typeId, emitRegisterLoad(src[0], mask).id);
+        break;
+      case DxsoOpcode::DsY:
+        result.id = m_module.opDpdy(
+          typeId, emitRegisterLoad(src[0], mask).id);
+        break;
+      default:
+        Logger::warn(str::format("DxsoCompiler::emitVectorAlu: unimplemented op ", opcode));
+        return;
+    }
+
+    this->emitDstStore(dst, result, mask, ctx.dst.saturate, emitPredicateLoad(ctx), ctx.dst.shift, ctx.dst.id);
+  }
+
+
+  void DxsoCompiler::emitPredicateOp(const DxsoInstructionContext& ctx) {
+    const auto& src = ctx.src;
+
+    DxsoRegMask mask = ctx.dst.mask;
+
+    DxsoRegisterPointer dst = emitGetOperandPtr(ctx.dst);
+
+    DxsoRegisterValue result;
+    result.type.ctype  = dst.type.ctype;
+    result.type.ccount = mask.popCount();
+
+    result.id = emitBoolComparison(
+      result.type, ctx.instruction.specificData.comparison,
+      emitRegisterLoad(src[0], mask).id, emitRegisterLoad(src[1], mask).id);
+
+    this->emitValueStore(dst, result, mask, emitPredicateLoad(ctx));
+  }
+
+
+  void DxsoCompiler::emitMatrixAlu(const DxsoInstructionContext& ctx) {
+    const DxsoOpcode opcode = ctx.instruction.opcode;
+
+    uint32_t dotCount;
+    uint32_t componentCount;
+
+    switch (opcode) {
+      case DxsoOpcode::M3x2:
+        dotCount       = 3;
+        componentCount = 2;
+        break;
+      case DxsoOpcode::M3x3:
+        dotCount       = 3;
+        componentCount = 3;
+        break;
+      case DxsoOpcode::M3x4:
+        dotCount       = 3;
+        componentCount = 4;
+        break;
+      case DxsoOpcode::M4x3:
+        dotCount       = 4;
+        componentCount = 3;
+        break;
+      case DxsoOpcode::M4x4:
+        dotCount       = 4;
+        componentCount = 4;
+        break;
+      default:
+        Logger::warn(str::format("DxsoCompiler::emitMatrixAlu: unimplemented op ", opcode));
+        return;
+    }
+
+    DxsoRegisterPointer dst = emitGetOperandPtr(ctx.dst);
+
+    // Fix the dst mask if componentCount != maskCount
+    // ie. M4x3 on .xyzw.
+    uint32_t maskCnt = 0;
+    uint8_t mask = 0;
+    for (uint32_t i = 0; i < 4 && maskCnt < componentCount; i++) {
+      if (ctx.dst.mask[i]) {
+        mask |= 1 << i;
+        maskCnt++;
+      }
+    }
+    DxsoRegMask dstMask = DxsoRegMask(mask);
+
+    DxsoRegisterValue result;
+    result.type.ctype  = dst.type.ctype;
+    result.type.ccount = componentCount;
+
+    DxsoVectorType scalarType;
+    scalarType.ctype = result.type.ctype;
+    scalarType.ccount = 1;
+
+    const uint32_t typeId = getVectorTypeId(result.type);
+    const uint32_t scalarTypeId = getVectorTypeId(scalarType);
+
+    DxsoRegMask srcMask(true, true, true, dotCount == 4);
+    std::array<uint32_t, 4> indices;
+
+    DxsoRegister src0 = ctx.src[0];
+    DxsoRegister src1 = ctx.src[1];
+
+    for (uint32_t i = 0; i < componentCount; i++) {
+      indices[i] = m_module.opDot(scalarTypeId,
+        emitRegisterLoad(src0, srcMask).id,
+        emitRegisterLoad(src1, srcMask).id);
+
+      src1.id.num++;
+    }
+
+    result.id = m_module.opCompositeConstruct(
+      typeId, componentCount, indices.data());
+
+    this->emitDstStore(dst, result, dstMask, ctx.dst.saturate, emitPredicateLoad(ctx), ctx.dst.shift, ctx.dst.id);
+  }
+
+
+void DxsoCompiler::emitControlFlowGenericLoop(
+          bool     count,
+          uint32_t initialVar,
+          uint32_t strideVar,
+          uint32_t iterationCountVar) {
+    const uint32_t itType = m_module.defIntType(32, 1);
+
+    DxsoCfgBlock block;
+    block.type = DxsoCfgBlockType::Loop;
+    block.b_loop.labelHeader   = m_module.allocateId();
+    block.b_loop.labelBegin    = m_module.allocateId();
+    block.b_loop.labelContinue = m_module.allocateId();
+    block.b_loop.labelBreak    = m_module.allocateId();
+    block.b_loop.iteratorPtr   = m_module.newVar(
+      m_module.defPointerType(itType, spv::StorageClassPrivate), spv::StorageClassPrivate);
+    block.b_loop.strideVar     = strideVar;
+    block.b_loop.countBackup   = 0;
+
+    if (count) {
+      DxsoBaseRegister loop;
+      loop.id = { DxsoRegisterType::Loop, 0 };
+
+      DxsoRegisterPointer loopPtr = emitGetOperandPtr(loop, nullptr);
+      uint32_t loopVal = m_module.opLoad(
+        getVectorTypeId(loopPtr.type), loopPtr.id);
+
+      block.b_loop.countBackup = loopVal;
+
+      m_module.opStore(loopPtr.id, initialVar);
+    }
+
+    m_module.setDebugName(block.b_loop.iteratorPtr, "iter");
+
+    m_module.opStore(block.b_loop.iteratorPtr, iterationCountVar);
+
+    m_module.opBranch(block.b_loop.labelHeader);
+    m_module.opLabel (block.b_loop.labelHeader);
+
+    m_module.opLoopMerge(
+      block.b_loop.labelBreak,
+      block.b_loop.labelContinue,
+      spv::LoopControlMaskNone);
+
+    m_module.opBranch(block.b_loop.labelBegin);
+    m_module.opLabel (block.b_loop.labelBegin);
+
+    uint32_t iterator = m_module.opLoad(itType, block.b_loop.iteratorPtr);
+    uint32_t complete = m_module.opIEqual(m_module.defBoolType(), iterator, m_module.consti32(0));
+
+    const uint32_t breakBlock = m_module.allocateId();
+    const uint32_t mergeBlock = m_module.allocateId();
+
+    m_module.opSelectionMerge(mergeBlock,
+      spv::SelectionControlMaskNone);
+
+    m_module.opBranchConditional(
+      complete, breakBlock, mergeBlock);
+
+    m_module.opLabel(breakBlock);
+
+    m_module.opBranch(block.b_loop.labelBreak);
+
+    m_module.opLabel(mergeBlock);
+
+    iterator = m_module.opISub(itType, iterator, m_module.consti32(1));
+    m_module.opStore(block.b_loop.iteratorPtr, iterator);
+
+    m_controlFlowBlocks.push_back(block);
+  }
+
+  void DxsoCompiler::emitControlFlowGenericLoopEnd() {
+    if (m_controlFlowBlocks.size() == 0
+      || m_controlFlowBlocks.back().type != DxsoCfgBlockType::Loop)
+      throw DxvkError("DxsoCompiler: 'EndRep' without 'Rep' or 'Loop' found");
+
+    // Remove the block from the stack, it's closed
+    const DxsoCfgBlock block = m_controlFlowBlocks.back();
+    m_controlFlowBlocks.pop_back();
+
+    if (block.b_loop.strideVar) {
+      DxsoBaseRegister loop;
+      loop.id = { DxsoRegisterType::Loop, 0 };
+
+      DxsoRegisterPointer loopPtr = emitGetOperandPtr(loop, nullptr);
+      uint32_t val = m_module.opLoad(
+        getVectorTypeId(loopPtr.type), loopPtr.id);
+
+      val = m_module.opIAdd(
+        getVectorTypeId(loopPtr.type),
+        val, block.b_loop.strideVar);
+
+      m_module.opStore(loopPtr.id, val);
+    }
+
+    // Declare the continue block
+    m_module.opBranch(block.b_loop.labelContinue);
+    m_module.opLabel(block.b_loop.labelContinue);
+
+    // Declare the merge block
+    m_module.opBranch(block.b_loop.labelHeader);
+    m_module.opLabel(block.b_loop.labelBreak);
+
+    if (block.b_loop.countBackup) {
+      DxsoBaseRegister loop;
+      loop.id = { DxsoRegisterType::Loop, 0 };
+
+      DxsoRegisterPointer loopPtr = emitGetOperandPtr(loop, nullptr);
+
+      m_module.opStore(loopPtr.id, block.b_loop.countBackup);
+    }
+  }
+
+  void DxsoCompiler::emitControlFlowRep(const DxsoInstructionContext& ctx) {
+    DxsoRegMask srcMask(true, false, false, false);
+    this->emitControlFlowGenericLoop(
+      false, 0, 0,
+      emitRegisterLoad(ctx.src[0], srcMask).id);
+  }
+
+  void DxsoCompiler::emitControlFlowEndRep(const DxsoInstructionContext& ctx) {
+    emitControlFlowGenericLoopEnd();
+  }
+
+  void DxsoCompiler::emitControlFlowLoop(const DxsoInstructionContext& ctx) {
+    const uint32_t itType = m_module.defIntType(32, 1);
+
+    DxsoRegMask srcMask(true, true, true, false);
+    uint32_t integerRegister = emitRegisterLoad(ctx.src[1], srcMask).id;
+    uint32_t x = 0;
+    uint32_t y = 1;
+    uint32_t z = 2;
+
+    uint32_t iterCount    = m_module.opCompositeExtract(itType, integerRegister, 1, &x);
+    uint32_t initialValue = m_module.opCompositeExtract(itType, integerRegister, 1, &y);
+    uint32_t strideSize   = m_module.opCompositeExtract(itType, integerRegister, 1, &z);
+
+    this->emitControlFlowGenericLoop(
+      true,
+      initialValue,
+      strideSize,
+      iterCount);
+  }
+
+  void DxsoCompiler::emitControlFlowEndLoop(const DxsoInstructionContext& ctx) {
+    this->emitControlFlowGenericLoopEnd();
+  }
+
+  void DxsoCompiler::emitControlFlowBreak(const DxsoInstructionContext& ctx) {
+    DxsoCfgBlock* cfgBlock =
+      cfgFindBlock({ DxsoCfgBlockType::Loop });
+
+    if (cfgBlock == nullptr)
+      throw DxvkError("DxbcCompiler: 'Break' outside 'Rep' or 'Loop' found");
+
+    m_module.opBranch(cfgBlock->b_loop.labelBreak);
+
+    // Subsequent instructions assume that there is an open block
+    const uint32_t labelId = m_module.allocateId();
+    m_module.opLabel(labelId);
+  }
+
+  void DxsoCompiler::emitControlFlowBreakC(const DxsoInstructionContext& ctx) {
+    DxsoCfgBlock* cfgBlock =
+      cfgFindBlock({ DxsoCfgBlockType::Loop });
+
+    if (cfgBlock == nullptr)
+      throw DxvkError("DxbcCompiler: 'BreakC' outside 'Rep' or 'Loop' found");
+
+    DxsoRegMask srcMask(true, false, false, false);
+    auto a = emitRegisterLoad(ctx.src[0], srcMask);
+    auto b = emitRegisterLoad(ctx.src[1], srcMask);
+
+    uint32_t result = this->emitBoolComparison(
+      { DxsoScalarType::Bool, a.type.ccount },
+      ctx.instruction.specificData.comparison,
+      a.id, b.id);
+
+    // We basically have to wrap this into an 'if' block
+    const uint32_t breakBlock = m_module.allocateId();
+    const uint32_t mergeBlock = m_module.allocateId();
+
+    m_module.opSelectionMerge(mergeBlock,
+      spv::SelectionControlMaskNone);
+
+    m_module.opBranchConditional(
+      result, breakBlock, mergeBlock);
+
+    m_module.opLabel(breakBlock);
+
+    m_module.opBranch(cfgBlock->b_loop.labelBreak);
+
+    m_module.opLabel(mergeBlock);
+  }
+
+  void DxsoCompiler::emitControlFlowIf(const DxsoInstructionContext& ctx) {
+    const auto opcode = ctx.instruction.opcode;
+
+    uint32_t result;
+
+    DxsoRegMask srcMask(true, false, false, false);
+    if (opcode == DxsoOpcode::Ifc) {
+      auto a = emitRegisterLoad(ctx.src[0], srcMask);
+      auto b = emitRegisterLoad(ctx.src[1], srcMask);
+
+      result = this->emitBoolComparison(
+        { DxsoScalarType::Bool, a.type.ccount },
+        ctx.instruction.specificData.comparison,
+        a.id, b.id);
+    } else
+      result = emitRegisterLoad(ctx.src[0], srcMask).id;
+
+    // Declare the 'if' block. We do not know if there
+    // will be an 'else' block or not, so we'll assume
+    // that there is one and leave it empty otherwise.
+    DxsoCfgBlock block;
+    block.type = DxsoCfgBlockType::If;
+    block.b_if.ztestId   = result;
+    block.b_if.labelIf   = m_module.allocateId();
+    block.b_if.labelElse = 0;
+    block.b_if.labelEnd  = m_module.allocateId();
+    block.b_if.headerPtr = m_module.getInsertionPtr();
+    m_controlFlowBlocks.push_back(block);
+
+    // We'll insert the branch instruction when closing
+    // the block, since we don't know whether or not an
+    // else block is needed right now.
+    m_module.opLabel(block.b_if.labelIf);
+  }
+
+  void DxsoCompiler::emitControlFlowElse(const DxsoInstructionContext& ctx) {
+    if (m_controlFlowBlocks.size() == 0
+     || m_controlFlowBlocks.back().type != DxsoCfgBlockType::If
+     || m_controlFlowBlocks.back().b_if.labelElse != 0)
+      throw DxvkError("DxsoCompiler: 'Else' without 'If' found");
+    
+    // Set the 'Else' flag so that we do
+    // not insert a dummy block on 'EndIf'
+    DxsoCfgBlock& block = m_controlFlowBlocks.back();
+    block.b_if.labelElse = m_module.allocateId();
+    
+    // Close the 'If' block by branching to
+    // the merge block we declared earlier
+    m_module.opBranch(block.b_if.labelEnd);
+    m_module.opLabel (block.b_if.labelElse);
+  }
+
+  void DxsoCompiler::emitControlFlowEndIf(const DxsoInstructionContext& ctx) {
+    if (m_controlFlowBlocks.size() == 0
+     || m_controlFlowBlocks.back().type != DxsoCfgBlockType::If)
+      throw DxvkError("DxsoCompiler: 'EndIf' without 'If' found");
+    
+    // Remove the block from the stack, it's closed
+    DxsoCfgBlock block = m_controlFlowBlocks.back();
+    m_controlFlowBlocks.pop_back();
+    
+    // Write out the 'if' header
+    m_module.beginInsertion(block.b_if.headerPtr);
+    
+    m_module.opSelectionMerge(
+      block.b_if.labelEnd,
+      spv::SelectionControlMaskNone);
+    
+    m_module.opBranchConditional(
+      block.b_if.ztestId,
+      block.b_if.labelIf,
+      block.b_if.labelElse != 0
+        ? block.b_if.labelElse
+        : block.b_if.labelEnd);
+    
+    m_module.endInsertion();
+    
+    // End the active 'if' or 'else' block
+    m_module.opBranch(block.b_if.labelEnd);
+    m_module.opLabel (block.b_if.labelEnd);
+  }
+
+
+  void DxsoCompiler::emitTexCoord(const DxsoInstructionContext& ctx) {
+    DxsoRegisterValue result;
+
+    if (m_programInfo.majorVersion() == 1 && m_programInfo.minorVersion() == 4) {
+      // TexCrd Op (PS 1.4)
+      result = emitRegisterLoad(ctx.src[0], ctx.dst.mask);
+    } else {
+      // TexCoord Op (PS 1.0 - PS 1.3)
+      DxsoRegister texcoord;
+      texcoord.id.type = DxsoRegisterType::PixelTexcoord;
+      texcoord.id.num  = ctx.dst.id.num;
+
+      result = emitRegisterLoadRaw(texcoord, nullptr);
+      // Saturate
+      result = emitSaturate(result);
+      // w = 1.0f
+      uint32_t wIndex = 3;
+      result.id = m_module.opCompositeInsert(getVectorTypeId(result.type),
+        m_module.constf32(1.0f),
+        result.id,
+        1, &wIndex);
+    }
+
+    DxsoRegisterPointer dst = emitGetOperandPtr(ctx.dst);
+
+    this->emitDstStore(dst, result, ctx.dst.mask, ctx.dst.saturate, emitPredicateLoad(ctx), ctx.dst.shift, ctx.dst.id);
+  }
+
+  void DxsoCompiler::emitTextureSample(const DxsoInstructionContext& ctx) {
+    DxsoRegisterPointer dst = emitGetOperandPtr(ctx.dst);
+
+    const DxsoOpcode opcode = ctx.instruction.opcode;
+
+    DxsoRegisterValue texcoordVar;
+    uint32_t samplerIdx;
+
+    DxsoRegMask vec3Mask(true, true, true,  false);
+    DxsoRegMask srcMask (true, true, true,  true);
+
+    auto GetProjectionValue = [&]() {
+      uint32_t w = 3;
+      return m_module.opCompositeExtract(
+        m_module.defFloatType(32), texcoordVar.id, 1, &w);
+    };
+
+    if (opcode == DxsoOpcode::TexM3x2Tex || opcode == DxsoOpcode::TexM3x3Tex || opcode == DxsoOpcode::TexM3x3Spec || opcode == DxsoOpcode::TexM3x3VSpec) {
+      const uint32_t count = opcode == DxsoOpcode::TexM3x2Tex ? 2 : 3;
+
+      auto n = emitRegisterLoad(ctx.src[0], vec3Mask);
+
+      std::array<uint32_t, 4> indices = { 0, 0, m_module.constf32(0.0f), m_module.constf32(0.0f) };
+      for (uint32_t i = 0; i < count; i++) {
+        auto reg = ctx.dst;
+        reg.id.num -= (count - 1) - i;
+        auto m = emitRegisterLoadTexcoord(reg, vec3Mask);
+
+        indices[i] = m_module.opDot(getScalarTypeId(DxsoScalarType::Float32), m.id, n.id);
+      }
+
+      if (opcode == DxsoOpcode::TexM3x3Spec || opcode == DxsoOpcode::TexM3x3VSpec) {
+        uint32_t vec3Type = getVectorTypeId({ DxsoScalarType::Float32, 3 });
+        uint32_t normal = m_module.opCompositeConstruct(vec3Type, 3, indices.data());
+
+        uint32_t eyeRay;
+        // VSpec -> Create eye ray from .w of last 3 tex coords (m, m-1, m-2)
+        // Spec -> Get eye ray from src[1]
+        if (opcode == DxsoOpcode::TexM3x3VSpec) {
+          DxsoRegMask wMask(false, false, false, true);
+
+          std::array<uint32_t, 3> eyeRayIndices;
+          for (uint32_t i = 0; i < 3; i++) {
+            auto reg = ctx.dst;
+            reg.id.num -= (count - 1) - i;
+            eyeRayIndices[i] = emitRegisterLoadTexcoord(reg, wMask).id;
+          }
+
+          eyeRay = m_module.opCompositeConstruct(vec3Type, eyeRayIndices.size(), eyeRayIndices.data());
+        }
+        else
+          eyeRay = emitRegisterLoad(ctx.src[1], vec3Mask).id;
+
+        eyeRay = m_module.opNormalize(vec3Type, eyeRay);
+        normal = m_module.opNormalize(vec3Type, normal);
+        uint32_t reflection = m_module.opReflect(vec3Type, eyeRay, normal);
+        reflection = m_module.opFNegate(vec3Type, reflection);
+
+        for (uint32_t i = 0; i < 3; i++)
+          indices[i] = m_module.opCompositeExtract(m_module.defFloatType(32), reflection, 1, &i);
+      }
+
+      texcoordVar.type = { DxsoScalarType::Float32, 4 };
+      texcoordVar.id   = m_module.opCompositeConstruct(getVectorTypeId(texcoordVar.type), indices.size(), indices.data());
+      
+      samplerIdx = ctx.dst.id.num;
+    }
+    else if (opcode == DxsoOpcode::TexBem || opcode == DxsoOpcode::TexBemL) {
+      auto m = emitRegisterLoadTexcoord(ctx.dst, srcMask);
+      auto n = emitRegisterLoad(ctx.src[0], srcMask);
+
+      texcoordVar = m;
+      samplerIdx = ctx.dst.id.num;
+
+      uint32_t texcoord_t = getVectorTypeId(texcoordVar.type);
+
+      // The projection (/.w) happens before this...
+      // Of course it does...
+      uint32_t bool_t = m_module.defBoolType();
+
+      uint32_t shouldProj = m_module.opBitFieldUExtract(
+        m_module.defIntType(32, 0), m_ps.projectionSpec,
+        m_module.consti32(samplerIdx), m_module.consti32(1));
+
+      shouldProj = m_module.opIEqual(bool_t, shouldProj, m_module.constu32(1));
+
+      uint32_t bvec4_t = m_module.defVectorType(bool_t, 4);
+      std::array<uint32_t, 4> indices = { shouldProj, shouldProj, shouldProj, shouldProj };
+      shouldProj = m_module.opCompositeConstruct(bvec4_t, indices.size(), indices.data());
+
+      uint32_t projScalar = m_module.opFDiv(m_module.defFloatType(32), m_module.constf32(1.0), GetProjectionValue());
+      uint32_t projResult = m_module.opVectorTimesScalar(texcoord_t, texcoordVar.id, projScalar);
+
+      texcoordVar.id = m_module.opSelect(texcoord_t, shouldProj, projResult, texcoordVar.id);
+
+      // u' = tc(m).x + [bm00(m) * t(n).x + bm10(m) * t(n).y]
+      // v' = tc(m).y + [bm01(m) * t(n).x + bm11(m) * t(n).y]
+
+      // But we flipped the bm indices so we can use dot here...
+
+      // u' = tc(m).x + dot(bm0, tn)
+      // v' = tc(m).y + dot(bm1, tn)
+
+      for (uint32_t i = 0; i < 2; i++) {
+        uint32_t fl_t   = getScalarTypeId(DxsoScalarType::Float32);
+        uint32_t vec2_t = getVectorTypeId({ DxsoScalarType::Float32, 2 });
+        std::array<uint32_t, 4> indices = { 0, 1, 2, 3 };
+
+        uint32_t tc_m_n = m_module.opCompositeExtract(fl_t, texcoordVar.id, 1, &i);
+
+        uint32_t offset = m_module.constu32(D3D9SharedPSStages_Count * ctx.dst.id.num + D3D9SharedPSStages_BumpEnvMat0 + i);
+        uint32_t bm     = m_module.opAccessChain(m_module.defPointerType(vec2_t, spv::StorageClassUniform),
+                                                 m_ps.sharedState, 1, &offset);
+                 bm     = m_module.opLoad(vec2_t, bm);
+
+        uint32_t t      = m_module.opVectorShuffle(vec2_t, n.id, n.id, 2, indices.data());
+
+        uint32_t dot    = m_module.opDot(fl_t, bm, t);
+
+        uint32_t result = m_module.opFAdd(fl_t, tc_m_n, dot);
+        texcoordVar.id  = m_module.opCompositeInsert(getVectorTypeId(texcoordVar.type), result, texcoordVar.id, 1, &i);
+      }
+    }
+    else if (opcode == DxsoOpcode::TexReg2Ar) {
+      texcoordVar = emitRegisterLoad(ctx.src[0], srcMask);
+      texcoordVar = emitRegisterSwizzle(texcoordVar, DxsoRegSwizzle(3, 0, 0, 0), srcMask);
+
+      samplerIdx = ctx.dst.id.num;
+    }
+    else if (opcode == DxsoOpcode::TexReg2Gb) {
+      texcoordVar = emitRegisterLoad(ctx.src[0], srcMask);
+      texcoordVar = emitRegisterSwizzle(texcoordVar, DxsoRegSwizzle(1, 2, 2, 2), srcMask);
+
+      samplerIdx = ctx.dst.id.num;
+    }
+    else if (opcode == DxsoOpcode::TexReg2Rgb) {
+      texcoordVar = emitRegisterLoad(ctx.src[0], srcMask);
+      texcoordVar = emitRegisterSwizzle(texcoordVar, DxsoRegSwizzle(0, 1, 2, 2), srcMask);
+
+      samplerIdx = ctx.dst.id.num;
+    }
+    else if (opcode == DxsoOpcode::TexDp3Tex) {
+      auto m = emitRegisterLoadTexcoord(ctx.dst,    vec3Mask);
+      auto n = emitRegisterLoad(ctx.src[0], vec3Mask);
+
+      auto dot = emitDot(m, n);
+
+      std::array<uint32_t, 4> indices = { dot.id, m_module.constf32(0.0f), m_module.constf32(0.0f), m_module.constf32(0.0f) };
+
+      texcoordVar.type = { DxsoScalarType::Float32, 4 };
+      texcoordVar.id   = m_module.opCompositeConstruct(getVectorTypeId(texcoordVar.type),
+        indices.size(), indices.data());
+
+      samplerIdx  = ctx.dst.id.num;
+    }
+    else {
+      if (m_programInfo.majorVersion() >= 2) { // SM 2.0+
+        texcoordVar = emitRegisterLoad(ctx.src[0], srcMask);
+        samplerIdx  = ctx.src[1].id.num;
+      } else if (
+        m_programInfo.majorVersion() == 1
+     && m_programInfo.minorVersion() == 4) { // SM 1.4
+        texcoordVar = emitRegisterLoad(ctx.src[0], srcMask);
+        samplerIdx  = ctx.dst.id.num;
+      }
+      else { // SM 1.0-1.3
+        texcoordVar = emitRegisterLoadTexcoord(ctx.dst, srcMask);
+        samplerIdx  = ctx.dst.id.num;
+      }
+    }
+
+    // SM < 1.x does not have dcl sampler type.
+    if (m_programInfo.majorVersion() < 2 && m_samplers[samplerIdx].color[SamplerTypeTexture2D].varId == 0)
+      emitDclSampler(samplerIdx, DxsoTextureType::Texture2D);
+
+    DxsoSampler sampler = m_samplers.at(samplerIdx);
+
+    auto SampleImage = [this, opcode, dst, ctx, samplerIdx, GetProjectionValue](DxsoRegisterValue texcoordVar, DxsoSamplerInfo& sampler, bool depth, DxsoSamplerType samplerType, uint32_t specConst) {
+      DxsoRegisterValue result;
+      result.type.ctype  = dst.type.ctype;
+      result.type.ccount = depth ? 1 : 4;
+
+      const uint32_t typeId = getVectorTypeId(result.type);
+
+      SpirvImageOperands imageOperands;
+      if (m_programInfo.type() == DxsoProgramTypes::VertexShader) {
+        imageOperands.sLod = m_module.constf32(0.0f);
+        imageOperands.flags |= spv::ImageOperandsLodMask;
+      }
+
+      if (opcode == DxsoOpcode::TexLdl) {
+        uint32_t w = 3;
+        imageOperands.sLod = m_module.opCompositeExtract(
+          m_module.defFloatType(32), texcoordVar.id, 1, &w);
+        imageOperands.flags |= spv::ImageOperandsLodMask;
+      }
+
+      if (opcode == DxsoOpcode::TexLdd) {
+        DxsoRegMask gradMask(true, true, sampler.dimensions == 3, false);
+        imageOperands.flags |= spv::ImageOperandsGradMask;
+        imageOperands.sGradX = emitRegisterLoad(ctx.src[2], gradMask).id;
+        imageOperands.sGradY = emitRegisterLoad(ctx.src[3], gradMask).id;
+      }
+
+      uint32_t projDivider = 0;
+
+      if (opcode == DxsoOpcode::Tex
+        && m_programInfo.majorVersion() >= 2) {
+        if (ctx.instruction.specificData.texld == DxsoTexLdMode::Project) {
+          projDivider = GetProjectionValue();
+        }
+        else if (ctx.instruction.specificData.texld == DxsoTexLdMode::Bias) {
+          uint32_t w = 3;
+          imageOperands.sLodBias = m_module.opCompositeExtract(
+            m_module.defFloatType(32), texcoordVar.id, 1, &w);
+          imageOperands.flags |= spv::ImageOperandsBiasMask;
+        }
+      }
+
+      bool switchProjResult = m_programInfo.majorVersion() < 2 && samplerType != SamplerTypeTextureCube;
+
+      if (switchProjResult)
+        projDivider = GetProjectionValue();
+
+      // We already handled this...
+      if (opcode == DxsoOpcode::TexBem) {
+        switchProjResult = false;
+        projDivider = 0;
+      }
+
+      uint32_t reference = 0;
+
+      if (depth) {
+        uint32_t component = sampler.dimensions;
+        reference = m_module.opCompositeExtract(
+          m_module.defFloatType(32), texcoordVar.id, 1, &component);
+      }
+
+      if (projDivider != 0) {
+        for (uint32_t i = sampler.dimensions; i < 4; i++) {
+          texcoordVar.id = m_module.opCompositeInsert(getVectorTypeId(texcoordVar.type),
+            projDivider, texcoordVar.id, 1, &i);
+        }
+      }
+
+      uint32_t fetch4 = 0;
+      if (m_programInfo.type() == DxsoProgramType::PixelShader && samplerType != SamplerTypeTexture3D) {
+        fetch4 = m_module.opBitFieldUExtract(
+          m_module.defIntType(32, 0), m_ps.fetch4Spec,
+          m_module.consti32(samplerIdx), m_module.consti32(1));
+
+        uint32_t bool_t = m_module.defBoolType();
+        fetch4 = m_module.opIEqual(bool_t, fetch4, m_module.constu32(1));
+
+        uint32_t bvec4_t = m_module.defVectorType(bool_t, 4);
+        std::array<uint32_t, 4> indices = { fetch4, fetch4, fetch4, fetch4 };
+        fetch4 = m_module.opCompositeConstruct(bvec4_t, indices.size(), indices.data());
+      }
+
+      result.id = this->emitSample(
+        projDivider != 0,
+        typeId,
+        sampler,
+        texcoordVar,
+        reference,
+        fetch4,
+        imageOperands);
+
+      if (switchProjResult) {
+        uint32_t bool_t = m_module.defBoolType();
+
+        uint32_t nonProjResult = this->emitSample(
+          0,
+          typeId,
+          sampler,
+          texcoordVar,
+          reference,
+          fetch4,
+          imageOperands);
+
+        uint32_t shouldProj = m_module.opBitFieldUExtract(
+          m_module.defIntType(32, 0), m_ps.projectionSpec,
+          m_module.consti32(samplerIdx), m_module.consti32(1));
+
+        shouldProj = m_module.opIEqual(m_module.defBoolType(), shouldProj, m_module.constu32(1));
+
+        // Depth  -> .x
+        // Colour -> .xyzw
+        // Need to replicate the bool for the opSelect.
+        if (!depth) {
+          uint32_t bvec4_t = m_module.defVectorType(bool_t, 4);
+          std::array<uint32_t, 4> indices = { shouldProj, shouldProj, shouldProj, shouldProj };
+          shouldProj = m_module.opCompositeConstruct(bvec4_t, indices.size(), indices.data());
+        }
+
+        result.id = m_module.opSelect(typeId, shouldProj, result.id, nonProjResult);
+      }
+
+      // If we are sampling depth we've already specc'ed this!
+      // This path is always size 4 because it only hits on color.
+      if (specConst != 0) {
+        uint32_t bool_t = m_module.defBoolType();
+        uint32_t bvec4_t = m_module.defVectorType(bool_t, 4);
+        std::array<uint32_t, 4> indices = { specConst, specConst, specConst, specConst };
+        specConst = m_module.opCompositeConstruct(bvec4_t, indices.size(), indices.data());
+        result.id = m_module.opSelect(typeId, specConst, result.id, m_module.constvec4f32(0.0f, 0.0f, 0.0f, 1.0f));
+      }
+
+      // Apply operand swizzle to the operand value
+      result = emitRegisterSwizzle(result, IdentitySwizzle, ctx.dst.mask);
+
+      if (opcode == DxsoOpcode::TexBemL) {
+        uint32_t float_t = m_module.defFloatType(32);
+
+        uint32_t index = m_module.constu32(D3D9SharedPSStages_Count * ctx.dst.id.num + D3D9SharedPSStages_BumpEnvLScale);
+        uint32_t lScale = m_module.opAccessChain(m_module.defPointerType(float_t, spv::StorageClassUniform),
+                                                 m_ps.sharedState, 1, &index);
+                 lScale = m_module.opLoad(float_t, lScale);
+
+                 index = m_module.constu32(D3D9SharedPSStages_Count * ctx.dst.id.num + D3D9SharedPSStages_BumpEnvLOffset);
+        uint32_t lOffset = m_module.opAccessChain(m_module.defPointerType(float_t, spv::StorageClassUniform),
+                                                  m_ps.sharedState, 1, &index);
+                 lOffset = m_module.opLoad(float_t, lOffset);
+            
+        uint32_t zIndex = 2;
+        uint32_t scale = m_module.opCompositeExtract(float_t, result.id, 1, &zIndex);
+                 scale = m_module.opFMul(float_t, scale, lScale);
+                 scale = m_module.opFAdd(float_t, scale, lOffset);
+                 scale = m_module.opFClamp(float_t, scale, m_module.constf32(0.0f), m_module.constf32(1.0));
+
+        result.id = m_module.opVectorTimesScalar(getVectorTypeId(result.type), result.id, scale);
+      }
+
+      this->emitDstStore(dst, result, ctx.dst.mask, ctx.dst.saturate, emitPredicateLoad(ctx), ctx.dst.shift, ctx.dst.id);
+    };
+
+    auto SampleType = [&](DxsoSamplerType samplerType) {
+      // Only do the check for depth comp. samplers
+      // if we aren't a 3D texture
+      if (samplerType != SamplerTypeTexture3D) {
+        uint32_t colorLabel  = m_module.allocateId();
+        uint32_t depthLabel  = m_module.allocateId();
+        uint32_t endLabel    = m_module.allocateId();
+
+        uint32_t typeId = m_module.defIntType(32, 0);
+        uint32_t offset  = m_module.consti32(m_programInfo.type() == DxsoProgramTypes::VertexShader ? samplerIdx + 17 : samplerIdx);
+        uint32_t bitCnt  = m_module.consti32(1);
+        uint32_t isDepth = m_module.opBitFieldUExtract(typeId, m_depthSpecConstant, offset, bitCnt);
+        isDepth = m_module.opIEqual(m_module.defBoolType(), isDepth, m_module.constu32(1));
+
+        m_module.opSelectionMerge(endLabel, spv::SelectionControlMaskNone);
+        m_module.opBranchConditional(isDepth, depthLabel, colorLabel);
+
+        m_module.opLabel(colorLabel);
+        SampleImage(texcoordVar, sampler.color[samplerType], false, samplerType, sampler.boundConst);
+        m_module.opBranch(endLabel);
+
+        m_module.opLabel(depthLabel);
+        // No spec constant as if we are unbound we always fall down the color path.
+        SampleImage(texcoordVar, sampler.depth[samplerType], true, samplerType, 0);
+        m_module.opBranch(endLabel);
+
+        m_module.opLabel(endLabel);
+      }
+      else
+        SampleImage(texcoordVar, sampler.color[samplerType], false, samplerType, sampler.boundConst);
+    };
+
+    if (m_programInfo.majorVersion() >= 2 && !m_moduleInfo.options.forceSamplerTypeSpecConstants) {
+      DxsoSamplerType samplerType =
+        SamplerTypeFromTextureType(sampler.type);
+
+      SampleType(samplerType);
+    }
+    else {
+      std::array<SpirvSwitchCaseLabel, 3> typeCaseLabels = {{
+        { uint32_t(SamplerTypeTexture2D),           m_module.allocateId() },
+        { uint32_t(SamplerTypeTexture3D),           m_module.allocateId() },
+        { uint32_t(SamplerTypeTextureCube),         m_module.allocateId() },
+      }};
+
+      uint32_t switchEndLabel = m_module.allocateId();
+
+      uint32_t typeId = m_module.defIntType(32, 0);
+
+      uint32_t offset  = m_module.consti32(samplerIdx * 2);
+      uint32_t bitCnt  = m_module.consti32(2);
+      uint32_t type    = m_module.opBitFieldUExtract(typeId, m_ps.samplerTypeSpec, offset, bitCnt);
+
+      m_module.opSelectionMerge(switchEndLabel, spv::SelectionControlMaskNone);
+      m_module.opSwitch(type,
+        typeCaseLabels[uint32_t(SamplerTypeTexture2D)].labelId,
+        typeCaseLabels.size(),
+        typeCaseLabels.data());
+
+      for (const auto& label : typeCaseLabels) {
+        m_module.opLabel(label.labelId);
+
+        SampleType(DxsoSamplerType(label.literal));
+
+        m_module.opBranch(switchEndLabel);
+      }
+
+      m_module.opLabel(switchEndLabel);
+    }
+  }
+
+  void DxsoCompiler::emitTextureKill(const DxsoInstructionContext& ctx) {
+    DxsoRegisterValue texReg;
+
+    if (m_programInfo.majorVersion() >= 2 ||
+       (m_programInfo.majorVersion() == 1
+     && m_programInfo.minorVersion() == 4)) // SM 2.0+ or 1.4
+      texReg = emitRegisterLoadRaw(ctx.dst, ctx.dst.hasRelative ? &ctx.dst.relative : nullptr);
+    else { // SM 1.0-1.3
+      DxsoRegister texcoord;
+      texcoord.id = { DxsoRegisterType::PixelTexcoord, ctx.dst.id.num };
+
+      texReg = emitRegisterLoadRaw(texcoord, nullptr);
+    }
+
+    std::array<uint32_t, 4> indices = { 0, 1, 2, 3 };
+
+    // On SM1 it only works on the first 
+    if (m_programInfo.majorVersion() < 2) {
+      texReg.type.ccount = 3;
+
+      texReg.id = m_module.opVectorShuffle(
+        getVectorTypeId(texReg.type),
+        texReg.id, texReg.id,
+        texReg.type.ccount, indices.data());
+    }
+    else {
+      // The writemask actually applies and works here...
+      // (FXC doesn't generate this but it fixes broken ENB shaders)
+      texReg = emitRegisterSwizzle(texReg, IdentitySwizzle, ctx.dst.mask);
+    }
+
+    const uint32_t boolVecTypeId =
+      getVectorTypeId({ DxsoScalarType::Bool, texReg.type.ccount });
+
+    uint32_t result = m_module.opFOrdLessThan(
+      boolVecTypeId, texReg.id,
+      m_module.constfReplicant(0.0f, texReg.type.ccount));
+
+    if (texReg.type.ccount != 1)
+      result = m_module.opAny(m_module.defBoolType(), result);
+
+    if (m_ps.killState == 0) {
+      uint32_t labelIf = m_module.allocateId();
+      uint32_t labelEnd = m_module.allocateId();
+
+      m_module.opSelectionMerge(labelEnd, spv::SelectionControlMaskNone);
+      m_module.opBranchConditional(result, labelIf, labelEnd);
+
+      m_module.opLabel(labelIf);
+
+      if (m_moduleInfo.options.useDemoteToHelperInvocation) {
+        m_module.opDemoteToHelperInvocation();
+        m_module.opBranch(labelEnd);
+      } else {
+        // OpKill terminates the block
+        m_module.opKill();
+      }
+
+      m_module.opLabel(labelEnd);
+    }
+    else {
+      uint32_t typeId = m_module.defBoolType();
+      
+      uint32_t killState = m_module.opLoad     (typeId, m_ps.killState);
+               killState = m_module.opLogicalOr(typeId, killState, result);
+      m_module.opStore(m_ps.killState, killState);
+
+      if (m_moduleInfo.options.useSubgroupOpsForEarlyDiscard) {
+        uint32_t ballot = m_module.opGroupNonUniformBallot(
+          getVectorTypeId({ DxsoScalarType::Uint32, 4 }),
+          m_module.constu32(spv::ScopeSubgroup),
+          killState);
+        
+        uint32_t laneId = m_module.opLoad(
+          getScalarTypeId(DxsoScalarType::Uint32),
+          m_ps.builtinLaneId);
+
+        uint32_t laneIdPart = m_module.opShiftRightLogical(
+          getScalarTypeId(DxsoScalarType::Uint32),
+          laneId, m_module.constu32(5));
+
+        uint32_t laneMask = m_module.opVectorExtractDynamic(
+          getScalarTypeId(DxsoScalarType::Uint32),
+          ballot, laneIdPart);
+
+        uint32_t laneIdQuad = m_module.opBitwiseAnd(
+          getScalarTypeId(DxsoScalarType::Uint32),
+          laneId, m_module.constu32(0x1c));
+
+        laneMask = m_module.opShiftRightLogical(
+          getScalarTypeId(DxsoScalarType::Uint32),
+          laneMask, laneIdQuad);
+
+        laneMask = m_module.opBitwiseAnd(
+          getScalarTypeId(DxsoScalarType::Uint32),
+          laneMask, m_module.constu32(0xf));
+        
+        uint32_t killSubgroup = m_module.opIEqual(
+          m_module.defBoolType(),
+          laneMask, m_module.constu32(0xf));
+        
+        uint32_t labelIf  = m_module.allocateId();
+        uint32_t labelEnd = m_module.allocateId();
+        
+        m_module.opSelectionMerge(labelEnd, spv::SelectionControlMaskNone);
+        m_module.opBranchConditional(killSubgroup, labelIf, labelEnd);
+        
+        // OpKill terminates the block
+        m_module.opLabel(labelIf);
+        m_module.opKill();
+        
+        m_module.opLabel(labelEnd);
+      }
+    }
+  }
+
+  void DxsoCompiler::emitTextureDepth(const DxsoInstructionContext& ctx) {
+    const uint32_t fType = m_module.defFloatType(32);
+
+    DxsoRegMask srcMask(true, true, false, false);
+    uint32_t r5 = emitRegisterLoad(ctx.src[0], srcMask).id;
+    uint32_t x = 0;
+    uint32_t y = 1;
+
+    uint32_t xValue = m_module.opCompositeExtract(fType, r5, 1, &x);
+    uint32_t yValue = m_module.opCompositeExtract(fType, r5, 1, &y);
+
+    // The docs say if yValue is 0 the result is 1.0 but native drivers return
+    // 0 for xValue <= 0. So we don't have to do anything special since -INF and
+    // NAN get clamped to 0 at the end of the shader.
+    uint32_t result = m_module.opFDiv(fType, xValue, yValue);
+
+    DxsoBaseRegister depth;
+    depth.id = { DxsoRegisterType::DepthOut, 0 };
+
+    DxsoRegisterPointer depthPtr = emitGetOperandPtr(depth, nullptr);
+
+    m_module.opStore(depthPtr.id, result);
+  }
+
+
+  uint32_t DxsoCompiler::emitSample(
+          bool                    projected,
+          uint32_t                resultType,
+          DxsoSamplerInfo&        samplerInfo,
+          DxsoRegisterValue       coordinates,
+          uint32_t                reference,
+          uint32_t                fetch4,
+    const SpirvImageOperands&     operands) {
+    const bool depthCompare = reference != 0;
+    const bool explicitLod  =
+       (operands.flags & spv::ImageOperandsLodMask)
+    || (operands.flags & spv::ImageOperandsGradMask);
+
+    const uint32_t sampledImage = m_module.opLoad(samplerInfo.typeId, samplerInfo.varId);
+
+    uint32_t val;
+
+    // No Fetch 4
+    if (projected) {
+      if (depthCompare) {
+        if (explicitLod)
+          val = m_module.opImageSampleProjDrefExplicitLod(resultType, sampledImage, coordinates.id, reference, operands);
+        else
+          val = m_module.opImageSampleProjDrefImplicitLod(resultType, sampledImage, coordinates.id, reference, operands);
+      }
+      else {
+        if (explicitLod)
+          val = m_module.opImageSampleProjExplicitLod(resultType, sampledImage, coordinates.id, operands);
+        else
+          val = m_module.opImageSampleProjImplicitLod(resultType, sampledImage, coordinates.id, operands);
+      }
+    }
+    else {
+      if (depthCompare) {
+        if (explicitLod)
+          val = m_module.opImageSampleDrefExplicitLod(resultType, sampledImage, coordinates.id, reference, operands);
+        else
+          val = m_module.opImageSampleDrefImplicitLod(resultType, sampledImage, coordinates.id, reference, operands);
+      }
+      else {
+        if (explicitLod)
+          val = m_module.opImageSampleExplicitLod(resultType, sampledImage, coordinates.id, operands);
+        else
+          val = m_module.opImageSampleImplicitLod(resultType, sampledImage, coordinates.id, operands);
+      }
+    }
+
+
+    if (fetch4 && !depthCompare) {
+      SpirvImageOperands fetch4Operands = operands;
+      fetch4Operands.flags &= ~spv::ImageOperandsLodMask;
+      fetch4Operands.flags &= ~spv::ImageOperandsGradMask;
+      fetch4Operands.flags &= ~spv::ImageOperandsBiasMask;
+
+      // Doesn't really work for cubes...
+      // D3D9 does support gather on 3D but we cannot :<
+      // Nothing probably relies on that though.
+      // If we come back to this ever, make sure to handle cube/3d differences.
+      if (samplerInfo.dimensions == 2) {
+        uint32_t image = m_module.opImage(samplerInfo.imageTypeId, sampledImage);
+
+        // Account for half texel offset...
+        // textureSize = 1.0f / float(2 * textureSize(sampler, 0))
+        DxsoRegisterValue textureSize;
+        textureSize.type = { DxsoScalarType::Sint32, samplerInfo.dimensions };
+        textureSize.id = m_module.opImageQuerySizeLod(getVectorTypeId(textureSize.type), image, m_module.consti32(0));
+        textureSize.id = m_module.opIMul(getVectorTypeId(textureSize.type), textureSize.id, m_module.constiReplicant(2, samplerInfo.dimensions));
+
+        textureSize.type = { DxsoScalarType::Float32, samplerInfo.dimensions };
+        textureSize.id = m_module.opConvertStoF(getVectorTypeId(textureSize.type), textureSize.id);
+        // HACK: Bias fetch4 half-texel offset to avoid a "grid" effect.
+        // Technically we should only do that for non-powers of two
+        // as only then does the imprecision need to be biased
+        // towards infinity -- but that's not really worth doing...
+        float numerator = 1.0f - 1.0f / 256.0f;
+        textureSize.id = m_module.opFDiv(getVectorTypeId(textureSize.type), m_module.constfReplicant(numerator, samplerInfo.dimensions), textureSize.id);
+
+        // coord => same dimensions as texture size (no cube here !)
+        const std::array<uint32_t, 4> naturalIndices = { 0, 1, 2, 3 };
+        coordinates.type.ccount = samplerInfo.dimensions;
+        coordinates.id = m_module.opVectorShuffle(getVectorTypeId(coordinates.type), coordinates.id, coordinates.id, coordinates.type.ccount, naturalIndices.data());
+        // coord += textureSize;
+        coordinates.id = m_module.opFAdd(getVectorTypeId(coordinates.type), coordinates.id, textureSize.id);
+      }
+
+      uint32_t fetch4Val = m_module.opImageGather(resultType, sampledImage, coordinates.id, m_module.consti32(0), fetch4Operands);
+      // B R G A swizzle... Funny D3D9 order.
+      const std::array<uint32_t, 4> indices = { 2, 0, 1, 3 };
+      fetch4Val = m_module.opVectorShuffle(resultType, fetch4Val, fetch4Val, indices.size(), indices.data());
+
+      val = m_module.opSelect(resultType, fetch4, fetch4Val, val);
+    }
+
+    return val;
+  }
+
+
+  void DxsoCompiler::emitInputSetup() {
+    uint32_t pointCoord = 0;
+    D3D9PointSizeInfoPS pointInfo;
+
+    if (m_programInfo.type() == DxsoProgramType::PixelShader) {
+      pointCoord = GetPointCoord(m_module, m_entryPointInterfaces);
+      pointInfo  = GetPointSizeInfoPS(m_module, m_rsBlock);
+    }
+
+    for (uint32_t i = 0; i < m_isgn.elemCount; i++) {
+      const auto& elem = m_isgn.elems[i];
+      const uint32_t slot = elem.slot;
+      
+      DxsoRegisterInfo info;
+      info.type.ctype   = DxsoScalarType::Float32;
+      info.type.ccount  = 4;
+      info.type.alength = 1;
+      info.sclass       = spv::StorageClassInput;
+
+      DxsoRegisterPointer inputPtr;
+      inputPtr.id          = emitNewVariable(info);
+      inputPtr.type.ctype  = DxsoScalarType::Float32;
+      inputPtr.type.ccount = info.type.ccount;
+
+      m_module.decorateLocation(inputPtr.id, slot);
+
+      std::string name =
+        str::format("in_", elem.semantic.usage, elem.semantic.usageIndex);
+      m_module.setDebugName(inputPtr.id, name.c_str());
+
+      if (elem.centroid)
+        m_module.decorate(inputPtr.id, spv::DecorationCentroid);
+
+      m_entryPointInterfaces.push_back(inputPtr.id);
+
+      uint32_t typeId    = this->getVectorTypeId({ DxsoScalarType::Float32, 4 });
+      uint32_t ptrTypeId = m_module.defPointerType(typeId, spv::StorageClassPrivate);
+
+      uint32_t regNumVar = m_module.constu32(elem.regNumber);
+
+      DxsoRegisterPointer indexPtr;
+      indexPtr.id   = m_module.opAccessChain(ptrTypeId, m_vArray, 1, &regNumVar);
+      indexPtr.type = inputPtr.type;
+      indexPtr.type.ccount = 4;
+
+      DxsoRegisterValue indexVal = this->emitValueLoad(inputPtr);
+
+      DxsoRegisterValue workingReg;
+      workingReg.type = indexVal.type;
+
+      workingReg.id = m_module.constvec4f32(0.0f, 0.0f, 0.0f, 0.0f);
+
+      DxsoRegMask mask = elem.mask;
+      if (mask.popCount() == 0)
+        mask = DxsoRegMask(true, true, true, true);
+
+      std::array<uint32_t, 4> indices = { 0, 1, 2, 3 };
+      uint32_t count = 0;
+      for (uint32_t i = 0; i < 4; i++) {
+        if (mask[i]) {
+          indices[i] = i + 4;
+          count++;
+        }
+      }
+
+      workingReg.id = m_module.opVectorShuffle(getVectorTypeId(workingReg.type),
+        workingReg.id, indexVal.id, 4, indices.data());
+
+      // We need to replace TEXCOORD inputs with gl_PointCoord
+      // if D3DRS_POINTSPRITEENABLE is set.
+      if (m_programInfo.type() == DxsoProgramType::PixelShader && elem.semantic.usage == DxsoUsage::Texcoord)
+        workingReg.id = m_module.opSelect(getVectorTypeId(workingReg.type), pointInfo.isSprite, pointCoord, workingReg.id);
+
+      if (m_programInfo.type() == DxsoProgramType::PixelShader && elem.semantic.usage == DxsoUsage::Color) {
+        if (elem.semantic.usageIndex == 0)
+          m_ps.diffuseColorIn = inputPtr.id;
+        else if (elem.semantic.usageIndex == 1)
+          m_ps.specularColorIn = inputPtr.id;
+      }
+
+      m_module.opStore(indexPtr.id, workingReg.id);
+    }
+  }
+
+
+  void DxsoCompiler::emitLinkerOutputSetup() {
+    bool outputtedColor0 = false;
+    bool outputtedColor1 = false;
+
+    for (uint32_t i = 0; i < m_osgn.elemCount; i++) {
+      const auto& elem = m_osgn.elems[i];
+      const uint32_t slot = elem.slot;
+
+      if (elem.semantic.usage == DxsoUsage::Color) {
+        if (elem.semantic.usageIndex == 0)
+          outputtedColor0 = true;
+        else
+          outputtedColor1 = true;
+      }
+      
+      DxsoRegisterInfo info;
+      info.type.ctype   = DxsoScalarType::Float32;
+      info.type.ccount  = 4;
+      info.type.alength = 1;
+      info.sclass       = spv::StorageClassOutput;
+
+      spv::BuiltIn builtIn =
+        semanticToBuiltIn(false, elem.semantic);
+
+      DxsoRegisterPointer outputPtr;
+      outputPtr.type.ctype  = DxsoScalarType::Float32;
+      outputPtr.type.ccount = 4;
+
+      DxsoRegMask mask = elem.mask;
+
+      bool scalar = false;
+
+      if (builtIn == spv::BuiltInMax) {
+        outputPtr.id = emitNewVariableDefault(info,
+          m_module.constvec4f32(0.0f, 0.0f, 0.0f, 0.0f));
+        m_module.decorateLocation(outputPtr.id, slot);
+
+        std::string name =
+          str::format("out_", elem.semantic.usage, elem.semantic.usageIndex);
+        m_module.setDebugName(outputPtr.id, name.c_str());
+      }
+      else {
+        const char* name = "unknown_builtin";
+        if (builtIn == spv::BuiltInPosition)
+          name = "oPos";
+        else if (builtIn == spv::BuiltInPointSize) {
+          outputPtr.type.ccount = 1;
+          info.type.ccount = 1;
+          name = "oPSize";
+          bool maskValues[4];
+          for (uint32_t i = 0; i < 4; i++)
+            maskValues[i] = i == elem.mask.firstSet();
+          mask = DxsoRegMask(maskValues[0], maskValues[1], maskValues[2], maskValues[3]);
+        }
+
+        outputPtr.id = emitNewVariableDefault(info,
+          m_module.constfReplicant(0.0f, info.type.ccount));
+
+        m_module.setDebugName(outputPtr.id, name);
+        m_module.decorateBuiltIn(outputPtr.id, builtIn);
+
+        if (builtIn == spv::BuiltInPosition)
+          m_vs.oPos = outputPtr;
+        else if (builtIn == spv::BuiltInPointSize) {
+          scalar = true;
+          m_vs.oPSize = outputPtr;
+        }
+      }
+
+      m_entryPointInterfaces.push_back(outputPtr.id);
+
+      uint32_t typeId    = this->getVectorTypeId({ DxsoScalarType::Float32, 4 });
+      uint32_t ptrTypeId = m_module.defPointerType(typeId, spv::StorageClassPrivate);
+
+      uint32_t regNumVar = m_module.constu32(elem.regNumber);
+
+      DxsoRegisterPointer indexPtr;
+      indexPtr.id   = m_module.opAccessChain(ptrTypeId, m_oArray, 1, &regNumVar);
+      indexPtr.type = outputPtr.type;
+      indexPtr.type.ccount = 4;
+
+      DxsoRegisterValue indexVal = this->emitValueLoad(indexPtr);
+
+      DxsoRegisterValue workingReg;
+      workingReg.type.ctype  = indexVal.type.ctype;
+      workingReg.type.ccount = scalar ? 1 : 4;
+
+      workingReg.id = scalar
+        ? m_module.constf32(0.0f)
+        : m_module.constvec4f32(0.0f, 0.0f, 0.0f, 0.0f);
+
+      std::array<uint32_t, 4> indices = { 0, 1, 2, 3 };
+
+      if (scalar) {
+        workingReg.id = m_module.opCompositeExtract(getVectorTypeId(workingReg.type),
+          indexVal.id, 1, indices.data());
+      } else {
+        if (mask.popCount() == 0)
+          mask = DxsoRegMask(true, true, true, true);
+
+        uint32_t count = 0;
+        for (uint32_t i = 0; i < 4; i++) {
+          if (mask[i])
+            indices[count++] = i + 4;
+        }
+
+
+        workingReg.id = m_module.opVectorShuffle(getVectorTypeId(workingReg.type),
+          workingReg.id, indexVal.id, 4, indices.data());
+      }
+
+      // Ie. 0 or 1 for diffuse and specular color
+      // and for Shader Model 1 or 2
+      // (because those have dedicated color registers
+      // where this rule applies)
+      if (elem.semantic.usage == DxsoUsage::Color &&
+          elem.semantic.usageIndex < 2 &&
+          m_programInfo.majorVersion() < 3)
+        workingReg = emitSaturate(workingReg);
+
+      m_module.opStore(outputPtr.id, workingReg.id);
+    }
+
+    auto OutputDefault = [&](DxsoSemantic semantic) {
+      DxsoRegisterInfo info;
+      info.type.ctype   = DxsoScalarType::Float32;
+      info.type.ccount  = 4;
+      info.type.alength = 1;
+      info.sclass       = spv::StorageClassOutput;
+
+      uint32_t slot = RegisterLinkerSlot(semantic);
+
+      uint32_t value = semantic == DxsoSemantic{ DxsoUsage::Color, 0 }
+        ? m_module.constvec4f32(1.0f, 1.0f, 1.0f, 1.0f)
+        : m_module.constvec4f32(0.0f, 0.0f, 0.0f, 0.0f);
+
+
+      uint32_t outputPtr = emitNewVariableDefault(info, value);
+
+      m_module.decorateLocation(outputPtr, slot);
+
+      std::string name =
+        str::format("out_", semantic.usage, semantic.usageIndex, "_default");
+
+      m_module.setDebugName(outputPtr, name.c_str());
+
+      m_interfaceSlots.outputSlots |= 1u << slot;
+      m_entryPointInterfaces.push_back(outputPtr);
+    };
+
+    if (!outputtedColor0)
+      OutputDefault(DxsoSemantic{ DxsoUsage::Color, 0 });
+
+    if (!outputtedColor1)
+      OutputDefault(DxsoSemantic{ DxsoUsage::Color, 1 });
+
+    auto pointInfo = GetPointSizeInfoVS(m_module, m_vs.oPos.id, 0, 0, m_rsBlock, false);
+
+    if (m_vs.oPSize.id == 0) {
+      m_vs.oPSize = this->emitRegisterPtr(
+        "oPSize", DxsoScalarType::Float32, 1, 0,
+        spv::StorageClassOutput, spv::BuiltInPointSize);
+
+      uint32_t pointSize = m_module.opFClamp(m_module.defFloatType(32), pointInfo.defaultValue, pointInfo.min, pointInfo.max);
+
+      m_module.opStore(m_vs.oPSize.id, pointSize);
+    }
+    else {
+      uint32_t float_t = m_module.defFloatType(32);
+      uint32_t pointSize = m_module.opFClamp(m_module.defFloatType(32), m_module.opLoad(float_t, m_vs.oPSize.id), pointInfo.min, pointInfo.max);
+      m_module.opStore(m_vs.oPSize.id, pointSize);
+    }
+  }
+
+
+  void DxsoCompiler::emitVsClipping() {
+    uint32_t clipPlaneCountId = m_module.constu32(caps::MaxClipPlanes);
+    
+    uint32_t floatType = m_module.defFloatType(32);
+    uint32_t vec4Type  = m_module.defVectorType(floatType, 4);
+    
+    // Declare uniform buffer containing clip planes
+    uint32_t clipPlaneArray  = m_module.defArrayTypeUnique(vec4Type, clipPlaneCountId);
+    uint32_t clipPlaneStruct = m_module.defStructTypeUnique(1, &clipPlaneArray);
+    uint32_t clipPlaneBlock  = m_module.newVar(
+      m_module.defPointerType(clipPlaneStruct, spv::StorageClassUniform),
+      spv::StorageClassUniform);
+    
+    m_module.decorateArrayStride  (clipPlaneArray, 16);
+    
+    m_module.setDebugName         (clipPlaneStruct, "clip_info_t");
+    m_module.setDebugMemberName   (clipPlaneStruct, 0, "clip_planes");
+    m_module.decorate             (clipPlaneStruct, spv::DecorationBlock);
+    m_module.memberDecorateOffset (clipPlaneStruct, 0, 0);
+    
+    uint32_t bindingId = computeResourceSlotId(
+      m_programInfo.type(), DxsoBindingType::ConstantBuffer,
+      DxsoConstantBuffers::VSClipPlanes);
+    
+    m_module.setDebugName         (clipPlaneBlock, "clip_info");
+    m_module.decorateDescriptorSet(clipPlaneBlock, 0);
+    m_module.decorateBinding      (clipPlaneBlock, bindingId);
+    
+    DxvkResourceSlot resource;
+    resource.slot   = bindingId;
+    resource.type   = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
+    resource.view   = VK_IMAGE_VIEW_TYPE_MAX_ENUM;
+    resource.access = VK_ACCESS_UNIFORM_READ_BIT;
+    m_resourceSlots.push_back(resource);
+    
+    // Declare output array for clip distances
+    uint32_t clipDistArray = m_module.newVar(
+      m_module.defPointerType(
+        m_module.defArrayType(floatType, clipPlaneCountId),
+        spv::StorageClassOutput),
+      spv::StorageClassOutput);
+
+    m_module.decorateBuiltIn(clipDistArray, spv::BuiltInClipDistance);
+    m_entryPointInterfaces.push_back(clipDistArray);
+
+    if (m_moduleInfo.options.invariantPosition)
+      m_module.decorate(m_vs.oPos.id, spv::DecorationInvariant);
+    
+    const uint32_t positionPtr = m_vs.oPos.id;
+
+    // We generated a bad shader, let's not make it even worse.
+    if (positionPtr == 0) {
+      Logger::warn("Shader without Position output. Something is likely wrong here.");
+      return;
+    }
+
+    // Compute clip distances
+    uint32_t positionId = m_module.opLoad(vec4Type, positionPtr);
+    
+    for (uint32_t i = 0; i < caps::MaxClipPlanes; i++) {
+      std::array<uint32_t, 2> blockMembers = {{
+        m_module.constu32(0),
+        m_module.constu32(i),
+      }};
+      
+      uint32_t planeId = m_module.opLoad(vec4Type,
+        m_module.opAccessChain(
+          m_module.defPointerType(vec4Type, spv::StorageClassUniform),
+          clipPlaneBlock, blockMembers.size(), blockMembers.data()));
+      
+      uint32_t distId = m_module.opDot(floatType, positionId, planeId);
+      
+      m_module.opStore(
+        m_module.opAccessChain(
+          m_module.defPointerType(floatType, spv::StorageClassOutput),
+          clipDistArray, 1, &blockMembers[1]),
+        distId);
+    }
+  }
+
+
+  void DxsoCompiler::setupRenderStateInfo() {
+    uint32_t count;
+
+    // Only need alpha ref for PS 3.
+    // No FF fog component.
+    if (m_programInfo.type() == DxsoProgramType::PixelShader) {
+      if (m_programInfo.majorVersion() == 3) {
+        m_interfaceSlots.pushConstOffset = offsetof(D3D9RenderStateInfo, alphaRef);
+        m_interfaceSlots.pushConstSize   = sizeof(float);
+      }
+      else {
+        m_interfaceSlots.pushConstOffset = 0;
+        m_interfaceSlots.pushConstSize   = offsetof(D3D9RenderStateInfo, pointSize);
+      }
+
+      count = 5;
+    }
+    else {
+      m_interfaceSlots.pushConstOffset = offsetof(D3D9RenderStateInfo, pointSize);
+      // Point scale never triggers on programmable
+      m_interfaceSlots.pushConstSize   = sizeof(float) * 3;
+      count = 8;
+    }
+
+    m_rsBlock = SetupRenderStateBlock(m_module, count);
+  }
+
+
+  void DxsoCompiler::emitFog() {
+    DxsoRegister color0;
+    color0.id = DxsoRegisterId{ DxsoRegisterType::ColorOut, 0 };
+    auto oColor0Ptr = this->emitGetOperandPtr(color0);
+
+    DxsoRegister vFog;
+    vFog.id = DxsoRegisterId{ DxsoRegisterType::RasterizerOut, RasterOutFog };
+    auto vFogPtr = this->emitGetOperandPtr(vFog);
+
+    DxsoRegister vPos;
+    vPos.id = DxsoRegisterId{ DxsoRegisterType::MiscType, DxsoMiscTypeIndices::MiscTypePosition };
+    auto vPosPtr = this->emitGetOperandPtr(vPos);
+
+    D3D9FogContext fogCtx;
+    fogCtx.IsPixel     = true;
+    fogCtx.RangeFog    = false;
+    fogCtx.RenderState = m_rsBlock;
+    fogCtx.vPos        = m_module.opLoad(getVectorTypeId(vPosPtr.type),    vPosPtr.id);
+    fogCtx.vFog        = m_module.opLoad(getVectorTypeId(vFogPtr.type),    vFogPtr.id);
+    fogCtx.oColor      = m_module.opLoad(getVectorTypeId(oColor0Ptr.type), oColor0Ptr.id);
+    fogCtx.IsFixedFunction = false;
+    fogCtx.IsPositionT = false;
+    fogCtx.HasSpecular = false;
+    fogCtx.Specular    = 0;
+
+    m_module.opStore(oColor0Ptr.id, DoFixedFunctionFog(m_module, fogCtx));
+  }
+
+  
+  void DxsoCompiler::emitPsProcessing() {
+    uint32_t boolType  = m_module.defBoolType();
+    uint32_t floatType = m_module.defFloatType(32);
+    uint32_t floatPtr  = m_module.defPointerType(floatType, spv::StorageClassPushConstant);
+    
+    uint32_t alphaFuncId = m_module.specConst32(m_module.defIntType(32, 0), 0);
+    m_module.setDebugName   (alphaFuncId, "alpha_func");
+    m_module.decorateSpecId (alphaFuncId, getSpecId(D3D9SpecConstantId::AlphaCompareOp));
+
+    // Implement alpha test and fog
+    DxsoRegister color0;
+    color0.id = DxsoRegisterId{ DxsoRegisterType::ColorOut, 0 };
+    auto oC0 = this->emitGetOperandPtr(color0);
+    
+    if (oC0.id) {
+      if (m_programInfo.majorVersion() < 3)
+        emitFog();
+
+      // Labels for the alpha test
+      std::array<SpirvSwitchCaseLabel, 8> atestCaseLabels = {{
+        { uint32_t(VK_COMPARE_OP_NEVER),            m_module.allocateId() },
+        { uint32_t(VK_COMPARE_OP_LESS),             m_module.allocateId() },
+        { uint32_t(VK_COMPARE_OP_EQUAL),            m_module.allocateId() },
+        { uint32_t(VK_COMPARE_OP_LESS_OR_EQUAL),    m_module.allocateId() },
+        { uint32_t(VK_COMPARE_OP_GREATER),          m_module.allocateId() },
+        { uint32_t(VK_COMPARE_OP_NOT_EQUAL),        m_module.allocateId() },
+        { uint32_t(VK_COMPARE_OP_GREATER_OR_EQUAL), m_module.allocateId() },
+        { uint32_t(VK_COMPARE_OP_ALWAYS),           m_module.allocateId() },
+      }};
+      
+      uint32_t atestBeginLabel   = m_module.allocateId();
+      uint32_t atestTestLabel    = m_module.allocateId();
+      uint32_t atestDiscardLabel = m_module.allocateId();
+      uint32_t atestKeepLabel    = m_module.allocateId();
+      uint32_t atestSkipLabel    = m_module.allocateId();
+      
+      // if (alpha_func != ALWAYS) { ... }
+      uint32_t isNotAlways = m_module.opINotEqual(boolType, alphaFuncId, m_module.constu32(VK_COMPARE_OP_ALWAYS));
+      m_module.opSelectionMerge(atestSkipLabel, spv::SelectionControlMaskNone);
+      m_module.opBranchConditional(isNotAlways, atestBeginLabel, atestSkipLabel);
+      m_module.opLabel(atestBeginLabel);
+      
+      // Load alpha component
+      uint32_t alphaComponentId = 3;
+      uint32_t alphaId = m_module.opCompositeExtract(floatType,
+        m_module.opLoad(m_module.defVectorType(floatType, 4), oC0.id),
+        1, &alphaComponentId);
+
+      if (m_moduleInfo.options.alphaTestWiggleRoom) {
+        // NV has wonky interpolation of all 1's in a VS -> PS going to 0.999999...
+        // This causes garbage-looking graphics on people's clothing in EverQuest 2 as it does alpha == 1.0.
+
+        // My testing shows the alpha test has a precision of 1/256 for all A8 and below formats,
+        // and around 1 / 2048 for A32F formats and 1 / 4096 for A16F formats (It makes no sense to me too)
+        // so anyway, we're just going to round this to a precision of 1 / 4096 and hopefully this should make things happy
+        // everywhere.
+        const uint32_t alphaSizeId = m_module.constf32(4096.0f);
+
+        alphaId = m_module.opFMul(floatType, alphaId, alphaSizeId);
+        alphaId = m_module.opRound(floatType, alphaId);
+        alphaId = m_module.opFDiv(floatType, alphaId, alphaSizeId);
+      }
+      
+      // Load alpha reference
+      uint32_t alphaRefMember = m_module.constu32(uint32_t(D3D9RenderStateItem::AlphaRef));
+      uint32_t alphaRefId = m_module.opLoad(floatType,
+        m_module.opAccessChain(floatPtr, m_rsBlock, 1, &alphaRefMember));
+      
+      // switch (alpha_func) { ... }
+      m_module.opSelectionMerge(atestTestLabel, spv::SelectionControlMaskNone);
+      m_module.opSwitch(alphaFuncId,
+        atestCaseLabels[uint32_t(VK_COMPARE_OP_ALWAYS)].labelId,
+        atestCaseLabels.size(),
+        atestCaseLabels.data());
+      
+      std::array<SpirvPhiLabel, 8> atestVariables;
+      
+      for (uint32_t i = 0; i < atestCaseLabels.size(); i++) {
+        m_module.opLabel(atestCaseLabels[i].labelId);
+        
+        atestVariables[i].labelId = atestCaseLabels[i].labelId;
+        atestVariables[i].varId   = [&] {
+          switch (VkCompareOp(atestCaseLabels[i].literal)) {
+            case VK_COMPARE_OP_NEVER:            return m_module.constBool(false);
+            case VK_COMPARE_OP_LESS:             return m_module.opFOrdLessThan        (boolType, alphaId, alphaRefId);
+            case VK_COMPARE_OP_EQUAL:            return m_module.opFOrdEqual           (boolType, alphaId, alphaRefId);
+            case VK_COMPARE_OP_LESS_OR_EQUAL:    return m_module.opFOrdLessThanEqual   (boolType, alphaId, alphaRefId);
+            case VK_COMPARE_OP_GREATER:          return m_module.opFOrdGreaterThan     (boolType, alphaId, alphaRefId);
+            case VK_COMPARE_OP_NOT_EQUAL:        return m_module.opFOrdNotEqual        (boolType, alphaId, alphaRefId);
+            case VK_COMPARE_OP_GREATER_OR_EQUAL: return m_module.opFOrdGreaterThanEqual(boolType, alphaId, alphaRefId);
+            default:
+            case VK_COMPARE_OP_ALWAYS:           return m_module.constBool(true);
+          }
+        }();
+        
+        m_module.opBranch(atestTestLabel);
+      }
+      
+      // end switch
+      m_module.opLabel(atestTestLabel);
+      
+      uint32_t atestResult = m_module.opPhi(boolType,
+        atestVariables.size(),
+        atestVariables.data());
+      uint32_t atestDiscard = m_module.opLogicalNot(boolType, atestResult);
+      
+      // if (do_discard) { ... }
+      m_module.opSelectionMerge(atestKeepLabel, spv::SelectionControlMaskNone);
+      m_module.opBranchConditional(atestDiscard, atestDiscardLabel, atestKeepLabel);
+      
+      m_module.opLabel(atestDiscardLabel);
+      m_module.opKill();
+      
+      // end if (do_discard)
+      m_module.opLabel(atestKeepLabel);
+      m_module.opBranch(atestSkipLabel);
+      
+      // end if (alpha_test)
+      m_module.opLabel(atestSkipLabel);
+    }
+  }
+
+  void DxsoCompiler::emitOutputDepthClamp() {
+    // HACK: Some drivers do not clamp FragDepth to [minDepth..maxDepth]
+    // before writing to the depth attachment, but we do not have acccess
+    // to those. Clamp to [0..1] instead.
+
+    if (m_ps.oDepth.id != 0) {
+      auto result = emitValueLoad(m_ps.oDepth);
+
+      result = emitSaturate(result);
+
+      m_module.opStore(
+        m_ps.oDepth.id,
+        result.id);
+    }
+}
+
+
+  void DxsoCompiler::emitVsFinalize() {
+    this->emitMainFunctionBegin();
+
+    this->emitInputSetup();
+    m_module.opFunctionCall(
+      m_module.defVoidType(),
+      m_vs.functionId, 0, nullptr);
+    this->emitLinkerOutputSetup();
+
+    this->emitVsClipping();
+
+    this->emitFunctionEnd();
+  }
+
+  void DxsoCompiler::emitPsFinalize() {
+    this->emitMainFunctionBegin();
+
+    this->emitInputSetup();
+
+    bool canUsePixelFog = m_programInfo.majorVersion() < 3;
+
+    if (canUsePixelFog) {
+      // Look up vPos so it gets initted.
+      DxsoRegister vPos;
+      vPos.id = DxsoRegisterId{ DxsoRegisterType::MiscType, DxsoMiscTypeIndices::MiscTypePosition };
+      this->emitGetOperandPtr(vPos);
+    }
+
+    if (m_ps.vPos.id != 0) {
+      DxsoRegisterPointer fragCoord = this->emitRegisterPtr(
+        "ps_frag_coord", DxsoScalarType::Float32, 4, 0,
+        spv::StorageClassInput, spv::BuiltInFragCoord);
+
+      DxsoRegisterValue val = this->emitValueLoad(fragCoord);
+      val.id = m_module.opFSub(
+        getVectorTypeId(val.type), val.id,
+        m_module.constvec4f32(0.5f, 0.5f, 0.0f, 0.0f));
+
+      m_module.opStore(m_ps.vPos.id, val.id);
+    }
+
+    if (m_ps.vFace.id != 0) {
+      DxsoRegisterPointer faceBool = this->emitRegisterPtr(
+        "ps_is_front_face", DxsoScalarType::Bool, 1, 0,
+        spv::StorageClassInput, spv::BuiltInFrontFacing);
+
+      DxsoRegisterValue frontFace = emitValueLoad(faceBool);
+      DxsoRegisterValue selectOp = emitRegisterExtend(frontFace, 4);
+
+      m_module.opStore(
+        m_ps.vFace.id,
+        m_module.opSelect(getVectorTypeId(m_ps.vFace.type), selectOp.id,
+          m_module.constvec4f32( 1.0f,  1.0f,  1.0f,  1.0f),
+          m_module.constvec4f32(-1.0f, -1.0f, -1.0f, -1.0f)));
+    }
+
+    m_module.opFunctionCall(
+      m_module.defVoidType(),
+      m_ps.functionId, 0, nullptr);
+
+    if (m_ps.killState != 0) {
+      uint32_t labelIf  = m_module.allocateId();
+      uint32_t labelEnd = m_module.allocateId();
+      
+      uint32_t killTest = m_module.opLoad(m_module.defBoolType(), m_ps.killState);
+      
+      m_module.opSelectionMerge(labelEnd, spv::SelectionControlMaskNone);
+      m_module.opBranchConditional(killTest, labelIf, labelEnd);
+      
+      m_module.opLabel(labelIf);
+      m_module.opKill();
+      
+      m_module.opLabel(labelEnd);
+    }
+
+    // r0 in PS1 is the colour output register. Move r0 -> cO0 here.
+    if (m_programInfo.majorVersion() == 1
+    && m_programInfo.type() == DxsoProgramTypes::PixelShader) {
+      DxsoRegister r0;
+      r0.id = { DxsoRegisterType::Temp, 0 };
+
+      DxsoRegister c0;
+      c0.id = { DxsoRegisterType::ColorOut, 0 };
+
+      DxsoRegisterValue val   = emitRegisterLoadRaw(r0, nullptr);
+      DxsoRegisterPointer out = emitGetOperandPtr(c0);
+      m_module.opStore(out.id, val.id);
+    }
+
+    // No need to setup output here as it's non-indexable
+    // everything has already gone to the right place!
+
+    this->emitPsProcessing();
+    this->emitOutputDepthClamp();
+    this->emitFunctionEnd();
+  }
+
+
+
+  uint32_t DxsoCompiler::getScalarTypeId(DxsoScalarType type) {
+    switch (type) {
+      case DxsoScalarType::Uint32:  return m_module.defIntType(32, 0);
+      case DxsoScalarType::Sint32:  return m_module.defIntType(32, 1);
+      case DxsoScalarType::Float32: return m_module.defFloatType(32);
+      case DxsoScalarType::Bool:    return m_module.defBoolType();
+    }
+
+    throw DxvkError("DxsoCompiler: Invalid scalar type");
+  }
+
+
+  uint32_t DxsoCompiler::getVectorTypeId(const DxsoVectorType& type) {
+    uint32_t typeId = this->getScalarTypeId(type.ctype);
+
+    if (type.ccount > 1)
+      typeId = m_module.defVectorType(typeId, type.ccount);
+
+    return typeId;
+  }
+
+
+  uint32_t DxsoCompiler::getArrayTypeId(const DxsoArrayType& type) {
+    DxsoVectorType vtype;
+    vtype.ctype  = type.ctype;
+    vtype.ccount = type.ccount;
+
+    uint32_t typeId = this->getVectorTypeId(vtype);
+
+    if (type.alength > 1) {
+      typeId = m_module.defArrayType(typeId,
+        m_module.constu32(type.alength));
+    }
+
+    return typeId;
+  }
+
+
+  uint32_t DxsoCompiler::getPointerTypeId(const DxsoRegisterInfo& type) {
+    return m_module.defPointerType(
+      this->getArrayTypeId(type.type),
+      type.sclass);
+  }
+
+}
+\ No newline at end of file