/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 *
 * Copyright 2016 Mozilla Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// This is an INTERNAL header for Wasm baseline compiler: CPU stack frame,
// stack maps, and associated logic.

#ifndef wasm_wasm_baseline_frame_h
#define wasm_wasm_baseline_frame_h

#include "wasm/WasmBaselineCompile.h"  // For BaseLocalIter
#include "wasm/WasmBCDefs.h"
#include "wasm/WasmBCRegDefs.h"
#include "wasm/WasmBCStk.h"
#include "wasm/WasmConstants.h"  // For MaxFrameSize

// [SMDOC] Wasm baseline compiler's stack frame.
//
// For background, see "Wasm's ABIs" in WasmFrame.h; the following should never
// be in conflict with that.
//
// The stack frame has four parts ("below" means at lower addresses):
//
// - the Frame element;
// - the Local area, including the DebugFrame element and possibly a spilled
//   pointer to stack results, if any; allocated below the header with various
//   forms of alignment;
// - the Dynamic area, comprising the temporary storage the compiler uses for
//   register spilling, allocated below the Local area;
// - the Arguments area, comprising memory allocated for outgoing calls,
//   allocated below the Dynamic area.
//
//                +==============================+
//                |    Incoming stack arg        |
//                |    ...
//                |
//  ------------- +==============================+
//                |    Frame (fixed size)        |
//  ------------- +==============================+ <-------------------- FP
//       ^        |    DebugFrame (optional)     |    ^  ^           ^^
//  localSize     |    Register arg local        |    |  |           ||
//       |        |    ...                       |    |  |     framePushed
//       |        |    Register stack result ptr?|    |  |           ||
//       |        |    Non-arg local             |    |  |           ||
//       |        |    ...                       |    |  |           ||
//       |        |    (padding)                 |    |  |           ||
//       |        |    Instance pointer          |    |  |           ||
//       |        +------------------------------+    |  |           ||
//       v        |    (padding)                 |    |  v           ||
//  ------------- +==============================+ currentStackHeight ||
//       ^        |    Dynamic (variable size)   |                    ||
//  dynamicSize   |    ...                       |                    ||
//       v        |    ...                       |                 v  ||
//  ------------- |    (free space, sometimes)   | ---------          v|
//                +==============================+ <----- SP not-during calls
//                |    Arguments (sometimes)     |                     |
//                |    ...                       |                     v
//                +==============================+ <----- SP during calls
//
// The Frame is addressed off the stack pointer.  masm.framePushed() is always
// correct, and masm.getStackPointer() + masm.framePushed() always addresses
// the Frame, with the DebugFrame optionally below it.
//
// The Local area (including the DebugFrame and, if needed, the spilled value
// of the stack results area pointer) is laid out by BaseLocalIter and is
// allocated and deallocated by standard prologue and epilogue functions that
// manipulate the stack pointer, but it is accessed via BaseStackFrame.
//
// The Dynamic area is maintained by and accessed via BaseStackFrame.  On some
// systems (such as ARM64), the Dynamic memory may be allocated in chunks
// because the SP needs a specific alignment, and in this case there will
// normally be some free space directly above the SP.  The stack height does
// not include the free space, it reflects the logically used space only.
//
// The Dynamic area is where space for stack results is allocated when calling
// functions that return results on the stack.
If a function has stack results, // a pointer to the low address of the stack result area is passed as an // additional argument, according to the usual ABI. See // ABIResultIter::HasStackResults. // // The Arguments area is allocated and deallocated via BaseStackFrame (see // comments later) but is accessed directly off the stack pointer. namespace js { namespace wasm { using namespace js::jit; // Abstraction of the height of the stack frame, to avoid type confusion. class StackHeight { friend class BaseStackFrameAllocator; uint32_t height; public: explicit StackHeight(uint32_t h) : height(h) {} static StackHeight Invalid() { return StackHeight(UINT32_MAX); } bool isValid() const { return height != UINT32_MAX; } bool operator==(StackHeight rhs) const { MOZ_ASSERT(isValid() && rhs.isValid()); return height == rhs.height; } bool operator!=(StackHeight rhs) const { return !(*this == rhs); } }; // Abstraction for where multi-value results go on the machine stack. class StackResultsLoc { uint32_t bytes_; size_t count_; Maybe height_; public: StackResultsLoc() : bytes_(0), count_(0){}; StackResultsLoc(uint32_t bytes, size_t count, uint32_t height) : bytes_(bytes), count_(count), height_(Some(height)) { MOZ_ASSERT(bytes != 0); MOZ_ASSERT(count != 0); MOZ_ASSERT(height != 0); } uint32_t bytes() const { return bytes_; } uint32_t count() const { return count_; } uint32_t height() const { return height_.value(); } bool hasStackResults() const { return bytes() != 0; } StackResults stackResults() const { return hasStackResults() ? StackResults::HasStackResults : StackResults::NoStackResults; } }; // Abstraction of the baseline compiler's stack frame (except for the Frame / // DebugFrame parts). See comments above for more. Remember, "below" on the // stack means at lower addresses. 
// // The abstraction is split into two parts: BaseStackFrameAllocator is // responsible for allocating and deallocating space on the stack and for // performing computations that are affected by how the allocation is performed; // BaseStackFrame then provides a pleasant interface for stack frame management. class BaseStackFrameAllocator { MacroAssembler& masm; #ifdef RABALDR_CHUNKY_STACK // On platforms that require the stack pointer to be aligned on a boundary // greater than the typical stack item (eg, ARM64 requires 16-byte alignment // but items are 8 bytes), allocate stack memory in chunks, and use a // separate stack height variable to track the effective stack pointer // within the allocated area. Effectively, there's a variable amount of // free space directly above the stack pointer. See diagram above. // The following must be true in order for the stack height to be // predictable at control flow joins: // // - The Local area is always aligned according to WasmStackAlignment, ie, // masm.framePushed() % WasmStackAlignment is zero after allocating // locals. // // - ChunkSize is always a multiple of WasmStackAlignment. // // - Pushing and popping are always in units of ChunkSize (hence preserving // alignment). // // - The free space on the stack (masm.framePushed() - currentStackHeight_) // is a predictable (nonnegative) amount. // As an optimization, we pre-allocate some space on the stack, the size of // this allocation is InitialChunk and it must be a multiple of ChunkSize. // It is allocated as part of the function prologue and deallocated as part // of the epilogue, along with the locals. // // If ChunkSize is too large then we risk overflowing the stack on simple // recursions with few live values where stack overflow should not be a // risk; if it is too small we spend too much time adjusting the stack // pointer. // // Good values for ChunkSize are the subject of future empirical analysis; // eight words is just an educated guess. 
static constexpr uint32_t ChunkSize = 8 * sizeof(void*); static constexpr uint32_t InitialChunk = ChunkSize; // The current logical height of the frame is // currentStackHeight_ = localSize_ + dynamicSize // where dynamicSize is not accounted for explicitly and localSize_ also // includes size for the DebugFrame. // // The allocated size of the frame, provided by masm.framePushed(), is usually // larger than currentStackHeight_, notably at the beginning of execution when // we've allocated InitialChunk extra space. uint32_t currentStackHeight_; #endif // Size of the Local area in bytes (stable after BaseCompiler::init() has // called BaseStackFrame::setupLocals(), which in turn calls // BaseStackFrameAllocator::setLocalSize()), always rounded to the proper // stack alignment. The Local area is then allocated in beginFunction(), // following the allocation of the Header. See onFixedStackAllocated() // below. uint32_t localSize_; protected: /////////////////////////////////////////////////////////////////////////// // // Initialization explicit BaseStackFrameAllocator(MacroAssembler& masm) : masm(masm), #ifdef RABALDR_CHUNKY_STACK currentStackHeight_(0), #endif localSize_(UINT32_MAX) { } protected: ////////////////////////////////////////////////////////////////////// // // The Local area - the static part of the frame. // Record the size of the Local area, once it is known. void setLocalSize(uint32_t localSize) { MOZ_ASSERT(localSize == AlignBytes(localSize, sizeof(void*)), "localSize_ should be aligned to at least a pointer"); MOZ_ASSERT(localSize_ == UINT32_MAX); localSize_ = localSize; } // Record the current stack height, after it has become stable in // beginFunction(). See also BaseStackFrame::onFixedStackAllocated(). 
void onFixedStackAllocated() { MOZ_ASSERT(localSize_ != UINT32_MAX); #ifdef RABALDR_CHUNKY_STACK currentStackHeight_ = localSize_; #endif } public: // The fixed amount of memory, in bytes, allocated on the stack below the // Header for purposes such as locals and other fixed values. Includes all // necessary alignment, and on ARM64 also the initial chunk for the working // stack memory. uint32_t fixedAllocSize() const { MOZ_ASSERT(localSize_ != UINT32_MAX); #ifdef RABALDR_CHUNKY_STACK return localSize_ + InitialChunk; #else return localSize_; #endif } #ifdef RABALDR_CHUNKY_STACK // The allocated frame size is frequently larger than the logical stack // height; we round up to a chunk boundary, and special case the initial // chunk. uint32_t framePushedForHeight(uint32_t logicalHeight) { if (logicalHeight <= fixedAllocSize()) { return fixedAllocSize(); } return fixedAllocSize() + AlignBytes(logicalHeight - fixedAllocSize(), ChunkSize); } #endif protected: ////////////////////////////////////////////////////////////////////// // // The Dynamic area - the dynamic part of the frame, for spilling and saving // intermediate values. // Offset off of sp_ for the slot at stack area location `offset`. 
int32_t stackOffset(int32_t offset) { MOZ_ASSERT(offset > 0); return masm.framePushed() - offset; } uint32_t computeHeightWithStackResults(StackHeight stackBase, uint32_t stackResultBytes) { MOZ_ASSERT(stackResultBytes); MOZ_ASSERT(currentStackHeight() >= stackBase.height); return stackBase.height + stackResultBytes; } #ifdef RABALDR_CHUNKY_STACK void pushChunkyBytes(uint32_t bytes) { checkChunkyInvariants(); uint32_t freeSpace = masm.framePushed() - currentStackHeight_; if (freeSpace < bytes) { uint32_t bytesToReserve = AlignBytes(bytes - freeSpace, ChunkSize); MOZ_ASSERT(bytesToReserve + freeSpace >= bytes); masm.reserveStack(bytesToReserve); } currentStackHeight_ += bytes; checkChunkyInvariants(); } void popChunkyBytes(uint32_t bytes) { checkChunkyInvariants(); currentStackHeight_ -= bytes; // Sometimes, popChunkyBytes() is used to pop a larger area, as when we drop // values consumed by a call, and we may need to drop several chunks. But // never drop the initial chunk. Crucially, the amount we drop is always an // integral number of chunks. uint32_t freeSpace = masm.framePushed() - currentStackHeight_; if (freeSpace >= ChunkSize) { uint32_t targetAllocSize = framePushedForHeight(currentStackHeight_); uint32_t amountToFree = masm.framePushed() - targetAllocSize; MOZ_ASSERT(amountToFree % ChunkSize == 0); if (amountToFree) { masm.freeStack(amountToFree); } } checkChunkyInvariants(); } #endif uint32_t currentStackHeight() const { #ifdef RABALDR_CHUNKY_STACK return currentStackHeight_; #else return masm.framePushed(); #endif } private: #ifdef RABALDR_CHUNKY_STACK void checkChunkyInvariants() { MOZ_ASSERT(masm.framePushed() >= fixedAllocSize()); MOZ_ASSERT(masm.framePushed() >= currentStackHeight_); MOZ_ASSERT(masm.framePushed() == fixedAllocSize() || masm.framePushed() - currentStackHeight_ < ChunkSize); MOZ_ASSERT((masm.framePushed() - localSize_) % ChunkSize == 0); } #endif // For a given stack height, return the appropriate size of the allocated // frame. 
uint32_t framePushedForHeight(StackHeight stackHeight) { #ifdef RABALDR_CHUNKY_STACK // A more complicated adjustment is needed. return framePushedForHeight(stackHeight.height); #else // The allocated frame size equals the stack height. return stackHeight.height; #endif } public: // The current height of the stack area, not necessarily zero-based, in a // type-safe way. StackHeight stackHeight() const { return StackHeight(currentStackHeight()); } // Set the frame height to a previously recorded value. void setStackHeight(StackHeight amount) { #ifdef RABALDR_CHUNKY_STACK currentStackHeight_ = amount.height; masm.setFramePushed(framePushedForHeight(amount)); checkChunkyInvariants(); #else masm.setFramePushed(amount.height); #endif } // The current height of the dynamic part of the stack area (ie, the backing // store for the evaluation stack), zero-based. uint32_t dynamicHeight() const { return currentStackHeight() - localSize_; } // Before branching to an outer control label, pop the execution stack to // the level expected by that region, but do not update masm.framePushed() // as that will happen as compilation leaves the block. // // Note these operate directly on the stack pointer register. void popStackBeforeBranch(StackHeight destStackHeight, uint32_t stackResultBytes) { uint32_t framePushedHere = masm.framePushed(); StackHeight heightThere = StackHeight(destStackHeight.height + stackResultBytes); uint32_t framePushedThere = framePushedForHeight(heightThere); if (framePushedHere > framePushedThere) { masm.addToStackPtr(Imm32(framePushedHere - framePushedThere)); } } void popStackBeforeBranch(StackHeight destStackHeight, ResultType type) { popStackBeforeBranch(destStackHeight, ABIResultIter::MeasureStackBytes(type)); } // Given that there are |stackParamSize| bytes on the dynamic stack // corresponding to the stack results, return the stack height once these // parameters are popped. 
StackHeight stackResultsBase(uint32_t stackParamSize) { return StackHeight(currentStackHeight() - stackParamSize); } // For most of WebAssembly, adjacent instructions have fallthrough control // flow between them, which allows us to simply thread the current stack // height through the compiler. There are two exceptions to this rule: when // leaving a block via dead code, and when entering the "else" arm of an "if". // In these cases, the stack height is the block entry height, plus any stack // values (results in the block exit case, parameters in the else entry case). void resetStackHeight(StackHeight destStackHeight, ResultType type) { uint32_t height = destStackHeight.height; height += ABIResultIter::MeasureStackBytes(type); setStackHeight(StackHeight(height)); } // Return offset of stack result. uint32_t locateStackResult(const ABIResult& result, StackHeight stackBase, uint32_t stackResultBytes) { MOZ_ASSERT(result.onStack()); MOZ_ASSERT(result.stackOffset() + result.size() <= stackResultBytes); uint32_t end = computeHeightWithStackResults(stackBase, stackResultBytes); return end - result.stackOffset(); } public: ////////////////////////////////////////////////////////////////////// // // The Argument area - for outgoing calls. // // We abstract these operations as an optimization: we can merge the freeing // of the argument area and dropping values off the stack after a call. But // they always amount to manipulating the real stack pointer by some amount. // // Note that we do not update currentStackHeight_ for this; the frame does // not know about outgoing arguments. But we do update framePushed(), so we // can still index into the frame below the outgoing arguments area. // This is always equivalent to a masm.reserveStack() call. void allocArgArea(size_t argSize) { if (argSize) { masm.reserveStack(argSize); } } // This frees the argument area allocated by allocArgArea(), and `argSize` // must be equal to the `argSize` argument to allocArgArea(). 
In addition // we drop some values from the frame, corresponding to the values that were // consumed by the call. void freeArgAreaAndPopBytes(size_t argSize, size_t dropSize) { #ifdef RABALDR_CHUNKY_STACK // Freeing the outgoing arguments and freeing the consumed values have // different semantics here, which is why the operation is split. if (argSize) { masm.freeStack(argSize); } popChunkyBytes(dropSize); #else if (argSize + dropSize) { masm.freeStack(argSize + dropSize); } #endif } }; class BaseStackFrame final : public BaseStackFrameAllocator { MacroAssembler& masm; // The largest observed value of masm.framePushed(), ie, the size of the // stack frame. Read this for its true value only when code generation is // finished. uint32_t maxFramePushed_; // Patch point where we check for stack overflow. CodeOffset stackAddOffset_; // Low byte offset of pointer to stack results, if any. Maybe stackResultsPtrOffset_; // The offset of instance pointer. uint32_t instancePointerOffset_; // Low byte offset of local area for true locals (not parameters). uint32_t varLow_; // High byte offset + 1 of local area for true locals. uint32_t varHigh_; // The stack pointer, cached for brevity. RegisterOrSP sp_; public: explicit BaseStackFrame(MacroAssembler& masm) : BaseStackFrameAllocator(masm), masm(masm), maxFramePushed_(0), stackAddOffset_(0), instancePointerOffset_(UINT32_MAX), varLow_(UINT32_MAX), varHigh_(UINT32_MAX), sp_(masm.getStackPointer()) {} /////////////////////////////////////////////////////////////////////////// // // Stack management and overflow checking // This must be called once beginFunction has allocated space for the Header // (the Frame and DebugFrame) and the Local area, and will record the current // frame size for internal use by the stack abstractions. 
void onFixedStackAllocated() { maxFramePushed_ = masm.framePushed(); BaseStackFrameAllocator::onFixedStackAllocated(); } // We won't know until after we've generated code how big the frame will be // (we may need arbitrary spill slots and outgoing param slots) so emit a // patchable add that is patched in endFunction(). // // Note the platform scratch register may be used by branchPtr(), so // generally tmp must be something else. void checkStack(Register tmp, BytecodeOffset trapOffset) { stackAddOffset_ = masm.sub32FromStackPtrWithPatch(tmp); Label ok; masm.branchPtr(Assembler::Below, Address(InstanceReg, wasm::Instance::offsetOfStackLimit()), tmp, &ok); masm.wasmTrap(Trap::StackOverflow, trapOffset); masm.bind(&ok); } void patchCheckStack() { masm.patchSub32FromStackPtr(stackAddOffset_, Imm32(int32_t(maxFramePushed_))); } // Very large frames are implausible, probably an attack. bool checkStackHeight() { return maxFramePushed_ <= MaxFrameSize; } /////////////////////////////////////////////////////////////////////////// // // Local area struct Local { // Type of the value. const MIRType type; // Byte offset from Frame "into" the locals, ie positive for true locals // and negative for incoming args that read directly from the arg area. // It assumes the stack is growing down and that locals are on the stack // at lower addresses than Frame, and is the offset from Frame of the // lowest-addressed byte of the local. const int32_t offs; Local(MIRType type, int32_t offs) : type(type), offs(offs) {} bool isStackArgument() const { return offs < 0; } }; // Profiling shows that the number of parameters and locals frequently // touches or exceeds 8. So 16 seems like a reasonable starting point. using LocalVector = Vector; // Initialize `localInfo` based on the types of `locals` and `args`. 
[[nodiscard]] bool setupLocals(const ValTypeVector& locals, const ArgTypeVector& args, bool debugEnabled, LocalVector* localInfo) { if (!localInfo->reserve(locals.length())) { return false; } DebugOnly index = 0; BaseLocalIter i(locals, args, debugEnabled); for (; !i.done() && i.index() < args.lengthWithoutStackResults(); i++) { MOZ_ASSERT(i.isArg()); MOZ_ASSERT(i.index() == index); localInfo->infallibleEmplaceBack(i.mirType(), i.frameOffset()); index++; } varLow_ = i.frameSize(); for (; !i.done(); i++) { MOZ_ASSERT(!i.isArg()); MOZ_ASSERT(i.index() == index); localInfo->infallibleEmplaceBack(i.mirType(), i.frameOffset()); index++; } varHigh_ = i.frameSize(); // Reserve an additional stack slot for the instance pointer. const uint32_t pointerAlignedVarHigh = AlignBytes(varHigh_, sizeof(void*)); const uint32_t localSize = pointerAlignedVarHigh + sizeof(void*); instancePointerOffset_ = localSize; setLocalSize(AlignBytes(localSize, WasmStackAlignment)); if (args.hasSyntheticStackResultPointerArg()) { stackResultsPtrOffset_ = Some(i.stackResultPointerOffset()); } return true; } void zeroLocals(BaseRegAlloc* ra); Address addressOfLocal(const Local& local, uint32_t additionalOffset = 0) { if (local.isStackArgument()) { return Address(FramePointer, stackArgumentOffsetFromFp(local) + additionalOffset); } return Address(sp_, localOffsetFromSp(local) + additionalOffset); } void loadLocalI32(const Local& src, RegI32 dest) { masm.load32(addressOfLocal(src), dest); } #ifndef JS_PUNBOX64 void loadLocalI64Low(const Local& src, RegI32 dest) { masm.load32(addressOfLocal(src, INT64LOW_OFFSET), dest); } void loadLocalI64High(const Local& src, RegI32 dest) { masm.load32(addressOfLocal(src, INT64HIGH_OFFSET), dest); } #endif void loadLocalI64(const Local& src, RegI64 dest) { masm.load64(addressOfLocal(src), dest); } void loadLocalRef(const Local& src, RegRef dest) { masm.loadPtr(addressOfLocal(src), dest); } void loadLocalF64(const Local& src, RegF64 dest) { 
masm.loadDouble(addressOfLocal(src), dest); } void loadLocalF32(const Local& src, RegF32 dest) { masm.loadFloat32(addressOfLocal(src), dest); } #ifdef ENABLE_WASM_SIMD void loadLocalV128(const Local& src, RegV128 dest) { masm.loadUnalignedSimd128(addressOfLocal(src), dest); } #endif void storeLocalI32(RegI32 src, const Local& dest) { masm.store32(src, addressOfLocal(dest)); } void storeLocalI64(RegI64 src, const Local& dest) { masm.store64(src, addressOfLocal(dest)); } void storeLocalRef(RegRef src, const Local& dest) { masm.storePtr(src, addressOfLocal(dest)); } void storeLocalF64(RegF64 src, const Local& dest) { masm.storeDouble(src, addressOfLocal(dest)); } void storeLocalF32(RegF32 src, const Local& dest) { masm.storeFloat32(src, addressOfLocal(dest)); } #ifdef ENABLE_WASM_SIMD void storeLocalV128(RegV128 src, const Local& dest) { masm.storeUnalignedSimd128(src, addressOfLocal(dest)); } #endif // Offset off of sp_ for `local`. int32_t localOffsetFromSp(const Local& local) { MOZ_ASSERT(!local.isStackArgument()); return localOffset(local.offs); } // Offset off of frame pointer for `stack argument`. int32_t stackArgumentOffsetFromFp(const Local& local) { MOZ_ASSERT(local.isStackArgument()); return -local.offs; } // The incoming stack result area pointer is for stack results of the function // being compiled. void loadIncomingStackResultAreaPtr(RegPtr reg) { const int32_t offset = stackResultsPtrOffset_.value(); Address src = offset < 0 ? Address(FramePointer, -offset) : Address(sp_, stackOffset(offset)); masm.loadPtr(src, reg); } void storeIncomingStackResultAreaPtr(RegPtr reg) { // If we get here, that means the pointer to the stack results area was // passed in as a register, and therefore it will be spilled below the // frame, so the offset is a positive height. 
MOZ_ASSERT(stackResultsPtrOffset_.value() > 0); masm.storePtr(reg, Address(sp_, stackOffset(stackResultsPtrOffset_.value()))); } void loadInstancePtr(Register dst) { masm.loadPtr(Address(sp_, stackOffset(instancePointerOffset_)), dst); } void storeInstancePtr(Register instance) { masm.storePtr(instance, Address(sp_, stackOffset(instancePointerOffset_))); } int32_t getInstancePtrOffset() { return stackOffset(instancePointerOffset_); } // An outgoing stack result area pointer is for stack results of callees of // the function being compiled. void computeOutgoingStackResultAreaPtr(const StackResultsLoc& results, RegPtr dest) { MOZ_ASSERT(results.height() <= masm.framePushed()); uint32_t offsetFromSP = masm.framePushed() - results.height(); masm.moveStackPtrTo(dest); if (offsetFromSP) { masm.addPtr(Imm32(offsetFromSP), dest); } } private: // Offset off of sp_ for a local with offset `offset` from Frame. int32_t localOffset(int32_t offset) { return masm.framePushed() - offset; } public: /////////////////////////////////////////////////////////////////////////// // // Dynamic area static constexpr size_t StackSizeOfPtr = ABIResult::StackSizeOfPtr; static constexpr size_t StackSizeOfInt64 = ABIResult::StackSizeOfInt64; static constexpr size_t StackSizeOfFloat = ABIResult::StackSizeOfFloat; static constexpr size_t StackSizeOfDouble = ABIResult::StackSizeOfDouble; #ifdef ENABLE_WASM_SIMD static constexpr size_t StackSizeOfV128 = ABIResult::StackSizeOfV128; #endif // Pushes the register `r` to the stack. This pushes the full 64-bit width on // 64-bit systems, and 32-bits otherwise. 
uint32_t pushGPR(Register r) { DebugOnly stackBefore = currentStackHeight(); #ifdef RABALDR_CHUNKY_STACK pushChunkyBytes(StackSizeOfPtr); masm.storePtr(r, Address(sp_, stackOffset(currentStackHeight()))); #else masm.Push(r); #endif maxFramePushed_ = std::max(maxFramePushed_, masm.framePushed()); MOZ_ASSERT(stackBefore + StackSizeOfPtr == currentStackHeight()); return currentStackHeight(); } uint32_t pushFloat32(FloatRegister r) { DebugOnly stackBefore = currentStackHeight(); #ifdef RABALDR_CHUNKY_STACK pushChunkyBytes(StackSizeOfFloat); masm.storeFloat32(r, Address(sp_, stackOffset(currentStackHeight()))); #else masm.Push(r); #endif maxFramePushed_ = std::max(maxFramePushed_, masm.framePushed()); MOZ_ASSERT(stackBefore + StackSizeOfFloat == currentStackHeight()); return currentStackHeight(); } #ifdef ENABLE_WASM_SIMD uint32_t pushV128(RegV128 r) { DebugOnly stackBefore = currentStackHeight(); # ifdef RABALDR_CHUNKY_STACK pushChunkyBytes(StackSizeOfV128); # else masm.adjustStack(-(int)StackSizeOfV128); # endif masm.storeUnalignedSimd128(r, Address(sp_, stackOffset(currentStackHeight()))); maxFramePushed_ = std::max(maxFramePushed_, masm.framePushed()); MOZ_ASSERT(stackBefore + StackSizeOfV128 == currentStackHeight()); return currentStackHeight(); } #endif uint32_t pushDouble(FloatRegister r) { DebugOnly stackBefore = currentStackHeight(); #ifdef RABALDR_CHUNKY_STACK pushChunkyBytes(StackSizeOfDouble); masm.storeDouble(r, Address(sp_, stackOffset(currentStackHeight()))); #else masm.Push(r); #endif maxFramePushed_ = std::max(maxFramePushed_, masm.framePushed()); MOZ_ASSERT(stackBefore + StackSizeOfDouble == currentStackHeight()); return currentStackHeight(); } // Pops the stack into the register `r`. This pops the full 64-bit width on // 64-bit systems, and 32-bits otherwise. 
void popGPR(Register r) { DebugOnly stackBefore = currentStackHeight(); #ifdef RABALDR_CHUNKY_STACK masm.loadPtr(Address(sp_, stackOffset(currentStackHeight())), r); popChunkyBytes(StackSizeOfPtr); #else masm.Pop(r); #endif MOZ_ASSERT(stackBefore - StackSizeOfPtr == currentStackHeight()); } void popFloat32(FloatRegister r) { DebugOnly stackBefore = currentStackHeight(); #ifdef RABALDR_CHUNKY_STACK masm.loadFloat32(Address(sp_, stackOffset(currentStackHeight())), r); popChunkyBytes(StackSizeOfFloat); #else masm.Pop(r); #endif MOZ_ASSERT(stackBefore - StackSizeOfFloat == currentStackHeight()); } void popDouble(FloatRegister r) { DebugOnly stackBefore = currentStackHeight(); #ifdef RABALDR_CHUNKY_STACK masm.loadDouble(Address(sp_, stackOffset(currentStackHeight())), r); popChunkyBytes(StackSizeOfDouble); #else masm.Pop(r); #endif MOZ_ASSERT(stackBefore - StackSizeOfDouble == currentStackHeight()); } #ifdef ENABLE_WASM_SIMD void popV128(RegV128 r) { DebugOnly stackBefore = currentStackHeight(); masm.loadUnalignedSimd128(Address(sp_, stackOffset(currentStackHeight())), r); # ifdef RABALDR_CHUNKY_STACK popChunkyBytes(StackSizeOfV128); # else masm.adjustStack((int)StackSizeOfV128); # endif MOZ_ASSERT(stackBefore - StackSizeOfV128 == currentStackHeight()); } #endif void popBytes(size_t bytes) { if (bytes > 0) { #ifdef RABALDR_CHUNKY_STACK popChunkyBytes(bytes); #else masm.freeStack(bytes); #endif } } void loadStackI32(int32_t offset, RegI32 dest) { masm.load32(Address(sp_, stackOffset(offset)), dest); } void loadStackI64(int32_t offset, RegI64 dest) { masm.load64(Address(sp_, stackOffset(offset)), dest); } #ifndef JS_PUNBOX64 void loadStackI64Low(int32_t offset, RegI32 dest) { masm.load32(Address(sp_, stackOffset(offset - INT64LOW_OFFSET)), dest); } void loadStackI64High(int32_t offset, RegI32 dest) { masm.load32(Address(sp_, stackOffset(offset - INT64HIGH_OFFSET)), dest); } #endif void loadStackRef(int32_t offset, RegRef dest) { masm.loadPtr(Address(sp_, 
stackOffset(offset)), dest); } void loadStackF64(int32_t offset, RegF64 dest) { masm.loadDouble(Address(sp_, stackOffset(offset)), dest); } void loadStackF32(int32_t offset, RegF32 dest) { masm.loadFloat32(Address(sp_, stackOffset(offset)), dest); } #ifdef ENABLE_WASM_SIMD void loadStackV128(int32_t offset, RegV128 dest) { masm.loadUnalignedSimd128(Address(sp_, stackOffset(offset)), dest); } #endif uint32_t prepareStackResultArea(StackHeight stackBase, uint32_t stackResultBytes) { uint32_t end = computeHeightWithStackResults(stackBase, stackResultBytes); if (currentStackHeight() < end) { uint32_t bytes = end - currentStackHeight(); #ifdef RABALDR_CHUNKY_STACK pushChunkyBytes(bytes); #else masm.reserveStack(bytes); #endif maxFramePushed_ = std::max(maxFramePushed_, masm.framePushed()); } return end; } void finishStackResultArea(StackHeight stackBase, uint32_t stackResultBytes) { uint32_t end = computeHeightWithStackResults(stackBase, stackResultBytes); MOZ_ASSERT(currentStackHeight() >= end); popBytes(currentStackHeight() - end); } // |srcHeight| and |destHeight| are stack heights *including* |bytes|. void shuffleStackResultsTowardFP(uint32_t srcHeight, uint32_t destHeight, uint32_t bytes, Register temp) { MOZ_ASSERT(destHeight < srcHeight); MOZ_ASSERT(bytes % sizeof(uint32_t) == 0); uint32_t destOffset = stackOffset(destHeight) + bytes; uint32_t srcOffset = stackOffset(srcHeight) + bytes; while (bytes >= sizeof(intptr_t)) { destOffset -= sizeof(intptr_t); srcOffset -= sizeof(intptr_t); bytes -= sizeof(intptr_t); masm.loadPtr(Address(sp_, srcOffset), temp); masm.storePtr(temp, Address(sp_, destOffset)); } if (bytes) { MOZ_ASSERT(bytes == sizeof(uint32_t)); destOffset -= sizeof(uint32_t); srcOffset -= sizeof(uint32_t); masm.load32(Address(sp_, srcOffset), temp); masm.store32(temp, Address(sp_, destOffset)); } } // Unlike the overload that operates on raw heights, |srcHeight| and // |destHeight| are stack heights *not including* |bytes|. 
void shuffleStackResultsTowardFP(StackHeight srcHeight, StackHeight destHeight, uint32_t bytes, Register temp) { MOZ_ASSERT(srcHeight.isValid()); MOZ_ASSERT(destHeight.isValid()); uint32_t src = computeHeightWithStackResults(srcHeight, bytes); uint32_t dest = computeHeightWithStackResults(destHeight, bytes); MOZ_ASSERT(src <= currentStackHeight()); MOZ_ASSERT(dest <= currentStackHeight()); shuffleStackResultsTowardFP(src, dest, bytes, temp); } // |srcHeight| and |destHeight| are stack heights *including* |bytes|. void shuffleStackResultsTowardSP(uint32_t srcHeight, uint32_t destHeight, uint32_t bytes, Register temp) { MOZ_ASSERT(destHeight > srcHeight); MOZ_ASSERT(bytes % sizeof(uint32_t) == 0); uint32_t destOffset = stackOffset(destHeight); uint32_t srcOffset = stackOffset(srcHeight); while (bytes >= sizeof(intptr_t)) { masm.loadPtr(Address(sp_, srcOffset), temp); masm.storePtr(temp, Address(sp_, destOffset)); destOffset += sizeof(intptr_t); srcOffset += sizeof(intptr_t); bytes -= sizeof(intptr_t); } if (bytes) { MOZ_ASSERT(bytes == sizeof(uint32_t)); masm.load32(Address(sp_, srcOffset), temp); masm.store32(temp, Address(sp_, destOffset)); } } // Copy results from the top of the current stack frame to an area of memory, // and pop the stack accordingly. `dest` is the address of the low byte of // that memory. 
  // Copy `bytes` bytes from the top of the stack frame to memory at `dest`,
  // then pop those bytes.  `temp` is a scratch register for the transfer.
  void popStackResultsToMemory(Register dest, uint32_t bytes, Register temp) {
    MOZ_ASSERT(bytes <= currentStackHeight());
    MOZ_ASSERT(bytes % sizeof(uint32_t) == 0);
    uint32_t bytesToPop = bytes;
    uint32_t srcOffset = stackOffset(currentStackHeight());
    uint32_t destOffset = 0;
    // Pointer-sized words first, then at most one 32-bit tail word.
    while (bytes >= sizeof(intptr_t)) {
      masm.loadPtr(Address(sp_, srcOffset), temp);
      masm.storePtr(temp, Address(dest, destOffset));
      destOffset += sizeof(intptr_t);
      srcOffset += sizeof(intptr_t);
      bytes -= sizeof(intptr_t);
    }
    if (bytes) {
      MOZ_ASSERT(bytes == sizeof(uint32_t));
      masm.load32(Address(sp_, srcOffset), temp);
      masm.store32(temp, Address(dest, destOffset));
    }
    popBytes(bytesToPop);
  }

  // Allocate an outgoing-call argument area of `argSize` bytes, updating the
  // frame's high-water mark.  A zero-sized request is a no-op.
  void allocArgArea(size_t argSize) {
    if (argSize) {
      BaseStackFrameAllocator::allocArgArea(argSize);
      maxFramePushed_ = std::max(maxFramePushed_, masm.framePushed());
    }
  }

 private:
  // Store the 32-bit immediate `imm` at stack height `destHeight`, using
  // `temp` as a scratch register.
  void store32BitsToStack(int32_t imm, uint32_t destHeight, Register temp) {
    masm.move32(Imm32(imm), temp);
    masm.store32(temp, Address(sp_, stackOffset(destHeight)));
  }

  // Store the 64-bit immediate `imm` at stack height `destHeight`.  On
  // 32-bit targets this is done as two 32-bit stores.
  void store64BitsToStack(int64_t imm, uint32_t destHeight, Register temp) {
#ifdef JS_PUNBOX64
    masm.move64(Imm64(imm), Register64(temp));
    masm.store64(Register64(temp), Address(sp_, stackOffset(destHeight)));
#else
    // Type-pun via a union to split the 64-bit value into two halves.
    // NOTE(review): i32[0]/i32[1] placement assumes the target's endianness
    // matches the in-memory layout expected here — confirm against the
    // 32-bit ports this path serves.
    union {
      int64_t i64;
      int32_t i32[2];
    } bits = {.i64 = imm};
    static_assert(sizeof(bits) == 8);
    store32BitsToStack(bits.i32[0], destHeight, temp);
    store32BitsToStack(bits.i32[1], destHeight - sizeof(int32_t), temp);
#endif
  }

 public:
  // Store a pointer-sized immediate at stack height `destHeight`, using the
  // width the value stack uses for pointers on this target.
  void storeImmediatePtrToStack(intptr_t imm, uint32_t destHeight,
                                Register temp) {
#ifdef JS_PUNBOX64
    static_assert(StackSizeOfPtr == 8);
    store64BitsToStack(imm, destHeight, temp);
#else
    static_assert(StackSizeOfPtr == 4);
    store32BitsToStack(int32_t(imm), destHeight, temp);
#endif
  }

  // Store a 64-bit integer immediate at stack height `destHeight`.
  void storeImmediateI64ToStack(int64_t imm, uint32_t destHeight,
                                Register temp) {
    store64BitsToStack(imm, destHeight, temp);
  }

  // Store a float immediate at stack height `destHeight`, widening the store
  // when the value stack reserves 8 bytes per float.
  void storeImmediateF32ToStack(float imm, uint32_t destHeight, Register temp) {
    // Reinterpret the float's bits as an integer for the store.
    union {
      int32_t i32;
      float f32;
    } bits = {.f32 = imm};
    static_assert(sizeof(bits) == 4);
    // Do not store 4 bytes if StackSizeOfFloat == 8.  It's probably OK to do
    // so, but it costs little to store something predictable.
    if (StackSizeOfFloat == 4) {
      store32BitsToStack(bits.i32, destHeight, temp);
    } else {
      // Zero-extend so the upper half of the 8-byte slot is deterministic.
      store64BitsToStack(uint32_t(bits.i32), destHeight, temp);
    }
  }

  // Store a double immediate at stack height `destHeight`.
  void storeImmediateF64ToStack(double imm, uint32_t destHeight,
                                Register temp) {
    // Reinterpret the double's bits as an integer for the store.
    union {
      int64_t i64;
      double f64;
    } bits = {.f64 = imm};
    static_assert(sizeof(bits) == 8);
    store64BitsToStack(bits.i64, destHeight, temp);
  }

#ifdef ENABLE_WASM_SIMD
  // Store a v128 immediate at stack height `destHeight`, emitted as four
  // 32-bit stores.
  void storeImmediateV128ToStack(V128 imm, uint32_t destHeight,
                                 Register temp) {
    union {
      int32_t i32[4];
      uint8_t bytes[16];
    } bits{};
    static_assert(sizeof(bits) == 16);
    memcpy(bits.bytes, imm.bytes, 16);
    for (unsigned i = 0; i < 4; i++) {
      store32BitsToStack(bits.i32[i], destHeight - i * sizeof(int32_t), temp);
    }
  }
#endif
};

//////////////////////////////////////////////////////////////////////////////
//
// MachineStackTracker, used for stack-slot pointerness tracking.

// An expensive operation in stack-map creation is copying of the
// MachineStackTracker (MST) into the final StackMap.  This is done in
// StackMapGenerator::createStackMap.  Given that this is basically a
// bit-array copy, it is reasonable to ask whether the two classes could have
// a more similar representation, so that the copy could then be done with
// `memcpy`.
//
// Although in principle feasible, the following complications exist, and so
// for the moment, this has not been done.
//
// * StackMap is optimised for compact size (storage) since there will be
//   many, so it uses a true bitmap.  MST is intended to be fast and simple,
//   and only one exists at once (per compilation thread).  Doing this would
//   require MST to use a true bitmap, and hence ..
//
// * .. the copying can't be a straight memcpy, since StackMap has entries
//   for words not covered by MST.
Hence the copy would need to shift bits in // each byte left or right (statistically speaking, in 7 cases out of 8) in // order to ensure no "holes" in the resulting bitmap. // // * Furthermore the copying would need to logically invert the direction of // the stacks. For MST, index zero in the vector corresponds to the highest // address in the stack. For StackMap, bit index zero corresponds to the // lowest address in the stack. // // * Finally, StackMap is a variable-length structure whose size must be known // at creation time. The size of an MST by contrast isn't known at creation // time -- it grows as the baseline compiler pushes stuff on its value // stack. That's why it has to have vector entry 0 being the highest address. // // * Although not directly relevant, StackMaps are also created by the via-Ion // compilation routes, by translation from the pre-existing "JS-era" // LSafePoints (CreateStackMapFromLSafepoint). So if we want to mash // StackMap around to suit baseline better, we also need to ensure it // doesn't break Ion somehow. class MachineStackTracker { // Simulates the machine's stack, with one bool per word. The booleans are // represented as `uint8_t`s so as to guarantee the element size is one // byte. Index zero in this vector corresponds to the highest address in // the machine's stack. The last entry corresponds to what SP currently // points at. This all assumes a grow-down stack. // // numPtrs_ contains the number of "true" values in vec_, and is therefore // redundant. But it serves as a constant-time way to detect the common // case where vec_ holds no "true" values. size_t numPtrs_; Vector vec_; public: MachineStackTracker() : numPtrs_(0) {} ~MachineStackTracker() { #ifdef DEBUG size_t n = 0; for (uint8_t b : vec_) { n += (b ? 1 : 0); } MOZ_ASSERT(n == numPtrs_); #endif } // Clone this MachineStackTracker, writing the result at |dst|. 
[[nodiscard]] bool cloneTo(MachineStackTracker* dst); // Notionally push |n| non-pointers on the stack. [[nodiscard]] bool pushNonGCPointers(size_t n) { return vec_.appendN(uint8_t(false), n); } // Mark the stack slot |offsetFromSP| up from the bottom as holding a // pointer. void setGCPointer(size_t offsetFromSP) { // offsetFromSP == 0 denotes the most recently pushed item, == 1 the // second most recently pushed item, etc. MOZ_ASSERT(offsetFromSP < vec_.length()); size_t offsetFromTop = vec_.length() - 1 - offsetFromSP; numPtrs_ = numPtrs_ + 1 - (vec_[offsetFromTop] ? 1 : 0); vec_[offsetFromTop] = uint8_t(true); } // Query the pointerness of the slot |offsetFromSP| up from the bottom. bool isGCPointer(size_t offsetFromSP) const { MOZ_ASSERT(offsetFromSP < vec_.length()); size_t offsetFromTop = vec_.length() - 1 - offsetFromSP; return bool(vec_[offsetFromTop]); } // Return the number of words tracked by this MachineStackTracker. size_t length() const { return vec_.length(); } // Return the number of pointer-typed words tracked by this // MachineStackTracker. size_t numPtrs() const { MOZ_ASSERT(numPtrs_ <= length()); return numPtrs_; } // Discard all contents, but (per mozilla::Vector::clear semantics) don't // free or reallocate any dynamic storage associated with |vec_|. void clear() { vec_.clear(); numPtrs_ = 0; } // An iterator that produces indices of reftyped slots, starting at the // logical bottom of the (grow-down) stack. Indices have the same meaning // as the arguments to `isGCPointer`. That is, if this iterator produces a // value `i`, then it means that `isGCPointer(i) == true`; if the value `i` // is never produced then `isGCPointer(i) == false`. The values are // produced in ascending order. // // Because most slots are non-reftyped, some effort has been put into // skipping over large groups of non-reftyped slots quickly. 
class Iter { // Both `bufU8_` and `bufU32_` are made to point to `vec_`s array of // `uint8_t`s, so we can scan (backwards) through it either in bytes or // 32-bit words. Recall that the last element in `vec_` pertains to the // lowest-addressed word in the machine's grow-down stack, and we want to // iterate logically "up" this stack, so we need to iterate backwards // through `vec_`. // // This dual-pointer scheme assumes that the `vec_`s content array is at // least 32-bit aligned. const uint8_t* bufU8_; const uint32_t* bufU32_; // The number of elements in `bufU8_`. const size_t nElems_; // The index in `bufU8_` where the next search should start. size_t next_; public: explicit Iter(const MachineStackTracker& mst) : bufU8_((uint8_t*)mst.vec_.begin()), bufU32_((uint32_t*)mst.vec_.begin()), nElems_(mst.vec_.length()), next_(mst.vec_.length() - 1) { MOZ_ASSERT(uintptr_t(bufU8_) == uintptr_t(bufU32_)); // Check minimum alignment constraint on the array. MOZ_ASSERT(0 == (uintptr_t(bufU8_) & 3)); } ~Iter() { MOZ_ASSERT(uintptr_t(bufU8_) == uintptr_t(bufU32_)); } // It is important, for termination of the search loop in `next()`, that // this has the value obtained by subtracting 1 from size_t(0). static constexpr size_t FINISHED = ~size_t(0); static_assert(FINISHED == size_t(0) - 1); // Returns the next index `i` for which `isGCPointer(i) == true`. size_t get() { while (next_ != FINISHED) { if (bufU8_[next_]) { next_--; return nElems_ - 1 - (next_ + 1); } // Invariant: next_ != FINISHED (so it's still a valid index) // and: bufU8_[next_] == 0 // (so we need to move backwards by at least 1) // // BEGIN optimization -- this could be removed without affecting // correctness. if ((next_ & 7) == 0) { // We're at the "bottom" of the current dual-4-element word. Check // if we can jump backwards by 8. This saves a conditional branch // and a few cycles by ORing two adjacent 32-bit words together, // whilst not requiring 64-bit alignment of `bufU32_`. 
while (next_ >= 8 && (bufU32_[(next_ - 4) >> 2] | bufU32_[(next_ - 8) >> 2]) == 0) { next_ -= 8; } } // END optimization next_--; } return FINISHED; } }; }; ////////////////////////////////////////////////////////////////////////////// // // StackMapGenerator, which carries all state needed to create stackmaps. enum class HasDebugFrameWithLiveRefs { No, Maybe }; struct StackMapGenerator { private: // --- These are constant for the life of the function's compilation --- // For generating stackmaps, we'll need to know the offsets of registers // as saved by the trap exit stub. const RegisterOffsets& trapExitLayout_; const size_t trapExitLayoutNumWords_; // Completed stackmaps are added here StackMaps* stackMaps_; // So as to be able to get current offset when creating stackmaps const MacroAssembler& masm_; public: // --- These are constant once we've completed beginFunction() --- // The number of words of arguments passed to this function in memory. size_t numStackArgWords; MachineStackTracker machineStackTracker; // tracks machine stack pointerness // This holds masm.framePushed at entry to the function's body. It is a // Maybe because createStackMap needs to know whether or not we're still // in the prologue. It makes a Nothing-to-Some transition just once per // function. Maybe framePushedAtEntryToBody; // --- These can change at any point --- // This holds masm.framePushed at it would be be for a function call // instruction, but excluding the stack area used to pass arguments in // memory. That is, for an upcoming function call, this will hold // // masm.framePushed() at the call instruction - // StackArgAreaSizeUnaligned(argumentTypes) // // This value denotes the lowest-addressed stack word covered by the current // function's stackmap. Words below this point form the highest-addressed // area of the callee's stackmap. 
Note that all alignment padding above the // arguments-in-memory themselves belongs to the caller's stackmap, which // is why this is defined in terms of StackArgAreaSizeUnaligned() rather than // StackArgAreaSizeAligned(). // // When not inside a function call setup/teardown sequence, it is Nothing. // It can make Nothing-to/from-Some transitions arbitrarily as we progress // through the function body. Maybe framePushedExcludingOutboundCallArgs; // The number of memory-resident, ref-typed entries on the containing // BaseCompiler::stk_. size_t memRefsOnStk; // This is a copy of machineStackTracker that is used only within individual // calls to createStackMap. It is here only to avoid possible heap allocation // costs resulting from making it local to createStackMap(). MachineStackTracker augmentedMst; StackMapGenerator(StackMaps* stackMaps, const RegisterOffsets& trapExitLayout, const size_t trapExitLayoutNumWords, const MacroAssembler& masm) : trapExitLayout_(trapExitLayout), trapExitLayoutNumWords_(trapExitLayoutNumWords), stackMaps_(stackMaps), masm_(masm), numStackArgWords(0), memRefsOnStk(0) {} // At the beginning of a function, we may have live roots in registers (as // arguments) at the point where we perform a stack overflow check. This // method generates the "extra" stackmap entries to describe that, in the // case that the check fails and we wind up calling into the wasm exit // stub, as generated by GenerateTrapExit(). // // The resulting map must correspond precisely with the stack layout // created for the integer registers as saved by (code generated by) // GenerateTrapExit(). To do that we use trapExitLayout_ and // trapExitLayoutNumWords_, which together comprise a description of the // layout and are created by GenerateTrapExitRegisterOffsets(). 
[[nodiscard]] bool generateStackmapEntriesForTrapExit( const ArgTypeVector& args, ExitStubMapVector* extras); // Creates a stackmap associated with the instruction denoted by // |assemblerOffset|, incorporating pointers from the current operand // stack |stk|, incorporating possible extra pointers in |extra| at the // lower addressed end, and possibly with the associated frame having a // DebugFrame that must be traced, as indicated by |debugFrameWithLiveRefs|. [[nodiscard]] bool createStackMap( const char* who, const ExitStubMapVector& extras, uint32_t assemblerOffset, HasDebugFrameWithLiveRefs debugFrameWithLiveRefs, const StkVector& stk); }; } // namespace wasm } // namespace js #endif // wasm_wasm_baseline_frame_h