1 files changed, 2861 insertions, 0 deletions
diff --git a/gfx/skia/skia/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp b/gfx/skia/skia/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp
new file mode 100644
index 0000000000..48d9f26d74
--- /dev/null
+++ b/gfx/skia/skia/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp
@@ -0,0 +1,2861 @@
+/*
+ * Copyright 2022 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "include/core/SkStream.h"
+#include "include/private/SkSLString.h"
+#include "include/private/base/SkMalloc.h"
+#include "include/private/base/SkTo.h"
+#include "include/sksl/SkSLPosition.h"
+#include "src/base/SkArenaAlloc.h"
+#include "src/core/SkOpts.h"
+#include "src/core/SkRasterPipelineOpContexts.h"
+#include "src/core/SkRasterPipelineOpList.h"
+#include "src/sksl/codegen/SkSLRasterPipelineBuilder.h"
+#include "src/sksl/tracing/SkRPDebugTrace.h"
+#include "src/sksl/tracing/SkSLDebugInfo.h"
+#include "src/utils/SkBitSet.h"
+
+#if !defined(SKSL_STANDALONE)
+#include "src/core/SkRasterPipeline.h"
+#endif
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <iterator>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+namespace SkSL {
+namespace RP {
+// Each macro below expands to a list of BuilderOp case labels; write it as `case MACRO_NAME:`.
+#define ALL_SINGLE_SLOT_UNARY_OP_CASES  \
+         BuilderOp::acos_float:         \
+    case BuilderOp::asin_float:         \
+    case BuilderOp::atan_float:         \
+    case BuilderOp::cos_float:          \
+    case BuilderOp::exp_float:          \
+    case BuilderOp::exp2_float:         \
+    case BuilderOp::log_float:          \
+    case BuilderOp::log2_float:         \
+    case BuilderOp::sin_float:          \
+    case BuilderOp::sqrt_float:         \
+    case BuilderOp::tan_float
+
+#define ALL_MULTI_SLOT_UNARY_OP_CASES        \
+         BuilderOp::abs_float:               \
+    case BuilderOp::abs_int:                 \
+    case BuilderOp::bitwise_not_int:         \
+    case BuilderOp::cast_to_float_from_int:  \
+    case BuilderOp::cast_to_float_from_uint: \
+    case BuilderOp::cast_to_int_from_float:  \
+    case BuilderOp::cast_to_uint_from_float: \
+    case BuilderOp::ceil_float:              \
+    case BuilderOp::floor_float:             \
+    case BuilderOp::invsqrt_float
+
+#define ALL_N_WAY_BINARY_OP_CASES   \
+         BuilderOp::atan2_n_floats: \
+    case BuilderOp::pow_n_floats
+
+#define ALL_MULTI_SLOT_BINARY_OP_CASES  \
+         BuilderOp::add_n_floats:       \
+    case BuilderOp::add_n_ints:         \
+    case BuilderOp::sub_n_floats:       \
+    case BuilderOp::sub_n_ints:         \
+    case BuilderOp::mul_n_floats:       \
+    case BuilderOp::mul_n_ints:         \
+    case BuilderOp::div_n_floats:       \
+    case BuilderOp::div_n_ints:         \
+    case BuilderOp::div_n_uints:        \
+    case BuilderOp::bitwise_and_n_ints: \
+    case BuilderOp::bitwise_or_n_ints:  \
+    case BuilderOp::bitwise_xor_n_ints: \
+    case BuilderOp::mod_n_floats:       \
+    case BuilderOp::min_n_floats:       \
+    case BuilderOp::min_n_ints:         \
+    case BuilderOp::min_n_uints:        \
+    case BuilderOp::max_n_floats:       \
+    case BuilderOp::max_n_ints:         \
+    case BuilderOp::max_n_uints:        \
+    case BuilderOp::cmple_n_floats:     \
+    case BuilderOp::cmple_n_ints:       \
+    case BuilderOp::cmple_n_uints:      \
+    case BuilderOp::cmplt_n_floats:     \
+    case BuilderOp::cmplt_n_ints:       \
+    case BuilderOp::cmplt_n_uints:      \
+    case BuilderOp::cmpeq_n_floats:     \
+    case BuilderOp::cmpeq_n_ints:       \
+    case BuilderOp::cmpne_n_floats:     \
+    case BuilderOp::cmpne_n_ints
+
+#define ALL_N_WAY_TERNARY_OP_CASES       \
+         BuilderOp::smoothstep_n_floats
+
+#define ALL_MULTI_SLOT_TERNARY_OP_CASES \
+         BuilderOp::mix_n_floats:       \
+    case BuilderOp::mix_n_ints
+
+void Builder::unary_op(BuilderOp op, int32_t slots) {
+    // Appends `op` with `slots` as its immediate; any op that is not unary is a debug failure.
+    switch (op) {
+        default:
+            SkDEBUGFAIL("not a unary op");
+            return;
+        case ALL_SINGLE_SLOT_UNARY_OP_CASES:
+        case ALL_MULTI_SLOT_UNARY_OP_CASES:
+            break;
+    }
+    fInstructions.push_back({op, {}, slots});
+}
+
+void Builder::binary_op(BuilderOp op, int32_t slots) {
+    // Appends `op` with `slots` as its immediate; any op that is not binary is a debug failure.
+    switch (op) {
+        default:
+            SkDEBUGFAIL("not a binary op");
+            return;
+        case ALL_N_WAY_BINARY_OP_CASES:
+        case ALL_MULTI_SLOT_BINARY_OP_CASES:
+            break;
+    }
+    fInstructions.push_back({op, {}, slots});
+}
+
+void Builder::ternary_op(BuilderOp op, int32_t slots) {
+    // Appends `op` with `slots` as its immediate; any op that is not ternary is a debug failure.
+    switch (op) {
+        default:
+            SkDEBUGFAIL("not a ternary op");
+            return;
+        case ALL_N_WAY_TERNARY_OP_CASES:
+        case ALL_MULTI_SLOT_TERNARY_OP_CASES:
+            break;
+    }
+    fInstructions.push_back({op, {}, slots});
+}
+
+void Builder::dot_floats(int32_t slots) {
+    // One slot uses a plain multiply op; two to four slots each have a dedicated dot op.
+    BuilderOp op;
+    switch (slots) {
+        case 1:  op = BuilderOp::mul_n_floats; break;
+        case 2:  op = BuilderOp::dot_2_floats; break;
+        case 3:  op = BuilderOp::dot_3_floats; break;
+        case 4:  op = BuilderOp::dot_4_floats; break;
+        default: SkDEBUGFAIL("invalid number of slots"); return;
+    }
+    fInstructions.push_back({op, {}, slots});
+}
+
+void Builder::refract_floats() {  // Appends a single refract_4_floats op with no operands.
+    fInstructions.push_back({BuilderOp::refract_4_floats, {}});
+}
+
+void Builder::inverse_matrix(int32_t n) {  // The immediate passed along is n * n: 4, 9 or 16.
+    switch (n) {
+        case 2:  fInstructions.push_back({BuilderOp::inverse_mat2, {}, 2 * 2}); return;
+        case 3:  fInstructions.push_back({BuilderOp::inverse_mat3, {}, 3 * 3}); return;
+        case 4:  fInstructions.push_back({BuilderOp::inverse_mat4, {}, 4 * 4}); return;
+        default: SkUNREACHABLE;
+    }
+}
+
+void Builder::discard_stack(int32_t count) {
+    // Peephole optimization: when a discard directly follows a push, shrink or delete that push
+    // rather than emitting a separate instruction. Each pass through the loop inspects whatever
+    // instruction is currently last and cancels one discarded slot against it when possible.
+    bool lastIsPush = true;
+    while (lastIsPush && count > 0 && !fInstructions.empty()) {
+        Instruction& tail = fInstructions.back();
+        switch (tail.fOp) {
+            case BuilderOp::discard_stack:
+                // The tail is itself a discard; fold this discard into it and finish.
+                tail.fImmA += count;
+                return;
+
+            case BuilderOp::push_zeros:
+            case BuilderOp::push_clone:
+            case BuilderOp::push_clone_from_stack:
+            case BuilderOp::push_clone_indirect_from_stack:
+            case BuilderOp::push_slots:
+            case BuilderOp::push_slots_indirect:
+            case BuilderOp::push_uniform:
+            case BuilderOp::push_uniform_indirect:
+                // These pushes keep their slot count in immA. Trim one slot from the push, and
+                // drop the instruction altogether once it no longer pushes anything.
+                --count;
+                if (--tail.fImmA == 0) {
+                    fInstructions.pop_back();
+                }
+                break;
+
+            case BuilderOp::push_literal:
+            case BuilderOp::push_condition_mask:
+            case BuilderOp::push_loop_mask:
+            case BuilderOp::push_return_mask:
+                // These are single-slot pushes, so one discard cancels the whole instruction.
+                --count;
+                fInstructions.pop_back();
+                break;
+
+            default:
+                // Not a push; there is nothing more to cancel against.
+                lastIsPush = false;
+                break;
+        }
+    }
+
+    // Whatever could not be cancelled still needs an explicit discard instruction.
+    if (count > 0) {
+        fInstructions.push_back({BuilderOp::discard_stack, {}, count});
+    }
+}
+
+void Builder::label(int labelID) {
+    SkASSERT(labelID >= 0 && labelID < fNumLabels);
+
+    // Reports whether `inst` is a jump or conditional branch whose target (immA) is this label.
+    auto targetsThisLabel = [labelID](const Instruction& inst) {
+        switch (inst.fOp) {
+            case BuilderOp::jump:
+            case BuilderOp::branch_if_all_lanes_active:
+            case BuilderOp::branch_if_any_lanes_active:
+            case BuilderOp::branch_if_no_lanes_active:
+            case BuilderOp::branch_if_no_active_lanes_on_stack_top_equal:
+                return inst.fImmA == labelID;
+
+            default:
+                return false;
+        }
+    };
+
+    // A branch to the very next instruction accomplishes nothing, so strip every trailing branch
+    // that lands on this label before the label itself is emitted.
+    while (!fInstructions.empty() && targetsThisLabel(fInstructions.back())) {
+        fInstructions.pop_back();
+    }
+
+    fInstructions.push_back({BuilderOp::label, {}, labelID});
+}
+
+void Builder::jump(int labelID) {
+    SkASSERT(labelID >= 0 && labelID < fNumLabels);
+    // Anything emitted directly after a `jump` is unreachable, so a back-to-back jump is dropped.
+    const bool afterJump = !fInstructions.empty() && fInstructions.back().fOp == BuilderOp::jump;
+    if (!afterJump) {
+        fInstructions.push_back({BuilderOp::jump, {}, labelID});
+    }
+}
+
+void Builder::branch_if_any_lanes_active(int labelID) {
+    // When execution-mask writes are disabled, an unconditional jump is emitted instead.
+    if (!this->executionMaskWritesAreEnabled()) {
+        this->jump(labelID);
+        return;
+    }
+    SkASSERT(labelID >= 0 && labelID < fNumLabels);
+    // Right after a `jump` or another `branch_if_any_lanes_active`, this branch can never occur.
+    if (!fInstructions.empty()) {
+        const BuilderOp prevOp = fInstructions.back().fOp;
+        if (prevOp == BuilderOp::jump || prevOp == BuilderOp::branch_if_any_lanes_active) {
+            return;
+        }
+    }
+    fInstructions.push_back({BuilderOp::branch_if_any_lanes_active, {}, labelID});
+}
+
+void Builder::branch_if_all_lanes_active(int labelID) {
+    // When execution-mask writes are disabled, an unconditional jump is emitted instead.
+    if (!this->executionMaskWritesAreEnabled()) {
+        this->jump(labelID);
+        return;
+    }
+    SkASSERT(labelID >= 0 && labelID < fNumLabels);
+    // Right after a `jump` or another `branch_if_all_lanes_active`, this branch can never occur.
+    if (!fInstructions.empty()) {
+        const BuilderOp prevOp = fInstructions.back().fOp;
+        if (prevOp == BuilderOp::jump || prevOp == BuilderOp::branch_if_all_lanes_active) {
+            return;
+        }
+    }
+    fInstructions.push_back({BuilderOp::branch_if_all_lanes_active, {}, labelID});
+}
+
+void Builder::branch_if_no_lanes_active(int labelID) {
+    // When execution-mask writes are disabled, nothing is emitted at all.
+    if (!this->executionMaskWritesAreEnabled()) {
+        return;
+    }
+    SkASSERT(labelID >= 0 && labelID < fNumLabels);
+    // Right after a `jump` or another `branch_if_no_lanes_active`, this branch can never occur.
+    if (!fInstructions.empty()) {
+        const BuilderOp prevOp = fInstructions.back().fOp;
+        if (prevOp == BuilderOp::jump || prevOp == BuilderOp::branch_if_no_lanes_active) {
+            return;
+        }
+    }
+    fInstructions.push_back({BuilderOp::branch_if_no_lanes_active, {}, labelID});
+}
+
+void Builder::branch_if_no_active_lanes_on_stack_top_equal(int value, int labelID) {
+    SkASSERT(labelID >= 0 && labelID < fNumLabels);
+    constexpr BuilderOp kThisOp = BuilderOp::branch_if_no_active_lanes_on_stack_top_equal;
+    // Right after a `jump`, or after this same branch op checking against the same value, this
+    // branch could never possibly occur, so nothing is emitted.
+    if (!fInstructions.empty()) {
+        const Instruction& prev = fInstructions.back();
+        if (prev.fOp == BuilderOp::jump || (prev.fOp == kThisOp && prev.fImmB == value)) {
+            return;
+        }
+    }
+    fInstructions.push_back({kThisOp, {}, labelID, value});
+}
+
+void Builder::push_slots(SlotRange src) {
+    // Pushes the slots in `src` onto the stack, merging with the preceding ops where possible.
+    SkASSERT(src.count >= 0);
+    if (!fInstructions.empty()) {
+        Instruction& lastInstruction = fInstructions.back();
+        // If the previous instruction was pushing slots contiguous to this range, we can collapse
+        // the two pushes into one larger push.
+        if (lastInstruction.fOp == BuilderOp::push_slots &&
+            lastInstruction.fSlotA + lastInstruction.fImmA == src.index) {
+            lastInstruction.fImmA += src.count;
+            return;
+        }
+
+        // If the previous instruction was discarding an equal number of slots (and an instruction
+        // exists before it; otherwise `fromBack(1)` would reach past the start of the list)...
+        if (lastInstruction.fOp == BuilderOp::discard_stack && lastInstruction.fImmA == src.count &&
+            fInstructions.size() >= 2) {
+            // ... and the instruction before that copied the stack to the same slots, then we are
+            // emitting `copy stack to X, discard stack, copy X to stack`. Drop the discard instead.
+            const Instruction& prevInstruction = fInstructions.fromBack(1);
+            if ((prevInstruction.fOp == BuilderOp::copy_stack_to_slots ||
+                 prevInstruction.fOp == BuilderOp::copy_stack_to_slots_unmasked) &&
+                prevInstruction.fSlotA == src.index &&
+                prevInstruction.fImmA == src.count) {
+                fInstructions.pop_back();
+                return;
+            }
+        }
+    }
+
+    if (src.count > 0) {
+        fInstructions.push_back({BuilderOp::push_slots, {src.index}, src.count});
+    }
+}
+
+void Builder::push_slots_indirect(SlotRange fixedRange, int dynamicStackID, SlotRange limitRange) {
+    // Operand layout of the emitted instruction:
+    //   SlotA  start of the fixed range
+    //   SlotB  end of the limit range (its index plus its count)
+    //   immA   number of slots
+    //   immB   dynamic stack ID
+    const Slot limitEnd = limitRange.index + limitRange.count;
+    fInstructions.push_back({BuilderOp::push_slots_indirect, {fixedRange.index, limitEnd},
+                             fixedRange.count, dynamicStackID});
+}
+
+void Builder::push_uniform(SlotRange src) {
+    SkASSERT(src.count >= 0);
+    // Peephole: when the previous op pushed the uniforms that end exactly where `src` begins,
+    // widen that push to cover `src` as well instead of emitting a second instruction.
+    if (!fInstructions.empty()) {
+        Instruction& prev = fInstructions.back();
+        const bool isContiguousPush = prev.fOp == BuilderOp::push_uniform &&
+                                      prev.fSlotA + prev.fImmA == src.index;
+        if (isContiguousPush) {
+            prev.fImmA += src.count;
+            return;
+        }
+    }
+
+    if (src.count > 0) {
+        fInstructions.push_back({BuilderOp::push_uniform, {src.index}, src.count});
+    }
+}
+
+void Builder::push_uniform_indirect(SlotRange fixedRange,
+                                    int dynamicStackID,
+                                    SlotRange limitRange) {
+    // Operand layout of the emitted instruction:
+    //   SlotA  start of the fixed range
+    //   SlotB  end of the limit range (its index plus its count)
+    //   immA   number of slots
+    //   immB   dynamic stack ID
+    const Slot limitEnd = limitRange.index + limitRange.count;
+    fInstructions.push_back({BuilderOp::push_uniform_indirect, {fixedRange.index, limitEnd},
+                             fixedRange.count, dynamicStackID});
+}
+
+void Builder::push_duplicates(int count) {
+    // Pushes `count` additional copies of the value on top of the stack.
+    // The precondition is checked up front, before any fast path below consumes `count`.
+    SkASSERT(count >= 0);
+
+    // If the previous op is pushing a zero, we can just push more of them.
+    if (!fInstructions.empty() && fInstructions.back().fOp == BuilderOp::push_zeros) {
+        fInstructions.back().fImmA += count;
+        return;
+    }
+    if (count >= 3) {
+        // Use a swizzle to splat the input into a 4-slot value (one consumed, three added).
+        this->swizzle(/*consumedSlots=*/1, {0, 0, 0, 0});
+        count -= 3;
+    }
+    // Clone the splatted value four slots at a time.
+    while (count >= 4) {
+        this->push_clone(/*numSlots=*/4);
+        count -= 4;
+    }
+    // Fewer than four copies remain; finish with a swizzle or a single clone.
+    switch (count) {
+        case 3:  this->swizzle(/*consumedSlots=*/1, {0, 0, 0, 0}); break;
+        case 2:  this->swizzle(/*consumedSlots=*/1, {0, 0, 0});    break;
+        case 1:  this->push_clone(/*numSlots=*/1);                 break;
+        default: break;
+    }
+}
+
+void Builder::push_clone_from_stack(SlotRange range, int otherStackID, int offsetFromStackTop) {
+    // Operand layout of the emitted instruction:
+    //   immA  number of slots
+    //   immB  other stack ID
+    //   immC  offset from stack top, less the index of `range`
+    const int cloneOffset = offsetFromStackTop - range.index;
+
+    if (!fInstructions.empty()) {
+        Instruction& prev = fInstructions.back();
+        // Peephole: if the previous op is also pushing a clone, and it clones from the same
+        // stack...
+        const bool sameSource = prev.fOp == BuilderOp::push_clone_from_stack &&
+                                prev.fImmB == otherStackID;
+        // ... and this clone starts at the same place where that clone ends, extend it instead.
+        if (sameSource && prev.fImmC - prev.fImmA == cloneOffset) {
+            prev.fImmA += range.count;
+            return;
+        }
+    }
+
+    // No merge was possible; emit a new clone op.
+    fInstructions.push_back({BuilderOp::push_clone_from_stack, {},
+                             range.count, otherStackID, cloneOffset});
+}
+
+void Builder::push_clone_indirect_from_stack(SlotRange fixedOffset,
+                                             int dynamicStackID,
+                                             int otherStackID,
+                                             int offsetFromStackTop) {
+    // Operand layout of the emitted instruction:
+    //   immA  number of slots
+    //   immB  other stack ID
+    //   immC  offset from stack top, less the index of `fixedOffset`
+    //   immD  dynamic stack ID
+    const int cloneOffset = offsetFromStackTop - fixedOffset.index;
+    fInstructions.push_back({BuilderOp::push_clone_indirect_from_stack, {},
+                             fixedOffset.count, otherStackID, cloneOffset, dynamicStackID});
+}
+
+void Builder::pop_slots(SlotRange dst) {
+    // Copy the stack top into `dst` and discard it; use the unmasked pop if mask writes are off.
+    if (this->executionMaskWritesAreEnabled()) {
+        this->copy_stack_to_slots(dst);
+        this->discard_stack(dst.count);
+    } else {
+        this->pop_slots_unmasked(dst);
+    }
+}
+
+void Builder::simplifyPopSlotsUnmasked(SlotRange* dst) {
+    // Peephole for an unmasked pop that directly follows a push: rather than pushing a value and
+    // popping it again, write the value straight into its destination slot. Each call handles one
+    // slot, taken from the end of `dst`, and recurses while the trailing instruction is still a
+    // push that can be simplified. `dst` is shrunk to the slots that were not handled.
+    if (dst->count == 0 || fInstructions.empty()) {
+        // Nothing remains to be simplified.
+        return;
+    }
+    Instruction& tail = fInstructions.back();
+    switch (tail.fOp) {
+        case BuilderOp::push_literal: {
+            // A pushed constant can be stored directly. Take the literal out of the instruction
+            // list...
+            const int literal = tail.fImmA;
+            fInstructions.pop_back();
+
+            // ... claim the last destination slot for it...
+            dst->count--;
+            const Slot target = dst->index + dst->count;
+
+            // ... simplify any pushes that preceded it...
+            this->simplifyPopSlotsUnmasked(dst);
+
+            // ... and finally write the constant into its slot.
+            this->copy_constant(target, literal);
+            return;
+        }
+        case BuilderOp::push_zeros: {
+            // A pushed zero can be replaced by zeroing the destination. Take one zero away from
+            // the push, deleting the instruction once none remain...
+            if (--tail.fImmA == 0) {
+                fInstructions.pop_back();
+            }
+
+            // ... claim the last destination slot for it...
+            dst->count--;
+            const Slot target = dst->index + dst->count;
+
+            // ... simplify any pushes that preceded it...
+            this->simplifyPopSlotsUnmasked(dst);
+
+            // ... and finally zero out the slot.
+            this->zero_slots_unmasked({target, 1});
+            return;
+        }
+        case BuilderOp::push_slots: {
+            // A pushed slot can be copied directly. Take the final slot away from the push,
+            // deleting the instruction once none remain...
+            const Slot source = tail.fSlotA + tail.fImmA - 1;
+            if (--tail.fImmA == 0) {
+                fInstructions.pop_back();
+            }
+
+            // ... claim the last destination slot for it...
+            dst->count--;
+            const Slot target = dst->index + dst->count;
+
+            // ... simplify any pushes that preceded it...
+            this->simplifyPopSlotsUnmasked(dst);
+
+            // ... and finally copy slot to slot, unless that would be a self-copy.
+            if (target != source) {
+                this->copy_slots_unmasked({target, 1}, {source, 1});
+            }
+            return;
+        }
+        default:
+            return;
+    }
+}
+
+void Builder::pop_slots_unmasked(SlotRange dst) {
+    SkASSERT(dst.count >= 0);
+
+    // First let the peephole absorb any pushes directly before this pop; it writes those values
+    // straight into their destination slots and shrinks `dst` accordingly.
+    this->simplifyPopSlotsUnmasked(&dst);
+    if (dst.count <= 0) {
+        return;
+    }
+    // Pop whatever remains off the stack normally.
+    this->copy_stack_to_slots_unmasked(dst);
+    this->discard_stack(dst.count);
+}
+
+void Builder::copy_stack_to_slots(SlotRange dst, int offsetFromStackTop) {
+    // With execution-mask writes disabled, the write mask can be ignored, so the unmasked copy
+    // is used instead.
+    if (!this->executionMaskWritesAreEnabled()) {
+        this->copy_stack_to_slots_unmasked(dst, offsetFromStackTop);
+        return;
+    }
+
+    // Peephole: if the previous op is the same kind of copy, and both its destination slots and
+    // its stack source end exactly where this copy begins, grow it instead of emitting a new op.
+    if (!fInstructions.empty()) {
+        Instruction& prev = fInstructions.back();
+        const bool continuesPrev =
+                prev.fOp == BuilderOp::copy_stack_to_slots &&
+                prev.fSlotA + prev.fImmA == dst.index &&
+                prev.fImmB - prev.fImmA == offsetFromStackTop;
+        if (continuesPrev) {
+            prev.fImmA += dst.count;
+            return;
+        }
+    }
+
+    // No merge was possible; emit a fresh copy.
+    fInstructions.push_back({BuilderOp::copy_stack_to_slots, {dst.index},
+                             dst.count, offsetFromStackTop});
+}
+
+void Builder::copy_stack_to_slots_indirect(SlotRange fixedRange,
+                                           int dynamicStackID,
+                                           SlotRange limitRange) {
+    // Operand layout of the emitted instruction:
+    //   SlotA  start of the fixed range
+    //   SlotB  end of the limit range (its index plus its count)
+    //   immA   number of slots
+    //   immB   dynamic stack ID
+    const Slot limitEnd = limitRange.index + limitRange.count;
+    fInstructions.push_back({BuilderOp::copy_stack_to_slots_indirect, {fixedRange.index, limitEnd},
+                             fixedRange.count, dynamicStackID});
+}
+
+static bool slot_ranges_overlap(SlotRange x, SlotRange y) {
+    // Two ranges overlap unless one of them ends at or before the point where the other starts.
+    return !(y.index + y.count <= x.index || x.index + x.count <= y.index);
+}
+
+void Builder::copy_slots_unmasked(SlotRange dst, SlotRange src) {
+    // Emits an unmasked copy of `src` into `dst`. Both ranges must be the same size; this is
+    // checked first so that it also covers the merge path below, which only reads `dst.count`.
+    SkASSERT(dst.count == src.count);
+    if (!fInstructions.empty()) {
+        Instruction& lastInstr = fInstructions.back();
+        // If the last op is copy-slots-unmasked...
+        if (lastInstr.fOp == BuilderOp::copy_slot_unmasked &&
+            // and this op's destination is immediately after the last copy-slots-op's destination
+            lastInstr.fSlotA + lastInstr.fImmA == dst.index &&
+            // and this op's source is immediately after the last copy-slots-op's source
+            lastInstr.fSlotB + lastInstr.fImmA == src.index &&
+            // and the source/dest ranges will not overlap
+            !slot_ranges_overlap({lastInstr.fSlotB, lastInstr.fImmA + dst.count},
+                                 {lastInstr.fSlotA, lastInstr.fImmA + dst.count})) {
+            // then we can just extend the copy!
+            lastInstr.fImmA += dst.count;
+            return;
+        }
+    }
+
+    fInstructions.push_back({BuilderOp::copy_slot_unmasked, {dst.index, src.index}, dst.count});
+}
+
+void Builder::copy_stack_to_slots_unmasked(SlotRange dst, int offsetFromStackTop) {
+    // Peephole: if the previous op is the same kind of copy, and both its destination slots and
+    // its stack source end exactly where this copy begins, grow that op instead of emitting a
+    // new one.
+    if (!fInstructions.empty()) {
+        Instruction& prev = fInstructions.back();
+        const bool continuesPrev =
+                prev.fOp == BuilderOp::copy_stack_to_slots_unmasked &&
+                prev.fSlotA + prev.fImmA == dst.index &&
+                prev.fImmB - prev.fImmA == offsetFromStackTop;
+        if (continuesPrev) {
+            prev.fImmA += dst.count;
+            return;
+        }
+    }
+
+    // No merge was possible; emit a fresh copy.
+    fInstructions.push_back({BuilderOp::copy_stack_to_slots_unmasked, {dst.index},
+                             dst.count, offsetFromStackTop});
+}
+
+void Builder::pop_return_mask() {
+    SkASSERT(this->executionMaskWritesAreEnabled());
+
+    // The pop emitted below overwrites the return mask. If the previous instruction was masking
+    // off the return mask, its result is about to be replaced, so that instruction is wasted
+    // work and is removed.
+    const bool prevMasksOffReturnMask =
+            !fInstructions.empty() &&
+            fInstructions.back().fOp == BuilderOp::mask_off_return_mask;
+    if (prevMasksOffReturnMask) {
+        fInstructions.pop_back();
+    }
+
+    fInstructions.push_back({BuilderOp::pop_return_mask, {}});
+}
+
+void Builder::zero_slots_unmasked(SlotRange dst) {
+    // Peephole: when the previous instruction also zeroes slots and its range borders `dst` on
+    // either side, widen that instruction to take in `dst` rather than emitting a second
+    // instruction.
+    if (!fInstructions.empty()) {
+        Instruction& prev = fInstructions.back();
+
+        if (prev.fOp == BuilderOp::zero_slot_unmasked) {
+            if (prev.fSlotA + prev.fImmA == dst.index) {
+                // `dst` starts where the previous range ends; grow that range at its end.
+                prev.fImmA += dst.count;
+                return;
+            }
+
+            if (dst.index + dst.count == prev.fSlotA) {
+                // `dst` ends where the previous range starts; grow that range at its start.
+                prev.fSlotA = dst.index;
+                prev.fImmA += dst.count;
+                return;
+            }
+        }
+    }
+
+    // There was nothing adjacent to merge with; emit a new zeroing op.
+    fInstructions.push_back({BuilderOp::zero_slot_unmasked, {dst.index}, dst.count});
+}
+
+static int pack_nybbles(SkSpan<const int8_t> components) {
+    // Packs up to 8 values, four bits apiece; components[0] ends up in the lowest nybble.
+    int packed = 0;
+    for (size_t remaining = components.size(); remaining > 0; --remaining) {
+        const int8_t nybble = components[remaining - 1];
+        SkASSERT(nybble >= 0 && nybble <= 0xF);
+        packed = (packed << 4) | nybble;
+    }
+    return packed;
+}
+
+static void unpack_nybbles_to_offsets(uint32_t components, SkSpan<uint16_t> offsets) {
+    // Turns each nybble, lowest first, into a byte offset: nybble * highp stride * sizeof(float).
+    for (uint16_t& offset : offsets) {
+        offset = (components & 0xF) * SkOpts::raster_pipeline_highp_stride * sizeof(float);
+        components >>= 4;
+    }
+}
+
+static int max_packed_nybble(uint32_t components, size_t numComponents) {
+    // Scans the lowest `numComponents` nybbles and returns the largest one (0 if there are none).
+    int largest = 0;
+    for (; numComponents > 0; --numComponents, components >>= 4) {
+        largest = std::max(largest, static_cast<int>(components & 0xF));
+    }
+    return largest;
+}
+
+void Builder::swizzle_copy_stack_to_slots(SlotRange dst,
+                                          SkSpan<const int8_t> components,
+                                          int offsetFromStackTop) {
+    // NOTE: when the execution-mask writes-enabled flag is off, an unmasked version of this op
+    // (if one were implemented) could squeeze out a little extra speed here.
+    // Operand layout of the emitted instruction:
+    //   SlotA  start of the destination range
+    //   immA   number of swizzle components
+    //   immB   swizzle components, packed as nybbles
+    //   immC   offset from stack top
+    const int numComponents = (int)components.size();
+    const int packedComponents = pack_nybbles(components);
+    fInstructions.push_back({BuilderOp::swizzle_copy_stack_to_slots, {dst.index},
+                             numComponents, packedComponents, offsetFromStackTop});
+}
+
+void Builder::swizzle_copy_stack_to_slots_indirect(SlotRange fixedRange,
+                                                   int dynamicStackID,
+                                                   SlotRange limitRange,
+                                                   SkSpan<const int8_t> components,
+                                                   int offsetFromStackTop) {
+    // NOTE: when the execution-mask writes-enabled flag is off, an unmasked version of this op
+    // (if one were implemented) could squeeze out a little extra speed here.
+    // Operand layout of the emitted instruction:
+    //   SlotA  start of the fixed range
+    //   SlotB  end of the limit range (its index plus its count)
+    //   immA   number of swizzle components
+    //   immB   swizzle components, packed as nybbles
+    //   immC   offset from stack top
+    //   immD   dynamic stack ID
+    const Slot limitEnd = limitRange.index + limitRange.count;
+    const int numComponents = (int)components.size();
+    const int packedComponents = pack_nybbles(components);
+    fInstructions.push_back({BuilderOp::swizzle_copy_stack_to_slots_indirect,
+                             {fixedRange.index, limitEnd},
+                             numComponents, packedComponents, offsetFromStackTop, dynamicStackID});
+}
+
+void Builder::swizzle(int consumedSlots, SkSpan<const int8_t> components) {
+    // Consumes `consumedSlots` elements on the stack, then generates `components.size()` elements.
+    // Each component is the index, counted from the bottom-most consumed element, of the element
+    // that it copies.
+    SkASSERT(consumedSlots >= 0);
+
+    // Nybble packing limits a swizzle to 16 components, each of which must lie in the range 0-15.
+    int numElements = components.size();
+    SkASSERT(numElements <= 16);
+    SkASSERT(std::all_of(components.begin(), components.end(),
+                         [](int8_t e) { return e >= 0 && e <= 0xF; }));
+
+    // Work on a zero-padded local copy, since the peephole below edits the component list.
+    int8_t elements[16] = {};
+    std::copy(components.begin(), components.end(), std::begin(elements));
+
+    // Peephole: if the swizzle starts with component 0 and zero isn't used anywhere else in the
+    // swizzle, then the first consumed slot is already where it needs to be, and it can be
+    // omitted from the swizzle entirely.
+    auto firstSlotStaysPut = [&] {
+        return elements[0] == 0 &&
+               std::none_of(elements + 1, elements + numElements,
+                            [](int8_t e) { return e == 0; });
+    };
+    while (numElements > 0 && firstSlotStaysPut()) {
+        // Slide the remaining components forward by one, rebasing each index down by one.
+        for (int index = 1; index < numElements; ++index) {
+            elements[index - 1] = elements[index] - 1;
+        }
+        elements[numElements - 1] = 0;
+        --consumedSlots;
+        --numElements;
+    }
+
+    // A swizzle that generates nothing only needs to discard whatever it would still have
+    // consumed.
+    if (numElements == 0) {
+        this->discard_stack(consumedSlots);
+        return;
+    }
+
+    // Little swizzle: at most four slots in and four out. The op is chosen by offsetting from
+    // `swizzle_1` by the number of generated elements.
+    if (consumedSlots <= 4 && numElements <= 4) {
+        const BuilderOp littleSwizzle = (BuilderOp)((int)BuilderOp::swizzle_1 + numElements - 1);
+        fInstructions.push_back({littleSwizzle, {}, consumedSlots,
+                                 pack_nybbles(SkSpan(elements, numElements))});
+        return;
+    }
+
+    // Big swizzle: handled by the `shuffle` op. immA carries the consumed-slot count in its top
+    // 16 bits and the generated-slot count in its bottom 16 bits; immB and immC carry the first
+    // and last eight components in packed-nybble form.
+    const int slotUsage = (consumedSlots << 16) | numElements;
+    fInstructions.push_back({BuilderOp::shuffle, {}, slotUsage,
+                             pack_nybbles(SkSpan(&elements[0], 8)),
+                             pack_nybbles(SkSpan(&elements[8], 8))});
+}
+
+void Builder::transpose(int columns, int rows) {
+    // Rearranges the CxR matrix on top of the stack into its RxC transpose using one swizzle.
+    // The outer loop walks the source rows and the inner loop walks the source columns, so the
+    // output is written in order while the source is read at (column * rows) + row.
+    int8_t sourceSlots[16] = {};
+    size_t count = 0;
+    for (int srcRow = 0; srcRow < rows; ++srcRow) {
+        for (int srcColumn = 0; srcColumn < columns; ++srcColumn) {
+            const int sourceSlot = (srcColumn * rows) + srcRow;
+            sourceSlots[count] = sourceSlot;
+            ++count;
+        }
+    }
+    // The swizzle consumes the entire original matrix.
+    const int consumedSlots = columns * rows;
+    this->swizzle(consumedSlots, SkSpan(sourceSlots, count));
+}
+
+void Builder::diagonal_matrix(int columns, int rows) {
+    // Expands the top two stack slots into a CxR matrix with a single swizzle: cells on the
+    // diagonal select slot 1, and every other cell selects slot 0.
+    int8_t sourceSlots[16] = {};  // zero-filled, so off-diagonal cells already select slot 0
+    size_t count = 0;
+    for (int column = 0; column < columns; ++column) {
+        for (int row = 0; row < rows; ++row, ++count) {
+            if (column == row) {
+                sourceSlots[count] = 1;
+            }
+        }
+    }
+    this->swizzle(/*consumedSlots=*/2, SkSpan(sourceSlots, count));
+}
+
+void Builder::matrix_resize(int origColumns, int origRows, int newColumns, int newRows) {
+    // Converts the CxR matrix on top of the stack into a C'xR' matrix using a single swizzle.
+    // Cells that exist in the original matrix are carried over; cells outside of it become 1 on
+    // the diagonal and 0 everywhere else.
+    int8_t sourceSlots[16] = {};
+    size_t count = 0;
+
+    // Number of stack slots the swizzle will consume: the original matrix, plus one slot for each
+    // literal that gets pushed below.
+    size_t consumedSlots = origColumns * origRows;
+
+    // Stack offsets of the synthesized literals. Zero doubles as "not pushed yet"; a pushed
+    // literal always sits after the original matrix, so its offset is at least
+    // origColumns * origRows.
+    size_t literalOneSlot = 0;
+    size_t literalZeroSlot = 0;
+
+    for (int column = 0; column < newColumns; ++column) {
+        for (int row = 0; row < newRows; ++row) {
+            const bool insideOriginal = (column < origColumns) && (row < origRows);
+            if (insideOriginal) {
+                // Reuse the matching element of the original matrix.
+                sourceSlots[count++] = (column * origRows) + row;
+                continue;
+            }
+            if (column == row) {
+                // New diagonal cell: push a literal 1 the first time one is needed.
+                if (literalOneSlot == 0) {
+                    this->push_literal_f(1.0f);
+                    literalOneSlot = consumedSlots++;
+                }
+                sourceSlots[count++] = literalOneSlot;
+                continue;
+            }
+            // New off-diagonal cell: push a literal 0 the first time one is needed.
+            if (literalZeroSlot == 0) {
+                this->push_zeros(1);
+                literalZeroSlot = consumedSlots++;
+            }
+            sourceSlots[count++] = literalZeroSlot;
+        }
+    }
+    this->swizzle(consumedSlots, SkSpan(sourceSlots, count));
+}
+
+std::unique_ptr<Program> Builder::finish(int numValueSlots,
+                                         int numUniformSlots,
+                                         SkRPDebugTrace* debugTrace) {
+    // Every enableExecutionMaskWrites call must have been matched by a
+    // disableExecutionMaskWrites call by the time the program is finalized.
+    SkASSERT(fExecutionMaskWritesEnabled == 0);
+
+    // Hand the accumulated instructions over to the Program; fInstructions is moved-from afterwards.
+    auto program = std::make_unique<Program>(std::move(fInstructions),
+                                             numValueSlots,
+                                             numUniformSlots,
+                                             fNumLabels,
+                                             debugTrace);
+    return program;
+}
+
+void Program::optimize() {
+    // Currently a no-op. The Program constructor calls this before measuring temp-stack depths.
+    // TODO(johnstiles): perform any last-minute cleanup of the instruction stream here
+}
+
+static int stack_usage(const Instruction& inst) {
+    // Returns the net change in temp-stack depth, in slots, caused by `inst`. Positive values are
+    // pushes, negative values are pops, and zero means the depth is unchanged.
+    const int immA = inst.fImmA;
+
+    switch (inst.fOp) {
+        // Pushes of a fixed size.
+        case BuilderOp::push_literal:
+        case BuilderOp::push_condition_mask:
+        case BuilderOp::push_loop_mask:
+        case BuilderOp::push_return_mask:
+            return 1;
+
+        case BuilderOp::push_src_rgba:
+        case BuilderOp::push_dst_rgba:
+            return 4;
+
+        // Pushes whose slot count is carried in immA.
+        case BuilderOp::push_slots:
+        case BuilderOp::push_slots_indirect:
+        case BuilderOp::push_uniform:
+        case BuilderOp::push_uniform_indirect:
+        case BuilderOp::push_zeros:
+        case BuilderOp::push_clone:
+        case BuilderOp::push_clone_from_stack:
+        case BuilderOp::push_clone_indirect_from_stack:
+            return immA;
+
+        // Pops of a fixed size.
+        case BuilderOp::pop_condition_mask:
+        case BuilderOp::pop_loop_mask:
+        case BuilderOp::pop_and_reenable_loop_mask:
+        case BuilderOp::pop_return_mask:
+            return -1;
+
+        case BuilderOp::pop_src_rg:
+            return -2;
+
+        case BuilderOp::pop_src_rgba:
+        case BuilderOp::pop_dst_rgba:
+            return -4;
+
+        // Ops that shrink the stack by immA slots.
+        case ALL_N_WAY_BINARY_OP_CASES:
+        case ALL_MULTI_SLOT_BINARY_OP_CASES:
+        case BuilderOp::discard_stack:
+        case BuilderOp::select:
+            return -immA;
+
+        // Ops that shrink the stack by twice immA slots.
+        case ALL_N_WAY_TERNARY_OP_CASES:
+        case ALL_MULTI_SLOT_TERNARY_OP_CASES:
+            return -2 * immA;
+
+        case BuilderOp::swizzle_1:
+        case BuilderOp::swizzle_2:
+        case BuilderOp::swizzle_3:
+        case BuilderOp::swizzle_4: {
+            // swizzle_N consumes immA slots and emits N slots. The swizzle ops are numbered
+            // consecutively (Builder::swizzle relies on this too), so N follows from the opcode.
+            const int emitted = 1 + ((int)inst.fOp - (int)BuilderOp::swizzle_1);
+            return emitted - immA;
+        }
+
+        // Dot products consume two N-slot vectors and emit one scalar.
+        case BuilderOp::dot_2_floats:
+            return -3;
+        case BuilderOp::dot_3_floats:
+            return -5;
+        case BuilderOp::dot_4_floats:
+            return -7;
+
+        case BuilderOp::refract_4_floats:
+            return -5;  // consumes nine slots (N + I + eta) and emits a 4-slot vector (R)
+
+        case BuilderOp::shuffle: {
+            // immA packs the consumed-slot count in its top 16 bits and the generated-slot count
+            // in its bottom 16 bits.
+            const int consumed = immA >> 16;
+            const int generated = immA & 0xFFFF;
+            return generated - consumed;
+        }
+
+        // Unary ops rewrite their slots in place; every op not listed above also reports zero.
+        case ALL_SINGLE_SLOT_UNARY_OP_CASES:
+        case ALL_MULTI_SLOT_UNARY_OP_CASES:
+        default:
+            return 0;
+    }
+}
+
+Program::StackDepthMap Program::tempStackMaxDepths() const {
+    // Walks the instruction list, tracking the running depth of every temp stack, and returns the
+    // largest depth each stack reached.
+    StackDepthMap peakDepths;
+    StackDepthMap runningDepths;
+
+    int activeStack = 0;
+    for (const Instruction& inst : fInstructions) {
+        // set_current_stack redirects all subsequent pushes and pops to another stack.
+        if (inst.fOp == BuilderOp::set_current_stack) {
+            activeStack = inst.fImmA;
+        }
+
+        auto& depth = runningDepths[activeStack];
+        depth += stack_usage(inst);
+
+        auto& peak = peakDepths[activeStack];
+        peak = std::max(peak, depth);
+
+        SkASSERTF(depth >= 0, "unbalanced temp stack push/pop on stack %d", activeStack);
+    }
+
+    // Once the program ends, every stack should be empty again.
+    for (const auto& [stackIdx, finalDepth] : runningDepths) {
+        (void)stackIdx;
+        SkASSERTF(finalDepth == 0, "unbalanced temp stack push/pop");
+    }
+
+    return peakDepths;
+}
+
+Program::Program(SkTArray<Instruction> instrs,
+                 int numValueSlots,
+                 int numUniformSlots,
+                 int numLabels,
+                 SkRPDebugTrace* debugTrace)
+        : fInstructions(std::move(instrs))
+        , fNumValueSlots(numValueSlots)
+        , fNumUniformSlots(numUniformSlots)
+        , fNumLabels(numLabels)
+        , fDebugTrace(debugTrace) {
+    // Clean up the instruction stream first, so the stack measurements below reflect the final
+    // program.
+    this->optimize();
+
+    // Measure how deep each temp stack gets.
+    fTempStackMaxDepths = this->tempStackMaxDepths();
+
+    // The total temp-stack slot count is the sum of the per-stack maximum depths.
+    int totalTempSlots = 0;
+    for (const auto& [stackIdx, maxDepth] : fTempStackMaxDepths) {
+        (void)stackIdx;
+        totalTempSlots += maxDepth;
+    }
+    fNumTempStackSlots = totalTempSlots;
+}
+
+void Program::appendCopy(SkTArray<Stage>* pipeline,
+                         SkArenaAlloc* alloc,
+                         ProgramOp baseStage,
+                         float* dst, int dstStride,
+                         const float* src, int srcStride,
+                         int numSlots) const {
+    // Emits the stages needed to copy `numSlots` slots from `src` to `dst`. `baseStage` is the
+    // one-slot form of the copy op; the two-, three- and four-slot forms are expected to follow
+    // it directly. Copies of more than four slots are split into batches of at most four.
+    SkASSERT(numSlots >= 0);
+
+    constexpr int kMaxSlotsPerStage = 4;
+    for (int firstSlot = 0; firstSlot < numSlots; firstSlot += kMaxSlotsPerStage) {
+        const int slotsInStage = std::min(numSlots - firstSlot, kMaxSlotsPerStage);
+        const auto stage = (ProgramOp)((int)baseStage + slotsInStage - 1);
+
+        // Each batch gets its own context pointing at the first slot of that batch.
+        auto* ctx = alloc->make<SkRasterPipeline_BinaryOpCtx>();
+        ctx->dst = dst + (firstSlot * dstStride);
+        ctx->src = src + (firstSlot * srcStride);
+        pipeline->push_back({stage, ctx});
+    }
+}
+
+void Program::appendCopySlotsUnmasked(SkTArray<Stage>* pipeline,
+                                      SkArenaAlloc* alloc,
+                                      float* dst,
+                                      const float* src,
+                                      int numSlots) const {
+    // Emits copy_slot_unmasked stages. Source and destination both advance by one highp stride
+    // per slot.
+    const int slotStride = SkOpts::raster_pipeline_highp_stride;
+    this->appendCopy(pipeline, alloc, ProgramOp::copy_slot_unmasked,
+                     dst, /*dstStride=*/slotStride,
+                     src, /*srcStride=*/slotStride,
+                     numSlots);
+}
+
+void Program::appendCopySlotsMasked(SkTArray<Stage>* pipeline,
+                                    SkArenaAlloc* alloc,
+                                    float* dst,
+                                    const float* src,
+                                    int numSlots) const {
+    // Emits copy_slot_masked stages. Source and destination both advance by one highp stride
+    // per slot.
+    const int slotStride = SkOpts::raster_pipeline_highp_stride;
+    this->appendCopy(pipeline, alloc, ProgramOp::copy_slot_masked,
+                     dst, /*dstStride=*/slotStride,
+                     src, /*srcStride=*/slotStride,
+                     numSlots);
+}
+
+void Program::appendCopyConstants(SkTArray<Stage>* pipeline,
+                                  SkArenaAlloc* alloc,
+                                  float* dst,
+                                  const float* src,
+                                  int numSlots) const {
+    // Emits copy_constant stages. The destination advances by one highp stride per slot, while
+    // the source is read as tightly packed floats (a stride of one).
+    const int slotStride = SkOpts::raster_pipeline_highp_stride;
+    this->appendCopy(pipeline, alloc, ProgramOp::copy_constant,
+                     dst, /*dstStride=*/slotStride,
+                     src, /*srcStride=*/1,
+                     numSlots);
+}
+
+void Program::appendSingleSlotUnaryOp(SkTArray<Stage>* pipeline, ProgramOp stage,
+                                      float* dst, int numSlots) const {
+    // Emits `stage` once per slot, for `numSlots` consecutive slots starting at `dst`. Each slot
+    // is one highp stride wide.
+    //
+    // The loop is bounded by an index rather than counting `numSlots` down to zero, so that a
+    // negative count emits nothing, instead of looping without bound, when SkASSERT is
+    // compiled out.
+    SkASSERT(numSlots >= 0);
+    const int N = SkOpts::raster_pipeline_highp_stride;
+    for (int slot = 0; slot < numSlots; ++slot) {
+        pipeline->push_back({stage, dst + (slot * N)});
+    }
+}
+
+void Program::appendMultiSlotUnaryOp(SkTArray<Stage>* pipeline, ProgramOp baseStage,
+                                     float* dst, int numSlots) const {
+    // Emits the stages needed to apply a unary op to `numSlots` consecutive slots starting at
+    // `dst`. `baseStage` is the one-slot form of the op; the two-, three- and four-slot forms are
+    // expected to follow it directly. More than four slots are split into batches of at most four.
+    //
+    // A count of zero emits nothing, matching appendCopy. Without this, `baseStage - 1` (an
+    // unrelated op) would be appended for an empty range.
+    SkASSERT(numSlots >= 0);
+
+    const int N = SkOpts::raster_pipeline_highp_stride;
+    constexpr int kMaxSlotsPerStage = 4;
+    for (int firstSlot = 0; firstSlot < numSlots; firstSlot += kMaxSlotsPerStage) {
+        const int slotsInStage = std::min(numSlots - firstSlot, kMaxSlotsPerStage);
+        const auto stage = (ProgramOp)((int)baseStage + slotsInStage - 1);
+        pipeline->push_back({stage, dst + (firstSlot * N)});
+    }
+}
+
+void Program::appendAdjacentNWayBinaryOp(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
+                                         ProgramOp stage,
+                                         float* dst, const float* src, int numSlots) const {
+    // Emits a single binary-op stage covering `numSlots` slots. The source operand must begin
+    // exactly where the destination operand ends.
+    SkASSERT(numSlots >= 0);
+    SkASSERT((dst + SkOpts::raster_pipeline_highp_stride * numSlots) == src);
+
+    // An empty range needs no stage at all.
+    if (numSlots <= 0) {
+        return;
+    }
+
+    auto* ctx = alloc->make<SkRasterPipeline_BinaryOpCtx>();
+    ctx->dst = dst;
+    ctx->src = src;
+    pipeline->push_back({stage, ctx});
+}
+
+void Program::appendAdjacentMultiSlotBinaryOp(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
+                                              ProgramOp baseStage,
+                                              float* dst, const float* src, int numSlots) const {
+    // Emits a binary op over `numSlots` slots. The source operand must begin exactly where the
+    // destination operand ends.
+    SkASSERT(numSlots >= 0);
+    SkASSERT((dst + SkOpts::raster_pipeline_highp_stride * numSlots) == src);
+
+    // An empty range needs no stage at all.
+    if (numSlots <= 0) {
+        return;
+    }
+
+    if (numSlots <= 4) {
+        // One to four slots: use the fixed-width stage at `baseStage + numSlots`, which takes
+        // only the destination pointer as its context.
+        const auto specializedStage = (ProgramOp)((int)baseStage + numSlots);
+        pipeline->push_back({specializedStage, dst});
+        return;
+    }
+
+    // More than four slots: `baseStage` itself is the N-way form of the op.
+    this->appendAdjacentNWayBinaryOp(pipeline, alloc, baseStage, dst, src, numSlots);
+}
+
+void Program::appendAdjacentNWayTernaryOp(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
+                                          ProgramOp stage, float* dst, const float* src0,
+                                          const float* src1, int numSlots) const {
+    // Emits a single ternary-op stage covering `numSlots` slots. The three operands must sit back
+    // to back in memory, in the order dst, src0, src1.
+    SkASSERT(numSlots >= 0);
+    SkASSERT((dst  + SkOpts::raster_pipeline_highp_stride * numSlots) == src0);
+    SkASSERT((src0 + SkOpts::raster_pipeline_highp_stride * numSlots) == src1);
+
+    // An empty range needs no stage at all.
+    if (numSlots <= 0) {
+        return;
+    }
+
+    auto* ctx = alloc->make<SkRasterPipeline_TernaryOpCtx>();
+    ctx->dst = dst;
+    ctx->src0 = src0;
+    ctx->src1 = src1;
+    pipeline->push_back({stage, ctx});
+}
+
+void Program::appendAdjacentMultiSlotTernaryOp(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
+                                               ProgramOp baseStage, float* dst, const float* src0,
+                                               const float* src1, int numSlots) const {
+    // Emits a ternary op over `numSlots` slots. The three operands must sit back to back in
+    // memory, in the order dst, src0, src1.
+    SkASSERT(numSlots >= 0);
+    SkASSERT((dst  + SkOpts::raster_pipeline_highp_stride * numSlots) == src0);
+    SkASSERT((src0 + SkOpts::raster_pipeline_highp_stride * numSlots) == src1);
+
+    // An empty range needs no stage at all.
+    if (numSlots <= 0) {
+        return;
+    }
+
+    if (numSlots <= 4) {
+        // One to four slots: use the fixed-width stage at `baseStage + numSlots`, which takes
+        // only the destination pointer as its context.
+        const auto specializedStage = (ProgramOp)((int)baseStage + numSlots);
+        pipeline->push_back({specializedStage, dst});
+        return;
+    }
+
+    // More than four slots: `baseStage` itself is the N-way form of the op.
+    this->appendAdjacentNWayTernaryOp(pipeline, alloc, baseStage, dst, src0, src1, numSlots);
+}
+
+void Program::appendStackRewind(SkTArray<Stage>* pipeline) const {
+    // A stack_rewind stage is emitted only in standalone builds or when SK_HAS_MUSTTAIL is
+    // false; in every other configuration this function does nothing.
+#if defined(SKSL_STANDALONE) || !SK_HAS_MUSTTAIL
+    const Stage rewind = {ProgramOp::stack_rewind, /*ctx=*/nullptr};
+    pipeline->push_back(rewind);
+#endif
+}
+
+static void* context_bit_pun(intptr_t val) {
+    // Reinterprets the bits of an integer payload as a `void*`, so that it can be stored in a
+    // stage's context pointer. The result is a tag rather than a dereferenceable address.
+    void* punned = sk_bit_cast<void*>(val);
+    return punned;
+}
+
+Program::SlotData Program::allocateSlotData(SkArenaAlloc* alloc) const {
+    // Carves one zero-filled, contiguous allocation out of `alloc` that holds every value slot
+    // followed by every temp-stack slot. Each slot is N floats wide.
+    const int N = SkOpts::raster_pipeline_highp_stride;
+    const int bytesPerSlot = N * sizeof(float);
+    const int totalBytes = bytesPerSlot * (fNumValueSlots + fNumTempStackSlots);
+
+    // The slab is aligned to the byte size of a single slot.
+    void* storage = alloc->makeBytesAlignedTo(totalBytes, bytesPerSlot);
+    sk_bzero(storage, totalBytes);
+    float* slotBase = static_cast<float*>(storage);
+
+    // The value slots come first; the temp stack begins where the values end.
+    SlotData data;
+    data.values = SkSpan(slotBase,           N * fNumValueSlots);
+    data.stack  = SkSpan(data.values.end(),  N * fNumTempStackSlots);
+    return data;
+}
+
+#if !defined(SKSL_STANDALONE)
+
+bool Program::appendStages(SkRasterPipeline* pipeline,
+                           SkArenaAlloc* alloc,
+                           RP::Callbacks* callbacks,
+                           SkSpan<const float> uniforms) const {
+    // Lowers this program into `pipeline`. Returns false if a stage requires a callback that is
+    // missing or that reports failure; stages appended before that point remain in `pipeline`.
+    SkTArray<Stage> stages;
+    this->makeStages(&stages, alloc, uniforms, this->allocateSlotData(alloc));
+
+    // Labels are not appended to the pipeline, so a branch can only be resolved once the pipeline
+    // position of its label is known. Record every branch as it is appended, then patch them all
+    // at the end.
+    struct PendingBranch {
+        SkRasterPipeline_BranchCtx* ctx;  // ctx->offset temporarily holds the branch's position
+        int labelID;                      // the label that this branch jumps to
+    };
+    SkTArray<PendingBranch> pendingBranches;
+    pendingBranches.reserve_back(fNumLabels);
+
+    // The pipeline position of each label, indexed by label ID; -1 until the label is reached.
+    SkTArray<int> labelPositions;
+    labelPositions.push_back_n(fNumLabels, -1);
+
+    // Recovers an integer payload that was bit-punned into a stage's context pointer.
+    auto ctxAsInt = [](const Stage& s) { return sk_bit_cast<intptr_t>(s.ctx); };
+
+    for (const Stage& stage : stages) {
+        switch (stage.op) {
+            case ProgramOp::stack_rewind:
+                pipeline->append_stack_rewind();
+                break;
+
+            case ProgramOp::invoke_shader: {
+                const bool appended = callbacks && callbacks->appendShader(ctxAsInt(stage));
+                if (!appended) {
+                    return false;
+                }
+                break;
+            }
+            case ProgramOp::invoke_color_filter: {
+                const bool appended = callbacks && callbacks->appendColorFilter(ctxAsInt(stage));
+                if (!appended) {
+                    return false;
+                }
+                break;
+            }
+            case ProgramOp::invoke_blender: {
+                const bool appended = callbacks && callbacks->appendBlender(ctxAsInt(stage));
+                if (!appended) {
+                    return false;
+                }
+                break;
+            }
+            case ProgramOp::invoke_to_linear_srgb:
+                if (callbacks == nullptr) {
+                    return false;
+                }
+                callbacks->toLinearSrgb();
+                break;
+
+            case ProgramOp::invoke_from_linear_srgb:
+                if (callbacks == nullptr) {
+                    return false;
+                }
+                callbacks->fromLinearSrgb();
+                break;
+
+            case ProgramOp::label: {
+                // Record where this label lands in the pipeline; nothing is appended for it.
+                const int labelID = ctxAsInt(stage);
+                SkASSERT(labelID >= 0 && labelID < fNumLabels);
+                labelPositions[labelID] = pipeline->getNumStages();
+                break;
+            }
+            case ProgramOp::jump:
+            case ProgramOp::branch_if_all_lanes_active:
+            case ProgramOp::branch_if_any_lanes_active:
+            case ProgramOp::branch_if_no_lanes_active:
+            case ProgramOp::branch_if_no_active_lanes_eq: {
+                // At this point, the branch context's offset field holds a label ID.
+                auto* branchCtx = static_cast<SkRasterPipeline_BranchCtx*>(stage.ctx);
+                const int targetLabel = branchCtx->offset;
+                SkASSERT(targetLabel >= 0 && targetLabel < fNumLabels);
+
+                // Overwrite the label ID with the branch's own pipeline position. The fix-up pass
+                // below converts it into a relative offset.
+                branchCtx->offset = pipeline->getNumStages();
+                pendingBranches.push_back({branchCtx, targetLabel});
+                [[fallthrough]];
+            }
+            default:
+                // Everything else, including the branch itself, is appended as a regular op.
+                SkASSERT((int)stage.op < kNumRasterPipelineHighpOps);
+                pipeline->append((SkRasterPipelineOp)stage.op, stage.ctx);
+                break;
+        }
+    }
+
+    // All label and branch positions are now known; rewrite each branch's offset as the distance
+    // from the branch to its label.
+    for (const PendingBranch& branch : pendingBranches) {
+        const int branchFromIdx = branch.ctx->offset;
+        const int branchToIdx = labelPositions[branch.labelID];
+        branch.ctx->offset = branchToIdx - branchFromIdx;
+    }
+
+    return true;
+}
+
+#endif
+
+void Program::makeStages(SkTArray<Stage>* pipeline,
+                         SkArenaAlloc* alloc,
+                         SkSpan<const float> uniforms,
+                         const SlotData& slots) const {
+    SkASSERT(fNumUniformSlots == SkToInt(uniforms.size()));
+
+    const int N = SkOpts::raster_pipeline_highp_stride;
+    StackDepthMap tempStackDepth;
+    int currentStack = 0;
+    int mostRecentRewind = 0;
+
+    // Assemble a map holding the current stack-top for each temporary stack. Position each temp
+    // stack immediately after the previous temp stack; temp stacks are never allowed to overlap.
+    int pos = 0;
+    SkTHashMap<int, float*> tempStackMap;
+    for (auto& [idx, depth] : fTempStackMaxDepths) {
+        tempStackMap[idx] = slots.stack.begin() + (pos * N);
+        pos += depth;
+    }
+
+    // Track labels that we have reached in processing.
+    SkBitSet labelsEncountered(fNumLabels);
+
+    auto EmitStackRewindForBackwardsBranch = [&](int labelID) {
+        // If we have already encountered the label associated with this branch, this is a
+        // backwards branch. Add a stack-rewind immediately before the branch to ensure that
+        // long-running loops don't use an unbounded amount of stack space.
+        if (labelsEncountered.test(labelID)) {
+            this->appendStackRewind(pipeline);
+            mostRecentRewind = pipeline->size();
+        }
+    };
+
+    // We can reuse constants from our arena by placing them in this map.
+    SkTHashMap<int, int*> constantLookupMap; // <constant value, pointer into arena>
+
+    // Write each BuilderOp to the pipeline array.
+    pipeline->reserve_back(fInstructions.size());
+    for (const Instruction& inst : fInstructions) {
+        auto SlotA    = [&]() { return &slots.values[N * inst.fSlotA]; };
+        auto SlotB    = [&]() { return &slots.values[N * inst.fSlotB]; };
+        auto UniformA = [&]() { return &uniforms[inst.fSlotA]; };
+        float*& tempStackPtr = tempStackMap[currentStack];
+
+        switch (inst.fOp) {
+            case BuilderOp::label:
+                SkASSERT(inst.fImmA >= 0 && inst.fImmA < fNumLabels);
+                labelsEncountered.set(inst.fImmA);
+                pipeline->push_back({ProgramOp::label, context_bit_pun(inst.fImmA)});
+                break;
+
+            case BuilderOp::jump:
+            case BuilderOp::branch_if_all_lanes_active:
+            case BuilderOp::branch_if_any_lanes_active:
+            case BuilderOp::branch_if_no_lanes_active: {
+                SkASSERT(inst.fImmA >= 0 && inst.fImmA < fNumLabels);
+                EmitStackRewindForBackwardsBranch(inst.fImmA);
+
+                auto* ctx = alloc->make<SkRasterPipeline_BranchCtx>();
+                ctx->offset = inst.fImmA;
+                pipeline->push_back({(ProgramOp)inst.fOp, ctx});
+                break;
+            }
+            case BuilderOp::branch_if_no_active_lanes_on_stack_top_equal: {
+                SkASSERT(inst.fImmA >= 0 && inst.fImmA < fNumLabels);
+                EmitStackRewindForBackwardsBranch(inst.fImmA);
+
+                auto* ctx = alloc->make<SkRasterPipeline_BranchIfEqualCtx>();
+                ctx->offset = inst.fImmA;
+                ctx->value = inst.fImmB;
+                ctx->ptr = reinterpret_cast<int*>(tempStackPtr - N);
+                pipeline->push_back({ProgramOp::branch_if_no_active_lanes_eq, ctx});
+                break;
+            }
+            case BuilderOp::init_lane_masks:
+                pipeline->push_back({ProgramOp::init_lane_masks, nullptr});
+                break;
+
+            case BuilderOp::store_src_rg:
+                pipeline->push_back({ProgramOp::store_src_rg, SlotA()});
+                break;
+
+            case BuilderOp::store_src:
+                pipeline->push_back({ProgramOp::store_src, SlotA()});
+                break;
+
+            case BuilderOp::store_dst:
+                pipeline->push_back({ProgramOp::store_dst, SlotA()});
+                break;
+
+            case BuilderOp::store_device_xy01:
+                pipeline->push_back({ProgramOp::store_device_xy01, SlotA()});
+                break;
+
+            case BuilderOp::load_src:
+                pipeline->push_back({ProgramOp::load_src, SlotA()});
+                break;
+
+            case BuilderOp::load_dst:
+                pipeline->push_back({ProgramOp::load_dst, SlotA()});
+                break;
+
+            case ALL_SINGLE_SLOT_UNARY_OP_CASES: {
+                float* dst = tempStackPtr - (inst.fImmA * N);
+                this->appendSingleSlotUnaryOp(pipeline, (ProgramOp)inst.fOp, dst, inst.fImmA);
+                break;
+            }
+            case ALL_MULTI_SLOT_UNARY_OP_CASES: {
+                float* dst = tempStackPtr - (inst.fImmA * N);
+                this->appendMultiSlotUnaryOp(pipeline, (ProgramOp)inst.fOp, dst, inst.fImmA);
+                break;
+            }
+            case ALL_N_WAY_BINARY_OP_CASES: {
+                float* src = tempStackPtr - (inst.fImmA * N);
+                float* dst = tempStackPtr - (inst.fImmA * 2 * N);
+                this->appendAdjacentNWayBinaryOp(pipeline, alloc, (ProgramOp)inst.fOp,
+                                                 dst, src, inst.fImmA);
+                break;
+            }
+            case ALL_MULTI_SLOT_BINARY_OP_CASES: {
+                float* src = tempStackPtr - (inst.fImmA * N);
+                float* dst = tempStackPtr - (inst.fImmA * 2 * N);
+                this->appendAdjacentMultiSlotBinaryOp(pipeline, alloc, (ProgramOp)inst.fOp,
+                                                      dst, src, inst.fImmA);
+                break;
+            }
+            case ALL_N_WAY_TERNARY_OP_CASES: {
+                float* src1 = tempStackPtr - (inst.fImmA * N);
+                float* src0 = tempStackPtr - (inst.fImmA * 2 * N);
+                float* dst  = tempStackPtr - (inst.fImmA * 3 * N);
+                this->appendAdjacentNWayTernaryOp(pipeline, alloc, (ProgramOp)inst.fOp,
+                                                  dst, src0, src1, inst.fImmA);
+                break;
+            }
+            case ALL_MULTI_SLOT_TERNARY_OP_CASES: {
+                float* src1 = tempStackPtr - (inst.fImmA * N);
+                float* src0 = tempStackPtr - (inst.fImmA * 2 * N);
+                float* dst  = tempStackPtr - (inst.fImmA * 3 * N);
+                this->appendAdjacentMultiSlotTernaryOp(pipeline, alloc, (ProgramOp)inst.fOp,
+                                                       dst, src0, src1, inst.fImmA);
+                break;
+            }
+            case BuilderOp::select: {
+                float* src = tempStackPtr - (inst.fImmA * N);
+                float* dst = tempStackPtr - (inst.fImmA * 2 * N);
+                this->appendCopySlotsMasked(pipeline, alloc, dst, src, inst.fImmA);
+                break;
+            }
+            case BuilderOp::copy_slot_masked:
+                this->appendCopySlotsMasked(pipeline, alloc, SlotA(), SlotB(), inst.fImmA);
+                break;
+
+            case BuilderOp::copy_slot_unmasked:
+                this->appendCopySlotsUnmasked(pipeline, alloc, SlotA(), SlotB(), inst.fImmA);
+                break;
+
+            case BuilderOp::zero_slot_unmasked:
+                this->appendMultiSlotUnaryOp(pipeline, ProgramOp::zero_slot_unmasked,
+                                             SlotA(), inst.fImmA);
+                break;
+
+            case BuilderOp::refract_4_floats: {
+                float* dst = tempStackPtr - (9 * N);
+                pipeline->push_back({ProgramOp::refract_4_floats, dst});
+                break;
+            }
+            case BuilderOp::inverse_mat2:
+            case BuilderOp::inverse_mat3:
+            case BuilderOp::inverse_mat4: {
+                float* dst = tempStackPtr - (inst.fImmA * N);
+                pipeline->push_back({(ProgramOp)inst.fOp, dst});
+                break;
+            }
+            case BuilderOp::dot_2_floats:
+            case BuilderOp::dot_3_floats:
+            case BuilderOp::dot_4_floats: {
+                float* dst = tempStackPtr - (inst.fImmA * 2 * N);
+                pipeline->push_back({(ProgramOp)inst.fOp, dst});
+                break;
+            }
+            case BuilderOp::swizzle_1:
+            case BuilderOp::swizzle_2:
+            case BuilderOp::swizzle_3:
+            case BuilderOp::swizzle_4: {
+                auto* ctx = alloc->make<SkRasterPipeline_SwizzleCtx>();
+                ctx->ptr = tempStackPtr - (N * inst.fImmA);
+                // Unpack component nybbles into byte-offsets pointing at stack slots.
+                unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx->offsets));
+                pipeline->push_back({(ProgramOp)inst.fOp, ctx});
+                break;
+            }
+            case BuilderOp::shuffle: {
+                int consumed = inst.fImmA >> 16;
+                int generated = inst.fImmA & 0xFFFF;
+
+                auto* ctx = alloc->make<SkRasterPipeline_ShuffleCtx>();
+                ctx->ptr = tempStackPtr - (N * consumed);
+                ctx->count = generated;
+                // Unpack immB and immC from nybble form into the offset array.
+                unpack_nybbles_to_offsets(inst.fImmB, SkSpan(&ctx->offsets[0], 8));
+                unpack_nybbles_to_offsets(inst.fImmC, SkSpan(&ctx->offsets[8], 8));
+                pipeline->push_back({ProgramOp::shuffle, ctx});
+                break;
+            }
+            case BuilderOp::push_src_rgba: {
+                float* dst = tempStackPtr;
+                pipeline->push_back({ProgramOp::store_src, dst});
+                break;
+            }
+            case BuilderOp::push_dst_rgba: {
+                float* dst = tempStackPtr;
+                pipeline->push_back({ProgramOp::store_dst, dst});
+                break;
+            }
+            case BuilderOp::pop_src_rg: {
+                float* src = tempStackPtr - (2 * N);
+                pipeline->push_back({ProgramOp::load_src_rg, src});
+                break;
+            }
+            case BuilderOp::pop_src_rgba: {
+                float* src = tempStackPtr - (4 * N);
+                pipeline->push_back({ProgramOp::load_src, src});
+                break;
+            }
+            case BuilderOp::pop_dst_rgba: {
+                float* src = tempStackPtr - (4 * N);
+                pipeline->push_back({ProgramOp::load_dst, src});
+                break;
+            }
+            case BuilderOp::push_slots: {
+                float* dst = tempStackPtr;
+                this->appendCopySlotsUnmasked(pipeline, alloc, dst, SlotA(), inst.fImmA);
+                break;
+            }
+            case BuilderOp::copy_stack_to_slots_indirect:
+            case BuilderOp::push_slots_indirect:
+            case BuilderOp::push_uniform_indirect: {
+                // SlotA: fixed-range start
+                // SlotB: limit-range end
+                //  immA: number of slots to copy
+                //  immB: dynamic stack ID
+                ProgramOp op;
+                auto* ctx = alloc->make<SkRasterPipeline_CopyIndirectCtx>();
+                ctx->indirectOffset =
+                        reinterpret_cast<const uint32_t*>(tempStackMap[inst.fImmB]) - (1 * N);
+                ctx->indirectLimit = inst.fSlotB - inst.fSlotA - inst.fImmA;
+                ctx->slots = inst.fImmA;
+                if (inst.fOp == BuilderOp::push_slots_indirect) {
+                    op = ProgramOp::copy_from_indirect_unmasked;
+                    ctx->src = SlotA();
+                    ctx->dst = tempStackPtr;
+                } else if (inst.fOp == BuilderOp::push_uniform_indirect) {
+                    op = ProgramOp::copy_from_indirect_uniform_unmasked;
+                    ctx->src = UniformA();
+                    ctx->dst = tempStackPtr;
+                } else {
+                    op = ProgramOp::copy_to_indirect_masked;
+                    ctx->src = tempStackPtr - (ctx->slots * N);
+                    ctx->dst = SlotA();
+                }
+                pipeline->push_back({op, ctx});
+                break;
+            }
+            case BuilderOp::push_uniform: {
+                float* dst = tempStackPtr;
+                this->appendCopyConstants(pipeline, alloc, dst, UniformA(), inst.fImmA);
+                break;
+            }
+            case BuilderOp::push_zeros: {
+                float* dst = tempStackPtr;
+                this->appendMultiSlotUnaryOp(pipeline, ProgramOp::zero_slot_unmasked, dst,
+                                             inst.fImmA);
+                break;
+            }
+            case BuilderOp::push_condition_mask: {
+                float* dst = tempStackPtr;
+                pipeline->push_back({ProgramOp::store_condition_mask, dst});
+                break;
+            }
+            case BuilderOp::pop_condition_mask: {
+                float* src = tempStackPtr - (1 * N);
+                pipeline->push_back({ProgramOp::load_condition_mask, src});
+                break;
+            }
+            case BuilderOp::merge_condition_mask: {
+                float* ptr = tempStackPtr - (2 * N);
+                pipeline->push_back({ProgramOp::merge_condition_mask, ptr});
+                break;
+            }
+            case BuilderOp::push_loop_mask: {
+                float* dst = tempStackPtr;
+                pipeline->push_back({ProgramOp::store_loop_mask, dst});
+                break;
+            }
+            case BuilderOp::pop_loop_mask: {
+                float* src = tempStackPtr - (1 * N);
+                pipeline->push_back({ProgramOp::load_loop_mask, src});
+                break;
+            }
+            case BuilderOp::pop_and_reenable_loop_mask: {
+                float* src = tempStackPtr - (1 * N);
+                pipeline->push_back({ProgramOp::reenable_loop_mask, src});
+                break;
+            }
+            case BuilderOp::reenable_loop_mask:
+                pipeline->push_back({ProgramOp::reenable_loop_mask, SlotA()});
+                break;
+
+            case BuilderOp::mask_off_loop_mask:
+                pipeline->push_back({ProgramOp::mask_off_loop_mask, nullptr});
+                break;
+
+            case BuilderOp::merge_loop_mask: {
+                float* src = tempStackPtr - (1 * N);
+                pipeline->push_back({ProgramOp::merge_loop_mask, src});
+                break;
+            }
+            case BuilderOp::push_return_mask: {
+                float* dst = tempStackPtr;
+                pipeline->push_back({ProgramOp::store_return_mask, dst});
+                break;
+            }
+            case BuilderOp::pop_return_mask: {
+                float* src = tempStackPtr - (1 * N);
+                pipeline->push_back({ProgramOp::load_return_mask, src});
+                break;
+            }
+            case BuilderOp::mask_off_return_mask:
+                pipeline->push_back({ProgramOp::mask_off_return_mask, nullptr});
+                break;
+
+            case BuilderOp::copy_constant:
+            case BuilderOp::push_literal: {
+                float* dst = (inst.fOp == BuilderOp::push_literal) ? tempStackPtr : SlotA();
+                int* constantPtr;
+                if (int** lookup = constantLookupMap.find(inst.fImmA)) {
+                    constantPtr = *lookup;
+                } else {
+                    constantPtr = alloc->make<int>(inst.fImmA);
+                    constantLookupMap[inst.fImmA] = constantPtr;
+                }
+                SkASSERT(constantPtr);
+                this->appendCopyConstants(pipeline, alloc, dst, (float*)constantPtr,/*numSlots=*/1);
+                break;
+            }
+            case BuilderOp::copy_stack_to_slots: {
+                float* src = tempStackPtr - (inst.fImmB * N);
+                this->appendCopySlotsMasked(pipeline, alloc, SlotA(), src, inst.fImmA);
+                break;
+            }
+            case BuilderOp::copy_stack_to_slots_unmasked: {
+                float* src = tempStackPtr - (inst.fImmB * N);
+                this->appendCopySlotsUnmasked(pipeline, alloc, SlotA(), src, inst.fImmA);
+                break;
+            }
+            case BuilderOp::swizzle_copy_stack_to_slots: {
+                // SlotA: fixed-range start
+                // immA: number of swizzle components
+                // immB: swizzle components
+                // immC: offset from stack top
+                auto stage = (ProgramOp)((int)ProgramOp::swizzle_copy_slot_masked + inst.fImmA - 1);
+                auto* ctx = alloc->make<SkRasterPipeline_SwizzleCopyCtx>();
+                ctx->src = tempStackPtr - (inst.fImmC * N);
+                ctx->dst = SlotA();
+                unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx->offsets));
+                pipeline->push_back({stage, ctx});
+                break;
+            }
+            case BuilderOp::push_clone: {
+                float* src = tempStackPtr - (inst.fImmB * N);
+                float* dst = tempStackPtr;
+                this->appendCopySlotsUnmasked(pipeline, alloc, dst, src, inst.fImmA);
+                break;
+            }
+            case BuilderOp::push_clone_from_stack: {
+                // immA: number of slots
+                // immB: other stack ID
+                // immC: offset from stack top
+                float* sourceStackPtr = tempStackMap[inst.fImmB];
+                float* src = sourceStackPtr - (inst.fImmC * N);
+                float* dst = tempStackPtr;
+                this->appendCopySlotsUnmasked(pipeline, alloc, dst, src, inst.fImmA);
+                break;
+            }
+            case BuilderOp::push_clone_indirect_from_stack: {
+                // immA: number of slots
+                // immB: other stack ID
+                // immC: offset from stack top
+                // immD: dynamic stack ID
+                float* sourceStackPtr = tempStackMap[inst.fImmB];
+
+                auto* ctx = alloc->make<SkRasterPipeline_CopyIndirectCtx>();
+                ctx->dst = tempStackPtr;
+                ctx->src = sourceStackPtr - (inst.fImmC * N);
+                ctx->indirectOffset =
+                        reinterpret_cast<const uint32_t*>(tempStackMap[inst.fImmD]) - (1 * N);
+                ctx->indirectLimit = inst.fImmC - inst.fImmA;
+                ctx->slots = inst.fImmA;
+                pipeline->push_back({ProgramOp::copy_from_indirect_unmasked, ctx});
+                break;
+            }
+            case BuilderOp::swizzle_copy_stack_to_slots_indirect: {
+                // SlotA: fixed-range start
+                // SlotB: limit-range end
+                // immA: number of swizzle components
+                // immB: swizzle components
+                // immC: offset from stack top
+                // immD: dynamic stack ID
+                auto* ctx = alloc->make<SkRasterPipeline_SwizzleCopyIndirectCtx>();
+                ctx->src = tempStackPtr - (inst.fImmC * N);
+                ctx->dst = SlotA();
+                ctx->indirectOffset =
+                        reinterpret_cast<const uint32_t*>(tempStackMap[inst.fImmD]) - (1 * N);
+                ctx->indirectLimit =
+                        inst.fSlotB - inst.fSlotA - (max_packed_nybble(inst.fImmB, inst.fImmA) + 1);
+                ctx->slots = inst.fImmA;
+                unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx->offsets));
+                pipeline->push_back({ProgramOp::swizzle_copy_to_indirect_masked, ctx});
+                break;
+            }
+            case BuilderOp::case_op: {
+                auto* ctx = alloc->make<SkRasterPipeline_CaseOpCtx>();
+                ctx->ptr = reinterpret_cast<int*>(tempStackPtr - 2 * N);
+                ctx->expectedValue = inst.fImmA;
+                pipeline->push_back({ProgramOp::case_op, ctx});
+                break;
+            }
+            case BuilderOp::discard_stack:
+                break;
+
+            case BuilderOp::set_current_stack:
+                currentStack = inst.fImmA;
+                break;
+
+            case BuilderOp::invoke_shader:
+            case BuilderOp::invoke_color_filter:
+            case BuilderOp::invoke_blender:
+                pipeline->push_back({(ProgramOp)inst.fOp, context_bit_pun(inst.fImmA)});
+                break;
+
+            case BuilderOp::invoke_to_linear_srgb:
+            case BuilderOp::invoke_from_linear_srgb:
+                pipeline->push_back({(ProgramOp)inst.fOp, nullptr});
+                break;
+
+            default:
+                SkDEBUGFAILF("Raster Pipeline: unsupported instruction %d", (int)inst.fOp);
+                break;
+        }
+
+        tempStackPtr += stack_usage(inst) * N;
+        SkASSERT(tempStackPtr >= slots.stack.begin());
+        SkASSERT(tempStackPtr <= slots.stack.end());
+
+        // Periodically rewind the stack every 500 instructions. When SK_HAS_MUSTTAIL is set,
+        // rewinds are not actually used; the appendStackRewind call becomes a no-op. On platforms
+        // that don't support SK_HAS_MUSTTAIL, rewinding the stack periodically can prevent a
+        // potential stack overflow when running a long program.
+        int numPipelineStages = pipeline->size();
+        if (numPipelineStages - mostRecentRewind > 500) {
+            this->appendStackRewind(pipeline);
+            mostRecentRewind = numPipelineStages;
+        }
+    }
+}
+
+// Produces one display name per entry of `debugTrace->fSlotInfo`, in the same order. Entries that
+// share a name but have different source positions are told apart with a Unicode subscript.
+// Returns an empty list when `debugTrace` is null.
+SkTArray<std::string> build_unique_slot_name_list(const SkRPDebugTrace* debugTrace) {
+    SkTArray<std::string> result;
+    if (!debugTrace) {
+        return result;
+    }
+    result.reserve_back(debugTrace->fSlotInfo.size());
+
+    // Outer key: variable name. Inner key: source start offset. Value: name shown in the dump.
+    SkTHashMap<std::string_view, SkTHashMap<int, std::string>> namesSeen;
+
+    for (const SlotDebugInfo& info : debugTrace->fSlotInfo) {
+        // Slots without a valid source position are all filed under offset zero.
+        const int startOffset = info.pos.valid() ? info.pos.startOffset() : 0;
+        SkTHashMap<int, std::string>& byPosition = namesSeen[info.name];
+        std::string& displayName = byPosition[startOffset];
+
+        // An empty entry is treated as a name/position pair that has not been seen before.
+        if (displayName.empty()) {
+            displayName = info.name;
+
+            // `byPosition` already counts this entry, so the first position recorded for a name
+            // yields zero and keeps the bare name; later positions receive a subscript.
+            const int ordinal = byPosition.count() - 1;
+            if (ordinal >= 1) {
+                for (char digit : std::to_string(ordinal)) {
+                    // UTF-8 for U+2080..U+2089 (subscript digits) is E2 82 80 through E2 82 89.
+                    const char subscriptDigit[3] = {(char)0xE2, (char)0x82,
+                                                    (char)(0x80 + digit - '0')};
+                    displayName.append(subscriptDigit, 3);
+                }
+            }
+        }
+        result.push_back(displayName);
+    }
+    return result;
+}
+
+void Program::dump(SkWStream* out) const {
+    // Allocate memory for the slot and uniform data, even though the program won't ever be
+    // executed. The program requires pointer ranges for managing its data, and ASAN will report
+    // errors if those pointers are pointing at unallocated memory.
+    SkArenaAlloc alloc(/*firstHeapAllocation=*/1000);
+    const int N = SkOpts::raster_pipeline_highp_stride;
+    SlotData slots = this->allocateSlotData(&alloc);
+    float* uniformPtr = alloc.makeArray<float>(fNumUniformSlots);
+    SkSpan<float> uniforms = SkSpan(uniformPtr, fNumUniformSlots);
+
+    // Turn this program into an array of Raster Pipeline stages.
+    SkTArray<Stage> stages;
+    this->makeStages(&stages, &alloc, uniforms, slots);
+
+    // Find the labels in the program, and keep track of their offsets.
+    SkTHashMap<int, int> labelToStageMap; // <label ID, stage index>
+    for (int index = 0; index < stages.size(); ++index) {
+        if (stages[index].op == ProgramOp::label) {
+            int labelID = sk_bit_cast<intptr_t>(stages[index].ctx);
+            SkASSERT(!labelToStageMap.find(labelID));
+            labelToStageMap[labelID] = index;
+        }
+    }
+
+    // Assign unique names to each variable slot; our trace might have multiple variables with the
+    // same name, which can make a dump hard to read.
+    SkTArray<std::string> slotName = build_unique_slot_name_list(fDebugTrace);
+
+    // Emit the program's instruction list.
+    for (int index = 0; index < stages.size(); ++index) {
+        const Stage& stage = stages[index];
+
+        // Describes a branch target: the signed distance to it in stages, followed by the label
+        // ID and the one-based stage number where that label sits.
+        auto BranchOffset = [&](const SkRasterPipeline_BranchCtx* ctx) -> std::string {
+            // At this point the context's `offset` field holds a label ID.
+            const int label = ctx->offset;
+            SkASSERT(labelToStageMap.find(label));
+            const int targetStage = labelToStageMap[label];
+            const int relative = targetStage - index;
+            return SkSL::String::printf("%+d (label %d at #%d)",
+                                        relative, label, targetStage + 1);
+        };
+
+        // Formats a 32-bit immediate whose type (int or float) is not known.
+        auto Imm = [&](float immFloat, bool showAsFloat = true) -> std::string {
+            // The raw bit pattern always comes first, e.g. `0x3F800000`.
+            uint32_t bits;
+            memcpy(&bits, &immFloat, sizeof(bits));
+            std::string text = SkSL::String::printf("0x%08X", bits);
+
+            if (!showAsFloat || !std::isfinite(immFloat)) {
+                return text;
+            }
+            // Finite values also get a float rendering, e.g. `0x3F800000 (1.0)`.
+            return text + " (" + skstd::to_string(immFloat) + ")";
+        };
+
+        // Treats the context pointer itself as a 32-bit immediate of unknown type (int/float).
+        auto ImmCtx = [&](const void* ctx, bool showAsFloat = true) -> std::string {
+            // Only the first sizeof(float) bytes of the pointer's storage are read.
+            float immediate;
+            memcpy(&immediate, &ctx, sizeof(immediate));
+            return Imm(immediate, showAsFloat);
+        };
+
+        // Renders `count` consecutive indices starting at `first`: `1` when count is one or less,
+        // `1..3` for a longer run.
+        auto AsRange = [](int first, int count) -> std::string {
+            const std::string start = std::to_string(first);
+            if (count <= 1) {
+                return start;
+            }
+            return start + ".." + std::to_string(first + count - 1);
+        };
+
+        // Builds a readable label for a run of slots by walking the debug info, e.g.:
+        // `val`: the run is exactly one variable, named val
+        // `val(0..1)`: the run is the first two slots of val (which has 3+ slots)
+        // `foo, bar`: the run fully covers two variables, named foo and bar
+        // `foo(3), bar(0)`: the run is the fourth slot of foo and the first slot of bar
+        // When `names` is empty, the name stored in the debug info is used.
+        auto SlotName = [&](SkSpan<const SlotDebugInfo> debugInfo,
+                            SkSpan<const std::string> names,
+                            SlotRange range) -> std::string {
+            SkASSERT(range.index >= 0 && (range.index + range.count) <= (int)debugInfo.size());
+
+            std::string label;
+            auto nextSeparator = SkSL::String::Separator();
+            // Each pass names one variable and advances past the slots it contributed.
+            for (int chomped = 0; range.count > 0;
+                 range.index += chomped, range.count -= chomped) {
+                const SlotDebugInfo& info = debugInfo[range.index];
+                label += nextSeparator();
+                if (names.empty()) {
+                    label += info.name;
+                } else {
+                    label += names[range.index];
+                }
+
+                // Take whatever is left of this variable, capped by what is left of the run.
+                const int variableSize = info.columns * info.rows;
+                chomped = std::min(range.count, variableSize - info.componentIndex);
+
+                // Anything short of the entire variable gets a component-range suffix.
+                if (chomped != variableSize) {
+                    label += "(" + AsRange(info.componentIndex, chomped) + ")";
+                }
+            }
+
+            return label;
+        };
+
+        // Describes `numSlots` floats at `ptr` as a uniform range. Returns an empty string when
+        // the range does not lie entirely inside `uniforms`.
+        auto UniformPtrCtx = [&](const float* ptr, int numSlots) -> std::string {
+            const float* rangeEnd = ptr + numSlots;
+            if (ptr < uniforms.begin() || rangeEnd > uniforms.end()) {
+                return {};
+            }
+            int uniformIdx = ptr - uniforms.begin();
+            if (fDebugTrace) {
+                // With a debug trace available, prefer the uniform's declared name.
+                std::string named = SlotName(fDebugTrace->fUniformInfo, /*names=*/{},
+                                             {uniformIdx, numSlots});
+                if (!named.empty()) {
+                    return named;
+                }
+            }
+            // No usable name: fall back to a positional label such as `u0..3`.
+            return "u" + AsRange(uniformIdx, numSlots);
+        };
+
+        // Describes `numSlots` slots at `ptr` as a value-slot range; each slot spans N floats.
+        // Returns an empty string when the range does not lie entirely inside `slots.values`.
+        auto ValuePtrCtx = [&](const float* ptr, int numSlots) -> std::string {
+            const float* rangeEnd = ptr + (N * numSlots);
+            if (ptr < slots.values.begin() || rangeEnd > slots.values.end()) {
+                return {};
+            }
+            // Convert the distance in floats into a slot index.
+            const int floatOffset = ptr - slots.values.begin();
+            SkASSERT((floatOffset % N) == 0);
+            const int valueIdx = floatOffset / N;
+            if (fDebugTrace) {
+                // With a debug trace available, prefer the disambiguated variable name.
+                std::string named = SlotName(fDebugTrace->fSlotInfo, slotName,
+                                             {valueIdx, numSlots});
+                if (!named.empty()) {
+                    return named;
+                }
+            }
+            // No usable name: fall back to a positional label such as `v0..3`.
+            return "v" + AsRange(valueIdx, numSlots);
+        };
+
+        // Describes `count` immediate values stored at `ptr`.
+        auto MultiImmCtx = [&](const float* ptr, int count) -> std::string {
+            // Data that lives in the uniform range is shown by uniform name, not by value.
+            std::string uniformName = UniformPtrCtx(ptr, count);
+            if (!uniformName.empty()) {
+                return uniformName;
+            }
+            // A lone value is printed bare, with no brackets.
+            if (count == 1) {
+                return Imm(*ptr);
+            }
+            // Several values form a list like `[0x00000000 (0.0), 0x3F800000 (1.0)]`.
+            std::string list = "[";
+            auto nextSeparator = SkSL::String::Separator();
+            for (int i = 0; i < count; ++i) {
+                list += nextSeparator();
+                list += Imm(ptr[i]);
+            }
+            list.push_back(']');
+            return list;
+        };
+
+        // Describes a pointer of unknown provenance by checking it against each known range.
+        auto PtrCtx = [&](const void* ctx, int numSlots) -> std::string {
+            const float* slotPtr = static_cast<const float*>(ctx);
+
+            // Uniforms are tried first, then value slots.
+            std::string described = UniformPtrCtx(slotPtr, numSlots);
+            if (described.empty()) {
+                described = ValuePtrCtx(slotPtr, numSlots);
+            }
+            if (!described.empty()) {
+                return described;
+            }
+
+            // Next, the temporary stack; only the starting address is range-checked here.
+            const bool onStack = slotPtr >= slots.stack.begin() && slotPtr < slots.stack.end();
+            if (onStack) {
+                const int floatOffset = slotPtr - slots.stack.begin();
+                SkASSERT((floatOffset % N) == 0);
+                return "$" + AsRange(floatOffset / N, numSlots);
+            }
+
+            // Outside every range we know about; this generally isn't expected to happen.
+            return "ExternalPtr(" + AsRange(0, numSlots) + ")";
+        };
+
+        // Describes two back-to-back operands of `numSlots` slots each, starting at `ctx`.
+        auto AdjacentPtrCtx = [&](const void* ctx,
+                                  int numSlots) -> std::tuple<std::string, std::string> {
+            const float* first = static_cast<const float*>(ctx);
+            // The second operand begins right after the first; each slot spans N floats.
+            const float* second = first + (N * numSlots);
+            return std::make_tuple(PtrCtx(first, numSlots), PtrCtx(second, numSlots));
+        };
+
+        // Describes three back-to-back operands of `numSlots` slots each, starting at `ctx`.
+        auto Adjacent3PtrCtx = [&](const void* ctx, int numSlots) ->
+                                  std::tuple<std::string, std::string, std::string> {
+            const float* first = static_cast<const float*>(ctx);
+            // Operands sit `numSlots` slots apart; each slot spans N floats.
+            const float* second = first + (N * numSlots);
+            const float* third = first + (2 * N * numSlots);
+            return std::make_tuple(PtrCtx(first, numSlots),
+                                   PtrCtx(second, numSlots),
+                                   PtrCtx(third, numSlots));
+        };
+
+        // Describes a BinaryOp context as (destination, source) for copy_n_slots; the caller
+        // supplies `numSlots` because the op itself dictates it.
+        auto BinaryOpCtx = [&](const void* v,
+                               int numSlots) -> std::tuple<std::string, std::string> {
+            const auto* binary = static_cast<const SkRasterPipeline_BinaryOpCtx*>(v);
+            std::string destination = PtrCtx(binary->dst, numSlots);
+            std::string source = PtrCtx(binary->src, numSlots);
+            return std::make_tuple(destination, source);
+        };
+
+        // Describes a BinaryOp context as (destination, constants) for copy_n_constants; the
+        // caller supplies `numSlots` because the op itself dictates it. The source side is
+        // rendered as immediate values rather than as a slot range.
+        auto CopyConstantCtx = [&](const void* v,
+                                   int numSlots) -> std::tuple<std::string, std::string> {
+            const auto* binary = static_cast<const SkRasterPipeline_BinaryOpCtx*>(v);
+            std::string destination = PtrCtx(binary->dst, numSlots);
+            std::string constants = MultiImmCtx(binary->src, numSlots);
+            return std::make_tuple(destination, constants);
+        };
+
+        // Describes a BinaryOp context whose operands are adjacent in memory. The operand width
+        // is not passed in; it is derived from how far `src` sits from `dst`.
+        auto AdjacentBinaryOpCtx = [&](const void* v) -> std::tuple<std::string, std::string> {
+            const auto* binary = static_cast<const SkRasterPipeline_BinaryOpCtx*>(v);
+            const int slotsPerOperand = (binary->src - binary->dst) / N;
+            return AdjacentPtrCtx(binary->dst, slotsPerOperand);
+        };
+
+        // Describes a TernaryOp context whose operands are adjacent in memory. The operand width
+        // is not passed in; it is derived from how far `src0` sits from `dst`.
+        auto AdjacentTernaryOpCtx = [&](const void* v) ->
+                                       std::tuple<std::string, std::string, std::string> {
+            const auto* ternary = static_cast<const SkRasterPipeline_TernaryOpCtx*>(v);
+            const int slotsPerOperand = (ternary->src0 - ternary->dst) / N;
+            return Adjacent3PtrCtx(ternary->dst, slotsPerOperand);
+        };
+
+        // Converts swizzle byte offsets into component letters (`xyzw`). An offset that is not
+        // exactly 0..3 slots from the base is rendered as `?`.
+        auto SwizzleOffsetSpan = [&](SkSpan<const uint16_t> offsets) {
+            // Distance in bytes from one component to the next.
+            const size_t componentStride = N * sizeof(float);
+            std::string letters;
+            for (uint16_t offset : offsets) {
+                const size_t component = offset / componentStride;
+                const bool recognized = (offset % componentStride) == 0 && component < 4;
+                letters.push_back(recognized ? "xyzw"[component] : '?');
+            }
+            return letters;
+        };
+
+        // When we decode a swizzle, we don't know the slot width of the original value; that's not
+        // preserved in the instruction encoding. (e.g., myFloat4.y would be indistinguishable from
+        // myFloat2.y.) We do our best to make a readable dump using the data we have: the result
+        // is the larger of the number of offsets and one past the highest component referenced.
+        // An empty span yields zero.
+        auto SwizzleWidth = [&](SkSpan<const uint16_t> offsets) {
+            size_t swizzleWidth = offsets.size();
+            if (offsets.empty()) {
+                // std::max_element returns the end iterator for an empty range, and that must
+                // not be dereferenced.
+                return swizzleWidth;
+            }
+            // Offsets are in bytes; dividing by the per-slot byte size gives a component index.
+            size_t highestComponent = *std::max_element(offsets.begin(), offsets.end()) /
+                                      (N * sizeof(float));
+            return std::max(swizzleWidth, highestComponent + 1);
+        };
+
+        // Renders a swizzled pointer as `(<base range>).<components>`.
+        auto SwizzlePtr = [&](const float* ptr, SkSpan<const uint16_t> offsets) {
+            std::string text = "(";
+            text += PtrCtx(ptr, SwizzleWidth(offsets));
+            text += ").";
+            text += SwizzleOffsetSpan(offsets);
+            return text;
+        };
+
+        // Interpret the context value as a Swizzle structure. Returns (destination, swizzled
+        // source); both are based on the same pointer.
+        auto SwizzleCtx = [&](ProgramOp op, const void* v) -> std::tuple<std::string, std::string> {
+            const auto* ctx = static_cast<const SkRasterPipeline_SwizzleCtx*>(v);
+            // `op` is a ProgramOp, so measure it against ProgramOp::swizzle_1 rather than the
+            // BuilderOp enumerator of the same name. The destination width is the op's distance
+            // from swizzle_1, plus one (assumes swizzle_1..swizzle_4 are consecutive).
+            int destSlots = (int)op - (int)ProgramOp::swizzle_1 + 1;
+
+            return std::make_tuple(PtrCtx(ctx->ptr, destSlots),
+                                   SwizzlePtr(ctx->ptr, SkSpan(ctx->offsets, destSlots)));
+        };
+
+        // Interpret the context value as a SwizzleCopy structure. Returns (swizzled destination,
+        // source).
+        auto SwizzleCopyCtx = [&](ProgramOp op,
+                                  const void* v) -> std::tuple<std::string, std::string> {
+            const auto* ctx = static_cast<const SkRasterPipeline_SwizzleCopyCtx*>(v);
+            // `op` is a ProgramOp, so measure it against the ProgramOp enumerator rather than the
+            // BuilderOp one of the same name. The slot count is the op's distance from
+            // swizzle_copy_slot_masked, plus one (assumes the 1..4-slot variants are consecutive).
+            int destSlots = (int)op - (int)ProgramOp::swizzle_copy_slot_masked + 1;
+
+            return std::make_tuple(SwizzlePtr(ctx->dst, SkSpan(ctx->offsets, destSlots)),
+                                   PtrCtx(ctx->src, destSlots));
+        };
+
+        // Interpret the context value as a Shuffle structure. Returns (destination, source),
+        // where the source is the destination followed by a bracketed, space-separated list of
+        // slot indices, e.g. `(dst)[2 0 1]`. An offset that is not a whole number of slots is
+        // shown as `?`. A shuffle with no entries produces an empty list, `(dst)[]`.
+        auto ShuffleCtx = [&](const void* v) -> std::tuple<std::string, std::string> {
+            const auto* ctx = static_cast<const SkRasterPipeline_ShuffleCtx*>(v);
+            // Offsets are in bytes; one slot occupies N floats.
+            const size_t slotStride = N * sizeof(float);
+
+            std::string dst = PtrCtx(ctx->ptr, ctx->count);
+            std::string src = "(" + dst + ")[";
+            // `entry` rather than `index`, to avoid shadowing the enclosing stage index.
+            for (int entry = 0; entry < ctx->count; ++entry) {
+                // Separate entries with a space; emitting it up front means the closing bracket
+                // never has to overwrite anything, so an empty list keeps its opening bracket.
+                if (entry > 0) {
+                    src.push_back(' ');
+                }
+                if (ctx->offsets[entry] % slotStride) {
+                    src.push_back('?');
+                } else {
+                    src += std::to_string(ctx->offsets[entry] / slotStride);
+                }
+            }
+            src.push_back(']');
+            return std::make_tuple(dst, src);
+        };
+
+        std::string opArg1, opArg2, opArg3, opSwizzle;
+        using POp = ProgramOp;
+        switch (stage.op) {
+            case POp::label:
+            case POp::invoke_shader:
+            case POp::invoke_color_filter:
+            case POp::invoke_blender:
+                opArg1 = ImmCtx(stage.ctx, /*showAsFloat=*/false);
+                break;
+
+            case POp::case_op: {
+                const auto* ctx = static_cast<SkRasterPipeline_CaseOpCtx*>(stage.ctx);
+                opArg1 = PtrCtx(ctx->ptr, 1);
+                opArg2 = PtrCtx(ctx->ptr + N, 1);
+                opArg3 = Imm(sk_bit_cast<float>(ctx->expectedValue), /*showAsFloat=*/false);
+                break;
+            }
+            case POp::swizzle_1:
+            case POp::swizzle_2:
+            case POp::swizzle_3:
+            case POp::swizzle_4:
+                std::tie(opArg1, opArg2) = SwizzleCtx(stage.op, stage.ctx);
+                break;
+
+            case POp::swizzle_copy_slot_masked:
+            case POp::swizzle_copy_2_slots_masked:
+            case POp::swizzle_copy_3_slots_masked:
+            case POp::swizzle_copy_4_slots_masked:
+                std::tie(opArg1, opArg2) = SwizzleCopyCtx(stage.op, stage.ctx);
+                break;
+
+            case POp::refract_4_floats:
+                std::tie(opArg1, opArg2) = AdjacentPtrCtx(stage.ctx, 4);
+                opArg3 = PtrCtx((const float*)(stage.ctx) + (8 * N), 1);
+                break;
+
+            case POp::dot_2_floats:
+                opArg1 = PtrCtx(stage.ctx, 1);
+                std::tie(opArg2, opArg3) = AdjacentPtrCtx(stage.ctx, 2);
+                break;
+
+            case POp::dot_3_floats:
+                opArg1 = PtrCtx(stage.ctx, 1);
+                std::tie(opArg2, opArg3) = AdjacentPtrCtx(stage.ctx, 3);
+                break;
+
+            case POp::dot_4_floats:
+                opArg1 = PtrCtx(stage.ctx, 1);
+                std::tie(opArg2, opArg3) = AdjacentPtrCtx(stage.ctx, 4);
+                break;
+
+            case POp::shuffle:
+                std::tie(opArg1, opArg2) = ShuffleCtx(stage.ctx);
+                break;
+
+            case POp::load_condition_mask:
+            case POp::store_condition_mask:
+            case POp::load_loop_mask:
+            case POp::store_loop_mask:
+            case POp::merge_loop_mask:
+            case POp::reenable_loop_mask:
+            case POp::load_return_mask:
+            case POp::store_return_mask:
+            case POp::zero_slot_unmasked:
+            case POp::bitwise_not_int:
+            case POp::cast_to_float_from_int: case POp::cast_to_float_from_uint:
+            case POp::cast_to_int_from_float: case POp::cast_to_uint_from_float:
+            case POp::abs_float:              case POp::abs_int:
+            case POp::acos_float:
+            case POp::asin_float:
+            case POp::atan_float:
+            case POp::ceil_float:
+            case POp::cos_float:
+            case POp::exp_float:
+            case POp::exp2_float:
+            case POp::log_float:
+            case POp::log2_float:
+            case POp::floor_float:
+            case POp::invsqrt_float:
+            case POp::sin_float:
+            case POp::sqrt_float:
+            case POp::tan_float:
+                opArg1 = PtrCtx(stage.ctx, 1);
+                break;
+
+            case POp::zero_2_slots_unmasked:
+            case POp::bitwise_not_2_ints:
+            case POp::load_src_rg:               case POp::store_src_rg:
+            case POp::cast_to_float_from_2_ints: case POp::cast_to_float_from_2_uints:
+            case POp::cast_to_int_from_2_floats: case POp::cast_to_uint_from_2_floats:
+            case POp::abs_2_floats:              case POp::abs_2_ints:
+            case POp::ceil_2_floats:
+            case POp::floor_2_floats:
+            case POp::invsqrt_2_floats:
+                opArg1 = PtrCtx(stage.ctx, 2);
+                break;
+
+            case POp::zero_3_slots_unmasked:
+            case POp::bitwise_not_3_ints:
+            case POp::cast_to_float_from_3_ints: case POp::cast_to_float_from_3_uints:
+            case POp::cast_to_int_from_3_floats: case POp::cast_to_uint_from_3_floats:
+            case POp::abs_3_floats:              case POp::abs_3_ints:
+            case POp::ceil_3_floats:
+            case POp::floor_3_floats:
+            case POp::invsqrt_3_floats:
+                opArg1 = PtrCtx(stage.ctx, 3);
+                break;
+
+            case POp::load_src:
+            case POp::load_dst:
+            case POp::store_src:
+            case POp::store_dst:
+            case POp::store_device_xy01:
+            case POp::zero_4_slots_unmasked:
+            case POp::bitwise_not_4_ints:
+            case POp::cast_to_float_from_4_ints: case POp::cast_to_float_from_4_uints:
+            case POp::cast_to_int_from_4_floats: case POp::cast_to_uint_from_4_floats:
+            case POp::abs_4_floats:              case POp::abs_4_ints:
+            case POp::ceil_4_floats:
+            case POp::floor_4_floats:
+            case POp::invsqrt_4_floats:
+            case POp::inverse_mat2:
+                opArg1 = PtrCtx(stage.ctx, 4);
+                break;
+
+            case POp::inverse_mat3:
+                opArg1 = PtrCtx(stage.ctx, 9);
+                break;
+
+            case POp::inverse_mat4:
+                opArg1 = PtrCtx(stage.ctx, 16);
+                break;
+
+
+            case POp::copy_constant:
+                std::tie(opArg1, opArg2) = CopyConstantCtx(stage.ctx, 1);
+                break;
+
+            case POp::copy_2_constants:
+                std::tie(opArg1, opArg2) = CopyConstantCtx(stage.ctx, 2);
+                break;
+
+            case POp::copy_3_constants:
+                std::tie(opArg1, opArg2) = CopyConstantCtx(stage.ctx, 3);
+                break;
+
+            case POp::copy_4_constants:
+                std::tie(opArg1, opArg2) = CopyConstantCtx(stage.ctx, 4);
+                break;
+
+            case POp::copy_slot_masked:
+            case POp::copy_slot_unmasked:
+                std::tie(opArg1, opArg2) = BinaryOpCtx(stage.ctx, 1);
+                break;
+
+            case POp::copy_2_slots_masked:
+            case POp::copy_2_slots_unmasked:
+                std::tie(opArg1, opArg2) = BinaryOpCtx(stage.ctx, 2);
+                break;
+
+            case POp::copy_3_slots_masked:
+            case POp::copy_3_slots_unmasked:
+                std::tie(opArg1, opArg2) = BinaryOpCtx(stage.ctx, 3);
+                break;
+
+            case POp::copy_4_slots_masked:
+            case POp::copy_4_slots_unmasked:
+                std::tie(opArg1, opArg2) = BinaryOpCtx(stage.ctx, 4);
+                break;
+
+            case POp::copy_from_indirect_unmasked:
+            case POp::copy_to_indirect_masked: {
+                const auto* ctx = static_cast<SkRasterPipeline_CopyIndirectCtx*>(stage.ctx);
+                // The indirect-limit is deliberately left out of the output; only dst, src and
+                // the indirect offset are shown.
+                opArg1 = PtrCtx(ctx->dst, ctx->slots);
+                opArg2 = PtrCtx(ctx->src, ctx->slots);
+                opArg3 = PtrCtx(ctx->indirectOffset, 1);
+                break;
+            }
+            case POp::copy_from_indirect_uniform_unmasked: {
+                const auto* ctx = static_cast<SkRasterPipeline_CopyIndirectCtx*>(stage.ctx);
+                opArg1 = PtrCtx(ctx->dst, ctx->slots);
+                opArg2 = UniformPtrCtx(ctx->src, ctx->slots);
+                opArg3 = PtrCtx(ctx->indirectOffset, 1);
+                break;
+            }
+            case POp::swizzle_copy_to_indirect_masked: {
+                const auto* ctx = static_cast<SkRasterPipeline_SwizzleCopyIndirectCtx*>(stage.ctx);
+                opArg1 = PtrCtx(ctx->dst, SwizzleWidth(SkSpan(ctx->offsets, ctx->slots)));
+                opArg2 = PtrCtx(ctx->src, ctx->slots);
+                opArg3 = PtrCtx(ctx->indirectOffset, 1);
+                opSwizzle = SwizzleOffsetSpan(SkSpan(ctx->offsets, ctx->slots));
+                break;
+            }
+            case POp::merge_condition_mask:
+            case POp::add_float:   case POp::add_int:
+            case POp::sub_float:   case POp::sub_int:
+            case POp::mul_float:   case POp::mul_int:
+            case POp::div_float:   case POp::div_int:   case POp::div_uint:
+                                   case POp::bitwise_and_int:
+                                   case POp::bitwise_or_int:
+                                   case POp::bitwise_xor_int:
+            case POp::mod_float:
+            case POp::min_float:   case POp::min_int:   case POp::min_uint:
+            case POp::max_float:   case POp::max_int:   case POp::max_uint:
+            case POp::cmplt_float: case POp::cmplt_int: case POp::cmplt_uint:
+            case POp::cmple_float: case POp::cmple_int: case POp::cmple_uint:
+            case POp::cmpeq_float: case POp::cmpeq_int:
+            case POp::cmpne_float: case POp::cmpne_int:
+                std::tie(opArg1, opArg2) = AdjacentPtrCtx(stage.ctx, 1);
+                break;
+
+            case POp::mix_float:   case POp::mix_int:
+                std::tie(opArg1, opArg2, opArg3) = Adjacent3PtrCtx(stage.ctx, 1);
+                break;
+
+            case POp::add_2_floats:   case POp::add_2_ints:
+            case POp::sub_2_floats:   case POp::sub_2_ints:
+            case POp::mul_2_floats:   case POp::mul_2_ints:
+            case POp::div_2_floats:   case POp::div_2_ints:   case POp::div_2_uints:
+                                      case POp::bitwise_and_2_ints:
+                                      case POp::bitwise_or_2_ints:
+                                      case POp::bitwise_xor_2_ints:
+            case POp::mod_2_floats:
+            case POp::min_2_floats:   case POp::min_2_ints:   case POp::min_2_uints:
+            case POp::max_2_floats:   case POp::max_2_ints:   case POp::max_2_uints:
+            case POp::cmplt_2_floats: case POp::cmplt_2_ints: case POp::cmplt_2_uints:
+            case POp::cmple_2_floats: case POp::cmple_2_ints: case POp::cmple_2_uints:
+            case POp::cmpeq_2_floats: case POp::cmpeq_2_ints:
+            case POp::cmpne_2_floats: case POp::cmpne_2_ints:
+                std::tie(opArg1, opArg2) = AdjacentPtrCtx(stage.ctx, 2);
+                break;
+
+            case POp::mix_2_floats:   case POp::mix_2_ints:
+                std::tie(opArg1, opArg2, opArg3) = Adjacent3PtrCtx(stage.ctx, 2);
+                break;
+
+            case POp::add_3_floats:   case POp::add_3_ints:
+            case POp::sub_3_floats:   case POp::sub_3_ints:
+            case POp::mul_3_floats:   case POp::mul_3_ints:
+            case POp::div_3_floats:   case POp::div_3_ints:   case POp::div_3_uints:
+                                      case POp::bitwise_and_3_ints:
+                                      case POp::bitwise_or_3_ints:
+                                      case POp::bitwise_xor_3_ints:
+            case POp::mod_3_floats:
+            case POp::min_3_floats:   case POp::min_3_ints:   case POp::min_3_uints:
+            case POp::max_3_floats:   case POp::max_3_ints:   case POp::max_3_uints:
+            case POp::cmplt_3_floats: case POp::cmplt_3_ints: case POp::cmplt_3_uints:
+            case POp::cmple_3_floats: case POp::cmple_3_ints: case POp::cmple_3_uints:
+            case POp::cmpeq_3_floats: case POp::cmpeq_3_ints:
+            case POp::cmpne_3_floats: case POp::cmpne_3_ints:
+                std::tie(opArg1, opArg2) = AdjacentPtrCtx(stage.ctx, 3);
+                break;
+
+            case POp::mix_3_floats:   case POp::mix_3_ints:
+                std::tie(opArg1, opArg2, opArg3) = Adjacent3PtrCtx(stage.ctx, 3);
+                break;
+
+            case POp::add_4_floats:   case POp::add_4_ints:
+            case POp::sub_4_floats:   case POp::sub_4_ints:
+            case POp::mul_4_floats:   case POp::mul_4_ints:
+            case POp::div_4_floats:   case POp::div_4_ints:   case POp::div_4_uints:
+                                      case POp::bitwise_and_4_ints:
+                                      case POp::bitwise_or_4_ints:
+                                      case POp::bitwise_xor_4_ints:
+            case POp::mod_4_floats:
+            case POp::min_4_floats:   case POp::min_4_ints:   case POp::min_4_uints:
+            case POp::max_4_floats:   case POp::max_4_ints:   case POp::max_4_uints:
+            case POp::cmplt_4_floats: case POp::cmplt_4_ints: case POp::cmplt_4_uints:
+            case POp::cmple_4_floats: case POp::cmple_4_ints: case POp::cmple_4_uints:
+            case POp::cmpeq_4_floats: case POp::cmpeq_4_ints:
+            case POp::cmpne_4_floats: case POp::cmpne_4_ints:
+                std::tie(opArg1, opArg2) = AdjacentPtrCtx(stage.ctx, 4);
+                break;
+
+            case POp::mix_4_floats:   case POp::mix_4_ints:
+                std::tie(opArg1, opArg2, opArg3) = Adjacent3PtrCtx(stage.ctx, 4);
+                break;
+
+            case POp::add_n_floats:   case POp::add_n_ints:
+            case POp::sub_n_floats:   case POp::sub_n_ints:
+            case POp::mul_n_floats:   case POp::mul_n_ints:
+            case POp::div_n_floats:   case POp::div_n_ints:   case POp::div_n_uints:
+                                      case POp::bitwise_and_n_ints:
+                                      case POp::bitwise_or_n_ints:
+                                      case POp::bitwise_xor_n_ints:
+            case POp::mod_n_floats:
+            case POp::min_n_floats:   case POp::min_n_ints:   case POp::min_n_uints:
+            case POp::max_n_floats:   case POp::max_n_ints:   case POp::max_n_uints:
+            case POp::cmplt_n_floats: case POp::cmplt_n_ints: case POp::cmplt_n_uints:
+            case POp::cmple_n_floats: case POp::cmple_n_ints: case POp::cmple_n_uints:
+            case POp::cmpeq_n_floats: case POp::cmpeq_n_ints:
+            case POp::cmpne_n_floats: case POp::cmpne_n_ints:
+            case POp::atan2_n_floats:
+            case POp::pow_n_floats:
+                std::tie(opArg1, opArg2) = AdjacentBinaryOpCtx(stage.ctx);
+                break;
+
+            case POp::mix_n_floats:        case POp::mix_n_ints:
+            case POp::smoothstep_n_floats:
+                std::tie(opArg1, opArg2, opArg3) = AdjacentTernaryOpCtx(stage.ctx);
+                break;
+
+            case POp::jump:
+            case POp::branch_if_all_lanes_active:
+            case POp::branch_if_any_lanes_active:
+            case POp::branch_if_no_lanes_active:
+                opArg1 = BranchOffset(static_cast<SkRasterPipeline_BranchCtx*>(stage.ctx));
+                break;
+
+            case POp::branch_if_no_active_lanes_eq: {
+                const auto* ctx = static_cast<SkRasterPipeline_BranchIfEqualCtx*>(stage.ctx);
+                opArg1 = BranchOffset(ctx);
+                opArg2 = PtrCtx(ctx->ptr, 1);
+                opArg3 = Imm(sk_bit_cast<float>(ctx->value));
+                break;
+            }
+            default:
+                break;
+        }
+
+        std::string_view opName;
+        switch (stage.op) {
+        #define M(x) case POp::x: opName = #x; break;
+            SK_RASTER_PIPELINE_OPS_ALL(M)
+        #undef M
+            case POp::label:                   opName = "label";                   break;
+            case POp::invoke_shader:           opName = "invoke_shader";           break;
+            case POp::invoke_color_filter:     opName = "invoke_color_filter";     break;
+            case POp::invoke_blender:          opName = "invoke_blender";          break;
+            case POp::invoke_to_linear_srgb:   opName = "invoke_to_linear_srgb";   break;
+            case POp::invoke_from_linear_srgb: opName = "invoke_from_linear_srgb"; break;
+        }
+
+        std::string opText;
+        switch (stage.op) {
+            case POp::init_lane_masks:
+                opText = "CondMask = LoopMask = RetMask = true";
+                break;
+
+            case POp::load_condition_mask:
+                opText = "CondMask = " + opArg1;
+                break;
+
+            case POp::store_condition_mask:
+                opText = opArg1 + " = CondMask";
+                break;
+
+            case POp::merge_condition_mask:
+                opText = "CondMask = " + opArg1 + " & " + opArg2;
+                break;
+
+            case POp::load_loop_mask:
+                opText = "LoopMask = " + opArg1;
+                break;
+
+            case POp::store_loop_mask:
+                opText = opArg1 + " = LoopMask";
+                break;
+
+            case POp::mask_off_loop_mask:
+                opText = "LoopMask &= ~(CondMask & LoopMask & RetMask)";
+                break;
+
+            case POp::reenable_loop_mask:
+                opText = "LoopMask |= " + opArg1;
+                break;
+
+            case POp::merge_loop_mask:
+                opText = "LoopMask &= " + opArg1;
+                break;
+
+            case POp::load_return_mask:
+                opText = "RetMask = " + opArg1;
+                break;
+
+            case POp::store_return_mask:
+                opText = opArg1 + " = RetMask";
+                break;
+
+            case POp::mask_off_return_mask:
+                opText = "RetMask &= ~(CondMask & LoopMask & RetMask)";
+                break;
+
+            case POp::store_src_rg:
+                opText = opArg1 + " = src.rg";
+                break;
+
+            case POp::store_src:
+                opText = opArg1 + " = src.rgba";
+                break;
+
+            case POp::store_dst:
+                opText = opArg1 + " = dst.rgba";
+                break;
+
+            case POp::store_device_xy01:
+                opText = opArg1 + " = DeviceCoords.xy01";
+                break;
+
+            case POp::load_src_rg:
+                opText = "src.rg = " + opArg1;
+                break;
+
+            case POp::load_src:
+                opText = "src.rgba = " + opArg1;
+                break;
+
+            case POp::load_dst:
+                opText = "dst.rgba = " + opArg1;
+                break;
+
+            case POp::bitwise_and_int:
+            case POp::bitwise_and_2_ints:
+            case POp::bitwise_and_3_ints:
+            case POp::bitwise_and_4_ints:
+            case POp::bitwise_and_n_ints:
+                opText = opArg1 + " &= " + opArg2;
+                break;
+
+            case POp::bitwise_or_int:
+            case POp::bitwise_or_2_ints:
+            case POp::bitwise_or_3_ints:
+            case POp::bitwise_or_4_ints:
+            case POp::bitwise_or_n_ints:
+                opText = opArg1 + " |= " + opArg2;
+                break;
+
+            case POp::bitwise_xor_int:
+            case POp::bitwise_xor_2_ints:
+            case POp::bitwise_xor_3_ints:
+            case POp::bitwise_xor_4_ints:
+            case POp::bitwise_xor_n_ints:
+                opText = opArg1 + " ^= " + opArg2;
+                break;
+
+            case POp::bitwise_not_int:
+            case POp::bitwise_not_2_ints:
+            case POp::bitwise_not_3_ints:
+            case POp::bitwise_not_4_ints:
+                opText = opArg1 + " = ~" + opArg1;
+                break;
+
+            case POp::cast_to_float_from_int:
+            case POp::cast_to_float_from_2_ints:
+            case POp::cast_to_float_from_3_ints:
+            case POp::cast_to_float_from_4_ints:
+                opText = opArg1 + " = IntToFloat(" + opArg1 + ")";
+                break;
+
+            case POp::cast_to_float_from_uint:
+            case POp::cast_to_float_from_2_uints:
+            case POp::cast_to_float_from_3_uints:
+            case POp::cast_to_float_from_4_uints:
+                opText = opArg1 + " = UintToFloat(" + opArg1 + ")";
+                break;
+
+            case POp::cast_to_int_from_float:
+            case POp::cast_to_int_from_2_floats:
+            case POp::cast_to_int_from_3_floats:
+            case POp::cast_to_int_from_4_floats:
+                opText = opArg1 + " = FloatToInt(" + opArg1 + ")";
+                break;
+
+            case POp::cast_to_uint_from_float:
+            case POp::cast_to_uint_from_2_floats:
+            case POp::cast_to_uint_from_3_floats:
+            case POp::cast_to_uint_from_4_floats:
+                opText = opArg1 + " = FloatToUint(" + opArg1 + ")";
+                break;
+
+            case POp::copy_slot_masked:            case POp::copy_2_slots_masked:
+            case POp::copy_3_slots_masked:         case POp::copy_4_slots_masked:
+            case POp::swizzle_copy_slot_masked:    case POp::swizzle_copy_2_slots_masked:
+            case POp::swizzle_copy_3_slots_masked: case POp::swizzle_copy_4_slots_masked:
+                opText = opArg1 + " = Mask(" + opArg2 + ")";
+                break;
+
+            case POp::copy_constant:               case POp::copy_2_constants:
+            case POp::copy_3_constants:            case POp::copy_4_constants:
+            case POp::copy_slot_unmasked:          case POp::copy_2_slots_unmasked:
+            case POp::copy_3_slots_unmasked:       case POp::copy_4_slots_unmasked:
+            case POp::swizzle_1:                   case POp::swizzle_2:
+            case POp::swizzle_3:                   case POp::swizzle_4:
+            case POp::shuffle:
+                opText = opArg1 + " = " + opArg2;
+                break;
+
+            case POp::copy_from_indirect_unmasked:
+            case POp::copy_from_indirect_uniform_unmasked:
+                opText = opArg1 + " = Indirect(" + opArg2 + " + " + opArg3 + ")";
+                break;
+
+            case POp::copy_to_indirect_masked:
+                opText = "Indirect(" + opArg1 + " + " + opArg3 + ") = Mask(" + opArg2 + ")";
+                break;
+
+            case POp::swizzle_copy_to_indirect_masked:
+                opText = "Indirect(" + opArg1 + " + " + opArg3 + ")." + opSwizzle + " = Mask(" +
+                         opArg2 + ")";
+                break;
+
+            case POp::zero_slot_unmasked:    case POp::zero_2_slots_unmasked:
+            case POp::zero_3_slots_unmasked: case POp::zero_4_slots_unmasked:
+                opText = opArg1 + " = 0";
+                break;
+
+            case POp::abs_float:    case POp::abs_int:
+            case POp::abs_2_floats: case POp::abs_2_ints:
+            case POp::abs_3_floats: case POp::abs_3_ints:
+            case POp::abs_4_floats: case POp::abs_4_ints:
+                opText = opArg1 + " = abs(" + opArg1 + ")";
+                break;
+
+            case POp::acos_float:
+                opText = opArg1 + " = acos(" + opArg1 + ")";
+                break;
+
+            case POp::asin_float:
+                opText = opArg1 + " = asin(" + opArg1 + ")";
+                break;
+
+            case POp::atan_float:
+                opText = opArg1 + " = atan(" + opArg1 + ")";
+                break;
+
+            case POp::atan2_n_floats:
+                opText = opArg1 + " = atan2(" + opArg1 + ", " + opArg2 + ")";
+                break;
+
+            case POp::ceil_float:
+            case POp::ceil_2_floats:
+            case POp::ceil_3_floats:
+            case POp::ceil_4_floats:
+                opText = opArg1 + " = ceil(" + opArg1 + ")";
+                break;
+
+            case POp::cos_float:
+                opText = opArg1 + " = cos(" + opArg1 + ")";
+                break;
+
+            case POp::refract_4_floats:
+                opText = opArg1 + " = refract(" + opArg1 + ", " + opArg2 + ", " + opArg3 + ")";
+                break;
+
+            case POp::dot_2_floats:
+            case POp::dot_3_floats:
+            case POp::dot_4_floats:
+                opText = opArg1 + " = dot(" + opArg2 + ", " + opArg3 + ")";
+                break;
+
+            case POp::exp_float:
+                opText = opArg1 + " = exp(" + opArg1 + ")";
+                break;
+
+            case POp::exp2_float:
+                opText = opArg1 + " = exp2(" + opArg1 + ")";
+                break;
+
+            case POp::log_float:
+                opText = opArg1 + " = log(" + opArg1 + ")";
+                break;
+
+            case POp::log2_float:
+                opText = opArg1 + " = log2(" + opArg1 + ")";
+                break;
+
+            case POp::pow_n_floats:
+                opText = opArg1 + " = pow(" + opArg1 + ", " + opArg2 + ")";
+                break;
+
+            case POp::sin_float:
+                opText = opArg1 + " = sin(" + opArg1 + ")";
+                break;
+
+            case POp::sqrt_float:
+                opText = opArg1 + " = sqrt(" + opArg1 + ")";
+                break;
+
+            case POp::tan_float:
+                opText = opArg1 + " = tan(" + opArg1 + ")";
+                break;
+
+            case POp::floor_float:
+            case POp::floor_2_floats:
+            case POp::floor_3_floats:
+            case POp::floor_4_floats:
+                opText = opArg1 + " = floor(" + opArg1 + ")";
+                break;
+
+            case POp::invsqrt_float:
+            case POp::invsqrt_2_floats:
+            case POp::invsqrt_3_floats:
+            case POp::invsqrt_4_floats:
+                opText = opArg1 + " = inversesqrt(" + opArg1 + ")";
+                break;
+
+            case POp::inverse_mat2:
+            case POp::inverse_mat3:
+            case POp::inverse_mat4:
+                opText = opArg1 + " = inverse(" + opArg1 + ")";
+                break;
+
+            case POp::add_float:    case POp::add_int:
+            case POp::add_2_floats: case POp::add_2_ints:
+            case POp::add_3_floats: case POp::add_3_ints:
+            case POp::add_4_floats: case POp::add_4_ints:
+            case POp::add_n_floats: case POp::add_n_ints:
+                opText = opArg1 + " += " + opArg2;
+                break;
+
+            case POp::sub_float:    case POp::sub_int:
+            case POp::sub_2_floats: case POp::sub_2_ints:
+            case POp::sub_3_floats: case POp::sub_3_ints:
+            case POp::sub_4_floats: case POp::sub_4_ints:
+            case POp::sub_n_floats: case POp::sub_n_ints:
+                opText = opArg1 + " -= " + opArg2;
+                break;
+
+            case POp::mul_float:    case POp::mul_int:
+            case POp::mul_2_floats: case POp::mul_2_ints:
+            case POp::mul_3_floats: case POp::mul_3_ints:
+            case POp::mul_4_floats: case POp::mul_4_ints:
+            case POp::mul_n_floats: case POp::mul_n_ints:
+                opText = opArg1 + " *= " + opArg2;
+                break;
+
+            case POp::div_float:    case POp::div_int:    case POp::div_uint:
+            case POp::div_2_floats: case POp::div_2_ints: case POp::div_2_uints:
+            case POp::div_3_floats: case POp::div_3_ints: case POp::div_3_uints:
+            case POp::div_4_floats: case POp::div_4_ints: case POp::div_4_uints:
+            case POp::div_n_floats: case POp::div_n_ints: case POp::div_n_uints:
+                opText = opArg1 + " /= " + opArg2;
+                break;
+
+            case POp::mod_float:
+            case POp::mod_2_floats:
+            case POp::mod_3_floats:
+            case POp::mod_4_floats:
+            case POp::mod_n_floats:
+                opText = opArg1 + " = mod(" + opArg1 + ", " + opArg2 + ")";
+                break;
+
+            case POp::min_float:    case POp::min_int:    case POp::min_uint:
+            case POp::min_2_floats: case POp::min_2_ints: case POp::min_2_uints:
+            case POp::min_3_floats: case POp::min_3_ints: case POp::min_3_uints:
+            case POp::min_4_floats: case POp::min_4_ints: case POp::min_4_uints:
+            case POp::min_n_floats: case POp::min_n_ints: case POp::min_n_uints:
+                opText = opArg1 + " = min(" + opArg1 + ", " + opArg2 + ")";
+                break;
+
+            case POp::max_float:    case POp::max_int:    case POp::max_uint:
+            case POp::max_2_floats: case POp::max_2_ints: case POp::max_2_uints:
+            case POp::max_3_floats: case POp::max_3_ints: case POp::max_3_uints:
+            case POp::max_4_floats: case POp::max_4_ints: case POp::max_4_uints:
+            case POp::max_n_floats: case POp::max_n_ints: case POp::max_n_uints:
+                opText = opArg1 + " = max(" + opArg1 + ", " + opArg2 + ")";
+                break;
+
+            case POp::cmplt_float:    case POp::cmplt_int:    case POp::cmplt_uint:
+            case POp::cmplt_2_floats: case POp::cmplt_2_ints: case POp::cmplt_2_uints:
+            case POp::cmplt_3_floats: case POp::cmplt_3_ints: case POp::cmplt_3_uints:
+            case POp::cmplt_4_floats: case POp::cmplt_4_ints: case POp::cmplt_4_uints:
+            case POp::cmplt_n_floats: case POp::cmplt_n_ints: case POp::cmplt_n_uints:
+                opText = opArg1 + " = lessThan(" + opArg1 + ", " + opArg2 + ")";
+                break;
+
+            case POp::cmple_float:    case POp::cmple_int:    case POp::cmple_uint:
+            case POp::cmple_2_floats: case POp::cmple_2_ints: case POp::cmple_2_uints:
+            case POp::cmple_3_floats: case POp::cmple_3_ints: case POp::cmple_3_uints:
+            case POp::cmple_4_floats: case POp::cmple_4_ints: case POp::cmple_4_uints:
+            case POp::cmple_n_floats: case POp::cmple_n_ints: case POp::cmple_n_uints:
+                opText = opArg1 + " = lessThanEqual(" + opArg1 + ", " + opArg2 + ")";
+                break;
+
+            case POp::cmpeq_float:    case POp::cmpeq_int:
+            case POp::cmpeq_2_floats: case POp::cmpeq_2_ints:
+            case POp::cmpeq_3_floats: case POp::cmpeq_3_ints:
+            case POp::cmpeq_4_floats: case POp::cmpeq_4_ints:
+            case POp::cmpeq_n_floats: case POp::cmpeq_n_ints:
+                opText = opArg1 + " = equal(" + opArg1 + ", " + opArg2 + ")";
+                break;
+
+            case POp::cmpne_float:    case POp::cmpne_int:
+            case POp::cmpne_2_floats: case POp::cmpne_2_ints:
+            case POp::cmpne_3_floats: case POp::cmpne_3_ints:
+            case POp::cmpne_4_floats: case POp::cmpne_4_ints:
+            case POp::cmpne_n_floats: case POp::cmpne_n_ints:
+                opText = opArg1 + " = notEqual(" + opArg1 + ", " + opArg2 + ")";
+                break;
+
+            case POp::mix_float:      case POp::mix_int:
+            case POp::mix_2_floats:   case POp::mix_2_ints:
+            case POp::mix_3_floats:   case POp::mix_3_ints:
+            case POp::mix_4_floats:   case POp::mix_4_ints:
+            case POp::mix_n_floats:   case POp::mix_n_ints:
+                opText = opArg1 + " = mix(" + opArg2 + ", " + opArg3 + ", " + opArg1 + ")";
+                break;
+
+            case POp::smoothstep_n_floats:
+                opText = opArg1 + " = smoothstep(" + opArg1 + ", " + opArg2 + ", " + opArg3 + ")";
+                break;
+
+            case POp::jump:
+            case POp::branch_if_all_lanes_active:
+            case POp::branch_if_any_lanes_active:
+            case POp::branch_if_no_lanes_active:
+            case POp::invoke_shader:
+            case POp::invoke_color_filter:
+            case POp::invoke_blender:
+                opText = std::string(opName) + " " + opArg1;
+                break;
+
+            case POp::invoke_to_linear_srgb:
+                opText = "src.rgba = toLinearSrgb(src.rgba)";
+                break;
+
+            case POp::invoke_from_linear_srgb:
+                opText = "src.rgba = fromLinearSrgb(src.rgba)";
+                break;
+
+            case POp::branch_if_no_active_lanes_eq:
+                opText = "branch " + opArg1 + " if no lanes of " + opArg2 + " == " + opArg3;
+                break;
+
+            case POp::label:
+                opText = "label " + opArg1;
+                break;
+
+            case POp::case_op: {
+                opText = "if (" + opArg1 + " == " + opArg3 +
+                         ") { LoopMask = true; " + opArg2 + " = false; }";
+                break;
+            }
+            default:
+                break;
+        }
+
+        opName = opName.substr(0, 30);
+        if (!opText.empty()) {
+            out->writeText(SkSL::String::printf("% 5d. %-30.*s %s\n",
+                                                index + 1,
+                                                (int)opName.size(), opName.data(),
+                                                opText.c_str()).c_str());
+        } else {
+            out->writeText(SkSL::String::printf("% 5d. %.*s\n",
+                                                index + 1,
+                                                (int)opName.size(), opName.data()).c_str());
+        }
+    }
+}
+
+}  // namespace RP
+}  // namespace SkSL