diff --git a/gfx/wr/swgl/src/swgl_ext.h b/gfx/wr/swgl/src/swgl_ext.h
new file mode 100644
index 0000000000..164ca8d7b1
--- /dev/null
+++ b/gfx/wr/swgl/src/swgl_ext.h
@@ -0,0 +1,1906 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// When using a solid color with clip masking, the cost of loading the clip mask
+// in the blend stage exceeds the cost of processing the color. Here we handle
+// the entire span of clip mask texture before the blend stage to more
+// efficiently process it and modulate it with color without incurring blend
+// stage overheads.
+template <typename P, typename C>
+static void commit_masked_solid_span(P* buf, C color, int len) {
+ override_clip_mask();
+ uint8_t* mask = get_clip_mask(buf);
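+  // Each iteration handles one 4-pixel chunk, loading 4 clip-mask bytes (one
+  // per pixel) and modulating the solid color with them before blending.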
+ for (P* end = &buf[len]; buf < end; buf += 4, mask += 4) {
+ commit_span(
+ buf,
+ blend_span(
+ buf,
+ applyColor(expand_mask(buf, unpack(unaligned_load<PackedR8>(mask))),
+ color)));
+ }
+ restore_clip_mask();
+}
+
+// When using a solid color with anti-aliasing, most of the solid span will not
+// benefit from anti-aliasing in the opaque region. We only want to apply the AA
+// blend stage in the non-opaque start and end of the span where AA is needed.
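+// The opaque region boundaries are rounded up to 4-pixel chunk multiples
+// (the (x + 3) & ~3 below) so that each sub-span commits whole chunks.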
+template <typename P, typename R>
+static ALWAYS_INLINE void commit_aa_solid_span(P* buf, R r, int len) {
+ if (int start = min((get_aa_opaque_start(buf) + 3) & ~3, len)) {
+ commit_solid_span<true>(buf, r, start);
+ buf += start;
+ len -= start;
+ }
+ if (int opaque = min((get_aa_opaque_size(buf) + 3) & ~3, len)) {
+ override_aa();
+ commit_solid_span<true>(buf, r, opaque);
+ restore_aa();
+ buf += opaque;
+ len -= opaque;
+ }
+ if (len > 0) {
+ commit_solid_span<true>(buf, r, len);
+ }
+}
+
+// Forces a value with vector run-class to have scalar run-class.
+template <typename T>
+static ALWAYS_INLINE auto swgl_forceScalar(T v) -> decltype(force_scalar(v)) {
+ return force_scalar(v);
+}
+
+// Advance all varying interpolants by a single chunk
+#define swgl_stepInterp() step_interp_inputs()
+
+// Pseudo-intrinsic that accesses the interpolation step for a given varying
+#define swgl_interpStep(v) (interp_step.v)
+
+// Commit an entire span of a solid color. This dispatches to clip-masked and
+// anti-aliased fast-paths as appropriate.
+#define swgl_commitSolid(format, v, n) \
+ do { \
+ int len = (n); \
+ if (blend_key) { \
+ if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) { \
+ commit_masked_solid_span(swgl_Out##format, \
+ packColor(swgl_Out##format, (v)), len); \
+ } else if (swgl_ClipFlags & SWGL_CLIP_FLAG_AA) { \
+ commit_aa_solid_span(swgl_Out##format, \
+ pack_span(swgl_Out##format, (v)), len); \
+ } else { \
+ commit_solid_span<true>(swgl_Out##format, \
+ pack_span(swgl_Out##format, (v)), len); \
+ } \
+ } else { \
+ commit_solid_span<false>(swgl_Out##format, \
+ pack_span(swgl_Out##format, (v)), len); \
+ } \
+ swgl_Out##format += len; \
+ swgl_SpanLength -= len; \
+ } while (0)
+#define swgl_commitSolidRGBA8(v) swgl_commitSolid(RGBA8, v, swgl_SpanLength)
+#define swgl_commitSolidR8(v) swgl_commitSolid(R8, v, swgl_SpanLength)
+#define swgl_commitPartialSolidRGBA8(len, v) \
+ swgl_commitSolid(RGBA8, v, min(int(len), swgl_SpanLength))
+#define swgl_commitPartialSolidR8(len, v) \
+ swgl_commitSolid(R8, v, min(int(len), swgl_SpanLength))
+
+#define swgl_commitChunk(format, chunk) \
+ do { \
+ auto r = chunk; \
+ if (blend_key) r = blend_span(swgl_Out##format, r); \
+ commit_span(swgl_Out##format, r); \
+ swgl_Out##format += swgl_StepSize; \
+ swgl_SpanLength -= swgl_StepSize; \
+ } while (0)
+
+// Commit a single chunk of a color
+#define swgl_commitColor(format, color) \
+ swgl_commitChunk(format, pack_pixels_##format(color))
+#define swgl_commitColorRGBA8(color) swgl_commitColor(RGBA8, color)
+#define swgl_commitColorR8(color) swgl_commitColor(R8, color)
+
+template <typename S>
+static ALWAYS_INLINE bool swgl_isTextureLinear(S s) {
+ return s->filter == TextureFilter::LINEAR;
+}
+
+template <typename S>
+static ALWAYS_INLINE bool swgl_isTextureRGBA8(S s) {
+ return s->format == TextureFormat::RGBA8;
+}
+
+template <typename S>
+static ALWAYS_INLINE bool swgl_isTextureR8(S s) {
+ return s->format == TextureFormat::R8;
+}
+
+// Use the default linear quantization scale of 128. This gives 7 bits of
+// fractional precision, which when multiplied with a signed 9 bit value
+// still fits in a 16 bit integer.
+const int swgl_LinearQuantizeScale = 128;
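+// For example (illustrative): a UV coordinate of 3.5 texels quantizes to
+// 3.5 * 128 = 448, where 448 >> 7 recovers the texel index 3 and the low
+// 7 bits hold the fraction 0.5 as 64.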
+
+// Quantizes UVs for access into a linear texture.
+template <typename S, typename T>
+static ALWAYS_INLINE T swgl_linearQuantize(S s, T p) {
+ return linearQuantize(p, swgl_LinearQuantizeScale, s);
+}
+
+// Quantizes an interpolation step for UVs for access into a linear texture.
+template <typename S, typename T>
+static ALWAYS_INLINE T swgl_linearQuantizeStep(S s, T p) {
+ return samplerScale(s, p) * swgl_LinearQuantizeScale;
+}
+
+template <typename S>
+static ALWAYS_INLINE WideRGBA8 textureLinearUnpacked(UNUSED uint32_t* buf,
+ S sampler, ivec2 i) {
+ return textureLinearUnpackedRGBA8(sampler, i);
+}
+
+template <typename S>
+static ALWAYS_INLINE WideR8 textureLinearUnpacked(UNUSED uint8_t* buf,
+ S sampler, ivec2 i) {
+ return textureLinearUnpackedR8(sampler, i);
+}
+
+template <typename S>
+static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint32_t* buf) {
+ return swgl_isTextureRGBA8(s);
+}
+
+template <typename S>
+static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint8_t* buf) {
+ return swgl_isTextureR8(s);
+}
+
+// Quantizes the UVs to the 2^7 scale needed for calculating fractional offsets
+// for linear sampling.
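+// Note that the macro quantizes uv in place and declares uv_step, min_uv, and
+// max_uv in the enclosing scope for the code that follows it.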
+#define LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv) \
+ uv = swgl_linearQuantize(sampler, uv); \
+ vec2_scalar uv_step = \
+ float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; \
+ vec2_scalar min_uv = max( \
+ swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f); \
+ vec2_scalar max_uv = \
+ max(swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), \
+ min_uv);
+
+// Implements the fallback linear filter that can deal with clamping and
+// arbitrary scales.
+template <bool BLEND, typename S, typename C, typename P>
+static P* blendTextureLinearFallback(S sampler, vec2 uv, int span,
+ vec2_scalar uv_step, vec2_scalar min_uv,
+ vec2_scalar max_uv, C color, P* buf) {
+ for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
+ commit_blend_span<BLEND>(
+ buf, applyColor(textureLinearUnpacked(buf, sampler,
+ ivec2(clamp(uv, min_uv, max_uv))),
+ color));
+ }
+ return buf;
+}
+
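+// Reinterpret a chunk of 16-bit components as one lane per pixel (64-bit for
+// RGBA, 16-bit for R8) so that SHUFFLE moves whole pixels, not components.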
+static ALWAYS_INLINE U64 castForShuffle(V16<int16_t> r) {
+ return bit_cast<U64>(r);
+}
+static ALWAYS_INLINE U16 castForShuffle(V4<int16_t> r) {
+ return bit_cast<U16>(r);
+}
+
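+// Multiply each pixel by its per-pixel X fraction. For RGBA the fraction is
+// broadcast across the 4 components of each pixel; for R8 it applies directly.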
+static ALWAYS_INLINE V16<int16_t> applyFracX(V16<int16_t> r, I16 fracx) {
+ return r * fracx.xxxxyyyyzzzzwwww;
+}
+static ALWAYS_INLINE V4<int16_t> applyFracX(V4<int16_t> r, I16 fracx) {
+ return r * fracx;
+}
+
+// Implements a faster linear filter that works with axis-aligned constant Y and
+// scales of less than 1, i.e. upscaling. In this case we can optimize for the
+// constant Y fraction as well as load all chunks from memory in a single tap
+// for each row.
+template <bool BLEND, typename S, typename C, typename P>
+static void blendTextureLinearUpscale(S sampler, vec2 uv, int span,
+ vec2_scalar uv_step, vec2_scalar min_uv,
+ vec2_scalar max_uv, C color, P* buf) {
+ typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
+ typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
+ typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type;
+
+ ivec2 i(clamp(uv, min_uv, max_uv));
+ ivec2 frac = i;
+ i >>= 7;
+ P* row0 = (P*)sampler->buf + computeRow(sampler, ivec2_scalar(0, i.y.x));
+ P* row1 = row0 + computeNextRowOffset(sampler, ivec2_scalar(0, i.y.x));
+ I16 fracx = computeFracX(sampler, i, frac);
+ int16_t fracy = computeFracY(frac).x;
+ auto src0 =
+ CONVERT(unaligned_load<packed_type>(&row0[i.x.x]), signed_unpacked_type);
+ auto src1 =
+ CONVERT(unaligned_load<packed_type>(&row1[i.x.x]), signed_unpacked_type);
+ auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7));
+
+ // We attempt to sample ahead by one chunk and interpolate it with the current
+ // one. However, due to the complication of upscaling, we may not necessarily
+  // shift in the entire next set of samples.
+ for (P* end = buf + span; buf < end; buf += 4) {
+ uv.x += uv_step.x;
+ I32 ixn = cast(uv.x);
+ I16 fracn = computeFracNoClamp(ixn);
+ ixn >>= 7;
+ auto src0n = CONVERT(unaligned_load<packed_type>(&row0[ixn.x]),
+ signed_unpacked_type);
+ auto src1n = CONVERT(unaligned_load<packed_type>(&row1[ixn.x]),
+ signed_unpacked_type);
+ auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7));
+
+ // Since we're upscaling, we know that a source pixel has a larger footprint
+ // than the destination pixel, and thus all the source pixels needed for
+ // this chunk will fall within a single chunk of texture data. However,
+ // since the source pixels don't map 1:1 with destination pixels, we need to
+ // shift the source pixels over based on their offset from the start of the
+    // chunk. This could conceivably be optimized better with PSHUFB or VTBL
+    // instructions. However, since PSHUFB requires SSSE3, we instead resort
+    // to masking in the correct pixels to avoid having to index into memory.
+ // For the last sample to interpolate with, we need to potentially shift in
+ // a sample from the next chunk over in the case the samples fill out an
+ // entire chunk.
+ auto shuf = src;
+ auto shufn = SHUFFLE(src, ixn.x == i.x.w ? srcn.yyyy : srcn, 1, 2, 3, 4);
+ if (i.x.y == i.x.x) {
+ shuf = shuf.xxyz;
+ shufn = shufn.xxyz;
+ }
+ if (i.x.z == i.x.y) {
+ shuf = shuf.xyyz;
+ shufn = shufn.xyyz;
+ }
+ if (i.x.w == i.x.z) {
+ shuf = shuf.xyzz;
+ shufn = shufn.xyzz;
+ }
+
+ // Convert back to a signed unpacked type so that we can interpolate the
+ // final result.
+ auto interp = bit_cast<signed_unpacked_type>(shuf);
+ auto interpn = bit_cast<signed_unpacked_type>(shufn);
+ interp += applyFracX(interpn - interp, fracx) >> 7;
+
+ commit_blend_span<BLEND>(
+ buf, applyColor(bit_cast<unpacked_type>(interp), color));
+
+ i.x = ixn;
+ fracx = fracn;
+ src = srcn;
+ }
+}
+
+// This is the fastest variant of the linear filter that still provides
+// filtering. In cases where there is no scaling required, but we have a
+// subpixel offset that forces us to blend in neighboring pixels, we can
+// optimize away most of the memory loads and shuffling that is required by the
+// fallback filter.
+template <bool BLEND, typename S, typename C, typename P>
+static void blendTextureLinearFast(S sampler, vec2 uv, int span,
+ vec2_scalar min_uv, vec2_scalar max_uv,
+ C color, P* buf) {
+ typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
+ typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
+ typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type;
+
+ ivec2 i(clamp(uv, min_uv, max_uv));
+ ivec2 frac = i;
+ i >>= 7;
+ P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i));
+ P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i));
+ int16_t fracx = computeFracX(sampler, i, frac).x;
+ int16_t fracy = computeFracY(frac).x;
+ auto src0 = CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
+ auto src1 = CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
+ auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7));
+
+ // Since there is no scaling, we sample ahead by one chunk and interpolate it
+ // with the current one. We can then reuse this value on the next iteration.
+ for (P* end = buf + span; buf < end; buf += 4) {
+ row0 += 4;
+ row1 += 4;
+ auto src0n =
+ CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
+ auto src1n =
+ CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
+ auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7));
+
+ // For the last sample to interpolate with, we need to potentially shift in
+ // a sample from the next chunk over since the samples fill out an entire
+ // chunk.
+ auto interp = bit_cast<signed_unpacked_type>(src);
+ auto interpn =
+ bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 1, 2, 3, 4));
+ interp += ((interpn - interp) * fracx) >> 7;
+
+ commit_blend_span<BLEND>(
+ buf, applyColor(bit_cast<unpacked_type>(interp), color));
+
+ src = srcn;
+ }
+}
+
+// Implements a faster linear filter that works with axis-aligned constant Y
+// while downscaling the texture by half. In this case we can optimize for the
+// constant X/Y fractions and reduction factor while minimizing shuffling.
+template <bool BLEND, typename S, typename C, typename P>
+static NO_INLINE void blendTextureLinearDownscale(S sampler, vec2 uv, int span,
+ vec2_scalar min_uv,
+ vec2_scalar max_uv, C color,
+ P* buf) {
+ typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
+ typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
+ typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type;
+
+ ivec2 i(clamp(uv, min_uv, max_uv));
+ ivec2 frac = i;
+ i >>= 7;
+ P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i));
+ P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i));
+ int16_t fracx = computeFracX(sampler, i, frac).x;
+ int16_t fracy = computeFracY(frac).x;
+
+ for (P* end = buf + span; buf < end; buf += 4) {
+ auto src0 =
+ CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
+ auto src1 =
+ CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
+ auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7));
+ row0 += 4;
+ row1 += 4;
+ auto src0n =
+ CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
+ auto src1n =
+ CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
+ auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7));
+ row0 += 4;
+ row1 += 4;
+
+ auto interp =
+ bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 0, 2, 4, 6));
+ auto interpn =
+ bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 1, 3, 5, 7));
+ interp += ((interpn - interp) * fracx) >> 7;
+
+ commit_blend_span<BLEND>(
+ buf, applyColor(bit_cast<unpacked_type>(interp), color));
+ }
+}
+
+enum LinearFilter {
+ // No linear filter is needed.
+ LINEAR_FILTER_NEAREST = 0,
+ // The most general linear filter that handles clamping and varying scales.
+ LINEAR_FILTER_FALLBACK,
+ // A linear filter optimized for axis-aligned upscaling.
+ LINEAR_FILTER_UPSCALE,
+ // A linear filter with no scaling but with subpixel offset.
+ LINEAR_FILTER_FAST,
+ // A linear filter optimized for 2x axis-aligned downscaling.
+ LINEAR_FILTER_DOWNSCALE
+};
+
+// Dispatches to an appropriate linear filter depending on the selected filter.
+template <bool BLEND, typename S, typename C, typename P>
+static P* blendTextureLinearDispatch(S sampler, vec2 uv, int span,
+ vec2_scalar uv_step, vec2_scalar min_uv,
+ vec2_scalar max_uv, C color, P* buf,
+ LinearFilter filter) {
+ P* end = buf + span;
+ if (filter != LINEAR_FILTER_FALLBACK) {
+ // If we're not using the fallback, then Y is constant across the entire
+ // row. We just need to ensure that we handle any samples that might pull
+ // data from before the start of the row and require clamping.
+ float beforeDist = max(0.0f, min_uv.x) - uv.x.x;
+ if (beforeDist > 0) {
+ int before = clamp(int(ceil(beforeDist / uv_step.x)) * swgl_StepSize, 0,
+ int(end - buf));
+ buf = blendTextureLinearFallback<BLEND>(sampler, uv, before, uv_step,
+ min_uv, max_uv, color, buf);
+ uv.x += (before / swgl_StepSize) * uv_step.x;
+ }
+ // We need to check how many samples we can take from inside the row without
+ // requiring clamping. In case the filter oversamples the row by a step, we
+ // subtract off a step from the width to leave some room.
+ float insideDist =
+ min(max_uv.x, float((int(sampler->width) - swgl_StepSize) *
+ swgl_LinearQuantizeScale)) -
+ uv.x.x;
+ if (uv_step.x > 0.0f && insideDist >= uv_step.x) {
+ int32_t inside = int(end - buf);
+ if (filter == LINEAR_FILTER_DOWNSCALE) {
+ inside = min(int(insideDist * (0.5f / swgl_LinearQuantizeScale)) &
+ ~(swgl_StepSize - 1),
+ inside);
+ if (inside > 0) {
+ blendTextureLinearDownscale<BLEND>(sampler, uv, inside, min_uv,
+ max_uv, color, buf);
+ buf += inside;
+ uv.x += (inside / swgl_StepSize) * uv_step.x;
+ }
+ } else if (filter == LINEAR_FILTER_UPSCALE) {
+ inside = min(int(insideDist / uv_step.x) * swgl_StepSize, inside);
+ if (inside > 0) {
+ blendTextureLinearUpscale<BLEND>(sampler, uv, inside, uv_step, min_uv,
+ max_uv, color, buf);
+ buf += inside;
+ uv.x += (inside / swgl_StepSize) * uv_step.x;
+ }
+ } else {
+ inside = min(int(insideDist * (1.0f / swgl_LinearQuantizeScale)) &
+ ~(swgl_StepSize - 1),
+ inside);
+ if (inside > 0) {
+ blendTextureLinearFast<BLEND>(sampler, uv, inside, min_uv, max_uv,
+ color, buf);
+ buf += inside;
+ uv.x += (inside / swgl_StepSize) * uv_step.x;
+ }
+ }
+ }
+ }
+ // If the fallback filter was requested, or if there are any samples left that
+// may be outside the row and require clamping, then handle that here.
+ if (buf < end) {
+ buf = blendTextureLinearFallback<BLEND>(
+ sampler, uv, int(end - buf), uv_step, min_uv, max_uv, color, buf);
+ }
+ return buf;
+}
+
+// Helper function to quantize UVs for linear filtering before dispatch
+template <bool BLEND, typename S, typename C, typename P>
+static inline int blendTextureLinear(S sampler, vec2 uv, int span,
+ const vec4_scalar& uv_rect, C color,
+ P* buf, LinearFilter filter) {
+ if (!matchTextureFormat(sampler, buf)) {
+ return 0;
+ }
+ LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv);
+ blendTextureLinearDispatch<BLEND>(sampler, uv, span, uv_step, min_uv, max_uv,
+ color, buf, filter);
+ return span;
+}
+
+// Samples an axis-aligned span on a single row of a texture using 1:1
+// nearest filtering. Sampling is constrained to only fall within the given UV
+// bounds. This requires a pointer to the destination buffer. An optional color
+// modulus can be supplied.
+template <bool BLEND, typename S, typename C, typename P>
+static int blendTextureNearestFast(S sampler, vec2 uv, int span,
+ const vec4_scalar& uv_rect, C color,
+ P* buf) {
+ if (!matchTextureFormat(sampler, buf)) {
+ return 0;
+ }
+
+ typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
+
+ ivec2_scalar i = make_ivec2(samplerScale(sampler, force_scalar(uv)));
+ ivec2_scalar minUV =
+ make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y}));
+ ivec2_scalar maxUV =
+ make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w}));
+
+ // Calculate the row pointer within the buffer, clamping to within valid row
+ // bounds.
+ P* row =
+ &((P*)sampler
+ ->buf)[clampCoord(clamp(i.y, minUV.y, maxUV.y), sampler->height) *
+ sampler->stride];
+ // Find clamped X bounds within the row.
+ int minX = clamp(minUV.x, 0, sampler->width - 1);
+ int maxX = clamp(maxUV.x, minX, sampler->width - 1);
+ int curX = i.x;
+ int endX = i.x + span;
+ // If we need to start sampling below the valid sample bounds, then we need to
+ // fill this section with a constant clamped sample.
+ if (curX < minX) {
+ int n = min(minX, endX) - curX;
+ auto src =
+ applyColor(unpack(bit_cast<packed_type>(V4<P>(row[minX]))), color);
+ commit_solid_span<BLEND>(buf, src, n);
+ buf += n;
+ curX += n;
+ }
+ // Here we only deal with valid samples within the sample bounds. No clamping
+ // should occur here within these inner loops.
+ int n = max(min(maxX + 1, endX) - curX, 0);
+ // Try to process as many chunks as possible with full loads and stores.
+ for (int end = curX + (n & ~3); curX < end; curX += 4, buf += 4) {
+ auto src = applyColor(unaligned_load<packed_type>(&row[curX]), color);
+ commit_blend_span<BLEND>(buf, src);
+ }
+ n &= 3;
+ // If we have any leftover samples after processing chunks, use partial loads
+ // and stores.
+ if (n > 0) {
+ auto src = applyColor(partial_load_span<packed_type>(&row[curX], n), color);
+ commit_blend_span<BLEND>(buf, src, n);
+ buf += n;
+ curX += n;
+ }
+ // If we still have samples left above the valid sample bounds, then we again
+ // need to fill this section with a constant clamped sample.
+ if (curX < endX) {
+ auto src =
+ applyColor(unpack(bit_cast<packed_type>(V4<P>(row[maxX]))), color);
+ commit_solid_span<BLEND>(buf, src, endX - curX);
+ }
+ return span;
+}
+
+// We need to verify that the pixel step reasonably approximates stepping by a
+// single texel for every pixel we need to reproduce. Try to ensure that the
+// margin of error is no more than approximately 2^-7. Also, we check here if
+// the scaling can be quantized for acceleration.
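+// As an illustration: the span is padded to the next multiple of 128, so a
+// 1:1 step gives scaled == span and returns 0 (no scaling), an exact 2x
+// downscale gives scaled == span * 2 and returns 2, and anything else gives 1.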
+template <typename T>
+static ALWAYS_INLINE int spanNeedsScale(int span, T P) {
+ span &= ~(128 - 1);
+ span += 128;
+ int scaled = round((P.x.y - P.x.x) * span);
+ return scaled != span ? (scaled == span * 2 ? 2 : 1) : 0;
+}
+
+// Helper function to decide whether we can safely apply 1:1 nearest filtering
+// without diverging too much from the linear filter.
+template <typename S, typename T>
+static inline LinearFilter needsTextureLinear(S sampler, T P, int span) {
+ // If each row is not wide enough for linear filtering, then just use nearest
+ // filtering.
+ if (sampler->width < 2) {
+ return LINEAR_FILTER_NEAREST;
+ }
+  // First, verify that the row Y doesn't change across samples.
+ if (P.y.x != P.y.y) {
+ return LINEAR_FILTER_FALLBACK;
+ }
+ P = samplerScale(sampler, P);
+ if (int scale = spanNeedsScale(span, P)) {
+ // If the source region is not flipped and smaller than the destination,
+ // then we can use the upscaling filter since row Y is constant.
+ return P.x.x < P.x.y && P.x.y - P.x.x <= 1
+ ? LINEAR_FILTER_UPSCALE
+ : (scale == 2 ? LINEAR_FILTER_DOWNSCALE
+ : LINEAR_FILTER_FALLBACK);
+ }
+ // Also verify that we're reasonably close to the center of a texel
+ // so that it doesn't look that much different than if a linear filter
+ // was used.
+ if ((int(P.x.x * 4.0f + 0.5f) & 3) != 2 ||
+ (int(P.y.x * 4.0f + 0.5f) & 3) != 2) {
+ // The source and destination regions are the same, but there is a
+ // significant subpixel offset. We can use a faster linear filter to deal
+ // with the offset in this case.
+ return LINEAR_FILTER_FAST;
+ }
+ // Otherwise, we have a constant 1:1 step and we're stepping reasonably close
+ // to the center of each pixel, so it's safe to disable the linear filter and
+ // use nearest.
+ return LINEAR_FILTER_NEAREST;
+}
+
+// Commit an entire span with linear filtering
+#define swgl_commitTextureLinear(format, s, p, uv_rect, color, n) \
+ do { \
+ auto packed_color = packColor(swgl_Out##format, color); \
+ int len = (n); \
+ int drawn = 0; \
+ if (LinearFilter filter = needsTextureLinear(s, p, len)) { \
+ if (blend_key) { \
+ drawn = blendTextureLinear<true>(s, p, len, uv_rect, packed_color, \
+ swgl_Out##format, filter); \
+ } else { \
+ drawn = blendTextureLinear<false>(s, p, len, uv_rect, packed_color, \
+ swgl_Out##format, filter); \
+ } \
+ } else if (blend_key) { \
+ drawn = blendTextureNearestFast<true>(s, p, len, uv_rect, packed_color, \
+ swgl_Out##format); \
+ } else { \
+ drawn = blendTextureNearestFast<false>(s, p, len, uv_rect, packed_color, \
+ swgl_Out##format); \
+ } \
+ swgl_Out##format += drawn; \
+ swgl_SpanLength -= drawn; \
+ } while (0)
+#define swgl_commitTextureLinearRGBA8(s, p, uv_rect) \
+ swgl_commitTextureLinear(RGBA8, s, p, uv_rect, NoColor(), swgl_SpanLength)
+#define swgl_commitTextureLinearR8(s, p, uv_rect) \
+ swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(), swgl_SpanLength)
+
+// Commit a partial span with linear filtering, optionally inverting the color
+#define swgl_commitPartialTextureLinearR8(len, s, p, uv_rect) \
+ swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(), \
+ min(int(len), swgl_SpanLength))
+#define swgl_commitPartialTextureLinearInvertR8(len, s, p, uv_rect) \
+ swgl_commitTextureLinear(R8, s, p, uv_rect, InvertColor(), \
+ min(int(len), swgl_SpanLength))
+
+// Commit an entire span with linear filtering that is scaled by a color
+#define swgl_commitTextureLinearColorRGBA8(s, p, uv_rect, color) \
+ swgl_commitTextureLinear(RGBA8, s, p, uv_rect, color, swgl_SpanLength)
+#define swgl_commitTextureLinearColorR8(s, p, uv_rect, color) \
+ swgl_commitTextureLinear(R8, s, p, uv_rect, color, swgl_SpanLength)
+
+// Helper function that samples from an R8 texture while expanding it to support
+// a differing framebuffer format.
+template <bool BLEND, typename S, typename C, typename P>
+static inline int blendTextureLinearR8(S sampler, vec2 uv, int span,
+ const vec4_scalar& uv_rect, C color,
+ P* buf) {
+ if (!swgl_isTextureR8(sampler) || sampler->width < 2) {
+ return 0;
+ }
+ LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv);
+ for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
+ commit_blend_span<BLEND>(
+ buf, applyColor(expand_mask(buf, textureLinearUnpackedR8(
+ sampler,
+ ivec2(clamp(uv, min_uv, max_uv)))),
+ color));
+ }
+ return span;
+}
+
+// Commit an entire span with linear filtering while expanding from R8 to RGBA8
+#define swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, color) \
+ do { \
+ auto packed_color = packColor(swgl_OutRGBA8, color); \
+ int drawn = 0; \
+ if (blend_key) { \
+ drawn = blendTextureLinearR8<true>(s, p, swgl_SpanLength, uv_rect, \
+ packed_color, swgl_OutRGBA8); \
+ } else { \
+ drawn = blendTextureLinearR8<false>(s, p, swgl_SpanLength, uv_rect, \
+ packed_color, swgl_OutRGBA8); \
+ } \
+ swgl_OutRGBA8 += drawn; \
+ swgl_SpanLength -= drawn; \
+ } while (0)
+#define swgl_commitTextureLinearR8ToRGBA8(s, p, uv_rect) \
+ swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, NoColor())
+
+// Compute repeating UVs, possibly constrained by tile repeat limits
+static inline vec2 tileRepeatUV(vec2 uv, const vec2_scalar& tile_repeat) {
+ if (tile_repeat.x > 0.0f) {
+ // Clamp to a number slightly less than the tile repeat limit so that
+ // it results in a number close to but not equal to 1 after fract().
+ // This avoids fract() yielding 0 if the limit was left as whole integer.
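+    // E.g. (illustrative) with tile_repeat.x == 2.0, a uv.x of 2.5 clamps to
+    // just under 2.0, so fract() yields ~0.999999 rather than wrapping to 0.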
+ uv = clamp(uv, vec2_scalar(0.0f), tile_repeat - 1.0e-6f);
+ }
+ return fract(uv);
+}
+
+// Compute the number of non-repeating steps before we need to potentially
+// repeat the UVs.
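+// For example (illustrative): UV taps spanning 0.8..0.95 with a step of 0.05
+// and no tile limit give limit = 1.0, so 4 steps fit before wrapping.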
+static inline int computeNoRepeatSteps(Float uv, float uv_step,
+ float tile_repeat, int steps) {
+ if (uv.w < uv.x) {
+ // Ensure the UV taps are ordered low to high.
+ uv = uv.wzyx;
+ }
+ // Check if the samples cross the boundary of the next whole integer or the
+ // tile repeat limit, whichever is lower.
+ float limit = floor(uv.x) + 1.0f;
+ if (tile_repeat > 0.0f) {
+ limit = min(limit, tile_repeat);
+ }
+ return uv.x >= 0.0f && uv.w < limit
+ ? (uv_step != 0.0f
+ ? int(clamp((limit - uv.x) / uv_step, 0.0f, float(steps)))
+ : steps)
+ : 0;
+}
+
+// Blends an entire span of texture with linear filtering and repeating UVs.
+template <bool BLEND, typename S, typename C, typename P>
+static int blendTextureLinearRepeat(S sampler, vec2 uv, int span,
+ const vec2_scalar& tile_repeat,
+ const vec4_scalar& uv_repeat,
+ const vec4_scalar& uv_rect, C color,
+ P* buf) {
+ if (!matchTextureFormat(sampler, buf)) {
+ return 0;
+ }
+ vec2_scalar uv_scale = {uv_repeat.z - uv_repeat.x, uv_repeat.w - uv_repeat.y};
+ vec2_scalar uv_offset = {uv_repeat.x, uv_repeat.y};
+ // Choose a linear filter to use for no-repeat sub-spans
+ LinearFilter filter =
+ needsTextureLinear(sampler, uv * uv_scale + uv_offset, span);
+ // We need to step UVs unscaled and unquantized so that we can modulo them
+ // with fract. We use uv_scale and uv_offset to map them into the correct
+ // range.
+ vec2_scalar uv_step =
+ float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x};
+ uv_scale = swgl_linearQuantizeStep(sampler, uv_scale);
+ uv_offset = swgl_linearQuantize(sampler, uv_offset);
+ vec2_scalar min_uv = max(
+ swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f);
+ vec2_scalar max_uv = max(
+ swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), min_uv);
+ for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
+ int steps = int(end - buf) / swgl_StepSize;
+ // Find the sub-span before UVs repeat to avoid expensive repeat math
+ steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps);
+ if (steps > 0) {
+ steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps);
+ if (steps > 0) {
+ buf = blendTextureLinearDispatch<BLEND>(
+ sampler, fract(uv) * uv_scale + uv_offset, steps * swgl_StepSize,
+ uv_step * uv_scale, min_uv, max_uv, color, buf, filter);
+ if (buf >= end) {
+ break;
+ }
+ uv += steps * uv_step;
+ }
+ }
+ // UVs might repeat within this step, so explicitly compute repeated UVs
+ vec2 repeated_uv = clamp(
+ tileRepeatUV(uv, tile_repeat) * uv_scale + uv_offset, min_uv, max_uv);
+ commit_blend_span<BLEND>(
+ buf, applyColor(textureLinearUnpacked(buf, sampler, ivec2(repeated_uv)),
+ color));
+ }
+ return span;
+}
+
+// Commit an entire span with linear filtering and repeating UVs
+#define swgl_commitTextureLinearRepeat(format, s, p, tile_repeat, uv_repeat, \
+ uv_rect, color) \
+ do { \
+ auto packed_color = packColor(swgl_Out##format, color); \
+ int drawn = 0; \
+ if (blend_key) { \
+ drawn = blendTextureLinearRepeat<true>(s, p, swgl_SpanLength, \
+ tile_repeat, uv_repeat, uv_rect, \
+ packed_color, swgl_Out##format); \
+ } else { \
+ drawn = blendTextureLinearRepeat<false>(s, p, swgl_SpanLength, \
+ tile_repeat, uv_repeat, uv_rect, \
+ packed_color, swgl_Out##format); \
+ } \
+ swgl_Out##format += drawn; \
+ swgl_SpanLength -= drawn; \
+ } while (0)
+#define swgl_commitTextureLinearRepeatRGBA8(s, p, tile_repeat, uv_repeat, \
+ uv_rect) \
+ swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \
+ NoColor())
+#define swgl_commitTextureLinearRepeatColorRGBA8(s, p, tile_repeat, uv_repeat, \
+ uv_rect, color) \
+ swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \
+ color)
+
+template <typename S>
+static ALWAYS_INLINE PackedRGBA8 textureNearestPacked(UNUSED uint32_t* buf,
+ S sampler, ivec2 i) {
+ return textureNearestPackedRGBA8(sampler, i);
+}
+
+// Blends an entire span of texture with nearest filtering and either
+// repeated or clamped UVs.
+template <bool BLEND, bool REPEAT, typename S, typename C, typename P>
+static int blendTextureNearestRepeat(S sampler, vec2 uv, int span,
+ const vec2_scalar& tile_repeat,
+ const vec4_scalar& uv_rect, C color,
+ P* buf) {
+ if (!matchTextureFormat(sampler, buf)) {
+ return 0;
+ }
+ if (!REPEAT) {
+ // If clamping, then we step pre-scaled to the sampler. For repeat modes,
+ // this will be accomplished via uv_scale instead.
+ uv = samplerScale(sampler, uv);
+ }
+ vec2_scalar uv_step =
+ float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x};
+ vec2_scalar min_uv = samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y});
+ vec2_scalar max_uv = samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w});
+ vec2_scalar uv_scale = max_uv - min_uv;
+ // If the effective sampling area of this texture is only a single pixel, then
+ // treat it as a solid span. For repeat modes, the bounds are specified on
+ // pixel boundaries, whereas for clamp modes, bounds are on pixel centers, so
+  // the test varies depending on the mode. If the sample range on an axis is
+ // greater than one pixel, we can still check if we don't move far enough from
+ // the pixel center on that axis to hit the next pixel.
+ if ((int(min_uv.x) + (REPEAT ? 1 : 0) >= int(max_uv.x) ||
+ (abs(uv_step.x) * span * (REPEAT ? uv_scale.x : 1.0f) < 0.5f)) &&
+ (int(min_uv.y) + (REPEAT ? 1 : 0) >= int(max_uv.y) ||
+ (abs(uv_step.y) * span * (REPEAT ? uv_scale.y : 1.0f) < 0.5f))) {
+ vec2 repeated_uv = REPEAT
+ ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv
+ : clamp(uv, min_uv, max_uv);
+ commit_solid_span<BLEND>(buf,
+ applyColor(unpack(textureNearestPacked(
+ buf, sampler, ivec2(repeated_uv))),
+ color),
+ span);
+ } else {
+ for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
+ if (REPEAT) {
+ int steps = int(end - buf) / swgl_StepSize;
+ // Find the sub-span before UVs repeat to avoid expensive repeat math
+ steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps);
+ if (steps > 0) {
+ steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps);
+ if (steps > 0) {
+ vec2 inside_uv = fract(uv) * uv_scale + min_uv;
+ vec2 inside_step = uv_step * uv_scale;
+ for (P* outside = &buf[steps * swgl_StepSize]; buf < outside;
+ buf += swgl_StepSize, inside_uv += inside_step) {
+ commit_blend_span<BLEND>(
+ buf, applyColor(
+ textureNearestPacked(buf, sampler, ivec2(inside_uv)),
+ color));
+ }
+ if (buf >= end) {
+ break;
+ }
+ uv += steps * uv_step;
+ }
+ }
+ }
+
+ // UVs might repeat within this step, so explicitly compute repeated UVs
+ vec2 repeated_uv = REPEAT
+ ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv
+ : clamp(uv, min_uv, max_uv);
+ commit_blend_span<BLEND>(
+ buf,
+ applyColor(textureNearestPacked(buf, sampler, ivec2(repeated_uv)),
+ color));
+ }
+ }
+ return span;
+}
+
+// Determine if we can use the fast nearest filter for the given nearest mode.
+// If the Y coordinate varies more than half a pixel over the span (which
+// might cause the texel to alias to the next one), or the span needs X
+// scaling, then we have to use the fallback.
+template <typename S, typename T>
+static ALWAYS_INLINE bool needsNearestFallback(S sampler, T P, int span) {
+ P = samplerScale(sampler, P);
+ return (P.y.y - P.y.x) * span >= 0.5f || spanNeedsScale(span, P);
+}
+
+// Commit an entire span with nearest filtering and either clamped or repeating
+// UVs
+#define swgl_commitTextureNearest(format, s, p, uv_rect, color) \
+ do { \
+ auto packed_color = packColor(swgl_Out##format, color); \
+ int drawn = 0; \
+ if (needsNearestFallback(s, p, swgl_SpanLength)) { \
+ if (blend_key) { \
+ drawn = blendTextureNearestRepeat<true, false>( \
+ s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \
+ swgl_Out##format); \
+ } else { \
+ drawn = blendTextureNearestRepeat<false, false>( \
+ s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \
+ swgl_Out##format); \
+ } \
+ } else if (blend_key) { \
+ drawn = blendTextureNearestFast<true>(s, p, swgl_SpanLength, uv_rect, \
+ packed_color, swgl_Out##format); \
+ } else { \
+ drawn = blendTextureNearestFast<false>(s, p, swgl_SpanLength, uv_rect, \
+ packed_color, swgl_Out##format); \
+ } \
+ swgl_Out##format += drawn; \
+ swgl_SpanLength -= drawn; \
+ } while (0)
+#define swgl_commitTextureNearestRGBA8(s, p, uv_rect) \
+ swgl_commitTextureNearest(RGBA8, s, p, uv_rect, NoColor())
+#define swgl_commitTextureNearestColorRGBA8(s, p, uv_rect, color) \
+ swgl_commitTextureNearest(RGBA8, s, p, uv_rect, color)
+
+#define swgl_commitTextureNearestRepeat(format, s, p, tile_repeat, uv_rect, \
+ color) \
+ do { \
+ auto packed_color = packColor(swgl_Out##format, color); \
+ int drawn = 0; \
+ if (blend_key) { \
+ drawn = blendTextureNearestRepeat<true, true>( \
+ s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color, \
+ swgl_Out##format); \
+ } else { \
+ drawn = blendTextureNearestRepeat<false, true>( \
+ s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color, \
+ swgl_Out##format); \
+ } \
+ swgl_Out##format += drawn; \
+ swgl_SpanLength -= drawn; \
+ } while (0)
+#define swgl_commitTextureNearestRepeatRGBA8(s, p, tile_repeat, uv_repeat, \
+ uv_rect) \
+ swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, \
+ NoColor())
+#define swgl_commitTextureNearestRepeatColorRGBA8(s, p, tile_repeat, \
+ uv_repeat, uv_rect, color) \
+ swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, color)
+
+// Commit an entire span of texture with filtering determined by sampler state.
+#define swgl_commitTexture(format, s, ...) \
+ do { \
+ if (s->filter == TextureFilter::LINEAR) { \
+ swgl_commitTextureLinear##format(s, __VA_ARGS__); \
+ } else { \
+ swgl_commitTextureNearest##format(s, __VA_ARGS__); \
+ } \
+ } while (0)
+#define swgl_commitTextureRGBA8(...) swgl_commitTexture(RGBA8, __VA_ARGS__)
+#define swgl_commitTextureColorRGBA8(...) \
+ swgl_commitTexture(ColorRGBA8, __VA_ARGS__)
+#define swgl_commitTextureRepeatRGBA8(...) \
+ swgl_commitTexture(RepeatRGBA8, __VA_ARGS__)
+#define swgl_commitTextureRepeatColorRGBA8(...) \
+ swgl_commitTexture(RepeatColorRGBA8, __VA_ARGS__)
+
+// Commit an entire span of a separable pass of a Gaussian blur that falls
+// within the given radius scaled by supplied coefficients, clamped to uv_rect
+// bounds.
+template <bool BLEND, typename S, typename P>
+static int blendGaussianBlur(S sampler, vec2 uv, const vec4_scalar& uv_rect,
+ P* buf, int span, bool hori, int radius,
+ vec2_scalar coeffs) {
+ if (!matchTextureFormat(sampler, buf)) {
+ return 0;
+ }
+ vec2_scalar size = {float(sampler->width), float(sampler->height)};
+ ivec2_scalar curUV = make_ivec2(force_scalar(uv) * size);
+ ivec4_scalar bounds = make_ivec4(uv_rect * make_vec4(size, size));
+ int startX = curUV.x;
+ int endX = min(min(bounds.z, curUV.x + span), int(size.x));
+ if (hori) {
+ for (; curUV.x + swgl_StepSize <= endX;
+ buf += swgl_StepSize, curUV.x += swgl_StepSize) {
+ commit_blend_span<BLEND>(
+ buf, gaussianBlurHorizontal<P>(sampler, curUV, bounds.x, bounds.z,
+ radius, coeffs.x, coeffs.y));
+ }
+ } else {
+ for (; curUV.x + swgl_StepSize <= endX;
+ buf += swgl_StepSize, curUV.x += swgl_StepSize) {
+ commit_blend_span<BLEND>(
+ buf, gaussianBlurVertical<P>(sampler, curUV, bounds.y, bounds.w,
+ radius, coeffs.x, coeffs.y));
+ }
+ }
+ return curUV.x - startX;
+}
+
+#define swgl_commitGaussianBlur(format, s, p, uv_rect, hori, radius, coeffs) \
+ do { \
+ int drawn = 0; \
+ if (blend_key) { \
+ drawn = blendGaussianBlur<true>(s, p, uv_rect, swgl_Out##format, \
+ swgl_SpanLength, hori, radius, coeffs); \
+ } else { \
+ drawn = blendGaussianBlur<false>(s, p, uv_rect, swgl_Out##format, \
+ swgl_SpanLength, hori, radius, coeffs); \
+ } \
+ swgl_Out##format += drawn; \
+ swgl_SpanLength -= drawn; \
+ } while (0)
+#define swgl_commitGaussianBlurRGBA8(s, p, uv_rect, hori, radius, coeffs) \
+ swgl_commitGaussianBlur(RGBA8, s, p, uv_rect, hori, radius, coeffs)
+#define swgl_commitGaussianBlurR8(s, p, uv_rect, hori, radius, coeffs) \
+ swgl_commitGaussianBlur(R8, s, p, uv_rect, hori, radius, coeffs)
+
+// Convert and pack planar YUV samples to RGB output using a color space
+static ALWAYS_INLINE PackedRGBA8 convertYUV(const YUVMatrix& rgb_from_ycbcr,
+ U16 y, U16 u, U16 v) {
+ auto yy = V8<int16_t>(zip(y, y));
+ auto uv = V8<int16_t>(zip(u, v));
+ return rgb_from_ycbcr.convert(yy, uv);
+}
+
+// Helper functions to sample from planar YUV textures before converting to RGB
+template <typename S0>
+static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0,
+ const YUVMatrix& rgb_from_ycbcr,
+ UNUSED int rescaleFactor) {
+ switch (sampler0->format) {
+ case TextureFormat::RGBA8: {
+ auto planar = textureLinearPlanarRGBA8(sampler0, uv0);
+ return convertYUV(rgb_from_ycbcr, highHalf(planar.rg), lowHalf(planar.rg),
+ lowHalf(planar.ba));
+ }
+ case TextureFormat::YUV422: {
+ auto planar = textureLinearPlanarYUV422(sampler0, uv0);
+ return convertYUV(rgb_from_ycbcr, planar.y, planar.u, planar.v);
+ }
+ default:
+ assert(false);
+ return PackedRGBA8(0);
+ }
+}
+
+template <bool BLEND, typename S0, typename P, typename C = NoColor>
+static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0,
+ const vec4_scalar& uv_rect0, const vec3_scalar& ycbcr_bias,
+ const mat3_scalar& rgb_from_debiased_ycbcr,
+ int rescaleFactor, C color = C()) {
+ if (!swgl_isTextureLinear(sampler0)) {
+ return 0;
+ }
+ LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
+ const auto rgb_from_ycbcr =
+ YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor);
+ auto c = packColor(buf, color);
+ auto* end = buf + span;
+ for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0) {
+ commit_blend_span<BLEND>(
+ buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)),
+ rgb_from_ycbcr, rescaleFactor),
+ c));
+ }
+ return span;
+}
+
+template <typename S0, typename S1>
+static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1,
+ ivec2 uv1,
+ const YUVMatrix& rgb_from_ycbcr,
+ int rescaleFactor) {
+ switch (sampler1->format) {
+ case TextureFormat::RG8: {
+ assert(sampler0->format == TextureFormat::R8);
+ auto y = textureLinearUnpackedR8(sampler0, uv0);
+ auto planar = textureLinearPlanarRG8(sampler1, uv1);
+ return convertYUV(rgb_from_ycbcr, y, lowHalf(planar.rg),
+ highHalf(planar.rg));
+ }
+ case TextureFormat::RGBA8: {
+ assert(sampler0->format == TextureFormat::R8);
+ auto y = textureLinearUnpackedR8(sampler0, uv0);
+ auto planar = textureLinearPlanarRGBA8(sampler1, uv1);
+ return convertYUV(rgb_from_ycbcr, y, lowHalf(planar.ba),
+ highHalf(planar.rg));
+ }
+ case TextureFormat::RG16: {
+ assert(sampler0->format == TextureFormat::R16);
+ // The rescaling factor represents how many bits to add to renormalize the
+ // texture to 16 bits, and so the color depth is actually 16 minus the
+ // rescaling factor.
+      // Need to right shift the sample by the number of bits over 8 it
+ // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit
+ // of precision at the low end already, hence 1 is subtracted from the
+ // color depth.
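+      // For instance (illustrative): 10-bit video has rescaleFactor = 6, so
+      // colorDepth = 10 and rescaleBits = (10 - 1) - 8 = 1.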
+ int colorDepth = 16 - rescaleFactor;
+ int rescaleBits = (colorDepth - 1) - 8;
+ auto y = textureLinearUnpackedR16(sampler0, uv0) >> rescaleBits;
+ auto uv = textureLinearUnpackedRG16(sampler1, uv1) >> rescaleBits;
+ return rgb_from_ycbcr.convert(zip(y, y), uv);
+ }
+ default:
+ assert(false);
+ return PackedRGBA8(0);
+ }
+}
+
+template <bool BLEND, typename S0, typename S1, typename P,
+ typename C = NoColor>
+static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0,
+ const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1,
+ const vec4_scalar& uv_rect1, const vec3_scalar& ycbcr_bias,
+ const mat3_scalar& rgb_from_debiased_ycbcr,
+ int rescaleFactor, C color = C()) {
+ if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1)) {
+ return 0;
+ }
+ LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
+ LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1);
+ const auto rgb_from_ycbcr =
+ YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor);
+ auto c = packColor(buf, color);
+ auto* end = buf + span;
+ for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0, uv1 += uv_step1) {
+ commit_blend_span<BLEND>(
+ buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)),
+ sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)),
+ rgb_from_ycbcr, rescaleFactor),
+ c));
+ }
+ return span;
+}
+
+template <typename S0, typename S1, typename S2>
+static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1,
+ ivec2 uv1, S2 sampler2, ivec2 uv2,
+ const YUVMatrix& rgb_from_ycbcr,
+ int rescaleFactor) {
+ assert(sampler0->format == sampler1->format &&
+ sampler0->format == sampler2->format);
+ switch (sampler0->format) {
+ case TextureFormat::R8: {
+ auto y = textureLinearUnpackedR8(sampler0, uv0);
+ auto u = textureLinearUnpackedR8(sampler1, uv1);
+ auto v = textureLinearUnpackedR8(sampler2, uv2);
+ return convertYUV(rgb_from_ycbcr, y, u, v);
+ }
+ case TextureFormat::R16: {
+ // The rescaling factor represents how many bits to add to renormalize the
+ // texture to 16 bits, and so the color depth is actually 16 minus the
+ // rescaling factor.
+      // Need to right shift the sample by the number of bits over 8 it
+ // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit
+ // of precision at the low end already, hence 1 is subtracted from the
+ // color depth.
+ int colorDepth = 16 - rescaleFactor;
+ int rescaleBits = (colorDepth - 1) - 8;
+ auto y = textureLinearUnpackedR16(sampler0, uv0) >> rescaleBits;
+ auto u = textureLinearUnpackedR16(sampler1, uv1) >> rescaleBits;
+ auto v = textureLinearUnpackedR16(sampler2, uv2) >> rescaleBits;
+ return convertYUV(rgb_from_ycbcr, U16(y), U16(u), U16(v));
+ }
+ default:
+ assert(false);
+ return PackedRGBA8(0);
+ }
+}
+
+// Fallback helper for when we can't specifically accelerate YUV with
+// composition.
+template <bool BLEND, typename S0, typename S1, typename S2, typename P,
+ typename C>
+static void blendYUVFallback(P* buf, int span, S0 sampler0, vec2 uv0,
+ vec2_scalar uv_step0, vec2_scalar min_uv0,
+ vec2_scalar max_uv0, S1 sampler1, vec2 uv1,
+ vec2_scalar uv_step1, vec2_scalar min_uv1,
+ vec2_scalar max_uv1, S2 sampler2, vec2 uv2,
+ vec2_scalar uv_step2, vec2_scalar min_uv2,
+ vec2_scalar max_uv2, const vec3_scalar& ycbcr_bias,
+ const mat3_scalar& rgb_from_debiased_ycbcr,
+ int rescaleFactor, C color) {
+ const auto rgb_from_ycbcr =
+ YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor);
+ for (auto* end = buf + span; buf < end; buf += swgl_StepSize, uv0 += uv_step0,
+ uv1 += uv_step1, uv2 += uv_step2) {
+ commit_blend_span<BLEND>(
+ buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)),
+ sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)),
+ sampler2, ivec2(clamp(uv2, min_uv2, max_uv2)),
+ rgb_from_ycbcr, rescaleFactor),
+ color));
+ }
+}
+
+template <bool BLEND, typename S0, typename S1, typename S2, typename P,
+ typename C = NoColor>
+static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0,
+ const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1,
+ const vec4_scalar& uv_rect1, S2 sampler2, vec2 uv2,
+ const vec4_scalar& uv_rect2, const vec3_scalar& ycbcr_bias,
+ const mat3_scalar& rgb_from_debiased_ycbcr,
+ int rescaleFactor, C color = C()) {
+ if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) ||
+ !swgl_isTextureLinear(sampler2)) {
+ return 0;
+ }
+ LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
+ LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1);
+ LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2);
+ auto c = packColor(buf, color);
+ blendYUVFallback<BLEND>(buf, span, sampler0, uv0, uv_step0, min_uv0, max_uv0,
+ sampler1, uv1, uv_step1, min_uv1, max_uv1, sampler2,
+ uv2, uv_step2, min_uv2, max_uv2, ycbcr_bias,
+ rgb_from_debiased_ycbcr, rescaleFactor, c);
+ return span;
+}
+
+// A variant of blendYUV that attempts to reuse the inner loops from the
+// CompositeYUV infrastructure. CompositeYUV imposes stricter requirements on
+// the source data, which in turn allows it to be much faster than blendYUV.
+// At a minimum, we need to ensure that we are outputting to a BGRA8 framebuffer
+// and that no color scaling is applied, which we can accomplish via template
+// specialization. We need to further validate inside that texture formats
+// and dimensions are sane for video and that the video is axis-aligned before
+// acceleration can proceed.
+template <bool BLEND>
+static int blendYUV(uint32_t* buf, int span, sampler2DRect sampler0, vec2 uv0,
+ const vec4_scalar& uv_rect0, sampler2DRect sampler1,
+ vec2 uv1, const vec4_scalar& uv_rect1,
+ sampler2DRect sampler2, vec2 uv2,
+ const vec4_scalar& uv_rect2, const vec3_scalar& ycbcr_bias,
+ const mat3_scalar& rgb_from_debiased_ycbcr,
+ int rescaleFactor, NoColor noColor = NoColor()) {
+ if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) ||
+ !swgl_isTextureLinear(sampler2)) {
+ return 0;
+ }
+ LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
+ LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1);
+ LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2);
+ auto* end = buf + span;
+ // CompositeYUV imposes further restrictions on the source textures, such that
+  // the Y/U/V samplers must all have a matching format, the U/V samplers
+ // must have matching sizes and sample coordinates, and there must be no
+ // change in row across the entire span.
+ if (sampler0->format == sampler1->format &&
+ sampler1->format == sampler2->format &&
+ sampler1->width == sampler2->width &&
+ sampler1->height == sampler2->height && uv_step0.y == 0 &&
+ uv_step0.x > 0 && uv_step1.y == 0 && uv_step1.x > 0 &&
+ uv_step1 == uv_step2 && uv1.x.x == uv2.x.x && uv1.y.x == uv2.y.x) {
+ // CompositeYUV does not support a clamp rect, so we must take care to
+    // advance until we're inside the bounds of the clamp rect.
+ int outside = min(int(ceil(max((min_uv0.x - uv0.x.x) / uv_step0.x,
+ (min_uv1.x - uv1.x.x) / uv_step1.x))),
+ (end - buf) / swgl_StepSize);
+ if (outside > 0) {
+ blendYUVFallback<BLEND>(buf, outside * swgl_StepSize, sampler0, uv0,
+ uv_step0, min_uv0, max_uv0, sampler1, uv1,
+ uv_step1, min_uv1, max_uv1, sampler2, uv2,
+ uv_step2, min_uv2, max_uv2, ycbcr_bias,
+ rgb_from_debiased_ycbcr, rescaleFactor, noColor);
+ buf += outside * swgl_StepSize;
+ uv0.x += outside * uv_step0.x;
+ uv1.x += outside * uv_step1.x;
+ uv2.x += outside * uv_step2.x;
+ }
+    // Find the number of chunks inside the clamp rect before we hit the
+ // maximum. If there are any chunks inside, we can finally dispatch to
+ // CompositeYUV.
+ int inside = min(int(min((max_uv0.x - uv0.x.x) / uv_step0.x,
+ (max_uv1.x - uv1.x.x) / uv_step1.x)),
+ (end - buf) / swgl_StepSize);
+ if (inside > 0) {
+ // We need the color depth, which is relative to the texture format and
+ // rescale factor.
+ int colorDepth =
+ (sampler0->format == TextureFormat::R16 ? 16 : 8) - rescaleFactor;
+ // Finally, call the inner loop of CompositeYUV.
+ const auto rgb_from_ycbcr =
+ YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor);
+ linear_row_yuv<BLEND>(
+ buf, inside * swgl_StepSize, sampler0, force_scalar(uv0),
+ uv_step0.x / swgl_StepSize, sampler1, sampler2, force_scalar(uv1),
+ uv_step1.x / swgl_StepSize, colorDepth, rgb_from_ycbcr);
+ // Now that we're done, advance past the processed inside portion.
+ buf += inside * swgl_StepSize;
+ uv0.x += inside * uv_step0.x;
+ uv1.x += inside * uv_step1.x;
+ uv2.x += inside * uv_step2.x;
+ }
+ }
+ // We either got here because we have some samples outside the clamp rect, or
+ // because some of the preconditions were not satisfied. Process whatever is
+ // left of the span.
+ blendYUVFallback<BLEND>(buf, end - buf, sampler0, uv0, uv_step0, min_uv0,
+ max_uv0, sampler1, uv1, uv_step1, min_uv1, max_uv1,
+ sampler2, uv2, uv_step2, min_uv2, max_uv2, ycbcr_bias,
+ rgb_from_debiased_ycbcr, rescaleFactor, noColor);
+ return span;
+}
+
+// Commit an entire span of a YUV surface represented by multiple planar
+// textures. This requires a color space specifier selecting how to convert
+// from YUV to RGB output. In the case of HDR formats, a rescaling factor
+// selects how many bits of precision must be utilized on conversion. See the
+// sampleYUV dispatcher functions for the various supported plane
+// configurations this intrinsic accepts.
+#define swgl_commitTextureLinearYUV(...) \
+ do { \
+ int drawn = 0; \
+ if (blend_key) { \
+ drawn = blendYUV<true>(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \
+ } else { \
+ drawn = blendYUV<false>(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \
+ } \
+ swgl_OutRGBA8 += drawn; \
+ swgl_SpanLength -= drawn; \
+ } while (0)
+
+// Commit an entire span of a YUV surface scaled by a color.
+#define swgl_commitTextureLinearColorYUV(...) \
+ swgl_commitTextureLinearYUV(__VA_ARGS__)
+
+// Each gradient stops entry is a pair of RGBA32F values: a start color and the
+// step to its end color.
+struct GradientStops {
+ Float startColor;
+ union {
+ Float stepColor;
+ vec4_scalar stepData;
+ };
+
+ // Whether this gradient entry can be merged with an adjacent entry. The
+  // steps are equal if and only if the entries can be merged, i.e. the stops
+  // are actually part of a single larger
+ // gradient.
+ bool can_merge(const GradientStops& next) const {
+ return stepData == next.stepData;
+ }
+
+ // Get the interpolated color within the entry based on the offset from its
+ // start.
+ Float interpolate(float offset) const {
+ return startColor + stepColor * offset;
+ }
+
+ // Get the end color of the entry where interpolation stops.
+ Float end_color() const { return startColor + stepColor; }
+};
+
+// Checks if a gradient table of the specified size exists at the UV coords of
+// the address within an RGBA32F texture. If so, a linear address within the
+// texture is returned that may be used to sample the gradient table later. If
+// the address doesn't describe a valid gradient, then a negative value is
+// returned.
+static inline int swgl_validateGradient(sampler2D sampler, ivec2_scalar address,
+ int entries) {
+ return sampler->format == TextureFormat::RGBA32F && address.y >= 0 &&
+ address.y < int(sampler->height) && address.x >= 0 &&
+ address.x < int(sampler->width) && entries > 0 &&
+ address.x +
+ int(sizeof(GradientStops) / sizeof(Float)) * entries <=
+ int(sampler->width)
+ ? address.y * sampler->stride + address.x * 4
+ : -1;
+}
+
+static inline WideRGBA8 sampleGradient(sampler2D sampler, int address,
+ Float entry) {
+ assert(sampler->format == TextureFormat::RGBA32F);
+ assert(address >= 0 && address < int(sampler->height * sampler->stride));
+ // Get the integer portion of the entry index to find the entry colors.
+ I32 index = cast(entry);
+ // Use the fractional portion of the entry index to control blending between
+ // entry colors.
+ Float offset = entry - cast(index);
+ // Every entry is a pair of colors blended by the fractional offset.
+ assert(test_all(index >= 0 &&
+ index * int(sizeof(GradientStops) / sizeof(Float)) <
+ int(sampler->width)));
+ GradientStops* stops = (GradientStops*)&sampler->buf[address];
+ // Blend between the colors for each SIMD lane, then pack them to RGBA8
+ // result. Since the layout of the RGBA8 framebuffer is actually BGRA while
+ // the gradient table has RGBA colors, swizzling is required.
+ return combine(
+ packRGBA8(round_pixel(stops[index.x].interpolate(offset.x).zyxw),
+ round_pixel(stops[index.y].interpolate(offset.y).zyxw)),
+ packRGBA8(round_pixel(stops[index.z].interpolate(offset.z).zyxw),
+ round_pixel(stops[index.w].interpolate(offset.w).zyxw)));
+}
+
+// Samples a gradient entry from the gradient at the provided linearized
+// address. The integer portion of the entry index is used to find the entry
+// within the table whereas the fractional portion is used to blend between
+// adjacent table entries.
+#define swgl_commitGradientRGBA8(sampler, address, entry) \
+ swgl_commitChunk(RGBA8, sampleGradient(sampler, address, entry))
+
+// Variant that allows specifying a color multiplier of the gradient result.
+#define swgl_commitGradientColorRGBA8(sampler, address, entry, color) \
+ swgl_commitChunk(RGBA8, applyColor(sampleGradient(sampler, address, entry), \
+ packColor(swgl_OutRGBA, color)))
+
+// Samples an entire span of a linear gradient by crawling the gradient table
+// and looking for consecutive stops that can be merged into a single larger
+// gradient, then interpolating between those larger gradients within the span.
+template <bool BLEND>
+static bool commitLinearGradient(sampler2D sampler, int address, float size,
+ bool tileRepeat, bool gradientRepeat, vec2 pos,
+ const vec2_scalar& scaleDir, float startOffset,
+ uint32_t* buf, int span) {
+ assert(sampler->format == TextureFormat::RGBA32F);
+ assert(address >= 0 && address < int(sampler->height * sampler->stride));
+ GradientStops* stops = (GradientStops*)&sampler->buf[address];
+ // Get the chunk delta from the difference in offset steps. This represents
+ // how far within the gradient table we advance for every step in output,
+ // normalized to gradient table size.
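+  // (Multiplying dFdx(pos) by 4.0f assumes the 4-pixel chunk size used
+  // throughout, i.e. swgl_StepSize.)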
+ vec2_scalar posStep = dFdx(pos) * 4.0f;
+ float delta = dot(posStep, scaleDir);
+ if (!isfinite(delta)) {
+ return false;
+ }
+ // If we have a repeating brush, then the position will be modulo the [0,1)
+ // interval. Compute coefficients that can be used to quickly evaluate the
+ // distance to the interval boundary where the offset will wrap.
+ vec2_scalar distCoeffsX = {0.25f * span, 0.0f};
+ vec2_scalar distCoeffsY = distCoeffsX;
+ if (tileRepeat) {
+ if (posStep.x != 0.0f) {
+ distCoeffsX = vec2_scalar{step(0.0f, posStep.x), 1.0f} * recip(posStep.x);
+ }
+ if (posStep.y != 0.0f) {
+ distCoeffsY = vec2_scalar{step(0.0f, posStep.y), 1.0f} * recip(posStep.y);
+ }
+ }
+ for (; span > 0;) {
+ // Try to process as many chunks as are within the span if possible.
+ float chunks = 0.25f * span;
+ vec2 repeatPos = pos;
+ if (tileRepeat) {
+ // If this is a repeating brush, then limit the chunks to not cross the
+ // interval boundaries.
+ repeatPos = fract(pos);
+ chunks = min(chunks, distCoeffsX.x - repeatPos.x.x * distCoeffsX.y);
+ chunks = min(chunks, distCoeffsY.x - repeatPos.y.x * distCoeffsY.y);
+ }
+ // Compute the gradient offset from the position.
+ Float offset =
+ repeatPos.x * scaleDir.x + repeatPos.y * scaleDir.y - startOffset;
+ // If repeat is desired, we need to limit the offset to a fractional value.
+ if (gradientRepeat) {
+ offset = fract(offset);
+ }
+ // To properly handle both clamping and repeating of the table offset, we
+ // need to ensure we don't run past the 0 and 1 points. Here we compute the
+ // intercept points, depending on whether we are advancing forwards or
+ // backwards in the gradient table, so that the chunk count is limited to the
+ // amount remaining before the intersection. If there is no delta, then we
+ // compute no intercept.
+ float startEntry;
+ int minIndex, maxIndex;
+ if (offset.x < 0) {
+ // If we're below the gradient table, use the first color stop. We can
+ // only intercept the table if walking forward.
+ startEntry = 0;
+ minIndex = int(startEntry);
+ maxIndex = minIndex;
+ if (delta > 0) {
+ chunks = min(chunks, -offset.x / delta);
+ }
+ } else if (offset.x < 1) {
+ // Otherwise, we're inside the gradient table. Depending on the direction
+ // we're walking the table, we may intersect either the 0 or 1 offset.
+ // Compute the start entry based on our initial offset, and compute the
+ // end entry based on the available chunks limited by intercepts. Clamp
+ // them into the valid range of the table.
+ startEntry = 1.0f + offset.x * size;
+ if (delta < 0) {
+ chunks = min(chunks, -offset.x / delta);
+ } else if (delta > 0) {
+ chunks = min(chunks, (1 - offset.x) / delta);
+ }
+ float endEntry = clamp(1.0f + (offset.x + delta * int(chunks)) * size,
+ 0.0f, 1.0f + size);
+ // Now that we know the range of entries we need to sample, we want to
+ // find the largest possible merged gradient within that range. Depending
+ // on which direction we are advancing in the table, we either walk up or
+ // down the table trying to merge the current entry with the adjacent
+ // entry. We finally limit the chunks to only sample from this merged
+ // gradient.
+ minIndex = int(startEntry);
+ maxIndex = minIndex;
+ if (delta > 0) {
+ while (maxIndex + 1 < endEntry &&
+ stops[maxIndex].can_merge(stops[maxIndex + 1])) {
+ maxIndex++;
+ }
+ chunks = min(chunks, (maxIndex + 1 - startEntry) / (delta * size));
+ } else if (delta < 0) {
+ while (minIndex - 1 > endEntry &&
+ stops[minIndex - 1].can_merge(stops[minIndex])) {
+ minIndex--;
+ }
+ chunks = min(chunks, (minIndex - startEntry) / (delta * size));
+ }
+ } else {
+ // If we're above the gradient table, use the last color stop. We can
+ // only intercept the table if walking backward.
+ startEntry = 1.0f + size;
+ minIndex = int(startEntry);
+ maxIndex = minIndex;
+ if (delta < 0) {
+ chunks = min(chunks, (1 - offset.x) / delta);
+ }
+ }
+ // If any whole chunks of a merged gradient were found, then we want to
+ // process them as a single gradient span with the start and end colors
+ // from the min and max entries.
+ if (chunks >= 1.0f) {
+ int inside = int(chunks);
+ // Sample the start color from the min entry and the end color from the
+ // max entry of the merged gradient. These are scaled to a range of
+ // 0..0xFF00, as that is the largest shifted value that can fit in a U16.
+ // Since we are only doing addition with the step value, we can still
+ // represent negative step values without having to use an explicit sign
+ // bit, as the result will still come out the same, allowing us to gain an
+ // extra bit of precision. We will later shift these into 8 bit output
+ // range while committing the span, but stepping with higher precision to
+ // avoid banding. We convert from RGBA to BGRA here to avoid doing this in
+ // the inner loop.
+ auto minColorF = stops[minIndex].startColor.zyxw * float(0xFF00);
+ auto maxColorF = stops[maxIndex].end_color().zyxw * float(0xFF00);
+ // Get the color range of the merged gradient, normalized to its size.
+ auto colorRangeF =
+ (maxColorF - minColorF) * (1.0f / (maxIndex + 1 - minIndex));
+ // Compute the actual starting color of the current start offset within
+ // the merged gradient. The value 0.5 is added to the low bits (0x80) so
+ // that the color will effectively round to the nearest increment below.
+ auto colorF =
+ minColorF + colorRangeF * (startEntry - minIndex) + float(0x80);
+ // Compute the portion of the color range that we advance on each chunk.
+ Float deltaColorF = colorRangeF * (delta * size);
+ // Quantize the color delta and current color. These have already been
+ // scaled to the 0..0xFF00 range, so we just need to round them to U16.
+ auto deltaColor = repeat4(CONVERT(round_pixel(deltaColorF, 1), U16));
+ auto color =
+ combine(CONVERT(round_pixel(colorF, 1), U16),
+ CONVERT(round_pixel(colorF + deltaColorF * 0.25f, 1), U16),
+ CONVERT(round_pixel(colorF + deltaColorF * 0.5f, 1), U16),
+ CONVERT(round_pixel(colorF + deltaColorF * 0.75f, 1), U16));
+ // Finally, step the current color through the output chunks, shifting
+ // it into 8 bit range and outputting as we go.
+ for (auto* end = buf + inside * 4; buf < end; buf += 4) {
+ commit_blend_span<BLEND>(buf, bit_cast<WideRGBA8>(color >> 8));
+ color += deltaColor;
+ }
+ // Deduct the number of chunks inside the gradient from the remaining
+ // overall span. If we exhausted the span, bail out.
+ span -= inside * 4;
+ if (span <= 0) {
+ break;
+ }
+ // Otherwise, assume we're in a transitional section of the gradient that
+ // will probably require per-sample table lookups, so fall through below.
+ // We need to re-evaluate the position and offset first, though.
+ pos += posStep * float(inside);
+ repeatPos = tileRepeat ? fract(pos) : pos;
+ offset =
+ repeatPos.x * scaleDir.x + repeatPos.y * scaleDir.y - startOffset;
+ if (gradientRepeat) {
+ offset = fract(offset);
+ }
+ }
+ // If we get here, there were no whole chunks of a merged gradient found
+ // that we could process, but we still have a non-zero amount of span left.
+ // That means we have segments of gradient that begin or end at the current
+ // entry we're on. For this case, we just fall back to sampleGradient which
+ // will calculate a table entry for each sample, assuming the samples may
+ // have different table entries.
+ Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size);
+ commit_blend_span<BLEND>(buf, sampleGradient(sampler, address, entry));
+ span -= 4;
+ buf += 4;
+ pos += posStep;
+ }
+ return true;
+}
+
+// Commits an entire span of a linear gradient, given the address of a table
+// previously resolved with swgl_validateGradient. The size of the inner portion
+// of the table is given, assuming the table starts and ends with a single entry
+// each to deal with clamping. Repeating will be handled if necessary. The
+// initial offset within the table is used to designate where to start the span
+// and how to step through the gradient table.
+#define swgl_commitLinearGradientRGBA8(sampler, address, size, tileRepeat, \
+ gradientRepeat, pos, scaleDir, \
+ startOffset) \
+ do { \
+ bool drawn = false; \
+ if (blend_key) { \
+ drawn = commitLinearGradient<true>( \
+ sampler, address, size, tileRepeat, gradientRepeat, pos, scaleDir, \
+ startOffset, swgl_OutRGBA8, swgl_SpanLength); \
+ } else { \
+ drawn = commitLinearGradient<false>( \
+ sampler, address, size, tileRepeat, gradientRepeat, pos, scaleDir, \
+ startOffset, swgl_OutRGBA8, swgl_SpanLength); \
+ } \
+ if (drawn) { \
+ swgl_OutRGBA8 += swgl_SpanLength; \
+ swgl_SpanLength = 0; \
+ } \
+ } while (0)
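+
+// For illustration only: a linear-gradient span shader might invoke the macro
+// above once for the whole span, with any pixels left uncommitted falling back
+// to per-fragment shading. The names below are hypothetical stand-ins that
+// mirror the macro's parameters:
+//
+//   swgl_commitLinearGradientRGBA8(sGradients, vGradientAddress, vTableSize,
+//                                  vTileRepeat, vGradientRepeat, vPos,
+//                                  vScaleDir, vStartOffset);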
+
+template <bool CLAMP, typename V>
+static ALWAYS_INLINE V fastSqrt(V v) {
+#if USE_SSE2 || USE_NEON
+ // Clamp to avoid zero in inversesqrt.
+ return v * inversesqrt(CLAMP ? max(v, V(1.0e-10f)) : v);
+#else
+ return sqrt(v);
+#endif
+}
+
+template <bool CLAMP, typename V>
+static ALWAYS_INLINE auto fastLength(V v) {
+ return fastSqrt<CLAMP>(dot(v, v));
+}
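+
+// Sanity check of the identity used above (illustrative): for v == 16,
+// v * inversesqrt(v) == 16 * 0.25 == 4 == sqrt(16). The CLAMP path merely
+// guards against inversesqrt(0) yielding an infinity that would turn the
+// product into a NaN; for inputs above the tiny epsilon the result is
+// unchanged.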
+
+// Samples an entire span of a radial gradient by crawling the gradient table
+// and looking for consecutive stops that can be merged into a single larger
+// gradient, then interpolating between those larger gradients within the span
+// based on the computed position relative to a radius.
+template <bool BLEND>
+static bool commitRadialGradient(sampler2D sampler, int address, float size,
+ bool repeat, vec2 pos, float radius,
+ uint32_t* buf, int span) {
+ assert(sampler->format == TextureFormat::RGBA32F);
+ assert(address >= 0 && address < int(sampler->height * sampler->stride));
+ GradientStops* stops = (GradientStops*)&sampler->buf[address];
+ // clang-format off
+ // Given position p, delta d, and radius r, we need to repeatedly solve the
+ // following quadratic for the pixel offset t:
+ // length(p + t*d) = r
+ // (px + t*dx)^2 + (py + t*dy)^2 = r^2
+ // Rearranged into quadratic equation form (t^2*a + t*b + c = 0) this is:
+ // t^2*(dx^2+dy^2) + t*2*(dx*px+dy*py) + (px^2+py^2-r^2) = 0
+ // t^2*d.d + t*2*d.p + (p.p-r^2) = 0
+ // The solution of the quadratic formula t=(-b+-sqrt(b^2-4ac))/2a reduces to:
+ // t = -d.p/d.d +- sqrt((d.p/d.d)^2 - (p.p-r^2)/d.d)
+ // Note that d.p, d.d, p.p, and r^2 are constant across the gradient, and so
+ // we cache them below for faster computation.
+ //
+ // The quadratic has two solutions, representing the span intersecting the
+ // given radius of gradient, which can occur at two offsets. If there is only
+ // one solution (where b^2-4ac = 0), this represents the point at which the
+ // span runs tangent to the radius. This middle point is significant in that
+ // before it, we walk down the gradient ramp, and after it, we walk up the
+ // ramp.
+ // clang-format on
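+ // A concrete check of the quadratic above (illustrative numbers): with
+ // p = (3, 4), d = (1, 0), and r = 5, we have d.d = 1, d.p = 3, p.p = 25, so
+ // t = -3 +- sqrt(9 - (25 - 25)) gives t = 0 or t = -6; both p + 0*d = (3, 4)
+ // and p + -6*d = (-3, 4) indeed lie at distance 5 from the origin.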
+ vec2_scalar pos0 = {pos.x.x, pos.y.x};
+ vec2_scalar delta = {pos.x.y - pos.x.x, pos.y.y - pos.y.x};
+ float deltaDelta = dot(delta, delta);
+ if (!isfinite(deltaDelta) || !isfinite(radius)) {
+ return false;
+ }
+ float invDelta, middleT, middleB;
+ if (deltaDelta > 0) {
+ invDelta = 1.0f / deltaDelta;
+ middleT = -dot(delta, pos0) * invDelta;
+ middleB = middleT * middleT - dot(pos0, pos0) * invDelta;
+ } else {
+ // If position is invariant, just set the coefficients so the quadratic
+ // always reduces to the end of the span.
+ invDelta = 0.0f;
+ middleT = float(span);
+ middleB = 0.0f;
+ }
+ // We only want to search for merged gradients up to the minimum of either the
+ // mid-point or the span length. Cache those offsets here as they don't vary
+ // in the inner loop.
+ Float middleEndRadius = fastLength<true>(
+ pos0 + delta * (Float){middleT, float(span), 0.0f, 0.0f});
+ float middleRadius = span < middleT ? middleEndRadius.y : middleEndRadius.x;
+ float endRadius = middleEndRadius.y;
+ // Convert delta to change in position per chunk.
+ delta *= 4;
+ deltaDelta *= 4 * 4;
+ // clang-format off
+ // Given current position p and delta d, we reduce:
+ // length(p) = sqrt(dot(p,p)) = dot(p,p) * invsqrt(dot(p,p))
+ // where dot(p+d,p+d) can be accumulated as:
+ // (x+dx)^2+(y+dy)^2 = (x^2+y^2) + 2(x*dx+y*dy) + (dx^2+dy^2)
+ // = p.p + 2p.d + d.d
+ // Since p increases by d every loop iteration, p.d increases by d.d, and thus
+ // we can accumulate d.d to calculate 2p.d, which then lets us get the next
+ // dot-product by adding it to the prior iteration's dot-product p.p. This
+ // saves us some multiplications and an expensive sqrt inside the inner loop.
+ // clang-format on
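+ // A numeric check of the accumulation above (illustrative numbers): with
+ // p = (1, 2) and d = (3, 4), p.p = 5 and 2p.d + d.d = 22 + 25 = 47, so the
+ // next dot-product is 5 + 47 = 52 = dot((4, 6), (4, 6)). The increment then
+ // grows by 2*d.d = 50 to 97 for the iteration after that.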
+ Float dotPos = dot(pos, pos);
+ Float dotPosDelta = 2.0f * dot(pos, delta) + deltaDelta;
+ float deltaDelta2 = 2.0f * deltaDelta;
+ for (int t = 0; t < span;) {
+ // Compute the gradient table offset from the current position.
+ Float offset = fastSqrt<true>(dotPos) - radius;
+ float startRadius = radius;
+ // If repeat is desired, we need to limit the offset to a fractional value.
+ if (repeat) {
+ // The non-repeating radius at which the gradient table actually starts,
+ // radius + floor(offset) = radius + (offset - fract(offset)).
+ startRadius += offset.x;
+ offset = fract(offset);
+ startRadius -= offset.x;
+ }
+ // We need to find the min/max index in the table of the gradient we want to
+ // use as well as the intercept point where we leave this gradient.
+ float intercept = -1;
+ int minIndex = 0;
+ int maxIndex = int(1.0f + size);
+ if (offset.x < 0) {
+ // If inside the inner radius of the gradient table, then use the first
+ // stop. Set the intercept to advance forward to the start of the gradient
+ // table.
+ maxIndex = minIndex;
+ if (t >= middleT) {
+ intercept = radius;
+ }
+ } else if (offset.x < 1) {
+ // Otherwise, we're inside the valid part of the gradient table.
+ minIndex = int(1.0f + offset.x * size);
+ maxIndex = minIndex;
+ // Find the offset in the gradient that corresponds to the search limit.
+ // We only search up to the minimum of either the mid-point or the span
+ // length. Get the table index that corresponds to this offset, clamped so
+ // that we avoid hitting the beginning (0) or end (1 + size) of the table.
+ float searchOffset =
+ (t >= middleT ? endRadius : middleRadius) - startRadius;
+ int searchIndex = int(clamp(1.0f + size * searchOffset, 1.0f, size));
+ // If we are past the mid-point, walk up the gradient table trying to
+ // merge stops. If we're below the mid-point, we need to walk down the
+ // table. We note the table index at which we need to look for an
+ // intercept to determine a valid span.
+ if (t >= middleT) {
+ while (maxIndex + 1 <= searchIndex &&
+ stops[maxIndex].can_merge(stops[maxIndex + 1])) {
+ maxIndex++;
+ }
+ intercept = maxIndex + 1;
+ } else {
+ while (minIndex - 1 >= searchIndex &&
+ stops[minIndex - 1].can_merge(stops[minIndex])) {
+ minIndex--;
+ }
+ intercept = minIndex;
+ }
+ // Convert from a table index into units of radius from the center of the
+ // gradient.
+ intercept = clamp((intercept - 1.0f) / size, 0.0f, 1.0f) + startRadius;
+ } else {
+ // If outside the outer radius of the gradient table, then use the last
+ // stop. Set the intercept to advance toward the valid part of the
+ // gradient table if going in, or just run to the end of the span if going
+ // away from the gradient.
+ minIndex = maxIndex;
+ if (t < middleT) {
+ intercept = radius + 1;
+ }
+ }
+ // Solve the quadratic for t to find where the merged gradient ends. If no
+ // intercept is found, just go to the middle or end of the span.
+ float endT = t >= middleT ? span : min(span, int(middleT));
+ if (intercept >= 0) {
+ float b = middleB + intercept * intercept * invDelta;
+ if (b > 0) {
+ b = fastSqrt<false>(b);
+ endT = min(endT, t >= middleT ? middleT + b : middleT - b);
+ }
+ }
+ // Figure out how many chunks are actually inside the merged gradient.
+ if (t + 4.0f <= endT) {
+ int inside = int(endT - t) & ~3;
+ // Convert the start and end colors to BGRA and scale them to the 0..255
+ // range; rounding to integer pixels happens later in the inner loop.
+ auto minColorF = stops[minIndex].startColor.zyxw * 255.0f;
+ auto maxColorF = stops[maxIndex].end_color().zyxw * 255.0f;
+ // Compute the change in color per change in gradient offset.
+ auto deltaColorF =
+ (maxColorF - minColorF) * (size / (maxIndex + 1 - minIndex));
+ // Subtract off the color difference of the beginning of the current span
+ // from the beginning of the gradient.
+ Float colorF =
+ minColorF - deltaColorF * (startRadius + (minIndex - 1) / size);
+ // Finally, walk over the span accumulating the position dot product and
+ // getting its sqrt as an offset into the color ramp. Since we're already
+ // in BGRA format and scaled to 255, we just need to round to an integer
+ // and pack down to pixel format.
+ for (auto* end = buf + inside; buf < end; buf += 4) {
+ Float offsetG = fastSqrt<false>(dotPos);
+ commit_blend_span<BLEND>(
+ buf,
+ combine(
+ packRGBA8(round_pixel(colorF + deltaColorF * offsetG.x, 1),
+ round_pixel(colorF + deltaColorF * offsetG.y, 1)),
+ packRGBA8(round_pixel(colorF + deltaColorF * offsetG.z, 1),
+ round_pixel(colorF + deltaColorF * offsetG.w, 1))));
+ dotPos += dotPosDelta;
+ dotPosDelta += deltaDelta2;
+ }
+ // Advance past the portion of gradient we just processed.
+ t += inside;
+ // If we hit the end of the span, exit out now.
+ if (t >= span) {
+ break;
+ }
+ // Otherwise, we are most likely in a transitional section of the gradient
+ // between stops that will require per-sample table lookups. Rather than
+ // redoing all the searching above to figure that out, just assume that to
+ // be the case and fall through below to the table lookups, hopefully
+ // avoiding an extra iteration.
+ offset = fastSqrt<true>(dotPos) - radius;
+ if (repeat) {
+ offset = fract(offset);
+ }
+ }
+ // If we got here, that means we still have span left to process but did not
+ // have any whole chunks that fell within a merged gradient. Just fall back
+ // to doing a table lookup for each sample.
+ Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size);
+ commit_blend_span<BLEND>(buf, sampleGradient(sampler, address, entry));
+ buf += 4;
+ t += 4;
+ dotPos += dotPosDelta;
+ dotPosDelta += deltaDelta2;
+ }
+ return true;
+}
+
+// Commits an entire span of a radial gradient similarly to
+// swgl_commitLinearGradient, but given a varying 2D position scaled to
+// gradient-space and a radius at which the distance from the origin maps to the
+// start of the gradient table.
+#define swgl_commitRadialGradientRGBA8(sampler, address, size, repeat, pos, \
+ radius) \
+ do { \
+ bool drawn = false; \
+ if (blend_key) { \
+ drawn = \
+ commitRadialGradient<true>(sampler, address, size, repeat, pos, \
+ radius, swgl_OutRGBA8, swgl_SpanLength); \
+ } else { \
+ drawn = \
+ commitRadialGradient<false>(sampler, address, size, repeat, pos, \
+ radius, swgl_OutRGBA8, swgl_SpanLength); \
+ } \
+ if (drawn) { \
+ swgl_OutRGBA8 += swgl_SpanLength; \
+ swgl_SpanLength = 0; \
+ } \
+ } while (0)
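+
+// For illustration only: a radial-gradient span shader might call the macro
+// above with its varying gradient-space position and start radius, using
+// hypothetical names that mirror the macro's parameters:
+//
+//   swgl_commitRadialGradientRGBA8(sGradients, vGradientAddress, vTableSize,
+//                                  vGradientRepeat, vPos, vStartRadius);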
+
+// Extension to set a clip mask image to be sampled during blending. The offset
+// specifies the positioning of the clip mask image relative to the viewport
+// origin. The bounding box specifies the rectangle relative to the clip mask's
+// origin that constrains sampling within the clip mask. Blending must be
+// enabled for this to work.
+static sampler2D swgl_ClipMask = nullptr;
+static IntPoint swgl_ClipMaskOffset = {0, 0};
+static IntRect swgl_ClipMaskBounds = {0, 0, 0, 0};
+#define swgl_clipMask(mask, offset, bb_origin, bb_size) \
+ do { \
+ if (bb_size != vec2_scalar(0.0f, 0.0f)) { \
+ swgl_ClipFlags |= SWGL_CLIP_FLAG_MASK; \
+ swgl_ClipMask = mask; \
+ swgl_ClipMaskOffset = make_ivec2(offset); \
+ swgl_ClipMaskBounds = \
+ IntRect(make_ivec2(bb_origin), make_ivec2(bb_size)); \
+ } \
+ } while (0)
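+
+// For illustration only: a clipped shader might configure the mask roughly as
+// sketched here, with hypothetical names matching the macro's parameters:
+//
+//   swgl_clipMask(sClipMask, vClipMaskOffset, vClipMaskOrigin, vClipMaskSize);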
+
+// Extension to enable anti-aliasing for the given edges of a quad.
+// Blending must be enabled for this to work.
+static int swgl_AAEdgeMask = 0;
+
+static ALWAYS_INLINE int calcAAEdgeMask(bool on) { return on ? 0xF : 0; }
+static ALWAYS_INLINE int calcAAEdgeMask(int mask) { return mask; }
+static ALWAYS_INLINE int calcAAEdgeMask(bvec4_scalar mask) {
+ return (mask.x ? 1 : 0) | (mask.y ? 2 : 0) | (mask.z ? 4 : 0) |
+ (mask.w ? 8 : 0);
+}
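+// For example, a bvec4_scalar mask of (true, false, true, false) yields
+// 0b0101, enabling anti-aliasing only on the first and third edges.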
+
+#define swgl_antiAlias(edges) \
+ do { \
+ swgl_AAEdgeMask = calcAAEdgeMask(edges); \
+ if (swgl_AAEdgeMask) { \
+ swgl_ClipFlags |= SWGL_CLIP_FLAG_AA; \
+ } \
+ } while (0)
+
+#define swgl_blendDropShadow(color) \
+ do { \
+ swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE; \
+ swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_DROP_SHADOW); \
+ swgl_BlendColorRGBA8 = packColor<uint32_t>(color); \
+ } while (0)
+
+#define swgl_blendSubpixelText(color) \
+ do { \
+ swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE; \
+ swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_SUBPIXEL_TEXT); \
+ swgl_BlendColorRGBA8 = packColor<uint32_t>(color); \
+ swgl_BlendAlphaRGBA8 = alphas(swgl_BlendColorRGBA8); \
+ } while (0)
+
+// Dispatch helper used by the GLSL translator to call swgl_drawSpan functions.
+// The number of pixels committed is tracked by checking for the difference in
+// swgl_SpanLength. Any varying interpolants used will be advanced past the
+// committed part of the span in case the fragment shader must be executed for
+// any remaining pixels that were not committed by the span shader.
+#define DISPATCH_DRAW_SPAN(self, format) \
+ do { \
+ int total = self->swgl_SpanLength; \
+ self->swgl_drawSpan##format(); \
+ int drawn = total - self->swgl_SpanLength; \
+ if (drawn) self->step_interp_inputs(drawn); \
+ return drawn; \
+ } while (0)
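+
+// For illustration only: translator-generated shader code is assumed to wrap
+// the helper roughly like this hypothetical member function:
+//
+//   int draw_span(uint32_t* buf, int len) {
+//     DISPATCH_DRAW_SPAN(this, RGBA8);
+//   }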