From 0ebf5bdf043a27fd3dfb7f92e0cb63d88954c44d Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 03:47:29 +0200 Subject: Adding upstream version 115.8.0esr. Signed-off-by: Daniel Baumann --- gfx/wr/swgl/src/swgl_ext.h | 1906 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1906 insertions(+) create mode 100644 gfx/wr/swgl/src/swgl_ext.h (limited to 'gfx/wr/swgl/src/swgl_ext.h') diff --git a/gfx/wr/swgl/src/swgl_ext.h b/gfx/wr/swgl/src/swgl_ext.h new file mode 100644 index 0000000000..164ca8d7b1 --- /dev/null +++ b/gfx/wr/swgl/src/swgl_ext.h @@ -0,0 +1,1906 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// When using a solid color with clip masking, the cost of loading the clip mask +// in the blend stage exceeds the cost of processing the color. Here we handle +// the entire span of clip mask texture before the blend stage to more +// efficiently process it and modulate it with color without incurring blend +// stage overheads. +template +static void commit_masked_solid_span(P* buf, C color, int len) { + override_clip_mask(); + uint8_t* mask = get_clip_mask(buf); + // Step through the span four elements at a time; buf and mask advance + // together so each chunk is modulated by its own four mask values. + for (P* end = &buf[len]; buf < end; buf += 4, mask += 4) { + commit_span( + buf, + blend_span( + buf, + applyColor(expand_mask(buf, unpack(unaligned_load(mask))), + color))); + } + restore_clip_mask(); +} + +// When using a solid color with anti-aliasing, most of the solid span will not +// benefit from anti-aliasing in the opaque region. We only want to apply the AA +// blend stage in the non-opaque start and end of the span where AA is needed. 
+template +static ALWAYS_INLINE void commit_aa_solid_span(P* buf, R r, int len) { + if (int start = min((get_aa_opaque_start(buf) + 3) & ~3, len)) { + commit_solid_span(buf, r, start); + buf += start; + len -= start; + } + if (int opaque = min((get_aa_opaque_size(buf) + 3) & ~3, len)) { + override_aa(); + commit_solid_span(buf, r, opaque); + restore_aa(); + buf += opaque; + len -= opaque; + } + if (len > 0) { + commit_solid_span(buf, r, len); + } +} + +// Forces a value with vector run-class to have scalar run-class. +template +static ALWAYS_INLINE auto swgl_forceScalar(T v) -> decltype(force_scalar(v)) { + return force_scalar(v); +} + +// Advance all varying interpolants by a single chunk +#define swgl_stepInterp() step_interp_inputs() + +// Pseudo-intrinsic that accesses the interpolation step for a given varying +#define swgl_interpStep(v) (interp_step.v) + +// Commit an entire span of a solid color. This dispatches to clip-masked and +// anti-aliased fast-paths as appropriate. +#define swgl_commitSolid(format, v, n) \ + do { \ + int len = (n); \ + if (blend_key) { \ + if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) { \ + commit_masked_solid_span(swgl_Out##format, \ + packColor(swgl_Out##format, (v)), len); \ + } else if (swgl_ClipFlags & SWGL_CLIP_FLAG_AA) { \ + commit_aa_solid_span(swgl_Out##format, \ + pack_span(swgl_Out##format, (v)), len); \ + } else { \ + commit_solid_span(swgl_Out##format, \ + pack_span(swgl_Out##format, (v)), len); \ + } \ + } else { \ + commit_solid_span(swgl_Out##format, \ + pack_span(swgl_Out##format, (v)), len); \ + } \ + swgl_Out##format += len; \ + swgl_SpanLength -= len; \ + } while (0) +#define swgl_commitSolidRGBA8(v) swgl_commitSolid(RGBA8, v, swgl_SpanLength) +#define swgl_commitSolidR8(v) swgl_commitSolid(R8, v, swgl_SpanLength) +#define swgl_commitPartialSolidRGBA8(len, v) \ + swgl_commitSolid(RGBA8, v, min(int(len), swgl_SpanLength)) +#define swgl_commitPartialSolidR8(len, v) \ + swgl_commitSolid(R8, v, min(int(len), 
swgl_SpanLength)) + +#define swgl_commitChunk(format, chunk) \ + do { \ + auto r = chunk; \ + if (blend_key) r = blend_span(swgl_Out##format, r); \ + commit_span(swgl_Out##format, r); \ + swgl_Out##format += swgl_StepSize; \ + swgl_SpanLength -= swgl_StepSize; \ + } while (0) + +// Commit a single chunk of a color. The color is packed for the output format +// and handed to swgl_commitChunk, which blends it when blend_key is set, stores +// it, and then advances the output pointer and shrinks the span by +// swgl_StepSize. +#define swgl_commitColor(format, color) \ + swgl_commitChunk(format, pack_pixels_##format(color)) +#define swgl_commitColorRGBA8(color) swgl_commitColor(RGBA8, color) +#define swgl_commitColorR8(color) swgl_commitColor(R8, color) + +// Sampler queries: whether the sampler's filter is LINEAR, and whether its +// format is RGBA8 or R8. +template +static ALWAYS_INLINE bool swgl_isTextureLinear(S s) { + return s->filter == TextureFilter::LINEAR; +} + +template +static ALWAYS_INLINE bool swgl_isTextureRGBA8(S s) { + return s->format == TextureFormat::RGBA8; +} + +template +static ALWAYS_INLINE bool swgl_isTextureR8(S s) { + return s->format == TextureFormat::R8; +} + +// Use the default linear quantization scale of 128. This gives 7 bits of +// fractional precision, which when multiplied with a signed 9 bit value +// still fits in a 16 bit integer. +const int swgl_LinearQuantizeScale = 128; + +// Quantizes UVs for access into a linear texture. +template +static ALWAYS_INLINE T swgl_linearQuantize(S s, T p) { + return linearQuantize(p, swgl_LinearQuantizeScale, s); +} + +// Quantizes an interpolation step for UVs for access into a linear texture. 
+template +static ALWAYS_INLINE T swgl_linearQuantizeStep(S s, T p) { + return samplerScale(s, p) * swgl_LinearQuantizeScale; +} + +template +static ALWAYS_INLINE WideRGBA8 textureLinearUnpacked(UNUSED uint32_t* buf, + S sampler, ivec2 i) { + return textureLinearUnpackedRGBA8(sampler, i); +} + +template +static ALWAYS_INLINE WideR8 textureLinearUnpacked(UNUSED uint8_t* buf, + S sampler, ivec2 i) { + return textureLinearUnpackedR8(sampler, i); +} + +template +static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint32_t* buf) { + return swgl_isTextureRGBA8(s); +} + +template +static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint8_t* buf) { + return swgl_isTextureR8(s); +} + +// Quantizes the UVs to the 2^7 scale needed for calculating fractional offsets +// for linear sampling. +// This overwrites uv with its quantized value and declares uv_step, min_uv and +// max_uv in the enclosing scope. uv_step is the per-chunk step derived from the +// first two lanes of uv; min_uv is clamped to be at least 0 and max_uv to be at +// least min_uv. +#define LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv) \ + uv = swgl_linearQuantize(sampler, uv); \ + vec2_scalar uv_step = \ + float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; \ + vec2_scalar min_uv = max( \ + swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f); \ + vec2_scalar max_uv = \ + max(swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), \ + min_uv); + +// Implements the fallback linear filter that can deal with clamping and +// arbitrary scales. 
+template +static P* blendTextureLinearFallback(S sampler, vec2 uv, int span, + vec2_scalar uv_step, vec2_scalar min_uv, + vec2_scalar max_uv, C color, P* buf) { + for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { + commit_blend_span( + buf, applyColor(textureLinearUnpacked(buf, sampler, + ivec2(clamp(uv, min_uv, max_uv))), + color)); + } + return buf; +} + +static ALWAYS_INLINE U64 castForShuffle(V16 r) { + return bit_cast(r); +} +static ALWAYS_INLINE U16 castForShuffle(V4 r) { + return bit_cast(r); +} + +static ALWAYS_INLINE V16 applyFracX(V16 r, I16 fracx) { + return r * fracx.xxxxyyyyzzzzwwww; +} +static ALWAYS_INLINE V4 applyFracX(V4 r, I16 fracx) { + return r * fracx; +} + +// Implements a faster linear filter that works with axis-aligned constant Y but +// scales less than 1, i.e. upscaling. In this case we can optimize for the +// constant Y fraction as well as load all chunks from memory in a single tap +// for each row. +template +static void blendTextureLinearUpscale(S sampler, vec2 uv, int span, + vec2_scalar uv_step, vec2_scalar min_uv, + vec2_scalar max_uv, C color, P* buf) { + typedef VectorType packed_type; + typedef VectorType unpacked_type; + typedef VectorType signed_unpacked_type; + + ivec2 i(clamp(uv, min_uv, max_uv)); + ivec2 frac = i; + // frac keeps the quantized coordinate for the fraction computations below; + // shifting i right by 7 leaves the integer index used to address the rows. + i >>= 7; + P* row0 = (P*)sampler->buf + computeRow(sampler, ivec2_scalar(0, i.y.x)); + P* row1 = row0 + computeNextRowOffset(sampler, ivec2_scalar(0, i.y.x)); + I16 fracx = computeFracX(sampler, i, frac); + int16_t fracy = computeFracY(frac).x; + auto src0 = + CONVERT(unaligned_load(&row0[i.x.x]), signed_unpacked_type); + auto src1 = + CONVERT(unaligned_load(&row1[i.x.x]), signed_unpacked_type); + // Blend the two rows by the constant Y fraction (7 fractional bits). + auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7)); + + // We attempt to sample ahead by one chunk and interpolate it with the current + // one. However, due to the complication of upscaling, we may not necessarily + // shift in all the next set of samples. 
+ for (P* end = buf + span; buf < end; buf += 4) { + uv.x += uv_step.x; + I32 ixn = cast(uv.x); + I16 fracn = computeFracNoClamp(ixn); + ixn >>= 7; + auto src0n = CONVERT(unaligned_load(&row0[ixn.x]), + signed_unpacked_type); + auto src1n = CONVERT(unaligned_load(&row1[ixn.x]), + signed_unpacked_type); + auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7)); + + // Since we're upscaling, we know that a source pixel has a larger footprint + // than the destination pixel, and thus all the source pixels needed for + // this chunk will fall within a single chunk of texture data. However, + // since the source pixels don't map 1:1 with destination pixels, we need to + // shift the source pixels over based on their offset from the start of the + // chunk. This could conceivably be optimized better with usage of PSHUFB or + // VTBL instructions. However, since PSHUFB requires SSSE3, instead we resort + // to masking in the correct pixels to avoid having to index into memory. + // For the last sample to interpolate with, we need to potentially shift in + // a sample from the next chunk over in the case the samples fill out an + // entire chunk. + auto shuf = src; + auto shufn = SHUFFLE(src, ixn.x == i.x.w ? srcn.yyyy : srcn, 1, 2, 3, 4); + if (i.x.y == i.x.x) { + shuf = shuf.xxyz; + shufn = shufn.xxyz; + } + if (i.x.z == i.x.y) { + shuf = shuf.xyyz; + shufn = shufn.xyyz; + } + if (i.x.w == i.x.z) { + shuf = shuf.xyzz; + shufn = shufn.xyzz; + } + + // Convert back to a signed unpacked type so that we can interpolate the + // final result. + auto interp = bit_cast(shuf); + auto interpn = bit_cast(shufn); + interp += applyFracX(interpn - interp, fracx) >> 7; + + commit_blend_span( + buf, applyColor(bit_cast(interp), color)); + + i.x = ixn; + fracx = fracn; + src = srcn; + } +} + +// This is the fastest variant of the linear filter that still provides +// filtering. 
In cases where there is no scaling required, but we have a +// subpixel offset that forces us to blend in neighboring pixels, we can +// optimize away most of the memory loads and shuffling that is required by the +// fallback filter. +template +static void blendTextureLinearFast(S sampler, vec2 uv, int span, + vec2_scalar min_uv, vec2_scalar max_uv, + C color, P* buf) { + typedef VectorType packed_type; + typedef VectorType unpacked_type; + typedef VectorType signed_unpacked_type; + + ivec2 i(clamp(uv, min_uv, max_uv)); + ivec2 frac = i; + i >>= 7; + P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i)); + P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i)); + int16_t fracx = computeFracX(sampler, i, frac).x; + int16_t fracy = computeFracY(frac).x; + auto src0 = CONVERT(unaligned_load(row0), signed_unpacked_type); + auto src1 = CONVERT(unaligned_load(row1), signed_unpacked_type); + auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7)); + + // Since there is no scaling, we sample ahead by one chunk and interpolate it + // with the current one. We can then reuse this value on the next iteration. + for (P* end = buf + span; buf < end; buf += 4) { + // Advance both row pointers by four elements, matching the four output + // elements written per iteration. + row0 += 4; + row1 += 4; + auto src0n = + CONVERT(unaligned_load(row0), signed_unpacked_type); + auto src1n = + CONVERT(unaligned_load(row1), signed_unpacked_type); + auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7)); + + // For the last sample to interpolate with, we need to potentially shift in + // a sample from the next chunk over since the samples fill out an entire + // chunk. + auto interp = bit_cast(src); + auto interpn = + bit_cast(SHUFFLE(src, srcn, 1, 2, 3, 4)); + interp += ((interpn - interp) * fracx) >> 7; + + commit_blend_span( + buf, applyColor(bit_cast(interp), color)); + + src = srcn; + } +} + +// Implements a faster linear filter that works with axis-aligned constant Y but +// downscaling the texture by half. 
In this case we can optimize for the +// constant X/Y fractions and reduction factor while minimizing shuffling. +template +static NO_INLINE void blendTextureLinearDownscale(S sampler, vec2 uv, int span, + vec2_scalar min_uv, + vec2_scalar max_uv, C color, + P* buf) { + typedef VectorType packed_type; + typedef VectorType unpacked_type; + typedef VectorType signed_unpacked_type; + + ivec2 i(clamp(uv, min_uv, max_uv)); + ivec2 frac = i; + i >>= 7; + P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i)); + P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i)); + int16_t fracx = computeFracX(sampler, i, frac).x; + int16_t fracy = computeFracY(frac).x; + + for (P* end = buf + span; buf < end; buf += 4) { + auto src0 = + CONVERT(unaligned_load(row0), signed_unpacked_type); + auto src1 = + CONVERT(unaligned_load(row1), signed_unpacked_type); + auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7)); + row0 += 4; + row1 += 4; + auto src0n = + CONVERT(unaligned_load(row0), signed_unpacked_type); + auto src1n = + CONVERT(unaligned_load(row1), signed_unpacked_type); + auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7)); + row0 += 4; + row1 += 4; + + // Eight source elements (two chunks) feed four outputs: the even-indexed + // elements form one tap and the odd-indexed elements the other, and the + // two taps are then blended by the constant X fraction. + auto interp = + bit_cast(SHUFFLE(src, srcn, 0, 2, 4, 6)); + auto interpn = + bit_cast(SHUFFLE(src, srcn, 1, 3, 5, 7)); + interp += ((interpn - interp) * fracx) >> 7; + + commit_blend_span( + buf, applyColor(bit_cast(interp), color)); + } +} + +enum LinearFilter { + // No linear filter is needed. + LINEAR_FILTER_NEAREST = 0, + // The most general linear filter that handles clamping and varying scales. + LINEAR_FILTER_FALLBACK, + // A linear filter optimized for axis-aligned upscaling. + LINEAR_FILTER_UPSCALE, + // A linear filter with no scaling but with subpixel offset. + LINEAR_FILTER_FAST, + // A linear filter optimized for 2x axis-aligned downscaling. + LINEAR_FILTER_DOWNSCALE +}; + +// Dispatches to an appropriate linear filter depending on the selected filter. 
+template +static P* blendTextureLinearDispatch(S sampler, vec2 uv, int span, + vec2_scalar uv_step, vec2_scalar min_uv, + vec2_scalar max_uv, C color, P* buf, + LinearFilter filter) { + P* end = buf + span; + if (filter != LINEAR_FILTER_FALLBACK) { + // If we're not using the fallback, then Y is constant across the entire + // row. We just need to ensure that we handle any samples that might pull + // data from before the start of the row and require clamping. + float beforeDist = max(0.0f, min_uv.x) - uv.x.x; + if (beforeDist > 0) { + int before = clamp(int(ceil(beforeDist / uv_step.x)) * swgl_StepSize, 0, + int(end - buf)); + buf = blendTextureLinearFallback(sampler, uv, before, uv_step, + min_uv, max_uv, color, buf); + uv.x += (before / swgl_StepSize) * uv_step.x; + } + // We need to check how many samples we can take from inside the row without + // requiring clamping. In case the filter oversamples the row by a step, we + // subtract off a step from the width to leave some room. + float insideDist = + min(max_uv.x, float((int(sampler->width) - swgl_StepSize) * + swgl_LinearQuantizeScale)) - + uv.x.x; + if (uv_step.x > 0.0f && insideDist >= uv_step.x) { + int32_t inside = int(end - buf); + if (filter == LINEAR_FILTER_DOWNSCALE) { + inside = min(int(insideDist * (0.5f / swgl_LinearQuantizeScale)) & + ~(swgl_StepSize - 1), + inside); + if (inside > 0) { + blendTextureLinearDownscale(sampler, uv, inside, min_uv, + max_uv, color, buf); + buf += inside; + uv.x += (inside / swgl_StepSize) * uv_step.x; + } + } else if (filter == LINEAR_FILTER_UPSCALE) { + inside = min(int(insideDist / uv_step.x) * swgl_StepSize, inside); + if (inside > 0) { + blendTextureLinearUpscale(sampler, uv, inside, uv_step, min_uv, + max_uv, color, buf); + buf += inside; + uv.x += (inside / swgl_StepSize) * uv_step.x; + } + } else { + inside = min(int(insideDist * (1.0f / swgl_LinearQuantizeScale)) & + ~(swgl_StepSize - 1), + inside); + if (inside > 0) { + blendTextureLinearFast(sampler, uv, 
inside, min_uv, max_uv, + color, buf); + buf += inside; + uv.x += (inside / swgl_StepSize) * uv_step.x; + } + } + } + } + // If the fallback filter was requested, or if there are any samples left that + // may be outside the row and require clamping, then handle that here. + if (buf < end) { + buf = blendTextureLinearFallback( + sampler, uv, int(end - buf), uv_step, min_uv, max_uv, color, buf); + } + return buf; +} + +// Helper function to quantize UVs for linear filtering before dispatch +template +static inline int blendTextureLinear(S sampler, vec2 uv, int span, + const vec4_scalar& uv_rect, C color, + P* buf, LinearFilter filter) { + if (!matchTextureFormat(sampler, buf)) { + return 0; + } + LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv); + blendTextureLinearDispatch(sampler, uv, span, uv_step, min_uv, max_uv, + color, buf, filter); + return span; +} + +// Samples an axis-aligned span on a single row of a texture using 1:1 +// nearest filtering. Sampling is constrained to only fall within the given UV +// bounds. This requires a pointer to the destination buffer. An optional color +// modulus can be supplied. +template +static int blendTextureNearestFast(S sampler, vec2 uv, int span, + const vec4_scalar& uv_rect, C color, + P* buf) { + if (!matchTextureFormat(sampler, buf)) { + return 0; + } + + typedef VectorType packed_type; + + ivec2_scalar i = make_ivec2(samplerScale(sampler, force_scalar(uv))); + ivec2_scalar minUV = + make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y})); + ivec2_scalar maxUV = + make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w})); + + // Calculate the row pointer within the buffer, clamping to within valid row + // bounds. + P* row = + &((P*)sampler + ->buf)[clampCoord(clamp(i.y, minUV.y, maxUV.y), sampler->height) * + sampler->stride]; + // Find clamped X bounds within the row. 
+ int minX = clamp(minUV.x, 0, sampler->width - 1); + int maxX = clamp(maxUV.x, minX, sampler->width - 1); + int curX = i.x; + int endX = i.x + span; + // If we need to start sampling below the valid sample bounds, then we need to + // fill this section with a constant clamped sample. + if (curX < minX) { + int n = min(minX, endX) - curX; + auto src = + applyColor(unpack(bit_cast(V4

(row[minX]))), color); + commit_solid_span(buf, src, n); + buf += n; + curX += n; + } + // Here we only deal with valid samples within the sample bounds. No clamping + // should occur here within these inner loops. + int n = max(min(maxX + 1, endX) - curX, 0); + // Try to process as many chunks as possible with full loads and stores. + for (int end = curX + (n & ~3); curX < end; curX += 4, buf += 4) { + auto src = applyColor(unaligned_load(&row[curX]), color); + commit_blend_span(buf, src); + } + n &= 3; + // If we have any leftover samples after processing chunks, use partial loads + // and stores. + if (n > 0) { + auto src = applyColor(partial_load_span(&row[curX], n), color); + commit_blend_span(buf, src, n); + buf += n; + curX += n; + } + // If we still have samples left above the valid sample bounds, then we again + // need to fill this section with a constant clamped sample. + if (curX < endX) { + auto src = + applyColor(unpack(bit_cast(V4

(row[maxX]))), color); + commit_solid_span(buf, src, endX - curX); + } + return span; +} + +// We need to verify that the pixel step reasonably approximates stepping by a +// single texel for every pixel we need to reproduce. Try to ensure that the +// margin of error is no more than approximately 2^-7. Also, we check here if +// the scaling can be quantized for acceleration. +// Returns 0 when the step is 1:1 over the rounded-up span, 2 when it is exactly +// double the span (a 2x downscale), and 1 for any other scale. +template +static ALWAYS_INLINE int spanNeedsScale(int span, T P) { + span &= ~(128 - 1); + span += 128; + int scaled = round((P.x.y - P.x.x) * span); + return scaled != span ? (scaled == span * 2 ? 2 : 1) : 0; +} + +// Helper function to decide whether we can safely apply 1:1 nearest filtering +// without diverging too much from the linear filter. +template +static inline LinearFilter needsTextureLinear(S sampler, T P, int span) { + // If each row is not wide enough for linear filtering, then just use nearest + // filtering. + if (sampler->width < 2) { + return LINEAR_FILTER_NEAREST; + } + // First verify if the row Y doesn't change across samples + if (P.y.x != P.y.y) { + return LINEAR_FILTER_FALLBACK; + } + P = samplerScale(sampler, P); + if (int scale = spanNeedsScale(span, P)) { + // If the source region is not flipped and smaller than the destination, + // then we can use the upscaling filter since row Y is constant. + return P.x.x < P.x.y && P.x.y - P.x.x <= 1 + ? LINEAR_FILTER_UPSCALE + : (scale == 2 ? LINEAR_FILTER_DOWNSCALE + : LINEAR_FILTER_FALLBACK); + } + // Also verify that we're reasonably close to the center of a texel + // so that it doesn't look that much different than if a linear filter + // was used. + if ((int(P.x.x * 4.0f + 0.5f) & 3) != 2 || + (int(P.y.x * 4.0f + 0.5f) & 3) != 2) { + // The source and destination regions are the same, but there is a + // significant subpixel offset. We can use a faster linear filter to deal + // with the offset in this case. 
+ return LINEAR_FILTER_FAST; + } + // Otherwise, we have a constant 1:1 step and we're stepping reasonably close + // to the center of each pixel, so it's safe to disable the linear filter and + // use nearest. + return LINEAR_FILTER_NEAREST; +} + +// Commit an entire span with linear filtering +#define swgl_commitTextureLinear(format, s, p, uv_rect, color, n) \ + do { \ + auto packed_color = packColor(swgl_Out##format, color); \ + int len = (n); \ + int drawn = 0; \ + if (LinearFilter filter = needsTextureLinear(s, p, len)) { \ + if (blend_key) { \ + drawn = blendTextureLinear(s, p, len, uv_rect, packed_color, \ + swgl_Out##format, filter); \ + } else { \ + drawn = blendTextureLinear(s, p, len, uv_rect, packed_color, \ + swgl_Out##format, filter); \ + } \ + } else if (blend_key) { \ + drawn = blendTextureNearestFast(s, p, len, uv_rect, packed_color, \ + swgl_Out##format); \ + } else { \ + drawn = blendTextureNearestFast(s, p, len, uv_rect, packed_color, \ + swgl_Out##format); \ + } \ + swgl_Out##format += drawn; \ + swgl_SpanLength -= drawn; \ + } while (0) +#define swgl_commitTextureLinearRGBA8(s, p, uv_rect) \ + swgl_commitTextureLinear(RGBA8, s, p, uv_rect, NoColor(), swgl_SpanLength) +#define swgl_commitTextureLinearR8(s, p, uv_rect) \ + swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(), swgl_SpanLength) + +// Commit a partial span with linear filtering, optionally inverting the color +#define swgl_commitPartialTextureLinearR8(len, s, p, uv_rect) \ + swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(), \ + min(int(len), swgl_SpanLength)) +#define swgl_commitPartialTextureLinearInvertR8(len, s, p, uv_rect) \ + swgl_commitTextureLinear(R8, s, p, uv_rect, InvertColor(), \ + min(int(len), swgl_SpanLength)) + +// Commit an entire span with linear filtering that is scaled by a color +#define swgl_commitTextureLinearColorRGBA8(s, p, uv_rect, color) \ + swgl_commitTextureLinear(RGBA8, s, p, uv_rect, color, swgl_SpanLength) +#define 
swgl_commitTextureLinearColorR8(s, p, uv_rect, color) \ + swgl_commitTextureLinear(R8, s, p, uv_rect, color, swgl_SpanLength) + +// Helper function that samples from an R8 texture while expanding it to support +// a differing framebuffer format. +template +static inline int blendTextureLinearR8(S sampler, vec2 uv, int span, + const vec4_scalar& uv_rect, C color, + P* buf) { + if (!swgl_isTextureR8(sampler) || sampler->width < 2) { + return 0; + } + LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv); + for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { + commit_blend_span( + buf, applyColor(expand_mask(buf, textureLinearUnpackedR8( + sampler, + ivec2(clamp(uv, min_uv, max_uv)))), + color)); + } + return span; +} + +// Commit an entire span with linear filtering while expanding from R8 to RGBA8 +#define swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, color) \ + do { \ + auto packed_color = packColor(swgl_OutRGBA8, color); \ + int drawn = 0; \ + if (blend_key) { \ + drawn = blendTextureLinearR8(s, p, swgl_SpanLength, uv_rect, \ + packed_color, swgl_OutRGBA8); \ + } else { \ + drawn = blendTextureLinearR8(s, p, swgl_SpanLength, uv_rect, \ + packed_color, swgl_OutRGBA8); \ + } \ + swgl_OutRGBA8 += drawn; \ + swgl_SpanLength -= drawn; \ + } while (0) +#define swgl_commitTextureLinearR8ToRGBA8(s, p, uv_rect) \ + swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, NoColor()) + +// Compute repeating UVs, possibly constrained by tile repeat limits +static inline vec2 tileRepeatUV(vec2 uv, const vec2_scalar& tile_repeat) { + if (tile_repeat.x > 0.0f) { + // Clamp to a number slightly less than the tile repeat limit so that + // it results in a number close to but not equal to 1 after fract(). + // This avoids fract() yielding 0 if the limit was left as whole integer. 
+ uv = clamp(uv, vec2_scalar(0.0f), tile_repeat - 1.0e-6f); + } + return fract(uv); +} + +// Compute the number of non-repeating steps before we need to potentially +// repeat the UVs. +static inline int computeNoRepeatSteps(Float uv, float uv_step, + float tile_repeat, int steps) { + if (uv.w < uv.x) { + // Ensure the UV taps are ordered low to high. + uv = uv.wzyx; + } + // Check if the samples cross the boundary of the next whole integer or the + // tile repeat limit, whichever is lower. + float limit = floor(uv.x) + 1.0f; + if (tile_repeat > 0.0f) { + limit = min(limit, tile_repeat); + } + return uv.x >= 0.0f && uv.w < limit + ? (uv_step != 0.0f + ? int(clamp((limit - uv.x) / uv_step, 0.0f, float(steps))) + : steps) + : 0; +} + +// Blends an entire span of texture with linear filtering and repeating UVs. +template +static int blendTextureLinearRepeat(S sampler, vec2 uv, int span, + const vec2_scalar& tile_repeat, + const vec4_scalar& uv_repeat, + const vec4_scalar& uv_rect, C color, + P* buf) { + if (!matchTextureFormat(sampler, buf)) { + return 0; + } + vec2_scalar uv_scale = {uv_repeat.z - uv_repeat.x, uv_repeat.w - uv_repeat.y}; + vec2_scalar uv_offset = {uv_repeat.x, uv_repeat.y}; + // Choose a linear filter to use for no-repeat sub-spans + LinearFilter filter = + needsTextureLinear(sampler, uv * uv_scale + uv_offset, span); + // We need to step UVs unscaled and unquantized so that we can modulo them + // with fract. We use uv_scale and uv_offset to map them into the correct + // range. 
+ vec2_scalar uv_step = + float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; + uv_scale = swgl_linearQuantizeStep(sampler, uv_scale); + uv_offset = swgl_linearQuantize(sampler, uv_offset); + vec2_scalar min_uv = max( + swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f); + vec2_scalar max_uv = max( + swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), min_uv); + for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { + int steps = int(end - buf) / swgl_StepSize; + // Find the sub-span before UVs repeat to avoid expensive repeat math + steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps); + if (steps > 0) { + steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps); + if (steps > 0) { + buf = blendTextureLinearDispatch( + sampler, fract(uv) * uv_scale + uv_offset, steps * swgl_StepSize, + uv_step * uv_scale, min_uv, max_uv, color, buf, filter); + if (buf >= end) { + break; + } + uv += steps * uv_step; + } + } + // UVs might repeat within this step, so explicitly compute repeated UVs + vec2 repeated_uv = clamp( + tileRepeatUV(uv, tile_repeat) * uv_scale + uv_offset, min_uv, max_uv); + commit_blend_span( + buf, applyColor(textureLinearUnpacked(buf, sampler, ivec2(repeated_uv)), + color)); + } + return span; +} + +// Commit an entire span with linear filtering and repeating UVs +#define swgl_commitTextureLinearRepeat(format, s, p, tile_repeat, uv_repeat, \ + uv_rect, color) \ + do { \ + auto packed_color = packColor(swgl_Out##format, color); \ + int drawn = 0; \ + if (blend_key) { \ + drawn = blendTextureLinearRepeat(s, p, swgl_SpanLength, \ + tile_repeat, uv_repeat, uv_rect, \ + packed_color, swgl_Out##format); \ + } else { \ + drawn = blendTextureLinearRepeat(s, p, swgl_SpanLength, \ + tile_repeat, uv_repeat, uv_rect, \ + packed_color, swgl_Out##format); \ + } \ + swgl_Out##format += drawn; \ + swgl_SpanLength -= drawn; \ + } while (0) +#define 
swgl_commitTextureLinearRepeatRGBA8(s, p, tile_repeat, uv_repeat, \ + uv_rect) \ + swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \ + NoColor()) +#define swgl_commitTextureLinearRepeatColorRGBA8(s, p, tile_repeat, uv_repeat, \ + uv_rect, color) \ + swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \ + color) + +template +static ALWAYS_INLINE PackedRGBA8 textureNearestPacked(UNUSED uint32_t* buf, + S sampler, ivec2 i) { + return textureNearestPackedRGBA8(sampler, i); +} + +// Blends an entire span of texture with nearest filtering and either +// repeated or clamped UVs. +template +static int blendTextureNearestRepeat(S sampler, vec2 uv, int span, + const vec2_scalar& tile_repeat, + const vec4_scalar& uv_rect, C color, + P* buf) { + if (!matchTextureFormat(sampler, buf)) { + return 0; + } + if (!REPEAT) { + // If clamping, then we step pre-scaled to the sampler. For repeat modes, + // this will be accomplished via uv_scale instead. + uv = samplerScale(sampler, uv); + } + vec2_scalar uv_step = + float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; + vec2_scalar min_uv = samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y}); + vec2_scalar max_uv = samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w}); + vec2_scalar uv_scale = max_uv - min_uv; + // If the effective sampling area of this texture is only a single pixel, then + // treat it as a solid span. For repeat modes, the bounds are specified on + // pixel boundaries, whereas for clamp modes, bounds are on pixel centers, so + // the test varies depending on which. If the sample range on an axis is + // greater than one pixel, we can still check if we don't move far enough from + // the pixel center on that axis to hit the next pixel. + if ((int(min_uv.x) + (REPEAT ? 1 : 0) >= int(max_uv.x) || + (abs(uv_step.x) * span * (REPEAT ? uv_scale.x : 1.0f) < 0.5f)) && + (int(min_uv.y) + (REPEAT ? 
1 : 0) >= int(max_uv.y) || + (abs(uv_step.y) * span * (REPEAT ? uv_scale.y : 1.0f) < 0.5f))) { + vec2 repeated_uv = REPEAT + ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv + : clamp(uv, min_uv, max_uv); + commit_solid_span(buf, + applyColor(unpack(textureNearestPacked( + buf, sampler, ivec2(repeated_uv))), + color), + span); + } else { + for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { + if (REPEAT) { + int steps = int(end - buf) / swgl_StepSize; + // Find the sub-span before UVs repeat to avoid expensive repeat math + steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps); + if (steps > 0) { + steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps); + if (steps > 0) { + vec2 inside_uv = fract(uv) * uv_scale + min_uv; + vec2 inside_step = uv_step * uv_scale; + for (P* outside = &buf[steps * swgl_StepSize]; buf < outside; + buf += swgl_StepSize, inside_uv += inside_step) { + commit_blend_span( + buf, applyColor( + textureNearestPacked(buf, sampler, ivec2(inside_uv)), + color)); + } + if (buf >= end) { + break; + } + uv += steps * uv_step; + } + } + } + + // UVs might repeat within this step, so explicitly compute repeated UVs + vec2 repeated_uv = REPEAT + ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv + : clamp(uv, min_uv, max_uv); + commit_blend_span( + buf, + applyColor(textureNearestPacked(buf, sampler, ivec2(repeated_uv)), + color)); + } + } + return span; +} + +// Determine if we can use the fast nearest filter for the given nearest mode. +// If the Y coordinate varies more than half a pixel over +// the span (which might cause the texel to alias to the next one), or the span +// needs X scaling, then we have to use the fallback. 
+template +static ALWAYS_INLINE bool needsNearestFallback(S sampler, T P, int span) { + P = samplerScale(sampler, P); + return (P.y.y - P.y.x) * span >= 0.5f || spanNeedsScale(span, P); +} + +// Commit an entire span with nearest filtering and either clamped or repeating +// UVs +#define swgl_commitTextureNearest(format, s, p, uv_rect, color) \ + do { \ + auto packed_color = packColor(swgl_Out##format, color); \ + int drawn = 0; \ + if (needsNearestFallback(s, p, swgl_SpanLength)) { \ + if (blend_key) { \ + drawn = blendTextureNearestRepeat( \ + s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \ + swgl_Out##format); \ + } else { \ + drawn = blendTextureNearestRepeat( \ + s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \ + swgl_Out##format); \ + } \ + } else if (blend_key) { \ + drawn = blendTextureNearestFast(s, p, swgl_SpanLength, uv_rect, \ + packed_color, swgl_Out##format); \ + } else { \ + drawn = blendTextureNearestFast(s, p, swgl_SpanLength, uv_rect, \ + packed_color, swgl_Out##format); \ + } \ + swgl_Out##format += drawn; \ + swgl_SpanLength -= drawn; \ + } while (0) +#define swgl_commitTextureNearestRGBA8(s, p, uv_rect) \ + swgl_commitTextureNearest(RGBA8, s, p, uv_rect, NoColor()) +#define swgl_commitTextureNearestColorRGBA8(s, p, uv_rect, color) \ + swgl_commitTextureNearest(RGBA8, s, p, uv_rect, color) + +#define swgl_commitTextureNearestRepeat(format, s, p, tile_repeat, uv_rect, \ + color) \ + do { \ + auto packed_color = packColor(swgl_Out##format, color); \ + int drawn = 0; \ + if (blend_key) { \ + drawn = blendTextureNearestRepeat( \ + s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color, \ + swgl_Out##format); \ + } else { \ + drawn = blendTextureNearestRepeat( \ + s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color, \ + swgl_Out##format); \ + } \ + swgl_Out##format += drawn; \ + swgl_SpanLength -= drawn; \ + } while (0) +#define swgl_commitTextureNearestRepeatRGBA8(s, p, tile_repeat, uv_repeat, \ + uv_rect) \ + 
swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, \ + NoColor()) +#define swgl_commitTextureNearestRepeatColorRGBA8(s, p, tile_repeat, \ + uv_repeat, uv_rect, color) \ + swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, color) + +// Commit an entire span of texture with filtering determined by sampler state. +#define swgl_commitTexture(format, s, ...) \ + do { \ + if (s->filter == TextureFilter::LINEAR) { \ + swgl_commitTextureLinear##format(s, __VA_ARGS__); \ + } else { \ + swgl_commitTextureNearest##format(s, __VA_ARGS__); \ + } \ + } while (0) +#define swgl_commitTextureRGBA8(...) swgl_commitTexture(RGBA8, __VA_ARGS__) +#define swgl_commitTextureColorRGBA8(...) \ + swgl_commitTexture(ColorRGBA8, __VA_ARGS__) +#define swgl_commitTextureRepeatRGBA8(...) \ + swgl_commitTexture(RepeatRGBA8, __VA_ARGS__) +#define swgl_commitTextureRepeatColorRGBA8(...) \ + swgl_commitTexture(RepeatColorRGBA8, __VA_ARGS__) + +// Commit an entire span of a separable pass of a Gaussian blur that falls +// within the given radius scaled by supplied coefficients, clamped to uv_rect +// bounds. +template +static int blendGaussianBlur(S sampler, vec2 uv, const vec4_scalar& uv_rect, + P* buf, int span, bool hori, int radius, + vec2_scalar coeffs) { + if (!matchTextureFormat(sampler, buf)) { + return 0; + } + vec2_scalar size = {float(sampler->width), float(sampler->height)}; + ivec2_scalar curUV = make_ivec2(force_scalar(uv) * size); + ivec4_scalar bounds = make_ivec4(uv_rect * make_vec4(size, size)); + int startX = curUV.x; + int endX = min(min(bounds.z, curUV.x + span), int(size.x)); + if (hori) { + for (; curUV.x + swgl_StepSize <= endX; + buf += swgl_StepSize, curUV.x += swgl_StepSize) { + commit_blend_span( + buf, gaussianBlurHorizontal

(sampler, curUV, bounds.x, bounds.z, + radius, coeffs.x, coeffs.y)); + } + } else { + for (; curUV.x + swgl_StepSize <= endX; + buf += swgl_StepSize, curUV.x += swgl_StepSize) { + commit_blend_span( + buf, gaussianBlurVertical

(sampler, curUV, bounds.y, bounds.w, + radius, coeffs.x, coeffs.y)); + } + } + return curUV.x - startX; +} + +#define swgl_commitGaussianBlur(format, s, p, uv_rect, hori, radius, coeffs) \ + do { \ + int drawn = 0; \ + if (blend_key) { \ + drawn = blendGaussianBlur(s, p, uv_rect, swgl_Out##format, \ + swgl_SpanLength, hori, radius, coeffs); \ + } else { \ + drawn = blendGaussianBlur(s, p, uv_rect, swgl_Out##format, \ + swgl_SpanLength, hori, radius, coeffs); \ + } \ + swgl_Out##format += drawn; \ + swgl_SpanLength -= drawn; \ + } while (0) +#define swgl_commitGaussianBlurRGBA8(s, p, uv_rect, hori, radius, coeffs) \ + swgl_commitGaussianBlur(RGBA8, s, p, uv_rect, hori, radius, coeffs) +#define swgl_commitGaussianBlurR8(s, p, uv_rect, hori, radius, coeffs) \ + swgl_commitGaussianBlur(R8, s, p, uv_rect, hori, radius, coeffs) + +// Convert and pack planar YUV samples to RGB output using a color space +static ALWAYS_INLINE PackedRGBA8 convertYUV(const YUVMatrix& rgb_from_ycbcr, + U16 y, U16 u, U16 v) { + auto yy = V8(zip(y, y)); + auto uv = V8(zip(u, v)); + return rgb_from_ycbcr.convert(yy, uv); +} + +// Helper functions to sample from planar YUV textures before converting to RGB +template +static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, + const YUVMatrix& rgb_from_ycbcr, + UNUSED int rescaleFactor) { + switch (sampler0->format) { + case TextureFormat::RGBA8: { + auto planar = textureLinearPlanarRGBA8(sampler0, uv0); + return convertYUV(rgb_from_ycbcr, highHalf(planar.rg), lowHalf(planar.rg), + lowHalf(planar.ba)); + } + case TextureFormat::YUV422: { + auto planar = textureLinearPlanarYUV422(sampler0, uv0); + return convertYUV(rgb_from_ycbcr, planar.y, planar.u, planar.v); + } + default: + assert(false); + return PackedRGBA8(0); + } +} + +template +static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0, + const vec4_scalar& uv_rect0, const vec3_scalar& ycbcr_bias, + const mat3_scalar& rgb_from_debiased_ycbcr, + int rescaleFactor, C color = C()) { 
+ if (!swgl_isTextureLinear(sampler0)) { + return 0; + } + LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); + const auto rgb_from_ycbcr = + YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor); + auto c = packColor(buf, color); + auto* end = buf + span; + for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0) { + commit_blend_span( + buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)), + rgb_from_ycbcr, rescaleFactor), + c)); + } + return span; +} + +template +static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1, + ivec2 uv1, + const YUVMatrix& rgb_from_ycbcr, + int rescaleFactor) { + switch (sampler1->format) { + case TextureFormat::RG8: { + assert(sampler0->format == TextureFormat::R8); + auto y = textureLinearUnpackedR8(sampler0, uv0); + auto planar = textureLinearPlanarRG8(sampler1, uv1); + return convertYUV(rgb_from_ycbcr, y, lowHalf(planar.rg), + highHalf(planar.rg)); + } + case TextureFormat::RGBA8: { + assert(sampler0->format == TextureFormat::R8); + auto y = textureLinearUnpackedR8(sampler0, uv0); + auto planar = textureLinearPlanarRGBA8(sampler1, uv1); + return convertYUV(rgb_from_ycbcr, y, lowHalf(planar.ba), + highHalf(planar.rg)); + } + case TextureFormat::RG16: { + assert(sampler0->format == TextureFormat::R16); + // The rescaling factor represents how many bits to add to renormalize the + // texture to 16 bits, and so the color depth is actually 16 minus the + // rescaling factor. + // Need to right shift the sample by the amount of bits over 8 it + // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit + // of precision at the low end already, hence 1 is subtracted from the + // color depth. 
+ int colorDepth = 16 - rescaleFactor; + int rescaleBits = (colorDepth - 1) - 8; + auto y = textureLinearUnpackedR16(sampler0, uv0) >> rescaleBits; + auto uv = textureLinearUnpackedRG16(sampler1, uv1) >> rescaleBits; + return rgb_from_ycbcr.convert(zip(y, y), uv); + } + default: + assert(false); + return PackedRGBA8(0); + } +} + +template +static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0, + const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1, + const vec4_scalar& uv_rect1, const vec3_scalar& ycbcr_bias, + const mat3_scalar& rgb_from_debiased_ycbcr, + int rescaleFactor, C color = C()) { + if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1)) { + return 0; + } + LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); + LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1); + const auto rgb_from_ycbcr = + YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor); + auto c = packColor(buf, color); + auto* end = buf + span; + for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0, uv1 += uv_step1) { + commit_blend_span( + buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)), + sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)), + rgb_from_ycbcr, rescaleFactor), + c)); + } + return span; +} + +template +static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1, + ivec2 uv1, S2 sampler2, ivec2 uv2, + const YUVMatrix& rgb_from_ycbcr, + int rescaleFactor) { + assert(sampler0->format == sampler1->format && + sampler0->format == sampler2->format); + switch (sampler0->format) { + case TextureFormat::R8: { + auto y = textureLinearUnpackedR8(sampler0, uv0); + auto u = textureLinearUnpackedR8(sampler1, uv1); + auto v = textureLinearUnpackedR8(sampler2, uv2); + return convertYUV(rgb_from_ycbcr, y, u, v); + } + case TextureFormat::R16: { + // The rescaling factor represents how many bits to add to renormalize the + // texture to 16 bits, and so the color depth is actually 16 
minus the + // rescaling factor. + // Need to right shift the sample by the amount of bits over 8 it + // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit + // of precision at the low end already, hence 1 is subtracted from the + // color depth. + int colorDepth = 16 - rescaleFactor; + int rescaleBits = (colorDepth - 1) - 8; + auto y = textureLinearUnpackedR16(sampler0, uv0) >> rescaleBits; + auto u = textureLinearUnpackedR16(sampler1, uv1) >> rescaleBits; + auto v = textureLinearUnpackedR16(sampler2, uv2) >> rescaleBits; + return convertYUV(rgb_from_ycbcr, U16(y), U16(u), U16(v)); + } + default: + assert(false); + return PackedRGBA8(0); + } +} + +// Fallback helper for when we can't specifically accelerate YUV with +// composition. +template +static void blendYUVFallback(P* buf, int span, S0 sampler0, vec2 uv0, + vec2_scalar uv_step0, vec2_scalar min_uv0, + vec2_scalar max_uv0, S1 sampler1, vec2 uv1, + vec2_scalar uv_step1, vec2_scalar min_uv1, + vec2_scalar max_uv1, S2 sampler2, vec2 uv2, + vec2_scalar uv_step2, vec2_scalar min_uv2, + vec2_scalar max_uv2, const vec3_scalar& ycbcr_bias, + const mat3_scalar& rgb_from_debiased_ycbcr, + int rescaleFactor, C color) { + const auto rgb_from_ycbcr = + YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor); + for (auto* end = buf + span; buf < end; buf += swgl_StepSize, uv0 += uv_step0, + uv1 += uv_step1, uv2 += uv_step2) { + commit_blend_span( + buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)), + sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)), + sampler2, ivec2(clamp(uv2, min_uv2, max_uv2)), + rgb_from_ycbcr, rescaleFactor), + color)); + } +} + +template +static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0, + const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1, + const vec4_scalar& uv_rect1, S2 sampler2, vec2 uv2, + const vec4_scalar& uv_rect2, const vec3_scalar& ycbcr_bias, + const mat3_scalar& rgb_from_debiased_ycbcr, + int rescaleFactor, C color = C()) { + 
if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) || + !swgl_isTextureLinear(sampler2)) { + return 0; + } + LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); + LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1); + LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2); + auto c = packColor(buf, color); + blendYUVFallback(buf, span, sampler0, uv0, uv_step0, min_uv0, max_uv0, + sampler1, uv1, uv_step1, min_uv1, max_uv1, sampler2, + uv2, uv_step2, min_uv2, max_uv2, ycbcr_bias, + rgb_from_debiased_ycbcr, rescaleFactor, c); + return span; +} + +// A variant of the blendYUV that attempts to reuse the inner loops from the +// CompositeYUV infrastructure. CompositeYUV imposes stricter requirements on +// the source data, which in turn allows it to be much faster than blendYUV. +// At a minimum, we need to ensure that we are outputting to a BGRA8 framebuffer +// and that no color scaling is applied, which we can accomplish via template +// specialization. We need to further validate inside that texture formats +// and dimensions are sane for video and that the video is axis-aligned before +// acceleration can proceed. 
+template +static int blendYUV(uint32_t* buf, int span, sampler2DRect sampler0, vec2 uv0, + const vec4_scalar& uv_rect0, sampler2DRect sampler1, + vec2 uv1, const vec4_scalar& uv_rect1, + sampler2DRect sampler2, vec2 uv2, + const vec4_scalar& uv_rect2, const vec3_scalar& ycbcr_bias, + const mat3_scalar& rgb_from_debiased_ycbcr, + int rescaleFactor, NoColor noColor = NoColor()) { + if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) || + !swgl_isTextureLinear(sampler2)) { + return 0; + } + LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); + LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1); + LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2); + auto* end = buf + span; + // CompositeYUV imposes further restrictions on the source textures, such that + // the the Y/U/V samplers must all have a matching format, the U/V samplers + // must have matching sizes and sample coordinates, and there must be no + // change in row across the entire span. + if (sampler0->format == sampler1->format && + sampler1->format == sampler2->format && + sampler1->width == sampler2->width && + sampler1->height == sampler2->height && uv_step0.y == 0 && + uv_step0.x > 0 && uv_step1.y == 0 && uv_step1.x > 0 && + uv_step1 == uv_step2 && uv1.x.x == uv2.x.x && uv1.y.x == uv2.y.x) { + // CompositeYUV does not support a clamp rect, so we must take care to + // advance till we're inside the bounds of the clamp rect. 
+ int outside = min(int(ceil(max((min_uv0.x - uv0.x.x) / uv_step0.x, + (min_uv1.x - uv1.x.x) / uv_step1.x))), + (end - buf) / swgl_StepSize); + if (outside > 0) { + blendYUVFallback(buf, outside * swgl_StepSize, sampler0, uv0, + uv_step0, min_uv0, max_uv0, sampler1, uv1, + uv_step1, min_uv1, max_uv1, sampler2, uv2, + uv_step2, min_uv2, max_uv2, ycbcr_bias, + rgb_from_debiased_ycbcr, rescaleFactor, noColor); + buf += outside * swgl_StepSize; + uv0.x += outside * uv_step0.x; + uv1.x += outside * uv_step1.x; + uv2.x += outside * uv_step2.x; + } + // Find the amount of chunks inside the clamp rect before we hit the + // maximum. If there are any chunks inside, we can finally dispatch to + // CompositeYUV. + int inside = min(int(min((max_uv0.x - uv0.x.x) / uv_step0.x, + (max_uv1.x - uv1.x.x) / uv_step1.x)), + (end - buf) / swgl_StepSize); + if (inside > 0) { + // We need the color depth, which is relative to the texture format and + // rescale factor. + int colorDepth = + (sampler0->format == TextureFormat::R16 ? 16 : 8) - rescaleFactor; + // Finally, call the inner loop of CompositeYUV. + const auto rgb_from_ycbcr = + YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor); + linear_row_yuv( + buf, inside * swgl_StepSize, sampler0, force_scalar(uv0), + uv_step0.x / swgl_StepSize, sampler1, sampler2, force_scalar(uv1), + uv_step1.x / swgl_StepSize, colorDepth, rgb_from_ycbcr); + // Now that we're done, advance past the processed inside portion. + buf += inside * swgl_StepSize; + uv0.x += inside * uv_step0.x; + uv1.x += inside * uv_step1.x; + uv2.x += inside * uv_step2.x; + } + } + // We either got here because we have some samples outside the clamp rect, or + // because some of the preconditions were not satisfied. Process whatever is + // left of the span. 
+ blendYUVFallback(buf, end - buf, sampler0, uv0, uv_step0, min_uv0, + max_uv0, sampler1, uv1, uv_step1, min_uv1, max_uv1, + sampler2, uv2, uv_step2, min_uv2, max_uv2, ycbcr_bias, + rgb_from_debiased_ycbcr, rescaleFactor, noColor); + return span; +} + +// Commit a single chunk of a YUV surface represented by multiple planar +// textures. This requires a color space specifier selecting how to convert +// from YUV to RGB output. In the case of HDR formats, a rescaling factor +// selects how many bits of precision must be utilized on conversion. See the +// sampleYUV dispatcher functions for the various supported plane +// configurations this intrinsic accepts. +#define swgl_commitTextureLinearYUV(...) \ + do { \ + int drawn = 0; \ + if (blend_key) { \ + drawn = blendYUV(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \ + } else { \ + drawn = blendYUV(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \ + } \ + swgl_OutRGBA8 += drawn; \ + swgl_SpanLength -= drawn; \ + } while (0) + +// Commit a single chunk of a YUV surface scaled by a color. +#define swgl_commitTextureLinearColorYUV(...) \ + swgl_commitTextureLinearYUV(__VA_ARGS__) + +// Each gradient stops entry is a pair of RGBA32F start color and end step. +struct GradientStops { + Float startColor; + union { + Float stepColor; + vec4_scalar stepData; + }; + + // Whether this gradient entry can be merged with an adjacent entry. The + // step will be equal with the adjacent step if and only if they can be + // merged, or rather, that the stops are actually part of a single larger + // gradient. + bool can_merge(const GradientStops& next) const { + return stepData == next.stepData; + } + + // Get the interpolated color within the entry based on the offset from its + // start. + Float interpolate(float offset) const { + return startColor + stepColor * offset; + } + + // Get the end color of the entry where interpolation stops. 
+ Float end_color() const { return startColor + stepColor; } +}; + +// Checks if a gradient table of the specified size exists at the UV coords of +// the address within an RGBA32F texture. If so, a linear address within the +// texture is returned that may be used to sample the gradient table later. If +// the address doesn't describe a valid gradient, then a negative value is +// returned. +static inline int swgl_validateGradient(sampler2D sampler, ivec2_scalar address, + int entries) { + return sampler->format == TextureFormat::RGBA32F && address.y >= 0 && + address.y < int(sampler->height) && address.x >= 0 && + address.x < int(sampler->width) && entries > 0 && + address.x + + int(sizeof(GradientStops) / sizeof(Float)) * entries <= + int(sampler->width) + ? address.y * sampler->stride + address.x * 4 + : -1; +} + +static inline WideRGBA8 sampleGradient(sampler2D sampler, int address, + Float entry) { + assert(sampler->format == TextureFormat::RGBA32F); + assert(address >= 0 && address < int(sampler->height * sampler->stride)); + // Get the integer portion of the entry index to find the entry colors. + I32 index = cast(entry); + // Use the fractional portion of the entry index to control blending between + // entry colors. + Float offset = entry - cast(index); + // Every entry is a pair of colors blended by the fractional offset. + assert(test_all(index >= 0 && + index * int(sizeof(GradientStops) / sizeof(Float)) < + int(sampler->width))); + GradientStops* stops = (GradientStops*)&sampler->buf[address]; + // Blend between the colors for each SIMD lane, then pack them to RGBA8 + // result. Since the layout of the RGBA8 framebuffer is actually BGRA while + // the gradient table has RGBA colors, swizzling is required. 
+ return combine( + packRGBA8(round_pixel(stops[index.x].interpolate(offset.x).zyxw), + round_pixel(stops[index.y].interpolate(offset.y).zyxw)), + packRGBA8(round_pixel(stops[index.z].interpolate(offset.z).zyxw), + round_pixel(stops[index.w].interpolate(offset.w).zyxw))); +} + +// Samples a gradient entry from the gradient at the provided linearized +// address. The integer portion of the entry index is used to find the entry +// within the table whereas the fractional portion is used to blend between +// adjacent table entries. +#define swgl_commitGradientRGBA8(sampler, address, entry) \ + swgl_commitChunk(RGBA8, sampleGradient(sampler, address, entry)) + +// Variant that allows specifying a color multiplier of the gradient result. +#define swgl_commitGradientColorRGBA8(sampler, address, entry, color) \ + swgl_commitChunk(RGBA8, applyColor(sampleGradient(sampler, address, entry), \ + packColor(swgl_OutRGBA, color))) + +// Samples an entire span of a linear gradient by crawling the gradient table +// and looking for consecutive stops that can be merged into a single larger +// gradient, then interpolating between those larger gradients within the span. +template +static bool commitLinearGradient(sampler2D sampler, int address, float size, + bool tileRepeat, bool gradientRepeat, vec2 pos, + const vec2_scalar& scaleDir, float startOffset, + uint32_t* buf, int span) { + assert(sampler->format == TextureFormat::RGBA32F); + assert(address >= 0 && address < int(sampler->height * sampler->stride)); + GradientStops* stops = (GradientStops*)&sampler->buf[address]; + // Get the chunk delta from the difference in offset steps. This represents + // how far within the gradient table we advance for every step in output, + // normalized to gradient table size. + vec2_scalar posStep = dFdx(pos) * 4.0f; + float delta = dot(posStep, scaleDir); + if (!isfinite(delta)) { + return false; + } + // If we have a repeating brush, then the position will be modulo the [0,1) + // interval. 
Compute coefficients that can be used to quickly evaluate the + // distance to the interval boundary where the offset will wrap. + vec2_scalar distCoeffsX = {0.25f * span, 0.0f}; + vec2_scalar distCoeffsY = distCoeffsX; + if (tileRepeat) { + if (posStep.x != 0.0f) { + distCoeffsX = vec2_scalar{step(0.0f, posStep.x), 1.0f} * recip(posStep.x); + } + if (posStep.y != 0.0f) { + distCoeffsY = vec2_scalar{step(0.0f, posStep.y), 1.0f} * recip(posStep.y); + } + } + for (; span > 0;) { + // Try to process as many chunks as are within the span if possible. + float chunks = 0.25f * span; + vec2 repeatPos = pos; + if (tileRepeat) { + // If this is a repeating brush, then limit the chunks to not cross the + // interval boundaries. + repeatPos = fract(pos); + chunks = min(chunks, distCoeffsX.x - repeatPos.x.x * distCoeffsX.y); + chunks = min(chunks, distCoeffsY.x - repeatPos.y.x * distCoeffsY.y); + } + // Compute the gradient offset from the position. + Float offset = + repeatPos.x * scaleDir.x + repeatPos.y * scaleDir.y - startOffset; + // If repeat is desired, we need to limit the offset to a fractional value. + if (gradientRepeat) { + offset = fract(offset); + } + // To properly handle both clamping and repeating of the table offset, we + // need to ensure we don't run past the 0 and 1 points. Here we compute the + // intercept points depending on whether advancing forwards or backwards in + // the gradient table to ensure the chunk count is limited by the amount + // before intersection. If there is no delta, then we compute no intercept. + float startEntry; + int minIndex, maxIndex; + if (offset.x < 0) { + // If we're below the gradient table, use the first color stop. We can + // only intercept the table if walking forward. + startEntry = 0; + minIndex = int(startEntry); + maxIndex = minIndex; + if (delta > 0) { + chunks = min(chunks, -offset.x / delta); + } + } else if (offset.x < 1) { + // Otherwise, we're inside the gradient table. 
Depending on the direction + // we're walking the the table, we may intersect either the 0 or 1 offset. + // Compute the start entry based on our initial offset, and compute the + // end entry based on the available chunks limited by intercepts. Clamp + // them into the valid range of the table. + startEntry = 1.0f + offset.x * size; + if (delta < 0) { + chunks = min(chunks, -offset.x / delta); + } else if (delta > 0) { + chunks = min(chunks, (1 - offset.x) / delta); + } + float endEntry = clamp(1.0f + (offset.x + delta * int(chunks)) * size, + 0.0f, 1.0f + size); + // Now that we know the range of entries we need to sample, we want to + // find the largest possible merged gradient within that range. Depending + // on which direction we are advancing in the table, we either walk up or + // down the table trying to merge the current entry with the adjacent + // entry. We finally limit the chunks to only sample from this merged + // gradient. + minIndex = int(startEntry); + maxIndex = minIndex; + if (delta > 0) { + while (maxIndex + 1 < endEntry && + stops[maxIndex].can_merge(stops[maxIndex + 1])) { + maxIndex++; + } + chunks = min(chunks, (maxIndex + 1 - startEntry) / (delta * size)); + } else if (delta < 0) { + while (minIndex - 1 > endEntry && + stops[minIndex - 1].can_merge(stops[minIndex])) { + minIndex--; + } + chunks = min(chunks, (minIndex - startEntry) / (delta * size)); + } + } else { + // If we're above the gradient table, use the last color stop. We can + // only intercept the table if walking backward. + startEntry = 1.0f + size; + minIndex = int(startEntry); + maxIndex = minIndex; + if (delta < 0) { + chunks = min(chunks, (1 - offset.x) / delta); + } + } + // If there are any amount of whole chunks of a merged gradient found, + // then we want to process that as a single gradient span with the start + // and end colors from the min and max entries. 
+ if (chunks >= 1.0f) { + int inside = int(chunks); + // Sample the start color from the min entry and the end color from the + // max entry of the merged gradient. These are scaled to a range of + // 0..0xFF00, as that is the largest shifted value that can fit in a U16. + // Since we are only doing addition with the step value, we can still + // represent negative step values without having to use an explicit sign + // bit, as the result will still come out the same, allowing us to gain an + // extra bit of precision. We will later shift these into 8 bit output + // range while committing the span, but stepping with higher precision to + // avoid banding. We convert from RGBA to BGRA here to avoid doing this in + // the inner loop. + auto minColorF = stops[minIndex].startColor.zyxw * float(0xFF00); + auto maxColorF = stops[maxIndex].end_color().zyxw * float(0xFF00); + // Get the color range of the merged gradient, normalized to its size. + auto colorRangeF = + (maxColorF - minColorF) * (1.0f / (maxIndex + 1 - minIndex)); + // Compute the actual starting color of the current start offset within + // the merged gradient. The value 0.5 is added to the low bits (0x80) so + // that the color will effective round to the nearest increment below. + auto colorF = + minColorF + colorRangeF * (startEntry - minIndex) + float(0x80); + // Compute the portion of the color range that we advance on each chunk. + Float deltaColorF = colorRangeF * (delta * size); + // Quantize the color delta and current color. These have already been + // scaled to the 0..0xFF00 range, so we just need to round them to U16. 
+ auto deltaColor = repeat4(CONVERT(round_pixel(deltaColorF, 1), U16)); + auto color = + combine(CONVERT(round_pixel(colorF, 1), U16), + CONVERT(round_pixel(colorF + deltaColorF * 0.25f, 1), U16), + CONVERT(round_pixel(colorF + deltaColorF * 0.5f, 1), U16), + CONVERT(round_pixel(colorF + deltaColorF * 0.75f, 1), U16)); + // Finally, step the current color through the output chunks, shifting + // it into 8 bit range and outputting as we go. + for (auto* end = buf + inside * 4; buf < end; buf += 4) { + commit_blend_span(buf, bit_cast(color >> 8)); + color += deltaColor; + } + // Deduct the number of chunks inside the gradient from the remaining + // overall span. If we exhausted the span, bail out. + span -= inside * 4; + if (span <= 0) { + break; + } + // Otherwise, assume we're in a transitional section of the gradient that + // will probably require per-sample table lookups, so fall through below. + // We need to re-evaluate the position and offset first, though. + pos += posStep * float(inside); + repeatPos = tileRepeat ? fract(pos) : pos; + offset = + repeatPos.x * scaleDir.x + repeatPos.y * scaleDir.y - startOffset; + if (gradientRepeat) { + offset = fract(offset); + } + } + // If we get here, there were no whole chunks of a merged gradient found + // that we could process, but we still have a non-zero amount of span left. + // That means we have segments of gradient that begin or end at the current + // entry we're on. For this case, we just fall back to sampleGradient which + // will calculate a table entry for each sample, assuming the samples may + // have different table entries. + Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size); + commit_blend_span(buf, sampleGradient(sampler, address, entry)); + span -= 4; + buf += 4; + pos += posStep; + } + return true; +} + +// Commits an entire span of a linear gradient, given the address of a table +// previously resolved with swgl_validateGradient. 
The size of the inner portion +// of the table is given, assuming the table start and ends with a single entry +// each to deal with clamping. Repeating will be handled if necessary. The +// initial offset within the table is used to designate where to start the span +// and how to step through the gradient table. +#define swgl_commitLinearGradientRGBA8(sampler, address, size, tileRepeat, \ + gradientRepeat, pos, scaleDir, \ + startOffset) \ + do { \ + bool drawn = false; \ + if (blend_key) { \ + drawn = commitLinearGradient( \ + sampler, address, size, tileRepeat, gradientRepeat, pos, scaleDir, \ + startOffset, swgl_OutRGBA8, swgl_SpanLength); \ + } else { \ + drawn = commitLinearGradient( \ + sampler, address, size, tileRepeat, gradientRepeat, pos, scaleDir, \ + startOffset, swgl_OutRGBA8, swgl_SpanLength); \ + } \ + if (drawn) { \ + swgl_OutRGBA8 += swgl_SpanLength; \ + swgl_SpanLength = 0; \ + } \ + } while (0) + +template +static ALWAYS_INLINE V fastSqrt(V v) { +#if USE_SSE2 || USE_NEON + // Clamp to avoid zero in inversesqrt. + return v * inversesqrt(CLAMP ? max(v, V(1.0e-10f)) : v); +#else + return sqrt(v); +#endif +} + +template +static ALWAYS_INLINE auto fastLength(V v) { + return fastSqrt(dot(v, v)); +} + +// Samples an entire span of a radial gradient by crawling the gradient table +// and looking for consecutive stops that can be merged into a single larger +// gradient, then interpolating between those larger gradients within the span +// based on the computed position relative to a radius. 
+template +static bool commitRadialGradient(sampler2D sampler, int address, float size, + bool repeat, vec2 pos, float radius, + uint32_t* buf, int span) { + assert(sampler->format == TextureFormat::RGBA32F); + assert(address >= 0 && address < int(sampler->height * sampler->stride)); + GradientStops* stops = (GradientStops*)&sampler->buf[address]; + // clang-format off + // Given position p, delta d, and radius r, we need to repeatedly solve the + // following quadratic for the pixel offset t: + // length(p + t*d) = r + // (px + t*dx)^2 + (py + t*dy)^2 = r^2 + // Rearranged into quadratic equation form (t^2*a + t*b + c = 0) this is: + // t^2*(dx^2+dy^2) + t*2*(dx*px+dy*py) + (px^2+py^2-r^2) = 0 + // t^2*d.d + t*2*d.p + (p.p-r^2) = 0 + // The solution of the quadratic formula t=(-b+-sqrt(b^2-4ac))/2a reduces to: + // t = -d.p/d.d +- sqrt((d.p/d.d)^2 - (p.p-r^2)/d.d) + // Note that d.p, d.d, p.p, and r^2 are constant across the gradient, and so + // we cache them below for faster computation. + // + // The quadratic has two solutions, representing the span intersecting the + // given radius of gradient, which can occur at two offsets. If there is only + // one solution (where b^2-4ac = 0), this represents the point at which the + // span runs tangent to the radius. This middle point is significant in that + // before it, we walk down the gradient ramp, and after it, we walk up the + // ramp. + // clang-format on + vec2_scalar pos0 = {pos.x.x, pos.y.x}; + vec2_scalar delta = {pos.x.y - pos.x.x, pos.y.y - pos.y.x}; + float deltaDelta = dot(delta, delta); + if (!isfinite(deltaDelta) || !isfinite(radius)) { + return false; + } + float invDelta, middleT, middleB; + if (deltaDelta > 0) { + invDelta = 1.0f / deltaDelta; + middleT = -dot(delta, pos0) * invDelta; + middleB = middleT * middleT - dot(pos0, pos0) * invDelta; + } else { + // If position is invariant, just set the coefficients so the quadratic + // always reduces to the end of the span. 
+ invDelta = 0.0f; + middleT = float(span); + middleB = 0.0f; + } + // We only want search for merged gradients up to the minimum of either the + // mid-point or the span length. Cache those offsets here as they don't vary + // in the inner loop. + Float middleEndRadius = fastLength( + pos0 + delta * (Float){middleT, float(span), 0.0f, 0.0f}); + float middleRadius = span < middleT ? middleEndRadius.y : middleEndRadius.x; + float endRadius = middleEndRadius.y; + // Convert delta to change in position per chunk. + delta *= 4; + deltaDelta *= 4 * 4; + // clang-format off + // Given current position p and delta d, we reduce: + // length(p) = sqrt(dot(p,p)) = dot(p,p) * invsqrt(dot(p,p)) + // where dot(p+d,p+d) can be accumulated as: + // (x+dx)^2+(y+dy)^2 = (x^2+y^2) + 2(x*dx+y*dy) + (dx^2+dy^2) + // = p.p + 2p.d + d.d + // Since p increases by d every loop iteration, p.d increases by d.d, and thus + // we can accumulate d.d to calculate 2p.d, then allowing us to get the next + // dot-product by adding it to dot-product p.p of the prior iteration. This + // saves us some multiplications and an expensive sqrt inside the inner loop. + // clang-format on + Float dotPos = dot(pos, pos); + Float dotPosDelta = 2.0f * dot(pos, delta) + deltaDelta; + float deltaDelta2 = 2.0f * deltaDelta; + for (int t = 0; t < span;) { + // Compute the gradient table offset from the current position. + Float offset = fastSqrt(dotPos) - radius; + float startRadius = radius; + // If repeat is desired, we need to limit the offset to a fractional value. + if (repeat) { + // The non-repeating radius at which the gradient table actually starts, + // radius + floor(offset) = radius + (offset - fract(offset)). + startRadius += offset.x; + offset = fract(offset); + startRadius -= offset.x; + } + // We need to find the min/max index in the table of the gradient we want to + // use as well as the intercept point where we leave this gradient. 
+ float intercept = -1; + int minIndex = 0; + int maxIndex = int(1.0f + size); + if (offset.x < 0) { + // If inside the inner radius of the gradient table, then use the first + // stop. Set the intercept to advance forward to the start of the gradient + // table. + maxIndex = minIndex; + if (t >= middleT) { + intercept = radius; + } + } else if (offset.x < 1) { + // Otherwise, we're inside the valid part of the gradient table. + minIndex = int(1.0f + offset.x * size); + maxIndex = minIndex; + // Find the offset in the gradient that corresponds to the search limit. + // We only search up to the minimum of either the mid-point or the span + // length. Get the table index that corresponds to this offset, clamped so + // that we avoid hitting the beginning (0) or end (1 + size) of the table. + float searchOffset = + (t >= middleT ? endRadius : middleRadius) - startRadius; + int searchIndex = int(clamp(1.0f + size * searchOffset, 1.0f, size)); + // If we are past the mid-point, walk up the gradient table trying to + // merge stops. If we're below the mid-point, we need to walk down the + // table. We note the table index at which we need to look for an + // intercept to determine a valid span. + if (t >= middleT) { + while (maxIndex + 1 <= searchIndex && + stops[maxIndex].can_merge(stops[maxIndex + 1])) { + maxIndex++; + } + intercept = maxIndex + 1; + } else { + while (minIndex - 1 >= searchIndex && + stops[minIndex - 1].can_merge(stops[minIndex])) { + minIndex--; + } + intercept = minIndex; + } + // Convert from a table index into units of radius from the center of the + // gradient. + intercept = clamp((intercept - 1.0f) / size, 0.0f, 1.0f) + startRadius; + } else { + // If outside the outer radius of the gradient table, then use the last + // stop. Set the intercept to advance toward the valid part of the + // gradient table if going in, or just run to the end of the span if going + // away from the gradient. 
+ minIndex = maxIndex; + if (t < middleT) { + intercept = radius + 1; + } + } + // Solve the quadratic for t to find where the merged gradient ends. If no + // intercept is found, just go to the middle or end of the span. + float endT = t >= middleT ? span : min(span, int(middleT)); + if (intercept >= 0) { + float b = middleB + intercept * intercept * invDelta; + if (b > 0) { + b = fastSqrt(b); + endT = min(endT, t >= middleT ? middleT + b : middleT - b); + } + } + // Figure out how many chunks are actually inside the merged gradient. + if (t + 4.0f <= endT) { + int inside = int(endT - t) & ~3; + // Convert start and end colors to BGRA and scale to 0..255 range later. + auto minColorF = stops[minIndex].startColor.zyxw * 255.0f; + auto maxColorF = stops[maxIndex].end_color().zyxw * 255.0f; + // Compute the change in color per change in gradient offset. + auto deltaColorF = + (maxColorF - minColorF) * (size / (maxIndex + 1 - minIndex)); + // Subtract off the color difference of the beginning of the current span + // from the beginning of the gradient. + Float colorF = + minColorF - deltaColorF * (startRadius + (minIndex - 1) / size); + // Finally, walk over the span accumulating the position dot product and + // getting its sqrt as an offset into the color ramp. Since we're already + // in BGRA format and scaled to 255, we just need to round to an integer + // and pack down to pixel format. + for (auto* end = buf + inside; buf < end; buf += 4) { + Float offsetG = fastSqrt(dotPos); + commit_blend_span( + buf, + combine( + packRGBA8(round_pixel(colorF + deltaColorF * offsetG.x, 1), + round_pixel(colorF + deltaColorF * offsetG.y, 1)), + packRGBA8(round_pixel(colorF + deltaColorF * offsetG.z, 1), + round_pixel(colorF + deltaColorF * offsetG.w, 1)))); + dotPos += dotPosDelta; + dotPosDelta += deltaDelta2; + } + // Advance past the portion of gradient we just processed. + t += inside; + // If we hit the end of the span, exit out now. 
+ if (t >= span) { + break; + } + // Otherwise, we are most likely in a transitional section of the gradient + // between stops that will likely require doing per-sample table lookups. + // Rather than having to redo all the searching above to figure that out, + // just assume that to be the case and fall through below to doing the + // table lookups to hopefully avoid an iteration. + offset = fastSqrt(dotPos) - radius; + if (repeat) { + offset = fract(offset); + } + } + // If we got here, that means we still have span left to process but did not + // have any whole chunks that fell within a merged gradient. Just fall back + // to doing a table lookup for each sample. + Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size); + commit_blend_span(buf, sampleGradient(sampler, address, entry)); + buf += 4; + t += 4; + dotPos += dotPosDelta; + dotPosDelta += deltaDelta2; + } + return true; +} + +// Commits an entire span of a radial gradient similar to +// swglcommitLinearGradient, but given a varying 2D position scaled to +// gradient-space and a radius at which the distance from the origin maps to the +// start of the gradient table. +#define swgl_commitRadialGradientRGBA8(sampler, address, size, repeat, pos, \ + radius) \ + do { \ + bool drawn = false; \ + if (blend_key) { \ + drawn = \ + commitRadialGradient(sampler, address, size, repeat, pos, \ + radius, swgl_OutRGBA8, swgl_SpanLength); \ + } else { \ + drawn = \ + commitRadialGradient(sampler, address, size, repeat, pos, \ + radius, swgl_OutRGBA8, swgl_SpanLength); \ + } \ + if (drawn) { \ + swgl_OutRGBA8 += swgl_SpanLength; \ + swgl_SpanLength = 0; \ + } \ + } while (0) + +// Extension to set a clip mask image to be sampled during blending. The offset +// specifies the positioning of the clip mask image relative to the viewport +// origin. The bounding box specifies the rectangle relative to the clip mask's +// origin that constrains sampling within the clip mask. 
Blending must be +// enabled for this to work. +static sampler2D swgl_ClipMask = nullptr; +static IntPoint swgl_ClipMaskOffset = {0, 0}; +static IntRect swgl_ClipMaskBounds = {0, 0, 0, 0}; +#define swgl_clipMask(mask, offset, bb_origin, bb_size) \ + do { \ + if (bb_size != vec2_scalar(0.0f, 0.0f)) { \ + swgl_ClipFlags |= SWGL_CLIP_FLAG_MASK; \ + swgl_ClipMask = mask; \ + swgl_ClipMaskOffset = make_ivec2(offset); \ + swgl_ClipMaskBounds = \ + IntRect(make_ivec2(bb_origin), make_ivec2(bb_size)); \ + } \ + } while (0) + +// Extension to enable anti-aliasing for the given edges of a quad. +// Blending must be enable for this to work. +static int swgl_AAEdgeMask = 0; + +static ALWAYS_INLINE int calcAAEdgeMask(bool on) { return on ? 0xF : 0; } +static ALWAYS_INLINE int calcAAEdgeMask(int mask) { return mask; } +static ALWAYS_INLINE int calcAAEdgeMask(bvec4_scalar mask) { + return (mask.x ? 1 : 0) | (mask.y ? 2 : 0) | (mask.z ? 4 : 0) | + (mask.w ? 8 : 0); +} + +#define swgl_antiAlias(edges) \ + do { \ + swgl_AAEdgeMask = calcAAEdgeMask(edges); \ + if (swgl_AAEdgeMask) { \ + swgl_ClipFlags |= SWGL_CLIP_FLAG_AA; \ + } \ + } while (0) + +#define swgl_blendDropShadow(color) \ + do { \ + swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE; \ + swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_DROP_SHADOW); \ + swgl_BlendColorRGBA8 = packColor(color); \ + } while (0) + +#define swgl_blendSubpixelText(color) \ + do { \ + swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE; \ + swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_SUBPIXEL_TEXT); \ + swgl_BlendColorRGBA8 = packColor(color); \ + swgl_BlendAlphaRGBA8 = alphas(swgl_BlendColorRGBA8); \ + } while (0) + +// Dispatch helper used by the GLSL translator to swgl_drawSpan functions. +// The number of pixels committed is tracked by checking for the difference in +// swgl_SpanLength. 
// Any varying interpolants used will be advanced past the committed part of
// the span in case the fragment shader must be executed for any remaining
// pixels that were not committed by the span shader.
//
// Expands to a statement that returns from the enclosing function with the
// number of pixels drawn (the decrease in swgl_SpanLength across the call to
// swgl_drawSpan##format), so it must be used in a function returning int.
#define DISPATCH_DRAW_SPAN(self, format)          \
  do {                                            \
    int total = (self)->swgl_SpanLength;          \
    (self)->swgl_drawSpan##format();              \
    int drawn = total - (self)->swgl_SpanLength;  \
    if (drawn) (self)->step_interp_inputs(drawn); \
    return drawn;                                 \
  } while (0)