/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// When using a solid color with clip masking, the cost of loading the clip
// mask in the blend stage exceeds the cost of processing the color. Here we
// handle the entire span of clip mask texture before the blend stage to more
// efficiently process it and modulate it with color without incurring blend
// stage overheads.
template <typename P, typename C>
static void commit_masked_solid_span(P* buf, C color, int len) {
  override_clip_mask();
  uint8_t* mask = get_clip_mask(buf);
  for (P* end = &buf[len]; buf < end; buf += 4, mask += 4) {
    commit_span(
        buf,
        blend_span(
            buf, applyColor(expand_mask(buf, unpack(unaligned_load(mask))),
                            color)));
  }
  restore_clip_mask();
}

// When using a solid color with anti-aliasing, most of the solid span will
// not benefit from anti-aliasing in the opaque region. We only want to apply
// the AA blend stage in the non-opaque start and end of the span where AA is
// needed.
template <typename P, typename R>
static ALWAYS_INLINE void commit_aa_solid_span(P* buf, R r, int len) {
  if (int start = min((get_aa_opaque_start(buf) + 3) & ~3, len)) {
    commit_solid_span(buf, r, start);
    buf += start;
    len -= start;
  }
  if (int opaque = min((get_aa_opaque_size(buf) + 3) & ~3, len)) {
    override_aa();
    commit_solid_span(buf, r, opaque);
    restore_aa();
    buf += opaque;
    len -= opaque;
  }
  if (len > 0) {
    commit_solid_span(buf, r, len);
  }
}

// Forces a value with vector run-class to have scalar run-class.
template <typename T>
static ALWAYS_INLINE auto swgl_forceScalar(T v) -> decltype(force_scalar(v)) {
  return force_scalar(v);
}

// Advance all varying interpolants by a single chunk.
#define swgl_stepInterp() step_interp_inputs()

// Pseudo-intrinsic that accesses the interpolation step for a given varying.
#define swgl_interpStep(v) (interp_step.v)

// Commit an entire span of a solid color. This dispatches to clip-masked and
// anti-aliased fast-paths as appropriate.
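//
// For illustration, a generated span shader for a solid brush could drain its
// whole span with a single statement (a hypothetical call site; the real call
// sites are emitted by SWGL's GLSL-to-C++ shader translator):
//
//   swgl_commitSolidRGBA8(vec4(0.0f, 0.5f, 1.0f, 1.0f));
//
// This commits swgl_SpanLength pixels at once, advancing swgl_OutRGBA8 and
// leaving swgl_SpanLength at 0.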
#define swgl_commitSolid(format, v, n) \ do { \ int len = (n); \ if (blend_key) { \ if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) { \ commit_masked_solid_span(swgl_Out##format, \ packColor(swgl_Out##format, (v)), len); \ } else if (swgl_ClipFlags & SWGL_CLIP_FLAG_AA) { \ commit_aa_solid_span(swgl_Out##format, \ pack_span(swgl_Out##format, (v)), len); \ } else { \ commit_solid_span(swgl_Out##format, \ pack_span(swgl_Out##format, (v)), len); \ } \ } else { \ commit_solid_span(swgl_Out##format, \ pack_span(swgl_Out##format, (v)), len); \ } \ swgl_Out##format += len; \ swgl_SpanLength -= len; \ } while (0) #define swgl_commitSolidRGBA8(v) swgl_commitSolid(RGBA8, v, swgl_SpanLength) #define swgl_commitSolidR8(v) swgl_commitSolid(R8, v, swgl_SpanLength) #define swgl_commitPartialSolidRGBA8(len, v) \ swgl_commitSolid(RGBA8, v, min(int(len), swgl_SpanLength)) #define swgl_commitPartialSolidR8(len, v) \ swgl_commitSolid(R8, v, min(int(len), swgl_SpanLength)) #define swgl_commitChunk(format, chunk) \ do { \ auto r = chunk; \ if (blend_key) r = blend_span(swgl_Out##format, r); \ commit_span(swgl_Out##format, r); \ swgl_Out##format += swgl_StepSize; \ swgl_SpanLength -= swgl_StepSize; \ } while (0) // Commit a single chunk of a color #define swgl_commitColor(format, color) \ swgl_commitChunk(format, pack_pixels_##format(color)) #define swgl_commitColorRGBA8(color) swgl_commitColor(RGBA8, color) #define swgl_commitColorR8(color) swgl_commitColor(R8, color) template static ALWAYS_INLINE bool swgl_isTextureLinear(S s) { return s->filter == TextureFilter::LINEAR; } template static ALWAYS_INLINE bool swgl_isTextureRGBA8(S s) { return s->format == TextureFormat::RGBA8; } template static ALWAYS_INLINE bool swgl_isTextureR8(S s) { return s->format == TextureFormat::R8; } // Use the default linear quantization scale of 128. This gives 7 bits of // fractional precision, which when multiplied with a signed 9 bit value // still fits in a 16 bit integer. const int swgl_LinearQuantizeScale = 128; // Quantizes UVs for access into a linear texture. template static ALWAYS_INLINE T swgl_linearQuantize(S s, T p) { return linearQuantize(p, swgl_LinearQuantizeScale, s); } // Quantizes an interpolation step for UVs for access into a linear texture. template static ALWAYS_INLINE T swgl_linearQuantizeStep(S s, T p) { return samplerScale(s, p) * swgl_LinearQuantizeScale; } template static ALWAYS_INLINE WideRGBA8 textureLinearUnpacked(UNUSED uint32_t* buf, S sampler, ivec2 i) { return textureLinearUnpackedRGBA8(sampler, i); } template static ALWAYS_INLINE WideR8 textureLinearUnpacked(UNUSED uint8_t* buf, S sampler, ivec2 i) { return textureLinearUnpackedR8(sampler, i); } template static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint32_t* buf) { return swgl_isTextureRGBA8(s); } template static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint8_t* buf) { return swgl_isTextureR8(s); } // Quantizes the UVs to the 2^7 scale needed for calculating fractional offsets // for linear sampling. #define LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv) \ uv = swgl_linearQuantize(sampler, uv); \ vec2_scalar uv_step = \ float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; \ vec2_scalar min_uv = max( \ swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f); \ vec2_scalar max_uv = \ max(swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), \ min_uv); // Implements the fallback linear filter that can deal with clamping and // arbitrary scales. 
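// As a worked example of the 2^7 quantization used by these filters: a UV
// coordinate of 3.5 texels quantizes to 3.5 * 128 == 448, so the integer
// texel index is 448 >> 7 == 3, and the low 7 bits (64, i.e. 0.5 in fixed
// point) supply the interpolation fraction between texels 3 and 4.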
template <typename S, typename C, typename P>
static P* blendTextureLinearFallback(S sampler, vec2 uv, int span,
                                     vec2_scalar uv_step, vec2_scalar min_uv,
                                     vec2_scalar max_uv, C color, P* buf) {
  for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
    commit_blend_span(
        buf,
        applyColor(textureLinearUnpacked(buf, sampler,
                                         ivec2(clamp(uv, min_uv, max_uv))),
                   color));
  }
  return buf;
}

static ALWAYS_INLINE U64 castForShuffle(V16<int16_t> r) {
  return bit_cast<U64>(r);
}
static ALWAYS_INLINE U16 castForShuffle(V4<int16_t> r) {
  return bit_cast<U16>(r);
}

static ALWAYS_INLINE V16<int16_t> applyFracX(V16<int16_t> r, I16 fracx) {
  return r * fracx.xxxxyyyyzzzzwwww;
}
static ALWAYS_INLINE V4<int16_t> applyFracX(V4<int16_t> r, I16 fracx) {
  return r * fracx;
}

// Implements a faster linear filter that works with axis-aligned constant Y
// but scales less than 1, i.e. upscaling. In this case we can optimize for
// the constant Y fraction as well as load all chunks from memory in a single
// tap for each row.
template <typename S, typename C, typename P>
static void blendTextureLinearUpscale(S sampler, vec2 uv, int span,
                                      vec2_scalar uv_step, vec2_scalar min_uv,
                                      vec2_scalar max_uv, C color, P* buf) {
  typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
  typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
  typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type;

  ivec2 i(clamp(uv, min_uv, max_uv));
  ivec2 frac = i;
  i >>= 7;
  P* row0 = (P*)sampler->buf + computeRow(sampler, ivec2_scalar(0, i.y.x));
  P* row1 = row0 + computeNextRowOffset(sampler, ivec2_scalar(0, i.y.x));
  I16 fracx = computeFracX(sampler, i, frac);
  int16_t fracy = computeFracY(frac).x;
  auto src0 =
      CONVERT(unaligned_load<packed_type>(&row0[i.x.x]), signed_unpacked_type);
  auto src1 =
      CONVERT(unaligned_load<packed_type>(&row1[i.x.x]), signed_unpacked_type);
  auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7));

  // We attempt to sample ahead by one chunk and interpolate it with the
  // current one. However, due to the complication of upscaling, we may not
  // necessarily shift in all the next set of samples.
  for (P* end = buf + span; buf < end; buf += 4) {
    uv.x += uv_step.x;
    I32 ixn = cast(uv.x);
    I16 fracn = computeFracNoClamp(ixn);
    ixn >>= 7;
    auto src0n = CONVERT(unaligned_load<packed_type>(&row0[ixn.x]),
                         signed_unpacked_type);
    auto src1n = CONVERT(unaligned_load<packed_type>(&row1[ixn.x]),
                         signed_unpacked_type);
    auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7));

    // Since we're upscaling, we know that a source pixel has a larger
    // footprint than the destination pixel, and thus all the source pixels
    // needed for this chunk will fall within a single chunk of texture data.
    // However, since the source pixels don't map 1:1 with destination pixels,
    // we need to shift the source pixels over based on their offset from the
    // start of the chunk. This could conceivably be optimized better with
    // usage of PSHUFB or VTBL instructions. However, since PSHUFB requires
    // SSSE3, we instead resort to masking in the correct pixels to avoid
    // having to index into memory. For the last sample to interpolate with,
    // we need to potentially shift in a sample from the next chunk over in
    // the case the samples fill out an entire chunk.
    auto shuf = src;
    auto shufn = SHUFFLE(src, ixn.x == i.x.w ? srcn.yyyy : srcn, 1, 2, 3, 4);
    if (i.x.y == i.x.x) {
      shuf = shuf.xxyz;
      shufn = shufn.xxyz;
    }
    if (i.x.z == i.x.y) {
      shuf = shuf.xyyz;
      shufn = shufn.xyyz;
    }
    if (i.x.w == i.x.z) {
      shuf = shuf.xyzz;
      shufn = shufn.xyzz;
    }

    // Convert back to a signed unpacked type so that we can interpolate the
    // final result.
auto interp = bit_cast(shuf); auto interpn = bit_cast(shufn); interp += applyFracX(interpn - interp, fracx) >> 7; commit_blend_span( buf, applyColor(bit_cast(interp), color)); i.x = ixn; fracx = fracn; src = srcn; } } // This is the fastest variant of the linear filter that still provides // filtering. In cases where there is no scaling required, but we have a // subpixel offset that forces us to blend in neighboring pixels, we can // optimize away most of the memory loads and shuffling that is required by the // fallback filter. template static void blendTextureLinearFast(S sampler, vec2 uv, int span, vec2_scalar min_uv, vec2_scalar max_uv, C color, P* buf) { typedef VectorType packed_type; typedef VectorType unpacked_type; typedef VectorType signed_unpacked_type; ivec2 i(clamp(uv, min_uv, max_uv)); ivec2 frac = i; i >>= 7; P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i)); P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i)); int16_t fracx = computeFracX(sampler, i, frac).x; int16_t fracy = computeFracY(frac).x; auto src0 = CONVERT(unaligned_load(row0), signed_unpacked_type); auto src1 = CONVERT(unaligned_load(row1), signed_unpacked_type); auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7)); // Since there is no scaling, we sample ahead by one chunk and interpolate it // with the current one. We can then reuse this value on the next iteration. for (P* end = buf + span; buf < end; buf += 4) { row0 += 4; row1 += 4; auto src0n = CONVERT(unaligned_load(row0), signed_unpacked_type); auto src1n = CONVERT(unaligned_load(row1), signed_unpacked_type); auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7)); // For the last sample to interpolate with, we need to potentially shift in // a sample from the next chunk over since the samples fill out an entire // chunk. auto interp = bit_cast(src); auto interpn = bit_cast(SHUFFLE(src, srcn, 1, 2, 3, 4)); interp += ((interpn - interp) * fracx) >> 7; commit_blend_span( buf, applyColor(bit_cast(interp), color)); src = srcn; } } // Implements a faster linear filter that works with axis-aligned constant Y but // downscaling the texture by half. In this case we can optimize for the // constant X/Y fractions and reduction factor while minimizing shuffling. 
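// In the 2x case, destination pixel k of a chunk samples source texels 2k and
// 2k+1, which is why the loop below loads two adjacent source chunks per
// output chunk and splits them into their even and odd lanes with SHUFFLE
// before interpolating by the constant X fraction.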
template static NO_INLINE void blendTextureLinearDownscale(S sampler, vec2 uv, int span, vec2_scalar min_uv, vec2_scalar max_uv, C color, P* buf) { typedef VectorType packed_type; typedef VectorType unpacked_type; typedef VectorType signed_unpacked_type; ivec2 i(clamp(uv, min_uv, max_uv)); ivec2 frac = i; i >>= 7; P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i)); P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i)); int16_t fracx = computeFracX(sampler, i, frac).x; int16_t fracy = computeFracY(frac).x; for (P* end = buf + span; buf < end; buf += 4) { auto src0 = CONVERT(unaligned_load(row0), signed_unpacked_type); auto src1 = CONVERT(unaligned_load(row1), signed_unpacked_type); auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7)); row0 += 4; row1 += 4; auto src0n = CONVERT(unaligned_load(row0), signed_unpacked_type); auto src1n = CONVERT(unaligned_load(row1), signed_unpacked_type); auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7)); row0 += 4; row1 += 4; auto interp = bit_cast(SHUFFLE(src, srcn, 0, 2, 4, 6)); auto interpn = bit_cast(SHUFFLE(src, srcn, 1, 3, 5, 7)); interp += ((interpn - interp) * fracx) >> 7; commit_blend_span( buf, applyColor(bit_cast(interp), color)); } } enum LinearFilter { // No linear filter is needed. LINEAR_FILTER_NEAREST = 0, // The most general linear filter that handles clamping and varying scales. LINEAR_FILTER_FALLBACK, // A linear filter optimized for axis-aligned upscaling. LINEAR_FILTER_UPSCALE, // A linear filter with no scaling but with subpixel offset. LINEAR_FILTER_FAST, // A linear filter optimized for 2x axis-aligned downscaling. LINEAR_FILTER_DOWNSCALE }; // Dispatches to an appropriate linear filter depending on the selected filter. template static P* blendTextureLinearDispatch(S sampler, vec2 uv, int span, vec2_scalar uv_step, vec2_scalar min_uv, vec2_scalar max_uv, C color, P* buf, LinearFilter filter) { P* end = buf + span; if (filter != LINEAR_FILTER_FALLBACK) { // If we're not using the fallback, then Y is constant across the entire // row. We just need to ensure that we handle any samples that might pull // data from before the start of the row and require clamping. float beforeDist = max(0.0f, min_uv.x) - uv.x.x; if (beforeDist > 0) { int before = clamp(int(ceil(beforeDist / uv_step.x)) * swgl_StepSize, 0, int(end - buf)); buf = blendTextureLinearFallback(sampler, uv, before, uv_step, min_uv, max_uv, color, buf); uv.x += (before / swgl_StepSize) * uv_step.x; } // We need to check how many samples we can take from inside the row without // requiring clamping. In case the filter oversamples the row by a step, we // subtract off a step from the width to leave some room. 
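// The code below therefore splits the span into up to three phases: a
// clamped prologue handled by the fallback filter, an unclamped interior
// handled by one of the specialized filters, and a clamped epilogue handled
// by the fallback filter again.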
  float insideDist =
      min(max_uv.x, float((int(sampler->width) - swgl_StepSize) *
                          swgl_LinearQuantizeScale)) -
      uv.x.x;
  if (uv_step.x > 0.0f && insideDist >= uv_step.x) {
    int32_t inside = int(end - buf);
    if (filter == LINEAR_FILTER_DOWNSCALE) {
      inside = min(int(insideDist * (0.5f / swgl_LinearQuantizeScale)) &
                       ~(swgl_StepSize - 1),
                   inside);
      if (inside > 0) {
        blendTextureLinearDownscale(sampler, uv, inside, min_uv, max_uv, color,
                                    buf);
        buf += inside;
        uv.x += (inside / swgl_StepSize) * uv_step.x;
      }
    } else if (filter == LINEAR_FILTER_UPSCALE) {
      inside = min(int(insideDist / uv_step.x) * swgl_StepSize, inside);
      if (inside > 0) {
        blendTextureLinearUpscale(sampler, uv, inside, uv_step, min_uv, max_uv,
                                  color, buf);
        buf += inside;
        uv.x += (inside / swgl_StepSize) * uv_step.x;
      }
    } else {
      inside = min(int(insideDist * (1.0f / swgl_LinearQuantizeScale)) &
                       ~(swgl_StepSize - 1),
                   inside);
      if (inside > 0) {
        blendTextureLinearFast(sampler, uv, inside, min_uv, max_uv, color,
                               buf);
        buf += inside;
        uv.x += (inside / swgl_StepSize) * uv_step.x;
      }
    }
  }
}
// If the fallback filter was requested, or if there are any samples left that
// may be outside the row and require clamping, then handle that here.
if (buf < end) {
  buf = blendTextureLinearFallback(sampler, uv, int(end - buf), uv_step,
                                   min_uv, max_uv, color, buf);
}
return buf;
}

// Helper function to quantize UVs for linear filtering before dispatch.
template <typename S, typename C, typename P>
static inline int blendTextureLinear(S sampler, vec2 uv, int span,
                                     const vec4_scalar& uv_rect, C color,
                                     P* buf, LinearFilter filter) {
  if (!matchTextureFormat(sampler, buf)) {
    return 0;
  }
  LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv);
  blendTextureLinearDispatch(sampler, uv, span, uv_step, min_uv, max_uv, color,
                             buf, filter);
  return span;
}

// Samples an axis-aligned span on a single row of a texture using 1:1 nearest
// filtering. Sampling is constrained to only fall within the given UV bounds.
// This requires a pointer to the destination buffer. An optional color
// modulus can be supplied.
template <typename S, typename C, typename P>
static int blendTextureNearestFast(S sampler, vec2 uv, int span,
                                   const vec4_scalar& uv_rect, C color,
                                   P* buf) {
  if (!matchTextureFormat(sampler, buf)) {
    return 0;
  }
  typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
  ivec2_scalar i = make_ivec2(samplerScale(sampler, force_scalar(uv)));
  ivec2_scalar minUV =
      make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y}));
  ivec2_scalar maxUV =
      make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w}));
  // Calculate the row pointer within the buffer, clamping to within valid row
  // bounds.
  P* row =
      &((P*)sampler
            ->buf)[clampCoord(clamp(i.y, minUV.y, maxUV.y), sampler->height) *
                   sampler->stride];
  // Find clamped X bounds within the row.
  int minX = clamp(minUV.x, 0, sampler->width - 1);
  int maxX = clamp(maxUV.x, minX, sampler->width - 1);
  int curX = i.x;
  int endX = i.x + span;
  // If we need to start sampling below the valid sample bounds, then we need
  // to fill this section with a constant clamped sample.
  if (curX < minX) {
    int n = min(minX, endX) - curX;
    auto src =
        applyColor(unpack(bit_cast<packed_type>(V4<P>(row[minX]))), color);
    commit_solid_span(buf, src, n);
    buf += n;
    curX += n;
  }
  // Here we only deal with valid samples within the sample bounds. No
  // clamping should occur here within these inner loops.
  int n = max(min(maxX + 1, endX) - curX, 0);
  // Try to process as many chunks as possible with full loads and stores.
  for (int end = curX + (n & ~3); curX < end; curX += 4, buf += 4) {
    auto src = applyColor(unaligned_load<packed_type>(&row[curX]), color);
    commit_blend_span(buf, src);
  }
  n &= 3;
  // If we have any leftover samples after processing chunks, use partial
  // loads and stores.
  if (n > 0) {
    auto src = applyColor(partial_load_span<packed_type>(&row[curX], n), color);
    commit_blend_span(buf, src, n);
    buf += n;
    curX += n;
  }
  // If we still have samples left above the valid sample bounds, then we
  // again need to fill this section with a constant clamped sample.
  if (curX < endX) {
    auto src = applyColor(unpack(bit_cast<packed_type>(V4<P>
(row[maxX]))), color); commit_solid_span(buf, src, endX - curX); } return span; } // We need to verify that the pixel step reasonably approximates stepping by a // single texel for every pixel we need to reproduce. Try to ensure that the // margin of error is no more than approximately 2^-7. Also, we check here if // the scaling can be quantized for acceleration. template static ALWAYS_INLINE int spanNeedsScale(int span, T P) { span &= ~(128 - 1); span += 128; int scaled = round((P.x.y - P.x.x) * span); return scaled != span ? (scaled == span * 2 ? 2 : 1) : 0; } // Helper function to decide whether we can safely apply 1:1 nearest filtering // without diverging too much from the linear filter. template static inline LinearFilter needsTextureLinear(S sampler, T P, int span) { // If each row is not wide enough for linear filtering, then just use nearest // filtering. if (sampler->width < 2) { return LINEAR_FILTER_NEAREST; } // First verify if the row Y doesn't change across samples if (P.y.x != P.y.y) { return LINEAR_FILTER_FALLBACK; } P = samplerScale(sampler, P); if (int scale = spanNeedsScale(span, P)) { // If the source region is not flipped and smaller than the destination, // then we can use the upscaling filter since row Y is constant. return P.x.x < P.x.y && P.x.y - P.x.x <= 1 ? LINEAR_FILTER_UPSCALE : (scale == 2 ? LINEAR_FILTER_DOWNSCALE : LINEAR_FILTER_FALLBACK); } // Also verify that we're reasonably close to the center of a texel // so that it doesn't look that much different than if a linear filter // was used. if ((int(P.x.x * 4.0f + 0.5f) & 3) != 2 || (int(P.y.x * 4.0f + 0.5f) & 3) != 2) { // The source and destination regions are the same, but there is a // significant subpixel offset. We can use a faster linear filter to deal // with the offset in this case. return LINEAR_FILTER_FAST; } // Otherwise, we have a constant 1:1 step and we're stepping reasonably close // to the center of each pixel, so it's safe to disable the linear filter and // use nearest. 
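// As a worked example: at a 1:1 scale, a sample lying exactly on a texel
// center has a fractional coordinate of 0.5, so int(P.x.x * 4.0f + 0.5f) is
// 4*k + 2 for texel k and the low two bits equal 2. A subpixel offset of an
// eighth of a texel or more perturbs that value, failing the test above and
// selecting the fast linear filter instead.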
return LINEAR_FILTER_NEAREST; } // Commit an entire span with linear filtering #define swgl_commitTextureLinear(format, s, p, uv_rect, color, n) \ do { \ auto packed_color = packColor(swgl_Out##format, color); \ int len = (n); \ int drawn = 0; \ if (LinearFilter filter = needsTextureLinear(s, p, len)) { \ if (blend_key) { \ drawn = blendTextureLinear(s, p, len, uv_rect, packed_color, \ swgl_Out##format, filter); \ } else { \ drawn = blendTextureLinear(s, p, len, uv_rect, packed_color, \ swgl_Out##format, filter); \ } \ } else if (blend_key) { \ drawn = blendTextureNearestFast(s, p, len, uv_rect, packed_color, \ swgl_Out##format); \ } else { \ drawn = blendTextureNearestFast(s, p, len, uv_rect, packed_color, \ swgl_Out##format); \ } \ swgl_Out##format += drawn; \ swgl_SpanLength -= drawn; \ } while (0) #define swgl_commitTextureLinearRGBA8(s, p, uv_rect) \ swgl_commitTextureLinear(RGBA8, s, p, uv_rect, NoColor(), swgl_SpanLength) #define swgl_commitTextureLinearR8(s, p, uv_rect) \ swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(), swgl_SpanLength) // Commit a partial span with linear filtering, optionally inverting the color #define swgl_commitPartialTextureLinearR8(len, s, p, uv_rect) \ swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(), \ min(int(len), swgl_SpanLength)) #define swgl_commitPartialTextureLinearInvertR8(len, s, p, uv_rect) \ swgl_commitTextureLinear(R8, s, p, uv_rect, InvertColor(), \ min(int(len), swgl_SpanLength)) // Commit an entire span with linear filtering that is scaled by a color #define swgl_commitTextureLinearColorRGBA8(s, p, uv_rect, color) \ swgl_commitTextureLinear(RGBA8, s, p, uv_rect, color, swgl_SpanLength) #define swgl_commitTextureLinearColorR8(s, p, uv_rect, color) \ swgl_commitTextureLinear(R8, s, p, uv_rect, color, swgl_SpanLength) // Helper function that samples from an R8 texture while expanding it to support // a differing framebuffer format. template static inline int blendTextureLinearR8(S sampler, vec2 uv, int span, const vec4_scalar& uv_rect, C color, P* buf) { if (!swgl_isTextureR8(sampler) || sampler->width < 2) { return 0; } LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv); for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { commit_blend_span( buf, applyColor(expand_mask(buf, textureLinearUnpackedR8( sampler, ivec2(clamp(uv, min_uv, max_uv)))), color)); } return span; } // Commit an entire span with linear filtering while expanding from R8 to RGBA8 #define swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, color) \ do { \ auto packed_color = packColor(swgl_OutRGBA8, color); \ int drawn = 0; \ if (blend_key) { \ drawn = blendTextureLinearR8(s, p, swgl_SpanLength, uv_rect, \ packed_color, swgl_OutRGBA8); \ } else { \ drawn = blendTextureLinearR8(s, p, swgl_SpanLength, uv_rect, \ packed_color, swgl_OutRGBA8); \ } \ swgl_OutRGBA8 += drawn; \ swgl_SpanLength -= drawn; \ } while (0) #define swgl_commitTextureLinearR8ToRGBA8(s, p, uv_rect) \ swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, NoColor()) // Compute repeating UVs, possibly constrained by tile repeat limits static inline vec2 tileRepeatUV(vec2 uv, const vec2_scalar& tile_repeat) { if (tile_repeat.x > 0.0f) { // Clamp to a number slightly less than the tile repeat limit so that // it results in a number close to but not equal to 1 after fract(). // This avoids fract() yielding 0 if the limit was left as whole integer. 
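// For example, with tile_repeat.x == 4.0f, a uv.x of exactly 4.0f would
// otherwise fract() to 0.0 at the very end of the tiled range; clamping to
// 4.0f - 1.0e-6f instead yields a value just under 1.0 after fract().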
uv = clamp(uv, vec2_scalar(0.0f), tile_repeat - 1.0e-6f); } return fract(uv); } // Compute the number of non-repeating steps before we need to potentially // repeat the UVs. static inline int computeNoRepeatSteps(Float uv, float uv_step, float tile_repeat, int steps) { if (uv.w < uv.x) { // Ensure the UV taps are ordered low to high. uv = uv.wzyx; } // Check if the samples cross the boundary of the next whole integer or the // tile repeat limit, whichever is lower. float limit = floor(uv.x) + 1.0f; if (tile_repeat > 0.0f) { limit = min(limit, tile_repeat); } return uv.x >= 0.0f && uv.w < limit ? (uv_step != 0.0f ? int(clamp((limit - uv.x) / uv_step, 0.0f, float(steps))) : steps) : 0; } // Blends an entire span of texture with linear filtering and repeating UVs. template static int blendTextureLinearRepeat(S sampler, vec2 uv, int span, const vec2_scalar& tile_repeat, const vec4_scalar& uv_repeat, const vec4_scalar& uv_rect, C color, P* buf) { if (!matchTextureFormat(sampler, buf)) { return 0; } vec2_scalar uv_scale = {uv_repeat.z - uv_repeat.x, uv_repeat.w - uv_repeat.y}; vec2_scalar uv_offset = {uv_repeat.x, uv_repeat.y}; // Choose a linear filter to use for no-repeat sub-spans LinearFilter filter = needsTextureLinear(sampler, uv * uv_scale + uv_offset, span); // We need to step UVs unscaled and unquantized so that we can modulo them // with fract. We use uv_scale and uv_offset to map them into the correct // range. vec2_scalar uv_step = float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; uv_scale = swgl_linearQuantizeStep(sampler, uv_scale); uv_offset = swgl_linearQuantize(sampler, uv_offset); vec2_scalar min_uv = max( swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f); vec2_scalar max_uv = max( swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), min_uv); for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { int steps = int(end - buf) / swgl_StepSize; // Find the sub-span before UVs repeat to avoid expensive repeat math steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps); if (steps > 0) { steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps); if (steps > 0) { buf = blendTextureLinearDispatch( sampler, fract(uv) * uv_scale + uv_offset, steps * swgl_StepSize, uv_step * uv_scale, min_uv, max_uv, color, buf, filter); if (buf >= end) { break; } uv += steps * uv_step; } } // UVs might repeat within this step, so explicitly compute repeated UVs vec2 repeated_uv = clamp( tileRepeatUV(uv, tile_repeat) * uv_scale + uv_offset, min_uv, max_uv); commit_blend_span( buf, applyColor(textureLinearUnpacked(buf, sampler, ivec2(repeated_uv)), color)); } return span; } // Commit an entire span with linear filtering and repeating UVs #define swgl_commitTextureLinearRepeat(format, s, p, tile_repeat, uv_repeat, \ uv_rect, color) \ do { \ auto packed_color = packColor(swgl_Out##format, color); \ int drawn = 0; \ if (blend_key) { \ drawn = blendTextureLinearRepeat(s, p, swgl_SpanLength, \ tile_repeat, uv_repeat, uv_rect, \ packed_color, swgl_Out##format); \ } else { \ drawn = blendTextureLinearRepeat(s, p, swgl_SpanLength, \ tile_repeat, uv_repeat, uv_rect, \ packed_color, swgl_Out##format); \ } \ swgl_Out##format += drawn; \ swgl_SpanLength -= drawn; \ } while (0) #define swgl_commitTextureLinearRepeatRGBA8(s, p, tile_repeat, uv_repeat, \ uv_rect) \ swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \ NoColor()) #define swgl_commitTextureLinearRepeatColorRGBA8(s, p, tile_repeat, 
uv_repeat, \ uv_rect, color) \ swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \ color) template static ALWAYS_INLINE PackedRGBA8 textureNearestPacked(UNUSED uint32_t* buf, S sampler, ivec2 i) { return textureNearestPackedRGBA8(sampler, i); } // Blends an entire span of texture with nearest filtering and either // repeated or clamped UVs. template static int blendTextureNearestRepeat(S sampler, vec2 uv, int span, const vec2_scalar& tile_repeat, const vec4_scalar& uv_rect, C color, P* buf) { if (!matchTextureFormat(sampler, buf)) { return 0; } if (!REPEAT) { // If clamping, then we step pre-scaled to the sampler. For repeat modes, // this will be accomplished via uv_scale instead. uv = samplerScale(sampler, uv); } vec2_scalar uv_step = float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; vec2_scalar min_uv = samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y}); vec2_scalar max_uv = samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w}); vec2_scalar uv_scale = max_uv - min_uv; // If the effective sampling area of this texture is only a single pixel, then // treat it as a solid span. For repeat modes, the bounds are specified on // pixel boundaries, whereas for clamp modes, bounds are on pixel centers, so // the test varies depending on which. If the sample range on an axis is // greater than one pixel, we can still check if we don't move far enough from // the pixel center on that axis to hit the next pixel. if ((int(min_uv.x) + (REPEAT ? 1 : 0) >= int(max_uv.x) || (abs(uv_step.x) * span * (REPEAT ? uv_scale.x : 1.0f) < 0.5f)) && (int(min_uv.y) + (REPEAT ? 1 : 0) >= int(max_uv.y) || (abs(uv_step.y) * span * (REPEAT ? uv_scale.y : 1.0f) < 0.5f))) { vec2 repeated_uv = REPEAT ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv : clamp(uv, min_uv, max_uv); commit_solid_span(buf, applyColor(unpack(textureNearestPacked( buf, sampler, ivec2(repeated_uv))), color), span); } else { for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { if (REPEAT) { int steps = int(end - buf) / swgl_StepSize; // Find the sub-span before UVs repeat to avoid expensive repeat math steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps); if (steps > 0) { steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps); if (steps > 0) { vec2 inside_uv = fract(uv) * uv_scale + min_uv; vec2 inside_step = uv_step * uv_scale; for (P* outside = &buf[steps * swgl_StepSize]; buf < outside; buf += swgl_StepSize, inside_uv += inside_step) { commit_blend_span( buf, applyColor( textureNearestPacked(buf, sampler, ivec2(inside_uv)), color)); } if (buf >= end) { break; } uv += steps * uv_step; } } } // UVs might repeat within this step, so explicitly compute repeated UVs vec2 repeated_uv = REPEAT ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv : clamp(uv, min_uv, max_uv); commit_blend_span( buf, applyColor(textureNearestPacked(buf, sampler, ivec2(repeated_uv)), color)); } } return span; } // Determine if we can use the fast nearest filter for the given nearest mode. // If the Y coordinate varies more than half a pixel over // the span (which might cause the texel to alias to the next one), or the span // needs X scaling, then we have to use the fallback. 
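// For example, a Y step of just 0.002 texels per pixel drifts 0.6 texels
// over a 300 pixel span, exceeding the half-texel limit checked below, so
// such a span must take the fallback.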
template static ALWAYS_INLINE bool needsNearestFallback(S sampler, T P, int span) { P = samplerScale(sampler, P); return (P.y.y - P.y.x) * span >= 0.5f || spanNeedsScale(span, P); } // Commit an entire span with nearest filtering and either clamped or repeating // UVs #define swgl_commitTextureNearest(format, s, p, uv_rect, color) \ do { \ auto packed_color = packColor(swgl_Out##format, color); \ int drawn = 0; \ if (needsNearestFallback(s, p, swgl_SpanLength)) { \ if (blend_key) { \ drawn = blendTextureNearestRepeat( \ s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \ swgl_Out##format); \ } else { \ drawn = blendTextureNearestRepeat( \ s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \ swgl_Out##format); \ } \ } else if (blend_key) { \ drawn = blendTextureNearestFast(s, p, swgl_SpanLength, uv_rect, \ packed_color, swgl_Out##format); \ } else { \ drawn = blendTextureNearestFast(s, p, swgl_SpanLength, uv_rect, \ packed_color, swgl_Out##format); \ } \ swgl_Out##format += drawn; \ swgl_SpanLength -= drawn; \ } while (0) #define swgl_commitTextureNearestRGBA8(s, p, uv_rect) \ swgl_commitTextureNearest(RGBA8, s, p, uv_rect, NoColor()) #define swgl_commitTextureNearestColorRGBA8(s, p, uv_rect, color) \ swgl_commitTextureNearest(RGBA8, s, p, uv_rect, color) #define swgl_commitTextureNearestRepeat(format, s, p, tile_repeat, uv_rect, \ color) \ do { \ auto packed_color = packColor(swgl_Out##format, color); \ int drawn = 0; \ if (blend_key) { \ drawn = blendTextureNearestRepeat( \ s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color, \ swgl_Out##format); \ } else { \ drawn = blendTextureNearestRepeat( \ s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color, \ swgl_Out##format); \ } \ swgl_Out##format += drawn; \ swgl_SpanLength -= drawn; \ } while (0) #define swgl_commitTextureNearestRepeatRGBA8(s, p, tile_repeat, uv_repeat, \ uv_rect) \ swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, \ NoColor()) #define swgl_commitTextureNearestRepeatColorRGBA8(s, p, tile_repeat, \ uv_repeat, uv_rect, color) \ swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, color) // Commit an entire span of texture with filtering determined by sampler state. #define swgl_commitTexture(format, s, ...) \ do { \ if (s->filter == TextureFilter::LINEAR) { \ swgl_commitTextureLinear##format(s, __VA_ARGS__); \ } else { \ swgl_commitTextureNearest##format(s, __VA_ARGS__); \ } \ } while (0) #define swgl_commitTextureRGBA8(...) swgl_commitTexture(RGBA8, __VA_ARGS__) #define swgl_commitTextureColorRGBA8(...) \ swgl_commitTexture(ColorRGBA8, __VA_ARGS__) #define swgl_commitTextureRepeatRGBA8(...) \ swgl_commitTexture(RepeatRGBA8, __VA_ARGS__) #define swgl_commitTextureRepeatColorRGBA8(...) \ swgl_commitTexture(RepeatColorRGBA8, __VA_ARGS__) // Commit an entire span of a separable pass of a Gaussian blur that falls // within the given radius scaled by supplied coefficients, clamped to uv_rect // bounds. 
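// A full blur is typically realized as two such separable passes. As an
// illustrative sketch of usage (the actual pass setup is driven by the blur
// shaders that invoke this intrinsic), a caller might first run
//
//   swgl_commitGaussianBlurRGBA8(s, p, uv_rect, true, radius, coeffs);
//
// across the rows of an intermediate target, and then apply the intrinsic
// again with hori == false over that intermediate result.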
template static int blendGaussianBlur(S sampler, vec2 uv, const vec4_scalar& uv_rect, P* buf, int span, bool hori, int radius, vec2_scalar coeffs) { if (!matchTextureFormat(sampler, buf)) { return 0; } vec2_scalar size = {float(sampler->width), float(sampler->height)}; ivec2_scalar curUV = make_ivec2(force_scalar(uv) * size); ivec4_scalar bounds = make_ivec4(uv_rect * make_vec4(size, size)); int startX = curUV.x; int endX = min(min(bounds.z, curUV.x + span), int(size.x)); if (hori) { for (; curUV.x + swgl_StepSize <= endX; buf += swgl_StepSize, curUV.x += swgl_StepSize) { commit_blend_span( buf, gaussianBlurHorizontal
<P>(sampler, curUV, bounds.x, bounds.z,
                                         radius, coeffs.x, coeffs.y));
    }
  } else {
    for (; curUV.x + swgl_StepSize <= endX;
         buf += swgl_StepSize, curUV.x += swgl_StepSize) {
      commit_blend_span(
          buf, gaussianBlurVertical<P>
(sampler, curUV, bounds.y, bounds.w, radius, coeffs.x, coeffs.y)); } } return curUV.x - startX; } #define swgl_commitGaussianBlur(format, s, p, uv_rect, hori, radius, coeffs) \ do { \ int drawn = 0; \ if (blend_key) { \ drawn = blendGaussianBlur(s, p, uv_rect, swgl_Out##format, \ swgl_SpanLength, hori, radius, coeffs); \ } else { \ drawn = blendGaussianBlur(s, p, uv_rect, swgl_Out##format, \ swgl_SpanLength, hori, radius, coeffs); \ } \ swgl_Out##format += drawn; \ swgl_SpanLength -= drawn; \ } while (0) #define swgl_commitGaussianBlurRGBA8(s, p, uv_rect, hori, radius, coeffs) \ swgl_commitGaussianBlur(RGBA8, s, p, uv_rect, hori, radius, coeffs) #define swgl_commitGaussianBlurR8(s, p, uv_rect, hori, radius, coeffs) \ swgl_commitGaussianBlur(R8, s, p, uv_rect, hori, radius, coeffs) // Convert and pack planar YUV samples to RGB output using a color space static ALWAYS_INLINE PackedRGBA8 convertYUV(const YUVMatrix& rgb_from_ycbcr, U16 y, U16 u, U16 v) { auto yy = V8(zip(y, y)); auto uv = V8(zip(u, v)); return rgb_from_ycbcr.convert(yy, uv); } // Helper functions to sample from planar YUV textures before converting to RGB template static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, const YUVMatrix& rgb_from_ycbcr, UNUSED int rescaleFactor) { switch (sampler0->format) { case TextureFormat::RGBA8: { auto planar = textureLinearPlanarRGBA8(sampler0, uv0); return convertYUV(rgb_from_ycbcr, highHalf(planar.rg), lowHalf(planar.rg), lowHalf(planar.ba)); } case TextureFormat::YUV422: { auto planar = textureLinearPlanarYUV422(sampler0, uv0); return convertYUV(rgb_from_ycbcr, planar.y, planar.u, planar.v); } default: assert(false); return PackedRGBA8(0); } } template static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0, const vec4_scalar& uv_rect0, const vec3_scalar& ycbcr_bias, const mat3_scalar& rgb_from_debiased_ycbcr, int rescaleFactor, C color = C()) { if (!swgl_isTextureLinear(sampler0)) { return 0; } LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); const auto rgb_from_ycbcr = YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor); auto c = packColor(buf, color); auto* end = buf + span; for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0) { commit_blend_span( buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)), rgb_from_ycbcr, rescaleFactor), c)); } return span; } template static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1, ivec2 uv1, const YUVMatrix& rgb_from_ycbcr, int rescaleFactor) { switch (sampler1->format) { case TextureFormat::RG8: { assert(sampler0->format == TextureFormat::R8); auto y = textureLinearUnpackedR8(sampler0, uv0); auto planar = textureLinearPlanarRG8(sampler1, uv1); return convertYUV(rgb_from_ycbcr, y, lowHalf(planar.rg), highHalf(planar.rg)); } case TextureFormat::RGBA8: { assert(sampler0->format == TextureFormat::R8); auto y = textureLinearUnpackedR8(sampler0, uv0); auto planar = textureLinearPlanarRGBA8(sampler1, uv1); return convertYUV(rgb_from_ycbcr, y, lowHalf(planar.ba), highHalf(planar.rg)); } case TextureFormat::RG16: { assert(sampler0->format == TextureFormat::R16); // The rescaling factor represents how many bits to add to renormalize the // texture to 16 bits, and so the color depth is actually 16 minus the // rescaling factor. // Need to right shift the sample by the amount of bits over 8 it // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit // of precision at the low end already, hence 1 is subtracted from the // color depth. 
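// For example, 10-bit video in an R16 texture uses rescaleFactor == 6,
// giving colorDepth == 10 and rescaleBits == (10 - 1) - 8 == 1 below, so
// each unpacked sample is shifted right by one bit.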
int colorDepth = 16 - rescaleFactor; int rescaleBits = (colorDepth - 1) - 8; auto y = textureLinearUnpackedR16(sampler0, uv0) >> rescaleBits; auto uv = textureLinearUnpackedRG16(sampler1, uv1) >> rescaleBits; return rgb_from_ycbcr.convert(zip(y, y), uv); } default: assert(false); return PackedRGBA8(0); } } template static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0, const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1, const vec4_scalar& uv_rect1, const vec3_scalar& ycbcr_bias, const mat3_scalar& rgb_from_debiased_ycbcr, int rescaleFactor, C color = C()) { if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1)) { return 0; } LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1); const auto rgb_from_ycbcr = YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor); auto c = packColor(buf, color); auto* end = buf + span; for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0, uv1 += uv_step1) { commit_blend_span( buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)), sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)), rgb_from_ycbcr, rescaleFactor), c)); } return span; } template static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1, ivec2 uv1, S2 sampler2, ivec2 uv2, const YUVMatrix& rgb_from_ycbcr, int rescaleFactor) { assert(sampler0->format == sampler1->format && sampler0->format == sampler2->format); switch (sampler0->format) { case TextureFormat::R8: { auto y = textureLinearUnpackedR8(sampler0, uv0); auto u = textureLinearUnpackedR8(sampler1, uv1); auto v = textureLinearUnpackedR8(sampler2, uv2); return convertYUV(rgb_from_ycbcr, y, u, v); } case TextureFormat::R16: { // The rescaling factor represents how many bits to add to renormalize the // texture to 16 bits, and so the color depth is actually 16 minus the // rescaling factor. // Need to right shift the sample by the amount of bits over 8 it // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit // of precision at the low end already, hence 1 is subtracted from the // color depth. int colorDepth = 16 - rescaleFactor; int rescaleBits = (colorDepth - 1) - 8; auto y = textureLinearUnpackedR16(sampler0, uv0) >> rescaleBits; auto u = textureLinearUnpackedR16(sampler1, uv1) >> rescaleBits; auto v = textureLinearUnpackedR16(sampler2, uv2) >> rescaleBits; return convertYUV(rgb_from_ycbcr, U16(y), U16(u), U16(v)); } default: assert(false); return PackedRGBA8(0); } } // Fallback helper for when we can't specifically accelerate YUV with // composition. 
template <typename S0, typename S1, typename S2, typename P, typename C>
static void blendYUVFallback(P* buf, int span, S0 sampler0, vec2 uv0,
                             vec2_scalar uv_step0, vec2_scalar min_uv0,
                             vec2_scalar max_uv0, S1 sampler1, vec2 uv1,
                             vec2_scalar uv_step1, vec2_scalar min_uv1,
                             vec2_scalar max_uv1, S2 sampler2, vec2 uv2,
                             vec2_scalar uv_step2, vec2_scalar min_uv2,
                             vec2_scalar max_uv2, const vec3_scalar& ycbcr_bias,
                             const mat3_scalar& rgb_from_debiased_ycbcr,
                             int rescaleFactor, C color) {
  const auto rgb_from_ycbcr =
      YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor);
  for (auto* end = buf + span; buf < end; buf += swgl_StepSize,
             uv0 += uv_step0, uv1 += uv_step1, uv2 += uv_step2) {
    commit_blend_span(
        buf,
        applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)),
                             sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)),
                             sampler2, ivec2(clamp(uv2, min_uv2, max_uv2)),
                             rgb_from_ycbcr, rescaleFactor),
                   color));
  }
}

template <typename S0, typename S1, typename S2, typename P, typename C>
static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0,
                    const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1,
                    const vec4_scalar& uv_rect1, S2 sampler2, vec2 uv2,
                    const vec4_scalar& uv_rect2, const vec3_scalar& ycbcr_bias,
                    const mat3_scalar& rgb_from_debiased_ycbcr,
                    int rescaleFactor, C color = C()) {
  if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) ||
      !swgl_isTextureLinear(sampler2)) {
    return 0;
  }
  LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
  LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1);
  LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2);
  auto c = packColor(buf, color);
  blendYUVFallback(buf, span, sampler0, uv0, uv_step0, min_uv0, max_uv0,
                   sampler1, uv1, uv_step1, min_uv1, max_uv1, sampler2, uv2,
                   uv_step2, min_uv2, max_uv2, ycbcr_bias,
                   rgb_from_debiased_ycbcr, rescaleFactor, c);
  return span;
}

// A variant of blendYUV that attempts to reuse the inner loops from the
// CompositeYUV infrastructure. CompositeYUV imposes stricter requirements on
// the source data, which in turn allows it to be much faster than blendYUV.
// At a minimum, we need to ensure that we are outputting to a BGRA8
// framebuffer and that no color scaling is applied, which we can accomplish
// via template specialization. We need to further validate inside that
// texture formats and dimensions are sane for video and that the video is
// axis-aligned before acceleration can proceed.
template <bool BLEND>
static int blendYUV(uint32_t* buf, int span, sampler2DRect sampler0, vec2 uv0,
                    const vec4_scalar& uv_rect0, sampler2DRect sampler1,
                    vec2 uv1, const vec4_scalar& uv_rect1,
                    sampler2DRect sampler2, vec2 uv2,
                    const vec4_scalar& uv_rect2, const vec3_scalar& ycbcr_bias,
                    const mat3_scalar& rgb_from_debiased_ycbcr,
                    int rescaleFactor, NoColor noColor = NoColor()) {
  if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) ||
      !swgl_isTextureLinear(sampler2)) {
    return 0;
  }
  LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
  LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1);
  LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2);
  auto* end = buf + span;
  // CompositeYUV imposes further restrictions on the source textures, such
  // that the Y/U/V samplers must all have a matching format, the U/V samplers
  // must have matching sizes and sample coordinates, and there must be no
  // change in row across the entire span.
if (sampler0->format == sampler1->format && sampler1->format == sampler2->format && sampler1->width == sampler2->width && sampler1->height == sampler2->height && uv_step0.y == 0 && uv_step0.x > 0 && uv_step1.y == 0 && uv_step1.x > 0 && uv_step1 == uv_step2 && uv1.x.x == uv2.x.x && uv1.y.x == uv2.y.x) { // CompositeYUV does not support a clamp rect, so we must take care to // advance till we're inside the bounds of the clamp rect. int outside = min(int(ceil(max((min_uv0.x - uv0.x.x) / uv_step0.x, (min_uv1.x - uv1.x.x) / uv_step1.x))), (end - buf) / swgl_StepSize); if (outside > 0) { blendYUVFallback(buf, outside * swgl_StepSize, sampler0, uv0, uv_step0, min_uv0, max_uv0, sampler1, uv1, uv_step1, min_uv1, max_uv1, sampler2, uv2, uv_step2, min_uv2, max_uv2, ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor, noColor); buf += outside * swgl_StepSize; uv0.x += outside * uv_step0.x; uv1.x += outside * uv_step1.x; uv2.x += outside * uv_step2.x; } // Find the amount of chunks inside the clamp rect before we hit the // maximum. If there are any chunks inside, we can finally dispatch to // CompositeYUV. int inside = min(int(min((max_uv0.x - uv0.x.x) / uv_step0.x, (max_uv1.x - uv1.x.x) / uv_step1.x)), (end - buf) / swgl_StepSize); if (inside > 0) { // We need the color depth, which is relative to the texture format and // rescale factor. int colorDepth = (sampler0->format == TextureFormat::R16 ? 16 : 8) - rescaleFactor; // Finally, call the inner loop of CompositeYUV. const auto rgb_from_ycbcr = YUVMatrix::From(ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor); linear_row_yuv( buf, inside * swgl_StepSize, sampler0, force_scalar(uv0), uv_step0.x / swgl_StepSize, sampler1, sampler2, force_scalar(uv1), uv_step1.x / swgl_StepSize, colorDepth, rgb_from_ycbcr); // Now that we're done, advance past the processed inside portion. buf += inside * swgl_StepSize; uv0.x += inside * uv_step0.x; uv1.x += inside * uv_step1.x; uv2.x += inside * uv_step2.x; } } // We either got here because we have some samples outside the clamp rect, or // because some of the preconditions were not satisfied. Process whatever is // left of the span. blendYUVFallback(buf, end - buf, sampler0, uv0, uv_step0, min_uv0, max_uv0, sampler1, uv1, uv_step1, min_uv1, max_uv1, sampler2, uv2, uv_step2, min_uv2, max_uv2, ycbcr_bias, rgb_from_debiased_ycbcr, rescaleFactor, noColor); return span; } // Commit a single chunk of a YUV surface represented by multiple planar // textures. This requires a color space specifier selecting how to convert // from YUV to RGB output. In the case of HDR formats, a rescaling factor // selects how many bits of precision must be utilized on conversion. See the // sampleYUV dispatcher functions for the various supported plane // configurations this intrinsic accepts. #define swgl_commitTextureLinearYUV(...) \ do { \ int drawn = 0; \ if (blend_key) { \ drawn = blendYUV(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \ } else { \ drawn = blendYUV(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \ } \ swgl_OutRGBA8 += drawn; \ swgl_SpanLength -= drawn; \ } while (0) // Commit a single chunk of a YUV surface scaled by a color. #define swgl_commitTextureLinearColorYUV(...) \ swgl_commitTextureLinearYUV(__VA_ARGS__) // Each gradient stops entry is a pair of RGBA32F start color and end step. struct GradientStops { Float startColor; union { Float stepColor; vec4_scalar stepData; }; // Whether this gradient entry can be merged with an adjacent entry. 
  // The step will be equal to the adjacent step if and only if they can be
  // merged, or rather, the stops are actually part of a single larger
  // gradient.
  bool can_merge(const GradientStops& next) const {
    return stepData == next.stepData;
  }

  // Get the interpolated color within the entry based on the offset from its
  // start.
  Float interpolate(float offset) const {
    return startColor + stepColor * offset;
  }

  // Get the end color of the entry where interpolation stops.
  Float end_color() const { return startColor + stepColor; }
};

// Checks if a gradient table of the specified size exists at the UV coords of
// the address within an RGBA32F texture. If so, a linear address within the
// texture is returned that may be used to sample the gradient table later. If
// the address doesn't describe a valid gradient, then a negative value is
// returned.
static inline int swgl_validateGradient(sampler2D sampler,
                                        ivec2_scalar address, int entries) {
  return sampler->format == TextureFormat::RGBA32F && address.y >= 0 &&
                 address.y < int(sampler->height) && address.x >= 0 &&
                 address.x < int(sampler->width) && entries > 0 &&
                 address.x +
                         int(sizeof(GradientStops) / sizeof(Float)) * entries <=
                     int(sampler->width)
             ? address.y * sampler->stride + address.x * 4
             : -1;
}

static inline WideRGBA8 sampleGradient(sampler2D sampler, int address,
                                       Float entry) {
  assert(sampler->format == TextureFormat::RGBA32F);
  assert(address >= 0 && address < int(sampler->height * sampler->stride));
  // Get the integer portion of the entry index to find the entry colors.
  I32 index = cast(entry);
  // Use the fractional portion of the entry index to control blending between
  // entry colors.
  Float offset = entry - cast(index);
  // Every entry is a pair of colors blended by the fractional offset.
  assert(test_all(index >= 0 &&
                  index * int(sizeof(GradientStops) / sizeof(Float)) <
                      int(sampler->width)));
  GradientStops* stops = (GradientStops*)&sampler->buf[address];
  // Blend between the colors for each SIMD lane, then pack them to an RGBA8
  // result. Since the layout of the RGBA8 framebuffer is actually BGRA while
  // the gradient table has RGBA colors, swizzling is required.
  return combine(
      packRGBA8(round_pixel(stops[index.x].interpolate(offset.x).zyxw),
                round_pixel(stops[index.y].interpolate(offset.y).zyxw)),
      packRGBA8(round_pixel(stops[index.z].interpolate(offset.z).zyxw),
                round_pixel(stops[index.w].interpolate(offset.w).zyxw)));
}

// Samples a gradient entry from the gradient at the provided linearized
// address. The integer portion of the entry index is used to find the entry
// within the table whereas the fractional portion is used to blend between
// adjacent table entries.
#define swgl_commitGradientRGBA8(sampler, address, entry) \
  swgl_commitChunk(RGBA8, sampleGradient(sampler, address, entry))

// Variant that allows specifying a color multiplier of the gradient result.
#define swgl_commitGradientColorRGBA8(sampler, address, entry, color)         \
  swgl_commitChunk(RGBA8, applyColor(sampleGradient(sampler, address, entry), \
                                     packColor(swgl_OutRGBA8, color)))

// Samples an entire span of a linear gradient by crawling the gradient table
// and looking for consecutive stops that can be merged into a single larger
// gradient, then interpolating between those larger gradients within the
// span.
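// As an illustrative example: a table whose stops all lie on one straight
// color ramp (say black at offset 0.0, 50% gray at 0.5, and white at 1.0)
// produces entries with identical steps, so can_merge() folds them into a
// single run that the span can walk with one start color and one delta.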
template static bool commitLinearGradient(sampler2D sampler, int address, float size, bool tileRepeat, bool gradientRepeat, vec2 pos, const vec2_scalar& scaleDir, float startOffset, uint32_t* buf, int span) { assert(sampler->format == TextureFormat::RGBA32F); assert(address >= 0 && address < int(sampler->height * sampler->stride)); GradientStops* stops = (GradientStops*)&sampler->buf[address]; // Get the chunk delta from the difference in offset steps. This represents // how far within the gradient table we advance for every step in output, // normalized to gradient table size. vec2_scalar posStep = dFdx(pos) * 4.0f; float delta = dot(posStep, scaleDir); if (!isfinite(delta)) { return false; } // If we have a repeating brush, then the position will be modulo the [0,1) // interval. Compute coefficients that can be used to quickly evaluate the // distance to the interval boundary where the offset will wrap. vec2_scalar distCoeffsX = {0.25f * span, 0.0f}; vec2_scalar distCoeffsY = distCoeffsX; if (tileRepeat) { if (posStep.x != 0.0f) { distCoeffsX = vec2_scalar{step(0.0f, posStep.x), 1.0f} * recip(posStep.x); } if (posStep.y != 0.0f) { distCoeffsY = vec2_scalar{step(0.0f, posStep.y), 1.0f} * recip(posStep.y); } } for (; span > 0;) { // Try to process as many chunks as are within the span if possible. float chunks = 0.25f * span; vec2 repeatPos = pos; if (tileRepeat) { // If this is a repeating brush, then limit the chunks to not cross the // interval boundaries. repeatPos = fract(pos); chunks = min(chunks, distCoeffsX.x - repeatPos.x.x * distCoeffsX.y); chunks = min(chunks, distCoeffsY.x - repeatPos.y.x * distCoeffsY.y); } // Compute the gradient offset from the position. Float offset = repeatPos.x * scaleDir.x + repeatPos.y * scaleDir.y - startOffset; // If repeat is desired, we need to limit the offset to a fractional value. if (gradientRepeat) { offset = fract(offset); } // To properly handle both clamping and repeating of the table offset, we // need to ensure we don't run past the 0 and 1 points. Here we compute the // intercept points depending on whether advancing forwards or backwards in // the gradient table to ensure the chunk count is limited by the amount // before intersection. If there is no delta, then we compute no intercept. float startEntry; int minIndex, maxIndex; if (offset.x < 0) { // If we're below the gradient table, use the first color stop. We can // only intercept the table if walking forward. startEntry = 0; minIndex = int(startEntry); maxIndex = minIndex; if (delta > 0) { chunks = min(chunks, -offset.x / delta); } } else if (offset.x < 1) { // Otherwise, we're inside the gradient table. Depending on the direction // we're walking the the table, we may intersect either the 0 or 1 offset. // Compute the start entry based on our initial offset, and compute the // end entry based on the available chunks limited by intercepts. Clamp // them into the valid range of the table. startEntry = 1.0f + offset.x * size; if (delta < 0) { chunks = min(chunks, -offset.x / delta); } else if (delta > 0) { chunks = min(chunks, (1 - offset.x) / delta); } float endEntry = clamp(1.0f + (offset.x + delta * int(chunks)) * size, 0.0f, 1.0f + size); // Now that we know the range of entries we need to sample, we want to // find the largest possible merged gradient within that range. Depending // on which direction we are advancing in the table, we either walk up or // down the table trying to merge the current entry with the adjacent // entry. 
We finally limit the chunks to only sample from this merged // gradient. minIndex = int(startEntry); maxIndex = minIndex; if (delta > 0) { while (maxIndex + 1 < endEntry && stops[maxIndex].can_merge(stops[maxIndex + 1])) { maxIndex++; } chunks = min(chunks, (maxIndex + 1 - startEntry) / (delta * size)); } else if (delta < 0) { while (minIndex - 1 > endEntry && stops[minIndex - 1].can_merge(stops[minIndex])) { minIndex--; } chunks = min(chunks, (minIndex - startEntry) / (delta * size)); } } else { // If we're above the gradient table, use the last color stop. We can // only intercept the table if walking backward. startEntry = 1.0f + size; minIndex = int(startEntry); maxIndex = minIndex; if (delta < 0) { chunks = min(chunks, (1 - offset.x) / delta); } } // If there are any amount of whole chunks of a merged gradient found, // then we want to process that as a single gradient span with the start // and end colors from the min and max entries. if (chunks >= 1.0f) { int inside = int(chunks); // Sample the start color from the min entry and the end color from the // max entry of the merged gradient. These are scaled to a range of // 0..0xFF00, as that is the largest shifted value that can fit in a U16. // Since we are only doing addition with the step value, we can still // represent negative step values without having to use an explicit sign // bit, as the result will still come out the same, allowing us to gain an // extra bit of precision. We will later shift these into 8 bit output // range while committing the span, but stepping with higher precision to // avoid banding. We convert from RGBA to BGRA here to avoid doing this in // the inner loop. auto minColorF = stops[minIndex].startColor.zyxw * float(0xFF00); auto maxColorF = stops[maxIndex].end_color().zyxw * float(0xFF00); // Get the color range of the merged gradient, normalized to its size. auto colorRangeF = (maxColorF - minColorF) * (1.0f / (maxIndex + 1 - minIndex)); // Compute the actual starting color of the current start offset within // the merged gradient. The value 0.5 is added to the low bits (0x80) so // that the color will effective round to the nearest increment below. auto colorF = minColorF + colorRangeF * (startEntry - minIndex) + float(0x80); // Compute the portion of the color range that we advance on each chunk. Float deltaColorF = colorRangeF * (delta * size); // Quantize the color delta and current color. These have already been // scaled to the 0..0xFF00 range, so we just need to round them to U16. auto deltaColor = repeat4(CONVERT(round_pixel(deltaColorF, 1), U16)); auto color = combine(CONVERT(round_pixel(colorF, 1), U16), CONVERT(round_pixel(colorF + deltaColorF * 0.25f, 1), U16), CONVERT(round_pixel(colorF + deltaColorF * 0.5f, 1), U16), CONVERT(round_pixel(colorF + deltaColorF * 0.75f, 1), U16)); // Finally, step the current color through the output chunks, shifting // it into 8 bit range and outputting as we go. for (auto* end = buf + inside * 4; buf < end; buf += 4) { commit_blend_span(buf, bit_cast(color >> 8)); color += deltaColor; } // Deduct the number of chunks inside the gradient from the remaining // overall span. If we exhausted the span, bail out. span -= inside * 4; if (span <= 0) { break; } // Otherwise, assume we're in a transitional section of the gradient that // will probably require per-sample table lookups, so fall through below. // We need to re-evaluate the position and offset first, though. pos += posStep * float(inside); repeatPos = tileRepeat ? 
      // Finally, step the current color through the output chunks, shifting
      // it into 8 bit range and outputting as we go.
      for (auto* end = buf + inside * 4; buf < end; buf += 4) {
        commit_blend_span<BLEND>(buf, bit_cast<WideRGBA8>(color >> 8));
        color += deltaColor;
      }
      // Deduct the number of chunks inside the gradient from the remaining
      // overall span. If we exhausted the span, bail out.
      span -= inside * 4;
      if (span <= 0) {
        break;
      }
      // Otherwise, assume we're in a transitional section of the gradient that
      // will probably require per-sample table lookups, so fall through below.
      // We need to re-evaluate the position and offset first, though.
      pos += posStep * float(inside);
      repeatPos = tileRepeat ? fract(pos) : pos;
      offset =
          repeatPos.x * scaleDir.x + repeatPos.y * scaleDir.y - startOffset;
      if (gradientRepeat) {
        offset = fract(offset);
      }
    }
    // If we get here, there were no whole chunks of a merged gradient found
    // that we could process, but we still have a non-zero amount of span left.
    // That means we have segments of gradient that begin or end at the current
    // entry we're on. For this case, we just fall back to sampleGradient which
    // will calculate a table entry for each sample, assuming the samples may
    // have different table entries.
    Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size);
    commit_blend_span<BLEND>(buf, sampleGradient(sampler, address, entry));
    span -= 4;
    buf += 4;
    pos += posStep;
  }
  return true;
}

// Commits an entire span of a linear gradient, given the address of a table
// previously resolved with swgl_validateGradient. The size of the inner
// portion of the table is given, assuming the table starts and ends with a
// single entry each to deal with clamping. Repeating will be handled if
// necessary. The initial offset within the table is used to designate where to
// start the span and how to step through the gradient table.
#define swgl_commitLinearGradientRGBA8(sampler, address, size, tileRepeat,   \
                                       gradientRepeat, pos, scaleDir,        \
                                       startOffset)                          \
  do {                                                                       \
    bool drawn = false;                                                      \
    if (blend_key) {                                                         \
      drawn = commitLinearGradient<true>(                                    \
          sampler, address, size, tileRepeat, gradientRepeat, pos, scaleDir, \
          startOffset, swgl_OutRGBA8, swgl_SpanLength);                      \
    } else {                                                                 \
      drawn = commitLinearGradient<false>(                                   \
          sampler, address, size, tileRepeat, gradientRepeat, pos, scaleDir, \
          startOffset, swgl_OutRGBA8, swgl_SpanLength);                      \
    }                                                                        \
    if (drawn) {                                                             \
      swgl_OutRGBA8 += swgl_SpanLength;                                      \
      swgl_SpanLength = 0;                                                   \
    }                                                                        \
  } while (0)
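// Usage sketch (illustrative only; the sampler and varying names below are
// hypothetical, not part of this file): a translated linear-gradient brush
// shader might commit its span as
//
//   swgl_commitLinearGradientRGBA8(sGpuCache, v_gradient_address, v_size,
//                                  v_tile_repeat != 0.0f,
//                                  v_gradient_repeat != 0.0f, v_pos,
//                                  v_scale_dir, v_start_offset);
//
// If commitLinearGradient bails out (e.g. on a non-finite delta), drawn stays
// false, swgl_SpanLength is left untouched, and the remaining pixels fall
// back to the per-fragment shader path.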
template <bool CLAMP, typename V>
static ALWAYS_INLINE V fastSqrt(V v) {
#if USE_SSE2 || USE_NEON
  // Clamp to avoid zero in inversesqrt.
  return v * inversesqrt(CLAMP ? max(v, V(1.0e-10f)) : v);
#else
  return sqrt(v);
#endif
}

template <bool CLAMP, typename V>
static ALWAYS_INLINE auto fastLength(V v) {
  return fastSqrt<CLAMP>(dot(v, v));
}

// Samples an entire span of a radial gradient by crawling the gradient table
// and looking for consecutive stops that can be merged into a single larger
// gradient, then interpolating between those larger gradients within the span
// based on the computed position relative to a radius.
template <bool BLEND>
static bool commitRadialGradient(sampler2D sampler, int address, float size,
                                 bool repeat, vec2 pos, float radius,
                                 uint32_t* buf, int span) {
  assert(sampler->format == TextureFormat::RGBA32F);
  assert(address >= 0 && address < int(sampler->height * sampler->stride));
  GradientStops* stops = (GradientStops*)&sampler->buf[address];
  // clang-format off
  // Given position p, delta d, and radius r, we need to repeatedly solve the
  // following quadratic for the pixel offset t:
  //    length(p + t*d) = r
  //    (px + t*dx)^2 + (py + t*dy)^2 = r^2
  // Rearranged into quadratic equation form (t^2*a + t*b + c = 0) this is:
  //    t^2*(dx^2+dy^2) + t*2*(dx*px+dy*py) + (px^2+py^2-r^2) = 0
  //    t^2*d.d + t*2*d.p + (p.p-r^2) = 0
  // The solution of the quadratic formula t=(-b+-sqrt(b^2-4ac))/2a reduces to:
  //    t = -d.p/d.d +- sqrt((d.p/d.d)^2 - (p.p-r^2)/d.d)
  // Note that d.p, d.d, p.p, and r^2 are constant across the gradient, and so
  // we cache them below for faster computation.
  //
  // The quadratic has two solutions, representing the span intersecting the
  // given radius of gradient, which can occur at two offsets. If there is only
  // one solution (where b^2-4ac = 0), this represents the point at which the
  // span runs tangent to the radius. This middle point is significant in that
  // before it, we walk down the gradient ramp, and after it, we walk up the
  // ramp.
  // clang-format on
  vec2_scalar pos0 = {pos.x.x, pos.y.x};
  vec2_scalar delta = {pos.x.y - pos.x.x, pos.y.y - pos.y.x};
  float deltaDelta = dot(delta, delta);
  if (!isfinite(deltaDelta) || !isfinite(radius)) {
    return false;
  }
  float invDelta, middleT, middleB;
  if (deltaDelta > 0) {
    invDelta = 1.0f / deltaDelta;
    middleT = -dot(delta, pos0) * invDelta;
    middleB = middleT * middleT - dot(pos0, pos0) * invDelta;
  } else {
    // If position is invariant, just set the coefficients so the quadratic
    // always reduces to the end of the span.
    invDelta = 0.0f;
    middleT = float(span);
    middleB = 0.0f;
  }
  // We only want to search for merged gradients up to the minimum of either
  // the mid-point or the span length. Cache those offsets here as they don't
  // vary in the inner loop.
  Float middleEndRadius = fastLength<true>(
      pos0 + delta * (Float){middleT, float(span), 0.0f, 0.0f});
  float middleRadius = span < middleT ? middleEndRadius.y : middleEndRadius.x;
  float endRadius = middleEndRadius.y;
  // Convert delta to change in position per chunk.
  delta *= 4;
  deltaDelta *= 4 * 4;
  // clang-format off
  // Given current position p and delta d, we reduce:
  //    length(p) = sqrt(dot(p,p)) = dot(p,p) * invsqrt(dot(p,p))
  // where dot(p+d,p+d) can be accumulated as:
  //    (x+dx)^2+(y+dy)^2 = (x^2+y^2) + 2(x*dx+y*dy) + (dx^2+dy^2)
  //                      = p.p + 2p.d + d.d
  // Since p increases by d every loop iteration, p.d increases by d.d, and
  // thus we can accumulate d.d to calculate 2p.d, allowing us to get the next
  // dot-product by adding it to the dot-product p.p of the prior iteration.
  // This saves us some multiplications and an expensive sqrt inside the inner
  // loop.
  // clang-format on
  Float dotPos = dot(pos, pos);
  Float dotPosDelta = 2.0f * dot(pos, delta) + deltaDelta;
  float deltaDelta2 = 2.0f * deltaDelta;
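  // Worked example (illustrative numbers only): with p = (3, 4) and
  // d = (1, 0) per chunk, dot(p, p) = 25 (length 5). One chunk later,
  // dot(p + d, p + d) = 25 + 2*dot(p, d) + dot(d, d) = 25 + 6 + 1 = 32,
  // which matches dot((4, 4), (4, 4)) without recomputing the full
  // dot product.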
  for (int t = 0; t < span;) {
    // Compute the gradient table offset from the current position.
    Float offset = fastSqrt<true>(dotPos) - radius;
    float startRadius = radius;
    // If repeat is desired, we need to limit the offset to a fractional value.
    if (repeat) {
      // The non-repeating radius at which the gradient table actually starts,
      // radius + floor(offset) = radius + (offset - fract(offset)).
      startRadius += offset.x;
      offset = fract(offset);
      startRadius -= offset.x;
    }
    // We need to find the min/max index in the table of the gradient we want
    // to use as well as the intercept point where we leave this gradient.
    float intercept = -1;
    int minIndex = 0;
    int maxIndex = int(1.0f + size);
    if (offset.x < 0) {
      // If inside the inner radius of the gradient table, then use the first
      // stop. Set the intercept to advance forward to the start of the
      // gradient table.
      maxIndex = minIndex;
      if (t >= middleT) {
        intercept = radius;
      }
    } else if (offset.x < 1) {
      // Otherwise, we're inside the valid part of the gradient table.
      minIndex = int(1.0f + offset.x * size);
      maxIndex = minIndex;
      // Find the offset in the gradient that corresponds to the search limit.
      // We only search up to the minimum of either the mid-point or the span
      // length. Get the table index that corresponds to this offset, clamped
      // so that we avoid hitting the beginning (0) or end (1 + size) of the
      // table.
      float searchOffset =
          (t >= middleT ? endRadius : middleRadius) - startRadius;
      int searchIndex = int(clamp(1.0f + size * searchOffset, 1.0f, size));
      // If we are past the mid-point, walk up the gradient table trying to
      // merge stops. If we're below the mid-point, we need to walk down the
      // table. We note the table index at which we need to look for an
      // intercept to determine a valid span.
      if (t >= middleT) {
        while (maxIndex + 1 <= searchIndex &&
               stops[maxIndex].can_merge(stops[maxIndex + 1])) {
          maxIndex++;
        }
        intercept = maxIndex + 1;
      } else {
        while (minIndex - 1 >= searchIndex &&
               stops[minIndex - 1].can_merge(stops[minIndex])) {
          minIndex--;
        }
        intercept = minIndex;
      }
      // Convert from a table index into units of radius from the center of
      // the gradient.
      intercept = clamp((intercept - 1.0f) / size, 0.0f, 1.0f) + startRadius;
    } else {
      // If outside the outer radius of the gradient table, then use the last
      // stop. Set the intercept to advance toward the valid part of the
      // gradient table if going in, or just run to the end of the span if
      // going away from the gradient.
      minIndex = maxIndex;
      if (t < middleT) {
        intercept = radius + 1;
      }
    }
    // Solve the quadratic for t to find where the merged gradient ends. If no
    // intercept is found, just go to the middle or end of the span.
    float endT = t >= middleT ? span : min(span, int(middleT));
    if (intercept >= 0) {
      float b = middleB + intercept * intercept * invDelta;
      if (b > 0) {
        b = fastSqrt<false>(b);
        endT = min(endT, t >= middleT ? middleT + b : middleT - b);
      }
    }
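    // Worked example (illustrative numbers only): suppose pos0 = (3, 4) and a
    // per-sample delta of (1, 0) gave invDelta = 1, middleT = -3, and
    // middleB = 9 - 25 = -16. For an intercept radius of 5, b becomes
    // -16 + 5*5 = 9, so sqrt(b) = 3 and the span crosses that radius at
    // t = middleT +- 3, i.e. at t = -6 and t = 0.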
    // Figure out how many chunks are actually inside the merged gradient.
    if (t + 4.0f <= endT) {
      int inside = int(endT - t) & ~3;
      // Convert start and end colors to BGRA and scale to 0..255 range later.
      auto minColorF = stops[minIndex].startColor.zyxw * 255.0f;
      auto maxColorF = stops[maxIndex].end_color().zyxw * 255.0f;
      // Compute the change in color per change in gradient offset.
      auto deltaColorF =
          (maxColorF - minColorF) * (size / (maxIndex + 1 - minIndex));
      // Subtract off the color difference of the beginning of the current span
      // from the beginning of the gradient.
      Float colorF =
          minColorF - deltaColorF * (startRadius + (minIndex - 1) / size);
      // Finally, walk over the span accumulating the position dot product and
      // getting its sqrt as an offset into the color ramp. Since we're already
      // in BGRA format and scaled to 255, we just need to round to an integer
      // and pack down to pixel format.
      for (auto* end = buf + inside; buf < end; buf += 4) {
        Float offsetG = fastSqrt<true>(dotPos);
        commit_blend_span<BLEND>(
            buf,
            combine(
                packRGBA8(round_pixel(colorF + deltaColorF * offsetG.x, 1),
                          round_pixel(colorF + deltaColorF * offsetG.y, 1)),
                packRGBA8(round_pixel(colorF + deltaColorF * offsetG.z, 1),
                          round_pixel(colorF + deltaColorF * offsetG.w, 1))));
        dotPos += dotPosDelta;
        dotPosDelta += deltaDelta2;
      }
      // Advance past the portion of gradient we just processed.
      t += inside;
      // If we hit the end of the span, exit out now.
      if (t >= span) {
        break;
      }
      // Otherwise, we are most likely in a transitional section of the
      // gradient between stops that will require per-sample table lookups.
      // Rather than having to redo all the searching above to figure that
      // out, just assume that to be the case and fall through below to doing
      // the table lookups to hopefully avoid an iteration.
      offset = fastSqrt<true>(dotPos) - radius;
      if (repeat) {
        offset = fract(offset);
      }
    }
    // If we got here, that means we still have span left to process but did
    // not have any whole chunks that fell within a merged gradient. Just fall
    // back to doing a table lookup for each sample.
    Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size);
    commit_blend_span<BLEND>(buf, sampleGradient(sampler, address, entry));
    buf += 4;
    t += 4;
    dotPos += dotPosDelta;
    dotPosDelta += deltaDelta2;
  }
  return true;
}

// Commits an entire span of a radial gradient similar to
// swgl_commitLinearGradient, but given a varying 2D position scaled to
// gradient-space and a radius at which the distance from the origin maps to
// the start of the gradient table.
#define swgl_commitRadialGradientRGBA8(sampler, address, size, repeat, pos,    \
                                       radius)                                 \
  do {                                                                         \
    bool drawn = false;                                                        \
    if (blend_key) {                                                           \
      drawn =                                                                  \
          commitRadialGradient<true>(sampler, address, size, repeat, pos,      \
                                     radius, swgl_OutRGBA8, swgl_SpanLength);  \
    } else {                                                                   \
      drawn =                                                                  \
          commitRadialGradient<false>(sampler, address, size, repeat, pos,     \
                                      radius, swgl_OutRGBA8, swgl_SpanLength); \
    }                                                                          \
    if (drawn) {                                                               \
      swgl_OutRGBA8 += swgl_SpanLength;                                        \
      swgl_SpanLength = 0;                                                     \
    }                                                                          \
  } while (0)

// Extension to set a clip mask image to be sampled during blending. The offset
// specifies the positioning of the clip mask image relative to the viewport
// origin. The bounding box specifies the rectangle relative to the clip mask's
// origin that constrains sampling within the clip mask. Blending must be
// enabled for this to work.
static sampler2D swgl_ClipMask = nullptr;
static IntPoint swgl_ClipMaskOffset = {0, 0};
static IntRect swgl_ClipMaskBounds = {0, 0, 0, 0};
#define swgl_clipMask(mask, offset, bb_origin, bb_size)        \
  do {                                                         \
    if (bb_size != vec2_scalar(0.0f, 0.0f)) {                  \
      swgl_ClipFlags |= SWGL_CLIP_FLAG_MASK;                   \
      swgl_ClipMask = mask;                                    \
      swgl_ClipMaskOffset = make_ivec2(offset);                \
      swgl_ClipMaskBounds =                                    \
          IntRect(make_ivec2(bb_origin), make_ivec2(bb_size)); \
    }                                                          \
  } while (0)

// Extension to enable anti-aliasing for the given edges of a quad.
// Blending must be enabled for this to work.
static int swgl_AAEdgeMask = 0;

static ALWAYS_INLINE int calcAAEdgeMask(bool on) { return on ? 0xF : 0; }
static ALWAYS_INLINE int calcAAEdgeMask(int mask) { return mask; }
static ALWAYS_INLINE int calcAAEdgeMask(bvec4_scalar mask) {
  return (mask.x ? 1 : 0) | (mask.y ? 2 : 0) | (mask.z ? 4 : 0) |
         (mask.w ? 8 : 0);
}

#define swgl_antiAlias(edges)                \
  do {                                       \
    swgl_AAEdgeMask = calcAAEdgeMask(edges); \
    if (swgl_AAEdgeMask) {                   \
      swgl_ClipFlags |= SWGL_CLIP_FLAG_AA;   \
    }                                        \
  } while (0)

#define swgl_blendDropShadow(color)                         \
  do {                                                      \
    swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE;        \
    swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_DROP_SHADOW); \
    swgl_BlendColorRGBA8 = packColor(color);                \
  } while (0)

#define swgl_blendSubpixelText(color)                         \
  do {                                                        \
    swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE;          \
    swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_SUBPIXEL_TEXT); \
    swgl_BlendColorRGBA8 = packColor(color);                  \
    swgl_BlendAlphaRGBA8 = alphas(swgl_BlendColorRGBA8);      \
  } while (0)

// Dispatch helper used by the GLSL translator to swgl_drawSpan functions.
// The number of pixels committed is tracked by checking for the difference in
// swgl_SpanLength. Any varying interpolants used will be advanced past the
// committed part of the span in case the fragment shader must be executed for
// any remaining pixels that were not committed by the span shader.
#define DISPATCH_DRAW_SPAN(self, format)       \
  do {                                         \
    int total = self->swgl_SpanLength;         \
    self->swgl_drawSpan##format();             \
    int drawn = total - self->swgl_SpanLength; \
    if (drawn) self->step_interp_inputs(drawn); \
    return drawn;                              \
  } while (0)
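// Usage sketch (illustrative only; the pixel counts are hypothetical): the
// GLSL translator wraps a generated span shader in a call like
//
//   DISPATCH_DRAW_SPAN(self, RGBA8);
//
// If swgl_drawSpanRGBA8() commits, say, 96 of 128 pixels, then drawn = 96,
// the varying interpolants advance past those 96 pixels, and the remaining
// 32 pixels are rendered by the normal fragment shader path.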