/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ static inline void commit_span(uint32_t* buf, WideRGBA8 r) { if (blend_key) r = blend_pixels(buf, unaligned_load(buf), r); unaligned_store(buf, pack(r)); } static inline void commit_span(uint32_t* buf, PackedRGBA8 r) { if (blend_key) r = pack(blend_pixels(buf, unaligned_load(buf), unpack(r))); unaligned_store(buf, r); } UNUSED static inline void commit_solid_span(uint32_t* buf, WideRGBA8 r, int len) { if (blend_key) { for (uint32_t* end = &buf[len & ~3]; buf < end; buf += 4) { unaligned_store( buf, pack(blend_pixels(buf, unaligned_load(buf), r))); } len &= 3; if (len > 0) { partial_store_span( buf, pack(blend_pixels(buf, partial_load_span(buf, len), r, len)), len); } } else { fill_n(buf, len, bit_cast(pack(r)).x); } } static inline void commit_span(uint8_t* buf, WideR8 r) { if (blend_key) r = blend_pixels(buf, unpack(unaligned_load(buf)), r); unaligned_store(buf, pack(r)); } UNUSED static inline void commit_solid_span(uint8_t* buf, WideR8 r, int len) { if (blend_key) { for (uint8_t* end = &buf[len]; buf < end; buf += 4) { unaligned_store(buf, pack(blend_pixels( buf, unpack(unaligned_load(buf)), r))); } } else { fill_n((uint32_t*)buf, len / 4, bit_cast(pack(r))); } } template static inline WideRGBA8 pack_span(uint32_t*, const V& v) { return pack_pixels_RGBA8(v); } static inline WideRGBA8 pack_span(uint32_t*) { return pack_pixels_RGBA8(); } template static inline WideR8 pack_span(uint8_t*, C c) { return pack_pixels_R8(c); } static inline WideR8 pack_span(uint8_t*) { return pack_pixels_R8(); } // Forces a value with vector run-class to have scalar run-class. 
template static ALWAYS_INLINE auto swgl_forceScalar(T v) -> decltype(force_scalar(v)) { return force_scalar(v); } // Advance all varying inperpolants by a single chunk #define swgl_stepInterp() step_interp_inputs() // Pseudo-intrinsic that accesses the interpolation step for a given varying #define swgl_interpStep(v) (interp_step.v) // Commit an entire span of a solid color #define swgl_commitSolid(format, v) \ do { \ commit_solid_span(swgl_Out##format, pack_span(swgl_Out##format, (v)), \ swgl_SpanLength); \ swgl_Out##format += swgl_SpanLength; \ swgl_SpanLength = 0; \ } while (0) #define swgl_commitSolidRGBA8(v) swgl_commitSolid(RGBA8, v) #define swgl_commitSolidR8(v) swgl_commitSolid(R8, v) #define swgl_commitChunk(format, chunk) \ do { \ commit_span(swgl_Out##format, chunk); \ swgl_Out##format += swgl_StepSize; \ swgl_SpanLength -= swgl_StepSize; \ } while (0) static inline WideRGBA8 pack_pixels_RGBA8(Float alpha) { I32 i = round_pixel(alpha); HalfRGBA8 c = packRGBA8(zipLow(i, i), zipHigh(i, i)); return combine(zipLow(c, c), zipHigh(c, c)); } static inline WideRGBA8 pack_pixels_RGBA8(float alpha) { I32 i = round_pixel(alpha); HalfRGBA8 c = packRGBA8(i, i); return combine(c, c); } // Commit a single chunk of a color scaled by an alpha weight #define swgl_commitColor(format, color, alpha) \ swgl_commitChunk(format, muldiv255(pack_pixels_##format(color), \ pack_pixels_##format(alpha))) #define swgl_commitColorRGBA8(color, alpha) \ swgl_commitColor(RGBA8, color, alpha) #define swgl_commitColorR8(color, alpha) swgl_commitColor(R8, color, alpha) template static ALWAYS_INLINE bool swgl_isTextureLinear(S s) { return s->filter == TextureFilter::LINEAR; } template static ALWAYS_INLINE bool swgl_isTextureRGBA8(S s) { return s->format == TextureFormat::RGBA8; } template static ALWAYS_INLINE bool swgl_isTextureR8(S s) { return s->format == TextureFormat::R8; } // Returns the offset into the texture buffer for the given layer index. 
If not // a texture array or 3D texture, this will always access the first layer. template static ALWAYS_INLINE int swgl_textureLayerOffset(S s, float layer) { return 0; } UNUSED static ALWAYS_INLINE int swgl_textureLayerOffset(sampler2DArray s, float layer) { return clampCoord(int(layer), s->depth) * s->height_stride; } // Use the default linear quantization scale of 128. This gives 7 bits of // fractional precision, which when multiplied with a signed 9 bit value // still fits in a 16 bit integer. const int swgl_LinearQuantizeScale = 128; // Quantizes UVs for access into a linear texture. template static ALWAYS_INLINE T swgl_linearQuantize(S s, T p) { return linearQuantize(p, swgl_LinearQuantizeScale, s); } // Quantizes an interpolation step for UVs for access into a linear texture. template static ALWAYS_INLINE T swgl_linearQuantizeStep(S s, T p) { return samplerScale(s, p) * swgl_LinearQuantizeScale; } // Commit a single chunk from a linear texture fetch #define swgl_commitTextureLinear(format, s, p, ...) \ swgl_commitChunk(format, \ textureLinearUnpacked##format(s, ivec2(p), __VA_ARGS__)) #define swgl_commitTextureLinearRGBA8(s, p, ...) \ swgl_commitTextureLinear(RGBA8, s, p, __VA_ARGS__) #define swgl_commitTextureLinearR8(s, p, ...) \ swgl_commitTextureLinear(R8, s, p, __VA_ARGS__) // Commit a single chunk from a linear texture fetch that is scaled by a color #define swgl_commitTextureLinearColor(format, s, p, color, ...) \ swgl_commitChunk(format, muldiv255(textureLinearUnpacked##format( \ s, ivec2(p), __VA_ARGS__), \ pack_pixels_##format(color))) #define swgl_commitTextureLinearColorRGBA8(s, p, color, ...) \ swgl_commitTextureLinearColor(RGBA8, s, p, color, __VA_ARGS__) #define swgl_commitTextureLinearColorR8(s, p, color, ...) \ swgl_commitTextureLinearColor(R8, s, p, color, __VA_ARGS__) // Commit an entire span of a separable pass of a Gaussian blur that falls // within the given radius scaled by supplied coefficients, clamped to uv_rect // bounds. 
#define swgl_commitGaussianBlur(format, type, s, p, uv_rect, hori, radius, \ coeffs, ...) \ do { \ vec2_scalar size = {float(s->width), float(s->height)}; \ ivec2_scalar curUV = make_ivec2(force_scalar(p) * size); \ ivec4_scalar bounds = make_ivec4(uv_rect * make_vec4(size, size)); \ int endX = min(bounds.z, curUV.x + swgl_SpanLength * swgl_StepSize); \ if (hori) { \ for (; curUV.x + swgl_StepSize <= endX; curUV.x += swgl_StepSize) { \ swgl_commitChunk(format, gaussianBlurHorizontal( \ s, curUV, bounds.x, bounds.z, radius, \ coeffs.x, coeffs.y, __VA_ARGS__)); \ } \ } else { \ for (; curUV.x + swgl_StepSize <= endX; curUV.x += swgl_StepSize) { \ swgl_commitChunk(format, gaussianBlurVertical( \ s, curUV, bounds.y, bounds.w, radius, \ coeffs.x, coeffs.y, __VA_ARGS__)); \ } \ } \ } while (0) #define swgl_commitGaussianBlurRGBA8(s, p, uv_rect, hori, radius, coeffs, ...) \ swgl_commitGaussianBlur(RGBA8, uint32_t, s, p, uv_rect, hori, radius, \ coeffs, __VA_ARGS__) #define swgl_commitGaussianBlurR8(s, p, uv_rect, hori, radius, coeffs, ...) 
\ swgl_commitGaussianBlur(R8, uint8_t, s, p, uv_rect, hori, radius, coeffs, \ __VA_ARGS__) // Convert and pack planar YUV samples to RGB output using a color space static ALWAYS_INLINE PackedRGBA8 convertYUV(int colorSpace, U16 y, U16 u, U16 v) { auto yy = V8(zip(y, y)); auto uv = V8(zip(u, v)); switch (colorSpace) { case REC_601: return YUVConverter::convert(yy, uv); case REC_709: return YUVConverter::convert(yy, uv); case REC_2020: return YUVConverter::convert(yy, uv); default: return YUVConverter::convert(yy, uv); } } // Helper functions to sample from planar YUV textures before converting to RGB template static inline PackedRGBA8 sampleYUV(S0 sampler0, vec2 uv0, int layer0, int colorSpace, int rescaleFactor) { ivec2 i0(uv0); switch (sampler0->format) { case TextureFormat::RGBA8: { auto planar = textureLinearPlanarRGBA8(sampler0, i0, layer0); return convertYUV(colorSpace, highHalf(planar.rg), lowHalf(planar.rg), lowHalf(planar.ba)); } case TextureFormat::YUV422: { auto planar = textureLinearPlanarYUV422(sampler0, i0, layer0); return convertYUV(colorSpace, planar.y, planar.u, planar.v); } default: assert(false); return PackedRGBA8(0); } } template static inline WideRGBA8 sampleColorYUV(S0 sampler0, vec2 uv0, int layer0, int colorSpace, int rescaleFactor, C color) { return muldiv255( unpack(sampleYUV(sampler0, uv0, layer0, colorSpace, rescaleFactor)), pack_pixels_RGBA8(color)); } template static inline PackedRGBA8 sampleYUV(S0 sampler0, vec2 uv0, int layer0, S1 sampler1, vec2 uv1, int layer1, int colorSpace, int rescaleFactor) { ivec2 i0(uv0); ivec2 i1(uv1); switch (sampler1->format) { case TextureFormat::RG8: { assert(sampler0->format == TextureFormat::R8); auto y = textureLinearUnpackedR8(sampler0, i0, layer0); auto planar = textureLinearPlanarRG8(sampler1, i1, layer1); return convertYUV(colorSpace, y, lowHalf(planar.rg), highHalf(planar.rg)); } case TextureFormat::RGBA8: { assert(sampler0->format == TextureFormat::R8); auto y = textureLinearUnpackedR8(sampler0, 
i0, layer0); auto planar = textureLinearPlanarRGBA8(sampler1, i1, layer1); return convertYUV(colorSpace, y, lowHalf(planar.ba), highHalf(planar.rg)); } default: assert(false); return PackedRGBA8(0); } } template static inline WideRGBA8 sampleColorYUV(S0 sampler0, vec2 uv0, int layer0, S1 sampler1, vec2 uv1, int layer1, int colorSpace, int rescaleFactor, C color) { return muldiv255(unpack(sampleYUV(sampler0, uv0, layer0, sampler1, uv1, layer1, colorSpace, rescaleFactor)), pack_pixels_RGBA8(color)); } template static inline PackedRGBA8 sampleYUV(S0 sampler0, vec2 uv0, int layer0, S1 sampler1, vec2 uv1, int layer1, S2 sampler2, vec2 uv2, int layer2, int colorSpace, int rescaleFactor) { ivec2 i0(uv0); ivec2 i1(uv1); ivec2 i2(uv2); assert(sampler0->format == sampler1->format && sampler0->format == sampler2->format); switch (sampler0->format) { case TextureFormat::R8: { auto y = textureLinearUnpackedR8(sampler0, i0, layer0); auto u = textureLinearUnpackedR8(sampler1, i1, layer1); auto v = textureLinearUnpackedR8(sampler2, i2, layer2); return convertYUV(colorSpace, y, u, v); } case TextureFormat::R16: { // The rescaling factor represents how many bits to add to renormalize the // texture to 16 bits, and so the color depth is actually 16 minus the // rescaling factor. // Need to right shift the sample by the amount of bits over 8 it // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit // of precision at the low end already, hence 1 is subtracted from the // color depth. 
int colorDepth = 16 - rescaleFactor; int rescaleBits = (colorDepth - 1) - 8; auto y = textureLinearUnpackedR16(sampler0, i0, layer0) >> rescaleBits; auto u = textureLinearUnpackedR16(sampler1, i1, layer1) >> rescaleBits; auto v = textureLinearUnpackedR16(sampler2, i2, layer2) >> rescaleBits; return convertYUV(colorSpace, U16(y), U16(u), U16(v)); } default: assert(false); return PackedRGBA8(0); } } template static inline WideRGBA8 sampleColorYUV(S0 sampler0, vec2 uv0, int layer0, S1 sampler1, vec2 uv1, int layer1, S2 sampler2, vec2 uv2, int layer2, int colorSpace, int rescaleFactor, C color) { return muldiv255( unpack(sampleYUV(sampler0, uv0, layer0, sampler1, uv1, layer1, sampler2, uv2, layer2, colorSpace, rescaleFactor)), pack_pixels_RGBA8(color)); } // Commit a single chunk of a YUV surface represented by multiple planar // textures. This requires a color space specifier selecting how to convert // from YUV to RGB output. In the case of HDR formats, a rescaling factor // selects how many bits of precision must be utilized on conversion. See the // sampleYUV dispatcher functions for the various supported plane // configurations this intrinsic accepts. #define swgl_commitTextureLinearYUV(...) \ swgl_commitChunk(RGBA8, sampleYUV(__VA_ARGS__)) // Commit a single chunk of a YUV surface scaled by a color. #define swgl_commitTextureLinearColorYUV(...) \ swgl_commitChunk(RGBA8, sampleColorYUV(__VA_ARGS__)) // Helper functions to apply a color modulus when available. struct NoColor {}; SI WideRGBA8 applyColor(WideRGBA8 src, NoColor) { return src; } SI WideRGBA8 applyColor(WideRGBA8 src, WideRGBA8 color) { return muldiv255(src, color); } SI PackedRGBA8 applyColor(PackedRGBA8 src, NoColor) { return src; } SI PackedRGBA8 applyColor(PackedRGBA8 src, WideRGBA8 color) { return pack(muldiv255(unpack(src), color)); } // Samples an axis-aligned span of on a single row of a texture using 1:1 // nearest filtering. Sampling is constrained to only fall within the given UV // bounds. 
This requires a pointer to the destination buffer. An optional color // modulus can be supplied. template static void blendTextureNearestRGBA8(S sampler, const ivec2_scalar& i, int span, const ivec2_scalar& minUV, const ivec2_scalar& maxUV, C color, uint32_t* buf, int layerOffset = 0) { // Calculate the row pointer within the buffer, clamping to within valid row // bounds. uint32_t* row = &sampler->buf[clamp(clampCoord(i.y, sampler->height), minUV.y, maxUV.y) * sampler->stride + layerOffset]; // Find clamped X bounds within the row. int minX = clamp(minUV.x, 0, sampler->width - 1); int maxX = clamp(maxUV.x, minX, sampler->width - 1); int curX = i.x; // If we need to start sampling below the valid sample bounds, then we need to // fill this section with a constant clamped sample. if (curX < minX) { int n = min(minX - curX, span); auto src = applyColor(unpack(bit_cast(U32(row[minX]))), color); commit_solid_span(buf, src, n); buf += n; span -= n; curX += n; } // Here we only deal with valid samples within the sample bounds. No clamping // should occur here within these inner loops. int n = clamp(maxX + 1 - curX, 0, span); span -= n; // Try to process as many chunks as possible with full loads and stores. if (blend_key) { for (int end = curX + (n & ~3); curX < end; curX += 4, buf += 4) { auto src = applyColor(unpack(unaligned_load(&row[curX])), color); auto r = blend_pixels(buf, unaligned_load(buf), src); unaligned_store(buf, pack(r)); } } else { for (int end = curX + (n & ~3); curX < end; curX += 4, buf += 4) { auto src = applyColor(unaligned_load(&row[curX]), color); unaligned_store(buf, src); } } n &= 3; // If we have any leftover samples after processing chunks, use partial loads // and stores. 
if (n > 0) { if (blend_key) { auto src = applyColor( unpack(partial_load_span(&row[curX], n)), color); auto r = blend_pixels(buf, partial_load_span(buf, n), src, n); partial_store_span(buf, pack(r), n); } else { auto src = applyColor(partial_load_span(&row[curX], n), color); partial_store_span(buf, src, n); } buf += n; curX += n; } // If we still have samples left above the valid sample bounds, then we again // need to fill this section with a constant clamped sample. if (span > 0) { auto src = applyColor(unpack(bit_cast(U32(row[maxX]))), color); commit_solid_span(buf, src, span); } } // TODO: blendTextureNearestR8 if it is actually needed // Commit an entire span of 1:1 nearest texture fetches, potentially scaled by a // color #define swgl_commitTextureNearest(format, s, p, uv_rect, color, ...) \ do { \ ivec2_scalar i = make_ivec2(samplerScale(s, force_scalar(p))); \ ivec2_scalar min_uv = \ make_ivec2(samplerScale(s, vec2_scalar{uv_rect.x, uv_rect.y})); \ ivec2_scalar max_uv = \ make_ivec2(samplerScale(s, vec2_scalar{uv_rect.z, uv_rect.w})); \ blendTextureNearest##format(s, i, swgl_SpanLength, min_uv, max_uv, color, \ swgl_Out##format, __VA_ARGS__); \ swgl_Out##format += swgl_SpanLength; \ swgl_SpanLength = 0; \ } while (0) #define swgl_commitTextureNearestRGBA8(s, p, uv_rect, ...) \ swgl_commitTextureNearest(RGBA8, s, p, uv_rect, NoColor(), __VA_ARGS__) #define swgl_commitTextureNearestR8(s, p, uv_rect, ...) \ swgl_commitTextureNearest(R8, s, p, uv_rect, NoColor(), __VA_ARGS__) #define swgl_commitTextureNearestColor(format, s, p, uv_rect, color, ...) \ swgl_commitTextureNearest(format, s, p, uv_rect, \ pack_pixels_##format(color), __VA_ARGS__) #define swgl_commitTextureNearestColorRGBA8(s, p, uv_rect, color, ...) \ swgl_commitTextureNearestColor(RGBA8, s, p, uv_rect, color, __VA_ARGS__) #define swgl_commitTextureNearestColorR8(s, p, uv_rect, color, ...) 
\ swgl_commitTextureNearestColor(R8, s, p, uv_rect, color, __VA_ARGS__) // Helper function to decide whether we can safely apply 1:1 nearest filtering // without diverging too much from the linear filter template static bool allowTextureNearest(S sampler, T P, int span) { // First verify if the row Y doesn't change across samples if (P.y.x != P.y.y) { return false; } P = samplerScale(sampler, P); // We need to verify that the pixel step reasonably approximates stepping // by a single texel for every pixel we need to reproduce. Try to ensure // that the margin of error is no more than approximately 2^-7. span &= ~(128 - 1); span += 128; return round((P.x.y - P.x.x) * span) == span && // Also verify that we're reasonably close to the center of a texel // so that it doesn't look that much different than if a linear filter // was used. (int(P.x.x * 4.0f + 0.5f) & 3) == 2 && (int(P.y.x * 4.0f + 0.5f) & 3) == 2; } // Determine if we can apply 1:1 nearest filtering to a span of texture #define swgl_allowTextureNearest(s, p) \ allowTextureNearest(s, p, swgl_SpanLength) // Extension to set a clip mask image to be sampled during blending. The offset // specifies the positioning of the clip mask image relative to the viewport // origin. The bounding box specifies the rectangle relative to the clip mask's // origin that constrains sampling within the clip mask. static sampler2D swgl_ClipMask = nullptr; static IntPoint swgl_ClipMaskOffset = {0, 0}; static IntRect swgl_ClipMaskBounds = {0, 0, 0, 0}; #define swgl_clipMask(mask, offset, bb_origin, bb_size) \ do { \ if (bb_size != vec2_scalar(0.0f, 0.0f)) { \ swgl_ClipMask = mask; \ swgl_ClipMaskOffset = make_ivec2(offset); \ swgl_ClipMaskBounds = \ IntRect(make_ivec2(bb_origin), make_ivec2(bb_size)); \ } \ } while (0) // Dispatch helper used by the GLSL translator to swgl_drawSpan functions. // The number of pixels committed is tracked by checking for the difference in // swgl_SpanLength. 
// Any varying interpolants used will be advanced past the committed part of
// the span in case the fragment shader must be executed for any remaining
// pixels that were not committed by the span shader.
//
// Steps performed by the macro:
//   1. Record the span length, then invoke the span shader
//      swgl_drawSpan<format>().
//   2. If the span shader consumed any pixels, advance the interpolants by
//      that amount via step_interp_inputs(drawn).
//   3. While pixels remain, fall back to run(self) and commit its output one
//      chunk of swgl_StepSize pixels at a time.
#define DISPATCH_DRAW_SPAN(self, format) \
  do { \
    int total = self->swgl_SpanLength; \
    self->swgl_drawSpan##format(); \
    int drawn = total - self->swgl_SpanLength; \
    if (drawn) self->step_interp_inputs(drawn); \
    while (self->swgl_SpanLength > 0) { \
      run(self); \
      commit_span(self->swgl_Out##format, pack_span(self->swgl_Out##format)); \
      self->swgl_Out##format += swgl_StepSize; \
      self->swgl_SpanLength -= swgl_StepSize; \
    } \
  } while (0)